expression
templates
in eigen

Expression templates
=
Templates that
represent expressions

why?

float a, b, c, d;

a = b + c + d;

why?

float a, b, c, d;

a = b + c + d;
a = (b + c) + d;

why?

Vec a, b, c, d;

a = b + c + d;
a = (b + c) + d;
    ^^^^^^^
   temporary

why?

Vec a, b, c, d;

a = b + c + d;
a = (b + c) + d;
    ^^^^^^^
   temporary

why?

EAGER Evaluation
CREATES TEMPORARIES

tmp1.x = b.x + c.x;
tmp1.y = b.y + c.y;
tmp1.z = b.z + c.z;

tmp2.x = tmp1.x + d.x;
tmp2.y = tmp1.y + d.y;
tmp2.z = tmp1.z + d.z;

a.x = tmp2.x;
a.y = tmp2.y;
a.z = tmp2.z;

why?

class Vec {
    Vec
    operator+ (const Vec& other) const
    {
      return Vector (
        x+other.x, 
        y+other.y, 
        z+other.z);
    }

    void
    operator= (const Vec& other)
    {
      x = other.x;
      y = other.y;
      z = other.z;
    }
};

why?

LAZY evaluation
=

fast

// Faster way
a.x = b.x + c.x + d.x;
a.y = b.y + c.y + d.y;
a.z = b.z + c.z + d.z;

hOW?

Vec a, b, c, d;

a = b + c + d;
a = (b + c) + d;
    ^^^^^^^
   temporary

hOW?

a = ((b + c) + d);

a = (BinarySum(b, c) + d);

a = BinarySum(
        BinarySum(b, c), 
        d
    );

hOW?

class BinarySum {
    float eval(size_t i) {
        return lhs.eval(i) + rhs.eval(i);
    }
};

void Vec::operator= (const BinarySum& other) {
    for (size_t i = 0; i < 3; ++i)
        data_[i] = other.eval(i);
}

hOW?

auto tmp1 = BinarySum(b, c);
auto tmp2 = BinarySum(tmp1, d);
a = tmp2;

hOW?

auto tmp1 = BinarySum(b, c);
auto tmp2 = BinarySum(tmp1, d);
a = tmp2;

for (size_t i = 0; i < 3; ++i) {
    a[i] = tmp2.eval(i);
}

hOW?

auto tmp1 = BinarySum(b, c);
auto tmp2 = BinarySum(tmp1, d);
a = tmp2;

for (size_t i = 0; i < 3; ++i) {
    a[i] = tmp2.eval(i);
    a[i] = tmp1.eval(i) + d[i];
}

hOW?

auto tmp1 = BinarySum(b, c);
auto tmp2 = BinarySum(tmp1, d);
a = tmp2;

for (size_t i = 0; i < 3; ++i) {
    a[i] = tmp2.eval(i);
    a[i] = tmp1.eval(i) + d[i];
    a[i] = b[i] + c[i] + d[i];
}

Templates

multiple types
multiple operations

UnaryOp<Op, Type>;
// sqrt, exp, neg, ...

BinaryOp<Op, TypeL, TypeR>;
// dot product, /, ...

CwiseBinaryOp<
    Op, TypeL, TypeR
>;
// +, -, cwise prod, ...

Templates

multiple types
multiple operations

UnaryOp<Op, Type>;
// sqrt, exp, neg, ...

BinaryOp<Op, TypeL, TypeR>;
// dot product, /, ...

CwiseBinaryOp<
    Op, TypeL, TypeR
>;
// +, -, cwise prod, ...

DEEP DIVE

VectorXf v, w;

VectorXf u = v + w;

DEEP DIVE

VectorXf u = v + w;

MatrixBase::operator+(const MatrixBase&) -> 
    CwiseBinaryOp<
        internal::scalar_sum_op<float>,
        VectorXf, VectorXf
    >;

// Why not
// MatrixBase::operator+(const CWiseBinaryOp<...> &);

// CwiseBinaryOp is a type
// Don't want to redefine all operations
// for every possible combinations.
// CRTP
class CwiseBinaryOp : MatrixBase<CWiseBinaryOp>;

DEEP DIVE

VectorXf u = CwiseBinaryOp<...>(v, w);

Matrix& operator=(const MatrixBase<OtherDerived>& other) {
  return Base::operator=(other.derived());
}

Derived&
MatrixBase::operator=(const MatrixBase<OtherDerived>& other);
// With
Derived = VectorXf;
OtherDerived = CwiseBinaryOp<...>;

DEEP DIVE

VectorXf u = CwiseBinaryOp<...>(v, w);

// ASSIGNMENT STRATEGY SELECTION
Derived&
MatrixBase::operator=(const MatrixBase<OtherDerived>& other) {
    return internal::assign_selector<Derived,OtherDerived>::
        run(derived(), other.derived());
}

template<typename Derived, typename OtherDerived,
  bool EvalBeforeAssigning = int(OtherDerived::Flags) &
                             EvalBeforeAssigningBit,
  bool NeedToTranspose = ...>
struct internal::assign_selector;

DEEP DIVE

// ASSIGNMENT STRATEGY SELECTION
template<typename Derived, typename OtherDerived>
struct internal::assign_selector<Derived,OtherDerived,false,false>
{
    static Derived& run(Derived& dst, const OtherDerived& other) {
        return dst.lazyAssign(other.derived());
    }
}

template<...>
Derived& MatrixBase<Derived>::lazyAssign(
    const MatrixBase<OtherDerived>& other
) {
    internal::assign_impl<Derived, OtherDerived>
        ::run(derived(),other.derived());
    return derived();
}

// ASSIGNMENT EXECUTION
struct internal::assign_impl<
    Derived1, Derived2, LinearVectorization, NoUnrolling>
{
    static void run(Derived1 &dst, const Derived2 &src) {
        [...]
 
        // Unaligned copy
        for(int index = 0; index < alignedStart; index++)
            dst.copyCoeff(index, src);
 
        // Vectorized copy
        for(int index = alignedStart; index < alignedEnd; index += packetSize) {
            dst.template copyPacket<
                Derived2, Aligned, internal::assign_traits<Derived1,Derived2
            >::SrcAlignment>(index, src);
        }
 
        // Unaligned copy
        for(int index = alignedEnd; index < size; index++)
            dst.copyCoeff(index, src);
    }
};

// VECTORIZED ASSIGNMENT EXECUTION

// Vector copy loop from prev. slide
for(int index = alignedStart; index < alignedEnd; index += packetSize) {
    dst.template copyPacket<
        Derived2, Aligned, internal::assign_traits<Derived1,Derived2
    >::SrcAlignment>(index, src);
}
 
inline void MatrixBase<Derived>::copyPacket(
    int index, const MatrixBase<OtherDerived>& other
) {
  //        STORING DATA
  derived().template writePacket<StoreMode>(
      index, 
  //                  LOADING DATA
      other.derived().template packet<LoadMode>(index)
  );
}

// STORING
template<int StoreMode>
inline void writePacket(int index, const PacketScalar& x)
{
    internal::pstoret<Scalar, PacketScalar, StoreMode>
        (m_storage.data() + index, x);
}

template<typename Scalar, typename Packet, int LoadMode>
inline void internal::pstoret(Scalar* to, const Packet& from)
{
    if(LoadMode == Aligned)
        internal::pstore(to, from);
    else
        internal::pstoreu(to, from);
}

template<> inline void internal::pstore(
    float*  to, const __m128&  from
) {
    _mm_store_ps(to, from);
}

// LOADING and SUMMING
class CwiseBinaryOp {
    template<int LoadMode>
    inline PacketScalar packet(int index) const {
        //               SUM
        return m_functor.packetOp(
        //        LHS VECTOR LOAD
            m_lhs.template packet<LoadMode>(index), 
        //        RHS VECTOR LOAD
            m_rhs.template packet<LoadMode>(index)
        );
    }
};

// LOADS are similar to STORES

// LOADING and SUMMING
class CwiseBinaryOp {
    template<int LoadMode>
    inline PacketScalar packet(int index) const {
        //               SUM
        return m_functor.packetOp(...);
    }
};

template<typename Scalar> struct internal::scalar_sum_op {
    template<typename PacketScalar>
    inline const PacketScalar packetOp(
        const PacketScalar& a, const PacketScalar& b
    ) const {
        return internal::padd(a,b);
    }
};

template<> inline __m128  internal::padd (
    const __m128&  a, const __m128&  b
) { return _mm_add_ps(a,b); }

VectorXf u = v + w;

// Has now become

// Unaligned copy
for(int index = 0; index < alignedStart; index++)
    dst.copyCoeff(index, src);

// Vectorized copy
for(int index = alignedStart; index < alignedEnd; index += packetSize) {
    // _mm_store_ps, _mm_add_ps, _mm_load_ps
}

// Unaligned copy
for(int index = alignedEnd; index < size; index++)
    dst.copyCoeff(index, src);

EIGEN FEATURES

Allocator to auto align memory to use SSE
Integrated heuristic cache cost model (e.g. mat1 = mat2 * (mat3 + mat4))
Sparse matrices
Decompositions/Linear systems
"Unsupported" Module:
- FFT
- Autodiff (unsupported)
- General tensors
- Slicing/striding
- Convolutions
- CUDA/OpenCL/multithread

expression
templates
in eigen

why?

why?

why?

why?

why?

EAGER Evaluation
CREATES TEMPORARIES

why?

why?

LAZY evaluation
=

fast

hOW?

hOW?

hOW?

hOW?

hOW?

hOW?

hOW?

Templates

multiple types

multiple operations

Templates

multiple types

multiple operations

DEEP DIVE

DEEP DIVE

DEEP DIVE

DEEP DIVE

DEEP DIVE

EIGEN FEATURES

THE
END

Expression Templates in Eigen

Expression Templates in Eigen

svalorzen

expression templates in eigen

why?

why?

why?

why?

why?

EAGER Evaluation CREATES TEMPORARIES

why?

why?

LAZY evaluation =

fast

hOW?

hOW?

hOW?

hOW?

hOW?

hOW?

hOW?

Templates

multiple types

multiple operations

Templates

multiple types

multiple operations

DEEP DIVE

DEEP DIVE

DEEP DIVE

DEEP DIVE

DEEP DIVE

EIGEN FEATURES

THE END

Expression Templates in Eigen

More from svalorzen

expression
templates
in eigen

EAGER Evaluation
CREATES TEMPORARIES

LAZY evaluation
=

THE
END