expression
templates
in eigen

Expression templates
=
Templates that
represent expressions

why?

float a, b, c, d;

a = b + c + d;

why?

float a, b, c, d;

a = b + c + d;
a = (b + c) + d;

why?

Vec a, b, c, d;

a = b + c + d;
a = (b + c) + d;
    ^^^^^^^
   temporary

why?

Vec a, b, c, d;

a = b + c + d;
a = (b + c) + d;
    ^^^^^^^
   temporary

why?

EAGER Evaluation
CREATES TEMPORARIES

tmp1.x = b.x + c.x;
tmp1.y = b.y + c.y;
tmp1.z = b.z + c.z;

tmp2.x = tmp1.x + d.x;
tmp2.y = tmp1.y + d.y;
tmp2.z = tmp1.z + d.z;

a.x = tmp2.x;
a.y = tmp2.y;
a.z = tmp2.z;

why?

class Vec {
    Vec
    operator+ (const Vec& other) const
    {
      return Vector (
        x+other.x, 
        y+other.y, 
        z+other.z);
    }

    void
    operator= (const Vec& other)
    {
      x = other.x;
      y = other.y;
      z = other.z;
    }
};

why?

LAZY evaluation
=

fast

// Faster way
a.x = b.x + c.x + d.x;
a.y = b.y + c.y + d.y;
a.z = b.z + c.z + d.z;

hOW?

Vec a, b, c, d;

a = b + c + d;
a = (b + c) + d;
    ^^^^^^^
   temporary

hOW?

a = ((b + c) + d);

a = (BinarySum(b, c) + d);

a = BinarySum(
        BinarySum(b, c), 
        d
    );

hOW?

class BinarySum {
    float eval(size_t i) {
        return lhs.eval(i) + rhs.eval(i);
    }
};

void Vec::operator= (const BinarySum& other) {
    for (size_t i = 0; i < 3; ++i)
        data_[i] = other.eval(i);
}

hOW?

auto tmp1 = BinarySum(b, c);
auto tmp2 = BinarySum(tmp1, d);
a = tmp2;

hOW?

auto tmp1 = BinarySum(b, c);
auto tmp2 = BinarySum(tmp1, d);
a = tmp2;

for (size_t i = 0; i < 3; ++i) {
    a[i] = tmp2.eval(i);
}

hOW?

auto tmp1 = BinarySum(b, c);
auto tmp2 = BinarySum(tmp1, d);
a = tmp2;

for (size_t i = 0; i < 3; ++i) {
    a[i] = tmp2.eval(i);
    a[i] = tmp1.eval(i) + d[i];
}

hOW?

auto tmp1 = BinarySum(b, c);
auto tmp2 = BinarySum(tmp1, d);
a = tmp2;

for (size_t i = 0; i < 3; ++i) {
    a[i] = tmp2.eval(i);
    a[i] = tmp1.eval(i) + d[i];
    a[i] = b[i] + c[i] + d[i];
}

Templates

  • multiple types

  • multiple operations

UnaryOp<Op, Type>;
// sqrt, exp, neg, ...

BinaryOp<Op, TypeL, TypeR>;
// dot product, /, ...

CwiseBinaryOp<
    Op, TypeL, TypeR
>;
// +, -, cwise prod, ...

Templates

  • multiple types

  • multiple operations

UnaryOp<Op, Type>;
// sqrt, exp, neg, ...

BinaryOp<Op, TypeL, TypeR>;
// dot product, /, ...

CwiseBinaryOp<
    Op, TypeL, TypeR
>;
// +, -, cwise prod, ...

DEEP DIVE

VectorXf v, w;

VectorXf u = v + w;

DEEP DIVE

VectorXf u = v + w;

MatrixBase::operator+(const MatrixBase&) -> 
    CwiseBinaryOp<
        internal::scalar_sum_op<float>,
        VectorXf, VectorXf
    >;

// Why not
// MatrixBase::operator+(const CWiseBinaryOp<...> &);

// CwiseBinaryOp is a type
// Don't want to redefine all operations
// for every possible combinations.
// CRTP
class CwiseBinaryOp : MatrixBase<CWiseBinaryOp>;

DEEP DIVE

VectorXf u = CwiseBinaryOp<...>(v, w);

Matrix& operator=(const MatrixBase<OtherDerived>& other) {
  return Base::operator=(other.derived());
}

Derived&
MatrixBase::operator=(const MatrixBase<OtherDerived>& other);
// With
Derived = VectorXf;
OtherDerived = CwiseBinaryOp<...>;

DEEP DIVE

VectorXf u = CwiseBinaryOp<...>(v, w);

// ASSIGNMENT STRATEGY SELECTION
Derived&
MatrixBase::operator=(const MatrixBase<OtherDerived>& other) {
    return internal::assign_selector<Derived,OtherDerived>::
        run(derived(), other.derived());
}

template<typename Derived, typename OtherDerived,
  bool EvalBeforeAssigning = int(OtherDerived::Flags) &
                             EvalBeforeAssigningBit,
  bool NeedToTranspose = ...>
struct internal::assign_selector;

DEEP DIVE

// ASSIGNMENT STRATEGY SELECTION
template<typename Derived, typename OtherDerived>
struct internal::assign_selector<Derived,OtherDerived,false,false>
{
    static Derived& run(Derived& dst, const OtherDerived& other) {
        return dst.lazyAssign(other.derived());
    }
}

template<...>
Derived& MatrixBase<Derived>::lazyAssign(
    const MatrixBase<OtherDerived>& other
) {
    internal::assign_impl<Derived, OtherDerived>
        ::run(derived(),other.derived());
    return derived();
}
// ASSIGNMENT EXECUTION
struct internal::assign_impl<
    Derived1, Derived2, LinearVectorization, NoUnrolling>
{
    static void run(Derived1 &dst, const Derived2 &src) {
        [...]
 
        // Unaligned copy
        for(int index = 0; index < alignedStart; index++)
            dst.copyCoeff(index, src);
 
        // Vectorized copy
        for(int index = alignedStart; index < alignedEnd; index += packetSize) {
            dst.template copyPacket<
                Derived2, Aligned, internal::assign_traits<Derived1,Derived2
            >::SrcAlignment>(index, src);
        }
 
        // Unaligned copy
        for(int index = alignedEnd; index < size; index++)
            dst.copyCoeff(index, src);
    }
};
// VECTORIZED ASSIGNMENT EXECUTION

// Vector copy loop from prev. slide
for(int index = alignedStart; index < alignedEnd; index += packetSize) {
    dst.template copyPacket<
        Derived2, Aligned, internal::assign_traits<Derived1,Derived2
    >::SrcAlignment>(index, src);
}
 
inline void MatrixBase<Derived>::copyPacket(
    int index, const MatrixBase<OtherDerived>& other
) {
  //        STORING DATA
  derived().template writePacket<StoreMode>(
      index, 
  //                  LOADING DATA
      other.derived().template packet<LoadMode>(index)
  );
}
// STORING
template<int StoreMode>
inline void writePacket(int index, const PacketScalar& x)
{
    internal::pstoret<Scalar, PacketScalar, StoreMode>
        (m_storage.data() + index, x);
}

template<typename Scalar, typename Packet, int LoadMode>
inline void internal::pstoret(Scalar* to, const Packet& from)
{
    if(LoadMode == Aligned)
        internal::pstore(to, from);
    else
        internal::pstoreu(to, from);
}

template<> inline void internal::pstore(
    float*  to, const __m128&  from
) {
    _mm_store_ps(to, from);
}
// LOADING and SUMMING
class CwiseBinaryOp {
    template<int LoadMode>
    inline PacketScalar packet(int index) const {
        //               SUM
        return m_functor.packetOp(
        //        LHS VECTOR LOAD
            m_lhs.template packet<LoadMode>(index), 
        //        RHS VECTOR LOAD
            m_rhs.template packet<LoadMode>(index)
        );
    }
};

// LOADS are similar to STORES
// LOADING and SUMMING
class CwiseBinaryOp {
    template<int LoadMode>
    inline PacketScalar packet(int index) const {
        //               SUM
        return m_functor.packetOp(...);
    }
};

template<typename Scalar> struct internal::scalar_sum_op {
    template<typename PacketScalar>
    inline const PacketScalar packetOp(
        const PacketScalar& a, const PacketScalar& b
    ) const {
        return internal::padd(a,b);
    }
};

template<> inline __m128  internal::padd (
    const __m128&  a, const __m128&  b
) { return _mm_add_ps(a,b); }
VectorXf u = v + w;

// Has now become

// Unaligned copy
for(int index = 0; index < alignedStart; index++)
    dst.copyCoeff(index, src);

// Vectorized copy
for(int index = alignedStart; index < alignedEnd; index += packetSize) {
    // _mm_store_ps, _mm_add_ps, _mm_load_ps
}

// Unaligned copy
for(int index = alignedEnd; index < size; index++)
    dst.copyCoeff(index, src);

EIGEN FEATURES

  • Allocator to auto align memory to use SSE
  • Integrated heuristic cache cost model (e.g. mat1 = mat2 * (mat3 + mat4))
  • Sparse matrices
  • Decompositions/Linear systems
  • "Unsupported" Module:
    • FFT
    • Autodiff (unsupported)
    • General tensors
    • Slicing/striding
    • Convolutions
    • CUDA/OpenCL/multithread

THE
END

Expression Templates in Eigen

By svalorzen

Expression Templates in Eigen

  • 18