// TKBMS v1.0 -----------------------------------------------------
//
// PLATFORM   : ALL
// PRODUCT   : COMMON
// VISIBILITY   : PUBLIC
//
// ------------------------------------------------------TKBMS v1.0


template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::setNormalOfTriangle(Vec4 &result, Vec4_ a, Vec4_ b, Vec4_ c)
{
    Vec4 ba,ca;
    ba.setSub( b, a );
    ca.setSub( c, a );
    result.setCross( ba , ca);
}


#ifndef HK_VECTOR4fUTIL_atan2
template <>
HK_INLINE hkSimdFloat32 HK_CALL hkVector4UtilImpl<hkFloat32>::atan2(hkSimdFloat32Parameter y, hkSimdFloat32Parameter x)
{
    return hkSimdFloat32::fromFloat(hkMath::atan2(y.getReal(),x.getReal()));
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::atan2(hkVector4fParameter y, hkVector4fParameter x, hkVector4f& result)
{
    result.set(hkMath::atan2(y(0),x(0)), hkMath::atan2(y(1),x(1)), hkMath::atan2(y(2),x(2)), hkMath::atan2(y(3),x(3)));
}
template <>
HK_INLINE hkSimdFloat32 HK_CALL hkVector4UtilImpl<hkFloat32>::atan2(hkVector4fParameter v)
{
    return hkSimdFloat32::fromFloat(hkMath::atan2(v(1),v(0)));
}
#endif
#ifndef HK_VECTOR4dUTIL_atan2
template <>
HK_INLINE hkSimdDouble64 HK_CALL hkVector4UtilImpl<hkDouble64>::atan2(hkSimdDouble64Parameter y, hkSimdDouble64Parameter x)
{
    return hkSimdDouble64::fromFloat(hkMath::atan2(y.getReal(),x.getReal()));
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::atan2(hkVector4dParameter y, hkVector4dParameter x, hkVector4d& result)
{
    result.set(hkMath::atan2(y(0),x(0)), hkMath::atan2(y(1),x(1)), hkMath::atan2(y(2),x(2)), hkMath::atan2(y(3),x(3)));
}
template <>
HK_INLINE hkSimdDouble64 HK_CALL hkVector4UtilImpl<hkDouble64>::atan2(hkVector4dParameter v)
{
    return hkSimdDouble64::fromFloat(hkMath::atan2(v(1),v(0)));
}
#endif

// Average absolute error 0.063428
// Max absolute error 0.571373
// About 8.5x faster than ::atan2f for 4 simultaneous values
template <typename FT>
HK_INLINE typename hkFloatTypes<FT>::Scalar HK_CALL hkVector4UtilImpl<FT>::linearAtan2Approximation(Scalar_ y, Scalar_ x)
{
    const typename hkFloatTypes<FT>::Scalar c2 = hkFloatTypes<FT>::Scalar::fromFloat(typename hkFloatTypes<FT>::Float(2.3561944901923449288469825374596));
    const typename hkFloatTypes<FT>::Scalar pi_quarter = hkFloatTypes<FT>::Scalar::template getConstant<HK_QUADREAL_PI_QUARTER>();

    typename hkFloatTypes<FT>::Scalar abs_y; abs_y.setAbs(y);
    const typename hkFloatTypes<FT>::Scalar xPlusAbsY = x + abs_y;

    // case x<0
    typename hkFloatTypes<FT>::Scalar angle0;
    {
        typename hkFloatTypes<FT>::Scalar r; r.template setDiv<HK_ACC_23_BIT,HK_DIV_IGNORE>(xPlusAbsY, abs_y - x);
        angle0.setSubMul(c2, pi_quarter, r);
    }

    // case x>=0
    typename hkFloatTypes<FT>::Scalar angle1;
    {
        typename hkFloatTypes<FT>::Scalar r; r.template setDiv<HK_ACC_23_BIT,HK_DIV_IGNORE>(x - abs_y, xPlusAbsY);
        angle1.setSubMul(pi_quarter, pi_quarter, r);
    }

    typename hkFloatTypes<FT>::Scalar result;
    // select case
    result.setSelect(x.lessZero(), angle0, angle1);

    // invert symmetry
    result.setFlipSign(result, y.lessZero());

    return result;
}

template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::linearAtan2Approximation(Vec4_ y, Vec4_ x, Vec4& result)
{
    // 3 * pi/4
    static HK_ALIGN_DOUBLE( const typename hkFloatTypes<FT>::Float c2[4] ) = { typename hkFloatTypes<FT>::Float(2.3561944901923449288469825374596), typename hkFloatTypes<FT>::Float(2.3561944901923449288469825374596), typename hkFloatTypes<FT>::Float(2.3561944901923449288469825374596), typename hkFloatTypes<FT>::Float(2.3561944901923449288469825374596) };

    const Vec4& pi_quarter = Vec4::template getConstant<HK_QUADREAL_PI_QUARTER>();

    const Comparison x_less_zero = x.lessZero();
    const Comparison y_less_zero = y.lessZero();

    Vec4 abs_y; abs_y.setAbs(y);
    Vec4 xPlusAbsY; xPlusAbsY.setAdd(x, abs_y);

    // case x<0
    Vec4 angle0;
    {
        Vec4 d; d.setSub(abs_y, x);
        Vec4 r; r.template setDiv<HK_ACC_23_BIT,HK_DIV_IGNORE>(xPlusAbsY, d);
        angle0.setSubMul(*reinterpret_cast<const Vec4*>(c2), pi_quarter, r);
    }

    // case x>=0
    Vec4 angle1;
    {
        Vec4 n; n.setSub(x, abs_y);
        Vec4 r; r.template setDiv<HK_ACC_23_BIT,HK_DIV_IGNORE>(n, xPlusAbsY);
        angle1.setSubMul(pi_quarter, pi_quarter, r);
    }

    // select case
    result.setSelect(x_less_zero, angle0, angle1);

    // invert symmetry
    result.setFlipSign(result, y_less_zero);
}

// Average absolute error 0.063424
// Max absolute error 1.374024
// About 13x faster than ::atan2f for 4 simultaneous values
template <typename FT>
HK_INLINE typename hkFloatTypes<FT>::Scalar HK_CALL hkVector4UtilImpl<FT>::linearAtan2ApproximationRough(Scalar_ y, Scalar_ x)
{
    const Scalar c2 = Scalar::fromFloat(typename hkFloatTypes<FT>::Float(2.3561944901923449288469825374596));
    const Scalar pi_quarter = Scalar::template getConstant<HK_QUADREAL_PI_QUARTER>();

    Scalar abs_y; abs_y.setAbs(y);
    const Scalar xPlusAbsY = x + abs_y;

    // case x<0
    Scalar angle0;
    {
        Scalar r; r.template setDiv<HK_ACC_12_BIT,HK_DIV_IGNORE>(xPlusAbsY, abs_y - x);
        angle0.setSubMul(c2, pi_quarter, r);
    }

    // case x>=0
    Scalar angle1;
    {
        Scalar r; r.template setDiv<HK_ACC_12_BIT,HK_DIV_IGNORE>(x - abs_y, xPlusAbsY);
        angle1.setSubMul(pi_quarter, pi_quarter, r);
    }

    Scalar result;
    // select case
    result.setSelect(x.lessZero(), angle0, angle1);

    // invert symmetry
    result.setFlipSign(result, y.lessZero());

    return result;
}

template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::linearAtan2ApproximationRough(Vec4_ y, Vec4_ x, Vec4& result)
{
    // 3 * pi/4
    static HK_ALIGN_DOUBLE( const typename hkFloatTypes<FT>::Float c2[4] ) = { typename hkFloatTypes<FT>::Float(2.3561944901923449288469825374596), typename hkFloatTypes<FT>::Float(2.3561944901923449288469825374596), typename hkFloatTypes<FT>::Float(2.3561944901923449288469825374596), typename hkFloatTypes<FT>::Float(2.3561944901923449288469825374596) };

    const Vec4& pi_quarter = Vec4::template getConstant<HK_QUADREAL_PI_QUARTER>();

    const Comparison x_less_zero = x.lessZero();
    const Comparison y_less_zero = y.lessZero();

    Vec4 abs_y; abs_y.setAbs(y);
    Vec4 xPlusAbsY; xPlusAbsY.setAdd(x, abs_y);

    // case x<0
    Vec4 angle0;
    {
        Vec4 d; d.setSub(abs_y, x);
        Vec4 r; r.template setDiv<HK_ACC_12_BIT,HK_DIV_IGNORE>(xPlusAbsY, d);
        angle0.setSubMul(*reinterpret_cast<const Vec4*>(c2), pi_quarter, r);
    }

    // case x>=0
    Vec4 angle1;
    {
        Vec4 n; n.setSub(x, abs_y);
        Vec4 r; r.template setDiv<HK_ACC_12_BIT,HK_DIV_IGNORE>(n, xPlusAbsY);
        angle1.setSubMul(pi_quarter, pi_quarter, r);
    }

    // select case
    result.setSelect(x_less_zero, angle0, angle1);

    // invert symmetry
    result.setFlipSign(result, y_less_zero);
}

#ifndef HK_VECTOR4fUTIL_logApproximation
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::logApproximation(hkVector4fParameter v, hkVector4f& result)
{
    result.set(hkMath::logApproximation(v(0)), hkMath::logApproximation(v(1)), hkMath::logApproximation(v(2)), hkMath::logApproximation(v(3)));
}
#endif
#ifndef HK_VECTOR4dUTIL_logApproximation
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::logApproximation(hkVector4dParameter v, hkVector4d& result)
{
    result.set(hkMath::logApproximation(v(0)), hkMath::logApproximation(v(1)), hkMath::logApproximation(v(2)), hkMath::logApproximation(v(3)));
}
#endif

#ifndef HK_VECTOR4fUTIL_sinCos
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::sinCos(hkVector4fParameter r, hkVector4f& sines, hkVector4f& cosines)
{
      sines.set(hkMath::sin(r(0)),hkMath::sin(r(1)),hkMath::sin(r(2)),hkMath::sin(r(3)));
    cosines.set(hkMath::cos(r(0)),hkMath::cos(r(1)),hkMath::cos(r(2)),hkMath::cos(r(3)));
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::sinCos(hkVector4fParameter r, hkVector4f& sc)
{
    sc.set(hkMath::sin(r(0)),hkMath::cos(r(1)),hkMath::sin(r(2)),hkMath::cos(r(3)));
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::sinCos(hkSimdFloat32Parameter r, hkVector4f& sc)
{
    const hkFloat32 rr = r.getReal();
    const hkFloat32 sr = hkMath::sin(rr);
    const hkFloat32 cr = hkMath::cos(rr);
    sc.set(sr,cr,sr,cr);
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::sinCos(hkSimdFloat32Parameter r, hkSimdFloat32& s, hkSimdFloat32& c)
{
    const hkFloat32 rr = r.getReal();
    s.setFromFloat( hkMath::sin(rr) );
    c.setFromFloat( hkMath::cos(rr) );
}
#endif
#ifndef HK_VECTOR4dUTIL_sinCos
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::sinCos(hkVector4dParameter r, hkVector4d& sines, hkVector4d& cosines)
{
      sines.set(hkMath::sin(r(0)),hkMath::sin(r(1)),hkMath::sin(r(2)),hkMath::sin(r(3)));
    cosines.set(hkMath::cos(r(0)),hkMath::cos(r(1)),hkMath::cos(r(2)),hkMath::cos(r(3)));
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::sinCos(hkVector4dParameter r, hkVector4d& sc)
{
    sc.set(hkMath::sin(r(0)),hkMath::cos(r(1)),hkMath::sin(r(2)),hkMath::cos(r(3)));
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::sinCos(hkSimdDouble64Parameter r, hkVector4d& sc)
{
    const hkDouble64 rr = r.getReal();
    const hkDouble64 sr = hkMath::sin(rr);
    const hkDouble64 cr = hkMath::cos(rr);
    sc.set(sr,cr,sr,cr);
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::sinCos(hkSimdDouble64Parameter r, hkSimdDouble64& s, hkSimdDouble64& c)
{
    const hkDouble64 rr = r.getReal();
    s.setFromFloat( hkMath::sin(rr) );
    c.setFromFloat( hkMath::cos(rr) );
}
#endif

#ifndef HK_VECTOR4fUTIL_sinCosApproximation
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::sinCosApproximation(hkVector4fParameter r, hkVector4f& sc)
{
    sinCos(r,sc);
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::sinCosApproximation(hkSimdFloat32Parameter r, hkVector4f& sc)
{
    sinCos(r,sc);
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::sinCosApproximation(hkSimdFloat32Parameter r, hkSimdFloat32& s, hkSimdFloat32& c)
{
    sinCos(r,s,c);
}
#endif
#ifndef HK_VECTOR4dUTIL_sinCosApproximation
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::sinCosApproximation(hkVector4dParameter r, hkVector4d& sc)
{
    sinCos(r,sc);
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::sinCosApproximation(hkSimdDouble64Parameter r, hkVector4d& sc)
{
    sinCos(r,sc);
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::sinCosApproximation(hkSimdDouble64Parameter r, hkSimdDouble64& s, hkSimdDouble64& c)
{
    sinCos(r,s,c);
}
#endif

#ifndef HK_VECTOR4fUTIL_aSinAcos
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::aSin(hkVector4fParameter r, hkVector4f& sc)
{
    sc.set(hkMath::asin(r(0)),hkMath::asin(r(1)),hkMath::asin(r(2)),hkMath::asin(r(3)));
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::aCos(hkVector4fParameter r, hkVector4f& sc)
{
    sc.set(hkMath::acos(r(0)),hkMath::acos(r(1)),hkMath::acos(r(2)),hkMath::acos(r(3)));
}
template <>
HK_INLINE hkSimdFloat32 HK_CALL hkVector4UtilImpl<hkFloat32>::aSin(hkSimdFloat32Parameter r)
{
    return hkSimdFloat32::fromFloat( hkMath::asin(r.getReal()) );
}
template <>
HK_INLINE hkSimdFloat32 HK_CALL hkVector4UtilImpl<hkFloat32>::aCos(hkSimdFloat32Parameter r)
{
    return hkSimdFloat32::fromFloat( hkMath::acos(r.getReal()) );
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::aSinAcos(hkVector4fParameter r, hkVector4f& sc)
{
    sc.set(hkMath::asin(r(0)),hkMath::acos(r(1)),hkMath::asin(r(2)),hkMath::acos(r(3)));
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::aSinAcos(hkSimdFloat32Parameter r, hkVector4f& sc)
{
    const hkFloat32 rr = r.getReal();
    const hkFloat32 sr = hkMath::asin(rr);
    const hkFloat32 cr = hkMath::acos(rr);
    sc.set(sr,cr,sr,cr);
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::aSinAcos(hkSimdFloat32Parameter r, hkSimdFloat32& s, hkSimdFloat32& c)
{
    const hkFloat32 rr = r.getReal();
    s.setFromFloat( hkMath::asin(rr) );
    c.setFromFloat( hkMath::acos(rr) );
}
#endif
#ifndef HK_VECTOR4dUTIL_aSinAcos
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::aSin(hkVector4dParameter r, hkVector4d& sc)
{
    sc.set(hkMath::asin(r(0)),hkMath::asin(r(1)),hkMath::asin(r(2)),hkMath::asin(r(3)));
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::aCos(hkVector4dParameter r, hkVector4d& sc)
{
    sc.set(hkMath::acos(r(0)),hkMath::acos(r(1)),hkMath::acos(r(2)),hkMath::acos(r(3)));
}
template <>
HK_INLINE hkSimdDouble64 HK_CALL hkVector4UtilImpl<hkDouble64>::aSin(hkSimdDouble64Parameter r)
{
    return hkSimdDouble64::fromFloat( hkMath::asin(r.getReal()) );
}
template <>
HK_INLINE hkSimdDouble64 HK_CALL hkVector4UtilImpl<hkDouble64>::aCos(hkSimdDouble64Parameter r)
{
    return hkSimdDouble64::fromFloat( hkMath::acos(r.getReal()) );
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::aSinAcos(hkVector4dParameter r, hkVector4d& sc)
{
    sc.set(hkMath::asin(r(0)),hkMath::acos(r(1)),hkMath::asin(r(2)),hkMath::acos(r(3)));
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::aSinAcos(hkSimdDouble64Parameter r, hkVector4d& sc)
{
    const hkDouble64 rr = r.getReal();
    const hkDouble64 sr = hkMath::asin(rr);
    const hkDouble64 cr = hkMath::acos(rr);
    sc.set(sr,cr,sr,cr);
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::aSinAcos(hkSimdDouble64Parameter r, hkSimdDouble64& s, hkSimdDouble64& c)
{
    const hkDouble64 rr = r.getReal();
    s.setFromFloat( hkMath::asin(rr) );
    c.setFromFloat( hkMath::acos(rr) );
}
#endif




#ifndef HK_VECTOR4fUTIL_convertToUint16WithClip
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::convertToUint16WithClip( hkVector4fParameter in, hkVector4fParameter offset, hkVector4fParameter scale, hkVector4fParameter min, hkVector4fParameter max, hkIntUnion64& out)
{
    hkVector4f x;
    x.setAdd(in,offset);
    x.mul(scale);
    x.setMin(x,max);
    x.setMax(x,min);

#if defined(HK_USING_GENERIC_INT_VECTOR_IMPLEMENTATION)
    hkVector4f convertToUint16WithClipMagic; convertToUint16WithClipMagic.setAll(65536.0f);
    x.add( convertToUint16WithClipMagic );

    union
    {
        hkQuadFloat32 q;
        hkUint32 i[4];
    } u;

    u.q = x.m_quad;
    // note implicitly throw away top 16 bits
    out.i16[0] = hkInt16(u.i[0] >> 7);
    out.i16[1] = hkInt16(u.i[1] >> 7);
    out.i16[2] = hkInt16(u.i[2] >> 7);
    out.i16[3] = hkInt16(u.i[3] >> 7);
#else
    hkIntVector i;
    i.setConvertF32toS32Unchecked(x);
    i.setConvertU32ToU16(i,i);
    i.store<2>(&out.u32[0]); // 2xu32 = 4xu16
#endif
}
#endif
#ifndef HK_VECTOR4dUTIL_convertToUint16WithClip
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::convertToUint16WithClip( hkVector4dParameter in, hkVector4dParameter offset, hkVector4dParameter scale, hkVector4dParameter min, hkVector4dParameter max, hkIntUnion64& out)
{
    hkVector4d x;
    x.setAdd(in,offset);
    x.mul(scale);
    x.setMin(x,max);
    x.setMax(x,min);

    hkIntVector i;
    i.setConvertF32toS32Unchecked(x);
    i.setConvertU32ToU16(i,i);
    i.store<2>(&out.u32[0]); // 2xu32 = 4xu16
}
#endif

#ifndef HK_VECTOR4fUTIL_convertToUint32
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::convertToUint32(hkVector4fParameter in, hkVector4fParameter offset, hkVector4fParameter scale, _Out_writes_all_(4) hkUint32* out)
{
    hkVector4f x;
    x.setAdd(in,offset);
    x.mul(scale);
    hkIntVector i;
    i.setConvertF32toU32(x);
    i.store<4>(out);
}
#endif
#ifndef HK_VECTOR4dUTIL_convertToUint32
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::convertToUint32( hkVector4dParameter in, hkVector4dParameter offset, hkVector4dParameter scale, hkUint32* out)
{
    hkVector4d x;
    x.setAdd(in,offset);
    x.mul(scale);
    hkIntVector i;
    i.setConvertF32toU32(x);
    i.store<4>(out);
}
#endif

#ifndef HK_VECTOR4fUTIL_convertToUint32WithClip
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::convertToUint32WithClip( hkVector4fParameter in, hkVector4fParameter offset, hkVector4fParameter scale, hkVector4fParameter min, hkVector4fParameter max, _Out_writes_all_(4) hkUint32* out)
{
    hkVector4f x;
    x.setAdd(in,offset);
    x.mul(scale);
    x.setMin(x,max);
    x.setMax(x,min);
    hkIntVector i;
    i.setConvertF32toU32(x);
    i.store<4>(out);
}
#endif
#ifndef HK_VECTOR4dUTIL_convertToUint32WithClip
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::convertToUint32WithClip( hkVector4dParameter in, hkVector4dParameter offset, hkVector4dParameter scale, hkVector4dParameter min, hkVector4dParameter max, hkUint32* out)
{
    hkVector4d x;
    x.setAdd(in,offset);
    x.mul(scale);
    x.setMin(x,max);
    x.setMax(x,min);
    hkIntVector i;
    i.setConvertF32toU32(x);
    i.store<4>(out);
}
#endif


template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::calculatePerpendicularVector(Vec4_ vectorIn, Vec4& biVectorOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    Vec4 h; h.setXYZ_0( vectorIn );
    Vec4 d0; d0.template setPermutation<hkVectorPermutation::YXWW>(h);
    Vec4 d1; d1.template setPermutation<hkVectorPermutation::ZWXW>(h);
    const Scalar l0 = vectorIn.template lengthSquared<2>(); // same as d0.lengthSquared<3>
    const Scalar l1 = d1.template lengthSquared<4>();           // actually 3 but .w is zero and lengthSquared<4> is faster
    d0.template setNeg<1>(d0);
    d1.template setNeg<1>(d1);
    const Comparison c = l0.less( l1 );
    biVectorOut.setSelect(c,d1,d0);
#else
    // find the indices of (one of) the smallest component(s) and the indices of
    // the remaining two components in the vector
    int min = 0;
    int ok1 = 1;
    int ok2 = 2;

    typename hkFloatTypes<FT>::Float a0 = hkMath::fabs(vectorIn(0));
    typename hkFloatTypes<FT>::Float a1 = hkMath::fabs(vectorIn(1));
    typename hkFloatTypes<FT>::Float a2 = hkMath::fabs(vectorIn(2));

    // sort the indices to make min index point to the smallest
    if( a1 < a0 )
    {
        ok1 = 0;
        min = 1;
        a0 = a1;
    }

    if(a2 < a0)
    {
        ok2 = min;
        min = 2;
    }

    biVectorOut.setZero();
    biVectorOut(ok1) =  vectorIn(ok2);
    biVectorOut(ok2) = -vectorIn(ok1);
#endif
}



template <typename FT>
template<bool INVERSE2>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::calculatePerpendicularNormalizedVectors(Vec4_ vectorIn, Vec4& biVectorOut, Vec4& bi2VectorOut)
{
    HK_ASSERT_NO_MSG( 0xf034fd5f, vectorIn.template isNormalized<3>(1e-3f) );
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    Vec4 h; h.setXYZ_0( vectorIn );
    Vec4 d0; d0.template setPermutation<hkVectorPermutation::YXWW>(h);
    Vec4 d1; d1.template setPermutation<hkVectorPermutation::ZWXW>(h);
    const Scalar l0 = vectorIn.template lengthSquared<2>(); // same as d0.lengthSquared<3>
    const Scalar l1 = d1.template lengthSquared<4>();           // actually 3 but .w is zero and lengthSquared<4> is faster
    Scalar len; len.setMax( l0, l1 );
    const Comparison c = l0.less( l1 );
    d0.template setNeg<1>(d0);
    d1.template setNeg<1>(d1);
    Vec4 o0; o0.setSelect(c,d1,d0);
    Vec4 o1;
    if ( INVERSE2   )
    {
        o1.setCross( o0, vectorIn );
    }
    else
    {
        o1.setCross( vectorIn, o0 );
    }
    Scalar invLen = len.template sqrtInverse<HK_ACC_MID, HK_SQRT_IGNORE>(  );
    biVectorOut. setMul( invLen, o0 );
    bi2VectorOut.setMul( invLen, o1 );
#else
    // find the indices of (one of) the smallest component(s) and the indices of
    // the remaining two components in the vector
    int min = 0;
    int ok1 = 1;
    int ok2 = 2;

    typename hkFloatTypes<FT>::Float a0 = hkMath::fabs(vectorIn(0));
    typename hkFloatTypes<FT>::Float a1 = hkMath::fabs(vectorIn(1));
    typename hkFloatTypes<FT>::Float a2 = hkMath::fabs(vectorIn(2));

    // sort the indices to make min index point to the smallest
    if( a1 < a0 )
    {
        ok1 = 0;
        min = 1;
        a0 = a1;
    }

    if(a2 < a0)
    {
        ok2 = min;
        min = 2;
    }

    biVectorOut.setZero();
    biVectorOut(ok1) =  vectorIn(ok2);
    biVectorOut(ok2) = -vectorIn(ok1);
    biVectorOut.template normalize<3,HK_ACC_MID, HK_SQRT_IGNORE>();
    if(INVERSE2)
    {
        bi2VectorOut.setCross( biVectorOut, vectorIn );
    }
    else
    {
        bi2VectorOut.setCross( vectorIn, biVectorOut );
    }
#endif
}



// vector containing col-major 2x2 matrix [r0c0, r1c0, r0c1, r1c1]
template <typename FT>
HK_INLINE hkResult HK_CALL hkVector4UtilImpl<FT>::invert2x2Matrix(Vec4_ m, Scalar_ tolerance, Vec4& out)
{
    Vec4 mt; mt.template setPermutation<hkVectorPermutation::WZWZ>(m);
    Vec4 p; p.setMul(m,mt);
    const Scalar D = p.template getComponent<0>() - p.template getComponent<1>();
    Scalar absD; absD.setAbs(D);

    if (absD.isLessEqual(tolerance))
    {
        return HK_FAILURE;  // may be an ordinary result
    }

    const Scalar DI = D.reciprocal();

    Vec4 o;
    o.template setPermutation<hkVectorPermutation::YZWX>(m);
    o.template setNeg<2>(o);
    o.mul(DI);

    out.template setPermutation<hkVectorPermutation::ZYXW>(o);

    return HK_SUCCESS;
}


template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::transformPoints( const Transform& t, _In_reads_(numVectors) const Vec4* vectorsIn, int numVectors, _Inout_updates_all_(numVectors) Vec4* vectorsOut )
{
    HK_MATH_ASSERT( 0xf0237abd, numVectors > 0, "At least one element required");
    Transform unaliased = t;
    int i = numVectors - 1;
    do
    {
        vectorsOut[i]._setTransformedPos( unaliased, vectorsIn[i] );
    } while( --i >= 0 );
}


template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::transformPoints(const QTransform& t, _In_reads_(numVectors) const Vec4* vectorsIn, int numVectors, _Inout_updates_all_(numVectors) Vec4* vectorsOut)
{
    HK_MATH_ASSERT( 0xf0237abd, numVectors > 0, "At least one element required");
    QTransform unaliased = t;
    int i = numVectors - 1;
    do
    {
        vectorsOut[i]._setTransformedPos( unaliased, vectorsIn[i] );
    } while( --i >= 0 );
}


template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::transformPoints(const QsTransform& t, _In_reads_(numVectors) const Vec4* vectorsIn, int numVectors, _Inout_updates_all_(numVectors) Vec4* vectorsOut)
{
    HK_MATH_ASSERT( 0xf0237abd, numVectors > 0, "At least one element required");
    QsTransform unaliased = t;
    int i = numVectors - 1;
    do
    {
        vectorsOut[i]._setTransformedPos( unaliased, vectorsIn[i] );
    } while( --i >= 0 );
}


template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::transformPlaneEquations(const Transform& t, _In_reads_(numPlanes) const Vec4* HK_RESTRICT planes, int numPlanes, _Inout_updates_all_(numPlanes) Vec4* HK_RESTRICT planesOut)
{
    const Rotation rotation = t.getRotation();  // we are making a copy here to allow for better compiler support

    Vec4 tr;    tr.template setNeg<4>(t.getTranslation());
    Vec4 pivotShift; pivotShift._setRotatedInverseDir( rotation, tr );

    int p = numPlanes - 1;
    do
    {
        const Scalar newDist = planes->dot4xyz1(pivotShift);

        Vec4 h; h._setRotatedDir(rotation, planes[0] );
        planesOut->setXYZ_W(h, newDist);

        planes++;
        planesOut++;
    }
    while ( --p >=0 );
}


template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::transformPlaneEquation( const Transform& transform, Vec4_ plane, Vec4& planeOut )
{
    Vec4    worldPlane; worldPlane._setRotatedDir(transform.getRotation(), plane);
    planeOut.setXYZ_W( worldPlane, plane.template getComponent<3>() - worldPlane.template dot<3>(transform.getTranslation()) );
}

template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::transformInversePlaneEquation( const Transform& transform, Vec4_ plane, Vec4& planeOut )
{
    Scalar offset =   plane.dot3(transform.getTranslation() );
    Vec4 worldPlane;    worldPlane._setRotatedInverseDir(transform.getRotation(), plane);
    planeOut.setXYZ_W( worldPlane, plane.getW() + offset );
}


template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::transformAndScalePlane( const Transform& transform, const Vec4& scale, Vec4& planeInOut )
{
    // Transform a plane equation
    //  n^T(x-p) = 0
    //
    //    becomes
    //
    //  Tn(n)^T(x-T(p)) = 0,  where   T(x)=RSx+t   and   Tn(x) = RS^-1x
    //
    //    T is hkQsTransformf, and Tn is the inverse transpose of RS, used
    //    to transform normal directions.
    //
    //  (RS^-1 n)^T( x - (RSp+t)) = 0
    //  (RS^-1 n)^T x - (RS^-1 n)^T(RSp+t)) = 0
    //  (RS^-1 n)^T x - n^T S^-1 R^T R S p - (RS^-1 n)^T t = 0
    //                      \-identity-/
    //    so
    //
    //   n'^T x - n^T p - n'^T t = 0   where  n' = RS^-1 n
    //  \xyz/     \------w-----/
    //  since we know n and n^T p, we only need to compute n'
    //  and n'^T t. Finally we must scale the equation so that
    //  the new normal (xyz) is unit length

    HK_MATH_ASSERT( 0xf0237ab1, planeInOut.notEqualZero().template anyIsSet<hkVector4ComparisonMask::MASK_XYZ>(), "invalid plane definition"); // note we do not require normal length
    const Scalar w = planeInOut.getW();
    const Vec4 translation = transform.getTranslation();
    HK_MATH_ASSERT( 0xf0237ab0, scale.greater(Vec4::template getConstant<HK_QUADREAL_EPS>()).allAreSet(), "scale must not be smaller than numeric epsilon value to not destroy the plane definition");
    Vec4 invScale; invScale.template setReciprocal<HK_ACC_MID,HK_DIV_IGNORE>(scale);
    Vec4 planeLocal; planeLocal.setMul(planeInOut, invScale);
    planeLocal._setRotatedDir(transform.getRotation(), planeLocal);
    const Scalar invNorm = planeLocal.template lengthInverse<3,HK_ACC_MID,HK_SQRT_IGNORE>();
    const Scalar dot = planeLocal.template dot<3>(translation);
    planeLocal.setW( w-dot );
    planeLocal.mul(invNorm); // notice w must be scaled as well
    planeInOut = planeLocal;
}

template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::scalePlane( const Vec4& scale, Vec4& planeInOut )
{
    HK_MATH_ASSERT( 0xf0237ab2, planeInOut.notEqualZero().template anyIsSet<hkVector4ComparisonMask::MASK_XYZ>(), "invalid plane definition"); // note we do not require normal length
    // By setting R = identity in the derivation in transformAndScalePlane(),
    // we get a slightly simplified method for scaling plane equations
    HK_ON_DEBUG(typename hkFloatTypes<FT>::Vector absScale; absScale.setAbs(scale));
    HK_MATH_ASSERT( 0xf0237ab3, absScale.greater(Vec4::template getConstant<HK_QUADREAL_EPS>()).allAreSet(), "scale must not be smaller than numeric epsilon value to not destroy the plane definition");

    const Scalar w = planeInOut.getW();
    Vec4 invScale;     invScale.template setReciprocal<HK_ACC_MID,HK_DIV_IGNORE>(scale);
    Vec4 planeLocal;   planeLocal.setMul(planeInOut, invScale);
    const Scalar invNorm = planeLocal.template lengthInverse<3,HK_ACC_MID,HK_SQRT_IGNORE>();
    planeLocal.setW( w  );
    planeLocal.mul(invNorm); // notice w must be scaled as well
    planeInOut = planeLocal;
}

template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::translatePlane( const Vec4& translation, Vec4& planeInOut )
{
    HK_MATH_ASSERT( 0xf0237ab4, planeInOut.notEqualZero().template anyIsSet<hkVector4ComparisonMask::MASK_XYZ>(), "invalid plane definition"); // note we do not require normal length
    // By setting S = R = identity in the derivation in transformAndScalePlane(),
    Vec4 planeLocal = planeInOut;
    const Scalar w = planeLocal.getW();
    const Scalar dot = planeLocal.template dot<3>(translation);
    planeLocal.setW( w-dot );
    planeInOut = planeLocal;
}


template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::mul4xyz1Points(const Transform& t, _In_reads_(numVectors) const Vec4* vectorsIn, int numVectors, _Inout_updates_all_(numVectors) Vec4* vectorsOut)
{
    HK_MATH_ASSERT( 0xf0237abe, numVectors > 0, "At least one element required");
    Transform unaliased = t;
#if defined(HK_PLATFORM_MAC) || defined(HK_PLATFORM_IOS_SIM)
    for (int i=0; i<numVectors; ++i)
    {
        vectorsOut[i]._setTransformedPos( unaliased, vectorsIn[i] );
    }
#else
    int i = numVectors - 1;
    do
    {
        vectorsOut[i]._setTransformedPos( unaliased, vectorsIn[i] );
    } while( --i >= 0 );
#endif
}


template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::transformSpheres(const Transform& t, _In_reads_(numVectors) const Vec4* vectorsIn, int numVectors, _Inout_updates_all_(numVectors) Vec4* vectorsOut)
{
    HK_MATH_ASSERT( 0xf0237abf, numVectors > 0, "At least one element required");
    Transform unaliased = t;
#if defined(HK_PLATFORM_MAC) || defined(HK_PLATFORM_IOS_SIM)
    for (int i=0; i<numVectors; ++i)
    {
        Vec4 v;
        v._setTransformedPos( unaliased, vectorsIn[i] ); // position
        v.setW(vectorsIn[i]);           // radius
        vectorsOut[i] = v;
    }
#else
    int i = numVectors - 1;
    do
    {
        Vec4 v;
        v._setTransformedPos( unaliased, vectorsIn[i] ); // position
        v.setW(vectorsIn[i]);           // radius
        vectorsOut[i] = v;
    } while( --i >= 0 );
#endif
}


template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::transformSpheres(const QsTransform& t, _In_reads_(numVectors) const Vec4* vectorsIn, int numVectors, _Inout_updates_all_(numVectors) Vec4* vectorsOut)
{
    HK_MATH_ASSERT( 0xf0237abf, numVectors > 0, "At least one element required");
    QsTransform unaliased = t;
#if defined(HK_PLATFORM_MAC) || defined(HK_PLATFORM_IOS_SIM)
    for (int i=0; i<numVectors; ++i)
    {
        Vec4 v;
        v._setTransformedPos( unaliased, vectorsIn[i] ); // position
        v.setW(vectorsIn[i]);           // radius
        vectorsOut[i] = v;
    }
#else
    int i = numVectors - 1;
    do
    {
        Vec4 v;
        v._setTransformedPos( unaliased, vectorsIn[i] ); // position
        v.setW(vectorsIn[i]);           // radius
        vectorsOut[i] = v;
    } while( --i >= 0 );
#endif
}


template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::rotatePoints(const Matrix3& t, _In_reads_(numVectors) const Vec4* vectorsIn, int numVectors, _Inout_updates_all_(numVectors) Vec4* vectorsOut)
{
    HK_MATH_ASSERT( 0xf0237ac0, numVectors > 0, "At least one element required");
    Matrix3 unaliased = t;
#if defined(HK_PLATFORM_MAC) || defined(HK_PLATFORM_IOS_SIM)
    for (int i=0; i<numVectors; ++i)
    {
        vectorsOut[i]._setRotatedDir( unaliased, vectorsIn[i] );
    }
#else
    int i = numVectors - 1;
    do
    {
        vectorsOut[i]._setRotatedDir( unaliased, vectorsIn[i] );
    } while( --i >= 0 );
#endif
}

#ifndef HK_VECTOR4fUTIL_transpose
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::transpose(hkVector4f& v0, hkVector4f& v1, hkVector4f& v2)
{
    HK_TRANSPOSE3f(v0, v1, v2);
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::transpose(hkVector4f& v0, hkVector4f& v1, hkVector4f& v2, hkVector4f& v3)
{
    HK_TRANSPOSE4f(v0, v1, v2, v3);
}
#endif
#ifndef HK_VECTOR4dUTIL_transpose
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::transpose(hkVector4d& v0, hkVector4d& v1, hkVector4d& v2)
{
    HK_TRANSPOSE3d(v0, v1, v2);
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::transpose(hkVector4d& v0, hkVector4d& v1, hkVector4d& v2, hkVector4d& v3)
{
    HK_TRANSPOSE4d(v0, v1, v2, v3);
}
#endif


#ifndef HK_VECTOR4fUTIL_rotateInversePoints
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::rotateInversePoints(const hkRotationf& t, _In_reads_(numVectors) const hkVector4f* vectorsIn, int numVectors, _Inout_updates_all_(numVectors) hkVector4f* vectorsOut)
{
    HK_MATH_ASSERT( 0xf0237ac1, numVectors > 0, "At least one element required");

#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4f c0 = t.getColumn<0>();
    hkVector4f c1 = t.getColumn<1>();
    hkVector4f c2 = t.getColumn<2>();
    HK_TRANSPOSE3f(c0,c1,c2);

    for( int i = 0; i < numVectors; ++i )
    {
        const hkVector4f b = vectorsIn[i];
        const hkSimdFloat32 b0 = b.getComponent<0>();
        hkVector4f r0; r0.setMul( c0, b0 );
        const hkSimdFloat32 b1 = b.getComponent<1>();
        hkVector4f r1; r1.setMul( c1, b1 );
        const hkSimdFloat32 b2 = b.getComponent<2>();
        hkVector4f r2; r2.setMul( c2, b2 );

        hkVector4f out;
        out.setAdd(r0,r1);
        out.add(r2);

        vectorsOut[i] = out;
    }
#else
    hkRotationf unaliased = t;
#if defined(HK_PLATFORM_MAC) || defined(HK_PLATFORM_IOS_SIM)
    for (int i=0; i<numVectors; ++i)
    {
        vectorsOut[i]._setRotatedInverseDir( unaliased, vectorsIn[i] );
    }
#else
    int i = numVectors - 1;
    do
    {
        vectorsOut[i]._setRotatedInverseDir( unaliased, vectorsIn[i] );
    } while( --i >= 0 );
#endif
#endif
}
#endif
#ifndef HK_VECTOR4dUTIL_rotateInversePoints
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::rotateInversePoints(const hkRotationd& t, _In_reads_(numVectors) const hkVector4d* vectorsIn, int numVectors, _Inout_updates_all_(numVectors) hkVector4d* vectorsOut)
{
    HK_MATH_ASSERT( 0xf0237ac1, numVectors > 0, "At least one element required");

#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4d c0 = t.getColumn<0>();
    hkVector4d c1 = t.getColumn<1>();
    hkVector4d c2 = t.getColumn<2>();
    HK_TRANSPOSE3d(c0,c1,c2);

    for( int i = 0; i < numVectors; ++i )
    {
        const hkVector4d b = vectorsIn[i];
        const hkSimdDouble64 b0 = b.getComponent<0>();
        hkVector4d r0; r0.setMul( c0, b0 );
        const hkSimdDouble64 b1 = b.getComponent<1>();
        hkVector4d r1; r1.setMul( c1, b1 );
        const hkSimdDouble64 b2 = b.getComponent<2>();
        hkVector4d r2; r2.setMul( c2, b2 );

        hkVector4d out;
        out.setAdd(r0,r1);
        out.add(r2);

        vectorsOut[i] = out;
    }
#else
    hkRotationd unaliased = t;
#if defined(HK_PLATFORM_MAC) || defined(HK_PLATFORM_IOS_SIM)
    for (int i=0; i<numVectors; ++i)
    {
        vectorsOut[i]._setRotatedInverseDir( unaliased, vectorsIn[i] );
    }
#else
    int i = numVectors - 1;
    do
    {
        vectorsOut[i]._setRotatedInverseDir( unaliased, vectorsIn[i] );
    } while( --i >= 0 );
#endif
#endif
}
#endif

template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::setMul( const Matrix3& aTb, const Matrix3& bTc, Matrix3& aTcOut )
{
    HK_ASSERT_NO_MSG(0x6d9d1d43,  &aTcOut != &aTb );
    HK_ASSERT_NO_MSG(0x64a8df81,  &aTcOut != &bTc );
    rotatePoints( aTb, &bTc.getColumn(0), 3, &aTcOut.getColumn(0) );
}

template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::setInverseMul( const Rotation& bTa, const Matrix3& bTc, Matrix3& aTcOut )
{
    rotateInversePoints( bTa, &bTc.getColumn(0), 3, &aTcOut.getColumn(0) );
}

template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::_setMulInverseMul(const Transform& bTa, const Transform &bTc, _Inout_ Transform* tOut)
{
    Vec4 h; h.setSub(bTc.getTranslation(), bTa.getTranslation() );

    Vec4 tmpRot0;   tmpRot0._setRotatedInverseDir( bTa.getRotation(), bTc.template getColumn<0>() );
    Vec4 tmpRot1;   tmpRot1._setRotatedInverseDir( bTa.getRotation(), bTc.template getColumn<1>() );
    Vec4 tmpRot2;   tmpRot2._setRotatedInverseDir( bTa.getRotation(), bTc.template getColumn<2>() );
    Vec4 tmpRot3;   tmpRot3._setRotatedInverseDir( bTa.getRotation(), h);
    tOut->template setColumn<0>(tmpRot0);
    tOut->template setColumn<1>(tmpRot1);
    tOut->template setColumn<2>(tmpRot2);
    tOut->template setColumn<3>(tmpRot3);
}


template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::buildOrthonormal( Vec4_ dir, Matrix3& out )
{
    Vec4 perp;
    calculatePerpendicularVector( dir, perp );
    perp.template normalize<3>();

    Vec4 cross;
    cross.setCross( dir, perp );

    out.template setColumn<0>(dir);
    out.template setColumn<1>(perp);
    out.template setColumn<2>(cross);
}

template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::buildOrthonormal( Vec4_ dir, Vec4_ dir2, Matrix3& out )
{
    Vec4 cross;
    cross.setCross( dir, dir2 );
    if ( cross.template lengthSquared<3>() < hkFloatTypes<FT>::Scalar::template getConstant<HK_QUADREAL_EPS_SQRD>() )
    {
        buildOrthonormal( dir, out );
        return;
    }
    cross.template normalize<3>();

    Vec4 cross1;
    cross1.setCross( cross, dir );

    out.template setColumn<0>(dir);
    out.template setColumn<1>(cross1);
    out.template setColumn<2>(cross);
}


#ifndef HK_VECTOR4fUTIL_dot3_3vs3
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::dot3_3vs3(hkVector4fParameter a0, hkVector4fParameter b0, hkVector4fParameter a1, hkVector4fParameter b1, hkVector4fParameter a2, hkVector4fParameter b2, hkVector4f& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4f c0; c0.setMul(a0,b0);
    hkVector4f c1; c1.setMul(a1,b1);
    hkVector4f c2; c2.setMul(a2,b2);

    HK_TRANSPOSE3f(c0,c1,c2);

    dotsOut.setAdd(c0,c1);
    dotsOut.add(c2);
#else
    dotsOut.setComponent<0>( a0.dot<3>( b0 ) );
    dotsOut.setComponent<1>( a1.dot<3>( b1 ) );
    dotsOut.setComponent<2>( a2.dot<3>( b2 ) );
#if !defined(HK_DEBUG_SLOW)
    dotsOut.zeroComponent<3>();
#else
    HK_ON_DEBUG( ((hkUint32*)&dotsOut)[3] = 0xffffffff; )
#endif
#endif
}
#endif
#ifndef HK_VECTOR4dUTIL_dot3_3vs3
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::dot3_3vs3(hkVector4dParameter a0, hkVector4dParameter b0, hkVector4dParameter a1, hkVector4dParameter b1, hkVector4dParameter a2, hkVector4dParameter b2, hkVector4d& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4d c0; c0.setMul(a0,b0);
    hkVector4d c1; c1.setMul(a1,b1);
    hkVector4d c2; c2.setMul(a2,b2);

    HK_TRANSPOSE3d(c0,c1,c2);

    dotsOut.setAdd(c0,c1);
    dotsOut.add(c2);
#else
    dotsOut.setComponent<0>( a0.dot<3>( b0 ) );
    dotsOut.setComponent<1>( a1.dot<3>( b1 ) );
    dotsOut.setComponent<2>( a2.dot<3>( b2 ) );
#if !defined(HK_DEBUG_SLOW)
    dotsOut.zeroComponent<3>();
#else
    HK_ON_DEBUG( ((hkUint64*)&dotsOut)[3] = 0xffffffffffffffffull; )
#endif
#endif
}
#endif


#ifndef HK_VECTOR4fUTIL_dot3_4vs4
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::dot3_4vs4( hkVector4fParameter a0, hkVector4fParameter b0, hkVector4fParameter a1, hkVector4fParameter b1, hkVector4fParameter a2, hkVector4fParameter b2, hkVector4fParameter a3, hkVector4fParameter b3, hkVector4f& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4f c0; c0.setMul(a0,b0);
    hkVector4f c1; c1.setMul(a1,b1);
    hkVector4f c2; c2.setMul(a2,b2);
    hkVector4f c3; c3.setMul(a3,b3);

    HK_TRANSPOSE4f(c0,c1,c2,c3);

    dotsOut.setAdd(c0,c1);
    dotsOut.add(c2);
#else
    dotsOut.set( a0.dot<3>( b0 ),
        a1.dot<3>( b1 ),
        a2.dot<3>( b2 ),
        a3.dot<3>( b3 ) );
#endif
}
#endif
#ifndef HK_VECTOR4dUTIL_dot3_4vs4
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::dot3_4vs4( hkVector4dParameter a0, hkVector4dParameter b0, hkVector4dParameter a1, hkVector4dParameter b1, hkVector4dParameter a2, hkVector4dParameter b2, hkVector4dParameter a3, hkVector4dParameter b3, hkVector4d& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4d c0; c0.setMul(a0,b0);
    hkVector4d c1; c1.setMul(a1,b1);
    hkVector4d c2; c2.setMul(a2,b2);
    hkVector4d c3; c3.setMul(a3,b3);

    HK_TRANSPOSE4d(c0,c1,c2,c3);

    dotsOut.setAdd(c0,c1);
    dotsOut.add(c2);
#else
    dotsOut.set( a0.dot<3>( b0 ),
        a1.dot<3>( b1 ),
        a2.dot<3>( b2 ),
        a3.dot<3>( b3 ) );
#endif
}
#endif


#ifndef HK_VECTOR4fUTIL_dot4_3vs3
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::dot4_3vs3(hkVector4fParameter a0, hkVector4fParameter b0, hkVector4fParameter a1, hkVector4fParameter b1, hkVector4fParameter a2, hkVector4fParameter b2, hkVector4f& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4f c0; c0.setMul(a0, b0);
    hkVector4f c1; c1.setMul(a1, b1);
    hkVector4f c2; c2.setMul(a2, b2);

    hkVector4f c3 = c2;
    HK_TRANSPOSE4f(c0, c1, c2, c3);
    hkVector4f s; s.setAdd(c2, c3);
    dotsOut.setAdd(c0, c1);
    dotsOut.add(s);
#else
    dotsOut.set(a0.dot<4>(b0), a1.dot<4>(b1), a2.dot<4>(b2), hkSimdFloat32_0);
    HK_ON_DEBUG(((hkUint32*)&dotsOut)[3] = 0xffffffff;)
#endif
}
#endif
#ifndef HK_VECTOR4dUTIL_dot4_3vs3
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::dot4_3vs3(hkVector4dParameter a0, hkVector4dParameter b0, hkVector4dParameter a1, hkVector4dParameter b1, hkVector4dParameter a2, hkVector4dParameter b2, hkVector4d& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4d c0; c0.setMul(a0, b0);
    hkVector4d c1; c1.setMul(a1, b1);
    hkVector4d c2; c2.setMul(a2, b2);

    hkVector4d c3 = c2;
    HK_TRANSPOSE4d(c0, c1, c2, c3);
    hkVector4d s; s.setAdd(c2, c3);
    dotsOut.setAdd(c0, c1);
    dotsOut.add(s);
#else
    dotsOut.set(a0.dot<4>(b0), a1.dot<4>(b1), a2.dot<4>(b2), hkSimdDouble64_0);
    HK_ON_DEBUG(((hkUint64*)&dotsOut)[3] = 0xffffffffffffffffull;)
#endif
}
#endif


#ifndef HK_VECTOR4fUTIL_dot4_4vs4
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::dot4_4vs4( hkVector4fParameter a0, hkVector4fParameter b0, hkVector4fParameter a1, hkVector4fParameter b1, hkVector4fParameter a2, hkVector4fParameter b2, hkVector4fParameter a3, hkVector4fParameter b3, hkVector4f& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4f c0; c0.setMul(a0,b0);
    hkVector4f c1; c1.setMul(a1,b1);
    hkVector4f c2; c2.setMul(a2,b2);
    hkVector4f c3; c3.setMul(a3,b3);

    HK_TRANSPOSE4f(c0,c1,c2,c3);

    hkVector4f s; s.setAdd(c2, c3);
    dotsOut.setAdd(c0,c1);
    dotsOut.add(s);
#else
    dotsOut.set( a0.dot<4>( b0 ),
        a1.dot<4>( b1 ),
        a2.dot<4>( b2 ),
        a3.dot<4>( b3 ) );
#endif
}
#endif
#ifndef HK_VECTOR4dUTIL_dot4_4vs4
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::dot4_4vs4( hkVector4dParameter a0, hkVector4dParameter b0, hkVector4dParameter a1, hkVector4dParameter b1, hkVector4dParameter a2, hkVector4dParameter b2, hkVector4dParameter a3, hkVector4dParameter b3, hkVector4d& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4d c0; c0.setMul(a0,b0);
    hkVector4d c1; c1.setMul(a1,b1);
    hkVector4d c2; c2.setMul(a2,b2);
    hkVector4d c3; c3.setMul(a3,b3);

    HK_TRANSPOSE4d(c0,c1,c2,c3);

    hkVector4d s; s.setAdd(c2, c3);
    dotsOut.setAdd(c0,c1);
    dotsOut.add(s);
#else
    dotsOut.set( a0.dot<4>( b0 ),
        a1.dot<4>( b1 ),
        a2.dot<4>( b2 ),
        a3.dot<4>( b3 ) );
#endif
}
#endif



#ifndef HK_VECTOR4fUTIL_dot3_1vs4
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::dot3_1vs4( hkVector4fParameter vectorIn, hkVector4fParameter a0, hkVector4fParameter a1, hkVector4fParameter a2, hkVector4fParameter a3, hkVector4f& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4f c0; c0.setMul(a0, vectorIn);
    hkVector4f c1; c1.setMul(a1, vectorIn);
    hkVector4f c2; c2.setMul(a2, vectorIn);
    hkVector4f c3; c3.setMul(a3, vectorIn);

    HK_TRANSPOSE4f(c0, c1, c2, c3);

    dotsOut.setAdd(c0, c1);
    dotsOut.add(c2);
#else
    dotsOut.set( a0.dot<3>( vectorIn ),
        a1.dot<3>( vectorIn ),
        a2.dot<3>( vectorIn ),
        a3.dot<3>( vectorIn ) );
#endif
}
#endif
#ifndef HK_VECTOR4dUTIL_dot3_1vs4
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::dot3_1vs4( hkVector4dParameter vectorIn, hkVector4dParameter a0, hkVector4dParameter a1, hkVector4dParameter a2, hkVector4dParameter a3, hkVector4d& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4d c0; c0.setMul(a0, vectorIn);
    hkVector4d c1; c1.setMul(a1, vectorIn);
    hkVector4d c2; c2.setMul(a2, vectorIn);
    hkVector4d c3; c3.setMul(a3, vectorIn);

    HK_TRANSPOSE4d(c0, c1, c2, c3);

    dotsOut.setAdd(c0, c1);
    dotsOut.add(c2);
#else
    dotsOut.set( a0.dot<3>( vectorIn ),
        a1.dot<3>( vectorIn ),
        a2.dot<3>( vectorIn ),
        a3.dot<3>( vectorIn ) );
#endif
}
#endif


//
//  Sets this vector components: this(i) = vector.dot<3>(AI) for i=0..2

#ifndef HK_VECTOR4fUTIL_dot3_1vs3
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::dot3_1vs3(hkVector4fParameter vectorIn, hkVector4fParameter a0, hkVector4fParameter a1, hkVector4fParameter a2, hkVector4f& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4f c0; c0.setMul(a0, vectorIn);
    hkVector4f c1; c1.setMul(a1, vectorIn);
    hkVector4f c2; c2.setMul(a2, vectorIn);

    HK_TRANSPOSE3f(c0, c1, c2);

    dotsOut.setAdd(c0, c1);
    dotsOut.add(c2);
#else
    dotsOut.set( a0.dot<3>( vectorIn ),
        a1.dot<3>( vectorIn ),
        a2.dot<3>( vectorIn ),
        hkSimdFloat32_0 );
#endif
}
#endif
#ifndef HK_VECTOR4dUTIL_dot3_1vs3
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::dot3_1vs3(hkVector4dParameter vectorIn, hkVector4dParameter a0, hkVector4dParameter a1, hkVector4dParameter a2, hkVector4d& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4d c0; c0.setMul(a0, vectorIn);
    hkVector4d c1; c1.setMul(a1, vectorIn);
    hkVector4d c2; c2.setMul(a2, vectorIn);

    HK_TRANSPOSE3d(c0, c1, c2);

    dotsOut.setAdd(c0, c1);
    dotsOut.add(c2);
#else
    dotsOut.set( a0.dot<3>( vectorIn ),
        a1.dot<3>( vectorIn ),
        a2.dot<3>( vectorIn ),
        hkSimdDouble64_0 );
#endif
}
#endif



#ifndef HK_VECTOR4fUTIL_dot4_1vs4
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::dot4_1vs4( hkVector4fParameter vectorIn, hkVector4fParameter a0, hkVector4fParameter a1, hkVector4fParameter a2, hkVector4fParameter a3, hkVector4f& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4f c0; c0.setMul(a0, vectorIn);
    hkVector4f c1; c1.setMul(a1, vectorIn);
    hkVector4f c2; c2.setMul(a2, vectorIn);
    hkVector4f c3; c3.setMul(a3, vectorIn);

    HK_TRANSPOSE4f(c0, c1, c2, c3);

    hkVector4f s; s.setAdd(c2, c3);
    dotsOut.setAdd(c0, c1);
    dotsOut.add(s);
#else
    dotsOut.set( a0.dot<4>( vectorIn ),
        a1.dot<4>( vectorIn ),
        a2.dot<4>( vectorIn ),
        a3.dot<4>( vectorIn ) );
#endif
}
#endif
#ifndef HK_VECTOR4dUTIL_dot4_1vs4
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::dot4_1vs4( hkVector4dParameter vectorIn, hkVector4dParameter a0, hkVector4dParameter a1, hkVector4dParameter a2, hkVector4dParameter a3, hkVector4d& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4d c0; c0.setMul(a0, vectorIn);
    hkVector4d c1; c1.setMul(a1, vectorIn);
    hkVector4d c2; c2.setMul(a2, vectorIn);
    hkVector4d c3; c3.setMul(a3, vectorIn);

    HK_TRANSPOSE4d(c0, c1, c2, c3);

    hkVector4d s; s.setAdd(c2, c3);
    dotsOut.setAdd(c0, c1);
    dotsOut.add(s);
#else
    dotsOut.set( a0.dot<4>( vectorIn ),
        a1.dot<4>( vectorIn ),
        a2.dot<4>( vectorIn ),
        a3.dot<4>( vectorIn ) );
#endif
}
#endif


//
//  Sets this vector components: this(i) = vector.dot<4>(AI) for i=0..2

#ifndef HK_VECTOR4fUTIL_dot4_1vs3
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::dot4_1vs3( hkVector4fParameter vectorIn, hkVector4fParameter a0, hkVector4fParameter a1, hkVector4fParameter a2, hkVector4f& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4f c0; c0.setMul(a0, vectorIn);
    hkVector4f c1; c1.setMul(a1, vectorIn);
    hkVector4f c2; c2.setMul(a2, vectorIn);
    hkVector4f c3; c3.setZero();
    HK_TRANSPOSE4f(c0, c1, c2, c3);

    hkVector4f s; s.setAdd(c2, c3);
    dotsOut.setAdd(c0, c1);
    dotsOut.add(s);
#else
    dotsOut.set( a0.dot<4>( vectorIn ), a1.dot<4>( vectorIn ), a2.dot<4>( vectorIn ), hkSimdFloat32_0 );
#endif
}
#endif
#ifndef HK_VECTOR4dUTIL_dot4_1vs3
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::dot4_1vs3( hkVector4dParameter vectorIn, hkVector4dParameter a0, hkVector4dParameter a1, hkVector4dParameter a2, hkVector4d& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4d c0; c0.setMul(a0, vectorIn);
    hkVector4d c1; c1.setMul(a1, vectorIn);
    hkVector4d c2; c2.setMul(a2, vectorIn);
    hkVector4d c3; c3.setZero();
    HK_TRANSPOSE4d(c0, c1, c2, c3);

    hkVector4d s; s.setAdd(c2, c3);
    dotsOut.setAdd(c0, c1);
    dotsOut.add(s);
#else
    dotsOut.set( a0.dot<4>( vectorIn ), a1.dot<4>( vectorIn ), a2.dot<4>( vectorIn ), hkSimdDouble64_0 );
#endif
}
#endif

template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::dot4xyz1_1vs4( Vec4_ vectorIn, Vec4_ a0, Vec4_ a1, Vec4_ a2, Vec4_ a3, Vec4& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    Vec4 dots3;
    hkVector4UtilImpl<FT>::dot3_1vs4(vectorIn, a0, a1, a2, a3, dots3);
    dotsOut.setAdd(dots3, vectorIn.getW());
#else
    dotsOut.set( vectorIn.dot4xyz1( a0 ),
        vectorIn.dot4xyz1( a1 ),
        vectorIn.dot4xyz1( a2 ),
        vectorIn.dot4xyz1( a3 ) );
#endif
}

#ifndef HK_VECTOR4fUTIL_dot3_2vs2
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::dot3_2vs2( hkVector4fParameter a0, hkVector4fParameter a2, hkVector4fParameter b0, hkVector4fParameter b1, hkVector4f& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4f c0; c0.setMul(a0,b0);
    hkVector4f c1; c1.setMul(a0,b1);
    hkVector4f c2; c2.setMul(a2,b0);
    hkVector4f c3; c3.setMul(a2,b1);

    HK_TRANSPOSE4f(c0,c1,c2,c3);

    dotsOut.setAdd(c0,c1);
    dotsOut.add(c2);
#else
    dotsOut.set( a0.dot<3>( b0 ),
        a0.dot<3>( b1 ),
        a2.dot<3>( b0 ),
        a2.dot<3>( b1 ) );
#endif
}
#endif
#ifndef HK_VECTOR4dUTIL_dot3_2vs2
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::dot3_2vs2( hkVector4dParameter a0, hkVector4dParameter a2, hkVector4dParameter b0, hkVector4dParameter b1, hkVector4d& dotsOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    hkVector4d c0; c0.setMul(a0,b0);
    hkVector4d c1; c1.setMul(a0,b1);
    hkVector4d c2; c2.setMul(a2,b0);
    hkVector4d c3; c3.setMul(a2,b1);

    HK_TRANSPOSE4d(c0,c1,c2,c3);

    dotsOut.setAdd(c0,c1);
    dotsOut.add(c2);
#else
    dotsOut.set( a0.dot<3>( b0 ),
        a0.dot<3>( b1 ),
        a2.dot<3>( b0 ),
        a2.dot<3>( b1 ) );
#endif
}
#endif

template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::dot4xyz1_2vs2( Vec4_ a0, Vec4_ a2, Vec4_ b0, Vec4_ b1, Vec4& dotsOut)
{
    Vec4 dot3s;
    hkVector4UtilImpl<FT>::dot3_2vs2(a0, a2, b0, b1, dot3s);
    Vec4 w;
    hkVector4UtilImpl<FT>::template setPermutation2<3,3,7,7>(a0, a2, w);
    dotsOut.setAdd(dot3s, w);
}


//
//  Computes an = (a x n), bn = (b x n), cn = (c x n)

template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::cross_3vs1(   Vec4_ a, Vec4_ b, Vec4_ c, Vec4_ n,
                                                                Vec4& an, Vec4& bn, Vec4& cn)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    Vec4 tmp_n; tmp_n.template setPermutation<hkVectorPermutation::ZXYW>(n);
    Vec4 tmp_a; tmp_a.template setPermutation<hkVectorPermutation::YZXW>(a);
    Vec4 tmp_b; tmp_b.template setPermutation<hkVectorPermutation::YZXW>(b);
    Vec4 tmp_c; tmp_c.template setPermutation<hkVectorPermutation::YZXW>(c);

    tmp_a.mul(tmp_n);
    tmp_b.mul(tmp_n);
    tmp_c.mul(tmp_n);

    Vec4 p_n; p_n.template setPermutation<hkVectorPermutation::YZXW>(n);
    Vec4 p_a; p_a.template setPermutation<hkVectorPermutation::ZXYW>(a);
    Vec4 p_b; p_b.template setPermutation<hkVectorPermutation::ZXYW>(b);
    Vec4 p_c; p_c.template setPermutation<hkVectorPermutation::ZXYW>(c);

    an.setSubMul(tmp_a, p_n, p_a);
    bn.setSubMul(tmp_b, p_n, p_b);
    cn.setSubMul(tmp_c, p_n, p_c);
#else
    an.setCross(a, n);
    bn.setCross(b, n);
    cn.setCross(c, n);
#endif
}


//
//  Computes an = (a x n), bn = (b x n), cn = (c x n), dn = (d x n)

template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::cross_4vs1(   Vec4_ a, Vec4_ b, Vec4_ c, Vec4_ d, Vec4_ n,
                                                                Vec4& an, Vec4& bn, Vec4& cn, Vec4& dn)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    Vec4 tmp4; tmp4.template setPermutation<hkVectorPermutation::ZXYW>(n);

    Vec4 t_a; t_a.template setPermutation<hkVectorPermutation::YZXW>(a);
    Vec4 t_b; t_b.template setPermutation<hkVectorPermutation::YZXW>(b);
    Vec4 t_c; t_c.template setPermutation<hkVectorPermutation::YZXW>(c);
    Vec4 t_d; t_d.template setPermutation<hkVectorPermutation::YZXW>(d);

    t_a.mul(tmp4);
    t_b.mul(tmp4);
    t_c.mul(tmp4);
    t_d.mul(tmp4);

    Vec4 ntmp4; ntmp4.template setPermutation<hkVectorPermutation::YZXW>(n);

    Vec4 p_a; p_a.template setPermutation<hkVectorPermutation::ZXYW>(a);
    Vec4 p_b; p_b.template setPermutation<hkVectorPermutation::ZXYW>(b);
    Vec4 p_c; p_c.template setPermutation<hkVectorPermutation::ZXYW>(c);
    Vec4 p_d; p_d.template setPermutation<hkVectorPermutation::ZXYW>(d);

    an.setSubMul(t_a, ntmp4, p_a);
    bn.setSubMul(t_b, ntmp4, p_b);
    cn.setSubMul(t_c, ntmp4, p_c);
    dn.setSubMul(t_d, ntmp4, p_d);
#else
    an.setCross(a, n);
    bn.setCross(b, n);
    cn.setCross(c, n);
    dn.setCross(d, n);
#endif
}

template <typename FT>
template <int X>
HK_INLINE typename hkFloatTypes<FT>::Scalar HK_CALL hkVector4UtilImpl<FT>::dot3X( Vec4_ a0, Vec4_ a1, Vec4_ b0, Vec4_ b1, Vec4_ c0, Vec4_ c1)
{
    Vec4 a; a.setMul( a0, a1 );
    Vec4 b; b.setMul( b0, b1 );
    a.addMul( c0, c1 );
    a.add( b );
    Scalar result = a.template horizontalAdd<X>();
    return result;
}

template <typename FT>
template <int X>
HK_INLINE typename hkFloatTypes<FT>::Scalar HK_CALL hkVector4UtilImpl<FT>::dot2X( Vec4_ a0, Vec4_ a1, Vec4_ b0, Vec4_ b1)
{
#if HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED && HK_SSE_VERSION >= 0x31
    Scalar result = a0.template dot<X>(a1) + b0.template dot<X>(b1);
#else
    Vec4 a; a.setMul( a0, a1 );
    a.addMul( b0, b1 );
    Scalar result = a.template horizontalAdd<X>();
#endif
    return result;
}


//
//  Computes the cross products: (vA, vB), (vB, vC), (vC, vA)

template <typename FT>
HK_INLINE void hkVector4UtilImpl<FT>::computeCyclicCrossProducts(   Vec4_ vA, Vec4_ vB, Vec4_ vC,
                                                                        Vec4& vAB, Vec4& vBC, Vec4& vCA)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)
    Vec4 tmpa; tmpa.template setPermutation<hkVectorPermutation::YZXW>(vA); // tmpa = (ay, az, ax, *)
    Vec4 tmpb; tmpb.template setPermutation<hkVectorPermutation::ZXYW>(vB); // tmpb = (bz, bx, by, *)
    Vec4 tmpc; tmpc.template setPermutation<hkVectorPermutation::YZXW>(vC); // tmpc = (cy, cz, cx, *)
    Vec4 tmpd; tmpd.template setPermutation<hkVectorPermutation::ZXYW>(vA); // tmpd = (az, ax, ay, *)

    Vec4 ab; ab.setMul(tmpa, tmpb); // (ay * bz, az * bx, ax * by, *)
    Vec4 bc; bc.setMul(tmpb, tmpc); // (bz * cy, bx * cz, by * cx, *)
    Vec4 ca; ca.setMul(tmpc, tmpd); // (cy * az, cz * ax, cx * ay, *)

    Vec4 pb; pb.template setPermutation<hkVectorPermutation::YZXW>(vB); // tmpb = (by, bz, bx, *)
    Vec4 pc; pc.template setPermutation<hkVectorPermutation::ZXYW>(vC); // tmpc = (cz, cx, cy, *)
    vAB.setSubMul(ab, tmpd, pb);                    // (ay * bz, az * bx, ax * by, *) - (az * by, ax * bz, ay * bx, *) = (vA x vB)
    Vec4 sb; sb.setMul(pb, pc);                                         // tmpb = (by * cz, bz * cx, bx * cy, *)
    vCA.setSubMul(ca, pc, tmpa);                    // (cy * az, cz * ax, cx * ay, *) - (cz * ay, cx * az, cy * ax, *) = (vC x vA)
    vBC.setSub(sb, bc);                                     // (by * cz, bz * cx, bx * cy, *) - (bz * cy, bx * cz, by * cx, *) = (vB x vC)*/
#else
    vAB.setCross(vA, vB);
    vBC.setCross(vB, vC);
    vCA.setCross(vC, vA);
#endif
}


template <typename FT>
HK_INLINE void hkVector4UtilImpl<FT>::computeQuadEdgePlanes( Vec4_ vA, Vec4_ vB, Vec4_ vC, Vec4_ vD, Vec4* HK_RESTRICT normalOut, Transform* HK_RESTRICT edgesOut )
{
    Vec4 vAB;   vAB.setSub(vB, vA);
    Vec4 vBC;   vBC.setSub(vC, vB);
    Vec4 normal;    normal.setCross(vAB, vBC );


    // edges
    Vec4 vCD;   vCD.setSub(vD, vC);
    Vec4 vDA;   vDA.setSub(vA, vD);
    Vec4 epA, epB, epC, epD;
    cross_4vs1( vAB, vBC, vCD, vDA, normal, epA, epB, epC, epD );

    // Find closest point in A
    normal.template normalize<3>();
    normal.setW(-normal.template dot<3>(vA));
    *normalOut = normal;

    Vec4 o; dot3_4vs4( vA, epA, vB, epB, vC, epC, vD, epD, o );

    edgesOut->setRows4( epA, epB, epC, epD );
    Vec4 len2;
    len2.setMul( edgesOut->template getColumn<0>(), edgesOut->template getColumn<0>() );
    len2.addMul( edgesOut->template getColumn<1>(), edgesOut->template getColumn<1>() );
    len2.addMul( edgesOut->template getColumn<2>(), edgesOut->template getColumn<2>() );
    Vec4 invLen; invLen.template setSqrtInverse<HK_ACC_12_BIT, HK_SQRT_SET_ZERO>( len2 );

    edgesOut->getColumn(0).mul(invLen);
    edgesOut->getColumn(1).mul(invLen);
    edgesOut->getColumn(2).mul(invLen);
    invLen.template setNeg<4>(invLen);
    edgesOut->getColumn(3).setMul( o, invLen);
}


template <typename FT>
HK_INLINE const typename hkFloatTypes<FT>::Scalar HK_CALL hkVector4UtilImpl<FT>::distToLineSquared( Vec4_ a, Vec4_ b, Vec4_ p )
{
    Vec4 ab, ap;
    ab.setSub( b, a );
    ap.setSub( p, a );

    typename hkFloatTypes<FT>::Scalar projectionAsPercentOfSegment;
    projectionAsPercentOfSegment.setDiv( ap.template dot<3>(ab), ab.template lengthSquared<3>() );
    ap.subMul( projectionAsPercentOfSegment, ab );
    return ap.template lengthSquared<3>();
}


template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::setMax44(Vec4_ a, Vec4_ b, Vec4_ c, Vec4_ d, Vec4& out)
{
    Vec4    t0; t0.setMax(a,b);
    Vec4    t1; t1.setMax(c,d);
    out.setMax(t0,t1);
}

template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::setMin44(Vec4_ a, Vec4_ b, Vec4_ c, Vec4_ d, Vec4& out)
{
    Vec4    t0; t0.setMin(a,b);
    Vec4    t1; t1.setMin(c,d);
    out.setMin(t0,t1);
}


template <typename FT>
HK_INLINE hkBool32 hkVector4UtilImpl<FT>::anyNonZero3( Vec4_ a, Vec4_ b, Vec4_ c, Vec4_ d )
{
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkInt32 hkVector4Util_valueMask[4]);

    hkIntVector ai; ai.loadAsFloat32BitRepresentation(a);
    hkIntVector bi; bi.loadAsFloat32BitRepresentation(b);
    hkIntVector ci; ci.loadAsFloat32BitRepresentation(c);
    hkIntVector di; di.loadAsFloat32BitRepresentation(d);
    ai.setOr(ai,bi);
    ci.setOr(ci,di);
    ai.setOr(ai,ci);
    ai.setAnd(ai, (const hkIntVector&)hkVector4Util_valueMask);
    hkVector4Comparison h = ai.greaterZeroS32( );
    int mask = h.getMask(hkVector4ComparisonMask::MASK_XYZ);
    return mask;
}

template <typename FT>
HK_INLINE hkBool32 hkVector4UtilImpl<FT>::anyNonZero3(Vec4_ a, Vec4_ b)
{
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkInt32 hkVector4Util_valueMask[4]);

    hkIntVector ai; ai.loadAsFloat32BitRepresentation(a);
    hkIntVector bi; bi.loadAsFloat32BitRepresentation(b);
    ai.setOr(ai, bi);
    ai.setAnd(ai, (const hkIntVector&)hkVector4Util_valueMask);
    hkVector4Comparison h = ai.greaterZeroS32( );
    int mask = h.getMask(hkVector4ComparisonMask::MASK_XYZ);
    return mask;
}


// we need to convert the quaternion to int. To avoid problems
// at -1 and 1, we encode [-1.1 .. 1.1]
#define HK_VECTOR4UTIL_PACK_RANGE 1.1f
template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::unPackInt32IntoQuaternion( hkUint32 ivalue, Vec4& qout )
{
    hkIntVector c; c.setFirstComponent(ivalue); c.setSplit8To32(c);
    hkIntVector h; h.setAll(0x80);
    hkIntVector i; i.setSubU32(c,h); i.convertS32ToF32(qout);

    Scalar scale; scale.setFromFloat(typename hkFloatTypes<FT>::Float(HK_VECTOR4UTIL_PACK_RANGE/128.0f));
    qout.mul(scale);
}


#ifndef HK_VECTOR4fUTIL_convertQuaternionToRotation
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::convertQuaternionToRotation( const hkQuaternionf& qi, hkRotationf& rotationOut )
{
    
    HK_MATH_ASSERT(0x1ff88f0e, qi.isOk(), "hkQuaternionf not normalized/invalid!");
    const hkVector4f q = qi.m_vec;

    hkVector4f q2; q2.setAdd(q,q);
    hkVector4f xq2; xq2.setMul(q2, q.getComponent<0>());
    hkVector4f yq2; yq2.setMul(q2, q.getComponent<1>());
    hkVector4f zq2; zq2.setMul(q2, q.getComponent<2>());
    hkVector4f wq2; wq2.setMul(q2, q.getComponent<3>());

    const hkSimdFloat32 one = hkSimdFloat32_1;
    hkSimdFloat32 zero; zero.setZero();

    //  1-(yy+zz),  xy+wz,      xz-wy
    //  xy-wz,      1-(xx+zz),  yz+wx
    //  xz+wy,      yz-wx,      1-(xx+yy)
    hkVector4f c0; c0.set( one-(yq2.getComponent<1>()+zq2.getComponent<2>()), xq2.getComponent<1>()+wq2.getComponent<2>(), xq2.getComponent<2>()-wq2.getComponent<1>(), zero );
    hkVector4f c1; c1.set( xq2.getComponent<1>()-wq2.getComponent<2>(), one-(xq2.getComponent<0>()+zq2.getComponent<2>()), yq2.getComponent<2>()+wq2.getComponent<0>(), zero );
    hkVector4f c2; c2.set( xq2.getComponent<2>()+wq2.getComponent<1>(), yq2.getComponent<2>()-wq2.getComponent<0>(), one-(xq2.getComponent<0>()+yq2.getComponent<1>()), zero );

    rotationOut.setColumn<0>(c0);
    rotationOut.setColumn<1>(c1);
    rotationOut.setColumn<2>(c2);
}
#endif
#ifndef HK_VECTOR4dUTIL_convertQuaternionToRotation
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::convertQuaternionToRotation( const hkQuaterniond& qi, hkRotationd& rotationOut )
{
    
    HK_MATH_ASSERT(0x1ff88f0e, qi.isOk(), "hkQuaterniond not normalized/invalid!");
    const hkVector4d q = qi.m_vec;

    hkVector4d q2; q2.setAdd(q,q);
    hkVector4d xq2; xq2.setMul(q2, q.getComponent<0>());
    hkVector4d yq2; yq2.setMul(q2, q.getComponent<1>());
    hkVector4d zq2; zq2.setMul(q2, q.getComponent<2>());
    hkVector4d wq2; wq2.setMul(q2, q.getComponent<3>());

    const hkSimdDouble64 one = hkSimdDouble64_1;
    hkSimdDouble64 zero; zero.setZero();

    //  1-(yy+zz),  xy+wz,      xz-wy
    //  xy-wz,      1-(xx+zz),  yz+wx
    //  xz+wy,      yz-wx,      1-(xx+yy)
    hkVector4d c0; c0.set( one-(yq2.getComponent<1>()+zq2.getComponent<2>()), xq2.getComponent<1>()+wq2.getComponent<2>(), xq2.getComponent<2>()-wq2.getComponent<1>(), zero );
    hkVector4d c1; c1.set( xq2.getComponent<1>()-wq2.getComponent<2>(), one-(xq2.getComponent<0>()+zq2.getComponent<2>()), yq2.getComponent<2>()+wq2.getComponent<0>(), zero );
    hkVector4d c2; c2.set( xq2.getComponent<2>()+wq2.getComponent<1>(), yq2.getComponent<2>()-wq2.getComponent<0>(), one-(xq2.getComponent<0>()+yq2.getComponent<1>()), zero );

    rotationOut.setColumn<0>(c0);
    rotationOut.setColumn<1>(c1);
    rotationOut.setColumn<2>(c2);
}
#endif


template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::setXAtVectorMax( Vec4_ check, hkIntVector& iVinOut, Vec4& v0InOut, Vec4& v1InOut)
{
#if (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED) && !defined(HK_USING_GENERIC_INT_VECTOR_IMPLEMENTATION)
    Vec4 zw; zw.template setPermutation<hkVectorPermutation::ZWZW>( check );
    Comparison zwGreater = zw.greater( check );

    zw.setMax( zw, check );
    hkIntVector ivXY, ivZW;   ivZW.setPermutation<hkVectorPermutation::ZWZW>( iVinOut ); ivXY.setSelect( zwGreater, ivZW, iVinOut );
    Vec4   v0XY, v0ZW;   v0ZW.template setPermutation<hkVectorPermutation::ZWZW>( v0InOut ); v0XY.setSelect( zwGreater, v0ZW, v0InOut );
    Vec4   v1XY, v1ZW;   v1ZW.template setPermutation<hkVectorPermutation::ZWZW>( v1InOut ); v1XY.setSelect( zwGreater, v1ZW, v1InOut );

    Vec4 y; y.template setBroadcast<1>( zw );
    Comparison yGreater = y.greater( zw );

    hkIntVector ivY; ivY.setBroadcast<1>( ivXY ); iVinOut.setSelect( yGreater, ivY, ivXY );
    Vec4 v0Y;   v0Y.template setBroadcast<1>( v0XY ); v0InOut.setSelect( yGreater, v0Y, v0XY );
    Vec4 v1Y;   v1Y.template setBroadcast<1>( v1XY ); v1InOut.setSelect( yGreater, v1Y, v1XY );

#else
    int index = check.template getIndexOfMaxComponent<4>();
    iVinOut.setAll( iVinOut.getU32(index) );
    v0InOut.setBroadcast(index, v0InOut );
    v1InOut.setBroadcast(index, v1InOut );
#endif
}


#define HK_VECTOR4UTIL_PERM_MASK \
    hkVectorPermutation::Permutation( \
    ((i & 0x3) << 12) \
    | ((j & 0x3) << 8) \
    | ((k & 0x3) << 4) \
    | ((l & 0x3) << 0))

#define HK_VECTOR4UTIL_SELECT_MASK \
    hkVector4ComparisonMask::Mask( \
    ((i & 0x4) ? 0 : hkVector4ComparisonMask::MASK_X) \
    | ((j & 0x4) ? 0 : hkVector4ComparisonMask::MASK_Y) \
    | ((k & 0x4) ? 0 : hkVector4ComparisonMask::MASK_Z) \
    | ((l & 0x4) ? 0 : hkVector4ComparisonMask::MASK_W))

#ifndef HK_VECTOR4fUTIL_setPermutation2
template <typename FT>
template<unsigned int i, unsigned int j, unsigned int k, unsigned int l>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::setPermutation2(hkVector4fParameter a, hkVector4fParameter b, hkVector4f & out)
{
    HK_COMPILE_TIME_ASSERT(i<8 && j<8 && k<8 && l<8);

    hkVector4f aPerm; aPerm.setPermutation<HK_VECTOR4UTIL_PERM_MASK>(a);
    hkVector4f bPerm; bPerm.setPermutation<HK_VECTOR4UTIL_PERM_MASK>(b);
    out.setSelect<HK_VECTOR4UTIL_SELECT_MASK>(aPerm, bPerm);
}
#endif
#ifndef HK_VECTOR4dUTIL_setPermutation2
template <typename FT>
template<unsigned int i, unsigned int j, unsigned int k, unsigned int l>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::setPermutation2(hkVector4dParameter a, hkVector4dParameter b, hkVector4d & out)
{
    HK_COMPILE_TIME_ASSERT(i<8 && j<8 && k<8 && l<8);

    hkVector4d aPerm; aPerm.setPermutation<HK_VECTOR4UTIL_PERM_MASK>(a);
    hkVector4d bPerm; bPerm.setPermutation<HK_VECTOR4UTIL_PERM_MASK>(b);
    out.setSelect<HK_VECTOR4UTIL_SELECT_MASK>(aPerm, bPerm);
}
#endif

#undef HK_VECTOR4UTIL_PERM_MASK
#undef HK_VECTOR4UTIL_SELECT_MASK

#ifndef HK_VECTOR4UTIL_sortVectorIncreasing
template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::sortVectorIncreasing(Vec4& xyzw)
{
    // Implemented as a 5-comparator, 3-round sorting network
    // Round 1: X/Z, Y/W
    {
        Vec4 zwxy; zwxy.template setPermutation<hkVectorPermutation::ZWXY>(xyzw);
        Comparison comp = xyzw.less(zwxy);
        Comparison czw; czw.template set<hkVector4ComparisonMask::MASK_ZW>();
        comp.setXor(comp, czw);
        xyzw.setSelect(comp, xyzw, zwxy);
    }

    // Round 2: X/Y, Z/W
    {
        Vec4 yxwz; yxwz.template setPermutation<hkVectorPermutation::YXWZ>(xyzw);
        Comparison comp = xyzw.less(yxwz);
        Comparison cyw; cyw.template set<hkVector4ComparisonMask::MASK_YW>();
        comp.setXor(comp, cyw);
        xyzw.setSelect(comp, xyzw, yxwz);
    }

    // Round 3: Y/Z
    {
        Vec4 xzyw; xzyw.template setPermutation<hkVectorPermutation::XZYW>(xyzw);
        Comparison comp = xyzw.less(xzyw);
        Comparison cz; cz.template set<hkVector4ComparisonMask::MASK_Z>();
        comp.setXor(comp, cz);
        xyzw.setSelect(comp, xyzw, xzyw);
    }
}
#endif

#ifndef HK_VECTOR4UTIL_sortVectorDecreasing
template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::sortVectorDecreasing(Vec4& xyzw)
{
    // Implemented as a 5-comparator, 3-round sorting network
    // Round 1: X/Z, Y/W
    {
        Vec4 zwxy; zwxy.template setPermutation<hkVectorPermutation::ZWXY>(xyzw);
        Comparison comp = xyzw.greater(zwxy);
        Comparison czw; czw.template set<hkVector4ComparisonMask::MASK_ZW>();
        comp.setXor(comp, czw);
        xyzw.setSelect(comp, xyzw, zwxy);
    }

    // Round 2: X/Y, Z/W
    {
        Vec4 yxwz; yxwz.template setPermutation<hkVectorPermutation::YXWZ>(xyzw);
        Comparison comp = xyzw.greater(yxwz);
        Comparison cyw; cyw.template set<hkVector4ComparisonMask::MASK_YW>();
        comp.setXor(comp, cyw);
        xyzw.setSelect(comp, xyzw, yxwz);
    }

    // Round 3: Y/Z
    {
        Vec4 xzyw; xzyw.template setPermutation<hkVectorPermutation::XZYW>(xyzw);
        Comparison comp = xyzw.greater(xzyw);
        Comparison cz; cz.template set<hkVector4ComparisonMask::MASK_Z>();
        comp.setXor(comp, cz);
        xyzw.setSelect(comp, xyzw, xzyw);
    }
}
#endif

template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::setLength( Scalar_ desiredLength, Vec4& vectorInOut )
{
    Vec4 v = vectorInOut;
    Scalar newLenInv = v.template lengthInverse<3, HK_ACC_MID, HK_SQRT_SET_ZERO>();
    Scalar factor = newLenInv * desiredLength;
    vectorInOut.setMul( v, factor );
}

template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::setProjectedLength( Scalar_ desiredLength, Vec4_ scaleMeasurePlane, Scalar_ maxScale, Vec4& vectorInOut )
{
    Vec4 v = vectorInOut;
    Scalar dot2 = v.template dot<3>(scaleMeasurePlane);
    Vec4 v2; v2.setSubMul( v, scaleMeasurePlane, dot2 );

    Scalar newLenInv = v2.template lengthInverse<3, HK_ACC_MID, HK_SQRT_SET_ZERO>();
    Scalar factor = newLenInv * desiredLength;
    factor.setMin( factor, maxScale );
    factor.setMax( factor, -maxScale );
    vectorInOut.setMul( v, factor );
}


#ifndef HK_VECTOR4fUTIL_convertComparison
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::convertComparison(const hkVector4fComparison& cmpin, hkVector4dComparison& cmpout)
{
    cmpout.set(cmpin.getMask());
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::convertComparison(const hkVector4fComparison& cmpin, hkVector4fComparison& cmpout)
{
    cmpout.m_mask = cmpin.m_mask;
}
#endif
#ifndef HK_VECTOR4dUTIL_convertComparison
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::convertComparison(const hkVector4dComparison& cmpin, hkVector4fComparison& cmpout)
{
    cmpout.set(cmpin.getMask());
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::convertComparison(const hkVector4dComparison& cmpin, hkVector4dComparison& cmpout)
{
    cmpout.m_mask = cmpin.m_mask;
}
#endif

#ifndef HK_VECTOR4fUTIL_convertVector
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::convertVector(hkVector4fParameter vin, hkVector4d& vout)
{
    typedef union { hkQuadFloat32 q; hkFloat32 f32[4]; } q2f;
    const q2f* in = (const q2f*)&vin.m_quad;
    vout.load<4>(in->f32);
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::convertVector(hkVector4fParameter vin, hkVector4f& vout)
{
    vout = vin;
}
#endif
#ifndef HK_VECTOR4dUTIL_convertVector
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::convertVector(hkVector4dParameter vin, hkVector4f& vout)
{
    typedef union { hkQuadDouble64 q; hkDouble64 d64[4]; } q2d;
    const q2d* in = (const q2d*)&vin.m_quad;
    vout.load<4>(in->d64);
}
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkDouble64>::convertVector(hkVector4dParameter vin, hkVector4d& vout)
{
    vout = vin;
}
#endif


template <typename FT>
void hkVector4UtilImpl<FT>::getFloatToInt16FloorCorrection(typename hkFloatTypes<FT>::Float& out)
{
    typename hkFloatTypes<FT>::Float l = typename hkFloatTypes<FT>::Float(10);
    typename hkFloatTypes<FT>::Float h = typename hkFloatTypes<FT>::Float(11);
    Vec4 one = Vec4::template getConstant<HK_QUADREAL_1>();

    for (int i = 0; i < 23; i++ )
    {
        typename hkFloatTypes<FT>::Float m = (l+h) * typename hkFloatTypes<FT>::Float(0.5f);
        Vec4 p; p.setAll( m );

        hkIntUnion64 iout;
        convertToUint16WithClip( p, Vec4::getZero(), one, Vec4::getZero(), Vec4::template getConstant<HK_QUADREAL_HIGH>(), iout );

        if ( iout.u16[0] < 11 )
        {
            l = m;
        }
        else
        {
            h = m;
        }
    }
    out = (l+h) * typename hkFloatTypes<FT>::Float(0.5f) - typename hkFloatTypes<FT>::Float(11);
}

template <typename FT>
void hkVector4UtilImpl<FT>::getFloatToInt32FloorCorrection(typename hkFloatTypes<FT>::Float& out)
{
    typename hkFloatTypes<FT>::Float l = typename hkFloatTypes<FT>::Float(10);
    typename hkFloatTypes<FT>::Float h = typename hkFloatTypes<FT>::Float(11);
    Vec4 one = Vec4::template getConstant<HK_QUADREAL_1>();

    for (int i = 0; i < 23; i++ )
    {
        typename hkFloatTypes<FT>::Float m = (l+h) * typename hkFloatTypes<FT>::Float(0.5f);
        Vec4 p; p.setAll( m );

        hkUint32 iout[4];
        convertToUint32( p, Vec4::getZero(), one, iout );

        if ( iout[0] < 11 )
        {
            l = m;
        }
        else
        {
            h = m;
        }
    }
    out = (l+h) * typename hkFloatTypes<FT>::Float(0.5f) - typename hkFloatTypes<FT>::Float(11);
}


// we need to convert the quaternion to int. To avoid problems
// at -1 and 1, we encode [-1.1 .. 1.1]
// pack a quaternion into a single 32 bit integer
template <typename FT>
hkUint32 hkVector4UtilImpl<FT>::packQuaternionIntoInt32( Vec4_ qin )
{
    Vec4 x;
    x.setAddMul(Vec4::template getConstant<HK_QUADREAL_INV_2>(), qin, hkFloatTypes<FT>::Scalar::fromFloat(typename hkFloatTypes<FT>::Float(128.0f/HK_VECTOR4UTIL_PACK_RANGE)));

    hkIntVector i; i.setConvertF32toS32Unchecked(x);
    hkIntVector c; c.setAll(0x80);
    hkIntVector r; r.setAddU32(i,c);
    hkIntVector f; f.setAll(0xff);
    hkIntVector p; p.setAnd(r,f);

    p.setConvertSaturateS32ToS16(p,p); // saturation won't happen because of masking above
    p.setConvertSaturateS16ToU8(p,p);
    return p.getU32<0>();
}
#undef HK_VECTOR4UTIL_PACK_RANGE


//
// Conversions
//


template <typename FT>
void HK_CALL hkVector4UtilImpl<FT>::convert4QuaternionsToRotations(_In_reads_(4) const Quaternion* HK_RESTRICT quats,
    _Inout_ Rotation* HK_RESTRICT r0Out, _Inout_ Rotation* HK_RESTRICT r1Out,
    _Inout_ Rotation* HK_RESTRICT r2Out, _Inout_ Rotation* HK_RESTRICT r3Out )
{
    convertQuaternionToRotation( quats[0], *r0Out );
    convertQuaternionToRotation( quats[1], *r1Out );
    convertQuaternionToRotation( quats[2], *r2Out );
    convertQuaternionToRotation( quats[3], *r3Out );
}

template <typename FT>
void HK_CALL hkVector4UtilImpl<FT>::convertQuaternionToRotationNoInline( const Quaternion& qi, Rotation& rotationOut )
{
    convertQuaternionToRotation(qi,rotationOut);
}



//
template <typename FT>
/*static*/ HK_INLINE const typename hkFloatTypes<FT>::Scalar HK_CALL hkVector4UtilImpl<FT>::toRange(Scalar_ v, Scalar_ radius, Scalar_ diameter, Scalar& k)
{
    Scalar diamr; diamr.template setReciprocal<HK_ACC_FULL, HK_DIV_IGNORE>(diameter);

    return toRange(v, radius, diameter, diamr, k);
}

template <typename FT>
/*static*/ HK_INLINE const typename hkFloatTypes<FT>::Scalar HK_CALL hkVector4UtilImpl<FT>::toRange(Scalar_ v, Scalar_ radius, Scalar& k)
{
    Scalar diam; diam.setAdd(radius, radius);
    Scalar diamr; diamr.template setReciprocal<HK_ACC_FULL, HK_DIV_IGNORE>(diam);

    return toRange(v, radius, diam, diamr, k);
}

template <typename FT>
/*static*/ HK_INLINE const typename hkFloatTypes<FT>::Scalar HK_CALL hkVector4UtilImpl<FT>::toRange(Scalar_ v, Scalar_ radius, Scalar_ diameter, Scalar_ diameterReciprocal, Scalar& k)
{
    // Compute the number of 2Pi multiples needed.
    k.setZero();
    Scalar cv = v;
    Scalar mradius = -radius;
    Scalar cvc;
    do
    {
        Scalar kf; kf.setAdd(cv, radius); kf.setMul( kf, diameterReciprocal);
        kf.setFloor(kf);

        cv.subMul(kf, diameter);
        k += kf;
        cvc.setClamped(cv, mradius, radius);
    } while ( cvc.isNotEqual(cv) );

    return cv;
}


#ifndef HK_VECTOR4dUTIL_streamStore
template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::streamStore1( Scalar_ v, FT* dest )
{
    v.template store<1>(dest);
}

template <typename FT>
HK_INLINE void HK_CALL hkVector4UtilImpl<FT>::streamStore4( Vec4_ v, FT* dest )
{
    v.template store<4>( dest );
}
#endif

/*
 * Havok SDK - Base file, BUILD(#20180110)
 * 
 * Confidential Information of Microsoft Corporation.
 * Not for disclosure or distribution without Microsoft's prior written
 * consent.  This software contains code, techniques and know-how which
 * is confidential and proprietary to Microsoft.  Product and Trade Secret
 * source code contains trade secrets of Microsoft.  Havok Software (C)
 * Copyright 1999-2018 Microsoft Corporation.
 * All Rights Reserved. Use of this software is subject to the
 * terms of an end user license agreement.
 * 
 * The Havok Logo, and the Havok buzzsaw logo are trademarks of Microsoft.
 * Title, ownership rights, and intellectual property rights in the Havok
 * software remain in Microsoft and/or its suppliers.
 * 
 * Use of this software for evaluation purposes is subject to and
 * indicates acceptance of the End User licence Agreement for this
 * product. A copy of the license is included with this software and is
 * also available from Havok Support.
 * 
 */
