// TKBMS v1.0 -----------------------------------------------------
//
// PLATFORM   : WIN32 X64 DURANGO PS4
// PRODUCT   : COMMON
// VISIBILITY   : PUBLIC
//
// ------------------------------------------------------TKBMS v1.0

// Gcc 4.5.1 can mis-optimise _mm_move_sd
#if defined(HK_COMPILER_GCC) && (HK_COMPILER_GCC_VERSION <= 40501)
#define MOVE_SD(a, b) _mm_shuffle_pd(b, a, _MM_SHUFFLE2(1, 0))
#else
#define MOVE_SD(a, b) _mm_move_sd(a, b)
#endif

#ifndef HK_DISABLE_MATH_CONSTRUCTORS
/* construct, assign, zero */
HK_INLINE hkVector4d::hkVector4d(hkDouble64 a, hkDouble64 b, hkDouble64 c, hkDouble64 d)
{
    m_quad.xy = _mm_setr_pd(a,b);
    m_quad.zw = _mm_setr_pd(c,d);
}



HK_INLINE hkVector4d::hkVector4d(const hkVector4d& v)
{
    m_quad = v.m_quad;
}
#endif

void hkVector4d::set(const hkQuadDouble64& q)
{
    m_quad = q;
}

HK_INLINE void hkVector4d::set(hkDouble64 a, hkDouble64 b, hkDouble64 c, hkDouble64 d)
{
    m_quad.xy = _mm_setr_pd(a,b);
    m_quad.zw = _mm_setr_pd(c,d);
}

HK_INLINE void hkVector4d::set( hkSimdDouble64Parameter a, hkSimdDouble64Parameter b, hkSimdDouble64Parameter c, hkSimdDouble64Parameter d )
{
    m_quad.xy = _mm_unpacklo_pd(a.m_real, b.m_real);
    m_quad.zw = _mm_unpacklo_pd(c.m_real, d.m_real);
}

HK_INLINE void hkVector4d::setAll(const hkDouble64& a)
{
#if HK_SSE_VERSION >= 0x30
    m_quad.xy = _mm_loaddup_pd(&a);
    m_quad.zw = _mm_loaddup_pd(&a);
#else
    m_quad.xy = _mm_load1_pd(&a);
    m_quad.zw = _mm_load1_pd(&a);
#endif
}

HK_INLINE void hkVector4d::setAll(hkSimdDouble64Parameter a)
{
    m_quad.xy = a.m_real;
    m_quad.zw = a.m_real;
}

template<int vectorConstant>
HK_INLINE void hkVector4d::setConstant()
{
    HK_VECTOR4_CONSTANT_CHECK;
    switch (vectorConstant)
    {
    case HK_QUADREAL_0:
        {
            m_quad.xy = _mm_setzero_pd();
            m_quad.zw = _mm_setzero_pd();
            break;
        }
    case HK_QUADREAL_1000:
    case HK_QUADREAL_0100:
        {
            m_quad.xy = g_vectordConstants[vectorConstant].xy;
            m_quad.zw = _mm_setzero_pd();
            break;
        }
    case HK_QUADREAL_0010:
    case HK_QUADREAL_0001:
        {
            m_quad.xy = _mm_setzero_pd();
            m_quad.zw = g_vectordConstants[vectorConstant].zw;
            break;
        }
    case HK_QUADREAL_1100:
        {
            m_quad.xy = g_vectordConstants[vectorConstant].xy;
            m_quad.zw = _mm_setzero_pd();
            break;
        }
    case HK_QUADREAL_0011:
        {
            m_quad.xy = _mm_setzero_pd();
            m_quad.zw = g_vectordConstants[vectorConstant].zw;
            break;
        }
    default:
        {
            m_quad.xy = g_vectordConstants[vectorConstant].xy;
            m_quad.zw = g_vectordConstants[vectorConstant].zw;
            break;
        }
    }
}

HK_INLINE void hkVector4d::setZero()
{
    m_quad.xy = _mm_setzero_pd();
    m_quad.zw = _mm_setzero_pd();
}

template <>
HK_INLINE void hkVector4d::zeroComponent<0>()
{
    m_quad.xy = MOVE_SD(m_quad.xy, _mm_setzero_pd());
}

template <>
HK_INLINE void hkVector4d::zeroComponent<1>()
{
#if defined(HK_COMPILER_MSVC)
    // VS bug: movq instruction causes mem corruption
    m_quad.xy = MOVE_SD(_mm_setzero_pd(), m_quad.xy);
#else
    m_quad.xy = _mm_castsi128_pd(_mm_move_epi64(_mm_castpd_si128(m_quad.xy)));
#endif
}

template <>
HK_INLINE void hkVector4d::zeroComponent<2>()
{
    m_quad.zw = MOVE_SD(m_quad.zw, _mm_setzero_pd());
}

template <>
HK_INLINE void hkVector4d::zeroComponent<3>()
{
#if defined(HK_COMPILER_MSVC)
    // VS bug: movq instruction causes mem corruption
    m_quad.zw = MOVE_SD(_mm_setzero_pd(), m_quad.zw);
#else
    m_quad.zw = _mm_castsi128_pd(_mm_move_epi64(_mm_castpd_si128(m_quad.zw)));
#endif
}

template <int N>
HK_INLINE void hkVector4d::zeroComponent()
{
    HK_VECTOR4_NOT_IMPLEMENTED;
}

HK_INLINE void hkVector4d::zeroComponent(const int i)
{
    HK_MATH_ASSERT(0x3bc36625, (i>=0) && (i<4), "Component index out of range");
    switch (i)
    {
        case 3:  zeroComponent<3>(); break;
        case 2:  zeroComponent<2>(); break;
        case 1:  zeroComponent<1>(); break;
        default: zeroComponent<0>(); break;
    }
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::getComponent<0>() const
{
    return hkSimdDouble64::convert(_mm_unpacklo_pd(m_quad.xy, m_quad.xy));
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::getComponent<1>() const
{
    return hkSimdDouble64::convert(_mm_unpackhi_pd(m_quad.xy, m_quad.xy));
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::getComponent<2>() const
{
    return hkSimdDouble64::convert(_mm_unpacklo_pd(m_quad.zw, m_quad.zw));
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::getComponent<3>() const
{
    return hkSimdDouble64::convert(_mm_unpackhi_pd(m_quad.zw, m_quad.zw));
}

template <int N>
HK_INLINE const hkSimdDouble64 hkVector4d::getComponent() const
{
    HK_VECTOR4_NOT_IMPLEMENTED;
    return hkSimdDouble64::getConstant<HK_QUADREAL_0>();
}

HK_INLINE const hkSimdDouble64 hkVector4d::getComponent(const int i) const
{
    HK_MATH_ASSERT(0x6d0c31d7, i>=0 && i<4, "index out of bounds for component access");
    switch (i)
    {
        case 1:  return getComponent<1>();
        case 2:  return getComponent<2>();
        case 3:  return getComponent<3>();
        default: return getComponent<0>();
    }
}

HK_INLINE void hkVector4d::setAdd(hkVector4dParameter v0, hkVector4dParameter v1)
{
    m_quad.xy = _mm_add_pd(v0.m_quad.xy, v1.m_quad.xy);
    m_quad.zw = _mm_add_pd(v0.m_quad.zw, v1.m_quad.zw);
}

HK_INLINE void hkVector4d::setAdd(hkVector4dParameter v0, hkSimdDouble64Parameter v1)
{
    m_quad.xy = _mm_add_pd(v0.m_quad.xy, v1.m_real);
    m_quad.zw = _mm_add_pd(v0.m_quad.zw, v1.m_real);
}

HK_INLINE void hkVector4d::setSub(hkVector4dParameter v0, hkVector4dParameter v1)
{
    m_quad.xy = _mm_sub_pd(v0.m_quad.xy, v1.m_quad.xy);
    m_quad.zw = _mm_sub_pd(v0.m_quad.zw, v1.m_quad.zw);
}

HK_INLINE void hkVector4d::setSub(hkVector4dParameter v0, hkSimdDouble64Parameter v1)
{
    m_quad.xy = _mm_sub_pd(v0.m_quad.xy, v1.m_real);
    m_quad.zw = _mm_sub_pd(v0.m_quad.zw, v1.m_real);
}

HK_INLINE void hkVector4d::setMul(hkVector4dParameter v0, hkVector4dParameter v1)
{
    m_quad.xy = _mm_mul_pd(v0.m_quad.xy, v1.m_quad.xy);
    m_quad.zw = _mm_mul_pd(v0.m_quad.zw, v1.m_quad.zw);
}

HK_INLINE void hkVector4d::setMul(hkVector4dParameter v1, hkSimdDouble64Parameter r)
{
    m_quad.xy = _mm_mul_pd( r.m_real, v1.m_quad.xy);
    m_quad.zw = _mm_mul_pd( r.m_real, v1.m_quad.zw);
}

HK_INLINE void hkVector4d::setSubMul(hkVector4dParameter a, hkVector4dParameter b, hkSimdDouble64Parameter r)
{
    m_quad.xy = _mm_sub_pd( a.m_quad.xy, _mm_mul_pd( r.m_real, b.m_quad.xy) );
    m_quad.zw = _mm_sub_pd( a.m_quad.zw, _mm_mul_pd( r.m_real, b.m_quad.zw) );
}

HK_INLINE void hkVector4d::setAddMul(hkVector4dParameter a, hkVector4dParameter b, hkSimdDouble64Parameter r)
{
    m_quad.xy = _mm_add_pd( a.m_quad.xy, _mm_mul_pd( r.m_real, b.m_quad.xy) );
    m_quad.zw = _mm_add_pd( a.m_quad.zw, _mm_mul_pd( r.m_real, b.m_quad.zw) );
}

HK_INLINE void hkVector4d::setAddMul(hkVector4dParameter a, hkVector4dParameter x, hkVector4dParameter y)
{
    m_quad.xy = _mm_add_pd( a.m_quad.xy, _mm_mul_pd( x.m_quad.xy, y.m_quad.xy) );
    m_quad.zw = _mm_add_pd( a.m_quad.zw, _mm_mul_pd( x.m_quad.zw, y.m_quad.zw) );
}

HK_INLINE void hkVector4d::setSubMul(hkVector4dParameter a, hkVector4dParameter x, hkVector4dParameter y)
{
    m_quad.xy = _mm_sub_pd( a.m_quad.xy, _mm_mul_pd( x.m_quad.xy, y.m_quad.xy) );
    m_quad.zw = _mm_sub_pd( a.m_quad.zw, _mm_mul_pd( x.m_quad.zw, y.m_quad.zw) );
}

HK_INLINE void hkVector4d::setCross( hkVector4dParameter v0, hkVector4dParameter v1 )
{
    const hkSingleDouble64 cross0XY = _mm_mul_pd(v0.m_quad.xy, _mm_shuffle_pd(v1.m_quad.xy, v1.m_quad.zw, _MM_SHUFFLE2(0,1)));
    const hkSingleDouble64 cross0ZW = _mm_mul_pd(v0.m_quad.zw, _mm_shuffle_pd(v1.m_quad.xy, v1.m_quad.zw, _MM_SHUFFLE2(1,0)));

    const hkSingleDouble64 cross1XY = _mm_mul_pd(v1.m_quad.xy, _mm_shuffle_pd(v0.m_quad.xy, v0.m_quad.zw, _MM_SHUFFLE2(0,1)));
    const hkSingleDouble64 cross1ZW = _mm_mul_pd(v1.m_quad.zw, _mm_shuffle_pd(v0.m_quad.xy, v0.m_quad.zw, _MM_SHUFFLE2(1,0)));

    const hkSingleDouble64 diffXY = _mm_sub_pd(cross0XY, cross1XY);
    const hkSingleDouble64 diffZW = _mm_sub_pd(cross0ZW, cross1ZW);

    m_quad.xy = _mm_shuffle_pd(diffXY, diffZW, _MM_SHUFFLE2(0,1));
    m_quad.zw = _mm_shuffle_pd(diffXY, diffZW, _MM_SHUFFLE2(1,0));
}

HK_INLINE const hkVector4dComparison hkVector4d::equal(hkVector4dParameter a) const
{
    hkVector4dComparison comp;
    comp.m_mask.xy = _mm_cmpeq_pd(m_quad.xy, a.m_quad.xy);
    comp.m_mask.zw = _mm_cmpeq_pd(m_quad.zw, a.m_quad.zw);
    return comp;
}

HK_INLINE const hkVector4dComparison hkVector4d::notEqual(hkVector4dParameter a) const
{
    hkVector4dComparison comp;
    comp.m_mask.xy = _mm_cmpneq_pd(m_quad.xy, a.m_quad.xy);
    comp.m_mask.zw = _mm_cmpneq_pd(m_quad.zw, a.m_quad.zw);
    return comp;
}

HK_INLINE const hkVector4dComparison hkVector4d::less(hkVector4dParameter a) const
{
    hkVector4dComparison comp;
    comp.m_mask.xy = _mm_cmplt_pd(m_quad.xy, a.m_quad.xy);
    comp.m_mask.zw = _mm_cmplt_pd(m_quad.zw, a.m_quad.zw);
    return comp;
}

HK_INLINE const hkVector4dComparison hkVector4d::lessEqual(hkVector4dParameter a) const
{
    hkVector4dComparison comp;
    comp.m_mask.xy = _mm_cmple_pd(m_quad.xy, a.m_quad.xy);
    comp.m_mask.zw = _mm_cmple_pd(m_quad.zw, a.m_quad.zw);
    return comp;
}

HK_INLINE const hkVector4dComparison hkVector4d::greater(hkVector4dParameter a) const
{
    hkVector4dComparison comp;
    comp.m_mask.xy = _mm_cmpgt_pd(m_quad.xy, a.m_quad.xy);
    comp.m_mask.zw = _mm_cmpgt_pd(m_quad.zw, a.m_quad.zw);
    return comp;
}

HK_INLINE const hkVector4dComparison hkVector4d::greaterEqual(hkVector4dParameter a) const
{
    hkVector4dComparison comp;
    comp.m_mask.xy = _mm_cmpge_pd(m_quad.xy, a.m_quad.xy);
    comp.m_mask.zw = _mm_cmpge_pd(m_quad.zw, a.m_quad.zw);
    return comp;
}

HK_INLINE const hkVector4dComparison hkVector4d::lessZero() const
{
    hkVector4dComparison comp;
    const hkSingleDouble64 zero = _mm_setzero_pd();
    comp.m_mask.xy = _mm_cmplt_pd(m_quad.xy, zero);
    comp.m_mask.zw = _mm_cmplt_pd(m_quad.zw, zero);
    return comp;
}

HK_INLINE const hkVector4dComparison hkVector4d::lessEqualZero() const
{
    hkVector4dComparison comp;
    const hkSingleDouble64 zero = _mm_setzero_pd();
    comp.m_mask.xy = _mm_cmple_pd(m_quad.xy, zero);
    comp.m_mask.zw = _mm_cmple_pd(m_quad.zw, zero);
    return comp;
}

HK_INLINE const hkVector4dComparison hkVector4d::greaterZero() const
{
    hkVector4dComparison comp;
    const hkSingleDouble64 zero = _mm_setzero_pd();
    comp.m_mask.xy = _mm_cmpgt_pd(m_quad.xy, zero);
    comp.m_mask.zw = _mm_cmpgt_pd(m_quad.zw, zero);
    return comp;
}

HK_INLINE const hkVector4dComparison hkVector4d::greaterEqualZero() const
{
    hkVector4dComparison comp;
    const hkSingleDouble64 zero = _mm_setzero_pd();
    comp.m_mask.xy = _mm_cmpge_pd(m_quad.xy, zero);
    comp.m_mask.zw = _mm_cmpge_pd(m_quad.zw, zero);
    return comp;
}

HK_INLINE const hkVector4dComparison hkVector4d::equalZero() const
{
    hkVector4dComparison comp;
    const hkSingleDouble64 zero = _mm_setzero_pd();
    comp.m_mask.xy = _mm_cmpeq_pd(m_quad.xy, zero);
    comp.m_mask.zw = _mm_cmpeq_pd(m_quad.zw, zero);
    return comp;
}

HK_INLINE const hkVector4dComparison hkVector4d::notEqualZero() const
{
    hkVector4dComparison comp;
    const hkSingleDouble64 zero = _mm_setzero_pd();
    comp.m_mask.xy = _mm_cmpneq_pd(m_quad.xy, zero);
    comp.m_mask.zw = _mm_cmpneq_pd(m_quad.zw, zero);
    return comp;
}

HK_INLINE void hkVector4d::setSelect( hkVector4dComparisonParameter comp, hkVector4dParameter trueValue, hkVector4dParameter falseValue )
{
#if HK_SSE_VERSION >= 0x41
    m_quad.xy = _mm_blendv_pd(falseValue.m_quad.xy, trueValue.m_quad.xy, comp.m_mask.xy);
    m_quad.zw = _mm_blendv_pd(falseValue.m_quad.zw, trueValue.m_quad.zw, comp.m_mask.zw);
#else
    m_quad.xy = _mm_or_pd( _mm_and_pd(comp.m_mask.xy, trueValue.m_quad.xy), _mm_andnot_pd(comp.m_mask.xy, falseValue.m_quad.xy) );
    m_quad.zw = _mm_or_pd( _mm_and_pd(comp.m_mask.zw, trueValue.m_quad.zw), _mm_andnot_pd(comp.m_mask.zw, falseValue.m_quad.zw) );
#endif
}


template<>
HK_INLINE void hkVector4d::setSelect<hkVector4ComparisonMask::MASK_X>( hkVector4dParameter trueValue, hkVector4dParameter falseValue )
{
    m_quad.xy = MOVE_SD(falseValue.m_quad.xy, trueValue.m_quad.xy);
    m_quad.zw = falseValue.m_quad.zw;
}

template<>
HK_INLINE void hkVector4d::setSelect<hkVector4ComparisonMask::MASK_YZW>( hkVector4dParameter trueValue, hkVector4dParameter falseValue )
{
    m_quad.xy = MOVE_SD(trueValue.m_quad.xy, falseValue.m_quad.xy);
    m_quad.zw = trueValue.m_quad.zw;
}

template<>
HK_INLINE void hkVector4d::setSelect<hkVector4ComparisonMask::MASK_XY>( hkVector4dParameter trueValue, hkVector4dParameter falseValue )
{
    m_quad.xy = trueValue.m_quad.xy;
    m_quad.zw = falseValue.m_quad.zw;
}

template<>
HK_INLINE void hkVector4d::setSelect<hkVector4ComparisonMask::MASK_ZW>( hkVector4dParameter trueValue, hkVector4dParameter falseValue )
{
    m_quad.xy = falseValue.m_quad.xy;
    m_quad.zw = trueValue.m_quad.zw;
}

template<>
HK_INLINE void hkVector4d::setSelect<hkVector4ComparisonMask::MASK_XYZ>( hkVector4dParameter trueValue, hkVector4dParameter falseValue )
{
    m_quad.xy = trueValue.m_quad.xy;
    m_quad.zw = MOVE_SD(falseValue.m_quad.zw, trueValue.m_quad.zw);
}

template<>
HK_INLINE void hkVector4d::setSelect<hkVector4ComparisonMask::MASK_W>( hkVector4dParameter trueValue, hkVector4dParameter falseValue )
{
    m_quad.xy = falseValue.m_quad.xy;
    m_quad.zw = MOVE_SD(trueValue.m_quad.zw, falseValue.m_quad.zw);
}

#if HK_SSE_VERSION >= 0x41
template<hkVector4ComparisonMask::Mask M>
HK_INLINE void hkVector4d::setSelect( hkVector4dParameter trueValue, hkVector4dParameter falseValue )
{
    HK_VECTORdCOMPARISON_MASK_CHECK;
    m_quad.xy = _mm_blend_pd(falseValue.m_quad.xy, trueValue.m_quad.xy, M & 0x3);
    m_quad.zw = _mm_blend_pd(falseValue.m_quad.zw, trueValue.m_quad.zw, (M>>2));
}
#else
template<hkVector4ComparisonMask::Mask M>
HK_INLINE void hkVector4d::setSelect( hkVector4dParameter trueValue, hkVector4dParameter falseValue )
{
    hkVector4dComparison comp; comp.set<M>();
    setSelect(comp, trueValue, falseValue);
}
#endif

HK_INLINE void hkVector4d::setClampedZeroOne( hkVector4dParameter a )
{
    // This ensures that if a is NAN, clamped will be 1 afterwards
    const __m128d one = g_vectordConstants[HK_QUADREAL_1].xy;
    __m128d GTxy = _mm_cmpgt_pd( one, a.m_quad.xy );
    __m128d GTzw = _mm_cmpgt_pd( one, a.m_quad.zw );
#if HK_SSE_VERSION >= 0x41
    __m128d clamped_xy = _mm_blendv_pd(one, a.m_quad.xy, GTxy);
    __m128d clamped_zw = _mm_blendv_pd(one, a.m_quad.zw, GTzw);
#else
    __m128d clamped_xy = _mm_or_pd( _mm_and_pd(GTxy, a.m_quad.xy), _mm_andnot_pd(GTxy, one) );
    __m128d clamped_zw = _mm_or_pd( _mm_and_pd(GTzw, a.m_quad.zw), _mm_andnot_pd(GTzw, one) );
#endif
    m_quad.xy = _mm_max_pd(_mm_setzero_pd(), clamped_xy);
    m_quad.zw = _mm_max_pd(_mm_setzero_pd(), clamped_zw);
}

HK_INLINE void hkVector4d::zeroIfFalse( hkVector4dComparisonParameter comp )
{
    m_quad.xy = _mm_and_pd(comp.m_mask.xy, m_quad.xy);
    m_quad.zw = _mm_and_pd(comp.m_mask.zw, m_quad.zw);
}

HK_INLINE void hkVector4d::zeroIfTrue( hkVector4dComparisonParameter comp )
{
    m_quad.xy = _mm_andnot_pd(comp.m_mask.xy, m_quad.xy);
    m_quad.zw = _mm_andnot_pd(comp.m_mask.zw, m_quad.zw);
}


template <>
HK_INLINE void hkVector4d::setNeg<1>(hkVector4dParameter v)
{
    __m128i mask = _mm_insert_epi16(_mm_setzero_si128(), 0x8000, 0x3);
    m_quad.xy = _mm_xor_pd(v.m_quad.xy, _mm_castsi128_pd(mask));
    m_quad.zw = v.m_quad.zw;
}

template <>
HK_INLINE void hkVector4d::setNeg<2>(hkVector4dParameter v)
{
    __m128i mask = _mm_insert_epi16(_mm_setzero_si128(), 0x8000, 0x3);
    mask = _mm_shuffle_epi32(mask, _MM_SHUFFLE(1,0,1,0));
    m_quad.xy = _mm_xor_pd(v.m_quad.xy, _mm_castsi128_pd(mask));
    m_quad.zw = v.m_quad.zw;
}

template <>
HK_INLINE void hkVector4d::setNeg<3>(hkVector4dParameter v)
{
    __m128i mask = _mm_insert_epi16(_mm_setzero_si128(), 0x8000, 0x3);
    m_quad.zw = _mm_xor_pd(v.m_quad.zw, _mm_castsi128_pd(mask));
    mask = _mm_shuffle_epi32(mask, _MM_SHUFFLE(1,0,1,0));
    m_quad.xy = _mm_xor_pd(v.m_quad.xy, _mm_castsi128_pd(mask));
}

template <>
HK_INLINE void hkVector4d::setNeg<4>(hkVector4dParameter v)
{
    __m128i mask = _mm_insert_epi16(_mm_setzero_si128(), 0x8000, 0x3);
    mask = _mm_shuffle_epi32(mask, _MM_SHUFFLE(1,0,1,0));
    m_quad.xy = _mm_xor_pd(v.m_quad.xy, _mm_castsi128_pd(mask));
    m_quad.zw = _mm_xor_pd(v.m_quad.zw, _mm_castsi128_pd(mask));
}

template <int N>
HK_INLINE void hkVector4d::setNeg(hkVector4dParameter v)
{
    HK_VECTOR4_NOT_IMPLEMENTED;
}

HK_INLINE void hkVector4d::setAbs(hkVector4dParameter v)
{
    m_quad.xy = hkMath::twoFabs(v.m_quad.xy);
    m_quad.zw = hkMath::twoFabs(v.m_quad.zw);
}

HK_INLINE void hkVector4d::setMin(hkVector4dParameter a, hkVector4dParameter b)
{
    m_quad.xy = _mm_min_pd(a.m_quad.xy, b.m_quad.xy);
    m_quad.zw = _mm_min_pd(a.m_quad.zw, b.m_quad.zw);
}

HK_INLINE void hkVector4d::setMax(hkVector4dParameter a, hkVector4dParameter b)
{
    m_quad.xy = _mm_max_pd(a.m_quad.xy, b.m_quad.xy);
    m_quad.zw = _mm_max_pd(a.m_quad.zw, b.m_quad.zw);
}



/* matrix3, rotation, quaternion, transform */

HK_INLINE void hkVector4d::_setRotatedDir(const hkMatrix3d& r, hkVector4dParameter b )
{
    const hkQuadDouble64 c0 = r.getColumn<0>().m_quad;
    const hkQuadDouble64 c1 = r.getColumn<1>().m_quad;
    const hkQuadDouble64 c2 = r.getColumn<2>().m_quad;

    const hkSingleDouble64 b0 = _mm_unpacklo_pd( b.m_quad.xy, b.m_quad.xy );
    const hkSingleDouble64 b1 = _mm_unpackhi_pd( b.m_quad.xy, b.m_quad.xy );
    const hkSingleDouble64 b2 = _mm_unpacklo_pd( b.m_quad.zw, b.m_quad.zw );

    {
        const hkSingleDouble64 r0 = _mm_mul_pd( c0.xy, b0 );
        const hkSingleDouble64 r1 = _mm_mul_pd( c1.xy, b1 );
        const hkSingleDouble64 r2 = _mm_mul_pd( c2.xy, b2 );

        m_quad.xy = _mm_add_pd( _mm_add_pd(r0, r1), r2 );
    }
    {
        const hkSingleDouble64 r0 = _mm_mul_pd( c0.zw, b0 );
        const hkSingleDouble64 r1 = _mm_mul_pd( c1.zw, b1 );
        const hkSingleDouble64 r2 = _mm_mul_pd( c2.zw, b2 );

        m_quad.zw = _mm_add_pd( _mm_add_pd(r0, r1), r2 );
    }
}

HK_INLINE void hkVector4d::_setRotatedInverseDir(const hkMatrix3d& r, hkVector4dParameter b )
{
#if HK_SSE_VERSION >= 0x41
    const hkQuadDouble64 c0 = r.getColumn<0>().m_quad;
    const hkQuadDouble64 c1 = r.getColumn<1>().m_quad;
    const hkQuadDouble64 c2 = r.getColumn<2>().m_quad;

    const hkSingleDouble64 r0a = _mm_dp_pd( c0.xy, b.m_quad.xy, 0x31 );
    const hkSingleDouble64 r0b = _mm_dp_pd( c0.zw, b.m_quad.zw, 0x11 );

    const hkSingleDouble64 r1a = _mm_dp_pd( c1.xy, b.m_quad.xy, 0x32 );
    const hkSingleDouble64 r1b = _mm_dp_pd( c1.zw, b.m_quad.zw, 0x12 );

    const hkSingleDouble64 r2a = _mm_dp_pd( c2.xy, b.m_quad.xy, 0x31 );
    const hkSingleDouble64 r2b = _mm_dp_pd( c2.zw, b.m_quad.zw, 0x11 );

    m_quad.xy = _mm_or_pd( _mm_add_pd(r0a, r0b), _mm_add_pd(r1a, r1b) );
    m_quad.zw = _mm_add_pd(r2a, r2b);
#else
    hkVector4d c0 = r.getColumn<0>();
    hkVector4d c1 = r.getColumn<1>();
    hkVector4d c2 = r.getColumn<2>();

    HK_TRANSPOSE3d(c0,c1,c2);

    const hkSingleDouble64 b0 = _mm_unpacklo_pd( b.m_quad.xy, b.m_quad.xy );
    const hkSingleDouble64 b1 = _mm_unpackhi_pd( b.m_quad.xy, b.m_quad.xy );
    const hkSingleDouble64 b2 = _mm_unpacklo_pd( b.m_quad.zw, b.m_quad.zw );

    {
        const hkSingleDouble64 r0 = _mm_mul_pd( c0.m_quad.xy, b0 );
        const hkSingleDouble64 r1 = _mm_mul_pd( c1.m_quad.xy, b1 );
        const hkSingleDouble64 r2 = _mm_mul_pd( c2.m_quad.xy, b2 );

        m_quad.xy = _mm_add_pd( _mm_add_pd(r0, r1), r2 );
    }
    {
        const hkSingleDouble64 r0 = _mm_mul_pd( c0.m_quad.zw, b0 );
        const hkSingleDouble64 r1 = _mm_mul_pd( c1.m_quad.zw, b1 );
        const hkSingleDouble64 r2 = _mm_mul_pd( c2.m_quad.zw, b2 );

        m_quad.zw = _mm_add_pd( _mm_add_pd(r0, r1), r2 );
    }
#endif
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::dot<1>(hkVector4dParameter a) const
{
#if HK_SSE_VERSION >= 0x41
    return hkSimdDouble64::convert(_mm_dp_pd(m_quad.xy, a.m_quad.xy, 0x13));
#else
    const hkSingleDouble64 x2 = _mm_mul_sd(m_quad.xy, a.m_quad.xy);
    return hkSimdDouble64::convert(_mm_unpacklo_pd(x2, x2));
#endif
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::dot<2>(hkVector4dParameter a) const
{
#if HK_SSE_VERSION >= 0x41
    return hkSimdDouble64::convert(_mm_dp_pd(m_quad.xy, a.m_quad.xy, 0x33));
#elif HK_SSE_VERSION >= 0x30
    const hkSingleDouble64 x2 = _mm_mul_pd(m_quad.xy,a.m_quad.xy);
    return hkSimdDouble64::convert(_mm_hadd_pd(x2,x2));
#else
    const hkSingleDouble64 x2 = _mm_mul_pd(m_quad.xy,a.m_quad.xy);
    const hkSingleDouble64 result = _mm_add_pd( _mm_unpacklo_pd(x2,x2), _mm_unpackhi_pd(x2,x2) ); // xy xy
    return hkSimdDouble64::convert(result);
#endif
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::dot<3>(hkVector4dParameter a) const
{
#if HK_SSE_VERSION >= 0x41
    const hkSingleDouble64 xy = _mm_dp_pd(m_quad.xy, a.m_quad.xy, 0x33);
    const hkSingleDouble64 z  = _mm_dp_pd(m_quad.zw, a.m_quad.zw, 0x13);
    return hkSimdDouble64::convert(_mm_add_pd(xy,z));
#elif HK_SSE_VERSION >= 0x30
    const hkSingleDouble64 x2a = _mm_mul_pd(m_quad.xy,a.m_quad.xy);
    const hkSingleDouble64 x2b = _mm_mul_pd(m_quad.zw,a.m_quad.zw);
    const hkSingleDouble64 hsum = _mm_hadd_pd(x2a,x2a); // xy xy
    const hkSingleDouble64 z = _mm_unpacklo_pd(x2b,x2b); // zz
    return hkSimdDouble64::convert(_mm_add_pd(hsum, z)); // xyz xyz
#else
    const hkSingleDouble64 x2a = _mm_mul_pd(m_quad.xy,a.m_quad.xy);
    const hkSingleDouble64 x2b = _mm_mul_pd(m_quad.zw,a.m_quad.zw);
    const hkSingleDouble64 xySum = _mm_add_pd( _mm_unpacklo_pd(x2a,x2a), _mm_unpackhi_pd(x2a,x2a) ); // xy xy
    const hkSingleDouble64 z = _mm_unpacklo_pd(x2b,x2b); // zz
    const hkSingleDouble64 result = _mm_add_pd( z, xySum); // xyz xyz
    return hkSimdDouble64::convert(result);
#endif
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::dot<4>(hkVector4dParameter a) const
{
#if HK_SSE_VERSION >= 0x41
    const hkSingleDouble64 xy = _mm_dp_pd(m_quad.xy, a.m_quad.xy, 0x33);
    const hkSingleDouble64 zw = _mm_dp_pd(m_quad.zw, a.m_quad.zw, 0x33);
    return hkSimdDouble64::convert(_mm_add_pd(xy,zw));
#elif HK_SSE_VERSION >= 0x30
    const hkSingleDouble64 x2a = _mm_mul_pd(m_quad.xy,a.m_quad.xy);
    const hkSingleDouble64 x2b = _mm_mul_pd(m_quad.zw,a.m_quad.zw);
    const hkSingleDouble64 hsum0 = _mm_hadd_pd(x2a,x2b); // xy zw
    return hkSimdDouble64::convert(_mm_hadd_pd(hsum0,hsum0)); // xyzw all 4
#else
    const hkSingleDouble64 x2a = _mm_mul_pd(m_quad.xy,a.m_quad.xy);
    const hkSingleDouble64 x2b = _mm_mul_pd(m_quad.zw,a.m_quad.zw);
    const hkSingleDouble64 sum0a = _mm_add_pd( x2a,x2b ); // xz yw
    const hkSingleDouble64 sum0b = _mm_add_pd( _mm_shuffle_pd(sum0a,sum0a,_MM_SHUFFLE2(0,1)), sum0a); // xzyw ywxz
    return hkSimdDouble64::convert(sum0b);
#endif
}

template <int N>
HK_INLINE const hkSimdDouble64 hkVector4d::dot(hkVector4dParameter a) const
{
    HK_VECTOR4_NOT_IMPLEMENTED;
    return hkSimdDouble64::getConstant<HK_QUADREAL_0>();
}


HK_INLINE const hkSimdDouble64 hkVector4d::dot4xyz1(hkVector4dParameter a) const
{
#if HK_SSE_VERSION >= 0x41
    const hkSingleDouble64 xy = _mm_dp_pd(m_quad.xy, a.m_quad.xy, 0x33);
    const hkSingleDouble64 z  = _mm_dp_pd(m_quad.zw, a.m_quad.zw, 0x13);
    const hkSingleDouble64 xyz= _mm_add_pd(xy,z);
    const hkSingleDouble64 w  = _mm_unpackhi_pd(m_quad.zw, m_quad.zw);
    return hkSimdDouble64::convert(_mm_add_pd(xyz,w));
#elif HK_SSE_VERSION >= 0x30
    const hkSingleDouble64 xx2a = _mm_mul_pd(m_quad.xy,a.m_quad.xy);
    const hkSingleDouble64 xx2bf = _mm_mul_pd(m_quad.zw,a.m_quad.zw);
    const hkSingleDouble64 xx2b = _mm_shuffle_pd(m_quad.zw, xx2bf, _MM_SHUFFLE2(0,1));  // replace w by this.w
    const hkSingleDouble64 hsum0 = _mm_hadd_pd(xx2a,xx2b); // xy zw
    return hkSimdDouble64::convert(_mm_hadd_pd(hsum0,hsum0)); // xyzw all 4
#else
    const hkSingleDouble64 xx2a = _mm_mul_pd(m_quad.xy,a.m_quad.xy);
    const hkSingleDouble64 xx2bf = _mm_mul_pd(m_quad.zw,a.m_quad.zw);
    const hkSingleDouble64 xx2b = _mm_shuffle_pd(m_quad.zw, xx2bf, _MM_SHUFFLE2(0,1));  // replace w by this.w
    const hkSingleDouble64 sum0a = _mm_add_pd( xx2a,xx2b ); // xz yw
    const hkSingleDouble64 sum0b = _mm_add_pd( _mm_shuffle_pd(sum0a,sum0a,_MM_SHUFFLE2(0,1)), sum0a); // xzyw ywxz
    return hkSimdDouble64::convert(sum0b);
#endif
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::horizontalAdd<2>() const
{
#if HK_SSE_VERSION >= 0x30
    return hkSimdDouble64::convert(_mm_hadd_pd(m_quad.xy, m_quad.xy));
#else
    return hkSimdDouble64::convert(_mm_add_pd( _mm_shuffle_pd(m_quad.xy,m_quad.xy,_MM_SHUFFLE2(0,1)), m_quad.xy));
#endif
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::horizontalAdd<3>() const
{
#if HK_SSE_VERSION >= 0x30
    const hkSingleDouble64 x2 = _mm_hadd_pd(m_quad.xy, m_quad.xy);
    return hkSimdDouble64::convert(_mm_add_pd( _mm_unpacklo_pd(m_quad.zw,m_quad.zw), x2));
#else
    const hkSingleDouble64 xySum = _mm_add_pd( _mm_shuffle_pd(m_quad.xy,m_quad.xy,_MM_SHUFFLE2(0,1)), m_quad.xy);
    return hkSimdDouble64::convert(_mm_add_pd( _mm_unpacklo_pd(m_quad.zw,m_quad.zw), xySum));
#endif
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::horizontalAdd<4>() const
{
#if HK_SSE_VERSION >= 0x30
    const hkSingleDouble64 x2a = _mm_hadd_pd(m_quad.xy, m_quad.zw);
    return hkSimdDouble64::convert(_mm_hadd_pd(x2a, x2a));
#else
    const hkSingleDouble64 sum0 = _mm_add_pd( m_quad.xy, m_quad.zw ); // xz yw
    const hkSingleDouble64 sum1 = _mm_add_pd( _mm_shuffle_pd( sum0, sum0,_MM_SHUFFLE2(0,1)), sum0 ); // xzyw ywxz
    return hkSimdDouble64::convert(_mm_shuffle_pd(sum1,sum1,_MM_SHUFFLE2(0,0)));
#endif
}

template <int N>
HK_INLINE const hkSimdDouble64 hkVector4d::horizontalAdd() const
{
    HK_VECTOR4_NOT_IMPLEMENTED;
    return hkSimdDouble64::getConstant<HK_QUADREAL_0>();
}

void hkVector4d::setPairedAdd(hkVector4dParameter a, hkVector4dParameter b)
{
    set(
        a.getComponent<0>() + a.getComponent<1>(),
        a.getComponent<2>() + a.getComponent<3>(),
        b.getComponent<0>() + b.getComponent<1>(),
        b.getComponent<2>() + b.getComponent<3>()
        );
}


template <>
HK_INLINE const hkSimdDouble64 hkVector4d::horizontalMul<2>() const
{
    return hkSimdDouble64::convert(_mm_mul_pd( _mm_shuffle_pd(m_quad.xy,m_quad.xy,_MM_SHUFFLE2(0,1)), m_quad.xy));
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::horizontalMul<3>() const
{
    const hkSingleDouble64 xyProd = _mm_mul_pd( _mm_shuffle_pd(m_quad.xy,m_quad.xy,_MM_SHUFFLE2(0,1)), m_quad.xy);
    return hkSimdDouble64::convert(_mm_mul_pd( _mm_unpacklo_pd(m_quad.zw,m_quad.zw), xyProd));
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::horizontalMul<4>() const
{
    const hkSingleDouble64 p0 = _mm_mul_pd( m_quad.xy, m_quad.zw ); // xz yw
    const hkSingleDouble64 p1 = _mm_mul_pd( _mm_shuffle_pd( p0, p0,_MM_SHUFFLE2(0,1)), p0 ); // xzyw ywxz
    return hkSimdDouble64::convert(_mm_shuffle_pd(p1,p1,_MM_SHUFFLE2(0,0)));
}

template <int N>
HK_INLINE const hkSimdDouble64 hkVector4d::horizontalMul() const
{
    HK_VECTOR4_NOT_IMPLEMENTED;
    return hkSimdDouble64::getConstant<HK_QUADREAL_0>();
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::horizontalMin<1>() const
{
    return getComponent<0>();
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::horizontalMin<2>() const
{
    const hkSingleDouble64 m = _mm_min_pd( _mm_shuffle_pd(m_quad.xy,m_quad.xy,_MM_SHUFFLE2(0,1)), m_quad.xy); // xy yx
    return hkSimdDouble64::convert(_mm_shuffle_pd(m,m,_MM_SHUFFLE2(0,0)));
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::horizontalMin<3>() const
{
    const hkSingleDouble64 xy = _mm_min_pd( _mm_shuffle_pd(m_quad.xy,m_quad.xy,_MM_SHUFFLE2(0,1)), m_quad.xy); // xy yx
    const hkSingleDouble64 xyz = _mm_min_pd( m_quad.zw, xy ); // xyz yxw
    return hkSimdDouble64::convert(_mm_shuffle_pd(xyz,xyz,_MM_SHUFFLE2(0,0)));
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::horizontalMin<4>() const
{
    const hkSingleDouble64 sum0 = _mm_min_pd( m_quad.xy, m_quad.zw ); // xz yw
    const hkSingleDouble64 sum1 = _mm_min_pd( _mm_shuffle_pd( sum0, sum0,_MM_SHUFFLE2(0,1)), sum0 ); // xzyw ywxz
    return hkSimdDouble64::convert(_mm_shuffle_pd(sum1,sum1,_MM_SHUFFLE2(0,0)));
}

template <int N>
HK_INLINE const hkSimdDouble64 hkVector4d::horizontalMin() const
{
    HK_VECTOR4_NOT_IMPLEMENTED;
    return hkSimdDouble64::getConstant<HK_QUADREAL_0>();
}

/* operator () */

HK_INLINE hkDouble64& hkVector4d::operator() (int a)
{
    HK_MATH_ASSERT(0x6d0c31d7, a>=0 && a<4, "index out of bounds for component access");
    if (a<2)
        return HK_M128(m_quad.xy).m128d_f64[a];
    else
        return HK_M128(m_quad.zw).m128d_f64[a-2];
}

HK_INLINE const hkDouble64& hkVector4d::operator() (int a) const
{
    HK_MATH_ASSERT(0x6d0c31d7, a>=0 && a<4, "index out of bounds for component access");
    if (a<2)
        return HK_M128(m_quad.xy).m128d_f64[a];
    else
        return HK_M128(m_quad.zw).m128d_f64[a-2];
}

HK_INLINE void hkVector4d::setXYZ_W(hkVector4dParameter xyz, hkVector4dParameter w)
{
    m_quad.xy = xyz.m_quad.xy;
    m_quad.zw = MOVE_SD(w.m_quad.zw, xyz.m_quad.zw);
}

HK_INLINE void hkVector4d::setXYZ_W(hkVector4dParameter xyz, hkSimdDouble64Parameter w)
{
    m_quad.xy = xyz.m_quad.xy;
    m_quad.zw = MOVE_SD(w.m_real, xyz.m_quad.zw);
}

HK_INLINE void hkVector4d::setW(hkVector4dParameter w)
{
    m_quad.zw = MOVE_SD(w.m_quad.zw, m_quad.zw);
}

HK_INLINE void hkVector4d::setXYZ(hkVector4dParameter xyz)
{
    m_quad.xy = xyz.m_quad.xy;
    m_quad.zw = MOVE_SD(m_quad.zw, xyz.m_quad.zw);
}

HK_INLINE void hkVector4d::setXYZ(hkDouble64 v)
{
#if HK_SSE_VERSION >= 0x30
    m_quad.xy = _mm_loaddup_pd(&v);
#else
    m_quad.xy = _mm_load1_pd(&v);
#endif
    m_quad.zw = MOVE_SD(m_quad.zw,m_quad.xy);
}

HK_INLINE void hkVector4d::setXYZ(hkSimdDouble64Parameter v)
{
    m_quad.xy = v.m_real;
    m_quad.zw = MOVE_SD(m_quad.zw, v.m_real);
}

HK_INLINE void hkVector4d::setXYZ_0(hkVector4dParameter xyz)
{
    m_quad.xy = xyz.m_quad.xy;
#if defined(HK_COMPILER_MSVC)
    // VS bug: movq instruction causes mem corruption
    m_quad.zw = MOVE_SD(_mm_setzero_pd(), xyz.m_quad.zw);
#else
    m_quad.zw = _mm_castsi128_pd(_mm_move_epi64(_mm_castpd_si128(xyz.m_quad.zw)));
#endif
}


template <>
HK_INLINE void hkVector4d::setComponent<0>(hkSimdDouble64Parameter val)
{
    m_quad.xy = MOVE_SD( m_quad.xy, val.m_real );
}
template <>
HK_INLINE void hkVector4d::setComponent<1>(hkSimdDouble64Parameter val)
{
    m_quad.xy = MOVE_SD( val.m_real, m_quad.xy );
}
template <>
HK_INLINE void hkVector4d::setComponent<2>(hkSimdDouble64Parameter val)
{
    m_quad.zw = MOVE_SD( m_quad.zw, val.m_real);
}
template <>
HK_INLINE void hkVector4d::setComponent<3>(hkSimdDouble64Parameter val)
{
    m_quad.zw = MOVE_SD( val.m_real, m_quad.zw);
}

template <int I>
HK_INLINE void hkVector4d::setComponent(hkSimdDouble64Parameter val)
{
    HK_VECTOR4_SUBINDEX_CHECK;
}

HK_INLINE void hkVector4d::setComponent(const int I, hkSimdDouble64Parameter val)
{
    HK_MATH_ASSERT(0x6d0c31d7, I>=0 && I<4, "index out of bounds for component access");

#if defined(HK_COMPILER_GCC)
    if (__builtin_constant_p(I))
    {
        switch(I)
        {
            case 3: setComponent<3>(val); break;
            case 2: setComponent<2>(val); break;
            case 1: setComponent<1>(val); break;
            default: setComponent<0>(val); break;
        }
    }
    else
#endif
    {
        hkVector4dComparison cmp;   cmp.set((hkVector4ComparisonMask::Mask)(hkVector4ComparisonMask::MASK_X << I));

        if (I<2)
        {
#if HK_SSE_VERSION >= 0x41
            m_quad.xy = _mm_blendv_pd(m_quad.xy, val.m_real, cmp.m_mask.xy);
#else
            m_quad.xy = _mm_or_pd( _mm_and_pd(cmp.m_mask.xy, val.m_real), _mm_andnot_pd(cmp.m_mask.xy, m_quad.xy) );
#endif
        }
        else
        {
#if HK_SSE_VERSION >= 0x41
            m_quad.zw = _mm_blendv_pd(m_quad.zw, val.m_real, cmp.m_mask.zw);
#else
            m_quad.zw = _mm_or_pd( _mm_and_pd(cmp.m_mask.zw, val.m_real), _mm_andnot_pd(cmp.m_mask.zw, m_quad.zw) );
#endif
        }
    }
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::horizontalMax<1>() const
{
    return getComponent<0>();
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::horizontalMax<2>() const
{
    const hkSingleDouble64 xy = _mm_max_pd( _mm_shuffle_pd(m_quad.xy,m_quad.xy,_MM_SHUFFLE2(0,1)), m_quad.xy); // xy yx
    return hkSimdDouble64::convert( _mm_shuffle_pd(xy,xy,_MM_SHUFFLE2(0,0)) );
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::horizontalMax<3>() const
{
    const hkSingleDouble64 xy = _mm_max_pd( _mm_shuffle_pd(m_quad.xy,m_quad.xy,_MM_SHUFFLE2(0,1)), m_quad.xy); // xy yx
    const hkSingleDouble64 xyz = _mm_max_pd( xy, m_quad.zw ); // xyz yxw
    return hkSimdDouble64::convert( _mm_shuffle_pd(xyz,xyz,_MM_SHUFFLE2(0,0)) );
}

template <>
HK_INLINE const hkSimdDouble64 hkVector4d::horizontalMax<4>() const
{
    const hkSingleDouble64 sum0 = _mm_max_pd( m_quad.xy, m_quad.zw ); // xz yw
    const hkSingleDouble64 sum1 = _mm_max_pd( _mm_shuffle_pd( sum0, sum0,_MM_SHUFFLE2(0,1)), sum0 ); // xzyw ywxz
    return hkSimdDouble64::convert( _mm_shuffle_pd(sum1,sum1,_MM_SHUFFLE2(0,0)) );
}

template <int N>
HK_INLINE const hkSimdDouble64 hkVector4d::horizontalMax() const
{
    HK_VECTOR4_NOT_IMPLEMENTED;
    return hkSimdDouble64::getConstant<HK_QUADREAL_0>();
}

template <>
HK_INLINE hkBool32 hkVector4d::isOk<1>() const
{
#ifdef HK_COMPILER_MSVC
    // 64-bit integer cmpeq is MSVC-specific
    __m128i isInvalidXY = _mm_cmpeq_epi64(_mm_and_si128(*(const __m128i*)&m_quad.xy, *(__m128i*)g_mantissaMaskd), *(__m128i*)g_mantissaMaskd);
    return !(_mm_movemask_epi8(isInvalidXY) & 0x0001);
#else
    __m128i quadXYMasked = _mm_and_si128(*(const __m128i*)&m_quad.xy, *(__m128i*)g_mantissaMaskd);
    __m128d isInvalidXY = _mm_cmpeq_pd(*(__m128d*)(&quadXYMasked), *(__m128d*)g_mantissaMaskd);
    return !(_mm_movemask_epi8(*(__m128i*)(&isInvalidXY)) & 0x0001);
#endif
}

template <>
HK_INLINE hkBool32 hkVector4d::isOk<2>() const
{
#ifdef HK_COMPILER_MSVC
    // 64-bit integer cmpeq is MSVC-specific
    __m128i isInvalidXY = _mm_cmpeq_epi64(_mm_and_si128(*(const __m128i*)&m_quad.xy, *(__m128i*)g_mantissaMaskd), *(__m128i*)g_mantissaMaskd);
    return !(_mm_movemask_epi8(isInvalidXY) & 0x0001);
#else
    __m128i quadXYMasked = _mm_and_si128(*(const __m128i*)&m_quad.xy, *(__m128i*)g_mantissaMaskd);
    __m128d isInvalidXY = _mm_cmpeq_pd(*(__m128d*)(&quadXYMasked), *(__m128d*)g_mantissaMaskd);
    return !(_mm_movemask_epi8(*(__m128i*)(&isInvalidXY)) & 0x0101);
#endif
}

template <>
HK_INLINE hkBool32 hkVector4d::isOk<3>() const
{
#ifdef HK_COMPILER_MSVC
    // 64-bit integer cmpeq is MSVC-specific
    __m128i isInvalidXY = _mm_cmpeq_epi64(_mm_and_si128(*(const __m128i*)&m_quad.xy, *(__m128i*)g_mantissaMaskd), *(__m128i*)g_mantissaMaskd);
    __m128i isInvalidZW = _mm_cmpeq_epi64(_mm_and_si128(*(const __m128i*)&m_quad.zw, *(__m128i*)g_mantissaMaskd), *(__m128i*)g_mantissaMaskd);
    return !((_mm_movemask_epi8(isInvalidXY) & 0x0101) || (_mm_movemask_epi8(isInvalidZW) & 0x0001));
#else
    __m128i quadXYMasked = _mm_and_si128(*(const __m128i*)&m_quad.xy, *(__m128i*)g_mantissaMaskd);
    __m128d isInvalidXY = _mm_cmpeq_pd(*(__m128d*)(&quadXYMasked), *(__m128d*)g_mantissaMaskd);

    __m128i quadZWMasked = _mm_and_si128(*(const __m128i*)&m_quad.zw, *(__m128i*)g_mantissaMaskd);
    __m128d isInvalidZW = _mm_cmpeq_pd(*(__m128d*)(&quadZWMasked), *(__m128d*)g_mantissaMaskd);

    return !((_mm_movemask_epi8(*(__m128i*)(&isInvalidXY)) & 0x0101) || (_mm_movemask_epi8(*(__m128i*)(&isInvalidZW)) & 0x0001));
#endif
}

template <>
HK_INLINE hkBool32 hkVector4d::isOk<4>() const
{
#ifdef HK_COMPILER_MSVC
    // 64-bit integer cmpeq is MSVC-specific
    __m128i isInvalidXY = _mm_cmpeq_epi64(_mm_and_si128(*(const __m128i*)&m_quad.xy, *(__m128i*)g_mantissaMaskd), *(__m128i*)g_mantissaMaskd);
    __m128i isInvalidZW = _mm_cmpeq_epi64(_mm_and_si128(*(const __m128i*)&m_quad.zw, *(__m128i*)g_mantissaMaskd), *(__m128i*)g_mantissaMaskd);
    return !((_mm_movemask_epi8(isInvalidXY) & 0x0101) || (_mm_movemask_epi8(isInvalidZW) & 0x0101));
#else
    __m128i quadXYMasked = _mm_and_si128(*(const __m128i*)&m_quad.xy, *(__m128i*)g_mantissaMaskd);
    __m128d isInvalidXY = _mm_cmpeq_pd(*(__m128d*)(&quadXYMasked), *(__m128d*)g_mantissaMaskd);

    __m128i quadZWMasked = _mm_and_si128(*(const __m128i*)&m_quad.zw, *(__m128i*)g_mantissaMaskd);
    __m128d isInvalidZW = _mm_cmpeq_pd(*(__m128d*)(&quadZWMasked), *(__m128d*)g_mantissaMaskd);

    return !((_mm_movemask_epi8(*(__m128i*)(&isInvalidXY)) & 0x0101) || (_mm_movemask_epi8(*(__m128i*)(&isInvalidZW)) & 0x0101));
#endif
}

template <int N>
HK_INLINE hkBool32 hkVector4d::isOk() const
{
    HK_VECTOR4_NOT_IMPLEMENTED;
    return false;
}

#if HK_SSE_VERSION >= 0x30
template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::XXZZ>(hkVector4dParameter v)
{
    m_quad.xy = _mm_movedup_pd(v.m_quad.xy);
    m_quad.zw = _mm_movedup_pd(v.m_quad.zw);
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::XXYY>(hkVector4dParameter v)
{
    m_quad.zw = _mm_unpackhi_pd(v.m_quad.xy,v.m_quad.xy);
    m_quad.xy = _mm_movedup_pd(v.m_quad.xy);
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::XXXX>(hkVector4dParameter v)
{
    m_quad.zw = _mm_movedup_pd(v.m_quad.xy);
    m_quad.xy = m_quad.zw;
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::ZZZZ>(hkVector4dParameter v)
{
    m_quad.xy = _mm_movedup_pd(v.m_quad.zw);
    m_quad.zw = m_quad.xy;
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::ZYZZ>(hkVector4dParameter v)
{
    m_quad.xy = MOVE_SD( v.m_quad.xy, v.m_quad.zw );
    m_quad.zw = _mm_movedup_pd(v.m_quad.zw);
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::ZZYY>(hkVector4dParameter v)
{
    const __m128d yy = _mm_unpackhi_pd(v.m_quad.xy,v.m_quad.xy);
    m_quad.xy = _mm_movedup_pd(v.m_quad.zw);
    m_quad.zw = yy;
}
#else
template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::XXZZ>(hkVector4dParameter v)
{
    m_quad.xy = _mm_unpacklo_pd(v.m_quad.xy,v.m_quad.xy);
    m_quad.zw = _mm_unpacklo_pd(v.m_quad.zw,v.m_quad.zw);
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::XXYY>(hkVector4dParameter v)
{
    m_quad.zw = _mm_unpackhi_pd(v.m_quad.xy,v.m_quad.xy);
    m_quad.xy = _mm_unpacklo_pd(v.m_quad.xy,v.m_quad.xy);
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::XXXX>(hkVector4dParameter v)
{
    m_quad.zw = _mm_unpacklo_pd(v.m_quad.xy,v.m_quad.xy);
    m_quad.xy = m_quad.zw;
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::ZZZZ>(hkVector4dParameter v)
{
    m_quad.xy = _mm_unpacklo_pd(v.m_quad.zw,v.m_quad.zw);
    m_quad.zw = m_quad.xy;
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::ZYZZ>(hkVector4dParameter v)
{
    m_quad.xy = MOVE_SD( v.m_quad.xy, v.m_quad.zw );
    m_quad.zw = _mm_unpacklo_pd(v.m_quad.zw,v.m_quad.zw);
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::ZYZW>(hkVector4dParameter v)
{
    m_quad.xy = MOVE_SD( v.m_quad.xy, v.m_quad.zw );
    m_quad.zw = v.m_quad.zw;
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::ZZYY>(hkVector4dParameter v)
{
    const __m128d yy = _mm_unpackhi_pd(v.m_quad.xy,v.m_quad.xy);
    m_quad.xy = _mm_unpacklo_pd(v.m_quad.zw,v.m_quad.zw);
    m_quad.zw = yy;
}
#endif

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::YYYY>(hkVector4dParameter v)
{
    m_quad.zw = _mm_unpackhi_pd(v.m_quad.xy,v.m_quad.xy);
    m_quad.xy = m_quad.zw;
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::YYWW>(hkVector4dParameter v)
{
    m_quad.xy = _mm_unpackhi_pd(v.m_quad.xy,v.m_quad.xy);
    m_quad.zw = _mm_unpackhi_pd(v.m_quad.zw,v.m_quad.zw);
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::YYXX>(hkVector4dParameter v)
{
    m_quad.zw = _mm_unpacklo_pd(v.m_quad.xy,v.m_quad.xy);
    m_quad.xy = _mm_unpackhi_pd(v.m_quad.xy,v.m_quad.xy);
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::XZYW>(hkVector4dParameter v)
{
    const __m128d xy = v.m_quad.xy;
    m_quad.xy = _mm_unpacklo_pd(xy,v.m_quad.zw);
    m_quad.zw = _mm_unpackhi_pd(xy,v.m_quad.zw);
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::YXXY>(hkVector4dParameter v)
{
    m_quad.zw = v.m_quad.xy;
    m_quad.xy = _mm_shuffle_pd( v.m_quad.xy, v.m_quad.xy,_MM_SHUFFLE2(0,1));
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::YXZW>(hkVector4dParameter v)
{
    m_quad.xy = _mm_shuffle_pd( v.m_quad.xy, v.m_quad.xy,_MM_SHUFFLE2(0,1));
    m_quad.zw = v.m_quad.zw;
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::YXWW>(hkVector4dParameter v)
{
    m_quad.xy = _mm_shuffle_pd( v.m_quad.xy, v.m_quad.xy,_MM_SHUFFLE2(0,1));
    m_quad.zw = _mm_unpackhi_pd(v.m_quad.zw,v.m_quad.zw);
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::YZXW>(hkVector4dParameter v)
{
    const __m128d xy = v.m_quad.xy;
    m_quad.xy = _mm_shuffle_pd( xy, v.m_quad.zw, _MM_SHUFFLE2(0,1));
    m_quad.zw = MOVE_SD( v.m_quad.zw, xy );
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::YZZW>(hkVector4dParameter v)
{
    const __m128d xy = v.m_quad.xy;
    m_quad.xy = _mm_shuffle_pd( xy, v.m_quad.zw, _MM_SHUFFLE2(0,1));
    m_quad.zw = v.m_quad.zw;
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::YZZY>(hkVector4dParameter v)
{
    const __m128d xy = v.m_quad.xy;
    m_quad.xy = _mm_shuffle_pd( xy, v.m_quad.zw, _MM_SHUFFLE2(0,1));
    m_quad.zw = MOVE_SD(xy, v.m_quad.zw);
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::YZXZ>(hkVector4dParameter v)
{
    const __m128d xy = v.m_quad.xy;
    m_quad.xy = _mm_shuffle_pd( xy, v.m_quad.zw, _MM_SHUFFLE2(0,1));
    m_quad.zw = _mm_unpacklo_pd( xy, v.m_quad.zw );
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::YZWX>(hkVector4dParameter v)
{
    const __m128d xy = v.m_quad.xy;
    m_quad.xy = _mm_shuffle_pd( xy, v.m_quad.zw, _MM_SHUFFLE2(0,1));
    m_quad.zw = _mm_shuffle_pd( v.m_quad.zw, xy, _MM_SHUFFLE2(0,1));
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::YWZX>(hkVector4dParameter v)
{
    const __m128d xy = v.m_quad.xy;
    m_quad.xy = _mm_unpackhi_pd( xy, v.m_quad.zw );
    m_quad.zw = _mm_unpacklo_pd( v.m_quad.zw, xy );
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::XYWZ>(hkVector4dParameter v)
{
    m_quad.xy = v.m_quad.xy;
    m_quad.zw = _mm_shuffle_pd( v.m_quad.zw, v.m_quad.zw,_MM_SHUFFLE2(0,1));
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::XYZZ>(hkVector4dParameter v)
{
    m_quad.xy = v.m_quad.xy;
    m_quad.zw = _mm_shuffle_pd( v.m_quad.zw, v.m_quad.zw,_MM_SHUFFLE2(0,0));
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::XYXY>(hkVector4dParameter v)
{
    m_quad.zw = v.m_quad.xy;
    m_quad.xy = v.m_quad.xy;
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::ZWZW>(hkVector4dParameter v)
{
    m_quad.xy = v.m_quad.zw;
    m_quad.zw = v.m_quad.zw;
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::ZWXY>(hkVector4dParameter v)
{
    const __m128d xy = v.m_quad.xy;
    m_quad.xy = v.m_quad.zw;
    m_quad.zw = xy;
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::ZWXW>(hkVector4dParameter v)
{
    const __m128d xy = v.m_quad.xy;
    m_quad.xy = v.m_quad.zw;
    m_quad.zw = MOVE_SD( v.m_quad.zw, xy );
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::WXXW>(hkVector4dParameter v)
{
    const __m128d xy = v.m_quad.xy;
    m_quad.xy = _mm_shuffle_pd( v.m_quad.zw, xy, _MM_SHUFFLE2(0,1));
    m_quad.zw = MOVE_SD( v.m_quad.zw, xy );
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::ZXYW>(hkVector4dParameter v)
{
    const __m128d xy = v.m_quad.xy;
    m_quad.xy = _mm_unpacklo_pd(v.m_quad.zw, xy);
    m_quad.zw = _mm_unpackhi_pd(xy, v.m_quad.zw);
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::ZXYZ>(hkVector4dParameter v)
{
    const __m128d xy = v.m_quad.xy;
    m_quad.xy = _mm_unpacklo_pd(v.m_quad.zw, xy);
    m_quad.zw = _mm_shuffle_pd(xy, v.m_quad.zw, _MM_SHUFFLE2(0,1));
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::WXYZ>(hkVector4dParameter v)
{
    const __m128d xy = v.m_quad.xy;
    m_quad.xy = _mm_shuffle_pd( v.m_quad.zw, xy, _MM_SHUFFLE2(0,1));
    m_quad.zw = _mm_shuffle_pd( xy, v.m_quad.zw, _MM_SHUFFLE2(0,1));
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::WZYX>(hkVector4dParameter v)
{
    const __m128d xy = v.m_quad.xy;
    m_quad.xy = _mm_shuffle_pd( v.m_quad.zw, v.m_quad.zw, _MM_SHUFFLE2(0,1));
    m_quad.zw = _mm_shuffle_pd(          xy,          xy, _MM_SHUFFLE2(0,1));
}

template <>
HK_INLINE void hkVector4d::setPermutation<hkVectorPermutation::XYZW>(hkVector4dParameter v)
{
    m_quad = v.m_quad;
}

template <hkVectorPermutation::Permutation P>
HK_INLINE void hkVector4d::setPermutation(hkVector4dParameter v)
{
    // temp vars for alias safety
    const hkSimdDouble64 X = v.getComponent<(P>>12)&0x3>();
    const hkSimdDouble64 Y = v.getComponent<(P>>8)&0x3>();
    const hkSimdDouble64 Z = v.getComponent<(P>>4)&0x3>();
    const hkSimdDouble64 W = v.getComponent<(P)&0x3>();

    m_quad.xy = MOVE_SD(Y.m_real, X.m_real);
    m_quad.zw = MOVE_SD(W.m_real, Z.m_real);
}


HK_INLINE void hkVector4d::setFlipSign(hkVector4dParameter v, hkVector4dComparisonParameter mask)
{
    const __m128i maskXY = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(mask.m_mask.xy),63),63);
    const __m128i maskZW = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(mask.m_mask.zw),63),63);
    m_quad.xy = _mm_xor_pd(v.m_quad.xy, _mm_castsi128_pd(maskXY));
    m_quad.zw = _mm_xor_pd(v.m_quad.zw, _mm_castsi128_pd(maskZW));
}

HK_INLINE void hkVector4d::setFlipSign(hkVector4dParameter v, hkVector4dParameter vSign)
{
    const __m128i maskXY = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(vSign.m_quad.xy),63),63);
    const __m128i maskZW = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(vSign.m_quad.zw),63),63);
    m_quad.xy = _mm_xor_pd(v.m_quad.xy, _mm_castsi128_pd(maskXY));
    m_quad.zw = _mm_xor_pd(v.m_quad.zw, _mm_castsi128_pd(maskZW));
}

HK_INLINE void hkVector4d::setFlipSign(hkVector4dParameter v, hkSimdDouble64Parameter sSign)
{
    const __m128i mask = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(sSign.m_real),63),63);
    m_quad.xy = _mm_xor_pd(v.m_quad.xy, _mm_castsi128_pd(mask));
    m_quad.zw = _mm_xor_pd(v.m_quad.zw, _mm_castsi128_pd(mask));
}



//
// advanced interface
//

namespace hkVector4_AdvancedInterface
{
    
    HK_INLINE __m128d m128dSelect( const __m128d& mask, const __m128d& trueValue, const __m128d& falseValue )
    {
#if HK_SSE_VERSION >= 0x41
        return _mm_blendv_pd(falseValue, trueValue, mask);
#else
        return _mm_or_pd( _mm_and_pd(mask, trueValue), _mm_andnot_pd(mask, falseValue) );
#endif
    }

    template<hkMathAccuracyMode ACC>
    HK_INLINE void avoidDivideByZeroInDebug(const __m128d& equalsZeroXY, const __m128d& equalsZeroZW, hkQuadDouble64& quad)
    {
#if defined(HK_ALLOW_FPU_EXCEPTION_CHECKING) 
        if(ACC == HK_ACC_FULL)
        {
            quad.xy = m128dSelect(equalsZeroXY, g_vectordConstants[HK_QUADREAL_1].xy, quad.xy);
            quad.zw = m128dSelect(equalsZeroZW, g_vectordConstants[HK_QUADREAL_1].xy, quad.zw);
        }
#endif
    }

template <hkMathAccuracyMode A>
struct unrolld_setReciprocalFunc { HK_INLINE static void apply(hkQuadFloat32& self, const hkQuadFloat32& a)
{
    switch (A)
    {
        case HK_ACC_23_BIT: self = hkMath::quadReciprocal(a); break;
        case HK_ACC_12_BIT: self = _mm_rcp_ps(a); break;
        default:            self = _mm_div_ps(g_vectorfConstants[HK_QUADREAL_1],a); break; // HK_ACC_FULL
    }
} };

template <hkMathAccuracyMode A, hkMathDivByZeroMode D>
struct unrolld_setReciprocal { HK_INLINE static void apply(hkQuadDouble64& self, hkVector4dParameter a)
{
    HK_VECTOR4_TEMPLATE_CONFIG_NOT_IMPLEMENTED;
} };
template <hkMathAccuracyMode A>
struct unrolld_setReciprocal<A, HK_DIV_IGNORE> { HK_INLINE static void apply(hkQuadDouble64& self, hkVector4dParameter a)
{
    switch (A)
    {
        case HK_ACC_23_BIT:
        case HK_ACC_12_BIT:
            {
                const __m128 xy = _mm_cvtpd_ps(a.m_quad.xy);
                const __m128 zw = _mm_cvtpd_ps(a.m_quad.zw);
                const __m128 xyzw = _mm_shuffle_ps(xy,zw,_MM_SHUFFLE(1,0,1,0));
                hkQuadFloat32 e; unrolld_setReciprocalFunc<A>::apply(e, xyzw);
                self.xy = _mm_cvtps_pd(e);
                self.zw = _mm_cvtps_pd(_mm_shuffle_ps(e,e,_MM_SHUFFLE(1,0,3,2)));
            }
            break;
        default:
            {
                self.xy = _mm_div_pd(g_vectordConstants[HK_QUADREAL_1].xy, a.m_quad.xy);
                self.zw = _mm_div_pd(g_vectordConstants[HK_QUADREAL_1].xy, a.m_quad.zw);
            }
            break; // HK_ACC_FULL
    }
} };
template <hkMathAccuracyMode A>
struct unrolld_setReciprocal<A, HK_DIV_SET_ZERO> { HK_INLINE static void apply(hkQuadDouble64& self, hkVector4dParameter a)
{
    switch (A)
    {
        case HK_ACC_23_BIT:
        case HK_ACC_12_BIT:
            {
                const __m128 xy = _mm_cvtpd_ps(a.m_quad.xy);
                const __m128 zw = _mm_cvtpd_ps(a.m_quad.zw);
                const __m128 xyzw = _mm_shuffle_ps(xy,zw,_MM_SHUFFLE(1,0,1,0));
                const __m128 equalsZero = _mm_cmpeq_ps(xyzw, _mm_setzero_ps());
                hkQuadFloat32 e; unrolld_setReciprocalFunc<A>::apply(e, xyzw);
                e = _mm_andnot_ps(equalsZero, e);
                self.xy = _mm_cvtps_pd(e);
                self.zw = _mm_cvtps_pd(_mm_shuffle_ps(e,e,_MM_SHUFFLE(1,0,3,2)));
            }
            break;
        default:
            {
                const __m128d equalsZeroXY = _mm_cmpeq_pd(a.m_quad.xy, _mm_setzero_pd());
                const __m128d equalsZeroZW = _mm_cmpeq_pd(a.m_quad.zw, _mm_setzero_pd());
                hkVector4d aNonZero = a;
                avoidDivideByZeroInDebug<A>( equalsZeroXY, equalsZeroZW, aNonZero.m_quad );
                self.xy = _mm_andnot_pd(equalsZeroXY,_mm_div_pd(g_vectordConstants[HK_QUADREAL_1].xy, aNonZero.m_quad.xy));
                self.zw = _mm_andnot_pd(equalsZeroZW,_mm_div_pd(g_vectordConstants[HK_QUADREAL_1].xy, aNonZero.m_quad.zw));
            }
            break; // HK_ACC_FULL
    }
} };
template <hkMathAccuracyMode A>
struct unrolld_setReciprocal<A, HK_DIV_SET_HIGH> { HK_INLINE static void apply(hkQuadDouble64& self, hkVector4dParameter a)
{
    switch (A)
    {
        case HK_ACC_23_BIT:
        case HK_ACC_12_BIT:
            {
                const __m128 xy = _mm_cvtpd_ps(a.m_quad.xy);
                const __m128 zw = _mm_cvtpd_ps(a.m_quad.zw);
                const __m128 xyzw = _mm_shuffle_ps(xy,zw,_MM_SHUFFLE(1,0,1,0));

                hkQuadFloat32 e; unrolld_setReciprocalFunc<A>::apply(e, xyzw);
                const __m128d selfXY = _mm_cvtps_pd(e);
                const __m128d selfZW = _mm_cvtps_pd(_mm_shuffle_ps(e,e,_MM_SHUFFLE(1,0,3,2)));

                const __m128d huge = _mm_set1_pd(HK_DOUBLE_HIGH);
                const __m128i maskXY = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(a.m_quad.xy),63),63);
                const __m128i maskZW = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(a.m_quad.zw),63),63);
                const __m128d hugeXY = _mm_or_pd(huge, _mm_castsi128_pd(maskXY));
                const __m128d hugeZW = _mm_or_pd(huge, _mm_castsi128_pd(maskZW));

                const __m128i equalsZero = _mm_castps_si128(_mm_cmpeq_ps(xyzw, _mm_setzero_ps()));
                const __m128d equalsZeroXY = _mm_castsi128_pd(_mm_unpacklo_epi32(equalsZero,equalsZero));
                const __m128d equalsZeroZW = _mm_castsi128_pd(_mm_unpackhi_epi32(equalsZero,equalsZero));

#if HK_SSE_VERSION >= 0x41
                self.xy = _mm_blendv_pd(selfXY, hugeXY, equalsZeroXY);
                self.zw = _mm_blendv_pd(selfZW, hugeZW, equalsZeroZW);
#else
                self.xy = _mm_or_pd( _mm_and_pd(equalsZeroXY, hugeXY), _mm_andnot_pd(equalsZeroXY, selfXY) );
                self.zw = _mm_or_pd( _mm_and_pd(equalsZeroZW, hugeZW), _mm_andnot_pd(equalsZeroZW, selfZW) );
#endif
            }
            break;
        default:
            {
                const __m128d equalsZeroXY = _mm_cmpeq_pd(a.m_quad.xy, _mm_setzero_pd());
                const __m128d equalsZeroZW = _mm_cmpeq_pd(a.m_quad.zw, _mm_setzero_pd());
                hkVector4d aNonZero = a;
                avoidDivideByZeroInDebug<A>( equalsZeroXY, equalsZeroZW, aNonZero.m_quad );
                const __m128d selfXY = _mm_div_pd(g_vectordConstants[HK_QUADREAL_1].xy, aNonZero.m_quad.xy);
                const __m128d selfZW = _mm_div_pd(g_vectordConstants[HK_QUADREAL_1].xy, aNonZero.m_quad.zw);

                const __m128d huge = _mm_set1_pd(HK_DOUBLE_HIGH);
                const __m128i maskXY = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(a.m_quad.xy),63),63);
                const __m128i maskZW = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(a.m_quad.zw),63),63);
                const __m128d hugeXY = _mm_or_pd(huge, _mm_castsi128_pd(maskXY));
                const __m128d hugeZW = _mm_or_pd(huge, _mm_castsi128_pd(maskZW));

#if HK_SSE_VERSION >= 0x41
                self.xy = _mm_blendv_pd(selfXY, hugeXY, equalsZeroXY);
                self.zw = _mm_blendv_pd(selfZW, hugeZW, equalsZeroZW);
#else
                self.xy = _mm_or_pd( _mm_and_pd(equalsZeroXY, hugeXY), _mm_andnot_pd(equalsZeroXY, selfXY) );
                self.zw = _mm_or_pd( _mm_and_pd(equalsZeroZW, hugeZW), _mm_andnot_pd(equalsZeroZW, selfZW) );
#endif
            }
            break; // HK_ACC_FULL
    }
} };
template <hkMathAccuracyMode A>
struct unrolld_setReciprocal<A, HK_DIV_SET_MAX> { HK_INLINE static void apply(hkQuadDouble64& self, hkVector4dParameter a)
{
    switch (A)
    {
        case HK_ACC_23_BIT:
        case HK_ACC_12_BIT:
            {
                const __m128 xy = _mm_cvtpd_ps(a.m_quad.xy);
                const __m128 zw = _mm_cvtpd_ps(a.m_quad.zw);
                const __m128 xyzw = _mm_shuffle_ps(xy,zw,_MM_SHUFFLE(1,0,1,0));

                hkQuadFloat32 e; unrolld_setReciprocalFunc<A>::apply(e, xyzw);
                const __m128d selfXY = _mm_cvtps_pd(e);
                const __m128d selfZW = _mm_cvtps_pd(_mm_shuffle_ps(e,e,_MM_SHUFFLE(1,0,3,2)));

                const __m128d huge = _mm_set1_pd(HK_DOUBLE_MAX);
                const __m128i maskXY = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(a.m_quad.xy),63),63);
                const __m128i maskZW = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(a.m_quad.zw),63),63);
                const __m128d hugeXY = _mm_or_pd(huge, _mm_castsi128_pd(maskXY));
                const __m128d hugeZW = _mm_or_pd(huge, _mm_castsi128_pd(maskZW));

                const __m128i equalsZero = _mm_castps_si128(_mm_cmpeq_ps(xyzw, _mm_setzero_ps()));
                const __m128d equalsZeroXY = _mm_castsi128_pd(_mm_unpacklo_epi32(equalsZero,equalsZero));
                const __m128d equalsZeroZW = _mm_castsi128_pd(_mm_unpackhi_epi32(equalsZero,equalsZero));

    #if HK_SSE_VERSION >= 0x41
                self.xy = _mm_blendv_pd(selfXY, hugeXY, equalsZeroXY);
                self.zw = _mm_blendv_pd(selfZW, hugeZW, equalsZeroZW);
    #else
                self.xy = _mm_or_pd( _mm_and_pd(equalsZeroXY, hugeXY), _mm_andnot_pd(equalsZeroXY, selfXY) );
                self.zw = _mm_or_pd( _mm_and_pd(equalsZeroZW, hugeZW), _mm_andnot_pd(equalsZeroZW, selfZW) );
    #endif
            }
            break;
        default:
            {
                const __m128d equalsZeroXY = _mm_cmpeq_pd(a.m_quad.xy, _mm_setzero_pd());
                const __m128d equalsZeroZW = _mm_cmpeq_pd(a.m_quad.zw, _mm_setzero_pd());
                hkVector4d aNonZero = a;
                avoidDivideByZeroInDebug<A>( equalsZeroXY, equalsZeroZW, aNonZero.m_quad );
                const __m128d selfXY = _mm_div_pd(g_vectordConstants[HK_QUADREAL_1].xy, aNonZero.m_quad.xy);
                const __m128d selfZW = _mm_div_pd(g_vectordConstants[HK_QUADREAL_1].xy, aNonZero.m_quad.zw);

                const __m128d huge = _mm_set1_pd(HK_DOUBLE_MAX);
                const __m128i maskXY = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(a.m_quad.xy),63),63);
                const __m128i maskZW = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(a.m_quad.zw),63),63);
                const __m128d hugeXY = _mm_or_pd(huge, _mm_castsi128_pd(maskXY));
                const __m128d hugeZW = _mm_or_pd(huge, _mm_castsi128_pd(maskZW));

    #if HK_SSE_VERSION >= 0x41
                self.xy = _mm_blendv_pd(selfXY, hugeXY, equalsZeroXY);
                self.zw = _mm_blendv_pd(selfZW, hugeZW, equalsZeroZW);
    #else
                self.xy = _mm_or_pd( _mm_and_pd(equalsZeroXY, hugeXY), _mm_andnot_pd(equalsZeroXY, selfXY) );
                self.zw = _mm_or_pd( _mm_and_pd(equalsZeroZW, hugeZW), _mm_andnot_pd(equalsZeroZW, selfZW) );
    #endif
            }
            break; // HK_ACC_FULL
    }
} };
template <hkMathAccuracyMode A>
struct unrolld_setReciprocal<A, HK_DIV_SET_ZERO_AND_ONE> { HK_INLINE static void apply(hkQuadDouble64& self, hkVector4dParameter a)
{
    unrolld_setReciprocal<A, HK_DIV_SET_ZERO>::apply(self, a);
    const __m128d one = g_vectordConstants[HK_QUADREAL_1].xy;
    const __m128d eps = g_vectordConstants[HK_QUADREAL_EPS].xy;
    const __m128d absValXY = hkMath::twoFabs(_mm_sub_pd(self.xy, one));
    const __m128d absValZW = hkMath::twoFabs(_mm_sub_pd(self.zw, one));
    const __m128d lessEqualEpsXY = _mm_cmple_pd(absValXY, eps);
    const __m128d lessEqualEpsZW = _mm_cmple_pd(absValZW, eps);
#if HK_SSE_VERSION >= 0x41
    self.xy = _mm_blendv_pd(self.xy, one, lessEqualEpsXY);
    self.zw = _mm_blendv_pd(self.zw, one, lessEqualEpsZW);
#else
    self.xy = _mm_or_pd( _mm_and_pd(lessEqualEpsXY, one), _mm_andnot_pd(lessEqualEpsXY, self.xy) );
    self.zw = _mm_or_pd( _mm_and_pd(lessEqualEpsZW, one), _mm_andnot_pd(lessEqualEpsZW, self.zw) );
#endif
} };

} // namespace

template <hkMathAccuracyMode A, hkMathDivByZeroMode D>
HK_INLINE void hkVector4d::setReciprocal(hkVector4dParameter a)
{
    hkVector4_AdvancedInterface::unrolld_setReciprocal<A,D>::apply(m_quad,a);
}

HK_INLINE void hkVector4d::setReciprocal(hkVector4dParameter a)
{
    hkVector4_AdvancedInterface::unrolld_setReciprocal<HK_ACC_MID,HK_DIV_IGNORE>::apply(m_quad,a);
}



namespace hkVector4_AdvancedInterface
{

template <hkMathAccuracyMode A, hkMathDivByZeroMode D>
struct unrolld_setDiv { HK_INLINE static void apply(hkQuadDouble64& self, hkVector4dParameter a, hkVector4dParameter b)
{
    HK_VECTOR4_TEMPLATE_CONFIG_NOT_IMPLEMENTED;
} };
template <hkMathAccuracyMode A>
struct unrolld_setDiv<A, HK_DIV_IGNORE> { HK_INLINE static void apply(hkQuadDouble64& self, hkVector4dParameter a, hkVector4dParameter b)
{
    switch (A)
    {
        case HK_ACC_23_BIT:
        case HK_ACC_12_BIT:
            {
                hkQuadDouble64 invB; unrolld_setReciprocal<A, HK_DIV_IGNORE>::apply(invB, b);
                self.xy = _mm_mul_pd(a.m_quad.xy,invB.xy);
                self.zw = _mm_mul_pd(a.m_quad.zw,invB.zw);
            }
            break;
        default:
            {
                self.xy = _mm_div_pd(a.m_quad.xy, b.m_quad.xy);
                self.zw = _mm_div_pd(a.m_quad.zw, b.m_quad.zw);
            }
            break; // HK_ACC_FULL
    }
} };
template <hkMathAccuracyMode A>
struct unrolld_setDiv<A, HK_DIV_SET_ZERO> { HK_INLINE static void apply(hkQuadDouble64& self, hkVector4dParameter a, hkVector4dParameter b)
{
    switch (A)
    {
    case HK_ACC_23_BIT:
    case HK_ACC_12_BIT:
        {
            hkQuadDouble64 invB; unrolld_setReciprocal<A, HK_DIV_SET_ZERO>::apply(invB, b);
            self.xy = _mm_mul_pd(a.m_quad.xy,invB.xy);
            self.zw = _mm_mul_pd(a.m_quad.zw,invB.zw);
        }
        break;
    default:
        {
            const __m128d equalsZeroXY = _mm_cmpeq_pd(b.m_quad.xy, _mm_setzero_pd());
            const __m128d equalsZeroZW = _mm_cmpeq_pd(b.m_quad.zw, _mm_setzero_pd());
            hkVector4d bNonZero = b;
            avoidDivideByZeroInDebug<A>( equalsZeroXY, equalsZeroZW, bNonZero.m_quad );
            self.xy = _mm_andnot_pd(equalsZeroXY,_mm_div_pd(a.m_quad.xy, bNonZero.m_quad.xy));
            self.zw = _mm_andnot_pd(equalsZeroZW,_mm_div_pd(a.m_quad.zw, bNonZero.m_quad.zw));
        }
        break; // HK_ACC_FULL
    }
} };
template <hkMathAccuracyMode A>
struct unrolld_setDiv<A, HK_DIV_SET_HIGH> { HK_INLINE static void apply(hkQuadDouble64& self, hkVector4dParameter a, hkVector4dParameter b)
{
    switch (A)
    {
    case HK_ACC_23_BIT:
    case HK_ACC_12_BIT:
        {
            const __m128 xy = _mm_cvtpd_ps(b.m_quad.xy);
            const __m128 zw = _mm_cvtpd_ps(b.m_quad.zw);
            const __m128 xyzw = _mm_shuffle_ps(xy,zw,_MM_SHUFFLE(1,0,1,0));

            hkQuadFloat32 e; unrolld_setReciprocalFunc<A>::apply(e, xyzw);
            const __m128d selfXY = _mm_cvtps_pd(e);
            const __m128d selfZW = _mm_cvtps_pd(_mm_shuffle_ps(e,e,_MM_SHUFFLE(1,0,3,2)));

            const __m128d huge = _mm_set1_pd(HK_DOUBLE_HIGH);
            const __m128i maskXY = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(a.m_quad.xy),63),63);
            const __m128i maskZW = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(a.m_quad.zw),63),63);
            const __m128d hugeXY = _mm_or_pd(huge, _mm_castsi128_pd(maskXY));
            const __m128d hugeZW = _mm_or_pd(huge, _mm_castsi128_pd(maskZW));

            const __m128i equalsZero = _mm_castps_si128(_mm_cmpeq_ps(xyzw, _mm_setzero_ps()));
            const __m128d equalsZeroXY = _mm_castsi128_pd(_mm_unpacklo_epi32(equalsZero,equalsZero));
            const __m128d equalsZeroZW = _mm_castsi128_pd(_mm_unpackhi_epi32(equalsZero,equalsZero));

#if HK_SSE_VERSION >= 0x41
            self.xy = _mm_blendv_pd(_mm_mul_pd(a.m_quad.xy,selfXY), hugeXY, equalsZeroXY);
            self.zw = _mm_blendv_pd(_mm_mul_pd(a.m_quad.zw,selfZW), hugeZW, equalsZeroZW);
#else
            self.xy = _mm_or_pd( _mm_and_pd(equalsZeroXY, hugeXY), _mm_andnot_pd(equalsZeroXY, _mm_mul_pd(a.m_quad.xy,selfXY)) );
            self.zw = _mm_or_pd( _mm_and_pd(equalsZeroZW, hugeZW), _mm_andnot_pd(equalsZeroZW, _mm_mul_pd(a.m_quad.zw,selfZW)) );
#endif
        }
        break;
    default:
        {
            const __m128d equalsZeroXY = _mm_cmpeq_pd(b.m_quad.xy, _mm_setzero_pd());
            const __m128d equalsZeroZW = _mm_cmpeq_pd(b.m_quad.zw, _mm_setzero_pd());
            hkVector4d bNonZero = b;
            avoidDivideByZeroInDebug<A>( equalsZeroXY, equalsZeroZW, bNonZero.m_quad );
            const __m128d selfXY = _mm_div_pd(a.m_quad.xy, bNonZero.m_quad.xy);
            const __m128d selfZW = _mm_div_pd(a.m_quad.zw, bNonZero.m_quad.zw);

            const __m128d huge = _mm_set1_pd(HK_DOUBLE_HIGH);

#if HK_SSE_VERSION >= 0x41
            self.xy = _mm_blendv_pd(selfXY, huge, equalsZeroXY);
            self.zw = _mm_blendv_pd(selfZW, huge, equalsZeroZW);
#else
            self.xy = _mm_or_pd( _mm_and_pd(equalsZeroXY, huge), _mm_andnot_pd(equalsZeroXY, selfXY) );
            self.zw = _mm_or_pd( _mm_and_pd(equalsZeroZW, huge), _mm_andnot_pd(equalsZeroZW, selfZW) );
#endif
        }
        break; // HK_ACC_FULL
    }
} };
template <hkMathAccuracyMode A>
struct unrolld_setDiv<A, HK_DIV_SET_MAX> { HK_INLINE static void apply(hkQuadDouble64& self, hkVector4dParameter a, hkVector4dParameter b)
{
    switch (A)
    {
    case HK_ACC_23_BIT:
    case HK_ACC_12_BIT:
        {
            const __m128 xy = _mm_cvtpd_ps(b.m_quad.xy);
            const __m128 zw = _mm_cvtpd_ps(b.m_quad.zw);
            const __m128 xyzw = _mm_shuffle_ps(xy,zw,_MM_SHUFFLE(1,0,1,0));

            hkQuadFloat32 e; unrolld_setReciprocalFunc<A>::apply(e, xyzw);
            const __m128d selfXY = _mm_cvtps_pd(e);
            const __m128d selfZW = _mm_cvtps_pd(_mm_shuffle_ps(e,e,_MM_SHUFFLE(1,0,3,2)));

            const __m128d huge = _mm_set1_pd(HK_DOUBLE_MAX);
            const __m128i maskXY = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(a.m_quad.xy),63),63);
            const __m128i maskZW = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(a.m_quad.zw),63),63);
            const __m128d hugeXY = _mm_or_pd(huge, _mm_castsi128_pd(maskXY));
            const __m128d hugeZW = _mm_or_pd(huge, _mm_castsi128_pd(maskZW));

            const __m128i equalsZero = _mm_castps_si128(_mm_cmpeq_ps(xyzw, _mm_setzero_ps()));
            const __m128d equalsZeroXY = _mm_castsi128_pd(_mm_unpacklo_epi32(equalsZero,equalsZero));
            const __m128d equalsZeroZW = _mm_castsi128_pd(_mm_unpackhi_epi32(equalsZero,equalsZero));

#if HK_SSE_VERSION >= 0x41
            self.xy = _mm_blendv_pd(_mm_mul_pd(a.m_quad.xy,selfXY), hugeXY, equalsZeroXY);
            self.zw = _mm_blendv_pd(_mm_mul_pd(a.m_quad.zw,selfZW), hugeZW, equalsZeroZW);
#else
            self.xy = _mm_or_pd( _mm_and_pd(equalsZeroXY, hugeXY), _mm_andnot_pd(equalsZeroXY, _mm_mul_pd(a.m_quad.xy,selfXY)) );
            self.zw = _mm_or_pd( _mm_and_pd(equalsZeroZW, hugeZW), _mm_andnot_pd(equalsZeroZW, _mm_mul_pd(a.m_quad.zw,selfZW)) );
#endif
        }
        break;
    default:
        {
            const __m128d equalsZeroXY = _mm_cmpeq_pd(b.m_quad.xy, _mm_setzero_pd());
            const __m128d equalsZeroZW = _mm_cmpeq_pd(b.m_quad.zw, _mm_setzero_pd());
            hkVector4d bNonZero = b;
            avoidDivideByZeroInDebug<A>( equalsZeroXY, equalsZeroZW, bNonZero.m_quad );
            const __m128d selfXY = _mm_div_pd(a.m_quad.xy, bNonZero.m_quad.xy);
            const __m128d selfZW = _mm_div_pd(a.m_quad.zw, bNonZero.m_quad.zw);

            const __m128d huge = _mm_set1_pd(HK_DOUBLE_MAX);
            const __m128i maskXY = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(a.m_quad.xy),63),63);
            const __m128i maskZW = _mm_slli_epi64(_mm_srli_epi64(_mm_castpd_si128(a.m_quad.zw),63),63);
            const __m128d hugeXY = _mm_or_pd(huge, _mm_castsi128_pd(maskXY));
            const __m128d hugeZW = _mm_or_pd(huge, _mm_castsi128_pd(maskZW));

#if HK_SSE_VERSION >= 0x41
            self.xy = _mm_blendv_pd(selfXY, hugeXY, equalsZeroXY);
            self.zw = _mm_blendv_pd(selfZW, hugeZW, equalsZeroZW);
#else
            self.xy = _mm_or_pd( _mm_and_pd(equalsZeroXY, hugeXY), _mm_andnot_pd(equalsZeroXY, selfXY) );
            self.zw = _mm_or_pd( _mm_and_pd(equalsZeroZW, hugeZW), _mm_andnot_pd(equalsZeroZW, selfZW) );
#endif
        }
        break; // HK_ACC_FULL
    }
} };
template <hkMathAccuracyMode A>
struct unrolld_setDiv<A, HK_DIV_SET_ZERO_AND_ONE> { HK_INLINE static void apply(hkQuadDouble64& self, hkVector4dParameter a, hkVector4dParameter b)
{
    unrolld_setDiv<A, HK_DIV_SET_ZERO>::apply(self, a, b);
    const __m128d one = g_vectordConstants[HK_QUADREAL_1].xy;
    const __m128d eps = g_vectordConstants[HK_QUADREAL_EPS].xy;
    const __m128d absValXY = hkMath::twoFabs(_mm_sub_pd(self.xy, one));
    const __m128d absValZW = hkMath::twoFabs(_mm_sub_pd(self.zw, one));
    const __m128d lessEqualEpsXY = _mm_cmple_pd(absValXY, eps);
    const __m128d lessEqualEpsZW = _mm_cmple_pd(absValZW, eps);
#if HK_SSE_VERSION >= 0x41
    self.xy = _mm_blendv_pd(self.xy, one, lessEqualEpsXY);
    self.zw = _mm_blendv_pd(self.zw, one, lessEqualEpsZW);
#else
    self.xy = _mm_or_pd( _mm_and_pd(lessEqualEpsXY, one), _mm_andnot_pd(lessEqualEpsXY, self.xy) );
    self.zw = _mm_or_pd( _mm_and_pd(lessEqualEpsZW, one), _mm_andnot_pd(lessEqualEpsZW, self.zw) );
#endif
} };

} // namespace

template <hkMathAccuracyMode A, hkMathDivByZeroMode D>
HK_INLINE void hkVector4d::setDiv(hkVector4dParameter v0, hkVector4dParameter v1)
{
    hkVector4_AdvancedInterface::unrolld_setDiv<A,D>::apply(m_quad,v0,v1);
}

HK_INLINE void hkVector4d::setDiv(hkVector4dParameter v0, hkVector4dParameter v1)
{
    hkVector4_AdvancedInterface::unrolld_setDiv<HK_ACC_MID,HK_DIV_IGNORE>::apply(m_quad,v0,v1);
}

template <hkMathAccuracyMode A, hkMathDivByZeroMode D>
HK_INLINE void hkVector4d::div(hkVector4dParameter a)
{
    setDiv<A,D>( *this, a );
}


namespace hkVector4_AdvancedInterface
{

template <hkMathAccuracyMode A>
struct unrolld_setSqrtInverseFunc { HK_INLINE static void apply(hkQuadFloat32& self, const hkQuadFloat32& a)
{
    switch (A)
    {
        case HK_ACC_23_BIT: self = hkMath::quadReciprocalSquareRoot(a); break;
        case HK_ACC_12_BIT: self = _mm_rsqrt_ps(a); break;
        default:            self = _mm_div_ps(g_vectorfConstants[HK_QUADREAL_1],_mm_sqrt_ps(a)); break; // HK_ACC_FULL
    }
} };

template <hkMathAccuracyMode A, hkMathNegSqrtMode S>
struct unrolld_setSqrt { HK_INLINE static void apply(hkQuadDouble64& self, hkVector4dParameter a)
{
    HK_VECTOR4_TEMPLATE_CONFIG_NOT_IMPLEMENTED;
} };
template <hkMathAccuracyMode A>
struct unrolld_setSqrt<A, HK_SQRT_IGNORE> { HK_INLINE static void apply(hkQuadDouble64& self, hkVector4dParameter a)
{
    switch (A)
    {
        case HK_ACC_23_BIT:
        case HK_ACC_12_BIT:
            {
                const __m128 xy = _mm_cvtpd_ps(a.m_quad.xy);
                const __m128 zw = _mm_cvtpd_ps(a.m_quad.zw);
                const __m128 xyzw = _mm_shuffle_ps(xy,zw,_MM_SHUFFLE(1,0,1,0));
                hkQuadFloat32 re; unrolld_setSqrtInverseFunc<A>::apply(re, xyzw);
                self.xy = _mm_mul_pd(a.m_quad.xy,_mm_cvtps_pd(re));
                self.zw = _mm_mul_pd(a.m_quad.zw,_mm_cvtps_pd(_mm_shuffle_ps(re,re,_MM_SHUFFLE(1,0,3,2))));
            }
            break;
        default:
            {
                self.xy = _mm_sqrt_pd(a.m_quad.xy);
                self.zw = _mm_sqrt_pd(a.m_quad.zw);
            }
            break; // HK_ACC_FULL
    }
} };
template <hkMathAccuracyMode A>
struct unrolld_setSqrt<A, HK_SQRT_SET_ZERO> { HK_INLINE static void apply(hkQuadDouble64& self, hkVector4dParameter a)
{
    switch (A)
    {
    case HK_ACC_23_BIT:
    case HK_ACC_12_BIT:
        {
            const __m128 xy = _mm_cvtpd_ps(a.m_quad.xy);
            const __m128 zw = _mm_cvtpd_ps(a.m_quad.zw);
            const __m128 xyzw = _mm_shuffle_ps(xy,zw,_MM_SHUFFLE(1,0,1,0));
            const __m128 equalsZero = _mm_cmple_ps(xyzw, _mm_setzero_ps());
            hkQuadFloat32 re; unrolld_setSqrtInverseFunc<A>::apply(re, xyzw);
            re = _mm_andnot_ps(equalsZero, re);
            self.xy = _mm_mul_pd(a.m_quad.xy,_mm_cvtps_pd(re));
            self.zw = _mm_mul_pd(a.m_quad.zw,_mm_cvtps_pd(_mm_shuffle_ps(re,re,_MM_SHUFFLE(1,0,3,2))));
        }
        break;
    default:
        {
            const __m128d equalsZeroXY = _mm_cmple_pd(a.m_quad.xy, _mm_setzero_pd());
            const __m128d equalsZeroZW = _mm_cmple_pd(a.m_quad.zw, _mm_setzero_pd());
            self.xy = _mm_andnot_pd(equalsZeroXY,_mm_sqrt_pd(a.m_quad.xy));
            self.zw = _mm_andnot_pd(equalsZeroZW,_mm_sqrt_pd(a.m_quad.zw));
        }
        break; // HK_ACC_FULL
    }
} };

} // namespace

template <hkMathAccuracyMode A, hkMathNegSqrtMode S>
HK_INLINE void hkVector4d::setSqrt(hkVector4dParameter a)
{
    hkVector4_AdvancedInterface::unrolld_setSqrt<A,S>::apply(m_quad, a);
}

HK_INLINE void hkVector4d::setSqrt(hkVector4dParameter a)
{
    hkVector4_AdvancedInterface::unrolld_setSqrt<HK_ACC_MID,HK_SQRT_SET_ZERO>::apply(m_quad, a);
}



namespace hkVector4_AdvancedInterface
{

template <hkMathAccuracyMode A, hkMathNegSqrtMode S>
struct unrolld_setSqrtInverse { HK_INLINE static void apply(hkQuadDouble64& self, hkVector4dParameter a)
{
    HK_VECTOR4_TEMPLATE_CONFIG_NOT_IMPLEMENTED;
} };
template <hkMathAccuracyMode A>
struct unrolld_setSqrtInverse<A, HK_SQRT_IGNORE> { HK_INLINE static void apply(hkQuadDouble64& self, hkVector4dParameter a)
{
    switch (A)
    {
    case HK_ACC_23_BIT:
    case HK_ACC_12_BIT:
        {
            const __m128 xy = _mm_cvtpd_ps(a.m_quad.xy);
            const __m128 zw = _mm_cvtpd_ps(a.m_quad.zw);
            const __m128 xyzw = _mm_shuffle_ps(xy,zw,_MM_SHUFFLE(1,0,1,0));
            hkQuadFloat32 re; unrolld_setSqrtInverseFunc<A>::apply(re, xyzw);
            self.xy = _mm_cvtps_pd(re);
            self.zw = _mm_cvtps_pd(_mm_shuffle_ps(re,re,_MM_SHUFFLE(1,0,3,2)));
        }
        break;
    default:
        {
            self.xy = _mm_div_pd(g_vectordConstants[HK_QUADREAL_1].xy, _mm_sqrt_pd(a.m_quad.xy));
            self.zw = _mm_div_pd(g_vectordConstants[HK_QUADREAL_1].xy, _mm_sqrt_pd(a.m_quad.zw));
        }
        break; // HK_ACC_FULL
    }
} };
template <hkMathAccuracyMode A>
struct unrolld_setSqrtInverse<A, HK_SQRT_SET_ZERO> { HK_INLINE static void apply(hkQuadDouble64& self, hkVector4dParameter a)
{
    switch (A)
    {
    case HK_ACC_23_BIT:
    case HK_ACC_12_BIT:
        {
            const __m128 xy = _mm_cvtpd_ps(a.m_quad.xy);
            const __m128 zw = _mm_cvtpd_ps(a.m_quad.zw);
            const __m128 xyzw = _mm_shuffle_ps(xy,zw,_MM_SHUFFLE(1,0,1,0));
            const __m128 equalsZero = _mm_cmple_ps(xyzw, _mm_setzero_ps());
            hkQuadFloat32 re; unrolld_setSqrtInverseFunc<A>::apply(re, xyzw);
            re = _mm_andnot_ps(equalsZero, re);
            self.xy = _mm_cvtps_pd(re);
            self.zw = _mm_cvtps_pd(_mm_shuffle_ps(re,re,_MM_SHUFFLE(1,0,3,2)));
        }
        break;
    default:
        {
            const __m128d equalsZeroXY = _mm_cmple_pd(a.m_quad.xy, _mm_setzero_pd());
            const __m128d equalsZeroZW = _mm_cmple_pd(a.m_quad.zw, _mm_setzero_pd());
            hkVector4d aNonZero = a;
            avoidDivideByZeroInDebug<A>( equalsZeroXY, equalsZeroZW, aNonZero.m_quad );
            self.xy = _mm_andnot_pd(equalsZeroXY,_mm_div_pd(g_vectordConstants[HK_QUADREAL_1].xy, _mm_sqrt_pd(aNonZero.m_quad.xy)));
            self.zw = _mm_andnot_pd(equalsZeroZW,_mm_div_pd(g_vectordConstants[HK_QUADREAL_1].xy, _mm_sqrt_pd(aNonZero.m_quad.zw)));
        }
        break; // HK_ACC_FULL
    }
} };
} // namespace

template <hkMathAccuracyMode A, hkMathNegSqrtMode S>
HK_INLINE void hkVector4d::setSqrtInverse(hkVector4dParameter a)
{
    hkVector4_AdvancedInterface::unrolld_setSqrtInverse<A,S>::apply(m_quad,a);
}

HK_INLINE void hkVector4d::setSqrtInverse(hkVector4dParameter a)
{
    hkVector4_AdvancedInterface::unrolld_setSqrtInverse<HK_ACC_MID,HK_SQRT_SET_ZERO>::apply(m_quad,a);
}



namespace hkVector4_AdvancedInterface
{
template <int N, hkMathIoMode A>
struct unrolld_load { HK_INLINE static void apply(hkQuadDouble64& self, _In_reads_(N) const hkFloat32* HK_RESTRICT p)
{
    HK_VECTOR4_TEMPLATE_CONFIG_NOT_IMPLEMENTED;
} };
template <int N, hkMathIoMode A>
struct unrolld_load_D { HK_INLINE static void apply(hkQuadDouble64& self, _In_reads_(N)  hkDouble64* HK_RESTRICT p)
{
    HK_VECTOR4_TEMPLATE_CONFIG_NOT_IMPLEMENTED;
} };
template <int N>
struct unrolld_load<N, HK_IO_BYTE_ALIGNED> { HK_INLINE static void apply(hkQuadDouble64& self, _In_reads_(N) const hkFloat32* HK_RESTRICT p)
{
    switch (N)
    {
    case 1:
        {
            __m128 a = _mm_load_ss(p);
            self.xy = _mm_cvtps_pd(a);
            HK_ON_DEBUG( HK_M128(self.xy).m128d_f64[1] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
            HK_ON_DEBUG( HK_M128(self.zw).m128d_f64[0] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
            HK_ON_DEBUG( HK_M128(self.zw).m128d_f64[1] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
        }
        break;
    case 2:
        {
            __m128d a = _mm_load_sd((const double*)p);
            self.xy = _mm_cvtps_pd(_mm_castpd_ps(a));
            HK_ON_DEBUG( HK_M128(self.zw).m128d_f64[0] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
            HK_ON_DEBUG( HK_M128(self.zw).m128d_f64[1] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
        }
        break;
    case 3:
        {
            __m128d a = _mm_load_sd((const double*)p);
            __m128 b = _mm_load_ss(p+2);
            self.xy = _mm_cvtps_pd(_mm_castpd_ps(a));
            self.zw = _mm_cvtps_pd(b);
            HK_ON_DEBUG( HK_M128(self.zw).m128d_f64[1] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
        }
        break;
    default:
        {
#if HK_SSE_VERSION >= 0x30
            __m128 a = _mm_castsi128_ps(_mm_lddqu_si128((const __m128i*)p));
#else
            __m128 a = _mm_loadu_ps(p);
#endif
            self.xy = _mm_cvtps_pd(a);
            self.zw = _mm_cvtps_pd(_mm_shuffle_ps(a,a,_MM_SHUFFLE(1,0,3,2)));
        }
        break;
    }
} };
template <int N>
struct unrolld_load_D<N, HK_IO_BYTE_ALIGNED> { HK_INLINE static void apply(hkQuadDouble64& self, _In_reads_(N) const hkDouble64* HK_RESTRICT p)
{
    switch (N)
    {
    case 1:
        {
            self.xy = _mm_load_sd(p);
            HK_ON_DEBUG( HK_M128(self.xy).m128d_f64[1] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
            HK_ON_DEBUG( HK_M128(self.zw).m128d_f64[0] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
            HK_ON_DEBUG( HK_M128(self.zw).m128d_f64[1] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
        }
        break;
    case 2:
        {
#if HK_SSE_VERSION >= 0x30
            self.xy = _mm_castsi128_pd(_mm_lddqu_si128((const __m128i*)p));
#else
            self.xy = _mm_loadu_pd(p);
#endif
            HK_ON_DEBUG( HK_M128(self.zw).m128d_f64[0] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
            HK_ON_DEBUG( HK_M128(self.zw).m128d_f64[1] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
        }
        break;
    case 3:
        {
#if HK_SSE_VERSION >= 0x30
            self.xy = _mm_castsi128_pd(_mm_lddqu_si128((const __m128i*)p));
#else
            self.xy = _mm_loadu_pd(p);
#endif
            self.zw = _mm_load_sd(p+2);
            HK_ON_DEBUG( HK_M128(self.zw).m128d_f64[1] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
        }
        break;
    default:
        {
#if HK_SSE_VERSION >= 0x30
            self.xy = _mm_castsi128_pd(_mm_lddqu_si128((const __m128i*)p));
            self.zw = _mm_castsi128_pd(_mm_lddqu_si128((const __m128i*)(p+2)));
#else
            self.xy = _mm_loadu_pd(p);
            self.zw = _mm_loadu_pd(p+2);
#endif
        }
        break;
    }
} };
template <int N>
struct unrolld_load<N, HK_IO_NATIVE_ALIGNED> { HK_INLINE static void apply(hkQuadDouble64& self, _In_reads_(N) const hkFloat32* HK_RESTRICT p)
{
    HK_MATH_ASSERT(0x64211c2f, ( ((hkUlong)p) & (sizeof(hkFloat32)-1) ) == 0, "pointer must be aligned to native size of hkFloat32.");
    unrolld_load<N, HK_IO_BYTE_ALIGNED>::apply(self,p);
} };
template <int N>
struct unrolld_load_D<N, HK_IO_NATIVE_ALIGNED> { HK_INLINE static void apply(hkQuadDouble64& self, _In_reads_(N) const hkDouble64* HK_RESTRICT p)
{
    HK_MATH_ASSERT(0x64211c2f, ( ((hkUlong)p) & (sizeof(hkDouble64)-1) ) == 0, "pointer must be aligned to native size of hkDouble64.");
    unrolld_load_D<N, HK_IO_BYTE_ALIGNED>::apply(self,p);
} };
template <int N>
struct unrolld_load<N, HK_IO_SIMD_ALIGNED> { HK_INLINE static void apply(hkQuadDouble64& self, _In_reads_(N) const hkFloat32* HK_RESTRICT p)
{
    HK_MATH_ASSERT(0x64211c2f, ( ((hkUlong)p) & ((sizeof(hkFloat32)*(N!=3?N:4) )-1) ) == 0, "pointer must be aligned for SIMD.");
    if (N==4)
    {
        __m128 a = _mm_load_ps(p);
        self.xy = _mm_cvtps_pd(a);
        self.zw = _mm_cvtps_pd(_mm_shuffle_ps(a,a,_MM_SHUFFLE(1,0,3,2)));
    }
    else
    {
        unrolld_load<N, HK_IO_NATIVE_ALIGNED>::apply(self,p);
    }
} };
template <int N>
struct unrolld_load_D<N, HK_IO_SIMD_ALIGNED> { HK_INLINE static void apply(hkQuadDouble64& self, _In_reads_(N) const hkDouble64* HK_RESTRICT p)
{
    HK_MATH_ASSERT(0x64211c2f, ( ((hkUlong)p) & ((sizeof(hkDouble64)*(N!=3?N:4) )-1) ) == 0, "pointer must be aligned for SIMD.");
    switch (N)
    {
    case 2:
        {
            self.xy = _mm_load_pd(p);
            HK_ON_DEBUG( HK_M128(self.zw).m128d_f64[0] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
            HK_ON_DEBUG( HK_M128(self.zw).m128d_f64[1] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
        }
        break;
    case 3:
        {
            self.xy = _mm_load_pd(p);
            self.zw = _mm_load_sd(p+2);
            HK_ON_DEBUG( HK_M128(self.zw).m128d_f64[1] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
        }
        break;
    case 4:
        {
            self.xy = _mm_load_pd(p);
            self.zw = _mm_load_pd(p+2);
        }
        break;
    default:
        {
            unrolld_load_D<N, HK_IO_NATIVE_ALIGNED>::apply(self,p);
        }
        break;
    }
} };
} // namespace

template <int N, hkMathIoMode A>
HK_INLINE void hkVector4d::load(_In_reads_(N) const hkFloat32* p)
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_load<N,A>::apply(m_quad, p);
}

template <int N, hkMathIoMode A>
HK_INLINE void hkVector4d::load(_In_reads_(N) const hkDouble64* p)
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_load_D<N,A>::apply(m_quad, p);
}

template <int N>
HK_INLINE void hkVector4d::load(_In_reads_(N) const hkFloat32* p)
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_load<N,HK_IO_SIMD_ALIGNED>::apply(m_quad, p);
}

template <int N>
HK_INLINE void hkVector4d::load(_In_reads_(N) const hkDouble64* p)
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_load_D<N,HK_IO_SIMD_ALIGNED>::apply(m_quad, p);
}




namespace hkVector4_AdvancedInterface
{
template <int N, hkMathIoMode A>
struct unrolld_loadH { HK_INLINE static void apply(hkQuadDouble64& self, _In_reads_(N) const hkHalf16* HK_RESTRICT p)
{
    HK_VECTOR4_TEMPLATE_CONFIG_NOT_IMPLEMENTED;
} };
template <int N>
struct unrolld_loadH<N, HK_IO_BYTE_ALIGNED> { HK_INLINE static void apply(hkQuadDouble64& self, _In_reads_(N) const hkHalf16* HK_RESTRICT p)
{
    switch (N)
    {
    case 1:
        {
            __m128 f = _mm_castsi128_ps( _mm_cvtsi32_si128(p->getStorage() << 16) );
            self.xy = _mm_cvtps_pd(f);
            HK_ON_DEBUG( HK_M128(self.xy).m128d_f64[1] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
            HK_ON_DEBUG( HK_M128(self.zw).m128d_f64[0] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
            HK_ON_DEBUG( HK_M128(self.zw).m128d_f64[1] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
        }
        break;
    case 2:
        {
            __m128i twohalfs = _mm_castps_si128( _mm_load_ss((const float*)p) );
            __m128  twofloats = _mm_castsi128_ps( _mm_unpacklo_epi16(_mm_setzero_si128(), twohalfs) );
            self.xy = _mm_cvtps_pd(twofloats);
            HK_ON_DEBUG( HK_M128(self.zw).m128d_f64[0] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
            HK_ON_DEBUG( HK_M128(self.zw).m128d_f64[1] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
        }
        break;
    case 3:
        {
            HK_ALIGN16(hkHalf16 tmp[4]);
            tmp[0] = p[0];
            tmp[1] = p[1];
            tmp[2] = p[2];
#if defined(HK_COMPILER_MSVC)
            // VS bug: movq instruction causes mem corruption
            __m128i fourhalfs = _mm_castpd_si128(_mm_load_sd((const double*)tmp));
#else
            __m128i fourhalfs = _mm_loadl_epi64((const __m128i*)tmp);
#endif
            __m128  fourfloats = _mm_castsi128_ps( _mm_unpacklo_epi16(_mm_setzero_si128(), fourhalfs) );
            self.xy = _mm_cvtps_pd(fourfloats);
            self.zw = _mm_cvtps_pd(_mm_shuffle_ps(fourfloats,fourfloats,_MM_SHUFFLE(1,0,3,2)));
            HK_ON_DEBUG( HK_M128(self.zw).m128d_f64[1] = HK_M128(HK_VECTOR4d_DEBUG_FILL_VALUE.xy).m128d_f64[0]; )
        }
        break;
    default:
        {
            __m128i fourhalfs = _mm_castpd_si128(_mm_load_sd((const double*)p));
            __m128  fourfloats = _mm_castsi128_ps( _mm_unpacklo_epi16(_mm_setzero_si128(), fourhalfs) );
            self.xy = _mm_cvtps_pd(fourfloats);
            self.zw = _mm_cvtps_pd(_mm_shuffle_ps(fourfloats,fourfloats,_MM_SHUFFLE(1,0,3,2)));
        }
        break;
    }
} };
template <int N>
struct unrolld_loadH<N, HK_IO_NATIVE_ALIGNED> { HK_INLINE static void apply(hkQuadDouble64& self, _In_reads_(N) const hkHalf16* HK_RESTRICT p)
{
    HK_MATH_ASSERT(0x64211c2f, ( ((hkUlong)p) & (sizeof(hkHalf16)-1) ) == 0, "pointer must be aligned to native size of hkHalf16.");
    unrolld_loadH<N, HK_IO_BYTE_ALIGNED>::apply(self,p);
} };
template <int N>
struct unrolld_loadH<N, HK_IO_SIMD_ALIGNED> { HK_INLINE static void apply(hkQuadDouble64& self, _In_reads_(N) const hkHalf16* HK_RESTRICT p)
{
    HK_MATH_ASSERT(0x64211c2f, ( ((hkUlong)p) & ((sizeof(hkHalf16)*(N!=3?N:4) )-1) ) == 0, "pointer must be aligned for SIMD.");
    switch (N)
    {
    case 4:
        {
#if defined(HK_COMPILER_MSVC)
            // VS bug: movq instruction causes mem corruption
            __m128i fourhalfs = _mm_castpd_si128(_mm_load_sd((const double*)p));
#else
            __m128i fourhalfs = _mm_loadl_epi64((const __m128i*)p);
#endif
            __m128  fourfloats = _mm_castsi128_ps( _mm_unpacklo_epi16(_mm_setzero_si128(), fourhalfs) );
            self.xy = _mm_cvtps_pd(fourfloats);
            self.zw = _mm_cvtps_pd(_mm_shuffle_ps(fourfloats,fourfloats,_MM_SHUFFLE(1,0,3,2)));
        }
        break;
    default:
        {
            unrolld_loadH<N, HK_IO_NATIVE_ALIGNED>::apply(self,p);
        }
        break;
    }
} };
} // namespace

template <int N, hkMathIoMode A>
HK_INLINE void hkVector4d::load(_In_reads_(N) const hkHalf16* p)
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_loadH<N,A>::apply(m_quad, p);
}

template <int N>
HK_INLINE void hkVector4d::load(_In_reads_(N) const hkHalf16* p)
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_loadH<N,HK_IO_SIMD_ALIGNED>::apply(m_quad, p);
}



namespace hkVector4_AdvancedInterface
{
template <int N, hkMathIoMode A>
struct unrolld_loadF16 { HK_INLINE static void apply(hkQuadDouble64& self, _In_reads_(N) const hkFloat16* HK_RESTRICT p)
{
    HK_VECTOR4_TEMPLATE_CONFIG_NOT_IMPLEMENTED;
} };
template <int N>
struct unrolld_loadF16<N, HK_IO_BYTE_ALIGNED> { HK_INLINE static void apply(hkQuadDouble64& self, _In_reads_(N) const hkFloat16* HK_RESTRICT p)
{
    HK_ALIGN16(double tmp[4]);
    switch (N)
    {
    case 4:  tmp[3] = p[3].getFloat32();
    case 3:  tmp[2] = p[2].getFloat32();
    case 2:  tmp[1] = p[1].getFloat32();
    default: tmp[0] = p[0].getFloat32(); break;
    }

    self.xy = _mm_load_pd(tmp);
    self.zw = _mm_load_pd(tmp + 2);
} };
template <int N>
struct unrolld_loadF16<N, HK_IO_NATIVE_ALIGNED> { HK_INLINE static void apply(hkQuadDouble64& self, _In_reads_(N) const hkFloat16* HK_RESTRICT p)
{
    HK_MATH_ASSERT(0x64211c2f, ( ((hkUlong)p) & (sizeof(hkFloat16)-1) ) == 0, "pointer must be aligned to native size of hkFloat16.");
    unrolld_loadF16<N, HK_IO_BYTE_ALIGNED>::apply(self,p);
} };
template <int N>
struct unrolld_loadF16<N, HK_IO_SIMD_ALIGNED> { HK_INLINE static void apply(hkQuadDouble64& self, _In_reads_(N) const hkFloat16* HK_RESTRICT p)
{
    HK_MATH_ASSERT(0x64211c2f, ( ((hkUlong)p) & ((sizeof(hkFloat16)*(N!=3?N:4) )-1) ) == 0, "pointer must be aligned for SIMD.");
    unrolld_loadF16<N, HK_IO_NATIVE_ALIGNED>::apply(self,p);
} };
} // namespace

template <int N, hkMathIoMode A>
HK_INLINE void hkVector4d::load(_In_reads_(N) const hkFloat16* p)
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_loadF16<N,A>::apply(m_quad, p);
}

template <int N>
HK_INLINE void hkVector4d::load(_In_reads_(N) const hkFloat16* p)
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_loadF16<N,HK_IO_SIMD_ALIGNED>::apply(m_quad, p);
}



namespace hkVector4_AdvancedInterface
{
template <int N, hkMathIoMode A>
struct unrolld_store { HK_INLINE static void apply(const hkQuadDouble64& self, _Out_writes_all_(N) hkFloat32* HK_RESTRICT p)
{
    HK_VECTOR4_TEMPLATE_CONFIG_NOT_IMPLEMENTED;
} };
template <int N, hkMathIoMode A>
struct unrolld_store_D { HK_INLINE static void apply(const hkQuadDouble64& self, _Out_writes_all_(N) hkDouble64* HK_RESTRICT p)
{
    HK_VECTOR4_TEMPLATE_CONFIG_NOT_IMPLEMENTED;
} };
template <int N>
struct unrolld_store<N, HK_IO_BYTE_ALIGNED> { HK_INLINE static void apply(const hkQuadDouble64& self, _Out_writes_all_(N) hkFloat32* HK_RESTRICT p)
{
    switch (N)
    {
    case 1:
        {
            __m128 a = _mm_cvtpd_ps(self.xy);
            _mm_store_ss(p, a);
        }
        break;
    case 2:
        {
            __m128 a = _mm_cvtpd_ps(self.xy);
            _mm_storel_pi( (__m64*)p, a);
        }
        break;
    case 3:
        {
            __m128 a = _mm_cvtpd_ps(self.xy);
            __m128 b = _mm_cvtpd_ps(self.zw);
            _mm_storel_pi( (__m64*)p, a);
            _mm_store_ss(p+2, b);
        }
        break;
    default:
        {
            __m128 a = _mm_cvtpd_ps(self.xy);
            __m128 b = _mm_cvtpd_ps(self.zw);
            __m128 xyzw = _mm_movelh_ps(a,b);
            _mm_storeu_ps(p, xyzw);
        }
        break;
    }
} };
template <int N>
struct unrolld_store_D<N, HK_IO_BYTE_ALIGNED> { HK_INLINE static void apply(const hkQuadDouble64& self, _Out_writes_all_(N) hkDouble64* HK_RESTRICT p)
{
    switch (N)
    {
    case 1:
        {
            _mm_store_sd(p, self.xy);
        }
        break;
    case 2:
        {
            _mm_storeu_pd(p, self.xy);
        }
        break;
    case 3:
        {
            _mm_storeu_pd(p, self.xy);
            _mm_store_sd(p+2, self.zw);
        }
        break;
    default:
        {
            _mm_storeu_pd(p, self.xy);
            _mm_storeu_pd(p+2, self.zw);
        }
        break;
    }
} };
template <int N>
struct unrolld_store<N, HK_IO_NATIVE_ALIGNED> { HK_INLINE static void apply(const hkQuadDouble64& self, _Out_writes_all_(N) hkFloat32* HK_RESTRICT p)
{
    HK_MATH_ASSERT(0x64211c2f, ( ((hkUlong)p) & (sizeof(hkFloat32)-1) ) == 0, "pointer must be aligned to native size of hkFloat32.");
    unrolld_store<N, HK_IO_BYTE_ALIGNED>::apply(self,p);
} };
template <int N>
struct unrolld_store_D<N, HK_IO_NATIVE_ALIGNED> { HK_INLINE static void apply(const hkQuadDouble64& self, _Out_writes_all_(N) hkDouble64* HK_RESTRICT p)
{
    HK_MATH_ASSERT(0x64211c2f, ( ((hkUlong)p) & (sizeof(hkDouble64)-1) ) == 0, "pointer must be aligned to native size of hkDouble64.");
    unrolld_store_D<N, HK_IO_BYTE_ALIGNED>::apply(self,p);
} };
template <int N>
struct unrolld_store<N, HK_IO_SIMD_ALIGNED> { HK_INLINE static void apply(const hkQuadDouble64& self, _Out_writes_all_(N) hkFloat32* HK_RESTRICT p)
{
    HK_MATH_ASSERT(0x64211c2f, ( ((hkUlong)p) & ((sizeof(hkFloat32)*(N!=3?N:4) )-1) ) == 0, "pointer must be aligned for SIMD.");
    if (N==4)
    {
        __m128 a = _mm_cvtpd_ps(self.xy);
        __m128 b = _mm_cvtpd_ps(self.zw);
        __m128 xyzw = _mm_movelh_ps(a,b);
        _mm_store_ps(p, xyzw);
    }
    else
    {
        unrolld_store<N, HK_IO_NATIVE_ALIGNED>::apply(self,p);
    }
} };
template <int N>
struct unrolld_store_D<N, HK_IO_SIMD_ALIGNED> { HK_INLINE static void apply(const hkQuadDouble64& self, _Out_writes_all_(N) hkDouble64* HK_RESTRICT p)
{
    HK_MATH_ASSERT(0x64211c2f, ( ((hkUlong)p) & ((sizeof(hkDouble64)*(N!=3?N:4) )-1) ) == 0, "pointer must be aligned for SIMD.");
    switch (N)
    {
    case 2:
        {
            _mm_store_pd(p, self.xy);
        }
        break;
    case 3:
        {
            _mm_store_pd(p, self.xy);
            _mm_store_sd(p+2, self.zw);
        }
        break;
    case 4:
        {
            _mm_store_pd(p, self.xy);
            _mm_store_pd(p+2, self.zw);
        }
        break;
    default:
        {
            unrolld_store_D<N, HK_IO_NATIVE_ALIGNED>::apply(self,p);
        }
        break;
    }
} };
} // namespace

template <int N, hkMathIoMode A, hkMathRoundingMode R>
HK_INLINE void hkVector4d::store(_Out_writes_all_(N) hkFloat32* p) const
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_store<N,A>::apply(m_quad, p);
}

template <int N, hkMathIoMode A, hkMathRoundingMode R>
HK_INLINE void hkVector4d::store(_Out_writes_all_(N) hkDouble64* p) const
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_store_D<N,A>::apply(m_quad, p);
}

template <int N, hkMathIoMode A>
HK_INLINE void hkVector4d::store(_Out_writes_all_(N) hkFloat32* p) const
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_store<N,A>::apply(m_quad, p);
}

template <int N, hkMathIoMode A>
HK_INLINE void hkVector4d::store(_Out_writes_all_(N) hkDouble64* p) const
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_store_D<N,A>::apply(m_quad, p);
}

template <int N>
HK_INLINE void hkVector4d::store(_Out_writes_all_(N) hkFloat32* p) const
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_store<N,HK_IO_SIMD_ALIGNED>::apply(m_quad, p);
}

template <int N>
HK_INLINE void hkVector4d::store(_Out_writes_all_(N) hkDouble64* p) const
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_store_D<N,HK_IO_SIMD_ALIGNED>::apply(m_quad, p);
}


namespace hkVector4_AdvancedInterface
{
template <int N, hkMathRoundingMode R>
HK_INLINE static void convertf64half(const hkQuadDouble64& self, __m128i& packed)
{
    static HK_ALIGN16(const unsigned int scale[4])      = {0x3F808000, 0x3F808000, 0x3F808000, 0x3F808000}; // 1 + 1/256 as float32

    __m128 xyzw;
    if (N<3)
    {
        xyzw = _mm_cvtpd_ps(self.xy);
    }
    else
    {
        __m128 xy = _mm_cvtpd_ps(self.xy);
        __m128 zw = _mm_cvtpd_ps(self.zw);
        xyzw = _mm_shuffle_ps(xy,zw,_MM_SHUFFLE(1,0,1,0));
    }

    if (R == HK_ROUND_NEAREST)
    {
        xyzw = _mm_mul_ps(xyzw, *((__m128*)&scale));
    }

    __m128i tmp0 = _mm_srai_epi32( _mm_castps_si128(xyzw), 16 );
    packed = _mm_packs_epi32(tmp0, tmp0);
}
template <int N, hkMathIoMode A, hkMathRoundingMode R>
struct unrolld_storeH { HK_INLINE static void apply(const hkQuadDouble64& self, _Out_writes_all_(N) hkHalf16* HK_RESTRICT p)
{
    HK_VECTOR4_TEMPLATE_CONFIG_NOT_IMPLEMENTED;
} };
template <int N, hkMathRoundingMode R>
struct unrolld_storeH<N, HK_IO_BYTE_ALIGNED, R> { HK_INLINE static void apply(const hkQuadDouble64& self, _Out_writes_all_(N) hkHalf16* HK_RESTRICT p)
{
    __m128i packed;
    convertf64half<N,R>(self,packed);

    switch (N)
    {
    case 1:
        {
            HK_ALIGN16(hkHalf16 tmp[2]);
            _mm_store_ss((float*)tmp, _mm_castsi128_ps(packed));
            p[0] = tmp[0];
        }
        break;
    case 2:
        {
            _mm_store_ss((float*)p, _mm_castsi128_ps(packed));
        }
        break;
    case 3:
        {
            HK_ALIGN16(hkHalf16 tmp[4]);
#if defined(HK_COMPILER_MSVC)
            // VS bug: movq instruction causes mem corruption
            _mm_store_sd((double*)tmp, _mm_castsi128_pd(packed));
#else
            _mm_storel_epi64((__m128i*)tmp, packed);
#endif
            p[0] = tmp[0];
            p[1] = tmp[1];
            p[2] = tmp[2];
        }
        break;
    default:
        {
            _mm_store_sd((double*) p, _mm_castsi128_pd(packed));
        }
        break;
    }
} };
template <int N, hkMathRoundingMode R>
struct unrolld_storeH<N, HK_IO_NATIVE_ALIGNED, R> { HK_INLINE static void apply(const hkQuadDouble64& self, _Out_writes_all_(N) hkHalf16* HK_RESTRICT p)
{
    HK_MATH_ASSERT(0x64211c2f, ( ((hkUlong)p) & (sizeof(hkHalf16)-1) ) == 0, "pointer must be aligned to native size of hkHalf16.");
    unrolld_storeH<N, HK_IO_BYTE_ALIGNED, R>::apply(self,p);
} };
template <int N, hkMathRoundingMode R>
struct unrolld_storeH<N, HK_IO_SIMD_ALIGNED, R> { HK_INLINE static void apply(const hkQuadDouble64& self, _Out_writes_all_(N) hkHalf16* HK_RESTRICT p)
{
    HK_MATH_ASSERT(0x64211c2f, ( ((hkUlong)p) & ((sizeof(hkHalf16)*(N!=3?N:4) )-1) ) == 0, "pointer must be aligned for SIMD.");
    switch (N)
    {
    case 4:
        {
            __m128i packed;
            convertf64half<N,R>(self,packed);
#if defined(HK_COMPILER_MSVC)
            // VS bug: movq instruction causes mem corruption
            _mm_store_sd((double*)p, _mm_castsi128_pd(packed));
#else
            _mm_storel_epi64((__m128i*)p, packed);
#endif
        }
        break;
    default:
        {
            unrolld_storeH<N, HK_IO_NATIVE_ALIGNED, R>::apply(self,p);
        }
        break;
    }
} };
} // namespace

template <int N, hkMathIoMode A, hkMathRoundingMode R>
HK_INLINE void hkVector4d::store(_Out_writes_all_(N) hkHalf16* p) const
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_storeH<N,A,R>::apply(m_quad, p);
}

template <int N, hkMathIoMode A>
HK_INLINE void hkVector4d::store(_Out_writes_all_(N) hkHalf16* p) const
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_storeH<N,A,HK_ROUND_DEFAULT>::apply(m_quad, p);
}

template <int N>
HK_INLINE void hkVector4d::store(_Out_writes_all_(N) hkHalf16* p) const
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_storeH<N,HK_IO_SIMD_ALIGNED,HK_ROUND_DEFAULT>::apply(m_quad, p);
}




namespace hkVector4_AdvancedInterface
{
template <int N, hkMathIoMode A, hkMathRoundingMode R>
struct unrolld_storeF16 { HK_INLINE static void apply(const hkQuadDouble64& self, _Out_writes_all_(N) hkFloat16* HK_RESTRICT p)
{
    HK_VECTOR4_TEMPLATE_CONFIG_NOT_IMPLEMENTED;
} };
template <int N, hkMathRoundingMode R>
struct unrolld_storeF16<N, HK_IO_BYTE_ALIGNED, R> { HK_INLINE static void apply(const hkQuadDouble64& self, _Out_writes_all_(N) hkFloat16* HK_RESTRICT p)
{
    HK_ALIGN16(double tmp[4]);
    _mm_store_pd(tmp + 0, self.xy);
    _mm_store_pd(tmp + 2, self.zw);
    switch (N)
    {
    case 4:  p[3].setReal<(R == HK_ROUND_NEAREST)>(hkReal(tmp[3]));
    case 3:  p[2].setReal<(R == HK_ROUND_NEAREST)>(hkReal(tmp[2]));
    case 2:  p[1].setReal<(R == HK_ROUND_NEAREST)>(hkReal(tmp[1]));
    default: p[0].setReal<(R == HK_ROUND_NEAREST)>(hkReal(tmp[0])); break;
    }
} };
template <int N, hkMathRoundingMode R>
struct unrolld_storeF16<N, HK_IO_NATIVE_ALIGNED, R> { HK_INLINE static void apply(const hkQuadDouble64& self, _Out_writes_all_(N) hkFloat16* HK_RESTRICT p)
{
    HK_MATH_ASSERT(0x64211c2f, ( ((hkUlong)p) & (sizeof(hkFloat16)-1) ) == 0, "pointer must be aligned to native size of hkFloat16.");
    unrolld_storeF16<N, HK_IO_BYTE_ALIGNED, R>::apply(self,p);
} };
template <int N, hkMathRoundingMode R>
struct unrolld_storeF16<N, HK_IO_SIMD_ALIGNED, R> { HK_INLINE static void apply(const hkQuadDouble64& self, _Out_writes_all_(N) hkFloat16* HK_RESTRICT p)
{
    HK_MATH_ASSERT(0x64211c2f, ( ((hkUlong)p) & ((sizeof(hkFloat16)*(N!=3?N:4) )-1) ) == 0, "pointer must be aligned for SIMD.");
    unrolld_storeF16<N, HK_IO_NATIVE_ALIGNED, R>::apply(self,p);
} };
} // namespace

template <int N, hkMathIoMode A, hkMathRoundingMode R>
HK_INLINE void hkVector4d::store(_Out_writes_all_(N) hkFloat16* p) const
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_storeF16<N,A,R>::apply(m_quad, p);
}

template <int N, hkMathIoMode A>
HK_INLINE void hkVector4d::store(_Out_writes_all_(N) hkFloat16* p) const
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_storeF16<N,A,HK_ROUND_DEFAULT>::apply(m_quad, p);
}

template <int N>
HK_INLINE void hkVector4d::store(_Out_writes_all_(N) hkFloat16* p) const
{
    HK_VECTOR4_UNSUPPORTED_LENGTH_CHECK;
    hkVector4_AdvancedInterface::unrolld_storeF16<N,HK_IO_SIMD_ALIGNED,HK_ROUND_DEFAULT>::apply(m_quad, p);
}

/*
 * Havok SDK - Base file, BUILD(#20180110)
 * 
 * Confidential Information of Microsoft Corporation.
 * Not for disclosure or distribution without Microsoft's prior written
 * consent.  This software contains code, techniques and know-how which
 * is confidential and proprietary to Microsoft.  Product and Trade Secret
 * source code contains trade secrets of Microsoft.  Havok Software (C)
 * Copyright 1999-2018 Microsoft Corporation.
 * All Rights Reserved. Use of this software is subject to the
 * terms of an end user license agreement.
 * 
 * The Havok Logo, and the Havok buzzsaw logo are trademarks of Microsoft.
 * Title, ownership rights, and intellectual property rights in the Havok
 * software remain in Microsoft and/or its suppliers.
 * 
 * Use of this software for evaluation purposes is subject to and
 * indicates acceptance of the End User licence Agreement for this
 * product. A copy of the license is included with this software and is
 * also available from Havok Support.
 * 
 */
