// TKBMS v1.0 -----------------------------------------------------
//
// PLATFORM   : ANDROID APOLLO_ARM IOS METRO_ARM UWP_ARM NX32 NX64
// PRODUCT   : COMMON
// VISIBILITY   : PUBLIC
//
// ------------------------------------------------------TKBMS v1.0

#ifndef HK_COMPILER_HAS_INTRINSICS_NEON
#error Cant include this header on non Neon setups..
#endif

// dot3_3vs3 written with one dot<3>() call per vector pair (disabled)
#if 0
dotsOut.set(a0.dot<3>(b0), a1.dot<3>(b1), a2.dot<3>(b2), a2.dot<3>(b2));
#endif

// dot3_3vs3 variant built on vzipq_f32 (disabled); its w lane is a20b20 + a21b21 + a23b23
#if 0
// (a00b00, a01b01, a02b02, a03b03)
float32x4_t c0 = vmulq_f32(a.getColumn<0>().m_quad, b.getColumn<0>().m_quad);
// (a10b10, a11b11, a12b12, a13b13)
float32x4_t c1 = vmulq_f32(a.getColumn<1>().m_quad, b.getColumn<1>().m_quad);
// (a20b20, a21b21, a22b22, a23b23)
float32x4_t c2 = vmulq_f32(a.getColumn<2>().m_quad, b.getColumn<2>().m_quad);

// (a00b00, a01b01)
float32x2_t xy0 = vget_low_f32(c0);
// (a10b10, a11b11)
float32x2_t xy1 = vget_low_f32(c1);
// (a20b20, a21b21)
float32x2_t xy2 = vget_low_f32(c2);

// (a00b00 + a01b01, a10b10 + a11b11)
float32x2_t xy01 = vpadd_f32(xy0, xy1);
// (a20b20 + a21b21, a20b20 + a21b21)
float32x2_t xy22 = vpadd_f32(xy2, xy2);

// (a00b00 + a01b01, a10b10 + a11b11, a20b20 + a21b21, a20b20 + a21b21)
float32x4_t xy = vcombine_f32(xy01, xy22);

// vzipq_f32(c0, c1): [(a00b00, a10b10, a01b01, a11b11), (a02b02, a12b12, a03b03, a13b13)]
// (a02b02, a12b12, a03b03, a13b13)
float32x4_t zzww01 = vzipq_f32(c0, c1).val[1];
// (a02b02, a12b12)
float32x2_t zz01 = vget_low_f32(zzww01);

// (a02b02, a12b12, a22b22, a23b23)
float32x4_t zzz = vcombine_f32(zz01, vget_high_f32(c2));

// (a00b00 + a01b01 + a02b02, a10b10 + a11b11 + a12b12, a20b20 + a21b21 + a22b22, a20b20 + a21b21 + a23b23)
dotsOut.m_quad = vaddq_f32(zzz, xy);
#endif

#define HK_VECTOR4fUTIL_dot3_3vs3
// NEON specialization: computes three 3-component dot products at once.
// On return dotsOut holds (a0.b0, a1.b1, a2.b2, a2.b2), each dot taken over x, y, z only.
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::dot3_3vs3(Vec4_ a0, Vec4_ b0, Vec4_ a1, Vec4_ b1, Vec4_ a2, Vec4_ b2, Vec4& dotsOut)
{
    // Lane-wise products of every (a, b) pair. The w lanes are produced too,
    // but nothing below ever adds them into a result.
    hkVector4f prod0; prod0.setMul(a0, b0);
    hkVector4f prod1; prod1.setMul(a1, b1);
    hkVector4f prod2; prod2.setMul(a2, b2);

    // Pairs 0 and 1 are reduced side by side. vtrnq_f32 gives
    //   val[0] = (p0.x, p1.x, p0.z, p1.z)
    //   val[1] = (p0.y, p1.y, p0.w, p1.w)
    // so the x, y and z terms of both pairs land in matching 2-lane halves.
    const float32x4x2_t transposed = vtrnq_f32(prod0.m_quad, prod1.m_quad);
    const float32x2_t xTerms01 = vget_low_f32(transposed.val[0]);
    const float32x2_t yTerms01 = vget_low_f32(transposed.val[1]);
    const float32x2_t zTerms01 = vget_high_f32(transposed.val[0]);
    // (p0.x + p0.y + p0.z, p1.x + p1.y + p1.z), summed as (x + y) + z
    const float32x2_t dots01 = vadd_f32(vadd_f32(xTerms01, yTerms01), zTerms01);

    // Pair 2 is reduced on its own and broadcast into both upper lanes.
    const float32x2_t lowHalf2 = vget_low_f32(prod2.m_quad);    // (p2.x, p2.y)
    const float32x2_t highHalf2 = vget_high_f32(prod2.m_quad);  // (p2.z, p2.w)
    // Pairwise add of the low half with itself yields (p2.x + p2.y) in both lanes.
    const float32x2_t xySum2 = vpadd_f32(lowHalf2, lowHalf2);
    // Lane 0 of the high half is p2.z; duplicate it so w never takes part.
    const float32x2_t zTerm2 = vdup_lane_f32(highHalf2, 0);
    const float32x2_t dots22 = vadd_f32(xySum2, zTerm2);

    // (dot3(a0,b0), dot3(a1,b1), dot3(a2,b2), dot3(a2,b2))
    dotsOut.m_quad = vcombine_f32(dots01, dots22);
}

#define HK_VECTOR4fUTIL_dot4_3vs3
// NEON specialization: three 4-component dot products from two rounds of paired adds.
// NOTE(review): the lane comments assume setPairedAdd(u, v) produces
// (u.x + u.y, u.z + u.w, v.x + v.y, v.z + v.w) -- confirm against hkVector4f.
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::dot4_3vs3(hkVector4fParameter a0, hkVector4fParameter b0, hkVector4fParameter a1, hkVector4fParameter b1, hkVector4fParameter a2, hkVector4fParameter b2, hkVector4f& dotsOut)
{
    // Lane-wise products, one vector per (a, b) pair.
    hkVector4f prod0;
    prod0.setMul(a0, b0);
    hkVector4f prod1;
    prod1.setMul(a1, b1);
    hkVector4f prod2;
    prod2.setMul(a2, b2);

    // First reduction round: partial sums of pairs 0 and 1 share one vector.
    hkVector4f partial01;
    partial01.setPairedAdd(prod0, prod1);

    // Only three pairs exist, so pair 2 is fed in twice; under the assumption
    // above this makes the last two output lanes identical.
    hkVector4f partial22;
    partial22.setPairedAdd(prod2, prod2);

    // Second reduction round collapses the partial sums into the final dots.
    dotsOut.setPairedAdd(partial01, partial22);
}

#define HK_VECTOR4fUTIL_dot4_4vs4
// NEON specialization: four 4-component dot products from two rounds of paired adds.
// NOTE(review): the lane comments assume setPairedAdd(u, v) produces
// (u.x + u.y, u.z + u.w, v.x + v.y, v.z + v.w) -- confirm against hkVector4f.
template <>
HK_INLINE void HK_CALL hkVector4UtilImpl<hkFloat32>::dot4_4vs4(hkVector4fParameter a0, hkVector4fParameter b0, hkVector4fParameter a1, hkVector4fParameter b1, hkVector4fParameter a2, hkVector4fParameter b2, hkVector4fParameter a3, hkVector4fParameter b3, hkVector4f& dotsOut)
{
    // Lane-wise products, one vector per (a, b) pair.
    hkVector4f prod0;
    prod0.setMul(a0, b0);
    hkVector4f prod1;
    prod1.setMul(a1, b1);
    hkVector4f prod2;
    prod2.setMul(a2, b2);
    hkVector4f prod3;
    prod3.setMul(a3, b3);

    // First reduction round: pairs (0, 1) and (2, 3) each fold into one vector
    // of partial sums.
    hkVector4f partial01;
    partial01.setPairedAdd(prod0, prod1);
    hkVector4f partial23;
    partial23.setPairedAdd(prod2, prod3);

    // Second reduction round; under the assumption above this leaves
    // (a0.b0, a1.b1, a2.b2, a3.b3) in dotsOut.
    dotsOut.setPairedAdd(partial01, partial23);
}

/*
 * Havok SDK - Base file, BUILD(#20180110)
 * 
 * Confidential Information of Microsoft Corporation.
 * Not for disclosure or distribution without Microsoft's prior written
 * consent.  This software contains code, techniques and know-how which
 * is confidential and proprietary to Microsoft.  Product and Trade Secret
 * source code contains trade secrets of Microsoft.  Havok Software (C)
 * Copyright 1999-2018 Microsoft Corporation.
 * All Rights Reserved. Use of this software is subject to the
 * terms of an end user license agreement.
 * 
 * The Havok Logo, and the Havok buzzsaw logo are trademarks of Microsoft.
 * Title, ownership rights, and intellectual property rights in the Havok
 * software remain in Microsoft and/or its suppliers.
 * 
 * Use of this software for evaluation purposes is subject to and
 * indicates acceptance of the End User licence Agreement for this
 * product. A copy of the license is included with this software and is
 * also available from Havok Support.
 * 
 */
