/* 
 * 
 * Confidential Information of Telekinesys Research Limited (t/a Havok). Not for disclosure or distribution without Havok's
 * prior written consent. This software contains code, techniques and know-how which is confidential and proprietary to Havok.
 * Level 2 and Level 3 source code contains trade secrets of Havok. Havok Software (C) Copyright 1999-2010 Telekinesys Research Limited t/a Havok. All Rights Reserved. Use of this software is subject to the terms of an end user license agreement.
 * 
 */

//
//	Possible outcomes from the Minkowski portal refinement step

#define REPLACE_TETRA1_WITH_TETRA4	0
#define REPLACE_TETRA2_WITH_TETRA4	1
#define REPLACE_TETRA3_WITH_TETRA4	2
#define REPLACE_UNDEFINED			3

//
//	Normal indices, per tetrahedron face

#define FACE_012	1
#define FACE_023	0
#define FACE_031	2

//
//	Types of outputs from the next candidate map

#define VIOLATING_PLANE_012		FACE_012	// The ray is outside tetrahedron face 012. Return the index of the vertex that needs to be replaced, i.e. vTetra[2]
#define VIOLATING_PLANE_023		FACE_023	// The ray is outside tetrahedron face 023. Return the index of the vertex that needs to be replaced, i.e. vTetra[0]
#define VIOLATING_PLANE_031		FACE_031	// The ray is outside tetrahedron face 031. Return the index of the vertex that needs to be replaced, i.e. vTetra[1]
#define NO_VIOLATING_PLANE		3			// The ray is completely contained. Do nothing!

#if defined(HK_PLATFORM_WIN32) && (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED)

//
//	Sorts the components of a vector

#define _sortElements_
HK_FORCE_INLINE void sortElements(hkVector4& vValues, hkVector4& vIndices)
{
	// reg0 = [4, 3, 2, 1]
	register hkQuadReal regVal0 = vValues.getQuad();
	register hkQuadReal regIdx0 = vIndices.getQuad();

	//	MOVLHPS(1, 0);
	register hkQuadReal regVal1 = _mm_movelh_ps(regVal0, regVal0);
	register hkQuadReal regIdx1 = _mm_movelh_ps(regIdx0, regIdx0);

	//	SWAP(1, 0);
	{
		register hkQuadReal tmpVal = regVal1;
		register hkQuadReal tmpIdx = regIdx1;

		// cmp = (regVal1 > regVal0) ? true : false
		register hkQuadReal cmp = _mm_cmpgt_ps(regVal1, regVal0);

		// regVal1 = (regVal1 > regVal0) ? regVal0 : regVal1
		// regIdx1 = (regVal1 > regVal0) ? regIdx0 : regIdx1
		regVal1 = _mm_or_ps(_mm_and_ps(cmp, regVal0), _mm_andnot_ps(cmp, regVal1));
		regIdx1 = _mm_or_ps(_mm_and_ps(cmp, regIdx0), _mm_andnot_ps(cmp, regIdx1));

		// regVal0 = (regVal1 > regVal0) ? regVal1 : regVal0
		// regIdx0 = (regVal1 > regVal0) ? regIdx1 : regIdx0
		regVal0 = _mm_or_ps(_mm_and_ps(cmp, tmpVal), _mm_andnot_ps(cmp, regVal0));
		regIdx0 = _mm_or_ps(_mm_and_ps(cmp, tmpIdx), _mm_andnot_ps(cmp, regIdx0));
	}

	//	MOVHLPS(0, 1);
	regVal0 = _mm_movehl_ps(regVal0, regVal1);
	regIdx0 = _mm_movehl_ps(regIdx0, regIdx1);

	//	SHUFPS(1, 0, 136);
	regVal1 = _mm_shuffle_ps(regVal1, regVal0, 0x88);
	regIdx1 = _mm_shuffle_ps(regIdx1, regIdx0, 0x88);

	//	SWAP(1, 0);
	{
		// 		register hkQuadReal tmp = regVal1;
		// 		regVal1 = _mm_min_ps(regVal1, regVal0);
		// 		regVal0 = _mm_max_ps(tmp, regVal0);

		register hkQuadReal tmpVal = regVal1;
		register hkQuadReal tmpIdx = regIdx1;

		// cmp = (regVal1 > regVal0) ? true : false
		register hkQuadReal cmp = _mm_cmpgt_ps(regVal1, regVal0);

		// regVal1 = (regVal1 > regVal0) ? regVal0 : regVal1
		// regIdx1 = (regVal1 > regVal0) ? regIdx0 : regIdx1
		regVal1 = _mm_or_ps(_mm_and_ps(cmp, regVal0), _mm_andnot_ps(cmp, regVal1));
		regIdx1 = _mm_or_ps(_mm_and_ps(cmp, regIdx0), _mm_andnot_ps(cmp, regIdx1));

		// regVal0 = (regVal1 > regVal0) ? regVal1 : regVal0
		// regIdx0 = (regVal1 > regVal0) ? regIdx1 : regIdx0
		regVal0 = _mm_or_ps(_mm_and_ps(cmp, tmpVal), _mm_andnot_ps(cmp, regVal0));
		regIdx0 = _mm_or_ps(_mm_and_ps(cmp, tmpIdx), _mm_andnot_ps(cmp, regIdx0));
	}

	//	MOVLHPS(2, 0);
	register hkQuadReal regVal2 = _mm_movelh_ps(regVal0, regVal0);
	register hkQuadReal regIdx2 = _mm_movelh_ps(regIdx0, regIdx0);

	//	UNPCKHPS(3, 1);
	register hkQuadReal regVal3 = _mm_unpackhi_ps(regVal1, regVal1);
	register hkQuadReal regIdx3 = _mm_unpackhi_ps(regIdx1, regIdx1);

	//	SWAP(2, 3);
	{
		// 		register hkQuadReal tmp = regVal2;
		// 		regVal2 = _mm_min_ps(regVal2, regVal3);
		// 		regVal3 = _mm_max_ps(tmp, regVal3);

		register hkQuadReal tmpVal = regVal2;
		register hkQuadReal tmpIdx = regIdx2;

		// cmp = (regVal2 > regVal3) ? true : false
		register hkQuadReal cmp = _mm_cmpgt_ps(regVal2, regVal3);

		// regVal2 = (regVal2 > regVal3) ? regVal3 : regVal2
		// regIdx2 = (regVal2 > regVal3) ? regIdx3 : regIdx2
		regVal2 = _mm_or_ps(_mm_and_ps(cmp, regVal3), _mm_andnot_ps(cmp, regVal2));
		regIdx2 = _mm_or_ps(_mm_and_ps(cmp, regIdx3), _mm_andnot_ps(cmp, regIdx2));

		// regVal3 = (regVal2 > regVal3) ? regVal2 : regVal3
		// regIdx3 = (regVal2 > regVal3) ? regIdx2 : regIdx3
		regVal3 = _mm_or_ps(_mm_and_ps(cmp, tmpVal), _mm_andnot_ps(cmp, regVal3));
		regIdx3 = _mm_or_ps(_mm_and_ps(cmp, tmpIdx), _mm_andnot_ps(cmp, regIdx3));
	}

	//	SHUFPS(1, 2, 49);
	regVal1 = _mm_shuffle_ps(regVal1, regVal2, 0x31);
	regIdx1 = _mm_shuffle_ps(regIdx1, regIdx2, 0x31);

	//	MOVHLPS(0, 3);
	regVal0 = _mm_movehl_ps(regVal0, regVal3);
	regIdx0 = _mm_movehl_ps(regIdx0, regIdx3);

	//	SHUFPS(1, 0, 216);
	regVal1 = _mm_shuffle_ps(regVal1, regVal0, 0xD8);
	regIdx1 = _mm_shuffle_ps(regIdx1, regIdx0, 0xD8);

	//	MULTI_WRITE_BACK(1, 0, 1);
	vValues.getQuad() = regVal1;
	vIndices.getQuad() = regIdx1;
}

//
//	vValuesA must be sorted descending, vValuesB must be sorted ascending. The values in the registers are actually sorted in reverse
//	This will output in vectors A the maximum 4 values of the 2 vectors

#define _bitonicMerge_
HK_FORCE_INLINE void bitonicMerge(hkVector4& vValuesA, hkVector4& vIndicesA, hkVector4& vValuesB, hkVector4& vIndicesB)
{
	// A = [a0, a1, a2, a3]
	// B = [b0, b1, b2, b3]
	register hkQuadReal regValA = vValuesA.getQuad();
	register hkQuadReal regValB = vValuesB.getQuad();
	register hkQuadReal regIdxA = vIndicesA.getQuad();
	register hkQuadReal regIdxB = vIndicesB.getQuad();

	// Stage 1. Compare and shuffle
	{
		// cmp = (regA > regB) ? true : false
		register hkQuadReal cmp = _mm_cmpgt_ps(regValA, regValB);

		// regValuesMin = (regA > regB) ? regB : regA = [L0, L1, L2, L3]
		// regValuesMax = (regA > regB) ? regA : regB = [H0, H1, H2, H3]
		register hkQuadReal regMin = _mm_or_ps(_mm_and_ps(cmp, regValB), _mm_andnot_ps(cmp, regValA));
		register hkQuadReal regMax = _mm_or_ps(_mm_and_ps(cmp, regValA), _mm_andnot_ps(cmp, regValB));

		// regA = [L0, L1, H0, H1]
		// regB = [L2, L3, H2, H3]
		regValA = _mm_movehl_ps(regMin, regMax);
		regValB = _mm_movelh_ps(regMax, regMin);

		// Do the same for indices
		regMin	= _mm_or_ps(_mm_and_ps(cmp, regIdxB), _mm_andnot_ps(cmp, regIdxA));
		regMax	= _mm_or_ps(_mm_and_ps(cmp, regIdxA), _mm_andnot_ps(cmp, regIdxB));
		regIdxA	= _mm_movehl_ps(regMin, regMax);
		regIdxB	= _mm_movelh_ps(regMax, regMin);
	}

	// Stage 2. Compare and shuffle
	{
		// cmp = (regA > regB) ? true : false
		register hkQuadReal cmp = _mm_cmpgt_ps(regValA, regValB);

		// regValuesMin = (regA > regB) ? regB : regA = [L0, L1, L2, L3]
		// regValuesMax = (regA > regB) ? regA : regB = [H0, H1, H2, H3]
		register hkQuadReal regMin = _mm_or_ps(_mm_and_ps(cmp, regValB), _mm_andnot_ps(cmp, regValA));
		register hkQuadReal regMax = _mm_or_ps(_mm_and_ps(cmp, regValA), _mm_andnot_ps(cmp, regValB));

		// tmpL = [L0, H0, L1, H1]
		// tmpH = [L2, H2, L3, H3]
		// regA = [L0, H0, L2, H2]
		// regB = [L1, H1, L3, H3]
		register hkQuadReal tmpL = _mm_unpackhi_ps(regMax, regMin);
		register hkQuadReal tmpH = _mm_unpacklo_ps(regMax, regMin);
		regValA = _mm_movehl_ps(tmpL, tmpH);
		regValB = _mm_movelh_ps(tmpH, tmpL);

		// Do the same for indices
		regMin	= _mm_or_ps(_mm_and_ps(cmp, regIdxB), _mm_andnot_ps(cmp, regIdxA));
		regMax	= _mm_or_ps(_mm_and_ps(cmp, regIdxA), _mm_andnot_ps(cmp, regIdxB));
		tmpL	= _mm_unpackhi_ps(regMax, regMin);
		tmpH	= _mm_unpacklo_ps(regMax, regMin);
		regIdxA	= _mm_movehl_ps(tmpL, tmpH);
		regIdxB	= _mm_movelh_ps(tmpH, tmpL);
	}

	// Stage 3. Compare and shuffle
	{
		// cmp = (regA > regB) ? true : false
		register hkQuadReal cmp = _mm_cmpgt_ps(regValA, regValB);

		// regValuesMin = (regA > regB) ? regB : regA = [L0, L1, L2, L3]
		// regValuesMax = (regA > regB) ? regA : regB = [H0, H1, H2, H3]
		register hkQuadReal regMin = _mm_or_ps(_mm_and_ps(cmp, regValB), _mm_andnot_ps(cmp, regValA));
		register hkQuadReal regMax = _mm_or_ps(_mm_and_ps(cmp, regValA), _mm_andnot_ps(cmp, regValB));

		// De-interleave
		// vValuesA = [L0, H0, L1, H1]
		// vValuesB = [L2, H2, L3, H3]
		//		vValuesA.getQuad() = _mm_unpackhi_ps(regMax, regMin);
		//		vValuesB.getQuad() = _mm_unpacklo_ps(regMax, regMin);
		vValuesA.getQuad() = _mm_unpacklo_ps(regMax, regMin);

		// Do the same for indices
		regMin = _mm_or_ps(_mm_and_ps(cmp, regIdxB), _mm_andnot_ps(cmp, regIdxA));
		regMax = _mm_or_ps(_mm_and_ps(cmp, regIdxA), _mm_andnot_ps(cmp, regIdxB));
		// 		vIndicesA.getQuad() = _mm_unpackhi_ps(regMax, regMin);
		// 		vIndicesB.getQuad() = _mm_unpacklo_ps(regMax, regMin);
		vIndicesA.getQuad() = _mm_unpacklo_ps(regMax, regMin);
	}
}

//
//	Returns the mask in a platform independent way, i.e. MASK_X in bit 0, MASK_Y in bit 1, MASK_Z in bit 2 and MASK_W in bit 3

#define _getPlatformIndependentMask_
HK_FORCE_INLINE int getPlatformIndependentMask(const hkVector4Comparison& cmp)
{
	return cmp.getMask();
}

//
//	Computes the cross products: (vA, vB), (vB, vC), (vC, vA)

#define _computeCyclicCrossProducts_
HK_FORCE_INLINE void computeCyclicCrossProducts(const hkVector4& vA, const hkVector4& vB, const hkVector4& vC, 
												hkVector4& vAB, hkVector4& vBC, hkVector4& vCA)
{
	// Initially, we have:
	//	- a = vTetra1 = [ax, ay, az, *]
	//	- b = vTetra2 = [bx, by, bz, *]
	//	- c = vTetra3 = [cx, cy, cz, *]

	// a_yzx = [ay, az, ax, *]
	// b_zxy = [bz, bx, by, *]
	register hkQuadReal a = vA.getQuad();
	register hkQuadReal a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
	register hkQuadReal b = vB.getQuad();
	register hkQuadReal b_zxy = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2));

	// crossX = [ay * bz, az * bx, ax * by, *]
	register hkQuadReal crossX = _mm_mul_ps(a_yzx, b_zxy);

	// c_yzx = [cy, cz, cx, *]
	// c_zxy = [cz, cx, cy, *]
	register hkQuadReal c = vC.getQuad();
	register hkQuadReal c_yzx = _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
	register hkQuadReal c_zxy = _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 1, 0, 2));

	// crossY = [bz * cy, bx * cz, by * cx, *]
	// crossZ = [cz * ay, cx * az, cy * ax, *]
	register hkQuadReal crossY = _mm_mul_ps(b_zxy, c_yzx);
	register hkQuadReal crossZ = _mm_mul_ps(c_zxy, a_yzx);

	// a_zxy = [az, ax, ay, *]
	// b_yzx = [by, bz, bx, *]
	register hkQuadReal a_zxy = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2));
	register hkQuadReal b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));

	// vAB = [(ay * bz) - (az * by), (az * bx) - (ax * bz), (ax * by) - (ay * bx), *]
	// vBC = [(by * cz) - (bz * cy), (bz * cx) - (bx * cz), (bx * cy) - (by * cx), *]
	// vCA = [(cy * az) - (cz * ay), (cz * ax) - (cx * az), (cx * ay) - (cy * ax), *]
	vAB.getQuad() = _mm_sub_ps(crossX, _mm_mul_ps(a_zxy, b_yzx));
	vBC.getQuad() = _mm_sub_ps(_mm_mul_ps(b_yzx, c_zxy), crossY);
	vCA.getQuad() = _mm_sub_ps(_mm_mul_ps(c_yzx, a_zxy), crossZ);
}

//
//	Sets this vector components: this(i) = vector.dot3(ai) for i=0..3

#define _dot3_1vs3_
HK_FORCE_INLINE void dot3_1vs3( const hkVector4& vIn, const hkVector4& vA, const hkVector4& vB, const hkVector4& vC, hkVector4& vDotsOut)
{
	// Compute the dot products
	// dotX = [vIn.x * vA.x, vIn.y * vA.y, vIn.z * vA.z, *]
	// dotY = [vIn.x * vB.x, vIn.y * vB.y, vIn.z * vB.z, *]
	// dotZ = [vIn.x * vC.x, vIn.x * vC.x, vIn.z * vC.z, *]
	register hkQuadReal v = vIn.getQuad();
	register hkQuadReal dotX = _mm_mul_ps(v, vA.getQuad());
	register hkQuadReal dotY = _mm_mul_ps(v, vB.getQuad());
	register hkQuadReal dotZ = _mm_mul_ps(v, vC.getQuad());

	// Transpose for summation
	// dotX = [vIn.x * vA.x, vIn.x * vB.x, vIn.x * vC.x, *]
	// dotY = [vIn.y * vA.y, vIn.y * vB.y, vIn.x * vC.x, *]
	// dotZ = [vIn.z * vA.z, vIn.z * vB.z, vIn.z * vC.z, *]
	HK_TRANSPOSE3(dotX, dotY, dotZ);

	// Sum together
	// vDotsOut = [dot(vIn, vA), dot(vIn, vB), dot(vIn, vC), *]
	vDotsOut.getQuad() = _mm_add_ps(dotX, _mm_add_ps(dotY, dotZ));
}

//
//	Computes the cross products: cross(vCommon, v1),  cross(vCommon, v2),  cross(vCommon, v3)

#define _cross_1vs3_
HK_FORCE_INLINE void cross_1vs3(const hkVector4& vCommon,
								const hkVector4& v1, const hkVector4& v2, const hkVector4& v3,
								hkVector4& vCross1, hkVector4& vCross2, hkVector4& vCross3)
{
	// v1		= [ax, ay, az, *]
	// v2		= [bx, by, bz, *]
	// v3		= [cx, cy, cz, *]
	// vCommon	= [dx, dy, dz, *]

	{
		// Shuffle d. d_yzx = [dy, dz, dx, *]
		register hkQuadReal tmp = vCommon.getQuad();
		register hkQuadReal d_yzx = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 0, 2, 1));

		// vCross1 = a_zxy * d_yzx
		tmp = v1.getQuad();
		vCross1.getQuad() = _mm_mul_ps(d_yzx, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 1, 0, 2)));

		// vCross2 = b_zxy * d_yzx
		tmp = v2.getQuad();
		vCross2.getQuad() = _mm_mul_ps(d_yzx, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 1, 0, 2)));

		// crossZ = c_zxy * d_yzx
		tmp = v3.getQuad();
		vCross3.getQuad() = _mm_mul_ps(d_yzx, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 1, 0, 2)));
	}

	{
		// Shuffle d. d_zxy = [dz, dx, dy, *]
		register hkQuadReal tmp = vCommon.getQuad();
		register hkQuadReal d_zxy = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 1, 0, 2));

		// vCross1 = a_zxy * d_yzx - a_yzx * d_zxy
		tmp = v1.getQuad();
		vCross1.getQuad() = _mm_sub_ps(vCross1.getQuad(), _mm_mul_ps(d_zxy, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 0, 2, 1))));

		// vCross2 = b_zxy * d_yzx - b_yzx * d_zxy
		tmp = v2.getQuad();
		vCross2.getQuad() = _mm_sub_ps(vCross2.getQuad(), _mm_mul_ps(d_zxy, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 0, 2, 1))));

		// crossZ = c_zxy * d_yzx - c_yzx * d_zxy
		tmp = v3.getQuad();
		vCross3.getQuad() = _mm_sub_ps(vCross3.getQuad(), _mm_mul_ps(d_zxy, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 0, 2, 1))));
	}
}

#elif defined(HK_PLATFORM_PPU) || defined(HK_PLATFORM_SPU)

//
//	Computes the cross products: (vA, vB), (vB, vC), (vC, vA)

#define _computeCyclicCrossProducts_
HK_FORCE_INLINE void computeCyclicCrossProducts(const hkVector4& vA, const hkVector4& vB, const hkVector4& vC, 
												hkVector4& vAB, hkVector4& vBC, hkVector4& vCA)
{
	// Initially, we have:
	//	- a = vTetra1 = [ax, ay, az, *]
	//	- b = vTetra2 = [bx, by, bz, *]
	//	- c = vTetra3 = [cx, cy, cz, *]
	const vector unsigned char perm_1203 = (vector unsigned char){4,5,6,7, 8,9,10,11, 0,1,2,3, 12,13,14,15};	// ( 1, 2, 0, 3 )
	const vector unsigned char perm_2013 = (vector unsigned char){8,9,10,11, 0,1,2,3, 4,5,6,7, 12,13,14,15};	// ( 2, 0, 1, 3 )

	// a_yzx = [ay, az, ax, *]
	// a_zxy = [az, ax, ay, *]
	register hkQuadReal a = vA.getQuad();
	register hkQuadReal a_yzx = vec_perm(a, a, perm_1203);
	register hkQuadReal a_zxy = vec_perm(a, a, perm_2013);

	// b_zxy = [bz, bx, by, *]
	// b_yzx = [by, bz, bx, *]
	register hkQuadReal b = vB.getQuad();
	register hkQuadReal b_yzx = vec_perm(b, b, perm_1203);
	register hkQuadReal b_zxy = vec_perm(b, b, perm_2013);

	// c_yzx = [cy, cz, cx, *]
	// c_zxy = [cz, cx, cy, *]
	register hkQuadReal c = vC.getQuad();
	register hkQuadReal c_yzx = vec_perm(c, c, perm_1203);
	register hkQuadReal c_zxy = vec_perm(c, c, perm_2013);

	// vAB = [(ay * bz) - (az * by), (az * bx) - (ax * bz), (ax * by) - (ay * bx), *]
	// vBC = [(by * cz) - (bz * cy), (bz * cx) - (bx * cz), (bx * cy) - (by * cx), *]
	// vCA = [(cy * az) - (cz * ay), (cz * ax) - (cx * az), (cx * ay) - (cy * ax), *]
	vAB.getQuad() = vec_nmsub(a_zxy, b_yzx, vec_mul(a_yzx, b_zxy));
	vBC.getQuad() = vec_nmsub(b_zxy, c_yzx, vec_mul(b_yzx, c_zxy));
	vCA.getQuad() = vec_nmsub(c_zxy, a_yzx, vec_mul(c_yzx, a_zxy));
}

//
//	Sets this vector components: this(i) = vector.dot3(ai) for i=0..3

#define _dot3_1vs3_
HK_FORCE_INLINE void dot3_1vs3( const hkVector4& vIn, const hkVector4& vA, const hkVector4& vB, const hkVector4& vC, hkVector4& vDotsOut)
{
	// Compute the dot products
	// dotX = [vIn.x * vA.x, vIn.y * vA.y, vIn.z * vA.z, *]
	// dotY = [vIn.x * vB.x, vIn.y * vB.y, vIn.z * vB.z, *]
	// dotZ = [vIn.x * vC.x, vIn.x * vC.x, vIn.z * vC.z, *]
	register hkQuadReal v = vIn.getQuad();
	register hkQuadReal dotX = vec_mul(v, vA.getQuad());
	register hkQuadReal dotY = vec_mul(v, vB.getQuad());
	register hkQuadReal dotZ = vec_mul(v, vC.getQuad());

	// Transpose for summation
	// dotX = [vIn.x * vA.x, vIn.x * vB.x, vIn.x * vC.x, *]
	// dotY = [vIn.y * vA.y, vIn.y * vB.y, vIn.x * vC.x, *]
	// dotZ = [vIn.z * vA.z, vIn.z * vB.z, vIn.z * vC.z, *]
	HK_TRANSPOSE3_QUAD(dotX, dotY, dotZ);

	// Sum together
	// vDotsOut = [dot(vIn, vA), dot(vIn, vB), dot(vIn, vC), *]
	vDotsOut.getQuad() = vec_add(dotX, vec_add(dotY, dotZ));
}

//
//	Computes the cross products: cross(vCommon, v1),  cross(vCommon, v2),  cross(vCommon, v3)

#define _cross_1vs3_
HK_FORCE_INLINE void cross_1vs3(const hkVector4& vCommon,
								const hkVector4& v1, const hkVector4& v2, const hkVector4& v3,
								hkVector4& vCross1, hkVector4& vCross2, hkVector4& vCross3)
{
	// v1		= [ax, ay, az, *]
	// v2		= [bx, by, bz, *]
	// v3		= [cx, cy, cz, *]
	// vCommon	= [dx, dy, dz, *]
	const vector unsigned char perm_1203 = (vector unsigned char){4,5,6,7, 8,9,10,11, 0,1,2,3, 12,13,14,15};	// ( 1, 2, 0, 3 )
	const vector unsigned char perm_2013 = (vector unsigned char){8,9,10,11, 0,1,2,3, 4,5,6,7, 12,13,14,15};	// ( 2, 0, 1, 3 )

	{
		// Shuffle d. d_yzx = [dy, dz, dx, *]
		register hkQuadReal d_yzx = vec_perm(vCommon.getQuad(), vCommon.getQuad(), perm_1203);

		// vCross1 = a_zxy * d_yzx
		vCross1.getQuad() = vec_mul(d_yzx, vec_perm(v1.getQuad(), v1.getQuad(), perm_2013));

		// vCross2 = b_zxy * d_yzx
		vCross2.getQuad() = vec_mul(d_yzx, vec_perm(v2.getQuad(), v2.getQuad(), perm_2013));

		// crossZ = c_zxy * d_yzx
		vCross3.getQuad() = vec_mul(d_yzx, vec_perm(v3.getQuad(), v3.getQuad(), perm_2013));
	}

	{
		// Shuffle d. d_zxy = [dz, dx, dy, *]
		register hkQuadReal d_zxy = vec_perm(vCommon.getQuad(), vCommon.getQuad(), perm_2013);

		// vCross1 = a_zxy * d_yzx - a_yzx * d_zxy
		vCross1.getQuad() = vec_nmsub(d_zxy, vec_perm(v1.getQuad(), v1.getQuad(), perm_1203), vCross1.getQuad());

		// vCross2 = b_zxy * d_yzx - b_yzx * d_zxy
		vCross2.getQuad() = vec_nmsub(d_zxy, vec_perm(v2.getQuad(), v2.getQuad(), perm_1203), vCross2.getQuad());

		// crossZ = c_zxy * d_yzx - c_yzx * d_zxy
		vCross3.getQuad() = vec_nmsub(d_zxy, vec_perm(v3.getQuad(), v3.getQuad(), perm_1203), vCross3.getQuad());
	}
}

//
//	Returns the mask in a platform independent way, i.e. MASK_X in bit 0, MASK_Y in bit 1, MASK_Z in bit 2 and MASK_W in bit 3

#define _getPlatformIndependentMask_
HK_FORCE_INLINE int getPlatformIndependentMask(const hkVector4Comparison& cmp)
{
	vec_uint4 mask = *(vec_uint4*)const_cast<hkVector4Comparison*>(&cmp);

	{
		vec_uint4 spread = vec_and( mask, (vec_uint4){1, 2, 4, 8} );
		union { vector signed int vi; int i[4]; } u;
		u.vi = vec_sums( (vector signed int)spread, (vector signed int)(0));
		return u.i[3];
	}
}

#elif defined(HK_PLATFORM_XBOX360)
//
//	Computes the cross products: (vA, vB), (vB, vC), (vC, vA)

#define _computeCyclicCrossProducts_
HK_FORCE_INLINE void computeCyclicCrossProducts(const hkVector4& vA, const hkVector4& vB, const hkVector4& vC, 
												hkVector4& vAB, hkVector4& vBC, hkVector4& vCA)
{
	// Initially, we have:
	//	- a = vTetra1 = [ax, ay, az, *]
	//	- b = vTetra2 = [bx, by, bz, *]
	//	- c = vTetra3 = [cx, cy, cz, *]

	const hkUint32 perm_1203 = 0x63;	// ( 1, 2, 0, 3 )
	const hkUint32 perm_2013 = 0x87;	// ( 2, 0, 1, 3 )

	// a_yzx = [ay, az, ax, *]
	// b_zxy = [bz, bx, by, *]
	register hkQuadReal a_yzx = __vpermwi(vA.getQuad(), perm_1203);
	register hkQuadReal b_zxy = __vpermwi(vB.getQuad(), perm_2013);

	// crossX = [ay * bz, az * bx, ax * by, *]
	register hkQuadReal crossX = __vmulfp(a_yzx, b_zxy);

	// c_yzx = [cy, cz, cx, *]
	// c_zxy = [cz, cx, cy, *]
	register hkQuadReal c_yzx = __vpermwi(vC.getQuad(), perm_1203);
	register hkQuadReal c_zxy = __vpermwi(vC.getQuad(), perm_2013);

	// crossY = [bz * cy, bx * cz, by * cx, *]
	// crossZ = [cz * ay, cx * az, cy * ax, *]
	register hkQuadReal crossY = __vmulfp(b_zxy, c_yzx);
	register hkQuadReal crossZ = __vmulfp(c_zxy, a_yzx);

	// a_zxy = [az, ax, ay, *]
	// b_yzx = [by, bz, bx, *]
	register hkQuadReal a_zxy = __vpermwi(vA.getQuad(), perm_2013);
	register hkQuadReal b_yzx = __vpermwi(vB.getQuad(), perm_1203);

	// vAB = [(ay * bz) - (az * by), (az * bx) - (ax * bz), (ax * by) - (ay * bx), *]
	// vBC = [(by * cz) - (bz * cy), (bz * cx) - (bx * cz), (bx * cy) - (by * cx), *]
	// vCA = [(cy * az) - (cz * ay), (cz * ax) - (cx * az), (cx * ay) - (cy * ax), *]
	vAB.getQuad() = __vsubfp(crossX, __vmulfp(a_zxy, b_yzx));
	vBC.getQuad() = __vsubfp(__vmulfp(b_yzx, c_zxy), crossY);
	vCA.getQuad() = __vsubfp(__vmulfp(c_yzx, a_zxy), crossZ);
}

//
//	Sets this vector components: this(i) = vector.dot3(ai) for i=0..3

#define _dot3_1vs3_
HK_FORCE_INLINE void dot3_1vs3(const hkVector4& vIn, const hkVector4& vA, const hkVector4& vB, const hkVector4& vC, hkVector4& vDotsOut)
{
	// Compute the dot products
	// dotX = [vIn.x * vA.x, vIn.y * vA.y, vIn.z * vA.z, *]
	// dotY = [vIn.x * vB.x, vIn.y * vB.y, vIn.z * vB.z, *]
	// dotZ = [vIn.x * vC.x, vIn.x * vC.x, vIn.z * vC.z, *]

	register hkQuadReal v = vIn.getQuad();
	vDotsOut.getQuad() = __vrlimi(__vmsum3fp(v, vA.getQuad()), __vmsum3fp(v, vB.getQuad()), 4, 0);
	vDotsOut.getQuad() = __vrlimi(vDotsOut.getQuad(), __vmsum3fp(v, vC.getQuad()), 2, 0);
}

//
//	Computes the cross products: cross(vCommon, v1),  cross(vCommon, v2),  cross(vCommon, v3)

#define _cross_1vs3_
HK_FORCE_INLINE void cross_1vs3(const hkVector4& vCommon,
								const hkVector4& v1, const hkVector4& v2, const hkVector4& v3,
								hkVector4& vCross1, hkVector4& vCross2, hkVector4& vCross3)
{
	// v1		= [ax, ay, az, *]
	// v2		= [bx, by, bz, *]
	// v3		= [cx, cy, cz, *]
	// vCommon	= [dx, dy, dz, *]

	const hkUint32 perm_1203 = 0x63;	// ( 1, 2, 0, 3 )
	const hkUint32 perm_2013 = 0x87;	// ( 2, 0, 1, 3 )

	{
		// Shuffle d. d_yzx = [dy, dz, dx, *]
		register hkQuadReal d_yzx = __vpermwi(vCommon.getQuad(), perm_1203);

		// vCross1 = a_zxy * d_yzx
		vCross1.getQuad() = __vmulfp(d_yzx, __vpermwi(v1.getQuad(), perm_2013));

		// vCross2 = b_zxy * d_yzx
		vCross2.getQuad() = __vmulfp(d_yzx, __vpermwi(v2.getQuad(), perm_2013));

		// crossZ = c_zxy * d_yzx
		vCross3.getQuad() = __vmulfp(d_yzx, __vpermwi(v3.getQuad(), perm_2013));
	}

	{
		// Shuffle d. d_zxy = [dz, dx, dy, *]
		register hkQuadReal d_zxy = __vpermwi(vCommon.getQuad(), perm_2013);

		// vCross1 = a_zxy * d_yzx - a_yzx * d_zxy
		vCross1.getQuad() = __vsubfp(vCross1.getQuad(), __vmulfp(d_zxy, __vpermwi(v1.getQuad(), perm_1203)));

		// vCross2 = b_zxy * d_yzx - b_yzx * d_zxy
		vCross2.getQuad() = __vsubfp(vCross2.getQuad(), __vmulfp(d_zxy, __vpermwi(v2.getQuad(), perm_1203)));

		// crossZ = c_zxy * d_yzx - c_yzx * d_zxy
		vCross3.getQuad() = __vsubfp(vCross3.getQuad(), __vmulfp(d_zxy, __vpermwi(v3.getQuad(), perm_1203)));
	}
}

//
//	Returns the mask in a platform independent way, i.e. MASK_X in bit 0, MASK_Y in bit 1, MASK_Z in bit 2 and MASK_W in bit 3

// #define _getPlatformIndependentMask_
// HK_FORCE_INLINE int getPlatformIndependentMask(const hkVector4Comparison& cmp)
// {
// 	hkQuadReal mask = *(hkQuadReal*)const_cast<hkVector4Comparison*>(&cmp);
// 	hkQuadReal result = __vand( mask, g_vectorConstants[ HK_QUADREAL_1] );
// 	result = __vmsum4fp( result, g_vectorConstants[ HK_QUADREAL_1248] );
// 	result = __vctsxs( result, 0 );
// 	return (int)result.u[0];
// }

#endif	// HK_PLATFORM_WIN32 && SIMD

//
//	Sorts (ascending) the components of a vector

#if !defined(_sortElements_)

HK_FORCE_INLINE void sortElements(hkVector4& vValues, hkVector4& vIndices)
{
	hkBool found;
	do
	{
		found = false;
		for (int i = 0; i < 4 - 1; i++)
		{
			if ( vValues(i) > vValues(i + 1) )
			{
				hkAlgorithm::swap(vValues(i), vValues(i + 1));
				hkAlgorithm::swap(vIndices(i), vIndices(i + 1));
				found = true;
			}
		}
	}while(found);
}

#endif

//
//	vValuesA must be sorted descending, vValuesB must be sorted ascending. The values in the registers are actually sorted in reverse
//	This will output in vectors A the maximum 4 values of the 2 vectors

#if !defined(_bitonicMerge_)

HK_FORCE_INLINE void bitonicMerge(hkVector4& vValuesA, hkVector4& vIndicesA, hkVector4& vValuesB, hkVector4& vIndicesB)
{
	for (int i = 0; i < 4; i++)
	{
		const hkReal valB = vValuesB(i);
		const hkReal idxB = vIndicesB(i);

		if ( valB >= vValuesA(0) )
		{
			vValuesA(3) = vValuesA(2);		vIndicesA(3) = vIndicesA(2);
			vValuesA(2) = vValuesA(1);		vIndicesA(2) = vIndicesA(1);
			vValuesA(1) = vValuesA(0);		vIndicesA(1) = vIndicesA(0);
			vValuesA(0)	= valB;				vIndicesA(0) = idxB;
		}
		else if ( valB >= vValuesA(1) )
		{
			vValuesA(3) = vValuesA(2);		vIndicesA(3) = vIndicesA(2);
			vValuesA(2) = vValuesA(1);		vIndicesA(2) = vIndicesA(1);
			vValuesA(1)	= valB;				vIndicesA(1) = idxB;
		}
		else if ( valB >= vValuesA(2) )
		{
			vValuesA(3) = vValuesA(2);		vIndicesA(3) = vIndicesA(2);
			vValuesA(2)	= valB;				vIndicesA(2) = idxB;
		}
		else if ( valB >= vValuesA(3) )
		{
			vValuesA(3)	= valB;				vIndicesA(3) = idxB;
		}
	}
}

#endif

#if !defined(_getPlatformIndependentMask_)

HK_FORCE_INLINE int getPlatformIndependentMask(const hkVector4Comparison& cmp)
{
	const int intMask = cmp.getMask();
	return	((int)((intMask & hkVector4Comparison::MASK_X) != 0) << 0) |
		((int)((intMask & hkVector4Comparison::MASK_Y) != 0) << 1) |
		((int)((intMask & hkVector4Comparison::MASK_Z) != 0) << 2) |
		((int)((intMask & hkVector4Comparison::MASK_W) != 0) << 3);
}

#endif

//
//	Computes the cross products: (vA, vB), (vB, vC), (vC, vA)

#if !defined(_computeCyclicCrossProducts_)

HK_FORCE_INLINE void computeCyclicCrossProducts(const hkVector4& vA, const hkVector4& vB, const hkVector4& vC, 
												hkVector4& vAB, hkVector4& vBC, hkVector4& vCA)
{
	vAB.setCross(vA, vB);
	vBC.setCross(vB, vC);
	vCA.setCross(vC, vA);
}

#endif

//
//	Sets this vector components: this(i) = vector.dot3(ai) for i=0..3

#if !defined(_dot3_1vs3_)

HK_FORCE_INLINE void dot3_1vs3(const hkVector4& vIn, const hkVector4& vA, const hkVector4& vB, const hkVector4& vC, hkVector4& vDotsOut)
{
	vDotsOut(0) = vIn.dot3(vA).getReal();
	vDotsOut(1) = vIn.dot3(vB).getReal();
	vDotsOut(2) = vIn.dot3(vC).getReal();
}

#endif

//
//	Computes the cross products: cross(vCommon, v1),  cross(vCommon, v2),  cross(vCommon, v3)

#if !defined(_cross_1vs3_)

HK_FORCE_INLINE void cross_1vs3(const hkVector4& vCommon,
								const hkVector4& v1, const hkVector4& v2, const hkVector4& v3,
								hkVector4& vCross1, hkVector4& vCross2, hkVector4& vCross3)
{
	vCross1.setCross(vCommon, v1);
	vCross2.setCross(vCommon, v2);
	vCross3.setCross(vCommon, v3);
}

#endif

//
//	END!
//

/*
* Havok SDK - NO SOURCE PC DOWNLOAD, BUILD(#20101115)
* 
* Confidential Information of Havok.  (C) Copyright 1999-2010
* Telekinesys Research Limited t/a Havok. All Rights Reserved. The Havok
* Logo, and the Havok buzzsaw logo are trademarks of Havok.  Title, ownership
* rights, and intellectual property rights in the Havok software remain in
* Havok and/or its suppliers.
* 
* Use of this software for evaluation purposes is subject to and indicates
* acceptance of the End User licence Agreement for this product. A copy of
* the license is included with this software and is also available at www.havok.com/tryhavok.
* 
*/
