// TKBMS v1.0 -----------------------------------------------------
//
// PLATFORM     : ALL
// PRODUCT      : COMMON
// VISIBILITY   : PUBLIC
//
// ------------------------------------------------------TKBMS v1.0


// Debug-only sanity check for vertex arrays that are expected to be padded to a multiple of 4.
// Every slot in [numVertices, next multiple of 4) must compare equal to either the first or the
// last real vertex; otherwise the assert fires. Compiles to an empty function unless HK_DEBUG is
// defined.
HK_INLINE void HK_CALL hkcdCheckForPaddedVertices( const hkcdVertex* vertices, int numVertices )
{
#if defined(HK_DEBUG)
    const int paddedCount = HK_NEXT_MULTIPLE_OF(4,numVertices);
    int slot = numVertices;
    while( slot < paddedCount )
    {
        // The reference vertices are fetched inside the loop on purpose: when there is no padding
        // (including numVertices == 0) nothing is read at all.
        const hkVector4 padding = vertices[slot];
        const hkVector4 head = vertices[0];
        const hkVector4 tail = vertices[numVertices-1];
        HK_ASSERT( 0x588428b2, padding.equal(tail).allAreSet() || padding.equal(head).allAreSet(),
            "last or first vertex must be duplicated to a multiple of 4, make sure to do so" );
        ++slot;
    }
#endif
}


// Scalar reference implementation of the supporting-vertex query: writes to vertexOut the vertex
// whose 3-component dot product with 'direction' is largest. The comparison is strict, so the
// earliest vertex wins on exact ties. Requires numVertices > 0 (asserted).
HK_INLINE void HK_CALL hkcdSupportingVertexPointsSimple(
    const hkcdVertex* HK_RESTRICT vertices, int numVertices, hkVector4Parameter direction,
    hkcdVertex* HK_RESTRICT vertexOut )
{
    HK_ASSERT_NO_MSG( 0x4c5c7d58, numVertices > 0 );

    // The vertices are viewed as plain hkVector4 for the dot products.
    const hkVector4* HK_RESTRICT points = (const hkVector4* HK_RESTRICT)vertices;

    // Seed the search with vertex 0, then scan the remaining candidates.
    int winner = 0;
    hkSimdReal winnerDot = direction.dot<3>(points[0]);

    int candidate = 1;
    while (candidate < numVertices)
    {
        const hkSimdReal candidateDot = direction.dot<3>(points[candidate]);
        if (candidateDot.isGreater(winnerDot))
        {
            winnerDot = candidateDot;
            winner = candidate;
        }
        ++candidate;
    }

    vertexOut->assign(points[winner]);
}


// Supporting-vertex query over vertices stored in transposed batches of four
// (m_vertices[0], [1], [2] are read as the x, y and z rows of a batch).
// Four dot products are evaluated per batch; a per-lane running maximum and the matching vertex
// ids are maintained and reduced to a single winner at the end. The winning position is written
// to components 0..2 of vertexOut and the vertex id is passed to setInt24W().
HK_INLINE void HK_CALL hkcdSupportingVertexPointsTranposed(
    const hkFourTransposedPoints* HK_RESTRICT transposedVertices, int numVertexBatches, hkVector4Parameter direction,
    hkcdVertex* HK_RESTRICT vertexOut )
{
    // At least one batch is required: the running maximum below is seeded from batch 0.
    HK_ASSERT_NO_MSG( 0x4c5c7d57, numVertexBatches > 0 );

    // Broadcast each direction component into a full vector. Vectors are used rather than
    // hkSimdReal because SIMD reals are less performant on ARM and register pressure is not an
    // issue here.
    hkVector4 dirX; dirX.setBroadcast<0>(direction);
    hkVector4 dirY; dirY.setBroadcast<1>(direction);
    hkVector4 dirZ; dirZ.setBroadcast<2>(direction);

    // Seed the per-lane maximum with the dot products of the first batch.
    hkVector4 maxDots;
    {
        const hkFourTransposedPoints& seed = transposedVertices[0];
        hkVector4 px; px.setMul( dirX, seed.m_vertices[0] );
        hkVector4 py; py.setMul( dirY, seed.m_vertices[1] );
        maxDots.setAdd( px, py );
        hkVector4 pz; pz.setMul( dirZ, seed.m_vertices[2] );
        maxDots.add( pz );
    }

    // candidateIds holds the vertex ids of the batch being tested; maxIds the ids of the current
    // per-lane winners.
    hkIntVector candidateIds = hkIntVector::getConstant<HK_QUADINT_0123>();
    hkIntVector maxIds = candidateIds;

    for ( int batch = 1; batch < numVertexBatches; ++batch )
    {
        const hkFourTransposedPoints& points = transposedVertices[batch];

        // Advance the ids to this batch: (4b, 4b+1, 4b+2, 4b+3).
        candidateIds.setAddS32( candidateIds, hkIntVector::getConstant<HK_QUADINT_4>() );

        // Four dot products at once: x*dx + y*dy, then + z*dz.
        hkVector4 dots;
        {
            hkVector4 px; px.setMul( dirX, points.m_vertices[0] );
            hkVector4 py; py.setMul( dirY, points.m_vertices[1] );
            dots.setAdd( px, py );
            hkVector4 pz; pz.setMul( dirZ, points.m_vertices[2] );
            dots.add( pz );
        }

        // Strict comparison: a lane is only replaced by a strictly larger dot product.
        const hkVector4Comparison improved = dots.greater( maxDots );
        maxDots.setSelect( improved, dots, maxDots );
        maxIds.setSelect( improved, candidateIds, maxIds );
    }

    // Reduce the four lane winners to one id, breaking ties to lower indices.
    const int vertexId = maxIds.getFirstComponentAtVectorMax(maxDots);

    // Un-transpose the winner: batch = id / 4, lane within the batch = id % 4.
    {
        const hkFourTransposedPoints* HK_RESTRICT winningBatch = transposedVertices + (unsigned(vertexId)>>2);
        const int lane = vertexId & 3;
        hkcdVertex& out = *vertexOut;
        out(0) = winningBatch->m_vertices[0](lane);
        out(1) = winningBatch->m_vertices[1](lane);
        out(2) = winningBatch->m_vertices[2](lane);
        out.setInt24W( vertexId );
    }
}


// Supporting-vertex query over a plain (non-transposed) vertex array: writes to vertexOut[0] the
// vertex with the largest 3-component dot product with 'direction'.
//
// Vertices are transposed on the fly and tested four at a time. The array does not need to be
// padded: the first (numVertices & 3) vertices are handled by a dedicated prologue so that the
// remainder is an exact multiple of 4.
//
// NOTE(review): numVertices == 0 takes the 'case 0' branch and reads vertices[0..3]; callers
// presumably guarantee numVertices > 0 - confirm.
HK_INLINE void HK_CALL hkcdSupportingVertexPointsTransposeInplace(
    const hkcdVertex* HK_RESTRICT vertices, int numVertices, hkVector4Parameter direction,
    hkcdVertex* HK_RESTRICT vertexOut )
{
// Disabled debug check, kept for reference:
// #if defined(HK_DEBUG)
//  //HK_ASSERT_NO_MSG( 0xf03dfd45, numVertices >=4);    // needs a minimum of 4 verts
//  if ( numVertices < 4)
//  {
//      hkcdCheckForPaddedVertices( vertices, numVertices );
//  }
// #endif

    // Each direction component broadcast to all four lanes.
    hkVector4 d0; d0.setBroadcast<0>(direction);
    hkVector4 d1; d1.setBroadcast<1>(direction);
    hkVector4 d2; d2.setBroadcast<2>(direction);

    // Per-lane running maximum and the vertex index that produced it.
    hkVector4 bestDot;
    hkIntVector bestIndices;

    // Prologue: consume the leading (numVertices & 3) vertices and seed bestDot/bestIndices.
    // On exit from the switch, 'unPadded' is the index of the first vertex the main loop reads.
    int unPadded = numVertices & 3;
    switch( unPadded )
    {
    case 1:
        {
            // Single leading vertex: every lane is seeded with vertex 0.
            bestDot.setAll( vertices[0].dot3(direction) );
            bestIndices.setZero();
            break;
        }
    case 2:
        {
            // Two leading vertices: lanes 2 and 3 are seeded with -infinity.
            hkSimdReal t0 = vertices[0].dot3(direction);
            hkSimdReal t1 = vertices[1].dot3(direction);
            bestDot.set( t0, t1, hkSimdReal_MinusInf, hkSimdReal_MinusInf );
            bestIndices = hkIntVector::getConstant<HK_QUADINT_0123>();
            break;
        }
    case 3:
        {
            // Two sub-cases: exactly a triangle, which gets dedicated scalar code, or at least 7
            // vertices, which reuses the 'case 0' code below.
            if( numVertices == 3 )
            {
                hkSimdReal t0 = vertices[0].dot3(direction);
                hkSimdReal t1 = vertices[1].dot3(direction);
                hkSimdReal t2 = vertices[2].dot3(direction);

                // NOTE(review): greater() is strict, so on an exact tie the later vertex is
                // selected here - assuming setSelect(mask, a, b) picks 'a' where the mask is set.
                // This differs from the tie rule stated for the vectorised path below.
                hkcdVertex support; support.setSelect( t0.greater(t1), vertices[0], vertices[1] );
                t0.setMax( t0, t1 );
                vertexOut->setSelect( t0.greater(t2), support, vertices[2] );
                return;
            }
            // fall through
            // -1 becomes 3 after the '+= 4' in case 0, so the main loop restarts at vertex 3 and
            // tests it a second time; the remaining count is then a multiple of 4.
            unPadded = -1;      // we want to use vert 3 2 times
        }
    case 0:
        {
            // Seed from the first four vertices, transposed so that t0/t1/t2 hold their x/y/z.
            hkVector4 t0 = vertices[0];
            hkVector4 t1 = vertices[1];
            hkVector4 t2 = vertices[2];
            hkVector4 t3 = vertices[3];
            HK_TRANSPOSE4( t0,t1,t2,t3 );

            bestDot.setMul( d0, t0 );
            bestDot.addMul( d1, t1 );
            hkVector4 z; z.setMul( d2, t2 );
            bestDot.add( z );
            bestIndices = hkIntVector::getConstant<HK_QUADINT_0123>();
            unPadded += 4;
        }
    }

    // Vertex indices of the first group the main loop will test:
    // (unPadded, unPadded+1, unPadded+2, unPadded+3).
    hkIntVector curIndices; curIndices.setAll( unPadded );
    curIndices.setAddS32( curIndices, hkIntVector::getConstant<HK_QUADINT_0123>() );

    // get max dots four at a time
    for ( int i = unPadded; i < numVertices; i+=4 )
    {
        hkVector4 t0 = vertices[i+0];
        hkVector4 t1 = vertices[i+1];
        hkVector4 t2 = vertices[i+2];
        hkVector4 t3 = vertices[i+3];
        HK_TRANSPOSE4( t0,t1,t2,t3);

        // calculate the dot product for four vertices
        hkVector4 curDot;
        curDot.setMul( d0, t0 );
        curDot.addMul( d1, t1 );
        hkVector4 z; z.setMul( d2, t2 );
        curDot.add( z );

        // A lane's index is only replaced when the new dot product is strictly greater.
        const hkVector4Comparison comp = curDot.greater( bestDot );
        bestDot.setMax(curDot, bestDot);
        bestIndices.setSelect(comp, curIndices, bestIndices);
        curIndices.setAddS32( curIndices, hkIntVector::getConstant<HK_QUADINT_4>() );
    }
    // find the best of the 4 we have, break ties to lower indices
    int vertexId = bestIndices.getFirstComponentAtVectorMax(bestDot);
    vertexOut[0] = vertices[vertexId];
}


#if HK_SSE_VERSION >= 0x50

// Transposes, in place, the 8x8 float matrix whose rows are row0..row7.
// Three stages: interleave adjacent row pairs, combine the pairs into groups of four rows within
// each 128-bit half, then exchange 128-bit halves between the upper and lower four rows.
// All temporaries are computed before any row is overwritten.
inline void transpose8_ps( __m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m256 &row4, __m256 &row5, __m256 &row6, __m256 &row7 )
{
    // Stage 1: interleave the elements of each adjacent pair of rows.
    const __m256 pair01Lo = _mm256_unpacklo_ps(row0, row1);
    const __m256 pair01Hi = _mm256_unpackhi_ps(row0, row1);
    const __m256 pair23Lo = _mm256_unpacklo_ps(row2, row3);
    const __m256 pair23Hi = _mm256_unpackhi_ps(row2, row3);
    const __m256 pair45Lo = _mm256_unpacklo_ps(row4, row5);
    const __m256 pair45Hi = _mm256_unpackhi_ps(row4, row5);
    const __m256 pair67Lo = _mm256_unpacklo_ps(row6, row7);
    const __m256 pair67Hi = _mm256_unpackhi_ps(row6, row7);

    // Stage 2: merge the pairs so that each 128-bit half holds one column of four rows.
    const __m256 quad0 = _mm256_shuffle_ps(pair01Lo, pair23Lo, _MM_SHUFFLE(1, 0, 1, 0));
    const __m256 quad1 = _mm256_shuffle_ps(pair01Lo, pair23Lo, _MM_SHUFFLE(3, 2, 3, 2));
    const __m256 quad2 = _mm256_shuffle_ps(pair01Hi, pair23Hi, _MM_SHUFFLE(1, 0, 1, 0));
    const __m256 quad3 = _mm256_shuffle_ps(pair01Hi, pair23Hi, _MM_SHUFFLE(3, 2, 3, 2));
    const __m256 quad4 = _mm256_shuffle_ps(pair45Lo, pair67Lo, _MM_SHUFFLE(1, 0, 1, 0));
    const __m256 quad5 = _mm256_shuffle_ps(pair45Lo, pair67Lo, _MM_SHUFFLE(3, 2, 3, 2));
    const __m256 quad6 = _mm256_shuffle_ps(pair45Hi, pair67Hi, _MM_SHUFFLE(1, 0, 1, 0));
    const __m256 quad7 = _mm256_shuffle_ps(pair45Hi, pair67Hi, _MM_SHUFFLE(3, 2, 3, 2));

    // Stage 3: 0x20 joins the two low halves (output rows 0..3), 0x31 the two high halves
    // (output rows 4..7).
    row0 = _mm256_permute2f128_ps(quad0, quad4, 0x20);
    row1 = _mm256_permute2f128_ps(quad1, quad5, 0x20);
    row2 = _mm256_permute2f128_ps(quad2, quad6, 0x20);
    row3 = _mm256_permute2f128_ps(quad3, quad7, 0x20);
    row4 = _mm256_permute2f128_ps(quad0, quad4, 0x31);
    row5 = _mm256_permute2f128_ps(quad1, quad5, 0x31);
    row6 = _mm256_permute2f128_ps(quad2, quad6, 0x31);
    row7 = _mm256_permute2f128_ps(quad3, quad7, 0x31);
}

// Performs two independent 4x4 float transposes in place, one per 128-bit half of row0..row3.
// After the call, lane k of rowN within a half holds element N of what was row k of that half.
inline void transpose4x2_ps( __m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3 )
{
    // Gather the low (0x44) and high (0xEE) element pairs of rows 0/1 and rows 2/3.
    const __m256 low01  = _mm256_shuffle_ps(row0, row1, 0x44);
    const __m256 high01 = _mm256_shuffle_ps(row0, row1, 0xEE);
    const __m256 low23  = _mm256_shuffle_ps(row2, row3, 0x44);
    const __m256 high23 = _mm256_shuffle_ps(row2, row3, 0xEE);

    // Pick the even (0x88) and odd (0xDD) elements to form the transposed rows.
    row0 = _mm256_shuffle_ps(low01,  low23,  0x88);
    row1 = _mm256_shuffle_ps(low01,  low23,  0xDD);
    row2 = _mm256_shuffle_ps(high01, high23, 0x88);
    row3 = _mm256_shuffle_ps(high01, high23, 0xDD);
}

// Returns the entry of index8 that sits in the lane where v8 is largest.
// The 8 lanes are first folded to 4 (upper half against lower half), then reduced with two
// rotate/compare/select steps. Every comparison is a strict '>', so on equal values the entry
// already held in the lane is kept.
HK_INLINE int getFirstComponentAtVectorMax( const __m256i& index8, const __m256& v8 )
{
    // Split values and indices into their 128-bit halves.
    const __m128 valuesLo = _mm256_extractf128_ps(v8, 0);
    const __m128 valuesHi = _mm256_extractf128_ps(v8, 1);
    const __m128i indicesLo = _mm256_extracti128_si256(index8, 0);
    const __m128i indicesHi = _mm256_extracti128_si256(index8, 1);

    // Fold 8 -> 4: per lane keep the larger value and the index that belongs to it.
    const __m128i hiWins = _mm_castps_si128(_mm_cmpgt_ps(valuesHi, valuesLo));
    const __m128i idx4 = _mm_blendv_epi8( indicesLo, indicesHi, hiWins );
    const __m128 val4 = _mm_max_ps(valuesLo, valuesHi);

    // Rotate by one lane: rotVal = (vy, vz, vw, vx), rotIdx = (iy, iz, iw, ix).
    const __m128 rotVal = _mm_shuffle_ps(val4, val4, _MM_SHUFFLE(0, 3, 2, 1));
    const hkQuadUint rotIdx = _mm_shuffle_epi32(idx4, _MM_SHUFFLE(0, 3, 2, 1));

    // pairIdx = [ (vy > vx) ? iy : ix, *, (vw > vz) ? iw : iz, * ]
    // pairVal = [ max(vx, vy),         *, max(vz, vw),         * ]
    const __m128i rotWins = _mm_castps_si128(_mm_cmpgt_ps(rotVal, val4));
    const hkQuadUint pairIdx = _mm_blendv_epi8(idx4, rotIdx, rotWins);
    const __m128 pairVal = _mm_max_ps(rotVal, val4);

    // Broadcast the (z, w) pair result and compare it against the (x, y) pair result in lane 0.
    const __m128 upperVal = _mm_shuffle_ps(pairVal, pairVal, _MM_SHUFFLE(2, 2, 2, 2));
    const hkQuadUint upperIdx = _mm_shuffle_epi32(pairIdx, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128i upperWins = _mm_castps_si128(_mm_cmpgt_ps(upperVal, pairVal));
    const hkQuadUint winner = _mm_blendv_epi8(pairIdx, upperIdx, upperWins);

    // Only component 0 carries the final answer.
    return _mm_cvtsi128_si32(winner);
}

// Alternative to getFirstComponentAtVectorMax() (not faster).
// Reduces v to its horizontal maximum, then returns the entry of index8 stored in the lowest
// lane of v that compares equal to that maximum.
// If no lane compares equal (only possible when v contains NaNs) the entry of lane 0 is returned,
// so the lookup never leaves index8.
HK_INLINE int getFirstComponentAtVectorMax2( const __m256i& index8, const __m256& v )
{
    // Horizontal max: rotate by 1 and by 2 floats within each 128-bit half, then swap the halves.
    // Afterwards every lane of vmax holds the maximum of all eight inputs.
    __m256 vmax = v;
    vmax = _mm256_max_ps(vmax, _mm256_castsi256_ps( _mm256_alignr_epi8(        _mm256_castps_si256(vmax), _mm256_castps_si256(vmax), 4)));
    vmax = _mm256_max_ps(vmax, _mm256_castsi256_ps( _mm256_alignr_epi8(        _mm256_castps_si256(vmax), _mm256_castps_si256(vmax), 8)));
    vmax = _mm256_max_ps(vmax, _mm256_castsi256_ps( _mm256_permute2x128_si256( _mm256_castps_si256(vmax), _mm256_castps_si256(vmax), 0x01)));

    // One bit per float lane: bit k is set when lane k equals the maximum.
    const unsigned int laneMask = (unsigned int)_mm256_movemask_ps( _mm256_cmp_ps(v, vmax, _CMP_EQ_OQ) );

    // Copy the indices to memory instead of type-punning the const vector.
    int lanes[8];
    _mm256_storeu_si256( reinterpret_cast<__m256i*>(lanes), index8 );

    if ( laneMask == 0 )
    {
        // No lane matched (NaN input). The previous bit trick produced lane 8 here and read
        // past the end of index8.
        return lanes[0];
    }

    // Count trailing zeros: isolate the lowest set bit, then count the bits below it.
    const unsigned int lowestBit = laneMask & (0u - laneMask);
    const int lane = _mm_popcnt_u32( lowestBit - 1u );
    return lanes[lane];
}

// AVX variant of the in-place supporting-vertex query: tests eight vertices per iteration and
// writes to vertexOut[0] the vertex with the largest 3-component dot product with 'direction'.
//
// Preconditions that follow from the code below:
//  - vertices[0..7] are always read, and the loop reads whole groups of eight, so the array must
//    be readable up to the next multiple of 8 and the extra slots must hold valid candidates.
//    NOTE(review): presumably callers pad with duplicated vertices - confirm.
//  - Two consecutive vertices are loaded as one 8-float vector, i.e. a vertex is treated as
//    4 packed floats.
HK_INLINE void HK_CALL hkcdSupportingVertexPointsTransposeInplaceAvx(
    const hkcdVertex* HK_RESTRICT vertices, int numVertices, hkVector4Parameter direction,
    hkcdVertex* HK_RESTRICT vertexOut )
{
    // Each direction component broadcast to all eight lanes.
    const __m256 d0 = _mm256_set1_ps(direction(0));
    const __m256 d1 = _mm256_set1_ps(direction(1));
    const __m256 d2 = _mm256_set1_ps(direction(2));

    // Each load covers vertices (i, i+1). After transpose4x2_ps the low 128-bit half holds
    // vertices i+0, i+2, i+4, i+6 and the high half i+1, i+3, i+5, i+7, so lane k belongs to
    // vertex {0,2,4,6,1,3,5,7}[k]. _mm256_setr_epi32 takes its arguments in lane order
    // (lane 0 first); _mm256_set_epi32 would assign them in reverse and pair every dot product
    // with the wrong vertex index.
    __m256i curIndices = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);

    __m256 bestDot;
    __m256i bestIndices;
    {
        // Unaligned loads: nothing visible here guarantees 32-byte alignment of the array.
        __m256 t0 = _mm256_loadu_ps(reinterpret_cast<const float*>(&vertices[0]));
        __m256 t1 = _mm256_loadu_ps(reinterpret_cast<const float*>(&vertices[2]));
        __m256 t2 = _mm256_loadu_ps(reinterpret_cast<const float*>(&vertices[4]));
        __m256 t3 = _mm256_loadu_ps(reinterpret_cast<const float*>(&vertices[6]));
        transpose4x2_ps(t0, t1, t2, t3);

        // Seed the running maximum with the first eight dot products (t3 holds w and is unused).
        bestDot = _mm256_mul_ps(d0, t0);
        bestDot = _mm256_add_ps(bestDot, _mm256_mul_ps(d1, t1));
        bestDot = _mm256_add_ps(bestDot, _mm256_mul_ps(d2, t2));

        bestIndices = curIndices;
    }

    // get max dots eight at a time
    for (int i = 8; i < numVertices; i += 8)
    {
        // Indices of the group tested in this iteration.
        curIndices = _mm256_add_epi32(curIndices, _mm256_set1_epi32(8) );

        __m256 t0 = _mm256_loadu_ps(reinterpret_cast<const float*>(&vertices[i+0]));
        __m256 t1 = _mm256_loadu_ps(reinterpret_cast<const float*>(&vertices[i+2]));
        __m256 t2 = _mm256_loadu_ps(reinterpret_cast<const float*>(&vertices[i+4]));
        __m256 t3 = _mm256_loadu_ps(reinterpret_cast<const float*>(&vertices[i+6]));
        transpose4x2_ps(t0, t1, t2, t3);

        // calculate the dot product for eight vertices
        __m256 curDot;
        curDot = _mm256_mul_ps(d0, t0);
        curDot = _mm256_add_ps(curDot, _mm256_mul_ps(d1, t1));
        curDot = _mm256_add_ps(curDot, _mm256_mul_ps(d2, t2));

        // Strictly-greater lanes take over both the value and the index. The same mask drives
        // both selects so value and index can never get out of step; the extra max() that used
        // to follow the blend was redundant for ordinary values.
        const __m256 comp = _mm256_cmp_ps( curDot, bestDot, _CMP_GT_OS );
        bestDot = _mm256_blendv_ps(bestDot, curDot, comp);
        bestIndices = _mm256_blendv_epi8(bestIndices, curIndices, _mm256_castps_si256(comp));
    }

    // find the best of the 8 we have; on equal dot products the lowest lane wins, which is not
    // necessarily the lowest vertex index because of the interleaved lane order
    const int vertexId = getFirstComponentAtVectorMax2(bestIndices, bestDot);
    vertexOut[0] = vertices[vertexId];
}

#endif  // HK_SSE_VERSION >= 0x50


// Public entry point of the supporting-vertex query. Selects the implementation at compile time:
// the in-place transposing SIMD version when SIMD is enabled, the scalar loop otherwise.
void HK_CALL hkcdSupportingVertexPoints(
    const hkcdVertex* vertices, int numVertices, hkVector4Parameter direction,
    hkcdVertex* HK_RESTRICT vertexOut )
{
#if HK_CONFIG_SIMD != HK_CONFIG_SIMD_ENABLED
    // Scalar fallback.
    hkcdSupportingVertexPointsSimple( vertices, numVertices, direction, vertexOut );
#else
    // SIMD build: four vertices per step, transposed on the fly.
    hkcdSupportingVertexPointsTransposeInplace( vertices, numVertices, direction, vertexOut );
#endif
}

/*
 * Havok SDK - Base file, BUILD(#20180110)
 * 
 * Confidential Information of Microsoft Corporation.
 * Not for disclosure or distribution without Microsoft's prior written
 * consent.  This software contains code, techniques and know-how which
 * is confidential and proprietary to Microsoft.  Product and Trade Secret
 * source code contains trade secrets of Microsoft.  Havok Software (C)
 * Copyright 1999-2018 Microsoft Corporation.
 * All Rights Reserved. Use of this software is subject to the
 * terms of an end user license agreement.
 * 
 * The Havok Logo, and the Havok buzzsaw logo are trademarks of Microsoft.
 * Title, ownership rights, and intellectual property rights in the Havok
 * software remain in Microsoft and/or its suppliers.
 * 
 * Use of this software for evaluation purposes is subject to and
 * indicates acceptance of the End User licence Agreement for this
 * product. A copy of the license is included with this software and is
 * also available from Havok Support.
 * 
 */
