// TKBMS v1.0 -----------------------------------------------------
//
// PLATFORM   : ALL
// PRODUCT   : COMMON
// VISIBILITY   : CLIENT
//
// ------------------------------------------------------TKBMS v1.0

#include <Geometry/Collide/hkcdCollide.h>
#include <Geometry/Collide/DataStructures/SimdTree/hkcdSimdTree.h>
#include <Common/Base/Container/BitField/hkBitField.h>
#include <Common/Base/Math/Vector/hkVectorSort.h>

// HK_QUERY_INLINING is the inlining qualifier applied to the process* traversal helpers below.
// It expands to HK_NEVER_INLINE (inlining disabled) when either:
// - Compiling for windows RT.
// - Double builds.
// - Non-SIMD builds (COM-2615).
// In every other configuration it expands to HK_INLINE.
#if defined(HK_PLATFORM_WINRT) || defined(HK_REAL_IS_DOUBLE) || (HK_CONFIG_SIMD == HK_CONFIG_SIMD_DISABLED)
#define HK_QUERY_INLINING  HK_NEVER_INLINE
#else
#define HK_QUERY_INLINING HK_INLINE
#endif

// COM-2615 Disable optimization on windows / nosimd builds.
// Warning 4748 is silenced because it is raised as a consequence of turning optimizations off.
// NOTE(review): no matching '#pragma optimize( "", on )' is visible in this part of the file;
// confirm whether optimizations are meant to be re-enabled further down.
#if defined(_WINDOWS) && (HK_CONFIG_SIMD == HK_CONFIG_SIMD_DISABLED)
#pragma optimize( "", off )
#pragma warning( disable : 4748 ) // /GS can not protect parameters and local variables from local buffer overrun because optimizations are disabled in function
#endif

// Passed as the second template argument of every aabbOverlaps_Internal call in this file.
// NOTE(review): the name looks like a misspelling of 'UNROLL' and presumably selects an unrolled
// traversal variant; confirm against the definition of aabbOverlaps_Internal.
#define HKCD_SIMDTREE_UNROOL_OVERLAPS   false

namespace hkcdSimdTreeUtils
{
    // LeafCollectorQuery
    // Batching front-end for a hkcdSimdTree::LeafCollector, used by aabbOverlaps_OutOfLine and aabbOverlapsNearMiss_OutOfLine.
    // Leaves are accumulated in a local buffer and handed to m_collector->addLeaves in bulk so that the collector
    // is called as rarely as possible. The caller is expected to set m_collector and to call flush( true ) at the end.
    struct LeafCollectorQuery
    {
        HK_INLINE LeafCollectorQuery() : m_numLeaves( 0 ), m_collector( HK_NULL ), m_nmp( HK_NULL ) {}

        // Append a single leaf, flushing when the buffer is (nearly) full.
        HK_INLINE void processSimdTreeLeaf( hkUint32 leaf )
        {
            m_leaves[ m_numLeaves ] = leaf;
            ++m_numLeaves;
            flush( false );
        }

        // Append a single leaf and remember the near-miss AABB pointer that goes with this query.
        // The same pointer must be supplied on every call.
        HK_INLINE void processSimdTreeLeaf( hkUint32 leaf, hkAabb* HK_RESTRICT nmp )
        {
            HK_ASSERT( 0x6F7460F6, m_nmp == HK_NULL || m_nmp == nmp, "Inconsistent NMP pointer." );
            m_nmp = nmp;
            m_leaves[ m_numLeaves ] = leaf;
            ++m_numLeaves;
            flush( false );
        }

        // Append up to four leaves at once. All four lanes are written; only the first numLeaves are kept.
        HK_INLINE void processSimdTreeLeaves( const hkIntVector& leaves, int numLeaves )
        {
            hkUint32* const destination = m_leaves + m_numLeaves;
            leaves.store<4, HK_IO_NATIVE_ALIGNED>( destination );
            m_numLeaves += numLeaves;
            flush( false );
        }

        // Hand the accumulated leaves to the collector when \p force is set or when the buffer is considered full.
        // Does nothing if no leaf is pending.
        HK_ALWAYS_INLINE void flush( bool force )
        {
            if ( m_numLeaves <= 0 )
            {
                return;
            }

            // Four slots are kept in reserve because processSimdTreeLeaves always writes four lanes.
            const bool bufferFull = m_numLeaves >= int( HK_COUNT_OF( m_leaves ) - 4 );
            if ( force || bufferFull )
            {
                m_collector->addLeaves( m_leaves, m_numLeaves, m_nmp );
                m_numLeaves = 0;
            }
        }

        // This query never requests an early exit.
        HK_ALWAYS_INLINE bool isDone() { return false; }

        hkUint32                        m_leaves[ 64 ];     // Pending leaves.
        int                             m_numLeaves;        // Number of valid entries in m_leaves.
        hkcdSimdTree::LeafCollector*    m_collector;        // Receiver of the batches.
        hkAabb*                         m_nmp;              // Near-miss AABB forwarded to addLeaves, may be HK_NULL.
    };

    // Buffered tree overlaps query.
    // Pairs are accumulated in two parallel lanes (first / second element of each pair) and handed to
    // m_collector->addPairs in bulk so that the collector is called as rarely as possible.
    // Any pair still pending is delivered by the destructor.
    struct BufferedQuery
    {
        // Configuration
        enum Config
        {
            MARGIN = 64,                    // Minimum is 16+1 but make room for future unrolls.
            CAPACITY = 256,                 // Capacity.
            THRESHOLD = CAPACITY - MARGIN   // Value at which flush needs to be performed.
        };

        // Constructor, arms the write barriers on both lanes.
        HK_INLINE BufferedQuery( _In_ hkcdSimdTree::PairCollector* collector ) : m_collector( collector ), m_size( 0 )
        {
            HKCD_SET_STACK_WRITE_BARRIER( m_lanes[ 0 ] );
            HKCD_SET_STACK_WRITE_BARRIER( m_lanes[ 1 ] );
        }

        // Destructor, verifies the write barriers and then delivers the remaining pairs.
        HK_INLINE ~BufferedQuery()
        {
            HKCD_CHECK_STACK_WRITE_BARRIER( m_lanes[ 0 ] );
            HKCD_CHECK_STACK_WRITE_BARRIER( m_lanes[ 1 ] );
            flush();
        }

        // Flush only once the number of pending pairs has reached THRESHOLD.
        HK_INLINE void flushIfNeeded()
        {
            if ( HK_VERY_LIKELY( m_size < THRESHOLD ) )
            {
                return;
            }
            flush();
        }

        // Deliver all pending pairs to the collector, no-op when nothing is pending.
        HK_INLINE void flush()
        {
            if ( HK_VERY_UNLIKELY( m_size <= 0 ) )
            {
                return;
            }
            m_collector->addPairs( m_lanes[ 0 ], m_lanes[ 1 ], m_size );
            m_size = 0;
        }

        // Append up to four pairs ( a, vb[i] ): the scalar is broadcast and forwarded to the vector overload.
        HK_INLINE void processSimdTreeLeaves( hkUint32 a, const hkIntVector& vb, int count )
        {
            hkIntVector va; va.setAll( a );
            processSimdTreeLeaves( va, vb, count );
        }

        // Append up to four pairs ( va[i], vb[i] ). All four lanes are written; only the first count are kept.
        HK_INLINE void processSimdTreeLeaves( const hkIntVector& va, const hkIntVector& vb, int count )
        {
            HK_ASSERT( 0xD7CCE3E7, ( m_size + 4 ) < CAPACITY, "Buffer overrun." );
            hkUint32* const firstLane = m_lanes[ 0 ] + m_size;
            hkUint32* const secondLane = m_lanes[ 1 ] + m_size;
            va.store<4, HK_IO_NATIVE_ALIGNED>( firstLane );
            vb.store<4, HK_IO_NATIVE_ALIGNED>( secondLane );
            m_size += count;
        }

        hkUint32                        m_lanes[ 2 ][ CAPACITY ];   // [0]: first elements, [1]: second elements.
        hkcdSimdTree::PairCollector*    m_collector;                // Receiver of the batches.
        int                             m_size;                     // Number of pending pairs.
    };

    // Adapter that turns the results of an unary query into pairs for a binary query.
    // Every reported leaf is paired with a fixed operand, which is placed first when OPERAND_FIRST is true
    // and second otherwise, before being forwarded to the wrapped query.
    template <bool OPERAND_FIRST, typename QUERY>
    struct PairwiseQueryWrapper
    {
        // Broadcasts \p operand to all lanes and binds the wrapped query.
        HK_INLINE PairwiseQueryWrapper( hkUint32 operand, QUERY& query ) : m_query( query )
        {
            m_operand.setAll( operand );
        }

        // Forward up to four leaves, paired with the operand.
        HK_INLINE void processSimdTreeLeaves( const hkIntVector& leaves, int count )
        {
            if ( !OPERAND_FIRST )
            {
                m_query.processSimdTreeLeaves( leaves, m_operand, count );
                return;
            }
            m_query.processSimdTreeLeaves( m_operand, leaves, count );
        }

        // Forwarded as is to the wrapped query.
        HK_INLINE void flushIfNeeded()
        {
            m_query.flushIfNeeded();
        }

        hkIntVector m_operand;  // Fixed pair element, identical in all lanes.
        QUERY&      m_query;    // Wrapped binary query.
    };

    // QueryMaskFilter
    // Node filter backed by the two query masks exposed by a pair collector.
    struct QueryMaskFilter
    {
        HK_INLINE QueryMaskFilter( _In_ const hkcdSimdTree::PairCollector* collector )
            : m_queryMaskA( collector->getQueryMaskA() )
            , m_queryMaskB( collector->getQueryMaskB() )
        {
        }

        // Unary form: result of looking up the node in mask A.
        HK_INLINE int   isActive( hkUint32 nodeIndexA ) const
        {
            return m_queryMaskA->get( nodeIndexA );
        }

        // Binary form: 1 if either node is set in its respective mask, 0 otherwise.
        HK_INLINE int   isActive( hkUint32 nodeIndexA, hkUint32 nodeIndexB ) const
        {
            if ( m_queryMaskA->get( nodeIndexA ) )
            {
                return 1;
            }
            return m_queryMaskB->get( nodeIndexB ) ? 1 : 0;
        }

        const hkBitField* HK_RESTRICT   m_queryMaskA;
        const hkBitField* HK_RESTRICT   m_queryMaskB;
    };

    // NullQueryMaskFilter
    // Pass-through filter: every node and every pair of nodes is reported as active.
    struct NullQueryMaskFilter
    {
        HK_INLINE       NullQueryMaskFilter() {}

        // Unary form, always active.
        HK_INLINE int   isActive( hkUint32 /*nodeIndexA*/ ) const
        {
            return 1;
        }

        // Binary form, always active.
        HK_INLINE int   isActive( hkUint32 /*nodeIndexA*/, hkUint32 /*nodeIndexB*/ ) const
        {
            return 1;
        }
    };

    //
    // Supporting methods for treeOverlaps queries.
    //

    // Self-overlap step for one child of a node: tests child INDEX of 'node' against the children of the same
    // node that have a higher index, then dispatches each overlapping pair according to the kind of both sides:
    // - internal vs internal: the pair is pushed on stackA / stackB for later processing by the caller.
    // - internal vs leaf    : aabbOverlaps_Internal is run with the leaf's AABB, seeded with child INDEX.
    // - leaf vs internal    : aabbOverlaps_Internal is run with child INDEX's AABB, seeded with the overlapping internals.
    // - leaf vs leaf        : the pairs are reported directly to 'query'.
    //
    // INDEX          Lane of the child being tested.
    // nodes          Base of the node array, forwarded to aabbOverlaps_Internal.
    // query          Receiver of the leaf pairs.
    // stackA/stackB  Pair stacks, advanced by the number of internal/internal pairs pushed.
    // stack          Scratch area used as the traversal stack of aabbOverlaps_Internal, the pointer itself is not advanced.
    // aabbT          The four splatted child AABBs of 'node'. Entries other than INDEX are read for overlapping leaves,
    //                hence the extent of 4 rather than INDEX + 1.
    // node           The node being processed.
    // dataValues     Per-lane child payloads.
    // internals      Per-lane comparison, set for internal children.
    // internalsMask  Same information as 'internals', as a bit mask.
    //
    // Note: vector stores always write four lanes; only the leading 'count' entries are kept by advancing the
    // destination by 'count'. The destinations must therefore have room for four entries.
    template <typename COMPACT_AND_COUNT, typename QUERY>
    HK_QUERY_INLINING void processAxAs( int INDEX, _In_reads_( _Inexpressible_() ) const hkcdSimdTree::Node* nodes, QUERY& query,
        _Inout_updates_( _Inexpressible_() ) hkUint32* HK_RESTRICT & stackA, _Inout_updates_( _Inexpressible_() ) hkUint32* HK_RESTRICT & stackB, _Inout_updates_( _Inexpressible_() ) hkUint32* HK_RESTRICT & stack,
        _In_reads_( 4 ) const hkcdSimdTree::TransposedAabb* aabbT,
        _In_ const hkcdSimdTree::Node* HK_RESTRICT node, const hkIntVector& dataValues, const hkVector4Comparison& internals, hkVector4Comparison::Mask internalsMask )
    {
        HKCD_SIMDTREE_TIME_CODE_BLOCK( "processAxAs" );

        // Only keep lanes strictly above INDEX so that each unordered pair of children is visited once.
        hkVector4Comparison mask; mask.set( hkVector4Comparison::Mask( ( 0xf >> ( INDEX + 1 ) ) << ( INDEX + 1 ) ) );
        hkVector4Comparison overlaps; overlaps.setAnd( overlaps_aabb_1v4( aabbT[ INDEX ], node ), mask );

        hkVector4Comparison overlappingInternals; overlappingInternals.setAnd( overlaps, internals );
        hkVector4Comparison overlappingLeaves; overlappingLeaves.setAndNot( overlaps, internals );

        // Compacted payloads of the overlapping internal children.
        hkIntVector i123;
        const int   internalsCount = COMPACT_AND_COUNT::compactAndCount( overlappingInternals, dataValues, &i123 );

        // Compacted payloads (l123) and lane indices (j123) of the overlapping leaf children.
        hkIntVector l123, j123;
        const int   leavesCount = COMPACT_AND_COUNT::compactAndCount( overlappingLeaves, dataValues, hkIntVector::getConstant<HK_QUADINT_0123>(), &l123, &j123 );

        if ( ( internalsMask >> INDEX ) & 1 )
        {
            // I0 I123
            hkIntVector i0; i0.setAll( dataValues.getU32( INDEX ) );
            i0.store<4, HK_IO_NATIVE_ALIGNED>( stackA );
            i123.store<4, HK_IO_NATIVE_ALIGNED>( stackB );
            stackA += internalsCount;
            stackB += internalsCount;

            // I0 L123: one traversal of child INDEX per overlapping leaf.
            // 'stack' is pre-filled with copies of child INDEX and the leaves are visited in descending order:
            // the traversal for leaf i starts from stack[ i ] and may overwrite the entries above it, which
            // have already been consumed by the previous iterations.
            PairwiseQueryWrapper<false, QUERY>  subQuery( 0, query );
            i0.store<4, HK_IO_NATIVE_ALIGNED>( stack );

            for ( int i = leavesCount - 1; i >= 0; i-- )
            {
                subQuery.m_operand.setAll( l123.getU32( i ) );
                aabbOverlaps_Internal<COMPACT_AND_COUNT, HKCD_SIMDTREE_UNROOL_OVERLAPS, false, true>( aabbT[ j123.getU32( i ) ], nodes, subQuery, stack + i, stack + i + 1 );
            }
        }
        else
        {
            // L0 I123
            PairwiseQueryWrapper<true, QUERY> subQuery( dataValues.getU32( INDEX ), query );
            i123.store<4, HK_IO_NATIVE_ALIGNED>( stack );
            aabbOverlaps_Internal<COMPACT_AND_COUNT, HKCD_SIMDTREE_UNROOL_OVERLAPS, false, true>( aabbT[ INDEX ], nodes, subQuery, stack, stack + internalsCount );

            // L0 L123
            query.processSimdTreeLeaves( dataValues.getU32( INDEX ), l123, leavesCount );
        }
    }

    // Self-overlap step for one node: pushes the node's internal children on the unary stack, then tests
    // every child against the higher-indexed children of the same node (see processAxAs).
    // 'stack' is advanced by the number of internal children; stackA / stackB receive the internal pairs.
    template <typename COMPACT_AND_COUNT, typename QUERY>
    HK_QUERY_INLINING void processAA( _In_reads_( _Inexpressible_() ) const hkcdSimdTree::Node* nodes, _In_ const hkcdSimdTree::Node* HK_RESTRICT node, _Inout_updates_( _Inexpressible_() ) hkUint32* HK_RESTRICT & stack,
        _Inout_updates_( _Inexpressible_() ) hkUint32* HK_RESTRICT & stackA, _Inout_updates_( _Inexpressible_() ) hkUint32* HK_RESTRICT & stackB, QUERY& query )
    {
        HKCD_SIMDTREE_TIME_CODE_BLOCK( "processAA" );

        #if HKCD_SIMDTREE_ENABLE_INTERNAL_COUNTERS
        hkcdSimdTree::g_profileCounters.m_processAA_calls++;
        #endif

        // One splatted AABB per child of the node.
        hkcdSimdTree::TransposedAabb    childAabbs[ 4 ];
        childAabbs[ 0 ].splat<0>( *node );
        childAabbs[ 1 ].splat<1>( *node );
        childAabbs[ 2 ].splat<2>( *node );
        childAabbs[ 3 ].splat<3>( *node );

        // Split the raw child data into its payload (upper 31 bits) and its low bit (moved to the top bit).
        hkIntVector rawData; rawData.load<4>( node->m_data );
        hkIntVector payloads; payloads.setShiftRight32<1>( rawData );
        hkIntVector lowBits; lowBits.setShiftLeft32<31>( rawData );

        // A lane is internal when its low bit is clear and its payload is not zero.
        hkVector4Comparison             internalLanes;
        internalLanes.setAndNot( lowBits.equalZeroS32(), payloads.equalZeroS32() );
        const hkVector4Comparison::Mask internalLanesMask = internalLanes.getMask();

        // Push the internal children on the unary stack (four lanes written, 'internalCount' kept).
        hkIntVector compactedInternals;
        const int   internalCount = COMPACT_AND_COUNT::compactAndCount( internalLanes, payloads, &compactedInternals );
        compactedInternals.store<4, HK_IO_NATIVE_ALIGNED>( stack );
        stack += internalCount;

        // The last child has no higher-indexed sibling, hence only three iterations.
        for ( int childIndex = 0; childIndex != 3; ++childIndex )
        {
            processAxAs<COMPACT_AND_COUNT>( childIndex, nodes, query, stackA, stackB, stack, childAabbs, node, payloads, internalLanes, internalLanesMask );
        }
    }

    // Deferred 'internal A vs leaf B' step: for child B_INDEX of nodeB, runs aabbOverlaps_Internal with that
    // child's AABB, seeded with the children of A recorded for it in 'transposedMasks' (4 bits per B child).
    // 'stack' is taken by value and only used as scratch space for the traversal.
    template <typename COMPACT_AND_COUNT, typename QUERY>
    HK_QUERY_INLINING void processBxAs( int B_INDEX, _Inout_updates_( _Inexpressible_() ) hkUint32* HK_RESTRICT stack, QUERY& query, int transposedMasks,
        _In_reads_( _Inexpressible_() ) const hkcdSimdTree::Node* HK_RESTRICT nodesA, const hkIntVector& dataValuesA,
        _In_ const hkcdSimdTree::Node* HK_RESTRICT nodeB, const hkIntVector& dataValuesB )
    {
        HKCD_SIMDTREE_TIME_CODE_BLOCK( "processBxAs" );

        #if HKCD_SIMDTREE_ENABLE_INTERNAL_COUNTERS
        hkcdSimdTree::g_profileCounters.m_processBxAs_calls++;
        #endif

        // Extract the nibble that belongs to this B child, nothing to do when it is empty.
        const int                           nibbleShift = B_INDEX << 2;
        const hkVector4Comparison::Mask     mask = hkVector4Comparison::Mask( ( transposedMasks >> nibbleShift ) & 15 );
        if ( !mask )
        {
            return;
        }

        // Seed the scratch stack with the selected children of A.
        hkIntVector seeds;
        const int   seedCount = COMPACT_AND_COUNT::compactAndCount( mask, dataValuesA, &seeds );
        seeds.store<4, HK_IO_NATIVE_ALIGNED>( stack );

        // Traverse A with the AABB of the B child, reporting ( leaf of A, B child ) pairs.
        hkcdSimdTree::TransposedAabb        aabbOfB; aabbOfB.splat( *nodeB, B_INDEX );
        PairwiseQueryWrapper<false, QUERY>  pairQuery( dataValuesB.getU32( B_INDEX ), query );
        aabbOverlaps_Internal<COMPACT_AND_COUNT, HKCD_SIMDTREE_UNROOL_OVERLAPS, false, true>( aabbOfB, nodesA, pairQuery, stack, stack + seedCount );
    }

    // Mixed-node step: tests child A_INDEX of nodeA against the four children of nodeB and dispatches the
    // overlapping pairs according to the kind of both sides:
    // - leaf A vs internal B    : aabbOverlaps_Internal is run right away with A's AABB, using stackB as scratch.
    // - leaf A vs leaf B        : the pairs are reported directly to 'query'.
    // - internal A vs internal B: the pairs are pushed on stackA / stackB.
    // - internal A vs leaf B    : recorded in 'transposedMasks' for a later processBxAs pass.
    template <typename COMPACT_AND_COUNT, typename QUERY>
    HK_QUERY_INLINING void processAxBs( int A_INDEX, _Inout_ const hkcdSimdTree::Node*& nodesB, QUERY& query,
        _Inout_updates_( _Inexpressible_() ) hkUint32* HK_RESTRICT & stackA, _In_ const hkcdSimdTree::Node* HK_RESTRICT nodeA, const hkIntVector& dataValuesA, hkVector4Comparison::Mask dataInternalsA,
        _Inout_updates_( _Inexpressible_() ) hkUint32* HK_RESTRICT & stackB, _In_ const hkcdSimdTree::Node* HK_RESTRICT nodeB, const hkIntVector& dataValuesB, const hkVector4Comparison& dataInternalsB,
        int& transposedMasks )
    {
        HKCD_SIMDTREE_TIME_CODE_BLOCK( "processAxBs" );

        #if HKCD_SIMDTREE_ENABLE_INTERNAL_COUNTERS
        hkcdSimdTree::g_profileCounters.m_processAxBs_calls++;
        #endif

        // Overlap test of the A child against all children of B, bail out when nothing overlaps.
        hkcdSimdTree::TransposedAabb    aabbOfA; aabbOfA.splat( *nodeA, A_INDEX );
        const hkVector4Comparison       hitsInB = overlaps_aabb_1v4( aabbOfA, nodeB );
        if ( !hitsInB.anyIsSet() )
        {
            return;
        }

        // Split the overlapping children of B by kind.
        hkVector4Comparison hitInternalsB; hitInternalsB.setAnd( hitsInB, dataInternalsB );
        hkVector4Comparison hitLeavesB; hitLeavesB.setAndNot( hitsInB, dataInternalsB );

        hkIntVector         internalsOfB;
        const int           internalCount = COMPACT_AND_COUNT::compactAndCount( hitInternalsB, dataValuesB, &internalsOfB );

        const bool          childAIsInternal = ( dataInternalsA & ( 1 << A_INDEX ) ) != 0;
        if ( !childAIsInternal )
        {
            // LA vs IB.
            PairwiseQueryWrapper<true, QUERY> pairQuery( dataValuesA.getU32( A_INDEX ), query );
            internalsOfB.store<4, HK_IO_NATIVE_ALIGNED>( stackB );
            aabbOverlaps_Internal<COMPACT_AND_COUNT, HKCD_SIMDTREE_UNROOL_OVERLAPS, false, true>( aabbOfA, nodesB, pairQuery, stackB, stackB + internalCount );

            // LA vs LB.
            hkIntVector leavesOfB;
            const int   leafCount = COMPACT_AND_COUNT::compactAndCount( hitLeavesB, dataValuesB, &leavesOfB );
            query.processSimdTreeLeaves( dataValuesA.getU32( A_INDEX ), leavesOfB, leafCount );
            return;
        }

        // IA vs IB.
        hkIntVector splatOfA; splatOfA.setAll( dataValuesA.getU32( A_INDEX ) );
        splatOfA.store<4, HK_IO_NATIVE_ALIGNED>( stackA );
        internalsOfB.store<4, HK_IO_NATIVE_ALIGNED>( stackB );
        stackA += internalCount;
        stackB += internalCount;

        // IA vs LB.
        transposedMasks |= hkVectorSort::Tables::s_spreadMaskBits[ A_INDEX ][ hitLeavesB.getMask() ];
    }

    // Binary overlap step for one pair of nodes: tests the four children of nodeA against the four children
    // of nodeB. Overlapping internal/internal pairs are pushed on stackA / stackB, overlapping leaf/leaf pairs
    // are reported to 'query', and mixed pairs are handled through processAxBs / processBxAs.
    //
    // nodesA / nodesB   Bases of the node arrays of both trees, forwarded to the helpers.
    // nodeA  / nodeB    The two nodes being tested.
    // stackA / stackB   Pair stacks, kept in lock-step: both are advanced by the same amount.
    // query             Receiver of the leaf pairs.
    template <typename COMPACT_AND_COUNT, typename QUERY>
    HK_QUERY_INLINING void processAB( _In_reads_( _Inexpressible_() ) const hkcdSimdTree::Node* nodesA, _In_ const hkcdSimdTree::Node* HK_RESTRICT nodeA, _Inout_updates_( _Inexpressible_() ) hkUint32* HK_RESTRICT & stackA,
        _In_reads_( _Inexpressible_() ) const hkcdSimdTree::Node* nodesB, _In_ const hkcdSimdTree::Node* HK_RESTRICT nodeB, _Inout_updates_( _Inexpressible_() ) hkUint32* HK_RESTRICT & stackB, QUERY& query )
    {
        HKCD_SIMDTREE_TIME_CODE_BLOCK( "processAB" );

        #if HKCD_SIMDTREE_ENABLE_INTERNAL_COUNTERS
        hkcdSimdTree::g_profileCounters.m_processAB_calls++;
        #endif

        hkIntVector         dataA; dataA.load<4>( nodeA->m_data );
        hkIntVector         dataB; dataB.load<4>( nodeB->m_data );

        // Payloads: raw data without its low bit.
        hkIntVector         dataValuesA; dataValuesA.setShiftRight32<1>( dataA );
        hkIntVector         dataValuesB; dataValuesB.setShiftRight32<1>( dataB );

        // Low bit of the raw data moved to the top bit: zero if and only if the low bit is clear.
        hkIntVector         dataMaskedA; dataMaskedA.setShiftLeft32<31>( dataA );
        hkIntVector         dataMaskedB; dataMaskedB.setShiftLeft32<31>( dataB );

        // Lanes whose low bit is clear are treated as internals.
        // NOTE(review): unlike processAA, lanes with a zero payload are not excluded here; presumably
        // their AABBs never overlap anything - confirm.
        hkVector4Comparison dataInternalsA = dataMaskedA.equalZeroS32();
        hkVector4Comparison dataInternalsB = dataMaskedB.equalZeroS32();

        if ( dataInternalsA.allAreSet() )
        {
            if ( dataInternalsB.allAreSet() )
            {
                // IA-IB, traverse both nodes.
                #if HKCD_SIMDTREE_ENABLE_INTERNAL_COUNTERS
                hkcdSimdTree::g_profileCounters.m_processAB_II_calls++;
                #endif
                const hkVector4Comparison   overlaps_A0_Bs = overlaps_aabb_1v4<0>( nodeA, nodeB );
                const hkVector4Comparison   overlaps_A1_Bs = overlaps_aabb_1v4<1>( nodeA, nodeB );
                const hkVector4Comparison   overlaps_A2_Bs = overlaps_aabb_1v4<2>( nodeA, nodeB );
                const hkVector4Comparison   overlaps_A3_Bs = overlaps_aabb_1v4<3>( nodeA, nodeB );

                // For one child of A: push ( child of A, overlapping child of B ) for every overlapping child of B.
                // Four lanes are always written on each stack, only 'count' of them are kept.
                #define PROCESS( _overlaps_, _index_ )  if ( _overlaps_.anyIsSet() ) \
                                                        { \
                                                            hkIntVector ia; ia.setAll( dataValuesA.getU32<_index_>() ); \
                                                            hkIntVector ib; \
                                                            const int   count = COMPACT_AND_COUNT::compactAndCount( _overlaps_, dataValuesB, &ib ); \
                                                            ia.store<4, HK_IO_NATIVE_ALIGNED>( stackA ); stackA += count; \
                                                            ib.store<4, HK_IO_NATIVE_ALIGNED>( stackB ); stackB += count; \
                                                        }

                PROCESS( overlaps_A0_Bs, 0 );
                PROCESS( overlaps_A1_Bs, 1 );
                PROCESS( overlaps_A2_Bs, 2 );
                PROCESS( overlaps_A3_Bs, 3 );
                #undef PROCESS
                return;
            }
        }

        // MA-MB, case where both nodes can have mixture of internals and leaves.
        // 'allLeaves' is set for a lane when the low bit is set in that lane of both A and B.
        hkIntVector         leaves; leaves.setAnd( dataMaskedA, dataMaskedB );
        hkVector4Comparison allLeaves; allLeaves.setNot( leaves.equalZeroS32() );
        if ( allLeaves.allAreSet() )
        {
            // All leaves.
            #if HKCD_SIMDTREE_ENABLE_INTERNAL_COUNTERS
            hkcdSimdTree::g_profileCounters.m_processAB_LL_calls++;
            #endif

            const hkVector4Comparison   overlaps_A0_Bs = overlaps_aabb_1v4<0>( nodeA, nodeB );
            const hkVector4Comparison   overlaps_A1_Bs = overlaps_aabb_1v4<1>( nodeA, nodeB );
            const hkVector4Comparison   overlaps_A2_Bs = overlaps_aabb_1v4<2>( nodeA, nodeB );
            const hkVector4Comparison   overlaps_A3_Bs = overlaps_aabb_1v4<3>( nodeA, nodeB );

            // For one leaf of A: report ( leaf of A, overlapping leaf of B ) pairs directly to the query.
            #define PROCESS( _overlaps_, _index_ )  if ( _overlaps_.anyIsSet() ) \
                                                    { \
                                                        hkIntVector _leaves; \
                                                        const int   count = COMPACT_AND_COUNT::compactAndCount( _overlaps_, dataValuesB, &_leaves ); \
                                                        query.processSimdTreeLeaves( dataValuesA.getU32<_index_>(), _leaves, count); \
                                                    }

            PROCESS( overlaps_A0_Bs, 0 );
            PROCESS( overlaps_A1_Bs, 1 );
            PROCESS( overlaps_A2_Bs, 2 );
            PROCESS( overlaps_A3_Bs, 3 );
            #undef PROCESS
        }
        else
        {
            // Mixed.
            // First pass, per child of A: everything except 'internal A vs leaf B' is handled by processAxBs,
            // which records the latter in transposedMasks.
            const hkVector4Comparison::Mask dataInternalsA_mask = dataInternalsA.getMask();
            int     transposedMasks = 0;
            for ( int i = 0; i < 4; i++ )
            {
                processAxBs<COMPACT_AND_COUNT>( i, nodesB, query, stackA, nodeA, dataValuesA, dataInternalsA_mask, stackB, nodeB, dataValuesB, dataInternalsB, transposedMasks );
            }

            // Second pass, per child of B: resolve the recorded 'internal A vs leaf B' overlaps.
            // stackA is passed by value and only serves as scratch space above its current top.
            if ( transposedMasks )
            {
                for ( int i = 0; i < 4; i++ )
                {
                    processBxAs<COMPACT_AND_COUNT>( i, stackA, query, transposedMasks, nodesA, dataValuesA, nodeB, dataValuesB );
                }
            }
        }
    }

    // Reports all overlapping leaf pairs between the sub-tree of treeA rooted at rootA and the sub-tree of
    // treeB rooted at rootB.
    //
    // Pairs are accumulated in a BufferedQuery and delivered to 'collector' in batches; the batch still
    // pending at the end is delivered when the query is destroyed on function exit.
    // 'filter' is consulted for every node (self case) and every pair of nodes before they are processed.
    //
    // Two traversal modes:
    // - Self  : same tree object and same root. Each node is tested against itself (processAA), and the
    //           internal pairs this produces are resolved with processAB before moving to the next node.
    // - Binary: pairs of nodes ( one from each tree ) are resolved with processAB, starting from both roots.
    template <typename COMPACT_AND_COUNT, typename FILTER>
    HK_NEVER_INLINE void        treeOverlaps( const hkcdSimdTree& treeA, const hkcdSimdTree& treeB, _Inout_ hkcdSimdTree::PairCollector* HK_RESTRICT collector, hkUint32 rootA, hkUint32 rootB, const FILTER& filter )
    {
        // Roots must be valid node indices, index 0 is not accepted.
        HK_ASSERT( 0x41DBEA78, rootA >= 1 && int( rootA ) < treeA.m_nodes.getSize(), "Invalid root A." );
        HK_ASSERT( 0x41DBEA79, rootB >= 1 && int( rootB ) < treeB.m_nodes.getSize(), "Invalid root B." );

        HKCD_SIMDTREE_TIME_CODE_BLOCK( "hkcdSimdTree::treeOverlaps" );

        // Prepare.
        BufferedQuery   query( collector );

        const hkcdSimdTree::Node* HK_RESTRICT       nodesA = treeA.m_nodes.begin();

        // Pair stacks: entry i of both buffers forms one pending ( node of A, node of B ) pair.
        hkUint32                    binaryStackBuffers[ 2 ][ hkcdSimdTree::BINARY_STACK_SIZE ]; HKCD_SET_STACK_WRITE_BARRIER( binaryStackBuffers[ 0 ] ); HKCD_SET_STACK_WRITE_BARRIER( binaryStackBuffers[ 1 ] );
        hkUint32* HK_RESTRICT       stackA = binaryStackBuffers[ 0 ];
        hkUint32* HK_RESTRICT       stackB = binaryStackBuffers[ 1 ];

        if ( ( &treeA == &treeB ) && ( rootA == rootB ) )
        {
            // Tree against itself.
            HKCD_SIMDTREE_TIME_CODE_BLOCK( "Self" );

            // Unary stack of nodes still to be tested against themselves.
            hkUint32                unaryStackBuffer[ hkcdSimdTree::UNARY_STACK_SIZE ]; HKCD_SET_STACK_WRITE_BARRIER( unaryStackBuffer );
            hkUint32* HK_RESTRICT   stack = unaryStackBuffer; *stack++ = rootA;

            do
            {
                const hkUint32  nodeIndex = *( --stack );
                if ( filter.isActive( nodeIndex ) )
                {
                    // Pushes internal children on 'stack' and internal pairs on stackA / stackB.
                    const hkcdSimdTree::Node* HK_RESTRICT   node = nodesA + nodeIndex;
                    processAA<COMPACT_AND_COUNT>( nodesA, node, stack, stackA, stackB, query );

                    query.flushIfNeeded();

                    // Drain the pair stacks before moving on to the next node.
                    while ( stackA > binaryStackBuffers[ 0 ] )
                    {
                        const hkUint32  nodeIndexA = *( --stackA );
                        const hkUint32  nodeIndexB = *( --stackB );

                        if ( filter.isActive( nodeIndexA, nodeIndexB ) )
                        {
                            // NOTE(review): the all-internal path of processAB can apparently push up to four
                            // pairs for each of the four children of A; confirm that '+ 8' is the intended head-room.
                            HK_ASSERT( 0xBC75C998, ( stackA - binaryStackBuffers[ 0 ] ) == ( stackB - binaryStackBuffers[ 1 ] ), "Inconsistent stacks" );
                            HK_ASSERT( 0xBC75C999, ( int( stackA - binaryStackBuffers[ 0 ] ) + 8 ) < hkcdSimdTree::BINARY_STACK_SIZE, "Stack overflow" );

                            // Both nodes belong to the same tree here.
                            processAB<COMPACT_AND_COUNT>( nodesA, nodesA + nodeIndexA, stackA, nodesA, nodesA + nodeIndexB, stackB, query );

                            query.flushIfNeeded();
                        }
                    }
                }

            } while ( stack > unaryStackBuffer );

            HKCD_CHECK_STACK_WRITE_BARRIER( unaryStackBuffer );
        }
        else
        {
            // tree A against tree B.
            HKCD_SIMDTREE_TIME_CODE_BLOCK( "Binary" );

            // Seed the pair stacks with both roots and drain them.
            const hkcdSimdTree::Node* HK_RESTRICT       nodesB = treeB.m_nodes.begin();
            *stackA++ = rootA;
            *stackB++ = rootB;
            do
            {
                const hkUint32  nodeIndexA = *( --stackA );
                const hkUint32  nodeIndexB = *( --stackB );

                if ( filter.isActive( nodeIndexA, nodeIndexB ) )
                {
                    // NOTE(review): same head-room question as in the self case above.
                    HK_ASSERT( 0xBC75C998, ( stackA - binaryStackBuffers[ 0 ] ) == ( stackB - binaryStackBuffers[ 1 ] ), "Inconsistent stacks" );
                    HK_ASSERT( 0xBC75C999, ( int( stackA - binaryStackBuffers[ 0 ] ) + 8 ) < hkcdSimdTree::BINARY_STACK_SIZE, "Stack overflow" );

                    processAB<COMPACT_AND_COUNT>( nodesA, nodesA + nodeIndexA, stackA, nodesB, nodesB + nodeIndexB, stackB, query );

                    query.flushIfNeeded();
                }
            } while ( stackA > binaryStackBuffers[ 0 ] );
        }

        HKCD_CHECK_STACK_WRITE_BARRIER( binaryStackBuffers[ 0 ] );
        HKCD_CHECK_STACK_WRITE_BARRIER( binaryStackBuffers[ 1 ] );
    }

    // Issues a prefetch covering one node at 'node', then pushes the pointer on 'stack' and advances the stack top.
    HK_INLINE void prefetchAndPushOnStack( const hkcdSimdTree::Node** HK_RESTRICT & stack, const hkcdSimdTree::Node* node )
    {
        hkMath::forcePrefetch<sizeof( hkcdSimdTree::Node )>( node );

        *stack = node;
        ++stack;
    }

    // Two-children ordering: returns the child ( lane A or lane B of 'data' ) with the smaller fraction and
    // pushes the other one on 'stack'. Lane B is returned when the fractions are equal or do not compare.
    template <int A, int B>
    HK_INLINE const hkcdSimdTree::Node* sortPush2( const hkcdSimdTree::Node* nodes, hkVector4Parameter fractions, const hkcdSimdTree::Node** HK_RESTRICT & stack, const hkIntVector& data )
    {
        if ( fractions( A ) < fractions( B ) )
        {
            // A comes first, B is deferred.
            prefetchAndPushOnStack( stack, nodes + data.getU32<B>() );
            return nodes + data.getU32<A>();
        }

        // B comes first, A is deferred.
        prefetchAndPushOnStack( stack, nodes + data.getU32<A>() );
        return nodes + data.getU32<B>();
    }

    // Orders the children selected by 'mask' and schedules them for traversal: one child is returned to be
    // visited next, the others are prefetched and pushed on 'stack' so that lut[ 1 ] is popped before lut[ 2 ],
    // and lut[ 2 ] before lut[ 3 ].
    //
    // nodes      Base of the node array.
    // fractions  Per-lane hit fractions used for ordering.
    // mask       Bit mask of the lanes to schedule; callers in this file only pass non-zero masks.
    // stack      Traversal stack, advanced by the number of children pushed.
    // data       Per-lane node indices.
    //
    // NOTE(review): s_orderingTable / calcSortMask are defined elsewhere; presumably the table lists lane
    // indices by ascending fraction - confirm.
    HK_INLINE const hkcdSimdTree::Node* sortPushX( const hkcdSimdTree::Node* nodes, hkVector4Parameter fractions, int mask, const hkcdSimdTree::Node** HK_RESTRICT & stack, const hkIntVector& data )
    {
        switch ( mask )
        {
            // any combination of 1 bit
            // Single child: nothing to order, nothing to push.
        case    hkVector4ComparisonMask::MASK_X:    return nodes + data.getU32<0>();
        case    hkVector4ComparisonMask::MASK_Y:    return nodes + data.getU32<1>();
        case    hkVector4ComparisonMask::MASK_Z:    return nodes + data.getU32<2>();
        case    hkVector4ComparisonMask::MASK_W:    return nodes + data.getU32<3>();

            // any combination of 2 bits
        case    hkVector4ComparisonMask::MASK_XY:   return sortPush2<0, 1>( nodes, fractions, stack, data );
        case    hkVector4ComparisonMask::MASK_XZ:   return sortPush2<0, 2>( nodes, fractions, stack, data );
        case    hkVector4ComparisonMask::MASK_XW:   return sortPush2<0, 3>( nodes, fractions, stack, data );
        case    hkVector4ComparisonMask::MASK_YZ:   return sortPush2<1, 2>( nodes, fractions, stack, data );
        case    hkVector4ComparisonMask::MASK_YW:   return sortPush2<1, 3>( nodes, fractions, stack, data );
        case    hkVector4ComparisonMask::MASK_ZW:   return sortPush2<2, 3>( nodes, fractions, stack, data );

            // any combination of 3 bits
            // Note: 'default' shares this handler, so any mask value without its own case ( including 0 ) ends up here.
        case hkVector4ComparisonMask::MASK_YZW:
        case hkVector4ComparisonMask::MASK_XZW:
        case hkVector4ComparisonMask::MASK_XYW:
        case hkVector4ComparisonMask::MASK_XYZ:
        default:
        {
            // The unselected lane gets the HK_QUADREAL_MAX constant as its fraction before ordering; only the first three entries of the ordering are used.
            hkVector4Comparison comp; comp.set( ( hkVector4ComparisonMask::Mask ) mask );
            hkVector4 maskedFractions; maskedFractions.setSelect( comp, fractions, hkVector4::getConstant<HK_QUADREAL_MAX>() );
            const hkInt8*   lut = hkVectorSort::Tables::s_orderingTable[ calcSortMask( maskedFractions ) ];
            prefetchAndPushOnStack( stack, nodes + data.getU32( lut[ 2 ] ) );
            prefetchAndPushOnStack( stack, nodes + data.getU32( lut[ 1 ] ) );
            return nodes + data.getU32( lut[ 0 ] );
        }

        // any combination of 4 bits
        case hkVector4ComparisonMask::MASK_XYZW:
        {
            // All four lanes are selected, the fractions are used as is.
            const hkInt8*   lut = hkVectorSort::Tables::s_orderingTable[ calcSortMask( fractions ) ];
            prefetchAndPushOnStack( stack, nodes + data.getU32( lut[ 3 ] ) );
            prefetchAndPushOnStack( stack, nodes + data.getU32( lut[ 2 ] ) );
            prefetchAndPushOnStack( stack, nodes + data.getU32( lut[ 1 ] ) );
            return nodes + data.getU32( lut[ 0 ] );
        }
        }
    }

    // Casts 'ray' against the sub-tree rooted at 'root' and hands the leaves that are hit to 'processor'.
    //
    // nodes            Base of the node array.
    // ray              The ray; its origin, direction and inverse direction are used.
    // initialFraction  Initial maximum fraction; a negative value makes the function return immediately.
    // processor        Called with batches of up to four hit leaves. Its return value becomes the new maximum
    //                  fraction, and a negative return value aborts the traversal.
    // root             Index of the node to start from.
    template <typename COMPACT_AND_COUNT>
    HK_INLINE void  rayCastImpl( const hkcdSimdTree::Node* nodes, const hkcdRay& ray, hkReal initialFraction, hkcdSimdTree::ProcessRayCastLeaves& processor, hkUint32 root )
    {
        if ( initialFraction < 0 ) return;

        // Prepare constants.
        hkFourTransposedPoints  orgT; orgT.setAll( ray.m_origin );
        hkFourTransposedPoints  invDirT; invDirT.setAll( ray.m_invDirection );

        // For each axis, pick which bound of the node ( low or high ) acts as the near plane and which as the
        // far plane, based on whether the direction component is greater or equal to zero ( mask bits 1, 2, 4 ).
        const int rayDirMask = ray.m_direction.greaterEqualZero().getMask();
        const int nearXoffset = rayDirMask & 1 ? HK_OFFSET_OF( hkcdSimdTree::Node, m_lx ) : HK_OFFSET_OF( hkcdSimdTree::Node, m_hx );
        const int farXoffset = rayDirMask & 1 ? HK_OFFSET_OF( hkcdSimdTree::Node, m_hx ) : HK_OFFSET_OF( hkcdSimdTree::Node, m_lx );
        const int nearYoffset = rayDirMask & 2 ? HK_OFFSET_OF( hkcdSimdTree::Node, m_ly ) : HK_OFFSET_OF( hkcdSimdTree::Node, m_hy );
        const int farYoffset = rayDirMask & 2 ? HK_OFFSET_OF( hkcdSimdTree::Node, m_hy ) : HK_OFFSET_OF( hkcdSimdTree::Node, m_ly );
        const int nearZoffset = rayDirMask & 4 ? HK_OFFSET_OF( hkcdSimdTree::Node, m_lz ) : HK_OFFSET_OF( hkcdSimdTree::Node, m_hz );
        const int farZoffset = rayDirMask & 4 ? HK_OFFSET_OF( hkcdSimdTree::Node, m_hz ) : HK_OFFSET_OF( hkcdSimdTree::Node, m_lz );

        // Prepare stack.
        // The last slot is cleared up-front and checked on exit: if it is no longer null, the stack has overflowed.
        const hkcdSimdTree::Node*   buffer[ hkcdSimdTree::UNARY_STACK_SIZE ]; buffer[ HK_COUNT_OF( buffer ) - 1 ] = HK_NULL;
        const hkcdSimdTree::Node**  stack = buffer;

        // Traverse.
        hkVector4                   maxFractions; maxFractions.setAll( initialFraction );
        hkReal                      fraction = initialFraction;
        const hkcdSimdTree::Node*   node = nodes + root;
        for ( ;; )
        {
            // Ray cast against node's four AABB.
            hkVector4   hitFractions;
            int         hitMasks = hkcdSimdTreeUtils::rayCastFourAabbs( ray, nearXoffset, farXoffset, nearYoffset, farYoffset, nearZoffset, farZoffset, orgT, invDirT, maxFractions, node, &hitFractions );

            // If there's any hits.
            if ( hitMasks )
            {
                // Payloads are the raw data without its low bit; lanes with the low bit set are leaves.
                hkIntVector rawData; rawData.load<4>( node->m_data );
                hkIntVector data; data.setShiftRight32<1>( rawData );
                hkIntVector masked; masked.setAnd( rawData, hkIntVector::getConstant<HK_QUADINT_1>() );
                const int hitLeaves = masked.greaterZeroS32().getMask() & hitMasks;

                // If there's any leaf hits.
                if ( hitLeaves )
                {
                    // Process leaves.
                    hkIntVector leaves;
                    const int numLeaves = COMPACT_AND_COUNT::compactAndCount( ( hkVector4ComparisonMask::Mask ) hitLeaves, data, &leaves );
                    const hkReal newFraction = processor.process( reinterpret_cast<const hkUint32*>( &leaves ), numLeaves, ray );
                    if ( newFraction != fraction )
                    {
                        // The maximum fraction changed: drop the children whose hit fraction is not below it,
                        // and stop altogether if the processor returned a negative fraction.
                        maxFractions.setAll( newFraction );
                        fraction = newFraction;
                        hitMasks &= hitFractions.less( maxFractions ).getMask();
                        if ( newFraction < 0 ) return;
                    }
                    // Leaves are done, only internal children remain in the mask.
                    hitMasks &= ~hitLeaves;
                }

                // If there's any node hits.
                if ( hitMasks )
                {
                    // Process nodes.
                    // One child is visited right away, the others are pushed on the stack.
                    node = hkcdSimdTreeUtils::sortPushX( nodes, hitFractions, hitMasks, stack, data );
                    continue;
                }
            }

            // Pop from the stack or exit.
            if ( stack > buffer )
                node = *( --stack );
            else
                break;
        }

        // Note: this check is not reached when the traversal is aborted through the early return above.
        if ( buffer[ HK_COUNT_OF( buffer ) - 1 ] ) { HK_ERROR( 0xCA3F6C0F, "Stack overflow" ); }
    }
}

//
// hkcdSimdTree
//

//
void        hkcdSimdTree::insert( const hkAabb& aabb, hkUint32 data )
{
    // Find an insertion point by traversing the root from the root selecting the path closest to the center of the AABB to be inserted.
    const hkUint32 leafData = ( data << 1 ) | 1;
    hkFourTransposedPoints center2; center2.setAll( aabb.m_min + aabb.m_max );
    for ( int nodeIndex = 1;;)
    {
        Node& node = m_nodes[ nodeIndex ];
        hkIntVector nodeData; nodeData.load<4, HK_IO_SIMD_ALIGNED>( node.m_data );

        // Pick the closest child.
        const hkVector4 dx2 = hkMath::pow<2>( ( node.m_lx + node.m_hx ) - center2.m_vertices[ 0 ] );
        const hkVector4 dy2 = hkMath::pow<2>( ( node.m_ly + node.m_hy ) - center2.m_vertices[ 1 ] );
        const hkVector4 dz2 = hkMath::pow<2>( ( node.m_lz + node.m_hz ) - center2.m_vertices[ 2 ] );

        const int indexOfClosestChild = ( dx2 + dy2 + dz2 ).getIndexOfMinComponent<4>();
        const int firstFreeIndex = nodeData.getIndexOfMinComponent<4>();

        if ( nodeData.getU32( firstFreeIndex ) )
        {
            // If no free slot is available in the node, go down the path of 'indexOfClosestChild'
            const hkUint32 closestSlotData = nodeData.getU32( indexOfClosestChild );
            if ( 0 == ( closestSlotData & 1 ) )
            {
                // Internal, merge AABB and dive-in.
                hkAabb compoundAabb;
                node.getAabb( indexOfClosestChild, &compoundAabb );
                compoundAabb.includeAabb( aabb );
                node.setAabb( indexOfClosestChild, compoundAabb );

                nodeIndex = closestSlotData >> 1;
            }
            else
            {
                // Leaf, slit and insert.
                Node newNode;
                newNode.setEmpty();
                reinterpret_cast<hkIntVector*>( newNode.m_data )->setZero();

                hkAabb prevAabb; node.getAabb( indexOfClosestChild, &prevAabb );
                hkAabb compoundAabb = prevAabb; compoundAabb.includeAabb( aabb );

                newNode.setAabb<0>( prevAabb ); newNode.m_data[ 0 ] = closestSlotData;
                newNode.setAabb<1>( aabb ); newNode.m_data[ 1 ] = leafData;

                node.setAabb( indexOfClosestChild, compoundAabb );
                node.m_data[ indexOfClosestChild ] = m_nodes.getSize() << 1;

                m_nodes.pushBack( newNode );

                return;
            }
        }
        else
        {
            // Free slot, insert there.
            node.setAabb( firstFreeIndex, aabb );
            node.m_data[ firstFreeIndex ] = leafData;
            return;
        }
    }
}

//
void        hkcdSimdTree::treeOverlaps( const hkcdSimdTree& treeA, const hkcdSimdTree& treeB, _Inout_ PairCollector* collector, hkUint32 rootA, hkUint32 rootB )
{
    HK_ASSERT( 0x69c699db, &treeA != &treeB || collector->getQueryMaskA() == collector->getQueryMaskB(), "Mask A and B must be the same." );
    if ( collector->getQueryMaskA() )
    {
        const hkcdSimdTreeUtils::QueryMaskFilter    filter( collector );
        #define call_treeOverlaps( _compact_and_count_ ) hkcdSimdTreeUtils::treeOverlaps<_compact_and_count_>( treeA, treeB, collector, rootA, rootB, filter )
        HK_VECTOR_SORT_SELECT_COMPACT_AND_COUNT( call_treeOverlaps );
        #undef call_treeOverlaps
    }
    else
    {
        const hkcdSimdTreeUtils::NullQueryMaskFilter    filter;
        #define call_treeOverlaps( _compact_and_count_ ) hkcdSimdTreeUtils::treeOverlaps<_compact_and_count_>( treeA, treeB, collector, rootA, rootB, filter )
        HK_VECTOR_SORT_SELECT_COMPACT_AND_COUNT( call_treeOverlaps );
        #undef call_treeOverlaps
    }
}

//
void    hkcdSimdTree::aabbOverlaps_OutOfLine( const hkAabb& aabb, _Inout_ LeafCollector* collector, hkUint32 root ) const
{
    hkcdSimdTreeUtils::LeafCollectorQuery   query; query.m_collector = collector;
    aabbOverlaps( aabb, query, root );
    query.flush( true );
}

//
void    hkcdSimdTree::aabbOverlapsNearMiss_OutOfLine( const hkAabb& aabb, _Inout_ LeafCollector* collector, hkAabb& nmp, hkUint32 root ) const
{
    hkcdSimdTreeUtils::LeafCollectorQuery   query; query.m_collector = collector;
    aabbOverlapsNearMissNew( aabb, query, nmp, root );
    query.flush( true );
}

void    hkcdSimdTree::rayCast_OutOfLine( const hkcdRay& ray, hkReal initialFraction, ProcessRayCastLeaves& processor, hkUint32 root ) const
{
    #define GetRayCastImplFunc( _compact_and_count_ ) hkcdSimdTreeUtils::rayCastImpl<_compact_and_count_>(m_nodes.begin(), ray, initialFraction, processor, root)
    HK_VECTOR_SORT_SELECT_COMPACT_AND_COUNT( GetRayCastImplFunc );
    #undef GetRayCastImplFunc

}

void hkcdSimdTree::copyFrom( const hkcdSimdTree& other )
{
    this->m_nodes = other.m_nodes;
}

#if defined(_WINDOWS) && (HK_CONFIG_SIMD == HK_CONFIG_SIMD_DISABLED)
#pragma optimize( "", on )
#endif

/*
 * Havok SDK - Product file, BUILD(#20180110)
 * 
 * Confidential Information of Microsoft Corporation.
 * Not for disclosure or distribution without Microsoft's prior written
 * consent.  This software contains code, techniques and know-how which
 * is confidential and proprietary to Microsoft.  Product and Trade Secret
 * source code contains trade secrets of Microsoft.  Havok Software (C)
 * Copyright 1999-2018 Microsoft Corporation.
 * All Rights Reserved. Use of this software is subject to the
 * terms of an end user license agreement.
 * 
 * The Havok Logo, and the Havok buzzsaw logo are trademarks of Microsoft.
 * Title, ownership rights, and intellectual property rights in the Havok
 * software remain in Microsoft and/or its suppliers.
 * 
 * Use of this software for evaluation purposes is subject to and
 * indicates acceptance of the End User licence Agreement for this
 * product. A copy of the license is included with this software and is
 * also available from Havok Support.
 * 
 */
