// TKBMS v1.0 -----------------------------------------------------
//
// PLATFORM   : ALL
// PRODUCT   : COMMON
// VISIBILITY   : CLIENT
//
// ------------------------------------------------------TKBMS v1.0

#if !defined(__HAVOK_PARSER__)

// Sentinel value stored in the last element of a traversal stack buffer so that a write past
// the usable part of the buffer can be detected after the traversal.
#define HKCD_STACK_WRITE_BARRIER 0xCF213844

// HKCD_SET_STACK_WRITE_BARRIER writes the sentinel to the last element of _array_.
// HKCD_CHECK_STACK_WRITE_BARRIER raises HK_ERROR("Stack overflow") if that element no longer holds it.
// Both take the element count with HK_COUNT_OF, so _array_ is presumably required to be a real array
// and not a pointer (confirm against HK_COUNT_OF).
// NOTE(review): the check expands to a bare `if` statement; do not use it as the unbraced body of an if/else.
// When HKCD_STACK_WRITE_BARRIER is not defined both macros expand to nothing.
#if defined(HKCD_STACK_WRITE_BARRIER)
#define HKCD_SET_STACK_WRITE_BARRIER(_array_)       (_array_)[HK_COUNT_OF((_array_))-1] = HKCD_STACK_WRITE_BARRIER
#define HKCD_CHECK_STACK_WRITE_BARRIER(_array_)     if((_array_)[HK_COUNT_OF((_array_))-1] != HKCD_STACK_WRITE_BARRIER) { HK_ERROR(0xCA3F6C0F, "Stack overflow"); }
#else
#define HKCD_SET_STACK_WRITE_BARRIER(_array_)
#define HKCD_CHECK_STACK_WRITE_BARRIER(_array_)
#endif

// Optional timer around a code block: forwards to HK_TIME_CODE_BLOCK when HKCD_ENABLE_SIMDTREE_TIMERS
// evaluates to non-zero, otherwise expands to nothing.
#if HKCD_ENABLE_SIMDTREE_TIMERS
#define HKCD_SIMDTREE_TIME_CODE_BLOCK(_name_)       HK_TIME_CODE_BLOCK(_name_,HK_NULL)
#else
#define HKCD_SIMDTREE_TIME_CODE_BLOCK(_name_)
#endif

// Asserts that pushing _n_ more entries keeps the stack inside its buffer.
// Relies on local variables named `stack` (current top pointer) and `buffer` (the backing array)
// being in scope at the point of use.
#define HKCD_CHECK_STACK_PUSH( _n_ )    HK_ASSERT( 0xCC9BE08E, ((stack - buffer) + (_n_)) < HK_COUNT_OF(buffer), "Stack overflow")


namespace hkcdSimdTreeUtils
{

    // Local alias so the helpers in this namespace can name hkcdSimdTree::TransposedAabb without qualification.
    typedef hkcdSimdTree::TransposedAabb    TransposedAabb;


    // Build a 6-bit integer from the pairwise '<' comparisons between the four components of xyzw.
    // Layout as produced by the code below, assuming comparison-mask bit 0 corresponds to X and
    // that each permutation name lists the source component of every lane (TODO confirm):
    //   bit 0: X<Y   bit 1: Y<Z   bit 2: Z<W   bit 3: X<Z   bit 4: Y<W   bit 5: X<W
    // The result is used by the callers in this file as an index into hkVectorSort::Tables::s_orderingTable.
    HK_INLINE int calcSortMask( hkVector4Parameter xyzw )
    {
        // Rotated copies of the input so that each lane faces a different partner component.
        hkVector4 rotatedBy1;
        rotatedBy1.setPermutation<hkVectorPermutation::YZWX>( xyzw );
        hkVector4 rotatedBy2;
        rotatedBy2.setPermutation<hkVectorPermutation::ZWXY>( xyzw );
        hkVector4 rotatedBy3;
        rotatedBy3.setPermutation<hkVectorPermutation::WXYZ>( xyzw );

        // Keep only the lanes whose pair has not been covered by a previous comparison.
        const int neighbourBits = xyzw.less( rotatedBy1 ).getMask() & 7;
        const int skipOneBits   = xyzw.less( rotatedBy2 ).getMask() & 3;
        const int skipTwoBits   = xyzw.less( rotatedBy3 ).getMask() & 1;

        return ( skipTwoBits << 5 ) | ( skipOneBits << 3 ) | neighbourBits;
    }

    // Order the two children A and B of node by their entries in fractions.
    // The child with the strictly smaller fraction is returned for immediate processing and the
    // other one is pushed on the stack; when fractions(A) is not smaller than fractions(B), B is returned.
    template <int A, int B>
    HK_INLINE hkUint32  sortPushTwo( hkVector4Parameter fractions, hkUint32* HK_RESTRICT & stack, const hkcdSimdTree::Node* node )
    {
        if ( !( fractions( A ) < fractions( B ) ) )
        {
            // B goes first, A is deferred.
            *stack++ = node->getChildData<A>();
            return node->getChildData<B>();
        }

        // A goes first, B is deferred.
        *stack++ = node->getChildData<B>();
        return node->getChildData<A>();
    }

    // Select the children of node flagged in mask and order them by their entries in fractions.
    // The first child in that order is returned; the remaining ones are pushed on the stack so that
    // the next one in the order ends up on top.
    // Any mask value without a dedicated case (including the three-bit masks) takes the default path,
    // which always pushes two entries and returns a third.
    HK_INLINE hkUint32  sortPush( hkVector4Parameter fractions, int mask, hkUint32* HK_RESTRICT & stack, const hkcdSimdTree::Node* node )
    {
        switch ( mask )
        {
        // One child selected: nothing to push.
        case hkVector4ComparisonMask::MASK_X:
            return node->getChildData<0>();
        case hkVector4ComparisonMask::MASK_Y:
            return node->getChildData<1>();
        case hkVector4ComparisonMask::MASK_Z:
            return node->getChildData<2>();
        case hkVector4ComparisonMask::MASK_W:
            return node->getChildData<3>();

        // Two children selected: a single comparison is enough.
        case hkVector4ComparisonMask::MASK_XY:
            return sortPushTwo<0, 1>( fractions, stack, node );
        case hkVector4ComparisonMask::MASK_XZ:
            return sortPushTwo<0, 2>( fractions, stack, node );
        case hkVector4ComparisonMask::MASK_XW:
            return sortPushTwo<0, 3>( fractions, stack, node );
        case hkVector4ComparisonMask::MASK_YZ:
            return sortPushTwo<1, 2>( fractions, stack, node );
        case hkVector4ComparisonMask::MASK_YW:
            return sortPushTwo<1, 3>( fractions, stack, node );
        case hkVector4ComparisonMask::MASK_ZW:
            return sortPushTwo<2, 3>( fractions, stack, node );

        // All four children selected: look the ordering up directly from the fractions.
        case hkVector4ComparisonMask::MASK_XYZW:
        {
            const hkInt8* const order = hkVectorSort::Tables::s_orderingTable[ calcSortMask( fractions ) ];
            stack[ 0 ] = node->getChildData( order[ 3 ] );
            stack[ 1 ] = node->getChildData( order[ 2 ] );
            stack[ 2 ] = node->getChildData( order[ 1 ] );
            stack += 3;
            return node->getChildData( order[ 0 ] );
        }

        // Three children selected (and every other value).
        case hkVector4ComparisonMask::MASK_YZW:
        case hkVector4ComparisonMask::MASK_XZW:
        case hkVector4ComparisonMask::MASK_XYW:
        case hkVector4ComparisonMask::MASK_XYZ:
        default:
        {
            // Substitute the HK_QUADREAL_MAX constant for the fractions of the lanes not selected by
            // mask before sorting (presumably so that they come last in the ordering - confirm setSelect).
            hkVector4Comparison selected;
            selected.set( ( hkVector4ComparisonMask::Mask ) mask );
            hkVector4 sortKeys;
            sortKeys.setSelect( selected, fractions, hkVector4::getConstant<HK_QUADREAL_MAX>() );

            const hkInt8* const order = hkVectorSort::Tables::s_orderingTable[ calcSortMask( sortKeys ) ];
            stack[ 0 ] = node->getChildData( order[ 2 ] );
            stack[ 1 ] = node->getChildData( order[ 1 ] );
            stack += 2;
            return node->getChildData( order[ 0 ] );
        }
        }
    }

    /// Test one AABB (\p a, given in its transposed form) against the four AABBs stored in \p b.
    /// A lane of the result is set when the intervals overlap on X, Y and Z; touching bounds count as overlapping.
    HK_INLINE hkVector4Comparison   overlaps_aabb_1v4( const hkcdSimdTree::TransposedAabb& a, const hkcdFourAabb* b )
    {
        // Per axis: a.low <= b.high and a.high >= b.low.
        hkVector4Comparison overlapX;
        overlapX.setAnd( a.m_lx.lessEqual( b->m_hx ), a.m_hx.greaterEqual( b->m_lx ) );

        hkVector4Comparison overlapY;
        overlapY.setAnd( a.m_ly.lessEqual( b->m_hy ), a.m_hy.greaterEqual( b->m_ly ) );

        hkVector4Comparison overlapZ;
        overlapZ.setAnd( a.m_lz.lessEqual( b->m_hz ), a.m_hz.greaterEqual( b->m_lz ) );

        // Combine the three axes.
        hkVector4Comparison overlapXY;
        overlapXY.setAnd( overlapX, overlapY );

        hkVector4Comparison overlapAll;
        overlapAll.setAnd( overlapXY, overlapZ );
        return overlapAll;
    }

    /// Test the AABB stored in lane \p INDEX of \p a against the four AABBs stored in \p b.
    /// A lane of the result is set when the intervals overlap on X, Y and Z; touching bounds count as overlapping.
    template <int INDEX>
    HK_INLINE hkVector4Comparison   overlaps_aabb_1v4( const hkcdFourAabb* HK_RESTRICT a, const hkcdFourAabb* HK_RESTRICT b )
    {
        // Replicate the bounds of the selected AABB across all four lanes.
        hkVector4 lowX;     lowX.setBroadcast<INDEX>( a->m_lx );
        hkVector4 highX;    highX.setBroadcast<INDEX>( a->m_hx );
        hkVector4 lowY;     lowY.setBroadcast<INDEX>( a->m_ly );
        hkVector4 highY;    highY.setBroadcast<INDEX>( a->m_hy );
        hkVector4 lowZ;     lowZ.setBroadcast<INDEX>( a->m_lz );
        hkVector4 highZ;    highZ.setBroadcast<INDEX>( a->m_hz );

        // Per axis: low <= b.high and high >= b.low.
        hkVector4Comparison overlapX;
        overlapX.setAnd( lowX.lessEqual( b->m_hx ), highX.greaterEqual( b->m_lx ) );

        hkVector4Comparison overlapY;
        overlapY.setAnd( lowY.lessEqual( b->m_hy ), highY.greaterEqual( b->m_ly ) );

        hkVector4Comparison overlapZ;
        overlapZ.setAnd( lowZ.lessEqual( b->m_hz ), highZ.greaterEqual( b->m_lz ) );

        // Combine the three axes.
        hkVector4Comparison overlapXY;
        overlapXY.setAnd( overlapX, overlapY );

        hkVector4Comparison overlapAll;
        overlapAll.setAnd( overlapXY, overlapZ );
        return overlapAll;
    }

    /// Node-level processing step: classify the four children of \p node against \p aabbT, report the
    /// four resulting masks to the query through processSimdTreeNode, then push the payloads of the
    /// overlapping internal children on \p stack.
    /// Child entries are decoded as follows: a zero entry is treated as no child, bit 0 clear marks an
    /// internal child, bit 0 set marks a leaf, and the payload is the entry shifted right by one.
    /// \p nodes is not used by this variant.
    template <typename COMPACT_AND_COUNT>
    struct processA_node
    {
        template <typename QUERY>
        static HK_INLINE void   process( const TransposedAabb& aabbT, const hkcdSimdTree::Node* nodes, const hkcdSimdTree::Node* HK_RESTRICT node, QUERY& query, hkUint32* HK_RESTRICT& stack )
        {
            // Decode the packed child entries.
            hkIntVector childEntries;
            childEntries.load<4>( node->m_data );

            hkIntVector leafBitAsSign;      // bit 0 of every entry moved into the sign bit
            leafBitAsSign.setShiftLeft32<31>( childEntries );

            hkIntVector payloads;
            payloads.setShiftRight32<1>( childEntries );

            // Classify the lanes by kind.
            hkVector4Comparison present;
            present.setNot( childEntries.equalZeroS32() );

            hkVector4Comparison internalLanes;
            internalLanes.setAnd( present, leafBitAsSign.equalZeroS32() );

            hkVector4Comparison leafLanes;
            leafLanes.setAnd( present, leafBitAsSign.lessZeroS32() );

            // Split every kind into hit / missed using the AABB test.
            const hkVector4Comparison   hits = hkcdSimdTreeUtils::overlaps_aabb_1v4( aabbT, node );

            hkVector4Comparison hitInternals;
            hitInternals.setAnd( internalLanes, hits );
            hkVector4Comparison skippedInternals;
            skippedInternals.setAndNot( internalLanes, hits );

            hkVector4Comparison hitLeaves;
            hitLeaves.setAnd( leafLanes, hits );
            hkVector4Comparison skippedLeaves;
            skippedLeaves.setAndNot( leafLanes, hits );

            query.processSimdTreeNode( node, hitInternals, skippedInternals, hitLeaves, skippedLeaves );

            // Four values are always written at the top of the stack, but the stack only grows by
            // the count returned by compactAndCount.
            hkIntVector compacted;
            const int   pushCount = COMPACT_AND_COUNT::compactAndCount( hitInternals, payloads, &compacted );
            compacted.store<4, HK_IO_NATIVE_ALIGNED>( stack );
            stack += pushCount;
        }
    };

    /// Leaf-level processing step: test the four children of \p node against \p aabbT, push the payloads
    /// of the overlapping internal children on \p stack and hand the payloads of the overlapping leaves
    /// to the query through processSimdTreeLeaves.
    /// A child entry with bit 0 clear is internal, otherwise it is a leaf; the payload is the entry
    /// shifted right by one.
    /// NOTE(review): unlike processA_node, zero entries are not filtered out here; presumably the
    /// bounds of unused children never pass the overlap test - confirm.
    /// \p nodes is not used by this variant.
    template <typename COMPACT_AND_COUNT>
    struct processA_leaves
    {
        template <typename QUERY>
        static HK_INLINE void   process( const TransposedAabb& aabbT, const hkcdSimdTree::Node* nodes, const hkcdSimdTree::Node* HK_RESTRICT node, QUERY& query, hkUint32* HK_RESTRICT& stack )
        {
            // Decode the packed child entries.
            hkIntVector childEntries;
            childEntries.load<4>( node->m_data );

            hkIntVector leafBitAsSign;      // bit 0 of every entry moved into the sign bit
            leafBitAsSign.setShiftLeft32<31>( childEntries );

            hkIntVector payloads;
            payloads.setShiftRight32<1>( childEntries );

            const hkVector4Comparison   internalLanes = leafBitAsSign.equalZeroS32();
            const hkVector4Comparison   hits = hkcdSimdTreeUtils::overlaps_aabb_1v4( aabbT, node );

            hkVector4Comparison hitInternals;
            hitInternals.setAnd( hits, internalLanes );
            hkVector4Comparison hitLeaves;
            hitLeaves.setAndNot( hits, internalLanes );

            // Internal children: four values are always written at the top of the stack, but the
            // stack only grows by the count returned by compactAndCount.
            hkIntVector compactedInternals;
            const int   pushCount = COMPACT_AND_COUNT::compactAndCount( hitInternals, payloads, &compactedInternals );
            compactedInternals.store<4, HK_IO_NATIVE_ALIGNED>( stack );
            stack += pushCount;

            // Leaves: passed to the query as one batch.
            hkIntVector compactedLeaves;
            const int   leafCount = COMPACT_AND_COUNT::compactAndCount( hitLeaves, payloads, &compactedLeaves );

            query.processSimdTreeLeaves( compactedLeaves, leafCount );
        }
    };

    /// Trampoline used for queries that implement flushIfNeeded(): forwards the call to the query.
    struct queryHasFlush
    {
        template <typename QUERY>
        static HK_INLINE void flushIfNeeded( QUERY& query )
        {
            query.flushIfNeeded();
        }
    };

    /// Trampoline used for queries that do not implement flushIfNeeded(): does nothing.
    struct queryHasNoFlush
    {
        template <typename QUERY>
        static HK_INLINE void flushIfNeeded( QUERY& )
        {
            // Intentionally empty.
        }
    };

    /// Drain a stack of node indices (from \p stackBase up to, but excluding, \p stack), processing
    /// every popped node against \p aabbT until the stack is empty.
    /// - PROCESS_NODES selects processA_node (true) or processA_leaves (false) as the per-node step.
    /// - UNROLL pops and processes two nodes per iteration whenever at least two are available.
    /// - HAS_FLUSH selects whether query.flushIfNeeded() is called after every iteration.
    template <typename COMPACT_AND_COUNT, bool UNROLL, bool PROCESS_NODES, bool HAS_FLUSH, typename QUERY>
    HK_INLINE void  aabbOverlaps_Internal( const TransposedAabb& aabbT, const hkcdSimdTree::Node* nodes, QUERY& query, const hkUint32* stackBase, hkUint32* HK_RESTRICT stack )
    {
        HKCD_SIMDTREE_TIME_CODE_BLOCK( "aabbOverlaps_Internal" );

        // Compile-time selection of the per-node step and of the flush trampoline.
        typedef typename hkTrait::If< PROCESS_NODES, processA_node<COMPACT_AND_COUNT>, processA_leaves<COMPACT_AND_COUNT> >::Type NodeStep;
        typedef typename hkTrait::If< HAS_FLUSH, queryHasFlush, queryHasNoFlush >::Type FlushStep;

        // Number of entries currently on the stack, derived from the pointers.
        hk_size_t pending = stack - stackBase;
        while ( pending )
        {
            if ( UNROLL && pending >= 2 )
            {
                // Pop the two topmost nodes; both are resolved before any processing because the
                // per-node step writes to the stack slots that have just been freed.
                stack -= 2;
                const hkcdSimdTree::Node* HK_RESTRICT lowerNode = &nodes[ stack[ 0 ] ];
                const hkcdSimdTree::Node* HK_RESTRICT upperNode = &nodes[ stack[ 1 ] ];
                NodeStep::process( aabbT, nodes, lowerNode, query, stack );
                NodeStep::process( aabbT, nodes, upperNode, query, stack );
            }
            else
            {
                // Pop the topmost node and process it.
                --stack;
                NodeStep::process( aabbT, nodes, &nodes[ *stack ], query, stack );
            }

            // Refresh the entry count after the pushes done by the per-node step.
            pending = stack - stackBase;

            // Give the query a chance to flush, when it supports it.
            FlushStep::flushIfNeeded( query );
        }
    }

    /// Trampoline method that selects the most appropriate implementation of COMPACT_AND_COUNT and calls the specialized aabbOverlaps_Internal with it.
    /// A local stack of hkcdSimdTree::UNARY_STACK_SIZE entries is seeded with \p root (pushed as-is, it is
    /// used directly as an index into the node array by aabbOverlaps_Internal) and then drained.
    /// TRACK_NODES = true selects the node-level processing step without unrolling; TRACK_NODES = false
    /// selects the leaf-level step with unrolling. The flush trampoline is always disabled here.
    template <bool TRACK_NODES, typename QUERY>
    void    aabbOverlaps_Trampoline( const hkcdSimdTree* tree, const TransposedAabb& aabbT, QUERY& query, hkUint32 root )
    {
        const hkcdSimdTree::Node* HK_RESTRICT   nodes = tree->m_nodes.begin();
        // The last element of the buffer holds the write barrier checked at the end of the function.
        hkUint32                buffer[ hkcdSimdTree::UNARY_STACK_SIZE ]; HKCD_SET_STACK_WRITE_BARRIER( buffer );
        hkUint32*               stack = buffer; *stack++ = root;

        // Template arguments are, in order: COMPACT_AND_COUNT, UNROLL, PROCESS_NODES, HAS_FLUSH.
        // The selection macro is expected to invoke call_aabbOverlaps_Internal with the chosen implementation.
        #define call_aabbOverlaps_Internal( _compact_and_count_ ) aabbOverlaps_Internal< _compact_and_count_, !TRACK_NODES, TRACK_NODES, false >(aabbT, nodes, query, buffer, stack)
        HK_VECTOR_SORT_SELECT_COMPACT_AND_COUNT( call_aabbOverlaps_Internal );
        #undef call_aabbOverlaps_Internal

        HKCD_CHECK_STACK_WRITE_BARRIER( buffer );
    }

    /// hkcdSimdTree::ProcessRayCastLeaves adapter for queries without batched leaf processing:
    /// every leaf is forwarded individually to QUERY::processSimdTreeLeaf.
    /// process() returns -1 once the query reports isDone(), otherwise the query's current fraction.
    template <typename QUERY, bool HAS_BATCH_SUPPORT = false>
    struct ProcessSimdTreeRayCastLeaves : public hkcdSimdTree::ProcessRayCastLeaves
    {
        HK_INLINE ProcessSimdTreeRayCastLeaves( QUERY& query )
        :   m_query( query )
        {
        }

        /// Static worker shared with the virtual entry point below.
        static HK_INLINE hkReal process( QUERY& query, const hkUint32* leaves, int numLeaves, const hkcdRay& ray )
        {
            for ( int leafIndex = 0; leafIndex < numLeaves; ++leafIndex )
            {
                query.processSimdTreeLeaf( leaves[ leafIndex ], ray );
            }
            return query.isDone() ? -1 : query.getFraction().getReal();
        }

        /// Virtual entry point: forwards to the static worker using the stored query.
        hkReal process( const hkUint32* leaves, int numLeaves, const hkcdRay& ray ) HK_OVERRIDE
        {
            return process( m_query, leaves, numLeaves, ray );
        }

        /// Query receiving the leaves; held by reference.
        QUERY& m_query;
    };

    /// hkcdSimdTree::ProcessRayCastLeaves adapter for queries with batched leaf processing:
    /// the whole batch is forwarded to QUERY::processSimdTreeLeaves and its result is returned unchanged.
    template <typename QUERY>
    struct ProcessSimdTreeRayCastLeaves<QUERY, true> : public hkcdSimdTree::ProcessRayCastLeaves
    {
        HK_INLINE ProcessSimdTreeRayCastLeaves( QUERY& query )
        :   m_query( query )
        {
        }

        /// Static worker shared with the virtual entry point below.
        static HK_INLINE hkReal process( QUERY& query, const hkUint32* leaves, int numLeaves, const hkcdRay& ray )
        {
            return query.processSimdTreeLeaves( leaves, numLeaves, ray );
        }

        /// Virtual entry point: forwards to the static worker using the stored query.
        hkReal process( const hkUint32* leaves, int numLeaves, const hkcdRay& ray ) HK_OVERRIDE
        {
            return process( m_query, leaves, numLeaves, ray );
        }

        /// Query receiving the leaves; held by reference.
        QUERY& m_query;
    };

    /// Value-returning wrapper around hkVector4::setMinI( x, y )
    /// (presumably a per-component minimum computed with integer comparisons - confirm).
    HK_ALWAYS_INLINE hkVector4 minI( const hkVector4& x, const hkVector4& y )
    {
        hkVector4 result;
        result.setMinI( x, y );
        return result;
    }

    /// Four-operand form of minI: reduces the pairs (x, y) and (z, w) first, then the two partial results.
    HK_ALWAYS_INLINE hkVector4 minI( const hkVector4& x, const hkVector4& y, const hkVector4& z, const hkVector4& w )
    {
        const hkVector4 minOfXY = minI( x, y );
        const hkVector4 minOfZW = minI( z, w );
        return minI( minOfXY, minOfZW );
    }

    /// Value-returning wrapper around hkVector4::setMaxI( x, y )
    /// (presumably a per-component maximum computed with integer comparisons - confirm).
    HK_ALWAYS_INLINE hkVector4 maxI( const hkVector4& x, const hkVector4& y )
    {
        hkVector4 result;
        result.setMaxI( x, y );
        return result;
    }

    /// Four-operand form of maxI: reduces the pairs (x, y) and (z, w) first, then the two partial results.
    HK_ALWAYS_INLINE hkVector4 maxI( const hkVector4& x, const hkVector4& y, const hkVector4& z, const hkVector4& w )
    {
        const hkVector4 maxOfXY = maxI( x, y );
        const hkVector4 maxOfZW = maxI( z, w );
        return maxI( maxOfXY, maxOfZW );
    }

    /// Ray cast against the four AABBs of \p node, with the near and far bound of every axis addressed
    /// by byte offset from the start of the node (the caller presumably picks them from the sign of the
    /// ray direction - confirm).
    /// \p fractionsOut receives the entry fractions (clamped from below with zero through maxI).
    /// The returned 4-bit mask has a bit set for every lane whose entry fraction does not exceed its
    /// exit fraction, the exit fraction being limited by \p fractionsIn.
    /// \p ray is not used by this variant.
    HK_INLINE int   rayCastFourAabbs( const hkcdRay& ray,
        int nearXoffset, int farXoffset,
        int nearYoffset, int farYoffset,
        int nearZoffset, int farZoffset,
        const hkFourTransposedPoints& orgT, const hkFourTransposedPoints& invDirT,
        const hkVector4& fractionsIn, const hkcdFourAabb* node,
        hkVector4* HK_RESTRICT fractionsOut )
    {
        const hkUint8* nodeBytes = reinterpret_cast<const hkUint8*>( node );

        // Bounds selected by the offsets.
        const hkVector4& nearBoundX = *reinterpret_cast<const hkVector4*>( nodeBytes + nearXoffset );
        const hkVector4& farBoundX  = *reinterpret_cast<const hkVector4*>( nodeBytes + farXoffset );
        const hkVector4& nearBoundY = *reinterpret_cast<const hkVector4*>( nodeBytes + nearYoffset );
        const hkVector4& farBoundY  = *reinterpret_cast<const hkVector4*>( nodeBytes + farYoffset );
        const hkVector4& nearBoundZ = *reinterpret_cast<const hkVector4*>( nodeBytes + nearZoffset );
        const hkVector4& farBoundZ  = *reinterpret_cast<const hkVector4*>( nodeBytes + farZoffset );

        // Fractions at which the ray crosses each bound: ( bound - origin ) * inverse direction.
        const hkVector4 tNearX = ( nearBoundX - orgT.m_vertices[ 0 ] ) * invDirT.m_vertices[ 0 ];
        const hkVector4 tFarX  = ( farBoundX - orgT.m_vertices[ 0 ] ) * invDirT.m_vertices[ 0 ];

        const hkVector4 tNearY = ( nearBoundY - orgT.m_vertices[ 1 ] ) * invDirT.m_vertices[ 1 ];
        const hkVector4 tFarY  = ( farBoundY - orgT.m_vertices[ 1 ] ) * invDirT.m_vertices[ 1 ];

        const hkVector4 tNearZ = ( nearBoundZ - orgT.m_vertices[ 2 ] ) * invDirT.m_vertices[ 2 ];
        const hkVector4 tFarZ  = ( farBoundZ - orgT.m_vertices[ 2 ] ) * invDirT.m_vertices[ 2 ];

        // Latest entry and earliest exit over the three axes.
        const hkVector4 tEntry = maxI( tNearX, tNearY, tNearZ, hkVector4::getZero() );
        const hkVector4 tExit  = minI( tFarX, tFarY, tFarZ, fractionsIn );

        *fractionsOut = tEntry;

        #if defined(HK_ENABLE_SSE_CODE_PATH) && HK_SSE_VERSION >= 0x42
        // Integer comparison of the raw bits: lanes where entry > exit are flagged, then the mask is inverted.
        return _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpgt_epi32( _mm_castps_si128( tEntry.m_quad ), _mm_castps_si128( tExit.m_quad ) ) ) ) ^ 15;
        #else
        return tEntry.lessEqual( tExit ).getMask();
        #endif
    }

    /// Ray cast against the four AABBs of \p node without any assumption on the ray direction signs.
    /// \p fractionsOut receives the entry fractions (clamped from below with zero).
    /// A lane of the returned comparison is set when its entry fraction does not exceed its exit
    /// fraction (the latter limited by \p fractionsIn) and the lane satisfies m_lx <= m_hx.
    /// \p ray is only used by the optional false-negative detection code.
    HK_INLINE hkVector4Comparison   rayCastFourAabbs( const hkcdRay& ray,
        const hkFourTransposedPoints& orgT, const hkFourTransposedPoints& invDirT,
        const hkVector4& fractionsIn, const hkcdFourAabb* node,
        hkVector4* HK_RESTRICT fractionsOut )
    {
        // For each axis: fractions at the low and high bound, then ordered into entry / exit.
        hkVector4 entryX, exitX;
        {
            hkVector4 relLow;   relLow.setSub( node->m_lx, orgT.m_vertices[ 0 ] );
            hkVector4 relHigh;  relHigh.setSub( node->m_hx, orgT.m_vertices[ 0 ] );
            hkVector4 tLow;     tLow.setMul( relLow, invDirT.m_vertices[ 0 ] );
            hkVector4 tHigh;    tHigh.setMul( relHigh, invDirT.m_vertices[ 0 ] );
            entryX.setMin( tLow, tHigh );
            exitX.setMax( tLow, tHigh );
        }

        hkVector4 entryY, exitY;
        {
            hkVector4 relLow;   relLow.setSub( node->m_ly, orgT.m_vertices[ 1 ] );
            hkVector4 relHigh;  relHigh.setSub( node->m_hy, orgT.m_vertices[ 1 ] );
            hkVector4 tLow;     tLow.setMul( relLow, invDirT.m_vertices[ 1 ] );
            hkVector4 tHigh;    tHigh.setMul( relHigh, invDirT.m_vertices[ 1 ] );
            entryY.setMin( tLow, tHigh );
            exitY.setMax( tLow, tHigh );
        }

        hkVector4 entryZ, exitZ;
        {
            hkVector4 relLow;   relLow.setSub( node->m_lz, orgT.m_vertices[ 2 ] );
            hkVector4 relHigh;  relHigh.setSub( node->m_hz, orgT.m_vertices[ 2 ] );
            hkVector4 tLow;     tLow.setMul( relLow, invDirT.m_vertices[ 2 ] );
            hkVector4 tHigh;    tHigh.setMul( relHigh, invDirT.m_vertices[ 2 ] );
            entryZ.setMin( tLow, tHigh );
            exitZ.setMax( tLow, tHigh );
        }

        // Latest entry over the axes, not earlier than zero.
        hkVector4 tEntry;
        tEntry.setMax( entryX, entryY );
        tEntry.setMax( tEntry, entryZ );
        tEntry.setMax( tEntry, hkVector4::getZero() );

        // Earliest exit over the axes, not later than the incoming fractions.
        hkVector4 tExit;
        tExit.setMin( exitX, exitY );
        tExit.setMin( tExit, exitZ );
        tExit.setMin( tExit, fractionsIn );

        // Lanes with m_lx > m_hx are rejected regardless of the fractions.
        hkVector4Comparison hits;
        hits.setAnd( tEntry.lessEqual( tExit ), node->m_lx.lessEqual( node->m_hx ) );

        *fractionsOut = tEntry;

        #if defined(HKCD_DETECT_RAY_AABB_FALSE_NEGATIVES)
        // Debug aid: re-test every rejected lane with the exact test and report disagreements.
        for ( int lane = 0; lane < 4; ++lane )
        {
            if ( 0 == ( hits.getMask() & ( 1 << lane ) ) )
            {
                hkAabb      laneAabb; node->getAabb( lane, &laneAabb );
                hkSimdReal  laneFraction = fractionsIn.getComponent( lane );
                const bool  exactHit = hcdIntersectRayAabb_Exact( ray, laneAabb, laneFraction );
                if ( exactHit ) HK_ERROR( 0xAF28F32A, "Ray-AABB false negative detected" );
            }
        }
        #endif

        return hits;
    }
}

template <typename QUERY>
void    hkcdSimdTree::rayCast( const hkcdRay& ray, QUERY& query, hkUint32 root ) const
{
    hkcdSimdTreeUtils::ProcessSimdTreeRayCastLeaves<QUERY, HasMethod_processSimdTreeLeaves<QUERY>::Any> processor( query );
    rayCast_OutOfLine( ray, query.getFraction().getReal(), processor, root );
}

// Cast an axis-aligned box of half extents halfExtents along ray through the tree, starting at root.
// Child bounds are grown by halfExtents on every side and ray cast against; leaves that are reached
// are passed to query.processSimdTreeLeaf together with the fraction at which their bounds were entered.
// The maximum fraction is re-read from query.getFraction() after every leaf.
// When sort is true, the closest of the (up to) four topmost stack entries is processed first.
// Returns immediately when query.isDone() is true on entry, and as soon as it becomes true after a leaf.
// NOTE(review): pushes are not bounds checked; the only protection is the write barrier in the last
// element of indexBuffer, and the early return after a leaf skips that barrier check.
template <typename QUERY>
inline void hkcdSimdTree::aabbCast( const hkcdRay & ray, const hkVector4 & halfExtents, QUERY & query, bool sort, hkUint32 root ) const
{
    if ( query.isDone() )
    {
        return;
    }

    // Replicated copies of the inputs, one vector per axis.
    hkFourTransposedPoints  halfExtentsT; halfExtentsT.setAll( halfExtents );
    hkFourTransposedPoints  orgT; orgT.setAll( ray.m_origin );
    hkFourTransposedPoints  invDirT; invDirT.setAll( ray.m_invDirection );

    hkVector4               maxFractions; maxFractions.setAll( query.getFraction() );

    // Two parallel stacks: packed node data (bit 0 set = leaf, remaining bits = index) and the
    // fraction at which the corresponding bounds were entered.
    HK_ALIGN16( hkUint32    indexBuffer[ UNARY_STACK_SIZE * 2 ] ); HKCD_SET_STACK_WRITE_BARRIER( indexBuffer );
    HK_ALIGN16( hkReal fractionBuffer[ UNARY_STACK_SIZE * 2 ] );
    int         stackSize = 1;

    // The root is pushed as an internal node (bit 0 clear) with fraction zero.
    indexBuffer[ 0 ] = root << 1;
    fractionBuffer[ 0 ] = 0;

    do
    {
        hkUint32 nodeData;
        hkReal nodeFraction;
        if ( sort )
        {
            // sort is true, pick the closest of the 4 last items from the stack.
            const int n = hkMath::min2( 4, stackSize );
            const hkReal* f = &fractionBuffer[ stackSize - n ];
            int minIndex = stackSize - n;
            switch ( n )
            {
            default: break;
            case 2: minIndex += f[ 0 ] < f[ 1 ] ? 0 : 1; break;
            // Four values are loaded in both cases; for n == 3 only the first three are considered.
            case 3: { hkVector4 v; v.load<4, HK_IO_NATIVE_ALIGNED>( f ); minIndex += v.getIndexOfMinComponent<3>(); } break;
            case 4: { hkVector4 v; v.load<4, HK_IO_NATIVE_ALIGNED>( f ); minIndex += v.getIndexOfMinComponent<4>(); } break;
            }

            // If the closest is further away than maxFractions, pop n items and continue.
            // (`continue` jumps to the loop condition, so the traversal ends if the stack is now empty.)
            if ( maxFractions( 0 ) < fractionBuffer[ minIndex ] ) { stackSize -= n; continue; }

            // Fetch index and exchange.
            // The selected entry is replaced by the topmost one, which removes it from the stack.
            nodeData = indexBuffer[ minIndex ];
            nodeFraction = fractionBuffer[ minIndex ];
            fractionBuffer[ minIndex ] = fractionBuffer[ --stackSize ];
            indexBuffer[ minIndex ] = indexBuffer[ stackSize ];
        }
        else
        {
            // sort is false, pick the top of the stack.
            nodeData = indexBuffer[ --stackSize ];
            nodeFraction = fractionBuffer[ stackSize ];
        }

        if ( 0 == ( nodeData & 1 ) )
        {
            // Internal.
            const Node*     node = &m_nodes[ int( nodeData >> 1 ) ];
            hkcdFourAabb    sum;

            // Grow the four child bounds by the half extents of the cast box.
            sum.m_lx.setSub( node->m_lx, halfExtentsT.m_vertices[ 0 ] );
            sum.m_hx.setAdd( node->m_hx, halfExtentsT.m_vertices[ 0 ] );

            sum.m_ly.setSub( node->m_ly, halfExtentsT.m_vertices[ 1 ] );
            sum.m_hy.setAdd( node->m_hy, halfExtentsT.m_vertices[ 1 ] );

            sum.m_lz.setSub( node->m_lz, halfExtentsT.m_vertices[ 2 ] );
            sum.m_hz.setAdd( node->m_hz, halfExtentsT.m_vertices[ 2 ] );

            hkVector4               hitFractions;
            hkVector4Comparison     hitMasks = hkcdSimdTreeUtils::rayCastFourAabbs( ray, orgT, invDirT, maxFractions, &sum, &hitFractions );

            const int mask = hitMasks.getMask();
            if ( mask )
            {
                if ( mask == 15 )
                {
                    // All four children hit: push them with two vector stores.
                    hitFractions.store<4, HK_IO_NATIVE_ALIGNED>( fractionBuffer + stackSize );
                    reinterpret_cast<const hkIntVector*>( node->m_data )->store<4, HK_IO_NATIVE_ALIGNED>( indexBuffer + stackSize );
                    stackSize += 4;
                }
                else
                {
                    // Partial hit: push the hit children one by one, in lane order.
                    #define PUSH_ELEMENT(_index_) { fractionBuffer[stackSize]= hitFractions(_index_); indexBuffer[stackSize++] = node->m_data[_index_]; }
                    if ( mask & 1 ) PUSH_ELEMENT( 0 );
                    if ( mask & 2 ) PUSH_ELEMENT( 1 );
                    if ( mask & 4 ) PUSH_ELEMENT( 2 );
                    if ( mask & 8 ) PUSH_ELEMENT( 3 );
                    #undef PUSH_ELEMENT
                }
            }
        }
        else
        {
            // Leaf.
            query.processSimdTreeLeaf( nodeData >> 1, nodeFraction, ray, halfExtents );
            // The query may have updated its fraction; refresh the limit used for the remaining nodes.
            maxFractions.setAll( query.getFraction() );
            if ( query.isDone() )
            {
                return;
            }
        }
    } while ( stackSize );
    HKCD_CHECK_STACK_WRITE_BARRIER( indexBuffer );
}

#endif // !defined(__HAVOK_PARSER__)

// AABB overlap query taking a regular hkAabb: builds a TransposedAabb from it and forwards to the
// TransposedAabb overload.
template <typename QUERY>
void    hkcdSimdTree::aabbOverlaps( const hkAabb& aabb, QUERY& query, hkUint32 root ) const
{
    const TransposedAabb    aabbT( aabb );
    aabbOverlaps( aabbT, query, root );
}

// AABB overlap query taking an already transposed AABB: runs the traversal trampoline with
// TRACK_NODES = false, starting at root.
template <typename QUERY>
void    hkcdSimdTree::aabbOverlaps( const TransposedAabb& aabbT, QUERY& query, hkUint32 root ) const
{
    hkcdSimdTreeUtils::aabbOverlaps_Trampoline<false>( this, aabbT, query, root );
}

// Node-tracking AABB overlap query: builds a TransposedAabb from aabb and runs the traversal
// trampoline with TRACK_NODES = true, starting at root.
template <typename QUERY>
void    hkcdSimdTree::aabbOverlapsNodes( const hkAabb& aabb, QUERY& query, hkUint32 root ) const
{
    const TransposedAabb    aabbT( aabb );
    hkcdSimdTreeUtils::aabbOverlaps_Trampoline<true>( this, aabbT, query, root );
}


// faster version, only works well with SSE 4.2
//
// Traverses the tree from root, passing every leaf that is reached to query.processSimdTreeLeaf( leaf, &nmp ),
// and shrinks nmp using the bounds of the children that were found separated from aabb.
// nmp must contain aabb on entry (asserted) and is asserted to still contain it on exit.
// NOTE(review): the traversal stops early, leaving the remaining stack entries unvisited, when the
// local stack is nearly full or when query.isDone() becomes true after a leaf.
template <typename QUERY>
void    hkcdSimdTree::aabbOverlapsNearMissNew( const hkAabb& aabb, QUERY& query, hkAabb& nmp, hkUint32 root ) const
{
    HK_ASSERT( 0xB18A8A81, nmp.contains( aabb ), "Invalid initial NMP" );

    const TransposedAabb    aabbT( aabb );

    // Accumulator laid out like a node's bounds; every entry starts at the HK_QUADREAL_MAX constant
    // and is only ever lowered with setMin below.
    hkcdFourAabb    nmpT;
    {
        hkVector4 inf = hkVector4::getConstant( HK_QUADREAL_MAX );
        nmpT.m_lx = inf; nmpT.m_ly = inf; nmpT.m_lz = inf;
        nmpT.m_hx = inf; nmpT.m_hy = inf; nmpT.m_hz = inf;
    }

    // Stack of packed node data (bit 0 set = leaf, remaining bits = index); the root is pushed as internal.
    hkUint32                buffer[ UNARY_STACK_SIZE ]; HKCD_SET_STACK_WRITE_BARRIER( buffer );
    hkUint32*               stack = buffer; *stack++ = root << 1;

    // Constants handed to injectBitsf: iAnd has the low 5 bits cleared and every iOr* lane carries a
    // distinct tag in 0x00..0x17. The tag is read back below as `value & 0x1f` and used as a float
    // index into the node and into nmpT.
    // Presumably injectBitsf computes ( bits & iAnd ) | iOr on the float representation - confirm.
    hkIntVector iAnd; iAnd.setAll( 0xffffffe0 );
    hkIntVector iOrLx; iOrLx.set( 0x00, 0x01, 0x02, 0x03 );
    hkIntVector iOrHx; iOrHx.set( 0x04, 0x05, 0x06, 0x07 );
    hkIntVector iOrLy; iOrLy.set( 0x08, 0x09, 0x0a, 0x0b );
    hkIntVector iOrHy; iOrHy.set( 0x0c, 0x0d, 0x0e, 0x0f );
    hkIntVector iOrLz; iOrLz.set( 0x10, 0x11, 0x12, 0x13 );
    hkIntVector iOrHz; iOrHz.set( 0x14, 0x15, 0x16, 0x17 );

    do
    {
        hkUint32    nodeData = *( --stack );
    // Re-entry point used to continue with the last child of a node without going through the stack.
    processNode: {}
        if ( 0 == ( nodeData & 1 ) )
        {
            // Internal.
            const Node*             node = &m_nodes[ int( nodeData >> 1 ) ];

            // Signed separation between aabb and each child, per axis and side (positive when the
            // two are disjoint on that side), tagged with the index of the node bound involved.
            hkVector4f  ms_lo_x = hkIntVector::injectBitsf( node->m_lx - aabbT.m_hx, iAnd, iOrLx );
            hkVector4f  ms_lo_y = hkIntVector::injectBitsf( node->m_ly - aabbT.m_hy, iAnd, iOrLy );
            hkVector4f  ms_lo_z = hkIntVector::injectBitsf( node->m_lz - aabbT.m_hz, iAnd, iOrLz );
            hkVector4f  ms_hi_x = hkIntVector::injectBitsf( aabbT.m_lx - node->m_hx, iAnd, iOrHx );
            hkVector4f  ms_hi_y = hkIntVector::injectBitsf( aabbT.m_ly - node->m_hy, iAnd, iOrHy );
            hkVector4f  ms_hi_z = hkIntVector::injectBitsf( aabbT.m_lz - node->m_hz, iAnd, iOrHz );

            // Reduce the six tagged values with setMaxI; ms_lo_x ends up holding one value (and tag) per child.
            ms_lo_x.setMaxI( ms_lo_x, ms_hi_x );    ms_lo_y.setMaxI( ms_lo_y, ms_hi_y );    ms_lo_z.setMaxI( ms_lo_z, ms_hi_z );
            ms_lo_x.setMaxI( ms_lo_x, ms_lo_y );    ms_lo_x.setMaxI( ms_lo_x, ms_lo_z );

            hkVector4 v = hkVec4_0;
            hkVector4 w = hkVec4_0;

            // v4 is the raw bit pattern of the reduced values; isMax moves bit 2 of every tag into the sign bit.
            hkIntVector v4; v4.loadAsFloat32BitRepresentation( ms_lo_x );
            hkIntVector isMax; isMax.setShiftRight32<2>( v4 );  isMax.setShiftLeft32<31>( isMax );  // now we have the indices of the most interesting distance, but only if positive

            if ( stack > &buffer[ UNARY_STACK_SIZE - 4 ] )    // don't risk a stack overflow, early out
            {
                break;
            }
            // The four children are handled by four copies of the same code, one per lane I.
            // value > 0: the child is treated as separated from aabb. Float k of the node (k = tag)
            // is loaded, combined with isMax through xorHighf (presumably a sign flip for the tags that
            // have bit 2 set - confirm) and folded into entry k of nmpT with setMin.
            // NOTE(review): indexing the node and nmpT as arrays of hkReal assumes that the tag values
            // match the memory layout of hkcdFourAabb - confirm.
            // Otherwise the child is pushed on the stack (children 0 to 2) or processed right away (child 3).
            {
                const int I = 0;
                hkInt32 value = (hkInt32) v4.getU32<I>();
                if ( value > 0 )
                {
                    int k = value & 0x1f;
                    v.load<1>( &( (const hkReal*) node )[ k ] );     w.load<1>( &( (const hkReal*) &nmpT )[ k ] );
                    //isMax.setPermutation<hkVectorPermutation::Permutation( 0x123 | 0x1000 * I )>( isMax );
                    hkIntVector::xorHighf( v, isMax, v );
                    v.setMin( v, w );
                    v.store<1>( &( (hkReal*) &nmpT )[ k ] );
                }
                else { *( stack++ ) = node->getChildData<I>(); }
            }
            {
                const int I = 1;
                hkInt32 value = (hkInt32) v4.getU32<I>();
                if ( value > 0 )
                {
                    int k = value & 0x1f;
                    v.load<1>( &( (const hkReal*) node )[ k ] );     w.load<1>( &( (const hkReal*) &nmpT )[ k ] );
                    // Bring lane I of isMax into the first lane, where the single loaded float lives.
                    isMax.setPermutation<hkVectorPermutation::Permutation( 0x123 | 0x1000 * I )>( isMax );
                    hkIntVector::xorHighf( v, isMax, v );
                    v.setMin( v, w );
                    v.store<1>( &( (hkReal*) &nmpT )[ k ] );
                }
                else { *( stack++ ) = node->getChildData<I>(); }
            }
            {
                const int I = 2;
                hkInt32 value = (hkInt32) v4.getU32<I>();
                if ( value > 0 )
                {
                    int k = value & 0x1f;
                    v.load<1>( &( (const hkReal*) node )[ k ] );     w.load<1>( &( (const hkReal*) &nmpT )[ k ] );
                    isMax.setPermutation<hkVectorPermutation::Permutation( 0x123 | 0x1000 * I )>( isMax );
                    hkIntVector::xorHighf( v, isMax, v );
                    v.setMin( v, w );
                    v.store<1>( &( (hkReal*) &nmpT )[ k ] );
                }
                else { *( stack++ ) = node->getChildData<I>(); }
            }
            {
                const int I = 3;
                hkInt32 value = (hkInt32) v4.getU32<I>();
                if ( value > 0 )
                {
                    int k = value & 0x1f;
                    v.load<1>( &( (const hkReal*) node )[ k ] );     w.load<1>( &( (const hkReal*) &nmpT )[ k ] );
                    isMax.setPermutation<hkVectorPermutation::Permutation( 0x123 | 0x1000 * I )>( isMax );
                    hkIntVector::xorHighf( v, isMax, v );
                    v.setMin( v, w );
                    v.store<1>( &( (hkReal*) &nmpT )[ k ] );
                }
                else
                {
                    // Last child: continue with it directly instead of pushing and popping it.
                    nodeData = node->getChildData<I>();
                    goto processNode;
                }
            }


        }
        else
        {
            // Leaf.
            query.processSimdTreeLeaf( nodeData >> 1, &nmp );
            if ( query.isDone() )
            {
                break;
            }
        }
    } while ( stack != buffer );

    // convert nmpT to normal aabb
    hkAabb internalNmp;
    {
        // Minimum over the four lanes of every high-bound accumulator, negated to obtain m_min.
        hkVector4 a = nmpT.m_hx;        hkVector4 b = nmpT.m_hy;
        hkVector4 c = nmpT.m_hz;        hkVector4 d = hkVec4_0;
        hkMath::transpose( a, b, c, d );

        a.setMin( a, b );  c.setMin( c, d );      a.setMin( a, c );
        internalNmp.m_min.setNeg<4>( a );

        // Minimum over the four lanes of every low-bound accumulator, used as m_max.
        a = nmpT.m_lx;  b = nmpT.m_ly;  c = nmpT.m_lz;  d = hkVec4_0;
        hkMath::transpose( a, b, c, d );

        a.setMin( a, b );   c.setMin( c, d );   a.setMin( a, c );
        internalNmp.m_max = a;
    }

    // The result can only shrink the caller's nmp.
    nmp.setIntersection( nmp, internalNmp );
    HK_ASSERT( 0x202AC02E, nmp.contains( aabb ), "Invalid NMP" );
    HKCD_CHECK_STACK_WRITE_BARRIER( buffer );
}


//
template <typename QUERY>
void    hkcdSimdTree::closestFromAabb( const hkAabb& aabb, hkSimdRealParameter initialDistanceSquared, QUERY& query, hkUint32 root ) const
{
    if ( query.isDone() )
    {
        return;
    }

    const hkVector4         zero( hkVector4::getZero() );
    const TransposedAabb    aabbT( aabb );
    hkVector4               distanceSquared; distanceSquared.setAll( initialDistanceSquared );

    hkUint32    buffer[ UNARY_STACK_SIZE ]; HKCD_SET_STACK_WRITE_BARRIER( buffer );
    hkUint32*   stack = buffer; *stack++ = root << 1;
    do
    {
        hkUint32    nodeData = *( --stack );
    processNode: {}
        if ( 0 == ( nodeData & 1 ) )
        {
            // Internal.
            const Node*             node = &m_nodes[ int( nodeData >> 1 ) ];

            hkVector4   ms_lx; ms_lx.setSub( aabbT.m_lx, node->m_hx );
            hkVector4   ms_ly; ms_ly.setSub( aabbT.m_ly, node->m_hy );
            hkVector4   ms_lz; ms_lz.setSub( aabbT.m_lz, node->m_hz );
            hkVector4   ms_hx; ms_hx.setSub( aabbT.m_hx, node->m_lx );
            hkVector4   ms_hy; ms_hy.setSub( aabbT.m_hy, node->m_ly );
            hkVector4   ms_hz; ms_hz.setSub( aabbT.m_hz, node->m_lz );

            hkVector4   px; px.setMax( zero, ms_lx ); px.setMin( px, ms_hx ); px.mul( px );
            hkVector4   py; py.setMax( zero, ms_ly ); py.setMin( py, ms_hy ); py.mul( py );
            hkVector4   pz; pz.setMax( zero, ms_lz ); pz.setMin( pz, ms_hz ); pz.mul( pz );
            hkVector4   d; d.setAdd( px, py ); d.add( pz );

            hkVector4Comparison c; c.setAnd( d.lessEqual( distanceSquared ), node->m_lx.lessEqual( node->m_hx ) );

            if ( c.anyIsSet<hkVector4ComparisonMask::MASK_XYZW>() )
            {
                HKCD_CHECK_STACK_PUSH( 3 );
                nodeData = hkcdSimdTreeUtils::sortPush( d, c.getMask(), stack, node );
                goto processNode;
            }
        }
        else
        {
            // Leaf.
            hkSimdReal  d = distanceSquared.getComponent<0>();
            query.processSimdTreeLeaf( nodeData >> 1, aabb, d );
            distanceSquared.setAll( d );
            if ( query.isDone() )
            {
                return;
            }
        }
    } while ( stack != buffer );
    HKCD_CHECK_STACK_WRITE_BARRIER( buffer );
}

//
// Depth-first traversal that reports to 'query' the leaves reachable through children whose box is
// within the current squared distance of 'point'.
//  - point                     : query position, also forwarded to QUERY::processSimdTreeLeaf.
//  - initialDistanceSquared    : squared distance the search starts with.
//  - query                     : must provide isDone() and processSimdTreeLeaf( leafIndex, point, distanceSquared ).
//                                The distance passed to processSimdTreeLeaf is read back afterwards and used
//                                for the rest of the traversal.
//  - root                      : index of the node the traversal starts from.
// Returns immediately once query.isDone() is true.
template <typename QUERY>
void    hkcdSimdTree::closestFromPoint( hkVector4Parameter point, hkSimdRealParameter initialDistanceSquared, QUERY& query, hkUint32 root ) const
{
    if ( query.isDone() ) { return; }

    hkFourTransposedPoints  pointT; pointT.setAll( point );
    hkVector4               maxDistSq; maxDistSq.setAll( initialDistanceSquared );

    // 'buffer' and 'stack' are referenced by name from the HKCD_* stack macros.
    hkUint32    buffer[ UNARY_STACK_SIZE ]; HKCD_SET_STACK_WRITE_BARRIER( buffer );
    hkUint32*   stack = buffer; *stack++ = root << 1;
    do
    {
        hkUint32    entry = *( --stack );
    processNode: {}
        if ( ( entry & 1 ) != 0 )
        {
            // Leaf entry (low bit set): let the query process it and pick up the possibly updated distance.
            hkSimdReal  leafDistSq = maxDistSq.getComponent<0>();
            query.processSimdTreeLeaf( entry >> 1, point, leafDistSq );
            maxDistSq.setAll( leafDistSq );
            if ( query.isDone() )
            {
                return;
            }
        }
        else
        {
            // Internal entry (low bit clear): evaluate the four children at once.
            const Node*         node = &m_nodes[ int( entry >> 1 ) ];
            const hkVector4&    qx = pointT.m_vertices[ 0 ];
            const hkVector4&    qy = pointT.m_vertices[ 1 ];
            const hkVector4&    qz = pointT.m_vertices[ 2 ];

            // Clamp the point onto each child box, then take the squared offset per axis.
            hkVector4   offX; offX.setMax( qx, node->m_lx ); offX.setMin( offX, node->m_hx ); offX.sub( qx ); offX.mul( offX );
            hkVector4   offY; offY.setMax( qy, node->m_ly ); offY.setMin( offY, node->m_hy ); offY.sub( qy ); offY.mul( offY );
            hkVector4   offZ; offZ.setMax( qz, node->m_lz ); offZ.setMin( offZ, node->m_hz ); offZ.sub( qz ); offZ.mul( offZ );

            hkVector4   childDistSq; childDistSq.setAdd( offX, offY ); childDistSq.add( offZ );

            // Keep children that are close enough and whose X extent is not inverted
            // (m_lx <= m_hx, the same test Node::isAllocated() applies).
            hkVector4Comparison visit;
            visit.setAnd( childDistSq.lessEqual( maxDistSq ), node->m_lx.lessEqual( node->m_hx ) );

            if ( visit.anyIsSet<hkVector4ComparisonMask::MASK_XYZW>() )
            {
                HKCD_CHECK_STACK_PUSH( 3 );
                // sortPush returns the entry to process next; it receives 'stack' for the remaining ones.
                entry = hkcdSimdTreeUtils::sortPush( childDistSq, visit.getMask(), stack, node );
                goto processNode;
            }
        }
    } while ( stack != buffer );
    HKCD_CHECK_STACK_WRITE_BARRIER( buffer );
}

//
// Two-pass variant of closestFromPoint().
//
// First pass: same traversal as closestFromPoint(), but every child lane rejected by the distance/validity
// test is recorded together with its node, and every leaf that did not add a hit to query.m_collector is
// recorded as well.
// Second pass: only runs if, once the stack is empty, the current squared distance is greater than the
// initial one (NOTE(review): this implies QUERY::processSimdTreeLeaf may enlarge the distance it is given -
// confirm against the QUERY implementations). The recorded lanes are re-tested against the new distance,
// the accepted ones are traversed (without recording anything this time), and finally the recorded leaves
// are handed to the query again.
//
//  - point                     : query position, also forwarded to QUERY::processSimdTreeLeaf.
//  - initialDistanceSquared    : squared distance the search starts with (kept in 'initialEOT').
//  - query                     : must provide isDone(), processSimdTreeLeaf( leafIndex, point, distanceSquared )
//                                and m_collector->getNumHits().
//  - root                      : index of the node the traversal starts from.
// Returns immediately once query.isDone() is true.
template <typename QUERY>
void    hkcdSimdTree::closestFromPointParticles( hkVector4Parameter point, hkSimdRealParameter initialDistanceSquared, QUERY& query, hkUint32 root ) const
{
    if ( query.isDone() )
    {
        return;
    }

    hkSimdReal initialEOT = initialDistanceSquared;

    // An internal node with the per-child squared distances computed for it and the set of child lanes
    // that were NOT traversed during the first pass.
    struct NodeAndDistanceSquared
    {
        const Node* m_node;
        hkVector4 m_distanceSquared;
        hkVector4Comparison m_comparison;

        HK_INLINE void set( const Node* node, hkVector4Parameter distanceSquared, hkVector4ComparisonParameter comparison )
        {
            m_node = node;
            m_distanceSquared = distanceSquared;
            m_comparison = comparison;
        }
    };

    // A leaf that produced no hit during the first pass. Despite its name, m_nodeData holds the leaf index
    // (stack entry >> 1), which is what processSimdTreeLeaf expects.
    struct LeafAndDistanceSquared
    {
        hkUint32 m_nodeData;
        hkReal m_distanceSquared;

        HK_INLINE void set( hkUint32 nodeData, hkReal distanceSquared )
        {
            m_nodeData = nodeData;
            m_distanceSquared = distanceSquared;
        }
    };

    hkInplaceArray<NodeAndDistanceSquared, 128> filteredOutNodesBasedOnEOT;
    hkInplaceArray<LeafAndDistanceSquared, 8> filteredOutLeavesBasedOnEOT;
    bool checkEOT = true;   // True during the first pass only: enables the recording above.

    hkFourTransposedPoints  pointT; pointT.setAll( point );
    hkVector4               distanceSquared; distanceSquared.setAll( initialDistanceSquared );

    hkUint32    buffer[ UNARY_STACK_SIZE ]; HKCD_SET_STACK_WRITE_BARRIER( buffer );
    hkUint32*   stack = buffer; *stack++ = root << 1;

    do
    {
    // Re-entry point used by the second pass once the stack has been refilled.
    getNode: hkUint32   nodeData = *( --stack );
    processNode: {}
        if ( 0 == ( nodeData & 1 ) )
        {
            // Internal.
            const Node*             node = &m_nodes[ int( nodeData >> 1 ) ];

            // Clamp the point onto each child box, then take the squared offset per axis.
            hkVector4   px; px.setMax( pointT.m_vertices[ 0 ], node->m_lx ); px.setMin( px, node->m_hx ); px.sub( pointT.m_vertices[ 0 ] ); px.mul( px );
            hkVector4   py; py.setMax( pointT.m_vertices[ 1 ], node->m_ly ); py.setMin( py, node->m_hy ); py.sub( pointT.m_vertices[ 1 ] ); py.mul( py );
            hkVector4   pz; pz.setMax( pointT.m_vertices[ 2 ], node->m_lz ); pz.setMin( pz, node->m_hz ); pz.sub( pointT.m_vertices[ 2 ] ); pz.mul( pz );
            hkVector4   d; d.setAdd( px, py ); d.add( pz );
            // Accepted lanes: close enough and with a non-inverted X extent (m_lx <= m_hx).
            hkVector4Comparison c; c.setAnd( d.lessEqual( distanceSquared ), node->m_lx.lessEqual( node->m_hx ) );

            if ( c.anyIsSet<hkVector4ComparisonMask::MASK_XYZW>() )
            {
                HKCD_CHECK_STACK_PUSH( 3 );
                nodeData = hkcdSimdTreeUtils::sortPush( d, c.getMask(), stack, node );

                // Some lanes were rejected: remember them (as the inverted mask) for the second pass.
                if ( checkEOT && !c.allAreSet<hkVector4ComparisonMask::MASK_XYZW>() )
                {
                    c.setNot( c );
                    filteredOutNodesBasedOnEOT.expandOne().set( node, d, c );
                }

                goto processNode;
            }
            else if ( checkEOT )
            {
                // Every lane was rejected: remember all of them.
                c.setNot( c );
                filteredOutNodesBasedOnEOT.expandOne().set( node, d, c );
            }
        }
        else
        {
            int oldNumHits = query.m_collector->getNumHits();

            // Leaf.
            hkSimdReal  d = distanceSquared.getComponent<0>();
            query.processSimdTreeLeaf( nodeData >> 1, point, d );

            // The hit count did not change: remember the leaf so it can be processed again later.
            if ( checkEOT && oldNumHits == query.m_collector->getNumHits() )
            {
                filteredOutLeavesBasedOnEOT.expandOne().set( nodeData >> 1, d.getReal() );
            }

            distanceSquared.setAll( d );
            if ( query.isDone() )
            {
                return;
            }
        }
    } while ( stack != buffer );

    if ( distanceSquared.getComponent<0>().isGreater( initialEOT ) )
    {
        if ( checkEOT )
        {
            checkEOT = false;

            for ( int i = 0; i < filteredOutNodesBasedOnEOT.getSize(); i++ )
            {
                // Re-test the recorded node against the new distance, restricted to the lanes skipped earlier.
                hkVector4Comparison comp; comp.setAnd( filteredOutNodesBasedOnEOT[ i ].m_distanceSquared.lessEqual( distanceSquared ), filteredOutNodesBasedOnEOT[ i ].m_node->m_lx.lessEqual( filteredOutNodesBasedOnEOT[ i ].m_node->m_hx ) );
                comp.setAnd( comp, filteredOutNodesBasedOnEOT[ i ].m_comparison );
                if ( comp.anyIsSet<hkVector4ComparisonMask::MASK_XYZW>() )
                {
                    // Up to four entries are written here (the ones sortPush stores plus the one it returns),
                    // and this loop can run once per recorded node, so guard the stack like the first pass does.
                    HKCD_CHECK_STACK_PUSH( 4 );
                    hkUint32 nd = hkcdSimdTreeUtils::sortPush( filteredOutNodesBasedOnEOT[ i ].m_distanceSquared, comp.getMask(), stack, filteredOutNodesBasedOnEOT[ i ].m_node );
                    *stack++ = nd;
                }
            }

            // Traverse what was just pushed. Control comes back to the enclosing distance test afterwards,
            // this time with checkEOT == false.
            if ( stack != buffer )
            {
                goto getNode;
            }
        }

        // Give the leaves that produced no hit another chance, using the distance stored with each of them.
        for ( int i = 0; i < filteredOutLeavesBasedOnEOT.getSize(); i++ )
        {
            hkSimdReal d = hkSimdReal::fromFloat( filteredOutLeavesBasedOnEOT[ i ].m_distanceSquared );
            query.processSimdTreeLeaf( filteredOutLeavesBasedOnEOT[ i ].m_nodeData, point, d );
        }
    }

    HKCD_CHECK_STACK_WRITE_BARRIER( buffer );
}

//
template <typename ACCEPT>
void    hkcdSimdTree::genericProcess( ACCEPT& accept, hkUint32 root )
{
    hkUint32    buffer[ UNARY_STACK_SIZE ]; HKCD_SET_STACK_WRITE_BARRIER( buffer );
    hkUint32*   stack = buffer; *stack++ = root << 1;
    do
    {
        const hkUint32  nodeData = *( --stack ); HK_ASSERT( 0x39F65FDE, ( nodeData & 1 ) == 0 && nodeData != 0, "Node must be internal and valid." );
        Node*           node = &m_nodes[ int( nodeData >> 1 ) ];
        hkIntVector     data; data.load<4, HK_IO_SIMD_ALIGNED>( node->m_data );
        hkIntVector     leafData; leafData.setAnd( data, hkIntVector::getConstant<HK_QUADINT_1>() );
        hkVector4Comparison internals; internals.setAnd( data.greaterZeroS32(), leafData.equalZeroS32() );
        const int       accepted = accept( node ) & internals.getMask();
        if ( accepted & 1 ) *stack++ = node->m_data[ 0 ];
        if ( accepted & 2 ) *stack++ = node->m_data[ 1 ];
        if ( accepted & 4 ) *stack++ = node->m_data[ 2 ];
        if ( accepted & 8 ) *stack++ = node->m_data[ 3 ];
    } while ( stack != buffer );
    HKCD_CHECK_STACK_WRITE_BARRIER( buffer );
}

//
template <typename ACCEPT>
void    hkcdSimdTree::genericProcess( ACCEPT& accept, hkUint32 root ) const
{
    hkUint32    buffer[ UNARY_STACK_SIZE ]; HKCD_SET_STACK_WRITE_BARRIER( buffer );
    hkUint32*   stack = buffer; *stack++ = root << 1;
    do
    {
        const hkUint32  nodeData = *( --stack ); HK_ASSERT( 0x39F65FDE, ( nodeData & 1 ) == 0 && nodeData != 0, "Node must be internal and valid." );
        const Node*     node = &m_nodes[ int( nodeData >> 1 ) ];
        hkIntVector     internals; internals.setAndNot( *(const hkIntVector*) node->m_data, hkIntVector::getConstant<HK_QUADINT_1>() );
        const int       accepted = accept( node ) & internals.greaterZeroS32().getMask();
        if ( accepted & 1 ) *stack++ = node->m_data[ 0 ];
        if ( accepted & 2 ) *stack++ = node->m_data[ 1 ];
        if ( accepted & 4 ) *stack++ = node->m_data[ 2 ];
        if ( accepted & 8 ) *stack++ = node->m_data[ 3 ];
    } while ( stack != buffer );
    HKCD_CHECK_STACK_WRITE_BARRIER( buffer );
}

//
// Writes to 'aabb' the box enclosing the four child boxes stored in the root node (m_nodes[1]):
// per-axis minimum of the lower bounds and maximum of the upper bounds. W is set to zero.
HK_INLINE void      hkcdSimdTree::getDomain( hkAabb* HK_RESTRICT aabb ) const
{
    const Node& rootNode = m_nodes[ 1 ];

    const hkSimdReal    lowX = rootNode.m_lx.horizontalMin<4>();
    const hkSimdReal    lowY = rootNode.m_ly.horizontalMin<4>();
    const hkSimdReal    lowZ = rootNode.m_lz.horizontalMin<4>();
    aabb->m_min.set( lowX, lowY, lowZ, hkSimdReal_0 );

    const hkSimdReal    highX = rootNode.m_hx.horizontalMax<4>();
    const hkSimdReal    highY = rootNode.m_hy.horizontalMax<4>();
    const hkSimdReal    highZ = rootNode.m_hz.horizontalMax<4>();
    aabb->m_max.set( highX, highY, highZ, hkSimdReal_0 );
}

//
// Returns the size in bytes of this object plus the storage of all its nodes
// (sizeof( Node ) times the current number of nodes).
HK_INLINE int           hkcdSimdTree::getMemoryFootPrint() const
{
    return int( sizeof( Node ) * m_nodes.getSize() + sizeof( *this ) );
}

//
// The tree is empty when its root node (m_nodes[1]) is not allocated.
HK_INLINE bool      hkcdSimdTree::isEmpty() const
{
    const Node& rootNode = m_nodes[ 1 ];
    if ( rootNode.isAllocated() )
    {
        return false;
    }
    return true;
}

//
// Resets the tree to two cleared nodes: index 0 and the root at index 1.
inline void hkcdSimdTree::clear()
{
    const int numReservedNodes = 2;
    m_nodes.setSize( numReservedNodes );
    for ( int nodeIndex = 0; nodeIndex < numReservedNodes; ++nodeIndex )
    {
        m_nodes[ nodeIndex ].clear();
    }
}

//
// hkcdSimdTree::Node
//

//
// A node is allocated when at least one of its four child slots has a non-inverted X extent (m_lx <= m_hx).
HK_INLINE hkBool32 hkcdSimdTree::Node::isAllocated() const
{
    const hkVector4Comparison slotsInUse = m_lx.lessEqual( m_hx );
    return slotsInUse.anyIsSet<hkVector4ComparisonMask::MASK_XYZW>();
}

//
// hkcdSimdTree::TransposedAabb
//

//
// Replicates box number INDEX of 'aabbs' into all four lanes of this transposed box.
template <int INDEX>
HK_INLINE   void hkcdSimdTree::TransposedAabb::splat( const hkcdFourAabb& aabbs )
{
    // Lower bounds.
    m_lx.setBroadcast<INDEX>( aabbs.m_lx );
    m_ly.setBroadcast<INDEX>( aabbs.m_ly );
    m_lz.setBroadcast<INDEX>( aabbs.m_lz );

    // Upper bounds.
    m_hx.setBroadcast<INDEX>( aabbs.m_hx );
    m_hy.setBroadcast<INDEX>( aabbs.m_hy );
    m_hz.setBroadcast<INDEX>( aabbs.m_hz );
}

//
// Run-time indexed version of splat<INDEX>(). Any index other than 0, 1 or 2 selects box 3.
HK_INLINE   void hkcdSimdTree::TransposedAabb::splat( const hkcdFourAabb& aabbs, int index )
{
    if ( index == 0 )
    {
        splat<0>( aabbs );
    }
    else if ( index == 1 )
    {
        splat<1>( aabbs );
    }
    else if ( index == 2 )
    {
        splat<2>( aabbs );
    }
    else
    {
        splat<3>( aabbs );
    }
}

//
// Replicates a single box into all four lanes: each component of aabb.m_min / aabb.m_max is broadcast
// to the matching per-axis vector.
HK_INLINE void hkcdSimdTree::TransposedAabb::splat( const hkAabb& aabb )
{
    // X axis.
    m_lx.setBroadcast<0>( aabb.m_min );
    m_hx.setBroadcast<0>( aabb.m_max );

    // Y axis.
    m_ly.setBroadcast<1>( aabb.m_min );
    m_hy.setBroadcast<1>( aabb.m_max );

    // Z axis.
    m_lz.setBroadcast<2>( aabb.m_min );
    m_hz.setBroadcast<2>( aabb.m_max );
}

//
// Writes to 'aabb' the box common to the four lanes: per-axis maximum of the lower bounds and
// minimum of the upper bounds. W is set to zero.
HK_INLINE void hkcdSimdTree::TransposedAabb::gatherInner( hkAabb* HK_RESTRICT aabb ) const
{
    const hkSimdReal    lowX = m_lx.horizontalMax<4>();
    const hkSimdReal    lowY = m_ly.horizontalMax<4>();
    const hkSimdReal    lowZ = m_lz.horizontalMax<4>();
    aabb->m_min.set( lowX, lowY, lowZ, hkSimdReal_0 );

    const hkSimdReal    highX = m_hx.horizontalMin<4>();
    const hkSimdReal    highY = m_hy.horizontalMin<4>();
    const hkSimdReal    highZ = m_hz.horizontalMin<4>();
    aabb->m_max.set( highX, highY, highZ, hkSimdReal_0 );
}

//
// Writes to 'aabb' the box enclosing the four lanes: per-axis minimum of the lower bounds and
// maximum of the upper bounds. W is set to zero.
HK_INLINE void hkcdSimdTree::TransposedAabb::gatherOuter( hkAabb* HK_RESTRICT aabb ) const
{
    const hkSimdReal    lowX = m_lx.horizontalMin<4>();
    const hkSimdReal    lowY = m_ly.horizontalMin<4>();
    const hkSimdReal    lowZ = m_lz.horizontalMin<4>();
    aabb->m_min.set( lowX, lowY, lowZ, hkSimdReal_0 );

    const hkSimdReal    highX = m_hx.horizontalMax<4>();
    const hkSimdReal    highY = m_hy.horizontalMax<4>();
    const hkSimdReal    highZ = m_hz.horizontalMax<4>();
    aabb->m_max.set( highX, highY, highZ, hkSimdReal_0 );
}

/*
 * Havok SDK - Product file, BUILD(#20180110)
 * 
 * Confidential Information of Microsoft Corporation.
 * Not for disclosure or distribution without Microsoft's prior written
 * consent.  This software contains code, techniques and know-how which
 * is confidential and proprietary to Microsoft.  Product and Trade Secret
 * source code contains trade secrets of Microsoft.  Havok Software (C)
 * Copyright 1999-2018 Microsoft Corporation.
 * All Rights Reserved. Use of this software is subject to the
 * terms of an end user license agreement.
 * 
 * The Havok Logo, and the Havok buzzsaw logo are trademarks of Microsoft.
 * Title, ownership rights, and intellectual property rights in the Havok
 * software remain in Microsoft and/or its suppliers.
 * 
 * Use of this software for evaluation purposes is subject to and
 * indicates acceptance of the End User licence Agreement for this
 * product. A copy of the license is included with this software and is
 * also available from Havok Support.
 * 
 */
