// TKBMS v1.0 -----------------------------------------------------
//
// PLATFORM   : ALL
// PRODUCT   : COMMON
// VISIBILITY   : PUBLIC
//
// ------------------------------------------------------TKBMS v1.0


#include <Common/Base/hkBase.h>
#include <Common/Base/Algorithm/Sort/hkRadixSort.h>

// Sorts the numObjects entries of 'data' into ascending m_key order, using 'buffer' as scratch space.
//
// This is a two-digit, least-significant-digit-first radix sort:
//   1. one counting sweep over 'data' builds a histogram for each of the two key digits,
//   2. the first scatter copies 'data' into 'buffer' grouped by digit HK_RADIX_SORT_KEY16_0,
//   3. the second scatter copies 'buffer' back into 'data' grouped by digit HK_RADIX_SORT_KEY16_1.
// Both scatters keep entries with equal digits in their incoming order, and the sorted
// sequence is left in 'data'.
//
// numObjects must be a multiple of 4 (asserted). 'buffer' must have room for numObjects entries.
void HK_CALL hkRadixSort::sort16(_Inout_updates_(numObjects) SortData16* data, int numObjects, _Inout_updates_(numObjects) SortData16* buffer)
{
    // Entries are consumed in groups of this size; the assert below guarantees that every group is complete.
    const int increment = 4;

    HK_ASSERT(0xf0e591df, (numObjects & (increment - 1)) == 0, "You can only sort an array with a multiple of 4 size");

    // One histogram per key digit: lowCount[v] / highCount[v] is the number of entries whose
    // HK_RADIX_SORT_KEY16_0 / HK_RADIX_SORT_KEY16_1 digit equals v.
    HK_ALIGN16(int lowCount[HK_RADIX_SORT_NUM_TABLES]);
    HK_ALIGN16(int highCount[HK_RADIX_SORT_NUM_TABLES]);
    hkString::memClear16(lowCount, HK_RADIX_SORT_NUM_TABLES * sizeof(int) / 16);
    hkString::memClear16(highCount, HK_RADIX_SORT_NUM_TABLES * sizeof(int) / 16);

    //
    // Counting sweep: fill both histograms in a single walk over the input.
    //
    for (int first = 0; first < numObjects; first += increment)
    {
        // OACR cannot deduce that numObjects is a multiple of 4, so state that the group is complete.
        _Analysis_assume_(first + 4 <= numObjects);

        const SortData16* HK_RESTRICT group = data + first;
        const SortData16* HK_RESTRICT scratch = buffer + first;

        // Prefetch hints for both arrays, 1024 bytes past the current group.
        hkMath::prefetch128(hkAddByteOffsetConst(group, 1024));
        hkMath::prefetch128(hkAddByteOffsetConst(scratch, 1024));

        for (int k = 0; k < increment; ++k)
        {
            const int lowDigit = group[k].m_keys[HK_RADIX_SORT_KEY16_0];
            const int highDigit = group[k].m_keys[HK_RADIX_SORT_KEY16_1];
            lowCount[lowDigit] += 1;
            highCount[highDigit] += 1;
        }
    }

    //
    // Write cursors: lowCursor[v] is the slot in 'buffer' that receives the next entry with low
    // digit v, highCursor[v] the slot in 'data' that receives the next entry with high digit v.
    // Every bucket starts where the previous one ends (running sum of the histogram).
    //
    SortData16* lowCursor[HK_RADIX_SORT_NUM_TABLES];
    SortData16* highCursor[HK_RADIX_SORT_NUM_TABLES];
    lowCursor[0] = buffer;
    highCursor[0] = data;
    for (int v = 0; v + 1 < HK_RADIX_SORT_NUM_TABLES; ++v)
    {
        lowCursor[v + 1] = lowCursor[v] + lowCount[v];
        highCursor[v + 1] = highCursor[v] + highCount[v];
    }

    //
    // First scatter: data -> buffer, grouped by the low digit.
    //
    {
        const SortData16* HK_RESTRICT from = data;
        for (int first = 0; first < numObjects; first += increment)
        {
            // OACR cannot deduce that numObjects is a multiple of 4, so state that the group is complete.
            _Analysis_assume_(first + 4 <= numObjects);

            for (int k = first; k < first + increment; ++k)
            {
                const int bucket = from[k].m_keys[HK_RADIX_SORT_KEY16_0];
                *(lowCursor[bucket]++) = from[k];
            }
        }
    }

    //
    // Second scatter: buffer -> data, grouped by the high digit.
    //
    {
        const SortData16* HK_RESTRICT from = buffer;
        for (int first = 0; first < numObjects; first += increment)
        {
            // OACR cannot deduce that numObjects is a multiple of 4, so state that the group is complete.
            _Analysis_assume_(first + 4 <= numObjects);

            for (int k = first; k < first + increment; ++k)
            {
                const int bucket = from[k].m_keys[HK_RADIX_SORT_KEY16_1];
                *(highCursor[bucket]++) = from[k];
            }
        }
    }

#ifdef HK_DEBUG_SLOW
    // Verify that every adjacent pair of the result is in non-decreasing m_key order.
    for (int i = 1; i < numObjects; ++i)
    {
        HK_ASSERT_NO_MSG(0x23502af3, data[i - 1].m_key <= data[i].m_key);
    }
#endif
}

// Sorts the numObjects entries of 'data' into ascending m_key order, using 'buffer' as scratch space.
//
// This is an eight-digit, least-significant-digit-first radix sort. One counting sweep over 'data'
// builds a histogram for each of the eight key digits (HK_RADIX_SORT_KEY64_0 .. HK_RADIX_SORT_KEY64_7).
// Eight scatter passes follow, one per digit: even passes copy 'data' into 'buffer', odd passes copy
// 'buffer' back into 'data'. Each pass keeps entries with equal digits in their incoming order, and
// the last pass (digit 7) writes into 'data', which therefore holds the sorted sequence.
//
// numObjects must be a multiple of 4 (asserted). 'buffer' must have room for numObjects entries.
//
// NOTE(review): the suppression below presumably targets MSVC code-analysis warning C6262 (large
// stack usage); all eight histograms and all eight cursor tables are stack arrays - confirm.
HK_DETAIL_DIAG_MSVC_SUPPRESS(6262)
void HK_CALL hkRadixSort::sort64(_Inout_updates_(numObjects) SortData64* data, int numObjects, _Inout_updates_(numObjects) SortData64* buffer)
{
    // Entries are consumed in groups of this size; the assert below guarantees that every group is complete.
    const int increment = 4;

    HK_ASSERT(0xf0e591de, (numObjects & (increment - 1)) == 0, "You can only sort an array with a multiple of 4 size");

    // One histogram per key digit: countN[v] is the number of entries whose HK_RADIX_SORT_KEY64_N digit equals v.
    HK_ALIGN16(int count0[HK_RADIX_SORT_NUM_TABLES]);
    HK_ALIGN16(int count1[HK_RADIX_SORT_NUM_TABLES]);
    HK_ALIGN16(int count2[HK_RADIX_SORT_NUM_TABLES]);
    HK_ALIGN16(int count3[HK_RADIX_SORT_NUM_TABLES]);
    HK_ALIGN16(int count4[HK_RADIX_SORT_NUM_TABLES]);
    HK_ALIGN16(int count5[HK_RADIX_SORT_NUM_TABLES]);
    HK_ALIGN16(int count6[HK_RADIX_SORT_NUM_TABLES]);
    HK_ALIGN16(int count7[HK_RADIX_SORT_NUM_TABLES]);
    hkString::memClear16(count0, HK_RADIX_SORT_NUM_TABLES * sizeof(int) / 16);
    hkString::memClear16(count1, HK_RADIX_SORT_NUM_TABLES * sizeof(int) / 16);
    hkString::memClear16(count2, HK_RADIX_SORT_NUM_TABLES * sizeof(int) / 16);
    hkString::memClear16(count3, HK_RADIX_SORT_NUM_TABLES * sizeof(int) / 16);
    hkString::memClear16(count4, HK_RADIX_SORT_NUM_TABLES * sizeof(int) / 16);
    hkString::memClear16(count5, HK_RADIX_SORT_NUM_TABLES * sizeof(int) / 16);
    hkString::memClear16(count6, HK_RADIX_SORT_NUM_TABLES * sizeof(int) / 16);
    hkString::memClear16(count7, HK_RADIX_SORT_NUM_TABLES * sizeof(int) / 16);

    //
    // Counting sweep: fill all eight histograms in a single walk over the input.
    //
    for (int first = 0; first < numObjects; first += increment)
    {
        // OACR cannot deduce that numObjects is a multiple of 4, so state that the group is complete.
        _Analysis_assume_(first + 4 <= numObjects);

        const SortData64* HK_RESTRICT group = data + first;
        const SortData64* HK_RESTRICT scratch = buffer + first;

        // Prefetch hints for both arrays, 1024 bytes past the current group.
        hkMath::prefetch128(hkAddByteOffsetConst(group, 1024));
        hkMath::prefetch128(hkAddByteOffsetConst(scratch, 1024));

        for (int k = 0; k < increment; ++k)
        {
            const int digit0 = group[k].m_keys[HK_RADIX_SORT_KEY64_0];
            const int digit1 = group[k].m_keys[HK_RADIX_SORT_KEY64_1];
            const int digit2 = group[k].m_keys[HK_RADIX_SORT_KEY64_2];
            const int digit3 = group[k].m_keys[HK_RADIX_SORT_KEY64_3];
            const int digit4 = group[k].m_keys[HK_RADIX_SORT_KEY64_4];
            const int digit5 = group[k].m_keys[HK_RADIX_SORT_KEY64_5];
            const int digit6 = group[k].m_keys[HK_RADIX_SORT_KEY64_6];
            const int digit7 = group[k].m_keys[HK_RADIX_SORT_KEY64_7];
            count0[digit0] += 1;
            count1[digit1] += 1;
            count2[digit2] += 1;
            count3[digit3] += 1;
            count4[digit4] += 1;
            count5[digit5] += 1;
            count6[digit6] += 1;
            count7[digit7] += 1;
        }
    }

    //
    // Write cursors: cursorN[v] is the slot that receives the next entry whose digit N equals v.
    // Even passes write into 'buffer', odd passes write into 'data'. Every bucket starts where
    // the previous one ends (running sum of the matching histogram).
    //
    SortData64* cursor0[HK_RADIX_SORT_NUM_TABLES];
    SortData64* cursor1[HK_RADIX_SORT_NUM_TABLES];
    SortData64* cursor2[HK_RADIX_SORT_NUM_TABLES];
    SortData64* cursor3[HK_RADIX_SORT_NUM_TABLES];
    SortData64* cursor4[HK_RADIX_SORT_NUM_TABLES];
    SortData64* cursor5[HK_RADIX_SORT_NUM_TABLES];
    SortData64* cursor6[HK_RADIX_SORT_NUM_TABLES];
    SortData64* cursor7[HK_RADIX_SORT_NUM_TABLES];
    cursor0[0] = buffer;
    cursor1[0] = data;
    cursor2[0] = buffer;
    cursor3[0] = data;
    cursor4[0] = buffer;
    cursor5[0] = data;
    cursor6[0] = buffer;
    cursor7[0] = data;
    for (int v = 0; v + 1 < HK_RADIX_SORT_NUM_TABLES; ++v)
    {
        cursor0[v + 1] = cursor0[v] + count0[v];
        cursor1[v + 1] = cursor1[v] + count1[v];
        cursor2[v + 1] = cursor2[v] + count2[v];
        cursor3[v + 1] = cursor3[v] + count3[v];
        cursor4[v + 1] = cursor4[v] + count4[v];
        cursor5[v + 1] = cursor5[v] + count5[v];
        cursor6[v + 1] = cursor6[v] + count6[v];
        cursor7[v + 1] = cursor7[v] + count7[v];
    }

    // Pass 0: data -> buffer, grouped by digit HK_RADIX_SORT_KEY64_0.
    {
        const SortData64* HK_RESTRICT from = data;
        for (int first = 0; first < numObjects; first += increment)
        {
            // OACR cannot deduce that numObjects is a multiple of 4, so state that the group is complete.
            _Analysis_assume_(first + 4 <= numObjects);

            for (int k = first; k < first + increment; ++k)
            {
                const int bucket = from[k].m_keys[HK_RADIX_SORT_KEY64_0];
                *(cursor0[bucket]++) = from[k];
            }
        }
    }

    // Pass 1: buffer -> data, grouped by digit HK_RADIX_SORT_KEY64_1.
    {
        const SortData64* HK_RESTRICT from = buffer;
        for (int first = 0; first < numObjects; first += increment)
        {
            // OACR cannot deduce that numObjects is a multiple of 4, so state that the group is complete.
            _Analysis_assume_(first + 4 <= numObjects);

            for (int k = first; k < first + increment; ++k)
            {
                const int bucket = from[k].m_keys[HK_RADIX_SORT_KEY64_1];
                *(cursor1[bucket]++) = from[k];
            }
        }
    }

    // Pass 2: data -> buffer, grouped by digit HK_RADIX_SORT_KEY64_2.
    {
        const SortData64* HK_RESTRICT from = data;
        for (int first = 0; first < numObjects; first += increment)
        {
            // OACR cannot deduce that numObjects is a multiple of 4, so state that the group is complete.
            _Analysis_assume_(first + 4 <= numObjects);

            for (int k = first; k < first + increment; ++k)
            {
                const int bucket = from[k].m_keys[HK_RADIX_SORT_KEY64_2];
                *(cursor2[bucket]++) = from[k];
            }
        }
    }

    // Pass 3: buffer -> data, grouped by digit HK_RADIX_SORT_KEY64_3.
    {
        const SortData64* HK_RESTRICT from = buffer;
        for (int first = 0; first < numObjects; first += increment)
        {
            // OACR cannot deduce that numObjects is a multiple of 4, so state that the group is complete.
            _Analysis_assume_(first + 4 <= numObjects);

            for (int k = first; k < first + increment; ++k)
            {
                const int bucket = from[k].m_keys[HK_RADIX_SORT_KEY64_3];
                *(cursor3[bucket]++) = from[k];
            }
        }
    }

    // Pass 4: data -> buffer, grouped by digit HK_RADIX_SORT_KEY64_4.
    {
        const SortData64* HK_RESTRICT from = data;
        for (int first = 0; first < numObjects; first += increment)
        {
            // OACR cannot deduce that numObjects is a multiple of 4, so state that the group is complete.
            _Analysis_assume_(first + 4 <= numObjects);

            for (int k = first; k < first + increment; ++k)
            {
                const int bucket = from[k].m_keys[HK_RADIX_SORT_KEY64_4];
                *(cursor4[bucket]++) = from[k];
            }
        }
    }

    // Pass 5: buffer -> data, grouped by digit HK_RADIX_SORT_KEY64_5.
    {
        const SortData64* HK_RESTRICT from = buffer;
        for (int first = 0; first < numObjects; first += increment)
        {
            // OACR cannot deduce that numObjects is a multiple of 4, so state that the group is complete.
            _Analysis_assume_(first + 4 <= numObjects);

            for (int k = first; k < first + increment; ++k)
            {
                const int bucket = from[k].m_keys[HK_RADIX_SORT_KEY64_5];
                *(cursor5[bucket]++) = from[k];
            }
        }
    }

    // Pass 6: data -> buffer, grouped by digit HK_RADIX_SORT_KEY64_6.
    {
        const SortData64* HK_RESTRICT from = data;
        for (int first = 0; first < numObjects; first += increment)
        {
            // OACR cannot deduce that numObjects is a multiple of 4, so state that the group is complete.
            _Analysis_assume_(first + 4 <= numObjects);

            for (int k = first; k < first + increment; ++k)
            {
                const int bucket = from[k].m_keys[HK_RADIX_SORT_KEY64_6];
                *(cursor6[bucket]++) = from[k];
            }
        }
    }

    // Pass 7: buffer -> data, grouped by digit HK_RADIX_SORT_KEY64_7. This leaves the final order in 'data'.
    {
        const SortData64* HK_RESTRICT from = buffer;
        for (int first = 0; first < numObjects; first += increment)
        {
            // OACR cannot deduce that numObjects is a multiple of 4, so state that the group is complete.
            _Analysis_assume_(first + 4 <= numObjects);

            for (int k = first; k < first + increment; ++k)
            {
                const int bucket = from[k].m_keys[HK_RADIX_SORT_KEY64_7];
                *(cursor7[bucket]++) = from[k];
            }
        }
    }

#ifdef HK_DEBUG_SLOW
    // Verify that every adjacent pair of the result is in non-decreasing m_key order.
    for (int i = 1; i < numObjects; ++i)
    {
        HK_ASSERT_NO_MSG(0x23502af3, data[i - 1].m_key <= data[i].m_key);
    }
#endif
}

// Specialization of the generic sort() entry point for SortData16 entries.
// Forwards all three arguments unchanged to sort16().
template<>
void HK_CALL hkRadixSort::sort<hkRadixSort::SortData16>(
    hkRadixSort::SortData16* data,
    int numObjects,
    hkRadixSort::SortData16* buffer)
{
    sort16(data, numObjects, buffer);
}
// Specialization of the generic sort() entry point for SortData32 entries.
// Forwards all three arguments unchanged to sort32(), which is not defined in this file.
template<>
void HK_CALL hkRadixSort::sort<hkRadixSort::SortData32>(
    hkRadixSort::SortData32* data,
    int numObjects,
    hkRadixSort::SortData32* buffer)
{
    sort32(data, numObjects, buffer);
}
// Specialization of the generic sort() entry point for SortData64 entries.
// Forwards all three arguments unchanged to sort64().
template<>
void HK_CALL hkRadixSort::sort<hkRadixSort::SortData64>(
    hkRadixSort::SortData64* data,
    int numObjects,
    hkRadixSort::SortData64* buffer)
{
    sort64(data, numObjects, buffer);
}

// template class hkRadixSort< hkRadixSort::SortData16 >;
// template class hkRadixSort< hkRadixSort::SortData32 >;
// template class hkRadixSort< hkRadixSort::SortData64 >;

/*
 * Havok SDK - Base file, BUILD(#20180110)
 * 
 * Confidential Information of Microsoft Corporation.
 * Not for disclosure or distribution without Microsoft's prior written
 * consent.  This software contains code, techniques and know-how which
 * is confidential and proprietary to Microsoft.  Product and Trade Secret
 * source code contains trade secrets of Microsoft.  Havok Software (C)
 * Copyright 1999-2018 Microsoft Corporation.
 * All Rights Reserved. Use of this software is subject to the
 * terms of an end user license agreement.
 * 
 * The Havok Logo, and the Havok buzzsaw logo are trademarks of Microsoft.
 * Title, ownership rights, and intellectual property rights in the Havok
 * software remain in Microsoft and/or its suppliers.
 * 
 * Use of this software for evaluation purposes is subject to and
 * indicates acceptance of the End User licence Agreement for this
 * product. A copy of the license is included with this software and is
 * also available from Havok Support.
 * 
 */
