// TKBMS v1.0 -----------------------------------------------------
//
// PLATFORM     : ALL
// PRODUCT      : COMMON
// VISIBILITY       : PUBLIC
//
// ------------------------------------------------------TKBMS v1.0

#include <Common/ImageUtilities/hkImageUtilities.h>

#include <Common/ImageUtilities/Conversion/hkImageConversion.h>

#include <Common/ImageUtilities/Image/hkImageFormat.h>
#include <Common/ImageUtilities/Image/hkImage.h>
#include <Common/Base/Container/Hash/hkHashMap.h>
#include <Common/Base/System/hkBaseSystem.h>
#include <Common/Base/Math/Vector/hkIntVector.h>
#include <Common/Base/Types/Color/hkColor.h>
#include <Common/Base/Thread/Pool/hkCpuThreadPool.h>
#include <Common/Base/Thread/TaskQueue/hkTask.h>
#include <Common/Base/Thread/TaskQueue/Default/hkDefaultTaskQueue.h>
#include <Common/Base/Thread/Concurrency/hkConcurrency.h>

// Expands the enumerable-class implementation macro for hkImageConversionStep.
// NOTE(review): presumably this defines the static instance list that is walked via
// getFirstInstance()/getNextInstance() when the conversion table is rebuilt - confirm against the macro.
HK_ENUMERABLE_CLASS_IMPLEMENTATION(hkImageConversionStep);

namespace
{
    struct TableEntry
    {
        TableEntry()
        {
            m_step = nullptr;
            m_sourceFormat = hkImageFormat::INVALID;
            m_targetFormat = hkImageFormat::INVALID;
            m_flags = hkImageConversionFlags::None;
            m_cost = HK_FLOAT_MAX;
        }

        TableEntry(_In_opt_ const hkImageConversionStep* pStep, const hkImageConversionEntry& entry)
        {
            m_step = pStep;
            m_sourceFormat = entry.m_sourceFormat;
            m_targetFormat = entry.m_targetFormat;

            hkUint32 sourceBpp = hkImageFormat::getBitsPerPixel(m_sourceFormat);
            hkUint32 targetBpp = hkImageFormat::getBitsPerPixel(m_targetFormat);

            m_flags = entry.m_flags;

            // Base cost is amount of bits processed
            m_cost = static_cast<float>(sourceBpp + targetBpp);

            // Penalty for non-inplace conversion
            if ((m_flags & hkImageConversionFlags::InPlace) == 0)
            {
                m_cost *= 2;
            }

            // Penalize formats that aren't aligned to powers of two
            if (!hkMath::isPower2(sourceBpp) || !hkMath::isPower2(targetBpp))
            {
                m_cost *= 2;
            }
        }

        const hkImageConversionStep* m_step;
        hkImageFormat::Enum m_sourceFormat;
        hkImageFormat::Enum m_targetFormat;
        hkImageConversionFlags::Enum m_flags;
        float m_cost;

        static TableEntry chain(const TableEntry& a, const TableEntry& b)
        {
            TableEntry entry;
            entry.m_step = a.m_step;
            entry.m_cost = a.m_cost + b.m_cost;
            entry.m_sourceFormat = a.m_sourceFormat;
            entry.m_targetFormat = a.m_targetFormat;
            entry.m_flags = a.m_flags;
            return entry;
        }

        bool operator<(const TableEntry& other) const
        {
            return m_cost < other.m_cost;
        }

        bool isAdmissible() const
        {
            return m_cost < HK_FLOAT_MAX;
        }
    };

    // Maps a (source format, target format) pair to the cheapest known conversion entry.
    // Filled by hkImageConversion::rebuildConversionTable().
    hkHashMap<hkTuple<hkImageFormat::Enum, hkImageFormat::Enum>, TableEntry> s_conversionTable;
    // True only after rebuildConversionTable() has completed; reset to false whenever a
    // conversion step is constructed or destroyed, or the table is cleared.
    bool s_conversionTableValid = false;

    /// Planning-time description of a buffer used along a conversion path.
    /// Only the pixel size is tracked; it decides whether a buffer can be reused.
    struct IntermediateBuffer
    {
        hkUint32 m_bitsPerPixel;

        IntermediateBuffer(hkUint32 bitsPerPixel)
            : m_bitsPerPixel(bitsPerPixel)
        {
        }
    };

    /// Returns the index of a buffer with the requested pixel size that is not excludedIndex.
    /// The first matching existing buffer is reused; if there is none, a new buffer
    /// description is appended to scratchBuffers and its index is returned.
    hkUint32 allocateScratchBufferIndex(hkInplaceArray<IntermediateBuffer, 16>& scratchBuffers, hkUint32 bitsPerPixel, hkUint32 excludedIndex)
    {
        const hkUint32 numBuffers = hkUint32(scratchBuffers.getSize());

        for (hkUint32 candidate = 0; candidate < numBuffers; ++candidate)
        {
            const bool reusable = (candidate != excludedIndex) && (scratchBuffers[candidate].m_bitsPerPixel == bitsPerPixel);
            if (reusable)
            {
                // Reuse existing scratch buffer
                return candidate;
            }
        }

        // Nothing suitable yet: allocate new scratch buffer
        scratchBuffers.pushBack(IntermediateBuffer(bitsPerPixel));
        return scratchBuffers.getSize() - 1;
    }


    /// Releases the memory held by the conversion table and marks the table as invalid,
    /// so that it is rebuilt before its next use. The argument is ignored.
    static hkResult HK_CALL clearTable(_In_opt_ void*)
    {
        // Drop the validity flag first so it never describes an emptied table as usable.
        s_conversionTableValid = false;
        s_conversionTable.clearAndDeallocate();
        return HK_SUCCESS;
    }

    // Registers clearTable with the base system.
    // NOTE(review): presumably the third InitNode argument is the shutdown callback - confirm in hkBaseSystem.
    static hkBaseSystem::InitNode s_clearTable("Clear image conversion table", HK_NULL, &clearTable);
}

// Constructing a conversion step invalidates the cached conversion table, which is then
// rebuilt lazily by the next function that checks s_conversionTableValid.
hkImageConversionStep::hkImageConversionStep()
{
    s_conversionTableValid = false;
}

// Destroying a conversion step invalidates the cached conversion table, since table
// entries hold pointers to the steps they were built from.
hkImageConversionStep::~hkImageConversionStep()
{
    s_conversionTableValid = false;
}

// This implementation always returns true. convertSingleStepCompress() consults the result
// before distributing block compression over a caller-supplied thread pool.
bool hkImageConversionStep::allowExternalThreading() const
{
    return true;
}

// Computes the sequence of conversion steps leading from sourceFormat to targetFormat and
// assigns a source/target buffer index to every step.
//
// sourceFormat / targetFormat  formats to convert between
// bSourceEqualsTarget          true when the caller's source and target are the same memory
// path_out                     receives the steps in execution order (cleared first)
// numScratchBuffers_out        receives the number of intermediate buffers the path needs
//
// Buffer index 0 denotes the final target; indices >= 1 denote scratch buffers.
// Returns HK_FAILURE when the conversion table holds no route between the two formats.
hkResult hkImageConversion::buildPath(hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat, bool bSourceEqualsTarget, hkInplaceArray<hkImageConversion::ConversionPathNode, 16> &path_out, hkUint32& numScratchBuffers_out)
{
    path_out.clear();
    numScratchBuffers_out = 0;

    // Identical formats: emit a single copy node (null step) and no scratch buffers.
    if (sourceFormat == targetFormat)
    {
        ConversionPathNode node;
        node.m_sourceFormat = sourceFormat;
        node.m_targetFormat = targetFormat;
        node.m_inPlace = bSourceEqualsTarget;
        node.m_sourceBufferIndex = 0;
        node.m_targetBufferIndex = 0;
        node.m_step = nullptr;
        path_out.pushBack(node);
        return HK_SUCCESS;
    }

    // The table is rebuilt lazily after any conversion step was created or destroyed.
    if (!s_conversionTableValid)
    {
        rebuildConversionTable();
    }

    // Follow the table: the entry for (current, targetFormat) describes the next step to take.
    for (hkImageFormat::Enum current = sourceFormat; current != targetFormat;)
    {
        hkTuple<hkImageFormat::Enum, hkImageFormat::Enum> currentTableIndex = hkTupleT::make(current, targetFormat);

        TableEntry entry = s_conversionTable.getWithDefault(currentTableIndex, TableEntry());
        if (entry.m_step == nullptr)
        {
            // No route from the current format to the target format.
            return HK_FAILURE;
        }

        hkImageConversion::ConversionPathNode step;
        step.m_sourceFormat = entry.m_sourceFormat;
        step.m_targetFormat = entry.m_targetFormat;
        step.m_inPlace = entry.m_flags & hkImageConversionFlags::InPlace;
        step.m_step = entry.m_step;

        current = entry.m_targetFormat;

        path_out.pushBack(step);
    }

    // Entry 0 of the planning list stands for the final target buffer.
    hkInplaceArray<IntermediateBuffer, 16> scratchBuffers;
    scratchBuffers.pushBack(IntermediateBuffer(hkImageFormat::getBitsPerPixel(targetFormat)));

    // Assign buffers back to front: the last step writes the target, every other step
    // writes whatever buffer the following step reads from.
    for (int i = path_out.getSize() - 1; i >= 0; --i)
    {
        if (i == path_out.getSize() - 1)
            path_out[i].m_targetBufferIndex = 0;
        else
            path_out[i].m_targetBufferIndex = path_out[i + 1].m_sourceBufferIndex;

        // The source of the first step is decided after the loop.
        if (i > 0)
        {
            if (path_out[i].m_inPlace)
            {
                // In-place steps read and write the same buffer.
                path_out[i].m_sourceBufferIndex = path_out[i].m_targetBufferIndex;
            }
            else
            {
                hkUint32 bitsPerPixel = hkImageFormat::getBitsPerPixel(path_out[i].m_sourceFormat);

                // Pick (or create) a buffer of matching pixel size that is not this step's target.
                path_out[i].m_sourceBufferIndex = allocateScratchBufferIndex(scratchBuffers, bitsPerPixel, path_out[i].m_targetBufferIndex);
            }
        }
    }

    if (bSourceEqualsTarget)
    {
        // Enforce constraint that source == target
        path_out[0].m_sourceBufferIndex = 0;

        // Did we accidentally break the in-place invariant?
        if (path_out[0].m_sourceBufferIndex == path_out[0].m_targetBufferIndex && !path_out[0].m_inPlace)
        {
            if (path_out.getSize() == 1)
            {
                // Only a single step, so we need to add a copy step
                hkImageConversion::ConversionPathNode copy;
                copy.m_inPlace = false;
                copy.m_sourceFormat = sourceFormat;
                copy.m_targetFormat = sourceFormat;
                copy.m_sourceBufferIndex = path_out[0].m_sourceBufferIndex;
                copy.m_targetBufferIndex = allocateScratchBufferIndex(scratchBuffers, hkImageFormat::getBitsPerPixel(path_out[0].m_sourceFormat), path_out[0].m_sourceBufferIndex);
                path_out[0].m_sourceBufferIndex = copy.m_targetBufferIndex;
                copy.m_step = nullptr;
                path_out.insertAt(0, copy);
            }
            else
            {
                // Turn second step to non-inplace
                // NOTE(review): only buffer 0 is excluded below. If the second step was already
                // non-in-place, its own target buffer could be returned when the pixel sizes
                // match - confirm this cannot happen.
                path_out[1].m_inPlace = false;
                path_out[1].m_sourceBufferIndex = allocateScratchBufferIndex(scratchBuffers, hkImageFormat::getBitsPerPixel(path_out[1].m_sourceFormat), path_out[0].m_sourceBufferIndex);
                path_out[0].m_targetBufferIndex = path_out[1].m_sourceBufferIndex;
            }
        }
    }
    else
    {
        // The source is separate memory: give it an index one past all planned buffers.
        path_out[0].m_sourceBufferIndex = scratchBuffers.getSize();
    }

    // Entry 0 is the target itself and therefore not counted as a scratch buffer.
    numScratchBuffers_out = scratchBuffers.getSize() - 1;

    return HK_SUCCESS;
}

void hkImageConversion::rebuildConversionTable()
{
    s_conversionTable.clear();

    // Prime conversion table with known conversions
    for (hkImageConversionStep* conversion = hkImageConversionStep::getFirstInstance(); conversion; conversion = conversion->getNextInstance())
    {
        hkArrayView<const hkImageConversionEntry> entries = conversion->getSupportedConversions();

        for (hkUint32 subIndex = 0; subIndex < (hkUint32)entries.getSize(); subIndex++)
        {
            const hkImageConversionEntry& subConversion = entries[subIndex];

            if (subConversion.m_flags & hkImageConversionFlags::InPlace)
            {
                HK_ASSERT(0x3c6bc7a0, hkImageFormat::isCompressed(subConversion.m_sourceFormat) == hkImageFormat::isCompressed(subConversion.m_targetFormat) &&
                    hkImageFormat::getBitsPerPixel(subConversion.m_sourceFormat) == hkImageFormat::getBitsPerPixel(subConversion.m_targetFormat),
                    "In-place conversions are only allowed between formats of the same number of bits per pixel and compressedness");
            }


            hkTuple<hkImageFormat::Enum, hkImageFormat::Enum> tableIndex = hkTupleT::make(subConversion.m_sourceFormat, subConversion.m_targetFormat);

            // Use the cheapest known conversion for each combination in case there are multiple ones
            TableEntry candidate(conversion, subConversion);

            if (candidate < s_conversionTable.getWithDefault(tableIndex, TableEntry()))
            {
                s_conversionTable.insert(tableIndex, candidate);
            }
        }
    }

    for (hkUint32 i = 0; i < hkImageFormat::COUNT; i++)
    {
        hkImageFormat::Enum format = static_cast<hkImageFormat::Enum>(i);
        // Add copy-conversion (from and to same format)
        s_conversionTable.insert(hkTupleT::make(format, format), TableEntry(nullptr, hkImageConversionEntry(hkImageConversionEntry(static_cast<hkImageFormat::Enum>(i), static_cast<hkImageFormat::Enum>(i), hkImageConversionFlags::InPlace))));
    }

    // Straight from http://en.wikipedia.org/wiki/Floyd-Warshall_algorithm
    for (hkUint32 k = 1; k < hkImageFormat::COUNT; k++)
    {
        for (hkUint32 i = 1; i < hkImageFormat::COUNT; i++)
        {
            if (k == i)
            {
                continue;
            }

            hkTuple<hkImageFormat::Enum, hkImageFormat::Enum> tableIndexIK = hkTupleT::make(static_cast<hkImageFormat::Enum>(i), static_cast<hkImageFormat::Enum>(k));
            if (!s_conversionTable.getWithDefault(tableIndexIK, TableEntry()).isAdmissible())
            {
                continue;
            }

            for (hkUint32 j = 1; j < hkImageFormat::COUNT; j++)
            {
                if (j == i || j == k)
                {
                    continue;
                }

                hkTuple<hkImageFormat::Enum, hkImageFormat::Enum> tableIndexIJ = hkTupleT::make(static_cast<hkImageFormat::Enum>(i), static_cast<hkImageFormat::Enum>(j));
                hkTuple<hkImageFormat::Enum, hkImageFormat::Enum> tableIndexKJ = hkTupleT::make(static_cast<hkImageFormat::Enum>(k), static_cast<hkImageFormat::Enum>(j));

                if (!s_conversionTable.getWithDefault(tableIndexKJ, TableEntry()).isAdmissible())
                {
                    continue;
                }

                TableEntry candidate = TableEntry::chain(s_conversionTable.getWithDefault(tableIndexIK, TableEntry()), s_conversionTable.getWithDefault(tableIndexKJ, TableEntry()));

                if (candidate.isAdmissible() && candidate < s_conversionTable.getWithDefault(tableIndexIJ, TableEntry()))
                {
                    // To convert from format I to format J, first convert from I to K
                    s_conversionTable.insert(tableIndexIJ, candidate);
                }
            }
        }
    }

    s_conversionTableValid = true;
}

// Converts source into target using targetFormat. When the formats already match the image
// is simply copied (or left alone if source and target are the same object). Otherwise a
// conversion path is planned and executed. Returns HK_FAILURE if no path exists or a step fails.
hkResult hkImageConversion::convert(const hkImage& source, hkImage& target, hkImageFormat::Enum targetFormat, _In_opt_ hkThreadPool* threadPool)
{
    const hkImageFormat::Enum sourceFormat = source.getFormat();
    const bool sameImage = (&source == &target);

    // Trivial copy
    if (sourceFormat == targetFormat)
    {
        if (!sameImage)
        {
            target = source;
        }
        return HK_SUCCESS;
    }

    hkInplaceArray<ConversionPathNode, 16> path;
    hkUint32 numScratchBuffers = 0;

    const hkResult planned = buildPath(sourceFormat, targetFormat, sameImage, path, numScratchBuffers);
    if (planned.isFailure())
    {
        return HK_FAILURE;
    }

    return convert(source, target, path, numScratchBuffers, threadPool);
}

// Executes a previously planned conversion path. Each step reads the image written by the
// preceding step (the first one reads source). A target buffer index of 0 selects target,
// any other index N selects the (N - 1)-th intermediate image.
// Returns HK_FAILURE as soon as one step fails.
hkResult hkImageConversion::convert(const hkImage& source, hkImage& target, hkArrayView<ConversionPathNode> path, hkUint32 numScratchBuffers, _In_opt_ hkThreadPool* threadPool)
{
    HK_ASSERT_NO_MSG(0x6955cfbc, path.getSize() > 0);
    HK_ASSERT_NO_MSG(0x3c11217e, path[0].m_sourceFormat == source.getFormat());

    hkInplaceArray<hkImage, 16> intermediates;
    intermediates.setSize(numScratchBuffers);

    const hkImage* pCurrent = &source;

    for (hkInt32 stepIndex = 0; stepIndex < path.getSize(); ++stepIndex)
    {
        const hkUint32 bufferIndex = path[stepIndex].m_targetBufferIndex;

        hkImage* pOutput = &target;
        if (bufferIndex != 0)
        {
            pOutput = &intermediates[bufferIndex - 1];
        }

        const hkResult stepResult = convertSingleStep(path[stepIndex].m_step, *pCurrent, *pOutput, path[stepIndex].m_targetFormat, threadPool);
        if (stepResult.isFailure())
        {
            return HK_FAILURE;
        }

        // The output of this step feeds the next one.
        pCurrent = pOutput;
    }

    return HK_SUCCESS;
}

// Converts numElements pixels of raw data from sourceFormat to targetFormat.
// Identical formats degrade to a memory move (skipped when both views start at the same
// address). Compressed formats are rejected. Returns HK_FAILURE when no conversion path exists.
hkResult hkImageConversion::convertRaw(hkArrayView<const void> source, hkArrayView<void> target, hkUint32 numElements, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat)
{
    // Nothing to do for an empty range.
    if (numElements == 0)
    {
        return HK_SUCCESS;
    }

    // Trivial copy
    if (sourceFormat == targetFormat)
    {
        const bool sameMemory = (target.begin() == source.begin());
        if (!sameMemory)
        {
            hkString::memMove(target.begin(), source.begin(), numElements * hkUint64(hkImageFormat::getBitsPerPixel(sourceFormat)) / 8);
        }
        return HK_SUCCESS;
    }

    // Raw conversion works on individual pixels, which block-compressed formats do not expose.
    const bool involvesCompression = hkImageFormat::isCompressed(sourceFormat) || hkImageFormat::isCompressed(targetFormat);
    if (involvesCompression)
    {
        return HK_FAILURE;
    }

    hkInplaceArray<ConversionPathNode, 16> path;
    hkUint32 numScratchBuffers;

    const hkResult planned = buildPath(sourceFormat, targetFormat, source.begin() == target.begin(), path, numScratchBuffers);
    if (planned.isFailure())
    {
        return HK_FAILURE;
    }

    return convertRaw(source, target, numElements, path, numScratchBuffers);
}

// Executes a previously planned conversion path on raw pixel data.
//
// source / target     views of the input and output memory
// numElements         number of pixels to convert
// path                steps in execution order; must not be empty
// numScratchBuffers   number of intermediate buffers the path refers to
//
// Each step reads the output of the preceding step (the first one reads source). A target
// buffer index of 0 selects target, any other index N selects the (N - 1)-th scratch buffer.
// Returns HK_FAILURE if the path starts or ends in a compressed format, if a scratch buffer
// would not fit into a 32-bit size, or if a step fails.
hkResult hkImageConversion::convertRaw(hkArrayView<const void> source, hkArrayView<void> target, hkUint32 numElements, hkArrayView<ConversionPathNode> path, hkUint32 numScratchBuffers)
{
    HK_ASSERT(0x53732d45, path.getSize() > 0, "Path of length 0 is invalid.");

    if (numElements == 0)
    {
        return HK_SUCCESS;
    }

    if (hkImageFormat::isCompressed(path.begin()->m_sourceFormat) || hkImageFormat::isCompressed((path.end() - 1)->m_targetFormat))
    {
        return HK_FAILURE;
    }

    hkInplaceArray<hkInplaceArray<char, 2048>, 16> intermediates;
    intermediates.setSize(numScratchBuffers);

    for (hkInt32 i = 0; i < path.getSize(); ++i)
    {
        hkUint32 targetIndex = path[i].m_targetBufferIndex;
        hkUint32 targetBpp = hkImageFormat::getBitsPerPixel(path[i].m_targetFormat);

        // Compute the byte count in 64 bits: bits-per-pixel times element count can exceed
        // 32 bits for large inputs, which would otherwise yield a too-small size.
        const hkUint64 stepTargetBytes = hkUint64(targetBpp) * numElements / 8;

        hkArrayView<void> stepTarget;
        if (targetIndex == 0)
        {
            stepTarget = target;
        }
        else
        {
            // The scratch array is sized with a 32-bit value; refuse sizes that would be truncated.
            if (stepTargetBytes > hkUint64(0xffffffffu))
            {
                return HK_FAILURE;
            }

            hkUint32 expectedSize = static_cast<hkUint32>(stepTargetBytes);
            intermediates[targetIndex - 1].setSize(expectedSize);
            stepTarget = hkArrayView<void>(intermediates[targetIndex - 1].begin(), intermediates[targetIndex - 1].getSize());
        }

        if (path[i].m_step == nullptr)
        {
            // Step-less node: plain copy (memMove tolerates overlapping source and target).
            hkString::memMove(stepTarget.begin(), source.begin(), stepTargetBytes);
        }
        else
        {
            if (static_cast<const hkImageConversionStepLinear*>(path[i].m_step)->
                convertPixels(source, stepTarget, numElements, path[i].m_sourceFormat, path[i].m_targetFormat).isFailure())
            {
                return HK_FAILURE;
            }
        }

        // The output of this step is the input of the next one.
        source = hkArrayView<const void>(stepTarget.begin(), stepTarget.getSize());
    }

    return HK_SUCCESS;
}

// Runs one conversion step from source into target. A null step copies the image.
// Otherwise target is reset to source's header with targetFormat, and the work is dispatched
// by compressedness: compressed source -> block decompression, compressed target -> block
// compression, neither -> linear per-pixel conversion.
hkResult hkImageConversion::convertSingleStep(_In_opt_ const hkImageConversionStep* pStep, const hkImage& source, hkImage& target, hkImageFormat::Enum targetFormat, _In_opt_ hkThreadPool* threadPool)
{
    if (pStep == nullptr)
    {
        target = source;
        return HK_SUCCESS;
    }

    const hkImageFormat::Enum sourceFormat = source.getFormat();

    // Give the target the same layout as the source, only in the new format.
    hkImageHeader targetHeader = source.getHeader();
    targetHeader.setFormat(targetFormat);
    target.reset(targetHeader);

    if (hkImageFormat::isCompressed(sourceFormat))
    {
        return convertSingleStepDecompress(source, target, sourceFormat, targetFormat, pStep);
    }

    if (hkImageFormat::isCompressed(targetFormat))
    {
        return convertSingleStepCompress(source, target, sourceFormat, targetFormat, pStep, threadPool);
    }

    // we have to do the computation in 64-bit otherwise it might overflow for very large textures (8k x 4k or bigger).
    const hkUint64 targetBits = hkUint64(8) * (hkUint64)target.getDataSize();
    const hkUint64 numElements = targetBits / (hkUint64)hkImageFormat::getBitsPerPixel( targetFormat );

    const hkImageConversionStepLinear* linearStep = static_cast<const hkImageConversionStepLinear*>(pStep);
    return linearStep->convertPixels(
        hkArrayView<const void>(source.getDataPointer<void>(), source.getDataSize()),
        hkArrayView<void>(target.getDataPointer<void>(), target.getDataSize()),
        (hkUint32)numElements, sourceFormat, targetFormat);
}

// Decompresses a block-compressed source image into target, one row of blocks at a time.
// For every array element, face, mip level and depth slice, each block row is decompressed
// into a temporary buffer and then copied into target, clamped to the image dimensions.
// pStep is used as an hkImageConversionStepDecompressBlocks.
// Returns HK_FAILURE as soon as decompressBlocks() fails for any block row.
hkResult hkImageConversion::convertSingleStepDecompress(const hkImage &source, hkImage &target, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat, _In_ const hkImageConversionStep* pStep)
{
    for (hkUint32 arrayIndex = 0; arrayIndex < source.getNumArrayElements(); arrayIndex++)
    {
        for (hkUint32 face = 0; face < source.getNumFaces(); face++)
        {
            for (hkUint32 mipLevel = 0; mipLevel < source.getNumMipLevels(); mipLevel++)
            {
                // Pixel dimensions of this mip level in the (uncompressed) target.
                const hkUint32 width = target.getWidth(mipLevel);
                const hkUint32 height = target.getHeight(mipLevel);

                // Block dimensions in pixels of the compressed source format.
                const hkUint32 blockSizeX = hkImageFormat::getBlockWidth(sourceFormat);
                const hkUint32 blockSizeY = hkImageFormat::getBlockHeight(sourceFormat);

                const hkUint32 numBlocksX = source.getNumBlocksX(mipLevel);
                const hkUint32 numBlocksY = source.getNumBlocksY(mipLevel);

                const hkUint32 targetRowPitch = target.getRowPitch(mipLevel);
                const hkUint32 targetBytesPerPixel = hkImageFormat::getBitsPerPixel(targetFormat) / 8;

                const hkUint32 blockSizeInBytes = hkImageFormat::getBitsPerBlock(sourceFormat) / 8;

                // Decompress into a temp memory block so we don't have to explicitly handle the case where the image is not a multiple of the block size
                // The buffer holds the decompressed pixels of one full row of blocks.
                hkInplaceArray<hkUint8, 256> tempBuffer;
                tempBuffer.setSize(numBlocksX * blockSizeX * blockSizeY * targetBytesPerPixel);

                for (hkUint32 slice = 0; slice < source.getDepth(mipLevel); slice++)
                {
                    for (hkUint32 blockY = 0; blockY < numBlocksY; blockY++)
                    {
                        // First block of the current block row.
                        const void* sourcePointer = source.getBlockPointer<void>(mipLevel, face, arrayIndex, 0, blockY, slice);

                        if (static_cast<const hkImageConversionStepDecompressBlocks*>(pStep)->
                            decompressBlocks(
                                hkArrayView<const void>(sourcePointer, numBlocksX * blockSizeInBytes),
                                hkArrayView<void>(tempBuffer.begin(), tempBuffer.getSize()), numBlocksX, sourceFormat, targetFormat).isFailure())
                        {
                            return HK_FAILURE;
                        }

                        for (hkUint32 blockX = 0; blockX < numBlocksX; blockX++)
                        {
                            // Top-left pixel of this block in the target image.
                            hkUint8* targetPointer = target.getPixelPointer<hkUint8>(
                                mipLevel, face, arrayIndex,
                                blockX * blockSizeX, blockY * blockSizeY, slice);

                            // Copy into actual target, clamping to image dimensions
                            hkUint32 copyWidth = hkMath::min2(blockSizeX, width - blockX * blockSizeX);
                            hkUint32 copyHeight = hkMath::min2(blockSizeY, height - blockY * blockSizeY);
                            for (hkUint32 row = 0; row < copyHeight; row++)
                            {
                                // NOTE(review): this offset scales the block index by blockSizeX and the row by
                                // blockSizeY. That equals a block-after-block layout of tempBuffer only when
                                // blockSizeX == blockSizeY - confirm the layout decompressBlocks() produces
                                // before adding formats with non-square blocks.
                                hkString::memCpy(targetPointer, &tempBuffer[(blockX * blockSizeX + row) * blockSizeY * targetBytesPerPixel], copyWidth * targetBytesPerPixel);
                                targetPointer += targetRowPitch;
                            }
                        }
                    }
                }
            }
        }
    }

    return HK_SUCCESS;
}

namespace
{
    /// Task that compresses a contiguous range of block rows of one padded 2D slice.
    ///
    /// The task is meant to be run with a multiplicity: the instance with multiplicity index
    /// N handles the block rows [N * m_rowsPerTask, (N + 1) * m_rowsPerTask), clamped to
    /// m_numBlocksY. Instances whose range starts at or beyond m_numBlocksY do nothing.
    /// m_anyFailed is set when compressBlocks() reports a failure.
    class CompressTask : public hkTask
    {
    public:
        virtual void process(const hkTask::Input& input) HK_OVERRIDE
        {
            hkUint32 blockRowStart = m_rowsPerTask * input.m_multiplicityIndex;

            // Rounding m_rowsPerTask up can leave trailing instances without any rows.
            // Without this guard the unsigned subtraction below would wrap around and
            // describe a huge, out-of-bounds range.
            if (blockRowStart >= m_numBlocksY)
            {
                return;
            }

            hkUint32 blockRowEnd = hkMath::min2(m_numBlocksY, blockRowStart + m_rowsPerTask);

            hkUint32 numBlockRows = blockRowEnd - blockRowStart;

            hkUint32 blockHeight = hkImageFormat::getBlockHeight(m_targetFormat);
            hkUint32 blockWidth = hkImageFormat::getBlockWidth(m_targetFormat);

            // Bytes occupied by one row of blocks in the source and in the target.
            hkUint32 sourceBlockRowPitch = blockHeight * hkImageFormat::getRowPitch(m_sourceFormat, m_numBlocksX * blockWidth);
            hkUint32 targetBlockRowPitch = blockHeight * hkImageFormat::getRowPitch(m_targetFormat, m_numBlocksX * blockWidth);

            // Sub-views covering only the block rows handled by this instance.
            hkArrayView<const void> source = hkArrayViewT::make(hkAddByteOffset(m_source.begin(), blockRowStart * sourceBlockRowPitch), numBlockRows * sourceBlockRowPitch);
            hkArrayView<      void> target = hkArrayViewT::make(hkAddByteOffset(m_target.begin(), blockRowStart * targetBlockRowPitch), numBlockRows * targetBlockRowPitch);

            if (m_step->compressBlocks(source, target, m_numBlocksX, numBlockRows, m_sourceFormat, m_targetFormat).isFailure())
            {
                m_anyFailed = true;
            }
        }

        // Whole padded source slice and the target blocks of that slice.
        hkArrayView<const void> m_source;
        hkArrayView<void> m_target;

        // Number of blocks per row / number of block rows of the slice.
        hkUint32 m_numBlocksX;
        hkUint32 m_numBlocksY;

        hkImageFormat::Enum m_sourceFormat;
        hkImageFormat::Enum m_targetFormat;

        const hkImageConversionStepCompressBlocks* m_step;

        // Number of block rows handled by each task instance.
        hkUint32 m_rowsPerTask;

        // Must be initialized to false by the creator of the task.
        bool m_anyFailed;
    };
}

// Compresses an uncompressed source image into the block-compressed target.
//
// For every array element, face, mip level and depth slice, the source slice is copied into
// a temporary image padded to a multiple of the block size (edge pixels are replicated) and
// then compressed, either inline or split over threadPool when one is given and the step
// allows external threading. pStep is used as an hkImageConversionStepCompressBlocks.
// Returns HK_FAILURE as soon as compressing any slice fails.
hkResult hkImageConversion::convertSingleStepCompress(const hkImage &source, hkImage &target, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat,
                                                      _In_ const hkImageConversionStep* pStep, _In_opt_ hkThreadPool* threadPool)
{
    hkDefaultTaskQueue queue;

    for (hkUint32 arrayIndex = 0; arrayIndex < source.getNumArrayElements(); arrayIndex++)
    {
        for (hkUint32 face = 0; face < source.getNumFaces(); face++)
        {
            for (hkUint32 mipLevel = 0; mipLevel < source.getNumMipLevels(); mipLevel++)
            {
                const hkUint32 sourceWidth = source.getWidth(mipLevel);
                const hkUint32 sourceHeight = source.getHeight(mipLevel);

                const hkUint32 numBlocksX = target.getNumBlocksX(mipLevel);
                const hkUint32 numBlocksY = target.getNumBlocksY(mipLevel);

                // Dimensions of the mip level rounded up to whole blocks.
                const hkUint32 targetWidth = numBlocksX * hkImageFormat::getBlockWidth(targetFormat);
                const hkUint32 targetHeight = numBlocksY * hkImageFormat::getBlockHeight(targetFormat);

                const hkUint32 sourceRowPitch = source.getRowPitch(mipLevel);
                const hkUint32 sourceBytesPerPixel = hkImageFormat::getBitsPerPixel(sourceFormat) / 8;

                const hkUint32 blockSizeInBytes = hkImageFormat::getBitsPerBlock(targetFormat) / 8;

                // Pad image to multiple of block size for compression
                hkImageHeader paddedSliceHeader;
                paddedSliceHeader.setWidth(targetWidth);
                paddedSliceHeader.setHeight(targetHeight);
                paddedSliceHeader.setFormat(sourceFormat);

                hkImage paddedSlice;
                paddedSlice.reset(paddedSliceHeader);

                for (hkUint32 slice = 0; slice < source.getDepth(mipLevel); slice++)
                {
                    for (hkUint32 y = 0; y < targetHeight; ++y)
                    {
                        // Rows below the image replicate the last source row.
                        hkUint32 sourceY = hkMath::min2(y, sourceHeight - 1);

                        // Read from the current depth slice; omitting the slice coordinate
                        // would compress the first slice over and over for volume images.
                        hkString::memCpy(paddedSlice.getPixelPointer<void>(0, 0, 0, 0, y),
                            source.getPixelPointer<void>(mipLevel, face, arrayIndex, 0, sourceY, slice),
                            sourceRowPitch);

                        // Columns right of the image replicate the last source pixel of the row.
                        for (hkUint32 x = sourceWidth; x < targetWidth; ++x)
                        {
                            hkString::memCpy(paddedSlice.getPixelPointer<void>(0, 0, 0, x, y),
                                source.getPixelPointer<void>(mipLevel, face, arrayIndex, sourceWidth - 1, sourceY, slice),
                                sourceBytesPerPixel);
                        }
                    }

                    CompressTask task;

                    task.m_source = hkArrayView<const void>(paddedSlice.getDataPointer<void>(), paddedSlice.getDataSize());
                    task.m_target = hkArrayView<void>(target.getBlockPointer<void>(mipLevel, face, arrayIndex, 0, 0, slice), numBlocksX * numBlocksY * blockSizeInBytes);
                    task.m_numBlocksX = numBlocksX;
                    task.m_numBlocksY = numBlocksY;
                    task.m_sourceFormat = sourceFormat;
                    task.m_targetFormat = targetFormat;
                    task.m_step = static_cast<const hkImageConversionStepCompressBlocks*>(pStep);
                    task.m_anyFailed = false;

                    if (threadPool && pStep->allowExternalThreading())
                    {
                        // Aim for roughly 128 blocks per task instance, using at most 32 instances.
                        hkUint32 blocksPerTask = 128;
                        hkUint32 rowsPerTask = hkMath::max2(1U, blocksPerTask / numBlocksX);
                        hkUint32 numTasks = hkMath::clamp(numBlocksY / rowsPerTask, 1u, 32u);
                        rowsPerTask = (numBlocksY - 1) / numTasks + 1;

                        // Rounding rowsPerTask up may cover all rows with fewer instances than
                        // planned; recompute the count so that no instance starts past the last
                        // block row (e.g. 33 rows: 32 instances of 2 rows -> 17 instances).
                        numTasks = (numBlocksY - 1) / rowsPerTask + 1;

                        task.m_rowsPerTask = rowsPerTask;


                        hkTaskQueue::Handle handle;
                        queue.allocateHandles(&handle, 1);
                        queue.initHandle(handle, &task);
                        queue.setMultiplicity(handle, numTasks);
                        queue.submitHandles(&handle, 1);
                        threadPool->processTaskQueue(&queue);
                        queue.processAndFree(handle);

                        queue.close();
                        threadPool->waitForCompletion();
                        queue.reset();
                    }
                    else
                    {
                        // Single-threaded: one task instance handles all block rows.
                        task.m_rowsPerTask = numBlocksY;
                        task.process(hkTask::Input());
                    }

                    if (task.m_anyFailed)
                    {
                        return HK_FAILURE;
                    }
                }
            }
        }
    }

    return HK_SUCCESS;
}


// Returns true when the conversion table contains an entry for converting sourceFormat to
// targetFormat. The table is rebuilt first if it is not valid.
bool hkImageConversion::isConvertible(hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat)
{
    if (s_conversionTableValid == false)
    {
        rebuildConversionTable();
    }

    hkTuple<hkImageFormat::Enum, hkImageFormat::Enum> formatPair = hkTupleT::make(sourceFormat, targetFormat);
    const bool known = s_conversionTable.contains(formatPair);
    return known;
}

// Returns the entry of compatibleFormats that format can be converted to at the lowest
// table cost, or hkImageFormat::INVALID when none of them is reachable. On equal cost the
// earlier entry wins. The table is rebuilt first if it is not valid.
hkImageFormat::Enum hkImageConversion::findClosestCompatibleFormat(hkImageFormat::Enum format, hkArrayView<const hkImageFormat::Enum> compatibleFormats)
{
    if (s_conversionTableValid == false)
    {
        rebuildConversionTable();
    }

    // Starts out as the inadmissible default entry, so only reachable formats can beat it.
    TableEntry cheapest;
    hkImageFormat::Enum closest = hkImageFormat::INVALID;

    const hkUint32 numCandidates = hkUint32(compatibleFormats.getSize());
    for (hkUint32 candidateIndex = 0; candidateIndex < numCandidates; candidateIndex++)
    {
        hkTuple<hkImageFormat::Enum, hkImageFormat::Enum> formatPair = hkTupleT::make(format, compatibleFormats[candidateIndex]);
        TableEntry route = s_conversionTable.getWithDefault(formatPair, TableEntry());

        if (!(route < cheapest))
        {
            continue;
        }

        cheapest = route;
        closest = compatibleFormats[candidateIndex];
    }

    return closest;
}


static bool isAligned(const void* pointer)
{
    return reinterpret_cast<hkUlong>(pointer) % 16 == 0;
}

namespace HK_UNITY_ANONYMOUS_NAMESPACE
{
    // Conversions reported by hkImageSwizzleConversion32_2103::getSupportedConversions().
    // Each entry is (source format, target format, flags); all six are flagged InPlace and
    // pair an R8_G8_B8_A8 format with a B8_G8_R8_A8 / B8_G8_R8_X8 format, in both the plain
    // and the SRGB variants.
    const hkImageConversionEntry hkImageSwizzleConversion32_2103_supported[6] = {
        hkImageConversionEntry(hkImageFormat::B8_G8_R8_A8_UNSIGNED_NORMALIZED, hkImageFormat::R8_G8_B8_A8_UNSIGNED_NORMALIZED, hkImageConversionFlags::InPlace),
        hkImageConversionEntry(hkImageFormat::R8_G8_B8_A8_UNSIGNED_NORMALIZED, hkImageFormat::B8_G8_R8_A8_UNSIGNED_NORMALIZED, hkImageConversionFlags::InPlace),
        hkImageConversionEntry(hkImageFormat::R8_G8_B8_A8_UNSIGNED_NORMALIZED, hkImageFormat::B8_G8_R8_X8_UNSIGNED_NORMALIZED, hkImageConversionFlags::InPlace),
        hkImageConversionEntry(hkImageFormat::B8_G8_R8_A8_UNSIGNED_NORMALIZED_SRGB, hkImageFormat::R8_G8_B8_A8_UNSIGNED_NORMALIZED_SRGB, hkImageConversionFlags::InPlace),
        hkImageConversionEntry(hkImageFormat::R8_G8_B8_A8_UNSIGNED_NORMALIZED_SRGB, hkImageFormat::B8_G8_R8_A8_UNSIGNED_NORMALIZED_SRGB, hkImageConversionFlags::InPlace),
        hkImageConversionEntry(hkImageFormat::R8_G8_B8_A8_UNSIGNED_NORMALIZED_SRGB, hkImageFormat::B8_G8_R8_X8_UNSIGNED_NORMALIZED_SRGB, hkImageConversionFlags::InPlace),
    };
}

// Conversion step that exchanges bytes 0 and 2 of every 4-byte pixel and keeps bytes 1 and 3
// in place (output byte order is source bytes 2,1,0,3). This maps between the R8G8B8A8 and
// B8G8R8A8/X8 layouts listed in hkImageSwizzleConversion32_2103_supported.
struct hkImageSwizzleConversion32_2103 : public hkImageConversionStepLinear
{
    // Returns the table of format pairs this step handles.
    virtual hkArrayView<const hkImageConversionEntry> getSupportedConversions() const HK_OVERRIDE
    {
        HK_UNITY_USING_ANONYMOUS_NAMESPACE;
        return hkArrayViewT::make(hkImageSwizzleConversion32_2103_supported);
    }

    // Swizzles numElements pixels from source into target. sourceFormat and targetFormat are
    // not consulted; the same byte permutation is applied regardless. Every pixel (or batch of
    // pixels) is fully read before it is written, so source and target may refer to the same
    // memory. Always returns HK_SUCCESS.
    virtual hkResult convertPixels(hkArrayView<const void> source, hkArrayView<void> target, hkUint32 numElements, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat) const HK_OVERRIDE
    {
        hkUint32 sourceStride = 4;
        hkUint32 targetStride = 4;

        const void* sourcePointer = source.begin();
        void* targetPointer = target.begin();

#if defined(HK_SSE_VERSION) && (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED) && defined(HK_ARCH_INTEL)
        // The vector loops dereference __m128i pointers directly (aligned accesses), so both
        // pointers must be 16-byte aligned. Each batch advances them by 32 bytes, which
        // preserves that alignment.
        if (isAligned(sourcePointer) && isAligned(targetPointer))
        {
            // Two 128-bit registers of four pixels each per iteration.
            const hkUint32 elementsPerBatch = 8;

            // _mm_shuffle_epi8 (pshufb) is an SSSE3 instruction, not SSE3, so the shuffle path
            // must not be selected for plain SSE3 builds.
            // NOTE(review): assumes HK_SSE_VERSION encodes SSSE3 as 0x31 (SSE3 as 0x30) - confirm
            // against the platform configuration headers. Builds below the threshold use the
            // SSE2 path, which produces identical results.
#if HK_SSE_VERSION >= 0x31
            // Per 32-bit lane: output bytes 0,1,2,3 take source bytes 2,1,0,3.
            __m128i shuffleMask = _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2);

            // Intel optimization manual, Color Pixel Format Conversion Using SSSE3
            while (numElements >= elementsPerBatch)
            {
                __m128i in0 = reinterpret_cast<const __m128i*>(sourcePointer)[0];
                __m128i in1 = reinterpret_cast<const __m128i*>(sourcePointer)[1];

                reinterpret_cast<__m128i*>(targetPointer)[0] = _mm_shuffle_epi8(in0, shuffleMask);
                reinterpret_cast<__m128i*>(targetPointer)[1] = _mm_shuffle_epi8(in1, shuffleMask);

                sourcePointer = hkAddByteOffset(sourcePointer, sourceStride * elementsPerBatch);
                targetPointer = hkAddByteOffset(targetPointer, targetStride * elementsPerBatch);
                numElements -= elementsPerBatch;
            }
#else
            // mask1 keeps bytes 1 and 3 untouched; mask2 selects bytes 0 and 2 from the
            // 16-bit-rotated value, which is where the exchanged bytes end up.
            __m128i mask1 = _mm_set1_epi32(0xff00ff00);
            __m128i mask2 = _mm_set1_epi32(0x00ff00ff);

            // Intel optimization manual, Color Pixel Format Conversion Using SSE2
            while (numElements >= elementsPerBatch)
            {
                __m128i in0 = reinterpret_cast<const __m128i*>(sourcePointer)[0];
                __m128i in1 = reinterpret_cast<const __m128i*>(sourcePointer)[1];

                reinterpret_cast<__m128i*>(targetPointer)[0] = _mm_or_si128(_mm_and_si128(in0, mask1), _mm_and_si128(_mm_or_si128(_mm_slli_epi32(in0, 16), _mm_srli_epi32(in0, 16)), mask2));
                reinterpret_cast<__m128i*>(targetPointer)[1] = _mm_or_si128(_mm_and_si128(in1, mask1), _mm_and_si128(_mm_or_si128(_mm_slli_epi32(in1, 16), _mm_srli_epi32(in1, 16)), mask2));

                sourcePointer = hkAddByteOffset(sourcePointer, sourceStride * elementsPerBatch);
                targetPointer = hkAddByteOffset(targetPointer, targetStride * elementsPerBatch);
                numElements -= elementsPerBatch;
            }
#endif
        }

#endif

        // Scalar loop: handles everything when no vector path ran, otherwise the leftover pixels.
        while (numElements)
        {
            // Read all four bytes before writing any of them so in-place conversion works.
            hkUint8 a, b, c, d;
            a = reinterpret_cast<const hkUint8*>(sourcePointer)[2];
            b = reinterpret_cast<const hkUint8*>(sourcePointer)[1];
            c = reinterpret_cast<const hkUint8*>(sourcePointer)[0];
            d = reinterpret_cast<const hkUint8*>(sourcePointer)[3];
            reinterpret_cast<hkUint8*>(targetPointer)[0] = a;
            reinterpret_cast<hkUint8*>(targetPointer)[1] = b;
            reinterpret_cast<hkUint8*>(targetPointer)[2] = c;
            reinterpret_cast<hkUint8*>(targetPointer)[3] = d;

            sourcePointer = hkAddByteOffset(sourcePointer, sourceStride);
            targetPointer = hkAddByteOffset(targetPointer, targetStride);
            numElements--;
        }

        return HK_SUCCESS;
    }
};

namespace HK_UNITY_ANONYMOUS_NAMESPACE
{
    // Format pairs handled by hkImageConversion_AddAlpha: 3-channel 8-bit sources paired with
    // the 4-channel target that has the same channel order. None of the entries is flagged
    // InPlace.
    const hkImageConversionEntry hkImageConversion_AddAlpha_supported[] = {
        hkImageConversionEntry(hkImageFormat::B8_G8_R8_UNSIGNED_NORMALIZED,      hkImageFormat::B8_G8_R8_A8_UNSIGNED_NORMALIZED, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::B8_G8_R8_UNSIGNED_NORMALIZED,      hkImageFormat::B8_G8_R8_X8_UNSIGNED_NORMALIZED, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::B8_G8_R8_UNSIGNED_NORMALIZED_SRGB, hkImageFormat::B8_G8_R8_A8_UNSIGNED_NORMALIZED_SRGB, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::B8_G8_R8_UNSIGNED_NORMALIZED_SRGB, hkImageFormat::B8_G8_R8_X8_UNSIGNED_NORMALIZED_SRGB, hkImageConversionFlags::None),

        // The RGB-ordered X8 variants below are disabled.
        // NOTE(review): presumably because no R8_G8_B8_X8 format is declared in hkImageFormat - confirm.
        hkImageConversionEntry( hkImageFormat::R8_G8_B8_UNSIGNED_NORMALIZED,      hkImageFormat::R8_G8_B8_A8_UNSIGNED_NORMALIZED, hkImageConversionFlags::None ),
      //hkImageConversionEntry( hkImageFormat::R8_G8_B8_UNSIGNED_NORMALIZED,      hkImageFormat::R8_G8_B8_X8_UNSIGNED_NORMALIZED, hkImageConversionFlags::None ),
        hkImageConversionEntry( hkImageFormat::R8_G8_B8_UNSIGNED_NORMALIZED_SRGB, hkImageFormat::R8_G8_B8_A8_UNSIGNED_NORMALIZED_SRGB, hkImageConversionFlags::None ),
      //hkImageConversionEntry( hkImageFormat::R8_G8_B8_UNSIGNED_NORMALIZED_SRGB, hkImageFormat::R8_G8_B8_X8_UNSIGNED_NORMALIZED_SRGB, hkImageConversionFlags::None ),
    };
}

// Conversion step that expands 3-byte pixels to 4-byte pixels by copying the three source
// bytes unchanged and writing 0xFF into the fourth byte.
class hkImageConversion_AddAlpha : public hkImageConversionStepLinear
{
public:
    // Returns the table of format pairs this step handles.
    virtual hkArrayView<const hkImageConversionEntry> getSupportedConversions() const HK_OVERRIDE
    {
        HK_UNITY_USING_ANONYMOUS_NAMESPACE;
        return hkArrayViewT::make( hkImageConversion_AddAlpha_supported );
    }

    // Expands numElements pixels from source (3 bytes each) into target (4 bytes each).
    // sourceFormat and targetFormat are not consulted. Always returns HK_SUCCESS.
    virtual hkResult convertPixels(hkArrayView<const void> source, hkArrayView<void> target, hkUint32 numElements, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat) const HK_OVERRIDE
    {
        hkUint32 sourceStride = 3;
        hkUint32 targetStride = 4;

        const void* sourcePointer = source.begin();
        void* targetPointer = target.begin();

#if HK_ENDIAN_LITTLE
        // The batch loop moves data through hkUint32 loads and stores, so it is only taken when
        // both pointers are 4-byte aligned; misaligned word accesses are undefined behaviour and
        // can fault on strict-alignment targets. A batch advances the source by 12 bytes and the
        // target by 16 bytes, so alignment is preserved across iterations. Misaligned input is
        // handled entirely by the byte loop below, which produces the same output.
        const hkUlong addressBits = reinterpret_cast<hkUlong>(sourcePointer) | reinterpret_cast<hkUlong>(targetPointer);

        if ((addressBits & 3) == 0)
        {
            // Four pixels = three source words in, four target words out.
            const hkUint32 elementsPerBatch = 4;

            while (numElements >= elementsPerBatch)
            {
                hkUint32 source0 = reinterpret_cast<const hkUint32*>(sourcePointer)[0];
                hkUint32 source1 = reinterpret_cast<const hkUint32*>(sourcePointer)[1];
                hkUint32 source2 = reinterpret_cast<const hkUint32*>(sourcePointer)[2];

                // Little-endian: pixel k occupies source bytes 3k..3k+2. Whatever spills into
                // the top byte of each target word is overwritten by the 0xFF alpha.
                hkUint32 target0 = source0 | 0xFF000000;
                hkUint32 target1 = (source0 >> 24) | (source1 << 8) | 0xFF000000;
                hkUint32 target2 = (source1 >> 16) | (source2 << 16) | 0xFF000000;
                hkUint32 target3 = (source2 >> 8) | 0xFF000000;

                reinterpret_cast<hkUint32*>(targetPointer)[0] = target0;
                reinterpret_cast<hkUint32*>(targetPointer)[1] = target1;
                reinterpret_cast<hkUint32*>(targetPointer)[2] = target2;
                reinterpret_cast<hkUint32*>(targetPointer)[3] = target3;

                sourcePointer = hkAddByteOffset(sourcePointer, sourceStride * elementsPerBatch);
                targetPointer = hkAddByteOffset(targetPointer, targetStride * elementsPerBatch);
                numElements -= elementsPerBatch;
            }
        }
#endif

        // Byte-wise loop: handles leftover pixels, misaligned buffers and big-endian builds.
        while (numElements)
        {
            reinterpret_cast<hkUint8*>(targetPointer)[0] = reinterpret_cast<const hkUint8*>(sourcePointer)[0];
            reinterpret_cast<hkUint8*>(targetPointer)[1] = reinterpret_cast<const hkUint8*>(sourcePointer)[1];
            reinterpret_cast<hkUint8*>(targetPointer)[2] = reinterpret_cast<const hkUint8*>(sourcePointer)[2];
            reinterpret_cast<hkUint8*>(targetPointer)[3] = 0xFF;

            sourcePointer = hkAddByteOffset(sourcePointer, sourceStride);
            targetPointer = hkAddByteOffset(targetPointer, targetStride);
            numElements--;
        }

        return HK_SUCCESS;
    }
};

namespace HK_UNITY_ANONYMOUS_NAMESPACE
{
    // Format pairs handled by hkImageConversion_UNORM8_to_FLOAT: 8-bit unsigned-normalized
    // sources paired with the 32-bit float target of the same channel count (1, 2 or 4).
    // The three-channel pair is disabled.
    const hkImageConversionEntry hkImageConversion_UNORM8_to_FLOAT_supported[] = {
        hkImageConversionEntry(hkImageFormat::R8_UNSIGNED_NORMALIZED, hkImageFormat::R32_FLOAT, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::R8_G8_UNSIGNED_NORMALIZED, hkImageFormat::R32_G32_FLOAT, hkImageConversionFlags::None),
        //hkImageConversionEntry(hkImageFormat::R8G8B8_UNORM, hkImageFormat::R32G32B32_FLOAT, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::R8_G8_B8_A8_UNSIGNED_NORMALIZED, hkImageFormat::R32_G32_B32_A32_FLOAT, hkImageConversionFlags::None),
    };
}


// Conversion step that turns 8-bit unsigned-normalized channels into 32-bit floats by
// multiplying every byte by 1/255.
class hkImageConversion_UNORM8_to_FLOAT : public hkImageConversionStepLinear
{
public:
    // Returns the table of format pairs this step handles.
    virtual hkArrayView<const hkImageConversionEntry> getSupportedConversions() const HK_OVERRIDE
    {
        HK_UNITY_USING_ANONYMOUS_NAMESPACE;
        return hkArrayViewT::make( hkImageConversion_UNORM8_to_FLOAT_supported );
    }

    // Converts numElements pixels of sourceFormat into floats. The channel count is derived
    // from the source format's bits per pixel; targetFormat is not consulted.
    // Always returns HK_SUCCESS.
    virtual hkResult convertPixels(hkArrayView<const void> source, hkArrayView<void> target, hkUint32 numElements, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat) const HK_OVERRIDE
    {
        // From here on numElements counts individual channels rather than pixels.
        numElements *= hkImageFormat::getBitsPerPixel(sourceFormat) / 8;

        const hkUint8* bytesIn = static_cast<const hkUint8*>(source.begin());
        float* floatsOut = static_cast<float*>(target.begin());

        // Vector path: 16 channels per iteration, only when both buffers pass isAligned().
        if (isAligned(bytesIn) && isAligned(floatsOut))
        {
            const hkUint32 channelsPerBatch = 16;

            hkVector4f inv255;
            inv255.setAll(hkSimdFloat32_Inv_255);

            hkIntVector zeroVec;
            zeroVec.setZero();

            for (; numElements >= channelsPerBatch; numElements -= channelsPerBatch)
            {
                hkIntVector packed;
                packed.load<4, HK_IO_SIMD_ALIGNED>(reinterpret_cast<const hkUint32*>(bytesIn));

                // Merge with a zero vector twice: bytes -> 16-bit lanes -> 32-bit lanes.
                hkIntVector lo16, hi16;
                lo16.setMergeHead8(packed, zeroVec);
                hi16.setMergeTail8(packed, zeroVec);

                hkIntVector quad0, quad1, quad2, quad3;
                quad0.setMergeHead16(lo16, zeroVec);
                quad1.setMergeTail16(lo16, zeroVec);
                quad2.setMergeHead16(hi16, zeroVec);
                quad3.setMergeTail16(hi16, zeroVec);

                hkVector4f asFloat[4];
                quad0.convertS32ToF32(asFloat[0]);
                quad1.convertS32ToF32(asFloat[1]);
                quad2.convertS32ToF32(asFloat[2]);
                quad3.convertS32ToF32(asFloat[3]);

                // Scale into [0, 1] and write out four floats per vector.
                for (hkUint32 q = 0; q < 4; ++q)
                {
                    asFloat[q].mul(inv255);
                    asFloat[q].store<4, HK_IO_SIMD_ALIGNED>(floatsOut + 4 * q);
                }

                bytesIn += channelsPerBatch;
                floatsOut += channelsPerBatch;
            }
        }

        // Scalar path for the remaining channels (or all of them when unaligned).
        for (; numElements != 0; --numElements)
        {
            *floatsOut = static_cast<float>(*bytesIn) * (1.0f / 255.0f);
            ++bytesIn;
            ++floatsOut;
        }

        return HK_SUCCESS;
    }
};

namespace HK_UNITY_ANONYMOUS_NAMESPACE
{
    // Format pairs handled by hkImageConversion_UNORM16_to_FLOAT: 16-bit unsigned-normalized
    // sources paired with the 32-bit float target of the same channel count (1, 2 or 4).
    const hkImageConversionEntry hkImageConversion_UNORM16_to_FLOAT_supported[] = {
        hkImageConversionEntry(hkImageFormat::R16_UNSIGNED_NORMALIZED, hkImageFormat::R32_FLOAT, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::R16_G16_UNSIGNED_NORMALIZED, hkImageFormat::R32_G32_FLOAT, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::R16_G16_B16_A16_UNSIGNED_NORMALIZED, hkImageFormat::R32_G32_B32_A32_FLOAT, hkImageConversionFlags::None),
    };
}

// Conversion step that turns 16-bit unsigned-normalized channels into 32-bit floats by
// multiplying every value by 1/65535.
class hkImageConversion_UNORM16_to_FLOAT : public hkImageConversionStepLinear
{
public:
    // Returns the table of format pairs this step handles.
    virtual hkArrayView<const hkImageConversionEntry> getSupportedConversions() const HK_OVERRIDE
    {
        HK_UNITY_USING_ANONYMOUS_NAMESPACE;
        return hkArrayViewT::make(hkImageConversion_UNORM16_to_FLOAT_supported);
    }

    // Converts numElements pixels of sourceFormat into floats. The channel count is derived
    // from the source format's bits per pixel; targetFormat is not consulted.
    // Always returns HK_SUCCESS.
    virtual hkResult convertPixels(hkArrayView<const void> source, hkArrayView<void> target, hkUint32 numElements, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat) const HK_OVERRIDE
    {
        // Count individual 16-bit channels rather than pixels.
        hkUint32 channelsLeft = numElements;
        channelsLeft *= hkImageFormat::getBitsPerPixel(sourceFormat) / 16;

        const hkUint16* wordsIn = static_cast<const hkUint16*>(source.begin());
        float* floatsOut = static_cast<float*>(target.begin());

        for (; channelsLeft != 0; --channelsLeft)
        {
            *floatsOut = static_cast<float>(*wordsIn) * (1.0f / 65535.0f);
            ++wordsIn;
            ++floatsOut;
        }

        return HK_SUCCESS;
    }
};

namespace HK_UNITY_ANONYMOUS_NAMESPACE
{
    // Single format pair handled by hkImageConversion_UNORM8_SRGB_to_FLOAT:
    // 8-bit sRGB RGBA to 32-bit float RGBA.
    static const hkImageConversionEntry hkImageConversion_UNORM8_SRGB_to_FLOAT_supported[] = {
        hkImageConversionEntry(hkImageFormat::R8_G8_B8_A8_UNSIGNED_NORMALIZED_SRGB, hkImageFormat::R32_G32_B32_A32_FLOAT, hkImageConversionFlags::None),
    };
}

// Conversion step from 8-bit sRGB-encoded RGBA pixels to 32-bit float RGBA pixels.
// The first three channels go through the sRGB decoding curve; the fourth (alpha) is only
// scaled by 1/255.
class hkImageConversion_UNORM8_SRGB_to_FLOAT : public hkImageConversionStepLinear
{
public:
    // Returns the table of format pairs this step handles.
    virtual hkArrayView<const hkImageConversionEntry> getSupportedConversions() const HK_OVERRIDE
    {
        HK_UNITY_USING_ANONYMOUS_NAMESPACE;
        return hkArrayViewT::make(hkImageConversion_UNORM8_SRGB_to_FLOAT_supported);
    }

    // Converts numElements pixels (4 source bytes -> 16 target bytes each).
    // sourceFormat and targetFormat are not consulted. Always returns HK_SUCCESS.
    virtual hkResult convertPixels(hkArrayView<const void> source, hkArrayView<void> target, hkUint32 numElements, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat) const HK_OVERRIDE
    {
        hkUint32 sourceStride = 4;
        hkUint32 targetStride = 16;

        const void* sourcePointer = source.begin();
        void* targetPointer = target.begin();

        // Vector path: four pixels per iteration, only when both buffers pass isAligned().
        if (isAligned(sourcePointer) && isAligned(targetPointer))
        {
            const hkUint32 elementsPerBatch = 4;

            hkVector4f scale;
            scale.setAll(hkSimdFloat32_Inv_255);

            while (numElements >= elementsPerBatch)
            {
                // Load 16 bytes = 4 RGBA pixels.
                hkIntVector in;
                in.load<4, HK_IO_SIMD_ALIGNED>(static_cast<const hkUint32*>(sourcePointer));

                hkIntVector zero;
                zero.setZero();

                // Merge with zero twice: bytes -> 16-bit lanes -> 32-bit lanes.
                hkIntVector short0, short1;
                short0.setMergeHead8(in, zero);
                short1.setMergeTail8(in, zero);

                hkIntVector int0, int1, int2, int3;
                int0.setMergeHead16(short0, zero);
                int1.setMergeTail16(short0, zero);
                int2.setMergeHead16(short1, zero);
                int3.setMergeTail16(short1, zero);

                // One vector per pixel at this point.
                hkVector4f floats[4];
                int0.convertS32ToF32(floats[0]);
                int1.convertS32ToF32(floats[1]);
                int2.convertS32ToF32(floats[2]);
                int3.convertS32ToF32(floats[3]);

                // Normalize to [0, 1].
                floats[0].mul(scale);
                floats[1].mul(scale);
                floats[2].mul(scale);
                floats[3].mul(scale);

                // Transpose so that the vectors contain 4 x red, 4 x green, 4 x blue, 4x alpha
                HK_TRANSPOSE4f(floats[0], floats[1], floats[2], floats[3]);

                // Fast and accurate sRGB curve, inspired by http://lists.blender.org/pipermail/bf-blender-cvs/2014-January/061278.html
                //
                // The linear part is computed as usual. The tricky part is computing
                // x^2.4. This is done by deriving x^0.8 via 3 Newton iterations as the
                // 5th root of x^4 (see http://en.wikipedia.org/wiki/Nth_root_algorithm)
                // starting from an initial approximation by reinterpreting the float
                // as int (similar to the inverse square root trick), and cubing
                // to get x^2.4.
                //
                // Constant factors during the Newton iterations have been mechanically
                // moved around to minimize the number of operations, resulting in the
                // weird factors below.
                for (hkUint32 i = 0; i < 3; i++)    // Alpha isn't encoded as sRGB so only loop up to 3
                {
                    // Encoded values below this cutoff use the linear segment of the curve.
                    hkVector4f linearCutoff;
                    linearCutoff.setAll(0.04045f);

                    hkVector4f linearOffset;
                    linearOffset.setAll(0.055f);

                    hkVector4f linearScale;
                    linearScale.setAll(1.0f / 12.92f);

                    hkVector4f srgbValue = floats[i];

                    // Lanes that take the linear segment instead of the power segment.
                    hkVector4fComparison linearPartMask = srgbValue.less(linearCutoff);

                    hkVector4f linearPart;
                    linearPart.setMul(srgbValue, linearScale);

                    // Base of the power segment: encoded value plus 0.055.
                    // NOTE(review): the usual division by 1.055 does not appear explicitly; it is
                    // presumably folded into the constant factors below - confirm.
                    hkVector4f base;
                    base.setAdd(srgbValue, linearOffset);

                    // Initial approximation via magic
                    hkVector4f x;
                    x.setMul(base, hkSimdFloat32::fromFloat(3611622602.84f)); // 2^(127/0.8 - 127)

                    // Take the float's bit pattern as an integer, scale that integer by 0.8 and
                    // reinterpret the result as a float again.
                    hkIntVector tmpInt;
                    tmpInt.loadAsFloat32BitRepresentation(x);
                    tmpInt.convertS32ToF32(x);
                    x.mul(hkSimdFloat32::fromFloat(0.8f));
                    tmpInt.setConvertF32toS32(x);
                    tmpInt.storeAsFloat32BitRepresentation(x);

                    hkVector4f base2;
                    base2.setMul(base, base);

                    hkVector4f base4;
                    base4.setMul(base2, base2);

                    // Newton iter 1
                    hkVector4f x2, x4, t;
                    x2.setMul(x, x);
                    x4.setMul(x2, x2);
                    t.setDiv<HK_ACC_12_BIT, HK_DIV_IGNORE>(base4, x4);
                    x.setAddMul(t, x, hkSimdFloat32_4);

                    // Newton iter 2
                    x2.setMul(x, x);
                    x4.setMul(x2, x2);
                    t.setDiv<HK_ACC_12_BIT, HK_DIV_IGNORE>(base4, x4);
                    x.setAddMul(t, x, hkSimdFloat32::fromFloat(0.00128f));

                    // Newton iter 3 (note we use a precise division this time)
                    x2.setMul(x, x);
                    x4.setMul(x2, x2);
                    t.setDiv<HK_ACC_FULL, HK_DIV_IGNORE>(base4, x4);
                    x.setAddMul(t, x, hkSimdFloat32::fromFloat(122070312500.0f));

                    // Cube and multiply in any remaining factors
                    x2.setMul(x, x);
                    hkVector4f expPart;
                    expPart.setMul(x, hkSimdFloat32::fromFloat(4.834637627e-28f));
                    expPart.mul(x2);

                    // Blend linear and exponential part
                    floats[i].setSelect(linearPartMask, linearPart, expPart);
                }

                // Transpose back
                HK_TRANSPOSE4f(floats[0], floats[1], floats[2], floats[3]);

                // One vector per pixel again: store four RGBA float pixels.
                floats[0].store<4, HK_IO_SIMD_ALIGNED>(static_cast<float*>(targetPointer) + 0);
                floats[1].store<4, HK_IO_SIMD_ALIGNED>(static_cast<float*>(targetPointer) + 4);
                floats[2].store<4, HK_IO_SIMD_ALIGNED>(static_cast<float*>(targetPointer) + 8);
                floats[3].store<4, HK_IO_SIMD_ALIGNED>(static_cast<float*>(targetPointer) + 12);

                sourcePointer = hkAddByteOffset(sourcePointer, sourceStride * elementsPerBatch);
                targetPointer = hkAddByteOffset(targetPointer, targetStride * elementsPerBatch);
                numElements -= elementsPerBatch;
            }
        }

        // Scalar path for the remaining pixels (or all of them when unaligned): delegates the
        // decoding to the hkColorf constructor taking an hkColorUbGamma.
        while (numElements)
        {
            *static_cast<hkColorf*>(targetPointer) = hkColorf(*static_cast<const hkColorUbGamma*>(sourcePointer));

            sourcePointer = hkAddByteOffset(sourcePointer, sourceStride);
            targetPointer = hkAddByteOffset(targetPointer, targetStride);
            numElements--;
        }

        return HK_SUCCESS;
    }
};

namespace HK_UNITY_ANONYMOUS_NAMESPACE
{
    // Format pairs handled by hkImageConversion_FLOAT_to_UNORM8: 32-bit float sources paired
    // with the 8-bit unsigned-normalized target of the same channel count (1, 2 or 4).
    // The three-channel pair is disabled.
    static const hkImageConversionEntry hkImageConversion_FLOAT_to_UNORM8_supported[] = {
        hkImageConversionEntry(hkImageFormat::R32_FLOAT, hkImageFormat::R8_UNSIGNED_NORMALIZED, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::R32_G32_FLOAT, hkImageFormat::R8_G8_UNSIGNED_NORMALIZED, hkImageConversionFlags::None),
        //hkImageConversionEntry(hkImageFormat::R32G32B32_FLOAT, hkImageFormat::R8G8B8_UNORM, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::R32_G32_B32_A32_FLOAT, hkImageFormat::R8_G8_B8_A8_UNSIGNED_NORMALIZED, hkImageConversionFlags::None),
    };
}

// Conversion step that quantizes 32-bit float channels to 8-bit unsigned-normalized values:
// NaN becomes 0, everything else is clamped to [0, 1], scaled by 255 and rounded by adding
// 0.5 and truncating.
class hkImageConversion_FLOAT_to_UNORM8 : public hkImageConversionStepLinear
{
public:
    // Returns the table of format pairs this step handles.
    virtual hkArrayView<const hkImageConversionEntry> getSupportedConversions() const HK_OVERRIDE
    {
        HK_UNITY_USING_ANONYMOUS_NAMESPACE;
        return hkArrayViewT::make( hkImageConversion_FLOAT_to_UNORM8_supported );
    }

    // Converts numElements pixels into targetFormat. The channel count is derived from the
    // target format's bits per pixel; sourceFormat is not consulted. Always returns HK_SUCCESS.
    virtual hkResult convertPixels(hkArrayView<const void> source, hkArrayView<void> target, hkUint32 numElements, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat) const HK_OVERRIDE
    {
        // Work with single channels instead of pixels
        numElements *= hkImageFormat::getBitsPerPixel(targetFormat) / 8;

        hkUint32 sourceStride = 4;
        hkUint32 targetStride = 1;

        const void* sourcePointer = source.begin();
        void* targetPointer = target.begin();

#if defined(HK_SSE_VERSION) && (HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED) && defined(HK_ARCH_INTEL)
        // Not implemented generically since NaN handling is not sufficiently exposed via hkVector4f
        if (isAligned(sourcePointer) && isAligned(targetPointer))
        {
            // 16 floats in (four registers), 16 bytes out (one register) per iteration.
            const hkUint32 elementsPerBatch = 16;

            __m128 zero = _mm_setzero_ps();
            __m128 one = _mm_set1_ps(1.0f);
            __m128 scale = _mm_set1_ps(255.0f);
            __m128 half = _mm_set1_ps(0.5f);

            while (numElements >= elementsPerBatch)
            {
                __m128 float0 = _mm_load_ps(static_cast<const float*>(sourcePointer) + 0);
                __m128 float1 = _mm_load_ps(static_cast<const float*>(sourcePointer) + 4);
                __m128 float2 = _mm_load_ps(static_cast<const float*>(sourcePointer) + 8);
                __m128 float3 = _mm_load_ps(static_cast<const float*>(sourcePointer) + 12);

                // Clamp NaN to zero
                float0 = _mm_and_ps(_mm_cmpord_ps(float0, zero), float0);
                float1 = _mm_and_ps(_mm_cmpord_ps(float1, zero), float1);
                float2 = _mm_and_ps(_mm_cmpord_ps(float2, zero), float2);
                float3 = _mm_and_ps(_mm_cmpord_ps(float3, zero), float3);

                // Saturate
                float0 = _mm_max_ps(zero, _mm_min_ps(one, float0));
                float1 = _mm_max_ps(zero, _mm_min_ps(one, float1));
                float2 = _mm_max_ps(zero, _mm_min_ps(one, float2));
                float3 = _mm_max_ps(zero, _mm_min_ps(one, float3));

                float0 = _mm_mul_ps(float0, scale);
                float1 = _mm_mul_ps(float1, scale);
                float2 = _mm_mul_ps(float2, scale);
                float3 = _mm_mul_ps(float3, scale);

                // Add 0.5f and truncate for rounding as required by D3D spec
                float0 = _mm_add_ps(float0, half);
                float1 = _mm_add_ps(float1, half);
                float2 = _mm_add_ps(float2, half);
                float3 = _mm_add_ps(float3, half);

                __m128i int0 = _mm_cvttps_epi32(float0);
                __m128i int1 = _mm_cvttps_epi32(float1);
                __m128i int2 = _mm_cvttps_epi32(float2);
                __m128i int3 = _mm_cvttps_epi32(float3);

                // Narrow 32 -> 16 -> 8 bits.
                __m128i short0 = _mm_packs_epi32(int0, int1);
                __m128i short1 = _mm_packs_epi32(int2, int3);

                _mm_store_si128(reinterpret_cast<__m128i*>(targetPointer), _mm_packus_epi16(short0, short1));

                sourcePointer = hkAddByteOffset(sourcePointer, sourceStride * elementsPerBatch);
                targetPointer = hkAddByteOffset(targetPointer, targetStride * elementsPerBatch);
                numElements -= elementsPerBatch;
            }
        }

#endif

        // Scalar path for the remaining channels (or all of them when the vector path is not
        // taken). It is written with plain comparisons so that it agrees with the vector path
        // for every input: only NaN is forced to 0 and infinities are clamped like any other
        // out-of-range value (+Inf -> 255, -Inf -> 0).
        // NOTE(review): this replaces a gate on hkSimdFloat32::isOk(); if isOk() also rejects
        // infinities, the previous code wrote 0 for +Inf here while the vector path wrote 255 -
        // confirm against hkSimdFloat32. Finite inputs produce the same bytes as before.
        while (numElements)
        {
            float value = *reinterpret_cast<const float*>(sourcePointer);
            hkUint8 quantized;

            if (!(value > 0.0f)) // NaN (every comparison is false), zero and negatives
            {
                quantized = 0;
            }
            else if (value >= 1.0f) // Saturate, including +Inf
            {
                quantized = 255;
            }
            else
            {
                value = value * 255;

                value += 0.5f;

                // Cast to int drops decimal fraction as required by D3D data conversion rules
                quantized = static_cast<hkUint8>(value);
            }

            *reinterpret_cast<hkUint8*>(targetPointer) = quantized;

            sourcePointer = hkAddByteOffset(sourcePointer, sourceStride);
            targetPointer = hkAddByteOffset(targetPointer, targetStride);
            numElements--;
        }

        return HK_SUCCESS;
    }
};

namespace HK_UNITY_ANONYMOUS_NAMESPACE
{
    // Format pairs handled by hkImageConversion_FLOAT_to_UNORM16: 32-bit float sources paired
    // with the 16-bit unsigned-normalized target of the same channel count (1, 2 or 4).
    const hkImageConversionEntry hkImageConversion_FLOAT_to_UNORM16_supported[] = {
        hkImageConversionEntry(hkImageFormat::R32_FLOAT, hkImageFormat::R16_UNSIGNED_NORMALIZED, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::R32_G32_FLOAT, hkImageFormat::R16_G16_UNSIGNED_NORMALIZED, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::R32_G32_B32_A32_FLOAT, hkImageFormat::R16_G16_B16_A16_UNSIGNED_NORMALIZED, hkImageConversionFlags::None),
    };
}

// Conversion step that quantizes 32-bit float channels to 16-bit unsigned-normalized values:
// NaN becomes 0, everything else is clamped to [0, 1], scaled by 65535 and rounded by adding
// 0.5 and truncating.
class hkImageConversion_FLOAT_to_UNORM16 : public hkImageConversionStepLinear
{
public:
    // Returns the table of format pairs this step handles.
    virtual hkArrayView<const hkImageConversionEntry> getSupportedConversions() const HK_OVERRIDE
    {
        HK_UNITY_USING_ANONYMOUS_NAMESPACE;
        return hkArrayViewT::make(hkImageConversion_FLOAT_to_UNORM16_supported);
    }

    // Converts numElements pixels into targetFormat. The channel count is derived from the
    // target format's bits per pixel; sourceFormat is not consulted. Always returns HK_SUCCESS.
    virtual hkResult convertPixels(hkArrayView<const void> source, hkArrayView<void> target, hkUint32 numElements, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat) const HK_OVERRIDE
    {
        // Work with single channels instead of pixels
        numElements *= hkImageFormat::getBitsPerPixel(targetFormat) / 16;

        hkUint32 sourceStride = 4;
        hkUint32 targetStride = 2;

        const void* sourcePointer = source.begin();
        void* targetPointer = target.begin();

        // Written with plain comparisons so that only NaN is forced to 0 and infinities are
        // clamped like any other out-of-range value (+Inf -> 65535, -Inf -> 0).
        // NOTE(review): this replaces a gate on hkSimdFloat32::isOk(); if isOk() also rejects
        // infinities, the previous code wrote 0 for +Inf instead of saturating - confirm against
        // hkSimdFloat32. Finite inputs produce the same values as before.
        while (numElements)
        {
            float value = *reinterpret_cast<const float*>(sourcePointer);
            hkUint16 quantized;

            if (!(value > 0.0f)) // NaN (every comparison is false), zero and negatives
            {
                quantized = 0;
            }
            else if (value >= 1.0f) // Saturate, including +Inf
            {
                quantized = 65535;
            }
            else
            {
                value = value * 65535;

                value += 0.5f;

                // Cast to int drops decimal fraction as required by D3D data conversion rules
                quantized = static_cast<hkUint16>(value);
            }

            *reinterpret_cast<hkUint16*>(targetPointer) = quantized;

            sourcePointer = hkAddByteOffset(sourcePointer, sourceStride);
            targetPointer = hkAddByteOffset(targetPointer, targetStride);
            numElements--;
        }

        return HK_SUCCESS;
    }
};

namespace HK_UNITY_ANONYMOUS_NAMESPACE
{
    // Single format pair handled by hkImageConversion_FLOAT_to_UNORM8_SRGB:
    // 32-bit float RGBA to 8-bit sRGB RGBA.
    static const hkImageConversionEntry hkImageConversion_FLOAT_to_UNORM8_SRGB_supported[] = {
        hkImageConversionEntry(hkImageFormat::R32_G32_B32_A32_FLOAT, hkImageFormat::R8_G8_B8_A8_UNSIGNED_NORMALIZED_SRGB, hkImageConversionFlags::None),
    };
}

// Conversion step from 32-bit float RGBA pixels to 8-bit sRGB RGBA pixels. The per-pixel
// work is delegated to the hkColorf -> hkColorUbGamma conversion.
class hkImageConversion_FLOAT_to_UNORM8_SRGB : public hkImageConversionStepLinear
{
public:
    // Returns the table of format pairs this step handles.
    virtual hkArrayView<const hkImageConversionEntry> getSupportedConversions() const HK_OVERRIDE
    {
        HK_UNITY_USING_ANONYMOUS_NAMESPACE;
        return hkArrayViewT::make(hkImageConversion_FLOAT_to_UNORM8_SRGB_supported);
    }

    // Converts numElements pixels (16 source bytes -> 4 target bytes each).
    // sourceFormat and targetFormat are not consulted. Always returns HK_SUCCESS.
    virtual hkResult convertPixels(hkArrayView<const void> source, hkArrayView<void> target, hkUint32 numElements, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat) const HK_OVERRIDE
    {
        const hkUint32 bytesPerSourcePixel = 16;
        const hkUint32 bytesPerTargetPixel = 4;

        const void* readCursor = source.begin();
        void* writeCursor = target.begin();

        for (hkUint32 remaining = numElements; remaining != 0; --remaining)
        {
            const hkColorf* linearColor = static_cast<const hkColorf*>(readCursor);
            hkColorUbGamma* encodedColor = static_cast<hkColorUbGamma*>(writeCursor);

            *encodedColor = *linearColor;

            readCursor = hkAddByteOffset(readCursor, bytesPerSourcePixel);
            writeCursor = hkAddByteOffset(writeCursor, bytesPerTargetPixel);
        }

        return HK_SUCCESS;
    }
};

namespace HK_UNITY_ANONYMOUS_NAMESPACE
{
    // Format pairs handled by hkImageConversion_FLOAT32_to_16: 32-bit float sources paired
    // with the 16-bit float target of the same channel count (4, 2 or 1).
    const hkImageConversionEntry hkImageConversion_FLOAT32_to_16_supported[] = {
        hkImageConversionEntry(hkImageFormat::R32_G32_B32_A32_FLOAT, hkImageFormat::R16_G16_B16_A16_FLOAT, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::R32_G32_FLOAT, hkImageFormat::R16_G16_FLOAT, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::R32_FLOAT, hkImageFormat::R16_FLOAT, hkImageConversionFlags::None),
    };
}

// Conversion step that narrows 32-bit float channels to hkFloat16 values via
// hkFloat16::setReal<true>().
class hkImageConversion_FLOAT32_to_16 : public hkImageConversionStepLinear
{
public:
    // Returns the table of format pairs this step handles.
    virtual hkArrayView<const hkImageConversionEntry> getSupportedConversions() const HK_OVERRIDE
    {
        HK_UNITY_USING_ANONYMOUS_NAMESPACE;
        return hkArrayViewT::make(hkImageConversion_FLOAT32_to_16_supported);
    }

    // Converts numElements pixels of sourceFormat. The channel count is derived from the source
    // format's bits per pixel; targetFormat is not consulted. Always returns HK_SUCCESS.
    virtual hkResult convertPixels(hkArrayView<const void> source, hkArrayView<void> target, hkUint32 numElements, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat) const HK_OVERRIDE
    {
        const hkUint32 bytesPerFloat32 = 4;
        const hkUint32 bytesPerFloat16 = 2;

        // Count individual floats rather than pixels.
        hkUint32 floatsLeft = numElements;
        floatsLeft *= hkImageFormat::getBitsPerPixel(sourceFormat) / 32;

        const void* readCursor = source.begin();
        void* writeCursor = target.begin();

        for (; floatsLeft != 0; --floatsLeft)
        {
            const float wide = *static_cast<const float*>(readCursor);
            static_cast<hkFloat16*>(writeCursor)->setReal<true>(wide);

            readCursor = hkAddByteOffset(readCursor, bytesPerFloat32);
            writeCursor = hkAddByteOffset(writeCursor, bytesPerFloat16);
        }

        return HK_SUCCESS;
    }
};

namespace HK_UNITY_ANONYMOUS_NAMESPACE
{
    // Format pairs handled by hkImageConversion_FLOAT16_to_32: 16-bit float sources paired
    // with the 32-bit float target of the same channel count (4, 2 or 1).
    const hkImageConversionEntry hkImageConversion_FLOAT16_to_32_supported[] = {
        hkImageConversionEntry(hkImageFormat::R16_G16_B16_A16_FLOAT, hkImageFormat::R32_G32_B32_A32_FLOAT, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::R16_G16_FLOAT, hkImageFormat::R32_G32_FLOAT, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::R16_FLOAT, hkImageFormat::R32_FLOAT, hkImageConversionFlags::None),
    };
}

// Conversion step that widens hkFloat16 channels to 32-bit floats via
// hkFloat16::getFloat32().
class hkImageConversion_FLOAT16_to_32 : public hkImageConversionStepLinear
{
public:
    // Returns the table of format pairs this step handles.
    virtual hkArrayView<const hkImageConversionEntry> getSupportedConversions() const HK_OVERRIDE
    {
        HK_UNITY_USING_ANONYMOUS_NAMESPACE;
        return hkArrayViewT::make(hkImageConversion_FLOAT16_to_32_supported);
    }

    // Converts numElements pixels into targetFormat. The channel count is derived from the
    // target format's bits per pixel; sourceFormat is not consulted. Always returns HK_SUCCESS.
    virtual hkResult convertPixels(hkArrayView<const void> source, hkArrayView<void> target, hkUint32 numElements, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat) const HK_OVERRIDE
    {
        const hkUint32 bytesPerFloat16 = 2;
        const hkUint32 bytesPerFloat32 = 4;

        // Count individual floats rather than pixels.
        hkUint32 floatsLeft = numElements;
        floatsLeft *= hkImageFormat::getBitsPerPixel(targetFormat) / 32;

        const void* readCursor = source.begin();
        void* writeCursor = target.begin();

        for (; floatsLeft != 0; --floatsLeft)
        {
            const hkFloat16* narrow = static_cast<const hkFloat16*>(readCursor);
            *static_cast<float*>(writeCursor) = narrow->getFloat32();

            readCursor = hkAddByteOffset(readCursor, bytesPerFloat16);
            writeCursor = hkAddByteOffset(writeCursor, bytesPerFloat32);
        }

        return HK_SUCCESS;
    }
};

/// Linear conversion step from three 32-bit float channels (R, G, B) to three 8-bit channels
/// stored in B, G, R order.
class hkImageConversion_R32_G32_B32_FLOAT_to_B8_G8_R8_UNSIGNED_NORMALIZED : public hkImageConversionStepLinear
{
public:
    /// Returns the single format pair handled by this step.
    virtual hkArrayView<const hkImageConversionEntry> getSupportedConversions() const HK_OVERRIDE
    {
        static const hkImageConversionEntry s_entries[] =
        {
            hkImageConversionEntry( hkImageFormat::R32_G32_B32_FLOAT, hkImageFormat::B8_G8_R8_UNSIGNED_NORMALIZED, hkImageConversionFlags::None )
        };
        return hkArrayViewT::make( s_entries );
    }

    /// Converts \a numElements pixels. The per-pixel work is handed to hkConcurrency::parallelForEx.
    virtual hkResult convertPixels( hkArrayView<const void> source, hkArrayView<void> target, hkUint32 numElements, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat ) const HK_OVERRIDE
    {
        const hkFloat32* floatPixels = reinterpret_cast<const hkFloat32*>(source.begin());
        hkUint8* bytePixels = reinterpret_cast<hkUint8*>(target.begin());

        hkConcurrency::parallelForEx( numElements, [floatPixels, bytePixels]( hkConcurrency::Iterator pixelRange )
        {
            for ( int pixel : pixelRange )
            {
                // Read the three float channels of this pixel
                hkVector4 rgb;
                rgb.load<3, HK_IO_NATIVE_ALIGNED>( floatPixels + pixel * 3 );

                // Clamp to [0,1], scale to [0,255] and add 0.5 ahead of the integer conversion
                hkVector4 scaled;
                scaled.setClampedZeroOne( rgb );
                scaled *= hkSimdReal_255;
                scaled += hkVector4::ctor( hkSimdReal_Inv2 );

                hkIntVector quantized;
                quantized.setConvertF32toS32( scaled );

                // Write the channels in reverse order: blue first, red last
                hkUint8* HK_RESTRICT bgr = bytePixels + pixel * 3;
                bgr[0] = hkUint8( quantized.getComponent<2>() );
                bgr[1] = hkUint8( quantized.getComponent<1>() );
                bgr[2] = hkUint8( quantized.getComponent<0>() );
            }
        } );
        return HK_SUCCESS;
    }
};


/// Linear conversion step from 32-bit float channels to 8-bit channels in the same channel order.
/// Handles both the three-channel (RGB) and four-channel (RGBA) variants.
class hkImageConversion_R32_G32_B32_FLOAT_to_R8_G8_B8_UNSIGNED_NORMALIZED : public hkImageConversionStepLinear
{
    public:
    /// Returns the RGB and RGBA float -> 8-bit format pairs handled by this step.
    virtual hkArrayView<const hkImageConversionEntry> getSupportedConversions() const HK_OVERRIDE
    {
        static const hkImageConversionEntry s_entries[] =
        {
            hkImageConversionEntry( hkImageFormat::R32_G32_B32_FLOAT, hkImageFormat::R8_G8_B8_UNSIGNED_NORMALIZED, hkImageConversionFlags::None ),
            hkImageConversionEntry( hkImageFormat::R32_G32_B32_A32_FLOAT, hkImageFormat::R8_G8_B8_A8_UNSIGNED_NORMALIZED, hkImageConversionFlags::None )
        };
        return hkArrayViewT::make( s_entries );
    }

    /// Converts \a numElements pixels. Since the channel order is unchanged, the image is treated as a flat
    /// run of channel values: whole batches go through hkConcurrency::parallelForEx, the rest through a scalar loop.
    virtual hkResult convertPixels( hkArrayView<const void> source, hkArrayView<void> target, hkUint32 numElements, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat ) const HK_OVERRIDE
    {
        const hkFloat32* floatsIn = reinterpret_cast<const hkFloat32*>(source.begin());
        hkUint8* bytesOut = reinterpret_cast<hkUint8*>(target.begin());

        // One target byte per channel, so the byte count of the target equals the number of values to convert
        const hkUint32 numValues = numElements * ( hkImageFormat::getBitsPerPixel( targetFormat ) / 8 );

        const int valuesPerBatch = 32;
        const int batchCount = numValues / valuesPerBatch;

        hkConcurrency::parallelForEx( batchCount, [floatsIn, bytesOut, valuesPerBatch]( hkConcurrency::Iterator batches )
        {
            for ( int batch : batches )
            {
                const int batchStart = batch * valuesPerBatch;

                // Four values per step: clamp, scale, add 0.5, then narrow to bytes
                for ( int offset = 0; offset < valuesPerBatch; offset += 4 )
                {
                    const int first = offset + batchStart;

                    hkVector4 quad;
                    quad.load<4, HK_IO_BYTE_ALIGNED>( floatsIn + first );
                    quad.setClampedZeroOne( quad );
                    quad *= hkSimdReal_255;
                    quad += hkVector4::ctor( hkSimdReal_Inv2 );

                    hkIntVector packed;
                    packed.setConvertF32toS32( quad );
                    packed.setConvertSaturateS32ToS16( packed, packed );
                    packed.setConvertSaturateS16ToU8( packed, packed );

                    // The four resulting bytes are stored as a single 32-bit word
                    hkUint32* HK_RESTRICT word = reinterpret_cast<hkUint32*>(bytesOut + first);
                    packed.store<1, HK_IO_BYTE_ALIGNED>( word );
                }
            }
        } );

        // Scalar path for the values that did not fill a whole batch
        for ( unsigned int index = batchCount*valuesPerBatch; index < numValues; index++ )
        {
            hkReal scaled = floatsIn[index] * 255.0f + 0.5f;
            scaled = hkMath::clamp( scaled, 0, 255.0f );
            bytesOut[index] = hkUint8( scaled );
        }

        return HK_SUCCESS;
    }
};

// convert to 8 bit and add alpha
class hkImageConversion_R32_G32_B32_FLOAT_to_R8_G8_B8_A8_UNSIGNED_NORMALIZED : public hkImageConversionStepLinear
{
    public:
    virtual hkArrayView<const hkImageConversionEntry> getSupportedConversions() const HK_OVERRIDE
    {
        static const hkImageConversionEntry supported[] =
        {
            hkImageConversionEntry( hkImageFormat::R32_G32_B32_FLOAT, hkImageFormat::R8_G8_B8_A8_UNSIGNED_NORMALIZED, hkImageConversionFlags::None )
        };
        return hkArrayViewT::make( supported );
    }

    virtual hkResult convertPixels( hkArrayView<const void> source, hkArrayView<void> target, hkUint32 numPixels, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat ) const HK_OVERRIDE
    {
        const hkFloat32* inputs = reinterpret_cast<const hkFloat32*>(source.begin());
        hkUint8* outputs = reinterpret_cast<hkUint8*>(target.begin());

        const int numPixelserBatch = 32;
        int numBatches = numPixels / numPixelserBatch;

        hkConcurrency::parallelForEx( numBatches, [inputs, outputs, numPixelserBatch]( hkConcurrency::Iterator elements )
        {
            for ( int batchIndex : elements )
            {
                for ( int i = 0; i < numPixelserBatch; i ++ )
                {
                    int k = 3 * (i + batchIndex * numPixelserBatch);
                    hkVector4 val; val.load<3, HK_IO_BYTE_ALIGNED>( inputs + k );
                    val.setClampedZeroOne( val );
                    val.setW( hkSimdReal_1 );   // alpha
                    val *= hkSimdReal_255;
                    val += hkVector4::ctor( hkSimdReal_Inv2 );
                    hkIntVector ji; ji.setConvertF32toS32( val );
                    ji.setConvertSaturateS32ToS16( ji, ji );
                    ji.setConvertSaturateS16ToU8( ji, ji );

                    int k2 = 4 * (i + batchIndex * numPixelserBatch);
                    hkUint32* HK_RESTRICT out = (hkUint32*)(outputs + k2);
                    ji.store<1, HK_IO_BYTE_ALIGNED>( out );
                }
            }
        } );
        // do the remaining pixels
        for ( unsigned int i = numBatches*numPixelserBatch; i < numPixels; i++ )
        {
            hkVector4 val; val.load<3, HK_IO_BYTE_ALIGNED>( inputs + 3*i );
            val.setClampedZeroOne( val );
            val.setW( hkSimdReal_1 );   // alpha
            val *= hkSimdReal_255;
            val += hkVector4::ctor( hkSimdReal_Inv2 );
            hkIntVector ji; ji.setConvertF32toS32( val );
            ji.setConvertSaturateS32ToS16( ji, ji );
            ji.setConvertSaturateS16ToU8( ji, ji );

            hkUint32* HK_RESTRICT out = (hkUint32*)(outputs + 4*i);
            ji.store<1, HK_IO_BYTE_ALIGNED>( out );
        }

        return HK_SUCCESS;
    }
};


// Source/target format pairs returned by hkImageConversion_Pad_to_RGBA32::getSupportedConversions().
// Every entry expands a one-, two- or three-channel 32-bit float format to R32_G32_B32_A32_FLOAT
// and is registered with hkImageConversionFlags::None.
namespace HK_UNITY_ANONYMOUS_NAMESPACE
{
    static const hkImageConversionEntry hkImageConversion_Pad_to_RGBA32_supported[] = {
        hkImageConversionEntry(hkImageFormat::R32_FLOAT, hkImageFormat::R32_G32_B32_A32_FLOAT, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::R32_G32_FLOAT, hkImageFormat::R32_G32_B32_A32_FLOAT, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::R32_G32_B32_FLOAT, hkImageFormat::R32_G32_B32_A32_FLOAT, hkImageConversionFlags::None),
    };
}

/// Linear conversion step that expands 32-bit float pixels with fewer than four channels to RGBA:
/// missing colour channels become 0 and alpha becomes 1.
class hkImageConversion_Pad_to_RGBA32 : public hkImageConversionStepLinear
{
public:
    /// Returns the table of format pairs this step handles.
    virtual hkArrayView<const hkImageConversionEntry> getSupportedConversions() const HK_OVERRIDE
    {
        HK_UNITY_USING_ANONYMOUS_NAMESPACE;
        return hkArrayViewT::make(hkImageConversion_Pad_to_RGBA32_supported);
    }

    /// Converts \a numElements pixels from \a source into \a target.
    /// Pixel sizes are taken from the bit widths of \a sourceFormat and \a targetFormat.
    virtual hkResult convertPixels(hkArrayView<const void> source, hkArrayView<void> target, hkUint32 numElements, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat) const HK_OVERRIDE
    {
        const hkUint32 bytesPerSourcePixel = hkImageFormat::getBitsPerPixel(sourceFormat) / 8;
        const hkUint32 bytesPerTargetPixel = hkImageFormat::getBitsPerPixel(targetFormat) / 8;

        // Number of float channels actually present in each source pixel
        const hkUint32 presentChannels = bytesPerSourcePixel / sizeof(float);

        const float* in = static_cast<const float*>(source.begin());
        float* out = static_cast<float*>(target.begin());

        for (hkUint32 remaining = numElements; remaining != 0; --remaining)
        {
            // Channels the source provides are carried over unchanged
            hkString::memCpy4(out, in, presentChannels);

            // Colour channels the source lacks are zeroed
            hkString::memSet4(out + presentChannels, 0, 3 - presentChannels);

            // Alpha is always fully opaque
            out[3] = 1.0f;

            in = hkAddByteOffset(in, bytesPerSourcePixel);
            out = hkAddByteOffset(out, bytesPerTargetPixel);
        }

        return HK_SUCCESS;
    }
};

// Source/target format pairs returned by hkImageConversion_Pad_to_RGBA8::getSupportedConversions().
// Both entries expand a one- or two-channel 8-bit normalized format to R8_G8_B8_A8_UNSIGNED_NORMALIZED
// and are registered with hkImageConversionFlags::None.
namespace HK_UNITY_ANONYMOUS_NAMESPACE
{
    const hkImageConversionEntry hkImageConversion_Pad_to_RGBA8_supported[] = {
        hkImageConversionEntry(hkImageFormat::R8_UNSIGNED_NORMALIZED, hkImageFormat::R8_G8_B8_A8_UNSIGNED_NORMALIZED, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::R8_G8_UNSIGNED_NORMALIZED, hkImageFormat::R8_G8_B8_A8_UNSIGNED_NORMALIZED, hkImageConversionFlags::None),
    };
}

/// Linear conversion step that expands 8-bit pixels with fewer than four channels to RGBA:
/// missing colour channels become 0 and alpha becomes 0xFF.
class hkImageConversion_Pad_to_RGBA8 : public hkImageConversionStepLinear
{
public:
    /// Returns the table of format pairs this step handles.
    virtual hkArrayView<const hkImageConversionEntry> getSupportedConversions() const HK_OVERRIDE
    {
        HK_UNITY_USING_ANONYMOUS_NAMESPACE;
        return hkArrayViewT::make(hkImageConversion_Pad_to_RGBA8_supported);
    }

    /// Converts \a numElements pixels from \a source into \a target.
    /// Pixel sizes are taken from the bit widths of \a sourceFormat and \a targetFormat.
    virtual hkResult convertPixels(hkArrayView<const void> source, hkArrayView<void> target, hkUint32 numElements, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat) const HK_OVERRIDE
    {
        const hkUint32 bytesPerSourcePixel = hkImageFormat::getBitsPerPixel(sourceFormat) / 8;
        const hkUint32 bytesPerTargetPixel = hkImageFormat::getBitsPerPixel(targetFormat) / 8;

        // Number of byte channels actually present in each source pixel
        const hkUint32 presentChannels = bytesPerSourcePixel / sizeof(hkUint8);

        const hkUint8* in = static_cast<const hkUint8*>(source.begin());
        hkUint8* out = static_cast<hkUint8*>(target.begin());

        for (hkUint32 remaining = numElements; remaining != 0; --remaining)
        {
            // Channels the source provides are carried over unchanged
            hkString::memCpy(out, in, presentChannels);

            // Colour channels the source lacks are zeroed
            hkString::memSet(out + presentChannels, 0, 3 - presentChannels);

            // Alpha is always fully opaque
            out[3] = 0xFF;

            in = hkAddByteOffset(in, bytesPerSourcePixel);
            out = hkAddByteOffset(out, bytesPerTargetPixel);
        }

        return HK_SUCCESS;
    }
};

// Source/target format pairs returned by hkImageConversion_FLOAT_to_R11G11B10::getSupportedConversions().
// Both the three- and four-channel 32-bit float formats convert to R11_G11_B10_UNSIGNED_FLOAT;
// every entry is registered with hkImageConversionFlags::None.
namespace HK_UNITY_ANONYMOUS_NAMESPACE
{
    static const hkImageConversionEntry hkImageConversion_FLOAT_to_R11G11B10_supported[] = {
        hkImageConversionEntry(hkImageFormat::R32_G32_B32_A32_FLOAT, hkImageFormat::R11_G11_B10_UNSIGNED_FLOAT, hkImageConversionFlags::None),
        hkImageConversionEntry(hkImageFormat::R32_G32_B32_FLOAT, hkImageFormat::R11_G11_B10_UNSIGNED_FLOAT, hkImageConversionFlags::None),
    };
}

/// Linear conversion step that packs 32-bit float RGB(A) pixels into one 32-bit word holding two
/// unsigned 11-bit floats (X, Y) and one unsigned 10-bit float (Z).
/// Only the first three channels of each source pixel are read; a fourth channel, if present, is ignored.
class hkImageConversion_FLOAT_to_R11G11B10 : public hkImageConversionStepLinear
{
public:
    /// Returns the table of format pairs this step handles.
    virtual hkArrayView<const hkImageConversionEntry> getSupportedConversions() const HK_OVERRIDE
    {
        HK_UNITY_USING_ANONYMOUS_NAMESPACE;
        return hkArrayViewT::make(hkImageConversion_FLOAT_to_R11G11B10_supported);
    }

    /// Converts \a numElements pixels from \a source into \a target.
    /// Negative values and -INF become 0, values above the representable maximum become the largest
    /// finite value, +INF stays INF and NaN stays NaN. Values too small for a normalized result are
    /// converted to denormals, and values too small even for that become 0.
    virtual hkResult convertPixels(hkArrayView<const void> source, hkArrayView<void> target, hkUint32 numElements, hkImageFormat::Enum sourceFormat, hkImageFormat::Enum targetFormat) const HK_OVERRIDE
    {
        hkUint32 sourceStride = hkImageFormat::getBitsPerPixel(sourceFormat) / 8;
        hkUint32 targetStride = hkImageFormat::getBitsPerPixel(targetFormat) / 8;

        const void* sourcePointer = source.begin();
        void* targetPointer = target.begin();

        while (numElements)
        {
            // Adapted from DirectXMath's XMStoreFloat3PK
            // Work on the raw IEEE-754 bit patterns of the first three channels
            hkUint32 IValue[3];
            hkString::memCpy4(IValue, sourcePointer, 3);

            hkUint32 Result[3];

            // X & Y Channels (5-bit exponent, 6-bit mantissa)
            for (hkUint32 j = 0; j < 2; ++j)
            {
                hkUint32 Sign = IValue[j] & 0x80000000;
                hkUint32 I = IValue[j] & 0x7FFFFFFF;

                if ((I & 0x7F800000) == 0x7F800000)
                {
                    // INF or NAN
                    Result[j] = 0x7c0;
                    if ((I & 0x7FFFFF) != 0)
                    {
                        // NaN: fold the source mantissa bits into the 6-bit target mantissa
                        Result[j] = 0x7c0 | (((I >> 17) | (I >> 11) | (I >> 6) | (I)) & 0x3f);
                    }
                    else if (Sign)
                    {
                        // -INF is clamped to 0 since 3PK is positive only
                        Result[j] = 0;
                    }
                }
                else if (Sign)
                {
                    // 3PK is positive only, so clamp to zero
                    Result[j] = 0;
                }
                else if (I > 0x477E0000U)
                {
                    // The number is too large to be represented as a float11, set to max
                    Result[j] = 0x7BF;
                }
                else
                {
                    if (I < 0x38800000U)
                    {
                        // The number is too small to be represented as a normalized float11
                        // Convert it to a denormalized value.
                        hkUint32 Shift = 113U - (I >> 23U);

                        // Shifting a 32-bit value by 32 or more is undefined behaviour (Shift reaches
                        // 113 for 0.0f). The significand only has 24 bits, so such inputs are zero.
                        I = (Shift < 32U) ? ((0x800000U | (I & 0x7FFFFFU)) >> Shift) : 0U;
                    }
                    else
                    {
                        // Rebias the exponent to represent the value as a normalized float11
                        // (adding 0xC8000000 subtracts 112 from the exponent field, modulo 2^32)
                        I += 0xC8000000U;
                    }

                    // Drop the low 17 bits, rounding to nearest with ties to even
                    Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U) & 0x7ffU;
                }
            }

            // Z Channel (5-bit exponent, 5-bit mantissa)
            hkUint32 Sign = IValue[2] & 0x80000000;
            hkUint32 I = IValue[2] & 0x7FFFFFFF;

            if ((I & 0x7F800000) == 0x7F800000)
            {
                // INF or NAN
                Result[2] = 0x3e0;
                if (I & 0x7FFFFF)
                {
                    // NaN: fold the source mantissa bits into the 5-bit target mantissa
                    Result[2] = 0x3e0 | (((I >> 18) | (I >> 13) | (I >> 3) | (I)) & 0x1f);
                }
                else if (Sign)
                {
                    // -INF is clamped to 0 since 3PK is positive only
                    Result[2] = 0;
                }
            }
            else if (Sign)
            {
                // 3PK is positive only, so clamp to zero
                Result[2] = 0;
            }
            else if (I > 0x477C0000U)
            {
                // The number is too large to be represented as a float10, set to max
                Result[2] = 0x3df;
            }
            else
            {
                if (I < 0x38800000U)
                {
                    // The number is too small to be represented as a normalized float10
                    // Convert it to a denormalized value.
                    hkUint32 Shift = 113U - (I >> 23U);

                    // Same guard as for X and Y: never shift by 32 or more bits
                    I = (Shift < 32U) ? ((0x800000U | (I & 0x7FFFFFU)) >> Shift) : 0U;
                }
                else
                {
                    // Rebias the exponent to represent the value as a normalized float10
                    I += 0xC8000000U;
                }

                // Drop the low 18 bits, rounding to nearest with ties to even
                Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U) & 0x3ffU;
            }

            // Pack Result into memory: X in bits 0-10, Y in bits 11-21, Z in bits 22-31
            *reinterpret_cast<hkUint32*>(targetPointer) = (Result[0] & 0x7ff)
                | ((Result[1] & 0x7ff) << 11)
                | ((Result[2] & 0x3ff) << 22);

            sourcePointer = hkAddByteOffset(sourcePointer, sourceStride);
            targetPointer = hkAddByteOffset(targetPointer, targetStride);
            numElements--;
        }

        return HK_SUCCESS;
    }
};


// One file-local instance of each conversion step class.
// NOTE(review): presumably constructing an hkImageConversionStep registers it with the enumerable-class
// list declared through HK_ENUMERABLE_CLASS_IMPLEMENTATION at the top of this file, which would make
// these instances the registration point for the built-in conversions - confirm against the base class.
static hkImageSwizzleConversion32_2103 s_swizzle32;
static hkImageConversion_AddAlpha s_addAlpha;
static hkImageConversion_UNORM8_to_FLOAT s_unorm8_to_float;
static hkImageConversion_UNORM16_to_FLOAT s_unorm16_to_float;
static hkImageConversion_UNORM8_SRGB_to_FLOAT s_unorm8_srgb_to_float;
static hkImageConversion_FLOAT_to_UNORM8 s_float_to_unorm8;
static hkImageConversion_FLOAT_to_UNORM16 s_float_to_unorm16;
static hkImageConversion_FLOAT_to_UNORM8_SRGB s_float_to_unorm8_srgb;
static hkImageConversion_FLOAT32_to_16 s_float32_to16;
static hkImageConversion_FLOAT16_to_32 s_float16_to32;
static hkImageConversion_Pad_to_RGBA32 s_pad_to_rgba32;
static hkImageConversion_Pad_to_RGBA8 s_pad_to_rgba8;
static hkImageConversion_FLOAT_to_R11G11B10 s_float_to_111110;
static hkImageConversion_R32_G32_B32_FLOAT_to_R8_G8_B8_UNSIGNED_NORMALIZED s_R32_G32_B32_FLOAT_to_R8_G8_B8_UNSIGNED_NORMALIZED;
static hkImageConversion_R32_G32_B32_FLOAT_to_R8_G8_B8_A8_UNSIGNED_NORMALIZED s_R32_G32_B32_FLOAT_to_R8_G8_B8_A8_UNSIGNED_NORMALIZED;
static hkImageConversion_R32_G32_B32_FLOAT_to_B8_G8_R8_UNSIGNED_NORMALIZED s_R32_G32_B32_FLOAT_to_B8_G8_R8_UNSIGNED_NORMALIZED;

/*
 * Havok SDK - Base file, BUILD(#20180110)
 * 
 * Confidential Information of Microsoft Corporation.
 * Not for disclosure or distribution without Microsoft's prior written
 * consent.  This software contains code, techniques and know-how which
 * is confidential and proprietary to Microsoft.  Product and Trade Secret
 * source code contains trade secrets of Microsoft.  Havok Software (C)
 * Copyright 1999-2018 Microsoft Corporation.
 * All Rights Reserved. Use of this software is subject to the
 * terms of an end user license agreement.
 * 
 * The Havok Logo, and the Havok buzzsaw logo are trademarks of Microsoft.
 * Title, ownership rights, and intellectual property rights in the Havok
 * software remain in Microsoft and/or its suppliers.
 * 
 * Use of this software for evaluation purposes is subject to and
 * indicates acceptance of the End User licence Agreement for this
 * product. A copy of the license is included with this software and is
 * also available from Havok Support.
 * 
 */
