// TKBMS v1.0 -----------------------------------------------------
//
// PLATFORM     : ALL
// PRODUCT      : COMMON
// VISIBILITY   : PUBLIC
//
// ------------------------------------------------------TKBMS v1.0

#pragma once

#if defined(HK_COMPILER_HAS_INTRINSICS_IA32) && !defined(__HAVOK_PARSER__)
#   if defined(HK_COMPILER_MSVC)
#       include <intrin.h>
#       include <immintrin.h>
#   elif defined(HK_COMPILER_GCC) || defined(HK_COMPILER_CLANG)
#       include <x86intrin.h>
#   else
#       error IA32 intrinsics defined, but no information on where to include them from
#   endif
#endif

//#define HKFPMATH_CHECK_WORD_RANGE

#ifdef HK_DYNAMIC_DLL
#   undef HKFPMATH_64_BIT_WORDS
#else
#   if defined(HK_ARCH_X64)
#       define HKFPMATH_64_BIT_WORDS
#   elif defined(HK_ARCH_IA32)
#       undef HKFPMATH_64_BIT_WORDS
#   else
#       undef HKFPMATH_64_BIT_WORDS
//#error Need to add support for this architecture in hkFpMathDetail.h, to select between 32 and 64 bit words
#endif
#endif

// Assembly routines declared elsewhere
#if defined(HK_ARCH_X64) && defined(HK_COMPILER_MSVC) && !defined(HK_DYNAMIC_DLL)

/// Computes (uHigh<<64 | uLow) / v, where the quotient must fit in 64 bits.
extern "C" hkUint64 hkDivUint128Uint64(hkUint64 uLow, hkUint64 uHigh, hkUint64 v);

/// Computes (uHigh<<64 | uLow) / v, where the quotient must fit in 64 bits.
///
/// Also fills the remainder into *rOut.
extern "C" hkUint64 hkDivModUint128Uint64(hkUint64 uLow, hkUint64 uHigh, hkUint64 v, hkUint64* rOut);

#endif


namespace hkFpMath
{
    namespace Detail
    {
#ifdef HKFPMATH_64_BIT_WORDS
        // 64-bit configuration: a "word" is a 64-bit integer.
        typedef hkUint64 MultiWordUintType;
        typedef hkInt64 MultiWordIntType;
#   if defined(HK_COMPILER_GCC) || defined(HK_COMPILER_CLANG)
        // Double-width types are only declared in this configuration for GCC/Clang,
        // where they map onto the compiler's __int128 extension.
        typedef unsigned __int128 DoubleMultiWordUintType;
        typedef signed __int128 DoubleMultiWordIntType;
#   endif
#else
        // 32-bit configuration: a "word" is a 32-bit integer and the double-width
        // types are the ordinary 64-bit integers.
        typedef hkUint32 MultiWordUintType;
        typedef hkUint64 DoubleMultiWordUintType;
        typedef hkInt32 MultiWordIntType;
        typedef hkInt64 DoubleMultiWordIntType;
#endif
        // The largest word value plus one, evaluated in double precision.
        static const double MULTI_WORD_RADIX = double(MultiWordUintType(-1)) + double(1);

        // Number of bits in one word.
        enum { MULTI_WORD_UINT_BITS = sizeof(MultiWordUintType) * 8 };
        // Base-2 logarithm of the word width; used to turn divisions by the word width into shifts.
        enum { LOG2_MULTI_WORD_UINT_BITS = hkMath::Log2<MULTI_WORD_UINT_BITS>::ANSWER };

        // Mask with only the most significant bit of a word set.
        const MultiWordUintType WORD_SIGN_MASK = MultiWordUintType(1) << (MULTI_WORD_UINT_BITS - 1);

        /// A utility to convert from any number of bits to the smallest geq size supported by ExactUintStorageType/ExactIntStorageType
        ///
        /// VALUE is 8, 16 or 32 for BITS up to 32 (and 64 for BITS up to 64 when
        /// HKFPMATH_64_BIT_WORDS is defined); anything larger is rounded up to the next
        /// multiple of MULTI_WORD_UINT_BITS.
        template<int BITS>
        struct StorageBits
        {
            enum {
                VALUE =
                BITS <= 8 ? 8 :
                BITS <= 16 ? 16 :
                BITS <= 32 ? 32 :
#ifdef HKFPMATH_64_BIT_WORDS
                BITS <= 64 ? 64 :
#endif
                // Round up to a whole number of words.
                (BITS + MULTI_WORD_UINT_BITS - 1) / MULTI_WORD_UINT_BITS * MULTI_WORD_UINT_BITS
            };
        };

        /// An arbitrarily large uint stored as an array of words
        ///
        /// The helpers in this file treat m_words[0] as the least significant word
        /// (e.g. widenUint stores a single-word value in word 0 and zeroes the rest).
        template<int WORDS>
        struct MultiWordUint
        {
            HK_DECLARE_CLASS(MultiWordUint, New, Pod);

            /// The words making up the value.
            MultiWordUintType m_words[WORDS];

            /// Mutable access to word i (no bounds checking).
            MultiWordUintType & operator[](int i) { return m_words[i]; }
            /// Read access to word i, returned by value (no bounds checking).
            MultiWordUintType operator[](int i) const { return m_words[i]; }
        };

        /// Storage type for a uint, as long as it's a supported size
        ///
        /// The primary template covers multi-word sizes: BITS must be a positive multiple of
        /// the word width, and the storage is a MultiWordUint with BITS / word-width words.
        template<int BITS>
        struct ExactUintStorageType
        {
            HK_COMPILE_TIME_ASSERT(BITS > 0 && BITS % MULTI_WORD_UINT_BITS == 0);
            typedef MultiWordUint<BITS / MULTI_WORD_UINT_BITS> Type;
        };

        // Sizes that fit a native integer use that integer directly.
        template<> struct ExactUintStorageType<8> { typedef hkUint8  Type; };
        template<> struct ExactUintStorageType<16> { typedef hkUint16 Type; };
        template<> struct ExactUintStorageType<32> { typedef hkUint32 Type; };
#ifdef HKFPMATH_64_BIT_WORDS
        // Only with 64-bit words is a 64-bit value stored as a single native integer.
        template<> struct ExactUintStorageType<64> { typedef hkUint64 Type; };
#endif

        /// Storage type for a uint of any size
        ///
        /// BITS is first rounded up with StorageBits, then mapped to a type with
        /// ExactUintStorageType.
        template<int BITS>
        struct UintStorageType
        {
            typedef typename ExactUintStorageType<StorageBits<BITS>::VALUE>::Type Type;
        };

        /// Helper used to restrict a template parameter to single-word (non-MultiWordUint) types.
        /// For any ordinary T it exposes T as Type.
        template<typename T>
        struct CheckSingle
        {
            typedef T Type;
        };

        /// MultiWordUint deliberately gets no Type member, so any declaration naming
        /// CheckSingle<MultiWordUint<N> >::Type fails to substitute.
        template<int WORDS>
        struct CheckSingle<MultiWordUint<WORDS> >
        {
        };

// Expands to an extra, defaulted function parameter whose type only exists when T is not a
// MultiWordUint. Presumably this removes the "single word" overloads from overload
// resolution for multi-word arguments -- confirm against the callers.
#define HKFPMATH_CHECK_SINGLE(T) typename CheckSingle<T>::Type checkSingle_ ## T = 1

        /// Signed version of an unsigned type, for sign-extension
        ///
        /// The primary template is empty, so only the integer types listed below are supported.
        template<typename T> struct EquivalentIntType { };
        // Unsigned types map to the signed type of the same width.
        template<> struct EquivalentIntType<hkUint8> { typedef hkInt8  Type; };
        template<> struct EquivalentIntType<hkUint16> { typedef hkInt16 Type; };
        template<> struct EquivalentIntType<hkUint32> { typedef hkInt32 Type; };
        template<> struct EquivalentIntType<hkUint64> { typedef hkInt64 Type; };
        // Signed types map to themselves.
        template<> struct EquivalentIntType<hkInt8> { typedef hkInt8  Type; };
        template<> struct EquivalentIntType<hkInt16> { typedef hkInt16 Type; };
        template<> struct EquivalentIntType<hkInt32> { typedef hkInt32 Type; };
        template<> struct EquivalentIntType<hkInt64> { typedef hkInt64 Type; };

        /// Converts x to the signed integer type that EquivalentIntType associates with T.
        template<typename T>
        inline typename EquivalentIntType<T>::Type makeSigned(T x)
        {
            typedef typename EquivalentIntType<T>::Type SignedType;
            return static_cast<SignedType>(x);
        }

        /** Sign-extend a value to word size: x is viewed as signed, widened to the signed
            word type, and the resulting bit pattern is returned as an unsigned word. */
        template<typename T>
        inline MultiWordUintType signExtend(T x)
        {
            const MultiWordIntType widened = MultiWordIntType(makeSigned(x));
            return MultiWordUintType(widened);
        }

        /** Replicates the sign of x into every bit of a word: x is sign-extended to the
            signed word type and then shifted right by (word width - 1). */
        template<typename T>
        inline MultiWordUintType signSplat(T x)
        {
            const MultiWordIntType widened = MultiWordIntType(makeSigned(x));
            const MultiWordIntType splatted = widened >> (MULTI_WORD_UINT_BITS - 1);
            return MultiWordUintType(splatted);
        }

        /// Multi-word variant: splats the sign of the highest-indexed word of a.
        template<int TYPE_A_WORDS>
        inline MultiWordUintType signSplat(MultiWordUint<TYPE_A_WORDS> const& a)
        {
            const MultiWordUintType topWord = a[TYPE_A_WORDS - 1];
            return signSplat(topWord);
        }

// Decide whether the add-with-carry / subtract-with-borrow intrinsics
// (_addcarry_u32/_u64, _subborrow_u32/_u64) may be used by addCarry/subBorrow:
// - MSVC has adc intrinsics starting in 2013
// - GCC has it since 5.1.
// - PlayStation(R)4 currently gives ICE.
// - other clang platforms wrongly attribute it to the adx extension (unclear
//   which version this is fixed in).
// When the macro is left undefined, the portable C++ fallbacks are compiled instead.
#if defined(HK_COMPILER_HAS_INTRINSICS_IA32) && \
    (!defined(HK_COMPILER_MSVC) || HK_COMPILER_MSVC_VERSION >= 1800) && \
    !defined(HK_PLATFORM_PS4) && \
    !defined(HK_COMPILER_CLANG) && \
    (!defined(HK_COMPILER_GCC) || HK_COMPILER_GCC_VERSION >= 0x50100)
#define HK_COMPILER_HAS_ADC_INTRINSICS
#endif

        /// Word addition with carry: res receives the low word of a + b + carryInOut and
        /// carryInOut is replaced by the carry out of that addition.
        /// When __HAVOK_PARSER__ is defined the body is empty and the outputs are untouched.
        inline void addCarry(unsigned char & carryInOut, MultiWordUintType a, MultiWordUintType b, MultiWordUintType & res)
        {
#if defined(__HAVOK_PARSER__)
            // nothing needed here
#elif defined(HK_COMPILER_HAS_ADC_INTRINSICS)
            // Use the compiler intrinsic matching the configured word width.
#if defined(HKFPMATH_64_BIT_WORDS)
            carryInOut = _addcarry_u64(carryInOut, a, b, &res);
#else
            carryInOut = _addcarry_u32(carryInOut, a, b, &res);
#endif
#else
            // Portable fallback: an unsigned addition wrapped iff the result is smaller
            // than one of its operands. Check both additions (a+b, then +carry).
            MultiWordUintType sum = a+b;
            bool carry = (sum < a);
            sum += carryInOut;
            carry |= (sum < carryInOut);
            carryInOut = carry;
            res = sum;
#endif
        }

        /// Word subtraction with borrow: res receives the low word of a - b - borrowInOut and
        /// borrowInOut is replaced by the borrow out of that subtraction.
        /// When __HAVOK_PARSER__ is defined the body is empty and the outputs are untouched.
        inline void subBorrow(unsigned char & borrowInOut, MultiWordUintType a, MultiWordUintType b, MultiWordUintType & res)
        {
#if defined(__HAVOK_PARSER__)
            // nothing needed here
#elif defined(HK_COMPILER_HAS_ADC_INTRINSICS)
            // Use the compiler intrinsic matching the configured word width.
#if defined(HKFPMATH_64_BIT_WORDS)
            borrowInOut = _subborrow_u64(borrowInOut, a, b, &res);
#else
            borrowInOut = _subborrow_u32(borrowInOut, a, b, &res);
#endif
#else
            // Portable fallback: a borrow occurs if a < b, or if the intermediate
            // difference is smaller than the incoming borrow that is still to be subtracted.
            bool borrow = a<b;
            MultiWordUintType diff = a-b;
            borrow |= diff<borrowInOut;
            diff -= borrowInOut;
            borrowInOut = borrow;
            res = diff;
#endif
        }

        /// Full-width unsigned multiply of two words: lo receives the low word of a*b and
        /// hi the high word.
        /// When __HAVOK_PARSER__ is defined the body is empty and the outputs are untouched.
        inline void mulWordsUnsigned(MultiWordUintType a, MultiWordUintType b, MultiWordUintType & lo, MultiWordUintType & hi)
        {
#if defined(__HAVOK_PARSER__)
            // nothing needed here
#elif defined(HKFPMATH_64_BIT_WORDS) && defined(HK_ARCH_X64) && defined(HK_COMPILER_MSVC)
            // No double-width integer type is declared for this configuration, so the
            // high half comes from the __umulh intrinsic.
            lo = MultiWordUintType(a*b);
            hi = __umulh(a, b);
#else
            // Multiply in the double-width type and split the result.
            DoubleMultiWordUintType hilo = DoubleMultiWordUintType(a)*b;
            lo = MultiWordUintType(hilo);
            hi = MultiWordUintType(hilo >> MULTI_WORD_UINT_BITS);
#endif
        }

        /// Full-width signed multiply: a and b are passed as unsigned words but are
        /// reinterpreted as signed; lo/hi receive the low and high words of the signed product.
        /// When __HAVOK_PARSER__ is defined the body is empty and the outputs are untouched.
        inline void mulWordsSigned(MultiWordUintType a, MultiWordUintType b, MultiWordUintType & lo, MultiWordUintType & hi)
        {
#if defined(__HAVOK_PARSER__)
            // nothing needed here
#elif defined(HKFPMATH_64_BIT_WORDS) && defined(HK_ARCH_X64) && defined(HK_COMPILER_MSVC)
            // No double-width integer type is declared for this configuration, so the
            // high half comes from the __mulh intrinsic.
            lo = MultiWordUintType(a*b);
            hi = __mulh(a, b);
#else
            // Multiply in the signed double-width type; the high word is extracted through
            // the unsigned double-width type so the shift is a logical one.
            DoubleMultiWordIntType hilo = DoubleMultiWordIntType(MultiWordIntType(a))*MultiWordIntType(b);
            lo = MultiWordUintType(hilo);
            hi = MultiWordUintType(DoubleMultiWordUintType(hilo) >> MULTI_WORD_UINT_BITS);
#endif
        }

        /// Divides the two-word value (uHigh:uLow) by the single word v.
        /// qOut receives the quotient and rOut the remainder.
        /// NOTE(review): the assembly paths presumably require the quotient to fit in one
        /// word (the declaration of hkDivUint128Uint64 states this requirement) -- confirm
        /// that callers guarantee it. The portable fallback simply truncates the quotient.
        /// When __HAVOK_PARSER__ is defined the body is empty and the outputs are untouched.
        inline void divWordsUnsigned(MultiWordUintType uLow, MultiWordUintType uHigh, MultiWordUintType v, MultiWordUintType & qOut, MultiWordUintType & rOut)
        {
#if defined(__HAVOK_PARSER__)
            // nothing needed here
#elif defined(HKFPMATH_64_BIT_WORDS) && defined(HK_ARCH_X64) && defined(HK_COMPILER_MSVC)
            // Implemented by the external routine declared near the top of this file.
            qOut = hkDivModUint128Uint64(uLow, uHigh, v, &rOut);
#elif (defined(HK_ARCH_X64) || defined(HK_ARCH_IA32)) && (defined(HK_COMPILER_GCC) || defined(HK_COMPILER_CLANG))
#if defined(HKFPMATH_64_BIT_WORDS)
            // Load the dividend into rdx:rax, divide, then copy remainder (rdx) and
            // quotient (rax) to the outputs.
            __asm__(
                "movq %[uLow], %%rax\n\t"
                "movq %[uHigh], %%rdx\n\t"
                "divq %[v]\n\t"
                "movq %%rdx, %[r]\n\t"
                "movq %%rax, %[q]"
                : [q] "=rm" (qOut), [r] "=rm" (rOut)
                : [uLow] "rm" (uLow), [uHigh] "rm" (uHigh), [v] "rm" (v)
                : "rax", "rdx"
            );
#else
            // Same sequence with 32-bit registers (edx:eax).
            __asm__(
                "movl %[uLow], %%eax\n\t"
                "movl %[uHigh], %%edx\n\t"
                "divl %[v]\n\t"
                "movl %%edx, %[r]\n\t"
                "movl %%eax, %[q]"
                : [q] "=rm" (qOut), [r] "=rm" (rOut)
                : [uLow] "rm" (uLow), [uHigh] "rm" (uHigh), [v] "rm" (v)
                : "eax", "edx"
            );
#endif
#elif defined(HK_ARCH_IA32) && defined(HK_COMPILER_MSVC)
            // MSVC inline assembly; results go through locals before being copied to the
            // reference outputs.
            MultiWordUintType q, r;
            __asm
            {
                mov eax, uLow;
                mov edx, uHigh;
                div v;
                mov q, eax;
                mov r, edx;
            }
            qOut = q;
            rOut = r;
#else
            // Portable fallback using the double-width integer type.
            DoubleMultiWordUintType u = (DoubleMultiWordUintType(uHigh) << MULTI_WORD_UINT_BITS) | DoubleMultiWordUintType(uLow);
            qOut = MultiWordUintType(u/v);
            rOut = MultiWordUintType(u%v);
#endif
        }

        /// Sets a single-word value to zero.
        template<typename TypeRes>
        inline void setZeroUint(TypeRes & res)
        {
            res = 0;
        }

        /// Sets every word of a multi-word value to zero.
        template<int TYPE_RES_WORDS>
        inline void setZeroUint(MultiWordUint<TYPE_RES_WORDS> & res)
        {
            int wordIdx = 0;
            while(wordIdx < TYPE_RES_WORDS)
            {
                res[wordIdx] = 0;
                ++wordIdx;
            }
        }

        template<int EXACT_BITS, typename TypeRes>
        inline void setMinValInt(TypeRes & res)
        {
            const MultiWordUintType MAX_VAL = (MultiWordUintType(1) << (EXACT_BITS - 1)) - 1;
            res = TypeRes(-MultiWordIntType(MAX_VAL));
        }

        /// Multi-word variant of setMinValInt.
        /// Resulting pattern: lowest word is 00..01, middle words are 0, and the top word
        /// is 11..1100..00 with the lowest set bit at position (EXACT_BITS-1) % word width.
        template<int EXACT_BITS, int TYPE_RES_WORDS>
        inline void setMinValInt(MultiWordUint<TYPE_RES_WORDS> & res)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS > 1);

            const int topIndex = TYPE_RES_WORDS - 1;
            const int topShift = (EXACT_BITS - 1) % MULTI_WORD_UINT_BITS;
            const MultiWordUintType allOnes = ~MultiWordUintType(0);

            res[0] = 1;

            int wordIdx = 1;
            while(wordIdx < topIndex)
            {
                res[wordIdx] = 0;
                ++wordIdx;
            }

            res[topIndex] = allOnes << topShift;
        }

        template<int EXACT_BITS, typename TypeRes>
        inline void setMaxValInt(TypeRes & res)
        {
            const MultiWordUintType MAX_VAL = (MultiWordUintType(1) << (EXACT_BITS - 1)) - 1;
            res = TypeRes(MAX_VAL);
        }

        /// Multi-word variant of setMaxValInt.
        /// Resulting pattern: every word below the top one is 11..11, and the top word is
        /// 00..0011..11 with (EXACT_BITS-1) % word width low bits set.
        template<int EXACT_BITS, int TYPE_RES_WORDS>
        inline void setMaxValInt(MultiWordUint<TYPE_RES_WORDS> & res)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS > 1);

            const int topIndex = TYPE_RES_WORDS - 1;
            const int topShift = (EXACT_BITS - 1) % MULTI_WORD_UINT_BITS;
            const MultiWordUintType allOnes = ~MultiWordUintType(0);

            int wordIdx = 0;
            while(wordIdx < topIndex)
            {
                res[wordIdx] = allOnes;
                ++wordIdx;
            }

            res[topIndex] = (MultiWordUintType(1) << topShift) - 1;
        }

        /// res = a + b for single-word operands and a single-word result.
        /// The result type must be at least as large as each operand type; the sum is
        /// computed after converting a to the result type.
        template<typename TypeA, typename TypeB, typename TypeRes>
        inline void addUintUint(TypeA a, TypeB b, TypeRes & res, HKFPMATH_CHECK_SINGLE(TypeA), HKFPMATH_CHECK_SINGLE(TypeB), HKFPMATH_CHECK_SINGLE(TypeRes))
        {
            HK_COMPILE_TIME_ASSERT(sizeof(TypeA) <= sizeof(TypeRes));
            HK_COMPILE_TIME_ASSERT(sizeof(TypeB) <= sizeof(TypeRes));

            const TypeRes widenedA = TypeRes(a);
            res = widenedA + b;
        }

        template<typename TypeA, typename TypeB>
        inline void addUintUint(TypeA a, TypeB b, MultiWordUint<2> & res, HKFPMATH_CHECK_SINGLE(TypeA), HKFPMATH_CHECK_SINGLE(TypeB))
        {
            unsigned char carry = 0;
            addCarry(carry, MultiWordUintType(a), MultiWordUintType(b), res[0]);
            res[1] = carry;
        }

        /// res = a + b where a is multi-word and b is a single word.
        /// The result has either as many words as a (final carry dropped) or one more
        /// (final carry stored in the extra word).
        template<int TYPE_A_WORDS, typename TypeB, int TYPE_RES_WORDS>
        inline void addUintUint(MultiWordUint<TYPE_A_WORDS> const& a, TypeB b, MultiWordUint<TYPE_RES_WORDS> & res, HKFPMATH_CHECK_SINGLE(TypeB))
        {
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS == TYPE_A_WORDS || TYPE_RES_WORDS == TYPE_A_WORDS + 1);

            unsigned char carry = 0;

            // b only contributes to the lowest word; higher words just absorb the carry.
            addCarry(carry, a[0], MultiWordUintType(b), res[0]);
            int wordIdx = 1;
            while(wordIdx < TYPE_A_WORDS)
            {
                addCarry(carry, a[wordIdx], 0, res[wordIdx]);
                ++wordIdx;
            }

            if(TYPE_RES_WORDS > TYPE_A_WORDS)
            {
                res[TYPE_RES_WORDS - 1] = carry;
            }
        }

        /// res = a + b where a is a single word and b is multi-word.
        /// Addition commutes, so this forwards to the (multi-word, single-word) overload.
        template<typename TypeA, int TYPE_B_WORDS, int TYPE_RES_WORDS>
        inline void addUintUint(TypeA a, MultiWordUint<TYPE_B_WORDS> const& b, MultiWordUint<TYPE_RES_WORDS> & res, HKFPMATH_CHECK_SINGLE(TypeA))
        {
            // Operands intentionally swapped.
            addUintUint(b, a, res);
        }

        /// res = a + b for two multi-word operands.
        /// The result must be at least as wide as both operands and at most one word wider
        /// than the wider operand; the extra word, if present, receives the final carry.
        template<int TYPE_A_WORDS, int TYPE_B_WORDS, int TYPE_RES_WORDS>
        inline void addUintUint(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_B_WORDS> const& b, MultiWordUint<TYPE_RES_WORDS> & res)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS > 0);
            HK_COMPILE_TIME_ASSERT(TYPE_B_WORDS > 0);
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS >= TYPE_A_WORDS);
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS >= TYPE_B_WORDS);
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS <= (TYPE_A_WORDS > TYPE_B_WORDS ? TYPE_A_WORDS : TYPE_B_WORDS) + 1);

            // Normalize so that the code below can assume a is the wider operand.
            if(TYPE_A_WORDS < TYPE_B_WORDS)
            {
                addUintUint(b, a, res);
                return;
            }

            unsigned char carry = 0;
            for(int wordIdx = 0; wordIdx < TYPE_A_WORDS; ++wordIdx)
            {
                // Words beyond the end of b are treated as zero.
                const MultiWordUintType bWord = (wordIdx < TYPE_B_WORDS) ? b[wordIdx] : MultiWordUintType(0);
                addCarry(carry, a[wordIdx], bWord, res[wordIdx]);
            }

            if(TYPE_RES_WORDS > TYPE_A_WORDS)
            {
                res[TYPE_RES_WORDS - 1] = carry;
            }
        }

        /// a += b*c, where a has TYPE_A_WORDS words, b has TYPE_B_WORDS words
        /// (TYPE_B_WORDS <= TYPE_A_WORDS) and c is a single word. The result is truncated
        /// to TYPE_A_WORDS words.
        ///
        /// Each partial product b[i]*c is two words wide and lands in a[i] and a[i+1]. When
        /// b[i] is the word aligned with the top of a (i == TYPE_A_WORDS-1, which happens when
        /// TYPE_B_WORDS == TYPE_A_WORDS), the high half falls outside the result and is
        /// dropped rather than being added to a[TYPE_A_WORDS], which would be out of bounds.
        template<int TYPE_A_WORDS, int TYPE_B_WORDS>
        inline void inPlaceAddMulMultiMultiSingle(MultiWordUintType a[TYPE_A_WORDS], const MultiWordUintType b[TYPE_B_WORDS], MultiWordUintType c)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_B_WORDS <= TYPE_A_WORDS);

            // Addmul from even entries of b, then from odd entries. This avoids having to do a three-way add.
            {
                unsigned char carry = 0;
                int i;
                for(i = 0; i < TYPE_B_WORDS; i += 2)
                {
                    MultiWordUintType lw, uw;
                    mulWordsUnsigned(b[i], c, lw, uw);
                    addCarry(carry, a[i], lw, a[i]);
                    // Only touch a[i+1] if it exists; otherwise the high half is truncated.
                    if(i + 1 < TYPE_A_WORDS)
                    {
                        addCarry(carry, a[i+1], uw, a[i+1]);
                    }
                }
                // Propagate the remaining carry through the rest of a.
                for(; i<TYPE_A_WORDS; i++)
                {
                    addCarry(carry, a[i], 0, a[i]);
                }
            }

            {
                unsigned char carry = 0;
                int i;
                for(i = 1; i < TYPE_B_WORDS; i += 2)
                {
                    MultiWordUintType lw, uw;
                    mulWordsUnsigned(b[i], c, lw, uw);
                    addCarry(carry, a[i], lw, a[i]);
                    // Only touch a[i+1] if it exists; otherwise the high half is truncated.
                    if(i + 1 < TYPE_A_WORDS)
                    {
                        addCarry(carry, a[i+1], uw, a[i+1]);
                    }
                }
                // Propagate the remaining carry through the rest of a.
                for(; i<TYPE_A_WORDS; i++)
                {
                    addCarry(carry, a[i], 0, a[i]);
                }
            }
        }

        /// res = a * b for single-word operands and a single-word result.
        /// The result type must be at least as large as each operand type; the product is
        /// computed after converting a to the result type.
        template<typename TypeA, typename TypeB, typename TypeRes>
        inline void mulUintUint(TypeA a, TypeB b, TypeRes & res, HKFPMATH_CHECK_SINGLE(TypeA), HKFPMATH_CHECK_SINGLE(TypeB), HKFPMATH_CHECK_SINGLE(TypeRes))
        {
            HK_COMPILE_TIME_ASSERT(sizeof(TypeA) <= sizeof(TypeRes));
            HK_COMPILE_TIME_ASSERT(sizeof(TypeB) <= sizeof(TypeRes));

            const TypeRes widenedA = TypeRes(a);
            res = widenedA * b;
        }

        template<typename TypeA, typename TypeB>
        inline void mulUintUint(TypeA a, TypeB b, MultiWordUint<2> & res, HKFPMATH_CHECK_SINGLE(TypeA), HKFPMATH_CHECK_SINGLE(TypeB))
        {
            mulWordsUnsigned(a, b, res[0], res[1]);
        }

        /// res = a * b where a is multi-word and b is a single word.
        /// Each word of a is multiplied by b; the high half of one partial product is added
        /// (with carry) to the low half of the next. If the result has room, the last high
        /// half plus carry goes into word TYPE_A_WORDS and any further words are zeroed.
        template<int TYPE_A_WORDS, typename TypeB, int TYPE_RES_WORDS>
        inline void mulUintUint(MultiWordUint<TYPE_A_WORDS> const& a, TypeB b, MultiWordUint<TYPE_RES_WORDS> & res, HKFPMATH_CHECK_SINGLE(TypeB))
        {
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS >= TYPE_A_WORDS);

            MultiWordUintType lowPart, highPart;
            mulWordsUnsigned(a[0], b, lowPart, highPart);
            res[0] = MultiWordUintType(lowPart);

            unsigned char carry = 0;
            int wordIdx = 1;
            while(wordIdx < TYPE_A_WORDS)
            {
                // High half of the previous partial product overlaps this word.
                const MultiWordUintType carriedHigh = highPart;
                mulWordsUnsigned(a[wordIdx], b, lowPart, highPart);
                addCarry(carry, lowPart, carriedHigh, res[wordIdx]);
                ++wordIdx;
            }

            if(TYPE_RES_WORDS > TYPE_A_WORDS)
            {
                res[TYPE_A_WORDS] = carry + highPart;
            }

            for(int zeroIdx = TYPE_A_WORDS + 1; zeroIdx < TYPE_RES_WORDS; ++zeroIdx)
            {
                res[zeroIdx] = 0;
            }
        }

        /// res = a * b where a is a single word and b is multi-word.
        /// Multiplication commutes, so this forwards to the (multi-word, single-word) overload.
        template<typename TypeA, int TYPE_B_WORDS, int TYPE_RES_WORDS>
        inline void mulUintUint(TypeA a, MultiWordUint<TYPE_B_WORDS> const& b, MultiWordUint<TYPE_RES_WORDS> & res, HKFPMATH_CHECK_SINGLE(TypeA))
        {
            // Operands intentionally swapped.
            mulUintUint(b, a, res);
        }

        /// Compile-time loop over the words of b for multi-word * multi-word multiplication.
        /// Step i adds a * b[i] into res starting at word i, then recurses to step i + 1.
        /// res must already hold the accumulated sum of the previous steps.
        template<int TYPE_A_WORDS, int TYPE_B_WORDS, int TYPE_RES_WORDS, int i>
        struct mulMultiMulti_aux
        {
            static inline void exec(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_B_WORDS> const& b, MultiWordUint<TYPE_RES_WORDS> & res)
            {
                // The accumulation window is the part of res from word i upwards.
                inPlaceAddMulMultiMultiSingle<TYPE_RES_WORDS - i, TYPE_A_WORDS>(res.m_words + i, a.m_words, b[i]);
                mulMultiMulti_aux<TYPE_A_WORDS, TYPE_B_WORDS, TYPE_RES_WORDS, i + 1>::exec(a, b, res);
            }
        };

        /// Terminating case (i == TYPE_B_WORDS): every word of b has been processed.
        template<int TYPE_A_WORDS, int TYPE_B_WORDS, int TYPE_RES_WORDS>
        struct mulMultiMulti_aux<TYPE_A_WORDS, TYPE_B_WORDS, TYPE_RES_WORDS, TYPE_B_WORDS>
        {
            static inline void exec(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_B_WORDS> const& b, MultiWordUint<TYPE_RES_WORDS> & res)
            {

            }
        };

        /// res = a * b for two multi-word operands.
        /// The result must have either TYPE_A_WORDS + TYPE_B_WORDS words or one fewer.
        /// res is cleared first and then accumulates a * b[i] for every word of b.
        template<int TYPE_A_WORDS, int TYPE_B_WORDS, int TYPE_RES_WORDS>
        inline void mulUintUint(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_B_WORDS> const& b, MultiWordUint<TYPE_RES_WORDS> & res)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS > 0);
            HK_COMPILE_TIME_ASSERT(TYPE_B_WORDS > 0);
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS == TYPE_A_WORDS + TYPE_B_WORDS || TYPE_RES_WORDS == TYPE_A_WORDS + TYPE_B_WORDS - 1);

            typedef mulMultiMulti_aux<TYPE_A_WORDS, TYPE_B_WORDS, TYPE_RES_WORDS, 0> FirstStep;

            setZeroUint(res);
            FirstStep::exec(a, b, res);
        }

        /// Returns true if the single-word value a is zero.
        template<typename TypeA>
        inline bool equalZero(TypeA a)
        {
            const bool isZero = (a == 0);
            return isZero;
        }

        /// Returns true if every word of the multi-word value a is zero.
        /// All words are always inspected (no early exit).
        template<int TYPE_A_WORDS>
        inline bool equalZero(MultiWordUint<TYPE_A_WORDS> const& a)
        {
            // OR all words together; the value is zero iff no bit is set anywhere.
            MultiWordUintType anyBits = 0;
            for(int wordIdx = 0; wordIdx < TYPE_A_WORDS; ++wordIdx)
            {
                anyBits |= a[wordIdx];
            }
            return anyBits == 0;
        }

        /// Returns a < b for single-word operands.
        template<typename TypeA, typename TypeB>
        inline bool lessUintUint(TypeA a, TypeB b)
        {
            const bool isLess = (a < b);
            return isLess;
        }

        /// Returns a < b where a is multi-word and b is a single word.
        /// a can only be smaller if all of its words above the first are zero.
        template<int TYPE_A_WORDS, typename TypeB>
        inline bool lessUintUint(MultiWordUint<TYPE_A_WORDS> const& a, TypeB b)
        {
            const bool lowWordLess = (a[0] < b);

            // Collect every bit of the higher words.
            MultiWordUintType upperBits = 0;
            for(int wordIdx = 1; wordIdx < TYPE_A_WORDS; ++wordIdx)
            {
                upperBits |= a[wordIdx];
            }

            return lowWordLess && (upperBits == 0);
        }

        template<typename TypeA, int TYPE_B_WORDS>
        inline bool lessUintUint(TypeA a, MultiWordUint<TYPE_B_WORDS> const& b)
        {
            bool less = a < b[0];
            for(int i = 1; i < TYPE_B_WORDS; i++)
            {
                less |= bool(b[i]); // any nonzero words above the first force it to be less
            }

            return less;
        }

        /// Returns a < b for two multi-word operands of possibly different lengths.
        /// The comparison runs from the most significant word down without branching on
        /// the data; the first word position at which the operands differ decides the result.
        /// Words that only one operand has are compared against an implicit zero.
        template<int TYPE_A_WORDS, int TYPE_B_WORDS>
        inline bool lessUintUint(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_B_WORDS> const& b)
        {
            bool undecided = true; // true while all more-significant words compared equal
            bool aIsLess = false;

            // Extra words of a, if a is longer: any nonzero word makes a the greater value.
            for(int wordIdx = TYPE_A_WORDS - 1; wordIdx >= TYPE_B_WORDS; --wordIdx)
            {
                undecided &= (a[wordIdx] == 0);
            }

            // Extra words of b, if b is longer: any nonzero word makes a the smaller value.
            for(int wordIdx = TYPE_B_WORDS - 1; wordIdx >= TYPE_A_WORDS; --wordIdx)
            {
                aIsLess |= (b[wordIdx] != 0);
            }
            undecided &= !aIsLess;

            // Words both operands have.
            const int sharedWords = (TYPE_A_WORDS < TYPE_B_WORDS ? TYPE_A_WORDS : TYPE_B_WORDS);
            for(int wordIdx = sharedWords - 1; wordIdx >= 0; --wordIdx)
            {
                const bool wordLess = (a[wordIdx] < b[wordIdx]);
                const bool wordGreater = (a[wordIdx] > b[wordIdx]);

                // A word only matters while everything above it was equal.
                aIsLess |= (undecided & wordLess);
                undecided &= !(wordLess | wordGreater);
            }

            return aIsLess;
        }

        /// Returns a == b for single-word operands.
        template<typename TypeA, typename TypeB>
        inline bool equalUintUint(TypeA a, TypeB b)
        {
            const bool isEqual = (a == b);
            return isEqual;
        }

        /// Returns a == b where a is multi-word and b is a single word.
        /// They are equal iff the lowest word of a equals b and all higher words are zero.
        template<int TYPE_A_WORDS, typename TypeB>
        inline bool equalUintUint(MultiWordUint<TYPE_A_WORDS> const& a, TypeB b)
        {
            const bool lowWordMatches = (a[0] == b);

            // Collect every bit of the higher words.
            MultiWordUintType upperBits = 0;
            for(int wordIdx = 1; wordIdx < TYPE_A_WORDS; ++wordIdx)
            {
                upperBits |= a[wordIdx];
            }

            return lowWordMatches && (upperBits == 0);
        }

        /// Returns a == b where a is a single word and b is multi-word.
        /// Equality is symmetric, so this forwards to the (multi-word, single-word) overload.
        template<typename TypeA, int TYPE_B_WORDS>
        inline bool equalUintUint(TypeA a, MultiWordUint<TYPE_B_WORDS> const& b)
        {
            // Operands intentionally swapped.
            return equalUintUint(b, a);
        }

        /// Returns a == b for two multi-word operands of possibly different lengths.
        /// Words that only the longer operand has must be zero for the values to be equal.
        /// All words are always inspected (no early exit).
        template<int TYPE_A_WORDS, int TYPE_B_WORDS>
        inline bool equalUintUint(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_B_WORDS> const& b)
        {
            const int longestWords = (TYPE_A_WORDS > TYPE_B_WORDS ? TYPE_A_WORDS : TYPE_B_WORDS);

            // Accumulate the bitwise difference of every word pair; a missing word counts as zero.
            MultiWordUintType difference = 0;
            for(int wordIdx = 0; wordIdx < longestWords; ++wordIdx)
            {
                const MultiWordUintType aWord = (wordIdx < TYPE_A_WORDS) ? a[wordIdx] : MultiWordUintType(0);
                const MultiWordUintType bWord = (wordIdx < TYPE_B_WORDS) ? b[wordIdx] : MultiWordUintType(0);
                difference |= (aWord ^ bWord);
            }

            return difference == 0;
        }

        /// Returns true if the single-word value a fits in EXACT_BITS bits, i.e. no bit at
        /// position EXACT_BITS or above is set. If EXACT_BITS covers the whole type the
        /// answer is always true and the shift is never evaluated.
        template<int EXACT_BITS, typename TypeA>
        inline bool checkWordRangeUint(TypeA const& a)
        {
            const bool coversWholeType = (EXACT_BITS >= sizeof(TypeA)*8);
            return coversWholeType || ((a >> EXACT_BITS) == 0);
        }

        /// Returns true if the multi-word value a fits in EXACT_BITS bits.
        /// Words entirely above EXACT_BITS must be zero, and the word that is only partially
        /// covered (if any) must have no bits set above its covered part.
        template<int EXACT_BITS, int TYPE_A_WORDS>
        inline bool checkWordRangeUint(MultiWordUint<TYPE_A_WORDS> const& a)
        {
            // Everything fits if EXACT_BITS covers the whole storage.
            if(EXACT_BITS >= MULTI_WORD_UINT_BITS * TYPE_A_WORDS)
            {
                return true;
            }

            // Index of the first word that lies completely above EXACT_BITS.
            const int firstUnusedWord = (EXACT_BITS + MULTI_WORD_UINT_BITS-1) >> LOG2_MULTI_WORD_UINT_BITS;

            MultiWordUintType strayBits = 0;
            for(int wordIdx = firstUnusedWord; wordIdx < TYPE_A_WORDS; ++wordIdx)
            {
                strayBits |= a[wordIdx];
            }
            bool inRange = (strayBits == 0);

            // Number of bits of the partially covered word that belong to the value.
            enum { MSB_BITS = EXACT_BITS & (MULTI_WORD_UINT_BITS-1) };

            if(MSB_BITS != 0)
            {
                inRange &= checkWordRangeUint<MSB_BITS, MultiWordUintType>(a[firstUnusedWord-1]);
            }

            return inRange;
        }

        /// Converts a single-word value to a type that is no larger than the source type.
        /// With HKFPMATH_CHECK_WORD_RANGE defined, asserts that no information was lost.
        template<typename TypeA, typename TypeRes>
        inline void narrowUint(TypeA a, TypeRes & res)
        {
            HK_COMPILE_TIME_ASSERT(sizeof(res) <= sizeof(a));

            const TypeRes truncated = TypeRes(a);
            res = truncated;
#ifdef HKFPMATH_CHECK_WORD_RANGE
            HK_ASSERT(0x94751ad8, res == a);
#endif
        }

        // unimplemented, but simplifies implementation of resize
        // (declaration only: narrowing a single-word value into a multi-word result has no
        // definition in the visible code, so actually calling it would presumably fail at
        // link time -- it exists so that code referring to this combination still compiles).
        template<typename TypeA, int TYPE_RES_WORDS>
        inline void narrowUint(TypeA a, MultiWordUint<TYPE_RES_WORDS> & res);

        /// Narrows a multi-word value to a single-word result by taking its lowest word.
        /// With HKFPMATH_CHECK_WORD_RANGE defined, asserts that the lowest word survives the
        /// conversion unchanged and that all higher words are zero.
        template<int TYPE_A_WORDS, typename TypeRes>
        inline void narrowUint(MultiWordUint<TYPE_A_WORDS> const& a, TypeRes & res)
        {
            const MultiWordUintType lowWord = a[0];
            res = TypeRes(lowWord);
#ifdef HKFPMATH_CHECK_WORD_RANGE
            HK_ASSERT(0x94751ad8, res == lowWord);
            for(int wordIdx = 1; wordIdx < TYPE_A_WORDS; ++wordIdx)
            {
                HK_ASSERT(0x94751ad8, a[wordIdx] == 0);
            }
#endif
        }

        /// Narrows a multi-word value to a multi-word result with no more words, keeping
        /// the lowest TYPE_RES_WORDS words.
        /// With HKFPMATH_CHECK_WORD_RANGE defined, asserts that the discarded words are zero.
        template<int TYPE_A_WORDS, int TYPE_RES_WORDS>
        inline void narrowUint(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_RES_WORDS> & res)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS <= TYPE_A_WORDS);

            int wordIdx = 0;
            while(wordIdx < TYPE_RES_WORDS)
            {
                res[wordIdx] = a[wordIdx];
                ++wordIdx;
            }

#ifdef HKFPMATH_CHECK_WORD_RANGE
            for(int droppedIdx = TYPE_RES_WORDS; droppedIdx < TYPE_A_WORDS; ++droppedIdx)
            {
                HK_ASSERT(0x94751ad8, a[droppedIdx] == 0);
            }
#endif
        }

        /// Copies a single-word value into a single-word result that is at least as large.
        template<typename TypeA, typename TypeRes>
        inline void widenUint(TypeA a, TypeRes & res)
        {
            HK_COMPILE_TIME_ASSERT(sizeof(res) >= sizeof(a));

            // Plain assignment; the implicit conversion performs the widening.
            res = a;
        }

        /// Widens a single-word value to a multi-word result: the value goes into word 0
        /// and every other word is cleared.
        template<typename TypeA, int TYPE_RES_WORDS>
        inline void widenUint(TypeA a, MultiWordUint<TYPE_RES_WORDS> & res)
        {
            // Clear the upper words first, then store the value in the lowest one.
            for(int wordIdx = TYPE_RES_WORDS - 1; wordIdx >= 1; --wordIdx)
            {
                res[wordIdx] = 0;
            }
            res[0] = a;
        }

        // unimplemented, but simplifies implementation of resize
        // (declaration only: widening a multi-word value into a single-word result has no
        // definition in the visible code, so actually calling it would presumably fail at
        // link time -- it exists so that code referring to this combination still compiles).
        template<int TYPE_A_WORDS, typename TypeRes>
        inline void widenUint(MultiWordUint<TYPE_A_WORDS> const& a, TypeRes & res);

        /// Widens a multi-word value to a multi-word result with at least as many words:
        /// the words of a are copied and the remaining high words are cleared.
        template<int TYPE_A_WORDS, int TYPE_RES_WORDS>
        inline void widenUint(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_RES_WORDS> & res)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS >= TYPE_A_WORDS);

            for(int wordIdx = 0; wordIdx < TYPE_RES_WORDS; ++wordIdx)
            {
                // Words beyond the end of a are zero-filled.
                res[wordIdx] = (wordIdx < TYPE_A_WORDS) ? a[wordIdx] : MultiWordUintType(0);
            }
        }

        /// Tag-dispatched helper for resizeUint: the TrueType overload is selected when the
        /// result is at least as large as the source, and widens.
        template<typename TypeA, typename TypeRes>
        inline void resizeUint_aux(TypeA const& a, TypeRes & res, hkTrait::TrueType)
        {
            // Result is at least as large as the source.
            widenUint(a, res);
        }

        /// Tag-dispatched helper for resizeUint: the FalseType overload is selected when the
        /// result is smaller than the source, and narrows.
        template<typename TypeA, typename TypeRes>
        inline void resizeUint_aux(TypeA const& a, TypeRes & res, hkTrait::FalseType)
        {
            // Result is smaller than the source.
            narrowUint(a, res);
        }

        template<typename TypeA, typename TypeRes>
        inline void resizeUint(TypeA const& a, TypeRes & res)
        {
            resizeUint_aux(a, res, typename hkTrait::TraitBool<sizeof(TypeRes) >= sizeof(TypeA)>::Type());
        }

        /// Returns word i of a single-word value: the value itself for i == 0, zero for
        /// any other index.
        template<typename TypeA>
        inline MultiWordUintType getWordUint(TypeA const& a, int i)
        {
            if(i != 0)
            {
                return 0;
            }
            return a;
        }

        /// Returns word i of a multi-word value, or zero if i is at or beyond the number
        /// of words. No check is made for negative i.
        template<int TYPE_A_WORDS>
        inline MultiWordUintType getWordUint(MultiWordUint<TYPE_A_WORDS> const& a, int i)
        {
            if(i < TYPE_A_WORDS)
            {
                return a[i];
            }
            return 0;
        }

        /// Sets word i of a single-word value. The value is replaced by word (converted to
        /// TypeA) regardless of i; with HKFPMATH_CHECK_WORD_RANGE defined, asserts that i is 0
        /// and that the word fits in TypeA.
        template<typename TypeA>
        inline void inPlaceSetWordUint(TypeA & a, int i, MultiWordUintType word, HKFPMATH_CHECK_SINGLE(TypeA))
        {
#ifdef HKFPMATH_CHECK_WORD_RANGE
            HK_ASSERT(0x97325973, i == 0);
#endif
            const TypeA converted = TypeA(word);
            a = converted;
#ifdef HKFPMATH_CHECK_WORD_RANGE
            HK_ASSERT(0x97325974, a == word);
#endif
        }

        /// Sets word i of a multi-word value. The index is only validated (upper bound)
        /// when HKFPMATH_CHECK_WORD_RANGE is defined.
        template<int TYPE_A_WORDS>
        inline void inPlaceSetWordUint(MultiWordUint<TYPE_A_WORDS> & a, int i, MultiWordUintType word)
        {
#ifdef HKFPMATH_CHECK_WORD_RANGE
            HK_ASSERT(0x97325973, i < TYPE_A_WORDS);
#endif
            MultiWordUintType & target = a[i];
            target = word;
        }

        /// Replaces a single-word value with the saturation value of an EXACT_BITS-wide
        /// signed integer: 2^(EXACT_BITS-1) - 1 if negative is false, or the bitwise
        /// complement of that pattern (across the whole word) if negative is true.
        ///
        /// The mask is built in MultiWordUintType rather than int so that EXACT_BITS values
        /// of 32 and above (possible with 64-bit words) do not overflow the shift.
        template<int EXACT_BITS, typename TypeA>
        inline void inPlaceSaturateInt(TypeA & a, bool negative, HKFPMATH_CHECK_SINGLE(TypeA))
        {
            // max representable positive int
            MultiWordUintType val = (MultiWordUintType(1) << (EXACT_BITS-1)) - 1;

            // If negative, complement for max representable negative int.
            // -MultiWordIntType(negative) is all ones when negative is true, zero otherwise.
            val ^= MultiWordUintType(-MultiWordIntType(negative));

            a = TypeA(val);
        }

        /// Replaces a multi-word value with the saturation value of an EXACT_BITS-wide
        /// signed integer, sign-extended to the full storage:
        ///  - positive: 2^(EXACT_BITS-1) - 1 (same pattern as setMaxValInt)
        ///  - negative: -(2^(EXACT_BITS-1) - 1) (same pattern as setMinValInt; the range is
        ///    symmetric, hence the set LSB)
        ///
        /// EXACT_BITS must be at least 1 and must not exceed the number of bits in a.
        ///
        /// The word holding the sign bit is always written explicitly with a mask derived
        /// from (EXACT_BITS-1) % word width. This also covers EXACT_BITS being an exact
        /// multiple of the word width, and never shifts by the total bit count.
        template<int EXACT_BITS, int TYPE_A_WORDS>
        inline void inPlaceSaturateInt(MultiWordUint<TYPE_A_WORDS> & a, bool negative)
        {
            // All ones if negative, all zeros otherwise.
            MultiWordUintType splatted = MultiWordUintType(-MultiWordIntType(negative));

            /// The number of low-end words to be entirely filled with 0 (for negative) or 1 (for positive).
            /// These are the words strictly below the one containing the sign bit.
            const int FILLED_WORDS = (EXACT_BITS - 1) / MULTI_WORD_UINT_BITS;

            /// The number of significant words; those above these words will be entirely filled with 1 (for negative) or 0 (for positive)
            const int RES_WORDS = FILLED_WORDS + 1;

            /// Position of the sign bit inside its word.
            const int SIGN_BIT_POS = (EXACT_BITS - 1) % MULTI_WORD_UINT_BITS;

            // Set low words
            for(int i = 0; i<FILLED_WORDS; i++)
            {
                a[i] = ~splatted;
            }

            // Set the word containing the sign bit: bits below the sign bit are 1 for
            // positive / 0 for negative, the sign bit and everything above are the opposite.
            const MultiWordUintType belowSignMask = (MultiWordUintType(1) << SIGN_BIT_POS) - 1;
            a[FILLED_WORDS] = belowSignMask ^ splatted;

            // Set high words
            for(int i = RES_WORDS; i<TYPE_A_WORDS; i++)
            {
                a[i] = splatted;
            }

            // Set the LSB to 1, because we clamp negative numbers to 0x1000....0001
            a[0] |= 0x1;
        }

        /// Shifts a single-word value right by the given number of bits, in place.
        /// No range checking is performed on the shift count.
        template<typename TypeA>
        inline void inPlaceShrUint(TypeA & a, int bits)
        {
#ifdef HKFPMATH_CHECK_WORD_RANGE
            // TODO need range assertion here
#endif
            a = TypeA(a >> bits);
        }

        /// Shifts the multi-word value a right by totalBits, in place, filling the vacated high words with zero.
        ///
        /// The shift is split into a whole-word part and a sub-word part. No range check is performed on
        /// totalBits.
        template<int TYPE_A_WORDS>
        inline void inPlaceShrUint(MultiWordUint<TYPE_A_WORDS> & a, int totalBits)
        {
#ifdef HKFPMATH_CHECK_WORD_RANGE
            // TODO need range assertion here
#endif
            const int wordShift = totalBits >> LOG2_MULTI_WORD_UINT_BITS;
            const int bitShift = totalBits & ((1 << LOG2_MULTI_WORD_UINT_BITS)-1);

            // Index of the first word that ends up cleared
            const int firstCleared = TYPE_A_WORDS - wordShift;

            if(bitShift != 0)
            {
                // general case: each destination word combines two neighbouring source words
                const int carryShift = MULTI_WORD_UINT_BITS - bitShift;
                for(int dst = 0; dst < firstCleared-1; dst++)
                {
                    const int src = dst + wordShift;
                    a[dst] = (a[src] >> bitShift) | (a[src+1] << carryShift);
                }

                // the topmost surviving word has no higher neighbour to pull bits from
                a[firstCleared-1] = a[TYPE_A_WORDS-1] >> bitShift;
            }
            else
            {
                // word-size shift: a plain move, lowest word first so sources are read before being overwritten
                for(int dst = 0; dst < firstCleared; dst++)
                {
                    a[dst] = a[dst+wordShift];
                }
            }

            // clear the vacated high words
            for(int dst = firstCleared; dst < TYPE_A_WORDS; dst++)
            {
                a[dst] = 0;
            }
        }

        /// Shifts the single-word value a left by bits, storing the result back into a.
        template<typename TypeA>
        inline void inPlaceShlUint(TypeA & a, int bits)
        {
            //HK_COMPILE_TIME_ASSERT(bits < 8 * sizeof(TypeA));
#ifdef HKFPMATH_CHECK_WORD_RANGE
            // TODO need range assertion here
#endif
            a = TypeA(a << bits);
        }

        /// Shifts the multi-word value a left by totalBits, in place, filling the vacated low words with zero.
        ///
        /// The shift is split into a whole-word part and a sub-word part. No range check is performed on
        /// totalBits.
        template<int TYPE_A_WORDS>
        inline void inPlaceShlUint(MultiWordUint<TYPE_A_WORDS> & a, int totalBits)
        {
#ifdef HKFPMATH_CHECK_WORD_RANGE
            // TODO need range assertion here
#endif
            const int wordShift = totalBits >> LOG2_MULTI_WORD_UINT_BITS;
            const int bitShift = totalBits & ((1 << LOG2_MULTI_WORD_UINT_BITS)-1);

            if(bitShift != 0)
            {
                // general case: each destination word combines two neighbouring source words.
                // Highest word first, so sources are read before being overwritten.
                const int carryShift = MULTI_WORD_UINT_BITS - bitShift;
                for(int dst = TYPE_A_WORDS-1; dst > wordShift; dst--)
                {
                    const int src = dst - wordShift;
                    a[dst] = (a[src] << bitShift) | (a[src-1] >> carryShift);
                }

                // the lowest surviving word has no lower neighbour to pull bits from
                a[wordShift] = a[0] << bitShift;
            }
            else
            {
                // word-size shift: a plain move
                for(int dst = TYPE_A_WORDS-1; dst >= wordShift; dst--)
                {
                    a[dst] = a[dst-wordShift];
                }
            }

            // clear the vacated low words
            for(int dst = wordShift-1; dst >= 0; dst--)
            {
                a[dst] = 0;
            }
        }

        /// Counts the leading zero bits of the single-word value a, relative to the width of TypeA.
        ///
        /// The count is taken on the value widened to MultiWordUintType and then reduced by the number of
        /// bits by which the word type is wider than TypeA.
        template<typename TypeA>
        inline int countLeadingZeroesUint(TypeA a)
        {
            const int wideningBits = int((sizeof(MultiWordUintType) - sizeof(TypeA)) * 8);
            return hkMath::countLeadingZeros(MultiWordUintType(a)) - wideningBits;
        }

        /// Counts the leading zero bits of the multi-word value a.
        ///
        /// Returns TYPE_A_WORDS * MULTI_WORD_UINT_BITS when every word is zero.
        template<int TYPE_A_WORDS>
        inline int countLeadingZeroesUint(MultiWordUint<TYPE_A_WORDS> const& a)
        {
            // Walk down from the most significant word, counting the all-zero words passed on the way
            int zeroWords = 0;
            for(int word = TYPE_A_WORDS-1; word >= 0; word--, zeroWords++)
            {
                if(a[word] != 0)
                {
                    return zeroWords*MULTI_WORD_UINT_BITS + countLeadingZeroesUint(a[word]);
                }
            }

            return TYPE_A_WORDS * MULTI_WORD_UINT_BITS;
        }

        template<typename TypeA>
        inline int ceilLog2Uint(TypeA a)
        {
            return (MULTI_WORD_UINT_BITS - hkMath::countLeadingZeros(MultiWordUintType(a)-1)) & (MULTI_WORD_UINT_BITS-1);
        }

        /// Computes ceil(log2(a)) for the multi-word unsigned value a, as the bit length of (a - 1).
        ///
        /// Returns 0 when every word of a is zero.
        template<int TYPE_A_WORDS>
        inline int ceilLog2Uint(MultiWordUint<TYPE_A_WORDS> const& a)
        {
            for(int i = TYPE_A_WORDS-1; i>=0; i--)
            {
                if(a[i] != 0)
                {
                    // Decrementing a only borrows from its top non-zero word when every lower word is zero.
                    // Otherwise the top word of (a - 1) is unchanged; decrementing it regardless would
                    // under-report values such as 2^MULTI_WORD_UINT_BITS + 1 by one.
                    bool lowerWordsZero = true;
                    for(int k = 0; k<i; k++)
                    {
                        lowerWordsZero &= (a[k] == 0);
                    }

                    const MultiWordUintType top = a[i] - MultiWordUintType(lowerWordsZero);
                    return (MULTI_WORD_UINT_BITS - hkMath::countLeadingZeros(top)) + MULTI_WORD_UINT_BITS*i;
                }
            }

            return 0;
        }

        /// Applies ceilLog2Uint to the absolute value of the single-word signed interpretation of a.
        template<typename TypeA>
        inline int ceilLog2Int(TypeA a)
        {
            return ceilLog2Uint(hkMath::abs(makeSigned(a)));
        }

        /// Computes ceil(log2(|a|)) for the multi-word signed value a, as the bit length of (|a| - 1).
        ///
        /// Returns 0 when every word of a equals the sign splat (i.e. no word differs from pure sign extension).
        template<int TYPE_A_WORDS>
        inline int ceilLog2Int(MultiWordUint<TYPE_A_WORDS> const& a)
        {
            MultiWordUintType splatted = signSplat(a);
            for(int i = TYPE_A_WORDS-1; i>=0; i--)
            {
                if(a[i] != splatted)
                {
                    // We need word i of (|a| - 1); hkMath::abs cannot be applied per word.
                    //  - Negative a: |a| - 1 == ~a, so the word is simply the complement of a[i].
                    //  - Positive a: the decrement only borrows from this word when all lower words are zero.
                    bool borrow = (splatted == 0);
                    for(int k = 0; borrow && k<i; k++)
                    {
                        borrow = (a[k] == 0);
                    }

                    const MultiWordUintType top = (a[i] ^ splatted) - MultiWordUintType(borrow);
                    return (MULTI_WORD_UINT_BITS - hkMath::countLeadingZeros(top)) + MULTI_WORD_UINT_BITS*i;
                }
            }

            return 0;
        }

        /// Counts the trailing zero bits of a full word by forwarding to hkMath::countTrailingZeros.
        inline int countTrailingZerosUint(MultiWordUintType a)
        {
            return hkMath::countTrailingZeros(a);
        }

        /// Counts the trailing zero bits of a single-word value narrower than the word type.
        ///
        /// The count is taken on the value widened to MultiWordUintType and then capped at the bit width of
        /// TypeA.
        ///
        /// NOTE(review): the two arguments passed to hkMath::min2 have different types (the count and a
        /// size_t expression) -- confirm that min2 accepts mixed argument types.
        template<typename TypeA>
        inline int countTrailingZerosUint(TypeA a)
        {
            return hkMath::min2(hkMath::countTrailingZeros(MultiWordUintType(a)), sizeof(TypeA)*8);
        }

        /// Counts the trailing zero bits of the multi-word value a.
        ///
        /// Returns TYPE_A_WORDS * MULTI_WORD_UINT_BITS when every word is zero.
        template<int TYPE_A_WORDS>
        inline int countTrailingZerosUint(MultiWordUint<TYPE_A_WORDS> const& a)
        {
            // Walk up from the least significant word until a non-zero word is found
            int word = 0;
            while(word < TYPE_A_WORDS)
            {
                if(a[word] != 0)
                {
                    return word*MULTI_WORD_UINT_BITS + countTrailingZerosUint(a[word]);
                }
                word++;
            }

            return TYPE_A_WORDS * MULTI_WORD_UINT_BITS;
        }

        /// Single-word unsigned division: writes a / b to q and a % b to r.
        ///
        /// The quotient type must be at least as large as TypeA and the remainder type at least as large as
        /// TypeB (checked at compile time). No check is made for b == 0.
        template<typename TypeA, typename TypeB, typename TypeQ, typename TypeR>
        inline void divModUint(
            TypeA a, TypeB b, TypeQ & q, TypeR & r,
            HKFPMATH_CHECK_SINGLE(TypeA), HKFPMATH_CHECK_SINGLE(TypeB), HKFPMATH_CHECK_SINGLE(TypeQ), HKFPMATH_CHECK_SINGLE(TypeR))
        {
            HK_COMPILE_TIME_ASSERT(sizeof(TypeQ) >= sizeof(TypeA));
            HK_COMPILE_TIME_ASSERT(sizeof(TypeR) >= sizeof(TypeB));

            q = TypeQ(a / b);
            r = TypeR(a % b);
        }

        /// Divides the multi-word unsigned value a by the single-word divisor b.
        ///
        /// Writes the quotient to q (which must have at least as many words as a) and the remainder to r.
        /// The division proceeds from the most significant word of a downwards, carrying the running
        /// remainder into the next lower word. Two implementations are selected by HKFPMATH_64_BIT_WORDS.
        template<int TYPE_A_WORDS, typename TypeB, int TYPE_Q_WORDS, typename TypeR>
        inline void divModUint(
            MultiWordUint<TYPE_A_WORDS> const& a, TypeB b, MultiWordUint<TYPE_Q_WORDS> & q, TypeR & r,
            HKFPMATH_CHECK_SINGLE(TypeB), HKFPMATH_CHECK_SINGLE(TypeR))
        {
#ifdef HKFPMATH_64_BIT_WORDS
            HK_COMPILE_TIME_ASSERT(TYPE_Q_WORDS >= TYPE_A_WORDS);
            HK_COMPILE_TIME_ASSERT(sizeof(TypeR) >= sizeof(TypeB));
            int i;
            // Quotient words above the width of a are always zero
            for(i = TYPE_Q_WORDS-1; i >= TYPE_A_WORDS; i--)
            {
                q[i] = 0;
            }
            // Skip leading zero words of a (their quotient words are zero), stopping at word 0 at the latest
            for(; i > 0; i--)
            {
                if(a[i] == 0)
                {
                    q[i] = 0;
                }
                else
                {
                    break;
                }
            }

            // The top remaining word is divided on its own with the single-word overload
            MultiWordUintType rr;
            divModUint(a[i], b, q[i], rr);

            // Each lower word is divided together with the remainder carried from the word above.
            // presumably divWordsUnsigned takes (low word, high word, divisor, quotient out, remainder out) -- confirm
            for(i--; i >= 0; i--)
            {
                divWordsUnsigned(a[i], rr, b, q[i], rr);
                //q[i] = hkDivModUint128Uint64(a[i], rr, b, &rr);
            }

            r = TypeR(rr);
#else
            HK_COMPILE_TIME_ASSERT(TYPE_Q_WORDS >= TYPE_A_WORDS);
            HK_COMPILE_TIME_ASSERT(sizeof(TypeR) >= sizeof(TypeB));
            MultiWordUintType rr = 0;
            // Quotient words above the width of a are always zero
            for(int i = TYPE_A_WORDS; i < TYPE_Q_WORDS; i++)
            {
                q[i] = 0;
            }

            // Long division, one word at a time, using the double-width type for the (remainder:word) pair
            for(int i = TYPE_A_WORDS - 1; i >= 0; i--)
            {
                DoubleMultiWordUintType aw = (DoubleMultiWordUintType(rr) << MULTI_WORD_UINT_BITS) | a[i];
                q[i] = MultiWordUintType(aw / b);
                rr = MultiWordUintType(aw % b);
            }
            r = TypeR(rr);
#endif
        }

        /// Divides the multi-word unsigned value u by the multi-word unsigned value v.
        ///
        /// Writes the quotient to q (at least as many words as u) and the remainder to r. When u < v the
        /// quotient is zero and the remainder is u. When v occupies a single significant word the work is
        /// forwarded to the multi-word / single-word overload. Otherwise a normalised schoolbook long division
        /// is performed (the structure follows Knuth's Algorithm D: estimate a quotient word, multiply and
        /// subtract, add back on underflow).
        ///
        /// No check is made for v == 0.
        template<int TYPE_U_WORDS, int TYPE_V_WORDS, int TYPE_Q_WORDS, int TYPE_R_WORDS>
        inline void divModUint(MultiWordUint<TYPE_U_WORDS> const& u, MultiWordUint<TYPE_V_WORDS> const& v, MultiWordUint<TYPE_Q_WORDS> & q, MultiWordUint<TYPE_R_WORDS> & r)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_Q_WORDS >= TYPE_U_WORDS);

            setZeroUint(q); // Need to zero-fill for later anyway

            if(lessUintUint(u, v))
            {
                resizeUint(u, r);
                return;
            }

            // m, n: number of significant words in u and v
            int m = TYPE_U_WORDS;
            while(u[m-1] == 0) m--;

            int n = TYPE_V_WORDS;
            while(v[n-1] == 0) n--;

            if(n == 1)
            {
                setZeroUint(r);
                divModUint(u, v[0], q, r[0]);
                return;
            }

            // Normalisation shift: moves the top set bit of v into the top bit of its top word
            int s = countLeadingZeroesUint(v[n-1]);

            // Normalised copies; un has one extra word to hold the bits shifted out of u
            MultiWordUintType un[TYPE_U_WORDS+1];
            MultiWordUintType vn[TYPE_V_WORDS];

            // special case zero-shift, for architectures that shift modulo the word size
            if(s != 0)
            {
                for(int i = n-1; i>0; i--)
                {
                    vn[i] = (v[i] << s) | (v[i-1] >> (MULTI_WORD_UINT_BITS-s));
                }
                vn[0] = v[0] << s;

                un[m] = u[m-1] >> (MULTI_WORD_UINT_BITS-s);
                for(int i = m-1; i>0; i--)
                {
                    un[i] = (u[i] << s) | (u[i-1] >> (MULTI_WORD_UINT_BITS-s));
                }
                un[0] = u[0] << s;
            }
            else
            {
                for(int i = 0; i<m; i++)
                {
                    un[i] = u[i];
                }
                un[m] = 0;

                for(int i = 0; i<n; i++)
                {
                    vn[i] = v[i];
                }
            }

            // Produce one quotient word per iteration, most significant first
            for(int j = m-n; j >= 0; j--)
            {
                // Estimate the quotient word from the top words of the current remainder and the divisor
                MultiWordUintType qHat;
                if(un[j+n] >= vn[n-1])
                {
                    // The two-word / one-word division would overflow; use the largest word value. The
                    // add-back step below corrects an over-estimate.
                    qHat = ~MultiWordUintType(0);
                }
                else
                {
                    MultiWordUintType rHat;
                    divWordsUnsigned(un[j+n-1], un[j+n], vn[n-1], qHat, rHat);
                again:
                    // Refine the estimate using the second divisor word
                    MultiWordUintType uHatLo, uHatHi;
                    mulWordsUnsigned(qHat, vn[n-2], uHatLo, uHatHi);
                    if(uHatHi > rHat || (uHatHi == rHat && uHatLo > un[j+n-2]))
                    {
                        qHat--;
                        unsigned char carry = 0;
                        addCarry(carry, rHat, vn[n-1], rHat);
                        if(carry == 0)
                        {
                            goto again;
                        }
                    }
                }

                // Subtract either odd or even paired entries, such that there's one extra entry on top. This is never
                // enough to underflow (we need the other entries as well for that).
                {
                    unsigned char borrow = 0;
                    for(int i = n&1; i < n; i += 2)
                    {
                        MultiWordUintType lo, hi;
                        mulWordsUnsigned(vn[i], qHat, lo, hi);
                        subBorrow(borrow, un[i+j], lo, un[i+j]);
                        subBorrow(borrow, un[i+j+1], hi, un[i+j+1]);
                    }
                    un[j+n] -= borrow;
                }

                // Now subtract the other entries. The final borrow flag tells us whether we've underflowed.
                unsigned char borrow = 0;
                for(int i = 1-(n&1); i < n; i += 2)
                {
                    MultiWordUintType lo, hi;
                    mulWordsUnsigned(vn[i], qHat, lo, hi);
                    subBorrow(borrow, un[i+j], lo, un[i+j]);
                    subBorrow(borrow, un[i+j+1], hi, un[i+j+1]);
                }

                if(HK_VERY_UNLIKELY(borrow))
                {
                    // The estimate was one too large: add the divisor back
                    qHat--;
                    unsigned char carry = 0;
                    for(int i = 0; i < n; i++)
                    {
                        addCarry(carry, un[i+j], vn[i], un[i+j]);
                    }
                    // Propagate the carry into the top word so that it returns to zero. un[n] is read when
                    // the remainder is de-normalised below, so leaving it wrapped would corrupt r[n-1] when
                    // the add-back happens on the last iteration.
                    un[j+n] += carry;
                }

                q[j] = qHat;
            }

            // De-normalise the remainder, which now occupies the low words of un
            if(s != 0)
            {
                for(int i = 0; i<n; i++)
                {
                    r[i] = (un[i] >> s) | (un[i+1] << (MULTI_WORD_UINT_BITS-s));
                }
            }
            else
            {
                for(int i = 0; i<n; i++)
                {
                    r[i] = un[i];
                }
            }

            for(int i = n; i<TYPE_R_WORDS; i++)
            {
                r[i] = 0;
            }

            for(int i = m; i<TYPE_Q_WORDS; i++)
            {
                q[i] = 0;
            }
        }

        /// Returns true when the signed interpretation of the single-word value a is negative.
        template<typename TypeA>
        inline bool lessZeroInt(TypeA a)
        {
            return makeSigned(a) < 0;
        }

        /// Returns true when the multi-word value a is negative, i.e. when its most significant word is
        /// negative when read as a signed word.
        template<int TYPE_A_WORDS>
        inline bool lessZeroInt(MultiWordUint<TYPE_A_WORDS> const& a)
        {
            const MultiWordIntType topWord = MultiWordIntType(a[TYPE_A_WORDS - 1]);
            return topWord < 0;
        }

        /// Returns the sign of a as an int, built by OR-ing the sign splat of a with (1 - equalZero(a)).
        ///
        /// presumably yields -1, 0 or +1, given signSplat returns all-ones for negative values and equalZero
        /// returns 1 for zero -- confirm against those helpers.
        template<typename TypeA>
        inline int sgnInt(TypeA a)
        {
            return int(signSplat(a) | (1-equalZero(a)));
        }

        /// Signed less-than comparison of two single-word values.
        template<typename TypeA, typename TypeB>
        inline bool lessIntInt(TypeA a, TypeB b)
        {
            return makeSigned(a) < makeSigned(b);
        }

        /// Signed less-than comparison of the multi-word value a against the single-word value b.
        ///
        /// b is treated as sign-extended to the width of a. The comparison is branch-free: the signs are
        /// compared first, then the words are compared as unsigned values from the most significant word
        /// down, and the first differing word decides the result.
        template<int TYPE_A_WORDS, typename TypeB>
        inline bool lessIntInt(MultiWordUint<TYPE_A_WORDS> const& a, TypeB b)
        {
            bool aLt0 = lessZeroInt(a);
            bool bLt0 = lessZeroInt(b);

            // Differing signs decide immediately; equal signs leave the result open
            bool lt = aLt0 & !bLt0;
            bool eq = (aLt0 == bLt0);

            // High words first: the most significant differing word must decide, so these have to be
            // examined before the low word. The high words of b are its sign extension.
            MultiWordUintType bSignSplatted = signSplat(b);
            for(int i = TYPE_A_WORDS - 1; i >= 1; i--)
            {
                bool thisLt = (a[i] < bSignSplatted);
                bool thisGt = (a[i] > bSignSplatted);

                lt |= (eq & thisLt);
                eq &= !(thisLt | thisGt);
            }

            // Low word last
            {
                bool thisLt = (a[0] < signExtend(b));

                lt |= (eq & thisLt);
            }

            return lt;
        }

        /// Signed less-than comparison of two multi-word values of possibly different widths.
        ///
        /// The shorter operand is treated as sign-extended. The comparison is branch-free per word: the signs
        /// are compared first, then the words are compared as unsigned values from the most significant word
        /// down, and the first differing word decides the result.
        template<int TYPE_A_WORDS, int TYPE_B_WORDS>
        inline bool lessIntInt(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_B_WORDS> const& b)
        {
            const bool aNegative = lessZeroInt<TYPE_A_WORDS>(a);
            const bool bNegative = lessZeroInt<TYPE_B_WORDS>(b);

            // Differing signs decide immediately; equal signs leave the result open
            bool less = aNegative & !bNegative;
            bool undecided = (aNegative == bNegative);

            /// The number of words present in both operands
            const int COMMON_WORDS = (TYPE_A_WORDS < TYPE_B_WORDS) ? TYPE_A_WORDS : TYPE_B_WORDS;

            // Extra high words of a (if a is longer) are compared against the sign extension of b
            if(TYPE_A_WORDS > TYPE_B_WORDS)
            {
                const MultiWordUintType bExtension = signSplat(b);

                for(int word = TYPE_A_WORDS - 1; word >= TYPE_B_WORDS; word--)
                {
                    less |= (undecided & (a[word] < bExtension));
                    undecided &= (a[word] == bExtension);
                }
            }

            // Extra high words of b (if b is longer) are compared against the sign extension of a
            if(TYPE_A_WORDS < TYPE_B_WORDS)
            {
                const MultiWordUintType aExtension = signSplat(a);

                for(int word = TYPE_B_WORDS - 1; word >= TYPE_A_WORDS; word--)
                {
                    less |= (undecided & (aExtension < b[word]));
                    undecided &= (aExtension == b[word]);
                }
            }

            // Words present in both operands
            for(int word = COMMON_WORDS - 1; word >= 0; word--)
            {
                less |= (undecided & (a[word] < b[word]));
                undecided &= (a[word] == b[word]);
            }

            return less;
        }

        /// Signed greater-than comparison of the multi-word value a against the single-word value b.
        ///
        /// b is treated as sign-extended to the width of a. The comparison is branch-free: the signs are
        /// compared first, then the words are compared as unsigned values from the most significant word
        /// down, and the first differing word decides the result.
        template<int TYPE_A_WORDS, typename TypeB>
        inline bool greaterIntInt(MultiWordUint<TYPE_A_WORDS> const& a, TypeB b)
        {
            bool aLt0 = lessZeroInt(a);
            bool bLt0 = lessZeroInt(b);

            // Differing signs decide immediately; equal signs leave the result open
            bool gt = !aLt0 & bLt0;
            bool eq = (aLt0 == bLt0);

            // High words first: the most significant differing word must decide, so these have to be
            // examined before the low word. The high words of b are its sign extension.
            MultiWordUintType bSignSplatted = signSplat(b);
            for(int i = TYPE_A_WORDS - 1; i >= 1; i--)
            {
                bool thisLt = (a[i] < bSignSplatted);
                bool thisGt = (a[i] > bSignSplatted);

                gt |= (eq & thisGt);
                eq &= !(thisLt | thisGt);
            }

            // Low word last
            {
                bool thisGt = (a[0] > signExtend(b));

                gt |= (eq & thisGt);
            }

            return gt;
        }

        /// Signed equality comparison of two single-word values.
        template<typename TypeA, typename TypeB>
        inline bool equalIntInt(TypeA a, TypeB b)
        {
            return makeSigned(a) == makeSigned(b);
        }

        /// Signed equality comparison of the multi-word value a against the single-word value b.
        ///
        /// The low word of a must match the sign-extended b, and every higher word of a must match the sign
        /// splat of b. All words are always examined (no early exit).
        template<int TYPE_A_WORDS, typename TypeB>
        inline bool equalIntInt(MultiWordUint<TYPE_A_WORDS> const& a, TypeB b)
        {
            const MultiWordUintType bExtension = signSplat(b);

            bool same = equalIntInt(a[0], signExtend(b));

            int word = 1;
            while(word < TYPE_A_WORDS)
            {
                same = same & (a[word] == bExtension);
                word++;
            }
            return same;
        }

        /// Signed equality comparison of two multi-word values of possibly different widths.
        ///
        /// Words present in both operands must match; the extra high words of the longer operand must match
        /// the sign splat of the shorter one. All words are always examined (no early exit).
        template<int TYPE_A_WORDS, int TYPE_B_WORDS>
        inline bool equalIntInt(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_B_WORDS> const& b)
        {
            /// The number of words present in both operands
            const int COMMON_WORDS = (TYPE_A_WORDS < TYPE_B_WORDS) ? TYPE_A_WORDS : TYPE_B_WORDS;

            bool same = true;

            // words present in both operands
            for(int word = 0; word < COMMON_WORDS; word++)
            {
                same = same & bool(a[word] == b[word]);
            }

            if(TYPE_A_WORDS > TYPE_B_WORDS)
            {
                // a is longer: its extra words must be the sign extension of b
                const MultiWordUintType bExtension = signSplat(b);
                for(int word = TYPE_B_WORDS; word < TYPE_A_WORDS; word++)
                {
                    same = same & bool(a[word] == bExtension);
                }
            }

            if(TYPE_A_WORDS < TYPE_B_WORDS)
            {
                // b is longer: its extra words must be the sign extension of a
                const MultiWordUintType aExtension = signSplat(a);
                for(int word = TYPE_A_WORDS; word < TYPE_B_WORDS; word++)
                {
                    same = same & bool(b[word] == aExtension);
                }
            }

            return same;
        }

        template<typename TypeA, typename TypeB, typename TypeRes>
        inline void addIntInt(TypeA a, TypeB b, TypeRes & res, HKFPMATH_CHECK_SINGLE(TypeA), HKFPMATH_CHECK_SINGLE(TypeB), HKFPMATH_CHECK_SINGLE(TypeRes))
        {
            HK_COMPILE_TIME_ASSERT(sizeof(TypeA) <= sizeof(TypeRes));
            HK_COMPILE_TIME_ASSERT(sizeof(TypeB) <= sizeof(TypeRes));
            typedef typename EquivalentIntType<TypeRes>::Type IntTypeRes;
            res = TypeRes(IntTypeRes(makeSigned(a)) + makeSigned(b));
        }

        /// Signed addition of two single-word values into a two-word result.
        ///
        /// The low word is the sum of the sign-extended operands; the high word is the sum of the operands'
        /// sign splats plus the carry out of the low word.
        template<typename TypeA, typename TypeB>
        inline void addIntInt(TypeA a, TypeB b, MultiWordUint<2> & res, HKFPMATH_CHECK_SINGLE(TypeA), HKFPMATH_CHECK_SINGLE(TypeB))
        {
            unsigned char carry = 0;
            addCarry(carry, signExtend(a), signExtend(b), res[0]);
            addCarry(carry, signSplat(a), signSplat(b), res[1]);
        }

        /// Signed addition of the multi-word value a and the single-word value b.
        ///
        /// b is treated as sign-extended to the width of a. The result has either the same number of words
        /// as a, or one more; in the latter case the extra word is the sum of both operands' sign extensions
        /// plus the carry out of the lower words.
        template<int TYPE_A_WORDS, typename TypeB, int TYPE_RES_WORDS>
        inline void addIntInt(MultiWordUint<TYPE_A_WORDS> const& a, TypeB b, MultiWordUint<TYPE_RES_WORDS> & res, HKFPMATH_CHECK_SINGLE(TypeB))
        {
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS == TYPE_A_WORDS || TYPE_RES_WORDS == TYPE_A_WORDS + 1);

            // Taken up front, before any word of res is written
            MultiWordUintType aSignSplatted = signSplat(a);
            MultiWordUintType bSignSplatted = signSplat(b);

            unsigned char carry = 0;
            addCarry(carry, a[0], signExtend(b), res[0]);

            for(int i = 1; i < TYPE_A_WORDS; i++)
            {
                addCarry(carry, a[i], bSignSplatted, res[i]);
            }
            if(TYPE_RES_WORDS > TYPE_A_WORDS)
            {
                // The extension word of a signed sum is not the raw carry: both operands contribute their
                // sign extension (e.g. -1 + -1 must extend with all ones, and -1 + 1 with zero).
                addCarry(carry, aSignSplatted, bSignSplatted, res[TYPE_RES_WORDS - 1]);
            }
        }

        /// Signed addition of the single-word value a and the multi-word value b.
        ///
        /// Forwards to the (multi-word, single-word) overload with the operands swapped.
        template<typename TypeA, int TYPE_B_WORDS, int TYPE_RES_WORDS>
        void addIntInt(TypeA a, MultiWordUint<TYPE_B_WORDS> const& b, MultiWordUint<TYPE_RES_WORDS> & res, HKFPMATH_CHECK_SINGLE(TypeA))
        {
            addIntInt(b, a, res);
        }

        /// Signed addition of two multi-word values of possibly different widths.
        ///
        /// The result must be at least as wide as both operands and at most one word wider than the wider
        /// one (checked at compile time). The shorter operand is treated as sign-extended; an extra result
        /// word receives the sum of both sign extensions plus the carry.
        template<int TYPE_A_WORDS, int TYPE_B_WORDS, int TYPE_RES_WORDS>
        void addIntInt(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_B_WORDS> const& b, MultiWordUint<TYPE_RES_WORDS> & res)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS > 0);
            HK_COMPILE_TIME_ASSERT(TYPE_B_WORDS > 0);
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS >= TYPE_A_WORDS);
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS >= TYPE_B_WORDS);
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS <= (TYPE_A_WORDS > TYPE_B_WORDS ? TYPE_A_WORDS : TYPE_B_WORDS) + 1);

            // The code below assumes a is at least as long as b; otherwise swap the operands
            if(TYPE_A_WORDS < TYPE_B_WORDS)
            {
                addIntInt(b, a, res);
                return;
            }

            unsigned char carry = 0;

            // words present in both operands
            int word = 0;
            while(word < TYPE_B_WORDS)
            {
                addCarry(carry, a[word], b[word], res[word]);
                word++;
            }

            // remaining words of a are added to the sign extension of b
            if(TYPE_A_WORDS > TYPE_B_WORDS)
            {
                const MultiWordUintType bExtension = signSplat(b);
                for(word = TYPE_B_WORDS; word < TYPE_A_WORDS; word++)
                {
                    addCarry(carry, a[word], bExtension, res[word]);
                }
            }

            // extra result word: both sign extensions plus the carry
            if(TYPE_RES_WORDS > TYPE_A_WORDS)
            {
                const MultiWordUintType aExtension = signSplat(a);
                const MultiWordUintType bExtension = signSplat(b);
                addCarry(carry, aExtension, bExtension, res[TYPE_RES_WORDS - 1]);
            }
        }

        template<typename TypeA, typename TypeB, typename TypeRes>
        inline void subIntInt(TypeA a, TypeB b, TypeRes & res, HKFPMATH_CHECK_SINGLE(TypeA), HKFPMATH_CHECK_SINGLE(TypeB), HKFPMATH_CHECK_SINGLE(TypeRes))
        {
            HK_COMPILE_TIME_ASSERT(sizeof(TypeA) <= sizeof(TypeRes));
            HK_COMPILE_TIME_ASSERT(sizeof(TypeB) <= sizeof(TypeRes));
            typedef typename EquivalentIntType<TypeRes>::Type IntTypeRes;
            res = TypeRes(IntTypeRes(makeSigned(a)) - makeSigned(b));
        }

        /// Signed subtraction (a - b) of two single-word values into a two-word result.
        ///
        /// The low word is the difference of the sign-extended operands; the high word is the difference of
        /// the operands' sign splats minus the borrow out of the low word.
        template<typename TypeA, typename TypeB>
        inline void subIntInt(TypeA a, TypeB b, MultiWordUint<2> & res, HKFPMATH_CHECK_SINGLE(TypeA), HKFPMATH_CHECK_SINGLE(TypeB))
        {
            unsigned char borrow = 0;
            subBorrow(borrow, signExtend(a), signExtend(b), res[0]);
            subBorrow(borrow, signSplat(a), signSplat(b), res[1]);
        }

        /// Signed subtraction (a - b) of the single-word value b from the multi-word value a.
        ///
        /// The result has either the same number of words as a, or one more. Both operands are treated as
        /// sign-extended to the width of the result.
        template<int TYPE_A_WORDS, typename TypeB, int TYPE_RES_WORDS>
        inline void subIntInt(MultiWordUint<TYPE_A_WORDS> const& a, TypeB b, MultiWordUint<TYPE_RES_WORDS> & res, HKFPMATH_CHECK_SINGLE(TypeB))
        {
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS == TYPE_A_WORDS || TYPE_RES_WORDS == TYPE_A_WORDS + 1);

            // Sign extensions, taken before any word of res is written
            const MultiWordUintType aExtension = signSplat(a);
            const MultiWordUintType bExtension = signSplat(b);

            unsigned char borrow = 0;

            // low word: the only one carrying actual bits of b
            subBorrow(borrow, a[0], signExtend(b), res[0]);

            for(int word = 1; word < TYPE_RES_WORDS; word++)
            {
                // past the end of a, its sign extension stands in
                MultiWordUintType minuend = aExtension;
                if(word < TYPE_A_WORDS)
                {
                    minuend = a[word];
                }
                subBorrow(borrow, minuend, bExtension, res[word]);
            }
        }

        template<typename TypeA, int TYPE_B_WORDS, int TYPE_RES_WORDS>
        inline void subIntInt(TypeA a, MultiWordUint<TYPE_B_WORDS> const& b, MultiWordUint<TYPE_RES_WORDS> & res, HKFPMATH_CHECK_SINGLE(TypeA))
        {
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS == TYPE_B_WORDS || TYPE_RES_WORDS == TYPE_B_WORDS + 1);

            MultiWordUintType aSignSplatted = signSplat(a);
            MultiWordUintType bSignSplatted = signSplat(b);

            unsigned char borrow = 0;
            subBorrow(borrow, signExtend(a), b[0], res[0]);
            for(int i = 1; i < TYPE_RES_WORDS; i++)
            {
                MultiWordUintType bw = i<TYPE_B_WORDS ? b[i] : bSignSplatted;
                subBorrow(borrow, aSignSplatted, bw, res[i]);
            }
        }

        /// Signed subtraction (a - b) of two multi-word values of possibly different widths.
        ///
        /// The result must be at least as wide as both operands and at most one word wider than the wider
        /// one (checked at compile time). Operands shorter than the result are treated as sign-extended.
        template<int TYPE_A_WORDS, int TYPE_B_WORDS, int TYPE_RES_WORDS>
        inline void subIntInt(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_B_WORDS> const& b, MultiWordUint<TYPE_RES_WORDS> & res)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS > 0);
            HK_COMPILE_TIME_ASSERT(TYPE_B_WORDS > 0);
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS >= TYPE_A_WORDS);
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS >= TYPE_B_WORDS);
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS <= (TYPE_A_WORDS > TYPE_B_WORDS ? TYPE_A_WORDS : TYPE_B_WORDS) + 1);

            // Sign extensions, taken before any word of res is written
            const MultiWordUintType aExtension = signSplat(a);
            const MultiWordUintType bExtension = signSplat(b);

            unsigned char borrow = 0;
            for(int word = 0; word < TYPE_RES_WORDS; word++)
            {
                // past the end of an operand, its sign extension stands in
                MultiWordUintType minuend = aExtension;
                if(word < TYPE_A_WORDS)
                {
                    minuend = a[word];
                }

                MultiWordUintType subtrahend = bExtension;
                if(word < TYPE_B_WORDS)
                {
                    subtrahend = b[word];
                }

                subBorrow(borrow, minuend, subtrahend, res[word]);
            }
        }

        /// In-place signed subtraction: a -= b, where b is a single-word value.
        ///
        /// b is treated as sign-extended to the width of a. a is limited to at most two words (checked at
        /// compile time).
        template<int TYPE_A_WORDS, typename TypeB>
        inline void inPlaceSubIntInt(MultiWordUint<TYPE_A_WORDS> & a, TypeB b)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS > 0);
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS <= 2);

            MultiWordUintType bSignSplatted = signSplat(b);

            unsigned char borrow = 0;
            subBorrow(borrow, a[0], signExtend(b), a[0]);
            for(int i = 1; i < TYPE_A_WORDS; i++)
            {
                // The output word is passed by reference, as in the call above (not by address)
                subBorrow(borrow, a[i], bSignSplatted, a[i]);
            }
        }

        /// In-place signed subtraction of a word-shifted single-word value: a -= (b << B_SHIFT_WORDS words).
        ///
        /// b is treated as sign-extended. a must have either B_SHIFT_WORDS or B_SHIFT_WORDS + 1 words
        /// (checked at compile time); in the former case the shifted b lies entirely above a and a is left
        /// unchanged.
        template<int TYPE_A_WORDS, typename TypeB, int B_SHIFT_WORDS>
        inline void inPlaceSubIntIntBShifted(MultiWordUint<TYPE_A_WORDS> & a, TypeB b)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS > 0);
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS >= B_SHIFT_WORDS);
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS <= B_SHIFT_WORDS + 1);

            // lower B_SHIFT_WORDS words of a are unaffected

            // The asserts above allow TYPE_A_WORDS == B_SHIFT_WORDS, in which case word B_SHIFT_WORDS does
            // not exist in a and must not be touched.
            if(B_SHIFT_WORDS < TYPE_A_WORDS)
            {
                MultiWordUintType bSignSplatted = signSplat(b);

                unsigned char borrow = 0;
                subBorrow(borrow, a[B_SHIFT_WORDS], signExtend(b), a[B_SHIFT_WORDS]);
                for(int i = B_SHIFT_WORDS + 1; i < TYPE_A_WORDS; i++)
                {
                    subBorrow(borrow, a[i], bSignSplatted, a[i]);
                }
            }
        }

        /// In-place signed subtraction: a -= b, where both are multi-word values.
        ///
        /// a must have either as many words as b, or one more (checked at compile time). b is treated as
        /// sign-extended to the width of a.
        template<int TYPE_A_WORDS, int TYPE_B_WORDS>
        inline void inPlaceSubIntInt(MultiWordUint<TYPE_A_WORDS> & a, MultiWordUint<TYPE_B_WORDS> const& b)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS > 0);
            HK_COMPILE_TIME_ASSERT(TYPE_B_WORDS > 0);
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS >= TYPE_B_WORDS);
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS <= TYPE_B_WORDS + 1);

            const MultiWordUintType bExtension = signSplat(b);

            unsigned char borrow = 0;
            for(int word = 0; word < TYPE_A_WORDS; word++)
            {
                // past the end of b, its sign extension stands in
                MultiWordUintType subtrahend = bExtension;
                if(word < TYPE_B_WORDS)
                {
                    subtrahend = b[word];
                }

                subBorrow(borrow, a[word], subtrahend, a[word]);
            }
        }

        /// In-place signed subtraction of a word-shifted multi-word value: a -= (b << B_SHIFT_WORDS words).
        ///
        /// a must have either TYPE_B_WORDS + B_SHIFT_WORDS words, or one more (checked at compile time). b is
        /// treated as sign-extended up to the top of a.
        template<int TYPE_A_WORDS, int TYPE_B_WORDS, int B_SHIFT_WORDS>
        inline void inPlaceSubIntIntBShifted(MultiWordUint<TYPE_A_WORDS> & a, MultiWordUint<TYPE_B_WORDS> const& b)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS > 0);
            HK_COMPILE_TIME_ASSERT(TYPE_B_WORDS > 0);
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS >= TYPE_B_WORDS + B_SHIFT_WORDS);
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS <= TYPE_B_WORDS + B_SHIFT_WORDS + 1);

            // lower B_SHIFT_WORDS words of a are unaffected

            const MultiWordUintType bExtension = signSplat(b);

            unsigned char borrow = 0;

            // aWord indexes a, bWord is the matching (unshifted) index into b
            for(int aWord = B_SHIFT_WORDS; aWord < TYPE_A_WORDS; aWord++)
            {
                const int bWord = aWord - B_SHIFT_WORDS;

                // past the end of b, its sign extension stands in
                MultiWordUintType subtrahend = bExtension;
                if(bWord < TYPE_B_WORDS)
                {
                    subtrahend = b[bWord];
                }

                subBorrow(borrow, a[aWord], subtrahend, a[aWord]);
            }
        }

        template<typename TypeA, typename TypeB, typename TypeRes>
        inline void mulIntInt(TypeA a, TypeB b, TypeRes & res, HKFPMATH_CHECK_SINGLE(TypeA), HKFPMATH_CHECK_SINGLE(TypeB), HKFPMATH_CHECK_SINGLE(TypeRes))
        {
            HK_COMPILE_TIME_ASSERT(sizeof(TypeA) <= sizeof(TypeRes));
            HK_COMPILE_TIME_ASSERT(sizeof(TypeB) <= sizeof(TypeRes));
            typedef typename EquivalentIntType<TypeRes>::Type IntTypeRes;
            res = TypeRes(IntTypeRes(makeSigned(a)) * makeSigned(b));
        }

        /// Signed multiplication of two single-word values into a two-word result.
        ///
        /// Both operands are sign-extended to a full word and passed to mulWordsSigned, which fills the low
        /// word res[0] and the high word res[1].
        template<typename TypeA, typename TypeB>
        inline void mulIntInt(TypeA a, TypeB b, MultiWordUint<2> & res, HKFPMATH_CHECK_SINGLE(TypeA), HKFPMATH_CHECK_SINGLE(TypeB))
        {
            mulWordsSigned(signExtend(a), signExtend(b), res[0], res[1]);
        }

        /// Signed multiplication of the multi-word value a by the single-word value b.
        ///
        /// The product is first formed as an unsigned multiplication, then corrected for the sign of each
        /// operand: if a is negative, b shifted up by TYPE_A_WORDS words is subtracted; if b is negative, a
        /// shifted up by one word is subtracted.
        ///
        /// NOTE(review): the second correction instantiates inPlaceSubIntIntBShifted with a b-width of
        /// TYPE_A_WORDS and a shift of 1, whose compile-time asserts require the result to have at least
        /// TYPE_A_WORDS + 1 words -- confirm whether TYPE_RES_WORDS == TYPE_A_WORDS is usable in practice.
        template<int TYPE_A_WORDS, typename TypeB, int TYPE_RES_WORDS>
        inline void mulIntInt(MultiWordUint<TYPE_A_WORDS> const& a, TypeB b, MultiWordUint<TYPE_RES_WORDS> & res, HKFPMATH_CHECK_SINGLE(TypeB))
        {
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS == TYPE_A_WORDS || TYPE_RES_WORDS == TYPE_A_WORDS + 1);
            // Unsigned product of a and the sign-extended word of b
            mulUintUint(a, signExtend(b), res);
            if(lessZeroInt(a))
            {
                // a negative: subtract b, shifted past all the words of a
                inPlaceSubIntIntBShifted<TYPE_RES_WORDS, TypeB, TYPE_A_WORDS>(res, b);
            }
            if(lessZeroInt(b))
            {
                // b negative: subtract a, shifted past the single word of b
                inPlaceSubIntIntBShifted<TYPE_RES_WORDS, TYPE_A_WORDS, 1>(res, a);
            }
        }

        /// Signed multiplication of the single-word value a by the multi-word value b.
        ///
        /// Forwards to the (multi-word, single-word) overload with the operands swapped.
        template<typename TypeA, int TYPE_B_WORDS, int TYPE_RES_WORDS>
        inline void mulIntInt(TypeA a, MultiWordUint<TYPE_B_WORDS> const& b, MultiWordUint<TYPE_RES_WORDS> & res, HKFPMATH_CHECK_SINGLE(TypeA))
        {
            mulIntInt(b, a, res);
        }

        /// Signed multiplication of two multi-word values.
        ///
        /// The product is first formed as an unsigned multiplication, then corrected for the sign of each
        /// operand: if a is negative, b shifted up by TYPE_A_WORDS words is subtracted; if b is negative, a
        /// shifted up by TYPE_B_WORDS words is subtracted.
        template<int TYPE_A_WORDS, int TYPE_B_WORDS, int TYPE_RES_WORDS>
        inline void mulIntInt(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_B_WORDS> const& b, MultiWordUint<TYPE_RES_WORDS> & res)
        {
            // Unsigned product of the raw words
            mulUintUint(a, b, res);
            if(lessZeroInt(a))
            {
                // a negative: subtract b, shifted past all the words of a
                inPlaceSubIntIntBShifted<TYPE_RES_WORDS, TYPE_B_WORDS, TYPE_A_WORDS>(res, b);
            }
            if(lessZeroInt(b))
            {
                // b negative: subtract a, shifted past all the words of b
                inPlaceSubIntIntBShifted<TYPE_RES_WORDS, TYPE_A_WORDS, TYPE_B_WORDS>(res, a);
            }
        }

        /// Narrows the single-word signed value a into the (no larger) single-word type TypeRes.
        ///
        /// The value is converted by a plain cast. When HKFPMATH_CHECK_WORD_RANGE is defined, an assertion
        /// verifies that the signed value survived the conversion unchanged.
        template<typename TypeA, typename TypeRes>
        inline void narrowInt(TypeA const& a, TypeRes & res, HKFPMATH_CHECK_SINGLE(TypeA), HKFPMATH_CHECK_SINGLE(TypeRes))
        {
            HK_COMPILE_TIME_ASSERT(sizeof(res) <= sizeof(a));

            res = TypeRes(a);
#ifdef HKFPMATH_CHECK_WORD_RANGE
            HK_ASSERT(0x94751ad8, makeSigned(res) == makeSigned(a));
#endif
        }

        /// Narrows the multi-word signed value a into the single-word type TypeRes.
        ///
        /// Only the low word of a is used. When HKFPMATH_CHECK_WORD_RANGE is defined, assertions verify that
        /// the low word survived the conversion unchanged and that every higher word of a is the sign
        /// extension of the low word.
        template<int TYPE_A_WORDS, typename TypeRes>
        inline void narrowInt(MultiWordUint<TYPE_A_WORDS> const& a, TypeRes & res, HKFPMATH_CHECK_SINGLE(TypeRes))
        {
            res = TypeRes(a[0]);
#ifdef HKFPMATH_CHECK_WORD_RANGE
            // Compare against the low word only: the higher words are covered by the loop below
            HK_ASSERT(0x94751ad8, makeSigned(res) == makeSigned(a[0]));
            for(int i = 1; i < TYPE_A_WORDS; i++)
            {
                HK_ASSERT(0x94751ad8, a[i] == signSplat(a[0]));
            }
#endif
        }

        /// Narrows a multi-word signed value into a multi-word value with no more words.
        ///
        /// The lowest TYPE_RES_WORDS words of a are copied to res. When HKFPMATH_CHECK_WORD_RANGE is defined,
        /// each word of a that is not copied is asserted to equal signSplat(a).
        template<int TYPE_A_WORDS, int TYPE_RES_WORDS>
        inline void narrowInt(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_RES_WORDS> & res)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS >= TYPE_RES_WORDS);

            // Keep the low words.
            for(int word = 0; word != TYPE_RES_WORDS; ++word)
            {
                res[word] = a[word];
            }

#ifdef HKFPMATH_CHECK_WORD_RANGE
            // Whatever is dropped must match the sign splat of the source.
            for(int i = TYPE_RES_WORDS; i < TYPE_A_WORDS; ++i)
            {
                HK_ASSERT(0x94751ad8, a[i] == signSplat(a));
            }
#endif
        }

        /// Returns true if a, read as a signed integer, fits in EXACT_BITS bits.
        ///
        /// A value fits when every bit from position EXACT_BITS-1 upwards is a copy of the sign bit, which is
        /// the case exactly when signed-shifting it right by EXACT_BITS-1 leaves 0 or -1.
        template<int EXACT_BITS, typename TypeA>
        inline bool checkWordRangeInt(TypeA const& a)
        {
            // Every value of the type fits if at least its full width was asked for.
            if(EXACT_BITS >= sizeof(TypeA)*8)
            {
                return true;
            }

            // Signed-shift off the potential non-sign bits. Values in range will now be all 0 or all 1.
            //
            // The shifted value is tested in its own (signed) width instead of being converted to
            // MultiWordUintType first: that conversion would drop the upper bits whenever TypeA is wider
            // than a word and could make an out-of-range value look in range.
            return (makeSigned(a) >> (EXACT_BITS-1)) == 0
                || (makeSigned(a) >> (EXACT_BITS-1)) == -1;
        }

        /// Returns true if the multi-word value a, read as a signed integer, fits in EXACT_BITS bits.
        ///
        /// That requires every bit from position EXACT_BITS-1 upwards to be a copy of the sign bit of a:
        ///  - every word lying entirely above the range must equal signSplat(a), and
        ///  - in the highest word that still holds in-range bits, the top in-range bit and everything above
        ///    it must match signSplat(a) as well.
        template<int EXACT_BITS, int TYPE_A_WORDS>
        inline bool checkWordRangeInt(MultiWordUint<TYPE_A_WORDS> const& a)
        {
            // Asking for at least the full width always succeeds.
            if(EXACT_BITS >= MULTI_WORD_UINT_BITS * TYPE_A_WORDS)
            {
                return true;
            }

            MultiWordUintType signMask = signSplat(a);

            // Number of low words that contain at least one in-range bit.
            const int RANGE_WORDS = (EXACT_BITS + MULTI_WORD_UINT_BITS - 1) / MULTI_WORD_UINT_BITS;

            bool inRange = true;
            // Check any words to be discarded entirely
            for(int i = RANGE_WORDS; i<TYPE_A_WORDS; i++)
            {
                inRange &= (a[i] == signMask);
            }

            // Check the highest kept word. Its in-range part is EXTRA_BITS wide, or a whole word when
            // EXACT_BITS is a multiple of the word size. Signed-shifting that word right by (width-1) leaves
            // only copies of its top in-range bit plus the bits above it; the result must equal the sign mask
            // of the whole value. Comparing with signMask (rather than accepting either all 0 or all 1) also
            // rejects a top word whose upper bits are consistent with each other but not with the real sign.
            if(RANGE_WORDS > 0)
            {
                const int EXTRA_BITS = EXACT_BITS % MULTI_WORD_UINT_BITS;
                const int TOP_WORD_BITS = (EXTRA_BITS != 0) ? EXTRA_BITS : MULTI_WORD_UINT_BITS;
                const int TOP_WORD_INDEX = (RANGE_WORDS > 0) ? RANGE_WORDS-1 : 0;

                inRange &= (MultiWordUintType(makeSigned(a[TOP_WORD_INDEX]) >> (TOP_WORD_BITS-1)) == signMask);
            }

            return inRange;
        }

        template<typename TypeA, typename TypeRes>
        inline void widenInt(TypeA const& a, TypeRes & res)
        {
            HK_COMPILE_TIME_ASSERT(sizeof(res) >= sizeof(a));

            res = TypeRes(typename EquivalentIntType<TypeRes>::Type(makeSigned(a)));
        }

        template<typename TypeA, int TYPE_RES_WORDS>
        inline void widenInt(TypeA const& a, MultiWordUint<TYPE_RES_WORDS> & res)
        {
            res[0] = signExtend(a);
            MultiWordUintType aSignSplatted = signSplat(a);
            for(int i = 1; i < TYPE_RES_WORDS; i++)
            {
                res[i] = aSignSplatted;
            }
        }

        /// Widens a multi-word signed value into a multi-word value with at least as many words.
        ///
        /// The words of a are copied to the low words of res; the remaining high words of res are filled with
        /// signSplat(a).
        template<int TYPE_A_WORDS, int TYPE_RES_WORDS>
        inline void widenInt(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_RES_WORDS> & res)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS <= TYPE_RES_WORDS);

            // One running index covers both phases: copy first, then fill.
            int word = 0;
            for(; word < TYPE_A_WORDS; ++word)
            {
                res[word] = a[word];
            }

            const MultiWordUintType fillWord = signSplat(a);
            for(; word < TYPE_RES_WORDS; ++word)
            {
                res[word] = fillWord;
            }
        }

        /// Stores a 32-bit signed integer into a single-word result.
        ///
        /// The value is converted with a plain cast. When HKFPMATH_CHECK_WORD_RANGE is defined, an assert verifies
        /// that reading the result back as signed gives the original value.
        template<typename TypeRes>
        inline void convertInt32Int(hkInt32 a, TypeRes & res)
        {
            res = static_cast<TypeRes>(a);

#ifdef HKFPMATH_CHECK_WORD_RANGE
            // Round-trip check.
            HK_ASSERT(0x97325974, makeSigned(res) == a);
#endif
        }

        /// Stores a 32-bit signed integer into a multi-word result.
        ///
        /// Word 0 of res is signExtend(a); every other word is set to signSplat(a).
        template<int TYPE_RES_WORDS>
        inline void convertInt32Int(hkInt32 a, MultiWordUint<TYPE_RES_WORDS> & res)
        {
            res[0] = signExtend(a);

            // Upper words hold the sign pattern only.
            const MultiWordUintType fillWord = signSplat(a);
            for(int word = 1; word != TYPE_RES_WORDS; ++word)
            {
                res[word] = fillWord;
            }
        }

        template<typename TypeA, typename TypeB>
        inline void absIntToUint(TypeA const& a, TypeB & res)
        {
            HK_COMPILE_TIME_ASSERT(sizeof(TypeB) <= sizeof(TypeA));

            res = TypeB(hkMath::abs(makeSigned(a)));

#ifdef HKFPMATH_CHECK_WORD_RANGE
            if(sizeof(TypeB) < sizeof(TypeA))
            {
                HK_ASSERT(0xca97bee4, !(makeSigned(a) < 0 && res == 0));
            }
#endif
        }

        /// Writes the absolute value of a two-word signed value to a single-word unsigned result.
        ///
        /// Only the low word a[0] contributes to the result; a[1] is used solely to obtain the sign pattern.
        /// The low word is XORed with that pattern and the pattern is then subtracted (conditional negate).
        /// When HKFPMATH_CHECK_WORD_RANGE is defined, an assert rejects a non-zero sign pattern combined with
        /// a[0] == 0.
        template<int TYPE_A_WORDS, typename TypeB>
        inline void absIntToUint(MultiWordUint<TYPE_A_WORDS> const& a, TypeB & res)
        {
            HK_COMPILE_TIME_ASSERT(2 == TYPE_A_WORDS);

            const MultiWordUintType aSignSplatted = signSplat(a[1]);
#ifdef HKFPMATH_CHECK_WORD_RANGE
            HK_ASSERT(0xca97bee4, !(aSignSplatted && a[0] == 0));
#endif

            // (x ^ s) - s leaves x alone when s is zero and negates it when s is all ones.
            const MultiWordUintType flippedLow = a[0] ^ aSignSplatted;
            res = flippedLow - aSignSplatted;
        }

        /// Writes the absolute value of a multi-word signed value to a multi-word unsigned result.
        ///
        /// res must have the same number of words as a, or one fewer. Each low word of a is XORed with
        /// signSplat(a) and the splat is subtracted with a borrow chained across the words.
        /// When HKFPMATH_CHECK_WORD_RANGE is defined and a word of a is dropped, the final borrow is asserted to
        /// be zero.
        template<int TYPE_A_WORDS, int TYPE_RES_WORDS>
        inline void absIntToUint(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_RES_WORDS> & res)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS - TYPE_RES_WORDS == 0 || TYPE_A_WORDS - TYPE_RES_WORDS == 1);

            const MultiWordUintType signWord = signSplat(a);

            unsigned char borrow = 0;
            for(int word = 0; word != TYPE_RES_WORDS; ++word)
            {
                // Read a[word] before res[word] is written.
                const MultiWordUintType flipped = a[word] ^ signWord;
                subBorrow(borrow, flipped, signWord, res[word]);
            }

#ifdef HKFPMATH_CHECK_WORD_RANGE
            if(TYPE_A_WORDS > TYPE_RES_WORDS)
            {
                HK_ASSERT(0xca97bee3, borrow == 0);
            }
#endif
        }

        // negateInt is guaranteed to work in-place

        /// Negates a single-word signed value.
        ///
        /// a is taken by value, so res may refer to the caller's original variable. The negation is applied
        /// to makeSigned(a) and the result is converted back to TypeA.
        /// The trailing HKFPMATH_CHECK_SINGLE parameter is unnamed and not used in the body.
        template<typename TypeA>
        inline void negateInt(TypeA a, TypeA & res, HKFPMATH_CHECK_SINGLE(TypeA))
        {
            HK_COMPILE_TIME_ASSERT(sizeof(res) == sizeof(a));

            res = static_cast<TypeA>(-makeSigned(a));
        }

        /// Negates a multi-word signed value by computing 0 - a word by word with a chained borrow.
        ///
        /// Each a[word] is passed to subBorrow in the same call that writes res[word], so res may be the same
        /// object as a.
        template<int TYPE_A_WORDS>
        inline void negateInt(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_A_WORDS> & res)
        {
            unsigned char borrow = 0;
            for(int word = 0; word != TYPE_A_WORDS; ++word)
            {
                subBorrow(borrow, 0, a[word], res[word]);
            }
        }


        // In all these, A is unsigned, B is signed, and Res is signed
        template<typename TypeA, typename TypeB, typename TypeRes>
        inline void mulUintInt(TypeA a, TypeB b, TypeRes & res)
        {
            res = TypeRes(a * typename EquivalentIntType<TypeRes>::Type(makeSigned(b)));
        }

        /// Multiplies the unsigned single-word a by the signed single-word b into a two-word result.
        ///
        /// mulWordsUnsigned forms the product with b read as unsigned, writing res[0] and res[1]. res[1] is
        /// then reduced by (signSplat(b) & a), which is a when the splat is all ones and 0 when it is zero.
        template<typename TypeA, typename TypeB>
        inline void mulUintInt(TypeA a, TypeB b, MultiWordUint<2> & res)
        {
            mulWordsUnsigned(a, b, res[0], res[1]);

            // Sign correction, applied to the second word only.
            res[1] = res[1] - (signSplat(b) & a);
        }

        /// Multiplies the unsigned single-word a by the signed multi-word b into a multi-word result.
        ///
        /// res must have as many words as b, or one more. The product is formed with mulUintUint (b read as
        /// unsigned); if lessZeroInt(b) holds, it is then corrected with inPlaceSubIntIntBShifted using a.
        template<typename TypeA, int TYPE_B_WORDS, int TYPE_RES_WORDS>
        inline void mulUintInt(TypeA a, MultiWordUint<TYPE_B_WORDS> const& b, MultiWordUint<TYPE_RES_WORDS> & res)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS - TYPE_B_WORDS == 0 || TYPE_RES_WORDS - TYPE_B_WORDS == 1);

            mulUintUint(b, a, res);

            // A non-negative b needs no correction.
            if(!lessZeroInt(b))
            {
                return;
            }

            inPlaceSubIntIntBShifted<TYPE_RES_WORDS, TypeA, TYPE_B_WORDS>(res, a);
        }

        /// Multiplies the unsigned multi-word a by the signed single-word b into a multi-word result.
        ///
        /// res must have as many words as a, or one more. The product is formed with mulUintUint on a and
        /// signExtend(b); if lessZeroInt(b) holds, it is then corrected with inPlaceSubIntIntBShifted using a.
        template<int TYPE_A_WORDS, typename TypeB, int TYPE_RES_WORDS>
        inline void mulUintInt(MultiWordUint<TYPE_A_WORDS> const& a, TypeB b, MultiWordUint<TYPE_RES_WORDS> & res)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_RES_WORDS - TYPE_A_WORDS == 0 || TYPE_RES_WORDS - TYPE_A_WORDS == 1);

            mulUintUint(a, signExtend(b), res);

            // A non-negative b needs no correction.
            if(!lessZeroInt(b))
            {
                return;
            }

            inPlaceSubIntIntBShifted<TYPE_RES_WORDS, TYPE_A_WORDS, 1>(res, a);
        }

        /// Multiplies the unsigned multi-word a by the signed multi-word b into a multi-word result.
        ///
        /// The product is formed with mulUintUint (b read as unsigned); if lessZeroInt(b) holds, it is then
        /// corrected with inPlaceSubIntIntBShifted using a.
        template<int TYPE_A_WORDS, int TYPE_B_WORDS, int TYPE_RES_WORDS>
        inline void mulUintInt(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_B_WORDS> const& b, MultiWordUint<TYPE_RES_WORDS> & res)
        {
            mulUintUint(a, b, res);

            // A non-negative b needs no correction.
            if(!lessZeroInt(b))
            {
                return;
            }

            inPlaceSubIntIntBShifted<TYPE_RES_WORDS, TYPE_A_WORDS, TYPE_B_WORDS>(res, a);
        }

        /// Stores an unsigned single-word value into a single-word result that is at least as large.
        ///
        /// The value is assigned directly, without any explicit cast.
        template<typename TypeA, typename TypeRes>
        inline void convertUintInt(TypeA const& a, TypeRes & res)
        {
            // The destination must be able to hold every bit of the source.
            HK_COMPILE_TIME_ASSERT(sizeof(TypeA) <= sizeof(TypeRes));

            res = a;
        }

        /// Stores an unsigned single-word value into a multi-word result.
        ///
        /// Word 0 of res receives a; every other word is cleared to zero.
        template<typename TypeA, int TYPE_RES_WORDS>
        inline void convertUintInt(TypeA const& a, MultiWordUint<TYPE_RES_WORDS> & res)
        {
            res[0] = a;

            // Zero-extend into the remaining words.
            for(int word = 1; word != TYPE_RES_WORDS; ++word)
            {
                res[word] = 0;
            }
        }

        /// Stores an unsigned multi-word value into a multi-word result.
        ///
        /// res must have as many words as a, or one more. The words of a are copied and any extra word of res
        /// is cleared to zero.
        template<int TYPE_A_WORDS, int TYPE_RES_WORDS>
        inline void convertUintInt(MultiWordUint<TYPE_A_WORDS> const& a, MultiWordUint<TYPE_RES_WORDS> & res)
        {
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS <= TYPE_RES_WORDS);
            HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS+1 >= TYPE_RES_WORDS);

            // One running index covers both phases: copy first, then zero-fill.
            int word = 0;
            for(; word < TYPE_A_WORDS; ++word)
            {
                res[word] = a[word];
            }
            for(; word < TYPE_RES_WORDS; ++word)
            {
                res[word] = 0;
            }
        }

        /// Extracts one word's worth of bits starting at the highest set bit of an unsigned multi-word value.
        ///
        /// Words are scanned from the top down to word 1. For the first non-zero word, the value is shifted
        /// left until its highest set bit is the MSB of the returned word, pulling in as many bits as needed
        /// from the word below. expOut receives the number of bits the original value would have to be
        /// right-shifted to obtain the result, so a is approximately result * 2^expOut.
        ///
        /// If all words above word 0 are zero, a[0] is returned unchanged and expOut is set to zero.
        template<int TYPE_A_WORDS>
        inline MultiWordUintType getHighSignificantBitsUint(MultiWordUint<TYPE_A_WORDS> const& a, int & expOut)
        {
            for(int wordIdx = TYPE_A_WORDS-1; wordIdx >= 1; --wordIdx)
            {
                if(a[wordIdx] == 0)
                {
                    continue;
                }

                const int leadingZeros = hkMath::countLeadingZeros(a[wordIdx]);
                expOut = wordIdx*MULTI_WORD_UINT_BITS - leadingZeros;

                MultiWordUintType bits = a[wordIdx] << leadingZeros;
                if(leadingZeros != 0)
                {
                    // Fill the vacated low bits from the next word down. Skipped for a zero shift, which
                    // would otherwise shift that word by the full word width.
                    bits |= a[wordIdx-1] >> (MULTI_WORD_UINT_BITS-leadingZeros);
                }
                return bits;
            }

            // Nothing above word 0: no scaling required.
            expOut = 0;
            return a[0];
        }

        /// Extracts one word's worth of bits from the significant top of a signed multi-word value. The result
        /// always has an MSB equal to the sign bit of the input.
        ///
        /// Words are scanned from the top down to word 1, looking for the first one that differs from the sign
        /// splat. That word is shifted so that exactly one copy of the sign bit stays above its highest
        /// significant bit, pulling in bits from the word below when shifting left.
        ///
        /// expOut is filled with the number of bits right-shifted to produce the result, so a is approximately
        /// result * 2^expOut. In this implementation expOut is never negative: a value whose upper words are
        /// all sign extension is returned from word 0 without any left shift (expOut 0), or shifted right by
        /// one bit (expOut 1) if the MSB of word 0 disagrees with the sign.
        template<int TYPE_A_WORDS>
        inline MultiWordUintType getHighSignificantBitsInt(MultiWordUint<TYPE_A_WORDS> const& a, int & expOut)
        {
            const MultiWordUintType signWord = signSplat(a);

            for(int wordIdx = TYPE_A_WORDS-1; wordIdx >= 1; --wordIdx)
            {
                if(a[wordIdx] == signWord)
                {
                    // Pure sign extension, keep looking further down.
                    continue;
                }

                // Number of redundant sign copies at the top of this word, minus the one that is kept.
                const int shift = hkMath::countLeadingZeros(a[wordIdx] ^ signWord) - 1;
                expOut = wordIdx*MULTI_WORD_UINT_BITS - shift;

                if(shift < 0)
                {
                    // MSB doesn't match sign bit; need to shift right 1 bit, with sign extension
                    return (a[wordIdx] >> 1) | (signWord & WORD_SIGN_MASK);
                }

                MultiWordUintType bits = a[wordIdx] << shift;
                if(shift != 0)
                {
                    // Fill the vacated low bits from the next word down.
                    bits |= a[wordIdx-1] >> (MULTI_WORD_UINT_BITS-shift);
                }
                return bits;
            }

            // Only word 0 carries significant bits.
            if(((a[0] ^ signWord) & WORD_SIGN_MASK) == 0)
            {
                // MSB already agrees with the sign: return the word as it is.
                expOut = 0;
                return a[0];
            }

            // MSB doesn't match sign bit; need to shift right 1 bit, with sign extension
            expOut = 1;
            return (a[0] >> 1) | (signWord & WORD_SIGN_MASK);
        }

        // Forward declarations of the float conversions. The single-word *Unscaled helpers defined next call
        // convertUintFloat / convertIntFloat before the definitions of those functions appear further down.

        /// Converts an unsigned single-word value to a float.
        template<typename TypeA>
        inline hkFloat32 convertUintFloat(TypeA const& a);
        /// Converts an unsigned multi-word value to a float.
        template<int TYPE_A_WORDS>
        inline hkFloat32 convertUintFloat(MultiWordUint<TYPE_A_WORDS> const& a);
        /// Converts a signed single-word value to a float.
        template<typename TypeA>
        inline hkFloat32 convertIntFloat(TypeA const& a);
        /// Converts a signed multi-word value to a float.
        template<int TYPE_A_WORDS>
        inline hkFloat32 convertIntFloat(MultiWordUint<TYPE_A_WORDS> const& a);

        /// Converts an unsigned single-word value to a float, reporting a scale exponent of zero.
        ///
        /// The returned float is convertUintFloat(a); expOut is always set to 0.
        /// The trailing HKFPMATH_CHECK_SINGLE parameter is unnamed and not used in the body.
        template<typename TypeA>
        inline hkFloat32 convertUintFloatUnscaled(TypeA a, int & expOut, HKFPMATH_CHECK_SINGLE(TypeA))
        {
            const hkFloat32 value = convertUintFloat(a);
            expOut = 0;
            return value;
        }

        /// Converts the most significant bits of an unsigned multi-word value to a float.
        ///
        /// The word returned by getHighSignificantBitsUint is converted to a float; expOut receives the
        /// exponent reported by that call, so a is approximately result * 2^expOut.
        template<int TYPE_A_WORDS>
        inline hkFloat32 convertUintFloatUnscaled(MultiWordUint<TYPE_A_WORDS> const& a, int & expOut)
        {
            const MultiWordUintType topBits = getHighSignificantBitsUint(a, expOut);
            const hkFloat32 value = hkFloat32(topBits);
            return value;
        }

        /// Converts a signed single-word value to a float, reporting a scale exponent of zero.
        ///
        /// The returned float is convertIntFloat(a); expOut is always set to 0.
        /// The trailing HKFPMATH_CHECK_SINGLE parameter is unnamed and not used in the body.
        template<typename TypeA>
        inline hkFloat32 convertIntFloatUnscaled(TypeA a, int & expOut, HKFPMATH_CHECK_SINGLE(TypeA))
        {
            const hkFloat32 value = convertIntFloat(a);
            expOut = 0;
            return value;
        }

        /// Converts the most significant bits of a signed multi-word value to a float.
        ///
        /// The word returned by getHighSignificantBitsInt is reinterpreted as a signed word and converted to a
        /// float; expOut receives the exponent reported by that call, so a is approximately result * 2^expOut.
        template<int TYPE_A_WORDS>
        inline hkFloat32 convertIntFloatUnscaled(MultiWordUint<TYPE_A_WORDS> const& a, int & expOut)
        {
            const MultiWordUintType topBits = getHighSignificantBitsInt(a, expOut);
            const MultiWordIntType signedTopBits = MultiWordIntType(topBits);
            return hkFloat32(signedTopBits);
        }

        /// Converts an unsigned single-word value to a float with a direct cast.
        template<typename TypeA>
        inline hkFloat32 convertUintFloat(TypeA const& a)
        {
            return static_cast<hkFloat32>(a);
        }

        /// Converts an unsigned multi-word value to a float.
        ///
        /// The top bits are converted with convertUintFloatUnscaled, and the exponent it reports is then added
        /// directly to the exponent field of the float's bit pattern (bit 23 upwards). No overflow check is
        /// made on the resulting exponent.
        template<int TYPE_A_WORDS>
        inline hkFloat32 convertUintFloat(MultiWordUint<TYPE_A_WORDS> const& a)
        {
            //HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS <= 128 / MULTI_WORD_UINT_BITS);
            int scaleExp;
            hkFloat32 value = convertUintFloatUnscaled(a, scaleExp);

            // Scale by 2^scaleExp by bumping the exponent bits in place.
            reinterpret_cast<hkUint32 &>(value) += scaleExp << 23;

            return value;
        }

        /// Converts a signed single-word value to a float by casting makeSigned(a).
        template<typename TypeA>
        inline hkFloat32 convertIntFloat(TypeA const& a)
        {
            return static_cast<hkFloat32>(makeSigned(a));
        }

        /// Converts a signed multi-word value to a float.
        ///
        /// The top bits are converted with convertIntFloatUnscaled, and the exponent it reports is then added
        /// directly to the exponent field of the float's bit pattern (bit 23 upwards). No overflow check is
        /// made on the resulting exponent.
        template<int TYPE_A_WORDS>
        inline hkFloat32 convertIntFloat(MultiWordUint<TYPE_A_WORDS> const& a)
        {
            //HK_COMPILE_TIME_ASSERT(TYPE_A_WORDS <= 128 / MULTI_WORD_UINT_BITS);
            int scaleExp;
            hkFloat32 value = convertIntFloatUnscaled(a, scaleExp);

            // Scale by 2^scaleExp by bumping the exponent bits in place.
            reinterpret_cast<hkUint32 &>(value) += scaleExp << 23;

            return value;
        }

        /// Converts a float to a single-word integer by casting it to EquivalentIntType<TypeA>::Type and
        /// assigning the result to res.
        template<typename TypeA>
        inline void convertFloatInt(hkFloat32 x, TypeA & res)
        {
            typedef typename EquivalentIntType<TypeA>::Type IntType;

            const IntType converted = IntType(x);
            res = converted;
        }

        /// Converts a float to a signed multi-word integer by decoding its bit pattern.
        ///
        /// res is cleared first. If the unbiased exponent is negative, res is left at zero. Otherwise the
        /// 24-bit mantissa (with the implicit leading one restored) is placed in word 0 and shifted right or
        /// left so that its top bit lands at bit position 'exponent'; bits shifted out on the right are
        /// discarded. If x compares less than zero, the result is finally negated in place.
        /// No check is made that the shifted value fits in res.
        template<int TYPE_A_WORDS>
        inline void convertFloatInt(hkFloat32 x, MultiWordUint<TYPE_A_WORDS> & res)
        {
            setZeroUint(res);

            const hkUint32 bits = reinterpret_cast<hkUint32 const&>(x);

            // Exponent field (bits 23..30) with the bias of 127 removed.
            hkInt32 exponent = ((bits & 0x7F800000) >> 23) - 127;
            if(exponent < 0)
            {
                return;
            }

            // Fraction field plus the implicit leading one at bit 23.
            const hkUint32 mantissa = (bits & 0x007fffff) | 0x00800000;

            if(exponent > 23)
            {
                res[0] = mantissa;
                inPlaceShlUint(res, exponent-23);
            }
            else
            {
                res[0] = mantissa >> (23-exponent);
            }

            if(x<0)
            {
                negateInt(res, res);
            }
        }

        /// Converts x * 2^(-BIAS) to a single-word integer.
        ///
        /// The scale factor is computed with hkMath::pow; the scaled float is then cast to
        /// EquivalentIntType<TypeA>::Type and assigned to res.
        template<int BIAS, typename TypeA>
        inline void convertBiasedFloatInt(hkFloat32 x, TypeA & res)
        {
            typedef typename EquivalentIntType<TypeA>::Type IntType;

            const hkFloat32 scaled = x * hkMath::pow(2.0f, -BIAS);
            res = IntType(scaled);
        }

        /// Converts x * 2^(-BIAS) to a signed multi-word integer by decoding the bit pattern of x.
        ///
        /// res is cleared first. BIAS is subtracted from the unbiased exponent of x; if the result is
        /// negative, res is left at zero. Otherwise the 24-bit mantissa (with the implicit leading one
        /// restored) is placed in word 0 and shifted right or left so that its top bit lands at bit position
        /// 'exponent'; bits shifted out on the right are discarded. If x compares less than zero, the result
        /// is finally negated in place. No check is made that the shifted value fits in res.
        template<int BIAS, int TYPE_A_WORDS>
        inline void convertBiasedFloatInt(hkFloat32 x, MultiWordUint<TYPE_A_WORDS> & res)
        {
            setZeroUint(res);

            const hkUint32 bits = reinterpret_cast<hkUint32 const&>(x);

            // Exponent field (bits 23..30) with the float bias of 127 and the caller's BIAS removed.
            hkInt32 exponent = ((bits & 0x7F800000) >> 23) - 127 - BIAS;
            if(exponent < 0)
            {
                return;
            }

            // Fraction field plus the implicit leading one at bit 23.
            const hkUint32 mantissa = (bits & 0x007fffff) | 0x00800000;

            if(exponent > 23)
            {
                res[0] = mantissa;
                inPlaceShlUint(res, exponent-23);
            }
            else
            {
                res[0] = mantissa >> (23-exponent);
            }

            if(x<0)
            {
                negateInt(res, res);
            }
        }

        /// Hashes a single-word value by delegating to hkHash::hkHashValue.
        template<typename TypeA>
        inline hkUint32 hash(TypeA const& a)
        {
            const hkUint32 hashed = hkHash::hkHashValue(a);
            return hashed;
        }

        /// Hashes a multi-word value.
        ///
        /// Starts from the hash of word 0 and folds in the hash of each following word, in ascending word
        /// order, with hkHash::combineHashValues.
        template<int TYPE_A_WORDS>
        inline hkUint32 hash(MultiWordUint<TYPE_A_WORDS> const& a)
        {
            hkUint32 combined = hkHash::hkHashValue(a[0]);

            for(int word = 1; word != TYPE_A_WORDS; ++word)
            {
                combined = hkHash::combineHashValues(combined, hkHash::hkHashValue(a[word]));
            }

            return combined;
        }

        /// Returns the float approximation of n / d, where n is signed and d is unsigned.
        ///
        /// Both operands are converted with the *Unscaled helpers, the two floats are divided, and the
        /// difference of the two reported exponents is then added to the exponent field of the quotient.
        /// A numerator that converts to zero returns 0 immediately, before d is looked at.
        template<typename TypeN, typename TypeD>
        inline hkFloat32 convertIntUintRatioFloat(TypeN const& n, TypeD const& d)
        {
            int numExp;
            const hkFloat32 numerator = convertIntFloatUnscaled(n, numExp);

            // Zero must bypass the exponent adjustment below, which would otherwise alter its bit pattern.
            if(numerator == 0)
            {
                return 0;
            }

            int denExp;
            const hkFloat32 denominator = convertUintFloatUnscaled(d, denExp);

            hkFloat32 ratio = numerator / denominator;

            // Adjust exponent. Note that this is guaranteed to leave the sign bit alone, as long as we don't
            // underflow/overflow the exponent range.
            reinterpret_cast<hkUint32 &>(ratio) += (numExp - denExp) << 23;

            return ratio;
        }

        template<typename TypeN, typename TypeD>
        inline void convertIntUintRatiosFloat(TypeN const& n1, TypeN const& n2, TypeD const& d, hkFloat32 xOut[2])
        {
            int dExp;
            hkFloat32 dReal = convertUintFloatUnscaled(d, dExp);

            {
                int nExp;
                hkFloat32 nReal = convertIntFloatUnscaled(n1, nExp);
                if(nReal == 0)
                {
                    xOut[0] = 0;
                }
                else
                {
                    hkFloat32 x = nReal / dReal;

                    // Adjust exponent. Note that this is guaranteed to leave the sign bit alone, as long as we don't
                    // underflow/overflow the exponent range.
                    int exp = nExp - dExp;
                    reinterpret_cast<hkUint32 &>(x) += exp << 23;

                    xOut[0] = x;
                }
            }

            {
                int nExp;
                hkFloat32 nReal = convertIntFloatUnscaled(n2, nExp);
                if(nReal == 0)
                {
                    xOut[1] = 0;
                }
                else
                {
                    hkFloat32 x = nReal / dReal;

                    // Adjust exponent. Note that this is guaranteed to leave the sign bit alone, as long as we don't
                    // underflow/overflow the exponent range.
                    int exp = nExp - dExp;
                    reinterpret_cast<hkUint32 &>(x) += exp << 23;

                    xOut[1] = x;
                }
            }
        }

        template<typename TypeN, typename TypeD>
        inline void convertIntUintRatiosFloat(TypeN const& n1, TypeN const& n2, TypeN const& n3, TypeD const& d, hkFloat32 xOut[3])
        {
            int dExp;
            hkFloat32 dReal = convertUintFloatUnscaled(d, dExp);

            {
                int nExp;
                hkFloat32 nReal = convertIntFloatUnscaled(n1, nExp);
                if(nReal == 0)
                {
                    xOut[0] = 0;
                }
                else
                {
                    hkFloat32 x = nReal / dReal;

                    // Adjust exponent. Note that this is guaranteed to leave the sign bit alone, as long as we don't
                    // underflow/overflow the exponent range.
                    int exp = nExp - dExp;
                    reinterpret_cast<hkUint32 &>(x) += exp << 23;

                    xOut[0] = x;
                }
            }

            {
                int nExp;
                hkFloat32 nReal = convertIntFloatUnscaled(n2, nExp);
                if(nReal == 0)
                {
                    xOut[1] = 0;
                }
                else
                {
                    hkFloat32 x = nReal / dReal;

                    // Adjust exponent. Note that this is guaranteed to leave the sign bit alone, as long as we don't
                    // underflow/overflow the exponent range.
                    int exp = nExp - dExp;
                    reinterpret_cast<hkUint32 &>(x) += exp << 23;

                    xOut[1] = x;
                }
            }

            {
                int nExp;
                hkFloat32 nReal = convertIntFloatUnscaled(n3, nExp);
                if(nReal == 0)
                {
                    xOut[2] = 0;
                }
                else
                {
                    hkFloat32 x = nReal / dReal;

                    // Adjust exponent. Note that this is guaranteed to leave the sign bit alone, as long as we don't
                    // underflow/overflow the exponent range.
                    int exp = nExp - dExp;
                    reinterpret_cast<hkUint32 &>(x) += exp << 23;

                    xOut[2] = x;
                }
            }
        }
    }
}

/*
 * Havok SDK - Base file, BUILD(#20180110)
 * 
 * Confidential Information of Microsoft Corporation.
 * Not for disclosure or distribution without Microsoft's prior written
 * consent.  This software contains code, techniques and know-how which
 * is confidential and proprietary to Microsoft.  Product and Trade Secret
 * source code contains trade secrets of Microsoft.  Havok Software (C)
 * Copyright 1999-2018 Microsoft Corporation.
 * All Rights Reserved. Use of this software is subject to the
 * terms of an end user license agreement.
 * 
 * The Havok Logo, and the Havok buzzsaw logo are trademarks of Microsoft.
 * Title, ownership rights, and intellectual property rights in the Havok
 * software remain in Microsoft and/or its suppliers.
 * 
 * Use of this software for evaluation purposes is subject to and
 * indicates acceptance of the End User licence Agreement for this
 * product. A copy of the license is included with this software and is
 * also available from Havok Support.
 * 
 */
