// TKBMS v1.0 -----------------------------------------------------
//
// PLATFORM   : WIN32 X64 LINUX32 LINUX64 METRO_X86 METRO_X64 APOLLO_X86 DURANGO PS4 UWP_X86 UWP_X64 ANDROID_X86 MAC
// PRODUCT   : COMMON
// VISIBILITY   : PUBLIC
//
// ------------------------------------------------------TKBMS v1.0

#include <Common/Base/hkBase.h>

namespace hkMath
{
    // common single precision constants used for SSE math functions

    HK_ALIGN16(const hkUint32 hkSse_sinCoeff0[4]) = { 0xB94CA1F9, 0xB94CA1F9, 0xB94CA1F9, 0xB94CA1F9 }; // -1.9515295891e-4
    HK_ALIGN16(const hkUint32 hkSse_sinCoeff1[4]) = { 0x3C08839E, 0x3C08839E, 0x3C08839E, 0x3C08839E }; //  8.3321608736e-3
    HK_ALIGN16(const hkUint32 hkSse_sinCoeff2[4]) = { 0xBE2AAAA3, 0xBE2AAAA3, 0xBE2AAAA3, 0xBE2AAAA3 }; // -1.6666654611e-1

    HK_ALIGN16(const hkUint32 hkSse_cosCoeff0[4]) = { 0x37CCF5CE, 0x37CCF5CE, 0x37CCF5CE, 0x37CCF5CE }; //  2.443315711809948e-005
    HK_ALIGN16(const hkUint32 hkSse_cosCoeff1[4]) = { 0xBAB6061A, 0xBAB6061A, 0xBAB6061A, 0xBAB6061A }; // -1.388731625493765e-003
    HK_ALIGN16(const hkUint32 hkSse_cosCoeff2[4]) = { 0x3D2AAAA5, 0x3D2AAAA5, 0x3D2AAAA5, 0x3D2AAAA5 }; //  4.166664568298827e-002

    HK_ALIGN16(const hkUint32 hkSse_twoPi[4] )    = { 0x40C90FDB, 0x40C90FDB, 0x40C90FDB, 0x40C90FDB }; // 2*pi
    HK_ALIGN16(const hkUint32 hkSse_pi[4] )       = { 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB }; // pi
    HK_ALIGN16(const hkUint32 hkSse_piOver2[4])   = { 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB }; // pi / 2
    HK_ALIGN16(const hkUint32 hkSse_fourOverPi[4])= { 0x3FA2F983, 0x3FA2F983, 0x3FA2F983, 0x3FA2F983 }; // 4 / pi
    HK_ALIGN16(const hkUint32 hkSse_DP1[4])       = { 0xBF490000, 0xBF490000, 0xBF490000, 0xBF490000 }; // -0.78515625
    HK_ALIGN16(const hkUint32 hkSse_DP2[4])       = { 0xB97DA000, 0xB97DA000, 0xB97DA000, 0xB97DA000 }; // -2.4187564849853515625e-4
    HK_ALIGN16(const hkUint32 hkSse_DP3[4])       = { 0xB3222169, 0xB3222169, 0xB3222169, 0xB3222169 }; // -3.77489497744594108e-8

    HK_ALIGN16(const hkUint32 hkSse_floatHalf[4]) = { 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000 }; // 0.5
    HK_ALIGN16(const hkUint32 hkSse_floatOne[4])  = { 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 }; // 1.0
    HK_ALIGN16(const hkUint32 hkSse_floatTwo[4])  = { 0x40000000, 0x40000000, 0x40000000, 0x40000000 }; // 2.0
    HK_ALIGN16(const hkUint32 hkSse_floatThree[4])= { 0x40400000, 0x40400000, 0x40400000, 0x40400000 }; // 3.0

    HK_ALIGN16(const hkUint32 hkSse_intOne[4])    = { 0x00000001, 0x00000001, 0x00000001, 0x00000001 }; // 1

    HK_ALIGN16(const hkUint32 hkSse_signMask[4])  = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; // sign bit


    // common double precision constants used for SSE math functions

    HK_ALIGN16(const hkUint64 hkSse_D_sinCoeff0[2]) = { 0x3de5d8fd1fd19ccdull, 0x3de5d8fd1fd19ccdull }; //  1.58962301576546568060E-10
    HK_ALIGN16(const hkUint64 hkSse_D_sinCoeff1[2]) = { 0xbe5ae5e5a9291f5dull, 0xbe5ae5e5a9291f5dull }; // -2.50507477628578072866E-8
    HK_ALIGN16(const hkUint64 hkSse_D_sinCoeff2[2]) = { 0x3ec71de3567d48a1ull, 0x3ec71de3567d48a1ull }; //  2.75573136213857245213E-6
    HK_ALIGN16(const hkUint64 hkSse_D_sinCoeff3[2]) = { 0xbf2a01a019bfdf03ull, 0xbf2a01a019bfdf03ull }; // -1.98412698295895385996E-4
    HK_ALIGN16(const hkUint64 hkSse_D_sinCoeff4[2]) = { 0x3f8111111110f7d0ull, 0x3f8111111110f7d0ull }; //  8.33333333332211858878E-3
    HK_ALIGN16(const hkUint64 hkSse_D_sinCoeff5[2]) = { 0xbfc5555555555548ull, 0xbfc5555555555548ull }; // -1.66666666666666307295E-1

    HK_ALIGN16(const hkUint64 hkSse_D_cosCoeff0[2]) = { 0xbda8fa49a0861a9bull, 0xbda8fa49a0861a9bull }; // -1.13585365213876817300E-11
    HK_ALIGN16(const hkUint64 hkSse_D_cosCoeff1[2]) = { 0x3e21ee9d7b4e3f05ull, 0x3e21ee9d7b4e3f05ull }; //  2.08757008419747316778E-9
    HK_ALIGN16(const hkUint64 hkSse_D_cosCoeff2[2]) = { 0xbe927e4f7eac4bc6ull, 0xbe927e4f7eac4bc6ull }; // -2.75573141792967388112E-7
    HK_ALIGN16(const hkUint64 hkSse_D_cosCoeff3[2]) = { 0x3efa01a019c844f5ull, 0x3efa01a019c844f5ull }; //  2.48015872888517045348E-5
    HK_ALIGN16(const hkUint64 hkSse_D_cosCoeff4[2]) = { 0xbf56c16c16c14f91ull, 0xbf56c16c16c14f91ull }; // -1.38888888888730564116E-3
    HK_ALIGN16(const hkUint64 hkSse_D_cosCoeff5[2]) = { 0x3fa555555555554bull, 0x3fa555555555554bull }; //  4.16666666666665929218E-2

    HK_ALIGN16(const hkUint64 hkSse_D_atanP0[2])    = { 0xbfec007fa1f72594ull, 0xbfec007fa1f72594ull }; // -8.750608600031904122785E-1
    HK_ALIGN16(const hkUint64 hkSse_D_atanP1[2])    = { 0xc03028545b6b807aull, 0xc03028545b6b807aull }; // -1.615753718733365076637E1
    HK_ALIGN16(const hkUint64 hkSse_D_atanP2[2])    = { 0xc052c08c36880273ull, 0xc052c08c36880273ull }; // -7.500855792314704667340E1
    HK_ALIGN16(const hkUint64 hkSse_D_atanP3[2])    = { 0xc05eb8bf2d05ba25ull, 0xc05eb8bf2d05ba25ull }; // -1.228866684490136173410E2
    HK_ALIGN16(const hkUint64 hkSse_D_atanP4[2])    = { 0xc0503669fd28ec8eull, 0xc0503669fd28ec8eull }; // -6.485021904942025371773E1

    HK_ALIGN16(const hkUint64 hkSse_D_atanQ1[2])    = { 0x4038dbc45b14603cull, 0x4038dbc45b14603cull }; // 2.485846490142306297962E1
    HK_ALIGN16(const hkUint64 hkSse_D_atanQ2[2])    = { 0x4064a0dd43b8fa25ull, 0x4064a0dd43b8fa25ull }; // 1.650270098316988542046E2
    HK_ALIGN16(const hkUint64 hkSse_D_atanQ3[2])    = { 0x407b0e18d2e2be3bull, 0x407b0e18d2e2be3bull }; // 4.328810604912902668951E2
    HK_ALIGN16(const hkUint64 hkSse_D_atanQ4[2])    = { 0x407e563f13b049eaull, 0x407e563f13b049eaull }; // 4.853903996359136964868E2
    HK_ALIGN16(const hkUint64 hkSse_D_atanQ5[2])    = { 0x4068519efbbd62ecull, 0x4068519efbbd62ecull }; // 1.945506571482613964425E2

    HK_ALIGN16(const hkUint64 hkSse_D_twoPi[2] )    = { 0x401921FB54442D18ull, 0x401921FB54442D18ull }; // 2*pi
    HK_ALIGN16(const hkUint64 hkSse_D_pi[2] )       = { 0x400921FB54442D18ull, 0x400921FB54442D18ull }; // pi
    HK_ALIGN16(const hkUint64 hkSse_D_piOver2[2])   = { 0x3FF921FB54442D18ull, 0x3FF921FB54442D18ull }; // pi / 2
    HK_ALIGN16(const hkUint64 hkSse_D_piOver4[2])   = { 0x3FE921FB54442D18ull, 0x3FE921FB54442D18ull }; // pi / 4
    HK_ALIGN16(const hkUint64 hkSse_D_fourOverPi[2])= { 0x3FF45F306DC9C883ull, 0x3FF45F306DC9C883ull }; // 4 / pi
    HK_ALIGN16(const hkUint64 hkSse_D_DP1[2])       = { 0xbfe921fb40000000ull, 0xbfe921fb40000000ull }; // -7.85398125648498535156E-1
    HK_ALIGN16(const hkUint64 hkSse_D_DP2[2])       = { 0xbe64442d00000000ull, 0xbe64442d00000000ull }; // -3.77489470793079817668E-8
    HK_ALIGN16(const hkUint64 hkSse_D_DP3[2])       = { 0xbce8469898cc5170ull, 0xbce8469898cc5170ull }; // -2.69515142907905952645E-15

    HK_ALIGN16(const hkUint64 hkSse_doubleHalf[2])  = { 0x3FE0000000000000ull, 0x3FE0000000000000ull }; // 0.5
    HK_ALIGN16(const hkUint64 hkSse_doubleOne[2])   = { 0x3ff0000000000000ull, 0x3ff0000000000000ull }; // 1.0

    HK_ALIGN16(const hkUint64 hkSse_D_signMask[2])  = { 0x8000000000000000ull, 0x8000000000000000ull }; // sign bit


    /// Adapted from the book "Methods and Programs for Mathematical Functions",
    /// Stephen Lloyd Baluk Moshier, Prentice-Hall, 1989
    ///
    /// Returns radian angle between -pi/2 and +pi/2 whose tangent is x.
    /// Range reduction is from three intervals into the interval
    /// from zero to 0.66.  The approximation uses a rational
    /// function of degree 4/5 of the form x + x**3 P(x)/Q(x).
    ///
    /// ACCURACY: matches cmath on MSVC
    ///
    ///                      Relative error:
    /// arithmetic   domain        peak       rms
    ///    IEEE      -10,10       1.8e-16     5.0e-17
    ///
    /// PERFORMANCE: About 1.2x faster than ::atan for 2 simultaneous values
#if HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED
    __m128d HK_CALL twoAtan(const __m128d& inX)
    {
        static HK_ALIGN16(const hkUint64 tan3pi8[2])    = { 0x4003504F333F9DE6ull, 0x4003504F333F9DE6ull }; // tan(3*pi/8) = 1+sqrt(2)
        static HK_ALIGN16(const hkUint64 point66[2])    = { 0x3FE51EB851EB851Full, 0x3FE51EB851EB851Full }; // 0.66
        static HK_ALIGN16(const hkUint64 morebits[2])   = { 0x3C91A62633145C07ull, 0x3C91A62633145C07ull }; // 6.123233995736765886130E-17 + hkSse_D_piOver2 = pi/2

        // make argument positive and save the sign
        __m128d sign = _mm_and_pd( *(__m128d*)hkSse_D_signMask, inX );
        __m128d x = _mm_andnot_pd( *(__m128d*)hkSse_D_signMask, inX );

        // range reduction
        __m128d a1 = _mm_cmpgt_pd(x , *(__m128d*)tan3pi8 );
        __m128d a2 = _mm_cmple_pd(x , *(__m128d*)point66 );

        __m128d y = _mm_andnot_pd(a2, *(__m128d*)hkSse_D_piOver4);
        __m128d z1 = _mm_div_pd(_mm_sub_pd(x,*(__m128d*)hkSse_doubleOne), _mm_add_pd(x,*(__m128d*)hkSse_doubleOne));

#if HK_SSE_VERSION >= 0x41
        z1 = _mm_blendv_pd(z1,x,a2);
        y = _mm_blendv_pd(y, *(__m128d*)hkSse_D_piOver2, a1);
        x = _mm_blendv_pd(z1, _mm_sub_pd(_mm_setzero_pd(), _mm_div_pd(*(__m128d*)hkSse_doubleOne, x)), a1);
#else
        z1 = _mm_or_pd(_mm_andnot_pd(a2, z1), _mm_and_pd(a2, x));
        y = _mm_or_pd(_mm_andnot_pd(a1, y), _mm_and_pd(a1, *(__m128d*)hkSse_D_piOver2));
        x = _mm_or_pd(_mm_andnot_pd(a1, z1), _mm_and_pd(a1, _mm_sub_pd(_mm_setzero_pd(), _mm_div_pd(*(__m128d*)hkSse_doubleOne, x))));
#endif

        // eval polynoms (Horner)
        __m128d z = _mm_mul_pd(x, x);

        __m128d polyP = *(__m128d*)hkSse_D_atanP0;
        polyP = _mm_add_pd(_mm_mul_pd(polyP, z),*(__m128d*)hkSse_D_atanP1);
        polyP = _mm_add_pd(_mm_mul_pd(polyP, z),*(__m128d*)hkSse_D_atanP2);
        polyP = _mm_add_pd(_mm_mul_pd(polyP, z),*(__m128d*)hkSse_D_atanP3);
        polyP = _mm_add_pd(_mm_mul_pd(polyP, z),*(__m128d*)hkSse_D_atanP4);

        // Q0 = 1
        __m128d polyQ = _mm_add_pd(z,*(__m128d*)hkSse_D_atanQ1);
        polyQ = _mm_add_pd(_mm_mul_pd(polyQ, z),*(__m128d*)hkSse_D_atanQ2);
        polyQ = _mm_add_pd(_mm_mul_pd(polyQ, z),*(__m128d*)hkSse_D_atanQ3);
        polyQ = _mm_add_pd(_mm_mul_pd(polyQ, z),*(__m128d*)hkSse_D_atanQ4);
        polyQ = _mm_add_pd(_mm_mul_pd(polyQ, z),*(__m128d*)hkSse_D_atanQ5);

        // calc differential
        z = _mm_div_pd(_mm_mul_pd(z, polyP), polyQ);
        z = _mm_add_pd(_mm_mul_pd(x,z),x);

        // apply correction term for truncated constant
        __m128d m = _mm_andnot_pd(a2, _mm_mul_pd(*(__m128d*)morebits,*(__m128d*)hkSse_doubleHalf));
#if HK_SSE_VERSION >= 0x41
        m = _mm_blendv_pd(m, *(__m128d*)morebits, a1);
#else
        m = _mm_or_pd(_mm_andnot_pd(a1, m), _mm_and_pd(a1, *(__m128d*)morebits));
#endif
        z = _mm_add_pd(z, m);

        // add in
        y = _mm_add_pd(y,z);

        // update sign
        y = _mm_xor_pd(y, sign);

        return y;
    }
#endif

#if HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED
    __m128 HK_CALL quadAtan(const __m128& inX)
    {
        static HK_ALIGN16(const hkUint32 a1[4]) = { 0x3B3BD74A, 0x3B3BD74A, 0x3B3BD74A, 0x3B3BD74A }; // 0.0028662257
        static HK_ALIGN16(const hkUint32 a0[4]) = { 0xBBEB7F9E, 0xBBEB7F9E, 0xBBEB7F9E, 0xBBEB7F9E }; // -0.00718684420434729
        static HK_ALIGN16(const hkUint32 b1[4]) = { 0xC0753EB3, 0xC0753EB3, 0xC0753EB3, 0xC0753EB3 }; // -3.83195178589435
        static HK_ALIGN16(const hkUint32 b0[4]) = { 0x40B019CA, 0x40B019CA, 0x40B019CA, 0x40B019CA }; // 5.50314831143450
        static HK_ALIGN16(const hkUint32 d1[4]) = { 0xBF809FF7, 0xBF809FF7, 0xBF809FF7, 0xBF809FF7 }; // -1.00488174720983
        static HK_ALIGN16(const hkUint32 d0[4]) = { 0x40712D99, 0x40712D99, 0x40712D99, 0x40712D99 }; // 3.76840794390466
        static HK_ALIGN16(const hkUint32 e1[4]) = { 0x3FDA2291, 0x3FDA2291, 0x3FDA2291, 0x3FDA2291 }; // 1.70417985002100
        static HK_ALIGN16(const hkUint32 e0[4]) = { 0x400F22D2, 0x400F22D2, 0x400F22D2, 0x400F22D2 }; // 2.23650017675129

        __m128 x = inX;

        // get sign and abs
        __m128 sign = _mm_and_ps(*(__m128*)hkSse_signMask, x);
        __m128 xabs = _mm_andnot_ps(*(__m128*)hkSse_signMask, x);

        // get abs reciprocal
        __m128 inv_x = hkMath::quadReciprocal(x);
        inv_x = _mm_xor_ps(inv_x, *(__m128*)hkSse_signMask);

        // calc which half
        __m128 select = _mm_cmpgt_ps(xabs, *(__m128*)hkSse_floatOne);

        // get signed bias
        sign = _mm_or_ps(sign, *(__m128*)hkSse_piOver2);
        __m128 bias = _mm_and_ps(sign, select);

        // choose solution
#if HK_SSE_VERSION >= 0x41
        x = _mm_blendv_ps(x, inv_x, select);
#else
        x = _mm_or_ps(_mm_andnot_ps(select, x), _mm_and_ps(select, inv_x));
#endif

        // approximate
        __m128  z  = _mm_mul_ps(x,x);
        __m128  zm = _mm_mul_ps(z,x);
        bias = _mm_add_ps(bias,x);

        // eval polynom (Estrin)
        __m128 y  = _mm_add_ps(_mm_mul_ps(z, *(__m128*)a1), *(__m128*)a0);
        __m128 z0 = _mm_add_ps(z, *(__m128*)b1);
        __m128 z1 = _mm_add_ps(z, *(__m128*)d1);
        __m128 z2 = _mm_add_ps(z, *(__m128*)e1);
        __m128 y0 = _mm_add_ps(_mm_mul_ps(z0, z),*(__m128*)b0);
        __m128 y1 = _mm_add_ps(_mm_mul_ps(z1, z),*(__m128*)d0);
        __m128 y2 = _mm_add_ps(_mm_mul_ps(z2, z),*(__m128*)e0);
        y   = _mm_mul_ps(zm,y);
        y0  = _mm_mul_ps(y1,y0);
        y2  = _mm_mul_ps(y,y2);
        y0  = _mm_add_ps(_mm_mul_ps(y0,y2),bias);

        return y0;
    }
#endif

#if HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED

    __m128 HK_CALL quadAtan2(const __m128& y, const __m128& x)
    {
        __m128 xIsZero  = _mm_cmpeq_ps(x, _mm_setzero_ps());                // by value
        __m128 yIsZero  = _mm_cmpeq_ps(y, _mm_setzero_ps());                // by value
        __m128 xIsNeg   = _mm_cmplt_ps(x, _mm_setzero_ps());                // by value
        __m128 ySign    = _mm_and_ps(*(__m128*)hkSse_signMask, y);  // by sign

        // compute common case and zero if not applicable
#if defined(HK_ALLOW_FPU_EXCEPTION_CHECKING)
        // this is slightly less accurate and slower than the full precision call below
        __m128 ratio = _mm_mul_ps(y, quadReciprocal(x));
#else
        __m128 ratio = _mm_div_ps(y, x);
#endif
        __m128 z = quadAtan( ratio );
        z = _mm_andnot_ps(_mm_or_ps(xIsZero,yIsZero), z);

        // special case x<0: depends on y
        __m128 wx = _mm_and_ps(xIsNeg, *(__m128*)hkSse_pi);
        wx = _mm_xor_ps(wx, ySign);

        // special case x=0: depends on y<0 only
        __m128 wy = _mm_xor_ps(*(__m128*)hkSse_piOver2, ySign);
        wy = _mm_andnot_ps(yIsZero, wy);

        // select special case
#if HK_SSE_VERSION >= 0x41
        __m128 w = _mm_blendv_ps(wx,wy,xIsZero);
#else
        __m128 w = _mm_or_ps(_mm_and_ps(xIsZero,wy), _mm_andnot_ps(xIsZero, wx));
#endif

        // assemble common + special cases
        __m128 result = _mm_add_ps(w, z);

        return result;
    }
#endif

#if HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED

    __m128d HK_CALL twoAtan2(const __m128d& y, const __m128d& x)
    {
        __m128d xIsZero  = _mm_cmpeq_pd(x, _mm_setzero_pd());               // by value
        __m128d yIsZero  = _mm_cmpeq_pd(y, _mm_setzero_pd());               // by value
        __m128d xIsNeg   = _mm_cmplt_pd(x, _mm_setzero_pd());               // by value
        __m128d ySign    = _mm_and_pd(*(__m128d*)hkSse_D_signMask, y);  // by sign

        // compute common case and zero if not applicable
        __m128d ratio = _mm_div_pd(y, x);
        __m128d z = twoAtan( ratio );
        z = _mm_andnot_pd(_mm_or_pd(xIsZero,yIsZero), z);

        // special case x<0: depends on y
        __m128d wx = _mm_and_pd(xIsNeg, *(__m128d*)hkSse_D_pi);
        wx = _mm_xor_pd(wx, ySign);

        // special case x=0: depends on y<0 only
        __m128d wy = _mm_xor_pd(*(__m128d*)hkSse_D_piOver2, ySign);
        wy = _mm_andnot_pd(yIsZero, wy);

        // select special case
#if HK_SSE_VERSION >= 0x41
        __m128d w = _mm_blendv_pd(wx,wy,xIsZero);
#else
        __m128d w = _mm_or_pd(_mm_and_pd(xIsZero,wy), _mm_andnot_pd(xIsZero, wx));
#endif

        // assemble common + special cases
        __m128d result = _mm_add_pd(w, z);

        return result;
    }

#endif

#if HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED


    extern __m128d HK_CALL twoAtanApproximation(const __m128d& inX)
    {
        static HK_ALIGN16(const hkUint64 tan3pi8[2])    = { 0x4003504F333F9DE6ull, 0x4003504F333F9DE6ull }; // tan(3*pi/8) = 1+sqrt(2)
        static HK_ALIGN16(const hkUint64 point66[2])    = { 0x3FE51EB851EB851Full, 0x3FE51EB851EB851Full }; // 0.66

        // make argument positive and save the sign
        __m128d sign = _mm_and_pd( *(__m128d*)hkSse_D_signMask, inX );
        __m128d x = _mm_andnot_pd( *(__m128d*)hkSse_D_signMask, inX );

        // range reduction
        __m128d a1 = _mm_cmpgt_pd(x , *(__m128d*)tan3pi8 );
        __m128d a2 = _mm_cmple_pd(x , *(__m128d*)point66 );

        __m128d y = _mm_andnot_pd(a2, *(__m128d*)hkSse_D_piOver4);
        __m128d invZ = _mm_cvtps_pd(_mm_rcp_ps(_mm_add_ps(_mm_cvtpd_ps(x),*(__m128*)hkSse_floatOne)));
        __m128d z1 = _mm_mul_pd(_mm_sub_pd(x,*(__m128d*)hkSse_doubleOne), invZ);

#if HK_SSE_VERSION >= 0x41
        z1 = _mm_blendv_pd(z1,x,a2);
        y = _mm_blendv_pd(y, *(__m128d*)hkSse_D_piOver2, a1);
        x = _mm_blendv_pd(z1, _mm_sub_pd(_mm_setzero_pd(), _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(x)))), a1);
#else
        z1 = _mm_or_pd(_mm_andnot_pd(a2, z1), _mm_and_pd(a2, x));
        y = _mm_or_pd(_mm_andnot_pd(a1, y), _mm_and_pd(a1, *(__m128d*)hkSse_D_piOver2));
        x = _mm_or_pd(_mm_andnot_pd(a1, z1), _mm_and_pd(a1, _mm_sub_pd(_mm_setzero_pd(), _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(x))))));
#endif

        // eval polynoms (Horner)
        __m128d z = _mm_mul_pd(x, x);

        __m128d polyP = *(__m128d*)hkSse_D_atanP1;
        polyP = _mm_add_pd(_mm_mul_pd(polyP, z),*(__m128d*)hkSse_D_atanP2);
        polyP = _mm_add_pd(_mm_mul_pd(polyP, z),*(__m128d*)hkSse_D_atanP3);
        polyP = _mm_add_pd(_mm_mul_pd(polyP, z),*(__m128d*)hkSse_D_atanP4);

        // Q1 = 1
        __m128d polyQ = _mm_add_pd(z,*(__m128d*)hkSse_D_atanQ2);
        polyQ = _mm_add_pd(_mm_mul_pd(polyQ, z),*(__m128d*)hkSse_D_atanQ3);
        polyQ = _mm_add_pd(_mm_mul_pd(polyQ, z),*(__m128d*)hkSse_D_atanQ4);
        polyQ = _mm_add_pd(_mm_mul_pd(polyQ, z),*(__m128d*)hkSse_D_atanQ5);

        // calc differential
        __m128d invPolyQ = _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(polyQ)));
        z = _mm_mul_pd(_mm_mul_pd(z, polyP), invPolyQ);
        z = _mm_add_pd(_mm_mul_pd(x,z),x);

        // add in
        y = _mm_add_pd(y,z);

        // update sign
        y = _mm_xor_pd(y, sign);

        return y;
    }
#endif

#if HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED

/// Adapted from the book "Methods and Programs for Mathematical Functions",
/// Stephen Lloyd Baluk Moshier, Prentice-Hall, 1989
///
/// Range reduction is into intervals of pi/4.  The reduction error is nearly eliminated
/// by extended precision modular arithmetic.
///
/// Two polynomial approximating functions are employed.
/// Between 0 and pi/4 the sine is approximated by
///      x  +  x**3 P(x**2).
/// Between pi/4 and pi/2 the cosine is represented as
///      1  -  x**2 Q(x**2).
///
/// ACCURACY: matches cmath on MSVC
///
///                      Relative error:
/// arithmetic   domain        peak       rms
///    IEEE    -4096,+4096    1.2e-7     3.0e-8
///    IEEE    -8192,+8192    3.0e-7     3.0e-8
/// Partial loss of accuracy begins to occur at x = 2^13
/// = 8192. Results may be meaningless for x >= 2^24
///
/// PERFORMANCE: About 4.3x faster than ::sinf for 4 simultaneous values

__m128 HK_CALL quadSin(const __m128& inX)
{
    // get abs and sign
    __m128 x = _mm_andnot_ps( *(__m128*)hkSse_signMask, inX);
    __m128 sign_bit = _mm_and_ps( *(__m128*)hkSse_signMask, inX);

    // scale by 4/Pi
    __m128 y = _mm_mul_ps(x, *(__m128*)hkSse_fourOverPi);

    __m128i one = *(__m128i*)hkSse_intOne;
    __m128i two = _mm_add_epi32(one,one);
    __m128i four = _mm_add_epi32(two,two);
    // store the integer part of y in emm2
    __m128i emm2 = _mm_cvttps_epi32(y);
    // j=(j+1) & (~1)
    emm2 = _mm_add_epi32(emm2, one);
    emm2 = _mm_andnot_si128(one, emm2);
    y = _mm_cvtepi32_ps(emm2);
    // get the swap sign flag
    __m128i emm0 = _mm_and_si128(emm2, four);
    emm0 = _mm_slli_epi32(emm0, 29);
    __m128 swap_sign_bit = _mm_castsi128_ps(emm0);
    sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

    // The magic pass: Extended precision modular arithmetic
    //   x = ((x - y * DP1) - y * DP2) - y * DP3
    __m128 xmm1 = *(__m128*)hkSse_DP1;
    __m128 xmm2 = *(__m128*)hkSse_DP2;
    __m128 xmm3 = *(__m128*)hkSse_DP3;
    xmm1 = _mm_mul_ps(y, xmm1);
    xmm2 = _mm_mul_ps(y, xmm2);
    xmm3 = _mm_mul_ps(y, xmm3);
    x = _mm_add_ps(x, xmm1);
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);

    // Evaluate the first polynom  (0 <= x <= Pi/4)
    y = *(__m128*)hkSse_cosCoeff0;
    __m128 z = _mm_mul_ps(x,x);

    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(__m128*)hkSse_cosCoeff1);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(__m128*)hkSse_cosCoeff2);
    y = _mm_mul_ps(y, z);
    y = _mm_mul_ps(y, z);
    __m128 tmp = _mm_mul_ps(z, *(__m128*)hkSse_floatHalf);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, *(__m128*)hkSse_floatOne);

    // Evaluate the second polynom  (Pi/4 <= x <= 0)
    __m128 y2 = *(__m128*)hkSse_sinCoeff0;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(__m128*)hkSse_sinCoeff1);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(__m128*)hkSse_sinCoeff2);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    // get the polynom selection mask and
    // select the correct result from the two polynoms
#if HK_SSE_VERSION >= 0x41
    emm2 = _mm_slli_epi32(emm2, 30);
    __m128 poly_mask = _mm_castsi128_ps(emm2);
    y = _mm_blendv_ps(y2,y,poly_mask);
#else
    emm2 = _mm_and_si128(emm2, two);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    __m128 poly_mask = _mm_castsi128_ps(emm2);
    y2 = _mm_and_ps(poly_mask, y2);
    y = _mm_andnot_ps(poly_mask, y);
    y = _mm_or_ps(y,y2);
#endif
    // clamp
    y = _mm_min_ps(y, *(__m128*)hkSse_floatOne);

    // update the sign
    y = _mm_xor_ps(y, sign_bit);

    return y;
}
#endif

#if HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED

/// Double precision version of above
///
///                      Relative error:
/// arithmetic   domain        peak       rms
///    IEEE -1.07e9,+1.07e9  2.1e-16     5.4e-17
/// Partial loss of accuracy begins to occur at x = 2**30
/// = 1.074e9.  The loss is not gradual, but jumps suddenly to
/// about 1 part in 10e7.  Results may be meaningless for
/// x > 2**49 = 5.6e14.
///
/// PERFORMANCE: About 1.3x faster than ::sin for 2 simultaneous values

__m128d HK_CALL twoSin(const __m128d& inX)
{
    // get abs and sign
    __m128d x = _mm_andnot_pd( *(__m128d*)hkSse_D_signMask, inX);
    __m128d sign_bit = _mm_and_pd( *(__m128d*)hkSse_D_signMask, inX);

    // scale by 4/Pi
    __m128d y = _mm_mul_pd(x, *(__m128d*)hkSse_D_fourOverPi);

    __m128i one = *(__m128i*)hkSse_intOne;
    __m128i two = _mm_add_epi32(one,one);
    __m128i four = _mm_add_epi32(two,two);
    // store the integer part of y in emm2
    __m128i emm2 = _mm_cvttpd_epi32(y);
    // j=(j+1) & (~1)
    emm2 = _mm_add_epi32(emm2, one);
    emm2 = _mm_andnot_si128(one, emm2);
    y = _mm_cvtepi32_pd(emm2);
    // get the swap sign flag
    __m128i emm0 = _mm_and_si128(emm2, four);
    emm0 = _mm_slli_epi32(emm0, 29);
    emm0 = _mm_move_epi64(emm0);
    emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,3,0,2));
    __m128d swap_sign_bit = _mm_castsi128_pd(emm0);
    sign_bit = _mm_xor_pd(sign_bit, swap_sign_bit);

    // The magic pass: Extended precision modular arithmetic
    //   x = ((x - y * DP1) - y * DP2) - y * DP3
    __m128d xmm1 = *(__m128d*)hkSse_D_DP1;
    __m128d xmm2 = *(__m128d*)hkSse_D_DP2;
    __m128d xmm3 = *(__m128d*)hkSse_D_DP3;
    xmm1 = _mm_mul_pd(y, xmm1);
    xmm2 = _mm_mul_pd(y, xmm2);
    xmm3 = _mm_mul_pd(y, xmm3);
    x = _mm_add_pd(x, xmm1);
    x = _mm_add_pd(x, xmm2);
    x = _mm_add_pd(x, xmm3);

    __m128d z = _mm_mul_pd(x,x);

    // Evaluate the first polynom  (0 <= x <= Pi/4)
    y = *(__m128d*)hkSse_D_cosCoeff0;
    y = _mm_mul_pd(y, z);
    y = _mm_add_pd(y, *(__m128d*)hkSse_D_cosCoeff1);
    y = _mm_mul_pd(y, z);
    y = _mm_add_pd(y, *(__m128d*)hkSse_D_cosCoeff2);
    y = _mm_mul_pd(y, z);
    y = _mm_add_pd(y, *(__m128d*)hkSse_D_cosCoeff3);
    y = _mm_mul_pd(y, z);
    y = _mm_add_pd(y, *(__m128d*)hkSse_D_cosCoeff4);
    y = _mm_mul_pd(y, z);
    y = _mm_add_pd(y, *(__m128d*)hkSse_D_cosCoeff5);
    y = _mm_mul_pd(y, z);
    y = _mm_mul_pd(y, z);
    __m128d tmp = _mm_mul_pd(z, *(__m128d*)hkSse_doubleHalf);
    y = _mm_sub_pd(y, tmp);
    y = _mm_add_pd(y, *(__m128d*)hkSse_doubleOne);

    // Evaluate the second polynom  (Pi/4 <= x <= 0)
    __m128d y2 = *(__m128d*)hkSse_D_sinCoeff0;
    y2 = _mm_mul_pd(y2, z);
    y2 = _mm_add_pd(y2, *(__m128d*)hkSse_D_sinCoeff1);
    y2 = _mm_mul_pd(y2, z);
    y2 = _mm_add_pd(y2, *(__m128d*)hkSse_D_sinCoeff2);
    y2 = _mm_mul_pd(y2, z);
    y2 = _mm_add_pd(y2, *(__m128d*)hkSse_D_sinCoeff3);
    y2 = _mm_mul_pd(y2, z);
    y2 = _mm_add_pd(y2, *(__m128d*)hkSse_D_sinCoeff4);
    y2 = _mm_mul_pd(y2, z);
    y2 = _mm_add_pd(y2, *(__m128d*)hkSse_D_sinCoeff5);
    y2 = _mm_mul_pd(y2, z);
    y2 = _mm_mul_pd(y2, x);
    y2 = _mm_add_pd(y2, x);

    // get the polynom selection mask and
    // select the correct result from the two polynoms
#if HK_SSE_VERSION >= 0x41
    emm2 = _mm_slli_epi32(emm2, 30);
    emm2 = _mm_shuffle_epi32(emm2, _MM_SHUFFLE(1,1,0,0));
    __m128d poly_mask = _mm_castsi128_pd(emm2);
    y = _mm_blendv_pd(y2,y,poly_mask);
#else
    emm2 = _mm_and_si128(emm2, two);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    emm2 = _mm_shuffle_epi32(emm2, _MM_SHUFFLE(1,1,0,0));
    __m128d poly_mask = _mm_castsi128_pd(emm2);
    y2 = _mm_and_pd(poly_mask, y2);
    y = _mm_andnot_pd(poly_mask, y);
    y = _mm_or_pd(y,y2);
#endif
    // clamp
    y = _mm_min_pd(y, *(__m128d*)hkSse_doubleOne);

    // update the sign
    y = _mm_xor_pd(y, sign_bit);

    return y;
}
#endif

#if HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED

/// Adapted from the book "Methods and Programs for Mathematical Functions",
/// Stephen Lloyd Baluk Moshier, Prentice-Hall, 1989
///
/// Range reduction is into intervals of pi/4.  The reduction error is nearly eliminated
/// by extended precision modular arithmetic.
///
/// Two polynomial approximating functions are employed.
/// Between 0 and pi/4 the sine is approximated by
///      1  -  x**2 Q(x**2).
/// Between pi/4 and pi/2 the cosine is represented as
///      x  +  x**3 P(x**2).
///
/// ACCURACY: matches cmath on MSVC
///
///                      Relative error:
/// arithmetic   domain        peak       rms
///    IEEE    -8192,+8192    3.0e-7     3.0e-8
/// Partial loss of accuracy begins to occur at x = 2^13
/// = 8192. Results may be meaningless for x >= 2^24
///
/// PERFORMANCE: About 4.3x faster than ::cosf for 4 simultaneous values
#endif

#if HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED

__m128 HK_CALL quadCos(const __m128& inX)
{
    // get abs and sign
    __m128 x = _mm_andnot_ps( *(__m128*)hkSse_signMask, inX);

    // scale by 4/Pi
    __m128 y = _mm_mul_ps(x, *(__m128*)hkSse_fourOverPi);

    __m128i one = *(__m128i*)hkSse_intOne;
    __m128i two = _mm_add_epi32(one,one);
    __m128i four = _mm_add_epi32(two,two);
    // store the integer part of y in emm2
    __m128i emm2 = _mm_cvttps_epi32(y);
    // j=(j+1) & (~1)
    emm2 = _mm_add_epi32(emm2, one);
    emm2 = _mm_andnot_si128(one, emm2);
    y = _mm_cvtepi32_ps(emm2);
    emm2 = _mm_sub_epi32(emm2, two);
    // get the swap sign flag
    __m128i emm0 = _mm_andnot_si128(emm2, four);
    emm0 = _mm_slli_epi32(emm0, 29);
    __m128 sign_bit = _mm_castsi128_ps(emm0);

    // The magic pass: Extended precision modular arithmetic
    //   x = ((x - y * DP1) - y * DP2) - y * DP3
    __m128 xmm1 = *(__m128*)hkSse_DP1;
    __m128 xmm2 = *(__m128*)hkSse_DP2;
    __m128 xmm3 = *(__m128*)hkSse_DP3;
    xmm1 = _mm_mul_ps(y, xmm1);
    xmm2 = _mm_mul_ps(y, xmm2);
    xmm3 = _mm_mul_ps(y, xmm3);
    x = _mm_add_ps(x, xmm1);
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);

    // Evaluate the first polynom  (0 <= x <= Pi/4)
    y = *(__m128*)hkSse_cosCoeff0;
    __m128 z = _mm_mul_ps(x,x);

    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(__m128*)hkSse_cosCoeff1);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(__m128*)hkSse_cosCoeff2);
    y = _mm_mul_ps(y, z);
    y = _mm_mul_ps(y, z);
    __m128 tmp = _mm_mul_ps(z, *(__m128*)hkSse_floatHalf);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, *(__m128*)hkSse_floatOne);

    // Evaluate the second polynom  (Pi/4 <= x <= 0)
    __m128 y2 = *(__m128*)hkSse_sinCoeff0;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(__m128*)hkSse_sinCoeff1);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(__m128*)hkSse_sinCoeff2);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    // get the polynom selection mask and
    // select the correct result from the two polynoms
#if HK_SSE_VERSION >= 0x41
    emm2 = _mm_slli_epi32(emm2, 30);
    __m128 poly_mask = _mm_castsi128_ps(emm2);
    y = _mm_blendv_ps(y2,y,poly_mask);
#else
    emm2 = _mm_and_si128(emm2, two);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    __m128 poly_mask = _mm_castsi128_ps(emm2);
    y2 = _mm_and_ps(poly_mask, y2);
    y = _mm_andnot_ps(poly_mask, y);
    y = _mm_or_ps(y,y2);
#endif
    // clamp
    y = _mm_min_ps(y, *(__m128*)hkSse_floatOne);

    // update the sign
    y = _mm_xor_ps(y, sign_bit);

    return y;
}
#endif

#if HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED

/// Double precision version of above
///
///                      Relative error:
/// arithmetic   domain        peak       rms
///    IEEE -1.07e9,+1.07e9  2.1e-16     5.4e-17
/// Partial loss of accuracy begins to occur at x = 2**30
/// = 1.074e9.  The loss is not gradual, but jumps suddenly to
/// about 1 part in 10e7.  Results may be meaningless for
/// x > 2**49 = 5.6e14.
///
/// PERFORMANCE: About 1.3x faster than ::cos for 2 simultaneous values

__m128d HK_CALL twoCos(const __m128d& inX)
{
    // get abs and sign
    __m128d x = _mm_andnot_pd( *(__m128d*)hkSse_D_signMask, inX);

    // scale by 4/Pi
    __m128d y = _mm_mul_pd(x, *(__m128d*)hkSse_D_fourOverPi);

    __m128i one = *(__m128i*)hkSse_intOne;
    __m128i two = _mm_add_epi32(one,one);
    __m128i four = _mm_add_epi32(two,two);
    // store the integer part of y in emm2
    __m128i emm2 = _mm_cvttpd_epi32(y);
    // j=(j+1) & (~1)
    emm2 = _mm_add_epi32(emm2, one);
    emm2 = _mm_andnot_si128(one, emm2);
    y = _mm_cvtepi32_pd(emm2);
    emm2 = _mm_sub_epi32(emm2, two);
    // get the swap sign flag
    __m128i emm0 = _mm_andnot_si128(emm2, four);
    emm0 = _mm_slli_epi32(emm0, 29);
    emm0 = _mm_move_epi64(emm0);
    emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,3,0,2));
    __m128d sign_bit = _mm_castsi128_pd(emm0);

    // The magic pass: Extended precision modular arithmetic
    //   x = ((x - y * DP1) - y * DP2) - y * DP3
    __m128d xmm1 = *(__m128d*)hkSse_D_DP1;
    __m128d xmm2 = *(__m128d*)hkSse_D_DP2;
    __m128d xmm3 = *(__m128d*)hkSse_D_DP3;
    xmm1 = _mm_mul_pd(y, xmm1);
    xmm2 = _mm_mul_pd(y, xmm2);
    xmm3 = _mm_mul_pd(y, xmm3);
    x = _mm_add_pd(x, xmm1);
    x = _mm_add_pd(x, xmm2);
    x = _mm_add_pd(x, xmm3);

    __m128d z = _mm_mul_pd(x,x);

    // Evaluate the first polynom  (0 <= x <= Pi/4)
    y = *(__m128d*)hkSse_D_cosCoeff0;
    y = _mm_mul_pd(y, z);
    y = _mm_add_pd(y, *(__m128d*)hkSse_D_cosCoeff1);
    y = _mm_mul_pd(y, z);
    y = _mm_add_pd(y, *(__m128d*)hkSse_D_cosCoeff2);
    y = _mm_mul_pd(y, z);
    y = _mm_add_pd(y, *(__m128d*)hkSse_D_cosCoeff3);
    y = _mm_mul_pd(y, z);
    y = _mm_add_pd(y, *(__m128d*)hkSse_D_cosCoeff4);
    y = _mm_mul_pd(y, z);
    y = _mm_add_pd(y, *(__m128d*)hkSse_D_cosCoeff5);
    y = _mm_mul_pd(y, z);
    y = _mm_mul_pd(y, z);
    __m128d tmp = _mm_mul_pd(z, *(__m128d*)hkSse_doubleHalf);
    y = _mm_sub_pd(y, tmp);
    y = _mm_add_pd(y, *(__m128d*)hkSse_doubleOne);

    // Evaluate the second polynom  (Pi/4 <= x <= 0)
    __m128d y2 = *(__m128d*)hkSse_D_sinCoeff0;
    y2 = _mm_mul_pd(y2, z);
    y2 = _mm_add_pd(y2, *(__m128d*)hkSse_D_sinCoeff1);
    y2 = _mm_mul_pd(y2, z);
    y2 = _mm_add_pd(y2, *(__m128d*)hkSse_D_sinCoeff2);
    y2 = _mm_mul_pd(y2, z);
    y2 = _mm_add_pd(y2, *(__m128d*)hkSse_D_sinCoeff3);
    y2 = _mm_mul_pd(y2, z);
    y2 = _mm_add_pd(y2, *(__m128d*)hkSse_D_sinCoeff4);
    y2 = _mm_mul_pd(y2, z);
    y2 = _mm_add_pd(y2, *(__m128d*)hkSse_D_sinCoeff5);
    y2 = _mm_mul_pd(y2, z);
    y2 = _mm_mul_pd(y2, x);
    y2 = _mm_add_pd(y2, x);

    // get the polynom selection mask and
    // select the correct result from the two polynoms
#if HK_SSE_VERSION >= 0x41
    emm2 = _mm_slli_epi32(emm2, 30);
    emm2 = _mm_shuffle_epi32(emm2, _MM_SHUFFLE(1,1,0,0));
    __m128d poly_mask = _mm_castsi128_pd(emm2);
    y = _mm_blendv_pd(y2,y,poly_mask);
#else
    emm2 = _mm_and_si128(emm2, two);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    emm2 = _mm_shuffle_epi32(emm2, _MM_SHUFFLE(1,1,0,0));
    __m128d poly_mask = _mm_castsi128_pd(emm2);
    y2 = _mm_and_pd(poly_mask, y2);
    y = _mm_andnot_pd(poly_mask, y);
    y = _mm_or_pd(y,y2);
#endif
    // clamp
    y = _mm_min_pd(y, *(__m128d*)hkSse_doubleOne);

    // update the sign
    y = _mm_xor_pd(y, sign_bit);

    return y;
}
#endif

#if HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED

/// Combined sin and cos calculation derived from the algorithms above.
///
/// PERFORMANCE: About 5.9x faster than ::sin and ::cos for 2 simultaneous values

void HK_CALL twoSinCos(const __m128d& inX, __m128d& s, __m128d& c)
{
    // get abs and sign
    __m128d x = _mm_andnot_pd( *(__m128d*)hkSse_D_signMask, inX);
    __m128d sign_bit_sin = _mm_and_pd( *(__m128d*)hkSse_D_signMask, inX);

    // scale by 4/Pi
    __m128d y = _mm_mul_pd(x, *(__m128d*)hkSse_D_fourOverPi);

    __m128i one = *(__m128i*)hkSse_intOne;
    __m128i two = _mm_add_epi32(one,one);
    __m128i four = _mm_add_epi32(two,two);
    // store the integer part of y in emm2
    __m128i emm2 = _mm_cvttpd_epi32(y);
    // j=(j+1) & (~1)
    emm2 = _mm_add_epi32(emm2, one);
    emm2 = _mm_andnot_si128(one, emm2);
    y = _mm_cvtepi32_pd(emm2);

    __m128i emm4 = emm2;
    emm4 = _mm_sub_epi32(emm4, two);
    emm4 = _mm_andnot_si128(emm4, four);
    emm4 = _mm_slli_epi32(emm4, 29);
    emm4 = _mm_move_epi64(emm4);
    emm4 = _mm_shuffle_epi32(emm4, _MM_SHUFFLE(1,3,0,2));
    __m128d sign_bit_cos = _mm_castsi128_pd(emm4);

    // get the swap sign flag for the sine
    __m128i emm0 = _mm_and_si128(emm2, four);
    emm0 = _mm_slli_epi32(emm0, 29);
    emm0 = _mm_move_epi64(emm0);
    emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,3,0,2));
    sign_bit_sin = _mm_xor_pd(sign_bit_sin, _mm_castsi128_pd(emm0));

    // The magic pass: Extended precision modular arithmetic
    //   x = ((x - y * DP1) - y * DP2) - y * DP3
    __m128d xmm1 = *(__m128d*)hkSse_D_DP1;
    __m128d xmm2 = *(__m128d*)hkSse_D_DP2;
    __m128d xmm3 = *(__m128d*)hkSse_D_DP3;
    xmm1 = _mm_mul_pd(y, xmm1);
    xmm2 = _mm_mul_pd(y, xmm2);
    xmm3 = _mm_mul_pd(y, xmm3);
    x = _mm_add_pd(x, xmm1);
    x = _mm_add_pd(x, xmm2);
    x = _mm_add_pd(x, xmm3);

    __m128d z = _mm_mul_pd(x,x);

    // Evaluate the first polynom  (0 <= x <= Pi/4)
    y = *(__m128d*)hkSse_D_cosCoeff0;
    y = _mm_mul_pd(y, z);
    y = _mm_add_pd(y, *(__m128d*)hkSse_D_cosCoeff1);
    y = _mm_mul_pd(y, z);
    y = _mm_add_pd(y, *(__m128d*)hkSse_D_cosCoeff2);
    y = _mm_mul_pd(y, z);
    y = _mm_add_pd(y, *(__m128d*)hkSse_D_cosCoeff3);
    y = _mm_mul_pd(y, z);
    y = _mm_add_pd(y, *(__m128d*)hkSse_D_cosCoeff4);
    y = _mm_mul_pd(y, z);
    y = _mm_add_pd(y, *(__m128d*)hkSse_D_cosCoeff5);
    y = _mm_mul_pd(y, z);
    y = _mm_mul_pd(y, z);
    __m128d tmp = _mm_mul_pd(z, *(__m128d*)hkSse_doubleHalf);
    y = _mm_sub_pd(y, tmp);
    y = _mm_add_pd(y, *(__m128d*)hkSse_doubleOne);

    // Evaluate the second polynom  (Pi/4 <= x <= 0)
    __m128d y2 = *(__m128d*)hkSse_D_sinCoeff0;
    y2 = _mm_mul_pd(y2, z);
    y2 = _mm_add_pd(y2, *(__m128d*)hkSse_D_sinCoeff1);
    y2 = _mm_mul_pd(y2, z);
    y2 = _mm_add_pd(y2, *(__m128d*)hkSse_D_sinCoeff2);
    y2 = _mm_mul_pd(y2, z);
    y2 = _mm_add_pd(y2, *(__m128d*)hkSse_D_sinCoeff3);
    y2 = _mm_mul_pd(y2, z);
    y2 = _mm_add_pd(y2, *(__m128d*)hkSse_D_sinCoeff4);
    y2 = _mm_mul_pd(y2, z);
    y2 = _mm_add_pd(y2, *(__m128d*)hkSse_D_sinCoeff5);
    y2 = _mm_mul_pd(y2, z);
    y2 = _mm_mul_pd(y2, x);
    y2 = _mm_add_pd(y2, x);

    // get the polynom selection mask for the sine
    emm2 = _mm_and_si128(emm2, two);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    emm2 = _mm_shuffle_epi32(emm2, _MM_SHUFFLE(1,1,0,0));
    __m128d poly_mask = _mm_castsi128_pd(emm2);

    // select the correct result from the two polynoms
    __m128d ysin2 = _mm_and_pd(poly_mask, y2);
    __m128d ysin1 = _mm_andnot_pd(poly_mask, y);
    y2 = _mm_sub_pd(y2,ysin2);
    y = _mm_sub_pd(y, ysin1);

    xmm1 = _mm_add_pd(ysin1,ysin2);
    xmm2 = _mm_add_pd(y,y2);

    // clamp
    xmm1 = _mm_min_pd(xmm1, *(__m128d*)hkSse_doubleOne);
    xmm2 = _mm_min_pd(xmm2, *(__m128d*)hkSse_doubleOne);

    // update the signs
    s = _mm_xor_pd(xmm1, sign_bit_sin);
    c = _mm_xor_pd(xmm2, sign_bit_cos);
}
#endif

#if HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED

/// Combined sin and cos calculation derived from the algorithms above.
///
/// PERFORMANCE: About 6.1x faster than ::sinf and ::cosf for 4 simultaneous values

void HK_CALL quadSinCos(const __m128& inX, __m128& s, __m128& c)
{
    // get abs and sign
    __m128 x = _mm_andnot_ps( *(__m128*)hkSse_signMask, inX);
    // extract the sign bit (upper one)
    __m128 sign_bit_sin = _mm_and_ps( *(__m128*)hkSse_signMask, inX);

    // scale by 4/Pi
    __m128 y = _mm_mul_ps(x, *(__m128*)hkSse_fourOverPi);

    __m128i one = *(__m128i*)hkSse_intOne;
    __m128i two = _mm_add_epi32(one,one);
    __m128i four = _mm_add_epi32(two,two);
    // store the integer part of y in emm2
    __m128i emm2 = _mm_cvttps_epi32(y);
    // j=(j+1) & (~1)
    emm2 = _mm_add_epi32(emm2, one);
    emm2 = _mm_andnot_si128(one, emm2);
    y = _mm_cvtepi32_ps(emm2);

    __m128i emm4 = emm2;
    emm4 = _mm_sub_epi32(emm4, two);
    emm4 = _mm_andnot_si128(emm4, four);
    emm4 = _mm_slli_epi32(emm4, 29);
    __m128 sign_bit_cos = _mm_castsi128_ps(emm4);

    // get the swap sign flag for the sine
    __m128i emm0 = _mm_and_si128(emm2, four);
    emm0 = _mm_slli_epi32(emm0, 29);
    sign_bit_sin = _mm_xor_ps(sign_bit_sin, _mm_castsi128_ps(emm0));

    // The magic pass: Extended precision modular arithmetic
    //   x = ((x - y * DP1) - y * DP2) - y * DP3
    __m128 xmm1 = *(__m128*)hkSse_DP1;
    __m128 xmm2 = *(__m128*)hkSse_DP2;
    __m128 xmm3 = *(__m128*)hkSse_DP3;
    xmm1 = _mm_mul_ps(y, xmm1);
    xmm2 = _mm_mul_ps(y, xmm2);
    xmm3 = _mm_mul_ps(y, xmm3);
    x = _mm_add_ps(x, xmm1);
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);

    // Evaluate the first polynom  (0 <= x <= Pi/4)
    y = *(__m128*)hkSse_cosCoeff0;
    __m128 z = _mm_mul_ps(x,x);

    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(__m128*)hkSse_cosCoeff1);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(__m128*)hkSse_cosCoeff2);
    y = _mm_mul_ps(y, z);
    y = _mm_mul_ps(y, z);
    __m128 tmp = _mm_mul_ps(z, *(__m128*)hkSse_floatHalf);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, *(__m128*)hkSse_floatOne);

    // Evaluate the second polynom  (Pi/4 <= x <= 0)
    __m128 y2 = *(__m128*)hkSse_sinCoeff0;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(__m128*)hkSse_sinCoeff1);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(__m128*)hkSse_sinCoeff2);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    // get the polynom selection mask for the sine
    emm2 = _mm_and_si128(emm2, two);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    __m128 poly_mask = _mm_castsi128_ps(emm2);

    // select the correct result from the two polynoms
    __m128 ysin2 = _mm_and_ps(poly_mask, y2);
    __m128 ysin1 = _mm_andnot_ps(poly_mask, y);
    y2 = _mm_sub_ps(y2,ysin2);
    y = _mm_sub_ps(y, ysin1);

    xmm1 = _mm_add_ps(ysin1,ysin2);
    xmm2 = _mm_add_ps(y,y2);

    // clamp
    xmm1 = _mm_min_ps(xmm1, *(__m128*)hkSse_floatOne);
    xmm2 = _mm_min_ps(xmm2, *(__m128*)hkSse_floatOne);

    // update the signs
    s = _mm_xor_ps(xmm1, sign_bit_sin);
    c = _mm_xor_ps(xmm2, sign_bit_cos);
}

#endif

#if HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED

/// Adapted from the book "Methods and Programs for Mathematical Functions",
/// Stephen Lloyd Baluk Moshier, Prentice-Hall, 1989
///
/// A polynomial of the form x + x**3 P(x**2)
/// is used for |x| in the interval [0, 0.5].  If |x| > 0.5 it is
/// transformed by the identity
///
///    asin(x) = pi/2 - 2 asin( sqrt( (1-x)/2 ) ).
///
/// ACCURACY: matches cmath on MSVC
///
/// Important: unlike the cmath function, this one clamps the input to [-1,1]
///
///                      Relative error:
/// arithmetic   domain      peak         rms
///    IEEE     -1, 1       2.5e-7       5.0e-8
///
/// PERFORMANCE: About 8x faster than ::asinf for 4 simultaneous values

__m128 HK_CALL quadAsin(const __m128& xx)
{
    HK_ALIGN16(static const hkUint32 linearLimit[4])  = { 0x38D1B717, 0x38D1B717, 0x38D1B717, 0x38D1B717 }; // 1e-4
    HK_ALIGN16(static const hkUint32 coeff4[4])       = { 0x3D2CB352, 0x3D2CB352, 0x3D2CB352, 0x3D2CB352 }; // 4.2163199048E-2
    HK_ALIGN16(static const hkUint32 coeff3[4])       = { 0x3CC617E3, 0x3CC617E3, 0x3CC617E3, 0x3CC617E3 }; // 2.4181311049E-2
    HK_ALIGN16(static const hkUint32 coeff2[4])       = { 0x3D3A3EC7, 0x3D3A3EC7, 0x3D3A3EC7, 0x3D3A3EC7 }; // 4.5470025998E-2
    HK_ALIGN16(static const hkUint32 coeff1[4])       = { 0x3D9980F6, 0x3D9980F6, 0x3D9980F6, 0x3D9980F6 }; // 7.4953002686E-2
    HK_ALIGN16(static const hkUint32 coeff0[4])       = { 0x3E2AAAE4, 0x3E2AAAE4, 0x3E2AAAE4, 0x3E2AAAE4 }; // 1.6666752422E-1

    __m128 a = _mm_andnot_ps( *(__m128*)hkSse_signMask, xx); a = _mm_min_ps(*(__m128*)hkSse_floatOne,a);
    __m128 sign = _mm_and_ps(*(__m128*)hkSse_signMask, xx);

    // linear approximation
    __m128 lessLimit = _mm_cmplt_ps(a,*(__m128*)linearLimit);

    // a > 0.5
    __m128 zGT = _mm_mul_ps(*(__m128*)hkSse_floatHalf, _mm_sub_ps(*(__m128*)hkSse_floatOne, a));
    __m128 xGT = _mm_sqrt_ps(zGT);

    // a <= 0.5
    __m128 xLE = a;
    __m128 zLE = _mm_mul_ps(xLE,xLE);

    // select polynom
    __m128 rangeMask = _mm_cmpgt_ps(a, *(__m128*)hkSse_floatHalf);
#if HK_SSE_VERSION >= 0x41
    __m128 x = _mm_blendv_ps(xLE,xGT,rangeMask);
    __m128 z = _mm_blendv_ps(zLE,zGT,rangeMask);
#else
    __m128 x = _mm_or_ps(_mm_andnot_ps(rangeMask, xLE),_mm_and_ps(rangeMask, xGT));
    __m128 z = _mm_or_ps(_mm_andnot_ps(rangeMask, zLE),_mm_and_ps(rangeMask, zGT));
#endif

    // zz = (((((c4*z)+c3)*z+c2)*z+c1)*z+c0)*z*x+x
    __m128 zz = _mm_mul_ps(*(__m128*)coeff4, z);
    zz = _mm_add_ps(*(__m128*)coeff3, zz);
    zz = _mm_mul_ps(zz,z);
    zz = _mm_add_ps(*(__m128*)coeff2, zz);
    zz = _mm_mul_ps(zz,z);
    zz = _mm_add_ps(*(__m128*)coeff1, zz);
    zz = _mm_mul_ps(zz,z);
    zz = _mm_add_ps(*(__m128*)coeff0, zz);
    zz = _mm_mul_ps(zz,z);
    zz = _mm_mul_ps(zz,x);
    zz = _mm_add_ps(zz,x);

    // transform
    __m128 zzGT = _mm_add_ps(zz,zz);
    zzGT = _mm_sub_ps(*(__m128*)hkSse_piOver2,zzGT);
#if HK_SSE_VERSION >= 0x41
    zz = _mm_blendv_ps(zz,zzGT,rangeMask);
#else
    zz = _mm_or_ps(_mm_andnot_ps(rangeMask, zz),_mm_and_ps(rangeMask, zzGT));
#endif

    // select linear approximation
#if HK_SSE_VERSION >= 0x41
    zz = _mm_blendv_ps(zz,a,lessLimit);
#else
    zz = _mm_or_ps(_mm_andnot_ps(lessLimit, zz),_mm_and_ps(lessLimit, a));
#endif

    // update the sign
    zz = _mm_xor_ps(zz, sign);

    return zz;
}
#endif

#if HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED

/// Double precision version of above except that it uses
/// a rational function of the form x + x**3 P(x**2)/Q(x**2)
///
///                      Relative error:
/// arithmetic   domain        peak       rms
///    IEEE     -1, 1       1.9e-16     5.4e-17
///
/// PERFORMANCE: About 2x faster than ::asin for 2 simultaneous values

__m128d HK_CALL twoAsin(const __m128d& xx)
{
    HK_ALIGN16(static const hkUint64 linearLimit[2])  = { 0x3E45798EE2308C3Aull, 0x3E45798EE2308C3Aull }; // 1e-8
    HK_ALIGN16(static const hkUint64 limit[2])        = { 0x3FE4000000000000ull, 0x3FE4000000000000ull }; // 0.625
    HK_ALIGN16(static const hkUint64 morebits[2])     = { 0x3C91A62633145C07ull, 0x3C91A62633145C07ull }; // 6.123233995736765886130E-17
    HK_ALIGN16(static const hkUint64 R0[2])           = { 0x3f684fc3988e9f08ull, 0x3f684fc3988e9f08ull }; //  2.967721961301243206100E-3
    HK_ALIGN16(static const hkUint64 R1[2])           = { 0xbfe2079259f9290full, 0xbfe2079259f9290full }; // -5.634242780008963776856E-1
    HK_ALIGN16(static const hkUint64 R2[2])           = { 0x401bdff5baf33e6aull, 0x401bdff5baf33e6aull }; //  6.968710824104713396794E0
    HK_ALIGN16(static const hkUint64 R3[2])           = { 0xc03991aaac01ab68ull, 0xc03991aaac01ab68ull }; // -2.556901049652824852289E1
    HK_ALIGN16(static const hkUint64 R4[2])           = { 0x403c896240f3081dull, 0x403c896240f3081dull }; //  2.853665548261061424989E1
    HK_ALIGN16(static const hkUint64 S1[2])           = { 0xc035f2a2b6bf5d8cull, 0xc035f2a2b6bf5d8cull }; // -2.194779531642920639778E1
    HK_ALIGN16(static const hkUint64 S2[2])           = { 0x40626219af6a7f42ull, 0x40626219af6a7f42ull }; //  1.470656354026814941758E2
    HK_ALIGN16(static const hkUint64 S3[2])           = { 0xc077fe08959063eeull, 0xc077fe08959063eeull }; // -3.838770957603691357202E2
    HK_ALIGN16(static const hkUint64 S4[2])           = { 0x40756709b0b644beull, 0x40756709b0b644beull }; //  3.424398657913078477438E2
    HK_ALIGN16(static const hkUint64 P0[2])           = { 0x3f716b9b0bd48ad3ull, 0x3f716b9b0bd48ad3ull }; //  4.253011369004428248960E-3
    HK_ALIGN16(static const hkUint64 P1[2])           = { 0xbfe34341333e5c16ull, 0xbfe34341333e5c16ull }; // -6.019598008014123785661E-1
    HK_ALIGN16(static const hkUint64 P2[2])           = { 0x4015c74b178a2dd9ull, 0x4015c74b178a2dd9ull }; //  5.444622390564711410273E0
    HK_ALIGN16(static const hkUint64 P3[2])           = { 0xc0304331de27907bull, 0xc0304331de27907bull }; // -1.626247967210700244449E1
    HK_ALIGN16(static const hkUint64 P4[2])           = { 0x40339007da779259ull, 0x40339007da779259ull }; //  1.956261983317594739197E1
    HK_ALIGN16(static const hkUint64 P5[2])           = { 0xc020656c06ceafd5ull, 0xc020656c06ceafd5ull }; // -8.198089802484824371615E0
    HK_ALIGN16(static const hkUint64 Q1[2])           = { 0xc02d7b590b5e0eabull, 0xc02d7b590b5e0eabull }; // -1.474091372988853791896E1
    HK_ALIGN16(static const hkUint64 Q2[2])           = { 0x40519fc025fe9054ull, 0x40519fc025fe9054ull }; //  7.049610280856842141659E1
    HK_ALIGN16(static const hkUint64 Q3[2])           = { 0xc06265bb6d3576d7ull, 0xc06265bb6d3576d7ull }; // -1.471791292232726029859E2
    HK_ALIGN16(static const hkUint64 Q4[2])           = { 0x4061705684ffbf9dull, 0x4061705684ffbf9dull }; //  1.395105614657485689735E2
    HK_ALIGN16(static const hkUint64 Q5[2])           = { 0xc04898220a3607acull, 0xc04898220a3607acull }; // -4.918853881490881290097E1


    __m128d a = _mm_andnot_pd( *(__m128d*)hkSse_D_signMask, xx); a = _mm_min_pd(*(__m128d*)hkSse_doubleOne,a);
    __m128d sign = _mm_and_pd(*(__m128d*)hkSse_D_signMask, xx);

    // linear approximation
    __m128d lessLimit = _mm_cmplt_pd(a,*(__m128d*)linearLimit);

    __m128d selectGT = _mm_cmpgt_pd(a, *(__m128d*)limit);

    // a > 0.625
    __m128d zGT;
    {
        __m128d zz = _mm_sub_pd(*(__m128d*)hkSse_doubleOne, a);

        __m128d polR = _mm_mul_pd(*(__m128d*)R0, zz);
        polR = _mm_add_pd(polR, *(__m128d*)R1);
        polR = _mm_mul_pd(polR, zz);
        polR = _mm_add_pd(polR, *(__m128d*)R2);
        polR = _mm_mul_pd(polR, zz);
        polR = _mm_add_pd(polR, *(__m128d*)R3);
        polR = _mm_mul_pd(polR, zz);
        polR = _mm_add_pd(polR, *(__m128d*)R4);
        polR = _mm_mul_pd(polR, zz);

        __m128d polS = zz; // coeff0 = 1
        polS = _mm_add_pd(polS, *(__m128d*)S1);
        polS = _mm_mul_pd(polS, zz);
        polS = _mm_add_pd(polS, *(__m128d*)S2);
        polS = _mm_mul_pd(polS, zz);
        polS = _mm_add_pd(polS, *(__m128d*)S3);
        polS = _mm_mul_pd(polS, zz);
        polS = _mm_add_pd(polS, *(__m128d*)S4);

        __m128d p = _mm_div_pd(polR,polS);
        zz = _mm_sqrt_pd(_mm_add_pd(zz,zz));
        zGT = _mm_sub_pd(*(__m128d*)hkSse_D_piOver4, zz);
        zz = _mm_sub_pd(_mm_mul_pd(zz,p),*(__m128d*)morebits);
        zGT = _mm_sub_pd(zGT,zz);
        zGT = _mm_add_pd(zGT, *(__m128d*)hkSse_D_piOver4);
    }

    // a <= 0.625
    __m128d zLT;
    {
        __m128d zz = _mm_mul_pd(a,a);

        __m128d polP = _mm_mul_pd(*(__m128d*)P0, zz);
        polP = _mm_add_pd(polP, *(__m128d*)P1);
        polP = _mm_mul_pd(polP, zz);
        polP = _mm_add_pd(polP, *(__m128d*)P2);
        polP = _mm_mul_pd(polP, zz);
        polP = _mm_add_pd(polP, *(__m128d*)P3);
        polP = _mm_mul_pd(polP, zz);
        polP = _mm_add_pd(polP, *(__m128d*)P4);
        polP = _mm_mul_pd(polP, zz);
        polP = _mm_add_pd(polP, *(__m128d*)P5);
        polP = _mm_mul_pd(polP, zz);

        __m128d polQ = zz; // coeff0 = 1
        polQ = _mm_add_pd(polQ, *(__m128d*)Q1);
        polQ = _mm_mul_pd(polQ, zz);
        polQ = _mm_add_pd(polQ, *(__m128d*)Q2);
        polQ = _mm_mul_pd(polQ, zz);
        polQ = _mm_add_pd(polQ, *(__m128d*)Q3);
        polQ = _mm_mul_pd(polQ, zz);
        polQ = _mm_add_pd(polQ, *(__m128d*)Q4);
        polQ = _mm_mul_pd(polQ, zz);
        polQ = _mm_add_pd(polQ, *(__m128d*)Q5);

        zLT = _mm_div_pd(polP,polQ);
        zLT = _mm_mul_pd(a,zLT);
        zLT = _mm_add_pd(a,zLT);
    }

    __m128d z;

    // select case and linear approximation
#if HK_SSE_VERSION >= 0x41
    z = _mm_blendv_pd(zLT,zGT,selectGT);
    z = _mm_blendv_pd(z,a,lessLimit);
#else
    z = _mm_or_pd(_mm_andnot_pd(selectGT, zLT),_mm_and_pd(selectGT, zGT));
    z = _mm_or_pd(_mm_andnot_pd(lessLimit, z),_mm_and_pd(lessLimit, a));
#endif

    // update the sign
    z = _mm_xor_pd(z, sign);

    return z;
}
#endif

#if HK_CONFIG_SIMD == HK_CONFIG_SIMD_ENABLED

namespace
{
    HK_INLINE static void HK_CALL quadLogHelper(__m128& x)
    {
        static HK_ALIGN16(const unsigned int minNormalizedPosNumber[4]) = {  0x00800000,  0x00800000,  0x00800000,  0x00800000 }; // 1.1754943508222875E-38 (smallest normalized positive number)
        static HK_ALIGN16(const          int invMantissaMask[4])        = { ~0x7f800000, ~0x7f800000, ~0x7f800000, ~0x7f800000 };
        static HK_ALIGN16(const unsigned int int0x7f[4])                = {  0x0000007f,  0x0000007f,  0x0000007f,  0x0000007f };
        static HK_ALIGN16(const unsigned int logP0[4])                  = {  0xBF4A21EF,  0xBF4A21EF,  0xBF4A21EF,  0xBF4A21EF }; // -7.89580278884799154124e-1
        static HK_ALIGN16(const unsigned int logQ0[4])                  = {  0xC20EB06A,  0xC20EB06A,  0xC20EB06A,  0xC20EB06A }; // -3.56722798256324312549e1
        static HK_ALIGN16(const unsigned int logP1[4])                  = {  0x418317E4,  0x418317E4,  0x418317E4,  0x418317E4 }; // 1.63866645699558079767e1
        static HK_ALIGN16(const unsigned int logQ1[4])                  = {  0x439C0C01,  0x439C0C01,  0x439C0C01,  0x439C0C01 }; // 3.12093766372244180303e2
        static HK_ALIGN16(const unsigned int logP2[4])                  = {  0xC2804831,  0xC2804831,  0xC2804831,  0xC2804831 }; // -6.41409952958715622951e1
        static HK_ALIGN16(const unsigned int logQ2[4])                  = {  0xC4406C49,  0xC4406C49,  0xC4406C49,  0xC4406C49 }; // -7.69691943550460008604e2
        static HK_ALIGN16(const unsigned int logC0[4])                  = {  0x3F317218,  0x3F317218,  0x3F317218,  0x3F317218 }; // 0.693147180559945

        // cut off denormalized stuff
        x = _mm_max_ps(x, *(__m128*)minNormalizedPosNumber);

        __m128i emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);

        // keep only the fractional part
        x = _mm_or_ps(_mm_and_ps(x, *(__m128*)invMantissaMask), *(__m128*)hkSse_floatOne);

        x = _mm_mul_ps(_mm_sub_ps(x, *(__m128*)hkSse_floatOne), _mm_rcp_ps(_mm_add_ps(x, *(__m128*)hkSse_floatOne)));
        x = _mm_add_ps(x, x);
        __m128 bs = x;

        x = _mm_mul_ps(x, x);

        __m128 x4 = _mm_mul_ps(*(__m128*)logP0, x);
        __m128 x6 = _mm_mul_ps(*(__m128*)logQ0, x);

        x4 = _mm_add_ps(x4, *(__m128*)logP1);
        x6 = _mm_add_ps(x6, *(__m128*)logQ1);

        x4 = _mm_mul_ps(x4, x);
        x6 = _mm_mul_ps(x6, x);

        x4 = _mm_add_ps(x4, *(__m128*)logP2);
        x6 = _mm_add_ps(x6, *(__m128*)logQ2);

        x = _mm_mul_ps(x, x4);
        x6 = _mm_rcp_ps(x6);

        x = _mm_mul_ps(x, x6);
        x = _mm_mul_ps(x, bs);

        emm0 = _mm_sub_epi32(emm0, *(__m128i*)int0x7f);
        __m128 x1 = _mm_mul_ps(_mm_cvtepi32_ps(emm0), *(__m128*)logC0);

        x = _mm_add_ps(x, bs);
        x = _mm_add_ps(x, x1);
    }
}

/// Average absolute error 0.000046
/// Max absolute error 169.777725 for r=1
/// Max absolute error 0.001070 elsewhere
/// About 3x faster than ::logf for 4 simultaneous values
hkQuadFloat32 HK_CALL quadLog(const hkQuadFloat32 &r)
{
    __m128 x = r;
    quadLogHelper(x);
    return x;
}

hkQuadDouble64 HK_CALL quadLog(const hkQuadDouble64 &r)
{
#if HK_SSE_VERSION >= 0x50
    __m128 x = _mm256_cvtpd_ps(r);
#else
    const __m128 xy = _mm_cvtpd_ps(r.xy);
    const __m128 zw = _mm_cvtpd_ps(r.zw);
    __m128 x = _mm_shuffle_ps(xy,zw,_MM_SHUFFLE(1,0,1,0));
#endif
    quadLogHelper(x);

#       if HK_SSE_VERSION >= 0x50
    return _mm256_cvtps_pd(x);
#else
    hkQuadDouble64 result;
    result.xy = _mm_cvtps_pd(x);
    result.zw = _mm_cvtps_pd(_mm_shuffle_ps(x,x,_MM_SHUFFLE(1,0,3,2)));
    return result;
#endif
}
#endif
}   // namespace

/*
 * Havok SDK - Base file, BUILD(#20180110)
 * 
 * Confidential Information of Microsoft Corporation.
 * Not for disclosure or distribution without Microsoft's prior written
 * consent.  This software contains code, techniques and know-how which
 * is confidential and proprietary to Microsoft.  Product and Trade Secret
 * source code contains trade secrets of Microsoft.  Havok Software (C)
 * Copyright 1999-2018 Microsoft Corporation.
 * All Rights Reserved. Use of this software is subject to the
 * terms of an end user license agreement.
 * 
 * The Havok Logo, and the Havok buzzsaw logo are trademarks of Microsoft.
 * Title, ownership rights, and intellectual property rights in the Havok
 * software remain in Microsoft and/or its suppliers.
 * 
 * Use of this software for evaluation purposes is subject to and
 * indicates acceptance of the End User licence Agreement for this
 * product. A copy of the license is included with this software and is
 * also available from Havok Support.
 * 
 */
