// TKBMS v1.0 -----------------------------------------------------
//
// PLATFORM   : WIN32 X64 LINUX32 LINUX64 METRO_X86 METRO_X64 APOLLO_X86 DURANGO PS4 UWP_X86 UWP_X64 ANDROID_X86 MAC
// PRODUCT   : COMMON
// VISIBILITY   : PUBLIC
//
// ------------------------------------------------------TKBMS v1.0


namespace hkMath
{
    // import the constants
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint32 hkSse_signMask[4])   ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint32 hkSse_pi[4])         ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint32 hkSse_twoPi[4])      ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint32 hkSse_piOver2[4])    ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint32 hkSse_fourOverPi[4]) ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint32 hkSse_DP1[4])        ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint32 hkSse_DP2[4])        ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint32 hkSse_DP3[4])        ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint32 hkSse_floatOne[4])   ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint32 hkSse_floatTwo[4])   ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint32 hkSse_floatThree[4]) ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint32 hkSse_floatHalf[4])  ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint32 hkSse_intOne[4])     ;

    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint64 hkSse_D_twoPi[2] )   ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint64 hkSse_D_pi[2] )      ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint64 hkSse_D_piOver2[2])  ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint64 hkSse_D_piOver4[2])  ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint64 hkSse_D_fourOverPi[2]);
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint64 hkSse_D_DP1[2])      ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint64 hkSse_D_DP2[2])      ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint64 hkSse_D_DP3[2])      ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint64 hkSse_doubleHalf[2]) ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint64 hkSse_doubleOne[2])  ;
    extern HK_EXPORT_COMMON HK_ALIGN16(const hkUint64 hkSse_D_signMask[2]) ;

    HK_INLINE hkQuadFloat32 HK_CALL convertToQuadFloat( const hkQuadDouble64& a )
    {
#if HK_SSE_VERSION >= 0x50
        return _mm256_cvtpd_ps(a);
#else
        const __m128 xy = _mm_cvtpd_ps(a.xy);
        const __m128 zw = _mm_cvtpd_ps(a.zw);
        const __m128 xyzw = _mm_shuffle_ps(xy,zw,_MM_SHUFFLE(1,0,1,0));

        hkQuadFloat32 floatA = xyzw;
        return floatA;
#endif
    }

    HK_INLINE hkQuadDouble64 HK_CALL convertToQuadDouble( const hkQuadFloat32& a )
    {
#if HK_SSE_VERSION >= 0x50
        return _mm256_cvtps_pd(a);
#else
        hkQuadDouble64 doubleA;
        doubleA.xy = _mm_cvtps_pd(a);
        doubleA.zw = _mm_cvtps_pd(_mm_shuffle_ps(a,a,_MM_SHUFFLE(1,0,3,2)));
        return doubleA;
#endif
    }

    HK_INLINE hkSingleFloat32 convertToFloat(const hkSingleDouble64& a)
    {
        return _mm_cvtpd_ps(a);
    }

    HK_INLINE hkSingleDouble64 convertToDouble(const hkSingleFloat32& a)
    {
        return _mm_cvtps_pd(a);
    }


    //
    // need to implement
    //
#   define HK_MATH_quadReciprocal
    HK_INLINE hkQuadFloat32 HK_CALL quadReciprocal( const hkQuadFloat32& r )
    {
        const hkQuadFloat32 rb = _mm_rcp_ps(r);
        // One Newton-Raphson refinement iteration
        const hkQuadFloat32 rbr = _mm_mul_ps(r, rb);
        const hkQuadFloat32 d = _mm_sub_ps(*(__m128*)hkSse_floatTwo, rbr);
        return _mm_mul_ps(rb, d);
    }

    HK_INLINE hkQuadDouble64 HK_CALL quadReciprocal( const hkQuadDouble64& r )
    {
#if HK_SSE_VERSION >= 0x50
        return _mm256_div_pd(_mm256_set1_pd(1.0), r);
#else
        const __m128 xy = _mm_cvtpd_ps(r.xy);
        const __m128 zw = _mm_cvtpd_ps(r.zw);
        const __m128 xyzw = _mm_shuffle_ps(xy,zw,_MM_SHUFFLE(1,0,1,0));
        const __m128 rb = _mm_rcp_ps(xyzw);
        // One Newton-Raphson refinement iteration
        const __m128 rbr = _mm_mul_ps(xyzw, rb);
        const __m128 d = _mm_sub_ps(*(__m128*)hkSse_floatTwo, rbr);
        const __m128 re = _mm_mul_ps(rb, d);
        hkQuadDouble64 result;
        result.xy = _mm_cvtps_pd(re);
        result.zw = _mm_cvtps_pd(_mm_shuffle_ps(re,re,_MM_SHUFFLE(1,0,3,2)));
        return result;
#endif
    }

#   define HK_MATH_quadReciprocalSquareRoot
    HK_INLINE hkQuadFloat32 HK_CALL quadReciprocalSquareRoot( const hkQuadFloat32& r )
    {
        const hkQuadFloat32 e = _mm_rsqrt_ps(r);
        // One Newton-Raphson refinement iteration
        const hkQuadFloat32 he = _mm_mul_ps(*(__m128*)hkSse_floatHalf,e);
        const hkQuadFloat32 ree = _mm_mul_ps(_mm_mul_ps(r,e),e);
        return _mm_mul_ps(he, _mm_sub_ps(*(__m128*)hkSse_floatThree, ree) );
    }

    HK_INLINE hkQuadDouble64 HK_CALL quadReciprocalSquareRoot( const hkQuadDouble64& r )
    {
#if HK_SSE_VERSION >= 0x50
        return _mm256_div_pd(_mm256_set1_pd(1.0), _mm256_sqrt_pd(r));
#else
        const __m128 xy = _mm_cvtpd_ps(r.xy);
        const __m128 zw = _mm_cvtpd_ps(r.zw);
        const __m128 xyzw = _mm_shuffle_ps(xy,zw,_MM_SHUFFLE(1,0,1,0));
        const __m128 e = _mm_rsqrt_ps(xyzw);
        // One Newton-Raphson refinement iteration
        const __m128 he = _mm_mul_ps(*(__m128*)hkSse_floatHalf,e);
        const __m128 ree = _mm_mul_ps(_mm_mul_ps(xyzw,e),e);
        const __m128 re = _mm_mul_ps(he, _mm_sub_ps(*(__m128*)hkSse_floatThree, ree) );
        hkQuadDouble64 result;
        result.xy = _mm_cvtps_pd(re);
        result.zw = _mm_cvtps_pd(_mm_shuffle_ps(re,re,_MM_SHUFFLE(1,0,3,2)));
        return result;
#endif
    }

#   define HK_MATH_quadReciprocalTwoIter
    HK_INLINE hkQuadFloat32 HK_CALL quadReciprocalTwoIter( const hkQuadFloat32& r )
    {
        const __m128 two = *(__m128*)hkSse_floatTwo;
        const hkQuadFloat32 rb = _mm_rcp_ps(r);
        //One round of Newton-Raphson refinement
        const hkQuadFloat32 rbr = _mm_mul_ps(r, rb);
        const hkQuadFloat32 d = _mm_sub_ps(two, rbr);
        const hkQuadFloat32 rb1 = _mm_mul_ps(rb, d);
        //Another round
        const hkQuadFloat32 rbr2 = _mm_mul_ps(r, rb1);
        const hkQuadFloat32 d2 = _mm_sub_ps(two, rbr2);
        return _mm_mul_ps(rb1, d2);
    }

    HK_INLINE hkQuadDouble64 HK_CALL quadReciprocalTwoIter( const hkQuadDouble64& r )
    {
#if HK_SSE_VERSION >= 0x50
        return _mm256_div_pd(_mm256_set1_pd(1.0), r);
#else
        const __m128 two = *(__m128*)hkSse_floatTwo;
        const __m128 xy = _mm_cvtpd_ps(r.xy);
        const __m128 zw = _mm_cvtpd_ps(r.zw);
        const __m128 xyzw = _mm_shuffle_ps(xy,zw,_MM_SHUFFLE(1,0,1,0));
        const __m128 rb = _mm_rcp_ps(xyzw);
        //One round of Newton-Raphson refinement
        const __m128 rbr = _mm_mul_ps(xyzw, rb);
        const __m128 d = _mm_sub_ps(two, rbr);
        const __m128 rb1 = _mm_mul_ps(rb, d);
        //Another round
        const __m128 rbr2 = _mm_mul_ps(xyzw, rb1);
        const __m128 d2 = _mm_sub_ps(two, rbr2);
        const __m128 re = _mm_mul_ps(rb1, d2);
        hkQuadDouble64 result;
        result.xy = _mm_cvtps_pd(re);
        result.zw = _mm_cvtps_pd(_mm_shuffle_ps(re,re,_MM_SHUFFLE(1,0,3,2)));
        return result;
#endif
    }

#   define HK_MATH_quadReciprocalSquareRootTwoIter
    HK_INLINE hkQuadFloat32 HK_CALL quadReciprocalSquareRootTwoIter( const hkQuadFloat32& r )
    {
        const __m128 half = *(__m128*)hkSse_floatHalf;
        const __m128 three = *(__m128*)hkSse_floatThree;
        const hkQuadFloat32 e = _mm_rsqrt_ps(r);
        // One Newton-Raphson refinement iteration
        const hkQuadFloat32 he = _mm_mul_ps(half,e);
        const hkQuadFloat32 ree = _mm_mul_ps(_mm_mul_ps(r,e),e);
        const hkQuadFloat32 e1 = _mm_mul_ps(he, _mm_sub_ps(three, ree) );
        //Another round
        const hkQuadFloat32 he2 = _mm_mul_ps(half,e1);
        const hkQuadFloat32 ree2 = _mm_mul_ps(_mm_mul_ps(r,e1),e1);
        return _mm_mul_ps(he2, _mm_sub_ps(three, ree2) );
    }

    HK_INLINE hkQuadDouble64 HK_CALL quadReciprocalSquareRootTwoIter( const hkQuadDouble64& r )
    {
#if HK_SSE_VERSION >= 0x50
        return _mm256_div_pd(_mm256_set1_pd(1.0), _mm256_sqrt_pd(r));
#else
        const __m128 half = *(__m128*)hkSse_floatHalf;
        const __m128 three = *(__m128*)hkSse_floatThree;
        const __m128 xy = _mm_cvtpd_ps(r.xy);
        const __m128 zw = _mm_cvtpd_ps(r.zw);
        const __m128 xyzw = _mm_shuffle_ps(xy,zw,_MM_SHUFFLE(1,0,1,0));
        const __m128 e = _mm_rsqrt_ps(xyzw);
        // One Newton-Raphson refinement iteration
        const __m128 he = _mm_mul_ps(half,e);
        const __m128 ree = _mm_mul_ps(_mm_mul_ps(xyzw,e),e);
        const __m128 e1 = _mm_mul_ps(he, _mm_sub_ps(three, ree) );
        //Another round
        const __m128 he2 = _mm_mul_ps(half,e1);
        const __m128 ree2 = _mm_mul_ps(_mm_mul_ps(xyzw,e1),e1);
        const __m128 re = _mm_mul_ps(he2, _mm_sub_ps(three, ree2) );
        hkQuadDouble64 result;
        result.xy = _mm_cvtps_pd(re);
        result.zw = _mm_cvtps_pd(_mm_shuffle_ps(re,re,_MM_SHUFFLE(1,0,3,2)));
        return result;
#endif
    }

    //
    // SSE optimized implementations
    //

#   define HK_MATH_sqrt
    HK_INLINE hkFloat32 HK_CALL sqrt(const hkFloat32 r)
    {
        const __m128 s = _mm_sqrt_ss(_mm_load_ss(&r));
        return _mm_cvtss_f32(s);
    }
    HK_INLINE hkDouble64 HK_CALL sqrt(const hkDouble64 r)
    {
        const __m128d s = _mm_sqrt_sd(_mm_setzero_pd(),_mm_load_sd(&r));
        return _mm_cvtsd_f64(s);
    }

#   define HK_MATH_sqrtInverse
    HK_INLINE hkFloat32 HK_CALL sqrtInverse(const hkFloat32 r)
    {
        const __m128 s = _mm_sqrt_ss(_mm_load_ss(&r));
        return _mm_cvtss_f32(_mm_div_ss(*(__m128*)hkSse_floatOne,s));
    }
    HK_INLINE hkDouble64 HK_CALL sqrtInverse(const hkDouble64 r)
    {
        const __m128d s = _mm_sqrt_sd(_mm_setzero_pd(),_mm_load_sd(&r));
        const __m128d q = _mm_div_sd(*(__m128d*)hkSse_doubleOne,s);
        return _mm_cvtsd_f64(q);
    }


    /// Average absolute error 0.000046
    /// Max absolute error 169.777725 for r=1
    /// Max absolute error 0.001070 elsewhere
    /// About 3x faster than ::logf for 4 simultaneous values
    extern HK_EXPORT_COMMON hkQuadFloat32 HK_CALL quadLog(const hkQuadFloat32 &r);

    extern HK_EXPORT_COMMON hkQuadDouble64 HK_CALL quadLog(const hkQuadDouble64 &r);

    /// Returns radian angle between -pi/2 and +pi/2 whose tangent is x.
    /// Range reduction is from three intervals into the interval
    /// from zero to 0.66.  The approximation uses a rational
    /// function of degree 4/5 of the form x + x**3 P(x)/Q(x).
    ///
    /// ACCURACY: matches cmath on MSVC
    ///
    ///                      Relative error:
    /// arithmetic   domain        peak       rms
    ///    IEEE      -10,10       1.8e-16     5.0e-17
    ///
    /// PERFORMANCE: About 1.2x faster than ::atan for 2 simultaneous values
    extern HK_EXPORT_COMMON __m128d HK_CALL twoAtan(const __m128d& inX);

    /// Approximate version of above using reduced accuracy for reciprocals
    /// and polynoms of degree only 3/4.
    ///
    /// Average absolute error 0.000004
    /// Max absolute error 0.000221
    /// About 1.8x faster than ::atan for 2 simultaneous values
    extern HK_EXPORT_COMMON __m128d HK_CALL twoAtanApproximation(const __m128d& inX);

    /// Returns radian angle between -pi/2 and +pi/2 whose tangent is x.
    /// Range reduction is to two intervals which are approximated by
    /// a polynomial. Unlike the method in the book, the polynomials are
    /// factored into quadratic evaluation which is faster than a Horner scheme.
    ///
    /// ACCURACY: matches cmath on MSVC
    ///
    ///                      Relative error:
    /// arithmetic   domain        peak       rms
    ///    IEEE      -10,10       1.9e-7     4.1e-8
    ///
    /// PERFORMANCE: About 5.5x faster than ::atanf for 4 simultaneous values
    extern HK_EXPORT_COMMON __m128 HK_CALL quadAtan(const __m128& inX);

    /// Returns radian angle whose tangent is y/x.
    /// This function adheres to the ANSI standard, ie. range -PI < z=atan2(y,x) <= +PI
    /// with argument sequence (y,x) to match the cmath interface.
    /// The implementation forwards to the quadAtan(z) function.
    ///
    /// ACCURACY: matches cmath on MSVC
    ///
    ///                      Relative error:
    /// arithmetic   domain        peak       rms
    ///    IEEE      -10,10       1.9e-7     4.1e-8
    ///
    /// PERFORMANCE: About 6.4x faster than ::atanf for 4 simultaneous values
    extern HK_EXPORT_COMMON __m128 HK_CALL quadAtan2(const __m128& y, const __m128& x);

    /// Double precision version of above
    ///
    ///                      Relative error:
    /// arithmetic   domain        peak       rms
    ///    IEEE     -10,10        2.5e-16    6.9e-17
    ///
    /// PERFORMANCE: About 2.1x faster than ::atan2 for 2 simultaneous values
    extern HK_EXPORT_COMMON __m128d HK_CALL twoAtan2(const __m128d& y, const __m128d& x);

#   define HK_MATH_prefetch128
    HK_INLINE void HK_CALL prefetch128(_Maybevalid_ const void* p)
    {
        _mm_prefetch( (const char*)p, _MM_HINT_NTA );
    }

#   define HK_MATH_forcePrefetch
    template<int SIZE>
    HK_INLINE void HK_CALL forcePrefetch(_Maybevalid_ const void* p )
    {
        const char* q = (const char*)p;
        _mm_prefetch( q, _MM_HINT_NTA );
        if ( SIZE > 64){  _mm_prefetch( q + 64, _MM_HINT_NTA ); }
        if ( SIZE > 128){ _mm_prefetch( q + 128, _MM_HINT_NTA ); }
        if ( SIZE > 192){ _mm_prefetch( q + 192, _MM_HINT_NTA ); }
    }

#   define HK_MATH_hkToIntFast
    HK_ALWAYS_INLINE int HK_CALL hkToIntFast( const hkDouble64 r )
    {
        return _mm_cvttsd_si32(_mm_load_sd(&r));
    }
    HK_ALWAYS_INLINE int HK_CALL hkToIntFast( const hkFloat32 r )
    {
        return _mm_cvtt_ss2si(_mm_load_ss(&r));
    }

#   define HK_MATH_hkFloatToInt
    HK_INLINE int HK_CALL hkFloatToInt(const hkFloat32 r)
    {
        return hkMath::hkToIntFast(r);
    }
    HK_INLINE int HK_CALL hkFloatToInt(const hkDouble64 r)
    {
        return hkMath::hkToIntFast(r);
    }

    HK_ALWAYS_INLINE hkQuadUint HK_CALL quadF2I(const hkQuadFloat32& f)
    {
        return _mm_cvttps_epi32(f);
    }

    HK_INLINE hkQuadUint HK_CALL quadF2I(const hkQuadDouble64& f)
    {
#if HK_SSE_VERSION >= 0x50
        return _mm256_cvttpd_epi32(f);
#else
        __m128i xy = _mm_cvttpd_epi32(f.xy);
        __m128i zw = _mm_cvttpd_epi32(f.zw);
        return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(xy),_mm_castsi128_ps(zw),_MM_SHUFFLE(1,0,1,0)));
#endif
    }

    HK_ALWAYS_INLINE __m128 HK_CALL quadFabs(const __m128 &v)
    {
        return _mm_andnot_ps( *(__m128*)hkSse_signMask, v);
    }

    HK_ALWAYS_INLINE __m128d HK_CALL twoFabs(const __m128d &v)
    {
        return _mm_castsi128_pd( _mm_srli_epi64( _mm_slli_epi64( _mm_castpd_si128(v), 1 ), 1 ) );
    }

#   define HK_MATH_fabs
    HK_INLINE hkFloat32 HK_CALL fabs(const hkFloat32 r)
    {
        const __m128 v = _mm_load_ss(&r);
        const __m128 abs = quadFabs(v);
        return _mm_cvtss_f32(abs);
    }
    HK_INLINE hkDouble64 HK_CALL fabs(const hkDouble64 r)
    {
        const __m128d v = _mm_load_sd(&r);
        const __m128d abs = twoFabs(v);
        return _mm_cvtsd_f64(abs);
    }

#if HK_SSE_VERSION >= 0x31
#define HK_MATH_abs
    template<typename T>
    HK_INLINE T HK_CALL abs(T t) { return t < T(0) ? -t : t; }

    template<>
    HK_INLINE hkFloat32 HK_CALL abs(hkFloat32 r) { return fabs(r); }

    template<>
    HK_INLINE hkDouble64 HK_CALL abs(hkDouble64 r) { return fabs(r); }

    template<>
    HK_INLINE hkInt32 HK_CALL abs(hkInt32 r)
    {
        __m128i rr = _mm_cvtsi32_si128(r);
        __m128i rabs = _mm_abs_epi32(rr);
        return _mm_cvtsi128_si32(rabs);
    }

#endif

#   define HK_MATH_equal
    HK_INLINE bool HK_CALL equal(hkFloat32 x, hkFloat32 y)
    {
        const __m128 xx = _mm_load_ss(&x);
        const __m128 yy = _mm_load_ss(&y);
        const __m128 abs = quadFabs(_mm_sub_ss(xx,yy));
        const __m128 tol = _mm_set1_ps(1e-5f);
        const int mask = _mm_ucomile_ss(abs, tol);
        return bool(mask);
    }
    HK_INLINE bool HK_CALL equal(hkFloat32 x, hkFloat32 y, hkFloat32 tolerance2)
    {
        const __m128 xx = _mm_load_ss(&x);
        const __m128 yy = _mm_load_ss(&y);
        const __m128 abs = quadFabs(_mm_sub_ss(xx,yy));
        const int mask = _mm_ucomile_ss(abs, _mm_load_ss(&tolerance2));
        return bool(mask);
    }
    HK_INLINE bool HK_CALL equal(hkDouble64 x, hkFloat32 y)
    {
        const __m128d xxd = _mm_load_sd(&x);
        const __m128 xx = _mm_cvtsd_ss(_mm_setzero_ps(), xxd);
        const __m128 yy = _mm_load_ss(&y);
        const __m128 abs = quadFabs(_mm_sub_ss(xx,yy));
        const __m128 tol = _mm_set1_ps(1e-5f);
        const int mask = _mm_ucomile_ss(abs, tol);
        return bool(mask);
    }
    HK_INLINE bool HK_CALL equal(hkDouble64 x, hkFloat32 y, hkFloat32 tolerance2)
    {
        const __m128d xxd = _mm_load_sd(&x);
        const __m128 xx = _mm_cvtsd_ss(_mm_setzero_ps(), xxd);
        const __m128 yy = _mm_load_ss(&y);
        const __m128 abs = quadFabs(_mm_sub_ss(xx,yy));
        const int mask = _mm_ucomile_ss(abs, _mm_load_ss(&tolerance2));
        return bool(mask);
    }
    HK_INLINE bool HK_CALL equal(hkFloat32 x, hkDouble64 y)
    {
        const __m128d yyd = _mm_load_sd(&y);
        const __m128 yy = _mm_cvtsd_ss(_mm_setzero_ps(), yyd);
        const __m128 xx = _mm_load_ss(&x);
        const __m128 abs = quadFabs(_mm_sub_ss(xx,yy));
        const __m128 tol = _mm_set1_ps(1e-5f);
        const int mask = _mm_ucomile_ss(abs, tol);
        return bool(mask);
    }
    HK_INLINE bool HK_CALL equal(hkFloat32 x, hkDouble64 y, hkFloat32 tolerance2)
    {
        const __m128d yyd = _mm_load_sd(&y);
        const __m128 yy = _mm_cvtsd_ss(_mm_setzero_ps(), yyd);
        const __m128 xx = _mm_load_ss(&x);
        const __m128 abs = quadFabs(_mm_sub_ss(xx,yy));
        const int mask = _mm_ucomile_ss(abs, _mm_load_ss(&tolerance2));
        return bool(mask);
    }
    HK_INLINE bool HK_CALL equal(hkDouble64 x, hkDouble64 y)
    {
        const __m128d xx = _mm_load_sd(&x);
        const __m128d yy = _mm_load_sd(&y);
        const __m128d abs = twoFabs(_mm_sub_sd(xx,yy));
        const __m128d tol = _mm_set1_pd(1e-5);
        const int mask = _mm_ucomile_sd(abs, tol);
        return bool(mask);
    }
    HK_INLINE bool HK_CALL equal(hkDouble64 x, hkDouble64 y, hkDouble64 tolerance2)
    {
        const __m128d xx = _mm_load_sd(&x);
        const __m128d yy = _mm_load_sd(&y);
        const __m128d abs = twoFabs(_mm_sub_sd(xx,yy));
        const int mask = _mm_ucomile_sd(abs, _mm_load_sd(&tolerance2));
        return bool(mask);
    }
    HK_INLINE bool HK_CALL equal(hkDouble64 x, hkDouble64 y, hkFloat32 tolerance2)
    {
        const __m128d xx = _mm_load_sd(&x);
        const __m128d yy = _mm_load_sd(&y);
        const __m128d abs = twoFabs(_mm_sub_sd(xx,yy));
        const __m128 tolf = _mm_load_ss(&tolerance2);
        const __m128d told = _mm_cvtss_sd(_mm_setzero_pd(), tolf);
        const int mask = _mm_ucomile_sd(abs, told);
        return bool(mask);
    }


#   define HK_MATH_max2
    template <typename T1,typename T2>
    _Post_satisfies_(return >= x && return >= T1(y))
    HK_INLINE T1 HK_CALL max2( T1 x, T2 y)
    {
        return x > (T1)y ? x : (T1)y;
    }
    template <>
    _Post_satisfies_(return >= x && return >= y)
    HK_INLINE hkFloat32 HK_CALL max2<hkFloat32, hkFloat32>( hkFloat32 x, hkFloat32 y)
    {
        const __m128 v = _mm_max_ss(_mm_load_ss(&x),_mm_load_ss(&y));
        return _mm_cvtss_f32(v);
    }
    template <>
    _Post_satisfies_(return >= x && return >= y)
    HK_INLINE hkDouble64 HK_CALL max2<hkDouble64, hkDouble64>( hkDouble64 x, hkDouble64 y)
    {
        const __m128d v = _mm_max_sd(_mm_load_sd(&x),_mm_load_sd(&y));
        return _mm_cvtsd_f64(v);
    }
#if HK_SSE_VERSION >= 0x41
    template <>
    _Post_satisfies_(return >= x && return >= y)
    HK_INLINE hkUint32 HK_CALL max2<hkUint32>( hkUint32 x, hkUint32 y)
    {
        const __m128i v = _mm_max_epu32(_mm_cvtsi32_si128(x),_mm_cvtsi32_si128(y));
        return (hkUint32)_mm_cvtsi128_si32(v);
    }
    template <>
    _Post_satisfies_(return >= x && return >= y)
    HK_INLINE hkInt32 HK_CALL max2<hkInt32>( hkInt32 x, hkInt32 y)
    {
        const __m128i v = _mm_max_epi32(_mm_cvtsi32_si128(x),_mm_cvtsi32_si128(y));
        return _mm_cvtsi128_si32(v);
    }
#endif

#   define HK_MATH_min2
    template <typename T1,typename T2>
    _Post_satisfies_(return <= x && return <= T1(y))
    HK_INLINE T1 HK_CALL min2( T1 x, T2 y)
    {
        return x < (T1)y ? x : (T1)y;
    }

    template <>
    _Post_satisfies_(return <= x && return <= y)
    HK_INLINE hkFloat32 HK_CALL min2<hkFloat32, hkFloat32>( hkFloat32 x, hkFloat32 y)
    {
        const __m128 v = _mm_min_ss(_mm_load_ss(&x),_mm_load_ss(&y));
        return _mm_cvtss_f32(v);
    }

    template <>
    _Post_satisfies_(return <= x && return <= y)
    HK_INLINE hkDouble64 HK_CALL min2<hkDouble64, hkDouble64>( hkDouble64 x, hkDouble64 y)
    {
        const __m128d v = _mm_min_sd(_mm_load_sd(&x),_mm_load_sd(&y));
        return _mm_cvtsd_f64(v);
    }
#if HK_SSE_VERSION >= 0x41
    template <>
    _Post_satisfies_(return <= x && return <= y)
    HK_INLINE hkUint32 HK_CALL min2<hkUint32>( hkUint32 x, hkUint32 y)
    {
        const __m128i v = _mm_min_epu32(_mm_cvtsi32_si128(x),_mm_cvtsi32_si128(y));
        return (hkUint32)_mm_cvtsi128_si32(v);
    }
    template <>
    _Post_satisfies_(return <= x && return <= y)
    HK_INLINE hkInt32 HK_CALL min2<hkInt32>( hkInt32 x, hkInt32 y)
    {
        const __m128i v = _mm_min_epi32(_mm_cvtsi32_si128(x),_mm_cvtsi32_si128(y));
        return _mm_cvtsi128_si32(v);
    }
#endif

#   define HK_MATH_clamp
    template <typename T1, typename T2, typename T3>
    _Ret_range_(mi, ma)
    HK_INLINE T1 HK_CALL clamp( T1 x, T2 mi, T3 ma)
    {
        if ( x < (T1)mi ) return (T1)mi;
        if ( x > (T1)ma ) return (T1)ma;
        return x;
    }
    template <>
    _Post_satisfies_(return >= mi && return <= ma)
    HK_INLINE hkFloat32 HK_CALL clamp<hkFloat32, hkFloat32, hkFloat32>( hkFloat32 x, hkFloat32 mi, hkFloat32 ma)
    {
        const __m128 lo = _mm_max_ss(_mm_load_ss(&x),_mm_load_ss(&mi));
        const __m128 hi = _mm_min_ss(             lo,_mm_load_ss(&ma));
        return _mm_cvtss_f32(hi);
    }
    template <>
    _Post_satisfies_(return >= mi && return <= ma)
    HK_INLINE hkDouble64 HK_CALL clamp<hkDouble64, hkDouble64, hkDouble64>( hkDouble64 x, hkDouble64 mi, hkDouble64 ma)
    {
        const __m128d lo = _mm_max_sd(_mm_load_sd(&x),_mm_load_sd(&mi));
        const __m128d hi = _mm_min_sd(             lo,_mm_load_sd(&ma));
        return _mm_cvtsd_f64(hi);
    }
#if HK_SSE_VERSION >= 0x41
    template <>
    _Post_satisfies_(return >= mi && return <= ma)
    HK_INLINE hkInt32 HK_CALL clamp<hkInt32>( hkInt32 x, hkInt32 mi, hkInt32 ma)
    {
        const __m128i lo = _mm_max_epi32(_mm_cvtsi32_si128(x),_mm_cvtsi32_si128(mi));
        const __m128i hi = _mm_min_epi32(                  lo,_mm_cvtsi32_si128(ma));
        return _mm_cvtsi128_si32(hi);
    }
    template <>
    _Post_satisfies_(return >= mi && return <= ma)
    HK_INLINE hkUint32 HK_CALL clamp<hkUint32>( hkUint32 x, hkUint32 mi, hkUint32 ma)
    {
        const __m128i lo = _mm_max_epu32(_mm_cvtsi32_si128(x),_mm_cvtsi32_si128(mi));
        const __m128i hi = _mm_min_epu32(                  lo,_mm_cvtsi32_si128(ma));
        return (hkUint32)_mm_cvtsi128_si32(hi);
    }
#endif

    // Define but don't implement the specializations for hkSimdFloat32.
    // On MSVC, accidentally calling these can cause misaligned SSE access.
    template <>
    HK_INLINE hkSimdFloat32 HK_CALL min2<hkSimdFloat32, hkSimdFloat32>( hkSimdFloat32 x, hkSimdFloat32 y);

    template <>
    HK_INLINE hkSimdFloat32 HK_CALL max2<hkSimdFloat32, hkSimdFloat32>( hkSimdFloat32 x, hkSimdFloat32 y);

    template <>
    HK_INLINE hkSimdFloat32 HK_CALL clamp<hkSimdFloat32, hkSimdFloat32, hkSimdFloat32>( hkSimdFloat32 x, hkSimdFloat32 mi, hkSimdFloat32 ma);

    template <>
    HK_INLINE hkSimdDouble64 HK_CALL min2<hkSimdDouble64, hkSimdDouble64>( hkSimdDouble64 x, hkSimdDouble64 y);

    template <>
    HK_INLINE hkSimdDouble64 HK_CALL max2<hkSimdDouble64, hkSimdDouble64>( hkSimdDouble64 x, hkSimdDouble64 y);

    template <>
    HK_INLINE hkSimdDouble64 HK_CALL clamp<hkSimdDouble64, hkSimdDouble64, hkSimdDouble64>( hkSimdDouble64 x, hkSimdDouble64 mi, hkSimdDouble64 ma);



    HK_INLINE __m128 HK_CALL quadFloor(const __m128 &v)
    {
#if HK_SSE_VERSION >= 0x41
        const __m128 result = _mm_floor_ps(v);
#else
        HK_ALIGN16( const hkUint32 two23[4] )  = { 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000 }; // 2^23 as float

        const __m128 b = _mm_castsi128_ps( _mm_srli_epi32( _mm_slli_epi32( _mm_castps_si128(v), 1 ), 1 ) ); // fabs(v)
        const __m128 d = _mm_sub_ps( _mm_add_ps( _mm_add_ps( _mm_sub_ps( v, *(__m128*)&two23 ), *(__m128*)&two23 ), *(__m128*)&two23 ), *(__m128*)&two23 ); // the meat of floor
        const __m128 largeMaskE = _mm_cmpgt_ps( b, *(__m128*)&two23 ); // $ffffffff if v >= 2^23
        const __m128 g = _mm_cmplt_ps( v, d ); // check for possible off by one error
        const __m128 h = _mm_cvtepi32_ps( _mm_castps_si128(g) ); // convert positive check result to -1.0, negative to 0.0
        const __m128 t = _mm_add_ps( d, h ); // add in the error if there is one

        // Select between output result and input value based on v >= 2^23
        const __m128 result = _mm_or_ps( _mm_and_ps(largeMaskE, v), _mm_andnot_ps(largeMaskE, t) );
#endif
        return result;
    }

    HK_INLINE __m128d HK_CALL twoFloor(const __m128d &v)
    {
#if HK_SSE_VERSION >= 0x41
        const __m128d result = _mm_floor_pd(v);
#else
        HK_ALIGN16( const hkUint64 two52[2] )  = { 0x4330000000000000ull, 0x4330000000000000ull }; // 2^52 as double

        const __m128d b = _mm_castsi128_pd( _mm_srli_epi64( _mm_slli_epi64( _mm_castpd_si128(v), 1 ), 1 ) ); // fabs(v)
        const __m128d d = _mm_sub_pd( _mm_add_pd( _mm_add_pd( _mm_sub_pd( v, *(__m128d*)&two52 ), *(__m128d*)&two52 ), *(__m128d*)&two52 ), *(__m128d*)&two52 ); // the meat of floor
        const __m128d largeMaskE = _mm_cmpgt_pd( b, *(__m128d*)&two52 ); // $ffffffffffffffff if v >= 2^52
        const __m128d g = _mm_cmplt_pd( v, d ); // check for possible off by one error
        const __m128d h = _mm_cvtepi32_pd( _mm_castpd_si128(g) ); // convert positive check result to -1.0, negative to 0.0 (only need the lower 64 bit anyways, so cvtepi32 is fine)
        const __m128d t = _mm_add_pd( d, h ); // add in the error if there is one

        // Select between output result and input value based on v >= 2^52
        const __m128d result = _mm_or_pd( _mm_and_pd(largeMaskE, v), _mm_andnot_pd(largeMaskE, t) );
#endif
        return result;
    }

#   define HK_MATH_floor
    HK_INLINE hkFloat32 HK_CALL floor(const hkFloat32 r)
    {
        const __m128 v = _mm_load_ss(&r);
        const __m128 result = quadFloor(v);
        return _mm_cvtss_f32(result);
    }
    HK_INLINE hkDouble64 HK_CALL floor(const hkDouble64 r)
    {
        const __m128d v = _mm_load_sd(&r);
        const __m128d result = twoFloor(v);
        return _mm_cvtsd_f64(result);
    }

#   define HK_MATH_hkFloorToInt
    HK_INLINE int HK_CALL hkFloorToInt(const hkFloat32 r)
    {
        const __m128 v = _mm_load_ss(&r);
        const __m128 result = quadFloor(v);
        return _mm_cvtt_ss2si(result);
    }
    HK_INLINE int HK_CALL hkFloorToInt(const hkDouble64 r)
    {
        const __m128d v = _mm_load_sd(&r);
        const __m128d result = twoFloor(v);
        return _mm_cvttsd_si32(result);
    }

#   define HK_MATH_ceil
    HK_INLINE hkFloat32 HK_CALL ceil( const hkFloat32 r )
    {
        const __m128 v = _mm_load_ss(&r);
#if HK_SSE_VERSION >= 0x41
        const __m128 result = _mm_ceil_ss(v, v);
#else
        const __m128 floored = quadFloor(v);
        const __m128 hasFrac = _mm_cmpneq_ps(v, floored);
        const __m128 adjustment = _mm_and_ps(hasFrac, *(__m128*)hkSse_floatOne);
        const __m128 result = _mm_add_ps(floored, adjustment);
#endif
        return _mm_cvtss_f32(result);
    }
    HK_INLINE hkDouble64 HK_CALL ceil( const hkDouble64 r )
    {
        const __m128d v = _mm_load_sd(&r);
#if HK_SSE_VERSION >= 0x41
        const __m128d result = _mm_ceil_sd(v, v);
#else
        const __m128d floored = twoFloor(v);
        const __m128d hasFrac = _mm_cmpneq_pd(v, floored);
        const __m128d adjustment = _mm_and_pd(hasFrac, *(__m128d*)hkSse_doubleOne);
        const __m128d result = _mm_add_pd(floored, adjustment);
#endif
        return _mm_cvtsd_f64(result);
    }

    HK_INLINE __m128 HK_CALL quadMod(const __m128 &a, const __m128 &b)
    {
        const __m128 denomIsZero = _mm_cmpeq_ps(b,_mm_setzero_ps());
        const __m128 q = quadFloor(_mm_div_ps(a,b));
        __m128 result = _mm_sub_ps(a, _mm_mul_ps(q,b));
        return _mm_andnot_ps(denomIsZero, result);
    }

    HK_INLINE __m128d HK_CALL twoMod(const __m128d &a, const __m128d &b)
    {
        const __m128d denomIsZero = _mm_cmpeq_pd(b,_mm_setzero_pd());
        const __m128d q = twoFloor(_mm_div_pd(a,b));
        __m128d result = _mm_sub_pd(a, _mm_mul_pd(q,b));
        return _mm_andnot_pd(denomIsZero, result);
    }

#   define HK_MATH_fmod
    HK_INLINE hkFloat32 HK_CALL fmod(const hkFloat32 n, const hkFloat32 d)
    {
        const __m128 numer = _mm_load_ss(&n);
        const __m128 denom = _mm_load_ss(&d);
        __m128 result = quadMod(numer, denom);
        return _mm_cvtss_f32(result);
    }
    HK_INLINE hkDouble64 HK_CALL fmod(const hkDouble64 n, const hkDouble64 d)
    {
        const __m128d numer = _mm_load_sd(&n);
        const __m128d denom = _mm_load_sd(&d);
        __m128d result = twoMod(numer, denom);
        return _mm_cvtsd_f64(result);
    }


#   define HK_MATH_logApproximation
    HK_INLINE hkFloat32 HK_CALL logApproximation(const hkFloat32& r)
    {
        const hkQuadFloat32 q = _mm_load1_ps(&r);
        const hkQuadFloat32 l = quadLog(q);
        return _mm_cvtss_f32(l);
    }

    HK_INLINE hkDouble64 HK_CALL logApproximation(const hkDouble64& r)
    {
#if HK_SSE_VERSION >= 0x50
        hkQuadDouble64 q = _mm256_set1_pd(r);
        hkQuadDouble64 l = quadLog(q);
        return l.m256d_f64[0];
#else
        hkQuadDouble64 q;
        q.xy = _mm_load1_pd(&r);
        q.zw = _mm_load1_pd(&r);
        hkQuadDouble64 l = quadLog(q); // change this!
        return _mm_cvtsd_f64(l.xy);
#endif
    }

    /// Average absolute error 0.001961
    /// Max absolute error 0.013429
    /// About 10x faster than ::sinf for 4 simultaneous values
    HK_INLINE __m128 HK_CALL quadSinApproximation(const __m128& inX)
    {
        HK_ALIGN16( const hkUint32 C[4] )      = { 0xBECF817B, 0xBECF817B, 0xBECF817B, 0xBECF817B }; // -4/(pi*pi)
        HK_ALIGN16( const hkUint32 P[4] )      = { 0x3E666666, 0x3E666666, 0x3E666666, 0x3E666666 }; // 0.225

        __m128 x = inX;

        // xx = remap to [0,pi]
        __m128 gePi = _mm_cmpnlt_ps(x, *(__m128*)hkSse_pi);
        __m128 limit = _mm_and_ps(gePi, *(__m128*)hkSse_twoPi);
        __m128 xx = _mm_sub_ps(x, limit);

        // y = B * x + C * x * abs(xx)
        __m128 abs = quadFabs(xx);
        __m128 y = _mm_mul_ps(abs, xx);
        __m128 Bx = _mm_mul_ps(xx, *(__m128*)hkSse_fourOverPi);
        __m128 Cx_abs = _mm_mul_ps(y, *(__m128*)&C);
        __m128 parabola = _mm_add_ps(Bx, Cx_abs);

        // y = P * (y * abs(y) - y) + y
        abs = quadFabs(parabola);
        y = _mm_sub_ps(_mm_mul_ps(parabola, abs),parabola);
        __m128 blend = _mm_mul_ps(y, *(__m128*)&P);
        __m128 result = _mm_add_ps(blend, parabola);

        return result;
    }


    /// Average absolute error 0.001961
    /// Max absolute error 0.013429
    /// About 10x faster than ::sin for 4 simultaneous values
    HK_INLINE __m128d HK_CALL twoSinApproximation(const __m128d& inX)
    {
        HK_ALIGN16( const hkUint64 C[2] )      = { 0xBFD9F02F6222C720ull, 0xBFD9F02F6222C720ull }; // -4/(pi*pi)
        HK_ALIGN16( const hkUint64 P[2] )      = { 0x3FCCCCCCCCCCCCCDull, 0x3FCCCCCCCCCCCCCDull }; // 0.225

        __m128d x = inX;

        // xx = remap to [0,pi]
        __m128d gePi = _mm_cmpnlt_pd(x, *(__m128d*)&hkSse_D_pi);
        __m128d limit = _mm_and_pd(gePi, *(__m128d*)&hkSse_D_twoPi);
        __m128d xx = _mm_sub_pd(x, limit);

        // y = B * x + C * x * abs(xx)
        __m128d abs = twoFabs(xx);
        __m128d y = _mm_mul_pd(abs, xx);
        __m128d Bx = _mm_mul_pd(xx, *(__m128d*)&hkSse_D_fourOverPi);
        __m128d Cx_abs = _mm_mul_pd(y, *(__m128d*)&C);
        __m128d parabola = _mm_add_pd(Bx, Cx_abs);

        // y = P * (y * abs(y) - y) + y
        abs = twoFabs(parabola);
        y = _mm_sub_pd(_mm_mul_pd(parabola, abs),parabola);
        __m128d blend = _mm_mul_pd(y, *(__m128d*)&P);
        __m128d result = _mm_add_pd(blend, parabola);

        return result;
    }

    /// Adapted from the book "Methods and Programs for Mathematical Functions",
    /// Stephen Lloyd Baluk Moshier, Prentice-Hall, 1989
    ///
    /// Range reduction is into intervals of pi/4.  The reduction error is nearly eliminated
    /// by extended precision modular arithmetic.
    /// PERFORMANCE: About 4.3x faster than ::sinf for 4 simultaneous values
    extern HK_EXPORT_COMMON __m128 HK_CALL quadSin(const __m128& inX);

    /// Double precision version of above
    ///                      Relative error:
    /// arithmetic   domain        peak       rms
    ///    IEEE -1.07e9,+1.07e9  2.1e-16     5.4e-17
    /// Partial loss of accuracy begins to occur at x = 2**30
    /// = 1.074e9.  The loss is not gradual, but jumps suddenly to
    /// about 1 part in 10e7.  Results may be meaningless for
    /// x > 2**49 = 5.6e14.
    ///
    /// PERFORMANCE: About 1.3x faster than ::sin for 2 simultaneous values
    extern HK_EXPORT_COMMON __m128d HK_CALL twoSin(const __m128d& inX);

    /// Adapted from the book "Methods and Programs for Mathematical Functions",
    /// Stephen Lloyd Baluk Moshier, Prentice-Hall, 1989
    ///
    /// Range reduction is into intervals of pi/4.  The reduction error is nearly eliminated
    /// by extended precision modular arithmetic.
    ///
    /// Two polynomial approximating functions are employed.
    /// Between 0 and pi/4 the sine is approximated by
    ///      1  -  x**2 Q(x**2).
    /// Between pi/4 and pi/2 the cosine is represented as
    ///      x  +  x**3 P(x**2).
    ///
    /// ACCURACY: matches cmath on MSVC
    ///
    ///                      Relative error:
    /// arithmetic   domain        peak       rms
    ///    IEEE    -8192,+8192    3.0e-7     3.0e-8
    /// Partial loss of accuracy begins to occur at x = 2^13
    /// = 8192. Results may be meaningless for x >= 2^24
    ///
    /// PERFORMANCE: About 4.3x faster than ::cosf for 4 simultaneous values
    extern HK_EXPORT_COMMON __m128 HK_CALL quadCos(const __m128& inX);

    /// Double precision version of above
    ///
    ///                      Relative error:
    /// arithmetic   domain        peak       rms
    ///    IEEE -1.07e9,+1.07e9  2.1e-16     5.4e-17
    /// Partial loss of accuracy begins to occur at x = 2**30
    /// = 1.074e9.  The loss is not gradual, but jumps suddenly to
    /// about 1 part in 10e7.  Results may be meaningless for
    /// x > 2**49 = 5.6e14.
    ///
    /// PERFORMANCE: About 1.3x faster than ::cos for 2 simultaneous values
    extern HK_EXPORT_COMMON __m128d HK_CALL twoCos(const __m128d& inX);

    /// Combined sin and cos calculation derived from the algorithms above.
    ///
    /// PERFORMANCE: About 5.9x faster than ::sin and ::cos for 2 simultaneous values
    extern HK_EXPORT_COMMON void HK_CALL twoSinCos(const __m128d& inX, __m128d& s, __m128d& c);

    /// Combined sin and cos calculation derived from the algorithms above.
    ///
    /// PERFORMANCE: About 6.1x faster than ::sinf and ::cosf for 4 simultaneous values
    extern HK_EXPORT_COMMON void HK_CALL quadSinCos(const __m128& inX, __m128& s, __m128& c);


    /// Adapted from the book "Methods and Programs for Mathematical Functions",
    /// Stephen Lloyd Baluk Moshier, Prentice-Hall, 1989
    ///
    /// A polynomial of the form x + x**3 P(x**2)
    /// is used for |x| in the interval [0, 0.5].  If |x| > 0.5 it is
    /// transformed by the identity
    ///
    ///    asin(x) = pi/2 - 2 asin( sqrt( (1-x)/2 ) ).
    ///
    /// ACCURACY: matches cmath on MSVC
    ///
    /// Important: unlike the cmath function, this one clamps the input to [-1,1]
    ///
    ///                      Relative error:
    /// arithmetic   domain      peak         rms
    ///    IEEE     -1, 1       2.5e-7       5.0e-8
    ///
    /// PERFORMANCE: About 8x faster than ::asinf for 4 simultaneous values
    extern HK_EXPORT_COMMON __m128 HK_CALL quadAsin(const __m128& xx);

    /// Double precision version of above except that it uses
    /// a rational function of the form x + x**3 P(x**2)/Q(x**2)
    ///
    ///                      Relative error:
    /// arithmetic   domain        peak       rms
    ///    IEEE     -1, 1       1.9e-16     5.4e-17
    ///
    /// PERFORMANCE: About 2x faster than ::asin for 2 simultaneous values
    extern HK_EXPORT_COMMON __m128d HK_CALL twoAsin(const __m128d& xx);

    /// Properties see quadSinApproximation
    HK_INLINE __m128 HK_CALL quadCosApproximation(const __m128& x)
    {
        // cos(x) = sin(x + pi/2)
        __m128 xx = _mm_add_ps(x, *(__m128*)&hkSse_piOver2);
        return quadSinApproximation(xx);
    }

    /// Properties see quadSinApproximation
    HK_INLINE __m128d HK_CALL twoCosApproximation(const __m128d& x)
    {
        // cos(x) = sin(x + pi/2)
        __m128d xx = _mm_add_pd(x, *(__m128d*)hkSse_D_piOver2);
        return twoSinApproximation(xx);
    }

    HK_INLINE __m128 HK_CALL quadAcos(const __m128& xx)
    {
        // acos(x) = pi/2 - asin(x)
        __m128 x = _mm_sub_ps(*(__m128*)hkSse_piOver2, quadAsin(xx));
        return x;
    }

    HK_INLINE __m128d HK_CALL twoAcos(const __m128d& xx)
    {
        // acos(x) = pi/2 - asin(x)
        __m128d x = _mm_sub_pd(*(__m128d*)hkSse_D_piOver2, twoAsin(xx));
        return x;
    }

    /// Calc cos/sin at the same time: xyzw -> sin(x)cos(y)sin(z)cos(w)
    /// Properties see quadSinApproximation
    HK_INLINE __m128 HK_CALL quadSinCosApproximation(const __m128& x)
    {
        // cos(x) = sin(x + pi/2)
        HK_ALIGN16( const hkUint32 offset[4] ) = { 0x00000000, 0x3FC90FDB, 0x00000000, 0x3FC90FDB };
        __m128 xx = _mm_add_ps(x, *(__m128*)&offset);
        return quadSinApproximation(xx);
    }

    /// Calc cos/sin at the same time: xy -> sin(x)cos(y)
    /// Properties see twoSinApproximation
    HK_INLINE __m128d HK_CALL twoSinCosApproximation(const __m128d& x)
    {
        // cos(x) = sin(x + pi/2)
        HK_ALIGN16( const hkUint64 offset[2] ) = { 0x0000000000000000ull, 0x3FF921FB54442D18ull };
        __m128d xx = _mm_add_pd(x, *(__m128d*)&offset);
        return twoSinApproximation(xx);
    }

    /// Calc acos/asin at the same time: xyzw -> asin(x)acos(y)asin(z)acos(w)
    /// Properties see quadAsin
    HK_INLINE __m128 HK_CALL quadAsinAcos(const __m128& x)
    {
        // acos(x) = pi/2 - asin(x)
        __m128 asinx = quadAsin(x);
#if HK_SSE_VERSION >= 0x41
        __m128 acosx = _mm_sub_ps(*(__m128*)hkSse_piOver2, asinx);
        return _mm_blend_ps(asinx,acosx,0xA);
#else
        HK_ALIGN16(const hkInt32 selectSin[4])     = { 0x00000000, 0x3FC90FDB, 0x00000000, 0x3FC90FDB };
        __m128 acosx = _mm_sub_ps(*(__m128*)selectSin, asinx);
        __m128 sinselect = _mm_cmpeq_ps(*(__m128*)selectSin, _mm_setzero_ps());
        return _mm_or_ps(_mm_andnot_ps(sinselect, acosx),_mm_and_ps(sinselect, asinx));
#endif
    }

    /// Calc acos/asin at the same time: xy -> asin(x)acos(y)
    /// Properties see twoAsin
    HK_INLINE __m128d HK_CALL twoAsinAcos(const __m128d& x)
    {
        // acos(x) = pi/2 - asin(x)
        __m128d asinx = twoAsin(x);
        __m128d acosx = _mm_sub_pd(*(__m128d*)hkSse_D_piOver2, asinx);
        // Gcc 4.5.1 can mis-optimize _mm_move_sd
#if defined(HK_COMPILER_GCC) && (HK_COMPILER_GCC_VERSION <= 40501)
        return _mm_shuffle_pd(asinx, acosx, _MM_SHUFFLE2(1, 0));
#else
        return _mm_move_sd(acosx, asinx);
#endif
    }

#   define HK_MATH_sin
    HK_INLINE hkFloat32 HK_CALL sin (const hkFloat32 r)
    {
        __m128 rr = _mm_load_ss(&r);
        __m128 s = quadSin(rr);
        return _mm_cvtss_f32(s);
    }
    HK_INLINE hkDouble64 HK_CALL sin (const hkDouble64 r)
    {
        __m128d rr = _mm_load_sd(&r);
        __m128d s = twoSin(rr);
        return _mm_cvtsd_f64(s);
    }

#   define HK_MATH_cos
    HK_INLINE hkFloat32 HK_CALL cos (const hkFloat32 r)
    {
        __m128 rr = _mm_load_ss(&r);
        __m128 c = quadCos(rr);
        return _mm_cvtss_f32(c);
    }
    HK_INLINE hkDouble64 HK_CALL cos (const hkDouble64 r)
    {
        __m128d rr = _mm_load_sd(&r);
        __m128d s = twoCos(rr);
        return _mm_cvtsd_f64(s);
    }

#   define HK_MATH_asin
    HK_INLINE hkFloat32 HK_CALL asin(const hkFloat32 r)
    {
        // be generous about numbers outside range
        HK_ASSERT_NO_MSG(0x286a6f5f,  hkMath::fabs(r) < 1.001f ); // assert imported from default impl
        __m128 rr = _mm_load_ss(&r);
        __m128 s = quadAsin(rr); // quadAsin clamps [-1;1]
        return _mm_cvtss_f32(s);
    }
    HK_INLINE hkDouble64 HK_CALL asin(const hkDouble64 r)
    {
        // be generous about numbers outside range
        HK_ASSERT_NO_MSG(0x286a6f5f,  hkMath::fabs(r) < 1.001 ); // assert imported from default impl
        __m128d rr = _mm_load_sd(&r);
        __m128d s = twoAsin(rr);  // twoAsin clamps [-1;1]
        return _mm_cvtsd_f64(s);
    }

#   define HK_MATH_acos
    HK_INLINE hkFloat32 HK_CALL acos(const hkFloat32 r)
    {
        // be generous about numbers outside range
        HK_ASSERT_NO_MSG(0x41278654,  hkMath::fabs(r) < 1.001f ); // assert imported from default impl
        __m128 rr = _mm_load_ss(&r);
        __m128 s = quadAcos(rr); // quadAcos clamps [-1;1]
        return _mm_cvtss_f32(s);
    }
    HK_INLINE hkDouble64 HK_CALL acos(const hkDouble64 r)
    {
        // be generous about numbers outside range
        HK_ASSERT_NO_MSG(0x41278654,  hkMath::fabs(r) < 1.001 ); // assert imported from default impl
        __m128d rr = _mm_load_sd(&r);
        __m128d s = twoAcos(rr);  // twoAcos clamps [-1;1]
        return _mm_cvtsd_f64(s);
    }

#if defined(HK_PLATFORM_PS4)
#   define HK_MATH_countLeadingZeros

    template <typename T> HK_INLINE int HK_CALL countLeadingZeros(T x);

    template <> HK_INLINE int HK_CALL countLeadingZeros<hkInt16>(hkInt16 bitMask)       {   return __lzcnt16(bitMask);  }
    template <> HK_INLINE int HK_CALL countLeadingZeros<hkUint32>(hkUint32 bitMask) {   return __lzcnt32(bitMask);  }
    template <> HK_INLINE int HK_CALL countLeadingZeros<hkUint64>(hkUint64 bitMask) {   return (int)__lzcnt64(bitMask); }
    template<> HK_INLINE int HK_CALL countLeadingZeros<int>(int bitMask)                {   return countLeadingZeros<hkUint32>((hkUint32)bitMask);  }

#elif defined(HK_COMPILER_GCC)
#   define HK_MATH_countLeadingZeros

    template <typename T> HK_INLINE int HK_CALL countLeadingZeros(T x);

    template <> HK_INLINE int HK_CALL countLeadingZeros<hkUint32>(hkUint32 bitMask) {   return bitMask ? __builtin_clz(bitMask) : 32;   }
    template <> HK_INLINE int HK_CALL countLeadingZeros<hkUint64>(hkUint64 bitMask) {   return bitMask ? __builtin_clzll(bitMask) : 64; }
    template <> HK_INLINE int HK_CALL countLeadingZeros<int>(int bitMask)               {   return countLeadingZeros<hkUint32>((hkUint32)bitMask);  }

#   define HK_MATH_countTrailingZeros

    template <typename T> HK_INLINE int HK_CALL countTrailingZeros(T x);

    template <> HK_INLINE int HK_CALL countTrailingZeros<hkUint32>(hkUint32 bitMask)    {   return bitMask ? __builtin_ctz(bitMask) : 32;   }
    template <> HK_INLINE int HK_CALL countTrailingZeros<hkUint64>(hkUint64 bitMask)    {   return bitMask ? __builtin_ctzll(bitMask) : 64; }
    template <> HK_INLINE int HK_CALL countTrailingZeros<int>(int bitMask)          {   return countTrailingZeros<hkUint32>((hkUint32)bitMask); }

//  HK_SSE_VERSION >= 0x50 needs CPUID flag LZCNT in addition
//  template <> HK_INLINE int HK_CALL countLeadingZeros<hkUint32>(hkUint32 bitMask) {   return (int)_lzcnt_u32(bitMask);    }
//  template <> HK_INLINE int HK_CALL countLeadingZeros<hkUint64>(hkUint64 bitMask) {   return (int)_lzcnt_u64(bitMask);    }
//  template <> HK_INLINE int HK_CALL countLeadingZeros<int>(int bitMask)               {   return countLeadingZeros<hkUint32>((hkUint32)bitMask);  }

//  HK_SSE_VERSION >= 0x50 needs CPUID flag BMI1 in addition
//  template <> HK_INLINE int HK_CALL countTrailingZeros<hkUint32>(hkUint32 bitMask)    {   return (int)_tzcnt_u32(bitMask);    }
//  template <> HK_INLINE int HK_CALL countTrailingZeros<hkUint64>(hkUint64 bitMask)    {   return (int)_tzcnt_u64(bitMask);    }
//  template <> HK_INLINE int HK_CALL countTrailingZeros<int>(int bitMask)          {   return countTrailingZeros<hkUint32>((hkUint32)bitMask); }

#elif 0 // no CPU instruction yet, and _BitScanReverse requires an if
    // Returns the number of zeros at the start (MSB) of the given bitMask
#   define HK_MATH_countLeadingZeros
#   pragma intrinsic(_BitScanReverse)
#   pragma intrinsic(_BitScanReverse64)

    template <typename T>
    HK_INLINE int HK_CALL countLeadingZeros(T x);

    template<>
    HK_INLINE int HK_CALL countLeadingZeros<hkUint32>(hkUint32 bitMask)
    {
        unsigned long idx;
        return (int)(_BitScanReverse(&idx, bitMask) ? 31 - idx : 32);
    }

    template<>
    HK_INLINE int HK_CALL countLeadingZeros<int>(int bitMask)
    {
        return countLeadingZeros<hkUint32>((hkUint32)x);
    }

    template <>
    HK_INLINE int HK_CALL countLeadingZeros<hkUint64>(hkUint64 bitMask)
    {
        unsigned long idx;
        return (int)(_BitScanReverse64(&idx, bitMask) ? 63 - idx : 64);
    }

    // Returns the number of zeros at the end (LSB) of the given bitMask
#   define HK_MATH_countTrailingZeros
#   pragma intrinsic(_BitScanForward)
#   pragma intrinsic(_BitScanForward64)

    template <typename T>
    HK_INLINE int HK_CALL countTrailingZeros(T x);

    template <>
    HK_INLINE int HK_CALL countTrailingZeros<hkUint32>(hkUint32 bitMask)
    {
        unsigned long idx;
        return (int)(_BitScanForward(&idx, bitMask) ? idx : 32);
    }

    template <>
    HK_INLINE int HK_CALL countTrailingZeros<int>(int x)
    {
        return countTrailingZeros<hkUint32>((hkUint32)x);
    }

    template <>
    HK_INLINE int HK_CALL countTrailingZeros<hkUint64>(hkUint64 bitMask)
    {
        unsigned long idx;
        return (int)(_BitScanForward64(&idx, bitMask) ? idx : 64);
    }
#endif


    // Returns the number of bits set to one in the given number
#if defined(HK_COMPILER_GCC)
#   define HK_MATH_countBitsSet
    HK_INLINE int HK_CALL countBitsSet(hkUint32 number)
    {
        return __builtin_popcount(number);
    }
#elif HK_SSE_VERSION >= 0x42
#   define HK_MATH_countBitsSet
    HK_INLINE int HK_CALL countBitsSet(hkUint32 number)
    {
        return _mm_popcnt_u32(number);
    }
// These versions are probably too large to be inlined
#elif 0 && HK_SSE_VERSION >= 0x31
#   define HK_MATH_countBitsSet
    HK_INLINE int HK_CALL countBitsSet(hkUint32 number)
    {
        const __m128i mask_lo = _mm_set1_epi8(0x0F);
        const __m128i mask_popcnt = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);

        __m128i cnt = _mm_cvtsi32_si128(number);
        __m128i lo = _mm_and_si128(cnt, mask_lo);
        __m128i hi = _mm_and_si128(_mm_srli_epi16(cnt, 4), mask_lo);
        lo = _mm_shuffle_epi8(mask_popcnt, lo);
        hi = _mm_shuffle_epi8(mask_popcnt, hi);
        cnt = _mm_add_epi8(lo, hi);
        // Horizontally sum up byte counters
        __m128i sum = _mm_sad_epu8(cnt, _mm_setzero_si128());
        // Horizontally add 32-bit accumulators
        sum = _mm_add_epi32(_mm_shuffle_epi32(sum, _MM_SHUFFLE(1,0,3,2)), sum);
        return _mm_cvtsi128_si32(sum);
    }
#elif 0
#   define HK_MATH_countBitsSet
    HK_INLINE int HK_CALL countBitsSet(hkUint32 number)
    {
        const __m128i mask1 = _mm_set1_epi8(0x55);
        const __m128i mask2 = _mm_set1_epi8(0x33);
        const __m128i mask4 = _mm_set1_epi8(0x0F);

        __m128i cnt = _mm_cvtsi32_si128(number);
        cnt = _mm_add_epi8( _mm_and_si128(cnt, mask1), _mm_and_si128(_mm_srli_epi16(cnt, 1), mask1) );
        cnt = _mm_add_epi8( _mm_and_si128(cnt, mask2), _mm_and_si128(_mm_srli_epi16(cnt, 2), mask2) );
        cnt = _mm_and_si128( _mm_add_epi8(cnt, _mm_srli_epi16(cnt, 4)), mask4 );
        // Horizontally sum up byte counters
        __m128i sum = _mm_sad_epu8(cnt, _mm_setzero_si128());
        // Horizontally add 32-bit accumulators
        sum = _mm_add_epi32(_mm_shuffle_epi32(sum, _MM_SHUFFLE(1,0,3,2)), sum);
        return _mm_cvtsi128_si32(sum);
    }
#endif

#if defined(HK_MATH_countBitsSet) // only define if we have a small countBitsSet method
#   define HK_MATH_isPower2
    HK_INLINE bool isPower2(unsigned int v)
    {
        return countBitsSet(v)==1;
    }
#endif
}

/*
 * Havok SDK - Base file, BUILD(#20180110)
 * 
 * Confidential Information of Microsoft Corporation.
 * Not for disclosure or distribution without Microsoft's prior written
 * consent.  This software contains code, techniques and know-how which
 * is confidential and proprietary to Microsoft.  Product and Trade Secret
 * source code contains trade secrets of Microsoft.  Havok Software (C)
 * Copyright 1999-2018 Microsoft Corporation.
 * All Rights Reserved. Use of this software is subject to the
 * terms of an end user license agreement.
 * 
 * The Havok Logo, and the Havok buzzsaw logo are trademarks of Microsoft.
 * Title, ownership rights, and intellectual property rights in the Havok
 * software remain in Microsoft and/or its suppliers.
 * 
 * Use of this software for evaluation purposes is subject to and
 * indicates acceptance of the End User licence Agreement for this
 * product. A copy of the license is included with this software and is
 * also available from Havok Support.
 * 
 */
