/* 
 * WNI emulation
 *
 * Copyright (c) 1998 Criterion Software Ltd.
 */

/*
 *      Generic software emulation of icl/WNI intrinsics.
 */

#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "rpplugin.h"
#include "rpdbgerr.h"
#include "rtintel.h"
#include "wni.h"

/*
 * Revision-control identification string ($Id$ keyword).  Nothing in
 * the visible part of this file reads it; __RWUNUSED__ presumably
 * silences the resulting unused-variable warning (confirm against the
 * macro's definition).
 */
static const char  __RWUNUSED__  rcsid[] =
    "@@(#)$Id: wni.c,v 1.14 2001/10/01 09:05:22 markf Exp $";

#if (defined(__ICL))
/* Avoid voluminous
 *   'warning #963: no EMMS instruction before return'
 * etc warnings
 */
#pragma warning( disable : 963 )
#pragma warning( disable : 964 )
#pragma warning( disable : 965 )
#endif /* (defined(__ICL)) */

/*
 * Two-way selection and NaN-test helpers.  Arguments may be evaluated
 * more than once, so avoid operands with side effects.
 * When the comparison is false -- which includes the case where either
 * argument is a NaN -- _rw_max and _rw_min both yield _y.
 * _rw_isnan relies on a NaN being the only value unequal to itself.
 */
#define _rw_max(_x, _y) (((_y) < (_x)) ? (_x) : (_y))
#define _rw_min(_x, _y) (((_y) > (_x)) ? (_x) : (_y))
#define _rw_isnan(_x) (!((_x) == (_x)))

#if ( defined(_WIN32) && defined(_MSC_VER) && (_MSC_VER>=1000) )
#if (defined(_XBOX))
#include <xtl.h>
#else /* (defined(_XBOX)) */
#include <windows.h>
#include <crtdbg.h>
#endif /* (defined(_XBOX)) */

#define   OUTPUTDEBUGSTRING(_msg)   OutputDebugString(_msg)

/*
 * Report "<file>:<line>: <name> Unimplemented" on the debugger output
 * channel.  _name is the function name as a C string.
 *
 * _snprintf() does not write a terminating NUL when the formatted text
 * fills the buffer, so one byte is held back and the terminator is
 * stored explicitly before the buffer is handed on.
 */
#define INTEL_WNI_UNIMPLEMENTED(_name)                        \
do                                                            \
{                                                             \
   char buffer[256];                                          \
                                                              \
   _snprintf( buffer, (sizeof(buffer)/sizeof(buffer[0])) - 1, \
              "%s:%d: %s Unimplemented\n",                    \
              __FILE__, __LINE__, _name );                    \
   buffer[(sizeof(buffer)/sizeof(buffer[0])) - 1] = '\0';     \
                                                              \
   OUTPUTDEBUGSTRING(buffer);                                 \
                                                              \
} while (0)

#endif /* ( defined(_WIN32) && defined(_MSC_VER) && (_MSC_VER>=1000) ) */

/*
 * Fallbacks for builds that did not define the debug-output macros in
 * the MSVC-only section above: both expand to nothing, so calls to
 * them compile away.
 */
#if (!defined(OUTPUTDEBUGSTRING))
#define OUTPUTDEBUGSTRING(_msg) /* Null op */
#endif /* (!defined(OUTPUTDEBUGSTRING)) */

#if (!defined(INTEL_WNI_UNIMPLEMENTED))
#define INTEL_WNI_UNIMPLEMENTED(__func__) /* Null op */
#endif /* (!defined(INTEL_WNI_UNIMPLEMENTED)) */

/*
 * Small scalar helpers.
 *   recip(x)  - single-precision reciprocal of x.
 *   sign(x)   - 1 when x is negative, otherwise 0.
 *   hiword(x) - x converted to long, shifted right by 16 bits and
 *               narrowed to short.
 */
#define recip(x)  (1.0f / (float)(x))
#define sign(x)   (((x) < 0) ? 1 : 0)
#define hiword(x) ((short)(((long)(x)) >> 16))

/*
 * Saturating narrowing helpers.  Each clamps _x to the full range of
 * the destination type before converting, so out-of-range inputs stick
 * at the nearest representable value instead of wrapping:
 *
 *   SignedSaturateByte    -128 .. 127
 *   SignedSaturateWord  -32768 .. 32767
 *   UnSignedSaturateByte     0 .. 255
 *   UnSignedSaturateWord     0 .. 65535
 *
 * The byte variant converts to signed char explicitly because the
 * signedness of plain char is implementation-defined.
 * _x is evaluated more than once; avoid operands with side effects.
 */
#define SignedSaturateByte(_x) \
   ((signed char)( (_x) > 127 ? 127 : ( (_x) < -128 ? -128 : (_x) ) ))

#define SignedSaturateWord(_x) \
   ((short)( (_x) > 32767 ? 32767 : ( (_x) < -32768 ? -32768 : (_x) ) ))

#define UnSignedSaturateByte(_x) \
   ((unsigned char)( (_x) > 255 ? 255 : ( (_x) < 0 ? 0 : (_x) ) ))

#define UnSignedSaturateWord(_x) \
   ((unsigned short)( (_x) > 65535 ? 65535 : ( (_x) < 0 ? 0 : (_x) ) ))

/*
 * For 64 bit integer arithmetic, see:
 * //Technologies/Archive/RenderWare/release/1.4/1.4ts/include/mmacros.h
 * //Technologies/Archive/RenderWare/release/1.4/1.4ts/src/maths.c
 */

/* Short aliases for the static 64-bit helpers defined below. */
#define _m64Add(a,b) Fix64Add((a),(b))
#define _m64Sub(a,b) Fix64Sub((a),(b))
#define _m64Mul(a,b) Fix64Mul((a),(b))

/*********************************************/

/*
 * Fix64Add - 64-bit addition built from two 32-bit halves.
 *
 * Index 0 of the overlay's dword views is treated as the low half and
 * index 1 as the high half.  The low halves are added as unsigned
 * values; a carry is detected when b's low half exceeds the headroom
 * left above a's low half, and is then added into the high half.
 */
static Rt_m64
Fix64Add(Rt_m64 a, Rt_m64 b)
{
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    volatile RwOverlayM64 sum;
    int                 carry;

    RWFUNCTION("Fix64Add");

    lhs.m64 = a;
    rhs.m64 = b;

    /* Carry out of the low dword: a.lo + b.lo would pass 0xffffffff. */
    carry = (rhs.ud[0] > 0xffffffffUL - lhs.ud[0]);

    sum._d[1] = lhs._d[1] + rhs._d[1] + carry;
    sum.ud[0] = lhs.ud[0] + rhs.ud[0];

    RWRETURN(sum.m64);
}

/*********************************************/

/*
 * Fix64Sub - 64-bit subtraction (a - b) built from two 32-bit halves.
 *
 * Index 0 of the overlay's dword views is treated as the low half and
 * index 1 as the high half.  A borrow is taken from the high half
 * whenever a's low half is smaller than b's low half.
 */
static Rt_m64
Fix64Sub(Rt_m64 a, Rt_m64 b)
{
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    volatile RwOverlayM64 diff;
    int                 borrow;

    RWFUNCTION("Fix64Sub");

    lhs.m64 = a;
    rhs.m64 = b;

    /* Borrow needed when the low-half subtraction would wrap. */
    borrow = (rhs.ud[0] > lhs.ud[0]);

    diff._d[1] = lhs._d[1] - rhs._d[1] - borrow;
    diff.ud[0] = lhs.ud[0] - rhs.ud[0];

    RWRETURN(diff.m64);
}

/*********************************************/

/*
 * Fix64Mul - widening multiply of two RwInt32 values into an Rt_m64.
 *
 * Each operand is split into a signed upper 16-bit word (_w[1]) and an
 * unsigned lower 16-bit word (uw[0]).  The four partial products are
 * placed in 64-bit overlays and summed with _m64Add.  Index 0 of the
 * dword views is used as the low half and index 1 as the high half of
 * each 64-bit value.
 *
 * _m64Add reads the complete 64-bit value of every overlay passed to
 * it, so both dwords of each partial product are written explicitly,
 * including the halves that are simply zero.
 */
static Rt_m64
Fix64Mul(RwInt32 a, RwInt32 b)
{

    volatile RwOverlayM64 ro;
    volatile RwOverlayM64 ao;
    volatile RwOverlayM64 bo;
    volatile RwOverlayM64 ao_w1bo_w1;
    volatile RwOverlayM64 ao_w1bouw0;
    volatile RwOverlayM64 aouw0bo_w1;
    volatile RwOverlayM64 aouw0bouw0;

    RWFUNCTION("Fix64Mul");

    ao._d[0] = a;
    ao._d[1] = 0x0;
    bo._d[0] = b;
    bo._d[1] = 0x0;

    /* 
     * ( ao_w1 * 2^16 + aouw0 ) * ( bo_w1 * 2^16 + bouw0 ) =
     * ( ao_w1 * bo_w1) * 2 ^ 32 +
     * ( ao_w1 * bouw0) * 2 ^ 16 +
     * ( aouw0 * bo_w1) * 2 ^ 16 +
     * ( aouw0 * bouw0)
     */

    /* hi * hi: scaled by 2^32, so it occupies the high dword only. */
    ao_w1bo_w1.ud[0] = 0x0;
    ao_w1bo_w1._d[1] = ao._w[1] * bo._w[1];

    /*
     * Cross terms: scaled by 2^16.  The signed product is parked in
     * the high dword, then split: its low 16 bits move to the top of
     * the low dword and the rest (sign included) stays in the high
     * dword.  The left shift is done on an unsigned value because
     * shifting a negative signed value is undefined.
     */
    ao_w1bouw0._d[1] = ao._w[1] * bo.uw[0];
    ao_w1bouw0.ud[0] = ((unsigned long) ao_w1bouw0._d[1]) << 16;
    ao_w1bouw0._d[1] = ao_w1bouw0._d[1] >> 16;

    aouw0bo_w1._d[1] = ao.uw[0] * bo._w[1];
    aouw0bo_w1.ud[0] = ((unsigned long) aouw0bo_w1._d[1]) << 16;
    aouw0bo_w1._d[1] = aouw0bo_w1._d[1] >> 16;

    /*
     * lo * lo: an unsigned product that fits the low dword.  It is
     * multiplied as unsigned long because 0xffff * 0xffff does not fit
     * in a signed 32-bit int.
     */
    aouw0bouw0.ud[0] = (unsigned long) ao.uw[0] * (unsigned long) bo.uw[0];
    aouw0bouw0._d[1] = 0x0;

    /* Sum */

    ao.m64 = _m64Add(ao_w1bo_w1.m64, aouw0bouw0.m64);
    bo.m64 = _m64Add(ao_w1bouw0.m64, aouw0bo_w1.m64);
    ro.m64 = _m64Add(ao.m64, bo.m64);

    RWRETURN(ro.m64);
}

/* Arithmetic Operations  */

/**
 * \ingroup rtintel
 * \ref Rt_mm_add_sd adds the lower double-precision floating-point
 * (DP FP) values of a and b; the upper DP FP value is passed through
 * from a.
 *
 * \param  a   first operand; also supplies the upper DP FP value
 * \param  b   second operand; only its lower DP FP value is used
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_add_sd(Rt_m128d a, Rt_m128d b) /* ADDSD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d sum;

    RWAPIFUNCTION(RWSTRING("Rt_mm_add_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Upper value: copied unchanged from a. */
    sum.df[1] = lhs.df[1];
    /* Lower value: a + b. */
    sum.df[0] = lhs.df[0] + rhs.df[0];

    RWRETURN(sum.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_add_pd adds the two DP FP (double-precision
 * floating-point) values of a to the corresponding values of b.
 *
 * \param  a   first operand
 * \param  b   second operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_add_pd(Rt_m128d a, Rt_m128d b) /* ADDPD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d sum;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_add_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Same operation on both DP FP values. */
    for (lane = 0; lane < 2; lane++)
    {
        sum.df[lane] = lhs.df[lane] + rhs.df[lane];
    }

    RWRETURN(sum.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_div_sd divides the lower DP FP value of a by the lower
 * DP FP value of b.  The upper DP FP value is passed through from a.
 *
 * \param  a   dividend; also supplies the upper DP FP value
 * \param  b   divisor; only its lower DP FP value is used
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_div_sd(Rt_m128d a, Rt_m128d b) /* DIVSD */
{
    volatile RpWNIOverlayM128d num;
    volatile RpWNIOverlayM128d den;
    volatile RpWNIOverlayM128d quot;

    RWAPIFUNCTION(RWSTRING("Rt_mm_div_sd"));

    num.m128d = a;
    den.m128d = b;

    /* Upper value: copied unchanged from a. */
    quot.df[1] = num.df[1];
    /* Lower value: a / b. */
    quot.df[0] = num.df[0] / den.df[0];

    RWRETURN(quot.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_div_pd divides the two DP FP values of a by the
 * corresponding DP FP values of b.
 *
 * \param  a   dividends
 * \param  b   divisors
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_div_pd(Rt_m128d a, Rt_m128d b) /* DIVPD */
{
    volatile RpWNIOverlayM128d num;
    volatile RpWNIOverlayM128d den;
    volatile RpWNIOverlayM128d quot;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_div_pd"));

    num.m128d = a;
    den.m128d = b;

    /* Same operation on both DP FP values. */
    for (lane = 0; lane < 2; lane++)
    {
        quot.df[lane] = num.df[lane] / den.df[lane];
    }

    RWRETURN(quot.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_max_sd computes the maximum of the lower DP FP values
 * of a and b.  The upper DP FP value is passed through from a.
 * The lower value of b is chosen unless a compares strictly greater,
 * so b is also the result when either value is a NaN.
 *
 * \param  a   first operand; also supplies the upper DP FP value
 * \param  b   second operand; only its lower DP FP value is used
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_max_sd(Rt_m128d a, Rt_m128d b) /* MAXSD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d best;

    RWAPIFUNCTION(RWSTRING("Rt_mm_max_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Upper value: copied unchanged from a. */
    best.df[1] = lhs.df[1];

    /* Lower value: a only when it is strictly greater than b. */
    if (lhs.df[0] > rhs.df[0])
    {
        best.df[0] = lhs.df[0];
    }
    else
    {
        best.df[0] = rhs.df[0];
    }

    RWRETURN(best.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_max_pd computes the maxima of the two DP FP values of
 * a and b.  For each pair, b's value is chosen unless a's compares
 * strictly greater, so b's value is also the result when either is a
 * NaN.
 *
 * \param  a   first operand
 * \param  b   second operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_max_pd(Rt_m128d a, Rt_m128d b) /* MAXPD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d best;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_max_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    for (lane = 0; lane < 2; lane++)
    {
        /* a only when it is strictly greater than b. */
        if (lhs.df[lane] > rhs.df[lane])
        {
            best.df[lane] = lhs.df[lane];
        }
        else
        {
            best.df[lane] = rhs.df[lane];
        }
    }

    RWRETURN(best.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_min_sd computes the minimum of the lower DP FP values
 * of a and b.  The upper DP FP value is passed through from a.
 * The lower value of b is chosen unless a compares strictly smaller,
 * so b is also the result when either value is a NaN.
 *
 * \param  a   first operand; also supplies the upper DP FP value
 * \param  b   second operand; only its lower DP FP value is used
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_min_sd(Rt_m128d a, Rt_m128d b) /* MINSD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d least;

    RWAPIFUNCTION(RWSTRING("Rt_mm_min_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Upper value: copied unchanged from a. */
    least.df[1] = lhs.df[1];

    /* Lower value: a only when it is strictly smaller than b. */
    if (lhs.df[0] < rhs.df[0])
    {
        least.df[0] = lhs.df[0];
    }
    else
    {
        least.df[0] = rhs.df[0];
    }

    RWRETURN(least.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_min_pd computes the minima of the two DP FP values of
 * a and b.  For each pair, b's value is chosen unless a's compares
 * strictly smaller, so b's value is also the result when either is a
 * NaN.
 *
 * \param  a   first operand
 * \param  b   second operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_min_pd(Rt_m128d a, Rt_m128d b) /* MINPD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d least;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_min_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    for (lane = 0; lane < 2; lane++)
    {
        /* a only when it is strictly smaller than b. */
        if (lhs.df[lane] < rhs.df[lane])
        {
            least.df[lane] = lhs.df[lane];
        }
        else
        {
            least.df[lane] = rhs.df[lane];
        }
    }

    RWRETURN(least.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_mul_sd multiplies the lower DP FP values of a and b.
 * The upper DP FP value is passed through from a.
 *
 * \param  a   first operand; also supplies the upper DP FP value
 * \param  b   second operand; only its lower DP FP value is used
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_mul_sd(Rt_m128d a, Rt_m128d b) /* MULSD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d prod;

    RWAPIFUNCTION(RWSTRING("Rt_mm_mul_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Upper value: copied unchanged from a. */
    prod.df[1] = lhs.df[1];
    /* Lower value: a * b. */
    prod.df[0] = lhs.df[0] * rhs.df[0];

    RWRETURN(prod.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_mul_pd multiplies the two DP FP values of a by the
 * corresponding DP FP values of b.
 *
 * \param  a   first operand
 * \param  b   second operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_mul_pd(Rt_m128d a, Rt_m128d b) /* MULPD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d prod;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_mul_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Same operation on both DP FP values. */
    for (lane = 0; lane < 2; lane++)
    {
        prod.df[lane] = lhs.df[lane] * rhs.df[lane];
    }

    RWRETURN(prod.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sqrt_sd computes the square root of the lower DP FP
 * value of b.  The upper DP FP value is passed through from a.
 *
 * \param  a   supplies the upper DP FP value of the result
 * \param  b   its lower DP FP value is the square-root argument
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_sqrt_sd(Rt_m128d a, Rt_m128d b) /* SQRTSD */
{
    /*
     * Computes the square root of the lower DP FP value of b.
     * The upper DP FP value is passed through from a.
     */
    volatile RpWNIOverlayM128d ro;
    volatile RpWNIOverlayM128d ao;
    volatile RpWNIOverlayM128d bo;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sqrt_sd"));

    ao.m128d = a;
    bo.m128d = b;

    /*
     * sqrt() from <math.h> takes and returns double, so the root is
     * evaluated at the full width of the DP FP value.
     */
    ro.df[0] = sqrt(bo.df[0]);
    ro.df[1] = ao.df[1];

    RWRETURN(ro.m128d);

}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sqrt_pd computes the square roots of the two DP FP
 * values of a.
 *
 * \param  a   the two DP FP square-root arguments
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_sqrt_pd(Rt_m128d a)       /* SQRTPD */
{
    /*
     * Computes the square roots of the two DP FP values of a.
     */
    volatile RpWNIOverlayM128d ro;
    volatile RpWNIOverlayM128d ao;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sqrt_pd"));

    ao.m128d = a;

    /*
     * sqrt() from <math.h> takes and returns double, so each root is
     * evaluated at the full width of the DP FP value.
     */
    ro.df[0] = sqrt(ao.df[0]);
    ro.df[1] = sqrt(ao.df[1]);

    RWRETURN(ro.m128d);

}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sub_sd subtracts the lower DP FP value of b from the
 * lower DP FP value of a.  The upper DP FP value is passed through
 * from a.
 *
 * \param  a   minuend; also supplies the upper DP FP value
 * \param  b   subtrahend; only its lower DP FP value is used
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_sub_sd(Rt_m128d a, Rt_m128d b) /* SUBSD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d diff;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sub_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Upper value: copied unchanged from a. */
    diff.df[1] = lhs.df[1];
    /* Lower value: a - b. */
    diff.df[0] = lhs.df[0] - rhs.df[0];

    RWRETURN(diff.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sub_pd subtracts the two DP FP values of b from the
 * corresponding DP FP values of a.
 *
 * \param  a   minuends
 * \param  b   subtrahends
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_sub_pd(Rt_m128d a, Rt_m128d b) /* SUBPD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d diff;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sub_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Same operation on both DP FP values. */
    for (lane = 0; lane < 2; lane++)
    {
        diff.df[lane] = lhs.df[lane] - rhs.df[lane];
    }

    RWRETURN(diff.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_andnot_pd computes the bitwise AND of the 128-bit
 * value in b and the bitwise NOT of the 128-bit value in a.
 *
 * \param  a   operand that is inverted before the AND
 * \param  b   operand that is used as-is
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_andnot_pd(Rt_m128d a, Rt_m128d b) /* ANDNPD */
{
    volatile RpWNIOverlayM128d inv;
    volatile RpWNIOverlayM128d keep;
    volatile RpWNIOverlayM128d bits;
    int                 word;

    RWAPIFUNCTION(RWSTRING("Rt_mm_andnot_pd"));

    inv.m128d = a;
    keep.m128d = b;

    /* Work through the value as four unsigned words. */
    for (word = 0; word < 4; word++)
    {
        bits.ud[word] = (~inv.ud[word]) & keep.ud[word];
    }

    RWRETURN(bits.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_and_pd computes the bitwise AND of the two DP FP
 * values of a and b.
 *
 * \param  a   first operand
 * \param  b   second operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_and_pd(Rt_m128d a, Rt_m128d b) /* ANDPD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d bits;
    int                 word;

    RWAPIFUNCTION(RWSTRING("Rt_mm_and_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Work through the value as four unsigned words. */
    for (word = 0; word < 4; word++)
    {
        bits.ud[word] = lhs.ud[word] & rhs.ud[word];
    }

    RWRETURN(bits.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_or_pd computes the bitwise OR of the two DP FP values
 * of a and b.
 *
 * \param  a   first operand
 * \param  b   second operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_or_pd(Rt_m128d a, Rt_m128d b) /* ORPD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d bits;
    int                 word;

    RWAPIFUNCTION(RWSTRING("Rt_mm_or_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Work through the value as four unsigned words. */
    for (word = 0; word < 4; word++)
    {
        bits.ud[word] = lhs.ud[word] | rhs.ud[word];
    }

    RWRETURN(bits.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_xor_pd computes the bitwise XOR of the two DP FP
 * values of a and b.
 *
 * \param  a   first operand
 * \param  b   second operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_xor_pd(Rt_m128d a, Rt_m128d b) /* XORPD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d bits;
    int                 word;

    RWAPIFUNCTION(RWSTRING("Rt_mm_xor_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Work through the value as four unsigned words. */
    for (word = 0; word < 4; word++)
    {
        bits.ud[word] = lhs.ud[word] ^ rhs.ud[word];
    }

    RWRETURN(bits.m128d);
}

/* Comparisons */

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpeq_pd compares the two DP FP values of a and b for
 * equality.  Each result value is all ones (0xffffffff in both of its
 * 32-bit halves) where the test holds and zero where it does not.
 *
 * \param  a   left-hand operand
 * \param  b   right-hand operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmpeq_pd(Rt_m128d a, Rt_m128d b) /* CMPEQPD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpeq_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Each DP FP slot of the mask is written as two 32-bit halves. */
    for (lane = 0; lane < 2; lane++)
    {
        if (lhs.df[lane] == rhs.df[lane])
        {
            mask._d[2 * lane] = 0xffffffff;
            mask._d[2 * lane + 1] = 0xffffffff;
        }
        else
        {
            mask._d[2 * lane] = 0x0;
            mask._d[2 * lane + 1] = 0x0;
        }
    }

    RWRETURN(mask.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmplt_pd compares the two DP FP values of a and b for
 * a less than b.  Each result value is all ones (0xffffffff in both of
 * its 32-bit halves) where the test holds and zero where it does not.
 *
 * \param  a   left-hand operand
 * \param  b   right-hand operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmplt_pd(Rt_m128d a, Rt_m128d b) /* CMPLTPD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmplt_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Each DP FP slot of the mask is written as two 32-bit halves. */
    for (lane = 0; lane < 2; lane++)
    {
        if (lhs.df[lane] < rhs.df[lane])
        {
            mask._d[2 * lane] = 0xffffffff;
            mask._d[2 * lane + 1] = 0xffffffff;
        }
        else
        {
            mask._d[2 * lane] = 0x0;
            mask._d[2 * lane + 1] = 0x0;
        }
    }

    RWRETURN(mask.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmple_pd compares the two DP FP values of a and b for
 * a less than or equal to b.  Each result value is all ones
 * (0xffffffff in both of its 32-bit halves) where the test holds and
 * zero where it does not.
 *
 * \param  a   left-hand operand
 * \param  b   right-hand operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmple_pd(Rt_m128d a, Rt_m128d b) /* CMPLEPD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmple_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Each DP FP slot of the mask is written as two 32-bit halves. */
    for (lane = 0; lane < 2; lane++)
    {
        if (lhs.df[lane] <= rhs.df[lane])
        {
            mask._d[2 * lane] = 0xffffffff;
            mask._d[2 * lane + 1] = 0xffffffff;
        }
        else
        {
            mask._d[2 * lane] = 0x0;
            mask._d[2 * lane + 1] = 0x0;
        }
    }

    RWRETURN(mask.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpgt_pd compares the two DP FP values of a and b for
 * a greater than b.  Each result value is all ones (0xffffffff in both
 * of its 32-bit halves) where the test holds and zero where it does
 * not.
 *
 * \param  a   left-hand operand
 * \param  b   right-hand operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmpgt_pd(Rt_m128d a, Rt_m128d b) /* CMPLTPDr */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpgt_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Each DP FP slot of the mask is written as two 32-bit halves. */
    for (lane = 0; lane < 2; lane++)
    {
        if (lhs.df[lane] > rhs.df[lane])
        {
            mask._d[2 * lane] = 0xffffffff;
            mask._d[2 * lane + 1] = 0xffffffff;
        }
        else
        {
            mask._d[2 * lane] = 0x0;
            mask._d[2 * lane + 1] = 0x0;
        }
    }

    RWRETURN(mask.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpge_pd compares the two DP FP values of a and b for
 * a greater than or equal to b.  Each result value is all ones
 * (0xffffffff in both of its 32-bit halves) where the test holds and
 * zero where it does not.
 *
 * \param  a   left-hand operand
 * \param  b   right-hand operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmpge_pd(Rt_m128d a, Rt_m128d b) /* CMPLEPDr */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpge_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Each DP FP slot of the mask is written as two 32-bit halves. */
    for (lane = 0; lane < 2; lane++)
    {
        if (lhs.df[lane] >= rhs.df[lane])
        {
            mask._d[2 * lane] = 0xffffffff;
            mask._d[2 * lane + 1] = 0xffffffff;
        }
        else
        {
            mask._d[2 * lane] = 0x0;
            mask._d[2 * lane + 1] = 0x0;
        }
    }

    RWRETURN(mask.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpord_pd compares the two DP FP values of a and b for
 * ordered, i.e. neither value of the pair is a NaN.  Each result value
 * is all ones (0xffffffff in both of its 32-bit halves) where the pair
 * is ordered and zero where it is not.
 *
 * \param  a   left-hand operand
 * \param  b   right-hand operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmpord_pd(Rt_m128d a, Rt_m128d b) /* CMPORDPD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpord_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    for (lane = 0; lane < 2; lane++)
    {
        /* A value equals itself exactly when it is not a NaN. */
        if ((lhs.df[lane] == lhs.df[lane])
            && (rhs.df[lane] == rhs.df[lane]))
        {
            mask.ud[2 * lane] = 0xffffffffL;
            mask.ud[2 * lane + 1] = 0xffffffffL;
        }
        else
        {
            mask.ud[2 * lane] = 0x0;
            mask.ud[2 * lane + 1] = 0x0;
        }
    }

    RWRETURN(mask.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpunord_pd compares the two DP FP values of a and b
 * for unordered, i.e. at least one value of the pair is a NaN.  Each
 * result value is all ones (0xffffffff in both of its 32-bit halves)
 * where the pair is unordered and zero where it is not.
 *
 * \param  a   left-hand operand
 * \param  b   right-hand operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmpunord_pd(Rt_m128d a, Rt_m128d b) /* CMPUNORDPD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpunord_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    for (lane = 0; lane < 2; lane++)
    {
        /* A value equals itself exactly when it is not a NaN. */
        if ((lhs.df[lane] == lhs.df[lane])
            && (rhs.df[lane] == rhs.df[lane]))
        {
            mask.ud[2 * lane] = 0x0;
            mask.ud[2 * lane + 1] = 0x0;
        }
        else
        {
            mask.ud[2 * lane] = 0xffffffffL;
            mask.ud[2 * lane + 1] = 0xffffffffL;
        }
    }

    RWRETURN(mask.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpneq_pd compares the two DP FP values of a and b for
 * inequality.  Each result value is all ones (0xffffffff in both of
 * its 32-bit halves) where the values differ and zero where they are
 * equal.
 *
 * \param  a   left-hand operand
 * \param  b   right-hand operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmpneq_pd(Rt_m128d a, Rt_m128d b) /* CMPNEQPD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpneq_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Each DP FP slot of the mask is written as two 32-bit halves. */
    for (lane = 0; lane < 2; lane++)
    {
        if (lhs.df[lane] != rhs.df[lane])
        {
            mask._d[2 * lane] = 0xffffffff;
            mask._d[2 * lane + 1] = 0xffffffff;
        }
        else
        {
            mask._d[2 * lane] = 0x0;
            mask._d[2 * lane + 1] = 0x0;
        }
    }

    RWRETURN(mask.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpnlt_pd compares the two DP FP values of a and b for
 * a not less than b.  Each result value is zero where a is less than b
 * and all ones (0xffffffff in both of its 32-bit halves) in every
 * other case.
 *
 * \param  a   left-hand operand
 * \param  b   right-hand operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmpnlt_pd(Rt_m128d a, Rt_m128d b) /* CMPNLTPD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpnlt_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Negated test: the mask is cleared only when a < b holds. */
    for (lane = 0; lane < 2; lane++)
    {
        if (lhs.df[lane] < rhs.df[lane])
        {
            mask._d[2 * lane] = 0x0;
            mask._d[2 * lane + 1] = 0x0;
        }
        else
        {
            mask._d[2 * lane] = 0xffffffff;
            mask._d[2 * lane + 1] = 0xffffffff;
        }
    }

    RWRETURN(mask.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpnle_pd compares the two DP FP values of a and b for
 * a not less than or equal to b.  Each result value is zero where a is
 * less than or equal to b and all ones (0xffffffff in both of its
 * 32-bit halves) in every other case.
 *
 * \param  a   left-hand operand
 * \param  b   right-hand operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmpnle_pd(Rt_m128d a, Rt_m128d b) /* CMPNLEPD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpnle_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Negated test: the mask is cleared only when a <= b holds. */
    for (lane = 0; lane < 2; lane++)
    {
        if (lhs.df[lane] <= rhs.df[lane])
        {
            mask._d[2 * lane] = 0x0;
            mask._d[2 * lane + 1] = 0x0;
        }
        else
        {
            mask._d[2 * lane] = 0xffffffff;
            mask._d[2 * lane + 1] = 0xffffffff;
        }
    }

    RWRETURN(mask.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpngt_pd compares the two DP FP values of a and b for
 * a not greater than b.  Each result value is zero where a is greater
 * than b and all ones (0xffffffff in both of its 32-bit halves) in
 * every other case.
 *
 * \param  a   left-hand operand
 * \param  b   right-hand operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmpngt_pd(Rt_m128d a, Rt_m128d b) /* CMPNLTPDr */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpngt_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Negated test: the mask is cleared only when a > b holds. */
    for (lane = 0; lane < 2; lane++)
    {
        if (lhs.df[lane] > rhs.df[lane])
        {
            mask._d[2 * lane] = 0x0;
            mask._d[2 * lane + 1] = 0x0;
        }
        else
        {
            mask._d[2 * lane] = 0xffffffff;
            mask._d[2 * lane + 1] = 0xffffffff;
        }
    }

    RWRETURN(mask.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpnge_pd compares the two DP FP values of a and b for
 * a not greater than or equal to b.  Each result value is zero where a
 * is greater than or equal to b and all ones (0xffffffff in both of
 * its 32-bit halves) in every other case.
 *
 * \param  a   left-hand operand
 * \param  b   right-hand operand
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmpnge_pd(Rt_m128d a, Rt_m128d b) /* CMPNLEPDr */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpnge_pd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Negated test: the mask is cleared only when a >= b holds. */
    for (lane = 0; lane < 2; lane++)
    {
        if (lhs.df[lane] >= rhs.df[lane])
        {
            mask._d[2 * lane] = 0x0;
            mask._d[2 * lane + 1] = 0x0;
        }
        else
        {
            mask._d[2 * lane] = 0xffffffff;
            mask._d[2 * lane + 1] = 0xffffffff;
        }
    }

    RWRETURN(mask.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpeq_sd compares the lower DP FP values of a and b for
 * equality.  The lower result value is all ones (0xffffffff in both
 * of its 32-bit halves) when the test holds and zero otherwise.  The
 * upper DP FP value is passed through from a.
 *
 * \param  a   left-hand operand; also supplies the upper DP FP value
 * \param  b   right-hand operand; only its lower DP FP value is used
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmpeq_sd(Rt_m128d a, Rt_m128d b) /* CMPEQSD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d out;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpeq_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Lower value: comparison mask, written as two 32-bit halves. */
    if (lhs.df[0] == rhs.df[0])
    {
        out._d[0] = 0xffffffff;
        out._d[1] = 0xffffffff;
    }
    else
    {
        out._d[0] = 0x0;
        out._d[1] = 0x0;
    }

    /* Upper value: copied unchanged from a. */
    out.df[1] = lhs.df[1];

    RWRETURN(out.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmplt_sd compares the lower DP FP values of a and b for
 * a less than b.  The lower result value is all ones (0xffffffff in
 * both of its 32-bit halves) when the test holds and zero otherwise.
 * The upper DP FP value is passed through from a.
 *
 * \param  a   left-hand operand; also supplies the upper DP FP value
 * \param  b   right-hand operand; only its lower DP FP value is used
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmplt_sd(Rt_m128d a, Rt_m128d b) /* CMPLTSD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d out;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmplt_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Lower value: comparison mask, written as two 32-bit halves. */
    if (lhs.df[0] < rhs.df[0])
    {
        out._d[0] = 0xffffffff;
        out._d[1] = 0xffffffff;
    }
    else
    {
        out._d[0] = 0x0;
        out._d[1] = 0x0;
    }

    /* Upper value: copied unchanged from a. */
    out.df[1] = lhs.df[1];

    RWRETURN(out.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmple_sd compares the lower DP FP values of a and b for
 * a less than or equal to b.  The lower result value is all ones
 * (0xffffffff in both of its 32-bit halves) when the test holds and
 * zero otherwise.  The upper DP FP value is passed through from a.
 *
 * \param  a   left-hand operand; also supplies the upper DP FP value
 * \param  b   right-hand operand; only its lower DP FP value is used
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmple_sd(Rt_m128d a, Rt_m128d b) /* CMPLESD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d out;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmple_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Lower value: comparison mask, written as two 32-bit halves. */
    if (lhs.df[0] <= rhs.df[0])
    {
        out._d[0] = 0xffffffff;
        out._d[1] = 0xffffffff;
    }
    else
    {
        out._d[0] = 0x0;
        out._d[1] = 0x0;
    }

    /* Upper value: copied unchanged from a. */
    out.df[1] = lhs.df[1];

    RWRETURN(out.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpgt_sd compares the lower DP FP values of a and b for
 * a greater than b.  The lower result value is all ones (0xffffffff
 * in both of its 32-bit halves) when the test holds and zero
 * otherwise.  The upper DP FP value is passed through from a.
 *
 * \param  a   left-hand operand; also supplies the upper DP FP value
 * \param  b   right-hand operand; only its lower DP FP value is used
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmpgt_sd(Rt_m128d a, Rt_m128d b) /* CMPLTSDr */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d out;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpgt_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Lower value: comparison mask, written as two 32-bit halves. */
    if (lhs.df[0] > rhs.df[0])
    {
        out._d[0] = 0xffffffff;
        out._d[1] = 0xffffffff;
    }
    else
    {
        out._d[0] = 0x0;
        out._d[1] = 0x0;
    }

    /* Upper value: copied unchanged from a. */
    out.df[1] = lhs.df[1];

    RWRETURN(out.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpge_sd compares the lower DP FP values of a and b for
 * a greater than or equal to b.  The lower result value is all ones
 * (0xffffffff in both of its 32-bit halves) when the test holds and
 * zero otherwise.  The upper DP FP value is passed through from a.
 *
 * \param  a   left-hand operand; also supplies the upper DP FP value
 * \param  b   right-hand operand; only its lower DP FP value is used
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmpge_sd(Rt_m128d a, Rt_m128d b) /* CMPLESDr */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d out;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpge_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Lower value: comparison mask, written as two 32-bit halves. */
    if (lhs.df[0] >= rhs.df[0])
    {
        out._d[0] = 0xffffffff;
        out._d[1] = 0xffffffff;
    }
    else
    {
        out._d[0] = 0x0;
        out._d[1] = 0x0;
    }

    /* Upper value: copied unchanged from a. */
    out.df[1] = lhs.df[1];

    RWRETURN(out.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpord_sd compares the lower DP FP values of a and b
 * for ordered, i.e. neither value is a NaN.  The lower result value is
 * all ones (0xffffffff in both of its 32-bit halves) when the pair is
 * ordered and zero otherwise.  The upper DP FP value is passed through
 * from a.
 *
 * \param  a   left-hand operand; also supplies the upper DP FP value
 * \param  b   right-hand operand; only its lower DP FP value is used
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmpord_sd(Rt_m128d a, Rt_m128d b) /* CMPORDSD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d out;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpord_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* A value equals itself exactly when it is not a NaN. */
    if ((lhs.df[0] == lhs.df[0]) && (rhs.df[0] == rhs.df[0]))
    {
        out.ud[0] = 0xffffffffL;
        out.ud[1] = 0xffffffffL;
    }
    else
    {
        out.ud[0] = 0x0;
        out.ud[1] = 0x0;
    }

    /* Upper value: copied unchanged from a. */
    out.df[1] = lhs.df[1];

    RWRETURN(out.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpunord_sd compares the lower DP FP values of a and b
 * for unordered, i.e. at least one of them is a NaN.  The lower result
 * value is all ones (0xffffffff in both of its 32-bit halves) when the
 * pair is unordered and zero otherwise.  The upper DP FP value is
 * passed through from a.
 *
 * \param  a   left-hand operand; also supplies the upper DP FP value
 * \param  b   right-hand operand; only its lower DP FP value is used
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_cmpunord_sd(Rt_m128d a, Rt_m128d b) /* CMPUNORDSD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    volatile RpWNIOverlayM128d out;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpunord_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* A value equals itself exactly when it is not a NaN. */
    if ((lhs.df[0] == lhs.df[0]) && (rhs.df[0] == rhs.df[0]))
    {
        out.ud[0] = 0x0;
        out.ud[1] = 0x0;
    }
    else
    {
        out.ud[0] = 0xffffffffL;
        out.ud[1] = 0xffffffffL;
    }

    /* Upper value: copied unchanged from a. */
    out.df[1] = lhs.df[1];

    RWRETURN(out.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpneq_sd tests the lower DP FP values of a and b for
 * inequality.  The result's _d[0] and _d[1] words are both set to
 * 0xffffffff when the values differ and to zero when they compare
 * equal.  The upper DP FP value is copied from a.
 *
 * \param  a   left operand; also supplies the upper DP FP value
 * \param  b   right operand; only its lower DP FP value is read
 *
 * \return the comparison mask together with the upper value of a
 */
Rt_m128d
Rt_mm_cmpneq_sd(Rt_m128d a, Rt_m128d b) /* CMPNEQSD */
{
    volatile RpWNIOverlayM128d res;
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    int holds;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpneq_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* Evaluate the predicate once, then fan it out to both mask words. */
    holds = (lhs.df[0] != rhs.df[0]);

    res._d[0] = holds ? 0xffffffff : 0x0;
    res._d[1] = holds ? 0xffffffff : 0x0;
    res.df[1] = lhs.df[1];

    RWRETURN(res.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpnlt_sd tests the lower DP FP value of a against the
 * lower DP FP value of b for "not less than".  The result's _d[0] and
 * _d[1] words are both set to 0xffffffff when the test holds and to
 * zero when it does not.  The upper DP FP value is copied from a.
 *
 * \param  a   left operand; also supplies the upper DP FP value
 * \param  b   right operand; only its lower DP FP value is read
 *
 * \return the comparison mask together with the upper value of a
 */
Rt_m128d
Rt_mm_cmpnlt_sd(Rt_m128d a, Rt_m128d b) /* CMPNLTSD */
{
    volatile RpWNIOverlayM128d res;
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    int holds;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpnlt_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /*
     * Deliberately the negation of "<" rather than ">=" -- presumably
     * so that NaN operands satisfy the predicate; confirm before
     * simplifying.
     */
    holds = !(lhs.df[0] < rhs.df[0]);

    res._d[0] = holds ? 0xffffffff : 0x0;
    res._d[1] = holds ? 0xffffffff : 0x0;
    res.df[1] = lhs.df[1];

    RWRETURN(res.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpnle_sd tests the lower DP FP value of a against the
 * lower DP FP value of b for "not less than or equal".  The result's
 * _d[0] and _d[1] words are both set to 0xffffffff when the test holds
 * and to zero when it does not.  The upper DP FP value is copied from a.
 *
 * \param  a   left operand; also supplies the upper DP FP value
 * \param  b   right operand; only its lower DP FP value is read
 *
 * \return the comparison mask together with the upper value of a
 */
Rt_m128d
Rt_mm_cmpnle_sd(Rt_m128d a, Rt_m128d b) /* CMPNLESD */
{
    volatile RpWNIOverlayM128d res;
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    int holds;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpnle_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /*
     * Deliberately the negation of "<=" rather than ">" -- presumably
     * so that NaN operands satisfy the predicate; confirm before
     * simplifying.
     */
    holds = !(lhs.df[0] <= rhs.df[0]);

    res._d[0] = holds ? 0xffffffff : 0x0;
    res._d[1] = holds ? 0xffffffff : 0x0;
    res.df[1] = lhs.df[1];

    RWRETURN(res.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpngt_sd tests the lower DP FP value of a against the
 * lower DP FP value of b for "not greater than".  The result's _d[0]
 * and _d[1] words are both set to 0xffffffff when the test holds and to
 * zero when it does not.  The upper DP FP value is copied from a.
 *
 * \param  a   left operand; also supplies the upper DP FP value
 * \param  b   right operand; only its lower DP FP value is read
 *
 * \return the comparison mask together with the upper value of a
 */
Rt_m128d
Rt_mm_cmpngt_sd(Rt_m128d a, Rt_m128d b) /* CMPNLTSDr */
{
    volatile RpWNIOverlayM128d res;
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    int holds;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpngt_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /*
     * Deliberately the negation of ">" rather than "<=" -- presumably
     * so that NaN operands satisfy the predicate; confirm before
     * simplifying.
     */
    holds = !(lhs.df[0] > rhs.df[0]);

    res._d[0] = holds ? 0xffffffff : 0x0;
    res._d[1] = holds ? 0xffffffff : 0x0;
    res.df[1] = lhs.df[1];

    RWRETURN(res.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpnge_sd tests the lower DP FP value of a against the
 * lower DP FP value of b for "not greater than or equal".  The result's
 * _d[0] and _d[1] words are both set to 0xffffffff when the test holds
 * and to zero when it does not.  The upper DP FP value is copied from a.
 *
 * \param  a   left operand; also supplies the upper DP FP value
 * \param  b   right operand; only its lower DP FP value is read
 *
 * \return the comparison mask together with the upper value of a
 */
Rt_m128d
Rt_mm_cmpnge_sd(Rt_m128d a, Rt_m128d b) /* CMPNLESDr */
{
    volatile RpWNIOverlayM128d res;
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    int holds;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpnge_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /*
     * Deliberately the negation of ">=" rather than "<" -- presumably
     * so that NaN operands satisfy the predicate; confirm before
     * simplifying.
     */
    holds = !(lhs.df[0] >= rhs.df[0]);

    res._d[0] = holds ? 0xffffffff : 0x0;
    res._d[1] = holds ? 0xffffffff : 0x0;
    res.df[1] = lhs.df[1];

    RWRETURN(res.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_comieq_sd tests the lower DP FP values of a and b for
 * equality.
 *
 * \param  a   left operand; only its lower DP FP value is read
 * \param  b   right operand; only its lower DP FP value is read
 *
 * \return 1 if the lower values compare equal, 0 otherwise
 */
int
Rt_mm_comieq_sd(Rt_m128d a, Rt_m128d b) /* COMISD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    int verdict;

    RWAPIFUNCTION(RWSTRING("Rt_mm_comieq_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* A C comparison already yields exactly 0 or 1. */
    verdict = (lhs.df[0] == rhs.df[0]);

    RWRETURN(verdict);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_comilt_sd tests whether the lower DP FP value of a is
 * less than the lower DP FP value of b.
 *
 * \param  a   left operand; only its lower DP FP value is read
 * \param  b   right operand; only its lower DP FP value is read
 *
 * \return 1 if a is less than b, 0 otherwise
 */
int
Rt_mm_comilt_sd(Rt_m128d a, Rt_m128d b) /* COMISD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    int verdict;

    RWAPIFUNCTION(RWSTRING("Rt_mm_comilt_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* A C comparison already yields exactly 0 or 1. */
    verdict = (lhs.df[0] < rhs.df[0]);

    RWRETURN(verdict);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_comile_sd tests whether the lower DP FP value of a is
 * less than or equal to the lower DP FP value of b.
 *
 * \param  a   left operand; only its lower DP FP value is read
 * \param  b   right operand; only its lower DP FP value is read
 *
 * \return 1 if a is less than or equal to b, 0 otherwise
 */
int
Rt_mm_comile_sd(Rt_m128d a, Rt_m128d b) /* COMISD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    int verdict;

    RWAPIFUNCTION(RWSTRING("Rt_mm_comile_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* A C comparison already yields exactly 0 or 1. */
    verdict = (lhs.df[0] <= rhs.df[0]);

    RWRETURN(verdict);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_comigt_sd tests whether the lower DP FP value of a is
 * greater than the lower DP FP value of b.
 *
 * \param  a   left operand; only its lower DP FP value is read
 * \param  b   right operand; only its lower DP FP value is read
 *
 * \return 1 if a is greater than b, 0 otherwise
 */
int
Rt_mm_comigt_sd(Rt_m128d a, Rt_m128d b) /* COMISD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    int verdict;

    RWAPIFUNCTION(RWSTRING("Rt_mm_comigt_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* A C comparison already yields exactly 0 or 1. */
    verdict = (lhs.df[0] > rhs.df[0]);

    RWRETURN(verdict);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_comige_sd tests whether the lower DP FP value of a is
 * greater than or equal to the lower DP FP value of b.
 *
 * \param  a   left operand; only its lower DP FP value is read
 * \param  b   right operand; only its lower DP FP value is read
 *
 * \return 1 if a is greater than or equal to b, 0 otherwise
 */
int
Rt_mm_comige_sd(Rt_m128d a, Rt_m128d b) /* COMISD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    int verdict;

    RWAPIFUNCTION(RWSTRING("Rt_mm_comige_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* A C comparison already yields exactly 0 or 1. */
    verdict = (lhs.df[0] >= rhs.df[0]);

    RWRETURN(verdict);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_comineq_sd tests the lower DP FP values of a and b for
 * inequality.
 *
 * \param  a   left operand; only its lower DP FP value is read
 * \param  b   right operand; only its lower DP FP value is read
 *
 * \return 1 if the lower values are not equal, 0 otherwise
 */
int
Rt_mm_comineq_sd(Rt_m128d a, Rt_m128d b) /* COMISD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    int verdict;

    RWAPIFUNCTION(RWSTRING("Rt_mm_comineq_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    /* A C comparison already yields exactly 0 or 1. */
    verdict = (lhs.df[0] != rhs.df[0]);

    RWRETURN(verdict);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_ucomieq_sd tests the lower DP FP values of a and b for
 * equality.  The emulated body is the same comparison as the one in
 * \ref Rt_mm_comieq_sd.
 *
 * \param  a   left operand; only its lower DP FP value is read
 * \param  b   right operand; only its lower DP FP value is read
 *
 * \return 1 if the lower values compare equal, 0 otherwise
 */
int
Rt_mm_ucomieq_sd(Rt_m128d a, Rt_m128d b) /* UCOMISD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    int verdict;

    RWAPIFUNCTION(RWSTRING("Rt_mm_ucomieq_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    verdict = 0x0;
    if (lhs.df[0] == rhs.df[0])
    {
        verdict = 0x1;
    }

    RWRETURN(verdict);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_ucomilt_sd tests whether the lower DP FP value of a is
 * less than the lower DP FP value of b.
 *
 * \param  a   left operand; only its lower DP FP value is read
 * \param  b   right operand; only its lower DP FP value is read
 *
 * \return 1 if a is less than b, 0 otherwise
 */
int
Rt_mm_ucomilt_sd(Rt_m128d a, Rt_m128d b) /* UCOMISD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    int verdict;

    RWAPIFUNCTION(RWSTRING("Rt_mm_ucomilt_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    verdict = 0x0;
    if (lhs.df[0] < rhs.df[0])
    {
        verdict = 0x1;
    }

    RWRETURN(verdict);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_ucomile_sd tests whether the lower DP FP value of a is
 * less than or equal to the lower DP FP value of b.
 *
 * \param  a   left operand; only its lower DP FP value is read
 * \param  b   right operand; only its lower DP FP value is read
 *
 * \return 1 if a is less than or equal to b, 0 otherwise
 */
int
Rt_mm_ucomile_sd(Rt_m128d a, Rt_m128d b) /* UCOMISD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    int verdict;

    RWAPIFUNCTION(RWSTRING("Rt_mm_ucomile_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    verdict = 0x0;
    if (lhs.df[0] <= rhs.df[0])
    {
        verdict = 0x1;
    }

    RWRETURN(verdict);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_ucomigt_sd tests whether the lower DP FP value of a is
 * greater than the lower DP FP value of b.
 *
 * \param  a   left operand; only its lower DP FP value is read
 * \param  b   right operand; only its lower DP FP value is read
 *
 * \return 1 if a is greater than b, 0 otherwise
 */
int
Rt_mm_ucomigt_sd(Rt_m128d a, Rt_m128d b) /* UCOMISD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    int verdict;

    RWAPIFUNCTION(RWSTRING("Rt_mm_ucomigt_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    verdict = 0x0;
    if (lhs.df[0] > rhs.df[0])
    {
        verdict = 0x1;
    }

    RWRETURN(verdict);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_ucomige_sd tests whether the lower DP FP value of a is
 * greater than or equal to the lower DP FP value of b.
 *
 * \param  a   left operand; only its lower DP FP value is read
 * \param  b   right operand; only its lower DP FP value is read
 *
 * \return 1 if a is greater than or equal to b, 0 otherwise
 */
int
Rt_mm_ucomige_sd(Rt_m128d a, Rt_m128d b) /* UCOMISD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    int verdict;

    RWAPIFUNCTION(RWSTRING("Rt_mm_ucomige_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    verdict = 0x0;
    if (lhs.df[0] >= rhs.df[0])
    {
        verdict = 0x1;
    }

    RWRETURN(verdict);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_ucomineq_sd tests the lower DP FP values of a and b for
 * inequality.
 *
 * \param  a   left operand; only its lower DP FP value is read
 * \param  b   right operand; only its lower DP FP value is read
 *
 * \return 1 if the lower values are not equal, 0 otherwise
 */
int
Rt_mm_ucomineq_sd(Rt_m128d a, Rt_m128d b) /* UCOMISD */
{
    volatile RpWNIOverlayM128d lhs;
    volatile RpWNIOverlayM128d rhs;
    int verdict;

    RWAPIFUNCTION(RWSTRING("Rt_mm_ucomineq_sd"));

    lhs.m128d = a;
    rhs.m128d = b;

    verdict = 0x0;
    if (lhs.df[0] != rhs.df[0])
    {
        verdict = 0x1;
    }

    RWRETURN(verdict);
}

/* Conversion Operations  */

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvtpd_ps narrows the two DP FP values of a to SP FP
 * values.  They land in SP elements 0 and 1 of the result; SP elements
 * 2 and 3 are set to zero.
 *
 * \param  a   the two DP FP values to narrow
 *
 * \return four SP FP values: the two converted values, then two zeros
 */
Rt_m128
Rt_mm_cvtpd_ps(Rt_m128d a)      /* CVTPD2PS */
{
    volatile RpWNIOverlayM128 dst;
    volatile RpWNIOverlayM128d src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvtpd_ps"));

    src.m128d = a;

    /* Narrowing is done with plain C double -> float casts. */
    dst._f[0] = (float) src.df[0];
    dst._f[1] = (float) src.df[1];

    /* Nothing feeds the top half of the result, so it is cleared. */
    dst._f[2] = 0.0;
    dst._f[3] = 0.0;

    RWRETURN(dst.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvtps_pd widens SP FP elements 0 and 1 of a to DP FP
 * values.  SP elements 2 and 3 of a are not read.
 *
 * \param  a   source of the two SP FP values
 *
 * \return the two widened DP FP values
 */
Rt_m128d
Rt_mm_cvtps_pd(Rt_m128 a)       /* CVTPS2PD */
{
    volatile RpWNIOverlayM128d dst;
    volatile RpWNIOverlayM128 src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvtps_pd"));

    src.m128 = a;

    /* Widening is done with plain C float -> double casts. */
    dst.df[0] = (double) src._f[0];
    dst.df[1] = (double) src._f[1];

    RWRETURN(dst.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvtepi32_pd converts integer elements _d[0] and _d[1] of
 * a (the lower two signed 32bit integers) to DP FP values.
 *
 * \param  a   source of the two integer values
 *
 * \return the two converted DP FP values
 */
Rt_m128d
Rt_mm_cvtepi32_pd(Rt_m128i a)   /* CVTDQ2PD */
{
    volatile RpWNIOverlayM128d dst;
    volatile RpWNIOverlayM128i src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvtepi32_pd"));

    src.m128i = a;

    /* Only the first two integer elements take part. */
    dst.df[0] = (double) src._d[0];
    dst.df[1] = (double) src._d[1];

    RWRETURN(dst.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvtpd_epi32 converts the two DP FP values of a to 32bit
 * signed integer values.  They are stored in integer elements 0 and 1
 * of the result; integer elements 2 and 3 are set to zero.
 *
 * \param  a   the two DP FP values to convert
 *
 * \return four 32bit integers: the two converted values, then two zeros
 */
Rt_m128i
Rt_mm_cvtpd_epi32(Rt_m128d a)   /* CVTPD2DQ */
{
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128d ao;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvtpd_epi32"));

    ao.m128d = a;

    /*
     * The results must be written through the integer view (_d) of the
     * overlay.  Writing them through the float view (_f) would leave
     * float bit patterns in the integer vector.
     *
     * NOTE(review): the C cast truncates toward zero; the hardware
     * instruction presumably honours the current rounding mode --
     * confirm whether the emulation should as well.
     */
    ro._d[0] = (int) ao.df[0];
    ro._d[1] = (int) ao.df[1];
    ro._d[2] = 0x0;
    ro._d[3] = 0x0;

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvtsd_si32 converts the lower DP FP value of a to a 32bit
 * signed integer value.
 *
 * \param  a   source; only its lower DP FP value is read
 *
 * \return the lower DP FP value of a cast to int
 */
int
Rt_mm_cvtsd_si32(Rt_m128d a)    /* CVTSD2SI */
{
    volatile RpWNIOverlayM128d src;
    int converted;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvtsd_si32"));

    src.m128d = a;

    /*
     * NOTE(review): the C cast truncates toward zero; the hardware
     * instruction presumably honours the current rounding mode --
     * confirm whether the emulation should as well.
     */
    converted = (int) src.df[0];

    RWRETURN(converted);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvtsd_ss narrows the lower DP FP value of b to an SP FP
 * value and places it in SP element 0 of the result.  SP elements 1 to
 * 3 are copied from a.
 *
 * \param  a   supplies SP elements 1, 2 and 3 of the result
 * \param  b   source; only its lower DP FP value is read
 *
 * \return a with its SP element 0 replaced by the narrowed value
 */
Rt_m128
Rt_mm_cvtsd_ss(Rt_m128 a, Rt_m128d b) /* CVTSD2SS */
{
    volatile RpWNIOverlayM128 res;
    volatile RpWNIOverlayM128 keep;
    volatile RpWNIOverlayM128d src;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvtsd_ss"));

    keep.m128 = a;
    src.m128d = b;

    res._f[0] = (float) src.df[0];

    /* The remaining three SP elements pass straight through from a. */
    for (lane = 1; lane < 4; lane++)
    {
        res._f[lane] = keep._f[lane];
    }

    RWRETURN(res.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvtsi32_sd converts the signed integer b to a DP FP value
 * and places it in the lower element of the result.  The upper DP FP
 * value is copied from a.
 *
 * \param  a   supplies the upper DP FP value of the result
 * \param  b   integer to convert
 *
 * \return a with its lower DP FP value replaced by (double) b
 */
Rt_m128d
Rt_mm_cvtsi32_sd(Rt_m128d a, int b) /* CVTSI2SD */
{
    volatile RpWNIOverlayM128d res;
    volatile RpWNIOverlayM128d keep;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvtsi32_sd"));

    keep.m128d = a;

    /* Low element: the converted integer.  High element: from a. */
    res.df[0] = (double) b;
    res.df[1] = keep.df[1];

    RWRETURN(res.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvtss_sd widens SP element 0 of b to a DP FP value and
 * places it in the lower element of the result.  The upper DP FP value
 * is copied from a.
 *
 * \param  a   supplies the upper DP FP value of the result
 * \param  b   source; only its SP element 0 is read
 *
 * \return a with its lower DP FP value replaced by the widened value
 */
Rt_m128d
Rt_mm_cvtss_sd(Rt_m128d a, Rt_m128 b) /* CVTSS2SD */
{
    volatile RpWNIOverlayM128d res;
    volatile RpWNIOverlayM128d keep;
    volatile RpWNIOverlayM128 src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvtss_sd"));

    keep.m128d = a;
    src.m128 = b;

    /* Low element: the widened float.  High element: from a. */
    res.df[0] = (double) src._f[0];
    res.df[1] = keep.df[1];

    RWRETURN(res.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvttpd_epi32 converts the two DP FP values of a to 32 bit
 * signed integers using truncation.  They are stored in integer
 * elements 0 and 1 of the result; integer elements 2 and 3 are set to
 * zero.
 *
 * \param  a   the two DP FP values to convert
 *
 * \return four 32 bit integers: the two converted values, then two zeros
 */
Rt_m128i
Rt_mm_cvttpd_epi32(Rt_m128d a)  /* CVTTPD2DQ */
{
    volatile RpWNIOverlayM128i dst;
    volatile RpWNIOverlayM128d src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvttpd_epi32"));

    src.m128d = a;

    /* The C (int) cast supplies the truncation. */
    dst._d[0] = (int) src.df[0];
    dst._d[1] = (int) src.df[1];

    /* Nothing feeds the top half of the result, so it is cleared. */
    dst._d[2] = 0x0;
    dst._d[3] = 0x0;

    RWRETURN(dst.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvttsd_si32 converts the lower DP FP value of a to a 32
 * bit signed integer using truncation.
 *
 * \param  a   source; only its lower DP FP value is read
 *
 * \return the lower DP FP value of a cast to int
 */
int
Rt_mm_cvttsd_si32(Rt_m128d a)   /* CVTTSD2SI */
{
    volatile RpWNIOverlayM128d src;
    int truncated;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvttsd_si32"));

    src.m128d = a;

    /* The C (int) cast supplies the truncation. */
    truncated = (int) src.df[0];

    RWRETURN(truncated);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvtepi32_ps converts the 4 signed 32 bit integer values
 * of a to SP FP values, element by element.
 *
 * \param  a   the four integer values to convert
 *
 * \return the four converted SP FP values
 */
Rt_m128
Rt_mm_cvtepi32_ps(Rt_m128i a)   /* CVTDQ2PS */
{
    volatile RpWNIOverlayM128 dst;
    volatile RpWNIOverlayM128i src;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvtepi32_ps"));

    src.m128i = a;

    /* Element i of the result is the cast of element i of the input. */
    for (lane = 0; lane < 4; lane++)
    {
        dst._f[lane] = (float) src._d[lane];
    }

    RWRETURN(dst.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvtps_epi32 converts the 4 SP FP values of a to signed 32
 * bit integer values, element by element.
 *
 * \param  a   the four SP FP values to convert
 *
 * \return the four converted integer values
 */
Rt_m128i
Rt_mm_cvtps_epi32(Rt_m128 a)    /* CVTPS2DQ */
{
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128 ao;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvtps_epi32"));

    ao.m128 = a;

    /*
     * Each result element comes from the input element with the same
     * index; elements 2 and 3 must not be taken from element 1.
     *
     * NOTE(review): the C cast truncates toward zero; the hardware
     * instruction presumably honours the current rounding mode --
     * confirm whether the emulation should as well.
     */
    for (lane = 0; lane < 4; lane++)
    {
        ro._d[lane] = (int) ao._f[lane];
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvttps_epi32 converts the 4 SP FP values of a to signed
 * 32 bit integer values using truncation, element by element.
 *
 * \param  a   the four SP FP values to convert
 *
 * \return the four truncated integer values
 */
Rt_m128i
Rt_mm_cvttps_epi32(Rt_m128 a)   /* CVTTPS2DQ */
{
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128 ao;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvttps_epi32"));

    ao.m128 = a;

    /*
     * Each result element comes from the input element with the same
     * index; elements 2 and 3 must not be taken from element 1.  The C
     * (int) cast supplies the truncation.
     */
    for (lane = 0; lane < 4; lane++)
    {
        ro._d[lane] = (int) ao._f[lane];
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvtpd_pi32 converts the two DP FP values of a to 32bit
 * signed integer values packed into a 64bit result.
 *
 * \param  a   the two DP FP values to convert
 *
 * \return the two converted integer values
 */
Rt_m64
Rt_mm_cvtpd_pi32(Rt_m128d a)    /* CVTPD2PI */
{
    volatile RwOverlayM64 dst;
    volatile RpWNIOverlayM128d src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvtpd_pi32"));

    src.m128d = a;

    /*
     * NOTE(review): the C cast truncates toward zero; the hardware
     * instruction presumably honours the current rounding mode --
     * confirm whether the emulation should as well.
     */
    dst._d[0] = (int) src.df[0];
    dst._d[1] = (int) src.df[1];

    RWRETURN(dst.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvttpd_pi32 converts the two DP FP values of a to 32bit
 * signed integer values using truncation, packed into a 64bit result.
 *
 * \param  a   the two DP FP values to convert
 *
 * \return the two truncated integer values
 */
Rt_m64
Rt_mm_cvttpd_pi32(Rt_m128d a)   /* CVTTPD2PI */
{
    volatile RwOverlayM64 dst;
    volatile RpWNIOverlayM128d src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvttpd_pi32"));

    src.m128d = a;

    /* The C (int) cast supplies the truncation. */
    dst._d[0] = (int) src.df[0];
    dst._d[1] = (int) src.df[1];

    RWRETURN(dst.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvtpi32_pd converts the two 32bit signed integer values
 * of a to DP FP values.
 *
 * \param  a   the two integer values to convert
 *
 * \return the two converted DP FP values
 */
Rt_m128d
Rt_mm_cvtpi32_pd(Rt_m64 a)      /* CVTPI2PD */
{
    volatile RpWNIOverlayM128d dst;
    volatile RwOverlayM64 src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvtpi32_pd"));

    src.m64 = a;

    /* Element i of the result is the cast of element i of the input. */
    dst.df[0] = (double) src._d[0];
    dst.df[1] = (double) src._d[1];

    RWRETURN(dst.m128d);
}

/* Miscellaneous Operations  */

/**
 * \ingroup rtintel
 * \ref Rt_mm_unpackhi_pd interleaves the upper DP FP values of a and
 * b: the lower element of the result is the upper value of a and the
 * upper element is the upper value of b.
 *
 * \param  a   supplies the lower element of the result
 * \param  b   supplies the upper element of the result
 *
 * \return the pair (upper of a, upper of b)
 */
Rt_m128d
Rt_mm_unpackhi_pd(Rt_m128d a, Rt_m128d b) /* UNPCKHPD */
{
    volatile RpWNIOverlayM128d res;
    volatile RpWNIOverlayM128d first;
    volatile RpWNIOverlayM128d second;

    RWAPIFUNCTION(RWSTRING("Rt_mm_unpackhi_pd"));

    first.m128d = a;
    second.m128d = b;

    /* Both inputs contribute their element 1. */
    res.df[0] = first.df[1];
    res.df[1] = second.df[1];

    RWRETURN(res.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_unpacklo_pd interleaves the lower DP FP values of a and
 * b: the lower element of the result is the lower value of a and the
 * upper element is the lower value of b.
 *
 * \param  a   supplies the lower element of the result
 * \param  b   supplies the upper element of the result
 *
 * \return the pair (lower of a, lower of b)
 */
Rt_m128d
Rt_mm_unpacklo_pd(Rt_m128d a, Rt_m128d b) /* UNPCKLPD */
{
    volatile RpWNIOverlayM128d res;
    volatile RpWNIOverlayM128d first;
    volatile RpWNIOverlayM128d second;

    RWAPIFUNCTION(RWSTRING("Rt_mm_unpacklo_pd"));

    first.m128d = a;
    second.m128d = b;

    /* Both inputs contribute their element 0. */
    res.df[0] = first.df[0];
    res.df[1] = second.df[0];

    RWRETURN(res.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_movemask_pd creates a two bit mask from the sign bits of
 * the two DP FP values of a.  The sign of the upper value is shifted
 * into bit 1 and the sign of the lower value is OR-ed in below it.
 *
 * \param  a   the two DP FP values whose signs are collected
 *
 * \return the combined mask
 */
int
Rt_mm_movemask_pd(Rt_m128d a)   /* MOVMSKPD */
{
    volatile RpWNIOverlayM128d src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_movemask_pd"));

    src.m128d = a;

    /*
     * sign() is defined elsewhere in this file; presumably it yields 1
     * for a negative value and 0 otherwise -- confirm against its
     * definition.
     */
    RWRETURN((sign(src.df[1]) << 1) | sign(src.df[0]));
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_shuffle_pd selects two specific DP FP values from a and
 * b, based on the mask i.  Bit 0 of i chooses which element of a
 * becomes the lower element of the result (0 = lower, 1 = upper).
 * Bit 1 of i chooses which element of b becomes the upper element of
 * the result (0 = lower, 1 = upper).  For the real intrinsic the mask
 * must be an immediate; the emulation accepts any int and ignores all
 * bits above bit 1.
 *
 * \param  a   source of the lower element of the result
 * \param  b   source of the upper element of the result
 * \param  i   two bit selection mask
 *
 * \return the pair (selected element of a, selected element of b)
 */
Rt_m128d
Rt_mm_shuffle_pd(Rt_m128d a, Rt_m128d b, int i) /* SHUFPD */
{
    volatile RpWNIOverlayM128d ro;
    volatile RpWNIOverlayM128d ao;
    volatile RpWNIOverlayM128d bo;

    RWAPIFUNCTION(RWSTRING("Rt_mm_shuffle_pd"));

    ao.m128d = a;
    bo.m128d = b;

    /* Bit 0 selects from a, bit 1 selects from b. */
    ro.df[0] = (i & 0x1) ? ao.df[1] : ao.df[0];
    ro.df[1] = (i & 0x2) ? bo.df[1] : bo.df[0];

    RWRETURN(ro.m128d);
}

/* Load Operations  */

/**
 * \ingroup rtintel
 * \ref Rt_mm_load_pd loads two DP FP values from p[0] and p[1].  The
 * intrinsic being emulated requires p to be 16byte aligned; the
 * emulation itself reads the two doubles individually and performs no
 * alignment check.
 *
 * \param  p   address of two consecutive doubles
 *
 * \return the pair (p[0], p[1])
 */
Rt_m128d
Rt_mm_load_pd(const double *p)  /* MOVAPD */
{
    volatile RpWNIOverlayM128d loaded;

    RWAPIFUNCTION(RWSTRING("Rt_mm_load_pd"));

    /* Memory order is kept: first double low, second double high. */
    loaded.df[0] = p[0];
    loaded.df[1] = p[1];

    RWRETURN(loaded.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_load1_pd loads a single DP FP value and copies it to both
 * elements of the result.  The address p need not be 16byte aligned.
 *
 * \param  p   address of the double to load
 *
 * \return the pair (*p, *p)
 */
Rt_m128d
Rt_mm_load1_pd(const double *p) /* (MOVSD + shuffling) */
{
    volatile RpWNIOverlayM128d loaded;

    RWAPIFUNCTION(RWSTRING("Rt_mm_load1_pd"));

    /* The same memory location feeds both elements. */
    loaded.df[0] = *p;
    loaded.df[1] = *p;

    RWRETURN(loaded.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_loadr_pd loads two DP FP values in reverse order: p[1]
 * becomes the lower element and p[0] the upper element.  The intrinsic
 * being emulated requires p to be 16byte aligned; the emulation itself
 * performs no alignment check.
 *
 * \param  p   address of two consecutive doubles
 *
 * \return the pair (p[1], p[0])
 */
Rt_m128d
Rt_mm_loadr_pd(const double *p) /* (MOVAPD + shuffling) */
{
    volatile RpWNIOverlayM128d loaded;

    RWAPIFUNCTION(RWSTRING("Rt_mm_loadr_pd"));

    /* Swapped relative to memory order. */
    loaded.df[0] = p[1];
    loaded.df[1] = p[0];

    RWRETURN(loaded.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_loadu_pd loads two DP FP values from p[0] and p[1].  The
 * address p need not be 16byte aligned.  The emulated body is the same
 * as that of \ref Rt_mm_load_pd.
 *
 * \param  p   address of two consecutive doubles
 *
 * \return the pair (p[0], p[1])
 */
Rt_m128d
Rt_mm_loadu_pd(const double *p) /* MOVUPD */
{
    volatile RpWNIOverlayM128d loaded;

    RWAPIFUNCTION(RWSTRING("Rt_mm_loadu_pd"));

    /* Memory order is kept: first double low, second double high. */
    loaded.df[0] = p[0];
    loaded.df[1] = p[1];

    RWRETURN(loaded.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_load_sd loads one DP FP value into the lower element of
 * the result and sets the upper element to zero.  The address p need
 * not be 16byte aligned.
 *
 * \param  p   address of the double to load
 *
 * \return the pair (*p, 0.0)
 */
Rt_m128d
Rt_mm_load_sd(const double *p)  /* MOVSD */
{
    volatile RpWNIOverlayM128d loaded;

    RWAPIFUNCTION(RWSTRING("Rt_mm_load_sd"));

    /* Only the low element comes from memory; the high one is zeroed. */
    loaded.df[0] = *p;
    loaded.df[1] = 0.0;

    RWRETURN(loaded.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_loadh_pd loads a DP FP value as the upper DP FP value of
 * the result.  The lower DP FP value is copied from a.  The address p
 * need not be 16byte aligned.
 *
 * \param  a   supplies the lower DP FP value of the result
 * \param  p   address of the double that becomes the upper value
 *
 * \return the pair (lower of a, *p)
 */
Rt_m128d
Rt_mm_loadh_pd(Rt_m128d a, const double *p) /* MOVHPD */
{
    volatile RpWNIOverlayM128d res;
    volatile RpWNIOverlayM128d keep;

    RWAPIFUNCTION(RWSTRING("Rt_mm_loadh_pd"));

    keep.m128d = a;

    /* Low element survives from a; high element comes from memory. */
    res.df[0] = keep.df[0];
    res.df[1] = *p;

    RWRETURN(res.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_loadl_pd loads a DP FP value as the lower DP FP value of
 * the result.  The upper DP FP value is copied from a.  The address p
 * need not be 16byte aligned.
 *
 * \param  a   supplies the upper DP FP value of the result
 * \param  p   address of the double that becomes the lower value
 *
 * \return the pair (*p, upper of a)
 */
Rt_m128d
Rt_mm_loadl_pd(Rt_m128d a, const double *p) /* MOVLPD */
{
    volatile RpWNIOverlayM128d res;
    volatile RpWNIOverlayM128d keep;

    RWAPIFUNCTION(RWSTRING("Rt_mm_loadl_pd"));

    keep.m128d = a;

    /* Low element comes from memory; high element survives from a. */
    res.df[0] = *p;
    res.df[1] = keep.df[1];

    RWRETURN(res.m128d);
}

/* Set Operations  */

/**
 * \ingroup rtintel
 * \ref Rt_mm_set_sd sets the lower DP FP value to w and the upper DP
 * FP value to zero.
 *
 * \param  w   value for the lower element
 *
 * \return the pair (w, 0.0)
 */
Rt_m128d
Rt_mm_set_sd(double w)
{
    volatile RpWNIOverlayM128d packed;

    RWAPIFUNCTION(RWSTRING("Rt_mm_set_sd"));

    /* Scalar goes in the low element; the high element is cleared. */
    packed.df[0] = w;
    packed.df[1] = 0.0;

    RWRETURN(packed.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_set1_pd sets both DP FP values of the result to w.
 *
 * \param  w   value to replicate
 *
 * \return the pair (w, w)
 */
Rt_m128d
Rt_mm_set1_pd(double w)
{
    volatile RpWNIOverlayM128d packed;

    RWAPIFUNCTION(RWSTRING("Rt_mm_set1_pd"));

    /* Broadcast the scalar to both elements. */
    packed.df[0] = w;
    packed.df[1] = w;

    RWRETURN(packed.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_set_pd sets the lower DP FP value to x and the upper DP
 * FP value to w.  Note that the arguments are given upper first; see
 * \ref Rt_mm_setr_pd for the opposite order.
 *
 * \param  w   value for the upper element
 * \param  x   value for the lower element
 *
 * \return the pair (x, w)
 */
Rt_m128d
Rt_mm_set_pd(double w, double x)
{
    volatile RpWNIOverlayM128d packed;

    RWAPIFUNCTION(RWSTRING("Rt_mm_set_pd"));

    /* Second argument is the low element, first argument the high. */
    packed.df[0] = x;
    packed.df[1] = w;

    RWRETURN(packed.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_setr_pd sets the lower DP FP value to w and the upper DP
 * FP value to x, i.e. the arguments are given lower first.
 *
 * \param  w   value for the lower element
 * \param  x   value for the upper element
 *
 * \return the pair (w, x)
 */
Rt_m128d
Rt_mm_setr_pd(double w, double x)
{
    volatile RpWNIOverlayM128d packed;

    RWAPIFUNCTION(RWSTRING("Rt_mm_setr_pd"));

    /* Arguments map onto the elements in the order they are given. */
    packed.df[0] = w;
    packed.df[1] = x;

    RWRETURN(packed.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_setzero_pd sets both DP FP values of the result to zero.
 *
 * \return the pair (0.0, 0.0)
 */
Rt_m128d
Rt_mm_setzero_pd(void)          /* XORPD */
{
    volatile RpWNIOverlayM128d cleared;

    RWAPIFUNCTION(RWSTRING("Rt_mm_setzero_pd"));

    /* Both elements are written explicitly with 0.0. */
    cleared.df[0] = 0.0;
    cleared.df[1] = 0.0;

    RWRETURN(cleared.m128d);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_move_sd builds a result whose lower DP FP value is the
 * lower DP FP value of b and whose upper DP FP value is the upper DP
 * FP value of a.
 *
 * \param  a   supplies the upper DP FP value of the result
 * \param  b   supplies the lower DP FP value of the result
 *
 * \return the pair (lower of b, upper of a)
 */
Rt_m128d
Rt_mm_move_sd(Rt_m128d a, Rt_m128d b) /* MOVSD */
{
    volatile RpWNIOverlayM128d res;
    volatile RpWNIOverlayM128d keep;
    volatile RpWNIOverlayM128d src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_move_sd"));

    keep.m128d = a;
    src.m128d = b;

    /* Low element from b, high element from a. */
    res.df[0] = src.df[0];
    res.df[1] = keep.df[1];

    RWRETURN(res.m128d);
}

/* Store Operations  */

/**
 * \ingroup rtintel
 * \ref Rt_mm_stream_pd stores the data in a to the address p.
 * The emulated instruction stores without polluting the caches and
 * requires p to be 16byte aligned; this software version performs two
 * ordinary double stores and does not check the alignment.
 *
 * \param  p   destination; receives two doubles
 * \param  a   value whose two DP FP elements are stored
 *
 */
void
Rt_mm_stream_pd(double *p, Rt_m128d a)
{
    /* Overlay giving per-element access to the source value. */
    volatile RpWNIOverlayM128d src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_stream_pd"));

    src.m128d = a;

    /* Lower element first, then the upper element. */
    p[0] = src.df[0];
    p[1] = src.df[1];

    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_store_sd stores the lower DP FP value of a.  The
 * address p need not be 16byte aligned.
 *
 * \param  p   destination; receives one double
 * \param  a   value whose lower DP FP element is stored
 *
 */
void
Rt_mm_store_sd(double *p, Rt_m128d a) /* MOVSD */
{
    /* Overlay giving per-element access to the source value. */
    volatile RpWNIOverlayM128d src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_store_sd"));

    src.m128d = a;

    /* Only the lower element is written out. */
    p[0] = src.df[0];

    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_store1_pd stores the lower DP FP value of a twice.
 * The emulated instruction requires the address p to be 16byte aligned.
 *
 * \param  p   destination; receives two doubles
 * \param  a   value whose lower DP FP element is stored to p[0] and p[1]
 *
 */
void
Rt_mm_store1_pd(double *p, Rt_m128d a) /* (MOVAPD + shuffling) */
{
    /* Overlay giving per-element access to the source value. */
    volatile RpWNIOverlayM128d ao;

    RWAPIFUNCTION(RWSTRING("Rt_mm_store1_pd"));

    ao.m128d = a;

    /* Duplicate the lower element into both destination slots. */
    p[0] = ao.df[0];
    p[1] = ao.df[0];

    /*
     * Close the RWAPIFUNCTION scope, as every other store routine in
     * this file does; the original fell off the end without it.
     */
    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_store_pd stores two DP FP values.  The emulated
 * instruction requires the address p to be 16byte aligned.
 *
 * \param  p   destination; receives two doubles
 * \param  a   value whose two DP FP elements are stored
 *
 */
void
Rt_mm_store_pd(double *p, Rt_m128d a) /* MOVAPD */
{
    /* Overlay giving per-element access to the source value. */
    volatile RpWNIOverlayM128d src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_store_pd"));

    src.m128d = a;

    /* Elements are written in their natural order: lower, then upper. */
    p[0] = src.df[0];
    p[1] = src.df[1];

    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_storeu_pd stores two DP FP values.  The address p
 * need not be 16 byte aligned.
 *
 * \param  p   destination; receives two doubles
 * \param  a   value whose two DP FP elements are stored
 *
 */
void
Rt_mm_storeu_pd(double *p, Rt_m128d a) /* MOVUPD */
{
    /* Overlay giving per-element access to the source value. */
    volatile RpWNIOverlayM128d src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_storeu_pd"));

    src.m128d = a;

    /* Elements are written in their natural order: lower, then upper. */
    p[0] = src.df[0];
    p[1] = src.df[1];

    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_storer_pd stores two DP FP values in reverse order.
 * The emulated instruction requires the address p to be 16 byte aligned.
 *
 * \param  p   destination; receives two doubles
 * \param  a   value whose two DP FP elements are stored swapped
 *
 */
void
Rt_mm_storer_pd(double *p, Rt_m128d a) /* (MOVAPD + shuffling) */
{
    /* Overlay giving per-element access to the source value. */
    volatile RpWNIOverlayM128d src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_storer_pd"));

    src.m128d = a;

    /* Swap on the way out: upper element to p[0], lower to p[1]. */
    p[0] = src.df[1];
    p[1] = src.df[0];

    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_storeh_pd stores the upper DP FP value of a.
 *
 * \param  p   destination; receives one double
 * \param  a   value whose upper DP FP element is stored
 *
 */
void
Rt_mm_storeh_pd(double *p, Rt_m128d a) /* MOVHPD */
{
    /* Overlay giving per-element access to the source value. */
    volatile RpWNIOverlayM128d src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_storeh_pd"));

    src.m128d = a;

    /* Element 1 is the upper DP FP value. */
    p[0] = src.df[1];

    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_storel_pd stores the lower DP FP value of a.
 *
 * \param  p   destination; receives one double
 * \param  a   value whose lower DP FP element is stored
 *
 */
void
Rt_mm_storel_pd(double *p, Rt_m128d a) /* MOVLPD */
{
    /* Overlay giving per-element access to the source value. */
    volatile RpWNIOverlayM128d src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_storel_pd"));

    src.m128d = a;

    /* Element 0 is the lower DP FP value. */
    p[0] = src.df[0];

    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_add_epi8 adds the 16 signed or unsigned 8bit
 * integers in a to the 16 signed or unsigned 8bit integers in b.
 *
 * \param  a   first vector of 16 8bit integers
 * \param  b   second vector of 16 8bit integers
 *
 * \return vector holding the 16 element-wise sums
 */
Rt_m128i
Rt_mm_add_epi8(Rt_m128i a, Rt_m128i b) /* PADDB */
{
    volatile RpWNIOverlayM128i sum;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_add_epi8"));

    lhs.m128i = a;
    rhs.m128i = b;

    /* Lane-by-lane addition over the 16 byte elements. */
    for (lane = 0; lane < 16; lane++)
    {
        sum._b[lane] = lhs._b[lane] + rhs._b[lane];
    }

    RWRETURN(sum.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_add_epi16 adds the 8 signed or unsigned 16bit
 * integers in a to the 8 signed or unsigned 16bit integers in b.
 *
 * \param  a   first vector of 8 16bit integers
 * \param  b   second vector of 8 16bit integers
 *
 * \return vector holding the 8 element-wise sums
 */
Rt_m128i
Rt_mm_add_epi16(Rt_m128i a, Rt_m128i b) /* PADDW */
{
    volatile RpWNIOverlayM128i sum;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_add_epi16"));

    lhs.m128i = a;
    rhs.m128i = b;

    /* Lane-by-lane addition over the 8 word elements. */
    for (lane = 0; lane < 8; lane++)
    {
        sum._w[lane] = lhs._w[lane] + rhs._w[lane];
    }

    RWRETURN(sum.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_add_epi32 adds the 4 signed or unsigned 32bit
 * integers in a to the 4 signed or unsigned 32bit integers in b.
 *
 * \param  a   first vector of 4 32bit integers
 * \param  b   second vector of 4 32bit integers
 *
 * \return vector holding the 4 element-wise sums
 */
Rt_m128i
Rt_mm_add_epi32(Rt_m128i a, Rt_m128i b) /* PADDD */
{
    volatile RpWNIOverlayM128i sum;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_add_epi32"));

    lhs.m128i = a;
    rhs.m128i = b;

    /* Lane-by-lane addition over the 4 doubleword elements. */
    for (lane = 0; lane < 4; lane++)
    {
        sum._d[lane] = lhs._d[lane] + rhs._d[lane];
    }

    RWRETURN(sum.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_add_si64 adds the signed or unsigned 64bit integer
 * a to the signed or unsigned 64bit integer b.
 *
 * \param  a   first 64bit operand
 * \param  b   second 64bit operand
 *
 * \return result as described above
 */
Rt_m64
Rt_mm_add_si64(Rt_m64 a, Rt_m64 b) /* PADDQ */
{
    volatile RwOverlayM64 sum;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;

    RWAPIFUNCTION(RWSTRING("Rt_mm_add_si64"));

    lhs.m64 = a;
    rhs.m64 = b;

    /* The 64bit addition itself is delegated to _m64Add. */
    sum.m64 = _m64Add(lhs.m64, rhs.m64);

    RWRETURN(sum.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_add_epi64 adds the 2 signed or unsigned 64bit
 * integers in a to the 2 signed or unsigned 64bit integers in b.
 *
 * \param  a   first vector of 2 64bit integers
 * \param  b   second vector of 2 64bit integers
 *
 * \return vector holding the 2 element-wise sums
 */
Rt_m128i
Rt_mm_add_epi64(Rt_m128i a, Rt_m128i b) /* PADDQ */
{
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;

    RWAPIFUNCTION(RWSTRING("Rt_mm_add_epi64"));

    ao.m128i = a;
    bo.m128i = b;

    /*
     * Each 64bit half is added with _m64Add, the same helper
     * Rt_mm_add_si64 uses.  (The original called _m64Sub here, which
     * made this "add" routine subtract.)
     */
    ro.m64[0] = _m64Add(ao.m64[0], bo.m64[0]);
    ro.m64[1] = _m64Add(ao.m64[1], bo.m64[1]);

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_adds_epi8 adds the 16 signed 8bit integers in a to
 * the 16 signed 8bit integers in b and saturates.
 *
 * \param  a   first vector of 16 signed 8bit integers
 * \param  b   second vector of 16 signed 8bit integers
 *
 * \return vector holding the 16 saturated element-wise sums
 */
Rt_m128i
Rt_mm_adds_epi8(Rt_m128i a, Rt_m128i b) /* PADDSB */
{
    volatile RpWNIOverlayM128i sum;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_adds_epi8"));

    lhs.m128i = a;
    rhs.m128i = b;

    /*
     * Widen each pair to int before adding, then let
     * SignedSaturateByte bring the sum back to byte range.
     */
    for (lane = 0; lane < 16; lane++)
    {
        sum._b[lane] =
            SignedSaturateByte(((int) lhs._b[lane]) +
                               ((int) rhs._b[lane]));
    }

    RWRETURN(sum.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_adds_epi16 adds the 8 signed 16bit integers in a
 * to the 8 signed 16bit integers in b and saturates.
 *
 * \param  a   first vector of 8 signed 16bit integers
 * \param  b   second vector of 8 signed 16bit integers
 *
 * \return vector holding the 8 saturated element-wise sums
 */
Rt_m128i
Rt_mm_adds_epi16(Rt_m128i a, Rt_m128i b) /* PADDSW */
{
    volatile RpWNIOverlayM128i sum;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_adds_epi16"));

    lhs.m128i = a;
    rhs.m128i = b;

    /*
     * Widen each pair to int before adding, then let
     * SignedSaturateWord bring the sum back to word range.
     */
    for (lane = 0; lane < 8; lane++)
    {
        sum._w[lane] =
            SignedSaturateWord(((int) lhs._w[lane]) +
                               ((int) rhs._w[lane]));
    }

    RWRETURN(sum.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_adds_epu8 adds the 16 unsigned 8bit integers in a
 * to the 16 unsigned 8bit integers in b and saturates.
 *
 * \param  a   first vector of 16 unsigned 8bit integers
 * \param  b   second vector of 16 unsigned 8bit integers
 *
 * \return vector holding the 16 saturated element-wise sums
 */
Rt_m128i
Rt_mm_adds_epu8(Rt_m128i a, Rt_m128i b) /* PADDUSB */
{
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 i;

    RWAPIFUNCTION(RWSTRING("Rt_mm_adds_epu8"));

    ao.m128i = a;
    bo.m128i = b;

    /*
     * Operands are read through the unsigned byte view (ub), the view
     * the other unsigned 8bit routines in this file use.  Widening a
     * byte with its top bit set through a signed view would
     * sign-extend it, so the sum handed to UnSignedSaturateByte would
     * no longer be the true unsigned sum.
     */
    for (i = 0; i < 16; i++)
    {
        ro.ub[i] = UnSignedSaturateByte(((unsigned int) ao.ub[i]) +
                                        ((unsigned int) bo.ub[i]));
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_adds_epu16 adds the 8 unsigned 16bit integers in a
 * to the 8 unsigned 16bit integers in b and saturates.
 *
 * \param  a   first vector of 8 unsigned 16bit integers
 * \param  b   second vector of 8 unsigned 16bit integers
 *
 * \return vector holding the 8 saturated element-wise sums
 */
Rt_m128i
Rt_mm_adds_epu16(Rt_m128i a, Rt_m128i b) /* PADDUSW */
{
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 i;

    RWAPIFUNCTION(RWSTRING("Rt_mm_adds_epu16"));

    ao.m128i = a;
    bo.m128i = b;

    /*
     * Operands are read through the unsigned word view (uw), the view
     * the other unsigned 16bit routines in this file use.  Widening a
     * word with its top bit set through a signed view would
     * sign-extend it, so the sum handed to UnSignedSaturateWord would
     * no longer be the true unsigned sum.
     */
    for (i = 0; i < 8; i++)
    {
        ro.uw[i] = UnSignedSaturateWord(((unsigned int) ao.uw[i]) +
                                        ((unsigned int) bo.uw[i]));
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_avg_epu8 computes the average of the 16 unsigned
 * 8bit integers in a and the 16 unsigned 8bit integers in b and
 * rounds.  Each result is (a + b + 1) >> 1, i.e. halves round up.
 *
 * \param  a   first vector of 16 unsigned 8bit integers
 * \param  b   second vector of 16 unsigned 8bit integers
 *
 * \return vector holding the 16 rounded element-wise averages
 */
Rt_m128i
Rt_mm_avg_epu8(Rt_m128i a, Rt_m128i b) /* PAVGB */
{
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 i;

    RWAPIFUNCTION(RWSTRING("Rt_mm_avg_epu8"));

    ao.m128i = a;
    bo.m128i = b;

    /*
     * The sum is formed in unsigned int so it cannot wrap at 8 bits.
     * The +1 supplies the rounding PAVGB performs; a plain (a + b) / 2
     * would truncate instead.
     */
    for (i = 0; i < 16; i++)
    {
        ro.ub[i] =
            (((unsigned int) ao.ub[i]) +
             ((unsigned int) bo.ub[i]) + 1u) >> 1;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_avg_epu16 computes the average of the 8 unsigned
 * 16bit integers in a and the 8 unsigned 16bit integers in b and
 * rounds.  Each result is (a + b + 1) >> 1, i.e. halves round up.
 *
 * \param  a   first vector of 8 unsigned 16bit integers
 * \param  b   second vector of 8 unsigned 16bit integers
 *
 * \return vector holding the 8 rounded element-wise averages
 */
Rt_m128i
Rt_mm_avg_epu16(Rt_m128i a, Rt_m128i b) /* PAVGW */
{
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 i;

    RWAPIFUNCTION(RWSTRING("Rt_mm_avg_epu16"));

    ao.m128i = a;
    bo.m128i = b;

    /*
     * The sum is formed in unsigned int so it cannot wrap at 16 bits.
     * The +1 supplies the rounding PAVGW performs; a plain (a + b) / 2
     * would truncate instead.
     */
    for (i = 0; i < 8; i++)
    {
        ro.uw[i] =
            (((unsigned int) ao.uw[i]) +
             ((unsigned int) bo.uw[i]) + 1u) >> 1;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_madd_epi16 multiplies the 8 signed 16bit integers
 * from a by the 8 signed 16bit integers from b.  Adds the signed 32bit
 * integer results pairwise and packs the 4 signed 32bit integer
 * results.
 *
 * \param  a   first vector of 8 signed 16bit integers
 * \param  b   second vector of 8 signed 16bit integers
 *
 * \return vector of 4 32bit sums of adjacent products
 */
Rt_m128i
Rt_mm_madd_epi16(Rt_m128i a, Rt_m128i b) /* PMADDWD */
{
    volatile RpWNIOverlayM128i dst;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 pair;

    RWAPIFUNCTION(RWSTRING("Rt_mm_madd_epi16"));

    lhs.m128i = a;
    rhs.m128i = b;

    /* Output element n combines input words 2n and 2n + 1. */
    for (pair = 0; pair < 4; pair++)
    {
        const int           even = 2 * pair;
        const int           odd = even + 1;

        dst._d[pair] =
            (((int) lhs._w[even]) * ((int) rhs._w[even])) +
            (((int) lhs._w[odd]) * ((int) rhs._w[odd]));
    }

    RWRETURN(dst.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_max_epi16 computes the pairwise maxima of the 8
 * signed 16bit integers from a and the 8 signed 16bit integers from b.
 *
 * \param  a   first vector of 8 signed 16bit integers
 * \param  b   second vector of 8 signed 16bit integers
 *
 * \return vector holding the 8 element-wise maxima
 */
Rt_m128i
Rt_mm_max_epi16(Rt_m128i a, Rt_m128i b) /* PMAXSW */
{
    volatile RpWNIOverlayM128i best;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_max_epi16"));

    lhs.m128i = a;
    rhs.m128i = b;

    /* Keep the larger word of each pair, compared through the _w view. */
    for (lane = 0; lane < 8; lane++)
    {
        best._w[lane] = _rw_max(lhs._w[lane], rhs._w[lane]);
    }

    RWRETURN(best.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_max_epu8 computes the pairwise maxima of the 16
 * unsigned 8bit integers from a and the 16 unsigned 8bit integers from
 * b.
 *
 * \param  a   first vector of 16 unsigned 8bit integers
 * \param  b   second vector of 16 unsigned 8bit integers
 *
 * \return vector holding the 16 element-wise maxima
 */
Rt_m128i
Rt_mm_max_epu8(Rt_m128i a, Rt_m128i b) /* PMAXUB */
{
    volatile RpWNIOverlayM128i best;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_max_epu8"));

    lhs.m128i = a;
    rhs.m128i = b;

    /* Keep the larger byte of each pair, compared through the ub view. */
    for (lane = 0; lane < 16; lane++)
    {
        best.ub[lane] = _rw_max(lhs.ub[lane], rhs.ub[lane]);
    }

    RWRETURN(best.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_min_epi16 computes the pairwise minima of the 8
 * signed 16bit integers from a and the 8 signed 16bit integers from b.
 *
 * \param  a   first vector of 8 signed 16bit integers
 * \param  b   second vector of 8 signed 16bit integers
 *
 * \return vector holding the 8 element-wise minima
 */
Rt_m128i
Rt_mm_min_epi16(Rt_m128i a, Rt_m128i b) /* PMINSW */
{
    volatile RpWNIOverlayM128i least;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_min_epi16"));

    lhs.m128i = a;
    rhs.m128i = b;

    /* Keep the smaller word of each pair, compared through the _w view. */
    for (lane = 0; lane < 8; lane++)
    {
        least._w[lane] = _rw_min(lhs._w[lane], rhs._w[lane]);
    }

    RWRETURN(least.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_min_epu8 computes the pairwise minima of the 16
 * unsigned 8bit integers from a and the 16 unsigned 8bit integers from
 * b.
 *
 * \param  a   first vector of 16 unsigned 8bit integers
 * \param  b   second vector of 16 unsigned 8bit integers
 *
 * \return vector holding the 16 element-wise minima
 */
Rt_m128i
Rt_mm_min_epu8(Rt_m128i a, Rt_m128i b) /* PMINUB */
{
    volatile RpWNIOverlayM128i least;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_min_epu8"));

    lhs.m128i = a;
    rhs.m128i = b;

    /* Keep the smaller byte of each pair, compared through the ub view. */
    for (lane = 0; lane < 16; lane++)
    {
        least.ub[lane] = _rw_min(lhs.ub[lane], rhs.ub[lane]);
    }

    RWRETURN(least.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_mulhi_epi16 multiplies the 8 signed 16bit integers
 * from a by the 8 signed 16bit integers from b.  Packs the upper
 * 16bits of the 8 signed 32bit results.
 *
 * \param  a   first vector of 8 signed 16bit integers
 * \param  b   second vector of 8 signed 16bit integers
 *
 * \return vector holding the upper halves of the 8 products
 */
Rt_m128i
Rt_mm_mulhi_epi16(Rt_m128i a, Rt_m128i b) /* PMULHW */
{
    volatile RpWNIOverlayM128i high;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_mulhi_epi16"));

    lhs.m128i = a;
    rhs.m128i = b;

    for (lane = 0; lane < 8; lane++)
    {
        /* Full product in int, then shift the upper half down. */
        const int           product =
            ((int) lhs._w[lane]) * ((int) rhs._w[lane]);

        high._w[lane] = product >> 16;
    }

    RWRETURN(high.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_mulhi_epu16 multiplies the 8 unsigned 16bit
 * integers from a by the 8 unsigned 16bit integers from b.  Packs the
 * upper 16bits of the 8 unsigned 32bit results.
 *
 * \param  a   first vector of 8 unsigned 16bit integers
 * \param  b   second vector of 8 unsigned 16bit integers
 *
 * \return vector holding the upper halves of the 8 products
 */
Rt_m128i
Rt_mm_mulhi_epu16(Rt_m128i a, Rt_m128i b) /* PMULHUW */
{
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 i;

    RWAPIFUNCTION(RWSTRING("Rt_mm_mulhi_epu16"));

    ao.m128i = a;
    bo.m128i = b;

    for (i = 0; i < 8; i++)
    {
        /*
         * The product is formed in unsigned long (at least 32 bits):
         * 0xffff * 0xffff does not fit a signed 32bit int, so the
         * original (int) multiply overflowed and then right-shifted
         * a negative value.
         */
        const unsigned long product =
            ((unsigned long) ao.uw[i]) * ((unsigned long) bo.uw[i]);

        ro.uw[i] = product >> 16;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_mullo_epi16 multiplies the 8 signed or unsigned
 * 16bit integers from a by the 8 signed or unsigned 16bit integers
 * from b.  Packs the lower 16bits of the 8 signed or unsigned 32bit
 * results.
 *
 * \param  a   first vector of 8 16bit integers
 * \param  b   second vector of 8 16bit integers
 *
 * \return vector holding the lower halves of the 8 products
 */
Rt_m128i
Rt_mm_mullo_epi16(Rt_m128i a, Rt_m128i b) /* PMULLW */
{
    volatile RpWNIOverlayM128i low;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_mullo_epi16"));

    lhs.m128i = a;
    rhs.m128i = b;

    for (lane = 0; lane < 8; lane++)
    {
        /* Full product in int, then mask off everything above bit 15. */
        const int           product =
            ((int) lhs._w[lane]) * ((int) rhs._w[lane]);

        low._w[lane] = product & 0xffff;
    }

    RWRETURN(low.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_mul_su32 multiplies the lower 32bit integer from a
 * by the lower 32bit integer from b, and returns the 64bit integer
 * result.
 *
 * \param  a   first operand; only its lower 32bit element is used
 * \param  b   second operand; only its lower 32bit element is used
 *
 * \return result as described above
 */
Rt_m64
Rt_mm_mul_su32(Rt_m64 a, Rt_m64 b) /* PMULUDQ */
{
    volatile RwOverlayM64 product;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;

    RWAPIFUNCTION(RWSTRING("Rt_mm_mul_su32"));

    lhs.m64 = a;
    rhs.m64 = b;

    /* The widening multiply of the two low elements is done by _m64Mul. */
    product.m64 = _m64Mul(lhs._d[0], rhs._d[0]);

    RWRETURN(product.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_mul_epu32 is intended to multiply 2 unsigned 32bit
 * integers from a by 2 unsigned 32bit integers from b and pack the 2
 * unsigned 64bit integer results.
 *
 * This emulation does NOT implement the multiply: it reports itself
 * through INTEL_WNI_UNIMPLEMENTED and returns Rt_mm_set_sd(0).
 *
 * \param  a   first operand (copied into an overlay but otherwise unused)
 * \param  b   second operand (copied into an overlay but otherwise unused)
 *
 * \return the value of Rt_mm_set_sd(0); no product is computed
 */
Rt_m128d
Rt_mm_mul_epu32(Rt_m128i a, Rt_m128i b) /* PMULUDQ */
{

    /*
     * Intended behaviour (not yet implemented here):
     * multiplies 2 unsigned 32bit integers from a
     * by 2 unsigned 32bit integers from b and
     * packs the 2 unsigned 64bit integer results.
     *
     * NOTE(review): the declared return type is Rt_m128d although the
     * description speaks of packed 64bit integers -- confirm against
     * the header before implementing.
     */
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;

    RWAPIFUNCTION(RWSTRING("Rt_mm_mul_epu32"));

    /* Operands are loaded into overlays but never read afterwards. */
    ao.m128i = a;
    bo.m128i = b;

    /* Flag the call as hitting an unimplemented emulation routine. */
    INTEL_WNI_UNIMPLEMENTED("Rt_mm_mul_epu32");

    /* Placeholder result. */
    RWRETURN(Rt_mm_set_sd(0));
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sad_epu8 computes the absolute difference of the 16
 * unsigned 8bit integers from a and the 16 unsigned 8bit integers from
 * b.  Sums the upper 8 differences and lower 8 differences, and packs
 * the resulting 2 unsigned 16bit integers into the upper and lower
 * 64bit elements.
 *
 * \param  a   first vector of 16 unsigned 8bit integers
 * \param  b   second vector of 16 unsigned 8bit integers
 *
 * \return vector with the two sums in words 0 and 4, other words zero
 */
Rt_m128i
Rt_mm_sad_epu8(Rt_m128i a, Rt_m128i b) /* PSADBW */
{
    volatile RpWNIOverlayM128i sad;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lowerSum = 0;
    int                 upperSum = 0;
    int                 i;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sad_epu8"));

    lhs.m128i = a;
    rhs.m128i = b;

    /* Bytes 0..7 feed the sum stored in the lower 64bit element. */
    for (i = 0; i < 8; i++)
    {
        lowerSum += abs(lhs.ub[i] - rhs.ub[i]);
    }
    sad.uw[0] = lowerSum;
    sad.uw[1] = 0x0;
    sad.uw[2] = 0x0;
    sad.uw[3] = 0x0;

    /* Bytes 8..15 feed the sum stored in the upper 64bit element. */
    for (i = 8; i < 16; i++)
    {
        upperSum += abs(lhs.ub[i] - rhs.ub[i]);
    }
    sad.uw[4] = upperSum;
    sad.uw[5] = 0x0;
    sad.uw[6] = 0x0;
    sad.uw[7] = 0x0;

    RWRETURN(sad.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sub_epi8 subtracts the 16 signed or unsigned 8bit
 * integers of b from the 16 signed or unsigned 8bit integers of a.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_sub_epi8(Rt_m128i a, Rt_m128i b) /* PSUBB */
{
    /*
     * Lane-wise a - b over the 16 8bit lanes; each difference is
     * stored back into the matching 8bit lane of the result.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sub_epi8"));

    ao.m128i = a;
    bo.m128i = b;

    for (lane = 0; lane < 16; lane++)
    {
        ro._b[lane] = ao._b[lane] - bo._b[lane];
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sub_epi16 subtracts the 8 signed or unsigned 16bit
 * integers of b from the 8 signed or unsigned 16bit integers of a.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_sub_epi16(Rt_m128i a, Rt_m128i b) /* PSUBW */
{
    /*
     * Lane-wise a - b over the 8 16bit lanes; each difference is
     * stored back into the matching 16bit lane of the result.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sub_epi16"));

    ao.m128i = a;
    bo.m128i = b;

    for (lane = 0; lane < 8; lane++)
    {
        ro._w[lane] = ao._w[lane] - bo._w[lane];
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sub_epi32 subtracts the 4 signed or unsigned 32bit
 * integers of b from the 4 signed or unsigned 32bit integers of a.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_sub_epi32(Rt_m128i a, Rt_m128i b) /* PSUBD */
{
    /*
     * Subtracts the 4 signed or unsigned 32bit integers of b
     * from the 4 signed or unsigned 32bit integers of a.
     *
     * The arithmetic is done on the ud view so that a difference
     * that does not fit wraps around modulo 2^32 (as PSUBD does)
     * instead of overflowing a signed int, which is undefined
     * behaviour in C.
     * NOTE(review): assumes ud is the unsigned 32bit view of the
     * same lanes as _d, as its use in the logical and shift
     * operations of this file suggests - confirm against wni.h.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sub_epi32"));

    ao.m128i = a;
    bo.m128i = b;

    for (lane = 0; lane < 4; lane++)
    {
        ro.ud[lane] = ao.ud[lane] - bo.ud[lane];
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sub_si64 subtracts the signed or unsigned 64bit
 * integer b from the signed or unsigned 64bit integer a.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m64
Rt_mm_sub_si64(Rt_m64 a, Rt_m64 b) /* PSUBQ */
{
    /*
     * Single 64bit subtraction a - b; the arithmetic itself is
     * delegated to _m64Sub.
     */
    volatile RwOverlayM64 difference;
    volatile RwOverlayM64 minuend;
    volatile RwOverlayM64 subtrahend;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sub_si64"));

    minuend.m64 = a;
    subtrahend.m64 = b;

    difference.m64 = _m64Sub(minuend.m64, subtrahend.m64);

    RWRETURN(difference.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sub_epi64 subtracts the 2 signed or unsigned 64bit
 * integers in b from the 2 signed or unsigned 64bit integers in a.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_sub_epi64(Rt_m128i a, Rt_m128i b) /* PSUBQ */
{
    /*
     * Subtracts the 2 signed or unsigned 64bit integers in b
     * from the 2 signed or unsigned 64bit integers in a.
     *
     * Each 64bit half is handled by _m64Sub, the same helper
     * Rt_mm_sub_si64 uses.  (This routine previously called
     * _m64Add and therefore returned a + b.)
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sub_epi64"));

    ao.m128i = a;
    bo.m128i = b;

    ro.m64[0] = _m64Sub(ao.m64[0], bo.m64[0]);
    ro.m64[1] = _m64Sub(ao.m64[1], bo.m64[1]);

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_subs_epi8 subtracts the 16 signed 8bit integers of
 * b from the 16 signed 8bit integers of a and saturates.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_subs_epi8(Rt_m128i a, Rt_m128i b) /* PSUBSB */
{
    /*
     * Saturating lane-wise a - b over the 16 signed 8bit lanes.
     * Each difference is formed in int and then passed through
     * SignedSaturateByte before being stored.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_subs_epi8"));

    ao.m128i = a;
    bo.m128i = b;

    for (lane = 0; lane < 16; lane++)
    {
        ro._b[lane] =
            SignedSaturateByte(((int) ao._b[lane]) - ((int) bo._b[lane]));
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_subs_epi16 subtracts the 8 signed 16bit integers
 * of b from the 8 signed 16bit integers of a and saturates.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_subs_epi16(Rt_m128i a, Rt_m128i b) /* PSUBSW */
{
    /*
     * Saturating lane-wise a - b over the 8 signed 16bit lanes.
     * Each difference is formed in int and then passed through
     * SignedSaturateWord before being stored.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_subs_epi16"));

    ao.m128i = a;
    bo.m128i = b;

    for (lane = 0; lane < 8; lane++)
    {
        ro._w[lane] =
            SignedSaturateWord(((int) ao._w[lane]) - ((int) bo._w[lane]));
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_subs_epu8 subtracts the 16 unsigned 8bit integers
 * of b from the 16 unsigned 8bit integers of a and saturates.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_subs_epu8(Rt_m128i a, Rt_m128i b) /* PSUBUSB */
{
    /*
     * Subtracts the 16 unsigned 8bit integers of b
     * from the 16 unsigned 8bit integers of a and saturates.
     *
     * Unsigned saturation on subtraction means a result that would
     * be negative is clamped to 0; it can never exceed 0xff.
     *
     * The lanes are read through the unsigned ub view.  Reading the
     * _b view and subtracting in unsigned int (as this routine used
     * to) mishandles bytes >= 0x80 and turns a < b into a huge
     * positive value rather than something that clamps to 0.
     * NOTE(review): assumes _b is the signed and ub the unsigned
     * byte view, as their use elsewhere in this file suggests.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_subs_epu8"));

    ao.m128i = a;
    bo.m128i = b;

    for (lane = 0; lane < 16; lane++)
    {
        const unsigned int  minuend = ao.ub[lane];
        const unsigned int  subtrahend = bo.ub[lane];

        /* Clamp at zero; the difference is at most 0xff */
        ro.ub[lane] =
            (minuend > subtrahend) ? (minuend - subtrahend) : 0;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_subs_epu16 subtracts the 8 unsigned 16bit integers
 * of b from the 8 unsigned 16bit integers of a and saturates.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_subs_epu16(Rt_m128i a, Rt_m128i b) /* PSUBUSW */
{
    /*
     * Subtracts the 8 unsigned 16bit integers of b
     * from the 8 unsigned 16bit integers of a and saturates.
     *
     * Unsigned saturation on subtraction means a result that would
     * be negative is clamped to 0; it can never exceed 0xffff.
     *
     * The lanes are read through the unsigned uw view.  Reading the
     * _w view and subtracting in unsigned int (as this routine used
     * to) mishandles words >= 0x8000 and turns a < b into a huge
     * positive value rather than something that clamps to 0.
     * NOTE(review): assumes _w is the signed and uw the unsigned
     * word view, as their use elsewhere in this file suggests.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_subs_epu16"));

    ao.m128i = a;
    bo.m128i = b;

    for (lane = 0; lane < 8; lane++)
    {
        const unsigned int  minuend = ao.uw[lane];
        const unsigned int  subtrahend = bo.uw[lane];

        /* Clamp at zero; the difference is at most 0xffff */
        ro.uw[lane] =
            (minuend > subtrahend) ? (minuend - subtrahend) : 0;
    }

    RWRETURN(ro.m128i);
}

/* Logical Operations */

/**
 * \ingroup rtintel
 * \ref Rt_mm_and_si128 computes the bitwise AND of the 128bit
 * value in a and the 128bit value in b.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_and_si128(Rt_m128i a, Rt_m128i b) /* PAND */
{
    /*
     * 128bit bitwise AND of a and b, evaluated one 32bit lane
     * at a time.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_and_si128"));

    ao.m128i = a;
    bo.m128i = b;

    for (lane = 0; lane < 4; lane++)
    {
        ro.ud[lane] = ao.ud[lane] & bo.ud[lane];
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_andnot_si128 computes the bitwise AND of the
 * 128bit value in b and the bitwise NOT of the 128bit value in a.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_andnot_si128(Rt_m128i a, Rt_m128i b) /* PANDN */
{
    /*
     * 128bit (NOT a) AND b, evaluated one 32bit lane at a time.
     * Note that it is a, not b, that gets inverted.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_andnot_si128"));

    ao.m128i = a;
    bo.m128i = b;

    for (lane = 0; lane < 4; lane++)
    {
        ro.ud[lane] = (~ao.ud[lane]) & bo.ud[lane];
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_or_si128 computes the bitwise OR of the 128bit
 * value in a and the 128bit value in b.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_or_si128(Rt_m128i a, Rt_m128i b) /* POR */
{
    /*
     * 128bit bitwise OR of a and b, evaluated one 32bit lane
     * at a time.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_or_si128"));

    ao.m128i = a;
    bo.m128i = b;

    for (lane = 0; lane < 4; lane++)
    {
        ro.ud[lane] = ao.ud[lane] | bo.ud[lane];
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_xor_si128 computes the bitwise XOR of the 128bit
 * value in a and the 128bit value in b.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_xor_si128(Rt_m128i a, Rt_m128i b) /* PXOR */
{
    /*
     * 128bit bitwise XOR of a and b, evaluated one 32bit lane
     * at a time.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_xor_si128"));

    ao.m128i = a;
    bo.m128i = b;

    for (lane = 0; lane < 4; lane++)
    {
        ro.ud[lane] = ao.ud[lane] ^ bo.ud[lane];
    }

    RWRETURN(ro.m128i);
}

/* Shift Operations  */

/**
 * \ingroup rtintel
 * \ref Rt_mm_slli_si128 shifts the 128bit value in a left by
 * imm bytes while shifting in zeros.  imm must be an immediate.  *
 * \param  a   a
 * \param  imm   imm
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_slli_si128(Rt_m128i a, int __RWUNUSED__ imm) /* PSLLDQ */
{
    /*
     * Placeholder for PSLLDQ, which is meant to shift the 128bit
     * value in a left by imm bytes while shifting in zeros
     * (r = a << (imm * 8); imm must be an immediate).
     *
     * No emulation is carried out: a is only copied into a local
     * overlay, imm is ignored, INTEL_WNI_UNIMPLEMENTED is invoked
     * and Rt_mm_set_sd(0) is handed back.
     */
    volatile RpWNIOverlayM128i value;

    RWAPIFUNCTION(RWSTRING("Rt_mm_slli_si128"));

    value.m128i = a;

    INTEL_WNI_UNIMPLEMENTED("Rt_mm_slli_si128");

    RWRETURN(Rt_mm_set_sd(0));
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_slli_epi16 shifts the 8 signed or unsigned 16bit
 * integers in a left by count bits while shifting in zeros.  *
 * \param  a   a
 * \param  count   count
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_slli_epi16(Rt_m128i a, int count) /* PSLLW */
{
    /*
     * Shifts the 8 signed or unsigned 16bit integers in a
     * left by count bits while shifting in zeros.
     *
     * A count outside 0..15 clears every lane, matching PSLLW for
     * counts above 15.  It also keeps the C shift well defined:
     * shifting by a negative amount or by at least the width of
     * the promoted type is undefined behaviour.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    /* A negative count becomes a large unsigned value: out of range */
    const unsigned int  shift = (unsigned int) count;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_slli_epi16"));

    ao.m128i = a;

    for (lane = 0; lane < 8; lane++)
    {
        /* Shift in unsigned int; the store keeps the low 16 bits */
        ro.uw[lane] =
            (shift <= 15) ? (((unsigned int) ao.uw[lane]) << shift) : 0;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sll_epi16 shifts the 8 signed or unsigned 16bit
 * integers in a left by count bits while shifting in zeros.  *
 * \param  a   a
 * \param  count   count
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_sll_epi16(Rt_m128i a, Rt_m128i count) /* PSLLW */
{
    /*
     * Shifts the 8 signed or unsigned 16bit integers in a
     * left by count bits while shifting in zeros.
     *
     * The shift amount is taken from the first 32bit lane of count
     * (only that lane is examined, as before).  An amount above 15
     * clears every lane, matching PSLLW, and keeps the C shift from
     * reaching the width of the promoted type (undefined behaviour).
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i counto;
    unsigned int        shift;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sll_epi16"));

    ao.m128i = a;
    counto.m128i = count;

    shift = counto.ud[0];

    for (lane = 0; lane < 8; lane++)
    {
        /* Shift in unsigned int; the store keeps the low 16 bits */
        ro.uw[lane] =
            (shift <= 15) ? (((unsigned int) ao.uw[lane]) << shift) : 0;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_slli_epi32 shifts the 4 signed or unsigned 32bit
 * integers in a left by count bits while shifting in zeros.  *
 * \param  a   a
 * \param  count   count
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_slli_epi32(Rt_m128i a, int count) /* PSLLD */
{
    /*
     * Shifts the 4 signed or unsigned 32bit integers in a
     * left by count bits while shifting in zeros.
     *
     * A count outside 0..31 clears every lane, matching PSLLD for
     * counts above 31, and avoids a C shift by a negative amount
     * or by >= 32 bits (undefined behaviour).
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    /* A negative count becomes a large unsigned value: out of range */
    const unsigned int  shift = (unsigned int) count;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_slli_epi32"));

    ao.m128i = a;

    for (lane = 0; lane < 4; lane++)
    {
        ro.ud[lane] = (shift <= 31) ? (ao.ud[lane] << shift) : 0;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sll_epi32 shifts the 4 signed or unsigned 32bit
 * integers in a left by count bits while shifting in zeros.  *
 * \param  a   a
 * \param  count   count
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_sll_epi32(Rt_m128i a, Rt_m128i count) /* PSLLD */
{
    /*
     * Shifts the 4 signed or unsigned 32bit integers in a
     * left by count bits while shifting in zeros.
     *
     * The shift amount is taken from the first 32bit lane of count
     * (only that lane is examined, as before).  An amount above 31
     * clears every lane, matching PSLLD, and avoids a C shift by
     * >= 32 bits (undefined behaviour).
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i counto;
    unsigned int        shift;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sll_epi32"));

    ao.m128i = a;
    counto.m128i = count;

    shift = counto.ud[0];

    for (lane = 0; lane < 4; lane++)
    {
        ro.ud[lane] = (shift <= 31) ? (ao.ud[lane] << shift) : 0;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_slli_epi64 shifts the 2 signed or unsigned 64bit
 * integers in a left by count bits while shifting in zeros.  *
 * \param  a   a
 * \param  count   count
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_slli_epi64(Rt_m128i a, int __RWUNUSED__ count) /* PSLLQ */
{
    /*
     * Placeholder for PSLLQ, which is meant to shift the 2 signed
     * or unsigned 64bit integers in a left by count bits while
     * shifting in zeros.
     *
     * No emulation is carried out: a is only copied into a local
     * overlay, count is ignored, INTEL_WNI_UNIMPLEMENTED is invoked
     * and Rt_mm_set_sd(0) is handed back.
     */
    volatile RpWNIOverlayM128i value;

    RWAPIFUNCTION(RWSTRING("Rt_mm_slli_epi64"));

    value.m128i = a;

    INTEL_WNI_UNIMPLEMENTED("Rt_mm_slli_epi64");

    RWRETURN(Rt_mm_set_sd(0));
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sll_epi64 shifts the 2 signed or unsigned 64bit
 * integers in a left by count bits while shifting in zeros.  *
 * \param  a   a
 * \param  count   count
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_sll_epi64(Rt_m128i a, Rt_m128i count) /* PSLLQ */
{
    /*
     * Placeholder for PSLLQ, which is meant to shift the 2 signed
     * or unsigned 64bit integers in a left by count bits while
     * shifting in zeros.
     *
     * No emulation is carried out: the operands are only copied
     * into local overlays, INTEL_WNI_UNIMPLEMENTED is invoked and
     * Rt_mm_set_sd(0) is handed back.
     */
    volatile RpWNIOverlayM128i value;
    volatile RpWNIOverlayM128i amount;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sll_epi64"));

    value.m128i = a;
    amount.m128i = count;

    INTEL_WNI_UNIMPLEMENTED("Rt_mm_sll_epi64");

    RWRETURN(Rt_mm_set_sd(0));
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_srai_epi16 shifts the 8 signed 16bit integers in a
 * right by count bits while shifting in the sign bit.  *
 * \param  a   a
 * \param  count   count
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_srai_epi16(Rt_m128i a, int count) /* PSRAW */
{
    /*
     * Shifts the 8 signed 16bit integers in a
     * right by count bits while shifting in the sign bit.
     *
     * A count outside 0..15 is clamped to 15, which fills each lane
     * with its sign bit as PSRAW does for counts above 15.  The
     * clamp also avoids a C shift by a negative amount or by at
     * least the width of the promoted type (undefined behaviour).
     * As before, this relies on >> of a negative value being an
     * arithmetic shift.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    /* A negative count becomes a large unsigned value: clamped too */
    const int           shift = (((unsigned int) count) > 15U) ? 15 : count;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_srai_epi16"));

    ao.m128i = a;

    for (lane = 0; lane < 8; lane++)
    {
        ro._w[lane] = ao._w[lane] >> shift;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sra_epi16 shifts the 8 signed 16bit integers in a
 * right by count bits while shifting in the sign bit.  *
 * \param  a   a
 * \param  count   count
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_sra_epi16(Rt_m128i a, Rt_m128i count) /* PSRAW */
{
    /*
     * Shifts the 8 signed 16bit integers in a
     * right by count bits while shifting in the sign bit.
     *
     * The shift amount is taken from the first 32bit lane of count
     * (only that lane is examined, as before).  An amount above 15
     * is clamped to 15, which fills each lane with its sign bit as
     * PSRAW does, and avoids a C shift by at least the width of the
     * promoted type (undefined behaviour).  As before, this relies
     * on >> of a negative value being an arithmetic shift.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i counto;
    unsigned int        shift;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sra_epi16"));

    ao.m128i = a;
    counto.m128i = count;

    shift = counto.ud[0];
    if (shift > 15)
    {
        shift = 15;
    }

    for (lane = 0; lane < 8; lane++)
    {
        ro._w[lane] = ao._w[lane] >> shift;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_srai_epi32 shifts the 4 signed 32bit integers in a
 * right by count bits while shifting in the sign bit.  *
 * \param  a   a
 * \param  count   count
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_srai_epi32(Rt_m128i a, int count) /* PSRAD */
{
    /*
     * Shifts the 4 signed 32bit integers in a
     * right by count bits while shifting in the sign bit.
     *
     * A count outside 0..31 is clamped to 31, which fills each lane
     * with its sign bit as PSRAD does for counts above 31.  The
     * clamp also avoids a C shift by a negative amount or by >= 32
     * bits (undefined behaviour).  As before, this relies on >> of
     * a negative value being an arithmetic shift.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    /* A negative count becomes a large unsigned value: clamped too */
    const int           shift = (((unsigned int) count) > 31U) ? 31 : count;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_srai_epi32"));

    ao.m128i = a;

    for (lane = 0; lane < 4; lane++)
    {
        ro._d[lane] = ao._d[lane] >> shift;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sra_epi32 shifts the 4 signed 32bit integers in a
 * right by count bits while shifting in the sign bit.  *
 * \param  a   a
 * \param  count   count
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_sra_epi32(Rt_m128i a, Rt_m128i count) /* PSRAD */
{
    /*
     * Shifts the 4 signed 32bit integers in a
     * right by count bits while shifting in the sign bit.
     *
     * The shift amount is taken from the first 32bit lane of count
     * (only that lane is examined, as before).  An amount above 31
     * is clamped to 31, which fills each lane with its sign bit as
     * PSRAD does, and avoids a C shift by >= 32 bits (undefined
     * behaviour).  As before, this relies on >> of a negative value
     * being an arithmetic shift.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i counto;
    unsigned int        shift;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sra_epi32"));

    ao.m128i = a;
    counto.m128i = count;

    shift = counto.ud[0];
    if (shift > 31)
    {
        shift = 31;
    }

    for (lane = 0; lane < 4; lane++)
    {
        ro._d[lane] = ao._d[lane] >> shift;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_srli_si128 shifts the 128bit value in a right by
 * imm bytes while shifting in zeros.  imm must be an immediate.  r =
 * srl(a, imm * 8); *
 * \param  a   a
 * \param  imm   imm
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_srli_si128(Rt_m128i a, int __RWUNUSED__ imm) /* PSRLDQ */
{
    /*
     * Placeholder for PSRLDQ, which is meant to shift the 128bit
     * value in a right by imm bytes while shifting in zeros
     * (r = srl(a, imm * 8); imm must be an immediate).
     *
     * No emulation is carried out: a is only copied into a local
     * overlay, imm is ignored, INTEL_WNI_UNIMPLEMENTED is invoked
     * and Rt_mm_set_sd(0) is handed back.
     */
    volatile RpWNIOverlayM128i value;

    RWAPIFUNCTION(RWSTRING("Rt_mm_srli_si128"));

    value.m128i = a;

    INTEL_WNI_UNIMPLEMENTED("Rt_mm_srli_si128");

    RWRETURN(Rt_mm_set_sd(0));
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_srli_epi16 shifts the 8 signed or unsigned 16bit
 * integers in a right by count bits while shifting in zeros.  *
 * \param  a   a
 * \param  count   count
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_srli_epi16(Rt_m128i a, int count) /* PSRLW */
{
    /*
     * Shifts the 8 signed or unsigned 16bit integers in a
     * right by count bits while shifting in zeros.
     *
     * A count outside 0..15 clears every lane, matching PSRLW for
     * counts above 15, and avoids a C shift by a negative amount
     * or by at least the width of the promoted type (undefined
     * behaviour).
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    /* A negative count becomes a large unsigned value: out of range */
    const unsigned int  shift = (unsigned int) count;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_srli_epi16"));

    ao.m128i = a;

    for (lane = 0; lane < 8; lane++)
    {
        ro.uw[lane] = (shift <= 15) ? (ao.uw[lane] >> shift) : 0;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_srl_epi16 shifts the 8 signed or unsigned 16bit
 * integers in a right by count bits while shifting in zeros.  *
 * \param  a   a
 * \param  count   count
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_srl_epi16(Rt_m128i a, Rt_m128i count) /* PSRLW */
{
    /*
     * Shifts the 8 signed or unsigned 16bit integers in a
     * right by count bits while shifting in zeros.
     *
     * The shift amount is taken from the first 32bit lane of count
     * (only that lane is examined, as before).  An amount above 15
     * clears every lane, matching PSRLW, and avoids a C shift by at
     * least the width of the promoted type (undefined behaviour).
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i counto;
    unsigned int        shift;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_srl_epi16"));

    ao.m128i = a;
    counto.m128i = count;

    shift = counto.ud[0];

    for (lane = 0; lane < 8; lane++)
    {
        ro.uw[lane] = (shift <= 15) ? (ao.uw[lane] >> shift) : 0;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_srli_epi32 shifts the 4 signed or unsigned 32bit
 * integers in a right by count bits while shifting in zeros.  *
 * \param  a   a
 * \param  count   count
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_srli_epi32(Rt_m128i a, int count) /* PSRLD */
{
    /*
     * Shifts the 4 signed or unsigned 32bit integers in a
     * right by count bits while shifting in zeros.
     *
     * A count outside 0..31 clears every lane, matching PSRLD for
     * counts above 31, and avoids a C shift by a negative amount
     * or by >= 32 bits (undefined behaviour).
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    /* A negative count becomes a large unsigned value: out of range */
    const unsigned int  shift = (unsigned int) count;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_srli_epi32"));

    ao.m128i = a;

    for (lane = 0; lane < 4; lane++)
    {
        ro.ud[lane] = (shift <= 31) ? (ao.ud[lane] >> shift) : 0;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_srl_epi32 shifts the 4 signed or unsigned 32bit
 * integers in a right by count bits while shifting in zeros.  *
 * \param  a   a
 * \param  count   count
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_srl_epi32(Rt_m128i a, Rt_m128i count) /* PSRLD */
{
    /*
     * Shifts the 4 signed or unsigned 32bit integers in a
     * right by count bits while shifting in zeros.
     *
     * The shift amount is taken from the first 32bit lane of count
     * (only that lane is examined, as before).  An amount above 31
     * clears every lane, matching PSRLD, and avoids a C shift by
     * >= 32 bits (undefined behaviour).
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i counto;
    unsigned int        shift;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_srl_epi32"));

    ao.m128i = a;
    counto.m128i = count;

    shift = counto.ud[0];

    for (lane = 0; lane < 4; lane++)
    {
        ro.ud[lane] = (shift <= 31) ? (ao.ud[lane] >> shift) : 0;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_srli_epi64 shifts the 2 signed or unsigned 64bit
 * integers in a right by count bits while shifting in zeros.  *
 * \param  a   a
 * \param  count   count
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_srli_epi64(Rt_m128i a, int __RWUNUSED__ count) /* PSRLQ */
{
    /*
     * Placeholder for PSRLQ, which is meant to shift the 2 signed
     * or unsigned 64bit integers in a right by count bits while
     * shifting in zeros.
     *
     * No emulation is carried out: a is only copied into a local
     * overlay, count is ignored, INTEL_WNI_UNIMPLEMENTED is invoked
     * and Rt_mm_set_sd(0) is handed back.
     */
    volatile RpWNIOverlayM128i value;

    RWAPIFUNCTION(RWSTRING("Rt_mm_srli_epi64"));

    value.m128i = a;

    INTEL_WNI_UNIMPLEMENTED("Rt_mm_srli_epi64");

    RWRETURN(Rt_mm_set_sd(0));
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_srl_epi64 shifts the 2 signed or unsigned 64bit
 * integers in a right by count bits while shifting in zeros.  *
 * \param  a   a
 * \param  count   count
 *
 * \return result as described above
 */
Rt_m128d
Rt_mm_srl_epi64(Rt_m128i a, Rt_m128i count) /* PSRLQ */
{
    /*
     * Placeholder for PSRLQ, which is meant to shift the 2 signed
     * or unsigned 64bit integers in a right by count bits while
     * shifting in zeros.
     *
     * No emulation is carried out: the operands are only copied
     * into local overlays, INTEL_WNI_UNIMPLEMENTED is invoked and
     * Rt_mm_set_sd(0) is handed back.
     */
    volatile RpWNIOverlayM128i value;
    volatile RpWNIOverlayM128i amount;

    RWAPIFUNCTION(RWSTRING("Rt_mm_srl_epi64"));

    value.m128i = a;
    amount.m128i = count;

    INTEL_WNI_UNIMPLEMENTED("Rt_mm_srl_epi64");

    RWRETURN(Rt_mm_set_sd(0));
}

/* Comparisons  */

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpeq_epi8 compares the 16 signed or unsigned 8bit
 * integers in a and the 16 signed or unsigned 8bit integers in b for
 * equality.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_cmpeq_epi8(Rt_m128i a, Rt_m128i b) /* PCMPEQB */
{
    /*
     * Compares the 16 signed or unsigned 8bit integers in a
     * and the 16 signed or unsigned 8bit integers in b for equality.
     *
     * Each result lane is set to 0xff when the matching lanes of
     * a and b are equal and to 0x0 otherwise.  Every lane is
     * compared against the lane with the same index (lane 9 used
     * to be compared against lane 8 of b).
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpeq_epi8"));

    ao.m128i = a;
    bo.m128i = b;

    for (lane = 0; lane < 16; lane++)
    {
        ro._b[lane] = (ao._b[lane] == bo._b[lane]) ? 0xff : 0x0;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpeq_epi16 compares the 8 signed or unsigned
 * 16bit integers in a and the 8 signed or unsigned 16bit integers in b
 * for equality.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_cmpeq_epi16(Rt_m128i a, Rt_m128i b) /* PCMPEQW */
{
    /*
     * Lane-wise equality test over the 8 16bit lanes: a result
     * lane becomes 0xffff where a and b match, 0x0 where they differ.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpeq_epi16"));

    ao.m128i = a;
    bo.m128i = b;

    for (lane = 0; lane < 8; lane++)
    {
        ro._w[lane] = (ao._w[lane] == bo._w[lane]) ? 0xffff : 0x0;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpeq_epi32 compares the 4 signed or unsigned
 * 32bit integers in a and the 4 signed or unsigned 32bit integers in b
 * for equality.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_cmpeq_epi32(Rt_m128i a, Rt_m128i b) /* PCMPEQD */
{
    /*
     * Element-wise equality test over the four 32bit elements of
     * a and b.  An element of the result is 0xffffffff where the two
     * operands match and 0x0 where they differ.
     */
    volatile RpWNIOverlayM128i res;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpeq_epi32"));

    lhs.m128i = a;
    rhs.m128i = b;

    for (lane = 0; lane < 4; lane++)
    {
        if (lhs._d[lane] == rhs._d[lane])
        {
            res._d[lane] = 0xffffffff;
        }
        else
        {
            res._d[lane] = 0x0;
        }
    }

    RWRETURN(res.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpgt_epi8 compares the 16 signed 8bit integers in
 * a and the 16 signed 8bit integers in b for greater than.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_cmpgt_epi8(Rt_m128i a, Rt_m128i b) /* PCMPGTB */
{

    /*
     * Compares the 16 signed 8bit integers in a
     * and the 16 signed 8bit integers in b for greater than.
     *
     * Byte n of the result is 0xff when byte n of a is greater than
     * byte n of b, and 0x0 otherwise.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpgt_epi8"));

    ao.m128i = a;
    bo.m128i = b;

    /* Every lane of a is paired with the same lane of b */
    for (lane = 0; lane < 16; lane++)
    {
        ro._b[lane] = (ao._b[lane] > bo._b[lane]) ? 0xff : 0x0;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpgt_epi16 compares the 8 signed 16bit integers
 * in a and the 8 signed 16bit integers in b for greater than.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_cmpgt_epi16(Rt_m128i a, Rt_m128i b) /* PCMPGTW */
{
    /*
     * Element-wise "greater than" test over the eight 16bit elements
     * of a and b, read through the _w view of the overlay.
     * An element of the result is 0xffff where a exceeds b
     * and 0x0 otherwise.
     */
    volatile RpWNIOverlayM128i res;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpgt_epi16"));

    lhs.m128i = a;
    rhs.m128i = b;

    for (lane = 0; lane < 8; lane++)
    {
        if (lhs._w[lane] > rhs._w[lane])
        {
            res._w[lane] = 0xffff;
        }
        else
        {
            res._w[lane] = 0x0;
        }
    }

    RWRETURN(res.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpgt_epi32 compares the 4 signed 32bit integers
 * in a and the 4 signed 32bit integers in b for greater than.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_cmpgt_epi32(Rt_m128i a, Rt_m128i b) /* PCMPGTD */
{
    /*
     * Element-wise "greater than" test over the four 32bit elements
     * of a and b, read through the _d view of the overlay.
     * An element of the result is 0xffffffff where a exceeds b
     * and 0x0 otherwise.
     */
    volatile RpWNIOverlayM128i res;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpgt_epi32"));

    lhs.m128i = a;
    rhs.m128i = b;

    for (lane = 0; lane < 4; lane++)
    {
        if (lhs._d[lane] > rhs._d[lane])
        {
            res._d[lane] = 0xffffffff;
        }
        else
        {
            res._d[lane] = 0x0;
        }
    }

    RWRETURN(res.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmplt_epi8 compares the 16 signed 8bit integers in
 * a and the 16 signed 8bit integers in b for less than.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_cmplt_epi8(Rt_m128i a, Rt_m128i b) /* PCMPGTB r */
{

    /*
     * Compares the 16 signed 8bit integers in a
     * and the 16 signed 8bit integers in b for less than.
     *
     * Byte n of the result is 0xff when byte n of a is less than
     * byte n of b, and 0x0 otherwise.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmplt_epi8"));

    ao.m128i = a;
    bo.m128i = b;

    /* Every lane of a is paired with the same lane of b */
    for (lane = 0; lane < 16; lane++)
    {
        ro._b[lane] = (ao._b[lane] < bo._b[lane]) ? 0xff : 0x0;
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmplt_epi16 compares the 8 signed 16bit integers
 * in a and the 8 signed 16bit integers in b for less than.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_cmplt_epi16(Rt_m128i a, Rt_m128i b) /* PCMPGTW r */
{
    /*
     * Element-wise "less than" test over the eight 16bit elements
     * of a and b, read through the _w view of the overlay.
     * An element of the result is 0xffff where a is below b
     * and 0x0 otherwise.
     */
    volatile RpWNIOverlayM128i res;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmplt_epi16"));

    lhs.m128i = a;
    rhs.m128i = b;

    for (lane = 0; lane < 8; lane++)
    {
        if (lhs._w[lane] < rhs._w[lane])
        {
            res._w[lane] = 0xffff;
        }
        else
        {
            res._w[lane] = 0x0;
        }
    }

    RWRETURN(res.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmplt_epi32 compares the 4 signed 32bit integers
 * in a and the 4 signed 32bit integers in b for less than.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_cmplt_epi32(Rt_m128i a, Rt_m128i b) /* PCMPGTD r */
{
    /*
     * Element-wise "less than" test over the four 32bit elements
     * of a and b, read through the _d view of the overlay.
     * An element of the result is 0xffffffff where a is below b
     * and 0x0 otherwise.
     */
    volatile RpWNIOverlayM128i res;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmplt_epi32"));

    lhs.m128i = a;
    rhs.m128i = b;

    for (lane = 0; lane < 4; lane++)
    {
        if (lhs._d[lane] < rhs._d[lane])
        {
            res._d[lane] = 0xffffffff;
        }
        else
        {
            res._d[lane] = 0x0;
        }
    }

    RWRETURN(res.m128i);
}

/* Conversions */

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvtsi32_si128 moves 32bit integer a to the least
 * significant 32 bits of an Rt_m128i object, zero extending the upper bits.
 *
 * \param  a   a
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_cvtsi32_si128(int a)      /*  MOVD */
{
    /*
     * Builds a 128bit value whose 32bit element 0 is a;
     * the remaining three 32bit elements are cleared to zero.
     */
    volatile RpWNIOverlayM128i res;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvtsi32_si128"));

    res._d[0] = a;

    for (lane = 1; lane < 4; lane++)
    {
        res._d[lane] = 0x0;
    }

    RWRETURN(res.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvtsi128_si32 moves the least significant 32 bits
 * of a to a 32 bit integer.  *
 * \param  a   a
 *
 * \return result as described above
 */
int
Rt_mm_cvtsi128_si32(Rt_m128i a) /* MOVD */
{
    /*
     * Hands back element 0 of the 32bit view of a as a plain int;
     * the other three elements are ignored.
     */
    volatile RpWNIOverlayM128i src;
    int                 low;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvtsi128_si32"));

    src.m128i = a;
    low = src._d[0];

    RWRETURN(low);
}

/* Miscellaneous Operations  */

/**
 * \ingroup rtintel
 * \ref Rt_mm_movepi64_pi64 returns the lower 64 bits of a as an
 * Rt_m64 type.  *
 * \param  a   a
 *
 * \return result as described above
 */
Rt_m64
Rt_mm_movepi64_pi64(Rt_m128i a) /* MOVDQ2Q */
{
    /*
     * Views a as two 64bit halves and yields half 0 (the lower
     * 64 bits); the upper half is discarded.
     */
    volatile RpWNIOverlayM128i src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_movepi64_pi64"));

    src.m128i = a;

    RWRETURN(src.m64[0]);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_movpi64_epi64 moves the 64 bits of a to the lower
 * 64 bits of the result, zeroing the upper bits.  *
 * \param  a   a
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_movpi64_epi64(Rt_m64 a)   /* MOVQ2DQ */
{
    /*
     * Widens a to 128 bits: a becomes 64bit half 0 of the result
     * and the upper half is cleared through its two 32bit elements.
     */
    volatile RpWNIOverlayM128i res;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_movpi64_epi64"));

    res.m64[0] = a;

    /* ud[2] and ud[3] overlay the upper 64 bits */
    for (lane = 2; lane < 4; lane++)
    {
        res.ud[lane] = 0x0;
    }

    RWRETURN(res.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_move_epi64 moves the lower 64 bits of a to the
 * lower 64 bits of the result, zeroing the upper bits.  *
 * \param  a   a
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_move_epi64(Rt_m128i a)    /* MOVQ */
{
    /*
     * Keeps 64bit half 0 of a and clears the upper half of the
     * result through its two 32bit elements.
     */
    volatile RpWNIOverlayM128i res;
    volatile RpWNIOverlayM128i src;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_move_epi64"));

    src.m128i = a;

    res.m64[0] = src.m64[0];

    /* ud[2] and ud[3] overlay the upper 64 bits */
    for (lane = 2; lane < 4; lane++)
    {
        res.ud[lane] = 0x0;
    }

    RWRETURN(res.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_packs_epi16 packs the 16 signed 16bit integers
 * from a and b into 8bit integers and saturates.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_packs_epi16(Rt_m128i a, Rt_m128i b) /* PACKSSWB */
{
    /*
     * Narrows sixteen 16bit elements to sixteen 8bit elements.
     * Each element is passed through SignedSaturateByte; the eight
     * elements of a fill result bytes 0..7 and the eight elements
     * of b fill result bytes 8..15.
     */
    volatile RpWNIOverlayM128i res;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_packs_epi16"));

    lhs.m128i = a;
    rhs.m128i = b;

    /* Lower half of the result comes from a */
    for (lane = 0; lane < 8; lane++)
    {
        res._b[lane] = SignedSaturateByte(lhs._w[lane]);
    }

    /* Upper half of the result comes from b */
    for (lane = 0; lane < 8; lane++)
    {
        res._b[8 + lane] = SignedSaturateByte(rhs._w[lane]);
    }

    RWRETURN(res.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_packs_epi32 packs the 8 signed 32bit integers from
 * a and b into signed 16bit integers and saturates.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_packs_epi32(Rt_m128i a, Rt_m128i b) /* PACKSSDW */
{
    /*
     * Narrows eight 32bit elements to eight 16bit elements.
     * Each element is passed through SignedSaturateWord; the four
     * elements of a fill result words 0..3 and the four elements
     * of b fill result words 4..7.
     */
    volatile RpWNIOverlayM128i res;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_packs_epi32"));

    lhs.m128i = a;
    rhs.m128i = b;

    /* Lower half of the result comes from a */
    for (lane = 0; lane < 4; lane++)
    {
        res._w[lane] = SignedSaturateWord(lhs._d[lane]);
    }

    /* Upper half of the result comes from b */
    for (lane = 0; lane < 4; lane++)
    {
        res._w[4 + lane] = SignedSaturateWord(rhs._d[lane]);
    }

    RWRETURN(res.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_packus_epi16 packs the 16 signed 16bit integers
 * from a and b into 8bit unsigned integers and saturates.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_packus_epi16(Rt_m128i a, Rt_m128i b) /* PACKUSWB */
{
    /*
     * Narrows sixteen 16bit elements to sixteen 8bit elements.
     * Each element is passed through UnSignedSaturateByte; the eight
     * elements of a fill result bytes 0..7 and the eight elements
     * of b fill result bytes 8..15.
     *
     * NOTE(review): the inputs are read through the uw view of the
     * overlay although the function is documented as packing signed
     * 16bit integers; whether negative inputs end up clamped to 0
     * depends on UnSignedSaturateByte - confirm.
     */
    volatile RpWNIOverlayM128i res;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_packus_epi16"));

    lhs.m128i = a;
    rhs.m128i = b;

    /* Lower half of the result comes from a */
    for (lane = 0; lane < 8; lane++)
    {
        res.ub[lane] = UnSignedSaturateByte(lhs.uw[lane]);
    }

    /* Upper half of the result comes from b */
    for (lane = 0; lane < 8; lane++)
    {
        res.ub[8 + lane] = UnSignedSaturateByte(rhs.uw[lane]);
    }

    RWRETURN(res.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_extract_epi16 extracts the selected signed or
 * unsigned 16bit integer from a and zero extends.  The selector imm
 * must be an immediate.  *
 * \param  a   a
 * \param  imm   imm
 *
 * \return result as described above
 */
int
Rt_mm_extract_epi16(Rt_m128i a, int imm) /* PEXTRW */
{
    /*
     * Picks 16bit element imm out of a, read through the uw view of
     * the overlay, and returns it widened to int.
     * A selector outside 0..7 yields 0.
     */
    volatile RpWNIOverlayM128i src;
    int                 picked = 0;

    RWAPIFUNCTION(RWSTRING("Rt_mm_extract_epi16"));

    src.m128i = a;

    if ((imm >= 0) && (imm <= 7))
    {
        picked = src.uw[imm];
    }

    RWRETURN(picked);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_insert_epi16 inserts the least significant 16 bits
 * of b into the selected 16bit integer of a.  The selector imm must be
 * an immediate.  *
 * \param  a   a
 * \param  b   b
 * \param  imm   imm
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_insert_epi16(Rt_m128i a, int b, int imm) /* PINSRW */
{

    /*
     * Inserts the least significant 16 bits of b
     * into the selected 16bit integer of a.
     * The selector imm must be an immediate.
     *
     * The result is a copy of a in which 16bit element imm has been
     * replaced by b.  A selector outside 0..7 matches no element,
     * so the result is then an unmodified copy of a.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_insert_epi16"));

    ao.m128i = a;

    /* Each lane is tested against its own index */
    for (lane = 0; lane < 8; lane++)
    {
        ro._w[lane] = (imm == lane) ? b : ao._w[lane];
    }

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_movemask_epi8 creates a 16bit mask from the most
 * significant bits of the 16 signed or unsigned 8bit integers in a and
 * zero extends the upper bits.  *
 * \param  a   a
 *
 * \return result as described above
 */
int
Rt_mm_movemask_epi8(Rt_m128i a) /* PMOVMSKB */
{

    /*
     * Creates a 16bit mask from the most significant bits of
     * the 16 signed or unsigned 8bit integers in a
     * and zero extends the upper bits.
     *
     * Bit n of the result is bit 7 of byte n of a, for n in 0..15.
     */
    volatile RpWNIOverlayM128i ao;
    int                 mask = 0;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_movemask_epi8"));

    ao.m128i = a;

    for (lane = 0; lane < 16; lane++)
    {
        /*
         * Assumes the ub elements are unsigned 8bit values, so the
         * right shift leaves only 0 or 1 to place at bit 'lane'.
         */
        mask |= ((int) (ao.ub[lane] >> 7) << lane);
    }

    RWRETURN(mask);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_shuffle_epi32 is intended to shuffle the 4 signed or
 * unsigned 32bit integers in a as specified by imm.  The shuffle value,
 * imm, must be an immediate.
 *
 * \note This software emulation is not implemented: imm is ignored
 * and the function always returns the result of Rt_mm_set_sd(0).
 *
 * \param  a   a
 * \param  imm   imm
 *
 * \return the value produced by Rt_mm_set_sd(0)
 */
Rt_m128d
Rt_mm_shuffle_epi32(Rt_m128i a, int __RWUNUSED__ imm) /* PSHUFD */
{

    /*
     * Shuffles the 4 signed or unsigned 32bit integers in a
     * as specified by imm.
     * The shuffle value, imm, must be an immediate.
     *
     * Placeholder only: no shuffle is performed.
     */
    volatile RpWNIOverlayM128i ao;

    RWAPIFUNCTION(RWSTRING("Rt_mm_shuffle_epi32"));

    /* a is copied into the overlay but never read back */
    ao.m128i = a;

    /* Reports the missing implementation in builds where the macro is not a null op */
    INTEL_WNI_UNIMPLEMENTED("Rt_mm_shuffle_epi32");

    RWRETURN(Rt_mm_set_sd(0));
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_shufflehi_epi16 is intended to shuffle the upper 4 signed
 * or unsigned 16bit integers in a as specified by imm.  The shuffle
 * value, imm, must be an immediate.
 *
 * \note This software emulation is not implemented: imm is ignored
 * and the function always returns the result of Rt_mm_set_sd(0).
 *
 * \param  a   a
 * \param  imm   imm
 *
 * \return the value produced by Rt_mm_set_sd(0)
 */
Rt_m128d
Rt_mm_shufflehi_epi16(Rt_m128i a, int __RWUNUSED__ imm) /* PSHUFHW */
{
    /*
     * Shuffles the upper 4 signed or unsigned 16  bit integers in a
     * as specified by imm. The shuffle value, imm,
     * must be an immediate.
     *
     * Placeholder only: no shuffle is performed.
     */
    volatile RpWNIOverlayM128i ao;

    RWAPIFUNCTION(RWSTRING("Rt_mm_shufflehi_epi16"));

    /* a is copied into the overlay but never read back */
    ao.m128i = a;

    /* Reports the missing implementation in builds where the macro is not a null op */
    INTEL_WNI_UNIMPLEMENTED("Rt_mm_shufflehi_epi16");

    RWRETURN(Rt_mm_set_sd(0));
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_shufflelo_epi16 is intended to shuffle the lower 4 signed
 * or unsigned 16bit integers in a as specified by imm.  The shuffle
 * value, imm, must be an immediate.
 *
 * \note This software emulation is not implemented: imm is ignored
 * and the function always returns the result of Rt_mm_set_sd(0).
 *
 * \param  a   a
 * \param  imm   imm
 *
 * \return the value produced by Rt_mm_set_sd(0)
 */
Rt_m128d
Rt_mm_shufflelo_epi16(Rt_m128i a, int __RWUNUSED__ imm) /* PSHUFLW */
{
    /*
     * Shuffles the lower 4 signed or unsigned 16  bit integers in a
     * as specified by imm.
     * The shuffle value, imm, must be an immediate.
     *
     * Placeholder only: no shuffle is performed.
     */
    volatile RpWNIOverlayM128i ao;

    RWAPIFUNCTION(RWSTRING("Rt_mm_shufflelo_epi16"));

    /* a is copied into the overlay but never read back */
    ao.m128i = a;

    /* Reports the missing implementation in builds where the macro is not a null op */
    INTEL_WNI_UNIMPLEMENTED("Rt_mm_shufflelo_epi16");

    RWRETURN(Rt_mm_set_sd(0));
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_unpackhi_epi8 interleaves the upper 8 signed or
 * unsigned 8bit integers in a with the upper 8 signed or unsigned 8bit
 * integers in b.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_unpackhi_epi8(Rt_m128i a, Rt_m128i b) /* PUNPCKHBW */
{
    /*
     * Interleaves bytes 8..15 of a with bytes 8..15 of b.
     * Result byte 2n is byte 8+n of a and result byte 2n+1 is
     * byte 8+n of b, for n in 0..7.
     */
    volatile RpWNIOverlayM128i res;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 pair;

    RWAPIFUNCTION(RWSTRING("Rt_mm_unpackhi_epi8"));

    lhs.m128i = a;
    rhs.m128i = b;

    for (pair = 0; pair < 8; pair++)
    {
        res._b[2 * pair] = lhs._b[8 + pair];
        res._b[2 * pair + 1] = rhs._b[8 + pair];
    }

    RWRETURN(res.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_unpackhi_epi16 interleaves the upper 4 signed or
 * unsigned 16bit integers in a with the upper 4 signed or unsigned
 * 16bit integers in b.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_unpackhi_epi16(Rt_m128i a, Rt_m128i b) /* PUNPCKHWD */
{
    /*
     * Interleaves 16bit elements 4..7 of a with 16bit elements
     * 4..7 of b.  Result element 2n is element 4+n of a and result
     * element 2n+1 is element 4+n of b, for n in 0..3.
     */
    volatile RpWNIOverlayM128i res;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 pair;

    RWAPIFUNCTION(RWSTRING("Rt_mm_unpackhi_epi16"));

    lhs.m128i = a;
    rhs.m128i = b;

    for (pair = 0; pair < 4; pair++)
    {
        res._w[2 * pair] = lhs._w[4 + pair];
        res._w[2 * pair + 1] = rhs._w[4 + pair];
    }

    RWRETURN(res.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_unpackhi_epi32 interleaves the upper 2 signed or
 * unsigned 32bit integers in a with the upper 2 signed or unsigned
 * 32bit integers in b.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_unpackhi_epi32(Rt_m128i a, Rt_m128i b) /* PUNPCKHDQ */
{
    /*
     * Interleaves the upper 2 signed or unsigned 32bit integers in a
     * with the upper 2 signed or unsigned 32bit integers in b.
     *
     * Result 32bit elements 0..3 are, in order:
     * a element 2, b element 2, a element 3, b element 3.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;

    RWAPIFUNCTION(RWSTRING("Rt_mm_unpackhi_epi32"));

    ao.m128i = a;
    bo.m128i = b;

    /* The 32bit (_d) view is used so all 128 result bits are written */
    ro._d[0] = ao._d[2];
    ro._d[1] = bo._d[2];
    ro._d[2] = ao._d[3];
    ro._d[3] = bo._d[3];

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_unpackhi_epi64 interleaves the upper signed or
 * unsigned 64bit integer in a with the upper signed or unsigned 64bit
 * integer in b.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_unpackhi_epi64(Rt_m128i a, Rt_m128i b) /* PUNPCKHQDQ */
{
    /*
     * Pairs up the upper 64bit halves of the operands:
     * half 1 of a becomes half 0 of the result and
     * half 1 of b becomes half 1 of the result.
     */
    volatile RpWNIOverlayM128i res;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;

    RWAPIFUNCTION(RWSTRING("Rt_mm_unpackhi_epi64"));

    lhs.m128i = a;
    rhs.m128i = b;

    res.m64[0] = lhs.m64[1];
    res.m64[1] = rhs.m64[1];

    RWRETURN(res.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_unpacklo_epi8 interleaves the lower 8 signed or
 * unsigned 8bit integers in a with the lower 8 signed or unsigned 8bit
 * integers in b.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_unpacklo_epi8(Rt_m128i a, Rt_m128i b) /* PUNPCKLBW */
{
    /*
     * Interleaves bytes 0..7 of a with bytes 0..7 of b.
     * Result byte 2n is byte n of a and result byte 2n+1 is
     * byte n of b, for n in 0..7.
     */
    volatile RpWNIOverlayM128i res;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 pair;

    RWAPIFUNCTION(RWSTRING("Rt_mm_unpacklo_epi8"));

    lhs.m128i = a;
    rhs.m128i = b;

    for (pair = 0; pair < 8; pair++)
    {
        res._b[2 * pair] = lhs._b[pair];
        res._b[2 * pair + 1] = rhs._b[pair];
    }

    RWRETURN(res.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_unpacklo_epi16 interleaves the lower 4 signed or
 * unsigned 16bit integers in a with the lower 4 signed or unsigned
 * 16bit integers in b.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_unpacklo_epi16(Rt_m128i a, Rt_m128i b) /* PUNPCKLWD */
{
    /*
     * Interleaves 16bit elements 0..3 of a with 16bit elements
     * 0..3 of b.  Result element 2n is element n of a and result
     * element 2n+1 is element n of b, for n in 0..3.
     */
    volatile RpWNIOverlayM128i res;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;
    int                 pair;

    RWAPIFUNCTION(RWSTRING("Rt_mm_unpacklo_epi16"));

    lhs.m128i = a;
    rhs.m128i = b;

    for (pair = 0; pair < 4; pair++)
    {
        res._w[2 * pair] = lhs._w[pair];
        res._w[2 * pair + 1] = rhs._w[pair];
    }

    RWRETURN(res.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_unpacklo_epi32 interleaves the lower 2 signed or
 * unsigned 32bit integers in a with the lower 2 signed or unsigned
 * 32bit integers in b.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_unpacklo_epi32(Rt_m128i a, Rt_m128i b) /* PUNPCKLDQ */
{

    /*
     * Interleaves the lower 2 signed or unsigned 32bit integers in a
     * with the lower 2 signed or unsigned 32bit integers in b.
     *
     * Result 32bit elements 0..3 are, in order:
     * a element 0, b element 0, a element 1, b element 1.
     */
    volatile RpWNIOverlayM128i ro;
    volatile RpWNIOverlayM128i ao;
    volatile RpWNIOverlayM128i bo;

    RWAPIFUNCTION(RWSTRING("Rt_mm_unpacklo_epi32"));

    ao.m128i = a;
    bo.m128i = b;

    /* The 32bit (_d) view is used so all 128 result bits are written */
    ro._d[0] = ao._d[0];
    ro._d[1] = bo._d[0];
    ro._d[2] = ao._d[1];
    ro._d[3] = bo._d[1];

    RWRETURN(ro.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_unpacklo_epi64 interleaves the lower signed or
 * unsigned 64bit integer in a with the lower signed or unsigned 64bit
 * integer in b.  *
 * \param  a   a
 * \param  b   b
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_unpacklo_epi64(Rt_m128i a, Rt_m128i b) /* PUNPCKLQDQ */
{
    /*
     * Pairs up the lower 64bit halves of the operands:
     * half 0 of a becomes half 0 of the result and
     * half 0 of b becomes half 1 of the result.
     */
    volatile RpWNIOverlayM128i res;
    volatile RpWNIOverlayM128i lhs;
    volatile RpWNIOverlayM128i rhs;

    RWAPIFUNCTION(RWSTRING("Rt_mm_unpacklo_epi64"));

    lhs.m128i = a;
    rhs.m128i = b;

    res.m64[0] = lhs.m64[0];
    res.m64[1] = rhs.m64[0];

    RWRETURN(res.m128i);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_loadl_epi64 load the lower 64 bits of the value
 * pointed to by p into the lower 64 bits of the result, zeroing the
 * upper 64 bits of the result.  *
 * \param  p   p
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_loadl_epi64(Rt_m128i const *p) /* MOVQ */
{
    /*
     * Copies the whole 128bit value at p into a local overlay, then
     * builds the result from its 64bit half 0; the upper half of the
     * result is cleared through its two 32bit elements.
     */
    volatile RpWNIOverlayM128i res;
    volatile RpWNIOverlayM128i src;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_loadl_epi64"));

    src.m128i = *p;

    res.m64[0] = src.m64[0];

    /* ud[2] and ud[3] overlay the upper 64 bits */
    for (lane = 2; lane < 4; lane++)
    {
        res.ud[lane] = 0x0;
    }

    RWRETURN(res.m128i);
}

/**\ingroup rtintel
 * \ref Rt_mm_load_si128 loads 128bit value.  Address p must be
 * 16byte aligned.  *
 * \param  p   p
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_load_si128(const Rt_m128i * p) /* MOVDQA */
{
    /*
     * Reads the 128bit value stored at p and returns it unchanged.
     * The emulated instruction requires p to be 16byte aligned;
     * no alignment check is made here.
     */
    volatile RpWNIOverlayM128i loaded;

    RWAPIFUNCTION(RWSTRING("Rt_mm_load_si128"));

    loaded.m128i = *p;

    RWRETURN(loaded.m128i);
}

/**\ingroup rtintel
 * \ref Rt_mm_loadu_si128 Loads 128bit value.address p need not
 * be 16byte aligned.  *
 * \param  p   p
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_loadu_si128(const Rt_m128i * p) /* MOVDQU */
{
    /*
     * Reads the 128bit value stored at p and returns it unchanged.
     * The emulated instruction places no alignment requirement on p.
     */
    volatile RpWNIOverlayM128i loaded;

    RWAPIFUNCTION(RWSTRING("Rt_mm_loadu_si128"));

    loaded.m128i = *p;

    RWRETURN(loaded.m128i);
}

/* Set Operations  */

/**\ingroup rtintel
 * \ref Rt_mm_set_epi64 sets the 2 64bit integer values.  *
 * \param  q1   q1
 * \param  q0   q0
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_set_epi64(Rt_m64 q1, Rt_m64 q0)
{
    /*
     * Assembles a 128bit value from two 64bit halves:
     * q1 is stored as half 1 and q0 as half 0.
     */
    volatile RpWNIOverlayM128i packed;

    RWAPIFUNCTION(RWSTRING("Rt_mm_set_epi64"));

    packed.m64[1] = q1;
    packed.m64[0] = q0;

    RWRETURN(packed.m128i);
}

/**\ingroup rtintel
 * \ref Rt_mm_set_epi32 sets the 4 signed 32bit integer values.
 * *
 * \param  i3   i3
 * \param  i2   i2
 * \param  i1   i1
 * \param  i0   i0
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_set_epi32(int i3, int i2, int i1, int i0)
{
    /*
     * Assembles a 128bit value from four 32bit integers:
     * argument iN is stored as 32bit element N.
     */
    volatile RpWNIOverlayM128i packed;

    RWAPIFUNCTION(RWSTRING("Rt_mm_set_epi32"));

    packed._d[3] = i3;
    packed._d[2] = i2;
    packed._d[1] = i1;
    packed._d[0] = i0;

    RWRETURN(packed.m128i);
}

/**\ingroup rtintel
 * \ref Rt_mm_set_epi16 sets the 8 signed 16bit integer values.
 * *
 * \param  w7   w7
 * \param  w6   w6
 * \param  w5   w5
 * \param  w4   w4
 * \param  w3   w3
 * \param  w2   w2
 * \param  w1   w1
 * \param  w0   w0
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_set_epi16(short w7, short w6,
                short w5, short w4, short w3, short w2, short w1,
                short w0)
{
    /*
     * Assembles a 128bit value from eight 16bit integers:
     * argument wN is stored as 16bit element N.
     */
    volatile RpWNIOverlayM128i packed;

    RWAPIFUNCTION(RWSTRING("Rt_mm_set_epi16"));

    packed._w[7] = w7;
    packed._w[6] = w6;
    packed._w[5] = w5;
    packed._w[4] = w4;
    packed._w[3] = w3;
    packed._w[2] = w2;
    packed._w[1] = w1;
    packed._w[0] = w0;

    RWRETURN(packed.m128i);
}

/**\ingroup rtintel
 * \ref Rt_mm_set_epi8 sets the 16 signed 8bit integer values.
 * *
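 * \param  b15   b15
 * \param  b14   b14
 * \param  b13   b13
 * \param  b12   b12
 * \param  b11   b11
 * \param  b10   b10
 * \param  b9   b9
 * \param  b8   b8
 * \param  b7   b7
 * \param  b6   b6
 * \param  b5   b5
 * \param  b4   b4
 * \param  b3   b3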
 * \param  b2   b2
 * \param  b1   b1
 * \param  b0   b0
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_set_epi8(char b15, char b14, char b13, char b12,
               char b11, char b10, char b9, char b8,
               char b7, char b6, char b5, char b4,
               char b3, char b2, char b1, char b0)
{
    /*
     * Assembles a 128bit value from sixteen 8bit integers:
     * argument bN is stored as byte N.
     */
    volatile RpWNIOverlayM128i packed;

    RWAPIFUNCTION(RWSTRING("Rt_mm_set_epi8"));

    packed._b[15] = b15;
    packed._b[14] = b14;
    packed._b[13] = b13;
    packed._b[12] = b12;
    packed._b[11] = b11;
    packed._b[10] = b10;
    packed._b[9] = b9;
    packed._b[8] = b8;
    packed._b[7] = b7;
    packed._b[6] = b6;
    packed._b[5] = b5;
    packed._b[4] = b4;
    packed._b[3] = b3;
    packed._b[2] = b2;
    packed._b[1] = b1;
    packed._b[0] = b0;

    RWRETURN(packed.m128i);
}

/**\ingroup rtintel
 * \ref Rt_mm_set1_epi64 sets the 2 64bit integer values to q.
 * *
 * \param  q   q
 *
 * \return result as described above
 */
Rt_m128i
Rt_mm_set1_epi64(Rt_m64 q)
{
    /*
     * Replicates q into both 64bit halves of the result.
     */
    volatile RpWNIOverlayM128i filled;
    int                 half;

    RWAPIFUNCTION(RWSTRING("Rt_mm_set1_epi64"));

    for (half = 0; half < 2; half++)
    {
        filled.m64[half] = q;
    }

    RWRETURN(filled.m128i);
}

/**\ingroup rtintel
 * \ref Rt_mm_set1_epi32 sets the 4 signed 32bit integer values
 * to i.
 *
 * \param  i   32bit value replicated into every 32bit lane
 *
 * \return the 128bit value whose four 32bit lanes all hold i
 */
Rt_m128i
Rt_mm_set1_epi32(int i)
{
    volatile RpWNIOverlayM128i splat;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_set1_epi32"));

    /* Broadcast i into 32bit lanes 0..3, in ascending order. */
    for (lane = 0; lane < 4; lane++)
    {
        splat._d[lane] = i;
    }

    RWRETURN(splat.m128i);
}

/**\ingroup rtintel
 * \ref Rt_mm_set1_epi16 sets the 8 signed 16bit integer values
 * to w.
 *
 * \param  w   16bit value replicated into every 16bit lane
 *
 * \return the 128bit value whose eight 16bit lanes all hold w
 */
Rt_m128i
Rt_mm_set1_epi16(short w)
{
    volatile RpWNIOverlayM128i splat;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_set1_epi16"));

    /* Broadcast w into 16bit lanes 0..7, in ascending order. */
    for (lane = 0; lane < 8; lane++)
    {
        splat._w[lane] = w;
    }

    RWRETURN(splat.m128i);
}

/**\ingroup rtintel
 * \ref Rt_mm_set1_epi8 sets the 16 signed 8bit integer values
 * to b.
 *
 * \param  b   8bit value replicated into every byte lane
 *
 * \return the 128bit value whose sixteen byte lanes all hold b
 */
Rt_m128i
Rt_mm_set1_epi8(char b)
{
    volatile RpWNIOverlayM128i splat;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_set1_epi8"));

    /* Broadcast b into byte lanes 0..15, in ascending order. */
    for (lane = 0; lane < 16; lane++)
    {
        splat._b[lane] = b;
    }

    RWRETURN(splat.m128i);
}

/**\ingroup rtintel
 * \ref Rt_mm_setr_epi64 sets the 2 64bit integer values in
 * reverse order, i.e. the first argument fills lane 0.
 *
 * \param  q0   value for 64bit lane 0
 * \param  q1   value for 64bit lane 1
 *
 * \return the 128bit value assembled from q0 and q1
 */
Rt_m128i
Rt_mm_setr_epi64(Rt_m64 q0, Rt_m64 q1)
{
    volatile RpWNIOverlayM128i packed;

    RWAPIFUNCTION(RWSTRING("Rt_mm_setr_epi64"));

    /* Argument order matches lane order: q0 first, then q1. */
    packed.m64[0] = q0;
    packed.m64[1] = q1;

    RWRETURN(packed.m128i);
}

/**\ingroup rtintel
 * \ref Rt_mm_setr_epi32 sets the 4 signed 32bit integer values
 * in reverse order, i.e. the first argument fills lane 0.
 *
 * \param  i0   value for 32bit lane 0
 * \param  i1   value for 32bit lane 1
 * \param  i2   value for 32bit lane 2
 * \param  i3   value for 32bit lane 3
 *
 * \return the 128bit value assembled from the four integers
 */
Rt_m128i
Rt_mm_setr_epi32(int i0, int i1, int i2, int i3)
{
    volatile RpWNIOverlayM128i packed;

    RWAPIFUNCTION(RWSTRING("Rt_mm_setr_epi32"));

    /* Argument order matches lane order: i0 lands in lane 0. */
    packed._d[0] = i0;
    packed._d[1] = i1;
    packed._d[2] = i2;
    packed._d[3] = i3;

    RWRETURN(packed.m128i);
}

/**\ingroup rtintel
 * \ref Rt_mm_setr_epi16 sets the 8 signed 16bit integer values
 * in reverse order, i.e. the first argument fills lane 0.
 *
 * \param  w0   value for 16bit lane 0
 * \param  w1   value for 16bit lane 1
 * \param  w2   value for 16bit lane 2
 * \param  w3   value for 16bit lane 3
 * \param  w4   value for 16bit lane 4
 * \param  w5   value for 16bit lane 5
 * \param  w6   value for 16bit lane 6
 * \param  w7   value for 16bit lane 7
 *
 * \return the 128bit value assembled from the eight shorts
 */
Rt_m128i
Rt_mm_setr_epi16(short w0, short w1,
                 short w2, short w3, short w4, short w5, short w6,
                 short w7)
{
    volatile RpWNIOverlayM128i packed;

    RWAPIFUNCTION(RWSTRING("Rt_mm_setr_epi16"));

    /* 16bit lanes 0..3 */
    packed._w[0] = w0;
    packed._w[1] = w1;
    packed._w[2] = w2;
    packed._w[3] = w3;

    /* 16bit lanes 4..7 */
    packed._w[4] = w4;
    packed._w[5] = w5;
    packed._w[6] = w6;
    packed._w[7] = w7;

    RWRETURN(packed.m128i);
}

/**\ingroup rtintel
 * \ref Rt_mm_setr_epi8 sets the 16 signed 8bit integer values
 * in reverse order, i.e. the first argument fills lane 0.
 *
 * \param  b0    value for byte lane 0
 * \param  b1    value for byte lane 1
 * \param  b2    value for byte lane 2
 * \param  b3    value for byte lane 3
 * \param  b4    value for byte lane 4
 * \param  b5    value for byte lane 5
 * \param  b6    value for byte lane 6
 * \param  b7    value for byte lane 7
 * \param  b8    value for byte lane 8
 * \param  b9    value for byte lane 9
 * \param  b10   value for byte lane 10
 * \param  b11   value for byte lane 11
 * \param  b12   value for byte lane 12
 * \param  b13   value for byte lane 13
 * \param  b14   value for byte lane 14
 * \param  b15   value for byte lane 15
 *
 * \return the 128bit value assembled from the sixteen bytes
 */
Rt_m128i
Rt_mm_setr_epi8(char b0, char b1, char b2, char b3,
                char b4, char b5, char b6, char b7,
                char b8, char b9, char b10, char b11,
                char b12, char b13, char b14, char b15)
{
    volatile RpWNIOverlayM128i packed;

    RWAPIFUNCTION(RWSTRING("Rt_mm_setr_epi8"));

    /* Byte lanes 0..3 */
    packed._b[0] = b0;
    packed._b[1] = b1;
    packed._b[2] = b2;
    packed._b[3] = b3;

    /* Byte lanes 4..7 */
    packed._b[4] = b4;
    packed._b[5] = b5;
    packed._b[6] = b6;
    packed._b[7] = b7;

    /* Byte lanes 8..11 */
    packed._b[8] = b8;
    packed._b[9] = b9;
    packed._b[10] = b10;
    packed._b[11] = b11;

    /* Byte lanes 12..15 */
    packed._b[12] = b12;
    packed._b[13] = b13;
    packed._b[14] = b14;
    packed._b[15] = b15;

    RWRETURN(packed.m128i);
}

/**\ingroup rtintel
 * \ref Rt_mm_setzero_si128 sets the 128bit value to zero.
 *
 * \return the 128bit value with all four 32bit lanes cleared
 */
Rt_m128i
Rt_mm_setzero_si128(void)       /* PXOR */
{
    volatile RpWNIOverlayM128i cleared;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_setzero_si128"));

    /* Clear the value one 32bit lane at a time, lanes 0..3. */
    for (lane = 0; lane < 4; lane++)
    {
        cleared._d[lane] = 0x0;
    }

    RWRETURN(cleared.m128i);
}

/* Store Operations  */

/**\ingroup rtintel
 * \ref Rt_mm_store_si128 stores 128bit value.  Address p must
 * be 16 byte aligned.
 *
 * \param  p   destination that receives the value
 * \param  a   128bit value to store
 *
 */
void
Rt_mm_store_si128(Rt_m128i * p, Rt_m128i a) /* MOVDQA */
{
    Rt_m128i * const    target = p;

    RWAPIFUNCTION(RWSTRING("Rt_mm_store_si128"));

    /* The emulation is a plain whole-value assignment through p. */
    *target = a;

    RWRETURNVOID();
}

/**\ingroup rtintel
 * \ref Rt_mm_storeu_si128 stores 128bit value.  Address p need
 * not be 16byte aligned.
 *
 * \param  p   destination that receives the value; may be unaligned
 * \param  a   128bit value to store
 *
 */
void
Rt_mm_storeu_si128(Rt_m128i * p, Rt_m128i a) /* MOVDQU */
{
    const unsigned char *src = (const unsigned char *) &a;
    unsigned char      *dst = (unsigned char *) p;
    size_t              i;

    RWAPIFUNCTION(RWSTRING("Rt_mm_storeu_si128"));

    /*
     * Copy byte by byte rather than assigning through p: the contract
     * allows p to be unaligned, and dereferencing a misaligned
     * Rt_m128i pointer is not guaranteed to work, whereas byte
     * accesses are valid at any address.
     */
    for (i = 0; i < sizeof(a); i++)
    {
        dst[i] = src[i];
    }

    RWRETURNVOID();
}

/**\ingroup rtintel
 * \ref Rt_mm_maskmoveu_si128 conditionally stores byte elements
 * of s to address p.  The high bit of each byte in the selector n
 * determines whether the corresponding byte in s will be stored.
 * Address p need not be 16byte aligned.
 *
 * \param  s   source value supplying the 16 candidate bytes
 * \param  n   selector; byte lane k of s is stored only when the top
 *             bit of byte lane k of n is set
 * \param  p   destination; p[k] receives byte lane k of s when
 *             selected and is left untouched otherwise
 *
 */
void
Rt_mm_maskmoveu_si128(Rt_m128i s, Rt_m128i n, char *p) /* MASKMOVDQU */
{
    volatile RpWNIOverlayM128i so;
    volatile RpWNIOverlayM128i no;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_maskmoveu_si128"));

    so.m128i = s;
    no.m128i = n;

    /*
     * Source lane, selector lane and destination offset are always the
     * same index.  Using one loop index removes the mismatched indices
     * the previous hand-unrolled form had on lanes 2, 10 and 12.
     * The selector's top bit is tested with a mask rather than by
     * comparing with zero, so the test does not depend on whether the
     * overlay's byte lanes are a signed or an unsigned type.
     */
    for (lane = 0; lane < 16; lane++)
    {
        if (no._b[lane] & 0x80)
        {
            p[lane] = so._b[lane];
        }
    }

    RWRETURNVOID();
}

/**\ingroup rtintel
 * \ref Rt_mm_storel_epi64 stores the lower 64 bits of a into the
 * lower 64 bits of the value pointed to by p.
 *
 * \param  p   destination; only its 64bit lane 0 is written
 * \param  a   source value; only its 64bit lane 0 is read
 *
 */
void
Rt_mm_storel_epi64(Rt_m128i * p, Rt_m128i a) /* MOVQ */
{
    volatile RpWNIOverlayM128i *dst;
    volatile RpWNIOverlayM128i src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_storel_epi64"));

    /* View the destination through the overlay to reach its lanes. */
    dst = (RpWNIOverlayM128i *) p;
    src.m128i = a;

    /* Only 64bit lane 0 is copied; lane 1 of *p is not written. */
    dst->m64[0] = src.m64[0];

    RWRETURNVOID();
}

/* Cacheability Support Operations for Willamette Integer Instructions  */

/**\ingroup rtintel
 * \ref Rt_mm_stream_si128 stores the data in a to the address p
 * without polluting the caches.  If the cache line containing address p
 * is already in the cache, the cache will be updated.  Address p must be
 * 16 byte aligned.
 *
 * \param  p   destination that receives the value
 * \param  a   128bit value to store
 *
 */
void
Rt_mm_stream_si128(Rt_m128i * p, Rt_m128i a)
{
    Rt_m128i * const    target = p;

    RWAPIFUNCTION(RWSTRING("Rt_mm_stream_si128"));

    /*
     * The emulation is an ordinary assignment through p; nothing
     * cache specific is done here.
     */
    *target = a;

    RWRETURNVOID();
}

/**\ingroup rtintel
 * \ref Rt_mm_stream_si32 stores the data in a to the address p
 * without polluting the caches.  If the cache line containing address p
 * is already in the cache, the cache will be updated.
 *
 * \param  p   destination that receives the value
 * \param  a   32bit value to store
 *
 */
void
Rt_mm_stream_si32(int *p, int a)
{
    int * const         target = p;

    RWAPIFUNCTION(RWSTRING("Rt_mm_stream_si32"));

    /*
     * The emulation is an ordinary assignment through p; nothing
     * cache specific is done here.
     */
    *target = a;

    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_clflush Cache line containing p is flushed and
 * invalidated from all caches in the coherency domain.  In this
 * emulation the function has no statements between its entry and exit
 * macros, so no cache operation is carried out here.
 *
 * \param  p   address inside the cache line of interest; marked
 *             __RWUNUSED__ and never read by this function
 *
 */
void
Rt_mm_clflush(void const * __RWUNUSED__ p)
{
    RWAPIFUNCTION(RWSTRING("Rt_mm_clflush"));

    /*
     * Intended operation: the cache line containing p is flushed and
     * invalidated from all caches in the coherency domain.
     * Nothing is executed for it in this emulation.
     */

    RWRETURNVOID();
}

/**\ingroup rtintel
 * \ref Rt_mm_lfence guarantees that every load instruction that precedes,
 * in program order, the load fence instruction is globally visible
 * before any load instruction which follows the fence in program order.
 * In this emulation the function has no statements between its entry
 * and exit macros, so no fence is issued here.
 *
 */
void
Rt_mm_lfence(void)
{
    RWAPIFUNCTION(RWSTRING("Rt_mm_lfence"));

    /*
     * Intended operation: every load that precedes the fence in program
     * order is globally visible before any load that follows it.
     * Nothing is executed for it in this emulation.
     */

    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_mfence guarantees that every memory access that precedes,
 * in program order, the memory fence instruction is globally visible before
 * any memory instruction which follows the fence in program order.
 * In this emulation the function has no statements between its entry
 * and exit macros, so no fence is issued here.
 */
void
Rt_mm_mfence(void)
{
    RWAPIFUNCTION(RWSTRING("Rt_mm_mfence"));

    /*
     * Intended operation: every memory access that precedes the fence
     * in program order is globally visible before any memory access
     * that follows it.
     * Nothing is executed for it in this emulation.
     */

    RWRETURNVOID();
}
