/* 
 * MMX emulation
 *
 * Copyright (c) 1998 Criterion Software Ltd.
 */

/*
 *      Generic software emulation of icl/MMX intrinsics.
 */
#include <stdio.h>
#include <stdlib.h>

#include "rpplugin.h"
#include "rpdbgerr.h"
#include "rtintel.h"
#include "mmx.h"

/*
 * Revision-control ($Id$) identification string for this file.
 * Tagged __RWUNUSED__, presumably to silence unused-variable warnings
 * since it is not referenced by the code visible here.
 */
static const char  __RWUNUSED__  rcsid[] =
    "@@(#)$Id: mmx.c,v 1.9 2001/08/29 16:38:16 johns Exp $";

#if (defined(__ICL))
/* Only when __ICL is defined (Intel compiler builds):
 * avoid voluminous
 *   'warning #963: no EMMS instruction before return'
 * etc warnings.  Warnings 964 and 965 are disabled alongside 963.
 */
#pragma warning( disable : 963 )
#pragma warning( disable : 964 )
#pragma warning( disable : 965 )
#endif /* (defined(__ICL)) */

/*
 * MMX_ASSERT(predicate): assertion used by the verification code in
 * this file.  Maps onto _ASSERTE when that macro is defined, and onto
 * LI_ASSERT otherwise.
 */
#if (defined(_ASSERTE))
#define MMX_ASSERT(predicate) _ASSERTE(predicate)
#else /* (defined(_ASSERTE)) */
#define MMX_ASSERT(predicate) LI_ASSERT(predicate)
#endif /* (defined(_ASSERTE)) */

/*
 * MMX_VERIFY(icl_intrinsic, result): cross-check of the emulation.
 *
 * When __ICL, LI_DEVELOP and LI_DEBUG are all defined, the expression
 * icl_intrinsic is evaluated into a temporary overlay named "chk" and
 * both of its _d lanes are asserted (via MMX_ASSERT) to equal the _d
 * lanes of result.  Callers must therefore not pass an overlay that is
 * itself called "chk".
 *
 * In every other configuration the macro expands to nothing, so
 * icl_intrinsic is never evaluated.
 */
#if ( defined(__ICL) && defined(LI_DEVELOP) && defined(LI_DEBUG) )
#define MMX_VERIFY(icl_intrinsic, result)                               \
do {                                                                    \
      RwOverlayM64   chk;                                               \
                                                                        \
      chk.m64 = icl_intrinsic;                                          \
      MMX_ASSERT((chk._d[0] == result._d[0]) &&                         \
                 (chk._d[1] == result._d[1]));                          \
   } while(0)
#endif /* ( defined(__ICL) && defined(LI_DEVELOP) && defined(LI_DEBUG) )  */

#if (!defined(MMX_VERIFY))
#define MMX_VERIFY(icl_intrinsic, result) /* No op */
#endif /* (!defined(MMX_VERIFY)) */

/*
 * RwOverlayM64Initialize(result, msd, lsd): fill the two _d lanes of an
 * overlay.  lsd is stored in _d[0] and msd in _d[1].
 *
 * Every argument is parenthesised in the expansion so that expressions
 * such as "*ptr" can be passed for result without being re-associated
 * by operator precedence.  result is expanded twice, so it should be
 * free of side effects.
 */
#define RwOverlayM64Initialize(result, msd, lsd)                        \
do                                                                      \
{                                                                       \
   (result)._d[0] = (lsd);                                              \
   (result)._d[1] = (msd);                                              \
} while(0)

/*
 * Clamp(value, lo, hi): yields lo when value <= lo, hi when
 * value >= hi, and value itself otherwise.
 *
 * This is a macro, so its arguments can be evaluated more than once;
 * pass expressions without side effects.
 */
#define Clamp(value, lo, hi)                                            \
    (                                                                   \
        ((value) <= (lo)) ? (lo) :                                      \
        ((value) >= (hi)) ? (hi) :                                      \
                            (value)                                     \
    )

/*
 * Saturation limits, all expressed as int.  An "L" name is the lower
 * bound and an "H" name the upper bound of a range; the ranges are
 * derived from unsigned char (..B), unsigned short (..W) and
 * unsigned int (..D).
 */

/* Unsigned range of unsigned char: 0 .. all bits set. */
#define LUB ((int)0)
#define HUB ((int)((unsigned char)~0))

/* Signed range of the same width: ~(max >> 1) .. (max >> 1). */
#define LSB (~((int)(((unsigned char)~0) >> 1)))
#define HSB ((int)(((unsigned char)~0) >> 1))

/* Unsigned range of unsigned short: 0 .. all bits set. */
#define LUW ((int)0)
#define HUW ((int)((unsigned short)~0))

/* Signed range of the same width: ~(max >> 1) .. (max >> 1). */
#define LSW (~((int)(((unsigned short)~0) >> 1)))
#define HSW ((int)(((unsigned short)~0) >> 1))

/*
 * Unsigned range of unsigned int.
 * NOTE(review): HUD converts an all-ones unsigned int to int, which
 * cannot represent that value; the result is implementation-defined
 * (commonly -1), so HUD is not usable as an upper bound with Clamp on
 * int operands -- confirm how it is used.
 */
#define LUD ((int)0)
#define HUD ((int)((unsigned int)~0))

/* Signed range of the same width: ~(max >> 1) .. (max >> 1). */
#define LSD (~((int)(((unsigned int)~0) >> 1)))
#define HSD ((int)(((unsigned int)~0) >> 1))

/**
 * \ingroup rtintel
 * \ref Rt_m_empty Empty the multimedia state.
 */
void
Rt_m_empty(void)
{
    /*
     *  void _m_empty (void)
     *  Empty the multimedia state.
     *
     *  The emulation only registers the call through RWAPIFUNCTION;
     *  the diagnostic below is compiled out by "#if (0)", so no other
     *  statement is executed.
     *
     *  NOTE(review): the other functions visible in this file leave
     *  through RWRETURN(), whereas this one falls off the end --
     *  confirm that no matching exit macro is required after
     *  RWAPIFUNCTION for a void function.
     */
    RWAPIFUNCTION(RWSTRING("Rt_m_empty"));

#if (0)
    LI_INTERNAL(("%s:%d Rt_m_empty() unimplemented\n", __FILE__,
                 __LINE__));
#endif /* (0) */
}

/**
 * \ingroup rtintel
 * \ref Rt_m_from_int Convert the integer object i to a 64-bit
 * Rt_m64 object. The integer value is zero extended to 64 bits.
 * 
 * \param i  int i
 *
 */
Rt_m64
Rt_m_from_int(int i)
{
    /*
     *  Emulates: Rt_m64 _m_from_int (int i)
     *
     *  Builds a 64-bit value whose _d[0] lane holds i and whose
     *  _d[1] lane is cleared, then returns it as an Rt_m64.
     */
    volatile RwOverlayM64 widened;
    Rt_m64              packed;

    RWAPIFUNCTION(RWSTRING("Rt_m_from_int"));

    /* Low lane carries the integer, high lane is zero. */
    widened._d[0] = i;
    widened._d[1] = 0;

    MMX_VERIFY(_m_from_int((int) i), widened);

    packed = widened.m64;
    RWRETURN(packed);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_to_int Convert the lower 32 bits of the Rt_m64 object
 * m to an integer.
 *
 * \param m  Rt_m64 m
 */
int
Rt_m_to_int(Rt_m64 m)
{
    /*
     *  Emulates: int _m_to_int (Rt_m64 m)
     *
     *  Returns the _d[0] lane of m.
     */
    volatile RwOverlayM64 view;

    RWAPIFUNCTION(RWSTRING("Rt_m_to_int"));

    view.m64 = m;

#if ( defined(__ICL) && defined(LI_DEVELOP) && defined(LI_DEBUG) )
    {
        /* Compare against the real intrinsic in ICL debug builds. */
        RwOverlayM64        expected;

        expected._d[0] = _m_to_int((Rt_m64) m);
        MMX_ASSERT(expected._d[0] == view._d[0]);
    }
#endif /* ( defined(__ICL) && defined(LI_DEVELOP) && defined(LI_DEBUG) )  */

    RWRETURN(view._d[0]);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_packsswb Pack the eight 16-bit values found in m1
 * and m2 into eight 8-bit values with signed saturation.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_packsswb(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_packsswb (Rt_m64 m1, Rt_m64 m2)
     *
     *  The four _w lanes of m1, followed by the four _w lanes of m2,
     *  are each clamped to [LSB, HSB] and stored in consecutive _b
     *  lanes of the result (m1 -> lanes 0-3, m2 -> lanes 4-7).
     */
    volatile RwOverlayM64 packed;
    volatile RwOverlayM64 lower;
    volatile RwOverlayM64 upper;
    int                 lane;
    int                 value;

    RWAPIFUNCTION(RWSTRING("Rt_m_packsswb"));

    lower.m64 = m1;
    upper.m64 = m2;

    for (lane = 0; lane < 4; lane++)
    {
        value = ((int) lower._w[lane]);
        packed._b[lane] = Clamp(value, LSB, HSB);
    }

    for (lane = 0; lane < 4; lane++)
    {
        value = ((int) upper._w[lane]);
        packed._b[lane + 4] = Clamp(value, LSB, HSB);
    }

    MMX_VERIFY(_m_packsswb((Rt_m64) m1, (Rt_m64) m2), packed);
    RWRETURN(packed.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_packssdw Pack the four 32-bit values found in m1 and
 * m2 into four 16-bit values with signed saturation.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_packssdw(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_packssdw (Rt_m64 m1, Rt_m64 m2)
     *
     *  The two _d lanes of m1, followed by the two _d lanes of m2,
     *  are each clamped to [LSW, HSW] and stored in consecutive _w
     *  lanes of the result (m1 -> lanes 0-1, m2 -> lanes 2-3).
     */
    volatile RwOverlayM64 packed;
    volatile RwOverlayM64 lower;
    volatile RwOverlayM64 upper;
    int                 lane;
    int                 value;

    RWAPIFUNCTION(RWSTRING("Rt_m_packssdw"));

    lower.m64 = m1;
    upper.m64 = m2;

    for (lane = 0; lane < 2; lane++)
    {
        value = ((int) lower._d[lane]);
        packed._w[lane] = Clamp(value, LSW, HSW);
    }

    for (lane = 0; lane < 2; lane++)
    {
        value = ((int) upper._d[lane]);
        packed._w[lane + 2] = Clamp(value, LSW, HSW);
    }

    MMX_VERIFY(_m_packssdw((Rt_m64) m1, (Rt_m64) m2), packed);
    RWRETURN(packed.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_packuswb Pack the eight 16-bit values found in m1
 * and m2 into eight 8-bit values with unsigned saturation.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_packuswb(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_packuswb (Rt_m64 m1, Rt_m64 m2)
     *
     *  The four _w lanes of m1, followed by the four _w lanes of m2,
     *  are each clamped to [LUB, HUB] and stored in consecutive ub
     *  lanes of the result (m1 -> lanes 0-3, m2 -> lanes 4-7).
     */
    volatile RwOverlayM64 packed;
    volatile RwOverlayM64 lower;
    volatile RwOverlayM64 upper;
    int                 lane;
    int                 value;

    RWAPIFUNCTION(RWSTRING("Rt_m_packuswb"));

    lower.m64 = m1;
    upper.m64 = m2;

    for (lane = 0; lane < 4; lane++)
    {
        value = ((int) lower._w[lane]);
        packed.ub[lane] = Clamp(value, LUB, HUB);
    }

    for (lane = 0; lane < 4; lane++)
    {
        value = ((int) upper._w[lane]);
        packed.ub[lane + 4] = Clamp(value, LUB, HUB);
    }

    MMX_VERIFY(_m_packuswb((Rt_m64) m1, (Rt_m64) m2), packed);
    RWRETURN(packed.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_punpckhbw Interleave the four 8-bit values from the
 * high half of m1 with the four 8-bit values from the high half of m2.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 *
 * \return four 16-bit values.
 */
Rt_m64
Rt_m_punpckhbw(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_punpckhbw (Rt_m64 m1, Rt_m64 m2)
     *
     *  Interleaves _b lanes 4-7 of m1 with _b lanes 4-7 of m2: even
     *  result lanes come from m1, odd result lanes from m2.
     */
    volatile RwOverlayM64 woven;
    volatile RwOverlayM64 first;
    volatile RwOverlayM64 second;
    int                 pair;

    RWAPIFUNCTION(RWSTRING("Rt_m_punpckhbw"));

    first.m64 = m1;
    second.m64 = m2;

    for (pair = 0; pair < 4; pair++)
    {
        woven._b[2 * pair] = first._b[pair + 4];
        woven._b[2 * pair + 1] = second._b[pair + 4];
    }

    MMX_VERIFY(_m_punpckhbw((Rt_m64) m1, (Rt_m64) m2), woven);
    RWRETURN(woven.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_punpckhwd Interleave the two 16-bit values from the
 * high half of m1 with the two 16-bit values from the high half of m2.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 *
 * \return two 32-bit values.
 */
Rt_m64
Rt_m_punpckhwd(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_punpckhwd (Rt_m64 m1, Rt_m64 m2)
     *
     *  Interleaves _w lanes 2-3 of m1 with _w lanes 2-3 of m2: even
     *  result lanes come from m1, odd result lanes from m2.
     */
    volatile RwOverlayM64 woven;
    volatile RwOverlayM64 first;
    volatile RwOverlayM64 second;
    int                 pair;

    RWAPIFUNCTION(RWSTRING("Rt_m_punpckhwd"));

    first.m64 = m1;
    second.m64 = m2;

    for (pair = 0; pair < 2; pair++)
    {
        woven._w[2 * pair] = first._w[pair + 2];
        woven._w[2 * pair + 1] = second._w[pair + 2];
    }

    MMX_VERIFY(_m_punpckhwd((Rt_m64) m1, (Rt_m64) m2), woven);
    RWRETURN(woven.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_punpckhdq Interleave the 32-bit value from the high
 * half of m1 with the 32-bit value from the high half of m2.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 *
 * \return a 64-bit value.
 */
Rt_m64
Rt_m_punpckhdq(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_punpckhdq (Rt_m64 m1, Rt_m64 m2)
     *
     *  Result lane _d[0] is lane _d[1] of m1 and result lane _d[1]
     *  is lane _d[1] of m2.
     */
    volatile RwOverlayM64 woven;
    volatile RwOverlayM64 first;
    volatile RwOverlayM64 second;

    RWAPIFUNCTION(RWSTRING("Rt_m_punpckhdq"));

    first.m64 = m1;
    second.m64 = m2;

    /* High lane of each operand, m1 first. */
    woven._d[0] = first._d[1];
    woven._d[1] = second._d[1];

    MMX_VERIFY(_m_punpckhdq((Rt_m64) m1, (Rt_m64) m2), woven);
    RWRETURN(woven.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_punpcklbw Interleave the four 8-bit values from the
 * low half of m1 with the four 8-bit values from the low half of m2.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 *
 * \return four 16-bit values.
 */
Rt_m64
Rt_m_punpcklbw(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_punpcklbw (Rt_m64 m1, Rt_m64 m2)
     *
     *  Interleaves _b lanes 0-3 of m1 with _b lanes 0-3 of m2: even
     *  result lanes come from m1, odd result lanes from m2.
     */
    volatile RwOverlayM64 woven;
    volatile RwOverlayM64 first;
    volatile RwOverlayM64 second;
    int                 pair;

    RWAPIFUNCTION(RWSTRING("Rt_m_punpcklbw"));

    first.m64 = m1;
    second.m64 = m2;

    for (pair = 0; pair < 4; pair++)
    {
        woven._b[2 * pair] = first._b[pair];
        woven._b[2 * pair + 1] = second._b[pair];
    }

    MMX_VERIFY(_m_punpcklbw((Rt_m64) m1, (Rt_m64) m2), woven);
    RWRETURN(woven.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_punpcklwd Interleave the two 16-bit values from the
 * low half of m1 with the two 16-bit values from the low half of m2.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 *
 * \return two 32-bit values.
 */
Rt_m64
Rt_m_punpcklwd(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_punpcklwd (Rt_m64 m1, Rt_m64 m2)
     *
     *  Interleaves _w lanes 0-1 of m1 with _w lanes 0-1 of m2: even
     *  result lanes come from m1, odd result lanes from m2.
     */
    volatile RwOverlayM64 woven;
    volatile RwOverlayM64 first;
    volatile RwOverlayM64 second;
    int                 pair;

    RWAPIFUNCTION(RWSTRING("Rt_m_punpcklwd"));

    first.m64 = m1;
    second.m64 = m2;

    for (pair = 0; pair < 2; pair++)
    {
        woven._w[2 * pair] = first._w[pair];
        woven._w[2 * pair + 1] = second._w[pair];
    }

    MMX_VERIFY(_m_punpcklwd((Rt_m64) m1, (Rt_m64) m2), woven);
    RWRETURN(woven.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_punpckldq Interleave the 32-bit value from the low
 * half of m1 with the 32-bit value from the low half of m2.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 *
 * \return a 64-bit value.
 */
Rt_m64
Rt_m_punpckldq(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_punpckldq (Rt_m64 m1, Rt_m64 m2)
     *
     *  Result lane _d[0] is lane _d[0] of m1 and result lane _d[1]
     *  is lane _d[0] of m2.
     */
    volatile RwOverlayM64 woven;
    volatile RwOverlayM64 first;
    volatile RwOverlayM64 second;

    RWAPIFUNCTION(RWSTRING("Rt_m_punpckldq"));

    first.m64 = m1;
    second.m64 = m2;

    /* Low lane of each operand, m1 first. */
    woven._d[0] = first._d[0];
    woven._d[1] = second._d[0];

    MMX_VERIFY(_m_punpckldq((Rt_m64) m1, (Rt_m64) m2), woven);
    RWRETURN(woven.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_paddb Add the eight 8-bit values in m1 to the eight
 * 8-bit values in m2.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_paddb(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_paddb (Rt_m64 m1, Rt_m64 m2)
     *
     *  Lane-wise sum of the eight _b lanes of m1 and m2; each sum is
     *  stored straight back into a _b lane (no saturation applied).
     */
    volatile RwOverlayM64 total;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_m_paddb"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 8; lane++)
    {
        total._b[lane] = lhs._b[lane] + ((int) rhs._b[lane]);
    }

    MMX_VERIFY(_m_paddb((Rt_m64) m1, (Rt_m64) m2), total);
    RWRETURN(total.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_paddw Add the four 16-bit values in m1 to the four
 * 16-bit values in m2.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_paddw(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_paddw (Rt_m64 m1, Rt_m64 m2)
     *
     *  Lane-wise sum of the four _w lanes of m1 and m2; each sum is
     *  stored straight back into a _w lane (no saturation applied).
     */
    volatile RwOverlayM64 total;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_m_paddw"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 4; lane++)
    {
        total._w[lane] = lhs._w[lane] + ((int) rhs._w[lane]);
    }

    MMX_VERIFY(_m_paddw((Rt_m64) m1, (Rt_m64) m2), total);
    RWRETURN(total.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_paddd Add the two 32-bit values in m1 to the two
 * 32-bit values in m2.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_paddd(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Rt_m64 _m_paddd (Rt_m64 m1, Rt_m64 m2)
     *  Add the two 32-bit values in m1 to the two 32-bit values in m2.
     *
     *  The sums are formed on the ud lanes rather than the _d lanes:
     *  a signed addition that overflows is undefined behaviour in C,
     *  whereas the instruction being emulated wraps around.  Unsigned
     *  arithmetic wraps by definition and yields the same bit pattern.
     *
     *  Assumes ud is the unsigned view of the same lanes as _d (it is
     *  used that way by Rt_m_psllw in this file) -- TODO confirm
     *  against the RwOverlayM64 declaration.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 o1;
    volatile RwOverlayM64 o2;

    RWAPIFUNCTION(RWSTRING("Rt_m_paddd"));

    o1.m64 = m1;
    o2.m64 = m2;
    result.ud[0] = o1.ud[0] + o2.ud[0];
    result.ud[1] = o1.ud[1] + o2.ud[1];

    MMX_VERIFY(_m_paddd((Rt_m64) m1, (Rt_m64) m2), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_paddsb Add the eight signed 8-bit values in m1 to
 * the eight signed 8-bit values in m2 and saturate.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_paddsb(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_paddsb (Rt_m64 m1, Rt_m64 m2)
     *
     *  Lane-wise sum of the eight _b lanes of m1 and m2, computed as
     *  int and clamped to [LSB, HSB] before being stored.
     */
    volatile RwOverlayM64 total;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int                 lane;
    int                 wide;

    RWAPIFUNCTION(RWSTRING("Rt_m_paddsb"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 8; lane++)
    {
        wide = ((int) lhs._b[lane]) + ((int) rhs._b[lane]);
        total._b[lane] = Clamp(wide, LSB, HSB);
    }

    MMX_VERIFY(_m_paddsb((Rt_m64) m1, (Rt_m64) m2), total);
    RWRETURN(total.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_paddsw Add the four signed 16-bit values in m1 to
 * the four signed 16-bit values in m2 and saturate.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_paddsw(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_paddsw (Rt_m64 m1, Rt_m64 m2)
     *
     *  Lane-wise sum of the four _w lanes of m1 and m2, computed as
     *  int and clamped to [LSW, HSW] before being stored.
     */
    volatile RwOverlayM64 total;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int                 lane;
    int                 wide;

    RWAPIFUNCTION(RWSTRING("Rt_m_paddsw"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 4; lane++)
    {
        wide = ((int) lhs._w[lane]) + ((int) rhs._w[lane]);
        total._w[lane] = Clamp(wide, LSW, HSW);
    }

    MMX_VERIFY(_m_paddsw((Rt_m64) m1, (Rt_m64) m2), total);
    RWRETURN(total.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_paddusb Add the eight unsigned 8-bit values in m1 to
 * the eight unsigned 8-bit values in m2 and saturate.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_paddusb(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_paddusb (Rt_m64 m1, Rt_m64 m2)
     *
     *  Lane-wise sum of the eight ub lanes of m1 and m2, computed as
     *  int and clamped to [LUB, HUB] before being stored.
     */
    volatile RwOverlayM64 total;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int                 lane;
    int                 wide;

    RWAPIFUNCTION(RWSTRING("Rt_m_paddusb"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 8; lane++)
    {
        wide = ((int) lhs.ub[lane]) + ((int) rhs.ub[lane]);
        total.ub[lane] = Clamp(wide, LUB, HUB);
    }

    MMX_VERIFY(_m_paddusb((Rt_m64) m1, (Rt_m64) m2), total);
    RWRETURN(total.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_paddusw Add the four unsigned 16-bit values in m1 to
 * the four unsigned 16-bit values in m2 and saturate.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_paddusw(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_paddusw (Rt_m64 m1, Rt_m64 m2)
     *
     *  Lane-wise sum of the four uw lanes of m1 and m2, computed as
     *  int and clamped to [LUW, HUW] before being stored.
     */
    volatile RwOverlayM64 total;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int                 lane;
    int                 wide;

    RWAPIFUNCTION(RWSTRING("Rt_m_paddusw"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 4; lane++)
    {
        wide = ((int) lhs.uw[lane]) + ((int) rhs.uw[lane]);
        total.uw[lane] = Clamp(wide, LUW, HUW);
    }

    MMX_VERIFY(_m_paddusw((Rt_m64) m1, (Rt_m64) m2), total);
    RWRETURN(total.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psubb Subtract the eight 8-bit values in m2 from the
 * eight 8-bit values in m1.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_psubb(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_psubb (Rt_m64 m1, Rt_m64 m2)
     *
     *  Lane-wise difference (m1 - m2) of the eight _b lanes; each
     *  difference is stored straight back into a _b lane (no
     *  saturation applied).
     */
    volatile RwOverlayM64 delta;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_m_psubb"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 8; lane++)
    {
        delta._b[lane] = lhs._b[lane] - rhs._b[lane];
    }

    MMX_VERIFY(_m_psubb((Rt_m64) m1, (Rt_m64) m2), delta);
    RWRETURN(delta.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psubw Subtract the four 16-bit values in m2 from the
 * four 16-bit values in m1.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_psubw(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_psubw (Rt_m64 m1, Rt_m64 m2)
     *
     *  Lane-wise difference (m1 - m2) of the four _w lanes; each
     *  difference is stored straight back into a _w lane (no
     *  saturation applied).
     */
    volatile RwOverlayM64 delta;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_m_psubw"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 4; lane++)
    {
        delta._w[lane] = lhs._w[lane] - rhs._w[lane];
    }

    MMX_VERIFY(_m_psubw((Rt_m64) m1, (Rt_m64) m2), delta);
    RWRETURN(delta.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psubd Subtract the two 32-bit values in m2 from the
 * two 32-bit values in m1.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_psubd(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Rt_m64 _m_psubd (Rt_m64 m1, Rt_m64 m2)
     *  Subtract the two 32-bit values in m2 from the two 32-bit values in m1.
     *
     *  The differences are formed on the ud lanes rather than the _d
     *  lanes: a signed subtraction that overflows is undefined
     *  behaviour in C, whereas the instruction being emulated wraps
     *  around.  Unsigned arithmetic wraps by definition and yields the
     *  same bit pattern.
     *
     *  Assumes ud is the unsigned view of the same lanes as _d (it is
     *  used that way by Rt_m_psllw in this file) -- TODO confirm
     *  against the RwOverlayM64 declaration.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 o1;
    volatile RwOverlayM64 o2;

    RWAPIFUNCTION(RWSTRING("Rt_m_psubd"));

    o1.m64 = m1;
    o2.m64 = m2;
    result.ud[0] = o1.ud[0] - o2.ud[0];
    result.ud[1] = o1.ud[1] - o2.ud[1];

    MMX_VERIFY(_m_psubd((Rt_m64) m1, (Rt_m64) m2), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psubsb Subtract the eight signed 8-bit values in m2
 * from the eight signed 8-bit values in m1 and saturate.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_psubsb(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_psubsb (Rt_m64 m1, Rt_m64 m2)
     *
     *  Lane-wise difference (m1 - m2) of the eight _b lanes, computed
     *  as int and clamped to [LSB, HSB] before being stored.
     */
    volatile RwOverlayM64 delta;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int                 lane;
    int                 wide;

    RWAPIFUNCTION(RWSTRING("Rt_m_psubsb"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 8; lane++)
    {
        wide = ((int) lhs._b[lane]) - ((int) rhs._b[lane]);
        delta._b[lane] = Clamp(wide, LSB, HSB);
    }

    MMX_VERIFY(_m_psubsb((Rt_m64) m1, (Rt_m64) m2), delta);
    RWRETURN(delta.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psubsw Subtract the four signed 16-bit values in m2
 * from the four signed 16-bit values in m1 and saturate.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_psubsw(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_psubsw (Rt_m64 m1, Rt_m64 m2)
     *
     *  Lane-wise difference (m1 - m2) of the four _w lanes, computed
     *  as int and clamped to [LSW, HSW] before being stored.
     */
    volatile RwOverlayM64 delta;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int                 lane;
    int                 wide;

    RWAPIFUNCTION(RWSTRING("Rt_m_psubsw"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 4; lane++)
    {
        wide = ((int) lhs._w[lane]) - ((int) rhs._w[lane]);
        delta._w[lane] = Clamp(wide, LSW, HSW);
    }

    MMX_VERIFY(_m_psubsw((Rt_m64) m1, (Rt_m64) m2), delta);
    RWRETURN(delta.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psubusb Subtract the eight unsigned 8-bit values in
 * m2 from the eight unsigned 8-bit values in m1 and saturate.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_psubusb(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_psubusb (Rt_m64 m1, Rt_m64 m2)
     *
     *  Lane-wise difference (m1 - m2) of the eight ub lanes, computed
     *  as int and clamped to [LUB, HUB] before being stored.
     */
    volatile RwOverlayM64 delta;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int                 lane;
    int                 wide;

    RWAPIFUNCTION(RWSTRING("Rt_m_psubusb"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 8; lane++)
    {
        wide = ((int) lhs.ub[lane]) - ((int) rhs.ub[lane]);
        delta.ub[lane] = Clamp(wide, LUB, HUB);
    }

    MMX_VERIFY(_m_psubusb((Rt_m64) m1, (Rt_m64) m2), delta);
    RWRETURN(delta.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psubusw Subtract the four unsigned 16-bit values in
 * m2 from the four unsigned 16-bit values in m1 and saturate.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_psubusw(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_psubusw (Rt_m64 m1, Rt_m64 m2)
     *
     *  Lane-wise difference (m1 - m2) of the four uw lanes, computed
     *  as int and clamped to [LUW, HUW] before being stored.
     */
    volatile RwOverlayM64 delta;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int                 lane;
    int                 wide;

    RWAPIFUNCTION(RWSTRING("Rt_m_psubusw"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 4; lane++)
    {
        wide = ((int) lhs.uw[lane]) - ((int) rhs.uw[lane]);
        delta.uw[lane] = Clamp(wide, LUW, HUW);
    }

    MMX_VERIFY(_m_psubusw((Rt_m64) m1, (Rt_m64) m2), delta);
    RWRETURN(delta.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pmaddwd Multiply four 16-bit values in m1 by four
 * 16-bit values in m2 producing four 32-bit intermediate results, which
 * are then summed by pairs to produce two 32-bit results.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_pmaddwd(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Rt_m64 _m_pmaddwd (Rt_m64 m1, Rt_m64 m2)
     *  Multiply four 16-bit values in m1 by four 16-bit values in m2
     *  producing four 32-bit intermediate results, which are then summed by
     *  pairs to produce two 32-bit results.
     *
     *  Each product is formed as RwInt32.  The pairwise sum is taken
     *  in unsigned arithmetic and stored through the ud lanes: a
     *  signed sum of two such products can exceed the signed range
     *  (both factors of both products at their most negative value),
     *  which would be undefined behaviour, whereas an unsigned sum
     *  wraps and yields the same bit pattern.
     *
     *  Assumes ud is the unsigned view of the same lanes as _d (it is
     *  used that way by Rt_m_psllw in this file) -- TODO confirm
     *  against the RwOverlayM64 declaration.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 o1;
    volatile RwOverlayM64 o2;
    RwInt32             even;
    RwInt32             odd;

    RWAPIFUNCTION(RWSTRING("Rt_m_pmaddwd"));

    o1.m64 = m1;
    o2.m64 = m2;

    even = ((RwInt32) o1._w[0]) * ((RwInt32) o2._w[0]);
    odd = ((RwInt32) o1._w[1]) * ((RwInt32) o2._w[1]);
    result.ud[0] = ((unsigned int) even) + ((unsigned int) odd);

    even = ((RwInt32) o1._w[2]) * ((RwInt32) o2._w[2]);
    odd = ((RwInt32) o1._w[3]) * ((RwInt32) o2._w[3]);
    result.ud[1] = ((unsigned int) even) + ((unsigned int) odd);

    MMX_VERIFY(_m_pmaddwd((Rt_m64) m1, (Rt_m64) m2), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pmulhw Multiply four 16-bit values in m1 by four
 * 16-bit values in m2 and produce the high 16 bits of the four results.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_pmulhw(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_pmulhw (Rt_m64 m1, Rt_m64 m2)
     *
     *  For each of the four _w lanes, the two operands are multiplied
     *  as int and the product shifted right by 16 before being stored,
     *  i.e. the upper half of the product is kept.
     *
     *  NOTE(review): right-shifting a negative int is
     *  implementation-defined in C; this relies on an arithmetic
     *  shift.
     */
    volatile RwOverlayM64 upper;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_m_pmulhw"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 4; lane++)
    {
        upper._w[lane] =
            ((int) lhs._w[lane]) * ((int) rhs._w[lane]) >> 16;
    }

    MMX_VERIFY(_m_pmulhw((Rt_m64) m1, (Rt_m64) m2), upper);
    RWRETURN(upper.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pmullw Multiply four 16-bit values in m1 by four
 * 16-bit values in m2 and produce the low 16 bits of the four results.
 * 
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 */
Rt_m64
Rt_m_pmullw(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Rt_m64 _m_pmullw (Rt_m64 m1, Rt_m64 m2)
     *  Multiply four 16-bit values in m1 by four 16-bit values in m2 and
     *  produce the low 16 bits of the four results.
     *
     *  The (short) cast is applied to the whole product.  Previously
     *  it bound only to the first operand (a cast has higher
     *  precedence than "*"), so the narrowing to 16 bits happened
     *  only implicitly on assignment; the stored value is unchanged,
     *  but the truncation is now explicit.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 o1;
    volatile RwOverlayM64 o2;

    RWAPIFUNCTION(RWSTRING("Rt_m_pmullw"));

    o1.m64 = m1;
    o2.m64 = m2;
    result._w[0] = (short) (((int) o1._w[0]) * ((int) o2._w[0]));
    result._w[1] = (short) (((int) o1._w[1]) * ((int) o2._w[1]));
    result._w[2] = (short) (((int) o1._w[2]) * ((int) o2._w[2]));
    result._w[3] = (short) (((int) o1._w[3]) * ((int) o2._w[3]));

    MMX_VERIFY(_m_pmullw((Rt_m64) m1, (Rt_m64) m2), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psllw Shift four 16-bit values in m left the amount
 * specified by count while shifting in zeros.
 * 
 * \param m  Rt_m64 m
 * \param count  Rt_m64 count
 */
Rt_m64
Rt_m_psllw(Rt_m64 m, Rt_m64 count)
{
    /*
     *  Emulates: Rt_m64 _m_psllw (Rt_m64 m, Rt_m64 count)
     *
     *  Shifts each of the four uw lanes of m left by count.ud[0].
     *  When count.ud[1] is non-zero or count.ud[0] is 16 or more,
     *  every lane of the result is zero instead.
     */
    volatile RwOverlayM64 shifted;
    volatile RwOverlayM64 source;
    volatile RwOverlayM64 amount;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_m_psllw"));

    source.m64 = m;
    amount.m64 = count;

    if ((amount.ud[1]) || (amount.ud[0] >= 16))
    {
        /* Shift count out of range: all bits are shifted out. */
        for (lane = 0; lane < 4; lane++)
        {
            shifted.uw[lane] = 0;
        }
    }
    else
    {
        for (lane = 0; lane < 4; lane++)
        {
            shifted.uw[lane] = source.uw[lane] << amount.ud[0];
        }
    }

    MMX_VERIFY(_m_psllw((Rt_m64) m, (Rt_m64) count), shifted);
    RWRETURN(shifted.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psllwi Shift four 16-bit values in m left the amount
 * specified by count while shifting in zeros.
 * For the best performance, count should be a constant.
 *
 * \param m  Rt_m64 m
 * \param count  int count; values outside 0..15 (including negative
 * values) produce an all-zero result
 *
 * \return Rt_m64 holding the four shifted 16-bit values.
 */
Rt_m64
Rt_m_psllwi(Rt_m64 m, int count)
{
    /*
     *  Rt_m64 _m_psllwi (Rt_m64 m, int count)
     *  Shift four 16-bit values in m left the amount specified by count while
     *  shifting in zeros.
     *
     *  For the best performance, count should be a constant.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 o;

    RWAPIFUNCTION(RWSTRING("Rt_m_psllwi"));

    o.m64 = m;

    /*
     * Shifting by a negative amount is undefined behaviour in C, so a
     * negative count is routed to the out-of-range branch rather than
     * being passed to the shift operator.
     * NOTE(review): this assumes the native intrinsic treats the count
     * as unsigned - confirm against MMX_VERIFY on an ICL debug build.
     */
    if ((count >= 0) && (count < 16))
    {
        result.uw[0] = o.uw[0] << count;
        result.uw[1] = o.uw[1] << count;
        result.uw[2] = o.uw[2] << count;
        result.uw[3] = o.uw[3] << count;
    }
    else
    {
        result.uw[0] = 0;
        result.uw[1] = 0;
        result.uw[2] = 0;
        result.uw[3] = 0;
    }

    MMX_VERIFY(_m_psllwi((Rt_m64) m, (int) count), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pslld Shift two 32-bit values in m left the amount
 * specified by count while shifting in zeros.
 *
 * \param m  Rt_m64 m
 * \param count  Rt_m64 count
 *
 * \return Rt_m64 holding the two shifted 32-bit values; all zero when
 * count is 32 or more.
 */
Rt_m64
Rt_m_pslld(Rt_m64 m, Rt_m64 count)
{
    /*
     *  Emulates: Rt_m64 _m_pslld (Rt_m64 m, Rt_m64 count)
     *  Logical left shift of each of the two 32-bit lanes of m.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 src;
    volatile RwOverlayM64 cnt;
    int             lane;

    RWAPIFUNCTION(RWSTRING("Rt_m_pslld"));

    src.m64 = m;
    cnt.m64 = count;

    /* Upper dword of the count set, or lower dword >= 32: everything
     * is shifted out. */
    if ((cnt.ud[1] != 0) || (cnt.ud[0] >= 32))
    {
        for (lane = 0; lane < 2; lane++)
        {
            result.ud[lane] = 0;
        }
    }
    else
    {
        for (lane = 0; lane < 2; lane++)
        {
            result.ud[lane] = src.ud[lane] << cnt.ud[0];
        }
    }

    MMX_VERIFY(_m_pslld((Rt_m64) m, (Rt_m64) count), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pslldi Shift two 32-bit values in m left the amount
 * specified by count while shifting in zeros.
 * For the best performance, count should be a constant.
 *
 * \param m  Rt_m64 m
 * \param count  int count; values outside 0..31 (including negative
 * values) produce an all-zero result
 *
 * \return Rt_m64 holding the two shifted 32-bit values.
 */
Rt_m64
Rt_m_pslldi(Rt_m64 m, int count)
{
    /*
     *  Rt_m64 _m_pslldi (Rt_m64 m, int count)
     *  Shift two 32-bit values in m left the amount specified by count while
     *  shifting in zeros.
     *
     *  For the best performance, count should be a constant.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 o;

    RWAPIFUNCTION(RWSTRING("Rt_m_pslldi"));

    o.m64 = m;

    /*
     * Shifting by a negative amount is undefined behaviour in C, so a
     * negative count is routed to the out-of-range branch rather than
     * being passed to the shift operator.
     * NOTE(review): this assumes the native intrinsic treats the count
     * as unsigned - confirm against MMX_VERIFY on an ICL debug build.
     */
    if ((count >= 0) && (count < 32))
    {
        result.ud[0] = o.ud[0] << count;
        result.ud[1] = o.ud[1] << count;
    }
    else
    {
        result.ud[0] = 0;
        result.ud[1] = 0;
    }

    MMX_VERIFY(_m_pslldi((Rt_m64) m, (int) count), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psllq Shift the 64-bit value in m left the amount
 * specified by count while shifting in zeros.
 *
 * \param m  Rt_m64 m
 * \param count  Rt_m64 count
 *
 * \return Rt_m64 holding the shifted 64-bit value; zero when count is
 * 64 or more.
 */
Rt_m64
Rt_m_psllq(Rt_m64 m, Rt_m64 count)
{
    /*
     *  Emulates: Rt_m64 _m_psllq (Rt_m64 m, Rt_m64 count)
     *  The 64-bit shift is built from the two 32-bit halves of the
     *  overlay: ud[0] is the low dword and ud[1] the high dword.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 src;
    volatile RwOverlayM64 cnt;

    RWAPIFUNCTION(RWSTRING("Rt_m_psllq"));

    src.m64 = m;
    cnt.m64 = count;

    if ((cnt.ud[1] != 0) || (cnt.ud[0] >= 64))
    {
        /* Every bit is shifted out */
        result.ud[0] = 0;
        result.ud[1] = 0;
    }
    else if (cnt.ud[0] >= 32)
    {
        /* Only bits from the low dword survive, landing in the high dword */
        result.ud[0] = 0;
        result.ud[1] = (src.ud[0] << (cnt.ud[0] - 32));
    }
    else if (cnt.ud[0] != 0)
    {
        /* 1..31: the high dword picks up the bits carried out of the low */
        result.ud[0] = src.ud[0] << cnt.ud[0];
        result.ud[1] =
            (src.ud[1] << cnt.ud[0]) | (src.ud[0] >> (32 - cnt.ud[0]));
    }
    else
    {
        /* Zero shift is kept separate so no 32-bit value is shifted by 32 */
        result.ud[0] = src.ud[0];
        result.ud[1] = src.ud[1];
    }

    MMX_VERIFY(_m_psllq((Rt_m64) m, (Rt_m64) count), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psllqi Shift the 64-bit value in m left the amount
 * specified by count while shifting in zeros.
 * For the best performance, count should be a constant.
 *
 * \param m  Rt_m64 m
 * \param count  int count; values outside 0..63 (including negative
 * values) produce a zero result
 *
 * \return Rt_m64 holding the shifted 64-bit value.
 */
Rt_m64
Rt_m_psllqi(Rt_m64 m, int count)
{
    /*
     *  Rt_m64 _m_psllqi (Rt_m64 m, int count)
     *  Shift the 64-bit value in m left the amount specified by count while
     *  shifting in zeros.
     *
     *  For the best performance, count should be a constant.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 o;

    RWAPIFUNCTION(RWSTRING("Rt_m_psllqi"));

    o.m64 = m;
    if (count == 0)
    {
        /* Kept separate so no 32-bit value is ever shifted by 32 */
        result.ud[0] = o.ud[0];
        result.ud[1] = o.ud[1];
    }
    else if ((count > 0) && (count < 32))
    {
        /* The high dword picks up the bits carried out of the low dword */
        result.ud[0] = o.ud[0] << count;
        result.ud[1] = (o.ud[1] << count) | (o.ud[0] >> (32 - count));
    }
    else
    {
        /*
         * Negative counts fall through to here and yield zero; shifting
         * by a negative amount would be undefined behaviour in C.
         * NOTE(review): this assumes the native intrinsic treats the
         * count as unsigned - confirm against MMX_VERIFY on ICL.
         */
        result.ud[0] = 0;
        if ((count >= 32) && (count < 64))
        {
            result.ud[1] = (o.ud[0] << (count - 32));
        }
        else
        {
            result.ud[1] = 0;
        }
    }

    MMX_VERIFY(_m_psllqi((Rt_m64) m, (int) count), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psraw Shift four 16-bit values in m right the amount
 * specified by count while shifting in the sign bit.
 *
 * \param m  Rt_m64 m
 * \param count  Rt_m64 count
 *
 * \return Rt_m64 holding the four shifted 16-bit values; when count is
 * 16 or more each value is all ones if it was negative, zero otherwise.
 */
Rt_m64
Rt_m_psraw(Rt_m64 m, Rt_m64 count)
{
    /*
     *  Emulates: Rt_m64 _m_psraw (Rt_m64 m, Rt_m64 count)
     *  Arithmetic right shift of each of the four 16-bit lanes of m.
     *  NOTE(review): relies on '>>' of a negative _w element replicating
     *  the sign bit, which C leaves implementation-defined.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 src;
    volatile RwOverlayM64 cnt;
    int             lane;

    RWAPIFUNCTION(RWSTRING("Rt_m_psraw"));

    src.m64 = m;
    cnt.m64 = count;

    if ((cnt.ud[1] != 0) || (cnt.ud[0] >= 16))
    {
        /* Oversized count: each lane collapses to copies of its sign */
        for (lane = 0; lane < 4; lane++)
        {
            result._w[lane] = (src._w[lane] < 0) ? (~0) : 0;
        }
    }
    else
    {
        for (lane = 0; lane < 4; lane++)
        {
            result._w[lane] = src._w[lane] >> cnt.ud[0];
        }
    }

    MMX_VERIFY(_m_psraw((Rt_m64) m, (Rt_m64) count), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psrawi Shift four 16-bit values in m right the
 * amount specified by count while shifting in the sign bit.
 * For the best performance, count should be a constant.
 *
 * \param m  Rt_m64 m
 * \param count  int count; values outside 0..15 (including negative
 * values) fill each 16-bit value with its sign bit
 *
 * \return Rt_m64 holding the four shifted 16-bit values.
 */
Rt_m64
Rt_m_psrawi(Rt_m64 m, int count)
{
    /*
     *  Rt_m64 _m_psrawi (Rt_m64 m, int count)
     *  Shift four 16-bit values in m right the amount specified by count
     *  while shifting in the sign bit.
     *
     *  For the best performance, count should be a constant.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 o;

    RWAPIFUNCTION(RWSTRING("Rt_m_psrawi"));

    o.m64 = m;

    /*
     * Shifting by a negative amount is undefined behaviour in C, so a
     * negative count is routed to the out-of-range (sign fill) branch.
     * NOTE(review): this assumes the native intrinsic treats the count
     * as unsigned - confirm against MMX_VERIFY on an ICL debug build.
     */
    if ((count >= 0) && (count < 16))
    {
        result._w[0] = o._w[0] >> count;
        result._w[1] = o._w[1] >> count;
        result._w[2] = o._w[2] >> count;
        result._w[3] = o._w[3] >> count;
    }
    else
    {
        /* Each lane becomes all ones if negative, otherwise zero */
        result._w[0] = (o._w[0] < 0) ? (~0) : 0;
        result._w[1] = (o._w[1] < 0) ? (~0) : 0;
        result._w[2] = (o._w[2] < 0) ? (~0) : 0;
        result._w[3] = (o._w[3] < 0) ? (~0) : 0;
    }

    MMX_VERIFY(_m_psrawi((Rt_m64) m, (int) count), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psrad Shift two 32-bit values in m right the amount
 * specified by count while shifting in the sign bit.
 *
 * \param m  Rt_m64 m
 * \param count  Rt_m64 count
 *
 * \return Rt_m64 holding the two shifted 32-bit values; when count is
 * 32 or more each value is all ones if it was negative, zero otherwise.
 */
Rt_m64
Rt_m_psrad(Rt_m64 m, Rt_m64 count)
{
    /*
     *  Emulates: Rt_m64 _m_psrad (Rt_m64 m, Rt_m64 count)
     *  Arithmetic right shift of each of the two 32-bit lanes of m.
     *  NOTE(review): relies on '>>' of a negative _d element replicating
     *  the sign bit, which C leaves implementation-defined.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 src;
    volatile RwOverlayM64 cnt;
    int             lane;

    RWAPIFUNCTION(RWSTRING("Rt_m_psrad"));

    src.m64 = m;
    cnt.m64 = count;

    if ((cnt.ud[1] != 0) || (cnt.ud[0] >= 32))
    {
        /* Oversized count: each lane collapses to copies of its sign */
        for (lane = 0; lane < 2; lane++)
        {
            result._d[lane] = (src._d[lane] < 0) ? (~0) : 0;
        }
    }
    else
    {
        for (lane = 0; lane < 2; lane++)
        {
            result._d[lane] = src._d[lane] >> cnt.ud[0];
        }
    }

    MMX_VERIFY(_m_psrad((Rt_m64) m, (Rt_m64) count), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psradi Shift two 32-bit values in m right the amount
 * specified by count while shifting in the sign bit.
 * For the best performance, count should be a constant.
 *
 * \param m  Rt_m64 m
 * \param count  int count; values outside 0..31 (including negative
 * values) fill each 32-bit value with its sign bit
 *
 * \return Rt_m64 holding the two shifted 32-bit values.
 */
Rt_m64
Rt_m_psradi(Rt_m64 m, int count)
{
    /*
     *  Rt_m64 _m_psradi (Rt_m64 m, int count)
     *  Shift two 32-bit values in m right the amount specified by count while
     *  shifting in the sign bit.
     *
     *  For the best performance, count should be a constant.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 o;

    RWAPIFUNCTION(RWSTRING("Rt_m_psradi"));

    o.m64 = m;

    /*
     * Shifting by a negative amount is undefined behaviour in C, so a
     * negative count is routed to the out-of-range (sign fill) branch.
     * NOTE(review): this assumes the native intrinsic treats the count
     * as unsigned - confirm against MMX_VERIFY on an ICL debug build.
     */
    if ((count >= 0) && (count < 32))
    {
        result._d[0] = o._d[0] >> count;
        result._d[1] = o._d[1] >> count;
    }
    else
    {
        /* Each lane becomes all ones if negative, otherwise zero */
        result._d[0] = (o._d[0] < 0) ? (~0) : 0;
        result._d[1] = (o._d[1] < 0) ? (~0) : 0;
    }

    MMX_VERIFY(_m_psradi((Rt_m64) m, (int) count), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psrlw Shift four 16-bit values in m right the amount
 * specified by count while shifting in zeros.
 *
 * \param m  Rt_m64 m
 * \param count  Rt_m64 count
 *
 * \return Rt_m64 holding the four shifted 16-bit values; all zero when
 * count is 16 or more.
 */
Rt_m64
Rt_m_psrlw(Rt_m64 m, Rt_m64 count)
{
    /*
     *  Emulates: Rt_m64 _m_psrlw (Rt_m64 m, Rt_m64 count)
     *  Logical right shift of each of the four 16-bit lanes of m.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 src;
    volatile RwOverlayM64 cnt;
    int             lane;

    RWAPIFUNCTION(RWSTRING("Rt_m_psrlw"));

    src.m64 = m;
    cnt.m64 = count;

    /* Upper dword of the count set, or lower dword >= 16: everything
     * is shifted out. */
    if ((cnt.ud[1] != 0) || (cnt.ud[0] >= 16))
    {
        for (lane = 0; lane < 4; lane++)
        {
            result.uw[lane] = 0;
        }
    }
    else
    {
        for (lane = 0; lane < 4; lane++)
        {
            result.uw[lane] = src.uw[lane] >> cnt.ud[0];
        }
    }

    MMX_VERIFY(_m_psrlw((Rt_m64) m, (Rt_m64) count), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psrlwi Shift four 16-bit values in m right the
 * amount specified by count while shifting in zeros.
 * For the best performance, count should be a constant.
 *
 * \param m  Rt_m64 m
 * \param count  int count; values outside 0..15 (including negative
 * values) produce an all-zero result
 *
 * \return Rt_m64 holding the four shifted 16-bit values.
 */
Rt_m64
Rt_m_psrlwi(Rt_m64 m, int count)
{
    /*
     *  Rt_m64 _m_psrlwi (Rt_m64 m, int count)
     *  Shift four 16-bit values in m right the amount specified by count
     *  while shifting in zeros.
     *
     *  For the best performance, count should be a constant.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 o;

    RWAPIFUNCTION(RWSTRING("Rt_m_psrlwi"));

    o.m64 = m;

    /*
     * Shifting by a negative amount is undefined behaviour in C, so a
     * negative count is routed to the out-of-range branch rather than
     * being passed to the shift operator.
     * NOTE(review): this assumes the native intrinsic treats the count
     * as unsigned - confirm against MMX_VERIFY on an ICL debug build.
     */
    if ((count >= 0) && (count < 16))
    {
        result.uw[0] = o.uw[0] >> count;
        result.uw[1] = o.uw[1] >> count;
        result.uw[2] = o.uw[2] >> count;
        result.uw[3] = o.uw[3] >> count;
    }
    else
    {
        result.uw[0] = 0;
        result.uw[1] = 0;
        result.uw[2] = 0;
        result.uw[3] = 0;
    }

    MMX_VERIFY(_m_psrlwi((Rt_m64) m, (int) count), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psrld Shift two 32-bit values in m right the amount
 * specified by count while shifting in zeros.
 *
 * \param m  Rt_m64 m
 * \param count  Rt_m64 count
 *
 * \return Rt_m64 holding the two shifted 32-bit values; all zero when
 * count is 32 or more.
 */
Rt_m64
Rt_m_psrld(Rt_m64 m, Rt_m64 count)
{
    /*
     *  Emulates: Rt_m64 _m_psrld (Rt_m64 m, Rt_m64 count)
     *  Logical right shift of each of the two 32-bit lanes of m.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 src;
    volatile RwOverlayM64 cnt;
    int             lane;

    RWAPIFUNCTION(RWSTRING("Rt_m_psrld"));

    src.m64 = m;
    cnt.m64 = count;

    /* Upper dword of the count set, or lower dword >= 32: everything
     * is shifted out. */
    if ((cnt.ud[1] != 0) || (cnt.ud[0] >= 32))
    {
        for (lane = 0; lane < 2; lane++)
        {
            result.ud[lane] = 0;
        }
    }
    else
    {
        for (lane = 0; lane < 2; lane++)
        {
            result.ud[lane] = src.ud[lane] >> cnt.ud[0];
        }
    }

    MMX_VERIFY(_m_psrld((Rt_m64) m, (Rt_m64) count), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psrldi Shift two 32-bit values in m right the amount
 * specified by count while shifting in zeros.
 * For the best performance, count should be a constant.
 *
 * \param m  Rt_m64 m
 * \param count  int count; values outside 0..31 (including negative
 * values) produce an all-zero result
 *
 * \return Rt_m64 holding the two shifted 32-bit values.
 */
Rt_m64
Rt_m_psrldi(Rt_m64 m, int count)
{
    /*
     *  Rt_m64 _m_psrldi (Rt_m64 m, int count)
     *  Shift two 32-bit values in m right the amount specified by count while
     *  shifting in zeros.
     *
     *  For the best performance, count should be a constant.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 o;

    RWAPIFUNCTION(RWSTRING("Rt_m_psrldi"));

    o.m64 = m;

    /*
     * Shifting by a negative amount is undefined behaviour in C, so a
     * negative count is routed to the out-of-range branch rather than
     * being passed to the shift operator.
     * NOTE(review): this assumes the native intrinsic treats the count
     * as unsigned - confirm against MMX_VERIFY on an ICL debug build.
     */
    if ((count >= 0) && (count < 32))
    {
        result.ud[0] = o.ud[0] >> count;
        result.ud[1] = o.ud[1] >> count;
    }
    else
    {
        result.ud[0] = 0;
        result.ud[1] = 0;
    }

    MMX_VERIFY(_m_psrldi((Rt_m64) m, (int) count), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psrlq Shift the 64-bit value in m right the amount
 * specified by count while shifting in zeros.
 *
 * \param m  Rt_m64 m
 * \param count  Rt_m64 count
 *
 * \return Rt_m64 holding the shifted 64-bit value; zero when count is
 * 64 or more.
 */
Rt_m64
Rt_m_psrlq(Rt_m64 m, Rt_m64 count)
{
    /*
     *  Emulates: Rt_m64 _m_psrlq (Rt_m64 m, Rt_m64 count)
     *  The 64-bit shift is built from the two 32-bit halves of the
     *  overlay: ud[0] is the low dword and ud[1] the high dword.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 src;
    volatile RwOverlayM64 cnt;

    RWAPIFUNCTION(RWSTRING("Rt_m_psrlq"));

    src.m64 = m;
    cnt.m64 = count;

    if ((cnt.ud[1] != 0) || (cnt.ud[0] >= 64))
    {
        /* Every bit is shifted out */
        result.ud[0] = 0;
        result.ud[1] = 0;
    }
    else if (cnt.ud[0] >= 32)
    {
        /* Only bits from the high dword survive, landing in the low dword */
        result.ud[0] = (src.ud[1] >> (cnt.ud[0] - 32));
        result.ud[1] = 0;
    }
    else if (cnt.ud[0] != 0)
    {
        /* 1..31: the low dword picks up the bits carried out of the high */
        result.ud[0] =
            (src.ud[0] >> cnt.ud[0]) | (src.ud[1] << (32 - cnt.ud[0]));
        result.ud[1] = src.ud[1] >> cnt.ud[0];
    }
    else
    {
        /* Zero shift is kept separate so no 32-bit value is shifted by 32 */
        result.ud[0] = src.ud[0];
        result.ud[1] = src.ud[1];
    }

    MMX_VERIFY(_m_psrlq((Rt_m64) m, (Rt_m64) count), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_psrlqi Shift the 64-bit value in m right the amount
 * specified by count while shifting in zeros.
 * For the best performance, count should be a constant.
 *
 * \param m  Rt_m64 m
 * \param count  int count; values outside 0..63 (including negative
 * values) produce a zero result
 *
 * \return Rt_m64 holding the shifted 64-bit value.
 */
Rt_m64
Rt_m_psrlqi(Rt_m64 m, int count)
{
    /*
     *  Rt_m64 _m_psrlqi (Rt_m64 m, int count)
     *  Shift the 64-bit value in m right the amount specified by count while
     *  shifting in zeros.
     *
     *  For the best performance, count should be a constant.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 o;

    RWAPIFUNCTION(RWSTRING("Rt_m_psrlqi"));

    o.m64 = m;
    if (count == 0)
    {
        /* Kept separate so no 32-bit value is ever shifted by 32 */
        result.ud[0] = o.ud[0];
        result.ud[1] = o.ud[1];
    }
    else if ((count > 0) && (count < 32))
    {
        /* The low dword picks up the bits carried out of the high dword */
        result.ud[0] = (o.ud[0] >> count) | (o.ud[1] << (32 - count));
        result.ud[1] = o.ud[1] >> count;
    }
    else
    {
        /*
         * Negative counts fall through to here and yield zero; shifting
         * by a negative amount would be undefined behaviour in C.
         * NOTE(review): this assumes the native intrinsic treats the
         * count as unsigned - confirm against MMX_VERIFY on ICL.
         */
        if ((count >= 32) && (count < 64))
        {
            result.ud[0] = (o.ud[1] >> (count - 32));
        }
        else
        {
            result.ud[0] = 0;
        }
        result.ud[1] = 0;
    }

    MMX_VERIFY(_m_psrlqi((Rt_m64) m, (int) count), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pand Perform a bitwise AND of the 64-bit value in m1
 * with the 64-bit value in m2.
 *
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 *
 * \return Rt_m64 holding m1 AND m2.
 */
Rt_m64
Rt_m_pand(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_pand (Rt_m64 m1, Rt_m64 m2)
     *  The 64-bit AND is carried out one 32-bit half at a time.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int             half;

    RWAPIFUNCTION(RWSTRING("Rt_m_pand"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (half = 0; half < 2; half++)
    {
        result._d[half] = lhs._d[half] & rhs._d[half];
    }

    MMX_VERIFY(_m_pand((Rt_m64) m1, (Rt_m64) m2), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pandn Perform a logical NOT on the 64-bit value in
 * m1 and use the result in a bitwise AND with the 64-bit value in m2.
 *
 * \param m1  Rt_m64 m1 (the operand that is inverted)
 * \param m2  Rt_m64 m2
 *
 * \return Rt_m64 holding (NOT m1) AND m2.
 */
Rt_m64
Rt_m_pandn(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_pandn (Rt_m64 m1, Rt_m64 m2)
     *  The operation is carried out one 32-bit half at a time; only the
     *  first operand is complemented.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int             half;

    RWAPIFUNCTION(RWSTRING("Rt_m_pandn"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (half = 0; half < 2; half++)
    {
        result._d[half] = (~lhs._d[half]) & rhs._d[half];
    }

    MMX_VERIFY(_m_pandn((Rt_m64) m1, (Rt_m64) m2), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_por Perform a bitwise OR of the 64-bit value in m1
 * with the 64-bit value in m2.
 *
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 *
 * \return Rt_m64 holding m1 OR m2.
 */
Rt_m64
Rt_m_por(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_por (Rt_m64 m1, Rt_m64 m2)
     *  The 64-bit OR is carried out one 32-bit half at a time.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int             half;

    RWAPIFUNCTION(RWSTRING("Rt_m_por"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (half = 0; half < 2; half++)
    {
        result._d[half] = lhs._d[half] | rhs._d[half];
    }

    MMX_VERIFY(_m_por((Rt_m64) m1, (Rt_m64) m2), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pxor Perform a bitwise XOR of the 64-bit value in m1
 * with the 64-bit value in m2.
 *
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 *
 * \return Rt_m64 holding m1 XOR m2.
 */
Rt_m64
Rt_m_pxor(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_pxor (Rt_m64 m1, Rt_m64 m2)
     *  The 64-bit XOR is carried out one 32-bit half at a time.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int             half;

    RWAPIFUNCTION(RWSTRING("Rt_m_pxor"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (half = 0; half < 2; half++)
    {
        result._d[half] = lhs._d[half] ^ rhs._d[half];
    }

    MMX_VERIFY(_m_pxor((Rt_m64) m1, (Rt_m64) m2), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pcmpeqb If the respective 8-bit values in m1 are
 * equal to the respective 8-bit values in m2 set the respective 8-bit
 * resulting values to all ones, otherwise set them to all zeros.
 *
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 *
 * \return Rt_m64 holding eight 8-bit masks, one per compared pair.
 */
Rt_m64
Rt_m_pcmpeqb(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_pcmpeqb (Rt_m64 m1, Rt_m64 m2)
     *  Per-byte equality test producing an all-ones or all-zeros mask
     *  in each of the eight lanes.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int             lane;

    RWAPIFUNCTION(RWSTRING("Rt_m_pcmpeqb"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 8; lane++)
    {
        result._b[lane] = (lhs._b[lane] == rhs._b[lane]) ? (~0) : 0;
    }

    MMX_VERIFY(_m_pcmpeqb((Rt_m64) m1, (Rt_m64) m2), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pcmpeqw If the respective 16-bit values in m1 are
 * equal to the respective 16-bit values in m2 set the respective 16-bit
 * resulting values to all ones, otherwise set them to all zeros.
 *
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 *
 * \return Rt_m64 holding four 16-bit masks, one per compared pair.
 */
Rt_m64
Rt_m_pcmpeqw(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_pcmpeqw (Rt_m64 m1, Rt_m64 m2)
     *  Per-word equality test producing an all-ones or all-zeros mask
     *  in each of the four lanes.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int             lane;

    RWAPIFUNCTION(RWSTRING("Rt_m_pcmpeqw"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 4; lane++)
    {
        result._w[lane] = (lhs._w[lane] == rhs._w[lane]) ? (~0) : 0;
    }

    MMX_VERIFY(_m_pcmpeqw((Rt_m64) m1, (Rt_m64) m2), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pcmpeqd If the respective 32-bit values in m1 are
 * equal to the respective 32-bit values in m2 set the respective 32-bit
 * resulting values to all ones, otherwise set them to all zeros.
 *
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 *
 * \return Rt_m64 holding two 32-bit masks, one per compared pair.
 */
Rt_m64
Rt_m_pcmpeqd(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_pcmpeqd (Rt_m64 m1, Rt_m64 m2)
     *  Per-dword equality test producing an all-ones or all-zeros mask
     *  in each of the two lanes.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int             lane;

    RWAPIFUNCTION(RWSTRING("Rt_m_pcmpeqd"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 2; lane++)
    {
        result._d[lane] = (lhs._d[lane] == rhs._d[lane]) ? (~0) : 0;
    }

    MMX_VERIFY(_m_pcmpeqd((Rt_m64) m1, (Rt_m64) m2), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pcmpgtb If the respective 8-bit values in m1 are
 * greater than the respective 8-bit values in m2 set the respective
 * 8-bit resulting values to all ones, otherwise set them to all zeros.
 *
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 *
 * \return Rt_m64 holding eight 8-bit masks, one per compared pair.
 */
Rt_m64
Rt_m_pcmpgtb(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_pcmpgtb (Rt_m64 m1, Rt_m64 m2)
     *  Per-byte greater-than test producing an all-ones or all-zeros
     *  mask in each of the eight lanes.
     *  NOTE(review): the comparison goes through the _b view, which is
     *  presumably the signed 8-bit overlay - confirm in RwOverlayM64.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int             lane;

    RWAPIFUNCTION(RWSTRING("Rt_m_pcmpgtb"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 8; lane++)
    {
        result._b[lane] = (lhs._b[lane] > rhs._b[lane]) ? (~0) : 0;
    }

    MMX_VERIFY(_m_pcmpgtb((Rt_m64) m1, (Rt_m64) m2), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pcmpgtw If the respective 16-bit values in m1 are
 * greater than the respective 16-bit values in m2 set the respective
 * 16-bit resulting values to all ones, otherwise set them to all zeros.
 *
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 *
 * \return Rt_m64 holding four 16-bit masks, one per compared pair.
 */
Rt_m64
Rt_m_pcmpgtw(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_pcmpgtw (Rt_m64 m1, Rt_m64 m2)
     *  Per-word greater-than test producing an all-ones or all-zeros
     *  mask in each of the four lanes.
     *  NOTE(review): the comparison goes through the _w view, which is
     *  presumably the signed 16-bit overlay - confirm in RwOverlayM64.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int             lane;

    RWAPIFUNCTION(RWSTRING("Rt_m_pcmpgtw"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 4; lane++)
    {
        result._w[lane] = (lhs._w[lane] > rhs._w[lane]) ? (~0) : 0;
    }

    MMX_VERIFY(_m_pcmpgtw((Rt_m64) m1, (Rt_m64) m2), result);
    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pcmpgtd If the respective 32-bit values in m1 are
 * greater than the respective 32-bit values in m2 set the respective
 * 32-bit resulting values to all ones, otherwise set them all to zeros.
 *
 * \param m1  Rt_m64 m1
 * \param m2  Rt_m64 m2
 *
 * \return Rt_m64 holding two 32-bit masks, one per compared pair.
 */
Rt_m64
Rt_m_pcmpgtd(Rt_m64 m1, Rt_m64 m2)
{
    /*
     *  Emulates: Rt_m64 _m_pcmpgtd (Rt_m64 m1, Rt_m64 m2)
     *  Per-dword greater-than test producing an all-ones or all-zeros
     *  mask in each of the two lanes.
     *  NOTE(review): the comparison goes through the _d view, which is
     *  presumably the signed 32-bit overlay - confirm in RwOverlayM64.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int             lane;

    RWAPIFUNCTION(RWSTRING("Rt_m_pcmpgtd"));

    lhs.m64 = m1;
    rhs.m64 = m2;

    for (lane = 0; lane < 2; lane++)
    {
        result._d[lane] = (lhs._d[lane] > rhs._d[lane]) ? (~0) : (0);
    }

    MMX_VERIFY(_m_pcmpgtd((Rt_m64) m1, (Rt_m64) m2), result);
    RWRETURN(result.m64);
}
