/*
 * INTEL OVERLOAD
 *
 * Intel overloaded functions.
 *
 * Copyright (c) 1998 Criterion Software Ltd.
 *
 */

/****************************************************************************
 Includes
 */

#include <stdio.h>
#include <stdlib.h>

#include "rpplugin.h"
#include "rpdbgerr.h"
#include "rtintel.h"
#include "overload.h"

/*
 * Revision-control identification string (RCS "$Id$" keyword) kept in the
 * object file. It is tagged __RWUNUSED__, presumably so that compilers do
 * not warn about a static that is never referenced.
 */
static const char __RWUNUSED__   rcsid[] =
    "@@(#)$Id: overload.c,v 1.12 2001/06/12 08:55:13 johns Exp $";

/*
 * OUTPUTDEBUGSTRING(_msg)
 *
 * Forwards _msg to OutputDebugString() when building with Microsoft C
 * (_MSC_VER >= 1000) for Win32 or Xbox; the matching platform header is
 * pulled in here.  On every other toolchain the macro expands to nothing,
 * so its argument is not evaluated.
 */
#if defined(_WIN32) && defined(_MSC_VER) && (_MSC_VER >= 1000)
#  if defined(_XBOX)
#    include <xtl.h>
#  else
#    include <windows.h>
#    include <crtdbg.h>
#  endif
#  define OUTPUTDEBUGSTRING(_msg) OutputDebugString(_msg)
#endif

#ifndef OUTPUTDEBUGSTRING
#  define OUTPUTDEBUGSTRING(_msg) /* Null op */
#endif

/*
 * Fallback values for the flush-to-zero control constants, used only when
 * no previously included header has defined them.
 */
#ifndef _MM_FLUSH_ZERO_MASK
#define _MM_FLUSH_ZERO_MASK   0x8000
#endif

#ifndef _MM_FLUSH_ZERO_ON
#define _MM_FLUSH_ZERO_ON     0x8000
#endif

#ifndef _MM_FLUSH_ZERO_OFF
#define _MM_FLUSH_ZERO_OFF    0x0000
#endif

#if (defined(__ICL))
#if (400<=__ICL)

/*
 * RW_PREFETCH_SSE(_p, _i)
 *
 * Issues _mm_prefetch(_p, _i) unless the build defines
 * RW_SUPPRESS_PREFETCH, in which case the macro expands to nothing.
 * A definition supplied before this point is kept when prefetching is
 * not suppressed.
 */
#ifdef RW_SUPPRESS_PREFETCH
#define RW_PREFETCH_SSE(_p, _i) /* No op */
#endif

#ifndef RW_PREFETCH_SSE
#define RW_PREFETCH_SSE(_p, _i) _mm_prefetch((_p), (_i))
#endif

/*
 * _rwSSEVECTORMULTPOINT(_src, _trg, _mx, _my, _mz, _mw, _v)
 *
 * Transforms the point _src by the matrix rows held in the overlays
 * _mx, _my, _mz and _mw:
 *
 *     _v.m128 = _src.x * _mx + _src.y * _my + _src.z * _mz + _mw
 *     _trg    = _v.v4d.v3d
 *
 * _v is caller-supplied scratch storage and is overwritten.  _src is read
 * in full before _trg is written, so _src and _trg may name the same
 * object.  The two statements are wrapped in MACRO_START / MACRO_STOP so
 * that the expansion behaves as a single statement (for example as the
 * body of an unbraced `if`); invoke it with a trailing semicolon.
 */
#define _rwSSEVECTORMULTPOINT(_src, _trg, _mx, _my, _mz, _mw, _v)   \
MACRO_START                                                         \
{                                                                   \
    (_v).m128 =                                                     \
        _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_set_ps1((_src).x),     \
                                         (_mx).m128),               \
                              _mm_mul_ps(_mm_set_ps1((_src).y),     \
                                         (_my).m128)),              \
                   _mm_add_ps(_mm_mul_ps(_mm_set_ps1((_src).z),     \
                                         (_mz).m128),               \
                              (_mw).m128));                         \
                                                                    \
    (_trg) = (_v).v4d.v3d;                                          \
}                                                                   \
MACRO_STOP

/*
 * _rwSSEVECTORMULTVECTOR(_src, _trg, _mx, _my, _mz, _v)
 *
 * Transforms the direction vector _src by the matrix rows held in the
 * overlays _mx, _my and _mz (no translation row is added):
 *
 *     _v.m128 = _src.x * _mx + _src.y * _my + _src.z * _mz
 *     _trg    = _v.v4d.v3d
 *
 * _v is caller-supplied scratch storage and is overwritten.  _src is read
 * in full before _trg is written, so _src and _trg may name the same
 * object.  The two statements are wrapped in MACRO_START / MACRO_STOP so
 * that the expansion behaves as a single statement; invoke it with a
 * trailing semicolon.
 */
#define _rwSSEVECTORMULTVECTOR(_src, _trg, _mx, _my, _mz, _v)   \
MACRO_START                                                     \
{                                                               \
    (_v).m128 =                                                 \
        _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_set_ps1((_src).x), \
                                         (_mx).m128),           \
                              _mm_mul_ps(_mm_set_ps1((_src).y), \
                                         (_my).m128)),          \
                   _mm_mul_ps(_mm_set_ps1((_src).z),            \
                              (_mz).m128));                     \
                                                                \
    (_trg) = (_v).v4d.v3d;                                      \
}                                                               \
MACRO_STOP


/****************************************************************************
 _rwMatrixMultiplySSE

 Writes the product of matA and matB into dstMat.  The right, up and at
 rows of matA are transformed as vectors by matB; the pos row is
 transformed as a point, i.e. matB's pos row is added as well.

 matB's rows are copied into locals before anything is stored, and each
 row of dstMat is written only after the matching row of matA has been
 read.

 On entry   : Dest matrix pointer, two source matrix pointers
 On exit    : Dest matrix rows (right, up, at, pos) contain the result
 */
static
RWASMAPI(void)
_rwMatrixMultiplySSE(RwMatrix * dstMat,
                     const RwMatrix * matA, const RwMatrix * matB)
{
    RpSSEOverlayM128    rowRight;
    RpSSEOverlayM128    rowUp;
    RpSSEOverlayM128    rowAt;
    RpSSEOverlayM128    rowPos;
    RpSSEOverlayM128    scratch;

    RWFUNCTION(RWSTRING("_rwMatrixMultiplySSE"));
    RWASSERT(dstMat);
    RWASSERT(matA);
    RWASSERT(matB);

    /* Snapshot matB as four 4-wide rows; w is 0 for the axes, 1 for pos */
    rowRight.v4d.v3d = matB->right;
    rowRight.v4d.w = 0;

    rowUp.v4d.v3d = matB->up;
    rowUp.v4d.w = 0;

    rowAt.v4d.v3d = matB->at;
    rowAt.v4d.w = 0;

    rowPos.v4d.v3d = matB->pos;
    rowPos.v4d.w = 1;

    /* The three axis rows are plain vector transforms */
    _rwSSEVECTORMULTVECTOR(matA->right, dstMat->right,
                           rowRight, rowUp, rowAt, scratch);
    _rwSSEVECTORMULTVECTOR(matA->up, dstMat->up,
                           rowRight, rowUp, rowAt, scratch);
    _rwSSEVECTORMULTVECTOR(matA->at, dstMat->at,
                           rowRight, rowUp, rowAt, scratch);

    /* The position row also picks up matB's translation
     * (implicit one in the bottom right of the matrix)
     */
    _rwSSEVECTORMULTPOINT(matA->pos, dstMat->pos,
                          rowRight, rowUp, rowAt, rowPos, scratch);

    RWRETURNVOID();
}

/*
 * See
 * http://www.lysator.liu.se/c/duffs-device.html
 * for details of Duff's Device, a dynamic loop-unrolling optimization.
 * At the time of writing, Tom Duff is at Lucasfilm / Pixar.
 */

/****************************************************************************
 VectorMultPointSSE

 Transforms an array of points by a matrix.  For every i in
 [0, numPoints):

     pointsOut[i] = pointsIn[i].x * matrix->right
                  + pointsIn[i].y * matrix->up
                  + pointsIn[i].z * matrix->at
                  + matrix->pos

 The work is unrolled eight points at a time using Duff's Device and runs
 from the highest index down to index 0.  Each source point is read in
 full before its target is written.

 On entry   : out array, in array, num of points, matrix
 On exit    : returns the out array; nothing is written if numPoints <= 0
 */

static RwV3d       *
VectorMultPointSSE(RwV3d * pointsOut,
                   const RwV3d * pointsIn,
                   RwInt32 numPoints, const RwMatrix * matrix)
{

    RpSSEOverlayM128    v;
    RpSSEOverlayM128    m_x;
    RpSSEOverlayM128    m_y;
    RpSSEOverlayM128    m_z;
    RpSSEOverlayM128    m_w;

    RWFUNCTION(RWSTRING("VectorMultPointSSE"));

    RWASSERT(pointsOut);
    RWASSERT(pointsIn);
    RWASSERT(matrix);

    if (numPoints > 0)
    {
        /* Index of the first point in the highest block of eight; that
         * block holds (numPoints & 7) points, or eight if that is zero.
         */
        const RwUInt32      offset =
            ((RwUInt32) (numPoints - 1)) & ~((RwUInt32) 7);
        /* Number of passes through the unrolled body, partial one included */
        RwUInt32            blocksLeft = (offset >> 3) + 1;
        const RwV3d        *sourceptr = &pointsIn[offset];
        RwV3d              *targetptr = &pointsOut[offset];

        m_x.v4d.w = 0;
        m_x.v4d.v3d = matrix->right;

        m_y.v4d.w = 0;
        m_y.v4d.v3d = matrix->up;

        m_z.v4d.w = 0;
        m_z.v4d.v3d = matrix->at;

        m_w.v4d.w = 1;
        m_w.v4d.v3d = matrix->pos;

        /* Duff's Device: the switch jumps into the middle of the loop body
         * to handle the partial block; later passes do eight points each.
         */
        switch (numPoints & 7)
        {
            case 0:
                do
                {
                    _rwSSEVECTORMULTPOINT(sourceptr[7], targetptr[7],
                                          m_x, m_y, m_z, m_w, v);
            case 7:
                    _rwSSEVECTORMULTPOINT(sourceptr[6], targetptr[6],
                                          m_x, m_y, m_z, m_w, v);
            case 6:
                    _rwSSEVECTORMULTPOINT(sourceptr[5], targetptr[5],
                                          m_x, m_y, m_z, m_w, v);
            case 5:
                    _rwSSEVECTORMULTPOINT(sourceptr[4], targetptr[4],
                                          m_x, m_y, m_z, m_w, v);
            case 4:
                    _rwSSEVECTORMULTPOINT(sourceptr[3], targetptr[3],
                                          m_x, m_y, m_z, m_w, v);
            case 3:
                    _rwSSEVECTORMULTPOINT(sourceptr[2], targetptr[2],
                                          m_x, m_y, m_z, m_w, v);
            case 2:
                    _rwSSEVECTORMULTPOINT(sourceptr[1], targetptr[1],
                                          m_x, m_y, m_z, m_w, v);
            case 1:
                    _rwSSEVECTORMULTPOINT(sourceptr[0], targetptr[0],
                                          m_x, m_y, m_z, m_w, v);

                    /* Step back only while another block remains, so the
                     * pointers never move below the start of the arrays.
                     */
                    if (0 != --blocksLeft)
                    {
                        sourceptr -= 8;
                        targetptr -= 8;
                    }
                }
                while (0 != blocksLeft);
        }
    }

    RWRETURN(pointsOut);
}

/****************************************************************************
 VectorMultVectorSSE

 Transforms an array of direction vectors by a matrix.  For every i in
 [0, numPoints):

     pointsOut[i] = pointsIn[i].x * matrix->right
                  + pointsIn[i].y * matrix->up
                  + pointsIn[i].z * matrix->at

 matrix->pos is not used: vectors are not translated.  The work is
 unrolled eight vectors at a time using Duff's Device and runs from the
 highest index down to index 0.  Each source vector is read in full before
 its target is written.

 On entry   : out array, in array, num of vectors, matrix
 On exit    : returns the out array; nothing is written if numPoints <= 0
 */

static RwV3d       *
VectorMultVectorSSE(RwV3d * pointsOut,
                    const RwV3d * pointsIn,
                    RwInt32 numPoints, const RwMatrix * matrix)
{

    RpSSEOverlayM128    v;
    RpSSEOverlayM128    m_x;
    RpSSEOverlayM128    m_y;
    RpSSEOverlayM128    m_z;

    RWFUNCTION(RWSTRING("VectorMultVectorSSE"));

    RWASSERT(pointsOut);
    RWASSERT(pointsIn);
    RWASSERT(matrix);

    if (numPoints > 0)
    {
        /* Index of the first vector in the highest block of eight; that
         * block holds (numPoints & 7) vectors, or eight if that is zero.
         */
        const RwUInt32      offset =
            ((RwUInt32) (numPoints - 1)) & ~((RwUInt32) 7);
        /* Number of passes through the unrolled body, partial one included */
        RwUInt32            blocksLeft = (offset >> 3) + 1;
        const RwV3d        *sourceptr = &pointsIn[offset];
        RwV3d              *targetptr = &pointsOut[offset];

        m_x.v4d.w = 0;
        m_x.v4d.v3d = matrix->right;

        m_y.v4d.w = 0;
        m_y.v4d.v3d = matrix->up;

        m_z.v4d.w = 0;
        m_z.v4d.v3d = matrix->at;

        /* Duff's Device: the switch jumps into the middle of the loop body
         * to handle the partial block; later passes do eight vectors each.
         */
        switch (numPoints & 7)
        {
            case 0:
                do
                {
                    _rwSSEVECTORMULTVECTOR(sourceptr[7], targetptr[7],
                                           m_x, m_y, m_z, v);
            case 7:
                    _rwSSEVECTORMULTVECTOR(sourceptr[6], targetptr[6],
                                           m_x, m_y, m_z, v);
            case 6:
                    _rwSSEVECTORMULTVECTOR(sourceptr[5], targetptr[5],
                                           m_x, m_y, m_z, v);
            case 5:
                    _rwSSEVECTORMULTVECTOR(sourceptr[4], targetptr[4],
                                           m_x, m_y, m_z, v);
            case 4:
                    _rwSSEVECTORMULTVECTOR(sourceptr[3], targetptr[3],
                                           m_x, m_y, m_z, v);
            case 3:
                    _rwSSEVECTORMULTVECTOR(sourceptr[2], targetptr[2],
                                           m_x, m_y, m_z, v);
            case 2:
                    _rwSSEVECTORMULTVECTOR(sourceptr[1], targetptr[1],
                                           m_x, m_y, m_z, v);
            case 1:
                    _rwSSEVECTORMULTVECTOR(sourceptr[0], targetptr[0],
                                           m_x, m_y, m_z, v);

                    /* Step back only while another block remains, so the
                     * pointers never move below the start of the arrays.
                     */
                    if (0 != --blocksLeft)
                    {
                        sourceptr -= 8;
                        targetptr -= 8;
                    }
                }
                while (0 != blocksLeft);
        }
    }

    RWRETURN(pointsOut);
}

#if (!defined(RXPIPELINE))

/*
 * LOAD_MATRIX_SSE(mRg, mUp, mAt, mPs, mat)
 *
 * Broadcasts each component of the matrix rows into a full SSE register:
 * element [0] of each output array receives the row's x, [1] its y and
 * [2] its z, replicated across all lanes by _mm_set_ps1.
 *
 *   mRg, mUp, mAt, mPs - arrays of at least three overlays, filled from
 *                        mat->right, mat->up, mat->at and mat->pos
 *   mat                - pointer to the source matrix; the argument is
 *                        expanded twelve times, so it must be free of
 *                        side effects
 *
 * All arguments are parenthesised, so expressions such as `base + 1` may
 * be passed for mat.
 */
#define LOAD_MATRIX_SSE(mRg, mUp, mAt, mPs, mat)        \
MACRO_START                                             \
{                                                       \
    (mRg)[0].m128 = _mm_set_ps1((mat)->right.x);        \
    (mRg)[1].m128 = _mm_set_ps1((mat)->right.y);        \
    (mRg)[2].m128 = _mm_set_ps1((mat)->right.z);        \
                                                        \
    (mUp)[0].m128 = _mm_set_ps1((mat)->up.x);           \
    (mUp)[1].m128 = _mm_set_ps1((mat)->up.y);           \
    (mUp)[2].m128 = _mm_set_ps1((mat)->up.z);           \
                                                        \
    (mAt)[0].m128 = _mm_set_ps1((mat)->at.x);           \
    (mAt)[1].m128 = _mm_set_ps1((mat)->at.y);           \
    (mAt)[2].m128 = _mm_set_ps1((mat)->at.z);           \
                                                        \
    (mPs)[0].m128 = _mm_set_ps1((mat)->pos.x);          \
    (mPs)[1].m128 = _mm_set_ps1((mat)->pos.y);          \
    (mPs)[2].m128 = _mm_set_ps1((mat)->pos.z);          \
}                                                       \
MACRO_STOP

/*
 * LOAD_CAMERA_SSE(cW, cH, xOff, yOff, zScl, zShf, cam)
 *
 * Broadcasts the camera projection constants of `cam` across all lanes of
 * the given overlays:
 *
 *   cW   <- cam.camWidth        cH   <- cam.camHeight
 *   xOff <- cam.camOffsetX      yOff <- cam.camOffsetY
 *   zScl <- cam.zScale          zShf <- cam.zShift
 *
 * The output parameters are deliberately NOT named zScale / zShift: a
 * macro parameter with the same spelling as a member would also be
 * substituted inside `cam.zScale` / `cam.zShift`, so the member read would
 * silently depend on the name of the caller's variable.
 */
#define LOAD_CAMERA_SSE(cW, cH, xOff, yOff, zScl, zShf, cam) \
MACRO_START                                         \
{                                                   \
    (cW).m128 = _mm_set_ps1((cam).camWidth);        \
    (cH).m128 = _mm_set_ps1((cam).camHeight);       \
                                                    \
    (xOff).m128 = _mm_set_ps1((cam).camOffsetX);    \
    (yOff).m128 = _mm_set_ps1((cam).camOffsetY);    \
                                                    \
    (zScl).m128 = _mm_set_ps1((cam).zScale);        \
    (zShf).m128 = _mm_set_ps1((cam).zShift);        \
}                                                   \
MACRO_STOP

/*
 * LOAD_CLIP_SSE(nC, fC, xLo, xHi, yLo, yHi, zLo, zHi, cam)
 *
 * Fills the overlays used by the clip tests:
 *
 *   nC, fC   - two values read from `cam`, broadcast to all lanes.
 *   xLo..zHi - the rwXLOCLIP .. rwZHICLIP flag values.  Each is stored as
 *              an integer bit pattern through RwSplitBits and broadcast
 *              without numeric conversion, so the lanes can be combined
 *              with bitwise SSE operations and read back as integers.
 *
 * NOTE(review): `nC` and `fC` are macro parameters, so the member names in
 * `cam.nC` / `cam.fC` are themselves replaced by the caller's first two
 * arguments.  Invoked as LOAD_CLIP_SSE(nearClip, farClip, ...) the macro
 * therefore reads cam.nearClip and cam.farClip.  The visible call site
 * depends on this; confirm the intended member names before renaming the
 * parameters or passing differently named variables.
 */
#define LOAD_CLIP_SSE(nC, fC, xLo, xHi, yLo, yHi, zLo, zHi, cam) \
MACRO_START                                     \
{                                               \
    RwSplitBits split;                          \
                                                \
    nC.m128 = _mm_set_ps1(cam.nC);              \
    fC.m128 = _mm_set_ps1(cam.fC);              \
                                                \
    split.nUInt = (RwUInt32) rwXLOCLIP;         \
    xLo.m128 = _mm_set_ps1(split.nReal);        \
    split.nUInt = (RwUInt32) rwXHICLIP;         \
    xHi.m128 = _mm_set_ps1(split.nReal);        \
                                                \
    split.nUInt = (RwUInt32) rwYLOCLIP;         \
    yLo.m128 = _mm_set_ps1(split.nReal);        \
    split.nUInt = (RwUInt32) rwYHICLIP;         \
    yHi.m128 = _mm_set_ps1(split.nReal);        \
                                                \
    split.nUInt = (RwUInt32) rwZLOCLIP;         \
    zLo.m128 = _mm_set_ps1(split.nReal);        \
    split.nUInt = (RwUInt32) rwZHICLIP;         \
    zHi.m128 = _mm_set_ps1(split.nReal);        \
                                                \
}                                               \
MACRO_STOP

/****************************************************************************
 PipeTransformPerspectiveSSE

 Transforms instanced vertices into camera space four at a time, computes
 their clip flags and, for vertices with no clip flag set, writes the
 perspective-projected device (2D) vertex.

 For each vertex:
   - the camera-space position (the instanced position transformed by the
     xForm matrix of the polygon header) is stored in the camera vertex;
   - clip flags are built from the tests x < 0, x > z, y < 0, y > z,
     z < near and z > far, and stored in the camera vertex;
   - if no flag is set, the device vertex receives the camera position,
     1/z (an _mm_rcp_ps approximation) and the screen position
         sx = x / z * camWidth  + camOffsetX
         sy = y / z * camHeight + camOffsetY
         sz = zScale / z + zShift

 When nNumVert is not a multiple of four, the first pass handles the
 (nNumVert & 3) leading vertices by pointing several lanes at the same
 vertex; every later pass handles four distinct vertices.

 On entry   : repEntry        - resource entry holding the instanced data
            : nNumVert        - number of vertices to transform
            : instancedVertex - first source vertex
            : deviceVertex    - first device vertex to fill
            : cameraVertex    - first camera vertex to fill
 On exit    : Returns the bitwise AND of the clip flags of all processed
              vertices.  The OR and the AND of the flags are also merged
              into _rwPipeState.currentContext->clipFlagsOr / clipFlagsAnd.
 */

static              RwInt32
PipeTransformPerspectiveSSE(RwResEntry * repEntry,
                            RwInt32 nNumVert,
                            RWVERTEXINSTANCE * instancedVertex,
                            RwIm2DVertex * deviceVertex,
                            RwCameraVertex * cameraVertex)
{
    RwMatrix           *mpMat =
        (RwMatrix *) RWPOLYHEADER(repEntry)->xForm;
    RWVERTEXINSTANCE   *inVert0, *inVert1, *inVert2, *inVert3;
    RwIm2DVertex       *devVert0, *devVert1, *devVert2, *devVert3;
    RwCameraVertex     *camVert0, *camVert1, *camVert2, *camVert3;
    RwInt32             j;
    RwSplitBits         split;
    RpSSEOverlayM128    v1, v2, v3;
    RpSSEOverlayM128    in[3], out[3], nRecipZ;
    RpSSEOverlayM128    matRight[3], matUp[3], matAt[3], matPos[3];
    RpSSEOverlayM128    transpose[4], row[4];
    RpSSEOverlayM128    camWidth, camHeight;
    RpSSEOverlayM128    camOffsetX, camOffsetY;
    RpSSEOverlayM128    zShift, zScale;
    RpSSEOverlayM128    nearClip, farClip;
    RpSSEOverlayM128    xLoClip, xHiClip, yLoClip, yHiClip, zLoClip,
        zHiClip;
    RpSSEOverlayM128    xClip, yClip, zClip;
    RpSSEOverlayM128    clipFlagsOr, clipFlagsAnd;
    const RpSSEOverlayM128 _mm_zero = { 0.0f, 0.0f, 0.0f, 0.0f };
    const RpSSEOverlayM128 _mm_one = { 1.0f, 1.0f, 1.0f, 1.0f };

    RWFUNCTION(RWSTRING("PipeTransformPerspectiveSSE"));
    RWASSERT(repEntry);

    /*
     * Set up the first group of four lanes.  All lanes start on the first
     * vertex; the switch below then advances them so that the first group
     * covers (nNumVert & 3) distinct vertices, or four when that is zero.
     */
    inVert0 = instancedVertex;
    devVert0 = deviceVertex;
    camVert0 = cameraVertex;

    inVert1 = inVert0;
    devVert1 = devVert0;
    camVert1 = camVert0;

    inVert2 = inVert0;
    devVert2 = devVert0;
    camVert2 = camVert0;

    inVert3 = inVert0;
    devVert3 = devVert0;
    camVert3 = camVert0;

    switch (nNumVert & 3)
    {
        case 0:
            camVert1 = RwCameraVertexGetNext(camVert0);
            devVert1 = RwIm2DVertexGetNext(devVert0);
            inVert1 = RWVERTEXINSTANCEGetNext(repEntry, inVert0);
            /* fall through */

        case 3:
            camVert2 = RwCameraVertexGetNext(camVert1);
            devVert2 = RwIm2DVertexGetNext(devVert1);
            inVert2 = RWVERTEXINSTANCEGetNext(repEntry, inVert1);
            /* fall through */

        case 2:
            camVert3 = RwCameraVertexGetNext(camVert2);
            devVert3 = RwIm2DVertexGetNext(devVert2);
            inVert3 = RWVERTEXINSTANCEGetNext(repEntry, inVert2);
            /* fall through */

        default:
            ;
    }

    /* OR accumulator starts all-zero, AND accumulator starts all-ones
     * (comparing a register with itself for equality sets every bit).
     */
    clipFlagsOr.m128 = _mm_zero.m128;
    clipFlagsAnd.m128 = _mm_cmpeq_ps(_mm_zero.m128, _mm_zero.m128);

    LOAD_MATRIX_SSE(matRight, matUp, matAt, matPos, mpMat);
    LOAD_CAMERA_SSE(camWidth, camHeight,
                    camOffsetX, camOffsetY, zScale, zShift,
                    _rwPipeState);
    LOAD_CLIP_SSE(nearClip, farClip, xLoClip, xHiClip, yLoClip, yHiClip,
                  zLoClip, zHiClip, _rwPipeState);

    /* Perspective Projection: one pass per group of four lanes */
    j = (nNumVert + 3) >> 2;
    while (--j >= 0)
    {

        /* Load the four X, Y + Z */

        /*
         * _mm_shuffle_ps(Rt_m128 a ,  Rt_m128 b , int i )
         *
         * Selects four specific SP FP values from a and b, based on the mask i
         * The mask must be an immediate
         *
         * See also icl _MM_TRANSPOSE4_PS macro
         *
         * The selectors below gather lanes 1, 2 and 3 of each overlay into
         * in[0], in[1] and in[2]; lane 0 is never read.
         * NOTE(review): this presumably means the v4d overlay stores w in
         * lane 0 followed by x, y, z - confirm against the definition of
         * RpSSEOverlayM128.
         */

        transpose[0].v4d.v3d = *(RWVERTEXINSTANCEGetPos(inVert0));
        transpose[1].v4d.v3d = *(RWVERTEXINSTANCEGetPos(inVert1));
        transpose[2].v4d.v3d = *(RWVERTEXINSTANCEGetPos(inVert2));
        transpose[3].v4d.v3d = *(RWVERTEXINSTANCEGetPos(inVert3));

        row[0].m128 =
            _mm_shuffle_ps((transpose[0].m128), (transpose[1].m128),
                           0x44);
        row[2].m128 =
            _mm_shuffle_ps((transpose[0].m128), (transpose[1].m128),
                           0xEE);
        row[1].m128 =
            _mm_shuffle_ps((transpose[2].m128), (transpose[3].m128),
                           0x44);
        row[3].m128 =
            _mm_shuffle_ps((transpose[2].m128), (transpose[3].m128),
                           0xEE);

        (in[0].m128) = _mm_shuffle_ps(row[0].m128, row[1].m128, 0xDD);
        (in[1].m128) = _mm_shuffle_ps(row[2].m128, row[3].m128, 0x88);
        (in[2].m128) = _mm_shuffle_ps(row[2].m128, row[3].m128, 0xDD);

        /* Calc the new pos X the four verts. */
        v1.m128 = _mm_mul_ps(in[0].m128, matRight[0].m128);
        v2.m128 = _mm_mul_ps(in[1].m128, matUp[0].m128);
        v3.m128 = _mm_mul_ps(in[2].m128, matAt[0].m128);
        out[0].m128 =
            _mm_add_ps(_mm_add_ps(v1.m128, v2.m128),
                       _mm_add_ps(v3.m128, matPos[0].m128));

        /* Calc the new pos Y the four verts. */
        v1.m128 = _mm_mul_ps(in[0].m128, matRight[1].m128);
        v2.m128 = _mm_mul_ps(in[1].m128, matUp[1].m128);
        v3.m128 = _mm_mul_ps(in[2].m128, matAt[1].m128);
        out[1].m128 =
            _mm_add_ps(_mm_add_ps(v1.m128, v2.m128),
                       _mm_add_ps(v3.m128, matPos[1].m128));

        /* Calc the new pos Z the four verts. */
        v1.m128 = _mm_mul_ps(in[0].m128, matRight[2].m128);
        v2.m128 = _mm_mul_ps(in[1].m128, matUp[2].m128);
        v3.m128 = _mm_mul_ps(in[2].m128, matAt[2].m128);
        out[2].m128 =
            _mm_add_ps(_mm_add_ps
                       (v1.m128, v2.m128),
                       _mm_add_ps(v3.m128, matPos[2].m128));

        /* Save the results: transpose back to one overlay per vertex,
         * with zero in lane 0 and X, Y, Z in lanes 1 to 3.
         */

        row[0].m128 =
            _mm_shuffle_ps((_mm_zero.m128), (out[0].m128), 0x44);
        row[2].m128 =
            _mm_shuffle_ps((_mm_zero.m128), (out[0].m128), 0xEE);
        row[1].m128 =
            _mm_shuffle_ps((out[1].m128), (out[2].m128), 0x44);
        row[3].m128 =
            _mm_shuffle_ps((out[1].m128), (out[2].m128), 0xEE);

        transpose[0].m128 =
            _mm_shuffle_ps(row[0].m128, row[1].m128, 0x88);
        transpose[1].m128 =
            _mm_shuffle_ps(row[0].m128, row[1].m128, 0xDD);
        transpose[2].m128 =
            _mm_shuffle_ps(row[2].m128, row[3].m128, 0x88);
        transpose[3].m128 =
            _mm_shuffle_ps(row[2].m128, row[3].m128, 0xDD);

        camVert0->cameraVertex = transpose[0].v4d.v3d;
        camVert1->cameraVertex = transpose[1].v4d.v3d;
        camVert2->cameraVertex = transpose[2].v4d.v3d;
        camVert3->cameraVertex = transpose[3].v4d.v3d;

        /*
         * Only do the projection and store a depth buffer value
         * for vertices inside the view volume
         * 3D clipped vertices will have to wait until later ...
         *
         * Check the X clip: flag xHiClip where x > z, xLoClip where x < 0.
         * The compare results are all-ones / all-zero lane masks, so the
         * flag bit patterns are selected with bitwise AND / ANDNOT / OR.
         */
        v1.m128 = _mm_cmpgt_ps(out[0].m128, out[2].m128);
        v1.m128 = _mm_and_ps(v1.m128, xHiClip.m128);
        v2.m128 = _mm_cmplt_ps(out[0].m128, _mm_zero.m128);
        v3.m128 = _mm_and_ps(v2.m128, xLoClip.m128);

        xClip.m128 = _mm_andnot_ps(v2.m128, v1.m128);
        xClip.m128 = _mm_or_ps(v3.m128, xClip.m128);

        /* Check the Y clip: yHiClip where y > z, yLoClip where y < 0. */
        v1.m128 = _mm_cmpgt_ps(out[1].m128, out[2].m128);
        v1.m128 = _mm_and_ps(v1.m128, yHiClip.m128);
        v2.m128 = _mm_cmplt_ps(out[1].m128, _mm_zero.m128);
        v3.m128 = _mm_and_ps(v2.m128, yLoClip.m128);
        yClip.m128 = _mm_andnot_ps(v2.m128, v1.m128);
        yClip.m128 = _mm_or_ps(v3.m128, yClip.m128);

        /* Check the Z clip (zHiClip where z > far, zLoClip where z < near)
         * and, interleaved with it, advance the four input vertex pointers
         * to the next group.
         */
        inVert0 = RWVERTEXINSTANCEGetNext(repEntry, inVert3);
        v1.m128 = _mm_cmpgt_ps(out[2].m128, farClip.m128);
        inVert1 = RWVERTEXINSTANCEGetNext(repEntry, inVert0);
        v1.m128 = _mm_and_ps(v1.m128, zHiClip.m128);
        inVert2 = RWVERTEXINSTANCEGetNext(repEntry, inVert1);
        v2.m128 = _mm_cmplt_ps(out[2].m128, nearClip.m128);
        inVert3 = RWVERTEXINSTANCEGetNext(repEntry, inVert2);
        v3.m128 = _mm_and_ps(v2.m128, zLoClip.m128);
        zClip.m128 = _mm_andnot_ps(v2.m128, v1.m128);
        zClip.m128 = _mm_or_ps(v3.m128, zClip.m128);

        /* Combine the clip flags; zClip now holds X | Y | Z flags. */
        zClip.m128 = _mm_or_ps(zClip.m128, xClip.m128);
        zClip.m128 = _mm_or_ps(zClip.m128, yClip.m128);

        /* Store the clip flags (lane k belongs to vertex k of the group;
         * the bit pattern is read back as an integer via RwSplitBits).
         */
        split.nReal = zClip._f[0];
        camVert0->clipFlags = (RwUInt8) split.nUInt;
        split.nReal = zClip._f[1];
        camVert1->clipFlags = (RwUInt8) split.nUInt;
        split.nReal = zClip._f[2];
        camVert2->clipFlags = (RwUInt8) split.nUInt;
        split.nReal = zClip._f[3];
        camVert3->clipFlags = (RwUInt8) split.nUInt;

        /* Prefetch inVert0 while accumulating the group's flags into the
         * running OR / AND.
         */
        RW_PREFETCH_SSE((char *) &RWVERTEXINSTANCEGetPos(inVert0)->x,
                        _MM_HINT_NTA);
        clipFlagsOr.m128 = _mm_or_ps(clipFlagsOr.m128, zClip.m128);
        RW_PREFETCH_SSE((char *) &RWVERTEXINSTANCEGetPos(inVert0)->y,
                        _MM_HINT_NTA);
        clipFlagsAnd.m128 = _mm_and_ps(clipFlagsAnd.m128, zClip.m128);
        RW_PREFETCH_SSE((char *) &RWVERTEXINSTANCEGetPos(inVert0)->z,
                        _MM_HINT_NTA);

        /* Set the recip: use z for lanes with no clip flag and 1.0 for
         * clipped lanes, then take the approximate reciprocal.
         */
        nRecipZ.m128 = _mm_cmpeq_ps(zClip.m128, _mm_zero.m128);
        nRecipZ.m128 =
            _mm_or_ps(_mm_and_ps(nRecipZ.m128, out[2].m128),
                      _mm_andnot_ps(nRecipZ.m128, _mm_one.m128));
        nRecipZ.m128 = _mm_rcp_ps(nRecipZ.m128);

        /* Calc the results and prefetch the next inputs. */
        RW_PREFETCH_SSE((char *) &RWVERTEXINSTANCEGetPos(inVert1)->x,
                        _MM_HINT_NTA);
        out[0].m128 = _mm_mul_ps(out[0].m128, nRecipZ.m128);
        RW_PREFETCH_SSE((char *) &RWVERTEXINSTANCEGetPos(inVert1)->y,
                        _MM_HINT_NTA);
        out[0].m128 = _mm_mul_ps(out[0].m128, camWidth.m128);
        RW_PREFETCH_SSE((char *) &RWVERTEXINSTANCEGetPos(inVert1)->z,
                        _MM_HINT_NTA);
        out[0].m128 = _mm_add_ps(out[0].m128, camOffsetX.m128);
        RW_PREFETCH_SSE((char *) &RWVERTEXINSTANCEGetPos(inVert2)->x,
                        _MM_HINT_NTA);
        out[1].m128 = _mm_mul_ps(out[1].m128, nRecipZ.m128);
        RW_PREFETCH_SSE((char *) &RWVERTEXINSTANCEGetPos(inVert2)->y,
                        _MM_HINT_NTA);
        out[1].m128 = _mm_mul_ps(out[1].m128, camHeight.m128);
        RW_PREFETCH_SSE((char *) &RWVERTEXINSTANCEGetPos(inVert2)->z,
                        _MM_HINT_NTA);
        out[1].m128 = _mm_add_ps(out[1].m128, camOffsetY.m128);
        RW_PREFETCH_SSE((char *) &RWVERTEXINSTANCEGetPos(inVert3)->x,
                        _MM_HINT_NTA);
        out[2].m128 = _mm_mul_ps(zScale.m128, nRecipZ.m128);
        RW_PREFETCH_SSE((char *) &RWVERTEXINSTANCEGetPos(inVert3)->y,
                        _MM_HINT_NTA);
        out[2].m128 = _mm_add_ps(out[2].m128, zShift.m128);
        RW_PREFETCH_SSE((char *) &RWVERTEXINSTANCEGetPos(inVert3)->z,
                        _MM_HINT_NTA);

        /* Write the device vertex of every lane that has no clip flag */
        if (0 == RwCameraVertexGetClipFlags(camVert0))
        {
            RwIm2DVertexSetCameraX(devVert0, camVert0->cameraVertex.x);
            RwIm2DVertexSetCameraY(devVert0, camVert0->cameraVertex.y);
            RwIm2DVertexSetCameraZ(devVert0, camVert0->cameraVertex.z);
            RwIm2DVertexSetRecipCameraZ(devVert0, nRecipZ._f[0]);

            RwIm2DVertexSetScreenX(devVert0, out[0]._f[0]);
            RwIm2DVertexSetScreenY(devVert0, out[1]._f[0]);
            RwIm2DVertexSetScreenZ(devVert0, out[2]._f[0]);
        }

        if (0 == RwCameraVertexGetClipFlags(camVert1))
        {
            RwIm2DVertexSetCameraX(devVert1, camVert1->cameraVertex.x);
            RwIm2DVertexSetCameraY(devVert1, camVert1->cameraVertex.y);
            RwIm2DVertexSetCameraZ(devVert1, camVert1->cameraVertex.z);
            RwIm2DVertexSetRecipCameraZ(devVert1, nRecipZ._f[1]);

            RwIm2DVertexSetScreenX(devVert1, out[0]._f[1]);
            RwIm2DVertexSetScreenY(devVert1, out[1]._f[1]);
            RwIm2DVertexSetScreenZ(devVert1, out[2]._f[1]);
        }

        if (0 == RwCameraVertexGetClipFlags(camVert2))
        {
            RwIm2DVertexSetCameraX(devVert2, camVert2->cameraVertex.x);
            RwIm2DVertexSetCameraY(devVert2, camVert2->cameraVertex.y);
            RwIm2DVertexSetCameraZ(devVert2, camVert2->cameraVertex.z);
            RwIm2DVertexSetRecipCameraZ(devVert2, nRecipZ._f[2]);

            RwIm2DVertexSetScreenX(devVert2, out[0]._f[2]);
            RwIm2DVertexSetScreenY(devVert2, out[1]._f[2]);
            RwIm2DVertexSetScreenZ(devVert2, out[2]._f[2]);
        }

        if (0 == RwCameraVertexGetClipFlags(camVert3))
        {
            RwIm2DVertexSetCameraX(devVert3, camVert3->cameraVertex.x);
            RwIm2DVertexSetCameraY(devVert3, camVert3->cameraVertex.y);
            RwIm2DVertexSetCameraZ(devVert3, camVert3->cameraVertex.z);
            RwIm2DVertexSetRecipCameraZ(devVert3, nRecipZ._f[3]);

            RwIm2DVertexSetScreenX(devVert3, out[0]._f[3]);
            RwIm2DVertexSetScreenY(devVert3, out[1]._f[3]);
            RwIm2DVertexSetScreenZ(devVert3, out[2]._f[3]);
        }

        /* Advance the camera and device pointers to the next four verts
         * (the input pointers were already advanced above).
         */
        camVert0 = RwCameraVertexGetNext(camVert3);
        devVert0 = RwIm2DVertexGetNext(devVert3);

        camVert1 = RwCameraVertexGetNext(camVert0);
        devVert1 = RwIm2DVertexGetNext(devVert0);

        camVert2 = RwCameraVertexGetNext(camVert1);
        devVert2 = RwIm2DVertexGetNext(devVert1);

        camVert3 = RwCameraVertexGetNext(camVert2);
        devVert3 = RwIm2DVertexGetNext(devVert2);
    }

    /* If clipFlagsOr = 0, everything is on the screen,
     * If clipFlagsAnd != 0, everything is off the screen.
     *
     * Reduce the four lanes of each accumulator to one integer and merge
     * it into the current context.
     */
    j = 0;
    split.nReal = clipFlagsOr._f[0];
    j |= split.nUInt;
    split.nReal = clipFlagsOr._f[1];
    j |= split.nUInt;
    split.nReal = clipFlagsOr._f[2];
    j |= split.nUInt;
    split.nReal = clipFlagsOr._f[3];
    j |= split.nUInt;
    _rwPipeState.currentContext->clipFlagsOr |= j;

    j = -1;
    split.nReal = clipFlagsAnd._f[0];
    j &= split.nUInt;
    split.nReal = clipFlagsAnd._f[1];
    j &= split.nUInt;
    split.nReal = clipFlagsAnd._f[2];
    j &= split.nUInt;
    split.nReal = clipFlagsAnd._f[3];
    j &= split.nUInt;
    _rwPipeState.currentContext->clipFlagsAnd &= j;

    RWRETURN(j);
}

/*
 * PipeTransformParallelSSE
 *
 * Transforms nNumVert instanced vertices by the matrix held in the poly
 * header of repEntry (xForm), four vertices per loop iteration, using
 * SSE intrinsics.  PipeTransformSSE calls this variant when the current
 * camera's projection type is not rwPERSPECTIVE.
 *
 * For every group of four vertices the function:
 *   - writes the transformed position to camVertN->cameraVertex;
 *   - builds clip flags by testing x and y against 0 and 1, and z
 *     against nearClip / farClip, and stores them in
 *     camVertN->clipFlags;
 *   - for each vertex whose clip flags are zero, fills the device
 *     vertex with the camera x/y/z, a reciprocal camera z of 1.0, and
 *     the screen x/y/z (transformed value scaled and offset by the
 *     values loaded with LOAD_CAMERA_SSE).
 *
 * Parameters:
 *   repEntry        - resource entry supplying the matrix; also passed
 *                     to RWVERTEXINSTANCEGetNext when stepping through
 *                     the instanced vertices.
 *   nNumVert        - number of vertices to process.
 *   instancedVertex - first source (instanced) vertex.
 *   deviceVertex    - first device (RwIm2DVertex) vertex to fill.
 *   cameraVertex    - first camera vertex to fill.
 *
 * Returns the bitwise AND of the clip-flag words accumulated over all
 * processed lanes.  The accumulated OR and AND are also folded into
 * _rwPipeState.currentContext->clipFlagsOr / clipFlagsAnd.
 *
 * NOTE(review): nRecipZ is declared below but is not referenced by name
 * anywhere in this function; it looks like a left-over from the
 * perspective variant.  Confirm the LOAD_*_SSE macros do not use it
 * before removing.
 */
static              RwInt32
PipeTransformParallelSSE(RwResEntry * repEntry,
                         RwInt32 nNumVert,
                         RWVERTEXINSTANCE * instancedVertex,
                         RwIm2DVertex * deviceVertex,
                         RwCameraVertex * cameraVertex)
{
    RwMatrix           *mpMat =
        (RwMatrix *) RWPOLYHEADER(repEntry)->xForm;
    RWVERTEXINSTANCE   *inVert0, *inVert1, *inVert2, *inVert3;
    RwIm2DVertex       *devVert0, *devVert1, *devVert2, *devVert3;
    RwCameraVertex     *camVert0, *camVert1, *camVert2, *camVert3;
    RwInt32             j;
    RwSplitBits         split;
    RpSSEOverlayM128    v1, v2, v3;
    RpSSEOverlayM128    in[3], out[3], nRecipZ;
    RpSSEOverlayM128    matRight[3], matUp[3], matAt[3], matPos[3];
    RpSSEOverlayM128    transpose[4], row[4];
    RpSSEOverlayM128    camWidth, camHeight;
    RpSSEOverlayM128    camOffsetX, camOffsetY;
    RpSSEOverlayM128    zShift, zScale;
    RpSSEOverlayM128    nearClip, farClip;
    RpSSEOverlayM128    xLoClip, xHiClip, yLoClip, yHiClip, zLoClip,
        zHiClip;
    RpSSEOverlayM128    xClip, yClip, zClip;
    RpSSEOverlayM128    clipFlagsOr, clipFlagsAnd;
    const RpSSEOverlayM128 _mm_zero = {
        0.0f, 0.0f, 0.0f, 0.0f
    };
    const RpSSEOverlayM128 _mm_one = {
        1.0f, 1.0f, 1.0f, 1.0f
    };

    RWFUNCTION(RWSTRING("PipeTransformParallelSSE"));
    RWASSERT(repEntry);
    /*
     * Set up the first group of four vertex pointers.
     *
     * All four slots start out aliased to the first vertex.  The switch
     * below then spreads them out so that the first group covers exactly
     * (nNumVert & 3) distinct vertices, or four when the count is a
     * multiple of four; the leading slots simply repeat vertex 0.  Every
     * later group is a full four distinct vertices, so the loop's
     * (nNumVert + 3) >> 2 iterations cover nNumVert vertices without
     * stepping past the end.  (This assumes each *GetNext macro yields
     * the element following its argument.)
     */
    inVert0 = instancedVertex;
    devVert0 = deviceVertex;
    camVert0 = cameraVertex;

    inVert1 = inVert0;
    devVert1 = devVert0;
    camVert1 = camVert0;

    inVert2 = inVert0;
    devVert2 = devVert0;
    camVert2 = camVert0;

    inVert3 = inVert0;
    devVert3 = devVert0;
    camVert3 = camVert0;

    switch (nNumVert & 3)
    {
        case 0:
            /* Multiple of four: slots 0..3 are four distinct vertices. */
            camVert1 = RwCameraVertexGetNext(camVert0);
            devVert1 = RwIm2DVertexGetNext(devVert0);
            inVert1 = RWVERTEXINSTANCEGetNext(repEntry, inVert0);
            /* fall through (intentional) */
        case 3:
            camVert2 = RwCameraVertexGetNext(camVert1);
            devVert2 = RwIm2DVertexGetNext(devVert1);
            inVert2 = RWVERTEXINSTANCEGetNext(repEntry, inVert1);
            /* fall through (intentional) */
        case 2:
            camVert3 = RwCameraVertexGetNext(camVert2);
            devVert3 = RwIm2DVertexGetNext(devVert2);
            inVert3 = RWVERTEXINSTANCEGetNext(repEntry, inVert2);
            /* fall through (intentional) */
        default:
            /* Remainder of one: all four slots stay on the first vertex. */
            ;
    }

    /*
     * Accumulators: OR starts at all bits clear; AND starts at all bits
     * set (comparing zero with itself for equality sets every bit).
     */
    clipFlagsOr.m128 = _mm_zero.m128;
    clipFlagsAnd.m128 = _mm_cmpeq_ps(_mm_zero.m128, _mm_zero.m128);
    /*
     * Load matrix, camera and clip constants into SSE registers.
     * Presumably each value is broadcast to all four lanes - confirm
     * against the LOAD_*_SSE macro definitions.
     */
    LOAD_MATRIX_SSE(matRight, matUp, matAt, matPos, mpMat);
    LOAD_CAMERA_SSE(camWidth, camHeight,
                    camOffsetX, camOffsetY, zScale, zShift,
                    _rwPipeState);
    LOAD_CLIP_SSE(nearClip, farClip, xLoClip, xHiClip, yLoClip, yHiClip,
                  zLoClip, zHiClip, _rwPipeState);

    /* Parallel Projection: one iteration per group of four vertices. */
    j = (nNumVert + 3) >> 2;
    while (--j >= 0)
    {
        /* Load the four X, Y + Z */

        /*
         * _mm_shuffle_ps(Rt_m128 a ,  Rt_m128 b , int i )
         *
         * Selects four specific SP FP values from a and b, based on the mask i
         * The mask must be an immediate
         *
         * See also icl _MM_TRANSPOSE4_PS macro
         */

        /*
         * Copy each source position into the v3d part of an overlay
         * register.  The remaining lane of transpose[n] is not written
         * here; it is shuffled along but never selected into in[].
         */
        transpose[0].v4d.v3d = *(RWVERTEXINSTANCEGetPos(inVert0));
        transpose[1].v4d.v3d = *(RWVERTEXINSTANCEGetPos(inVert1));
        transpose[2].v4d.v3d = *(RWVERTEXINSTANCEGetPos(inVert2));
        transpose[3].v4d.v3d = *(RWVERTEXINSTANCEGetPos(inVert3));

        /*
         * First transpose stage: row[0]/row[1] collect lanes 0-1 and
         * row[2]/row[3] collect lanes 2-3 of the vertex pairs (0,1) and
         * (2,3) respectively.
         */
        row[0].m128 =
            _mm_shuffle_ps((transpose[0].m128), (transpose[1].m128),
                           0x44);
        row[2].m128 =
            _mm_shuffle_ps((transpose[0].m128), (transpose[1].m128),
                           0xEE);
        row[1].m128 =
            _mm_shuffle_ps((transpose[2].m128), (transpose[3].m128),
                           0x44);
        row[3].m128 =
            _mm_shuffle_ps((transpose[2].m128), (transpose[3].m128),
                           0xEE);

        /*
         * Second stage: in[0], in[1] and in[2] receive lane 1, lane 2
         * and lane 3 of the four source registers, i.e. one coordinate
         * of all four vertices each.  This matches the store shuffles
         * further down, which place the results back in lanes 1..3.
         * NOTE(review): implies v4d.v3d occupies lanes 1..3 of the
         * overlay - confirm against the RpSSEOverlayM128 definition.
         */
        (in[0].m128) = _mm_shuffle_ps(row[0].m128, row[1].m128, 0xDD);
        (in[1].m128) = _mm_shuffle_ps(row[2].m128, row[3].m128, 0x88);
        (in[2].m128) = _mm_shuffle_ps(row[2].m128, row[3].m128, 0xDD);

        /*
         * Transform: out[k] = in[0] * matRight[k] + in[1] * matUp[k]
         *                   + in[2] * matAt[k]    + matPos[k]
         * evaluated for four vertices at once.
         */

        /* Calc the X. */
        v1.m128 = _mm_mul_ps(in[0].m128, matRight[0].m128);
        v2.m128 = _mm_mul_ps(in[1].m128, matUp[0].m128);
        v3.m128 = _mm_mul_ps(in[2].m128, matAt[0].m128);
        out[0].m128 =
            _mm_add_ps(_mm_add_ps
                       (v1.m128,
                        v2.m128), _mm_add_ps(v3.m128, matPos[0].m128));

        /* Calc the Y. */
        v1.m128 = _mm_mul_ps(in[0].m128, matRight[1].m128);
        v2.m128 = _mm_mul_ps(in[1].m128, matUp[1].m128);
        v3.m128 = _mm_mul_ps(in[2].m128, matAt[1].m128);
        out[1].m128 =
            _mm_add_ps(_mm_add_ps(v1.m128, v2.m128),
                       _mm_add_ps(v3.m128, matPos[1].m128));

        /* Calc the Z. */
        v1.m128 = _mm_mul_ps(in[0].m128, matRight[2].m128);
        v2.m128 = _mm_mul_ps(in[1].m128, matUp[2].m128);
        v3.m128 = _mm_mul_ps(in[2].m128, matAt[2].m128);
        out[2].m128 =
            _mm_add_ps(_mm_add_ps(v1.m128, v2.m128),
                       _mm_add_ps(v3.m128, matPos[2].m128));

        /*
         * Save the results: transpose back to one register per vertex.
         * Using _mm_zero as the first operand puts 0.0 in lane 0 and the
         * transformed X, Y, Z in lanes 1, 2, 3 of transpose[n].
         */

        row[0].m128 =
            _mm_shuffle_ps((_mm_zero.m128), (out[0].m128), 0x44);
        row[2].m128 =
            _mm_shuffle_ps((_mm_zero.m128), (out[0].m128), 0xEE);
        row[1].m128 =
            _mm_shuffle_ps((out[1].m128), (out[2].m128), 0x44);
        row[3].m128 =
            _mm_shuffle_ps((out[1].m128), (out[2].m128), 0xEE);

        transpose[0].m128 =
            _mm_shuffle_ps(row[0].m128, row[1].m128, 0x88);
        transpose[1].m128 =
            _mm_shuffle_ps(row[0].m128, row[1].m128, 0xDD);
        transpose[2].m128 =
            _mm_shuffle_ps(row[2].m128, row[3].m128, 0x88);
        transpose[3].m128 =
            _mm_shuffle_ps(row[2].m128, row[3].m128, 0xDD);

        camVert0->cameraVertex = transpose[0].v4d.v3d;
        camVert1->cameraVertex = transpose[1].v4d.v3d;
        camVert2->cameraVertex = transpose[2].v4d.v3d;
        camVert3->cameraVertex = transpose[3].v4d.v3d;

        /* No fields overloaded yet */

        /*
         * Only do the projection and store a depth buffer value
         * for vertices inside the view volume
         * 3 D clipped vertices will have to wait until later...
         */

        /*
         * Each clip test builds a per-lane bit pattern:
         *   v1 = (value > upper bound) mask ANDed with the "Hi" pattern
         *   v2 = (value < lower bound) mask
         *   v3 = v2 ANDed with the "Lo" pattern
         * and the result is v3 | (~v2 & v1).  The Hi/Lo patterns come
         * from LOAD_CLIP_SSE - presumably the clip-flag bits for each
         * plane; confirm against that macro.
         */

        /* Check the X clip (bounds 0 and 1). */
        v1.m128 = _mm_cmpgt_ps(out[0].m128, _mm_one.m128);
        v1.m128 = _mm_and_ps(v1.m128, xHiClip.m128);
        v2.m128 = _mm_cmplt_ps(out[0].m128, _mm_zero.m128);
        v3.m128 = _mm_and_ps(v2.m128, xLoClip.m128);

        xClip.m128 = _mm_andnot_ps(v2.m128, v1.m128);
        xClip.m128 = _mm_or_ps(v3.m128, xClip.m128);

        /* Check the Y clip (bounds 0 and 1). */
        v1.m128 = _mm_cmpgt_ps(out[1].m128, _mm_one.m128);
        v1.m128 = _mm_and_ps(v1.m128, yHiClip.m128);
        v2.m128 = _mm_cmplt_ps(out[1].m128, _mm_zero.m128);
        v3.m128 = _mm_and_ps(v2.m128, yLoClip.m128);

        yClip.m128 = _mm_andnot_ps(v2.m128, v1.m128);
        yClip.m128 = _mm_or_ps(v3.m128, yClip.m128);
        /* Check the Z clip (bounds nearClip and farClip). */
        v1.m128 = _mm_cmpgt_ps(out[2].m128, farClip.m128);
        v1.m128 = _mm_and_ps(v1.m128, zHiClip.m128);
        v2.m128 = _mm_cmplt_ps(out[2].m128, nearClip.m128);
        v3.m128 = _mm_and_ps(v2.m128, zLoClip.m128);

        zClip.m128 = _mm_andnot_ps(v2.m128, v1.m128);
        zClip.m128 = _mm_or_ps(v3.m128, zClip.m128);

        /* Combine the clip flags: zClip now holds X | Y | Z per lane. */
        zClip.m128 = _mm_or_ps(zClip.m128, xClip.m128);
        zClip.m128 = _mm_or_ps(zClip.m128, yClip.m128);

        /*
         * Store the clip flags.  Each lane's bit pattern is read back as
         * an unsigned integer through the split union and truncated to
         * RwUInt8 for the camera vertex.
         */
        split.nReal = zClip._f[0];
        camVert0->clipFlags = (RwUInt8) split.nUInt;
        split.nReal = zClip._f[1];
        camVert1->clipFlags = (RwUInt8) split.nUInt;
        split.nReal = zClip._f[2];
        camVert2->clipFlags = (RwUInt8) split.nUInt;
        split.nReal = zClip._f[3];
        camVert3->clipFlags = (RwUInt8) split.nUInt;

        /* Fold this group's flags into the running OR / AND. */
        clipFlagsOr.m128 = _mm_or_ps(clipFlagsOr.m128, zClip.m128);
        clipFlagsAnd.m128 = _mm_and_ps(clipFlagsAnd.m128, zClip.m128);

        /*
         * Screen mapping, computed for all four lanes regardless of the
         * clip result: x * camWidth + camOffsetX, y * camHeight +
         * camOffsetY, z * zScale + zShift.
         */
        out[0].m128 = _mm_mul_ps(out[0].m128, camWidth.m128);
        out[0].m128 = _mm_add_ps(out[0].m128, camOffsetX.m128);

        out[1].m128 = _mm_mul_ps(out[1].m128, camHeight.m128);
        out[1].m128 = _mm_add_ps(out[1].m128, camOffsetY.m128);

        out[2].m128 = _mm_mul_ps(out[2].m128, zScale.m128);
        out[2].m128 = _mm_add_ps(out[2].m128, zShift.m128);

        /*
         * Fill the device vertex only for vertices with no clip flags
         * set.  The reciprocal camera Z is the constant 1.0 in this
         * (parallel) variant.
         */
        if (0 == RwCameraVertexGetClipFlags(camVert0))
        {
            RwIm2DVertexSetCameraX(devVert0, camVert0->cameraVertex.x);
            RwIm2DVertexSetCameraY(devVert0, camVert0->cameraVertex.y);
            RwIm2DVertexSetCameraZ(devVert0, camVert0->cameraVertex.z);
            RwIm2DVertexSetRecipCameraZ(devVert0, (RwReal) 1.0);

            RwIm2DVertexSetScreenX(devVert0, out[0]._f[0]);
            RwIm2DVertexSetScreenY(devVert0, out[1]._f[0]);
            RwIm2DVertexSetScreenZ(devVert0, out[2]._f[0]);
        }

        if (0 == RwCameraVertexGetClipFlags(camVert1))
        {
            RwIm2DVertexSetCameraX(devVert1, camVert1->cameraVertex.x);
            RwIm2DVertexSetCameraY(devVert1, camVert1->cameraVertex.y);
            RwIm2DVertexSetCameraZ(devVert1, camVert1->cameraVertex.z);
            RwIm2DVertexSetRecipCameraZ(devVert1, (RwReal) 1.0);

            RwIm2DVertexSetScreenX(devVert1, out[0]._f[1]);
            RwIm2DVertexSetScreenY(devVert1, out[1]._f[1]);
            RwIm2DVertexSetScreenZ(devVert1, out[2]._f[1]);
        }

        if (0 == RwCameraVertexGetClipFlags(camVert2))
        {
            RwIm2DVertexSetCameraX(devVert2, camVert2->cameraVertex.x);
            RwIm2DVertexSetCameraY(devVert2, camVert2->cameraVertex.y);
            RwIm2DVertexSetCameraZ(devVert2, camVert2->cameraVertex.z);
            RwIm2DVertexSetRecipCameraZ(devVert2, (RwReal) 1.0);

            RwIm2DVertexSetScreenX(devVert2, out[0]._f[2]);
            RwIm2DVertexSetScreenY(devVert2, out[1]._f[2]);
            RwIm2DVertexSetScreenZ(devVert2, out[2]._f[2]);
        }

        if (0 == RwCameraVertexGetClipFlags(camVert3))
        {
            RwIm2DVertexSetCameraX(devVert3, camVert3->cameraVertex.x);
            RwIm2DVertexSetCameraY(devVert3, camVert3->cameraVertex.y);
            RwIm2DVertexSetCameraZ(devVert3, camVert3->cameraVertex.z);
            RwIm2DVertexSetRecipCameraZ(devVert3, (RwReal) 1.0);

            RwIm2DVertexSetScreenX(devVert3, out[0]._f[3]);
            RwIm2DVertexSetScreenY(devVert3, out[1]._f[3]);
            RwIm2DVertexSetScreenZ(devVert3, out[2]._f[3]);
        }

        /* Advance slot 0 to the vertex after the last one processed. */
        camVert0 = RwCameraVertexGetNext(camVert3);
        devVert0 = RwIm2DVertexGetNext(devVert3);
        inVert0 = RWVERTEXINSTANCEGetNext(repEntry, inVert3);

        /* Slots 1..3 follow consecutively from slot 0. */
        camVert1 = RwCameraVertexGetNext(camVert0);
        devVert1 = RwIm2DVertexGetNext(devVert0);
        inVert1 = RWVERTEXINSTANCEGetNext(repEntry, inVert0);

        camVert2 = RwCameraVertexGetNext(camVert1);
        devVert2 = RwIm2DVertexGetNext(devVert1);
        inVert2 = RWVERTEXINSTANCEGetNext(repEntry, inVert1);

        camVert3 = RwCameraVertexGetNext(camVert2);
        devVert3 = RwIm2DVertexGetNext(devVert2);
        inVert3 = RWVERTEXINSTANCEGetNext(repEntry, inVert2);
    }

    /* If clipFlagsOr = 0, everything is on the screen,
     * If clipFlagsAnd != 0, everything is off the screen.
     */

    /* Collapse the four lanes of the OR accumulator into one word. */
    j = 0;
    split.nReal = clipFlagsOr._f[0];
    j |= split.nUInt;
    split.nReal = clipFlagsOr._f[1];
    j |= split.nUInt;
    split.nReal = clipFlagsOr._f[2];
    j |= split.nUInt;
    split.nReal = clipFlagsOr._f[3];
    j |= split.nUInt;
    _rwPipeState.currentContext->clipFlagsOr |= j;

    /* Collapse the four lanes of the AND accumulator the same way. */
    j = -1;
    split.nReal = clipFlagsAnd._f[0];
    j &= split.nUInt;
    split.nReal = clipFlagsAnd._f[1];
    j &= split.nUInt;
    split.nReal = clipFlagsAnd._f[2];
    j &= split.nUInt;
    split.nReal = clipFlagsAnd._f[3];
    j &= split.nUInt;
    _rwPipeState.currentContext->clipFlagsAnd &= j;

    /* The combined AND for this call is the return value. */
    RWRETURN(j);
}

/*
 * PipeTransformSSE
 *
 * SSE transform entry point placed in the Intel overload table.
 *
 * Resets the clip-flag accumulators of the current pipe context, then
 * transforms every instanced vertex of repEntry with either the
 * perspective or the parallel SSE routine, chosen by the projection
 * type of the current camera.  Afterwards, unless the AND of all clip
 * flags is non-zero, the context's
 * fpSetNonOverloadedFieldsInCamAndDevVert callback is invoked on
 * repEntry.
 *
 * repEntry - resource entry holding the instanced vertices.
 */
static void
PipeTransformSSE(RwResEntry * repEntry)
{
    RwInt32             batchClipAnd;
    RwPipeContext      *context = _rwPipeState.currentContext;
    RWVERTEXINSTANCE   *firstInstVert;
    RwIm2DVertex       *firstDevVert;
    RwCameraVertex     *firstCamVert;
    RwInt32             vertCount =
        RWVERTEXINSTANCEGetQuantity(repEntry);
    RwCamera           *camera;

    RWFUNCTION(RWSTRING("PipeTransformSSE"));
    RWASSERT(repEntry);

    /* Start from "nothing clipped" (OR) and "everything clipped" (AND). */
    context->clipFlagsOr = 0;
    context->clipFlagsAnd = -1;

    /* Heads of the three vertex streams. */
    firstInstVert = RWVERTEXINSTANCEGet(repEntry, 0);
    firstDevVert = RwIm2DVertexGet(0);
    firstCamVert = RwCameraVertexGet(0);

    /* Pick the routine that matches the camera's projection type. */
    camera = (RwCamera *) RWSRCGLOBAL(curCamera);
    batchClipAnd = (rwPERSPECTIVE == camera->projectionType)
        ? PipeTransformPerspectiveSSE(repEntry, vertCount,
                                      firstInstVert, firstDevVert,
                                      firstCamVert)
        : PipeTransformParallelSSE(repEntry, vertCount,
                                   firstInstVert, firstDevVert,
                                   firstCamVert);

    /*
     * If we're not overloading stuff, copy it across now from the
     * instanced vertices to the camera and potentially the device
     * vertices.  This is skipped when the AND of all clip flags is
     * non-zero.
     */
    if (0 == batchClipAnd)
    {
        context->fpSetNonOverloadedFieldsInCamAndDevVert(repEntry);
    }

    RWRETURNVOID();
}

#else /* (!defined(RXPIPELINE)) */

/*
 * No SSE pipe transform is compiled in this configuration (per the
 * comment on the enclosing #else, this is the branch where RXPIPELINE
 * is defined), so the PipeTransformSSE entry of the overload table
 * built in _rtIntelOverloadGetHandle becomes NULL.
 */
#define PipeTransformSSE NULL

#endif /* (!defined(RXPIPELINE)) */

/*
 * _rtIntelOverloadGetHandle
 *
 * Returns a pointer to the static table of SSE overloads
 * (_rwMatrixMultiplySSE, VectorMultPointSSE, VectorMultVectorSSE and
 * PipeTransformSSE) when RtIntelHaveSSE() reports SSE support.
 * In that case the _MM_FLUSH_ZERO_ON bit is also OR-ed into the SSE
 * control/status register on every call.
 *
 * When SSE is not available, E_RW_NOTSSEENABLEDCPU is raised through
 * RWERROR and NULL is returned.
 */
RtIntelOverload    *
_rtIntelOverloadGetHandle(void)
{
    static RtIntelOverload sseOverloadTable = {
        _rwMatrixMultiplySSE,
        VectorMultPointSSE,
        VectorMultVectorSSE,
        PipeTransformSSE
    };
    RtIntelOverload    *handle;
    RwBool              haveSSE;

    RWFUNCTION(RWSTRING("_rtIntelOverloadGetHandle"));

    handle = NULL;
    haveSSE = RtIntelHaveSSE();

    if (!haveSSE)
    {
        RWERROR((E_RW_NOTSSEENABLEDCPU));
    }
    else
    {
        /* Same effect as _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON). */
        _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON);
        OUTPUTDEBUGSTRING(__FILE__ ":" RW_STRINGIFY_EXPANDED(__LINE__)
                          ":"
                          "_mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON);\n");
        handle = &sseOverloadTable;
    }

    RWMESSAGE(("SSEEnabledCPU %08x result %p", haveSSE, handle));
    RWRETURN(handle);
}

#endif /* (400<=__ICL) */
#endif /* (defined(__ICL)) */
