/*
 * Refinement plugin
 */

/****************************************************************************
 *                                                                          *
 *  Module  :   bbtpsse.c                                                   *
 *                                                                          *
 *  Purpose :   Tools for Bernstein Bezier Triangular Patch                 *
 *                                                                          *
 ****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#include "rpplugin.h"
#include "time.h"              /* pick up struct tm; */
#include <rpdbgerr.h>
#include <rwcore.h>
#include <rpworld.h>
#include "rprefine.h"

#if (!defined(DOXYGEN_SHOULD_SKIP_THIS))
static const char   rcsid[] __RWUNUSED__ =
    "@@(#)$Id: bbtpsse.c,v 1.13 2001/01/26 11:05:17 johns Exp $";
#endif /* (!defined(DOXYGEN_SHOULD_SKIP_THIS)) */

/* Check for SSE. */

#if (defined(__ICL) && defined(RWSIMD))
#if (400 <= __ICL)

#include <rtintel.h>

#include "bbtpsse.h"

/****************************************************************************
 Local types
 */

/****************************************************************************
 Public variables
 */

/****************************************************************************
 External variables
 */

/****************************************************************************
 Local variables
 */

/****************************************************************************
 Local functions for the Bernstein-Bezier Triangular Patch toolkit
 */

/*****************************************************************************
 PATCH code
 */

/*****************************************************************************
 **
 ** SSE - Only enabled for Intel SSE compilers.
 **
 *****************************************************************************/

/****************************************************************************
 Local functions for the Bernstein-Bezier Triangular Patch toolkit
 *
 * SSE Implementation. The maths remains the same except four values
 * are computed rather than just one.
 */

/*
 * _rtbbtpSSEGenerateOrdinates
 *
 * Fills in the ordinates of a Bernstein-Bezier triangular patch for the
 * triangles packed into the SSE registers.
 *
 * ord             - receives b210, b120, b201, b102, b021, b012 and b111
 * objPos0/1/2     - three-component positions of the triangle vertices
 * objNrm0/1/2     - three-component normals of the triangle vertices
 *
 * Each edge ordinate is produced by _rtbbtpSSEComputeOrdinates; the
 * centre ordinate b111 is a quarter of the sum of the six edge ordinates.
 */
void
_rtbbtpSSEGenerateOrdinates(BBTPSSEOrdinates * ord,
                            RpSSEOverlayM128 * objPos0,
                            RpSSEOverlayM128 * objNrm0,
                            RpSSEOverlayM128 * objPos1,
                            RpSSEOverlayM128 * objNrm1,
                            RpSSEOverlayM128 * objPos2,
                            RpSSEOverlayM128 * objNrm2)
{
    const RpSSEOverlayM128 quarter = { (RwReal) 0.25,
        (RwReal) 0.25,
        (RwReal) 0.25,
        (RwReal) 0.25
    };
    RpSSEOverlayM128    edgeNrm[3];
    RpSSEOverlayM128    nrmLen;
    RpSSEOverlayM128    rcpLen;
    RpSSEOverlayM128    centre;

    RWFUNCTION(RWSTRING("_rtbbtpSSEGenerateOrdinates"));

    /*
     * Edge 0 - 1:
     * edgeNrm = normalise(objNrm0 + objNrm1), using the SSE reciprocal
     * of the length rather than a divide.
     */
    SSEV3dAdd(edgeNrm, objNrm0, objNrm1);
    SSEV3dLength(nrmLen, edgeNrm);
    rcpLen.m128 = _mm_rcp_ps(nrmLen.m128);
    SSEV3dMultConstant(edgeNrm, edgeNrm, rcpLen);

    _rtbbtpSSEComputeOrdinates(&ord->b210,
                               objPos0, objPos1, objNrm0, edgeNrm);
    _rtbbtpSSEComputeOrdinates(&ord->b120,
                               objPos1, objPos0, objNrm1, edgeNrm);

    /* Edge 0 - 2: edgeNrm = normalise(objNrm0 + objNrm2) */
    SSEV3dAdd(edgeNrm, objNrm0, objNrm2);
    SSEV3dLength(nrmLen, edgeNrm);
    rcpLen.m128 = _mm_rcp_ps(nrmLen.m128);
    SSEV3dMultConstant(edgeNrm, edgeNrm, rcpLen);

    _rtbbtpSSEComputeOrdinates(&ord->b201,
                               objPos0, objPos2, objNrm0, edgeNrm);
    _rtbbtpSSEComputeOrdinates(&ord->b102,
                               objPos2, objPos0, objNrm2, edgeNrm);

    /* Edge 1 - 2: edgeNrm = normalise(objNrm1 + objNrm2) */
    SSEV3dAdd(edgeNrm, objNrm1, objNrm2);
    SSEV3dLength(nrmLen, edgeNrm);
    rcpLen.m128 = _mm_rcp_ps(nrmLen.m128);
    SSEV3dMultConstant(edgeNrm, edgeNrm, rcpLen);

    _rtbbtpSSEComputeOrdinates(&ord->b021,
                               objPos1, objPos2, objNrm1, edgeNrm);
    _rtbbtpSSEComputeOrdinates(&ord->b012,
                               objPos2, objPos1, objNrm2, edgeNrm);

    /*
     * b111 = (b201 + b102 + b021 + b012 + b210 + b120) / 4
     * The summation order is kept fixed so the rounding is unchanged.
     */
    centre.m128 = _mm_add_ps(ord->b201.m128, ord->b102.m128);
    centre.m128 = _mm_add_ps(centre.m128, ord->b021.m128);
    centre.m128 = _mm_add_ps(centre.m128, ord->b012.m128);
    centre.m128 = _mm_add_ps(centre.m128, ord->b210.m128);
    centre.m128 = _mm_add_ps(centre.m128, ord->b120.m128);

    ord->b111.m128 = _mm_mul_ps(centre.m128, quarter.m128);

    RWRETURNVOID();
}

/*
 * _rtbbtpSSEGenerateControlPoints
 *
 * Builds the ten control points of a cubic Bernstein-Bezier triangular
 * patch for the triangles packed into the SSE registers.
 *
 * cps             - receives the control points; every entry is a
 *                   three-component vector
 * objPos0/1/2     - three-component positions of the triangle vertices
 * objNrm0/1/2     - three-component normals of the triangle vertices
 *
 * The corner points are copies of the vertex positions, the six edge
 * points come from _rtbbtpSSEComputeControlPoints, and the centre point
 * is
 *
 *     b111 = (sum of the six edge points) / 4
 *          - (sum of the three corner points) / 6
 */
void
_rtbbtpSSEGenerateControlPoints(BBTPSSEControlPoints * cps,
                                RpSSEOverlayM128 * objPos0,
                                RpSSEOverlayM128 * objNrm0,
                                RpSSEOverlayM128 * objPos1,
                                RpSSEOverlayM128 * objNrm1,
                                RpSSEOverlayM128 * objPos2,
                                RpSSEOverlayM128 * objNrm2)
{
    RpSSEOverlayM128    tmp[3];

    const RpSSEOverlayM128 minus_one_sixth = { (RwReal) - 0.166666,
        (RwReal) - 0.166666,
        (RwReal) - 0.166666,
        (RwReal) - 0.166666
    };
    const RpSSEOverlayM128 one_quater = { (RwReal) 0.25,
        (RwReal) 0.25,
        (RwReal) 0.25,
        (RwReal) 0.25
    };

    RWFUNCTION(RWSTRING("_rtbbtpSSEGenerateControlPoints"));

    /*
     * Edge control points. Each call receives the position and normal of
     * the vertex the point is adjacent to, followed by the position and
     * normal of the vertex at the far end of the edge.
     */

    /* vertex 0 */
    _rtbbtpSSEComputeControlPoints(cps->b201,
                                   objPos0, objNrm0, objPos2, objNrm2);
    _rtbbtpSSEComputeControlPoints(cps->b210,
                                   objPos0, objNrm0, objPos1, objNrm1);

    /* vertex 1 */
    _rtbbtpSSEComputeControlPoints(cps->b021,
                                   objPos1, objNrm1, objPos2, objNrm2);
    _rtbbtpSSEComputeControlPoints(cps->b120,
                                   objPos1, objNrm1, objPos0, objNrm0);

    /* vertex 2 */
    _rtbbtpSSEComputeControlPoints(cps->b102,
                                   objPos2, objNrm2, objPos0, objNrm0);
    /* The far end of this edge is vertex 1, so its normal is objNrm1. */
    _rtbbtpSSEComputeControlPoints(cps->b012,
                                   objPos2, objNrm2, objPos1, objNrm1);

    /* Corner control points are the vertex positions themselves. */
    cps->b300[0].m128 = objPos0[0].m128;
    cps->b300[1].m128 = objPos0[1].m128;
    cps->b300[2].m128 = objPos0[2].m128;

    cps->b030[0].m128 = objPos1[0].m128;
    cps->b030[1].m128 = objPos1[1].m128;
    cps->b030[2].m128 = objPos1[2].m128;

    cps->b003[0].m128 = objPos2[0].m128;
    cps->b003[1].m128 = objPos2[1].m128;
    cps->b003[2].m128 = objPos2[2].m128;

    /* b111 = (sum of the six edge control points) / 4 ... */
    SSEV3dAdd(cps->b111, cps->b210, cps->b201);
    SSEV3dAdd(cps->b111, cps->b111, cps->b120);
    SSEV3dAdd(cps->b111, cps->b111, cps->b102);
    SSEV3dAdd(cps->b111, cps->b111, cps->b012);
    SSEV3dAdd(cps->b111, cps->b111, cps->b021);

    SSEV3dMultConstant(cps->b111, cps->b111, one_quater);

    /* ... minus (sum of the three corner control points) / 6 */
    SSEV3dAdd(tmp, cps->b300, cps->b030);
    SSEV3dAdd(tmp, tmp, cps->b003);
    SSEV3dMultConstant(tmp, tmp, minus_one_sixth);

    SSEV3dAdd(cps->b111, cps->b111, tmp);

    RWRETURNVOID();
}

/*****************************************************************************
 PATCH code
 */

/*
 * _rtbbtpSSEPatchEvaluate
 *
 * Evaluates the patch ordinates at the barycentric coordinates (a, b, c)
 * for the values packed into the SSE registers.
 *
 * res     - receives the evaluated value
 * ord     - patch ordinates (b210, b120, b201, b102, b021, b012, b111)
 * a, b, c - barycentric coordinates
 *
 * res = 3 * (b021.b^2.c + b012.b.c^2 + b120.a.b^2 + b102.a.c^2
 *            + b210.a^2.b + b201.a^2.c + 2.b111.a.b.c)
 *
 * No cubic (a^3, b^3, c^3) terms are accumulated here, so the cubes are
 * not computed.
 */
void
_rtbbtpSSEPatchEvaluate(RpSSEOverlayM128 * res,
                        BBTPSSEOrdinates * ord,
                        RpSSEOverlayM128 * a,
                        RpSSEOverlayM128 * b, RpSSEOverlayM128 * c)
{
    RpSSEOverlayM128    a2, b2, c2, a2b, a2c, ab2, ac2, b2c, bc2, abc;
    const RpSSEOverlayM128 three = { (RwReal) 3.0,
        (RwReal) 3.0,
        (RwReal) 3.0,
        (RwReal) 3.0
    };

    RWFUNCTION(RWSTRING("_rtbbtpSSEPatchEvaluate"));

    /* Squares of a, b & c */
    a2.m128 = _mm_mul_ps(a->m128, a->m128);
    b2.m128 = _mm_mul_ps(b->m128, b->m128);
    c2.m128 = _mm_mul_ps(c->m128, c->m128);

    /* Pre-compute the mixed products */
    a2b.m128 = _mm_mul_ps(a2.m128, b->m128);
    a2c.m128 = _mm_mul_ps(a2.m128, c->m128);

    ab2.m128 = _mm_mul_ps(a->m128, b2.m128);
    ac2.m128 = _mm_mul_ps(a->m128, c2.m128);

    b2c.m128 = _mm_mul_ps(b2.m128, c->m128);
    bc2.m128 = _mm_mul_ps(b->m128, c2.m128);

    /* abc holds 2.a.b.c; the common factor of 3 is applied at the end */
    abc.m128 = _mm_mul_ps(_mm_mul_ps(a->m128, b->m128), c->m128);
    abc.m128 = _mm_add_ps(abc.m128, abc.m128);

    /* Compute the res. */

    res->m128 = _mm_add_ps(_mm_mul_ps(ord->b021.m128, b2c.m128),
                           _mm_mul_ps(ord->b012.m128, bc2.m128));
    res->m128 = _mm_add_ps(_mm_mul_ps(ord->b120.m128, ab2.m128),
                           res->m128);
    res->m128 = _mm_add_ps(_mm_mul_ps(ord->b102.m128, ac2.m128),
                           res->m128);
    res->m128 = _mm_add_ps(_mm_mul_ps(ord->b210.m128, a2b.m128),
                           res->m128);
    res->m128 = _mm_add_ps(_mm_mul_ps(ord->b201.m128, a2c.m128),
                           res->m128);
    res->m128 = _mm_add_ps(_mm_mul_ps(ord->b111.m128, abc.m128),
                           res->m128);
    res->m128 = _mm_mul_ps(res->m128, three.m128);

    RWRETURNVOID();
}

/*
 * _rtbbtpSSEComputeOrdinates
 *
 * Computes one edge ordinate:
 *
 *     alpha = -dot(objPos1 - objPos0, objNrm) / 3
 *
 * alpha   - receives the ordinate
 * objPos0 - three-component position the edge starts from
 * objPos1 - three-component position the edge runs to
 * objNrm  - three-component normal the edge is projected onto
 * nrmDir  - not used by this version
 */
void
_rtbbtpSSEComputeOrdinates(RpSSEOverlayM128 * alpha,
                           RpSSEOverlayM128 * objPos0,
                           RpSSEOverlayM128 * objPos1,
                           RpSSEOverlayM128 * objNrm,
                           RpSSEOverlayM128 * nrmDir __RWUNUSED__)
{
    const RpSSEOverlayM128 negThird = { (RwReal) - 0.333333,
        (RwReal) - 0.333333,
        (RwReal) - 0.333333,
        (RwReal) - 0.333333
    };
    RpSSEOverlayM128    edgeVec[3];
    RpSSEOverlayM128    edgeDotNrm;

    RWFUNCTION(RWSTRING("_rtbbtpSSEComputeOrdinates"));

    /* edgeVec = objPos1 - objPos0 */
    SSEV3dSub(edgeVec, objPos1, objPos0);

    /* edgeDotNrm = dot product(edgeVec, objNrm) */
    SSEV3dDotProduct(edgeDotNrm, edgeVec, objNrm);

    /* alpha = -edgeDotNrm / 3 */
    alpha->m128 = _mm_mul_ps(edgeDotNrm.m128, negThird.m128);

    RWRETURNVOID();
}

/*
 * _rt_rtbbtpSSEComputeOrdinatesSlow
 *
 * Computes one edge ordinate, dividing by the projection of nrmDir onto
 * the vertex normal:
 *
 *     alpha = -dot(objPos1 - objPos0, objNrm) / (3 * dot(nrmDir, objNrm))
 *
 * alpha   - receives the ordinate
 * objPos0 - three-component position the edge starts from
 * objPos1 - three-component position the edge runs to
 * objNrm  - three-component vertex normal
 * nrmDir  - three-component direction used for the denominator
 */
void
_rt_rtbbtpSSEComputeOrdinatesSlow(RpSSEOverlayM128 * alpha,
                                  RpSSEOverlayM128 * objPos0,
                                  RpSSEOverlayM128 * objPos1,
                                  RpSSEOverlayM128 * objNrm,
                                  RpSSEOverlayM128 * nrmDir)
{
    const RpSSEOverlayM128 minus_one = { (RwReal) - 1.0,
        (RwReal) - 1.0,
        (RwReal) - 1.0,
        (RwReal) - 1.0
    };
    const RpSSEOverlayM128 one_third = { (RwReal) 0.333333,
        (RwReal) 0.333333,
        (RwReal) 0.333333,
        (RwReal) 0.333333
    };
    RpSSEOverlayM128    edgeVec[3];
    RpSSEOverlayM128    dirDotNrm;
    RpSSEOverlayM128    edgeDotNrm;

    RWFUNCTION(RWSTRING("_rt_rtbbtpSSEComputeOrdinatesSlow"));

    /* edgeVec = objPos1 - objPos0 */
    SSEV3dSub(edgeVec, objPos1, objPos0);

    /* dirDotNrm = dot product(nrmDir, objNrm) */
    SSEV3dDotProduct(dirDotNrm, nrmDir, objNrm);

    /* edgeDotNrm = dot product(edgeVec, objNrm) */
    SSEV3dDotProduct(edgeDotNrm, edgeVec, objNrm);

    /*
     * alpha = ((edgeDotNrm * -1) * (1 / 3)) / dirDotNrm
     * Same operations, in the same order, as three separate statements.
     */
    alpha->m128 =
        _mm_div_ps(_mm_mul_ps(_mm_mul_ps(edgeDotNrm.m128,
                                         minus_one.m128),
                              one_third.m128),
                   dirDotNrm.m128);

    RWRETURNVOID();
}

/*****************************************************************************
 SURFACE code
 */
/*
 * _rtbbtpSSESurfaceEvaluate
 *
 * Evaluates the cubic Bernstein-Bezier triangular surface defined by cps
 * at the barycentric coordinates (a, b, c).
 *
 * res     - receives the evaluated three-component point
 * cps     - the ten control points of the patch
 * a, b, c - barycentric coordinates
 *
 * res = 3 * (b021.b^2.c + b012.b.c^2 + b120.a.b^2 + b102.a.c^2
 *            + b210.a^2.b + b201.a^2.c + 2.b111.a.b.c)
 *       + b300.a^3 + b030.b^3 + b003.c^3
 */
void
_rtbbtpSSESurfaceEvaluate(RpSSEOverlayM128 * res,
                          BBTPSSEControlPoints * cps,
                          RpSSEOverlayM128 * a,
                          RpSSEOverlayM128 * b, RpSSEOverlayM128 * c)
{
    const RpSSEOverlayM128 three = { (RwReal) 3.0,
        (RwReal) 3.0,
        (RwReal) 3.0,
        (RwReal) 3.0
    };
    RpSSEOverlayM128    term[3], firstTerm[3];
    RpSSEOverlayM128    aSq, bSq, cSq;
    RpSSEOverlayM128    aCu, bCu, cCu;
    RpSSEOverlayM128    aSqB, aSqC, aBSq, aCSq, bSqC, bCSq;
    RpSSEOverlayM128    twoABC;

    RWFUNCTION(RWSTRING("_rtbbtpSSESurfaceEvaluate"));

    /* Squares of the barycentric coordinates */
    aSq.m128 = _mm_mul_ps(a->m128, a->m128);
    bSq.m128 = _mm_mul_ps(b->m128, b->m128);
    cSq.m128 = _mm_mul_ps(c->m128, c->m128);

    /* Cubes of the barycentric coordinates */
    aCu.m128 = _mm_mul_ps(a->m128, aSq.m128);
    bCu.m128 = _mm_mul_ps(b->m128, bSq.m128);
    cCu.m128 = _mm_mul_ps(c->m128, cSq.m128);

    /* Mixed products */
    aSqB.m128 = _mm_mul_ps(aSq.m128, b->m128);
    aSqC.m128 = _mm_mul_ps(aSq.m128, c->m128);
    aBSq.m128 = _mm_mul_ps(a->m128, bSq.m128);
    aCSq.m128 = _mm_mul_ps(a->m128, cSq.m128);
    bSqC.m128 = _mm_mul_ps(bSq.m128, c->m128);
    bCSq.m128 = _mm_mul_ps(b->m128, cSq.m128);

    /* twoABC = 2.a.b.c; the shared factor of 3 is applied further down */
    twoABC.m128 = _mm_mul_ps(_mm_mul_ps(a->m128, b->m128), c->m128);
    twoABC.m128 = _mm_add_ps(twoABC.m128, twoABC.m128);

    /* Start the sum with the two terms of the 1 - 2 edge */
    SSEV3dMultConstant(firstTerm, cps->b021, bSqC);
    SSEV3dMultConstant(term, cps->b012, bCSq);
    SSEV3dAdd(res, firstTerm, term);

    /* Remaining edge terms, accumulated one at a time */
    SSEV3dMultConstant(firstTerm, cps->b120, aBSq);
    SSEV3dAdd(res, res, firstTerm);

    SSEV3dMultConstant(firstTerm, cps->b102, aCSq);
    SSEV3dAdd(res, res, firstTerm);

    SSEV3dMultConstant(firstTerm, cps->b210, aSqB);
    SSEV3dAdd(res, res, firstTerm);

    SSEV3dMultConstant(firstTerm, cps->b201, aSqC);
    SSEV3dAdd(res, res, firstTerm);

    /* Centre term */
    SSEV3dMultConstant(firstTerm, cps->b111, twoABC);
    SSEV3dAdd(res, res, firstTerm);

    /* Everything gathered so far carries a factor of three */
    SSEV3dMultConstant(res, res, three);

    /* Corner terms carry a weight of one */
    SSEV3dMultConstant(firstTerm, cps->b300, aCu);
    SSEV3dAdd(res, res, firstTerm);

    SSEV3dMultConstant(firstTerm, cps->b030, bCu);
    SSEV3dAdd(res, res, firstTerm);

    SSEV3dMultConstant(firstTerm, cps->b003, cCu);
    SSEV3dAdd(res, res, firstTerm);

    RWRETURNVOID();
}

/*****************************************************************************
 Intersect the tangent plane at v0 with the line that starts a third of the
 way along the edge and runs in the direction of the interpolated normal.

 res      - receives the three-component control point
 objPos0  - three-component position of v0
 objNrm0  - three-component normal at v0
 objPos1  - three-component position of the vertex at the far end of the edge
 objNrm1  - three-component normal at the far end of the edge

 As coded:
     N   = normalise(2/3 * objNrm0 + 1/3 * objNrm1)
     c   = 0.95 * dot(objPos1 - objPos0, objNrm0) / (-dot(N, objNrm0) / 3)
     res = (2/3 * objPos0 + 1/3 * objPos1) - c * N

 NOTE(review): the scale c above is what the code computes; confirm it
 against the intended tangent-plane intersection before reviving this
 routine.
 */
void
_rt_rtbbtpSSEComputeControlPointsOld(RpSSEOverlayM128 * res,
                                     RpSSEOverlayM128 * objPos0,
                                     RpSSEOverlayM128 * objNrm0,
                                     RpSSEOverlayM128 * objPos1,
                                     RpSSEOverlayM128 * objNrm1)
{
    RpSSEOverlayM128    N[3], E[3], tmp1[3], tmp2[3], dp1, dp2, l, invl,
        c;
    /* All four lanes must be -1 so that every packed result is negated. */
    const RpSSEOverlayM128 minus_one = { (RwReal) - 1.0,
        (RwReal) - 1.0,
        (RwReal) - 1.0,
        (RwReal) - 1.0
    };
    const RpSSEOverlayM128 one_third = { (RwReal) 0.333333,
        (RwReal) 0.333333,
        (RwReal) 0.333333,
        (RwReal) 0.333333
    };
    const RpSSEOverlayM128 two_third = { (RwReal) 0.666667,
        (RwReal) 0.666667,
        (RwReal) 0.666667,
        (RwReal) 0.666667
    };
    const RpSSEOverlayM128 point_95 = { (RwReal) 0.95,
        (RwReal) 0.95,
        (RwReal) 0.95,
        (RwReal) 0.95
    };

    RWFUNCTION(RWSTRING("_rt_rtbbtpSSEComputeControlPointsOld"));

    /* compute a unit length avg nrm */

    /* N = (objNrm0 * 2 / 3) + (objNrm1 / 3) */
    SSEV3dMultConstant(tmp1, objNrm0, two_third);
    SSEV3dMultConstant(tmp2, objNrm1, one_third);
    SSEV3dAdd(N, tmp1, tmp2);

    /* l = length(N) */
    SSEV3dLength(l, N);

    /* N = normalise(N), using the SSE reciprocal of the length */
    invl.m128 = _mm_rcp_ps(l.m128);
    SSEV3dMultConstant(N, N, invl);

    /* E = objPos1 - objPos0 */
    SSEV3dSub(E, objPos1, objPos0);

    /* dp1 = dot product(E, objNrm0) */
    SSEV3dDotProduct(dp1, E, objNrm0);

    /* dp2 = dot product(N, objNrm0) * -(1 / 3) */
    SSEV3dDotProduct(dp2, N, objNrm0);
    dp2.m128 = _mm_mul_ps(dp2.m128, one_third.m128);
    dp2.m128 = _mm_mul_ps(dp2.m128, minus_one.m128);

    /* N = N * (0.95 * dp1 / dp2); 0.95 is a multiplier, not a divisor */
    c.m128 = _mm_mul_ps(_mm_div_ps(dp1.m128, dp2.m128), point_95.m128);
    SSEV3dMultConstant(N, N, c);

    /* a third of the way along the edge */

    /* res = (objPos0 * 2 / 3) + (objPos1 / 3) */
    SSEV3dMultConstant(tmp1, objPos0, two_third);
    SSEV3dMultConstant(tmp2, objPos1, one_third);
    SSEV3dAdd(res, tmp1, tmp2);

    /* res = res - N */
    SSEV3dSub(res, res, N);

    RWRETURNVOID();
}

/*****************************************************************************
 Given an edge, rotate the third-length edge vector about the vertex so that
 it is perpendicular to n0 while staying perpendicular to (n0 cross edge).

 res      - receives the three-component control point
 objPos0  - three-component position of the vertex the point belongs to
 objNrm0  - three-component normal (n0) at that vertex
 objPos1  - three-component position of the vertex at the far end of the edge
 objNrm1  - not used by this version

     E   = normalise(objPos1 - objPos0)
     B   = normalise((objNrm0 cross E) cross objNrm0)
     res = objPos0 + B * (length(objPos1 - objPos0) / 3)
 */
void
_rtbbtpSSEComputeControlPoints(RpSSEOverlayM128 * res,
                               RpSSEOverlayM128 * objPos0,
                               RpSSEOverlayM128 * objNrm0,
                               RpSSEOverlayM128 * objPos1,
                               RpSSEOverlayM128 * objNrm1 __RWUNUSED__)
{
    const RpSSEOverlayM128 one_third = { (RwReal) 0.333333,
        (RwReal) 0.333333,
        (RwReal) 0.333333,
        (RwReal) 0.333333
    };
    RpSSEOverlayM128    edgeVec[3];
    RpSSEOverlayM128    sideVec[3];
    RpSSEOverlayM128    tangentVec[3];
    RpSSEOverlayM128    edgeLen;
    RpSSEOverlayM128    tangentLen;
    RpSSEOverlayM128    rcpLen;
    RpSSEOverlayM128    thirdEdgeLen;

    RWFUNCTION(RWSTRING("_rtbbtpSSEComputeControlPoints"));

    /* edgeVec = objPos1 - objPos0, and remember its length */
    SSEV3dSub(edgeVec, objPos1, objPos0);
    SSEV3dLength(edgeLen, edgeVec);

    /* edgeVec = normalise(edgeVec), using the SSE reciprocal */
    rcpLen.m128 = _mm_rcp_ps(edgeLen.m128);
    SSEV3dMultConstant(edgeVec, edgeVec, rcpLen);

    /* sideVec = cross product(objNrm0, edgeVec) */
    SSEV3dCrossProduct(sideVec, objNrm0, edgeVec);

    /* tangentVec = cross product(sideVec, objNrm0) */
    SSEV3dCrossProduct(tangentVec, sideVec, objNrm0);

    /* tangentVec = normalise(tangentVec) */
    SSEV3dLength(tangentLen, tangentVec);
    rcpLen.m128 = _mm_rcp_ps(tangentLen.m128);
    SSEV3dMultConstant(tangentVec, tangentVec, rcpLen);

    /* tangentVec = tangentVec * (edgeLen / 3) */
    thirdEdgeLen.m128 = _mm_mul_ps(edgeLen.m128, one_third.m128);
    SSEV3dMultConstant(tangentVec, tangentVec, thirdEdgeLen);

    /* res = objPos0 + tangentVec */
    SSEV3dAdd(res, objPos0, tangentVec);

    RWRETURNVOID();
}

#endif /* (400 <= __ICL) */
#endif /* (defined(__ICL) && defined(RWSIMD)) */
