diff --git a/configure.py b/configure.py index 8558ecb8..fb312fec 100755 --- a/configure.py +++ b/configure.py @@ -1040,7 +1040,7 @@ config.libs = [ DolphinLib( "mtx", [ - Object(NonMatching, "Dolphin/mtx/mtx.c"), + Object(Matching, "Dolphin/mtx/mtx.c", cflags = [*cflags_base, "-fp_contract off"]), Object(NonMatching, "Dolphin/mtx/mtx44vec.c"), Object(NonMatching, "Dolphin/mtx/mtx44.c"), Object(NonMatching, "Dolphin/mtx/vec.c"), diff --git a/include/dolphin/mtx.h b/include/dolphin/mtx.h index f57beeaf..56e48b91 100644 --- a/include/dolphin/mtx.h +++ b/include/dolphin/mtx.h @@ -1,30 +1,39 @@ #ifndef _DOLPHIN_MTX #define _DOLPHIN_MTX +#include #include - #ifdef __cplusplus extern "C" { #endif +#define GEKKO +#ifndef GEKKO +#define MTX_USE_C +#undef MTX_USE_PS +#endif -typedef f32 Mtx[3][4]; -typedef f32 (*MtxPtr)[4]; -typedef f32 ROMtx[4][3]; -typedef f32 (*ROMtxPtr)[3]; -typedef f32 Mtx44[4][4]; -typedef f32 (*Mtx44Ptr)[4]; +#if (!defined(MTX_USE_PS) && !defined(MTX_USE_C)) +#ifndef _DEBUG +#define MTX_USE_PS +#endif +#endif + + +#define MTX_PTR_OFFSET 3 + +#define MTX44_PTR_OFFSET 4 typedef struct { - f32 x, y, z; -} Vec, *VecPtr; -typedef struct { - s16 x, y, z; -} S16Vec, *S16VecPtr; + u32 numMtx; + MtxPtr stackBase; + MtxPtr stackPtr; -typedef struct { - f32 x, y, z, w; -} Quaternion, *QuaternionPtr; +} MtxStack, *MtxStackPtr; + +#define MTXDegToRad(a) ((a)*0.01745329252f) +#define MTXRadToDeg(a) ((a)*57.29577951f) +#define MTXRowCol(m, r, c) ((m)[(r)][(c)]) void C_MTXIdentity(Mtx m); void C_MTXCopy(const Mtx src, Mtx dst); @@ -33,33 +42,8 @@ void C_MTXConcatArray(const Mtx a, const Mtx* srcBase, Mtx* dstBase, u32 count); void C_MTXTranspose(const Mtx src, Mtx xPose); u32 C_MTXInverse(const Mtx src, Mtx inv); u32 C_MTXInvXpose(const Mtx src, Mtx invX); -void C_MTXMultVec(const Mtx m, const Vec* src, Vec* dst); -void C_MTXMultVecArray(const Mtx m, const Vec* srcBase, Vec* dstBase, u32 count); -void C_MTXMultVecSR(const Mtx m, const Vec* src, Vec* dst); -void C_MTXMultVecArraySR(const Mtx m, const Vec* srcBase, Vec* dstBase, u32 count); -void C_MTXQuat(Mtx m, const Quaternion* q); -void C_MTXReflect(Mtx m, const Vec* p, const Vec* n); -void C_MTXTrans(Mtx m, f32 xT, f32 yT, f32 zT); -void C_MTXTransApply(const Mtx src, Mtx dst, f32 xT, f32 yT, f32 zT); -void C_MTXScale(Mtx m, f32 xS, f32 yS, f32 zS); -void C_MTXScaleApply(const Mtx src, Mtx dst, f32 xS, f32 yS, f32 zS); -void C_MTXRotRad(Mtx m, char axis, f32 rad); -void C_MTXRotTrig(Mtx m, char axis, f32 sinA, f32 cosA); -void C_MTXRotAxisRad(Mtx m, const Vec* axis, f32 rad); -void C_MTXLookAt(Mtx m, const Vec* camPos, const Vec* camUp, const Vec* target); -void C_MTXFrustum(Mtx44 m, f32 t, f32 b, f32 l, f32 r, f32 n, f32 f); -void C_MTXPerspective(Mtx44 m, f32 fovY, f32 aspect, f32 n, f32 f); -void C_MTXOrtho(Mtx44 m, f32 t, f32 b, f32 l, f32 r, f32 n, f32 f); -void C_MTXLightFrustum(Mtx m, f32 t, f32 b, f32 l, f32 r, f32 n, f32 scaleS, f32 scaleT, f32 transS, - f32 transT); -void C_MTXLightPerspective(Mtx m, f32 fovY, f32 aspect, f32 scaleS, f32 scaleT, f32 transS, - f32 transT); - -void C_MTXLightOrtho(Mtx m, f32 t, f32 b, f32 l, f32 r, f32 scaleS, f32 scaleT, f32 transS, - f32 transT); - -#ifdef __MWERKS__ +#ifdef GEKKO void PSMTXIdentity(Mtx m); void PSMTXCopy(const Mtx src, Mtx dst); void PSMTXConcat(const Mtx a, const Mtx b, Mtx ab); @@ -67,22 +51,9 @@ void PSMTXConcatArray(const Mtx a, const Mtx* srcBase, Mtx* dstBase, u32 count); void PSMTXTranspose(const Mtx src, Mtx xPose); u32 PSMTXInverse(const Mtx src, Mtx inv); u32 PSMTXInvXpose(const Mtx src, Mtx invX); -void PSMTXMultVec(const Mtx m, const Vec* src, Vec* dst); -void PSMTXMultVecArray(const Mtx m, const Vec* srcBase, Vec* dstBase, u32 count); -void PSMTXMultVecSR(const Mtx m, const Vec* src, Vec* dst); -void PSMTXMultVecArraySR(const Mtx m, const Vec* srcBase, Vec* dstBase, u32 count); -void PSMTXQuat(Mtx m, const Quaternion* q); -void PSMTXReflect(Mtx m, const Vec* p, const Vec* n); -void PSMTXTrans(Mtx m, f32 xT, f32 yT, f32 zT); -void PSMTXTransApply(const Mtx src, Mtx dst, f32 xT, f32 yT, f32 zT); -void PSMTXScale(Mtx m, f32 xS, f32 yS, f32 zS); -void PSMTXScaleApply(const Mtx src, Mtx dst, f32 xS, f32 yS, f32 zS); -void PSMTXRotRad(Mtx m, char axis, f32 rad); -void PSMTXRotTrig(Mtx m, char axis, f32 sinA, f32 cosA); -void PSMTXRotAxisRad(Mtx m, const Vec* axis, f32 rad); #endif -#ifdef __MWERKS__ +#ifdef MTX_USE_PS #define MTXIdentity PSMTXIdentity #define MTXCopy PSMTXCopy #define MTXConcat PSMTXConcat @@ -90,10 +61,67 @@ void PSMTXRotAxisRad(Mtx m, const Vec* axis, f32 rad); #define MTXTranspose PSMTXTranspose #define MTXInverse PSMTXInverse #define MTXInvXpose PSMTXInvXpose +#else +#define MTXIdentity C_MTXIdentity +#define MTXCopy C_MTXCopy +#define MTXConcat C_MTXConcat +#define MTXConcatArray C_MTXConcatArray +#define MTXTranspose C_MTXTranspose +#define MTXInverse C_MTXInverse +#define MTXInvXpose C_MTXInvXpose +#endif + +void C_MTXMultVec(const Mtx m, const Vec* src, Vec* dst); +void C_MTXMultVecArray(const Mtx m, const Vec* srcBase, Vec* dstBase, u32 count); +void C_MTXMultVecSR(const Mtx m, const Vec* src, Vec* dst); +void C_MTXMultVecArraySR(const Mtx m, const Vec* srcBase, Vec* dstBase, u32 count); + +#ifdef GEKKO +void PSMTXMultVec(const Mtx m, const Vec* src, Vec* dst); +void PSMTXMultVecArray(const Mtx m, const Vec* srcBase, Vec* dstBase, u32 count); +void PSMTXMultVecSR(const Mtx m, const Vec* src, Vec* dst); +void PSMTXMultVecArraySR(const Mtx m, const Vec* srcBase, Vec* dstBase, u32 count); +#endif + +#ifdef MTX_USE_PS #define MTXMultVec PSMTXMultVec #define MTXMultVecArray PSMTXMultVecArray #define MTXMultVecSR PSMTXMultVecSR #define MTXMultVecArraySR PSMTXMultVecArraySR +#else // MTX_USE_C +#define MTXMultVec C_MTXMultVec +#define MTXMultVecArray C_MTXMultVecArray +#define MTXMultVecSR C_MTXMultVecSR +#define MTXMultVecArraySR C_MTXMultVecArraySR +#endif + +void C_MTXQuat(Mtx m, const Quaternion* q); +void C_MTXReflect(Mtx m, const Vec* p, const Vec* n); + +void C_MTXTrans(Mtx m, f32 xT, f32 yT, f32 zT); +void C_MTXTransApply(const Mtx src, Mtx dst, f32 xT, f32 yT, f32 zT); +void C_MTXScale(Mtx m, f32 xS, f32 yS, f32 zS); +void C_MTXScaleApply(const Mtx src, Mtx dst, f32 xS, f32 yS, f32 zS); + +void C_MTXRotRad(Mtx m, char axis, f32 rad); +void C_MTXRotTrig(Mtx m, char axis, f32 sinA, f32 cosA); +void C_MTXRotAxisRad(Mtx m, const Vec* axis, f32 rad); + +#ifdef GEKKO +void PSMTXQuat(Mtx m, const Quaternion* q); +void PSMTXReflect(Mtx m, const Vec* p, const Vec* n); + +void PSMTXTrans(Mtx m, f32 xT, f32 yT, f32 zT); +void PSMTXTransApply(const Mtx src, Mtx dst, f32 xT, f32 yT, f32 zT); +void PSMTXScale(Mtx m, f32 xS, f32 yS, f32 zS); +void PSMTXScaleApply(const Mtx src, Mtx dst, f32 xS, f32 yS, f32 zS); + +void PSMTXRotRad(Mtx m, char axis, f32 rad); +void PSMTXRotTrig(Mtx m, char axis, f32 sinA, f32 cosA); +void PSMTXRotAxisRad(Mtx m, const Vec* axis, f32 rad); +#endif + +#ifdef MTX_USE_PS #define MTXQuat PSMTXQuat #define MTXReflect PSMTXReflect #define MTXTrans PSMTXTrans @@ -103,20 +131,11 @@ void PSMTXRotAxisRad(Mtx m, const Vec* axis, f32 rad); #define MTXRotRad PSMTXRotRad #define MTXRotTrig PSMTXRotTrig #define MTXRotAxisRad PSMTXRotAxisRad + #define MTXRotDeg(m, axis, deg) PSMTXRotRad(m, axis, MTXDegToRad(deg)) #define MTXRotAxisDeg(m, axis, deg) PSMTXRotAxisRad(m, axis, MTXDegToRad(deg)) -#else -#define MTXIdentity C_MTXIdentity -#define MTXCopy C_MTXCopy -#define MTXConcat C_MTXConcat -#define MTXConcatArray C_MTXConcatArray -#define MTXTranspose C_MTXTranspose -#define MTXInverse C_MTXInverse -#define MTXInvXpose C_MTXInvXpose -#define MTXMultVec C_MTXMultVec -#define MTXMultVecArray C_MTXMultVecArray -#define MTXMultVecSR C_MTXMultVecSR -#define MTXMultVecArraySR C_MTXMultVecArraySR + +#else // MTX_USE_C #define MTXQuat C_MTXQuat #define MTXReflect C_MTXReflect #define MTXTrans C_MTXTrans @@ -126,18 +145,176 @@ void PSMTXRotAxisRad(Mtx m, const Vec* axis, f32 rad); #define MTXRotRad C_MTXRotRad #define MTXRotTrig C_MTXRotTrig #define MTXRotAxisRad C_MTXRotAxisRad + #define MTXRotDeg(m, axis, deg) C_MTXRotRad(m, axis, MTXDegToRad(deg)) #define MTXRotAxisDeg(m, axis, deg) C_MTXRotAxisRad(m, axis, MTXDegToRad(deg)) + #endif +void C_MTXLookAt(Mtx m, const Point3d* camPos, const Vec* camUp, const Point3d* target); + #define MTXLookAt C_MTXLookAt + +void C_MTXFrustum(Mtx44 m, f32 t, f32 b, f32 l, f32 r, f32 n, f32 f); +void C_MTXPerspective(Mtx44 m, f32 fovY, f32 aspect, f32 n, f32 f); +void C_MTXOrtho(Mtx44 m, f32 t, f32 b, f32 l, f32 r, f32 n, f32 f); + #define MTXFrustum C_MTXFrustum #define MTXPerspective C_MTXPerspective #define MTXOrtho C_MTXOrtho + +void C_MTXLightFrustum(Mtx m, f32 t, f32 b, f32 l, f32 r, f32 n, f32 scaleS, f32 scaleT, f32 transS, + f32 transT); + +void C_MTXLightPerspective(Mtx m, f32 fovY, f32 aspect, f32 scaleS, f32 scaleT, f32 transS, + f32 transT); + +void C_MTXLightOrtho(Mtx m, f32 t, f32 b, f32 l, f32 r, f32 scaleS, f32 scaleT, f32 transS, + f32 transT); + #define MTXLightFrustum C_MTXLightFrustum #define MTXLightPerspective C_MTXLightPerspective #define MTXLightOrtho C_MTXLightOrtho +void C_VECAdd(const Vec* a, const Vec* b, Vec* ab); +void C_VECSubtract(const Vec* a, const Vec* b, Vec* a_b); +void C_VECScale(const Vec* src, Vec* dst, f32 scale); +void C_VECNormalize(const Vec* src, Vec* unit); +f32 C_VECSquareMag(const Vec* v); +f32 C_VECMag(const Vec* v); +f32 C_VECDotProduct(const Vec* a, const Vec* b); +void C_VECCrossProduct(const Vec* a, const Vec* b, Vec* axb); +f32 C_VECSquareDistance(const Vec* a, const Vec* b); +f32 C_VECDistance(const Vec* a, const Vec* b); +void C_VECReflect(const Vec* src, const Vec* normal, Vec* dst); +void C_VECHalfAngle(const Vec* a, const Vec* b, Vec* half); + +#ifdef GEKKO +void PSVECAdd(const Vec* a, const Vec* b, Vec* ab); +void PSVECSubtract(const Vec* a, const Vec* b, Vec* a_b); +void PSVECScale(const Vec* src, Vec* dst, f32 scale); +void PSVECNormalize(const Vec* src, Vec* unit); +f32 PSVECSquareMag(const Vec* v); +f32 PSVECMag(const Vec* v); +f32 PSVECDotProduct(const Vec* a, const Vec* b); +void PSVECCrossProduct(const Vec* a, const Vec* b, Vec* axb); +f32 PSVECSquareDistance(const Vec* a, const Vec* b); +f32 PSVECDistance(const Vec* a, const Vec* b); +#endif + +// TODO +#if defined( MTX_USE_PS) && 0 +#define VECAdd PSVECAdd +#define VECSubtract PSVECSubtract +#define VECScale PSVECScale +#define VECNormalize PSVECNormalize +#define VECSquareMag PSVECSquareMag +#define VECMag PSVECMag +#define VECDotProduct PSVECDotProduct +#define VECCrossProduct PSVECCrossProduct +#define VECSquareDistance PSVECSquareDistance +#define VECDistance PSVECDistance +#else // MTX_USE_C +#define VECAdd C_VECAdd +#define VECSubtract C_VECSubtract +#define VECScale C_VECScale +#define VECNormalize C_VECNormalize +#define VECSquareMag C_VECSquareMag +#define VECMag C_VECMag +#define VECDotProduct C_VECDotProduct +#define VECCrossProduct C_VECCrossProduct +#define VECSquareDistance C_VECSquareDistance +#define VECDistance C_VECDistance +#endif + +#define VECReflect C_VECReflect +#define VECHalfAngle C_VECHalfAngle + +void C_QUATAdd(const Quaternion* p, const Quaternion* q, Quaternion* r); +void C_QUATSubtract(const Quaternion* p, const Quaternion* q, Quaternion* r); +void C_QUATMultiply(const Quaternion* p, const Quaternion* q, Quaternion* pq); +void C_QUATDivide(const Quaternion* p, const Quaternion* q, Quaternion* r); +void C_QUATScale(const Quaternion* q, Quaternion* r, f32 scale); +f32 C_QUATDotProduct(const Quaternion* p, const Quaternion* q); +void C_QUATNormalize(const Quaternion* src, Quaternion* unit); +void C_QUATInverse(const Quaternion* src, Quaternion* inv); +void C_QUATExp(const Quaternion* q, Quaternion* r); +void C_QUATLogN(const Quaternion* q, Quaternion* r); + +void C_QUATMakeClosest(const Quaternion* q, const Quaternion* qto, Quaternion* r); +void C_QUATRotAxisRad(Quaternion* r, const Vec* axis, f32 rad); +void C_QUATMtx(Quaternion* r, const Mtx m); + +void C_QUATLerp(const Quaternion* p, const Quaternion* q, Quaternion* r, f32 t); +void C_QUATSlerp(const Quaternion* p, const Quaternion* q, Quaternion* r, f32 t); +void C_QUATSquad(const Quaternion* p, const Quaternion* a, const Quaternion* b, const Quaternion* q, + Quaternion* r, f32 t); +void C_QUATCompA(const Quaternion* qprev, const Quaternion* q, const Quaternion* qnext, + Quaternion* a); + +#ifdef GEKKO +void PSQUATAdd(const Quaternion* p, const Quaternion* q, Quaternion* r); +void PSQUATSubtract(const Quaternion* p, const Quaternion* q, Quaternion* r); +void PSQUATMultiply(const Quaternion* p, const Quaternion* q, Quaternion* pq); +void PSQUATDivide(const Quaternion* p, const Quaternion* q, Quaternion* r); +void PSQUATScale(const Quaternion* q, Quaternion* r, f32 scale); +f32 PSQUATDotProduct(const Quaternion* p, const Quaternion* q); +void PSQUATNormalize(const Quaternion* src, Quaternion* unit); +void PSQUATInverse(const Quaternion* src, Quaternion* inv); +#endif + +#ifdef MTX_USE_PS +#define QUATAdd PSQUATAdd +#define QUATSubtract PSQUATSubtract +#define QUATMultiply PSQUATMultiply +#define QUATDivide PSQUATDivide +#define QUATScale PSQUATScale +#define QUATDotProduct PSQUATDotProduct +#define QUATNormalize PSQUATNormalize +#define QUATInverse PSQUATInverse +#else // MTX_USE_C +#define QUATAdd C_QUATAdd +#define QUATSubtract C_QUATSubtract +#define QUATMultiply C_QUATMultiply +#define QUATDivide C_QUATDivide +#define QUATScale C_QUATScale +#define QUATDotProduct C_QUATDotProduct +#define QUATNormalize C_QUATNormalize +#define QUATInverse C_QUATInverse +#endif + +#define QUATExp C_QUATExp +#define QUATLogN C_QUATLogN +#define QUATMakeClosest C_QUATMakeClosest +#define QUATRotAxisRad C_QUATRotAxisRad +#define QUATMtx C_QUATMtx +#define QUATLerp C_QUATLerp +#define QUATSlerp C_QUATSlerp +#define QUATSquad C_QUATSquad +#define QUATCompA C_QUATCompA + +#ifdef GEKKO +void PSMTXReorder(const Mtx src, ROMtx dest); +void PSMTXROMultVecArray(const ROMtx m, const Vec* srcBase, Vec* dstBase, u32 count); +void PSMTXROSkin2VecArray(const ROMtx m0, const ROMtx m1, const f32* wtBase, const Vec* srcBase, + Vec* dstBase, u32 count); +void PSMTXMultS16VecArray(const Mtx m, const S16Vec* srcBase, Vec* dstBase, u32 count); +void PSMTXROMultS16VecArray(const ROMtx m, const S16Vec* srcBase, Vec* dstBase, u32 count); +#endif + +void MTXInitStack(MtxStack* sPtr, u32 numMtx); +MtxPtr MTXPush(MtxStack* sPtr, const Mtx m); +MtxPtr MTXPushFwd(MtxStack* sPtr, const Mtx m); +MtxPtr MTXPushInv(MtxStack* sPtr, const Mtx m); +MtxPtr MTXPushInvXpose(MtxStack* sPtr, const Mtx m); +MtxPtr MTXPop(MtxStack* sPtr); +MtxPtr MTXGetStackPtr(const MtxStack* sPtr); + +#define MTXAllocStack(sPtr, numMtx) \ + (((MtxStackPtr)(sPtr))->stackBase = (MtxPtr)OSAlloc(((numMtx) * sizeof(Mtx)))) + +#define MTXFreeStack(sPtr) (OSFree((void*)(((MtxStackPtr)(sPtr))->stackBase))) + #ifdef __cplusplus } #endif diff --git a/include/dolphin/mtx/GeoTypes.h b/include/dolphin/mtx/GeoTypes.h new file mode 100644 index 00000000..7e9b64da --- /dev/null +++ b/include/dolphin/mtx/GeoTypes.h @@ -0,0 +1,39 @@ +#ifndef _DOLPHIN_GEOTYPES +#define _DOLPHIN_GEOTYPES + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + f32 x, y, z; +} Vec, *VecPtr, Point3d, *Point3dPtr; + +typedef struct { + s16 x; + s16 y; + s16 z; +} S16Vec, *S16VecPtr; + +typedef struct { + f32 x, y, z, w; +} Quaternion, *QuaternionPtr, Qtrn, *QtrnPtr; + +typedef f32 Mtx[3][4]; + +typedef f32 (*MtxPtr)[4]; + +typedef f32 ROMtx[4][3]; + +typedef f32 (*ROMtxPtr)[3]; + +typedef f32 Mtx44[4][4]; + +typedef f32 (*Mtx44Ptr)[4]; +#ifdef __cplusplus +} +#endif + +#endif // _DOLPHIN_GEOTYPES diff --git a/src/Dolphin/mtx/mtx.c b/src/Dolphin/mtx/mtx.c new file mode 100644 index 00000000..eb9a7cdd --- /dev/null +++ b/src/Dolphin/mtx/mtx.c @@ -0,0 +1,1286 @@ +#include "dolphin/mtx.h" + +static f32 Unit01[] = {0.0f, 1.0f}; + +extern f32 sinf(f32); + +void C_MTXIdentity(Mtx mtx) { + mtx[0][0] = 1.0f; + mtx[0][1] = 0.0f; + mtx[0][2] = 0.0f; + mtx[1][0] = 0.0f; + mtx[1][1] = 1.0f; + mtx[1][2] = 0.0f; + mtx[2][0] = 0.0f; + mtx[2][1] = 0.0f; + mtx[2][2] = 1.0f; +} + +#ifdef GEKKO +void PSMTXIdentity(register Mtx m) { + register f32 zero_c = 0.0f; + register f32 one_c = 1.0f; + register f32 c_01; + register f32 c_10; + // clang-format off + asm { + psq_st zero_c, 8(m), 0, 0 + ps_merge01 c_01, zero_c, one_c + psq_st zero_c, 24(m), 0, 0 + ps_merge10 c_10, one_c, zero_c + psq_st zero_c, 32(m), 0, 0 + psq_st c_01, 16(m), 0, 0 + psq_st c_10, 0(m), 0, 0 + psq_st c_10, 40(m), 0, 0 + } + // clang-format on +} +#endif + +void C_MTXCopy(const Mtx src, Mtx dst) { + + if (src == dst) { + return; + } + + dst[0][0] = src[0][0]; + dst[0][1] = src[0][1]; + dst[0][2] = src[0][2]; + dst[0][3] = src[0][3]; + + dst[1][0] = src[1][0]; + dst[1][1] = src[1][1]; + dst[1][2] = src[1][2]; + dst[1][3] = src[1][3]; + + dst[2][0] = src[2][0]; + dst[2][1] = src[2][1]; + dst[2][2] = src[2][2]; + dst[2][3] = src[2][3]; +} + +#ifdef GEKKO +asm void PSMTXCopy(const register Mtx src, register Mtx dst) { + // clang-format off + nofralloc + + psq_l fp0, 0(src), 0, 0 + psq_st fp0, 0(dst), 0, 0 + psq_l fp1, 8(src), 0, 0 + psq_st fp1, 8(dst), 0, 0 + psq_l fp2, 16(src), 0, 0 + psq_st fp2, 16(dst), 0, 0 + psq_l fp3, 24(src), 0, 0 + psq_st fp3, 24(dst), 0, 0 + psq_l fp4, 32(src), 0, 0 + psq_st fp4, 32(dst), 0, 0 + psq_l fp5, 40(src), 0, 0 + psq_st fp5, 40(dst), 0, 0 + + blr + // clang-format on +} +#endif + +void C_MTXConcat(const Mtx a, const Mtx b, Mtx ab) { + Mtx mTmp; + MtxPtr m; + + if ((ab == a) || (ab == b)) { + m = mTmp; + } + + else { + m = ab; + } + + m[0][0] = a[0][0] * b[0][0] + a[0][1] * b[1][0] + a[0][2] * b[2][0]; + m[0][1] = a[0][0] * b[0][1] + a[0][1] * b[1][1] + a[0][2] * b[2][1]; + m[0][2] = a[0][0] * b[0][2] + a[0][1] * b[1][2] + a[0][2] * b[2][2]; + m[0][3] = a[0][0] * b[0][3] + a[0][1] * b[1][3] + a[0][2] * b[2][3] + a[0][3]; + + m[1][0] = a[1][0] * b[0][0] + a[1][1] * b[1][0] + a[1][2] * b[2][0]; + m[1][1] = a[1][0] * b[0][1] + a[1][1] * b[1][1] + a[1][2] * b[2][1]; + m[1][2] = a[1][0] * b[0][2] + a[1][1] * b[1][2] + a[1][2] * b[2][2]; + m[1][3] = a[1][0] * b[0][3] + a[1][1] * b[1][3] + a[1][2] * b[2][3] + a[1][3]; + + m[2][0] = a[2][0] * b[0][0] + a[2][1] * b[1][0] + a[2][2] * b[2][0]; + m[2][1] = a[2][0] * b[0][1] + a[2][1] * b[1][1] + a[2][2] * b[2][1]; + m[2][2] = a[2][0] * b[0][2] + a[2][1] * b[1][2] + a[2][2] * b[2][2]; + m[2][3] = a[2][0] * b[0][3] + a[2][1] * b[1][3] + a[2][2] * b[2][3] + a[2][3]; + + if (m == mTmp) { + C_MTXCopy(mTmp, ab); + } +} + +#ifdef GEKKO +asm void PSMTXConcat(const register Mtx mA, const register Mtx mB, register Mtx mAB) { + // clang-format off + nofralloc + +#define FP0 fp0 +#define FP1 fp1 +#define FP2 fp2 +#define FP3 fp3 +#define FP4 fp4 +#define FP5 fp5 +#define FP6 fp6 +#define FP7 fp7 +#define FP8 fp8 +#define FP9 fp9 +#define FP10 fp10 +#define FP11 fp11 +#define FP12 fp12 +#define FP13 fp13 +#define FP14 fp14 +#define FP15 fp15 +#define FP31 fp31 + stwu r1, -64(r1); + psq_l FP0, 0(mA), 0, 0; + stfd fp14, 8(r1); + psq_l FP6, 0(mB), 0, 0; + addis r6, 0, Unit01@ha; + psq_l FP7, 8(mB), 0, 0; + stfd fp15, 16(r1) + addi r6, r6, Unit01@l; + stfd fp31, 40(r1) + psq_l FP8, 16(mB), 0, 0 + ps_muls0 FP12, FP6, FP0 + psq_l FP2, 16(mA), 0, 0 + ps_muls0 FP13, FP7, FP0 + psq_l FP31, 0(r6), 0, 0 + ps_muls0 FP14, FP6, FP2 + psq_l FP9, 24(mB), 0, 0 + ps_muls0 FP15, FP7, FP2 + psq_l FP1, 8(mA), 0, 0 + ps_madds1 FP12, FP8, FP0, FP12 + psq_l FP3, 24(mA), 0, 0 + ps_madds1 FP14, FP8, FP2, FP14 + psq_l FP10, 32(mB), 0, 0 + ps_madds1 FP13, FP9, FP0, FP13 + psq_l FP11, 40(mB), 0, 0 + ps_madds1 FP15, FP9, FP2, FP15 + psq_l FP4, 32(mA), 0, 0 + psq_l FP5, 40(mA), 0, 0 + ps_madds0 FP12, FP10, FP1, FP12 + ps_madds0 FP13, FP11, FP1, FP13 + ps_madds0 FP14, FP10, FP3, FP14 + ps_madds0 FP15, FP11, FP3, FP15 + psq_st FP12, 0(mAB), 0, 0 + + ps_muls0 FP2, FP6, FP4 + ps_madds1 FP13, FP31, FP1, FP13 + ps_muls0 FP0, FP7, FP4 + psq_st FP14, 16(mAB), 0, 0 + ps_madds1 FP15, FP31, FP3, FP15 + + psq_st FP13, 8(mAB), 0, 0 + + ps_madds1 FP2, FP8, FP4, FP2 + ps_madds1 FP0, FP9, FP4, FP0 + ps_madds0 FP2, FP10, FP5, FP2 + lfd fp14, 8(r1) + psq_st FP15, 24(mAB), 0, 0 + ps_madds0 FP0, FP11, FP5, FP0 + psq_st FP2, 32(mAB), 0, 0 + ps_madds1 FP0, FP31, FP5, FP0 + lfd fp15, 16(r1) + psq_st FP0, 40(mAB), 0, 0 + + lfd fp31, 40(r1) + addi r1, r1, 64 + + blr + // clang-format on + +#undef FP0 +#undef FP1 +#undef FP2 +#undef FP3 +#undef FP4 +#undef FP5 +#undef FP6 +#undef FP7 +#undef FP8 +#undef FP9 +#undef FP10 +#undef FP11 +#undef FP12 +#undef FP13 +#undef FP14 +#undef FP15 +#undef FP31 +} +#endif + +void C_MTXConcatArray(const Mtx a, const Mtx* srcBase, Mtx* dstBase, u32 count) { + u32 i; + for (i = 0; i < count; i++) { + C_MTXConcat(a, *srcBase, *dstBase); + + srcBase++; + dstBase++; + } +} + +#ifdef GEKKO +#if (defined(__MWERKS__) && defined(_DEBUG)) +#pragma global_optimizer on +#pragma optimization_level 1 +#endif + +void PSMTXConcatArray(const register Mtx a, const register Mtx* srcBase, register Mtx* dstBase, + register u32 count) { + register f32 va0, va1, va2, va3, va4, va5; + register f32 vb0, vb1, vb2, vb3, vb4, vb5; + register f32 vd0, vd1, vd2, vd3, vd4, vd5; + register f32 u01; + register f32* u01Ptr = Unit01; + + // clang-format off + asm + { + psq_l va0, 0(a), 0, 0 + psq_l va1, 8(a), 0, 0 + psq_l va2, 16(a), 0, 0 + psq_l va3, 24(a), 0, 0 + subi count, count, 1 + psq_l va4, 32(a), 0, 0 + psq_l va5, 40(a), 0, 0 + mtctr count + psq_l u01, 0(u01Ptr), 0, 0 + + psq_l vb0, 0(srcBase), 0, 0 + psq_l vb2, 16(srcBase), 0, 0 + + ps_muls0 vd0, vb0, va0 + ps_muls0 vd2, vb0, va2 + ps_muls0 vd4, vb0, va4 + + psq_l vb4, 32(srcBase), 0, 0 + + ps_madds1 vd0, vb2, va0, vd0 + ps_madds1 vd2, vb2, va2, vd2 + ps_madds1 vd4, vb2, va4, vd4 + + psq_l vb1, 8(srcBase), 0, 0 + + ps_madds0 vd0, vb4, va1, vd0 + ps_madds0 vd2, vb4, va3, vd2 + ps_madds0 vd4, vb4, va5, vd4 + + psq_l vb3, 24(srcBase), 0, 0 + psq_st vd0, 0(dstBase), 0, 0 + + ps_muls0 vd1, vb1, va0 + ps_muls0 vd3, vb1, va2 + ps_muls0 vd5, vb1, va4 + + psq_l vb5, 40(srcBase), 0, 0 + psq_st vd2, 16(dstBase), 0, 0 + ps_madds1 vd1, vb3, va0, vd1 + ps_madds1 vd3, vb3, va2, vd3 + ps_madds1 vd5, vb3, va4, vd5 + +_loop: + addi srcBase, srcBase, sizeof(Mtx) + ps_madds0 vd1, vb5, va1, vd1 + ps_madds0 vd3, vb5, va3, vd3 + ps_madds0 vd5, vb5, va5, vd5 + psq_l vb0, 0(srcBase), 0, 0 + psq_st vd4, 32(dstBase), 0, 0 + ps_madd vd1, u01, va1, vd1 + ps_madd vd3, u01, va3, vd3 + ps_madd vd5, u01, va5, vd5 + psq_l vb2, 16(srcBase), 0, 0 + psq_st vd1, 8(dstBase), 0, 0 + ps_muls0 vd0, vb0, va0 + ps_muls0 vd2, vb0, va2 + ps_muls0 vd4, vb0, va4 + psq_l vb4, 32(srcBase), 0, 0 + psq_st vd3, 24(dstBase), 0, 0 + ps_madds1 vd0, vb2, va0, vd0 + ps_madds1 vd2, vb2, va2, vd2 + ps_madds1 vd4, vb2, va4, vd4 + psq_l vb1, 8(srcBase), 0, 0 + psq_st vd5, 40(dstBase), 0, 0 + addi dstBase, dstBase, sizeof(Mtx) + + ps_madds0 vd0, vb4, va1, vd0 + ps_madds0 vd2, vb4, va3, vd2 + ps_madds0 vd4, vb4, va5, vd4 + psq_l vb3, 24(srcBase), 0, 0 + psq_st vd0, 0(dstBase), 0, 0 + ps_muls0 vd1, vb1, va0 + ps_muls0 vd3, vb1, va2 + ps_muls0 vd5, vb1, va4 + psq_l vb5, 40(srcBase), 0, 0 + psq_st vd2, 16(dstBase), 0, 0 + ps_madds1 vd1, vb3, va0, vd1 + ps_madds1 vd3, vb3, va2, vd3 + ps_madds1 vd5, vb3, va4, vd5 + bdnz _loop + psq_st vd4, 32(dstBase), 0, 0 + ps_madds0 vd1, vb5, va1, vd1 + ps_madds0 vd3, vb5, va3, vd3 + ps_madds0 vd5, vb5, va5, vd5 + ps_madd vd1, u01, va1, vd1 + ps_madd vd3, u01, va3, vd3 + ps_madd vd5, u01, va5, vd5 + psq_st vd1, 8(dstBase), 0, 0 + psq_st vd3, 24(dstBase), 0, 0 + psq_st vd5, 40(dstBase), 0, 0 + } + // clang-format on +} + +#if (defined(__MWERKS__) && defined(_DEBUG)) +#pragma optimization_level 0 +#pragma global_optimizer reset +#endif + +#endif + +void C_MTXTranspose(const Mtx src, Mtx xPose) { + Mtx mTmp; + MtxPtr m; + + if (src == xPose) { + m = mTmp; + } else { + m = xPose; + } + + m[0][0] = src[0][0]; + m[0][1] = src[1][0]; + m[0][2] = src[2][0]; + m[0][3] = 0.0f; + m[1][0] = src[0][1]; + m[1][1] = src[1][1]; + m[1][2] = src[2][1]; + m[1][3] = 0.0f; + m[2][0] = src[0][2]; + m[2][1] = src[1][2]; + m[2][2] = src[2][2]; + m[2][3] = 0.0f; + + if (m == mTmp) { + C_MTXCopy(mTmp, xPose); + } +} + +#ifdef GEKKO +void PSMTXTranspose(const register Mtx src, register Mtx xPose) { + register f32 c_zero = 0.0F; + register f32 row0a, row1a, row0b, row1b; + register f32 trns0, trns1, trns2; + // clang-format off + asm + { + psq_l row0a, 0(src), 0, 0 + stfs c_zero, 44(xPose) + psq_l row1a, 16(src), 0, 0 + ps_merge00 trns0, row0a, row1a + psq_l row0b, 8(src), 1, 0 + ps_merge11 trns1, row0a, row1a + psq_l row1b, 24(src), 1, 0 + psq_st trns0, 0(xPose), 0, 0 + psq_l row0a, 32(src), 0, 0 + ps_merge00 trns2, row0b, row1b + psq_st trns1, 16(xPose), 0, 0 + ps_merge00 trns0, row0a, c_zero + psq_st trns2, 32(xPose), 0, 0 + ps_merge10 trns1, row0a, c_zero + psq_st trns0, 8(xPose), 0, 0 + lfs row0b, 40(src) + psq_st trns1, 24(xPose), 0, 0 + stfs row0b, 40(xPose) + } + // clang-format on +} +#endif + +u32 C_MTXInverse(const Mtx src, Mtx inv) { + Mtx mTmp; + MtxPtr m; + f32 det; + + if (src == inv) { + m = mTmp; + } else { + m = inv; + } + + det = src[0][0] * src[1][1] * src[2][2] + src[0][1] * src[1][2] * src[2][0] + + src[0][2] * src[1][0] * src[2][1] - src[2][0] * src[1][1] * src[0][2] - + src[1][0] * src[0][1] * src[2][2] - src[0][0] * src[2][1] * src[1][2]; + + if (det == 0.0f) { + return 0; + } + + det = 1.0f / det; + + m[0][0] = (src[1][1] * src[2][2] - src[2][1] * src[1][2]) * det; + m[0][1] = -(src[0][1] * src[2][2] - src[2][1] * src[0][2]) * det; + m[0][2] = (src[0][1] * src[1][2] - src[1][1] * src[0][2]) * det; + + m[1][0] = -(src[1][0] * src[2][2] - src[2][0] * src[1][2]) * det; + m[1][1] = (src[0][0] * src[2][2] - src[2][0] * src[0][2]) * det; + m[1][2] = -(src[0][0] * src[1][2] - src[1][0] * src[0][2]) * det; + + m[2][0] = (src[1][0] * src[2][1] - src[2][0] * src[1][1]) * det; + m[2][1] = -(src[0][0] * src[2][1] - src[2][0] * src[0][1]) * det; + m[2][2] = (src[0][0] * src[1][1] - src[1][0] * src[0][1]) * det; + + m[0][3] = -m[0][0] * src[0][3] - m[0][1] * src[1][3] - m[0][2] * src[2][3]; + m[1][3] = -m[1][0] * src[0][3] - m[1][1] * src[1][3] - m[1][2] * src[2][3]; + m[2][3] = -m[2][0] * src[0][3] - m[2][1] * src[1][3] - m[2][2] * src[2][3]; + + if (m == mTmp) { + C_MTXCopy(mTmp, inv); + } + + return 1; +} + +#ifdef GEKKO +asm u32 PSMTXInverse(const register Mtx src, register Mtx inv){ + // clang-format off + nofralloc + + psq_l fp0, 0(src), 1, 0 + psq_l fp1, 4(src), 0, 0 + psq_l fp2, 16(src), 1, 0 + ps_merge10 fp6, fp1, fp0 + psq_l fp3, 20(src), 0, 0 + psq_l fp4, 32(src), 1, 0 + ps_merge10 fp7, fp3, fp2 + psq_l fp5, 36(src), 0, 0 + ps_mul fp11, fp3, fp6 + ps_mul fp13, fp5, fp7 + ps_merge10 fp8, fp5, fp4 + ps_msub fp11, fp1, fp7, fp11 + ps_mul fp12, fp1, fp8 + ps_msub fp13, fp3, fp8, fp13 + ps_mul fp10, fp3, fp4 + ps_msub fp12, fp5, fp6, fp12 + ps_mul fp9, fp0, fp5 + ps_mul fp8, fp1, fp2 + ps_sub fp6, fp6, fp6 + ps_msub fp10, fp2, fp5, fp10 + ps_mul fp7, fp0, fp13 + ps_msub fp9, fp1, fp4, fp9 + ps_madd fp7, fp2, fp12, fp7 + ps_msub fp8, fp0, fp3, fp8 + ps_madd fp7, fp4, fp11, fp7 + ps_cmpo0 cr0, fp7, fp6 + bne _regular + addi r3, 0, 0 + blr + +_regular: + fres fp0, fp7 + ps_add fp6, fp0, fp0 + ps_mul fp5, fp0, fp0 + ps_nmsub fp0, fp7, fp5, fp6 + lfs fp1, 12(src) + ps_muls0 fp13, fp13, fp0 + lfs fp2, 28(src) + ps_muls0 fp12, fp12, fp0 + lfs fp3, 44(src) + ps_muls0 fp11, fp11, fp0 + ps_merge00 fp5, fp13, fp12 + ps_muls0 fp10, fp10, fp0 + ps_merge11 fp4, fp13, fp12 + ps_muls0 fp9, fp9, fp0 + psq_st fp5, 0(inv), 0, 0 + ps_mul fp6, fp13, fp1 + psq_st fp4, 16(inv), 0, 0 + ps_muls0 fp8, fp8, fp0 + ps_madd fp6, fp12, fp2, fp6 + psq_st fp10, 32(inv), 1, 0 + ps_nmadd fp6, fp11, fp3, fp6 + psq_st fp9, 36(inv), 1, 0 + ps_mul fp7, fp10, fp1 + ps_merge00 fp5, fp11, fp6 + psq_st fp8, 40(inv), 1, 0 + ps_merge11 fp4, fp11, fp6 + psq_st fp5, 8(inv), 0, 0 + ps_madd fp7, fp9, fp2, fp7 + psq_st fp4, 24(inv), 0, 0 + ps_nmadd fp7, fp8, fp3, fp7 + addi r3, 0, 1 + psq_st fp7, 44(inv), 1, 0 + blr + // clang-format on +} +#endif + +u32 C_MTXInvXpose(const Mtx src, Mtx invX) { + Mtx mTmp; + MtxPtr m; + f32 det; + + if (src == invX) { + m = mTmp; + } else { + m = invX; + } + + det = src[0][0] * src[1][1] * src[2][2] + src[0][1] * src[1][2] * src[2][0] + + src[0][2] * src[1][0] * src[2][1] - src[2][0] * src[1][1] * src[0][2] - + src[1][0] * src[0][1] * src[2][2] - src[0][0] * src[2][1] * src[1][2]; + + if (det == 0.0f) { + return 0; + } + + det = 1.0f / det; + + m[0][0] = (src[1][1] * src[2][2] - src[2][1] * src[1][2]) * det; + m[0][1] = -(src[1][0] * src[2][2] - src[2][0] * src[1][2]) * det; + m[0][2] = (src[1][0] * src[2][1] - src[2][0] * src[1][1]) * det; + + m[1][0] = -(src[0][1] * src[2][2] - src[2][1] * src[0][2]) * det; + m[1][1] = (src[0][0] * src[2][2] - src[2][0] * src[0][2]) * det; + m[1][2] = -(src[0][0] * src[2][1] - src[2][0] * src[0][1]) * det; + + m[2][0] = (src[0][1] * src[1][2] - src[1][1] * src[0][2]) * det; + m[2][1] = -(src[0][0] * src[1][2] - src[1][0] * src[0][2]) * det; + m[2][2] = (src[0][0] * src[1][1] - src[1][0] * src[0][1]) * det; + + m[0][3] = 0.0F; + m[1][3] = 0.0F; + m[2][3] = 0.0F; + + if (m == mTmp) { + C_MTXCopy(mTmp, invX); + } + + return 1; +} + +#ifdef GEKKO +asm u32 PSMTXInvXpose(const register Mtx src, register Mtx invX) { + // clang-format off + nofralloc + + psq_l fp0, 0(src), 1, 0 + psq_l fp1, 4(src), 0, 0 + psq_l fp2, 16(src), 1, 0 + ps_merge10 fp6, fp1, fp0 + psq_l fp3, 20(src), 0, 0 + psq_l fp4, 32(src), 1, 0 + ps_merge10 fp7, fp3, fp2 + psq_l fp5, 36(src), 0, 0 + ps_mul fp11, fp3, fp6 + ps_merge10 fp8, fp5, fp4 + ps_mul fp13, fp5, fp7 + ps_msub fp11, fp1, fp7, fp11 + ps_mul fp12, fp1, fp8 + ps_msub fp13, fp3, fp8, fp13 + ps_msub fp12, fp5, fp6, fp12 + ps_mul fp10, fp3, fp4 + ps_mul fp9, fp0, fp5 + ps_mul fp8, fp1, fp2 + ps_msub fp10, fp2, fp5, fp10 + ps_msub fp9, fp1, fp4, fp9 + ps_msub fp8, fp0, fp3, fp8 + ps_mul fp7, fp0, fp13 + ps_sub fp1, fp1, fp1 + ps_madd fp7, fp2, fp12, fp7 + ps_madd fp7, fp4, fp11, fp7 + ps_cmpo0 cr0, fp7, fp1 + bne _regular + addi r3, 0, 0 + blr + +_regular: + fres fp0, fp7 + psq_st fp1, 12(invX), 1, 0 + ps_add fp6, fp0, fp0 + ps_mul fp5, fp0, fp0 + psq_st fp1, 28(invX), 1, 0 + ps_nmsub fp0, fp7, fp5, fp6 + psq_st fp1, 44(invX), 1, 0 + ps_muls0 fp13, fp13, fp0 + ps_muls0 fp12, fp12, fp0 + ps_muls0 fp11, fp11, fp0 + psq_st fp13, 0(invX), 0, 0 + psq_st fp12, 16(invX), 0, 0 + ps_muls0 fp10, fp10, fp0 + ps_muls0 fp9, fp9, fp0 + psq_st fp11, 32(invX), 0, 0 + psq_st fp10, 8(invX), 1, 0 + ps_muls0 fp8, fp8, fp0 + addi r3, 0, 1 + psq_st fp9, 24(invX), 1, 0 + psq_st fp8, 40(invX), 1, 0 + blr + // clang-format on +} +#endif + +void C_MTXRotRad(Mtx m, char axis, f32 rad) { + + f32 sinA, cosA; + sinA = sinf(rad); + cosA = cosf(rad); + C_MTXRotTrig(m, axis, sinA, cosA); +} + +#ifdef GEKKO +void PSMTXRotRad(Mtx m, char axis, f32 rad) { + f32 sinA, cosA; + + sinA = sinf(rad); + cosA = cosf(rad); + + PSMTXRotTrig(m, axis, sinA, cosA); +} +#endif + +void C_MTXRotTrig(Mtx m, char axis, f32 sinA, f32 cosA) { + switch (axis) { + + case 'x': + case 'X': + m[0][0] = 1.0f; + m[0][1] = 0.0f; + m[0][2] = 0.0f; + m[0][3] = 0.0f; + m[1][0] = 0.0f; + m[1][1] = cosA; + m[1][2] = -sinA; + m[1][3] = 0.0f; + m[2][0] = 0.0f; + m[2][1] = sinA; + m[2][2] = cosA; + m[2][3] = 0.0f; + break; + + case 'y': + case 'Y': + m[0][0] = cosA; + m[0][1] = 0.0f; + m[0][2] = sinA; + m[0][3] = 0.0f; + m[1][0] = 0.0f; + m[1][1] = 1.0f; + m[1][2] = 0.0f; + m[1][3] = 0.0f; + m[2][0] = -sinA; + m[2][1] = 0.0f; + m[2][2] = cosA; + m[2][3] = 0.0f; + break; + + case 'z': + case 'Z': + m[0][0] = cosA; + m[0][1] = -sinA; + m[0][2] = 0.0f; + m[0][3] = 0.0f; + m[1][0] = sinA; + m[1][1] = cosA; + m[1][2] = 0.0f; + m[1][3] = 0.0f; + m[2][0] = 0.0f; + m[2][1] = 0.0f; + m[2][2] = 1.0f; + m[2][3] = 0.0f; + break; + + default: + break; + } +} + +#ifdef GEKKO +void PSMTXRotTrig(register Mtx m, register char axis, register f32 sinA, register f32 cosA) { + register f32 fc0, fc1, nsinA; + register f32 fw0, fw1, fw2, fw3; + // clang-format off + asm + { + frsp sinA, sinA + frsp cosA, cosA + } + + fc0 = 0.0F; + fc1 = 1.0F; + asm + { + ori axis, axis, 0x20 + ps_neg nsinA, sinA + cmplwi axis, 'x' + beq _case_x + cmplwi axis, 'y' + beq _case_y + cmplwi axis, 'z' + beq _case_z + b _end + +_case_x: + psq_st fc1, 0(m), 1, 0 + psq_st fc0, 4(m), 0, 0 + ps_merge00 fw0, sinA, cosA + psq_st fc0, 12(m), 0, 0 + ps_merge00 fw1, cosA, nsinA + psq_st fc0, 28(m), 0, 0 + psq_st fc0, 44(m), 1, 0 + psq_st fw0, 36(m), 0, 0 + psq_st fw1, 20(m), 0, 0 + b _end; + +_case_y: + ps_merge00 fw0, cosA, fc0 + ps_merge00 fw1, fc0, fc1 + psq_st fc0, 24(m), 0, 0 + psq_st fw0, 0(m), 0, 0 + ps_merge00 fw2, nsinA, fc0 + ps_merge00 fw3, sinA, fc0 + psq_st fw0, 40(m), 0, 0; + psq_st fw1, 16(m), 0, 0; + psq_st fw3, 8(m), 0, 0; + psq_st fw2, 32(m), 0, 0; + b _end; + +_case_z: + psq_st fc0, 8(m), 0, 0 + ps_merge00 fw0, sinA, cosA + ps_merge00 fw2, cosA, nsinA + psq_st fc0, 24(m), 0, 0 + psq_st fc0, 32(m), 0, 0 + ps_merge00 fw1, fc1, fc0 + psq_st fw0, 16(m), 0, 0 + psq_st fw2, 0(m), 0, 0 + psq_st fw1, 40(m), 0, 0 + +_end: + } + // clang-format on +} + +#endif + +void C_MTXRotAxisRad(Mtx m, const Vec* axis, f32 rad) { + Vec vN; + f32 s, c; + f32 t; + f32 x, y, z; + f32 xSq, ySq, zSq; + + s = sinf(rad); + c = cosf(rad); + t = 1.0f - c; + + C_VECNormalize(axis, &vN); + + x = vN.x; + y = vN.y; + z = vN.z; + + xSq = x * x; + ySq = y * y; + zSq = z * z; + + m[0][0] = (t * xSq) + (c); + m[0][1] = (t * x * y) - (s * z); + m[0][2] = (t * x * z) + (s * y); + m[0][3] = 0.0f; + + m[1][0] = (t * x * y) + (s * z); + m[1][1] = (t * ySq) + (c); + m[1][2] = (t * y * z) - (s * x); + m[1][3] = 0.0f; + + m[2][0] = (t * x * z) - (s * y); + m[2][1] = (t * y * z) + (s * x); + m[2][2] = (t * zSq) + (c); + m[2][3] = 0.0f; +} + +#ifdef GEKKO +static void __PSMTXRotAxisRadInternal(register Mtx m, const register Vec* axis, register f32 sT, + register f32 cT) { + register f32 tT, fc0; + register f32 tmp0, tmp1, tmp2, tmp3, tmp4; + register f32 tmp5, tmp6, tmp7, tmp8, tmp9; + + tmp9 = 0.5F; + tmp8 = 3.0F; + // clang-format off + asm + { + frsp cT, cT + psq_l tmp0, 0(axis), 0, 0 + frsp sT, sT + lfs tmp1, 8(axis) + ps_mul tmp2, tmp0, tmp0 + fadds tmp7, tmp9, tmp9 + ps_madd tmp3, tmp1, tmp1, tmp2 + fsubs fc0, tmp9, tmp9 + ps_sum0 tmp4, tmp3, tmp1, tmp2 + fsubs tT, tmp7, cT + frsqrte tmp5, tmp4 + fmuls tmp2, tmp5, tmp5 + fmuls tmp3, tmp5, tmp9 + fnmsubs tmp2, tmp2, tmp4, tmp8 + fmuls tmp5, tmp2, tmp3 + ps_merge00 cT, cT, cT + ps_muls0 tmp0, tmp0, tmp5 + ps_muls0 tmp1, tmp1, tmp5 + ps_muls0 tmp4, tmp0, tT + ps_muls0 tmp9, tmp0, sT + ps_muls0 tmp5, tmp1, tT + ps_muls1 tmp3, tmp4, tmp0 + ps_muls0 tmp2, tmp4, tmp0 + ps_muls0 tmp4, tmp4, tmp1 + fnmsubs tmp6, tmp1, sT, tmp3 + fmadds tmp7, tmp1, sT, tmp3 + ps_neg tmp0, tmp9 + ps_sum0 tmp8, tmp4, fc0, tmp9 + ps_sum0 tmp2, tmp2, tmp6, cT + ps_sum1 tmp3, cT, tmp7, tmp3 + ps_sum0 tmp6, tmp0, fc0 ,tmp4 + psq_st tmp8, 8(m), 0, 0 + ps_sum0 tmp0, tmp4, tmp4, tmp0 + psq_st tmp2, 0(m), 0, 0 + ps_muls0 tmp5, tmp5, tmp1 + psq_st tmp3, 16(m), 0, 0 + ps_sum1 tmp4, tmp9, tmp0, tmp4 + psq_st tmp6, 24(m), 0, 0 + ps_sum0 tmp5, tmp5, fc0, cT + psq_st tmp4, 32(m), 0, 0 + psq_st tmp5, 40(m), 0, 0 + } + // clang-format on +} + +void PSMTXRotAxisRad(Mtx m, const Vec* axis, f32 rad) { + f32 sinT, cosT; + + sinT = sinf(rad); + cosT = cosf(rad); + + __PSMTXRotAxisRadInternal(m, axis, sinT, cosT); +} + +#endif + +void C_MTXTrans(Mtx m, f32 xT, f32 yT, f32 zT) { + m[0][0] = 1.0f; + m[0][1] = 0.0f; + m[0][2] = 0.0f; + m[0][3] = xT; + m[1][0] = 0.0f; + m[1][1] = 1.0f; + m[1][2] = 0.0f; + m[1][3] = yT; + m[2][0] = 0.0f; + m[2][1] = 0.0f; + m[2][2] = 1.0f; + m[2][3] = zT; +} + +#ifdef GEKKO +void PSMTXTrans(register Mtx m, register f32 xT, register f32 yT, register f32 zT) { + register f32 c0 = 0.0F; + register f32 c1 = 1.0F; + // clang-format off + asm + { + stfs xT, 12(m) + stfs yT, 28(m) + psq_st c0, 4(m), 0, 0 + psq_st c0, 32(m), 0, 0 + stfs c0, 16(m) + stfs c1, 20(m) + stfs c0, 24(m) + stfs c1, 40(m) + stfs zT, 44(m) + stfs c1, 0(m) + } + // clang-format on +} +#endif + +void C_MTXTransApply(const Mtx src, Mtx dst, f32 xT, f32 yT, f32 zT) { + if (src != dst) { + dst[0][0] = src[0][0]; + dst[0][1] = src[0][1]; + dst[0][2] = src[0][2]; + dst[1][0] = src[1][0]; + dst[1][1] = src[1][1]; + dst[1][2] = src[1][2]; + dst[2][0] = src[2][0]; + dst[2][1] = src[2][1]; + dst[2][2] = src[2][2]; + } + + dst[0][3] = src[0][3] + xT; + dst[1][3] = src[1][3] + yT; + dst[2][3] = src[2][3] + zT; +} + +#ifdef GEKKO +asm void PSMTXTransApply(const register Mtx src, register Mtx dst, register f32 xT, register f32 yT, + register f32 zT) { + // clang-format off + nofralloc; + psq_l fp4, 0(src), 0, 0 + frsp xT, xT + psq_l fp5, 8(src), 0, 0 + frsp yT, yT + psq_l fp7, 24(src), 0, 0 + frsp zT, zT + psq_l fp8, 40(src), 0, 0 + psq_st fp4, 0(dst), 0, 0 + ps_sum1 fp5, xT, fp5, fp5 + psq_l fp6, 16(src), 0, 0 + psq_st fp5, 8(dst), 0, 0 + ps_sum1 fp7, yT, fp7, fp7 + psq_l fp9, 32(src), 0, 0 + psq_st fp6, 16(dst), 0, 0 + ps_sum1 fp8, zT, fp8, fp8 + psq_st fp7, 24(dst), 0, 0 + psq_st fp9, 32(dst), 0, 0 + psq_st fp8, 40(dst), 0, 0 + blr + // clang-format on +} +#endif + +void C_MTXScale(Mtx m, f32 xS, f32 yS, f32 zS) { + m[0][0] = xS; + m[0][1] = 0.0f; + m[0][2] = 0.0f; + m[0][3] = 0.0f; + m[1][0] = 0.0f; + m[1][1] = yS; + m[1][2] = 0.0f; + m[1][3] = 0.0f; + m[2][0] = 0.0f; + m[2][1] = 0.0f; + m[2][2] = zS; + m[2][3] = 0.0f; +} + +#ifdef GEKKO +void PSMTXScale(register Mtx m, register f32 xS, register f32 yS, register f32 zS) { + register f32 c0 = 0.0F; + // clang-format off + asm + { + stfs xS, 0(m) + psq_st c0, 4(m), 0, 0 + psq_st c0, 12(m), 0, 0 + stfs yS, 20(m) + psq_st c0, 24(m), 0, 0 + psq_st c0, 32(m), 0, 0 + stfs zS, 40(m) + stfs c0, 44(m) + } + // clang-format on +} +#endif + +void C_MTXScaleApply(const Mtx src, Mtx dst, f32 xS, f32 yS, f32 zS) { + dst[0][0] = src[0][0] * xS; + dst[0][1] = src[0][1] * xS; + dst[0][2] = src[0][2] * xS; + dst[0][3] = src[0][3] * xS; + + dst[1][0] = src[1][0] * yS; + dst[1][1] = src[1][1] * yS; + dst[1][2] = src[1][2] * yS; + dst[1][3] = src[1][3] * yS; + + dst[2][0] = src[2][0] * zS; + dst[2][1] = src[2][1] * zS; + dst[2][2] = src[2][2] * zS; + dst[2][3] = src[2][3] * zS; +} + +#ifdef GEKKO +asm void PSMTXScaleApply(const register Mtx src, register Mtx dst, register f32 xS, register f32 yS, + register f32 zS) { + // clang-format off + nofralloc; + frsp xS, xS + psq_l fp4, 0(src), 0, 0 + frsp yS, yS + psq_l fp5, 8(src), 0, 0 + frsp zS, zS + ps_muls0 fp4, fp4, xS + psq_l fp6, 16(src), 0, 0 + ps_muls0 fp5, fp5, xS + psq_l fp7, 24(src), 0, 0 + ps_muls0 fp6, fp6, yS + psq_l fp8, 32(src), 0, 0 + psq_st fp4, 0(dst), 0, 0 + ps_muls0 fp7, fp7, yS + psq_l fp2, 40(src), 0, 0 + psq_st fp5, 8(dst), 0, 0 + ps_muls0 fp8, fp8, zS + psq_st fp6, 16(dst), 0, 0 + ps_muls0 fp2, fp2, zS + psq_st fp7, 24(dst), 0, 0 + psq_st fp8, 32(dst), 0, 0 + psq_st fp2, 40(dst), 0, 0 + blr + // clang-format on +} +#endif + +void C_MTXQuat(Mtx m, const Quaternion* q) { + + f32 s, xs, ys, zs, wx, wy, wz, xx, xy, xz, yy, yz, zz; + s = 2.0f / ((q->x * q->x) + (q->y * q->y) + (q->z * q->z) + (q->w * q->w)); + + xs = q->x * s; + ys = q->y * s; + zs = q->z * s; + wx = q->w * xs; + wy = q->w * ys; + wz = q->w * zs; + xx = q->x * xs; + xy = q->x * ys; + xz = q->x * zs; + yy = q->y * ys; + yz = q->y * zs; + zz = q->z * zs; + + m[0][0] = 1.0f - (yy + zz); + m[0][1] = xy - wz; + m[0][2] = xz + wy; + m[0][3] = 0.0f; + + m[1][0] = xy + wz; + m[1][1] = 1.0f - (xx + zz); + m[1][2] = yz - wx; + m[1][3] = 0.0f; + + m[2][0] = xz - wy; + m[2][1] = yz + wx; + m[2][2] = 1.0f - (xx + yy); + m[2][3] = 0.0f; +} + +#ifdef GEKKO +void PSMTXQuat(register Mtx m, const register Quaternion* q) { + register f32 c_zero, c_one, c_two, scale; + register f32 tmp0, tmp1, tmp2, tmp3, tmp4; + register f32 tmp5, tmp6, tmp7, tmp8, tmp9; + + c_one = 1.0F; + // clang-format off + asm + { + psq_l tmp0, 0(q), 0, 0 + psq_l tmp1, 8(q), 0, 0 + fsubs c_zero, c_one, c_one + fadds c_two, c_one, c_one + ps_mul tmp2, tmp0, tmp0 + ps_merge10 tmp5, tmp0, tmp0 + ps_madd tmp4, tmp1, tmp1, tmp2 + ps_mul tmp3, tmp1, tmp1 + ps_sum0 scale, tmp4, tmp4, tmp4 + ps_muls1 tmp7, tmp5, tmp1 + fres tmp9, scale + ps_sum1 tmp4, tmp3, tmp4, tmp2 + ps_nmsub scale, scale, tmp9, c_two + ps_muls1 tmp6, tmp1, tmp1 + ps_mul scale, tmp9, scale + ps_sum0 tmp2, tmp2, tmp2, tmp2 + fmuls scale, scale, c_two + ps_madd tmp8, tmp0, tmp5, tmp6 + ps_msub tmp6, tmp0, tmp5, tmp6 + psq_st c_zero, 12(m), 1, 0 + ps_nmsub tmp2, tmp2, scale, c_one + ps_nmsub tmp4, tmp4, scale, c_one + psq_st c_zero, 44(m), 1, 0 + ps_mul tmp8, tmp8, scale + ps_mul tmp6, tmp6, scale + psq_st tmp2, 40(m), 1, 0 + ps_madds0 tmp5, tmp0, tmp1, tmp7 + ps_merge00 tmp1, tmp8, tmp4 + ps_nmsub tmp7, tmp7, c_two, tmp5 + ps_merge10 tmp0, tmp4, tmp6 + psq_st tmp1, 16(m), 0, 0 + ps_mul tmp5, tmp5, scale + ps_mul tmp7, tmp7, scale + psq_st tmp0, 0(m), 0, 0 + psq_st tmp5, 8(m), 1, 0 + ps_merge10 tmp3, tmp7, c_zero + ps_merge01 tmp9, tmp7, tmp5 + psq_st tmp3, 24(m), 0, 0 + psq_st tmp9, 32(m), 0, 0 + } + // clang-format on +} +#endif + +void C_MTXReflect(Mtx m, const Vec* p, const Vec* n) { + f32 vxy, vxz, vyz, pdotn; + + vxy = -2.0f * n->x * n->y; + vxz = -2.0f * n->x * n->z; + vyz = -2.0f * n->y * n->z; + pdotn = 2.0f * C_VECDotProduct(p, n); + + m[0][0] = 1.0f - 2.0f * n->x * n->x; + m[0][1] = vxy; + m[0][2] = vxz; + m[0][3] = pdotn * n->x; + + m[1][0] = vxy; + m[1][1] = 1.0f - 2.0f * n->y * n->y; + m[1][2] = vyz; + m[1][3] = pdotn * n->y; + + m[2][0] = vxz; + m[2][1] = vyz; + m[2][2] = 1.0f - 2.0f * n->z * n->z; + m[2][3] = pdotn * n->z; +} + +#ifdef GEKKO +void PSMTXReflect(register Mtx m, const register Vec* p, const register Vec* n) { + register f32 c_one = 1.0F; + register f32 vn_xy, vn_z1, n2vn_xy, n2vn_z1, pdotn; + register f32 tmp0, tmp1, tmp2, tmp3; + register f32 tmp4, tmp5, tmp6, tmp7; + // clang-format off + asm + { + psq_l vn_z1, 8(n), 1, 0 + psq_l vn_xy, 0(n), 0, 0 + psq_l tmp0, 0(p), 0, 0 + ps_nmadd n2vn_z1, vn_z1, c_one, vn_z1 + psq_l tmp1, 8(p), 1, 0 + ps_nmadd n2vn_xy, vn_xy, c_one, vn_xy + ps_muls0 tmp4, vn_xy, n2vn_z1 + ps_mul pdotn, n2vn_xy, tmp0 + ps_muls0 tmp2, vn_xy, n2vn_xy + ps_sum0 pdotn, pdotn, pdotn, pdotn + ps_muls1 tmp3, vn_xy, n2vn_xy + psq_st tmp4, 32(m), 0, 0 + ps_sum0 tmp2, tmp2, tmp2, c_one + ps_nmadd pdotn, n2vn_z1, tmp1, pdotn + ps_sum1 tmp3, c_one, tmp3, tmp3 + psq_st tmp2, 0(m), 0, 0 + ps_muls0 tmp5, vn_xy, pdotn + ps_merge00 tmp6, n2vn_z1, pdotn + psq_st tmp3, 16(m), 0, 0 + ps_merge00 tmp7, tmp4, tmp5 + ps_muls0 tmp6, tmp6, vn_z1 + ps_merge11 tmp5, tmp4, tmp5 + psq_st tmp7, 8(m), 0, 0 + ps_sum0 tmp6, tmp6, tmp6, c_one + psq_st tmp5, 24(m), 0, 0 + psq_st tmp6, 40(m), 0, 0 + } + // clang-format on +} +#endif + +void C_MTXLookAt(Mtx m, const Point3d* camPos, const Vec* camUp, const Point3d* target) { + + Vec vLook, vRight, vUp; + + vLook.x = camPos->x - target->x; + vLook.y = camPos->y - target->y; + vLook.z = camPos->z - target->z; + VECNormalize(&vLook, &vLook); + VECCrossProduct(camUp, &vLook, &vRight); + VECNormalize(&vRight, &vRight); + VECCrossProduct(&vLook, &vRight, &vUp); + + m[0][0] = vRight.x; + m[0][1] = vRight.y; + m[0][2] = vRight.z; + m[0][3] = -(camPos->x * vRight.x + camPos->y * vRight.y + camPos->z * vRight.z); + + m[1][0] = vUp.x; + m[1][1] = vUp.y; + m[1][2] = vUp.z; + m[1][3] = -(camPos->x * vUp.x + camPos->y * vUp.y + camPos->z * vUp.z); + + m[2][0] = vLook.x; + m[2][1] = vLook.y; + m[2][2] = vLook.z; + m[2][3] = -(camPos->x * vLook.x + camPos->y * vLook.y + camPos->z * vLook.z); +} + +void C_MTXLightFrustum(Mtx m, float t, float b, float l, float r, float n, float scaleS, + float scaleT, float transS, float transT) { + f32 tmp; + + tmp = 1.0f / (r - l); + m[0][0] = ((2 * n) * tmp) * scaleS; + m[0][1] = 0.0f; + m[0][2] = (((r + l) * tmp) * scaleS) - transS; + m[0][3] = 0.0f; + + tmp = 1.0f / (t - b); + m[1][0] = 0.0f; + m[1][1] = ((2 * n) * tmp) * scaleT; + m[1][2] = (((t + b) * tmp) * scaleT) - transT; + m[1][3] = 0.0f; + + m[2][0] = 0.0f; + m[2][1] = 0.0f; + m[2][2] = -1.0f; + m[2][3] = 0.0f; +} + +void C_MTXLightPerspective(Mtx m, f32 fovY, f32 aspect, float scaleS, float scaleT, float transS, + float transT) { + f32 angle; + f32 cot; + + angle = fovY * 0.5f; + angle = MTXDegToRad(angle); + + cot = 1.0f / tanf(angle); + + m[0][0] = (cot / aspect) * scaleS; + m[0][1] = 0.0f; + m[0][2] = -transS; + m[0][3] = 0.0f; + + m[1][0] = 0.0f; + m[1][1] = cot * scaleT; + m[1][2] = -transT; + m[1][3] = 0.0f; + + m[2][0] = 0.0f; + m[2][1] = 0.0f; + m[2][2] = -1.0f; + m[2][3] = 0.0f; +} + +void C_MTXLightOrtho(Mtx m, f32 t, f32 b, f32 l, f32 r, float scaleS, float scaleT, float transS, + float transT) { + f32 tmp; + tmp = 1.0f / (r - l); + m[0][0] = (2.0f * tmp * scaleS); + m[0][1] = 0.0f; + m[0][2] = 0.0f; + m[0][3] = ((-(r + l) * tmp) * scaleS) + transS; + + tmp = 1.0f / (t - b); + m[1][0] = 0.0f; + m[1][1] = (2.0f * tmp) * scaleT; + m[1][2] = 0.0f; + m[1][3] = ((-(t + b) * tmp) * scaleT) + transT; + + m[2][0] = 0.0f; + m[2][1] = 0.0f; + m[2][2] = 0.0f; + m[2][3] = 1.0f; +}