From 5df0bae04541671da56dcb911198439abbcd3312 Mon Sep 17 00:00:00 2001
From: Phillip Stephens <antidote.crk@gmail.com>
Date: Sun, 19 Apr 2015 16:15:32 -0700
Subject: [PATCH] * Update MathLib

---
 CPlane.hpp      |  21 ++++----
 CProjection.cpp |   2 -
 CProjection.hpp | 126 +++++++++++++++++++++++++++++++++++++-----------
 CVector3f.cpp   |  26 +---------
 CVector3f.hpp   |  20 ++++++--
 5 files changed, 130 insertions(+), 65 deletions(-)

diff --git a/CPlane.hpp b/CPlane.hpp
index 084669a..c9570c6 100644
--- a/CPlane.hpp
+++ b/CPlane.hpp
@@ -9,14 +9,7 @@ class ZE_ALIGN(16) CPlane
 public:
     ZE_DECLARE_ALIGNED_ALLOCATOR();
 
-    CPlane()
-    {
-#if __SSE__
-        mVec128 = _mm_xor_ps(mVec128, mVec128);
-#else
-        a = 0.0f; b = 0.0f; c = 0.0f; d = 0.0f;
-#endif
-    }
+    inline CPlane() {} // NOTE(review): performs no initialization; a, b, c, d are indeterminate until assigned (the removed ctor zeroed them)
     CPlane(float a, float b, float c, float d) : a(a), b(b), c(c), d(d) {}
     CPlane(const CVector3f& point, float displacement)
     {
@@ -28,7 +21,16 @@ public:
         d = displacement;
     }
     
-protected:
+    inline void normalize() // Multiplies both vec and d by 1 / vec.length().
+    {
+        float nd = d; // saved first: vec shares the union storage with a..d, so vec *= may disturb d (presumably via its 4th lane) - confirm
+        float mag = vec.length();
+        assert(mag != 0.0f); // a zero-length normal cannot be normalized
+        mag = 1.0f / mag; // single-precision reciprocal; avoids a float -> double -> float round trip
+        vec *= mag;
+        d = nd * mag; // rewrite d from the saved value, scaled by the same factor as vec
+    }
+    
     union
     {
         struct
@@ -36,6 +38,7 @@ protected:
             float a, b, c, d;
         };
         float p[4];
+        CVector3f vec;
 #ifdef __SSE__
         __m128 mVec128;
 #endif
diff --git a/CProjection.cpp b/CProjection.cpp
index cde161c..6349289 100644
--- a/CProjection.cpp
+++ b/CProjection.cpp
@@ -70,5 +70,3 @@ void CProjection::_updateCachedMatrix()
         throw std::runtime_error("attempted to cache invalid projection type");
 }
 
-
-
diff --git a/CProjection.hpp b/CProjection.hpp
index 6a59122..4c441ec 100644
--- a/CProjection.hpp
+++ b/CProjection.hpp
@@ -7,38 +7,110 @@
 #define _USE_MATH_DEFINES 1
 #include <math.h>
 
-typedef union
+union TMatrix4f
 {
     float m[4][4];
 #if __SSE__
     __m128 mVec128[4];
 #endif
-} TMatrix4f;
-static inline void copyMatrix4f(TMatrix4f& dest, const TMatrix4f& src)
-{
+    inline TMatrix4f transposed() const // Returns a copy with ret.m[i][j] == m[j][i]; const so it can be called through a const TMatrix4f&
+    {
+        TMatrix4f ret;
 #if __SSE__
-    dest.mVec128[0] = src.mVec128[0];
-    dest.mVec128[1] = src.mVec128[1];
-    dest.mVec128[2] = src.mVec128[2];
-    dest.mVec128[3] = src.mVec128[3];
+        __m128 T0 = _mm_unpacklo_ps(mVec128[0], mVec128[1]); // [m00, m10, m01, m11]
+        __m128 T2 = _mm_unpacklo_ps(mVec128[2], mVec128[3]); // [m20, m30, m21, m31]
+        __m128 T1 = _mm_unpackhi_ps(mVec128[0], mVec128[1]); // [m02, m12, m03, m13]
+        __m128 T3 = _mm_unpackhi_ps(mVec128[2], mVec128[3]); // [m22, m32, m23, m33]
+        ret.mVec128[0] = _mm_movelh_ps(T0, T2); // low halves:  [m00, m10, m20, m30]
+        ret.mVec128[1] = _mm_movehl_ps(T2, T0); // high halves: [m01, m11, m21, m31]
+        ret.mVec128[2] = _mm_movelh_ps(T1, T3); // low halves:  [m02, m12, m22, m32]
+        ret.mVec128[3] = _mm_movehl_ps(T3, T1); // high halves: [m03, m13, m23, m33]
 #else
-    dest.m[0][0] = src.m[0][0];
-    dest.m[0][1] = src.m[0][1];
-    dest.m[0][2] = src.m[0][2];
-    dest.m[0][3] = src.m[0][3];
-    dest.m[1][0] = src.m[1][0];
-    dest.m[1][1] = src.m[1][1];
-    dest.m[1][2] = src.m[1][2];
-    dest.m[1][3] = src.m[1][3];
-    dest.m[2][0] = src.m[2][0];
-    dest.m[2][1] = src.m[2][1];
-    dest.m[2][2] = src.m[2][2];
-    dest.m[2][3] = src.m[2][3];
-    dest.m[3][0] = src.m[3][0];
-    dest.m[3][1] = src.m[3][1];
-    dest.m[3][2] = src.m[3][2];
-    dest.m[3][3] = src.m[3][3];
+        ret.m[0][0] = m[0][0]; // scalar fallback: element-by-element swap of indices
+        ret.m[1][0] = m[0][1];
+        ret.m[2][0] = m[0][2];
+        ret.m[3][0] = m[0][3];
+        
+        ret.m[0][1] = m[1][0];
+        ret.m[1][1] = m[1][1];
+        ret.m[2][1] = m[1][2];
+        ret.m[3][1] = m[1][3];
+        
+        ret.m[0][2] = m[2][0];
+        ret.m[1][2] = m[2][1];
+        ret.m[2][2] = m[2][2];
+        ret.m[3][2] = m[2][3];
+        
+        ret.m[0][3] = m[3][0];
+        ret.m[1][3] = m[3][1];
+        ret.m[2][3] = m[3][2];
+        ret.m[3][3] = m[3][3];
 #endif
+        return ret;
+    }
+    inline TMatrix4f& operator=(const TMatrix4f& other) // Copies all 16 floats from other and returns *this
+    {
+#if __SSE__
+        mVec128[0] = other.mVec128[0]; // SSE build: one 128-bit copy per mVec128 entry
+        mVec128[1] = other.mVec128[1];
+        mVec128[2] = other.mVec128[2];
+        mVec128[3] = other.mVec128[3];
+#else
+        m[0][0] = other.m[0][0]; // scalar build: copy each element individually
+        m[0][1] = other.m[0][1];
+        m[0][2] = other.m[0][2];
+        m[0][3] = other.m[0][3];
+        m[1][0] = other.m[1][0];
+        m[1][1] = other.m[1][1];
+        m[1][2] = other.m[1][2];
+        m[1][3] = other.m[1][3];
+        m[2][0] = other.m[2][0];
+        m[2][1] = other.m[2][1];
+        m[2][2] = other.m[2][2];
+        m[2][3] = other.m[2][3];
+        m[3][0] = other.m[3][0];
+        m[3][1] = other.m[3][1];
+        m[3][2] = other.m[3][2];
+        m[3][3] = other.m[3][3];
+#endif
+        return *this;
+    }
+};
+static inline TMatrix4f operator*(const TMatrix4f& lhs, const TMatrix4f& rhs) // Matrix product: ret.m[i][k] = sum over j of lhs.m[j][k] * rhs.m[i][j]
+{
+    TMatrix4f ret;
+#if __SSE__
+    unsigned i;
+    for (i=0 ; i<4 ; ++i) { // each result vector i is a sum of lhs vectors weighted by the four lanes of rhs vector i
+        ret.mVec128[i] =
+        _mm_add_ps(_mm_add_ps(_mm_add_ps(
+                   _mm_mul_ps(lhs.mVec128[0], _mm_shuffle_ps(rhs.mVec128[i], rhs.mVec128[i], _MM_SHUFFLE(0, 0, 0, 0))),
+                   _mm_mul_ps(lhs.mVec128[1], _mm_shuffle_ps(rhs.mVec128[i], rhs.mVec128[i], _MM_SHUFFLE(1, 1, 1, 1)))),
+                   _mm_mul_ps(lhs.mVec128[2], _mm_shuffle_ps(rhs.mVec128[i], rhs.mVec128[i], _MM_SHUFFLE(2, 2, 2, 2)))),
+                   _mm_mul_ps(lhs.mVec128[3], _mm_shuffle_ps(rhs.mVec128[i], rhs.mVec128[i], _MM_SHUFFLE(3, 3, 3, 3))));
+    }
+#else
+    ret.m[0][0] = lhs.m[0][0]*rhs.m[0][0] + lhs.m[1][0]*rhs.m[0][1] + lhs.m[2][0]*rhs.m[0][2] + lhs.m[3][0]*rhs.m[0][3];
+    ret.m[1][0] = lhs.m[0][0]*rhs.m[1][0] + lhs.m[1][0]*rhs.m[1][1] + lhs.m[2][0]*rhs.m[1][2] + lhs.m[3][0]*rhs.m[1][3];
+    ret.m[2][0] = lhs.m[0][0]*rhs.m[2][0] + lhs.m[1][0]*rhs.m[2][1] + lhs.m[2][0]*rhs.m[2][2] + lhs.m[3][0]*rhs.m[2][3];
+    ret.m[3][0] = lhs.m[0][0]*rhs.m[3][0] + lhs.m[1][0]*rhs.m[3][1] + lhs.m[2][0]*rhs.m[3][2] + lhs.m[3][0]*rhs.m[3][3];
+    
+    ret.m[0][1] = lhs.m[0][1]*rhs.m[0][0] + lhs.m[1][1]*rhs.m[0][1] + lhs.m[2][1]*rhs.m[0][2] + lhs.m[3][1]*rhs.m[0][3];
+    ret.m[1][1] = lhs.m[0][1]*rhs.m[1][0] + lhs.m[1][1]*rhs.m[1][1] + lhs.m[2][1]*rhs.m[1][2] + lhs.m[3][1]*rhs.m[1][3];
+    ret.m[2][1] = lhs.m[0][1]*rhs.m[2][0] + lhs.m[1][1]*rhs.m[2][1] + lhs.m[2][1]*rhs.m[2][2] + lhs.m[3][1]*rhs.m[2][3];
+    ret.m[3][1] = lhs.m[0][1]*rhs.m[3][0] + lhs.m[1][1]*rhs.m[3][1] + lhs.m[2][1]*rhs.m[3][2] + lhs.m[3][1]*rhs.m[3][3];
+    
+    ret.m[0][2] = lhs.m[0][2]*rhs.m[0][0] + lhs.m[1][2]*rhs.m[0][1] + lhs.m[2][2]*rhs.m[0][2] + lhs.m[3][2]*rhs.m[0][3];
+    ret.m[1][2] = lhs.m[0][2]*rhs.m[1][0] + lhs.m[1][2]*rhs.m[1][1] + lhs.m[2][2]*rhs.m[1][2] + lhs.m[3][2]*rhs.m[1][3];
+    ret.m[2][2] = lhs.m[0][2]*rhs.m[2][0] + lhs.m[1][2]*rhs.m[2][1] + lhs.m[2][2]*rhs.m[2][2] + lhs.m[3][2]*rhs.m[2][3];
+    ret.m[3][2] = lhs.m[0][2]*rhs.m[3][0] + lhs.m[1][2]*rhs.m[3][1] + lhs.m[2][2]*rhs.m[3][2] + lhs.m[3][2]*rhs.m[3][3];
+    
+    ret.m[0][3] = lhs.m[0][3]*rhs.m[0][0] + lhs.m[1][3]*rhs.m[0][1] + lhs.m[2][3]*rhs.m[0][2] + lhs.m[3][3]*rhs.m[0][3];
+    ret.m[1][3] = lhs.m[0][3]*rhs.m[1][0] + lhs.m[1][3]*rhs.m[1][1] + lhs.m[2][3]*rhs.m[1][2] + lhs.m[3][3]*rhs.m[1][3];
+    ret.m[2][3] = lhs.m[0][3]*rhs.m[2][0] + lhs.m[1][3]*rhs.m[2][1] + lhs.m[2][3]*rhs.m[2][2] + lhs.m[3][3]*rhs.m[2][3];
+    ret.m[3][3] = lhs.m[0][3]*rhs.m[3][0] + lhs.m[1][3]*rhs.m[3][1] + lhs.m[2][3]*rhs.m[3][2] + lhs.m[3][3]*rhs.m[3][3]; // third term uses rhs.m[3][2], matching the SSE path
+#endif
+    return ret;
 }
 extern const TMatrix4f kIdentityMtx4;
 
@@ -73,7 +145,7 @@ public:
     {
         m_projType = PROJ_ORTHO;
         m_ortho = SProjOrtho();
-        copyMatrix4f(m_mtx, kIdentityMtx4);
+        m_mtx = kIdentityMtx4;
     }
     CProjection(const CProjection& other) {*this = other;}
     CProjection(const SProjOrtho& ortho) {setOrtho(ortho);}
@@ -85,7 +157,7 @@ public:
         {
             m_projType = other.m_projType;
             m_ortho = other.m_ortho;
-            copyMatrix4f(m_mtx, other.m_mtx);
+            m_mtx = other.m_mtx;
         }
         return *this;
     }
@@ -109,7 +181,7 @@ public:
         return m_persp;
     }
     
-    inline const TMatrix4f& getCachedMatrix() {return m_mtx;}
+    inline const TMatrix4f& getCachedMatrix() const {return m_mtx;} // read-only reference to m_mtx; const-qualified so it can be called on a const CProjection
     
 protected:
 
diff --git a/CVector3f.cpp b/CVector3f.cpp
index 42efdfd..5869dea 100644
--- a/CVector3f.cpp
+++ b/CVector3f.cpp
@@ -4,32 +4,10 @@
 #include <assert.h>
 #include "Math.hpp"
 
-const CVector3f CVector3f::skOne = CVector3f(1);
+const CVector3f CVector3f::skOne = CVector3f(1.0f); // float literal: no implicit double -> float conversion at the constructor call
+const CVector3f CVector3f::skNegOne = CVector3f(-1.0f); // float literal, same reason as skOne
 const CVector3f CVector3f::skZero;
 
-void CVector3f::normalize()
-{
-    float mag = length();
-    assert(mag != 0.0);
-    
-    x /= mag;
-    y /= mag;
-    z /= mag;
-}
-
-CVector3f CVector3f::normalized() const
-{
-    CVector3f ret;
-    float mag = length();
-    assert(mag != 0.0);
-
-    ret.x = x/mag;
-    ret.y = y/mag;
-    ret.z = z/mag;
-
-    return ret;
-}
-
 float CVector3f::getAngleDiff(const CVector3f& a, const CVector3f& b)
 {
     float mag1 = a.length();
diff --git a/CVector3f.hpp b/CVector3f.hpp
index 975f233..df3497a 100644
--- a/CVector3f.hpp
+++ b/CVector3f.hpp
@@ -4,6 +4,7 @@
 #include "Global.hpp"
 #include <Athena/IStreamReader.hpp>
 #include <math.h>
+#include <assert.h>
 
 typedef union
 {
@@ -18,7 +19,7 @@ class ZE_ALIGN(16) CVector3f
 public:
     ZE_DECLARE_ALIGNED_ALLOCATOR();
     
-    CVector3f() {zeroOut();}
+    inline CVector3f() {zeroOut();} // default construction delegates all initialization to zeroOut()
 #if __SSE__
     CVector3f(const __m128& mVec128) : mVec128(mVec128) {v[3] = 0.0f;}
 #endif
@@ -148,8 +149,20 @@ public:
 #endif
         return *this;
     }
-    void normalize();
-    CVector3f normalized() const;
+    inline void normalize() // Scales this vector in place by 1 / length().
+    {
+        float mag = length();
+        assert(mag != 0.0f); // a zero-length vector cannot be normalized
+        mag = 1.0f / mag; // single-precision reciprocal; avoids a float -> double -> float round trip
+        *this *= mag;
+    }
+    inline CVector3f normalized() const // Returns a copy of this vector scaled by 1 / length(); *this is unchanged.
+    {
+        float mag = length();
+        assert(mag != 0.0f); // a zero-length vector cannot be normalized
+        mag = 1.0f / mag; // single-precision reciprocal; avoids a float -> double -> float round trip
+        return *this * mag;
+    }
     inline CVector3f cross(const CVector3f& rhs) const
     {
         return CVector3f(y * rhs.z - z * rhs.y, z * rhs.x - x * rhs.z, x * rhs.y - y * rhs.x);
@@ -236,6 +249,7 @@ public:
     };
 
     static const CVector3f skOne;
+    static const CVector3f skNegOne;
     static const CVector3f skZero;
 };