SIMD refactor

2025-08-06 20:25:59 +00:00 · 2018-12-07 15:16:50 -10:00 · 2018-12-07 15:16:50 -10:00 · e8dfecbb6e
commit e8dfecbb6e
parent d881e58f62
49 changed files with 6047 additions and 4721 deletions
--- a/.clang-format
+++ b/.clang-format
@ -1,5 +1,5 @@
 ---
-IndentWidth: 4
+IndentWidth: 2
 ColumnLimit: 128
 UseTab: Never
 ---
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -40,7 +40,6 @@ add_library(zeus
    include/zeus/CColor.hpp
    include/zeus/Global.hpp
    include/zeus/zeus.hpp
-    include/zeus/TVectorUnion.hpp
    include/zeus/CVector2i.hpp
    include/zeus/CVector2f.hpp
    include/zeus/CVector3f.hpp
@ -56,7 +55,11 @@ add_library(zeus
    include/zeus/CSphere.hpp
    include/zeus/CUnitVector.hpp
    include/zeus/CMRay.hpp
-    include/zeus/CEulerAngles.hpp)
+    include/zeus/CEulerAngles.hpp
+    include/zeus/simd/simd.hpp
+    include/zeus/simd/simd_sse.hpp
+    include/zeus/simd/simd_avx.hpp
+    include/zeus/simd/parallelism_v2_simd.hpp)

 add_subdirectory(test)

--- a/include/zeus/CAABox.hpp
+++ b/include/zeus/CAABox.hpp
@ -6,412 +6,367 @@
 #include "zeus/CLineSeg.hpp"
 #include "zeus/CSphere.hpp"
 #include "zeus/Math.hpp"
+
 #if ZE_ATHENA_TYPES
+
 #include <athena/IStreamReader.hpp>
+
 #endif

-namespace zeus
-{
-class alignas(16) CAABox
-{
+namespace zeus {
+class CAABox {
 public:
-    ZE_DECLARE_ALIGNED_ALLOCATOR();
+  enum class EBoxEdgeId {
+    Z0,
+    X0,
+    Z1,
+    X1,
+    Z2,
+    X2,
+    Z3,
+    X3,
+    Y0,
+    Y1,
+    Y2,
+    Y3
+  };

-    enum class EBoxEdgeId
-    {
-        Z0,
-        X0,
-        Z1,
-        X1,
-        Z2,
-        X2,
-        Z3,
-        X3,
-        Y0,
-        Y1,
-        Y2,
-        Y3
-    };
+  enum class EBoxFaceID {
+  };

-    enum class EBoxFaceID
-    {
-    };
+  static const CAABox skInvertedBox;
+  static const CAABox skNullBox;

-    static const CAABox skInvertedBox;
-    static const CAABox skNullBox;
+  CVector3f min;
+  CVector3f max;

-    CVector3f min;
-    CVector3f max;
+  // set default AABox to insane inverse min/max to allow for accumulation
+  CAABox() : CAABox(1e16f, -1e16f) {}

-    // set default AABox to insane inverse min/max to allow for accumulation
-    CAABox() : CAABox(1e16f, -1e16f) {}
+  CAABox(const CVector3f& min, const CVector3f& max) : min(min), max(max) {}

-    CAABox(const CVector3f& min, const CVector3f& max) : min(min), max(max) {}
+  CAABox(float min, float max) : min(CVector3f(min)), max(CVector3f(max)) {}

-    CAABox(float min, float max) : min(CVector3f(min)), max(CVector3f(max)) {}
+  CAABox(float minX, float minY, float minZ, float maxX, float maxY, float maxZ)
+    : min(minX, minY, minZ), max(maxX, maxY, maxZ) {
+  }

-    CAABox(float minX, float minY, float minZ, float maxX, float maxY, float maxZ)
-    : min(minX, minY, minZ), max(maxX, maxY, maxZ)
-    {
-    }
 #if ZE_ATHENA_TYPES
-    inline void readBoundingBoxBig(athena::io::IStreamReader& in)
-    {
-        min.readBig(in);
-        max.readBig(in);
-    }
-    static inline CAABox ReadBoundingBoxBig(athena::io::IStreamReader& in)
-    {
-        CAABox ret;
-        ret.readBoundingBoxBig(in);
-        return ret;
-    }
+
+  void readBoundingBoxBig(athena::io::IStreamReader& in) {
+    min.readBig(in);
+    max.readBig(in);
+  }
+
+  static CAABox ReadBoundingBoxBig(athena::io::IStreamReader& in) {
+    CAABox ret;
+    ret.readBoundingBoxBig(in);
+    return ret;
+  }

 #endif

-    float distanceFromPointSquared(const CVector3f& other) const
-    {
-        float dist = 0;
-        for (int i = 0; i < 3; i++)
-        {
-            if (other[i] < min[i])
-            {
-                const float tmp = (min[i] - other[i]);
-                dist += tmp * tmp;
-            }
-            else if (other[i] > max[i])
-            {
-                const float tmp = (other[i] - max[i]);
-                dist += tmp * tmp;
-            }
-        }
-
-        return dist;
+  float distanceFromPointSquared(const CVector3f& other) const {
+    float dist = 0;
+    for (int i = 0; i < 3; i++) {
+      if (other[i] < min[i]) {
+        const float tmp = (min[i] - other[i]);
+        dist += tmp * tmp;
+      } else if (other[i] > max[i]) {
+        const float tmp = (other[i] - max[i]);
+        dist += tmp * tmp;
+      }
    }

-    float distanceFromPoint(const CVector3f& other) const { return std::sqrt(distanceFromPointSquared(other)); }
+    return dist;
+  }

-    inline bool intersects(const CAABox& other) const
-    {
-        bool x1 = (max[0] >= other.min[0]);
-        bool x2 = (min[0] <= other.max[0]);
-        bool y1 = (max[1] >= other.min[1]);
-        bool y2 = (min[1] <= other.max[1]);
-        bool z1 = (max[2] >= other.min[2]);
-        bool z2 = (min[2] <= other.max[2]);
-        return x1 && x2 && y1 && y2 && z1 && z2;
+  float distanceFromPoint(const CVector3f& other) const { return std::sqrt(distanceFromPointSquared(other)); }
+
+  bool intersects(const CAABox& other) const {
+    bool x1 = (max[0] >= other.min[0]);
+    bool x2 = (min[0] <= other.max[0]);
+    bool y1 = (max[1] >= other.min[1]);
+    bool y2 = (min[1] <= other.max[1]);
+    bool z1 = (max[2] >= other.min[2]);
+    bool z2 = (min[2] <= other.max[2]);
+    return x1 && x2 && y1 && y2 && z1 && z2;
+  }
+
+  bool intersects(const CSphere& other) const {
+    return distanceFromPointSquared(other.position) <= other.radius * other.radius;
+  }
+
+  float intersectionRadius(const CSphere& other) const {
+    float dist = distanceFromPoint(other.position);
+    return (dist < other.radius) ? dist : -1.f;
+  }
+
+  CAABox booleanIntersection(const CAABox& other) const {
+    CVector3f minVec = CVector3f::skZero;
+    CVector3f maxVec = CVector3f::skZero;
+
+    for (int i = 0; i < 3; ++i) {
+      if (min[i] <= other.min[i] && max[i] >= other.max[i]) {
+        minVec[i] = other.min[i];
+        maxVec[i] = other.max[i];
+      } else if (other.min[i] <= min[i] && other.max[i] >= max[i]) {
+        minVec[i] = min[i];
+        maxVec[i] = max[i];
+      } else if (other.min[i] <= min[i] && other.max[i] >= min[i]) {
+        minVec[i] = min[i];
+        maxVec[i] = other.max[i];
+      } else if (other.min[i] <= max[i] && other.max[i] >= max[i]) {
+        minVec[i] = other.min[i];
+        maxVec[i] = max[i];
+      }
    }

-    bool intersects(const CSphere& other) const
-    {
-        return distanceFromPointSquared(other.position) <= other.radius * other.radius;
+    return {minVec, maxVec};
+  }
+
+  bool inside(const CAABox& other) const {
+    bool x = min[0] >= other.min[0] && max[0] <= other.max[0];
+    bool y = min[1] >= other.min[1] && max[1] <= other.max[1];
+    bool z = min[2] >= other.min[2] && max[2] <= other.max[2];
+    return x && y && z;
+  }
+
+  bool insidePlane(const CPlane& plane) const {
+    CVector3f vmax;
+    /* X axis */
+    if (plane.x() >= 0.f)
+      vmax[0] = max[0];
+    else
+      vmax[0] = min[0];
+    /* Y axis */
+    if (plane.y() >= 0.f)
+      vmax[1] = max[1];
+    else
+      vmax[1] = min[1];
+    /* Z axis */
+    if (plane.z() >= 0.f)
+      vmax[2] = max[2];
+    else
+      vmax[2] = min[2];
+    return plane.normal().dot(vmax) + plane.d() >= 0.f;
+  }
+
+  CVector3f center() const { return (min + max) * 0.5f; }
+
+  CVector3f extents() const { return (max - min) * 0.5f; }
+
+  float volume() const {
+    auto delta = max - min;
+    return delta.x() * delta.y() * delta.z();
+  }
+
+  CLineSeg getEdge(EBoxEdgeId id) const {
+    switch (id) {
+    case EBoxEdgeId::Z0:
+    default:
+      return CLineSeg({min.x(), min.y(), max.z()}, {min.x(), min.y(), min.z()});
+    case EBoxEdgeId::X0:
+      return CLineSeg({min.x(), min.y(), min.z()}, {max.x(), min.y(), min.z()});
+    case EBoxEdgeId::Z1:
+      return CLineSeg({max.x(), min.y(), min.z()}, {max.x(), min.y(), max.z()});
+    case EBoxEdgeId::X1:
+      return CLineSeg({max.x(), min.y(), max.z()}, {min.x(), min.y(), max.z()});
+    case EBoxEdgeId::Z2:
+      return CLineSeg({max.x(), max.y(), max.z()}, {max.x(), max.y(), min.z()});
+    case EBoxEdgeId::X2:
+      return CLineSeg({max.x(), max.y(), min.z()}, {min.x(), max.y(), min.z()});
+    case EBoxEdgeId::Z3:
+      return CLineSeg({min.x(), max.y(), min.z()}, {min.x(), max.y(), max.z()});
+    case EBoxEdgeId::X3:
+      return CLineSeg({min.x(), max.y(), max.z()}, {max.x(), max.y(), max.z()});
+    case EBoxEdgeId::Y0:
+      return CLineSeg({min.x(), min.y(), max.z()}, {min.x(), max.y(), max.z()});
+    case EBoxEdgeId::Y1:
+      return CLineSeg({min.x(), min.y(), min.z()}, {min.x(), max.y(), min.z()});
+    case EBoxEdgeId::Y2:
+      return CLineSeg({max.x(), min.y(), min.z()}, {max.x(), max.y(), min.z()});
+    case EBoxEdgeId::Y3:
+      return CLineSeg({max.x(), min.y(), max.z()}, {max.x(), max.y(), max.z()});
+    }
+  }
+
+  CAABox getTransformedAABox(const CTransform& xfrm) const {
+    CAABox box;
+    CVector3f point = xfrm * getPoint(0);
+    box.accumulateBounds(point);
+    point = xfrm * getPoint(1);
+    box.accumulateBounds(point);
+    point = xfrm * getPoint(2);
+    box.accumulateBounds(point);
+    point = xfrm * getPoint(3);
+    box.accumulateBounds(point);
+    point = xfrm * getPoint(4);
+    box.accumulateBounds(point);
+    point = xfrm * getPoint(5);
+    box.accumulateBounds(point);
+    point = xfrm * getPoint(6);
+    box.accumulateBounds(point);
+    point = xfrm * getPoint(7);
+    box.accumulateBounds(point);
+    return box;
+  }
+
+  void accumulateBounds(const CVector3f& point) {
+    if (min.x() > point.x())
+      min.x() = point.x();
+    if (min.y() > point.y())
+      min.y() = point.y();
+    if (min.z() > point.z())
+      min.z() = point.z();
+    if (max.x() < point.x())
+      max.x() = point.x();
+    if (max.y() < point.y())
+      max.y() = point.y();
+    if (max.z() < point.z())
+      max.z() = point.z();
+  }
+
+  void accumulateBounds(const CAABox& other) {
+    accumulateBounds(other.min);
+    accumulateBounds(other.max);
+  }
+
+  bool pointInside(const CVector3f& other) const {
+    return (min.x() <= other.x() && other.x() <= max.x() &&
+            min.y() <= other.y() && other.y() <= max.y() &&
+            min.z() <= other.z() && other.z() <= max.z());
+  }
+
+  CVector3f closestPointAlongVector(const CVector3f& other) const {
+    return {(other.x() >= 0.f ? min.x() : max.x()),
+            (other.y() >= 0.f ? min.y() : max.y()),
+            (other.z() >= 0.f ? min.z() : max.z())};
+  }
+
+  CVector3f furthestPointAlongVector(const CVector3f& other) const {
+    return {(other.x() >= 0.f ? max.x() : min.x()),
+            (other.y() >= 0.f ? max.y() : min.y()),
+            (other.z() >= 0.f ? max.z() : min.z())};
+  }
+
+  float distanceBetween(const CAABox& other) {
+    int intersects = 0;
+    if (max.x() >= other.min.x() && min.x() <= other.max.x())
+      intersects |= 0x1;
+    if (max.y() >= other.min.y() && min.y() <= other.max.y())
+      intersects |= 0x2;
+    if (max.z() >= other.min.z() && min.z() <= other.max.z())
+      intersects |= 0x4;
+
+    float minX, maxX;
+    if (max.x() < other.min.x()) {
+      minX = max.x();
+      maxX = other.min.x();
+    } else {
+      minX = min.x();
+      maxX = other.max.x();
    }

-    float intersectionRadius(const CSphere& other) const
-    {
-        float dist = distanceFromPoint(other.position);
-        return (dist < other.radius) ? dist : -1.f;
+    float minY, maxY;
+    if (max.y() < other.min.y()) {
+      minY = max.y();
+      maxY = other.min.y();
+    } else {
+      minY = min.y();
+      maxY = other.max.y();
    }

-    inline CAABox booleanIntersection(const CAABox& other) const
-    {
-        CVector3f minVec = CVector3f::skZero;
-        CVector3f maxVec = CVector3f::skZero;
-
-        for (int i = 0; i < 3; ++i)
-        {
-            if (min[i] <= other.min[i] && max[i] >= other.max[i])
-            {
-                minVec[i] = other.min[i];
-                maxVec[i] = other.max[i];
-            }
-            else if (other.min[i] <= min[i] && other.max[i] >= max[i])
-            {
-                minVec[i] = min[i];
-                maxVec[i] = max[i];
-            }
-            else if (other.min[i] <= min[i] && other.max[i] >= min[i])
-            {
-                minVec[i] = min[i];
-                maxVec[i] = other.max[i];
-            }
-            else if (other.min[i] <= max[i] && other.max[i] >= max[i])
-            {
-                minVec[i] = other.min[i];
-                maxVec[i] = max[i];
-            }
-        }
-
-        return {minVec, maxVec};
+    float minZ, maxZ;
+    if (max.z() < other.min.z()) {
+      minZ = max.z();
+      maxZ = other.min.z();
+    } else {
+      minZ = min.z();
+      maxZ = other.max.z();
    }

-    inline bool inside(const CAABox& other) const
-    {
-        bool x = min[0] >= other.min[0] && max[0] <= other.max[0];
-        bool y = min[1] >= other.min[1] && max[1] <= other.max[1];
-        bool z = min[2] >= other.min[2] && max[2] <= other.max[2];
-        return x && y && z;
+    switch (intersects) {
+    case 0:
+      return zeus::CVector3f(maxX - minX, maxY - minY, maxZ - minZ).magnitude();
+    case 1:
+      return zeus::CVector2f(maxY - minY, maxZ - minZ).magnitude();
+    case 2:
+      return zeus::CVector2f(maxX - minX, maxZ - minZ).magnitude();
+    case 3:
+      return std::fabs(maxZ - minZ);
+    case 4:
+      return zeus::CVector2f(maxX - minX, maxY - minY).magnitude();
+    case 5:
+      return std::fabs(maxY - minY);
+    case 6:
+      return std::fabs(maxX - minX);
+    case 7:
+    default:
+      return 0.f;
    }
+  }

-    inline bool insidePlane(const CPlane& plane) const
-    {
-        CVector3f vmax;
-        /* X axis */
-        if (plane.a >= 0)
-            vmax[0] = max[0];
-        else
-            vmax[0] = min[0];
-        /* Y axis */
-        if (plane.b >= 0)
-            vmax[1] = max[1];
-        else
-            vmax[1] = min[1];
-        /* Z axis */
-        if (plane.c >= 0)
-            vmax[2] = max[2];
-        else
-            vmax[2] = min[2];
-        return plane.vec.dot(vmax) + plane.d >= 0.f;
-    }
+  CVector3f getPoint(const int point) const {
+    const CVector3f* vecs = &min;
+    return CVector3f(vecs[(point & 1) != 0].x(), vecs[(point & 2) != 0].y(), vecs[(point & 4) != 0].z());
+  }

-    CVector3f center() const { return (min + max) * 0.5f; }
+  CVector3f clampToBox(const CVector3f& vec) const {
+    CVector3f ret = vec;
+    ret.x() = clamp(min.x(), float(ret.x()), max.x());
+    ret.y() = clamp(min.y(), float(ret.y()), max.y());
+    ret.z() = clamp(min.z(), float(ret.z()), max.z());
+    return ret;
+  }

-    CVector3f extents() const { return (max - min) * 0.5f; }
+  void splitX(CAABox& negX, CAABox& posX) const {
+    float midX = (max.x() - min.x()) * .5f + min.x();
+    posX.max = max;
+    posX.min = min;
+    posX.min.x() = midX;
+    negX.max = max;
+    negX.max.x() = midX;
+    negX.min = min;
+  }

-    float volume() const { return (max.x - min.x) * (max.y - min.y) * (max.z - min.z); }
+  void splitY(CAABox& negY, CAABox& posY) const {
+    float midY = (max.y() - min.y()) * .5f + min.y();
+    posY.max = max;
+    posY.min = min;
+    posY.min.y() = midY;
+    negY.max = max;
+    negY.max.y() = midY;
+    negY.min = min;
+  }

-    inline CLineSeg getEdge(EBoxEdgeId id) const
-    {
-        switch (id)
-        {
-        case EBoxEdgeId::Z0:
-        default:
-            return CLineSeg({min.x, min.y, max.z}, {min.x, min.y, min.z});
-        case EBoxEdgeId::X0:
-            return CLineSeg({min.x, min.y, min.z}, {max.x, min.y, min.z});
-        case EBoxEdgeId::Z1:
-            return CLineSeg({max.x, min.y, min.z}, {max.x, min.y, max.z});
-        case EBoxEdgeId::X1:
-            return CLineSeg({max.x, min.y, max.z}, {min.x, min.y, max.z});
-        case EBoxEdgeId::Z2:
-            return CLineSeg({max.x, max.y, max.z}, {max.x, max.y, min.z});
-        case EBoxEdgeId::X2:
-            return CLineSeg({max.x, max.y, min.z}, {min.x, max.y, min.z});
-        case EBoxEdgeId::Z3:
-            return CLineSeg({min.x, max.y, min.z}, {min.x, max.y, max.z});
-        case EBoxEdgeId::X3:
-            return CLineSeg({min.x, max.y, max.z}, {max.x, max.y, max.z});
-        case EBoxEdgeId::Y0:
-            return CLineSeg({min.x, min.y, max.z}, {min.x, max.y, max.z});
-        case EBoxEdgeId::Y1:
-            return CLineSeg({min.x, min.y, min.z}, {min.x, max.y, min.z});
-        case EBoxEdgeId::Y2:
-            return CLineSeg({max.x, min.y, min.z}, {max.x, max.y, min.z});
-        case EBoxEdgeId::Y3:
-            return CLineSeg({max.x, min.y, max.z}, {max.x, max.y, max.z});
-        }
-    }
+  void splitZ(CAABox& negZ, CAABox& posZ) const {
+    float midZ = (max.z() - min.z()) * .5f + min.z();
+    posZ.max = max;
+    posZ.min = min;
+    posZ.min.z() = midZ;
+    negZ.max = max;
+    negZ.max.z() = midZ;
+    negZ.min = min;
+  }

-    inline CAABox getTransformedAABox(const CTransform& xfrm) const
-    {
-        CAABox box;
-        CVector3f point = xfrm * getPoint(0);
-        box.accumulateBounds(point);
-        point = xfrm * getPoint(1);
-        box.accumulateBounds(point);
-        point = xfrm * getPoint(2);
-        box.accumulateBounds(point);
-        point = xfrm * getPoint(3);
-        box.accumulateBounds(point);
-        point = xfrm * getPoint(4);
-        box.accumulateBounds(point);
-        point = xfrm * getPoint(5);
-        box.accumulateBounds(point);
-        point = xfrm * getPoint(6);
-        box.accumulateBounds(point);
-        point = xfrm * getPoint(7);
-        box.accumulateBounds(point);
-        return box;
-    }
+  bool invalid() { return (max.x() < min.x() || max.y() < min.y() || max.z() < min.z()); }

-    inline void accumulateBounds(const CVector3f& point)
-    {
-        if (min.x > point.x)
-            min.x = point.x;
-        if (min.y > point.y)
-            min.y = point.y;
-        if (min.z > point.z)
-            min.z = point.z;
-        if (max.x < point.x)
-            max.x = point.x;
-        if (max.y < point.y)
-            max.y = point.y;
-        if (max.z < point.z)
-            max.z = point.z;
-    }
-
-    inline void accumulateBounds(const CAABox& other)
-    {
-        accumulateBounds(other.min);
-        accumulateBounds(other.max);
-    }
-
-    inline bool pointInside(const CVector3f& other) const
-    {
-        return (min.x <= other.x && other.x <= max.x &&
-                min.y <= other.y && other.y <= max.y &&
-                min.z <= other.z && other.z <= max.z);
-    }
-
-    inline CVector3f closestPointAlongVector(const CVector3f& other) const
-    {
-        return {(other.x >= 0.f ? min.x : max.x),
-                (other.y >= 0.f ? min.y : max.y),
-                (other.z >= 0.f ? min.z : max.z)};
-    }
-
-    inline CVector3f furthestPointAlongVector(const CVector3f& other) const
-    {
-        return {(other.x >= 0.f ? max.x : min.x),
-                (other.y >= 0.f ? max.y : min.y),
-                (other.z >= 0.f ? max.z : min.z)};
-    }
-
-    inline float distanceBetween(const CAABox& other)
-    {
-        int intersects = 0;
-        if (max.x >= other.min.x && min.x <= other.max.x)
-            intersects |= 0x1;
-        if (max.y >= other.min.y && min.y <= other.max.y)
-            intersects |= 0x2;
-        if (max.z >= other.min.z && min.z <= other.max.z)
-            intersects |= 0x4;
-
-        float minX, maxX;
-        if (max.x < other.min.x)
-        {
-            minX = max.x;
-            maxX = other.min.x;
-        }
-        else
-        {
-            minX = min.x;
-            maxX = other.max.x;
-        }
-
-        float minY, maxY;
-        if (max.y < other.min.y)
-        {
-            minY = max.y;
-            maxY = other.min.y;
-        }
-        else
-        {
-            minY = min.y;
-            maxY = other.max.y;
-        }
-
-        float minZ, maxZ;
-        if (max.z < other.min.z)
-        {
-            minZ = max.z;
-            maxZ = other.min.z;
-        }
-        else
-        {
-            minZ = min.z;
-            maxZ = other.max.z;
-        }
-
-        switch (intersects)
-        {
-        case 0:
-            return zeus::CVector3f(maxX - minX, maxY - minY, maxZ - minZ).magnitude();
-        case 1:
-            return zeus::CVector2f(maxY - minY, maxZ - minZ).magnitude();
-        case 2:
-            return zeus::CVector2f(maxX - minX, maxZ - minZ).magnitude();
-        case 3:
-            return std::fabs(maxZ - minZ);
-        case 4:
-            return zeus::CVector2f(maxX - minX, maxY - minY).magnitude();
-        case 5:
-            return std::fabs(maxY - minY);
-        case 6:
-            return std::fabs(maxX - minX);
-        case 7:
-        default:
-            return 0.f;
-        }
-    }
-
-    inline CVector3f getPoint(const int point) const
-    {
-        const CVector3f* vecs = &min;
-        return CVector3f(vecs[(point & 1) != 0].x, vecs[(point & 2) != 0].y, vecs[(point & 4) != 0].z);
-    }
-
-    inline CVector3f clampToBox(const CVector3f& vec)
-    {
-        CVector3f ret = vec;
-        clamp(min.x, ret.x, max.x);
-        clamp(min.y, ret.y, max.y);
-        clamp(min.z, ret.z, max.z);
-        return ret;
-    }
-
-    inline void splitX(CAABox& negX, CAABox& posX) const
-    {
-        float midX = (max.x - min.x) * .5f + min.x;
-        posX.max = max;
-        posX.min = min;
-        posX.min.x = midX;
-        negX.max = max;
-        negX.max.x = midX;
-        negX.min = min;
-    }
-
-    inline void splitY(CAABox& negY, CAABox& posY) const
-    {
-        float midY = (max.y - min.y) * .5f + min.y;
-        posY.max = max;
-        posY.min = min;
-        posY.min.y = midY;
-        negY.max = max;
-        negY.max.y = midY;
-        negY.min = min;
-    }
-
-    inline void splitZ(CAABox& negZ, CAABox& posZ) const
-    {
-        float midZ = (max.z - min.z) * .5f + min.z;
-        posZ.max = max;
-        posZ.min = min;
-        posZ.min.z = midZ;
-        negZ.max = max;
-        negZ.max.z = midZ;
-        negZ.min = min;
-    }
-
-    inline bool invalid() { return (max.x < min.x || max.y < min.y || max.z < min.z); }
-
-    inline float operator[](size_t idx) const
-    {
-        assert(idx < 6);
-        if (idx < 3)
-            return min[idx];
-        else
-            return max[idx-3];
-    }
+  float operator[](size_t idx) const {
+    assert(idx < 6);
+    if (idx < 3)
+      return min[idx];
+    else
+      return max[idx - 3];
+  }
 };

-inline bool operator==(const CAABox& left, const CAABox& right)
-{
-    return (left.min == right.min && left.max == right.max);
+inline bool operator==(const CAABox& left, const CAABox& right) {
+  return (left.min == right.min && left.max == right.max);
 }
-inline bool operator!=(const CAABox& left, const CAABox& right)
-{
-    return (left.min != right.min || left.max != right.max);
+
+inline bool operator!=(const CAABox& left, const CAABox& right) {
+  return (left.min != right.min || left.max != right.max);
 }
 }

--- a/include/zeus/CAxisAngle.hpp
+++ b/include/zeus/CAxisAngle.hpp
@ -4,22 +4,15 @@
 #include "zeus/CVector3f.hpp"
 #include "CUnitVector.hpp"

-namespace zeus
-{
-struct alignas(16) CAxisAngle : CVector3f
-{
-    ZE_DECLARE_ALIGNED_ALLOCATOR();
-
-    CAxisAngle() = default;
-    CAxisAngle(float x, float y, float z) : CVector3f(x, y, z) {}
-    CAxisAngle(const CUnitVector3f& axis, float angle) : CVector3f(angle * axis) {}
-
-    CAxisAngle(const CVector3f& axisAngle) : CVector3f(axisAngle) {}
-
-    float angle() const { return magnitude(); }
-    const CVector3f& getVector() const { return *this; }
-
-    static const CAxisAngle sIdentity;
+namespace zeus {
+struct CAxisAngle : CVector3f {
+  CAxisAngle() = default;
+  CAxisAngle(float x, float y, float z) : CVector3f(x, y, z) {}
+  CAxisAngle(const CUnitVector3f& axis, float angle) : CVector3f(angle * axis) {}
+  CAxisAngle(const CVector3f& axisAngle) : CVector3f(axisAngle) {}
+  float angle() const { return magnitude(); }
+  const CVector3f& getVector() const { return *this; }
+  static const CAxisAngle sIdentity;
 };
 }

--- a/include/zeus/CColor.hpp
+++ b/include/zeus/CColor.hpp
@ -2,11 +2,15 @@

 #include "Global.hpp"
 #include "zeus/Math.hpp"
-#include "TVectorUnion.hpp"
+#include "CVector4f.hpp"
+
 #if ZE_ATHENA_TYPES
-#include <athena/FileReader.hpp>
-#include <athena/FileWriter.hpp>
+
+#include "athena/FileReader.hpp"
+#include "athena/FileWriter.hpp"
+
 #endif
+
 #include <iostream>
 #include <cassert>

@ -20,410 +24,315 @@
 #define COLOR(rgba) rgba
 #endif

-namespace zeus
-{
+namespace zeus {
 typedef uint8_t Comp8;
 typedef uint32_t Comp32;
 constexpr float OneOver255 = 1.f / 255.f;

 typedef union {
-    struct
-    {
-        Comp8 r, g, b, a;
-    };
-    Comp32 rgba;
+  struct {
+    Comp8 r, g, b, a;
+  };
+  Comp32 rgba;
 } RGBA32;

 class CVector4f;

-class alignas(16) CColor
-{
+class CColor {
 public:
-    ZE_DECLARE_ALIGNED_ALLOCATOR();
+  simd<float> mSimd;
+  static const CColor skRed;
+  static const CColor skBlack;
+  static const CColor skBlue;
+  static const CColor skGreen;
+  static const CColor skGrey;
+  static const CColor skOrange;
+  static const CColor skPurple;
+  static const CColor skYellow;
+  static const CColor skWhite;
+  static const CColor skClear;

-    static const CColor skRed;
-    static const CColor skBlack;
-    static const CColor skBlue;
-    static const CColor skGreen;
-    static const CColor skGrey;
-    static const CColor skOrange;
-    static const CColor skPurple;
-    static const CColor skYellow;
-    static const CColor skWhite;
-    static const CColor skClear;
+  CColor() : mSimd(1.f) {}

-#if __SSE__
-    CColor(const __m128& mVec128) : mVec128(mVec128) {}
-#endif
+  CColor(float rgb, float a = 1.0) { splat(rgb, a); }

-    CColor() : r(1.0f), g(1.0f), b(1.0f), a(1.0f) {}
-    CColor(float rgb, float a = 1.0) { splat(rgb, a); }
-    CColor(float r, float g, float b, float a = 1.0f)
-    {
-        v[0] = r;
-        v[1] = g;
-        v[2] = b;
-        v[3] = a;
-    }
-#if ZE_ATHENA_TYPES
-    CColor(const atVec4f& vec)
-#if __SSE__ || __GEKKO_PS__
-    : mVec128(vec.mVec128)
-    {
-    }
-#else
-    {
-        r = vec.vec[0], g = vec.vec[1], b = vec.vec[2], a = vec.vec[3];
-    }
-#endif
-#endif
-
-    CColor(Comp32 rgba) { fromRGBA32(rgba); }
-    CColor(const Comp8* rgba) { fromRGBA8(rgba[0], rgba[1], rgba[2], rgba[3]); }
-
-    CColor(const CVector4f& other);
-    CColor& operator=(const CVector4f& other);
+  CColor(float r, float g, float b, float a = 1.0f) : mSimd(r, g, b, a) {}

 #if ZE_ATHENA_TYPES

-    static inline CColor ReadRGBABig(athena::io::IStreamReader& reader)
-    {
-        CColor ret;
-        ret.readRGBABig(reader);
-        return ret;
-    }
+  CColor(const atVec4f& vec) : mSimd(vec.simd) {}

-    inline void readRGBABig(athena::io::IStreamReader& reader)
-    {
-        r = reader.readFloatBig();
-        g = reader.readFloatBig();
-        b = reader.readFloatBig();
-        a = reader.readFloatBig();
-    }
-    inline void readBGRABig(athena::io::IStreamReader& reader)
-    {
-        b = reader.readFloatBig();
-        g = reader.readFloatBig();
-        r = reader.readFloatBig();
-        a = reader.readFloatBig();
-    }
-    inline void writeRGBABig(athena::io::IStreamWriter& writer) const
-    {
-        writer.writeFloatBig(r);
-        writer.writeFloatBig(g);
-        writer.writeFloatBig(b);
-        writer.writeFloatBig(a);
-    }
-    inline void writeBGRABig(athena::io::IStreamWriter& writer) const
-    {
-        writer.writeFloatBig(b);
-        writer.writeFloatBig(g);
-        writer.writeFloatBig(r);
-        writer.writeFloatBig(a);
-    }
-    inline void writeRGBA8(athena::io::IStreamWriter& writer) const
-    {
-        writer.writeUByte(this->r * 255);
-        writer.writeUByte(this->g * 255);
-        writer.writeUByte(this->b * 255);
-        writer.writeUByte(this->a * 255);
-    }
 #endif

-    inline bool operator==(const CColor& rhs) const { return (r == rhs.r && g == rhs.g && b == rhs.b && a == rhs.a); }
-    inline bool operator!=(const CColor& rhs) const { return !(*this == rhs); }
-    inline CColor operator+(const CColor& rhs) const
-    {
-#if __SSE__
-        return CColor(_mm_add_ps(mVec128, rhs.mVec128));
-#else
-        return CColor(r + rhs.r, g + rhs.g, b + rhs.b, a + rhs.a);
-#endif
-    }
-    inline CColor operator-(const CColor& rhs) const
-    {
-#if __SSE__
-        return CColor(_mm_sub_ps(mVec128, rhs.mVec128));
-#else
-        return CColor(r - rhs.r, g - rhs.g, b - rhs.b, a - rhs.a);
-#endif
-    }
-    inline CColor operator*(const CColor& rhs) const
-    {
-#if __SSE__
-        return CColor(_mm_mul_ps(mVec128, rhs.mVec128));
-#else
-        return CColor(r * rhs.r, g * rhs.g, b * rhs.b, a * rhs.a);
-#endif
-    }
-    inline CColor operator/(const CColor& rhs) const
-    {
-#if __SSE__
-        return CColor(_mm_div_ps(mVec128, rhs.mVec128));
-#else
-        return CColor(r / rhs.r, g / rhs.g, b / rhs.b, a / rhs.a);
-#endif
-    }
-    inline CColor operator+(float val) const
-    {
-#if __SSE__
-        TVectorUnion splat = {{val, val, val, val}};
-        return CColor(_mm_add_ps(mVec128, splat.mVec128));
-#else
-        return CColor(r + val, g + val, b + val, a + val);
-#endif
-    }
-    inline CColor operator-(float val) const
-    {
-#if __SSE__
-        TVectorUnion splat = {{val, val, val, val}};
-        return CColor(_mm_sub_ps(mVec128, splat.mVec128));
-#else
-        return CColor(r - val, g - val, b - val, a - val);
-#endif
-    }
-    inline CColor operator*(float val) const
-    {
-#if __SSE__
-        TVectorUnion splat = {{val, val, val, val}};
-        return CColor(_mm_mul_ps(mVec128, splat.mVec128));
-#else
-        return CColor(r * val, g * val, b * val, a * val);
-#endif
-    }
-    inline CColor operator/(float val) const
-    {
-#if __SSE__
-        TVectorUnion splat = {{val, val, val, val}};
-        return CColor(_mm_div_ps(mVec128, splat.mVec128));
-#else
-        return CColor(r / val, g / val, b / val, a / val);
-#endif
-    }
-    inline const CColor& operator+=(const CColor& rhs)
-    {
-#if __SSE__
-        mVec128 = _mm_add_ps(mVec128, rhs.mVec128);
-#else
-        r += rhs.r;
-        g += rhs.g;
-        b += rhs.b;
-        a += rhs.a;
-#endif
-        return *this;
-    }
-    inline const CColor& operator-=(const CColor& rhs)
-    {
-#if __SSE__
-        mVec128 = _mm_sub_ps(mVec128, rhs.mVec128);
-#else
-        r -= rhs.r;
-        g -= rhs.g;
-        b -= rhs.b;
-        a -= rhs.a;
-#endif
-        return *this;
-    }
-    inline const CColor& operator*=(const CColor& rhs)
-    {
-#if __SSE__
-        mVec128 = _mm_mul_ps(mVec128, rhs.mVec128);
-#else
-        r *= rhs.r;
-        g *= rhs.g;
-        b *= rhs.b;
-        a *= rhs.a;
-#endif
-        return *this;
-    }
-    inline const CColor& operator/=(const CColor& rhs)
-    {
-#if __SSE__
-        mVec128 = _mm_div_ps(mVec128, rhs.mVec128);
-#else
-        r /= rhs.r;
-        g /= rhs.g;
-        b /= rhs.b;
-        a /= rhs.a;
-#endif
-        return *this;
-    }
-    inline void normalize()
-    {
-        float mag = magnitude();
-        mag = 1.f / mag;
-        *this *= mag;
-    }
-    inline CColor normalized() const
-    {
-        float mag = magnitude();
-        mag = 1.f / mag;
-        return *this * mag;
-    }
+  CColor(Comp32 rgba) { fromRGBA32(rgba); }
+
+  CColor(const Comp8* rgba) { fromRGBA8(rgba[0], rgba[1], rgba[2], rgba[3]); }
+
+  CColor(const CVector4f& other) : mSimd(other.mSimd) {}
+
+  template <typename T>
+  CColor(const simd<T>& s) : mSimd(s) {}
+
+  CColor& operator=(const CVector4f& other) {
+    mSimd = other.mSimd;
+    return *this;
+  }
+
+#if ZE_ATHENA_TYPES
+
+  static CColor ReadRGBABig(athena::io::IStreamReader& reader) {
+    CColor ret;
+    ret.readRGBABig(reader);
+    return ret;
+  }
+
+  /** @brief Reads four floats with readFloatBig() and stores them, in stream order, as lanes 0 to 3. */
+  void readRGBABig(athena::io::IStreamReader& reader) {
+    simd_floats lanes;
+    for (int lane = 0; lane < 4; ++lane)
+      lanes[lane] = reader.readFloatBig();
+    mSimd.copy_from(lanes);
+  }
+
+  /** @brief Reads four floats with readFloatBig(); the stream order maps to lanes 2, 1, 0, 3. */
+  void readBGRABig(athena::io::IStreamReader& reader) {
+    // Destination lane for each successive value pulled from the stream.
+    constexpr int kLaneOrder[4] = {2, 1, 0, 3};
+    simd_floats lanes;
+    for (const int lane : kLaneOrder)
+      lanes[lane] = reader.readFloatBig();
+    mSimd.copy_from(lanes);
+  }
+
+  /** @brief Writes lanes 0 to 3, in that order, with writeFloatBig(). */
+  void writeRGBABig(athena::io::IStreamWriter& writer) const {
+    simd_floats lanes(mSimd);
+    for (int lane = 0; lane < 4; ++lane)
+      writer.writeFloatBig(lanes[lane]);
+  }
+
+  /** @brief Writes lanes 2, 1, 0, 3, in that order, with writeFloatBig(). */
+  void writeBGRABig(athena::io::IStreamWriter& writer) const {
+    // Source lane for each successive value pushed to the stream.
+    constexpr int kLaneOrder[4] = {2, 1, 0, 3};
+    simd_floats lanes(mSimd);
+    for (const int lane : kLaneOrder)
+      writer.writeFloatBig(lanes[lane]);
+  }
+
+  /**
+   * @brief Writes lanes 0 to 3 as single bytes with writeUByte().
+   *
+   * Each lane is multiplied by 255 and converted to atUint8; no clamping or
+   * rounding is applied before the conversion.
+   */
+  void writeRGBA8(athena::io::IStreamWriter& writer) const {
+    simd_floats lanes(mSimd);
+    for (int lane = 0; lane < 4; ++lane)
+      writer.writeUByte(atUint8(lanes[lane] * 255));
+  }

-    inline float magSquared() const
-    {
-#if __SSE__
-        TVectorUnion result;
-#if __SSE4_1__
-        result.mVec128 = _mm_dp_ps(mVec128, mVec128, 0xF1);
-        return result.v[0];
-#else
-        result.mVec128 = _mm_mul_ps(mVec128, mVec128);
-        return result.v[0] + result.v[1] + result.v[2] + result.v[3];
 #endif
-#else
-        return r * r + g * g + b * b + a * a;
-#endif
-    }
-    inline float magnitude() const { return std::sqrt(magSquared()); }
-    static inline CColor lerp(const CColor& a, const CColor& b, float t) { return (a + (b - a) * t); }
-    static inline CColor nlerp(const CColor& a, const CColor& b, float t) { return lerp(a, b, t).normalized(); }
-    inline float& operator[](const size_t& idx) { assert(idx < 4); return (&r)[idx]; }
-    inline const float& operator[](const size_t& idx) const { assert(idx < 4); return (&r)[idx]; }
-    inline void splat(float rgb, float a)
-    {
-#if __SSE__
-        TVectorUnion splat = {{rgb, rgb, rgb, a}};
-        mVec128 = splat.mVec128;
-#else
-        v[0] = rgb;
-        v[1] = rgb;
-        v[2] = rgb;
-        v[3] = a;
-#endif
-    }

-    inline float rgbDot(const CColor& rhs) const
-    {
-#if __SSE__
-        TVectorUnion result;
-#if __SSE4_1__
-        result.mVec128 = _mm_dp_ps(mVec128, rhs.mVec128, 0x71);
-        return result.v[0];
-#else
-        result.mVec128 = _mm_mul_ps(mVec128, rhs.mVec128);
-        return result.v[0] + result.v[1] + result.v[2];
-#endif
-#else
-        return (r * rhs.r) + (g * rhs.g) + (b * rhs.b);
-#endif
-    }
+  bool operator==(const CColor& rhs) const {
+    return (r() == rhs.r() && g() == rhs.g() && b() == rhs.b() && a() == rhs.a());
+  }

-    union {
-        struct
-        {
-            float r, g, b, a;
-        };
-        float v[4];
-#if __SSE__
-        __m128 mVec128;
-#endif
-    };
+  bool operator!=(const CColor& rhs) const { return !(*this == rhs); }

-    void fromRGBA8(Comp8 r, Comp8 g, Comp8 b, Comp8 a)
-    {
-        this->r = r * OneOver255;
-        this->g = g * OneOver255;
-        this->b = b * OneOver255;
-        this->a = a * OneOver255;
-    }
+  CColor operator+(const CColor& rhs) const {
+    return mSimd + rhs.mSimd;
+  }

-    void fromRGBA32(Comp32 rgba)
-    {
-        static RGBA32 tmp;
-        tmp.rgba = COLOR(rgba);
-        fromRGBA8(tmp.r, tmp.g, tmp.b, tmp.a);
-    }
+  CColor operator-(const CColor& rhs) const {
+    return mSimd - rhs.mSimd;
+  }

-    /*!
-     * \brief Converts a CColor to RGBA8
-     * \param r
-     * \param g
-     * \param b
-     * \param a
-     */
-    void toRGBA8(Comp8& r, Comp8& g, Comp8& b, Comp8& a)
-    {
-        r = this->r * 255;
-        g = this->g * 255;
-        b = this->b * 255;
-        a = this->a * 255;
-    }
+  CColor operator*(const CColor& rhs) const {
+    return mSimd * rhs.mSimd;
+  }

-    /**
-     * @brief Assigns rgba from hsv
-     * @param h[0-1] The hue percentagee of the color.
-     * @param s[0-1] The saturation percentage of the color.
-     * @param v[0-1] The value percentage of the color.
-     * @param a[0-1] The alpha percentage of the color.
-     */
-    void fromHSV(float h, float s, float v, float _a = 1.0);
+  CColor operator/(const CColor& rhs) const {
+    return mSimd / rhs.mSimd;
+  }

-    /**
-     * @brief Converts rgba to hsv
-     * @param h[0-1] The hue percentagee of the color.
-     * @param s[0-1] The saturation percentage of the color.
-     * @param v[0-1] The value percentage of the color.
-     * @param a[0-1] The alpha percentage of the color.
-     */
-    void toHSV(float& h, float& s, float& v) const;
+  CColor operator+(float val) const {
+    return mSimd + simd<float>(val);
+  }

-    void fromHSL(float h, float s, float l, float _a = 1.0);
+  CColor operator-(float val) const {
+    return mSimd - simd<float>(val);
+  }

-    void toHSL(float& h, float& s, float& l);
+  CColor operator*(float val) const {
+    return mSimd * simd<float>(val);
+  }

-    CColor toGrayscale() { return {std::sqrt((r * r + g * g + b * b) / 3), a}; }
+  CColor operator/(float val) const {
+    return mSimd / simd<float>(val);
+  }

-    /**
-     * @brief Clamps to GPU-safe RGBA values [0,1]
-     */
-    void Clamp()
-    {
-        this->r = std::min(1.f, std::max(0.f, this->r));
-        this->g = std::min(1.f, std::max(0.f, this->g));
-        this->b = std::min(1.f, std::max(0.f, this->b));
-        this->a = std::min(1.f, std::max(0.f, this->a));
-    }
+  /*
+   * Compound assignment operators. The CColor overloads combine the two SIMD
+   * values lane by lane; the float overloads first broadcast the scalar with
+   * simd<float>(rhs). All of them modify *this and return it as a const
+   * reference, so the result cannot be modified further through a chained
+   * expression.
+   */
+  const CColor& operator+=(const CColor& rhs) {
+    mSimd += rhs.mSimd;
+    return *this;
+  }
+
+  const CColor& operator-=(const CColor& rhs) {
+    mSimd -= rhs.mSimd;
+    return *this;
+  }
+
+  const CColor& operator*=(const CColor& rhs) {
+    mSimd *= rhs.mSimd;
+    return *this;
+  }
+
+  const CColor& operator/=(const CColor& rhs) {
+    mSimd /= rhs.mSimd;
+    return *this;
+  }
+
+  const CColor& operator+=(float rhs) {
+    mSimd += simd<float>(rhs);
+    return *this;
+  }
+
+  const CColor& operator-=(float rhs) {
+    mSimd -= simd<float>(rhs);
+    return *this;
+  }
+
+  const CColor& operator*=(float rhs) {
+    mSimd *= simd<float>(rhs);
+    return *this;
+  }
+
+  const CColor& operator/=(float rhs) {
+    mSimd /= simd<float>(rhs);
+    return *this;
+  }
+
+  /**
+   * @brief Scales all four lanes in place by the reciprocal of magnitude().
+   *
+   * A zero magnitude is not guarded against.
+   */
+  void normalize() {
+    const float invMag = 1.f / magnitude();
+    *this *= invMag;
+  }
+
+  /**
+   * @brief Returns a copy scaled by the reciprocal of magnitude().
+   *
+   * A zero magnitude is not guarded against.
+   */
+  CColor normalized() const {
+    const float invMag = 1.f / magnitude();
+    return *this * invMag;
+  }
+
+  float magSquared() const {
+    return mSimd.dot4(mSimd);
+  }
+
+  float magnitude() const { return std::sqrt(magSquared()); }
+
+  /**
+   * @brief Blends two colors lane by lane as (1 - t) * a + b * t.
+   * @param a Color weighted by (1 - t).
+   * @param b Color weighted by t.
+   * @param t Blend factor; it is used as given, without clamping.
+   */
+  static CColor lerp(const CColor& a, const CColor& b, float t) {
+    const zeus::simd<float> weightA(1.f - t);
+    const zeus::simd<float> weightB(t);
+    return weightA * a.mSimd + b.mSimd * weightB;
+  }
+
+  static CColor nlerp(const CColor& a, const CColor& b, float t) { return lerp(a, b, t).normalized(); }
+
+  simd<float>::reference operator[](const size_t& idx) {
+    assert(idx < 4);
+    return mSimd[idx];
+  }
+
+  float operator[](const size_t& idx) const {
+    assert(idx < 4);
+    return mSimd[idx];
+  }
+
+  /**
+   * @brief Sets lanes 0 to 2 to `rgb` and lane 3 to `a`.
+   */
+  void splat(float rgb, float a) {
+    mSimd = simd<float>(rgb, rgb, rgb, a);
+  }
+
+  float rgbDot(const CColor& rhs) const {
+    return mSimd.dot3(rhs.mSimd);
+  }
+
+  void fromRGBA8(const Comp8 ri, const Comp8 gi, const Comp8 bi, const Comp8 ai) {
+    mSimd = simd<float>(ri * OneOver255, gi * OneOver255, bi * OneOver255, ai * OneOver255);
+  }
+
+  /**
+   * @brief Assigns all four channels from a packed 32-bit color.
+   *
+   * The packed value is passed through COLOR(), stored in an RGBA32 scratch
+   * value, and its r/g/b/a members are forwarded to fromRGBA8().
+   *
+   * @param rgba Packed color value.
+   */
+  void fromRGBA32(Comp32 rgba) {
+    // The scratch value is a plain local rather than a function-local static:
+    // it carries nothing between calls, and a shared static would be written
+    // by every caller at once when colors are built on several threads.
+    // Value-initialization keeps the zeroed starting state the static had.
+    RGBA32 tmp{};
+    tmp.rgba = COLOR(rgba);
+    fromRGBA8(tmp.r, tmp.g, tmp.b, tmp.a);
+  }
+
+  /*!
+   * \brief Converts a CColor to RGBA8
+   *
+   * Each channel is multiplied by 255 and converted to Comp8; no clamping or
+   * rounding is applied before the conversion.
+   *
+   * \param ro receives the converted r() channel
+   * \param go receives the converted g() channel
+   * \param bo receives the converted b() channel
+   * \param ao receives the converted a() channel
+   */
+  void toRGBA8(Comp8& ro, Comp8& go, Comp8& bo, Comp8& ao) const {
+    ro = static_cast<Comp8>(r() * 255);
+    go = static_cast<Comp8>(g() * 255);
+    bo = static_cast<Comp8>(b() * 255);
+    ao = static_cast<Comp8>(a() * 255);
+  }
+
+  /**
+   * @brief Assigns rgba from hsv
+   * @param h[0-1] The hue percentage of the color.
+   * @param s[0-1] The saturation percentage of the color.
+   * @param v[0-1] The value percentage of the color.
+   * @param _a[0-1] The alpha percentage of the color; defaults to 1.0.
+   */
+  void fromHSV(float h, float s, float v, float _a = 1.0);
+
+  /**
+   * @brief Converts rgba to hsv
+   * @param h[0-1] Output: the hue percentage of the color.
+   * @param s[0-1] Output: the saturation percentage of the color.
+   * @param v[0-1] Output: the value percentage of the color.
+   * @note There is no alpha output parameter.
+   */
+  void toHSV(float& h, float& s, float& v) const;
+
+  void fromHSL(float h, float s, float l, float _a = 1.0);
+
+  void toHSL(float& h, float& s, float& l) const;
+
+  CColor toGrayscale() const { return {std::sqrt((r() * r() + g() * g() + b() * b()) / 3), a()}; }
+
+  /**
+   * @brief Clamps to GPU-safe RGBA values [0,1]
+   *
+   * Every lane is limited independently: first raised to at least 0, then
+   * lowered to at most 1.
+   */
+  void Clamp() {
+    for (int lane = 0; lane < 4; ++lane) {
+      const float raised = std::max(0.f, float(mSimd[lane]));
+      mSimd[lane] = std::min(1.f, raised);
+    }
+  }
+
+  /* Read-only channel accessors: r, g, b, a are lanes 0, 1, 2, 3 of mSimd. */
+  float r() const { return mSimd[0]; }
+  float g() const { return mSimd[1]; }
+  float b() const { return mSimd[2]; }
+  float a() const { return mSimd[3]; }
+
+  /* Writable channel accessors: return simd<float>::reference (not float&) to the same lanes. */
+  simd<float>::reference r() { return mSimd[0]; }
+  simd<float>::reference g() { return mSimd[1]; }
+  simd<float>::reference b() { return mSimd[2]; }
+  simd<float>::reference a() { return mSimd[3]; }
 };

-static inline CColor operator+(float lhs, const CColor& rhs)
-{
-#if __SSE__
-    TVectorUnion splat = {{lhs, lhs, lhs, lhs}};
-    return CColor(_mm_add_ps(splat.mVec128, rhs.mVec128));
-#else
-    return CColor(lhs + rhs.r, lhs + rhs.g, lhs + rhs.b, lhs + rhs.a);
-#endif
+static inline CColor operator+(float lhs, const CColor& rhs) {
+  return simd<float>(lhs) + rhs.mSimd;
 }

-static inline CColor operator-(float lhs, const CColor& rhs)
-{
-#if __SSE__
-    TVectorUnion splat = {{lhs, lhs, lhs, lhs}};
-    return CColor(_mm_sub_ps(splat.mVec128, rhs.mVec128));
-#else
-    return CColor(lhs - rhs.r, lhs - rhs.g, lhs - rhs.b, lhs - rhs.a);
-#endif
+static inline CColor operator-(float lhs, const CColor& rhs) {
+  return simd<float>(lhs) - rhs.mSimd;
 }

-static inline CColor operator*(float lhs, const CColor& rhs)
-{
-#if __SSE__
-    TVectorUnion splat = {{lhs, lhs, lhs, lhs}};
-    return CColor(_mm_mul_ps(splat.mVec128, rhs.mVec128));
-#else
-    return CColor(lhs * rhs.r, lhs * rhs.g, lhs * rhs.b, lhs * rhs.a);
-#endif
+static inline CColor operator*(float lhs, const CColor& rhs) {
+  return simd<float>(lhs) * rhs.mSimd;
 }

-static inline CColor operator/(float lhs, const CColor& rhs)
-{
-#if __SSE__
-    TVectorUnion splat = {{lhs, lhs, lhs, lhs}};
-    return CColor(_mm_div_ps(splat.mVec128, rhs.mVec128));
-#else
-    return CColor(lhs / rhs.r, lhs / rhs.g, lhs / rhs.b, lhs / rhs.a);
-#endif
+static inline CColor operator/(float lhs, const CColor& rhs) {
+  return simd<float>(lhs) / rhs.mSimd;
 }
 }
--- a/include/zeus/CEulerAngles.hpp
+++ b/include/zeus/CEulerAngles.hpp
@ -2,16 +2,14 @@

 #include "zeus/CVector3f.hpp"

-namespace zeus
-{
+namespace zeus {
 class CQuaternion;

-class CEulerAngles : public CVector3f
-{
+class CEulerAngles : public CVector3f {
 public:
-    CEulerAngles(float x, float y, float z) { assign(x, y, z); }
-    CEulerAngles(const CQuaternion& quat);
-    CEulerAngles(const CTransform& xf);
+  CEulerAngles(float x, float y, float z) { assign(x, y, z); }
+  CEulerAngles(const CQuaternion& quat);
+  CEulerAngles(const CTransform& xf);
 };

 }
--- a/include/zeus/CFrustum.hpp
+++ b/include/zeus/CFrustum.hpp
@ -4,19 +4,16 @@
 #include "zeus/CAABox.hpp"
 #include "zeus/CProjection.hpp"

-namespace zeus
-{
-class CFrustum
-{
-    CPlane planes[6];
-    bool valid = false;
+namespace zeus {
+class CFrustum {
+  CPlane planes[6];
+  bool valid = false;

 public:
-    void updatePlanes(const CMatrix4f& viewMtx, const CMatrix4f& projection);
-    void updatePlanes(const CTransform& viewPointMtx, const CProjection& projection);
-
-    bool aabbFrustumTest(const CAABox& aabb) const;
-    bool sphereFrustumTest(const CSphere& sphere) const;
-    bool pointFrustumTest(const CVector3f& point) const;
+  void updatePlanes(const CMatrix4f& viewMtx, const CMatrix4f& projection);
+  void updatePlanes(const CTransform& viewPointMtx, const CProjection& projection);
+  bool aabbFrustumTest(const CAABox& aabb) const;
+  bool sphereFrustumTest(const CSphere& sphere) const;
+  bool pointFrustumTest(const CVector3f& point) const;
 };
 }
--- a/include/zeus/CLine.hpp
+++ b/include/zeus/CLine.hpp
@ -3,14 +3,13 @@
 #include "Global.hpp"
 #include "zeus/CVector3f.hpp"

-namespace zeus
-{
-class CLine
-{
+namespace zeus {
+class CLine {
 public:
-    CLine(const CVector3f& origin, const CVector3f& dir) : origin(origin), dir(dir) {}
-    CVector3f origin;
-    CVector3f dir;
+  CLine(const CVector3f& origin, const CVector3f& dir) : origin(origin), dir(dir) {}
+
+  CVector3f origin;
+  CVector3f dir;
 };
 }

--- a/include/zeus/CLineSeg.hpp
+++ b/include/zeus/CLineSeg.hpp
@ -3,23 +3,20 @@
 #include "Global.hpp"
 #include "zeus/CVector3f.hpp"

-namespace zeus
-{
-class CLineSeg
-{
+namespace zeus {
+class CLineSeg {
 public:
-    CLineSeg(const CVector3f& start, const CVector3f& end) : x0_start(start), x18_end(end)
-    {
-        CVector3f tmp = (end - start).normalized();
-        if (tmp.x != 0 || tmp.y != 0 || tmp.z != 0)
-            xc_dir = tmp.normalized();
-        else
-            xc_dir = CVector3f::skZero;
-    }
+  /**
+   * @brief Builds a segment from `start` to `end` and stores its direction.
+   *
+   * The direction is the normalized difference end - start. When the two
+   * points coincide, the direction is CVector3f::skZero.
+   *
+   * @param start First endpoint, stored in x0_start.
+   * @param end Second endpoint, stored in x18_end.
+   */
+  CLineSeg(const CVector3f& start, const CVector3f& end) : x0_start(start), x18_end(end) {
+    // The zero test is made on the raw delta, before normalizing. Testing the
+    // normalized vector instead cannot detect a degenerate segment
+    // (presumably normalizing a zero-length delta divides by a zero
+    // magnitude; confirm against CVector3f::normalized), and it also
+    // normalized a valid delta twice.
+    const CVector3f delta = end - start;
+    if (delta.x() != 0.f || delta.y() != 0.f || delta.z() != 0.f)
+      xc_dir = delta.normalized();
+    else
+      xc_dir = CVector3f::skZero;
+  }

-    CVector3f x0_start;
-    CVector3f xc_dir;
-    CVector3f x18_end;
+  CVector3f x0_start;
+  CVector3f xc_dir;
+  CVector3f x18_end;
 };
 }

--- a/include/zeus/CMRay.hpp
+++ b/include/zeus/CMRay.hpp
@ -1,38 +1,34 @@
 #pragma once
+
 #include "zeus/CVector3f.hpp"
 #include "zeus/CTransform.hpp"
 #include "zeus/Math.hpp"

-namespace zeus
-{
-struct CMRay
-{
-    CMRay(const CVector3f& start, const CVector3f& dirin, float len)
-    : start(start), length(len), invLength(1.f / len), dir(dirin)
-    {
-        end = start + (len * dirin);
-        delta = end - start;
-    }
+namespace zeus {
+struct CMRay {
+  CMRay(const CVector3f& start, const CVector3f& dirin, float len)
+    : start(start), length(len), invLength(1.f / len), dir(dirin) {
+    end = start + (len * dirin);
+    delta = end - start;
+  }

-    CMRay(const CVector3f& start, const CVector3f& end, float len, float invLen)
-    : start(start), end(end), length(len), invLength(invLen)
-    {
-        delta = end - start;
-        dir = invLen * delta;
-    }
+  CMRay(const CVector3f& start, const CVector3f& end, float len, float invLen)
+    : start(start), end(end), length(len), invLength(invLen) {
+    delta = end - start;
+    dir = invLen * delta;
+  }

-    CMRay getInvUnscaledTransformRay(const CTransform& xfrm) const
-    {
-        const CTransform inv = xfrm.inverse();
-        return CMRay(inv * start, inv * end, length, invLength);
-    }
+  CMRay getInvUnscaledTransformRay(const CTransform& xfrm) const {
+    const CTransform inv = xfrm.inverse();
+    return CMRay(inv * start, inv * end, length, invLength);
+  }

-    CVector3f start; // x0
-    CVector3f end; // xc
-    CVector3f delta; // x18
-    float length; // x24
-    float invLength; // x28
-    CVector3f dir; // x2c
+  CVector3f start; // x0
+  CVector3f end; // xc
+  CVector3f delta; // x18
+  float length; // x24
+  float invLength; // x28
+  CVector3f dir; // x2c
 };
 }

--- a/include/zeus/CMatrix3f.hpp
+++ b/include/zeus/CMatrix3f.hpp
@ -6,254 +6,185 @@
 #include <cstring>

 /* Column-major matrix class */
-namespace zeus
-{
+namespace zeus {
 class CQuaternion;
-class alignas(16) CMatrix3f
-{
+
+class CMatrix3f {
 public:
-    ZE_DECLARE_ALIGNED_ALLOCATOR();

-    explicit CMatrix3f(bool zero = false)
-    {
-        memset(m, 0, sizeof(m));
-        if (!zero)
-        {
-            m[0][0] = 1.0;
-            m[1][1] = 1.0;
-            m[2][2] = 1.0;
-        }
+  explicit CMatrix3f(bool zero = false) {
+    m[0] = simd<float>(0.f);
+    m[1] = simd<float>(0.f);
+    m[2] = simd<float>(0.f);
+    if (!zero) {
+      m[0][0] = 1.0;
+      m[1][1] = 1.0;
+      m[2][2] = 1.0;
    }
-    CMatrix3f(float m00, float m01, float m02, float m10, float m11, float m12, float m20, float m21, float m22)
-    {
-        m[0][0] = m00, m[1][0] = m01, m[2][0] = m02;
-        m[0][1] = m10, m[1][1] = m11, m[2][1] = m12;
-        m[0][2] = m20, m[1][2] = m21, m[2][2] = m22;
-    }
-    CMatrix3f(const CVector3f& scaleVec)
-    {
-        memset(m, 0, sizeof(m));
-        m[0][0] = scaleVec[0];
-        m[1][1] = scaleVec[1];
-        m[2][2] = scaleVec[2];
-    }
-    CMatrix3f(float scale) : CMatrix3f(CVector3f(scale)) {}
-    CMatrix3f(const CVector3f& r0, const CVector3f& r1, const CVector3f& r2)
-    {
-        vec[0] = r0;
-        vec[1] = r1;
-        vec[2] = r2;
-    }
-    CMatrix3f(const CMatrix3f& other)
-    {
-        vec[0] = other.vec[0];
-        vec[1] = other.vec[1];
-        vec[2] = other.vec[2];
-    }
-#if __SSE__
-    CMatrix3f(const __m128& r0, const __m128& r1, const __m128& r2)
-    {
-        vec[0].mVec128 = r0;
-        vec[1].mVec128 = r1;
-        vec[2].mVec128 = r2;
-    }
-#endif
+  }
+
+  /**
+   * @brief Builds the matrix from nine scalars.
+   *
+   * Each element of the array m receives the scalars sharing a second index:
+   * m[0] gets (m00, m10, m20), m[1] gets (m01, m11, m21), m[2] gets
+   * (m02, m12, m22). Parameter mRC therefore ends up at m[C][R], assuming
+   * CVector3f's three-value initialization fills its lanes in order.
+   */
+  CMatrix3f(float m00, float m01, float m02,
+            float m10, float m11, float m12,
+            float m20, float m21, float m22)
+  : m{{m00, m10, m20},
+      {m01, m11, m21},
+      {m02, m12, m22}} {}
+
+  /**
+   * @brief Builds a matrix that is zero everywhere except the diagonal.
+   * @param scaleVec Its elements 0, 1, 2 become m[0][0], m[1][1], m[2][2].
+   */
+  CMatrix3f(const CVector3f& scaleVec) {
+    for (int i = 0; i < 3; ++i) {
+      m[i] = simd<float>(0.f);
+      m[i][i] = scaleVec[i];
+    }
+  }
+
+  CMatrix3f(float scale) : CMatrix3f(CVector3f(scale)) {}
+
+  CMatrix3f(const CVector3f& r0, const CVector3f& r1, const CVector3f& r2) {
+    m[0] = r0;
+    m[1] = r1;
+    m[2] = r2;
+  }
+
+  CMatrix3f(const CMatrix3f& other) {
+    m[0] = other.m[0];
+    m[1] = other.m[1];
+    m[2] = other.m[2];
+  }
+
+  CMatrix3f(const simd<float>& r0, const simd<float>& r1, const simd<float>& r2) {
+    m[0].mSimd = r0;
+    m[1].mSimd = r1;
+    m[2].mSimd = r2;
+  }
+
 #if ZE_ATHENA_TYPES
-    CMatrix3f(const atVec4f& r0, const atVec4f& r1, const atVec4f& r2)
-    {
-#if __SSE__
-        vec[0].mVec128 = r0.mVec128;
-        vec[1].mVec128 = r1.mVec128;
-        vec[2].mVec128 = r2.mVec128;
-#else
-        vec[0].x = r0.vec[0];
-        vec[0].y = r0.vec[1];
-        vec[0].z = r0.vec[2];
-        vec[1].x = r1.vec[0];
-        vec[1].y = r1.vec[1];
-        vec[1].z = r1.vec[2];
-        vec[2].x = r2.vec[0];
-        vec[2].y = r2.vec[1];
-        vec[2].z = r2.vec[2];
+
+  CMatrix3f(const atVec4f& r0, const atVec4f& r1, const atVec4f& r2) {
+    m[0].mSimd = r0.simd;
+    m[1].mSimd = r1.simd;
+    m[2].mSimd = r2.simd;
+  }
+
+  void readBig(athena::io::IStreamReader& input) {
+    m[0][0] = input.readFloatBig();
+    m[1][0] = input.readFloatBig();
+    m[2][0] = input.readFloatBig();
+    m[0][1] = input.readFloatBig();
+    m[1][1] = input.readFloatBig();
+    m[2][1] = input.readFloatBig();
+    m[0][2] = input.readFloatBig();
+    m[1][2] = input.readFloatBig();
+    m[2][2] = input.readFloatBig();
+  }
+
+  static CMatrix3f ReadBig(athena::io::IStreamReader& input) {
+    CMatrix3f ret;
+    ret.readBig(input);
+    return ret;
+  }
+
 #endif
-    }
-    void readBig(athena::io::IStreamReader& input)
-    {
-        m[0][0] = input.readFloatBig();
-        m[1][0] = input.readFloatBig();
-        m[2][0] = input.readFloatBig();
-        m[0][1] = input.readFloatBig();
-        m[1][1] = input.readFloatBig();
-        m[2][1] = input.readFloatBig();
-        m[0][2] = input.readFloatBig();
-        m[1][2] = input.readFloatBig();
-        m[2][2] = input.readFloatBig();
-    }

-    static CMatrix3f ReadBig(athena::io::IStreamReader& input)
-    {
-        CMatrix3f ret;
-        ret.readBig(input);
-        return ret;
-    }
-#endif
-    CMatrix3f(const CVector3f& axis, float angle);
-    CMatrix3f(const CQuaternion& quat);
-    CMatrix3f(const TVectorUnion& r0, const TVectorUnion& r1, const TVectorUnion& r2)
-    {
-#if __SSE__
-        vec[0].mVec128 = r0.mVec128;
-        vec[1].mVec128 = r1.mVec128;
-        vec[2].mVec128 = r2.mVec128;
-#else
-        vec[0].x = r0.vec[0];
-        vec[0].y = r0.vec[1];
-        vec[0].z = r0.vec[2];
-        vec[1].x = r1.vec[0];
-        vec[1].y = r1.vec[1];
-        vec[1].z = r1.vec[2];
-        vec[2].x = r2.vec[0];
-        vec[2].y = r2.vec[1];
-        vec[2].z = r2.vec[2];
-#endif
-    }
+  CMatrix3f(const CVector3f& axis, float angle);

-    inline CMatrix3f& operator=(const CMatrix3f& other)
-    {
-        vec[0] = other.vec[0];
-        vec[1] = other.vec[1];
-        vec[2] = other.vec[2];
-        return *this;
-    }
+  CMatrix3f(const CQuaternion& quat);

-    inline CVector3f operator*(const CVector3f& other) const
-    {
-#if __SSE__
-        TVectorUnion res;
-        res.mVec128 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(vec[0].mVec128, ze_splat_ps(other.mVec128, 0)),
-                                            _mm_mul_ps(vec[1].mVec128, ze_splat_ps(other.mVec128, 1))),
-                                 _mm_mul_ps(vec[2].mVec128, ze_splat_ps(other.mVec128, 2)));
-        return CVector3f(res.mVec128);
-#else
-        return CVector3f(m[0][0] * other.v[0] + m[1][0] * other.v[1] + m[2][0] * other.v[2],
-                         m[0][1] * other.v[0] + m[1][1] * other.v[1] + m[2][1] * other.v[2],
-                         m[0][2] * other.v[0] + m[1][2] * other.v[1] + m[2][2] * other.v[2]);
-#endif
-    }
+  CMatrix3f& operator=(const CMatrix3f& other) {
+    m[0] = other.m[0];
+    m[1] = other.m[1];
+    m[2] = other.m[2];
+    return *this;
+  }

-    inline CVector3f& operator[](int i)
-    {
-        assert(0 <= i && i < 3);
-        return vec[i];
-    }
+  /**
+   * @brief Multiplies this matrix by a vector.
+   *
+   * Each lane of `other` (0, 1, 2) is broadcast with shuffle and used to
+   * weight m[0], m[1], m[2] respectively; the three products are summed.
+   */
+  CVector3f operator*(const CVector3f& other) const {
+    const auto lane0 = other.mSimd.shuffle<0, 0, 0, 0>();
+    const auto lane1 = other.mSimd.shuffle<1, 1, 1, 1>();
+    const auto lane2 = other.mSimd.shuffle<2, 2, 2, 2>();
+    return m[0].mSimd * lane0 + m[1].mSimd * lane1 + m[2].mSimd * lane2;
+  }

-    inline const CVector3f& operator[](int i) const
-    {
-        assert(0 <= i && i < 3);
-        return vec[i];
-    }
+  CVector3f& operator[](size_t i) {
+    assert(i < 3);
+    return m[i];
+  }

-    inline CMatrix3f orthonormalized() const
-    {
-        CMatrix3f ret;
-        ret[0] = vec[0].normalized();
-        ret[2] = ret[0].cross(vec[1]);
-        ret[2].normalize();
-        ret[1] = ret[2].cross(ret[0]);
-        return ret;
-    }
+  const CVector3f& operator[](size_t i) const {
+    assert(i < 3);
+    return m[i];
+  }

-    inline bool operator==(const CMatrix3f& other) const
-    {
-        return vec[0] == other.vec[0] && vec[1] == other.vec[1] && vec[2] == other.vec[2];
-    }
+  CMatrix3f orthonormalized() const {
+    CMatrix3f ret;
+    ret[0] = m[0].normalized();
+    ret[2] = ret[0].cross(m[1]);
+    ret[2].normalize();
+    ret[1] = ret[2].cross(ret[0]);
+    return ret;
+  }

-    static const CMatrix3f skIdentityMatrix3f;
+  bool operator==(const CMatrix3f& other) const {
+    return m[0] == other.m[0] && m[1] == other.m[1] && m[2] == other.m[2];
+  }

-    void transpose();
-    void transposeSSE3();
-    CMatrix3f transposed() const;
-    CMatrix3f transposedSSE3() const;
+  static const CMatrix3f skIdentityMatrix3f;

-    inline void invert() { *this = inverted(); }
-    CMatrix3f inverted() const;
+  void transpose();

-    void addScaledMatrix(const CMatrix3f& other, float scale)
-    {
-        CVector3f scaleVec(scale);
-        vec[0] += other.vec[0] * scaleVec;
-        vec[1] += other.vec[1] * scaleVec;
-        vec[2] += other.vec[2] * scaleVec;
-    }
+  CMatrix3f transposed() const;

-    static inline CMatrix3f RotateX(float theta)
-    {
-        float sinT = std::sin(theta);
-        float cosT = std::cos(theta);
-        return CMatrix3f(TVectorUnion{{1.f, 0.f, 0.f, 0.f}},
-                         TVectorUnion{{0.f, cosT, sinT, 0.f}},
-                         TVectorUnion{{0.f, -sinT, cosT, 0.f}});
-    }
+  void invert() { *this = inverted(); }

-    static inline CMatrix3f RotateY(float theta)
-    {
-        float sinT = std::sin(theta);
-        float cosT = std::cos(theta);
-        return CMatrix3f(TVectorUnion{{cosT, 0.f, -sinT, 0.f}},
-                         TVectorUnion{{0.f, 1.f, 0.f, 0.f}},
-                         TVectorUnion{{sinT, 0.f, cosT, 0.f}});
-    }
+  CMatrix3f inverted() const;

-    static inline CMatrix3f RotateZ(float theta)
-    {
-        float sinT = std::sin(theta);
-        float cosT = std::cos(theta);
-        return CMatrix3f(TVectorUnion{{cosT, sinT, 0.f, 0.f}},
-                         TVectorUnion{{-sinT, cosT, 0.f, 0.f}},
-                         TVectorUnion{{0.f, 0.f, 1.f, 0.f}});
-    }
+  /**
+   * @brief Adds `other`, multiplied by `scale`, to this matrix in place.
+   * @param other Matrix whose m[0..2] are scaled and accumulated.
+   * @param scale Factor, expanded to a CVector3f, applied to each element of other.m.
+   */
+  void addScaledMatrix(const CMatrix3f& other, float scale) {
+    const CVector3f factor(scale);
+    for (int i = 0; i < 3; ++i)
+      m[i] += other.m[i] * factor;
+  }

-    float determinant() const
-    {
-        return
-        m[1][0] * (m[2][1] * m[0][2] - m[0][1] * m[2][2]) +
-        m[0][0] * (m[1][1] * m[2][2] - m[2][1] * m[1][2]) +
-        m[2][0] * (m[0][1] * m[1][2] - m[1][1] * m[0][2]);
-    }
+  static CMatrix3f RotateX(float theta) {
+    float sinT = std::sin(theta);
+    float cosT = std::cos(theta);
+    return CMatrix3f(simd<float>{1.f, 0.f, 0.f, 0.f},
+                     simd<float>{0.f, cosT, sinT, 0.f},
+                     simd<float>{0.f, -sinT, cosT, 0.f});
+  }

-    union {
-        float m[3][4]; /* 4th row for union-alignment */
-        struct
-        {
-            CVector3f vec[3];
-        };
-    };
+  static CMatrix3f RotateY(float theta) {
+    float sinT = std::sin(theta);
+    float cosT = std::cos(theta);
+    return CMatrix3f(simd<float>{cosT, 0.f, -sinT, 0.f},
+                     simd<float>{0.f, 1.f, 0.f, 0.f},
+                     simd<float>{sinT, 0.f, cosT, 0.f});
+  }
+
+  static CMatrix3f RotateZ(float theta) {
+    float sinT = std::sin(theta);
+    float cosT = std::cos(theta);
+    return CMatrix3f(simd<float>{cosT, sinT, 0.f, 0.f},
+                     simd<float>{-sinT, cosT, 0.f, 0.f},
+                     simd<float>{0.f, 0.f, 1.f, 0.f});
+  }
+
+  /**
+   * @brief Returns the determinant of the 3x3 matrix.
+   *
+   * The three terms are computed separately and then added in the order
+   * m[1][0]-term, m[0][0]-term, m[2][0]-term.
+   */
+  float determinant() const {
+    const float term1 = m[1][0] * (m[2][1] * m[0][2] - m[0][1] * m[2][2]);
+    const float term0 = m[0][0] * (m[1][1] * m[2][2] - m[2][1] * m[1][2]);
+    const float term2 = m[2][0] * (m[0][1] * m[1][2] - m[1][1] * m[0][2]);
+    return term1 + term0 + term2;
+  }
+
+  CVector3f m[3];
 };

-static inline CMatrix3f operator*(const CMatrix3f& lhs, const CMatrix3f& rhs)
-{
-#if __SSE__
-    unsigned i;
-    TVectorUnion resVec[3];
-    for (i = 0; i < 3; ++i)
-    {
-        resVec[i].mVec128 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(lhs[0].mVec128, ze_splat_ps(rhs[i].mVec128, 0)),
-                                                  _mm_mul_ps(lhs[1].mVec128, ze_splat_ps(rhs[i].mVec128, 1))),
-                                       _mm_mul_ps(lhs[2].mVec128, ze_splat_ps(rhs[i].mVec128, 2)));
-        resVec[i].v[3] = 0.0;
-    }
-    return CMatrix3f(resVec[0].mVec128, resVec[1].mVec128, resVec[2].mVec128);
-#else
-    return CMatrix3f(lhs[0][0] * rhs[0][0] + lhs[1][0] * rhs[0][1] + lhs[2][0] * rhs[0][2],
-                     lhs[0][0] * rhs[1][0] + lhs[1][0] * rhs[1][1] + lhs[2][0] * rhs[1][2],
-                     lhs[0][0] * rhs[2][0] + lhs[1][0] * rhs[2][1] + lhs[2][0] * rhs[2][2],
-                     lhs[0][1] * rhs[0][0] + lhs[1][1] * rhs[0][1] + lhs[2][1] * rhs[0][2],
-                     lhs[0][1] * rhs[1][0] + lhs[1][1] * rhs[1][1] + lhs[2][1] * rhs[1][2],
-                     lhs[0][1] * rhs[2][0] + lhs[1][1] * rhs[2][1] + lhs[2][1] * rhs[2][2],
-                     lhs[0][2] * rhs[0][0] + lhs[1][2] * rhs[0][1] + lhs[2][2] * rhs[0][2],
-                     lhs[0][2] * rhs[1][0] + lhs[1][2] * rhs[1][1] + lhs[2][2] * rhs[1][2],
-                     lhs[0][2] * rhs[2][0] + lhs[1][2] * rhs[2][1] + lhs[2][2] * rhs[2][2]);
-#endif
+/**
+ * @brief Multiplies two matrices.
+ *
+ * Element i of the result is the sum of lhs.m[0], lhs.m[1], lhs.m[2], each
+ * weighted by lane 0, 1, 2 of rhs[i] respectively.
+ */
+static inline CMatrix3f operator*(const CMatrix3f& lhs, const CMatrix3f& rhs) {
+  const auto combine = [&lhs](const CVector3f& weights) -> simd<float> {
+    return lhs.m[0].mSimd * weights.mSimd.shuffle<0, 0, 0, 0>() +
+           lhs.m[1].mSimd * weights.mSimd.shuffle<1, 1, 1, 1>() +
+           lhs.m[2].mSimd * weights.mSimd.shuffle<2, 2, 2, 2>();
+  };
+  return CMatrix3f(combine(rhs[0]), combine(rhs[1]), combine(rhs[2]));
 }
 }

--- a/include/zeus/CMatrix4f.hpp
+++ b/include/zeus/CMatrix4f.hpp
@ -1,176 +1,116 @@
 #pragma once
+
 #include "zeus/CMatrix3f.hpp"
 #include "zeus/CVector4f.hpp"
 #include "zeus/CVector3f.hpp"

-namespace zeus
-{
-class alignas(16) CMatrix4f
-{
+namespace zeus {
+class CMatrix4f {
 public:
-    static const CMatrix4f skIdentityMatrix4f;
-    ZE_DECLARE_ALIGNED_ALLOCATOR();
-    explicit CMatrix4f(bool zero = false)
-    {
-        memset(m, 0, sizeof(m));
+  static const CMatrix4f skIdentityMatrix4f;

-        if (!zero)
-        {
-            m[0][0] = 1.0;
-            m[1][1] = 1.0;
-            m[2][2] = 1.0;
-            m[3][3] = 1.0;
-        }
+  explicit CMatrix4f(bool zero = false) {
+    if (!zero) {
+      m[0][0] = 1.0;
+      m[1][1] = 1.0;
+      m[2][2] = 1.0;
+      m[3][3] = 1.0;
    }
-    CMatrix4f(float m00, float m01, float m02, float m03, float m10, float m11, float m12, float m13, float m20, float m21,
-              float m22, float m23, float m30, float m31, float m32, float m33)
-    {
-        m[0][0] = m00, m[1][0] = m01, m[2][0] = m02, m[3][0] = m03;
-        m[0][1] = m10, m[1][1] = m11, m[2][1] = m12, m[3][1] = m13;
-        m[0][2] = m20, m[1][2] = m21, m[2][2] = m22, m[3][2] = m23;
-        m[0][3] = m30, m[1][3] = m31, m[2][3] = m32, m[3][3] = m33;
-    }
-    CMatrix4f(const CVector3f& scaleVec)
-    {
-        memset(m, 0, sizeof(m));
-        m[0][0] = scaleVec[0];
-        m[1][1] = scaleVec[1];
-        m[2][2] = scaleVec[2];
-        m[3][3] = 1.0f;
-    }
-    CMatrix4f(const CVector4f& r0, const CVector4f& r1, const CVector4f& r2, const CVector4f& r3)
-    {
-        vec[0] = r0;
-        vec[1] = r1;
-        vec[2] = r2;
-        vec[3] = r3;
-    }
-    CMatrix4f(const CMatrix4f& other)
-    {
-        vec[0] = other.vec[0];
-        vec[1] = other.vec[1];
-        vec[2] = other.vec[2];
-        vec[3] = other.vec[3];
-    }
-#if __SSE__
-    CMatrix4f(const __m128& r0, const __m128& r1, const __m128& r2, const __m128& r3)
-    {
-        vec[0].mVec128 = r0;
-        vec[1].mVec128 = r1;
-        vec[2].mVec128 = r2;
-        vec[3].mVec128 = r3;
-    }
-#endif
-    CMatrix4f(const CMatrix3f& other)
-    {
-        memset(m, 0, sizeof(m));
-        vec[0] = other.vec[0];
-        vec[1] = other.vec[1];
-        vec[2] = other.vec[2];
-        vec[3] = CVector4f(0, 0, 0, 1.0f);
-    }
-    inline CMatrix4f& operator=(const CMatrix4f& other)
-    {
-        vec[0] = other.vec[0];
-        vec[1] = other.vec[1];
-        vec[2] = other.vec[2];
-        vec[3] = other.vec[3];
-        return *this;
-    }
-    inline CVector4f operator*(const CVector4f& other) const
-    {
-#if __SSE__
-        TVectorUnion res;
-        res.mVec128 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(vec[0].mVec128, ze_splat_ps(other.mVec128, 0)),
-                                            _mm_mul_ps(vec[1].mVec128, ze_splat_ps(other.mVec128, 1))),
-                                 _mm_add_ps(_mm_mul_ps(vec[2].mVec128, ze_splat_ps(other.mVec128, 2)),
-                                            _mm_mul_ps(vec[3].mVec128, ze_splat_ps(other.mVec128, 3))));
+  }

-        return CVector4f(res.mVec128);
-#else
-        return CVector4f(m[0][0] * other.v[0] + m[1][0] * other.v[1] + m[2][0] * other.v[2] + m[3][0] * other.v[3],
-                         m[0][1] * other.v[0] + m[1][1] * other.v[1] + m[2][1] * other.v[2] + m[3][1] * other.v[3],
-                         m[0][2] * other.v[0] + m[1][2] * other.v[1] + m[2][2] * other.v[2] + m[3][2] * other.v[3],
-                         m[0][3] * other.v[0] + m[1][3] * other.v[1] + m[2][3] * other.v[2] + m[3][3] * other.v[3]);
-#endif
-    }
+  CMatrix4f(float m00, float m01, float m02, float m03, // arguments are listed row by row (m<row><col>)
+            float m10, float m11, float m12, float m13,
+            float m20, float m21, float m22, float m23,
+            float m30, float m31, float m32, float m33)
+  : m{{m00, m10, m20, m30}, // m[0] takes the first argument of every row
+      {m01, m11, m21, m31}, // m[1] takes the second
+      {m02, m12, m22, m32}, // m[2] takes the third
+      {m03, m13, m23, m33}} {} // m[3] takes the fourth: storage is the transpose of the argument layout

-    inline CVector4f& operator[](int i)
-    {
-        assert(0 <= i && i < 4);
-        return vec[i];
-    }
+  CMatrix4f(const CVector3f& scaleVec) { // scaleVec on the diagonal, 1 in m[3][3]
+    m[0][0] = scaleVec[0];
+    m[1][1] = scaleVec[1];
+    m[2][2] = scaleVec[2];
+    m[3][3] = 1.0f; // NOTE(review): the removed memset is gone; off-diagonal lanes rely on CVector4f's default ctor — confirm it zero-fills
+  }

-    inline const CVector4f& operator[](int i) const
-    {
-        assert(0 <= i && i < 4);
-        return vec[i];
-    }
+  CMatrix4f(const CVector4f& r0, const CVector4f& r1, const CVector4f& r2, const CVector4f& r3) { // r0..r3 stored as m[0]..m[3]
+    m[0] = r0;
+    m[1] = r1;
+    m[2] = r2;
+    m[3] = r3;
+  }

-    CMatrix4f transposed() const;
-    CMatrix4f transposedSSE3() const;
+  CMatrix4f(const CMatrix4f& other) { // copy constructor: vector-by-vector copy of other.m
+    m[0] = other.m[0];
+    m[1] = other.m[1];
+    m[2] = other.m[2];
+    m[3] = other.m[3];
+  }
+  
+  CMatrix4f(const simd<float>& r0, const simd<float>& r1, const simd<float>& r2, const simd<float>& r3) { // takes over from the removed __m128 ctor
+    m[0].mSimd = r0; // each simd value is written straight into the matching vector's mSimd
+    m[1].mSimd = r1;
+    m[2].mSimd = r2;
+    m[3].mSimd = r3;
+  }

-    inline CVector3f multiplyOneOverW(const CVector3f& point) const
-    {
-        CVector4f xfVec = *this * point;
-        return xfVec.toVec3f() / xfVec.w;
-    }
+  CMatrix4f(const CMatrix3f& other) { // embed a 3x3 matrix in a 4x4 one
+    m[0] = other.m[0]; // CVector3f assigned to CVector4f; NOTE(review): lane 3 comes from that conversion — confirm it is 0
+    m[1] = other.m[1];
+    m[2] = other.m[2];
+    m[3] = CVector4f(0.f, 0.f, 0.f, 1.0f); // last vector is (0, 0, 0, 1)
+  }

-    inline CVector3f multiplyOneOverW(const CVector3f& point, float& wOut) const
-    {
-        CVector4f xfVec = *this * point;
-        wOut = xfVec.w;
-        return xfVec.toVec3f() / xfVec.w;
-    }
+  CMatrix4f& operator=(const CMatrix4f& other) { // copy assignment: vector-by-vector copy, no self-assignment check needed
+    m[0] = other.m[0];
+    m[1] = other.m[1];
+    m[2] = other.m[2];
+    m[3] = other.m[3];
+    return *this; // allows chained assignment
+  }

-    union {
-        float m[4][4];
-        struct
-        {
-            CVector4f vec[4];
-        };
-    };
+  CVector4f operator*(const CVector4f& other) const { // matrix * vector: sum of m[k] weighted by lane k of `other`
+    return m[0].mSimd * other.mSimd.shuffle<0, 0, 0, 0>() + // shuffle<k,k,k,k> replaces the removed ze_splat_ps(other, k)
+           m[1].mSimd * other.mSimd.shuffle<1, 1, 1, 1>() +
+           m[2].mSimd * other.mSimd.shuffle<2, 2, 2, 2>() +
+           m[3].mSimd * other.mSimd.shuffle<3, 3, 3, 3>(); // the simd sum is returned as a CVector4f
+  }
+
+  CVector4f& operator[](size_t i) { // mutable access to vector i of m
+    assert(i < 4); // index is size_t now, so the old `0 <= i` half of the check is gone; only checked when asserts are enabled
+    return m[i];
+  }
+
+  const CVector4f& operator[](size_t i) const { // read-only counterpart of the overload above
+    assert(i < 4);
+    return m[i];
+  }
+
+  CMatrix4f transposed() const;
+
+  CVector3f multiplyOneOverW(const CVector3f& point) const { // transform `point`, then divide the xyz result by the resulting w
+    CVector4f xfVec = *this * point; // `point` reaches operator*(const CVector4f&) via a CVector3f -> CVector4f conversion; NOTE(review): confirm the w it gets
+    return xfVec.toVec3f() / xfVec.w(); // no guard against w == 0
+  }
+
+  CVector3f multiplyOneOverW(const CVector3f& point, float& wOut) const { // same as above, and also reports w through wOut
+    CVector4f xfVec = *this * point;
+    wOut = xfVec.w(); // w before the divide
+    return xfVec.toVec3f() / xfVec.w();
+  }
+
+  CVector4f m[4];
 };
-static inline CMatrix4f operator*(const CMatrix4f& lhs, const CMatrix4f& rhs)
-{
-    CMatrix4f ret;
-#if __SSE__
-    unsigned i;

-    for (i = 0; i < 4; ++i)
-    {
-        ret.vec[i].mVec128 = _mm_add_ps(
-            _mm_add_ps(_mm_add_ps(_mm_mul_ps(lhs.vec[0].mVec128,
-                                             _mm_shuffle_ps(rhs.vec[i].mVec128, rhs.vec[i].mVec128, _MM_SHUFFLE(0, 0, 0, 0))),
-                                  _mm_mul_ps(lhs.vec[1].mVec128,
-                                             _mm_shuffle_ps(rhs.vec[i].mVec128, rhs.vec[i].mVec128, _MM_SHUFFLE(1, 1, 1, 1)))),
-                       _mm_mul_ps(lhs.vec[2].mVec128,
-                                  _mm_shuffle_ps(rhs.vec[i].mVec128, rhs.vec[i].mVec128, _MM_SHUFFLE(2, 2, 2, 2)))),
-            _mm_mul_ps(lhs.vec[3].mVec128, _mm_shuffle_ps(rhs.vec[i].mVec128, rhs.vec[i].mVec128, _MM_SHUFFLE(3, 3, 3, 3))));
-    }
-
-#else
-    ret.m[0][0] = lhs.m[0][0] * rhs.m[0][0] + lhs.m[1][0] * rhs.m[0][1] + lhs.m[2][0] * rhs.m[0][2] + lhs.m[3][0] * rhs.m[0][3];
-    ret.m[1][0] = lhs.m[0][0] * rhs.m[1][0] + lhs.m[1][0] * rhs.m[1][1] + lhs.m[2][0] * rhs.m[1][2] + lhs.m[3][0] * rhs.m[1][3];
-    ret.m[2][0] = lhs.m[0][0] * rhs.m[2][0] + lhs.m[1][0] * rhs.m[2][1] + lhs.m[2][0] * rhs.m[2][2] + lhs.m[3][0] * rhs.m[2][3];
-    ret.m[3][0] = lhs.m[0][0] * rhs.m[3][0] + lhs.m[1][0] * rhs.m[3][1] + lhs.m[2][0] * rhs.m[3][2] + lhs.m[3][0] * rhs.m[3][3];
-
-    ret.m[0][1] = lhs.m[0][1] * rhs.m[0][0] + lhs.m[1][1] * rhs.m[0][1] + lhs.m[2][1] * rhs.m[0][2] + lhs.m[3][1] * rhs.m[0][3];
-    ret.m[1][1] = lhs.m[0][1] * rhs.m[1][0] + lhs.m[1][1] * rhs.m[1][1] + lhs.m[2][1] * rhs.m[1][2] + lhs.m[3][1] * rhs.m[1][3];
-    ret.m[2][1] = lhs.m[0][1] * rhs.m[2][0] + lhs.m[1][1] * rhs.m[2][1] + lhs.m[2][1] * rhs.m[2][2] + lhs.m[3][1] * rhs.m[2][3];
-    ret.m[3][1] = lhs.m[0][1] * rhs.m[3][0] + lhs.m[1][1] * rhs.m[3][1] + lhs.m[2][1] * rhs.m[3][2] + lhs.m[3][1] * rhs.m[3][3];
-
-    ret.m[0][2] = lhs.m[0][2] * rhs.m[0][0] + lhs.m[1][2] * rhs.m[0][1] + lhs.m[2][2] * rhs.m[0][2] + lhs.m[3][2] * rhs.m[0][3];
-    ret.m[1][2] = lhs.m[0][2] * rhs.m[1][0] + lhs.m[1][2] * rhs.m[1][1] + lhs.m[2][2] * rhs.m[1][2] + lhs.m[3][2] * rhs.m[1][3];
-    ret.m[2][2] = lhs.m[0][2] * rhs.m[2][0] + lhs.m[1][2] * rhs.m[2][1] + lhs.m[2][2] * rhs.m[2][2] + lhs.m[3][2] * rhs.m[2][3];
-    ret.m[3][2] = lhs.m[0][2] * rhs.m[3][0] + lhs.m[1][2] * rhs.m[3][1] + lhs.m[2][2] * rhs.m[3][2] + lhs.m[3][2] * rhs.m[3][3];
-
-    ret.m[0][3] = lhs.m[0][3] * rhs.m[0][0] + lhs.m[1][3] * rhs.m[0][1] + lhs.m[2][3] * rhs.m[0][2] + lhs.m[3][3] * rhs.m[0][3];
-    ret.m[1][3] = lhs.m[0][3] * rhs.m[1][0] + lhs.m[1][3] * rhs.m[1][1] + lhs.m[2][3] * rhs.m[1][2] + lhs.m[3][3] * rhs.m[1][3];
-    ret.m[2][3] = lhs.m[0][3] * rhs.m[2][0] + lhs.m[1][3] * rhs.m[2][1] + lhs.m[2][3] * rhs.m[2][2] + lhs.m[3][3] * rhs.m[2][3];
-    ret.m[3][3] = lhs.m[0][3] * rhs.m[3][0] + lhs.m[1][3] * rhs.m[3][1] + lhs.m[2][3] * rhs.m[2][2] + lhs.m[3][3] * rhs.m[3][3];
-#endif
-    return ret;
+static inline CMatrix4f operator*(const CMatrix4f& lhs, const CMatrix4f& rhs) { // 4x4 product, one result vector per iteration
+  simd<float> v[4]; // v[i] becomes vector i of the product
+  for (int i = 0; i < 4; ++i)
+    v[i] = lhs.m[0].mSimd * rhs[i].mSimd.shuffle<0, 0, 0, 0>() + // shuffle<k,k,k,k> replaces the removed _mm_shuffle_ps(..., _MM_SHUFFLE(k,k,k,k))
+           lhs.m[1].mSimd * rhs[i].mSimd.shuffle<1, 1, 1, 1>() +
+           lhs.m[2].mSimd * rhs[i].mSimd.shuffle<2, 2, 2, 2>() +
+           lhs.m[3].mSimd * rhs[i].mSimd.shuffle<3, 3, 3, 3>();
+  return CMatrix4f(v[0], v[1], v[2], v[3]); // uses the simd<float> constructor
 }
 }

--- a/include/zeus/COBBox.hpp
+++ b/include/zeus/COBBox.hpp
@ -5,51 +5,46 @@
 #include "zeus/CAABox.hpp"
 #include "zeus/CMRay.hpp"

-namespace zeus
-{
-class alignas(16) COBBox
-{
+namespace zeus {
+class COBBox {
 public:
-    ZE_DECLARE_ALIGNED_ALLOCATOR();
 #if ZE_ATHENA_TYPES
-    void readBig(athena::io::IStreamReader& in)
-    {
-        transform.read34RowMajor(in);
-        extents.readBig(in);
-    }
-    static COBBox ReadBig(athena::io::IStreamReader& in)
-    {
-        COBBox out;
-        out.readBig(in);
-        return out;
-    }
+
+  void readBig(athena::io::IStreamReader& in) { // fill this box from `in`
+    transform.read34RowMajor(in); // transform is read first
+    extents.readBig(in); // extents follow it in the stream
+  }
+
+  static COBBox ReadBig(athena::io::IStreamReader& in) { // factory form of readBig: returns a freshly read box by value
+    COBBox out;
+    out.readBig(in);
+    return out;
+  }

 #endif

-    CTransform transform;
-    CVector3f extents;
+  CTransform transform;
+  CVector3f extents;

-    COBBox() {}
+  COBBox() = default;

-    COBBox(const CAABox& aabb) : extents(aabb.extents()) { transform.origin = aabb.center(); }
+  COBBox(const CAABox& aabb) : extents(aabb.extents()) { transform.origin = aabb.center(); }

-    COBBox(const CTransform& xf, const CVector3f& extents) : transform(xf), extents(extents) {}
+  COBBox(const CTransform& xf, const CVector3f& extents) : transform(xf), extents(extents) {}

-    CAABox calculateAABox(const CTransform& worldXf = CTransform()) const;
+  CAABox calculateAABox(const CTransform& worldXf = CTransform()) const;

-    static COBBox FromAABox(const CAABox& box, const CTransform& xf)
-    {
-        const CVector3f extents = box.max - box.center();
-        const CTransform newXf = CTransform::Translate(box.center()) * xf;
-        return COBBox(newXf, extents);
-    }
+  static COBBox FromAABox(const CAABox& box, const CTransform& xf) { // build an OBB from an AABB's centre and half-size plus `xf`
+    const CVector3f extents = box.max - box.center(); // vector from the centre to the max corner
+    const CTransform newXf = CTransform::Translate(box.center()) * xf; // translation to the box centre, multiplied by xf on the right
+    return COBBox(newXf, extents);
+  }

-    bool OBBIntersectsBox(const COBBox& other) const;
+  bool OBBIntersectsBox(const COBBox& other) const;

-    bool AABoxIntersectsBox(const CAABox& other)
-    {
-        return OBBIntersectsBox(FromAABox(other, CTransform::Identity()));
-    }
+  bool AABoxIntersectsBox(const CAABox& other) const { // const: only calls the const OBBIntersectsBox and the static FromAABox
+    return OBBIntersectsBox(FromAABox(other, CTransform::Identity())); // `other` is wrapped as an OBB with an identity transform
+  }
 };
 }

--- a/include/zeus/CPlane.hpp
+++ b/include/zeus/CPlane.hpp
@ -4,72 +4,67 @@
 #include "zeus/CVector3f.hpp"
 #include "zeus/Math.hpp"

-namespace zeus
-{
-class alignas(16) CPlane
-{
+namespace zeus {
+class CPlane {
 public:
-    ZE_DECLARE_ALIGNED_ALLOCATOR();
+  CPlane() : mSimd(1.0, 0.f, 0.f, 0.f) {}

-    inline CPlane() : a(1.f), b(0.f), c(0.f), d(0.f) {}
-    CPlane(float a, float b, float c, float d) : a(a), b(b), c(c), d(d) {}
-    CPlane(const CVector3f& a, const CVector3f& b, const CVector3f& c)
-    {
-        vec = (b - a).cross(c - a).normalized();
-        d = a.dot(vec);
-    }
+  CPlane(float a, float b, float c, float d) : mSimd(a, b, c, d) {}

-    CPlane(const CVector3f& point, float displacement)
-    {
-#if __SSE__
-        mVec128 = point.mVec128;
-#else
-        a = point[0];
-        b = point[1];
-        c = point[2];
-#endif
-        d = displacement;
-    }
+  CPlane(const CVector3f& a, const CVector3f& b, const CVector3f& c) { // plane built from three points
+    mSimd = (b - a).cross(c - a).normalized().mSimd; // normalized cross product of the two edges leaving `a`
+    mSimd[3] = a.dot(normal()); // d lane = `a` dotted with the normal just stored
+  }

-    float clipLineSegment(const CVector3f& a, const CVector3f& b)
-    {
-        float mag = (b-a).dot(vec);
-        float dis = (-(vec.y - d)) / mag;
-        return clamp(0.0f, dis, 1.0f);
-    }
+  CPlane(const CVector3f& point, float displacement) { // lanes come from `point`, then lane 3 is replaced
+    mSimd = point.mSimd; // copies the whole simd value, whatever lane 3 of `point` holds
+    mSimd[3] = displacement; // d lane
+  }

-    inline void normalize()
-    {
-        float nd = d;
-        float mag = vec.magnitude();
-        mag = 1.f / mag;
-        vec = vec * mag;
-        d = nd * mag;
-    }
+  float clipLineSegment(const CVector3f& a, const CVector3f& b) { // parameter t along a->b at which the segment meets the plane
+    float mag = (b - a).dot(normal()); // change in plane height going from a to b
+    float dis = (-(a.dot(normal()) - d())) / mag; // -height(a) / mag; the height must come from `a`, not from the normal's y lane
+    return clamp(0.0f, dis, 1.0f); // 0 and 1 bracket the result; NOTE: mag == 0 (segment parallel to the plane) is not guarded
+  }

-    float pointToPlaneDist(const CVector3f& pos) const
-    {
-        return pos.dot(vec) - d;
-    }
+  void normalize() { // scale both the normal and d by 1 / |normal|
+    float nd = d(); // read d before mSimd is overwritten below
+    auto norm = normal();
+    float mag = norm.magnitude();
+    mag = 1.f / mag; // no guard against a zero-length normal
+    mSimd = (norm * mag).mSimd; // overwrites all lanes, including d
+    mSimd[3] = nd * mag; // restore d, scaled by the same factor
+  }

-    bool rayPlaneIntersection(const CVector3f& from, const CVector3f& to, CVector3f& point) const;
+  float pointToPlaneDist(const CVector3f& pos) const { // pos . normal - d; sign tells the side, a true distance only for a unit normal
+    return pos.dot(normal()) - d();
+  }

-    const CVector3f& normal() const { return vec; }
+  bool rayPlaneIntersection(const CVector3f& from, const CVector3f& to, CVector3f& point) const;

-    inline float& operator[](size_t idx) { assert(idx < 4); return p[idx]; }
-    inline const float& operator[](size_t idx) const { assert(idx < 4); return p[idx]; }
+  CVector3f normal() const { return mSimd; }

-    union {
-        struct
-        {
-            float a, b, c, d;
-        };
-        float p[4];
-        CVector3f vec;
-#ifdef __SSE__
-        __m128 mVec128;
-#endif
-    };
+  zeus::simd<float>::reference operator[](size_t idx) { // writable lane access; returns simd's reference type instead of the old float&
+    assert(idx < 4); // only checked when asserts are enabled
+    return mSimd[idx];
+  }
+
+  float operator[](size_t idx) const { // read-only lane access, now by value instead of const float&
+    assert(idx < 4);
+    return mSimd[idx];
+  }
+
+  float x() const { return mSimd[0]; } // lane 0 (the removed union called this `a`)
+  float y() const { return mSimd[1]; } // lane 1 (formerly `b`)
+  float z() const { return mSimd[2]; } // lane 2 (formerly `c`)
+  float d() const { return mSimd[3]; } // lane 3
+
+  simd<float>::reference x() { return mSimd[0]; } // writable counterparts of the four getters above
+  simd<float>::reference y() { return mSimd[1]; }
+  simd<float>::reference z() { return mSimd[2]; }
+  simd<float>::reference d() { return mSimd[3]; }
+
+  zeus::simd<float> mSimd;
 };
 }

--- a/include/zeus/CProjection.hpp
+++ b/include/zeus/CProjection.hpp
@ -6,121 +6,117 @@
 #include <cstdio>
 #include <cmath>

-namespace zeus
-{
-enum class EProjType
-{
-    None = 0,
-    Orthographic = 1,
-    Perspective = 2
+namespace zeus {
+enum class EProjType {
+  None = 0,
+  Orthographic = 1,
+  Perspective = 2
 };

-class SProjOrtho
-{
+class SProjOrtho { // parameter block for an orthographic projection
 public:
-    float top, bottom, left, right, znear, zfar;
-    explicit SProjOrtho(float p_top = 1.0f, float p_bottom = -1.0f, float p_left = -1.0f, float p_right = 1.0f,
-                        float p_near = 1.0f, float p_far = -1.0f)
-    : top(p_top), bottom(p_bottom), left(p_left), right(p_right), znear(p_near), zfar(p_far)
-    {
-    }
+  float top, bottom, left, right, znear, zfar; // six plain floats, all public
+
+  explicit SProjOrtho(float p_top = 1.0f, float p_bottom = -1.0f, float p_left = -1.0f, float p_right = 1.0f, // every argument has a default
+                      float p_near = 1.0f, float p_far = -1.0f) // defaults: top/right/near = 1, bottom/left/far = -1
+    : top(p_top), bottom(p_bottom), left(p_left), right(p_right), znear(p_near), zfar(p_far) {
+  }
 };
-struct SProjPersp
-{
-    float fov, aspect, znear, zfar;
-    SProjPersp(float p_fov = degToRad(55.0f), float p_aspect = 1.0f, float p_near = 0.1f, float p_far = 4096.f)
-    : fov(p_fov), aspect(p_aspect), znear(p_near), zfar(p_far)
-    {
-    }
+
+struct SProjPersp { // parameter block for a perspective projection
+  float fov, aspect, znear, zfar;
+
+  SProjPersp(float p_fov = degToRad(55.0f), float p_aspect = 1.0f, float p_near = 0.1f, float p_far = 4096.f) // fov default goes through degToRad, so presumably radians
+    : fov(p_fov), aspect(p_aspect), znear(p_near), zfar(p_far) { // unlike SProjOrtho's, this constructor is not explicit
+  }
 };
+
 extern const SProjOrtho kOrthoIdentity;

-class alignas(16) CProjection
-{
-    void _updateCachedMatrix();
+class CProjection {
+  void _updateCachedMatrix();

 public:
-    ZE_DECLARE_ALIGNED_ALLOCATOR();
+  CProjection() { // default: orthographic with default SProjOrtho values
+    m_projType = EProjType::Orthographic;
+    m_ortho = SProjOrtho();
+    m_mtx = CMatrix4f::skIdentityMatrix4f; // cached matrix is set to identity directly, not via _updateCachedMatrix()
+  }

-    CProjection()
-    {
-        m_projType = EProjType::Orthographic;
-        m_ortho = SProjOrtho();
-        m_mtx = CMatrix4f::skIdentityMatrix4f;
-    }
-    CProjection(const CProjection& other) { *this = other; }
-    CProjection(const SProjOrtho& ortho) { setOrtho(ortho); }
-    CProjection(const SProjPersp& persp) { setPersp(persp); }
+  CProjection(const CProjection& other) { *this = other; }

-    inline CProjection& operator=(const CProjection& other)
-    {
-        if (this != &other)
-        {
-            m_projType = other.m_projType;
-            m_ortho = other.m_ortho;
-            m_mtx = other.m_mtx;
-        }
-        return *this;
-    }
+  CProjection(const SProjOrtho& ortho) { setOrtho(ortho); }

-    inline void setOrtho(const SProjOrtho& ortho)
-    {
-        m_projType = EProjType::Orthographic;
-        m_ortho = ortho;
-        _updateCachedMatrix();
-    }
-    inline void setPersp(const SProjPersp& persp)
-    {
-        m_projType = EProjType::Perspective;
-        m_persp = persp;
-        _updateCachedMatrix();
-    }
+  CProjection(const SProjPersp& persp) { setPersp(persp); }

-    inline EProjType getType() const { return m_projType; }
-    inline const SProjOrtho& getOrtho() const
-    {
-        if (m_projType != EProjType::Orthographic)
-        {
-            std::fprintf(stderr, "attempted to access orthographic structure of non-ortho projection");
-            std::abort();
-        }
-        return m_ortho;
-    }
-    inline const SProjPersp& getPersp() const
-    {
-        if (m_projType != EProjType::Perspective)
-        {
-            std::fprintf(stderr, "attempted to access perspective structure of non-persp projection");
-            std::abort();
-        }
-        return m_persp;
+  CProjection& operator=(const CProjection& other) { // copy assignment; also used by the copy constructor
+    if (this != &other) { // skip the copies on self-assignment
+      m_projType = other.m_projType;
+      m_ortho = other.m_ortho; // m_ortho shares a union with m_persp; NOTE(review): this single copy covers m_persp only while SProjOrtho (6 floats) is not smaller than SProjPersp (4)
+      m_mtx = other.m_mtx;
    }
+    return *this;
+  }

-    inline const CMatrix4f& getCachedMatrix() const { return m_mtx; }
+  void setOrtho(const SProjOrtho& ortho) { // switch to orthographic parameters and rebuild the cached matrix
+    m_projType = EProjType::Orthographic;
+    m_ortho = ortho;
+    _updateCachedMatrix(); // called after type and parameters are both in place
+  }
+
+  void setPersp(const SProjPersp& persp) { // switch to perspective parameters and rebuild the cached matrix
+    m_projType = EProjType::Perspective;
+    m_persp = persp;
+    _updateCachedMatrix(); // called after type and parameters are both in place
+  }
+
+  EProjType getType() const { return m_projType; }
+
+  const SProjOrtho& getOrtho() const { // returns m_ortho; the type check below now exists only when NDEBUG is not defined
+#ifndef NDEBUG
+    if (m_projType != EProjType::Orthographic) {
+      std::fprintf(stderr, "attempted to access orthographic structure of non-ortho projection"); // message has no trailing newline
+      std::abort();
+    }
+#endif
+    return m_ortho; // with NDEBUG defined this is returned even for a perspective projection
+  }
+
+  const SProjPersp& getPersp() const { // returns m_persp; the type check below now exists only when NDEBUG is not defined
+#ifndef NDEBUG
+    if (m_projType != EProjType::Perspective) {
+      std::fprintf(stderr, "attempted to access perspective structure of non-persp projection"); // message has no trailing newline
+      std::abort();
+    }
+#endif
+    return m_persp; // with NDEBUG defined this is returned even for an orthographic projection
+  }
+
+  const CMatrix4f& getCachedMatrix() const { return m_mtx; }

 protected:
-    /* Projection type */
-    EProjType m_projType;
+  /* Projection type */
+  EProjType m_projType;

-    /* Projection intermediate */
-    union {
+  /* Projection intermediate */
+  union {
 #ifdef _MSC_VER
-        struct
-        {
-            SProjOrtho m_ortho;
-        };
-        struct
-        {
-            SProjPersp m_persp;
-        };
-#else
+    struct
+    {
        SProjOrtho m_ortho;
-        SProjPersp m_persp;
-#endif
    };
+    struct
+    {
+        SProjPersp m_persp;
+    };
+#else
+    SProjOrtho m_ortho;
+    SProjPersp m_persp;
+#endif
+  };

-    /* Cached projection matrix */
-    CMatrix4f m_mtx;
+  /* Cached projection matrix */
+  CMatrix4f m_mtx;
 };
 }

--- a/include/zeus/CQuaternion.hpp
+++ b/include/zeus/CQuaternion.hpp
@ -8,262 +8,303 @@
 #include "zeus/Math.hpp"
 #include "zeus/CRelAngle.hpp"
 #include "zeus/CTransform.hpp"
+
 #if ZE_ATHENA_TYPES
+
 #include <athena/IStreamReader.hpp>
+
 #endif

-namespace zeus
-{
+namespace zeus {

-static inline float normalize_angle(float angle)
-{
-    if (angle > M_PIF)
-        angle -= 2.f * M_PIF;
-    else if (angle < -M_PIF)
-        angle += 2.f * M_PIF;
+static inline float normalize_angle(float angle) { // `inline` kept: header-defined static, avoids unused-function warnings per TU
+  if (angle > M_PIF) // above +pi: pull back by one full turn
+    angle -= 2.f * M_PIF;
+  else if (angle < -M_PIF) // below -pi: push forward by one full turn
+    angle += 2.f * M_PIF;

-    return angle;
+  return angle; // only a single turn is applied, so inputs beyond 3*pi in magnitude stay outside [-pi, pi]
 }

 class CNUQuaternion;

 /** Unit quaternion, used for all quaternion arithmetic */
-class alignas(16) CQuaternion
-{
-#if __atdna__ && ZE_ATHENA_TYPES
-    float clangVec __attribute__((__vector_size__(16)));
-#endif
+class CQuaternion {
 public:
-    ZE_DECLARE_ALIGNED_ALLOCATOR();
+  CQuaternion() : mSimd(1.f, 0.f, 0.f, 0.f) {}
+
+  CQuaternion(float wi, float xi, float yi, float zi) : mSimd(wi, xi, yi, zi) {}
+
+  CQuaternion(float xi, float yi, float zi) { fromVector3f(CVector3f(xi, yi, zi)); }
+
+  CQuaternion(float wi, const CVector3f& vec) : mSimd(vec.mSimd.shuffle<0, 0, 1, 2>()) { // lanes 1-3 <- vec lanes 0-2; lane 0 is a placeholder
+    mSimd[0] = wi; // placeholder replaced by the w component
+  }
+
+  template <typename T>
+  CQuaternion(const simd<T>& s) : mSimd(s) {}

-    CQuaternion() : w(1.0f), x(0.0f), y(0.0f), z(0.0f) {}
-    CQuaternion(float wi, float xi, float yi, float zi) : w(wi), x(xi), y(yi), z(zi) {}
-    CQuaternion(float xi, float yi, float zi) { fromVector3f(CVector3f(xi, yi, zi)); }
-    CQuaternion(float wi, const CVector3f& vec) : w(wi), x(vec.x), y(vec.y), z(vec.z) {}
 #if ZE_ATHENA_TYPES
-    inline void readBig(athena::io::IStreamReader& input)
-    {
-        w = input.readFloatBig();
-        x = input.readFloatBig();
-        y = input.readFloatBig();
-        z = input.readFloatBig();
-    }
-    CQuaternion(const atVec4f& vec)
-    {
-#if __SSE__
-        mVec128 = vec.mVec128;
-#else
-        x = vec.vec[1];
-        y = vec.vec[2];
-        z = vec.vec[3];
-        w = vec.vec[0];
-#endif
-    }

-    operator atVec4f&()
-    {
-        return *reinterpret_cast<atVec4f*>(v);
-    }
-    operator const atVec4f&() const
-    {
-        return *reinterpret_cast<const atVec4f*>(v);
-    }
+  void readBig(athena::io::IStreamReader& input) { // read four floats via readFloatBig, in lane order 0..3
+    simd_floats f; // staging buffer so mSimd is written once
+    f[0] = input.readFloatBig(); // lane 0 is w, per the accessors of this class
+    f[1] = input.readFloatBig(); // x
+    f[2] = input.readFloatBig(); // y
+    f[3] = input.readFloatBig(); // z
+    mSimd.copy_from(f);
+  }
+
+  CQuaternion(const atVec4f& vec) : mSimd(vec.simd) {}
+
+  operator atVec4f&() { // view this quaternion as an atVec4f without copying
+    return *reinterpret_cast<atVec4f*>(this); // NOTE(review): assumes CQuaternion and atVec4f share a layout — confirm
+  }
+
+  operator const atVec4f&() const { // const counterpart of the conversion above
+    return *reinterpret_cast<const atVec4f*>(this);
+  }

 #endif
-    
-    CQuaternion(const CMatrix3f& mat);
-    CQuaternion(const CVector3f& vec) { fromVector3f(vec); }
-    CQuaternion(const CVector4f& vec)
-    {
-#if __SSE__
-        mVec128 = vec.mVec128;
-#else
-        x = vec[1];
-        y = vec[2];
-        z = vec[3];
-        w = vec[0];
-#endif
-    }

-    CQuaternion(const CVector3f& vecA, const CVector3f& vecB)
-    {
-        CVector3f vecAN = vecA.normalized();
-        CVector3f vecBN = vecB.normalized();
-        CVector3f w = vecAN.cross(vecBN);
-        *this = CQuaternion(1.f + vecAN.dot(vecBN), w.x, w.y, w.z).normalized();
-    }
+  CQuaternion(const CMatrix3f& mat);

-    void fromVector3f(const CVector3f& vec);
+  CQuaternion(const CVector3f& vec) { fromVector3f(vec); }

-    CQuaternion& operator=(const CQuaternion& q);
-    CQuaternion operator+(const CQuaternion& q) const;
-    CQuaternion operator-(const CQuaternion& q) const;
-    CQuaternion operator*(const CQuaternion& q) const;
-    CQuaternion operator/(const CQuaternion& q) const;
-    CQuaternion operator*(float scale) const;
-    CQuaternion operator/(float scale) const;
-    CQuaternion operator-() const;
-    const CQuaternion& operator+=(const CQuaternion& q);
-    const CQuaternion& operator-=(const CQuaternion& q);
-    const CQuaternion& operator*=(const CQuaternion& q);
-    const CQuaternion& operator*=(float scale);
-    const CQuaternion& operator/=(float scale);
-    float magnitude() const { return std::sqrt(magSquared()); }
-    float magSquared() const { return w * w + x * x + y * y + z * z; }
-    void normalize() { *this /= magnitude(); }
-    CQuaternion normalized() const { return *this / magnitude(); }
-    void invert();
-    CQuaternion inverse() const;
+  CQuaternion(const CVector4f& vec) : mSimd(vec.mSimd) {}

-    /**
-     * @brief Set the rotation using axis angle notation
-     * @param axis The axis to rotate around
-     * @param angle The magnitude of the rotation in radians
-     * @return
-     */
-    static inline CQuaternion fromAxisAngle(const CUnitVector3f& axis, const CRelAngle& angle)
-    {
-        return CQuaternion(std::cos(angle / 2.f), axis * std::sin(angle / 2.f));
-    }
+  CQuaternion(const CVector3f& vecA, const CVector3f& vecB) { // built from two directions; both inputs are normalized first
+    CVector3f vecAN = vecA.normalized();
+    CVector3f vecBN = vecB.normalized();
+    CVector3f w = vecAN.cross(vecBN); // vector part; this local `w` is not the quaternion's w component
+    *this = CQuaternion(1.f + vecAN.dot(vecBN), w).normalized(); // NOTE(review): exactly opposite inputs give an all-zero quaternion before normalizing
+  }

-    void rotateX(const CRelAngle& angle) { *this *= fromAxisAngle({1.0f, 0.0f, 0.0f}, angle); }
-    void rotateY(const CRelAngle& angle) { *this *= fromAxisAngle({0.0f, 1.0f, 0.0f}, angle); }
-    void rotateZ(const CRelAngle& angle) { *this *= fromAxisAngle({0.0f, 0.0f, 1.0f}, angle); }
+  void fromVector3f(const CVector3f& vec);

-    static inline CVector3f rotate(const CQuaternion& rotation, const CAxisAngle& v)
-    {
-        CQuaternion q = rotation * v;
-        q *= rotation.inverse();
+  CQuaternion& operator=(const CQuaternion& q);

-        return {q.x, q.y, q.z};
-    }
+  CQuaternion operator+(const CQuaternion& q) const;

-    static CQuaternion lookAt(const CUnitVector3f& source, const CUnitVector3f& dest, const CRelAngle& maxAng);
+  CQuaternion operator-(const CQuaternion& q) const;

-    CVector3f transform(const CVector3f& v) const
-    {
-        CQuaternion r(0.f, v);
-        return (*this * r * inverse()).getImaginary();
-    }
+  CQuaternion operator*(const CQuaternion& q) const;

-    CQuaternion log() const;
+  CQuaternion operator/(const CQuaternion& q) const;

-    CQuaternion exp() const;
+  CQuaternion operator*(float scale) const;

-    inline CTransform toTransform() const { return CTransform(CMatrix3f(*this)); }
-    inline CTransform toTransform(const zeus::CVector3f& origin) const { return CTransform(CMatrix3f(*this), origin); }
-    inline float dot(const CQuaternion& rhs) const
-    {
-#if __SSE__
-        TVectorUnion result;
-#if __SSE4_1__
-            result.mVec128 = _mm_dp_ps(mVec128, rhs.mVec128, 0xF1);
-            return result.v[0];
-#else
-        result.mVec128 = _mm_mul_ps(mVec128, rhs.mVec128);
-        return result.v[0] + result.v[1] + result.v[2] + result.v[3];
-#endif
-#else
-        return (x * rhs.x) + (y * rhs.y) + (z * rhs.z) + (w * rhs.w);
-#endif
-    }
+  CQuaternion operator/(float scale) const;

-    static CQuaternion lerp(const CQuaternion& a, const CQuaternion& b, double t);
-    static CQuaternion slerp(const CQuaternion& a, const CQuaternion& b, double t);
-    static CQuaternion slerpShort(const CQuaternion& a, const CQuaternion& b, double t);
-    static CQuaternion nlerp(const CQuaternion& a, const CQuaternion& b, double t);
-    static CQuaternion shortestRotationArc(const zeus::CVector3f& v0, const zeus::CVector3f& v1);
-    static CQuaternion clampedRotateTo(const zeus::CUnitVector3f& v0, const zeus::CUnitVector3f& v1,
-                                       const zeus::CRelAngle& angle);
+  CQuaternion operator-() const;

-    inline float roll() const { return std::atan2(2.f * (x * y + w * z), w * w + x * x - y * y - z * z); }
+  const CQuaternion& operator+=(const CQuaternion& q);

-    inline float pitch() const { return std::atan2(2.f * (y * z + w * x), w * w - x * x - y * y + z * z); }
+  const CQuaternion& operator-=(const CQuaternion& q);

-    inline float yaw() const { return std::asin(-2.f * (x * z - w * y)); }
+  const CQuaternion& operator*=(const CQuaternion& q);

-    CQuaternion buildEquivalent() const;
+  const CQuaternion& operator*=(float scale);

-    zeus::CVector3f getImaginary() const { return {x, y, z}; }
-    void setImaginary(const zeus::CVector3f& i) { x = i.x; y = i.y; z = i.z; }
+  const CQuaternion& operator/=(float scale);

-    CRelAngle angleFrom(const zeus::CQuaternion& other);
+  float magnitude() const { return std::sqrt(magSquared()); }

-    inline float& operator[](size_t idx) { assert(idx < 4); return (&w)[idx]; }
-    inline const float& operator[](size_t idx) const { assert(idx < 4); return (&w)[idx]; }
+  float magSquared() const { return mSimd.dot4(mSimd); }

-    union
-    {
-        __m128 mVec128;
-        struct
-        {
-            float w, x, y, z;
-        };
-        float v[4];
-    };
+  void normalize() { *this /= magnitude(); }

-    static const CQuaternion skNoRotation;
+  CQuaternion normalized() const { return *this / magnitude(); }

-    static CQuaternion fromNUQuaternion(const CNUQuaternion& q);
+  void invert();
+
+  CQuaternion inverse() const;
+
+  /**
+   * @brief Build a quaternion from axis-angle notation
+   * @param axis The axis to rotate around
+   * @param angle The magnitude of the rotation in radians
+   * @return Quaternion with w = cos(angle / 2) and vector part axis * sin(angle / 2)
+   */
+  static CQuaternion fromAxisAngle(const CUnitVector3f& axis, const CRelAngle& angle) {
+    return CQuaternion(std::cos(angle / 2.f), axis * std::sin(angle / 2.f));
+  }
+
+  void rotateX(const CRelAngle& angle) { *this *= fromAxisAngle({1.0f, 0.0f, 0.0f}, angle); }
+
+  void rotateY(const CRelAngle& angle) { *this *= fromAxisAngle({0.0f, 1.0f, 0.0f}, angle); }
+
+  void rotateZ(const CRelAngle& angle) { *this *= fromAxisAngle({0.0f, 0.0f, 1.0f}, angle); }
+
+  static CVector3f rotate(const CQuaternion& rotation, const CAxisAngle& v) { // computes rotation * v * rotation.inverse()
+    CQuaternion q = rotation * v; // `v` must convert to CQuaternion for this product
+    q *= rotation.inverse();
+
+    return {q.mSimd.shuffle<1, 2, 3, 3>()}; // lanes 1-3 (x, y, z) moved to the front; replaces the removed {q.x, q.y, q.z}
+  }
+
+  static CQuaternion lookAt(const CUnitVector3f& source, const CUnitVector3f& dest, const CRelAngle& maxAng);
+
+  CVector3f transform(const CVector3f& v) const { // rotate `v` by this quaternion
+    CQuaternion r(0.f, v); // `v` wrapped as a quaternion with w = 0
+    return (*this * r * inverse()).getImaginary(); // product q * r * q^-1, keeping only x, y, z
+  }
+
+  CQuaternion log() const;
+
+  CQuaternion exp() const;
+
+  CTransform toTransform() const { return CTransform(CMatrix3f(*this)); }
+
+  CTransform toTransform(const zeus::CVector3f& origin) const { return CTransform(CMatrix3f(*this), origin); }
+
+  float dot(const CQuaternion& rhs) const { // Dot product with rhs, computed by mSimd.dot4()
+    return mSimd.dot4(rhs.mSimd);
+  }
+
+  static CQuaternion lerp(const CQuaternion& a, const CQuaternion& b, double t);
+
+  static CQuaternion slerp(const CQuaternion& a, const CQuaternion& b, double t);
+
+  static CQuaternion slerpShort(const CQuaternion& a, const CQuaternion& b, double t);
+
+  static CQuaternion nlerp(const CQuaternion& a, const CQuaternion& b, double t);
+
+  static CQuaternion shortestRotationArc(const zeus::CVector3f& v0, const zeus::CVector3f& v1);
+
+  static CQuaternion clampedRotateTo(const zeus::CUnitVector3f& v0, const zeus::CUnitVector3f& v1,
+                                     const zeus::CRelAngle& angle);
+
+  float roll() const { // atan2(2(xy + wz), w^2 + x^2 - y^2 - z^2); std::atan2 yields radians
+    simd_floats f(mSimd); // scalar-indexable copy; f[0..3] presumably follow the w, x, y, z lane order - confirm
+    return std::atan2(2.f * (f[1] * f[2] + f[0] * f[3]), f[0] * f[0] + f[1] * f[1] - f[2] * f[2] - f[3] * f[3]);
+  }
+
+  float pitch() const { // atan2(2(yz + wx), w^2 - x^2 - y^2 + z^2); std::atan2 yields radians
+    simd_floats f(mSimd); // scalar-indexable copy; f[0..3] presumably follow the w, x, y, z lane order - confirm
+    return std::atan2(2.f * (f[2] * f[3] + f[0] * f[1]), f[0] * f[0] - f[1] * f[1] - f[2] * f[2] + f[3] * f[3]);
+  }
+
+  float yaw() const { // asin(-2(xz - wy)); std::asin returns NaN if the argument leaves [-1, 1]
+    simd_floats f(mSimd); // scalar-indexable copy; f[0..3] presumably follow the w, x, y, z lane order - confirm
+    return std::asin(-2.f * (f[1] * f[3] - f[0] * f[2]));
+  }
+
+  CQuaternion buildEquivalent() const;
+
+  zeus::CVector3f getImaginary() const { return mSimd.shuffle<1, 2, 3, 3>(); } // x/y/z (lanes 1..3) as a vector
+
+  void setImaginary(const zeus::CVector3f& i) { // Overwrites x, y and z from i; w is left as it was
+    x() = i.x();
+    y() = i.y();
+    z() = i.z();
+  }
+
+  CRelAngle angleFrom(const zeus::CQuaternion& other);
+
+  simd<float>::reference operator[](size_t idx) { // Writable lane access; indices 0..3 are w, x, y, z (see accessors)
+    assert(idx < 4); // debug-only bounds check (compiled out when NDEBUG is defined)
+    return mSimd[idx];
+  }
+
+  float operator[](size_t idx) const { // Read-only lane access; the value is returned by copy
+    assert(idx < 4);
+    return mSimd[idx];
+  }
+
+  float w() const { return mSimd[0]; } // By-value getters: w, x, y, z live in lanes 0..3
+  float x() const { return mSimd[1]; }
+  float y() const { return mSimd[2]; }
+  float z() const { return mSimd[3]; }
+
+  simd<float>::reference w() { return mSimd[0]; } // Non-const overloads hand out an assignable lane reference
+  simd<float>::reference x() { return mSimd[1]; }
+  simd<float>::reference y() { return mSimd[2]; }
+  simd<float>::reference z() { return mSimd[3]; }
+
+  simd<float> mSimd;
+
+  static const CQuaternion skNoRotation;
+
+  static CQuaternion fromNUQuaternion(const CNUQuaternion& q);
 };

 /** Non-unit quaternion, no guarantee that it's normalized.
 *  Converting to CQuaternion will perform normalize operation.
 */
-class alignas(16) CNUQuaternion
-{
+class CNUQuaternion {
 public:
-    CNUQuaternion() : w(1.0f), x(0.0f), y(0.0f), z(0.0f) {}
-    CNUQuaternion(float wi, float xi, float yi, float zi) : w(wi), x(xi), y(yi), z(zi) {}
-    CNUQuaternion(float win, const zeus::CVector3f& vec) { w = win; x = vec.x; y = vec.y; z = vec.z; }
-    CNUQuaternion(const CQuaternion& other) { w = other.w; x = other.x; y = other.y; z = other.z; }
-    CNUQuaternion(const CMatrix3f& mtx) : CNUQuaternion(CQuaternion(mtx)) {}
-    static inline CNUQuaternion fromAxisAngle(const CUnitVector3f& axis, const CRelAngle& angle)
-    {
-        return CNUQuaternion(CQuaternion::fromAxisAngle(axis, angle));
-    }
+  CNUQuaternion() : mSimd(1.f, 0.f, 0.f, 0.f) {} // Default value: w = 1, x = y = z = 0

-    float magnitude() const { return std::sqrt(magSquared()); }
-    float magSquared() const { return w * w + x * x + y * y + z * z; }
-    void normalize()
-    {
-        float magDiv = 1.f / magnitude();
-        w *= magDiv;
-        x *= magDiv;
-        y *= magDiv;
-        z *= magDiv;
-    }
-    CNUQuaternion normalized() const
-    {
-        float magDiv = 1.f / magnitude();
-        return { w * magDiv, x * magDiv, y * magDiv, z * magDiv };
-    }
+  CNUQuaternion(float wi, float xi, float yi, float zi) : mSimd(wi, xi, yi, zi) {} // Lanes are stored in w, x, y, z order

-    CNUQuaternion operator*(const CNUQuaternion& q) const;
-    CNUQuaternion operator*(float f) const;
-    const CNUQuaternion& operator+=(const CNUQuaternion& q);
+  CNUQuaternion(float win, const zeus::CVector3f& vec) : mSimd(vec.mSimd.shuffle<0, 0, 1, 2>()) {
+    w() = win; // shuffle presumably puts vec's x, y, z into lanes 1..3; lane 0 is then overwritten with the real w
+  }

-    inline float& operator[](size_t idx) { assert(idx < 4); return (&w)[idx]; }
-    inline const float& operator[](size_t idx) const { assert(idx < 4); return (&w)[idx]; }
+  CNUQuaternion(const CQuaternion& other) : mSimd(other.mSimd) {} // Copies the four lanes unchanged

-    union
-    {
-        __m128 mVec128;
-        struct
-        {
-            float w, x, y, z;
-        };
-    };
+  CNUQuaternion(const CMatrix3f& mtx) : CNUQuaternion(CQuaternion(mtx)) {} // Converts via CQuaternion(mtx)
+
+  CNUQuaternion(const simd<float>& s) : mSimd(s) {} // Adopts s as-is; non-explicit, so simd<float> converts implicitly
+
+  static CNUQuaternion fromAxisAngle(const CUnitVector3f& axis, const CRelAngle& angle) {
+    return CNUQuaternion(CQuaternion::fromAxisAngle(axis, angle)); // delegates to the CQuaternion factory
+  }
+
+  float magnitude() const { return std::sqrt(magSquared()); }
+
+  float magSquared() const { return mSimd.dot4(mSimd); } // dot4() of the lanes with themselves
+
+  void normalize() {
+    float magDiv = 1.f / magnitude(); // reciprocal magnitude; no guard against magnitude() == 0 is visible here
+    mSimd *= magDiv;
+  }
+
+  CNUQuaternion normalized() const {
+    float magDiv = 1.f / magnitude();
+    return mSimd * simd<float>(magDiv); // result converts back through the simd<float> constructor
+  }
+
+  CNUQuaternion operator*(const CNUQuaternion& q) const;
+
+  CNUQuaternion operator*(float f) const;
+
+  const CNUQuaternion& operator+=(const CNUQuaternion& q);
+
+  zeus::simd<float>::reference operator[](size_t idx) {
+    assert(idx < 4); // debug-only bounds check (compiled out when NDEBUG is defined)
+    return mSimd[idx];
+  }
+
+  float operator[](size_t idx) const {
+    assert(idx < 4);
+    return mSimd[idx];
+  }
+
+  float w() const { return mSimd[0]; } // By-value getters: w, x, y, z live in lanes 0..3
+  float x() const { return mSimd[1]; }
+  float y() const { return mSimd[2]; }
+  float z() const { return mSimd[3]; }
+
+  simd<float>::reference w() { return mSimd[0]; } // Non-const overloads hand out an assignable lane reference
+  simd<float>::reference x() { return mSimd[1]; }
+  simd<float>::reference y() { return mSimd[2]; }
+  simd<float>::reference z() { return mSimd[3]; }
+
+  simd<float> mSimd; // the four components, in w, x, y, z lane order
 };

-inline CQuaternion CQuaternion::fromNUQuaternion(const CNUQuaternion& q)
-{
-    auto norm = q.normalized();
-    return { norm.w, norm.x, norm.y, norm.z };
+inline CQuaternion CQuaternion::fromNUQuaternion(const CNUQuaternion& q) { // Normalizing conversion to CQuaternion
+  auto norm = q.normalized();
+  return norm.mSimd; // relies on an implicit simd<float> -> CQuaternion conversion declared outside this view
 }

 CQuaternion operator+(float lhs, const CQuaternion& rhs);
+
 CQuaternion operator-(float lhs, const CQuaternion& rhs);
+
 CQuaternion operator*(float lhs, const CQuaternion& rhs);
+
 CNUQuaternion operator*(float lhs, const CNUQuaternion& rhs);
 }
--- a/include/zeus/CRectangle.hpp
+++ b/include/zeus/CRectangle.hpp
@ -1,32 +1,30 @@
 #pragma once
+
 #include "zeus/CVector2f.hpp"

-namespace zeus
-{
-class CRectangle
-{
+namespace zeus {
+class CRectangle {
 public:
-    CRectangle() {}
-    CRectangle(float x, float y, float w, float h) : position(x, y), size(w, h) {}
+  CRectangle() {} // position and size are default-constructed

-    inline bool contains(const CVector2f& point) const
-    {
-        if (point.x < position.x || point.x > position.x + size.x)
-            return false;
-        if (point.y < position.y || point.y > position.y + size.y)
-            return false;
+  CRectangle(float x, float y, float w, float h) : position(x, y), size(w, h) {} // (x, y) -> position, (w, h) -> size

-        return true;
-    }
+  bool contains(const CVector2f& point) const { // Inclusive test: a point exactly on an edge counts as inside
+    if (point.x() < position.x() || point.x() > position.x() + size.x())
+      return false;
+    if (point.y() < position.y() || point.y() > position.y() + size.y())
+      return false;

-    inline bool intersects(const CRectangle& rect) const
-    {
-        return !(position.x > rect.position.x + rect.size.x || rect.position.x > position.x + size.x ||
-                 position.y > rect.position.y + rect.size.y || rect.position.y > position.y + size.y);
-    }
+    return true;
+  }

-    CVector2f position;
-    CVector2f size;
+  bool intersects(const CRectangle& rect) const { // True unless separated on x or y; touching edges still intersect
+    return !(position.x() > rect.position.x() + rect.size.x() || rect.position.x() > position.x() + size.x() ||
+             position.y() > rect.position.y() + rect.size.y() || rect.position.y() > position.y() + size.y());
+  }
+
+  CVector2f position; // corner the extents are measured from
+  CVector2f size; // extent added to position along x and y
 };
 }

--- a/include/zeus/CRelAngle.hpp
+++ b/include/zeus/CRelAngle.hpp
@ -4,51 +4,92 @@
 #include "zeus/Math.hpp"
 #include <cmath>

-namespace zeus
-{
+namespace zeus {
 /**
 * @brief The CRelAngle class represents relative angle in radians
 */
-struct CRelAngle
-{
-    float angle = 0.f;
+struct CRelAngle {
+  float angle = 0.f; // stored value, in radians

-    static float MakeRelativeAngle(float angle)
-    {
-        float absAngle = std::fabs(angle);
-        if (absAngle == 2.f * M_PIF)
-            return std::copysign(absAngle, angle);
-        float ret = absAngle - std::floor(absAngle / (2.f * M_PIF)) * (2.f * M_PIF);
-        return std::copysign(ret, angle);
-    }
+  static float MakeRelativeAngle(float angle) { // Reduces |angle| modulo 2 * M_PIF, then restores the input's sign
+    float absAngle = std::fabs(angle);
+    if (absAngle == 2.f * M_PIF) // a magnitude of exactly 2 * M_PIF is returned unchanged instead of wrapping to 0
+      return std::copysign(absAngle, angle);
+    float ret = absAngle - std::floor(absAngle / (2.f * M_PIF)) * (2.f * M_PIF);
+    return std::copysign(ret, angle);
+  }

-    CRelAngle() = default;
-    CRelAngle(float angle) : angle(MakeRelativeAngle(angle)) {}
-    CRelAngle& operator=(float ang) { angle = MakeRelativeAngle(ang); return *this; }
-    CRelAngle& operator=(const CRelAngle& ang) { angle = ang.angle; return *this; }
-    float asDegrees() const { return radToDeg(angle); }
-    float asRadians() const { return angle; }
-    float arcCosine() const { return std::acos(angle); }
+  CRelAngle() = default;

-    static CRelAngle FromDegrees(float angle)
-    {
-        CRelAngle ret;
-        ret.angle = MakeRelativeAngle(degToRad(angle));
-        return ret;
-    }
+  CRelAngle(float angle) : angle(MakeRelativeAngle(angle)) {} // implicit from float; the value is wrapped first

-    operator float() const { return angle; }
-    static CRelAngle FromRadians(float angle) { return CRelAngle(angle); }
+  CRelAngle& operator=(float ang) {
+    angle = MakeRelativeAngle(ang); // raw floats are wrapped on assignment
+    return *this;
+  }

-    bool operator <(const CRelAngle& other) const { return angle < other.angle; }
-    CRelAngle& operator +=(const CRelAngle& other) { angle = MakeRelativeAngle(angle + other.angle); return *this; }
-    CRelAngle& operator +=(float r) { angle = MakeRelativeAngle(angle + r); return *this; }
-    CRelAngle& operator -=(const CRelAngle& other) { angle = MakeRelativeAngle(angle - other.angle); return *this; }
-    CRelAngle& operator -=(float r) { angle = MakeRelativeAngle(angle - r); return *this; }
-    CRelAngle& operator *=(const CRelAngle& other) { angle = MakeRelativeAngle(angle * other.angle); return *this; }
-    CRelAngle& operator *=(float r) { angle = MakeRelativeAngle(angle * r); return *this; }
-    CRelAngle& operator /=(const CRelAngle& other) { angle = MakeRelativeAngle(angle / other.angle); return *this; }
-    CRelAngle& operator /=(float r) { angle = MakeRelativeAngle(angle / r); return *this; }
+  CRelAngle& operator=(const CRelAngle& ang) {
+    angle = ang.angle; // copied as stored; not passed through MakeRelativeAngle again
+    return *this;
+  }
+
+  float asDegrees() const { return radToDeg(angle); }
+
+  float asRadians() const { return angle; }
+
+  float arcCosine() const { return std::acos(angle); } // feeds the stored value itself to std::acos (NaN if |angle| > 1)
+
+  static CRelAngle FromDegrees(float angle) {
+    CRelAngle ret;
+    ret.angle = MakeRelativeAngle(degToRad(angle)); // degrees -> radians via degToRad, then wrapped
+    return ret;
+  }
+
+  operator float() const { return angle; } // implicit conversion to the stored float
+
+  static CRelAngle FromRadians(float angle) { return CRelAngle(angle); }
+
+  bool operator<(const CRelAngle& other) const { return angle < other.angle; } // compares the stored values directly
+
+  CRelAngle& operator+=(const CRelAngle& other) {
+    angle = MakeRelativeAngle(angle + other.angle); // every compound operator below re-wraps its result the same way
+    return *this;
+  }
+
+  CRelAngle& operator+=(float r) {
+    angle = MakeRelativeAngle(angle + r);
+    return *this;
+  }
+
+  CRelAngle& operator-=(const CRelAngle& other) {
+    angle = MakeRelativeAngle(angle - other.angle);
+    return *this;
+  }
+
+  CRelAngle& operator-=(float r) {
+    angle = MakeRelativeAngle(angle - r);
+    return *this;
+  }
+
+  CRelAngle& operator*=(const CRelAngle& other) {
+    angle = MakeRelativeAngle(angle * other.angle); // multiplies the two stored values
+    return *this;
+  }
+
+  CRelAngle& operator*=(float r) {
+    angle = MakeRelativeAngle(angle * r);
+    return *this;
+  }
+
+  CRelAngle& operator/=(const CRelAngle& other) {
+    angle = MakeRelativeAngle(angle / other.angle); // no check for a zero divisor
+    return *this;
+  }
+
+  CRelAngle& operator/=(float r) {
+    angle = MakeRelativeAngle(angle / r); // no check for a zero divisor
+    return *this;
+  }
 };
 }

--- a/include/zeus/CSphere.hpp
+++ b/include/zeus/CSphere.hpp
@ -2,25 +2,20 @@

 #include "zeus/CVector3f.hpp"

-namespace zeus
-{
-class alignas(16) CSphere
-{
+namespace zeus {
+class CSphere {
 public:
-    ZE_DECLARE_ALIGNED_ALLOCATOR();
+  CSphere(const CVector3f& position, float radius) : position(position), radius(radius) {} // stores both as given

-    CSphere(const CVector3f& position, float radius) : position(position), radius(radius) {}
+  CVector3f getSurfaceNormal(const CVector3f& coord) const { return (coord - position).normalized(); }

-    inline CVector3f getSurfaceNormal(const CVector3f& coord) const { return (coord - position).normalized(); }
+  bool intersects(const CSphere& other) const { // const: only reads members, so const spheres can be tested too
+    float dist = (position - other.position).magnitude(); // distance between the two centers
+    return dist < (radius + other.radius); // strict comparison: spheres that exactly touch do not intersect
+  }

-    inline bool intersects(const CSphere& other)
-    {
-        float dist = (position - other.position).magnitude();
-        return dist < (radius + other.radius);
-    }
-
-    CVector3f position;
-    float radius;
+  CVector3f position; // center of the sphere
+  float radius;
 };
 }

--- a/include/zeus/CTransform.hpp
+++ b/include/zeus/CTransform.hpp
@ -8,273 +8,277 @@
 #include <cstdint>
 #include <cstdio>

-namespace zeus
-{
-class alignas(16) CTransform
-{
+namespace zeus {
+class CTransform {
 public:
-    ZE_DECLARE_ALIGNED_ALLOCATOR();
+  CTransform() : basis(false) {} // NOTE(review): what the bool selects is defined by CMatrix3f (not visible here)
+
+  CTransform(const CMatrix3f& basis, const CVector3f& offset = CVector3f::skZero)
+  : basis(basis), origin(offset) {} // offset becomes origin; it defaults to CVector3f::skZero

-    CTransform() : basis(false) {}
-    CTransform(const CMatrix3f& basis, const CVector3f& offset = CVector3f::skZero) : basis(basis), origin(offset) {}
 #if ZE_ATHENA_TYPES
-    CTransform(const atVec4f* mtx) : basis(mtx[0], mtx[1], mtx[2]), origin(mtx[0].vec[3], mtx[1].vec[3], mtx[2].vec[3]) {}

-    void read34RowMajor(athena::io::IStreamReader& r)
-    {
-        atVec4f r0 = r.readVec4fBig();
-        atVec4f r1 = r.readVec4fBig();
-        atVec4f r2 = r.readVec4fBig();
-        basis = CMatrix3f(r0, r1, r2);
-        basis.transpose();
-        origin = CVector3f(r0.vec[3], r1.vec[3], r2.vec[3]);
-    }
+  CTransform(const atVec4f* mtx) // mtx must point at three or more atVec4f entries; indices 0..2 are read
+  : basis(mtx[0], mtx[1], mtx[2])
+  , origin(mtx[0].simd[3], mtx[1].simd[3], mtx[2].simd[3]) {} // origin = 4th lane of each of the three entries
+
+  void read34RowMajor(athena::io::IStreamReader& r) { // Loads basis and origin from three readVec4fBig() rows
+    atVec4f r0 = r.readVec4fBig();
+    atVec4f r1 = r.readVec4fBig();
+    atVec4f r2 = r.readVec4fBig();
+    basis = CMatrix3f(r0, r1, r2);
+    basis.transpose(); // presumably converts the row-major input into the layout basis uses - confirm
+    origin = CVector3f(r0.simd[3], r1.simd[3], r2.simd[3]); // 4th lane of each row is the translation
+  }
+
 #endif

-    /* Column constructor */
-    CTransform(const CVector3f& c0, const CVector3f& c1, const CVector3f& c2, const CVector3f& c3)
-    : basis(c0, c1, c2), origin(c3) {}
+  /* Column constructor: c0, c1, c2 are handed to basis; c3 becomes origin */
+  CTransform(const CVector3f& c0, const CVector3f& c1, const CVector3f& c2, const CVector3f& c3)
+  : basis(c0, c1, c2), origin(c3) {}

-    static inline CTransform Identity() { return CTransform(CMatrix3f::skIdentityMatrix3f); }
+  static CTransform Identity() { // Identity basis; origin takes the constructor default (CVector3f::skZero)
+    return CTransform(CMatrix3f::skIdentityMatrix3f);
+  }

-    inline bool operator ==(const CTransform& other) const
-    {
-        return origin == other.origin && basis == other.basis;
-    }
+  bool operator==(const CTransform& other) const { // Equal when origin and basis both compare equal via operator==
+    return origin == other.origin && basis == other.basis;
+  }

-    inline CTransform operator*(const CTransform& rhs) const
-    {
-        return CTransform(basis * rhs.basis, origin + (basis * rhs.origin));
-    }
+  CTransform operator*(const CTransform& rhs) const { // Composition: the result maps v to (*this) * (rhs * v)
+    return CTransform(basis * rhs.basis, origin + (basis * rhs.origin));
+  }

-    inline CTransform inverse() const
-    {
-        CMatrix3f inv = basis.inverted();
-        return CTransform(inv, inv * -origin);
-    }
+  CTransform inverse() const { // Built from basis.inverted(); *this is not modified
+    CMatrix3f inv = basis.inverted();
+    return CTransform(inv, inv * -origin); // new origin = inverted basis applied to the negated origin
+  }

-    static inline CTransform Translate(const CVector3f& position) { return {CMatrix3f::skIdentityMatrix3f, position}; }
+  static CTransform Translate(const CVector3f& position) { // Identity basis with origin = position
+    return {CMatrix3f::skIdentityMatrix3f, position};
+  }

-    static inline CTransform Translate(float x, float y, float z) { return Translate({x, y, z}); }
+  static CTransform Translate(float x, float y, float z) { // Convenience overload; forwards to Translate(CVector3f)
+    return Translate({x, y, z});
+  }

-    inline CTransform operator+(const CVector3f& other) { return CTransform(basis, origin + other); }
+  CTransform operator+(const CVector3f& other) const { // Returns a copy translated by +other; *this is not modified
+    return CTransform(basis, origin + other);
+  }

-    inline CTransform& operator+=(const CVector3f& other)
-    {
-        origin += other;
-        return *this;
-    }
+  CTransform& operator+=(const CVector3f& other) { // Translates in place; basis is not touched
+    origin += other;
+    return *this;
+  }

-    inline CTransform operator-(const CVector3f& other) { return CTransform(basis, origin - other); }
+  CTransform operator-(const CVector3f& other) const { // Returns a copy translated by -other; *this is not modified
+    return CTransform(basis, origin - other);
+  }

-    inline CTransform& operator-=(const CVector3f& other)
-    {
-        origin -= other;
-        return *this;
-    }
+  CTransform& operator-=(const CVector3f& other) { // Translates in place by -other; basis is not touched
+    origin -= other;
+    return *this;
+  }

-    inline zeus::CVector3f rotate(const CVector3f& vec) const { return basis * vec; }
+  zeus::CVector3f rotate(const CVector3f& vec) const { // Applies only basis to vec; origin is not added
+    return basis * vec;
+  }

-    static inline CTransform RotateX(float theta)
-    {
-        float sinT = std::sin(theta);
-        float cosT = std::cos(theta);
-        return CTransform(CMatrix3f(TVectorUnion{{1.f, 0.f, 0.f, 0.f}},
-                                    TVectorUnion{{0.f, cosT, sinT, 0.f}},
-                                    TVectorUnion{{0.f, -sinT, cosT, 0.f}}));
-    }
+  static CTransform RotateX(float theta) { // Rotation about X by theta radians; origin is the constructor default
+    float sinT = std::sin(theta);
+    float cosT = std::cos(theta);
+    return CTransform(CMatrix3f(simd<float>{1.f, 0.f, 0.f, 0.f}, // one basis vector per simd; the 4th lane is always 0
+                                simd<float>{0.f, cosT, sinT, 0.f},
+                                simd<float>{0.f, -sinT, cosT, 0.f}));
+  }

-    static inline CTransform RotateY(float theta)
-    {
-        float sinT = std::sin(theta);
-        float cosT = std::cos(theta);
-        return CTransform(CMatrix3f(TVectorUnion{{cosT, 0.f, -sinT, 0.f}},
-                                    TVectorUnion{{0.f, 1.f, 0.f, 0.f}},
-                                    TVectorUnion{{sinT, 0.f, cosT, 0.f}}));
-    }
+  static CTransform RotateY(float theta) { // Rotation about Y by theta radians; origin is the constructor default
+    float sinT = std::sin(theta);
+    float cosT = std::cos(theta);
+    return CTransform(CMatrix3f(simd<float>{cosT, 0.f, -sinT, 0.f}, // one basis vector per simd; the 4th lane is always 0
+                                simd<float>{0.f, 1.f, 0.f, 0.f},
+                                simd<float>{sinT, 0.f, cosT, 0.f}));
+  }

-    static inline CTransform RotateZ(float theta)
-    {
-        float sinT = std::sin(theta);
-        float cosT = std::cos(theta);
-        return CTransform(CMatrix3f(TVectorUnion{{cosT, sinT, 0.f, 0.f}},
-                                    TVectorUnion{{-sinT, cosT, 0.f, 0.f}},
-                                    TVectorUnion{{0.f, 0.f, 1.f, 0.f}}));
-    }
+  static CTransform RotateZ(float theta) { // Rotation about Z by theta radians; origin is the constructor default
+    float sinT = std::sin(theta);
+    float cosT = std::cos(theta);
+    return CTransform(CMatrix3f(simd<float>{cosT, sinT, 0.f, 0.f}, // one basis vector per simd; the 4th lane is always 0
+                                simd<float>{-sinT, cosT, 0.f, 0.f},
+                                simd<float>{0.f, 0.f, 1.f, 0.f}));
+  }

-    inline void rotateLocalX(float theta)
-    {
-        float sinT = std::sin(theta);
-        float cosT = std::cos(theta);
+  void rotateLocalX(float theta) { // In place: b1' = cos*b1 + sin*b2, b2' = cos*b2 - sin*b1 (b = basis); basis[0] kept
+    float sinT = std::sin(theta);
+    float cosT = std::cos(theta);

-        zeus::CVector3f b2 = basis[2] * sinT;
-        zeus::CVector3f b1 = basis[1] * sinT;
-        zeus::CVector3f cosV(cosT);
+    zeus::CVector3f b2 = basis[2] * sinT; // sin-scaled copies are taken before basis[1] / basis[2] are overwritten
+    zeus::CVector3f b1 = basis[1] * sinT;
+    zeus::CVector3f cosV(cosT); // presumably broadcasts cosT to every component - confirm against CVector3f(float)

-        basis[1] *= cosV;
-        basis[2] *= cosV;
+    basis[1] *= cosV;
+    basis[2] *= cosV;

-        basis[1] += b2;
-        basis[2] -= b1;
-    }
+    basis[1] += b2;
+    basis[2] -= b1;
+  }

-    inline void rotateLocalY(float theta)
-    {
-        float sinT = std::sin(theta);
-        float cosT = std::cos(theta);
+  void rotateLocalY(float theta) { // In place: b2' = cos*b2 + sin*b0, b0' = cos*b0 - sin*b2 (b = basis); basis[1] kept
+    float sinT = std::sin(theta);
+    float cosT = std::cos(theta);

-        zeus::CVector3f b0 = basis[0] * sinT;
-        zeus::CVector3f b2 = basis[2] * sinT;
-        zeus::CVector3f cosV(cosT);
+    zeus::CVector3f b0 = basis[0] * sinT; // sin-scaled copies are taken before basis[0] / basis[2] are overwritten
+    zeus::CVector3f b2 = basis[2] * sinT;
+    zeus::CVector3f cosV(cosT); // presumably broadcasts cosT to every component - confirm against CVector3f(float)

-        basis[0] *= cosV;
-        basis[2] *= cosV;
+    basis[0] *= cosV;
+    basis[2] *= cosV;

-        basis[2] += b0;
-        basis[0] -= b2;
-    }
+    basis[2] += b0;
+    basis[0] -= b2;
+  }

-    inline void rotateLocalZ(float theta)
-    {
-        float sinT = std::sin(theta);
-        float cosT = std::cos(theta);
+  void rotateLocalZ(float theta) { // In place: b0' = cos*b0 + sin*b1, b1' = cos*b1 - sin*b0 (b = basis); basis[2] kept
+    float sinT = std::sin(theta);
+    float cosT = std::cos(theta);

-        zeus::CVector3f b0 = basis[0] * sinT;
-        zeus::CVector3f b1 = basis[1] * sinT;
-        zeus::CVector3f cosV(cosT);
+    zeus::CVector3f b0 = basis[0] * sinT; // sin-scaled copies are taken before basis[0] / basis[1] are overwritten
+    zeus::CVector3f b1 = basis[1] * sinT;
+    zeus::CVector3f cosV(cosT); // presumably broadcasts cosT to every component - confirm against CVector3f(float)

-        basis[0] *= cosV;
-        basis[1] *= cosV;
+    basis[0] *= cosV;
+    basis[1] *= cosV;

-        basis[0] += b1;
-        basis[1] -= b0;
-    }
+    basis[0] += b1;
+    basis[1] -= b0;
+  }

-    inline CVector3f transposeRotate(const CVector3f& in) const
-    {
-        return CVector3f(basis[0].dot(in), basis[1].dot(in), basis[2].dot(in));
-    }
+  CVector3f transposeRotate(const CVector3f& in) const { // Component i of the result is basis[i].dot(in)
+    return CVector3f(basis[0].dot(in), basis[1].dot(in), basis[2].dot(in));
+  }

-    inline void scaleBy(float factor)
-    {
-        CTransform xfrm(CMatrix3f(CVector3f(factor, factor, factor)));
-        *this = *this * xfrm;
-    }
+  void scaleBy(float factor) { // Multiplies this transform in place by a uniform-factor transform
+    CTransform xfrm(CMatrix3f(CVector3f(factor, factor, factor))); // same CMatrix3f(CVector3f) form as CTransformFromScaleVector
+    *this = *this * xfrm; // xfrm is the right-hand operand of the product
+  }

-    static inline CTransform Scale(const CVector3f& factor)
-    {
-        return CTransform(CMatrix3f(TVectorUnion{{factor.x, 0.f, 0.f, 0.f}},
-                                    TVectorUnion{{0.f, factor.y, 0.f, 0.f}},
-                                    TVectorUnion{{0.f, 0.f, factor.z, 0.f}}));
-    }
+  static CTransform Scale(const CVector3f& factor) { // Diagonal basis (factor.x, factor.y, factor.z); default origin
+    return CTransform(CMatrix3f(simd<float>{factor.x(), 0.f, 0.f, 0.f},
+                                simd<float>{0.f, factor.y(), 0.f, 0.f},
+                                simd<float>{0.f, 0.f, factor.z(), 0.f}));
+  }

-    static inline CTransform Scale(float x, float y, float z)
-    {
-        return CTransform(
-            CMatrix3f(TVectorUnion{{x, 0.f, 0.f, 0.f}},
-                      TVectorUnion{{0.f, y, 0.f, 0.f}},
-                      TVectorUnion{{0.f, 0.f, z, 0.f}}));
-    }
+  static CTransform Scale(float x, float y, float z) { // Diagonal basis (x, y, z); default origin
+    return CTransform(CMatrix3f(simd<float>{x, 0.f, 0.f, 0.f},
+                                simd<float>{0.f, y, 0.f, 0.f},
+                                simd<float>{0.f, 0.f, z, 0.f}));
+  }

-    static inline CTransform Scale(float factor)
-    {
-        return CTransform(CMatrix3f(TVectorUnion{{factor, 0.f, 0.f, 0.f}},
-                                    TVectorUnion{{0.f, factor, 0.f, 0.f}},
-                                    TVectorUnion{{0.f, 0.f, factor, 0.f}}));
-    }
+  static CTransform Scale(float factor) { // Diagonal basis with the same factor on every axis; default origin
+    return CTransform(CMatrix3f(simd<float>{factor, 0.f, 0.f, 0.f},
+                                simd<float>{0.f, factor, 0.f, 0.f},
+                                simd<float>{0.f, 0.f, factor, 0.f}));
+  }

-    inline CTransform multiplyIgnoreTranslation(const CTransform& xfrm) const
-    {
-        CTransform ret;
-        ret.basis = basis * xfrm.basis;
-        return ret;
-    }
+  CTransform multiplyIgnoreTranslation(const CTransform& xfrm) const { // Multiplies the bases only; origins are ignored
+    CTransform ret; // default-constructed, so ret.origin keeps whatever CTransform() leaves in it
+    ret.basis = basis * xfrm.basis;
+    return ret;
+  }

-    inline CTransform getRotation() const
-    {
-        CTransform ret = *this;
-        ret.origin.zeroOut();
-        return ret;
-    }
-    void setRotation(const CMatrix3f& mat) { basis = mat; }
-    void setRotation(const CTransform& xfrm) { setRotation(xfrm.basis); }
+  CTransform getRotation() const { // Copy of this transform whose origin has been reset with zeroOut()
+    CTransform ret = *this;
+    ret.origin.zeroOut();
+    return ret;
+  }

-    /**
-     * @brief buildMatrix3f Returns the stored matrix
-     * buildMatrix3f is here for compliance with Retro's Math API
-     * @return The Matrix (Neo, you are the one)
-     */
-    inline const CMatrix3f& buildMatrix3f() const { return basis; }
+  void setRotation(const CMatrix3f& mat) { // Replaces basis with mat; origin is left alone
+    basis = mat;
+  }

-    inline CVector3f operator*(const CVector3f& other) const { return origin + basis * other; }
+  void setRotation(const CTransform& xfrm) { // Copies only xfrm.basis; xfrm.origin is ignored
+    setRotation(xfrm.basis);
+  }

-    inline CMatrix4f toMatrix4f() const
-    {
-        CMatrix4f ret(basis[0], basis[1], basis[2], origin);
-        ret[0][3] = 0.0f;
-        ret[1][3] = 0.0f;
-        ret[2][3] = 0.0f;
-        ret[3][3] = 1.0f;
-        return ret;
-    }
+  /**
+   * @brief buildMatrix3f Returns the stored basis matrix; nothing is built or copied
+   * buildMatrix3f is here for compliance with Retro's Math API
+   * @return Const reference to basis (The Matrix; Neo, you are the one)
+   */
+  const CMatrix3f& buildMatrix3f() const {
+    return basis;
+  }

-    inline CVector3f upVector() const
-    {
-        return basis.vec[2];
-    }
+  CVector3f operator*(const CVector3f& other) const { // Full transform of a point: basis * other, then + origin
+    return origin + basis * other;
+  }

-    inline CVector3f frontVector() const
-    {
-        return basis.vec[1];
-    }
+  CMatrix4f toMatrix4f() const { // 4x4 matrix from the three basis vectors plus origin as the 4th vector
+    CMatrix4f ret(basis[0], basis[1], basis[2], origin);
+    ret[0][3] = 0.0f; // 4th component of each vector is set explicitly: 0 for the basis vectors ...
+    ret[1][3] = 0.0f;
+    ret[2][3] = 0.0f;
+    ret[3][3] = 1.0f; // ... and 1 for the origin vector
+    return ret;
+  }

-    inline CVector3f rightVector() const
-    {
-        return basis.vec[0];
-    }
+  CVector3f upVector() const { // "Up" is the third basis vector, m[2]
+    return basis.m[2];
+  }

-    inline void orthonormalize()
-    {
-        basis[0].normalize();
-        basis[2] = basis[0].cross(basis[1]);
-        basis[2].normalize();
-        basis[1] = basis[2].cross(basis[0]);
-    }
+  CVector3f frontVector() const { // "Front" is the second basis vector, m[1]
+    return basis.m[1];
+  }

-    void printMatrix() const
-    {
-        printf("%f %f %f %f\n"
-               "%f %f %f %f\n"
-               "%f %f %f %f\n"
-               "%f %f %f %f\n",
-               basis[0][0], basis[1][0], basis[2][0], origin[0],
-               basis[0][1], basis[1][1], basis[2][1], origin[1],
-               basis[0][2], basis[1][2], basis[2][2], origin[2],
-               0.f, 0.f, 0.f, 1.f);
-    }
+  CVector3f rightVector() const { // "Right" is the first basis vector, m[0]
+    return basis.m[0];
+  }

-    static zeus::CTransform MakeRotationsBasedOnY(const CUnitVector3f& uVec)
-    {
-        uint32_t i;
-        if (uVec.y < uVec.x || uVec.z < uVec.y || uVec.z < uVec.x)
-            i = 2;
-        else
-            i = 1;
+  void orthonormalize() { // Rebuilds the basis around basis[0]; origin is not touched
+    basis[0].normalize(); // basis[0] keeps its direction
+    basis[2] = basis[0].cross(basis[1]); // new basis[2] is perpendicular to basis[0] and the old basis[1]
+    basis[2].normalize();
+    basis[1] = basis[2].cross(basis[0]); // cross of two normalized perpendicular vectors; no extra normalize() call
+  }

-        CVector3f v = CVector3f::skZero;
-        v[i] = 1.f;
-        CUnitVector3f newUVec(uVec.cross(v));
-        return {newUVec, uVec, uVec.cross(newUVec), CVector3f::skZero};
-    }
+  void printMatrix() const { // Prints four text rows to stdout; the last row is the constant 0 0 0 1
+    printf("%f %f %f %f\n"
+           "%f %f %f %f\n"
+           "%f %f %f %f\n"
+           "%f %f %f %f\n",
+           basis[0][0], basis[1][0], basis[2][0], origin[0], // printed row r = component r of basis[0..2] and of origin
+           basis[0][1], basis[1][1], basis[2][1], origin[1],
+           basis[0][2], basis[1][2], basis[2][2], origin[2],
+           0.f, 0.f, 0.f, 1.f);
+  }

-    CMatrix3f basis;
-    CVector3f origin;
+  static zeus::CTransform MakeRotationsBasedOnY(const CUnitVector3f& uVec) { // Builds a transform whose basis[1] is uVec
+    uint32_t i;
+    if (uVec.y() < uVec.x() || uVec.z() < uVec.y() || uVec.z() < uVec.x())
+      i = 2;
+    else
+      i = 1; // reached only when x <= y <= z
+
+    CVector3f v = CVector3f::skZero;
+    v[i] = 1.f; // v has a single 1 in component i (Y when i == 1, Z when i == 2)
+    CUnitVector3f newUVec(uVec.cross(v)); // CUnitVector3f normalizes by default when canBeNormalized() allows it
+    return {newUVec, uVec, uVec.cross(newUVec), CVector3f::skZero}; // column constructor; origin = skZero
+  }
+
+  CMatrix3f basis;
+  CVector3f origin;
 };

-static inline CTransform CTransformFromScaleVector(const CVector3f& scale) { return CTransform(CMatrix3f(scale)); }
+static inline CTransform CTransformFromScaleVector(const CVector3f& scale) { // basis = CMatrix3f(scale); default origin
+  return CTransform(CMatrix3f(scale));
+}
+
 CTransform CTransformFromEditorEuler(const CVector3f& eulerVec);
+
 CTransform CTransformFromEditorEulers(const CVector3f& eulerVec, const CVector3f& origin);
+
 CTransform CTransformFromAxisAngle(const CVector3f& axis, float angle);
+
 CTransform lookAt(const CVector3f& pos, const CVector3f& lookPos, const CVector3f& up = CVector3f::skUp);
 }

--- a/include/zeus/CUnitVector.hpp
+++ b/include/zeus/CUnitVector.hpp
@ -2,23 +2,19 @@

 #include "zeus/CVector3f.hpp"

-namespace zeus
-{
-class alignas(16) CUnitVector3f : public CVector3f
-{
+namespace zeus {
+class CUnitVector3f : public CVector3f {
 public:
-    ZE_DECLARE_ALIGNED_ALLOCATOR();
+  CUnitVector3f() : CVector3f(0.f, 1.f, 0.f) {} // Defaults to (0, 1, 0)

-    CUnitVector3f() : CVector3f(0, 1, 0) {}
-    CUnitVector3f(float x, float y, float z, bool doNormalize = true) : CVector3f(x, y, z)
-    {
-        if (doNormalize && canBeNormalized())
-            normalize();
-    }
-    CUnitVector3f(const CVector3f& vec, bool doNormalize = true) : CVector3f(vec)
-    {
-        if (doNormalize && canBeNormalized())
-            normalize();
-    }
+  CUnitVector3f(float x, float y, float z, bool doNormalize = true) : CVector3f(x, y, z) {
+    if (doNormalize && canBeNormalized()) // components are kept as given when either condition is false
+      normalize();
+  }
+
+  CUnitVector3f(const CVector3f& vec, bool doNormalize = true) : CVector3f(vec) { // non-explicit: CVector3f converts
+    if (doNormalize && canBeNormalized()) // vec is kept as given when either condition is false
+      normalize();
+  }
 };
 }
--- a/include/zeus/CVector2f.hpp
+++ b/include/zeus/CVector2f.hpp
@ -2,431 +2,260 @@

 #include "Global.hpp"
 #include "zeus/Math.hpp"
-#include "TVectorUnion.hpp"
-
-#if ZE_ATHENA_TYPES
-#include <athena/IStreamReader.hpp>
-#endif
-
 #include "zeus/Math.hpp"
 #include <cassert>

-namespace zeus
-{
-class alignas(16) CVector2f
-{
-#if __atdna__
-    float clangVec __attribute__((__vector_size__(8)));
-#endif
+namespace zeus {
+class CVector2f {
 public:
-    // ZE_DECLARE_ALIGNED_ALLOCATOR();
-    union {
-        struct
-        {
-            float x, y;
-        };
-        float v[4];
-#if __SSE__
-        __m128 mVec128;
-#endif
-    };
+  simd<float> mSimd;
+  CVector2f() : mSimd(0.f) {}

-    inline CVector2f() { zeroOut(); }
-#if __SSE__
-    CVector2f(const __m128& mVec128) : mVec128(mVec128)
-    {
-        v[2] = 0.0f;
-        v[3] = 0.0f;
-    }
-#endif
+  template <typename T>
+  CVector2f(const simd<T>& s) : mSimd(s) {}
+  
 #if ZE_ATHENA_TYPES
-    CVector2f(const atVec2f& vec)
-#if __SSE__
-    : mVec128(vec.mVec128)
-    {
-    }
-#else
-    {
-        x = vec.vec[0], y = vec.vec[1], v[2] = 0.0f, v[3] = 0.0f;
-    }
+
+  CVector2f(const atVec2f& vec) : mSimd(vec.simd) {}
+
+  operator atVec2f&() {
+    return *reinterpret_cast<atVec2f*>(this);
+  }
+
+  operator const atVec2f&() const {
+    return *reinterpret_cast<const atVec2f*>(this);
+  }
+
+  void readBig(athena::io::IStreamReader& input) {
+    mSimd[0] = input.readFloatBig();
+    mSimd[1] = input.readFloatBig();
+    mSimd[2] = 0.0f;
+    mSimd[3] = 0.0f;
+  }
+
+  static CVector2f ReadBig(athena::io::IStreamReader& input) {
+    CVector2f ret;
+    ret.readBig(input);
+    return ret;
+  }
+
 #endif

-    operator atVec2f&()
-    {
-        return *reinterpret_cast<atVec2f*>(v);
-    }
-    operator const atVec2f&() const
-    {
-        return *reinterpret_cast<const atVec2f*>(v);
-    }
+  explicit CVector2f(float xy) { splat(xy); }

-    void readBig(athena::io::IStreamReader& input)
-    {
-        x = input.readFloatBig();
-        y = input.readFloatBig();
-        v[2] = 0.0f;
-        v[3] = 0.0f;
-    }
+  void assign(float x, float y) {
+    mSimd[0] = x;
+    mSimd[1] = y;
+    mSimd[2] = 0.0f;
+    mSimd[3] = 0.0f;
+  }

-    static CVector2f ReadBig(athena::io::IStreamReader& input)
-    {
-        CVector2f ret;
-        ret.readBig(input);
-        return ret;
-    }
-#endif
+  CVector2f(float x, float y) { assign(x, y); }

-    explicit CVector2f(float xy) { splat(xy); }
-    void assign(float x, float y)
-    {
-        v[0] = x;
-        v[1] = y;
-        v[2] = 0.0f;
-        v[3] = 0.0f;
-    }
-    CVector2f(float x, float y) { assign(x, y); }
+  bool operator==(const CVector2f& rhs) const {
+    return mSimd[0] == rhs.mSimd[0] && mSimd[1] == rhs.mSimd[1];
+  }

-    inline bool operator==(const CVector2f& rhs) const { return (x == rhs.x && y == rhs.y); }
-    inline bool operator!=(const CVector2f& rhs) const { return !(*this == rhs); }
-    inline bool operator<(const CVector2f& rhs) const
-    {
-#if __SSE__
-        TVectorUnion vec;
-        vec.mVec128 = _mm_cmplt_ps(mVec128, rhs.mVec128);
-        return (vec.v[0] != 0 || vec.v[1] != 0);
-#else
-        return (x < rhs.x || y < rhs.y);
-#endif
-    }
-    inline bool operator<=(const CVector2f& rhs) const
-    {
-#if __SSE__
-        TVectorUnion vec;
-        vec.mVec128 = _mm_cmple_ps(mVec128, rhs.mVec128);
-        return (vec.v[0] != 0 || vec.v[1] != 0);
-#else
-        return (x <= rhs.x || y <= rhs.y);
-#endif
-    }
-    inline bool operator>(const CVector2f& rhs) const
-    {
-#if __SSE__
-        TVectorUnion vec;
-        vec.mVec128 = _mm_cmpgt_ps(mVec128, rhs.mVec128);
-        return (vec.v[0] != 0 || vec.v[1] != 0);
-#else
-        return (x > rhs.x || y > rhs.y);
-#endif
-    }
-    inline bool operator>=(const CVector2f& rhs) const
-    {
-#if __SSE__
-        TVectorUnion vec;
-        vec.mVec128 = _mm_cmpge_ps(mVec128, rhs.mVec128);
-        return (vec.v[0] != 0 || vec.v[1] != 0);
-#else
-        return (x >= rhs.x || y >= rhs.y);
-#endif
-    }
+  bool operator!=(const CVector2f& rhs) const {
+    return mSimd[0] != rhs.mSimd[0] || mSimd[1] != rhs.mSimd[1];
+  }

-    inline CVector2f operator+(const CVector2f& rhs) const
-    {
-#if __SSE__
-        return CVector2f(_mm_add_ps(mVec128, rhs.mVec128));
-#else
-        return CVector2f(x + rhs.x, y + rhs.y);
-#endif
-    }
-    inline CVector2f operator-(const CVector2f& rhs) const
-    {
-#if __SSE__
-        return CVector2f(_mm_sub_ps(mVec128, rhs.mVec128));
-#else
-        return CVector2f(x - rhs.x, y - rhs.y);
-#endif
-    }
-    inline CVector2f operator-() const
-    {
-#if __SSE__
-        return CVector2f(_mm_sub_ps(_mm_xor_ps(mVec128, mVec128), mVec128));
-#else
-        return CVector2f(-x, -y);
-#endif
-    }
-    inline CVector2f operator*(const CVector2f& rhs) const
-    {
-#if __SSE__
-        return CVector2f(_mm_mul_ps(mVec128, rhs.mVec128));
-#else
-        return CVector2f(x * rhs.x, y * rhs.y);
-#endif
-    }
-    inline CVector2f operator/(const CVector2f& rhs) const
-    {
-#if __SSE__
-        return CVector2f(_mm_div_ps(mVec128, rhs.mVec128));
-#else
-        return CVector2f(x / rhs.x, y / rhs.y);
-#endif
-    }
-    inline CVector2f operator+(float val) const
-    {
-#if __SSE__
-        TVectorUnion splat = {{val, val, 0.0f, 0.0f}};
-        return CVector2f(_mm_add_ps(mVec128, splat.mVec128));
-#else
-        return CVector2f(x + val, y + val);
-#endif
-    }
-    inline CVector2f operator-(float val) const
-    {
-#if __SSE__
-        TVectorUnion splat = {{val, val, 0.0f, 0.0f}};
-        return CVector2f(_mm_sub_ps(mVec128, splat.mVec128));
-#else
-        return CVector2f(x - val, y - val);
-#endif
-    }
-    inline CVector2f operator*(float val) const
-    {
-#if __SSE__
-        TVectorUnion splat = {{val, val, 0.0f, 0.0f}};
-        return CVector2f(_mm_mul_ps(mVec128, splat.mVec128));
-#else
-        return CVector2f(x * val, y * val);
-#endif
-    }
-    inline CVector2f operator/(float val) const
-    {
-        float ooval = 1.f / val;
-#if __SSE__
-        TVectorUnion splat = {{ooval, ooval, 0.0f, 0.0f}};
-        return CVector2f(_mm_mul_ps(mVec128, splat.mVec128));
-#else
-        return CVector2f(x * ooval, y * ooval);
-#endif
-    }
-    inline const CVector2f& operator+=(const CVector2f& rhs)
-    {
-#if __SSE__
-        mVec128 = _mm_add_ps(mVec128, rhs.mVec128);
-#else
-        x += rhs.x;
-        y += rhs.y;
-#endif
-        return *this;
-    }
-    inline const CVector2f& operator-=(const CVector2f& rhs)
-    {
-#if __SSE__
-        mVec128 = _mm_sub_ps(mVec128, rhs.mVec128);
-#else
-        x -= rhs.x;
-        y -= rhs.y;
-#endif
-        return *this;
-    }
-    inline const CVector2f& operator*=(const CVector2f& rhs)
-    {
-#if __SSE__
-        mVec128 = _mm_mul_ps(mVec128, rhs.mVec128);
-#else
-        x *= rhs.x;
-        y *= rhs.y;
-#endif
-        return *this;
-    }
-    inline const CVector2f& operator/=(const CVector2f& rhs)
-    {
-#if __SSE__
-        mVec128 = _mm_div_ps(mVec128, rhs.mVec128);
-#else
-        x /= rhs.x;
-        y /= rhs.y;
-#endif
-        return *this;
-    }
-    inline const CVector2f& operator+=(float rhs)
-    {
-#if __SSE__
-        TVectorUnion splat = {{rhs, rhs, 0.f, 0.0f}};
-        mVec128 = _mm_add_ps(mVec128, splat.mVec128);
-#else
-        x += rhs;
-        y += rhs;
-#endif
-        return *this;
-    }
-    inline const CVector2f& operator-=(float rhs)
-    {
-#if __SSE__
-        TVectorUnion splat = {{rhs, rhs, 0.f, 0.0f}};
-        mVec128 = _mm_sub_ps(mVec128, splat.mVec128);
-#else
-        x -= rhs;
-        y -= rhs;
-#endif
-        return *this;
-    }
-    inline const CVector2f& operator*=(float rhs)
-    {
-#if __SSE__
-        TVectorUnion splat = {{rhs, rhs, 0.f, 0.0f}};
-        mVec128 = _mm_mul_ps(mVec128, splat.mVec128);
-#else
-        x *= rhs;
-        y *= rhs;
-#endif
-        return *this;
-    }
-    inline const CVector2f& operator/=(float rhs)
-    {
-        float oorhs = 1.f / rhs;
-#if __SSE__
-        TVectorUnion splat = {{oorhs, oorhs, 0.f, 0.0f}};
-        mVec128 = _mm_mul_ps(mVec128, splat.mVec128);
-#else
-        x *= oorhs;
-        y *= oorhs;
-#endif
-        return *this;
-    }
-    inline void normalize()
-    {
-        float mag = magnitude();
-        mag = 1.f / mag;
-        *this *= CVector2f(mag);
-    }
+  // Component-wise comparison: true only when BOTH x and y are strictly less than rhs's.
+  // Two vectors can be unordered, i.e. a < b and a >= b may both be false.
+  bool operator<(const CVector2f& rhs) const {
+    return x() < rhs.x() && y() < rhs.y();
+  }

-    inline CVector2f normalized() const
-    {
-        float mag = magnitude();
-        mag = 1.f / mag;
-        return *this * mag;
-    }
+  // Component-wise comparison: true only when BOTH x and y are less than or equal to rhs's.
+  bool operator<=(const CVector2f& rhs) const {
+    return x() <= rhs.x() && y() <= rhs.y();
+  }

-    inline CVector2f perpendicularVector() const { return {-y, x}; }
+  // Component-wise comparison: true only when BOTH x and y are strictly greater than rhs's.
+  bool operator>(const CVector2f& rhs) const {
+    return x() > rhs.x() && y() > rhs.y();
+  }

-    inline float cross(const CVector2f& rhs) const { return (x * rhs.y) - (y * rhs.x); }
-    inline float dot(const CVector2f& rhs) const
-    {
-#if __SSE__
-        TVectorUnion result;
-#if __SSE4_1__
-        result.mVec128 = _mm_dp_ps(mVec128, rhs.mVec128, 0x31);
-        return result.v[0];
-#else
-        result.mVec128 = _mm_mul_ps(mVec128, rhs.mVec128);
-        return result.v[0] + result.v[1];
-#endif
-#else
-        return (x * rhs.x) + (y * rhs.y);
-#endif
-    }
-    inline float magSquared() const
-    {
-#if __SSE__
-        TVectorUnion result;
-#if __SSE4_1__
-        result.mVec128 = _mm_dp_ps(mVec128, mVec128, 0x31);
-        return result.v[0];
-#else
-        result.mVec128 = _mm_mul_ps(mVec128, mVec128);
-        return result.v[0] + result.v[1];
-#endif
-#else
-        return x * x + y * y;
-#endif
-    }
-    inline float magnitude() const { return std::sqrt(magSquared()); }
+  // Component-wise comparison: true only when BOTH x and y are greater than or equal to rhs's.
+  bool operator>=(const CVector2f& rhs) const {
+    return x() >= rhs.x() && y() >= rhs.y();
+  }

-    inline void zeroOut()
-    {
-        *this = CVector2f::skZero;
-    }
+  CVector2f operator+(const CVector2f& rhs) const {
+    return mSimd + rhs.mSimd;
+  }

-    inline void splat(float xy)
-    {
-#if __SSE__
-        TVectorUnion splat = {{xy, xy, 0.0f, 0.0f}};
-        mVec128 = splat.mVec128;
-#else
-        v[0] = xy;
-        v[1] = xy;
-        v[2] = 0.0f;
-        v[3] = 0.0f;
-#endif
-    }
+  CVector2f operator-(const CVector2f& rhs) const {
+    return mSimd - rhs.mSimd;
+  }

-    static float getAngleDiff(const CVector2f& a, const CVector2f& b);
+  CVector2f operator-() const {
+    return -mSimd;
+  }

-    static inline CVector2f lerp(const CVector2f& a, const CVector2f& b, float t) { return (a + (b - a) * t); }
-    static inline CVector2f nlerp(const CVector2f& a, const CVector2f& b, float t) { return lerp(a, b, t).normalized(); }
-    static CVector2f slerp(const CVector2f& a, const CVector2f& b, float t);
+  CVector2f operator*(const CVector2f& rhs) const {
+    return mSimd * rhs.mSimd;
+  }

-    inline bool isNormalized() const { return std::fabs(1.f - magSquared()) < 0.01f; }
+  CVector2f operator/(const CVector2f& rhs) const {
+    return mSimd / rhs.mSimd;
+  }

-    inline bool canBeNormalized() const
-    {
-        if (std::isinf(x) || std::isinf(y))
-            return false;
-        return std::fabs(x) >= FLT_EPSILON || std::fabs(y) >= FLT_EPSILON;
-    }
+  CVector2f operator+(float val) const {
+    return mSimd + simd<float>(val);
+  }

-    inline bool isZero() const { return magSquared() <= 1.1920929e-7f; }
+  CVector2f operator-(float val) const {
+    return mSimd - simd<float>(val);
+  }

-    inline bool isEqu(const CVector2f& other, float epsilon = 1.1920929e-7f)
-    {
-        const CVector2f diffVec = other - *this;
-        return (diffVec.x <= epsilon && diffVec.y <= epsilon);
-    }
+  CVector2f operator*(float val) const {
+    return mSimd * simd<float>(val);
+  }

-    inline float& operator[](size_t idx) { assert(idx < 2); return (&x)[idx]; }
-    inline const float& operator[](size_t idx) const { assert(idx < 2); return (&x)[idx]; }
+  CVector2f operator/(float val) const {
+    float ooval = 1.f / val;
+    return mSimd * simd<float>(ooval);
+  }

-    static const CVector2f skOne;
-    static const CVector2f skNegOne;
-    static const CVector2f skZero;
+  const CVector2f& operator+=(const CVector2f& rhs) {
+    mSimd += rhs.mSimd;
+    return *this;
+  }
+
+  const CVector2f& operator-=(const CVector2f& rhs) {
+    mSimd -= rhs.mSimd;
+    return *this;
+  }
+
+  const CVector2f& operator*=(const CVector2f& rhs) {
+    mSimd *= rhs.mSimd;
+    return *this;
+  }
+
+  const CVector2f& operator/=(const CVector2f& rhs) {
+    mSimd /= rhs.mSimd;
+    return *this;
+  }
+
+  const CVector2f& operator+=(float rhs) {
+    mSimd += simd<float>(rhs);
+    return *this;
+  }
+
+  const CVector2f& operator-=(float rhs) {
+    mSimd -= simd<float>(rhs);
+    return *this;
+  }
+
+  const CVector2f& operator*=(float rhs) {
+    mSimd *= simd<float>(rhs);
+    return *this;
+  }
+
+  // Divides this vector in place by the scalar rhs and returns *this.
+  // The division is carried out as a multiplication by the reciprocal 1/rhs;
+  // dividing by the reciprocal instead would scale the vector by rhs.
+  const CVector2f& operator/=(float rhs) {
+    float oorhs = 1.f / rhs;
+    mSimd *= simd<float>(oorhs);
+    return *this;
+  }
+
+  // Scales this vector in place by the reciprocal of its magnitude.
+  // No zero-length check is made here; canBeNormalized() can be tested first.
+  void normalize() {
+    const float invLength = 1.f / magnitude();
+    *this *= CVector2f(invLength);
+  }
+
+  // Returns a copy of this vector scaled by the reciprocal of its magnitude; *this is not modified.
+  CVector2f normalized() const {
+    const float invLength = 1.f / magnitude();
+    return *this * invLength;
+  }
+
+  CVector2f perpendicularVector() const { return {-y(), x()}; }
+
+  float cross(const CVector2f& rhs) const { return (x() * rhs.y()) - (y() * rhs.x()); }
+
+  float dot(const CVector2f& rhs) const {
+    return mSimd.dot2(rhs.mSimd);
+  }
+
+  float magSquared() const {
+    return mSimd.dot2(mSimd);
+  }
+
+  float magnitude() const {
+    return std::sqrt(magSquared());
+  }
+
+  void zeroOut() {
+    *this = CVector2f::skZero;
+  }
+
+  void splat(float xy) {
+    mSimd = zeus::simd<float>(xy);
+  }
+
+  static float getAngleDiff(const CVector2f& a, const CVector2f& b);
+
+  // Linear interpolation between a and b, evaluated as (1 - t) * a + b * t.
+  static CVector2f lerp(const CVector2f& a, const CVector2f& b, float t) {
+    const zeus::simd<float> weightA(1.f - t);
+    const zeus::simd<float> weightB(t);
+    return weightA * a.mSimd + b.mSimd * weightB;
+  }
+
+  static CVector2f nlerp(const CVector2f& a, const CVector2f& b, float t) {
+    return lerp(a, b, t).normalized();
+  }
+
+  static CVector2f slerp(const CVector2f& a, const CVector2f& b, float t);
+
+  bool isNormalized() const {
+    return std::fabs(1.f - magSquared()) < 0.01f;
+  }
+
+  // A vector can be normalized when neither component is infinite and at least
+  // one component has an absolute value of FLT_EPSILON or more.
+  bool canBeNormalized() const {
+    const float cx = x();
+    const float cy = y();
+    const bool anyInfinite = std::isinf(cx) || std::isinf(cy);
+    const bool anySignificant = std::fabs(cx) >= FLT_EPSILON || std::fabs(cy) >= FLT_EPSILON;
+    return !anyInfinite && anySignificant;
+  }
+
+  bool isZero() const {
+    return magSquared() <= FLT_EPSILON;
+  }
+
+  // Returns true when each component of other lies within epsilon of the matching
+  // component of this vector. The difference is compared by absolute value so that
+  // a large negative delta is not mistaken for equality.
+  // Const-qualified: neither operand is modified.
+  bool isEqu(const CVector2f& other, float epsilon = FLT_EPSILON) const {
+    const CVector2f diffVec = other - *this;
+    return (std::fabs(diffVec.x()) <= epsilon && std::fabs(diffVec.y()) <= epsilon);
+  }
+
+  zeus::simd<float>::reference operator[](size_t idx) {
+    assert(idx < 2);
+    return mSimd[idx];
+  }
+
+  float operator[](size_t idx) const {
+    assert(idx < 2);
+    return mSimd[idx];
+  }
+
+  float x() const { return mSimd[0]; }
+  float y() const { return mSimd[1]; }
+
+  simd<float>::reference x() { return mSimd[0]; }
+  simd<float>::reference y() { return mSimd[1]; }
+
+  static const CVector2f skOne;
+  static const CVector2f skNegOne;
+  static const CVector2f skZero;
 };

-static inline CVector2f operator+(float lhs, const CVector2f& rhs)
-{
-#if __SSE__
-    TVectorUnion splat = {{lhs, lhs, 0.0f, 0.0f}};
-    return CVector2f(_mm_add_ps(splat.mVec128, rhs.mVec128));
-#else
-    return CVector2f(lhs + rhs.x, lhs + rhs.y);
-#endif
+static inline CVector2f operator+(float lhs, const CVector2f& rhs) {
+  return zeus::simd<float>(lhs) + rhs.mSimd;
 }

-static inline CVector2f operator-(float lhs, const CVector2f& rhs)
-{
-#if __SSE__
-    TVectorUnion splat = {{lhs, lhs, 0.0f, 0.0f}};
-    return CVector2f(_mm_sub_ps(splat.mVec128, rhs.mVec128));
-#else
-    return CVector2f(lhs - rhs.x, lhs - rhs.y);
-#endif
+static inline CVector2f operator-(float lhs, const CVector2f& rhs) {
+  return zeus::simd<float>(lhs) - rhs.mSimd;
 }

-static inline CVector2f operator*(float lhs, const CVector2f& rhs)
-{
-#if __SSE__
-    TVectorUnion splat = {{lhs, lhs, 0.0f, 0.0f}};
-    return CVector2f(_mm_mul_ps(splat.mVec128, rhs.mVec128));
-#else
-    return CVector2f(lhs * rhs.x, lhs * rhs.y);
-#endif
+static inline CVector2f operator*(float lhs, const CVector2f& rhs) {
+  return zeus::simd<float>(lhs) * rhs.mSimd;
 }

-static inline CVector2f operator/(float lhs, const CVector2f& rhs)
-{
-#if __SSE__
-    TVectorUnion splat = {{lhs, lhs, 0.0f, 0.0f}};
-    return CVector2f(_mm_div_ps(splat.mVec128, rhs.mVec128));
-#else
-    return CVector2f(lhs / rhs.x, lhs / rhs.y);
-#endif
+static inline CVector2f operator/(float lhs, const CVector2f& rhs) {
+  return zeus::simd<float>(lhs) / rhs.mSimd;
 }
 }

--- a/include/zeus/CVector2i.hpp
+++ b/include/zeus/CVector2i.hpp
@ -5,56 +5,57 @@
 #include "CVector2f.hpp"

 #if ZE_ATHENA_TYPES
+
 #include <athena/IStreamReader.hpp>
+
 #endif

-namespace zeus
-{
+namespace zeus {

-class CVector2i
-{
+class CVector2i {
 public:
-    union {
-        struct
-        {
-            int x, y;
-        };
-        int v[2];
+  union {
+    struct {
+      int x, y;
    };
-    CVector2i() = default;
-    CVector2i(int xin, int yin) : x(xin), y(yin) {}
-    CVector2i(const CVector2f& vec) : x(int(vec.x)), y(int(vec.y)) {}
+    int v[2];
+  };

-    CVector2f toVec2f() const { return CVector2f(x, y); }
+  CVector2i() = default;

-    inline CVector2i operator+(const CVector2i& val) const
-    {
-        return CVector2i(x + val.x, y + val.y);
-    }
-    inline CVector2i operator-(const CVector2i& val) const
-    {
-        return CVector2i(x - val.x, y - val.y);
-    }
-    inline CVector2i operator*(const CVector2i& val) const
-    {
-        return CVector2i(x * val.x, y * val.y);
-    }
-    inline CVector2i operator/(const CVector2i& val) const
-    {
-        return CVector2i(x / val.x, y / val.y);
-    }
-    inline bool operator==(const CVector2i& other) const
-    {
-        return x == other.x && y == other.y;
-    }
-    inline bool operator!=(const CVector2i& other) const
-    {
-        return x != other.x || y != other.y;
-    }
-    inline CVector2i operator*(int val) const
-    {
-        return CVector2i(x * val, y * val);
-    }
+  CVector2i(int xin, int yin) : x(xin), y(yin) {}
+
+  // Converts from a float vector; each component is cast to int, discarding the fractional part.
+  CVector2i(const CVector2f& vec) : x(static_cast<int>(vec.x())), y(static_cast<int>(vec.y())) {}
+
+  CVector2f toVec2f() const { return CVector2f(x, y); }
+
+  CVector2i operator+(const CVector2i& val) const {
+    return CVector2i(x + val.x, y + val.y);
+  }
+
+  CVector2i operator-(const CVector2i& val) const {
+    return CVector2i(x - val.x, y - val.y);
+  }
+
+  CVector2i operator*(const CVector2i& val) const {
+    return CVector2i(x * val.x, y * val.y);
+  }
+
+  CVector2i operator/(const CVector2i& val) const {
+    return CVector2i(x / val.x, y / val.y);
+  }
+
+  bool operator==(const CVector2i& other) const {
+    return x == other.x && y == other.y;
+  }
+
+  bool operator!=(const CVector2i& other) const {
+    return x != other.x || y != other.y;
+  }
+
+  CVector2i operator*(int val) const {
+    return CVector2i(x * val, y * val);
+  }
 };
 }

--- a/include/zeus/CVector3d.hpp
+++ b/include/zeus/CVector3d.hpp
@ -1,288 +1,118 @@
 #pragma once

-#include <athena/Types.hpp>
+#include "athena/Types.hpp"
 #include "Global.hpp"
 #include "zeus/Math.hpp"
-#include "TVectorUnion.hpp"
 #include "zeus/CVector3f.hpp"

-namespace zeus
-{
-class alignas(32) CVector3d
-{
+namespace zeus {
+
+class CVector3d {
 public:
-    ZE_DECLARE_ALIGNED_ALLOCATOR32();
-    CVector3d() { zeroOut(); }
+  zeus::simd<double> mSimd;
+  CVector3d() : mSimd(0.0) {}

-#if __AVX__
-    CVector3d(const __m256d& mVec256)
-    {
-        this->mVec256 = mVec256;
-        v[3] = 0.0;
-    }
-#elif __SSE__
-    CVector3d(const __m128d mVec128[2])
-    {
-        this->mVec128[0] = mVec128[0];
-        this->mVec128[1] = mVec128[1];
-        v[3] = 0.0;
-    }
-#endif
+  template <typename T>
+  CVector3d(const simd<T>& s) : mSimd(s) {}
+  
 #if ZE_ATHENA_TYPES
-    CVector3d(const atVec3d& vec)
-    {
-#if __AVX__
-        mVec256 = vec.mVec256;
-#elif __SSE__
-        mVec128[0] = vec.mVec128[0];
-        mVec128[1] = vec.mVec128[1];
-#else
-        x = v[0], y = v[1], z = v[2], v[3] = 0.0f;
-#endif
-    }
+  CVector3d(const atVec3d& vec) : mSimd(vec.simd) {}
 #endif

-    explicit CVector3d(double xyz) { splat(xyz); }
+  explicit CVector3d(double xyz) : mSimd(xyz) {}

-    CVector3d(const CVector3f& vec)
-    {
-#if __AVX__
-        mVec256 = _mm256_cvtps_pd(vec.mVec128);
-#elif __SSE__
-        mVec128[0] = _mm_cvtps_pd(vec.mVec128);
-        v[2] = vec[2];
-#else
-        v[0] = vec[0];
-        v[1] = vec[1];
-        v[2] = vec[2];
-        v[3] = 0.0;
-#endif
-    }
+  CVector3d(const CVector3f& vec) : mSimd(vec.mSimd) {}

-    CVector3d(double x, double y, double z)
-    {
-#if __AVX__
-        TDblVectorUnion splat{{x, y, z, 0.0}};
-        mVec256 = splat.mVec256;
-#elif __SSE__
-        TDblVectorUnion splat{{x, y, z, 0.0}};
-        mVec128[0] = splat.mVec128[0];
-        mVec128[1] = splat.mVec128[1];
-#else
-        v[0] = x;
-        v[1] = y;
-        v[2] = z;
-        v[3] = 0.0;
-#endif
-    }
+  CVector3d(double x, double y, double z) : mSimd(x, y, z) {}

-    CVector3f asCVector3f()
-    {
-#if __AVX__
-        return CVector3f(_mm256_cvtpd_ps(mVec256));
-#else
-        return CVector3f(float(x), float(y), float(z));
-#endif
-    }
+  // Returns a CVector3f constructed from this vector's SIMD value.
+  // Const-qualified because nothing is modified, so it can be called on const vectors.
+  CVector3f asCVector3f() const {
+    return mSimd;
+  }

-    double magSquared() const
-    {
-#if __SSE__
-        TDblVectorUnion result;
-#if __SSE4_1__
-        result.mVec128[0] = _mm_dp_pd(mVec128[0], mVec128[0], 0x31);
-        return result.v[0] + (v[2] * v[2]);
-#else
-        result.mVec128[0] = _mm_mul_pd(mVec128[0], mVec128[0]);
-        result.mVec128[1] = _mm_mul_pd(mVec128[1], mVec128[1]);
-        return result.v[0] + result.v[1] + result.v[2];
-#endif
-#else
-        return x * x + y * y + z * z;
-#endif
-    }
+  double magSquared() const {
+    return mSimd.dot3(mSimd);
+  }

-    double magnitude() const { return sqrt(magSquared()); }
-    inline CVector3d cross(const CVector3d& rhs) const
-    {
-        return {y * rhs.z - z * rhs.y,
-                z * rhs.x - x * rhs.z,
-                x * rhs.y - y * rhs.x};
-    }
+  // Euclidean length: the square root of magSquared().
+  // std::sqrt is spelled out so the <cmath> overload is used rather than relying on
+  // a global-namespace sqrt declaration, matching CVector2f::magnitude().
+  double magnitude() const {
+    return std::sqrt(magSquared());
+  }

-    double dot(const CVector3d& rhs) const
-    {
-#if __SSE__
-        TDblVectorUnion result;
-#if __SSE4_1__
-        result.mVec128[0] = _mm_dp_pd(mVec128[0], rhs.mVec128[0], 0x31);
-        return result.v[0] + (v[2] * rhs.v[2]);
-#else
-        result.mVec128[0] = _mm_mul_pd(mVec128[0], rhs.mVec128[0]);
-        result.mVec128[1] = _mm_mul_pd(mVec128[1], rhs.mVec128[1]);
-        return result.v[0] + result.v[1] + result.v[2];
-#endif
-#else
-        return (x * rhs.x) + (y * rhs.y) + (z * rhs.z);
-#endif
-    }
+  CVector3d cross(const CVector3d& rhs) const {
+    return {y() * rhs.z() - z() * rhs.y(),
+            z() * rhs.x() - x() * rhs.z(),
+            x() * rhs.y() - y() * rhs.x()};
+  }

-    CVector3d asNormalized()
-    {
-        double mag = magnitude();
-        mag = 1.0 / mag;
-        return {x * mag, y * mag, z * mag};
-    }
+  double dot(const CVector3d& rhs) const {
+    return mSimd.dot3(rhs.mSimd);
+  }

-    void splat(double xyz)
-    {
-#if __AVX__
-        TDblVectorUnion splat = {{xyz, xyz, xyz, 0.0}};
-        mVec256 = splat.mVec256;
-#elif __SSE__
-        TDblVectorUnion splat = {{xyz, xyz, xyz, 0.0}};
-        mVec128[0] = splat.mVec128[0];
-        mVec128[1] = splat.mVec128[1];
-#else
-        v[0] = xyz;
-        v[1] = xyz;
-        v[2] = xyz;
-        v[3] = 0.0;
-#endif
-    }
+  // Returns a copy of this vector scaled by 1 / magnitude(); this vector is left unchanged.
+  // No zero-length check is made before taking the reciprocal.
+  // Const-qualified so it can be called on const vectors.
+  CVector3d asNormalized() const {
+    double mag = magnitude();
+    mag = 1.0 / mag;
+    return mSimd * zeus::simd<double>(mag);
+  }

-    void zeroOut()
-    {
-        *this = skZero;
-    }
+  void splat(double xyz) {
+    mSimd = zeus::simd<double>(xyz);
+  }

-    inline CVector3d operator+(const CVector3d& rhs) const
-    {
-#if __AVX__
-        return _mm256_add_pd(mVec256, rhs.mVec256);
-#elif __SSE__
-        const __m128d tmpVec128[2] = {_mm_add_pd(mVec128[0], rhs.mVec128[0]),
-                                      _mm_add_pd(mVec128[1], rhs.mVec128[1])};
-        return CVector3d(tmpVec128);
-#else
-        return CVector3d(x + rhs.x, y + rhs.y, z + rhs.z);
-#endif
-    }
-    inline CVector3d operator-(const CVector3d& rhs) const
-    {
-#if __AVX__
-        return _mm256_sub_pd(mVec256, rhs.mVec256);
-#elif __SSE__
-        const __m128d tmpVec128[2] = {_mm_sub_pd(mVec128[0], rhs.mVec128[0]),
-                                      _mm_sub_pd(mVec128[1], rhs.mVec128[1])};
-        return CVector3d(tmpVec128);
-#else
-        return CVector3d(x - rhs.x, y - rhs.y, z - rhs.z);
-#endif
-    }
-    inline CVector3d operator*(const CVector3d& rhs) const
-    {
-#if __AVX__
-        return _mm256_mul_pd(mVec256, rhs.mVec256);
-#elif __SSE__
-        const __m128d tmpVec128[2] = {_mm_mul_pd(mVec128[0], rhs.mVec128[0]),
-                                      _mm_mul_pd(mVec128[1], rhs.mVec128[1])};
-        return CVector3d(tmpVec128);
-#else
-        return CVector3d(x * rhs.x, y * rhs.y, z * rhs.z);
-#endif
-    }
-    inline CVector3d operator/(const CVector3d& rhs) const
-    {
-#if __AVX__
-        return _mm256_div_pd(mVec256, rhs.mVec256);
-#elif __SSE__
-        const __m128d tmpVec128[2] = {_mm_div_pd(mVec128[0], rhs.mVec128[0]),
-                                      _mm_div_pd(mVec128[1], rhs.mVec128[1])};
-        return CVector3d(tmpVec128);
-#else
-        return CVector3d(x / rhs.x, y / rhs.y, z / rhs.z);
-#endif
-    }
+  void zeroOut() {
+    *this = skZero;
+  }

-    inline double& operator[](size_t idx) { assert(idx < 3); return v[idx]; }
-    inline const double& operator[](size_t idx) const { assert(idx < 3); return v[idx]; }
+  CVector3d operator+(const CVector3d& rhs) const {
+    return mSimd + rhs.mSimd;
+  }

-    union {
-        struct
-        {
-            double x, y, z;
-        };
-        double v[4];
-#if __AVX__
-        __m256d mVec256;
-#endif
-#if __SSE__
-        __m128d mVec128[2];
-#endif
-    };
+  CVector3d operator-(const CVector3d& rhs) const {
+    return mSimd - rhs.mSimd;
+  }

-    static const CVector3d skZero;
+  CVector3d operator*(const CVector3d& rhs) const {
+    return mSimd * rhs.mSimd;
+  }
+
+  CVector3d operator/(const CVector3d& rhs) const {
+    return mSimd / rhs.mSimd;
+  }
+
+  zeus::simd<double>::reference operator[](size_t idx) {
+    assert(idx < 3);
+    return mSimd[idx];
+  }
+
+  double operator[](size_t idx) const {
+    assert(idx < 3);
+    return mSimd[idx];
+  }
+
+  double x() const { return mSimd[0]; }
+  double y() const { return mSimd[1]; }
+  double z() const { return mSimd[2]; }
+
+  simd<double>::reference x() { return mSimd[0]; }
+  simd<double>::reference y() { return mSimd[1]; }
+  simd<double>::reference z() { return mSimd[2]; }
+
+  static const CVector3d skZero;
 };

-static inline CVector3d operator+(double lhs, const CVector3d& rhs)
-{
-#if __AVX__
-    TDblVectorUnion splat{{lhs, lhs, lhs, 0}};
-    return _mm256_add_pd(splat.mVec256, rhs.mVec256);
-#elif __SSE__
-    TDblVectorUnion splat{{lhs, lhs, lhs, 0}};
-    splat.mVec128[0] = _mm_add_pd(splat.mVec128[0], rhs.mVec128[0]);
-    splat.mVec128[1] = _mm_add_pd(splat.mVec128[1], rhs.mVec128[1]);
-    return {splat.mVec128};
-#else
-    return {lhs + rhs.x, lhs + rhs.y, lhs + rhs.z};
-#endif
+static inline CVector3d operator+(double lhs, const CVector3d& rhs) {
+  return zeus::simd<double>(lhs) + rhs.mSimd;
 }

-static inline CVector3d operator-(double lhs, const CVector3d& rhs)
-{
-#if __AVX__
-    TDblVectorUnion splat{{lhs, lhs, lhs, 0}};
-    return _mm256_sub_pd(splat.mVec256, rhs.mVec256);
-#elif __SSE__
-    TDblVectorUnion splat{{lhs, lhs, lhs, 0}};
-    splat.mVec128[0] = _mm_sub_pd(splat.mVec128[0], rhs.mVec128[0]);
-    splat.mVec128[1] = _mm_sub_pd(splat.mVec128[1], rhs.mVec128[1]);
-    return {splat.mVec128};
-#else
-    return {lhs - rhs.x, lhs - rhs.y, lhs - rhs.z};
-#endif
+static inline CVector3d operator-(double lhs, const CVector3d& rhs) {
+  return zeus::simd<double>(lhs) - rhs.mSimd;
 }

-static inline CVector3d operator*(double lhs, const CVector3d& rhs)
-{
-#if __AVX__
-    TDblVectorUnion splat{{lhs, lhs, lhs, 0}};
-    return _mm256_mul_pd(splat.mVec256, rhs.mVec256);
-#elif __SSE__
-    TDblVectorUnion splat{{lhs, lhs, lhs, 0}};
-    splat.mVec128[0] = _mm_mul_pd(splat.mVec128[0], rhs.mVec128[0]);
-    splat.mVec128[1] = _mm_mul_pd(splat.mVec128[1], rhs.mVec128[1]);
-    return {splat.mVec128};
-#else
-    return {lhs * rhs.x, lhs * rhs.y, lhs * rhs.z};
-#endif
+static inline CVector3d operator*(double lhs, const CVector3d& rhs) {
+  return zeus::simd<double>(lhs) * rhs.mSimd;
+}
+
+static inline CVector3d operator/(double lhs, const CVector3d& rhs) {
+  return zeus::simd<double>(lhs) / rhs.mSimd;
 }

-static inline CVector3d operator/(double lhs, const CVector3d& rhs)
-{
-#if __AVX__
-    TDblVectorUnion splat{{lhs, lhs, lhs, 0}};
-    return _mm256_div_pd(splat.mVec256, rhs.mVec256);
-#elif __SSE__
-    TDblVectorUnion splat{{lhs, lhs, lhs, 0}};
-    splat.mVec128[0] = _mm_div_pd(splat.mVec128[0], rhs.mVec128[0]);
-    splat.mVec128[1] = _mm_div_pd(splat.mVec128[1], rhs.mVec128[1]);
-    return {splat.mVec128};
-#else
-    return {lhs.x / rhs.x, lhs.y / rhs.y, lhs.z / rhs.z};
-#endif
-}
 }

--- a/include/zeus/CVector3f.hpp
+++ b/include/zeus/CVector3f.hpp
@ -3,440 +3,278 @@
 #include "Global.hpp"
 #include "zeus/Math.hpp"
 #include "zeus/CVector2f.hpp"
-#include "TVectorUnion.hpp"
+
 #if ZE_ATHENA_TYPES
-#include <athena/IStreamReader.hpp>
+#include "athena/IStreamReader.hpp"
 #endif

-namespace zeus
-{
+namespace zeus {
 class CVector3d;
-class alignas(16) CVector3f
-{
-#if __atdna__
-    float clangVec __attribute__((__vector_size__(12)));
-#endif
+
+class CVector3f {
 public:
-    ZE_DECLARE_ALIGNED_ALLOCATOR();
+  // Backing storage for the vector; the x()/y()/z() accessors of this class read lanes 0, 1 and 2.
+  zeus::simd<float> mSimd;
+  // Default constructor: initialises mSimd from 0.f.
+  CVector3f() : mSimd(0.f) {}

-    union {
-        struct
-        {
-            float x, y, z;
-        };
-        float v[4];
-#if __SSE__
-        __m128 mVec128;
-#elif __GEKKO_PS__
-        ps128_t mVec128;
-#endif
-    };
-
-    inline CVector3f() { zeroOut(); }
-#if __SSE__ || __GEKKO_PS__
-    CVector3f(const __m128& mVec128) : mVec128(mVec128) { v[3] = 0.0f; }
-#endif
+  // Wraps an existing simd value of any element type T by constructing mSimd from it.
+  // Deliberately non-explicit: the arithmetic operators of this class return raw simd expressions
+  // and rely on this constructor for the implicit conversion back to CVector3f.
+  template <typename T>
+  CVector3f(const simd<T>& s) : mSimd(s) {}

 #if ZE_ATHENA_TYPES
-    CVector3f(const atVec3f& vec)
-#if __SSE__ || __GEKKO_PS__
-    : mVec128(vec.mVec128)
-    {
-    }
-#else
-    {
-        x = vec.vec[0], y = vec.vec[1], z = vec.vec[2], v[3] = 0.0f;
-    }
+
+  // Builds a vector from an athena atVec3f by copying its `simd` member.
+  CVector3f(const atVec3f& vec) : mSimd(vec.simd) {}
+
+  // Views this object as an atVec3f without copying.
+  // NOTE(review): relies on CVector3f and atVec3f sharing the same object layout - confirm against athena.
+  operator atVec3f&() {
+    return *reinterpret_cast<atVec3f*>(this);
+  }
+
+  // Const counterpart of the conversion above; the same layout assumption applies.
+  operator const atVec3f&() const {
+    return *reinterpret_cast<const atVec3f*>(this);
+  }
+
+  // Fills this vector from a stream: three consecutive input.readFloatBig() calls supply x, y and z
+  // (in that order), the fourth lane is forced to zero, and the staged values are loaded into mSimd.
+  void readBig(athena::io::IStreamReader& input) {
+    simd_floats lanes;
+    for (size_t lane = 0; lane < 3; ++lane)
+      lanes[lane] = input.readFloatBig();
+    lanes[3] = 0.0f;
+    mSimd.copy_from(lanes);
+  }
+
+  // Convenience factory: default-constructs a vector, fills it through readBig(), and returns it by value.
+  static CVector3f ReadBig(athena::io::IStreamReader& input) {
+    CVector3f result;
+    result.readBig(input);
+    return result;
+  }
+
 #endif

-    operator atVec3f&()
-    {
-        return *reinterpret_cast<atVec3f*>(v);
-    }
-    operator const atVec3f&() const
-    {
-        return *reinterpret_cast<const atVec3f*>(v);
+  // Conversion from the double-precision vector; declared here, defined elsewhere.
+  CVector3f(const CVector3d& vec);
+
+  // Constructs mSimd from the single value xyz. Explicit so a bare float never silently becomes a vector.
+  explicit CVector3f(float xyz) : mSimd(xyz) {}
+
+  // Replaces the contents with a simd built from (x, y, z).
+  void assign(float x, float y, float z) {
+    mSimd = zeus::simd<float>(x, y, z);
+  }
+
+  // Component-wise constructor.
+  CVector3f(float x, float y, float z) : mSimd(x, y, z) {}
+
+  // Reads exactly three floats (floats[0..2]) from the array; floats must point to at least three elements.
+  CVector3f(const float* floats) : mSimd(floats[0], floats[1], floats[2]) {}
+
+  // Promotes a 2D vector: the lanes are taken from other.mSimd, then lanes 2 (z) and 3 are zeroed.
+  CVector3f(const CVector2f& other) : mSimd(other.mSimd) {
+    mSimd[2] = 0.0f;
+    mSimd[3] = 0.0f;
+  }
+
+  // Returns a CVector2f constructed from this vector's simd storage.
+  // NOTE(review): the whole mSimd is handed over; whether CVector2f discards lanes 2-3 is not visible here.
+  CVector2f toVec2f() const {
+    return CVector2f(mSimd);
+  }
+
+  // Exact floating-point comparison (no epsilon) of x, y and z; the fourth lane is ignored.
+  bool operator==(const CVector3f& rhs) const {
+    return x() == rhs.x() && y() == rhs.y() && z() == rhs.z();
+  }
+
+  // Logical negation of operator==.
+  bool operator!=(const CVector3f& rhs) const {
+    return !operator==(rhs);
+  }
+
+  // Lane-wise sum of the two vectors.
+  CVector3f operator+(const CVector3f& rhs) const {
+    return mSimd + rhs.mSimd;
+  }
+
+  // Lane-wise difference (this - rhs).
+  CVector3f operator-(const CVector3f& rhs) const {
+    return mSimd - rhs.mSimd;
+  }
+
+  // Unary negation of the simd storage.
+  CVector3f operator-() const {
+    return -mSimd;
+  }
+
+  // Lane-wise (Hadamard) product - not a dot or cross product.
+  CVector3f operator*(const CVector3f& rhs) const {
+    return mSimd * rhs.mSimd;
+  }
+
+  // Lane-wise quotient (this / rhs).
+  // NOTE(review): the division is applied to the whole simd value, so a zero in rhs's unused fourth lane
+  // presumably produces inf/NaN there; x()/y()/z() never read that lane - confirm it is harmless.
+  CVector3f operator/(const CVector3f& rhs) const {
+    return mSimd / rhs.mSimd;
+  }
+
+  // Adds a simd built from val to this vector's lanes.
+  CVector3f operator+(float val) const {
+    return mSimd + zeus::simd<float>(val);
+  }
+
+  // Subtracts a simd built from val from this vector's lanes.
+  CVector3f operator-(float val) const {
+    return mSimd - zeus::simd<float>(val);
+  }
+
+  // Uniform scale by val.
+  CVector3f operator*(float val) const {
+    return mSimd * zeus::simd<float>(val);
+  }
+
+  // Uniform division, implemented as one scalar reciprocal followed by a multiply.
+  // val == 0 is not guarded: the reciprocal is then infinite.
+  CVector3f operator/(float val) const {
+    float ooval = 1.f / val;
+    return mSimd * zeus::simd<float>(ooval);
+  }
+
+  // In-place lane-wise addition. Returns a const reference to *this, so the result cannot be
+  // modified further through the returned reference.
+  const CVector3f& operator+=(const CVector3f& rhs) {
+    mSimd += rhs.mSimd;
+    return *this;
+  }
+
+  // In-place lane-wise subtraction; returns a const reference to *this.
+  const CVector3f& operator-=(const CVector3f& rhs) {
+    mSimd -= rhs.mSimd;
+    return *this;
+  }
+
+  // In-place lane-wise multiplication; returns a const reference to *this.
+  const CVector3f& operator*=(const CVector3f& rhs) {
+    mSimd *= rhs.mSimd;
+    return *this;
+  }
+
+  // In-place lane-wise division; returns a const reference to *this.
+  const CVector3f& operator/=(const CVector3f& rhs) {
+    mSimd /= rhs.mSimd;
+    return *this;
+  }
+
+  // Scales this vector in place by the reciprocal of its magnitude.
+  // A zero-length vector is not guarded against; the reciprocal is then infinite.
+  void normalize() {
+    const float invMagnitude = 1.f / magnitude();
+    *this *= CVector3f(invMagnitude);
+  }
+
+  // Returns a copy scaled by the reciprocal of the magnitude; *this is left untouched.
+  // The same zero-length caveat as normalize() applies.
+  CVector3f normalized() const {
+    return operator*(1.f / magnitude());
+  }
+
+  // Cross product (this x rhs), computed from scalar components.
+  CVector3f cross(const CVector3f& rhs) const {
+    const float ax = x();
+    const float ay = y();
+    const float az = z();
+    const float bx = rhs.x();
+    const float by = rhs.y();
+    const float bz = rhs.z();
+    return CVector3f(ay * bz - az * by,
+                     az * bx - ax * bz,
+                     ax * by - ay * bx);
+  }
+
+  // Dot product with rhs, delegated to simd::dot3.
+  // NOTE(review): dot3 presumably sums the products of lanes 0-2 only - confirm in simd.hpp.
+  float dot(const CVector3f& rhs) const {
+    return mSimd.dot3(rhs.mSimd);
+  }
+
+  // Squared length: dot3 of the storage with itself. No square root is taken.
+  float magSquared() const {
+    return mSimd.dot3(mSimd);
+  }
+
+  // Euclidean length: square root of magSquared().
+  float magnitude() const {
+    return std::sqrt(magSquared());
+  }
+
+  // True when none of x, y, z is +/-infinity. NaN components are not rejected by this check.
+  bool isNotInf() const {
+    return !std::isinf(x()) && !std::isinf(y()) && !std::isinf(z());
+  }
+
+  // True when no component is infinite and the squared magnitude is at least 9.9999994e-29.
+  bool isMagnitudeSafe() const {
+    if (!isNotInf())
+      return false;
+    return magSquared() >= 9.9999994e-29;
+  }
+
+  // Resets this vector by assigning the shared skZero constant.
+  void zeroOut() {
+    *this = CVector3f::skZero;
+  }
+
+  // Replaces the storage with a simd built from the single value xyz.
+  void splat(float xyz) {
+    mSimd = zeus::simd<float>(xyz);
+  }
+
+  // Declared here, defined elsewhere.
+  static float getAngleDiff(const CVector3f& a, const CVector3f& b);
+
+  // Linear interpolation evaluated as (1 - t) * a + t * b. t is not clamped.
+  static CVector3f lerp(const CVector3f& a, const CVector3f& b, float t) {
+    return zeus::simd<float>(1.f - t) * a.mSimd + b.mSimd * zeus::simd<float>(t);
+  }
+
+  // Normalized linear interpolation: lerp() followed by normalized().
+  static CVector3f nlerp(const CVector3f& a, const CVector3f& b, float t) {
+    return lerp(a, b, t).normalized();
+  }
+
+  // Declared here, defined elsewhere.
+  static CVector3f slerp(const CVector3f& a, const CVector3f& b, float t);
+
+  // True when the squared length is within 0.01 of 1.
+  bool isNormalized() const {
+    const float deviation = 1.f - magSquared();
+    return std::fabs(deviation) < 0.01f;
+  }
+
+  // True when no component is infinite and at least one component has absolute value >= FLT_EPSILON.
+  bool canBeNormalized() const {
+    if (!isNotInf())
+      return false;
+    const float absX = std::fabs(x());
+    const float absY = std::fabs(y());
+    const float absZ = std::fabs(z());
+    return absX >= FLT_EPSILON || absY >= FLT_EPSILON || absZ >= FLT_EPSILON;
+  }
+
+  // True when the SQUARED magnitude does not exceed FLT_EPSILON.
+  bool isZero() const {
+    const float lengthSquared = magSquared();
+    return lengthSquared <= FLT_EPSILON;
+  }
+
+  // Rescales this vector in place so that its length becomes newLength.
+  // `length` first holds the SQUARED magnitude; when that is below FLT_EPSILON the vector is
+  // set to (newLength, 0, 0) instead of dividing by a near-zero length.
+  void scaleToLength(float newLength) {
+    float length = magSquared();
+    if (length < FLT_EPSILON) {
+      mSimd[0] = newLength, mSimd[1] = 0.f, mSimd[2] = 0.f;
+      return;
    }

-    void readBig(athena::io::IStreamReader& input)
-    {
-        x = input.readFloatBig();
-        y = input.readFloatBig();
-        z = input.readFloatBig();
-        v[3] = 0.0f;
-    }
+    // From here on `length` is the true magnitude.
+    length = std::sqrt(length);
+    float scalar = newLength / length;
+    *this *= CVector3f(scalar);
+  }

-    static CVector3f ReadBig(athena::io::IStreamReader& input)
-    {
-        CVector3f ret;
-        ret.readBig(input);
-        return ret;
-    }
-#endif
+  // Non-mutating variant of scaleToLength(): works on a copy and returns it.
+  CVector3f scaledToLength(float newLength) const {
+    CVector3f rescaled(*this);
+    rescaled.scaleToLength(newLength);
+    return rescaled;
+  }

-    CVector3f(const CVector3d& vec);
+  // Approximate equality: true when |other - *this| is within epsilon on each of x, y and z.
+  // The absolute value is essential: comparing the signed difference would report "equal" for any
+  // `other` whose components are arbitrarily smaller than this vector's.
+  // Only reads state, so it is const and usable on const vectors.
+  bool isEqu(const CVector3f& other, float epsilon = FLT_EPSILON) const {
+    const CVector3f diffVec = other - *this;
+    return std::fabs(diffVec.x()) <= epsilon &&
+           std::fabs(diffVec.y()) <= epsilon &&
+           std::fabs(diffVec.z()) <= epsilon;
+  }

-    explicit CVector3f(float xyz) { splat(xyz); }
-    void assign(float x, float y, float z)
-    {
-        v[0] = x;
-        v[1] = y;
-        v[2] = z;
-        v[3] = 0.0f;
-    }
-    CVector3f(float x, float y, float z) { assign(x, y, z); }
+  // Mutable component access by index (0 = x, 1 = y, 2 = z), bounds-checked by assert only.
+  // Returns the simd library's reference type rather than float&.
+  zeus::simd<float>::reference operator[](size_t idx) {
+    assert(idx < 3);
+    return mSimd[idx];
+  }

-    CVector3f(const float* floats)
-    {
-#if __SSE__
-        mVec128 = _mm_loadu_ps(floats);
-#else
-        x = floats[0];
-        y = floats[1];
-        z = floats[2];
-#endif
-        v[3] = 0.0f;
-    }
+  // Read-only component access by index (0 = x, 1 = y, 2 = z), bounds-checked by assert only.
+  // Returns the component by value, not by reference.
+  float operator[](size_t idx) const {
+    assert(idx < 3);
+    return mSimd[idx];
+  }

-    CVector3f(const CVector2f& other)
-    {
-        x = other.x;
-        y = other.y;
-        z = 0.0f;
-        v[3] = 0.0f;
-    }
+  // Read-only component accessors: x, y and z are lanes 0, 1 and 2 of mSimd, returned by value.
+  float x() const { return mSimd[0]; }
+  float y() const { return mSimd[1]; }
+  float z() const { return mSimd[2]; }

-    inline CVector2f toVec2f() const
-    {
-#if __SSE__
-        return CVector2f(mVec128);
-#else
-        return CVector2f(x, y);
-#endif
-    }
+  // Mutable component accessors: return the simd library's reference type for lanes 0, 1 and 2.
+  simd<float>::reference x() { return mSimd[0]; }
+  simd<float>::reference y() { return mSimd[1]; }
+  simd<float>::reference z() { return mSimd[2]; }

-    inline bool operator==(const CVector3f& rhs) const { return (x == rhs.x && y == rhs.y && z == rhs.z); }
-    inline bool operator!=(const CVector3f& rhs) const { return !(*this == rhs); }
-    inline CVector3f operator+(const CVector3f& rhs) const
-    {
-#if __SSE__
-        return CVector3f(_mm_add_ps(mVec128, rhs.mVec128));
-#elif __GEKKO_PS__
-        return CVector3f(__mm_gekko_add_ps(mVec128, rhs.mVec128));
-#else
-        return CVector3f(x + rhs.x, y + rhs.y, z + rhs.z);
-#endif
-    }
-    inline CVector3f operator-(const CVector3f& rhs) const
-    {
-#if __SSE__
-        return CVector3f(_mm_sub_ps(mVec128, rhs.mVec128));
-#else
-        return CVector3f(x - rhs.x, y - rhs.y, z - rhs.z);
-#endif
-    }
-    inline CVector3f operator-() const
-    {
-#if __SSE__
-        return CVector3f(_mm_sub_ps(_mm_xor_ps(mVec128, mVec128), mVec128));
-#elif __GEKKO_PS__
-        return CVector3f(_mm_gekko_neg_ps(mVec128));
-#else
-        return CVector3f(-x, -y, -z);
-#endif
-    }
-    inline CVector3f operator*(const CVector3f& rhs) const
-    {
-#if __SSE__
-        return CVector3f(_mm_mul_ps(mVec128, rhs.mVec128));
-#else
-        return CVector3f(x * rhs.x, y * rhs.y, z * rhs.z);
-#endif
-    }
-    inline CVector3f operator/(const CVector3f& rhs) const
-    {
-#if __SSE__
-        return CVector3f(_mm_div_ps(mVec128, rhs.mVec128));
-#else
-        return CVector3f(x / rhs.x, y / rhs.y, z / rhs.z);
-#endif
-    }
-    inline CVector3f operator+(float val) const
-    {
-#if __SSE__
-        TVectorUnion splat = {{val, val, val, 0.0f}};
-        return CVector3f(_mm_add_ps(mVec128, splat.mVec128));
-#else
-        return CVector3f(x + val, y + val, z + val);
-#endif
-    }
-    inline CVector3f operator-(float val) const
-    {
-#if __SSE__ || __GEKKO_PS__
-        TVectorUnion splat = {{val, val, val, 0.0f}};
-#endif
-#if __SSE__
-        return CVector3f(_mm_sub_ps(mVec128, splat.mVec128));
-#elif __GEKKO_PS__
-        return CVector3f(_mm_gekko_sub_ps(mVec128, splat.mVec128));
-#else
-        return CVector3f(x - val, y - val, z - val);
-#endif
-    }
-    inline CVector3f operator*(float val) const
-    {
-#if __SSE__ || __GEKKO_PS__
-        TVectorUnion splat = {{val, val, val, 0.0f}};
-#endif
-#if __SSE__
-        return CVector3f(_mm_mul_ps(mVec128, splat.mVec128));
-#elif __GEKKO_PS__
-        return CVector3f(_mm_gekko_mul_ps(mVec128, splat.mVec128));
-#else
-        return CVector3f(x * val, y * val, z * val);
-#endif
-    }
-    inline CVector3f operator/(float val) const
-    {
-        float ooval = 1.f / val;
-#if __SSE__ || __GEKKO_PS__
-        TVectorUnion splat = {{ooval, ooval, ooval, 0.0f}};
-#endif
-#if __SSE__
-        return CVector3f(_mm_mul_ps(mVec128, splat.mVec128));
-#elif __GEKKO_PS__
-        return CVector3f(_mm_gekko_mul_ps(mVec128, splat.mVec128));
-#else
-        return CVector3f(x * ooval, y * ooval, z * ooval);
-#endif
-    }
-    inline const CVector3f& operator+=(const CVector3f& rhs)
-    {
-#if __SSE__
-        mVec128 = _mm_add_ps(mVec128, rhs.mVec128);
-#elif __GEKKO_PS__
-        mVec128 = _mm_gekko_add_ps(mVec128, rhs.mVec128);
-#else
-        x += rhs.x;
-        y += rhs.y;
-        z += rhs.z;
-#endif
-        return *this;
-    }
-    inline const CVector3f& operator-=(const CVector3f& rhs)
-    {
-#if __SSE__
-        mVec128 = _mm_sub_ps(mVec128, rhs.mVec128);
-#else
-        x -= rhs.x;
-        y -= rhs.y;
-        z -= rhs.z;
-#endif
-        return *this;
-    }
-    inline const CVector3f& operator*=(const CVector3f& rhs)
-    {
-#if __SSE__
-        mVec128 = _mm_mul_ps(mVec128, rhs.mVec128);
-#else
-        x *= rhs.x;
-        y *= rhs.y;
-        z *= rhs.z;
-#endif
-        return *this;
-    }
-    inline const CVector3f& operator/=(const CVector3f& rhs)
-    {
-#if __SSE__
-        mVec128 = _mm_div_ps(mVec128, rhs.mVec128);
-#else
-        x /= rhs.x;
-        y /= rhs.y;
-        z /= rhs.z;
-#endif
-        return *this;
-    }
+  static const CVector3f skOne;
+  static const CVector3f skNegOne;
+  static const CVector3f skZero;
+  static const CVector3f skForward;
+  static const CVector3f skBack;
+  static const CVector3f skLeft;
+  static const CVector3f skRight;
+  static const CVector3f skUp;
+  static const CVector3f skDown;
+  static const CVector3f skRadToDegVec;
+  static const CVector3f skDegToRadVec;

-    inline void normalize()
-    {
-        float mag = 1.f / magnitude();
-        *this *= CVector3f(mag);
-    }
-    inline CVector3f normalized() const
-    {
-        float mag = 1.f / magnitude();
-        return *this * mag;
-    }
-    inline CVector3f cross(const CVector3f& rhs) const
-    {
-        return CVector3f(y * rhs.z - z * rhs.y,
-                         z * rhs.x - x * rhs.z,
-                         x * rhs.y - y * rhs.x);
-    }
+  // Lane-wise multiply of rad by the skRadToDegVec constant.
+  static CVector3f radToDeg(const CVector3f& rad) { return rad * skRadToDegVec; }

-    inline float dot(const CVector3f& rhs) const
-    {
-#if __SSE__
-        TVectorUnion result;
-#if __SSE4_1__
-        result.mVec128 = _mm_dp_ps(mVec128, rhs.mVec128, 0x71);
-        return result.v[0];
-#else
-        result.mVec128 = _mm_mul_ps(mVec128, rhs.mVec128);
-        return result.v[0] + result.v[1] + result.v[2];
-#endif
-#else
-        return (x * rhs.x) + (y * rhs.y) + (z * rhs.z);
-#endif
-    }
-
-    inline float magSquared() const
-    {
-#if __SSE__
-        TVectorUnion result;
-#if __SSE4_1__
-        result.mVec128 = _mm_dp_ps(mVec128, mVec128, 0x71);
-        return result.v[0];
-#else
-        result.mVec128 = _mm_mul_ps(mVec128, mVec128);
-        return result.v[0] + result.v[1] + result.v[2];
-#endif
-#else
-        return x * x + y * y + z * z;
-#endif
-    }
-
-    inline float magnitude() const { return std::sqrt(magSquared()); }
-
-    inline bool isNotInf() const
-    {
-        return !(std::isinf(x) || std::isinf(y) || std::isinf(z));
-    }
-
-    inline bool isMagnitudeSafe() const
-    {
-        return isNotInf() && magSquared() >= 9.9999994e-29;
-    }
-
-    inline void zeroOut()
-    {
-        *this = CVector3f::skZero;
-    }
-
-    inline void splat(float xyz)
-    {
-#if __SSE__
-        TVectorUnion splat = {{xyz, xyz, xyz, 0.0f}};
-        mVec128 = splat.mVec128;
-#else
-        v[0] = xyz;
-        v[1] = xyz;
-        v[2] = xyz;
-        v[3] = 0.0f;
-#endif
-    }
-
-    static float getAngleDiff(const CVector3f& a, const CVector3f& b);
-
-    static inline CVector3f lerp(const CVector3f& a, const CVector3f& b, float t) { return (a + (b - a) * t); }
-    static inline CVector3f nlerp(const CVector3f& a, const CVector3f& b, float t) { return lerp(a, b, t).normalized(); }
-    static CVector3f slerp(const CVector3f& a, const CVector3f& b, float t);
-
-    inline bool isNormalized() const { return std::fabs(1.f - magSquared()) < 0.01f; }
-
-    inline bool canBeNormalized() const
-    {
-        if (std::isinf(x) || std::isinf(y) || std::isinf(z))
-            return false;
-        return std::fabs(x) >= FLT_EPSILON || std::fabs(y) >= FLT_EPSILON || std::fabs(z) >= FLT_EPSILON;
-    }
-
-    inline bool isZero() const { return magSquared() <= 1.1920929e-7f; }
-
-    inline void scaleToLength(float newLength)
-    {
-        float length = magSquared();
-        if (length < 1.1920929e-7f)
-        {
-            x = newLength, y = 0.f, z = 0.f;
-            return;
-        }
-
-        length = std::sqrt(length);
-        float scalar = newLength / length;
-        *this *= CVector3f(scalar);
-    }
-
-    inline CVector3f scaledToLength(float newLength) const
-    {
-        CVector3f v = *this;
-        v.scaleToLength(newLength);
-        return v;
-    }
-
-    inline bool isEqu(const CVector3f& other, float epsilon = 1.1920929e-7f)
-    {
-        const CVector3f diffVec = other - *this;
-        return (diffVec.x <= epsilon && diffVec.y <= epsilon && diffVec.z <= epsilon);
-    }
-
-    inline float& operator[](size_t idx) { assert(idx < 3); return (&x)[idx]; }
-    inline const float& operator[](size_t idx) const { assert(idx < 3); return (&x)[idx]; }
-
-    static const CVector3f skOne;
-    static const CVector3f skNegOne;
-    static const CVector3f skZero;
-    static const CVector3f skForward;
-    static const CVector3f skBack;
-    static const CVector3f skLeft;
-    static const CVector3f skRight;
-    static const CVector3f skUp;
-    static const CVector3f skDown;
-    static const CVector3f skRadToDegVec;
-    static const CVector3f skDegToRadVec;
-
-    static CVector3f radToDeg(const CVector3f& rad) { return rad * skRadToDegVec; }
-    static CVector3f degToRad(const CVector3f& deg) { return deg * skDegToRadVec; }
+  // Lane-wise multiply of deg by the skDegToRadVec constant.
+  static CVector3f degToRad(const CVector3f& deg) { return deg * skDegToRadVec; }
 };

-static inline CVector3f operator+(float lhs, const CVector3f& rhs)
-{
-#if __SSE__
-    TVectorUnion splat = {{lhs, lhs, lhs, 0.0f}};
-    return CVector3f(_mm_add_ps(splat.mVec128, rhs.mVec128));
-#else
-    return CVector3f(lhs + rhs.x, lhs + rhs.y, lhs + rhs.z);
-#endif
+// Scalar-on-the-left addition: a simd<float> built from lhs plus rhs's lanes.
+static inline CVector3f operator+(float lhs, const CVector3f& rhs) {
+  return zeus::simd<float>(lhs) + rhs.mSimd;
 }

-static inline CVector3f operator-(float lhs, const CVector3f& rhs)
-{
-#if __SSE__
-    TVectorUnion splat = {{lhs, lhs, lhs, 0.0f}};
-    return CVector3f(_mm_sub_ps(splat.mVec128, rhs.mVec128));
-#else
-    return CVector3f(lhs - rhs.x, lhs - rhs.y, lhs - rhs.z);
-#endif
+// Scalar-on-the-left subtraction: a simd<float> built from lhs minus rhs's lanes (lhs - rhs, not rhs - lhs).
+static inline CVector3f operator-(float lhs, const CVector3f& rhs) {
+  return zeus::simd<float>(lhs) - rhs.mSimd;
 }

-static inline CVector3f operator*(float lhs, const CVector3f& rhs)
-{
-#if __SSE__
-    TVectorUnion splat = {{lhs, lhs, lhs, 0.0f}};
-    return CVector3f(_mm_mul_ps(splat.mVec128, rhs.mVec128));
-#else
-    return CVector3f(lhs * rhs.x, lhs * rhs.y, lhs * rhs.z);
-#endif
+// Scalar-on-the-left multiplication: a simd<float> built from lhs times rhs's lanes.
+static inline CVector3f operator*(float lhs, const CVector3f& rhs) {
+  return zeus::simd<float>(lhs) * rhs.mSimd;
 }

-static inline CVector3f operator/(float lhs, const CVector3f& rhs)
-{
-#if __SSE__
-    TVectorUnion splat = {{lhs, lhs, lhs, 0.0f}};
-    return CVector3f(_mm_div_ps(splat.mVec128, rhs.mVec128));
-#else
-    return CVector3f(lhs / rhs.x, lhs / rhs.y, lhs / rhs.z);
-#endif
+// Scalar-on-the-left division: a simd<float> built from lhs divided by rhs's lanes (lhs / rhs, lane by lane).
+static inline CVector3f operator/(float lhs, const CVector3f& rhs) {
+  return zeus::simd<float>(lhs) / rhs.mSimd;
 }

 }
--- a/include/zeus/CVector4f.hpp
+++ b/include/zeus/CVector4f.hpp
@ -1,420 +1,261 @@
 #pragma once

 #include "Global.hpp"
-#include "TVectorUnion.hpp"
 #include "zeus/CVector3f.hpp"
+
 #if ZE_ATHENA_TYPES
-#include <athena/IStreamReader.hpp>
+
+#include "athena/IStreamReader.hpp"
+
 #endif
+
 #include "zeus/Math.hpp"
 #include <cfloat>
 #include <cassert>

-namespace zeus
-{
+namespace zeus {
 class CColor;
-class alignas(16) CVector4f
-{
-#if __atdna__
-    float clangVec __attribute__((__vector_size__(16)));
-#endif
+
+class CVector4f {
 public:
-    ZE_DECLARE_ALIGNED_ALLOCATOR();
-    union {
-        struct
-        {
-            float x, y, z, w;
-        };
-        float v[4];
-#if __SSE__
-        __m128 mVec128;
-#endif
-    };
+  // Backing storage; the x()/y()/z()/w() accessors of this class read lanes 0 through 3.
+  zeus::simd<float> mSimd;
+
+  // Default constructor: initialises mSimd from 0.f.
+  CVector4f() : mSimd(0.f) {}
+
+  // Wraps an existing simd value of any element type T. Deliberately non-explicit: the arithmetic
+  // operators of this class return raw simd expressions and rely on this implicit conversion.
+  template <typename T>
+  CVector4f(const simd<T>& s) : mSimd(s) {}

-    inline CVector4f() { zeroOut(); }
-#if __SSE__
-    CVector4f(const __m128& mVec128) : mVec128(mVec128) {}
-#endif
 #if ZE_ATHENA_TYPES
-    CVector4f(const atVec4f& vec)
-#if __SSE__
-    : mVec128(vec.mVec128)
-    {
-    }
-#else
-    {
-        x = vec.vec[0], y = vec.vec[1], z = vec.vec[2], w = vec.vec[3];
-    }
+
+  // Builds a vector from an athena atVec4f by copying its `simd` member.
+  CVector4f(const atVec4f& vec) : mSimd(vec.simd) {}
+
+  // Views this object as an atVec4f without copying.
+  // NOTE(review): relies on CVector4f and atVec4f sharing the same object layout - confirm against athena.
+  operator atVec4f&() {
+    return *reinterpret_cast<atVec4f*>(this);
+  }
+
+  // Const counterpart of the conversion above; the same layout assumption applies.
+  operator const atVec4f&() const {
+    return *reinterpret_cast<const atVec4f*>(this);
+  }
+
+  // Fills this vector from a stream: four consecutive input.readFloatBig() calls supply
+  // x, y, z and w (in that order), and the staged values are then loaded into mSimd.
+  void readBig(athena::io::IStreamReader& input) {
+    simd_floats lanes;
+    for (size_t lane = 0; lane < 4; ++lane)
+      lanes[lane] = input.readFloatBig();
+    mSimd.copy_from(lanes);
+  }
+
 #endif

-    operator atVec4f&()
-    {
-        return *reinterpret_cast<atVec4f*>(v);
-    }
-    operator const atVec4f&() const
-    {
-        return *reinterpret_cast<const atVec4f*>(v);
-    }
+  // Constructs mSimd from the single value xyzw. Explicit so a bare float never silently becomes a vector.
+  explicit CVector4f(float xyzw) : mSimd(xyzw) {}

-    void readBig(athena::io::IStreamReader& input)
-    {
-        x = input.readFloatBig();
-        y = input.readFloatBig();
-        z = input.readFloatBig();
-        w = input.readFloatBig();
-    }
-#endif
+  // Replaces the contents with a simd built from (x, y, z, w).
+  void assign(float x, float y, float z, float w) {
+    mSimd = simd<float>(x, y, z, w);
+  }

-    explicit CVector4f(float xyzw) { splat(xyzw); }
-    void assign(float x, float y, float z, float w)
-    {
-        v[0] = x;
-        v[1] = y;
-        v[2] = z;
-        v[3] = w;
-    }
-    CVector4f(float x, float y, float z, float w) { assign(x, y, z, w); }
-    CVector4f(const CColor& other);
+  // Component-wise constructor.
+  CVector4f(float x, float y, float z, float w) : mSimd(x, y, z, w) {}

-    CVector4f(const CVector3f& other, float wIn = 1.f)
-    {
-#if __SSE__
-        mVec128 = other.mVec128;
-#else
-        x = other.x;
-        y = other.y;
-        z = other.z;
-#endif
-        w = wIn;
-    }
+  CVector4f(const CColor& other);

-    static CVector4f ToClip(const zeus::CVector3f& v, float w)
-    {
-        return CVector4f(v * w, w);
-    }
+  // Extends a 3D vector: the storage is copied from other, then lane 3 (w) is overwritten with wIn
+  // (1.0 by default).
+  CVector4f(const CVector3f& other, float wIn = 1.f) : mSimd(other.mSimd) {
+    mSimd[3] = wIn;
+  }

-    inline CVector3f toVec3f() const
-    {
-#if __SSE__
-        return CVector3f(mVec128);
-#else
-        return CVector3f(x, y, z);
-#endif
-    }
+  // Builds (v.x * w, v.y * w, v.z * w, w): the 3D vector is scaled by w and w is stored in lane 3.
+  static CVector4f ToClip(const zeus::CVector3f& v, float w) {
+    return CVector4f(v * w, w);
+  }

-    CVector4f& operator=(const CColor& other);
-    inline bool operator==(const CVector4f& rhs) const
-    {
-#if __SSE__
-        TVectorUnion vec;
-        vec.mVec128 = _mm_cmpeq_ps(mVec128, rhs.mVec128);
-        return (vec.v[0] != 0 && vec.v[1] != 0 && vec.v[2] != 0 && vec.v[3] != 0);
-#else
-        return (x == rhs.x && y == rhs.y && z == rhs.z && w == rhs.w);
-#endif
-    }
-    inline bool operator!=(const CVector4f& rhs) const
-    {
-#if __SSE__
-        TVectorUnion vec;
-        vec.mVec128 = _mm_cmpneq_ps(mVec128, rhs.mVec128);
-        return (vec.v[0] != 0 && vec.v[1] != 0 && vec.v[2] != 0 && vec.v[3] != 0);
-#else
-        return !(*this == rhs);
-#endif
-    }
-    inline bool operator<(const CVector4f& rhs) const
-    {
-#if __SSE__
-        TVectorUnion vec;
-        vec.mVec128 = _mm_cmplt_ps(mVec128, rhs.mVec128);
-        return (vec.v[0] != 0 || vec.v[1] != 0 || vec.v[2] != 0 || vec.v[3] != 0);
-#else
-        return (x < rhs.x || y < rhs.y || z < rhs.z || w < rhs.w);
-#endif
-    }
-    inline bool operator<=(const CVector4f& rhs) const
-    {
-#if __SSE__
-        TVectorUnion vec;
-        vec.mVec128 = _mm_cmple_ps(mVec128, rhs.mVec128);
-        return (vec.v[0] != 0 || vec.v[1] != 0 || vec.v[2] != 0 || vec.v[3] != 0);
-#else
-        return (x <= rhs.x || y <= rhs.y || z <= rhs.z || w <= rhs.w);
-#endif
-    }
-    inline bool operator>(const CVector4f& rhs) const
-    {
-#if __SSE__
-        TVectorUnion vec;
-        vec.mVec128 = _mm_cmpgt_ps(mVec128, rhs.mVec128);
-        return (vec.v[0] != 0 || vec.v[1] != 0 || vec.v[2] != 0 || vec.v[3] != 0);
-#else
-        return (x > rhs.x || y > rhs.y || z > rhs.z || w > rhs.w);
-#endif
-    }
-    inline bool operator>=(const CVector4f& rhs) const
-    {
-#if __SSE__
-        TVectorUnion vec;
-        vec.mVec128 = _mm_cmpge_ps(mVec128, rhs.mVec128);
-        return (vec.v[0] != 0 || vec.v[1] != 0 || vec.v[2] != 0 || vec.v[3] != 0);
-#else
-        return (x >= rhs.x || y >= rhs.y || z >= rhs.z || w >= rhs.w);
-#endif
-    }
-    inline CVector4f operator+(const CVector4f& rhs) const
-    {
-#if __SSE__
-        return CVector4f(_mm_add_ps(mVec128, rhs.mVec128));
-#else
-        return CVector4f(x + rhs.x, y + rhs.y, z + rhs.z, w + rhs.w);
-#endif
-    }
-    inline CVector4f operator-(const CVector4f& rhs) const
-    {
-#if __SSE__
-        return CVector4f(_mm_sub_ps(mVec128, rhs.mVec128));
-#else
-        return CVector4f(x - rhs.x, y - rhs.y, z - rhs.z, w - rhs.w);
-#endif
-    }
-    inline CVector4f operator-() const
-    {
-#if __SSE__
-        return CVector4f(_mm_sub_ps(_mm_xor_ps(mVec128, mVec128), mVec128));
-#else
-        return CVector4f(-x, -y, -z, -w);
-#endif
-    }
-    inline CVector4f operator*(const CVector4f& rhs) const
-    {
-#if __SSE__
-        return CVector4f(_mm_mul_ps(mVec128, rhs.mVec128));
-#else
-        return CVector4f(x * rhs.x, y * rhs.y, z * rhs.z, w * rhs.w);
-#endif
-    }
-    inline CVector4f operator/(const CVector4f& rhs) const
-    {
-#if __SSE__
-        return CVector4f(_mm_div_ps(mVec128, rhs.mVec128));
-#else
-        return CVector4f(x / rhs.x, y / rhs.y, z / rhs.z, w / rhs.w);
-#endif
-    }
-    inline CVector4f operator+(float val) const
-    {
-#if __SSE__
-        TVectorUnion splat = {{val, val, val, val}};
-        return CVector4f(_mm_add_ps(mVec128, splat.mVec128));
-#else
-        return CVector4f(x + val, y + val, z + val, w + val);
-#endif
-    }
-    inline CVector4f operator-(float val) const
-    {
-#if __SSE__
-        TVectorUnion splat = {{val, val, val, val}};
-        return CVector4f(_mm_sub_ps(mVec128, splat.mVec128));
-#else
-        return CVector4f(x - val, y - val, z - val, w - val);
-#endif
-    }
-    inline CVector4f operator*(float val) const
-    {
-#if __SSE__
-        TVectorUnion splat = {{val, val, val, val}};
-        return CVector4f(_mm_mul_ps(mVec128, splat.mVec128));
-#else
-        return CVector4f(x * val, y * val, z * val, w * val);
-#endif
-    }
-    inline CVector4f operator/(float val) const
-    {
-        float ooval = 1.f / val;
-#if __SSE__
-        TVectorUnion splat = {{ooval, ooval, ooval, ooval}};
-        return CVector4f(_mm_mul_ps(mVec128, splat.mVec128));
-#else
-        return CVector4f(x * ooval, y * ooval, z * ooval, w * ooval);
-#endif
-    }
-    inline const CVector4f& operator+=(const CVector4f& rhs)
-    {
-#if __SSE__
-        mVec128 = _mm_add_ps(mVec128, rhs.mVec128);
-#else
-        x += rhs.x;
-        y += rhs.y;
-        z += rhs.z;
-        w += rhs.w;
-#endif
-        return *this;
-    }
-    inline const CVector4f& operator-=(const CVector4f& rhs)
-    {
-#if __SSE__
-        mVec128 = _mm_sub_ps(mVec128, rhs.mVec128);
-#else
-        x -= rhs.x;
-        y -= rhs.y;
-        z -= rhs.z;
-        w -= rhs.w;
-#endif
-        return *this;
-    }
-    inline const CVector4f& operator*=(const CVector4f& rhs)
-    {
-#if __SSE__
-        mVec128 = _mm_mul_ps(mVec128, rhs.mVec128);
-#else
-        x *= rhs.x;
-        y *= rhs.y;
-        z *= rhs.z;
-        w *= rhs.w;
-#endif
-        return *this;
-    }
-    inline const CVector4f& operator/=(const CVector4f& rhs)
-    {
-#if __SSE__
-        mVec128 = _mm_div_ps(mVec128, rhs.mVec128);
-#else
-        x /= rhs.x;
-        y /= rhs.y;
-        z /= rhs.z;
-        w /= rhs.w;
-#endif
-        return *this;
-    }
-    inline void normalize()
-    {
-        float mag = magnitude();
-        mag = 1.f / mag;
-        *this *= CVector4f(mag);
-    }
-    inline CVector4f normalized() const
-    {
-        float mag = magnitude();
-        mag = 1.f / mag;
-        return *this * mag;
-    }
+  // Returns a CVector3f constructed directly from this vector's simd storage.
+  // The storage is passed through as-is, so lane 3 (w) is carried over rather than zeroed.
+  CVector3f toVec3f() const {
+    return CVector3f(mSimd);
+  }

-    inline float dot(const CVector4f& rhs) const
-    {
-#if __SSE__
-        TVectorUnion result;
-#if __SSE4_1__
-        result.mVec128 = _mm_dp_ps(mVec128, rhs.mVec128, 0xF1);
-        return result.v[0];
-#else
-        result.mVec128 = _mm_mul_ps(mVec128, rhs.mVec128);
-        return result.v[0] + result.v[1] + result.v[2] + result.v[3];
-#endif
-#else
-        return (x * rhs.x) + (y * rhs.y) + (z * rhs.z) + (w * rhs.w);
-#endif
-    }
-    inline float magSquared() const
-    {
-#if __SSE__
-        TVectorUnion result;
-#if __SSE4_1__
-        result.mVec128 = _mm_dp_ps(mVec128, mVec128, 0xF1);
-        return result.v[0];
-#else
-        result.mVec128 = _mm_mul_ps(mVec128, mVec128);
-        return result.v[0] + result.v[1] + result.v[2];
-#endif
-#else
-        return x * x + y * y + z * z + w * w;
-#endif
-    }
-    inline float magnitude() const { return std::sqrt(magSquared()); }
+  CVector4f& operator=(const CColor& other);

-    inline void zeroOut()
-    {
-        *this = CVector4f::skZero;
-    }
+  // Exact equality: true only when all four lanes of the comparison mask are set.
+  bool operator==(const CVector4f& rhs) const {
+    auto lanes_equal = mSimd == rhs.mSimd;
+    return lanes_equal[0] && lanes_equal[1] && lanes_equal[2] && lanes_equal[3];
+  }

-    inline void splat(float xyzw)
-    {
-#if __SSE__
-        TVectorUnion splat = {{xyzw, xyzw, xyzw, xyzw}};
-        mVec128 = splat.mVec128;
-#else
-        v[0] = xyz;
-        v[1] = xyz;
-        v[2] = xyz;
-        v[3] = xyzw;
-#endif
-    }
+  // Inequality: true as soon as ANY lane differs (the complement of the all-lanes operator==).
+  bool operator!=(const CVector4f& rhs) const {
+    auto lanes_differ = mSimd != rhs.mSimd;
+    return lanes_differ[0] || lanes_differ[1] || lanes_differ[2] || lanes_differ[3];
+  }

-    static inline CVector4f lerp(const CVector4f& a, const CVector4f& b, float t) { return (a + (b - a) * t); }
-    static inline CVector4f nlerp(const CVector4f& a, const CVector4f& b, float t) { return lerp(a, b, t).normalized(); }
+  // True only when EVERY lane of this vector is less than the matching lane of rhs.
+  // This is a partial order: both a < b and b < a can be false for unequal vectors.
+  bool operator<(const CVector4f& rhs) const {
+    auto lanes_less = mSimd < rhs.mSimd;
+    return lanes_less[0] && lanes_less[1] && lanes_less[2] && lanes_less[3];
+  }

-    inline bool isNormalized() const { return std::fabs(1.f - magSquared()) < 0.01f; }
+  // True only when EVERY lane of this vector is less than or equal to the matching lane of rhs.
+  bool operator<=(const CVector4f& rhs) const {
+    auto lanes_less_equal = mSimd <= rhs.mSimd;
+    return lanes_less_equal[0] && lanes_less_equal[1] && lanes_less_equal[2] && lanes_less_equal[3];
+  }

-    inline bool canBeNormalized() const
-    {
-        if (std::isinf(x) || std::isinf(y) || std::isinf(z) || std::isinf(w))
-            return false;
-        return std::fabs(x) >= FLT_EPSILON || std::fabs(y) >= FLT_EPSILON || std::fabs(z) >= FLT_EPSILON || std::fabs(w) >= FLT_EPSILON;
-    }
+  // True only when EVERY lane of this vector is greater than the matching lane of rhs.
+  bool operator>(const CVector4f& rhs) const {
+    auto lanes_greater = mSimd > rhs.mSimd;
+    return lanes_greater[0] && lanes_greater[1] && lanes_greater[2] && lanes_greater[3];
+  }

-    inline bool isEqu(const CVector4f& other, float epsilon = 1.1920929e-7f)
-    {
-        const CVector4f diffVec = other - *this;
-        return (diffVec.x <= epsilon && diffVec.y <= epsilon && diffVec.z <= epsilon && diffVec.w <= epsilon);
-    }
+  // True only when EVERY lane of this vector is greater than or equal to the matching lane of rhs.
+  bool operator>=(const CVector4f& rhs) const {
+    auto lanes_greater_equal = mSimd >= rhs.mSimd;
+    return lanes_greater_equal[0] && lanes_greater_equal[1] && lanes_greater_equal[2] && lanes_greater_equal[3];
+  }

-    inline float& operator[](size_t idx) { assert(idx < 4); return (&x)[idx]; }
-    inline const float& operator[](size_t idx) const { assert(idx < 4); return (&x)[idx]; }
+  // Lane-wise sum of the two vectors (all four lanes).
+  CVector4f operator+(const CVector4f& rhs) const {
+    return mSimd + rhs.mSimd;
+  }

-    static const CVector4f skOne;
-    static const CVector4f skNegOne;
-    static const CVector4f skZero;
+  // Lane-wise difference (this - rhs).
+  CVector4f operator-(const CVector4f& rhs) const {
+    return mSimd - rhs.mSimd;
+  }
+
+  // Unary negation of the simd storage.
+  CVector4f operator-() const {
+    return -mSimd;
+  }
+
+  // Lane-wise (Hadamard) product - not a dot product.
+  CVector4f operator*(const CVector4f& rhs) const {
+    return mSimd * rhs.mSimd;
+  }
+
+  // Lane-wise quotient (this / rhs).
+  CVector4f operator/(const CVector4f& rhs) const {
+    return mSimd / rhs.mSimd;
+  }
+
+  // Adds a simd built from val to this vector's lanes.
+  CVector4f operator+(float val) const {
+    return mSimd + zeus::simd<float>(val);
+  }
+
+  // Subtracts a simd built from val from this vector's lanes.
+  CVector4f operator-(float val) const {
+    return mSimd - zeus::simd<float>(val);
+  }
+
+  // Uniform scale by val.
+  CVector4f operator*(float val) const {
+    return mSimd * zeus::simd<float>(val);
+  }
+
+  // Uniform division, implemented as one scalar reciprocal followed by a multiply.
+  // val == 0 is not guarded: the reciprocal is then infinite.
+  CVector4f operator/(float val) const {
+    float ooval = 1.f / val;
+    return mSimd * zeus::simd<float>(ooval);
+  }
+
+  // In-place lane-wise addition. Returns a const reference to *this, so the result cannot be
+  // modified further through the returned reference.
+  const CVector4f& operator+=(const CVector4f& rhs) {
+    mSimd += rhs.mSimd;
+    return *this;
+  }
+
+  // In-place lane-wise subtraction; returns a const reference to *this.
+  const CVector4f& operator-=(const CVector4f& rhs) {
+    mSimd -= rhs.mSimd;
+    return *this;
+  }
+
+  // In-place lane-wise multiplication; returns a const reference to *this.
+  const CVector4f& operator*=(const CVector4f& rhs) {
+    mSimd *= rhs.mSimd;
+    return *this;
+  }
+
+  // In-place lane-wise division; returns a const reference to *this.
+  const CVector4f& operator/=(const CVector4f& rhs) {
+    mSimd /= rhs.mSimd;
+    return *this;
+  }
+
+  // Scales this vector in place by the reciprocal of its magnitude.
+  // A zero-length vector is not guarded against; the reciprocal is then infinite.
+  void normalize() {
+    const float invMagnitude = 1.f / magnitude();
+    *this *= CVector4f(invMagnitude);
+  }
+
+  // Returns a copy scaled by the reciprocal of the magnitude; *this is left untouched.
+  // The same zero-length caveat as normalize() applies.
+  CVector4f normalized() const {
+    const float invMagnitude = 1.f / magnitude();
+    return operator*(invMagnitude);
+  }
+
+  // Dot product with rhs, delegated to simd::dot4.
+  // NOTE(review): dot4 presumably sums the products of all four lanes - confirm in simd.hpp.
+  float dot(const CVector4f& rhs) const {
+    return mSimd.dot4(rhs.mSimd);
+  }
+
+  // Squared length: dot4 of the storage with itself. No square root is taken.
+  float magSquared() const {
+    return mSimd.dot4(mSimd);
+  }
+
+  // Euclidean length: square root of magSquared().
+  float magnitude() const {
+    return std::sqrt(magSquared());
+  }
+
+  // Resets this vector by assigning the shared skZero constant.
+  void zeroOut() {
+    *this = CVector4f::skZero;
+  }
+
+  // Replaces the storage with a simd built from the single value xyzw.
+  void splat(float xyzw) {
+    mSimd = zeus::simd<float>(xyzw);
+  }
+
+  // Linear interpolation evaluated as (1 - t) * a + t * b. t is not clamped.
+  static CVector4f lerp(const CVector4f& a, const CVector4f& b, float t) {
+    return zeus::simd<float>(1.f - t) * a.mSimd + b.mSimd * zeus::simd<float>(t);
+  }
+
+  // Normalized linear interpolation: lerp() followed by normalized().
+  static CVector4f nlerp(const CVector4f& a, const CVector4f& b, float t) {
+    return lerp(a, b, t).normalized();
+  }
+
+  // True when the squared length is within 0.01 of 1.
+  bool isNormalized() const {
+    const float deviation = 1.f - magSquared();
+    return std::fabs(deviation) < 0.01f;
+  }
+
+  // True when no component is infinite and at least one component has absolute value >= FLT_EPSILON.
+  bool canBeNormalized() const {
+    const float cx = x();
+    const float cy = y();
+    const float cz = z();
+    const float cw = w();
+    if (std::isinf(cx) || std::isinf(cy) || std::isinf(cz) || std::isinf(cw))
+      return false;
+    return std::fabs(cx) >= FLT_EPSILON || std::fabs(cy) >= FLT_EPSILON ||
+           std::fabs(cz) >= FLT_EPSILON || std::fabs(cw) >= FLT_EPSILON;
+  }
+
+  // Approximate equality: true when |other - *this| is within epsilon on each of x, y, z and w.
+  // The absolute value is essential: comparing the signed difference would report "equal" for any
+  // `other` whose components are arbitrarily smaller than this vector's.
+  // Only reads state, so it is const and usable on const vectors.
+  bool isEqu(const CVector4f& other, float epsilon = FLT_EPSILON) const {
+    const CVector4f diffVec = other - *this;
+    return (std::fabs(diffVec.x()) <= epsilon && std::fabs(diffVec.y()) <= epsilon &&
+            std::fabs(diffVec.z()) <= epsilon && std::fabs(diffVec.w()) <= epsilon);
+  }
+
+  // Mutable component access by index (0 = x ... 3 = w), bounds-checked by assert only.
+  // Returns the simd library's reference type rather than float&.
+  zeus::simd<float>::reference operator[](size_t idx) {
+    assert(idx < 4);
+    return mSimd[idx];
+  }
+
+  // Read-only component access by index; returns the component by value, not by reference.
+  float operator[](size_t idx) const {
+    assert(idx < 4);
+    return mSimd[idx];
+  }
+
+  // Read-only component accessors: x, y, z and w are lanes 0 through 3 of mSimd, returned by value.
+  float x() const { return mSimd[0]; }
+  float y() const { return mSimd[1]; }
+  float z() const { return mSimd[2]; }
+  float w() const { return mSimd[3]; }
+
+  // Mutable component accessors: return the simd library's reference type for lanes 0 through 3.
+  simd<float>::reference x() { return mSimd[0]; }
+  simd<float>::reference y() { return mSimd[1]; }
+  simd<float>::reference z() { return mSimd[2]; }
+  simd<float>::reference w() { return mSimd[3]; }
+
+  static const CVector4f skOne;
+  static const CVector4f skNegOne;
+  static const CVector4f skZero;
 };

-static inline CVector4f operator+(float lhs, const CVector4f& rhs)
-{
-#if __SSE__
-    TVectorUnion splat = {{lhs, lhs, lhs, lhs}};
-    return CVector4f(_mm_add_ps(splat.mVec128, rhs.mVec128));
-#else
-    return CVector4f(lhs + rhs.x, lhs + rhs.y, lhs + rhs.z, lhs + rhs.w);
-#endif
+// Scalar-on-the-left addition: a simd<float> built from lhs plus rhs's lanes.
+// Kept `static inline`, like the CVector3f/CVector3d counterparts: this lives in a header, and a
+// plain `static` function is prone to unused-function warnings in every translation unit that
+// includes it without calling it.
+static inline CVector4f operator+(float lhs, const CVector4f& rhs) {
+  return zeus::simd<float>(lhs) + rhs.mSimd;
 }

-static inline CVector4f operator-(float lhs, const CVector4f& rhs)
-{
-#if __SSE__
-    TVectorUnion splat = {{lhs, lhs, lhs, lhs}};
-    return CVector4f(_mm_sub_ps(splat.mVec128, rhs.mVec128));
-#else
-    return CVector4f(lhs - rhs.x, lhs - rhs.y, lhs - rhs.z, lhs - rhs.w);
-#endif
+// Scalar-on-the-left subtraction: a simd<float> built from lhs minus rhs's lanes (lhs - rhs, not rhs - lhs).
+// Kept `static inline`, like the CVector3f/CVector3d counterparts, because a plain `static`
+// function in a header is prone to unused-function warnings in translation units that never call it.
+static inline CVector4f operator-(float lhs, const CVector4f& rhs) {
+  return zeus::simd<float>(lhs) - rhs.mSimd;
 }

-static inline CVector4f operator*(float lhs, const CVector4f& rhs)
-{
-#if __SSE__
-    TVectorUnion splat = {{lhs, lhs, lhs, lhs}};
-    return CVector4f(_mm_mul_ps(splat.mVec128, rhs.mVec128));
-#else
-    return CVector4f(lhs * rhs.x, lhs * rhs.y, lhs * rhs.z, lhs * rhs.w);
-#endif
+// Scalar-on-the-left multiplication: a simd<float> built from lhs times rhs's lanes.
+// Kept `static inline`, like the CVector3f/CVector3d counterparts, because a plain `static`
+// function in a header is prone to unused-function warnings in translation units that never call it.
+static inline CVector4f operator*(float lhs, const CVector4f& rhs) {
+  return zeus::simd<float>(lhs) * rhs.mSimd;
+}
+
+// Scalar-on-the-left division: a simd<float> built from lhs divided by rhs's lanes (lhs / rhs, lane by lane).
+// `static inline` for the same reason as operator* above.
+static inline CVector4f operator/(float lhs, const CVector4f& rhs) {
+  return zeus::simd<float>(lhs) / rhs.mSimd;
 }

-static inline CVector4f operator/(float lhs, const CVector4f& rhs)
-{
-#if __SSE__
-    TVectorUnion splat = {{lhs, lhs, lhs, lhs}};
-    return CVector4f(_mm_div_ps(splat.mVec128, rhs.mVec128));
-#else
-    return CVector4f(lhs / rhs.x, lhs / rhs.y, lhs / rhs.z, lhs / rhs.w);
-#endif
-}
 }

--- a/include/zeus/Global.hpp
+++ b/include/zeus/Global.hpp
@ -1,61 +1,19 @@
 #pragma once

-#if _M_IX86_FP >= 1 || _M_X64
-#define __SSE__ 1
-#endif
-
-#if __SSE__
-#include <immintrin.h>
-#ifndef _MSC_VER
-#include <mm_malloc.h>
-#endif
-#define zeAlloc(sz, align) _mm_malloc(sz, align)
-#define zeFree(ptr) _mm_free(ptr)
-#elif GEKKO
-#include <ps_intrins.h>
-#define zeAlloc(sz, align) _ps_malloc(sz, align)
-#define zeFree(ptr) _ps_free(ptr)
-#endif
-
-#if __SSE__ || __GEKKO_PS__
-#define ZE_DECLARE_ALIGNED_ALLOCATOR()                                                                                         \
-    inline void* operator new(size_t sizeInBytes) { return zeAlloc(sizeInBytes, 16); }                                         \
-    inline void operator delete(void* ptr) { zeFree(ptr); }                                                                    \
-    inline void* operator new(size_t, void* ptr) { return ptr; }                                                               \
-    inline void operator delete(void*, void*) {}                                                                               \
-    inline void* operator new[](size_t sizeInBytes) { return zeAlloc(sizeInBytes, 16); }                                       \
-    inline void operator delete[](void* ptr) { zeFree(ptr); }                                                                  \
-    inline void* operator new[](size_t, void* ptr) { return ptr; }                                                             \
-    inline void operator delete[](void*, void*) {}                                                                             \
-    void __unused__()
-#define ZE_DECLARE_ALIGNED_ALLOCATOR32()                                                                                       \
-    inline void* operator new(size_t sizeInBytes) { return zeAlloc(sizeInBytes, 32); }                                         \
-    inline void operator delete(void* ptr) { zeFree(ptr); }                                                                    \
-    inline void* operator new(size_t, void* ptr) { return ptr; }                                                               \
-    inline void operator delete(void*, void*) {}                                                                               \
-    inline void* operator new[](size_t sizeInBytes) { return zeAlloc(sizeInBytes, 32); }                                       \
-    inline void operator delete[](void* ptr) { zeFree(ptr); }                                                                  \
-    inline void* operator new[](size_t, void* ptr) { return ptr; }                                                             \
-    inline void operator delete[](void*, void*) {}                                                                             \
-    void __unused__()
+#if ZE_ATHENA_TYPES
+#include "athena/IStreamReader.hpp"
+#include "athena/simd/simd.hpp"
 #else
-#define ZE_DECLARE_ALIGNED_ALLOCATOR() void __unused__()
-#define ZE_DECLARE_ALIGNED_ALLOCATOR32() void __unused__()
+#include "simd/simd.hpp"
 #endif

-#if __SSE__
-#define ZE_SHUFFLE(x, y, z, w) ((w) << 6 | (z) << 4 | (y) << 2 | (x))
-#define ze_pshufd_ps(_a, _mask) _mm_shuffle_ps((_a), (_a), (_mask))
-#define ze_splat3_ps(_a, _i) ze_pshufd_ps((_a), ZE_SHUFFLE(_i, _i, _i, 3))
-#define ze_splat_ps(_a, _i) ze_pshufd_ps((_a), ZE_SHUFFLE(_i, _i, _i, _i))
-#if _WIN32
-#define zeCastiTo128f(a) (_mm_castsi128_ps(a))
-#else
-#define zeCastiTo128f(a) ((__m128)(a))
-#endif
-#elif __GEKKO_PS__
-
+// When ZE_ATHENA_TYPES is set, the SIMD vocabulary types are aliases of athena's.
+namespace zeus {
 #if ZE_ATHENA_TYPES
+template<typename T>
+using simd = athena::simd<T>;
+using simd_floats = athena::simd_floats;
+using simd_doubles = athena::simd_doubles;
 #endif
+} // namespace zeus

 // Rotates the 32-bit pattern of x right by n positions (n is reduced modulo 32).
 // The work is done on an unsigned copy: right-shifting a negative int sign-extends
 // and would smear the sign bit into the result, and a plain shift by 32 (n == 0)
 // is undefined. Assumes a 32-bit int, as the original expression did.
 inline int rotr(int x, int n) {
   const unsigned int bits = static_cast<unsigned int>(x);
   const unsigned int count = static_cast<unsigned int>(n) & 31u;
   return static_cast<int>((bits >> count) | (bits << ((32u - count) & 31u)));
 }
 // Rotates the 32-bit pattern of x left by n positions (n is reduced modulo 32).
 // The work is done on an unsigned copy: the wrap-around half uses a right shift,
 // which sign-extends for a negative int, and a plain shift by 32 (n == 0) is
 // undefined. Assumes a 32-bit int, as the original expression did.
 inline int rotl(int x, int n) {
   const unsigned int bits = static_cast<unsigned int>(x);
   const unsigned int count = static_cast<unsigned int>(n) & 31u;
   return static_cast<int>((bits << count) | (bits >> ((32u - count) & 31u)));
 }
--- a/include/zeus/Math.hpp
+++ b/include/zeus/Math.hpp
@ -1,6 +1,7 @@
 #pragma once

 #include <cfloat>
+
 #undef min
 #undef max

@ -26,8 +27,7 @@
 #include <cmath>
 #include <algorithm>

-namespace zeus
-{
+namespace zeus {

 #if _MSC_VER
 #if defined(_M_IX86)
@ -43,115 +43,131 @@ namespace zeus
 #endif
 #endif

-struct CPUInfo
-{
-    const char cpuBrand[48] = {0};
-    const char cpuVendor[32] = {0};
+struct CPUInfo {
+  const char cpuBrand[48] = {0};
+  const char cpuVendor[32] = {0};
 #if ZEUS_ARCH_X86_64 || ZEUS_ARCH_X86
-    const bool isIntel = false;
-    const bool SSE1 = false;
-    const bool SSE2 = false;
-    const bool SSE3 = false;
-    const bool SSSE3 = false;
-    const bool SSE41 = false;
-    const bool SSE42 = false;
-    const bool SSE4a = false;
-    const bool AESNI = false;
-    const bool AVX = false;
-    const bool AVX2 = false;
+  const bool isIntel = false;
+  const bool SSE1 = false;
+  const bool SSE2 = false;
+  const bool SSE3 = false;
+  const bool SSSE3 = false;
+  const bool SSE41 = false;
+  const bool SSE42 = false;
+  const bool SSE4a = false;
+  const bool AESNI = false;
+  const bool AVX = false;
+  const bool AVX2 = false;
 #endif
 };
+
 /**
 * Detects CPU capabilities. The function itself returns nothing; the detected
 * features are presumably read back through cpuFeatures() / validateCPU().
 */
 void detectCPU();
+
 const CPUInfo& cpuFeatures();
+
 std::pair<bool, const CPUInfo&> validateCPU();
+
 void getCpuInfo(int eax, int regs[4]);
+
 void getCpuInfoEx(int eax, int ecx, int regs[4]);

 class CVector3f;
+
 class CVector2f;
+
 class CTransform;

-template <typename T>
-inline constexpr T min(const T& a, const T& b)
-{
-    return a < b ? a : b;
+// Returns a copy of a when a < b, otherwise a copy of b.
+template<typename T>
+inline constexpr T min(const T& a, const T& b) {
+  if (a < b)
+    return a;
+  return b;
 }
-template <typename T>
-inline constexpr T max(const T& a, const T& b)
-{
-    return a > b ? a : b;
-}
-template <> CVector3f min(const CVector3f& a, const CVector3f& b);
-template <> CVector3f max(const CVector3f& a, const CVector3f& b);

-template <typename T>
-inline constexpr T clamp(const T& a, const T& val, const T& b)
-{
-    return max<T>(a, min<T>(b, val));
+// Returns a copy of a when a > b, otherwise a copy of b.
+template<typename T>
+inline constexpr T max(const T& a, const T& b) {
+  if (a > b)
+    return a;
+  return b;
+}
+
+template<>
+CVector3f min(const CVector3f& a, const CVector3f& b);
+
+template<>
+CVector3f max(const CVector3f& a, const CVector3f& b);
+
+// Limits val to the range [a, b]: first cap it at b, then raise it to at least a.
+template<typename T>
+inline constexpr T clamp(const T& a, const T& val, const T& b) {
+  const T capped = min<T>(b, val);
+  return max<T>(a, capped);
 }

 // Converts an angle from radians to degrees (float overload).
 inline constexpr float radToDeg(float rad) {
   constexpr auto kDegreesPerRadian = 180.f / M_PIF;
   return rad * kDegreesPerRadian;
 }
+
 // Converts an angle from degrees to radians (float overload).
 inline constexpr float degToRad(float deg) {
   constexpr auto kRadiansPerDegree = M_PIF / 180.f;
   return deg * kRadiansPerDegree;
 }
+
 // Converts an angle from radians to degrees (double overload).
 inline constexpr double radToDeg(double rad) {
   constexpr auto kDegreesPerRadian = 180.0 / M_PI;
   return rad * kDegreesPerRadian;
 }
+
 // Converts an angle from degrees to radians (double overload).
 inline constexpr double degToRad(double deg) {
   constexpr auto kRadiansPerDegree = M_PI / 180.0;
   return deg * kRadiansPerDegree;
 }

 CVector3f baryToWorld(const CVector3f& p0, const CVector3f& p1, const CVector3f& p2, const CVector3f& bary);

 CVector3f getBezierPoint(const CVector3f& a, const CVector3f& b, const CVector3f& c, const CVector3f& d, float t);
+
 float getCatmullRomSplinePoint(float a, float b, float c, float d, float t);
-CVector3f getCatmullRomSplinePoint(const CVector3f& a, const CVector3f& b, const CVector3f& c, const CVector3f& d, float t);
+
+CVector3f
+getCatmullRomSplinePoint(const CVector3f& a, const CVector3f& b, const CVector3f& c, const CVector3f& d, float t);
+
 CVector3f getRoundCatmullRomSplinePoint(const CVector3f& a, const CVector3f& b, const CVector3f& c, const CVector3f& d,
                                        float t);

 // Since round(double) doesn't exist in some <cmath> implementations
 // we'll define our own
 // Rounds to the nearest integer, halfway cases away from zero.
 // Non-negative inputs must go through floor(val + 0.5): using ceil there pushed every
 // value with a fractional part below .5 (0.2, 1.2, even exact integers such as 2.0)
 // up to the next integer.
 inline double round(double val) { return (val < 0.0 ? std::ceil(val - 0.5) : std::floor(val + 0.5)); }
+
 // Raises a to the power b via exp(b * ln(a)).
 inline double powD(float a, float b) {
   const auto lnA = std::log(a);
   return std::exp(b * lnA);
 }

 // Reciprocal square root, double precision.
 inline double invSqrtD(double val) {
   const double root = std::sqrt(val);
   return 1.0 / root;
 }
+
 // Reciprocal square root for float input; the division is carried out in double
 // and the quotient is narrowed back to float.
 inline float invSqrtF(float val) {
   const auto root = std::sqrt(val);
   const double reciprocal = 1.0 / root;
   return float(reciprocal);
 }
+
 int floorPowerOfTwo(int x);
+
 int ceilingPowerOfTwo(int x);

-template <typename U>
-typename std::enable_if<!std::is_enum<U>::value && std::is_integral<U>::value, int>::type PopCount(U x)
-{
+// Counts the set bits of an integral (non-enum) value.
+// The value is first widened to unsigned long long and masked to the low
+// sizeof(U) * 8 bits: converting a negative narrow value directly would sign-extend
+// it and count the extension bits as well (e.g. int(-1) would report 64).
+// Assumes unsigned long long is 64 bits wide.
+template<typename U>
+typename std::enable_if<!std::is_enum<U>::value && std::is_integral<U>::value, int>::type PopCount(U x) {
+  constexpr unsigned kValueBits = sizeof(U) < sizeof(unsigned long long) ? unsigned(sizeof(U) * 8) : 64u;
+  unsigned long long bits = static_cast<unsigned long long>(x) & (~0ull >> (64u - kValueBits));
 #if __GNUC__ >= 4
-    return __builtin_popcountll(x);
+  return __builtin_popcountll(bits);
 #else
-    const U m1 = U(0x5555555555555555);  // binary: 0101...
-    const U m2 = U(0x3333333333333333);  // binary: 00110011..
-    const U m4 = U(0x0f0f0f0f0f0f0f0f);  // binary:  4 zeros,  4 ones ...
-    const U h01 = U(0x0101010101010101); // the sum of 256 to the power of 0,1,2,3...
+  // Always run the 64-bit variant on the masked value so that narrow types are not
+  // affected by integer promotion of the intermediate products.
+  const unsigned long long m1 = 0x5555555555555555ull;  // binary: 0101...
+  const unsigned long long m2 = 0x3333333333333333ull;  // binary: 00110011..
+  const unsigned long long m4 = 0x0f0f0f0f0f0f0f0full;  // binary:  4 zeros,  4 ones ...
+  const unsigned long long h01 = 0x0101010101010101ull; // the sum of 256 to the power of 0,1,2,3...

-    x -= (x >> 1) & m1;                        // put count of each 2 bits into those 2 bits
-    x = (x & m2) + ((x >> 2) & m2);            // put count of each 4 bits into those 4 bits
-    x = (x + (x >> 4)) & m4;                   // put count of each 8 bits into those 8 bits
-    return (x * h01) >> ((sizeof(U) - 1) * 8); // returns left 8 bits of x + (x<<8) + (x<<16) + (x<<24) + ...
+  bits -= (bits >> 1) & m1;                  // put count of each 2 bits into those 2 bits
+  bits = (bits & m2) + ((bits >> 2) & m2);   // put count of each 4 bits into those 4 bits
+  bits = (bits + (bits >> 4)) & m4;          // put count of each 8 bits into those 8 bits
+  return int((bits * h01) >> 56);            // top byte of bits + (bits<<8) + (bits<<16) + ...
 #endif
 }

-template <typename E>
-typename std::enable_if<std::is_enum<E>::value, int>::type PopCount(E e)
-{
-    return PopCount(static_cast<typename std::underlying_type<E>::type>(e));
+// Enum overload: counts the set bits of the enumerator's underlying integer value.
+template<typename E>
+typename std::enable_if<std::is_enum<E>::value, int>::type PopCount(E e) {
+  using Underlying = typename std::underlying_type<E>::type;
+  return PopCount(static_cast<Underlying>(e));
 }


-bool close_enough(const CVector3f &a, const CVector3f &b, float epsilon = 0.000099999997f);
+bool close_enough(const CVector3f& a, const CVector3f& b, float epsilon = 0.000099999997f);
+
 bool close_enough(const CVector2f& a, const CVector2f& b, float epsilon = 0.000099999997f);

-inline bool close_enough(float a, float b, double epsilon = 0.000009999999747378752)
-{
-    return std::fabs(a - b) < epsilon;
+// True when |a - b| is strictly smaller than epsilon (float overload).
+inline bool close_enough(float a, float b, double epsilon = 0.000009999999747378752) {
+  const auto distance = std::fabs(a - b);
+  return distance < epsilon;
 }

-inline bool close_enough(double a, double b, double epsilon = 0.000009999999747378752)
-{
-    return std::fabs(a - b) < epsilon;
+// True when |a - b| is strictly smaller than epsilon (double overload).
+inline bool close_enough(double a, double b, double epsilon = 0.000009999999747378752) {
+  const auto distance = std::fabs(a - b);
+  return distance < epsilon;
 }
 }

--- a/include/zeus/TVectorUnion.hpp
+++ b/include/zeus/TVectorUnion.hpp
@ -1,22 +0,0 @@
-#pragma once
-
-namespace zeus
-{
-typedef union {
-    float v[4];
-#if __SSE__
-    __m128 mVec128;
-#endif
-} TVectorUnion;
-
-typedef union {
-    double v[4];
-#if __AVX__
-    __m256d mVec256;
-#endif
-#if __SSE__
-    __m128d mVec128[2];
-#endif
-} TDblVectorUnion;
-}
-
--- a/include/zeus/simd/parallelism_v2_simd.hpp
+++ b/include/zeus/simd/parallelism_v2_simd.hpp
--- a/include/zeus/simd/simd.hpp
+++ b/include/zeus/simd/simd.hpp
@ -0,0 +1,26 @@
+#pragma once
+#define _ZEUS_SIMD_INCLUDED
+namespace zeus::_simd { using namespace std; }
+#include "parallelism_v2_simd.hpp"
+#if _M_IX86_FP >= 1 || _M_X64
+#define __SSE__ 1
+#endif
+#if __AVX__
+#include "simd_avx.hpp"
+#elif __SSE__
+#include "simd_sse.hpp"
+#else
+// Fallback ABI selection for targets with neither AVX nor SSE.
+// It has to live in zeus::_simd::simd_abi: the zeus::simd alias below looks up
+// _simd::simd_abi::zeus_native<T>, and the SSE/AVX headers declare their versions
+// in that same namespace. Declared at global scope it would never be found.
+// fixed_size is presumably provided by parallelism_v2_simd.hpp in this namespace.
+namespace zeus::_simd::simd_abi {
+template<typename T> struct zeus_native {};
+template<> struct zeus_native<float> { using type = fixed_size<4>; };
+template<> struct zeus_native<double> { using type = fixed_size<4>; };
+}
+#endif
+namespace zeus {
+// SIMD vector of T using the ABI that zeus_native selects for this build.
+template<typename T>
+using simd = _simd::simd<T, typename _simd::simd_abi::zeus_native<T>::type>;
+
+// Companion buffer type of simd<T>, with float and double shorthands.
+template<typename T>
+using simd_values = _simd::simd_data<simd<T>>;
+using simd_floats = simd_values<float>;
+using simd_doubles = simd_values<double>;
+} // namespace zeus
--- a/include/zeus/simd/simd_avx.hpp
+++ b/include/zeus/simd/simd_avx.hpp
@ -0,0 +1,188 @@
+#pragma once
+#ifndef _ZEUS_SIMD_INCLUDED
+#error simd_avx.hpp must not be included directly. Include simd.hpp instead.
+#endif
+#include "simd_sse.hpp"
+#include <immintrin.h>
+namespace zeus::_simd {
+// __m256d storage for AVX
+template<>
+class __simd_storage<double, m256d_abi> {
+public:
+  using storage_type = __m256d;
+  storage_type __storage_;
+
+  // Lane read: spill the register to an aligned scratch array and index it.
+  double __get(size_t __index) const noexcept {
+    alignas(32) std::array<double, 4> lanes;
+    _mm256_store_pd(lanes.data(), __storage_);
+    return lanes[__index];
+  }
+
+  // Lane write: spill, patch one element, reload the whole register.
+  void __set(size_t __index, double __val) noexcept {
+    alignas(32) std::array<double, 4> lanes;
+    _mm256_store_pd(lanes.data(), __storage_);
+    lanes[__index] = __val;
+    __storage_ = _mm256_load_pd(lanes.data());
+  }
+
+  // _mm256_set_pd lists lanes from highest to lowest, hence the reversed arguments.
+  void __set4(double a, double b, double c, double d) noexcept { __storage_ = _mm256_set_pd(d, c, b, a); }
+
+  // Copies __val into all four lanes.
+  void __broadcast(double __val) noexcept { __storage_ = _mm256_set1_pd(__val); }
+
+  // Dot product over lanes 0-1.
+  double __dot2(const __simd_storage& other) const noexcept {
+    alignas(32) std::array<double, 4> products;
+    _mm256_store_pd(products.data(), _mm256_mul_pd(__storage_, other.__storage_));
+    return products[0] + products[1];
+  }
+
+  // Dot product over lanes 0-2.
+  double __dot3(const __simd_storage& other) const noexcept {
+    alignas(32) std::array<double, 4> products;
+    _mm256_store_pd(products.data(), _mm256_mul_pd(__storage_, other.__storage_));
+    return products[0] + products[1] + products[2];
+  }
+
+  // Dot product over all four lanes.
+  double __dot4(const __simd_storage& other) const noexcept {
+    alignas(32) std::array<double, 4> products;
+    _mm256_store_pd(products.data(), _mm256_mul_pd(__storage_, other.__storage_));
+    return products[0] + products[1] + products[2] + products[3];
+  }
+
+  // Aligned load of all four lanes from __buffer.
+  void __copy_from(const simd_data<simd<double, m256d_abi>>& __buffer) noexcept {
+    __storage_ = _mm256_load_pd(__buffer.data());
+  }
+
+  // Aligned store of all four lanes into __buffer.
+  void __copy_to(simd_data<simd<double, m256d_abi>>& __buffer) const noexcept {
+    _mm256_store_pd(__buffer.data(), __storage_);
+  }
+
+  __simd_storage() = default;
+
+  // Widens four packed floats to four packed doubles.
+  explicit __simd_storage(const __simd_storage<float, m128_abi>& other) {
+    __storage_ = _mm256_cvtps_pd(other.__storage_);
+  }
+
+  explicit __simd_storage(const storage_type& s) : __storage_(s) {}
+
+  const storage_type& __native() const { return __storage_; }
+};
+// __m256d mask storage for AVX
+template<>
+class __simd_mask_storage<double, m256d_abi> : public __simd_storage<double, m256d_abi> {
+public:
+  // A lane reads as true when any of its 64 bits is set.
+  bool __get(size_t __index) const noexcept {
+    alignas(32) uint64_t laneBits[4];
+    _mm256_store_pd(reinterpret_cast<double*>(laneBits), __storage_);
+    return laneBits[__index] != 0;
+  }
+
+  // Writes all-ones (true) or all-zeros (false) into one lane; other lanes are kept.
+  void __set(size_t __index, bool __val) noexcept {
+    alignas(32) uint64_t laneBits[4];
+    _mm256_store_pd(reinterpret_cast<double*>(laneBits), __storage_);
+    if (__val)
+      laneBits[__index] = UINT64_MAX;
+    else
+      laneBits[__index] = 0;
+    __storage_ = _mm256_load_pd(reinterpret_cast<double*>(laneBits));
+  }
+};
+
+// Unary minus: XOR with -0.0 flips only the sign bit of every lane.
+template <>
+inline simd<double, m256d_abi> simd<double, m256d_abi>::operator-() const {
+  const __m256d signBits = _mm256_set1_pd(-0.0);
+  return _mm256_xor_pd(__s_.__storage_, signBits);
+}
+
+// Lane-wise sum of two 4 x double vectors.
+inline simd<double, m256d_abi>
+operator+(const simd<double, m256d_abi>& a, const simd<double, m256d_abi>& b) {
+  const __m256d lanes = _mm256_add_pd(a.__s_.__storage_, b.__s_.__storage_);
+  simd<double, m256d_abi> out;
+  out.__s_.__storage_ = lanes;
+  return out;
+}
+
+// Lane-wise difference (a - b) of two 4 x double vectors.
+inline simd<double, m256d_abi>
+operator-(const simd<double, m256d_abi>& a, const simd<double, m256d_abi>& b) {
+  const __m256d lanes = _mm256_sub_pd(a.__s_.__storage_, b.__s_.__storage_);
+  simd<double, m256d_abi> out;
+  out.__s_.__storage_ = lanes;
+  return out;
+}
+
+// Lane-wise product of two 4 x double vectors.
+inline simd<double, m256d_abi>
+operator*(const simd<double, m256d_abi>& a, const simd<double, m256d_abi>& b) {
+  const __m256d lanes = _mm256_mul_pd(a.__s_.__storage_, b.__s_.__storage_);
+  simd<double, m256d_abi> out;
+  out.__s_.__storage_ = lanes;
+  return out;
+}
+
+// Lane-wise quotient (a / b) of two 4 x double vectors.
+inline simd<double, m256d_abi>
+operator/(const simd<double, m256d_abi>& a, const simd<double, m256d_abi>& b) {
+  const __m256d lanes = _mm256_div_pd(a.__s_.__storage_, b.__s_.__storage_);
+  simd<double, m256d_abi> out;
+  out.__s_.__storage_ = lanes;
+  return out;
+}
+
+// In-place lane-wise addition; returns a.
+inline simd<double, m256d_abi>&
+operator+=(simd<double, m256d_abi>& a, const simd<double, m256d_abi>& b) {
+  __m256d& acc = a.__s_.__storage_;
+  acc = _mm256_add_pd(acc, b.__s_.__storage_);
+  return a;
+}
+
+// In-place lane-wise subtraction; returns a.
+inline simd<double, m256d_abi>&
+operator-=(simd<double, m256d_abi>& a, const simd<double, m256d_abi>& b) {
+  __m256d& acc = a.__s_.__storage_;
+  acc = _mm256_sub_pd(acc, b.__s_.__storage_);
+  return a;
+}
+
+// In-place lane-wise multiplication; returns a.
+inline simd<double, m256d_abi>&
+operator*=(simd<double, m256d_abi>& a, const simd<double, m256d_abi>& b) {
+  __m256d& acc = a.__s_.__storage_;
+  acc = _mm256_mul_pd(acc, b.__s_.__storage_);
+  return a;
+}
+
+// In-place lane-wise division; returns a.
+inline simd<double, m256d_abi>&
+operator/=(simd<double, m256d_abi>& a, const simd<double, m256d_abi>& b) {
+  __m256d& acc = a.__s_.__storage_;
+  acc = _mm256_div_pd(acc, b.__s_.__storage_);
+  return a;
+}
+
+// Lane-wise a == b using the _CMP_EQ_OQ predicate.
+inline simd<double, m256d_abi>::mask_type
+operator==(const simd<double, m256d_abi>& a, const simd<double, m256d_abi>& b) {
+  const __m256d bits = _mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_EQ_OQ);
+  simd<double, m256d_abi>::mask_type mask;
+  mask.__s_.__storage_ = bits;
+  return mask;
+}
+
+// Lane-wise a != b.
+// The unordered predicate (_CMP_NEQ_UQ) is required here: with the ordered
+// _CMP_NEQ_OQ a lane holding NaN compares as "not different", which contradicts
+// scalar operator!= and the SSE backend's _mm_cmpneq_pd.
+inline simd<double, m256d_abi>::mask_type
+operator!=(const simd<double, m256d_abi>& a, const simd<double, m256d_abi>& b) {
+  simd<double, m256d_abi>::mask_type ret;
+  ret.__s_.__storage_ = _mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_NEQ_UQ);
+  return ret;
+}
+
+// Lane-wise a >= b using the _CMP_GE_OQ predicate.
+inline simd<double, m256d_abi>::mask_type
+operator>=(const simd<double, m256d_abi>& a, const simd<double, m256d_abi>& b) {
+  const __m256d bits = _mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_GE_OQ);
+  simd<double, m256d_abi>::mask_type mask;
+  mask.__s_.__storage_ = bits;
+  return mask;
+}
+
+// Lane-wise a <= b using the _CMP_LE_OQ predicate.
+inline simd<double, m256d_abi>::mask_type
+operator<=(const simd<double, m256d_abi>& a, const simd<double, m256d_abi>& b) {
+  const __m256d bits = _mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_LE_OQ);
+  simd<double, m256d_abi>::mask_type mask;
+  mask.__s_.__storage_ = bits;
+  return mask;
+}
+
+// Lane-wise a > b using the _CMP_GT_OQ predicate.
+inline simd<double, m256d_abi>::mask_type
+operator>(const simd<double, m256d_abi>& a, const simd<double, m256d_abi>& b) {
+  const __m256d bits = _mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_GT_OQ);
+  simd<double, m256d_abi>::mask_type mask;
+  mask.__s_.__storage_ = bits;
+  return mask;
+}
+
+// Lane-wise a < b using the _CMP_LT_OQ predicate.
+inline simd<double, m256d_abi>::mask_type
+operator<(const simd<double, m256d_abi>& a, const simd<double, m256d_abi>& b) {
+  const __m256d bits = _mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_LT_OQ);
+  simd<double, m256d_abi>::mask_type mask;
+  mask.__s_.__storage_ = bits;
+  return mask;
+}
+
+// Narrowing conversion: four packed doubles (AVX register) to four packed floats.
+inline __simd_storage<float, m128_abi>::__simd_storage(const __simd_storage<double, m256d_abi>& other)
+: __storage_(_mm256_cvtpd_ps(other.__storage_)) {}
+
+namespace simd_abi {
+// With AVX available, double vectors use the 256-bit ABI.
+template<>
+struct zeus_native<double> {
+  using type = m256d_abi;
+};
+} // namespace simd_abi
+
+} // namespace zeus::_simd
--- a/include/zeus/simd/simd_sse.hpp
+++ b/include/zeus/simd/simd_sse.hpp
@ -0,0 +1,455 @@
+#pragma once
+#ifndef _ZEUS_SIMD_INCLUDED
+#error simd_sse.hpp must not be included directly. Include simd.hpp instead.
+#endif
+#include <xmmintrin.h>
+#if __SSE4_1__
+#include <smmintrin.h>
+#endif
+namespace zeus::_simd {
+// __m128 ABI
+using m128_abi = __simd_abi<_StorageKind(int(_StorageKind::_VecExt) + 1), 4>;
+// __m128d ABI
+using m128d_abi = __simd_abi<_StorageKind(int(_StorageKind::_VecExt) + 2), 4>;
+#ifdef __AVX__
+// __m256d ABI
+using m256d_abi = __simd_abi<_StorageKind(int(_StorageKind::_VecExt) + 3), 4>;
+#endif
+
+template <>
+class __simd_storage<double, m128d_abi>;
+#ifdef __AVX__
+template <>
+class __simd_storage<double, m256d_abi>;
+#endif
+
+// __m128 storage for SSE2+
+template <>
+class __simd_storage<float, m128_abi> {
+public:
+  using storage_type = __m128;
+  storage_type __storage_;
+
+  // Lane read: spill the register to an aligned scratch array and index it.
+  float __get(size_t __index) const noexcept {
+    alignas(16) std::array<float, 4> lanes;
+    _mm_store_ps(lanes.data(), __storage_);
+    return lanes[__index];
+  }
+
+  // Lane write: spill, patch one element, reload the whole register.
+  void __set(size_t __index, float __val) noexcept {
+    alignas(16) std::array<float, 4> lanes;
+    _mm_store_ps(lanes.data(), __storage_);
+    lanes[__index] = __val;
+    __storage_ = _mm_load_ps(lanes.data());
+  }
+
+  // _mm_set_ps lists lanes from highest to lowest, hence the reversed arguments.
+  void __set4(float a, float b, float c, float d) noexcept { __storage_ = _mm_set_ps(d, c, b, a); }
+
+  // Copies __val into all four lanes.
+  void __broadcast(float __val) noexcept { __storage_ = _mm_set1_ps(__val); }
+
+  // Dot product over lanes 0-1.
+  float __dot2(const __simd_storage& other) const noexcept {
+#if __SSE4_1__
+    // 0x3F: multiply lanes 0-1 (high nibble), broadcast the sum to every lane (low nibble).
+    float sum;
+    _mm_store_ss(&sum, _mm_dp_ps(__storage_, other.__storage_, 0x3F));
+    return sum;
+#else
+    alignas(16) std::array<float, 4> products;
+    _mm_store_ps(products.data(), _mm_mul_ps(__storage_, other.__storage_));
+    return products[0] + products[1];
+#endif
+  }
+
+  // Dot product over lanes 0-2.
+  float __dot3(const __simd_storage& other) const noexcept {
+#if __SSE4_1__
+    // 0x7F: multiply lanes 0-2, broadcast the sum to every lane.
+    float sum;
+    _mm_store_ss(&sum, _mm_dp_ps(__storage_, other.__storage_, 0x7F));
+    return sum;
+#else
+    alignas(16) std::array<float, 4> products;
+    _mm_store_ps(products.data(), _mm_mul_ps(__storage_, other.__storage_));
+    return products[0] + products[1] + products[2];
+#endif
+  }
+
+  // Dot product over all four lanes.
+  float __dot4(const __simd_storage& other) const noexcept {
+#if __SSE4_1__
+    // 0xFF: multiply all four lanes, broadcast the sum to every lane.
+    float sum;
+    _mm_store_ss(&sum, _mm_dp_ps(__storage_, other.__storage_, 0xFF));
+    return sum;
+#else
+    alignas(16) std::array<float, 4> products;
+    _mm_store_ps(products.data(), _mm_mul_ps(__storage_, other.__storage_));
+    return products[0] + products[1] + products[2] + products[3];
+#endif
+  }
+
+  // Returns a copy whose lanes 0..3 are taken from source lanes x, y, z, w.
+  template<int x, int y, int z, int w>
+  __simd_storage __shuffle() const noexcept {
+    return __simd_storage(_mm_shuffle_ps(__storage_, __storage_, _MM_SHUFFLE(w, z, y, x)));
+  }
+
+  // Aligned load of all four lanes from __buffer.
+  void __copy_from(const simd_data<simd<float, m128_abi>>& __buffer) noexcept {
+    __storage_ = _mm_load_ps(__buffer.data());
+  }
+
+  // Aligned store of all four lanes into __buffer.
+  void __copy_to(simd_data<simd<float, m128_abi>>& __buffer) const noexcept {
+    _mm_store_ps(__buffer.data(), __storage_);
+  }
+
+  __simd_storage() = default;
+
+  // Narrowing conversions from the double storages; defined out of line once
+  // those types are complete.
+  explicit __simd_storage(const __simd_storage<double, m128d_abi>& other);
+#ifdef __AVX__
+  explicit __simd_storage(const __simd_storage<double, m256d_abi>& other);
+#endif
+
+  explicit __simd_storage(const storage_type& s) : __storage_(s) {}
+
+  const storage_type& __native() const { return __storage_; }
+};
+// __m128 mask storage for SSE2+
+template <>
+class __simd_mask_storage<float, m128_abi> : public __simd_storage<float, m128_abi>
+{
+public:
+  // A lane reads as true when any of its 32 bits is set.
+  bool __get(size_t __index) const noexcept {
+    alignas(16) uint32_t laneBits[4];
+    _mm_store_ps(reinterpret_cast<float*>(laneBits), __storage_);
+    return laneBits[__index] != 0;
+  }
+
+  // Writes all-ones (true) or all-zeros (false) into one lane; other lanes are kept.
+  void __set(size_t __index, bool __val) noexcept {
+    alignas(16) uint32_t laneBits[4];
+    _mm_store_ps(reinterpret_cast<float*>(laneBits), __storage_);
+    if (__val)
+      laneBits[__index] = UINT32_MAX;
+    else
+      laneBits[__index] = 0;
+    __storage_ = _mm_load_ps(reinterpret_cast<float*>(laneBits));
+  }
+};
+
+// Unary minus: XOR with -0.f flips only the sign bit of every lane.
+template <>
+inline simd<float, m128_abi> simd<float, m128_abi>::operator-() const {
+  const __m128 signBits = _mm_set1_ps(-0.f);
+  return _mm_xor_ps(__s_.__storage_, signBits);
+}
+
+// Lane-wise sum of two 4 x float vectors.
+inline simd<float, m128_abi>
+operator+(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  const __m128 lanes = _mm_add_ps(a.__s_.__storage_, b.__s_.__storage_);
+  simd<float, m128_abi> out;
+  out.__s_.__storage_ = lanes;
+  return out;
+}
+
+// Lane-wise difference (a - b) of two 4 x float vectors.
+inline simd<float, m128_abi>
+operator-(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  const __m128 lanes = _mm_sub_ps(a.__s_.__storage_, b.__s_.__storage_);
+  simd<float, m128_abi> out;
+  out.__s_.__storage_ = lanes;
+  return out;
+}
+
+// Lane-wise product of two 4 x float vectors.
+inline simd<float, m128_abi>
+operator*(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  const __m128 lanes = _mm_mul_ps(a.__s_.__storage_, b.__s_.__storage_);
+  simd<float, m128_abi> out;
+  out.__s_.__storage_ = lanes;
+  return out;
+}
+
+// Lane-wise quotient (a / b) of two 4 x float vectors.
+inline simd<float, m128_abi>
+operator/(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  const __m128 lanes = _mm_div_ps(a.__s_.__storage_, b.__s_.__storage_);
+  simd<float, m128_abi> out;
+  out.__s_.__storage_ = lanes;
+  return out;
+}
+
+// In-place lane-wise addition; returns a.
+inline simd<float, m128_abi>&
+operator+=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  __m128& acc = a.__s_.__storage_;
+  acc = _mm_add_ps(acc, b.__s_.__storage_);
+  return a;
+}
+
+// In-place lane-wise subtraction; returns a.
+inline simd<float, m128_abi>&
+operator-=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  __m128& acc = a.__s_.__storage_;
+  acc = _mm_sub_ps(acc, b.__s_.__storage_);
+  return a;
+}
+
+// In-place lane-wise multiplication; returns a.
+inline simd<float, m128_abi>&
+operator*=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  __m128& acc = a.__s_.__storage_;
+  acc = _mm_mul_ps(acc, b.__s_.__storage_);
+  return a;
+}
+
+// In-place lane-wise division; returns a.
+inline simd<float, m128_abi>&
+operator/=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  __m128& acc = a.__s_.__storage_;
+  acc = _mm_div_ps(acc, b.__s_.__storage_);
+  return a;
+}
+
+// Lane-wise a == b; the result is stored as a mask vector.
+inline simd<float, m128_abi>::mask_type
+operator==(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  const __m128 bits = _mm_cmpeq_ps(a.__s_.__storage_, b.__s_.__storage_);
+  simd<float, m128_abi>::mask_type mask;
+  mask.__s_.__storage_ = bits;
+  return mask;
+}
+
+// Lane-wise a != b; the result is stored as a mask vector.
+inline simd<float, m128_abi>::mask_type
+operator!=(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  const __m128 bits = _mm_cmpneq_ps(a.__s_.__storage_, b.__s_.__storage_);
+  simd<float, m128_abi>::mask_type mask;
+  mask.__s_.__storage_ = bits;
+  return mask;
+}
+
+// Lane-wise a >= b; the result is stored as a mask vector.
+inline simd<float, m128_abi>::mask_type
+operator>=(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  const __m128 bits = _mm_cmpge_ps(a.__s_.__storage_, b.__s_.__storage_);
+  simd<float, m128_abi>::mask_type mask;
+  mask.__s_.__storage_ = bits;
+  return mask;
+}
+
+// Lane-wise a <= b; the result is stored as a mask vector.
+inline simd<float, m128_abi>::mask_type
+operator<=(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  const __m128 bits = _mm_cmple_ps(a.__s_.__storage_, b.__s_.__storage_);
+  simd<float, m128_abi>::mask_type mask;
+  mask.__s_.__storage_ = bits;
+  return mask;
+}
+
+// Lane-wise a > b; the result is stored as a mask vector.
+inline simd<float, m128_abi>::mask_type
+operator>(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  const __m128 bits = _mm_cmpgt_ps(a.__s_.__storage_, b.__s_.__storage_);
+  simd<float, m128_abi>::mask_type mask;
+  mask.__s_.__storage_ = bits;
+  return mask;
+}
+
+// Lane-wise a < b; the result is stored as a mask vector.
+inline simd<float, m128_abi>::mask_type
+operator<(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  const __m128 bits = _mm_cmplt_ps(a.__s_.__storage_, b.__s_.__storage_);
+  simd<float, m128_abi>::mask_type mask;
+  mask.__s_.__storage_ = bits;
+  return mask;
+}
+
+// __m128d storage for SSE2+
+template <>
+class __simd_storage<double, m128d_abi> {
+public:
+  // Four doubles held as two 2-lane registers: [0] has lanes 0-1, [1] has lanes 2-3.
+  using storage_type = std::array<__m128d, 2>;
+  storage_type __storage_;
+
+  // Lane read: spill the half that owns the lane and index inside it.
+  double __get(size_t __index) const noexcept {
+    const size_t half = __index / 2;
+    const size_t slot = __index % 2;
+    alignas(16) std::array<double, 2> pair;
+    _mm_store_pd(pair.data(), __storage_[half]);
+    return pair[slot];
+  }
+
+  // Lane write: spill the owning half, patch one element, reload that half.
+  void __set(size_t __index, double __val) noexcept {
+    const size_t half = __index / 2;
+    const size_t slot = __index % 2;
+    alignas(16) std::array<double, 2> pair;
+    _mm_store_pd(pair.data(), __storage_[half]);
+    pair[slot] = __val;
+    __storage_[half] = _mm_load_pd(pair.data());
+  }
+
+  // _mm_set_pd lists the high lane first, hence the swapped arguments.
+  void __set4(double a, double b, double c, double d) noexcept {
+    __storage_[0] = _mm_set_pd(b, a);
+    __storage_[1] = _mm_set_pd(d, c);
+  }
+
+  // Copies __val into all four lanes.
+  void __broadcast(double __val) noexcept {
+    __storage_[0] = _mm_set1_pd(__val);
+    __storage_[1] = _mm_set1_pd(__val);
+  }
+
+  // Dot product over lanes 0-1 (first register only).
+  double __dot2(const __simd_storage& other) const noexcept {
+#if __SSE4_1__
+    double lowDot;
+    _mm_store_sd(&lowDot, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F));
+    return lowDot;
+#else
+    alignas(16) std::array<double, 2> low;
+    _mm_store_pd(low.data(), _mm_mul_pd(__storage_[0], other.__storage_[0]));
+    return low[0] + low[1];
+#endif
+  }
+
+  // Dot product over lanes 0-2: the full first register plus lane 0 of the second.
+  double __dot3(const __simd_storage& other) const noexcept {
+#if __SSE4_1__
+    double lowDot;
+    _mm_store_sd(&lowDot, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F));
+    alignas(16) std::array<double, 2> high;
+    _mm_store_pd(high.data(), _mm_mul_pd(__storage_[1], other.__storage_[1]));
+    return lowDot + high[0];
+#else
+    alignas(16) std::array<double, 2> low;
+    _mm_store_pd(low.data(), _mm_mul_pd(__storage_[0], other.__storage_[0]));
+    alignas(16) std::array<double, 2> high;
+    _mm_store_pd(high.data(), _mm_mul_pd(__storage_[1], other.__storage_[1]));
+    return low[0] + low[1] + high[0];
+#endif
+  }
+
+  // Dot product over all four lanes.
+  double __dot4(const __simd_storage& other) const noexcept {
+#if __SSE4_1__
+    double lowDot;
+    _mm_store_sd(&lowDot, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F));
+    double highDot;
+    _mm_store_sd(&highDot, _mm_dp_pd(__storage_[1], other.__storage_[1], 0x3F));
+    return lowDot + highDot;
+#else
+    alignas(16) std::array<double, 2> low;
+    _mm_store_pd(low.data(), _mm_mul_pd(__storage_[0], other.__storage_[0]));
+    alignas(16) std::array<double, 2> high;
+    _mm_store_pd(high.data(), _mm_mul_pd(__storage_[1], other.__storage_[1]));
+    return low[0] + low[1] + high[0] + high[1];
+#endif
+  }
+
+  // Aligned load: elements 0-1 of __buffer fill register 0, elements 2-3 register 1.
+  void __copy_from(const simd_data<simd<double, m128d_abi>>& __buffer) noexcept {
+    const auto* src = __buffer.data();
+    __storage_[0] = _mm_load_pd(src);
+    __storage_[1] = _mm_load_pd(src + 2);
+  }
+
+  // Aligned store of both registers into __buffer.
+  void __copy_to(simd_data<simd<double, m128d_abi>>& __buffer) const noexcept {
+    auto* dst = __buffer.data();
+    _mm_store_pd(dst, __storage_[0]);
+    _mm_store_pd(dst + 2, __storage_[1]);
+  }
+
+  __simd_storage() = default;
+
+  // Widens four packed floats: the low two directly, the high two after moving
+  // them down into the low half of a temporary.
+  explicit __simd_storage(const __simd_storage<float, m128_abi>& other) {
+    const __m128 upperFloats = _mm_movehl_ps(other.__storage_, other.__storage_);
+    __storage_[0] = _mm_cvtps_pd(other.__storage_);
+    __storage_[1] = _mm_cvtps_pd(upperFloats);
+  }
+
+  explicit __simd_storage(const storage_type& s) : __storage_(s) {}
+
+  const storage_type& __native() const { return __storage_; }
+};
+// __m128d mask storage for SSE2+
+// Mask storage for four doubles kept as two 2-lane registers. A lane is true when
+// any of its 64 bits is set; __set writes all-ones or all-zeros.
+template <>
+class __simd_mask_storage<double, m128d_abi> : public __simd_storage<double, m128d_abi>
+{
+public:
+  bool __get(size_t __index) const noexcept {
+    alignas(16) uint64_t sse_data[2];
+    _mm_store_pd(reinterpret_cast<double*>(sse_data), __storage_[__index / 2]);
+    // Only the register owning the lane was spilled, so the scratch array has two
+    // entries; index it with the position inside that register. Using __index
+    // directly read past the array for lanes 2 and 3.
+    return sse_data[__index % 2] != 0;
+  }
+  void __set(size_t __index, bool __val) noexcept {
+    alignas(16) uint64_t sse_data[2];
+    _mm_store_pd(reinterpret_cast<double*>(sse_data), __storage_[__index / 2]);
+    sse_data[__index % 2] = __val ? UINT64_MAX : 0;
+    __storage_[__index / 2] = _mm_load_pd(reinterpret_cast<double*>(sse_data));
+  }
+};
+
+// Unary minus: XOR with -0.0 flips only the sign bit of every lane in both registers.
+template <>
+inline simd<double, m128d_abi> simd<double, m128d_abi>::operator-() const {
+  simd<double, m128d_abi> out;
+  out.__s_.__storage_[0] = _mm_xor_pd(__s_.__storage_[0], _mm_set1_pd(-0.0));
+  out.__s_.__storage_[1] = _mm_xor_pd(__s_.__storage_[1], _mm_set1_pd(-0.0));
+  return out;
+}
+
+// Lane-wise sum, applied to both 2-lane registers.
+inline simd<double, m128d_abi>
+operator+(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  const auto& lhs = a.__s_.__storage_;
+  const auto& rhs = b.__s_.__storage_;
+  simd<double, m128d_abi> out;
+  out.__s_.__storage_[0] = _mm_add_pd(lhs[0], rhs[0]);
+  out.__s_.__storage_[1] = _mm_add_pd(lhs[1], rhs[1]);
+  return out;
+}
+
+// Lane-wise difference (a - b), applied to both 2-lane registers.
+inline simd<double, m128d_abi>
+operator-(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  const auto& lhs = a.__s_.__storage_;
+  const auto& rhs = b.__s_.__storage_;
+  simd<double, m128d_abi> out;
+  out.__s_.__storage_[0] = _mm_sub_pd(lhs[0], rhs[0]);
+  out.__s_.__storage_[1] = _mm_sub_pd(lhs[1], rhs[1]);
+  return out;
+}
+
+// Lane-wise product, applied to both 2-lane registers.
+inline simd<double, m128d_abi>
+operator*(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  const auto& lhs = a.__s_.__storage_;
+  const auto& rhs = b.__s_.__storage_;
+  simd<double, m128d_abi> out;
+  out.__s_.__storage_[0] = _mm_mul_pd(lhs[0], rhs[0]);
+  out.__s_.__storage_[1] = _mm_mul_pd(lhs[1], rhs[1]);
+  return out;
+}
+
+// Lane-wise quotient (a / b), applied to both 2-lane registers.
+inline simd<double, m128d_abi>
+operator/(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  const auto& lhs = a.__s_.__storage_;
+  const auto& rhs = b.__s_.__storage_;
+  simd<double, m128d_abi> out;
+  out.__s_.__storage_[0] = _mm_div_pd(lhs[0], rhs[0]);
+  out.__s_.__storage_[1] = _mm_div_pd(lhs[1], rhs[1]);
+  return out;
+}
+
+// In-place lane-wise addition on both registers; returns a.
+inline simd<double, m128d_abi>&
+operator+=(simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  a.__s_.__storage_[0] = _mm_add_pd(a.__s_.__storage_[0], b.__s_.__storage_[0]);
+  a.__s_.__storage_[1] = _mm_add_pd(a.__s_.__storage_[1], b.__s_.__storage_[1]);
+  return a;
+}
+
+// In-place lane-wise subtraction on both registers; returns a.
+inline simd<double, m128d_abi>&
+operator-=(simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  a.__s_.__storage_[0] = _mm_sub_pd(a.__s_.__storage_[0], b.__s_.__storage_[0]);
+  a.__s_.__storage_[1] = _mm_sub_pd(a.__s_.__storage_[1], b.__s_.__storage_[1]);
+  return a;
+}
+
+// In-place lane-wise multiplication on both registers; returns a.
+inline simd<double, m128d_abi>&
+operator*=(simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  a.__s_.__storage_[0] = _mm_mul_pd(a.__s_.__storage_[0], b.__s_.__storage_[0]);
+  a.__s_.__storage_[1] = _mm_mul_pd(a.__s_.__storage_[1], b.__s_.__storage_[1]);
+  return a;
+}
+
+// In-place lane-wise division on both registers; returns a.
+inline simd<double, m128d_abi>&
+operator/=(simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  a.__s_.__storage_[0] = _mm_div_pd(a.__s_.__storage_[0], b.__s_.__storage_[0]);
+  a.__s_.__storage_[1] = _mm_div_pd(a.__s_.__storage_[1], b.__s_.__storage_[1]);
+  return a;
+}
+
+// Lane-wise a == b on both registers; the result is stored as a mask vector.
+inline simd<double, m128d_abi>::mask_type
+operator==(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  const auto& lhs = a.__s_.__storage_;
+  const auto& rhs = b.__s_.__storage_;
+  simd<double, m128d_abi>::mask_type mask;
+  mask.__s_.__storage_[0] = _mm_cmpeq_pd(lhs[0], rhs[0]);
+  mask.__s_.__storage_[1] = _mm_cmpeq_pd(lhs[1], rhs[1]);
+  return mask;
+}
+
+// Lane-wise a != b on both registers; the result is stored as a mask vector.
+inline simd<double, m128d_abi>::mask_type
+operator!=(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  const auto& lhs = a.__s_.__storage_;
+  const auto& rhs = b.__s_.__storage_;
+  simd<double, m128d_abi>::mask_type mask;
+  mask.__s_.__storage_[0] = _mm_cmpneq_pd(lhs[0], rhs[0]);
+  mask.__s_.__storage_[1] = _mm_cmpneq_pd(lhs[1], rhs[1]);
+  return mask;
+}
+
+// Lane-wise a >= b on both registers; the result is stored as a mask vector.
+inline simd<double, m128d_abi>::mask_type
+operator>=(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  const auto& lhs = a.__s_.__storage_;
+  const auto& rhs = b.__s_.__storage_;
+  simd<double, m128d_abi>::mask_type mask;
+  mask.__s_.__storage_[0] = _mm_cmpge_pd(lhs[0], rhs[0]);
+  mask.__s_.__storage_[1] = _mm_cmpge_pd(lhs[1], rhs[1]);
+  return mask;
+}
+
+// Lane-wise a <= b on both registers; the result is stored as a mask vector.
+inline simd<double, m128d_abi>::mask_type
+operator<=(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  const auto& lhs = a.__s_.__storage_;
+  const auto& rhs = b.__s_.__storage_;
+  simd<double, m128d_abi>::mask_type mask;
+  mask.__s_.__storage_[0] = _mm_cmple_pd(lhs[0], rhs[0]);
+  mask.__s_.__storage_[1] = _mm_cmple_pd(lhs[1], rhs[1]);
+  return mask;
+}
+
+// Lane-wise a > b on both registers; the result is stored as a mask vector.
+inline simd<double, m128d_abi>::mask_type
+operator>(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  const auto& lhs = a.__s_.__storage_;
+  const auto& rhs = b.__s_.__storage_;
+  simd<double, m128d_abi>::mask_type mask;
+  mask.__s_.__storage_[0] = _mm_cmpgt_pd(lhs[0], rhs[0]);
+  mask.__s_.__storage_[1] = _mm_cmpgt_pd(lhs[1], rhs[1]);
+  return mask;
+}
+
+// Lane-wise a < b on both registers; the result is stored as a mask vector.
+inline simd<double, m128d_abi>::mask_type
+operator<(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  const auto& lhs = a.__s_.__storage_;
+  const auto& rhs = b.__s_.__storage_;
+  simd<double, m128d_abi>::mask_type mask;
+  mask.__s_.__storage_[0] = _mm_cmplt_pd(lhs[0], rhs[0]);
+  mask.__s_.__storage_[1] = _mm_cmplt_pd(lhs[1], rhs[1]);
+  return mask;
+}
+
+// Narrowing conversion: each 2 x double register becomes two floats in the low half
+// of a temporary, and the two low halves are then packed into one 4 x float register.
+inline __simd_storage<float, m128_abi>::__simd_storage(const __simd_storage<double, m128d_abi>& other) {
+  const __m128 lowPair = _mm_cvtpd_ps(other.__storage_[0]);
+  const __m128 highPair = _mm_cvtpd_ps(other.__storage_[1]);
+  __storage_ = _mm_movelh_ps(lowPair, highPair);
+}
+
+namespace simd_abi {
+// Maps an element type to the ABI tag used for it on this target; the primary
+// template is deliberately empty.
+template<typename T>
+struct zeus_native {};
+
+template<>
+struct zeus_native<float> {
+  using type = m128_abi;
+};
+
+// With AVX the double mapping is supplied by simd_avx.hpp instead.
+#if !defined(__AVX__)
+template<>
+struct zeus_native<double> {
+  using type = m128d_abi;
+};
+#endif
+} // namespace simd_abi
+
+} // namespace zeus::_simd
--- a/src/CAABox.cpp
+++ b/src/CAABox.cpp
@ -3,7 +3,6 @@

 namespace zeus
 {
-
 const CAABox CAABox::skInvertedBox = CAABox();
 const CAABox CAABox::skNullBox = CAABox(CVector3f::skZero, CVector3f::skZero);
 }
--- a/src/CColor.cpp
+++ b/src/CColor.cpp
@ -1,8 +1,7 @@
 #include "zeus/CColor.hpp"
 #include "zeus/CVector4f.hpp"

-namespace zeus
-{
+namespace zeus {
 const CColor CColor::skRed(Comp32(0xFF0000FFul));
 const CColor CColor::skBlack(Comp32(0x000000FFul));
 const CColor CColor::skBlue(Comp32(0x0000FFFFul));
@ -14,133 +13,106 @@ const CColor CColor::skYellow(Comp32(0xFFFF00FFul));
 const CColor CColor::skWhite(Comp32(0xFFFFFFFFul));
 const CColor CColor::skClear(Comp32(0x00000000ul));

-float hueToRgb(float p, float q, float t)
-{
-    if (t < 0.0f)
-        t += 1.0f;
-    if (t > 1.0f)
-        t -= 1.0f;
-    if (t < 1.f / 6.f)
-        return p + (q - p) * 6.f * t;
-    if (t < 1.f / 2.f)
-        return q;
-    if (t < 2.f / 3.f)
-        return p + (q - p) * (2.f / 3.f - t) * 6.f;
-    return p;
+float hueToRgb(float p, float q, float t) { // piecewise-linear channel value between p and q for hue offset t
+  if (t < 0.0f)
+    t += 1.0f; // wrap a negative offset up by one
+  if (t > 1.0f)
+    t -= 1.0f; // wrap an offset above one down by one
+  if (t < 1.f / 6.f)
+    return p + (q - p) * 6.f * t; // rising ramp from p towards q
+  if (t < 1.f / 2.f)
+    return q; // plateau at q
+  if (t < 2.f / 3.f)
+    return p + (q - p) * (2.f / 3.f - t) * 6.f; // falling ramp from q back towards p
+  return p; // remaining range stays at p
 }

-CColor::CColor(const CVector4f& other)
-{
-    r = other.x;
-    g = other.y;
-    b = other.z;
-    a = other.w;
+void CColor::fromHSV(float h, float s, float v, float _a) { // fills the color's four lanes from hue/saturation/value plus alpha _a
+  int i = int(h * 6.f); // sector index: hue scaled by 6 and truncated
+  float f = h * 6.f - i; // fractional position inside the sector
+  float p = v * (1.f - s);
+  float q = v * (1.f - f * s);
+  float t = v * (1.f - (1.f - f) * s);
+  simd_floats fo; // staging buffer: lanes 0..2 are set by the switch, lane 3 below
+
+  switch (i % 6) { // each sector assigns v/p/q/t to lanes 0, 1, 2 in a different order
+  case 0:
+    fo[0] = v, fo[1] = t, fo[2] = p;
+    break;
+  case 1:
+    fo[0] = q, fo[1] = v, fo[2] = p;
+    break;
+  case 2:
+    fo[0] = p, fo[1] = v, fo[2] = t;
+    break;
+  case 3:
+    fo[0] = p, fo[1] = q, fo[2] = v;
+    break;
+  case 4:
+    fo[0] = t, fo[1] = p, fo[2] = v;
+    break;
+  case 5:
+    fo[0] = v, fo[1] = p, fo[2] = q;
+    break;
+  default:
+    break; // NOTE(review): i % 6 is negative when h < 0, so lanes 0..2 are not assigned here - confirm callers pass h >= 0
+  }
+
+  fo[3] = _a; // fourth lane carries alpha
+  mSimd.copy_from(fo); // presumably loads all four staged floats into the SIMD member - confirm against simd.hpp
 }

-CColor& CColor::operator=(const CVector4f& other)
-{
-    r = other.x;
-    g = other.y;
-    b = other.z;
-    a = other.w;
+void CColor::toHSV(float& h, float& s, float& v) const { // writes hue, saturation and value derived from r(), g(), b()
+  float min = std::min(r(), std::min(g(), b()));
+  float max = std::max(r(), std::max(g(), b()));
+  v = max; // value is the largest channel

-    return *this;
+  float delta = max - min;
+  s = max == 0.f ? 0.f : delta / max; // guard the division when all channels are zero
+
+  if (max == min)
+    h = 0.f; // all channels equal: hue is reported as 0
+  else {
+    if (max == r())
+      h = (g() - b()) / delta + (g() < b() ? 6.f : 0.f); // the +6 keeps the result non-negative
+    else if (max == g())
+      h = (b() - r()) / delta + 2.f;
+    else if (max == b())
+      h = (r() - g()) / delta + 4.f;
+    h /= 6.f; // rescale from sixths to a unit range
+  }
 }

-void CColor::fromHSV(float h, float s, float v, float _a)
-{
-    int i = int(h * 6);
-    float f = h * 6 - i;
-    float p = v * (1 - s);
-    float q = v * (1 - f * s);
-    float t = v * (1 - (1 - f) * s);
-    float _r, _g, _b;
-
-    switch (i % 6)
-    {
-    case 0:
-        _r = v, _g = t, _b = p;
-        break;
-    case 1:
-        _r = q, _g = v, _b = p;
-        break;
-    case 2:
-        _r = p, _g = v, _b = t;
-        break;
-    case 3:
-        _r = p, _g = q, _b = v;
-        break;
-    case 4:
-        _r = t, _g = p, _b = v;
-        break;
-    case 5:
-        _r = v, _g = p, _b = q;
-        break;
-    }
-
-    r = _r;
-    g = _g;
-    b = _b;
-    a = _a;
+// Sets this color from hue h, saturation s and lightness l, with alpha _a.
+// The inputs are presumably normalized to [0, 1] - confirm against callers.
+// With zero saturation every lane is set to l; otherwise each of r/g/b is
+// evaluated by hueToRgb at hue offsets of +1/3, 0 and -1/3.
+void CColor::fromHSL(float h, float s, float l, float _a) {
+  if (s == 0.0f) {
+    // Achromatic: broadcast lightness to all lanes; alpha is overwritten below.
+    mSimd = simd<float>(l);
+  } else {
+    // Upper intermediate value. For l >= 0.5 the HSL formula is l + s - l * s;
+    // multiplying s by the constant 1 instead would cancel s and leave q == l.
+    const float q = l < 0.5f ? l * (1.f + s) : l + s - l * s;
+    const float p = 2.f * l - q;
+    r() = hueToRgb(p, q, h + 1.f / 3.f);
+    g() = hueToRgb(p, q, h);
+    b() = hueToRgb(p, q, h - 1.f / 3.f);
+  }
+  a() = _a;
 }

-void CColor::toHSV(float& h, float& s, float& v) const
-{
-    float min = std::min(r, std::min(g, b));
-    float max = std::max(r, std::max(g, b));
-    v = max;
+// Writes hue, saturation and lightness derived from r(), g(), b() into h, s, l.
+// Hue is divided by 6 so a full turn maps to a unit range; when all three
+// channels are equal, h and s are both reported as 0.
+void CColor::toHSL(float& h, float& s, float& l) const {
+  const float min = std::min(r(), std::min(g(), b()));
+  const float max = std::max(r(), std::max(g(), b()));
+  const float d = max - min;
+  // l is an out-parameter: assign the midpoint of the extreme channels here,
+  // before the saturation formula below reads it.
+  l = (max + min) * 0.5f;

-    float delta = max - min;
-    s = max == 0 ? 0 : delta / max;
+  if (max == min)
+    h = s = 0.f;
+  else {
+    s = l > 0.5f ? d / (2.f - max - min) : d / (max + min);
+    if (max == r())
+      h = (g() - b()) / d + (g() < b() ? 6.f : 0.f);
+    else if (max == g())
+      h = (b() - r()) / d + 2.f;
+    else if (max == b())
+      h = (r() - g()) / d + 4.f;

-    if (max == min)
-        h = 0;
-    else
-    {
-        if (max == r)
-            h = (g - b) / delta + (g < b ? 6 : 0);
-        else if (max == g)
-            h = (b - r) / delta + 2;
-        else if (max == b)
-            h = (r - g) / delta + 4;
-        h /= 6;
-    }
-}
-
-void CColor::fromHSL(float h, float s, float l, float _a)
-{
-    if (s == 0.0f)
-        r = g = b = l;
-    else
-    {
-        const float q = l < 0.5f ? l * (1.f + s) : l + s - 1.f * s;
-        const float p = 2 * l - q;
-        r = hueToRgb(p, q, h + 1.f / 3);
-        g = hueToRgb(p, q, h);
-        b = hueToRgb(p, q, h - 1.f / 3);
-    }
-    a = _a;
-}
-
-void CColor::toHSL(float& h, float& s, float& l)
-{
-    const float min = std::min(r, std::min(g, b));
-    const float max = std::max(r, std::max(g, b));
-    const float d = max - min;
-
-    if (max == min)
-        h = s = 0;
-    else
-    {
-        s = l > 0.5f ? d / (2.f - max - min) : d / (max + min);
-        if (max == r)
-            h = (g - b) / d + (g < b ? 6.f : 0.f);
-        else if (max == g)
-            h = (b - r) / d + 2.f;
-        else if (max == b)
-            h = (r - g) / d + 4.f;
-
-        h /= 6;
-    }
+    h /= 6.f;
+  }
 }
 }
--- a/src/CEulerAngles.cpp
+++ b/src/CEulerAngles.cpp
@ -10,29 +10,29 @@ CEulerAngles::CEulerAngles(const CQuaternion& quat)
    float t0 = 0.f;
    if (quatDot > 0.f)
        t0 = 2.f / quatDot;
-    double t1 = 1.0 - (t0 * quat.x * quat.x + t0 * quat.z * quat.z);
-    double t2 = t0 * quat.y * quat.x - t0 * quat.z * quat.w;
+    double t1 = 1.0 - (t0 * quat.x() * quat.x() + t0 * quat.z() * quat.z());
+    double t2 = t0 * quat.y() * quat.x() - t0 * quat.z() * quat.w();
    double t3 = t1 * t1 + t2 * t2;

    double t4 = 0.0;
    if (t3 > 0.0)
        t4 = std::sqrt(t3);

-    double t5 = t0 * quat.z * quat.y + t0 * quat.x * quat.w;
+    double t5 = t0 * quat.z() * quat.y() + t0 * quat.x() * quat.w();

    if (std::abs(t4) > 0.00001)
    {
-        x = -std::atan2(-t5, t4);
-        y = -std::atan2(t0 * quat.z * quat.x - t0 * quat.y * quat.w,
-                        1.0 - (t0 * quat.x * quat.x + t0 * quat.y * quat.y));
-        z = -std::atan2(t2, t1);
+        x() = -std::atan2(-t5, t4);
+        y() = -std::atan2(t0 * quat.z() * quat.x() - t0 * quat.y() * quat.w(),
+                          1.0 - (t0 * quat.x() * quat.x() + t0 * quat.y() * quat.y()));
+        z() = -std::atan2(t2, t1);
    }
    else
    {
-        x = -std::atan2(-t5, t4);
-        y = -std::atan2(-(t0 * quat.z * quat.x + t0 * quat.y * quat.w),
-                        1.0 - (t0 * quat.y * quat.y + t0 * quat.z * quat.z));
-        z = 0.f;
+        x() = -std::atan2(-t5, t4);
+        y() = -std::atan2(-(t0 * quat.z() * quat.x() + t0 * quat.y() * quat.w()),
+                          1.0 - (t0 * quat.y() * quat.y() + t0 * quat.z() * quat.z()));
+        z() = 0.f;
    }
 }

@ -58,15 +58,15 @@ CEulerAngles::CEulerAngles(const CTransform& xf)

    if (std::fabs(f1) >= 0.00001)
    {
-        x = -std::atan2(-xf.basis[1][2], f1);
-        y = -std::atan2(xf.basis[0][2], xf.basis[2][2]);
-        z = -std::atan2(xf.basis[1][0], xf.basis[1][1]);
+        x() = -std::atan2(-xf.basis[1][2], f1);
+        y() = -std::atan2(xf.basis[0][2], xf.basis[2][2]);
+        z() = -std::atan2(xf.basis[1][0], xf.basis[1][1]);
    }
    else
    {
-        x = -std::atan2(-xf.basis[1][2], f1);
-        y = -std::atan2(-xf.basis[2][0], xf.basis[0][0]);
-        z = 0.f;
+        x() = -std::atan2(-xf.basis[1][2], f1);
+        y() = -std::atan2(-xf.basis[2][0], xf.basis[0][0]);
+        z() = 0.f;
    }
 }

--- a/src/CFrustum.cpp
+++ b/src/CFrustum.cpp
@ -1,138 +1,88 @@
 #include "zeus/CFrustum.hpp"

-namespace zeus
-{
+namespace zeus {

-void CFrustum::updatePlanes(const CMatrix4f& viewMtx, const CMatrix4f& projection)
-{
-    CMatrix4f mvp = projection * viewMtx;
-    CMatrix4f mvp_rm = mvp.transposed();
+void CFrustum::updatePlanes(const CMatrix4f& viewMtx, const CMatrix4f& projection) { // rebuilds the six planes from view and projection matrices
+  CMatrix4f mvp = projection * viewMtx; // combined transform
+  CMatrix4f mvp_rm = mvp.transposed(); // transposed copy; each plane below is m[3] plus or minus one other m[k]

-#if __SSE__
+  /* Left */
+  planes[0].mSimd = mvp_rm.m[3].mSimd + mvp_rm.m[0].mSimd;

-    /* Left */
-    planes[0].mVec128 = _mm_add_ps(mvp_rm.vec[3].mVec128, mvp_rm.vec[0].mVec128);
+  /* Right */
+  planes[1].mSimd = mvp_rm.m[3].mSimd - mvp_rm.m[0].mSimd;

-    /* Right */
-    planes[1].mVec128 = _mm_sub_ps(mvp_rm.vec[3].mVec128, mvp_rm.vec[0].mVec128);
+  /* Bottom */
+  planes[2].mSimd = mvp_rm.m[3].mSimd + mvp_rm.m[1].mSimd;

-    /* Bottom */
-    planes[2].mVec128 = _mm_add_ps(mvp_rm.vec[3].mVec128, mvp_rm.vec[1].mVec128);
+  /* Top */
+  planes[3].mSimd = mvp_rm.m[3].mSimd - mvp_rm.m[1].mSimd;

-    /* Top */
-    planes[3].mVec128 = _mm_sub_ps(mvp_rm.vec[3].mVec128, mvp_rm.vec[1].mVec128);
+  /* Near */
+  planes[4].mSimd = mvp_rm.m[3].mSimd + mvp_rm.m[2].mSimd;

-    /* Near */
-    planes[4].mVec128 = _mm_add_ps(mvp_rm.vec[3].mVec128, mvp_rm.vec[2].mVec128);
+  /* Far */
+  planes[5].mSimd = mvp_rm.m[3].mSimd - mvp_rm.m[2].mSimd;

-    /* Far */
-    planes[5].mVec128 = _mm_sub_ps(mvp_rm.vec[3].mVec128, mvp_rm.vec[2].mVec128);
+  planes[0].normalize(); // every plane is normalized after extraction
+  planes[1].normalize();
+  planes[2].normalize();
+  planes[3].normalize();
+  planes[4].normalize();
+  planes[5].normalize();

-#else
-    /* Left */
-        planes[0].a = mvp.m[0][0] + mvp.m[3][0];
-        planes[0].b = mvp.m[0][1] + mvp.m[3][1];
-        planes[0].c = mvp.m[0][2] + mvp.m[3][2];
-        planes[0].d = mvp.m[0][3] + mvp.m[3][3];
-
-        /* Right */
-        planes[1].a = -mvp.m[0][0] + mvp.m[3][0];
-        planes[1].b = -mvp.m[0][1] + mvp.m[3][1];
-        planes[1].c = -mvp.m[0][2] + mvp.m[3][2];
-        planes[1].d = -mvp.m[0][3] + mvp.m[3][3];
-
-        /* Bottom */
-        planes[2].a = mvp.m[1][0] + mvp.m[3][0];
-        planes[2].b = mvp.m[1][1] + mvp.m[3][1];
-        planes[2].c = mvp.m[1][2] + mvp.m[3][2];
-        planes[2].d = mvp.m[1][3] + mvp.m[3][3];
-
-        /* Top */
-        planes[3].a = -mvp.m[1][0] + mvp.m[3][0];
-        planes[3].b = -mvp.m[1][1] + mvp.m[3][1];
-        planes[3].c = -mvp.m[1][2] + mvp.m[3][2];
-        planes[3].d = -mvp.m[1][3] + mvp.m[3][3];
-
-        /* Near */
-        planes[4].a = mvp.m[2][0] + mvp.m[3][0];
-        planes[4].b = mvp.m[2][1] + mvp.m[3][1];
-        planes[4].c = mvp.m[2][2] + mvp.m[3][2];
-        planes[4].d = mvp.m[2][3] + mvp.m[3][3];
-
-        /* Far */
-        planes[5].a = -mvp.m[2][0] + mvp.m[3][0];
-        planes[5].b = -mvp.m[2][1] + mvp.m[3][1];
-        planes[5].c = -mvp.m[2][2] + mvp.m[3][2];
-        planes[5].d = -mvp.m[2][3] + mvp.m[3][3];
-
-#endif
-
-    planes[0].normalize();
-    planes[1].normalize();
-    planes[2].normalize();
-    planes[3].normalize();
-    planes[4].normalize();
-    planes[5].normalize();
-
-    valid = true;
+  valid = true; // the *FrustumTest methods return true unconditionally until this is set
 }

-void CFrustum::updatePlanes(const CTransform& viewPointMtx, const CProjection& projection)
-{
-    zeus::CMatrix3f tmp(viewPointMtx.basis[0], viewPointMtx.basis[2], -viewPointMtx.basis[1]);
-    zeus::CTransform viewBasis = zeus::CTransform(tmp.transposed());
-    zeus::CTransform viewMtx = viewBasis * zeus::CTransform::Translate(-viewPointMtx.origin);
+void CFrustum::updatePlanes(const CTransform& viewPointMtx, const CProjection& projection) { // builds a view matrix from a viewpoint transform, then forwards to the matrix overload
+  zeus::CMatrix3f tmp(viewPointMtx.basis[0], viewPointMtx.basis[2], -viewPointMtx.basis[1]); // basis with axes 1 and 2 swapped and the old axis 1 negated
+  zeus::CTransform viewBasis = zeus::CTransform(tmp.transposed()); // transpose of the re-ordered basis; presumably its inverse for an orthonormal basis - confirm
+  zeus::CTransform viewMtx = viewBasis * zeus::CTransform::Translate(-viewPointMtx.origin); // translate by the negated origin, then apply the basis

-    updatePlanes(viewMtx.toMatrix4f(), projection.getCachedMatrix());
+  updatePlanes(viewMtx.toMatrix4f(), projection.getCachedMatrix());
 }

-bool CFrustum::aabbFrustumTest(const CAABox& aabb) const
-{
-    if (!valid)
-        return true;
-
-    CVector3f center = aabb.center();
-    CVector3f extents = aabb.extents();
-
-    for (uint32_t i = 0; i < 6; ++i)
-    {
-        const CPlane& plane = planes[i];
-
-        float m = plane.vec.dot(center) + plane.d;
-        float n = extents.dot({std::fabs(plane.a), std::fabs(plane.b), std::fabs(plane.c)});
-
-        if (m + n < 0)
-            return false;
-    }
+bool CFrustum::aabbFrustumTest(const CAABox& aabb) const { // false only when the box lies fully on the negative side of some plane
+  if (!valid) // planes not built yet: report everything as visible
    return true;
+
+  CVector3f center = aabb.center();
+  CVector3f extents = aabb.extents();
+
+  for (uint32_t i = 0; i < 6; ++i) {
+    const CPlane& plane = planes[i];
+
+    float m = plane.normal().dot(center) + plane.d(); // plane expression evaluated at the box center
+    float n = extents.dot({std::fabs(plane.x()), std::fabs(plane.y()), std::fabs(plane.z())}); // box extents weighted by the absolute plane components
+
+    if (m + n < 0.f) // even the most favourable corner is behind this plane
+      return false;
+  }
+  return true;
 }

-bool CFrustum::sphereFrustumTest(const CSphere& sphere) const
-{
-    if (!valid)
-        return true;
-
-    for (uint32_t i = 0 ; i<6 ; ++i)
-    {
-        float dadot = planes[i].vec.dot(sphere.position);
-        if ((dadot + planes[i].d + sphere.radius) < 0)
-            return false;
-    }
+bool CFrustum::sphereFrustumTest(const CSphere& sphere) const { // false only when the sphere lies fully on the negative side of some plane
+  if (!valid) // planes not built yet: report everything as visible
    return true;
+
+  for (uint32_t i = 0; i < 6; ++i) {
+    float dadot = planes[i].normal().dot(sphere.position);
+    if ((dadot + planes[i].d() + sphere.radius) < 0.f) // center is further behind the plane than one radius
+      return false;
+  }
+  return true;
 }

-bool CFrustum::pointFrustumTest(const CVector3f& point) const
-{
-    if (!valid)
-        return true;
-
-    for (uint32_t i = 0 ; i<6 ; ++i)
-    {
-        float dadot = planes[i].vec.dot(point);
-        if ((dadot + planes[i].d) < 0)
-            return false;
-    }
+bool CFrustum::pointFrustumTest(const CVector3f& point) const { // false when the point is on the negative side of any plane
+  if (!valid) // planes not built yet: report everything as visible
    return true;
+
+  for (uint32_t i = 0; i < 6; ++i) {
+    float dadot = planes[i].normal().dot(point);
+    if ((dadot + planes[i].d()) < 0.f) // plane expression is negative at the point
+      return false;
+  }
+  return true;
 }

 }
--- a/src/CMatrix3f.cpp
+++ b/src/CMatrix3f.cpp
@ -2,104 +2,113 @@
 #include "zeus/CQuaternion.hpp"
 #include "zeus/Global.hpp"

-namespace zeus
-{
+namespace zeus {
 const CMatrix3f CMatrix3f::skIdentityMatrix3f = CMatrix3f();

-CMatrix3f::CMatrix3f(const CQuaternion& quat)
-{
-    CQuaternion nq = quat.normalized();
-    float x2 = nq.x * nq.x;
-    float y2 = nq.y * nq.y;
-    float z2 = nq.z * nq.z;
+CMatrix3f::CMatrix3f(const CQuaternion& quat) { // fills the 3x3 matrix from a quaternion
+  CQuaternion nq = quat.normalized(); // work on a normalized copy of the input
+  float x2 = nq.x() * nq.x();
+  float y2 = nq.y() * nq.y();
+  float z2 = nq.z() * nq.z();

-    m[0][0] = 1.0 - 2.0 * y2 - 2.0 * z2;
-    m[1][0] = 2.0 * nq.x * nq.y - 2.0 * nq.z * nq.w;
-    m[2][0] = 2.0 * nq.x * nq.z + 2.0 * nq.y * nq.w;
+  m[0][0] = 1.0 - 2.0 * y2 - 2.0 * z2; // the double literals make these expressions evaluate in double before the store
+  m[1][0] = 2.0 * nq.x() * nq.y() - 2.0 * nq.z() * nq.w();
+  m[2][0] = 2.0 * nq.x() * nq.z() + 2.0 * nq.y() * nq.w();

-    m[0][1] = 2.0 * nq.x * nq.y + 2.0 * nq.z * nq.w;
-    m[1][1] = 1.0 - 2.0 * x2 - 2.0 * z2;
-    m[2][1] = 2.0 * nq.y * nq.z - 2.0 * nq.x * nq.w;
+  m[0][1] = 2.0 * nq.x() * nq.y() + 2.0 * nq.z() * nq.w();
+  m[1][1] = 1.0 - 2.0 * x2 - 2.0 * z2;
+  m[2][1] = 2.0 * nq.y() * nq.z() - 2.0 * nq.x() * nq.w();

-    m[0][2] = 2.0 * nq.x * nq.z - 2.0 * nq.y * nq.w;
-    m[1][2] = 2.0 * nq.y * nq.z + 2.0 * nq.x * nq.w;
-    m[2][2] = 1.0 - 2.0 * x2 - 2.0 * y2;
-
-    m[0][3] = 0.0f;
-    m[1][3] = 0.0f;
-    m[2][3] = 0.0f;
+  m[0][2] = 2.0 * nq.x() * nq.z() - 2.0 * nq.y() * nq.w();
+  m[1][2] = 2.0 * nq.y() * nq.z() + 2.0 * nq.x() * nq.w();
+  m[2][2] = 1.0 - 2.0 * x2 - 2.0 * y2;
 }

-void CMatrix3f::transpose()
-{
+void CMatrix3f::transpose() { // in-place transpose of the 3x3 part
 #if __SSE__
-    __m128 zero = _mm_xor_ps(vec[0].mVec128, vec[0].mVec128);
-    __m128 T0 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
-    __m128 T2 = _mm_unpacklo_ps(vec[2].mVec128, zero);
-    __m128 T1 = _mm_unpackhi_ps(vec[0].mVec128, vec[1].mVec128);
-    __m128 T3 = _mm_unpackhi_ps(vec[2].mVec128, zero);
-    vec[0].mVec128 = _mm_movelh_ps(T0, T2);
-    vec[1].mVec128 = _mm_movehl_ps(T2, T0);
-    vec[2].mVec128 = _mm_movelh_ps(T1, T3);
+  __m128 zero = _mm_xor_ps(m[0].mSimd.native(), m[0].mSimd.native()); // x ^ x, used as the zero register that stands in for a fourth vector
+  __m128 T0 = _mm_unpacklo_ps(m[0].mSimd.native(), m[1].mSimd.native()); // m0[0], m1[0], m0[1], m1[1]
+  __m128 T2 = _mm_unpacklo_ps(m[2].mSimd.native(), zero); // m2[0], 0, m2[1], 0
+  __m128 T1 = _mm_unpackhi_ps(m[0].mSimd.native(), m[1].mSimd.native()); // m0[2], m1[2], m0[3], m1[3]
+  __m128 T3 = _mm_unpackhi_ps(m[2].mSimd.native(), zero); // m2[2], 0, m2[3], 0
+  m[0].mSimd = _mm_movelh_ps(T0, T2); // m0[0], m1[0], m2[0], 0
+  m[1].mSimd = _mm_movehl_ps(T2, T0); // m0[1], m1[1], m2[1], 0
+  m[2].mSimd = _mm_movelh_ps(T1, T3); // m0[2], m1[2], m2[2], 0
+#elif __ARM_NEON
+  float32x4x2_t P0 = vzipq_f32( M.r[0], M.r[2] ); // NOTE(review): no `M` is declared in this function as shown - this branch looks unported; confirm it builds on NEON
+  float32x4x2_t P1 = vzipq_f32( M.r[1], M.r[3] ); // NOTE(review): indexes a fourth row, while the other branches only touch m[0..2] - confirm
+
+  float32x4x2_t T0 = vzipq_f32( P0.val[0], P1.val[0] );
+  float32x4x2_t T1 = vzipq_f32( P0.val[1], P1.val[1] );
+
+  m[0].mSimd = T0.val[0];
+  m[1].mSimd = T0.val[1];
+  m[2].mSimd = T1.val[0];
 #else
-    float tmp;
+  float tmp; // scalar fallback: swap the three off-diagonal pairs

-    tmp = m[0][1];
-    m[0][1] = m[1][0];
-    m[1][0] = tmp;
+  tmp = m[0][1];
+  m[0][1] = m[1][0];
+  m[1][0] = tmp;

-    tmp = m[0][2];
-    m[0][2] = m[2][0];
-    m[2][0] = tmp;
+  tmp = m[0][2];
+  m[0][2] = m[2][0];
+  m[2][0] = tmp;

-    tmp = m[1][2];
-    m[1][2] = m[2][1];
-    m[2][1] = tmp;
+  tmp = m[1][2];
+  m[1][2] = m[2][1];
+  m[2][1] = tmp;
 #endif
 }

-CMatrix3f CMatrix3f::transposed() const
-{
+CMatrix3f CMatrix3f::transposed() const { // returns a transposed copy; *this is not modified
 #if __SSE__
-    __m128 zero = _mm_xor_ps(vec[0].mVec128, vec[0].mVec128);
-    __m128 T0 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
-    __m128 T2 = _mm_unpacklo_ps(vec[2].mVec128, zero);
-    __m128 T1 = _mm_unpackhi_ps(vec[0].mVec128, vec[1].mVec128);
-    __m128 T3 = _mm_unpackhi_ps(vec[2].mVec128, zero);
-    return CMatrix3f(_mm_movelh_ps(T0, T2), _mm_movehl_ps(T2, T0), _mm_movelh_ps(T1, T3));
+  __m128 zero = _mm_xor_ps(m[0].mSimd.native(), m[0].mSimd.native()); // x ^ x, used as the zero register that stands in for a fourth vector
+  __m128 T0 = _mm_unpacklo_ps(m[0].mSimd.native(), m[1].mSimd.native()); // m0[0], m1[0], m0[1], m1[1]
+  __m128 T2 = _mm_unpacklo_ps(m[2].mSimd.native(), zero); // m2[0], 0, m2[1], 0
+  __m128 T1 = _mm_unpackhi_ps(m[0].mSimd.native(), m[1].mSimd.native()); // m0[2], m1[2], m0[3], m1[3]
+  __m128 T3 = _mm_unpackhi_ps(m[2].mSimd.native(), zero); // m2[2], 0, m2[3], 0
+  return CMatrix3f(_mm_movelh_ps(T0, T2), _mm_movehl_ps(T2, T0), _mm_movelh_ps(T1, T3)); // three result vectors, each with a zero fourth lane
+#elif __ARM_NEON
+  float32x4x2_t P0 = vzipq_f32( M.r[0], M.r[2] ); // NOTE(review): no `M` is declared in this function as shown - this branch looks unported; confirm it builds on NEON
+  float32x4x2_t P1 = vzipq_f32( M.r[1], M.r[3] ); // NOTE(review): indexes a fourth row, while the other branches only touch m[0..2] - confirm
+
+  float32x4x2_t T0 = vzipq_f32( P0.val[0], P1.val[0] );
+  float32x4x2_t T1 = vzipq_f32( P0.val[1], P1.val[1] );
+
+  return CMatrix3f(T0.val[0], T0.val[1], T1.val[0]);
 #else
-    CMatrix3f ret(*this);
-    float tmp;
+  CMatrix3f ret(*this); // scalar fallback: copy, then swap the three off-diagonal pairs in the copy
+  float tmp;

-    tmp = ret.m[0][1];
-    ret.m[0][1] = ret.m[1][0];
-    ret.m[1][0] = tmp;
+  tmp = ret.m[0][1];
+  ret.m[0][1] = ret.m[1][0];
+  ret.m[1][0] = tmp;

-    tmp = m[0][2];
-    ret.m[0][2] = ret.m[2][0];
-    ret.m[2][0] = tmp;
+  tmp = m[0][2]; // read from *this; same value as ret.m[0][2], which has not been written yet
+  ret.m[0][2] = ret.m[2][0];
+  ret.m[2][0] = tmp;

-    tmp = m[1][2];
-    ret.m[1][2] = ret.m[2][1];
-    ret.m[2][1] = tmp;
+  tmp = m[1][2]; // read from *this; same value as ret.m[1][2], which has not been written yet
+  ret.m[1][2] = ret.m[2][1];
+  ret.m[2][1] = tmp;

-    return ret;
+  return ret;
 #endif
 }

-CMatrix3f CMatrix3f::inverted() const
-{
-    float det = m[0][0] * m[1][1] * m[2][2] + m[1][0] * m[2][1] * m[0][2] + m[2][0] * m[0][1] * m[1][2] -
-                m[0][2] * m[1][1] * m[2][0] - m[1][2] * m[2][1] * m[0][0] - m[2][2] * m[0][1] * m[1][0];
+CMatrix3f CMatrix3f::inverted() const { // returns the inverse built from cofactors scaled by 1/det
+  float det = m[0][0] * m[1][1] * m[2][2] + m[1][0] * m[2][1] * m[0][2] + m[2][0] * m[0][1] * m[1][2] -
+              m[0][2] * m[1][1] * m[2][0] - m[1][2] * m[2][1] * m[0][0] - m[2][2] * m[0][1] * m[1][0]; // 3x3 determinant

-    if (det == 0.0)
-        return CMatrix3f();
+  if (det == 0.0) // exact-zero check only; near-singular matrices still go through the division below
+    return CMatrix3f(); // default-constructed matrix, the same value skIdentityMatrix3f is initialized from

-    det = 1.0f / det;
-    return CMatrix3f((m[1][1] * m[2][2] - m[1][2] * m[2][1]) * det, -(m[1][0] * m[2][2] - m[1][2] * m[2][0]) * det,
-                     (m[1][0] * m[2][1] - m[1][1] * m[2][0]) * det, -(m[0][1] * m[2][2] - m[0][2] * m[2][1]) * det,
-                     (m[0][0] * m[2][2] - m[0][2] * m[2][0]) * det, -(m[0][0] * m[2][1] - m[0][1] * m[2][0]) * det,
-                     (m[0][1] * m[1][2] - m[0][2] * m[1][1]) * det, -(m[0][0] * m[1][2] - m[0][2] * m[1][0]) * det,
-                     (m[0][0] * m[1][1] - m[0][1] * m[1][0]) * det);
+  det = 1.0f / det; // from here on det holds the reciprocal
+  return CMatrix3f((m[1][1] * m[2][2] - m[1][2] * m[2][1]) * det, -(m[1][0] * m[2][2] - m[1][2] * m[2][0]) * det,
+                   (m[1][0] * m[2][1] - m[1][1] * m[2][0]) * det, -(m[0][1] * m[2][2] - m[0][2] * m[2][1]) * det,
+                   (m[0][0] * m[2][2] - m[0][2] * m[2][0]) * det, -(m[0][0] * m[2][1] - m[0][1] * m[2][0]) * det,
+                   (m[0][1] * m[1][2] - m[0][2] * m[1][1]) * det, -(m[0][0] * m[1][2] - m[0][2] * m[1][0]) * det,
+                   (m[0][0] * m[1][1] - m[0][1] * m[1][0]) * det);
 }
 }
--- a/src/CMatrix4f.cpp
+++ b/src/CMatrix4f.cpp
@ -9,14 +9,25 @@ CMatrix4f CMatrix4f::transposed() const
 {
    CMatrix4f ret;
 #if __SSE__
-    __m128 T0 = _mm_unpacklo_ps(vec[0].mVec128, vec[1].mVec128);
-    __m128 T2 = _mm_unpacklo_ps(vec[2].mVec128, vec[3].mVec128);
-    __m128 T1 = _mm_unpackhi_ps(vec[0].mVec128, vec[1].mVec128);
-    __m128 T3 = _mm_unpackhi_ps(vec[2].mVec128, vec[3].mVec128);
-    ret.vec[0].mVec128 = _mm_movelh_ps(T0, T2);
-    ret.vec[1].mVec128 = _mm_movehl_ps(T2, T0);
-    ret.vec[2].mVec128 = _mm_movelh_ps(T1, T3);
-    ret.vec[3].mVec128 = _mm_movehl_ps(T3, T1);
+    __m128 T0 = _mm_unpacklo_ps(m[0].mSimd.native(), m[1].mSimd.native());
+    __m128 T2 = _mm_unpacklo_ps(m[2].mSimd.native(), m[3].mSimd.native());
+    __m128 T1 = _mm_unpackhi_ps(m[0].mSimd.native(), m[1].mSimd.native());
+    __m128 T3 = _mm_unpackhi_ps(m[2].mSimd.native(), m[3].mSimd.native());
+    ret.m[0].mSimd = _mm_movelh_ps(T0, T2);
+    ret.m[1].mSimd = _mm_movehl_ps(T2, T0);
+    ret.m[2].mSimd = _mm_movelh_ps(T1, T3);
+    ret.m[3].mSimd = _mm_movehl_ps(T3, T1);
+#elif __ARM_NEON
+    float32x4x2_t P0 = vzipq_f32( M.r[0], M.r[2] );
+    float32x4x2_t P1 = vzipq_f32( M.r[1], M.r[3] );
+
+    float32x4x2_t T0 = vzipq_f32( P0.val[0], P1.val[0] );
+    float32x4x2_t T1 = vzipq_f32( P0.val[1], P1.val[1] );
+
+    ret.m[0].mSimd = T0.val[0];
+    ret.m[1].mSimd = T0.val[1];
+    ret.m[2].mSimd = T1.val[0];
+    ret.m[3].mSimd = T1.val[1];
 #else
    ret.m[0][0] = m[0][0];
    ret.m[1][0] = m[0][1];
--- a/src/COBBox.cpp
+++ b/src/COBBox.cpp
@ -1,139 +1,140 @@
 #include "zeus/COBBox.hpp"

-namespace zeus
-{
+namespace zeus {

-CAABox COBBox::calculateAABox(const CTransform& worldXf) const
-{
-    CAABox ret = CAABox::skInvertedBox;
+CAABox COBBox::calculateAABox(const CTransform& worldXf) const { // accumulates eight transformed corner points into an axis-aligned box
+  CAABox ret = CAABox::skInvertedBox; // start from the inverted box so the first accumulated point defines the bounds

-    CTransform trans = worldXf * transform;
-    static const CVector3f basis[8] = {{1.f, 1.f, 1.f},    {1.f, 1.f, -1.f},  {1.f, -1.f, 1.f},  {1.f, -1.f, -1.f},
-                                       {-1.f, -1.f, -1.f}, {-1.f, -1.f, 1.f}, {-1.f, 1.f, -1.f}, {-1.f, 1.f, 1.f}};
-    CVector3f p = extents * basis[0];
-    ret.accumulateBounds(trans * p);
-    p = extents * basis[1];
-    ret.accumulateBounds(trans * p);
-    p = extents * basis[2];
-    ret.accumulateBounds(trans * p);
-    p = extents * basis[3];
-    ret.accumulateBounds(trans * p);
-    p = extents * basis[4];
-    ret.accumulateBounds(trans * p);
-    p = extents * basis[5];
-    ret.accumulateBounds(trans * p);
-    p = extents * basis[6];
-    ret.accumulateBounds(trans * p);
-    p = extents * basis[7];
-    ret.accumulateBounds(trans * p);
+  CTransform trans = worldXf * transform; // this box's own transform followed by the caller's world transform
+  static const CVector3f basis[8] = {{1.f,  1.f,  1.f}, // the eight sign combinations, one per corner
+                                     {1.f,  1.f,  -1.f},
+                                     {1.f,  -1.f, 1.f},
+                                     {1.f,  -1.f, -1.f},
+                                     {-1.f, -1.f, -1.f},
+                                     {-1.f, -1.f, 1.f},
+                                     {-1.f, 1.f,  -1.f},
+                                     {-1.f, 1.f,  1.f}};
+  CVector3f p = extents * basis[0]; // corner offset: extents multiplied by one sign pattern
+  ret.accumulateBounds(trans * p); // transform the corner and grow the result to include it
+  p = extents * basis[1];
+  ret.accumulateBounds(trans * p);
+  p = extents * basis[2];
+  ret.accumulateBounds(trans * p);
+  p = extents * basis[3];
+  ret.accumulateBounds(trans * p);
+  p = extents * basis[4];
+  ret.accumulateBounds(trans * p);
+  p = extents * basis[5];
+  ret.accumulateBounds(trans * p);
+  p = extents * basis[6];
+  ret.accumulateBounds(trans * p);
+  p = extents * basis[7];
+  ret.accumulateBounds(trans * p);

-    return ret;
+  return ret;
 }

-bool COBBox::OBBIntersectsBox(const COBBox& other) const
-{
-    CVector3f v = other.transform.origin - transform.origin;
-    CVector3f T = CVector3f(v.dot(transform.basis[0]),
-                            v.dot(transform.basis[1]),
-                            v.dot(transform.basis[2]));
+bool COBBox::OBBIntersectsBox(const COBBox& other) const { // returns false as soon as one of the tested axes separates the two boxes
+  CVector3f v = other.transform.origin - transform.origin; // offset between the two box origins
+  CVector3f T = CVector3f(v.dot(transform.basis[0]),
+                          v.dot(transform.basis[1]),
+                          v.dot(transform.basis[2])); // the same offset expressed along this box's three basis vectors

-    CMatrix3f R;
+  CMatrix3f R;

-    float ra, rb, t;
-
-    for (int i = 0; i < 3; ++i)
-        for (int k = 0; k < 3; ++k)
-            R[i][k] = transform.basis[i].dot(other.transform.basis[k]);
-
-    for (int i = 0; i < 3; ++i)
-    {
-        ra = extents[i];
-        rb = (other.extents[0] * std::fabs(R[i][0])) +
-             (other.extents[1] * std::fabs(R[i][1])) +
-             (other.extents[2] * std::fabs(R[i][2]));
-        t = std::fabs(T[i]);
-
-        if (t > (ra + rb + FLT_EPSILON))
-            return false;
-    }
+  float ra, rb, t; // per-axis: reach of this box, reach of the other box, projected center distance

+  for (int i = 0; i < 3; ++i)
    for (int k = 0; k < 3; ++k)
-    {
-        ra = (extents[0] * std::fabs(R[0][k])) +
-             (extents[1] * std::fabs(R[1][k])) +
-             (extents[2] * std::fabs(R[2][k]));
-        rb = other.extents[k];
+      R[i][k] = transform.basis[i].dot(other.transform.basis[k]); // dot of this box's axis i with the other box's axis k

-        t = std::fabs(T[0] * R[0][k] + T[1] * R[1][k] + T[2] * R[2][k]);
+  for (int i = 0; i < 3; ++i) { // test the three axes of this box
+    ra = extents[i];
+    rb = (other.extents[0] * std::fabs(R[i][0])) +
+         (other.extents[1] * std::fabs(R[i][1])) +
+         (other.extents[2] * std::fabs(R[i][2]));
+    t = std::fabs(T[i]);

-        if (t > (ra + rb + FLT_EPSILON))
-            return false;
-    }
-
-    /* A0 x B0 */
-    ra = (extents[1] * std::fabs(R[2][0])) + (extents[2] * std::fabs(R[1][0]));
-    rb = (other.extents[1] * std::fabs(R[0][2])) + (other.extents[2] * std::fabs(R[0][1]));
-    t = std::fabs((T[2] * R[1][0]) - (T[1] * R[2][0]));
    if (t > (ra + rb + FLT_EPSILON))
-        return false;
+      return false;
+  }
+
+  for (int k = 0; k < 3; ++k) { // test the three axes of the other box
+    ra = (extents[0] * std::fabs(R[0][k])) +
+         (extents[1] * std::fabs(R[1][k])) +
+         (extents[2] * std::fabs(R[2][k]));
+    rb = other.extents[k];
+
+    t = std::fabs(T[0] * R[0][k] + T[1] * R[1][k] + T[2] * R[2][k]);

-    /* A0 x B1 */
-    ra = (extents[1] * std::fabs(R[2][1])) + (extents[2] * std::fabs(R[1][1]));
-    rb = (other.extents[0] * std::fabs(R[0][2])) + (other.extents[2] * std::fabs(R[0][0]));
-    t = std::fabs((T[2] * R[1][1]) - (T[1] * R[2][1]));
    if (t > (ra + rb + FLT_EPSILON))
-        return false;
+      return false;
+  }

-    /* A0 x B2 */
-    ra = (extents[1] * std::fabs(R[2][2])) + (extents[2] * std::fabs(R[1][2]));
-    rb = (other.extents[0] * std::fabs(R[0][1])) + (other.extents[1] * std::fabs(R[0][0]));
-    t = std::fabs((T[2] * R[1][2]) - (T[1] * R[2][2]));
-    if (t > (ra + rb + FLT_EPSILON))
-        return false;
+  /* A0 x B0 */
+  ra = (extents[1] * std::fabs(R[2][0])) + (extents[2] * std::fabs(R[1][0]));
+  rb = (other.extents[1] * std::fabs(R[0][2])) + (other.extents[2] * std::fabs(R[0][1]));
+  t = std::fabs((T[2] * R[1][0]) - (T[1] * R[2][0]));
+  if (t > (ra + rb + FLT_EPSILON)) // FLT_EPSILON adds a small tolerance so touching boxes are not reported as separated
+    return false;

-    /* A1 x B0 */
-    ra = (extents[0] * std::fabs(R[2][0])) + (extents[2] * std::fabs(R[0][0]));
-    rb = (other.extents[1] * std::fabs(R[1][2])) + (other.extents[2] * std::fabs(R[1][1]));
-    t = std::fabs((T[0] * R[2][0]) - (T[2] * R[0][0]));
-    if (t > (ra + rb + FLT_EPSILON))
-        return false;
+  /* A0 x B1 */
+  ra = (extents[1] * std::fabs(R[2][1])) + (extents[2] * std::fabs(R[1][1]));
+  rb = (other.extents[0] * std::fabs(R[0][2])) + (other.extents[2] * std::fabs(R[0][0]));
+  t = std::fabs((T[2] * R[1][1]) - (T[1] * R[2][1]));
+  if (t > (ra + rb + FLT_EPSILON))
+    return false;

-    /* A1 x B1 */
-    ra = (extents[0] * std::fabs(R[2][1])) + (extents[2] * std::fabs(R[0][1]));
-    rb = (other.extents[0] * std::fabs(R[1][2])) + (other.extents[2] * std::fabs(R[1][0]));
-    t = std::fabs((T[0] * R[2][1]) - (T[2] * R[0][1]));
-    if (t > (ra + rb + FLT_EPSILON))
-        return false;
+  /* A0 x B2 */
+  ra = (extents[1] * std::fabs(R[2][2])) + (extents[2] * std::fabs(R[1][2]));
+  rb = (other.extents[0] * std::fabs(R[0][1])) + (other.extents[1] * std::fabs(R[0][0]));
+  t = std::fabs((T[2] * R[1][2]) - (T[1] * R[2][2]));
+  if (t > (ra + rb + FLT_EPSILON))
+    return false;

-    /* A1 x B2 */
-    ra = (extents[0] * std::fabs(R[2][2])) + (extents[2] * std::fabs(R[0][2]));
-    rb = (other.extents[0] * std::fabs(R[1][1])) + (other.extents[1] * std::fabs(R[1][0]));
-    t = std::fabs((T[0] * R[2][2]) - (T[2] * R[0][2]));
-    if (t > (ra + rb + FLT_EPSILON))
-        return false;
+  /* A1 x B0 */
+  ra = (extents[0] * std::fabs(R[2][0])) + (extents[2] * std::fabs(R[0][0]));
+  rb = (other.extents[1] * std::fabs(R[1][2])) + (other.extents[2] * std::fabs(R[1][1]));
+  t = std::fabs((T[0] * R[2][0]) - (T[2] * R[0][0]));
+  if (t > (ra + rb + FLT_EPSILON))
+    return false;

-    /* A2 x B0 */
-    ra = (extents[0] * std::fabs(R[1][0])) + (extents[1] * std::fabs(R[0][0]));
-    rb = (other.extents[1] * std::fabs(R[2][2])) + (other.extents[2] * std::fabs(R[2][1]));
-    t = std::fabs((T[1] * R[0][0]) - (T[0] * R[1][0]));
-    if (t > (ra + rb + FLT_EPSILON))
-        return false;
+  /* A1 x B1 */
+  ra = (extents[0] * std::fabs(R[2][1])) + (extents[2] * std::fabs(R[0][1]));
+  rb = (other.extents[0] * std::fabs(R[1][2])) + (other.extents[2] * std::fabs(R[1][0]));
+  t = std::fabs((T[0] * R[2][1]) - (T[2] * R[0][1]));
+  if (t > (ra + rb + FLT_EPSILON))
+    return false;

-    /* A2 x B1 */
-    ra = (extents[0] * std::fabs(R[1][1])) + (extents[1] * std::fabs(R[0][1]));
-    rb = (other.extents[0] * std::fabs(R[2][2])) + (other.extents[2] * std::fabs(R[2][0]));
-    t = std::fabs((T[1] * R[0][1]) - (T[0] * R[1][1]));
-    if (t > (ra + rb + FLT_EPSILON))
-        return false;
+  /* A1 x B2 */
+  ra = (extents[0] * std::fabs(R[2][2])) + (extents[2] * std::fabs(R[0][2]));
+  rb = (other.extents[0] * std::fabs(R[1][1])) + (other.extents[1] * std::fabs(R[1][0]));
+  t = std::fabs((T[0] * R[2][2]) - (T[2] * R[0][2]));
+  if (t > (ra + rb + FLT_EPSILON))
+    return false;

-    /* A2 x B2 */
-    ra = (extents[0] * std::fabs(R[1][2])) + (extents[1] * std::fabs(R[0][2]));
-    rb = (other.extents[0] * std::fabs(R[2][1])) + (other.extents[1] * std::fabs(R[2][0]));
-    t = std::fabs((T[1] * R[0][2]) - (T[0] * R[1][2]));
-    if (t > (ra + rb + FLT_EPSILON))
-        return false;
+  /* A2 x B0 */
+  ra = (extents[0] * std::fabs(R[1][0])) + (extents[1] * std::fabs(R[0][0]));
+  rb = (other.extents[1] * std::fabs(R[2][2])) + (other.extents[2] * std::fabs(R[2][1]));
+  t = std::fabs((T[1] * R[0][0]) - (T[0] * R[1][0]));
+  if (t > (ra + rb + FLT_EPSILON))
+    return false;

-    return true;
+  /* A2 x B1 */
+  ra = (extents[0] * std::fabs(R[1][1])) + (extents[1] * std::fabs(R[0][1]));
+  rb = (other.extents[0] * std::fabs(R[2][2])) + (other.extents[2] * std::fabs(R[2][0]));
+  t = std::fabs((T[1] * R[0][1]) - (T[0] * R[1][1]));
+  if (t > (ra + rb + FLT_EPSILON))
+    return false;
+
+  /* A2 x B2 */
+  ra = (extents[0] * std::fabs(R[1][2])) + (extents[1] * std::fabs(R[0][2]));
+  rb = (other.extents[0] * std::fabs(R[2][1])) + (other.extents[1] * std::fabs(R[2][0]));
+  t = std::fabs((T[1] * R[0][2]) - (T[0] * R[1][2]));
+  if (t > (ra + rb + FLT_EPSILON))
+    return false;
+
+  return true; // none of the 15 tested axes (3 + 3 + 9 cross products) separated the boxes
 }

 }
--- a/src/CPlane.cpp
+++ b/src/CPlane.cpp
@ -1,18 +1,16 @@
 #include "zeus/CPlane.hpp"

-namespace zeus
-{
+namespace zeus {

-bool CPlane::rayPlaneIntersection(const CVector3f& from, const CVector3f& to, CVector3f& point) const
-{
-    zeus::CVector3f delta = to - from;
-    if (std::fabs(delta.normalized().dot(vec)) < 0.01f)
-        return false;
-    float tmp = -pointToPlaneDist(from) / delta.dot(vec);
-    if (tmp < -0.f || tmp > 1.0001f)
-        return false;
-    point = delta * tmp + from;
-    return true;
+/// Intersects the segment running from `from` to `to` with this plane.
+/// Returns false when the segment is degenerate, runs nearly parallel to the plane, or the hit
+/// falls outside the segment (a small tolerance past `to` is allowed). `point` is written only on success.
+bool CPlane::rayPlaneIntersection(const CVector3f& from, const CVector3f& to, CVector3f& point) const {
+  zeus::CVector3f delta = to - from;
+  // A zero-length segment has no direction: normalizing it and dividing by delta.dot(normal())
+  // below can produce NaN, which slips past every comparison and would be reported as a hit.
+  if (!delta.canBeNormalized())
+    return false;
+  if (std::fabs(delta.normalized().dot(normal())) < 0.01f)
+    return false;
+  // Parametric position of the hit along the segment: 0 at `from`, 1 at `to`.
+  float tmp = -pointToPlaneDist(from) / delta.dot(normal());
+  if (tmp < -0.f || tmp > 1.0001f)
+    return false;
+  point = delta * tmp + from;
+  return true;
 }

 }
--- a/src/CProjection.cpp
+++ b/src/CProjection.cpp
@ -2,72 +2,67 @@
 #include "zeus/Math.hpp"
 #include <cassert>

-namespace zeus
-{
-void CProjection::_updateCachedMatrix()
-{
-    assert(m_projType == EProjType::Orthographic || m_projType == EProjType::Perspective);
-    if (m_projType == EProjType::Orthographic)
-    {
-        float tmp;
+namespace zeus {
+/// Rebuilds the cached matrix m_mtx from the active projection parameters:
+/// m_ortho when the type is Orthographic, m_persp when it is Perspective.
+/// Any other m_projType trips the assert and leaves m_mtx untouched.
+void CProjection::_updateCachedMatrix() {
+  assert(m_projType == EProjType::Orthographic || m_projType == EProjType::Perspective);
+  if (m_projType == EProjType::Orthographic) {
+    float tmp;

-        tmp = 1.0f / (m_ortho.right - m_ortho.left);
-        m_mtx.m[0][0] = 2.0f * tmp;
-        m_mtx.m[1][0] = 0.0f;
-        m_mtx.m[2][0] = 0.0f;
-        m_mtx.m[3][0] = -(m_ortho.right + m_ortho.left) * tmp;
+    tmp = 1.0f / (m_ortho.right - m_ortho.left);
+    m_mtx.m[0][0] = 2.0f * tmp;
+    m_mtx.m[1][0] = 0.0f;
+    m_mtx.m[2][0] = 0.0f;
+    m_mtx.m[3][0] = -(m_ortho.right + m_ortho.left) * tmp;

-        tmp = 1.0f / (m_ortho.top - m_ortho.bottom);
-        m_mtx.m[0][1] = 0.0f;
-        m_mtx.m[1][1] = 2.0f * tmp;
-        m_mtx.m[2][1] = 0.0f;
-        m_mtx.m[3][1] = -(m_ortho.top + m_ortho.bottom) * tmp;
+    tmp = 1.0f / (m_ortho.top - m_ortho.bottom);
+    m_mtx.m[0][1] = 0.0f;
+    m_mtx.m[1][1] = 2.0f * tmp;
+    m_mtx.m[2][1] = 0.0f;
+    m_mtx.m[3][1] = -(m_ortho.top + m_ortho.bottom) * tmp;

-        tmp = 1.0f / (m_ortho.zfar - m_ortho.znear);
-        m_mtx.m[0][2] = 0.0f;
-        m_mtx.m[1][2] = 0.0f;
-        m_mtx.m[2][2] = -tmp;
-        m_mtx.m[3][2] = -m_ortho.zfar * tmp;
+    tmp = 1.0f / (m_ortho.zfar - m_ortho.znear);
+    m_mtx.m[0][2] = 0.0f;
+    m_mtx.m[1][2] = 0.0f;
+    m_mtx.m[2][2] = -tmp;
+    m_mtx.m[3][2] = -m_ortho.zfar * tmp;

-        m_mtx.m[0][3] = 0.0f;
-        m_mtx.m[1][3] = 0.0f;
-        m_mtx.m[2][3] = 0.0f;
-        m_mtx.m[3][3] = 1.0f;
-    }
-    else if (m_projType == EProjType::Perspective)
-    {
-        float tfov = std::tan(m_persp.fov * 0.5f);
-        float top = m_persp.znear * tfov;
-        float bottom = -top;
-        float right = m_persp.aspect * m_persp.znear * tfov;
-        float left = -right;
+    m_mtx.m[0][3] = 0.0f;
+    m_mtx.m[1][3] = 0.0f;
+    m_mtx.m[2][3] = 0.0f;
+    m_mtx.m[3][3] = 1.0f;
+  } else if (m_projType == EProjType::Perspective) {
+    // Symmetric extents at the near plane, derived from half the field of view and the aspect ratio.
+    float tfov = std::tan(m_persp.fov * 0.5f);
+    float top = m_persp.znear * tfov;
+    float bottom = -top;
+    float right = m_persp.aspect * m_persp.znear * tfov;
+    float left = -right;

-        float rml = right - left;
-        float rpl = right + left;
-        float tmb = top - bottom;
-        float tpb = top + bottom;
-        float fpn = m_persp.zfar + m_persp.znear;
-        float fmn = m_persp.zfar - m_persp.znear;
+    float rml = right - left;
+    float rpl = right + left;
+    float tmb = top - bottom;
+    float tpb = top + bottom;
+    float fpn = m_persp.zfar + m_persp.znear;
+    float fmn = m_persp.zfar - m_persp.znear;

-        m_mtx.m[0][0] = 2.f * m_persp.znear / rml;
-        m_mtx.m[1][0] = 0.0f;
-        m_mtx.m[2][0] = rpl / rml;
-        m_mtx.m[3][0] = 0.0f;
+    m_mtx.m[0][0] = 2.f * m_persp.znear / rml;
+    m_mtx.m[1][0] = 0.0f;
+    m_mtx.m[2][0] = rpl / rml;
+    m_mtx.m[3][0] = 0.0f;

-        m_mtx.m[0][1] = 0.0f;
-        m_mtx.m[1][1] = 2.f * m_persp.znear / tmb;
-        m_mtx.m[2][1] = tpb / tmb;
-        m_mtx.m[3][1] = 0.0f;
+    m_mtx.m[0][1] = 0.0f;
+    m_mtx.m[1][1] = 2.f * m_persp.znear / tmb;
+    m_mtx.m[2][1] = tpb / tmb;
+    m_mtx.m[3][1] = 0.0f;

-        m_mtx.m[0][2] = 0.0f;
-        m_mtx.m[1][2] = 0.0f;
-        m_mtx.m[2][2] = -fpn / fmn;
-        m_mtx.m[3][2] = -2.f * m_persp.zfar * m_persp.znear / fmn;
+    m_mtx.m[0][2] = 0.0f;
+    m_mtx.m[1][2] = 0.0f;
+    m_mtx.m[2][2] = -fpn / fmn;
+    m_mtx.m[3][2] = -2.f * m_persp.zfar * m_persp.znear / fmn;

-        m_mtx.m[0][3] = 0.0f;
-        m_mtx.m[1][3] = 0.0f;
-        m_mtx.m[2][3] = -1.0f;
-        m_mtx.m[3][3] = 0.0f;
-    }
+    m_mtx.m[0][3] = 0.0f;
+    m_mtx.m[1][3] = 0.0f;
+    m_mtx.m[2][3] = -1.0f;
+    m_mtx.m[3][3] = 0.0f;
+  }
 }
 }
--- a/src/CQuaternion.cpp
+++ b/src/CQuaternion.cpp
@ -1,399 +1,330 @@
 #include "zeus/CQuaternion.hpp"
 #include "zeus/Math.hpp"

-namespace zeus
-{
+namespace zeus {
 const CQuaternion CQuaternion::skNoRotation;

-CQuaternion::CQuaternion(const CMatrix3f& mat)
-{
-    float trace = mat[0][0] + mat[1][1] + mat[2][2];
-    if (trace >= 0.f)
-    {
-        float st = std::sqrt(trace + 1.0f);
-        float s = 0.5f / st;
-        w = 0.5f * st;
-        x = (mat[1][2] - mat[2][1]) * s;
-        y = (mat[2][0] - mat[0][2]) * s;
-        z = (mat[0][1] - mat[1][0]) * s;
-    }
-    else
-    {
-        int idx = 0;
-        if (mat[1][1] > mat[0][0])
-        {
-            idx = 1;
-            if (mat[2][2] > mat[1][1])
-                idx = 2;
-        }
-        else if (mat[2][2] > mat[0][0])
-        {
-            idx = 2;
-        }
-
-        switch (idx)
-        {
-        case 0:
-        {
-            float st = std::sqrt(mat[0][0] - (mat[1][1] + mat[2][2]) + 1.f);
-            float s = 0.5f / st;
-            w = (mat[1][2] - mat[2][1]) * s;
-            x = 0.5f * st;
-            y = (mat[1][0] + mat[0][1]) * s;
-            z = (mat[2][0] + mat[0][2]) * s;
-            break;
-        }
-        case 1:
-        {
-            float st = std::sqrt(mat[1][1] - (mat[2][2] + mat[0][0]) + 1.f);
-            float s = 0.5f / st;
-            w = (mat[2][0] - mat[0][2]) * s;
-            x = (mat[1][0] + mat[0][1]) * s;
-            y = 0.5f * st;
-            z = (mat[2][1] + mat[1][2]) * s;
-            break;
-        }
-        case 2:
-        {
-            float st = std::sqrt(mat[2][2] - (mat[0][0] + mat[1][1]) + 1.f);
-            float s = 0.5f / st;
-            w = (mat[0][1] - mat[1][0]) * s;
-            x = (mat[2][0] + mat[0][2]) * s;
-            y = (mat[2][1] + mat[1][2]) * s;
-            z = 0.5f * st;
-            break;
-        }
-        default:
-            w = 0.f;
-            x = 0.f;
-            y = 0.f;
-            z = 0.f;
-            break;
-        }
-    }
-}
-
-void CQuaternion::fromVector3f(const CVector3f& vec)
-{
-    float cosX = std::cos(0.5f * vec.x);
-    float cosY = std::cos(0.5f * vec.y);
-    float cosZ = std::cos(0.5f * vec.z);
-
-    float sinX = std::sin(0.5f * vec.x);
-    float sinY = std::sin(0.5f * vec.y);
-    float sinZ = std::sin(0.5f * vec.z);
-
-    w = cosZ * cosY * cosX + sinZ * sinY * sinX;
-    x = cosZ * cosY * sinX - sinZ * sinY * cosX;
-    y = cosZ * sinY * cosX + sinZ * cosY * sinX;
-    z = sinZ * cosY * cosX - cosZ * sinY * sinX;
-}
-
-CQuaternion& CQuaternion::operator=(const CQuaternion& q)
-{
-#if __SSE__
-    mVec128 = q.mVec128;
-#else
-    w = q.w;
-    x = q.x;
-    y = q.y;
-    z = q.z;
-#endif
-    return *this;
-}
-
-CQuaternion CQuaternion::operator+(const CQuaternion& q) const { return CQuaternion(w + q.w, x + q.x, y + q.y, z + q.z); }
-
-CQuaternion CQuaternion::operator-(const CQuaternion& q) const { return CQuaternion(w - q.w, x - q.x, y - q.y, z - q.z); }
-
-CQuaternion CQuaternion::operator*(const CQuaternion& q) const
-{
-    return CQuaternion(w * q.w - CVector3f(x, y, z).dot({q.x, q.y, q.z}),
-                       y * q.z - z * q.y + w * q.x + x * q.w,
-                       z * q.x - x * q.z + w * q.y + y * q.w,
-                       x * q.y - y * q.x + w * q.z + z * q.w);
-}
-
-CNUQuaternion CNUQuaternion::operator*(const CNUQuaternion& q) const
-{
-    return CNUQuaternion(w * q.w - CVector3f(x, y, z).dot({q.x, q.y, q.z}),
-                         y * q.z - z * q.y + w * q.x + x * q.w,
-                         z * q.x - x * q.z + w * q.y + y * q.w,
-                         x * q.y - y * q.x + w * q.z + z * q.w);
-}
-
-CQuaternion CQuaternion::operator/(const CQuaternion& q) const
-{
-    CQuaternion p(q);
-    p.invert();
-    return *this * p;
-}
-
-CQuaternion CQuaternion::operator*(float scale) const { return CQuaternion(w * scale, x * scale, y * scale, z * scale); }
-
-CNUQuaternion CNUQuaternion::operator*(float scale) const { return CNUQuaternion(w * scale, x * scale, y * scale, z * scale); }
-
-CQuaternion CQuaternion::operator/(float scale) const { return CQuaternion(w / scale, x / scale, y / scale, z / scale); }
-
-CQuaternion CQuaternion::operator-() const { return CQuaternion(-w, -x, -y, -z); }
-
-const CQuaternion& CQuaternion::operator+=(const CQuaternion& q)
-{
-    w += q.w;
-    x += q.x;
-    y += q.y;
-    z += q.z;
-    return *this;
-}
-
-const CNUQuaternion& CNUQuaternion::operator+=(const CNUQuaternion& q)
-{
-    w += q.w;
-    x += q.x;
-    y += q.y;
-    z += q.z;
-    return *this;
-}
-
-const CQuaternion& CQuaternion::operator-=(const CQuaternion& q)
-{
-    w -= q.w;
-    x -= q.x;
-    y -= q.y;
-    z -= q.z;
-    return *this;
-}
-
-const CQuaternion& CQuaternion::operator*=(const CQuaternion& q)
-{
-    CQuaternion orig = *this;
-
-    w = orig.w * q.w - CVector3f(orig.x, orig.y, orig.z).dot({q.x, q.y, q.z});
-    x = orig.y * q.z - orig.z * q.y + orig.w * q.x + orig.x * q.w;
-    y = orig.z * q.x - orig.x * q.z + orig.w * q.y + orig.y * q.w;
-    z = orig.x * q.y - orig.y * q.x + orig.w * q.z + orig.z * q.w;
-
-    return *this;
-}
-
-const CQuaternion& CQuaternion::operator*=(float scale)
-{
-    w *= scale;
-    x *= scale;
-    y *= scale;
-    z *= scale;
-    return *this;
-}
-
-const CQuaternion& CQuaternion::operator/=(float scale)
-{
-    w /= scale;
-    x /= scale;
-    y /= scale;
-    z /= scale;
-    return *this;
-}
-
-void CQuaternion::invert()
-{
-    x = -x;
-    y = -y;
-    z = -z;
-}
-
-CQuaternion CQuaternion::inverse() const { return CQuaternion(w, -x, -y, -z); }
-
-CQuaternion CQuaternion::log() const
-{
-    float a = std::acos(w);
-    float sina = std::sin(a);
-    CQuaternion ret;
-
-    ret.w = 0.f;
-
-    if (sina > 0.f)
-    {
-        ret.x = a * x / sina;
-        ret.y = a * y / sina;
-        ret.z = a * z / sina;
-    }
-    else
-    {
-        ret.x = 0.f;
-        ret.y = 0.f;
-        ret.z = 0.f;
+/// Builds the quaternion from a 3x3 matrix.
+/// With a non-negative trace, w comes from sqrt(trace + 1). Otherwise the largest diagonal
+/// element decides which of x/y/z is taken from the square root, and the remaining lanes are
+/// filled from sums and differences of off-diagonal elements.
+CQuaternion::CQuaternion(const CMatrix3f& mat) {
+  float trace = mat[0][0] + mat[1][1] + mat[2][2];
+  if (trace >= 0.f) {
+    float st = std::sqrt(trace + 1.0f);
+    float s = 0.5f / st;
+    w() = 0.5f * st;
+    x() = (mat[1][2] - mat[2][1]) * s;
+    y() = (mat[2][0] - mat[0][2]) * s;
+    z() = (mat[0][1] - mat[1][0]) * s;
+  } else {
+    // Index (0, 1 or 2) of the largest diagonal element.
+    int idx = 0;
+    if (mat[1][1] > mat[0][0]) {
+      idx = 1;
+      if (mat[2][2] > mat[1][1])
+        idx = 2;
+    } else if (mat[2][2] > mat[0][0]) {
+      idx = 2;
    }

-    return ret;
+    switch (idx) {
+    case 0: {
+      float st = std::sqrt(mat[0][0] - (mat[1][1] + mat[2][2]) + 1.f);
+      float s = 0.5f / st;
+      w() = (mat[1][2] - mat[2][1]) * s;
+      x() = 0.5f * st;
+      y() = (mat[1][0] + mat[0][1]) * s;
+      z() = (mat[2][0] + mat[0][2]) * s;
+      break;
+    }
+    case 1: {
+      float st = std::sqrt(mat[1][1] - (mat[2][2] + mat[0][0]) + 1.f);
+      float s = 0.5f / st;
+      w() = (mat[2][0] - mat[0][2]) * s;
+      x() = (mat[1][0] + mat[0][1]) * s;
+      y() = 0.5f * st;
+      z() = (mat[2][1] + mat[1][2]) * s;
+      break;
+    }
+    case 2: {
+      float st = std::sqrt(mat[2][2] - (mat[0][0] + mat[1][1]) + 1.f);
+      float s = 0.5f / st;
+      w() = (mat[0][1] - mat[1][0]) * s;
+      x() = (mat[2][0] + mat[0][2]) * s;
+      y() = (mat[2][1] + mat[1][2]) * s;
+      z() = 0.5f * st;
+      break;
+    }
+    // idx is only ever assigned 0, 1 or 2 above, so this branch is a defensive fallback.
+    default:
+      w() = 0.f;
+      x() = 0.f;
+      y() = 0.f;
+      z() = 0.f;
+      break;
+    }
+  }
 }

-CQuaternion CQuaternion::exp() const
-{
-    float a = (CVector3f(x, y, z).magnitude());
-    float sina = std::sin(a);
-    float cosa = std::cos(a);
-    CQuaternion ret;
+/// Overwrites this quaternion from the three angles stored in vec (x, y, z), in radians.
+/// Takes the sine and cosine of half of each angle and combines them into the four lanes.
+void CQuaternion::fromVector3f(const CVector3f& vec) {
+  float cosX = std::cos(0.5f * vec.x());
+  float cosY = std::cos(0.5f * vec.y());
+  float cosZ = std::cos(0.5f * vec.z());

-    ret.w = cosa;
-    if (a > 0.f)
-    {
-        ret.x = sina * x / a;
-        ret.y = sina * y / a;
-        ret.z = sina * z / a;
-    }
-    else
-    {
-        ret.x = 0.f;
-        ret.y = 0.f;
-        ret.z = 0.f;
-    }
+  float sinX = std::sin(0.5f * vec.x());
+  float sinY = std::sin(0.5f * vec.y());
+  float sinZ = std::sin(0.5f * vec.z());

-    return ret;
+  // Lane order is w, x, y, z (lane 0 holds the scalar part).
+  simd_floats f;
+  f[0] = cosZ * cosY * cosX + sinZ * sinY * sinX;
+  f[1] = cosZ * cosY * sinX - sinZ * sinY * cosX;
+  f[2] = cosZ * sinY * cosX + sinZ * cosY * sinX;
+  f[3] = sinZ * cosY * cosX - cosZ * sinY * sinX;
+  mSimd.copy_from(f);
+}
+
+/// Copies all four lanes from q.
+CQuaternion& CQuaternion::operator=(const CQuaternion& q) {
+  mSimd = q.mSimd;
+  return *this;
+}
+
+/// Component-wise sum of the two quaternions.
+CQuaternion CQuaternion::operator+(const CQuaternion& q) const {
+  return mSimd + q.mSimd;
+}
+
+/// Component-wise difference of the two quaternions.
+CQuaternion CQuaternion::operator-(const CQuaternion& q) const {
+  return mSimd - q.mSimd;
+}
+
+/// Quaternion product this * q: the scalar part is w*q.w minus the dot product of the vector
+/// parts, and each vector lane combines a cross-product term with the two scalar-scaled terms.
+CQuaternion CQuaternion::operator*(const CQuaternion& q) const {
+  return CQuaternion(w() * q.w() - CVector3f(x(), y(), z()).dot({q.x(), q.y(), q.z()}),
+                     y() * q.z() - z() * q.y() + w() * q.x() + x() * q.w(),
+                     z() * q.x() - x() * q.z() + w() * q.y() + y() * q.w(),
+                     x() * q.y() - y() * q.x() + w() * q.z() + z() * q.w());
+}
+
+/// Same product as CQuaternion::operator*, for CNUQuaternion operands.
+CNUQuaternion CNUQuaternion::operator*(const CNUQuaternion& q) const {
+  return CNUQuaternion(w() * q.w() - CVector3f(x(), y(), z()).dot({q.x(), q.y(), q.z()}),
+                       y() * q.z() - z() * q.y() + w() * q.x() + x() * q.w(),
+                       z() * q.x() - x() * q.z() + w() * q.y() + y() * q.w(),
+                       x() * q.y() - y() * q.x() + w() * q.z() + z() * q.w());
+}
+
+/// Multiplies this quaternion by q with its x/y/z lanes negated (w kept as is).
+/// No division by q's squared length is performed here.
+CQuaternion CQuaternion::operator/(const CQuaternion& q) const {
+  return *this * q.inverse();
+}
+
+/// Multiplies every lane by scale.
+CQuaternion CQuaternion::operator*(float scale) const {
+  return mSimd * simd<float>(scale);
+}
+
+/// Multiplies every lane by scale.
+CNUQuaternion CNUQuaternion::operator*(float scale) const {
+  return mSimd * simd<float>(scale);
+}
+
+/// Divides every lane by scale; scale is not checked against zero.
+CQuaternion CQuaternion::operator/(float scale) const {
+  return mSimd / simd<float>(scale);
+}
+
+/// Negates every lane, w included.
+CQuaternion CQuaternion::operator-() const { return -mSimd; }
+
+/// Adds q to this quaternion lane by lane, in place.
+const CQuaternion& CQuaternion::operator+=(const CQuaternion& q) {
+  mSimd += q.mSimd;
+  return *this;
+}
+
+/// Adds q to this quaternion lane by lane, in place.
+const CNUQuaternion& CNUQuaternion::operator+=(const CNUQuaternion& q) {
+  mSimd += q.mSimd;
+  return *this;
+}
+
+/// Subtracts q from this quaternion lane by lane, in place.
+const CQuaternion& CQuaternion::operator-=(const CQuaternion& q) {
+  mSimd -= q.mSimd;
+  return *this;
+}
+
+/// Replaces this quaternion with the product this * q.
+const CQuaternion& CQuaternion::operator*=(const CQuaternion& q) {
+  // Form the whole product from untouched operands and assign once. Writing w/x/y/z one lane at
+  // a time reads already-overwritten values whenever q aliases *this (e.g. `a *= a`).
+  *this = *this * q;
+  return *this;
+}
+
+/// Multiplies every lane by scale, in place.
+const CQuaternion& CQuaternion::operator*=(float scale) {
+  mSimd *= simd<float>(scale);
+  return *this;
+}
+
+/// Divides every lane by scale, in place; scale is not checked against zero.
+const CQuaternion& CQuaternion::operator/=(float scale) {
+  mSimd /= simd<float>(scale);
+  return *this;
+}
+
+// Lane multipliers that keep w and flip the sign of x, y and z.
+static const simd<float> InvertQuat(1.f, -1.f, -1.f, -1.f);
+
+/// Negates x, y and z in place; w is unchanged.
+void CQuaternion::invert() {
+  mSimd *= InvertQuat;
+}
+
+/// Returns a copy with x, y and z negated; w is unchanged.
+CQuaternion CQuaternion::inverse() const { return mSimd * InvertQuat; }
+
+/// Returns a quaternion whose w is 0 and whose x/y/z are this quaternion's x/y/z scaled by
+/// a / sin(a), with a = acos(w). When sin(a) is not positive the vector part is zero.
+CQuaternion CQuaternion::log() const {
+  // Clamp before acos: rounding can push w slightly outside [-1, 1], and acos would then
+  // return NaN. buildEquivalent() applies the same clamp.
+  float a = std::acos(clamp(-1.f, w(), 1.f));
+  float sina = std::sin(a);
+  CQuaternion ret;
+
+  if (sina > 0.f)
+    ret = a * *this / sina;
+  else
+    ret = simd<float>(0.f);
+
+  // The scaling above also touched lane 0; the result's scalar part is always 0.
+  ret.w() = 0.f;
+
+  return ret;
+}
+
+/// Returns a quaternion whose w is cos(a) and whose x/y/z are this quaternion's x/y/z scaled
+/// by sin(a) / a, with a the magnitude of the x/y/z part. A zero-length vector part yields
+/// x = y = z = 0.
+CQuaternion CQuaternion::exp() const {
+  float len = (CVector3f(mSimd.shuffle<1, 2, 3, 3>()).magnitude());
+  CQuaternion result;
+
+  if (a_isPositive: len > 0.f)
+    result = std::sin(len) * *this / len;
+  else
+    result = simd<float>(0.f);
+
+  // Lane 0 was scaled along with the rest; overwrite it with the real scalar part.
+  result.w() = std::cos(len);
+
+  return result;
 }

+/// Linear blend a + t * (b - a). The result is not normalized here; nlerp() does that.
 CQuaternion CQuaternion::lerp(const CQuaternion& a, const CQuaternion& b, double t) { return (a + t * (b - a)); }

-CQuaternion CQuaternion::nlerp(const CQuaternion& a, const CQuaternion& b, double t) { return lerp(a, b, t).normalized(); }
-
-CQuaternion CQuaternion::slerp(const CQuaternion& a, const CQuaternion& b, double t)
-{
-    if (t <= 0.0f)
-        return a;
-    if (t >= 1.0f)
-        return b;
-
-    CQuaternion ret;
-
-    float mag = std::sqrt(a.dot(a) * b.dot(b));
-
-    float prod = a.dot(b) / mag;
-
-    if (std::fabs(prod) < 1.0f)
-    {
-        const double sign = (prod < 0.0f) ? -1.0f : 1.0f;
-
-        const double theta = std::acos(sign * prod);
-        const double s1 = std::sin(sign * t * theta);
-        const double d = 1.0 / std::sin(theta);
-        const double s0 = std::sin((1.0 - t) * theta);
-
-        ret.x = float((a.x * s0 + b.x * s1) * d);
-        ret.y = float((a.y * s0 + b.y * s1) * d);
-        ret.z = float((a.z * s0 + b.z * s1) * d);
-        ret.w = float((a.w * s0 + b.w * s1) * d);
-
-        return ret;
-    }
-    return a;
+/// Linear blend of a and b via lerp(), followed by normalized().
+CQuaternion CQuaternion::nlerp(const CQuaternion& a, const CQuaternion& b, double t) {
+  return lerp(a, b, t).normalized();
 }

-CQuaternion CQuaternion::shortestRotationArc(const zeus::CVector3f& v0, const zeus::CVector3f& v1)
-{
-    CVector3f v0N = v0;
-    CVector3f v1N = v1;
+/// Sine-weighted blend between a and b.
+/// Returns a for t <= 0 and b for t >= 1. Otherwise the dot product of a and b, divided by
+/// sqrt(|a|^2 * |b|^2), gives the angle theta; when its magnitude is not below 1 (or the
+/// comparison fails, e.g. on NaN) a is returned unchanged.
+CQuaternion CQuaternion::slerp(const CQuaternion& a, const CQuaternion& b, double t) {
+  if (t <= 0.0f)
+    return a;
+  if (t >= 1.0f)
+    return b;

-    if (!v0N.isZero())
-        v0N.normalize();
-    if (!v1N.isZero())
-        v1N.normalize();
+  CQuaternion ret;

-    CVector3f cross = v0N.cross(v1N);
+  float mag = std::sqrt(a.dot(a) * b.dot(b));

-    if (cross.magSquared() < 0.001f)
-    {
-        if (v0N.dot(v1N) > 0.f)
-            return CQuaternion::skNoRotation;
-        if (cross.canBeNormalized())
-            return CQuaternion(0.0f, cross.normalized());
-        return CQuaternion::skNoRotation;
-    }
-    else
-    {
-        float w = std::sqrt((1.f + zeus::clamp(-1.f, v0N.dot(v1N), 1.f)) * 2.f);
-        return CQuaternion(0.5f * w, cross * (1.f / w));
-    }
+  float prod = a.dot(b) / mag;
+
+  if (std::fabs(prod) < 1.0f) {
+    // A negative dot product flips the sign of b's weight so theta stays in [0, pi/2].
+    const double sign = (prod < 0.0f) ? -1.0f : 1.0f;
+
+    const double theta = std::acos(sign * prod);
+    const double s1 = std::sin(sign * t * theta);
+    const double d = 1.0 / std::sin(theta);
+    const double s0 = std::sin((1.0 - t) * theta);
+
+    ret = (a * s0 + b * s1) * d;
+
+    return ret;
+  }
+  return a;
+}
+
+/// Builds a rotation from the direction of v0 to the direction of v1.
+/// Non-zero inputs are normalized first. When the cross product is nearly zero the result is
+/// skNoRotation, unless the vectors point apart and the cross product can still be normalized,
+/// in which case a w = 0 quaternion about that axis is returned.
+CQuaternion CQuaternion::shortestRotationArc(const zeus::CVector3f& v0, const zeus::CVector3f& v1) {
+  CVector3f from = v0;
+  if (!from.isZero())
+    from.normalize();
+
+  CVector3f to = v1;
+  if (!to.isZero())
+    to.normalize();
+
+  CVector3f axis = from.cross(to);
+
+  // General case. Written as a negated '<' so a NaN magnitude takes this path, as before.
+  if (!(axis.magSquared() < 0.001f)) {
+    float scale = std::sqrt((1.f + zeus::clamp(-1.f, from.dot(to), 1.f)) * 2.f);
+    return CQuaternion(0.5f * scale, axis * (1.f / scale));
+  }
+
+  // Nearly parallel or anti-parallel directions.
+  if (from.dot(to) > 0.f)
+    return CQuaternion::skNoRotation;
+  if (axis.canBeNormalized())
+    return CQuaternion(0.0f, axis.normalized());
+  return CQuaternion::skNoRotation;
 }

+/// Rotation from v0 toward v1, limited to `angle`.
+/// Returns the full shortestRotationArc when `angle` is at least 2 * acos(arc.w); otherwise a
+/// rotation of exactly `angle` about the arc's imaginary part.
 CQuaternion CQuaternion::clampedRotateTo(const zeus::CUnitVector3f& v0, const zeus::CUnitVector3f& v1,
-                                         const zeus::CRelAngle& angle)
-{
-    CQuaternion arc = shortestRotationArc(v0, v1);
-    if (angle >= 2.f * std::acos(arc.w))
-        return arc;
+                                         const zeus::CRelAngle& angle) {
+  CQuaternion arc = shortestRotationArc(v0, v1);
+  if (angle >= 2.f * std::acos(arc.w()))
+    return arc;

-    return fromAxisAngle(arc.getImaginary(), angle);
+  return fromAxisAngle(arc.getImaginary(), angle);
 }

-CQuaternion CQuaternion::slerpShort(const CQuaternion& a, const CQuaternion& b, double t)
-{
-    return zeus::CQuaternion::slerp((b.dot(a) >= 0.f) ? a : a.buildEquivalent(), b, t);
+/// slerp() from a to b, substituting a.buildEquivalent() for a when the dot product of the
+/// two quaternions is negative.
+CQuaternion CQuaternion::slerpShort(const CQuaternion& a, const CQuaternion& b, double t) {
+  if (b.dot(a) >= 0.f)
+    return zeus::CQuaternion::slerp(a, b, t);
+  return zeus::CQuaternion::slerp(a.buildEquivalent(), b, t);
 }

-CQuaternion operator+(float lhs, const CQuaternion& rhs)
-{
-    return CQuaternion(lhs + rhs.w, lhs * rhs.x, lhs * rhs.y, lhs * rhs.z);
+/// Adds lhs to every lane of rhs (w, x, y and z alike).
+CQuaternion operator+(float lhs, const CQuaternion& rhs) {
+  return simd<float>(lhs) + rhs.mSimd;
 }

-CQuaternion operator-(float lhs, const CQuaternion& rhs)
-{
-    return CQuaternion(lhs - rhs.w, lhs * rhs.x, lhs * rhs.y, lhs * rhs.z);
+/// Subtracts every lane of rhs from lhs (w, x, y and z alike).
+CQuaternion operator-(float lhs, const CQuaternion& rhs) {
+  return simd<float>(lhs) - rhs.mSimd;
 }

-CQuaternion operator*(float lhs, const CQuaternion& rhs)
-{
-    return CQuaternion(lhs * rhs.w, lhs * rhs.x, lhs * rhs.y, lhs * rhs.z);
+/// Multiplies every lane of rhs by lhs.
+CQuaternion operator*(float lhs, const CQuaternion& rhs) {
+  return simd<float>(lhs) * rhs.mSimd;
 }

-CNUQuaternion operator*(float lhs, const CNUQuaternion& rhs)
-{
-    return CNUQuaternion(lhs * rhs.w, lhs * rhs.x, lhs * rhs.y, lhs * rhs.z);
+/// Multiplies every lane of rhs by lhs.
+CNUQuaternion operator*(float lhs, const CNUQuaternion& rhs) {
+  return simd<float>(lhs) * rhs.mSimd;
 }

-CQuaternion CQuaternion::buildEquivalent() const
-{
-    float tmp = std::acos(clamp(-1.f, w, 1.f)) * 2.0;
-    if (std::fabs(tmp) < 1.0e-7)
-        return {-1.f, 0.f, 0.f, 0.f};
-    else
-        return CQuaternion::fromAxisAngle(CUnitVector3f(x, y, z), tmp + 2.0 * M_PI);
+/// Rebuilds this rotation about the same x/y/z axis with its angle (2 * acos(w), w clamped to
+/// [-1, 1]) increased by 2*pi. A near-zero angle yields {-1, 0, 0, 0} instead.
+CQuaternion CQuaternion::buildEquivalent() const {
+  float angle = std::acos(clamp(-1.f, w(), 1.f)) * 2.f;
+  // Negated '<' keeps the original branch selection for every input, NaN included.
+  if (!(std::fabs(angle) < 1.0e-7)) {
+    CUnitVector3f axis(mSimd.shuffle<1, 2, 3, 3>());
+    return CQuaternion::fromAxisAngle(axis, angle + 2.0 * M_PI);
+  }
+  return {-1.f, 0.f, 0.f, 0.f};
 }

-CRelAngle CQuaternion::angleFrom(const zeus::CQuaternion& other)
-{
-    return std::acos(zeus::clamp(-1.f, dot(other), 1.f));
+/// Returns acos of the dot product with other, the dot product being clamped to [-1, 1] first.
+CRelAngle CQuaternion::angleFrom(const zeus::CQuaternion& other) {
+  return std::acos(zeus::clamp(-1.f, dot(other), 1.f));
 }

-CQuaternion CQuaternion::lookAt(const CUnitVector3f& source, const CUnitVector3f& dest, const CRelAngle& maxAng)
-{
-    CQuaternion q = skNoRotation;
-    zeus::CVector3f destNoZ = dest;
-    zeus::CVector3f sourceNoZ = source;
-    destNoZ.z = 0.f;
-    sourceNoZ.z = 0.f;
-    zeus::CVector3f tmp;
-    if (sourceNoZ.magSquared() > 0.0001f && destNoZ.magSquared() > 0.0001f)
-    {
-        sourceNoZ.normalize();
-        destNoZ.normalize();
+/// Builds a rotation turning `source` toward `dest` in two steps, each clamped to +/- maxAng:
+/// first a rotation about Z taken from the headings of the two vectors with z zeroed, then a
+/// rotation about (tmp x skUp) taken from the difference acos(dest.z) - acos(source.z).
+/// Returns skNoRotation when both vectors have (almost) no XY component.
+CQuaternion CQuaternion::lookAt(const CUnitVector3f& source, const CUnitVector3f& dest, const CRelAngle& maxAng) {
+  CQuaternion q = skNoRotation;
+  zeus::CVector3f destNoZ = dest;
+  zeus::CVector3f sourceNoZ = source;
+  destNoZ.z() = 0.f;
+  sourceNoZ.z() = 0.f;
+  zeus::CVector3f tmp;
+  if (sourceNoZ.magSquared() > 0.0001f && destNoZ.magSquared() > 0.0001f) {
+    sourceNoZ.normalize();
+    destNoZ.normalize();

-        float angleBetween =
-            normalize_angle(std::atan2(destNoZ.x, destNoZ.y) - std::atan2(sourceNoZ.x, sourceNoZ.y));
-        float realAngle = zeus::clamp(-maxAng.asRadians(), angleBetween, maxAng.asRadians());
-        CQuaternion tmpQ;
-        tmpQ.rotateZ(-realAngle);
-        q = tmpQ;
-        tmp = q.transform(sourceNoZ);
-    }
-    else if (sourceNoZ.magSquared() > 0.0001f)
-        tmp = sourceNoZ.normalized();
-    else if (destNoZ.magSquared() > 0.0001f)
-        tmp = destNoZ.normalized();
-    else
-        return skNoRotation;
+    float angleBetween =
+      normalize_angle(std::atan2(destNoZ.x(), destNoZ.y()) - std::atan2(sourceNoZ.x(), sourceNoZ.y()));
+    float realAngle = zeus::clamp(-maxAng.asRadians(), angleBetween, maxAng.asRadians());
+    CQuaternion tmpQ;
+    tmpQ.rotateZ(-realAngle);
+    q = tmpQ;
+    tmp = q.transform(sourceNoZ);
+  } else if (sourceNoZ.magSquared() > 0.0001f)
+    tmp = sourceNoZ.normalized();
+  else if (destNoZ.magSquared() > 0.0001f)
+    tmp = destNoZ.normalized();
+  else
+    return skNoRotation;

-    float realAngle =
-        zeus::clamp(-maxAng.asRadians(), normalize_angle(std::acos(dest.z) - std::acos(source.z)), maxAng.asRadians());
-    return CQuaternion::fromAxisAngle(tmp.cross(CVector3f::skUp), -realAngle) * q;
+  float realAngle =
+    zeus::clamp(-maxAng.asRadians(), normalize_angle(std::acos(dest.z()) - std::acos(source.z())), maxAng.asRadians());
+  return CQuaternion::fromAxisAngle(tmp.cross(CVector3f::skUp), -realAngle) * q;
 }

 }
--- a/src/CTransform.cpp
+++ b/src/CTransform.cpp
@ -1,69 +1,65 @@
 #include "zeus/CTransform.hpp"

-namespace zeus
-{
-CTransform CTransformFromEditorEuler(const CVector3f& eulerVec)
-{
-    CTransform result;
-    double ti, tj, th, ci, cj, ch, si, sj, sh, cc, cs, sc, ss;
+namespace zeus {
+/// Builds a transform whose 3x3 basis is computed from the three angles in eulerVec (radians).
+/// Sines and cosines are evaluated in double precision and stored as float. Only the basis is
+/// written; everything else keeps the value CTransform's default constructor gives it.
+CTransform CTransformFromEditorEuler(const CVector3f& eulerVec) {
+  CTransform result;
+  double ti, tj, th, ci, cj, ch, si, sj, sh, cc, cs, sc, ss;

-    ti = eulerVec[0];
-    tj = eulerVec[1];
-    th = eulerVec[2];
+  ti = eulerVec[0];
+  tj = eulerVec[1];
+  th = eulerVec[2];

-    ci = std::cos(ti);
-    cj = std::cos(tj);
-    ch = std::cos(th);
-    si = std::sin(ti);
-    sj = std::sin(tj);
-    sh = std::sin(th);
+  ci = std::cos(ti);
+  cj = std::cos(tj);
+  ch = std::cos(th);
+  si = std::sin(ti);
+  sj = std::sin(tj);
+  sh = std::sin(th);

-    cc = ci * ch;
-    cs = ci * sh;
-    sc = si * ch;
-    ss = si * sh;
+  // Products of the first and third angle's sine/cosine, reused by several entries below.
+  cc = ci * ch;
+  cs = ci * sh;
+  sc = si * ch;
+  ss = si * sh;

-    result.basis.m[0][0] = float(cj * ch);
-    result.basis.m[1][0] = float(sj * sc - cs);
-    result.basis.m[2][0] = float(sj * cc + ss);
-    result.basis.m[0][1] = float(cj * sh);
-    result.basis.m[1][1] = float(sj * ss + cc);
-    result.basis.m[2][1] = float(sj * cs - sc);
-    result.basis.m[0][2] = float(-sj);
-    result.basis.m[1][2] = float(cj * si);
-    result.basis.m[2][2] = float(cj * ci);
+  result.basis.m[0][0] = float(cj * ch);
+  result.basis.m[1][0] = float(sj * sc - cs);
+  result.basis.m[2][0] = float(sj * cc + ss);
+  result.basis.m[0][1] = float(cj * sh);
+  result.basis.m[1][1] = float(sj * ss + cc);
+  result.basis.m[2][1] = float(sj * cs - sc);
+  result.basis.m[0][2] = float(-sj);
+  result.basis.m[1][2] = float(cj * si);
+  result.basis.m[2][2] = float(cj * ci);

-    return result;
+  return result;
 }

-CTransform CTransformFromAxisAngle(const CVector3f& axis, float angle)
-{
-    CTransform result;
-    CVector3f axisN = axis.normalized();
+/// Builds a transform whose 3x3 basis rotates by `angle` (radians) about `axis`.
+/// The axis is normalized first. Only the basis is written; everything else keeps the value
+/// CTransform's default constructor gives it.
+CTransform CTransformFromAxisAngle(const CVector3f& axis, float angle) {
+  CTransform result;
+  CVector3f axisN = axis.normalized();

-    float c = std::cos(angle);
-    float s = std::sin(angle);
-    float t = 1.f - c;
+  float c = std::cos(angle);
+  float s = std::sin(angle);
+  float t = 1.f - c;

-    result.basis.m[0][0] = t * axisN.v[0] * axisN.v[0] + c;
-    result.basis.m[1][0] = t * axisN.v[0] * axisN.v[1] - axisN.v[2] * s;
-    result.basis.m[2][0] = t * axisN.v[0] * axisN.v[2] + axisN.v[1] * s;
+  result.basis.m[0][0] = t * axisN[0] * axisN[0] + c;
+  result.basis.m[1][0] = t * axisN[0] * axisN[1] - axisN[2] * s;
+  result.basis.m[2][0] = t * axisN[0] * axisN[2] + axisN[1] * s;

-    result.basis.m[0][1] = t * axisN.v[0] * axisN.v[1] + axisN.v[2] * s;
-    result.basis.m[1][1] = t * axisN.v[1] * axisN.v[1] + c;
-    result.basis.m[2][1] = t * axisN.v[1] * axisN.v[2] - axisN.v[0] * s;
+  result.basis.m[0][1] = t * axisN[0] * axisN[1] + axisN[2] * s;
+  result.basis.m[1][1] = t * axisN[1] * axisN[1] + c;
+  result.basis.m[2][1] = t * axisN[1] * axisN[2] - axisN[0] * s;

-    result.basis.m[0][2] = t * axisN.v[0] * axisN.v[2] - axisN.v[1] * s;
-    result.basis.m[1][2] = t * axisN.v[1] * axisN.v[2] + axisN.v[0] * s;
-    result.basis.m[2][2] = t * axisN.v[2] * axisN.v[2] + c;
+  result.basis.m[0][2] = t * axisN[0] * axisN[2] - axisN[1] * s;
+  result.basis.m[1][2] = t * axisN[1] * axisN[2] + axisN[0] * s;
+  result.basis.m[2][2] = t * axisN[2] * axisN[2] + c;

-    return result;
+  return result;
 }

-CTransform CTransformFromEditorEulers(const CVector3f& eulerVec, const CVector3f& origin)
-{
-    CTransform ret = CTransformFromEditorEuler(eulerVec);
-    ret.origin = origin;
-    return ret;
+/// Same basis as CTransformFromEditorEuler(eulerVec), with the transform's origin set to `origin`.
+CTransform CTransformFromEditorEulers(const CVector3f& eulerVec, const CVector3f& origin) {
+  CTransform xf = CTransformFromEditorEuler(eulerVec);
+  xf.origin = origin;
+  return xf;
 }
 }
--- a/src/CVector2f.cpp
+++ b/src/CVector2f.cpp
@ -4,50 +4,46 @@
 #include <cassert>
 #include "zeus/Math.hpp"

-namespace zeus
-{
+namespace zeus {
 const CVector2f CVector2f::skOne = CVector2f(1.0);
 const CVector2f CVector2f::skNegOne = CVector2f(-1.0);
 const CVector2f CVector2f::skZero(0.f, 0.f);

-float CVector2f::getAngleDiff(const CVector2f& a, const CVector2f& b)
-{
-    float mag1 = a.magnitude();
-    float mag2 = b.magnitude();
+/// Returns the unsigned angle between a and b in radians, or 0 when either vector has zero length.
+float CVector2f::getAngleDiff(const CVector2f& a, const CVector2f& b) {
+  float mag1 = a.magnitude();
+  float mag2 = b.magnitude();

-    if (!mag1 || !mag2)
-        return 0;
+  if (!mag1 || !mag2)
+    return 0;

-    float dot = a.dot(b);
-    float theta = std::acos(dot / (mag1 * mag2));
-    return theta;
+  float dot = a.dot(b);
+  // Clamp the cosine: for (anti)parallel vectors rounding can push it just past +/-1, and acos
+  // would then return NaN.
+  float theta = std::acos(clamp(-1.f, dot / (mag1 * mag2), 1.f));
+  return theta;
 }

-CVector2f CVector2f::slerp(const CVector2f& a, const CVector2f& b, float t)
-{
-    if (t <= 0.0f)
-        return a;
-    if (t >= 1.0f)
-        return b;
-
-    CVector2f ret;
-
-    float mag = std::sqrt(a.dot(a) * b.dot(b));
-
-    float prod = a.dot(b) / mag;
-
-    if (std::fabs(prod) < 1.0f)
-    {
-        const double sign = (prod < 0.0f) ? -1.0f : 1.0f;
-
-        const double theta = std::acos(sign * prod);
-        const double s1 = std::sin(sign * t * theta);
-        const double d = 1.0 / std::sin(theta);
-        const double s0 = std::sin((1.0f - t) * theta);
-
-        ret = (a * s0 + b * s1) * d;
-        return ret;
-    }
+/// Sine-weighted blend between a and b.
+/// Returns a for t <= 0 and b for t >= 1. Otherwise the dot product of a and b, divided by
+/// sqrt(|a|^2 * |b|^2), gives the angle theta; when its magnitude is not below 1 (or the
+/// comparison fails, e.g. on NaN) a is returned unchanged.
+CVector2f CVector2f::slerp(const CVector2f& a, const CVector2f& b, float t) {
+  if (t <= 0.0f)
    return a;
+  if (t >= 1.0f)
+    return b;
+
+  CVector2f ret;
+
+  float mag = std::sqrt(a.dot(a) * b.dot(b));
+
+  float prod = a.dot(b) / mag;
+
+  if (std::fabs(prod) < 1.0f) {
+    // A negative dot product flips the sign of b's weight so theta stays in [0, pi/2].
+    const double sign = (prod < 0.0f) ? -1.0f : 1.0f;
+
+    const double theta = std::acos(sign * prod);
+    const double s1 = std::sin(sign * t * theta);
+    const double d = 1.0 / std::sin(theta);
+    const double s0 = std::sin((1.0f - t) * theta);
+
+    ret = (a * s0 + b * s1) * d;
+    return ret;
+  }
+  return a;
 }
 }
--- a/src/CVector3f.cpp
+++ b/src/CVector3f.cpp
@ -5,8 +5,7 @@
 #include <cassert>
 #include "zeus/Math.hpp"

-namespace zeus
-{
+namespace zeus {
 const CVector3f CVector3f::skOne(1.f);
 const CVector3f CVector3f::skNegOne(-1.f);
 const CVector3f CVector3f::skZero;
@ -20,59 +19,44 @@ const CVector3f CVector3f::skRadToDegVec(180.0f / M_PIF);
 const CVector3f CVector3f::skDegToRadVec(M_PIF / 180.0f);
 const CVector3d CVector3d::skZero(0.0, 0.0, 0.0);

-CVector3f::CVector3f(const CVector3d& vec)
-{
-#if __SSE__
-    mVec128 = _mm_cvtpd_ps(vec.mVec128[0]);
-    v[2] = vec.v[2];
-#else
-    v[0] = vec.v[0];
-    v[1] = vec.v[1];
-    v[2] = vec.v[2];
-#endif
+// Constructs a single-precision vector from a double-precision one by initializing mSimd from
+// vec.mSimd (presumably a narrowing conversion provided by the simd type -- confirm).
+CVector3f::CVector3f(const CVector3d& vec) : mSimd(vec.mSimd) {}
+
+// Returns the unsigned angle between a and b in radians, in [0, pi].
+// Returns 0 when either vector has zero magnitude.
+float CVector3f::getAngleDiff(const CVector3f& a, const CVector3f& b) {
+  const float mag1 = a.magnitude();
+  const float mag2 = b.magnitude();
+
+  if (!mag1 || !mag2)
+    return 0.f;
+
+  // Rounding can push the normalized dot product marginally outside [-1, 1] (for example when
+  // a and b are parallel), and std::acos returns NaN there. Clamp before taking the arc cosine.
+  float cosTheta = a.dot(b) / (mag1 * mag2);
+  if (cosTheta > 1.f)
+    cosTheta = 1.f;
+  else if (cosTheta < -1.f)
+    cosTheta = -1.f;
+
+  return std::acos(cosTheta);
 }

-float CVector3f::getAngleDiff(const CVector3f& a, const CVector3f& b)
-{
-    float mag1 = a.magnitude();
-    float mag2 = b.magnitude();
-
-    if (!mag1 || !mag2)
-        return 0.f;
-
-    float dot = a.dot(b);
-    float theta = std::acos(dot / (mag1 * mag2));
-    return theta;
-}
-
-CVector3f CVector3f::slerp(const CVector3f& a, const CVector3f& b, float t)
-{
-    if (t <= 0.0f)
-        return a;
-    if (t >= 1.0f)
-        return b;
-
-    CVector3f ret;
-
-    float mag = std::sqrt(a.dot(a) * b.dot(b));
-
-    float prod = a.dot(b) / mag;
-
-    if (std::fabs(prod) < 1.0f)
-    {
-        const double sign = (prod < 0.0f) ? -1.0f : 1.0f;
-
-        const double theta = acos(sign * prod);
-        const double s1 = sin(sign * t * theta);
-        const double d = 1.0 / sin(theta);
-        const double s0 = sin((1.0 - t) * theta);
-
-        ret.x = (float)(a.x * s0 + b.x * s1) * d;
-        ret.y = (float)(a.y * s0 + b.y * s1) * d;
-        ret.z = (float)(a.z * s0 + b.z * s1) * d;
-
-        return ret;
-    }
+// Sine-weighted (spherical) interpolation between a and b.
+// Returns a for t <= 0 and b for t >= 1. In between, a and b are blended with weights
+// sin((1 - t) * theta) / sin(theta) and sin(+/- t * theta) / sin(theta).
+// Falls back to returning a when the vectors are (anti)parallel or the cosine is not a number.
+CVector3f CVector3f::slerp(const CVector3f& a, const CVector3f& b, float t) {
+  // Parameter outside (0, 1): hand back the matching endpoint untouched.
+  if (t <= 0.0f)
    return a;
+  if (t >= 1.0f)
+    return b;
+
+  CVector3f ret;
+
+  // sqrt(|a|^2 * |b|^2), i.e. the product of the two lengths.
+  float mag = std::sqrt(a.dot(a) * b.dot(b));
+
+  // Cosine of the angle between a and b. If mag is 0 this is a division by zero; a NaN result
+  // fails the comparison below and the function returns a.
+  float prod = a.dot(b) / mag;
+
+  // |cos| == 1 would make sin(theta) zero below, so only interpolate for non-parallel inputs.
+  if (std::fabs(prod) < 1.0f) {
+    // sign * prod == |prod|, so theta is the acute angle (0, pi/2].
+    const double sign = (prod < 0.0f) ? -1.0f : 1.0f;
+
+    const double theta = acos(sign * prod);
+    // The sign is carried into b's weight: b is weighted negatively when prod < 0.
+    // NOTE(review): with prod < 0 the result tends toward -b as t approaches 1, while t >= 1
+    // returns b. This looks like the quaternion shortest-arc convention; confirm it is
+    // intended for plain vectors.
+    const double s1 = sin(sign * t * theta);
+    const double d = 1.0 / sin(theta);
+    const double s0 = sin((1.0 - t) * theta);
+
+    // The removed code did this per component in double precision; the weights are now passed
+    // to CVector3f's scalar operators (presumably narrowed to float there -- confirm).
+    ret = (a * s0 + b * s1) * d;
+
+    return ret;
+  }
+  return a;
 }
 }
--- a/src/CVector4f.cpp
+++ b/src/CVector4f.cpp
@ -1,19 +1,13 @@
 #include "zeus/CVector4f.hpp"
 #include "zeus/CColor.hpp"

-namespace zeus
-{
+namespace zeus {
 const CVector4f CVector4f::skZero(0.f, 0.f, 0.f, 0.f);

-CVector4f::CVector4f(const zeus::CColor& other) : x(other.r), y(other.g), z(other.b), w(other.a) {}
+// Builds a vector from a color by copying the color's SIMD storage in one step. The removed
+// initializer list mapped r->x, g->y, b->z, a->w; presumably the lane order preserves that.
+CVector4f::CVector4f(const zeus::CColor& other) : mSimd(other.mSimd) {}

-CVector4f& CVector4f::operator=(const CColor& other)
-{
-    x = other.r;
-    y = other.g;
-    z = other.b;
-    w = other.a;
-
-    return *this;
+// Assigns a color to this vector by copying its SIMD storage (replaces the removed
+// per-component r->x, g->y, b->z, a->w assignments) and returns *this for chaining.
+CVector4f& CVector4f::operator=(const CColor& other) {
+  mSimd = other.mSimd;
+  return *this;
 }
 }
--- a/src/Math.cpp
+++ b/src/Math.cpp
@ -2,312 +2,292 @@
 #include "zeus/CTransform.hpp"
 #include "zeus/CVector3f.hpp"
 #include "zeus/CVector2f.hpp"
+
 #if _WIN32
 #include <intrin.h>
 #else
+
 #include <cpuid.h>
+
 #endif

-namespace zeus
-{
+namespace zeus {

+// Set to true at the end of detectCPU(); later calls to detectCPU() return immediately.
 static bool isCPUInit = false;
+// Vendor/brand strings and feature flags of the running CPU, filled in by detectCPU().
 static CPUInfo g_cpuFeatures = {};
+// Flags for features the build was compiled for but the CPU does not report, set by validateCPU().
 static CPUInfo g_missingFeatures = {};

-void getCpuInfo(int eax, int regs[4])
-{
+// Executes CPUID for leaf `eax` and stores the resulting EAX, EBX, ECX, EDX in regs[0..3].
+// Uses the MSVC intrinsic on Windows and the <cpuid.h> macro elsewhere.
+// When GEKKO is defined to a non-zero value the body is compiled out and regs is left untouched.
+void getCpuInfo(int eax, int regs[4]) {
 #if !GEKKO
 #if _WIN32
-    __cpuid(regs, eax);
+  __cpuid(regs, eax);
 #else
-    __cpuid(eax, regs[0], regs[1], regs[2], regs[3]);
+  __cpuid(eax, regs[0], regs[1], regs[2], regs[3]);
 #endif
 #endif
 }

-void getCpuInfoEx(int eax, int ecx, int regs[4])
-{
+// Executes CPUID for leaf `eax`, sub-leaf `ecx`, and stores EAX, EBX, ECX, EDX in regs[0..3].
+// Uses the MSVC intrinsic on Windows and the <cpuid.h> macro elsewhere.
+// When GEKKO is defined to a non-zero value the body is compiled out and regs is left untouched.
+void getCpuInfoEx(int eax, int ecx, int regs[4]) {
 #if !GEKKO
 #if _WIN32
-    __cpuidex(regs, eax, ecx);
+  __cpuidex(regs, eax, ecx);
 #else
-    __cpuid_count(eax, ecx, regs[0], regs[1], regs[2], regs[3]);
+  __cpuid_count(eax, ecx, regs[0], regs[1], regs[2], regs[3]);
 #endif
 #endif
 }

-void detectCPU()
-{
+// Queries CPUID once and caches the vendor string, brand string and instruction-set flags in
+// g_cpuFeatures. Subsequent calls return immediately (guarded by isCPUInit).
+// When GEKKO is defined to a non-zero value the whole body is compiled out.
+void detectCPU() {
 #if !GEKKO
-    if (isCPUInit)
-        return;
+  if (isCPUInit)
+    return;

-    int regs[4];
-    getCpuInfo(0, regs);
-    int highestFeature = regs[0];
-    *reinterpret_cast<int*>((char*)g_cpuFeatures.cpuVendor) = regs[1];
-    *reinterpret_cast<int*>((char*)g_cpuFeatures.cpuVendor + 4) = regs[3];
-    *reinterpret_cast<int*>((char*)g_cpuFeatures.cpuVendor + 8) = regs[2];
-    getCpuInfo(0x80000000, regs);
-    if (regs[0] >= 0x80000004)
-    {
-        for (unsigned int i = 0x80000002; i <= 0x80000004; i++)
-        {
-            getCpuInfo(i, regs);
-            // Interpret CPU brand string and cache information.
-            if (i == 0x80000002)
-                memcpy((char*)g_cpuFeatures.cpuBrand, regs, sizeof(regs));
-            else if (i == 0x80000003)
-                memcpy((char*)g_cpuFeatures.cpuBrand + 16, regs, sizeof(regs));
-            else if (i == 0x80000004)
-                memcpy((char*)g_cpuFeatures.cpuBrand + 32, regs, sizeof(regs));
-        }
+  int regs[4];
+  // Leaf 0: EAX holds the highest standard leaf; EBX, EDX, ECX (in that order) spell the
+  // 12-byte vendor string.
+  getCpuInfo(0, regs);
+  int highestFeature = regs[0];
+  *reinterpret_cast<int*>((char*) g_cpuFeatures.cpuVendor) = regs[1];
+  *reinterpret_cast<int*>((char*) g_cpuFeatures.cpuVendor + 4) = regs[3];
+  *reinterpret_cast<int*>((char*) g_cpuFeatures.cpuVendor + 8) = regs[2];
+  // Leaf 0x80000000: EAX holds the highest extended leaf. Keep it, because regs is reused by
+  // every query below.
+  getCpuInfo(0x80000000, regs);
+  const unsigned int highestExtFeature = static_cast<unsigned int>(regs[0]);
+  if (highestExtFeature >= 0x80000004) {
+    for (unsigned int i = 0x80000002; i <= 0x80000004; i++) {
+      getCpuInfo(i, regs);
+      // Interpret CPU brand string and cache information.
+      // Each of the three leaves contributes 16 bytes of the brand string.
+      if (i == 0x80000002)
+        memcpy((char*) g_cpuFeatures.cpuBrand, regs, sizeof(regs));
+      else if (i == 0x80000003)
+        memcpy((char*) g_cpuFeatures.cpuBrand + 16, regs, sizeof(regs));
+      else if (i == 0x80000004)
+        memcpy((char*) g_cpuFeatures.cpuBrand + 32, regs, sizeof(regs));
    }
+  }

-    if (highestFeature >= 1)
-    {
-        getCpuInfo(1, regs);
-        memset((bool*)&g_cpuFeatures.AESNI, ((regs[2] & 0x02000000) != 0), 1);
-        memset((bool*)&g_cpuFeatures.SSE1, ((regs[3] & 0x02000000) != 0), 1);
-        memset((bool*)&g_cpuFeatures.SSE2, ((regs[3] & 0x04000000) != 0), 1);
-        memset((bool*)&g_cpuFeatures.SSE3, ((regs[2] & 0x00000001) != 0), 1);
-        memset((bool*)&g_cpuFeatures.SSSE3, ((regs[2] & 0x00000200) != 0), 1);
-        memset((bool*)&g_cpuFeatures.SSE41, ((regs[2] & 0x00080000) != 0), 1);
-        memset((bool*)&g_cpuFeatures.SSE42, ((regs[2] & 0x00100000) != 0), 1);
-        memset((bool*)&g_cpuFeatures.AVX, ((regs[2] & 0x10000000) != 0), 1);
-    }
+  // Leaf 1 feature bits: regs[2] is ECX, regs[3] is EDX. Each flag is written as a single
+  // 0/1 byte through a cast (presumably because the CPUInfo fields are declared const).
+  if (highestFeature >= 1) {
+    getCpuInfo(1, regs);
+    memset((bool*) &g_cpuFeatures.AESNI, ((regs[2] & 0x02000000) != 0), 1);
+    memset((bool*) &g_cpuFeatures.SSE1, ((regs[3] & 0x02000000) != 0), 1);
+    memset((bool*) &g_cpuFeatures.SSE2, ((regs[3] & 0x04000000) != 0), 1);
+    memset((bool*) &g_cpuFeatures.SSE3, ((regs[2] & 0x00000001) != 0), 1);
+    memset((bool*) &g_cpuFeatures.SSSE3, ((regs[2] & 0x00000200) != 0), 1);
+    memset((bool*) &g_cpuFeatures.SSE41, ((regs[2] & 0x00080000) != 0), 1);
+    memset((bool*) &g_cpuFeatures.SSE42, ((regs[2] & 0x00100000) != 0), 1);
+    memset((bool*) &g_cpuFeatures.AVX, ((regs[2] & 0x10000000) != 0), 1);
+  }

-    if (highestFeature >= 7)
-    {
-        getCpuInfoEx(7, 0, regs);
-        memset((bool*)&g_cpuFeatures.AVX2, ((regs[1] & 0x00000020) != 0), 1);
-    }
+  // Leaf 7, sub-leaf 0: EBX bit 5 is AVX2.
+  if (highestFeature >= 7) {
+    getCpuInfoEx(7, 0, regs);
+    memset((bool*) &g_cpuFeatures.AVX2, ((regs[1] & 0x00000020) != 0), 1);
+  }
+
+  // Extended leaf 0x80000001: ECX bit 6 is SSE4a. validateCPU() reads this flag under
+  // __SSE4A__, so it has to be populated here or such builds always report it missing.
+  if (highestExtFeature >= 0x80000001) {
+    getCpuInfo(0x80000001, regs);
+    memset((bool*) &g_cpuFeatures.SSE4a, ((regs[2] & 0x00000040) != 0), 1);
+  }

-    isCPUInit = true;
+  isCPUInit = true;
 #endif
 }

-const CPUInfo& cpuFeatures() { detectCPU(); return g_cpuFeatures; }
+// Returns the cached CPU description, running detectCPU() first so it is populated.
+// The reference is to the file-scope g_cpuFeatures object.
+const CPUInfo& cpuFeatures() {
+  detectCPU();
+  return g_cpuFeatures;
+}

-std::pair<bool, const CPUInfo&> validateCPU()
-{
-    detectCPU();
-    bool ret = true;
+// Checks that the running CPU reports every instruction set this translation unit was compiled
+// for (as indicated by the compiler's predefined __AVX2__, __SSE4_2__, ... macros).
+// Returns {true, g_missingFeatures} when nothing is missing; otherwise the bool is false and
+// the matching flags in g_missingFeatures are set. Flags are only ever set, never cleared.
+std::pair<bool, const CPUInfo&> validateCPU() {
+  detectCPU();
+  bool ret = true;

 #if __AVX2__
-    if (!g_cpuFeatures.AVX2)
-    {
-        *(bool*) &g_missingFeatures.AVX2 = true;
-        ret = false;
-    }
+  if (!g_cpuFeatures.AVX2) {
+    *(bool*) &g_missingFeatures.AVX2 = true;
+    ret = false;
+  }
 #endif
 #if __AVX__
-    if (!g_cpuFeatures.AVX)
-    {
-        *(bool*) &g_missingFeatures.AVX = true;
-        ret = false;
-    }
+  if (!g_cpuFeatures.AVX) {
+    *(bool*) &g_missingFeatures.AVX = true;
+    ret = false;
+  }
 #endif
 #if __SSE4A__
-    if (!g_cpuFeatures.SSE4a)
-    {
-        *(bool*) &g_missingFeatures.SSE4a = true;
-        ret = false;
-    }
+  if (!g_cpuFeatures.SSE4a) {
+    *(bool*) &g_missingFeatures.SSE4a = true;
+    ret = false;
+  }
 #endif
 #if __SSE4_2__
-    if (!g_cpuFeatures.SSE42)
-    {
-        *(bool*) &g_missingFeatures.SSE42 = true;
-        ret = false;
-    }
+  if (!g_cpuFeatures.SSE42) {
+    *(bool*) &g_missingFeatures.SSE42 = true;
+    ret = false;
+  }
 #endif
 #if __SSE4_1__
-    if (!g_cpuFeatures.SSE41)
-    {
-        *(bool*) &g_missingFeatures.SSE41 = true;
-        ret = false;
-    }
+  if (!g_cpuFeatures.SSE41) {
+    *(bool*) &g_missingFeatures.SSE41 = true;
+    ret = false;
+  }
 #endif
 #if __SSSE3__
-    if (!g_cpuFeatures.SSSE3)
-    {
-        *(bool*) &g_missingFeatures.SSSE3 = true;
-        ret = false;
-    }
+  if (!g_cpuFeatures.SSSE3) {
+    *(bool*) &g_missingFeatures.SSSE3 = true;
+    ret = false;
+  }
 #endif
 #if __SSE3__
-    if (!g_cpuFeatures.SSE3)
-    {
-        *(bool*) &g_missingFeatures.SSE3 = true;
-        ret = false;
-    }
+  if (!g_cpuFeatures.SSE3) {
+    *(bool*) &g_missingFeatures.SSE3 = true;
+    ret = false;
+  }
 #endif
 #if __SSE2__
-    if (!g_cpuFeatures.SSE2)
-    {
-        *(bool*) &g_missingFeatures.SSE2 = true;
-        ret = false;
-    }
+  if (!g_cpuFeatures.SSE2) {
+    *(bool*) &g_missingFeatures.SSE2 = true;
+    ret = false;
+  }
 #endif
 #if __SSE__
-    if (!g_cpuFeatures.SSE1)
-    {
-        *(bool*) &g_missingFeatures.SSE1 = true;
-        ret = false;
-    }
+  if (!g_cpuFeatures.SSE1) {
+    *(bool*) &g_missingFeatures.SSE1 = true;
+    ret = false;
+  }
 #endif

-    return {ret, g_missingFeatures};
+  return {ret, g_missingFeatures};
 }

-CTransform lookAt(const CVector3f& pos, const CVector3f& lookPos, const CVector3f& up)
-{
-    CVector3f vLook, vRight, vUp;
+// Builds a transform positioned at `pos` whose basis is (right, look, up), with `look` pointing
+// from pos toward lookPos and `up` derived from the supplied up hint.
+CTransform lookAt(const CVector3f& pos, const CVector3f& lookPos, const CVector3f& up) {
+  CVector3f vLook, vRight, vUp;

-    vLook = lookPos - pos;
-    if (vLook.magnitude() <= FLT_EPSILON)
-        vLook = {0.f, 1.f, 0.f};
-    else
-        vLook.normalize();
+  // Forward direction; if pos and lookPos coincide, fall back to +Y.
+  vLook = lookPos - pos;
+  if (vLook.magnitude() <= FLT_EPSILON)
+    vLook = {0.f, 1.f, 0.f};
+  else
+    vLook.normalize();

-    vUp = up - vLook * clamp(-1.f, up.dot(vLook), 1.f);
+  // Remove the component of `up` that lies along vLook (the projection coefficient is limited
+  // to [-1, 1] by clamp) so the remainder is perpendicular to the view direction.
+  vUp = up - vLook * clamp(-1.f, up.dot(vLook), 1.f);
+  // If the hint was parallel to vLook, retry with +Z, then with +Y, as the up hint.
+  if (vUp.magnitude() <= FLT_EPSILON) {
+    vUp = CVector3f(0.f, 0.f, 1.f) - vLook * vLook.z();
    if (vUp.magnitude() <= FLT_EPSILON)
-    {
-        vUp = CVector3f(0.f, 0.f, 1.f) - vLook * vLook.z;
-        if (vUp.magnitude() <= FLT_EPSILON)
-            vUp = CVector3f(0.f, 1.f, 0.f) - vLook * vLook.y;
-    }
-    vUp.normalize();
-    vRight = vLook.cross(vUp);
+      vUp = CVector3f(0.f, 1.f, 0.f) - vLook * vLook.y();
+  }
+  vUp.normalize();
+  // Third axis from the cross product of the other two.
+  vRight = vLook.cross(vUp);

-    CMatrix3f rmBasis(vRight, vLook, vUp);
-    return CTransform(rmBasis, pos);
+  CMatrix3f rmBasis(vRight, vLook, vUp);
+  return CTransform(rmBasis, pos);
 }

 CVector3f getBezierPoint(const CVector3f& a, const CVector3f& b,
-                         const CVector3f& c, const CVector3f& d, float t)
-{
-    const float omt = 1.f - t;
-    return ((a * omt + b * t) * omt + (b * omt + c * t) * t) * omt +
-           ((b * omt + c * t) * omt + (c * omt + d * t) * t) * t;
+                         const CVector3f& c, const CVector3f& d, float t) {
+  const float omt = 1.f - t;
+  return ((a * omt + b * t) * omt + (b * omt + c * t) * t) * omt +
+         ((b * omt + c * t) * omt + (c * omt + d * t) * t) * t;
 }

-int floorPowerOfTwo(int x)
-{
-    if (x == 0)
-        return 0;
-    /*
-     * we want to ensure that we always get the previous power,
-     * but if we have values like 256, we'll always get the same value,
-     * x-1 ensures that we always get the previous power.
-     */
-    x = (x - 1) | (x >> 1);
-    x = x | (x >> 2);
-    x = x | (x >> 4);
-    x = x | (x >> 8);
-    x = x | (x >> 16);
-    return x - (x >> 1);
+// Returns the largest power of two strictly below x for x >= 2, so an exact power such as 256
+// maps to 128. Both 0 and 1 yield 0.
+// NOTE(review): negative inputs are not treated specially.
+int floorPowerOfTwo(int x) {
+  if (x == 0)
+    return 0;
+  /*
+   * Seeding with x - 1 is what makes an exact power of two fall back to the
+   * previous power instead of mapping to itself.
+   */
+  int bits = (x - 1) | (x >> 1);
+  // Smear the highest set bit into every lower position.
+  for (int shift = 2; shift <= 16; shift <<= 1)
+    bits |= bits >> shift;
+  // All-ones below the top bit minus its lower half leaves only the top bit.
+  return bits - (bits >> 1);
 }

-int ceilingPowerOfTwo(int x)
-{
-    if (x == 0)
-        return 0;
+// Rounds x up to the next power of two; an exact power of two is returned unchanged and
+// 0 yields 0.
+// NOTE(review): negative inputs are not treated specially.
+int ceilingPowerOfTwo(int x) {
+  if (x == 0)
+    return 0;

-    x--;
-    x |= x >> 1;
-    x |= x >> 2;
-    x |= x >> 4;
-    x |= x >> 8;
-    x |= x >> 16;
-    x++;
+  // Step back by one so exact powers stay put, smear the top bit downwards, then step forward
+  // to the power of two just above the all-ones pattern.
+  int bits = x - 1;
+  for (int shift = 1; shift <= 16; shift <<= 1)
+    bits |= bits >> shift;
+  bits += 1;

-    return x;
+  return bits;
 }

-float getCatmullRomSplinePoint(float a, float b, float c, float d, float t)
-{
-    if (t <= 0.0f)
-        return b;
-    if (t >= 1.0f)
-        return c;
+// Scalar Catmull-Rom spline through control values a, b, c, d, evaluated on the b..c segment.
+// t <= 0 returns b and t >= 1 returns c; otherwise the four cubic basis weights are applied.
+float getCatmullRomSplinePoint(float a, float b, float c, float d, float t) {
+  if (t <= 0.0f)
+    return b;
+  if (t >= 1.0f)
+    return c;

-    const float t2 = t * t;
-    const float t3 = t2 * t;
+  const float tSq = t * t;
+  const float tCu = tSq * t;

-    return (a * (-0.5f * t3 + t2 - 0.5f * t) + b * (1.5f * t3 + -2.5f * t2 + 1.0f) + c * (-1.5f * t3 + 2.0f * t2 + 0.5f * t) +
-            d * (0.5f * t3 - 0.5f * t2));
+  // Basis weights for each control value; they sum to 1.
+  const float wA = -0.5f * tCu + tSq - 0.5f * t;
+  const float wB = 1.5f * tCu + -2.5f * tSq + 1.0f;
+  const float wC = -1.5f * tCu + 2.0f * tSq + 0.5f * t;
+  const float wD = 0.5f * tCu - 0.5f * tSq;
+
+  return a * wA + b * wB + c * wC + d * wD;
 }

-CVector3f getCatmullRomSplinePoint(const CVector3f& a, const CVector3f& b, const CVector3f& c, const CVector3f& d, float t)
-{
-    if (t <= 0.0f)
-        return b;
-    if (t >= 1.0f)
-        return c;
+// Vector Catmull-Rom spline through control points a, b, c, d, evaluated on the b..c segment.
+// t <= 0 returns b and t >= 1 returns c; otherwise the four cubic basis weights are applied.
+CVector3f
+getCatmullRomSplinePoint(const CVector3f& a, const CVector3f& b, const CVector3f& c, const CVector3f& d, float t) {
+  if (t <= 0.0f)
+    return b;
+  if (t >= 1.0f)
+    return c;

-    const float t2 = t * t;
-    const float t3 = t2 * t;
+  const float tSq = t * t;
+  const float tCu = tSq * t;

-    return (a * (-0.5f * t3 + t2 - 0.5f * t) + b * (1.5f * t3 + -2.5f * t2 + 1.0f) + c * (-1.5f * t3 + 2.0f * t2 + 0.5f * t) +
-            d * (0.5f * t3 - 0.5f * t2));
+  // Basis weights for each control point; they sum to 1.
+  const float wA = -0.5f * tCu + tSq - 0.5f * t;
+  const float wB = 1.5f * tCu + -2.5f * tSq + 1.0f;
+  const float wC = -1.5f * tCu + 2.0f * tSq + 0.5f * t;
+  const float wD = 0.5f * tCu - 0.5f * tSq;
+
+  return a * wA + b * wB + c * wC + d * wD;
 }

-CVector3f getRoundCatmullRomSplinePoint(const CVector3f& a, const CVector3f& b, const CVector3f& c, const CVector3f& d, float t)
-{
-    if (t >= 0.0f)
-        return b;
-    if (t <= 1.0f)
-        return c;
+// Evaluates a "rounded" spline on the b..c segment. End tangents are derived from the
+// neighbouring points a and d, scaled to the length of the segment.
+// t <= 0 returns b and t >= 1 returns c. A degenerate segment (c - b not normalizable)
+// returns b.
+CVector3f
+getRoundCatmullRomSplinePoint(const CVector3f& a, const CVector3f& b, const CVector3f& c, const CVector3f& d, float t) {
+  // The guards used to read `t >= 0` / `t <= 1`, which returned for every non-NaN t and left
+  // the code below unreachable. They now match getCatmullRomSplinePoint.
+  if (t <= 0.0f)
+    return b;
+  if (t >= 1.0f)
+    return c;

-    CVector3f cb = c - b;
-    if (!cb.canBeNormalized())
-        return b;
-    CVector3f ab = a - b;
-    if (!ab.canBeNormalized())
-        ab = CVector3f(0, 1, 0);
-    CVector3f bVelocity = cb.normalized() - ab.normalized();
-    if (bVelocity.canBeNormalized())
-        bVelocity.normalize();
-    CVector3f dc = d - c;
-    if (!dc.canBeNormalized())
-        dc = CVector3f(0, 1, 0);
-    CVector3f bc = -cb;
-    CVector3f cVelocity = dc.normalized() - bc.normalized();
-    if (cVelocity.canBeNormalized())
-        cVelocity.normalize();
-    const float cbDistance = cb.magnitude();
-    return zeus::getCatmullRomSplinePoint(b, c, bVelocity * cbDistance, cVelocity * cbDistance, t);
+  // Segment being interpolated; nothing to do if its endpoints coincide.
+  CVector3f cb = c - b;
+  if (!cb.canBeNormalized())
+    return b;
+  // Direction from b back to a, with +Y as a stand-in when a coincides with b.
+  CVector3f ab = a - b;
+  if (!ab.canBeNormalized())
+    ab = CVector3f(0, 1, 0);
+  // Tangent direction at b: difference of the unit directions toward c and toward a.
+  CVector3f bVelocity = cb.normalized() - ab.normalized();
+  if (bVelocity.canBeNormalized())
+    bVelocity.normalize();
+  // Direction from c on to d, with +Y as a stand-in when d coincides with c.
+  CVector3f dc = d - c;
+  if (!dc.canBeNormalized())
+    dc = CVector3f(0, 1, 0);
+  CVector3f bc = -cb;
+  // Tangent direction at c: difference of the unit directions toward d and toward b.
+  CVector3f cVelocity = dc.normalized() - bc.normalized();
+  if (cVelocity.canBeNormalized())
+    cVelocity.normalize();
+  const float cbDistance = cb.magnitude();
+  // NOTE(review): the helper below treats its four arguments as control points and
+  // interpolates between the 2nd and 3rd. Here the 3rd and 4th arguments are scaled tangents,
+  // which looks like a Hermite-style (point, point, tangent, tangent) call -- confirm the
+  // intended evaluator before relying on values for 0 < t < 1.
+  return zeus::getCatmullRomSplinePoint(b, c, bVelocity * cbDistance, cVelocity * cbDistance, t);
 }

-CVector3f baryToWorld(const CVector3f& p0, const CVector3f& p1, const CVector3f& p2, const CVector3f& bary)
-{
-    return bary.x * p0 + bary.y * p1 + bary.z * p2;
+// Converts barycentric coordinates to a point: the x, y, z components of `bary` weight the
+// triangle corners p0, p1, p2 respectively.
+CVector3f baryToWorld(const CVector3f& p0, const CVector3f& p1, const CVector3f& p2, const CVector3f& bary) {
+  const float w0 = bary.x();
+  const float w1 = bary.y();
+  const float w2 = bary.z();
+  return w0 * p0 + w1 * p1 + w2 * p2;
 }

-bool close_enough(const CVector3f& a, const CVector3f &b, float epsilon)
-{
-    if (std::fabs(a.x - b.x) < epsilon && std::fabs(a.y - b.y) < epsilon && std::fabs(a.z - b.z) < epsilon)
-        return true;
-    return false;
+// True when every component of a and b differs by strictly less than epsilon.
+bool close_enough(const CVector3f& a, const CVector3f& b, float epsilon) {
+  // Bail out on the first component that is not within tolerance.
+  if (!(std::fabs(a.x() - b.x()) < epsilon))
+    return false;
+  if (!(std::fabs(a.y() - b.y()) < epsilon))
+    return false;
+  return std::fabs(a.z() - b.z()) < epsilon;
 }

-bool close_enough(const CVector2f& a, const CVector2f& b, float epsilon)
-{
-    if (std::fabs(a.x - b.x) < epsilon && std::fabs(a.y - b.y) < epsilon)
-        return true;
-    return false;
+// True when both components of a and b differ by strictly less than epsilon.
+bool close_enough(const CVector2f& a, const CVector2f& b, float epsilon) {
+  // Bail out as soon as x is not within tolerance; otherwise y decides.
+  if (!(std::fabs(a.x() - b.x()) < epsilon))
+    return false;
+  return std::fabs(a.y() - b.y()) < epsilon;
 }

-template <> CVector3f min(const CVector3f& a, const CVector3f& b)
-{
-    return {min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)};
+// CVector3f specialization: component-wise minimum of a and b, built from the scalar min.
+template<>
+CVector3f min(const CVector3f& a, const CVector3f& b) {
+  const float lowX = min(a.x(), b.x());
+  const float lowY = min(a.y(), b.y());
+  const float lowZ = min(a.z(), b.z());
+  return {lowX, lowY, lowZ};
 }

-template <> CVector3f max(const CVector3f& a, const CVector3f& b)
-{
-    return {max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)};
+// CVector3f specialization: component-wise maximum of a and b, built from the scalar max.
+template<>
+CVector3f max(const CVector3f& a, const CVector3f& b) {
+  const float highX = max(a.x(), b.x());
+  const float highY = max(a.y(), b.y());
+  const float highZ = max(a.z(), b.z());
+  return {highX, highY, highZ};
 }
 }
--- a/test/main.cpp
+++ b/test/main.cpp
@ -30,6 +30,9 @@ int main()
    CAABox test2{{-100, -100, -100}, {100, 100, 100}};
    CAABox test3{{-50, -50, -50}, {50, 50, 50}};
    CAABox test4{{-50, -50, -105}, {50, 50, 105}};
+    CVector2f point2(-90, 67);
+    CVector2f point3(-90, 67);
+    CVector3f point4 = point2 + point3;
    CVector3f point(-90, 67, -105);
    test.closestPointAlongVector(point);
    CVector3d(100, -100, -200);
@ -72,7 +75,7 @@ int main()
    ctest1.fromHSV(0, 255 / 255.f, .5);
    float h, s, v;
    ctest1.toHSV(h, s, v);
-    std::cout << (int)ctest1.r << " " << (int)ctest1.g << " " << (int)ctest1.b << " " << (int)ctest1.a << std::endl;
-    std::cout << h << " " << s << " " << v << " " << (float)(ctest1.a / 255.f) << std::endl;
+    std::cout << (int)ctest1.r() << " " << (int)ctest1.g() << " " << (int)ctest1.b() << " " << (int)ctest1.a() << std::endl;
+    std::cout << h << " " << s << " " << v << " " << (float)(ctest1.a() / 255.f) << std::endl;
    return 0;
 }