From 692dc1adfb0e5c7f43b220c6a2204e40b01598e9 Mon Sep 17 00:00:00 2001
From: Jack Andersen <jackoalan@gmail.com>
Date: Mon, 18 Dec 2017 17:02:59 -1000
Subject: [PATCH] Add AVX intrinsics for CVector3d; rework canBeNormalized()

---
 include/zeus/CVector2f.hpp    |  7 +++-
 include/zeus/CVector3d.hpp    | 78 ++++++++++++++++++++++++++++-------
 include/zeus/CVector3f.hpp    |  4 +-
 include/zeus/CVector4f.hpp    |  7 +++-
 include/zeus/Global.hpp       | 11 +++++
 include/zeus/TVectorUnion.hpp |  3 ++
 6 files changed, 91 insertions(+), 19 deletions(-)

diff --git a/include/zeus/CVector2f.hpp b/include/zeus/CVector2f.hpp
index 4c5f598..8496ed9 100644
--- a/include/zeus/CVector2f.hpp
+++ b/include/zeus/CVector2f.hpp
@@ -333,7 +333,12 @@ public:
 
     inline bool isNormalized() const { return std::fabs(1.f - magSquared()) < 0.01f; }
 
-    inline bool canBeNormalized() const { return !isNormalized(); }
+    inline bool canBeNormalized() const // true when no component is infinite and at least one has |c| >= FLT_EPSILON
+    {
+        const bool anyInfinite = std::isinf(x) || std::isinf(y); // any infinite component rejects the vector
+        const bool hasMagnitude = std::fabs(x) >= FLT_EPSILON || std::fabs(y) >= FLT_EPSILON;
+        return !anyInfinite && hasMagnitude;
+    }
 
     inline bool isZero() const { return magSquared() <= 1.1920929e-7f; }
 
diff --git a/include/zeus/CVector3d.hpp b/include/zeus/CVector3d.hpp
index 232450f..ccf33c0 100644
--- a/include/zeus/CVector3d.hpp
+++ b/include/zeus/CVector3d.hpp
@@ -1,6 +1,7 @@
 #ifndef CVECTOR3D_HPP
 #define CVECTOR3D_HPP
 
+#include <athena/Types.hpp>
 #include "Global.hpp"
 #include "zeus/Math.hpp"
 #include "TVectorUnion.hpp"
@@ -8,12 +9,19 @@
 
 namespace zeus
 {
-class alignas(16) CVector3d
+class alignas(32) CVector3d
 {
 public:
-    ZE_DECLARE_ALIGNED_ALLOCATOR();
+    ZE_DECLARE_ALIGNED_ALLOCATOR32();
     CVector3d() { zeroOut(); }
-#if __SSE__
+
+#if __AVX__
+    CVector3d(const __m256d& mVec256) // builds a vector from a raw AVX register; left implicit so intrinsic results can be returned directly
+    {
+        this->mVec256 = mVec256; // parameter shadows the member, hence the explicit this->
+        v[3] = 0.0; // clear the fourth slot after the copy; only x, y, z carry data
+    }
+#elif __SSE__
     CVector3d(const __m128d mVec128[2])
     {
         this->mVec128[0] = mVec128[0];
@@ -24,7 +32,9 @@ public:
 #if ZE_ATHENA_TYPES
     CVector3d(const atVec3d& vec)
     {
-#if __SSE__
+#if __AVX__
+        mVec256 = vec.mVec256;
+#elif __SSE__
         mVec128[0] = vec.mVec128[0];
         mVec128[1] = vec.mVec128[1];
 #else
@@ -37,20 +47,25 @@ public:
 
     CVector3d(const CVector3f& vec)
     {
-#if __SSE__
+#if __AVX__
+        mVec256 = _mm256_cvtps_pd(vec.mVec128); // widen the four packed floats to four doubles
+#elif __SSE__
         mVec128[0] = _mm_cvtps_pd(vec.mVec128);
         v[2] = vec[2];
-        v[3] = 0.0;
 #else
         v[0] = vec[0];
         v[1] = vec[1];
         v[2] = vec[2];
 #endif
+        v[3] = 0.0; // placed after #endif so the AVX, SSE and scalar paths all leave the fourth slot zeroed
     }
 
     CVector3d(double x, double y, double z)
     {
-#if __SSE__
+#if __AVX__
+        TDblVectorUnion splat{x, y, z, 0.0};
+        mVec256 = splat.mVec256;
+#elif __SSE__
         TDblVectorUnion splat{x, y, z, 0.0};
         mVec128[0] = splat.mVec128[0];
         mVec128[1] = splat.mVec128[1];
@@ -58,12 +73,17 @@ public:
         v[0] = x;
         v[1] = y;
         v[2] = z;
+        v[3] = 0.0;
 #endif
     }
 
     CVector3f asCVector3f()
     {
+#if __AVX__
+        return CVector3f(_mm256_cvtpd_ps(mVec256)); // AVX: narrow the packed doubles to packed floats in one call
+#else
         return CVector3f(float(x), float(y), float(z));
+#endif // __AVX__
     }
 
     double magSquared() const
@@ -115,7 +135,10 @@ public:
 
     void splat(double xyz)
     {
-#if __SSE__
+#if __AVX__
+        TDblVectorUnion splat = {xyz, xyz, xyz, 0.0};
+        mVec256 = splat.mVec256;
+#elif __SSE__
         TDblVectorUnion splat = {xyz, xyz, xyz, 0.0};
         mVec128[0] = splat.mVec128[0];
         mVec128[1] = splat.mVec128[1];
@@ -134,7 +157,9 @@ public:
 
     inline CVector3d operator+(const CVector3d& rhs) const
     {
-#if __SSE__
+#if __AVX__
+        return _mm256_add_pd(mVec256, rhs.mVec256);
+#elif __SSE__
         const __m128d tmpVec128[2] = {_mm_add_pd(mVec128[0], rhs.mVec128[0]),
                                       _mm_add_pd(mVec128[1], rhs.mVec128[1])};
         return CVector3d(tmpVec128);
@@ -144,7 +169,9 @@ public:
     }
     inline CVector3d operator-(const CVector3d& rhs) const
     {
-#if __SSE__
+#if __AVX__
+        return _mm256_sub_pd(mVec256, rhs.mVec256);
+#elif __SSE__
         const __m128d tmpVec128[2] = {_mm_sub_pd(mVec128[0], rhs.mVec128[0]),
                                       _mm_sub_pd(mVec128[1], rhs.mVec128[1])};
         return CVector3d(tmpVec128);
@@ -154,7 +181,9 @@ public:
     }
     inline CVector3d operator*(const CVector3d& rhs) const
     {
-#if __SSE__
+#if __AVX__
+        return _mm256_mul_pd(mVec256, rhs.mVec256);
+#elif __SSE__
         const __m128d tmpVec128[2] = {_mm_mul_pd(mVec128[0], rhs.mVec128[0]),
                                       _mm_mul_pd(mVec128[1], rhs.mVec128[1])};
         return CVector3d(tmpVec128);
@@ -164,7 +193,9 @@ public:
     }
     inline CVector3d operator/(const CVector3d& rhs) const
     {
-#if __SSE__
+#if __AVX__
+        return _mm256_div_pd(mVec256, rhs.mVec256);
+#elif __SSE__
         const __m128d tmpVec128[2] = {_mm_div_pd(mVec128[0], rhs.mVec128[0]),
                                       _mm_div_pd(mVec128[1], rhs.mVec128[1])};
         return CVector3d(tmpVec128);
@@ -182,6 +213,9 @@ public:
             double x, y, z;
         };
         double v[4];
+#if __AVX__
+        __m256d mVec256;
+#endif
 #if __SSE__
         __m128d mVec128[2];
 #endif
@@ -192,7 +226,10 @@ public:
 
 static inline CVector3d operator+(double lhs, const CVector3d& rhs)
 {
-#if __SSE__
+#if __AVX__
+    TDblVectorUnion splat{lhs, lhs, lhs, 0};
+    return _mm256_add_pd(splat.mVec256, rhs.mVec256);
+#elif __SSE__
     TDblVectorUnion splat{lhs, lhs, lhs, 0};
     splat.mVec128[0] = _mm_add_pd(splat.mVec128[0], rhs.mVec128[0]);
     splat.mVec128[1] = _mm_add_pd(splat.mVec128[1], rhs.mVec128[1]);
@@ -204,7 +241,10 @@ static inline CVector3d operator+(double lhs, const CVector3d& rhs)
 
 static inline CVector3d operator-(double lhs, const CVector3d& rhs)
 {
-#if __SSE__
+#if __AVX__
+    TDblVectorUnion splat{lhs, lhs, lhs, 0};
+    return _mm256_sub_pd(splat.mVec256, rhs.mVec256);
+#elif __SSE__
     TDblVectorUnion splat{lhs, lhs, lhs, 0};
     splat.mVec128[0] = _mm_sub_pd(splat.mVec128[0], rhs.mVec128[0]);
     splat.mVec128[1] = _mm_sub_pd(splat.mVec128[1], rhs.mVec128[1]);
@@ -216,7 +256,10 @@ static inline CVector3d operator-(double lhs, const CVector3d& rhs)
 
 static inline CVector3d operator*(double lhs, const CVector3d& rhs)
 {
-#if __SSE__
+#if __AVX__
+    TDblVectorUnion splat{lhs, lhs, lhs, 0};
+    return _mm256_mul_pd(splat.mVec256, rhs.mVec256);
+#elif __SSE__
     TDblVectorUnion splat{lhs, lhs, lhs, 0};
     splat.mVec128[0] = _mm_mul_pd(splat.mVec128[0], rhs.mVec128[0]);
     splat.mVec128[1] = _mm_mul_pd(splat.mVec128[1], rhs.mVec128[1]);
@@ -228,7 +271,10 @@ static inline CVector3d operator*(double lhs, const CVector3d& rhs)
 
 static inline CVector3d operator/(double lhs, const CVector3d& rhs)
 {
-#if __SSE__
+#if __AVX__
+    TDblVectorUnion splat{lhs, lhs, lhs, 0};
+    return _mm256_div_pd(splat.mVec256, rhs.mVec256);
+#elif __SSE__
     TDblVectorUnion splat{lhs, lhs, lhs, 0};
     splat.mVec128[0] = _mm_div_pd(splat.mVec128[0], rhs.mVec128[0]);
     splat.mVec128[1] = _mm_div_pd(splat.mVec128[1], rhs.mVec128[1]);
diff --git a/include/zeus/CVector3f.hpp b/include/zeus/CVector3f.hpp
index 2cf76f5..a63f35f 100644
--- a/include/zeus/CVector3f.hpp
+++ b/include/zeus/CVector3f.hpp
@@ -338,7 +338,9 @@ public:
 
     inline bool canBeNormalized() const
     {
-        return (x < FLT_EPSILON || y < FLT_EPSILON || z < FLT_EPSILON);
+        const bool anyInfinite = std::isinf(x) || std::isinf(y) || std::isinf(z); // any infinite component rejects the vector
+        const bool hasMagnitude = std::fabs(x) >= FLT_EPSILON || std::fabs(y) >= FLT_EPSILON || std::fabs(z) >= FLT_EPSILON;
+        return !anyInfinite && hasMagnitude; // needs at least one component with |c| >= FLT_EPSILON
     }
 
     inline bool isZero() const { return magSquared() <= 1.1920929e-7f; }
diff --git a/include/zeus/CVector4f.hpp b/include/zeus/CVector4f.hpp
index 9ae3175..6f8acf7 100644
--- a/include/zeus/CVector4f.hpp
+++ b/include/zeus/CVector4f.hpp
@@ -363,7 +363,12 @@ public:
 
     inline bool isNormalized() const { return std::fabs(1.f - magSquared()) < 0.01f; }
 
-    inline bool canBeNormalized() const { return !isNormalized(); }
+    inline bool canBeNormalized() const // true when no component is infinite and at least one has |c| >= FLT_EPSILON
+    {
+        const bool anyInfinite = std::isinf(x) || std::isinf(y) || std::isinf(z) || std::isinf(w); // any infinite component rejects the vector
+        const bool hasMagnitude = std::fabs(x) >= FLT_EPSILON || std::fabs(y) >= FLT_EPSILON || std::fabs(z) >= FLT_EPSILON || std::fabs(w) >= FLT_EPSILON;
+        return !anyInfinite && hasMagnitude;
+    }
 
     inline bool isEqu(const CVector4f& other, float epsilon = 1.1920929e-7f)
     {
diff --git a/include/zeus/Global.hpp b/include/zeus/Global.hpp
index d5af42e..a2c6849 100644
--- a/include/zeus/Global.hpp
+++ b/include/zeus/Global.hpp
@@ -29,8 +29,19 @@
     inline void* operator new[](size_t, void* ptr) { return ptr; }                                                             \
     inline void operator delete[](void*, void*) {}                                                                             \
     void __unused__()
+#define ZE_DECLARE_ALIGNED_ALLOCATOR32()                                                                                       \
+    inline void* operator new(size_t sizeInBytes) { return zeAlloc(sizeInBytes, 32); }                                         \
+    inline void operator delete(void* ptr) { zeFree(ptr); }                                                                    \
+    inline void* operator new(size_t, void* ptr) { return ptr; }                                                               \
+    inline void operator delete(void*, void*) {}                                                                               \
+    inline void* operator new[](size_t sizeInBytes) { return zeAlloc(sizeInBytes, 32); }                                       \
+    inline void operator delete[](void* ptr) { zeFree(ptr); }                                                                  \
+    inline void* operator new[](size_t, void* ptr) { return ptr; }                                                             \
+    inline void operator delete[](void*, void*) {}                                                                             \
+    void __unused__() /* trailing declaration absorbs the ';' written after the macro invocation */
 #else
 #define ZE_DECLARE_ALIGNED_ALLOCATOR() void __unused__()
+#define ZE_DECLARE_ALIGNED_ALLOCATOR32() void __unused__() /* fallback: only the placeholder declaration, no operator new/delete overloads */
 #endif
 #endif
 
 #if __SSE__
diff --git a/include/zeus/TVectorUnion.hpp b/include/zeus/TVectorUnion.hpp
index a772c02..8f7d6f9 100644
--- a/include/zeus/TVectorUnion.hpp
+++ b/include/zeus/TVectorUnion.hpp
@@ -12,6 +12,9 @@ typedef union {
 
 typedef union {
     double v[4];
+#if __AVX__
+    __m256d mVec256;
+#endif
 #if __SSE__
     __m128d mVec128[2];
 #endif