Add AVX intrinsics for CVector3d; rework canBeNormalized() in CVector2f/3f/4f

2025-10-05 09:29:37 +00:00 · 2017-12-18 17:02:59 -10:00 · 2017-12-18 17:02:59 -10:00 · 692dc1adfb
commit 692dc1adfb
parent b29b181570
6 changed files with 91 additions and 19 deletions
--- a/include/zeus/CVector2f.hpp
+++ b/include/zeus/CVector2f.hpp
@ -333,7 +333,12 @@ public:

    inline bool isNormalized() const { return std::fabs(1.f - magSquared()) < 0.01f; }

-    inline bool canBeNormalized() const { return !isNormalized(); }
+    inline bool canBeNormalized() const
+    {
+        if (std::isinf(x) || std::isinf(y))
+            return false;
+        return std::fabs(x) >= FLT_EPSILON || std::fabs(y) >= FLT_EPSILON;
+    }

    inline bool isZero() const { return magSquared() <= 1.1920929e-7f; }

--- a/include/zeus/CVector3d.hpp
+++ b/include/zeus/CVector3d.hpp
@ -1,6 +1,7 @@
 #ifndef CVECTOR3D_HPP
 #define CVECTOR3D_HPP

+#include <athena/Types.hpp>
 #include "Global.hpp"
 #include "zeus/Math.hpp"
 #include "TVectorUnion.hpp"
@ -8,12 +9,19 @@

 namespace zeus
 {
-class alignas(16) CVector3d
+class alignas(32) CVector3d
 {
 public:
-    ZE_DECLARE_ALIGNED_ALLOCATOR();
+    ZE_DECLARE_ALIGNED_ALLOCATOR32();
    CVector3d() { zeroOut(); }
-#if __SSE__
+
+#if __AVX__
+    CVector3d(const __m256d& mVec256)
+    {
+        this->mVec256 = mVec256;
+        v[3] = 0.0;
+    }
+#elif __SSE__
    CVector3d(const __m128d mVec128[2])
    {
        this->mVec128[0] = mVec128[0];
@ -24,7 +32,9 @@ public:
 #if ZE_ATHENA_TYPES
    CVector3d(const atVec3d& vec)
    {
-#if __SSE__
+#if __AVX__
+        mVec256 = vec.mVec256;
+#elif __SSE__
        mVec128[0] = vec.mVec128[0];
        mVec128[1] = vec.mVec128[1];
 #else
@ -37,20 +47,25 @@ public:

    CVector3d(const CVector3f& vec)
    {
-#if __SSE__
+#if __AVX__
+        mVec256 = _mm256_cvtps_pd(vec.mVec128);
+#elif __SSE__
        mVec128[0] = _mm_cvtps_pd(vec.mVec128);
        v[2] = vec[2];
-        v[3] = 0.0;
 #else
        v[0] = vec[0];
        v[1] = vec[1];
        v[2] = vec[2];
+        v[3] = 0.0;
 #endif
    }

    CVector3d(double x, double y, double z)
    {
-#if __SSE__
+#if __AVX__
+        TDblVectorUnion splat{x, y, z, 0.0};
+        mVec256 = splat.mVec256;
+#elif __SSE__
        TDblVectorUnion splat{x, y, z, 0.0};
        mVec128[0] = splat.mVec128[0];
        mVec128[1] = splat.mVec128[1];
@ -58,12 +73,17 @@ public:
        v[0] = x;
        v[1] = y;
        v[2] = z;
+        v[3] = 0.0;
 #endif
    }

    CVector3f asCVector3f()
    {
+#if __AVX__
+        return CVector3f(_mm256_cvtpd_ps(mVec256));
+#else
        return CVector3f(float(x), float(y), float(z));
+#endif
    }

    double magSquared() const
@ -115,7 +135,10 @@ public:

    void splat(double xyz)
    {
-#if __SSE__
+#if __AVX__
+        TDblVectorUnion splat = {xyz, xyz, xyz, 0.0};
+        mVec256 = splat.mVec256;
+#elif __SSE__
        TDblVectorUnion splat = {xyz, xyz, xyz, 0.0};
        mVec128[0] = splat.mVec128[0];
        mVec128[1] = splat.mVec128[1];
@ -134,7 +157,9 @@ public:

    inline CVector3d operator+(const CVector3d& rhs) const
    {
-#if __SSE__
+#if __AVX__
+        return _mm256_add_pd(mVec256, rhs.mVec256);
+#elif __SSE__
        const __m128d tmpVec128[2] = {_mm_add_pd(mVec128[0], rhs.mVec128[0]),
                                      _mm_add_pd(mVec128[1], rhs.mVec128[1])};
        return CVector3d(tmpVec128);
@ -144,7 +169,9 @@ public:
    }
    inline CVector3d operator-(const CVector3d& rhs) const
    {
-#if __SSE__
+#if __AVX__
+        return _mm256_sub_pd(mVec256, rhs.mVec256);
+#elif __SSE__
        const __m128d tmpVec128[2] = {_mm_sub_pd(mVec128[0], rhs.mVec128[0]),
                                      _mm_sub_pd(mVec128[1], rhs.mVec128[1])};
        return CVector3d(tmpVec128);
@ -154,7 +181,9 @@ public:
    }
    inline CVector3d operator*(const CVector3d& rhs) const
    {
-#if __SSE__
+#if __AVX__
+        return _mm256_mul_pd(mVec256, rhs.mVec256);
+#elif __SSE__
        const __m128d tmpVec128[2] = {_mm_mul_pd(mVec128[0], rhs.mVec128[0]),
                                      _mm_mul_pd(mVec128[1], rhs.mVec128[1])};
        return CVector3d(tmpVec128);
@ -164,7 +193,9 @@ public:
    }
    inline CVector3d operator/(const CVector3d& rhs) const
    {
-#if __SSE__
+#if __AVX__
+        return _mm256_div_pd(mVec256, rhs.mVec256);
+#elif __SSE__
        const __m128d tmpVec128[2] = {_mm_div_pd(mVec128[0], rhs.mVec128[0]),
                                      _mm_div_pd(mVec128[1], rhs.mVec128[1])};
        return CVector3d(tmpVec128);
@ -182,6 +213,9 @@ public:
            double x, y, z;
        };
        double v[4];
+#if __AVX__
+        __m256d mVec256;
+#endif
 #if __SSE__
        __m128d mVec128[2];
 #endif
@ -192,7 +226,10 @@ public:

 static inline CVector3d operator+(double lhs, const CVector3d& rhs)
 {
-#if __SSE__
+#if __AVX__
+    TDblVectorUnion splat{lhs, lhs, lhs, 0};
+    return _mm256_add_pd(splat.mVec256, rhs.mVec256);
+#elif __SSE__
    TDblVectorUnion splat{lhs, lhs, lhs, 0};
    splat.mVec128[0] = _mm_add_pd(splat.mVec128[0], rhs.mVec128[0]);
    splat.mVec128[1] = _mm_add_pd(splat.mVec128[1], rhs.mVec128[1]);
@ -204,7 +241,10 @@ static inline CVector3d operator+(double lhs, const CVector3d& rhs)

 static inline CVector3d operator-(double lhs, const CVector3d& rhs)
 {
-#if __SSE__
+#if __AVX__
+    TDblVectorUnion splat{lhs, lhs, lhs, 0};
+    return _mm256_sub_pd(splat.mVec256, rhs.mVec256);
+#elif __SSE__
    TDblVectorUnion splat{lhs, lhs, lhs, 0};
    splat.mVec128[0] = _mm_sub_pd(splat.mVec128[0], rhs.mVec128[0]);
    splat.mVec128[1] = _mm_sub_pd(splat.mVec128[1], rhs.mVec128[1]);
@ -216,7 +256,10 @@ static inline CVector3d operator-(double lhs, const CVector3d& rhs)

 static inline CVector3d operator*(double lhs, const CVector3d& rhs)
 {
-#if __SSE__
+#if __AVX__
+    TDblVectorUnion splat{lhs, lhs, lhs, 0};
+    return _mm256_mul_pd(splat.mVec256, rhs.mVec256);
+#elif __SSE__
    TDblVectorUnion splat{lhs, lhs, lhs, 0};
    splat.mVec128[0] = _mm_mul_pd(splat.mVec128[0], rhs.mVec128[0]);
    splat.mVec128[1] = _mm_mul_pd(splat.mVec128[1], rhs.mVec128[1]);
@ -228,7 +271,10 @@ static inline CVector3d operator*(double lhs, const CVector3d& rhs)

 static inline CVector3d operator/(double lhs, const CVector3d& rhs)
 {
-#if __SSE__
+#if __AVX__
+    TDblVectorUnion splat{lhs, lhs, lhs, 0};
+    return _mm256_div_pd(splat.mVec256, rhs.mVec256);
+#elif __SSE__
    TDblVectorUnion splat{lhs, lhs, lhs, 0};
    splat.mVec128[0] = _mm_div_pd(splat.mVec128[0], rhs.mVec128[0]);
    splat.mVec128[1] = _mm_div_pd(splat.mVec128[1], rhs.mVec128[1]);
--- a/include/zeus/CVector3f.hpp
+++ b/include/zeus/CVector3f.hpp
@ -338,7 +338,9 @@ public:

    inline bool canBeNormalized() const
    {
-        return (x < FLT_EPSILON || y < FLT_EPSILON || z < FLT_EPSILON);
+        if (std::isinf(x) || std::isinf(y) || std::isinf(z))
+            return false;
+        return std::fabs(x) >= FLT_EPSILON || std::fabs(y) >= FLT_EPSILON || std::fabs(z) >= FLT_EPSILON;
    }

    inline bool isZero() const { return magSquared() <= 1.1920929e-7f; }
--- a/include/zeus/CVector4f.hpp
+++ b/include/zeus/CVector4f.hpp
@ -363,7 +363,12 @@ public:

    inline bool isNormalized() const { return std::fabs(1.f - magSquared()) < 0.01f; }

-    inline bool canBeNormalized() const { return !isNormalized(); }
+    inline bool canBeNormalized() const
+    {
+        if (std::isinf(x) || std::isinf(y) || std::isinf(z) || std::isinf(w))
+            return false;
+        return std::fabs(x) >= FLT_EPSILON || std::fabs(y) >= FLT_EPSILON || std::fabs(z) >= FLT_EPSILON || std::fabs(w) >= FLT_EPSILON;
+    }

    inline bool isEqu(const CVector4f& other, float epsilon = 1.1920929e-7f)
    {
--- a/include/zeus/Global.hpp
+++ b/include/zeus/Global.hpp
@ -29,8 +29,19 @@
    inline void* operator new[](size_t, void* ptr) { return ptr; }                                                             \
    inline void operator delete[](void*, void*) {}                                                                             \
    void __unused__()
+#define ZE_DECLARE_ALIGNED_ALLOCATOR32()                                                                                       \
+    inline void* operator new(size_t sizeInBytes) { return zeAlloc(sizeInBytes, 32); }                                         \
+    inline void operator delete(void* ptr) { zeFree(ptr); }                                                                    \
+    inline void* operator new(size_t, void* ptr) { return ptr; }                                                               \
+    inline void operator delete(void*, void*) {}                                                                               \
+    inline void* operator new[](size_t sizeInBytes) { return zeAlloc(sizeInBytes, 32); }                                       \
+    inline void operator delete[](void* ptr) { zeFree(ptr); }                                                                  \
+    inline void* operator new[](size_t, void* ptr) { return ptr; }                                                             \
+    inline void operator delete[](void*, void*) {}                                                                             \
+    void __unused__()
 #else
 #define ZE_DECLARE_ALIGNED_ALLOCATOR() void __unused__()
+#define ZE_DECLARE_ALIGNED_ALLOCATOR32() void __unused__()
 #endif

 #if __SSE__
--- a/include/zeus/TVectorUnion.hpp
+++ b/include/zeus/TVectorUnion.hpp
@ -12,6 +12,9 @@ typedef union {

 typedef union {
    double v[4];
+#if __AVX__
+    __m256d mVec256;
+#endif
 #if __SSE__
    __m128d mVec128[2];
 #endif