From 692dc1adfb0e5c7f43b220c6a2204e40b01598e9 Mon Sep 17 00:00:00 2001 From: Jack Andersen Date: Mon, 18 Dec 2017 17:02:59 -1000 Subject: [PATCH] Add AVX intrinsics for CVector3d --- include/zeus/CVector2f.hpp | 7 +++- include/zeus/CVector3d.hpp | 78 ++++++++++++++++++++++++++++------- include/zeus/CVector3f.hpp | 4 +- include/zeus/CVector4f.hpp | 7 +++- include/zeus/Global.hpp | 11 +++++ include/zeus/TVectorUnion.hpp | 3 ++ 6 files changed, 91 insertions(+), 19 deletions(-) diff --git a/include/zeus/CVector2f.hpp b/include/zeus/CVector2f.hpp index 4c5f598..8496ed9 100644 --- a/include/zeus/CVector2f.hpp +++ b/include/zeus/CVector2f.hpp @@ -333,7 +333,12 @@ public: inline bool isNormalized() const { return std::fabs(1.f - magSquared()) < 0.01f; } - inline bool canBeNormalized() const { return !isNormalized(); } + inline bool canBeNormalized() const + { + if (std::isinf(x) || std::isinf(y)) + return false; + return std::fabs(x) >= FLT_EPSILON || std::fabs(y) >= FLT_EPSILON; + } inline bool isZero() const { return magSquared() <= 1.1920929e-7f; } diff --git a/include/zeus/CVector3d.hpp b/include/zeus/CVector3d.hpp index 232450f..ccf33c0 100644 --- a/include/zeus/CVector3d.hpp +++ b/include/zeus/CVector3d.hpp @@ -1,6 +1,7 @@ #ifndef CVECTOR3D_HPP #define CVECTOR3D_HPP +#include #include "Global.hpp" #include "zeus/Math.hpp" #include "TVectorUnion.hpp" @@ -8,12 +9,19 @@ namespace zeus { -class alignas(16) CVector3d +class alignas(32) CVector3d { public: - ZE_DECLARE_ALIGNED_ALLOCATOR(); + ZE_DECLARE_ALIGNED_ALLOCATOR32(); CVector3d() { zeroOut(); } -#if __SSE__ + +#if __AVX__ + CVector3d(const __m256d& mVec256) + { + this->mVec256 = mVec256; + v[3] = 0.0; + } +#elif __SSE__ CVector3d(const __m128d mVec128[2]) { this->mVec128[0] = mVec128[0]; @@ -24,7 +32,9 @@ public: #if ZE_ATHENA_TYPES CVector3d(const atVec3d& vec) { -#if __SSE__ +#if __AVX__ + mVec256 = vec.mVec256; +#elif __SSE__ mVec128[0] = vec.mVec128[0]; mVec128[1] = vec.mVec128[1]; #else @@ -37,20 +47,25 @@ public: CVector3d(const CVector3f& vec) { -#if __SSE__ +#if __AVX__ + mVec256 = _mm256_cvtps_pd(vec.mVec128); +#elif __SSE__ mVec128[0] = _mm_cvtps_pd(vec.mVec128); v[2] = vec[2]; - v[3] = 0.0; #else v[0] = vec[0]; v[1] = vec[1]; v[2] = vec[2]; + v[3] = 0.0; #endif } CVector3d(double x, double y, double z) { -#if __SSE__ +#if __AVX__ + TDblVectorUnion splat{x, y, z, 0.0}; + mVec256 = splat.mVec256; +#elif __SSE__ TDblVectorUnion splat{x, y, z, 0.0}; mVec128[0] = splat.mVec128[0]; mVec128[1] = splat.mVec128[1]; @@ -58,12 +73,17 @@ public: v[0] = x; v[1] = y; v[2] = z; + v[3] = 0.0; #endif } CVector3f asCVector3f() { +#if __AVX__ + return CVector3f(_mm256_cvtpd_ps(mVec256)); +#else return CVector3f(float(x), float(y), float(z)); +#endif } double magSquared() const @@ -115,7 +135,10 @@ public: void splat(double xyz) { -#if __SSE__ +#if __AVX__ + TDblVectorUnion splat = {xyz, xyz, xyz, 0.0}; + mVec256 = splat.mVec256; +#elif __SSE__ TDblVectorUnion splat = {xyz, xyz, xyz, 0.0}; mVec128[0] = splat.mVec128[0]; mVec128[1] = splat.mVec128[1]; @@ -134,7 +157,9 @@ public: inline CVector3d operator+(const CVector3d& rhs) const { -#if __SSE__ +#if __AVX__ + return _mm256_add_pd(mVec256, rhs.mVec256); +#elif __SSE__ const __m128d tmpVec128[2] = {_mm_add_pd(mVec128[0], rhs.mVec128[0]), _mm_add_pd(mVec128[1], rhs.mVec128[1])}; return CVector3d(tmpVec128); @@ -144,7 +169,9 @@ public: } inline CVector3d operator-(const CVector3d& rhs) const { -#if __SSE__ +#if __AVX__ + return _mm256_sub_pd(mVec256, rhs.mVec256); +#elif __SSE__ const __m128d tmpVec128[2] = {_mm_sub_pd(mVec128[0], rhs.mVec128[0]), _mm_sub_pd(mVec128[1], rhs.mVec128[1])}; return CVector3d(tmpVec128); @@ -154,7 +181,9 @@ public: } inline CVector3d operator*(const CVector3d& rhs) const { -#if __SSE__ +#if __AVX__ + return _mm256_mul_pd(mVec256, rhs.mVec256); +#elif __SSE__ const __m128d tmpVec128[2] = {_mm_mul_pd(mVec128[0], rhs.mVec128[0]), _mm_mul_pd(mVec128[1], rhs.mVec128[1])}; return CVector3d(tmpVec128); @@ -164,7 +193,9 @@ public: } inline CVector3d operator/(const CVector3d& rhs) const { -#if __SSE__ +#if __AVX__ + return _mm256_div_pd(mVec256, rhs.mVec256); +#elif __SSE__ const __m128d tmpVec128[2] = {_mm_div_pd(mVec128[0], rhs.mVec128[0]), _mm_div_pd(mVec128[1], rhs.mVec128[1])}; return CVector3d(tmpVec128); @@ -182,6 +213,9 @@ public: double x, y, z; }; double v[4]; +#if __AVX__ + __m256d mVec256; +#endif #if __SSE__ __m128d mVec128[2]; #endif @@ -192,7 +226,10 @@ public: static inline CVector3d operator+(double lhs, const CVector3d& rhs) { -#if __SSE__ +#if __AVX__ + TDblVectorUnion splat{lhs, lhs, lhs, 0}; + return _mm256_add_pd(splat.mVec256, rhs.mVec256); +#elif __SSE__ TDblVectorUnion splat{lhs, lhs, lhs, 0}; splat.mVec128[0] = _mm_add_pd(splat.mVec128[0], rhs.mVec128[0]); splat.mVec128[1] = _mm_add_pd(splat.mVec128[1], rhs.mVec128[1]); @@ -204,7 +241,10 @@ static inline CVector3d operator+(double lhs, const CVector3d& rhs) static inline CVector3d operator-(double lhs, const CVector3d& rhs) { -#if __SSE__ +#if __AVX__ + TDblVectorUnion splat{lhs, lhs, lhs, 0}; + return _mm256_sub_pd(splat.mVec256, rhs.mVec256); +#elif __SSE__ TDblVectorUnion splat{lhs, lhs, lhs, 0}; splat.mVec128[0] = _mm_sub_pd(splat.mVec128[0], rhs.mVec128[0]); splat.mVec128[1] = _mm_sub_pd(splat.mVec128[1], rhs.mVec128[1]); @@ -216,7 +256,10 @@ static inline CVector3d operator-(double lhs, const CVector3d& rhs) static inline CVector3d operator*(double lhs, const CVector3d& rhs) { -#if __SSE__ +#if __AVX__ + TDblVectorUnion splat{lhs, lhs, lhs, 0}; + return _mm256_mul_pd(splat.mVec256, rhs.mVec256); +#elif __SSE__ TDblVectorUnion splat{lhs, lhs, lhs, 0}; splat.mVec128[0] = _mm_mul_pd(splat.mVec128[0], rhs.mVec128[0]); splat.mVec128[1] = _mm_mul_pd(splat.mVec128[1], rhs.mVec128[1]); @@ -228,7 +271,10 @@ static inline CVector3d operator*(double lhs, const CVector3d& rhs) static inline CVector3d operator/(double lhs, const CVector3d& rhs) { -#if __SSE__ +#if __AVX__ + TDblVectorUnion splat{lhs, lhs, lhs, 0}; + return _mm256_div_pd(splat.mVec256, rhs.mVec256); +#elif __SSE__ TDblVectorUnion splat{lhs, lhs, lhs, 0}; splat.mVec128[0] = _mm_div_pd(splat.mVec128[0], rhs.mVec128[0]); splat.mVec128[1] = _mm_div_pd(splat.mVec128[1], rhs.mVec128[1]); diff --git a/include/zeus/CVector3f.hpp b/include/zeus/CVector3f.hpp index 2cf76f5..a63f35f 100644 --- a/include/zeus/CVector3f.hpp +++ b/include/zeus/CVector3f.hpp @@ -338,7 +338,9 @@ public: inline bool canBeNormalized() const { - return (x < FLT_EPSILON || y < FLT_EPSILON || z < FLT_EPSILON); + if (std::isinf(x) || std::isinf(y) || std::isinf(z)) + return false; + return std::fabs(x) >= FLT_EPSILON || std::fabs(y) >= FLT_EPSILON || std::fabs(z) >= FLT_EPSILON; } inline bool isZero() const { return magSquared() <= 1.1920929e-7f; } diff --git a/include/zeus/CVector4f.hpp b/include/zeus/CVector4f.hpp index 9ae3175..6f8acf7 100644 --- a/include/zeus/CVector4f.hpp +++ b/include/zeus/CVector4f.hpp @@ -363,7 +363,12 @@ public: inline bool isNormalized() const { return std::fabs(1.f - magSquared()) < 0.01f; } - inline bool canBeNormalized() const { return !isNormalized(); } + inline bool canBeNormalized() const + { + if (std::isinf(x) || std::isinf(y) || std::isinf(z) || std::isinf(w)) + return false; + return std::fabs(x) >= FLT_EPSILON || std::fabs(y) >= FLT_EPSILON || std::fabs(z) >= FLT_EPSILON || std::fabs(w) >= FLT_EPSILON; + } inline bool isEqu(const CVector4f& other, float epsilon = 1.1920929e-7f) { diff --git a/include/zeus/Global.hpp b/include/zeus/Global.hpp index d5af42e..a2c6849 100644 --- a/include/zeus/Global.hpp +++ b/include/zeus/Global.hpp @@ -29,8 +29,19 @@ inline void* operator new[](size_t, void* ptr) { return ptr; } \ inline void operator delete[](void*, void*) {} \ void __unused__() +#define ZE_DECLARE_ALIGNED_ALLOCATOR32() \ + inline void* operator new(size_t sizeInBytes) { return zeAlloc(sizeInBytes, 32); } \ + inline void operator delete(void* ptr) { zeFree(ptr); } \ + inline void* operator new(size_t, void* ptr) { return ptr; } \ + inline void operator delete(void*, void*) {} \ + inline void* operator new[](size_t sizeInBytes) { return zeAlloc(sizeInBytes, 32); } \ + inline void operator delete[](void* ptr) { zeFree(ptr); } \ + inline void* operator new[](size_t, void* ptr) { return ptr; } \ + inline void operator delete[](void*, void*) {} \ + void __unused__() #else #define ZE_DECLARE_ALIGNED_ALLOCATOR() void __unused__() +#define ZE_DECLARE_ALIGNED_ALLOCATOR32() void __unused__() #endif #if __SSE__ diff --git a/include/zeus/TVectorUnion.hpp b/include/zeus/TVectorUnion.hpp index a772c02..8f7d6f9 100644 --- a/include/zeus/TVectorUnion.hpp +++ b/include/zeus/TVectorUnion.hpp @@ -12,6 +12,9 @@ typedef union { typedef union { double v[4]; +#if __AVX__ + __m256d mVec256; +#endif #if __SSE__ __m128d mVec128[2]; #endif