diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6eff6cd..56fb5e6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,6 +54,7 @@ add_library(zeus
     include/zeus/simd/simd.hpp
     include/zeus/simd/simd_sse.hpp
     include/zeus/simd/simd_avx.hpp
+    include/zeus/simd/simd_neon.hpp
     include/zeus/simd/parallelism_v2_simd.hpp)
 
 target_include_directories(zeus PUBLIC $)
diff --git a/include/zeus/CAABox.hpp b/include/zeus/CAABox.hpp
index 58c893f..83209b4 100644
--- a/include/zeus/CAABox.hpp
+++ b/include/zeus/CAABox.hpp
@@ -56,13 +56,9 @@ public:
 #endif
 
   [[nodiscard]] bool intersects(const CAABox& other) const {
-    const bool x1 = max[0] >= other.min[0];
-    const bool x2 = min[0] <= other.max[0];
-    const bool y1 = max[1] >= other.min[1];
-    const bool y2 = min[1] <= other.max[1];
-    const bool z1 = max[2] >= other.min[2];
-    const bool z2 = min[2] <= other.max[2];
-    return x1 && x2 && y1 && y2 && z1 && z2;
+    const auto mmax = max >= other.min;
+    const auto mmin = min <= other.max;
+    return mmax[0] && mmax[1] && mmax[2] && mmin[0] && mmin[1] && mmin[2];
   }
 
   [[nodiscard]] bool intersects(const CSphere& other) const;
@@ -70,10 +66,9 @@ public:
   [[nodiscard]] CAABox booleanIntersection(const CAABox& other) const;
 
   [[nodiscard]] bool inside(const CAABox& other) const {
-    const bool x = min[0] >= other.min[0] && max[0] <= other.max[0];
-    const bool y = min[1] >= other.min[1] && max[1] <= other.max[1];
-    const bool z = min[2] >= other.min[2] && max[2] <= other.max[2];
-    return x && y && z;
+    const auto mmax = max <= other.max;
+    const auto mmin = min >= other.min;
+    return mmax[0] && mmax[1] && mmax[2] && mmin[0] && mmin[1] && mmin[2];
   }
 
   [[nodiscard]] bool insidePlane(const CPlane& plane) const {
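Note (reviewer annotation, not part of the patch): the rewritten intersects()/inside() trade six scalar compares for two vector compares whose lane masks are tested afterwards; the mask-returning operators on CVector3f are added later in this patch. A minimal usage sketch, assuming the existing min/max CAABox constructor:

    #include <cassert>
    #include <zeus/CAABox.hpp>

    int main() {
      const zeus::CAABox a({0.f, 0.f, 0.f}, {1.f, 1.f, 1.f});
      const zeus::CAABox b({0.5f, 0.5f, 0.5f}, {2.f, 2.f, 2.f});
      // (a.max >= b.min) and (a.min <= b.max) are one SIMD compare each; lanes
      // 0..2 of the two masks stand in for the former x1..z2 booleans.
      assert(a.intersects(b));
      assert(!a.inside(b)); // a.min[i] >= b.min[i] fails on every axis
    }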
diff --git a/include/zeus/CMatrix3f.hpp b/include/zeus/CMatrix3f.hpp
index 1b23e70..5d8ebd0 100644
--- a/include/zeus/CMatrix3f.hpp
+++ b/include/zeus/CMatrix3f.hpp
@@ -23,13 +23,13 @@ public:
   constexpr CMatrix3f(float m00, float m01, float m02, float m10, float m11, float m12, float m20, float m21,
                       float m22)
   : m{{{m00, m10, m20}, {m01, m11, m21}, {m02, m12, m22}}} {}
 
-  CMatrix3f(const CVector3f& scaleVec) {
+  constexpr CMatrix3f(const CVector3f& scaleVec) {
     m[0][0] = scaleVec[0];
     m[1][1] = scaleVec[1];
     m[2][2] = scaleVec[2];
   }
 
-  CMatrix3f(float scale) : CMatrix3f(CVector3f(scale)) {}
+  constexpr CMatrix3f(float scale) : CMatrix3f(CVector3f(scale)) {}
 
   constexpr CMatrix3f(const CVector3f& r0, const CVector3f& r1, const CVector3f& r2) : m{{r0, r1, r2}} {}
 
@@ -81,7 +81,7 @@ public:
   CMatrix3f(const CQuaternion& quat);
 
-  CMatrix3f& operator=(const CMatrix3f& other) = default;
+  constexpr CMatrix3f& operator=(const CMatrix3f& other) = default;
 
   [[nodiscard]] CVector3f operator*(const CVector3f& other) const {
     return m[0].mSimd * other.mSimd.shuffle<0, 0, 0, 0>() + m[1].mSimd * other.mSimd.shuffle<1, 1, 1, 1>() +
            m[2].mSimd * other.mSimd.shuffle<2, 2, 2, 2>();
diff --git a/include/zeus/CVector2f.hpp b/include/zeus/CVector2f.hpp
index c46290e..e5fd3d3 100644
--- a/include/zeus/CVector2f.hpp
+++ b/include/zeus/CVector2f.hpp
@@ -13,7 +13,7 @@ namespace zeus {
 class CVector2f {
 public:
   simd<float> mSimd;
-  constexpr CVector2f() : mSimd(0.f) {}
+  constexpr CVector2f() : mSimd() {}
 
   template <typename T>
   constexpr CVector2f(const simd<T>& s) : mSimd(s) {}
@@ -54,11 +54,8 @@ public:
   explicit constexpr CVector2f(float xy) : mSimd(xy) {}
 
-  void assign(float x, float y) {
-    mSimd[0] = x;
-    mSimd[1] = y;
-    mSimd[2] = 0.0f;
-    mSimd[3] = 0.0f;
-  }
+  constexpr void assign(float x, float y) { mSimd.set(x, y); }
 
   constexpr CVector2f(float x, float y) : mSimd(x, y, 0.f, 0.f) {}
 
@@ -161,19 +158,19 @@ public:
     return *this * mag;
   }
 
-  [[nodiscard]] CVector2f perpendicularVector() const { return {-y(), x()}; }
+  [[nodiscard]] constexpr CVector2f perpendicularVector() const { return {-y(), x()}; }
 
-  [[nodiscard]] float cross(const CVector2f& rhs) const { return (x() * rhs.y()) - (y() * rhs.x()); }
+  [[nodiscard]] constexpr float cross(const CVector2f& rhs) const { return (x() * rhs.y()) - (y() * rhs.x()); }
 
-  [[nodiscard]] float dot(const CVector2f& rhs) const { return mSimd.dot2(rhs.mSimd); }
+  [[nodiscard]] constexpr float dot(const CVector2f& rhs) const { return mSimd.dot2(rhs.mSimd); }
 
-  [[nodiscard]] float magSquared() const { return mSimd.dot2(mSimd); }
+  [[nodiscard]] constexpr float magSquared() const { return mSimd.dot2(mSimd); }
 
-  [[nodiscard]] float magnitude() const { return std::sqrt(magSquared()); }
+  [[nodiscard]] constexpr float magnitude() const { return std::sqrt(magSquared()); }
 
-  void zeroOut() { mSimd = zeus::simd<float>(0.f); }
+  constexpr void zeroOut() { mSimd = 0.f; }
 
-  void splat(float xy) { mSimd = zeus::simd<float>(xy); }
+  constexpr void splat(float xy) { mSimd = xy; }
 
   [[nodiscard]] static float getAngleDiff(const CVector2f& a, const CVector2f& b);
 
@@ -187,9 +184,9 @@ public:
   [[nodiscard]] static CVector2f slerp(const CVector2f& a, const CVector2f& b, float t);
 
-  [[nodiscard]] bool isNormalized() const { return std::fabs(1.f - magSquared()) < 0.01f; }
+  [[nodiscard]] constexpr bool isNormalized() const { return std::fabs(1.f - magSquared()) < 0.01f; }
 
-  [[nodiscard]] bool canBeNormalized() const {
+  [[nodiscard]] constexpr bool canBeNormalized() const {
     if (std::isinf(x()) || std::isinf(y()))
       return false;
     return std::fabs(x()) >= FLT_EPSILON || std::fabs(y()) >= FLT_EPSILON;
@@ -202,21 +199,21 @@ public:
     return (diffVec.x() <= epsilon && diffVec.y() <= epsilon);
   }
 
-  [[nodiscard]] simd<float>::reference operator[](size_t idx) {
+  [[nodiscard]] constexpr simd<float>::reference operator[](size_t idx) {
     assert(idx < 2);
     return mSimd[idx];
   }
 
-  [[nodiscard]] float operator[](size_t idx) const {
+  [[nodiscard]] constexpr float operator[](size_t idx) const {
     assert(idx < 2);
     return mSimd[idx];
   }
 
-  [[nodiscard]] float x() const { return mSimd[0]; }
-  [[nodiscard]] float y() const { return mSimd[1]; }
+  [[nodiscard]] constexpr float x() const { return mSimd[0]; }
+  [[nodiscard]] constexpr float y() const { return mSimd[1]; }
 
-  [[nodiscard]] simd<float>::reference x() { return mSimd[0]; }
-  [[nodiscard]] simd<float>::reference y() { return mSimd[1]; }
+  [[nodiscard]] constexpr simd<float>::reference x() { return mSimd[0]; }
+  [[nodiscard]] constexpr simd<float>::reference y() { return mSimd[1]; }
 };
 constexpr inline CVector2f skOne2f(1.f);
 constexpr inline CVector2f skNegOne2f(-1.f);
diff --git a/include/zeus/CVector3d.hpp b/include/zeus/CVector3d.hpp
index 943caf0..00e2961 100644
--- a/include/zeus/CVector3d.hpp
+++ b/include/zeus/CVector3d.hpp
@@ -14,7 +14,7 @@ namespace zeus {
 class CVector3d {
 public:
   zeus::simd<double> mSimd;
-  constexpr CVector3d() : mSimd(0.0) {}
+  constexpr CVector3d() : mSimd() {}
 
   template <typename T>
   constexpr CVector3d(const simd<T>& s) : mSimd(s) {}
diff --git a/include/zeus/CVector3f.hpp b/include/zeus/CVector3f.hpp
index 1fe8ef8..f98f005 100644
--- a/include/zeus/CVector3f.hpp
+++ b/include/zeus/CVector3f.hpp
@@ -19,7 +19,7 @@ class CRelAngle;
 class CVector3f {
 public:
   zeus::simd<float> mSimd;
-  constexpr CVector3f() : mSimd(0.f) {}
+  constexpr CVector3f() : mSimd() {}
 
   template <typename T>
   constexpr CVector3f(const simd<T>& s) : mSimd(s) {}
 
@@ -64,7 +64,7 @@ public:
   explicit constexpr CVector3f(float xyz) : mSimd(xyz) {}
 
-  void assign(float x, float y, float z) { mSimd = zeus::simd<float>(x, y, z); }
+  void assign(float x, float y, float z) { mSimd.set(x, y, z); }
 
   constexpr CVector3f(float x, float y, float z) : mSimd(x, y, z) {}
 
@@ -73,17 +73,22 @@ public:
   CVector3f(const CVector2f& other, float z = 0.f) {
     mSimd = other.mSimd;
     mSimd[2] = z;
-    mSimd[3] = 0.f;
   }
 
   [[nodiscard]] CVector2f toVec2f() const { return CVector2f(mSimd); }
 
   [[nodiscard]] bool operator==(const CVector3f& rhs) const {
-    return mSimd[0] == rhs.mSimd[0] && mSimd[1] == rhs.mSimd[1] && mSimd[2] == rhs.mSimd[2];
+    const auto mask = mSimd == rhs.mSimd;
+    return mask[0] && mask[1] && mask[2];
   }
 
   [[nodiscard]] bool operator!=(const CVector3f& rhs) const { return !(*this == rhs); }
 
+  [[nodiscard]] simd<float>::mask_type operator>(const CVector3f& rhs) const { return mSimd > rhs.mSimd; }
+  [[nodiscard]] simd<float>::mask_type operator>=(const CVector3f& rhs) const { return mSimd >= rhs.mSimd; }
+  [[nodiscard]] simd<float>::mask_type operator<(const CVector3f& rhs) const { return mSimd < rhs.mSimd; }
+  [[nodiscard]] simd<float>::mask_type operator<=(const CVector3f& rhs) const { return mSimd <= rhs.mSimd; }
+
   [[nodiscard]] CVector3f operator+(const CVector3f& rhs) const { return mSimd + rhs.mSimd; }
 
   [[nodiscard]] CVector3f operator-(const CVector3f& rhs) const { return mSimd - rhs.mSimd; }
 
@@ -94,16 +99,13 @@ public:
   [[nodiscard]] CVector3f operator/(const CVector3f& rhs) const { return mSimd / rhs.mSimd; }
 
-  [[nodiscard]] CVector3f operator+(float val) const { return mSimd + zeus::simd<float>(val); }
+  [[nodiscard]] CVector3f operator+(float val) const { return mSimd + val; }
 
-  [[nodiscard]] CVector3f operator-(float val) const { return mSimd - zeus::simd<float>(val); }
+  [[nodiscard]] CVector3f operator-(float val) const { return mSimd - val; }
 
-  [[nodiscard]] CVector3f operator*(float val) const { return mSimd * zeus::simd<float>(val); }
+  [[nodiscard]] CVector3f operator*(float val) const { return mSimd * val; }
 
-  [[nodiscard]] CVector3f operator/(float val) const {
-    const float ooval = 1.f / val;
-    return mSimd * zeus::simd<float>(ooval);
-  }
+  [[nodiscard]] CVector3f operator/(float val) const { return mSimd / val; }
 
   const CVector3f& operator+=(const CVector3f& rhs) {
     mSimd += rhs.mSimd;
@@ -136,7 +138,7 @@ public:
   }
 
   [[nodiscard]] CVector3f cross(const CVector3f& rhs) const {
-    return CVector3f(y() * rhs.z() - z() * rhs.y(), z() * rhs.x() - x() * rhs.z(), x() * rhs.y() - y() * rhs.x());
+    return {y() * rhs.z() - z() * rhs.y(), z() * rhs.x() - x() * rhs.z(), x() * rhs.y() - y() * rhs.x()};
   }
 
   [[nodiscard]] float dot(const CVector3f& rhs) const { return mSimd.dot3(rhs.mSimd); }
 
@@ -149,9 +151,9 @@ public:
   [[nodiscard]] bool isMagnitudeSafe() const { return isNotInf() && magSquared() >= 9.9999994e-29; }
 
-  void zeroOut() { mSimd = zeus::simd<float>(0.f); }
+  void zeroOut() { mSimd.broadcast(0.f); }
 
-  void splat(float xyz) { mSimd = zeus::simd<float>(xyz); }
+  void splat(float xyz) { mSimd.broadcast(xyz); }
 
   [[nodiscard]] static float getAngleDiff(const CVector3f& a, const CVector3f& b);
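Note (not part of the patch): the four new relational operators return simd<float>::mask_type rather than bool, so the caller chooses how to reduce the lanes, exactly as CAABox::intersects() does above. A sketch of the intended call-site pattern; allBelow is a hypothetical helper:

    #include <zeus/CVector3f.hpp>

    // One vector compare instead of three scalar ones; reduce the mask by hand.
    bool allBelow(const zeus::CVector3f& v, const zeus::CVector3f& bound) {
      const auto mask = v < bound;
      return mask[0] && mask[1] && mask[2];
    }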
diff --git a/include/zeus/CVector4f.hpp b/include/zeus/CVector4f.hpp
index 86401f7..c5341ec 100644
--- a/include/zeus/CVector4f.hpp
+++ b/include/zeus/CVector4f.hpp
@@ -18,7 +18,7 @@ class CVector4f {
 public:
   zeus::simd<float> mSimd;
 
-  constexpr CVector4f() : mSimd(0.f) {}
+  constexpr CVector4f() : mSimd() {}
 
   template <typename T>
   constexpr CVector4f(const simd<T>& s) : mSimd(s) {}
diff --git a/include/zeus/simd/parallelism_v2_simd.hpp b/include/zeus/simd/parallelism_v2_simd.hpp
index 5fca03d..7f08baa 100644
--- a/include/zeus/simd/parallelism_v2_simd.hpp
+++ b/include/zeus/simd/parallelism_v2_simd.hpp
@@ -713,7 +713,7 @@ public:
   operator _Vp() const { return __ptr_->__get(__index_); }
 
-  __simd_reference operator=(_Vp __value) && {
+  constexpr __simd_reference& operator=(_Vp __value) && {
     __ptr_->__set(__index_, __value);
     return *this;
   }
@@ -1223,9 +1223,9 @@ public:
   using mask_type = simd_mask<_Tp, _Abi>;
   using abi_type = _Abi;
 
-  simd() = default;
-  simd(const simd&) = default;
-  simd& operator=(const simd&) = default;
+  constexpr simd() = default;
+  constexpr simd(const simd&) = default;
+  constexpr simd& operator=(const simd&) = default;
 
   static constexpr size_t size() noexcept { return simd_size<_Tp, _Abi>::value; }
 
@@ -1254,13 +1254,14 @@ private:
   }
 
   template <class _Generator, size_t... __indicies>
-  void __generator_init(_Generator&& __g, std::index_sequence<__indicies...>) {
+  constexpr void __generator_init(_Generator&& __g, std::index_sequence<__indicies...>) {
    int __not_used[]{((*this)[__indicies] = __g(std::integral_constant<size_t, __indicies>()), 0)...};
    (void)__not_used;
  }
 
 public:
-  simd(const typename __simd_storage<_Tp, _Abi>::storage_type& s) : __s_(s) {}
+  constexpr simd(const __simd_storage<_Tp, _Abi>& s) : __s_(s) {}
+  constexpr simd(const typename __simd_storage<_Tp, _Abi>::storage_type& s) : __s_(s) {}
 
 #if 0
   // implicit type conversion constructor
@@ -1278,7 +1279,7 @@ public:
   template <class _Up, class _UAbi,
             class = typename std::enable_if<std::is_constructible<__simd_storage<_Tp, _Abi>,
                                                                   __simd_storage<_Up, _UAbi>>::value>>
-  simd(const simd<_Up, _UAbi>& __v) : __s_(__v.__s_) {}
+  constexpr simd(const simd<_Up, _UAbi>& __v) : __s_(__v.__s_) {}
 
 #if 0
   template <class _Generator,
-            int = typename std::enable_if<__can_generate<_Generator>(
-                std::make_index_sequence<simd_size<_Tp, _Abi>::value>()), int>::type()>
+            int = typename std::enable_if<
+                __can_generate<_Generator>(std::make_index_sequence<simd_size<_Tp, _Abi>::value>()), int>::type()>
   explicit simd(_Generator&& __g) {
     __generator_init(std::forward<_Generator>(__g), std::make_index_sequence<simd_size<_Tp, _Abi>::value>());
   }
@@ -1349,10 +1350,13 @@ public:
   // stores [simd.store]
   void copy_to(simd_data<simd>& __buffer) const { __s_.__copy_to(__buffer); }
 
-  // scalar access [simd.subscr]
-  reference operator[](size_t __i) { return reference(&__s_, __i); }
+  constexpr void set(_Tp a, _Tp b, _Tp c = {}, _Tp d = {}) { __s_.__set4(a, b, c, d); }
+  constexpr void broadcast(_Tp rv) { __s_.__broadcast(rv); }
 
-  value_type operator[](size_t __i) const { return __s_.__get(__i); }
+  // scalar access [simd.subscr]
+  constexpr reference operator[](size_t __i) { return reference(&__s_, __i); }
+
+  constexpr value_type operator[](size_t __i) const { return __s_.__get(__i); }
 
   // unary operators [simd.unary]
   simd& operator++();
@@ -1401,15 +1405,13 @@ public:
   friend mask_type operator>(const simd&, const simd&);
   friend mask_type operator<(const simd&, const simd&);
 
-  value_type dot2(const simd& other) const { return __s_.__dot2(other.__s_); }
-  value_type dot3(const simd& other) const { return __s_.__dot3(other.__s_); }
-  value_type dot4(const simd& other) const { return __s_.__dot4(other.__s_); }
+  constexpr value_type dot2(const simd& other) const { return __s_.__dot2(other.__s_); }
+  constexpr value_type dot3(const simd& other) const { return __s_.__dot3(other.__s_); }
+  constexpr value_type dot4(const simd& other) const { return __s_.__dot4(other.__s_); }
 
   template <int x, int y, int z, int w>
-  simd shuffle() const {
-    simd s;
-    s.__s_ = __s_.template __shuffle<x, y, z, w>();
-    return s;
-  }
+  constexpr simd shuffle() const {
+    return __s_.template __shuffle<x, y, z, w>();
+  }
 
   const typename __simd_storage<_Tp, _Abi>::storage_type& native() const { return __s_.__native(); }
@@ -1499,50 +1501,48 @@ private:
   friend class simd_mask;
 
 public:
-  _Tp __get(size_t __index) const noexcept { return __storage_[__index]; };
-  void __set(size_t __index, _Tp __val) noexcept { __storage_[__index] = __val; }
-  std::enable_if_t<__num_element >= 4> __set4(float a, float b, float c, float d) noexcept {
+  constexpr _Tp __get(size_t __index) const noexcept { return __storage_[__index]; };
+  constexpr void __set(size_t __index, _Tp __val) noexcept { __storage_[__index] = __val; }
+  constexpr std::enable_if_t<__num_element >= 4> __set4(float a, float b, float c, float d) noexcept {
     __storage_[0] = a;
     __storage_[1] = b;
     __storage_[2] = c;
     __storage_[3] = d;
   }
-  void __broadcast(float __val) noexcept { std::fill(__storage_.begin(), __storage_.end(), __val); }
-  std::enable_if_t<__num_element >= 2, _Tp> __dot2(const __simd_storage& other) const noexcept {
+  constexpr void __broadcast(float __val) noexcept { std::fill(__storage_.begin(), __storage_.end(), __val); }
+  constexpr std::enable_if_t<__num_element >= 2, _Tp> __dot2(const __simd_storage& other) const noexcept {
     return __storage_[0] * other.__storage_[0] + __storage_[1] * other.__storage_[1];
   }
-  std::enable_if_t<__num_element >= 3, _Tp> __dot3(const __simd_storage& other) const noexcept {
+  constexpr std::enable_if_t<__num_element >= 3, _Tp> __dot3(const __simd_storage& other) const noexcept {
     return __storage_[0] * other.__storage_[0] + __storage_[1] * other.__storage_[1] +
            __storage_[2] * other.__storage_[2];
   }
-  std::enable_if_t<__num_element >= 4, _Tp> __dot4(const __simd_storage& other) const noexcept {
+  constexpr std::enable_if_t<__num_element >= 4, _Tp> __dot4(const __simd_storage& other) const noexcept {
     return __storage_[0] * other.__storage_[0] + __storage_[1] * other.__storage_[1] +
            __storage_[2] * other.__storage_[2] + __storage_[3] * other.__storage_[3];
   }
   template <int x, int y, int z, int w>
-  std::enable_if_t<__num_element >= 4, __simd_storage> __shuffle() const noexcept {
-    __simd_storage s;
-    s.__storage_[0] = __storage_[x];
-    s.__storage_[1] = __storage_[y];
-    s.__storage_[2] = __storage_[z];
-    s.__storage_[3] = __storage_[w];
-    return s;
-  }
+  constexpr std::enable_if_t<__num_element >= 4, __simd_storage> __shuffle() const noexcept {
+    return {__storage_[x], __storage_[y], __storage_[z], __storage_[w]};
+  }
-  void __copy_from(const simd_data<simd<_Tp, __simd_abi<_StorageKind::_Array, __num_element>>>& __buffer) noexcept {
+  constexpr void
+  __copy_from(const simd_data<simd<_Tp, __simd_abi<_StorageKind::_Array, __num_element>>>& __buffer) noexcept {
     std::copy(__buffer.begin(), __buffer.end(), __storage_.begin());
   }
-  void __copy_to(simd_data<simd<_Tp, __simd_abi<_StorageKind::_Array, __num_element>>>& __buffer) const noexcept {
+  constexpr void __copy_to(simd_data<simd<_Tp, __simd_abi<_StorageKind::_Array, __num_element>>>& __buffer) const
+      noexcept {
     std::copy(__storage_.begin(), __storage_.end(), __buffer.begin());
   }
 
-  __simd_storage() = default;
+  constexpr __simd_storage() = default;
   template <class _Up, int __Unum_element>
-  explicit __simd_storage(const __simd_storage<_Up, __simd_abi<_StorageKind::_Array, __Unum_element>>& other) {
+  constexpr explicit __simd_storage(
+      const __simd_storage<_Up, __simd_abi<_StorageKind::_Array, __Unum_element>>& other) {
     std::copy(other.__native().begin(), other.__native().end(), __storage_.begin());
   }
 
-  const storage_type& __native() const { return __storage_; }
+  constexpr const storage_type& __native() const { return __storage_; }
 };
 
 template <class _Tp, int __num_element>
@@ -1550,8 +1550,8 @@ class __simd_mask_storage<_Tp, __simd_abi<_StorageKind::_Array, __num_element>>
   std::bitset<__num_element> __storage_;
 
 public:
-  bool __get(size_t __index) const noexcept { return __storage_.test(__index); }
-  void __set(size_t __index, bool __val) noexcept { __storage_.set(__index, __val); }
+  [[nodiscard]] constexpr bool __get(size_t __index) const noexcept { return __storage_.test(__index); }
+  constexpr void __set(size_t __index, bool __val) noexcept { __storage_.set(__index, __val); }
 };
 } // namespace zeus::_simd
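Note (not part of the patch): set() and broadcast() are the new public entry points that the vector classes above now call instead of writing lanes one at a time or constructing a temporary simd. A usage sketch, assuming the zeus::simd<T> alias used throughout the headers:

    #include <zeus/simd/simd.hpp>

    void example() {
      zeus::simd<float> v;   // default ctor is now constexpr; storage is {}-initialized
      v.set(1.f, 2.f, 3.f);  // forwards to __set4; the omitted d lane defaults to {}
      v.broadcast(5.f);      // forwards to __broadcast: a single splat
    }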
diff --git a/include/zeus/simd/simd.hpp b/include/zeus/simd/simd.hpp
index 6531af4..a503b39 100644
--- a/include/zeus/simd/simd.hpp
+++ b/include/zeus/simd/simd.hpp
@@ -15,6 +15,8 @@ using namespace std;
 #include "simd_avx.hpp"
 #elif __SSE__
 #include "simd_sse.hpp"
+#elif __ARM_NEON
+#include "simd_neon.hpp"
 #else
 namespace simd_abi {
 template <typename T>
diff --git a/include/zeus/simd/simd_neon.hpp b/include/zeus/simd/simd_neon.hpp
new file mode 100644
index 0000000..9640847
--- /dev/null
+++ b/include/zeus/simd/simd_neon.hpp
@@ -0,0 +1,341 @@
+#pragma once
+#include <arm_neon.h>
+#ifndef _ZEUS_SIMD_INCLUDED
+#error simd_neon.hpp must not be included directly. Include simd.hpp instead.
+#endif
+namespace zeus::_simd {
+// __m128 ABI
+using m128_abi = __simd_abi<_StorageKind(int(_StorageKind::_Array) + 1), 4>;
+// __m128d ABI
+using m128d_abi = __simd_abi<_StorageKind(int(_StorageKind::_Array) + 2), 4>;
+
+template <>
+class __simd_storage<double, m128d_abi>;
+
+// __m128 storage for NEON
+template <>
+class __simd_storage<float, m128_abi> {
+public:
+  using storage_type = float32x4_t;
+  storage_type __storage_{};
+  [[nodiscard]] constexpr float __get(size_t __index) const noexcept { return __storage_[__index]; }
+  inline void __set(size_t __index, float __val) noexcept { __storage_[__index] = __val; }
+  constexpr __simd_storage(float a, float b, float c, float d) : __storage_{a, b, c, d} {}
+  constexpr void __set4(float a, float b, float c, float d) noexcept { __storage_ = storage_type{a, b, c, d}; }
+  constexpr explicit __simd_storage(float rv) : __simd_storage(rv, rv, rv, rv) {}
+  inline void __broadcast(float __val) noexcept { __storage_ = vdupq_n_f32(__val); }
+  [[nodiscard]] inline float __dot2(const __simd_storage& other) const noexcept {
+    return vaddv_f32(vget_low_f32(vmulq_f32(__storage_, other.__storage_)));
+  }
+  [[nodiscard]] inline float __dot3(const __simd_storage& other) const noexcept {
+    return vaddvq_f32(vsetq_lane_f32(0.f, vmulq_f32(__storage_, other.__storage_), 3));
+  }
+  [[nodiscard]] inline float __dot4(const __simd_storage& other) const noexcept {
+    return vaddvq_f32(vmulq_f32(__storage_, other.__storage_));
+  }
+  template <int x, int y, int z, int w>
+  [[nodiscard]] inline __simd_storage __shuffle() const noexcept {
+    storage_type ret;
+    ret = vmovq_n_f32(vgetq_lane_f32(__storage_, x));
+    ret = vsetq_lane_f32(vgetq_lane_f32(__storage_, y), ret, 1);
+    ret = vsetq_lane_f32(vgetq_lane_f32(__storage_, z), ret, 2);
+    ret = vsetq_lane_f32(vgetq_lane_f32(__storage_, w), ret, 3);
+    return __simd_storage(ret);
+  }
+
+  inline void __copy_from(const simd_data<simd<float, m128_abi>>& __buffer) noexcept {
+    __storage_ = vld1q_f32(__buffer.data());
+  }
+
+  inline void __copy_to(simd_data<simd<float, m128_abi>>& __buffer) const noexcept {
+    vst1q_f32(__buffer.data(), __storage_);
+  }
+
+  constexpr __simd_storage() = default;
+  explicit __simd_storage(const __simd_storage<double, m128d_abi>& other);
+
+  constexpr explicit __simd_storage(const storage_type& s) : __storage_(s) {}
+  [[nodiscard]] constexpr const storage_type& __native() const { return __storage_; }
+};
+// __m128 mask storage for NEON
+template <>
+class __simd_mask_storage<float, m128_abi> : public __simd_storage<float, m128_abi> {
+public:
+  inline bool __get(size_t __index) const noexcept { return vreinterpretq_u32_f32(__storage_)[__index] != 0; }
+  inline void __set(size_t __index, bool __val) noexcept {
+    uint32x4_t data = vreinterpretq_u32_f32(__storage_);
+    data[__index] = __val ? UINT32_MAX : 0;
+    __storage_ = vreinterpretq_f32_u32(data);
+  }
+};
+
+template <>
+inline simd<float, m128_abi> simd<float, m128_abi>::operator-() const {
+  return vreinterpretq_f32_s32(
+      veorq_s32(vreinterpretq_s32_f32(__s_.__storage_), vreinterpretq_s32_f32(vdupq_n_f32(-0.f))));
+}
+
+inline simd<float, m128_abi> operator+(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  return vaddq_f32(a.__s_.__storage_, b.__s_.__storage_);
+}
+
+inline simd<float, m128_abi> operator-(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  return vsubq_f32(a.__s_.__storage_, b.__s_.__storage_);
+}
+
+inline simd<float, m128_abi> operator*(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  return vmulq_f32(a.__s_.__storage_, b.__s_.__storage_);
+}
+
+inline simd<float, m128_abi> operator/(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  return vdivq_f32(a.__s_.__storage_, b.__s_.__storage_);
+}
+
+inline simd<float, m128_abi>& operator+=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  a.__s_.__storage_ += b.__s_.__storage_;
+  return a;
+}
+
+inline simd<float, m128_abi>& operator-=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  a.__s_.__storage_ -= b.__s_.__storage_;
+  return a;
+}
+
+inline simd<float, m128_abi>& operator*=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  a.__s_.__storage_ *= b.__s_.__storage_;
+  return a;
+}
+
+inline simd<float, m128_abi>& operator/=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  a.__s_.__storage_ /= b.__s_.__storage_;
+  return a;
+}
+
+inline simd<float, m128_abi>::mask_type operator==(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  simd<float, m128_abi>::mask_type ret;
+  ret.__s_.__storage_ = vreinterpretq_f32_u32(vceqq_f32(a.__s_.__storage_, b.__s_.__storage_));
+  return ret;
+}
+
+inline simd<float, m128_abi>::mask_type operator!=(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  simd<float, m128_abi>::mask_type ret;
+  ret.__s_.__storage_ = vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(a.__s_.__storage_, b.__s_.__storage_)));
+  return ret;
+}
+
+inline simd<float, m128_abi>::mask_type operator>=(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  simd<float, m128_abi>::mask_type ret;
+  ret.__s_.__storage_ = vreinterpretq_f32_u32(vcgeq_f32(a.__s_.__storage_, b.__s_.__storage_));
+  return ret;
+}
+
+inline simd<float, m128_abi>::mask_type operator<=(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  simd<float, m128_abi>::mask_type ret;
+  ret.__s_.__storage_ = vreinterpretq_f32_u32(vcleq_f32(a.__s_.__storage_, b.__s_.__storage_));
+  return ret;
+}
+
+inline simd<float, m128_abi>::mask_type operator>(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  simd<float, m128_abi>::mask_type ret;
+  ret.__s_.__storage_ = vreinterpretq_f32_u32(vcgtq_f32(a.__s_.__storage_, b.__s_.__storage_));
+  return ret;
+}
+
+inline simd<float, m128_abi>::mask_type operator<(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
+  simd<float, m128_abi>::mask_type ret;
+  ret.__s_.__storage_ = vreinterpretq_f32_u32(vcltq_f32(a.__s_.__storage_, b.__s_.__storage_));
+  return ret;
+}
+
+// __m128d storage for NEON
+template <>
+class __simd_storage<double, m128d_abi> {
+public:
+  using storage_type = float64x2x2_t;
+  using vector_type = float64x2_t;
+  storage_type __storage_{};
+  [[nodiscard]] inline double __get(size_t __index) const noexcept { return __storage_.val[__index / 2][__index % 2]; }
+  inline void __set(size_t __index, double __val) noexcept { __storage_.val[__index / 2][__index % 2] = __val; }
+  // Make GCC happy
+  static constexpr storage_type __make_array(vector_type a, vector_type b) { return {a, b}; }
+  constexpr __simd_storage(double a, double b, double c, double d)
+  : __storage_(__make_array(vector_type{a, b}, vector_type{c, d})) {}
+  constexpr void __set4(double a, double b, double c, double d) noexcept {
+    __storage_.val[0] = vector_type{a, b};
+    __storage_.val[1] = vector_type{c, d};
+  }
+  constexpr __simd_storage(double rv) : __simd_storage(rv, rv, rv, rv) {}
+  constexpr void __broadcast(double __val) noexcept { __set4(__val, __val, __val, __val); }
+  [[nodiscard]] inline double __dot2(const __simd_storage& other) const noexcept {
+    return vaddvq_f64(vmulq_f64(__storage_.val[0], other.__storage_.val[0]));
+  }
+  [[nodiscard]] inline double __dot3(const __simd_storage& other) const noexcept {
+    const vector_type mul1 = vmulq_f64(__storage_.val[0], other.__storage_.val[0]);
+    const vector_type mul2 = vmulq_f64(__storage_.val[1], other.__storage_.val[1]);
+    return vaddvq_f64(vcombine_f64(vcreate_f64(vaddvq_f64(mul1)), vget_low_f64(mul2)));
+  }
+  [[nodiscard]] inline double __dot4(const __simd_storage& other) const noexcept {
+    const vector_type mul1 = vmulq_f64(__storage_.val[0], other.__storage_.val[0]);
+    const vector_type mul2 = vmulq_f64(__storage_.val[1], other.__storage_.val[1]);
+    return vaddvq_f64(vaddq_f64(mul1, mul2));
+  }
+
+  inline void __copy_from(const simd_data<simd<double, m128d_abi>>& __buffer) noexcept {
+    __storage_ = vld2q_f64(__buffer.data());
+  }
+
+  inline void __copy_to(simd_data<simd<double, m128d_abi>>& __buffer) const noexcept {
+    vst2q_f64(__buffer.data(), __storage_);
+  }
+
+  constexpr __simd_storage() = default;
+  explicit inline __simd_storage(const __simd_storage<float, m128_abi>& other) {
+    __storage_.val[0] = vcvt_f64_f32(vget_low_f32(other.__storage_));
+    __storage_.val[1] = vcvt_f64_f32(vget_high_f32(other.__storage_));
+  }
+
+  constexpr explicit __simd_storage(const storage_type& s) : __storage_(s) {}
+  [[nodiscard]] constexpr const storage_type& __native() const { return __storage_; }
+};
+// __m128d mask storage for NEON
+template <>
+class __simd_mask_storage<double, m128d_abi> : public __simd_storage<double, m128d_abi> {
+public:
+  inline bool __get(size_t __index) const noexcept {
+    return vreinterpretq_u64_f64(__storage_.val[__index / 2])[__index % 2] != 0;
+  }
+  inline void __set(size_t __index, bool __val) noexcept {
+    uint64x2_t vec = vreinterpretq_u64_f64(__storage_.val[__index / 2]);
+    vec[__index % 2] = __val ? UINT64_MAX : 0;
+    __storage_.val[__index / 2] = vreinterpretq_f64_u64(vec);
+  }
+};
+
+template <>
+inline simd<double, m128d_abi> simd<double, m128d_abi>::operator-() const {
+  simd<double, m128d_abi> ret;
+  for (int i = 0; i < 2; ++i)
+    ret.__s_.__storage_.val[i] = vreinterpretq_f64_s64(
+        veorq_s64(vreinterpretq_s64_f64(__s_.__storage_.val[i]), vreinterpretq_s64_f64(vdupq_n_f64(-0.0))));
+  return ret;
+}
+
+inline simd<double, m128d_abi> operator+(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  simd<double, m128d_abi> ret;
+  for (int i = 0; i < 2; ++i)
+    ret.__s_.__storage_.val[i] = vaddq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
+  return ret;
+}
+
+inline simd<double, m128d_abi> operator-(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  simd<double, m128d_abi> ret;
+  for (int i = 0; i < 2; ++i)
+    ret.__s_.__storage_.val[i] = vsubq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
+  return ret;
+}
+
+inline simd<double, m128d_abi> operator*(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  simd<double, m128d_abi> ret;
+  for (int i = 0; i < 2; ++i)
+    ret.__s_.__storage_.val[i] = vmulq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
+  return ret;
+}
+
+inline simd<double, m128d_abi> operator/(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  simd<double, m128d_abi> ret;
+  for (int i = 0; i < 2; ++i)
+    ret.__s_.__storage_.val[i] = vdivq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
+  return ret;
+}
+
+inline simd<double, m128d_abi>& operator+=(simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  for (int i = 0; i < 2; ++i)
+    a.__s_.__storage_.val[i] = vaddq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
+  return a;
+}
+
+inline simd<double, m128d_abi>& operator-=(simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  for (int i = 0; i < 2; ++i)
+    a.__s_.__storage_.val[i] = vsubq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
+  return a;
+}
+
+inline simd<double, m128d_abi>& operator*=(simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  for (int i = 0; i < 2; ++i)
+    a.__s_.__storage_.val[i] = vmulq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
+  return a;
+}
+
+inline simd<double, m128d_abi>& operator/=(simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
+  for (int i = 0; i < 2; ++i)
+    a.__s_.__storage_.val[i] = vdivq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
+  return a;
+}
+
+inline simd<double, m128d_abi>::mask_type operator==(const simd<double, m128d_abi>& a,
+                                                     const simd<double, m128d_abi>& b) {
+  simd<double, m128d_abi>::mask_type ret;
+  for (int i = 0; i < 2; ++i)
+    ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vceqq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]));
+  return ret;
+}
+
+inline simd<double, m128d_abi>::mask_type operator!=(const simd<double, m128d_abi>& a,
+                                                     const simd<double, m128d_abi>& b) {
+  simd<double, m128d_abi>::mask_type ret;
+  for (int i = 0; i < 2; ++i)
+    ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vreinterpretq_u64_u32(
+        vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i])))));
+  return ret;
+}
+
+inline simd<double, m128d_abi>::mask_type operator>=(const simd<double, m128d_abi>& a,
+                                                     const simd<double, m128d_abi>& b) {
+  simd<double, m128d_abi>::mask_type ret;
+  for (int i = 0; i < 2; ++i)
+    ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vcgeq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]));
+  return ret;
+}
+
+inline simd<double, m128d_abi>::mask_type operator<=(const simd<double, m128d_abi>& a,
+                                                     const simd<double, m128d_abi>& b) {
+  simd<double, m128d_abi>::mask_type ret;
+  for (int i = 0; i < 2; ++i)
+    ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vcleq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]));
+  return ret;
+}
+
+inline simd<double, m128d_abi>::mask_type operator>(const simd<double, m128d_abi>& a,
+                                                    const simd<double, m128d_abi>& b) {
+  simd<double, m128d_abi>::mask_type ret;
+  for (int i = 0; i < 2; ++i)
+    ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vcgtq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]));
+  return ret;
+}
+
+inline simd<double, m128d_abi>::mask_type operator<(const simd<double, m128d_abi>& a,
+                                                    const simd<double, m128d_abi>& b) {
+  simd<double, m128d_abi>::mask_type ret;
+  for (int i = 0; i < 2; ++i)
+    ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vcltq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]));
+  return ret;
+}
+
+inline __simd_storage<float, m128_abi>::__simd_storage(const __simd_storage<double, m128d_abi>& other) {
+  __storage_ = vcombine_f32(vcvt_f32_f64(other.__storage_.val[0]), vcvt_f32_f64(other.__storage_.val[1]));
+}
+
+namespace simd_abi {
+template <typename T>
+struct zeus_native {};
+template <>
+struct zeus_native<float> {
+  using type = m128_abi;
+};
+template <>
+struct zeus_native<double> {
+  using type = m128d_abi;
+};
+} // namespace simd_abi
+
+} // namespace zeus::_simd
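Note (not part of the patch): __dot3 in the new storage zeroes the w lane before the horizontal add, so a stale fourth component can never leak into a 3-component dot product. The same trick restated standalone, using only AArch64 intrinsics that appear in the new file:

    #include <arm_neon.h>

    static inline float dot3(float32x4_t a, float32x4_t b) {
      const float32x4_t prod = vmulq_f32(a, b);        // lane-wise products
      return vaddvq_f32(vsetq_lane_f32(0.f, prod, 3)); // clear lane 3, add all lanes
    }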
diff --git a/include/zeus/simd/simd_sse.hpp b/include/zeus/simd/simd_sse.hpp
index a465a52..aa93d40 100644
--- a/include/zeus/simd/simd_sse.hpp
+++ b/include/zeus/simd/simd_sse.hpp
@@ -29,23 +29,14 @@ template <>
 class __simd_storage<float, m128_abi> {
 public:
   using storage_type = __m128;
-  storage_type __storage_;
-  float __get(size_t __index) const noexcept {
-    alignas(16) std::array<float, 4> sse_data;
-    _mm_store_ps(sse_data.data(), __storage_);
-    return sse_data[__index];
-  }
-  void __set(size_t __index, float __val) noexcept {
-    alignas(16) std::array<float, 4> sse_data;
-    _mm_store_ps(sse_data.data(), __storage_);
-    sse_data[__index] = __val;
-    __storage_ = _mm_load_ps(sse_data.data());
-  }
+  storage_type __storage_{};
+  [[nodiscard]] inline float __get(size_t __index) const noexcept { return __storage_[__index]; }
+  inline void __set(size_t __index, float __val) noexcept { __storage_[__index] = __val; }
   constexpr __simd_storage(float a, float b, float c, float d) : __storage_{a, b, c, d} {}
-  void __set4(float a, float b, float c, float d) noexcept { __storage_ = _mm_set_ps(d, c, b, a); }
-  constexpr __simd_storage(float rv) : __storage_{rv, rv, rv, rv} {}
-  void __broadcast(float __val) noexcept { __storage_ = _mm_set1_ps(__val); }
-  float __dot2(const __simd_storage& other) const noexcept {
+  constexpr void __set4(float a, float b, float c, float d) noexcept { __storage_ = storage_type{a, b, c, d}; }
+  constexpr explicit __simd_storage(float rv) : __storage_{rv, rv, rv, rv} {}
+  inline void __broadcast(float __val) noexcept { __storage_ = _mm_set1_ps(__val); }
+  [[nodiscard]] inline float __dot2(const __simd_storage& other) const noexcept {
 #if __SSE4_1__
     float ret;
     _mm_store_ss(&ret, _mm_dp_ps(__storage_, other.__storage_, 0x3F));
@@ -56,7 +47,7 @@ public:
     return sse_data[0] + sse_data[1];
 #endif
   }
-  float __dot3(const __simd_storage& other) const noexcept {
+  [[nodiscard]] inline float __dot3(const __simd_storage& other) const noexcept {
 #if __SSE4_1__
     float ret;
     _mm_store_ss(&ret, _mm_dp_ps(__storage_, other.__storage_, 0x7F));
@@ -67,7 +58,7 @@ public:
     return sse_data[0] + sse_data[1] + sse_data[2];
 #endif
   }
-  float __dot4(const __simd_storage& other) const noexcept {
+  [[nodiscard]] inline float __dot4(const __simd_storage& other) const noexcept {
 #if __SSE4_1__
     float ret;
     _mm_store_ss(&ret, _mm_dp_ps(__storage_, other.__storage_, 0xFF));
@@ -78,40 +69,39 @@ public:
     return sse_data[0] + sse_data[1] + sse_data[2] + sse_data[3];
 #endif
   }
+
   template <int x, int y, int z, int w>
-  __simd_storage __shuffle() const noexcept {
-    __simd_storage s;
-    s.__storage_ = _mm_shuffle_ps(__storage_, __storage_, _MM_SHUFFLE(w, z, y, x));
-    return s;
-  }
+  [[nodiscard]] constexpr __simd_storage __shuffle() const noexcept {
+    return __simd_storage(_mm_shuffle_ps(__storage_, __storage_, _MM_SHUFFLE(w, z, y, x)));
+  }
 
-  void __copy_from(const simd_data<simd<float, m128_abi>>& __buffer) noexcept {
+  inline void __copy_from(const simd_data<simd<float, m128_abi>>& __buffer) noexcept {
     __storage_ = _mm_load_ps(__buffer.data());
   }
 
-  void __copy_to(simd_data<simd<float, m128_abi>>& __buffer) const noexcept {
+  inline void __copy_to(simd_data<simd<float, m128_abi>>& __buffer) const noexcept {
     _mm_store_ps(__buffer.data(), __storage_);
   }
 
   __simd_storage() = default;
-  explicit __simd_storage(const __simd_storage<double, m128d_abi>& other);
+  explicit inline __simd_storage(const __simd_storage<double, m128d_abi>& other);
 
 #ifdef __AVX__
-  explicit __simd_storage(const __simd_storage<double, m256d_abi>& other);
+  explicit inline __simd_storage(const __simd_storage<double, m256d_abi>& other);
 #endif
 
-  explicit __simd_storage(const storage_type& s) : __storage_(s) {}
-  const storage_type& __native() const { return __storage_; }
+  constexpr explicit __simd_storage(const storage_type& s) : __storage_(s) {}
+  [[nodiscard]] constexpr const storage_type& __native() const { return __storage_; }
 };
 // __m128 mask storage for SSE2+
 template <>
 class __simd_mask_storage<float, m128_abi> : public __simd_storage<float, m128_abi> {
 public:
-  bool __get(size_t __index) const noexcept {
+  [[nodiscard]] inline bool __get(size_t __index) const noexcept {
     alignas(16) uint32_t sse_data[4];
     _mm_store_ps(reinterpret_cast<float*>(sse_data), __storage_);
     return sse_data[__index] != 0;
   }
-  void __set(size_t __index, bool __val) noexcept {
+  inline void __set(size_t __index, bool __val) noexcept {
     alignas(16) uint32_t sse_data[4];
     _mm_store_ps(reinterpret_cast<float*>(sse_data), __storage_);
     sse_data[__index] = __val ? UINT32_MAX : 0;
@@ -125,27 +115,19 @@ inline simd<float, m128_abi> simd<float, m128_abi>::operator-() const {
 }
 
 inline simd<float, m128_abi> operator+(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
-  simd<float, m128_abi> ret;
-  ret.__s_.__storage_ = _mm_add_ps(a.__s_.__storage_, b.__s_.__storage_);
-  return ret;
+  return _mm_add_ps(a.__s_.__storage_, b.__s_.__storage_);
 }
 
 inline simd<float, m128_abi> operator-(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
-  simd<float, m128_abi> ret;
-  ret.__s_.__storage_ = _mm_sub_ps(a.__s_.__storage_, b.__s_.__storage_);
-  return ret;
+  return _mm_sub_ps(a.__s_.__storage_, b.__s_.__storage_);
 }
 
 inline simd<float, m128_abi> operator*(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
-  simd<float, m128_abi> ret;
-  ret.__s_.__storage_ = _mm_mul_ps(a.__s_.__storage_, b.__s_.__storage_);
-  return ret;
+  return _mm_mul_ps(a.__s_.__storage_, b.__s_.__storage_);
 }
 
 inline simd<float, m128_abi> operator/(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
-  simd<float, m128_abi> ret;
-  ret.__s_.__storage_ = _mm_div_ps(a.__s_.__storage_, b.__s_.__storage_);
-  return ret;
+  return _mm_div_ps(a.__s_.__storage_, b.__s_.__storage_);
 }
 
 inline simd<float, m128_abi>& operator+=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
@@ -209,31 +191,19 @@ template <>
 class __simd_storage<double, m128d_abi> {
 public:
   using storage_type = std::array<__m128d, 2>;
-  storage_type __storage_;
-  double __get(size_t __index) const noexcept {
-    alignas(16) std::array<double, 2> sse_data;
-    _mm_store_pd(sse_data.data(), __storage_[__index / 2]);
-    return sse_data[__index % 2];
-  }
-  void __set(size_t __index, double __val) noexcept {
-    alignas(16) std::array<double, 2> sse_data;
-    _mm_store_pd(sse_data.data(), __storage_[__index / 2]);
-    sse_data[__index % 2] = __val;
-    __storage_[__index / 2] = _mm_load_pd(sse_data.data());
-  }
+  storage_type __storage_{};
+  [[nodiscard]] inline double __get(size_t __index) const noexcept { return __storage_[__index / 2][__index % 2]; }
+  inline void __set(size_t __index, double __val) noexcept { __storage_[__index / 2][__index % 2] = __val; }
+
   // Make GCC happy
   static constexpr storage_type __make_array(__m128d a, __m128d b) { return {a, b}; }
-  constexpr __simd_storage(double a, double b, double c, double d)
-  : __storage_(__make_array(__m128d{a, b}, __m128d{c, d})) {}
-  void __set4(double a, double b, double c, double d) noexcept {
-    __storage_[0] = _mm_set_pd(b, a);
-    __storage_[1] = _mm_set_pd(d, c);
-  }
+  constexpr __simd_storage(double a, double b, double c, double d) : __storage_(__make_array(__m128d{a, b}, __m128d{c, d})) {}
+  constexpr void __set4(double a, double b, double c, double d) noexcept {
+    __storage_[0] = __m128d{a, b};
+    __storage_[1] = __m128d{c, d};
+  }
-  constexpr __simd_storage(double rv) : __storage_(__make_array(__m128d{rv, rv}, __m128d{rv, rv})) {}
-  void __broadcast(double __val) noexcept {
-    for (int i = 0; i < 2; ++i)
-      __storage_[i] = _mm_set1_pd(__val);
-  }
-  double __dot2(const __simd_storage& other) const noexcept {
+  constexpr __simd_storage(double rv) : __simd_storage(rv, rv, rv, rv) {}
+  constexpr void __broadcast(double __val) noexcept { __set4(__val, __val, __val, __val); }
+  [[nodiscard]] inline double __dot2(const __simd_storage& other) const noexcept {
 #if __SSE4_1__
     double ret;
     _mm_store_sd(&ret, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F));
@@ -244,7 +214,7 @@ public:
     return sse_data[0] + sse_data[1];
 #endif
   }
-  double __dot3(const __simd_storage& other) const noexcept {
+  [[nodiscard]] inline double __dot3(const __simd_storage& other) const noexcept {
 #if __SSE4_1__
     double ret;
     _mm_store_sd(&ret, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F));
@@ -259,7 +229,7 @@ public:
     return sse_data[0] + sse_data[1] + sse_data2[0];
 #endif
   }
-  double __dot4(const __simd_storage& other) const noexcept {
+  [[nodiscard]] inline double __dot4(const __simd_storage& other) const noexcept {
 #if __SSE4_1__
     double ret;
     _mm_store_sd(&ret, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F));
@@ -275,24 +245,24 @@ public:
 #endif
   }
 
-  void __copy_from(const simd_data<simd<double, m128d_abi>>& __buffer) noexcept {
+  inline void __copy_from(const simd_data<simd<double, m128d_abi>>& __buffer) noexcept {
     __storage_[0] = _mm_load_pd(__buffer.data());
     __storage_[1] = _mm_load_pd(__buffer.data() + 2);
   }
 
-  void __copy_to(simd_data<simd<double, m128d_abi>>& __buffer) const noexcept {
+  inline void __copy_to(simd_data<simd<double, m128d_abi>>& __buffer) const noexcept {
     _mm_store_pd(__buffer.data(), __storage_[0]);
     _mm_store_pd(__buffer.data() + 2, __storage_[1]);
   }
 
-  __simd_storage() = default;
-  explicit __simd_storage(const __simd_storage<float, m128_abi>& other) {
+  constexpr __simd_storage() = default;
+  inline explicit __simd_storage(const __simd_storage<float, m128_abi>& other) {
     __storage_[0] = _mm_cvtps_pd(other.__storage_);
     __storage_[1] = _mm_cvtps_pd(_mm_movehl_ps(other.__storage_, other.__storage_));
   }
 
-  explicit __simd_storage(const storage_type& s) : __storage_(s) {}
-  const storage_type& __native() const { return __storage_; }
+  constexpr explicit __simd_storage(const storage_type& s) : __storage_(s) {}
+  [[nodiscard]] constexpr const storage_type& __native() const { return __storage_; }
 };
 // __m128d mask storage for SSE2+
 template <>
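Note (not part of the patch): the simplified __get/__set above index __m128/__m128d directly, which relies on the GCC/Clang vector-extension subscript rather than a standard intrinsic; only the mask storage keeps the store/reload round trip. A minimal illustration of that assumption (not portable to MSVC):

    #include <xmmintrin.h>

    static inline float lane(__m128 v, int i) {
      return v[i]; // GCC/Clang vector subscript; MSVC would still need _mm_store_ps
    }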
diff --git a/src/CMatrix3f.cpp b/src/CMatrix3f.cpp
index 9dda9c0..7dcd16c 100644
--- a/src/CMatrix3f.cpp
+++ b/src/CMatrix3f.cpp
@@ -34,8 +34,8 @@ void CMatrix3f::transpose() {
   m[1].mSimd = _mm_movehl_ps(T2, T0);
   m[2].mSimd = _mm_movelh_ps(T1, T3);
 #elif __ARM_NEON
-  float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]);
-  float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]);
+  float32x4x2_t P0 = vzipq_f32(m[0].mSimd.native(), m[2].mSimd.native());
+  float32x4x2_t P1 = vzipq_f32(m[1].mSimd.native(), m[3].mSimd.native());
   float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
   float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);
@@ -69,8 +69,8 @@ CMatrix3f CMatrix3f::transposed() const {
   __m128 T3 = _mm_unpackhi_ps(m[2].mSimd.native(), zero);
   return CMatrix3f(_mm_movelh_ps(T0, T2), _mm_movehl_ps(T2, T0), _mm_movelh_ps(T1, T3));
 #elif __ARM_NEON
-  float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]);
-  float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]);
+  float32x4x2_t P0 = vzipq_f32(m[0].mSimd.native(), m[2].mSimd.native());
+  float32x4x2_t P1 = vzipq_f32(m[1].mSimd.native(), m[3].mSimd.native());
   float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
   float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);
diff --git a/src/CMatrix4f.cpp b/src/CMatrix4f.cpp
index c663b8e..32bb56f 100644
--- a/src/CMatrix4f.cpp
+++ b/src/CMatrix4f.cpp
@@ -15,8 +15,8 @@ CMatrix4f CMatrix4f::transposed() const {
   ret.m[2].mSimd = _mm_movelh_ps(T1, T3);
   ret.m[3].mSimd = _mm_movehl_ps(T3, T1);
 #elif __ARM_NEON
-  float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]);
-  float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]);
+  float32x4x2_t P0 = vzipq_f32(m[0].mSimd.native(), m[2].mSimd.native());
+  float32x4x2_t P1 = vzipq_f32(m[1].mSimd.native(), m[3].mSimd.native());
   float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
   float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);
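Note (not part of the patch): the NEON branches now read the rows through mSimd.native() instead of a nonexistent local M. The underlying 4x4 transpose idiom is two rounds of vzipq_f32; a standalone restatement with illustrative lane comments:

    #include <arm_neon.h>

    static inline void transpose4(float32x4_t r[4]) {
      const float32x4x2_t p0 = vzipq_f32(r[0], r[2]); // a0 c0 a1 c1 | a2 c2 a3 c3
      const float32x4x2_t p1 = vzipq_f32(r[1], r[3]); // b0 d0 b1 d1 | b2 d2 b3 d3
      const float32x4x2_t t0 = vzipq_f32(p0.val[0], p1.val[0]);
      const float32x4x2_t t1 = vzipq_f32(p0.val[1], p1.val[1]);
      r[0] = t0.val[0]; // a0 b0 c0 d0
      r[1] = t0.val[1]; // a1 b1 c1 d1
      r[2] = t1.val[0]; // a2 b2 c2 d2
      r[3] = t1.val[1]; // a3 b3 c3 d3
    }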
diff --git a/src/Math.cpp b/src/Math.cpp
index 3b1cc5d..81b17e0 100644
--- a/src/Math.cpp
+++ b/src/Math.cpp
@@ -10,10 +10,8 @@
 
 #if _WIN32
 #include <intrin.h>
-#else
-
+#elif __x86_64__
 #include <cpuid.h>
-
 #endif
 
 namespace zeus {
@@ -23,8 +21,8 @@ static CPUInfo g_cpuFeatures = {};
 static CPUInfo g_missingFeatures = {};
 
 void getCpuInfo(int eax, int regs[4]) {
-#if !GEKKO
-#if _WIN32
+#if __x86_64__
+#if _WIN32
   __cpuid(regs, eax);
 #else
   __cpuid(eax, regs[0], regs[1], regs[2], regs[3]);
@@ -33,8 +31,8 @@ void getCpuInfo(int eax, int regs[4]) {
 }
 
 void getCpuInfoEx(int eax, int ecx, int regs[4]) {
-#if !GEKKO
-#if _WIN32
+#if __x86_64__
+#if _WIN32
   __cpuidex(regs, eax, ecx);
 #else
   __cpuid_count(eax, ecx, regs[0], regs[1], regs[2], regs[3]);
@@ -43,7 +41,7 @@ void getCpuInfoEx(int eax, int ecx, int regs[4]) {
 }
 
 void detectCPU() {
-#if !GEKKO
+#if __x86_64__
   if (isCPUInit)
     return;
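Note (not part of the patch): with the guards switched from !GEKKO to __x86_64__, the cpuid helpers compile to empty bodies on ARM targets, so callers see zeroed regs instead of a build error. A hedged usage sketch; it assumes getCpuInfo is declared in zeus/Math.hpp:

    #include <cstdio>
    #include <zeus/Math.hpp>

    int main() {
      int regs[4] = {};
      zeus::getCpuInfo(0, regs); // leaf 0; a no-op (regs stays zeroed) off x86_64
      // On x86, EBX/EDX/ECX (regs[1], regs[3], regs[2]) spell the vendor string.
      std::printf("%.4s%.4s%.4s\n", reinterpret_cast<const char*>(&regs[1]),
                  reinterpret_cast<const char*>(&regs[3]), reinterpret_cast<const char*>(&regs[2]));
    }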