Add NEON support & more constexpr

This commit is contained in:
Luke Street 2020-05-01 19:20:31 -04:00
parent eec855018a
commit 566db657d9
13 changed files with 483 additions and 178 deletions

View File

@ -56,13 +56,9 @@ public:
#endif #endif
[[nodiscard]] bool intersects(const CAABox& other) const { [[nodiscard]] bool intersects(const CAABox& other) const {
const bool x1 = max[0] >= other.min[0]; const auto mmax = max >= other.min;
const bool x2 = min[0] <= other.max[0]; const auto mmin = min <= other.max;
const bool y1 = max[1] >= other.min[1]; return mmax[0] && mmax[1] && mmax[2] && mmin[0] && mmin[1] && mmin[2];
const bool y2 = min[1] <= other.max[1];
const bool z1 = max[2] >= other.min[2];
const bool z2 = min[2] <= other.max[2];
return x1 && x2 && y1 && y2 && z1 && z2;
} }
[[nodiscard]] bool intersects(const CSphere& other) const; [[nodiscard]] bool intersects(const CSphere& other) const;
@ -70,10 +66,9 @@ public:
[[nodiscard]] CAABox booleanIntersection(const CAABox& other) const; [[nodiscard]] CAABox booleanIntersection(const CAABox& other) const;
[[nodiscard]] bool inside(const CAABox& other) const { [[nodiscard]] bool inside(const CAABox& other) const {
const bool x = min[0] >= other.min[0] && max[0] <= other.max[0]; const auto mmax = max <= other.max;
const bool y = min[1] >= other.min[1] && max[1] <= other.max[1]; const auto mmin = min >= other.min;
const bool z = min[2] >= other.min[2] && max[2] <= other.max[2]; return mmax[0] && mmax[1] && mmax[2] && mmin[0] && mmin[1] && mmin[2];
return x && y && z;
} }
[[nodiscard]] bool insidePlane(const CPlane& plane) const { [[nodiscard]] bool insidePlane(const CPlane& plane) const {

View File

@ -23,13 +23,13 @@ public:
constexpr CMatrix3f(float m00, float m01, float m02, float m10, float m11, float m12, float m20, float m21, float m22) constexpr CMatrix3f(float m00, float m01, float m02, float m10, float m11, float m12, float m20, float m21, float m22)
: m{{{m00, m10, m20}, {m01, m11, m21}, {m02, m12, m22}}} {} : m{{{m00, m10, m20}, {m01, m11, m21}, {m02, m12, m22}}} {}
CMatrix3f(const CVector3f& scaleVec) { constexpr CMatrix3f(const CVector3f& scaleVec) {
m[0][0] = scaleVec[0]; m[0][0] = scaleVec[0];
m[1][1] = scaleVec[1]; m[1][1] = scaleVec[1];
m[2][2] = scaleVec[2]; m[2][2] = scaleVec[2];
} }
CMatrix3f(float scale) : CMatrix3f(CVector3f(scale)) {} constexpr CMatrix3f(float scale) : CMatrix3f(CVector3f(scale)) {}
constexpr CMatrix3f(const CVector3f& r0, const CVector3f& r1, const CVector3f& r2) : m{{r0, r1, r2}} {} constexpr CMatrix3f(const CVector3f& r0, const CVector3f& r1, const CVector3f& r2) : m{{r0, r1, r2}} {}
@ -71,7 +71,7 @@ public:
CMatrix3f(const CQuaternion& quat); CMatrix3f(const CQuaternion& quat);
CMatrix3f& operator=(const CMatrix3f& other) = default; constexpr CMatrix3f& operator=(const CMatrix3f& other) = default;
[[nodiscard]] CVector3f operator*(const CVector3f& other) const { [[nodiscard]] CVector3f operator*(const CVector3f& other) const {
return m[0].mSimd * other.mSimd.shuffle<0, 0, 0, 0>() + m[1].mSimd * other.mSimd.shuffle<1, 1, 1, 1>() + return m[0].mSimd * other.mSimd.shuffle<0, 0, 0, 0>() + m[1].mSimd * other.mSimd.shuffle<1, 1, 1, 1>() +

View File

@ -13,7 +13,7 @@ namespace zeus {
class CVector2f { class CVector2f {
public: public:
simd<float> mSimd; simd<float> mSimd;
constexpr CVector2f() : mSimd(0.f) {} constexpr CVector2f() : mSimd() {}
template <typename T> template <typename T>
constexpr CVector2f(const simd<T>& s) : mSimd(s) {} constexpr CVector2f(const simd<T>& s) : mSimd(s) {}
@ -43,11 +43,8 @@ public:
explicit constexpr CVector2f(float xy) : mSimd(xy) {} explicit constexpr CVector2f(float xy) : mSimd(xy) {}
void assign(float x, float y) { constexpr void assign(float x, float y) {
mSimd[0] = x; mSimd.set(x, y);
mSimd[1] = y;
mSimd[2] = 0.0f;
mSimd[3] = 0.0f;
} }
constexpr CVector2f(float x, float y) : mSimd(x, y, 0.f, 0.f) {} constexpr CVector2f(float x, float y) : mSimd(x, y, 0.f, 0.f) {}
@ -150,19 +147,19 @@ public:
return *this * mag; return *this * mag;
} }
[[nodiscard]] CVector2f perpendicularVector() const { return {-y(), x()}; } [[nodiscard]] constexpr CVector2f perpendicularVector() const { return {-y(), x()}; }
[[nodiscard]] float cross(const CVector2f& rhs) const { return (x() * rhs.y()) - (y() * rhs.x()); } [[nodiscard]] constexpr float cross(const CVector2f& rhs) const { return (x() * rhs.y()) - (y() * rhs.x()); }
[[nodiscard]] float dot(const CVector2f& rhs) const { return mSimd.dot2(rhs.mSimd); } [[nodiscard]] constexpr float dot(const CVector2f& rhs) const { return mSimd.dot2(rhs.mSimd); }
[[nodiscard]] float magSquared() const { return mSimd.dot2(mSimd); } [[nodiscard]] constexpr float magSquared() const { return mSimd.dot2(mSimd); }
[[nodiscard]] float magnitude() const { return std::sqrt(magSquared()); } [[nodiscard]] constexpr float magnitude() const { return std::sqrt(magSquared()); }
void zeroOut() { mSimd = zeus::simd<float>(0.f); } constexpr void zeroOut() { mSimd = 0.f; }
void splat(float xy) { mSimd = zeus::simd<float>(xy); } constexpr void splat(float xy) { mSimd = xy; }
[[nodiscard]] static float getAngleDiff(const CVector2f& a, const CVector2f& b); [[nodiscard]] static float getAngleDiff(const CVector2f& a, const CVector2f& b);
@ -176,9 +173,9 @@ public:
[[nodiscard]] static CVector2f slerp(const CVector2f& a, const CVector2f& b, float t); [[nodiscard]] static CVector2f slerp(const CVector2f& a, const CVector2f& b, float t);
[[nodiscard]] bool isNormalized() const { return std::fabs(1.f - magSquared()) < 0.01f; } [[nodiscard]] constexpr bool isNormalized() const { return std::fabs(1.f - magSquared()) < 0.01f; }
[[nodiscard]] bool canBeNormalized() const { [[nodiscard]] constexpr bool canBeNormalized() const {
if (std::isinf(x()) || std::isinf(y())) if (std::isinf(x()) || std::isinf(y()))
return false; return false;
return std::fabs(x()) >= FLT_EPSILON || std::fabs(y()) >= FLT_EPSILON; return std::fabs(x()) >= FLT_EPSILON || std::fabs(y()) >= FLT_EPSILON;
@ -191,21 +188,21 @@ public:
return (diffVec.x() <= epsilon && diffVec.y() <= epsilon); return (diffVec.x() <= epsilon && diffVec.y() <= epsilon);
} }
[[nodiscard]] simd<float>::reference operator[](size_t idx) { [[nodiscard]] constexpr simd<float>::reference operator[](size_t idx) {
assert(idx < 2); assert(idx < 2);
return mSimd[idx]; return mSimd[idx];
} }
[[nodiscard]] float operator[](size_t idx) const { [[nodiscard]] constexpr float operator[](size_t idx) const {
assert(idx < 2); assert(idx < 2);
return mSimd[idx]; return mSimd[idx];
} }
[[nodiscard]] float x() const { return mSimd[0]; } [[nodiscard]] constexpr float x() const { return mSimd[0]; }
[[nodiscard]] float y() const { return mSimd[1]; } [[nodiscard]] constexpr float y() const { return mSimd[1]; }
[[nodiscard]] simd<float>::reference x() { return mSimd[0]; } [[nodiscard]] constexpr simd<float>::reference x() { return mSimd[0]; }
[[nodiscard]] simd<float>::reference y() { return mSimd[1]; } [[nodiscard]] constexpr simd<float>::reference y() { return mSimd[1]; }
}; };
constexpr inline CVector2f skOne2f(1.f); constexpr inline CVector2f skOne2f(1.f);
constexpr inline CVector2f skNegOne2f(-1.f); constexpr inline CVector2f skNegOne2f(-1.f);

View File

@ -14,7 +14,7 @@ namespace zeus {
class CVector3d { class CVector3d {
public: public:
zeus::simd<double> mSimd; zeus::simd<double> mSimd;
constexpr CVector3d() : mSimd(0.0) {} constexpr CVector3d() : mSimd() {}
template <typename T> template <typename T>
constexpr CVector3d(const simd<T>& s) : mSimd(s) {} constexpr CVector3d(const simd<T>& s) : mSimd(s) {}

View File

@ -19,7 +19,7 @@ class CRelAngle;
class CVector3f { class CVector3f {
public: public:
zeus::simd<float> mSimd; zeus::simd<float> mSimd;
constexpr CVector3f() : mSimd(0.f) {} constexpr CVector3f() : mSimd() {}
template <typename T> template <typename T>
constexpr CVector3f(const simd<T>& s) : mSimd(s) {} constexpr CVector3f(const simd<T>& s) : mSimd(s) {}
@ -53,7 +53,7 @@ public:
explicit constexpr CVector3f(float xyz) : mSimd(xyz) {} explicit constexpr CVector3f(float xyz) : mSimd(xyz) {}
void assign(float x, float y, float z) { mSimd = zeus::simd<float>(x, y, z); } void assign(float x, float y, float z) { mSimd.set(x, y, z); }
constexpr CVector3f(float x, float y, float z) : mSimd(x, y, z) {} constexpr CVector3f(float x, float y, float z) : mSimd(x, y, z) {}
@ -62,17 +62,22 @@ public:
CVector3f(const CVector2f& other, float z = 0.f) { CVector3f(const CVector2f& other, float z = 0.f) {
mSimd = other.mSimd; mSimd = other.mSimd;
mSimd[2] = z; mSimd[2] = z;
mSimd[3] = 0.f;
} }
[[nodiscard]] CVector2f toVec2f() const { return CVector2f(mSimd); } [[nodiscard]] CVector2f toVec2f() const { return CVector2f(mSimd); }
[[nodiscard]] bool operator==(const CVector3f& rhs) const { [[nodiscard]] bool operator==(const CVector3f& rhs) const {
return mSimd[0] == rhs.mSimd[0] && mSimd[1] == rhs.mSimd[1] && mSimd[2] == rhs.mSimd[2]; const auto mask = mSimd == rhs.mSimd;
return mask[0] && mask[1] && mask[2];
} }
[[nodiscard]] bool operator!=(const CVector3f& rhs) const { return !(*this == rhs); } [[nodiscard]] bool operator!=(const CVector3f& rhs) const { return !(*this == rhs); }
[[nodiscard]] simd<float>::mask_type operator>(const CVector3f& rhs) const { return mSimd > rhs.mSimd; }
[[nodiscard]] simd<float>::mask_type operator>=(const CVector3f& rhs) const { return mSimd >= rhs.mSimd; }
[[nodiscard]] simd<float>::mask_type operator<(const CVector3f& rhs) const { return mSimd < rhs.mSimd; }
[[nodiscard]] simd<float>::mask_type operator<=(const CVector3f& rhs) const { return mSimd <= rhs.mSimd; }
[[nodiscard]] CVector3f operator+(const CVector3f& rhs) const { return mSimd + rhs.mSimd; } [[nodiscard]] CVector3f operator+(const CVector3f& rhs) const { return mSimd + rhs.mSimd; }
[[nodiscard]] CVector3f operator-(const CVector3f& rhs) const { return mSimd - rhs.mSimd; } [[nodiscard]] CVector3f operator-(const CVector3f& rhs) const { return mSimd - rhs.mSimd; }
@ -83,16 +88,13 @@ public:
[[nodiscard]] CVector3f operator/(const CVector3f& rhs) const { return mSimd / rhs.mSimd; } [[nodiscard]] CVector3f operator/(const CVector3f& rhs) const { return mSimd / rhs.mSimd; }
[[nodiscard]] CVector3f operator+(float val) const { return mSimd + zeus::simd<float>(val); } [[nodiscard]] CVector3f operator+(float val) const { return mSimd + val; }
[[nodiscard]] CVector3f operator-(float val) const { return mSimd - zeus::simd<float>(val); } [[nodiscard]] CVector3f operator-(float val) const { return mSimd - val; }
[[nodiscard]] CVector3f operator*(float val) const { return mSimd * zeus::simd<float>(val); } [[nodiscard]] CVector3f operator*(float val) const { return mSimd * val; }
[[nodiscard]] CVector3f operator/(float val) const { [[nodiscard]] CVector3f operator/(float val) const { return mSimd / val; }
const float ooval = 1.f / val;
return mSimd * zeus::simd<float>(ooval);
}
const CVector3f& operator+=(const CVector3f& rhs) { const CVector3f& operator+=(const CVector3f& rhs) {
mSimd += rhs.mSimd; mSimd += rhs.mSimd;
@ -125,7 +127,7 @@ public:
} }
[[nodiscard]] CVector3f cross(const CVector3f& rhs) const { [[nodiscard]] CVector3f cross(const CVector3f& rhs) const {
return CVector3f(y() * rhs.z() - z() * rhs.y(), z() * rhs.x() - x() * rhs.z(), x() * rhs.y() - y() * rhs.x()); return {y() * rhs.z() - z() * rhs.y(), z() * rhs.x() - x() * rhs.z(), x() * rhs.y() - y() * rhs.x()};
} }
[[nodiscard]] float dot(const CVector3f& rhs) const { return mSimd.dot3(rhs.mSimd); } [[nodiscard]] float dot(const CVector3f& rhs) const { return mSimd.dot3(rhs.mSimd); }
@ -138,9 +140,9 @@ public:
[[nodiscard]] bool isMagnitudeSafe() const { return isNotInf() && magSquared() >= 9.9999994e-29; } [[nodiscard]] bool isMagnitudeSafe() const { return isNotInf() && magSquared() >= 9.9999994e-29; }
void zeroOut() { mSimd = zeus::simd<float>(0.f); } void zeroOut() { mSimd.broadcast(0.f); }
void splat(float xyz) { mSimd = zeus::simd<float>(xyz); } void splat(float xyz) { mSimd.broadcast(xyz); }
[[nodiscard]] static float getAngleDiff(const CVector3f& a, const CVector3f& b); [[nodiscard]] static float getAngleDiff(const CVector3f& a, const CVector3f& b);

View File

@ -18,7 +18,7 @@ class CVector4f {
public: public:
zeus::simd<float> mSimd; zeus::simd<float> mSimd;
constexpr CVector4f() : mSimd(0.f) {} constexpr CVector4f() : mSimd() {}
template <typename T> template <typename T>
constexpr CVector4f(const simd<T>& s) : mSimd(s) {} constexpr CVector4f(const simd<T>& s) : mSimd(s) {}

View File

@ -713,7 +713,7 @@ public:
operator _Vp() const { return __ptr_->__get(__index_); } operator _Vp() const { return __ptr_->__get(__index_); }
__simd_reference operator=(_Vp __value) && { constexpr __simd_reference& operator=(_Vp __value) && {
__ptr_->__set(__index_, __value); __ptr_->__set(__index_, __value);
return *this; return *this;
} }
@ -1199,9 +1199,9 @@ public:
using mask_type = simd_mask<_Tp, _Abi>; using mask_type = simd_mask<_Tp, _Abi>;
using abi_type = _Abi; using abi_type = _Abi;
simd() = default; constexpr simd() = default;
simd(const simd&) = default; constexpr simd(const simd&) = default;
simd& operator=(const simd&) = default; constexpr simd& operator=(const simd&) = default;
static constexpr size_t size() noexcept { return simd_size<_Tp, _Abi>::value; } static constexpr size_t size() noexcept { return simd_size<_Tp, _Abi>::value; }
@ -1230,13 +1230,14 @@ private:
} }
template <class _Generator, size_t... __indicies> template <class _Generator, size_t... __indicies>
void __generator_init(_Generator&& __g, std::index_sequence<__indicies...>) { constexpr void __generator_init(_Generator&& __g, std::index_sequence<__indicies...>) {
int __not_used[]{((*this)[__indicies] = __g(std::integral_constant<size_t, __indicies>()), 0)...}; int __not_used[]{((*this)[__indicies] = __g(std::integral_constant<size_t, __indicies>()), 0)...};
(void)__not_used; (void)__not_used;
} }
public: public:
simd(const typename __simd_storage<_Tp, _Abi>::storage_type& s) : __s_(s) {} constexpr simd(const __simd_storage<_Tp, _Abi>& s) : __s_(s) {}
constexpr simd(const typename __simd_storage<_Tp, _Abi>::storage_type& s) : __s_(s) {}
#if 0 #if 0
// implicit type conversion constructor // implicit type conversion constructor
@ -1254,7 +1255,7 @@ public:
template <class _Up, class _UAbi, template <class _Up, class _UAbi,
class = typename std::enable_if< class = typename std::enable_if<
std::is_constructible<__simd_storage<_Tp, _Abi>, __simd_storage<_Up, _UAbi>>::value>> std::is_constructible<__simd_storage<_Tp, _Abi>, __simd_storage<_Up, _UAbi>>::value>>
simd(const simd<_Up, _UAbi>& __v) : __s_(__v.__s_) {} constexpr simd(const simd<_Up, _UAbi>& __v) : __s_(__v.__s_) {}
#if 0 #if 0
template <class _Up, class _UAbi, template <class _Up, class _UAbi,
@ -1282,8 +1283,8 @@ public:
// generator constructor // generator constructor
template <class _Generator, template <class _Generator,
int = typename std::enable_if<__can_generate<_Generator>( int = typename std::enable_if<
std::make_index_sequence<simd_size<_Tp, _Abi>::value>()), int>::type()> __can_generate<_Generator>(std::make_index_sequence<simd_size<_Tp, _Abi>::value>()), int>::type()>
explicit simd(_Generator&& __g) { explicit simd(_Generator&& __g) {
__generator_init(std::forward<_Generator>(__g), std::make_index_sequence<simd_size<_Tp, _Abi>::value>()); __generator_init(std::forward<_Generator>(__g), std::make_index_sequence<simd_size<_Tp, _Abi>::value>());
} }
@ -1325,10 +1326,13 @@ public:
// stores [simd.store] // stores [simd.store]
void copy_to(simd_data<simd>& __buffer) const { __s_.__copy_to(__buffer); } void copy_to(simd_data<simd>& __buffer) const { __s_.__copy_to(__buffer); }
// scalar access [simd.subscr] constexpr void set(_Tp a, _Tp b, _Tp c = {}, _Tp d = {}) { __s_.__set4(a, b, c, d); }
reference operator[](size_t __i) { return reference(&__s_, __i); } constexpr void broadcast(_Tp rv) { __s_.__broadcast(rv); }
value_type operator[](size_t __i) const { return __s_.__get(__i); } // scalar access [simd.subscr]
constexpr reference operator[](size_t __i) { return reference(&__s_, __i); }
constexpr value_type operator[](size_t __i) const { return __s_.__get(__i); }
// unary operators [simd.unary] // unary operators [simd.unary]
simd& operator++(); simd& operator++();
@ -1377,15 +1381,13 @@ public:
friend mask_type operator>(const simd&, const simd&); friend mask_type operator>(const simd&, const simd&);
friend mask_type operator<(const simd&, const simd&); friend mask_type operator<(const simd&, const simd&);
value_type dot2(const simd& other) const { return __s_.__dot2(other.__s_); } constexpr value_type dot2(const simd& other) const { return __s_.__dot2(other.__s_); }
value_type dot3(const simd& other) const { return __s_.__dot3(other.__s_); } constexpr value_type dot3(const simd& other) const { return __s_.__dot3(other.__s_); }
value_type dot4(const simd& other) const { return __s_.__dot4(other.__s_); } constexpr value_type dot4(const simd& other) const { return __s_.__dot4(other.__s_); }
template <int x, int y, int z, int w> template <int x, int y, int z, int w>
simd shuffle() const { constexpr simd shuffle() const {
simd s; return __s_.template __shuffle<x, y, z, w>();
s.__s_ = __s_.template __shuffle<x, y, z, w>();
return s;
} }
const typename __simd_storage<_Tp, _Abi>::storage_type& native() const { return __s_.__native(); } const typename __simd_storage<_Tp, _Abi>::storage_type& native() const { return __s_.__native(); }
@ -1475,50 +1477,48 @@ private:
friend class simd_mask; friend class simd_mask;
public: public:
_Tp __get(size_t __index) const noexcept { return __storage_[__index]; }; constexpr _Tp __get(size_t __index) const noexcept { return __storage_[__index]; };
void __set(size_t __index, _Tp __val) noexcept { __storage_[__index] = __val; } constexpr void __set(size_t __index, _Tp __val) noexcept { __storage_[__index] = __val; }
std::enable_if_t<__num_element >= 4> __set4(float a, float b, float c, float d) noexcept { constexpr std::enable_if_t<__num_element >= 4> __set4(float a, float b, float c, float d) noexcept {
__storage_[0] = a; __storage_[0] = a;
__storage_[1] = b; __storage_[1] = b;
__storage_[2] = c; __storage_[2] = c;
__storage_[3] = d; __storage_[3] = d;
} }
void __broadcast(float __val) noexcept { std::fill(__storage_.begin(), __storage_.end(), __val); } constexpr void __broadcast(float __val) noexcept { std::fill(__storage_.begin(), __storage_.end(), __val); }
std::enable_if_t<__num_element >= 2, _Tp> __dot2(const __simd_storage& other) const noexcept { constexpr std::enable_if_t<__num_element >= 2, _Tp> __dot2(const __simd_storage& other) const noexcept {
return __storage_[0] * other.__storage_[0] + __storage_[1] * other.__storage_[1]; return __storage_[0] * other.__storage_[0] + __storage_[1] * other.__storage_[1];
} }
std::enable_if_t<__num_element >= 3, _Tp> __dot3(const __simd_storage& other) const noexcept { constexpr std::enable_if_t<__num_element >= 3, _Tp> __dot3(const __simd_storage& other) const noexcept {
return __storage_[0] * other.__storage_[0] + __storage_[1] * other.__storage_[1] + return __storage_[0] * other.__storage_[0] + __storage_[1] * other.__storage_[1] +
__storage_[2] * other.__storage_[2]; __storage_[2] * other.__storage_[2];
} }
std::enable_if_t<__num_element >= 4, _Tp> __dot4(const __simd_storage& other) const noexcept { constexpr std::enable_if_t<__num_element >= 4, _Tp> __dot4(const __simd_storage& other) const noexcept {
return __storage_[0] * other.__storage_[0] + __storage_[1] * other.__storage_[1] + return __storage_[0] * other.__storage_[0] + __storage_[1] * other.__storage_[1] +
__storage_[2] * other.__storage_[2] + __storage_[3] * other.__storage_[3]; __storage_[2] * other.__storage_[2] + __storage_[3] * other.__storage_[3];
} }
template <int x, int y, int z, int w> template <int x, int y, int z, int w>
std::enable_if_t<__num_element >= 4, __simd_storage> __shuffle() const noexcept { constexpr std::enable_if_t<__num_element >= 4, __simd_storage> __shuffle() const noexcept {
__simd_storage s; return {__storage_[x], __storage_[y], __storage_[z], __storage_[w]};
s.__storage_[0] = __storage_[x];
s.__storage_[1] = __storage_[y];
s.__storage_[2] = __storage_[z];
s.__storage_[3] = __storage_[w];
return s;
} }
void __copy_from(const simd_data<simd<_Tp, __simd_abi<_StorageKind::_Array, __num_element>>>& __buffer) noexcept { constexpr void
__copy_from(const simd_data<simd<_Tp, __simd_abi<_StorageKind::_Array, __num_element>>>& __buffer) noexcept {
std::copy(__buffer.begin(), __buffer.end(), __storage_.begin()); std::copy(__buffer.begin(), __buffer.end(), __storage_.begin());
} }
void __copy_to(simd_data<simd<_Tp, __simd_abi<_StorageKind::_Array, __num_element>>>& __buffer) const noexcept { constexpr void __copy_to(simd_data<simd<_Tp, __simd_abi<_StorageKind::_Array, __num_element>>>& __buffer) const
noexcept {
std::copy(__storage_.begin(), __storage_.end(), __buffer.begin()); std::copy(__storage_.begin(), __storage_.end(), __buffer.begin());
} }
__simd_storage() = default; constexpr __simd_storage() = default;
template <class _Up, int __Unum_element> template <class _Up, int __Unum_element>
explicit __simd_storage(const __simd_storage<_Up, __simd_abi<_StorageKind::_Array, __Unum_element>>& other) { constexpr explicit __simd_storage(
const __simd_storage<_Up, __simd_abi<_StorageKind::_Array, __Unum_element>>& other) {
std::copy(other.__native().begin(), other.__native().end(), __storage_.begin()); std::copy(other.__native().begin(), other.__native().end(), __storage_.begin());
} }
const storage_type& __native() const { return __storage_; } constexpr const storage_type& __native() const { return __storage_; }
}; };
template <class _Tp, int __num_element> template <class _Tp, int __num_element>
@ -1526,8 +1526,8 @@ class __simd_mask_storage<_Tp, __simd_abi<_StorageKind::_Array, __num_element>>
std::bitset<__num_element> __storage_; std::bitset<__num_element> __storage_;
public: public:
bool __get(size_t __index) const noexcept { return __storage_.test(__index); } [[nodiscard]] constexpr bool __get(size_t __index) const noexcept { return __storage_.test(__index); }
void __set(size_t __index, bool __val) noexcept { __storage_.set(__index, __val); } constexpr void __set(size_t __index, bool __val) noexcept { __storage_.set(__index, __val); }
}; };
} // namespace zeus::_simd } // namespace zeus::_simd

View File

@ -15,6 +15,8 @@ using namespace std;
#include "simd_avx.hpp" #include "simd_avx.hpp"
#elif __SSE__ #elif __SSE__
#include "simd_sse.hpp" #include "simd_sse.hpp"
#elif __ARM_NEON
#include "simd_neon.hpp"
#else #else
namespace simd_abi { namespace simd_abi {
template <typename T> template <typename T>

View File

@ -0,0 +1,341 @@
#pragma once
#include <arm_neon.h>
#ifndef _ZEUS_SIMD_INCLUDED
#error simd_neon.hpp must not be included directly. Include simd.hpp instead.
#endif
namespace zeus::_simd {
// __m128 ABI
using m128_abi = __simd_abi<_StorageKind(int(_StorageKind::_Array) + 1), 4>;
// __m128d ABI
using m128d_abi = __simd_abi<_StorageKind(int(_StorageKind::_Array) + 2), 4>;
template <>
class __simd_storage<double, m128d_abi>;
// __m128 storage for NEON
template <>
class __simd_storage<float, m128_abi> {
public:
using storage_type = float32x4_t;
storage_type __storage_{};
[[nodiscard]] constexpr float __get(size_t __index) const noexcept { return __storage_[__index]; }
inline void __set(size_t __index, float __val) noexcept { __storage_[__index] = __val; }
constexpr __simd_storage(float a, float b, float c, float d) : __storage_{a, b, c, d} {}
constexpr void __set4(float a, float b, float c, float d) noexcept { __storage_ = storage_type{a, b, c, d}; }
constexpr explicit __simd_storage(float rv) : __simd_storage(rv, rv, rv, rv) {}
inline void __broadcast(float __val) noexcept { __storage_ = vdupq_n_f32(__val); }
[[nodiscard]] inline float __dot2(const __simd_storage<float, m128_abi>& other) const noexcept {
return vaddv_f32(vget_low_f32(vmulq_f32(__storage_, other.__storage_)));
}
[[nodiscard]] inline float __dot3(const __simd_storage<float, m128_abi>& other) const noexcept {
return vaddvq_f32(vsetq_lane_f32(0.f, vmulq_f32(__storage_, other.__storage_), 3));
}
[[nodiscard]] inline float __dot4(const __simd_storage<float, m128_abi>& other) const noexcept {
return vaddvq_f32(vmulq_f32(__storage_, other.__storage_));
}
template <int x, int y, int z, int w>
[[nodiscard]] inline __simd_storage __shuffle() const noexcept {
storage_type ret;
ret = vmovq_n_f32(vgetq_lane_f32(__storage_, x));
ret = vsetq_lane_f32(vgetq_lane_f32(__storage_, y), ret, 1);
ret = vsetq_lane_f32(vgetq_lane_f32(__storage_, z), ret, 2);
ret = vsetq_lane_f32(vgetq_lane_f32(__storage_, w), ret, 3);
return __simd_storage(ret);
}
inline void __copy_from(const simd_data<simd<float, m128_abi>>& __buffer) noexcept {
__storage_ = vld1q_f32(__buffer.data());
}
inline void __copy_to(simd_data<simd<float, m128_abi>>& __buffer) const noexcept { vst1q_f32(__buffer.data(), __storage_); }
constexpr __simd_storage() = default;
explicit __simd_storage(const __simd_storage<double, m128d_abi>& other);
constexpr explicit __simd_storage(const storage_type& s) : __storage_(s) {}
[[nodiscard]] constexpr const storage_type& __native() const { return __storage_; }
};
// __m128 mask storage for NEON
template <>
class __simd_mask_storage<float, m128_abi> : public __simd_storage<float, m128_abi> {
public:
inline bool __get(size_t __index) const noexcept {
return vreinterpretq_u32_f32(__storage_)[__index] != 0;
}
inline void __set(size_t __index, bool __val) noexcept {
uint32x4_t data = vreinterpretq_u32_f32(__storage_);
data[__index] = __val ? UINT32_MAX : 0;
__storage_ = vreinterpretq_f32_u32(data);
}
};
template <>
inline simd<float, m128_abi> simd<float, m128_abi>::operator-() const {
return vreinterpretq_f32_s32(
veorq_s32(vreinterpretq_s32_f32(__s_.__storage_), vreinterpretq_s32_f32(vdupq_n_f32(-0.f))));
}
inline simd<float, m128_abi> operator+(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
return vaddq_f32(a.__s_.__storage_, b.__s_.__storage_);
}
inline simd<float, m128_abi> operator-(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
return vsubq_f32(a.__s_.__storage_, b.__s_.__storage_);
}
inline simd<float, m128_abi> operator*(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
return vmulq_f32(a.__s_.__storage_, b.__s_.__storage_);
}
inline simd<float, m128_abi> operator/(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
return vdivq_f32(a.__s_.__storage_, b.__s_.__storage_);
}
inline simd<float, m128_abi>& operator+=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
a.__s_.__storage_ += b.__s_.__storage_;
return a;
}
inline simd<float, m128_abi>& operator-=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
a.__s_.__storage_ -= b.__s_.__storage_;
return a;
}
inline simd<float, m128_abi>& operator*=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
a.__s_.__storage_ *= b.__s_.__storage_;
return a;
}
inline simd<float, m128_abi>& operator/=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
a.__s_.__storage_ /= b.__s_.__storage_;
return a;
}
inline simd<float, m128_abi>::mask_type operator==(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
simd<float, m128_abi>::mask_type ret;
ret.__s_.__storage_ = vreinterpretq_f32_u32(vceqq_f32(a.__s_.__storage_, b.__s_.__storage_));
return ret;
}
inline simd<float, m128_abi>::mask_type operator!=(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
simd<float, m128_abi>::mask_type ret;
ret.__s_.__storage_ = vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(a.__s_.__storage_, b.__s_.__storage_)));
return ret;
}
inline simd<float, m128_abi>::mask_type operator>=(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
simd<float, m128_abi>::mask_type ret;
ret.__s_.__storage_ = vreinterpretq_f32_u32(vcgeq_f32(a.__s_.__storage_, b.__s_.__storage_));
return ret;
}
inline simd<float, m128_abi>::mask_type operator<=(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
simd<float, m128_abi>::mask_type ret;
ret.__s_.__storage_ = vreinterpretq_f32_u32(vcleq_f32(a.__s_.__storage_, b.__s_.__storage_));
return ret;
}
inline simd<float, m128_abi>::mask_type operator>(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
simd<float, m128_abi>::mask_type ret;
ret.__s_.__storage_ = vreinterpretq_f32_u32(vcgtq_f32(a.__s_.__storage_, b.__s_.__storage_));
return ret;
}
inline simd<float, m128_abi>::mask_type operator<(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
simd<float, m128_abi>::mask_type ret;
ret.__s_.__storage_ = vreinterpretq_f32_u32(vcltq_f32(a.__s_.__storage_, b.__s_.__storage_));
return ret;
}
// __m128d storage for NEON
template <>
class __simd_storage<double, m128d_abi> {
public:
using storage_type = float64x2x2_t;
using vector_type = float64x2_t;
storage_type __storage_{};
[[nodiscard]] inline double __get(size_t __index) const noexcept { return __storage_.val[__index / 2][__index % 2]; }
inline void __set(size_t __index, double __val) noexcept { __storage_.val[__index / 2][__index % 2] = __val; }
// Make GCC happy
static constexpr storage_type __make_array(vector_type a, vector_type b) { return {a, b}; }
constexpr __simd_storage(double a, double b, double c, double d)
: __storage_(__make_array(vector_type{a, b}, vector_type{c, d})) {}
constexpr void __set4(double a, double b, double c, double d) noexcept {
__storage_.val[0] = vector_type{a, b};
__storage_.val[1] = vector_type{c, d};
}
constexpr __simd_storage(double rv) : __simd_storage(rv, rv, rv, rv) {}
constexpr void __broadcast(double __val) noexcept { __set4(__val, __val, __val, __val); }
[[nodiscard]] inline double __dot2(const __simd_storage<double, m128d_abi>& other) const noexcept {
return vaddvq_f64(vmulq_f64(__storage_.val[0], other.__storage_.val[0]));
}
[[nodiscard]] inline double __dot3(const __simd_storage<double, m128d_abi>& other) const noexcept {
const vector_type mul1 = vmulq_f64(__storage_.val[0], other.__storage_.val[0]);
const vector_type mul2 = vmulq_f64(__storage_.val[1], other.__storage_.val[1]);
return vaddvq_f64(vcombine_f64(vcreate_f64(vaddvq_f64(mul1)), vget_low_f64(mul2)));
}
[[nodiscard]] inline double __dot4(const __simd_storage<double, m128d_abi>& other) const noexcept {
const vector_type mul1 = vmulq_f64(__storage_.val[0], other.__storage_.val[0]);
const vector_type mul2 = vmulq_f64(__storage_.val[1], other.__storage_.val[1]);
return vaddvq_f64(vaddq_f64(mul1, mul2));
}
inline void __copy_from(const simd_data<simd<double, m128d_abi>>& __buffer) noexcept {
__storage_ = vld2q_f64(__buffer.data());
}
inline void __copy_to(simd_data<simd<double, m128d_abi>>& __buffer) const noexcept {
vst2q_f64(__buffer.data(), __storage_);
}
constexpr __simd_storage() = default;
explicit inline __simd_storage(const __simd_storage<float, m128_abi>& other) {
__storage_.val[0] = vcvt_f64_f32(vget_low_f32(other.__storage_));
__storage_.val[1] = vcvt_f64_f32(vget_high_f32(other.__storage_));
}
constexpr explicit __simd_storage(const storage_type& s) : __storage_(s) {}
[[nodiscard]] constexpr const storage_type& __native() const { return __storage_; }
};
// __m128d mask storage for NEON
template <>
class __simd_mask_storage<double, m128d_abi> : public __simd_storage<double, m128d_abi> {
public:
inline bool __get(size_t __index) const noexcept {
return vreinterpretq_u64_f64(__storage_.val[__index / 2])[__index % 2] != 0;
}
inline void __set(size_t __index, bool __val) noexcept {
uint64x2_t vec = vreinterpretq_u64_f64(__storage_.val[__index / 2]);
vec[__index % 2] = __val ? UINT64_MAX : 0;
__storage_.val[__index / 2] = vreinterpretq_f64_u64(vec);
}
};
template <>
inline simd<double, m128d_abi> simd<double, m128d_abi>::operator-() const {
simd<double, m128d_abi> ret;
for (int i = 0; i < 2; ++i)
ret.__s_.__storage_.val[i] = vreinterpretq_f64_s64(
veorq_s64(vreinterpretq_s64_f64(__s_.__storage_.val[i]), vreinterpretq_s64_f64(vdupq_n_f64(-0.0))));
return ret;
}
inline simd<double, m128d_abi> operator+(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
simd<double, m128d_abi> ret;
for (int i = 0; i < 2; ++i)
ret.__s_.__storage_.val[i] = vaddq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
return ret;
}
inline simd<double, m128d_abi> operator-(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
simd<double, m128d_abi> ret;
for (int i = 0; i < 2; ++i)
ret.__s_.__storage_.val[i] = vsubq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
return ret;
}
inline simd<double, m128d_abi> operator*(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
simd<double, m128d_abi> ret;
for (int i = 0; i < 2; ++i)
ret.__s_.__storage_.val[i] = vmulq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
return ret;
}
inline simd<double, m128d_abi> operator/(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
simd<double, m128d_abi> ret;
for (int i = 0; i < 2; ++i)
ret.__s_.__storage_.val[i] = vdivq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
return ret;
}
inline simd<double, m128d_abi>& operator+=(simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
for (int i = 0; i < 2; ++i)
a.__s_.__storage_.val[i] = vaddq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
return a;
}
inline simd<double, m128d_abi>& operator-=(simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
for (int i = 0; i < 2; ++i)
a.__s_.__storage_.val[i] = vsubq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
return a;
}
inline simd<double, m128d_abi>& operator*=(simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
for (int i = 0; i < 2; ++i)
a.__s_.__storage_.val[i] = vmulq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
return a;
}
inline simd<double, m128d_abi>& operator/=(simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
for (int i = 0; i < 2; ++i)
a.__s_.__storage_.val[i] = vdivq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
return a;
}
inline simd<double, m128d_abi>::mask_type operator==(const simd<double, m128d_abi>& a,
const simd<double, m128d_abi>& b) {
simd<double, m128d_abi>::mask_type ret;
for (int i = 0; i < 2; ++i)
ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vceqq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]));
return ret;
}
inline simd<double, m128d_abi>::mask_type operator!=(const simd<double, m128d_abi>& a,
const simd<double, m128d_abi>& b) {
simd<double, m128d_abi>::mask_type ret;
for (int i = 0; i < 2; ++i)
ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vreinterpretq_u64_u32(
vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i])))));
return ret;
}
inline simd<double, m128d_abi>::mask_type operator>=(const simd<double, m128d_abi>& a,
const simd<double, m128d_abi>& b) {
simd<double, m128d_abi>::mask_type ret;
for (int i = 0; i < 2; ++i)
ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vcgeq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]));
return ret;
}
inline simd<double, m128d_abi>::mask_type operator<=(const simd<double, m128d_abi>& a,
const simd<double, m128d_abi>& b) {
simd<double, m128d_abi>::mask_type ret;
for (int i = 0; i < 2; ++i)
ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vcleq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]));
return ret;
}
inline simd<double, m128d_abi>::mask_type operator>(const simd<double, m128d_abi>& a,
const simd<double, m128d_abi>& b) {
simd<double, m128d_abi>::mask_type ret;
for (int i = 0; i < 2; ++i)
ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vcgtq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]));
return ret;
}
inline simd<double, m128d_abi>::mask_type operator<(const simd<double, m128d_abi>& a,
const simd<double, m128d_abi>& b) {
simd<double, m128d_abi>::mask_type ret;
for (int i = 0; i < 2; ++i)
ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vcltq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]));
return ret;
}
inline __simd_storage<float, m128_abi>::__simd_storage(const __simd_storage<double, m128d_abi>& other) {
__storage_ = vcombine_f32(vcvt_f32_f64(other.__storage_.val[0]), vcvt_f32_f64(other.__storage_.val[1]));
}
namespace simd_abi {
template <typename T>
struct zeus_native {};
template <>
struct zeus_native<float> {
using type = m128_abi;
};
template <>
struct zeus_native<double> {
using type = m128d_abi;
};
} // namespace simd_abi
} // namespace zeus::_simd

View File

@ -29,23 +29,14 @@ template <>
class __simd_storage<float, m128_abi> { class __simd_storage<float, m128_abi> {
public: public:
using storage_type = __m128; using storage_type = __m128;
storage_type __storage_; storage_type __storage_{};
float __get(size_t __index) const noexcept { [[nodiscard]] inline float __get(size_t __index) const noexcept { return __storage_[__index]; }
alignas(16) std::array<float, 4> sse_data; inline void __set(size_t __index, float __val) noexcept { __storage_[__index] = __val; }
_mm_store_ps(sse_data.data(), __storage_);
return sse_data[__index];
}
void __set(size_t __index, float __val) noexcept {
alignas(16) std::array<float, 4> sse_data;
_mm_store_ps(sse_data.data(), __storage_);
sse_data[__index] = __val;
__storage_ = _mm_load_ps(sse_data.data());
}
constexpr __simd_storage(float a, float b, float c, float d) : __storage_{a, b, c, d} {} constexpr __simd_storage(float a, float b, float c, float d) : __storage_{a, b, c, d} {}
void __set4(float a, float b, float c, float d) noexcept { __storage_ = _mm_set_ps(d, c, b, a); } constexpr void __set4(float a, float b, float c, float d) noexcept { __storage_ = storage_type{a, b, c, d}; }
constexpr __simd_storage(float rv) : __storage_{rv, rv, rv, rv} {} constexpr explicit __simd_storage(float rv) : __storage_{rv, rv, rv, rv} {}
void __broadcast(float __val) noexcept { __storage_ = _mm_set1_ps(__val); } inline void __broadcast(float __val) noexcept { __storage_ = _mm_set1_ps(__val); }
float __dot2(const __simd_storage<float, m128_abi>& other) const noexcept { [[nodiscard]] inline float __dot2(const __simd_storage<float, m128_abi>& other) const noexcept {
#if __SSE4_1__ #if __SSE4_1__
float ret; float ret;
_mm_store_ss(&ret, _mm_dp_ps(__storage_, other.__storage_, 0x3F)); _mm_store_ss(&ret, _mm_dp_ps(__storage_, other.__storage_, 0x3F));
@ -56,7 +47,7 @@ public:
return sse_data[0] + sse_data[1]; return sse_data[0] + sse_data[1];
#endif #endif
} }
float __dot3(const __simd_storage<float, m128_abi>& other) const noexcept { [[nodiscard]] inline float __dot3(const __simd_storage<float, m128_abi>& other) const noexcept {
#if __SSE4_1__ #if __SSE4_1__
float ret; float ret;
_mm_store_ss(&ret, _mm_dp_ps(__storage_, other.__storage_, 0x7F)); _mm_store_ss(&ret, _mm_dp_ps(__storage_, other.__storage_, 0x7F));
@ -67,7 +58,7 @@ public:
return sse_data[0] + sse_data[1] + sse_data[2]; return sse_data[0] + sse_data[1] + sse_data[2];
#endif #endif
} }
float __dot4(const __simd_storage<float, m128_abi>& other) const noexcept { [[nodiscard]] inline float __dot4(const __simd_storage<float, m128_abi>& other) const noexcept {
#if __SSE4_1__ #if __SSE4_1__
float ret; float ret;
_mm_store_ss(&ret, _mm_dp_ps(__storage_, other.__storage_, 0xFF)); _mm_store_ss(&ret, _mm_dp_ps(__storage_, other.__storage_, 0xFF));
@ -78,40 +69,39 @@ public:
return sse_data[0] + sse_data[1] + sse_data[2] + sse_data[3]; return sse_data[0] + sse_data[1] + sse_data[2] + sse_data[3];
#endif #endif
} }
template <int x, int y, int z, int w> template <int x, int y, int z, int w>
__simd_storage __shuffle() const noexcept { [[nodiscard]] constexpr __simd_storage __shuffle() const noexcept {
__simd_storage s; return __simd_storage(_mm_shuffle_ps(__storage_, __storage_, _MM_SHUFFLE(w, z, y, x)));
s.__storage_ = _mm_shuffle_ps(__storage_, __storage_, _MM_SHUFFLE(w, z, y, x));
return s;
} }
void __copy_from(const simd_data<simd<float, m128_abi>>& __buffer) noexcept { inline void __copy_from(const simd_data<simd<float, m128_abi>>& __buffer) noexcept {
__storage_ = _mm_load_ps(__buffer.data()); __storage_ = _mm_load_ps(__buffer.data());
} }
void __copy_to(simd_data<simd<float, m128_abi>>& __buffer) const noexcept { inline void __copy_to(simd_data<simd<float, m128_abi>>& __buffer) const noexcept {
_mm_store_ps(__buffer.data(), __storage_); _mm_store_ps(__buffer.data(), __storage_);
} }
__simd_storage() = default; __simd_storage() = default;
explicit __simd_storage(const __simd_storage<double, m128d_abi>& other); explicit inline __simd_storage(const __simd_storage<double, m128d_abi>& other);
#ifdef __AVX__ #ifdef __AVX__
explicit __simd_storage(const __simd_storage<double, m256d_abi>& other); explicit inline __simd_storage(const __simd_storage<double, m256d_abi>& other);
#endif #endif
explicit __simd_storage(const storage_type& s) : __storage_(s) {} constexpr explicit __simd_storage(const storage_type& s) : __storage_(s) {}
const storage_type& __native() const { return __storage_; } [[nodiscard]] constexpr const storage_type& __native() const { return __storage_; }
}; };
// __m128 mask storage for SSE2+ // __m128 mask storage for SSE2+
template <> template <>
class __simd_mask_storage<float, m128_abi> : public __simd_storage<float, m128_abi> { class __simd_mask_storage<float, m128_abi> : public __simd_storage<float, m128_abi> {
public: public:
bool __get(size_t __index) const noexcept { [[nodiscard]] inline bool __get(size_t __index) const noexcept {
alignas(16) uint32_t sse_data[4]; alignas(16) uint32_t sse_data[4];
_mm_store_ps(reinterpret_cast<float*>(sse_data), __storage_); _mm_store_ps(reinterpret_cast<float*>(sse_data), __storage_);
return sse_data[__index] != 0; return sse_data[__index] != 0;
} }
void __set(size_t __index, bool __val) noexcept { inline void __set(size_t __index, bool __val) noexcept {
alignas(16) uint32_t sse_data[4]; alignas(16) uint32_t sse_data[4];
_mm_store_ps(reinterpret_cast<float*>(sse_data), __storage_); _mm_store_ps(reinterpret_cast<float*>(sse_data), __storage_);
sse_data[__index] = __val ? UINT32_MAX : 0; sse_data[__index] = __val ? UINT32_MAX : 0;
@ -125,27 +115,19 @@ inline simd<float, m128_abi> simd<float, m128_abi>::operator-() const {
} }
inline simd<float, m128_abi> operator+(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) { inline simd<float, m128_abi> operator+(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
simd<float, m128_abi> ret; return _mm_add_ps(a.__s_.__storage_, b.__s_.__storage_);
ret.__s_.__storage_ = _mm_add_ps(a.__s_.__storage_, b.__s_.__storage_);
return ret;
} }
inline simd<float, m128_abi> operator-(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) { inline simd<float, m128_abi> operator-(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
simd<float, m128_abi> ret; return _mm_sub_ps(a.__s_.__storage_, b.__s_.__storage_);
ret.__s_.__storage_ = _mm_sub_ps(a.__s_.__storage_, b.__s_.__storage_);
return ret;
} }
inline simd<float, m128_abi> operator*(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) { inline simd<float, m128_abi> operator*(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
simd<float, m128_abi> ret; return _mm_mul_ps(a.__s_.__storage_, b.__s_.__storage_);
ret.__s_.__storage_ = _mm_mul_ps(a.__s_.__storage_, b.__s_.__storage_);
return ret;
} }
inline simd<float, m128_abi> operator/(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) { inline simd<float, m128_abi> operator/(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
simd<float, m128_abi> ret; return _mm_div_ps(a.__s_.__storage_, b.__s_.__storage_);
ret.__s_.__storage_ = _mm_div_ps(a.__s_.__storage_, b.__s_.__storage_);
return ret;
} }
inline simd<float, m128_abi>& operator+=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) { inline simd<float, m128_abi>& operator+=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
@ -209,31 +191,19 @@ template <>
class __simd_storage<double, m128d_abi> { class __simd_storage<double, m128d_abi> {
public: public:
using storage_type = std::array<__m128d, 2>; using storage_type = std::array<__m128d, 2>;
storage_type __storage_; storage_type __storage_{};
double __get(size_t __index) const noexcept { [[nodiscard]] inline double __get(size_t __index) const noexcept { return __storage_[__index / 2][__index % 2]; }
alignas(16) std::array<double, 2> sse_data; inline void __set(size_t __index, double __val) noexcept { __storage_[__index / 2][__index % 2] = __val; }
_mm_store_pd(sse_data.data(), __storage_[__index / 2]); // Make GCC happy
return sse_data[__index % 2];
}
void __set(size_t __index, double __val) noexcept {
alignas(16) std::array<double, 2> sse_data;
_mm_store_pd(sse_data.data(), __storage_[__index / 2]);
sse_data[__index % 2] = __val;
__storage_[__index / 2] = _mm_load_pd(sse_data.data());
}
static constexpr storage_type __make_array(__m128d a, __m128d b) { return {a, b}; } static constexpr storage_type __make_array(__m128d a, __m128d b) { return {a, b}; }
constexpr __simd_storage(double a, double b, double c, double d) constexpr __simd_storage(double a, double b, double c, double d) : __storage_(__make_array(__m128d{a, b}, __m128d{c, d})) {}
: __storage_(__make_array(__m128d{a, b}, __m128d{c, d})) {} constexpr void __set4(double a, double b, double c, double d) noexcept {
void __set4(double a, double b, double c, double d) noexcept { __storage_[0] = __m128d{a, b};
__storage_[0] = _mm_set_pd(b, a); __storage_[1] = __m128d{c, d};
__storage_[1] = _mm_set_pd(d, c);
} }
constexpr __simd_storage(double rv) : __storage_(__make_array(__m128d{rv, rv}, __m128d{rv, rv})) {} constexpr __simd_storage(double rv) : __simd_storage(rv, rv, rv, rv) {}
void __broadcast(double __val) noexcept { constexpr void __broadcast(double __val) noexcept { __set4(__val, __val, __val, __val); }
for (int i = 0; i < 2; ++i) [[nodiscard]] inline double __dot2(const __simd_storage<double, m128d_abi>& other) const noexcept {
__storage_[i] = _mm_set1_pd(__val);
}
double __dot2(const __simd_storage<double, m128d_abi>& other) const noexcept {
#if __SSE4_1__ #if __SSE4_1__
double ret; double ret;
_mm_store_sd(&ret, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F)); _mm_store_sd(&ret, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F));
@ -244,7 +214,7 @@ public:
return sse_data[0] + sse_data[1]; return sse_data[0] + sse_data[1];
#endif #endif
} }
double __dot3(const __simd_storage<double, m128d_abi>& other) const noexcept { [[nodiscard]] inline double __dot3(const __simd_storage<double, m128d_abi>& other) const noexcept {
#if __SSE4_1__ #if __SSE4_1__
double ret; double ret;
_mm_store_sd(&ret, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F)); _mm_store_sd(&ret, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F));
@ -259,7 +229,7 @@ public:
return sse_data[0] + sse_data[1] + sse_data2[0]; return sse_data[0] + sse_data[1] + sse_data2[0];
#endif #endif
} }
double __dot4(const __simd_storage<double, m128d_abi>& other) const noexcept { [[nodiscard]] inline double __dot4(const __simd_storage<double, m128d_abi>& other) const noexcept {
#if __SSE4_1__ #if __SSE4_1__
double ret; double ret;
_mm_store_sd(&ret, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F)); _mm_store_sd(&ret, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F));
@ -275,24 +245,24 @@ public:
#endif #endif
} }
void __copy_from(const simd_data<simd<double, m128d_abi>>& __buffer) noexcept { inline void __copy_from(const simd_data<simd<double, m128d_abi>>& __buffer) noexcept {
__storage_[0] = _mm_load_pd(__buffer.data()); __storage_[0] = _mm_load_pd(__buffer.data());
__storage_[1] = _mm_load_pd(__buffer.data() + 2); __storage_[1] = _mm_load_pd(__buffer.data() + 2);
} }
void __copy_to(simd_data<simd<double, m128d_abi>>& __buffer) const noexcept { inline void __copy_to(simd_data<simd<double, m128d_abi>>& __buffer) const noexcept {
_mm_store_pd(__buffer.data(), __storage_[0]); _mm_store_pd(__buffer.data(), __storage_[0]);
_mm_store_pd(__buffer.data() + 2, __storage_[1]); _mm_store_pd(__buffer.data() + 2, __storage_[1]);
} }
__simd_storage() = default; constexpr __simd_storage() = default;
explicit __simd_storage(const __simd_storage<float, m128_abi>& other) { inline explicit __simd_storage(const __simd_storage<float, m128_abi>& other) {
__storage_[0] = _mm_cvtps_pd(other.__storage_); __storage_[0] = _mm_cvtps_pd(other.__storage_);
__storage_[1] = _mm_cvtps_pd(_mm_movehl_ps(other.__storage_, other.__storage_)); __storage_[1] = _mm_cvtps_pd(_mm_movehl_ps(other.__storage_, other.__storage_));
} }
explicit __simd_storage(const storage_type& s) : __storage_(s) {} constexpr explicit __simd_storage(const storage_type& s) : __storage_(s) {}
const storage_type& __native() const { return __storage_; } [[nodiscard]] constexpr const storage_type& __native() const { return __storage_; }
}; };
// __m128d mask storage for SSE2+ // __m128d mask storage for SSE2+
template <> template <>

View File

@ -34,8 +34,8 @@ void CMatrix3f::transpose() {
m[1].mSimd = _mm_movehl_ps(T2, T0); m[1].mSimd = _mm_movehl_ps(T2, T0);
m[2].mSimd = _mm_movelh_ps(T1, T3); m[2].mSimd = _mm_movelh_ps(T1, T3);
#elif __ARM_NEON #elif __ARM_NEON
float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]); float32x4x2_t P0 = vzipq_f32(m[0].mSimd.native(), m[2].mSimd.native());
float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]); float32x4x2_t P1 = vzipq_f32(m[1].mSimd.native(), m[3].mSimd.native());
float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]); float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]); float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);
@ -69,8 +69,8 @@ CMatrix3f CMatrix3f::transposed() const {
__m128 T3 = _mm_unpackhi_ps(m[2].mSimd.native(), zero); __m128 T3 = _mm_unpackhi_ps(m[2].mSimd.native(), zero);
return CMatrix3f(_mm_movelh_ps(T0, T2), _mm_movehl_ps(T2, T0), _mm_movelh_ps(T1, T3)); return CMatrix3f(_mm_movelh_ps(T0, T2), _mm_movehl_ps(T2, T0), _mm_movelh_ps(T1, T3));
#elif __ARM_NEON #elif __ARM_NEON
float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]); float32x4x2_t P0 = vzipq_f32(m[0].mSimd.native(), m[2].mSimd.native());
float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]); float32x4x2_t P1 = vzipq_f32(m[1].mSimd.native(), m[3].mSimd.native());
float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]); float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]); float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);

View File

@ -15,8 +15,8 @@ CMatrix4f CMatrix4f::transposed() const {
ret.m[2].mSimd = _mm_movelh_ps(T1, T3); ret.m[2].mSimd = _mm_movelh_ps(T1, T3);
ret.m[3].mSimd = _mm_movehl_ps(T3, T1); ret.m[3].mSimd = _mm_movehl_ps(T3, T1);
#elif __ARM_NEON #elif __ARM_NEON
float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]); float32x4x2_t P0 = vzipq_f32(m[0].mSimd.native(), m[2].mSimd.native());
float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]); float32x4x2_t P1 = vzipq_f32(m[1].mSimd.native(), m[3].mSimd.native());
float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]); float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]); float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);

View File

@ -10,10 +10,8 @@
#if _WIN32 #if _WIN32
#include <intrin.h> #include <intrin.h>
#else #elif __x86_64__
#include <cpuid.h> #include <cpuid.h>
#endif #endif
namespace zeus { namespace zeus {
@ -23,8 +21,8 @@ static CPUInfo g_cpuFeatures = {};
static CPUInfo g_missingFeatures = {}; static CPUInfo g_missingFeatures = {};
void getCpuInfo(int eax, int regs[4]) { void getCpuInfo(int eax, int regs[4]) {
#if !GEKKO #if __x86_64__
#if _WIN32 #if _WIN32
__cpuid(regs, eax); __cpuid(regs, eax);
#else #else
__cpuid(eax, regs[0], regs[1], regs[2], regs[3]); __cpuid(eax, regs[0], regs[1], regs[2], regs[3]);
@ -33,8 +31,8 @@ void getCpuInfo(int eax, int regs[4]) {
} }
void getCpuInfoEx(int eax, int ecx, int regs[4]) { void getCpuInfoEx(int eax, int ecx, int regs[4]) {
#if !GEKKO #if __x86_64__
#if _WIN32 #if _WIN32
__cpuidex(regs, eax, ecx); __cpuidex(regs, eax, ecx);
#else #else
__cpuid_count(eax, ecx, regs[0], regs[1], regs[2], regs[3]); __cpuid_count(eax, ecx, regs[0], regs[1], regs[2], regs[3]);
@ -43,7 +41,7 @@ void getCpuInfoEx(int eax, int ecx, int regs[4]) {
} }
void detectCPU() { void detectCPU() {
#if !GEKKO #if __x86_64__
if (isCPUInit) if (isCPUInit)
return; return;