mirror of https://github.com/AxioDL/zeus.git
Add NEON support & more constexpr
This commit is contained in:
parent
eec855018a
commit
566db657d9
|
@ -56,13 +56,9 @@ public:
|
|||
#endif
|
||||
|
||||
[[nodiscard]] bool intersects(const CAABox& other) const {
|
||||
const bool x1 = max[0] >= other.min[0];
|
||||
const bool x2 = min[0] <= other.max[0];
|
||||
const bool y1 = max[1] >= other.min[1];
|
||||
const bool y2 = min[1] <= other.max[1];
|
||||
const bool z1 = max[2] >= other.min[2];
|
||||
const bool z2 = min[2] <= other.max[2];
|
||||
return x1 && x2 && y1 && y2 && z1 && z2;
|
||||
const auto mmax = max >= other.min;
|
||||
const auto mmin = min <= other.max;
|
||||
return mmax[0] && mmax[1] && mmax[2] && mmin[0] && mmin[1] && mmin[2];
|
||||
}
|
||||
|
||||
[[nodiscard]] bool intersects(const CSphere& other) const;
|
||||
|
@ -70,10 +66,9 @@ public:
|
|||
[[nodiscard]] CAABox booleanIntersection(const CAABox& other) const;
|
||||
|
||||
[[nodiscard]] bool inside(const CAABox& other) const {
|
||||
const bool x = min[0] >= other.min[0] && max[0] <= other.max[0];
|
||||
const bool y = min[1] >= other.min[1] && max[1] <= other.max[1];
|
||||
const bool z = min[2] >= other.min[2] && max[2] <= other.max[2];
|
||||
return x && y && z;
|
||||
const auto mmax = max <= other.max;
|
||||
const auto mmin = min >= other.min;
|
||||
return mmax[0] && mmax[1] && mmax[2] && mmin[0] && mmin[1] && mmin[2];
|
||||
}
|
||||
|
||||
[[nodiscard]] bool insidePlane(const CPlane& plane) const {
|
||||
|
|
|
@ -23,13 +23,13 @@ public:
|
|||
constexpr CMatrix3f(float m00, float m01, float m02, float m10, float m11, float m12, float m20, float m21, float m22)
|
||||
: m{{{m00, m10, m20}, {m01, m11, m21}, {m02, m12, m22}}} {}
|
||||
|
||||
CMatrix3f(const CVector3f& scaleVec) {
|
||||
constexpr CMatrix3f(const CVector3f& scaleVec) {
|
||||
m[0][0] = scaleVec[0];
|
||||
m[1][1] = scaleVec[1];
|
||||
m[2][2] = scaleVec[2];
|
||||
}
|
||||
|
||||
CMatrix3f(float scale) : CMatrix3f(CVector3f(scale)) {}
|
||||
constexpr CMatrix3f(float scale) : CMatrix3f(CVector3f(scale)) {}
|
||||
|
||||
constexpr CMatrix3f(const CVector3f& r0, const CVector3f& r1, const CVector3f& r2) : m{{r0, r1, r2}} {}
|
||||
|
||||
|
@ -71,7 +71,7 @@ public:
|
|||
|
||||
CMatrix3f(const CQuaternion& quat);
|
||||
|
||||
CMatrix3f& operator=(const CMatrix3f& other) = default;
|
||||
constexpr CMatrix3f& operator=(const CMatrix3f& other) = default;
|
||||
|
||||
[[nodiscard]] CVector3f operator*(const CVector3f& other) const {
|
||||
return m[0].mSimd * other.mSimd.shuffle<0, 0, 0, 0>() + m[1].mSimd * other.mSimd.shuffle<1, 1, 1, 1>() +
|
||||
|
|
|
@ -13,7 +13,7 @@ namespace zeus {
|
|||
class CVector2f {
|
||||
public:
|
||||
simd<float> mSimd;
|
||||
constexpr CVector2f() : mSimd(0.f) {}
|
||||
constexpr CVector2f() : mSimd() {}
|
||||
|
||||
template <typename T>
|
||||
constexpr CVector2f(const simd<T>& s) : mSimd(s) {}
|
||||
|
@ -43,11 +43,8 @@ public:
|
|||
|
||||
explicit constexpr CVector2f(float xy) : mSimd(xy) {}
|
||||
|
||||
void assign(float x, float y) {
|
||||
mSimd[0] = x;
|
||||
mSimd[1] = y;
|
||||
mSimd[2] = 0.0f;
|
||||
mSimd[3] = 0.0f;
|
||||
constexpr void assign(float x, float y) {
|
||||
mSimd.set(x, y);
|
||||
}
|
||||
|
||||
constexpr CVector2f(float x, float y) : mSimd(x, y, 0.f, 0.f) {}
|
||||
|
@ -150,19 +147,19 @@ public:
|
|||
return *this * mag;
|
||||
}
|
||||
|
||||
[[nodiscard]] CVector2f perpendicularVector() const { return {-y(), x()}; }
|
||||
[[nodiscard]] constexpr CVector2f perpendicularVector() const { return {-y(), x()}; }
|
||||
|
||||
[[nodiscard]] float cross(const CVector2f& rhs) const { return (x() * rhs.y()) - (y() * rhs.x()); }
|
||||
[[nodiscard]] constexpr float cross(const CVector2f& rhs) const { return (x() * rhs.y()) - (y() * rhs.x()); }
|
||||
|
||||
[[nodiscard]] float dot(const CVector2f& rhs) const { return mSimd.dot2(rhs.mSimd); }
|
||||
[[nodiscard]] constexpr float dot(const CVector2f& rhs) const { return mSimd.dot2(rhs.mSimd); }
|
||||
|
||||
[[nodiscard]] float magSquared() const { return mSimd.dot2(mSimd); }
|
||||
[[nodiscard]] constexpr float magSquared() const { return mSimd.dot2(mSimd); }
|
||||
|
||||
[[nodiscard]] float magnitude() const { return std::sqrt(magSquared()); }
|
||||
[[nodiscard]] constexpr float magnitude() const { return std::sqrt(magSquared()); }
|
||||
|
||||
void zeroOut() { mSimd = zeus::simd<float>(0.f); }
|
||||
constexpr void zeroOut() { mSimd = 0.f; }
|
||||
|
||||
void splat(float xy) { mSimd = zeus::simd<float>(xy); }
|
||||
constexpr void splat(float xy) { mSimd = xy; }
|
||||
|
||||
[[nodiscard]] static float getAngleDiff(const CVector2f& a, const CVector2f& b);
|
||||
|
||||
|
@ -176,9 +173,9 @@ public:
|
|||
|
||||
[[nodiscard]] static CVector2f slerp(const CVector2f& a, const CVector2f& b, float t);
|
||||
|
||||
[[nodiscard]] bool isNormalized() const { return std::fabs(1.f - magSquared()) < 0.01f; }
|
||||
[[nodiscard]] constexpr bool isNormalized() const { return std::fabs(1.f - magSquared()) < 0.01f; }
|
||||
|
||||
[[nodiscard]] bool canBeNormalized() const {
|
||||
[[nodiscard]] constexpr bool canBeNormalized() const {
|
||||
if (std::isinf(x()) || std::isinf(y()))
|
||||
return false;
|
||||
return std::fabs(x()) >= FLT_EPSILON || std::fabs(y()) >= FLT_EPSILON;
|
||||
|
@ -191,21 +188,21 @@ public:
|
|||
return (diffVec.x() <= epsilon && diffVec.y() <= epsilon);
|
||||
}
|
||||
|
||||
[[nodiscard]] simd<float>::reference operator[](size_t idx) {
|
||||
[[nodiscard]] constexpr simd<float>::reference operator[](size_t idx) {
|
||||
assert(idx < 2);
|
||||
return mSimd[idx];
|
||||
}
|
||||
|
||||
[[nodiscard]] float operator[](size_t idx) const {
|
||||
[[nodiscard]] constexpr float operator[](size_t idx) const {
|
||||
assert(idx < 2);
|
||||
return mSimd[idx];
|
||||
}
|
||||
|
||||
[[nodiscard]] float x() const { return mSimd[0]; }
|
||||
[[nodiscard]] float y() const { return mSimd[1]; }
|
||||
[[nodiscard]] constexpr float x() const { return mSimd[0]; }
|
||||
[[nodiscard]] constexpr float y() const { return mSimd[1]; }
|
||||
|
||||
[[nodiscard]] simd<float>::reference x() { return mSimd[0]; }
|
||||
[[nodiscard]] simd<float>::reference y() { return mSimd[1]; }
|
||||
[[nodiscard]] constexpr simd<float>::reference x() { return mSimd[0]; }
|
||||
[[nodiscard]] constexpr simd<float>::reference y() { return mSimd[1]; }
|
||||
};
|
||||
constexpr inline CVector2f skOne2f(1.f);
|
||||
constexpr inline CVector2f skNegOne2f(-1.f);
|
||||
|
|
|
@ -14,7 +14,7 @@ namespace zeus {
|
|||
class CVector3d {
|
||||
public:
|
||||
zeus::simd<double> mSimd;
|
||||
constexpr CVector3d() : mSimd(0.0) {}
|
||||
constexpr CVector3d() : mSimd() {}
|
||||
|
||||
template <typename T>
|
||||
constexpr CVector3d(const simd<T>& s) : mSimd(s) {}
|
||||
|
|
|
@ -19,7 +19,7 @@ class CRelAngle;
|
|||
class CVector3f {
|
||||
public:
|
||||
zeus::simd<float> mSimd;
|
||||
constexpr CVector3f() : mSimd(0.f) {}
|
||||
constexpr CVector3f() : mSimd() {}
|
||||
|
||||
template <typename T>
|
||||
constexpr CVector3f(const simd<T>& s) : mSimd(s) {}
|
||||
|
@ -53,7 +53,7 @@ public:
|
|||
|
||||
explicit constexpr CVector3f(float xyz) : mSimd(xyz) {}
|
||||
|
||||
void assign(float x, float y, float z) { mSimd = zeus::simd<float>(x, y, z); }
|
||||
void assign(float x, float y, float z) { mSimd.set(x, y, z); }
|
||||
|
||||
constexpr CVector3f(float x, float y, float z) : mSimd(x, y, z) {}
|
||||
|
||||
|
@ -62,17 +62,22 @@ public:
|
|||
CVector3f(const CVector2f& other, float z = 0.f) {
|
||||
mSimd = other.mSimd;
|
||||
mSimd[2] = z;
|
||||
mSimd[3] = 0.f;
|
||||
}
|
||||
|
||||
[[nodiscard]] CVector2f toVec2f() const { return CVector2f(mSimd); }
|
||||
|
||||
[[nodiscard]] bool operator==(const CVector3f& rhs) const {
|
||||
return mSimd[0] == rhs.mSimd[0] && mSimd[1] == rhs.mSimd[1] && mSimd[2] == rhs.mSimd[2];
|
||||
const auto mask = mSimd == rhs.mSimd;
|
||||
return mask[0] && mask[1] && mask[2];
|
||||
}
|
||||
|
||||
[[nodiscard]] bool operator!=(const CVector3f& rhs) const { return !(*this == rhs); }
|
||||
|
||||
[[nodiscard]] simd<float>::mask_type operator>(const CVector3f& rhs) const { return mSimd > rhs.mSimd; }
|
||||
[[nodiscard]] simd<float>::mask_type operator>=(const CVector3f& rhs) const { return mSimd >= rhs.mSimd; }
|
||||
[[nodiscard]] simd<float>::mask_type operator<(const CVector3f& rhs) const { return mSimd < rhs.mSimd; }
|
||||
[[nodiscard]] simd<float>::mask_type operator<=(const CVector3f& rhs) const { return mSimd <= rhs.mSimd; }
|
||||
|
||||
[[nodiscard]] CVector3f operator+(const CVector3f& rhs) const { return mSimd + rhs.mSimd; }
|
||||
|
||||
[[nodiscard]] CVector3f operator-(const CVector3f& rhs) const { return mSimd - rhs.mSimd; }
|
||||
|
@ -83,16 +88,13 @@ public:
|
|||
|
||||
[[nodiscard]] CVector3f operator/(const CVector3f& rhs) const { return mSimd / rhs.mSimd; }
|
||||
|
||||
[[nodiscard]] CVector3f operator+(float val) const { return mSimd + zeus::simd<float>(val); }
|
||||
[[nodiscard]] CVector3f operator+(float val) const { return mSimd + val; }
|
||||
|
||||
[[nodiscard]] CVector3f operator-(float val) const { return mSimd - zeus::simd<float>(val); }
|
||||
[[nodiscard]] CVector3f operator-(float val) const { return mSimd - val; }
|
||||
|
||||
[[nodiscard]] CVector3f operator*(float val) const { return mSimd * zeus::simd<float>(val); }
|
||||
[[nodiscard]] CVector3f operator*(float val) const { return mSimd * val; }
|
||||
|
||||
[[nodiscard]] CVector3f operator/(float val) const {
|
||||
const float ooval = 1.f / val;
|
||||
return mSimd * zeus::simd<float>(ooval);
|
||||
}
|
||||
[[nodiscard]] CVector3f operator/(float val) const { return mSimd / val; }
|
||||
|
||||
const CVector3f& operator+=(const CVector3f& rhs) {
|
||||
mSimd += rhs.mSimd;
|
||||
|
@ -125,7 +127,7 @@ public:
|
|||
}
|
||||
|
||||
[[nodiscard]] CVector3f cross(const CVector3f& rhs) const {
|
||||
return CVector3f(y() * rhs.z() - z() * rhs.y(), z() * rhs.x() - x() * rhs.z(), x() * rhs.y() - y() * rhs.x());
|
||||
return {y() * rhs.z() - z() * rhs.y(), z() * rhs.x() - x() * rhs.z(), x() * rhs.y() - y() * rhs.x()};
|
||||
}
|
||||
|
||||
[[nodiscard]] float dot(const CVector3f& rhs) const { return mSimd.dot3(rhs.mSimd); }
|
||||
|
@ -138,9 +140,9 @@ public:
|
|||
|
||||
[[nodiscard]] bool isMagnitudeSafe() const { return isNotInf() && magSquared() >= 9.9999994e-29; }
|
||||
|
||||
void zeroOut() { mSimd = zeus::simd<float>(0.f); }
|
||||
void zeroOut() { mSimd.broadcast(0.f); }
|
||||
|
||||
void splat(float xyz) { mSimd = zeus::simd<float>(xyz); }
|
||||
void splat(float xyz) { mSimd.broadcast(xyz); }
|
||||
|
||||
[[nodiscard]] static float getAngleDiff(const CVector3f& a, const CVector3f& b);
|
||||
|
||||
|
|
|
@ -18,7 +18,7 @@ class CVector4f {
|
|||
public:
|
||||
zeus::simd<float> mSimd;
|
||||
|
||||
constexpr CVector4f() : mSimd(0.f) {}
|
||||
constexpr CVector4f() : mSimd() {}
|
||||
|
||||
template <typename T>
|
||||
constexpr CVector4f(const simd<T>& s) : mSimd(s) {}
|
||||
|
|
|
@ -713,7 +713,7 @@ public:
|
|||
|
||||
operator _Vp() const { return __ptr_->__get(__index_); }
|
||||
|
||||
__simd_reference operator=(_Vp __value) && {
|
||||
constexpr __simd_reference& operator=(_Vp __value) && {
|
||||
__ptr_->__set(__index_, __value);
|
||||
return *this;
|
||||
}
|
||||
|
@ -1199,9 +1199,9 @@ public:
|
|||
using mask_type = simd_mask<_Tp, _Abi>;
|
||||
using abi_type = _Abi;
|
||||
|
||||
simd() = default;
|
||||
simd(const simd&) = default;
|
||||
simd& operator=(const simd&) = default;
|
||||
constexpr simd() = default;
|
||||
constexpr simd(const simd&) = default;
|
||||
constexpr simd& operator=(const simd&) = default;
|
||||
|
||||
static constexpr size_t size() noexcept { return simd_size<_Tp, _Abi>::value; }
|
||||
|
||||
|
@ -1230,13 +1230,14 @@ private:
|
|||
}
|
||||
|
||||
template <class _Generator, size_t... __indicies>
|
||||
void __generator_init(_Generator&& __g, std::index_sequence<__indicies...>) {
|
||||
constexpr void __generator_init(_Generator&& __g, std::index_sequence<__indicies...>) {
|
||||
int __not_used[]{((*this)[__indicies] = __g(std::integral_constant<size_t, __indicies>()), 0)...};
|
||||
(void)__not_used;
|
||||
}
|
||||
|
||||
public:
|
||||
simd(const typename __simd_storage<_Tp, _Abi>::storage_type& s) : __s_(s) {}
|
||||
constexpr simd(const __simd_storage<_Tp, _Abi>& s) : __s_(s) {}
|
||||
constexpr simd(const typename __simd_storage<_Tp, _Abi>::storage_type& s) : __s_(s) {}
|
||||
|
||||
#if 0
|
||||
// implicit type conversion constructor
|
||||
|
@ -1254,7 +1255,7 @@ public:
|
|||
template <class _Up, class _UAbi,
|
||||
class = typename std::enable_if<
|
||||
std::is_constructible<__simd_storage<_Tp, _Abi>, __simd_storage<_Up, _UAbi>>::value>>
|
||||
simd(const simd<_Up, _UAbi>& __v) : __s_(__v.__s_) {}
|
||||
constexpr simd(const simd<_Up, _UAbi>& __v) : __s_(__v.__s_) {}
|
||||
|
||||
#if 0
|
||||
template <class _Up, class _UAbi,
|
||||
|
@ -1282,8 +1283,8 @@ public:
|
|||
|
||||
// generator constructor
|
||||
template <class _Generator,
|
||||
int = typename std::enable_if<__can_generate<_Generator>(
|
||||
std::make_index_sequence<simd_size<_Tp, _Abi>::value>()), int>::type()>
|
||||
int = typename std::enable_if<
|
||||
__can_generate<_Generator>(std::make_index_sequence<simd_size<_Tp, _Abi>::value>()), int>::type()>
|
||||
explicit simd(_Generator&& __g) {
|
||||
__generator_init(std::forward<_Generator>(__g), std::make_index_sequence<simd_size<_Tp, _Abi>::value>());
|
||||
}
|
||||
|
@ -1325,10 +1326,13 @@ public:
|
|||
// stores [simd.store]
|
||||
void copy_to(simd_data<simd>& __buffer) const { __s_.__copy_to(__buffer); }
|
||||
|
||||
// scalar access [simd.subscr]
|
||||
reference operator[](size_t __i) { return reference(&__s_, __i); }
|
||||
constexpr void set(_Tp a, _Tp b, _Tp c = {}, _Tp d = {}) { __s_.__set4(a, b, c, d); }
|
||||
constexpr void broadcast(_Tp rv) { __s_.__broadcast(rv); }
|
||||
|
||||
value_type operator[](size_t __i) const { return __s_.__get(__i); }
|
||||
// scalar access [simd.subscr]
|
||||
constexpr reference operator[](size_t __i) { return reference(&__s_, __i); }
|
||||
|
||||
constexpr value_type operator[](size_t __i) const { return __s_.__get(__i); }
|
||||
|
||||
// unary operators [simd.unary]
|
||||
simd& operator++();
|
||||
|
@ -1377,15 +1381,13 @@ public:
|
|||
friend mask_type operator>(const simd&, const simd&);
|
||||
friend mask_type operator<(const simd&, const simd&);
|
||||
|
||||
value_type dot2(const simd& other) const { return __s_.__dot2(other.__s_); }
|
||||
value_type dot3(const simd& other) const { return __s_.__dot3(other.__s_); }
|
||||
value_type dot4(const simd& other) const { return __s_.__dot4(other.__s_); }
|
||||
constexpr value_type dot2(const simd& other) const { return __s_.__dot2(other.__s_); }
|
||||
constexpr value_type dot3(const simd& other) const { return __s_.__dot3(other.__s_); }
|
||||
constexpr value_type dot4(const simd& other) const { return __s_.__dot4(other.__s_); }
|
||||
|
||||
template <int x, int y, int z, int w>
|
||||
simd shuffle() const {
|
||||
simd s;
|
||||
s.__s_ = __s_.template __shuffle<x, y, z, w>();
|
||||
return s;
|
||||
constexpr simd shuffle() const {
|
||||
return __s_.template __shuffle<x, y, z, w>();
|
||||
}
|
||||
|
||||
const typename __simd_storage<_Tp, _Abi>::storage_type& native() const { return __s_.__native(); }
|
||||
|
@ -1475,50 +1477,48 @@ private:
|
|||
friend class simd_mask;
|
||||
|
||||
public:
|
||||
_Tp __get(size_t __index) const noexcept { return __storage_[__index]; };
|
||||
void __set(size_t __index, _Tp __val) noexcept { __storage_[__index] = __val; }
|
||||
std::enable_if_t<__num_element >= 4> __set4(float a, float b, float c, float d) noexcept {
|
||||
constexpr _Tp __get(size_t __index) const noexcept { return __storage_[__index]; };
|
||||
constexpr void __set(size_t __index, _Tp __val) noexcept { __storage_[__index] = __val; }
|
||||
constexpr std::enable_if_t<__num_element >= 4> __set4(float a, float b, float c, float d) noexcept {
|
||||
__storage_[0] = a;
|
||||
__storage_[1] = b;
|
||||
__storage_[2] = c;
|
||||
__storage_[3] = d;
|
||||
}
|
||||
void __broadcast(float __val) noexcept { std::fill(__storage_.begin(), __storage_.end(), __val); }
|
||||
std::enable_if_t<__num_element >= 2, _Tp> __dot2(const __simd_storage& other) const noexcept {
|
||||
constexpr void __broadcast(float __val) noexcept { std::fill(__storage_.begin(), __storage_.end(), __val); }
|
||||
constexpr std::enable_if_t<__num_element >= 2, _Tp> __dot2(const __simd_storage& other) const noexcept {
|
||||
return __storage_[0] * other.__storage_[0] + __storage_[1] * other.__storage_[1];
|
||||
}
|
||||
std::enable_if_t<__num_element >= 3, _Tp> __dot3(const __simd_storage& other) const noexcept {
|
||||
constexpr std::enable_if_t<__num_element >= 3, _Tp> __dot3(const __simd_storage& other) const noexcept {
|
||||
return __storage_[0] * other.__storage_[0] + __storage_[1] * other.__storage_[1] +
|
||||
__storage_[2] * other.__storage_[2];
|
||||
}
|
||||
std::enable_if_t<__num_element >= 4, _Tp> __dot4(const __simd_storage& other) const noexcept {
|
||||
constexpr std::enable_if_t<__num_element >= 4, _Tp> __dot4(const __simd_storage& other) const noexcept {
|
||||
return __storage_[0] * other.__storage_[0] + __storage_[1] * other.__storage_[1] +
|
||||
__storage_[2] * other.__storage_[2] + __storage_[3] * other.__storage_[3];
|
||||
}
|
||||
template <int x, int y, int z, int w>
|
||||
std::enable_if_t<__num_element >= 4, __simd_storage> __shuffle() const noexcept {
|
||||
__simd_storage s;
|
||||
s.__storage_[0] = __storage_[x];
|
||||
s.__storage_[1] = __storage_[y];
|
||||
s.__storage_[2] = __storage_[z];
|
||||
s.__storage_[3] = __storage_[w];
|
||||
return s;
|
||||
constexpr std::enable_if_t<__num_element >= 4, __simd_storage> __shuffle() const noexcept {
|
||||
return {__storage_[x], __storage_[y], __storage_[z], __storage_[w]};
|
||||
}
|
||||
|
||||
void __copy_from(const simd_data<simd<_Tp, __simd_abi<_StorageKind::_Array, __num_element>>>& __buffer) noexcept {
|
||||
constexpr void
|
||||
__copy_from(const simd_data<simd<_Tp, __simd_abi<_StorageKind::_Array, __num_element>>>& __buffer) noexcept {
|
||||
std::copy(__buffer.begin(), __buffer.end(), __storage_.begin());
|
||||
}
|
||||
|
||||
void __copy_to(simd_data<simd<_Tp, __simd_abi<_StorageKind::_Array, __num_element>>>& __buffer) const noexcept {
|
||||
constexpr void __copy_to(simd_data<simd<_Tp, __simd_abi<_StorageKind::_Array, __num_element>>>& __buffer) const
|
||||
noexcept {
|
||||
std::copy(__storage_.begin(), __storage_.end(), __buffer.begin());
|
||||
}
|
||||
|
||||
__simd_storage() = default;
|
||||
constexpr __simd_storage() = default;
|
||||
template <class _Up, int __Unum_element>
|
||||
explicit __simd_storage(const __simd_storage<_Up, __simd_abi<_StorageKind::_Array, __Unum_element>>& other) {
|
||||
constexpr explicit __simd_storage(
|
||||
const __simd_storage<_Up, __simd_abi<_StorageKind::_Array, __Unum_element>>& other) {
|
||||
std::copy(other.__native().begin(), other.__native().end(), __storage_.begin());
|
||||
}
|
||||
const storage_type& __native() const { return __storage_; }
|
||||
constexpr const storage_type& __native() const { return __storage_; }
|
||||
};
|
||||
|
||||
template <class _Tp, int __num_element>
|
||||
|
@ -1526,8 +1526,8 @@ class __simd_mask_storage<_Tp, __simd_abi<_StorageKind::_Array, __num_element>>
|
|||
std::bitset<__num_element> __storage_;
|
||||
|
||||
public:
|
||||
bool __get(size_t __index) const noexcept { return __storage_.test(__index); }
|
||||
void __set(size_t __index, bool __val) noexcept { __storage_.set(__index, __val); }
|
||||
[[nodiscard]] constexpr bool __get(size_t __index) const noexcept { return __storage_.test(__index); }
|
||||
constexpr void __set(size_t __index, bool __val) noexcept { __storage_.set(__index, __val); }
|
||||
};
|
||||
|
||||
} // namespace zeus::_simd
|
||||
|
|
|
@ -15,6 +15,8 @@ using namespace std;
|
|||
#include "simd_avx.hpp"
|
||||
#elif __SSE__
|
||||
#include "simd_sse.hpp"
|
||||
#elif __ARM_NEON
|
||||
#include "simd_neon.hpp"
|
||||
#else
|
||||
namespace simd_abi {
|
||||
template <typename T>
|
||||
|
|
|
@ -0,0 +1,341 @@
|
|||
#pragma once
|
||||
#include <arm_neon.h>
|
||||
#ifndef _ZEUS_SIMD_INCLUDED
|
||||
#error simd_neon.hpp must not be included directly. Include simd.hpp instead.
|
||||
#endif
|
||||
namespace zeus::_simd {
|
||||
// __m128 ABI
|
||||
using m128_abi = __simd_abi<_StorageKind(int(_StorageKind::_Array) + 1), 4>;
|
||||
// __m128d ABI
|
||||
using m128d_abi = __simd_abi<_StorageKind(int(_StorageKind::_Array) + 2), 4>;
|
||||
|
||||
template <>
|
||||
class __simd_storage<double, m128d_abi>;
|
||||
|
||||
// __m128 storage for NEON
|
||||
template <>
|
||||
class __simd_storage<float, m128_abi> {
|
||||
public:
|
||||
using storage_type = float32x4_t;
|
||||
storage_type __storage_{};
|
||||
[[nodiscard]] constexpr float __get(size_t __index) const noexcept { return __storage_[__index]; }
|
||||
inline void __set(size_t __index, float __val) noexcept { __storage_[__index] = __val; }
|
||||
constexpr __simd_storage(float a, float b, float c, float d) : __storage_{a, b, c, d} {}
|
||||
constexpr void __set4(float a, float b, float c, float d) noexcept { __storage_ = storage_type{a, b, c, d}; }
|
||||
constexpr explicit __simd_storage(float rv) : __simd_storage(rv, rv, rv, rv) {}
|
||||
inline void __broadcast(float __val) noexcept { __storage_ = vdupq_n_f32(__val); }
|
||||
[[nodiscard]] inline float __dot2(const __simd_storage<float, m128_abi>& other) const noexcept {
|
||||
return vaddv_f32(vget_low_f32(vmulq_f32(__storage_, other.__storage_)));
|
||||
}
|
||||
[[nodiscard]] inline float __dot3(const __simd_storage<float, m128_abi>& other) const noexcept {
|
||||
return vaddvq_f32(vsetq_lane_f32(0.f, vmulq_f32(__storage_, other.__storage_), 3));
|
||||
}
|
||||
[[nodiscard]] inline float __dot4(const __simd_storage<float, m128_abi>& other) const noexcept {
|
||||
return vaddvq_f32(vmulq_f32(__storage_, other.__storage_));
|
||||
}
|
||||
template <int x, int y, int z, int w>
|
||||
[[nodiscard]] inline __simd_storage __shuffle() const noexcept {
|
||||
storage_type ret;
|
||||
ret = vmovq_n_f32(vgetq_lane_f32(__storage_, x));
|
||||
ret = vsetq_lane_f32(vgetq_lane_f32(__storage_, y), ret, 1);
|
||||
ret = vsetq_lane_f32(vgetq_lane_f32(__storage_, z), ret, 2);
|
||||
ret = vsetq_lane_f32(vgetq_lane_f32(__storage_, w), ret, 3);
|
||||
return __simd_storage(ret);
|
||||
}
|
||||
|
||||
inline void __copy_from(const simd_data<simd<float, m128_abi>>& __buffer) noexcept {
|
||||
__storage_ = vld1q_f32(__buffer.data());
|
||||
}
|
||||
|
||||
inline void __copy_to(simd_data<simd<float, m128_abi>>& __buffer) const noexcept { vst1q_f32(__buffer.data(), __storage_); }
|
||||
|
||||
constexpr __simd_storage() = default;
|
||||
explicit __simd_storage(const __simd_storage<double, m128d_abi>& other);
|
||||
|
||||
constexpr explicit __simd_storage(const storage_type& s) : __storage_(s) {}
|
||||
[[nodiscard]] constexpr const storage_type& __native() const { return __storage_; }
|
||||
};
|
||||
// __m128 mask storage for NEON
|
||||
template <>
|
||||
class __simd_mask_storage<float, m128_abi> : public __simd_storage<float, m128_abi> {
|
||||
public:
|
||||
inline bool __get(size_t __index) const noexcept {
|
||||
return vreinterpretq_u32_f32(__storage_)[__index] != 0;
|
||||
}
|
||||
inline void __set(size_t __index, bool __val) noexcept {
|
||||
uint32x4_t data = vreinterpretq_u32_f32(__storage_);
|
||||
data[__index] = __val ? UINT32_MAX : 0;
|
||||
__storage_ = vreinterpretq_f32_u32(data);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
inline simd<float, m128_abi> simd<float, m128_abi>::operator-() const {
|
||||
return vreinterpretq_f32_s32(
|
||||
veorq_s32(vreinterpretq_s32_f32(__s_.__storage_), vreinterpretq_s32_f32(vdupq_n_f32(-0.f))));
|
||||
}
|
||||
|
||||
inline simd<float, m128_abi> operator+(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
return vaddq_f32(a.__s_.__storage_, b.__s_.__storage_);
|
||||
}
|
||||
|
||||
inline simd<float, m128_abi> operator-(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
return vsubq_f32(a.__s_.__storage_, b.__s_.__storage_);
|
||||
}
|
||||
|
||||
inline simd<float, m128_abi> operator*(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
return vmulq_f32(a.__s_.__storage_, b.__s_.__storage_);
|
||||
}
|
||||
|
||||
inline simd<float, m128_abi> operator/(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
return vdivq_f32(a.__s_.__storage_, b.__s_.__storage_);
|
||||
}
|
||||
|
||||
inline simd<float, m128_abi>& operator+=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
a.__s_.__storage_ += b.__s_.__storage_;
|
||||
return a;
|
||||
}
|
||||
|
||||
inline simd<float, m128_abi>& operator-=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
a.__s_.__storage_ -= b.__s_.__storage_;
|
||||
return a;
|
||||
}
|
||||
|
||||
inline simd<float, m128_abi>& operator*=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
a.__s_.__storage_ *= b.__s_.__storage_;
|
||||
return a;
|
||||
}
|
||||
|
||||
inline simd<float, m128_abi>& operator/=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
a.__s_.__storage_ /= b.__s_.__storage_;
|
||||
return a;
|
||||
}
|
||||
|
||||
inline simd<float, m128_abi>::mask_type operator==(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
simd<float, m128_abi>::mask_type ret;
|
||||
ret.__s_.__storage_ = vreinterpretq_f32_u32(vceqq_f32(a.__s_.__storage_, b.__s_.__storage_));
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline simd<float, m128_abi>::mask_type operator!=(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
simd<float, m128_abi>::mask_type ret;
|
||||
ret.__s_.__storage_ = vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(a.__s_.__storage_, b.__s_.__storage_)));
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline simd<float, m128_abi>::mask_type operator>=(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
simd<float, m128_abi>::mask_type ret;
|
||||
ret.__s_.__storage_ = vreinterpretq_f32_u32(vcgeq_f32(a.__s_.__storage_, b.__s_.__storage_));
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline simd<float, m128_abi>::mask_type operator<=(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
simd<float, m128_abi>::mask_type ret;
|
||||
ret.__s_.__storage_ = vreinterpretq_f32_u32(vcleq_f32(a.__s_.__storage_, b.__s_.__storage_));
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline simd<float, m128_abi>::mask_type operator>(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
simd<float, m128_abi>::mask_type ret;
|
||||
ret.__s_.__storage_ = vreinterpretq_f32_u32(vcgtq_f32(a.__s_.__storage_, b.__s_.__storage_));
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline simd<float, m128_abi>::mask_type operator<(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
simd<float, m128_abi>::mask_type ret;
|
||||
ret.__s_.__storage_ = vreinterpretq_f32_u32(vcltq_f32(a.__s_.__storage_, b.__s_.__storage_));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// __m128d storage for NEON
|
||||
template <>
|
||||
class __simd_storage<double, m128d_abi> {
|
||||
public:
|
||||
using storage_type = float64x2x2_t;
|
||||
using vector_type = float64x2_t;
|
||||
storage_type __storage_{};
|
||||
[[nodiscard]] inline double __get(size_t __index) const noexcept { return __storage_.val[__index / 2][__index % 2]; }
|
||||
inline void __set(size_t __index, double __val) noexcept { __storage_.val[__index / 2][__index % 2] = __val; }
|
||||
// Make GCC happy
|
||||
static constexpr storage_type __make_array(vector_type a, vector_type b) { return {a, b}; }
|
||||
constexpr __simd_storage(double a, double b, double c, double d)
|
||||
: __storage_(__make_array(vector_type{a, b}, vector_type{c, d})) {}
|
||||
constexpr void __set4(double a, double b, double c, double d) noexcept {
|
||||
__storage_.val[0] = vector_type{a, b};
|
||||
__storage_.val[1] = vector_type{c, d};
|
||||
}
|
||||
constexpr __simd_storage(double rv) : __simd_storage(rv, rv, rv, rv) {}
|
||||
constexpr void __broadcast(double __val) noexcept { __set4(__val, __val, __val, __val); }
|
||||
[[nodiscard]] inline double __dot2(const __simd_storage<double, m128d_abi>& other) const noexcept {
|
||||
return vaddvq_f64(vmulq_f64(__storage_.val[0], other.__storage_.val[0]));
|
||||
}
|
||||
[[nodiscard]] inline double __dot3(const __simd_storage<double, m128d_abi>& other) const noexcept {
|
||||
const vector_type mul1 = vmulq_f64(__storage_.val[0], other.__storage_.val[0]);
|
||||
const vector_type mul2 = vmulq_f64(__storage_.val[1], other.__storage_.val[1]);
|
||||
return vaddvq_f64(vcombine_f64(vcreate_f64(vaddvq_f64(mul1)), vget_low_f64(mul2)));
|
||||
}
|
||||
[[nodiscard]] inline double __dot4(const __simd_storage<double, m128d_abi>& other) const noexcept {
|
||||
const vector_type mul1 = vmulq_f64(__storage_.val[0], other.__storage_.val[0]);
|
||||
const vector_type mul2 = vmulq_f64(__storage_.val[1], other.__storage_.val[1]);
|
||||
return vaddvq_f64(vaddq_f64(mul1, mul2));
|
||||
}
|
||||
|
||||
inline void __copy_from(const simd_data<simd<double, m128d_abi>>& __buffer) noexcept {
|
||||
__storage_ = vld2q_f64(__buffer.data());
|
||||
}
|
||||
|
||||
inline void __copy_to(simd_data<simd<double, m128d_abi>>& __buffer) const noexcept {
|
||||
vst2q_f64(__buffer.data(), __storage_);
|
||||
}
|
||||
|
||||
constexpr __simd_storage() = default;
|
||||
explicit inline __simd_storage(const __simd_storage<float, m128_abi>& other) {
|
||||
__storage_.val[0] = vcvt_f64_f32(vget_low_f32(other.__storage_));
|
||||
__storage_.val[1] = vcvt_f64_f32(vget_high_f32(other.__storage_));
|
||||
}
|
||||
|
||||
constexpr explicit __simd_storage(const storage_type& s) : __storage_(s) {}
|
||||
[[nodiscard]] constexpr const storage_type& __native() const { return __storage_; }
|
||||
};
|
||||
// __m128d mask storage for NEON
|
||||
template <>
|
||||
class __simd_mask_storage<double, m128d_abi> : public __simd_storage<double, m128d_abi> {
|
||||
public:
|
||||
inline bool __get(size_t __index) const noexcept {
|
||||
return vreinterpretq_u64_f64(__storage_.val[__index / 2])[__index % 2] != 0;
|
||||
}
|
||||
inline void __set(size_t __index, bool __val) noexcept {
|
||||
uint64x2_t vec = vreinterpretq_u64_f64(__storage_.val[__index / 2]);
|
||||
vec[__index % 2] = __val ? UINT64_MAX : 0;
|
||||
__storage_.val[__index / 2] = vreinterpretq_f64_u64(vec);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
inline simd<double, m128d_abi> simd<double, m128d_abi>::operator-() const {
|
||||
simd<double, m128d_abi> ret;
|
||||
for (int i = 0; i < 2; ++i)
|
||||
ret.__s_.__storage_.val[i] = vreinterpretq_f64_s64(
|
||||
veorq_s64(vreinterpretq_s64_f64(__s_.__storage_.val[i]), vreinterpretq_s64_f64(vdupq_n_f64(-0.0))));
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline simd<double, m128d_abi> operator+(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
|
||||
simd<double, m128d_abi> ret;
|
||||
for (int i = 0; i < 2; ++i)
|
||||
ret.__s_.__storage_.val[i] = vaddq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline simd<double, m128d_abi> operator-(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
|
||||
simd<double, m128d_abi> ret;
|
||||
for (int i = 0; i < 2; ++i)
|
||||
ret.__s_.__storage_.val[i] = vsubq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline simd<double, m128d_abi> operator*(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
|
||||
simd<double, m128d_abi> ret;
|
||||
for (int i = 0; i < 2; ++i)
|
||||
ret.__s_.__storage_.val[i] = vmulq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline simd<double, m128d_abi> operator/(const simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
|
||||
simd<double, m128d_abi> ret;
|
||||
for (int i = 0; i < 2; ++i)
|
||||
ret.__s_.__storage_.val[i] = vdivq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline simd<double, m128d_abi>& operator+=(simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
|
||||
for (int i = 0; i < 2; ++i)
|
||||
a.__s_.__storage_.val[i] = vaddq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
|
||||
return a;
|
||||
}
|
||||
|
||||
inline simd<double, m128d_abi>& operator-=(simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
|
||||
for (int i = 0; i < 2; ++i)
|
||||
a.__s_.__storage_.val[i] = vsubq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
|
||||
return a;
|
||||
}
|
||||
|
||||
inline simd<double, m128d_abi>& operator*=(simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
|
||||
for (int i = 0; i < 2; ++i)
|
||||
a.__s_.__storage_.val[i] = vmulq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
|
||||
return a;
|
||||
}
|
||||
|
||||
inline simd<double, m128d_abi>& operator/=(simd<double, m128d_abi>& a, const simd<double, m128d_abi>& b) {
|
||||
for (int i = 0; i < 2; ++i)
|
||||
a.__s_.__storage_.val[i] = vdivq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]);
|
||||
return a;
|
||||
}
|
||||
|
||||
inline simd<double, m128d_abi>::mask_type operator==(const simd<double, m128d_abi>& a,
|
||||
const simd<double, m128d_abi>& b) {
|
||||
simd<double, m128d_abi>::mask_type ret;
|
||||
for (int i = 0; i < 2; ++i)
|
||||
ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vceqq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]));
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline simd<double, m128d_abi>::mask_type operator!=(const simd<double, m128d_abi>& a,
|
||||
const simd<double, m128d_abi>& b) {
|
||||
simd<double, m128d_abi>::mask_type ret;
|
||||
for (int i = 0; i < 2; ++i)
|
||||
ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vreinterpretq_u64_u32(
|
||||
vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i])))));
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline simd<double, m128d_abi>::mask_type operator>=(const simd<double, m128d_abi>& a,
|
||||
const simd<double, m128d_abi>& b) {
|
||||
simd<double, m128d_abi>::mask_type ret;
|
||||
for (int i = 0; i < 2; ++i)
|
||||
ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vcgeq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]));
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline simd<double, m128d_abi>::mask_type operator<=(const simd<double, m128d_abi>& a,
|
||||
const simd<double, m128d_abi>& b) {
|
||||
simd<double, m128d_abi>::mask_type ret;
|
||||
for (int i = 0; i < 2; ++i)
|
||||
ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vcleq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]));
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline simd<double, m128d_abi>::mask_type operator>(const simd<double, m128d_abi>& a,
|
||||
const simd<double, m128d_abi>& b) {
|
||||
simd<double, m128d_abi>::mask_type ret;
|
||||
for (int i = 0; i < 2; ++i)
|
||||
ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vcgtq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]));
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline simd<double, m128d_abi>::mask_type operator<(const simd<double, m128d_abi>& a,
|
||||
const simd<double, m128d_abi>& b) {
|
||||
simd<double, m128d_abi>::mask_type ret;
|
||||
for (int i = 0; i < 2; ++i)
|
||||
ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vcltq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]));
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline __simd_storage<float, m128_abi>::__simd_storage(const __simd_storage<double, m128d_abi>& other) {
|
||||
__storage_ = vcombine_f32(vcvt_f32_f64(other.__storage_.val[0]), vcvt_f32_f64(other.__storage_.val[1]));
|
||||
}
|
||||
|
||||
namespace simd_abi {
|
||||
template <typename T>
|
||||
struct zeus_native {};
|
||||
template <>
|
||||
struct zeus_native<float> {
|
||||
using type = m128_abi;
|
||||
};
|
||||
template <>
|
||||
struct zeus_native<double> {
|
||||
using type = m128d_abi;
|
||||
};
|
||||
} // namespace simd_abi
|
||||
|
||||
} // namespace zeus::_simd
|
|
@ -29,23 +29,14 @@ template <>
|
|||
class __simd_storage<float, m128_abi> {
|
||||
public:
|
||||
using storage_type = __m128;
|
||||
storage_type __storage_;
|
||||
float __get(size_t __index) const noexcept {
|
||||
alignas(16) std::array<float, 4> sse_data;
|
||||
_mm_store_ps(sse_data.data(), __storage_);
|
||||
return sse_data[__index];
|
||||
}
|
||||
void __set(size_t __index, float __val) noexcept {
|
||||
alignas(16) std::array<float, 4> sse_data;
|
||||
_mm_store_ps(sse_data.data(), __storage_);
|
||||
sse_data[__index] = __val;
|
||||
__storage_ = _mm_load_ps(sse_data.data());
|
||||
}
|
||||
storage_type __storage_{};
|
||||
[[nodiscard]] inline float __get(size_t __index) const noexcept { return __storage_[__index]; }
|
||||
inline void __set(size_t __index, float __val) noexcept { __storage_[__index] = __val; }
|
||||
constexpr __simd_storage(float a, float b, float c, float d) : __storage_{a, b, c, d} {}
|
||||
void __set4(float a, float b, float c, float d) noexcept { __storage_ = _mm_set_ps(d, c, b, a); }
|
||||
constexpr __simd_storage(float rv) : __storage_{rv, rv, rv, rv} {}
|
||||
void __broadcast(float __val) noexcept { __storage_ = _mm_set1_ps(__val); }
|
||||
float __dot2(const __simd_storage<float, m128_abi>& other) const noexcept {
|
||||
constexpr void __set4(float a, float b, float c, float d) noexcept { __storage_ = storage_type{a, b, c, d}; }
|
||||
constexpr explicit __simd_storage(float rv) : __storage_{rv, rv, rv, rv} {}
|
||||
inline void __broadcast(float __val) noexcept { __storage_ = _mm_set1_ps(__val); }
|
||||
[[nodiscard]] inline float __dot2(const __simd_storage<float, m128_abi>& other) const noexcept {
|
||||
#if __SSE4_1__
|
||||
float ret;
|
||||
_mm_store_ss(&ret, _mm_dp_ps(__storage_, other.__storage_, 0x3F));
|
||||
|
@ -56,7 +47,7 @@ public:
|
|||
return sse_data[0] + sse_data[1];
|
||||
#endif
|
||||
}
|
||||
float __dot3(const __simd_storage<float, m128_abi>& other) const noexcept {
|
||||
[[nodiscard]] inline float __dot3(const __simd_storage<float, m128_abi>& other) const noexcept {
|
||||
#if __SSE4_1__
|
||||
float ret;
|
||||
_mm_store_ss(&ret, _mm_dp_ps(__storage_, other.__storage_, 0x7F));
|
||||
|
@ -67,7 +58,7 @@ public:
|
|||
return sse_data[0] + sse_data[1] + sse_data[2];
|
||||
#endif
|
||||
}
|
||||
float __dot4(const __simd_storage<float, m128_abi>& other) const noexcept {
|
||||
[[nodiscard]] inline float __dot4(const __simd_storage<float, m128_abi>& other) const noexcept {
|
||||
#if __SSE4_1__
|
||||
float ret;
|
||||
_mm_store_ss(&ret, _mm_dp_ps(__storage_, other.__storage_, 0xFF));
|
||||
|
@ -78,40 +69,39 @@ public:
|
|||
return sse_data[0] + sse_data[1] + sse_data[2] + sse_data[3];
|
||||
#endif
|
||||
}
|
||||
|
||||
template <int x, int y, int z, int w>
|
||||
__simd_storage __shuffle() const noexcept {
|
||||
__simd_storage s;
|
||||
s.__storage_ = _mm_shuffle_ps(__storage_, __storage_, _MM_SHUFFLE(w, z, y, x));
|
||||
return s;
|
||||
[[nodiscard]] constexpr __simd_storage __shuffle() const noexcept {
|
||||
return __simd_storage(_mm_shuffle_ps(__storage_, __storage_, _MM_SHUFFLE(w, z, y, x)));
|
||||
}
|
||||
|
||||
void __copy_from(const simd_data<simd<float, m128_abi>>& __buffer) noexcept {
|
||||
inline void __copy_from(const simd_data<simd<float, m128_abi>>& __buffer) noexcept {
|
||||
__storage_ = _mm_load_ps(__buffer.data());
|
||||
}
|
||||
|
||||
void __copy_to(simd_data<simd<float, m128_abi>>& __buffer) const noexcept {
|
||||
inline void __copy_to(simd_data<simd<float, m128_abi>>& __buffer) const noexcept {
|
||||
_mm_store_ps(__buffer.data(), __storage_);
|
||||
}
|
||||
|
||||
__simd_storage() = default;
|
||||
explicit __simd_storage(const __simd_storage<double, m128d_abi>& other);
|
||||
explicit inline __simd_storage(const __simd_storage<double, m128d_abi>& other);
|
||||
#ifdef __AVX__
|
||||
explicit __simd_storage(const __simd_storage<double, m256d_abi>& other);
|
||||
explicit inline __simd_storage(const __simd_storage<double, m256d_abi>& other);
|
||||
#endif
|
||||
|
||||
explicit __simd_storage(const storage_type& s) : __storage_(s) {}
|
||||
const storage_type& __native() const { return __storage_; }
|
||||
constexpr explicit __simd_storage(const storage_type& s) : __storage_(s) {}
|
||||
[[nodiscard]] constexpr const storage_type& __native() const { return __storage_; }
|
||||
};
|
||||
// __m128 mask storage for SSE2+
|
||||
template <>
|
||||
class __simd_mask_storage<float, m128_abi> : public __simd_storage<float, m128_abi> {
|
||||
public:
|
||||
bool __get(size_t __index) const noexcept {
|
||||
[[nodiscard]] inline bool __get(size_t __index) const noexcept {
|
||||
alignas(16) uint32_t sse_data[4];
|
||||
_mm_store_ps(reinterpret_cast<float*>(sse_data), __storage_);
|
||||
return sse_data[__index] != 0;
|
||||
}
|
||||
void __set(size_t __index, bool __val) noexcept {
|
||||
inline void __set(size_t __index, bool __val) noexcept {
|
||||
alignas(16) uint32_t sse_data[4];
|
||||
_mm_store_ps(reinterpret_cast<float*>(sse_data), __storage_);
|
||||
sse_data[__index] = __val ? UINT32_MAX : 0;
|
||||
|
@ -125,27 +115,19 @@ inline simd<float, m128_abi> simd<float, m128_abi>::operator-() const {
|
|||
}
|
||||
|
||||
inline simd<float, m128_abi> operator+(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
simd<float, m128_abi> ret;
|
||||
ret.__s_.__storage_ = _mm_add_ps(a.__s_.__storage_, b.__s_.__storage_);
|
||||
return ret;
|
||||
return _mm_add_ps(a.__s_.__storage_, b.__s_.__storage_);
|
||||
}
|
||||
|
||||
inline simd<float, m128_abi> operator-(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
simd<float, m128_abi> ret;
|
||||
ret.__s_.__storage_ = _mm_sub_ps(a.__s_.__storage_, b.__s_.__storage_);
|
||||
return ret;
|
||||
return _mm_sub_ps(a.__s_.__storage_, b.__s_.__storage_);
|
||||
}
|
||||
|
||||
inline simd<float, m128_abi> operator*(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
simd<float, m128_abi> ret;
|
||||
ret.__s_.__storage_ = _mm_mul_ps(a.__s_.__storage_, b.__s_.__storage_);
|
||||
return ret;
|
||||
return _mm_mul_ps(a.__s_.__storage_, b.__s_.__storage_);
|
||||
}
|
||||
|
||||
inline simd<float, m128_abi> operator/(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
simd<float, m128_abi> ret;
|
||||
ret.__s_.__storage_ = _mm_div_ps(a.__s_.__storage_, b.__s_.__storage_);
|
||||
return ret;
|
||||
return _mm_div_ps(a.__s_.__storage_, b.__s_.__storage_);
|
||||
}
|
||||
|
||||
inline simd<float, m128_abi>& operator+=(simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
|
||||
|
@ -209,31 +191,19 @@ template <>
|
|||
class __simd_storage<double, m128d_abi> {
|
||||
public:
|
||||
using storage_type = std::array<__m128d, 2>;
|
||||
storage_type __storage_;
|
||||
double __get(size_t __index) const noexcept {
|
||||
alignas(16) std::array<double, 2> sse_data;
|
||||
_mm_store_pd(sse_data.data(), __storage_[__index / 2]);
|
||||
return sse_data[__index % 2];
|
||||
}
|
||||
void __set(size_t __index, double __val) noexcept {
|
||||
alignas(16) std::array<double, 2> sse_data;
|
||||
_mm_store_pd(sse_data.data(), __storage_[__index / 2]);
|
||||
sse_data[__index % 2] = __val;
|
||||
__storage_[__index / 2] = _mm_load_pd(sse_data.data());
|
||||
}
|
||||
storage_type __storage_{};
|
||||
[[nodiscard]] inline double __get(size_t __index) const noexcept { return __storage_[__index / 2][__index % 2]; }
|
||||
inline void __set(size_t __index, double __val) noexcept { __storage_[__index / 2][__index % 2] = __val; }
|
||||
// Make GCC happy
|
||||
static constexpr storage_type __make_array(__m128d a, __m128d b) { return {a, b}; }
|
||||
constexpr __simd_storage(double a, double b, double c, double d)
|
||||
: __storage_(__make_array(__m128d{a, b}, __m128d{c, d})) {}
|
||||
void __set4(double a, double b, double c, double d) noexcept {
|
||||
__storage_[0] = _mm_set_pd(b, a);
|
||||
__storage_[1] = _mm_set_pd(d, c);
|
||||
constexpr __simd_storage(double a, double b, double c, double d) : __storage_(__make_array(__m128d{a, b}, __m128d{c, d})) {}
|
||||
constexpr void __set4(double a, double b, double c, double d) noexcept {
|
||||
__storage_[0] = __m128d{a, b};
|
||||
__storage_[1] = __m128d{c, d};
|
||||
}
|
||||
constexpr __simd_storage(double rv) : __storage_(__make_array(__m128d{rv, rv}, __m128d{rv, rv})) {}
|
||||
void __broadcast(double __val) noexcept {
|
||||
for (int i = 0; i < 2; ++i)
|
||||
__storage_[i] = _mm_set1_pd(__val);
|
||||
}
|
||||
double __dot2(const __simd_storage<double, m128d_abi>& other) const noexcept {
|
||||
constexpr __simd_storage(double rv) : __simd_storage(rv, rv, rv, rv) {}
|
||||
constexpr void __broadcast(double __val) noexcept { __set4(__val, __val, __val, __val); }
|
||||
[[nodiscard]] inline double __dot2(const __simd_storage<double, m128d_abi>& other) const noexcept {
|
||||
#if __SSE4_1__
|
||||
double ret;
|
||||
_mm_store_sd(&ret, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F));
|
||||
|
@ -244,7 +214,7 @@ public:
|
|||
return sse_data[0] + sse_data[1];
|
||||
#endif
|
||||
}
|
||||
double __dot3(const __simd_storage<double, m128d_abi>& other) const noexcept {
|
||||
[[nodiscard]] inline double __dot3(const __simd_storage<double, m128d_abi>& other) const noexcept {
|
||||
#if __SSE4_1__
|
||||
double ret;
|
||||
_mm_store_sd(&ret, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F));
|
||||
|
@ -259,7 +229,7 @@ public:
|
|||
return sse_data[0] + sse_data[1] + sse_data2[0];
|
||||
#endif
|
||||
}
|
||||
double __dot4(const __simd_storage<double, m128d_abi>& other) const noexcept {
|
||||
[[nodiscard]] inline double __dot4(const __simd_storage<double, m128d_abi>& other) const noexcept {
|
||||
#if __SSE4_1__
|
||||
double ret;
|
||||
_mm_store_sd(&ret, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F));
|
||||
|
@ -275,24 +245,24 @@ public:
|
|||
#endif
|
||||
}
|
||||
|
||||
void __copy_from(const simd_data<simd<double, m128d_abi>>& __buffer) noexcept {
|
||||
inline void __copy_from(const simd_data<simd<double, m128d_abi>>& __buffer) noexcept {
|
||||
__storage_[0] = _mm_load_pd(__buffer.data());
|
||||
__storage_[1] = _mm_load_pd(__buffer.data() + 2);
|
||||
}
|
||||
|
||||
void __copy_to(simd_data<simd<double, m128d_abi>>& __buffer) const noexcept {
|
||||
inline void __copy_to(simd_data<simd<double, m128d_abi>>& __buffer) const noexcept {
|
||||
_mm_store_pd(__buffer.data(), __storage_[0]);
|
||||
_mm_store_pd(__buffer.data() + 2, __storage_[1]);
|
||||
}
|
||||
|
||||
__simd_storage() = default;
|
||||
explicit __simd_storage(const __simd_storage<float, m128_abi>& other) {
|
||||
constexpr __simd_storage() = default;
|
||||
inline explicit __simd_storage(const __simd_storage<float, m128_abi>& other) {
|
||||
__storage_[0] = _mm_cvtps_pd(other.__storage_);
|
||||
__storage_[1] = _mm_cvtps_pd(_mm_movehl_ps(other.__storage_, other.__storage_));
|
||||
}
|
||||
|
||||
explicit __simd_storage(const storage_type& s) : __storage_(s) {}
|
||||
const storage_type& __native() const { return __storage_; }
|
||||
constexpr explicit __simd_storage(const storage_type& s) : __storage_(s) {}
|
||||
[[nodiscard]] constexpr const storage_type& __native() const { return __storage_; }
|
||||
};
|
||||
// __m128d mask storage for SSE2+
|
||||
template <>
|
||||
|
|
|
@ -34,8 +34,8 @@ void CMatrix3f::transpose() {
|
|||
m[1].mSimd = _mm_movehl_ps(T2, T0);
|
||||
m[2].mSimd = _mm_movelh_ps(T1, T3);
|
||||
#elif __ARM_NEON
|
||||
float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]);
|
||||
float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]);
|
||||
float32x4x2_t P0 = vzipq_f32(m[0].mSimd.native(), m[2].mSimd.native());
|
||||
float32x4x2_t P1 = vzipq_f32(m[1].mSimd.native(), m[3].mSimd.native());
|
||||
|
||||
float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
|
||||
float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);
|
||||
|
@ -69,8 +69,8 @@ CMatrix3f CMatrix3f::transposed() const {
|
|||
__m128 T3 = _mm_unpackhi_ps(m[2].mSimd.native(), zero);
|
||||
return CMatrix3f(_mm_movelh_ps(T0, T2), _mm_movehl_ps(T2, T0), _mm_movelh_ps(T1, T3));
|
||||
#elif __ARM_NEON
|
||||
float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]);
|
||||
float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]);
|
||||
float32x4x2_t P0 = vzipq_f32(m[0].mSimd.native(), m[2].mSimd.native());
|
||||
float32x4x2_t P1 = vzipq_f32(m[1].mSimd.native(), m[3].mSimd.native());
|
||||
|
||||
float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
|
||||
float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);
|
||||
|
|
|
@ -15,8 +15,8 @@ CMatrix4f CMatrix4f::transposed() const {
|
|||
ret.m[2].mSimd = _mm_movelh_ps(T1, T3);
|
||||
ret.m[3].mSimd = _mm_movehl_ps(T3, T1);
|
||||
#elif __ARM_NEON
|
||||
float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]);
|
||||
float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]);
|
||||
float32x4x2_t P0 = vzipq_f32(m[0].mSimd.native(), m[2].mSimd.native());
|
||||
float32x4x2_t P1 = vzipq_f32(m[1].mSimd.native(), m[3].mSimd.native());
|
||||
|
||||
float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
|
||||
float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);
|
||||
|
|
14
src/Math.cpp
14
src/Math.cpp
|
@ -10,10 +10,8 @@
|
|||
|
||||
#if _WIN32
|
||||
#include <intrin.h>
|
||||
#else
|
||||
|
||||
#elif __x86_64__
|
||||
#include <cpuid.h>
|
||||
|
||||
#endif
|
||||
|
||||
namespace zeus {
|
||||
|
@ -23,8 +21,8 @@ static CPUInfo g_cpuFeatures = {};
|
|||
static CPUInfo g_missingFeatures = {};
|
||||
|
||||
void getCpuInfo(int eax, int regs[4]) {
|
||||
#if !GEKKO
|
||||
#if _WIN32
|
||||
#if __x86_64__
|
||||
#if _WIN32
|
||||
__cpuid(regs, eax);
|
||||
#else
|
||||
__cpuid(eax, regs[0], regs[1], regs[2], regs[3]);
|
||||
|
@ -33,8 +31,8 @@ void getCpuInfo(int eax, int regs[4]) {
|
|||
}
|
||||
|
||||
void getCpuInfoEx(int eax, int ecx, int regs[4]) {
|
||||
#if !GEKKO
|
||||
#if _WIN32
|
||||
#if __x86_64__
|
||||
#if _WIN32
|
||||
__cpuidex(regs, eax, ecx);
|
||||
#else
|
||||
__cpuid_count(eax, ecx, regs[0], regs[1], regs[2], regs[3]);
|
||||
|
@ -43,7 +41,7 @@ void getCpuInfoEx(int eax, int ecx, int regs[4]) {
|
|||
}
|
||||
|
||||
void detectCPU() {
|
||||
#if !GEKKO
|
||||
#if __x86_64__
|
||||
if (isCPUInit)
|
||||
return;
|
||||
|
||||
|
|
Loading…
Reference in New Issue