From be2e0e0b7258b034c0ec2bafd089b12a838a376d Mon Sep 17 00:00:00 2001 From: Luke Street Date: Wed, 21 Oct 2020 02:00:44 -0400 Subject: [PATCH] SIMD updates from zeus --- include/athena/simd/parallelism_v2_simd.hpp | 144 ++++----- include/athena/simd/simd.hpp | 2 + include/athena/simd/simd_avx.hpp | 2 +- include/athena/simd/simd_neon.hpp | 341 ++++++++++++++++++++ include/athena/simd/simd_sse.hpp | 117 +++---- 5 files changed, 447 insertions(+), 159 deletions(-) create mode 100644 include/athena/simd/simd_neon.hpp diff --git a/include/athena/simd/parallelism_v2_simd.hpp b/include/athena/simd/parallelism_v2_simd.hpp index 804ca5b..3700e56 100644 --- a/include/athena/simd/parallelism_v2_simd.hpp +++ b/include/athena/simd/parallelism_v2_simd.hpp @@ -713,83 +713,59 @@ public: operator _Vp() const { return __ptr_->__get(__index_); } - __simd_reference operator=(_Vp __value) && { + constexpr __simd_reference& operator=(_Vp __value) && { __ptr_->__set(__index_, __value); return *this; } - __simd_reference operator++() && { return std::move(*this) = _Vp(*this) + 1; } + __simd_reference operator++() && { return std::move(*this) = __ptr_->__get(__index_) + 1; } _Vp operator++(int) && { - auto __val = _Vp(*this); + auto __val = __ptr_->__get(__index_); __ptr_->__set(__index_, __val + 1); return __val; } - __simd_reference operator--() && { return std::move(*this) = _Vp(*this) - 1; } + __simd_reference operator--() && { return std::move(*this) = __ptr_->__get(__index_) - 1; } _Vp operator--(int) && { - auto __val = _Vp(*this); + auto __val = __ptr_->__get(__index_); __ptr_->__set(__index_, __val - 1); return __val; } - __simd_reference operator+=(_Vp __value) && { return std::move(*this) = _Vp(*this) + __value; } + __simd_reference operator+=(_Vp __value) && { return std::move(*this) = __ptr_->__get(__index_) + __value; } - __simd_reference operator-=(_Vp __value) && { return std::move(*this) = _Vp(*this) - __value; } + __simd_reference operator-=(_Vp __value) && { return std::move(*this) = __ptr_->__get(__index_) - __value; } - __simd_reference operator*=(_Vp __value) && { return std::move(*this) = _Vp(*this) * __value; } + __simd_reference operator*=(_Vp __value) && { return std::move(*this) = __ptr_->__get(__index_) * __value; } - __simd_reference operator/=(_Vp __value) && { return std::move(*this) = _Vp(*this) / __value; } + __simd_reference operator/=(_Vp __value) && { return std::move(*this) = __ptr_->__get(__index_) / __value; } - __simd_reference operator%=(_Vp __value) && { return std::move(*this) = _Vp(*this) % __value; } + __simd_reference operator%=(_Vp __value) && { return std::move(*this) = __ptr_->__get(__index_) % __value; } - __simd_reference operator>>=(_Vp __value) && { return std::move(*this) = _Vp(*this) >> __value; } + __simd_reference operator>>=(_Vp __value) && { return std::move(*this) = __ptr_->__get(__index_) >> __value; } - __simd_reference operator<<=(_Vp __value) && { return std::move(*this) = _Vp(*this) << __value; } + __simd_reference operator<<=(_Vp __value) && { return std::move(*this) = __ptr_->__get(__index_) << __value; } - __simd_reference operator&=(_Vp __value) && { return std::move(*this) = _Vp(*this) & __value; } + __simd_reference operator&=(_Vp __value) && { return std::move(*this) = __ptr_->__get(__index_) & __value; } - __simd_reference operator|=(_Vp __value) && { return std::move(*this) = _Vp(*this) | __value; } + __simd_reference operator|=(_Vp __value) && { return std::move(*this) = __ptr_->__get(__index_) | __value; } - __simd_reference 
operator^=(_Vp __value) && { return std::move(*this) = _Vp(*this) ^ __value; } + __simd_reference operator^=(_Vp __value) && { return std::move(*this) = __ptr_->__get(__index_) ^ __value; } - bool operator<(const __simd_reference& __value) const { return _Vp(*this) < _Vp(__value); } + bool operator<(_Vp __value) const { return __ptr_->__get(__index_) < __value; } - bool operator<=(const __simd_reference& __value) const { return _Vp(*this) <= _Vp(__value); } + bool operator<=(_Vp __value) const { return __ptr_->__get(__index_) <= __value; } - bool operator>(const __simd_reference& __value) const { return _Vp(*this) > _Vp(__value); } + bool operator>(_Vp __value) const { return __ptr_->__get(__index_) > __value; } - bool operator>=(const __simd_reference& __value) const { return _Vp(*this) >= _Vp(__value); } + bool operator>=(_Vp __value) const { return __ptr_->__get(__index_) >= __value; } - bool operator==(const __simd_reference& __value) const { return _Vp(*this) == _Vp(__value); } + bool operator==(_Vp __value) const { return __ptr_->__get(__index_) == __value; } - bool operator!=(const __simd_reference& __value) const { return _Vp(*this) != _Vp(__value); } - - bool operator<(_Vp __value) const { return _Vp(*this) < __value; } - - bool operator<=(_Vp __value) const { return _Vp(*this) <= __value; } - - bool operator>(_Vp __value) const { return _Vp(*this) > __value; } - - bool operator>=(_Vp __value) const { return _Vp(*this) >= __value; } - - bool operator==(_Vp __value) const { return _Vp(*this) == __value; } - - bool operator!=(_Vp __value) const { return _Vp(*this) != __value; } + bool operator!=(_Vp __value) const { return __ptr_->__get(__index_) != __value; } }; -template -inline bool operator<(_Vp a, const __simd_reference<_Vp, _Tp, _Abi>& b) { return a < _Vp(b); } -template -inline bool operator<=(_Vp a, const __simd_reference<_Vp, _Tp, _Abi>& b) { return a <= _Vp(b); } -template -inline bool operator>(_Vp a, const __simd_reference<_Vp, _Tp, _Abi>& b) { return a > _Vp(b); } -template -inline bool operator>=(_Vp a, const __simd_reference<_Vp, _Tp, _Abi>& b) { return a >= _Vp(b); } -template -inline bool operator==(_Vp a, const __simd_reference<_Vp, _Tp, _Abi>& b) { return a == _Vp(b); } -template -inline bool operator!=(_Vp a, const __simd_reference<_Vp, _Tp, _Abi>& b) { return a != _Vp(b); } template class __simd_mask_reference { @@ -1223,9 +1199,9 @@ public: using mask_type = simd_mask<_Tp, _Abi>; using abi_type = _Abi; - simd() = default; - simd(const simd&) = default; - simd& operator=(const simd&) = default; + constexpr simd() = default; + constexpr simd(const simd&) = default; + constexpr simd& operator=(const simd&) = default; static constexpr size_t size() noexcept { return simd_size<_Tp, _Abi>::value; } @@ -1254,13 +1230,14 @@ private: } template - void __generator_init(_Generator&& __g, std::index_sequence<__indicies...>) { + constexpr void __generator_init(_Generator&& __g, std::index_sequence<__indicies...>) { int __not_used[]{((*this)[__indicies] = __g(std::integral_constant()), 0)...}; (void)__not_used; } public: - simd(const typename __simd_storage<_Tp, _Abi>::storage_type& s) : __s_(s) {} + constexpr simd(const __simd_storage<_Tp, _Abi>& s) : __s_(s) {} + constexpr simd(const typename __simd_storage<_Tp, _Abi>::storage_type& s) : __s_(s) {} #if 0 // implicit type conversion constructor @@ -1278,7 +1255,7 @@ public: template , __simd_storage<_Up, _UAbi>>::value>> - simd(const simd<_Up, _UAbi>& __v) : __s_(__v.__s_) {} + constexpr simd(const simd<_Up, _UAbi>& __v) : 
__s_(__v.__s_) {} #if 0 template ( - std::make_index_sequence::value>()), int>::type()> + int = typename std::enable_if< + __can_generate<_Generator>(std::make_index_sequence::value>()), int>::type()> explicit simd(_Generator&& __g) { __generator_init(std::forward<_Generator>(__g), std::make_index_sequence::value>()); } @@ -1349,10 +1326,13 @@ public: // stores [simd.store] void copy_to(simd_data& __buffer) const { __s_.__copy_to(__buffer); } - // scalar access [simd.subscr] - reference operator[](size_t __i) { return reference(&__s_, __i); } + constexpr void set(_Tp a, _Tp b, _Tp c = {}, _Tp d = {}) { __s_.__set4(a, b, c, d); } + constexpr void broadcast(_Tp rv) { __s_.__broadcast(rv); } - value_type operator[](size_t __i) const { return __s_.__get(__i); } + // scalar access [simd.subscr] + constexpr reference operator[](size_t __i) { return reference(&__s_, __i); } + + constexpr value_type operator[](size_t __i) const { return __s_.__get(__i); } // unary operators [simd.unary] simd& operator++(); @@ -1401,15 +1381,13 @@ public: friend mask_type operator>(const simd&, const simd&); friend mask_type operator<(const simd&, const simd&); - value_type dot2(const simd& other) const { return __s_.__dot2(other.__s_); } - value_type dot3(const simd& other) const { return __s_.__dot3(other.__s_); } - value_type dot4(const simd& other) const { return __s_.__dot4(other.__s_); } + constexpr value_type dot2(const simd& other) const { return __s_.__dot2(other.__s_); } + constexpr value_type dot3(const simd& other) const { return __s_.__dot3(other.__s_); } + constexpr value_type dot4(const simd& other) const { return __s_.__dot4(other.__s_); } template - simd shuffle() const { - simd s; - s.__s_ = __s_.template __shuffle(); - return s; + constexpr simd shuffle() const { + return __s_.template __shuffle(); } const typename __simd_storage<_Tp, _Abi>::storage_type& native() const { return __s_.__native(); } @@ -1499,50 +1477,48 @@ private: friend class simd_mask; public: - _Tp __get(size_t __index) const noexcept { return __storage_[__index]; }; - void __set(size_t __index, _Tp __val) noexcept { __storage_[__index] = __val; } - std::enable_if_t<__num_element >= 4> __set4(float a, float b, float c, float d) noexcept { + constexpr _Tp __get(size_t __index) const noexcept { return __storage_[__index]; }; + constexpr void __set(size_t __index, _Tp __val) noexcept { __storage_[__index] = __val; } + constexpr std::enable_if_t<__num_element >= 4> __set4(float a, float b, float c, float d) noexcept { __storage_[0] = a; __storage_[1] = b; __storage_[2] = c; __storage_[3] = d; } - void __broadcast(float __val) noexcept { std::fill(__storage_.begin(), __storage_.end(), __val); } - std::enable_if_t<__num_element >= 2, _Tp> __dot2(const __simd_storage& other) const noexcept { + constexpr void __broadcast(float __val) noexcept { std::fill(__storage_.begin(), __storage_.end(), __val); } + constexpr std::enable_if_t<__num_element >= 2, _Tp> __dot2(const __simd_storage& other) const noexcept { return __storage_[0] * other.__storage_[0] + __storage_[1] * other.__storage_[1]; } - std::enable_if_t<__num_element >= 3, _Tp> __dot3(const __simd_storage& other) const noexcept { + constexpr std::enable_if_t<__num_element >= 3, _Tp> __dot3(const __simd_storage& other) const noexcept { return __storage_[0] * other.__storage_[0] + __storage_[1] * other.__storage_[1] + __storage_[2] * other.__storage_[2]; } - std::enable_if_t<__num_element >= 4, _Tp> __dot4(const __simd_storage& other) const noexcept { + constexpr 
std::enable_if_t<__num_element >= 4, _Tp> __dot4(const __simd_storage& other) const noexcept { return __storage_[0] * other.__storage_[0] + __storage_[1] * other.__storage_[1] + __storage_[2] * other.__storage_[2] + __storage_[3] * other.__storage_[3]; } template - std::enable_if_t<__num_element >= 4, __simd_storage> __shuffle() const noexcept { - __simd_storage s; - s.__storage_[0] = __storage_[x]; - s.__storage_[1] = __storage_[y]; - s.__storage_[2] = __storage_[z]; - s.__storage_[3] = __storage_[w]; - return s; + constexpr std::enable_if_t<__num_element >= 4, __simd_storage> __shuffle() const noexcept { + return {__storage_[x], __storage_[y], __storage_[z], __storage_[w]}; } - void __copy_from(const simd_data>>& __buffer) noexcept { + constexpr void + __copy_from(const simd_data>>& __buffer) noexcept { std::copy(__buffer.begin(), __buffer.end(), __storage_.begin()); } - void __copy_to(simd_data>>& __buffer) const noexcept { + constexpr void __copy_to(simd_data>>& __buffer) const + noexcept { std::copy(__storage_.begin(), __storage_.end(), __buffer.begin()); } - __simd_storage() = default; + constexpr __simd_storage() = default; template - explicit __simd_storage(const __simd_storage<_Up, __simd_abi<_StorageKind::_Array, __Unum_element>>& other) { + constexpr explicit __simd_storage( + const __simd_storage<_Up, __simd_abi<_StorageKind::_Array, __Unum_element>>& other) { std::copy(other.__native().begin(), other.__native().end(), __storage_.begin()); } - const storage_type& __native() const { return __storage_; } + constexpr const storage_type& __native() const { return __storage_; } }; template @@ -1550,8 +1526,8 @@ class __simd_mask_storage<_Tp, __simd_abi<_StorageKind::_Array, __num_element>> std::bitset<__num_element> __storage_; public: - bool __get(size_t __index) const noexcept { return __storage_.test(__index); } - void __set(size_t __index, bool __val) noexcept { __storage_.set(__index, __val); } + [[nodiscard]] constexpr bool __get(size_t __index) const noexcept { return __storage_.test(__index); } + constexpr void __set(size_t __index, bool __val) noexcept { __storage_.set(__index, __val); } }; } // namespace athena::_simd diff --git a/include/athena/simd/simd.hpp b/include/athena/simd/simd.hpp index c0d9577..abf2448 100644 --- a/include/athena/simd/simd.hpp +++ b/include/athena/simd/simd.hpp @@ -15,6 +15,8 @@ using namespace std; #include "simd_avx.hpp" #elif __SSE__ #include "simd_sse.hpp" +#elif __ARM_NEON +#include "simd_neon.hpp" #else namespace simd_abi { template diff --git a/include/athena/simd/simd_avx.hpp b/include/athena/simd/simd_avx.hpp index 3169a34..828c464 100644 --- a/include/athena/simd/simd_avx.hpp +++ b/include/athena/simd/simd_avx.hpp @@ -178,4 +178,4 @@ struct athena_native { }; } // namespace simd_abi -} // namespace athena::_simd \ No newline at end of file +} // namespace athena::_simd diff --git a/include/athena/simd/simd_neon.hpp b/include/athena/simd/simd_neon.hpp new file mode 100644 index 0000000..75dd80c --- /dev/null +++ b/include/athena/simd/simd_neon.hpp @@ -0,0 +1,341 @@ +#pragma once +#include +#ifndef _ATHENA_SIMD_INCLUDED +#error simd_neon.hpp must not be included directly. Include simd.hpp instead. 
+#endif +namespace athena::_simd { +// __m128 ABI +using m128_abi = __simd_abi<_StorageKind(int(_StorageKind::_Array) + 1), 4>; +// __m128d ABI +using m128d_abi = __simd_abi<_StorageKind(int(_StorageKind::_Array) + 2), 4>; + +template <> +class __simd_storage; + +// __m128 storage for NEON +template <> +class __simd_storage { +public: + using storage_type = float32x4_t; + storage_type __storage_{}; + [[nodiscard]] constexpr float __get(size_t __index) const noexcept { return __storage_[__index]; } + inline void __set(size_t __index, float __val) noexcept { __storage_[__index] = __val; } + constexpr __simd_storage(float a, float b, float c, float d) : __storage_{a, b, c, d} {} + constexpr void __set4(float a, float b, float c, float d) noexcept { __storage_ = storage_type{a, b, c, d}; } + constexpr explicit __simd_storage(float rv) : __simd_storage(rv, rv, rv, rv) {} + inline void __broadcast(float __val) noexcept { __storage_ = vdupq_n_f32(__val); } + [[nodiscard]] inline float __dot2(const __simd_storage& other) const noexcept { + return vaddv_f32(vget_low_f32(vmulq_f32(__storage_, other.__storage_))); + } + [[nodiscard]] inline float __dot3(const __simd_storage& other) const noexcept { + return vaddvq_f32(vsetq_lane_f32(0.f, vmulq_f32(__storage_, other.__storage_), 3)); + } + [[nodiscard]] inline float __dot4(const __simd_storage& other) const noexcept { + return vaddvq_f32(vmulq_f32(__storage_, other.__storage_)); + } + template + [[nodiscard]] inline __simd_storage __shuffle() const noexcept { + storage_type ret; + ret = vmovq_n_f32(vgetq_lane_f32(__storage_, x)); + ret = vsetq_lane_f32(vgetq_lane_f32(__storage_, y), ret, 1); + ret = vsetq_lane_f32(vgetq_lane_f32(__storage_, z), ret, 2); + ret = vsetq_lane_f32(vgetq_lane_f32(__storage_, w), ret, 3); + return __simd_storage(ret); + } + + inline void __copy_from(const simd_data>& __buffer) noexcept { + __storage_ = vld1q_f32(__buffer.data()); + } + + inline void __copy_to(simd_data>& __buffer) const noexcept { vst1q_f32(__buffer.data(), __storage_); } + + constexpr __simd_storage() = default; + explicit __simd_storage(const __simd_storage& other); + + constexpr explicit __simd_storage(const storage_type& s) : __storage_(s) {} + [[nodiscard]] constexpr const storage_type& __native() const { return __storage_; } +}; +// __m128 mask storage for NEON +template <> +class __simd_mask_storage : public __simd_storage { +public: + inline bool __get(size_t __index) const noexcept { + return vreinterpretq_u32_f32(__storage_)[__index] != 0; + } + inline void __set(size_t __index, bool __val) noexcept { + uint32x4_t data = vreinterpretq_u32_f32(__storage_); + data[__index] = __val ? 
UINT32_MAX : 0; + __storage_ = vreinterpretq_f32_u32(data); + } +}; + +template <> +inline simd simd::operator-() const { + return vreinterpretq_f32_s32( + veorq_s32(vreinterpretq_s32_f32(__s_.__storage_), vreinterpretq_s32_f32(vdupq_n_f32(-0.f)))); +} + +inline simd operator+(const simd& a, const simd& b) { + return vaddq_f32(a.__s_.__storage_, b.__s_.__storage_); +} + +inline simd operator-(const simd& a, const simd& b) { + return vsubq_f32(a.__s_.__storage_, b.__s_.__storage_); +} + +inline simd operator*(const simd& a, const simd& b) { + return vmulq_f32(a.__s_.__storage_, b.__s_.__storage_); +} + +inline simd operator/(const simd& a, const simd& b) { + return vdivq_f32(a.__s_.__storage_, b.__s_.__storage_); +} + +inline simd& operator+=(simd& a, const simd& b) { + a.__s_.__storage_ += b.__s_.__storage_; + return a; +} + +inline simd& operator-=(simd& a, const simd& b) { + a.__s_.__storage_ -= b.__s_.__storage_; + return a; +} + +inline simd& operator*=(simd& a, const simd& b) { + a.__s_.__storage_ *= b.__s_.__storage_; + return a; +} + +inline simd& operator/=(simd& a, const simd& b) { + a.__s_.__storage_ /= b.__s_.__storage_; + return a; +} + +inline simd::mask_type operator==(const simd& a, const simd& b) { + simd::mask_type ret; + ret.__s_.__storage_ = vreinterpretq_f32_u32(vceqq_f32(a.__s_.__storage_, b.__s_.__storage_)); + return ret; +} + +inline simd::mask_type operator!=(const simd& a, const simd& b) { + simd::mask_type ret; + ret.__s_.__storage_ = vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(a.__s_.__storage_, b.__s_.__storage_))); + return ret; +} + +inline simd::mask_type operator>=(const simd& a, const simd& b) { + simd::mask_type ret; + ret.__s_.__storage_ = vreinterpretq_f32_u32(vcgeq_f32(a.__s_.__storage_, b.__s_.__storage_)); + return ret; +} + +inline simd::mask_type operator<=(const simd& a, const simd& b) { + simd::mask_type ret; + ret.__s_.__storage_ = vreinterpretq_f32_u32(vcleq_f32(a.__s_.__storage_, b.__s_.__storage_)); + return ret; +} + +inline simd::mask_type operator>(const simd& a, const simd& b) { + simd::mask_type ret; + ret.__s_.__storage_ = vreinterpretq_f32_u32(vcgtq_f32(a.__s_.__storage_, b.__s_.__storage_)); + return ret; +} + +inline simd::mask_type operator<(const simd& a, const simd& b) { + simd::mask_type ret; + ret.__s_.__storage_ = vreinterpretq_f32_u32(vcltq_f32(a.__s_.__storage_, b.__s_.__storage_)); + return ret; +} + +// __m128d storage for NEON +template <> +class __simd_storage { +public: + using storage_type = float64x2x2_t; + using vector_type = float64x2_t; + storage_type __storage_{}; + [[nodiscard]] inline double __get(size_t __index) const noexcept { return __storage_.val[__index / 2][__index % 2]; } + inline void __set(size_t __index, double __val) noexcept { __storage_.val[__index / 2][__index % 2] = __val; } + // Make GCC happy + static constexpr storage_type __make_array(vector_type a, vector_type b) { return {a, b}; } + constexpr __simd_storage(double a, double b, double c, double d) + : __storage_(__make_array(vector_type{a, b}, vector_type{c, d})) {} + constexpr void __set4(double a, double b, double c, double d) noexcept { + __storage_.val[0] = vector_type{a, b}; + __storage_.val[1] = vector_type{c, d}; + } + constexpr __simd_storage(double rv) : __simd_storage(rv, rv, rv, rv) {} + constexpr void __broadcast(double __val) noexcept { __set4(__val, __val, __val, __val); } + [[nodiscard]] inline double __dot2(const __simd_storage& other) const noexcept { + return vaddvq_f64(vmulq_f64(__storage_.val[0], other.__storage_.val[0])); + 
} + [[nodiscard]] inline double __dot3(const __simd_storage& other) const noexcept { + const vector_type mul1 = vmulq_f64(__storage_.val[0], other.__storage_.val[0]); + const vector_type mul2 = vmulq_f64(__storage_.val[1], other.__storage_.val[1]); + return vaddvq_f64(vcombine_f64(vcreate_f64(vaddvq_f64(mul1)), vget_low_f64(mul2))); + } + [[nodiscard]] inline double __dot4(const __simd_storage& other) const noexcept { + const vector_type mul1 = vmulq_f64(__storage_.val[0], other.__storage_.val[0]); + const vector_type mul2 = vmulq_f64(__storage_.val[1], other.__storage_.val[1]); + return vaddvq_f64(vaddq_f64(mul1, mul2)); + } + + inline void __copy_from(const simd_data>& __buffer) noexcept { + __storage_ = vld2q_f64(__buffer.data()); + } + + inline void __copy_to(simd_data>& __buffer) const noexcept { + vst2q_f64(__buffer.data(), __storage_); + } + + constexpr __simd_storage() = default; + explicit inline __simd_storage(const __simd_storage& other) { + __storage_.val[0] = vcvt_f64_f32(vget_low_f32(other.__storage_)); + __storage_.val[1] = vcvt_f64_f32(vget_high_f32(other.__storage_)); + } + + constexpr explicit __simd_storage(const storage_type& s) : __storage_(s) {} + [[nodiscard]] constexpr const storage_type& __native() const { return __storage_; } +}; +// __m128d mask storage for NEON +template <> +class __simd_mask_storage : public __simd_storage { +public: + inline bool __get(size_t __index) const noexcept { + return vreinterpretq_u64_f64(__storage_.val[__index / 2])[__index % 2] != 0; + } + inline void __set(size_t __index, bool __val) noexcept { + uint64x2_t vec = vreinterpretq_u64_f64(__storage_.val[__index / 2]); + vec[__index % 2] = __val ? UINT64_MAX : 0; + __storage_.val[__index / 2] = vreinterpretq_f64_u64(vec); + } +}; + +template <> +inline simd simd::operator-() const { + simd ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_.val[i] = vreinterpretq_f64_s64( + veorq_s64(vreinterpretq_s64_f64(__s_.__storage_.val[i]), vreinterpretq_s64_f64(vdupq_n_f64(-0.0)))); + return ret; +} + +inline simd operator+(const simd& a, const simd& b) { + simd ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_.val[i] = vaddq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]); + return ret; +} + +inline simd operator-(const simd& a, const simd& b) { + simd ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_.val[i] = vsubq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]); + return ret; +} + +inline simd operator*(const simd& a, const simd& b) { + simd ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_.val[i] = vmulq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]); + return ret; +} + +inline simd operator/(const simd& a, const simd& b) { + simd ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_.val[i] = vdivq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]); + return ret; +} + +inline simd& operator+=(simd& a, const simd& b) { + for (int i = 0; i < 2; ++i) + a.__s_.__storage_.val[i] = vaddq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]); + return a; +} + +inline simd& operator-=(simd& a, const simd& b) { + for (int i = 0; i < 2; ++i) + a.__s_.__storage_.val[i] = vsubq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]); + return a; +} + +inline simd& operator*=(simd& a, const simd& b) { + for (int i = 0; i < 2; ++i) + a.__s_.__storage_.val[i] = vmulq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]); + return a; +} + +inline simd& operator/=(simd& a, const simd& b) { + for (int i = 0; i < 2; ++i) + a.__s_.__storage_.val[i] 
= vdivq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]); + return a; +} + +inline simd::mask_type operator==(const simd& a, + const simd& b) { + simd::mask_type ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vceqq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i])); + return ret; +} + +inline simd::mask_type operator!=(const simd& a, + const simd& b) { + simd::mask_type ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vreinterpretq_u64_u32( + vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i]))))); + return ret; +} + +inline simd::mask_type operator>=(const simd& a, + const simd& b) { + simd::mask_type ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vcgeq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i])); + return ret; +} + +inline simd::mask_type operator<=(const simd& a, + const simd& b) { + simd::mask_type ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vcleq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i])); + return ret; +} + +inline simd::mask_type operator>(const simd& a, + const simd& b) { + simd::mask_type ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vcgtq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i])); + return ret; +} + +inline simd::mask_type operator<(const simd& a, + const simd& b) { + simd::mask_type ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_.val[i] = vreinterpretq_f64_u64(vcltq_f64(a.__s_.__storage_.val[i], b.__s_.__storage_.val[i])); + return ret; +} + +inline __simd_storage::__simd_storage(const __simd_storage& other) { + __storage_ = vcombine_f32(vcvt_f32_f64(other.__storage_.val[0]), vcvt_f32_f64(other.__storage_.val[1])); +} + +namespace simd_abi { +template +struct athena_native {}; +template <> +struct athena_native { + using type = m128_abi; +}; +template <> +struct athena_native { + using type = m128d_abi; +}; +} // namespace simd_abi + +} // namespace athena::_simd diff --git a/include/athena/simd/simd_sse.hpp b/include/athena/simd/simd_sse.hpp index 7a17125..604dbc8 100644 --- a/include/athena/simd/simd_sse.hpp +++ b/include/athena/simd/simd_sse.hpp @@ -29,23 +29,14 @@ template <> class __simd_storage { public: using storage_type = __m128; - storage_type __storage_; - float __get(size_t __index) const noexcept { - alignas(16) std::array sse_data; - _mm_store_ps(sse_data.data(), __storage_); - return sse_data[__index]; - } - void __set(size_t __index, float __val) noexcept { - alignas(16) std::array sse_data; - _mm_store_ps(sse_data.data(), __storage_); - sse_data[__index] = __val; - __storage_ = _mm_load_ps(sse_data.data()); - } + storage_type __storage_{}; + [[nodiscard]] inline float __get(size_t __index) const noexcept { return __storage_[__index]; } + inline void __set(size_t __index, float __val) noexcept { __storage_[__index] = __val; } constexpr __simd_storage(float a, float b, float c, float d) : __storage_{a, b, c, d} {} - void __set4(float a, float b, float c, float d) noexcept { __storage_ = _mm_set_ps(d, c, b, a); } - constexpr __simd_storage(float rv) : __storage_{rv, rv, rv, rv} {} - void __broadcast(float __val) noexcept { __storage_ = _mm_set1_ps(__val); } - float __dot2(const __simd_storage& other) const noexcept { + constexpr void __set4(float a, float b, float c, float d) noexcept { __storage_ = storage_type{a, b, c, d}; } + constexpr explicit 
__simd_storage(float rv) : __storage_{rv, rv, rv, rv} {} + inline void __broadcast(float __val) noexcept { __storage_ = _mm_set1_ps(__val); } + [[nodiscard]] inline float __dot2(const __simd_storage& other) const noexcept { #if __SSE4_1__ float ret; _mm_store_ss(&ret, _mm_dp_ps(__storage_, other.__storage_, 0x3F)); @@ -56,7 +47,7 @@ public: return sse_data[0] + sse_data[1]; #endif } - float __dot3(const __simd_storage& other) const noexcept { + [[nodiscard]] inline float __dot3(const __simd_storage& other) const noexcept { #if __SSE4_1__ float ret; _mm_store_ss(&ret, _mm_dp_ps(__storage_, other.__storage_, 0x7F)); @@ -67,7 +58,7 @@ public: return sse_data[0] + sse_data[1] + sse_data[2]; #endif } - float __dot4(const __simd_storage& other) const noexcept { + [[nodiscard]] inline float __dot4(const __simd_storage& other) const noexcept { #if __SSE4_1__ float ret; _mm_store_ss(&ret, _mm_dp_ps(__storage_, other.__storage_, 0xFF)); @@ -78,40 +69,39 @@ public: return sse_data[0] + sse_data[1] + sse_data[2] + sse_data[3]; #endif } + template - __simd_storage __shuffle() const noexcept { - __simd_storage s; - s.__storage_ = _mm_shuffle_ps(__storage_, __storage_, _MM_SHUFFLE(w, z, y, x)); - return s; + [[nodiscard]] constexpr __simd_storage __shuffle() const noexcept { + return __simd_storage(_mm_shuffle_ps(__storage_, __storage_, _MM_SHUFFLE(w, z, y, x))); } - void __copy_from(const simd_data>& __buffer) noexcept { + inline void __copy_from(const simd_data>& __buffer) noexcept { __storage_ = _mm_load_ps(__buffer.data()); } - void __copy_to(simd_data>& __buffer) const noexcept { + inline void __copy_to(simd_data>& __buffer) const noexcept { _mm_store_ps(__buffer.data(), __storage_); } __simd_storage() = default; - explicit __simd_storage(const __simd_storage& other); + explicit inline __simd_storage(const __simd_storage& other); #ifdef __AVX__ - explicit __simd_storage(const __simd_storage& other); + explicit inline __simd_storage(const __simd_storage& other); #endif - explicit __simd_storage(const storage_type& s) : __storage_(s) {} - const storage_type& __native() const { return __storage_; } + constexpr explicit __simd_storage(const storage_type& s) : __storage_(s) {} + [[nodiscard]] constexpr const storage_type& __native() const { return __storage_; } }; // __m128 mask storage for SSE2+ template <> class __simd_mask_storage : public __simd_storage { public: - bool __get(size_t __index) const noexcept { + [[nodiscard]] inline bool __get(size_t __index) const noexcept { alignas(16) uint32_t sse_data[4]; _mm_store_ps(reinterpret_cast(sse_data), __storage_); return sse_data[__index] != 0; } - void __set(size_t __index, bool __val) noexcept { + inline void __set(size_t __index, bool __val) noexcept { alignas(16) uint32_t sse_data[4]; _mm_store_ps(reinterpret_cast(sse_data), __storage_); sse_data[__index] = __val ? 
UINT32_MAX : 0; @@ -125,27 +115,19 @@ inline simd simd::operator-() const { } inline simd operator+(const simd& a, const simd& b) { - simd ret; - ret.__s_.__storage_ = _mm_add_ps(a.__s_.__storage_, b.__s_.__storage_); - return ret; + return _mm_add_ps(a.__s_.__storage_, b.__s_.__storage_); } inline simd operator-(const simd& a, const simd& b) { - simd ret; - ret.__s_.__storage_ = _mm_sub_ps(a.__s_.__storage_, b.__s_.__storage_); - return ret; + return _mm_sub_ps(a.__s_.__storage_, b.__s_.__storage_); } inline simd operator*(const simd& a, const simd& b) { - simd ret; - ret.__s_.__storage_ = _mm_mul_ps(a.__s_.__storage_, b.__s_.__storage_); - return ret; + return _mm_mul_ps(a.__s_.__storage_, b.__s_.__storage_); } inline simd operator/(const simd& a, const simd& b) { - simd ret; - ret.__s_.__storage_ = _mm_div_ps(a.__s_.__storage_, b.__s_.__storage_); - return ret; + return _mm_div_ps(a.__s_.__storage_, b.__s_.__storage_); } inline simd& operator+=(simd& a, const simd& b) { @@ -209,32 +191,19 @@ template <> class __simd_storage { public: using storage_type = std::array<__m128d, 2>; - storage_type __storage_; - double __get(size_t __index) const noexcept { - alignas(16) std::array sse_data; - _mm_store_pd(sse_data.data(), __storage_[__index / 2]); - return sse_data[__index % 2]; - } - void __set(size_t __index, double __val) noexcept { - alignas(16) std::array sse_data; - _mm_store_pd(sse_data.data(), __storage_[__index / 2]); - sse_data[__index % 2] = __val; - __storage_[__index / 2] = _mm_load_pd(sse_data.data()); - } + storage_type __storage_{}; + [[nodiscard]] inline double __get(size_t __index) const noexcept { return __storage_[__index / 2][__index % 2]; } + inline void __set(size_t __index, double __val) noexcept { __storage_[__index / 2][__index % 2] = __val; } + // Make GCC happy static constexpr storage_type __make_array(__m128d a, __m128d b) { return {a, b}; } - constexpr __simd_storage(double a, double b, double c, double d) - : __storage_(__make_array(__m128d{a, b}, __m128d{c, d})) {} - void __set4(double a, double b, double c, double d) noexcept { - __storage_[0] = _mm_set_pd(b, a); - __storage_[1] = _mm_set_pd(d, c); + constexpr __simd_storage(double a, double b, double c, double d) : __storage_(__make_array(__m128d{a, b}, __m128d{c, d})) {} + constexpr void __set4(double a, double b, double c, double d) noexcept { + __storage_[0] = __m128d{a, b}; + __storage_[1] = __m128d{c, d}; } - constexpr __simd_storage(double rv) - : __storage_(__make_array(__m128d{rv, rv}, __m128d{rv, rv})) {} - void __broadcast(double __val) noexcept { - for (int i = 0; i < 2; ++i) - __storage_[i] = _mm_set1_pd(__val); - } - double __dot2(const __simd_storage& other) const noexcept { + constexpr __simd_storage(double rv) : __simd_storage(rv, rv, rv, rv) {} + constexpr void __broadcast(double __val) noexcept { __set4(__val, __val, __val, __val); } + [[nodiscard]] inline double __dot2(const __simd_storage& other) const noexcept { #if __SSE4_1__ double ret; _mm_store_sd(&ret, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F)); @@ -245,7 +214,7 @@ public: return sse_data[0] + sse_data[1]; #endif } - double __dot3(const __simd_storage& other) const noexcept { + [[nodiscard]] inline double __dot3(const __simd_storage& other) const noexcept { #if __SSE4_1__ double ret; _mm_store_sd(&ret, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F)); @@ -260,7 +229,7 @@ public: return sse_data[0] + sse_data[1] + sse_data2[0]; #endif } - double __dot4(const __simd_storage& other) const noexcept { + [[nodiscard]] 
inline double __dot4(const __simd_storage& other) const noexcept {
 #if __SSE4_1__
     double ret;
     _mm_store_sd(&ret, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F));
@@ -276,24 +245,24 @@ public:
 #endif
   }
 
-  void __copy_from(const simd_data<simd<double, m128d_abi>>& __buffer) noexcept {
+  inline void __copy_from(const simd_data<simd<double, m128d_abi>>& __buffer) noexcept {
     __storage_[0] = _mm_load_pd(__buffer.data());
     __storage_[1] = _mm_load_pd(__buffer.data() + 2);
   }
 
-  void __copy_to(simd_data<simd<double, m128d_abi>>& __buffer) const noexcept {
+  inline void __copy_to(simd_data<simd<double, m128d_abi>>& __buffer) const noexcept {
     _mm_store_pd(__buffer.data(), __storage_[0]);
     _mm_store_pd(__buffer.data() + 2, __storage_[1]);
   }
 
-  __simd_storage() = default;
-  explicit __simd_storage(const __simd_storage<float, m128_abi>& other) {
+  constexpr __simd_storage() = default;
+  inline explicit __simd_storage(const __simd_storage<float, m128_abi>& other) {
     __storage_[0] = _mm_cvtps_pd(other.__storage_);
     __storage_[1] = _mm_cvtps_pd(_mm_movehl_ps(other.__storage_, other.__storage_));
   }
 
-  explicit __simd_storage(const storage_type& s) : __storage_(s) {}
-  const storage_type& __native() const { return __storage_; }
+  constexpr explicit __simd_storage(const storage_type& s) : __storage_(s) {}
+  [[nodiscard]] constexpr const storage_type& __native() const { return __storage_; }
 };
 // __m128d mask storage for SSE2+
 template <>
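
For reference, a minimal standalone sketch (not part of the patch) of how the NEON dot-product members added in simd_neon.hpp reduce to scalar arithmetic. It assumes an AArch64 toolchain with <arm_neon.h>; the free functions dot2/dot3/dot4 below are illustrative stand-ins for the __dot2/__dot3/__dot4 members and use the same intrinsic sequence (multiply lanes, optionally mask lane 3 or drop the high half, then horizontally add).

    // dot_neon_sketch.cpp -- illustrative only, not part of this patch.
    #include <arm_neon.h>
    #include <cassert>

    // 4-lane dot product: multiply lanewise, then horizontally add all four lanes.
    static float dot4(float32x4_t a, float32x4_t b) {
      return vaddvq_f32(vmulq_f32(a, b));
    }

    // 3-lane dot product: zero lane 3 of the product before the horizontal add,
    // mirroring the vsetq_lane_f32(0.f, ..., 3) trick used by __dot3 above.
    static float dot3(float32x4_t a, float32x4_t b) {
      return vaddvq_f32(vsetq_lane_f32(0.f, vmulq_f32(a, b), 3));
    }

    // 2-lane dot product: keep only the low 64-bit half, then add its two lanes.
    static float dot2(float32x4_t a, float32x4_t b) {
      return vaddv_f32(vget_low_f32(vmulq_f32(a, b)));
    }

    int main() {
      const float av[4] = {1.f, 2.f, 3.f, 4.f};
      const float bv[4] = {5.f, 6.f, 7.f, 8.f};
      float32x4_t a = vld1q_f32(av);
      float32x4_t b = vld1q_f32(bv);
      assert(dot4(a, b) == 1*5 + 2*6 + 3*7 + 4*8); // 70
      assert(dot3(a, b) == 1*5 + 2*6 + 3*7);       // 38
      assert(dot2(a, b) == 1*5 + 2*6);             // 17
      return 0;
    }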
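A second sketch shows the widened simd<> interface this patch exposes (set(), broadcast(), dot3(), the element-wise operators), assuming simd.hpp has selected the NEON or SSE backend so that m128_abi exists; the vec4f alias and the example() function are local to this sketch, not names introduced by the patch.

    // simd_usage_sketch.cpp -- illustrative only; assumes a NEON or SSE build.
    #include "athena/simd/simd.hpp"

    using vec4f = athena::_simd::simd<float, athena::_simd::m128_abi>;

    float example() {
      vec4f a;
      a.set(1.f, 2.f, 3.f, 0.f); // new set() forwards to the storage's __set4()
      vec4f b;
      b.broadcast(2.f);          // new broadcast() splats one value to all lanes
      vec4f sum = a + b;         // element-wise add via vaddq_f32 / _mm_add_ps
      return sum.dot3(b);        // 3-component dot: (3 + 4 + 5) * 2 = 24
    }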