Update simd_neon

- Fixes m128d dot3
- Simplifies negate operations
Luke Street 2021-01-09 14:27:27 -05:00
parent 9ea070c2d7
commit 3c4bcf37d2
1 changed file with 5 additions and 12 deletions

@@ -18,7 +18,7 @@ class __simd_storage<float, m128_abi> {
 public:
   using storage_type = float32x4_t;
   storage_type __storage_{};
-  [[nodiscard]] constexpr float __get(size_t __index) const noexcept { return __storage_[__index]; }
+  [[nodiscard]] float __get(size_t __index) const noexcept { return __storage_[__index]; }
   inline void __set(size_t __index, float __val) noexcept { __storage_[__index] = __val; }
   constexpr __simd_storage(float a, float b, float c, float d) : __storage_{a, b, c, d} {}
   constexpr void __set4(float a, float b, float c, float d) noexcept { __storage_ = storage_type{a, b, c, d}; }
@@ -35,12 +35,7 @@
   }
   template <int x, int y, int z, int w>
   [[nodiscard]] inline __simd_storage __shuffle() const noexcept {
-    storage_type ret;
-    ret = vmovq_n_f32(vgetq_lane_f32(__storage_, x));
-    ret = vsetq_lane_f32(vgetq_lane_f32(__storage_, y), ret, 1);
-    ret = vsetq_lane_f32(vgetq_lane_f32(__storage_, z), ret, 2);
-    ret = vsetq_lane_f32(vgetq_lane_f32(__storage_, w), ret, 3);
-    return __simd_storage(ret);
+    return __simd_storage{__storage_[x], __storage_[y], __storage_[z], __storage_[w]};
   }
   inline void __copy_from(const simd_data<simd<float, m128_abi>>& __buffer) noexcept {
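
The simplified __shuffle in the hunk above leans on the GCC/Clang NEON vector extensions the header already uses elsewhere (lane reads via __storage_[__index] in __get, brace construction in __set4). A standalone sketch of the same idea, with hypothetical names and not taken from the library:

// Sketch only: assumes GCC/Clang NEON vector extensions, where operator[]
// reads a lane of float32x4_t and brace initialization builds a vector.
#include <arm_neon.h>
#include <cstdio>

template <int x, int y, int z, int w>
float32x4_t shuffle(float32x4_t v) {
  float32x4_t ret = {v[x], v[y], v[z], v[w]};
  return ret;
}

int main() {
  const float32x4_t v = {1.f, 2.f, 3.f, 4.f};
  float out[4];
  vst1q_f32(out, shuffle<3, 2, 1, 0>(v));
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 4 3 2 1
}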
@@ -71,8 +66,7 @@ public:
 template <>
 inline simd<float, m128_abi> simd<float, m128_abi>::operator-() const {
-  return vreinterpretq_f32_s32(
-      veorq_s32(vreinterpretq_s32_f32(__s_.__storage_), vreinterpretq_s32_f32(vdupq_n_f32(-0.f))));
+  return vnegq_f32(__s_.__storage_);
 }
 
 inline simd<float, m128_abi> operator+(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
@@ -172,7 +166,7 @@ public:
   [[nodiscard]] inline double __dot3(const __simd_storage<double, m128d_abi>& other) const noexcept {
     const vector_type mul1 = vmulq_f64(__storage_.val[0], other.__storage_.val[0]);
     const vector_type mul2 = vmulq_f64(__storage_.val[1], other.__storage_.val[1]);
-    return vaddvq_f64(vcombine_f64(vcreate_f64(vaddvq_f64(mul1)), vget_low_f64(mul2)));
+    return vaddvq_f64(vcombine_f64(vdup_n_f64(vaddvq_f64(mul1)), vget_low_f64(mul2)));
   }
   [[nodiscard]] inline double __dot4(const __simd_storage<double, m128d_abi>& other) const noexcept {
     const vector_type mul1 = vmulq_f64(__storage_.val[0], other.__storage_.val[0]);
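
Context for the __dot3 fix above: vcreate_f64 takes a raw uint64_t and reinterprets its bits as a double, so feeding it the double returned by vaddvq_f64 first converts the value to an integer and the partial sum is corrupted; vdup_n_f64 broadcasts the double's value, which is what the reduction needs. A standalone sketch of the corrected expression (hypothetical free function, not library code):

// Sketch: dot product over the first three of four doubles held in a
// float64x2x2_t (fourth lane ignored), using the corrected reduction.
#include <arm_neon.h>
#include <cstdio>

static double dot3(const float64x2x2_t& a, const float64x2x2_t& b) {
  const float64x2_t mul1 = vmulq_f64(a.val[0], b.val[0]); // {x*x', y*y'}
  const float64x2_t mul2 = vmulq_f64(a.val[1], b.val[1]); // {z*z', w*w'}
  // vdup_n_f64 keeps the x+y partial sum as a double value; vcreate_f64
  // would have treated it as an integer bit pattern instead.
  return vaddvq_f64(vcombine_f64(vdup_n_f64(vaddvq_f64(mul1)), vget_low_f64(mul2)));
}

int main() {
  const float64x2x2_t a = {{{1.0, 2.0}, {3.0, 0.0}}};
  const float64x2x2_t b = {{{4.0, 5.0}, {6.0, 0.0}}};
  std::printf("%f\n", dot3(a, b)); // 1*4 + 2*5 + 3*6 = 32
}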
@@ -215,8 +209,7 @@ template <>
 inline simd<double, m128d_abi> simd<double, m128d_abi>::operator-() const {
   simd<double, m128d_abi> ret;
   for (int i = 0; i < 2; ++i)
-    ret.__s_.__storage_.val[i] = vreinterpretq_f64_s64(
-        veorq_s64(vreinterpretq_s64_f64(__s_.__storage_.val[i]), vreinterpretq_s64_f64(vdupq_n_f64(-0.0))));
+    ret.__s_.__storage_.val[i] = vnegq_f64(__s_.__storage_.val[i]);
   return ret;
 }
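
The two operator- hunks replace the XOR-with-sign-bit formulation with the dedicated vnegq_f32/vnegq_f64 intrinsics, which map to a single fneg and flip only the sign bit, so the observable results are unchanged. A small standalone comparison (sketch only, not library code):

// Compares the removed XOR-based negation against vnegq_f32; both flip the
// sign bit of every lane, including the zero lanes.
#include <arm_neon.h>
#include <cstdio>

int main() {
  const float32x4_t v = {1.5f, -2.f, 0.f, -0.f};
  const float32x4_t via_xor = vreinterpretq_f32_s32(
      veorq_s32(vreinterpretq_s32_f32(v), vreinterpretq_s32_f32(vdupq_n_f32(-0.f))));
  const float32x4_t via_neg = vnegq_f32(v);
  float a[4], b[4];
  vst1q_f32(a, via_xor);
  vst1q_f32(b, via_neg);
  for (int i = 0; i < 4; ++i)
    std::printf("%g %g\n", a[i], b[i]); // identical pairs: -1.5, 2, -0, 0
}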