From 3c4bcf37d216f5ece205db7c9f53329a259d00da Mon Sep 17 00:00:00 2001
From: Luke Street
Date: Sat, 9 Jan 2021 14:27:27 -0500
Subject: [PATCH] Update simd_neon

- Fixes m128d dot3
- Simplifies negate operations
---
 include/zeus/simd/simd_neon.hpp | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/include/zeus/simd/simd_neon.hpp b/include/zeus/simd/simd_neon.hpp
index 9640847..3cb31fa 100644
--- a/include/zeus/simd/simd_neon.hpp
+++ b/include/zeus/simd/simd_neon.hpp
@@ -18,7 +18,7 @@ class __simd_storage<float, m128_abi> {
 public:
   using storage_type = float32x4_t;
   storage_type __storage_{};
-  [[nodiscard]] constexpr float __get(size_t __index) const noexcept { return __storage_[__index]; }
+  [[nodiscard]] float __get(size_t __index) const noexcept { return __storage_[__index]; }
   inline void __set(size_t __index, float __val) noexcept { __storage_[__index] = __val; }
   constexpr __simd_storage(float a, float b, float c, float d) : __storage_{a, b, c, d} {}
   constexpr void __set4(float a, float b, float c, float d) noexcept { __storage_ = storage_type{a, b, c, d}; }
@@ -35,12 +35,7 @@ public:
   }
   template <int x, int y, int z, int w>
   [[nodiscard]] inline __simd_storage __shuffle() const noexcept {
-    storage_type ret;
-    ret = vmovq_n_f32(vgetq_lane_f32(__storage_, x));
-    ret = vsetq_lane_f32(vgetq_lane_f32(__storage_, y), ret, 1);
-    ret = vsetq_lane_f32(vgetq_lane_f32(__storage_, z), ret, 2);
-    ret = vsetq_lane_f32(vgetq_lane_f32(__storage_, w), ret, 3);
-    return __simd_storage(ret);
+    return __simd_storage{__storage_[x], __storage_[y], __storage_[z], __storage_[w]};
   }
 
   inline void __copy_from(const simd_data<simd<float, m128_abi>>& __buffer) noexcept {
@@ -71,8 +66,7 @@ public:
 
 template <>
 inline simd<float, m128_abi> simd<float, m128_abi>::operator-() const {
-  return vreinterpretq_f32_s32(
-      veorq_s32(vreinterpretq_s32_f32(__s_.__storage_), vreinterpretq_s32_f32(vdupq_n_f32(-0.f))));
+  return vnegq_f32(__s_.__storage_);
 }
 
 inline simd<float, m128_abi> operator+(const simd<float, m128_abi>& a, const simd<float, m128_abi>& b) {
@@ -172,7 +166,7 @@ public:
   [[nodiscard]] inline double __dot3(const __simd_storage& other) const noexcept {
     const vector_type mul1 = vmulq_f64(__storage_.val[0], other.__storage_.val[0]);
     const vector_type mul2 = vmulq_f64(__storage_.val[1], other.__storage_.val[1]);
-    return vaddvq_f64(vcombine_f64(vcreate_f64(vaddvq_f64(mul1)), vget_low_f64(mul2)));
+    return vaddvq_f64(vcombine_f64(vdup_n_f64(vaddvq_f64(mul1)), vget_low_f64(mul2)));
   }
   [[nodiscard]] inline double __dot4(const __simd_storage& other) const noexcept {
     const vector_type mul1 = vmulq_f64(__storage_.val[0], other.__storage_.val[0]);
@@ -215,8 +209,7 @@ template <>
 inline simd<double, m128d_abi> simd<double, m128d_abi>::operator-() const {
   simd<double, m128d_abi> ret;
   for (int i = 0; i < 2; ++i)
-    ret.__s_.__storage_.val[i] = vreinterpretq_f64_s64(
-        veorq_s64(vreinterpretq_s64_f64(__s_.__storage_.val[i]), vreinterpretq_s64_f64(vdupq_n_f64(-0.0))));
+    ret.__s_.__storage_.val[i] = vnegq_f64(__s_.__storage_.val[i]);
   return ret;
 }
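
Not part of the patch: a minimal standalone sketch of why the __dot3 hunk matters, assuming an AArch64 toolchain with <arm_neon.h>. vcreate_f64 builds a float64x1_t from a raw 64-bit integer bit pattern, so passing it the double returned by vaddvq_f64 converts the partial sum to an integer and then reinterprets those bits, whereas vdup_n_f64 broadcasts the double value unchanged. The negate hunks are a simpler swap of the XOR-with-sign-bit idiom for the dedicated vnegq_f32/vnegq_f64 instructions. The helper name dot3_neon and the test values below are illustrative only.

// dot3_check.cpp - build with e.g. g++ -std=c++17 dot3_check.cpp (AArch64 only)
#include <arm_neon.h>
#include <cstdio>

// Mirrors the patched __dot3: a 3-component double dot product stored as
// two float64x2_t halves, {a0, a1} and {a2, <unused>}.
static double dot3_neon(const double* a, const double* b) {
  const float64x2_t a01 = vld1q_f64(a);      // {a0, a1}
  const float64x2_t a2x = vld1q_f64(a + 2);  // {a2, a3}
  const float64x2_t b01 = vld1q_f64(b);
  const float64x2_t b2x = vld1q_f64(b + 2);
  const float64x2_t mul1 = vmulq_f64(a01, b01);  // {a0*b0, a1*b1}
  const float64x2_t mul2 = vmulq_f64(a2x, b2x);  // {a2*b2, a3*b3}; lane 1 is dropped below
  // vdup_n_f64 keeps the partial sum as a double; the removed vcreate_f64 call
  // treated that double as a raw integer bit pattern instead.
  return vaddvq_f64(vcombine_f64(vdup_n_f64(vaddvq_f64(mul1)), vget_low_f64(mul2)));
}

int main() {
  const double a[4] = {1.0, 2.0, 3.0, 99.0};  // fourth element is padding
  const double b[4] = {4.0, 5.0, 6.0, 99.0};
  const double expected = a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
  std::printf("neon: %f  scalar: %f\n", dot3_neon(a, b), expected);  // both print 32.000000
}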