#pragma once
#ifndef _ZEUS_SIMD_INCLUDED
#error simd_avx.hpp must not be included directly. Include simd.hpp instead.
#endif
#include "simd_sse.hpp"
#include <immintrin.h>

namespace zeus::_simd {

// __m256d storage for AVX
template <>
class __simd_storage<double, m256d_abi> {
public:
  using storage_type = __m256d;
  storage_type __storage_;

  double __get(size_t __index) const noexcept {
    alignas(32) std::array<double, 4> sse_data;
    _mm256_store_pd(sse_data.data(), __storage_);
    return sse_data[__index];
  }
  void __set(size_t __index, double __val) noexcept {
    alignas(32) std::array<double, 4> sse_data;
    _mm256_store_pd(sse_data.data(), __storage_);
    sse_data[__index] = __val;
    __storage_ = _mm256_load_pd(sse_data.data());
  }
  void __set4(double a, double b, double c, double d) noexcept {
    // _mm256_set_pd orders its arguments from the highest lane to the lowest,
    // so passing (d, c, b, a) places a..d in lanes 0..3.
    __storage_ = _mm256_set_pd(d, c, b, a);
  }
  void __broadcast(double __val) noexcept { __storage_ = _mm256_set1_pd(__val); }
  // The __dotN helpers multiply lane-wise, spill to the stack, and sum the
  // first N lanes.
  double __dot2(const __simd_storage& other) const noexcept {
    alignas(32) std::array<double, 4> sse_data;
    _mm256_store_pd(sse_data.data(), _mm256_mul_pd(__storage_, other.__storage_));
    return sse_data[0] + sse_data[1];
  }
  double __dot3(const __simd_storage& other) const noexcept {
    alignas(32) std::array<double, 4> sse_data;
    _mm256_store_pd(sse_data.data(), _mm256_mul_pd(__storage_, other.__storage_));
    return sse_data[0] + sse_data[1] + sse_data[2];
  }
  double __dot4(const __simd_storage& other) const noexcept {
    alignas(32) std::array<double, 4> sse_data;
    _mm256_store_pd(sse_data.data(), _mm256_mul_pd(__storage_, other.__storage_));
    return sse_data[0] + sse_data[1] + sse_data[2] + sse_data[3];
  }
  void __copy_from(const simd_data<simd<double, m256d_abi>>& __buffer) noexcept {
    __storage_ = _mm256_load_pd(__buffer.data());
  }
  void __copy_to(simd_data<simd<double, m256d_abi>>& __buffer) const noexcept {
    _mm256_store_pd(__buffer.data(), __storage_);
  }

  __simd_storage() = default;
  // Widening conversion from the SSE float storage; _mm256_cvtps_pd takes a
  // __m128 of four floats and returns four doubles.
  explicit __simd_storage(const __simd_storage<float, m128_abi>& other) {
    __storage_ = _mm256_cvtps_pd(other.__storage_);
  }
  explicit __simd_storage(const storage_type& s) : __storage_(s) {}
  const storage_type& __native() const { return __storage_; }
};

// __m256d mask storage for AVX
template <>
class __simd_mask_storage<m256d_abi> : public __simd_storage<double, m256d_abi> {
public:
  bool __get(size_t __index) const noexcept {
    alignas(32) uint64_t sse_data[4];
    _mm256_store_pd(reinterpret_cast<double*>(sse_data), __storage_);
    return sse_data[__index] != 0;
  }
  void __set(size_t __index, bool __val) noexcept {
    alignas(32) uint64_t sse_data[4];
    _mm256_store_pd(reinterpret_cast<double*>(sse_data), __storage_);
    sse_data[__index] = __val ? UINT64_MAX : 0;
    __storage_ = _mm256_load_pd(reinterpret_cast<double*>(sse_data));
  }
};
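// Illustrative note (inferred from the __get/__set bit patterns above and
// from how _mm256_cmp_pd reports results): each mask lane is a full 64-bit
// pattern, all-ones for true and all-zeros for false. For example, after
// `__set(2, true)` on a zeroed mask, the raw lanes read back as
// {0, 0, UINT64_MAX, 0}.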
template <>
inline simd<double, m256d_abi> simd<double, m256d_abi>::operator-() const {
  // Flip only the sign bit by XORing every lane with -0.0.
  return _mm256_xor_pd(__s_.__storage_, _mm256_set1_pd(-0.0));
}

inline simd<double, m256d_abi> operator+(const simd<double, m256d_abi>& a,
                                         const simd<double, m256d_abi>& b) {
  simd<double, m256d_abi> ret;
  ret.__s_.__storage_ = _mm256_add_pd(a.__s_.__storage_, b.__s_.__storage_);
  return ret;
}

inline simd<double, m256d_abi> operator-(const simd<double, m256d_abi>& a,
                                         const simd<double, m256d_abi>& b) {
  simd<double, m256d_abi> ret;
  ret.__s_.__storage_ = _mm256_sub_pd(a.__s_.__storage_, b.__s_.__storage_);
  return ret;
}

inline simd<double, m256d_abi> operator*(const simd<double, m256d_abi>& a,
                                         const simd<double, m256d_abi>& b) {
  simd<double, m256d_abi> ret;
  ret.__s_.__storage_ = _mm256_mul_pd(a.__s_.__storage_, b.__s_.__storage_);
  return ret;
}

inline simd<double, m256d_abi> operator/(const simd<double, m256d_abi>& a,
                                         const simd<double, m256d_abi>& b) {
  simd<double, m256d_abi> ret;
  ret.__s_.__storage_ = _mm256_div_pd(a.__s_.__storage_, b.__s_.__storage_);
  return ret;
}

inline simd<double, m256d_abi>& operator+=(simd<double, m256d_abi>& a,
                                           const simd<double, m256d_abi>& b) {
  a.__s_.__storage_ = _mm256_add_pd(a.__s_.__storage_, b.__s_.__storage_);
  return a;
}

inline simd<double, m256d_abi>& operator-=(simd<double, m256d_abi>& a,
                                           const simd<double, m256d_abi>& b) {
  a.__s_.__storage_ = _mm256_sub_pd(a.__s_.__storage_, b.__s_.__storage_);
  return a;
}

inline simd<double, m256d_abi>& operator*=(simd<double, m256d_abi>& a,
                                           const simd<double, m256d_abi>& b) {
  a.__s_.__storage_ = _mm256_mul_pd(a.__s_.__storage_, b.__s_.__storage_);
  return a;
}

inline simd<double, m256d_abi>& operator/=(simd<double, m256d_abi>& a,
                                           const simd<double, m256d_abi>& b) {
  a.__s_.__storage_ = _mm256_div_pd(a.__s_.__storage_, b.__s_.__storage_);
  return a;
}

// Quiet (non-signaling) comparisons; each result lane is all-ones or all-zeros.
inline simd<double, m256d_abi>::mask_type operator==(const simd<double, m256d_abi>& a,
                                                     const simd<double, m256d_abi>& b) {
  simd<double, m256d_abi>::mask_type ret;
  ret.__s_.__storage_ = _mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_EQ_OQ);
  return ret;
}

inline simd<double, m256d_abi>::mask_type operator!=(const simd<double, m256d_abi>& a,
                                                     const simd<double, m256d_abi>& b) {
  simd<double, m256d_abi>::mask_type ret;
  ret.__s_.__storage_ = _mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_NEQ_OQ);
  return ret;
}

inline simd<double, m256d_abi>::mask_type operator>=(const simd<double, m256d_abi>& a,
                                                     const simd<double, m256d_abi>& b) {
  simd<double, m256d_abi>::mask_type ret;
  ret.__s_.__storage_ = _mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_GE_OQ);
  return ret;
}

inline simd<double, m256d_abi>::mask_type operator<=(const simd<double, m256d_abi>& a,
                                                     const simd<double, m256d_abi>& b) {
  simd<double, m256d_abi>::mask_type ret;
  ret.__s_.__storage_ = _mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_LE_OQ);
  return ret;
}

inline simd<double, m256d_abi>::mask_type operator>(const simd<double, m256d_abi>& a,
                                                    const simd<double, m256d_abi>& b) {
  simd<double, m256d_abi>::mask_type ret;
  ret.__s_.__storage_ = _mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_GT_OQ);
  return ret;
}

inline simd<double, m256d_abi>::mask_type operator<(const simd<double, m256d_abi>& a,
                                                    const simd<double, m256d_abi>& b) {
  simd<double, m256d_abi>::mask_type ret;
  ret.__s_.__storage_ = _mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_LT_OQ);
  return ret;
}

// Narrowing conversion for the SSE float storage; _mm256_cvtpd_ps packs four
// doubles down to four floats.
inline __simd_storage<float, m128_abi>::__simd_storage(const __simd_storage<double, m256d_abi>& other) {
  __storage_ = _mm256_cvtpd_ps(other.__storage_);
}

namespace simd_abi {
template <>
struct zeus_native<double> {
  using type = m256d_abi;
};
} // namespace simd_abi

} // namespace zeus::_simd
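// Usage sketch (illustrative only; construction and load/store helpers come
// from the simd<T, Abi> interface in simd.hpp, which this header specializes):
//
//   using dsimd = zeus::_simd::simd<double, zeus::_simd::m256d_abi>;
//   dsimd a, b;
//   // ... initialize a and b through the simd.hpp interface ...
//   dsimd sum  = a + b;   // one _mm256_add_pd
//   dsimd prod = a * b;   // one _mm256_mul_pd
//   auto  mask = a < b;   // per-lane all-ones/all-zeros mask
//
// With zeus_native<double> mapping to m256d_abi above, code written against
// the native-width alias picks up these AVX overloads automatically.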