From e1b29fda7acf3a17a297a02a63a5f11e94eb2328 Mon Sep 17 00:00:00 2001 From: Jack Andersen Date: Fri, 7 Dec 2018 15:31:02 -1000 Subject: [PATCH] SIMD refactor --- include/athena/DNAOp.hpp | 4 +- include/athena/IStream.hpp | 11 +- include/athena/IStreamReader.hpp | 540 +++--- include/athena/IStreamWriter.hpp | 459 ++--- include/athena/Types.hpp | 107 +- include/athena/Utility.hpp | 8 +- include/athena/simd/parallelism_v2_simd.hpp | 1768 +++++++++++++++++++ include/athena/simd/simd.hpp | 26 + include/athena/simd/simd_avx.hpp | 188 ++ include/athena/simd/simd_sse.hpp | 455 +++++ src/athena/DNAYaml.cpp | 33 +- 11 files changed, 3003 insertions(+), 596 deletions(-) create mode 100644 include/athena/simd/parallelism_v2_simd.hpp create mode 100644 include/athena/simd/simd.hpp create mode 100644 include/athena/simd/simd_avx.hpp create mode 100644 include/athena/simd/simd_sse.hpp diff --git a/include/athena/DNAOp.hpp b/include/athena/DNAOp.hpp index d218f72..fe159f2 100644 --- a/include/athena/DNAOp.hpp +++ b/include/athena/DNAOp.hpp @@ -1013,7 +1013,7 @@ template static inline void __ReadProp(T& obj, athena::io::IStreamReader& r) { /* Read root 0xffffffff hash (hashed empty string) */ - atUint32 hash = T::DNAEndian == Endian::Big ? r.readUint32Big() : r.readUint32Little(); + T::DNAEndian == Endian::Big ? r.readUint32Big() : r.readUint32Little(); atInt64 size = T::DNAEndian == Endian::Big ? r.readUint16Big() : r.readUint16Little(); atInt64 start = r.position(); __Do, T, T::DNAEndian>({}, obj, r); @@ -1038,7 +1038,7 @@ template static inline void __ReadProp64(T& obj, athena::io::IStreamReader& r) { /* Read root 0x0 hash (hashed empty string) */ - atUint64 hash = T::DNAEndian == Endian::Big ? r.readUint64Big() : r.readUint64Little(); + T::DNAEndian == Endian::Big ? r.readUint64Big() : r.readUint64Little(); atInt64 size = T::DNAEndian == Endian::Big ? 
r.readUint16Big() : r.readUint16Little(); atInt64 start = r.position(); __Do, T, T::DNAEndian>({}, obj, r); diff --git a/include/athena/IStream.hpp b/include/athena/IStream.hpp index c902714..ffd356f 100644 --- a/include/athena/IStream.hpp +++ b/include/athena/IStream.hpp @@ -9,13 +9,12 @@ std::ostream& operator<<(std::ostream& os, Endian& endian); class IStream { public: - IStream() {} - virtual ~IStream() {} + virtual ~IStream() = default; - inline void setEndian(Endian endian) { m_endian = endian; } - inline Endian endian() const { return m_endian; } - inline bool isBigEndian() const { return (m_endian == Endian::Big); } - inline bool isLittleEndian() const { return (m_endian == Endian::Little); } + void setEndian(Endian endian) { m_endian = endian; } + Endian endian() const { return m_endian; } + bool isBigEndian() const { return (m_endian == Endian::Big); } + bool isLittleEndian() const { return (m_endian == Endian::Little); } virtual void seek(atInt64, SeekOrigin) = 0; virtual bool atEnd() const = 0; virtual atUint64 position() const = 0; diff --git a/include/athena/IStreamReader.hpp b/include/athena/IStreamReader.hpp index 7298cd8..f619d8a 100644 --- a/include/athena/IStreamReader.hpp +++ b/include/athena/IStreamReader.hpp @@ -16,7 +16,7 @@ namespace athena::io class IStreamReader : public IStream { public: - virtual ~IStreamReader() {} + virtual ~IStreamReader() = default; /** @brief Sets the buffers position relative to the specified position.
* It seeks relative to the current position by default. @@ -27,25 +27,25 @@ public: /** @brief Sets the buffer's position relative to the next 64-byte aligned position.
*/ - inline void seekAlign64() {seek(ROUND_UP_64(position()), SeekOrigin::Begin);} + void seekAlign64() {seek(ROUND_UP_64(position()), SeekOrigin::Begin);} /** @brief Sets the buffers position relative to the next 32-byte aligned position.
*/ - inline void seekAlign32() {seek(ROUND_UP_32(position()), SeekOrigin::Begin);} + void seekAlign32() {seek(ROUND_UP_32(position()), SeekOrigin::Begin);} /** @brief Sets the buffer's position relative to the next 16-byte aligned position.
*/ - inline void seekAlign16() {seek(ROUND_UP_16(position()), SeekOrigin::Begin); } + void seekAlign16() {seek(ROUND_UP_16(position()), SeekOrigin::Begin); } /** @brief Sets the buffer's position relative to the next 4-byte aligned position.
*/ - inline void seekAlign4() {seek(ROUND_UP_4(position()), SeekOrigin::Begin); } + void seekAlign4() {seek(ROUND_UP_4(position()), SeekOrigin::Begin); } /** @brief Returns whether or not the stream is at the end. * * @return True if at end; False otherwise. */ - inline bool atEnd() const + bool atEnd() const {return position() >= length();} /** @brief Returns the current position in the stream. @@ -64,37 +64,37 @@ public: * * @return The value at the current position */ - inline atInt8 readByte() {atInt8 val; readUBytesToBuf(&val, 1); return val;} + atInt8 readByte() {atInt8 val; readUBytesToBuf(&val, 1); return val;} template - inline atInt8 readVal(typename std::enable_if::value>::type* = 0) + atInt8 readVal(typename std::enable_if::value>::type* = 0) {return readByte();} template - inline atInt8 readValLittle(typename std::enable_if::value>::type* = 0) + atInt8 readValLittle(typename std::enable_if::value>::type* = 0) {return readByte();} template - inline atInt8 readValBig(typename std::enable_if::value>::type* = 0) + atInt8 readValBig(typename std::enable_if::value>::type* = 0) {return readByte();} /** @brief Reads a byte at the current position and advances the current position * * @return The value at the current position */ - inline atUint8 readUByte() {return readByte();} + atUint8 readUByte() {return readByte();} template - inline atUint8 readVal(typename std::enable_if::value>::type* = 0) + atUint8 readVal(typename std::enable_if::value>::type* = 0) {return readUByte();} template - inline atUint8 readValLittle(typename std::enable_if::value>::type* = 0) + atUint8 readValLittle(typename std::enable_if::value>::type* = 0) {return readUByte();} template - inline atUint8 readValBig(typename std::enable_if::value>::type* = 0) + atUint8 readValBig(typename std::enable_if::value>::type* = 0) {return readUByte();} /** @brief Reads a byte at the current position and advances the current position. 
* * @return The buffer at the current position from the given length. */ - inline std::unique_ptr readBytes(atUint64 length) + std::unique_ptr readBytes(atUint64 length) { atInt8* buf = new atInt8[length]; readUBytesToBuf(buf, length); @@ -105,7 +105,7 @@ public: * * @return The buffer at the current position from the given length. */ - inline std::unique_ptr readUBytes(atUint64 length) + std::unique_ptr readUBytes(atUint64 length) { atUint8* buf = new atUint8[length]; readUBytesToBuf(buf, length); @@ -117,7 +117,7 @@ public: * @param len The length of the buffer * @return How much data was actually read, useful for detecting read errors. */ - inline atUint64 readBytesToBuf(void* buf, atUint64 len) {return readUBytesToBuf(buf, len);} + atUint64 readBytesToBuf(void* buf, atUint64 len) {return readUBytesToBuf(buf, len);} /** @brief Attempts to read a fixed length of data into a pre-allocated buffer, this function is client defined @@ -133,14 +133,14 @@ public: * * @return The value at the current address */ - inline atInt16 readInt16() + atInt16 readInt16() { atInt16 val; readUBytesToBuf(&val, 2); return m_endian == Big ? 
utility::BigInt16(val) : utility::LittleInt16(val); } template - inline atInt16 readVal(typename std::enable_if::value>::type* = 0) + atInt16 readVal(typename std::enable_if::value>::type* = 0) {return readInt16();} /** @brief Reads a Int16 and swaps against little endianness depending on platform @@ -148,14 +148,14 @@ public: * * @return The value at the current address */ - inline atInt16 readInt16Little() + atInt16 readInt16Little() { atInt16 val; readUBytesToBuf(&val, 2); return utility::LittleInt16(val); } template - inline atInt16 readValLittle(typename std::enable_if::value>::type* = 0) + atInt16 readValLittle(typename std::enable_if::value>::type* = 0) {return readInt16Little();} /** @brief Reads a Int16 and swaps against big endianness depending on platform @@ -163,14 +163,14 @@ public: * * @return The value at the current address */ - inline atInt16 readInt16Big() + atInt16 readInt16Big() { atInt16 val; readUBytesToBuf(&val, 2); return utility::BigInt16(val); } template - inline atInt16 readValBig(typename std::enable_if::value>::type* = 0) + atInt16 readValBig(typename std::enable_if::value>::type* = 0) {return readInt16Big();} /** @brief Reads a Uint16 and swaps to endianness specified by setEndian depending on platform @@ -178,10 +178,10 @@ public: * * @return The value at the current address */ - inline atUint16 readUint16() + atUint16 readUint16() {return readInt16();} template - inline atUint16 readVal(typename std::enable_if::value>::type* = 0) + atUint16 readVal(typename std::enable_if::value>::type* = 0) {return readUint16();} /** @brief Reads a Uint16 and swaps against little endianness depending on platform @@ -189,14 +189,14 @@ public: * * @return The value at the current address */ - inline atUint16 readUint16Little() + atUint16 readUint16Little() { atUint16 val; readUBytesToBuf(&val, 2); return utility::LittleUint16(val); } template - inline atUint16 readValLittle(typename std::enable_if::value>::type* = 0) + atUint16 readValLittle(typename 
std::enable_if::value>::type* = 0) {return readUint16Little();} /** @brief Reads a Uint16 and swaps against big endianness depending on platform @@ -204,14 +204,14 @@ public: * * @return The value at the current address */ - inline atUint16 readUint16Big() + atUint16 readUint16Big() { atUint16 val; readUBytesToBuf(&val, 2); return utility::BigUint16(val); } template - inline atUint16 readValBig(typename std::enable_if::value>::type* = 0) + atUint16 readValBig(typename std::enable_if::value>::type* = 0) {return readUint16Big();} /** @brief Reads a Int32 and swaps to endianness specified by setEndian depending on platform @@ -219,14 +219,14 @@ public: * * @return The value at the current address */ - inline atInt32 readInt32() + atInt32 readInt32() { atInt32 val; readUBytesToBuf(&val, 4); return m_endian == Big ? utility::BigInt32(val) : utility::LittleInt32(val); } template - inline atInt32 readVal(typename std::enable_if::value>::type* = 0) + atInt32 readVal(typename std::enable_if::value>::type* = 0) {return readInt32();} /** @brief Reads a Int32 and swaps against little endianness depending on platform @@ -234,14 +234,14 @@ public: * * @return The value at the current address */ - inline atInt32 readInt32Little() + atInt32 readInt32Little() { atInt32 val; readUBytesToBuf(&val, 4); return utility::LittleInt32(val); } template - inline atInt32 readValLittle(typename std::enable_if::value>::type* = 0) + atInt32 readValLittle(typename std::enable_if::value>::type* = 0) {return readInt32Little();} /** @brief Reads a Int32 and swaps against big endianness depending on platform @@ -249,14 +249,14 @@ public: * * @return The value at the current address */ - inline atInt32 readInt32Big() + atInt32 readInt32Big() { atInt32 val; readUBytesToBuf(&val, 4); return utility::BigInt32(val); } template - inline atInt32 readValBig(typename std::enable_if::value>::type* = 0) + atInt32 readValBig(typename std::enable_if::value>::type* = 0) {return readInt32Big();} /** @brief Reads a 
Uint32 and swaps to endianness specified by setEndian depending on platform @@ -264,10 +264,10 @@ public: * * @return The value at the current address */ - inline atUint32 readUint32() + atUint32 readUint32() {return readInt32();} template - inline atUint32 readVal(typename std::enable_if::value>::type* = 0) + atUint32 readVal(typename std::enable_if::value>::type* = 0) {return readUint32();} /** @brief Reads a Uint32 and swaps against little endianness depending on platform @@ -275,14 +275,14 @@ public: * * @return The value at the current address */ - inline atUint32 readUint32Little() + atUint32 readUint32Little() { atUint32 val; readUBytesToBuf(&val, 4); return utility::LittleUint32(val); } template - inline atInt32 readValLittle(typename std::enable_if::value>::type* = 0) + atInt32 readValLittle(typename std::enable_if::value>::type* = 0) {return readUint32Little();} /** @brief Reads a Uint32 and swaps against big endianness depending on platform @@ -290,14 +290,14 @@ public: * * @return The value at the current address */ - inline atUint32 readUint32Big() + atUint32 readUint32Big() { atUint32 val; readUBytesToBuf(&val, 4); return utility::BigUint32(val); } template - inline atUint32 readValBig(typename std::enable_if::value>::type* = 0) + atUint32 readValBig(typename std::enable_if::value>::type* = 0) {return readUint32Big();} /** @brief Reads a Int64 and swaps to endianness specified by setEndian depending on platform @@ -305,14 +305,14 @@ public: * * @return The value at the current address */ - inline atInt64 readInt64() + atInt64 readInt64() { atInt64 val; readUBytesToBuf(&val, 8); return m_endian == Big ? 
utility::BigInt64(val) : utility::LittleInt64(val); } template - inline atInt64 readVal(typename std::enable_if::value>::type* = 0) + atInt64 readVal(typename std::enable_if::value>::type* = 0) {return readInt64();} /** @brief Reads a Int64 and swaps against little endianness depending on platform @@ -320,14 +320,14 @@ public: * * @return The value at the current address */ - inline atInt64 readInt64Little() + atInt64 readInt64Little() { atInt64 val; readUBytesToBuf(&val, 8); return utility::LittleInt64(val); } template - inline atInt64 readValLittle(typename std::enable_if::value>::type* = 0) + atInt64 readValLittle(typename std::enable_if::value>::type* = 0) {return readInt64Little();} /** @brief Reads a Int64 and swaps against big endianness depending on platform @@ -335,14 +335,14 @@ public: * * @return The value at the current address */ - inline atInt64 readInt64Big() + atInt64 readInt64Big() { atInt64 val; readUBytesToBuf(&val, 8); return utility::BigInt64(val); } template - inline atInt64 readValBig(typename std::enable_if::value>::type* = 0) + atInt64 readValBig(typename std::enable_if::value>::type* = 0) {return readInt64Big();} /** @brief Reads a Uint64 and swaps to endianness specified by setEndian depending on platform @@ -350,10 +350,10 @@ public: * * @return The value at the current address */ - inline atUint64 readUint64() + atUint64 readUint64() {return readInt64();} template - inline atUint64 readVal(typename std::enable_if::value>::type* = 0) + atUint64 readVal(typename std::enable_if::value>::type* = 0) {return readUint64();} /** @brief Reads a Uint64 and swaps against little endianness depending on platform @@ -361,14 +361,14 @@ public: * * @return The value at the current address */ - inline atUint64 readUint64Little() + atUint64 readUint64Little() { atUint64 val; readUBytesToBuf(&val, 8); return utility::LittleUint64(val); } template - inline atUint64 readValLittle(typename std::enable_if::value>::type* = 0) + atUint64 readValLittle(typename 
std::enable_if::value>::type* = 0) {return readUint64Little();} /** @brief Reads a Uint64 and swaps against big endianness depending on platform @@ -376,14 +376,14 @@ public: * * @return The value at the current address */ - inline atUint64 readUint64Big() + atUint64 readUint64Big() { atUint64 val; readUBytesToBuf(&val, 8); return utility::BigUint64(val); } template - inline atUint64 readValBig(typename std::enable_if::value>::type* = 0) + atUint64 readValBig(typename std::enable_if::value>::type* = 0) {return readUint64Big();} /** @brief Reads a float and swaps to endianness specified by setEndian depending on platform @@ -391,14 +391,14 @@ public: * * @return The value at the current address */ - inline float readFloat() + float readFloat() { float val; readUBytesToBuf(&val, 4); return m_endian == Big ? utility::BigFloat(val) : utility::LittleFloat(val); } template - inline float readVal(typename std::enable_if::value>::type* = 0) + float readVal(typename std::enable_if::value>::type* = 0) {return readFloat();} /** @brief Reads a float and swaps against little endianness depending on platform @@ -406,14 +406,14 @@ public: * * @return The value at the current address */ - inline float readFloatLittle() + float readFloatLittle() { float val; readUBytesToBuf(&val, 4); return utility::LittleFloat(val); } template - inline float readValLittle(typename std::enable_if::value>::type* = 0) + float readValLittle(typename std::enable_if::value>::type* = 0) {return readFloatLittle();} /** @brief Reads a float and swaps against big endianness depending on platform @@ -421,14 +421,14 @@ public: * * @return The value at the current address */ - inline float readFloatBig() + float readFloatBig() { float val; readUBytesToBuf(&val, 4); return utility::BigFloat(val); } template - inline float readValBig(typename std::enable_if::value>::type* = 0) + float readValBig(typename std::enable_if::value>::type* = 0) {return readFloatBig();} /** @brief Reads a double and swaps to endianness 
specified by setEndian depending on platform @@ -436,14 +436,14 @@ public: * * @return The value at the current address */ - inline double readDouble() + double readDouble() { double val; readUBytesToBuf(&val, 8); return m_endian == Big ? utility::BigDouble(val) : utility::LittleDouble(val); } template - inline double readVal(typename std::enable_if::value>::type* = 0) + double readVal(typename std::enable_if::value>::type* = 0) {return readDouble();} /** @brief Reads a double and swaps against little endianness depending on platform @@ -451,14 +451,14 @@ public: * * @return The value at the current address */ - inline double readDoubleLittle() + double readDoubleLittle() { double val; readUBytesToBuf(&val, 8); return utility::LittleDouble(val); } template - inline double readValLittle(typename std::enable_if::value>::type* = 0) + double readValLittle(typename std::enable_if::value>::type* = 0) {return readDoubleLittle();} /** @brief Reads a double and swaps against big endianness depending on platform @@ -466,34 +466,34 @@ public: * * @return The value at the current address */ - inline double readDoubleBig() + double readDoubleBig() { double val; readUBytesToBuf(&val, 8); return utility::BigDouble(val); } template - inline double readValBig(typename std::enable_if::value>::type* = 0) + double readValBig(typename std::enable_if::value>::type* = 0) {return readDoubleBig();} /** @brief Reads a bool and advances the current position * * @return The value at the current address */ - inline bool readBool() + bool readBool() { atUint8 val; readUBytesToBuf(&val, 1); return val != 0; } template - inline bool readVal(typename std::enable_if::value>::type* = 0) + bool readVal(typename std::enable_if::value>::type* = 0) {return readBool();} template - inline bool readValLittle(typename std::enable_if::value>::type* = 0) + bool readValLittle(typename std::enable_if::value>::type* = 0) {return readBool();} template - inline bool readValBig(typename 
std::enable_if::value>::type* = 0) + bool readValBig(typename std::enable_if::value>::type* = 0) {return readBool();} /** @brief Reads an atVec2f (8 bytes), swaps to endianness specified by setEndian depending on platform @@ -501,24 +501,28 @@ public: * * @return The value at the current address */ - inline atVec2f readVec2f() + atVec2f readVec2f() { - atVec2f val; - readUBytesToBuf(&val, 8); + simd_floats val; + readUBytesToBuf(val.data(), 8); if (m_endian == Big) { - utility::BigFloat(val.vec[0]); - utility::BigFloat(val.vec[1]); + val[0] = utility::BigFloat(val[0]); + val[1] = utility::BigFloat(val[1]); } else { - utility::LittleFloat(val.vec[0]); - utility::LittleFloat(val.vec[1]); + val[0] = utility::LittleFloat(val[0]); + val[1] = utility::LittleFloat(val[1]); } - return val; + val[2] = 0.f; + val[3] = 0.f; + atVec2f s; + s.simd.copy_from(val); + return s; } template - inline atVec2f readVal(typename std::enable_if::value>::type* = 0) + atVec2f readVal(typename std::enable_if::value>::type* = 0) {return readVec2f();} /** @brief Reads an atVec2f (8 bytes), swaps against little endianness depending on platform @@ -526,16 +530,20 @@ public: * * @return The value at the current address */ - inline atVec2f readVec2fLittle() + atVec2f readVec2fLittle() { - atVec2f val; - readUBytesToBuf(&val, 8); - utility::LittleFloat(val.vec[0]); - utility::LittleFloat(val.vec[1]); - return val; + simd_floats val; + readUBytesToBuf(val.data(), 8); + val[0] = utility::LittleFloat(val[0]); + val[1] = utility::LittleFloat(val[1]); + val[2] = 0.f; + val[3] = 0.f; + atVec2f s; + s.simd.copy_from(val); + return s; } template - inline atVec2f readValLittle(typename std::enable_if::value>::type* = 0) + atVec2f readValLittle(typename std::enable_if::value>::type* = 0) {return readVec2fLittle();} /** @brief Reads an atVec2f (8 bytes), swaps against big endianness depending on platform @@ -543,16 +551,20 @@ public: * * @return The value at the current address */ - inline atVec2f 
readVec2fBig() + atVec2f readVec2fBig() { - atVec2f val; - readUBytesToBuf(&val, 8); - utility::BigFloat(val.vec[0]); - utility::BigFloat(val.vec[1]); - return val; + simd_floats val; + readUBytesToBuf(val.data(), 8); + val[0] = utility::BigFloat(val[0]); + val[1] = utility::BigFloat(val[1]); + val[2] = 0.f; + val[3] = 0.f; + atVec2f s; + s.simd.copy_from(val); + return s; } template - inline atVec2f readValBig(typename std::enable_if::value>::type* = 0) + atVec2f readValBig(typename std::enable_if::value>::type* = 0) {return readVec2fBig();} /** @brief Reads an atVec3f (12 bytes), swaps to endianness specified by setEndian depending on platform @@ -560,26 +572,29 @@ public: * * @return The value at the current address */ - inline atVec3f readVec3f() + atVec3f readVec3f() { - atVec3f val; - readUBytesToBuf(&val, 12); + simd_floats val; + readUBytesToBuf(val.data(), 12); if (m_endian == Big) { - utility::BigFloat(val.vec[0]); - utility::BigFloat(val.vec[1]); - utility::BigFloat(val.vec[2]); + val[0] = utility::BigFloat(val[0]); + val[1] = utility::BigFloat(val[1]); + val[2] = utility::BigFloat(val[2]); } else { - utility::LittleFloat(val.vec[0]); - utility::LittleFloat(val.vec[1]); - utility::LittleFloat(val.vec[2]); + val[0] = utility::LittleFloat(val[0]); + val[1] = utility::LittleFloat(val[1]); + val[2] = utility::LittleFloat(val[2]); } - return val; + val[3] = 0.f; + atVec3f s; + s.simd.copy_from(val); + return s; } template - inline atVec3f readVal(typename std::enable_if::value>::type* = 0) + atVec3f readVal(typename std::enable_if::value>::type* = 0) {return readVec3f();} /** @brief Reads an atVec3f (12 bytes), swaps against little endianness depending on platform @@ -587,17 +602,20 @@ public: * * @return The value at the current address */ - inline atVec3f readVec3fLittle() + atVec3f readVec3fLittle() { - atVec3f val; - readUBytesToBuf(&val, 12); - utility::LittleFloat(val.vec[0]); - utility::LittleFloat(val.vec[1]); - utility::LittleFloat(val.vec[2]); - 
return val; + simd_floats val; + readUBytesToBuf(val.data(), 12); + val[0] = utility::LittleFloat(val[0]); + val[1] = utility::LittleFloat(val[1]); + val[2] = utility::LittleFloat(val[2]); + val[3] = 0.f; + atVec3f s; + s.simd.copy_from(val); + return s; } template - inline atVec3f readValLittle(typename std::enable_if::value>::type* = 0) + atVec3f readValLittle(typename std::enable_if::value>::type* = 0) {return readVec3fLittle();} /** @brief Reads an atVec3f (12 bytes), swaps against big endianness depending on platform @@ -605,17 +623,20 @@ public: * * @return The value at the current address */ - inline atVec3f readVec3fBig() + atVec3f readVec3fBig() { - atVec3f val; - readUBytesToBuf(&val, 12); - utility::BigFloat(val.vec[0]); - utility::BigFloat(val.vec[1]); - utility::BigFloat(val.vec[2]); - return val; + simd_floats val; + readUBytesToBuf(val.data(), 12); + val[0] = utility::BigFloat(val[0]); + val[1] = utility::BigFloat(val[1]); + val[2] = utility::BigFloat(val[2]); + val[3] = 0.f; + atVec3f s; + s.simd.copy_from(val); + return s; } template - inline atVec3f readValBig(typename std::enable_if::value>::type* = 0) + atVec3f readValBig(typename std::enable_if::value>::type* = 0) {return readVec3fBig();} /** @brief Reads an atVec4f (16 bytes), swaps to endianness specified by setEndian depending on platform @@ -623,28 +644,30 @@ public: * * @return The value at the current address */ - inline atVec4f readVec4f() + atVec4f readVec4f() { - atVec4f val; - readUBytesToBuf(&val, 16); + simd_floats val; + readUBytesToBuf(val.data(), 16); if (m_endian == Big) { - utility::BigFloat(val.vec[0]); - utility::BigFloat(val.vec[1]); - utility::BigFloat(val.vec[2]); - utility::BigFloat(val.vec[3]); + val[0] = utility::BigFloat(val[0]); + val[1] = utility::BigFloat(val[1]); + val[2] = utility::BigFloat(val[2]); + val[3] = utility::BigFloat(val[3]); } else { - utility::LittleFloat(val.vec[0]); - utility::LittleFloat(val.vec[1]); - utility::LittleFloat(val.vec[2]); - 
utility::LittleFloat(val.vec[3]); + val[0] = utility::LittleFloat(val[0]); + val[1] = utility::LittleFloat(val[1]); + val[2] = utility::LittleFloat(val[2]); + val[3] = utility::LittleFloat(val[3]); } - return val; + atVec4f s; + s.simd.copy_from(val); + return s; } template - inline atVec4f readVal(typename std::enable_if::value>::type* = 0) + atVec4f readVal(typename std::enable_if::value>::type* = 0) {return readVec4f();} /** @brief Reads an atVec4f (16 bytes), swaps against little endianness depending on platform @@ -652,18 +675,20 @@ public: * * @return The value at the current address */ - inline atVec4f readVec4fLittle() + atVec4f readVec4fLittle() { - atVec4f val; - readUBytesToBuf(&val, 16); - utility::LittleFloat(val.vec[0]); - utility::LittleFloat(val.vec[1]); - utility::LittleFloat(val.vec[2]); - utility::LittleFloat(val.vec[3]); - return val; + simd_floats val; + readUBytesToBuf(val.data(), 16); + val[0] = utility::LittleFloat(val[0]); + val[1] = utility::LittleFloat(val[1]); + val[2] = utility::LittleFloat(val[2]); + val[3] = utility::LittleFloat(val[3]); + atVec4f s; + s.simd.copy_from(val); + return s; } template - inline atVec4f readValLittle(typename std::enable_if::value>::type* = 0) + atVec4f readValLittle(typename std::enable_if::value>::type* = 0) {return readVec4fLittle();} /** @brief Reads an atVec4f (16 bytes), swaps against big endianness depending on platform @@ -671,18 +696,20 @@ public: * * @return The value at the current address */ - inline atVec4f readVec4fBig() + atVec4f readVec4fBig() { - atVec4f val; - readUBytesToBuf(&val, 16); - utility::BigFloat(val.vec[0]); - utility::BigFloat(val.vec[1]); - utility::BigFloat(val.vec[2]); - utility::BigFloat(val.vec[3]); - return val; + simd_floats val; + readUBytesToBuf(val.data(), 16); + val[0] = utility::BigFloat(val[0]); + val[1] = utility::BigFloat(val[1]); + val[2] = utility::BigFloat(val[2]); + val[3] = utility::BigFloat(val[3]); + atVec4f s; + s.simd.copy_from(val); + return s; } 
template - inline atVec4f readValBig(typename std::enable_if::value>::type* = 0) + atVec4f readValBig(typename std::enable_if::value>::type* = 0) {return readVec4fBig();} /** @brief Reads an atVec2d (16 bytes), swaps to endianness specified by setEndian depending on platform @@ -690,24 +717,28 @@ public: * * @return The value at the current address */ - inline atVec2d readVec2d() + atVec2d readVec2d() { - atVec2d val; - readUBytesToBuf(&val, 16); + simd_doubles val; + readUBytesToBuf(val.data(), 16); if (m_endian == Big) { - utility::BigDouble(val.vec[0]); - utility::BigDouble(val.vec[1]); + val[0] = utility::BigDouble(val[0]); + val[1] = utility::BigDouble(val[1]); } else { - utility::LittleDouble(val.vec[0]); - utility::LittleDouble(val.vec[1]); + val[0] = utility::LittleDouble(val[0]); + val[1] = utility::LittleDouble(val[1]); } - return val; + val[2] = 0.0; + val[3] = 0.0; + atVec2d s; + s.simd.copy_from(val); + return s; } template - inline atVec2d readVal(typename std::enable_if::value>::type* = 0) + atVec2d readVal(typename std::enable_if::value>::type* = 0) {return readVec2d();} /** @brief Reads an atVec2d (16 bytes), swaps against little endianness depending on platform @@ -715,16 +746,20 @@ public: * * @return The value at the current address */ - inline atVec2d readVec2dLittle() + atVec2d readVec2dLittle() { - atVec2d val; - readUBytesToBuf(&val, 16); - utility::LittleDouble(val.vec[0]); - utility::LittleDouble(val.vec[1]); - return val; + simd_doubles val; + readUBytesToBuf(val.data(), 16); + val[0] = utility::LittleDouble(val[0]); + val[1] = utility::LittleDouble(val[1]); + val[2] = 0.0; + val[3] = 0.0; + atVec2d s; + s.simd.copy_from(val); + return s; } template - inline atVec2d readValLittle(typename std::enable_if::value>::type* = 0) + atVec2d readValLittle(typename std::enable_if::value>::type* = 0) {return readVec2dLittle();} /** @brief Reads an atVec2d (16 bytes), swaps against big endianness depending on platform @@ -732,16 +767,20 @@ public: * 
* @return The value at the current address */ - inline atVec2d readVec2dBig() + atVec2d readVec2dBig() { - atVec2d val; - readUBytesToBuf(&val, 16); - utility::BigDouble(val.vec[0]); - utility::BigDouble(val.vec[1]); - return val; + simd_doubles val; + readUBytesToBuf(val.data(), 16); + val[0] = utility::BigDouble(val[0]); + val[1] = utility::BigDouble(val[1]); + val[2] = 0.0; + val[3] = 0.0; + atVec2d s; + s.simd.copy_from(val); + return s; } template - inline atVec2d readValBig(typename std::enable_if::value>::type* = 0) + atVec2d readValBig(typename std::enable_if::value>::type* = 0) {return readVec2dBig();} /** @brief Reads an atVec3d (24 bytes), swaps to endianness specified by setEndian depending on platform @@ -749,26 +788,29 @@ public: * * @return The value at the current address */ - inline atVec3d readVec3d() + atVec3d readVec3d() { - atVec3d val; - readUBytesToBuf(&val, 24); + simd_doubles val; + readUBytesToBuf(val.data(), 24); if (m_endian == Big) { - utility::BigDouble(val.vec[0]); - utility::BigDouble(val.vec[1]); - utility::BigDouble(val.vec[2]); + val[0] = utility::BigDouble(val[0]); + val[1] = utility::BigDouble(val[1]); + val[2] = utility::BigDouble(val[2]); } else { - utility::LittleDouble(val.vec[0]); - utility::LittleDouble(val.vec[1]); - utility::LittleDouble(val.vec[2]); + val[0] = utility::LittleDouble(val[0]); + val[1] = utility::LittleDouble(val[1]); + val[2] = utility::LittleDouble(val[2]); } - return val; + val[3] = 0.0; + atVec3d s; + s.simd.copy_from(val); + return s; } template - inline atVec3d readVal(typename std::enable_if::value>::type* = 0) + atVec3d readVal(typename std::enable_if::value>::type* = 0) {return readVec3d();} /** @brief Reads an atVec3d (24 bytes), swaps against little endianness depending on platform @@ -776,17 +818,20 @@ public: * * @return The value at the current address */ - inline atVec3d readVec3dLittle() + atVec3d readVec3dLittle() { - atVec3d val; - readUBytesToBuf(&val, 24); - 
utility::LittleDouble(val.vec[0]); - utility::LittleDouble(val.vec[1]); - utility::LittleDouble(val.vec[2]); - return val; + simd_doubles val; + readUBytesToBuf(val.data(), 24); + val[0] = utility::LittleDouble(val[0]); + val[1] = utility::LittleDouble(val[1]); + val[2] = utility::LittleDouble(val[2]); + val[3] = 0.0; + atVec3d s; + s.simd.copy_from(val); + return s; } template - inline atVec3d readValLittle(typename std::enable_if::value>::type* = 0) + atVec3d readValLittle(typename std::enable_if::value>::type* = 0) {return readVec3dLittle();} /** @brief Reads an atVec3d (24 bytes), swaps against big endianness depending on platform @@ -794,17 +839,20 @@ public: * * @return The value at the current address */ - inline atVec3d readVec3dBig() + atVec3d readVec3dBig() { - atVec3d val; - readUBytesToBuf(&val, 24); - utility::BigDouble(val.vec[0]); - utility::BigDouble(val.vec[1]); - utility::BigDouble(val.vec[2]); - return val; + simd_doubles val; + readUBytesToBuf(val.data(), 24); + val[0] = utility::BigDouble(val[0]); + val[1] = utility::BigDouble(val[1]); + val[2] = utility::BigDouble(val[2]); + val[3] = 0.0; + atVec3d s; + s.simd.copy_from(val); + return s; } template - inline atVec3d readValBig(typename std::enable_if::value>::type* = 0) + atVec3d readValBig(typename std::enable_if::value>::type* = 0) {return readVec3dBig();} /** @brief Reads an atVec4d (32 bytes), swaps to endianness specified by setEndian depending on platform @@ -812,28 +860,30 @@ public: * * @return The value at the current address */ - inline atVec4d readVec4d() + atVec4d readVec4d() { - atVec4d val; - readUBytesToBuf(&val, 32); + simd_doubles val; + readUBytesToBuf(val.data(), 32); if (m_endian == Big) { - utility::BigDouble(val.vec[0]); - utility::BigDouble(val.vec[1]); - utility::BigDouble(val.vec[2]); - utility::BigDouble(val.vec[3]); + val[0] = utility::BigDouble(val[0]); + val[1] = utility::BigDouble(val[1]); + val[2] = utility::BigDouble(val[2]); + val[3] = 
utility::BigDouble(val[3]); } else { - utility::LittleDouble(val.vec[0]); - utility::LittleDouble(val.vec[1]); - utility::LittleDouble(val.vec[2]); - utility::LittleDouble(val.vec[3]); + val[0] = utility::LittleDouble(val[0]); + val[1] = utility::LittleDouble(val[1]); + val[2] = utility::LittleDouble(val[2]); + val[3] = utility::LittleDouble(val[3]); } - return val; + atVec4d s; + s.simd.copy_from(val); + return s; } template - inline atVec4d readVal(typename std::enable_if::value>::type* = 0) + atVec4d readVal(typename std::enable_if::value>::type* = 0) {return readVec4d();} /** @brief Reads an atVec4d (32 bytes), swaps against little endianness depending on platform @@ -841,18 +891,20 @@ public: * * @return The value at the current address */ - inline atVec4d readVec4dLittle() + atVec4d readVec4dLittle() { - atVec4d val; - readUBytesToBuf(&val, 32); - utility::LittleDouble(val.vec[0]); - utility::LittleDouble(val.vec[1]); - utility::LittleDouble(val.vec[2]); - utility::LittleDouble(val.vec[3]); - return val; + simd_doubles val; + readUBytesToBuf(val.data(), 32); + val[0] = utility::LittleDouble(val[0]); + val[1] = utility::LittleDouble(val[1]); + val[2] = utility::LittleDouble(val[2]); + val[3] = utility::LittleDouble(val[3]); + atVec4d s; + s.simd.copy_from(val); + return s; } template - inline atVec4d readValLittle(typename std::enable_if::value>::type* = 0) + atVec4d readValLittle(typename std::enable_if::value>::type* = 0) {return readVec4dLittle();} /** @brief Reads an atVec4d (32 bytes), swaps against big endianness depending on platform @@ -860,18 +912,20 @@ public: * * @return The value at the current address */ - inline atVec4d readVec4dBig() + atVec4d readVec4dBig() { - atVec4d val; - readUBytesToBuf(&val, 32); - utility::BigDouble(val.vec[0]); - utility::BigDouble(val.vec[1]); - utility::BigDouble(val.vec[2]); - utility::BigDouble(val.vec[3]); - return val; + simd_doubles val; + readUBytesToBuf(val.data(), 32); + val[0] = utility::BigDouble(val[0]); + 
val[1] = utility::BigDouble(val[1]); + val[2] = utility::BigDouble(val[2]); + val[3] = utility::BigDouble(val[3]); + atVec4d s; + s.simd.copy_from(val); + return s; } template - inline atVec4d readValBig(typename std::enable_if::value>::type* = 0) + atVec4d readValBig(typename std::enable_if::value>::type* = 0) {return readVec4dBig();} /** @brief Reads a string and advances the position in the file @@ -879,7 +933,7 @@ public: * @param fixedLen If non-negative, this is a fixed-length string read * @return The read string */ - inline std::string readString(atInt32 fixedLen = -1, bool doSeek=true) + std::string readString(atInt32 fixedLen = -1, bool doSeek=true) { if (fixedLen == 0) return std::string(); @@ -903,7 +957,7 @@ public: return ret; } template - inline std::string readVal(typename std::enable_if::value>::type* = 0) + std::string readVal(typename std::enable_if::value>::type* = 0) {return readString();} /** @brief Reads a wstring and advances the position in the file @@ -911,7 +965,7 @@ public: * @param fixedLen If non-negative, this is a fixed-length string read * @return The read wstring */ - inline std::wstring readWString(atInt32 fixedLen = -1, bool doSeek=true) + std::wstring readWString(atInt32 fixedLen = -1, bool doSeek=true) { if (fixedLen == 0) return std::wstring(); @@ -936,7 +990,7 @@ public: return ret; } template - inline std::wstring readVal(typename std::enable_if::value>::type* = 0) + std::wstring readVal(typename std::enable_if::value>::type* = 0) {return readWString();} /** @brief Reads a wstring assuming little-endian characters @@ -945,7 +999,7 @@ public: * @param fixedLen If non-negative, this is a fixed-length string read * @return The read wstring */ - inline std::wstring readWStringLittle(atInt32 fixedLen = -1, bool doSeek=true) + std::wstring readWStringLittle(atInt32 fixedLen = -1, bool doSeek=true) { if (fixedLen == 0) return std::wstring(); @@ -970,7 +1024,7 @@ public: return ret; } template - inline std::wstring 
readValLittle(typename std::enable_if::value>::type* = 0) + std::wstring readValLittle(typename std::enable_if::value>::type* = 0) {return readWStringLittle();} /** @brief Reads a wstring assuming big-endian characters @@ -979,7 +1033,7 @@ public: * @param fixedLen If non-negative, this is a fixed-length string read * @return The read wstring */ - inline std::wstring readWStringBig(atInt32 fixedLen = -1, bool doSeek = true) + std::wstring readWStringBig(atInt32 fixedLen = -1, bool doSeek = true) { if (fixedLen == 0) return std::wstring(); @@ -1003,7 +1057,7 @@ public: return ret; } template - inline std::wstring readValBig(typename std::enable_if::value>::type* = 0) + std::wstring readValBig(typename std::enable_if::value>::type* = 0) {return readWStringBig();} /** @brief Reads a u16string assuming big-endian characters @@ -1012,7 +1066,7 @@ public: * @param fixedLen If non-negative, this is a fixed-length string read * @return The read wstring */ - inline std::u16string readU16StringBig(atInt32 fixedLen = -1, bool doSeek = true) + std::u16string readU16StringBig(atInt32 fixedLen = -1, bool doSeek = true) { if (fixedLen == 0) return std::u16string(); @@ -1036,7 +1090,7 @@ public: return ret; } template - inline std::u16string readValBig(typename std::enable_if::value>::type* = 0) + std::u16string readValBig(typename std::enable_if::value>::type* = 0) {return readU16StringBig();} /** @brief Reads a u32string assuming big-endian characters @@ -1045,7 +1099,7 @@ public: * @param fixedLen If non-negative, this is a fixed-length string read * @return The read wstring */ - inline std::u32string readU32StringBig(atInt32 fixedLen = -1, bool doSeek = true) + std::u32string readU32StringBig(atInt32 fixedLen = -1, bool doSeek = true) { if (fixedLen == 0) return std::u32string(); @@ -1069,7 +1123,7 @@ public: return ret; } template - inline std::u32string readValBig(typename std::enable_if::value>::type* = 0) + std::u32string readValBig(typename std::enable_if::value>::type* = 
0) {return readU32StringBig();} /** @brief Performs automatic std::vector enumeration reads using numeric type T diff --git a/include/athena/IStreamWriter.hpp b/include/athena/IStreamWriter.hpp index 6f6bf56..54e8a38 100644 --- a/include/athena/IStreamWriter.hpp +++ b/include/athena/IStreamWriter.hpp @@ -11,7 +11,8 @@ namespace athena::io class IStreamWriter : public IStream { public: - virtual ~IStreamWriter() {} + virtual ~IStreamWriter() = default; + /** @brief Sets the buffers position relative to the specified position.
* It seeks relative to the current position by default. * @param position where in the buffer to seek @@ -21,11 +22,11 @@ public: /** @brief Sets the buffers position relative to the next 32-byte aligned position.
*/ - inline void seekAlign32() {seek(ROUND_UP_32(position()), SeekOrigin::Begin);} + void seekAlign32() {seek(ROUND_UP_32(position()), SeekOrigin::Begin);} /** @brief Writes zero up to specified absolute offset.
*/ - inline void writeZeroTo(atInt64 pos) + void writeZeroTo(atInt64 pos) { atInt64 delta = pos - position(); if (delta <= 0) @@ -38,7 +39,7 @@ public: * * @return True if at end; False otherwise. */ - inline bool atEnd() const {return position() >= length();} + bool atEnd() const {return position() >= length();} /** @brief Returns the current position in the stream. * @@ -55,18 +56,18 @@ public: /** @brief Writes a byte at the current position and advances the position by one byte. * @param val The value to write */ - inline void writeUByte(atUint8 val) {writeUBytes(&val, 1);} - inline void writeVal(atUint8 val) {writeUByte(val);} - inline void writeValLittle(atUint8 val) {writeUByte(val);} - inline void writeValBig(atUint8 val) {writeUByte(val);} + void writeUByte(atUint8 val) {writeUBytes(&val, 1);} + void writeVal(atUint8 val) {writeUByte(val);} + void writeValLittle(atUint8 val) {writeUByte(val);} + void writeValBig(atUint8 val) {writeUByte(val);} /** @brief Writes a byte at the current position and advances the position by one byte. * @param val The value to write */ - inline void writeByte(atInt8 val) {writeUByte(val);} - inline void writeVal(atInt8 val) {writeByte(val);} - inline void writeValLittle(atInt8 val) {writeByte(val);} - inline void writeValBig(atInt8 val) {writeByte(val);} + void writeByte(atInt8 val) {writeUByte(val);} + void writeVal(atInt8 val) {writeByte(val);} + void writeValLittle(atInt8 val) {writeByte(val);} + void writeValBig(atInt8 val) {writeByte(val);} /** @brief Writes the given buffer with the specified length, buffers can be bigger than the length * however it's undefined behavior to try and write a buffer which is smaller than the given length. 
@@ -82,14 +83,14 @@ public: * @param data The buffer to write * @param length The amount to write */ - inline void writeBytes(const void* data, atUint64 len) {writeUBytes((atUint8*)data, len);} + void writeBytes(const void* data, atUint64 len) {writeUBytes((atUint8*)data, len);} /** @brief Writes an Int16 to the buffer and advances the buffer. * It also swaps the bytes depending on the platform and Stream settings. * * @param val The value to write to the buffer */ - inline void writeInt16(atInt16 val) + void writeInt16(atInt16 val) { if (m_endian == Big) utility::BigInt16(val); @@ -97,62 +98,62 @@ public: utility::LittleInt16(val); writeUBytes((atUint8*)&val, 2); } - inline void writeVal(atInt16 val) {writeInt16(val);} + void writeVal(atInt16 val) {writeInt16(val);} /** @brief Writes an Int16 to the buffer and advances the buffer. * It also swaps the bytes against little depending on the platform. * * @param val The value to write to the buffer */ - inline void writeInt16Little(atInt16 val) + void writeInt16Little(atInt16 val) { utility::LittleInt16(val); writeUBytes((atUint8*)&val, 2); } - inline void writeValLittle(atInt16 val) {writeInt16Little(val);} + void writeValLittle(atInt16 val) {writeInt16Little(val);} /** @brief Writes an Int16 to the buffer and advances the buffer. * It also swaps the bytes against big depending on the platform. * * @param val The value to write to the buffer */ - inline void writeInt16Big(atInt16 val) + void writeInt16Big(atInt16 val) { utility::BigInt16(val); writeUBytes((atUint8*)&val, 2); } - inline void writeValBig(atInt16 val) {writeInt16Big(val);} + void writeValBig(atInt16 val) {writeInt16Big(val);} /** @brief Writes an Uint16 to the buffer and advances the buffer. 
* It also swaps the bytes depending on the platform and Stream settings * * @param val The value to write to the buffer */ - inline void writeUint16(atUint16 val) {writeInt16(val);} - inline void writeVal(atUint16 val) {writeUint16(val);} + void writeUint16(atUint16 val) {writeInt16(val);} + void writeVal(atUint16 val) {writeUint16(val);} /** @brief Writes an Uint16 to the buffer and advances the buffer. * It also swaps the bytes against little depending on the platform * * @param val The value to write to the buffer */ - inline void writeUint16Little(atUint16 val) {writeInt16Little(val);} - inline void writeValLittle(atUint16 val) {writeUint16Little(val);} + void writeUint16Little(atUint16 val) {writeInt16Little(val);} + void writeValLittle(atUint16 val) {writeUint16Little(val);} /** @brief Writes an Uint16 to the buffer and advances the buffer. * It also swaps the bytes against big depending on the platform * * @param val The value to write to the buffer */ - inline void writeUint16Big(atUint16 val) {writeInt16Big(val);} - inline void writeValBig(atUint16 val) {writeUint16Big(val);} + void writeUint16Big(atUint16 val) {writeInt16Big(val);} + void writeValBig(atUint16 val) {writeUint16Big(val);} /** @brief Writes an Int32 to the buffer and advances the buffer. * It also swaps the bytes depending on the platform and Stream settings. * * @param val The value to write to the buffer */ - inline void writeInt32(atInt32 val) + void writeInt32(atInt32 val) { if (m_endian == Big) utility::BigInt32(val); @@ -160,62 +161,62 @@ public: utility::LittleInt32(val); writeUBytes((atUint8*)&val, 4); } - inline void writeVal(atInt32 val) {writeInt32(val);} + void writeVal(atInt32 val) {writeInt32(val);} /** @brief Writes an Int32 to the buffer and advances the buffer. * It also swaps the bytes against little depending on the platform. 
* * @param val The value to write to the buffer */ - inline void writeInt32Little(atInt32 val) + void writeInt32Little(atInt32 val) { utility::LittleInt32(val); writeUBytes((atUint8*)&val, 4); } - inline void writeValLittle(atInt32 val) {writeInt32Little(val);} + void writeValLittle(atInt32 val) {writeInt32Little(val);} /** @brief Writes an Int32 to the buffer and advances the buffer. * It also swaps the bytes against big depending on the platform. * * @param val The value to write to the buffer */ - inline void writeInt32Big(atInt32 val) + void writeInt32Big(atInt32 val) { utility::BigInt32(val); writeUBytes((atUint8*)&val, 4); } - inline void writeValBig(atInt32 val) {writeInt32Big(val);} + void writeValBig(atInt32 val) {writeInt32Big(val);} /** @brief Writes an Uint32 to the buffer and advances the buffer. * It also swaps the bytes depending on the platform and Stream settings. * * @param val The value to write to the buffer */ - inline void writeUint32(atUint32 val) {writeInt32(val);} - inline void writeVal(atUint32 val) {writeUint32(val);} + void writeUint32(atUint32 val) {writeInt32(val);} + void writeVal(atUint32 val) {writeUint32(val);} /** @brief Writes an Uint32 to the buffer and advances the buffer. * It also swaps the bytes against little depending on the platform. * * @param val The value to write to the buffer */ - inline void writeUint32Little(atUint32 val) {writeInt32Little(val);} - inline void writeValLittle(atUint32 val) {writeUint32Little(val);} + void writeUint32Little(atUint32 val) {writeInt32Little(val);} + void writeValLittle(atUint32 val) {writeUint32Little(val);} /** @brief Writes an Uint32 to the buffer and advances the buffer. * It also swaps the bytes against big depending on the platform. 
* * @param val The value to write to the buffer */ - inline void writeUint32Big(atUint32 val) {writeInt32Big(val);} - inline void writeValBig(atUint32 val) {writeUint32Big(val);} + void writeUint32Big(atUint32 val) {writeInt32Big(val);} + void writeValBig(atUint32 val) {writeUint32Big(val);} /** @brief Writes an Int64 to the buffer and advances the buffer. * It also swaps the bytes depending on the platform and Stream settings. * * @param val The value to write to the buffer */ - inline void writeInt64(atInt64 val) + void writeInt64(atInt64 val) { if (m_endian == Big) utility::BigInt64(val); @@ -223,101 +224,101 @@ public: utility::LittleInt64(val); writeUBytes((atUint8*)&val, 8); } - inline void writeVal(atInt64 val) {writeInt64(val);} + void writeVal(atInt64 val) {writeInt64(val);} /** @brief Writes an Int64 to the buffer and advances the buffer. * It also swaps the bytes against little depending on the platform. * * @param val The value to write to the buffer */ - inline void writeInt64Little(atInt64 val) + void writeInt64Little(atInt64 val) { utility::LittleInt64(val); writeUBytes((atUint8*)&val, 8); } - inline void writeValLittle(atInt64 val) {writeInt64Little(val);} + void writeValLittle(atInt64 val) {writeInt64Little(val);} /** @brief Writes an Int64 to the buffer and advances the buffer. * It also swaps the bytes against big depending on the platform. * * @param val The value to write to the buffer */ - inline void writeInt64Big(atInt64 val) + void writeInt64Big(atInt64 val) { utility::BigInt64(val); writeUBytes((atUint8*)&val, 8); } - inline void writeValBig(atInt64 val) {writeInt64Big(val);} + void writeValBig(atInt64 val) {writeInt64Big(val);} /** @brief Writes an Uint64 to the buffer and advances the buffer. * It also swaps the bytes depending on the platform and Stream settings. 
* * @param val The value to write to the buffer */ - inline void writeUint64(atUint64 val) {writeInt64(val);} - inline void writeVal(atUint64 val) {writeUint64(val);} + void writeUint64(atUint64 val) {writeInt64(val);} + void writeVal(atUint64 val) {writeUint64(val);} /** @brief Writes an Uint64 to the buffer and advances the buffer. * It also swaps the bytes against little depending on the platform. * * @param val The value to write to the buffer */ - inline void writeUint64Little(atUint64 val) {writeInt64Little(val);} - inline void writeValLittle(atUint64 val) {writeUint64Little(val);} + void writeUint64Little(atUint64 val) {writeInt64Little(val);} + void writeValLittle(atUint64 val) {writeUint64Little(val);} /** @brief Writes an Uint64 to the buffer and advances the buffer. * It also swaps the bytes against big depending on the platform. * * @param val The value to write to the buffer */ - inline void writeUint64Big(atUint64 val) {writeInt64Big(val);} - inline void writeValBig(atUint64 val) {writeUint64Big(val);} + void writeUint64Big(atUint64 val) {writeInt64Big(val);} + void writeValBig(atUint64 val) {writeUint64Big(val);} /** @brief Writes an float to the buffer and advances the buffer. * It also swaps the bytes depending on the platform and Stream settings. * * @param val The value to write to the buffer */ - inline void writeFloat(float val) + void writeFloat(float val) { if (m_endian == Big) - utility::BigFloat(val); + val = utility::BigFloat(val); else - utility::LittleFloat(val); + val = utility::LittleFloat(val); writeUBytes((atUint8*)&val, 4); } - inline void writeVal(float val) {writeFloat(val);} + void writeVal(float val) {writeFloat(val);} /** @brief Writes an float to the buffer and advances the buffer. * It also swaps the bytes against little depending on the platform. 
* * @param val The value to write to the buffer */ - inline void writeFloatLittle(float val) + void writeFloatLittle(float val) { utility::LittleFloat(val); writeUBytes((atUint8*)&val, 4); } - inline void writeValLittle(float val) {writeFloatLittle(val);} + void writeValLittle(float val) {writeFloatLittle(val);} /** @brief Writes an float to the buffer and advances the buffer. * It also swaps the bytes against big depending on the platform. * * @param val The value to write to the buffer */ - inline void writeFloatBig(float val) + void writeFloatBig(float val) { - utility::BigFloat(val); + val = utility::BigFloat(val); writeUBytes((atUint8*)&val, 4); } - inline void writeValBig(float val) {writeFloatBig(val);} + void writeValBig(float val) {writeFloatBig(val);} /** @brief Writes an double to the buffer and advances the buffer. * It also swaps the bytes depending on the platform and Stream settings. * * @param val The value to write to the buffer */ - inline void writeDouble(double val) + void writeDouble(double val) { if (m_endian == Big) utility::BigDouble(val); @@ -325,365 +326,365 @@ public: utility::LittleDouble(val); writeUBytes((atUint8*)&val, 8); } - inline void writeVal(double val) {writeDouble(val);} + void writeVal(double val) {writeDouble(val);} /** @brief Writes an double to the buffer and advances the buffer. * It also swaps the bytes against little depending on the platform. * * @param val The value to write to the buffer */ - inline void writeDoubleLittle(double val) + void writeDoubleLittle(double val) { utility::LittleDouble(val); writeUBytes((atUint8*)&val, 8); } - inline void writeValLittle(double val) {writeDoubleLittle(val);} + void writeValLittle(double val) {writeDoubleLittle(val);} /** @brief Writes an double to the buffer and advances the buffer. * It also swaps the bytes against big depending on the platform. 
* * @param val The value to write to the buffer */ - inline void writeDoubleBig(double val) + void writeDoubleBig(double val) { utility::BigDouble(val); writeUBytes((atUint8*)&val, 8); } - inline void writeValBig(double val) {writeDoubleBig(val);} + void writeValBig(double val) {writeDoubleBig(val);} /** @brief Writes an bool to the buffer and advances the buffer. * It also swaps the bytes depending on the platform and Stream settings. * * @param val The value to write to the buffer */ - inline void writeBool(bool val) {writeUBytes((atUint8*)&val, 1);} - inline void writeVal(bool val) {writeBool(val);} - inline void writeValLittle(bool val) {writeBool(val);} - inline void writeValBig(bool val) {writeBool(val);} + void writeBool(bool val) {writeUBytes((atUint8*)&val, 1);} + void writeVal(bool val) {writeBool(val);} + void writeValLittle(bool val) {writeBool(val);} + void writeValBig(bool val) {writeBool(val);} /** @brief Writes an atVec2f (8 bytes) to the buffer and advances the buffer. * It also swaps the bytes depending on the platform and Stream settings. * * @param vec The value to write to the buffer */ - inline void writeVec2f(const atVec2f& vec) + void writeVec2f(const atVec2f& vec) { - atVec2f tmp = vec; + simd_floats tmp(vec.simd); if (m_endian == Big) { - utility::BigFloat(tmp.vec[0]); - utility::BigFloat(tmp.vec[1]); + tmp[0] = utility::BigFloat(tmp[0]); + tmp[1] = utility::BigFloat(tmp[1]); } else { - utility::LittleFloat(tmp.vec[0]); - utility::LittleFloat(tmp.vec[1]); + tmp[0] = utility::LittleFloat(tmp[0]); + tmp[1] = utility::LittleFloat(tmp[1]); } - writeUBytes((atUint8*)&tmp, 8); + writeUBytes((atUint8*)tmp.data(), 8); } - inline void writeVal(const atVec2f& val) {writeVec2f(val);} + void writeVal(const atVec2f& val) {writeVec2f(val);} /** @brief Writes an atVec2f (8 bytes) to the buffer and advances the buffer. * It also swaps the bytes against little depending on the platform. 
* * @param vec The value to write to the buffer */ - inline void writeVec2fLittle(const atVec2f& vec) + void writeVec2fLittle(const atVec2f& vec) { - atVec2f tmp = vec; - utility::LittleFloat(tmp.vec[0]); - utility::LittleFloat(tmp.vec[1]); - writeUBytes((atUint8*)&tmp, 8); + simd_floats tmp(vec.simd); + tmp[0] = utility::LittleFloat(tmp[0]); + tmp[1] = utility::LittleFloat(tmp[1]); + writeUBytes((atUint8*)tmp.data(), 8); } - inline void writeValLittle(const atVec2f& val) {writeVec2fLittle(val);} + void writeValLittle(const atVec2f& val) {writeVec2fLittle(val);} /** @brief Writes an atVec2f (8 bytes) to the buffer and advances the buffer. * It also swaps the bytes against big depending on the platform. * * @param vec The value to write to the buffer */ - inline void writeVec2fBig(const atVec2f& vec) + void writeVec2fBig(const atVec2f& vec) { - atVec2f tmp = vec; - utility::BigFloat(tmp.vec[0]); - utility::BigFloat(tmp.vec[1]); - writeUBytes((atUint8*)&tmp, 8); + simd_floats tmp(vec.simd); + tmp[0] = utility::BigFloat(tmp[0]); + tmp[1] = utility::BigFloat(tmp[1]); + writeUBytes((atUint8*)tmp.data(), 8); } - inline void writeValBig(const atVec2f& val) {writeVec2fBig(val);} + void writeValBig(const atVec2f& val) {writeVec2fBig(val);} /** @brief Writes an atVec3f (12 bytes) to the buffer and advances the buffer. * It also swaps the bytes depending on the platform and Stream settings. 
* * @param vec The value to write to the buffer */ - inline void writeVec3f(const atVec3f& vec) + void writeVec3f(const atVec3f& vec) { - atVec3f tmp = vec; + simd_floats tmp(vec.simd); if (m_endian == Big) { - utility::BigFloat(tmp.vec[0]); - utility::BigFloat(tmp.vec[1]); - utility::BigFloat(tmp.vec[2]); + tmp[0] = utility::BigFloat(tmp[0]); + tmp[1] = utility::BigFloat(tmp[1]); + tmp[2] = utility::BigFloat(tmp[2]); } else { - utility::LittleFloat(tmp.vec[0]); - utility::LittleFloat(tmp.vec[1]); - utility::LittleFloat(tmp.vec[2]); + tmp[0] = utility::LittleFloat(tmp[0]); + tmp[1] = utility::LittleFloat(tmp[1]); + tmp[2] = utility::LittleFloat(tmp[2]); } - writeUBytes((atUint8*)&tmp, 12); + writeUBytes((atUint8*)tmp.data(), 12); } - inline void writeVal(const atVec3f& val) {writeVec3f(val);} + void writeVal(const atVec3f& val) {writeVec3f(val);} /** @brief Writes an atVec3f (12 bytes) to the buffer and advances the buffer. * It also swaps the bytes against little depending on the platform. * * @param vec The value to write to the buffer */ - inline void writeVec3fLittle(const atVec3f& vec) + void writeVec3fLittle(const atVec3f& vec) { - atVec3f tmp = vec; - utility::LittleFloat(tmp.vec[0]); - utility::LittleFloat(tmp.vec[1]); - utility::LittleFloat(tmp.vec[2]); - writeUBytes((atUint8*)&tmp, 12); + simd_floats tmp(vec.simd); + tmp[0] = utility::LittleFloat(tmp[0]); + tmp[1] = utility::LittleFloat(tmp[1]); + tmp[2] = utility::LittleFloat(tmp[2]); + writeUBytes((atUint8*)tmp.data(), 12); } - inline void writeValLittle(const atVec3f& val) {writeVec3fLittle(val);} + void writeValLittle(const atVec3f& val) {writeVec3fLittle(val);} /** @brief Writes an atVec3f (12 bytes) to the buffer and advances the buffer. * It also swaps the bytes against big depending on the platform. 
* * @param vec The value to write to the buffer */ - inline void writeVec3fBig(const atVec3f& vec) + void writeVec3fBig(const atVec3f& vec) { - atVec3f tmp = vec; - utility::BigFloat(tmp.vec[0]); - utility::BigFloat(tmp.vec[1]); - utility::BigFloat(tmp.vec[2]); - writeUBytes((atUint8*)&tmp, 12); + simd_floats tmp(vec.simd); + tmp[0] = utility::BigFloat(tmp[0]); + tmp[1] = utility::BigFloat(tmp[1]); + tmp[2] = utility::BigFloat(tmp[2]); + writeUBytes((atUint8*)tmp.data(), 12); } - inline void writeValBig(const atVec3f& val) {writeVec3fBig(val);} + void writeValBig(const atVec3f& val) {writeVec3fBig(val);} /** @brief Writes an atVec4f (16 bytes) to the buffer and advances the buffer. * It also swaps the bytes depending on the platform and Stream settings. * * @param vec The value to write to the buffer */ - inline void writeVec4f(const atVec4f& vec) + void writeVec4f(const atVec4f& vec) { - atVec4f tmp = vec; + simd_floats tmp(vec.simd); if (m_endian == Big) { - utility::BigFloat(tmp.vec[0]); - utility::BigFloat(tmp.vec[1]); - utility::BigFloat(tmp.vec[2]); - utility::BigFloat(tmp.vec[3]); + tmp[0] = utility::BigFloat(tmp[0]); + tmp[1] = utility::BigFloat(tmp[1]); + tmp[2] = utility::BigFloat(tmp[2]); + tmp[3] = utility::BigFloat(tmp[3]); } else { - utility::LittleFloat(tmp.vec[0]); - utility::LittleFloat(tmp.vec[1]); - utility::LittleFloat(tmp.vec[2]); - utility::LittleFloat(tmp.vec[3]); + tmp[0] = utility::LittleFloat(tmp[0]); + tmp[1] = utility::LittleFloat(tmp[1]); + tmp[2] = utility::LittleFloat(tmp[2]); + tmp[3] = utility::LittleFloat(tmp[3]); } - writeUBytes((atUint8*)&tmp, 16); + writeUBytes((atUint8*)tmp.data(), 16); } - inline void writeVal(const atVec4f& val) {writeVec4f(val);} + void writeVal(const atVec4f& val) {writeVec4f(val);} /** @brief Writes an atVec4f (16 bytes) to the buffer and advances the buffer. * It also swaps the bytes against little depending on the platform. 
* * @param vec The value to write to the buffer */ - inline void writeVec4fLittle(const atVec4f& vec) + void writeVec4fLittle(const atVec4f& vec) { - atVec4f tmp = vec; - utility::LittleFloat(tmp.vec[0]); - utility::LittleFloat(tmp.vec[1]); - utility::LittleFloat(tmp.vec[2]); - utility::LittleFloat(tmp.vec[3]); - writeUBytes((atUint8*)&tmp, 16); + simd_floats tmp(vec.simd); + tmp[0] = utility::LittleFloat(tmp[0]); + tmp[1] = utility::LittleFloat(tmp[1]); + tmp[2] = utility::LittleFloat(tmp[2]); + tmp[3] = utility::LittleFloat(tmp[3]); + writeUBytes((atUint8*)tmp.data(), 16); } - inline void writeValLittle(const atVec4f& val) {writeVec4fLittle(val);} + void writeValLittle(const atVec4f& val) {writeVec4fLittle(val);} /** @brief Writes an atVec4f (16 bytes) to the buffer and advances the buffer. * It also swaps the bytes against big depending on the platform. * * @param vec The value to write to the buffer */ - inline void writeVec4fBig(const atVec4f& vec) + void writeVec4fBig(const atVec4f& vec) { - atVec4f tmp = vec; - utility::BigFloat(tmp.vec[0]); - utility::BigFloat(tmp.vec[1]); - utility::BigFloat(tmp.vec[2]); - utility::BigFloat(tmp.vec[3]); - writeUBytes((atUint8*)&tmp, 16); + simd_floats tmp(vec.simd); + tmp[0] = utility::BigFloat(tmp[0]); + tmp[1] = utility::BigFloat(tmp[1]); + tmp[2] = utility::BigFloat(tmp[2]); + tmp[3] = utility::BigFloat(tmp[3]); + writeUBytes((atUint8*)tmp.data(), 16); } - inline void writeValBig(const atVec4f& val) {writeVec4fBig(val);} + void writeValBig(const atVec4f& val) {writeVec4fBig(val);} /** @brief Writes an atVec2d (16 bytes) to the buffer and advances the buffer. * It also swaps the bytes depending on the platform and Stream settings. 
* * @param vec The value to write to the buffer */ - inline void writeVec2d(const atVec2d& vec) + void writeVec2d(const atVec2d& vec) { - atVec2d tmp = vec; + simd_doubles tmp(vec.simd); if (m_endian == Big) { - utility::BigDouble(tmp.vec[0]); - utility::BigDouble(tmp.vec[1]); + tmp[0] = utility::BigDouble(tmp[0]); + tmp[1] = utility::BigDouble(tmp[1]); } else { - utility::LittleDouble(tmp.vec[0]); - utility::LittleDouble(tmp.vec[1]); + tmp[0] = utility::LittleDouble(tmp[0]); + tmp[1] = utility::LittleDouble(tmp[1]); } - writeUBytes((atUint8*)&tmp, 16); + writeUBytes((atUint8*)tmp.data(), 16); } - inline void writeVal(const atVec2d& val) {writeVec2d(val);} + void writeVal(const atVec2d& val) {writeVec2d(val);} /** @brief Writes an atVec2d (16 bytes) to the buffer and advances the buffer. * It also swaps the bytes against little depending on the platform. * * @param vec The value to write to the buffer */ - inline void writeVec2dLittle(const atVec2d& vec) + void writeVec2dLittle(const atVec2d& vec) { - atVec2d tmp = vec; - utility::LittleDouble(tmp.vec[0]); - utility::LittleDouble(tmp.vec[1]); - writeUBytes((atUint8*)&tmp, 16); + simd_doubles tmp(vec.simd); + tmp[0] = utility::LittleDouble(tmp[0]); + tmp[1] = utility::LittleDouble(tmp[1]); + writeUBytes((atUint8*)tmp.data(), 16); } - inline void writeValLittle(const atVec2d& val) {writeVec2dLittle(val);} + void writeValLittle(const atVec2d& val) {writeVec2dLittle(val);} /** @brief Writes an atVec2d (16 bytes) to the buffer and advances the buffer. * It also swaps the bytes against big depending on the platform. 
* * @param vec The value to write to the buffer */ - inline void writeVec2dBig(const atVec2d& vec) + void writeVec2dBig(const atVec2d& vec) { - atVec2d tmp = vec; - utility::BigDouble(tmp.vec[0]); - utility::BigDouble(tmp.vec[1]); - writeUBytes((atUint8*)&tmp, 16); + simd_doubles tmp(vec.simd); + tmp[0] = utility::BigDouble(tmp[0]); + tmp[1] = utility::BigDouble(tmp[1]); + writeUBytes((atUint8*)tmp.data(), 16); } - inline void writeValBig(const atVec2d& val) {writeVec2dBig(val);} + void writeValBig(const atVec2d& val) {writeVec2dBig(val);} /** @brief Writes an atVec3d (24 bytes) to the buffer and advances the buffer. * It also swaps the bytes depending on the platform and Stream settings. * * @param vec The value to write to the buffer */ - inline void writeVec3d(const atVec3d& vec) + void writeVec3d(const atVec3d& vec) { - atVec3d tmp = vec; + simd_doubles tmp(vec.simd); if (m_endian == Big) { - utility::BigDouble(tmp.vec[0]); - utility::BigDouble(tmp.vec[1]); - utility::BigDouble(tmp.vec[2]); + tmp[0] = utility::BigDouble(tmp[0]); + tmp[1] = utility::BigDouble(tmp[1]); + tmp[2] = utility::BigDouble(tmp[2]); } else { - utility::LittleDouble(tmp.vec[0]); - utility::LittleDouble(tmp.vec[1]); - utility::LittleDouble(tmp.vec[2]); + tmp[0] = utility::LittleDouble(tmp[0]); + tmp[1] = utility::LittleDouble(tmp[1]); + tmp[2] = utility::LittleDouble(tmp[2]); } - writeUBytes((atUint8*)&tmp, 24); + writeUBytes((atUint8*)tmp.data(), 24); } - inline void writeVal(const atVec3d& val) {writeVec3d(val);} + void writeVal(const atVec3d& val) {writeVec3d(val);} /** @brief Writes an atVec3d (24 bytes) to the buffer and advances the buffer. * It also swaps the bytes against little depending on the platform. 
* * @param vec The value to write to the buffer */ - inline void writeVec3dLittle(const atVec3d& vec) + void writeVec3dLittle(const atVec3d& vec) { - atVec3d tmp = vec; - utility::LittleDouble(tmp.vec[0]); - utility::LittleDouble(tmp.vec[1]); - utility::LittleDouble(tmp.vec[2]); - writeUBytes((atUint8*)&tmp, 24); + simd_doubles tmp(vec.simd); + tmp[0] = utility::LittleDouble(tmp[0]); + tmp[1] = utility::LittleDouble(tmp[1]); + tmp[2] = utility::LittleDouble(tmp[2]); + writeUBytes((atUint8*)tmp.data(), 24); } - inline void writeValLittle(const atVec3d& val) {writeVec3dLittle(val);} + void writeValLittle(const atVec3d& val) {writeVec3dLittle(val);} /** @brief Writes an atVec3d (24 bytes) to the buffer and advances the buffer. * It also swaps the bytes against big depending on the platform. * * @param vec The value to write to the buffer */ - inline void writeVec3dBig(const atVec3d& vec) + void writeVec3dBig(const atVec3d& vec) { - atVec3d tmp = vec; - utility::BigDouble(tmp.vec[0]); - utility::BigDouble(tmp.vec[1]); - utility::BigDouble(tmp.vec[2]); - writeUBytes((atUint8*)&tmp, 24); + simd_doubles tmp(vec.simd); + tmp[0] = utility::BigDouble(tmp[0]); + tmp[1] = utility::BigDouble(tmp[1]); + tmp[2] = utility::BigDouble(tmp[2]); + writeUBytes((atUint8*)tmp.data(), 24); } - inline void writeValBig(const atVec3d& val) {writeVec3dBig(val);} + void writeValBig(const atVec3d& val) {writeVec3dBig(val);} /** @brief Writes an atVec4d (32 bytes) to the buffer and advances the buffer. * It also swaps the bytes depending on the platform and Stream settings. 
* * @param vec The value to write to the buffer */ - inline void writeVec4d(const atVec4d& vec) + void writeVec4d(const atVec4d& vec) { - atVec4d tmp = vec; + simd_doubles tmp(vec.simd); if (m_endian == Big) { - utility::BigDouble(tmp.vec[0]); - utility::BigDouble(tmp.vec[1]); - utility::BigDouble(tmp.vec[2]); - utility::BigDouble(tmp.vec[3]); + tmp[0] = utility::BigDouble(tmp[0]); + tmp[1] = utility::BigDouble(tmp[1]); + tmp[2] = utility::BigDouble(tmp[2]); + tmp[3] = utility::BigDouble(tmp[3]); } else { - utility::LittleDouble(tmp.vec[0]); - utility::LittleDouble(tmp.vec[1]); - utility::LittleDouble(tmp.vec[2]); - utility::LittleDouble(tmp.vec[3]); + tmp[0] = utility::LittleDouble(tmp[0]); + tmp[1] = utility::LittleDouble(tmp[1]); + tmp[2] = utility::LittleDouble(tmp[2]); + tmp[3] = utility::LittleDouble(tmp[3]); } - writeUBytes((atUint8*)&tmp, 32); + writeUBytes((atUint8*)tmp.data(), 32); } - inline void writeVal(const atVec4d& val) {writeVec4d(val);} + void writeVal(const atVec4d& val) {writeVec4d(val);} /** @brief Writes an atVec4d (32 bytes) to the buffer and advances the buffer. * It also swaps the bytes against little depending on the platform. * * @param vec The value to write to the buffer */ - inline void writeVec4dLittle(const atVec4d& vec) + void writeVec4dLittle(const atVec4d& vec) { - atVec4d tmp = vec; - utility::LittleDouble(tmp.vec[0]); - utility::LittleDouble(tmp.vec[1]); - utility::LittleDouble(tmp.vec[2]); - utility::LittleDouble(tmp.vec[3]); - writeUBytes((atUint8*)&tmp, 32); + simd_doubles tmp(vec.simd); + tmp[0] = utility::LittleDouble(tmp[0]); + tmp[1] = utility::LittleDouble(tmp[1]); + tmp[2] = utility::LittleDouble(tmp[2]); + tmp[3] = utility::LittleDouble(tmp[3]); + writeUBytes((atUint8*)tmp.data(), 32); } - inline void writeValLittle(const atVec4d& val) {writeVec4dLittle(val);} + void writeValLittle(const atVec4d& val) {writeVec4dLittle(val);} /** @brief Writes an atVec4d (32 bytes) to the buffer and advances the buffer. 
* It also swaps the bytes against big depending on the platform. * * @param vec The value to write to the buffer */ - inline void writeVec4dBig(const atVec4d& vec) + void writeVec4dBig(const atVec4d& vec) { - atVec4d tmp = vec; - utility::BigDouble(tmp.vec[0]); - utility::BigDouble(tmp.vec[1]); - utility::BigDouble(tmp.vec[2]); - utility::BigDouble(tmp.vec[3]); - writeUBytes((atUint8*)&tmp, 32); + simd_doubles tmp(vec.simd); + tmp[0] = utility::BigDouble(tmp[0]); + tmp[1] = utility::BigDouble(tmp[1]); + tmp[2] = utility::BigDouble(tmp[2]); + tmp[3] = utility::BigDouble(tmp[3]); + writeUBytes((atUint8*)tmp.data(), 32); } - inline void writeValBig(const atVec4d& val) {writeVec4dBig(val);} + void writeValBig(const atVec4d& val) {writeVec4dBig(val);} /** @brief Converts a UTF8 string to a wide-char string in the buffer and advances the buffer. * It also swaps the bytes depending on the platform and Stream settings. @@ -693,7 +694,7 @@ public: * * Endianness is set with setEndian */ - inline void writeStringAsWString(std::string_view str, atInt32 fixedLen = -1) + void writeStringAsWString(std::string_view str, atInt32 fixedLen = -1) { if (fixedLen == 0) return; @@ -751,7 +752,7 @@ public: * * Endianness is little */ - inline void writeStringAsWStringLittle(std::string_view str, atInt32 fixedLen = -1) + void writeStringAsWStringLittle(std::string_view str, atInt32 fixedLen = -1) { if (fixedLen == 0) return; @@ -809,7 +810,7 @@ public: * * Endianness is big */ - inline void writeStringAsWStringBig(std::string_view str, atInt32 fixedLen = -1) + void writeStringAsWStringBig(std::string_view str, atInt32 fixedLen = -1) { if (fixedLen == 0) return; @@ -865,7 +866,7 @@ public: * @param str The string to write to the buffer * @param fixedLen If not -1, the number of characters to zero-fill string to */ - inline void writeString(std::string_view str, atInt32 fixedLen = -1) + void writeString(std::string_view str, atInt32 fixedLen = -1) { if (fixedLen == 0) return; @@ -895,7 
+896,7 @@ public: } } } - inline void writeVal(std::string_view val) {writeString(val);} + void writeVal(std::string_view val) {writeString(val);} /** @brief Writes an wstring to the buffer and advances the buffer. * @@ -904,7 +905,7 @@ public: * * Endianness is set with setEndian */ - inline void writeWString(std::wstring_view str, atInt32 fixedLen = -1) + void writeWString(std::wstring_view str, atInt32 fixedLen = -1) { if (fixedLen == 0) return; @@ -934,7 +935,7 @@ public: } } } - inline void writeVal(std::wstring_view val) {writeWString(val);} + void writeVal(std::wstring_view val) {writeWString(val);} /** @brief Writes an wstring to the buffer and advances the buffer. * @@ -943,7 +944,7 @@ public: * * Endianness is little */ - inline void writeWStringLittle(std::wstring_view str, atInt32 fixedLen = -1) + void writeWStringLittle(std::wstring_view str, atInt32 fixedLen = -1) { if (fixedLen == 0) return; @@ -973,7 +974,7 @@ public: } } } - inline void writeValLittle(std::wstring_view val) {writeWStringLittle(val);} + void writeValLittle(std::wstring_view val) {writeWStringLittle(val);} /** @brief Writes an wstring to the buffer and advances the buffer. * @@ -982,7 +983,7 @@ public: * * Endianness is big */ - inline void writeWStringBig(std::wstring_view str, atInt32 fixedLen = -1) + void writeWStringBig(std::wstring_view str, atInt32 fixedLen = -1) { if (fixedLen == 0) return; @@ -1012,7 +1013,7 @@ public: } } } - inline void writeValBig(std::wstring_view val) {writeWStringBig(val);} + void writeValBig(std::wstring_view val) {writeWStringBig(val);} /** @brief Writes a u16string to the buffer and advances the buffer. 
* @@ -1021,7 +1022,7 @@ public: * * Endianness is big */ - inline void writeU16StringBig(std::u16string_view str, atInt32 fixedLen = -1) + void writeU16StringBig(std::u16string_view str, atInt32 fixedLen = -1) { if (fixedLen == 0) return; @@ -1051,7 +1052,7 @@ public: } } } - inline void writeValBig(std::u16string_view val) {writeU16StringBig(val);} + void writeValBig(std::u16string_view val) {writeU16StringBig(val);} /** @brief Writes a u16string to the buffer and advances the buffer. * @@ -1060,7 +1061,7 @@ public: * * Endianness is big */ - inline void writeU32StringBig(std::u32string_view str, atInt32 fixedLen = -1) + void writeU32StringBig(std::u32string_view str, atInt32 fixedLen = -1) { if (fixedLen == 0) return; @@ -1090,9 +1091,9 @@ public: } } } - inline void writeValBig(std::u32string_view val) {writeU32StringBig(val);} + void writeValBig(std::u32string_view val) {writeU32StringBig(val);} - inline void fill(atUint8 val, atUint64 length) + void fill(atUint8 val, atUint64 length) { if (length == 0) return; @@ -1102,7 +1103,7 @@ public: writeUBytes(tmp.get(), length); } - inline void fill(atInt8 val, atUint64 length) + void fill(atInt8 val, atUint64 length) {fill((atUint8)val, length);} /** @brief Performs automatic std::vector enumeration writes using numeric type T diff --git a/include/athena/Types.hpp b/include/athena/Types.hpp index 2d046b6..e54a4f6 100644 --- a/include/athena/Types.hpp +++ b/include/athena/Types.hpp @@ -12,106 +12,13 @@ using atInt64 = int64_t; using atUint64 = uint64_t; // Vector types -#if __SSE__ -#include -#ifndef _WIN32 -#include -#endif -#endif - -#include -#define AT_ALIGNED_ALLOCATOR \ -void* operator new(size_t bytes) noexcept \ -{return _mm_malloc(bytes, 16);} \ -void* operator new[](size_t bytes) noexcept \ -{return _mm_malloc(bytes, 16);} \ -void operator delete(void* buf) noexcept \ -{_mm_free(buf);} \ -void operator delete[](void* buf) noexcept \ -{_mm_free(buf);} - -#define AT_ALIGNED_ALLOCATOR32 \ -void* operator 
new(size_t bytes) noexcept \ -{return _mm_malloc(bytes, 32);} \ -void* operator new[](size_t bytes) noexcept \ -{return _mm_malloc(bytes, 32);} \ -void operator delete(void* buf) noexcept \ -{_mm_free(buf);} \ -void operator delete[](void* buf) noexcept \ -{_mm_free(buf);} - -typedef union alignas(16) -{ -#if __clang__ - float clangVec __attribute__((__vector_size__(8))); -#endif -#if __SSE__ - __m128 mVec128; - AT_ALIGNED_ALLOCATOR -#endif - float vec[2]; -} atVec2f; - -typedef union alignas(16) -{ -#if __clang__ - float clangVec __attribute__((__vector_size__(12))); -#endif -#if __SSE__ - __m128 mVec128; - AT_ALIGNED_ALLOCATOR -#endif - float vec[3]; -} atVec3f; - -typedef union alignas(16) -{ -#if __clang__ - float clangVec __attribute__((__vector_size__(16))); -#endif -#if __SSE__ - __m128 mVec128; - AT_ALIGNED_ALLOCATOR -#endif - float vec[4]; -} atVec4f; - -typedef union alignas(16) -{ -#if __SSE__ - __m128d mVec128; - AT_ALIGNED_ALLOCATOR -#endif - double vec[2]; -} atVec2d; - -typedef union alignas(32) -{ -#if __AVX__ - __m256d mVec256; - AT_ALIGNED_ALLOCATOR32 -#elif __SSE__ - AT_ALIGNED_ALLOCATOR -#endif -#if __SSE__ - __m128d mVec128[2]; -#endif - double vec[3]; -} atVec3d; - -typedef union alignas(32) -{ -#if __AVX__ - __m256d mVec256; - AT_ALIGNED_ALLOCATOR32 -#elif __SSE__ - AT_ALIGNED_ALLOCATOR -#endif -#if __SSE__ - __m128d mVec128[2]; -#endif - double vec[4]; -} atVec4d; +#include "simd/simd.hpp" +typedef struct { athena::simd simd; } atVec2f; +typedef struct { athena::simd simd; } atVec3f; +typedef struct { athena::simd simd; } atVec4f; +typedef struct { athena::simd simd; } atVec2d; +typedef struct { athena::simd simd; } atVec3d; +typedef struct { athena::simd simd; } atVec4d; #ifndef UNUSED #define UNUSED(x) ((void)x) diff --git a/include/athena/Utility.hpp b/include/athena/Utility.hpp index 50b64a5..c3b5d06 100644 --- a/include/athena/Utility.hpp +++ b/include/athena/Utility.hpp @@ -163,28 +163,28 @@ inline atUint64 BigUint64(atUint64& val) 
return val; } -inline float LittleFloat(float& val) +inline float LittleFloat(float val) { if (athena::utility::isSystemBigEndian()) val = athena::utility::swapFloat(val); return val; } -inline float BigFloat(float& val) +inline float BigFloat(float val) { if (!athena::utility::isSystemBigEndian()) val = athena::utility::swapFloat(val); return val; } -inline double LittleDouble(double& val) +inline double LittleDouble(double val) { if (athena::utility::isSystemBigEndian()) val = athena::utility::swapDouble(val); return val; } -inline double BigDouble(double& val) +inline double BigDouble(double val) { if (!athena::utility::isSystemBigEndian()) val = athena::utility::swapDouble(val); diff --git a/include/athena/simd/parallelism_v2_simd.hpp b/include/athena/simd/parallelism_v2_simd.hpp new file mode 100644 index 0000000..14606ee --- /dev/null +++ b/include/athena/simd/parallelism_v2_simd.hpp @@ -0,0 +1,1768 @@ +// -*- C++ -*- +//===------------------------------- simd ---------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +#pragma once + +/* + experimental/simd synopsis + +namespace std::experimental { + +inline namespace parallelism_v2 { + +namespace simd_abi { + +struct scalar {}; +template struct fixed_size {}; +template inline constexpr int max_fixed_size = implementation-defined; +template using compatible = implementation-defined; +template using native = implementation-defined; + +} // simd_abi + +struct element_aligned_tag {}; +struct vector_aligned_tag {}; +template struct overaligned_tag {}; +inline constexpr element_aligned_tag element_aligned{}; +inline constexpr vector_aligned_tag vector_aligned{}; +template inline constexpr overaligned_tag overaligned{}; + +// traits [simd.traits] +template struct is_abi_tag; +template inline constexpr bool is_abi_tag_v = is_abi_tag::value; + +template struct is_simd; +template inline constexpr bool is_simd_v = is_simd::value; + +template struct is_simd_mask; +template inline constexpr bool is_simd_mask_v = is_simd_mask::value; + +template struct is_simd_flag_type; +template inline constexpr bool is_simd_flag_type_v = is_simd_flag_type::value; + +template struct abi_for_size { using type = see below; }; +template using abi_for_size_t = typename abi_for_size::type; + +template > struct simd_size; +template > +inline constexpr size_t simd_size_v = simd_size::value; + +template struct memory_alignment; +template +inline constexpr size_t memory_alignment_v = memory_alignment::value; + +// class template simd [simd.class] +template > class simd; +template using native_simd = simd>; +template using fixed_size_simd = simd>; + +// class template simd_mask [simd.mask.class] +template > class simd_mask; +template using native_simd_mask = simd_mask>; +template using fixed_size_simd_mask = simd_mask>; + +// casts [simd.casts] +template see below simd_cast(const simd&); +template see below static_simd_cast(const simd&); + +template +fixed_size_simd> 
to_fixed_size(const simd&) noexcept; +template +fixed_size_simd_mask> to_fixed_size(const simd_mask&) noexcept; +template native_simd to_native(const fixed_size_simd&) noexcept; +template +native_simd_mask to_native(const fixed_size_simd_mask> &) noexcept; +template simd to_compatible(const fixed_size_simd&) noexcept; +template simd_mask to_compatible(const fixed_size_simd_mask&) noexcept; + +template +tuple>...> split(const simd&); +template +tuple>...> split(const simd_mask&); +template +array / V::size()> split( +const simd&); +template +array / V::size()> split( +const simd_mask&); + +template +simd + ...)>> concat(const simd&...); +template +simd_mask + ...)>> concat(const simd_mask&...); + +// reductions [simd.mask.reductions] +template bool all_of(const simd_mask&) noexcept; +template bool any_of(const simd_mask&) noexcept; +template bool none_of(const simd_mask&) noexcept; +template bool some_of(const simd_mask&) noexcept; +template int popcount(const simd_mask&) noexcept; +template int find_first_set(const simd_mask&); +template int find_last_set(const simd_mask&); + +bool all_of(see below) noexcept; +bool any_of(see below) noexcept; +bool none_of(see below) noexcept; +bool some_of(see below) noexcept; +int popcount(see below) noexcept; +int find_first_set(see below) noexcept; +int find_last_set(see below) noexcept; + +// masked assignment [simd.whereexpr] +template class const_where_expression; +template class where_expression; + +// masked assignment [simd.mask.where] +template struct nodeduce { using type = T; }; // exposition only + +template using nodeduce_t = typename nodeduce::type; // exposition only + +template +where_expression, simd> +where(const typename simd::mask_type&, simd&) noexcept; + +template +const_where_expression, const simd> +where(const typename simd::mask_type&, const simd&) noexcept; + +template +where_expression, simd_mask> +where(const nodeduce_t>&, simd_mask&) noexcept; + +template +const_where_expression, const simd_mask> 
+where(const nodeduce_t>&, const simd_mask&) noexcept; + +template where_expression where(see below k, T& d) noexcept; + +template +const_where_expression where(see below k, const T& d) noexcept; + +// reductions [simd.reductions] +template > +T reduce(const simd&, BinaryOperation = BinaryOperation()); + +template +typename V::value_type reduce(const const_where_expression& x, +typename V::value_type neutral_element, BinaryOperation binary_op); + +template +typename V::value_type reduce(const const_where_expression& x, plus<> binary_op = plus<>()); + +template +typename V::value_type reduce(const const_where_expression& x, multiplies<> binary_op); + +template +typename V::value_type reduce(const const_where_expression& x, bit_and<> binary_op); + +template +typename V::value_type reduce(const const_where_expression& x, bit_or<> binary_op); + +template +typename V::value_type reduce(const const_where_expression& x, bit_xor<> binary_op); + +template T hmin(const simd&); +template T hmin(const const_where_expression&); +template T hmax(const simd&); +template T hmax(const const_where_expression&); + +// algorithms [simd.alg] +template simd min(const simd&, const simd&) noexcept; + +template simd max(const simd&, const simd&) noexcept; + +template +std::pair, simd> minmax(const simd&, const simd&) noexcept; + +template +simd clamp(const simd& v, const simd& lo, const simd& hi); + +// [simd.whereexpr] +template +class const_where_expression { + const M& mask; // exposition only + T& data; // exposition only +public: + const_where_expression(const const_where_expression&) = delete; + const_where_expression& operator=(const const_where_expression&) = delete; + remove_const_t operator-() const &&; + template void copy_to(U* mem, Flags f) const &&; +}; + +template +class where_expression : public const_where_expression { +public: + where_expression(const where_expression&) = delete; + where_expression& operator=(const where_expression&) = delete; + template void 
operator=(U&& x); + template void operator+=(U&& x); + template void operator-=(U&& x); + template void operator*=(U&& x); + template void operator/=(U&& x); + template void operator%=(U&& x); + template void operator&=(U&& x); + template void operator|=(U&& x); + template void operator^=(U&& x); + template void operator<<=(U&& x); + template void operator>>=(U&& x); + void operator++(); + void operator++(int); + void operator--(); + void operator--(int); + template void copy_from(const U* mem, Flags); +}; + +// [simd.class] +template class simd { +public: + using value_type = T; + using reference = see below; + using mask_type = simd_mask; + + using abi_type = Abi; + static constexpr size_t size() noexcept; + simd() = default; + + // implicit type conversion constructor + template simd(const simd>&); + + // implicit broadcast constructor (see below for constraints) + template simd(U&& value); + + // generator constructor (see below for constraints) + template explicit simd(G&& gen); + + // load constructor + template simd(const U* mem, Flags f); + + // loads [simd.load] + template void copy_from(const U* mem, Flags f); + + // stores [simd.store] + template void copy_to(U* mem, Flags f) const; + + // scalar access [simd.subscr] + reference operator[](size_t); + value_type operator[](size_t) const; + + // unary operators [simd.unary] + simd& operator++(); + simd operator++(int); + simd& operator--(); + simd operator--(int); + mask_type operator!() const; + simd operator~() const; // see below + simd operator+() const; + simd operator-() const; + + // binary operators [simd.binary] + friend simd operator+ (const simd&, const simd&); + friend simd operator- (const simd&, const simd&); + friend simd operator* (const simd&, const simd&); + friend simd operator/ (const simd&, const simd&); + friend simd operator% (const simd&, const simd&); + friend simd operator& (const simd&, const simd&); + friend simd operator| (const simd&, const simd&); + friend simd operator^ 
(const simd&, const simd&); + friend simd operator<<(const simd&, const simd&); + friend simd operator>>(const simd&, const simd&); + friend simd operator<<(const simd&, int); + friend simd operator>>(const simd&, int); + + // compound assignment [simd.cassign] + friend simd& operator+= (simd&, const simd&); + friend simd& operator-= (simd&, const simd&); + friend simd& operator*= (simd&, const simd&); + friend simd& operator/= (simd&, const simd&); + friend simd& operator%= (simd&, const simd&); + + friend simd& operator&= (simd&, const simd&); + friend simd& operator|= (simd&, const simd&); + friend simd& operator^= (simd&, const simd&); + friend simd& operator<<=(simd&, const simd&); + friend simd& operator>>=(simd&, const simd&); + friend simd& operator<<=(simd&, int); + friend simd& operator>>=(simd&, int); + + // compares [simd.comparison] + friend mask_type operator==(const simd&, const simd&); + friend mask_type operator!=(const simd&, const simd&); + friend mask_type operator>=(const simd&, const simd&); + friend mask_type operator<=(const simd&, const simd&); + friend mask_type operator> (const simd&, const simd&); + friend mask_type operator< (const simd&, const simd&); +}; + +// [simd.math] +template using scharv = simd; // exposition only +template using shortv = simd; // exposition only +template using intv = simd; // exposition only +template using longv = simd; // exposition only +template using llongv = simd; // exposition only +template using floatv = simd; // exposition only +template using doublev = simd; // exposition only +template using ldoublev = simd; // exposition only +template using samesize = fixed_size_simd; // exposition only + +template floatv acos(floatv x); +template doublev acos(doublev x); +template ldoublev acos(ldoublev x); + +template floatv asin(floatv x); +template doublev asin(doublev x); +template ldoublev asin(ldoublev x); + +template floatv atan(floatv x); +template doublev atan(doublev x); +template ldoublev 
atan(ldoublev x); + +template floatv atan2(floatv y, floatv x); +template doublev atan2(doublev y, doublev x); +template ldoublev atan2(ldoublev y, ldoublev x); + +template floatv cos(floatv x); +template doublev cos(doublev x); +template ldoublev cos(ldoublev x); + +template floatv sin(floatv x); +template doublev sin(doublev x); +template ldoublev sin(ldoublev x); + +template floatv tan(floatv x); +template doublev tan(doublev x); +template ldoublev tan(ldoublev x); + +template floatv acosh(floatv x); +template doublev acosh(doublev x); +template ldoublev acosh(ldoublev x); + +template floatv asinh(floatv x); +template doublev asinh(doublev x); +template ldoublev asinh(ldoublev x); + +template floatv atanh(floatv x); +template doublev atanh(doublev x); +template ldoublev atanh(ldoublev x); + +template floatv cosh(floatv x); +template doublev cosh(doublev x); +template ldoublev cosh(ldoublev x); + +template floatv sinh(floatv x); +template doublev sinh(doublev x); +template ldoublev sinh(ldoublev x); + +template floatv tanh(floatv x); +template doublev tanh(doublev x); +template ldoublev tanh(ldoublev x); + +template floatv exp(floatv x); +template doublev exp(doublev x); +template ldoublev exp(ldoublev x); + +template floatv exp2(floatv x); +template doublev exp2(doublev x); +template ldoublev exp2(ldoublev x); + +template floatv expm1(floatv x); +template doublev expm1(doublev x); +template ldoublev expm1(ldoublev x); + +template floatv frexp(floatv value, samesize>* exp); +template doublev frexp(doublev value, samesize>* exp); +template ldoublev frexp(ldoublev value, samesize>* exp); + +template samesize> ilogb(floatv x); +template samesize> ilogb(doublev x); +template samesize> ilogb(ldoublev x); + +template floatv ldexp(floatv x, samesize> exp); +template doublev ldexp(doublev x, samesize> exp); +template ldoublev ldexp(ldoublev x, samesize> exp); + +template floatv log(floatv x); +template doublev log(doublev x); +template ldoublev log(ldoublev x); + 
+template floatv log10(floatv x); +template doublev log10(doublev x); +template ldoublev log10(ldoublev x); + +template floatv log1p(floatv x); +template doublev log1p(doublev x); +template ldoublev log1p(ldoublev x); + +template floatv log2(floatv x); +template doublev log2(doublev x); +template ldoublev log2(ldoublev x); + +template floatv logb(floatv x); +template doublev logb(doublev x); +template ldoublev logb(ldoublev x); + +template floatv modf(floatv value, floatv* iptr); +template doublev modf(doublev value, doublev* iptr); +template ldoublev modf(ldoublev value, ldoublev* iptr); + +template floatv scalbn(floatv x, samesize> n); +template doublev scalbn(doublev x, samesize> n); +template ldoublev scalbn(ldoublev x, samesize> n); +template floatv scalbln(floatv x, samesize> n); +template doublev scalbln(doublev x, samesize> n); +template ldoublev scalbln(ldoublev x, samesize> n); + +template floatv cbrt(floatv x); +template doublev cbrt(doublev x); +template ldoublev cbrt(ldoublev x); + +template scharv abs(scharv j); +template shortv abs(shortv j); +template intv abs(intv j); +template longv abs(longv j); +template llongv abs(llongv j); +template floatv abs(floatv j); +template doublev abs(doublev j); +template ldoublev abs(ldoublev j); + +template floatv hypot(floatv x, floatv y); +template doublev hypot(doublev x, doublev y); +template ldoublev hypot(doublev x, doublev y); +template floatv hypot(floatv x, floatv y, floatv z); +template doublev hypot(doublev x, doublev y, doublev z); +template ldoublev hypot(ldoublev x, ldoublev y, ldoublev z); + +template floatv pow(floatv x, floatv y); +template doublev pow(doublev x, doublev y); +template ldoublev pow(ldoublev x, ldoublev y); + +template floatv sqrt(floatv x); +template doublev sqrt(doublev x); +template ldoublev sqrt(ldoublev x); + +template floatv erf(floatv x); +template doublev erf(doublev x); +template ldoublev erf(ldoublev x); +template floatv erfc(floatv x); +template doublev erfc(doublev x); 
+template ldoublev erfc(ldoublev x); + +template floatv lgamma(floatv x); +template doublev lgamma(doublev x); +template ldoublev lgamma(ldoublev x); + +template floatv tgamma(floatv x); +template doublev tgamma(doublev x); +template ldoublev tgamma(ldoublev x); + +template floatv ceil(floatv x); +template doublev ceil(doublev x); +template ldoublev ceil(ldoublev x); + +template floatv floor(floatv x); +template doublev floor(doublev x); +template ldoublev floor(ldoublev x); + +template floatv nearbyint(floatv x); +template doublev nearbyint(doublev x); +template ldoublev nearbyint(ldoublev x); + +template floatv rint(floatv x); +template doublev rint(doublev x); +template ldoublev rint(ldoublev x); + +template samesize> lrint(floatv x); +template samesize> lrint(doublev x); +template samesize> lrint(ldoublev x); +template samesize> llrint(floatv x); +template samesize> llrint(doublev x); +template samesize> llrint(ldoublev x); + +template floatv round(floatv x); +template doublev round(doublev x); +template ldoublev round(ldoublev x); +template samesize> lround(floatv x); +template samesize> lround(doublev x); +template samesize> lround(ldoublev x); +template samesize> llround(floatv x); +template samesize> llround(doublev x); +template samesize> llround(ldoublev x); + +template floatv trunc(floatv x); +template doublev trunc(doublev x); +template ldoublev trunc(ldoublev x); + +template floatv fmod(floatv x, floatv y); +template doublev fmod(doublev x, doublev y); +template ldoublev fmod(ldoublev x, ldoublev y); + +template floatv remainder(floatv x, floatv y); +template doublev remainder(doublev x, doublev y); +template ldoublev remainder(ldoublev x, ldoublev y); + +template floatv remquo(floatv x, floatv y, samesize>* quo); +template doublev remquo(doublev x, doublev y, samesize>* quo); +template ldoublev remquo(ldoublev x, ldoublev y, samesize>* quo); + +template floatv copysign(floatv x, floatv y); +template doublev copysign(doublev x, doublev y); +template 
ldoublev copysign(ldoublev x, ldoublev y); + +template doublev nan(const char* tagp); +template floatv nanf(const char* tagp); +template ldoublev nanl(const char* tagp); + +template floatv nextafter(floatv x, floatv y); +template doublev nextafter(doublev x, doublev y); +template ldoublev nextafter(ldoublev x, ldoublev y); + +template floatv nexttoward(floatv x, ldoublev y); +template doublev nexttoward(doublev x, ldoublev y); +template ldoublev nexttoward(ldoublev x, ldoublev y); + +template floatv fdim(floatv x, floatv y); +template doublev fdim(doublev x, doublev y); +template ldoublev fdim(ldoublev x, ldoublev y); + +template floatv fmax(floatv x, floatv y); +template doublev fmax(doublev x, doublev y); +template ldoublev fmax(ldoublev x, ldoublev y); + +template floatv fmin(floatv x, floatv y); +template doublev fmin(doublev x, doublev y); +template ldoublev fmin(ldoublev x, ldoublev y); + +template floatv fma(floatv x, floatv y, floatv z); +template doublev fma(doublev x, doublev y, doublev z); +template ldoublev fma(ldoublev x, ldoublev y, ldoublev z); + +template samesize> fpclassify(floatv x); +template samesize> fpclassify(doublev x); +template samesize> fpclassify(ldoublev x); + +template simd_mask isfinite(floatv x); +template simd_mask isfinite(doublev x); +template simd_mask isfinite(ldoublev x); + +template simd_mask isinf(floatv x); +template simd_mask isinf(doublev x); +template simd_mask isinf(ldoublev x); + +template simd_mask isnan(floatv x); +template simd_mask isnan(doublev x); +template simd_mask isnan(ldoublev x); + +template simd_mask isnormal(floatv x); +template simd_mask isnormal(doublev x); +template simd_mask isnormal(ldoublev x); + +template simd_mask signbit(floatv x); +template simd_mask signbit(doublev x); +template simd_mask signbit(ldoublev x); + +template simd_mask isgreater(floatv x, floatv y); +template simd_mask isgreater(doublev x, doublev y); +template simd_mask isgreater(ldoublev x, ldoublev y); + +template simd_mask 
isgreaterequal(floatv x, floatv y); +template simd_mask isgreaterequal(doublev x, doublev y); +template simd_mask isgreaterequal(ldoublev x, ldoublev y); + +template simd_mask isless(floatv x, floatv y); +template simd_mask isless(doublev x, doublev y); +template simd_mask isless(ldoublev x, ldoublev y); + +template simd_mask islessequal(floatv x, floatv y); +template simd_mask islessequal(doublev x, doublev y); +template simd_mask islessequal(ldoublev x, ldoublev y); + +template simd_mask islessgreater(floatv x, floatv y); +template simd_mask islessgreater(doublev x, doublev y); +template simd_mask islessgreater(ldoublev x, ldoublev y); + +template simd_mask isunordered(floatv x, floatv y); +template simd_mask isunordered(doublev x, doublev y); +template simd_mask isunordered(ldoublev x, ldoublev y); + +template struct simd_div_t { V quot, rem; }; +template simd_div_t> div(scharv numer, scharv denom); +template simd_div_t> div(shortv numer, shortv denom); +template simd_div_t> div(intv numer, intv denom); +template simd_div_t> div(longv numer, longv denom); +template simd_div_t> div(llongv numer, llongv denom); + +// [simd.mask.class] +template +class simd_mask { +public: + using value_type = bool; + using reference = see below; + using simd_type = simd; + using abi_type = Abi; + static constexpr size_t size() noexcept; + simd_mask() = default; + + // broadcast constructor + explicit simd_mask(value_type) noexcept; + + // implicit type conversion constructor + template simd_mask(const simd_mask>&) noexcept; + + // load constructor + template simd_mask(const value_type* mem, Flags); + + // loads [simd.mask.copy] + template void copy_from(const value_type* mem, Flags); + template void copy_to(value_type* mem, Flags) const; + + // scalar access [simd.mask.subscr] + reference operator[](size_t); + value_type operator[](size_t) const; + + // unary operators [simd.mask.unary] + simd_mask operator!() const noexcept; + + // simd_mask binary operators [simd.mask.binary] + 
friend simd_mask operator&&(const simd_mask&, const simd_mask&) noexcept; + friend simd_mask operator||(const simd_mask&, const simd_mask&) noexcept; + friend simd_mask operator& (const simd_mask&, const simd_mask&) noexcept; + friend simd_mask operator| (const simd_mask&, const simd_mask&) noexcept; + friend simd_mask operator^ (const simd_mask&, const simd_mask&) noexcept; + + // simd_mask compound assignment [simd.mask.cassign] + friend simd_mask& operator&=(simd_mask&, const simd_mask&) noexcept; + friend simd_mask& operator|=(simd_mask&, const simd_mask&) noexcept; + friend simd_mask& operator^=(simd_mask&, const simd_mask&) noexcept; + + // simd_mask compares [simd.mask.comparison] + friend simd_mask operator==(const simd_mask&, const simd_mask&) noexcept; + friend simd_mask operator!=(const simd_mask&, const simd_mask&) noexcept; +}; + +} // parallelism_v2 +} // std::experimental + +*/ + +#include +#include +#include +#include +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +#pragma GCC system_header +#endif + +namespace athena::_simd { + +enum class _StorageKind { + _Scalar, + _Array, + _VecExt, +}; + +template <_StorageKind __kind, int _Np> +struct __simd_abi {}; + +template +class __simd_storage {}; + +template +class __simd_mask_storage {}; + +template +class __simd_storage<_Tp, __simd_abi<_StorageKind::_Scalar, 1>> { + _Tp __storage_; + + template + friend struct simd; + + template + friend struct simd_mask; + +public: + _Tp __get(size_t __index) const noexcept { return (&__storage_)[__index]; }; + void __set(size_t __index, _Tp __val) noexcept { + (&__storage_)[__index] = __val; + } +}; + +#ifndef _LIBCPP_HAS_NO_VECTOR_EXTENSION + +constexpr size_t __floor_pow_of_2(size_t __val) { + return ((__val - 1) & __val) == 0 ? __val + : __floor_pow_of_2((__val - 1) & __val); +} + +constexpr size_t __ceil_pow_of_2(size_t __val) { + return __val == 1 ? 
1 : __floor_pow_of_2(__val - 1) << 1; +} + +template +struct __vec_ext_traits { +#if !defined(_LIBCPP_COMPILER_CLANG) + typedef _Tp type __attribute__((vector_size(__ceil_pow_of_2(__bytes)))); +#endif +}; + +#if defined(_LIBCPP_COMPILER_CLANG) +#define _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, _NUM_ELEMENT) \ + template <> \ + struct __vec_ext_traits<_TYPE, sizeof(_TYPE) * _NUM_ELEMENT> { \ + using type = \ + _TYPE __attribute__((vector_size(sizeof(_TYPE) * _NUM_ELEMENT))); \ + } + +#define _LIBCPP_SPECIALIZE_VEC_EXT_32(_TYPE) \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 1); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 2); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 3); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 4); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 5); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 6); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 7); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 8); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 9); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 10); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 11); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 12); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 13); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 14); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 15); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 16); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 17); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 18); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 19); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 20); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 21); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 22); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 23); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 24); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 25); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 26); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 27); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 28); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 29); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 30); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 31); \ + _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 32); + +_LIBCPP_SPECIALIZE_VEC_EXT_32(char); +_LIBCPP_SPECIALIZE_VEC_EXT_32(char16_t); 
+_LIBCPP_SPECIALIZE_VEC_EXT_32(char32_t); +_LIBCPP_SPECIALIZE_VEC_EXT_32(wchar_t); +_LIBCPP_SPECIALIZE_VEC_EXT_32(signed char); +_LIBCPP_SPECIALIZE_VEC_EXT_32(signed short); +_LIBCPP_SPECIALIZE_VEC_EXT_32(signed int); +_LIBCPP_SPECIALIZE_VEC_EXT_32(signed long); +_LIBCPP_SPECIALIZE_VEC_EXT_32(signed long long); +_LIBCPP_SPECIALIZE_VEC_EXT_32(unsigned char); +_LIBCPP_SPECIALIZE_VEC_EXT_32(unsigned short); +_LIBCPP_SPECIALIZE_VEC_EXT_32(unsigned int); +_LIBCPP_SPECIALIZE_VEC_EXT_32(unsigned long); +_LIBCPP_SPECIALIZE_VEC_EXT_32(unsigned long long); +_LIBCPP_SPECIALIZE_VEC_EXT_32(float); +_LIBCPP_SPECIALIZE_VEC_EXT_32(double); +_LIBCPP_SPECIALIZE_VEC_EXT_32(long double); + +#undef _LIBCPP_SPECIALIZE_VEC_EXT_32 +#undef _LIBCPP_SPECIALIZE_VEC_EXT +#endif + +template +class __simd_storage<_Tp, __simd_abi<_StorageKind::_VecExt, __num_element>> { + using _StorageType = + typename __vec_ext_traits<_Tp, sizeof(_Tp) * __num_element>::type; + + _StorageType __storage_; + + template + friend struct simd; + + template + friend struct simd_mask; + +public: + _Tp __get(size_t __index) const noexcept { return __storage_[__index]; }; + void __set(size_t __index, _Tp __val) noexcept { + __storage_[__index] = __val; + } +}; + +#endif // _LIBCPP_HAS_NO_VECTOR_EXTENSION + +template +class __simd_reference { + static_assert(std::is_same<_Vp, _Tp>::value, ""); + + template + friend struct simd; + + template + friend struct simd_mask; + + __simd_storage<_Tp, _Abi>* __ptr_; + size_t __index_; + + __simd_reference(__simd_storage<_Tp, _Abi>* __ptr, size_t __index) + : __ptr_(__ptr), __index_(__index) {} + + __simd_reference(const __simd_reference&) = default; + +public: + __simd_reference() = delete; + __simd_reference& operator=(const __simd_reference& other) = delete; + + operator _Vp() const { return __ptr_->__get(__index_); } + + __simd_reference operator=(_Vp __value) && { + __ptr_->__set(__index_, __value); + return *this; + } + + __simd_reference operator++() && { + return 
std::move(*this) = __ptr_->__get(__index_) + 1; + } + + _Vp operator++(int) && { + auto __val = __ptr_->__get(__index_); + __ptr_->__set(__index_, __val + 1); + return __val; + } + + __simd_reference operator--() && { + return std::move(*this) = __ptr_->__get(__index_) - 1; + } + + _Vp operator--(int) && { + auto __val = __ptr_->__get(__index_); + __ptr_->__set(__index_, __val - 1); + return __val; + } + + __simd_reference operator+=(_Vp __value) && { + return std::move(*this) = __ptr_->__get(__index_) + __value; + } + + __simd_reference operator-=(_Vp __value) && { + return std::move(*this) = __ptr_->__get(__index_) - __value; + } + + __simd_reference operator*=(_Vp __value) && { + return std::move(*this) = __ptr_->__get(__index_) * __value; + } + + __simd_reference operator/=(_Vp __value) && { + return std::move(*this) = __ptr_->__get(__index_) / __value; + } + + __simd_reference operator%=(_Vp __value) && { + return std::move(*this) = __ptr_->__get(__index_) % __value; + } + + __simd_reference operator>>=(_Vp __value) && { + return std::move(*this) = __ptr_->__get(__index_) >> __value; + } + + __simd_reference operator<<=(_Vp __value) && { + return std::move(*this) = __ptr_->__get(__index_) << __value; + } + + __simd_reference operator&=(_Vp __value) && { + return std::move(*this) = __ptr_->__get(__index_) & __value; + } + + __simd_reference operator|=(_Vp __value) && { + return std::move(*this) = __ptr_->__get(__index_) | __value; + } + + __simd_reference operator^=(_Vp __value) && { + return std::move(*this) = __ptr_->__get(__index_) ^ __value; + } + + bool operator<(_Vp __value) const { + return __ptr_->__get(__index_) < __value; + } + + bool operator<=(_Vp __value) const { + return __ptr_->__get(__index_) <= __value; + } + + bool operator>(_Vp __value) const { + return __ptr_->__get(__index_) > __value; + } + + bool operator>=(_Vp __value) const { + return __ptr_->__get(__index_) >= __value; + } + + bool operator==(_Vp __value) const { + return 
__ptr_->__get(__index_) == __value; + } + + bool operator!=(_Vp __value) const { + return __ptr_->__get(__index_) != __value; + } +}; + +template +class __simd_mask_reference { + template + friend struct simd; + + template + friend struct simd_mask; + + __simd_mask_storage<_Tp, _Abi>* __ptr_; + size_t __index_; + + __simd_mask_reference(__simd_mask_storage<_Tp, _Abi>* __ptr, size_t __index) + : __ptr_(__ptr), __index_(__index) {} + + __simd_mask_reference(const __simd_mask_reference&) = default; + +public: + __simd_mask_reference() = delete; + __simd_mask_reference& operator=(const __simd_mask_reference&) = delete; + + operator bool() const { return __ptr_->__get(__index_); } + + __simd_mask_reference operator=(bool __value) && { + __ptr_->__set(__index_, __value); + return *this; + } +}; + +template +constexpr decltype(_To{std::declval<_From>()}, true) +__is_non_narrowing_convertible_impl(_From) { + return true; +} + +template +constexpr bool __is_non_narrowing_convertible_impl(...) { + return false; +} + +template +constexpr typename std::enable_if::value && + std::is_arithmetic<_From>::value, + bool>::type +__is_non_narrowing_arithmetic_convertible() { + return __is_non_narrowing_convertible_impl<_To>(_From{}); +} + +template +constexpr typename std::enable_if::value && + std::is_arithmetic<_From>::value), + bool>::type +__is_non_narrowing_arithmetic_convertible() { + return false; +} + +template +constexpr _Tp __variadic_sum() { + return _Tp{}; +} + +template +constexpr _Tp __variadic_sum(_Up __first, _Args... 
__rest) { + return static_cast<_Tp>(__first) + __variadic_sum<_Tp>(__rest...); +} + +template +struct __nodeduce { + using type = _Tp; +}; + +template +constexpr bool __vectorizable() { + return std::is_arithmetic<_Tp>::value && !std::is_const<_Tp>::value && + !std::is_volatile<_Tp>::value && !std::is_same<_Tp, bool>::value; +} + +} +namespace athena::_simd::simd_abi { + +using scalar = __simd_abi<_StorageKind::_Scalar, 1>; + +template +using fixed_size = __simd_abi<_StorageKind::_Array, _Np>; + +template +inline constexpr size_t max_fixed_size = 32; + +template +using compatible = fixed_size<16 / sizeof(_Tp)>; + +#ifndef _LIBCPP_HAS_NO_VECTOR_EXTENSION +template +using native = __simd_abi<_StorageKind::_VecExt, 16 / sizeof(_Tp)>; +#else +/* Without vector extensions, native is the 16-byte fixed_size ABI. fixed_size takes only the element count, so the element type must not be passed as a template argument. */ +template +using native = + fixed_size<16 / sizeof(_Tp)>; +#endif // _LIBCPP_HAS_NO_VECTOR_EXTENSION + +} +namespace athena::_simd { + +template > +class simd; +template > +class simd_mask; + +struct element_aligned_tag {}; +struct vector_aligned_tag {}; +template +struct overaligned_tag {}; +inline constexpr element_aligned_tag element_aligned{}; +inline constexpr vector_aligned_tag vector_aligned{}; +template +inline constexpr overaligned_tag<_Np> overaligned{}; + +// traits [simd.traits] +template +struct is_abi_tag : std::integral_constant {}; + +template <_StorageKind __kind, int _Np> +struct is_abi_tag<__simd_abi<__kind, _Np>> + : std::integral_constant {}; + +template +struct is_simd : std::integral_constant {}; + +template +struct is_simd> : std::integral_constant {}; + +template +struct is_simd_mask : std::integral_constant {}; + +template +struct is_simd_mask> : std::integral_constant { +}; + +template +struct is_simd_flag_type : std::integral_constant {}; + +template <> +struct is_simd_flag_type + : std::integral_constant {}; + +template <> +struct is_simd_flag_type + : std::integral_constant {}; + +template +struct is_simd_flag_type> + : std::integral_constant {}; + +template +inline constexpr bool is_abi_tag_v = 
is_abi_tag<_Tp>::value; +template +inline constexpr bool is_simd_v = is_simd<_Tp>::value; +template +inline constexpr bool is_simd_mask_v = is_simd_mask<_Tp>::value; +template +inline constexpr bool is_simd_flag_type_v = + is_simd_flag_type<_Tp>::value; +template +struct abi_for_size { + using type = simd_abi::fixed_size<_Np>; +}; +template +using abi_for_size_t = typename abi_for_size<_Tp, _Np>::type; + +template > +struct simd_size; + +template +struct simd_size<_Tp, __simd_abi<__kind, _Np>> + : std::integral_constant { + static_assert( + std::is_arithmetic<_Tp>::value && + !std::is_same::type, bool>::value, + "Element type should be vectorizable"); +}; + +// TODO: implement it. +template +struct memory_alignment; + +template > +inline constexpr size_t simd_size_v = simd_size<_Tp, _Abi>::value; + +template +inline constexpr size_t memory_alignment_v = + memory_alignment<_Tp, _Up>::value; + +// class template simd [simd.class] +template +using native_simd = simd<_Tp, simd_abi::native<_Tp>>; +template +using fixed_size_simd = simd<_Tp, simd_abi::fixed_size<_Np>>; + +// class template simd_mask [simd.mask.class] +template +using native_simd_mask = simd_mask<_Tp, simd_abi::native<_Tp>>; + +template +using fixed_size_simd_mask = simd_mask<_Tp, simd_abi::fixed_size<_Np>>; + +// casts [simd.casts] +template +struct __static_simd_cast_traits { + template + static simd<_Tp, _Abi> __apply(const simd<_Up, _Abi>& __v); +}; + +template +struct __static_simd_cast_traits> { + template + static typename std::enable_if::size() == + simd<_Tp, _NewAbi>::size(), + simd<_Tp, _NewAbi>>::type + __apply(const simd<_Up, _Abi>& __v); +}; + +template +struct __simd_cast_traits { + template + static typename std::enable_if< + __is_non_narrowing_arithmetic_convertible<_Up, _Tp>(), + simd<_Tp, _Abi>>::type + __apply(const simd<_Up, _Abi>& __v); +}; + +template +struct __simd_cast_traits> { + template + static typename std::enable_if< + __is_non_narrowing_arithmetic_convertible<_Up, _Tp>() && 
+ simd<_Up, _Abi>::size() == simd<_Tp, _NewAbi>::size(), + simd<_Tp, _NewAbi>>::type + __apply(const simd<_Up, _Abi>& __v); +}; + +template +auto simd_cast(const simd<_Up, _Abi>& __v) + -> decltype(__simd_cast_traits<_Tp>::__apply(__v)) { + return __simd_cast_traits<_Tp>::__apply(__v); +} + +template +auto static_simd_cast(const simd<_Up, _Abi>& __v) + -> decltype(__static_simd_cast_traits<_Tp>::__apply(__v)) { + return __static_simd_cast_traits<_Tp>::__apply(__v); +} + +template +fixed_size_simd<_Tp, simd_size<_Tp, _Abi>::value> +to_fixed_size(const simd<_Tp, _Abi>&) noexcept; + +template +fixed_size_simd_mask<_Tp, simd_size<_Tp, _Abi>::value> +to_fixed_size(const simd_mask<_Tp, _Abi>&) noexcept; + +template +native_simd<_Tp> to_native(const fixed_size_simd<_Tp, _Np>&) noexcept; + +template +native_simd_mask<_Tp> to_native(const fixed_size_simd_mask<_Tp, _Np>&) noexcept; + +template +simd<_Tp> to_compatible(const fixed_size_simd<_Tp, _Np>&) noexcept; + +template +simd_mask<_Tp> to_compatible(const fixed_size_simd_mask<_Tp, _Np>&) noexcept; + +template +tuple>...> split(const simd<_Tp, _Abi>&); + +template +tuple>...> +split(const simd_mask<_Tp, _Abi>&); + +template +array<_SimdType, simd_size::value / + _SimdType::size()> +split(const simd&); + +template +array<_SimdType, simd_size::value / + _SimdType::size()> +split(const simd_mask&); + +template +simd<_Tp, abi_for_size_t<_Tp, __variadic_sum(simd_size<_Tp, _Abis>::value...)>> +concat(const simd<_Tp, _Abis>&...); + +template +simd_mask<_Tp, + abi_for_size_t<_Tp, __variadic_sum(simd_size<_Tp, _Abis>::value...)>> +concat(const simd_mask<_Tp, _Abis>&...); + +// reductions [simd.mask.reductions] +template +bool all_of(const simd_mask<_Tp, _Abi>&) noexcept; +template +bool any_of(const simd_mask<_Tp, _Abi>&) noexcept; +template +bool none_of(const simd_mask<_Tp, _Abi>&) noexcept; +template +bool some_of(const simd_mask<_Tp, _Abi>&) noexcept; +template +int popcount(const simd_mask<_Tp, _Abi>&) noexcept; +template +int 
find_first_set(const simd_mask<_Tp, _Abi>&); +template +int find_last_set(const simd_mask<_Tp, _Abi>&); +bool all_of(bool) noexcept; +bool any_of(bool) noexcept; +bool none_of(bool) noexcept; +bool some_of(bool) noexcept; +int popcount(bool) noexcept; +int find_first_set(bool) noexcept; +int find_last_set(bool) noexcept; + +// masked assignment [simd.whereexpr] +template +class const_where_expression; +template +class where_expression; + +// masked assignment [simd.mask.where] +template +where_expression, simd<_Tp, _Abi>> +where(const typename simd<_Tp, _Abi>::mask_type&, simd<_Tp, _Abi>&) noexcept; + +template +const_where_expression, const simd<_Tp, _Abi>> +where(const typename simd<_Tp, _Abi>::mask_type&, + const simd<_Tp, _Abi>&) noexcept; + +template +where_expression, simd_mask<_Tp, _Abi>> +where(const typename __nodeduce>::type&, + simd_mask<_Tp, _Abi>&) noexcept; + +template +const_where_expression, const simd_mask<_Tp, _Abi>> +where(const typename __nodeduce>::type&, + const simd_mask<_Tp, _Abi>&) noexcept; + +template +where_expression where(bool, _Tp&) noexcept; + +template +const_where_expression where(bool, const _Tp&) noexcept; + +// reductions [simd.reductions] +template > +_Tp reduce(const simd<_Tp, _Abi>&, _BinaryOp = _BinaryOp()); + +template +typename _SimdType::value_type +reduce(const const_where_expression<_MaskType, _SimdType>&, + typename _SimdType::value_type neutral_element, _BinaryOp binary_op); + +template +typename _SimdType::value_type +reduce(const const_where_expression<_MaskType, _SimdType>&, + plus binary_op = {}); + +template +typename _SimdType::value_type +reduce(const const_where_expression<_MaskType, _SimdType>&, + multiplies binary_op); + +template +typename _SimdType::value_type +reduce(const const_where_expression<_MaskType, _SimdType>&, + bit_and binary_op); + +template +typename _SimdType::value_type +reduce(const const_where_expression<_MaskType, _SimdType>&, + bit_or binary_op); + +template +typename 
_SimdType::value_type +reduce(const const_where_expression<_MaskType, _SimdType>&, + bit_xor binary_op); + +template +_Tp hmin(const simd<_Tp, _Abi>&); +template +typename _SimdType::value_type +hmin(const const_where_expression<_MaskType, _SimdType>&); +template +_Tp hmax(const simd<_Tp, _Abi>&); +template +typename _SimdType::value_type +hmax(const const_where_expression<_MaskType, _SimdType>&); + +// algorithms [simd.alg] +template +simd<_Tp, _Abi> min(const simd<_Tp, _Abi>&, const simd<_Tp, _Abi>&) noexcept; + +template +simd<_Tp, _Abi> max(const simd<_Tp, _Abi>&, const simd<_Tp, _Abi>&) noexcept; + +template +std::pair, simd<_Tp, _Abi>> +minmax(const simd<_Tp, _Abi>&, const simd<_Tp, _Abi>&) noexcept; + +template +simd<_Tp, _Abi> clamp(const simd<_Tp, _Abi>&, const simd<_Tp, _Abi>&, + const simd<_Tp, _Abi>&); + +// [simd.whereexpr] +// TODO implement where expressions. +template +class const_where_expression { +public: + const_where_expression(const const_where_expression&) = delete; + const_where_expression& operator=(const const_where_expression&) = delete; + typename remove_const<_Tp>::type operator-() const&&; + template + void copy_to(_Up*, _Flags) const&&; +}; + +template +class where_expression : public const_where_expression<_MaskType, _Tp> { +public: + where_expression(const where_expression&) = delete; + where_expression& operator=(const where_expression&) = delete; + template + void operator=(_Up&&); + template + void operator+=(_Up&&); + template + void operator-=(_Up&&); + template + void operator*=(_Up&&); + template + void operator/=(_Up&&); + template + void operator%=(_Up&&); + template + void operator&=(_Up&&); + template + void operator|=(_Up&&); + template + void operator^=(_Up&&); + template + void operator<<=(_Up&&); + template + void operator>>=(_Up&&); + void operator++(); + void operator++(int); + void operator--(); + void operator--(int); + template + void copy_from(const _Up*, _Flags); +}; + +template +class alignas(_Simd) simd_data 
{ +public: + using value_type = typename _Simd::value_type; + simd_data() = default; + simd_data(const _Simd& s); + simd_data(value_type v) { std::fill(std::begin(__data_), std::end(__data_), v); } + template + simd_data(Ts... args) : __data_{args...} {} + value_type operator[](size_t idx) const noexcept { return __data_[idx]; } + value_type& operator[](size_t idx) noexcept { return __data_[idx]; } + const value_type* data() const noexcept { return __data_; } + value_type* data() noexcept { return __data_; } + auto begin() { return std::begin(__data_); } + auto end() { return std::end(__data_); } + auto begin() const { return std::begin(__data_); } + auto end() const { return std::end(__data_); } +private: + value_type __data_[_Simd::size()]; +}; + +// [simd.class] +template +class simd { + template + friend class simd; +public: + using value_type = _Tp; + using reference = __simd_reference<_Tp, _Tp, _Abi>; + using mask_type = simd_mask<_Tp, _Abi>; + using abi_type = _Abi; + + simd() = default; + simd(const simd&) = default; + simd& operator=(const simd&) = default; + + static constexpr size_t size() noexcept { + return simd_size<_Tp, _Abi>::value; + } + +private: + __simd_storage<_Tp, _Abi> __s_; + + template + static constexpr bool __can_broadcast() { + return (std::is_arithmetic<_Up>::value && + __is_non_narrowing_arithmetic_convertible<_Up, _Tp>()) || + (!std::is_arithmetic<_Up>::value && + std::is_convertible<_Up, _Tp>::value) || + std::is_same::type, int>::value || + (std::is_same::type, + unsigned int>::value && + std::is_unsigned<_Tp>::value); + } + + template + static constexpr decltype( + std::forward_as_tuple(std::declval<_Generator>()( + std::integral_constant())...), + bool()) + __can_generate(std::index_sequence<__indicies...>) { + return !__variadic_sum( + !__can_broadcast()( + std::integral_constant()))>()...); + } + + template + static bool __can_generate(...) 
{ + return false; + } + + template + void __generator_init(_Generator&& __g, std::index_sequence<__indicies...>) { + int __not_used[]{((*this)[__indicies] = + __g(std::integral_constant()), + 0)...}; + (void)__not_used; + } + +public: + simd(const typename __simd_storage<_Tp, _Abi>::storage_type& s) : __s_(s) {} + +#if 0 +// implicit type conversion constructor + template >::value && + __is_non_narrowing_arithmetic_convertible<_Up, _Tp>()>::type> + simd(const simd<_Up, simd_abi::fixed_size>& __v) { + for (size_t __i = 0; __i < size(); __i++) { + (*this)[__i] = static_cast<_Tp>(__v[__i]); + } + } +#endif + // implicit type conversion constructor + template , __simd_storage<_Up, _UAbi>>::value>> + simd(const simd<_Up, _UAbi>& __v) : __s_(__v.__s_) {} + +#if 0 + template , + __simd_storage<_Up, _UAbi>>>> + simd(const simd<_Up, _UAbi>& __v) : __s_(__v.__s_) {} +#endif + // implicit broadcast constructor +#if 0 + template ()>::type> + simd(_Up&& __rv) { + auto __v = static_cast<_Tp>(__rv); + for (size_t __i = 0; __i < size(); __i++) { + (*this)[__i] = __v; + } + __s_.__broadcast(v); + } +#endif + simd(_Tp __rv) { + __s_.__broadcast(__rv); + } + + simd(_Tp a, _Tp b, _Tp c = {}, _Tp d = {}) { + __s_.__set4(a, b, c, d); + } + + // generator constructor + template (std::make_index_sequence()), + int>::type()> + explicit simd(_Generator&& __g) { + __generator_init(std::forward<_Generator>(__g), + std::make_index_sequence()); + } + + // load constructor + template < + class _Up, class _Flags, + class = typename std::enable_if<__vectorizable<_Up>()>::type, + class = typename std::enable_if::value>::type> + simd(const _Up* __buffer, _Flags) { + // TODO: optimize for overaligned flags + for (size_t __i = 0; __i < size(); __i++) { + (*this)[__i] = static_cast<_Tp>(__buffer[__i]); + } + } + +#if 0 + // loads [simd.load] + template + typename std::enable_if<__vectorizable<_Up>() && + is_simd_flag_type<_Flags>::value>::type + copy_from(const _Up* __buffer, _Flags) { + *this = 
simd(__buffer, _Flags()); + } +#endif + // loads [simd.load] + void copy_from(const simd_data& __buffer) { + __s_.__copy_from(__buffer); + } + +#if 0 + // stores [simd.store] + template + typename std::enable_if<__vectorizable<_Up>() && + is_simd_flag_type<_Flags>::value>::type + copy_to(_Up* __buffer, _Flags) const { + // TODO: optimize for overaligned flags + for (size_t __i = 0; __i < size(); __i++) { + __buffer[__i] = static_cast<_Up>((*this)[__i]); + } + } +#endif + // stores [simd.store] + void copy_to(simd_data& __buffer) const { + __s_.__copy_to(__buffer); + } + + // scalar access [simd.subscr] + reference operator[](size_t __i) { return reference(&__s_, __i); } + + value_type operator[](size_t __i) const { return __s_.__get(__i); } + + // unary operators [simd.unary] + simd& operator++(); + simd operator++(int); + simd& operator--(); + simd operator--(int); + mask_type operator!() const; + simd operator~() const; + simd operator+() const; + simd operator-() const; + + // binary operators [simd.binary] + friend simd operator+(const simd&, const simd&); + friend simd operator-(const simd&, const simd&); + friend simd operator*(const simd&, const simd&); + friend simd operator/(const simd&, const simd&); + friend simd operator%(const simd&, const simd&); + friend simd operator&(const simd&, const simd&); + friend simd operator|(const simd&, const simd&); + friend simd operator^(const simd&, const simd&); + friend simd operator<<(const simd&, const simd&); + friend simd operator>>(const simd&, const simd&); + friend simd operator<<(const simd&, int); + friend simd operator>>(const simd&, int); + + // compound assignment [simd.cassign] + friend simd& operator+=(simd&, const simd&); + friend simd& operator-=(simd&, const simd&); + friend simd& operator*=(simd&, const simd&); + friend simd& operator/=(simd&, const simd&); + friend simd& operator%=(simd&, const simd&); + + friend simd& operator&=(simd&, const simd&); + friend simd& operator|=(simd&, const simd&); 
+ friend simd& operator^=(simd&, const simd&); + friend simd& operator<<=(simd&, const simd&); + friend simd& operator>>=(simd&, const simd&); + friend simd& operator<<=(simd&, int); + friend simd& operator>>=(simd&, int); + + // compares [simd.comparison] + friend mask_type operator==(const simd&, const simd&); + friend mask_type operator!=(const simd&, const simd&); + friend mask_type operator>=(const simd&, const simd&); + friend mask_type operator<=(const simd&, const simd&); + friend mask_type operator>(const simd&, const simd&); + friend mask_type operator<(const simd&, const simd&); + + value_type dot2(const simd& other) const { return __s_.__dot2(other.__s_); } + value_type dot3(const simd& other) const { return __s_.__dot3(other.__s_); } + value_type dot4(const simd& other) const { return __s_.__dot4(other.__s_); } + + template + simd shuffle() const { + simd s; + s.__s_ = __s_.template __shuffle(); + return s; + } + + const typename __simd_storage<_Tp, _Abi>::storage_type& native() const { return __s_.__native(); } +}; + +// [simd.mask.class] +template +class simd_mask { +public: + using value_type = bool; + using reference = __simd_mask_reference<_Tp, _Abi>; + using simd_type = simd<_Tp, _Abi>; + using abi_type = _Abi; + static constexpr size_t size() noexcept { + return simd_size<_Tp, _Abi>::value; + } + simd_mask() = default; + + // broadcast constructor + explicit simd_mask(value_type) noexcept; + + // implicit type conversion constructor + template + simd_mask(const simd_mask<_Up, simd_abi::fixed_size>&) noexcept; + + // load constructor + template + simd_mask(const value_type*, _Flags); + +private: + __simd_mask_storage<_Tp, _Abi> __s_; + +public: + // loads [simd.mask.copy] + template + void copy_from(const value_type*, _Flags); + template + void copy_to(value_type*, _Flags) const; + + // scalar access [simd.mask.subscr] + reference operator[](size_t __i) { return reference(&__s_, __i); } + value_type operator[](size_t __i) const { return 
__s_.__get(__i); } + + // unary operators [simd.mask.unary] + simd_mask operator!() const noexcept; + + // simd_mask binary operators [simd.mask.binary] + friend simd_mask operator&&(const simd_mask&, const simd_mask&) noexcept; + friend simd_mask operator||(const simd_mask&, const simd_mask&) noexcept; + friend simd_mask operator&(const simd_mask&, const simd_mask&)noexcept; + friend simd_mask operator|(const simd_mask&, const simd_mask&) noexcept; + friend simd_mask operator^(const simd_mask&, const simd_mask&) noexcept; + + // simd_mask compound assignment [simd.mask.cassign] + friend simd_mask& operator&=(simd_mask&, const simd_mask&) noexcept; + friend simd_mask& operator|=(simd_mask&, const simd_mask&) noexcept; + friend simd_mask& operator^=(simd_mask&, const simd_mask&) noexcept; + + // simd_mask compares [simd.mask.comparison] + friend simd_mask operator==(const simd_mask&, const simd_mask&) noexcept; + friend simd_mask operator!=(const simd_mask&, const simd_mask&) noexcept; + + // compares [simd.comparison] + friend simd_mask operator==(const simd_type&, const simd_type&); + friend simd_mask operator!=(const simd_type&, const simd_type&); + friend simd_mask operator>=(const simd_type&, const simd_type&); + friend simd_mask operator<=(const simd_type&, const simd_type&); + friend simd_mask operator>(const simd_type&, const simd_type&); + friend simd_mask operator<(const simd_type&, const simd_type&); +}; + +template +inline simd_data<_Simd>::simd_data(const _Simd& s) { s.copy_to(*this); } + +template +class __simd_storage<_Tp, __simd_abi<_StorageKind::_Array, __num_element>> { +public: + using storage_type = std::array<_Tp, __num_element>; +private: + storage_type __storage_; + + template + friend struct simd; + + template + friend struct simd_mask; + +public: + _Tp __get(size_t __index) const noexcept { return __storage_[__index]; }; + void __set(size_t __index, _Tp __val) noexcept { + __storage_[__index] = __val; + } + std::enable_if_t<__num_element >= 
4> __set4(_Tp a, _Tp b, _Tp c, _Tp d) noexcept { /* lanes are taken as _Tp, not float, so a double vector is not rounded through float */ + __storage_[0] = a; + __storage_[1] = b; + __storage_[2] = c; + __storage_[3] = d; + } + void __broadcast(_Tp __val) noexcept { /* same element type as the storage, for the same reason */ + std::fill(__storage_.begin(), __storage_.end(), __val); + } + std::enable_if_t<__num_element >= 2, _Tp> __dot2(const __simd_storage& other) const noexcept { + return __storage_[0] * other.__storage_[0] + + __storage_[1] * other.__storage_[1]; + } + std::enable_if_t<__num_element >= 3, _Tp> __dot3(const __simd_storage& other) const noexcept { + return __storage_[0] * other.__storage_[0] + + __storage_[1] * other.__storage_[1] + + __storage_[2] * other.__storage_[2]; + } + std::enable_if_t<__num_element >= 4, _Tp> __dot4(const __simd_storage& other) const noexcept { + return __storage_[0] * other.__storage_[0] + + __storage_[1] * other.__storage_[1] + + __storage_[2] * other.__storage_[2] + + __storage_[3] * other.__storage_[3]; + } + template + std::enable_if_t<__num_element >= 4, __simd_storage> __shuffle() const noexcept { + __simd_storage s; + s.__storage_[0] = __storage_[x]; + s.__storage_[1] = __storage_[y]; + s.__storage_[2] = __storage_[z]; + s.__storage_[3] = __storage_[w]; + return s; + } + + void __copy_from(const simd_data>>& __buffer) noexcept { + std::copy(__buffer.begin(), __buffer.end(), __storage_.begin()); + } + + void __copy_to(simd_data>>& __buffer) const noexcept { + std::copy(__storage_.begin(), __storage_.end(), __buffer.begin()); + } + + __simd_storage() = default; + template + explicit __simd_storage(const __simd_storage<_Up, __simd_abi<_StorageKind::_Array, __Unum_element>>& other) { + std::copy(other.__native().begin(), other.__native().end(), __storage_.begin()); + } + const storage_type& __native() const { return __storage_; } +}; + +template +class __simd_mask_storage<_Tp, __simd_abi<_StorageKind::_Array, __num_element>> { + std::bitset<__num_element> __storage_; +public: + bool __get(size_t __index) const noexcept { + return __storage_.test(__index); 
+ } + void __set(size_t __index, bool __val) noexcept { + __storage_.set(__index, __val); + } +}; + +} diff --git a/include/athena/simd/simd.hpp b/include/athena/simd/simd.hpp new file mode 100644 index 0000000..7873d1f --- /dev/null +++ b/include/athena/simd/simd.hpp @@ -0,0 +1,26 @@ +#pragma once +#define _ATHENA_SIMD_INCLUDED +namespace athena::_simd { using namespace std; } +#include "parallelism_v2_simd.hpp" +#if _M_IX86_FP >= 1 || _M_X64 +#define __SSE__ 1 +#endif +#if __AVX__ +#include "simd_avx.hpp" +#elif __SSE__ +#include "simd_sse.hpp" +#else +namespace simd_abi { +template struct athena_native {}; +template<> struct athena_native { using type = fixed_size<4>; }; +template<> struct athena_native { using type = fixed_size<4>; }; +} +#endif +namespace athena { +template using simd = _simd::simd::type>; +template +using simd_values = _simd::simd_data>; +using simd_floats = simd_values; +using simd_doubles = simd_values; +} diff --git a/include/athena/simd/simd_avx.hpp b/include/athena/simd/simd_avx.hpp new file mode 100644 index 0000000..6bdc6b5 --- /dev/null +++ b/include/athena/simd/simd_avx.hpp @@ -0,0 +1,188 @@ +#pragma once +#ifndef _ATHENA_SIMD_INCLUDED +#error simd_avx.hpp must not be included directly. Include simd.hpp instead. 
+#endif +#include "simd_sse.hpp" +#include +namespace athena::_simd { +// __m256d storage for AVX +template<> +class __simd_storage { +public: + using storage_type = __m256d; + storage_type __storage_; + double __get(size_t __index) const noexcept { + alignas(32) std::array sse_data; + _mm256_store_pd(sse_data.data(), __storage_); + return sse_data[__index]; + } + void __set(size_t __index, double __val) noexcept { + alignas(32) std::array sse_data; + _mm256_store_pd(sse_data.data(), __storage_); + sse_data[__index] = __val; + __storage_ = _mm256_load_pd(sse_data.data()); + } + void __set4(double a, double b, double c, double d) noexcept { + __storage_ = _mm256_set_pd(d, c, b, a); + } + void __broadcast(double __val) noexcept { + __storage_ = _mm256_set1_pd(__val); + } + double __dot2(const __simd_storage& other) const noexcept { + alignas(32) std::array sse_data; + _mm256_store_pd(sse_data.data(), _mm256_mul_pd(__storage_, other.__storage_)); + return sse_data[0] + sse_data[1]; + } + double __dot3(const __simd_storage& other) const noexcept { + alignas(32) std::array sse_data; + _mm256_store_pd(sse_data.data(), _mm256_mul_pd(__storage_, other.__storage_)); + return sse_data[0] + sse_data[1] + sse_data[2]; + } + double __dot4(const __simd_storage& other) const noexcept { + alignas(32) std::array sse_data; + _mm256_store_pd(sse_data.data(), _mm256_mul_pd(__storage_, other.__storage_)); + return sse_data[0] + sse_data[1] + sse_data[2] + sse_data[3]; + } + + void __copy_from(const simd_data>& __buffer) noexcept { + __storage_ = _mm256_load_pd(__buffer.data()); + } + + void __copy_to(simd_data>& __buffer) const noexcept { + _mm256_store_pd(__buffer.data(), __storage_); + } + + __simd_storage() = default; + explicit __simd_storage(const __simd_storage& other) { + __storage_ = _mm256_cvtps_pd(other.__storage_); + } + + explicit __simd_storage(const storage_type& s) : __storage_(s) {} + const storage_type& __native() const { return __storage_; } +}; +// __m256d mask 
storage for AVX +template<> +class __simd_mask_storage : public __simd_storage { +public: + bool __get(size_t __index) const noexcept { + alignas(32) uint64_t sse_data[4]; + _mm256_store_pd(reinterpret_cast(sse_data), __storage_); + return sse_data[__index] != 0; + } + + void __set(size_t __index, bool __val) noexcept { + alignas(32) uint64_t sse_data[4]; + _mm256_store_pd(reinterpret_cast(sse_data), __storage_); + sse_data[__index] = __val ? UINT64_MAX : 0; + __storage_ = _mm256_load_pd(reinterpret_cast(sse_data)); + } +}; + +template <> +inline simd simd::operator-() const { + return _mm256_xor_pd(__s_.__storage_, _mm256_set1_pd(-0.0)); +} + +inline simd +operator+(const simd& a, const simd& b) { + simd ret; + ret.__s_.__storage_ = _mm256_add_pd(a.__s_.__storage_, b.__s_.__storage_); + return ret; +} + +inline simd +operator-(const simd& a, const simd& b) { + simd ret; + ret.__s_.__storage_ = _mm256_sub_pd(a.__s_.__storage_, b.__s_.__storage_); + return ret; +} + +inline simd +operator*(const simd& a, const simd& b) { + simd ret; + ret.__s_.__storage_ = _mm256_mul_pd(a.__s_.__storage_, b.__s_.__storage_); + return ret; +} + +inline simd +operator/(const simd& a, const simd& b) { + simd ret; + ret.__s_.__storage_ = _mm256_div_pd(a.__s_.__storage_, b.__s_.__storage_); + return ret; +} + +inline simd& +operator+=(simd& a, const simd& b) { + a.__s_.__storage_ = _mm256_add_pd(a.__s_.__storage_, b.__s_.__storage_); + return a; +} + +inline simd& +operator-=(simd& a, const simd& b) { + a.__s_.__storage_ = _mm256_sub_pd(a.__s_.__storage_, b.__s_.__storage_); + return a; +} + +inline simd& +operator*=(simd& a, const simd& b) { + a.__s_.__storage_ = _mm256_mul_pd(a.__s_.__storage_, b.__s_.__storage_); + return a; +} + +inline simd& +operator/=(simd& a, const simd& b) { + a.__s_.__storage_ = _mm256_div_pd(a.__s_.__storage_, b.__s_.__storage_); + return a; +} + +inline simd::mask_type +operator==(const simd& a, const simd& b) { + simd::mask_type ret; + ret.__s_.__storage_ = 
_mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_EQ_OQ); + return ret; +} + +inline simd::mask_type +operator!=(const simd& a, const simd& b) { + simd::mask_type ret; + ret.__s_.__storage_ = _mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_NEQ_UQ); /* unordered predicate: a lane holding NaN compares not-equal, as scalar != and _mm_cmpneq_pd do */ + return ret; +} + +inline simd::mask_type +operator>=(const simd& a, const simd& b) { + simd::mask_type ret; + ret.__s_.__storage_ = _mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_GE_OQ); + return ret; +} + +inline simd::mask_type +operator<=(const simd& a, const simd& b) { + simd::mask_type ret; + ret.__s_.__storage_ = _mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_LE_OQ); + return ret; +} + +inline simd::mask_type +operator>(const simd& a, const simd& b) { + simd::mask_type ret; + ret.__s_.__storage_ = _mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_GT_OQ); + return ret; +} + +inline simd::mask_type +operator<(const simd& a, const simd& b) { + simd::mask_type ret; + ret.__s_.__storage_ = _mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_LT_OQ); + return ret; +} + +inline __simd_storage::__simd_storage(const __simd_storage& other) { + __storage_ = _mm256_cvtpd_ps(other.__storage_); +} + +namespace simd_abi { +template<> struct athena_native { using type = m256d_abi; }; +} // namespace simd_abi + +} // namespace athena::_simd \ No newline at end of file diff --git a/include/athena/simd/simd_sse.hpp b/include/athena/simd/simd_sse.hpp new file mode 100644 index 0000000..8d59454 --- /dev/null +++ b/include/athena/simd/simd_sse.hpp @@ -0,0 +1,455 @@ +#pragma once +#ifndef _ATHENA_SIMD_INCLUDED +#error simd_sse.hpp must not be included directly. Include simd.hpp instead. 
+#endif +#include +#if __SSE4_1__ +#include +#endif +namespace athena::_simd { +// __m128 ABI +using m128_abi = __simd_abi<_StorageKind(int(_StorageKind::_VecExt) + 1), 4>; +// __m128d ABI +using m128d_abi = __simd_abi<_StorageKind(int(_StorageKind::_VecExt) + 2), 4>; +#ifdef __AVX__ +// __m256d ABI +using m256d_abi = __simd_abi<_StorageKind(int(_StorageKind::_VecExt) + 3), 4>; +#endif + +template <> +class __simd_storage; +#ifdef __AVX__ +template <> +class __simd_storage; +#endif + +// __m128 storage for SSE2+ +template <> +class __simd_storage { +public: + using storage_type = __m128; + storage_type __storage_; + float __get(size_t __index) const noexcept { + alignas(16) std::array sse_data; + _mm_store_ps(sse_data.data(), __storage_); + return sse_data[__index]; + } + void __set(size_t __index, float __val) noexcept { + alignas(16) std::array sse_data; + _mm_store_ps(sse_data.data(), __storage_); + sse_data[__index] = __val; + __storage_ = _mm_load_ps(sse_data.data()); + } + void __set4(float a, float b, float c, float d) noexcept { + __storage_ = _mm_set_ps(d, c, b, a); + } + void __broadcast(float __val) noexcept { + __storage_ = _mm_set1_ps(__val); + } + float __dot2(const __simd_storage& other) const noexcept { +#if __SSE4_1__ + float ret; + _mm_store_ss(&ret, _mm_dp_ps(__storage_, other.__storage_, 0x3F)); + return ret; +#else + alignas(16) std::array sse_data; + _mm_store_ps(sse_data.data(), _mm_mul_ps(__storage_, other.__storage_)); + return sse_data[0] + sse_data[1]; +#endif + } + float __dot3(const __simd_storage& other) const noexcept { +#if __SSE4_1__ + float ret; + _mm_store_ss(&ret, _mm_dp_ps(__storage_, other.__storage_, 0x7F)); + return ret; +#else + alignas(16) std::array sse_data; + _mm_store_ps(sse_data.data(), _mm_mul_ps(__storage_, other.__storage_)); + return sse_data[0] + sse_data[1] + sse_data[2]; +#endif + } + float __dot4(const __simd_storage& other) const noexcept { +#if __SSE4_1__ + float ret; + _mm_store_ss(&ret, 
_mm_dp_ps(__storage_, other.__storage_, 0xFF)); + return ret; +#else + alignas(16) std::array sse_data; + _mm_store_ps(sse_data.data(), _mm_mul_ps(__storage_, other.__storage_)); + return sse_data[0] + sse_data[1] + sse_data[2] + sse_data[3]; +#endif + } + template + __simd_storage __shuffle() const noexcept { + __simd_storage s; + s.__storage_ = _mm_shuffle_ps(__storage_, __storage_, _MM_SHUFFLE(w, z, y, x)); + return s; + } + + void __copy_from(const simd_data>& __buffer) noexcept { + __storage_ = _mm_load_ps(__buffer.data()); + } + + void __copy_to(simd_data>& __buffer) const noexcept { + _mm_store_ps(__buffer.data(), __storage_); + } + + __simd_storage() = default; + explicit __simd_storage(const __simd_storage& other); +#ifdef __AVX__ + explicit __simd_storage(const __simd_storage& other); +#endif + + explicit __simd_storage(const storage_type& s) : __storage_(s) {} + const storage_type& __native() const { return __storage_; } +}; +// __m128 mask storage for SSE2+ +template <> +class __simd_mask_storage : public __simd_storage +{ +public: + bool __get(size_t __index) const noexcept { + alignas(16) uint32_t sse_data[4]; + _mm_store_ps(reinterpret_cast(sse_data), __storage_); + return sse_data[__index] != 0; + } + void __set(size_t __index, bool __val) noexcept { + alignas(16) uint32_t sse_data[4]; + _mm_store_ps(reinterpret_cast(sse_data), __storage_); + sse_data[__index] = __val ? 
UINT32_MAX : 0; + __storage_ = _mm_load_ps(reinterpret_cast(sse_data)); + } +}; + +template <> +inline simd simd::operator-() const { + return _mm_xor_ps(__s_.__storage_, _mm_set1_ps(-0.f)); +} + +inline simd +operator+(const simd& a, const simd& b) { + simd ret; + ret.__s_.__storage_ = _mm_add_ps(a.__s_.__storage_, b.__s_.__storage_); + return ret; +} + +inline simd +operator-(const simd& a, const simd& b) { + simd ret; + ret.__s_.__storage_ = _mm_sub_ps(a.__s_.__storage_, b.__s_.__storage_); + return ret; +} + +inline simd +operator*(const simd& a, const simd& b) { + simd ret; + ret.__s_.__storage_ = _mm_mul_ps(a.__s_.__storage_, b.__s_.__storage_); + return ret; +} + +inline simd +operator/(const simd& a, const simd& b) { + simd ret; + ret.__s_.__storage_ = _mm_div_ps(a.__s_.__storage_, b.__s_.__storage_); + return ret; +} + +inline simd& +operator+=(simd& a, const simd& b) { + a.__s_.__storage_ = _mm_add_ps(a.__s_.__storage_, b.__s_.__storage_); + return a; +} + +inline simd& +operator-=(simd& a, const simd& b) { + a.__s_.__storage_ = _mm_sub_ps(a.__s_.__storage_, b.__s_.__storage_); + return a; +} + +inline simd& +operator*=(simd& a, const simd& b) { + a.__s_.__storage_ = _mm_mul_ps(a.__s_.__storage_, b.__s_.__storage_); + return a; +} + +inline simd& +operator/=(simd& a, const simd& b) { + a.__s_.__storage_ = _mm_div_ps(a.__s_.__storage_, b.__s_.__storage_); + return a; +} + +inline simd::mask_type +operator==(const simd& a, const simd& b) { + simd::mask_type ret; + ret.__s_.__storage_ = _mm_cmpeq_ps(a.__s_.__storage_, b.__s_.__storage_); + return ret; +} + +inline simd::mask_type +operator!=(const simd& a, const simd& b) { + simd::mask_type ret; + ret.__s_.__storage_ = _mm_cmpneq_ps(a.__s_.__storage_, b.__s_.__storage_); + return ret; +} + +inline simd::mask_type +operator>=(const simd& a, const simd& b) { + simd::mask_type ret; + ret.__s_.__storage_ = _mm_cmpge_ps(a.__s_.__storage_, b.__s_.__storage_); + return ret; +} + +inline simd::mask_type 
+operator<=(const simd& a, const simd& b) { + simd::mask_type ret; + ret.__s_.__storage_ = _mm_cmple_ps(a.__s_.__storage_, b.__s_.__storage_); + return ret; +} + +inline simd::mask_type +operator>(const simd& a, const simd& b) { + simd::mask_type ret; + ret.__s_.__storage_ = _mm_cmpgt_ps(a.__s_.__storage_, b.__s_.__storage_); + return ret; +} + +inline simd::mask_type +operator<(const simd& a, const simd& b) { + simd::mask_type ret; + ret.__s_.__storage_ = _mm_cmplt_ps(a.__s_.__storage_, b.__s_.__storage_); + return ret; +} + +// __m128d storage for SSE2+ +template <> +class __simd_storage { +public: + using storage_type = std::array<__m128d, 2>; + storage_type __storage_; + double __get(size_t __index) const noexcept { + alignas(16) std::array sse_data; + _mm_store_pd(sse_data.data(), __storage_[__index / 2]); + return sse_data[__index % 2]; + } + void __set(size_t __index, double __val) noexcept { + alignas(16) std::array sse_data; + _mm_store_pd(sse_data.data(), __storage_[__index / 2]); + sse_data[__index % 2] = __val; + __storage_[__index / 2] = _mm_load_pd(sse_data.data()); + } + void __set4(double a, double b, double c, double d) noexcept { + __storage_[0] = _mm_set_pd(b, a); + __storage_[1] = _mm_set_pd(d, c); + } + void __broadcast(double __val) noexcept { + for (int i = 0; i < 2; ++i) + __storage_[i] = _mm_set1_pd(__val); + } + double __dot2(const __simd_storage& other) const noexcept { +#if __SSE4_1__ + double ret; + _mm_store_sd(&ret, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F)); + return ret; +#else + alignas(16) std::array sse_data; + _mm_store_pd(sse_data.data(), _mm_mul_pd(__storage_[0], other.__storage_[0])); + return sse_data[0] + sse_data[1]; +#endif + } + double __dot3(const __simd_storage& other) const noexcept { +#if __SSE4_1__ + double ret; + _mm_store_sd(&ret, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F)); + alignas(16) std::array sse_data2; + _mm_store_pd(sse_data2.data(), _mm_mul_pd(__storage_[1], other.__storage_[1])); + 
return ret + sse_data2[0]; +#else + alignas(16) std::array sse_data; + _mm_store_pd(sse_data.data(), _mm_mul_pd(__storage_[0], other.__storage_[0])); + alignas(16) std::array sse_data2; + _mm_store_pd(sse_data2.data(), _mm_mul_pd(__storage_[1], other.__storage_[1])); + return sse_data[0] + sse_data[1] + sse_data2[0]; +#endif + } + double __dot4(const __simd_storage& other) const noexcept { +#if __SSE4_1__ + double ret; + _mm_store_sd(&ret, _mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F)); + double ret2; + _mm_store_sd(&ret2, _mm_dp_pd(__storage_[1], other.__storage_[1], 0x3F)); + return ret + ret2; +#else + alignas(16) std::array sse_data; + _mm_store_pd(sse_data.data(), _mm_mul_pd(__storage_[0], other.__storage_[0])); + alignas(16) std::array sse_data2; + _mm_store_pd(sse_data2.data(), _mm_mul_pd(__storage_[1], other.__storage_[1])); + return sse_data[0] + sse_data[1] + sse_data2[0] + sse_data2[1]; +#endif + } + + void __copy_from(const simd_data>& __buffer) noexcept { + __storage_[0] = _mm_load_pd(__buffer.data()); + __storage_[1] = _mm_load_pd(__buffer.data() + 2); + } + + void __copy_to(simd_data>& __buffer) const noexcept { + _mm_store_pd(__buffer.data(), __storage_[0]); + _mm_store_pd(__buffer.data() + 2, __storage_[1]); + } + + __simd_storage() = default; + explicit __simd_storage(const __simd_storage& other) { + __storage_[0] = _mm_cvtps_pd(other.__storage_); + __storage_[1] = _mm_cvtps_pd(_mm_movehl_ps(other.__storage_, other.__storage_)); + } + + explicit __simd_storage(const storage_type& s) : __storage_(s) {} + const storage_type& __native() const { return __storage_; } +}; +// __m128d mask storage for SSE2+ +template <> +class __simd_mask_storage : public __simd_storage +{ +public: + bool __get(size_t __index) const noexcept { + alignas(16) uint64_t sse_data[2]; + _mm_store_pd(reinterpret_cast(sse_data), __storage_[__index / 2]); + return sse_data[__index % 2] != 0; /* index within the stored 2-lane half; plain __index reads past sse_data[2] for lanes 2 and 3 */ + } + void __set(size_t __index, bool __val) noexcept { + alignas(16) uint64_t 
sse_data[2]; + _mm_store_pd(reinterpret_cast(sse_data), __storage_[__index / 2]); + sse_data[__index % 2] = __val ? UINT64_MAX : 0; + __storage_[__index / 2] = _mm_load_pd(reinterpret_cast(sse_data)); + } +}; + +template <> +inline simd simd::operator-() const { + simd ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_[i] = _mm_xor_pd(__s_.__storage_[i], _mm_set1_pd(-0.0)); + return ret; +} + +inline simd +operator+(const simd& a, const simd& b) { + simd ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_[i] = _mm_add_pd(a.__s_.__storage_[i], b.__s_.__storage_[i]); + return ret; +} + +inline simd +operator-(const simd& a, const simd& b) { + simd ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_[i] = _mm_sub_pd(a.__s_.__storage_[i], b.__s_.__storage_[i]); + return ret; +} + +inline simd +operator*(const simd& a, const simd& b) { + simd ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_[i] = _mm_mul_pd(a.__s_.__storage_[i], b.__s_.__storage_[i]); + return ret; +} + +inline simd +operator/(const simd& a, const simd& b) { + simd ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_[i] = _mm_div_pd(a.__s_.__storage_[i], b.__s_.__storage_[i]); + return ret; +} + +inline simd& +operator+=(simd& a, const simd& b) { + for (int i = 0; i < 2; ++i) + a.__s_.__storage_[i] = _mm_add_pd(a.__s_.__storage_[i], b.__s_.__storage_[i]); + return a; +} + +inline simd& +operator-=(simd& a, const simd& b) { + for (int i = 0; i < 2; ++i) + a.__s_.__storage_[i] = _mm_sub_pd(a.__s_.__storage_[i], b.__s_.__storage_[i]); + return a; +} + +inline simd& +operator*=(simd& a, const simd& b) { + for (int i = 0; i < 2; ++i) + a.__s_.__storage_[i] = _mm_mul_pd(a.__s_.__storage_[i], b.__s_.__storage_[i]); + return a; +} + +inline simd& +operator/=(simd& a, const simd& b) { + for (int i = 0; i < 2; ++i) + a.__s_.__storage_[i] = _mm_div_pd(a.__s_.__storage_[i], b.__s_.__storage_[i]); + return a; +} + +inline simd::mask_type +operator==(const simd& a, const simd& b) { + 
simd::mask_type ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_[i] = _mm_cmpeq_pd(a.__s_.__storage_[i], b.__s_.__storage_[i]); + return ret; +} + +inline simd::mask_type +operator!=(const simd& a, const simd& b) { + simd::mask_type ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_[i] = _mm_cmpneq_pd(a.__s_.__storage_[i], b.__s_.__storage_[i]); + return ret; +} + +inline simd::mask_type +operator>=(const simd& a, const simd& b) { + simd::mask_type ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_[i] = _mm_cmpge_pd(a.__s_.__storage_[i], b.__s_.__storage_[i]); + return ret; +} + +inline simd::mask_type +operator<=(const simd& a, const simd& b) { + simd::mask_type ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_[i] = _mm_cmple_pd(a.__s_.__storage_[i], b.__s_.__storage_[i]); + return ret; +} + +inline simd::mask_type +operator>(const simd& a, const simd& b) { + simd::mask_type ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_[i] = _mm_cmpgt_pd(a.__s_.__storage_[i], b.__s_.__storage_[i]); + return ret; +} + +inline simd::mask_type +operator<(const simd& a, const simd& b) { + simd::mask_type ret; + for (int i = 0; i < 2; ++i) + ret.__s_.__storage_[i] = _mm_cmplt_pd(a.__s_.__storage_[i], b.__s_.__storage_[i]); + return ret; +} + +inline __simd_storage::__simd_storage(const __simd_storage& other) { + __storage_ = _mm_movelh_ps(_mm_cvtpd_ps(other.__storage_[0]), _mm_cvtpd_ps(other.__storage_[1])); +} + +namespace simd_abi { +template struct athena_native {}; +template<> struct athena_native { using type = m128_abi; }; +#ifndef __AVX__ +template<> struct athena_native { using type = m128d_abi; }; +#endif +} // namespace simd_abi + +} // namespace athena::_simd diff --git a/src/athena/DNAYaml.cpp b/src/athena/DNAYaml.cpp index 826b389..7c75881 100644 --- a/src/athena/DNAYaml.cpp +++ b/src/athena/DNAYaml.cpp @@ -184,8 +184,12 @@ std::unique_ptr ValToNode(double val) template RETURNTYPE NodeToVec(const YAMLNode* node) { + constexpr bool 
isDouble = std::is_same::value || + std::is_same::value || + std::is_same::value; RETURNTYPE retval = {}; auto it = node->m_seqChildren.begin(); + simd_values> f; for (size_t i=0; i<4 && it != node->m_seqChildren.end(); ++i, ++it) @@ -193,16 +197,15 @@ RETURNTYPE NodeToVec(const YAMLNode* node) YAMLNode* snode = it->get(); if (snode->m_type == YAML_SCALAR_NODE) { - if (std::is_same::value || - std::is_same::value || - std::is_same::value) - retval.vec[i] = NodeToVal(snode); + if (isDouble) + f[i] = NodeToVal(snode); else - retval.vec[i] = NodeToVal(snode); + f[i] = NodeToVal(snode); } else - retval.vec[i] = 0.0; + f[i] = 0.0; } + retval.simd.copy_from(f); return retval; } @@ -216,10 +219,11 @@ std::unique_ptr ValToNode(const atVec2f& val) { YAMLNode* ret = new YAMLNode(YAML_SEQUENCE_NODE); ret->m_seqChildren.reserve(2); + simd_floats f(val.simd); for (size_t i=0 ; i<2 ; ++i) { char str[64]; - snprintf(str, 64, "%f", val.vec[i]); + snprintf(str, 64, "%f", f[i]); YAMLNode* comp = new YAMLNode(YAML_SCALAR_NODE); comp->m_scalarString = str; ret->m_seqChildren.emplace_back(comp); @@ -237,10 +241,11 @@ std::unique_ptr ValToNode(const atVec3f& val) { YAMLNode* ret = new YAMLNode(YAML_SEQUENCE_NODE); ret->m_seqChildren.reserve(3); + simd_floats f(val.simd); for (size_t i=0 ; i<3 ; ++i) { char str[64]; - snprintf(str, 64, "%f", val.vec[i]); + snprintf(str, 64, "%f", f[i]); YAMLNode* comp = new YAMLNode(YAML_SCALAR_NODE); comp->m_scalarString = str; ret->m_seqChildren.emplace_back(comp); @@ -258,10 +263,11 @@ std::unique_ptr ValToNode(const atVec4f& val) { YAMLNode* ret = new YAMLNode(YAML_SEQUENCE_NODE); ret->m_seqChildren.reserve(4); + simd_floats f(val.simd); for (size_t i=0 ; i<4 ; ++i) { char str[64]; - snprintf(str, 64, "%f", val.vec[i]); + snprintf(str, 64, "%f", f[i]); YAMLNode* comp = new YAMLNode(YAML_SCALAR_NODE); comp->m_scalarString = str; ret->m_seqChildren.emplace_back(comp); @@ -279,10 +285,11 @@ std::unique_ptr ValToNode(const atVec2d& val) { YAMLNode* ret = 
new YAMLNode(YAML_SEQUENCE_NODE); ret->m_seqChildren.reserve(2); + simd_doubles f(val.simd); for (size_t i=0 ; i<2 ; ++i) { char str[64]; - snprintf(str, 64, "%f", val.vec[i]); + snprintf(str, 64, "%f", f[i]); YAMLNode* comp = new YAMLNode(YAML_SCALAR_NODE); comp->m_scalarString = str; ret->m_seqChildren.emplace_back(comp); @@ -300,10 +307,11 @@ std::unique_ptr ValToNode(const atVec3d& val) { YAMLNode* ret = new YAMLNode(YAML_SEQUENCE_NODE); ret->m_seqChildren.reserve(3); + simd_doubles f(val.simd); for (size_t i=0 ; i<3 ; ++i) { char str[64]; - snprintf(str, 64, "%f", val.vec[i]); + snprintf(str, 64, "%f", f[i]); YAMLNode* comp = new YAMLNode(YAML_SCALAR_NODE); comp->m_scalarString = str; ret->m_seqChildren.emplace_back(comp); @@ -321,10 +329,11 @@ std::unique_ptr ValToNode(const atVec4d& val) { YAMLNode* ret = new YAMLNode(YAML_SEQUENCE_NODE); ret->m_seqChildren.reserve(4); + simd_doubles f(val.simd); for (size_t i=0 ; i<4 ; ++i) { char str[64]; - snprintf(str, 64, "%f", val.vec[i]); + snprintf(str, 64, "%f", f[i]); YAMLNode* comp = new YAMLNode(YAML_SCALAR_NODE); comp->m_scalarString = str; ret->m_seqChildren.emplace_back(comp);