From e1b29fda7acf3a17a297a02a63a5f11e94eb2328 Mon Sep 17 00:00:00 2001
From: Jack Andersen <jackoalan@gmail.com>
Date: Fri, 7 Dec 2018 15:31:02 -1000
Subject: [PATCH] SIMD refactor

---
 include/athena/DNAOp.hpp                    |    4 +-
 include/athena/IStream.hpp                  |   11 +-
 include/athena/IStreamReader.hpp            |  540 +++---
 include/athena/IStreamWriter.hpp            |  459 ++---
 include/athena/Types.hpp                    |  107 +-
 include/athena/Utility.hpp                  |    8 +-
 include/athena/simd/parallelism_v2_simd.hpp | 1768 +++++++++++++++++++
 include/athena/simd/simd.hpp                |   26 +
 include/athena/simd/simd_avx.hpp            |  188 ++
 include/athena/simd/simd_sse.hpp            |  455 +++++
 src/athena/DNAYaml.cpp                      |   33 +-
 11 files changed, 3003 insertions(+), 596 deletions(-)
 create mode 100644 include/athena/simd/parallelism_v2_simd.hpp
 create mode 100644 include/athena/simd/simd.hpp
 create mode 100644 include/athena/simd/simd_avx.hpp
 create mode 100644 include/athena/simd/simd_sse.hpp
diff --git a/include/athena/DNAOp.hpp b/include/athena/DNAOp.hpp
index d218f72..fe159f2 100644
--- a/include/athena/DNAOp.hpp
+++ b/include/athena/DNAOp.hpp
@@ -1013,7 +1013,7 @@ template <class T>
 static inline void __ReadProp(T& obj, athena::io::IStreamReader& r)
 {
     /* Read root 0xffffffff hash (hashed empty string) */
-    atUint32 hash = T::DNAEndian == Endian::Big ? r.readUint32Big() : r.readUint32Little();
+    T::DNAEndian == Endian::Big ? r.readUint32Big() : r.readUint32Little();
     atInt64 size = T::DNAEndian == Endian::Big ? r.readUint16Big() : r.readUint16Little();
     atInt64 start = r.position();
     __Do<Read<PropType::CRC32>, T, T::DNAEndian>({}, obj, r);
@@ -1038,7 +1038,7 @@ template <class T>
 static inline void __ReadProp64(T& obj, athena::io::IStreamReader& r)
 {
     /* Read root 0x0 hash (hashed empty string) */
-    atUint64 hash = T::DNAEndian == Endian::Big ? r.readUint64Big() : r.readUint64Little();
+    T::DNAEndian == Endian::Big ? r.readUint64Big() : r.readUint64Little();
     atInt64 size = T::DNAEndian == Endian::Big ? r.readUint16Big() : r.readUint16Little();
     atInt64 start = r.position();
     __Do<Read<PropType::CRC64>, T, T::DNAEndian>({}, obj, r);
diff --git a/include/athena/IStream.hpp b/include/athena/IStream.hpp
index c902714..ffd356f 100644
--- a/include/athena/IStream.hpp
+++ b/include/athena/IStream.hpp
@@ -9,13 +9,12 @@ std::ostream& operator<<(std::ostream& os, Endian& endian);
 class IStream
 {
 public:
-    IStream() {}
-    virtual ~IStream() {}
+    virtual ~IStream() = default;
 
-    inline void setEndian(Endian endian) { m_endian = endian; }
-    inline Endian endian() const { return m_endian; }
-    inline bool isBigEndian() const { return (m_endian == Endian::Big); }
-    inline bool isLittleEndian() const { return (m_endian == Endian::Little); }
+    void setEndian(Endian endian) { m_endian = endian; }
+    Endian endian() const { return m_endian; }
+    bool isBigEndian() const { return (m_endian == Endian::Big); }
+    bool isLittleEndian() const { return (m_endian == Endian::Little); }
     virtual void seek(atInt64, SeekOrigin) = 0;
     virtual bool atEnd() const = 0;
     virtual atUint64 position() const = 0;
diff --git a/include/athena/IStreamReader.hpp b/include/athena/IStreamReader.hpp
index 7298cd8..f619d8a 100644
--- a/include/athena/IStreamReader.hpp
+++ b/include/athena/IStreamReader.hpp
@@ -16,7 +16,7 @@ namespace athena::io
 class IStreamReader : public IStream
 {
 public:
-    virtual ~IStreamReader() {}
+    virtual ~IStreamReader() = default;
 
     /** @brief Sets the buffers position relative to the specified position.<br />
      *         It seeks relative to the current position by default.
@@ -27,25 +27,25 @@ public:
 
     /** @brief Sets the buffer's position relative to the next 64-byte aligned position.<br />
      */
-    inline void seekAlign64() {seek(ROUND_UP_64(position()), SeekOrigin::Begin);}
+    void seekAlign64() {seek(ROUND_UP_64(position()), SeekOrigin::Begin);}
 
     /** @brief Sets the buffers position relative to the next 32-byte aligned position.<br />
      */
-    inline void seekAlign32() {seek(ROUND_UP_32(position()), SeekOrigin::Begin);}
+    void seekAlign32() {seek(ROUND_UP_32(position()), SeekOrigin::Begin);}
 
     /** @brief Sets the buffer's position relative to the next 16-byte aligned position.<br />
      */
-    inline void seekAlign16() {seek(ROUND_UP_16(position()), SeekOrigin::Begin); }
+    void seekAlign16() {seek(ROUND_UP_16(position()), SeekOrigin::Begin); }
 
     /** @brief Sets the buffer's position relative to the next 4-byte aligned position.<br />
      */
-    inline void seekAlign4() {seek(ROUND_UP_4(position()), SeekOrigin::Begin); }
+    void seekAlign4() {seek(ROUND_UP_4(position()), SeekOrigin::Begin); }
 
     /** @brief Returns whether or not the stream is at the end.
      *
      *  @return True if at end; False otherwise.
      */
-    inline bool atEnd() const
+    bool atEnd() const
     {return position() >= length();}
 
     /** @brief Returns the current position in the stream.
@@ -64,37 +64,37 @@ public:
      *
      * @return The value at the current position
      */
-    inline atInt8 readByte() {atInt8 val; readUBytesToBuf(&val, 1); return val;}
+    atInt8 readByte() {atInt8 val; readUBytesToBuf(&val, 1); return val;}
     template <class T>
-    inline atInt8 readVal(typename std::enable_if<std::is_same<T, atInt8>::value>::type* = 0)
+    atInt8 readVal(typename std::enable_if<std::is_same<T, atInt8>::value>::type* = 0)
     {return readByte();}
     template <class T>
-    inline atInt8 readValLittle(typename std::enable_if<std::is_same<T, atInt8>::value>::type* = 0)
+    atInt8 readValLittle(typename std::enable_if<std::is_same<T, atInt8>::value>::type* = 0)
     {return readByte();}
     template <class T>
-    inline atInt8 readValBig(typename std::enable_if<std::is_same<T, atInt8>::value>::type* = 0)
+    atInt8 readValBig(typename std::enable_if<std::is_same<T, atInt8>::value>::type* = 0)
     {return readByte();}
 
     /** @brief Reads a byte at the current position and advances the current position
      *
      * @return The value at the current position
      */
-    inline atUint8 readUByte() {return readByte();}
+    atUint8 readUByte() {return readByte();}
     template <class T>
-    inline atUint8 readVal(typename std::enable_if<std::is_same<T, atUint8>::value>::type* = 0)
+    atUint8 readVal(typename std::enable_if<std::is_same<T, atUint8>::value>::type* = 0)
     {return readUByte();}
     template <class T>
-    inline atUint8 readValLittle(typename std::enable_if<std::is_same<T, atUint8>::value>::type* = 0)
+    atUint8 readValLittle(typename std::enable_if<std::is_same<T, atUint8>::value>::type* = 0)
     {return readUByte();}
     template <class T>
-    inline atUint8 readValBig(typename std::enable_if<std::is_same<T, atUint8>::value>::type* = 0)
+    atUint8 readValBig(typename std::enable_if<std::is_same<T, atUint8>::value>::type* = 0)
     {return readUByte();}
 
     /** @brief Reads a byte at the current position and advances the current position.
      *
      * @return The buffer at the current position from the given length.
      */
-    inline std::unique_ptr<atInt8[]> readBytes(atUint64 length)
+    std::unique_ptr<atInt8[]> readBytes(atUint64 length)
     {
         atInt8* buf = new atInt8[length];
         readUBytesToBuf(buf, length);
@@ -105,7 +105,7 @@ public:
      *
      *  @return The buffer at the current position from the given length.
      */
-    inline std::unique_ptr<atUint8[]> readUBytes(atUint64 length)
+    std::unique_ptr<atUint8[]> readUBytes(atUint64 length)
     {
         atUint8* buf = new atUint8[length];
         readUBytesToBuf(buf, length);
@@ -117,7 +117,7 @@ public:
      *  @param len The length of the buffer
      *  @return How much data was actually read, useful for detecting read errors.
      */
-    inline atUint64 readBytesToBuf(void* buf, atUint64 len) {return readUBytesToBuf(buf, len);}
+    atUint64 readBytesToBuf(void* buf, atUint64 len) {return readUBytesToBuf(buf, len);}
 
 
     /** @brief Attempts to read a fixed length of data into a pre-allocated buffer, this function is client defined
@@ -133,14 +133,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atInt16 readInt16()
+    atInt16 readInt16()
     {
         atInt16 val;
         readUBytesToBuf(&val, 2);
         return m_endian == Big ? utility::BigInt16(val) : utility::LittleInt16(val);
     }
     template <class T>
-    inline atInt16 readVal(typename std::enable_if<std::is_same<T, atInt16>::value>::type* = 0)
+    atInt16 readVal(typename std::enable_if<std::is_same<T, atInt16>::value>::type* = 0)
     {return readInt16();}
 
     /** @brief Reads a Int16 and swaps against little endianness depending on platform
@@ -148,14 +148,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atInt16 readInt16Little()
+    atInt16 readInt16Little()
     {
         atInt16 val;
         readUBytesToBuf(&val, 2);
         return utility::LittleInt16(val);
     }
     template <class T>
-    inline atInt16 readValLittle(typename std::enable_if<std::is_same<T, atInt16>::value>::type* = 0)
+    atInt16 readValLittle(typename std::enable_if<std::is_same<T, atInt16>::value>::type* = 0)
     {return readInt16Little();}
 
     /** @brief Reads a Int16 and swaps against big endianness depending on platform
@@ -163,14 +163,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atInt16 readInt16Big()
+    atInt16 readInt16Big()
     {
         atInt16 val;
         readUBytesToBuf(&val, 2);
         return utility::BigInt16(val);
     }
     template <class T>
-    inline atInt16 readValBig(typename std::enable_if<std::is_same<T, atInt16>::value>::type* = 0)
+    atInt16 readValBig(typename std::enable_if<std::is_same<T, atInt16>::value>::type* = 0)
     {return readInt16Big();}
 
     /** @brief Reads a Uint16 and swaps to endianness specified by setEndian depending on platform
@@ -178,10 +178,10 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atUint16 readUint16()
+    atUint16 readUint16()
     {return readInt16();}
     template <class T>
-    inline atUint16 readVal(typename std::enable_if<std::is_same<T, atUint16>::value>::type* = 0)
+    atUint16 readVal(typename std::enable_if<std::is_same<T, atUint16>::value>::type* = 0)
     {return readUint16();}
 
     /** @brief Reads a Uint16 and swaps against little endianness depending on platform
@@ -189,14 +189,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atUint16 readUint16Little()
+    atUint16 readUint16Little()
     {
         atUint16 val;
         readUBytesToBuf(&val, 2);
         return utility::LittleUint16(val);
     }
     template <class T>
-    inline atUint16 readValLittle(typename std::enable_if<std::is_same<T, atUint16>::value>::type* = 0)
+    atUint16 readValLittle(typename std::enable_if<std::is_same<T, atUint16>::value>::type* = 0)
     {return readUint16Little();}
 
     /** @brief Reads a Uint16 and swaps against big endianness depending on platform
@@ -204,14 +204,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atUint16 readUint16Big()
+    atUint16 readUint16Big()
     {
         atUint16 val;
         readUBytesToBuf(&val, 2);
         return utility::BigUint16(val);
     }
     template <class T>
-    inline atUint16 readValBig(typename std::enable_if<std::is_same<T, atUint16>::value>::type* = 0)
+    atUint16 readValBig(typename std::enable_if<std::is_same<T, atUint16>::value>::type* = 0)
     {return readUint16Big();}
 
     /** @brief Reads a Int32 and swaps to endianness specified by setEndian depending on platform
@@ -219,14 +219,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atInt32 readInt32()
+    atInt32 readInt32()
     {
         atInt32 val;
         readUBytesToBuf(&val, 4);
         return m_endian == Big ? utility::BigInt32(val) : utility::LittleInt32(val);
     }
     template <class T>
-    inline atInt32 readVal(typename std::enable_if<std::is_same<T, atInt32>::value>::type* = 0)
+    atInt32 readVal(typename std::enable_if<std::is_same<T, atInt32>::value>::type* = 0)
     {return readInt32();}
 
     /** @brief Reads a Int32 and swaps against little endianness depending on platform
@@ -234,14 +234,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atInt32 readInt32Little()
+    atInt32 readInt32Little()
     {
         atInt32 val;
         readUBytesToBuf(&val, 4);
         return utility::LittleInt32(val);
     }
     template <class T>
-    inline atInt32 readValLittle(typename std::enable_if<std::is_same<T, atInt32>::value>::type* = 0)
+    atInt32 readValLittle(typename std::enable_if<std::is_same<T, atInt32>::value>::type* = 0)
     {return readInt32Little();}
 
     /** @brief Reads a Int32 and swaps against big endianness depending on platform
@@ -249,14 +249,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atInt32 readInt32Big()
+    atInt32 readInt32Big()
     {
         atInt32 val;
         readUBytesToBuf(&val, 4);
         return utility::BigInt32(val);
     }
     template <class T>
-    inline atInt32 readValBig(typename std::enable_if<std::is_same<T, atInt32>::value>::type* = 0)
+    atInt32 readValBig(typename std::enable_if<std::is_same<T, atInt32>::value>::type* = 0)
     {return readInt32Big();}
 
     /** @brief Reads a Uint32 and swaps to endianness specified by setEndian depending on platform
@@ -264,10 +264,10 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atUint32 readUint32()
+    atUint32 readUint32()
     {return readInt32();}
     template <class T>
-    inline atUint32 readVal(typename std::enable_if<std::is_same<T, atUint32>::value>::type* = 0)
+    atUint32 readVal(typename std::enable_if<std::is_same<T, atUint32>::value>::type* = 0)
     {return readUint32();}
 
     /** @brief Reads a Uint32 and swaps against little endianness depending on platform
@@ -275,14 +275,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atUint32 readUint32Little()
+    atUint32 readUint32Little()
     {
         atUint32 val;
         readUBytesToBuf(&val, 4);
         return utility::LittleUint32(val);
     }
     template <class T>
-    inline atInt32 readValLittle(typename std::enable_if<std::is_same<T, atUint32>::value>::type* = 0)
+    atUint32 readValLittle(typename std::enable_if<std::is_same<T, atUint32>::value>::type* = 0)
     {return readUint32Little();}
 
     /** @brief Reads a Uint32 and swaps against big endianness depending on platform
@@ -290,14 +290,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atUint32 readUint32Big()
+    atUint32 readUint32Big()
     {
         atUint32 val;
         readUBytesToBuf(&val, 4);
         return utility::BigUint32(val);
     }
     template <class T>
-    inline atUint32 readValBig(typename std::enable_if<std::is_same<T, atUint32>::value>::type* = 0)
+    atUint32 readValBig(typename std::enable_if<std::is_same<T, atUint32>::value>::type* = 0)
     {return readUint32Big();}
 
     /** @brief Reads a Int64 and swaps to endianness specified by setEndian depending on platform
@@ -305,14 +305,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atInt64 readInt64()
+    atInt64 readInt64()
     {
         atInt64 val;
         readUBytesToBuf(&val, 8);
         return m_endian == Big ? utility::BigInt64(val) : utility::LittleInt64(val);
     }
     template <class T>
-    inline atInt64 readVal(typename std::enable_if<std::is_same<T, atInt64>::value>::type* = 0)
+    atInt64 readVal(typename std::enable_if<std::is_same<T, atInt64>::value>::type* = 0)
     {return readInt64();}
 
     /** @brief Reads a Int64 and swaps against little endianness depending on platform
@@ -320,14 +320,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atInt64 readInt64Little()
+    atInt64 readInt64Little()
     {
         atInt64 val;
         readUBytesToBuf(&val, 8);
         return utility::LittleInt64(val);
     }
     template <class T>
-    inline atInt64 readValLittle(typename std::enable_if<std::is_same<T, atInt64>::value>::type* = 0)
+    atInt64 readValLittle(typename std::enable_if<std::is_same<T, atInt64>::value>::type* = 0)
     {return readInt64Little();}
 
     /** @brief Reads a Int64 and swaps against big endianness depending on platform
@@ -335,14 +335,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atInt64 readInt64Big()
+    atInt64 readInt64Big()
     {
         atInt64 val;
         readUBytesToBuf(&val, 8);
         return utility::BigInt64(val);
     }
     template <class T>
-    inline atInt64 readValBig(typename std::enable_if<std::is_same<T, atInt64>::value>::type* = 0)
+    atInt64 readValBig(typename std::enable_if<std::is_same<T, atInt64>::value>::type* = 0)
     {return readInt64Big();}
 
     /** @brief Reads a Uint64 and swaps to endianness specified by setEndian depending on platform
@@ -350,10 +350,10 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atUint64 readUint64()
+    atUint64 readUint64()
     {return readInt64();}
     template <class T>
-    inline atUint64 readVal(typename std::enable_if<std::is_same<T, atUint64>::value>::type* = 0)
+    atUint64 readVal(typename std::enable_if<std::is_same<T, atUint64>::value>::type* = 0)
     {return readUint64();}
 
     /** @brief Reads a Uint64 and swaps against little endianness depending on platform
@@ -361,14 +361,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atUint64 readUint64Little()
+    atUint64 readUint64Little()
     {
         atUint64 val;
         readUBytesToBuf(&val, 8);
         return utility::LittleUint64(val);
     }
     template <class T>
-    inline atUint64 readValLittle(typename std::enable_if<std::is_same<T, atUint64>::value>::type* = 0)
+    atUint64 readValLittle(typename std::enable_if<std::is_same<T, atUint64>::value>::type* = 0)
     {return readUint64Little();}
 
     /** @brief Reads a Uint64 and swaps against big endianness depending on platform
@@ -376,14 +376,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atUint64 readUint64Big()
+    atUint64 readUint64Big()
     {
         atUint64 val;
         readUBytesToBuf(&val, 8);
         return utility::BigUint64(val);
     }
     template <class T>
-    inline atUint64 readValBig(typename std::enable_if<std::is_same<T, atUint64>::value>::type* = 0)
+    atUint64 readValBig(typename std::enable_if<std::is_same<T, atUint64>::value>::type* = 0)
     {return readUint64Big();}
 
     /** @brief Reads a float and swaps to endianness specified by setEndian depending on platform
@@ -391,14 +391,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline float readFloat()
+    float readFloat()
     {
         float val;
         readUBytesToBuf(&val, 4);
         return m_endian == Big ? utility::BigFloat(val) : utility::LittleFloat(val);
     }
     template <class T>
-    inline float readVal(typename std::enable_if<std::is_same<T, float>::value>::type* = 0)
+    float readVal(typename std::enable_if<std::is_same<T, float>::value>::type* = 0)
     {return readFloat();}
 
     /** @brief Reads a float and swaps against little endianness depending on platform
@@ -406,14 +406,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline float readFloatLittle()
+    float readFloatLittle()
     {
         float val;
         readUBytesToBuf(&val, 4);
         return utility::LittleFloat(val);
     }
     template <class T>
-    inline float readValLittle(typename std::enable_if<std::is_same<T, float>::value>::type* = 0)
+    float readValLittle(typename std::enable_if<std::is_same<T, float>::value>::type* = 0)
     {return readFloatLittle();}
 
     /** @brief Reads a float and swaps against big endianness depending on platform
@@ -421,14 +421,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline float readFloatBig()
+    float readFloatBig()
     {
         float val;
         readUBytesToBuf(&val, 4);
         return utility::BigFloat(val);
     }
     template <class T>
-    inline float readValBig(typename std::enable_if<std::is_same<T, float>::value>::type* = 0)
+    float readValBig(typename std::enable_if<std::is_same<T, float>::value>::type* = 0)
     {return readFloatBig();}
 
     /** @brief Reads a double and swaps to endianness specified by setEndian depending on platform
@@ -436,14 +436,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline double readDouble()
+    double readDouble()
     {
         double val;
         readUBytesToBuf(&val, 8);
         return m_endian == Big ? utility::BigDouble(val) : utility::LittleDouble(val);
     }
     template <class T>
-    inline double readVal(typename std::enable_if<std::is_same<T, double>::value>::type* = 0)
+    double readVal(typename std::enable_if<std::is_same<T, double>::value>::type* = 0)
     {return readDouble();}
 
     /** @brief Reads a double and swaps against little endianness depending on platform
@@ -451,14 +451,14 @@ public:
      *
      *  @return The value at the current address
      */
-    inline double readDoubleLittle()
+    double readDoubleLittle()
     {
         double val;
         readUBytesToBuf(&val, 8);
         return utility::LittleDouble(val);
     }
     template <class T>
-    inline double readValLittle(typename std::enable_if<std::is_same<T, double>::value>::type* = 0)
+    double readValLittle(typename std::enable_if<std::is_same<T, double>::value>::type* = 0)
     {return readDoubleLittle();}
 
     /** @brief Reads a double and swaps against big endianness depending on platform
@@ -466,34 +466,34 @@ public:
      *
      *  @return The value at the current address
      */
-    inline double readDoubleBig()
+    double readDoubleBig()
     {
         double val;
         readUBytesToBuf(&val, 8);
         return utility::BigDouble(val);
     }
     template <class T>
-    inline double readValBig(typename std::enable_if<std::is_same<T, double>::value>::type* = 0)
+    double readValBig(typename std::enable_if<std::is_same<T, double>::value>::type* = 0)
     {return readDoubleBig();}
 
     /** @brief Reads a bool and advances the current position
      *
      *  @return The value at the current address
      */
-    inline bool readBool()
+    bool readBool()
     {
         atUint8 val;
         readUBytesToBuf(&val, 1);
         return val != 0;
     }
     template <class T>
-    inline bool readVal(typename std::enable_if<std::is_same<T, bool>::value>::type* = 0)
+    bool readVal(typename std::enable_if<std::is_same<T, bool>::value>::type* = 0)
     {return readBool();}
     template <class T>
-    inline bool readValLittle(typename std::enable_if<std::is_same<T, bool>::value>::type* = 0)
+    bool readValLittle(typename std::enable_if<std::is_same<T, bool>::value>::type* = 0)
     {return readBool();}
     template <class T>
-    inline bool readValBig(typename std::enable_if<std::is_same<T, bool>::value>::type* = 0)
+    bool readValBig(typename std::enable_if<std::is_same<T, bool>::value>::type* = 0)
     {return readBool();}
 
     /** @brief Reads an atVec2f (8 bytes), swaps to endianness specified by setEndian depending on platform
@@ -501,24 +501,28 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atVec2f readVec2f()
+    atVec2f readVec2f()
     {
-        atVec2f val;
-        readUBytesToBuf(&val, 8);
+        simd_floats val; /* staging buffer for the raw floats; copied into s.simd below */
+        readUBytesToBuf(val.data(), 8);
         if (m_endian == Big)
         {
-            utility::BigFloat(val.vec[0]);
-            utility::BigFloat(val.vec[1]);
+            val[0] = utility::BigFloat(val[0]);
+            val[1] = utility::BigFloat(val[1]);
         }
         else
         {
-            utility::LittleFloat(val.vec[0]);
-            utility::LittleFloat(val.vec[1]);
+            val[0] = utility::LittleFloat(val[0]);
+            val[1] = utility::LittleFloat(val[1]);
         }
-        return val;
+        val[2] = 0.f; /* only 8 bytes were requested from the stream, so lanes 2-3 are set explicitly */
+        val[3] = 0.f;
+        atVec2f s;
+        s.simd.copy_from(val);
+        return s;
     }
     template <class T>
-    inline atVec2f readVal(typename std::enable_if<std::is_same<T, atVec2f>::value>::type* = 0)
+    atVec2f readVal(typename std::enable_if<std::is_same<T, atVec2f>::value>::type* = 0)
     {return readVec2f();}
 
     /** @brief Reads an atVec2f (8 bytes), swaps against little endianness depending on platform
@@ -526,16 +530,20 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atVec2f readVec2fLittle()
+    atVec2f readVec2fLittle()
     {
-        atVec2f val;
-        readUBytesToBuf(&val, 8);
-        utility::LittleFloat(val.vec[0]);
-        utility::LittleFloat(val.vec[1]);
-        return val;
+        simd_floats val; /* staging buffer for the raw floats; copied into s.simd below */
+        readUBytesToBuf(val.data(), 8);
+        val[0] = utility::LittleFloat(val[0]);
+        val[1] = utility::LittleFloat(val[1]);
+        val[2] = 0.f; /* only 8 bytes were requested from the stream, so lanes 2-3 are set explicitly */
+        val[3] = 0.f;
+        atVec2f s;
+        s.simd.copy_from(val);
+        return s;
     }
     template <class T>
-    inline atVec2f readValLittle(typename std::enable_if<std::is_same<T, atVec2f>::value>::type* = 0)
+    atVec2f readValLittle(typename std::enable_if<std::is_same<T, atVec2f>::value>::type* = 0)
     {return readVec2fLittle();}
 
     /** @brief Reads an atVec2f (8 bytes), swaps against big endianness depending on platform
@@ -543,16 +551,20 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atVec2f readVec2fBig()
+    atVec2f readVec2fBig()
     {
-        atVec2f val;
-        readUBytesToBuf(&val, 8);
-        utility::BigFloat(val.vec[0]);
-        utility::BigFloat(val.vec[1]);
-        return val;
+        simd_floats val; /* staging buffer for the raw floats; copied into s.simd below */
+        readUBytesToBuf(val.data(), 8);
+        val[0] = utility::BigFloat(val[0]);
+        val[1] = utility::BigFloat(val[1]);
+        val[2] = 0.f; /* only 8 bytes were requested from the stream, so lanes 2-3 are set explicitly */
+        val[3] = 0.f;
+        atVec2f s;
+        s.simd.copy_from(val);
+        return s;
     }
     template <class T>
-    inline atVec2f readValBig(typename std::enable_if<std::is_same<T, atVec2f>::value>::type* = 0)
+    atVec2f readValBig(typename std::enable_if<std::is_same<T, atVec2f>::value>::type* = 0)
     {return readVec2fBig();}
 
     /** @brief Reads an atVec3f (12 bytes), swaps to endianness specified by setEndian depending on platform
@@ -560,26 +572,29 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atVec3f readVec3f()
+    atVec3f readVec3f()
     {
-        atVec3f val;
-        readUBytesToBuf(&val, 12);
+        simd_floats val; /* staging buffer for the raw floats; copied into s.simd below */
+        readUBytesToBuf(val.data(), 12);
         if (m_endian == Big)
         {
-            utility::BigFloat(val.vec[0]);
-            utility::BigFloat(val.vec[1]);
-            utility::BigFloat(val.vec[2]);
+            val[0] = utility::BigFloat(val[0]);
+            val[1] = utility::BigFloat(val[1]);
+            val[2] = utility::BigFloat(val[2]);
         }
         else
         {
-            utility::LittleFloat(val.vec[0]);
-            utility::LittleFloat(val.vec[1]);
-            utility::LittleFloat(val.vec[2]);
+            val[0] = utility::LittleFloat(val[0]);
+            val[1] = utility::LittleFloat(val[1]);
+            val[2] = utility::LittleFloat(val[2]);
         }
-        return val;
+        val[3] = 0.f; /* only 12 bytes were requested from the stream, so lane 3 is set explicitly */
+        atVec3f s;
+        s.simd.copy_from(val);
+        return s;
     }
     template <class T>
-    inline atVec3f readVal(typename std::enable_if<std::is_same<T, atVec3f>::value>::type* = 0)
+    atVec3f readVal(typename std::enable_if<std::is_same<T, atVec3f>::value>::type* = 0)
     {return readVec3f();}
 
     /** @brief Reads an atVec3f (12 bytes), swaps against little endianness depending on platform
@@ -587,17 +602,20 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atVec3f readVec3fLittle()
+    atVec3f readVec3fLittle()
     {
-        atVec3f val;
-        readUBytesToBuf(&val, 12);
-        utility::LittleFloat(val.vec[0]);
-        utility::LittleFloat(val.vec[1]);
-        utility::LittleFloat(val.vec[2]);
-        return val;
+        simd_floats val; /* staging buffer for the raw floats; copied into s.simd below */
+        readUBytesToBuf(val.data(), 12);
+        val[0] = utility::LittleFloat(val[0]);
+        val[1] = utility::LittleFloat(val[1]);
+        val[2] = utility::LittleFloat(val[2]);
+        val[3] = 0.f; /* only 12 bytes were requested from the stream, so lane 3 is set explicitly */
+        atVec3f s;
+        s.simd.copy_from(val);
+        return s;
     }
     template <class T>
-    inline atVec3f readValLittle(typename std::enable_if<std::is_same<T, atVec3f>::value>::type* = 0)
+    atVec3f readValLittle(typename std::enable_if<std::is_same<T, atVec3f>::value>::type* = 0)
     {return readVec3fLittle();}
 
     /** @brief Reads an atVec3f (12 bytes), swaps against big endianness depending on platform
@@ -605,17 +623,20 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atVec3f readVec3fBig()
+    atVec3f readVec3fBig()
     {
-        atVec3f val;
-        readUBytesToBuf(&val, 12);
-        utility::BigFloat(val.vec[0]);
-        utility::BigFloat(val.vec[1]);
-        utility::BigFloat(val.vec[2]);
-        return val;
+        simd_floats val; /* staging buffer for the raw floats; copied into s.simd below */
+        readUBytesToBuf(val.data(), 12);
+        val[0] = utility::BigFloat(val[0]);
+        val[1] = utility::BigFloat(val[1]);
+        val[2] = utility::BigFloat(val[2]);
+        val[3] = 0.f; /* only 12 bytes were requested from the stream, so lane 3 is set explicitly */
+        atVec3f s;
+        s.simd.copy_from(val);
+        return s;
     }
     template <class T>
-    inline atVec3f readValBig(typename std::enable_if<std::is_same<T, atVec3f>::value>::type* = 0)
+    atVec3f readValBig(typename std::enable_if<std::is_same<T, atVec3f>::value>::type* = 0)
     {return readVec3fBig();}
 
     /** @brief Reads an atVec4f (16 bytes), swaps to endianness specified by setEndian depending on platform
@@ -623,28 +644,30 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atVec4f readVec4f()
+    atVec4f readVec4f()
     {
-        atVec4f val;
-        readUBytesToBuf(&val, 16);
+        simd_floats val; /* staging buffer for the raw floats; copied into s.simd below */
+        readUBytesToBuf(val.data(), 16);
         if (m_endian == Big)
         {
-            utility::BigFloat(val.vec[0]);
-            utility::BigFloat(val.vec[1]);
-            utility::BigFloat(val.vec[2]);
-            utility::BigFloat(val.vec[3]);
+            val[0] = utility::BigFloat(val[0]);
+            val[1] = utility::BigFloat(val[1]);
+            val[2] = utility::BigFloat(val[2]);
+            val[3] = utility::BigFloat(val[3]);
         }
         else
         {
-            utility::LittleFloat(val.vec[0]);
-            utility::LittleFloat(val.vec[1]);
-            utility::LittleFloat(val.vec[2]);
-            utility::LittleFloat(val.vec[3]);
+            val[0] = utility::LittleFloat(val[0]);
+            val[1] = utility::LittleFloat(val[1]);
+            val[2] = utility::LittleFloat(val[2]);
+            val[3] = utility::LittleFloat(val[3]);
         }
-        return val;
+        atVec4f s;
+        s.simd.copy_from(val);
+        return s;
     }
     template <class T>
-    inline atVec4f readVal(typename std::enable_if<std::is_same<T, atVec4f>::value>::type* = 0)
+    atVec4f readVal(typename std::enable_if<std::is_same<T, atVec4f>::value>::type* = 0)
     {return readVec4f();}
 
     /** @brief Reads an atVec4f (16 bytes), swaps against little endianness depending on platform
@@ -652,18 +675,20 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atVec4f readVec4fLittle()
+    atVec4f readVec4fLittle()
     {
-        atVec4f val;
-        readUBytesToBuf(&val, 16);
-        utility::LittleFloat(val.vec[0]);
-        utility::LittleFloat(val.vec[1]);
-        utility::LittleFloat(val.vec[2]);
-        utility::LittleFloat(val.vec[3]);
-        return val;
+        simd_floats val; // local buffer that receives the 4 floats (16 bytes) read on the next line
+        readUBytesToBuf(val.data(), 16);
+        val[0] = utility::LittleFloat(val[0]); // helper result is assigned back element by element
+        val[1] = utility::LittleFloat(val[1]);
+        val[2] = utility::LittleFloat(val[2]);
+        val[3] = utility::LittleFloat(val[3]);
+        atVec4f s;
+        s.simd.copy_from(val); // hand the converted elements to the result's simd member
+        return s;
     }
     template <class T>
-    inline atVec4f readValLittle(typename std::enable_if<std::is_same<T, atVec4f>::value>::type* = 0)
+    atVec4f readValLittle(typename std::enable_if<std::is_same<T, atVec4f>::value>::type* = 0)
     {return readVec4fLittle();}
 
     /** @brief Reads an atVec4f (16 bytes), swaps against big endianness depending on platform
@@ -671,18 +696,20 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atVec4f readVec4fBig()
+    atVec4f readVec4fBig()
     {
-        atVec4f val;
-        readUBytesToBuf(&val, 16);
-        utility::BigFloat(val.vec[0]);
-        utility::BigFloat(val.vec[1]);
-        utility::BigFloat(val.vec[2]);
-        utility::BigFloat(val.vec[3]);
-        return val;
+        simd_floats val; // local buffer that receives the 4 floats (16 bytes) read on the next line
+        readUBytesToBuf(val.data(), 16);
+        val[0] = utility::BigFloat(val[0]); // helper result is assigned back element by element
+        val[1] = utility::BigFloat(val[1]);
+        val[2] = utility::BigFloat(val[2]);
+        val[3] = utility::BigFloat(val[3]);
+        atVec4f s;
+        s.simd.copy_from(val); // hand the converted elements to the result's simd member
+        return s;
     }
     template <class T>
-    inline atVec4f readValBig(typename std::enable_if<std::is_same<T, atVec4f>::value>::type* = 0)
+    atVec4f readValBig(typename std::enable_if<std::is_same<T, atVec4f>::value>::type* = 0)
     {return readVec4fBig();}
 
     /** @brief Reads an atVec2d (16 bytes), swaps to endianness specified by setEndian depending on platform
@@ -690,24 +717,28 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atVec2d readVec2d()
+    atVec2d readVec2d()
     {
-        atVec2d val;
-        readUBytesToBuf(&val, 16);
+        simd_doubles val; // local buffer; only its first 16 bytes (2 doubles) are filled from the stream
+        readUBytesToBuf(val.data(), 16);
         if (m_endian == Big)
         {
-            utility::BigDouble(val.vec[0]);
-            utility::BigDouble(val.vec[1]);
+            val[0] = utility::BigDouble(val[0]); // helper result is assigned back element by element
+            val[1] = utility::BigDouble(val[1]);
         }
         else
         {
-            utility::LittleDouble(val.vec[0]);
-            utility::LittleDouble(val.vec[1]);
+            val[0] = utility::LittleDouble(val[0]);
+            val[1] = utility::LittleDouble(val[1]);
         }
-        return val;
+        val[2] = 0.0; // elements 2 and 3 were not read from the stream; give them a defined value
+        val[3] = 0.0;
+        atVec2d s;
+        s.simd.copy_from(val); // hand the converted elements to the result's simd member
+        return s;
     }
     template <class T>
-    inline atVec2d readVal(typename std::enable_if<std::is_same<T, atVec2d>::value>::type* = 0)
+    atVec2d readVal(typename std::enable_if<std::is_same<T, atVec2d>::value>::type* = 0)
     {return readVec2d();}
 
     /** @brief Reads an atVec2d (16 bytes), swaps against little endianness depending on platform
@@ -715,16 +746,20 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atVec2d readVec2dLittle()
+    atVec2d readVec2dLittle()
     {
-        atVec2d val;
-        readUBytesToBuf(&val, 16);
-        utility::LittleDouble(val.vec[0]);
-        utility::LittleDouble(val.vec[1]);
-        return val;
+        simd_doubles val; // local buffer; only its first 16 bytes (2 doubles) are filled from the stream
+        readUBytesToBuf(val.data(), 16);
+        val[0] = utility::LittleDouble(val[0]); // helper result is assigned back element by element
+        val[1] = utility::LittleDouble(val[1]);
+        val[2] = 0.0; // elements 2 and 3 were not read from the stream; give them a defined value
+        val[3] = 0.0;
+        atVec2d s;
+        s.simd.copy_from(val); // hand the converted elements to the result's simd member
+        return s;
     }
     template <class T>
-    inline atVec2d readValLittle(typename std::enable_if<std::is_same<T, atVec2d>::value>::type* = 0)
+    atVec2d readValLittle(typename std::enable_if<std::is_same<T, atVec2d>::value>::type* = 0)
     {return readVec2dLittle();}
 
     /** @brief Reads an atVec2d (16 bytes), swaps against big endianness depending on platform
@@ -732,16 +767,20 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atVec2d readVec2dBig()
+    atVec2d readVec2dBig()
     {
-        atVec2d val;
-        readUBytesToBuf(&val, 16);
-        utility::BigDouble(val.vec[0]);
-        utility::BigDouble(val.vec[1]);
-        return val;
+        simd_doubles val; // local buffer; only its first 16 bytes (2 doubles) are filled from the stream
+        readUBytesToBuf(val.data(), 16);
+        val[0] = utility::BigDouble(val[0]); // helper result is assigned back element by element
+        val[1] = utility::BigDouble(val[1]);
+        val[2] = 0.0; // elements 2 and 3 were not read from the stream; give them a defined value
+        val[3] = 0.0;
+        atVec2d s;
+        s.simd.copy_from(val); // hand the converted elements to the result's simd member
+        return s;
     }
     template <class T>
-    inline atVec2d readValBig(typename std::enable_if<std::is_same<T, atVec2d>::value>::type* = 0)
+    atVec2d readValBig(typename std::enable_if<std::is_same<T, atVec2d>::value>::type* = 0)
     {return readVec2dBig();}
 
     /** @brief Reads an atVec3d (24 bytes), swaps to endianness specified by setEndian depending on platform
@@ -749,26 +788,29 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atVec3d readVec3d()
+    atVec3d readVec3d()
     {
-        atVec3d val;
-        readUBytesToBuf(&val, 24);
+        simd_doubles val; // local buffer; only its first 24 bytes (3 doubles) are filled from the stream
+        readUBytesToBuf(val.data(), 24);
         if (m_endian == Big)
         {
-            utility::BigDouble(val.vec[0]);
-            utility::BigDouble(val.vec[1]);
-            utility::BigDouble(val.vec[2]);
+            val[0] = utility::BigDouble(val[0]); // helper result is assigned back element by element
+            val[1] = utility::BigDouble(val[1]);
+            val[2] = utility::BigDouble(val[2]);
         }
         else
         {
-            utility::LittleDouble(val.vec[0]);
-            utility::LittleDouble(val.vec[1]);
-            utility::LittleDouble(val.vec[2]);
+            val[0] = utility::LittleDouble(val[0]);
+            val[1] = utility::LittleDouble(val[1]);
+            val[2] = utility::LittleDouble(val[2]);
         }
-        return val;
+        val[3] = 0.0; // element 3 was not read from the stream; give it a defined value
+        atVec3d s;
+        s.simd.copy_from(val); // hand the converted elements to the result's simd member
+        return s;
     }
     template <class T>
-    inline atVec3d readVal(typename std::enable_if<std::is_same<T, atVec3d>::value>::type* = 0)
+    atVec3d readVal(typename std::enable_if<std::is_same<T, atVec3d>::value>::type* = 0)
     {return readVec3d();}
 
     /** @brief Reads an atVec3d (24 bytes), swaps against little endianness depending on platform
@@ -776,17 +818,20 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atVec3d readVec3dLittle()
+    atVec3d readVec3dLittle()
     {
-        atVec3d val;
-        readUBytesToBuf(&val, 24);
-        utility::LittleDouble(val.vec[0]);
-        utility::LittleDouble(val.vec[1]);
-        utility::LittleDouble(val.vec[2]);
-        return val;
+        simd_doubles val; // local buffer; only its first 24 bytes (3 doubles) are filled from the stream
+        readUBytesToBuf(val.data(), 24);
+        val[0] = utility::LittleDouble(val[0]); // helper result is assigned back element by element
+        val[1] = utility::LittleDouble(val[1]);
+        val[2] = utility::LittleDouble(val[2]);
+        val[3] = 0.0; // element 3 was not read from the stream; give it a defined value
+        atVec3d s;
+        s.simd.copy_from(val); // hand the converted elements to the result's simd member
+        return s;
     }
     template <class T>
-    inline atVec3d readValLittle(typename std::enable_if<std::is_same<T, atVec3d>::value>::type* = 0)
+    atVec3d readValLittle(typename std::enable_if<std::is_same<T, atVec3d>::value>::type* = 0)
     {return readVec3dLittle();}
 
     /** @brief Reads an atVec3d (24 bytes), swaps against big endianness depending on platform
@@ -794,17 +839,20 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atVec3d readVec3dBig()
+    atVec3d readVec3dBig()
     {
-        atVec3d val;
-        readUBytesToBuf(&val, 24);
-        utility::BigDouble(val.vec[0]);
-        utility::BigDouble(val.vec[1]);
-        utility::BigDouble(val.vec[2]);
-        return val;
+        simd_doubles val; // local buffer; only its first 24 bytes (3 doubles) are filled from the stream
+        readUBytesToBuf(val.data(), 24);
+        val[0] = utility::BigDouble(val[0]); // helper result is assigned back element by element
+        val[1] = utility::BigDouble(val[1]);
+        val[2] = utility::BigDouble(val[2]);
+        val[3] = 0.0; // element 3 was not read from the stream; give it a defined value
+        atVec3d s;
+        s.simd.copy_from(val); // hand the converted elements to the result's simd member
+        return s;
     }
     template <class T>
-    inline atVec3d readValBig(typename std::enable_if<std::is_same<T, atVec3d>::value>::type* = 0)
+    atVec3d readValBig(typename std::enable_if<std::is_same<T, atVec3d>::value>::type* = 0)
     {return readVec3dBig();}
 
     /** @brief Reads an atVec4d (32 bytes), swaps to endianness specified by setEndian depending on platform
@@ -812,28 +860,30 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atVec4d readVec4d()
+    atVec4d readVec4d()
     {
-        atVec4d val;
-        readUBytesToBuf(&val, 32);
+        simd_doubles val; // local buffer that receives the 4 doubles (32 bytes) read on the next line
+        readUBytesToBuf(val.data(), 32);
         if (m_endian == Big)
         {
-            utility::BigDouble(val.vec[0]);
-            utility::BigDouble(val.vec[1]);
-            utility::BigDouble(val.vec[2]);
-            utility::BigDouble(val.vec[3]);
+            val[0] = utility::BigDouble(val[0]); // helper result is assigned back element by element
+            val[1] = utility::BigDouble(val[1]);
+            val[2] = utility::BigDouble(val[2]);
+            val[3] = utility::BigDouble(val[3]);
         }
         else
         {
-            utility::LittleDouble(val.vec[0]);
-            utility::LittleDouble(val.vec[1]);
-            utility::LittleDouble(val.vec[2]);
-            utility::LittleDouble(val.vec[3]);
+            val[0] = utility::LittleDouble(val[0]);
+            val[1] = utility::LittleDouble(val[1]);
+            val[2] = utility::LittleDouble(val[2]);
+            val[3] = utility::LittleDouble(val[3]);
         }
-        return val;
+        atVec4d s;
+        s.simd.copy_from(val); // hand the converted elements to the result's simd member
+        return s;
     }
     template <class T>
-    inline atVec4d readVal(typename std::enable_if<std::is_same<T, atVec4d>::value>::type* = 0)
+    atVec4d readVal(typename std::enable_if<std::is_same<T, atVec4d>::value>::type* = 0)
     {return readVec4d();}
 
     /** @brief Reads an atVec4d (32 bytes), swaps against little endianness depending on platform
@@ -841,18 +891,20 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atVec4d readVec4dLittle()
+    atVec4d readVec4dLittle()
     {
-        atVec4d val;
-        readUBytesToBuf(&val, 32);
-        utility::LittleDouble(val.vec[0]);
-        utility::LittleDouble(val.vec[1]);
-        utility::LittleDouble(val.vec[2]);
-        utility::LittleDouble(val.vec[3]);
-        return val;
+        simd_doubles val; // local buffer that receives the 4 doubles (32 bytes) read on the next line
+        readUBytesToBuf(val.data(), 32);
+        val[0] = utility::LittleDouble(val[0]); // helper result is assigned back element by element
+        val[1] = utility::LittleDouble(val[1]);
+        val[2] = utility::LittleDouble(val[2]);
+        val[3] = utility::LittleDouble(val[3]);
+        atVec4d s;
+        s.simd.copy_from(val); // hand the converted elements to the result's simd member
+        return s;
     }
     template <class T>
-    inline atVec4d readValLittle(typename std::enable_if<std::is_same<T, atVec4d>::value>::type* = 0)
+    atVec4d readValLittle(typename std::enable_if<std::is_same<T, atVec4d>::value>::type* = 0)
     {return readVec4dLittle();}
 
     /** @brief Reads an atVec4d (32 bytes), swaps against big endianness depending on platform
@@ -860,18 +912,20 @@ public:
      *
      *  @return The value at the current address
      */
-    inline atVec4d readVec4dBig()
+    atVec4d readVec4dBig()
     {
-        atVec4d val;
-        readUBytesToBuf(&val, 32);
-        utility::BigDouble(val.vec[0]);
-        utility::BigDouble(val.vec[1]);
-        utility::BigDouble(val.vec[2]);
-        utility::BigDouble(val.vec[3]);
-        return val;
+        simd_doubles val; // local buffer that receives the 4 doubles (32 bytes) read on the next line
+        readUBytesToBuf(val.data(), 32);
+        val[0] = utility::BigDouble(val[0]); // helper result is assigned back element by element
+        val[1] = utility::BigDouble(val[1]);
+        val[2] = utility::BigDouble(val[2]);
+        val[3] = utility::BigDouble(val[3]);
+        atVec4d s;
+        s.simd.copy_from(val); // hand the converted elements to the result's simd member
+        return s;
     }
     template <class T>
-    inline atVec4d readValBig(typename std::enable_if<std::is_same<T, atVec4d>::value>::type* = 0)
+    atVec4d readValBig(typename std::enable_if<std::is_same<T, atVec4d>::value>::type* = 0)
     {return readVec4dBig();}
 
     /** @brief Reads a string and advances the position in the file
@@ -879,7 +933,7 @@ public:
      *  @param fixedLen If non-negative, this is a fixed-length string read
      *  @return The read string
      */
-    inline std::string readString(atInt32 fixedLen = -1, bool doSeek=true)
+    std::string readString(atInt32 fixedLen = -1, bool doSeek=true)
     {
         if (fixedLen == 0)
             return std::string();
@@ -903,7 +957,7 @@ public:
         return ret;
     }
     template <class T>
-    inline std::string readVal(typename std::enable_if<std::is_same<T, std::string>::value>::type* = 0)
+    std::string readVal(typename std::enable_if<std::is_same<T, std::string>::value>::type* = 0)
     {return readString();}
 
     /** @brief Reads a wstring and advances the position in the file
@@ -911,7 +965,7 @@ public:
      *  @param fixedLen If non-negative, this is a fixed-length string read
      *  @return The read wstring
      */
-    inline std::wstring readWString(atInt32 fixedLen = -1, bool doSeek=true)
+    std::wstring readWString(atInt32 fixedLen = -1, bool doSeek=true)
     {
         if (fixedLen == 0)
             return std::wstring();
@@ -936,7 +990,7 @@ public:
         return ret;
     }
     template <class T>
-    inline std::wstring readVal(typename std::enable_if<std::is_same<T, std::wstring>::value>::type* = 0)
+    std::wstring readVal(typename std::enable_if<std::is_same<T, std::wstring>::value>::type* = 0)
     {return readWString();}
 
     /** @brief Reads a wstring assuming little-endian characters
@@ -945,7 +999,7 @@ public:
      *  @param fixedLen If non-negative, this is a fixed-length string read
      *  @return The read wstring
      */
-    inline std::wstring readWStringLittle(atInt32 fixedLen = -1, bool doSeek=true)
+    std::wstring readWStringLittle(atInt32 fixedLen = -1, bool doSeek=true)
     {
         if (fixedLen == 0)
             return std::wstring();
@@ -970,7 +1024,7 @@ public:
         return ret;
     }
     template <class T>
-    inline std::wstring readValLittle(typename std::enable_if<std::is_same<T, std::wstring>::value>::type* = 0)
+    std::wstring readValLittle(typename std::enable_if<std::is_same<T, std::wstring>::value>::type* = 0)
     {return readWStringLittle();}
 
     /** @brief Reads a wstring assuming big-endian characters
@@ -979,7 +1033,7 @@ public:
      *  @param fixedLen If non-negative, this is a fixed-length string read
      *  @return The read wstring
      */
-    inline std::wstring readWStringBig(atInt32 fixedLen = -1, bool doSeek = true)
+    std::wstring readWStringBig(atInt32 fixedLen = -1, bool doSeek = true)
     {
         if (fixedLen == 0)
             return std::wstring();
@@ -1003,7 +1057,7 @@ public:
         return ret;
     }
     template <class T>
-    inline std::wstring readValBig(typename std::enable_if<std::is_same<T, std::wstring>::value>::type* = 0)
+    std::wstring readValBig(typename std::enable_if<std::is_same<T, std::wstring>::value>::type* = 0)
     {return readWStringBig();}
 
     /** @brief Reads a u16string assuming big-endian characters
@@ -1012,7 +1066,7 @@ public:
      *  @param fixedLen If non-negative, this is a fixed-length string read
      *  @return The read wstring
      */
-    inline std::u16string readU16StringBig(atInt32 fixedLen = -1, bool doSeek = true)
+    std::u16string readU16StringBig(atInt32 fixedLen = -1, bool doSeek = true)
     {
         if (fixedLen == 0)
             return std::u16string();
@@ -1036,7 +1090,7 @@ public:
         return ret;
     }
     template <class T>
-    inline std::u16string readValBig(typename std::enable_if<std::is_same<T, std::u16string>::value>::type* = 0)
+    std::u16string readValBig(typename std::enable_if<std::is_same<T, std::u16string>::value>::type* = 0)
     {return readU16StringBig();}
 
     /** @brief Reads a u32string assuming big-endian characters
@@ -1045,7 +1099,7 @@ public:
      *  @param fixedLen If non-negative, this is a fixed-length string read
      *  @return The read wstring
      */
-    inline std::u32string readU32StringBig(atInt32 fixedLen = -1, bool doSeek = true)
+    std::u32string readU32StringBig(atInt32 fixedLen = -1, bool doSeek = true)
     {
         if (fixedLen == 0)
             return std::u32string();
@@ -1069,7 +1123,7 @@ public:
         return ret;
     }
     template <class T>
-    inline std::u32string readValBig(typename std::enable_if<std::is_same<T, std::u32string>::value>::type* = 0)
+    std::u32string readValBig(typename std::enable_if<std::is_same<T, std::u32string>::value>::type* = 0)
     {return readU32StringBig();}
 
     /** @brief Performs automatic std::vector enumeration reads using numeric type T
diff --git a/include/athena/IStreamWriter.hpp b/include/athena/IStreamWriter.hpp
index 6f6bf56..54e8a38 100644
--- a/include/athena/IStreamWriter.hpp
+++ b/include/athena/IStreamWriter.hpp
@@ -11,7 +11,8 @@ namespace athena::io
 class IStreamWriter : public IStream
 {
 public:
-    virtual ~IStreamWriter() {}
+    virtual ~IStreamWriter() = default;
+
     /** @brief Sets the buffers position relative to the specified position.<br />
      *         It seeks relative to the current position by default.
      *  @param position where in the buffer to seek
@@ -21,11 +22,11 @@ public:
 
     /** @brief Sets the buffers position relative to the next 32-byte aligned position.<br />
      */
-    inline void seekAlign32() {seek(ROUND_UP_32(position()), SeekOrigin::Begin);}
+    void seekAlign32() {seek(ROUND_UP_32(position()), SeekOrigin::Begin);}
 
     /** @brief Writes zero up to specified absolute offset.<br />
      */
-    inline void writeZeroTo(atInt64 pos)
+    void writeZeroTo(atInt64 pos)
     {
         atInt64 delta = pos - position();
         if (delta <= 0)
@@ -38,7 +39,7 @@ public:
      *
      *  @return True if at end; False otherwise.
      */
-    inline bool atEnd() const {return position() >= length();}
+    bool atEnd() const {return position() >= length();}
 
     /** @brief Returns the current position in the stream.
      *
@@ -55,18 +56,18 @@ public:
     /** @brief Writes a byte at the current position and advances the position by one byte.
      *  @param val The value to write
      */
-    inline void writeUByte(atUint8 val) {writeUBytes(&val, 1);}
-    inline void writeVal(atUint8 val) {writeUByte(val);}
-    inline void writeValLittle(atUint8 val) {writeUByte(val);}
-    inline void writeValBig(atUint8 val) {writeUByte(val);}
+    void writeUByte(atUint8 val) {writeUBytes(&val, 1);}
+    void writeVal(atUint8 val) {writeUByte(val);}
+    void writeValLittle(atUint8 val) {writeUByte(val);}
+    void writeValBig(atUint8 val) {writeUByte(val);}
 
     /** @brief Writes a byte at the current position and advances the position by one byte.
      *  @param val The value to write
      */
-    inline void writeByte(atInt8 val) {writeUByte(val);}
-    inline void writeVal(atInt8 val) {writeByte(val);}
-    inline void writeValLittle(atInt8 val) {writeByte(val);}
-    inline void writeValBig(atInt8 val) {writeByte(val);}
+    void writeByte(atInt8 val) {writeUByte(val);}
+    void writeVal(atInt8 val) {writeByte(val);}
+    void writeValLittle(atInt8 val) {writeByte(val);}
+    void writeValBig(atInt8 val) {writeByte(val);}
 
     /** @brief Writes the given buffer with the specified length, buffers can be bigger than the length
      *  however it's undefined behavior to try and write a buffer which is smaller than the given length.
@@ -82,14 +83,14 @@ public:
      *  @param data The buffer to write
      *  @param length The amount to write
      */
-    inline void writeBytes(const void* data, atUint64 len) {writeUBytes((atUint8*)data, len);}
+    void writeBytes(const void* data, atUint64 len) {writeUBytes((atUint8*)data, len);}
 
     /** @brief Writes an Int16 to the buffer and advances the buffer.
      *         It also swaps the bytes depending on the platform and Stream settings.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeInt16(atInt16 val)
+    void writeInt16(atInt16 val)
     {
         if (m_endian == Big)
             utility::BigInt16(val);
@@ -97,62 +98,62 @@ public:
             utility::LittleInt16(val);
         writeUBytes((atUint8*)&val, 2);
     }
-    inline void writeVal(atInt16 val) {writeInt16(val);}
+    void writeVal(atInt16 val) {writeInt16(val);}
 
     /** @brief Writes an Int16 to the buffer and advances the buffer.
      *         It also swaps the bytes against little depending on the platform.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeInt16Little(atInt16 val)
+    void writeInt16Little(atInt16 val)
     {
         utility::LittleInt16(val);
         writeUBytes((atUint8*)&val, 2);
     }
-    inline void writeValLittle(atInt16 val) {writeInt16Little(val);}
+    void writeValLittle(atInt16 val) {writeInt16Little(val);}
 
     /** @brief Writes an Int16 to the buffer and advances the buffer.
      *         It also swaps the bytes against big depending on the platform.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeInt16Big(atInt16 val)
+    void writeInt16Big(atInt16 val)
     {
         utility::BigInt16(val);
         writeUBytes((atUint8*)&val, 2);
     }
-    inline void writeValBig(atInt16 val) {writeInt16Big(val);}
+    void writeValBig(atInt16 val) {writeInt16Big(val);}
 
     /** @brief Writes an Uint16 to the buffer and advances the buffer.
      *         It also swaps the bytes depending on the platform and Stream settings
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeUint16(atUint16 val) {writeInt16(val);}
-    inline void writeVal(atUint16 val) {writeUint16(val);}
+    void writeUint16(atUint16 val) {writeInt16(val);}
+    void writeVal(atUint16 val) {writeUint16(val);}
 
     /** @brief Writes an Uint16 to the buffer and advances the buffer.
      *         It also swaps the bytes against little depending on the platform
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeUint16Little(atUint16 val) {writeInt16Little(val);}
-    inline void writeValLittle(atUint16 val) {writeUint16Little(val);}
+    void writeUint16Little(atUint16 val) {writeInt16Little(val);}
+    void writeValLittle(atUint16 val) {writeUint16Little(val);}
 
     /** @brief Writes an Uint16 to the buffer and advances the buffer.
      *         It also swaps the bytes against big depending on the platform
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeUint16Big(atUint16 val) {writeInt16Big(val);}
-    inline void writeValBig(atUint16 val) {writeUint16Big(val);}
+    void writeUint16Big(atUint16 val) {writeInt16Big(val);}
+    void writeValBig(atUint16 val) {writeUint16Big(val);}
 
     /** @brief Writes an Int32 to the buffer and advances the buffer.
      *         It also swaps the bytes depending on the platform and Stream settings.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeInt32(atInt32 val)
+    void writeInt32(atInt32 val)
     {
         if (m_endian == Big)
             utility::BigInt32(val);
@@ -160,62 +161,62 @@ public:
             utility::LittleInt32(val);
         writeUBytes((atUint8*)&val, 4);
     }
-    inline void writeVal(atInt32 val) {writeInt32(val);}
+    void writeVal(atInt32 val) {writeInt32(val);}
 
     /** @brief Writes an Int32 to the buffer and advances the buffer.
      *         It also swaps the bytes against little depending on the platform.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeInt32Little(atInt32 val)
+    void writeInt32Little(atInt32 val)
     {
         utility::LittleInt32(val);
         writeUBytes((atUint8*)&val, 4);
     }
-    inline void writeValLittle(atInt32 val) {writeInt32Little(val);}
+    void writeValLittle(atInt32 val) {writeInt32Little(val);}
 
     /** @brief Writes an Int32 to the buffer and advances the buffer.
      *         It also swaps the bytes against big depending on the platform.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeInt32Big(atInt32 val)
+    void writeInt32Big(atInt32 val)
     {
         utility::BigInt32(val);
         writeUBytes((atUint8*)&val, 4);
     }
-    inline void writeValBig(atInt32 val) {writeInt32Big(val);}
+    void writeValBig(atInt32 val) {writeInt32Big(val);}
 
     /** @brief Writes an Uint32 to the buffer and advances the buffer.
      *         It also swaps the bytes depending on the platform and Stream settings.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeUint32(atUint32 val) {writeInt32(val);}
-    inline void writeVal(atUint32 val) {writeUint32(val);}
+    void writeUint32(atUint32 val) {writeInt32(val);}
+    void writeVal(atUint32 val) {writeUint32(val);}
 
     /** @brief Writes an Uint32 to the buffer and advances the buffer.
      *         It also swaps the bytes against little depending on the platform.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeUint32Little(atUint32 val) {writeInt32Little(val);}
-    inline void writeValLittle(atUint32 val) {writeUint32Little(val);}
+    void writeUint32Little(atUint32 val) {writeInt32Little(val);}
+    void writeValLittle(atUint32 val) {writeUint32Little(val);}
 
     /** @brief Writes an Uint32 to the buffer and advances the buffer.
      *         It also swaps the bytes against big depending on the platform.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeUint32Big(atUint32 val) {writeInt32Big(val);}
-    inline void writeValBig(atUint32 val) {writeUint32Big(val);}
+    void writeUint32Big(atUint32 val) {writeInt32Big(val);}
+    void writeValBig(atUint32 val) {writeUint32Big(val);}
 
     /** @brief Writes an Int64 to the buffer and advances the buffer.
      *         It also swaps the bytes depending on the platform and Stream settings.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeInt64(atInt64 val)
+    void writeInt64(atInt64 val)
     {
         if (m_endian == Big)
             utility::BigInt64(val);
@@ -223,101 +224,101 @@ public:
             utility::LittleInt64(val);
         writeUBytes((atUint8*)&val, 8);
     }
-    inline void writeVal(atInt64 val) {writeInt64(val);}
+    void writeVal(atInt64 val) {writeInt64(val);}
 
     /** @brief Writes an Int64 to the buffer and advances the buffer.
      *         It also swaps the bytes against little depending on the platform.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeInt64Little(atInt64 val)
+    void writeInt64Little(atInt64 val)
     {
         utility::LittleInt64(val);
         writeUBytes((atUint8*)&val, 8);
     }
-    inline void writeValLittle(atInt64 val) {writeInt64Little(val);}
+    void writeValLittle(atInt64 val) {writeInt64Little(val);}
 
     /** @brief Writes an Int64 to the buffer and advances the buffer.
      *         It also swaps the bytes against big depending on the platform.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeInt64Big(atInt64 val)
+    void writeInt64Big(atInt64 val)
     {
         utility::BigInt64(val);
         writeUBytes((atUint8*)&val, 8);
     }
-    inline void writeValBig(atInt64 val) {writeInt64Big(val);}
+    void writeValBig(atInt64 val) {writeInt64Big(val);}
 
     /** @brief Writes an Uint64 to the buffer and advances the buffer.
      *         It also swaps the bytes depending on the platform and Stream settings.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeUint64(atUint64 val) {writeInt64(val);}
-    inline void writeVal(atUint64 val) {writeUint64(val);}
+    void writeUint64(atUint64 val) {writeInt64(val);}
+    void writeVal(atUint64 val) {writeUint64(val);}
 
     /** @brief Writes an Uint64 to the buffer and advances the buffer.
      *         It also swaps the bytes against little depending on the platform.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeUint64Little(atUint64 val) {writeInt64Little(val);}
-    inline void writeValLittle(atUint64 val) {writeUint64Little(val);}
+    void writeUint64Little(atUint64 val) {writeInt64Little(val);}
+    void writeValLittle(atUint64 val) {writeUint64Little(val);}
 
     /** @brief Writes an Uint64 to the buffer and advances the buffer.
      *         It also swaps the bytes against big depending on the platform.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeUint64Big(atUint64 val) {writeInt64Big(val);}
-    inline void writeValBig(atUint64 val) {writeUint64Big(val);}
+    void writeUint64Big(atUint64 val) {writeInt64Big(val);}
+    void writeValBig(atUint64 val) {writeUint64Big(val);}
 
     /** @brief Writes an float to the buffer and advances the buffer.
      *         It also swaps the bytes depending on the platform and Stream settings.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeFloat(float val)
+    void writeFloat(float val)
     {
         if (m_endian == Big)
-            utility::BigFloat(val);
+            val = utility::BigFloat(val);
         else
-            utility::LittleFloat(val);
+            val = utility::LittleFloat(val);
         writeUBytes((atUint8*)&val, 4);
     }
-    inline void writeVal(float val) {writeFloat(val);}
+    void writeVal(float val) {writeFloat(val);}
 
     /** @brief Writes an float to the buffer and advances the buffer.
      *         It also swaps the bytes against little depending on the platform.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeFloatLittle(float val)
+    void writeFloatLittle(float val)
     {
-        utility::LittleFloat(val);
+        val = utility::LittleFloat(val); // keep the converted value, matching writeFloat/writeFloatBig in this patch
         writeUBytes((atUint8*)&val, 4);
     }
-    inline void writeValLittle(float val) {writeFloatLittle(val);}
+    void writeValLittle(float val) {writeFloatLittle(val);}
 
     /** @brief Writes an float to the buffer and advances the buffer.
      *         It also swaps the bytes against big depending on the platform.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeFloatBig(float val)
+    void writeFloatBig(float val)
     {
-        utility::BigFloat(val);
+        val = utility::BigFloat(val);
         writeUBytes((atUint8*)&val, 4);
     }
-    inline void writeValBig(float val) {writeFloatBig(val);}
+    void writeValBig(float val) {writeFloatBig(val);}
 
     /** @brief Writes an double to the buffer and advances the buffer.
      *         It also swaps the bytes depending on the platform and Stream settings.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeDouble(double val)
+    void writeDouble(double val)
     {
         if (m_endian == Big)
             utility::BigDouble(val);
@@ -325,365 +326,365 @@ public:
             utility::LittleDouble(val);
         writeUBytes((atUint8*)&val, 8);
     }
-    inline void writeVal(double val) {writeDouble(val);}
+    void writeVal(double val) {writeDouble(val);}
 
     /** @brief Writes an double to the buffer and advances the buffer.
      *         It also swaps the bytes against little depending on the platform.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeDoubleLittle(double val)
+    void writeDoubleLittle(double val)
     {
         utility::LittleDouble(val);
         writeUBytes((atUint8*)&val, 8);
     }
-    inline void writeValLittle(double val) {writeDoubleLittle(val);}
+    void writeValLittle(double val) {writeDoubleLittle(val);}
 
     /** @brief Writes an double to the buffer and advances the buffer.
      *         It also swaps the bytes against big depending on the platform.
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeDoubleBig(double val)
+    void writeDoubleBig(double val)
     {
         utility::BigDouble(val);
         writeUBytes((atUint8*)&val, 8);
     }
-    inline void writeValBig(double val) {writeDoubleBig(val);}
+    void writeValBig(double val) {writeDoubleBig(val);}
 
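-    /** @brief Writes an bool to the buffer and advances the buffer.
-     *         It also swaps the bytes depending on the platform and Stream settings.
+    /** @brief Writes a bool to the buffer and advances the buffer.
+     *         A single byte is written, so no byte swapping is performed.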
      *
      *  @param val The value to write to the buffer
      */
-    inline void writeBool(bool val) {writeUBytes((atUint8*)&val, 1);}
-    inline void writeVal(bool val) {writeBool(val);}
-    inline void writeValLittle(bool val) {writeBool(val);}
-    inline void writeValBig(bool val) {writeBool(val);}
+    void writeBool(bool val) {writeUBytes((atUint8*)&val, 1);}
+    void writeVal(bool val) {writeBool(val);}
+    void writeValLittle(bool val) {writeBool(val);}
+    void writeValBig(bool val) {writeBool(val);}
 
     /** @brief Writes an atVec2f (8 bytes) to the buffer and advances the buffer.
      *         It also swaps the bytes depending on the platform and Stream settings.
      *
      * @param vec The value to write to the buffer
      */
-    inline void writeVec2f(const atVec2f& vec)
+    void writeVec2f(const atVec2f& vec)
     {
-        atVec2f tmp = vec;
+        simd_floats tmp(vec.simd);
         if (m_endian == Big)
         {
-            utility::BigFloat(tmp.vec[0]);
-            utility::BigFloat(tmp.vec[1]);
+            tmp[0] = utility::BigFloat(tmp[0]);
+            tmp[1] = utility::BigFloat(tmp[1]);
         }
         else
         {
-            utility::LittleFloat(tmp.vec[0]);
-            utility::LittleFloat(tmp.vec[1]);
+            tmp[0] = utility::LittleFloat(tmp[0]);
+            tmp[1] = utility::LittleFloat(tmp[1]);
         }
-        writeUBytes((atUint8*)&tmp, 8);
+        writeUBytes((atUint8*)tmp.data(), 8);
     }
-    inline void writeVal(const atVec2f& val) {writeVec2f(val);}
+    void writeVal(const atVec2f& val) {writeVec2f(val);}
 
     /** @brief Writes an atVec2f (8 bytes) to the buffer and advances the buffer.
      *         It also swaps the bytes against little depending on the platform.
      *
      * @param vec The value to write to the buffer
      */
-    inline void writeVec2fLittle(const atVec2f& vec)
+    void writeVec2fLittle(const atVec2f& vec)
     {
-        atVec2f tmp = vec;
-        utility::LittleFloat(tmp.vec[0]);
-        utility::LittleFloat(tmp.vec[1]);
-        writeUBytes((atUint8*)&tmp, 8);
+        simd_floats tmp(vec.simd);
+        tmp[0] = utility::LittleFloat(tmp[0]);
+        tmp[1] = utility::LittleFloat(tmp[1]);
+        writeUBytes((atUint8*)tmp.data(), 8);
     }
-    inline void writeValLittle(const atVec2f& val) {writeVec2fLittle(val);}
+    void writeValLittle(const atVec2f& val) {writeVec2fLittle(val);}
 
     /** @brief Writes an atVec2f (8 bytes) to the buffer and advances the buffer.
      *         It also swaps the bytes against big depending on the platform.
      *
      * @param vec The value to write to the buffer
      */
-    inline void writeVec2fBig(const atVec2f& vec)
+    void writeVec2fBig(const atVec2f& vec)
     {
-        atVec2f tmp = vec;
-        utility::BigFloat(tmp.vec[0]);
-        utility::BigFloat(tmp.vec[1]);
-        writeUBytes((atUint8*)&tmp, 8);
+        simd_floats tmp(vec.simd);
+        tmp[0] = utility::BigFloat(tmp[0]);
+        tmp[1] = utility::BigFloat(tmp[1]);
+        writeUBytes((atUint8*)tmp.data(), 8);
     }
-    inline void writeValBig(const atVec2f& val) {writeVec2fBig(val);}
+    void writeValBig(const atVec2f& val) {writeVec2fBig(val);}
 
     /** @brief Writes an atVec3f (12 bytes) to the buffer and advances the buffer.
      *         It also swaps the bytes depending on the platform and Stream settings.
      *
      *  @param vec The value to write to the buffer
      */
-    inline void writeVec3f(const atVec3f& vec)
+    void writeVec3f(const atVec3f& vec)
     {
-        atVec3f tmp = vec;
+        simd_floats tmp(vec.simd);
         if (m_endian == Big)
         {
-            utility::BigFloat(tmp.vec[0]);
-            utility::BigFloat(tmp.vec[1]);
-            utility::BigFloat(tmp.vec[2]);
+            tmp[0] = utility::BigFloat(tmp[0]);
+            tmp[1] = utility::BigFloat(tmp[1]);
+            tmp[2] = utility::BigFloat(tmp[2]);
         }
         else
         {
-            utility::LittleFloat(tmp.vec[0]);
-            utility::LittleFloat(tmp.vec[1]);
-            utility::LittleFloat(tmp.vec[2]);
+            tmp[0] = utility::LittleFloat(tmp[0]);
+            tmp[1] = utility::LittleFloat(tmp[1]);
+            tmp[2] = utility::LittleFloat(tmp[2]);
         }
-        writeUBytes((atUint8*)&tmp, 12);
+        writeUBytes((atUint8*)tmp.data(), 12);
     }
-    inline void writeVal(const atVec3f& val) {writeVec3f(val);}
+    void writeVal(const atVec3f& val) {writeVec3f(val);}
 
     /** @brief Writes an atVec3f (12 bytes) to the buffer and advances the buffer.
      *         It also swaps the bytes against little depending on the platform.
      *
      *  @param vec The value to write to the buffer
      */
-    inline void writeVec3fLittle(const atVec3f& vec)
+    void writeVec3fLittle(const atVec3f& vec)
     {
-        atVec3f tmp = vec;
-        utility::LittleFloat(tmp.vec[0]);
-        utility::LittleFloat(tmp.vec[1]);
-        utility::LittleFloat(tmp.vec[2]);
-        writeUBytes((atUint8*)&tmp, 12);
+        simd_floats tmp(vec.simd);
+        tmp[0] = utility::LittleFloat(tmp[0]);
+        tmp[1] = utility::LittleFloat(tmp[1]);
+        tmp[2] = utility::LittleFloat(tmp[2]);
+        writeUBytes((atUint8*)tmp.data(), 12);
     }
-    inline void writeValLittle(const atVec3f& val) {writeVec3fLittle(val);}
+    void writeValLittle(const atVec3f& val) {writeVec3fLittle(val);}
 
     /** @brief Writes an atVec3f (12 bytes) to the buffer and advances the buffer.
      *         It also swaps the bytes against big depending on the platform.
      *
      *  @param vec The value to write to the buffer
      */
-    inline void writeVec3fBig(const atVec3f& vec)
+    void writeVec3fBig(const atVec3f& vec)
     {
-        atVec3f tmp = vec;
-        utility::BigFloat(tmp.vec[0]);
-        utility::BigFloat(tmp.vec[1]);
-        utility::BigFloat(tmp.vec[2]);
-        writeUBytes((atUint8*)&tmp, 12);
+        simd_floats tmp(vec.simd);
+        tmp[0] = utility::BigFloat(tmp[0]);
+        tmp[1] = utility::BigFloat(tmp[1]);
+        tmp[2] = utility::BigFloat(tmp[2]);
+        writeUBytes((atUint8*)tmp.data(), 12);
     }
-    inline void writeValBig(const atVec3f& val) {writeVec3fBig(val);}
+    void writeValBig(const atVec3f& val) {writeVec3fBig(val);}
 
     /** @brief Writes an atVec4f (16 bytes) to the buffer and advances the buffer.
      *         It also swaps the bytes depending on the platform and Stream settings.
      *
      *  @param vec The value to write to the buffer
      */
-    inline void writeVec4f(const atVec4f& vec)
+    void writeVec4f(const atVec4f& vec)
     {
-        atVec4f tmp = vec;
+        simd_floats tmp(vec.simd);
         if (m_endian == Big)
         {
-            utility::BigFloat(tmp.vec[0]);
-            utility::BigFloat(tmp.vec[1]);
-            utility::BigFloat(tmp.vec[2]);
-            utility::BigFloat(tmp.vec[3]);
+            tmp[0] = utility::BigFloat(tmp[0]);
+            tmp[1] = utility::BigFloat(tmp[1]);
+            tmp[2] = utility::BigFloat(tmp[2]);
+            tmp[3] = utility::BigFloat(tmp[3]);
         }
         else
         {
-            utility::LittleFloat(tmp.vec[0]);
-            utility::LittleFloat(tmp.vec[1]);
-            utility::LittleFloat(tmp.vec[2]);
-            utility::LittleFloat(tmp.vec[3]);
+            tmp[0] = utility::LittleFloat(tmp[0]);
+            tmp[1] = utility::LittleFloat(tmp[1]);
+            tmp[2] = utility::LittleFloat(tmp[2]);
+            tmp[3] = utility::LittleFloat(tmp[3]);
         }
-        writeUBytes((atUint8*)&tmp, 16);
+        writeUBytes((atUint8*)tmp.data(), 16);
     }
-    inline void writeVal(const atVec4f& val) {writeVec4f(val);}
+    void writeVal(const atVec4f& val) {writeVec4f(val);}
 
     /** @brief Writes an atVec4f (16 bytes) to the buffer and advances the buffer.
      *         It also swaps the bytes against little depending on the platform.
      *
      *  @param vec The value to write to the buffer
      */
-    inline void writeVec4fLittle(const atVec4f& vec)
+    void writeVec4fLittle(const atVec4f& vec)
     {
-        atVec4f tmp = vec;
-        utility::LittleFloat(tmp.vec[0]);
-        utility::LittleFloat(tmp.vec[1]);
-        utility::LittleFloat(tmp.vec[2]);
-        utility::LittleFloat(tmp.vec[3]);
-        writeUBytes((atUint8*)&tmp, 16);
+        simd_floats tmp(vec.simd);
+        tmp[0] = utility::LittleFloat(tmp[0]);
+        tmp[1] = utility::LittleFloat(tmp[1]);
+        tmp[2] = utility::LittleFloat(tmp[2]);
+        tmp[3] = utility::LittleFloat(tmp[3]);
+        writeUBytes((atUint8*)tmp.data(), 16);
     }
-    inline void writeValLittle(const atVec4f& val) {writeVec4fLittle(val);}
+    void writeValLittle(const atVec4f& val) {writeVec4fLittle(val);}
 
     /** @brief Writes an atVec4f (16 bytes) to the buffer and advances the buffer.
      *         It also swaps the bytes against big depending on the platform.
      *
      *  @param vec The value to write to the buffer
      */
-    inline void writeVec4fBig(const atVec4f& vec)
+    void writeVec4fBig(const atVec4f& vec)
     {
-        atVec4f tmp = vec;
-        utility::BigFloat(tmp.vec[0]);
-        utility::BigFloat(tmp.vec[1]);
-        utility::BigFloat(tmp.vec[2]);
-        utility::BigFloat(tmp.vec[3]);
-        writeUBytes((atUint8*)&tmp, 16);
+        simd_floats tmp(vec.simd);
+        tmp[0] = utility::BigFloat(tmp[0]);
+        tmp[1] = utility::BigFloat(tmp[1]);
+        tmp[2] = utility::BigFloat(tmp[2]);
+        tmp[3] = utility::BigFloat(tmp[3]);
+        writeUBytes((atUint8*)tmp.data(), 16);
     }
-    inline void writeValBig(const atVec4f& val) {writeVec4fBig(val);}
+    void writeValBig(const atVec4f& val) {writeVec4fBig(val);}
 
     /** @brief Writes an atVec2d (16 bytes) to the buffer and advances the buffer.
      *         It also swaps the bytes depending on the platform and Stream settings.
      *
      * @param vec The value to write to the buffer
      */
-    inline void writeVec2d(const atVec2d& vec)
+    void writeVec2d(const atVec2d& vec)
     {
-        atVec2d tmp = vec;
+        simd_doubles tmp(vec.simd);
         if (m_endian == Big)
         {
-            utility::BigDouble(tmp.vec[0]);
-            utility::BigDouble(tmp.vec[1]);
+            tmp[0] = utility::BigDouble(tmp[0]);
+            tmp[1] = utility::BigDouble(tmp[1]);
         }
         else
         {
-            utility::LittleDouble(tmp.vec[0]);
-            utility::LittleDouble(tmp.vec[1]);
+            tmp[0] = utility::LittleDouble(tmp[0]);
+            tmp[1] = utility::LittleDouble(tmp[1]);
         }
-        writeUBytes((atUint8*)&tmp, 16);
+        writeUBytes((atUint8*)tmp.data(), 16);
     }
-    inline void writeVal(const atVec2d& val) {writeVec2d(val);}
+    void writeVal(const atVec2d& val) {writeVec2d(val);}
 
     /** @brief Writes an atVec2d (16 bytes) to the buffer and advances the buffer.
      *         It also swaps the bytes against little depending on the platform.
      *
      * @param vec The value to write to the buffer
      */
-    inline void writeVec2dLittle(const atVec2d& vec)
+    void writeVec2dLittle(const atVec2d& vec)
     {
-        atVec2d tmp = vec;
-        utility::LittleDouble(tmp.vec[0]);
-        utility::LittleDouble(tmp.vec[1]);
-        writeUBytes((atUint8*)&tmp, 16);
+        simd_doubles tmp(vec.simd);
+        tmp[0] = utility::LittleDouble(tmp[0]);
+        tmp[1] = utility::LittleDouble(tmp[1]);
+        writeUBytes((atUint8*)tmp.data(), 16);
     }
-    inline void writeValLittle(const atVec2d& val) {writeVec2dLittle(val);}
+    void writeValLittle(const atVec2d& val) {writeVec2dLittle(val);}
 
     /** @brief Writes an atVec2d (16 bytes) to the buffer and advances the buffer.
      *         It also swaps the bytes against big depending on the platform.
      *
      * @param vec The value to write to the buffer
      */
-    inline void writeVec2dBig(const atVec2d& vec)
+    void writeVec2dBig(const atVec2d& vec)
     {
-        atVec2d tmp = vec;
-        utility::BigDouble(tmp.vec[0]);
-        utility::BigDouble(tmp.vec[1]);
-        writeUBytes((atUint8*)&tmp, 16);
+        simd_doubles tmp(vec.simd);
+        tmp[0] = utility::BigDouble(tmp[0]);
+        tmp[1] = utility::BigDouble(tmp[1]);
+        writeUBytes((atUint8*)tmp.data(), 16);
     }
-    inline void writeValBig(const atVec2d& val) {writeVec2dBig(val);}
+    void writeValBig(const atVec2d& val) {writeVec2dBig(val);}
 
     /** @brief Writes an atVec3d (24 bytes) to the buffer and advances the buffer.
      *         It also swaps the bytes depending on the platform and Stream settings.
      *
      *  @param vec The value to write to the buffer
      */
-    inline void writeVec3d(const atVec3d& vec)
+    void writeVec3d(const atVec3d& vec)
     {
-        atVec3d tmp = vec;
+        simd_doubles tmp(vec.simd);
         if (m_endian == Big)
         {
-            utility::BigDouble(tmp.vec[0]);
-            utility::BigDouble(tmp.vec[1]);
-            utility::BigDouble(tmp.vec[2]);
+            tmp[0] = utility::BigDouble(tmp[0]);
+            tmp[1] = utility::BigDouble(tmp[1]);
+            tmp[2] = utility::BigDouble(tmp[2]);
         }
         else
         {
-            utility::LittleDouble(tmp.vec[0]);
-            utility::LittleDouble(tmp.vec[1]);
-            utility::LittleDouble(tmp.vec[2]);
+            tmp[0] = utility::LittleDouble(tmp[0]);
+            tmp[1] = utility::LittleDouble(tmp[1]);
+            tmp[2] = utility::LittleDouble(tmp[2]);
         }
-        writeUBytes((atUint8*)&tmp, 24);
+        writeUBytes((atUint8*)tmp.data(), 24);
     }
-    inline void writeVal(const atVec3d& val) {writeVec3d(val);}
+    void writeVal(const atVec3d& val) {writeVec3d(val);}
 
     /** @brief Writes an atVec3d (24 bytes) to the buffer and advances the buffer.
      *         It also swaps the bytes against little depending on the platform.
      *
      *  @param vec The value to write to the buffer
      */
-    inline void writeVec3dLittle(const atVec3d& vec)
+    void writeVec3dLittle(const atVec3d& vec)
     {
-        atVec3d tmp = vec;
-        utility::LittleDouble(tmp.vec[0]);
-        utility::LittleDouble(tmp.vec[1]);
-        utility::LittleDouble(tmp.vec[2]);
-        writeUBytes((atUint8*)&tmp, 24);
+        simd_doubles tmp(vec.simd);
+        tmp[0] = utility::LittleDouble(tmp[0]);
+        tmp[1] = utility::LittleDouble(tmp[1]);
+        tmp[2] = utility::LittleDouble(tmp[2]);
+        writeUBytes((atUint8*)tmp.data(), 24);
     }
-    inline void writeValLittle(const atVec3d& val) {writeVec3dLittle(val);}
+    void writeValLittle(const atVec3d& val) {writeVec3dLittle(val);}
 
     /** @brief Writes an atVec3d (24 bytes) to the buffer and advances the buffer.
      *         It also swaps the bytes against big depending on the platform.
      *
      *  @param vec The value to write to the buffer
      */
-    inline void writeVec3dBig(const atVec3d& vec)
+    void writeVec3dBig(const atVec3d& vec)
     {
-        atVec3d tmp = vec;
-        utility::BigDouble(tmp.vec[0]);
-        utility::BigDouble(tmp.vec[1]);
-        utility::BigDouble(tmp.vec[2]);
-        writeUBytes((atUint8*)&tmp, 24);
+        simd_doubles tmp(vec.simd);
+        tmp[0] = utility::BigDouble(tmp[0]);
+        tmp[1] = utility::BigDouble(tmp[1]);
+        tmp[2] = utility::BigDouble(tmp[2]);
+        writeUBytes((atUint8*)tmp.data(), 24);
     }
-    inline void writeValBig(const atVec3d& val) {writeVec3dBig(val);}
+    void writeValBig(const atVec3d& val) {writeVec3dBig(val);}
 
     /** @brief Writes an atVec4d (32 bytes) to the buffer and advances the buffer.
      *         It also swaps the bytes depending on the platform and Stream settings.
      *
      *  @param vec The value to write to the buffer
      */
-    inline void writeVec4d(const atVec4d& vec)
+    void writeVec4d(const atVec4d& vec)
     {
-        atVec4d tmp = vec;
+        simd_doubles tmp(vec.simd);
         if (m_endian == Big)
         {
-            utility::BigDouble(tmp.vec[0]);
-            utility::BigDouble(tmp.vec[1]);
-            utility::BigDouble(tmp.vec[2]);
-            utility::BigDouble(tmp.vec[3]);
+            tmp[0] = utility::BigDouble(tmp[0]);
+            tmp[1] = utility::BigDouble(tmp[1]);
+            tmp[2] = utility::BigDouble(tmp[2]);
+            tmp[3] = utility::BigDouble(tmp[3]);
         }
         else
         {
-            utility::LittleDouble(tmp.vec[0]);
-            utility::LittleDouble(tmp.vec[1]);
-            utility::LittleDouble(tmp.vec[2]);
-            utility::LittleDouble(tmp.vec[3]);
+            tmp[0] = utility::LittleDouble(tmp[0]);
+            tmp[1] = utility::LittleDouble(tmp[1]);
+            tmp[2] = utility::LittleDouble(tmp[2]);
+            tmp[3] = utility::LittleDouble(tmp[3]);
         }
-        writeUBytes((atUint8*)&tmp, 32);
+        writeUBytes((atUint8*)tmp.data(), 32);
     }
-    inline void writeVal(const atVec4d& val) {writeVec4d(val);}
+    void writeVal(const atVec4d& val) {writeVec4d(val);}
 
     /** @brief Writes an atVec4d (32 bytes) to the buffer and advances the buffer.
      *         It also swaps the bytes against little depending on the platform.
      *
      *  @param vec The value to write to the buffer
      */
-    inline void writeVec4dLittle(const atVec4d& vec)
+    void writeVec4dLittle(const atVec4d& vec)
     {
-        atVec4d tmp = vec;
-        utility::LittleDouble(tmp.vec[0]);
-        utility::LittleDouble(tmp.vec[1]);
-        utility::LittleDouble(tmp.vec[2]);
-        utility::LittleDouble(tmp.vec[3]);
-        writeUBytes((atUint8*)&tmp, 32);
+        simd_doubles tmp(vec.simd);
+        tmp[0] = utility::LittleDouble(tmp[0]);
+        tmp[1] = utility::LittleDouble(tmp[1]);
+        tmp[2] = utility::LittleDouble(tmp[2]);
+        tmp[3] = utility::LittleDouble(tmp[3]);
+        writeUBytes((atUint8*)tmp.data(), 32);
     }
-    inline void writeValLittle(const atVec4d& val) {writeVec4dLittle(val);}
+    void writeValLittle(const atVec4d& val) {writeVec4dLittle(val);}
 
     /** @brief Writes an atVec4d (32 bytes) to the buffer and advances the buffer.
      *         It also swaps the bytes against big depending on the platform.
      *
      *  @param vec The value to write to the buffer
      */
-    inline void writeVec4dBig(const atVec4d& vec)
+    void writeVec4dBig(const atVec4d& vec)
     {
-        atVec4d tmp = vec;
-        utility::BigDouble(tmp.vec[0]);
-        utility::BigDouble(tmp.vec[1]);
-        utility::BigDouble(tmp.vec[2]);
-        utility::BigDouble(tmp.vec[3]);
-        writeUBytes((atUint8*)&tmp, 32);
+        simd_doubles tmp(vec.simd);
+        tmp[0] = utility::BigDouble(tmp[0]);
+        tmp[1] = utility::BigDouble(tmp[1]);
+        tmp[2] = utility::BigDouble(tmp[2]);
+        tmp[3] = utility::BigDouble(tmp[3]);
+        writeUBytes((atUint8*)tmp.data(), 32);
     }
-    inline void writeValBig(const atVec4d& val) {writeVec4dBig(val);}
+    void writeValBig(const atVec4d& val) {writeVec4dBig(val);}
 
     /** @brief Converts a UTF8 string to a wide-char string in the buffer and advances the buffer.
      *         It also swaps the bytes depending on the platform and Stream settings.
@@ -693,7 +694,7 @@ public:
      *
      *  Endianness is set with setEndian
      */
-    inline void writeStringAsWString(std::string_view str, atInt32 fixedLen = -1)
+    void writeStringAsWString(std::string_view str, atInt32 fixedLen = -1)
     {
         if (fixedLen == 0)
             return;
@@ -751,7 +752,7 @@ public:
      *
      *  Endianness is little
      */
-    inline void writeStringAsWStringLittle(std::string_view str, atInt32 fixedLen = -1)
+    void writeStringAsWStringLittle(std::string_view str, atInt32 fixedLen = -1)
     {
         if (fixedLen == 0)
             return;
@@ -809,7 +810,7 @@ public:
      *
      *  Endianness is big
      */
-    inline void writeStringAsWStringBig(std::string_view str, atInt32 fixedLen = -1)
+    void writeStringAsWStringBig(std::string_view str, atInt32 fixedLen = -1)
     {
         if (fixedLen == 0)
             return;
@@ -865,7 +866,7 @@ public:
      *  @param str The string to write to the buffer
      *  @param fixedLen If not -1, the number of characters to zero-fill string to
      */
-    inline void writeString(std::string_view str, atInt32 fixedLen = -1)
+    void writeString(std::string_view str, atInt32 fixedLen = -1)
     {
         if (fixedLen == 0)
             return;
@@ -895,7 +896,7 @@ public:
             }
         }
     }
-    inline void writeVal(std::string_view val) {writeString(val);}
+    void writeVal(std::string_view val) {writeString(val);}
 
     /** @brief Writes an wstring to the buffer and advances the buffer.
      *
@@ -904,7 +905,7 @@ public:
      *
      *  Endianness is set with setEndian
      */
-    inline void writeWString(std::wstring_view str, atInt32 fixedLen = -1)
+    void writeWString(std::wstring_view str, atInt32 fixedLen = -1)
     {
         if (fixedLen == 0)
             return;
@@ -934,7 +935,7 @@ public:
             }
         }
     }
-    inline void writeVal(std::wstring_view val) {writeWString(val);}
+    void writeVal(std::wstring_view val) {writeWString(val);}
 
     /** @brief Writes an wstring to the buffer and advances the buffer.
      *
@@ -943,7 +944,7 @@ public:
      *
      *  Endianness is little
      */
-    inline void writeWStringLittle(std::wstring_view str, atInt32 fixedLen = -1)
+    void writeWStringLittle(std::wstring_view str, atInt32 fixedLen = -1)
     {
         if (fixedLen == 0)
             return;
@@ -973,7 +974,7 @@ public:
             }
         }
     }
-    inline void writeValLittle(std::wstring_view val) {writeWStringLittle(val);}
+    void writeValLittle(std::wstring_view val) {writeWStringLittle(val);}
 
     /** @brief Writes an wstring to the buffer and advances the buffer.
      *
@@ -982,7 +983,7 @@ public:
      *
      *  Endianness is big
      */
-    inline void writeWStringBig(std::wstring_view str, atInt32 fixedLen = -1)
+    void writeWStringBig(std::wstring_view str, atInt32 fixedLen = -1)
     {
         if (fixedLen == 0)
             return;
@@ -1012,7 +1013,7 @@ public:
             }
         }
     }
-    inline void writeValBig(std::wstring_view val) {writeWStringBig(val);}
+    void writeValBig(std::wstring_view val) {writeWStringBig(val);}
 
     /** @brief Writes a u16string to the buffer and advances the buffer.
      *
@@ -1021,7 +1022,7 @@ public:
      *
      *  Endianness is big
      */
-    inline void writeU16StringBig(std::u16string_view str, atInt32 fixedLen = -1)
+    void writeU16StringBig(std::u16string_view str, atInt32 fixedLen = -1)
     {
         if (fixedLen == 0)
             return;
@@ -1051,7 +1052,7 @@ public:
             }
         }
     }
-    inline void writeValBig(std::u16string_view val) {writeU16StringBig(val);}
+    void writeValBig(std::u16string_view val) {writeU16StringBig(val);}
 
-    /** @brief Writes a u16string to the buffer and advances the buffer.
+    /** @brief Writes a u32string to the buffer and advances the buffer.
      *
@@ -1060,7 +1061,7 @@ public:
      *
      *  Endianness is big
      */
-    inline void writeU32StringBig(std::u32string_view str, atInt32 fixedLen = -1)
+    void writeU32StringBig(std::u32string_view str, atInt32 fixedLen = -1)
     {
         if (fixedLen == 0)
             return;
@@ -1090,9 +1091,9 @@ public:
             }
         }
     }
-    inline void writeValBig(std::u32string_view val) {writeU32StringBig(val);}
+    void writeValBig(std::u32string_view val) {writeU32StringBig(val);}
 
-    inline void fill(atUint8 val, atUint64 length)
+    void fill(atUint8 val, atUint64 length)
     {
         if (length == 0)
             return;
@@ -1102,7 +1103,7 @@ public:
         writeUBytes(tmp.get(), length);
     }
 
-    inline void fill(atInt8 val, atUint64 length)
+    void fill(atInt8 val, atUint64 length)
     {fill((atUint8)val, length);}
 
     /** @brief Performs automatic std::vector enumeration writes using numeric type T
diff --git a/include/athena/Types.hpp b/include/athena/Types.hpp
index 2d046b6..e54a4f6 100644
--- a/include/athena/Types.hpp
+++ b/include/athena/Types.hpp
@@ -12,106 +12,13 @@ using atInt64 = int64_t;
 using atUint64 = uint64_t;
 
 // Vector types
-#if __SSE__
-#include <immintrin.h>
-#ifndef _WIN32
-#include <mm_malloc.h>
-#endif
-#endif
-
-#include <new>
-#define AT_ALIGNED_ALLOCATOR \
-void* operator new(size_t bytes) noexcept \
-{return _mm_malloc(bytes, 16);} \
-void* operator new[](size_t bytes) noexcept \
-{return _mm_malloc(bytes, 16);} \
-void operator delete(void* buf) noexcept \
-{_mm_free(buf);} \
-void operator delete[](void* buf) noexcept \
-{_mm_free(buf);}
-
-#define AT_ALIGNED_ALLOCATOR32 \
-void* operator new(size_t bytes) noexcept \
-{return _mm_malloc(bytes, 32);} \
-void* operator new[](size_t bytes) noexcept \
-{return _mm_malloc(bytes, 32);} \
-void operator delete(void* buf) noexcept \
-{_mm_free(buf);} \
-void operator delete[](void* buf) noexcept \
-{_mm_free(buf);}
-
-typedef union alignas(16)
-{
-#if __clang__
-    float clangVec __attribute__((__vector_size__(8)));
-#endif
-#if __SSE__
-    __m128 mVec128;
-    AT_ALIGNED_ALLOCATOR
-#endif
-    float vec[2];
-} atVec2f;
-
-typedef union alignas(16)
-{
-#if __clang__
-    float clangVec __attribute__((__vector_size__(12)));
-#endif
-#if __SSE__
-    __m128 mVec128;
-    AT_ALIGNED_ALLOCATOR
-#endif
-    float vec[3];
-} atVec3f;
-
-typedef union alignas(16)
-{
-#if __clang__
-    float clangVec __attribute__((__vector_size__(16)));
-#endif
-#if __SSE__
-    __m128 mVec128;
-    AT_ALIGNED_ALLOCATOR
-#endif
-    float vec[4];
-} atVec4f;
-
-typedef union alignas(16)
-{
-#if __SSE__
-    __m128d mVec128;
-    AT_ALIGNED_ALLOCATOR
-#endif
-    double vec[2];
-} atVec2d;
-
-typedef union alignas(32)
-{
-#if __AVX__
-    __m256d mVec256;
-    AT_ALIGNED_ALLOCATOR32
-#elif __SSE__
-    AT_ALIGNED_ALLOCATOR
-#endif
-#if __SSE__
-    __m128d mVec128[2];
-#endif
-    double vec[3];
-} atVec3d;
-
-typedef union alignas(32)
-{
-#if __AVX__
-    __m256d mVec256;
-    AT_ALIGNED_ALLOCATOR32
-#elif __SSE__
-    AT_ALIGNED_ALLOCATOR
-#endif
-#if __SSE__
-    __m128d mVec128[2];
-#endif
-    double vec[4];
-} atVec4d;
+#include "simd/simd.hpp"
+struct atVec2f { athena::simd<float> simd; };  // float vector wrapper; stream writers emit 2 lanes
+struct atVec3f { athena::simd<float> simd; };  // float vector wrapper; stream writers emit 3 lanes
+struct atVec4f { athena::simd<float> simd; };  // float vector wrapper; stream writers emit 4 lanes
+struct atVec2d { athena::simd<double> simd; }; // double vector wrapper; stream writers emit 2 lanes
+struct atVec3d { athena::simd<double> simd; }; // double vector wrapper; stream writers emit 3 lanes
+struct atVec4d { athena::simd<double> simd; }; // double vector wrapper; stream writers emit 4 lanes
 
 #ifndef UNUSED
 #define UNUSED(x) ((void)x)
diff --git a/include/athena/Utility.hpp b/include/athena/Utility.hpp
index 50b64a5..c3b5d06 100644
--- a/include/athena/Utility.hpp
+++ b/include/athena/Utility.hpp
@@ -163,28 +163,28 @@ inline atUint64 BigUint64(atUint64& val)
     return val;
 }
 
-inline float LittleFloat(float& val)
+inline float LittleFloat(float val)
 {
     if (athena::utility::isSystemBigEndian())
         val = athena::utility::swapFloat(val);
 
     return val;
 }
-inline float BigFloat(float& val)
+inline float BigFloat(float val)
 {
     if (!athena::utility::isSystemBigEndian())
         val = athena::utility::swapFloat(val);
 
     return val;
 }
-inline double LittleDouble(double& val)
+inline double LittleDouble(double val)
 {
     if (athena::utility::isSystemBigEndian())
         val = athena::utility::swapDouble(val);
 
     return val;
 }
-inline double BigDouble(double& val)
+inline double BigDouble(double val)
 {
     if (!athena::utility::isSystemBigEndian())
         val = athena::utility::swapDouble(val);
diff --git a/include/athena/simd/parallelism_v2_simd.hpp b/include/athena/simd/parallelism_v2_simd.hpp
new file mode 100644
index 0000000..14606ee
--- /dev/null
+++ b/include/athena/simd/parallelism_v2_simd.hpp
@@ -0,0 +1,1768 @@
+// -*- C++ -*-
+//===------------------------------- simd ---------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+/*
+    experimental/simd synopsis
+
+namespace std::experimental {
+
+inline namespace parallelism_v2 {
+
+namespace simd_abi {
+
+struct scalar {};
+template <int N> struct fixed_size {};
+template <typename T> inline constexpr int max_fixed_size = implementation-defined;
+template <typename T> using compatible = implementation-defined;
+template <typename T> using native = implementation-defined;
+
+} // simd_abi
+
+struct element_aligned_tag {};
+struct vector_aligned_tag {};
+template <size_t> struct overaligned_tag {};
+inline constexpr element_aligned_tag element_aligned{};
+inline constexpr vector_aligned_tag vector_aligned{};
+template <size_t N> inline constexpr overaligned_tag<N> overaligned{};
+
+// traits [simd.traits]
+template <class T> struct is_abi_tag;
+template <class T> inline constexpr bool is_abi_tag_v = is_abi_tag<T>::value;
+
+template <class T> struct is_simd;
+template <class T> inline constexpr bool is_simd_v = is_simd<T>::value;
+
+template <class T> struct is_simd_mask;
+template <class T> inline constexpr bool is_simd_mask_v = is_simd_mask<T>::value;
+
+template <class T> struct is_simd_flag_type;
+template <class T> inline constexpr bool is_simd_flag_type_v = is_simd_flag_type<T>::value;
+
+template <class T, size_t N> struct abi_for_size { using type = see below; };
+template <class T, size_t N> using abi_for_size_t = typename abi_for_size<T, N>::type;
+
+template <class T, class Abi = simd_abi::compatible<T>> struct simd_size;
+template <class T, class Abi = simd_abi::compatible<T>>
+inline constexpr size_t simd_size_v = simd_size<T, Abi>::value;
+
+template <class T, class U = typename T::value_type> struct memory_alignment;
+template <class T, class U = typename T::value_type>
+inline constexpr size_t memory_alignment_v = memory_alignment<T, U>::value;
+
+// class template simd [simd.class]
+template <class T, class Abi = simd_abi::compatible<T>> class simd;
+template <class T> using native_simd = simd<T, simd_abi::native<T>>;
+template <class T, int N> using fixed_size_simd = simd<T, simd_abi::fixed_size<N>>;
+
+// class template simd_mask [simd.mask.class]
+template <class T, class Abi = simd_abi::compatible<T>> class simd_mask;
+template <class T> using native_simd_mask = simd_mask<T, simd_abi::native<T>>;
+template <class T, int N> using fixed_size_simd_mask = simd_mask<T, simd_abi::fixed_size<N>>;
+
+// casts [simd.casts]
+template <class T, class U, class Abi> see below simd_cast(const simd<U, Abi>&);
+template <class T, class U, class Abi> see below static_simd_cast(const simd<U, Abi>&);
+
+template <class T, class Abi>
+fixed_size_simd<T, simd_size_v<T, Abi>> to_fixed_size(const simd<T, Abi>&) noexcept;
+template <class T, class Abi>
+fixed_size_simd_mask<T, simd_size_v<T, Abi>> to_fixed_size(const simd_mask<T, Abi>&) noexcept;
+template <class T, size_t N> native_simd<T> to_native(const fixed_size_simd<T, N>&) noexcept;
+template <class T, size_t N>
+native_simd_mask<T> to_native(const fixed_size_simd_mask<T, N>&) noexcept;
+template <class T, size_t N> simd<T> to_compatible(const fixed_size_simd<T, N>&) noexcept;
+template <class T, size_t N> simd_mask<T> to_compatible(const fixed_size_simd_mask<T, N>&) noexcept;
+
+template <size_t... Sizes, class T, class Abi>
+tuple<simd<T, abi_for_size_t<T, Sizes>>...> split(const simd<T, Abi>&);
+template <size_t... Sizes, class T, class Abi>
+tuple<simd_mask<T, abi_for_size_t<T, Sizes>>...> split(const simd_mask<T, Abi>&);
+template <class V, class Abi>
+array<V, simd_size_v<typename V::value_type, Abi> / V::size()> split(
+const simd<typename V::value_type, Abi>&);
+template <class V, class Abi>
+array<V, simd_size_v<typename V::value_type, Abi> / V::size()> split(
+const simd_mask<typename V::value_type, Abi>&);
+
+template <class T, class... Abis>
+simd<T, abi_for_size_t<T, (simd_size_v<T, Abis> + ...)>> concat(const simd<T, Abis>&...);
+template <class T, class... Abis>
+simd_mask<T, abi_for_size_t<T, (simd_size_v<T, Abis> + ...)>> concat(const simd_mask<T, Abis>&...);
+
+// reductions [simd.mask.reductions]
+template <class T, class Abi> bool all_of(const simd_mask<T, Abi>&) noexcept;
+template <class T, class Abi> bool any_of(const simd_mask<T, Abi>&) noexcept;
+template <class T, class Abi> bool none_of(const simd_mask<T, Abi>&) noexcept;
+template <class T, class Abi> bool some_of(const simd_mask<T, Abi>&) noexcept;
+template <class T, class Abi> int popcount(const simd_mask<T, Abi>&) noexcept;
+template <class T, class Abi> int find_first_set(const simd_mask<T, Abi>&);
+template <class T, class Abi> int find_last_set(const simd_mask<T, Abi>&);
+
+bool all_of(see below) noexcept;
+bool any_of(see below) noexcept;
+bool none_of(see below) noexcept;
+bool some_of(see below) noexcept;
+int popcount(see below) noexcept;
+int find_first_set(see below) noexcept;
+int find_last_set(see below) noexcept;
+
+// masked assignment [simd.whereexpr]
+template <class M, class T> class const_where_expression;
+template <class M, class T> class where_expression;
+
+// masked assignment [simd.mask.where]
+template <class T> struct nodeduce { using type = T; }; // exposition only
+
+template <class T> using nodeduce_t = typename nodeduce<T>::type; // exposition only
+
+template <class T, class Abi>
+where_expression<simd_mask<T, Abi>, simd<T, Abi>>
+where(const typename simd<T, Abi>::mask_type&, simd<T, Abi>&) noexcept;
+
+template <class T, class Abi>
+const_where_expression<simd_mask<T, Abi>, const simd<T, Abi>>
+where(const typename simd<T, Abi>::mask_type&, const simd<T, Abi>&) noexcept;
+
+template <class T, class Abi>
+where_expression<simd_mask<T, Abi>, simd_mask<T, Abi>>
+where(const nodeduce_t<simd_mask<T, Abi>>&, simd_mask<T, Abi>&) noexcept;
+
+template <class T, class Abi>
+const_where_expression<simd_mask<T, Abi>, const simd_mask<T, Abi>>
+where(const nodeduce_t<simd_mask<T, Abi>>&, const simd_mask<T, Abi>&) noexcept;
+
+template <class T> where_expression<bool, T> where(see below k, T& d) noexcept;
+
+template <class T>
+const_where_expression<bool, const T> where(see below k, const T& d) noexcept;
+
+// reductions [simd.reductions]
+template <class T, class Abi, class BinaryOperation = std::plus<>>
+T reduce(const simd<T, Abi>&, BinaryOperation = BinaryOperation());
+
+template <class M, class V, class BinaryOperation>
+typename V::value_type reduce(const const_where_expression<M, V>& x,
+typename V::value_type neutral_element, BinaryOperation binary_op);
+
+template <class M, class V>
+typename V::value_type reduce(const const_where_expression<M, V>& x, plus<> binary_op = plus<>());
+
+template <class M, class V>
+typename V::value_type reduce(const const_where_expression<M, V>& x, multiplies<> binary_op);
+
+template <class M, class V>
+typename V::value_type reduce(const const_where_expression<M, V>& x, bit_and<> binary_op);
+
+template <class M, class V>
+typename V::value_type reduce(const const_where_expression<M, V>& x, bit_or<> binary_op);
+
+template <class M, class V>
+typename V::value_type reduce(const const_where_expression<M, V>& x, bit_xor<> binary_op);
+
+template <class T, class Abi> T hmin(const simd<T, Abi>&);
+template <class M, class V> typename V::value_type hmin(const const_where_expression<M, V>&);
+template <class T, class Abi> T hmax(const simd<T, Abi>&);
+template <class M, class V> typename V::value_type hmax(const const_where_expression<M, V>&);
+
+// algorithms [simd.alg]
+template <class T, class Abi> simd<T, Abi> min(const simd<T, Abi>&, const simd<T, Abi>&) noexcept;
+
+template <class T, class Abi> simd<T, Abi> max(const simd<T, Abi>&, const simd<T, Abi>&) noexcept;
+
+template <class T, class Abi>
+std::pair<simd<T, Abi>, simd<T, Abi>> minmax(const simd<T, Abi>&, const simd<T, Abi>&) noexcept;
+
+template <class T, class Abi>
+simd<T, Abi> clamp(const simd<T, Abi>& v, const simd<T, Abi>& lo, const simd<T, Abi>& hi);
+
+// [simd.whereexpr]
+template <class M, class T>
+class const_where_expression {
+  const M& mask; // exposition only
+  T& data; // exposition only
+public:
+  const_where_expression(const const_where_expression&) = delete;
+  const_where_expression& operator=(const const_where_expression&) = delete;
+  remove_const_t<T> operator-() const &&;
+  template <class U, class Flags> void copy_to(U* mem, Flags f) const &&;
+};
+
+template <class M, class T>
+class where_expression : public const_where_expression<M, T> {
+public:
+  where_expression(const where_expression&) = delete;
+  where_expression& operator=(const where_expression&) = delete;
+  template <class U> void operator=(U&& x);
+  template <class U> void operator+=(U&& x);
+  template <class U> void operator-=(U&& x);
+  template <class U> void operator*=(U&& x);
+  template <class U> void operator/=(U&& x);
+  template <class U> void operator%=(U&& x);
+  template <class U> void operator&=(U&& x);
+  template <class U> void operator|=(U&& x);
+  template <class U> void operator^=(U&& x);
+  template <class U> void operator<<=(U&& x);
+  template <class U> void operator>>=(U&& x);
+  void operator++();
+  void operator++(int);
+  void operator--();
+  void operator--(int);
+  template <class U, class Flags> void copy_from(const U* mem, Flags);
+};
+
+// [simd.class]
+template <class T, class Abi> class simd {
+public:
+  using value_type = T;
+  using reference = see below;
+  using mask_type = simd_mask<T, Abi>;
+
+  using abi_type = Abi;
+  static constexpr size_t size() noexcept;
+  simd() = default;
+
+  // implicit type conversion constructor
+  template <class U> simd(const simd<U, simd_abi::fixed_size<size()>>&);
+
+  // implicit broadcast constructor (see below for constraints)
+  template <class U> simd(U&& value);
+
+  // generator constructor (see below for constraints)
+  template <class G> explicit simd(G&& gen);
+
+  // load constructor
+  template <class U, class Flags> simd(const U* mem, Flags f);
+
+  // loads [simd.load]
+  template <class U, class Flags> void copy_from(const U* mem, Flags f);
+
+  // stores [simd.store]
+  template <class U, class Flags> void copy_to(U* mem, Flags f) const;
+
+  // scalar access [simd.subscr]
+  reference operator[](size_t);
+  value_type operator[](size_t) const;
+
+  // unary operators [simd.unary]
+  simd& operator++();
+  simd operator++(int);
+  simd& operator--();
+  simd operator--(int);
+  mask_type operator!() const;
+  simd operator~() const; // see below
+  simd operator+() const;
+  simd operator-() const;
+
+  // binary operators [simd.binary]
+  friend simd operator+ (const simd&, const simd&);
+  friend simd operator- (const simd&, const simd&);
+  friend simd operator* (const simd&, const simd&);
+  friend simd operator/ (const simd&, const simd&);
+  friend simd operator% (const simd&, const simd&);
+  friend simd operator& (const simd&, const simd&);
+  friend simd operator| (const simd&, const simd&);
+  friend simd operator^ (const simd&, const simd&);
+  friend simd operator<<(const simd&, const simd&);
+  friend simd operator>>(const simd&, const simd&);
+  friend simd operator<<(const simd&, int);
+  friend simd operator>>(const simd&, int);
+
+  // compound assignment [simd.cassign]
+  friend simd& operator+= (simd&, const simd&);
+  friend simd& operator-= (simd&, const simd&);
+  friend simd& operator*= (simd&, const simd&);
+  friend simd& operator/= (simd&, const simd&);
+  friend simd& operator%= (simd&, const simd&);
+
+  friend simd& operator&= (simd&, const simd&);
+  friend simd& operator|= (simd&, const simd&);
+  friend simd& operator^= (simd&, const simd&);
+  friend simd& operator<<=(simd&, const simd&);
+  friend simd& operator>>=(simd&, const simd&);
+  friend simd& operator<<=(simd&, int);
+  friend simd& operator>>=(simd&, int);
+
+  // compares [simd.comparison]
+  friend mask_type operator==(const simd&, const simd&);
+  friend mask_type operator!=(const simd&, const simd&);
+  friend mask_type operator>=(const simd&, const simd&);
+  friend mask_type operator<=(const simd&, const simd&);
+  friend mask_type operator> (const simd&, const simd&);
+  friend mask_type operator< (const simd&, const simd&);
+};
+
+// [simd.math]
+template <class Abi> using scharv = simd<signed char, Abi>; // exposition only
+template <class Abi> using shortv = simd<short, Abi>; // exposition only
+template <class Abi> using intv = simd<int, Abi>; // exposition only
+template <class Abi> using longv = simd<long int, Abi>; // exposition only
+template <class Abi> using llongv = simd<long long int, Abi>; // exposition only
+template <class Abi> using floatv = simd<float, Abi>; // exposition only
+template <class Abi> using doublev = simd<double, Abi>; // exposition only
+template <class Abi> using ldoublev = simd<long double, Abi>; // exposition only
+template <class T, class V> using samesize = fixed_size_simd<T, V::size()>; // exposition only
+
+template <class Abi> floatv<Abi> acos(floatv<Abi> x);
+template <class Abi> doublev<Abi> acos(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> acos(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> asin(floatv<Abi> x);
+template <class Abi> doublev<Abi> asin(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> asin(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> atan(floatv<Abi> x);
+template <class Abi> doublev<Abi> atan(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> atan(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> atan2(floatv<Abi> y, floatv<Abi> x);
+template <class Abi> doublev<Abi> atan2(doublev<Abi> y, doublev<Abi> x);
+template <class Abi> ldoublev<Abi> atan2(ldoublev<Abi> y, ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> cos(floatv<Abi> x);
+template <class Abi> doublev<Abi> cos(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> cos(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> sin(floatv<Abi> x);
+template <class Abi> doublev<Abi> sin(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> sin(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> tan(floatv<Abi> x);
+template <class Abi> doublev<Abi> tan(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> tan(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> acosh(floatv<Abi> x);
+template <class Abi> doublev<Abi> acosh(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> acosh(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> asinh(floatv<Abi> x);
+template <class Abi> doublev<Abi> asinh(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> asinh(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> atanh(floatv<Abi> x);
+template <class Abi> doublev<Abi> atanh(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> atanh(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> cosh(floatv<Abi> x);
+template <class Abi> doublev<Abi> cosh(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> cosh(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> sinh(floatv<Abi> x);
+template <class Abi> doublev<Abi> sinh(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> sinh(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> tanh(floatv<Abi> x);
+template <class Abi> doublev<Abi> tanh(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> tanh(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> exp(floatv<Abi> x);
+template <class Abi> doublev<Abi> exp(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> exp(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> exp2(floatv<Abi> x);
+template <class Abi> doublev<Abi> exp2(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> exp2(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> expm1(floatv<Abi> x);
+template <class Abi> doublev<Abi> expm1(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> expm1(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> frexp(floatv<Abi> value, samesize<int, floatv<Abi>>* exp);
+template <class Abi> doublev<Abi> frexp(doublev<Abi> value, samesize<int, doublev<Abi>>* exp);
+template <class Abi> ldoublev<Abi> frexp(ldoublev<Abi> value, samesize<int, ldoublev<Abi>>* exp);
+
+template <class Abi> samesize<int, floatv<Abi>> ilogb(floatv<Abi> x);
+template <class Abi> samesize<int, doublev<Abi>> ilogb(doublev<Abi> x);
+template <class Abi> samesize<int, ldoublev<Abi>> ilogb(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> ldexp(floatv<Abi> x, samesize<int, floatv<Abi>> exp);
+template <class Abi> doublev<Abi> ldexp(doublev<Abi> x, samesize<int, doublev<Abi>> exp);
+template <class Abi> ldoublev<Abi> ldexp(ldoublev<Abi> x, samesize<int, ldoublev<Abi>> exp);
+
+template <class Abi> floatv<Abi> log(floatv<Abi> x);
+template <class Abi> doublev<Abi> log(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> log(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> log10(floatv<Abi> x);
+template <class Abi> doublev<Abi> log10(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> log10(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> log1p(floatv<Abi> x);
+template <class Abi> doublev<Abi> log1p(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> log1p(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> log2(floatv<Abi> x);
+template <class Abi> doublev<Abi> log2(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> log2(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> logb(floatv<Abi> x);
+template <class Abi> doublev<Abi> logb(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> logb(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> modf(floatv<Abi> value, floatv<Abi>* iptr);
+template <class Abi> doublev<Abi> modf(doublev<Abi> value, doublev<Abi>* iptr);
+template <class Abi> ldoublev<Abi> modf(ldoublev<Abi> value, ldoublev<Abi>* iptr);
+
+template <class Abi> floatv<Abi> scalbn(floatv<Abi> x, samesize<int, floatv<Abi>> n);
+template <class Abi> doublev<Abi> scalbn(doublev<Abi> x, samesize<int, doublev<Abi>> n);
+template <class Abi> ldoublev<Abi> scalbn(ldoublev<Abi> x, samesize<int, ldoublev<Abi>> n);
+template <class Abi> floatv<Abi> scalbln(floatv<Abi> x, samesize<long int, floatv<Abi>> n);
+template <class Abi> doublev<Abi> scalbln(doublev<Abi> x, samesize<long int, doublev<Abi>> n);
+template <class Abi> ldoublev<Abi> scalbln(ldoublev<Abi> x, samesize<long int, ldoublev<Abi>> n);
+
+template <class Abi> floatv<Abi> cbrt(floatv<Abi> x);
+template <class Abi> doublev<Abi> cbrt(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> cbrt(ldoublev<Abi> x);
+
+template <class Abi> scharv<Abi> abs(scharv<Abi> j);
+template <class Abi> shortv<Abi> abs(shortv<Abi> j);
+template <class Abi> intv<Abi> abs(intv<Abi> j);
+template <class Abi> longv<Abi> abs(longv<Abi> j);
+template <class Abi> llongv<Abi> abs(llongv<Abi> j);
+template <class Abi> floatv<Abi> abs(floatv<Abi> j);
+template <class Abi> doublev<Abi> abs(doublev<Abi> j);
+template <class Abi> ldoublev<Abi> abs(ldoublev<Abi> j);
+
+template <class Abi> floatv<Abi> hypot(floatv<Abi> x, floatv<Abi> y);
+template <class Abi> doublev<Abi> hypot(doublev<Abi> x, doublev<Abi> y);
+template <class Abi> ldoublev<Abi> hypot(ldoublev<Abi> x, ldoublev<Abi> y);
+template <class Abi> floatv<Abi> hypot(floatv<Abi> x, floatv<Abi> y, floatv<Abi> z);
+template <class Abi> doublev<Abi> hypot(doublev<Abi> x, doublev<Abi> y, doublev<Abi> z);
+template <class Abi> ldoublev<Abi> hypot(ldoublev<Abi> x, ldoublev<Abi> y, ldoublev<Abi> z);
+
+template <class Abi> floatv<Abi> pow(floatv<Abi> x, floatv<Abi> y);
+template <class Abi> doublev<Abi> pow(doublev<Abi> x, doublev<Abi> y);
+template <class Abi> ldoublev<Abi> pow(ldoublev<Abi> x, ldoublev<Abi> y);
+
+template <class Abi> floatv<Abi> sqrt(floatv<Abi> x);
+template <class Abi> doublev<Abi> sqrt(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> sqrt(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> erf(floatv<Abi> x);
+template <class Abi> doublev<Abi> erf(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> erf(ldoublev<Abi> x);
+template <class Abi> floatv<Abi> erfc(floatv<Abi> x);
+template <class Abi> doublev<Abi> erfc(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> erfc(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> lgamma(floatv<Abi> x);
+template <class Abi> doublev<Abi> lgamma(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> lgamma(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> tgamma(floatv<Abi> x);
+template <class Abi> doublev<Abi> tgamma(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> tgamma(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> ceil(floatv<Abi> x);
+template <class Abi> doublev<Abi> ceil(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> ceil(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> floor(floatv<Abi> x);
+template <class Abi> doublev<Abi> floor(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> floor(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> nearbyint(floatv<Abi> x);
+template <class Abi> doublev<Abi> nearbyint(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> nearbyint(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> rint(floatv<Abi> x);
+template <class Abi> doublev<Abi> rint(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> rint(ldoublev<Abi> x);
+
+template <class Abi> samesize<long int, floatv<Abi>> lrint(floatv<Abi> x);
+template <class Abi> samesize<long int, doublev<Abi>> lrint(doublev<Abi> x);
+template <class Abi> samesize<long int, ldoublev<Abi>> lrint(ldoublev<Abi> x);
+template <class Abi> samesize<long long int, floatv<Abi>> llrint(floatv<Abi> x);
+template <class Abi> samesize<long long int, doublev<Abi>> llrint(doublev<Abi> x);
+template <class Abi> samesize<long long int, ldoublev<Abi>> llrint(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> round(floatv<Abi> x);
+template <class Abi> doublev<Abi> round(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> round(ldoublev<Abi> x);
+template <class Abi> samesize<long int, floatv<Abi>> lround(floatv<Abi> x);
+template <class Abi> samesize<long int, doublev<Abi>> lround(doublev<Abi> x);
+template <class Abi> samesize<long int, ldoublev<Abi>> lround(ldoublev<Abi> x);
+template <class Abi> samesize<long long int, floatv<Abi>> llround(floatv<Abi> x);
+template <class Abi> samesize<long long int, doublev<Abi>> llround(doublev<Abi> x);
+template <class Abi> samesize<long long int, ldoublev<Abi>> llround(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> trunc(floatv<Abi> x);
+template <class Abi> doublev<Abi> trunc(doublev<Abi> x);
+template <class Abi> ldoublev<Abi> trunc(ldoublev<Abi> x);
+
+template <class Abi> floatv<Abi> fmod(floatv<Abi> x, floatv<Abi> y);
+template <class Abi> doublev<Abi> fmod(doublev<Abi> x, doublev<Abi> y);
+template <class Abi> ldoublev<Abi> fmod(ldoublev<Abi> x, ldoublev<Abi> y);
+
+template <class Abi> floatv<Abi> remainder(floatv<Abi> x, floatv<Abi> y);
+template <class Abi> doublev<Abi> remainder(doublev<Abi> x, doublev<Abi> y);
+template <class Abi> ldoublev<Abi> remainder(ldoublev<Abi> x, ldoublev<Abi> y);
+
+template <class Abi> floatv<Abi> remquo(floatv<Abi> x, floatv<Abi> y, samesize<int, floatv<Abi>>* quo);
+template <class Abi> doublev<Abi> remquo(doublev<Abi> x, doublev<Abi> y, samesize<int, doublev<Abi>>* quo);
+template <class Abi> ldoublev<Abi> remquo(ldoublev<Abi> x, ldoublev<Abi> y, samesize<int, ldoublev<Abi>>* quo);
+
+template <class Abi> floatv<Abi> copysign(floatv<Abi> x, floatv<Abi> y);
+template <class Abi> doublev<Abi> copysign(doublev<Abi> x, doublev<Abi> y);
+template <class Abi> ldoublev<Abi> copysign(ldoublev<Abi> x, ldoublev<Abi> y);
+
+template <class Abi> doublev<Abi> nan(const char* tagp);
+template <class Abi> floatv<Abi> nanf(const char* tagp);
+template <class Abi> ldoublev<Abi> nanl(const char* tagp);
+
+template <class Abi> floatv<Abi> nextafter(floatv<Abi> x, floatv<Abi> y);
+template <class Abi> doublev<Abi> nextafter(doublev<Abi> x, doublev<Abi> y);
+template <class Abi> ldoublev<Abi> nextafter(ldoublev<Abi> x, ldoublev<Abi> y);
+
+template <class Abi> floatv<Abi> nexttoward(floatv<Abi> x, ldoublev<Abi> y);
+template <class Abi> doublev<Abi> nexttoward(doublev<Abi> x, ldoublev<Abi> y);
+template <class Abi> ldoublev<Abi> nexttoward(ldoublev<Abi> x, ldoublev<Abi> y);
+
+template <class Abi> floatv<Abi> fdim(floatv<Abi> x, floatv<Abi> y);
+template <class Abi> doublev<Abi> fdim(doublev<Abi> x, doublev<Abi> y);
+template <class Abi> ldoublev<Abi> fdim(ldoublev<Abi> x, ldoublev<Abi> y);
+
+template <class Abi> floatv<Abi> fmax(floatv<Abi> x, floatv<Abi> y);
+template <class Abi> doublev<Abi> fmax(doublev<Abi> x, doublev<Abi> y);
+template <class Abi> ldoublev<Abi> fmax(ldoublev<Abi> x, ldoublev<Abi> y);
+
+template <class Abi> floatv<Abi> fmin(floatv<Abi> x, floatv<Abi> y);
+template <class Abi> doublev<Abi> fmin(doublev<Abi> x, doublev<Abi> y);
+template <class Abi> ldoublev<Abi> fmin(ldoublev<Abi> x, ldoublev<Abi> y);
+
+template <class Abi> floatv<Abi> fma(floatv<Abi> x, floatv<Abi> y, floatv<Abi> z);
+template <class Abi> doublev<Abi> fma(doublev<Abi> x, doublev<Abi> y, doublev<Abi> z);
+template <class Abi> ldoublev<Abi> fma(ldoublev<Abi> x, ldoublev<Abi> y, ldoublev<Abi> z);
+
+template <class Abi> samesize<int, floatv<Abi>> fpclassify(floatv<Abi> x);
+template <class Abi> samesize<int, doublev<Abi>> fpclassify(doublev<Abi> x);
+template <class Abi> samesize<int, ldoublev<Abi>> fpclassify(ldoublev<Abi> x);
+
+template <class Abi> simd_mask<float, Abi> isfinite(floatv<Abi> x);
+template <class Abi> simd_mask<double, Abi> isfinite(doublev<Abi> x);
+template <class Abi> simd_mask<long double, Abi> isfinite(ldoublev<Abi> x);
+
+template <class Abi> simd_mask<float, Abi> isinf(floatv<Abi> x);
+template <class Abi> simd_mask<double, Abi> isinf(doublev<Abi> x);
+template <class Abi> simd_mask<long double, Abi> isinf(ldoublev<Abi> x);
+
+template <class Abi> simd_mask<float, Abi> isnan(floatv<Abi> x);
+template <class Abi> simd_mask<double, Abi> isnan(doublev<Abi> x);
+template <class Abi> simd_mask<long double, Abi> isnan(ldoublev<Abi> x);
+
+template <class Abi> simd_mask<float, Abi> isnormal(floatv<Abi> x);
+template <class Abi> simd_mask<double, Abi> isnormal(doublev<Abi> x);
+template <class Abi> simd_mask<long double, Abi> isnormal(ldoublev<Abi> x);
+
+template <class Abi> simd_mask<float, Abi> signbit(floatv<Abi> x);
+template <class Abi> simd_mask<double, Abi> signbit(doublev<Abi> x);
+template <class Abi> simd_mask<long double, Abi> signbit(ldoublev<Abi> x);
+
+template <class Abi> simd_mask<float, Abi> isgreater(floatv<Abi> x, floatv<Abi> y);
+template <class Abi> simd_mask<double, Abi> isgreater(doublev<Abi> x, doublev<Abi> y);
+template <class Abi> simd_mask<long double, Abi> isgreater(ldoublev<Abi> x, ldoublev<Abi> y);
+
+template <class Abi> simd_mask<float, Abi> isgreaterequal(floatv<Abi> x, floatv<Abi> y);
+template <class Abi> simd_mask<double, Abi> isgreaterequal(doublev<Abi> x, doublev<Abi> y);
+template <class Abi> simd_mask<long double, Abi> isgreaterequal(ldoublev<Abi> x, ldoublev<Abi> y);
+
+template <class Abi> simd_mask<float, Abi> isless(floatv<Abi> x, floatv<Abi> y);
+template <class Abi> simd_mask<double, Abi> isless(doublev<Abi> x, doublev<Abi> y);
+template <class Abi> simd_mask<long double, Abi> isless(ldoublev<Abi> x, ldoublev<Abi> y);
+
+template <class Abi> simd_mask<float, Abi> islessequal(floatv<Abi> x, floatv<Abi> y);
+template <class Abi> simd_mask<double, Abi> islessequal(doublev<Abi> x, doublev<Abi> y);
+template <class Abi> simd_mask<long double, Abi> islessequal(ldoublev<Abi> x, ldoublev<Abi> y);
+
+template <class Abi> simd_mask<float, Abi> islessgreater(floatv<Abi> x, floatv<Abi> y);
+template <class Abi> simd_mask<double, Abi> islessgreater(doublev<Abi> x, doublev<Abi> y);
+template <class Abi> simd_mask<long double, Abi> islessgreater(ldoublev<Abi> x, ldoublev<Abi> y);
+
+template <class Abi> simd_mask<float, Abi> isunordered(floatv<Abi> x, floatv<Abi> y);
+template <class Abi> simd_mask<double, Abi> isunordered(doublev<Abi> x, doublev<Abi> y);
+template <class Abi> simd_mask<long double, Abi> isunordered(ldoublev<Abi> x, ldoublev<Abi> y);
+
+template <class V> struct simd_div_t { V quot, rem; };
+template <class Abi> simd_div_t<scharv<Abi>> div(scharv<Abi> numer, scharv<Abi> denom);
+template <class Abi> simd_div_t<shortv<Abi>> div(shortv<Abi> numer, shortv<Abi> denom);
+template <class Abi> simd_div_t<intv<Abi>> div(intv<Abi> numer, intv<Abi> denom);
+template <class Abi> simd_div_t<longv<Abi>> div(longv<Abi> numer, longv<Abi> denom);
+template <class Abi> simd_div_t<llongv<Abi>> div(llongv<Abi> numer, llongv<Abi> denom);
+
+// [simd.mask.class]
+template <class T, class Abi>
+class simd_mask {
+public:
+  using value_type = bool;
+  using reference = see below;
+  using simd_type = simd<T, Abi>;
+  using abi_type = Abi;
+  static constexpr size_t size() noexcept;
+  simd_mask() = default;
+
+  // broadcast constructor
+  explicit simd_mask(value_type) noexcept;
+
+  // implicit type conversion constructor
+  template <class U> simd_mask(const simd_mask<U, simd_abi::fixed_size<size()>>&) noexcept;
+
+  // load constructor
+  template <class Flags> simd_mask(const value_type* mem, Flags);
+
+  // loads [simd.mask.copy]
+  template <class Flags> void copy_from(const value_type* mem, Flags);
+  template <class Flags> void copy_to(value_type* mem, Flags) const;
+
+  // scalar access [simd.mask.subscr]
+  reference operator[](size_t);
+  value_type operator[](size_t) const;
+
+  // unary operators [simd.mask.unary]
+  simd_mask operator!() const noexcept;
+
+  // simd_mask binary operators [simd.mask.binary]
+  friend simd_mask operator&&(const simd_mask&, const simd_mask&) noexcept;
+  friend simd_mask operator||(const simd_mask&, const simd_mask&) noexcept;
+  friend simd_mask operator& (const simd_mask&, const simd_mask&) noexcept;
+  friend simd_mask operator| (const simd_mask&, const simd_mask&) noexcept;
+  friend simd_mask operator^ (const simd_mask&, const simd_mask&) noexcept;
+
+  // simd_mask compound assignment [simd.mask.cassign]
+  friend simd_mask& operator&=(simd_mask&, const simd_mask&) noexcept;
+  friend simd_mask& operator|=(simd_mask&, const simd_mask&) noexcept;
+  friend simd_mask& operator^=(simd_mask&, const simd_mask&) noexcept;
+
+  // simd_mask compares [simd.mask.comparison]
+  friend simd_mask operator==(const simd_mask&, const simd_mask&) noexcept;
+  friend simd_mask operator!=(const simd_mask&, const simd_mask&) noexcept;
+};
+
+} // parallelism_v2
+} // std::experimental
+
+*/
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <functional>
+#include <bitset>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#pragma GCC system_header
+#endif
+
+namespace athena::_simd {
+
+enum class _StorageKind { // storage-strategy tag; used as the first template argument of __simd_abi
+  _Scalar, // paired with size 1 by the scalar __simd_storage specialization
+  _Array, // presumably array-backed storage - NOTE(review): confirm against its __simd_storage specialization
+  _VecExt, // presumably compiler vector-extension storage (cf. __vec_ext_traits) - NOTE(review): confirm
+};
+
+template <_StorageKind __kind, int _Np> // _Np is presumably the element count (the scalar specialization uses 1) - NOTE(review): confirm
+struct __simd_abi {}; // empty tag type: the storage kind and _Np are carried in the type only
+
+template <class _Tp, class _Abi> // _Tp: element type; _Abi: a __simd_abi tag
+class __simd_storage {}; // primary template has no members; specializations on _Abi supply __get/__set
+
+template <class _Tp, class _Abi> // same parameter shape as __simd_storage
+class __simd_mask_storage {}; // primary template has no members; NOTE(review): specializations are expected elsewhere - confirm
+
+template <class _Tp> // storage for the scalar ABI: exactly one _Tp element
+class __simd_storage<_Tp, __simd_abi<_StorageKind::_Scalar, 1>> {
+  _Tp __storage_; // the single stored element (private)
+
+  template <class, class>
+  friend struct simd; // grants simd access to __storage_
+
+  template <class, class>
+  friend struct simd_mask; // grants simd_mask access to __storage_
+
+public:
+  _Tp __get(size_t __index) const noexcept { return (&__storage_)[__index]; }; // indexes from &__storage_ with no bounds check; only __index == 0 refers to the element
+  void __set(size_t __index, _Tp __val) noexcept {
+    (&__storage_)[__index] = __val; // same addressing as __get: only __index == 0 is inside the object
+  }
+};
+
+#ifndef _LIBCPP_HAS_NO_VECTOR_EXTENSION
+
+constexpr size_t __floor_pow_of_2(size_t __val) { // largest power of two <= __val; returns 0 for 0
+  return ((__val - 1) & __val) == 0 ? __val // at most one bit set: already a power of two (or zero)
+                                    : __floor_pow_of_2((__val - 1) & __val); // clear the lowest set bit and retry
+}
+
+constexpr size_t __ceil_pow_of_2(size_t __val) { // smallest power of two >= __val for __val >= 1
+  return __val == 1 ? 1 : __floor_pow_of_2(__val - 1) << 1; // __val == 0 yields 0: __val - 1 wraps to the maximum and the shift drops the top bit
+}
+
+template <class _Tp, size_t __bytes> // _Tp: element type; __bytes: requested vector size in bytes
+struct __vec_ext_traits { // exposes a compiler vector-extension type as the member `type`
+#if !defined(_LIBCPP_COMPILER_CLANG) // with that macro defined the primary template is empty and `type` comes from the explicit specializations generated by _LIBCPP_SPECIALIZE_VEC_EXT
+  typedef _Tp type __attribute__((vector_size(__ceil_pow_of_2(__bytes)))); // byte size is rounded up to a power of two
+#endif
+};
+
+#if defined(_LIBCPP_COMPILER_CLANG)  // Clang path: explicit specializations for 1..32 elements per type
+#define _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, _NUM_ELEMENT)                        \
+  template <>                                                                  \
+  struct __vec_ext_traits<_TYPE, sizeof(_TYPE) * _NUM_ELEMENT> {               \
+    using type =                                                               \
+        _TYPE __attribute__((vector_size(sizeof(_TYPE) * _NUM_ELEMENT)));      \
+  }
+
+#define _LIBCPP_SPECIALIZE_VEC_EXT_32(_TYPE)                                   \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 1);                                        \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 2);                                        \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 3);                                        \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 4);                                        \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 5);                                        \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 6);                                        \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 7);                                        \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 8);                                        \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 9);                                        \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 10);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 11);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 12);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 13);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 14);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 15);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 16);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 17);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 18);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 19);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 20);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 21);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 22);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 23);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 24);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 25);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 26);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 27);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 28);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 29);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 30);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 31);                                       \
+  _LIBCPP_SPECIALIZE_VEC_EXT(_TYPE, 32);
+
+_LIBCPP_SPECIALIZE_VEC_EXT_32(char);  // one batch of 32 specializations per element type
+_LIBCPP_SPECIALIZE_VEC_EXT_32(char16_t);
+_LIBCPP_SPECIALIZE_VEC_EXT_32(char32_t);
+_LIBCPP_SPECIALIZE_VEC_EXT_32(wchar_t);
+_LIBCPP_SPECIALIZE_VEC_EXT_32(signed char);
+_LIBCPP_SPECIALIZE_VEC_EXT_32(signed short);
+_LIBCPP_SPECIALIZE_VEC_EXT_32(signed int);
+_LIBCPP_SPECIALIZE_VEC_EXT_32(signed long);
+_LIBCPP_SPECIALIZE_VEC_EXT_32(signed long long);
+_LIBCPP_SPECIALIZE_VEC_EXT_32(unsigned char);
+_LIBCPP_SPECIALIZE_VEC_EXT_32(unsigned short);
+_LIBCPP_SPECIALIZE_VEC_EXT_32(unsigned int);
+_LIBCPP_SPECIALIZE_VEC_EXT_32(unsigned long);
+_LIBCPP_SPECIALIZE_VEC_EXT_32(unsigned long long);
+_LIBCPP_SPECIALIZE_VEC_EXT_32(float);
+_LIBCPP_SPECIALIZE_VEC_EXT_32(double);
+_LIBCPP_SPECIALIZE_VEC_EXT_32(long double);
+
+#undef _LIBCPP_SPECIALIZE_VEC_EXT_32  // helper macros are local to this section
+#undef _LIBCPP_SPECIALIZE_VEC_EXT
+#endif
+
+template <class _Tp, int __num_element>  // storage backed by a compiler vector-extension type
+class __simd_storage<_Tp, __simd_abi<_StorageKind::_VecExt, __num_element>> {
+  using _StorageType =
+      typename __vec_ext_traits<_Tp, sizeof(_Tp) * __num_element>::type;
+
+  _StorageType __storage_;
+
+  template <class, class>
+  friend class simd;  // class-key matches the `class simd` declaration
+
+  template <class, class>
+  friend class simd_mask;
+
+public:
+  _Tp __get(size_t __index) const noexcept { return __storage_[__index]; }  // element read via vector subscript
+  void __set(size_t __index, _Tp __val) noexcept {
+    __storage_[__index] = __val;
+  }
+};
+
+#endif // _LIBCPP_HAS_NO_VECTOR_EXTENSION
+
+template <class _Vp, class _Tp, class _Abi>  // proxy for one element of a simd's storage
+class __simd_reference {
+  static_assert(std::is_same<_Vp, _Tp>::value, "");
+
+  template <class, class>
+  friend class simd;  // class-key matches the `class simd` declaration
+
+  template <class, class>
+  friend class simd_mask;
+
+  __simd_storage<_Tp, _Abi>* __ptr_;  // storage being referred to
+  size_t __index_;                    // element position within that storage
+
+  __simd_reference(__simd_storage<_Tp, _Abi>* __ptr, size_t __index)
+      : __ptr_(__ptr), __index_(__index) {}  // private: only the friends above create proxies
+
+  __simd_reference(const __simd_reference&) = default;
+
+public:
+  __simd_reference() = delete;
+  __simd_reference& operator=(const __simd_reference& other) = delete;
+
+  operator _Vp() const { return __ptr_->__get(__index_); }  // read the element
+
+  __simd_reference operator=(_Vp __value) && {  // write the element; rvalue proxies only
+    __ptr_->__set(__index_, __value);
+    return *this;
+  }
+
+  __simd_reference operator++() && {
+    return std::move(*this) = __ptr_->__get(__index_) + 1;
+  }
+
+  _Vp operator++(int) && {  // returns the value held before the increment
+    auto __val = __ptr_->__get(__index_);
+    __ptr_->__set(__index_, __val + 1);
+    return __val;
+  }
+
+  __simd_reference operator--() && {
+    return std::move(*this) = __ptr_->__get(__index_) - 1;
+  }
+
+  _Vp operator--(int) && {  // returns the value held before the decrement
+    auto __val = __ptr_->__get(__index_);
+    __ptr_->__set(__index_, __val - 1);
+    return __val;
+  }
+
+  __simd_reference operator+=(_Vp __value) && {  // compound ops: read, combine, write back
+    return std::move(*this) = __ptr_->__get(__index_) + __value;
+  }
+
+  __simd_reference operator-=(_Vp __value) && {
+    return std::move(*this) = __ptr_->__get(__index_) - __value;
+  }
+
+  __simd_reference operator*=(_Vp __value) && {
+    return std::move(*this) = __ptr_->__get(__index_) * __value;
+  }
+
+  __simd_reference operator/=(_Vp __value) && {
+    return std::move(*this) = __ptr_->__get(__index_) / __value;
+  }
+
+  __simd_reference operator%=(_Vp __value) && {
+    return std::move(*this) = __ptr_->__get(__index_) % __value;
+  }
+
+  __simd_reference operator>>=(_Vp __value) && {
+    return std::move(*this) = __ptr_->__get(__index_) >> __value;
+  }
+
+  __simd_reference operator<<=(_Vp __value) && {
+    return std::move(*this) = __ptr_->__get(__index_) << __value;
+  }
+
+  __simd_reference operator&=(_Vp __value) && {
+    return std::move(*this) = __ptr_->__get(__index_) & __value;
+  }
+
+  __simd_reference operator|=(_Vp __value) && {
+    return std::move(*this) = __ptr_->__get(__index_) | __value;
+  }
+
+  __simd_reference operator^=(_Vp __value) && {
+    return std::move(*this) = __ptr_->__get(__index_) ^ __value;
+  }
+
+  bool operator<(_Vp __value) const {  // comparisons act on the current element value
+    return __ptr_->__get(__index_) < __value;
+  }
+
+  bool operator<=(_Vp __value) const {
+    return __ptr_->__get(__index_) <= __value;
+  }
+
+  bool operator>(_Vp __value) const {
+    return __ptr_->__get(__index_) > __value;
+  }
+
+  bool operator>=(_Vp __value) const {
+    return __ptr_->__get(__index_) >= __value;
+  }
+
+  bool operator==(_Vp __value) const {
+    return __ptr_->__get(__index_) == __value;
+  }
+
+  bool operator!=(_Vp __value) const {
+    return __ptr_->__get(__index_) != __value;
+  }
+};
+
+template <class _Tp, class _Abi>  // proxy for one bool element of a simd_mask's storage
+class __simd_mask_reference {
+  template <class, class>
+  friend class simd;  // class-key matches the `class simd` declaration
+
+  template <class, class>
+  friend class simd_mask;
+
+  __simd_mask_storage<_Tp, _Abi>* __ptr_;  // mask storage being referred to
+  size_t __index_;                         // element position within that storage
+
+  __simd_mask_reference(__simd_mask_storage<_Tp, _Abi>* __ptr, size_t __index)
+    : __ptr_(__ptr), __index_(__index) {}  // private: only the friends above create proxies
+
+  __simd_mask_reference(const __simd_mask_reference&) = default;
+
+public:
+  __simd_mask_reference() = delete;
+  __simd_mask_reference& operator=(const __simd_mask_reference&) = delete;
+
+  operator bool() const { return __ptr_->__get(__index_); }  // read the element
+
+  __simd_mask_reference operator=(bool __value) && {  // write the element; rvalue proxies only
+    __ptr_->__set(__index_, __value);
+    return *this;
+  }
+};
+
+template <class _To, class _From>
+constexpr decltype(_To{std::declval<_From>()}, true)  // viable only if _To{<_From value>} is well-formed
+__is_non_narrowing_convertible_impl(_From) {
+  return true;
+}
+
+template <class _To>
+constexpr bool __is_non_narrowing_convertible_impl(...) {  // fallback when the overload above drops out
+  return false;
+}
+
+// Reports whether a value of arithmetic type _From can brace-initialise the
+// arithmetic type _To, i.e. without a narrowing conversion. If either type is
+// not arithmetic the answer is always false.
+//
+// The arithmetic test is resolved with `if constexpr`, so the brace-init probe
+// is only instantiated when both types are arithmetic.
+template <class _From, class _To>
+constexpr bool __is_non_narrowing_arithmetic_convertible() {
+  constexpr bool __both_arithmetic =
+      std::is_arithmetic<_From>::value && std::is_arithmetic<_To>::value;
+  if constexpr (__both_arithmetic) {
+    return __is_non_narrowing_convertible_impl<_To>(_From{});
+  } else {
+    return false;
+  }
+}
+
+template <class _Tp>
+constexpr _Tp __variadic_sum() {  // base case: the empty sum is a value-initialised _Tp
+  return _Tp{};
+}
+
+template <class _Tp, class _Up, class... _Args>  // _Tp cannot be deduced; callers must name it
+constexpr _Tp __variadic_sum(_Up __first, _Args... __rest) {
+  return static_cast<_Tp>(__first) + __variadic_sum<_Tp>(__rest...);  // each term is cast to _Tp first
+}
+
+template <class _Tp>  // identity wrapper: `typename __nodeduce<T>::type` is a non-deduced context
+struct __nodeduce {
+  using type = _Tp;
+};
+
+template <class _Tp>  // true for cv-unqualified arithmetic types other than bool
+constexpr bool __vectorizable() {
+  using _Unqualified = typename std::remove_cv<_Tp>::type;
+  return std::is_same<_Unqualified, _Tp>::value && std::is_arithmetic<_Tp>::value && !std::is_same<_Tp, bool>::value;
+}
+
+}
+namespace athena::_simd::simd_abi {
+// ABI tags: each alias names the __simd_abi<kind, element count> used for storage.
+using scalar = __simd_abi<_StorageKind::_Scalar, 1>;
+
+template <int _Np>
+using fixed_size = __simd_abi<_StorageKind::_Array, _Np>;
+
+template <class _Tp>
+inline constexpr size_t max_fixed_size = 32;
+
+template <class _Tp>  // as many _Tp elements as fit in 16 bytes
+using compatible = fixed_size<16 / sizeof(_Tp)>;
+
+#ifndef _LIBCPP_HAS_NO_VECTOR_EXTENSION
+template <class _Tp>
+using native = __simd_abi<_StorageKind::_VecExt, 16 / sizeof(_Tp)>;
+#else
+template <class _Tp>
+using native =
+    fixed_size<16 / sizeof(_Tp)>;  // fixed_size takes only an element count, not a type
+#endif // _LIBCPP_HAS_NO_VECTOR_EXTENSION
+
+}
+namespace athena::_simd {
+
+template <class _Tp, class _Abi = simd_abi::compatible<_Tp>>  // ABI defaults to the 16-byte fixed_size tag
+class simd;
+template <class _Tp, class _Abi = simd_abi::compatible<_Tp>>  // same default ABI as simd
+class simd_mask;
+
+struct element_aligned_tag {};  // empty flag types; recognised by is_simd_flag_type
+struct vector_aligned_tag {};
+template <size_t>
+struct overaligned_tag {};
+inline constexpr element_aligned_tag element_aligned{};  // ready-made flag objects
+inline constexpr vector_aligned_tag vector_aligned{};
+template <size_t _Np>
+inline constexpr overaligned_tag<_Np> overaligned{};
+
+// traits [simd.traits]
+template <class _Tp>  // false unless _Tp is a __simd_abi tag
+struct is_abi_tag : std::false_type {};
+
+template <_StorageKind __kind, int _Np>
+struct is_abi_tag<__simd_abi<__kind, _Np>>
+    : std::true_type {};
+
+template <class _Tp>  // false unless _Tp is a simd specialisation
+struct is_simd : std::false_type {};
+
+template <class _Tp, class _Abi>
+struct is_simd<simd<_Tp, _Abi>> : std::true_type {};
+
+template <class _Tp>  // false unless _Tp is a simd_mask specialisation
+struct is_simd_mask : std::false_type {};
+
+template <class _Tp, class _Abi>
+struct is_simd_mask<simd_mask<_Tp, _Abi>>
+    : std::true_type {};
+
+template <class _Tp>  // true only for the three alignment flag tag types
+struct is_simd_flag_type : std::false_type {};
+
+template <>
+struct is_simd_flag_type<element_aligned_tag>
+    : std::true_type {};
+
+template <>
+struct is_simd_flag_type<vector_aligned_tag>
+    : std::true_type {};
+
+template <size_t _Align>
+struct is_simd_flag_type<overaligned_tag<_Align>>
+    : std::true_type {};
+
+template <class _Tp>  // variable-template shorthands for the ::value of the traits above
+inline constexpr bool is_abi_tag_v = is_abi_tag<_Tp>::value;
+template <class _Tp>
+inline constexpr bool is_simd_v = is_simd<_Tp>::value;
+template <class _Tp>
+inline constexpr bool is_simd_mask_v = is_simd_mask<_Tp>::value;
+template <class _Tp>
+inline constexpr bool is_simd_flag_type_v =
+    is_simd_flag_type<_Tp>::value;
+template <class _Tp, size_t _Np>  // always yields fixed_size<_Np>; _Tp does not affect the result
+struct abi_for_size {
+  using type = simd_abi::fixed_size<_Np>;
+};
+template <class _Tp, size_t _Np>  // shorthand for abi_for_size<_Tp, _Np>::type
+using abi_for_size_t = typename abi_for_size<_Tp, _Np>::type;
+
+template <class _Tp, class _Abi = simd_abi::compatible<_Tp>>
+struct simd_size;  // only the __simd_abi partial specialisation below is defined here
+
+template <class _Tp, _StorageKind __kind, int _Np>
+struct simd_size<_Tp, __simd_abi<__kind, _Np>>
+    : std::integral_constant<size_t, _Np> {  // element count is taken straight from the tag
+  static_assert(
+      std::is_arithmetic<_Tp>::value &&
+          !std::is_same<typename std::remove_const<_Tp>::type, bool>::value,
+      "Element type should be vectorizable");
+};
+
+// TODO: implement it. Only this declaration is visible in this part of the file.
+template <class _Tp, class _Up = typename _Tp::value_type>
+struct memory_alignment;
+
+template <class _Tp, class _Abi = simd_abi::compatible<_Tp>>  // shorthand for simd_size<...>::value
+inline constexpr size_t simd_size_v = simd_size<_Tp, _Abi>::value;
+
+template <class _Tp, class _Up = typename _Tp::value_type>  // needs a memory_alignment definition to be usable
+inline constexpr size_t memory_alignment_v =
+    memory_alignment<_Tp, _Up>::value;
+
+// class template simd [simd.class] -- aliases that pin the ABI tag
+template <class _Tp>
+using native_simd = simd<_Tp, simd_abi::native<_Tp>>;
+template <class _Tp, int _Np>
+using fixed_size_simd = simd<_Tp, simd_abi::fixed_size<_Np>>;
+
+// class template simd_mask [simd.mask.class] -- mask aliases mirroring the two above
+template <class _Tp>
+using native_simd_mask = simd_mask<_Tp, simd_abi::native<_Tp>>;
+
+template <class _Tp, int _Np>
+using fixed_size_simd_mask = simd_mask<_Tp, simd_abi::fixed_size<_Np>>;
+
+// casts [simd.casts] -- helper traits; __apply is declared but not defined in this part of the file
+template <class _Tp>
+struct __static_simd_cast_traits {  // target given as an element type: the source ABI is kept
+  template <class _Up, class _Abi>
+  static simd<_Tp, _Abi> __apply(const simd<_Up, _Abi>& __v);
+};
+
+template <class _Tp, class _NewAbi>
+struct __static_simd_cast_traits<simd<_Tp, _NewAbi>> {  // target given as a simd type
+  template <class _Up, class _Abi>
+  static typename std::enable_if<simd<_Up, _Abi>::size() ==
+                                     simd<_Tp, _NewAbi>::size(),  // element counts must match
+                                 simd<_Tp, _NewAbi>>::type
+  __apply(const simd<_Up, _Abi>& __v);
+};
+
+template <class _Tp>  // like __static_simd_cast_traits, plus a non-narrowing requirement
+struct __simd_cast_traits {
+  template <class _Up, class _Abi>
+  static typename std::enable_if<
+      __is_non_narrowing_arithmetic_convertible<_Up, _Tp>(),
+      simd<_Tp, _Abi>>::type
+  __apply(const simd<_Up, _Abi>& __v);
+};
+
+template <class _Tp, class _NewAbi>
+struct __simd_cast_traits<simd<_Tp, _NewAbi>> {  // target given as a simd type
+  template <class _Up, class _Abi>
+  static typename std::enable_if<
+      __is_non_narrowing_arithmetic_convertible<_Up, _Tp>() &&
+          simd<_Up, _Abi>::size() == simd<_Tp, _NewAbi>::size(),  // and element counts must match
+      simd<_Tp, _NewAbi>>::type
+  __apply(const simd<_Up, _Abi>& __v);
+};
+
+template <class _Tp, class _Up, class _Abi>  // forwards to __simd_cast_traits<_Tp>::__apply
+auto simd_cast(const simd<_Up, _Abi>& __v)
+    -> decltype(__simd_cast_traits<_Tp>::__apply(__v)) {  // drops out of overload resolution if __apply is not viable
+  return __simd_cast_traits<_Tp>::__apply(__v);
+}
+
+template <class _Tp, class _Up, class _Abi>  // forwards to __static_simd_cast_traits<_Tp>::__apply
+auto static_simd_cast(const simd<_Up, _Abi>& __v)
+    -> decltype(__static_simd_cast_traits<_Tp>::__apply(__v)) {  // drops out of overload resolution if __apply is not viable
+  return __static_simd_cast_traits<_Tp>::__apply(__v);
+}
+
+template <class _Tp, class _Abi>  // ABI conversions: prototypes only in this part of the file
+fixed_size_simd<_Tp, simd_size<_Tp, _Abi>::value>
+to_fixed_size(const simd<_Tp, _Abi>&) noexcept;
+
+template <class _Tp, class _Abi>
+fixed_size_simd_mask<_Tp, simd_size<_Tp, _Abi>::value>
+to_fixed_size(const simd_mask<_Tp, _Abi>&) noexcept;
+
+template <class _Tp, size_t _Np>  // fixed_size -> native ABI
+native_simd<_Tp> to_native(const fixed_size_simd<_Tp, _Np>&) noexcept;
+
+template <class _Tp, size_t _Np>
+native_simd_mask<_Tp> to_native(const fixed_size_simd_mask<_Tp, _Np>&) noexcept;
+
+template <class _Tp, size_t _Np>  // fixed_size -> default (compatible) ABI
+simd<_Tp> to_compatible(const fixed_size_simd<_Tp, _Np>&) noexcept;
+
+template <class _Tp, size_t _Np>
+simd_mask<_Tp> to_compatible(const fixed_size_simd_mask<_Tp, _Np>&) noexcept;
+
+template <size_t... __sizes, class _Tp, class _Abi>  // one tuple element per listed size (prototype only)
+std::tuple<simd<_Tp, abi_for_size_t<_Tp, __sizes>>...> split(const simd<_Tp, _Abi>&);
+
+template <size_t... __sizes, class _Tp, class _Abi>
+std::tuple<simd_mask<_Tp, abi_for_size_t<_Tp, __sizes>>...>
+split(const simd_mask<_Tp, _Abi>&);
+
+template <class _SimdType, class _Abi>  // array of (source size / _SimdType::size()) pieces
+std::array<_SimdType, simd_size<typename _SimdType::value_type, _Abi>::value /
+                          _SimdType::size()>
+split(const simd<typename _SimdType::value_type, _Abi>&);
+
+template <class _SimdType, class _Abi>
+std::array<_SimdType, simd_size<typename _SimdType::value_type, _Abi>::value /
+                          _SimdType::size()>
+split(const simd_mask<typename _SimdType::value_type, _Abi>&);
+
+template <class _Tp, class... _Abis>  // result size is the sum of the input sizes (prototype only)
+simd<_Tp, abi_for_size_t<_Tp, __variadic_sum<size_t>(simd_size<_Tp, _Abis>::value...)>>
+concat(const simd<_Tp, _Abis>&...);
+
+template <class _Tp, class... _Abis>  // __variadic_sum needs its result type named explicitly
+simd_mask<_Tp,
+          abi_for_size_t<_Tp, __variadic_sum<size_t>(simd_size<_Tp, _Abis>::value...)>>
+concat(const simd_mask<_Tp, _Abis>&...);
+
+// reductions [simd.mask.reductions] -- prototypes only in this part of the file
+template <class _Tp, class _Abi>
+bool all_of(const simd_mask<_Tp, _Abi>&) noexcept;
+template <class _Tp, class _Abi>
+bool any_of(const simd_mask<_Tp, _Abi>&) noexcept;
+template <class _Tp, class _Abi>
+bool none_of(const simd_mask<_Tp, _Abi>&) noexcept;
+template <class _Tp, class _Abi>
+bool some_of(const simd_mask<_Tp, _Abi>&) noexcept;
+template <class _Tp, class _Abi>
+int popcount(const simd_mask<_Tp, _Abi>&) noexcept;
+template <class _Tp, class _Abi>
+int find_first_set(const simd_mask<_Tp, _Abi>&);  // not noexcept, unlike the reductions above
+template <class _Tp, class _Abi>
+int find_last_set(const simd_mask<_Tp, _Abi>&);
+bool all_of(bool) noexcept;  // overloads taking a plain bool instead of a simd_mask
+bool any_of(bool) noexcept;
+bool none_of(bool) noexcept;
+bool some_of(bool) noexcept;
+int popcount(bool) noexcept;
+int find_first_set(bool) noexcept;
+int find_last_set(bool) noexcept;
+
+// masked assignment [simd.whereexpr] -- forward declarations; the classes are defined further down
+template <class _MaskType, class _Tp>
+class const_where_expression;
+template <class _MaskType, class _Tp>
+class where_expression;
+
+// masked assignment [simd.mask.where] -- prototypes only in this part of the file
+template <class _Tp, class _Abi>
+where_expression<simd_mask<_Tp, _Abi>, simd<_Tp, _Abi>>
+where(const typename simd<_Tp, _Abi>::mask_type&, simd<_Tp, _Abi>&) noexcept;
+
+template <class _Tp, class _Abi>  // const value: yields the read-only expression type
+const_where_expression<simd_mask<_Tp, _Abi>, const simd<_Tp, _Abi>>
+where(const typename simd<_Tp, _Abi>::mask_type&,
+      const simd<_Tp, _Abi>&) noexcept;
+
+template <class _Tp, class _Abi>  // mask-on-mask form; _Tp/_Abi are deduced from the second argument only
+where_expression<simd_mask<_Tp, _Abi>, simd_mask<_Tp, _Abi>>
+where(const typename __nodeduce<simd_mask<_Tp, _Abi>>::type&,
+      simd_mask<_Tp, _Abi>&) noexcept;
+
+template <class _Tp, class _Abi>
+const_where_expression<simd_mask<_Tp, _Abi>, const simd_mask<_Tp, _Abi>>
+where(const typename __nodeduce<simd_mask<_Tp, _Abi>>::type&,
+      const simd_mask<_Tp, _Abi>&) noexcept;
+
+template <class _Tp>  // plain-bool mask applied to an arbitrary _Tp
+where_expression<bool, _Tp> where(bool, _Tp&) noexcept;
+
+template <class _Tp>
+const_where_expression<bool, const _Tp> where(bool, const _Tp&) noexcept;
+
+// reductions [simd.reductions] -- prototypes only in this part of the file
+template <class _Tp, class _Abi, class _BinaryOp = std::plus<_Tp>>
+_Tp reduce(const simd<_Tp, _Abi>&, _BinaryOp = _BinaryOp());
+
+template <class _MaskType, class _SimdType, class _BinaryOp>  // masked form with an explicit neutral element
+typename _SimdType::value_type
+reduce(const const_where_expression<_MaskType, _SimdType>&,
+       typename _SimdType::value_type neutral_element, _BinaryOp binary_op);
+
+template <class _MaskType, class _SimdType>  // masked forms for the standard functors, std::-qualified
+typename _SimdType::value_type
+reduce(const const_where_expression<_MaskType, _SimdType>&,
+       std::plus<typename _SimdType::value_type> binary_op = {});
+
+template <class _MaskType, class _SimdType>
+typename _SimdType::value_type
+reduce(const const_where_expression<_MaskType, _SimdType>&,
+       std::multiplies<typename _SimdType::value_type> binary_op);
+
+template <class _MaskType, class _SimdType>
+typename _SimdType::value_type
+reduce(const const_where_expression<_MaskType, _SimdType>&,
+       std::bit_and<typename _SimdType::value_type> binary_op);
+
+template <class _MaskType, class _SimdType>
+typename _SimdType::value_type
+reduce(const const_where_expression<_MaskType, _SimdType>&,
+       std::bit_or<typename _SimdType::value_type> binary_op);
+
+template <class _MaskType, class _SimdType>
+typename _SimdType::value_type
+reduce(const const_where_expression<_MaskType, _SimdType>&,
+       std::bit_xor<typename _SimdType::value_type> binary_op);
+
+template <class _Tp, class _Abi>  // horizontal min/max: prototypes only in this part of the file
+_Tp hmin(const simd<_Tp, _Abi>&);
+template <class _MaskType, class _SimdType>  // masked variant
+typename _SimdType::value_type
+hmin(const const_where_expression<_MaskType, _SimdType>&);
+template <class _Tp, class _Abi>
+_Tp hmax(const simd<_Tp, _Abi>&);
+template <class _MaskType, class _SimdType>  // masked variant
+typename _SimdType::value_type
+hmax(const const_where_expression<_MaskType, _SimdType>&);
+
+// algorithms [simd.alg] -- prototypes only in this part of the file
+template <class _Tp, class _Abi>
+simd<_Tp, _Abi> min(const simd<_Tp, _Abi>&, const simd<_Tp, _Abi>&) noexcept;
+
+template <class _Tp, class _Abi>
+simd<_Tp, _Abi> max(const simd<_Tp, _Abi>&, const simd<_Tp, _Abi>&) noexcept;
+
+template <class _Tp, class _Abi>  // returns a pair of simd values
+std::pair<simd<_Tp, _Abi>, simd<_Tp, _Abi>>
+minmax(const simd<_Tp, _Abi>&, const simd<_Tp, _Abi>&) noexcept;
+
+template <class _Tp, class _Abi>  // not declared noexcept, unlike min/max/minmax
+simd<_Tp, _Abi> clamp(const simd<_Tp, _Abi>&, const simd<_Tp, _Abi>&,
+                      const simd<_Tp, _Abi>&);
+
+// [simd.whereexpr] Read-only masked view (skeleton: members are declared, not defined).
+// TODO implement where expressions. No usable constructor is declared yet.
+template <class _MaskType, class _Tp>
+class const_where_expression {
+public:
+  const_where_expression(const const_where_expression&) = delete;  // non-copyable
+  const_where_expression& operator=(const const_where_expression&) = delete;
+  typename std::remove_const<_Tp>::type operator-() const&&;  // std::-qualified like the rest of the header
+  template <class _Up, class _Flags>
+  void copy_to(_Up*, _Flags) const&&;  // callable on rvalues only
+};
+
+template <class _MaskType, class _Tp>  // mutable masked view (skeleton: members are declared, not defined)
+class where_expression : public const_where_expression<_MaskType, _Tp> {
+public:
+  where_expression(const where_expression&) = delete;  // non-copyable, like its base
+  where_expression& operator=(const where_expression&) = delete;
+  template <class _Up>
+  void operator=(_Up&&);  // assignment and compound assignment all return void
+  template <class _Up>
+  void operator+=(_Up&&);
+  template <class _Up>
+  void operator-=(_Up&&);
+  template <class _Up>
+  void operator*=(_Up&&);
+  template <class _Up>
+  void operator/=(_Up&&);
+  template <class _Up>
+  void operator%=(_Up&&);
+  template <class _Up>
+  void operator&=(_Up&&);
+  template <class _Up>
+  void operator|=(_Up&&);
+  template <class _Up>
+  void operator^=(_Up&&);
+  template <class _Up>
+  void operator<<=(_Up&&);
+  template <class _Up>
+  void operator>>=(_Up&&);
+  void operator++();  // increment/decrement also return void
+  void operator++(int);
+  void operator--();
+  void operator--(int);
+  template <class _Up, class _Flags>
+  void copy_from(const _Up*, _Flags);
+};
+
+template <class _Simd>  // plain array of _Simd::size() elements, aligned like _Simd
+class alignas(_Simd) simd_data {
+public:
+  using value_type = typename _Simd::value_type;
+  simd_data() = default;  // __data_ has no member initialiser
+  simd_data(const _Simd& s);  // declared only; no definition in this part of the file
+  simd_data(value_type v) { std::fill(std::begin(__data_), std::end(__data_), v); }  // every element = v
+  template <typename... Ts>
+  simd_data(Ts... args) : __data_{args...} {}  // element-wise list-initialisation of the array
+  value_type operator[](size_t idx) const noexcept { return __data_[idx]; }  // no bounds check
+  value_type& operator[](size_t idx) noexcept { return __data_[idx]; }
+  const value_type* data() const noexcept { return __data_; }  // pointer to the first element
+  value_type* data() noexcept { return __data_; }
+  auto begin() { return std::begin(__data_); }  // iteration over the raw array
+  auto end() { return std::end(__data_); }
+  auto begin() const { return std::begin(__data_); }
+  auto end() const { return std::end(__data_); }
+private:
+  value_type __data_[_Simd::size()];
+};
+
+// [simd.class] Data-parallel vector of _Tp; element storage and primitives come from __simd_storage<_Tp, _Abi>.
+template <class _Tp, class _Abi>
+class simd {
+  template <class _Up, class _UAbi>
+  friend class simd;  // lets the converting constructor read another specialisation's __s_
+public:
+  using value_type = _Tp;
+  using reference = __simd_reference<_Tp, _Tp, _Abi>;
+  using mask_type = simd_mask<_Tp, _Abi>;
+  using abi_type = _Abi;
+
+  simd() = default;
+  simd(const simd&) = default;
+  simd& operator=(const simd&) = default;
+
+  static constexpr size_t size() noexcept {  // element count, taken from the ABI tag
+    return simd_size<_Tp, _Abi>::value;
+  }
+
+private:
+  __simd_storage<_Tp, _Abi> __s_;  // must supply storage_type, __broadcast, __set4, __copy_from/__copy_to, __dot2..4, __shuffle, __native
+
+  template <class _Up>
+  static constexpr bool __can_broadcast() {  // only referenced by __can_generate and disabled code below
+    return (std::is_arithmetic<_Up>::value &&
+            __is_non_narrowing_arithmetic_convertible<_Up, _Tp>()) ||
+           (!std::is_arithmetic<_Up>::value &&
+            std::is_convertible<_Up, _Tp>::value) ||
+           std::is_same<typename std::remove_const<_Up>::type, int>::value ||
+           (std::is_same<typename std::remove_const<_Up>::type,
+                         unsigned int>::value &&
+            std::is_unsigned<_Tp>::value);
+  }
+
+  template <class _Generator, size_t... __indicies>
+  static constexpr decltype(
+      std::forward_as_tuple(std::declval<_Generator>()(
+          std::integral_constant<size_t, __indicies>())...),
+      bool())
+  __can_generate(std::index_sequence<__indicies...>) {  // viable only if __g(integral_constant<i>) is callable for every i
+    return !__variadic_sum<bool>(
+        !__can_broadcast<decltype(std::declval<_Generator>()(
+            std::integral_constant<size_t, __indicies>()))>()...);
+  }
+
+  template <class _Generator>
+  static constexpr bool __can_generate(...) {  // fallback; constexpr because it is called in a template argument
+    return false;
+  }
+
+  template <class _Generator, size_t... __indicies>
+  void __generator_init(_Generator&& __g, std::index_sequence<__indicies...>) {
+    int __not_used[]{((*this)[__indicies] =
+                          __g(std::integral_constant<size_t, __indicies>()),
+                      0)...};  // braced list: elements are assigned in index order
+    (void)__not_used;
+  }
+
+public:
+  simd(const typename __simd_storage<_Tp, _Abi>::storage_type& s) : __s_(s) {}  // wrap a raw storage value
+
+#if 0
+// implicit type conversion constructor
+  template <class _Up,
+            class = typename std::enable_if<
+                std::is_same<_Abi, simd_abi::fixed_size<size()>>::value &&
+                __is_non_narrowing_arithmetic_convertible<_Up, _Tp>()>::type>
+  simd(const simd<_Up, simd_abi::fixed_size<size()>>& __v) {
+    for (size_t __i = 0; __i < size(); __i++) {
+      (*this)[__i] = static_cast<_Tp>(__v[__i]);
+    }
+  }
+#endif
+  // implicit type conversion constructor: participates only if the storages are constructible
+  template <class _Up, class _UAbi,
+    class = typename std::enable_if<std::is_constructible<
+      __simd_storage<_Tp, _Abi>, __simd_storage<_Up, _UAbi>>::value>::type>
+  simd(const simd<_Up, _UAbi>& __v) : __s_(__v.__s_) {}
+
+#if 0
+  template <class _Up, class _UAbi,
+            class = typename std::enable_if_t<
+                std::is_constructible_v<__simd_storage<_Tp, _Abi>,
+                                        __simd_storage<_Up, _UAbi>>>>
+  simd(const simd<_Up, _UAbi>& __v) : __s_(__v.__s_) {}
+#endif
+  // implicit broadcast constructor
+#if 0
+  template <class _Up,
+            class = typename std::enable_if<__can_broadcast<_Up>()>::type>
+  simd(_Up&& __rv) {
+    auto __v = static_cast<_Tp>(__rv);
+    for (size_t __i = 0; __i < size(); __i++) {
+      (*this)[__i] = __v;
+    }
+    __s_.__broadcast(v);
+  }
+#endif
+  simd(_Tp __rv) {  // broadcast is delegated to the storage
+    __s_.__broadcast(__rv);
+  }
+
+  simd(_Tp a, _Tp b, _Tp c = {}, _Tp d = {}) {  // up to four components; omitted ones are value-initialised
+    __s_.__set4(a, b, c, d);
+  }
+
+  // generator constructor: element i is initialised from __g(integral_constant<size_t, i>)
+  template <class _Generator,
+            int = typename std::enable_if<
+                __can_generate<_Generator>(std::make_index_sequence<size()>()),
+                int>::type()>
+  explicit simd(_Generator&& __g) {
+    __generator_init(std::forward<_Generator>(__g),
+                     std::make_index_sequence<size()>());
+  }
+
+  // load constructor: reads size() elements from __buffer, converting each to _Tp
+  template <
+      class _Up, class _Flags,
+      class = typename std::enable_if<__vectorizable<_Up>()>::type,
+      class = typename std::enable_if<is_simd_flag_type<_Flags>::value>::type>
+  simd(const _Up* __buffer, _Flags) {
+    // TODO: optimize for overaligned flags
+    for (size_t __i = 0; __i < size(); __i++) {
+      (*this)[__i] = static_cast<_Tp>(__buffer[__i]);
+    }
+  }
+
+#if 0
+  // loads [simd.load]
+  template <class _Up, class _Flags>
+  typename std::enable_if<__vectorizable<_Up>() &&
+                          is_simd_flag_type<_Flags>::value>::type
+  copy_from(const _Up* __buffer, _Flags) {
+    *this = simd(__buffer, _Flags());
+  }
+#endif
+  // loads [simd.load] -- replaces the pointer/flags form disabled above
+  void copy_from(const simd_data<simd>& __buffer) {
+    __s_.__copy_from(__buffer);
+  }
+
+#if 0
+  // stores [simd.store]
+  template <class _Up, class _Flags>
+  typename std::enable_if<__vectorizable<_Up>() &&
+                          is_simd_flag_type<_Flags>::value>::type
+  copy_to(_Up* __buffer, _Flags) const {
+    // TODO: optimize for overaligned flags
+    for (size_t __i = 0; __i < size(); __i++) {
+      __buffer[__i] = static_cast<_Up>((*this)[__i]);
+    }
+  }
+#endif
+  // stores [simd.store] -- replaces the pointer/flags form disabled above
+  void copy_to(simd_data<simd>& __buffer) const {
+    __s_.__copy_to(__buffer);
+  }
+
+  // scalar access [simd.subscr]
+  reference operator[](size_t __i) { return reference(&__s_, __i); }  // writable proxy
+
+  value_type operator[](size_t __i) const { return __s_.__get(__i); }  // by-value read
+
+  // unary operators [simd.unary] -- declared here; definitions are not in this part of the file
+  simd& operator++();
+  simd operator++(int);
+  simd& operator--();
+  simd operator--(int);
+  mask_type operator!() const;
+  simd operator~() const;
+  simd operator+() const;
+  simd operator-() const;
+
+  // binary operators [simd.binary] -- non-template friends, declared only
+  friend simd operator+(const simd&, const simd&);
+  friend simd operator-(const simd&, const simd&);
+  friend simd operator*(const simd&, const simd&);
+  friend simd operator/(const simd&, const simd&);
+  friend simd operator%(const simd&, const simd&);
+  friend simd operator&(const simd&, const simd&);
+  friend simd operator|(const simd&, const simd&);
+  friend simd operator^(const simd&, const simd&);
+  friend simd operator<<(const simd&, const simd&);
+  friend simd operator>>(const simd&, const simd&);
+  friend simd operator<<(const simd&, int);
+  friend simd operator>>(const simd&, int);
+
+  // compound assignment [simd.cassign]
+  friend simd& operator+=(simd&, const simd&);
+  friend simd& operator-=(simd&, const simd&);
+  friend simd& operator*=(simd&, const simd&);
+  friend simd& operator/=(simd&, const simd&);
+  friend simd& operator%=(simd&, const simd&);
+
+  friend simd& operator&=(simd&, const simd&);
+  friend simd& operator|=(simd&, const simd&);
+  friend simd& operator^=(simd&, const simd&);
+  friend simd& operator<<=(simd&, const simd&);
+  friend simd& operator>>=(simd&, const simd&);
+  friend simd& operator<<=(simd&, int);
+  friend simd& operator>>=(simd&, int);
+
+  // compares [simd.comparison] -- each yields a mask_type
+  friend mask_type operator==(const simd&, const simd&);
+  friend mask_type operator!=(const simd&, const simd&);
+  friend mask_type operator>=(const simd&, const simd&);
+  friend mask_type operator<=(const simd&, const simd&);
+  friend mask_type operator>(const simd&, const simd&);
+  friend mask_type operator<(const simd&, const simd&);
+
+  value_type dot2(const simd& other) const { return __s_.__dot2(other.__s_); }  // dot helpers forward to the storage
+  value_type dot3(const simd& other) const { return __s_.__dot3(other.__s_); }
+  value_type dot4(const simd& other) const { return __s_.__dot4(other.__s_); }
+
+  template<int x, int y, int z, int w>  // compile-time lane selectors, forwarded to the storage's __shuffle
+  simd shuffle() const {
+    simd s;
+    s.__s_ = __s_.template __shuffle<x, y, z, w>();
+    return s;
+  }
+
+  const typename __simd_storage<_Tp, _Abi>::storage_type& native() const { return __s_.__native(); }  // underlying storage value
+};
+
+// [simd.mask.class]
+template <class _Tp, class _Abi>
+class simd_mask { // per-lane boolean companion of simd<_Tp, _Abi>
+public:
+  using value_type = bool;
+  using reference = __simd_mask_reference<_Tp, _Abi>;
+  using simd_type = simd<_Tp, _Abi>;
+  using abi_type = _Abi;
+  static constexpr size_t size() noexcept { // lane count, taken from simd_size
+    return simd_size<_Tp, _Abi>::value;
+  }
+  simd_mask() = default;
+
+  // broadcast constructor (declared here only; takes a single bool)
+  explicit simd_mask(value_type) noexcept;
+
+  // implicit type conversion constructor, from a fixed_size mask with the same lane count
+  template <class _Up>
+  simd_mask(const simd_mask<_Up, simd_abi::fixed_size<size()>>&) noexcept;
+
+  // load constructor: reads from a bool array; the second argument is a flags tag type
+  template <class _Flags>
+  simd_mask(const value_type*, _Flags);
+
+private:
+  __simd_mask_storage<_Tp, _Abi> __s_; // ABI-specific lane storage
+
+public:
+  // loads [simd.mask.copy]
+  template <class _Flags>
+  void copy_from(const value_type*, _Flags);
+  template <class _Flags>
+  void copy_to(value_type*, _Flags) const;
+
+  // scalar access [simd.mask.subscr]
+  reference operator[](size_t __i) { return reference(&__s_, __i); } // proxy built from the storage and lane index
+  value_type operator[](size_t __i) const { return __s_.__get(__i); } // by-value read of one lane
+
+  // unary operators [simd.mask.unary]
+  simd_mask operator!() const noexcept;
+
+  // simd_mask binary operators [simd.mask.binary]
+  friend simd_mask operator&&(const simd_mask&, const simd_mask&) noexcept;
+  friend simd_mask operator||(const simd_mask&, const simd_mask&) noexcept;
+  friend simd_mask operator&(const simd_mask&, const simd_mask&)noexcept;
+  friend simd_mask operator|(const simd_mask&, const simd_mask&) noexcept;
+  friend simd_mask operator^(const simd_mask&, const simd_mask&) noexcept;
+
+  // simd_mask compound assignment [simd.mask.cassign]
+  friend simd_mask& operator&=(simd_mask&, const simd_mask&) noexcept;
+  friend simd_mask& operator|=(simd_mask&, const simd_mask&) noexcept;
+  friend simd_mask& operator^=(simd_mask&, const simd_mask&) noexcept;
+
+  // simd_mask compares [simd.mask.comparison]
+  friend simd_mask operator==(const simd_mask&, const simd_mask&) noexcept;
+  friend simd_mask operator!=(const simd_mask&, const simd_mask&) noexcept;
+
+  // compares [simd.comparison]: the simd comparisons are friends of the mask so their definitions can reach __s_
+  friend simd_mask operator==(const simd_type&, const simd_type&);
+  friend simd_mask operator!=(const simd_type&, const simd_type&);
+  friend simd_mask operator>=(const simd_type&, const simd_type&);
+  friend simd_mask operator<=(const simd_type&, const simd_type&);
+  friend simd_mask operator>(const simd_type&, const simd_type&);
+  friend simd_mask operator<(const simd_type&, const simd_type&);
+};
+
+template <class _Simd> // fill this element buffer by copying every lane out of `vec`
+inline simd_data<_Simd>::simd_data(const _Simd& vec) { vec.copy_to(*this); }
+
+template <class _Tp, int __num_element>
+class __simd_storage<_Tp, __simd_abi<_StorageKind::_Array, __num_element>> {
+public:
+  using storage_type = std::array<_Tp, __num_element>;
+private:
+  storage_type __storage_; // scalar fallback: one array slot per lane
+
+  template <class, class> friend struct simd;
+  template <class, class> friend struct simd_mask;
+
+public:
+  // Single-lane accessors; __index is not range-checked.
+  _Tp __get(size_t __index) const noexcept { return __storage_[__index]; }
+  void __set(size_t __index, _Tp __val) noexcept { __storage_[__index] = __val; }
+  // Values are taken as _Tp (not float) so double lanes are not narrowed. Size-limited
+  // members are templates so their enable_if is checked on use, not at class instantiation.
+  template <int _Np = __num_element>
+  std::enable_if_t<_Np >= 4> __set4(_Tp a, _Tp b, _Tp c, _Tp d) noexcept {
+    __storage_[0] = a;
+    __storage_[1] = b;
+    __storage_[2] = c;
+    __storage_[3] = d;
+  }
+  void __broadcast(_Tp __val) noexcept {
+    std::fill(__storage_.begin(), __storage_.end(), __val);
+  }
+  template <int _Np = __num_element>
+  std::enable_if_t<_Np >= 2, _Tp> __dot2(const __simd_storage& other) const noexcept {
+    return __storage_[0] * other.__storage_[0] + __storage_[1] * other.__storage_[1];
+  }
+  template <int _Np = __num_element>
+  std::enable_if_t<_Np >= 3, _Tp> __dot3(const __simd_storage& other) const noexcept {
+    return __storage_[0] * other.__storage_[0] +
+           __storage_[1] * other.__storage_[1] +
+           __storage_[2] * other.__storage_[2];
+  }
+  template <int _Np = __num_element>
+  std::enable_if_t<_Np >= 4, _Tp> __dot4(const __simd_storage& other) const noexcept {
+    return __storage_[0] * other.__storage_[0] +
+           __storage_[1] * other.__storage_[1] +
+           __storage_[2] * other.__storage_[2] +
+           __storage_[3] * other.__storage_[3];
+  }
+  template <int x, int y, int z, int w, int _Np = __num_element>
+  std::enable_if_t<_Np >= 4, __simd_storage> __shuffle() const noexcept {
+    __simd_storage s; // result lanes 0..3 take source lanes x, y, z, w
+    s.__storage_[0] = __storage_[x];
+    s.__storage_[1] = __storage_[y];
+    s.__storage_[2] = __storage_[z];
+    s.__storage_[3] = __storage_[w];
+    return s;
+  }
+
+  void __copy_from(const simd_data<simd<_Tp, __simd_abi<_StorageKind::_Array, __num_element>>>& __buffer) noexcept {
+    std::copy(__buffer.begin(), __buffer.end(), __storage_.begin());
+  }
+  void __copy_to(simd_data<simd<_Tp, __simd_abi<_StorageKind::_Array, __num_element>>>& __buffer) const noexcept {
+    std::copy(__storage_.begin(), __storage_.end(), __buffer.begin());
+  }
+
+  __simd_storage() = default;
+  // Element-wise converting copy; clamped so a wider source cannot write past the last lane.
+  template <class _Up, int __Unum_element>
+  explicit __simd_storage(const __simd_storage<_Up, __simd_abi<_StorageKind::_Array, __Unum_element>>& other) {
+    std::copy_n(other.__native().begin(), std::min(__Unum_element, __num_element), __storage_.begin());
+  }
+  const storage_type& __native() const { return __storage_; }
+};
+
+template <class _Tp, int __num_element>
+class __simd_mask_storage<_Tp, __simd_abi<_StorageKind::_Array, __num_element>> {
+  // One flag per lane for the scalar (std::array) fallback ABI.
+  std::bitset<__num_element> __storage_;
+public:
+  // Query lane __index through the range-checked bitset accessor.
+  bool __get(size_t __index) const noexcept { return __storage_.test(__index); }
+
+  // Store __val into lane __index.
+  void __set(size_t __index, bool __val) noexcept { __storage_.set(__index, __val); }
+};
+
+}
diff --git a/include/athena/simd/simd.hpp b/include/athena/simd/simd.hpp
new file mode 100644
index 0000000..7873d1f
--- /dev/null
+++ b/include/athena/simd/simd.hpp
@@ -0,0 +1,26 @@
+#pragma once
+#define _ATHENA_SIMD_INCLUDED
+namespace athena::_simd { using namespace std; }
+#include "parallelism_v2_simd.hpp"
+#if _M_IX86_FP >= 1 || _M_X64
+#define __SSE__ 1
+#endif
+#if __AVX__
+#include "simd_avx.hpp"
+#elif __SSE__
+#include "simd_sse.hpp"
+#else
+namespace athena::_simd::simd_abi { // scalar fallback; must sit beside the ABI tags so fixed_size resolves
+template<typename T> struct athena_native {};
+template<> struct athena_native<float> { using type = fixed_size<4>; };
+template<> struct athena_native<double> { using type = fixed_size<4>; };
+} // namespace athena::_simd::simd_abi (looked up below as _simd::simd_abi::athena_native)
+#endif
+namespace athena {
+// Vector of T using whichever ABI athena_native<T> names for this build.
+template<typename T>
+using simd = _simd::simd<T, typename _simd::simd_abi::athena_native<T>::type>;
+template<typename T> using simd_values = _simd::simd_data<simd<T>>; // element buffer matching simd<T>
+using simd_floats = simd_values<float>;
+using simd_doubles = simd_values<double>;
+} // namespace athena
diff --git a/include/athena/simd/simd_avx.hpp b/include/athena/simd/simd_avx.hpp
new file mode 100644
index 0000000..6bdc6b5
--- /dev/null
+++ b/include/athena/simd/simd_avx.hpp
@@ -0,0 +1,188 @@
+#pragma once
+#ifndef _ATHENA_SIMD_INCLUDED
+#error simd_avx.hpp must not be included directly. Include simd.hpp instead.
+#endif
+#include "simd_sse.hpp"
+#include <immintrin.h>
+namespace athena::_simd {
+// AVX storage: all four double lanes live in a single __m256d register.
+template <>
+class __simd_storage<double, m256d_abi> {
+public:
+  using storage_type = __m256d;
+  storage_type __storage_;
+  double __get(size_t __index) const noexcept { // spill to aligned scratch, then index
+    alignas(32) double lanes[4];
+    _mm256_store_pd(lanes, __storage_);
+    return lanes[__index];
+  }
+  void __set(size_t __index, double __val) noexcept { // spill, patch one lane, reload
+    alignas(32) double lanes[4];
+    _mm256_store_pd(lanes, __storage_);
+    lanes[__index] = __val;
+    __storage_ = _mm256_load_pd(lanes);
+  }
+  void __set4(double a, double b, double c, double d) noexcept {
+    __storage_ = _mm256_setr_pd(a, b, c, d); // setr: first argument lands in lane 0
+  }
+  void __broadcast(double __val) noexcept {
+    __storage_ = _mm256_set1_pd(__val);
+  }
+  double __dot2(const __simd_storage& other) const noexcept {
+    alignas(32) double prod[4];
+    _mm256_store_pd(prod, _mm256_mul_pd(__storage_, other.__storage_));
+    return prod[0] + prod[1];
+  }
+  double __dot3(const __simd_storage& other) const noexcept {
+    alignas(32) double prod[4];
+    _mm256_store_pd(prod, _mm256_mul_pd(__storage_, other.__storage_));
+    return prod[0] + prod[1] + prod[2];
+  }
+  double __dot4(const __simd_storage& other) const noexcept {
+    alignas(32) double prod[4];
+    _mm256_store_pd(prod, _mm256_mul_pd(__storage_, other.__storage_));
+    return prod[0] + prod[1] + prod[2] + prod[3];
+  }
+
+  void __copy_from(const simd_data<simd<double, m256d_abi>>& __src) noexcept {
+    __storage_ = _mm256_load_pd(__src.data());
+  }
+
+  void __copy_to(simd_data<simd<double, m256d_abi>>& __dst) const noexcept {
+    _mm256_store_pd(__dst.data(), __storage_);
+  }
+
+  __simd_storage() = default;
+  explicit __simd_storage(const __simd_storage<float, m128_abi>& other)
+    : __storage_(_mm256_cvtps_pd(other.__storage_)) { // widen four floats to four doubles
+  }
+
+  explicit __simd_storage(const storage_type& s) : __storage_(s) {}
+  const storage_type& __native() const { return __storage_; }
+};
+// AVX mask storage: reuses the double register; a lane is true when any of its 64 bits is set.
+template <>
+class __simd_mask_storage<double, m256d_abi> : public __simd_storage<double, m256d_abi> {
+public:
+  bool __get(size_t __index) const noexcept {
+    alignas(32) uint64_t bits[4];
+    _mm256_store_pd(reinterpret_cast<double*>(bits), __storage_);
+    return bits[__index] != 0;
+  }
+
+  void __set(size_t __index, bool __val) noexcept { // true is stored as all ones
+    alignas(32) uint64_t bits[4];
+    _mm256_store_pd(reinterpret_cast<double*>(bits), __storage_);
+    bits[__index] = __val ? UINT64_MAX : 0;
+    __storage_ = _mm256_load_pd(reinterpret_cast<double*>(bits));
+  }
+};
+
+// Negation: XOR with -0.0 toggles only the sign bit of each lane.
+template <> inline simd<double, m256d_abi> simd<double, m256d_abi>::operator-() const {
+  return _mm256_xor_pd(_mm256_set1_pd(-0.0), __s_.__storage_);
+}
+
+// Lane-wise sum of two AVX double vectors.
+inline simd<double, m256d_abi> operator+(const simd<double, m256d_abi>& lhs, const simd<double, m256d_abi>& rhs) {
+  simd<double, m256d_abi> sum;
+  sum.__s_.__storage_ = _mm256_add_pd(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return sum;
+}
+
+// Lane-wise difference (lhs - rhs).
+inline simd<double, m256d_abi> operator-(const simd<double, m256d_abi>& lhs, const simd<double, m256d_abi>& rhs) {
+  simd<double, m256d_abi> diff;
+  diff.__s_.__storage_ = _mm256_sub_pd(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return diff;
+}
+
+// Lane-wise product.
+inline simd<double, m256d_abi> operator*(const simd<double, m256d_abi>& lhs, const simd<double, m256d_abi>& rhs) {
+  simd<double, m256d_abi> prod;
+  prod.__s_.__storage_ = _mm256_mul_pd(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return prod;
+}
+
+// Lane-wise quotient (lhs / rhs).
+inline simd<double, m256d_abi> operator/(const simd<double, m256d_abi>& lhs, const simd<double, m256d_abi>& rhs) {
+  simd<double, m256d_abi> quot;
+  quot.__s_.__storage_ = _mm256_div_pd(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return quot;
+}
+
+// In-place lane-wise addition; returns the updated left operand.
+inline simd<double, m256d_abi>& operator+=(simd<double, m256d_abi>& lhs, const simd<double, m256d_abi>& rhs) {
+  lhs.__s_.__storage_ = _mm256_add_pd(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return lhs;
+}
+
+// In-place lane-wise subtraction; returns the updated left operand.
+inline simd<double, m256d_abi>& operator-=(simd<double, m256d_abi>& lhs, const simd<double, m256d_abi>& rhs) {
+  lhs.__s_.__storage_ = _mm256_sub_pd(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return lhs;
+}
+
+// In-place lane-wise multiplication; returns the updated left operand.
+inline simd<double, m256d_abi>& operator*=(simd<double, m256d_abi>& lhs, const simd<double, m256d_abi>& rhs) {
+  lhs.__s_.__storage_ = _mm256_mul_pd(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return lhs;
+}
+
+// In-place lane-wise division; returns the updated left operand.
+inline simd<double, m256d_abi>& operator/=(simd<double, m256d_abi>& lhs, const simd<double, m256d_abi>& rhs) {
+  lhs.__s_.__storage_ = _mm256_div_pd(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return lhs;
+}
+
+// Lane-wise equality using the ordered, quiet predicate: a lane holding NaN compares false.
+inline simd<double, m256d_abi>::mask_type operator==(const simd<double, m256d_abi>& lhs, const simd<double, m256d_abi>& rhs) {
+  simd<double, m256d_abi>::mask_type eq;
+  eq.__s_.__storage_ = _mm256_cmp_pd(lhs.__s_.__storage_, rhs.__s_.__storage_, _CMP_EQ_OQ);
+  return eq;
+}
+
+// Lane-wise inequality; unordered predicate so NaN lanes compare "not equal", like scalar != and _mm_cmpneq_pd.
+inline simd<double, m256d_abi>::mask_type operator!=(const simd<double, m256d_abi>& a, const simd<double, m256d_abi>& b) {
+  simd<double, m256d_abi>::mask_type ret;
+  ret.__s_.__storage_ = _mm256_cmp_pd(a.__s_.__storage_, b.__s_.__storage_, _CMP_NEQ_UQ); // _OQ would report false for NaN
+  return ret;
+}
+
+// Lane-wise lhs >= rhs (ordered, quiet: false when either lane is NaN).
+inline simd<double, m256d_abi>::mask_type operator>=(const simd<double, m256d_abi>& lhs, const simd<double, m256d_abi>& rhs) {
+  simd<double, m256d_abi>::mask_type ge;
+  ge.__s_.__storage_ = _mm256_cmp_pd(lhs.__s_.__storage_, rhs.__s_.__storage_, _CMP_GE_OQ);
+  return ge;
+}
+
+// Lane-wise lhs <= rhs (ordered, quiet: false when either lane is NaN).
+inline simd<double, m256d_abi>::mask_type operator<=(const simd<double, m256d_abi>& lhs, const simd<double, m256d_abi>& rhs) {
+  simd<double, m256d_abi>::mask_type le;
+  le.__s_.__storage_ = _mm256_cmp_pd(lhs.__s_.__storage_, rhs.__s_.__storage_, _CMP_LE_OQ);
+  return le;
+}
+
+// Lane-wise lhs > rhs (ordered, quiet: false when either lane is NaN).
+inline simd<double, m256d_abi>::mask_type operator>(const simd<double, m256d_abi>& lhs, const simd<double, m256d_abi>& rhs) {
+  simd<double, m256d_abi>::mask_type gt;
+  gt.__s_.__storage_ = _mm256_cmp_pd(lhs.__s_.__storage_, rhs.__s_.__storage_, _CMP_GT_OQ);
+  return gt;
+}
+
+// Lane-wise lhs < rhs (ordered, quiet: false when either lane is NaN).
+inline simd<double, m256d_abi>::mask_type operator<(const simd<double, m256d_abi>& lhs, const simd<double, m256d_abi>& rhs) {
+  simd<double, m256d_abi>::mask_type lt;
+  lt.__s_.__storage_ = _mm256_cmp_pd(lhs.__s_.__storage_, rhs.__s_.__storage_, _CMP_LT_OQ);
+  return lt;
+}
+
+inline __simd_storage<float, m128_abi>::__simd_storage(const __simd_storage<double, m256d_abi>& other)
+  : __storage_(_mm256_cvtpd_ps(other.__storage_)) { // narrow four doubles into one __m128
+}
+
+namespace simd_abi {
+template<> struct athena_native<double> { typedef m256d_abi type; }; // under AVX, double vectors use __m256d
+} // namespace simd_abi
+
+} // namespace athena::_simd
\ No newline at end of file
diff --git a/include/athena/simd/simd_sse.hpp b/include/athena/simd/simd_sse.hpp
new file mode 100644
index 0000000..8d59454
--- /dev/null
+++ b/include/athena/simd/simd_sse.hpp
@@ -0,0 +1,455 @@
+#pragma once
+#ifndef _ATHENA_SIMD_INCLUDED
+#error simd_sse.hpp must not be included directly. Include simd.hpp instead.
+#endif
+#include <xmmintrin.h>
+#if __SSE4_1__
+#include <smmintrin.h>
+#endif
+namespace athena::_simd {
+// ABI tag: four floats in one __m128
+using m128_abi = __simd_abi<static_cast<_StorageKind>(static_cast<int>(_StorageKind::_VecExt) + 1), 4>;
+// ABI tag: four doubles split across two __m128d
+using m128d_abi = __simd_abi<static_cast<_StorageKind>(static_cast<int>(_StorageKind::_VecExt) + 2), 4>;
+#ifdef __AVX__
+// ABI tag: four doubles in one __m256d
+using m256d_abi = __simd_abi<static_cast<_StorageKind>(static_cast<int>(_StorageKind::_VecExt) + 3), 4>;
+#endif
+
+// Forward declarations: the float storage below declares converting
+// constructors that take these double storages by reference.
+template <> class __simd_storage<double, m128d_abi>;
+#ifdef __AVX__
+template <> class __simd_storage<double, m256d_abi>;
+#endif
+
+// SSE storage: all four float lanes live in a single __m128 register.
+template <>
+class __simd_storage<float, m128_abi> {
+public:
+  using storage_type = __m128;
+  storage_type __storage_;
+  float __get(size_t __index) const noexcept { // spill to aligned scratch, then index
+    alignas(16) float lanes[4];
+    _mm_store_ps(lanes, __storage_);
+    return lanes[__index];
+  }
+  void __set(size_t __index, float __val) noexcept { // spill, patch one lane, reload
+    alignas(16) float lanes[4];
+    _mm_store_ps(lanes, __storage_);
+    lanes[__index] = __val;
+    __storage_ = _mm_load_ps(lanes);
+  }
+  void __set4(float a, float b, float c, float d) noexcept {
+    __storage_ = _mm_setr_ps(a, b, c, d); // setr: first argument lands in lane 0
+  }
+  void __broadcast(float __val) noexcept {
+    __storage_ = _mm_set1_ps(__val);
+  }
+  float __dot2(const __simd_storage& other) const noexcept {
+#if __SSE4_1__
+    // Immediate 0x3F: the high nibble multiplies lanes 0-1 only, the low nibble
+    // writes the sum to every result lane, so lane 0 holds the dot product.
+    return _mm_cvtss_f32(_mm_dp_ps(__storage_, other.__storage_, 0x3F));
+#else
+    alignas(16) float prod[4];
+    _mm_store_ps(prod, _mm_mul_ps(__storage_, other.__storage_));
+    return prod[0] + prod[1];
+#endif
+  }
+  float __dot3(const __simd_storage& other) const noexcept {
+#if __SSE4_1__
+    // Immediate 0x7F: lanes 0-2 take part in the multiply-add; the result
+    // is again replicated, and read back from lane 0.
+    return _mm_cvtss_f32(_mm_dp_ps(__storage_, other.__storage_, 0x7F));
+#else
+    alignas(16) float prod[4];
+    _mm_store_ps(prod, _mm_mul_ps(__storage_, other.__storage_));
+    return prod[0] + prod[1] + prod[2];
+#endif
+  }
+  float __dot4(const __simd_storage& other) const noexcept {
+#if __SSE4_1__
+    // Immediate 0xFF: all four lanes take part in the multiply-add; the
+    // result is read back from lane 0.
+    return _mm_cvtss_f32(_mm_dp_ps(__storage_, other.__storage_, 0xFF));
+#else
+    alignas(16) float prod[4];
+    _mm_store_ps(prod, _mm_mul_ps(__storage_, other.__storage_));
+    return prod[0] + prod[1] + prod[2] + prod[3];
+#endif
+  }
+  template <int x, int y, int z, int w>
+  __simd_storage __shuffle() const noexcept {
+    // _MM_SHUFFLE lists its selectors from lane 3 down to lane 0, hence the
+    // reversed argument order: result lanes 0..3 take source lanes x, y, z, w.
+    return __simd_storage(_mm_shuffle_ps(__storage_, __storage_, _MM_SHUFFLE(w, z, y, x)));
+  }
+
+  void __copy_from(const simd_data<simd<float, m128_abi>>& __src) noexcept {
+    __storage_ = _mm_load_ps(__src.data());
+  }
+
+  void __copy_to(simd_data<simd<float, m128_abi>>& __dst) const noexcept {
+    _mm_store_ps(__dst.data(), __storage_);
+  }
+
+  __simd_storage() = default;
+  explicit __simd_storage(const __simd_storage<double, m128d_abi>& other); // defined out of line
+#ifdef __AVX__
+  explicit __simd_storage(const __simd_storage<double, m256d_abi>& other); // defined out of line
+#endif
+
+  explicit __simd_storage(const storage_type& s) : __storage_(s) {}
+  const storage_type& __native() const { return __storage_; }
+};
+// SSE mask storage: reuses the float register; a lane is true when any of its 32 bits is set.
+template <>
+class __simd_mask_storage<float, m128_abi> : public __simd_storage<float, m128_abi> {
+public:
+  // Inspect the raw bit pattern of one lane.
+  bool __get(size_t __index) const noexcept {
+    alignas(16) uint32_t bits[4];
+    _mm_store_ps(reinterpret_cast<float*>(bits), __storage_);
+    return bits[__index] != 0;
+  }
+  void __set(size_t __index, bool __val) noexcept { // true is stored as all ones
+    alignas(16) uint32_t bits[4];
+    _mm_store_ps(reinterpret_cast<float*>(bits), __storage_);
+    bits[__index] = __val ? UINT32_MAX : 0;
+    __storage_ = _mm_load_ps(reinterpret_cast<float*>(bits));
+  }
+};
+
+// Negation: XOR with -0.f toggles only the sign bit of each lane.
+template <> inline simd<float, m128_abi> simd<float, m128_abi>::operator-() const {
+  return _mm_xor_ps(_mm_set1_ps(-0.f), __s_.__storage_);
+}
+
+// Lane-wise sum of two SSE float vectors.
+inline simd<float, m128_abi> operator+(const simd<float, m128_abi>& lhs, const simd<float, m128_abi>& rhs) {
+  simd<float, m128_abi> sum;
+  sum.__s_.__storage_ = _mm_add_ps(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return sum;
+}
+
+// Lane-wise difference (lhs - rhs).
+inline simd<float, m128_abi> operator-(const simd<float, m128_abi>& lhs, const simd<float, m128_abi>& rhs) {
+  simd<float, m128_abi> diff;
+  diff.__s_.__storage_ = _mm_sub_ps(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return diff;
+}
+
+// Lane-wise product.
+inline simd<float, m128_abi> operator*(const simd<float, m128_abi>& lhs, const simd<float, m128_abi>& rhs) {
+  simd<float, m128_abi> prod;
+  prod.__s_.__storage_ = _mm_mul_ps(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return prod;
+}
+
+// Lane-wise quotient (lhs / rhs).
+inline simd<float, m128_abi> operator/(const simd<float, m128_abi>& lhs, const simd<float, m128_abi>& rhs) {
+  simd<float, m128_abi> quot;
+  quot.__s_.__storage_ = _mm_div_ps(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return quot;
+}
+
+// In-place lane-wise addition; returns the updated left operand.
+inline simd<float, m128_abi>& operator+=(simd<float, m128_abi>& lhs, const simd<float, m128_abi>& rhs) {
+  lhs.__s_.__storage_ = _mm_add_ps(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return lhs;
+}
+
+// In-place lane-wise subtraction; returns the updated left operand.
+inline simd<float, m128_abi>& operator-=(simd<float, m128_abi>& lhs, const simd<float, m128_abi>& rhs) {
+  lhs.__s_.__storage_ = _mm_sub_ps(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return lhs;
+}
+
+// In-place lane-wise multiplication; returns the updated left operand.
+inline simd<float, m128_abi>& operator*=(simd<float, m128_abi>& lhs, const simd<float, m128_abi>& rhs) {
+  lhs.__s_.__storage_ = _mm_mul_ps(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return lhs;
+}
+
+// In-place lane-wise division; returns the updated left operand.
+inline simd<float, m128_abi>& operator/=(simd<float, m128_abi>& lhs, const simd<float, m128_abi>& rhs) {
+  lhs.__s_.__storage_ = _mm_div_ps(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return lhs;
+}
+
+// Lane-wise lhs == rhs; the compare result is stored directly as the mask register.
+inline simd<float, m128_abi>::mask_type operator==(const simd<float, m128_abi>& lhs, const simd<float, m128_abi>& rhs) {
+  simd<float, m128_abi>::mask_type eq;
+  eq.__s_.__storage_ = _mm_cmpeq_ps(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return eq;
+}
+
+// Lane-wise lhs != rhs.
+inline simd<float, m128_abi>::mask_type operator!=(const simd<float, m128_abi>& lhs, const simd<float, m128_abi>& rhs) {
+  simd<float, m128_abi>::mask_type ne;
+  ne.__s_.__storage_ = _mm_cmpneq_ps(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return ne;
+}
+
+// Lane-wise lhs >= rhs.
+inline simd<float, m128_abi>::mask_type operator>=(const simd<float, m128_abi>& lhs, const simd<float, m128_abi>& rhs) {
+  simd<float, m128_abi>::mask_type ge;
+  ge.__s_.__storage_ = _mm_cmpge_ps(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return ge;
+}
+
+// Lane-wise lhs <= rhs.
+inline simd<float, m128_abi>::mask_type operator<=(const simd<float, m128_abi>& lhs, const simd<float, m128_abi>& rhs) {
+  simd<float, m128_abi>::mask_type le;
+  le.__s_.__storage_ = _mm_cmple_ps(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return le;
+}
+
+// Lane-wise lhs > rhs.
+inline simd<float, m128_abi>::mask_type operator>(const simd<float, m128_abi>& lhs, const simd<float, m128_abi>& rhs) {
+  simd<float, m128_abi>::mask_type gt;
+  gt.__s_.__storage_ = _mm_cmpgt_ps(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return gt;
+}
+
+// Lane-wise lhs < rhs.
+inline simd<float, m128_abi>::mask_type operator<(const simd<float, m128_abi>& lhs, const simd<float, m128_abi>& rhs) {
+  simd<float, m128_abi>::mask_type lt;
+  lt.__s_.__storage_ = _mm_cmplt_ps(lhs.__s_.__storage_, rhs.__s_.__storage_);
+  return lt;
+}
+
+// SSE2 storage for four doubles: two __m128d halves (lanes 0-1 and lanes 2-3).
+template <>
+class __simd_storage<double, m128d_abi> {
+public:
+  using storage_type = std::array<__m128d, 2>;
+  storage_type __storage_;
+  double __get(size_t __index) const noexcept { // spill the half holding the lane, then index
+    alignas(16) double pair[2];
+    _mm_store_pd(pair, __storage_[__index / 2]);
+    return pair[__index % 2];
+  }
+  void __set(size_t __index, double __val) noexcept { // spill one half, patch, reload
+    alignas(16) double pair[2];
+    _mm_store_pd(pair, __storage_[__index / 2]);
+    pair[__index % 2] = __val;
+    __storage_[__index / 2] = _mm_load_pd(pair);
+  }
+  void __set4(double a, double b, double c, double d) noexcept {
+    __storage_[0] = _mm_setr_pd(a, b); // setr: first argument lands in the low lane
+    __storage_[1] = _mm_setr_pd(c, d);
+  }
+  void __broadcast(double __val) noexcept {
+    const __m128d both = _mm_set1_pd(__val);
+    __storage_[0] = __storage_[1] = both;
+  }
+  double __dot2(const __simd_storage& other) const noexcept {
+#if __SSE4_1__
+    // Lanes 0-1 both live in the first half, so one dot-product instruction
+    // covers them; the scalar result is read from the low lane.
+    return _mm_cvtsd_f64(_mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F));
+#else
+    alignas(16) double prod[2];
+    _mm_store_pd(prod, _mm_mul_pd(__storage_[0], other.__storage_[0]));
+    return prod[0] + prod[1];
+#endif
+  }
+  double __dot3(const __simd_storage& other) const noexcept {
+#if __SSE4_1__
+    // First half through the dot-product instruction; only the low lane of
+    // the second half (element 2) takes part in the sum.
+    const double xy = _mm_cvtsd_f64(_mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F));
+    const double z = _mm_cvtsd_f64(_mm_mul_pd(__storage_[1], other.__storage_[1]));
+    return xy + z;
+#else
+    alignas(16) double lo[2];
+    alignas(16) double hi[2];
+    _mm_store_pd(lo, _mm_mul_pd(__storage_[0], other.__storage_[0]));
+    _mm_store_pd(hi, _mm_mul_pd(__storage_[1], other.__storage_[1]));
+    return lo[0] + lo[1] + hi[0];
+#endif
+  }
+  double __dot4(const __simd_storage& other) const noexcept {
+#if __SSE4_1__
+    // One dot-product instruction per half; the two partial sums are then
+    // added as scalars.
+    const double lo = _mm_cvtsd_f64(_mm_dp_pd(__storage_[0], other.__storage_[0], 0x3F));
+    const double hi = _mm_cvtsd_f64(_mm_dp_pd(__storage_[1], other.__storage_[1], 0x3F));
+    return lo + hi;
+#else
+    alignas(16) double lo[2];
+    alignas(16) double hi[2];
+    _mm_store_pd(lo, _mm_mul_pd(__storage_[0], other.__storage_[0]));
+    _mm_store_pd(hi, _mm_mul_pd(__storage_[1], other.__storage_[1]));
+    return lo[0] + lo[1] + hi[0] + hi[1];
+#endif
+  }
+
+  void __copy_from(const simd_data<simd<double, m128d_abi>>& __src) noexcept {
+    const double* const elems = __src.data();
+    __storage_[0] = _mm_load_pd(elems);
+    __storage_[1] = _mm_load_pd(elems + 2);
+  }
+
+  void __copy_to(simd_data<simd<double, m128d_abi>>& __dst) const noexcept {
+    double* const elems = __dst.data();
+    _mm_store_pd(elems, __storage_[0]);
+    _mm_store_pd(elems + 2, __storage_[1]);
+  }
+
+  __simd_storage() = default;
+  explicit __simd_storage(const __simd_storage<float, m128_abi>& other) {
+    const __m128 upper = _mm_movehl_ps(other.__storage_, other.__storage_); // floats 2-3 moved into the low lanes
+    __storage_[0] = _mm_cvtps_pd(other.__storage_); __storage_[1] = _mm_cvtps_pd(upper); // widen each pair
+  }
+
+  explicit __simd_storage(const storage_type& s) : __storage_(s) {}
+  const storage_type& __native() const { return __storage_; }
+};
+// SSE2 mask storage for four doubles: two __m128d halves; a lane is true when any of its 64 bits is set.
+template <>
+class __simd_mask_storage<double, m128d_abi> : public __simd_storage<double, m128d_abi> {
+public:
+  bool __get(size_t __index) const noexcept {
+    alignas(16) uint64_t sse_data[2];
+    _mm_store_pd(reinterpret_cast<double*>(sse_data), __storage_[__index / 2]);
+    // Only the half holding the lane is spilled, so index within that half (as __set does);
+    // using the full lane index here would read past the two-element scratch array.
+    return sse_data[__index % 2] != 0;
+  }
+  void __set(size_t __index, bool __val) noexcept { // true is stored as all ones
+    alignas(16) uint64_t sse_data[2];
+    _mm_store_pd(reinterpret_cast<double*>(sse_data), __storage_[__index / 2]);
+    sse_data[__index % 2] = __val ? UINT64_MAX : 0;
+    __storage_[__index / 2] = _mm_load_pd(reinterpret_cast<double*>(sse_data)); }
+};
+
+template <> inline simd<double, m128d_abi> simd<double, m128d_abi>::operator-() const {
+  const __m128d sign_bit = _mm_set1_pd(-0.0); // XOR with -0.0 toggles only the sign bit
+  simd<double, m128d_abi> negated;
+  negated.__s_.__storage_[0] = _mm_xor_pd(__s_.__storage_[0], sign_bit);
+  negated.__s_.__storage_[1] = _mm_xor_pd(__s_.__storage_[1], sign_bit);
+  return negated;
+}
+
+// Lane-wise sum, applied to each __m128d half in turn.
+inline simd<double, m128d_abi> operator+(const simd<double, m128d_abi>& lhs, const simd<double, m128d_abi>& rhs) {
+  simd<double, m128d_abi> sum;
+  sum.__s_.__storage_[0] = _mm_add_pd(lhs.__s_.__storage_[0], rhs.__s_.__storage_[0]);
+  sum.__s_.__storage_[1] = _mm_add_pd(lhs.__s_.__storage_[1], rhs.__s_.__storage_[1]);
+  return sum;
+}
+
+// Lane-wise difference (lhs - rhs), applied to each half in turn.
+inline simd<double, m128d_abi> operator-(const simd<double, m128d_abi>& lhs, const simd<double, m128d_abi>& rhs) {
+  simd<double, m128d_abi> diff;
+  diff.__s_.__storage_[0] = _mm_sub_pd(lhs.__s_.__storage_[0], rhs.__s_.__storage_[0]);
+  diff.__s_.__storage_[1] = _mm_sub_pd(lhs.__s_.__storage_[1], rhs.__s_.__storage_[1]);
+  return diff;
+}
+
+// Lane-wise product, applied to each half in turn.
+inline simd<double, m128d_abi> operator*(const simd<double, m128d_abi>& lhs, const simd<double, m128d_abi>& rhs) {
+  simd<double, m128d_abi> prod;
+  prod.__s_.__storage_[0] = _mm_mul_pd(lhs.__s_.__storage_[0], rhs.__s_.__storage_[0]);
+  prod.__s_.__storage_[1] = _mm_mul_pd(lhs.__s_.__storage_[1], rhs.__s_.__storage_[1]);
+  return prod;
+}
+
+// Lane-wise quotient (lhs / rhs), applied to each half in turn.
+inline simd<double, m128d_abi> operator/(const simd<double, m128d_abi>& lhs, const simd<double, m128d_abi>& rhs) {
+  simd<double, m128d_abi> quot;
+  quot.__s_.__storage_[0] = _mm_div_pd(lhs.__s_.__storage_[0], rhs.__s_.__storage_[0]);
+  quot.__s_.__storage_[1] = _mm_div_pd(lhs.__s_.__storage_[1], rhs.__s_.__storage_[1]);
+  return quot;
+}
+
+// In-place lane-wise addition over both halves; returns the updated left operand.
+inline simd<double, m128d_abi>& operator+=(simd<double, m128d_abi>& lhs, const simd<double, m128d_abi>& rhs) {
+  lhs.__s_.__storage_[0] = _mm_add_pd(lhs.__s_.__storage_[0], rhs.__s_.__storage_[0]);
+  lhs.__s_.__storage_[1] = _mm_add_pd(lhs.__s_.__storage_[1], rhs.__s_.__storage_[1]);
+  return lhs;
+}
+
+// In-place lane-wise subtraction over both halves; returns the updated left operand.
+inline simd<double, m128d_abi>& operator-=(simd<double, m128d_abi>& lhs, const simd<double, m128d_abi>& rhs) {
+  lhs.__s_.__storage_[0] = _mm_sub_pd(lhs.__s_.__storage_[0], rhs.__s_.__storage_[0]);
+  lhs.__s_.__storage_[1] = _mm_sub_pd(lhs.__s_.__storage_[1], rhs.__s_.__storage_[1]);
+  return lhs;
+}
+
+// In-place lane-wise multiplication over both halves; returns the updated left operand.
+inline simd<double, m128d_abi>& operator*=(simd<double, m128d_abi>& lhs, const simd<double, m128d_abi>& rhs) {
+  lhs.__s_.__storage_[0] = _mm_mul_pd(lhs.__s_.__storage_[0], rhs.__s_.__storage_[0]);
+  lhs.__s_.__storage_[1] = _mm_mul_pd(lhs.__s_.__storage_[1], rhs.__s_.__storage_[1]);
+  return lhs;
+}
+
+// In-place lane-wise division over both halves; returns the updated left operand.
+inline simd<double, m128d_abi>& operator/=(simd<double, m128d_abi>& lhs, const simd<double, m128d_abi>& rhs) {
+  lhs.__s_.__storage_[0] = _mm_div_pd(lhs.__s_.__storage_[0], rhs.__s_.__storage_[0]);
+  lhs.__s_.__storage_[1] = _mm_div_pd(lhs.__s_.__storage_[1], rhs.__s_.__storage_[1]);
+  return lhs;
+}
+
+// Lane-wise lhs == rhs; each half's compare result is stored directly as that half of the mask.
+inline simd<double, m128d_abi>::mask_type operator==(const simd<double, m128d_abi>& lhs, const simd<double, m128d_abi>& rhs) {
+  simd<double, m128d_abi>::mask_type eq;
+  eq.__s_.__storage_[0] = _mm_cmpeq_pd(lhs.__s_.__storage_[0], rhs.__s_.__storage_[0]);
+  eq.__s_.__storage_[1] = _mm_cmpeq_pd(lhs.__s_.__storage_[1], rhs.__s_.__storage_[1]);
+  return eq;
+}
+
+// Lane-wise lhs != rhs over both halves.
+inline simd<double, m128d_abi>::mask_type operator!=(const simd<double, m128d_abi>& lhs, const simd<double, m128d_abi>& rhs) {
+  simd<double, m128d_abi>::mask_type ne;
+  ne.__s_.__storage_[0] = _mm_cmpneq_pd(lhs.__s_.__storage_[0], rhs.__s_.__storage_[0]);
+  ne.__s_.__storage_[1] = _mm_cmpneq_pd(lhs.__s_.__storage_[1], rhs.__s_.__storage_[1]);
+  return ne;
+}
+
+// Lane-wise lhs >= rhs over both halves.
+inline simd<double, m128d_abi>::mask_type operator>=(const simd<double, m128d_abi>& lhs, const simd<double, m128d_abi>& rhs) {
+  simd<double, m128d_abi>::mask_type ge;
+  ge.__s_.__storage_[0] = _mm_cmpge_pd(lhs.__s_.__storage_[0], rhs.__s_.__storage_[0]);
+  ge.__s_.__storage_[1] = _mm_cmpge_pd(lhs.__s_.__storage_[1], rhs.__s_.__storage_[1]);
+  return ge;
+}
+
+// Lane-wise lhs <= rhs over both halves.
+inline simd<double, m128d_abi>::mask_type operator<=(const simd<double, m128d_abi>& lhs, const simd<double, m128d_abi>& rhs) {
+  simd<double, m128d_abi>::mask_type le;
+  le.__s_.__storage_[0] = _mm_cmple_pd(lhs.__s_.__storage_[0], rhs.__s_.__storage_[0]);
+  le.__s_.__storage_[1] = _mm_cmple_pd(lhs.__s_.__storage_[1], rhs.__s_.__storage_[1]);
+  return le;
+}
+
+// Lane-wise lhs > rhs over both halves.
+inline simd<double, m128d_abi>::mask_type operator>(const simd<double, m128d_abi>& lhs, const simd<double, m128d_abi>& rhs) {
+  simd<double, m128d_abi>::mask_type gt;
+  gt.__s_.__storage_[0] = _mm_cmpgt_pd(lhs.__s_.__storage_[0], rhs.__s_.__storage_[0]);
+  gt.__s_.__storage_[1] = _mm_cmpgt_pd(lhs.__s_.__storage_[1], rhs.__s_.__storage_[1]);
+  return gt;
+}
+
+// Lane-wise lhs < rhs over both halves.
+inline simd<double, m128d_abi>::mask_type operator<(const simd<double, m128d_abi>& lhs, const simd<double, m128d_abi>& rhs) {
+  simd<double, m128d_abi>::mask_type lt;
+  lt.__s_.__storage_[0] = _mm_cmplt_pd(lhs.__s_.__storage_[0], rhs.__s_.__storage_[0]);
+  lt.__s_.__storage_[1] = _mm_cmplt_pd(lhs.__s_.__storage_[1], rhs.__s_.__storage_[1]);
+  return lt;
+}
+
+inline __simd_storage<float, m128_abi>::__simd_storage(const __simd_storage<double, m128d_abi>& other)
+  // each half narrows to two floats in the low lanes; movelh packs both pairs into one register
+  : __storage_(_mm_movelh_ps(_mm_cvtpd_ps(other.__storage_[0]), _mm_cvtpd_ps(other.__storage_[1]))) {}
+
+namespace simd_abi {
+template<typename T> struct athena_native {}; // no ::type unless specialized
+template<> struct athena_native<float> { typedef m128_abi type; };
+#ifndef __AVX__
+template<> struct athena_native<double> { typedef m128d_abi type; }; // skipped under AVX, where simd_avx.hpp specializes it
+#endif
+} // namespace simd_abi
+
+} // namespace athena::_simd
diff --git a/src/athena/DNAYaml.cpp b/src/athena/DNAYaml.cpp
index 826b389..7c75881 100644
--- a/src/athena/DNAYaml.cpp
+++ b/src/athena/DNAYaml.cpp
@@ -184,8 +184,12 @@ std::unique_ptr<YAMLNode> ValToNode(double val)
 template <typename RETURNTYPE>
 RETURNTYPE NodeToVec(const YAMLNode* node)
 {
+    constexpr bool isDouble = std::is_same<RETURNTYPE, atVec2d>::value ||
+                              std::is_same<RETURNTYPE, atVec3d>::value ||
+                              std::is_same<RETURNTYPE, atVec4d>::value;
     RETURNTYPE retval = {};
     auto it = node->m_seqChildren.begin();
+    simd_values<std::conditional_t<isDouble, double, float>> f;
     for (size_t i=0;
          i<4 && it != node->m_seqChildren.end();
          ++i, ++it)
@@ -193,16 +197,15 @@ RETURNTYPE NodeToVec(const YAMLNode* node)
         YAMLNode* snode = it->get();
         if (snode->m_type == YAML_SCALAR_NODE)
         {
-            if (std::is_same<RETURNTYPE, atVec2d>::value ||
-                std::is_same<RETURNTYPE, atVec3d>::value ||
-                std::is_same<RETURNTYPE, atVec4d>::value)
-                retval.vec[i] = NodeToVal<double>(snode);
+            if (isDouble)
+                f[i] = NodeToVal<double>(snode);
             else
-                retval.vec[i] = NodeToVal<float>(snode);
+                f[i] = NodeToVal<float>(snode);
         }
         else
-            retval.vec[i] = 0.0;
+            f[i] = 0.0;
     }
+    retval.simd.copy_from(f);
     return retval;
 }
 
@@ -216,10 +219,11 @@ std::unique_ptr<YAMLNode> ValToNode(const atVec2f& val)
 {
     YAMLNode* ret = new YAMLNode(YAML_SEQUENCE_NODE);
     ret->m_seqChildren.reserve(2);
+    simd_floats f(val.simd);
     for (size_t i=0 ; i<2 ; ++i)
     {
         char str[64];
-        snprintf(str, 64, "%f", val.vec[i]);
+        snprintf(str, 64, "%f", f[i]);
         YAMLNode* comp = new YAMLNode(YAML_SCALAR_NODE);
         comp->m_scalarString = str;
         ret->m_seqChildren.emplace_back(comp);
@@ -237,10 +241,11 @@ std::unique_ptr<YAMLNode> ValToNode(const atVec3f& val)
 {
     YAMLNode* ret = new YAMLNode(YAML_SEQUENCE_NODE);
     ret->m_seqChildren.reserve(3);
+    simd_floats f(val.simd); /* built from val.simd so each component can be read as f[i] below */
     for (size_t i=0 ; i<3 ; ++i)
     {
         char str[64];
-        snprintf(str, 64, "%f", val.vec[i]);
+        snprintf(str, 64, "%f", f[i]);
         YAMLNode* comp = new YAMLNode(YAML_SCALAR_NODE);
         comp->m_scalarString = str;
         ret->m_seqChildren.emplace_back(comp);
@@ -258,10 +263,11 @@ std::unique_ptr<YAMLNode> ValToNode(const atVec4f& val)
 {
     YAMLNode* ret = new YAMLNode(YAML_SEQUENCE_NODE);
     ret->m_seqChildren.reserve(4);
+    simd_floats f(val.simd); /* built from val.simd so each component can be read as f[i] below */
     for (size_t i=0 ; i<4 ; ++i)
     {
         char str[64];
-        snprintf(str, 64, "%f", val.vec[i]);
+        snprintf(str, 64, "%f", f[i]);
         YAMLNode* comp = new YAMLNode(YAML_SCALAR_NODE);
         comp->m_scalarString = str;
         ret->m_seqChildren.emplace_back(comp);
@@ -279,10 +285,11 @@ std::unique_ptr<YAMLNode> ValToNode(const atVec2d& val)
 {
     YAMLNode* ret = new YAMLNode(YAML_SEQUENCE_NODE);
     ret->m_seqChildren.reserve(2);
+    simd_doubles f(val.simd); /* built from val.simd so each component can be read as f[i] below */
     for (size_t i=0 ; i<2 ; ++i)
     {
         char str[64];
-        snprintf(str, 64, "%f", val.vec[i]);
+        snprintf(str, 64, "%f", f[i]);
         YAMLNode* comp = new YAMLNode(YAML_SCALAR_NODE);
         comp->m_scalarString = str;
         ret->m_seqChildren.emplace_back(comp);
@@ -300,10 +307,11 @@ std::unique_ptr<YAMLNode> ValToNode(const atVec3d& val)
 {
     YAMLNode* ret = new YAMLNode(YAML_SEQUENCE_NODE);
     ret->m_seqChildren.reserve(3);
+    simd_doubles f(val.simd); /* built from val.simd so each component can be read as f[i] below */
     for (size_t i=0 ; i<3 ; ++i)
     {
         char str[64];
-        snprintf(str, 64, "%f", val.vec[i]);
+        snprintf(str, 64, "%f", f[i]);
         YAMLNode* comp = new YAMLNode(YAML_SCALAR_NODE);
         comp->m_scalarString = str;
         ret->m_seqChildren.emplace_back(comp);
@@ -321,10 +329,11 @@ std::unique_ptr<YAMLNode> ValToNode(const atVec4d& val)
 {
     YAMLNode* ret = new YAMLNode(YAML_SEQUENCE_NODE);
     ret->m_seqChildren.reserve(4);
+    simd_doubles f(val.simd); /* built from val.simd so each component can be read as f[i] below */
     for (size_t i=0 ; i<4 ; ++i)
     {
         char str[64];
-        snprintf(str, 64, "%f", val.vec[i]);
+        snprintf(str, 64, "%f", f[i]);
         YAMLNode* comp = new YAMLNode(YAML_SCALAR_NODE);
         comp->m_scalarString = str;
         ret->m_seqChildren.emplace_back(comp);