diff --git a/CMakeLists.txt b/CMakeLists.txt index 76b0fef..1e12a2a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,8 +3,6 @@ project(aurora LANGUAGES C CXX) set(CMAKE_C_STANDARD 11) set(CMAKE_CXX_STANDARD 20) -option(AURORA_NATIVE_MATRIX "Assume OpenGL-layout matrices, disables transposing" OFF) - add_subdirectory(extern) include(cmake/aurora_core.cmake) diff --git a/cmake/aurora_gx.cmake b/cmake/aurora_gx.cmake index 1bd8bad..2de474c 100644 --- a/cmake/aurora_gx.cmake +++ b/cmake/aurora_gx.cmake @@ -4,7 +4,6 @@ add_library(aurora_gx STATIC lib/gfx/gx.cpp lib/gfx/gx_shader.cpp lib/gfx/texture_convert.cpp - lib/gfx/stream/shader.cpp lib/gfx/model/shader.cpp lib/dolphin/gx/GXBump.cpp lib/dolphin/gx/GXCull.cpp @@ -28,9 +27,6 @@ add_library(aurora::gx ALIAS aurora_gx) target_link_libraries(aurora_gx PUBLIC aurora::core xxhash) target_link_libraries(aurora_gx PRIVATE absl::btree absl::flat_hash_map) -if (AURORA_NATIVE_MATRIX) - target_compile_definitions(aurora_gx PRIVATE AURORA_NATIVE_MATRIX) -endif () if (EMSCRIPTEN) target_link_options(aurora_gx PUBLIC -sUSE_WEBGPU=1 -sASYNCIFY -sEXIT_RUNTIME) target_compile_definitions(aurora_gx PRIVATE ENABLE_BACKEND_WEBGPU) diff --git a/include/aurora/math.hpp b/include/aurora/math.hpp index 7d51bad..e6c0dd4 100644 --- a/include/aurora/math.hpp +++ b/include/aurora/math.hpp @@ -35,9 +35,6 @@ struct Vec2 { constexpr Vec2() = default; constexpr Vec2(T x, T y) : x(x), y(y) {} AURORA_VEC2_EXTRA -#ifdef METAFORCE - constexpr Vec2(const zeus::CVector2f& vec) : x(vec.x()), y(vec.y()) {} -#endif bool operator==(const Vec2& rhs) const { return x == rhs.x && y == rhs.y; } bool operator!=(const Vec2& rhs) const { return !(*this == rhs); } @@ -51,10 +48,6 @@ struct Vec3 { constexpr Vec3() = default; constexpr Vec3(T x, T y, T z) : x(x), y(y), z(z) {} AURORA_VEC3_EXTRA -#ifdef METAFORCE - constexpr Vec3(const zeus::CVector3f& vec) : x(vec.x()), y(vec.y()), z(vec.z()) {} - operator zeus::CVector3f() const { return {x, y, z}; } -#endif 
bool operator==(const Vec3& rhs) const { return x == rhs.x && y == rhs.y && z == rhs.z; } bool operator!=(const Vec3& rhs) const { return !(*this == rhs); } @@ -77,10 +70,6 @@ struct Vec4 { // For Vec3 -> Vec4 constexpr Vec4(Vec3 v, T w) : m{v.x, v.y, v.z, w} {} AURORA_VEC4_EXTRA -#ifdef METAFORCE - constexpr Vec4(const zeus::CVector4f& vec) : x(vec.x()), y(vec.y()), z(vec.z()), w(vec.w()) {} - constexpr Vec4(const zeus::CColor& color) : x(color.r()), y(color.g()), z(color.b()), w(color.a()) {} -#endif inline Vec4& operator=(const Vec4& other) { memcpy(&m, &other.m, sizeof(Vt)); @@ -119,7 +108,7 @@ struct Vec4 { bool operator!=(const Vec4& rhs) const { return !(*this == rhs); } }; template -[[nodiscard]] inline Vec4 operator+(const Vec4& a, const Vec4& b) { +[[nodiscard]] Vec4 operator+(const Vec4& a, const Vec4& b) { #ifdef USE_GCC_VECTOR_EXTENSIONS return a.m + b.m; #else @@ -127,7 +116,7 @@ template #endif } template -[[nodiscard]] inline Vec4 operator*(const Vec4& a, const Vec4& b) { +[[nodiscard]] Vec4 operator*(const Vec4& a, const Vec4& b) { #ifdef USE_GCC_VECTOR_EXTENSIONS return a.m * b.m; #else @@ -170,6 +159,18 @@ struct Mat4x2 { bool operator!=(const Mat4x2& rhs) const { return !(*this == rhs); } }; template +struct Mat2x4 { + Vec4 m0{}; + Vec4 m1{}; + + constexpr Mat2x4() = default; + constexpr Mat2x4(const Vec4& m0, const Vec4& m1, const Vec4& m2) : m0(m0), m1(m1) {} + + bool operator==(const Mat2x4& rhs) const { return m0 == rhs.m0 && m1 == rhs.m1; } + bool operator!=(const Mat2x4& rhs) const { return !(*this == rhs); } +}; +static_assert(sizeof(Mat2x4) == 32); +template struct Mat4x4; template struct Mat3x4 { @@ -180,10 +181,13 @@ struct Mat3x4 { constexpr Mat3x4() = default; constexpr Mat3x4(const Vec4& m0, const Vec4& m1, const Vec4& m2) : m0(m0), m1(m1), m2(m2) {} - inline Mat4x4 to4x4() const; - inline Mat4x4 toTransposed4x4() const; + [[nodiscard]] Mat4x4 to4x4() const; + [[nodiscard]] Mat4x4 toTransposed4x4() const; + + bool operator==(const 
Mat3x4& rhs) const { return m0 == rhs.m0 && m1 == rhs.m1 && m2 == rhs.m2; } + bool operator!=(const Mat3x4& rhs) const { return !(*this == rhs); } }; -static_assert(sizeof(Mat3x4) == sizeof(float[3][4])); +static_assert(sizeof(Mat3x4) == 48); template struct Mat4x4 { Vec4 m0{}; @@ -195,10 +199,6 @@ struct Mat4x4 { constexpr Mat4x4(const Vec4& m0, const Vec4& m1, const Vec4& m2, const Vec4& m3) : m0(m0), m1(m1), m2(m2), m3(m3) {} AURORA_MAT4X4_EXTRA -#ifdef METAFORCE - constexpr Mat4x4(const zeus::CMatrix4f& m) : m0(m[0]), m1(m[1]), m2(m[2]), m3(m[3]) {} - constexpr Mat4x4(const zeus::CTransform& m) : Mat4x4(m.toMatrix4f()) {} -#endif [[nodiscard]] Mat4x4 transpose() const { return { @@ -208,23 +208,17 @@ struct Mat4x4 { {m0[3], m1[3], m2[3], m3[3]}, }; } - inline Mat4x4& operator=(const Mat4x4& other) { - m0 = other.m0; - m1 = other.m1; - m2 = other.m2; - m3 = other.m3; - return *this; - } + Mat4x4& operator=(const Mat4x4& other) = default; - inline Vec4& operator[](size_t i) { return *(&m0 + i); } - inline const Vec4& operator[](size_t i) const { return *(&m0 + i); } + Vec4& operator[](size_t i) { return *(&m0 + i); } + const Vec4& operator[](size_t i) const { return *(&m0 + i); } bool operator==(const Mat4x4& rhs) const { return m0 == rhs.m0 && m1 == rhs.m1 && m2 == rhs.m2 && m3 == rhs.m3; } bool operator!=(const Mat4x4& rhs) const { return !(*this == rhs); } }; -static_assert(sizeof(Mat4x4) == sizeof(float[4][4])); +static_assert(sizeof(Mat4x4) == 64); template -[[nodiscard]] inline Mat4x4 operator*(const Mat4x4& a, const Mat4x4& b) { +[[nodiscard]] Mat4x4 operator*(const Mat4x4& a, const Mat4x4& b) { Mat4x4 out; for (size_t i = 0; i < 4; ++i) { *(&out.m0 + i) = a.m0 * b[i].template shuffle<0, 0, 0, 0>() + a.m1 * b[i].template shuffle<1, 1, 1, 1>() + @@ -233,28 +227,27 @@ template return out; } template -[[nodiscard]] inline Mat4x4 Mat3x4::to4x4() const { +[[nodiscard]] Mat4x4 Mat3x4::to4x4() const { return { - {m0.m[0], m0.m[1], m0.m[2], 0.f}, - {m1.m[0], 
m1.m[1], m1.m[2], 0.f}, - {m2.m[0], m2.m[1], m2.m[2], 0.f}, - {m0.m[3], m1.m[3], m2.m[3], 1.f}, + {m0[0], m0[1], m0[2], 0.f}, + {m1[0], m1[1], m1[2], 0.f}, + {m2[0], m2[1], m2[2], 0.f}, + {m0[3], m1[3], m2[3], 1.f}, }; } template -[[nodiscard]] inline Mat4x4 Mat3x4::toTransposed4x4() const { +[[nodiscard]] Mat4x4 Mat3x4::toTransposed4x4() const { return Mat4x4{ - m0, - m1, - m2, - {0.f, 0.f, 0.f, 1.f}, - } - .transpose(); + {m0[0], m1[0], m2[0], 0.f}, + {m0[1], m1[1], m2[1], 0.f}, + {m0[2], m1[2], m2[2], 0.f}, + {m0[3], m1[3], m2[3], 1.f}, + }; } -constexpr Mat4x4 Mat4x4_Identity{ - Vec4{1.f, 0.f, 0.f, 0.f}, - Vec4{0.f, 1.f, 0.f, 0.f}, - Vec4{0.f, 0.f, 1.f, 0.f}, - Vec4{0.f, 0.f, 0.f, 1.f}, +constexpr Mat4x4 Mat4x4_Identity{ + Vec4{1.f, 0.f, 0.f, 0.f}, + Vec4{0.f, 1.f, 0.f, 0.f}, + Vec4{0.f, 0.f, 1.f, 0.f}, + Vec4{0.f, 0.f, 0.f, 1.f}, }; } // namespace aurora diff --git a/include/dolphin/gx/GXVert.h b/include/dolphin/gx/GXVert.h index 8af0cde..93f9914 100644 --- a/include/dolphin/gx/GXVert.h +++ b/include/dolphin/gx/GXVert.h @@ -68,11 +68,11 @@ void GXTexCoord2s16(s16 s, s16 t); void GXTexCoord2u8(u8 s, u8 t); void GXTexCoord2s8(s8 s, s8 t); -void GXTexCoord1f32(f32 s, f32 t); -void GXTexCoord1u16(u16 s, u16 t); -void GXTexCoord1s16(s16 s, s16 t); -void GXTexCoord1u8(u8 s, u8 t); -void GXTexCoord1s8(s8 s, s8 t); +void GXTexCoord1f32(f32 s); +void GXTexCoord1u16(u16 s); +void GXTexCoord1s16(s16 s); +void GXTexCoord1u8(u8 s); +void GXTexCoord1s8(s8 s); void GXTexCoord1x16(u16 index); void GXTexCoord1x8(u8 index); diff --git a/lib/dolphin/gx/GXGeometry.cpp b/lib/dolphin/gx/GXGeometry.cpp index 8bcfeda..110f1c6 100644 --- a/lib/dolphin/gx/GXGeometry.cpp +++ b/lib/dolphin/gx/GXGeometry.cpp @@ -7,7 +7,6 @@ extern "C" { void GXSetVtxDesc(GXAttr attr, GXAttrType type) { update_gx_state(g_gxState.vtxDesc[attr], type); } void GXSetVtxDescv(GXVtxDescList* list) { - g_gxState.vtxDesc.fill({}); while (list->attr != GX_VA_NULL) { update_gx_state(g_gxState.vtxDesc[list->attr], 
list->type); ++list; @@ -17,8 +16,8 @@ void GXSetVtxDescv(GXVtxDescList* list) { void GXClearVtxDesc() { g_gxState.vtxDesc.fill({}); } void GXSetVtxAttrFmt(GXVtxFmt vtxfmt, GXAttr attr, GXCompCnt cnt, GXCompType type, u8 frac) { - CHECK(vtxfmt >= GX_VTXFMT0 && vtxfmt < GX_MAX_VTXFMT, "invalid vtxfmt {}", static_cast(vtxfmt)); - CHECK(attr >= GX_VA_PNMTXIDX && attr < GX_VA_MAX_ATTR, "invalid attr {}", static_cast(attr)); + CHECK(vtxfmt >= GX_VTXFMT0 && vtxfmt < GX_MAX_VTXFMT, "invalid vtxfmt {}", underlying(vtxfmt)); + CHECK(attr >= GX_VA_PNMTXIDX && attr < GX_VA_MAX_ATTR, "invalid attr {}", underlying(attr)); auto& fmt = g_gxState.vtxFmts[vtxfmt].attrs[attr]; update_gx_state(fmt.cnt, cnt); update_gx_state(fmt.type, type); @@ -38,7 +37,7 @@ void GXSetArray(GXAttr attr, const void* data, u32 size, u8 stride) { // TODO move GXBegin, GXEnd here void GXSetTexCoordGen2(GXTexCoordID dst, GXTexGenType type, GXTexGenSrc src, u32 mtx, GXBool normalize, u32 postMtx) { - CHECK(dst >= GX_TEXCOORD0 && dst <= GX_TEXCOORD7, "invalid tex coord {}", static_cast(dst)); + CHECK(dst >= GX_TEXCOORD0 && dst <= GX_TEXCOORD7, "invalid tex coord {}", underlying(dst)); update_gx_state(g_gxState.tcgs[dst], {type, src, static_cast(mtx), static_cast(postMtx), normalize}); } diff --git a/lib/dolphin/gx/GXGet.cpp b/lib/dolphin/gx/GXGet.cpp index 4c2cb16..3c5c39a 100644 --- a/lib/dolphin/gx/GXGet.cpp +++ b/lib/dolphin/gx/GXGet.cpp @@ -20,7 +20,7 @@ void GXGetVtxAttrFmt(GXVtxFmt idx, GXAttr attr, GXCompCnt* compCnt, GXCompType* // TODO GXGetViewportv void GXGetProjectionv(f32* p) { - const auto& mtx = g_gxState.origProj; + const auto& mtx = g_gxState.proj; p[0] = static_cast(g_gxState.projType); p[1] = mtx.m0[0]; p[3] = mtx.m1[1]; diff --git a/lib/dolphin/gx/GXTransform.cpp b/lib/dolphin/gx/GXTransform.cpp index 5c69504..de668ea 100644 --- a/lib/dolphin/gx/GXTransform.cpp +++ b/lib/dolphin/gx/GXTransform.cpp @@ -4,15 +4,8 @@ extern "C" { void GXSetProjection(const void* mtx_, GXProjectionType type) 
{ const auto& mtx = *reinterpret_cast*>(mtx_); - g_gxState.origProj = mtx; g_gxState.projType = type; - update_gx_state(g_gxState.proj, -#ifdef AURORA_NATIVE_MATRIX - mtx -#else - mtx.transpose() -#endif - ); + update_gx_state(g_gxState.proj, mtx); } // TODO GXSetProjectionv @@ -20,13 +13,8 @@ void GXSetProjection(const void* mtx_, GXProjectionType type) { void GXLoadPosMtxImm(const void* mtx_, u32 id) { CHECK(id >= GX_PNMTX0 && id <= GX_PNMTX9, "invalid pn mtx {}", static_cast(id)); auto& state = g_gxState.pnMtx[id / 3]; -#ifdef AURORA_NATIVE_MATRIX - const auto& mtx = *reinterpret_cast*>(mtx_); + const auto& mtx = *reinterpret_cast*>(mtx_); update_gx_state(state.pos, mtx); -#else - const auto* mtx = reinterpret_cast*>(mtx_); - update_gx_state(state.pos, mtx->toTransposed4x4()); -#endif } // TODO GXLoadPosMtxIndx @@ -34,56 +22,37 @@ void GXLoadPosMtxImm(const void* mtx_, u32 id) { void GXLoadNrmMtxImm(const void* mtx_, u32 id) { CHECK(id >= GX_PNMTX0 && id <= GX_PNMTX9, "invalid pn mtx {}", static_cast(id)); auto& state = g_gxState.pnMtx[id / 3]; -#ifdef AURORA_NATIVE_MATRIX - const auto& mtx = *reinterpret_cast*>(mtx_); + const auto& mtx = *reinterpret_cast*>(mtx_); update_gx_state(state.nrm, mtx); -#else - const auto* mtx = reinterpret_cast*>(mtx_); - update_gx_state(state.nrm, mtx->toTransposed4x4()); -#endif } // TODO GXLoadNrmMtxImm3x3 // TODO GXLoadNrmMtxIndx3x3 void GXSetCurrentMtx(u32 id) { - CHECK(id >= GX_PNMTX0 && id <= GX_PNMTX9, "invalid pn mtx {}", static_cast(id)); + CHECK(id >= GX_PNMTX0 && id <= GX_PNMTX9, "invalid pn mtx {}", id); update_gx_state(g_gxState.currentPnMtx, id / 3); } void GXLoadTexMtxImm(const void* mtx_, u32 id, GXTexMtxType type) { CHECK((id >= GX_TEXMTX0 && id <= GX_IDENTITY) || (id >= GX_PTTEXMTX0 && id <= GX_PTIDENTITY), "invalid tex mtx {}", - static_cast(id)); + id); if (id >= GX_PTTEXMTX0) { - CHECK(type == GX_MTX3x4, "invalid pt mtx type {}", static_cast(type)); + CHECK(type == GX_MTX3x4, "invalid pt mtx type {}", 
underlying(type)); const auto idx = (id - GX_PTTEXMTX0) / 3; -#ifdef AURORA_NATIVE_MATRIX - const auto& mtx = *reinterpret_cast*>(mtx_); - update_gx_state>(g_gxState.ptTexMtxs[idx], mtx); -#else const auto& mtx = *reinterpret_cast*>(mtx_); - update_gx_state>(g_gxState.ptTexMtxs[idx], mtx.toTransposed4x4()); -#endif + update_gx_state(g_gxState.ptTexMtxs[idx], mtx); } else { const auto idx = (id - GX_TEXMTX0) / 3; switch (type) { case GX_MTX3x4: { -#ifdef AURORA_NATIVE_MATRIX - const auto& mtx = *reinterpret_cast*>(mtx_); - update_gx_state(g_gxState.texMtxs[idx], mtx); -#else const auto& mtx = *reinterpret_cast*>(mtx_); - update_gx_state(g_gxState.texMtxs[idx], mtx.toTransposed4x4()); -#endif + update_gx_state(g_gxState.texMtxs[idx], mtx); break; } case GX_MTX2x4: { - const auto& mtx = *reinterpret_cast*>(mtx_); -#ifdef AURORA_NATIVE_MATRIX + const auto& mtx = *reinterpret_cast*>(mtx_); update_gx_state(g_gxState.texMtxs[idx], mtx); -#else - update_gx_state(g_gxState.texMtxs[idx], mtx.transpose()); -#endif break; } } diff --git a/lib/dolphin/gx/GXVert.cpp b/lib/dolphin/gx/GXVert.cpp index 1791322..023874c 100644 --- a/lib/dolphin/gx/GXVert.cpp +++ b/lib/dolphin/gx/GXVert.cpp @@ -1,47 +1,113 @@ #include "gx.hpp" -#include "../../gfx/stream/shader.hpp" +#include "aurora/math.hpp" +#include "../../gfx/model/shader.hpp" +#include "../../gfx/gx_fmt.hpp" -#include +#include #include -#ifndef NDEBUG -static inline GXAttr next_attr(size_t begin) { - auto iter = std::find_if(g_gxState.vtxDesc.begin() + begin, g_gxState.vtxDesc.end(), - [](const auto type) { return type != GX_NONE; }); - if (begin > 0 && iter == g_gxState.vtxDesc.end()) { - // wrap around - iter = std::find_if(g_gxState.vtxDesc.begin(), g_gxState.vtxDesc.end(), - [](const auto type) { return type != GX_NONE; }); - } - return GXAttr(iter - g_gxState.vtxDesc.begin()); -} -#endif +struct Attribute { + uint32_t offset; + GXAttr attr; + GXAttrType type; + aurora::gfx::gx::VtxAttrFmt fmt; +}; struct SStreamState { 
GXPrimitive primitive; GXVtxFmt vtxFmt; + std::vector attrs; + u16 curAttr = 0; u16 vertexCount = 0; - u16 vertexStart = 0; + u16 vertexStart; + u16 vertexSize; aurora::ByteBuffer vertexBuffer; + uint8_t* vertexData = nullptr; std::vector indices; -#ifndef NDEBUG - GXAttr nextAttr; -#endif - explicit SStreamState(GXPrimitive primitive, GXVtxFmt vtxFmt, u16 numVerts, u16 vertexSize, u16 vertexStart) noexcept - : primitive(primitive), vtxFmt(vtxFmt), vertexStart(vertexStart) { - vertexBuffer.reserve_extra(size_t(numVerts) * vertexSize); + explicit SStreamState(GXPrimitive primitive, GXVtxFmt vtxFmt, std::vector attrs, u16 numVerts, + u16 vertexSize, u16 vertexStart) noexcept + : primitive(primitive), vtxFmt(vtxFmt), attrs(std::move(attrs)), vertexStart(vertexStart), vertexSize(vertexSize) { + vertexBuffer.reserve_extra(static_cast(numVerts) * vertexSize); if (numVerts > 3 && (primitive == GX_TRIANGLEFAN || primitive == GX_TRIANGLESTRIP)) { - indices.reserve((u32(numVerts) - 3) * 3 + 3); + indices.reserve(((static_cast(numVerts) - 3) * 3) + 3); } else if (numVerts > 4 && primitive == GX_QUADS) { - indices.reserve(u32(numVerts) / 4 * 6); + indices.reserve(static_cast(numVerts) / 4 * 6); } else { indices.reserve(numVerts); } -#ifndef NDEBUG - nextAttr = next_attr(0); -#endif + } + + [[maybe_unused]] u8 check_direct(GXAttr attr, GXCompCnt cnt, GXCompType type) noexcept { + const auto& curAttr = attrs[this->curAttr]; + ASSERT(curAttr.attr == attr, "bad attribute order: {}, expected {}", attr, curAttr.attr); + ASSERT(curAttr.type == GX_DIRECT, "bad attribute type: GX_DIRECT, expected {}", curAttr.type); + ASSERT(curAttr.fmt.cnt == cnt, "bad attribute count: {}, expected {}", cnt, curAttr.fmt.cnt); + ASSERT(curAttr.fmt.type == type, "bad attribute type: {}, expected {}", type, curAttr.fmt.type); + return curAttr.fmt.frac; + } + + void check_indexed(GXAttr attr, GXAttrType type) noexcept { + const auto& curAttr = attrs[this->curAttr]; + ASSERT(curAttr.attr == attr, "bad 
attribute order: {}, expected {}", attr, curAttr.attr); + ASSERT(curAttr.type == type, "bad attribute type: {}, expected {}", type, curAttr.type); + } + + template + void append(const T& value) noexcept { + append_data(&value, sizeof(value), attrs[curAttr].offset); + next_attribute(); + } + +private: + void append_data(const void* ptr, size_t size, uint32_t offset) { + if (vertexData == nullptr) { + const auto vertexStart = vertexBuffer.size(); + vertexBuffer.append_zeroes(vertexSize); + vertexData = vertexBuffer.data() + vertexStart; + inc_vertex_count(); + } + ASSERT(offset + size <= vertexSize, "bad attribute end: {}, expected {}", offset + size, vertexSize); + memcpy(vertexData + offset, ptr, size); + } + + void next_attribute() noexcept { + curAttr = curAttr + 1; + if (curAttr >= attrs.size()) { + curAttr = 0; + vertexData = nullptr; + } + } + + void inc_vertex_count() noexcept { + auto curVertex = vertexStart + vertexCount; + if (primitive == GX_LINES || primitive == GX_LINESTRIP || primitive == GX_POINTS) { + // Currently unsupported, skip + return; + } + if (primitive == GX_TRIANGLES || primitive == GX_TRIANGLESTRIP || vertexCount < 3) { + // pass + } else if (primitive == GX_TRIANGLEFAN) { + indices.push_back(vertexStart); + indices.push_back(curVertex - 1); + } /*else if (primitive == GX_TRIANGLESTRIP) { + if ((vertexCount & 1) == 0) { + indices.push_back(curVertex - 2); + indices.push_back(curVertex - 1); + } else { + indices.push_back(curVertex - 1); + indices.push_back(curVertex - 2); + } + }*/ + else if (primitive == GX_QUADS) { + if ((vertexCount & 3) == 3) { + indices.push_back(curVertex - 3); + indices.push_back(curVertex - 1); + } + } + indices.push_back(curVertex); + ++vertexCount; } }; @@ -51,228 +117,319 @@ static u16 lastVertexStart = 0; extern "C" { void GXBegin(GXPrimitive primitive, GXVtxFmt vtxFmt, u16 nVerts) { CHECK(!sStreamState, "Stream began twice!"); + uint16_t vertexSize = 0; + uint16_t numDirectAttrs = 0; + uint16_t numIndexedAttrs 
= 0; for (GXAttr attr{}; const auto type : g_gxState.vtxDesc) { if (type == GX_DIRECT) { + ++numDirectAttrs; if (attr == GX_VA_POS || attr == GX_VA_NRM) { vertexSize += 12; } else if (attr == GX_VA_CLR0 || attr == GX_VA_CLR1) { vertexSize += 16; } else if (attr >= GX_VA_TEX0 && attr <= GX_VA_TEX7) { vertexSize += 8; - } else UNLIKELY { - FATAL("dont know how to handle attr {}", static_cast(attr)); - } + } else + UNLIKELY { FATAL("dont know how to handle attr {}", attr); } } else if (type == GX_INDEX8 || type == GX_INDEX16) { - vertexSize += 2; + ++numIndexedAttrs; } - attr = GXAttr(attr + 1); + attr = static_cast(attr + 1); } + auto [num4xAttr, rem] = std::div(numIndexedAttrs, 4); + u32 num2xAttr = 0; + if (rem > 2) { + ++num4xAttr; + } else if (rem > 0) { + ++num2xAttr; + } + u32 directStart = num4xAttr * 8 + num2xAttr * 4; + vertexSize += directStart; + + u32 indexOffset = 0; + u32 directOffset = directStart; + std::vector attrs; + attrs.reserve(numDirectAttrs + numIndexedAttrs); + const auto& curVtxFmt = g_gxState.vtxFmts[vtxFmt]; + for (GXAttr attr{}; const auto type : g_gxState.vtxDesc) { + if (type == GX_DIRECT) { + u32 attrSize; + if (attr == GX_VA_POS || attr == GX_VA_NRM) { + attrSize = 12; + } else if (attr == GX_VA_CLR0 || attr == GX_VA_CLR1) { + attrSize = 16; + } else if (attr >= GX_VA_TEX0 && attr <= GX_VA_TEX7) { + attrSize = 8; + } else + UNLIKELY { FATAL("dont know how to handle attr {}", attr); } + const auto& attrFmt = curVtxFmt.attrs[attr]; + attrs.emplace_back(directOffset, attr, type, attrFmt); + directOffset += attrSize; + } else if (type == GX_INDEX8 || type == GX_INDEX16) { + attrs.emplace_back(indexOffset, attr, type); + indexOffset += 2; + } + attr = static_cast(attr + 1); + } + CHECK(vertexSize > 0, "no vtx attributes enabled?"); - sStreamState.emplace(primitive, vtxFmt, nVerts, vertexSize, g_gxState.stateDirty ? 
0 : lastVertexStart); + sStreamState.emplace(primitive, vtxFmt, std::move(attrs), nVerts, vertexSize, + /*g_gxState.stateDirty ? 0 : lastVertexStart*/ 0); } -static inline void check_attr_order(GXAttr attr) noexcept { -#ifndef NDEBUG - CHECK(sStreamState, "Stream not started!"); - CHECK(sStreamState->nextAttr == attr, "bad attribute order: {}, expected {}", static_cast(attr), - static_cast(sStreamState->nextAttr)); - sStreamState->nextAttr = next_attr(attr + 1); -#endif -} - -void GXPosition3f32(float x, float y, float z) { - check_attr_order(GX_VA_POS); - auto& state = *sStreamState; - state.vertexBuffer.append(&x, sizeof(float)); - state.vertexBuffer.append(&y, sizeof(float)); - state.vertexBuffer.append(&z, sizeof(float)); - auto curVertex = state.vertexStart + state.vertexCount; - if (state.primitive == GX_TRIANGLES || state.vertexCount < 3) { - // pass - } else if (state.primitive == GX_TRIANGLEFAN) { - state.indices.push_back(state.vertexStart); - state.indices.push_back(curVertex - 1); - } else if (state.primitive == GX_TRIANGLESTRIP) { - if ((state.vertexCount & 1) == 0) { - state.indices.push_back(curVertex - 2); - state.indices.push_back(curVertex - 1); - } else { - state.indices.push_back(curVertex - 1); - state.indices.push_back(curVertex - 2); - } - } else if (state.primitive == GX_QUADS) { - if ((state.vertexCount & 3) == 3) { - state.indices.push_back(curVertex - 3); - state.indices.push_back(curVertex - 1); - } - } - state.indices.push_back(curVertex); - ++state.vertexCount; +void GXPosition3f32(f32 x, f32 y, f32 z) { + sStreamState->check_direct(GX_VA_POS, GX_POS_XYZ, GX_F32); + sStreamState->append(aurora::Vec3{x, y, z}); } void GXPosition3u16(u16 x, u16 y, u16 z) { - const auto& attrFmt = g_gxState.vtxFmts[sStreamState->vtxFmt].attrs[GX_VA_POS]; - GXPosition3f32( - static_cast(x) / static_cast(1 << attrFmt.frac), - static_cast(y) / static_cast(1 << attrFmt.frac), - static_cast(z) / static_cast(1 << attrFmt.frac) - ); + const auto frac = 
sStreamState->check_direct(GX_VA_POS, GX_POS_XYZ, GX_U16); + sStreamState->append(aurora::Vec3{ + static_cast(x) / static_cast(1 << frac), + static_cast(y) / static_cast(1 << frac), + static_cast(z) / static_cast(1 << frac), + }); } void GXPosition3s16(s16 x, s16 y, s16 z) { - const auto& attrFmt = g_gxState.vtxFmts[sStreamState->vtxFmt].attrs[GX_VA_POS]; - GXPosition3f32( - static_cast(x) / static_cast(1 << attrFmt.frac), - static_cast(y) / static_cast(1 << attrFmt.frac), - static_cast(z) / static_cast(1 << attrFmt.frac) - ); + const auto frac = sStreamState->check_direct(GX_VA_POS, GX_POS_XYZ, GX_S16); + sStreamState->append(aurora::Vec3{ + static_cast(x) / static_cast(1 << frac), + static_cast(y) / static_cast(1 << frac), + static_cast(z) / static_cast(1 << frac), + }); } void GXPosition3u8(u8 x, u8 y, u8 z) { - const auto& attrFmt = g_gxState.vtxFmts[sStreamState->vtxFmt].attrs[GX_VA_POS]; - GXPosition3f32( - static_cast(x) / static_cast(1 << attrFmt.frac), - static_cast(y) / static_cast(1 << attrFmt.frac), - static_cast(z) / static_cast(1 << attrFmt.frac) - ); + const auto frac = sStreamState->check_direct(GX_VA_POS, GX_POS_XYZ, GX_U8); + sStreamState->append(aurora::Vec3{ + static_cast(x) / static_cast(1 << frac), + static_cast(y) / static_cast(1 << frac), + static_cast(z) / static_cast(1 << frac), + }); } void GXPosition3s8(s8 x, s8 y, s8 z) { - const auto& attrFmt = g_gxState.vtxFmts[sStreamState->vtxFmt].attrs[GX_VA_POS]; - GXPosition3f32( - static_cast(x) / static_cast(1 << attrFmt.frac), - static_cast(y) / static_cast(1 << attrFmt.frac), - static_cast(z) / static_cast(1 << attrFmt.frac) - ); + const auto frac = sStreamState->check_direct(GX_VA_POS, GX_POS_XYZ, GX_S8); + sStreamState->append(aurora::Vec3{ + static_cast(x) / static_cast(1 << frac), + static_cast(y) / static_cast(1 << frac), + static_cast(z) / static_cast(1 << frac), + }); } -void GXPosition2f32(float x, float y) { - GXPosition3f32(x, y, 0.f); +void GXPosition2f32(f32 x, f32 y) { + 
sStreamState->check_direct(GX_VA_POS, GX_POS_XY, GX_F32); + sStreamState->append(aurora::Vec3{x, y, 0.f}); } void GXPosition2u16(u16 x, u16 y) { - GXPosition3u16(x, y, 0); + const auto frac = sStreamState->check_direct(GX_VA_POS, GX_POS_XY, GX_U16); + sStreamState->append(aurora::Vec3{ + static_cast(x) / static_cast(1 << frac), + static_cast(y) / static_cast(1 << frac), + 0.f, + }); } void GXPosition2s16(s16 x, s16 y) { - GXPosition3s16(x, y, 0); + const auto frac = sStreamState->check_direct(GX_VA_POS, GX_POS_XY, GX_S16); + sStreamState->append(aurora::Vec3{ + static_cast(x) / static_cast(1 << frac), + static_cast(y) / static_cast(1 << frac), + 0.f, + }); } void GXPosition2u8(u8 x, u8 y) { - GXPosition3u8(x, y, 0); + const auto frac = sStreamState->check_direct(GX_VA_POS, GX_POS_XY, GX_U8); + sStreamState->append(aurora::Vec3{ + static_cast(x) / static_cast(1 << frac), + static_cast(y) / static_cast(1 << frac), + 0.f, + }); } void GXPosition2s8(s8 x, s8 y) { - GXPosition3s8(x, y, 0); + const auto frac = sStreamState->check_direct(GX_VA_POS, GX_POS_XY, GX_S8); + sStreamState->append(aurora::Vec3{ + static_cast(x) / static_cast(1 << frac), + static_cast(y) / static_cast(1 << frac), + 0.f, + }); } void GXPosition1x16(u16 idx) { - check_attr_order(GX_VA_POS); - // keep aligned - if (sStreamState->vertexBuffer.size() % 4 != 0) { - sStreamState->vertexBuffer.append_zeroes(4 - (sStreamState->vertexBuffer.size() % 4)); - } - sStreamState->vertexBuffer.append(&idx, 2); + sStreamState->check_indexed(GX_VA_POS, GX_INDEX16); + sStreamState->append(idx); } void GXPosition1x8(u8 idx) { - GXPosition1x16(idx); + sStreamState->check_indexed(GX_VA_POS, GX_INDEX8); + sStreamState->append(idx); } -void GXNormal3f32(float x, float y, float z) { - check_attr_order(GX_VA_NRM); - sStreamState->vertexBuffer.append(&x, 4); - sStreamState->vertexBuffer.append(&y, 4); - sStreamState->vertexBuffer.append(&z, 4); +void GXNormal3f32(f32 x, f32 y, f32 z) { + sStreamState->check_direct(GX_VA_NRM, 
GX_NRM_XYZ, GX_F32); + sStreamState->append(aurora::Vec3{x, y, z}); } void GXNormal3s16(s16 x, s16 y, s16 z) { - const auto& attrFmt = g_gxState.vtxFmts[sStreamState->vtxFmt].attrs[GX_VA_NRM]; - GXNormal3f32( - static_cast(x) / static_cast(1 << attrFmt.frac), - static_cast(y) / static_cast(1 << attrFmt.frac), - static_cast(z) / static_cast(1 << attrFmt.frac) - ); + const auto frac = sStreamState->check_direct(GX_VA_NRM, GX_NRM_XYZ, GX_S16); + sStreamState->append(aurora::Vec3{ + static_cast(x) / static_cast(1 << frac), + static_cast(y) / static_cast(1 << frac), + static_cast(z) / static_cast(1 << frac), + }); } void GXNormal3s8(s8 x, s8 y, s8 z) { - const auto& attrFmt = g_gxState.vtxFmts[sStreamState->vtxFmt].attrs[GX_VA_NRM]; - GXNormal3f32( - static_cast(x) / static_cast(1 << attrFmt.frac), - static_cast(y) / static_cast(1 << attrFmt.frac), - static_cast(z) / static_cast(1 << attrFmt.frac) - ); + const auto frac = sStreamState->check_direct(GX_VA_NRM, GX_NRM_XYZ, GX_S8); + sStreamState->append(aurora::Vec3{ + static_cast(x) / static_cast(1 << frac), + static_cast(y) / static_cast(1 << frac), + static_cast(z) / static_cast(1 << frac), + }); } -void GXNormal1x16(u16 idx) { - check_attr_order(GX_VA_NRM); - // keep aligned - if (sStreamState->vertexBuffer.size() % 4 != 0) { - sStreamState->vertexBuffer.append_zeroes(4 - (sStreamState->vertexBuffer.size() % 4)); - } - sStreamState->vertexBuffer.append(&idx, 2); +void GXNormal1x16(u16 index) { + sStreamState->check_indexed(GX_VA_NRM, GX_INDEX16); + sStreamState->append(index); } -void GXNormal1x8(u8 idx) { - GXNormal1x16(idx); +void GXNormal1x8(u8 index) { + sStreamState->check_indexed(GX_VA_NRM, GX_INDEX8); + sStreamState->append(index); } -void GXColor4f32(float r, float g, float b, float a) { - check_attr_order(GX_VA_CLR0); - sStreamState->vertexBuffer.append(&r, 4); - sStreamState->vertexBuffer.append(&g, 4); - sStreamState->vertexBuffer.append(&b, 4); - sStreamState->vertexBuffer.append(&a, 4); +void 
GXColor4f32(f32 r, f32 g, f32 b, f32 a) { + sStreamState->check_direct(GX_VA_CLR0, GX_CLR_RGBA, GX_RGBA8); + sStreamState->append(aurora::Vec4{r, g, b, a}); } void GXColor4u8(u8 r, u8 g, u8 b, u8 a) { - GXColor4f32(static_cast(r) / 255.f, static_cast(g) / 255.f, static_cast(b) / 255.f, - static_cast(a) / 255.f); + sStreamState->check_direct(GX_VA_CLR0, GX_CLR_RGBA, GX_RGBA8); + sStreamState->append(aurora::Vec4{ + static_cast(r) / 255.f, + static_cast(g) / 255.f, + static_cast(b) / 255.f, + static_cast(a) / 255.f, + }); } void GXColor3u8(u8 r, u8 g, u8 b) { - GXColor4u8(r, g, b, 255); + sStreamState->check_direct(GX_VA_CLR0, GX_CLR_RGB, GX_RGB8); + sStreamState->append(aurora::Vec4{ + static_cast(r) / 255.f, + static_cast(g) / 255.f, + static_cast(b) / 255.f, + 1.f, + }); } -void GXColor1x16(u16 idx) { - check_attr_order(GX_VA_CLR0); - // keep aligned - if (sStreamState->vertexBuffer.size() % 4 != 0) { - sStreamState->vertexBuffer.append_zeroes(4 - (sStreamState->vertexBuffer.size() % 4)); - } - sStreamState->vertexBuffer.append(&idx, 2); +void GXColor1u32(u32 clr) { + sStreamState->check_direct(GX_VA_CLR0, GX_CLR_RGBA, GX_RGBA8); + sStreamState->append(aurora::Vec4{ + static_cast((clr >> 24) & 0xff) / 255.f, + static_cast((clr >> 16) & 0xff) / 255.f, + static_cast((clr >> 8) & 0xff) / 255.f, + static_cast(clr & 0xff) / 255.f, + }); } -void GXColor1x8(u8 idx) { - GXColor1x16(idx); +void GXColor1u16(u16 clr) { + sStreamState->check_direct(GX_VA_CLR0, GX_CLR_RGB, GX_RGB565); + sStreamState->append(aurora::Vec4{ + static_cast((clr >> 11) & 0x1f) / 31.f, + static_cast((clr >> 5) & 0x3f) / 63.f, + static_cast(clr & 0x1f) / 31.f, + 1.f, + }); } -void GXTexCoord2f32(float u, float v) { - check_attr_order(GX_VA_TEX0); - sStreamState->vertexBuffer.append(&u, 4); - sStreamState->vertexBuffer.append(&v, 4); +void GXTexCoord2f32(f32 s, f32 t) { + sStreamState->check_direct(GX_VA_TEX0, GX_TEX_ST, GX_F32); + sStreamState->append(aurora::Vec2{s, t}); +} + +void GXTexCoord2u16(u16 
s, u16 t) { + const auto frac = sStreamState->check_direct(GX_VA_TEX0, GX_TEX_ST, GX_U16); + sStreamState->append(aurora::Vec2{ + static_cast(s) / static_cast(1 << frac), + static_cast(t) / static_cast(1 << frac), + }); } void GXTexCoord2s16(s16 s, s16 t) { - const auto& attrFmt = g_gxState.vtxFmts[sStreamState->vtxFmt].attrs[GX_VA_TEX0]; - GXTexCoord2f32( - static_cast(s) / static_cast(1 << attrFmt.frac), - static_cast(t) / static_cast(1 << attrFmt.frac) - ); + const auto frac = sStreamState->check_direct(GX_VA_TEX0, GX_TEX_ST, GX_S16); + sStreamState->append(aurora::Vec2{ + static_cast(s) / static_cast(1 << frac), + static_cast(t) / static_cast(1 << frac), + }); } -void GXTexCoord1x16(u16 idx) { - check_attr_order(GX_VA_TEX0); - // keep aligned - if (sStreamState->vertexBuffer.size() % 4 != 0) { - sStreamState->vertexBuffer.append_zeroes(4 - (sStreamState->vertexBuffer.size() % 4)); - } - sStreamState->vertexBuffer.append(&idx, 2); +void GXTexCoord2u8(u8 s, u8 t) { + const auto frac = sStreamState->check_direct(GX_VA_TEX0, GX_TEX_ST, GX_U8); + sStreamState->append(aurora::Vec2{ + static_cast(s) / static_cast(1 << frac), + static_cast(t) / static_cast(1 << frac), + }); } -void GXTexCoord1x8(u8 idx) { - GXTexCoord1x16(idx); +void GXTexCoord2s8(s8 s, s8 t) { + const auto frac = sStreamState->check_direct(GX_VA_TEX0, GX_TEX_ST, GX_S8); + sStreamState->append(aurora::Vec2{ + static_cast(s) / static_cast(1 << frac), + static_cast(t) / static_cast(1 << frac), + }); +} + +void GXTexCoord1f32(f32 s) { + sStreamState->check_direct(GX_VA_TEX0, GX_TEX_S, GX_F32); + sStreamState->append(aurora::Vec2{s, 0.f}); +} + +void GXTexCoord1u16(u16 s) { + const auto frac = sStreamState->check_direct(GX_VA_TEX0, GX_TEX_S, GX_U16); + sStreamState->append(aurora::Vec2{ + static_cast(s) / static_cast(1 << frac), + 0.f, + }); +} + +void GXTexCoord1s16(s16 s) { + const auto frac = sStreamState->check_direct(GX_VA_TEX0, GX_TEX_S, GX_S16); + sStreamState->append(aurora::Vec2{ + static_cast(s) 
/ static_cast(1 << frac), + 0.f, + }); +} + +void GXTexCoord1u8(u8 s) { + const auto frac = sStreamState->check_direct(GX_VA_TEX0, GX_TEX_S, GX_U8); + sStreamState->append(aurora::Vec2{ + static_cast(s) / static_cast(1 << frac), + 0.f, + }); +} + +void GXTexCoord1s8(s8 s) { + const auto frac = sStreamState->check_direct(GX_VA_TEX0, GX_TEX_S, GX_S8); + sStreamState->append(aurora::Vec2{ + static_cast(s) / static_cast(1 << frac), + 0.f, + }); +} + +void GXTexCoord1x16(u16 index) { + sStreamState->check_indexed(GX_VA_TEX0, GX_INDEX16); + sStreamState->append(index); +} + +void GXTexCoord1x8(u8 index) { + sStreamState->check_indexed(GX_VA_TEX0, GX_INDEX8); + sStreamState->append(static_cast(index)); } void GXEnd() { @@ -282,27 +439,55 @@ void GXEnd() { } const auto vertRange = aurora::gfx::push_verts(sStreamState->vertexBuffer.data(), sStreamState->vertexBuffer.size()); const auto indexRange = aurora::gfx::push_indices(aurora::ArrayRef{sStreamState->indices}); - if (g_gxState.stateDirty) { - aurora::gfx::stream::PipelineConfig config{}; - populate_pipeline_config(config, GX_TRIANGLES); - const auto info = build_shader_info(config.shaderConfig); - const auto pipeline = aurora::gfx::pipeline_ref(config); - aurora::gfx::push_draw_command(aurora::gfx::stream::DrawData{ - .pipeline = pipeline, - .vertRange = vertRange, - .uniformRange = build_uniform(info), - .indexRange = indexRange, - .indexCount = static_cast(sStreamState->indices.size()), - .bindGroups = build_bind_groups(info, config.shaderConfig, {}), - .dstAlpha = g_gxState.dstAlpha, - }); - } else { - aurora::gfx::merge_draw_command(aurora::gfx::stream::DrawData{ - .vertRange = vertRange, - .indexRange = indexRange, - .indexCount = static_cast(sStreamState->indices.size()), - }); + + aurora::gfx::gx::BindGroupRanges ranges{}; + for (int i = 0; i < GX_VA_MAX_ATTR; ++i) { + if (g_gxState.vtxDesc[i] != GX_INDEX8 && g_gxState.vtxDesc[i] != GX_INDEX16) { + continue; + } + auto& array = g_gxState.arrays[i]; + if 
(array.cachedRange.size > 0) { + // Use the currently cached range + ranges.vaRanges[i] = array.cachedRange; + } else { + // Push array data to storage and cache range + const auto range = aurora::gfx::push_storage(static_cast(array.data), array.size); + ranges.vaRanges[i] = range; + array.cachedRange = range; + } } + + // if (g_gxState.stateDirty) { + aurora::gfx::model::PipelineConfig config{}; + GXPrimitive primitive = GX_TRIANGLES; + switch (sStreamState->primitive) { + case GX_TRIANGLESTRIP: + primitive = GX_TRIANGLESTRIP; + break; + default: + break; + } + populate_pipeline_config(config, primitive, sStreamState->vtxFmt); + const auto info = build_shader_info(config.shaderConfig); + const auto bindGroups = aurora::gfx::gx::build_bind_groups(info, config.shaderConfig, ranges); + const auto pipeline = aurora::gfx::pipeline_ref(config); + aurora::gfx::push_draw_command(aurora::gfx::model::DrawData{ + .pipeline = pipeline, + .vertRange = vertRange, + .idxRange = indexRange, + .dataRanges = ranges, + .uniformRange = build_uniform(info), + .indexCount = static_cast(sStreamState->indices.size()), + .bindGroups = bindGroups, + .dstAlpha = g_gxState.dstAlpha, + }); + // } else { + // aurora::gfx::merge_draw_command(aurora::gfx::model::DrawData{ + // .vertRange = vertRange, + // .idxRange = indexRange, + // .indexCount = static_cast(sStreamState->indices.size()), + // }); + // } lastVertexStart = sStreamState->vertexStart + sStreamState->vertexCount; sStreamState.reset(); } diff --git a/lib/gfx/common.cpp b/lib/gfx/common.cpp index 968b90a..0e12d82 100644 --- a/lib/gfx/common.cpp +++ b/lib/gfx/common.cpp @@ -3,7 +3,6 @@ #include "../internal.hpp" #include "../webgpu/gpu.hpp" #include "model/shader.hpp" -#include "stream/shader.hpp" #include "texture.hpp" #include @@ -11,7 +10,6 @@ #include #include #include -#include #include #include @@ -37,13 +35,11 @@ constexpr uint64_t StagingBufferSize = UniformBufferSize + VertexBufferSize + IndexBufferSize + StorageBufferSize + 
TextureUploadSize; struct ShaderState { - stream::State stream; model::State model; }; struct ShaderDrawCommand { ShaderType type; union { - stream::DrawData stream; model::DrawData model; }; }; @@ -168,10 +164,9 @@ static u32 g_serializedPipelineCount = 0; template static void serialize_pipeline_config(ShaderType type, const PipelineConfig& config) { static_assert(std::has_unique_object_representations_v); - g_serializedPipelines.append(&type, sizeof(type)); - const u32 configSize = sizeof(config); - g_serializedPipelines.append(&configSize, sizeof(configSize)); - g_serializedPipelines.append(&config, configSize); + g_serializedPipelines.append(type); + g_serializedPipelines.append(sizeof(config)); + g_serializedPipelines.append(config); ++g_serializedPipelineCount; } @@ -278,33 +273,19 @@ void resolve_pass(TextureHandle texture, ClipRect rect, bool clear, Vec4 ++g_currentRenderPass; } -template <> -const stream::State& get_state() { - return g_state.stream; -} - -template <> -void push_draw_command(stream::DrawData data) { - push_draw_command(ShaderDrawCommand{.type = ShaderType::Stream, .stream = data}); -} - -template <> -void merge_draw_command(stream::DrawData data) { - auto& last = get_last_draw_command(ShaderType::Stream).data.draw.stream; - CHECK(last.vertRange.offset + last.vertRange.size == data.vertRange.offset, "Invalid vertex merge range: {} -> {}", - last.vertRange.offset + last.vertRange.size, data.vertRange.offset); - CHECK(last.indexRange.offset + last.indexRange.size == data.indexRange.offset, "Invalid index merge range: {} -> {}", - last.indexRange.offset + last.indexRange.size, data.indexRange.offset); - last.vertRange.size += data.vertRange.size; - last.indexRange.size += data.indexRange.size; - last.indexCount += data.indexCount; - ++g_mergedDrawCallCount; -} - -template <> -PipelineRef pipeline_ref(stream::PipelineConfig config) { - return find_pipeline(ShaderType::Stream, config, [=]() { return create_pipeline(g_state.stream, config); }); 
-} +// template <> +// void merge_draw_command(stream::DrawData data) { +// auto& last = get_last_draw_command(ShaderType::Stream).data.draw.stream; +// CHECK(last.vertRange.offset + last.vertRange.size == data.vertRange.offset, "Invalid vertex merge range: {} -> {}", +// last.vertRange.offset + last.vertRange.size, data.vertRange.offset); +// CHECK(last.indexRange.offset + last.indexRange.size == data.indexRange.offset, "Invalid index merge range: {} -> +// {}", +// last.indexRange.offset + last.indexRange.size, data.indexRange.offset); +// last.vertRange.size += data.vertRange.size; +// last.indexRange.size += data.indexRange.size; +// last.indexCount += data.indexCount; +// ++g_mergedDrawCallCount; +// } template <> void push_draw_command(model::DrawData data) { @@ -378,16 +359,6 @@ void load_pipeline_cache() { u32 size = *reinterpret_cast(pipelineCache.data() + offset); offset += sizeof(u32); switch (type) { - case ShaderType::Stream: { - if (size != sizeof(stream::PipelineConfig)) { - break; - } - const auto config = *reinterpret_cast(pipelineCache.data() + offset); - if (config.version != gx::GXPipelineConfigVersion) { - break; - } - find_pipeline(type, config, [=]() { return stream::create_pipeline(g_state.stream, config); }, true); - } break; case ShaderType::Model: { if (size != sizeof(model::PipelineConfig)) { break; @@ -397,9 +368,10 @@ void load_pipeline_cache() { break; } find_pipeline(type, config, [=]() { return model::create_pipeline(g_state.model, config); }, true); - } break; + break; + } default: - Log.warn("Unknown pipeline type {}", static_cast(type)); + Log.warn("Unknown pipeline type {}", underlying(type)); break; } offset += size; @@ -459,7 +431,6 @@ void initialize() { } map_staging_buffer(); - g_state.stream = stream::construct_state(); g_state.model = model::construct_state(); load_pipeline_cache(); @@ -581,6 +552,9 @@ void end_frame(const wgpu::CommandEncoder& cmd) { currentStagingBuffer = (currentStagingBuffer + 1) % 
g_stagingBuffers.size(); map_staging_buffer(); g_currentRenderPass = UINT32_MAX; + for (auto& array : gx::g_gxState.arrays) { + array.cachedRange = {}; + } if (!g_hasPipelineThread) { pipeline_worker(); @@ -612,7 +586,7 @@ void render(wgpu::CommandEncoder& cmd) { .view = webgpu::g_depthBuffer.view, .depthLoadOp = passInfo.clear ? wgpu::LoadOp::Clear : wgpu::LoadOp::Load, .depthStoreOp = wgpu::StoreOp::Store, - .depthClearValue = 1.f, + .depthClearValue = gx::UseReversedZ ? 0.f : 1.f, }; const auto label = fmt::format("Render pass {}", i); const wgpu::RenderPassDescriptor renderPassDescriptor{ @@ -680,7 +654,9 @@ void render_pass(const wgpu::RenderPassEncoder& pass, u32 idx) { switch (cmd.type) { case CommandType::SetViewport: { const auto& vp = cmd.data.setViewport; - pass.SetViewport(vp.left, vp.top, vp.width, vp.height, vp.znear, vp.zfar); + const float minDepth = gx::UseReversedZ ? 1.f - vp.zfar : vp.znear; + const float maxDepth = gx::UseReversedZ ? 1.f - vp.znear : vp.zfar; + pass.SetViewport(vp.left, vp.top, vp.width, vp.height, minDepth, maxDepth); } break; case CommandType::SetScissor: { const auto& sc = cmd.data.setScissor; @@ -694,9 +670,6 @@ void render_pass(const wgpu::RenderPassEncoder& pass, u32 idx) { case CommandType::Draw: { const auto& draw = cmd.data.draw; switch (draw.type) { - case ShaderType::Stream: - stream::render(g_state.stream, draw.stream, pass); - break; case ShaderType::Model: model::render(g_state.model, draw.model, pass); break; diff --git a/lib/gfx/common.hpp b/lib/gfx/common.hpp index 6bde6e9..89f798d 100644 --- a/lib/gfx/common.hpp +++ b/lib/gfx/common.hpp @@ -56,8 +56,7 @@ public: ByteBuffer() noexcept = default; explicit ByteBuffer(size_t size) noexcept : m_data(static_cast(calloc(1, size))), m_length(size), m_capacity(size) {} - explicit ByteBuffer(uint8_t* data, size_t size) noexcept - : m_data(data), m_capacity(size), m_owned(false) {} + explicit ByteBuffer(uint8_t* data, size_t size) noexcept : m_data(data), 
m_capacity(size), m_owned(false) {} ~ByteBuffer() noexcept { if (m_data != nullptr && m_owned) { free(m_data); @@ -98,6 +97,11 @@ public: m_length += size; } + template + void append(const T& obj) { + append(&obj, sizeof(T)); + } + void append_zeroes(size_t size) { resize(m_length + size, true); m_length += size; @@ -179,8 +183,7 @@ struct TextureRef; using TextureHandle = std::shared_ptr; enum class ShaderType : uint8_t { - Stream, - Model, + Model = 1, }; void initialize(); diff --git a/lib/gfx/gx.cpp b/lib/gfx/gx.cpp index edbd69f..ad52148 100644 --- a/lib/gfx/gx.cpp +++ b/lib/gfx/gx.cpp @@ -7,7 +7,6 @@ #include #include -#include using aurora::gfx::gx::g_gxState; static aurora::Module Log("aurora::gx"); @@ -25,7 +24,7 @@ const TextureBind& get_texture(GXTexMapID id) noexcept { return g_gxState.textur static inline wgpu::BlendFactor to_blend_factor(GXBlendFactor fac, bool isDst) { switch (fac) { - DEFAULT_FATAL("invalid blend factor {}", static_cast(fac)); + DEFAULT_FATAL("invalid blend factor {}", underlying(fac)); case GX_BL_ZERO: return wgpu::BlendFactor::Zero; case GX_BL_ONE: @@ -55,21 +54,21 @@ static inline wgpu::BlendFactor to_blend_factor(GXBlendFactor fac, bool isDst) { static inline wgpu::CompareFunction to_compare_function(GXCompare func) { switch (func) { - DEFAULT_FATAL("invalid depth fn {}", static_cast(func)); + DEFAULT_FATAL("invalid depth fn {}", underlying(func)); case GX_NEVER: return wgpu::CompareFunction::Never; case GX_LESS: - return wgpu::CompareFunction::Less; + return UseReversedZ ? wgpu::CompareFunction::Greater : wgpu::CompareFunction::Less; case GX_EQUAL: return wgpu::CompareFunction::Equal; case GX_LEQUAL: - return wgpu::CompareFunction::LessEqual; + return UseReversedZ ? wgpu::CompareFunction::GreaterEqual : wgpu::CompareFunction::LessEqual; case GX_GREATER: - return wgpu::CompareFunction::Greater; + return UseReversedZ ? 
wgpu::CompareFunction::Less : wgpu::CompareFunction::Greater; case GX_NEQUAL: return wgpu::CompareFunction::NotEqual; case GX_GEQUAL: - return wgpu::CompareFunction::GreaterEqual; + return UseReversedZ ? wgpu::CompareFunction::LessEqual : wgpu::CompareFunction::GreaterEqual; case GX_ALWAYS: return wgpu::CompareFunction::Always; } @@ -79,7 +78,7 @@ static inline wgpu::BlendState to_blend_state(GXBlendMode mode, GXBlendFactor sr GXLogicOp op, u32 dstAlpha) { wgpu::BlendComponent colorBlendComponent; switch (mode) { - DEFAULT_FATAL("unsupported blend mode {}", static_cast(mode)); + DEFAULT_FATAL("unsupported blend mode {}", underlying(mode)); case GX_BM_NONE: colorBlendComponent = { .operation = wgpu::BlendOperation::Add, @@ -103,7 +102,7 @@ static inline wgpu::BlendState to_blend_state(GXBlendMode mode, GXBlendFactor sr break; case GX_BM_LOGIC: switch (op) { - DEFAULT_FATAL("unsupported logic op {}", static_cast(op)); + DEFAULT_FATAL("unsupported logic op {}", underlying(op)); case GX_LO_CLEAR: colorBlendComponent = { .operation = wgpu::BlendOperation::Add, @@ -160,7 +159,7 @@ static inline wgpu::ColorWriteMask to_write_mask(bool colorUpdate, bool alphaUpd static inline wgpu::PrimitiveState to_primitive_state(GXPrimitive gx_prim, GXCullMode gx_cullMode) { wgpu::PrimitiveTopology primitive = wgpu::PrimitiveTopology::TriangleList; switch (gx_prim) { - DEFAULT_FATAL("unsupported primitive type {}", static_cast(gx_prim)); + DEFAULT_FATAL("unsupported primitive type {}", underlying(gx_prim)); case GX_TRIANGLES: break; case GX_TRIANGLESTRIP: @@ -169,7 +168,7 @@ static inline wgpu::PrimitiveState to_primitive_state(GXPrimitive gx_prim, GXCul } wgpu::CullMode cullMode = wgpu::CullMode::None; switch (gx_cullMode) { - DEFAULT_FATAL("unsupported cull mode {}", static_cast(gx_cullMode)); + DEFAULT_FATAL("unsupported cull mode {}", underlying(gx_cullMode)); case GX_CULL_FRONT: cullMode = wgpu::CullMode::Front; break; @@ -193,14 +192,6 @@ wgpu::RenderPipeline build_pipeline(const 
PipelineConfig& config, const ShaderIn .format = g_graphicsConfig.depthFormat, .depthWriteEnabled = config.depthUpdate, .depthCompare = to_compare_function(config.depthFunc), - .stencilFront = - wgpu::StencilFaceState{ - .compare = wgpu::CompareFunction::Always, - }, - .stencilBack = - wgpu::StencilFaceState{ - .compare = wgpu::CompareFunction::Always, - }, }; const auto blendState = to_blend_state(config.blendMode, config.blendFacSrc, config.blendFacDst, config.blendOp, config.dstAlpha); @@ -249,25 +240,23 @@ wgpu::RenderPipeline build_pipeline(const PipelineConfig& config, const ShaderIn return g_device.CreateRenderPipeline(&descriptor); } -void populate_pipeline_config(PipelineConfig& config, GXPrimitive primitive) noexcept { +void populate_pipeline_config(PipelineConfig& config, GXPrimitive primitive, GXVtxFmt fmt) noexcept { + const auto& vtxFmt = g_gxState.vtxFmts[fmt]; config.shaderConfig.fogType = g_gxState.fog.type; config.shaderConfig.vtxAttrs = g_gxState.vtxDesc; - int lastIndexedAttr = -1; for (int i = 0; i < GX_VA_MAX_ATTR; ++i) { const auto type = g_gxState.vtxDesc[i]; if (type != GX_INDEX8 && type != GX_INDEX16) { - config.shaderConfig.attrMapping[i] = GX_VA_NULL; + config.shaderConfig.attrMapping[i] = {}; continue; } - const auto& array = g_gxState.arrays[i]; - if (lastIndexedAttr >= 0 && array == g_gxState.arrays[lastIndexedAttr]) { - // Map attribute to previous attribute - config.shaderConfig.attrMapping[i] = config.shaderConfig.attrMapping[lastIndexedAttr]; - } else { - // Map attribute to its own storage - config.shaderConfig.attrMapping[i] = static_cast(i); - } - lastIndexedAttr = i; + // Map attribute to its own storage + config.shaderConfig.attrMapping[i] = StorageConfig { + .attr = static_cast(i), + .cnt = vtxFmt.attrs[i].cnt, + .compType = vtxFmt.attrs[i].type, + .frac = vtxFmt.attrs[i].frac, + }; } config.shaderConfig.tevSwapTable = g_gxState.tevSwapTable; for (u8 i = 0; i < g_gxState.numTevStages; ++i) { @@ -328,14 +317,14 @@ void 
populate_pipeline_config(PipelineConfig& config, GXPrimitive primitive) noe Range build_uniform(const ShaderInfo& info) noexcept { auto [buf, range] = map_uniform(info.uniformSize); { - buf.append(&g_gxState.pnMtx[g_gxState.currentPnMtx], 128); - buf.append(&g_gxState.proj, 64); + buf.append(g_gxState.pnMtx[g_gxState.currentPnMtx]); + buf.append(g_gxState.proj); } for (int i = 0; i < info.loadsTevReg.size(); ++i) { if (!info.loadsTevReg.test(i)) { continue; } - buf.append(&g_gxState.colorRegs[i], 16); + buf.append(g_gxState.colorRegs[i]); } bool lightingEnabled = false; for (int i = 0; i < info.sampledColorChannels.size(); ++i) { @@ -352,11 +341,10 @@ Range build_uniform(const ShaderInfo& info) noexcept { if (lightingEnabled) { // Lights static_assert(sizeof(g_gxState.lights) == 80 * GX::MaxLights); - buf.append(&g_gxState.lights, 80 * GX::MaxLights); + buf.append(g_gxState.lights); // Light state for all channels for (int i = 0; i < 4; ++i) { - u32 lightState = g_gxState.colorChannelState[i].lightMask.to_ulong(); - buf.append(&lightState, 4); + buf.append(g_gxState.colorChannelState[i].lightMask.to_ulong()); } } for (int i = 0; i < info.sampledColorChannels.size(); ++i) { @@ -366,25 +354,25 @@ Range build_uniform(const ShaderInfo& info) noexcept { const auto& ccc = g_gxState.colorChannelConfig[i * 2]; const auto& ccs = g_gxState.colorChannelState[i * 2]; if (ccc.lightingEnabled && ccc.ambSrc == GX_SRC_REG) { - buf.append(&ccs.ambColor, 16); + buf.append(ccs.ambColor); } if (ccc.matSrc == GX_SRC_REG) { - buf.append(&ccs.matColor, 16); + buf.append(ccs.matColor); } const auto& ccca = g_gxState.colorChannelConfig[i * 2 + 1]; const auto& ccsa = g_gxState.colorChannelState[i * 2 + 1]; if (ccca.lightingEnabled && ccca.ambSrc == GX_SRC_REG) { - buf.append(&ccsa.ambColor, 16); + buf.append(ccsa.ambColor); } if (ccca.matSrc == GX_SRC_REG) { - buf.append(&ccsa.matColor, 16); + buf.append(ccsa.matColor); } } for (int i = 0; i < info.sampledKColors.size(); ++i) { if 
(!info.sampledKColors.test(i)) { continue; } - buf.append(&g_gxState.kcolors[i], 16); + buf.append(g_gxState.kcolors[i]); } for (int i = 0; i < info.usesTexMtx.size(); ++i) { if (!info.usesTexMtx.test(i)) { @@ -392,26 +380,16 @@ Range build_uniform(const ShaderInfo& info) noexcept { } const auto& state = g_gxState; switch (info.texMtxTypes[i]) { - DEFAULT_FATAL("unhandled tex mtx type {}", static_cast(info.texMtxTypes[i])); + DEFAULT_FATAL("unhandled tex mtx type {}", underlying(info.texMtxTypes[i])); case GX_TG_MTX2x4: - if (std::holds_alternative>(state.texMtxs[i])) { - buf.append(&std::get>(state.texMtxs[i]), 32); - } else if (std::holds_alternative>(g_gxState.texMtxs[i])) { - // TODO: SMB hits this? - Mat4x2 mtx{ - {1.f, 0.f}, - {0.f, 1.f}, - {0.f, 0.f}, - {0.f, 0.f}, - }; - buf.append(&mtx, 32); + if (std::holds_alternative>(state.texMtxs[i])) { + buf.append(std::get>(state.texMtxs[i])); } else UNLIKELY FATAL("expected 2x4 mtx in idx {}", i); break; case GX_TG_MTX3x4: - if (std::holds_alternative>(g_gxState.texMtxs[i])) { - const auto& mat = std::get>(g_gxState.texMtxs[i]); - buf.append(&mat, 64); + if (std::holds_alternative>(g_gxState.texMtxs[i])) { + buf.append(std::get>(g_gxState.texMtxs[i])); } else UNLIKELY FATAL("expected 3x4 mtx in idx {}", i); break; @@ -421,18 +399,11 @@ Range build_uniform(const ShaderInfo& info) noexcept { if (!info.usesPTTexMtx.test(i)) { continue; } - buf.append(&g_gxState.ptTexMtxs[i], 64); + buf.append(g_gxState.ptTexMtxs[i]); } if (info.usesFog) { const auto& state = g_gxState.fog; - struct Fog { - Vec4 color = state.color; - float a = 0.f; - float b = 0.5f; - float c = 0.f; - float pad = FLT_MAX; - } fog{}; - static_assert(sizeof(Fog) == 32); + Fog fog{.color = state.color}; if (state.nearZ != state.farZ && state.startZ != state.endZ) { const float depthRange = state.farZ - state.nearZ; const float fogRange = state.endZ - state.startZ; @@ -440,7 +411,7 @@ Range build_uniform(const ShaderInfo& info) noexcept { fog.b = 
state.farZ / depthRange; fog.c = state.startZ / fogRange; } - buf.append(&fog, 32); + buf.append(fog); } for (int i = 0; i < info.sampledTextures.size(); ++i) { if (!info.sampledTextures.test(i)) { @@ -448,7 +419,7 @@ Range build_uniform(const ShaderInfo& info) noexcept { } const auto& tex = get_texture(static_cast(i)); CHECK(tex, "unbound texture {}", i); - buf.append(&tex.texObj.lodBias, 4); + buf.append(tex.texObj.lodBias); } g_gxState.stateDirty = false; return range; @@ -564,7 +535,7 @@ GXBindGroupLayouts build_bind_group_layouts(const ShaderInfo& info, const Shader }; u32 bindIdx = 1; for (int i = 0; i < GX_VA_MAX_ATTR; ++i) { - if (config.attrMapping[i] == static_cast(i)) { + if (config.attrMapping[i].attr == static_cast(i)) { uniformLayoutEntries[bindIdx] = wgpu::BindGroupLayoutEntry{ .binding = bindIdx, .visibility = wgpu::ShaderStage::Vertex, @@ -688,7 +659,7 @@ void shutdown() noexcept { static wgpu::AddressMode wgpu_address_mode(GXTexWrapMode mode) { switch (mode) { - DEFAULT_FATAL("invalid wrap mode {}", static_cast(mode)); + DEFAULT_FATAL("invalid wrap mode {}", underlying(mode)); case GX_CLAMP: return wgpu::AddressMode::ClampToEdge; case GX_REPEAT: @@ -735,8 +706,6 @@ wgpu::SamplerDescriptor TextureBind::get_descriptor() const noexcept { .magFilter = wgpu::FilterMode::Nearest, .minFilter = wgpu::FilterMode::Nearest, .mipmapFilter = wgpu::MipmapFilterMode::Nearest, - .lodMinClamp = 0.f, - .lodMaxClamp = 1000.f, .maxAnisotropy = 1, }; } @@ -750,8 +719,6 @@ wgpu::SamplerDescriptor TextureBind::get_descriptor() const noexcept { .magFilter = magFilter, .minFilter = minFilter, .mipmapFilter = mipFilter, - .lodMinClamp = 0.f, - .lodMaxClamp = 1000.f, .maxAnisotropy = wgpu_aniso(texObj.maxAniso), }; } diff --git a/lib/gfx/gx.hpp b/lib/gfx/gx.hpp index 40094fd..ba4a170 100644 --- a/lib/gfx/gx.hpp +++ b/lib/gfx/gx.hpp @@ -46,6 +46,11 @@ constexpr float GX_LARGE_NUMBER = -1048576.0f; #endif namespace aurora::gfx::gx { +constexpr bool EnableNormalVisualization = 
false; +constexpr bool EnableDebugPrints = false; +constexpr bool UsePerPixelLighting = true; +constexpr bool UseReversedZ = true; + constexpr u32 MaxTextures = GX_MAX_TEXMAP; constexpr u32 MaxTluts = 20; constexpr u32 MaxTevStages = GX_MAX_TEVSTAGE; @@ -144,8 +149,7 @@ struct ColorChannelState { Vec4 ambColor; GX::LightMask lightMask; }; -// Mat4x4 used instead of Mat4x3 for padding purposes -using TexMtxVariant = std::variant, Mat4x4>; +using TexMtxVariant = std::variant, Mat3x4>; struct TcgConfig { GXTexGenType type = GX_TG_MTX2x4; GXTexGenSrc src = GX_MAX_TEXGENSRC; @@ -213,10 +217,10 @@ struct VtxFmt { std::array attrs; }; struct PnMtx { - Mat4x4 pos; - Mat4x4 nrm; + Mat3x4 pos; + Mat3x4 nrm; }; -static_assert(sizeof(PnMtx) == sizeof(Mat4x4) * 2); +static_assert(sizeof(PnMtx) == sizeof(Mat3x4) * 2); struct Light { Vec4 pos{0.f, 0.f, 0.f}; Vec4 dir{0.f, 0.f, 0.f}; @@ -230,6 +234,14 @@ struct Light { bool operator!=(const Light& rhs) const { return !(*this == rhs); } }; static_assert(sizeof(Light) == 80); +struct Fog { + Vec4 color; + float a = 0.f; + float b = 0.5f; + float c = 0.f; + float pad = FLT_MAX; +}; +static_assert(sizeof(Fog) == 32); struct AttrArray { const void* data; u32 size; @@ -245,7 +257,6 @@ struct GXState { std::array pnMtx; u32 currentPnMtx; Mat4x4 proj; - Mat4x4 origProj; // for GXGetProjectionv GXProjectionType projType; // for GXGetProjectionv FogState fog; GXCullMode cullMode = GX_CULL_BACK; @@ -266,7 +277,7 @@ struct GXState { std::array textures; std::array tluts; std::array texMtxs; - std::array, MaxPTTexMtx> ptTexMtxs; + std::array, MaxPTTexMtx> ptTexMtxs; std::array tcgs; std::array vtxDesc; std::array vtxFmts; @@ -345,11 +356,18 @@ struct TextureConfig { bool operator==(const TextureConfig& rhs) const { return memcmp(this, &rhs, sizeof(*this)) == 0; } }; static_assert(std::has_unique_object_representations_v); +struct StorageConfig { + GXAttr attr = GX_VA_NULL; + GXCompCnt cnt = static_cast(0xFF); + GXCompType compType = 
static_cast(0xFF); + u8 frac = 0; + std::array pad{}; +}; struct ShaderConfig { GXFogType fogType; std::array vtxAttrs; // Mapping for indexed attributes -> storage buffer - std::array attrMapping; + std::array attrMapping; std::array tevSwapTable; std::array tevStages; u32 tevStageCount = 0; @@ -363,7 +381,7 @@ struct ShaderConfig { }; static_assert(std::has_unique_object_representations_v); -constexpr u32 GXPipelineConfigVersion = 4; +constexpr u32 GXPipelineConfigVersion = 5; struct PipelineConfig { u32 version = GXPipelineConfigVersion; ShaderConfig shaderConfig; @@ -405,7 +423,7 @@ struct ShaderInfo { struct BindGroupRanges { std::array vaRanges{}; }; -void populate_pipeline_config(PipelineConfig& config, GXPrimitive primitive) noexcept; +void populate_pipeline_config(PipelineConfig& config, GXPrimitive primitive, GXVtxFmt fmt) noexcept; wgpu::RenderPipeline build_pipeline(const PipelineConfig& config, const ShaderInfo& info, ArrayRef vtxBuffers, wgpu::ShaderModule shader, const char* label) noexcept; diff --git a/lib/gfx/gx_fmt.hpp b/lib/gfx/gx_fmt.hpp index e1e6ae7..a6ff7dc 100644 --- a/lib/gfx/gx_fmt.hpp +++ b/lib/gfx/gx_fmt.hpp @@ -1,3 +1,7 @@ +#pragma once + +#include "../internal.hpp" + #include #include #include @@ -25,7 +29,7 @@ inline std::string format_as(const GXTevOp& op) { case GX_TEV_COMP_RGB8_EQ: return "GX_TEV_COMP_RGB8_EQ"; default: - return fmt::format("GXTevOp({})", static_cast(op)); + return fmt::format("GXTevOp({})", underlying(op)); } } @@ -64,7 +68,7 @@ inline std::string format_as(const GXTevColorArg& arg) { case GX_CC_ZERO: return "GX_CC_ZERO"; default: - return fmt::format("GXTevColorArg({})", static_cast(arg)); + return fmt::format("GXTevColorArg({})", underlying(arg)); } } @@ -87,7 +91,7 @@ inline std::string format_as(const GXTevAlphaArg& arg) { case GX_CA_ZERO: return "GX_CA_ZERO"; default: - return fmt::format("GXTevAlphaArg({})", static_cast(arg)); + return fmt::format("GXTevAlphaArg({})", underlying(arg)); } } @@ -118,7 +122,7 
@@ inline std::string format_as(const GXTexGenSrc& src) { case GX_TG_TEX7: return "GX_TG_TEX7"; default: - return fmt::format("GXTexGenSrc({})", static_cast(src)); + return fmt::format("GXTexGenSrc({})", underlying(src)); } } @@ -133,7 +137,7 @@ inline std::string format_as(const GXTexGenType& type) { case GX_TG_BUMP1: return "GX_TG_BUMP1"; default: - return fmt::format("GXTexGenType({})", static_cast(type)); + return fmt::format("GXTexGenType({})", underlying(type)); } } @@ -146,7 +150,7 @@ inline std::string format_as(const GXTevBias& bias) { case GX_TB_SUBHALF: return "GX_TB_SUBHALF"; default: - return fmt::format("GXTevBias({})", static_cast(bias)); + return fmt::format("GXTevBias({})", underlying(bias)); } } @@ -161,7 +165,7 @@ inline std::string format_as(const GXTevScale& scale) { case GX_CS_DIVIDE_2: return "GX_CS_DIVIDE_2"; default: - return fmt::format("GXTevScale({})", static_cast(scale)); + return fmt::format("GXTevScale({})", underlying(scale)); } } @@ -176,7 +180,7 @@ inline std::string format_as(const GXTevRegID& reg) { case GX_TEVREG2: return "GX_TEVREG2"; default: - return fmt::format("GXTevRegID({})", static_cast(reg)); + return fmt::format("GXTevRegID({})", underlying(reg)); } } @@ -231,7 +235,7 @@ inline std::string format_as(const GXTevKColorSel& sel) { case GX_TEV_KCSEL_K3_A: return "GX_TEV_KCSEL_K3_A"; default: - return fmt::format("GXTevKColorSel({})", static_cast(sel)); + return fmt::format("GXTevKColorSel({})", underlying(sel)); } } @@ -286,7 +290,7 @@ inline std::string format_as(const GXTevKAlphaSel& sel) { case GX_TEV_KASEL_K3_A: return "GX_TEV_KASEL_K3_A"; default: - return fmt::format("GXTevKAlphaSel({})", static_cast(sel)); + return fmt::format("GXTevKAlphaSel({})", underlying(sel)); } } @@ -313,7 +317,7 @@ inline std::string format_as(const GXTexMapID& id) { case GX_TEX_DISABLE: return "GX_TEX_DISABLE"; default: - return fmt::format("GXTexMapID({})", static_cast(id)); + return fmt::format("GXTexMapID({})", underlying(id)); } } @@ 
-340,7 +344,7 @@ inline std::string format_as(const GXChannelID& id) { case GX_COLOR_NULL: return "GX_COLOR_NULL"; default: - return fmt::format("GXChannelID({})", static_cast(id)); + return fmt::format("GXChannelID({})", underlying(id)); } } @@ -351,7 +355,7 @@ inline std::string format_as(const GXColorSrc& src) { case GX_SRC_VTX: return "GX_SRC_VTX"; default: - return fmt::format("GXColorSrc({})", static_cast(src)); + return fmt::format("GXColorSrc({})", underlying(src)); } } @@ -380,7 +384,7 @@ inline std::string format_as(const GXTexMtx& mtx) { case GX_IDENTITY: return "GX_IDENTITY"; default: - return fmt::format("GXTexMtx({})", static_cast(mtx)); + return fmt::format("GXTexMtx({})", underlying(mtx)); } } @@ -429,7 +433,7 @@ inline std::string format_as(const GXPTTexMtx& mtx) { case GX_PTIDENTITY: return "GX_PTIDENTITY"; default: - return fmt::format("GXPTTexMtx({})", static_cast(mtx)); + return fmt::format("GXPTTexMtx({})", underlying(mtx)); } } @@ -452,7 +456,7 @@ inline std::string format_as(const GXCompare& comp) { case GX_ALWAYS: return "GX_ALWAYS"; default: - return fmt::format("GXCompare({})", static_cast(comp)); + return fmt::format("GXCompare({})", underlying(comp)); } } @@ -467,7 +471,7 @@ inline std::string format_as(const GXAlphaOp& op) { case GX_AOP_XNOR: return "GX_AOP_XNOR"; default: - return fmt::format("GXAlphaOp({})", static_cast(op)); + return fmt::format("GXAlphaOp({})", underlying(op)); } } @@ -496,7 +500,7 @@ inline std::string format_as(const GXFogType& type) { case GX_FOG_ORTHO_REVEXP2: return "GX_FOG_ORTHO_REVEXP2"; default: - return fmt::format("GXFogType({})", static_cast(type)); + return fmt::format("GXFogType({})", underlying(type)); } } @@ -521,6 +525,158 @@ inline std::string format_as(const GXTexCoordID& id) { case GX_TEXCOORD_NULL: return "GX_TEXCOORD_NULL"; default: - return fmt::format("GXTexCoordID({})", static_cast(id)); + return fmt::format("GXTexCoordID({})", underlying(id)); + } +} + +inline std::string format_as(const 
GXPrimitive& prim) { + switch (prim) { + case GX_QUADS: + return "GX_QUADS"; + case GX_TRIANGLES: + return "GX_TRIANGLES"; + case GX_TRIANGLESTRIP: + return "GX_TRIANGLESTRIP"; + case GX_TRIANGLEFAN: + return "GX_TRIANGLEFAN"; + case GX_LINES: + return "GX_LINES"; + case GX_LINESTRIP: + return "GX_LINESTRIP"; + case GX_POINTS: + return "GX_POINTS"; + default: + return fmt::format("GXPrimitive({})", underlying(prim)); + } +} + +inline std::string format_as(const GXAttr& attr) { + switch (attr) { + case GX_VA_PNMTXIDX: + return "GX_VA_PNMTXIDX"; + case GX_VA_TEX0MTXIDX: + return "GX_VA_TEX0MTXIDX"; + case GX_VA_TEX1MTXIDX: + return "GX_VA_TEX1MTXIDX"; + case GX_VA_TEX2MTXIDX: + return "GX_VA_TEX2MTXIDX"; + case GX_VA_TEX3MTXIDX: + return "GX_VA_TEX3MTXIDX"; + case GX_VA_TEX4MTXIDX: + return "GX_VA_TEX4MTXIDX"; + case GX_VA_TEX5MTXIDX: + return "GX_VA_TEX5MTXIDX"; + case GX_VA_TEX6MTXIDX: + return "GX_VA_TEX6MTXIDX"; + case GX_VA_TEX7MTXIDX: + return "GX_VA_TEX7MTXIDX"; + case GX_VA_POS: + return "GX_VA_POS"; + case GX_VA_NRM: + return "GX_VA_NRM"; + case GX_VA_CLR0: + return "GX_VA_CLR0"; + case GX_VA_CLR1: + return "GX_VA_CLR1"; + case GX_VA_TEX0: + return "GX_VA_TEX0"; + case GX_VA_TEX1: + return "GX_VA_TEX1"; + case GX_VA_TEX2: + return "GX_VA_TEX2"; + case GX_VA_TEX3: + return "GX_VA_TEX3"; + case GX_VA_TEX4: + return "GX_VA_TEX4"; + case GX_VA_TEX5: + return "GX_VA_TEX5"; + case GX_VA_TEX6: + return "GX_VA_TEX6"; + case GX_VA_TEX7: + return "GX_VA_TEX7"; + case GX_POS_MTX_ARRAY: + return "GX_POS_MTX_ARRAY"; + case GX_NRM_MTX_ARRAY: + return "GX_NRM_MTX_ARRAY"; + case GX_TEX_MTX_ARRAY: + return "GX_TEX_MTX_ARRAY"; + case GX_LIGHT_ARRAY: + return "GX_LIGHT_ARRAY"; + case GX_VA_NBT: + return "GX_VA_NBT"; + case GX_VA_NULL: + return "GX_VA_NULL"; + default: + return fmt::format("GXAttr({})", underlying(attr)); + } +} + +inline std::string format_as(const GXCompCnt& cnt) { + switch (cnt) { + case GX_POS_XY: + return "GX_POS_XY|GX_NRM_XYZ|GX_CLR_RGB|GX_TEX_S"; + case 
GX_POS_XYZ: + return "GX_POS_XYZ|GX_NRM_NBT|GX_CLR_RGBA|GX_TEX_ST"; + case GX_NRM_NBT3: + return "GX_NRM_NBT3"; + default: + return fmt::format("GXCompCnt({})", underlying(cnt)); + } +} + +inline std::string format_as(const GXCompType& type) { + switch (type) { + case GX_U8: + return "GX_U8|GX_RGB565"; + case GX_S8: + return "GX_S8|GX_RGB8"; + case GX_U16: + return "GX_U16|GX_RGBX8"; + case GX_S16: + return "GX_S16|GX_RGBA4"; + case GX_F32: + return "GX_F32|GX_RGBA6"; + case GX_RGBA8: + return "GX_RGBA8"; + default: + return fmt::format("GXCompType({})", underlying(type)); + } +} + +inline std::string format_as(const GXAttrType& type) { + switch (type) { + case GX_NONE: + return "GX_NONE"; + case GX_DIRECT: + return "GX_DIRECT"; + case GX_INDEX8: + return "GX_INDEX8"; + case GX_INDEX16: + return "GX_INDEX16"; + default: + return fmt::format("GXAttrType({})", underlying(type)); + } +} + +inline std::string format_as(const GXVtxFmt& fmt) { + switch (fmt) { + case GX_VTXFMT0: + return "GX_VTXFMT0"; + case GX_VTXFMT1: + return "GX_VTXFMT1"; + case GX_VTXFMT2: + return "GX_VTXFMT2"; + case GX_VTXFMT3: + return "GX_VTXFMT3"; + case GX_VTXFMT4: + return "GX_VTXFMT4"; + case GX_VTXFMT5: + return "GX_VTXFMT5"; + case GX_VTXFMT6: + return "GX_VTXFMT6"; + case GX_VTXFMT7: + return "GX_VTXFMT7"; + default: + return fmt::format("GXVtxFmt({})", underlying(fmt)); } } diff --git a/lib/gfx/gx_shader.cpp b/lib/gfx/gx_shader.cpp index e5e5302..ad87a00 100644 --- a/lib/gfx/gx_shader.cpp +++ b/lib/gfx/gx_shader.cpp @@ -1,5 +1,6 @@ #include "common.hpp" +#include "../internal.hpp" #include "../webgpu/gpu.hpp" #include "gx.hpp" #include "gx_fmt.hpp" @@ -10,10 +11,6 @@ #include #include -constexpr bool EnableNormalVisualization = false; -constexpr bool EnableDebugPrints = false; -constexpr bool UsePerPixelLighting = true; - namespace aurora::gfx::gx { using namespace fmt::literals; using namespace std::string_literals; @@ -140,44 +137,44 @@ static bool formatHasAlpha(u32 format) { static 
std::string color_arg_reg(GXTevColorArg arg, size_t stageIdx, const ShaderConfig& config, const TevStage& stage) { switch (arg) { - DEFAULT_FATAL("invalid color arg {}", static_cast(arg)); + DEFAULT_FATAL("invalid color arg {}", underlying(arg)); case GX_CC_CPREV: return "prev.rgb"; case GX_CC_APREV: - return "vec3(prev.a)"; + return "vec3f(prev.a)"; case GX_CC_C0: return "tevreg0.rgb"; case GX_CC_A0: - return "vec3(tevreg0.a)"; + return "vec3f(tevreg0.a)"; case GX_CC_C1: return "tevreg1.rgb"; case GX_CC_A1: - return "vec3(tevreg1.a)"; + return "vec3f(tevreg1.a)"; case GX_CC_C2: return "tevreg2.rgb"; case GX_CC_A2: - return "vec3(tevreg2.a)"; + return "vec3f(tevreg2.a)"; case GX_CC_TEXC: { CHECK(stage.texMapId != GX_TEXMAP_NULL, "unmapped texture for stage {}", stageIdx); CHECK(stage.texMapId >= GX_TEXMAP0 && stage.texMapId <= GX_TEXMAP7, "invalid texture {} for stage {}", - static_cast(stage.texMapId), stageIdx); + underlying(stage.texMapId), stageIdx); const auto& swap = config.tevSwapTable[stage.tevSwapTex]; return fmt::format("sampled{}.{}{}{}", stageIdx, chan_comp(swap.red), chan_comp(swap.green), chan_comp(swap.blue)); } case GX_CC_TEXA: { CHECK(stage.texMapId != GX_TEXMAP_NULL, "unmapped texture for stage {}", stageIdx); CHECK(stage.texMapId >= GX_TEXMAP0 && stage.texMapId <= GX_TEXMAP7, "invalid texture {} for stage {}", - static_cast(stage.texMapId), stageIdx); + underlying(stage.texMapId), stageIdx); const auto& swap = config.tevSwapTable[stage.tevSwapTex]; - return fmt::format("vec3(sampled{}.{})", stageIdx, chan_comp(swap.alpha)); + return fmt::format("vec3f(sampled{}.{})", stageIdx, chan_comp(swap.alpha)); } case GX_CC_RASC: { CHECK(stage.channelId != GX_COLOR_NULL, "unmapped color channel for stage {}", stageIdx); if (stage.channelId == GX_COLOR_ZERO) { - return "vec3(0.0)"; + return "vec3f(0.0)"; } CHECK(stage.channelId >= GX_COLOR0A0 && stage.channelId <= GX_COLOR1A1, "invalid color channel {} for stage {}", - static_cast(stage.channelId), 
stageIdx); + underlying(stage.channelId), stageIdx); u32 idx = stage.channelId - GX_COLOR0A0; const auto& swap = config.tevSwapTable[stage.tevSwapRas]; return fmt::format("rast{}.{}{}{}", idx, chan_comp(swap.red), chan_comp(swap.green), chan_comp(swap.blue)); @@ -185,37 +182,37 @@ static std::string color_arg_reg(GXTevColorArg arg, size_t stageIdx, const Shade case GX_CC_RASA: { CHECK(stage.channelId != GX_COLOR_NULL, "unmapped color channel for stage {}", stageIdx); if (stage.channelId == GX_COLOR_ZERO) { - return "vec3(0.0)"; + return "vec3f(0.0)"; } CHECK(stage.channelId >= GX_COLOR0A0 && stage.channelId <= GX_COLOR1A1, "invalid color channel {} for stage {}", - static_cast(stage.channelId), stageIdx); + underlying(stage.channelId), stageIdx); u32 idx = stage.channelId - GX_COLOR0A0; const auto& swap = config.tevSwapTable[stage.tevSwapRas]; - return fmt::format("vec3(rast{}.{})", idx, chan_comp(swap.alpha)); + return fmt::format("vec3f(rast{}.{})", idx, chan_comp(swap.alpha)); } case GX_CC_ONE: - return "vec3(1.0)"; + return "vec3f(1.0)"; case GX_CC_HALF: - return "vec3(0.5)"; + return "vec3f(0.5)"; case GX_CC_KONST: { switch (stage.kcSel) { - DEFAULT_FATAL("invalid kcSel {}", static_cast(stage.kcSel)); + DEFAULT_FATAL("invalid kcSel {}", underlying(stage.kcSel)); case GX_TEV_KCSEL_8_8: - return "vec3(1.0)"; + return "vec3f(1.0)"; case GX_TEV_KCSEL_7_8: - return "vec3(7.0/8.0)"; + return "vec3f(7.0/8.0)"; case GX_TEV_KCSEL_6_8: - return "vec3(6.0/8.0)"; + return "vec3f(6.0/8.0)"; case GX_TEV_KCSEL_5_8: - return "vec3(5.0/8.0)"; + return "vec3f(5.0/8.0)"; case GX_TEV_KCSEL_4_8: - return "vec3(4.0/8.0)"; + return "vec3f(4.0/8.0)"; case GX_TEV_KCSEL_3_8: - return "vec3(3.0/8.0)"; + return "vec3f(3.0/8.0)"; case GX_TEV_KCSEL_2_8: - return "vec3(2.0/8.0)"; + return "vec3f(2.0/8.0)"; case GX_TEV_KCSEL_1_8: - return "vec3(1.0/8.0)"; + return "vec3f(1.0/8.0)"; case GX_TEV_KCSEL_K0: return "ubuf.kcolor0.rgb"; case GX_TEV_KCSEL_K1: @@ -225,41 +222,41 @@ static std::string 
color_arg_reg(GXTevColorArg arg, size_t stageIdx, const Shade case GX_TEV_KCSEL_K3: return "ubuf.kcolor3.rgb"; case GX_TEV_KCSEL_K0_R: - return "vec3(ubuf.kcolor0.r)"; + return "vec3f(ubuf.kcolor0.r)"; case GX_TEV_KCSEL_K1_R: - return "vec3(ubuf.kcolor1.r)"; + return "vec3f(ubuf.kcolor1.r)"; case GX_TEV_KCSEL_K2_R: - return "vec3(ubuf.kcolor2.r)"; + return "vec3f(ubuf.kcolor2.r)"; case GX_TEV_KCSEL_K3_R: - return "vec3(ubuf.kcolor3.r)"; + return "vec3f(ubuf.kcolor3.r)"; case GX_TEV_KCSEL_K0_G: - return "vec3(ubuf.kcolor0.g)"; + return "vec3f(ubuf.kcolor0.g)"; case GX_TEV_KCSEL_K1_G: - return "vec3(ubuf.kcolor1.g)"; + return "vec3f(ubuf.kcolor1.g)"; case GX_TEV_KCSEL_K2_G: - return "vec3(ubuf.kcolor2.g)"; + return "vec3f(ubuf.kcolor2.g)"; case GX_TEV_KCSEL_K3_G: - return "vec3(ubuf.kcolor3.g)"; + return "vec3f(ubuf.kcolor3.g)"; case GX_TEV_KCSEL_K0_B: - return "vec3(ubuf.kcolor0.b)"; + return "vec3f(ubuf.kcolor0.b)"; case GX_TEV_KCSEL_K1_B: - return "vec3(ubuf.kcolor1.b)"; + return "vec3f(ubuf.kcolor1.b)"; case GX_TEV_KCSEL_K2_B: - return "vec3(ubuf.kcolor2.b)"; + return "vec3f(ubuf.kcolor2.b)"; case GX_TEV_KCSEL_K3_B: - return "vec3(ubuf.kcolor3.b)"; + return "vec3f(ubuf.kcolor3.b)"; case GX_TEV_KCSEL_K0_A: - return "vec3(ubuf.kcolor0.a)"; + return "vec3f(ubuf.kcolor0.a)"; case GX_TEV_KCSEL_K1_A: - return "vec3(ubuf.kcolor1.a)"; + return "vec3f(ubuf.kcolor1.a)"; case GX_TEV_KCSEL_K2_A: - return "vec3(ubuf.kcolor2.a)"; + return "vec3f(ubuf.kcolor2.a)"; case GX_TEV_KCSEL_K3_A: - return "vec3(ubuf.kcolor3.a)"; + return "vec3f(ubuf.kcolor3.a)"; } } case GX_CC_ZERO: - return "vec3(0.0)"; + return "vec3f(0.0)"; } } @@ -334,7 +331,7 @@ static void alpha_arg_reg_info(GXTevAlphaArg arg, const TevStage& stage, ShaderI static std::string alpha_arg_reg(GXTevAlphaArg arg, size_t stageIdx, const ShaderConfig& config, const TevStage& stage) { switch (arg) { - DEFAULT_FATAL("invalid alpha arg {}", static_cast(arg)); + DEFAULT_FATAL("invalid alpha arg {}", underlying(arg)); case 
GX_CA_APREV: return "prev.a"; case GX_CA_A0: @@ -346,7 +343,7 @@ static std::string alpha_arg_reg(GXTevAlphaArg arg, size_t stageIdx, const Shade case GX_CA_TEXA: { CHECK(stage.texMapId != GX_TEXMAP_NULL, "unmapped texture for stage {}", stageIdx); CHECK(stage.texMapId >= GX_TEXMAP0 && stage.texMapId <= GX_TEXMAP7, "invalid texture {} for stage {}", - static_cast(stage.texMapId), stageIdx); + underlying(stage.texMapId), stageIdx); const auto& swap = config.tevSwapTable[stage.tevSwapTex]; return fmt::format("sampled{}.{}", stageIdx, chan_comp(swap.alpha)); } @@ -356,14 +353,14 @@ static std::string alpha_arg_reg(GXTevAlphaArg arg, size_t stageIdx, const Shade return "0.0"; } CHECK(stage.channelId >= GX_COLOR0A0 && stage.channelId <= GX_COLOR1A1, "invalid color channel {} for stage {}", - static_cast(stage.channelId), stageIdx); + underlying(stage.channelId), stageIdx); u32 idx = stage.channelId - GX_COLOR0A0; const auto& swap = config.tevSwapTable[stage.tevSwapRas]; return fmt::format("rast{}.{}", idx, chan_comp(swap.alpha)); } case GX_CA_KONST: { switch (stage.kaSel) { - DEFAULT_FATAL("invalid kaSel {}", static_cast(stage.kaSel)); + DEFAULT_FATAL("invalid kaSel {}", underlying(stage.kaSel)); case GX_TEV_KASEL_8_8: return "1.0"; case GX_TEV_KASEL_7_8: @@ -421,7 +418,7 @@ static std::string alpha_arg_reg(GXTevAlphaArg arg, size_t stageIdx, const Shade static std::string_view tev_op(GXTevOp op) { switch (op) { - DEFAULT_FATAL("unimplemented tev op {}", static_cast(op)); + DEFAULT_FATAL("unimplemented tev op {}", underlying(op)); case GX_TEV_ADD: return ""sv; case GX_TEV_SUB: @@ -431,7 +428,7 @@ static std::string_view tev_op(GXTevOp op) { static std::string_view tev_bias(GXTevBias bias) { switch (bias) { - DEFAULT_FATAL("invalid tev bias {}", static_cast(bias)); + DEFAULT_FATAL("invalid tev bias {}", underlying(bias)); case GX_TB_ZERO: return ""sv; case GX_TB_ADDHALF: @@ -444,7 +441,7 @@ static std::string_view tev_bias(GXTevBias bias) { static std::string 
alpha_compare(GXCompare comp, u8 ref, bool& valid) { const float fref = ref / 255.f; switch (comp) { - DEFAULT_FATAL("invalid alpha comp {}", static_cast(comp)); + DEFAULT_FATAL("invalid alpha comp {}", underlying(comp)); case GX_NEVER: return "false"s; case GX_LESS: @@ -467,7 +464,7 @@ static std::string alpha_compare(GXCompare comp, u8 ref, bool& valid) { static std::string_view tev_scale(GXTevScale scale) { switch (scale) { - DEFAULT_FATAL("invalid tev scale {}", static_cast(scale)); + DEFAULT_FATAL("invalid tev scale {}", underlying(scale)); case GX_CS_SCALE_1: return ""sv; case GX_CS_SCALE_2: @@ -484,9 +481,9 @@ static inline std::string vtx_attr(const ShaderConfig& config, GXAttr attr) { if (type == GX_NONE) { if (attr == GX_VA_NRM) { // Default normal - return "vec3(1.0, 0.0, 0.0)"s; + return "vec3f(1.0, 0.0, 0.0)"s; } - UNLIKELY FATAL("unmapped vtx attr {}", static_cast(attr)); + UNLIKELY FATAL("unmapped vtx attr {}", underlying(attr)); } if (attr == GX_VA_POS) { return "in_pos"s; @@ -502,7 +499,7 @@ static inline std::string vtx_attr(const ShaderConfig& config, GXAttr attr) { const auto idx = attr - GX_VA_TEX0; return fmt::format("in_tex{}_uv", idx); } - UNLIKELY FATAL("unhandled vtx attr {}", static_cast(attr)); + UNLIKELY FATAL("unhandled vtx attr {}", underlying(attr)); } static inline std::string texture_conversion(const TextureConfig& tex, u32 stageIdx, u32 texMapId) { @@ -520,7 +517,7 @@ static inline std::string texture_conversion(const TextureConfig& tex, u32 stage // FIXME HACK if (!is_palette_format(tex.loadFmt)) { // Perform intensity conversion - out += fmt::format("\n sampled{0} = vec4(intensityF32(sampled{0}.rgb), 0.f, 0.f, 1.f);", stageIdx); + out += fmt::format("\n sampled{0} = vec4f(intensityF32(sampled{0}.rgb), 0.f, 0.f, 1.f);", stageIdx); } break; } @@ -531,7 +528,7 @@ static inline std::string texture_conversion(const TextureConfig& tex, u32 stage case GX_TF_I8: case GX_TF_R8_PC: // Splat R to RGBA - out += fmt::format("\n sampled{0} = 
vec4(sampled{0}.r);", stageIdx); + out += fmt::format("\n sampled{0} = vec4f(sampled{0}.r);", stageIdx); break; } return out; @@ -560,7 +557,7 @@ ShaderInfo build_shader_info(const ShaderConfig& config) noexcept { // } ShaderInfo info{ - .uniformSize = 64 * 3, // mv, mvInv, proj + .uniformSize = sizeof(PnMtx) + sizeof(Mat4x4), // pos_mtx, nrm_mtx, proj }; for (int i = 0; i < config.tevStageCount; ++i) { const auto& stage = config.tevStages[i]; @@ -583,7 +580,7 @@ ShaderInfo build_shader_info(const ShaderConfig& config) noexcept { info.writesTevReg.set(stage.alphaOp.outReg); } } - info.uniformSize += info.loadsTevReg.count() * 16; + info.uniformSize += info.loadsTevReg.count() * sizeof(Vec4); bool lightingEnabled = false; for (int i = 0; i < info.sampledColorChannels.size(); ++i) { if (info.sampledColorChannels.test(i)) { @@ -596,27 +593,27 @@ ShaderInfo build_shader_info(const ShaderConfig& config) noexcept { } if (lightingEnabled) { // Lights + light state for all channels - info.uniformSize += 16 + (80 * GX::MaxLights); + info.uniformSize += sizeof(Vec4) + sizeof(Light) * GX::MaxLights; } for (int i = 0; i < info.sampledColorChannels.size(); ++i) { if (info.sampledColorChannels.test(i)) { const auto& cc = config.colorChannels[i * 2]; if (cc.lightingEnabled && cc.ambSrc == GX_SRC_REG) { - info.uniformSize += 16; + info.uniformSize += sizeof(Vec4); } if (cc.matSrc == GX_SRC_REG) { - info.uniformSize += 16; + info.uniformSize += sizeof(Vec4); } const auto& cca = config.colorChannels[i * 2 + 1]; if (cca.lightingEnabled && cca.ambSrc == GX_SRC_REG) { - info.uniformSize += 16; + info.uniformSize += sizeof(Vec4); } if (cca.matSrc == GX_SRC_REG) { - info.uniformSize += 16; + info.uniformSize += sizeof(Vec4); } } } - info.uniformSize += info.sampledKColors.count() * 16; + info.uniformSize += info.sampledKColors.count() * sizeof(Vec4); for (int i = 0; i < info.sampledTexCoords.size(); ++i) { if (!info.sampledTexCoords.test(i)) { continue; @@ -636,26 +633,192 @@ ShaderInfo 
build_shader_info(const ShaderConfig& config) noexcept { if (info.usesTexMtx.test(i)) { switch (info.texMtxTypes[i]) { case GX_TG_MTX2x4: - info.uniformSize += 32; + info.uniformSize += sizeof(Mat2x4); break; case GX_TG_MTX3x4: - info.uniformSize += 64; + info.uniformSize += sizeof(Mat3x4); break; default: break; } } } - info.uniformSize += info.usesPTTexMtx.count() * 64; + info.uniformSize += info.usesPTTexMtx.count() * sizeof(Mat3x4); if (config.fogType != GX_FOG_NONE) { info.usesFog = true; - info.uniformSize += 32; + info.uniformSize += sizeof(Fog); } - info.uniformSize += info.sampledTextures.count() * 4; + info.uniformSize += info.sampledTextures.count() * sizeof(u32); info.uniformSize = align_uniform(info.uniformSize); return info; } +struct StorageLoadResult { + std::string attrLoad; + std::string_view arrType; +}; + +auto storage_load(const StorageConfig& mapping, u32 attrIdx) -> StorageLoadResult { + const std::string_view attrName = VtxAttributeNames[mapping.attr]; + + uint8_t compCnt = 0; + GXCompType compType = GX_U8; + switch (mapping.attr) { + case GX_VA_POS: + switch (mapping.cnt) { + case GX_POS_XY: + compCnt = 2; + break; + case GX_POS_XYZ: + compCnt = 3; + break; + default: + Log.fatal("storage_load: Unsupported {} component count {}", mapping.attr, mapping.cnt); + } + switch (mapping.compType) { + case GX_U8: + case GX_S8: + case GX_U16: + case GX_S16: + case GX_F32: + compType = mapping.compType; + break; + default: + Log.fatal("storage_load: Unsupported {} component type {}", mapping.attr, mapping.compType); + } + break; + case GX_VA_NRM: + switch (mapping.cnt) { + case GX_NRM_XYZ: + compCnt = 3; + break; + default: + Log.fatal("storage_load: Unsupported {} component count {}", mapping.attr, mapping.cnt); + } + switch (mapping.compType) { + case GX_S8: + case GX_S16: + case GX_F32: + compType = mapping.compType; + break; + default: + Log.fatal("storage_load: Unsupported {} component type {}", mapping.attr, mapping.compType); + } + break; + 
case GX_VA_CLR0: + case GX_VA_CLR1: + switch (mapping.cnt) { + case GX_CLR_RGB: + compCnt = 3; + break; + case GX_CLR_RGBA: + compCnt = 4; + break; + default: + Log.fatal("storage_load: Unsupported {} component count {}", mapping.attr, mapping.cnt); + } + switch (mapping.compType) { + case GX_RGB8: + case GX_RGBA8: + compType = mapping.compType; + break; + default: + Log.fatal("storage_load: Unsupported {} component type {}", mapping.attr, mapping.compType); + } + break; + case GX_VA_TEX0: + case GX_VA_TEX1: + case GX_VA_TEX2: + case GX_VA_TEX3: + case GX_VA_TEX4: + case GX_VA_TEX5: + case GX_VA_TEX6: + case GX_VA_TEX7: + switch (mapping.cnt) { + case GX_TEX_S: + compCnt = 1; + break; + case GX_TEX_ST: + compCnt = 2; + break; + default: + Log.fatal("storage_load: Unsupported {} component count {}", mapping.attr, mapping.cnt); + } + switch (mapping.compType) { + case GX_U8: + case GX_S8: + case GX_U16: + case GX_S16: + case GX_F32: + compType = mapping.compType; + break; + default: + Log.fatal("storage_load: Unsupported {} component type {}", mapping.attr, mapping.compType); + } + break; + default: + Log.fatal("storage_load: Unsupported attribute {}", mapping.attr); + } + + const auto [div, rem] = std::div(attrIdx, 4); + std::string idxFetch = fmt::format("in_dl{}[{}]", div, rem); + + std::string_view arrType; + std::string attrLoad; + + switch (compType) { + case GX_U16: + switch (compCnt) { + case 2: + arrType = "u32"; + attrLoad = fmt::format("fetch_u16_2(&v_arr_{}, {}, {})", attrName, idxFetch, mapping.frac); + break; + default: + Log.fatal("storage_load: Unsupported {} count {}", compType, compCnt); + } + break; + case GX_S16: + switch (compCnt) { + case 3: + arrType = "i32"; + attrLoad = fmt::format("fetch_i16_3(&v_arr_{}, {}, {})", attrName, idxFetch, mapping.frac); + break; + default: + Log.fatal("storage_load: Unsupported {} count {}", compType, compCnt); + } + break; + case GX_F32: + switch (compCnt) { + case 1: + arrType = "f32"; + attrLoad = 
fmt::format("v_arr_{}[{}]", attrName, idxFetch); + break; + case 2: + arrType = "vec2f"; + attrLoad = fmt::format("v_arr_{}[{}]", attrName, idxFetch); + break; + case 3: + arrType = "f32"; + attrLoad = fmt::format("fetch_f32_3(&v_arr_{}, {})", attrName, idxFetch); + break; + case 4: + arrType = "vec4f"; + attrLoad = fmt::format("v_arr_{}[{}]", attrName, idxFetch); + break; + default: + Log.fatal("storage_load: Unsupported {} count {}", compType, compCnt); + } + break; + default: + Log.fatal("storage_load: Unimplemented {}", compType); + } + + return { + .attrLoad = attrLoad, + .arrType = arrType, + }; +} + wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& info) noexcept { const auto hash = xxh3_hash(config); const auto it = g_gxCachedShaders.find(hash); @@ -727,33 +890,19 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in if (config.indexedAttributeCount > 0) { // Display list attributes int currAttrIdx = 0; - for (GXAttr attr{}; attr < MaxVtxAttr; attr = GXAttr(attr + 1)) { + for (GXAttr attr{}; attr < MaxVtxAttr; attr = static_cast(attr + 1)) { // Indexed attributes if (config.vtxAttrs[attr] != GX_INDEX8 && config.vtxAttrs[attr] != GX_INDEX16) { continue; } - const auto [div, rem] = std::div(currAttrIdx, 4); - std::string_view attrName; - bool addUniformBinding = true; - if (config.attrMapping[attr] != attr) { - attrName = VtxAttributeNames[config.attrMapping[attr]]; - addUniformBinding = false; - } else { - attrName = VtxAttributeNames[attr]; - } - vtxXfrAttrsPre += - fmt::format("\n var {} = v_arr_{}[in_dl{}[{}]];", vtx_attr(config, attr), attrName, div, rem); - if (addUniformBinding) { - std::string_view arrType; - if (attr == GX_VA_POS || attr == GX_VA_NRM) { - arrType = "vec3"; - } else if (attr >= GX_VA_TEX0 && attr <= GX_VA_TEX7) { - arrType = "vec2"; - } - uniformBindings += fmt::format(FMT_STRING("\n@group(0) @binding({})" - "\nvar v_arr_{}: array<{}>;"), - uniBindingIdx++, attrName, arrType); 
- } + const auto& mapping = config.attrMapping[attr]; + std::string_view attrName = VtxAttributeNames[mapping.attr]; + const auto result = storage_load(mapping, currAttrIdx); + vtxXfrAttrsPre += fmt::format("\n var {} = {};", vtx_attr(config, attr), result.attrLoad); + uniformBindings += fmt::format( + "\n@group(0) @binding({})" + "\nvar v_arr_{}: array<{}>;", + uniBindingIdx++, attrName, result.arrType); ++currAttrIdx; } auto [num4xAttrArrays, rem] = std::div(currAttrIdx, 4); @@ -769,7 +918,7 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in } else { vtxInAttrs += "\n "; } - vtxInAttrs += fmt::format("@location({}) in_dl{}: vec4", locIdx++, i); + vtxInAttrs += fmt::format("@location({}) in_dl{}: vec4u", locIdx++, i); } for (u32 i = 0; i < num2xAttrArrays; ++i) { if (locIdx > 0) { @@ -777,7 +926,7 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in } else { vtxInAttrs += "\n "; } - vtxInAttrs += fmt::format("@location({}) in_dl{}: vec2", locIdx++, num4xAttrArrays + i); + vtxInAttrs += fmt::format("@location({}) in_dl{}: vec2u", locIdx++, num4xAttrArrays + i); } } for (GXAttr attr{}; attr < MaxVtxAttr; attr = GXAttr(attr + 1)) { @@ -791,23 +940,27 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in vtxInAttrs += "\n "; } if (attr == GX_VA_POS) { - vtxInAttrs += fmt::format("@location({}) in_pos: vec3", locIdx++); + vtxInAttrs += fmt::format("@location({}) in_pos: vec3f", locIdx++); } else if (attr == GX_VA_NRM) { - vtxInAttrs += fmt::format("@location({}) in_nrm: vec3", locIdx++); + vtxInAttrs += fmt::format("@location({}) in_nrm: vec3f", locIdx++); } else if (attr == GX_VA_CLR0 || attr == GX_VA_CLR1) { - vtxInAttrs += fmt::format("@location({}) in_clr{}: vec4", locIdx++, attr - GX_VA_CLR0); + vtxInAttrs += fmt::format("@location({}) in_clr{}: vec4f", locIdx++, attr - GX_VA_CLR0); } else if (attr >= GX_VA_TEX0 && attr <= GX_VA_TEX7) { - vtxInAttrs += 
fmt::format("@location({}) in_tex{}_uv: vec2", locIdx++, attr - GX_VA_TEX0); + vtxInAttrs += fmt::format("@location({}) in_tex{}_uv: vec2f", locIdx++, attr - GX_VA_TEX0); } } vtxXfrAttrsPre += fmt::format( - "\n var mv_pos = mul4x3(ubuf.pos_mtx, vec4({}, 1.0));" - "\n var mv_nrm = normalize(mul4x3(ubuf.nrm_mtx, vec4({}, 0.0)));" - "\n out.pos = mul4x4(ubuf.proj, vec4(mv_pos, 1.0));" - "\n out.pos.z += out.pos.w;", + "\n var mv_pos = vec4({}, 1.0) * ubuf.pos_mtx;" + "\n var mv_nrm = normalize(vec4({}, 0.0) * ubuf.nrm_mtx);" + "\n out.pos = vec4f(mv_pos, 1.0) * ubuf.proj;", vtx_attr(config, GX_VA_POS), vtx_attr(config, GX_VA_NRM)); + if constexpr (UseReversedZ) { + vtxXfrAttrsPre += "\n out.pos.z = -out.pos.z;"; + } else { + vtxXfrAttrsPre += "\n out.pos.z += out.pos.w;"; + } if constexpr (EnableNormalVisualization) { - vtxOutAttrs += fmt::format("\n @location({}) nrm: vec3,", vtxOutIdx++); + vtxOutAttrs += fmt::format("\n @location({}) nrm: vec3f,", vtxOutIdx++); vtxXfrAttrsPre += "\n out.nrm = mv_nrm;"; } @@ -818,7 +971,7 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in { std::string outReg; switch (stage.colorOp.outReg) { - DEFAULT_FATAL("invalid colorOp outReg {}", static_cast(stage.colorOp.outReg)); + DEFAULT_FATAL("invalid colorOp outReg {}", underlying(stage.colorOp.outReg)); case GX_TEVPREV: outReg = "prev"; break; @@ -838,14 +991,14 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in color_arg_reg(stage.colorPass.d, idx, config, stage), tev_op(stage.colorOp.op), tev_bias(stage.colorOp.bias), tev_scale(stage.colorOp.scale)); if (stage.colorOp.clamp) { - op = fmt::format("clamp({}, vec3(0.0), vec3(1.0))", op); + op = fmt::format("clamp({}, vec3f(0.0), vec3f(1.0))", op); } - fragmentFn += fmt::format("\n // TEV stage {2}\n {0} = vec4({1}, {0}.a);", outReg, op, idx); + fragmentFn += fmt::format("\n // TEV stage {2}\n {0} = vec4f({1}, {0}.a);", outReg, op, idx); } { std::string outReg; switch 
(stage.alphaOp.outReg) { - DEFAULT_FATAL("invalid alphaOp outReg {}", static_cast(stage.alphaOp.outReg)); + DEFAULT_FATAL("invalid alphaOp outReg {}", underlying(stage.alphaOp.outReg)); case GX_TEVPREV: outReg = "prev.a"; break; @@ -871,17 +1024,17 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in } } if (info.loadsTevReg.test(0)) { - uniBufAttrs += "\n tevprev: vec4,"; + uniBufAttrs += "\n tevprev: vec4f,"; fragmentFnPre += "\n var prev = ubuf.tevprev;"; } else { - fragmentFnPre += "\n var prev: vec4;"; + fragmentFnPre += "\n var prev: vec4f;"; } for (int i = 1 /* Skip TEVPREV */; i < info.loadsTevReg.size(); ++i) { if (info.loadsTevReg.test(i)) { - uniBufAttrs += fmt::format("\n tevreg{}: vec4,", i - 1); + uniBufAttrs += fmt::format("\n tevreg{}: vec4f,", i - 1); fragmentFnPre += fmt::format("\n var tevreg{0} = ubuf.tevreg{0};", i - 1); } else if (info.writesTevReg.test(i)) { - fragmentFnPre += fmt::format("\n var tevreg{0}: vec4;", i - 1); + fragmentFnPre += fmt::format("\n var tevreg{0}: vec4f;", i - 1); } } bool addedLightStruct = false; @@ -903,15 +1056,15 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in uniformPre += "\n" "struct Light {\n" - " pos: vec3,\n" - " dir: vec3,\n" - " color: vec4,\n" - " cos_att: vec3,\n" - " dist_att: vec3,\n" + " pos: vec3f,\n" + " dir: vec3f,\n" + " color: vec4f,\n" + " cos_att: vec3f,\n" + " dist_att: vec3f,\n" "};"; if (UsePerPixelLighting) { - vtxOutAttrs += fmt::format("\n @location({}) mv_pos: vec3,", vtxOutIdx++); - vtxOutAttrs += fmt::format("\n @location({}) mv_nrm: vec3,", vtxOutIdx++); + vtxOutAttrs += fmt::format("\n @location({}) mv_pos: vec3f,", vtxOutIdx++); + vtxOutAttrs += fmt::format("\n @location({}) mv_nrm: vec3f,", vtxOutIdx++); vtxXfrAttrs += fmt::format(FMT_STRING(R"""( out.mv_pos = mv_pos; out.mv_nrm = mv_nrm;)""")); @@ -920,16 +1073,16 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in } if 
(cc.lightingEnabled && cc.ambSrc == GX_SRC_REG) { - uniBufAttrs += fmt::format("\n cc{0}_amb: vec4,", i); + uniBufAttrs += fmt::format("\n cc{0}_amb: vec4f,", i); } if (cc.matSrc == GX_SRC_REG) { - uniBufAttrs += fmt::format("\n cc{0}_mat: vec4,", i); + uniBufAttrs += fmt::format("\n cc{0}_mat: vec4f,", i); } if (cca.lightingEnabled && cca.ambSrc == GX_SRC_REG) { - uniBufAttrs += fmt::format("\n cc{0}a_amb: vec4,", i); + uniBufAttrs += fmt::format("\n cc{0}a_amb: vec4f,", i); } if (cca.matSrc == GX_SRC_REG) { - uniBufAttrs += fmt::format("\n cc{0}a_mat: vec4,", i); + uniBufAttrs += fmt::format("\n cc{0}a_mat: vec4f,", i); } // Output vertex color if necessary @@ -937,7 +1090,7 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in if (((cc.lightingEnabled && cc.ambSrc == GX_SRC_VTX) || cc.matSrc == GX_SRC_VTX || (cca.lightingEnabled && cca.matSrc == GX_SRC_VTX) || cca.matSrc == GX_SRC_VTX)) { if (UsePerPixelLighting) { - vtxOutAttrs += fmt::format("\n @location({}) clr{}: vec4,", vtxOutIdx++, vtxColorIdx); + vtxOutAttrs += fmt::format("\n @location({}) clr{}: vec4f,", vtxOutIdx++, vtxColorIdx); vtxXfrAttrs += fmt::format("\n out.clr{} = {};", vtxColorIdx, vtx_attr(config, static_cast(GX_VA_CLR0 + vtxColorIdx))); } @@ -969,11 +1122,11 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in if (cc.attnFn == GX_AF_NONE) { lightAttnFn = "attn = 1.0;"; } else if (cc.attnFn == GX_AF_SPOT) { - lightAttnFn = fmt::format(FMT_STRING(R"""( + lightAttnFn = fmt::format(R"""( var cosine = max(0.0, dot(ldir, light.dir)); - var cos_attn = dot(light.cos_att, vec3(1.0, cosine, cosine * cosine)); - var dist_attn = dot(light.dist_att, vec3(1.0, dist, dist2)); - attn = max(0.0, cos_attn / dist_attn);)""")); + var cos_attn = dot(light.cos_att, vec3f(1.0, cosine, cosine * cosine)); + var dist_attn = dot(light.dist_att, vec3f(1.0, dist, dist2)); + attn = max(0.0, cos_attn / dist_attn);)"""); } else if (cc.attnFn == GX_AF_SPEC) { 
diffFn = GX_DF_NONE; FATAL("AF_SPEC unimplemented"); @@ -1001,7 +1154,7 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in outVar = fmt::format("out.cc{}", i); posVar = "mv_pos"; } - auto lightFunc = fmt::format(FMT_STRING(R"""( + auto lightFunc = fmt::format(R"""( {{ var lighting = {5}; for (var i = 0u; i < {1}u; i++) {{ @@ -1016,14 +1169,14 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in lighting = lighting + (attn * diff * light.color); }} // TODO alpha lighting - {6} = vec4(({4} * clamp(lighting, vec4(0.0), vec4(1.0))).xyz, {4}.a); - }})"""), + {6} = vec4f(({4} * clamp(lighting, vec4f(0.0), vec4f(1.0))).xyz, {4}.a); + }})""", i, GX::MaxLights, lightAttnFn, lightDiffFn, matSrc, ambSrc, outVar, posVar); if (UsePerPixelLighting) { - fragmentFnPre += fmt::format("\n var rast{}: vec4;", i); + fragmentFnPre += fmt::format("\n var rast{}: vec4f;", i); fragmentFnPre += lightFunc; } else { - vtxOutAttrs += fmt::format("\n @location({}) cc{}: vec4,", vtxOutIdx++, i); + vtxOutAttrs += fmt::format("\n @location({}) cc{}: vec4f,", vtxOutIdx++, i); vtxXfrAttrs += lightFunc; fragmentFnPre += fmt::format("\n var rast{0} = in.cc{0};", i); } @@ -1032,7 +1185,7 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in // Color will already be written to clr{} fragmentFnPre += fmt::format("\n var rast{0} = in.clr{0};", vtxColorIdx); } else { - vtxOutAttrs += fmt::format("\n @location({}) cc{}: vec4,", vtxOutIdx++, i); + vtxOutAttrs += fmt::format("\n @location({}) cc{}: vec4f,", vtxOutIdx++, i); vtxXfrAttrs += fmt::format("\n out.cc{} = {};", i, vtx_attr(config, GXAttr(GX_VA_CLR0 + vtxColorIdx))); fragmentFnPre += fmt::format("\n var rast{0} = in.cc{0};", i); } @@ -1046,7 +1199,7 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in } for (int i = 0; i < info.sampledKColors.size(); ++i) { if (info.sampledKColors.test(i)) { - uniBufAttrs += fmt::format("\n 
kcolor{}: vec4,", i); + uniBufAttrs += fmt::format("\n kcolor{}: vec4f,", i); } } for (int i = 0; i < info.sampledTexCoords.size(); ++i) { @@ -1054,22 +1207,21 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in continue; } const auto& tcg = config.tcgs[i]; - vtxOutAttrs += fmt::format("\n @location({}) tex{}_uv: vec2,", vtxOutIdx++, i); + vtxOutAttrs += fmt::format("\n @location({}) tex{}_uv: vec2f,", vtxOutIdx++, i); if (tcg.src >= GX_TG_TEX0 && tcg.src <= GX_TG_TEX7) { - vtxXfrAttrs += fmt::format("\n var tc{} = vec4({}, 0.0, 1.0);", i, + vtxXfrAttrs += fmt::format("\n var tc{} = vec4f({}, 0.0, 1.0);", i, vtx_attr(config, GXAttr(GX_VA_TEX0 + (tcg.src - GX_TG_TEX0)))); } else if (tcg.src == GX_TG_POS) { - vtxXfrAttrs += fmt::format("\n var tc{} = vec4(in_pos, 1.0);", i); + vtxXfrAttrs += fmt::format("\n var tc{} = vec4f(in_pos, 1.0);", i); } else if (tcg.src == GX_TG_NRM) { - vtxXfrAttrs += fmt::format("\n var tc{} = vec4(in_nrm, 1.0);", i); + vtxXfrAttrs += fmt::format("\n var tc{} = vec4f(in_nrm, 1.0);", i); } else - UNLIKELY FATAL("unhandled tcg src {}", static_cast(tcg.src)); + UNLIKELY FATAL("unhandled tcg src {}", underlying(tcg.src)); if (tcg.mtx == GX_IDENTITY) { vtxXfrAttrs += fmt::format("\n var tc{0}_tmp = tc{0}.xyz;", i); } else { u32 texMtxIdx = (tcg.mtx - GX_TEXMTX0) / 3; - vtxXfrAttrs += fmt::format("\n var tc{0}_tmp = mul{2}(ubuf.texmtx{1}, tc{0});", i, texMtxIdx, - info.texMtxTypes[texMtxIdx] == GX_TG_MTX3x4 ? 
"4x3" : "4x2"); + vtxXfrAttrs += fmt::format("\n var tc{0}_tmp = tc{0} * ubuf.texmtx{1};", i, texMtxIdx); } if (tcg.normalize) { vtxXfrAttrs += fmt::format("\n tc{0}_tmp = normalize(tc{0}_tmp);", i); @@ -1078,8 +1230,7 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in vtxXfrAttrs += fmt::format("\n var tc{0}_proj = tc{0}_tmp;", i); } else { u32 postMtxIdx = (tcg.postMtx - GX_PTTEXMTX0) / 3; - vtxXfrAttrs += - fmt::format("\n var tc{0}_proj = mul4x3(ubuf.postmtx{1}, vec4(tc{0}_tmp.xyz, 1.0));", i, postMtxIdx); + vtxXfrAttrs += fmt::format("\n var tc{0}_proj = vec4f(tc{0}_tmp.xyz, 1.0) * ubuf.postmtx{1};", i, postMtxIdx); } vtxXfrAttrs += fmt::format("\n out.tex{0}_uv = tc{0}_proj.xy;", i); } @@ -1091,13 +1242,13 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in || !info.sampledTextures.test(stage.texMapId)) { continue; } - std::string uvIn = fmt::format("in.tex{0}_uv", static_cast(stage.texCoordId)); + std::string uvIn = fmt::format("in.tex{0}_uv", underlying(stage.texCoordId)); const auto& texConfig = config.textureConfig[stage.texMapId]; if (is_palette_format(texConfig.loadFmt)) { std::string_view suffix; if (!is_palette_format(texConfig.copyFmt)) { switch (texConfig.loadFmt) { - DEFAULT_FATAL("unimplemented palette format {}", static_cast(texConfig.loadFmt)); + DEFAULT_FATAL("unimplemented palette format {}", texConfig.loadFmt); case GX_TF_C4: suffix = "I4"sv; break; @@ -1110,37 +1261,37 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in } } fragmentFnPre += fmt::format("\n var sampled{0} = textureSamplePalette{3}(tex{1}, tex{1}_samp, {2}, tlut{1});", - i, static_cast(stage.texMapId), uvIn, suffix); + i, underlying(stage.texMapId), uvIn, suffix); } else { fragmentFnPre += fmt::format("\n var sampled{0} = textureSampleBias(tex{1}, tex{1}_samp, {2}, ubuf.tex{1}_lod);", i, - static_cast(stage.texMapId), uvIn); + underlying(stage.texMapId), uvIn); } fragmentFnPre += 
texture_conversion(texConfig, i, stage.texMapId); } for (int i = 0; i < info.usesTexMtx.size(); ++i) { if (info.usesTexMtx.test(i)) { switch (info.texMtxTypes[i]) { - DEFAULT_FATAL("unhandled tex mtx type {}", static_cast(info.texMtxTypes[i])); + DEFAULT_FATAL("unhandled tex mtx type {}", underlying(info.texMtxTypes[i])); case GX_TG_MTX2x4: - uniBufAttrs += fmt::format("\n texmtx{}: mtx4x2,", i); + uniBufAttrs += fmt::format("\n texmtx{}: mat2x4f,", i); break; case GX_TG_MTX3x4: - uniBufAttrs += fmt::format("\n texmtx{}: mtx4x3,", i); + uniBufAttrs += fmt::format("\n texmtx{}: mat3x4f,", i); break; } } } for (int i = 0; i < info.usesPTTexMtx.size(); ++i) { if (info.usesPTTexMtx.test(i)) { - uniBufAttrs += fmt::format("\n postmtx{}: mtx4x3,", i); + uniBufAttrs += fmt::format("\n postmtx{}: mat3x4f,", i); } } if (info.usesFog) { uniformPre += "\n" "struct Fog {\n" - " color: vec4,\n" + " color: vec4f,\n" " a: f32,\n" " b: f32,\n" " c: f32,\n" @@ -1148,9 +1299,11 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in "}"; uniBufAttrs += "\n fog: Fog,"; - fragmentFn += "\n // Fog\n var fogF = clamp((ubuf.fog.a / (ubuf.fog.b - in.pos.z)) - ubuf.fog.c, 0.0, 1.0);"; + fragmentFn += + fmt::format("\n // Fog\n var fogF = clamp((ubuf.fog.a / (ubuf.fog.b - {})) - ubuf.fog.c, 0.0, 1.0);", + UseReversedZ ? 
"(1.0 - in.pos.z)" : "in.pos.z"); switch (config.fogType) { - DEFAULT_FATAL("invalid fog type {}", static_cast(config.fogType)); + DEFAULT_FATAL("invalid fog type {}", underlying(config.fogType)); case GX_FOG_PERSP_LIN: case GX_FOG_ORTHO_LIN: fragmentFn += "\n var fogZ = fogF;"; @@ -1174,7 +1327,7 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in "\n var fogZ = exp2(-8.0 * fogF * fogF);"; break; } - fragmentFn += "\n prev = vec4(mix(prev.rgb, ubuf.fog.color.rgb, clamp(fogZ, 0.0, 1.0)), prev.a);"; + fragmentFn += "\n prev = vec4f(mix(prev.rgb, ubuf.fog.color.rgb, clamp(fogZ, 0.0, 1.0)), prev.a);"; } size_t texBindIdx = 0; for (int i = 0; i < info.sampledTextures.size(); ++i) { @@ -1183,23 +1336,27 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in } uniBufAttrs += fmt::format("\n tex{}_lod: f32,", i); - sampBindings += fmt::format(FMT_STRING("\n@group(1) @binding({})\n" - "var tex{}_samp: sampler;"), - texBindIdx, i); + sampBindings += fmt::format( + "\n@group(1) @binding({})\n" + "var tex{}_samp: sampler;", + texBindIdx, i); const auto& texConfig = config.textureConfig[i]; if (is_palette_format(texConfig.loadFmt)) { - texBindings += fmt::format(FMT_STRING("\n@group(2) @binding({})\n" - "var tex{}: texture_2d<{}>;"), - texBindIdx, i, is_palette_format(texConfig.copyFmt) ? "i32"sv : "f32"sv); + texBindings += fmt::format( + "\n@group(2) @binding({})\n" + "var tex{}: texture_2d<{}>;", + texBindIdx, i, is_palette_format(texConfig.copyFmt) ? 
"i32"sv : "f32"sv); ++texBindIdx; - texBindings += fmt::format(FMT_STRING("\n@group(2) @binding({})\n" - "var tlut{}: texture_2d;"), - texBindIdx, i); + texBindings += fmt::format( + "\n@group(2) @binding({})\n" + "var tlut{}: texture_2d;", + texBindIdx, i); } else { - texBindings += fmt::format(FMT_STRING("\n@group(2) @binding({})\n" - "var tex{}: texture_2d;"), - texBindIdx, i); + texBindings += fmt::format( + "\n@group(2) @binding({})\n" + "var tex{}: texture_2d;", + texBindIdx, i); } ++texBindIdx; } @@ -1212,7 +1369,7 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in if (comp0Valid || comp1Valid) { fragmentFn += "\n // Alpha compare"; switch (config.alphaCompare.op) { - DEFAULT_FATAL("invalid alpha compare op {}", static_cast(config.alphaCompare.op)); + DEFAULT_FATAL("invalid alpha compare op {}", underlying(config.alphaCompare.op)); case GX_AOP_AND: fragmentFn += fmt::format("\n if (!({} && {})) {{ discard; }}", comp0, comp1); break; @@ -1229,82 +1386,104 @@ wgpu::ShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& in } } if constexpr (EnableNormalVisualization) { - fragmentFn += "\n prev = vec4(in.nrm, prev.a);"; + fragmentFn += "\n prev = vec4f(in.nrm, prev.a);"; } - const auto shaderSource = fmt::format(FMT_STRING(R"""( -struct mtx4x4 {{ mx: vec4, my: vec4, mz: vec4, mw: vec4 }}; -struct mtx4x3 {{ mx: vec4, my: vec4, mz: vec4, mw: vec4 }}; -struct mtx4x2 {{ mx: vec4, my: vec4, }}; -// TODO convert these to row major -fn mul4x4(m: mtx4x4, v: vec4) -> vec4 {{ - var mx = vec4(m.mx.x, m.my.x, m.mz.x, m.mw.x); - var my = vec4(m.mx.y, m.my.y, m.mz.y, m.mw.y); - var mz = vec4(m.mx.z, m.my.z, m.mz.z, m.mw.z); - var mw = vec4(m.mx.w, m.my.w, m.mz.w, m.mw.w); - return vec4(dot(mx, v), dot(my, v), dot(mz, v), dot(mw, v)); + const auto shaderSource = fmt::format(R"""( +fn fetch_f32_3(p: ptr>, idx: u32) -> vec3 {{ + var start = idx * 3; + return vec3( + p[start], + p[start + 1], + p[start + 2], + ); }} -fn mul4x3(m: 
mtx4x3, v: vec4) -> vec3 {{ - var mx = vec4(m.mx.x, m.my.x, m.mz.x, m.mw.x); - var my = vec4(m.mx.y, m.my.y, m.mz.y, m.mw.y); - var mz = vec4(m.mx.z, m.my.z, m.mz.z, m.mw.z); - return vec3(dot(mx, v), dot(my, v), dot(mz, v)); +fn fetch_u8_2(p: ptr>, idx: u32, frac: u32) -> vec2 {{ + var v0 = p[idx / 2]; + var r = (idx % 2) != 0; + var o0 = select(extractBits(v0, 0, 8), extractBits(v0, 16, 8), r); + var o1 = select(extractBits(v0, 8, 8), extractBits(v0, 24, 8), r); + return vec2( + f32(o0) / f32(1 << frac), + f32(o1) / f32(1 << frac), + ); }} -fn mul4x2(m: mtx4x2, v: vec4) -> vec2 {{ - return vec2(dot(m.mx, v), dot(m.my, v)); +fn fetch_u16_2(p: ptr>, idx: u32, frac: u32) -> vec2 {{ + var v0 = p[idx]; + var o0 = extractBits(v0, 0, 16); + var o1 = extractBits(v0, 16, 16); + return vec2( + f32(o0) / f32(1 << frac), + f32(o1) / f32(1 << frac), + ); +}} +fn fetch_i16_3(p: ptr>, idx: u32, frac: u32) -> vec3 {{ + var n = idx * 3; + var d = n / 2; + var r = (n % 2) != 0; + var v0 = p[d]; + var v1 = p[d + 1]; + var o0 = select(extractBits(v0, 0, 16), extractBits(v0, 16, 16), r); + var o1 = select(extractBits(v0, 16, 16), extractBits(v1, 0, 16), r); + var o2 = select(extractBits(v1, 0, 16), extractBits(v1, 16, 16), r); + return vec3( + f32(o0) / f32(1 << frac), + f32(o1) / f32(1 << frac), + f32(o2) / f32(1 << frac), + ); }} {10} struct Uniform {{ - pos_mtx: mtx4x3, - nrm_mtx: mtx4x3, - proj: mtx4x4,{0} + pos_mtx: mat3x4f, + nrm_mtx: mat3x4f, + proj: mat4x4f,{0} }}; @group(0) @binding(0) var ubuf: Uniform;{3}{1}{2} struct VertexOutput {{ - @builtin(position) pos: vec4,{4} + @builtin(position) pos: vec4f,{4} }}; -fn intensityF32(rgb: vec3) -> f32 {{ +fn intensityF32(rgb: vec3f) -> f32 {{ // RGB to intensity conversion // https://github.com/dolphin-emu/dolphin/blob/4cd48e609c507e65b95bca5afb416b59eaf7f683/Source/Core/VideoCommon/TextureConverterShaderGen.cpp#L237-L241 return dot(rgb, vec3(0.257, 0.504, 0.098)) + 16.0 / 255.0; }} -fn intensityI4(rgb: vec3) -> i32 {{ +fn 
intensityI4(rgb: vec3f) -> i32 {{ return i32(intensityF32(rgb) * 16.f); }} -fn textureSamplePalette(tex: texture_2d, samp: sampler, uv: vec2, tlut: texture_2d) -> vec4 {{ +fn textureSamplePalette(tex: texture_2d, samp: sampler, uv: vec2f, tlut: texture_2d) -> vec4f {{ // Gather index values var i = textureGather(0, tex, samp, uv); // Load palette colors - var c0 = textureLoad(tlut, vec2(i[0], 0), 0); - var c1 = textureLoad(tlut, vec2(i[1], 0), 0); - var c2 = textureLoad(tlut, vec2(i[2], 0), 0); - var c3 = textureLoad(tlut, vec2(i[3], 0), 0); + var c0 = textureLoad(tlut, vec2i(i[0], 0), 0); + var c1 = textureLoad(tlut, vec2i(i[1], 0), 0); + var c2 = textureLoad(tlut, vec2i(i[2], 0), 0); + var c3 = textureLoad(tlut, vec2i(i[3], 0), 0); // Perform bilinear filtering - var f = fract(uv * vec2(textureDimensions(tex)) + 0.5); + var f = fract(uv * vec2f(textureDimensions(tex)) + 0.5); var t0 = mix(c3, c2, f.x); var t1 = mix(c0, c1, f.x); return mix(t0, t1, f.y); }} -fn textureSamplePaletteI4(tex: texture_2d, samp: sampler, uv: vec2, tlut: texture_2d) -> vec4 {{ +fn textureSamplePaletteI4(tex: texture_2d, samp: sampler, uv: vec2f, tlut: texture_2d) -> vec4f {{ // Gather RGB channels var iR = textureGather(0, tex, samp, uv); var iG = textureGather(1, tex, samp, uv); var iB = textureGather(2, tex, samp, uv); // Perform intensity conversion - var i0 = intensityI4(vec3(iR[0], iG[0], iB[0])); - var i1 = intensityI4(vec3(iR[1], iG[1], iB[1])); - var i2 = intensityI4(vec3(iR[2], iG[2], iB[2])); - var i3 = intensityI4(vec3(iR[3], iG[3], iB[3])); + var i0 = intensityI4(vec3f(iR[0], iG[0], iB[0])); + var i1 = intensityI4(vec3f(iR[1], iG[1], iB[1])); + var i2 = intensityI4(vec3f(iR[2], iG[2], iB[2])); + var i3 = intensityI4(vec3f(iR[3], iG[3], iB[3])); // Load palette colors - var c0 = textureLoad(tlut, vec2(i0, 0), 0); - var c1 = textureLoad(tlut, vec2(i1, 0), 0); - var c2 = textureLoad(tlut, vec2(i2, 0), 0); - var c3 = textureLoad(tlut, vec2(i3, 0), 0); + var c0 = textureLoad(tlut, 
vec2i(i0, 0), 0); + var c1 = textureLoad(tlut, vec2i(i1, 0), 0); + var c2 = textureLoad(tlut, vec2i(i2, 0), 0); + var c3 = textureLoad(tlut, vec2i(i3, 0), 0); // Perform bilinear filtering - var f = fract(uv * vec2(textureDimensions(tex)) + 0.5); + var f = fract(uv * vec2f(textureDimensions(tex)) + 0.5); var t0 = mix(c3, c2, f.x); var t1 = mix(c0, c1, f.x); return mix(t0, t1, f.y); @@ -1318,10 +1497,10 @@ fn vs_main({5} }} @fragment -fn fs_main(in: VertexOutput) -> @location(0) vec4 {{{8}{7} +fn fs_main(in: VertexOutput) -> @location(0) vec4f {{{8}{7} return prev; }} -)"""), +)""", uniBufAttrs, sampBindings, texBindings, uniformBindings, vtxOutAttrs, vtxInAttrs, vtxXfrAttrs, fragmentFn, fragmentFnPre, vtxXfrAttrsPre, uniformPre); if (EnableDebugPrints) { diff --git a/lib/gfx/model/shader.cpp b/lib/gfx/model/shader.cpp index acd4aba..80cd9b5 100644 --- a/lib/gfx/model/shader.cpp +++ b/lib/gfx/model/shader.cpp @@ -1,60 +1,29 @@ #include "shader.hpp" #include "../../webgpu/gpu.hpp" +#include "../gx_fmt.hpp" #include namespace aurora::gfx::model { static Module Log("aurora::gfx::model"); -template -constexpr T bswap16(T val) noexcept { - static_assert(sizeof(T) == sizeof(u16)); - union { - u16 u; - T t; - } v{.t = val}; -#if __GNUC__ - v.u = __builtin_bswap16(v.u); -#elif _WIN32 - v.u = _byteswap_ushort(v.u); -#else - v.u = (v.u << 8) | ((v.u >> 8) & 0xFF); -#endif - return v.t; -} -template -constexpr T bswap32(T val) noexcept { - static_assert(sizeof(T) == sizeof(u32)); - union { - u32 u; - T t; - } v{.t = val}; -#if __GNUC__ - v.u = __builtin_bswap32(v.u); -#elif _WIN32 - v.u = _byteswap_ulong(v.u); -#else - v.u = ((v.u & 0x0000FFFF) << 16) | ((v.u & 0xFFFF0000) >> 16) | ((v.u & 0x00FF00FF) << 8) | ((v.u & 0xFF00FF00) >> 8); -#endif - return v.t; -} - using IndexedAttrs = std::array; struct DisplayListCache { ByteBuffer vtxBuf; ByteBuffer idxBuf; IndexedAttrs indexedAttrs; + GXVtxFmt fmt; - DisplayListCache(ByteBuffer&& vtxBuf, ByteBuffer&& idxBuf, IndexedAttrs 
indexedAttrs) - : vtxBuf(std::move(vtxBuf)), idxBuf(std::move(idxBuf)), indexedAttrs(indexedAttrs) {} + DisplayListCache(ByteBuffer&& vtxBuf, ByteBuffer&& idxBuf, IndexedAttrs indexedAttrs, GXVtxFmt fmt) + : vtxBuf(std::move(vtxBuf)), idxBuf(std::move(idxBuf)), indexedAttrs(indexedAttrs), fmt(fmt) {} }; static absl::flat_hash_map sCachedDisplayLists; static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u16 vtxCount, IndexedAttrs& indexedAttrs) { - using aurora::gfx::gx::g_gxState; + using gx::g_gxState; struct { u8 count; GXCompType type; @@ -66,14 +35,13 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u for (int attr = 0; attr < GX_VA_MAX_ATTR; attr++) { const auto& attrFmt = g_gxState.vtxFmts[vtxfmt].attrs[attr]; switch (g_gxState.vtxDesc[attr]) { - DEFAULT_FATAL("unhandled attribute type {}", static_cast(g_gxState.vtxDesc[attr])); + DEFAULT_FATAL("unhandled attribute type {}", g_gxState.vtxDesc[attr]); case GX_NONE: break; case GX_DIRECT: #define COMBINE(val1, val2, val3) (((val1) << 16) | ((val2) << 8) | (val3)) switch (COMBINE(attr, attrFmt.cnt, attrFmt.type)) { - DEFAULT_FATAL("not handled: attr {}, cnt {}, type {}", static_cast(attr), static_cast(attrFmt.cnt), - static_cast(attrFmt.type)); + DEFAULT_FATAL("not handled: attr {}, cnt {}, type {}", attr, attrFmt.cnt, attrFmt.type); case COMBINE(GX_VA_POS, GX_POS_XYZ, GX_F32): case COMBINE(GX_VA_NRM, GX_NRM_XYZ, GX_F32): attrArrays[attr].count = 3; @@ -150,12 +118,10 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u for (u32 v = 0; v < vtxCount; ++v) { for (int attr = 0; attr < GX_VA_MAX_ATTR; attr++) { if (g_gxState.vtxDesc[attr] == GX_INDEX8) { - u16 index = *ptr; - buf.append(&index, 2); + buf.append(static_cast(*ptr)); ++ptr; } else if (g_gxState.vtxDesc[attr] == GX_INDEX16) { - u16 index = bswap16(*reinterpret_cast(ptr)); - buf.append(&index, 2); + buf.append(bswap(*reinterpret_cast(ptr))); ptr += 2; } if 
(g_gxState.vtxDesc[attr] != GX_DIRECT) { @@ -182,7 +148,7 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u break; case GX_U16: for (int i = 0; i < count; ++i) { - const auto value = bswap16(reinterpret_cast(ptr)[i]); + const auto value = bswap(reinterpret_cast(ptr)[i]); out[i] = static_cast(value) / static_cast(1 << attrFmt.frac); } buf.append(out.data(), sizeof(f32) * count); @@ -190,7 +156,7 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u break; case GX_S16: for (int i = 0; i < count; ++i) { - const auto value = bswap16(reinterpret_cast(ptr)[i]); + const auto value = bswap(reinterpret_cast(ptr)[i]); out[i] = static_cast(value) / static_cast(1 << attrFmt.frac); } buf.append(out.data(), sizeof(f32) * count); @@ -198,7 +164,7 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u break; case GX_F32: for (int i = 0; i < count; ++i) { - out[i] = bswap32(reinterpret_cast(ptr)[i]); + out[i] = bswap(reinterpret_cast(ptr)[i]); } buf.append(out.data(), sizeof(f32) * count); ptr += count * sizeof(f32); @@ -227,7 +193,7 @@ static u16 prepare_idx_buffer(ByteBuffer& buf, GXPrimitive prim, u16 vtxStart, u buf.reserve_extra(vtxCount * sizeof(u16)); for (u16 v = 0; v < vtxCount; ++v) { const u16 idx = vtxStart + v; - buf.append(&idx, sizeof(u16)); + buf.append(idx); ++numIndices; } } else if (prim == GX_TRIANGLEFAN) { @@ -235,29 +201,26 @@ static u16 prepare_idx_buffer(ByteBuffer& buf, GXPrimitive prim, u16 vtxStart, u for (u16 v = 0; v < vtxCount; ++v) { const u16 idx = vtxStart + v; if (v < 3) { - buf.append(&idx, sizeof(u16)); + buf.append(idx); ++numIndices; continue; } - const std::array idxs{vtxStart, u16(idx - 1), idx}; - buf.append(idxs.data(), sizeof(u16) * 3); + buf.append(std::array{vtxStart, static_cast(idx - 1), idx}); numIndices += 3; } } else if (prim == GX_TRIANGLESTRIP) { - buf.reserve_extra(((u32(vtxCount) - 3) * 3 + 3) * sizeof(u16)); + 
buf.reserve_extra(((static_cast(vtxCount) - 3) * 3 + 3) * sizeof(u16)); for (u16 v = 0; v < vtxCount; ++v) { const u16 idx = vtxStart + v; if (v < 3) { - buf.append(&idx, sizeof(u16)); + buf.append(idx); ++numIndices; continue; } if ((v & 1) == 0) { - const std::array idxs{u16(idx - 2), u16(idx - 1), idx}; - buf.append(idxs.data(), sizeof(u16) * 3); + buf.append(std::array{static_cast(idx - 2), static_cast(idx - 1), idx}); } else { - const std::array idxs{u16(idx - 1), u16(idx - 2), idx}; - buf.append(idxs.data(), sizeof(u16) * 3); + buf.append(std::array{static_cast(idx - 1), static_cast(idx - 2), idx}); } numIndices += 3; } @@ -271,6 +234,7 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept { Range vertRange, idxRange; u32 numIndices = 0; IndexedAttrs indexedAttrs{}; + GXVtxFmt fmt = GX_MAX_VTXFMT; auto it = sCachedDisplayLists.find(hash); if (it != sCachedDisplayLists.end()) { const auto& cache = it->second; @@ -278,6 +242,7 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept { vertRange = push_verts(cache.vtxBuf.data(), cache.vtxBuf.size()); idxRange = push_indices(cache.idxBuf.data(), cache.idxBuf.size()); indexedAttrs = cache.indexedAttrs; + fmt = cache.fmt; } else { const u8* data = dlStart; u32 pos = 0; @@ -302,8 +267,12 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept { case GX_DRAW_TRIANGLE_STRIP: case GX_DRAW_TRIANGLE_FAN: { const auto prim = static_cast(opcode); - const auto fmt = static_cast(cmd & GX_VAT_MASK); - u16 vtxCount = bswap16(*reinterpret_cast(data + pos)); + const auto newFmt = static_cast(cmd & GX_VAT_MASK); + if (fmt != GX_MAX_VTXFMT && fmt != newFmt) { + FATAL("Vertex format changed mid-display list: {} -> {}", fmt, newFmt); + } + fmt = newFmt; + u16 vtxCount = bswap(*reinterpret_cast(data + pos)); pos += 2; pos += vtxCount * prepare_vtx_buffer(vtxBuf, fmt, data + pos, vtxCount, indexedAttrs); numIndices += prepare_idx_buffer(idxBuf, prim, vtxStart, vtxCount); @@ -319,22 +288,16 @@ void 
queue_surface(const u8* dlStart, u32 dlSize) noexcept { } vertRange = push_verts(vtxBuf.data(), vtxBuf.size()); idxRange = push_indices(idxBuf.data(), idxBuf.size()); - sCachedDisplayLists.try_emplace(hash, std::move(vtxBuf), std::move(idxBuf), indexedAttrs); + sCachedDisplayLists.try_emplace(hash, std::move(vtxBuf), std::move(idxBuf), indexedAttrs, fmt); } gx::BindGroupRanges ranges{}; - int lastIndexedAttr = -1; for (int i = 0; i < GX_VA_MAX_ATTR; ++i) { if (!indexedAttrs[i]) { continue; } auto& array = gx::g_gxState.arrays[i]; - if (lastIndexedAttr >= 0 && array == gx::g_gxState.arrays[lastIndexedAttr]) { - // Reuse range from last attribute in shader - // Don't set the output range, so it remains unbound - const auto range = gx::g_gxState.arrays[lastIndexedAttr].cachedRange; - array.cachedRange = range; - } else if (array.cachedRange.size > 0) { + if (array.cachedRange.size > 0) { // Use the currently cached range ranges.vaRanges[i] = array.cachedRange; } else { @@ -343,11 +306,10 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept { ranges.vaRanges[i] = range; array.cachedRange = range; } - lastIndexedAttr = i; } model::PipelineConfig config{}; - populate_pipeline_config(config, GX_TRIANGLES); + populate_pipeline_config(config, GX_TRIANGLES, fmt); const auto info = gx::build_shader_info(config.shaderConfig); const auto bindGroups = gx::build_bind_groups(info, config.shaderConfig, ranges); const auto pipeline = pipeline_ref(config); @@ -366,7 +328,7 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept { State construct_state() { return {}; } -wgpu::RenderPipeline create_pipeline(const State& state, [[maybe_unused]] const PipelineConfig& config) { +wgpu::RenderPipeline create_pipeline(const State& state, const PipelineConfig& config) { const auto info = build_shader_info(config.shaderConfig); // TODO remove const auto shader = build_shader(config.shaderConfig, info); @@ -385,7 +347,7 @@ wgpu::RenderPipeline create_pipeline(const State& state, 
[[maybe_unused]] const // Indexed attributes for (u32 i = 0; i < num4xAttr; ++i) { vtxAttrs[shaderLocation] = { - .format = wgpu::VertexFormat::Sint16x4, + .format = wgpu::VertexFormat::Uint16x4, .offset = offset, .shaderLocation = shaderLocation, }; @@ -394,7 +356,7 @@ wgpu::RenderPipeline create_pipeline(const State& state, [[maybe_unused]] const } for (u32 i = 0; i < num2xAttr; ++i) { vtxAttrs[shaderLocation] = { - .format = wgpu::VertexFormat::Sint16x2, + .format = wgpu::VertexFormat::Uint16x2, .offset = offset, .shaderLocation = shaderLocation, }; diff --git a/lib/gfx/stream/shader.cpp b/lib/gfx/stream/shader.cpp deleted file mode 100644 index 842a03f..0000000 --- a/lib/gfx/stream/shader.cpp +++ /dev/null @@ -1,82 +0,0 @@ -#include "shader.hpp" - -#include "../../webgpu/gpu.hpp" - -namespace aurora::gfx::stream { -static Module Log("aurora::gfx::stream"); - -using webgpu::g_device; - -wgpu::RenderPipeline create_pipeline(const State& state, [[maybe_unused]] const PipelineConfig& config) { - const auto info = build_shader_info(config.shaderConfig); // TODO remove - const auto shader = build_shader(config.shaderConfig, info); - - std::array attributes{}; - attributes[0] = wgpu::VertexAttribute{ - .format = wgpu::VertexFormat::Float32x3, - .offset = 0, - .shaderLocation = 0, - }; - uint64_t offset = 12; - uint32_t shaderLocation = 1; - if (config.shaderConfig.vtxAttrs[GX_VA_NRM] == GX_DIRECT) { - attributes[shaderLocation] = wgpu::VertexAttribute{ - .format = wgpu::VertexFormat::Float32x3, - .offset = offset, - .shaderLocation = shaderLocation, - }; - offset += 12; - shaderLocation++; - } - if (config.shaderConfig.vtxAttrs[GX_VA_CLR0] == GX_DIRECT) { - attributes[shaderLocation] = wgpu::VertexAttribute{ - .format = wgpu::VertexFormat::Float32x4, - .offset = offset, - .shaderLocation = shaderLocation, - }; - offset += 16; - shaderLocation++; - } - for (int i = GX_VA_TEX0; i < GX_VA_TEX7; ++i) { - if (config.shaderConfig.vtxAttrs[i] != GX_DIRECT) { - continue; - } 
- attributes[shaderLocation] = wgpu::VertexAttribute{ - .format = wgpu::VertexFormat::Float32x2, - .offset = offset, - .shaderLocation = shaderLocation, - }; - offset += 8; - shaderLocation++; - } - const std::array vertexBuffers{wgpu::VertexBufferLayout{ - .arrayStride = offset, - .attributeCount = shaderLocation, - .attributes = attributes.data(), - }}; - - return build_pipeline(config, info, vertexBuffers, shader, "Stream Pipeline"); -} - -State construct_state() { return {}; } - -void render(const State& state, const DrawData& data, const wgpu::RenderPassEncoder& pass) { - if (!bind_pipeline(data.pipeline, pass)) { - return; - } - - const std::array offsets{data.uniformRange.offset}; - pass.SetBindGroup(0, find_bind_group(data.bindGroups.uniformBindGroup), offsets.size(), offsets.data()); - if (data.bindGroups.samplerBindGroup && data.bindGroups.textureBindGroup) { - pass.SetBindGroup(1, find_bind_group(data.bindGroups.samplerBindGroup)); - pass.SetBindGroup(2, find_bind_group(data.bindGroups.textureBindGroup)); - } - pass.SetVertexBuffer(0, g_vertexBuffer, data.vertRange.offset, data.vertRange.size); - pass.SetIndexBuffer(g_indexBuffer, wgpu::IndexFormat::Uint16, data.indexRange.offset, data.indexRange.size); - if (data.dstAlpha != UINT32_MAX) { - const wgpu::Color color{0.f, 0.f, 0.f, data.dstAlpha / 255.f}; - pass.SetBlendConstant(&color); - } - pass.DrawIndexed(data.indexCount); -} -} // namespace aurora::gfx::stream diff --git a/lib/gfx/stream/shader.hpp b/lib/gfx/stream/shader.hpp deleted file mode 100644 index ffbde72..0000000 --- a/lib/gfx/stream/shader.hpp +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once - -#include "../common.hpp" -#include "../gx.hpp" - -namespace aurora::gfx::stream { -struct DrawData { - PipelineRef pipeline; - Range vertRange; - Range uniformRange; - Range indexRange; - uint32_t indexCount; - gx::GXBindGroups bindGroups; - u32 dstAlpha; -}; - -struct PipelineConfig : public gx::PipelineConfig {}; - -struct State {}; - -State 
construct_state(); -wgpu::RenderPipeline create_pipeline(const State& state, [[maybe_unused]] const PipelineConfig& config); -void render(const State& state, const DrawData& data, const wgpu::RenderPassEncoder& pass); -} // namespace aurora::gfx::stream diff --git a/lib/gfx/texture_convert.cpp b/lib/gfx/texture_convert.cpp index 6148ea9..d0bff21 100644 --- a/lib/gfx/texture_convert.cpp +++ b/lib/gfx/texture_convert.cpp @@ -66,17 +66,6 @@ static size_t ComputeMippedBlockCountDXT1(uint32_t w, uint32_t h, uint32_t mips) return ret; } -template -constexpr T bswap16(T val) noexcept { -#if __GNUC__ - return __builtin_bswap16(val); -#elif _WIN32 - return _byteswap_ushort(val); -#else - return (val = (val << 8) | ((val >> 8) & 0xFF)); -#endif -} - template concept TextureDecoder = requires(T) { typename T::Source; @@ -178,15 +167,15 @@ struct TextureDecoderIA4 { }; struct TextureDecoderIA8 { - using Source = uint8_t; + using Source = uint16_t; using Target = RGBA8; static constexpr uint32_t Frac = 1; - static constexpr uint32_t BlockWidth = 8; + static constexpr uint32_t BlockWidth = 4; static constexpr uint32_t BlockHeight = 4; static void decode_texel(Target* target, const Source* in, const uint32_t x) { - const auto texel = bswap16(in[x]); + const auto texel = bswap(in[x]); const uint8_t intensity = texel >> 8; target[x].r = intensity; target[x].g = intensity; @@ -228,7 +217,7 @@ struct TextureDecoderRGB565 { static constexpr uint32_t BlockHeight = 4; static void decode_texel(Target* target, const Source* in, const uint32_t x) { - const auto texel = bswap16(in[x]); + const auto texel = bswap(in[x]); target[x].r = ExpandTo8<5>(texel >> 11 & 0x1f); target[x].g = ExpandTo8<6>(texel >> 5 & 0x3f); target[x].b = ExpandTo8<5>(texel & 0x1f); @@ -245,7 +234,7 @@ struct TextureDecoderRGB5A3 { static constexpr uint32_t BlockHeight = 4; static void decode_texel(Target* target, const Source* in, const uint32_t x) { - const auto texel = bswap16(in[x]); + const auto texel = 
bswap(in[x]); if ((texel & 0x8000) != 0) { target[x].r = ExpandTo8<5>(texel >> 10 & 0x1f); target[x].g = ExpandTo8<5>(texel >> 5 & 0x1f); @@ -322,8 +311,8 @@ static ByteBuffer BuildDXT1FromGCN(uint32_t width, uint32_t height, uint32_t mip for (uint32_t y = 0; y < 2; ++y) { DXT1Block* target = targetMip + (baseY + y) * w + baseX; for (size_t x = 0; x < 2; ++x) { - target[x].color1 = bswap16(in[x].color1); - target[x].color2 = bswap16(in[x].color2); + target[x].color1 = bswap(in[x].color1); + target[x].color2 = bswap(in[x].color2); for (size_t i = 0; i < 4; ++i) { std::array ind; const uint8_t packed = in[x].lines[i]; @@ -365,8 +354,8 @@ static ByteBuffer BuildRGBA8FromCMPR(uint32_t width, uint32_t height, uint32_t m for (uint32_t yb = 0; yb < 8; yb += 4) { for (uint32_t xb = 0; xb < 8; xb += 4) { // CMPR difference: Big-endian color1/2 - const uint16_t color1 = bswap16(*reinterpret_cast(src)); - const uint16_t color2 = bswap16(*reinterpret_cast(src + 2)); + const uint16_t color1 = bswap(*reinterpret_cast(src)); + const uint16_t color2 = bswap(*reinterpret_cast(src + 2)); src += 4; // Fill in first two colors in color table. 
@@ -480,4 +469,4 @@ ByteBuffer convert_tlut(u32 format, uint32_t width, ArrayRef data) { return DecodeLinear(width, data); } } -} // namespace aurora::gfx +} // namespace aurora::gfx \ No newline at end of file diff --git a/lib/internal.hpp b/lib/internal.hpp index 5423030..162c16b 100644 --- a/lib/internal.hpp +++ b/lib/internal.hpp @@ -6,6 +6,8 @@ #include #include +#include +#include #include using namespace std::string_view_literals; @@ -21,6 +23,46 @@ using namespace std::string_view_literals; #endif #endif +template + requires(sizeof(T) == sizeof(uint16_t) && std::is_arithmetic_v) +constexpr T bswap(T val) noexcept { + union { + uint16_t u; + T t; + } v{.t = val}; +#if __GNUC__ + v.u = __builtin_bswap16(v.u); +#elif _WIN32 + v.u = _byteswap_ushort(v.u); +#else + v.u = (v.u << 8) | ((v.u >> 8) & 0xFF); +#endif + return v.t; +} + +template + requires(sizeof(T) == sizeof(uint32_t) && std::is_arithmetic_v) +constexpr T bswap(T val) noexcept { + union { + uint32_t u; + T t; + } v{.t = val}; +#if __GNUC__ + v.u = __builtin_bswap32(v.u); +#elif _WIN32 + v.u = _byteswap_ulong(v.u); +#else + v.u = ((v.u & 0x0000FFFF) << 16) | ((v.u & 0xFFFF0000) >> 16) | ((v.u & 0x00FF00FF) << 8) | ((v.u & 0xFF00FF00) >> 8); +#endif + return v.t; +} + +template + requires(std::is_enum_v) +auto underlying(T value) -> std::underlying_type_t { + return static_cast>(value); +} + #ifndef ALIGN #define ALIGN(x, a) (((x) + ((a) - 1)) & ~((a) - 1)) #endif @@ -33,11 +75,7 @@ using namespace std::string_view_literals; #else #define UNLIKELY #endif -#define FATAL(msg, ...) \ - { \ - Log.fatal(msg, ##__VA_ARGS__); \ - unreachable(); \ - } +#define FATAL(msg, ...) Log.fatal(msg, ##__VA_ARGS__); #define ASSERT(cond, msg, ...) 
\ if (!(cond)) \ UNLIKELY FATAL(msg, ##__VA_ARGS__) diff --git a/lib/logging.hpp b/lib/logging.hpp index bed551c..09cae88 100644 --- a/lib/logging.hpp +++ b/lib/logging.hpp @@ -4,15 +4,9 @@ #include #include -#include -#ifdef __GNUC__ -[[noreturn]] inline __attribute__((always_inline)) void unreachable() { __builtin_unreachable(); } -#elif defined(_MSC_VER) -[[noreturn]] __forceinline void unreachable() { __assume(false); } -#else -#error Unknown compiler -#endif +#include +#include namespace aurora { void log_internal(AuroraLogLevel level, const char* module, const char* message, unsigned int len) noexcept; @@ -50,7 +44,7 @@ struct Module { template [[noreturn]] void fatal(fmt::format_string fmt, T&&... args) noexcept { report(LOG_FATAL, fmt, std::forward(args)...); - unreachable(); + std::abort(); } }; } // namespace aurora diff --git a/lib/webgpu/gpu.cpp b/lib/webgpu/gpu.cpp index 3944fc0..cce583d 100644 --- a/lib/webgpu/gpu.cpp +++ b/lib/webgpu/gpu.cpp @@ -385,15 +385,12 @@ bool initialize(AuroraBackend auroraBackend) { g_adapter.GetLimits(&supportedLimits); const wgpu::Limits requiredLimits{ // Use "best" supported alignments - .maxTextureDimension1D = supportedLimits.maxTextureDimension1D == 0 - ? WGPU_LIMIT_U32_UNDEFINED - : supportedLimits.maxTextureDimension1D, - .maxTextureDimension2D = supportedLimits.maxTextureDimension2D == 0 - ? WGPU_LIMIT_U32_UNDEFINED - : supportedLimits.maxTextureDimension2D, - .maxTextureDimension3D = supportedLimits.maxTextureDimension3D == 0 - ? WGPU_LIMIT_U32_UNDEFINED - : supportedLimits.maxTextureDimension3D, + .maxTextureDimension1D = supportedLimits.maxTextureDimension1D == 0 ? WGPU_LIMIT_U32_UNDEFINED + : supportedLimits.maxTextureDimension1D, + .maxTextureDimension2D = supportedLimits.maxTextureDimension2D == 0 ? WGPU_LIMIT_U32_UNDEFINED + : supportedLimits.maxTextureDimension2D, + .maxTextureDimension3D = supportedLimits.maxTextureDimension3D == 0 ? 
WGPU_LIMIT_U32_UNDEFINED + : supportedLimits.maxTextureDimension3D, .minUniformBufferOffsetAlignment = supportedLimits.minUniformBufferOffsetAlignment == 0 ? WGPU_LIMIT_U32_UNDEFINED : supportedLimits.minUniformBufferOffsetAlignment, @@ -401,6 +398,12 @@ bool initialize(AuroraBackend auroraBackend) { ? WGPU_LIMIT_U32_UNDEFINED : supportedLimits.minStorageBufferOffsetAlignment, }; + Log.info( + "Using limits\n maxTextureDimension1D: {}\n maxTextureDimension2D: {}\n maxTextureDimension3D: {}\n " + "minUniformBufferOffsetAlignment: {}\n minStorageBufferOffsetAlignment: {}", + requiredLimits.maxTextureDimension1D, requiredLimits.maxTextureDimension2D, + requiredLimits.maxTextureDimension3D, requiredLimits.minUniformBufferOffsetAlignment, + requiredLimits.minStorageBufferOffsetAlignment); std::vector requiredFeatures; wgpu::SupportedFeatures supportedFeatures; g_adapter.GetFeatures(&supportedFeatures); @@ -442,22 +445,20 @@ bool initialize(AuroraBackend auroraBackend) { }); deviceDescriptor.SetUncapturedErrorCallback( [](const wgpu::Device& device, wgpu::ErrorType type, wgpu::StringView message) { - FATAL("WebGPU error {}: {}", static_cast(type), message); - }); - deviceDescriptor.SetDeviceLostCallback( - wgpu::CallbackMode::AllowSpontaneous, - [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) { - Log.warn("Device lost: {}", message); - }); - const auto future = g_adapter.RequestDevice( - &deviceDescriptor, wgpu::CallbackMode::WaitAnyOnly, - [](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { - if (status == wgpu::RequestDeviceStatus::Success) { - g_device = std::move(device); - } else { - Log.warn("Device request failed: {}", message); - } + FATAL("WebGPU error {}: {}", underlying(type), message); }); + deviceDescriptor.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, + [](const wgpu::Device& device, wgpu::DeviceLostReason reason, + wgpu::StringView message) { Log.warn("Device lost: {}", 
message); }); + const auto future = + g_adapter.RequestDevice(&deviceDescriptor, wgpu::CallbackMode::WaitAnyOnly, + [](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { + if (status == wgpu::RequestDeviceStatus::Success) { + g_device = std::move(device); + } else { + Log.warn("Device request failed: {}", message); + } + }); const auto status = g_instance.WaitAny(future, 5000000000); if (status != wgpu::WaitStatus::Success) { Log.error("Failed to create device: {}", magic_enum::enum_name(status));