Rewrite attribute buffer, matrix & stream handling

Array attributes (GXSetArray) are now fetched correctly
based on the vertex format (component count, component
type and fractional bits). Buffers are still assumed to
be byte-swapped to little-endian.
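
As an illustration of the per-format fetch (a hedged sketch, not
code from this commit; ArrayData and decode_s16 are hypothetical
names), a component of an indexed attribute is read from the
GXSetArray buffer at index * stride and scaled by the format's
fractional bits:

    #include <cstdint>
    #include <cstring>

    // Hypothetical mirror of the state this path reads: GXSetArray
    // supplies the buffer pointer and stride, GXSetVtxAttrFmt the
    // component type and fractional bits (frac).
    struct ArrayData {
      const uint8_t* data;
      uint8_t stride; // bytes between consecutive elements
    };

    // Decode one signed 16-bit component of element `index`, assuming
    // the buffer was already byte-swapped to little-endian.
    inline float decode_s16(const ArrayData& arr, uint32_t index,
                            uint32_t comp, uint8_t frac) {
      int16_t raw;
      std::memcpy(&raw, arr.data + index * arr.stride + comp * sizeof(int16_t),
                  sizeof(raw));
      return static_cast<float>(raw) / static_cast<float>(1 << frac);
    }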

Stream (immediate-mode) handling has been completely
redone and many issues resolved: each vertex is now
written with a fixed per-vertex layout, and every
attribute call is validated against the current vertex
descriptor.
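
For context, a minimal immediate-mode submission exercising this
path might look like the following (illustrative usage of the GX
API, not code from this commit); the attribute calls must match the
GXSetVtxDesc/GXSetVtxAttrFmt state, and the rewritten stream code
now asserts on any mismatch:

    GXClearVtxDesc();
    GXSetVtxDesc(GX_VA_POS, GX_DIRECT);
    GXSetVtxDesc(GX_VA_CLR0, GX_DIRECT);
    GXSetVtxAttrFmt(GX_VTXFMT0, GX_VA_POS, GX_POS_XYZ, GX_F32, 0);
    GXSetVtxAttrFmt(GX_VTXFMT0, GX_VA_CLR0, GX_CLR_RGBA, GX_RGBA8, 0);
    GXBegin(GX_TRIANGLES, GX_VTXFMT0, 3);
    GXPosition3f32(0.f, 0.f, 0.f);
    GXColor4u8(255, 0, 0, 255);
    GXPosition3f32(1.f, 0.f, 0.f);
    GXColor4u8(0, 255, 0, 255);
    GXPosition3f32(0.f, 1.f, 0.f);
    GXColor4u8(0, 0, 255, 255);
    GXEnd();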

Eliminates matrix transposes: GX's row-major matrices are
now consumed directly, so AURORA_NATIVE_MATRIX is no
longer necessary and has been removed.
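
This works because a GX 3x4 matrix stores rows (translation in each
row's w): transforming a point is three dot products against those
rows, so shaders can index the rows directly and no host-side
transpose is needed. A sketch using the Vec/Mat3x4 types from this
repo's math header (transform_point is illustrative only):

    // Row-major 3x4 transform: each output component is the dot
    // product of one matrix row with the homogeneous input point.
    inline aurora::Vec3<float> transform_point(const aurora::Mat3x4<float>& m,
                                               const aurora::Vec3<float>& p) {
      const auto dot = [&](const aurora::Vec4<float>& row) {
        return row[0] * p.x + row[1] * p.y + row[2] * p.z + row[3]; // w == 1
      };
      return {dot(m.m0), dot(m.m1), dot(m.m2)};
    }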
Author: Luke Street
Date: 2025-04-14 17:16:13 -06:00
Parent: 3316ad9a7f
Commit: a600b0b84c
21 changed files with 1215 additions and 901 deletions


@@ -3,8 +3,6 @@ project(aurora LANGUAGES C CXX)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_CXX_STANDARD 20)
-option(AURORA_NATIVE_MATRIX "Assume OpenGL-layout matrices, disables transposing" OFF)
 add_subdirectory(extern)
 include(cmake/aurora_core.cmake)


@@ -4,7 +4,6 @@ add_library(aurora_gx STATIC
   lib/gfx/gx.cpp
   lib/gfx/gx_shader.cpp
   lib/gfx/texture_convert.cpp
-  lib/gfx/stream/shader.cpp
   lib/gfx/model/shader.cpp
   lib/dolphin/gx/GXBump.cpp
   lib/dolphin/gx/GXCull.cpp
@@ -28,9 +27,6 @@ add_library(aurora::gx ALIAS aurora_gx)
 target_link_libraries(aurora_gx PUBLIC aurora::core xxhash)
 target_link_libraries(aurora_gx PRIVATE absl::btree absl::flat_hash_map)
-if (AURORA_NATIVE_MATRIX)
-  target_compile_definitions(aurora_gx PRIVATE AURORA_NATIVE_MATRIX)
-endif ()
 if (EMSCRIPTEN)
   target_link_options(aurora_gx PUBLIC -sUSE_WEBGPU=1 -sASYNCIFY -sEXIT_RUNTIME)
   target_compile_definitions(aurora_gx PRIVATE ENABLE_BACKEND_WEBGPU)


@@ -35,9 +35,6 @@ struct Vec2 {
   constexpr Vec2() = default;
   constexpr Vec2(T x, T y) : x(x), y(y) {}
   AURORA_VEC2_EXTRA
-#ifdef METAFORCE
-  constexpr Vec2(const zeus::CVector2f& vec) : x(vec.x()), y(vec.y()) {}
-#endif
 
   bool operator==(const Vec2& rhs) const { return x == rhs.x && y == rhs.y; }
   bool operator!=(const Vec2& rhs) const { return !(*this == rhs); }
@@ -51,10 +48,6 @@ struct Vec3 {
   constexpr Vec3() = default;
   constexpr Vec3(T x, T y, T z) : x(x), y(y), z(z) {}
   AURORA_VEC3_EXTRA
-#ifdef METAFORCE
-  constexpr Vec3(const zeus::CVector3f& vec) : x(vec.x()), y(vec.y()), z(vec.z()) {}
-  operator zeus::CVector3f() const { return {x, y, z}; }
-#endif
 
   bool operator==(const Vec3& rhs) const { return x == rhs.x && y == rhs.y && z == rhs.z; }
   bool operator!=(const Vec3& rhs) const { return !(*this == rhs); }
@@ -77,10 +70,6 @@ struct Vec4 {
   // For Vec3 -> Vec4
   constexpr Vec4(Vec3<T> v, T w) : m{v.x, v.y, v.z, w} {}
   AURORA_VEC4_EXTRA
-#ifdef METAFORCE
-  constexpr Vec4(const zeus::CVector4f& vec) : x(vec.x()), y(vec.y()), z(vec.z()), w(vec.w()) {}
-  constexpr Vec4(const zeus::CColor& color) : x(color.r()), y(color.g()), z(color.b()), w(color.a()) {}
-#endif
 
   inline Vec4& operator=(const Vec4& other) {
     memcpy(&m, &other.m, sizeof(Vt));
@@ -119,7 +108,7 @@ struct Vec4 {
   bool operator!=(const Vec4& rhs) const { return !(*this == rhs); }
 };
 template <typename T>
-[[nodiscard]] inline Vec4<T> operator+(const Vec4<T>& a, const Vec4<T>& b) {
+[[nodiscard]] Vec4<T> operator+(const Vec4<T>& a, const Vec4<T>& b) {
 #ifdef USE_GCC_VECTOR_EXTENSIONS
   return a.m + b.m;
 #else
@@ -127,7 +116,7 @@ template <typename T>
 #endif
 }
 template <typename T>
-[[nodiscard]] inline Vec4<T> operator*(const Vec4<T>& a, const Vec4<T>& b) {
+[[nodiscard]] Vec4<T> operator*(const Vec4<T>& a, const Vec4<T>& b) {
 #ifdef USE_GCC_VECTOR_EXTENSIONS
   return a.m * b.m;
 #else
@@ -170,6 +159,18 @@ struct Mat4x2 {
   bool operator!=(const Mat4x2& rhs) const { return !(*this == rhs); }
 };
 template <typename T>
+struct Mat2x4 {
+  Vec4<T> m0{};
+  Vec4<T> m1{};
+  constexpr Mat2x4() = default;
+  constexpr Mat2x4(const Vec4<T>& m0, const Vec4<T>& m1) : m0(m0), m1(m1) {}
+  bool operator==(const Mat2x4& rhs) const { return m0 == rhs.m0 && m1 == rhs.m1; }
+  bool operator!=(const Mat2x4& rhs) const { return !(*this == rhs); }
+};
+static_assert(sizeof(Mat2x4<float>) == 32);
+
+template <typename T>
 struct Mat4x4;
 template <typename T>
 struct Mat3x4 {
@@ -180,10 +181,13 @@ struct Mat3x4 {
   constexpr Mat3x4() = default;
   constexpr Mat3x4(const Vec4<T>& m0, const Vec4<T>& m1, const Vec4<T>& m2) : m0(m0), m1(m1), m2(m2) {}
-  inline Mat4x4<T> to4x4() const;
-  inline Mat4x4<T> toTransposed4x4() const;
+  [[nodiscard]] Mat4x4<T> to4x4() const;
+  [[nodiscard]] Mat4x4<T> toTransposed4x4() const;
+
+  bool operator==(const Mat3x4& rhs) const { return m0 == rhs.m0 && m1 == rhs.m1 && m2 == rhs.m2; }
+  bool operator!=(const Mat3x4& rhs) const { return !(*this == rhs); }
 };
-static_assert(sizeof(Mat3x4<float>) == sizeof(float[3][4]));
+static_assert(sizeof(Mat3x4<float>) == 48);
 template <typename T>
 struct Mat4x4 {
   Vec4<T> m0{};
@@ -195,10 +199,6 @@ struct Mat4x4 {
   constexpr Mat4x4(const Vec4<T>& m0, const Vec4<T>& m1, const Vec4<T>& m2, const Vec4<T>& m3)
   : m0(m0), m1(m1), m2(m2), m3(m3) {}
   AURORA_MAT4X4_EXTRA
-#ifdef METAFORCE
-  constexpr Mat4x4(const zeus::CMatrix4f& m) : m0(m[0]), m1(m[1]), m2(m[2]), m3(m[3]) {}
-  constexpr Mat4x4(const zeus::CTransform& m) : Mat4x4(m.toMatrix4f()) {}
-#endif
 
   [[nodiscard]] Mat4x4 transpose() const {
     return {
@@ -208,23 +208,17 @@ struct Mat4x4 {
         {m0[3], m1[3], m2[3], m3[3]},
     };
   }
-  inline Mat4x4& operator=(const Mat4x4& other) {
-    m0 = other.m0;
-    m1 = other.m1;
-    m2 = other.m2;
-    m3 = other.m3;
-    return *this;
-  }
-  inline Vec4<T>& operator[](size_t i) { return *(&m0 + i); }
-  inline const Vec4<T>& operator[](size_t i) const { return *(&m0 + i); }
+  Mat4x4& operator=(const Mat4x4& other) = default;
+  Vec4<T>& operator[](size_t i) { return *(&m0 + i); }
+  const Vec4<T>& operator[](size_t i) const { return *(&m0 + i); }
   bool operator==(const Mat4x4& rhs) const { return m0 == rhs.m0 && m1 == rhs.m1 && m2 == rhs.m2 && m3 == rhs.m3; }
   bool operator!=(const Mat4x4& rhs) const { return !(*this == rhs); }
 };
-static_assert(sizeof(Mat4x4<float>) == sizeof(float[4][4]));
+static_assert(sizeof(Mat4x4<float>) == 64);
 template <typename T>
-[[nodiscard]] inline Mat4x4<T> operator*(const Mat4x4<T>& a, const Mat4x4<T>& b) {
+[[nodiscard]] Mat4x4<T> operator*(const Mat4x4<T>& a, const Mat4x4<T>& b) {
   Mat4x4<T> out;
   for (size_t i = 0; i < 4; ++i) {
     *(&out.m0 + i) = a.m0 * b[i].template shuffle<0, 0, 0, 0>() + a.m1 * b[i].template shuffle<1, 1, 1, 1>() +
@@ -233,28 +227,27 @@ template <typename T>
   return out;
 }
 template <typename T>
-[[nodiscard]] inline Mat4x4<T> Mat3x4<T>::to4x4() const {
+[[nodiscard]] Mat4x4<T> Mat3x4<T>::to4x4() const {
   return {
-      {m0.m[0], m0.m[1], m0.m[2], 0.f},
-      {m1.m[0], m1.m[1], m1.m[2], 0.f},
-      {m2.m[0], m2.m[1], m2.m[2], 0.f},
-      {m0.m[3], m1.m[3], m2.m[3], 1.f},
+      {m0[0], m0[1], m0[2], 0.f},
+      {m1[0], m1[1], m1[2], 0.f},
+      {m2[0], m2[1], m2[2], 0.f},
+      {m0[3], m1[3], m2[3], 1.f},
   };
 }
 template <typename T>
-[[nodiscard]] inline Mat4x4<T> Mat3x4<T>::toTransposed4x4() const {
+[[nodiscard]] Mat4x4<T> Mat3x4<T>::toTransposed4x4() const {
   return Mat4x4<T>{
-      m0,
-      m1,
-      m2,
-      {0.f, 0.f, 0.f, 1.f},
-  }
-      .transpose();
+      {m0[0], m1[0], m2[0], 0.f},
+      {m0[1], m1[1], m2[1], 0.f},
+      {m0[2], m1[2], m2[2], 0.f},
+      {m0[3], m1[3], m2[3], 1.f},
+  };
 }
-constexpr Mat4x4<float> Mat4x4_Identity{
-    Vec4<float>{1.f, 0.f, 0.f, 0.f},
-    Vec4<float>{0.f, 1.f, 0.f, 0.f},
-    Vec4<float>{0.f, 0.f, 1.f, 0.f},
-    Vec4<float>{0.f, 0.f, 0.f, 1.f},
+constexpr Mat4x4 Mat4x4_Identity{
+    Vec4{1.f, 0.f, 0.f, 0.f},
+    Vec4{0.f, 1.f, 0.f, 0.f},
+    Vec4{0.f, 0.f, 1.f, 0.f},
+    Vec4{0.f, 0.f, 0.f, 1.f},
 };
 } // namespace aurora


@@ -68,11 +68,11 @@ void GXTexCoord2s16(s16 s, s16 t);
 void GXTexCoord2u8(u8 s, u8 t);
 void GXTexCoord2s8(s8 s, s8 t);
-void GXTexCoord1f32(f32 s, f32 t);
-void GXTexCoord1u16(u16 s, u16 t);
-void GXTexCoord1s16(s16 s, s16 t);
-void GXTexCoord1u8(u8 s, u8 t);
-void GXTexCoord1s8(s8 s, s8 t);
+void GXTexCoord1f32(f32 s);
+void GXTexCoord1u16(u16 s);
+void GXTexCoord1s16(s16 s);
+void GXTexCoord1u8(u8 s);
+void GXTexCoord1s8(s8 s);
 void GXTexCoord1x16(u16 index);
 void GXTexCoord1x8(u8 index);


@@ -7,7 +7,6 @@ extern "C" {
 void GXSetVtxDesc(GXAttr attr, GXAttrType type) { update_gx_state(g_gxState.vtxDesc[attr], type); }
 
 void GXSetVtxDescv(GXVtxDescList* list) {
-  g_gxState.vtxDesc.fill({});
   while (list->attr != GX_VA_NULL) {
     update_gx_state(g_gxState.vtxDesc[list->attr], list->type);
     ++list;
@@ -17,8 +16,8 @@ void GXSetVtxDescv(GXVtxDescList* list) {
 void GXClearVtxDesc() { g_gxState.vtxDesc.fill({}); }
 
 void GXSetVtxAttrFmt(GXVtxFmt vtxfmt, GXAttr attr, GXCompCnt cnt, GXCompType type, u8 frac) {
-  CHECK(vtxfmt >= GX_VTXFMT0 && vtxfmt < GX_MAX_VTXFMT, "invalid vtxfmt {}", static_cast<int>(vtxfmt));
-  CHECK(attr >= GX_VA_PNMTXIDX && attr < GX_VA_MAX_ATTR, "invalid attr {}", static_cast<int>(attr));
+  CHECK(vtxfmt >= GX_VTXFMT0 && vtxfmt < GX_MAX_VTXFMT, "invalid vtxfmt {}", underlying(vtxfmt));
+  CHECK(attr >= GX_VA_PNMTXIDX && attr < GX_VA_MAX_ATTR, "invalid attr {}", underlying(attr));
   auto& fmt = g_gxState.vtxFmts[vtxfmt].attrs[attr];
   update_gx_state(fmt.cnt, cnt);
   update_gx_state(fmt.type, type);
@@ -38,7 +37,7 @@ void GXSetArray(GXAttr attr, const void* data, u32 size, u8 stride) {
 // TODO move GXBegin, GXEnd here
 
 void GXSetTexCoordGen2(GXTexCoordID dst, GXTexGenType type, GXTexGenSrc src, u32 mtx, GXBool normalize, u32 postMtx) {
-  CHECK(dst >= GX_TEXCOORD0 && dst <= GX_TEXCOORD7, "invalid tex coord {}", static_cast<int>(dst));
+  CHECK(dst >= GX_TEXCOORD0 && dst <= GX_TEXCOORD7, "invalid tex coord {}", underlying(dst));
   update_gx_state(g_gxState.tcgs[dst],
                   {type, src, static_cast<GXTexMtx>(mtx), static_cast<GXPTTexMtx>(postMtx), normalize});
 }


@@ -20,7 +20,7 @@ void GXGetVtxAttrFmt(GXVtxFmt idx, GXAttr attr, GXCompCnt* compCnt, GXCompType*
 // TODO GXGetViewportv
 
 void GXGetProjectionv(f32* p) {
-  const auto& mtx = g_gxState.origProj;
+  const auto& mtx = g_gxState.proj;
   p[0] = static_cast<float>(g_gxState.projType);
   p[1] = mtx.m0[0];
   p[3] = mtx.m1[1];


@@ -4,15 +4,8 @@ extern "C" {
 void GXSetProjection(const void* mtx_, GXProjectionType type) {
   const auto& mtx = *reinterpret_cast<const aurora::Mat4x4<float>*>(mtx_);
-  g_gxState.origProj = mtx;
   g_gxState.projType = type;
-  update_gx_state(g_gxState.proj,
-#ifdef AURORA_NATIVE_MATRIX
-                  mtx
-#else
-                  mtx.transpose()
-#endif
-  );
+  update_gx_state(g_gxState.proj, mtx);
 }
 
 // TODO GXSetProjectionv
@@ -20,13 +13,8 @@ void GXSetProjection(const void* mtx_, GXProjectionType type) {
 void GXLoadPosMtxImm(const void* mtx_, u32 id) {
   CHECK(id >= GX_PNMTX0 && id <= GX_PNMTX9, "invalid pn mtx {}", static_cast<int>(id));
   auto& state = g_gxState.pnMtx[id / 3];
-#ifdef AURORA_NATIVE_MATRIX
-  const auto& mtx = *reinterpret_cast<const aurora::Mat4x4<float>*>(mtx_);
+  const auto& mtx = *reinterpret_cast<const aurora::Mat3x4<float>*>(mtx_);
   update_gx_state(state.pos, mtx);
-#else
-  const auto* mtx = reinterpret_cast<const aurora::Mat3x4<float>*>(mtx_);
-  update_gx_state(state.pos, mtx->toTransposed4x4());
-#endif
 }
 
 // TODO GXLoadPosMtxIndx
@@ -34,56 +22,37 @@ void GXLoadPosMtxImm(const void* mtx_, u32 id) {
 void GXLoadNrmMtxImm(const void* mtx_, u32 id) {
   CHECK(id >= GX_PNMTX0 && id <= GX_PNMTX9, "invalid pn mtx {}", static_cast<int>(id));
   auto& state = g_gxState.pnMtx[id / 3];
-#ifdef AURORA_NATIVE_MATRIX
-  const auto& mtx = *reinterpret_cast<const aurora::Mat4x4<float>*>(mtx_);
+  const auto& mtx = *reinterpret_cast<const aurora::Mat3x4<float>*>(mtx_);
   update_gx_state(state.nrm, mtx);
-#else
-  const auto* mtx = reinterpret_cast<const aurora::Mat3x4<float>*>(mtx_);
-  update_gx_state(state.nrm, mtx->toTransposed4x4());
-#endif
 }
 
 // TODO GXLoadNrmMtxImm3x3
 // TODO GXLoadNrmMtxIndx3x3
 
 void GXSetCurrentMtx(u32 id) {
-  CHECK(id >= GX_PNMTX0 && id <= GX_PNMTX9, "invalid pn mtx {}", static_cast<int>(id));
+  CHECK(id >= GX_PNMTX0 && id <= GX_PNMTX9, "invalid pn mtx {}", id);
   update_gx_state(g_gxState.currentPnMtx, id / 3);
 }
 
 void GXLoadTexMtxImm(const void* mtx_, u32 id, GXTexMtxType type) {
   CHECK((id >= GX_TEXMTX0 && id <= GX_IDENTITY) || (id >= GX_PTTEXMTX0 && id <= GX_PTIDENTITY), "invalid tex mtx {}",
-        static_cast<int>(id));
+        id);
   if (id >= GX_PTTEXMTX0) {
-    CHECK(type == GX_MTX3x4, "invalid pt mtx type {}", static_cast<int>(type));
+    CHECK(type == GX_MTX3x4, "invalid pt mtx type {}", underlying(type));
     const auto idx = (id - GX_PTTEXMTX0) / 3;
-#ifdef AURORA_NATIVE_MATRIX
-    const auto& mtx = *reinterpret_cast<const aurora::Mat4x4<float>*>(mtx_);
-    update_gx_state<aurora::Mat4x4<float>>(g_gxState.ptTexMtxs[idx], mtx);
-#else
     const auto& mtx = *reinterpret_cast<const aurora::Mat3x4<float>*>(mtx_);
-    update_gx_state<aurora::Mat4x4<float>>(g_gxState.ptTexMtxs[idx], mtx.toTransposed4x4());
-#endif
+    update_gx_state(g_gxState.ptTexMtxs[idx], mtx);
   } else {
     const auto idx = (id - GX_TEXMTX0) / 3;
     switch (type) {
     case GX_MTX3x4: {
-#ifdef AURORA_NATIVE_MATRIX
-      const auto& mtx = *reinterpret_cast<const aurora::Mat4x4<float>*>(mtx_);
-      update_gx_state<aurora::gfx::gx::TexMtxVariant>(g_gxState.texMtxs[idx], mtx);
-#else
       const auto& mtx = *reinterpret_cast<const aurora::Mat3x4<float>*>(mtx_);
-      update_gx_state<aurora::gfx::gx::TexMtxVariant>(g_gxState.texMtxs[idx], mtx.toTransposed4x4());
-#endif
+      update_gx_state<aurora::gfx::gx::TexMtxVariant>(g_gxState.texMtxs[idx], mtx);
       break;
     }
     case GX_MTX2x4: {
-      const auto& mtx = *reinterpret_cast<const aurora::Mat4x2<float>*>(mtx_);
-#ifdef AURORA_NATIVE_MATRIX
+      const auto& mtx = *reinterpret_cast<const aurora::Mat2x4<float>*>(mtx_);
       update_gx_state<aurora::gfx::gx::TexMtxVariant>(g_gxState.texMtxs[idx], mtx);
-#else
-      update_gx_state<aurora::gfx::gx::TexMtxVariant>(g_gxState.texMtxs[idx], mtx.transpose());
-#endif
       break;
     }
     }


@@ -1,47 +1,113 @@
 #include "gx.hpp"
-#include "../../gfx/stream/shader.hpp"
-#include <algorithm>
+#include "aurora/math.hpp"
+#include "../../gfx/model/shader.hpp"
+#include "../../gfx/gx_fmt.hpp"
+#include <cstring>
 #include <optional>
 
-#ifndef NDEBUG
-static inline GXAttr next_attr(size_t begin) {
-  auto iter = std::find_if(g_gxState.vtxDesc.begin() + begin, g_gxState.vtxDesc.end(),
-                           [](const auto type) { return type != GX_NONE; });
-  if (begin > 0 && iter == g_gxState.vtxDesc.end()) {
-    // wrap around
-    iter = std::find_if(g_gxState.vtxDesc.begin(), g_gxState.vtxDesc.end(),
-                        [](const auto type) { return type != GX_NONE; });
-  }
-  return GXAttr(iter - g_gxState.vtxDesc.begin());
-}
-#endif
+struct Attribute {
+  uint32_t offset;
+  GXAttr attr;
+  GXAttrType type;
+  aurora::gfx::gx::VtxAttrFmt fmt;
+};
 
 struct SStreamState {
   GXPrimitive primitive;
   GXVtxFmt vtxFmt;
+  std::vector<Attribute> attrs;
+  u16 curAttr = 0;
   u16 vertexCount = 0;
-  u16 vertexStart = 0;
+  u16 vertexStart;
+  u16 vertexSize;
   aurora::ByteBuffer vertexBuffer;
+  uint8_t* vertexData = nullptr;
   std::vector<u16> indices;
-#ifndef NDEBUG
-  GXAttr nextAttr;
-#endif
 
-  explicit SStreamState(GXPrimitive primitive, GXVtxFmt vtxFmt, u16 numVerts, u16 vertexSize, u16 vertexStart) noexcept
-  : primitive(primitive), vtxFmt(vtxFmt), vertexStart(vertexStart) {
-    vertexBuffer.reserve_extra(size_t(numVerts) * vertexSize);
+  explicit SStreamState(GXPrimitive primitive, GXVtxFmt vtxFmt, std::vector<Attribute> attrs, u16 numVerts,
+                        u16 vertexSize, u16 vertexStart) noexcept
+  : primitive(primitive), vtxFmt(vtxFmt), attrs(std::move(attrs)), vertexStart(vertexStart), vertexSize(vertexSize) {
+    vertexBuffer.reserve_extra(static_cast<size_t>(numVerts) * vertexSize);
     if (numVerts > 3 && (primitive == GX_TRIANGLEFAN || primitive == GX_TRIANGLESTRIP)) {
-      indices.reserve((u32(numVerts) - 3) * 3 + 3);
+      indices.reserve(((static_cast<u32>(numVerts) - 3) * 3) + 3);
     } else if (numVerts > 4 && primitive == GX_QUADS) {
-      indices.reserve(u32(numVerts) / 4 * 6);
+      indices.reserve(static_cast<u32>(numVerts) / 4 * 6);
     } else {
       indices.reserve(numVerts);
     }
-#ifndef NDEBUG
-    nextAttr = next_attr(0);
-#endif
   }
+
+  [[maybe_unused]] u8 check_direct(GXAttr attr, GXCompCnt cnt, GXCompType type) noexcept {
+    const auto& curAttr = attrs[this->curAttr];
+    ASSERT(curAttr.attr == attr, "bad attribute order: {}, expected {}", attr, curAttr.attr);
+    ASSERT(curAttr.type == GX_DIRECT, "bad attribute type: {}, expected GX_DIRECT", curAttr.type);
+    ASSERT(curAttr.fmt.cnt == cnt, "bad attribute count: {}, expected {}", cnt, curAttr.fmt.cnt);
+    ASSERT(curAttr.fmt.type == type, "bad attribute type: {}, expected {}", type, curAttr.fmt.type);
+    return curAttr.fmt.frac;
+  }
+
+  void check_indexed(GXAttr attr, GXAttrType type) noexcept {
+    const auto& curAttr = attrs[this->curAttr];
+    ASSERT(curAttr.attr == attr, "bad attribute order: {}, expected {}", attr, curAttr.attr);
+    ASSERT(curAttr.type == type, "bad attribute type: {}, expected {}", type, curAttr.type);
+  }
+
+  template <typename T>
+  void append(const T& value) noexcept {
+    append_data(&value, sizeof(value), attrs[curAttr].offset);
+    next_attribute();
+  }
+
+private:
+  void append_data(const void* ptr, size_t size, uint32_t offset) {
+    if (vertexData == nullptr) {
+      const auto vertexStart = vertexBuffer.size();
+      vertexBuffer.append_zeroes(vertexSize);
+      vertexData = vertexBuffer.data() + vertexStart;
+      inc_vertex_count();
+    }
+    ASSERT(offset + size <= vertexSize, "bad attribute end: {}, expected {}", offset + size, vertexSize);
+    memcpy(vertexData + offset, ptr, size);
+  }
+
+  void next_attribute() noexcept {
+    curAttr = curAttr + 1;
+    if (curAttr >= attrs.size()) {
+      curAttr = 0;
+      vertexData = nullptr;
+    }
+  }
+
+  void inc_vertex_count() noexcept {
+    auto curVertex = vertexStart + vertexCount;
+    if (primitive == GX_LINES || primitive == GX_LINESTRIP || primitive == GX_POINTS) {
+      // Currently unsupported, skip
+      return;
+    }
+    if (primitive == GX_TRIANGLES || primitive == GX_TRIANGLESTRIP || vertexCount < 3) {
+      // pass
+    } else if (primitive == GX_TRIANGLEFAN) {
+      indices.push_back(vertexStart);
+      indices.push_back(curVertex - 1);
+    } /*else if (primitive == GX_TRIANGLESTRIP) {
+      if ((vertexCount & 1) == 0) {
+        indices.push_back(curVertex - 2);
+        indices.push_back(curVertex - 1);
+      } else {
+        indices.push_back(curVertex - 1);
+        indices.push_back(curVertex - 2);
+      }
+    }*/
+    else if (primitive == GX_QUADS) {
+      if ((vertexCount & 3) == 3) {
+        indices.push_back(curVertex - 3);
+        indices.push_back(curVertex - 1);
+      }
+    }
+    indices.push_back(curVertex);
+    ++vertexCount;
+  }
 };
@@ -51,228 +117,319 @@ static u16 lastVertexStart = 0;
 extern "C" {
 void GXBegin(GXPrimitive primitive, GXVtxFmt vtxFmt, u16 nVerts) {
   CHECK(!sStreamState, "Stream began twice!");
 
   uint16_t vertexSize = 0;
+  uint16_t numDirectAttrs = 0;
+  uint16_t numIndexedAttrs = 0;
   for (GXAttr attr{}; const auto type : g_gxState.vtxDesc) {
     if (type == GX_DIRECT) {
+      ++numDirectAttrs;
       if (attr == GX_VA_POS || attr == GX_VA_NRM) {
         vertexSize += 12;
       } else if (attr == GX_VA_CLR0 || attr == GX_VA_CLR1) {
         vertexSize += 16;
       } else if (attr >= GX_VA_TEX0 && attr <= GX_VA_TEX7) {
         vertexSize += 8;
-      } else UNLIKELY {
-        FATAL("dont know how to handle attr {}", static_cast<int>(attr));
-      }
+      } else
+        UNLIKELY { FATAL("dont know how to handle attr {}", attr); }
     } else if (type == GX_INDEX8 || type == GX_INDEX16) {
-      vertexSize += 2;
+      ++numIndexedAttrs;
     }
-    attr = GXAttr(attr + 1);
+    attr = static_cast<GXAttr>(attr + 1);
   }
+  auto [num4xAttr, rem] = std::div(numIndexedAttrs, 4);
+  u32 num2xAttr = 0;
+  if (rem > 2) {
+    ++num4xAttr;
+  } else if (rem > 0) {
+    ++num2xAttr;
+  }
+  u32 directStart = num4xAttr * 8 + num2xAttr * 4;
+  vertexSize += directStart;
+  u32 indexOffset = 0;
+  u32 directOffset = directStart;
+  std::vector<Attribute> attrs;
+  attrs.reserve(numDirectAttrs + numIndexedAttrs);
+  const auto& curVtxFmt = g_gxState.vtxFmts[vtxFmt];
+  for (GXAttr attr{}; const auto type : g_gxState.vtxDesc) {
+    if (type == GX_DIRECT) {
+      u32 attrSize;
+      if (attr == GX_VA_POS || attr == GX_VA_NRM) {
+        attrSize = 12;
+      } else if (attr == GX_VA_CLR0 || attr == GX_VA_CLR1) {
+        attrSize = 16;
+      } else if (attr >= GX_VA_TEX0 && attr <= GX_VA_TEX7) {
+        attrSize = 8;
+      } else
+        UNLIKELY { FATAL("dont know how to handle attr {}", attr); }
+      const auto& attrFmt = curVtxFmt.attrs[attr];
+      attrs.emplace_back(directOffset, attr, type, attrFmt);
+      directOffset += attrSize;
+    } else if (type == GX_INDEX8 || type == GX_INDEX16) {
+      attrs.emplace_back(indexOffset, attr, type);
+      indexOffset += 2;
+    }
+    attr = static_cast<GXAttr>(attr + 1);
+  }
   CHECK(vertexSize > 0, "no vtx attributes enabled?");
-  sStreamState.emplace(primitive, vtxFmt, nVerts, vertexSize, g_gxState.stateDirty ? 0 : lastVertexStart);
+  sStreamState.emplace(primitive, vtxFmt, std::move(attrs), nVerts, vertexSize,
+                       /*g_gxState.stateDirty ? 0 : lastVertexStart*/ 0);
 }
 
-static inline void check_attr_order(GXAttr attr) noexcept {
-#ifndef NDEBUG
-  CHECK(sStreamState, "Stream not started!");
-  CHECK(sStreamState->nextAttr == attr, "bad attribute order: {}, expected {}", static_cast<int>(attr),
-        static_cast<int>(sStreamState->nextAttr));
-  sStreamState->nextAttr = next_attr(attr + 1);
-#endif
-}
-
-void GXPosition3f32(float x, float y, float z) {
-  check_attr_order(GX_VA_POS);
-  auto& state = *sStreamState;
-  state.vertexBuffer.append(&x, sizeof(float));
-  state.vertexBuffer.append(&y, sizeof(float));
-  state.vertexBuffer.append(&z, sizeof(float));
-  auto curVertex = state.vertexStart + state.vertexCount;
-  if (state.primitive == GX_TRIANGLES || state.vertexCount < 3) {
-    // pass
-  } else if (state.primitive == GX_TRIANGLEFAN) {
-    state.indices.push_back(state.vertexStart);
-    state.indices.push_back(curVertex - 1);
-  } else if (state.primitive == GX_TRIANGLESTRIP) {
-    if ((state.vertexCount & 1) == 0) {
-      state.indices.push_back(curVertex - 2);
-      state.indices.push_back(curVertex - 1);
-    } else {
-      state.indices.push_back(curVertex - 1);
-      state.indices.push_back(curVertex - 2);
-    }
-  } else if (state.primitive == GX_QUADS) {
-    if ((state.vertexCount & 3) == 3) {
-      state.indices.push_back(curVertex - 3);
-      state.indices.push_back(curVertex - 1);
-    }
-  }
-  state.indices.push_back(curVertex);
-  ++state.vertexCount;
+void GXPosition3f32(f32 x, f32 y, f32 z) {
+  sStreamState->check_direct(GX_VA_POS, GX_POS_XYZ, GX_F32);
+  sStreamState->append(aurora::Vec3{x, y, z});
 }
 
 void GXPosition3u16(u16 x, u16 y, u16 z) {
-  const auto& attrFmt = g_gxState.vtxFmts[sStreamState->vtxFmt].attrs[GX_VA_POS];
-  GXPosition3f32(
-      static_cast<float>(x) / static_cast<f32>(1 << attrFmt.frac),
-      static_cast<float>(y) / static_cast<f32>(1 << attrFmt.frac),
-      static_cast<float>(z) / static_cast<f32>(1 << attrFmt.frac)
-  );
+  const auto frac = sStreamState->check_direct(GX_VA_POS, GX_POS_XYZ, GX_U16);
+  sStreamState->append(aurora::Vec3{
+      static_cast<f32>(x) / static_cast<f32>(1 << frac),
+      static_cast<f32>(y) / static_cast<f32>(1 << frac),
+      static_cast<f32>(z) / static_cast<f32>(1 << frac),
+  });
 }
 
 void GXPosition3s16(s16 x, s16 y, s16 z) {
-  const auto& attrFmt = g_gxState.vtxFmts[sStreamState->vtxFmt].attrs[GX_VA_POS];
-  GXPosition3f32(
-      static_cast<float>(x) / static_cast<f32>(1 << attrFmt.frac),
-      static_cast<float>(y) / static_cast<f32>(1 << attrFmt.frac),
-      static_cast<float>(z) / static_cast<f32>(1 << attrFmt.frac)
-  );
+  const auto frac = sStreamState->check_direct(GX_VA_POS, GX_POS_XYZ, GX_S16);
+  sStreamState->append(aurora::Vec3{
+      static_cast<f32>(x) / static_cast<f32>(1 << frac),
+      static_cast<f32>(y) / static_cast<f32>(1 << frac),
+      static_cast<f32>(z) / static_cast<f32>(1 << frac),
+  });
 }
 
 void GXPosition3u8(u8 x, u8 y, u8 z) {
-  const auto& attrFmt = g_gxState.vtxFmts[sStreamState->vtxFmt].attrs[GX_VA_POS];
-  GXPosition3f32(
-      static_cast<float>(x) / static_cast<f32>(1 << attrFmt.frac),
-      static_cast<float>(y) / static_cast<f32>(1 << attrFmt.frac),
-      static_cast<float>(z) / static_cast<f32>(1 << attrFmt.frac)
-  );
+  const auto frac = sStreamState->check_direct(GX_VA_POS, GX_POS_XYZ, GX_U8);
+  sStreamState->append(aurora::Vec3{
+      static_cast<f32>(x) / static_cast<f32>(1 << frac),
+      static_cast<f32>(y) / static_cast<f32>(1 << frac),
+      static_cast<f32>(z) / static_cast<f32>(1 << frac),
+  });
 }
 
 void GXPosition3s8(s8 x, s8 y, s8 z) {
-  const auto& attrFmt = g_gxState.vtxFmts[sStreamState->vtxFmt].attrs[GX_VA_POS];
-  GXPosition3f32(
-      static_cast<float>(x) / static_cast<f32>(1 << attrFmt.frac),
-      static_cast<float>(y) / static_cast<f32>(1 << attrFmt.frac),
-      static_cast<float>(z) / static_cast<f32>(1 << attrFmt.frac)
-  );
+  const auto frac = sStreamState->check_direct(GX_VA_POS, GX_POS_XYZ, GX_S8);
+  sStreamState->append(aurora::Vec3{
+      static_cast<f32>(x) / static_cast<f32>(1 << frac),
+      static_cast<f32>(y) / static_cast<f32>(1 << frac),
+      static_cast<f32>(z) / static_cast<f32>(1 << frac),
+  });
 }
 
-void GXPosition2f32(float x, float y) {
-  GXPosition3f32(x, y, 0.f);
+void GXPosition2f32(f32 x, f32 y) {
+  sStreamState->check_direct(GX_VA_POS, GX_POS_XY, GX_F32);
+  sStreamState->append(aurora::Vec3{x, y, 0.f});
 }
 
 void GXPosition2u16(u16 x, u16 y) {
-  GXPosition3u16(x, y, 0);
+  const auto frac = sStreamState->check_direct(GX_VA_POS, GX_POS_XY, GX_U16);
+  sStreamState->append(aurora::Vec3{
+      static_cast<f32>(x) / static_cast<f32>(1 << frac),
+      static_cast<f32>(y) / static_cast<f32>(1 << frac),
+      0.f,
+  });
 }
 
 void GXPosition2s16(s16 x, s16 y) {
-  GXPosition3s16(x, y, 0);
+  const auto frac = sStreamState->check_direct(GX_VA_POS, GX_POS_XY, GX_S16);
+  sStreamState->append(aurora::Vec3{
+      static_cast<f32>(x) / static_cast<f32>(1 << frac),
+      static_cast<f32>(y) / static_cast<f32>(1 << frac),
+      0.f,
+  });
 }
 
 void GXPosition2u8(u8 x, u8 y) {
-  GXPosition3u8(x, y, 0);
+  const auto frac = sStreamState->check_direct(GX_VA_POS, GX_POS_XY, GX_U8);
+  sStreamState->append(aurora::Vec3{
+      static_cast<f32>(x) / static_cast<f32>(1 << frac),
+      static_cast<f32>(y) / static_cast<f32>(1 << frac),
+      0.f,
+  });
 }
 
 void GXPosition2s8(s8 x, s8 y) {
-  GXPosition3s8(x, y, 0);
+  const auto frac = sStreamState->check_direct(GX_VA_POS, GX_POS_XY, GX_S8);
+  sStreamState->append(aurora::Vec3{
+      static_cast<f32>(x) / static_cast<f32>(1 << frac),
+      static_cast<f32>(y) / static_cast<f32>(1 << frac),
+      0.f,
+  });
 }
 
 void GXPosition1x16(u16 idx) {
-  check_attr_order(GX_VA_POS);
-  // keep aligned
-  if (sStreamState->vertexBuffer.size() % 4 != 0) {
-    sStreamState->vertexBuffer.append_zeroes(4 - (sStreamState->vertexBuffer.size() % 4));
-  }
-  sStreamState->vertexBuffer.append(&idx, 2);
+  sStreamState->check_indexed(GX_VA_POS, GX_INDEX16);
+  sStreamState->append<u16>(idx);
 }
 
 void GXPosition1x8(u8 idx) {
-  GXPosition1x16(idx);
+  sStreamState->check_indexed(GX_VA_POS, GX_INDEX8);
+  sStreamState->append<u16>(idx);
 }
 
-void GXNormal3f32(float x, float y, float z) {
-  check_attr_order(GX_VA_NRM);
-  sStreamState->vertexBuffer.append(&x, 4);
-  sStreamState->vertexBuffer.append(&y, 4);
-  sStreamState->vertexBuffer.append(&z, 4);
+void GXNormal3f32(f32 x, f32 y, f32 z) {
+  sStreamState->check_direct(GX_VA_NRM, GX_NRM_XYZ, GX_F32);
+  sStreamState->append(aurora::Vec3{x, y, z});
 }
 
 void GXNormal3s16(s16 x, s16 y, s16 z) {
-  const auto& attrFmt = g_gxState.vtxFmts[sStreamState->vtxFmt].attrs[GX_VA_NRM];
-  GXNormal3f32(
-      static_cast<float>(x) / static_cast<f32>(1 << attrFmt.frac),
-      static_cast<float>(y) / static_cast<f32>(1 << attrFmt.frac),
-      static_cast<float>(z) / static_cast<f32>(1 << attrFmt.frac)
-  );
+  const auto frac = sStreamState->check_direct(GX_VA_NRM, GX_NRM_XYZ, GX_S16);
+  sStreamState->append(aurora::Vec3{
+      static_cast<f32>(x) / static_cast<f32>(1 << frac),
+      static_cast<f32>(y) / static_cast<f32>(1 << frac),
+      static_cast<f32>(z) / static_cast<f32>(1 << frac),
+  });
 }
 
 void GXNormal3s8(s8 x, s8 y, s8 z) {
-  const auto& attrFmt = g_gxState.vtxFmts[sStreamState->vtxFmt].attrs[GX_VA_NRM];
-  GXNormal3f32(
-      static_cast<float>(x) / static_cast<f32>(1 << attrFmt.frac),
-      static_cast<float>(y) / static_cast<f32>(1 << attrFmt.frac),
-      static_cast<float>(z) / static_cast<f32>(1 << attrFmt.frac)
-  );
+  const auto frac = sStreamState->check_direct(GX_VA_NRM, GX_NRM_XYZ, GX_S8);
+  sStreamState->append(aurora::Vec3{
+      static_cast<f32>(x) / static_cast<f32>(1 << frac),
+      static_cast<f32>(y) / static_cast<f32>(1 << frac),
+      static_cast<f32>(z) / static_cast<f32>(1 << frac),
+  });
 }
 
-void GXNormal1x16(u16 idx) {
-  check_attr_order(GX_VA_NRM);
-  // keep aligned
-  if (sStreamState->vertexBuffer.size() % 4 != 0) {
-    sStreamState->vertexBuffer.append_zeroes(4 - (sStreamState->vertexBuffer.size() % 4));
-  }
-  sStreamState->vertexBuffer.append(&idx, 2);
+void GXNormal1x16(u16 index) {
+  sStreamState->check_indexed(GX_VA_NRM, GX_INDEX16);
+  sStreamState->append<u16>(index);
 }
 
-void GXNormal1x8(u8 idx) {
-  GXNormal1x16(idx);
+void GXNormal1x8(u8 index) {
+  sStreamState->check_indexed(GX_VA_NRM, GX_INDEX8);
+  sStreamState->append<u16>(index);
 }
 
-void GXColor4f32(float r, float g, float b, float a) {
-  check_attr_order(GX_VA_CLR0);
-  sStreamState->vertexBuffer.append(&r, 4);
-  sStreamState->vertexBuffer.append(&g, 4);
-  sStreamState->vertexBuffer.append(&b, 4);
-  sStreamState->vertexBuffer.append(&a, 4);
+void GXColor4f32(f32 r, f32 g, f32 b, f32 a) {
+  sStreamState->check_direct(GX_VA_CLR0, GX_CLR_RGBA, GX_RGBA8);
+  sStreamState->append(aurora::Vec4{r, g, b, a});
 }
 
 void GXColor4u8(u8 r, u8 g, u8 b, u8 a) {
-  GXColor4f32(static_cast<float>(r) / 255.f, static_cast<float>(g) / 255.f, static_cast<float>(b) / 255.f,
-              static_cast<float>(a) / 255.f);
+  sStreamState->check_direct(GX_VA_CLR0, GX_CLR_RGBA, GX_RGBA8);
+  sStreamState->append(aurora::Vec4{
+      static_cast<f32>(r) / 255.f,
+      static_cast<f32>(g) / 255.f,
+      static_cast<f32>(b) / 255.f,
+      static_cast<f32>(a) / 255.f,
+  });
 }
 
 void GXColor3u8(u8 r, u8 g, u8 b) {
-  GXColor4u8(r, g, b, 255);
+  sStreamState->check_direct(GX_VA_CLR0, GX_CLR_RGB, GX_RGB8);
+  sStreamState->append(aurora::Vec4{
+      static_cast<f32>(r) / 255.f,
+      static_cast<f32>(g) / 255.f,
+      static_cast<f32>(b) / 255.f,
+      1.f,
+  });
 }
 
-void GXColor1x16(u16 idx) {
-  check_attr_order(GX_VA_CLR0);
-  // keep aligned
-  if (sStreamState->vertexBuffer.size() % 4 != 0) {
-    sStreamState->vertexBuffer.append_zeroes(4 - (sStreamState->vertexBuffer.size() % 4));
-  }
-  sStreamState->vertexBuffer.append(&idx, 2);
+void GXColor1u32(u32 clr) {
+  sStreamState->check_direct(GX_VA_CLR0, GX_CLR_RGBA, GX_RGBA8);
+  sStreamState->append(aurora::Vec4{
+      static_cast<f32>((clr >> 24) & 0xff) / 255.f,
+      static_cast<f32>((clr >> 16) & 0xff) / 255.f,
+      static_cast<f32>((clr >> 8) & 0xff) / 255.f,
+      static_cast<f32>(clr & 0xff) / 255.f,
+  });
 }
 
-void GXColor1x8(u8 idx) {
-  GXColor1x16(idx);
+void GXColor1u16(u16 clr) {
+  sStreamState->check_direct(GX_VA_CLR0, GX_CLR_RGB, GX_RGB565);
+  sStreamState->append(aurora::Vec4{
+      static_cast<f32>((clr >> 11) & 0x1f) / 31.f,
+      static_cast<f32>((clr >> 5) & 0x3f) / 63.f,
+      static_cast<f32>(clr & 0x1f) / 31.f,
+      1.f,
+  });
 }
 
-void GXTexCoord2f32(float u, float v) {
-  check_attr_order(GX_VA_TEX0);
-  sStreamState->vertexBuffer.append(&u, 4);
-  sStreamState->vertexBuffer.append(&v, 4);
+void GXTexCoord2f32(f32 s, f32 t) {
+  sStreamState->check_direct(GX_VA_TEX0, GX_TEX_ST, GX_F32);
+  sStreamState->append(aurora::Vec2{s, t});
+}
+
+void GXTexCoord2u16(u16 s, u16 t) {
+  const auto frac = sStreamState->check_direct(GX_VA_TEX0, GX_TEX_ST, GX_U16);
+  sStreamState->append(aurora::Vec2{
+      static_cast<f32>(s) / static_cast<f32>(1 << frac),
+      static_cast<f32>(t) / static_cast<f32>(1 << frac),
+  });
 }
 
 void GXTexCoord2s16(s16 s, s16 t) {
-  const auto& attrFmt = g_gxState.vtxFmts[sStreamState->vtxFmt].attrs[GX_VA_TEX0];
-  GXTexCoord2f32(
-      static_cast<float>(s) / static_cast<f32>(1 << attrFmt.frac),
-      static_cast<float>(t) / static_cast<f32>(1 << attrFmt.frac)
-  );
+  const auto frac = sStreamState->check_direct(GX_VA_TEX0, GX_TEX_ST, GX_S16);
+  sStreamState->append(aurora::Vec2{
+      static_cast<f32>(s) / static_cast<f32>(1 << frac),
+      static_cast<f32>(t) / static_cast<f32>(1 << frac),
+  });
+}
+
+void GXTexCoord2u8(u8 s, u8 t) {
+  const auto frac = sStreamState->check_direct(GX_VA_TEX0, GX_TEX_ST, GX_U8);
+  sStreamState->append(aurora::Vec2{
+      static_cast<f32>(s) / static_cast<f32>(1 << frac),
+      static_cast<f32>(t) / static_cast<f32>(1 << frac),
+  });
+}
+
+void GXTexCoord2s8(s8 s, s8 t) {
+  const auto frac = sStreamState->check_direct(GX_VA_TEX0, GX_TEX_ST, GX_S8);
+  sStreamState->append(aurora::Vec2{
+      static_cast<f32>(s) / static_cast<f32>(1 << frac),
+      static_cast<f32>(t) / static_cast<f32>(1 << frac),
+  });
+}
+
+void GXTexCoord1f32(f32 s) {
+  sStreamState->check_direct(GX_VA_TEX0, GX_TEX_S, GX_F32);
+  sStreamState->append(aurora::Vec2{s, 0.f});
+}
+
+void GXTexCoord1u16(u16 s) {
+  const auto frac = sStreamState->check_direct(GX_VA_TEX0, GX_TEX_S, GX_U16);
+  sStreamState->append(aurora::Vec2{
+      static_cast<f32>(s) / static_cast<f32>(1 << frac),
+      0.f,
+  });
+}
+
+void GXTexCoord1s16(s16 s) {
+  const auto frac = sStreamState->check_direct(GX_VA_TEX0, GX_TEX_S, GX_S16);
+  sStreamState->append(aurora::Vec2{
+      static_cast<f32>(s) / static_cast<f32>(1 << frac),
+      0.f,
+  });
+}
+
+void GXTexCoord1u8(u8 s) {
+  const auto frac = sStreamState->check_direct(GX_VA_TEX0, GX_TEX_S, GX_U8);
+  sStreamState->append(aurora::Vec2{
+      static_cast<f32>(s) / static_cast<f32>(1 << frac),
+      0.f,
+  });
+}
+
+void GXTexCoord1s8(s8 s) {
+  const auto frac = sStreamState->check_direct(GX_VA_TEX0, GX_TEX_S, GX_S8);
+  sStreamState->append(aurora::Vec2{
+      static_cast<f32>(s) / static_cast<f32>(1 << frac),
+      0.f,
+  });
 }
 
-void GXTexCoord1x16(u16 idx) {
-  check_attr_order(GX_VA_TEX0);
-  // keep aligned
-  if (sStreamState->vertexBuffer.size() % 4 != 0) {
-    sStreamState->vertexBuffer.append_zeroes(4 - (sStreamState->vertexBuffer.size() % 4));
-  }
-  sStreamState->vertexBuffer.append(&idx, 2);
+void GXTexCoord1x16(u16 index) {
+  sStreamState->check_indexed(GX_VA_TEX0, GX_INDEX16);
+  sStreamState->append(index);
 }
 
-void GXTexCoord1x8(u8 idx) {
-  GXTexCoord1x16(idx);
+void GXTexCoord1x8(u8 index) {
+  sStreamState->check_indexed(GX_VA_TEX0, GX_INDEX8);
+  sStreamState->append(static_cast<u16>(index));
 }
@@ -282,27 +439,55 @@ void GXEnd() {
   }
   const auto vertRange = aurora::gfx::push_verts(sStreamState->vertexBuffer.data(), sStreamState->vertexBuffer.size());
   const auto indexRange = aurora::gfx::push_indices(aurora::ArrayRef{sStreamState->indices});
-  if (g_gxState.stateDirty) {
-    aurora::gfx::stream::PipelineConfig config{};
-    populate_pipeline_config(config, GX_TRIANGLES);
-    const auto info = build_shader_info(config.shaderConfig);
-    const auto pipeline = aurora::gfx::pipeline_ref(config);
-    aurora::gfx::push_draw_command(aurora::gfx::stream::DrawData{
-        .pipeline = pipeline,
-        .vertRange = vertRange,
-        .uniformRange = build_uniform(info),
-        .indexRange = indexRange,
-        .indexCount = static_cast<uint32_t>(sStreamState->indices.size()),
-        .bindGroups = build_bind_groups(info, config.shaderConfig, {}),
-        .dstAlpha = g_gxState.dstAlpha,
-    });
-  } else {
-    aurora::gfx::merge_draw_command(aurora::gfx::stream::DrawData{
-        .vertRange = vertRange,
-        .indexRange = indexRange,
-        .indexCount = static_cast<uint32_t>(sStreamState->indices.size()),
-    });
-  }
+
+  aurora::gfx::gx::BindGroupRanges ranges{};
+  for (int i = 0; i < GX_VA_MAX_ATTR; ++i) {
+    if (g_gxState.vtxDesc[i] != GX_INDEX8 && g_gxState.vtxDesc[i] != GX_INDEX16) {
+      continue;
+    }
+    auto& array = g_gxState.arrays[i];
+    if (array.cachedRange.size > 0) {
+      // Use the currently cached range
+      ranges.vaRanges[i] = array.cachedRange;
+    } else {
+      // Push array data to storage and cache range
+      const auto range = aurora::gfx::push_storage(static_cast<const uint8_t*>(array.data), array.size);
+      ranges.vaRanges[i] = range;
+      array.cachedRange = range;
+    }
+  }
+
+  // if (g_gxState.stateDirty) {
+  aurora::gfx::model::PipelineConfig config{};
+  GXPrimitive primitive = GX_TRIANGLES;
+  switch (sStreamState->primitive) {
+  case GX_TRIANGLESTRIP:
+    primitive = GX_TRIANGLESTRIP;
+    break;
+  default:
+    break;
+  }
+  populate_pipeline_config(config, primitive, sStreamState->vtxFmt);
+  const auto info = build_shader_info(config.shaderConfig);
+  const auto bindGroups = aurora::gfx::gx::build_bind_groups(info, config.shaderConfig, ranges);
+  const auto pipeline = aurora::gfx::pipeline_ref(config);
+  aurora::gfx::push_draw_command(aurora::gfx::model::DrawData{
+      .pipeline = pipeline,
+      .vertRange = vertRange,
+      .idxRange = indexRange,
+      .dataRanges = ranges,
+      .uniformRange = build_uniform(info),
+      .indexCount = static_cast<uint32_t>(sStreamState->indices.size()),
+      .bindGroups = bindGroups,
+      .dstAlpha = g_gxState.dstAlpha,
+  });
+  // } else {
+  //   aurora::gfx::merge_draw_command(aurora::gfx::model::DrawData{
+  //       .vertRange = vertRange,
+  //       .idxRange = indexRange,
+  //       .indexCount = static_cast<uint32_t>(sStreamState->indices.size()),
+  //   });
+  // }
   lastVertexStart = sStreamState->vertexStart + sStreamState->vertexCount;
   sStreamState.reset();
 }


@@ -3,7 +3,6 @@
 #include "../internal.hpp"
 #include "../webgpu/gpu.hpp"
 #include "model/shader.hpp"
-#include "stream/shader.hpp"
 #include "texture.hpp"
 
 #include <condition_variable>
@@ -11,7 +10,6 @@
 #include <fstream>
 #include <mutex>
 #include <thread>
-#include <variant>
 
 #include <absl/container/flat_hash_map.h>
 #include <magic_enum.hpp>
@@ -37,13 +35,11 @@ constexpr uint64_t StagingBufferSize =
     UniformBufferSize + VertexBufferSize + IndexBufferSize + StorageBufferSize + TextureUploadSize;
 
 struct ShaderState {
-  stream::State stream;
   model::State model;
 };
 
 struct ShaderDrawCommand {
   ShaderType type;
   union {
-    stream::DrawData stream;
     model::DrawData model;
   };
 };
@@ -168,10 +164,9 @@ static u32 g_serializedPipelineCount = 0;
 template <typename PipelineConfig>
 static void serialize_pipeline_config(ShaderType type, const PipelineConfig& config) {
   static_assert(std::has_unique_object_representations_v<PipelineConfig>);
-  g_serializedPipelines.append(&type, sizeof(type));
-  const u32 configSize = sizeof(config);
-  g_serializedPipelines.append(&configSize, sizeof(configSize));
-  g_serializedPipelines.append(&config, configSize);
+  g_serializedPipelines.append(type);
+  g_serializedPipelines.append<u32>(sizeof(config));
+  g_serializedPipelines.append(config);
   ++g_serializedPipelineCount;
 }
@@ -278,33 +273,19 @@ void resolve_pass(TextureHandle texture, ClipRect rect, bool clear, Vec4<float>
   ++g_currentRenderPass;
 }
 
-template <>
-const stream::State& get_state() {
-  return g_state.stream;
-}
-
-template <>
-void push_draw_command(stream::DrawData data) {
-  push_draw_command(ShaderDrawCommand{.type = ShaderType::Stream, .stream = data});
-}
-
-template <>
-void merge_draw_command(stream::DrawData data) {
-  auto& last = get_last_draw_command(ShaderType::Stream).data.draw.stream;
-  CHECK(last.vertRange.offset + last.vertRange.size == data.vertRange.offset, "Invalid vertex merge range: {} -> {}",
-        last.vertRange.offset + last.vertRange.size, data.vertRange.offset);
-  CHECK(last.indexRange.offset + last.indexRange.size == data.indexRange.offset, "Invalid index merge range: {} -> {}",
-        last.indexRange.offset + last.indexRange.size, data.indexRange.offset);
-  last.vertRange.size += data.vertRange.size;
-  last.indexRange.size += data.indexRange.size;
-  last.indexCount += data.indexCount;
-  ++g_mergedDrawCallCount;
-}
-
-template <>
-PipelineRef pipeline_ref(stream::PipelineConfig config) {
-  return find_pipeline(ShaderType::Stream, config, [=]() { return create_pipeline(g_state.stream, config); });
-}
+// template <>
+// void merge_draw_command(stream::DrawData data) {
+//   auto& last = get_last_draw_command(ShaderType::Stream).data.draw.stream;
+//   CHECK(last.vertRange.offset + last.vertRange.size == data.vertRange.offset, "Invalid vertex merge range: {} -> {}",
+//         last.vertRange.offset + last.vertRange.size, data.vertRange.offset);
+//   CHECK(last.indexRange.offset + last.indexRange.size == data.indexRange.offset, "Invalid index merge range: {} -> {}",
+//         last.indexRange.offset + last.indexRange.size, data.indexRange.offset);
+//   last.vertRange.size += data.vertRange.size;
+//   last.indexRange.size += data.indexRange.size;
+//   last.indexCount += data.indexCount;
+//   ++g_mergedDrawCallCount;
+// }
 
 template <>
 void push_draw_command(model::DrawData data) {
@@ -378,16 +359,6 @@ void load_pipeline_cache() {
     u32 size = *reinterpret_cast<const u32*>(pipelineCache.data() + offset);
     offset += sizeof(u32);
     switch (type) {
-    case ShaderType::Stream: {
-      if (size != sizeof(stream::PipelineConfig)) {
-        break;
-      }
-      const auto config = *reinterpret_cast<const stream::PipelineConfig*>(pipelineCache.data() + offset);
-      if (config.version != gx::GXPipelineConfigVersion) {
-        break;
-      }
-      find_pipeline(type, config, [=]() { return stream::create_pipeline(g_state.stream, config); }, true);
-    } break;
     case ShaderType::Model: {
       if (size != sizeof(model::PipelineConfig)) {
         break;
@@ -397,9 +368,10 @@ void load_pipeline_cache() {
         break;
       }
       find_pipeline(type, config, [=]() { return model::create_pipeline(g_state.model, config); }, true);
-    } break;
+      break;
+    }
     default:
-      Log.warn("Unknown pipeline type {}", static_cast<int>(type));
+      Log.warn("Unknown pipeline type {}", underlying(type));
       break;
     }
     offset += size;
@@ -459,7 +431,6 @@ void initialize() {
   }
   map_staging_buffer();
 
-  g_state.stream = stream::construct_state();
   g_state.model = model::construct_state();
 
   load_pipeline_cache();
@@ -581,6 +552,9 @@ void end_frame(const wgpu::CommandEncoder& cmd) {
   currentStagingBuffer = (currentStagingBuffer + 1) % g_stagingBuffers.size();
   map_staging_buffer();
   g_currentRenderPass = UINT32_MAX;
+  for (auto& array : gx::g_gxState.arrays) {
+    array.cachedRange = {};
+  }
 
   if (!g_hasPipelineThread) {
     pipeline_worker();
@@ -612,7 +586,7 @@ void render(wgpu::CommandEncoder& cmd) {
         .view = webgpu::g_depthBuffer.view,
         .depthLoadOp = passInfo.clear ? wgpu::LoadOp::Clear : wgpu::LoadOp::Load,
         .depthStoreOp = wgpu::StoreOp::Store,
-        .depthClearValue = 1.f,
+        .depthClearValue = gx::UseReversedZ ? 0.f : 1.f,
     };
     const auto label = fmt::format("Render pass {}", i);
     const wgpu::RenderPassDescriptor renderPassDescriptor{
@@ -680,7 +654,9 @@ void render_pass(const wgpu::RenderPassEncoder& pass, u32 idx) {
     switch (cmd.type) {
     case CommandType::SetViewport: {
      const auto& vp = cmd.data.setViewport;
-      pass.SetViewport(vp.left, vp.top, vp.width, vp.height, vp.znear, vp.zfar);
+      const float minDepth = gx::UseReversedZ ? 1.f - vp.zfar : vp.znear;
+      const float maxDepth = gx::UseReversedZ ? 1.f - vp.znear : vp.zfar;
+      pass.SetViewport(vp.left, vp.top, vp.width, vp.height, minDepth, maxDepth);
     } break;
     case CommandType::SetScissor: {
       const auto& sc = cmd.data.setScissor;
@@ -694,9 +670,6 @@ void render_pass(const wgpu::RenderPassEncoder& pass, u32 idx) {
     case CommandType::Draw: {
       const auto& draw = cmd.data.draw;
       switch (draw.type) {
-      case ShaderType::Stream:
-        stream::render(g_state.stream, draw.stream, pass);
-        break;
      case ShaderType::Model:
        model::render(g_state.model, draw.model, pass);
        break;


@@ -56,8 +56,7 @@ public:
   ByteBuffer() noexcept = default;
   explicit ByteBuffer(size_t size) noexcept
   : m_data(static_cast<uint8_t*>(calloc(1, size))), m_length(size), m_capacity(size) {}
-  explicit ByteBuffer(uint8_t* data, size_t size) noexcept
-  : m_data(data), m_capacity(size), m_owned(false) {}
+  explicit ByteBuffer(uint8_t* data, size_t size) noexcept : m_data(data), m_capacity(size), m_owned(false) {}
   ~ByteBuffer() noexcept {
     if (m_data != nullptr && m_owned) {
       free(m_data);
@@ -98,6 +97,11 @@ public:
     m_length += size;
   }
 
+  template <typename T>
+  void append(const T& obj) {
+    append(&obj, sizeof(T));
+  }
+
   void append_zeroes(size_t size) {
     resize(m_length + size, true);
     m_length += size;
@@ -179,8 +183,7 @@ struct TextureRef;
 using TextureHandle = std::shared_ptr<TextureRef>;
 
 enum class ShaderType : uint8_t {
-  Stream,
-  Model,
+  Model = 1,
 };
 
 void initialize();


@ -7,7 +7,6 @@
#include <absl/container/flat_hash_map.h> #include <absl/container/flat_hash_map.h>
#include <cfloat> #include <cfloat>
#include <cmath>
using aurora::gfx::gx::g_gxState; using aurora::gfx::gx::g_gxState;
static aurora::Module Log("aurora::gx"); static aurora::Module Log("aurora::gx");
@ -25,7 +24,7 @@ const TextureBind& get_texture(GXTexMapID id) noexcept { return g_gxState.textur
static inline wgpu::BlendFactor to_blend_factor(GXBlendFactor fac, bool isDst) { static inline wgpu::BlendFactor to_blend_factor(GXBlendFactor fac, bool isDst) {
switch (fac) { switch (fac) {
DEFAULT_FATAL("invalid blend factor {}", static_cast<int>(fac)); DEFAULT_FATAL("invalid blend factor {}", underlying(fac));
case GX_BL_ZERO: case GX_BL_ZERO:
return wgpu::BlendFactor::Zero; return wgpu::BlendFactor::Zero;
case GX_BL_ONE: case GX_BL_ONE:
@ -55,21 +54,21 @@ static inline wgpu::BlendFactor to_blend_factor(GXBlendFactor fac, bool isDst) {
static inline wgpu::CompareFunction to_compare_function(GXCompare func) { static inline wgpu::CompareFunction to_compare_function(GXCompare func) {
switch (func) { switch (func) {
DEFAULT_FATAL("invalid depth fn {}", static_cast<int>(func)); DEFAULT_FATAL("invalid depth fn {}", underlying(func));
case GX_NEVER: case GX_NEVER:
return wgpu::CompareFunction::Never; return wgpu::CompareFunction::Never;
case GX_LESS: case GX_LESS:
return wgpu::CompareFunction::Less; return UseReversedZ ? wgpu::CompareFunction::Greater : wgpu::CompareFunction::Less;
case GX_EQUAL: case GX_EQUAL:
return wgpu::CompareFunction::Equal; return wgpu::CompareFunction::Equal;
case GX_LEQUAL: case GX_LEQUAL:
return wgpu::CompareFunction::LessEqual; return UseReversedZ ? wgpu::CompareFunction::GreaterEqual : wgpu::CompareFunction::LessEqual;
case GX_GREATER: case GX_GREATER:
return wgpu::CompareFunction::Greater; return UseReversedZ ? wgpu::CompareFunction::Less : wgpu::CompareFunction::Greater;
case GX_NEQUAL: case GX_NEQUAL:
return wgpu::CompareFunction::NotEqual; return wgpu::CompareFunction::NotEqual;
case GX_GEQUAL: case GX_GEQUAL:
return wgpu::CompareFunction::GreaterEqual; return UseReversedZ ? wgpu::CompareFunction::LessEqual : wgpu::CompareFunction::GreaterEqual;
case GX_ALWAYS: case GX_ALWAYS:
return wgpu::CompareFunction::Always; return wgpu::CompareFunction::Always;
} }
@ -79,7 +78,7 @@ static inline wgpu::BlendState to_blend_state(GXBlendMode mode, GXBlendFactor sr
GXLogicOp op, u32 dstAlpha) { GXLogicOp op, u32 dstAlpha) {
wgpu::BlendComponent colorBlendComponent; wgpu::BlendComponent colorBlendComponent;
switch (mode) { switch (mode) {
DEFAULT_FATAL("unsupported blend mode {}", static_cast<int>(mode)); DEFAULT_FATAL("unsupported blend mode {}", underlying(mode));
case GX_BM_NONE: case GX_BM_NONE:
colorBlendComponent = { colorBlendComponent = {
.operation = wgpu::BlendOperation::Add, .operation = wgpu::BlendOperation::Add,
@ -103,7 +102,7 @@ static inline wgpu::BlendState to_blend_state(GXBlendMode mode, GXBlendFactor sr
break; break;
case GX_BM_LOGIC: case GX_BM_LOGIC:
switch (op) { switch (op) {
DEFAULT_FATAL("unsupported logic op {}", static_cast<int>(op)); DEFAULT_FATAL("unsupported logic op {}", underlying(op));
case GX_LO_CLEAR: case GX_LO_CLEAR:
colorBlendComponent = { colorBlendComponent = {
.operation = wgpu::BlendOperation::Add, .operation = wgpu::BlendOperation::Add,
@ -160,7 +159,7 @@ static inline wgpu::ColorWriteMask to_write_mask(bool colorUpdate, bool alphaUpd
static inline wgpu::PrimitiveState to_primitive_state(GXPrimitive gx_prim, GXCullMode gx_cullMode) { static inline wgpu::PrimitiveState to_primitive_state(GXPrimitive gx_prim, GXCullMode gx_cullMode) {
wgpu::PrimitiveTopology primitive = wgpu::PrimitiveTopology::TriangleList; wgpu::PrimitiveTopology primitive = wgpu::PrimitiveTopology::TriangleList;
switch (gx_prim) { switch (gx_prim) {
DEFAULT_FATAL("unsupported primitive type {}", static_cast<int>(gx_prim)); DEFAULT_FATAL("unsupported primitive type {}", underlying(gx_prim));
case GX_TRIANGLES: case GX_TRIANGLES:
break; break;
case GX_TRIANGLESTRIP: case GX_TRIANGLESTRIP:
@ -169,7 +168,7 @@ static inline wgpu::PrimitiveState to_primitive_state(GXPrimitive gx_prim, GXCul
} }
wgpu::CullMode cullMode = wgpu::CullMode::None; wgpu::CullMode cullMode = wgpu::CullMode::None;
switch (gx_cullMode) { switch (gx_cullMode) {
DEFAULT_FATAL("unsupported cull mode {}", static_cast<int>(gx_cullMode)); DEFAULT_FATAL("unsupported cull mode {}", underlying(gx_cullMode));
case GX_CULL_FRONT: case GX_CULL_FRONT:
cullMode = wgpu::CullMode::Front; cullMode = wgpu::CullMode::Front;
break; break;
@ -193,14 +192,6 @@ wgpu::RenderPipeline build_pipeline(const PipelineConfig& config, const ShaderIn
.format = g_graphicsConfig.depthFormat, .format = g_graphicsConfig.depthFormat,
.depthWriteEnabled = config.depthUpdate, .depthWriteEnabled = config.depthUpdate,
.depthCompare = to_compare_function(config.depthFunc), .depthCompare = to_compare_function(config.depthFunc),
.stencilFront =
wgpu::StencilFaceState{
.compare = wgpu::CompareFunction::Always,
},
.stencilBack =
wgpu::StencilFaceState{
.compare = wgpu::CompareFunction::Always,
},
}; };
const auto blendState = const auto blendState =
to_blend_state(config.blendMode, config.blendFacSrc, config.blendFacDst, config.blendOp, config.dstAlpha); to_blend_state(config.blendMode, config.blendFacSrc, config.blendFacDst, config.blendOp, config.dstAlpha);
@ -249,25 +240,23 @@ wgpu::RenderPipeline build_pipeline(const PipelineConfig& config, const ShaderIn
return g_device.CreateRenderPipeline(&descriptor); return g_device.CreateRenderPipeline(&descriptor);
} }
void populate_pipeline_config(PipelineConfig& config, GXPrimitive primitive) noexcept { void populate_pipeline_config(PipelineConfig& config, GXPrimitive primitive, GXVtxFmt fmt) noexcept {
const auto& vtxFmt = g_gxState.vtxFmts[fmt];
config.shaderConfig.fogType = g_gxState.fog.type; config.shaderConfig.fogType = g_gxState.fog.type;
config.shaderConfig.vtxAttrs = g_gxState.vtxDesc; config.shaderConfig.vtxAttrs = g_gxState.vtxDesc;
int lastIndexedAttr = -1;
for (int i = 0; i < GX_VA_MAX_ATTR; ++i) { for (int i = 0; i < GX_VA_MAX_ATTR; ++i) {
const auto type = g_gxState.vtxDesc[i]; const auto type = g_gxState.vtxDesc[i];
if (type != GX_INDEX8 && type != GX_INDEX16) { if (type != GX_INDEX8 && type != GX_INDEX16) {
config.shaderConfig.attrMapping[i] = GX_VA_NULL; config.shaderConfig.attrMapping[i] = {};
continue; continue;
} }
const auto& array = g_gxState.arrays[i]; // Map attribute to its own storage
if (lastIndexedAttr >= 0 && array == g_gxState.arrays[lastIndexedAttr]) { config.shaderConfig.attrMapping[i] = StorageConfig {
// Map attribute to previous attribute .attr = static_cast<GXAttr>(i),
config.shaderConfig.attrMapping[i] = config.shaderConfig.attrMapping[lastIndexedAttr]; .cnt = vtxFmt.attrs[i].cnt,
} else { .compType = vtxFmt.attrs[i].type,
// Map attribute to its own storage .frac = vtxFmt.attrs[i].frac,
config.shaderConfig.attrMapping[i] = static_cast<GXAttr>(i); };
}
lastIndexedAttr = i;
} }
config.shaderConfig.tevSwapTable = g_gxState.tevSwapTable; config.shaderConfig.tevSwapTable = g_gxState.tevSwapTable;
for (u8 i = 0; i < g_gxState.numTevStages; ++i) { for (u8 i = 0; i < g_gxState.numTevStages; ++i) {
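
Each indexed attribute now records its full vertex format (component count, component type, and fixed-point scale) in the shader config instead of aliasing a previously seen attribute's storage buffer. With that information baked in, the generated shader can presumably fetch raw array elements and normalize them itself, matching the commit's "fetched based on the vertex format" goal. A sketch of the fixed-point decode the frac field implies (it mirrors the GX_S16 path in prepare_vtx_buffer further down):

    // Hypothetical helper, for illustration only.
    inline f32 decode_s16(s16 raw, u8 frac) {
      return static_cast<f32>(raw) / static_cast<f32>(1 << frac);
    }
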
@ -328,14 +317,14 @@ void populate_pipeline_config(PipelineConfig& config, GXPrimitive primitive) noe
Range build_uniform(const ShaderInfo& info) noexcept { Range build_uniform(const ShaderInfo& info) noexcept {
auto [buf, range] = map_uniform(info.uniformSize); auto [buf, range] = map_uniform(info.uniformSize);
{ {
buf.append(&g_gxState.pnMtx[g_gxState.currentPnMtx], 128); buf.append(g_gxState.pnMtx[g_gxState.currentPnMtx]);
buf.append(&g_gxState.proj, 64); buf.append(g_gxState.proj);
} }
for (int i = 0; i < info.loadsTevReg.size(); ++i) { for (int i = 0; i < info.loadsTevReg.size(); ++i) {
if (!info.loadsTevReg.test(i)) { if (!info.loadsTevReg.test(i)) {
continue; continue;
} }
buf.append(&g_gxState.colorRegs[i], 16); buf.append(g_gxState.colorRegs[i]);
} }
bool lightingEnabled = false; bool lightingEnabled = false;
for (int i = 0; i < info.sampledColorChannels.size(); ++i) { for (int i = 0; i < info.sampledColorChannels.size(); ++i) {
@ -352,11 +341,10 @@ Range build_uniform(const ShaderInfo& info) noexcept {
if (lightingEnabled) { if (lightingEnabled) {
// Lights // Lights
static_assert(sizeof(g_gxState.lights) == 80 * GX::MaxLights); static_assert(sizeof(g_gxState.lights) == 80 * GX::MaxLights);
buf.append(&g_gxState.lights, 80 * GX::MaxLights); buf.append(g_gxState.lights);
// Light state for all channels // Light state for all channels
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
u32 lightState = g_gxState.colorChannelState[i].lightMask.to_ulong(); buf.append<u32>(g_gxState.colorChannelState[i].lightMask.to_ulong());
buf.append(&lightState, 4);
} }
} }
for (int i = 0; i < info.sampledColorChannels.size(); ++i) { for (int i = 0; i < info.sampledColorChannels.size(); ++i) {
@ -366,25 +354,25 @@ Range build_uniform(const ShaderInfo& info) noexcept {
const auto& ccc = g_gxState.colorChannelConfig[i * 2]; const auto& ccc = g_gxState.colorChannelConfig[i * 2];
const auto& ccs = g_gxState.colorChannelState[i * 2]; const auto& ccs = g_gxState.colorChannelState[i * 2];
if (ccc.lightingEnabled && ccc.ambSrc == GX_SRC_REG) { if (ccc.lightingEnabled && ccc.ambSrc == GX_SRC_REG) {
buf.append(&ccs.ambColor, 16); buf.append(ccs.ambColor);
} }
if (ccc.matSrc == GX_SRC_REG) { if (ccc.matSrc == GX_SRC_REG) {
buf.append(&ccs.matColor, 16); buf.append(ccs.matColor);
} }
const auto& ccca = g_gxState.colorChannelConfig[i * 2 + 1]; const auto& ccca = g_gxState.colorChannelConfig[i * 2 + 1];
const auto& ccsa = g_gxState.colorChannelState[i * 2 + 1]; const auto& ccsa = g_gxState.colorChannelState[i * 2 + 1];
if (ccca.lightingEnabled && ccca.ambSrc == GX_SRC_REG) { if (ccca.lightingEnabled && ccca.ambSrc == GX_SRC_REG) {
buf.append(&ccsa.ambColor, 16); buf.append(ccsa.ambColor);
} }
if (ccca.matSrc == GX_SRC_REG) { if (ccca.matSrc == GX_SRC_REG) {
buf.append(&ccsa.matColor, 16); buf.append(ccsa.matColor);
} }
} }
for (int i = 0; i < info.sampledKColors.size(); ++i) { for (int i = 0; i < info.sampledKColors.size(); ++i) {
if (!info.sampledKColors.test(i)) { if (!info.sampledKColors.test(i)) {
continue; continue;
} }
buf.append(&g_gxState.kcolors[i], 16); buf.append(g_gxState.kcolors[i]);
} }
for (int i = 0; i < info.usesTexMtx.size(); ++i) { for (int i = 0; i < info.usesTexMtx.size(); ++i) {
if (!info.usesTexMtx.test(i)) { if (!info.usesTexMtx.test(i)) {
@ -392,26 +380,16 @@ Range build_uniform(const ShaderInfo& info) noexcept {
} }
const auto& state = g_gxState; const auto& state = g_gxState;
switch (info.texMtxTypes[i]) { switch (info.texMtxTypes[i]) {
DEFAULT_FATAL("unhandled tex mtx type {}", static_cast<int>(info.texMtxTypes[i])); DEFAULT_FATAL("unhandled tex mtx type {}", underlying(info.texMtxTypes[i]));
case GX_TG_MTX2x4: case GX_TG_MTX2x4:
if (std::holds_alternative<Mat4x2<float>>(state.texMtxs[i])) { if (std::holds_alternative<Mat2x4<float>>(state.texMtxs[i])) {
buf.append(&std::get<Mat4x2<float>>(state.texMtxs[i]), 32); buf.append(std::get<Mat2x4<float>>(state.texMtxs[i]));
} else if (std::holds_alternative<Mat4x4<float>>(g_gxState.texMtxs[i])) {
// TODO: SMB hits this?
Mat4x2<float> mtx{
{1.f, 0.f},
{0.f, 1.f},
{0.f, 0.f},
{0.f, 0.f},
};
buf.append(&mtx, 32);
} else } else
UNLIKELY FATAL("expected 2x4 mtx in idx {}", i); UNLIKELY FATAL("expected 2x4 mtx in idx {}", i);
break; break;
case GX_TG_MTX3x4: case GX_TG_MTX3x4:
if (std::holds_alternative<Mat4x4<float>>(g_gxState.texMtxs[i])) { if (std::holds_alternative<Mat3x4<float>>(g_gxState.texMtxs[i])) {
const auto& mat = std::get<Mat4x4<float>>(g_gxState.texMtxs[i]); buf.append(std::get<Mat3x4<float>>(g_gxState.texMtxs[i]));
buf.append(&mat, 64);
} else } else
UNLIKELY FATAL("expected 3x4 mtx in idx {}", i); UNLIKELY FATAL("expected 3x4 mtx in idx {}", i);
break; break;
@ -421,18 +399,11 @@ Range build_uniform(const ShaderInfo& info) noexcept {
if (!info.usesPTTexMtx.test(i)) { if (!info.usesPTTexMtx.test(i)) {
continue; continue;
} }
buf.append(&g_gxState.ptTexMtxs[i], 64); buf.append(g_gxState.ptTexMtxs[i]);
} }
if (info.usesFog) { if (info.usesFog) {
const auto& state = g_gxState.fog; const auto& state = g_gxState.fog;
struct Fog { Fog fog{.color = state.color};
Vec4<float> color = state.color;
float a = 0.f;
float b = 0.5f;
float c = 0.f;
float pad = FLT_MAX;
} fog{};
static_assert(sizeof(Fog) == 32);
if (state.nearZ != state.farZ && state.startZ != state.endZ) { if (state.nearZ != state.farZ && state.startZ != state.endZ) {
const float depthRange = state.farZ - state.nearZ; const float depthRange = state.farZ - state.nearZ;
const float fogRange = state.endZ - state.startZ; const float fogRange = state.endZ - state.startZ;
@ -440,7 +411,7 @@ Range build_uniform(const ShaderInfo& info) noexcept {
fog.b = state.farZ / depthRange; fog.b = state.farZ / depthRange;
fog.c = state.startZ / fogRange; fog.c = state.startZ / fogRange;
} }
buf.append(&fog, 32); buf.append(fog);
} }
for (int i = 0; i < info.sampledTextures.size(); ++i) { for (int i = 0; i < info.sampledTextures.size(); ++i) {
if (!info.sampledTextures.test(i)) { if (!info.sampledTextures.test(i)) {
@ -448,7 +419,7 @@ Range build_uniform(const ShaderInfo& info) noexcept {
} }
const auto& tex = get_texture(static_cast<GXTexMapID>(i)); const auto& tex = get_texture(static_cast<GXTexMapID>(i));
CHECK(tex, "unbound texture {}", i); CHECK(tex, "unbound texture {}", i);
buf.append(&tex.texObj.lodBias, 4); buf.append(tex.texObj.lodBias);
} }
g_gxState.stateDirty = false; g_gxState.stateDirty = false;
return range; return range;
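
All of the pointer-plus-size calls in build_uniform are replaced by a typed ByteBuffer::append, eliminating the hand-maintained byte counts (128, 64, 16, ...) that had to be kept in sync with sizeof. The new overload presumably reduces to something like:

    // Sketch; the actual ByteBuffer definition is elsewhere in this commit.
    template <typename T>
    void append(const T& obj) {
      append(&obj, sizeof(T));  // size derived from the type, not hardcoded
    }
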
@ -564,7 +535,7 @@ GXBindGroupLayouts build_bind_group_layouts(const ShaderInfo& info, const Shader
}; };
u32 bindIdx = 1; u32 bindIdx = 1;
for (int i = 0; i < GX_VA_MAX_ATTR; ++i) { for (int i = 0; i < GX_VA_MAX_ATTR; ++i) {
if (config.attrMapping[i] == static_cast<GXAttr>(i)) { if (config.attrMapping[i].attr == static_cast<GXAttr>(i)) {
uniformLayoutEntries[bindIdx] = wgpu::BindGroupLayoutEntry{ uniformLayoutEntries[bindIdx] = wgpu::BindGroupLayoutEntry{
.binding = bindIdx, .binding = bindIdx,
.visibility = wgpu::ShaderStage::Vertex, .visibility = wgpu::ShaderStage::Vertex,
@ -688,7 +659,7 @@ void shutdown() noexcept {
static wgpu::AddressMode wgpu_address_mode(GXTexWrapMode mode) { static wgpu::AddressMode wgpu_address_mode(GXTexWrapMode mode) {
switch (mode) { switch (mode) {
DEFAULT_FATAL("invalid wrap mode {}", static_cast<int>(mode)); DEFAULT_FATAL("invalid wrap mode {}", underlying(mode));
case GX_CLAMP: case GX_CLAMP:
return wgpu::AddressMode::ClampToEdge; return wgpu::AddressMode::ClampToEdge;
case GX_REPEAT: case GX_REPEAT:
@ -735,8 +706,6 @@ wgpu::SamplerDescriptor TextureBind::get_descriptor() const noexcept {
.magFilter = wgpu::FilterMode::Nearest, .magFilter = wgpu::FilterMode::Nearest,
.minFilter = wgpu::FilterMode::Nearest, .minFilter = wgpu::FilterMode::Nearest,
.mipmapFilter = wgpu::MipmapFilterMode::Nearest, .mipmapFilter = wgpu::MipmapFilterMode::Nearest,
.lodMinClamp = 0.f,
.lodMaxClamp = 1000.f,
.maxAnisotropy = 1, .maxAnisotropy = 1,
}; };
} }
@ -750,8 +719,6 @@ wgpu::SamplerDescriptor TextureBind::get_descriptor() const noexcept {
.magFilter = magFilter, .magFilter = magFilter,
.minFilter = minFilter, .minFilter = minFilter,
.mipmapFilter = mipFilter, .mipmapFilter = mipFilter,
.lodMinClamp = 0.f,
.lodMaxClamp = 1000.f,
.maxAnisotropy = wgpu_aniso(texObj.maxAniso), .maxAnisotropy = wgpu_aniso(texObj.maxAniso),
}; };
} }

View File

@ -46,6 +46,11 @@ constexpr float GX_LARGE_NUMBER = -1048576.0f;
#endif #endif
namespace aurora::gfx::gx { namespace aurora::gfx::gx {
constexpr bool EnableNormalVisualization = false;
constexpr bool EnableDebugPrints = false;
constexpr bool UsePerPixelLighting = true;
constexpr bool UseReversedZ = true;
constexpr u32 MaxTextures = GX_MAX_TEXMAP; constexpr u32 MaxTextures = GX_MAX_TEXMAP;
constexpr u32 MaxTluts = 20; constexpr u32 MaxTluts = 20;
constexpr u32 MaxTevStages = GX_MAX_TEVSTAGE; constexpr u32 MaxTevStages = GX_MAX_TEVSTAGE;
@ -144,8 +149,7 @@ struct ColorChannelState {
Vec4<float> ambColor; Vec4<float> ambColor;
GX::LightMask lightMask; GX::LightMask lightMask;
}; };
// Mat4x4 used instead of Mat4x3 for padding purposes using TexMtxVariant = std::variant<std::monostate, Mat2x4<float>, Mat3x4<float>>;
using TexMtxVariant = std::variant<std::monostate, Mat4x2<float>, Mat4x4<float>>;
struct TcgConfig { struct TcgConfig {
GXTexGenType type = GX_TG_MTX2x4; GXTexGenType type = GX_TG_MTX2x4;
GXTexGenSrc src = GX_MAX_TEXGENSRC; GXTexGenSrc src = GX_MAX_TEXGENSRC;
@ -213,10 +217,10 @@ struct VtxFmt {
std::array<VtxAttrFmt, MaxVtxAttr> attrs; std::array<VtxAttrFmt, MaxVtxAttr> attrs;
}; };
struct PnMtx { struct PnMtx {
Mat4x4<float> pos; Mat3x4<float> pos;
Mat4x4<float> nrm; Mat3x4<float> nrm;
}; };
static_assert(sizeof(PnMtx) == sizeof(Mat4x4<float>) * 2); static_assert(sizeof(PnMtx) == sizeof(Mat3x4<float>) * 2);
struct Light { struct Light {
Vec4<float> pos{0.f, 0.f, 0.f}; Vec4<float> pos{0.f, 0.f, 0.f};
Vec4<float> dir{0.f, 0.f, 0.f}; Vec4<float> dir{0.f, 0.f, 0.f};
@ -230,6 +234,14 @@ struct Light {
bool operator!=(const Light& rhs) const { return !(*this == rhs); } bool operator!=(const Light& rhs) const { return !(*this == rhs); }
}; };
static_assert(sizeof(Light) == 80); static_assert(sizeof(Light) == 80);
struct Fog {
Vec4<float> color;
float a = 0.f;
float b = 0.5f;
float c = 0.f;
float pad = FLT_MAX;
};
static_assert(sizeof(Fog) == 32);
struct AttrArray { struct AttrArray {
const void* data; const void* data;
u32 size; u32 size;
@ -245,7 +257,6 @@ struct GXState {
std::array<PnMtx, MaxPnMtx> pnMtx; std::array<PnMtx, MaxPnMtx> pnMtx;
u32 currentPnMtx; u32 currentPnMtx;
Mat4x4<float> proj; Mat4x4<float> proj;
Mat4x4<float> origProj; // for GXGetProjectionv
GXProjectionType projType; // for GXGetProjectionv GXProjectionType projType; // for GXGetProjectionv
FogState fog; FogState fog;
GXCullMode cullMode = GX_CULL_BACK; GXCullMode cullMode = GX_CULL_BACK;
@ -266,7 +277,7 @@ struct GXState {
std::array<TextureBind, MaxTextures> textures; std::array<TextureBind, MaxTextures> textures;
std::array<GXTlutObj_, MaxTluts> tluts; std::array<GXTlutObj_, MaxTluts> tluts;
std::array<TexMtxVariant, MaxTexMtx> texMtxs; std::array<TexMtxVariant, MaxTexMtx> texMtxs;
std::array<Mat4x4<float>, MaxPTTexMtx> ptTexMtxs; std::array<Mat3x4<float>, MaxPTTexMtx> ptTexMtxs;
std::array<TcgConfig, MaxTexCoord> tcgs; std::array<TcgConfig, MaxTexCoord> tcgs;
std::array<GXAttrType, MaxVtxAttr> vtxDesc; std::array<GXAttrType, MaxVtxAttr> vtxDesc;
std::array<VtxFmt, MaxVtxFmt> vtxFmts; std::array<VtxFmt, MaxVtxFmt> vtxFmts;
@ -345,11 +356,18 @@ struct TextureConfig {
bool operator==(const TextureConfig& rhs) const { return memcmp(this, &rhs, sizeof(*this)) == 0; } bool operator==(const TextureConfig& rhs) const { return memcmp(this, &rhs, sizeof(*this)) == 0; }
}; };
static_assert(std::has_unique_object_representations_v<TextureConfig>); static_assert(std::has_unique_object_representations_v<TextureConfig>);
struct StorageConfig {
GXAttr attr = GX_VA_NULL;
GXCompCnt cnt = static_cast<GXCompCnt>(0xFF);
GXCompType compType = static_cast<GXCompType>(0xFF);
u8 frac = 0;
std::array<u8, 3> pad{};
};
struct ShaderConfig { struct ShaderConfig {
GXFogType fogType; GXFogType fogType;
std::array<GXAttrType, MaxVtxAttr> vtxAttrs; std::array<GXAttrType, MaxVtxAttr> vtxAttrs;
// Mapping for indexed attributes -> storage buffer // Mapping for indexed attributes -> storage buffer
std::array<GXAttr, MaxVtxAttr> attrMapping; std::array<StorageConfig, MaxVtxAttr> attrMapping;
std::array<TevSwap, MaxTevSwap> tevSwapTable; std::array<TevSwap, MaxTevSwap> tevSwapTable;
std::array<TevStage, MaxTevStages> tevStages; std::array<TevStage, MaxTevStages> tevStages;
u32 tevStageCount = 0; u32 tevStageCount = 0;
@ -363,7 +381,7 @@ struct ShaderConfig {
}; };
static_assert(std::has_unique_object_representations_v<ShaderConfig>); static_assert(std::has_unique_object_representations_v<ShaderConfig>);
constexpr u32 GXPipelineConfigVersion = 4; constexpr u32 GXPipelineConfigVersion = 5;
struct PipelineConfig { struct PipelineConfig {
u32 version = GXPipelineConfigVersion; u32 version = GXPipelineConfigVersion;
ShaderConfig shaderConfig; ShaderConfig shaderConfig;
@ -405,7 +423,7 @@ struct ShaderInfo {
struct BindGroupRanges { struct BindGroupRanges {
std::array<Range, GX_VA_MAX_ATTR> vaRanges{}; std::array<Range, GX_VA_MAX_ATTR> vaRanges{};
}; };
void populate_pipeline_config(PipelineConfig& config, GXPrimitive primitive) noexcept; void populate_pipeline_config(PipelineConfig& config, GXPrimitive primitive, GXVtxFmt fmt) noexcept;
wgpu::RenderPipeline build_pipeline(const PipelineConfig& config, const ShaderInfo& info, wgpu::RenderPipeline build_pipeline(const PipelineConfig& config, const ShaderInfo& info,
ArrayRef<wgpu::VertexBufferLayout> vtxBuffers, wgpu::ShaderModule shader, ArrayRef<wgpu::VertexBufferLayout> vtxBuffers, wgpu::ShaderModule shader,
const char* label) noexcept; const char* label) noexcept;
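
Because attrMapping's element type grew from a bare GXAttr to StorageConfig, the serialized layout of ShaderConfig changed, and GXPipelineConfigVersion is bumped to 5 so stale pipeline caches are rejected rather than misread. The explicit 3-byte pad keeps has_unique_object_representations_v satisfied; assuming the GX enums have int-sized underlying types, the layout works out to:

    static_assert(sizeof(gx::StorageConfig) == 16);  // 3 x 4-byte enum + u8 frac + 3-byte pad
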

View File

@ -1,3 +1,7 @@
#pragma once
#include "../internal.hpp"
#include <dolphin/gx/GXEnum.h> #include <dolphin/gx/GXEnum.h>
#include <fmt/format.h> #include <fmt/format.h>
#include <string> #include <string>
@ -25,7 +29,7 @@ inline std::string format_as(const GXTevOp& op) {
case GX_TEV_COMP_RGB8_EQ: case GX_TEV_COMP_RGB8_EQ:
return "GX_TEV_COMP_RGB8_EQ"; return "GX_TEV_COMP_RGB8_EQ";
default: default:
return fmt::format("GXTevOp({})", static_cast<int>(op)); return fmt::format("GXTevOp({})", underlying(op));
} }
} }
@ -64,7 +68,7 @@ inline std::string format_as(const GXTevColorArg& arg) {
case GX_CC_ZERO: case GX_CC_ZERO:
return "GX_CC_ZERO"; return "GX_CC_ZERO";
default: default:
return fmt::format("GXTevColorArg({})", static_cast<int>(arg)); return fmt::format("GXTevColorArg({})", underlying(arg));
} }
} }
@ -87,7 +91,7 @@ inline std::string format_as(const GXTevAlphaArg& arg) {
case GX_CA_ZERO: case GX_CA_ZERO:
return "GX_CA_ZERO"; return "GX_CA_ZERO";
default: default:
return fmt::format("GXTevAlphaArg({})", static_cast<int>(arg)); return fmt::format("GXTevAlphaArg({})", underlying(arg));
} }
} }
@ -118,7 +122,7 @@ inline std::string format_as(const GXTexGenSrc& src) {
case GX_TG_TEX7: case GX_TG_TEX7:
return "GX_TG_TEX7"; return "GX_TG_TEX7";
default: default:
return fmt::format("GXTexGenSrc({})", static_cast<int>(src)); return fmt::format("GXTexGenSrc({})", underlying(src));
} }
} }
@ -133,7 +137,7 @@ inline std::string format_as(const GXTexGenType& type) {
case GX_TG_BUMP1: case GX_TG_BUMP1:
return "GX_TG_BUMP1"; return "GX_TG_BUMP1";
default: default:
return fmt::format("GXTexGenType({})", static_cast<int>(type)); return fmt::format("GXTexGenType({})", underlying(type));
} }
} }
@ -146,7 +150,7 @@ inline std::string format_as(const GXTevBias& bias) {
case GX_TB_SUBHALF: case GX_TB_SUBHALF:
return "GX_TB_SUBHALF"; return "GX_TB_SUBHALF";
default: default:
return fmt::format("GXTevBias({})", static_cast<int>(bias)); return fmt::format("GXTevBias({})", underlying(bias));
} }
} }
@ -161,7 +165,7 @@ inline std::string format_as(const GXTevScale& scale) {
case GX_CS_DIVIDE_2: case GX_CS_DIVIDE_2:
return "GX_CS_DIVIDE_2"; return "GX_CS_DIVIDE_2";
default: default:
return fmt::format("GXTevScale({})", static_cast<int>(scale)); return fmt::format("GXTevScale({})", underlying(scale));
} }
} }
@ -176,7 +180,7 @@ inline std::string format_as(const GXTevRegID& reg) {
case GX_TEVREG2: case GX_TEVREG2:
return "GX_TEVREG2"; return "GX_TEVREG2";
default: default:
return fmt::format("GXTevRegID({})", static_cast<int>(reg)); return fmt::format("GXTevRegID({})", underlying(reg));
} }
} }
@ -231,7 +235,7 @@ inline std::string format_as(const GXTevKColorSel& sel) {
case GX_TEV_KCSEL_K3_A: case GX_TEV_KCSEL_K3_A:
return "GX_TEV_KCSEL_K3_A"; return "GX_TEV_KCSEL_K3_A";
default: default:
return fmt::format("GXTevKColorSel({})", static_cast<int>(sel)); return fmt::format("GXTevKColorSel({})", underlying(sel));
} }
} }
@ -286,7 +290,7 @@ inline std::string format_as(const GXTevKAlphaSel& sel) {
case GX_TEV_KASEL_K3_A: case GX_TEV_KASEL_K3_A:
return "GX_TEV_KASEL_K3_A"; return "GX_TEV_KASEL_K3_A";
default: default:
return fmt::format("GXTevKAlphaSel({})", static_cast<int>(sel)); return fmt::format("GXTevKAlphaSel({})", underlying(sel));
} }
} }
@ -313,7 +317,7 @@ inline std::string format_as(const GXTexMapID& id) {
case GX_TEX_DISABLE: case GX_TEX_DISABLE:
return "GX_TEX_DISABLE"; return "GX_TEX_DISABLE";
default: default:
return fmt::format("GXTexMapID({})", static_cast<int>(id)); return fmt::format("GXTexMapID({})", underlying(id));
} }
} }
@ -340,7 +344,7 @@ inline std::string format_as(const GXChannelID& id) {
case GX_COLOR_NULL: case GX_COLOR_NULL:
return "GX_COLOR_NULL"; return "GX_COLOR_NULL";
default: default:
return fmt::format("GXChannelID({})", static_cast<int>(id)); return fmt::format("GXChannelID({})", underlying(id));
} }
} }
@ -351,7 +355,7 @@ inline std::string format_as(const GXColorSrc& src) {
case GX_SRC_VTX: case GX_SRC_VTX:
return "GX_SRC_VTX"; return "GX_SRC_VTX";
default: default:
return fmt::format("GXColorSrc({})", static_cast<int>(src)); return fmt::format("GXColorSrc({})", underlying(src));
} }
} }
@ -380,7 +384,7 @@ inline std::string format_as(const GXTexMtx& mtx) {
case GX_IDENTITY: case GX_IDENTITY:
return "GX_IDENTITY"; return "GX_IDENTITY";
default: default:
return fmt::format("GXTexMtx({})", static_cast<int>(mtx)); return fmt::format("GXTexMtx({})", underlying(mtx));
} }
} }
@ -429,7 +433,7 @@ inline std::string format_as(const GXPTTexMtx& mtx) {
case GX_PTIDENTITY: case GX_PTIDENTITY:
return "GX_PTIDENTITY"; return "GX_PTIDENTITY";
default: default:
return fmt::format("GXPTTexMtx({})", static_cast<int>(mtx)); return fmt::format("GXPTTexMtx({})", underlying(mtx));
} }
} }
@ -452,7 +456,7 @@ inline std::string format_as(const GXCompare& comp) {
case GX_ALWAYS: case GX_ALWAYS:
return "GX_ALWAYS"; return "GX_ALWAYS";
default: default:
return fmt::format("GXCompare({})", static_cast<int>(comp)); return fmt::format("GXCompare({})", underlying(comp));
} }
} }
@ -467,7 +471,7 @@ inline std::string format_as(const GXAlphaOp& op) {
case GX_AOP_XNOR: case GX_AOP_XNOR:
return "GX_AOP_XNOR"; return "GX_AOP_XNOR";
default: default:
return fmt::format("GXAlphaOp({})", static_cast<int>(op)); return fmt::format("GXAlphaOp({})", underlying(op));
} }
} }
@ -496,7 +500,7 @@ inline std::string format_as(const GXFogType& type) {
case GX_FOG_ORTHO_REVEXP2: case GX_FOG_ORTHO_REVEXP2:
return "GX_FOG_ORTHO_REVEXP2"; return "GX_FOG_ORTHO_REVEXP2";
default: default:
return fmt::format("GXFogType({})", static_cast<int>(type)); return fmt::format("GXFogType({})", underlying(type));
} }
} }
@ -521,6 +525,158 @@ inline std::string format_as(const GXTexCoordID& id) {
case GX_TEXCOORD_NULL: case GX_TEXCOORD_NULL:
return "GX_TEXCOORD_NULL"; return "GX_TEXCOORD_NULL";
default: default:
return fmt::format("GXTexCoordID({})", static_cast<int>(id)); return fmt::format("GXTexCoordID({})", underlying(id));
}
}
inline std::string format_as(const GXPrimitive& prim) {
switch (prim) {
case GX_QUADS:
return "GX_QUADS";
case GX_TRIANGLES:
return "GX_TRIANGLES";
case GX_TRIANGLESTRIP:
return "GX_TRIANGLESTRIP";
case GX_TRIANGLEFAN:
return "GX_TRIANGLEFAN";
case GX_LINES:
return "GX_LINES";
case GX_LINESTRIP:
return "GX_LINESTRIP";
case GX_POINTS:
return "GX_POINTS";
default:
return fmt::format("GXPrimitive({})", underlying(prim));
}
}
inline std::string format_as(const GXAttr& attr) {
switch (attr) {
case GX_VA_PNMTXIDX:
return "GX_VA_PNMTXIDX";
case GX_VA_TEX0MTXIDX:
return "GX_VA_TEX0MTXIDX";
case GX_VA_TEX1MTXIDX:
return "GX_VA_TEX1MTXIDX";
case GX_VA_TEX2MTXIDX:
return "GX_VA_TEX2MTXIDX";
case GX_VA_TEX3MTXIDX:
return "GX_VA_TEX3MTXIDX";
case GX_VA_TEX4MTXIDX:
return "GX_VA_TEX4MTXIDX";
case GX_VA_TEX5MTXIDX:
return "GX_VA_TEX5MTXIDX";
case GX_VA_TEX6MTXIDX:
return "GX_VA_TEX6MTXIDX";
case GX_VA_TEX7MTXIDX:
return "GX_VA_TEX7MTXIDX";
case GX_VA_POS:
return "GX_VA_POS";
case GX_VA_NRM:
return "GX_VA_NRM";
case GX_VA_CLR0:
return "GX_VA_CLR0";
case GX_VA_CLR1:
return "GX_VA_CLR1";
case GX_VA_TEX0:
return "GX_VA_TEX0";
case GX_VA_TEX1:
return "GX_VA_TEX1";
case GX_VA_TEX2:
return "GX_VA_TEX2";
case GX_VA_TEX3:
return "GX_VA_TEX3";
case GX_VA_TEX4:
return "GX_VA_TEX4";
case GX_VA_TEX5:
return "GX_VA_TEX5";
case GX_VA_TEX6:
return "GX_VA_TEX6";
case GX_VA_TEX7:
return "GX_VA_TEX7";
case GX_POS_MTX_ARRAY:
return "GX_POS_MTX_ARRAY";
case GX_NRM_MTX_ARRAY:
return "GX_NRM_MTX_ARRAY";
case GX_TEX_MTX_ARRAY:
return "GX_TEX_MTX_ARRAY";
case GX_LIGHT_ARRAY:
return "GX_LIGHT_ARRAY";
case GX_VA_NBT:
return "GX_VA_NBT";
case GX_VA_NULL:
return "GX_VA_NULL";
default:
return fmt::format("GXAttr({})", underlying(attr));
}
}
inline std::string format_as(const GXCompCnt& cnt) {
switch (cnt) {
case GX_POS_XY:
return "GX_POS_XY|GX_NRM_XYZ|GX_CLR_RGB|GX_TEX_S";
case GX_POS_XYZ:
return "GX_POS_XYZ|GX_NRM_NBT|GX_CLR_RGBA|GX_TEX_ST";
case GX_NRM_NBT3:
return "GX_NRM_NBT3";
default:
return fmt::format("GXCompCnt({})", underlying(cnt));
}
}
inline std::string format_as(const GXCompType& type) {
switch (type) {
case GX_U8:
return "GX_U8|GX_RGB565";
case GX_S8:
return "GX_S8|GX_RGB8";
case GX_U16:
return "GX_U16|GX_RGBX8";
case GX_S16:
return "GX_S16|GX_RGBA4";
case GX_F32:
return "GX_F32|GX_RGBA6";
case GX_RGBA8:
return "GX_RGBA8";
default:
return fmt::format("GXCompType({})", underlying(type));
}
}
inline std::string format_as(const GXAttrType& type) {
switch (type) {
case GX_NONE:
return "GX_NONE";
case GX_DIRECT:
return "GX_DIRECT";
case GX_INDEX8:
return "GX_INDEX8";
case GX_INDEX16:
return "GX_INDEX16";
default:
return fmt::format("GXAttrType({})", underlying(type));
}
}
inline std::string format_as(const GXVtxFmt& fmt) {
switch (fmt) {
case GX_VTXFMT0:
return "GX_VTXFMT0";
case GX_VTXFMT1:
return "GX_VTXFMT1";
case GX_VTXFMT2:
return "GX_VTXFMT2";
case GX_VTXFMT3:
return "GX_VTXFMT3";
case GX_VTXFMT4:
return "GX_VTXFMT4";
case GX_VTXFMT5:
return "GX_VTXFMT5";
case GX_VTXFMT6:
return "GX_VTXFMT6";
case GX_VTXFMT7:
return "GX_VTXFMT7";
default:
return fmt::format("GXVtxFmt({})", underlying(fmt));
} }
} }
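
These format_as overloads plug into {fmt}'s ADL-based extension point, which is what lets the call sites above pass GX enums straight to FATAL/format without static_casts. Usage sketch:

    // format_as is found by argument-dependent lookup; no fmt::formatter needed.
    fmt::print("{} {}\n", GX_VA_POS, GX_INDEX16);  // prints "GX_VA_POS GX_INDEX16"
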

File diff suppressed because it is too large

View File

@ -1,60 +1,29 @@
#include "shader.hpp" #include "shader.hpp"
#include "../../webgpu/gpu.hpp" #include "../../webgpu/gpu.hpp"
#include "../gx_fmt.hpp"
#include <absl/container/flat_hash_map.h> #include <absl/container/flat_hash_map.h>
namespace aurora::gfx::model { namespace aurora::gfx::model {
static Module Log("aurora::gfx::model"); static Module Log("aurora::gfx::model");
template <typename T>
constexpr T bswap16(T val) noexcept {
static_assert(sizeof(T) == sizeof(u16));
union {
u16 u;
T t;
} v{.t = val};
#if __GNUC__
v.u = __builtin_bswap16(v.u);
#elif _WIN32
v.u = _byteswap_ushort(v.u);
#else
v.u = (v.u << 8) | ((v.u >> 8) & 0xFF);
#endif
return v.t;
}
template <typename T>
constexpr T bswap32(T val) noexcept {
static_assert(sizeof(T) == sizeof(u32));
union {
u32 u;
T t;
} v{.t = val};
#if __GNUC__
v.u = __builtin_bswap32(v.u);
#elif _WIN32
v.u = _byteswap_ulong(v.u);
#else
v.u = ((v.u & 0x0000FFFF) << 16) | ((v.u & 0xFFFF0000) >> 16) | ((v.u & 0x00FF00FF) << 8) | ((v.u & 0xFF00FF00) >> 8);
#endif
return v.t;
}
using IndexedAttrs = std::array<bool, GX_VA_MAX_ATTR>; using IndexedAttrs = std::array<bool, GX_VA_MAX_ATTR>;
struct DisplayListCache { struct DisplayListCache {
ByteBuffer vtxBuf; ByteBuffer vtxBuf;
ByteBuffer idxBuf; ByteBuffer idxBuf;
IndexedAttrs indexedAttrs; IndexedAttrs indexedAttrs;
GXVtxFmt fmt;
DisplayListCache(ByteBuffer&& vtxBuf, ByteBuffer&& idxBuf, IndexedAttrs indexedAttrs) DisplayListCache(ByteBuffer&& vtxBuf, ByteBuffer&& idxBuf, IndexedAttrs indexedAttrs, GXVtxFmt fmt)
: vtxBuf(std::move(vtxBuf)), idxBuf(std::move(idxBuf)), indexedAttrs(indexedAttrs) {} : vtxBuf(std::move(vtxBuf)), idxBuf(std::move(idxBuf)), indexedAttrs(indexedAttrs), fmt(fmt) {}
}; };
static absl::flat_hash_map<HashType, DisplayListCache> sCachedDisplayLists; static absl::flat_hash_map<HashType, DisplayListCache> sCachedDisplayLists;
static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u16 vtxCount, static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u16 vtxCount,
IndexedAttrs& indexedAttrs) { IndexedAttrs& indexedAttrs) {
using aurora::gfx::gx::g_gxState; using gx::g_gxState;
struct { struct {
u8 count; u8 count;
GXCompType type; GXCompType type;
@ -66,14 +35,13 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u
for (int attr = 0; attr < GX_VA_MAX_ATTR; attr++) { for (int attr = 0; attr < GX_VA_MAX_ATTR; attr++) {
const auto& attrFmt = g_gxState.vtxFmts[vtxfmt].attrs[attr]; const auto& attrFmt = g_gxState.vtxFmts[vtxfmt].attrs[attr];
switch (g_gxState.vtxDesc[attr]) { switch (g_gxState.vtxDesc[attr]) {
DEFAULT_FATAL("unhandled attribute type {}", static_cast<int>(g_gxState.vtxDesc[attr])); DEFAULT_FATAL("unhandled attribute type {}", g_gxState.vtxDesc[attr]);
case GX_NONE: case GX_NONE:
break; break;
case GX_DIRECT: case GX_DIRECT:
#define COMBINE(val1, val2, val3) (((val1) << 16) | ((val2) << 8) | (val3)) #define COMBINE(val1, val2, val3) (((val1) << 16) | ((val2) << 8) | (val3))
switch (COMBINE(attr, attrFmt.cnt, attrFmt.type)) { switch (COMBINE(attr, attrFmt.cnt, attrFmt.type)) {
DEFAULT_FATAL("not handled: attr {}, cnt {}, type {}", static_cast<int>(attr), static_cast<int>(attrFmt.cnt), DEFAULT_FATAL("not handled: attr {}, cnt {}, type {}", attr, attrFmt.cnt, attrFmt.type);
static_cast<int>(attrFmt.type));
case COMBINE(GX_VA_POS, GX_POS_XYZ, GX_F32): case COMBINE(GX_VA_POS, GX_POS_XYZ, GX_F32):
case COMBINE(GX_VA_NRM, GX_NRM_XYZ, GX_F32): case COMBINE(GX_VA_NRM, GX_NRM_XYZ, GX_F32):
attrArrays[attr].count = 3; attrArrays[attr].count = 3;
@ -150,12 +118,10 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u
for (u32 v = 0; v < vtxCount; ++v) { for (u32 v = 0; v < vtxCount; ++v) {
for (int attr = 0; attr < GX_VA_MAX_ATTR; attr++) { for (int attr = 0; attr < GX_VA_MAX_ATTR; attr++) {
if (g_gxState.vtxDesc[attr] == GX_INDEX8) { if (g_gxState.vtxDesc[attr] == GX_INDEX8) {
u16 index = *ptr; buf.append(static_cast<u16>(*ptr));
buf.append(&index, 2);
++ptr; ++ptr;
} else if (g_gxState.vtxDesc[attr] == GX_INDEX16) { } else if (g_gxState.vtxDesc[attr] == GX_INDEX16) {
u16 index = bswap16(*reinterpret_cast<const u16*>(ptr)); buf.append(bswap(*reinterpret_cast<const u16*>(ptr)));
buf.append(&index, 2);
ptr += 2; ptr += 2;
} }
if (g_gxState.vtxDesc[attr] != GX_DIRECT) { if (g_gxState.vtxDesc[attr] != GX_DIRECT) {
@ -182,7 +148,7 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u
break; break;
case GX_U16: case GX_U16:
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
const auto value = bswap16(reinterpret_cast<const u16*>(ptr)[i]); const auto value = bswap(reinterpret_cast<const u16*>(ptr)[i]);
out[i] = static_cast<f32>(value) / static_cast<f32>(1 << attrFmt.frac); out[i] = static_cast<f32>(value) / static_cast<f32>(1 << attrFmt.frac);
} }
buf.append(out.data(), sizeof(f32) * count); buf.append(out.data(), sizeof(f32) * count);
@ -190,7 +156,7 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u
break; break;
case GX_S16: case GX_S16:
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
const auto value = bswap16(reinterpret_cast<const s16*>(ptr)[i]); const auto value = bswap(reinterpret_cast<const s16*>(ptr)[i]);
out[i] = static_cast<f32>(value) / static_cast<f32>(1 << attrFmt.frac); out[i] = static_cast<f32>(value) / static_cast<f32>(1 << attrFmt.frac);
} }
buf.append(out.data(), sizeof(f32) * count); buf.append(out.data(), sizeof(f32) * count);
@ -198,7 +164,7 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u
break; break;
case GX_F32: case GX_F32:
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
out[i] = bswap32(reinterpret_cast<const f32*>(ptr)[i]); out[i] = bswap(reinterpret_cast<const f32*>(ptr)[i]);
} }
buf.append(out.data(), sizeof(f32) * count); buf.append(out.data(), sizeof(f32) * count);
ptr += count * sizeof(f32); ptr += count * sizeof(f32);
@ -227,7 +193,7 @@ static u16 prepare_idx_buffer(ByteBuffer& buf, GXPrimitive prim, u16 vtxStart, u
buf.reserve_extra(vtxCount * sizeof(u16)); buf.reserve_extra(vtxCount * sizeof(u16));
for (u16 v = 0; v < vtxCount; ++v) { for (u16 v = 0; v < vtxCount; ++v) {
const u16 idx = vtxStart + v; const u16 idx = vtxStart + v;
buf.append(&idx, sizeof(u16)); buf.append(idx);
++numIndices; ++numIndices;
} }
} else if (prim == GX_TRIANGLEFAN) { } else if (prim == GX_TRIANGLEFAN) {
@ -235,29 +201,26 @@ static u16 prepare_idx_buffer(ByteBuffer& buf, GXPrimitive prim, u16 vtxStart, u
for (u16 v = 0; v < vtxCount; ++v) { for (u16 v = 0; v < vtxCount; ++v) {
const u16 idx = vtxStart + v; const u16 idx = vtxStart + v;
if (v < 3) { if (v < 3) {
buf.append(&idx, sizeof(u16)); buf.append(idx);
++numIndices; ++numIndices;
continue; continue;
} }
const std::array<u16, 3> idxs{vtxStart, u16(idx - 1), idx}; buf.append(std::array{vtxStart, static_cast<u16>(idx - 1), idx});
buf.append(idxs.data(), sizeof(u16) * 3);
numIndices += 3; numIndices += 3;
} }
} else if (prim == GX_TRIANGLESTRIP) { } else if (prim == GX_TRIANGLESTRIP) {
buf.reserve_extra(((u32(vtxCount) - 3) * 3 + 3) * sizeof(u16)); buf.reserve_extra(((static_cast<u32>(vtxCount) - 3) * 3 + 3) * sizeof(u16));
for (u16 v = 0; v < vtxCount; ++v) { for (u16 v = 0; v < vtxCount; ++v) {
const u16 idx = vtxStart + v; const u16 idx = vtxStart + v;
if (v < 3) { if (v < 3) {
buf.append(&idx, sizeof(u16)); buf.append(idx);
++numIndices; ++numIndices;
continue; continue;
} }
if ((v & 1) == 0) { if ((v & 1) == 0) {
const std::array<u16, 3> idxs{u16(idx - 2), u16(idx - 1), idx}; buf.append(std::array{static_cast<u16>(idx - 2), static_cast<u16>(idx - 1), idx});
buf.append(idxs.data(), sizeof(u16) * 3);
} else { } else {
const std::array<u16, 3> idxs{u16(idx - 1), u16(idx - 2), idx}; buf.append(std::array{static_cast<u16>(idx - 1), static_cast<u16>(idx - 2), idx});
buf.append(idxs.data(), sizeof(u16) * 3);
} }
numIndices += 3; numIndices += 3;
} }
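
prepare_idx_buffer flattens fans and strips into plain triangle lists (WebGPU offers no fan topology, and this keeps every surface on a single TriangleList pipeline): the first three vertices emit one triangle, and each vertex after that emits three more indices, with strip winding alternated by the (v & 1) test. A vtxCount-vertex strip or fan therefore yields 3 * (vtxCount - 2) indices, which is exactly the reserve_extra math above. Worked example:

    constexpr u16 vtxCount = 5;                    // 5-vertex triangle strip
    constexpr u32 numIndices = 3 * (vtxCount - 2); // == 9: {0,1,2}, {2,1,3}, {2,3,4}
    static_assert(numIndices == 9);
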
@ -271,6 +234,7 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
Range vertRange, idxRange; Range vertRange, idxRange;
u32 numIndices = 0; u32 numIndices = 0;
IndexedAttrs indexedAttrs{}; IndexedAttrs indexedAttrs{};
GXVtxFmt fmt = GX_MAX_VTXFMT;
auto it = sCachedDisplayLists.find(hash); auto it = sCachedDisplayLists.find(hash);
if (it != sCachedDisplayLists.end()) { if (it != sCachedDisplayLists.end()) {
const auto& cache = it->second; const auto& cache = it->second;
@ -278,6 +242,7 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
vertRange = push_verts(cache.vtxBuf.data(), cache.vtxBuf.size()); vertRange = push_verts(cache.vtxBuf.data(), cache.vtxBuf.size());
idxRange = push_indices(cache.idxBuf.data(), cache.idxBuf.size()); idxRange = push_indices(cache.idxBuf.data(), cache.idxBuf.size());
indexedAttrs = cache.indexedAttrs; indexedAttrs = cache.indexedAttrs;
fmt = cache.fmt;
} else { } else {
const u8* data = dlStart; const u8* data = dlStart;
u32 pos = 0; u32 pos = 0;
@ -302,8 +267,12 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
case GX_DRAW_TRIANGLE_STRIP: case GX_DRAW_TRIANGLE_STRIP:
case GX_DRAW_TRIANGLE_FAN: { case GX_DRAW_TRIANGLE_FAN: {
const auto prim = static_cast<GXPrimitive>(opcode); const auto prim = static_cast<GXPrimitive>(opcode);
const auto fmt = static_cast<GXVtxFmt>(cmd & GX_VAT_MASK); const auto newFmt = static_cast<GXVtxFmt>(cmd & GX_VAT_MASK);
u16 vtxCount = bswap16(*reinterpret_cast<const u16*>(data + pos)); if (fmt != GX_MAX_VTXFMT && fmt != newFmt) {
FATAL("Vertex format changed mid-display list: {} -> {}", fmt, newFmt);
}
fmt = newFmt;
u16 vtxCount = bswap(*reinterpret_cast<const u16*>(data + pos));
pos += 2; pos += 2;
pos += vtxCount * prepare_vtx_buffer(vtxBuf, fmt, data + pos, vtxCount, indexedAttrs); pos += vtxCount * prepare_vtx_buffer(vtxBuf, fmt, data + pos, vtxCount, indexedAttrs);
numIndices += prepare_idx_buffer(idxBuf, prim, vtxStart, vtxCount); numIndices += prepare_idx_buffer(idxBuf, prim, vtxStart, vtxCount);
@ -319,22 +288,16 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
} }
vertRange = push_verts(vtxBuf.data(), vtxBuf.size()); vertRange = push_verts(vtxBuf.data(), vtxBuf.size());
idxRange = push_indices(idxBuf.data(), idxBuf.size()); idxRange = push_indices(idxBuf.data(), idxBuf.size());
sCachedDisplayLists.try_emplace(hash, std::move(vtxBuf), std::move(idxBuf), indexedAttrs); sCachedDisplayLists.try_emplace(hash, std::move(vtxBuf), std::move(idxBuf), indexedAttrs, fmt);
} }
gx::BindGroupRanges ranges{}; gx::BindGroupRanges ranges{};
int lastIndexedAttr = -1;
for (int i = 0; i < GX_VA_MAX_ATTR; ++i) { for (int i = 0; i < GX_VA_MAX_ATTR; ++i) {
if (!indexedAttrs[i]) { if (!indexedAttrs[i]) {
continue; continue;
} }
auto& array = gx::g_gxState.arrays[i]; auto& array = gx::g_gxState.arrays[i];
if (lastIndexedAttr >= 0 && array == gx::g_gxState.arrays[lastIndexedAttr]) { if (array.cachedRange.size > 0) {
// Reuse range from last attribute in shader
// Don't set the output range, so it remains unbound
const auto range = gx::g_gxState.arrays[lastIndexedAttr].cachedRange;
array.cachedRange = range;
} else if (array.cachedRange.size > 0) {
// Use the currently cached range // Use the currently cached range
ranges.vaRanges[i] = array.cachedRange; ranges.vaRanges[i] = array.cachedRange;
} else { } else {
@ -343,11 +306,10 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
ranges.vaRanges[i] = range; ranges.vaRanges[i] = range;
array.cachedRange = range; array.cachedRange = range;
} }
lastIndexedAttr = i;
} }
model::PipelineConfig config{}; model::PipelineConfig config{};
populate_pipeline_config(config, GX_TRIANGLES); populate_pipeline_config(config, GX_TRIANGLES, fmt);
const auto info = gx::build_shader_info(config.shaderConfig); const auto info = gx::build_shader_info(config.shaderConfig);
const auto bindGroups = gx::build_bind_groups(info, config.shaderConfig, ranges); const auto bindGroups = gx::build_bind_groups(info, config.shaderConfig, ranges);
const auto pipeline = pipeline_ref(config); const auto pipeline = pipeline_ref(config);
@ -366,7 +328,7 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
State construct_state() { return {}; } State construct_state() { return {}; }
wgpu::RenderPipeline create_pipeline(const State& state, [[maybe_unused]] const PipelineConfig& config) { wgpu::RenderPipeline create_pipeline(const State& state, const PipelineConfig& config) {
const auto info = build_shader_info(config.shaderConfig); // TODO remove const auto info = build_shader_info(config.shaderConfig); // TODO remove
const auto shader = build_shader(config.shaderConfig, info); const auto shader = build_shader(config.shaderConfig, info);
@ -385,7 +347,7 @@ wgpu::RenderPipeline create_pipeline(const State& state, [[maybe_unused]] const
// Indexed attributes // Indexed attributes
for (u32 i = 0; i < num4xAttr; ++i) { for (u32 i = 0; i < num4xAttr; ++i) {
vtxAttrs[shaderLocation] = { vtxAttrs[shaderLocation] = {
.format = wgpu::VertexFormat::Sint16x4, .format = wgpu::VertexFormat::Uint16x4,
.offset = offset, .offset = offset,
.shaderLocation = shaderLocation, .shaderLocation = shaderLocation,
}; };
@ -394,7 +356,7 @@ wgpu::RenderPipeline create_pipeline(const State& state, [[maybe_unused]] const
} }
for (u32 i = 0; i < num2xAttr; ++i) { for (u32 i = 0; i < num2xAttr; ++i) {
vtxAttrs[shaderLocation] = { vtxAttrs[shaderLocation] = {
.format = wgpu::VertexFormat::Sint16x2, .format = wgpu::VertexFormat::Uint16x2,
.offset = offset, .offset = offset,
.shaderLocation = shaderLocation, .shaderLocation = shaderLocation,
}; };
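
The indexed-attribute columns are written into the vertex buffer as unsigned 16-bit values (note the static_cast<u16> appends in prepare_vtx_buffer), so advertising them to WebGPU as Sint16xN would reinterpret any index at or above 0x8000 as negative before it reached the shader. Uint16xN matches the data actually written:

    const u16 idx = 0x8123;                     // a valid GX_INDEX16 value
    const s16 misread = static_cast<s16>(idx);  // -32477 under Sint16xN: bogus fetch
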

View File

@ -1,82 +0,0 @@
#include "shader.hpp"
#include "../../webgpu/gpu.hpp"
namespace aurora::gfx::stream {
static Module Log("aurora::gfx::stream");
using webgpu::g_device;
wgpu::RenderPipeline create_pipeline(const State& state, [[maybe_unused]] const PipelineConfig& config) {
const auto info = build_shader_info(config.shaderConfig); // TODO remove
const auto shader = build_shader(config.shaderConfig, info);
std::array<wgpu::VertexAttribute, 4> attributes{};
attributes[0] = wgpu::VertexAttribute{
.format = wgpu::VertexFormat::Float32x3,
.offset = 0,
.shaderLocation = 0,
};
uint64_t offset = 12;
uint32_t shaderLocation = 1;
if (config.shaderConfig.vtxAttrs[GX_VA_NRM] == GX_DIRECT) {
attributes[shaderLocation] = wgpu::VertexAttribute{
.format = wgpu::VertexFormat::Float32x3,
.offset = offset,
.shaderLocation = shaderLocation,
};
offset += 12;
shaderLocation++;
}
if (config.shaderConfig.vtxAttrs[GX_VA_CLR0] == GX_DIRECT) {
attributes[shaderLocation] = wgpu::VertexAttribute{
.format = wgpu::VertexFormat::Float32x4,
.offset = offset,
.shaderLocation = shaderLocation,
};
offset += 16;
shaderLocation++;
}
for (int i = GX_VA_TEX0; i < GX_VA_TEX7; ++i) {
if (config.shaderConfig.vtxAttrs[i] != GX_DIRECT) {
continue;
}
attributes[shaderLocation] = wgpu::VertexAttribute{
.format = wgpu::VertexFormat::Float32x2,
.offset = offset,
.shaderLocation = shaderLocation,
};
offset += 8;
shaderLocation++;
}
const std::array vertexBuffers{wgpu::VertexBufferLayout{
.arrayStride = offset,
.attributeCount = shaderLocation,
.attributes = attributes.data(),
}};
return build_pipeline(config, info, vertexBuffers, shader, "Stream Pipeline");
}
State construct_state() { return {}; }
void render(const State& state, const DrawData& data, const wgpu::RenderPassEncoder& pass) {
if (!bind_pipeline(data.pipeline, pass)) {
return;
}
const std::array offsets{data.uniformRange.offset};
pass.SetBindGroup(0, find_bind_group(data.bindGroups.uniformBindGroup), offsets.size(), offsets.data());
if (data.bindGroups.samplerBindGroup && data.bindGroups.textureBindGroup) {
pass.SetBindGroup(1, find_bind_group(data.bindGroups.samplerBindGroup));
pass.SetBindGroup(2, find_bind_group(data.bindGroups.textureBindGroup));
}
pass.SetVertexBuffer(0, g_vertexBuffer, data.vertRange.offset, data.vertRange.size);
pass.SetIndexBuffer(g_indexBuffer, wgpu::IndexFormat::Uint16, data.indexRange.offset, data.indexRange.size);
if (data.dstAlpha != UINT32_MAX) {
const wgpu::Color color{0.f, 0.f, 0.f, data.dstAlpha / 255.f};
pass.SetBlendConstant(&color);
}
pass.DrawIndexed(data.indexCount);
}
} // namespace aurora::gfx::stream

View File

@ -1,24 +0,0 @@
#pragma once
#include "../common.hpp"
#include "../gx.hpp"
namespace aurora::gfx::stream {
struct DrawData {
PipelineRef pipeline;
Range vertRange;
Range uniformRange;
Range indexRange;
uint32_t indexCount;
gx::GXBindGroups bindGroups;
u32 dstAlpha;
};
struct PipelineConfig : public gx::PipelineConfig {};
struct State {};
State construct_state();
wgpu::RenderPipeline create_pipeline(const State& state, [[maybe_unused]] const PipelineConfig& config);
void render(const State& state, const DrawData& data, const wgpu::RenderPassEncoder& pass);
} // namespace aurora::gfx::stream

View File

@ -66,17 +66,6 @@ static size_t ComputeMippedBlockCountDXT1(uint32_t w, uint32_t h, uint32_t mips)
return ret; return ret;
} }
template <typename T>
constexpr T bswap16(T val) noexcept {
#if __GNUC__
return __builtin_bswap16(val);
#elif _WIN32
return _byteswap_ushort(val);
#else
return (val = (val << 8) | ((val >> 8) & 0xFF));
#endif
}
template <typename T> template <typename T>
concept TextureDecoder = requires(T) { concept TextureDecoder = requires(T) {
typename T::Source; typename T::Source;
@ -178,15 +167,15 @@ struct TextureDecoderIA4 {
}; };
struct TextureDecoderIA8 { struct TextureDecoderIA8 {
using Source = uint8_t; using Source = uint16_t;
using Target = RGBA8; using Target = RGBA8;
static constexpr uint32_t Frac = 1; static constexpr uint32_t Frac = 1;
static constexpr uint32_t BlockWidth = 8; static constexpr uint32_t BlockWidth = 4;
static constexpr uint32_t BlockHeight = 4; static constexpr uint32_t BlockHeight = 4;
static void decode_texel(Target* target, const Source* in, const uint32_t x) { static void decode_texel(Target* target, const Source* in, const uint32_t x) {
const auto texel = bswap16(in[x]); const auto texel = bswap(in[x]);
const uint8_t intensity = texel >> 8; const uint8_t intensity = texel >> 8;
target[x].r = intensity; target[x].r = intensity;
target[x].g = intensity; target[x].g = intensity;
@ -228,7 +217,7 @@ struct TextureDecoderRGB565 {
static constexpr uint32_t BlockHeight = 4; static constexpr uint32_t BlockHeight = 4;
static void decode_texel(Target* target, const Source* in, const uint32_t x) { static void decode_texel(Target* target, const Source* in, const uint32_t x) {
const auto texel = bswap16(in[x]); const auto texel = bswap(in[x]);
target[x].r = ExpandTo8<5>(texel >> 11 & 0x1f); target[x].r = ExpandTo8<5>(texel >> 11 & 0x1f);
target[x].g = ExpandTo8<6>(texel >> 5 & 0x3f); target[x].g = ExpandTo8<6>(texel >> 5 & 0x3f);
target[x].b = ExpandTo8<5>(texel & 0x1f); target[x].b = ExpandTo8<5>(texel & 0x1f);
@ -245,7 +234,7 @@ struct TextureDecoderRGB5A3 {
static constexpr uint32_t BlockHeight = 4; static constexpr uint32_t BlockHeight = 4;
static void decode_texel(Target* target, const Source* in, const uint32_t x) { static void decode_texel(Target* target, const Source* in, const uint32_t x) {
const auto texel = bswap16(in[x]); const auto texel = bswap(in[x]);
if ((texel & 0x8000) != 0) { if ((texel & 0x8000) != 0) {
target[x].r = ExpandTo8<5>(texel >> 10 & 0x1f); target[x].r = ExpandTo8<5>(texel >> 10 & 0x1f);
target[x].g = ExpandTo8<5>(texel >> 5 & 0x1f); target[x].g = ExpandTo8<5>(texel >> 5 & 0x1f);
@ -322,8 +311,8 @@ static ByteBuffer BuildDXT1FromGCN(uint32_t width, uint32_t height, uint32_t mip
for (uint32_t y = 0; y < 2; ++y) { for (uint32_t y = 0; y < 2; ++y) {
DXT1Block* target = targetMip + (baseY + y) * w + baseX; DXT1Block* target = targetMip + (baseY + y) * w + baseX;
for (size_t x = 0; x < 2; ++x) { for (size_t x = 0; x < 2; ++x) {
target[x].color1 = bswap16(in[x].color1); target[x].color1 = bswap(in[x].color1);
target[x].color2 = bswap16(in[x].color2); target[x].color2 = bswap(in[x].color2);
for (size_t i = 0; i < 4; ++i) { for (size_t i = 0; i < 4; ++i) {
std::array<uint8_t, 4> ind; std::array<uint8_t, 4> ind;
const uint8_t packed = in[x].lines[i]; const uint8_t packed = in[x].lines[i];
@ -365,8 +354,8 @@ static ByteBuffer BuildRGBA8FromCMPR(uint32_t width, uint32_t height, uint32_t m
for (uint32_t yb = 0; yb < 8; yb += 4) { for (uint32_t yb = 0; yb < 8; yb += 4) {
for (uint32_t xb = 0; xb < 8; xb += 4) { for (uint32_t xb = 0; xb < 8; xb += 4) {
// CMPR difference: Big-endian color1/2 // CMPR difference: Big-endian color1/2
const uint16_t color1 = bswap16(*reinterpret_cast<const uint16_t*>(src)); const uint16_t color1 = bswap(*reinterpret_cast<const uint16_t*>(src));
const uint16_t color2 = bswap16(*reinterpret_cast<const uint16_t*>(src + 2)); const uint16_t color2 = bswap(*reinterpret_cast<const uint16_t*>(src + 2));
src += 4; src += 4;
// Fill in first two colors in color table. // Fill in first two colors in color table.
@ -480,4 +469,4 @@ ByteBuffer convert_tlut(u32 format, uint32_t width, ArrayRef<uint8_t> data) {
return DecodeLinear<TextureDecoderRGB5A3>(width, data); return DecodeLinear<TextureDecoderRGB5A3>(width, data);
} }
} }
} // namespace aurora::gfx } // namespace aurora::gfx
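
The IA8 fix is twofold: each texel is 16 bits (8-bit intensity plus 8-bit alpha), so Source must be uint16_t for the byte swap to act on whole texels, and a 32-byte GX tile of 16-bit texels holds 4x4 of them, not 8x4. The tile arithmetic:

    constexpr u32 kTileBytes = 32;      // one GX cache-line block for 16-bit formats
    constexpr u32 kIA8TexelBytes = 2;   // intensity + alpha
    static_assert(kTileBytes / kIA8TexelBytes == 16);  // 16 texels -> a 4x4 block
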

View File

@ -6,6 +6,8 @@
#include <array> #include <array>
#include <cassert> #include <cassert>
#include <cstdint>
#include <type_traits>
#include <vector> #include <vector>
using namespace std::string_view_literals; using namespace std::string_view_literals;
@ -21,6 +23,46 @@ using namespace std::string_view_literals;
#endif #endif
#endif #endif
template <typename T>
requires(sizeof(T) == sizeof(uint16_t) && std::is_arithmetic_v<T>)
constexpr T bswap(T val) noexcept {
union {
uint16_t u;
T t;
} v{.t = val};
#if __GNUC__
v.u = __builtin_bswap16(v.u);
#elif _WIN32
v.u = _byteswap_ushort(v.u);
#else
v.u = (v.u << 8) | ((v.u >> 8) & 0xFF);
#endif
return v.t;
}
template <typename T>
requires(sizeof(T) == sizeof(uint32_t) && std::is_arithmetic_v<T>)
constexpr T bswap(T val) noexcept {
union {
uint32_t u;
T t;
} v{.t = val};
#if __GNUC__
v.u = __builtin_bswap32(v.u);
#elif _WIN32
v.u = _byteswap_ulong(v.u);
#else
v.u = ((v.u & 0x0000FFFF) << 16) | ((v.u & 0xFFFF0000) >> 16) | ((v.u & 0x00FF00FF) << 8) | ((v.u & 0xFF00FF00) >> 8);
#endif
return v.t;
}
template <typename T>
requires(std::is_enum_v<T>)
auto underlying(T value) -> std::underlying_type_t<T> {
return static_cast<std::underlying_type_t<T>>(value);
}
#ifndef ALIGN #ifndef ALIGN
#define ALIGN(x, a) (((x) + ((a) - 1)) & ~((a) - 1)) #define ALIGN(x, a) (((x) + ((a) - 1)) & ~((a) - 1))
#endif #endif
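
The bswap16/bswap32 copies formerly duplicated in model/shader.cpp and texture_convert.cpp are consolidated here into one bswap overload set, constrained on value size so s16, u16, f32, and u32 all go through the same name, and underlying() replaces the static_cast<int> noise at every enum logging site. Typical usage:

    const u16 be = 0x1234;
    const u16 le = bswap(be);                   // 0x3412; same spelling works for f32
    const auto raw = underlying(GX_CULL_BACK);  // value of the enum's underlying type
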
@ -33,11 +75,7 @@ using namespace std::string_view_literals;
#else #else
#define UNLIKELY #define UNLIKELY
#endif #endif
#define FATAL(msg, ...) \ #define FATAL(msg, ...) Log.fatal(msg, ##__VA_ARGS__);
{ \
Log.fatal(msg, ##__VA_ARGS__); \
unreachable(); \
}
#define ASSERT(cond, msg, ...) \ #define ASSERT(cond, msg, ...) \
if (!(cond)) \ if (!(cond)) \
UNLIKELY FATAL(msg, ##__VA_ARGS__) UNLIKELY FATAL(msg, ##__VA_ARGS__)

View File

@ -4,15 +4,9 @@
#include <fmt/base.h> #include <fmt/base.h>
#include <fmt/format.h> #include <fmt/format.h>
#include <string_view>
#ifdef __GNUC__ #include <cstdlib>
[[noreturn]] inline __attribute__((always_inline)) void unreachable() { __builtin_unreachable(); } #include <string_view>
#elif defined(_MSC_VER)
[[noreturn]] __forceinline void unreachable() { __assume(false); }
#else
#error Unknown compiler
#endif
namespace aurora { namespace aurora {
void log_internal(AuroraLogLevel level, const char* module, const char* message, unsigned int len) noexcept; void log_internal(AuroraLogLevel level, const char* module, const char* message, unsigned int len) noexcept;
@ -50,7 +44,7 @@ struct Module {
template <typename... T> template <typename... T>
[[noreturn]] void fatal(fmt::format_string<T...> fmt, T&&... args) noexcept { [[noreturn]] void fatal(fmt::format_string<T...> fmt, T&&... args) noexcept {
report(LOG_FATAL, fmt, std::forward<T>(args)...); report(LOG_FATAL, fmt, std::forward<T>(args)...);
unreachable(); std::abort();
} }
}; };
} // namespace aurora } // namespace aurora
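
With the compiler-specific unreachable() shim removed, fatal() now terminates via std::abort() (hence the new <cstdlib> include). abort() is itself [[noreturn]], so fatal()'s attribute stays honest and FATAL can shrink to a single statement. A sketch of why call sites still compile without warnings:

    int to_value(GXCompare comp) {
      switch (comp) {
      case GX_ALWAYS:
        return 1;
      default:
        Log.fatal("unsupported compare {}", comp);  // [[noreturn]]: no missing-return warning
      }
    }
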

View File

@ -385,15 +385,12 @@ bool initialize(AuroraBackend auroraBackend) {
g_adapter.GetLimits(&supportedLimits); g_adapter.GetLimits(&supportedLimits);
const wgpu::Limits requiredLimits{ const wgpu::Limits requiredLimits{
// Use "best" supported alignments // Use "best" supported alignments
.maxTextureDimension1D = supportedLimits.maxTextureDimension1D == 0 .maxTextureDimension1D = supportedLimits.maxTextureDimension1D == 0 ? WGPU_LIMIT_U32_UNDEFINED
? WGPU_LIMIT_U32_UNDEFINED : supportedLimits.maxTextureDimension1D,
: supportedLimits.maxTextureDimension1D, .maxTextureDimension2D = supportedLimits.maxTextureDimension2D == 0 ? WGPU_LIMIT_U32_UNDEFINED
.maxTextureDimension2D = supportedLimits.maxTextureDimension2D == 0 : supportedLimits.maxTextureDimension2D,
? WGPU_LIMIT_U32_UNDEFINED .maxTextureDimension3D = supportedLimits.maxTextureDimension3D == 0 ? WGPU_LIMIT_U32_UNDEFINED
: supportedLimits.maxTextureDimension2D, : supportedLimits.maxTextureDimension3D,
.maxTextureDimension3D = supportedLimits.maxTextureDimension3D == 0
? WGPU_LIMIT_U32_UNDEFINED
: supportedLimits.maxTextureDimension3D,
.minUniformBufferOffsetAlignment = supportedLimits.minUniformBufferOffsetAlignment == 0 .minUniformBufferOffsetAlignment = supportedLimits.minUniformBufferOffsetAlignment == 0
? WGPU_LIMIT_U32_UNDEFINED ? WGPU_LIMIT_U32_UNDEFINED
: supportedLimits.minUniformBufferOffsetAlignment, : supportedLimits.minUniformBufferOffsetAlignment,
@ -401,6 +398,12 @@ bool initialize(AuroraBackend auroraBackend) {
? WGPU_LIMIT_U32_UNDEFINED ? WGPU_LIMIT_U32_UNDEFINED
: supportedLimits.minStorageBufferOffsetAlignment, : supportedLimits.minStorageBufferOffsetAlignment,
}; };
Log.info(
"Using limits\n maxTextureDimension1D: {}\n maxTextureDimension2D: {}\n maxTextureDimension3D: {}\n "
"minUniformBufferOffsetAlignment: {}\n minStorageBufferOffsetAlignment: {}",
requiredLimits.maxTextureDimension1D, requiredLimits.maxTextureDimension2D,
requiredLimits.maxTextureDimension3D, requiredLimits.minUniformBufferOffsetAlignment,
requiredLimits.minStorageBufferOffsetAlignment);
std::vector<wgpu::FeatureName> requiredFeatures; std::vector<wgpu::FeatureName> requiredFeatures;
wgpu::SupportedFeatures supportedFeatures; wgpu::SupportedFeatures supportedFeatures;
g_adapter.GetFeatures(&supportedFeatures); g_adapter.GetFeatures(&supportedFeatures);
@ -442,22 +445,20 @@ bool initialize(AuroraBackend auroraBackend) {
}); });
deviceDescriptor.SetUncapturedErrorCallback( deviceDescriptor.SetUncapturedErrorCallback(
[](const wgpu::Device& device, wgpu::ErrorType type, wgpu::StringView message) { [](const wgpu::Device& device, wgpu::ErrorType type, wgpu::StringView message) {
FATAL("WebGPU error {}: {}", static_cast<int>(type), message); FATAL("WebGPU error {}: {}", underlying(type), message);
});
deviceDescriptor.SetDeviceLostCallback(
wgpu::CallbackMode::AllowSpontaneous,
[](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
Log.warn("Device lost: {}", message);
});
const auto future = g_adapter.RequestDevice(
&deviceDescriptor, wgpu::CallbackMode::WaitAnyOnly,
[](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
if (status == wgpu::RequestDeviceStatus::Success) {
g_device = std::move(device);
} else {
Log.warn("Device request failed: {}", message);
}
}); });
deviceDescriptor.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous,
[](const wgpu::Device& device, wgpu::DeviceLostReason reason,
wgpu::StringView message) { Log.warn("Device lost: {}", message); });
const auto future =
g_adapter.RequestDevice(&deviceDescriptor, wgpu::CallbackMode::WaitAnyOnly,
[](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
if (status == wgpu::RequestDeviceStatus::Success) {
g_device = std::move(device);
} else {
Log.warn("Device request failed: {}", message);
}
});
const auto status = g_instance.WaitAny(future, 5000000000); const auto status = g_instance.WaitAny(future, 5000000000);
if (status != wgpu::WaitStatus::Success) { if (status != wgpu::WaitStatus::Success) {
Log.error("Failed to create device: {}", magic_enum::enum_name(status)); Log.error("Failed to create device: {}", magic_enum::enum_name(status));
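
One non-obvious detail in this hunk: WaitAny's timeout parameter is in nanoseconds, so the 5000000000 literal is a five-second budget for device creation. Spelled out:

    constexpr uint64_t kDeviceRequestTimeoutNs = 5'000'000'000;  // 5 s; WaitAny takes ns
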