aurora: Rework indexed attributes support

2025-07-16 02:05:53 +00:00 · 2022-03-26 20:30:29 -04:00 · 2022-03-26 20:30:29 -04:00 · 6e7d123389
commit 6e7d123389
parent a0d5c5c285
6 changed files with 235 additions and 195 deletions
--- a/aurora/lib/gfx/common.cpp
+++ b/aurora/lib/gfx/common.cpp
@ -619,7 +619,6 @@ static inline Range push(ByteBuffer& target, const uint8_t* data, size_t length,
  }
  auto begin = target.size();
  if (length == 0) {
-    // TODO shared zero buf?
    length = alignment;
    target.append_zeroes(alignment);
  } else {
@ -642,8 +641,8 @@ static inline Range map(ByteBuffer& target, size_t length, size_t alignment) {
  target.append_zeroes(length + padding);
  return {static_cast<uint32_t>(begin), static_cast<uint32_t>(length + padding)};
 }
-Range push_verts(const uint8_t* data, size_t length) { return push(g_verts, data, length, 0 /* TODO? */); }
-Range push_indices(const uint8_t* data, size_t length) { return push(g_indices, data, length, 0 /* TODO? */); }
+Range push_verts(const uint8_t* data, size_t length) { return push(g_verts, data, length, 4); } // copies into g_verts; 4 is the alignment handed to push()
+Range push_indices(const uint8_t* data, size_t length) { return push(g_indices, data, length, 4); } // copies into g_indices; 4 is the alignment handed to push()
 Range push_uniform(const uint8_t* data, size_t length) {
  wgpu::SupportedLimits limits;
  g_device.GetLimits(&limits);
@ -662,11 +661,11 @@ Range push_static_storage(const uint8_t* data, size_t length) {
  return range;
 }
 std::pair<ByteBuffer, Range> map_verts(size_t length) {
-  const auto range = map(g_verts, length, 0 /* TODO? */);
+  const auto range = map(g_verts, length, 4); // reserve zero-filled space in g_verts, requesting an alignment of 4
  return {ByteBuffer{g_verts.data() + range.offset, range.size}, range};
 }
 std::pair<ByteBuffer, Range> map_indices(size_t length) {
-  const auto range = map(g_indices, length, 0 /* TODO? */);
+  const auto range = map(g_indices, length, 4); // reserve zero-filled space in g_indices, requesting an alignment of 4
  return {ByteBuffer{g_indices.data() + range.offset, range.size}, range};
 }
 std::pair<ByteBuffer, Range> map_uniform(size_t length) {
--- a/aurora/lib/gfx/gx.cpp
+++ b/aurora/lib/gfx/gx.cpp
@ -489,10 +489,9 @@ ShaderInfo populate_pipeline_config(PipelineConfig& config, GX::Primitive primit
    config.shaderConfig.tcgs[i] = g_gxState.tcgs[i];
  }
  config.shaderConfig.alphaCompare = g_gxState.alphaCompare;
-  if (std::any_of(config.shaderConfig.vtxAttrs.begin(), config.shaderConfig.vtxAttrs.end(),
-                  [](const auto type) { return type == GX::INDEX8 || type == GX::INDEX16; })) {
-    config.shaderConfig.hasIndexedAttributes = true;
-  }
+  config.shaderConfig.indexedAttributeCount =
+      std::count_if(config.shaderConfig.vtxAttrs.begin(), config.shaderConfig.vtxAttrs.end(),
+                    [](const auto type) { return type == GX::INDEX8 || type == GX::INDEX16; });
  config = {
      .shaderConfig = config.shaderConfig,
      .primitive = primitive,
@ -658,17 +657,17 @@ GXBindGroups build_bind_groups(const ShaderInfo& info, const ShaderConfig& confi
          .buffer = g_storageBuffer,
          .size = ranges.nrmDataRange.size,
      },
-      // UVs
+      // Packed UVs
      wgpu::BindGroupEntry{
          .binding = 3,
          .buffer = g_storageBuffer,
-          .size = ranges.tcDataRange.size,
+          .size = ranges.packedTcDataRange.size,
      },
-      // Packed UVs
+      // UVs
      wgpu::BindGroupEntry{
          .binding = 4,
          .buffer = g_storageBuffer,
-          .size = ranges.packedTcDataRange.size,
+          .size = ranges.tcDataRange.size,
      },
  };
  std::array<wgpu::BindGroupEntry, MaxTextures> samplerEntries;
@ -696,7 +695,7 @@ GXBindGroups build_bind_groups(const ShaderInfo& info, const ShaderConfig& confi
      .uniformBindGroup = bind_group_ref(wgpu::BindGroupDescriptor{
          .label = "GX Uniform Bind Group",
          .layout = layouts.uniformLayout,
-          .entryCount = static_cast<uint32_t>(config.hasIndexedAttributes ? uniformEntries.size() : 1),
+          .entryCount = static_cast<uint32_t>(config.indexedAttributeCount > 0 ? uniformEntries.size() : 1),
          .entries = uniformEntries.data(),
      }),
      .samplerBindGroup = bind_group_ref(wgpu::BindGroupDescriptor{
@ -716,7 +715,7 @@ GXBindGroups build_bind_groups(const ShaderInfo& info, const ShaderConfig& confi

 GXBindGroupLayouts build_bind_group_layouts(const ShaderInfo& info, const ShaderConfig& config) noexcept {
  GXBindGroupLayouts out;
-  u32 uniformSizeKey = info.uniformSize + (config.hasIndexedAttributes ? 1 : 0);
+  u32 uniformSizeKey = info.uniformSize + (config.indexedAttributeCount > 0 ? 1 : 0);
  const auto uniformIt = sUniformBindGroupLayouts.find(uniformSizeKey);
  if (uniformIt != sUniformBindGroupLayouts.end()) {
    out.uniformLayout = uniformIt->second;
@ -771,7 +770,7 @@ GXBindGroupLayouts build_bind_group_layouts(const ShaderInfo& info, const Shader
    };
    const auto uniformLayoutDescriptor = wgpu::BindGroupLayoutDescriptor{
        .label = "GX Uniform Bind Group Layout",
-        .entryCount = static_cast<uint32_t>(config.hasIndexedAttributes ? uniformLayoutEntries.size() : 1),
+        .entryCount = static_cast<uint32_t>(config.indexedAttributeCount > 0 ? uniformLayoutEntries.size() : 1),
        .entries = uniformLayoutEntries.data(),
    };
    out.uniformLayout = g_device.CreateBindGroupLayout(&uniformLayoutDescriptor);
--- a/aurora/lib/gfx/gx.hpp
+++ b/aurora/lib/gfx/gx.hpp
@ -162,7 +162,7 @@ struct ShaderConfig {
  std::array<ColorChannelConfig, MaxColorChannels> colorChannels;
  std::array<TcgConfig, MaxTexCoord> tcgs;
  AlphaCompare alphaCompare;
-  bool hasIndexedAttributes = false;
+  u32 indexedAttributeCount = 0;
  bool operator==(const ShaderConfig&) const = default;
 };
 struct PipelineConfig {
@ -216,16 +216,6 @@ Range build_uniform(const ShaderInfo& info) noexcept;
 GXBindGroupLayouts build_bind_group_layouts(const ShaderInfo& info, const ShaderConfig& config) noexcept;
 GXBindGroups build_bind_groups(const ShaderInfo& info, const ShaderConfig& config,
                               const BindGroupRanges& ranges) noexcept;
-
-struct DlVert {
-  s16 pos;
-  s16 norm;
-  // colors ignored
-  std::array<s16, 7> uvs;
-  // pn_mtx_idx ignored
-  // tex_mtx_idxs ignored
-  s16 _pad;
-};
 } // namespace aurora::gfx::gx

 namespace aurora {
@ -301,6 +291,6 @@ inline void xxh3_update(XXH3_state_t& state, const gfx::gx::ShaderConfig& input)
  if (input.alphaCompare) {
    xxh3_update(state, input.alphaCompare);
  }
-  XXH3_64bits_update(&state, &input.hasIndexedAttributes, sizeof(gfx::gx::ShaderConfig::hasIndexedAttributes));
+  XXH3_64bits_update(&state, &input.indexedAttributeCount, sizeof(gfx::gx::ShaderConfig::indexedAttributeCount));
 }
 } // namespace aurora
--- a/aurora/lib/gfx/gx_shader.cpp
+++ b/aurora/lib/gfx/gx_shader.cpp
@ -396,42 +396,30 @@ static inline std::string vtx_attr(const ShaderConfig& config, GX::Attr attr) {
    unreachable();
  }
  if (attr == GX::VA_POS) {
-    if (type == GX::DIRECT) {
-      return "in_pos";
-    }
-    return "v_verts.data[in_pos_nrm_idx[0]].xyz";
+    return "in_pos";
  }
  if (attr == GX::VA_NRM) {
-    if (type == GX::DIRECT) {
-      return "in_nrm";
-    }
-    return "v_norms.data[in_pos_nrm_idx[1]].xyz";
+    return "in_nrm";
  }
  if (attr == GX::VA_CLR0 || attr == GX::VA_CLR1) {
    const auto idx = attr - GX::VA_CLR0;
-    if (type == GX::DIRECT) {
-      return fmt::format(FMT_STRING("in_clr{}"), idx);
-    }
-    Log.report(logvisor::Fatal, FMT_STRING("indexed color unsupported"));
-    unreachable();
+    return fmt::format(FMT_STRING("in_clr{}"), idx);
  }
  if (attr >= GX::VA_TEX0 && attr <= GX::VA_TEX7) {
    const auto idx = attr - GX::VA_TEX0;
-    if (type == GX::DIRECT) {
-      return fmt::format(FMT_STRING("in_tex{}_uv"), idx);
-    }
-    if (idx == 0) {
-      return "v_packed_uvs.data[in_uv_0_4_idx[0]]";
-    }
-    if (idx < 4) {
-      return fmt::format(FMT_STRING("v_uvs.data[in_uv_0_4_idx[{}]]"), idx);
-    }
-    return fmt::format(FMT_STRING("v_uvs.data[in_uv_5_7_idx[{}]]"), idx - 4);
+    return fmt::format(FMT_STRING("in_tex{}_uv"), idx);
  }
  Log.report(logvisor::Fatal, FMT_STRING("unhandled attr {}"), attr);
  unreachable();
 }

+constexpr std::array<std::string_view, MaxVtxAttr> VtxAttributeNames{ // looked up by GX::Attr value; build_shader uses the name as the suffix of v_arr_<name>
+    "pn_mtx",        "tex0_mtx",      "tex1_mtx",      "tex2_mtx",    "tex3_mtx", "tex4_mtx", "tex5_mtx",
+    "tex6_mtx",      "tex7_mtx",      "pos",           "nrm",         "clr0",     "clr1",     "tex0_uv",
+    "tex1_uv",       "tex2_uv",       "tex3_uv",       "tex4_uv",     "tex5_uv",  "tex6_uv",  "tex7_uv",
+    "pos_mtx_array", "nrm_mtx_array", "tex_mtx_array", "light_array", "nbt",
+};
+
 std::pair<wgpu::ShaderModule, ShaderInfo> build_shader(const ShaderConfig& config) noexcept {
  const auto hash = xxh3_hash(config);
  const auto it = g_gxCachedShaders.find(hash);
@ -496,7 +484,7 @@ std::pair<wgpu::ShaderModule, ShaderInfo> build_shader(const ShaderConfig& confi
    Log.report(logvisor::Info, FMT_STRING("  alphaCompare: comp0 {} ref0 {} op {} comp1 {} ref1 {}"),
               config.alphaCompare.comp0, config.alphaCompare.ref0, config.alphaCompare.op, config.alphaCompare.comp1,
               config.alphaCompare.ref1);
-    Log.report(logvisor::Info, FMT_STRING("  hasIndexedAttributes: {}"), config.hasIndexedAttributes);
+    Log.report(logvisor::Info, FMT_STRING("  indexedAttributeCount: {}"), config.indexedAttributeCount);
    Log.report(logvisor::Info, FMT_STRING("  fogType: {}"), config.fogType);
  }

@ -511,29 +499,66 @@ std::pair<wgpu::ShaderModule, ShaderInfo> build_shader(const ShaderConfig& confi
  std::string vtxXfrAttrs;
  size_t locIdx = 0;
  size_t vtxOutIdx = 0;
-  if (config.hasIndexedAttributes) {
+  size_t uniBindingIdx = 1;
+  if (config.indexedAttributeCount > 0) {
    // Display list attributes
-    vtxInAttrs +=
-        "\n    @location(0) in_pos_nrm_idx: vec2<i32>"
-        "\n    , @location(1) in_uv_0_4_idx: vec4<i32>"
-        "\n    , @location(2) in_uv_5_7_idx: vec4<i32>";
-    locIdx += 3;
-    uniformBindings += R"""(
-struct Vec3Block {
-    data: array<vec4<f32>>;
-};
-struct Vec2Block {
-    data: array<vec2<f32>>;
-};
-@group(0) @binding(1)
-var<storage, read> v_verts: Vec3Block;
-@group(0) @binding(2)
-var<storage, read> v_norms: Vec3Block;
-@group(0) @binding(3)
-var<storage, read> v_uvs: Vec2Block;
-@group(0) @binding(4)
-var<storage, read> v_packed_uvs: Vec2Block;
-)""";
+    int currAttrIdx = 0;
+    bool addedTex1Uv = false;
+    for (GX::Attr attr{}; attr < MaxVtxAttr; attr = GX::Attr(attr + 1)) {
+      // Indexed attributes
+      if (config.vtxAttrs[attr] != GX::INDEX8 && config.vtxAttrs[attr] != GX::INDEX16) {
+        continue;
+      }
+      const auto [div, rem] = std::div(currAttrIdx, 4);
+      std::string_view attrName;
+      bool addUniformBinding = true;
+      // TODO: this is a hack to only have to bind tex0_uv and tex1_uv for MP
+      // should figure out a more generic approach
+      if (attr >= GX::VA_TEX1 && attr <= GX::VA_TEX7) {
+        attrName = VtxAttributeNames[GX::VA_TEX1];
+        addUniformBinding = !addedTex1Uv;
+        addedTex1Uv = true;
+      } else {
+        attrName = VtxAttributeNames[attr];
+      }
+      vtxXfrAttrsPre += fmt::format(FMT_STRING("\n    var {} = v_arr_{}[in_dl{}[{}]];"), vtx_attr(config, attr), attrName, div, rem);
+      if (addUniformBinding) {
+        std::string_view arrType;
+        if (attr == GX::VA_POS || attr == GX::VA_NRM) {
+          arrType = "vec3<f32>";
+        } else if (attr >= GX::VA_TEX0 && attr <= GX::VA_TEX7) {
+          arrType = "vec2<f32>";
+        }
+        uniformBindings += fmt::format(
+            "\n@group(0) @binding({})"
+            "\nvar<storage, read> v_arr_{}: array<{}>;",
+            uniBindingIdx++, attrName, arrType);
+      }
+      ++currAttrIdx;
+    }
+    auto [num4xAttrArrays, rem] = std::div(currAttrIdx, 4);
+    u32 num2xAttrArrays = 0;
+    if (rem > 2) {
+      ++num4xAttrArrays;
+    } else if (rem > 0) {
+      num2xAttrArrays = 1;
+    }
+    for (u32 i = 0; i < num4xAttrArrays; ++i) {
+      if (locIdx > 0) {
+        vtxInAttrs += "\n    , ";
+      } else {
+        vtxInAttrs += "\n    ";
+      }
+      vtxInAttrs += fmt::format(FMT_STRING("@location({}) in_dl{}: vec4<i32>"), locIdx++, i);
+    }
+    for (u32 i = 0; i < num2xAttrArrays; ++i) {
+      if (locIdx > 0) {
+        vtxInAttrs += "\n    , ";
+      } else {
+        vtxInAttrs += "\n    ";
+      }
+      vtxInAttrs += fmt::format(FMT_STRING("@location({}) in_dl{}: vec2<i32>"), locIdx++, num4xAttrArrays + i);
+    }
  }
  for (GX::Attr attr{}; attr < MaxVtxAttr; attr = GX::Attr(attr + 1)) {
    // Direct attributes
--- a/aurora/lib/gfx/model/shader.cpp
+++ b/aurora/lib/gfx/model/shader.cpp
@ -13,82 +13,70 @@ static const std::vector<zeus::CVector3f>* vtxData;
 static const std::vector<zeus::CVector3f>* nrmData;
 static const std::vector<Vec2<float>>* tex0TcData;
 static const std::vector<Vec2<float>>* tcData;
-static std::optional<Range> staticVtxRange;
-static std::optional<Range> staticNrmRange;
-static std::optional<Range> staticPackedTcRange;
-static std::optional<Range> staticTcRange;
+static std::optional<Range> cachedVtxRange;
+static std::optional<Range> cachedNrmRange;
+static std::optional<Range> cachedPackedTcRange;
+static std::optional<Range> cachedTcRange;

-static inline std::pair<gx::DlVert, size_t> readVert(const u8* data) noexcept {
-  gx::DlVert out{};
+// Expands one display-list vertex into `out`: every INDEX8/INDEX16 entry of
+// gx::g_gxState.vtxDesc is appended as a 16-bit index, then the vertex is
+// zero-padded to a multiple of 4 bytes (Sint16x2 granularity).
+// The padding is derived from the bytes written, not the bytes consumed, so
+// INDEX8 attributes (1 byte in, 2 bytes out) still leave the output vertex
+// at the size queue_surface computes as ALIGN(outVtxSize, 4).
+static inline void read_vert(ByteBuffer& out, const u8* data) noexcept {
  size_t offset = 0;
-  const auto vtxTypes = gx::g_gxState.vtxDesc;
-  const auto read8 = [/*data, &offset*/](GX::AttrType type) -> s8 {
-//    if (type == GX::INDEX8) {
-//      s8 v = static_cast<s8>(data[offset]);
-//      ++offset;
-//      return v;
-//    }
-#ifndef NDEBUG
-    if (type != GX::NONE) {
-      Log.report(logvisor::Fatal, FMT_STRING("unsupported vtx attr"));
-      unreachable();
-    }
-#endif
-    return 0;
-  };
-  const auto read16 = [data, &offset](GX::AttrType type) -> s16 {
-    if (type == GX::INDEX16) {
-      s16 v = metaforce::SBig(*reinterpret_cast<const u16*>(data + offset));
+  size_t written = 0; // bytes appended to `out` for this vertex
+  for (const auto& type : gx::g_gxState.vtxDesc) {
+    if (type == GX::INDEX8) {
+      const auto v = static_cast<s16>(data[offset]); // expand to s16
+      out.append(&v, 2);
+      ++offset;
+      written += 2;
+    } else if (type == GX::INDEX16) {
+      const s16 v = metaforce::SBig(*reinterpret_cast<const s16*>(data + offset));
+      out.append(&v, 2);
      offset += 2;
-      return v;
+      written += 2;
    }
-    return 0;
-  };
-  read8(vtxTypes[GX::VA_PNMTXIDX]);
-  read8(vtxTypes[GX::VA_TEX0MTXIDX]);
-  read8(vtxTypes[GX::VA_TEX1MTXIDX]);
-  read8(vtxTypes[GX::VA_TEX2MTXIDX]);
-  read8(vtxTypes[GX::VA_TEX3MTXIDX]);
-  read8(vtxTypes[GX::VA_TEX4MTXIDX]);
-  read8(vtxTypes[GX::VA_TEX5MTXIDX]);
-  read8(vtxTypes[GX::VA_TEX6MTXIDX]);
-
-  out.pos = read16(vtxTypes[GX::VA_POS]);
-  out.norm = read16(vtxTypes[GX::VA_NRM]);
-  read16(vtxTypes[GX::VA_CLR0]);
-  read16(vtxTypes[GX::VA_CLR1]);
-  out.uvs[0] = read16(vtxTypes[GX::VA_TEX0]);
-  out.uvs[1] = read16(vtxTypes[GX::VA_TEX1]);
-  out.uvs[2] = read16(vtxTypes[GX::VA_TEX2]);
-  out.uvs[3] = read16(vtxTypes[GX::VA_TEX3]);
-  out.uvs[4] = read16(vtxTypes[GX::VA_TEX4]);
-  out.uvs[5] = read16(vtxTypes[GX::VA_TEX5]);
-  out.uvs[6] = read16(vtxTypes[GX::VA_TEX6]);
-
-  return {out, offset};
+  }
+  constexpr size_t align = 4; // Sint16x2
+  if (written % align != 0) {
+    out.append_zeroes(align - (written % align));
+  }
 }

-static absl::flat_hash_map<XXH64_hash_t, std::pair<std::vector<gx::DlVert>, std::vector<u32>>> sCachedDisplayLists;
+static absl::flat_hash_map<XXH64_hash_t, std::pair<ByteBuffer, ByteBuffer>> sCachedDisplayLists;

 void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
  const auto hash = xxh3_hash(dlStart, dlSize, 0);
  Range vertRange, idxRange;
-  uint32_t numIndices;
+  u32 numIndices = 0;
  auto it = sCachedDisplayLists.find(hash);
  if (it != sCachedDisplayLists.end()) {
    const auto& [verts, indices] = it->second;
-    numIndices = indices.size();
-    vertRange = push_verts(ArrayRef{verts});
-    idxRange = push_indices(ArrayRef{indices});
+    numIndices = indices.size() / 2;
+    vertRange = push_verts(verts.data(), verts.size());
+    idxRange = push_indices(indices.data(), indices.size());
  } else {
-    std::vector<gx::DlVert> verts;
-    std::vector<u32> indices;
+    ByteBuffer vtxBuf;
+    ByteBuffer idxBuf;
+    u8 inVtxSize = 0;
+    u8 outVtxSize = 0;
+    for (const auto& type : gx::g_gxState.vtxDesc) {
+      if (type == GX::NONE || type == GX::DIRECT) {
+        continue;
+      }
+      if (type == GX::INDEX8) {
+        ++inVtxSize;
+        outVtxSize += 2;
+      } else if (type == GX::INDEX16) {
+        inVtxSize += 2;
+        outVtxSize += 2;
+      } else {
+        Log.report(logvisor::Fatal, FMT_STRING("unexpected vtx type {}"), type);
+        unreachable();
+      }
+    }
+    outVtxSize = ALIGN(outVtxSize, 4);

+    u16 vtxStart = 0;
    size_t offset = 0;
    while (offset < dlSize - 6) {
      const auto header = dlStart[offset];
      const auto primitive = static_cast<GX::Primitive>(header & 0xF8);
-      const auto vtxCount = metaforce::SBig(*reinterpret_cast<const u16*>(dlStart + offset + 1));
+      const auto dlVtxCount = metaforce::SBig(*reinterpret_cast<const u16*>(dlStart + offset + 1));
      offset += 3;

      if (primitive == 0) {
@ -99,66 +87,81 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
        unreachable();
      }

-      const u32 idxStart = indices.size();
-      const u16 vertsStart = verts.size();
-      verts.reserve(vertsStart + vtxCount);
-      if (vtxCount > 3 && (primitive == GX::TRIANGLEFAN || primitive == GX::TRIANGLESTRIP)) {
-        indices.reserve(idxStart + (u32(vtxCount) - 3) * 3 + 3);
+      vtxBuf.reserve_extra(dlVtxCount * outVtxSize);
+      if (dlVtxCount > 3 && (primitive == GX::TRIANGLEFAN || primitive == GX::TRIANGLESTRIP)) {
+        idxBuf.reserve_extra(((u32(dlVtxCount) - 3) * 3 + 3) * 2);
      } else {
-        indices.reserve(idxStart + vtxCount);
+        idxBuf.reserve_extra(dlVtxCount * 2);
      }
-      auto curVert = vertsStart;
-      for (int v = 0; v < vtxCount; ++v) {
-        const auto [vert, read] = readVert(dlStart + offset);
-        verts.push_back(vert);
-        offset += read;
+      u16 curVert = vtxStart;
+      for (u16 v = 0; v < dlVtxCount; ++v) {
+        read_vert(vtxBuf, dlStart + offset);
+        offset += inVtxSize;
        if (primitive == GX::TRIANGLES || v < 3) {
-          // pass
+          idxBuf.append(&curVert, 2);
+          ++numIndices;
        } else if (primitive == GX::TRIANGLEFAN) {
-          indices.push_back(vertsStart);
-          indices.push_back(curVert - 1);
+          const std::array<u16, 3> idxs{
+              vtxStart,
+              u16(curVert - 1),
+              curVert,
+          };
+          idxBuf.append(idxs.data(), 6);
+          numIndices += 3;
        } else if (primitive == GX::TRIANGLESTRIP) {
          if ((v & 1) == 0) {
-            indices.push_back(curVert - 2);
-            indices.push_back(curVert - 1);
+            const std::array<u16, 3> idxs{
+                u16(curVert - 2),
+                u16(curVert - 1),
+                curVert,
+            };
+            idxBuf.append(idxs.data(), 6);
          } else {
-            indices.push_back(curVert - 1);
-            indices.push_back(curVert - 2);
+            const std::array<u16, 3> idxs{
+                u16(curVert - 1),
+                u16(curVert - 2),
+                curVert,
+            };
+            idxBuf.append(idxs.data(), 6);
          }
+          numIndices += 3;
        }
-        indices.push_back(curVert);
        ++curVert;
      }
+      vtxStart += dlVtxCount;
    }

-    numIndices = indices.size();
-    vertRange = push_verts(ArrayRef{verts});
-    idxRange = push_indices(ArrayRef{indices});
-    sCachedDisplayLists.try_emplace(hash, std::move(verts), std::move(indices));
+    vertRange = push_verts(vtxBuf.data(), vtxBuf.size());
+    idxRange = push_indices(idxBuf.data(), idxBuf.size());
+    sCachedDisplayLists.try_emplace(hash, std::move(vtxBuf), std::move(idxBuf));
  }

  Range sVtxRange, sNrmRange, sTcRange, sPackedTcRange;
-  if (staticVtxRange) {
-    sVtxRange = *staticVtxRange;
+  if (cachedVtxRange) {
+    sVtxRange = *cachedVtxRange;
  } else {
    sVtxRange = push_storage(reinterpret_cast<const uint8_t*>(vtxData->data()), vtxData->size() * 16);
+    cachedVtxRange = sVtxRange;
  }
-  if (staticNrmRange) {
-    sNrmRange = *staticNrmRange;
+  if (cachedNrmRange) {
+    sNrmRange = *cachedNrmRange;
  } else {
    sNrmRange = push_storage(reinterpret_cast<const uint8_t*>(nrmData->data()), nrmData->size() * 16);
+    cachedNrmRange = sNrmRange;
  }
-  if (staticTcRange) {
-    sTcRange = *staticTcRange;
+  if (cachedTcRange) {
+    sTcRange = *cachedTcRange;
  } else {
    sTcRange = push_storage(reinterpret_cast<const uint8_t*>(tcData->data()), tcData->size() * 8);
+    cachedTcRange = sTcRange;
  }
-  if (staticPackedTcRange) {
-    sPackedTcRange = *staticPackedTcRange;
+  if (cachedPackedTcRange) {
+    sPackedTcRange = *cachedPackedTcRange;
  } else if (tcData == tex0TcData) {
    sPackedTcRange = sTcRange;
  } else {
    sPackedTcRange = push_storage(reinterpret_cast<const uint8_t*>(tex0TcData->data()), tex0TcData->size() * 8);
+    cachedPackedTcRange = sPackedTcRange;
  }

  model::PipelineConfig config{};
@ -188,11 +191,40 @@ State construct_state() { return {}; }
 wgpu::RenderPipeline create_pipeline(const State& state, [[maybe_unused]] PipelineConfig config) {
  const auto [shader, info] = build_shader(config.shaderConfig);

-  const auto attributes = gpu::utils::make_vertex_attributes(
-      std::array{wgpu::VertexFormat::Sint16x2, wgpu::VertexFormat::Sint16x4, wgpu::VertexFormat::Sint16x4});
-  const std::array vertexBuffers{gpu::utils::make_vertex_buffer_layout(sizeof(gx::DlVert), attributes)};
+  std::array<wgpu::VertexAttribute, gx::MaxVtxAttr> vtxAttrs;
+  auto [num4xAttr, rem] = std::div(config.shaderConfig.indexedAttributeCount, 4); // split the indexed-attribute count into groups of four
+  u32 num2xAttr = 0;
+  if (rem > 2) { // three leftover indices are rounded up to one more x4 attribute
+    ++num4xAttr;
+  } else if (rem > 0) { // one or two leftover indices share a single x2 attribute
+    ++num2xAttr;
+  }
+  u32 offset = 0;
+  for (u32 i = 0; i < num4xAttr; ++i) {
+    vtxAttrs[i] = {
+        .format = wgpu::VertexFormat::Sint16x4,
+        .offset = offset,
+        .shaderLocation = i,
+    };
+    offset += 8; // four 16-bit indices
+  }
+  for (u32 i = 0; i < num2xAttr; ++i) {
+    const u32 idx = num4xAttr + i; // x2 attributes are placed after all x4 attributes
+    vtxAttrs[idx] = {
+        .format = wgpu::VertexFormat::Sint16x2,
+        .offset = offset,
+        .shaderLocation = idx,
+    };
+    offset += 4; // two 16-bit indices
+  }
+  const std::array vtxBuffers{wgpu::VertexBufferLayout{
+      .arrayStride = offset, // sum of the attribute sizes accumulated above
+      .stepMode = wgpu::VertexStepMode::Vertex,
+      .attributeCount = num4xAttr + num2xAttr,
+      .attributes = vtxAttrs.data(),
+  }};

-  return build_pipeline(config, info, vertexBuffers, shader, "Model Pipeline");
+  return build_pipeline(config, info, vtxBuffers, shader, "Model Pipeline");
 }

 void render(const State& state, const DrawData& data, const wgpu::RenderPassEncoder& pass) {
@ -204,8 +236,8 @@ void render(const State& state, const DrawData& data, const wgpu::RenderPassEnco
      data.uniformRange.offset,
      storage_offset(data.dataRanges.vtxDataRange),
      storage_offset(data.dataRanges.nrmDataRange),
-      storage_offset(data.dataRanges.tcDataRange),
      storage_offset(data.dataRanges.packedTcDataRange),
+      storage_offset(data.dataRanges.tcDataRange),
  };
  pass.SetBindGroup(0, find_bind_group(data.bindGroups.uniformBindGroup), offsets.size(), offsets.data());
  if (data.bindGroups.samplerBindGroup && data.bindGroups.textureBindGroup) {
@ -213,7 +245,7 @@ void render(const State& state, const DrawData& data, const wgpu::RenderPassEnco
    pass.SetBindGroup(2, find_bind_group(data.bindGroups.textureBindGroup));
  }
  pass.SetVertexBuffer(0, g_vertexBuffer, data.vertRange.offset, data.vertRange.size);
-  pass.SetIndexBuffer(g_indexBuffer, wgpu::IndexFormat::Uint32, data.idxRange.offset, data.idxRange.size);
+  pass.SetIndexBuffer(g_indexBuffer, wgpu::IndexFormat::Uint16, data.idxRange.offset, data.idxRange.size);
  if (data.dstAlpha) {
    const wgpu::Color color{0.f, 0.f, 0.f, *data.dstAlpha};
    pass.SetBlendConstant(&color);
@ -227,35 +259,23 @@ template <typename Vec>
 static inline void cache_array(const void* data, Vec*& outPtr, std::optional<aurora::gfx::Range>& outRange, u8 stride) {
  Vec* vecPtr = static_cast<Vec*>(data);
  outPtr = vecPtr;
-  if (stride == 1) {
-//    const auto hash = aurora::xxh3_hash(vecPtr->data(), vecPtr->size() * sizeof(typename Vec::value_type), 0);
-//    const auto it = sCachedRanges.find(hash);
-//    if (it != sCachedRanges.end()) {
-//      outRange = it->second;
-//    } else {
-//      const auto range = aurora::gfx::push_static_storage(aurora::ArrayRef{*vecPtr});
-//      sCachedRanges.try_emplace(hash, range);
-//      outRange = range;
-//    }
-  } else {
-    outRange.reset();
-  }
+  outRange.reset(); // clear the cached range so queue_surface pushes this array to storage again; stride is no longer consulted
 }

 void GXSetArray(GX::Attr attr, const void* data, u8 stride) noexcept {
  using namespace aurora::gfx::model;
  switch (attr) {
  case GX::VA_POS:
-    cache_array(data, vtxData, staticVtxRange, stride);
+    cache_array(data, vtxData, cachedVtxRange, stride);
    break;
  case GX::VA_NRM:
-    cache_array(data, nrmData, staticNrmRange, stride);
+    cache_array(data, nrmData, cachedNrmRange, stride);
    break;
  case GX::VA_TEX0:
-    cache_array(data, tex0TcData, staticPackedTcRange, stride);
+    cache_array(data, tex0TcData, cachedPackedTcRange, stride);
    break;
  case GX::VA_TEX1:
-    cache_array(data, tcData, staticTcRange, stride);
+    cache_array(data, tcData, cachedTcRange, stride);
    break;
  default:
    Log.report(logvisor::Fatal, FMT_STRING("GXSetArray: invalid attr {}"), attr);
--- a/aurora/lib/gfx/stream.cpp
+++ b/aurora/lib/gfx/stream.cpp
@ -8,6 +8,19 @@ static logvisor::Module Log("aurora::gfx::stream");

 using aurora::gfx::gx::g_gxState;

+#ifndef NDEBUG
+static inline GX::Attr next_attr(size_t begin) { // first g_gxState.vtxDesc slot at or after `begin` whose type is not GX::NONE
+  auto iter = std::find_if(g_gxState.vtxDesc.begin() + begin, g_gxState.vtxDesc.end(),
+                           [](const auto type) { return type != GX::NONE; });
+  if (begin > 0 && iter == g_gxState.vtxDesc.end()) {
+    // wrap around
+    iter = std::find_if(g_gxState.vtxDesc.begin(), g_gxState.vtxDesc.end(),
+                        [](const auto type) { return type != GX::NONE; });
+  }
+  return GX::Attr(iter - g_gxState.vtxDesc.begin()); // equals vtxDesc.size() when every slot is GX::NONE
+}
+#endif
+
 struct SStreamState {
  GX::Primitive primitive;
  u16 vertexCount = 0;
@ -27,8 +40,7 @@ struct SStreamState {
      indices.reserve(numVerts);
    }
 #ifndef NDEBUG
-    nextAttr =
-        GX::Attr(std::find(g_gxState.vtxDesc.begin(), g_gxState.vtxDesc.end(), GX::DIRECT) - g_gxState.vtxDesc.begin());
+    nextAttr = next_attr(0);
 #endif
  }
 };
@ -54,9 +66,8 @@ void GXBegin(GX::Primitive primitive, GX::VtxFmt vtxFmt, u16 nVerts) noexcept {
        Log.report(logvisor::Fatal, FMT_STRING("don't know how to handle attr {}"), attr);
        unreachable();
      }
-    } else if (type != GX::NONE) {
-      Log.report(logvisor::Fatal, FMT_STRING("invalid vtx type {} for attr {}"), type, attr);
-      unreachable();
+    } else if (type == GX::INDEX8 || type == GX::INDEX16) {
+      vertexSize += 2;
    }
    attr = GX::Attr(attr + 1);
  }
@ -76,11 +87,7 @@ static inline void check_attr_order(GX::Attr attr) noexcept {
    Log.report(logvisor::Fatal, FMT_STRING("bad attribute order: {}, expected {}"), attr, sStreamState->nextAttr);
    unreachable();
  }
-  auto nextAttr = std::find(g_gxState.vtxDesc.begin() + attr + 1, g_gxState.vtxDesc.end(), GX::DIRECT);
-  if (nextAttr == g_gxState.vtxDesc.end()) {
-    nextAttr = std::find(g_gxState.vtxDesc.begin(), g_gxState.vtxDesc.end(), GX::DIRECT);
-  }
-  sStreamState->nextAttr = GX::Attr(nextAttr - g_gxState.vtxDesc.begin());
+  sStreamState->nextAttr = next_attr(attr + 1);
 #endif
 }
 void GXPosition3f32(const zeus::CVector3f& pos) noexcept {