Rewrite attribute buffer, matrix & stream handling

Array attributes (GXSetArray) are now fetched
correctly based on the vertex format. Buffers are
still assumed to be byte-swapped to little-endian.
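
As a rough sketch of what format-based fetching
means (GxArray and fetch_indexed are illustrative
names, not Aurora's actual API): an attribute
marked GX_INDEX8/GX_INDEX16 stores an index per
vertex, and the element it points at is decoded
using the stride and component layout from the
vertex format rather than a hard-coded one:

#include <cstdint>

using u8 = std::uint8_t;
using u16 = std::uint16_t;
using u32 = std::uint32_t;

// Hypothetical sketch only -- not the actual implementation.
struct GxArray {
  const u8* base; // base pointer registered via GXSetArray
  u32 stride;     // element stride, also from GXSetArray
};

// Resolve one indexed attribute to its element in the array. The component
// count/type/frac used to decode the element come from the GXVtxFmt
// attribute descriptor; the buffer itself is assumed to already be
// byte-swapped to little-endian, per the note above.
inline const u8* fetch_indexed(const GxArray& arr, u16 index) {
  return arr.base + static_cast<u32>(index) * arr.stride;
}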

Stream handling has been completely reworked,
resolving many issues (see the index-expansion
sketch below).
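
For example, the GX_TRIANGLEFAN path in
prepare_idx_buffer (in the diff below) expands
each fan into a plain triangle list; a minimal
standalone version of that expansion:

#include <cstdint>
#include <vector>

using u16 = std::uint16_t;

// Standalone version of the GX_TRIANGLEFAN expansion from
// prepare_idx_buffer: the first three vertices form the initial triangle,
// and every later vertex emits one triangle anchored at the fan center
// (vtxStart).
std::vector<u16> expand_fan(u16 vtxStart, u16 vtxCount) {
  std::vector<u16> out;
  for (u16 v = 0; v < vtxCount; ++v) {
    const u16 idx = vtxStart + v;
    if (v < 3) {
      out.push_back(idx);
      continue;
    }
    out.insert(out.end(), {vtxStart, static_cast<u16>(idx - 1), idx});
  }
  return out;
}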

Matrix transposes are eliminated, so
AURORA_NATIVE_MATRIX is no longer necessary and
has been removed.
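
Illustrative sketch (the helper name is
hypothetical): GX matrices (Mtx) are 3x4
row-major, and the shaders now consume that
layout directly, so staging a matrix is a
straight copy instead of a per-element transpose:

#include <cstring>

using Mtx = float[3][4]; // GX matrix layout: 3x4, row-major

// Hypothetical staging helper: with the shaders consuming row-major
// matrices natively, the upload is a plain copy; the old per-element
// transpose (and the AURORA_NATIVE_MATRIX toggle that guarded it) is gone.
void stage_mtx(const Mtx& m, float* dst) {
  std::memcpy(dst, m, sizeof(Mtx));
}
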
2025-04-14 17:16:13 -06:00
parent 3316ad9a7f
commit a600b0b84c
21 changed files with 1215 additions and 901 deletions


@@ -1,60 +1,29 @@
#include "shader.hpp"
#include "../../webgpu/gpu.hpp"
#include "../gx_fmt.hpp"
#include <absl/container/flat_hash_map.h>
namespace aurora::gfx::model {
static Module Log("aurora::gfx::model");
-template <typename T>
-constexpr T bswap16(T val) noexcept {
-static_assert(sizeof(T) == sizeof(u16));
-union {
-u16 u;
-T t;
-} v{.t = val};
-#if __GNUC__
-v.u = __builtin_bswap16(v.u);
-#elif _WIN32
-v.u = _byteswap_ushort(v.u);
-#else
-v.u = (v.u << 8) | ((v.u >> 8) & 0xFF);
-#endif
-return v.t;
-}
-template <typename T>
-constexpr T bswap32(T val) noexcept {
-static_assert(sizeof(T) == sizeof(u32));
-union {
-u32 u;
-T t;
-} v{.t = val};
-#if __GNUC__
-v.u = __builtin_bswap32(v.u);
-#elif _WIN32
-v.u = _byteswap_ulong(v.u);
-#else
-v.u = ((v.u & 0x0000FFFF) << 16) | ((v.u & 0xFFFF0000) >> 16) | ((v.u & 0x00FF00FF) << 8) | ((v.u & 0xFF00FF00) >> 8);
-#endif
-return v.t;
-}
using IndexedAttrs = std::array<bool, GX_VA_MAX_ATTR>;
struct DisplayListCache {
ByteBuffer vtxBuf;
ByteBuffer idxBuf;
IndexedAttrs indexedAttrs;
+GXVtxFmt fmt;
-DisplayListCache(ByteBuffer&& vtxBuf, ByteBuffer&& idxBuf, IndexedAttrs indexedAttrs)
-: vtxBuf(std::move(vtxBuf)), idxBuf(std::move(idxBuf)), indexedAttrs(indexedAttrs) {}
+DisplayListCache(ByteBuffer&& vtxBuf, ByteBuffer&& idxBuf, IndexedAttrs indexedAttrs, GXVtxFmt fmt)
+: vtxBuf(std::move(vtxBuf)), idxBuf(std::move(idxBuf)), indexedAttrs(indexedAttrs), fmt(fmt) {}
};
static absl::flat_hash_map<HashType, DisplayListCache> sCachedDisplayLists;
static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u16 vtxCount,
IndexedAttrs& indexedAttrs) {
-using aurora::gfx::gx::g_gxState;
+using gx::g_gxState;
struct {
u8 count;
GXCompType type;
@@ -66,14 +35,13 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u
for (int attr = 0; attr < GX_VA_MAX_ATTR; attr++) {
const auto& attrFmt = g_gxState.vtxFmts[vtxfmt].attrs[attr];
switch (g_gxState.vtxDesc[attr]) {
DEFAULT_FATAL("unhandled attribute type {}", static_cast<int>(g_gxState.vtxDesc[attr]));
DEFAULT_FATAL("unhandled attribute type {}", g_gxState.vtxDesc[attr]);
case GX_NONE:
break;
case GX_DIRECT:
#define COMBINE(val1, val2, val3) (((val1) << 16) | ((val2) << 8) | (val3))
switch (COMBINE(attr, attrFmt.cnt, attrFmt.type)) {
DEFAULT_FATAL("not handled: attr {}, cnt {}, type {}", static_cast<int>(attr), static_cast<int>(attrFmt.cnt),
static_cast<int>(attrFmt.type));
DEFAULT_FATAL("not handled: attr {}, cnt {}, type {}", attr, attrFmt.cnt, attrFmt.type);
case COMBINE(GX_VA_POS, GX_POS_XYZ, GX_F32):
case COMBINE(GX_VA_NRM, GX_NRM_XYZ, GX_F32):
attrArrays[attr].count = 3;
@@ -150,12 +118,10 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u
for (u32 v = 0; v < vtxCount; ++v) {
for (int attr = 0; attr < GX_VA_MAX_ATTR; attr++) {
if (g_gxState.vtxDesc[attr] == GX_INDEX8) {
-u16 index = *ptr;
-buf.append(&index, 2);
+buf.append(static_cast<u16>(*ptr));
++ptr;
} else if (g_gxState.vtxDesc[attr] == GX_INDEX16) {
-u16 index = bswap16(*reinterpret_cast<const u16*>(ptr));
-buf.append(&index, 2);
+buf.append(bswap(*reinterpret_cast<const u16*>(ptr)));
ptr += 2;
}
if (g_gxState.vtxDesc[attr] != GX_DIRECT) {
@@ -182,7 +148,7 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u
break;
case GX_U16:
for (int i = 0; i < count; ++i) {
-const auto value = bswap16(reinterpret_cast<const u16*>(ptr)[i]);
+const auto value = bswap(reinterpret_cast<const u16*>(ptr)[i]);
out[i] = static_cast<f32>(value) / static_cast<f32>(1 << attrFmt.frac);
}
buf.append(out.data(), sizeof(f32) * count);
@@ -190,7 +156,7 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u
break;
case GX_S16:
for (int i = 0; i < count; ++i) {
-const auto value = bswap16(reinterpret_cast<const s16*>(ptr)[i]);
+const auto value = bswap(reinterpret_cast<const s16*>(ptr)[i]);
out[i] = static_cast<f32>(value) / static_cast<f32>(1 << attrFmt.frac);
}
buf.append(out.data(), sizeof(f32) * count);
@@ -198,7 +164,7 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u
break;
case GX_F32:
for (int i = 0; i < count; ++i) {
-out[i] = bswap32(reinterpret_cast<const f32*>(ptr)[i]);
+out[i] = bswap(reinterpret_cast<const f32*>(ptr)[i]);
}
buf.append(out.data(), sizeof(f32) * count);
ptr += count * sizeof(f32);
@@ -227,7 +193,7 @@ static u16 prepare_idx_buffer(ByteBuffer& buf, GXPrimitive prim, u16 vtxStart, u
buf.reserve_extra(vtxCount * sizeof(u16));
for (u16 v = 0; v < vtxCount; ++v) {
const u16 idx = vtxStart + v;
-buf.append(&idx, sizeof(u16));
+buf.append(idx);
++numIndices;
}
} else if (prim == GX_TRIANGLEFAN) {
@@ -235,29 +201,26 @@ static u16 prepare_idx_buffer(ByteBuffer& buf, GXPrimitive prim, u16 vtxStart, u
for (u16 v = 0; v < vtxCount; ++v) {
const u16 idx = vtxStart + v;
if (v < 3) {
-buf.append(&idx, sizeof(u16));
+buf.append(idx);
++numIndices;
continue;
}
-const std::array<u16, 3> idxs{vtxStart, u16(idx - 1), idx};
-buf.append(idxs.data(), sizeof(u16) * 3);
+buf.append(std::array{vtxStart, static_cast<u16>(idx - 1), idx});
numIndices += 3;
}
} else if (prim == GX_TRIANGLESTRIP) {
-buf.reserve_extra(((u32(vtxCount) - 3) * 3 + 3) * sizeof(u16));
+buf.reserve_extra(((static_cast<u32>(vtxCount) - 3) * 3 + 3) * sizeof(u16));
for (u16 v = 0; v < vtxCount; ++v) {
const u16 idx = vtxStart + v;
if (v < 3) {
-buf.append(&idx, sizeof(u16));
+buf.append(idx);
++numIndices;
continue;
}
if ((v & 1) == 0) {
-const std::array<u16, 3> idxs{u16(idx - 2), u16(idx - 1), idx};
-buf.append(idxs.data(), sizeof(u16) * 3);
+buf.append(std::array{static_cast<u16>(idx - 2), static_cast<u16>(idx - 1), idx});
} else {
-const std::array<u16, 3> idxs{u16(idx - 1), u16(idx - 2), idx};
-buf.append(idxs.data(), sizeof(u16) * 3);
+buf.append(std::array{static_cast<u16>(idx - 1), static_cast<u16>(idx - 2), idx});
}
numIndices += 3;
}
@@ -271,6 +234,7 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
Range vertRange, idxRange;
u32 numIndices = 0;
IndexedAttrs indexedAttrs{};
+GXVtxFmt fmt = GX_MAX_VTXFMT;
auto it = sCachedDisplayLists.find(hash);
if (it != sCachedDisplayLists.end()) {
const auto& cache = it->second;
@@ -278,6 +242,7 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
vertRange = push_verts(cache.vtxBuf.data(), cache.vtxBuf.size());
idxRange = push_indices(cache.idxBuf.data(), cache.idxBuf.size());
indexedAttrs = cache.indexedAttrs;
+fmt = cache.fmt;
} else {
const u8* data = dlStart;
u32 pos = 0;
@@ -302,8 +267,12 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
case GX_DRAW_TRIANGLE_STRIP:
case GX_DRAW_TRIANGLE_FAN: {
const auto prim = static_cast<GXPrimitive>(opcode);
-const auto fmt = static_cast<GXVtxFmt>(cmd & GX_VAT_MASK);
-u16 vtxCount = bswap16(*reinterpret_cast<const u16*>(data + pos));
+const auto newFmt = static_cast<GXVtxFmt>(cmd & GX_VAT_MASK);
+if (fmt != GX_MAX_VTXFMT && fmt != newFmt) {
+FATAL("Vertex format changed mid-display list: {} -> {}", fmt, newFmt);
+}
+fmt = newFmt;
+u16 vtxCount = bswap(*reinterpret_cast<const u16*>(data + pos));
pos += 2;
pos += vtxCount * prepare_vtx_buffer(vtxBuf, fmt, data + pos, vtxCount, indexedAttrs);
numIndices += prepare_idx_buffer(idxBuf, prim, vtxStart, vtxCount);
@@ -319,22 +288,16 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
}
vertRange = push_verts(vtxBuf.data(), vtxBuf.size());
idxRange = push_indices(idxBuf.data(), idxBuf.size());
-sCachedDisplayLists.try_emplace(hash, std::move(vtxBuf), std::move(idxBuf), indexedAttrs);
+sCachedDisplayLists.try_emplace(hash, std::move(vtxBuf), std::move(idxBuf), indexedAttrs, fmt);
}
gx::BindGroupRanges ranges{};
-int lastIndexedAttr = -1;
for (int i = 0; i < GX_VA_MAX_ATTR; ++i) {
if (!indexedAttrs[i]) {
continue;
}
auto& array = gx::g_gxState.arrays[i];
-if (lastIndexedAttr >= 0 && array == gx::g_gxState.arrays[lastIndexedAttr]) {
-// Reuse range from last attribute in shader
-// Don't set the output range, so it remains unbound
-const auto range = gx::g_gxState.arrays[lastIndexedAttr].cachedRange;
-array.cachedRange = range;
-} else if (array.cachedRange.size > 0) {
+if (array.cachedRange.size > 0) {
// Use the currently cached range
ranges.vaRanges[i] = array.cachedRange;
} else {
@@ -343,11 +306,10 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
ranges.vaRanges[i] = range;
array.cachedRange = range;
}
-lastIndexedAttr = i;
}
model::PipelineConfig config{};
-populate_pipeline_config(config, GX_TRIANGLES);
+populate_pipeline_config(config, GX_TRIANGLES, fmt);
const auto info = gx::build_shader_info(config.shaderConfig);
const auto bindGroups = gx::build_bind_groups(info, config.shaderConfig, ranges);
const auto pipeline = pipeline_ref(config);
@@ -366,7 +328,7 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
State construct_state() { return {}; }
-wgpu::RenderPipeline create_pipeline(const State& state, [[maybe_unused]] const PipelineConfig& config) {
+wgpu::RenderPipeline create_pipeline(const State& state, const PipelineConfig& config) {
const auto info = build_shader_info(config.shaderConfig); // TODO remove
const auto shader = build_shader(config.shaderConfig, info);
@@ -385,7 +347,7 @@ wgpu::RenderPipeline create_pipeline(const State& state, [[maybe_unused]] const
// Indexed attributes
for (u32 i = 0; i < num4xAttr; ++i) {
vtxAttrs[shaderLocation] = {
-.format = wgpu::VertexFormat::Sint16x4,
+.format = wgpu::VertexFormat::Uint16x4,
.offset = offset,
.shaderLocation = shaderLocation,
};
@@ -394,7 +356,7 @@ wgpu::RenderPipeline create_pipeline(const State& state, [[maybe_unused]] const
}
for (u32 i = 0; i < num2xAttr; ++i) {
vtxAttrs[shaderLocation] = {
-.format = wgpu::VertexFormat::Sint16x2,
+.format = wgpu::VertexFormat::Uint16x2,
.offset = offset,
.shaderLocation = shaderLocation,
};