aurora: WIP cache for display lists & static buffers

2025-12-15 10:46:10 +00:00 · 2022-03-15 02:18:45 -04:00
parent bbdad137af
commit 02a7b85b23
18 changed files with 263 additions and 215 deletions
--- a/Runtime/Graphics/CCubeModel.cpp
+++ b/Runtime/Graphics/CCubeModel.cpp
@@ -259,8 +259,8 @@ void CCubeModel::EnableShadowMaps(const CTexture& shadowTex, const zeus::CTransf
 void CCubeModel::DisableShadowMaps() { sRenderModelShadow = false; }

 void CCubeModel::SetArraysCurrent() {
-  CGX::SetArray(GX::VA_POS, x0_modelInstance.GetVertexPointer());
-  CGX::SetArray(GX::VA_NRM, x0_modelInstance.GetNormalPointer());
+  CGX::SetArray(GX::VA_POS, x0_modelInstance.GetVertexPointer(), true);
+  CGX::SetArray(GX::VA_NRM, x0_modelInstance.GetNormalPointer(), true);
  SetStaticArraysCurrent();
 }

@@ -280,8 +280,8 @@ void CCubeModel::SetRenderModelBlack(bool v) {
 }

 void CCubeModel::SetSkinningArraysCurrent(TConstVectorRef positions, TConstVectorRef normals) {
-  CGX::SetArray(GX::VA_POS, positions);
-  CGX::SetArray(GX::VA_NRM, normals);
+  CGX::SetArray(GX::VA_POS, positions, false);
+  CGX::SetArray(GX::VA_NRM, normals, false);
  // colors unused
  SetStaticArraysCurrent();
 }
@@ -294,21 +294,21 @@ void CCubeModel::SetStaticArraysCurrent() {
    sUsingPackedLightmaps = false;
  }
  if (sUsingPackedLightmaps) {
-    CGX::SetArray(GX::VA_TEX0, packedTexCoords);
+    CGX::SetArray(GX::VA_TEX0, packedTexCoords, true);
  } else {
-    CGX::SetArray(GX::VA_TEX0, texCoords);
+    CGX::SetArray(GX::VA_TEX0, texCoords, true);
  }
  // TexCoord1 is currently used for all remaining
-  CGX::SetArray(GX::VA_TEX1, texCoords);
+  CGX::SetArray(GX::VA_TEX1, texCoords, true);
  CCubeMaterial::KillCachedViewDepState();
 }

 void CCubeModel::SetUsingPackedLightmaps(bool v) {
  sUsingPackedLightmaps = v;
  if (v) {
-    CGX::SetArray(GX::VA_TEX0, x0_modelInstance.GetPackedTCPointer());
+    CGX::SetArray(GX::VA_TEX0, x0_modelInstance.GetPackedTCPointer(), true);
  } else {
-    CGX::SetArray(GX::VA_TEX0, x0_modelInstance.GetTCPointer());
+    CGX::SetArray(GX::VA_TEX0, x0_modelInstance.GetTCPointer(), true);
  }
 }

--- a/Runtime/Graphics/CGX.hpp
+++ b/Runtime/Graphics/CGX.hpp
@@ -123,9 +123,9 @@ static inline void SetAlphaCompare(GX::Compare comp0, u8 ref0, GX::AlphaOp op, G
 }

 template <typename T>
-static inline void SetArray(GX::Attr attr, const std::vector<T>* data) noexcept {
+static inline void SetArray(GX::Attr attr, const std::vector<T>* data, bool isStatic) noexcept {
  if (data != nullptr && sGXState.x0_arrayPtrs[attr - GX::VA_POS] != data) {
-    GXSetArray(attr, data, sizeof(T));
+    GXSetArray(attr, data, isStatic ? 1 : 0);
  }
 }

--- a/aurora/CMakeLists.txt
+++ b/aurora/CMakeLists.txt
@@ -19,11 +19,8 @@ add_library(aurora STATIC
 target_compile_definitions(aurora PRIVATE IMGUI_USER_CONFIG="imconfig_user.h") # IMGUI_USE_WCHAR32
 target_include_directories(aurora PUBLIC include ../)
 target_include_directories(aurora PRIVATE ../imgui ../extern/imgui)
-target_include_directories(aurora PRIVATE
-    ../extern/dawn/src
-    ../extern/dawn/third_party/abseil-cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/dawn/gen/src) # for hacks :)
-target_link_libraries(aurora PRIVATE dawn_native dawncpp webgpu_dawn zeus logvisor SDL2-static xxhash)
+target_link_libraries(aurora PRIVATE dawn_native dawncpp webgpu_dawn zeus logvisor SDL2-static xxhash
+    absl::btree absl::flat_hash_map)
 if (APPLE)
  target_compile_definitions(aurora PRIVATE DAWN_ENABLE_BACKEND_METAL)
  target_sources(aurora PRIVATE lib/dawn/MetalBinding.mm)
--- a/aurora/lib/gfx/colored_quad/shader.cpp
+++ b/aurora/lib/gfx/colored_quad/shader.cpp
@@ -302,9 +302,9 @@ void render(const State& state, const DrawData& data, const wgpu::RenderPassEnco
    return;
  }

-  const std::array offsets{data.uniformRange.first};
+  const std::array offsets{data.uniformRange.offset};
  pass.SetBindGroup(0, state.uniformBindGroup, offsets.size(), offsets.data());
-  pass.SetVertexBuffer(0, g_vertexBuffer, data.vertRange.first, data.vertRange.second);
+  pass.SetVertexBuffer(0, g_vertexBuffer, data.vertRange.offset, data.vertRange.size);
  pass.Draw(4);
 }
 } // namespace aurora::gfx::colored_quad
--- a/aurora/lib/gfx/common.cpp
+++ b/aurora/lib/gfx/common.cpp
@@ -11,7 +11,7 @@
 #include <deque>
 #include <logvisor/logvisor.hpp>
 #include <thread>
-#include <unordered_map>
+#include <absl/container/flat_hash_map.h>

 namespace aurora::gfx {
 static logvisor::Module Log("aurora::gfx");
@@ -187,10 +187,10 @@ std::mutex g_pipelineMutex;
 static std::thread g_pipelineThread;
 static std::atomic_bool g_pipelineThreadEnd;
 static std::condition_variable g_pipelineCv;
-static std::unordered_map<PipelineRef, wgpu::RenderPipeline> g_pipelines;
+static absl::flat_hash_map<PipelineRef, wgpu::RenderPipeline> g_pipelines;
 static std::deque<std::pair<PipelineRef, NewPipelineCallback>> g_queuedPipelines;
-static std::unordered_map<BindGroupRef, wgpu::BindGroup> g_cachedBindGroups;
-static std::unordered_map<SamplerRef, wgpu::Sampler> g_cachedSamplers;
+static absl::flat_hash_map<BindGroupRef, wgpu::BindGroup> g_cachedBindGroups;
+static absl::flat_hash_map<SamplerRef, wgpu::Sampler> g_cachedSamplers;
 std::atomic_uint32_t queuedPipelines;
 std::atomic_uint32_t createdPipelines;

@@ -198,10 +198,12 @@ static ByteBuffer g_verts;
 static ByteBuffer g_uniforms;
 static ByteBuffer g_indices;
 static ByteBuffer g_storage;
+static ByteBuffer g_staticStorage;
 wgpu::Buffer g_vertexBuffer;
 wgpu::Buffer g_uniformBuffer;
 wgpu::Buffer g_indexBuffer;
 wgpu::Buffer g_storageBuffer;
+size_t g_staticStorageLastSize = 0;

 static ShaderState g_state;
 static PipelineRef g_currentPipeline;
@@ -213,7 +215,7 @@ static PipelineRef find_pipeline(PipelineCreateCommand command, NewPipelineCallb
  bool found = false;
  {
    std::scoped_lock guard{g_pipelineMutex};
-    found = g_pipelines.find(hash) != g_pipelines.end();
+    found = g_pipelines.contains(hash);
    if (!found) {
      const auto ref =
          std::find_if(g_queuedPipelines.begin(), g_queuedPipelines.end(), [=](auto v) { return v.first == hash; });
@@ -364,11 +366,10 @@ static void pipeline_worker() {
    // std::this_thread::sleep_for(std::chrono::milliseconds{1500});
    {
      std::scoped_lock lock{g_pipelineMutex};
-      if (g_pipelines.contains(cb.first)) {
+      if (!g_pipelines.try_emplace(cb.first, std::move(result)).second) {
        Log.report(logvisor::Fatal, FMT_STRING("Duplicate pipeline {}"), cb.first);
        unreachable();
      }
-      g_pipelines[cb.first] = result;
      g_queuedPipelines.pop_front();
      hasMore = !g_queuedPipelines.empty();
    }
@@ -384,7 +385,7 @@ void initialize() {
    const wgpu::BufferDescriptor descriptor{
        .label = "Shared Uniform Buffer",
        .usage = wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst,
-        .size = 134217728, // 128mb
+        .size = 5242880, // 5mb
    };
    g_uniformBuffer = g_device.CreateBuffer(&descriptor);
  }
@@ -392,7 +393,7 @@ void initialize() {
    const wgpu::BufferDescriptor descriptor{
        .label = "Shared Vertex Buffer",
        .usage = wgpu::BufferUsage::Vertex | wgpu::BufferUsage::CopyDst,
-        .size = 16777216, // 16mb
+        .size = 5242880, // 5mb
    };
    g_vertexBuffer = g_device.CreateBuffer(&descriptor);
  }
@@ -400,7 +401,7 @@ void initialize() {
    const wgpu::BufferDescriptor descriptor{
        .label = "Shared Index Buffer",
        .usage = wgpu::BufferUsage::Index | wgpu::BufferUsage::CopyDst,
-        .size = 4194304, // 4mb
+        .size = 2097152, // 2mb
    };
    g_indexBuffer = g_device.CreateBuffer(&descriptor);
  }
@@ -439,18 +440,33 @@ void shutdown() {
 }

 void render(const wgpu::RenderPassEncoder& pass) {
-  const auto writeBuffer = [](ByteBuffer& buf, wgpu::Buffer& out) {
+  const auto writeBuffer = [](ByteBuffer& buf, wgpu::Buffer& out, std::string_view label) {
    const auto size = buf.size();
+    // Log.report(logvisor::Info, FMT_STRING("{} buffer usage: {}"), label, size);
    if (size > 0) {
      g_queue.WriteBuffer(out, 0, buf.data(), size);
      buf.clear();
      buf.reserve_extra(size); // Reserve size from previous frame
    }
  };
-  writeBuffer(g_verts, g_vertexBuffer);
-  writeBuffer(g_uniforms, g_uniformBuffer);
-  writeBuffer(g_indices, g_indexBuffer);
-  writeBuffer(g_storage, g_storageBuffer);
+  writeBuffer(g_verts, g_vertexBuffer, "Vertex");
+  writeBuffer(g_uniforms, g_uniformBuffer, "Uniform");
+  writeBuffer(g_indices, g_indexBuffer, "Index");
+  {
+    const auto staticSize = g_staticStorage.size();
+    if (staticSize > g_staticStorageLastSize) {
+      g_queue.WriteBuffer(g_storageBuffer, g_staticStorageLastSize, g_staticStorage.data() + g_staticStorageLastSize,
+                          staticSize - g_staticStorageLastSize);
+      g_staticStorageLastSize = staticSize;
+    }
+    const auto size = g_storage.size();
+    if (size > 0) {
+      g_queue.WriteBuffer(g_storageBuffer, staticSize, g_storage.data(), size);
+      g_storage.clear();
+      g_storage.reserve_extra(size); // Reserve size from previous frame
+    }
+    // Log.report(logvisor::Info, FMT_STRING("Static storage: {}, storage: {}"), staticSize, size);
+  }

  g_currentPipeline = UINT64_MAX;

@@ -498,10 +514,11 @@ bool bind_pipeline(PipelineRef ref, const wgpu::RenderPassEncoder& pass) {
    return true;
  }
  std::lock_guard guard{g_pipelineMutex};
-  if (!g_pipelines.contains(ref)) {
+  const auto it = g_pipelines.find(ref);
+  if (it == g_pipelines.end()) {
    return false;
  }
-  pass.SetPipeline(g_pipelines[ref]);
+  pass.SetPipeline(it->second);
  g_currentPipeline = ref;
  return true;
 }
@@ -522,7 +539,7 @@ static inline Range push(ByteBuffer& target, const uint8_t* data, size_t length,
      target.append_zeroes(padding);
    }
  }
-  return {begin, begin + length + padding};
+  return {static_cast<uint32_t>(begin), static_cast<uint32_t>(length + padding)};
 }
 static inline Range map(ByteBuffer& target, size_t length, size_t alignment) {
  size_t padding = 0;
@@ -534,7 +551,7 @@ static inline Range map(ByteBuffer& target, size_t length, size_t alignment) {
  }
  auto begin = target.size();
  target.append_zeroes(length + padding);
-  return {begin, begin + length + padding};
+  return {static_cast<uint32_t>(begin), static_cast<uint32_t>(length + padding)};
 }
 Range push_verts(const uint8_t* data, size_t length) { return push(g_verts, data, length, 0 /* TODO? */); }
 Range push_indices(const uint8_t* data, size_t length) { return push(g_indices, data, length, 0 /* TODO? */); }
@@ -548,48 +565,57 @@ Range push_storage(const uint8_t* data, size_t length) {
  g_device.GetLimits(&limits);
  return push(g_storage, data, length, limits.limits.minStorageBufferOffsetAlignment);
 }
+Range push_static_storage(const uint8_t* data, size_t length) {
+  wgpu::SupportedLimits limits;
+  g_device.GetLimits(&limits);
+  auto range = push(g_staticStorage, data, length, limits.limits.minStorageBufferOffsetAlignment);
+  range.isStatic = true;
+  return range;
+}
 std::pair<ByteBuffer, Range> map_verts(size_t length) {
  const auto range = map(g_verts, length, 0 /* TODO? */);
-  return {ByteBuffer{g_verts.data() + range.first, range.second - range.first}, range};
+  return {ByteBuffer{g_verts.data() + range.offset, range.size}, range};
 }
 std::pair<ByteBuffer, Range> map_indices(size_t length) {
  const auto range = map(g_indices, length, 0 /* TODO? */);
-  return {ByteBuffer{g_indices.data() + range.first, range.second - range.first}, range};
+  return {ByteBuffer{g_indices.data() + range.offset, range.size}, range};
 }
 std::pair<ByteBuffer, Range> map_uniform(size_t length) {
  wgpu::SupportedLimits limits;
  g_device.GetLimits(&limits);
  const auto range = map(g_uniforms, length, limits.limits.minUniformBufferOffsetAlignment);
-  return {ByteBuffer{g_uniforms.data() + range.first, range.second - range.first}, range};
+  return {ByteBuffer{g_uniforms.data() + range.offset, range.size}, range};
 }
 std::pair<ByteBuffer, Range> map_storage(size_t length) {
  wgpu::SupportedLimits limits;
  g_device.GetLimits(&limits);
  const auto range = map(g_storage, length, limits.limits.minStorageBufferOffsetAlignment);
-  return {ByteBuffer{g_storage.data() + range.first, range.second - range.first}, range};
+  return {ByteBuffer{g_storage.data() + range.offset, range.size}, range};
 }

 BindGroupRef bind_group_ref(const wgpu::BindGroupDescriptor& descriptor) {
  const auto id = xxh3_hash(descriptor);
  if (!g_cachedBindGroups.contains(id)) {
-    g_cachedBindGroups[id] = g_device.CreateBindGroup(&descriptor);
+    g_cachedBindGroups.try_emplace(id, g_device.CreateBindGroup(&descriptor));
  }
  return id;
 }
 const wgpu::BindGroup& find_bind_group(BindGroupRef id) {
-  if (!g_cachedBindGroups.contains(id)) {
+  const auto it = g_cachedBindGroups.find(id);
+  if (it == g_cachedBindGroups.end()) {
    Log.report(logvisor::Fatal, FMT_STRING("get_bind_group: failed to locate {}"), id);
    unreachable();
  }
-  return g_cachedBindGroups[id];
+  return it->second;
 }

 const wgpu::Sampler& sampler_ref(const wgpu::SamplerDescriptor& descriptor) {
  const auto id = xxh3_hash(descriptor);
-  if (!g_cachedSamplers.contains(id)) {
-    g_cachedSamplers[id] = g_device.CreateSampler(&descriptor);
+  auto it = g_cachedSamplers.find(id);
+  if (it == g_cachedSamplers.end()) {
+    it = g_cachedSamplers.try_emplace(id, g_device.CreateSampler(&descriptor)).first;
  }
-  return g_cachedSamplers[id];
+  return it->second;
 }

 uint32_t align_uniform(uint32_t value) {
--- a/aurora/lib/gfx/common.hpp
+++ b/aurora/lib/gfx/common.hpp
@@ -126,6 +126,7 @@ extern wgpu::Buffer g_vertexBuffer;
 extern wgpu::Buffer g_uniformBuffer;
 extern wgpu::Buffer g_indexBuffer;
 extern wgpu::Buffer g_storageBuffer;
+extern size_t g_staticStorageLastSize;

 struct TextureRef {
  wgpu::Texture texture;
@@ -149,7 +150,14 @@ using BindGroupRef = uint64_t;
 using PipelineRef = uint64_t;
 using SamplerRef = uint64_t;
 using ShaderRef = uint64_t;
-using Range = std::pair<uint32_t, uint32_t>;
+struct Range {
+  uint32_t offset;
+  uint32_t size;
+  bool isStatic;
+};
+static inline uint32_t storage_offset(Range range) {
+  return range.isStatic ? range.offset : range.offset + g_staticStorageLastSize;
+}

 enum class ShaderType {
  Aabb,
@@ -182,9 +190,22 @@ static inline Range push_uniform(const T& data) {
 }
 Range push_storage(const uint8_t* data, size_t length);
 template <typename T>
+static inline Range push_storage(ArrayRef<T> data) {
+  return push_storage(reinterpret_cast<const uint8_t*>(data.data()), data.size() * sizeof(T));
+}
+template <typename T>
 static inline Range push_storage(const T& data) {
  return push_storage(reinterpret_cast<const uint8_t*>(&data), sizeof(T));
 }
+Range push_static_storage(const uint8_t* data, size_t length);
+template <typename T>
+static inline Range push_static_storage(ArrayRef<T> data) {
+  return push_static_storage(reinterpret_cast<const uint8_t*>(data.data()), data.size() * sizeof(T));
+}
+template <typename T>
+static inline Range push_static_storage(const T& data) {
+  return push_static_storage(reinterpret_cast<const uint8_t*>(&data), sizeof(T));
+}
 std::pair<ByteBuffer, Range> map_verts(size_t length);
 std::pair<ByteBuffer, Range> map_indices(size_t length);
 std::pair<ByteBuffer, Range> map_uniform(size_t length);
--- a/aurora/lib/gfx/gx.cpp
+++ b/aurora/lib/gfx/gx.cpp
@@ -3,7 +3,7 @@
 #include "../gpu.hpp"
 #include "common.hpp"

-#include <unordered_map>
+#include <absl/container/flat_hash_map.h>

 using aurora::gfx::gx::g_gxState;
 static logvisor::Module Log("aurora::gx");
@@ -537,8 +537,8 @@ Range build_uniform(const ShaderInfo& info) noexcept {
  return range;
 }

-static std::unordered_map<u32, wgpu::BindGroupLayout> sUniformBindGroupLayouts;
-static std::unordered_map<u32, std::pair<wgpu::BindGroupLayout, wgpu::BindGroupLayout>> sTextureBindGroupLayouts;
+static absl::flat_hash_map<u32, wgpu::BindGroupLayout> sUniformBindGroupLayouts;
+static absl::flat_hash_map<u32, std::pair<wgpu::BindGroupLayout, wgpu::BindGroupLayout>> sTextureBindGroupLayouts;

 GXBindGroups build_bind_groups(const ShaderInfo& info, const ShaderConfig& config,
                               const BindGroupRanges& ranges) noexcept {
@@ -555,25 +555,25 @@ GXBindGroups build_bind_groups(const ShaderInfo& info, const ShaderConfig& confi
      wgpu::BindGroupEntry{
          .binding = 1,
          .buffer = g_storageBuffer,
-          .size = ranges.vtxDataRange.second - ranges.vtxDataRange.first,
+          .size = ranges.vtxDataRange.size,
      },
      // Normals
      wgpu::BindGroupEntry{
          .binding = 2,
          .buffer = g_storageBuffer,
-          .size = ranges.nrmDataRange.second - ranges.nrmDataRange.first,
+          .size = ranges.nrmDataRange.size,
      },
      // UVs
      wgpu::BindGroupEntry{
          .binding = 3,
          .buffer = g_storageBuffer,
-          .size = ranges.tcDataRange.second - ranges.tcDataRange.first,
+          .size = ranges.tcDataRange.size,
      },
      // Packed UVs
      wgpu::BindGroupEntry{
          .binding = 4,
          .buffer = g_storageBuffer,
-          .size = ranges.packedTcDataRange.second - ranges.packedTcDataRange.first,
+          .size = ranges.packedTcDataRange.size,
      },
  };
  std::array<wgpu::BindGroupEntry, MaxTextures> samplerEntries;
@@ -622,8 +622,9 @@ GXBindGroups build_bind_groups(const ShaderInfo& info, const ShaderConfig& confi
 GXBindGroupLayouts build_bind_group_layouts(const ShaderInfo& info, const ShaderConfig& config) noexcept {
  GXBindGroupLayouts out;
  u32 uniformSizeKey = info.uniformSize + (config.denormalizedVertexAttributes ? 0 : 1);
-  if (sUniformBindGroupLayouts.contains(uniformSizeKey)) {
-    out.uniformLayout = sUniformBindGroupLayouts[uniformSizeKey];
+  const auto uniformIt = sUniformBindGroupLayouts.find(uniformSizeKey);
+  if (uniformIt != sUniformBindGroupLayouts.end()) {
+    out.uniformLayout = uniformIt->second;
  } else {
    const std::array uniformLayoutEntries{
        wgpu::BindGroupLayoutEntry{
@@ -683,8 +684,9 @@ GXBindGroupLayouts build_bind_group_layouts(const ShaderInfo& info, const Shader
  }

  u32 textureCount = info.sampledTextures.count();
-  if (sTextureBindGroupLayouts.contains(textureCount)) {
-    const auto& [sl, tl] = sTextureBindGroupLayouts[textureCount];
+  const auto textureIt = sTextureBindGroupLayouts.find(textureCount);
+  if (textureIt != sTextureBindGroupLayouts.end()) {
+    const auto& [sl, tl] = textureIt->second;
    out.samplerLayout = sl;
    out.textureLayout = tl;
  } else {
@@ -728,7 +730,7 @@ GXBindGroupLayouts build_bind_group_layouts(const ShaderInfo& info, const Shader
 }

 // TODO this is awkward
-extern std::unordered_map<ShaderRef, std::pair<wgpu::ShaderModule, gx::ShaderInfo>> g_gxCachedShaders;
+extern absl::flat_hash_map<ShaderRef, std::pair<wgpu::ShaderModule, gx::ShaderInfo>> g_gxCachedShaders;
 void shutdown() noexcept {
  // TODO we should probably store this all in g_state.gx instead
  sUniformBindGroupLayouts.clear();
--- a/aurora/lib/gfx/gx.hpp
+++ b/aurora/lib/gfx/gx.hpp
@@ -20,6 +20,7 @@ struct TevPass {
  Arg b = Default;
  Arg c = Default;
  Arg d = Default;
+  bool operator==(const TevPass&) const = default;
 };
 struct TevOp {
  GX::TevOp op = GX::TevOp::TEV_ADD;
@@ -27,6 +28,7 @@ struct TevOp {
  GX::TevScale scale = GX::TevScale::CS_SCALE_1;
  GX::TevRegID outReg = GX::TevRegID::TEVPREV;
  bool clamp = true;
+  bool operator==(const TevOp&) const = default;
 };
 struct TevStage {
  TevPass<GX::TevColorArg, GX::CC_ZERO> colorPass;
@@ -38,6 +40,7 @@ struct TevStage {
  GX::TexCoordID texCoordId = GX::TEXCOORD_NULL;
  GX::TexMapID texMapId = GX::TEXMAP_NULL;
  GX::ChannelID channelId = GX::COLOR_NULL;
+  bool operator==(const TevStage&) const = default;
 };
 struct TextureBind {
  aurora::gfx::TextureHandle handle;
@@ -56,6 +59,7 @@ struct ColorChannelConfig {
  GX::ColorSrc matSrc = GX::SRC_REG;
  GX::ColorSrc ambSrc = GX::SRC_REG;
  bool lightingEnabled = false;
+  bool operator==(const ColorChannelConfig&) const = default;
 };
 // For uniform generation
 struct ColorChannelState {
@@ -72,6 +76,7 @@ struct TcgConfig {
  GX::TexMtx mtx = GX::IDENTITY;
  GX::PTTexMtx postMtx = GX::PTIDENTITY;
  bool normalize = false;
+  bool operator==(const TcgConfig&) const = default;
 };
 struct FogState {
  GX::FogType type = GX::FOG_NONE;
@@ -129,6 +134,7 @@ struct ShaderConfig {
  std::optional<float> alphaDiscard;
  bool denormalizedVertexAttributes = false;
  bool denormalizedHasNrm = false; // TODO this is a hack
+  bool operator==(const ShaderConfig&) const = default;
 };
 struct PipelineConfig {
  ShaderConfig shaderConfig;
--- a/aurora/lib/gfx/gx_shader.cpp
+++ b/aurora/lib/gfx/gx_shader.cpp
@@ -3,14 +3,17 @@
 #include "../gpu.hpp"
 #include "gx.hpp"

-#include <unordered_map>
+#include <absl/container/flat_hash_map.h>

 namespace aurora::gfx::gx {
 using namespace fmt::literals;

 static logvisor::Module Log("aurora::gfx::gx");

-std::unordered_map<ShaderRef, std::pair<wgpu::ShaderModule, gx::ShaderInfo>> g_gxCachedShaders;
+absl::flat_hash_map<ShaderRef, std::pair<wgpu::ShaderModule, gx::ShaderInfo>> g_gxCachedShaders;
+#ifndef NDEBUG
+static absl::flat_hash_map<ShaderRef, gx::ShaderConfig> g_gxCachedShaderConfigs;
+#endif

 static std::string color_arg_reg(GX::TevColorArg arg, size_t stageIdx, const TevStage& stage, ShaderInfo& info) {
  switch (arg) {
@@ -346,8 +349,15 @@ static std::string in_uv(u32 idx) {

 std::pair<wgpu::ShaderModule, ShaderInfo> build_shader(const ShaderConfig& config) noexcept {
  const auto hash = xxh3_hash(config);
-  if (g_gxCachedShaders.contains(hash)) {
-    return g_gxCachedShaders[hash];
+  const auto it = g_gxCachedShaders.find(hash);
+  if (it != g_gxCachedShaders.end()) {
+#ifndef NDEBUG
+    if (g_gxCachedShaderConfigs[hash] != config) {
+      Log.report(logvisor::Fatal, FMT_STRING("Shader collision!"));
+      unreachable();
+    }
+#endif
+    return it->second;
  }

  Log.report(logvisor::Info, FMT_STRING("Shader config (hash {:x}):"), hash);
@@ -791,6 +801,9 @@ fn fs_main(in: VertexOutput) -> @location(0) vec4<f32> {{
  info.uniformSize = align_uniform(info.uniformSize);
  auto pair = std::make_pair(std::move(shader), info);
  g_gxCachedShaders.emplace(hash, pair);
+#ifndef NDEBUG
+  g_gxCachedShaderConfigs.emplace(hash, config);
+#endif

  return pair;
 }
--- a/aurora/lib/gfx/model/shader.cpp
+++ b/aurora/lib/gfx/model/shader.cpp
@@ -3,6 +3,7 @@
 #include "../../gpu.hpp"
 #include "../common.hpp"

+#include <absl/container/flat_hash_map.h>
 #include <aurora/model.hpp>
 #include <magic_enum.hpp>

@@ -53,11 +54,10 @@ static const std::vector<zeus::CVector3f>* vtxData;
 static const std::vector<zeus::CVector3f>* nrmData;
 static const std::vector<Vec2<float>>* tex0TcData;
 static const std::vector<Vec2<float>>* tcData;
-
-// void set_vertex_buffer(const std::vector<zeus::CVector3f>* data) noexcept { vtxData = data; }
-// void set_normal_buffer(const std::vector<zeus::CVector3f>* norm) noexcept { nrmData = norm; }
-// void set_tex0_tc_buffer(const std::vector<Vec2<float>>* tcs) noexcept { tex0TcData = tcs; }
-// void set_tc_buffer(const std::vector<Vec2<float>>* tcs) noexcept { tcData = tcs; }
+static std::optional<Range> staticVtxRange;
+static std::optional<Range> staticNrmRange;
+static std::optional<Range> staticPackedTcRange;
+static std::optional<Range> staticTcRange;

 enum class VertexFormat : u8 {
  F32F32,
@@ -110,69 +110,95 @@ static inline std::pair<gx::DlVert, size_t> readVert(const u8* data) noexcept {
  return {out, offset};
 }

+static absl::flat_hash_map<XXH64_hash_t, std::pair<std::vector<gx::DlVert>, std::vector<u32>>> sCachedDisplayLists;
+
 void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
-  //  Log.report(logvisor::Info, FMT_STRING("DL size {}"), dlSize);
-  std::vector<gx::DlVert> verts;
-  std::vector<u32> indices;
+  const auto hash = xxh3_hash(dlStart, dlSize, 0);
+  Range vertRange, idxRange;
+  uint32_t numIndices;
+  auto it = sCachedDisplayLists.find(hash);
+  if (it != sCachedDisplayLists.end()) {
+    const auto& [verts, indices] = it->second;
+    numIndices = indices.size();
+    vertRange = push_verts(ArrayRef{verts});
+    idxRange = push_indices(ArrayRef{indices});
+  } else {
+    std::vector<gx::DlVert> verts;
+    std::vector<u32> indices;

-  size_t offset = 0;
-  while (offset < dlSize - 6) {
-    const auto header = dlStart[offset];
-    const auto primitive = static_cast<GX::Primitive>(header & 0xF8);
-    const auto vtxFmt = static_cast<VertexFormat>(header & 0x3);
-    const auto vtxCount = metaforce::SBig(*reinterpret_cast<const u16*>(dlStart + offset + 1));
-    //    Log.report(logvisor::Info, FMT_STRING("DL header prim {}, fmt {}, vtx count {}"), primitive,
-    //               magic_enum::enum_name(vtxFmt), vtxCount);
-    offset += 3;
+    size_t offset = 0;
+    while (offset < dlSize - 6) {
+      const auto header = dlStart[offset];
+      const auto primitive = static_cast<GX::Primitive>(header & 0xF8);
+      const auto vtxFmt = static_cast<VertexFormat>(header & 0x3);
+      const auto vtxCount = metaforce::SBig(*reinterpret_cast<const u16*>(dlStart + offset + 1));
+      offset += 3;

-    if (primitive == 0) {
-      break;
-    }
-    if (primitive != GX::TRIANGLES && primitive != GX::TRIANGLESTRIP && primitive != GX::TRIANGLEFAN) {
-      Log.report(logvisor::Fatal, FMT_STRING("queue_surface: unsupported primitive type {}"), primitive);
-      unreachable();
-    }
-
-    const u32 idxStart = indices.size();
-    const u16 vertsStart = verts.size();
-    verts.reserve(vertsStart + vtxCount);
-    if (vtxCount > 3 && (primitive == GX::TRIANGLEFAN || primitive == GX::TRIANGLESTRIP)) {
-      indices.reserve(idxStart + (u32(vtxCount) - 3) * 3 + 3);
-    } else {
-      indices.reserve(idxStart + vtxCount);
-    }
-    auto curVert = vertsStart;
-    for (int v = 0; v < vtxCount; ++v) {
-      const auto [vert, read] = readVert(dlStart + offset);
-      verts.push_back(vert);
-      offset += read;
-      if (primitive == GX::TRIANGLES || v < 3) {
-        // pass
-      } else if (primitive == GX::TRIANGLEFAN) {
-        indices.push_back(vertsStart);
-        indices.push_back(curVert - 1);
-      } else if (primitive == GX::TRIANGLESTRIP) {
-        if ((v & 1) == 0) {
-          indices.push_back(curVert - 2);
-          indices.push_back(curVert - 1);
-        } else {
-          indices.push_back(curVert - 1);
-          indices.push_back(curVert - 2);
-        }
+      if (primitive == 0) {
+        break;
+      }
+      if (primitive != GX::TRIANGLES && primitive != GX::TRIANGLESTRIP && primitive != GX::TRIANGLEFAN) {
+        Log.report(logvisor::Fatal, FMT_STRING("queue_surface: unsupported primitive type {}"), primitive);
+        unreachable();
+      }
+
+      const u32 idxStart = indices.size();
+      const u16 vertsStart = verts.size();
+      verts.reserve(vertsStart + vtxCount);
+      if (vtxCount > 3 && (primitive == GX::TRIANGLEFAN || primitive == GX::TRIANGLESTRIP)) {
+        indices.reserve(idxStart + (u32(vtxCount) - 3) * 3 + 3);
+      } else {
+        indices.reserve(idxStart + vtxCount);
+      }
+      auto curVert = vertsStart;
+      for (int v = 0; v < vtxCount; ++v) {
+        const auto [vert, read] = readVert(dlStart + offset);
+        verts.push_back(vert);
+        offset += read;
+        if (primitive == GX::TRIANGLES || v < 3) {
+          // pass
+        } else if (primitive == GX::TRIANGLEFAN) {
+          indices.push_back(vertsStart);
+          indices.push_back(curVert - 1);
+        } else if (primitive == GX::TRIANGLESTRIP) {
+          if ((v & 1) == 0) {
+            indices.push_back(curVert - 2);
+            indices.push_back(curVert - 1);
+          } else {
+            indices.push_back(curVert - 1);
+            indices.push_back(curVert - 2);
+          }
+        }
+        indices.push_back(curVert);
+        ++curVert;
      }
-      indices.push_back(curVert);
-      ++curVert;
    }
+
+    numIndices = indices.size();
+    vertRange = push_verts(ArrayRef{verts});
+    idxRange = push_indices(ArrayRef{indices});
+    sCachedDisplayLists.try_emplace(hash, std::move(verts), std::move(indices));
  }

-  //  Log.report(logvisor::Info, FMT_STRING("Read {} verts, {} indices"), verts.size(), indices.size());
-  const auto vertRange = push_verts(ArrayRef{verts});
-  const auto idxRange = push_indices(ArrayRef{indices});
-  const auto sVtxRange = push_storage(reinterpret_cast<const uint8_t*>(vtxData->data()), vtxData->size() * 16);
-  const auto sNrmRange = push_storage(reinterpret_cast<const uint8_t*>(nrmData->data()), nrmData->size() * 16);
-  const auto sTcRange = push_storage(reinterpret_cast<const uint8_t*>(tcData->data()), tcData->size() * 8);
-  Range sPackedTcRange;
-  if (tcData == tex0TcData) {
+  Range sVtxRange, sNrmRange, sTcRange, sPackedTcRange;
+  if (staticVtxRange) {
+    sVtxRange = *staticVtxRange;
+  } else {
+    sVtxRange = push_storage(reinterpret_cast<const uint8_t*>(vtxData->data()), vtxData->size() * 16);
+  }
+  if (staticNrmRange) {
+    sNrmRange = *staticNrmRange;
+  } else {
+    sNrmRange = push_storage(reinterpret_cast<const uint8_t*>(nrmData->data()), nrmData->size() * 16);
+  }
+  if (staticTcRange) {
+    sTcRange = *staticTcRange;
+  } else {
+    sTcRange = push_storage(reinterpret_cast<const uint8_t*>(tcData->data()), tcData->size() * 8);
+  }
+  if (staticPackedTcRange) {
+    sPackedTcRange = *staticPackedTcRange;
+  } else if (tcData == tex0TcData) {
    sPackedTcRange = sTcRange;
  } else {
    sPackedTcRange = push_storage(reinterpret_cast<const uint8_t*>(tex0TcData->data()), tex0TcData->size() * 8);
@@ -192,12 +218,9 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
      .pipeline = pipeline,
      .vertRange = vertRange,
      .idxRange = idxRange,
-      .sVtxRange = sVtxRange,
-      .sNrmRange = sNrmRange,
-      .sTcRange = sTcRange,
-      .sPackedTcRange = sPackedTcRange,
+      .dataRanges = ranges,
      .uniformRange = build_uniform(info),
-      .indexCount = static_cast<uint32_t>(indices.size()),
+      .indexCount = numIndices,
      .bindGroups = info.bindGroups,
  });
 }
@@ -220,36 +243,60 @@ void render(const State& state, const DrawData& data, const wgpu::RenderPassEnco
  }

  const std::array offsets{
-      data.uniformRange.first, data.sVtxRange.first,      data.sNrmRange.first,
-      data.sTcRange.first,     data.sPackedTcRange.first,
+      data.uniformRange.offset,
+      storage_offset(data.dataRanges.vtxDataRange),
+      storage_offset(data.dataRanges.nrmDataRange),
+      storage_offset(data.dataRanges.tcDataRange),
+      storage_offset(data.dataRanges.packedTcDataRange),
  };
  pass.SetBindGroup(0, find_bind_group(data.bindGroups.uniformBindGroup), offsets.size(), offsets.data());
  if (data.bindGroups.samplerBindGroup && data.bindGroups.textureBindGroup) {
    pass.SetBindGroup(1, find_bind_group(data.bindGroups.samplerBindGroup));
    pass.SetBindGroup(2, find_bind_group(data.bindGroups.textureBindGroup));
  }
-  pass.SetVertexBuffer(0, g_vertexBuffer, data.vertRange.first, data.vertRange.second);
-  pass.SetIndexBuffer(g_indexBuffer, wgpu::IndexFormat::Uint32, data.idxRange.first, data.idxRange.second);
+  pass.SetVertexBuffer(0, g_vertexBuffer, data.vertRange.offset, data.vertRange.size);
+  pass.SetIndexBuffer(g_indexBuffer, wgpu::IndexFormat::Uint32, data.idxRange.offset, data.idxRange.size);
  pass.DrawIndexed(data.indexCount);
 }
 } // namespace aurora::gfx::model

+static absl::flat_hash_map<XXH64_hash_t, aurora::gfx::Range> sCachedRanges; // xxh3 hash of array bytes -> stored range
+// Points outPtr at `data` viewed as Vec; when stride == 1 also resolves outRange through sCachedRanges.
+template <typename Vec>
+static inline void cache_array(const void* data, Vec*& outPtr, std::optional<aurora::gfx::Range>& outRange, u8 stride) {
+  Vec* const array = static_cast<Vec*>(data);
+  outPtr = array;
+  if (stride != 1) {
+    outRange.reset(); // any other stride: leave the caller without a cached range
+    return;
+  }
+  const auto key = aurora::xxh3_hash(array->data(), array->size() * sizeof(typename Vec::value_type), 0);
+  if (const auto cached = sCachedRanges.find(key); cached != sCachedRanges.end()) {
+    outRange = cached->second; // this hash was seen before: reuse the range recorded for it
+    return;
+  }
+  const auto stored = aurora::gfx::push_static_storage(aurora::ArrayRef{*array});
+  sCachedRanges.try_emplace(key, stored);
+  outRange = stored;
+}
+
 void GXSetArray(GX::Attr attr, const void* data, u8 stride) noexcept {
+  using namespace aurora::gfx::model; // the array pointers, static*Range optionals and Log used below live here
  switch (attr) {
  case GX::VA_POS:
-    aurora::gfx::model::vtxData = static_cast<const std::vector<zeus::CVector3f>*>(data);
+    cache_array(data, vtxData, staticVtxRange, stride); // positions
    break;
  case GX::VA_NRM:
-    aurora::gfx::model::nrmData = static_cast<const std::vector<zeus::CVector3f>*>(data);
+    cache_array(data, nrmData, staticNrmRange, stride); // normals
    break;
  case GX::VA_TEX0:
-    aurora::gfx::model::tex0TcData = static_cast<const std::vector<aurora::Vec2<float>>*>(data);
+    cache_array(data, tex0TcData, staticPackedTcRange, stride); // TEX0 is paired with the *packed* texcoord range
    break;
  case GX::VA_TEX1:
-    aurora::gfx::model::tcData = static_cast<const std::vector<aurora::Vec2<float>>*>(data);
+    cache_array(data, tcData, staticTcRange, stride); // TEX1 is paired with the non-packed texcoord range
    break;
  default:
-    aurora::gfx::model::Log.report(logvisor::Fatal, FMT_STRING("GXSetArray: invalid attr {}"), attr);
+    Log.report(logvisor::Fatal, FMT_STRING("GXSetArray: invalid attr {}"), attr);
    unreachable();
  }
 }
--- a/aurora/lib/gfx/model/shader.hpp
+++ b/aurora/lib/gfx/model/shader.hpp
@@ -3,17 +3,12 @@
 #include "../common.hpp"
 #include "../gx.hpp"

-#include <unordered_map>
-
 namespace aurora::gfx::model {
 struct DrawData {
  PipelineRef pipeline;
  Range vertRange;
  Range idxRange;
-  Range sVtxRange;
-  Range sNrmRange;
-  Range sTcRange;
-  Range sPackedTcRange;
+  gx::BindGroupRanges dataRanges;
  Range uniformRange;
  uint32_t indexCount;
  gx::GXBindGroups bindGroups;
--- a/aurora/lib/gfx/movie_player/shader.cpp
+++ b/aurora/lib/gfx/movie_player/shader.cpp
@@ -239,7 +239,7 @@ void render(const State& state, const DrawData& data, const wgpu::RenderPassEnco

  pass.SetBindGroup(0, state.uniformBindGroup);
  pass.SetBindGroup(1, find_bind_group(data.textureBindGroup));
-  pass.SetVertexBuffer(0, g_vertexBuffer, data.vertRange.first, data.vertRange.second);
+  pass.SetVertexBuffer(0, g_vertexBuffer, data.vertRange.offset, data.vertRange.size);
  pass.Draw(4);
 }
 } // namespace aurora::gfx::movie_player
--- a/aurora/lib/gfx/stream.cpp
+++ b/aurora/lib/gfx/stream.cpp
@@ -4,8 +4,6 @@
 #include "common.hpp"
 #include "gx.hpp"

-#include <unordered_map>
-
 namespace aurora::gfx {
 static logvisor::Module Log("aurora::gfx::stream");

--- a/aurora/lib/gfx/stream/shader.cpp
+++ b/aurora/lib/gfx/stream/shader.cpp
@@ -6,10 +6,6 @@
 #include <magic_enum.hpp>
 #include <utility>

-namespace aurora::gfx {
-extern std::unordered_map<ShaderRef, wgpu::ShaderModule> g_gxCachedShaders;
-} // namespace aurora::gfx
-
 namespace aurora::gfx::stream {
 static logvisor::Module Log("aurora::gfx::stream");

@@ -66,60 +62,20 @@ wgpu::RenderPipeline create_pipeline(const State& state, [[maybe_unused]] Pipeli
  return build_pipeline(config, info, vertexBuffers, shader, "Stream Pipeline");
 }

-State construct_state() {
-  const auto samplerBinding = wgpu::SamplerBindingLayout{
-      .type = wgpu::SamplerBindingType::Filtering,
-  };
-  const std::array samplerLayoutEntries{
-      wgpu::BindGroupLayoutEntry{
-          .binding = 0,
-          .visibility = wgpu::ShaderStage::Fragment,
-          .sampler = samplerBinding,
-      },
-  };
-  const auto samplerLayoutDescriptor = wgpu::BindGroupLayoutDescriptor{
-      .label = "Stream Sampler Bind Group Layout",
-      .entryCount = samplerLayoutEntries.size(),
-      .entries = samplerLayoutEntries.data(),
-  };
-  auto samplerLayout = g_device.CreateBindGroupLayout(&samplerLayoutDescriptor);
-
-  const auto textureBinding = wgpu::TextureBindingLayout{
-      .sampleType = wgpu::TextureSampleType::Float,
-      .viewDimension = wgpu::TextureViewDimension::e2D,
-  };
-  const std::array textureLayoutEntries{
-      wgpu::BindGroupLayoutEntry{
-          .binding = 0,
-          .visibility = wgpu::ShaderStage::Fragment,
-          .texture = textureBinding,
-      },
-  };
-  const auto textureLayoutDescriptor = wgpu::BindGroupLayoutDescriptor{
-      .label = "Stream Texture Bind Group Layout",
-      .entryCount = textureLayoutEntries.size(),
-      .entries = textureLayoutEntries.data(),
-  };
-  auto textureLayout = g_device.CreateBindGroupLayout(&textureLayoutDescriptor);
-
-  return {
-      .samplerLayout = samplerLayout,
-      .textureLayout = textureLayout,
-  };
-}
+State construct_state() { return {}; } // stream::State is an empty struct, so there is nothing to build

 void render(const State& state, const DrawData& data, const wgpu::RenderPassEncoder& pass) {
  if (!bind_pipeline(data.pipeline, pass)) {
    return;
  }

-  const std::array offsets{data.uniformRange.first};
+  const std::array offsets{data.uniformRange.offset}; // the single dynamic offset handed to bind group 0 below
  pass.SetBindGroup(0, find_bind_group(data.bindGroups.uniformBindGroup), offsets.size(), offsets.data());
  if (data.bindGroups.samplerBindGroup && data.bindGroups.textureBindGroup) {
    pass.SetBindGroup(1, find_bind_group(data.bindGroups.samplerBindGroup));
    pass.SetBindGroup(2, find_bind_group(data.bindGroups.textureBindGroup));
  }
-  pass.SetVertexBuffer(0, g_vertexBuffer, data.vertRange.first, data.vertRange.second);
+  pass.SetVertexBuffer(0, g_vertexBuffer, data.vertRange.offset, data.vertRange.size); // bind only vertRange's slice
  pass.Draw(data.vertexCount);
 }
 } // namespace aurora::gfx::stream
--- a/aurora/lib/gfx/stream/shader.hpp
+++ b/aurora/lib/gfx/stream/shader.hpp
@@ -3,8 +3,6 @@
 #include "../common.hpp"
 #include "../gx.hpp"

-#include <unordered_map>
-
 namespace aurora::gfx::stream {
 struct DrawData {
  PipelineRef pipeline;
@@ -16,19 +14,7 @@ struct DrawData {

 struct PipelineConfig : public gx::PipelineConfig {};

-struct CachedBindGroup {
-  wgpu::BindGroupLayout layout;
-  wgpu::BindGroup bindGroup;
-  CachedBindGroup(wgpu::BindGroupLayout layout, wgpu::BindGroup&& group)
-  : layout(std::move(layout)), bindGroup(std::move(group)) {}
-};
-struct State {
-  wgpu::BindGroupLayout samplerLayout;
-  wgpu::BindGroupLayout textureLayout;
-  mutable std::unordered_map<uint32_t, CachedBindGroup> uniform;
-  mutable std::unordered_map<uint64_t, wgpu::Sampler> sampler;
-  mutable std::unordered_map<PipelineRef, gx::ShaderInfo> shaderInfo;
-};
+struct State {}; // empty; still the type returned by construct_state() and taken by create_pipeline()

 State construct_state();
 wgpu::RenderPipeline create_pipeline(const State& state, [[maybe_unused]] PipelineConfig config);
--- a/aurora/lib/gfx/textured_quad/shader.cpp
+++ b/aurora/lib/gfx/textured_quad/shader.cpp
@@ -380,10 +380,10 @@ void render(const State& state, const DrawData& data, const wgpu::RenderPassEnco
    return;
  }

-  const std::array offsets{data.uniformRange.first};
+  const std::array offsets{data.uniformRange.offset};
  pass.SetBindGroup(0, state.uniformBindGroup, offsets.size(), offsets.data());
  pass.SetBindGroup(1, find_bind_group(data.textureBindGroup));
-  pass.SetVertexBuffer(0, g_vertexBuffer, data.vertRange.first, data.vertRange.second);
+  pass.SetVertexBuffer(0, g_vertexBuffer, data.vertRange.offset, data.vertRange.size);
  pass.Draw(4);
 }
 } // namespace aurora::gfx::textured_quad
--- a/aurora/lib/input.cpp
+++ b/aurora/lib/input.cpp
@@ -2,6 +2,7 @@
 #include <SDL_haptic.h>

 #include <absl/container/btree_map.h>
+#include <absl/container/flat_hash_map.h>
 #include <absl/strings/str_split.h>

 namespace aurora::input {
@@ -13,7 +14,7 @@ struct GameController {
  Sint32 m_index = -1;
  bool m_hasRumble = false;
 };
-std::unordered_map<Uint32, GameController> g_GameControllers;
+absl::flat_hash_map<Uint32, GameController> g_GameControllers; // NOTE(review): key is presumably the SDL instance id — confirm

 static std::optional<std::string> remap_controller_layout(std::string_view mapping) {
  std::string newMapping;
--- a/aurora/lib/input.hpp
+++ b/aurora/lib/input.hpp
@@ -1,11 +1,11 @@
 #pragma once

-#include <unordered_map>
 #include "aurora/aurora.hpp"
 #include "SDL_gamecontroller.h"
 #include "SDL_keyboard.h"
 #include "SDL_keycode.h"
 #include "SDL_mouse.h"
+
 namespace aurora::input {
 Sint32 add_controller(Sint32 which) noexcept;
 void remove_controller(Uint32 instance) noexcept;