Rewrite attribute buffer, matrix & stream handling
Array attributes (GXSetArray) are now fetched correctly based on the vertex format. Buffers are still assumed to be byte-swapped to little-endian. Stream handling has been completely redone and many issues resolved. Matrix transposes are eliminated, so AURORA_NATIVE_MATRIX is no longer necessary and has been removed.
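
For context, the GX calls involved (a hedged usage sketch; the data, values, and stride below are illustrative, not from this commit): GXSetVtxDesc selects direct vs. indexed input per attribute, GXSetVtxAttrFmt declares the component count/type/fixed-point scale per vertex format, and GXSetArray registers the buffer that indexed attributes fetch from. Fetching "based on the vertex format" means stride and component decoding now follow these declarations instead of fixed assumptions.

    // Illustrative only; names and values are hypothetical.
    static const s16 positions[][3] = {{0, 0, 0}, {256, 0, 0}, {0, 256, 0}};
    GXSetVtxDesc(GX_VA_POS, GX_INDEX16);                           // display list holds u16 indices
    GXSetVtxAttrFmt(GX_VTXFMT0, GX_VA_POS, GX_POS_XYZ, GX_S16, 8); // s16 components, 8 fractional bits
    GXSetArray(GX_VA_POS, positions, sizeof(positions[0]));        // base pointer + stride

On real hardware the array data is big-endian; as noted above, aurora still expects the application to provide it byte-swapped to little-endian.
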
@@ -1,60 +1,29 @@
 #include "shader.hpp"
 
 #include "../../webgpu/gpu.hpp"
+#include "../gx_fmt.hpp"
 
 #include <absl/container/flat_hash_map.h>
 
 namespace aurora::gfx::model {
 static Module Log("aurora::gfx::model");
-
-template <typename T>
-constexpr T bswap16(T val) noexcept {
-  static_assert(sizeof(T) == sizeof(u16));
-  union {
-    u16 u;
-    T t;
-  } v{.t = val};
-#if __GNUC__
-  v.u = __builtin_bswap16(v.u);
-#elif _WIN32
-  v.u = _byteswap_ushort(v.u);
-#else
-  v.u = (v.u << 8) | ((v.u >> 8) & 0xFF);
-#endif
-  return v.t;
-}
-template <typename T>
-constexpr T bswap32(T val) noexcept {
-  static_assert(sizeof(T) == sizeof(u32));
-  union {
-    u32 u;
-    T t;
-  } v{.t = val};
-#if __GNUC__
-  v.u = __builtin_bswap32(v.u);
-#elif _WIN32
-  v.u = _byteswap_ulong(v.u);
-#else
-  v.u = ((v.u & 0x0000FFFF) << 16) | ((v.u & 0xFFFF0000) >> 16) | ((v.u & 0x00FF00FF) << 8) | ((v.u & 0xFF00FF00) >> 8);
-#endif
-  return v.t;
-}
 
 using IndexedAttrs = std::array<bool, GX_VA_MAX_ATTR>;
 struct DisplayListCache {
   ByteBuffer vtxBuf;
   ByteBuffer idxBuf;
   IndexedAttrs indexedAttrs;
+  GXVtxFmt fmt;
 
-  DisplayListCache(ByteBuffer&& vtxBuf, ByteBuffer&& idxBuf, IndexedAttrs indexedAttrs)
-      : vtxBuf(std::move(vtxBuf)), idxBuf(std::move(idxBuf)), indexedAttrs(indexedAttrs) {}
+  DisplayListCache(ByteBuffer&& vtxBuf, ByteBuffer&& idxBuf, IndexedAttrs indexedAttrs, GXVtxFmt fmt)
+      : vtxBuf(std::move(vtxBuf)), idxBuf(std::move(idxBuf)), indexedAttrs(indexedAttrs), fmt(fmt) {}
 };
 
 static absl::flat_hash_map<HashType, DisplayListCache> sCachedDisplayLists;
 
 static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u16 vtxCount,
                               IndexedAttrs& indexedAttrs) {
-  using aurora::gfx::gx::g_gxState;
+  using gx::g_gxState;
   struct {
     u8 count;
     GXCompType type;
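
The bswap16/bswap32 templates removed above are folded into a single width-dispatched bswap used throughout the new code. A minimal sketch of such a helper, assuming C++20 and aurora's u16/u32 aliases (the real one presumably lives in a shared header):

    #include <cstring>
    #include <type_traits>

    template <typename T>
    T bswap(T val) noexcept {
      static_assert(std::is_trivially_copyable_v<T>);
      static_assert(sizeof(T) == 2 || sizeof(T) == 4);
      if constexpr (sizeof(T) == 2) {
        u16 u;
        std::memcpy(&u, &val, sizeof(u));          // swap the object representation,
        u = static_cast<u16>((u << 8) | (u >> 8)); // not the arithmetic value
        std::memcpy(&val, &u, sizeof(u));
      } else {
        u32 u;
        std::memcpy(&u, &val, sizeof(u));
        u = ((u & 0x0000FFFFu) << 16) | (u >> 16);
        u = ((u & 0x00FF00FFu) << 8) | ((u & 0xFF00FF00u) >> 8);
        std::memcpy(&val, &u, sizeof(u));
      }
      return val;
    }

Because the swap operates on the bit pattern via memcpy, the same helper is safe for f32 values, which is why the GX_F32 case further down can call bswap directly on floats.
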
@@ -66,14 +35,13 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u
   for (int attr = 0; attr < GX_VA_MAX_ATTR; attr++) {
     const auto& attrFmt = g_gxState.vtxFmts[vtxfmt].attrs[attr];
     switch (g_gxState.vtxDesc[attr]) {
-      DEFAULT_FATAL("unhandled attribute type {}", static_cast<int>(g_gxState.vtxDesc[attr]));
+      DEFAULT_FATAL("unhandled attribute type {}", g_gxState.vtxDesc[attr]);
     case GX_NONE:
       break;
     case GX_DIRECT:
 #define COMBINE(val1, val2, val3) (((val1) << 16) | ((val2) << 8) | (val3))
       switch (COMBINE(attr, attrFmt.cnt, attrFmt.type)) {
-        DEFAULT_FATAL("not handled: attr {}, cnt {}, type {}", static_cast<int>(attr), static_cast<int>(attrFmt.cnt),
-                      static_cast<int>(attrFmt.type));
+        DEFAULT_FATAL("not handled: attr {}, cnt {}, type {}", attr, attrFmt.cnt, attrFmt.type);
       case COMBINE(GX_VA_POS, GX_POS_XYZ, GX_F32):
       case COMBINE(GX_VA_NRM, GX_NRM_XYZ, GX_F32):
        attrArrays[attr].count = 3;
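
COMBINE packs (attribute, component count, component type) into one integer so a single switch can dispatch over the combination; each GX enum fits in 8 bits, so the fields cannot collide. For example, with the usual GX enum values (GX_VA_POS = 9, GX_POS_XYZ = 1, GX_F32 = 4):

    #define COMBINE(val1, val2, val3) (((val1) << 16) | ((val2) << 8) | (val3))
    static_assert(COMBINE(9, 1, 4) == 0x090104); // a unique, constant case label
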
@@ -150,12 +118,10 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u
   for (u32 v = 0; v < vtxCount; ++v) {
     for (int attr = 0; attr < GX_VA_MAX_ATTR; attr++) {
       if (g_gxState.vtxDesc[attr] == GX_INDEX8) {
-        u16 index = *ptr;
-        buf.append(&index, 2);
+        buf.append(static_cast<u16>(*ptr));
         ++ptr;
       } else if (g_gxState.vtxDesc[attr] == GX_INDEX16) {
-        u16 index = bswap16(*reinterpret_cast<const u16*>(ptr));
-        buf.append(&index, 2);
+        buf.append(bswap(*reinterpret_cast<const u16*>(ptr)));
         ptr += 2;
       }
       if (g_gxState.vtxDesc[attr] != GX_DIRECT) {
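
The pointer-plus-size appends give way to a typed overload; note that GX_INDEX8 indices are still widened to u16 so every indexed attribute occupies a uniform two bytes in the converted vertex stream. A plausible shape for such an overload, assuming ByteBuffer keeps its existing pointer/size form (hypothetical, not confirmed by this diff):

    template <typename T>
    void append(const T& value) {
      static_assert(std::is_trivially_copyable_v<T>);
      append(&value, sizeof(T)); // forward to the pointer/size overload
    }

The same overload would also cover the std::array appends used for the fan/strip index expansion below.
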
@@ -182,7 +148,7 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u
         break;
       case GX_U16:
         for (int i = 0; i < count; ++i) {
-          const auto value = bswap16(reinterpret_cast<const u16*>(ptr)[i]);
+          const auto value = bswap(reinterpret_cast<const u16*>(ptr)[i]);
           out[i] = static_cast<f32>(value) / static_cast<f32>(1 << attrFmt.frac);
         }
         buf.append(out.data(), sizeof(f32) * count);
@@ -190,7 +156,7 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u
         break;
       case GX_S16:
         for (int i = 0; i < count; ++i) {
-          const auto value = bswap16(reinterpret_cast<const s16*>(ptr)[i]);
+          const auto value = bswap(reinterpret_cast<const s16*>(ptr)[i]);
           out[i] = static_cast<f32>(value) / static_cast<f32>(1 << attrFmt.frac);
         }
         buf.append(out.data(), sizeof(f32) * count);
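
The frac field of the attribute format is the number of fractional bits, so the integer-to-float decode above is value / 2^frac. A worked example using the same u*/f32 aliases (not project code):

    const s16 raw = 640;  // 0x0280, already byte-swapped
    const u8 frac = 8;    // 8 fractional bits -> divide by 256
    const f32 decoded = static_cast<f32>(raw) / static_cast<f32>(1 << frac);
    static_assert(640.0f / 256.0f == 2.5f);
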
@@ -198,7 +164,7 @@ static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u
         break;
       case GX_F32:
         for (int i = 0; i < count; ++i) {
-          out[i] = bswap32(reinterpret_cast<const f32*>(ptr)[i]);
+          out[i] = bswap(reinterpret_cast<const f32*>(ptr)[i]);
         }
         buf.append(out.data(), sizeof(f32) * count);
         ptr += count * sizeof(f32);
@@ -227,7 +193,7 @@ static u16 prepare_idx_buffer(ByteBuffer& buf, GXPrimitive prim, u16 vtxStart, u
     buf.reserve_extra(vtxCount * sizeof(u16));
     for (u16 v = 0; v < vtxCount; ++v) {
       const u16 idx = vtxStart + v;
-      buf.append(&idx, sizeof(u16));
+      buf.append(idx);
       ++numIndices;
     }
   } else if (prim == GX_TRIANGLEFAN) {
@@ -235,29 +201,26 @@ static u16 prepare_idx_buffer(ByteBuffer& buf, GXPrimitive prim, u16 vtxStart, u
     for (u16 v = 0; v < vtxCount; ++v) {
       const u16 idx = vtxStart + v;
       if (v < 3) {
-        buf.append(&idx, sizeof(u16));
+        buf.append(idx);
         ++numIndices;
         continue;
       }
-      const std::array<u16, 3> idxs{vtxStart, u16(idx - 1), idx};
-      buf.append(idxs.data(), sizeof(u16) * 3);
+      buf.append(std::array{vtxStart, static_cast<u16>(idx - 1), idx});
       numIndices += 3;
     }
   } else if (prim == GX_TRIANGLESTRIP) {
-    buf.reserve_extra(((u32(vtxCount) - 3) * 3 + 3) * sizeof(u16));
+    buf.reserve_extra(((static_cast<u32>(vtxCount) - 3) * 3 + 3) * sizeof(u16));
     for (u16 v = 0; v < vtxCount; ++v) {
       const u16 idx = vtxStart + v;
       if (v < 3) {
-        buf.append(&idx, sizeof(u16));
+        buf.append(idx);
         ++numIndices;
         continue;
       }
       if ((v & 1) == 0) {
-        const std::array<u16, 3> idxs{u16(idx - 2), u16(idx - 1), idx};
-        buf.append(idxs.data(), sizeof(u16) * 3);
+        buf.append(std::array{static_cast<u16>(idx - 2), static_cast<u16>(idx - 1), idx});
       } else {
-        const std::array<u16, 3> idxs{u16(idx - 1), u16(idx - 2), idx};
-        buf.append(idxs.data(), sizeof(u16) * 3);
+        buf.append(std::array{static_cast<u16>(idx - 1), static_cast<u16>(idx - 2), idx});
       }
       numIndices += 3;
     }
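
WebGPU has no triangle-fan topology, and expanding strips as well presumably lets every surface share a single triangle-list pipeline, so both are rewritten into plain triangles. A standalone model of the strip branch above (illustration only): a triangle ending on an even vertex emits (v-2, v-1, v), while one ending on an odd vertex swaps the first two indices to preserve winding.

    #include <cstdint>
    #include <vector>

    std::vector<std::uint16_t> strip_to_list(std::uint16_t start, std::uint16_t count) {
      std::vector<std::uint16_t> out;
      for (std::uint16_t v = 0; v < count; ++v) {
        const std::uint16_t idx = start + v;
        if (v < 3) {
          out.push_back(idx); // first triangle passes through unchanged
          continue;
        }
        if ((v & 1) == 0) {
          out.insert(out.end(), {std::uint16_t(idx - 2), std::uint16_t(idx - 1), idx});
        } else {
          out.insert(out.end(), {std::uint16_t(idx - 1), std::uint16_t(idx - 2), idx});
        }
      }
      return out;
    }
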
@@ -271,6 +234,7 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
   Range vertRange, idxRange;
   u32 numIndices = 0;
   IndexedAttrs indexedAttrs{};
+  GXVtxFmt fmt = GX_MAX_VTXFMT;
   auto it = sCachedDisplayLists.find(hash);
   if (it != sCachedDisplayLists.end()) {
     const auto& cache = it->second;
@@ -278,6 +242,7 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
     vertRange = push_verts(cache.vtxBuf.data(), cache.vtxBuf.size());
     idxRange = push_indices(cache.idxBuf.data(), cache.idxBuf.size());
     indexedAttrs = cache.indexedAttrs;
+    fmt = cache.fmt;
   } else {
     const u8* data = dlStart;
     u32 pos = 0;
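
Display lists are immutable once recorded, so the converted buffers are cached under a hash of the raw bytes and only re-uploaded on later draws; the vertex format is now cached alongside them, so a hit can configure the pipeline without re-parsing. A hedged sketch of the lookup (the hash helper name is assumed, not shown in this diff):

    const HashType hash = xxh3_hash_s(dlStart, dlSize, 0); // hypothetical helper name
    const auto it = sCachedDisplayLists.find(hash);
    if (it != sCachedDisplayLists.end()) {
      // reuse converted vertex/index buffers and the cached GXVtxFmt
    }
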
@@ -302,8 +267,12 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
     case GX_DRAW_TRIANGLE_STRIP:
     case GX_DRAW_TRIANGLE_FAN: {
       const auto prim = static_cast<GXPrimitive>(opcode);
-      const auto fmt = static_cast<GXVtxFmt>(cmd & GX_VAT_MASK);
-      u16 vtxCount = bswap16(*reinterpret_cast<const u16*>(data + pos));
+      const auto newFmt = static_cast<GXVtxFmt>(cmd & GX_VAT_MASK);
+      if (fmt != GX_MAX_VTXFMT && fmt != newFmt) {
+        FATAL("Vertex format changed mid-display list: {} -> {}", fmt, newFmt);
+      }
+      fmt = newFmt;
+      u16 vtxCount = bswap(*reinterpret_cast<const u16*>(data + pos));
       pos += 2;
       pos += vtxCount * prepare_vtx_buffer(vtxBuf, fmt, data + pos, vtxCount, indexedAttrs);
       numIndices += prepare_idx_buffer(idxBuf, prim, vtxStart, vtxCount);
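
A GX draw command packs the primitive opcode and the vertex-format slot into a single byte, followed by a big-endian u16 vertex count, which is why the format can be read per command and checked for consistency. Since the pipeline is now built once per display list with one vertex format, a mid-list format change cannot be represented, hence the new FATAL. A sketch of the framing assumed above (GX_OPCODE_MASK and GX_VAT_MASK are the standard GX constants; the surrounding loop is paraphrased, not quoted):

    const u8 cmd = data[pos++];
    const u8 opcode = cmd & GX_OPCODE_MASK;                       // e.g. GX_DRAW_TRIANGLES
    const auto newFmt = static_cast<GXVtxFmt>(cmd & GX_VAT_MASK); // GX_VTXFMT0..7
    const u16 vtxCount = bswap(*reinterpret_cast<const u16*>(data + pos)); // big-endian count
    pos += 2; // vtxCount packed vertices follow
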
@@ -319,22 +288,16 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
     }
     vertRange = push_verts(vtxBuf.data(), vtxBuf.size());
     idxRange = push_indices(idxBuf.data(), idxBuf.size());
-    sCachedDisplayLists.try_emplace(hash, std::move(vtxBuf), std::move(idxBuf), indexedAttrs);
+    sCachedDisplayLists.try_emplace(hash, std::move(vtxBuf), std::move(idxBuf), indexedAttrs, fmt);
   }
 
   gx::BindGroupRanges ranges{};
-  int lastIndexedAttr = -1;
   for (int i = 0; i < GX_VA_MAX_ATTR; ++i) {
     if (!indexedAttrs[i]) {
       continue;
     }
     auto& array = gx::g_gxState.arrays[i];
-    if (lastIndexedAttr >= 0 && array == gx::g_gxState.arrays[lastIndexedAttr]) {
-      // Reuse range from last attribute in shader
-      // Don't set the output range, so it remains unbound
-      const auto range = gx::g_gxState.arrays[lastIndexedAttr].cachedRange;
-      array.cachedRange = range;
-    } else if (array.cachedRange.size > 0) {
+    if (array.cachedRange.size > 0) {
       // Use the currently cached range
       ranges.vaRanges[i] = array.cachedRange;
     } else {
@@ -343,11 +306,10 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
       ranges.vaRanges[i] = range;
       array.cachedRange = range;
     }
-    lastIndexedAttr = i;
   }
 
   model::PipelineConfig config{};
-  populate_pipeline_config(config, GX_TRIANGLES);
+  populate_pipeline_config(config, GX_TRIANGLES, fmt);
   const auto info = gx::build_shader_info(config.shaderConfig);
   const auto bindGroups = gx::build_bind_groups(info, config.shaderConfig, ranges);
   const auto pipeline = pipeline_ref(config);
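
The deleted branch left an attribute's range unbound when it shared data with the previous attribute, relying on the shader to alias the earlier binding; after the rework each indexed attribute binds its own range, and sharing falls out of the cachedRange bookkeeping anyway (the second attribute hits the cached case, so the data is still uploaded only once). What remains is a push-or-reuse pattern per attribute. A sketch, where push_storage as the storage-buffer upload helper and the array.data/array.size fields are assumptions, since the else-branch body is elided between the two hunks:

    auto& array = gx::g_gxState.arrays[i];
    if (array.cachedRange.size > 0) {
      ranges.vaRanges[i] = array.cachedRange; // already resident: rebind
    } else {
      const auto range = push_storage(static_cast<const u8*>(array.data), array.size);
      ranges.vaRanges[i] = range;
      array.cachedRange = range;              // remember for later draws
    }
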
@@ -366,7 +328,7 @@ void queue_surface(const u8* dlStart, u32 dlSize) noexcept {
 
 State construct_state() { return {}; }
 
-wgpu::RenderPipeline create_pipeline(const State& state, [[maybe_unused]] const PipelineConfig& config) {
+wgpu::RenderPipeline create_pipeline(const State& state, const PipelineConfig& config) {
   const auto info = build_shader_info(config.shaderConfig); // TODO remove
   const auto shader = build_shader(config.shaderConfig, info);
 
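
With the vertex format recorded in PipelineConfig, create_pipeline now genuinely consumes config (the attribute layout below is derived from it), so the [[maybe_unused]] annotation is dropped.
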
@@ -385,7 +347,7 @@ wgpu::RenderPipeline create_pipeline(const State& state, [[maybe_unused]] const
   // Indexed attributes
   for (u32 i = 0; i < num4xAttr; ++i) {
     vtxAttrs[shaderLocation] = {
-        .format = wgpu::VertexFormat::Sint16x4,
+        .format = wgpu::VertexFormat::Uint16x4,
         .offset = offset,
         .shaderLocation = shaderLocation,
     };
@@ -394,7 +356,7 @@ wgpu::RenderPipeline create_pipeline(const State& state, [[maybe_unused]] const
   }
   for (u32 i = 0; i < num2xAttr; ++i) {
     vtxAttrs[shaderLocation] = {
-        .format = wgpu::VertexFormat::Sint16x2,
+        .format = wgpu::VertexFormat::Uint16x2,
         .offset = offset,
         .shaderLocation = shaderLocation,
     };
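
The switch from Sint16xN to Uint16xN matters because these vertex slots carry the u16 array indices written by prepare_vtx_buffer: read as signed, any index at or above 0x8000 would reach the shader negative. In isolation (C++20 conversion semantics):

    static_assert(static_cast<s16>(u16{0x8001}) == -32767); // what Sint16 would have done
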