dawn-cmake/test/bug/tint/914.wgsl.expected.hlsl
Ben Clayton 51cfe26bb7 writer/hlsl: Simplify UBO accesses for static indexing
Use the new semantic constant value information to significantly reduce the complex indexing logic emitted for UBO accesses.
This will dramatically reduce the number of `for` loops that are decayed to `while` loops.

Change-Id: I1b0adb5edde2b4ed39c6beafc2e28106b86e0edd
Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/57701
Kokoro: Kokoro <noreply+kokoro@google.com>
Reviewed-by: James Price <jrprice@google.com>
2021-07-13 12:18:13 +00:00

160 lines
5.0 KiB
HLSL

// Resource bindings for the matrix-multiply kernel (all in register space 0).
//
// firstMatrix / secondMatrix are read-only raw buffers; mm_readA / mm_readB
// load one 32-bit word per element and reinterpret it as a float.
ByteAddressBuffer firstMatrix : register(t0, space0);
ByteAddressBuffer secondMatrix : register(t1, space0);
// resultMatrix is the writable raw buffer that mm_write stores into.
RWByteAddressBuffer resultMatrix : register(u2, space0);
// Uniform block packed into a single uint4. As used by the helpers below:
//   uniforms[0].x - row bound for firstMatrix reads and resultMatrix writes
//   uniforms[0].y - column bound (and row stride) of firstMatrix, and the
//                   row bound for secondMatrix reads
//   uniforms[0].z - column bound (and row stride) of secondMatrix and
//                   resultMatrix
// uniforms[0].w is not read by the code in this file.
cbuffer cbuffer_uniforms : register(b3, space0) {
uint4 uniforms[1];
};
// Bounds-checked fetch of element (row, col) from firstMatrix.
//
// The element is read only when row < uniforms[0].x and col < uniforms[0].y;
// any other coordinate yields 0.0f. Elements are 4 bytes wide and laid out
// row-major with a row stride of uniforms[0].y elements.
float mm_readA(uint row, uint col) {
  if (row >= uniforms[0].x) {
    return 0.0f;
  }
  if (col >= uniforms[0].y) {
    return 0.0f;
  }
  const uint element = (row * uniforms[0].y) + col;
  const uint byte_offset = 4u * element;
  return asfloat(firstMatrix.Load(byte_offset));
}
// Bounds-checked fetch of element (row, col) from secondMatrix.
//
// The element is read only when row < uniforms[0].y and col < uniforms[0].z;
// any other coordinate yields 0.0f. Elements are 4 bytes wide and laid out
// row-major with a row stride of uniforms[0].z elements.
float mm_readB(uint row, uint col) {
  if (row >= uniforms[0].y) {
    return 0.0f;
  }
  if (col >= uniforms[0].z) {
    return 0.0f;
  }
  const uint element = (row * uniforms[0].z) + col;
  const uint byte_offset = 4u * element;
  return asfloat(secondMatrix.Load(byte_offset));
}
// Bounds-checked store of `value` to element (row, col) of resultMatrix.
//
// The store happens only when row < uniforms[0].x and col < uniforms[0].z;
// out-of-range coordinates are silently ignored. Elements are 4 bytes wide
// and laid out row-major with a row stride of uniforms[0].z elements.
void mm_write(uint row, uint col, float value) {
  if (row >= uniforms[0].x) {
    return;
  }
  if (col >= uniforms[0].z) {
    return;
  }
  const uint element = col + (row * uniforms[0].z);
  const uint byte_offset = 4u * element;
  resultMatrix.Store(byte_offset, asuint(value));
}
// Tiling parameters for the kernel in main().
//
// Each invocation accumulates a RowPerThread x ColPerThread block of results.
static const uint RowPerThread = 4u;
static const uint ColPerThread = 4u;
// TileAOuter / TileBOuter are not referenced by the visible code; the shared
// tile dimensions below are spelled as literal 64s instead.
static const uint TileAOuter = 64u;
static const uint TileBOuter = 64u;
// Number of elements along the shared (inner) dimension consumed per tile
// iteration in main().
static const uint TileInner = 64u;
// Workgroup-shared staging tiles: main() fills mm_Asub from firstMatrix and
// mm_Bsub from secondMatrix, then multiplies out of them after a barrier.
groupshared float mm_Asub[64][64];
groupshared float mm_Bsub[64][64];
// Entry-point input bundle for main(); each field is bound to a compute
// system-value semantic.
struct tint_symbol_1 {
// Bound to SV_GroupThreadID; main() uses .x/.y to pick this invocation's
// position inside the shared tiles.
uint3 local_id : SV_GroupThreadID;
// Bound to SV_GroupIndex; main() uses it to select the single invocation
// (index 0) that zero-fills the shared tiles.
uint local_invocation_index : SV_GroupIndex;
// Bound to SV_DispatchThreadID; main() uses .x/.y to pick this invocation's
// position in the full matrices.
uint3 global_id : SV_DispatchThreadID;
};
// Compute entry point: tiled matrix multiply of firstMatrix by secondMatrix
// into resultMatrix.
//
// A 16x16 workgroup cooperates on shared 64x64 tiles (mm_Asub / mm_Bsub).
// Each invocation owns a RowPerThread x ColPerThread block of outputs held
// in `acc`. For every tile along the inner dimension the workgroup:
//   1. stages a tile of each input matrix into shared memory,
//   2. synchronizes,
//   3. accumulates the partial products from the shared tiles,
//   4. synchronizes again before the tiles are overwritten.
// All global reads and writes go through the bounds-checked helpers
// mm_readA / mm_readB / mm_write, so out-of-range coordinates read as 0.0f
// and out-of-range writes are dropped.
//
// The bare `{ ... }` scopes around the loops are kept from the generated
// code; presumably they keep same-named loop variables from colliding under
// compilers that hoist for-loop declarations -- verify before removing.
[numthreads(16, 16, 1)]
void main(tint_symbol_1 tint_symbol) {
  const uint3 local_id = tint_symbol.local_id;
  const uint3 global_id = tint_symbol.global_id;
  const uint local_invocation_index = tint_symbol.local_invocation_index;

  // One invocation zero-fills both shared tiles; the barrier below makes
  // the zeros visible to the rest of the workgroup.
  if ((local_invocation_index == 0u)) {
    {
      for(int i = 0; (i < 64); i = (i + 1)) {
        {
          for(int i_1 = 0; (i_1 < 64); i_1 = (i_1 + 1)) {
            mm_Asub[i][i_1] = 0.0f;
          }
        }
      }
    }
    {
      for(int i_2 = 0; (i_2 < 64); i_2 = (i_2 + 1)) {
        {
          for(int i_3 = 0; (i_3 < 64); i_3 = (i_3 + 1)) {
            mm_Bsub[i_2][i_3] = 0.0f;
          }
        }
      }
    }
  }
  GroupMemoryBarrierWithGroupSync();

  // Top-left corner of this invocation's output block, in tile-local and
  // global coordinates.
  const uint tileRow = (local_id.y * RowPerThread);
  const uint tileCol = (local_id.x * ColPerThread);
  const uint globalRow = (global_id.y * RowPerThread);
  const uint globalCol = (global_id.x * ColPerThread);

  // Number of TileInner-wide tiles needed to cover uniforms[0].y, rounded up.
  // NOTE: if uniforms[0].y is 0 the unsigned subtraction wraps and this
  // becomes a very large count; reads are still bounds-checked to 0.0f.
  const uint numTiles = (((uniforms[0].y - 1u) / TileInner) + 1u);

  float acc[16] = (float[16])0;
  float ACached = 0.0f;
  float BCached[4] = (float[4])0;
  {
    for(uint index = 0u; (index < (RowPerThread * ColPerThread)); index = (index + 1u)) {
      acc[index] = 0.0f;
    }
  }

  // Split of the TileInner dimension across the 16 invocations per axis:
  // each invocation stages ColPerThreadA columns of the A tile and
  // RowPerThreadB rows of the B tile.
  const uint ColPerThreadA = (TileInner / 16u);
  const uint tileColA = (local_id.x * ColPerThreadA);
  const uint RowPerThreadB = (TileInner / 16u);
  const uint tileRowB = (local_id.y * RowPerThreadB);
  {
    for(uint t = 0u; (t < numTiles); t = (t + 1u)) {
      // Stage this invocation's share of the A tile.
      {
        for(uint innerRow = 0u; (innerRow < RowPerThread); innerRow = (innerRow + 1u)) {
          {
            for(uint innerCol = 0u; (innerCol < ColPerThreadA); innerCol = (innerCol + 1u)) {
              const uint inputRow = (tileRow + innerRow);
              const uint inputCol = (tileColA + innerCol);
              mm_Asub[inputRow][inputCol] = mm_readA((globalRow + innerRow), ((t * TileInner) + inputCol));
            }
          }
        }
      }
      // Stage this invocation's share of the B tile. The destination row
      // must be inputRow (the tile row being read), matching the A-tile
      // load above; indexing by innerCol would leave most of the tile
      // unwritten and make invocations with different local_id.y write
      // the same elements.
      {
        for(uint innerRow = 0u; (innerRow < RowPerThreadB); innerRow = (innerRow + 1u)) {
          {
            for(uint innerCol = 0u; (innerCol < ColPerThread); innerCol = (innerCol + 1u)) {
              const uint inputRow = (tileRowB + innerRow);
              const uint inputCol = (tileCol + innerCol);
              mm_Bsub[inputRow][inputCol] = mm_readB(((t * TileInner) + inputRow), (globalCol + innerCol));
            }
          }
        }
      }
      // Both tiles must be fully staged before any invocation reads them.
      GroupMemoryBarrierWithGroupSync();
      {
        for(uint k = 0u; (k < TileInner); k = (k + 1u)) {
          // Cache row k of the B tile for this invocation's columns.
          {
            for(uint inner = 0u; (inner < ColPerThread); inner = (inner + 1u)) {
              BCached[inner] = mm_Bsub[k][(tileCol + inner)];
            }
          }
          // Accumulate A[row][k] * B[k][col] into the per-invocation block.
          {
            for(uint innerRow = 0u; (innerRow < RowPerThread); innerRow = (innerRow + 1u)) {
              ACached = mm_Asub[(tileRow + innerRow)][k];
              {
                for(uint innerCol = 0u; (innerCol < ColPerThread); innerCol = (innerCol + 1u)) {
                  const uint index = ((innerRow * ColPerThread) + innerCol);
                  acc[index] = (acc[index] + (ACached * BCached[innerCol]));
                }
              }
            }
          }
        }
      }
      // Finish reading the tiles before the next iteration overwrites them.
      GroupMemoryBarrierWithGroupSync();
    }
  }

  // Write the accumulated block out; mm_write drops out-of-range elements.
  {
    for(uint innerRow = 0u; (innerRow < RowPerThread); innerRow = (innerRow + 1u)) {
      {
        for(uint innerCol = 0u; (innerCol < ColPerThread); innerCol = (innerCol + 1u)) {
          const uint index = ((innerRow * ColPerThread) + innerCol);
          mm_write((globalRow + innerRow), (globalCol + innerCol), acc[index]);
        }
      }
    }
  }
  return;
}