Emit the "[loop]" attribute on "for" and "while" so that FXC does not attempt to unroll them.

This works around an FXC bug where it fails to unroll loops with gradient operations. FXC ostensibly unrolls such loops because gradient operations require uniform control flow, and loops with varying iteration counts may not be uniform. Tint will eventually validate that control flow is indeed uniform, so forcing FXC to avoid unrolling in these cases should be fine.

Bug: tint:1112
Change-Id: I10077f8b62fbbb230a0003f3864c75a8fe0e1d18
Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/69880
Kokoro: Kokoro <noreply+kokoro@google.com>
Reviewed-by: Ben Clayton <bclayton@google.com>
Commit-Queue: Antonio Maiorano <amaiorano@google.com>
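For context, a minimal sketch of the pattern this change produces (hypothetical standalone HLSL, not taken from this test's output; the buffer and entry-point names are invented for illustration):

// Tint now prefixes emitted loops with "[loop]" so that FXC keeps them as
// real loops instead of attempting to unroll them.
RWByteAddressBuffer data : register(u0);

[numthreads(1, 1, 1)]
void loop_example() {
  uint sum = 0u;
  // The trip count is only known at runtime; "[loop]" tells FXC not to
  // attempt unrolling this loop.
  [loop] for(uint i = 0u; (i < data.Load(0u)); i = (i + 1u)) {
    sum = (sum + data.Load((8u + (4u * i))));
  }
  data.Store(4u, sum);
}

The expected output below shows the same attribute applied to every loop emitted for this test's matrix-multiply shader.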
149 lines
4.9 KiB
HLSL
ByteAddressBuffer firstMatrix : register(t0, space0);
ByteAddressBuffer secondMatrix : register(t1, space0);
RWByteAddressBuffer resultMatrix : register(u2, space0);
cbuffer cbuffer_uniforms : register(b3, space0) {
  uint4 uniforms[1];
};

float mm_readA(uint row, uint col) {
  bool tint_tmp = (row < uniforms[0].x);
  if (tint_tmp) {
    tint_tmp = (col < uniforms[0].y);
  }
  if ((tint_tmp)) {
    const float result = asfloat(firstMatrix.Load((4u * ((row * uniforms[0].y) + col))));
    return result;
  }
  return 0.0f;
}

float mm_readB(uint row, uint col) {
  bool tint_tmp_1 = (row < uniforms[0].y);
  if (tint_tmp_1) {
    tint_tmp_1 = (col < uniforms[0].z);
  }
  if ((tint_tmp_1)) {
    const float result = asfloat(secondMatrix.Load((4u * ((row * uniforms[0].z) + col))));
    return result;
  }
  return 0.0f;
}

void mm_write(uint row, uint col, float value) {
  bool tint_tmp_2 = (row < uniforms[0].x);
  if (tint_tmp_2) {
    tint_tmp_2 = (col < uniforms[0].z);
  }
  if ((tint_tmp_2)) {
    const uint index = (col + (row * uniforms[0].z));
    resultMatrix.Store((4u * index), asuint(value));
  }
}

static const uint RowPerThread = 4u;
static const uint ColPerThread = 4u;
static const uint TileAOuter = 64u;
static const uint TileBOuter = 64u;
static const uint TileInner = 64u;
groupshared float mm_Asub[64][64];
groupshared float mm_Bsub[64][64];

struct tint_symbol_1 {
  uint3 local_id : SV_GroupThreadID;
  uint local_invocation_index : SV_GroupIndex;
  uint3 global_id : SV_DispatchThreadID;
};

void main_inner(uint3 local_id, uint3 global_id, uint local_invocation_index) {
  {
    [loop] for(uint idx = local_invocation_index; (idx < 4096u); idx = (idx + 256u)) {
      const uint i = (idx / 64u);
      const uint i_1 = (idx % 64u);
      mm_Asub[i][i_1] = 0.0f;
      mm_Bsub[i][i_1] = 0.0f;
    }
  }
  GroupMemoryBarrierWithGroupSync();
  const uint tileRow = (local_id.y * RowPerThread);
  const uint tileCol = (local_id.x * ColPerThread);
  const uint globalRow = (global_id.y * RowPerThread);
  const uint globalCol = (global_id.x * ColPerThread);
  const uint numTiles = (((uniforms[0].y - 1u) / TileInner) + 1u);
  float acc[16] = (float[16])0;
  float ACached = 0.0f;
  float BCached[4] = (float[4])0;
  {
    [loop] for(uint index = 0u; (index < (RowPerThread * ColPerThread)); index = (index + 1u)) {
      acc[index] = 0.0f;
    }
  }
  const uint ColPerThreadA = (TileInner / 16u);
  const uint tileColA = (local_id.x * ColPerThreadA);
  const uint RowPerThreadB = (TileInner / 16u);
  const uint tileRowB = (local_id.y * RowPerThreadB);
  {
    [loop] for(uint t = 0u; (t < numTiles); t = (t + 1u)) {
      {
        [loop] for(uint innerRow = 0u; (innerRow < RowPerThread); innerRow = (innerRow + 1u)) {
          {
            [loop] for(uint innerCol = 0u; (innerCol < ColPerThreadA); innerCol = (innerCol + 1u)) {
              const uint inputRow = (tileRow + innerRow);
              const uint inputCol = (tileColA + innerCol);
              mm_Asub[inputRow][inputCol] = mm_readA((globalRow + innerRow), ((t * TileInner) + inputCol));
            }
          }
        }
      }
      {
        [loop] for(uint innerRow = 0u; (innerRow < RowPerThreadB); innerRow = (innerRow + 1u)) {
          {
            [loop] for(uint innerCol = 0u; (innerCol < ColPerThread); innerCol = (innerCol + 1u)) {
              const uint inputRow = (tileRowB + innerRow);
              const uint inputCol = (tileCol + innerCol);
              mm_Bsub[innerCol][inputCol] = mm_readB(((t * TileInner) + inputRow), (globalCol + innerCol));
            }
          }
        }
      }
      GroupMemoryBarrierWithGroupSync();
      {
        [loop] for(uint k = 0u; (k < TileInner); k = (k + 1u)) {
          {
            [loop] for(uint inner = 0u; (inner < ColPerThread); inner = (inner + 1u)) {
              BCached[inner] = mm_Bsub[k][(tileCol + inner)];
            }
          }
          {
            [loop] for(uint innerRow = 0u; (innerRow < RowPerThread); innerRow = (innerRow + 1u)) {
              ACached = mm_Asub[(tileRow + innerRow)][k];
              {
                [loop] for(uint innerCol = 0u; (innerCol < ColPerThread); innerCol = (innerCol + 1u)) {
                  const uint index = ((innerRow * ColPerThread) + innerCol);
                  acc[index] = (acc[index] + (ACached * BCached[innerCol]));
                }
              }
            }
          }
        }
      }
      GroupMemoryBarrierWithGroupSync();
    }
  }
  {
    [loop] for(uint innerRow = 0u; (innerRow < RowPerThread); innerRow = (innerRow + 1u)) {
      {
        [loop] for(uint innerCol = 0u; (innerCol < ColPerThread); innerCol = (innerCol + 1u)) {
          const uint index = ((innerRow * ColPerThread) + innerCol);
          mm_write((globalRow + innerRow), (globalCol + innerCol), acc[index]);
        }
      }
    }
  }
}

[numthreads(16, 16, 1)]
void main(tint_symbol_1 tint_symbol) {
  main_inner(tint_symbol.local_id, tint_symbol.global_id, tint_symbol.local_invocation_index);
  return;
}