mirror of
https://github.com/encounter/dawn-cmake.git
synced 2025-06-07 07:03:42 +00:00
Add MatMul with 2-dimensional shared array shader
The previous matmul is using 1-dimensional shared array. This PR adds the 2-dimensional shared array. The perf result shows that: 1. For 1-dimensional shared array, enabe-robustness wil bring almost 2x regression both for matmulFloat and matmulVec4 on Intel CFL. 2. For 2-dimensional shared array, enabe-robustness will bring almost 2x regression on matmulFloat. But have little impact on matmulVec4 on Intel CFL. Tested on Intel_R_UHD_Graphics_630. shader enable robustness disable robustness MatMulFloatOneDimSharedArray 5383 us 3105 us MatMulFloatTwoDimSharedArray 4788 us 2608 us MatMulVec4OneDimSharedArray 3070 us 1743 us MatMulVec4TwoDimSharedArray 1840 us 1802 us Bug: dawn:594 Change-Id: Ia29a78cf70649ef8d3ba8476db1ad4d6ded80840 Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/50481 Reviewed-by: Jiawei Shao <jiawei.shao@intel.com> Reviewed-by: Austin Eng <enga@chromium.org> Commit-Queue: Jiajia Qin <jiajia.qin@intel.com>
This commit is contained in:
parent
044b188990
commit
72ee6ade31
@ -20,7 +20,7 @@
|
|||||||
namespace {
|
namespace {
|
||||||
constexpr uint32_t kTileSize = 64u;
|
constexpr uint32_t kTileSize = 64u;
|
||||||
|
|
||||||
constexpr char kMatMulFloat[] = R"(
|
const std::string& kMatMulFloatHeader = R"(
|
||||||
[[block]] struct Uniforms {
|
[[block]] struct Uniforms {
|
||||||
dimAOuter : u32;
|
dimAOuter : u32;
|
||||||
dimInner : u32;
|
dimInner : u32;
|
||||||
@ -65,11 +65,15 @@ namespace {
|
|||||||
let ColPerThread : u32 = 4u;
|
let ColPerThread : u32 = 4u;
|
||||||
let TileAOuter : u32 = 64u;
|
let TileAOuter : u32 = 64u;
|
||||||
let TileBOuter : u32 = 64u;
|
let TileBOuter : u32 = 64u;
|
||||||
let TileInner : u32 = 64u;
|
let TileInner : u32 = 64u;)";
|
||||||
|
|
||||||
|
const std::string& kMatMulFloatSharedArray1D = R"(
|
||||||
var<workgroup> mm_Asub : array<f32, 4096>;
|
var<workgroup> mm_Asub : array<f32, 4096>;
|
||||||
var<workgroup> mm_Bsub : array<f32, 4096>;
|
var<workgroup> mm_Bsub : array<f32, 4096>;)";
|
||||||
|
const std::string& kMatMulFloatSharedArray2D = R"(
|
||||||
|
var<workgroup> mm_Asub : array<array<f32, 64>, 64>;
|
||||||
|
var<workgroup> mm_Bsub : array<array<f32, 64>, 64>;)";
|
||||||
|
const std::string& kMatMulFloatBodyPart1 = R"(
|
||||||
[[stage(compute), workgroup_size(16, 16, 1)]]
|
[[stage(compute), workgroup_size(16, 16, 1)]]
|
||||||
fn main([[builtin(local_invocation_id)]] local_id : vec3<u32>,
|
fn main([[builtin(local_invocation_id)]] local_id : vec3<u32>,
|
||||||
[[builtin(global_invocation_id)]] global_id : vec3<u32>) {
|
[[builtin(global_invocation_id)]] global_id : vec3<u32>) {
|
||||||
@ -103,7 +107,8 @@ namespace {
|
|||||||
for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow = innerRow + 1u) {
|
for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow = innerRow + 1u) {
|
||||||
for (var innerCol : u32 = 0u; innerCol < ColPerThreadA; innerCol = innerCol + 1u) {
|
for (var innerCol : u32 = 0u; innerCol < ColPerThreadA; innerCol = innerCol + 1u) {
|
||||||
let inputRow : u32 = tileRow + innerRow;
|
let inputRow : u32 = tileRow + innerRow;
|
||||||
let inputCol : u32 = tileColA + innerCol;
|
let inputCol : u32 = tileColA + innerCol;)";
|
||||||
|
const std::string& kMatMulFloatBodyPart2Array1D = R"(
|
||||||
let index : u32 = inputRow * TileInner + inputCol;
|
let index : u32 = inputRow * TileInner + inputCol;
|
||||||
mm_Asub[index] = mm_readA(globalRow + innerRow, t * TileInner + inputCol);
|
mm_Asub[index] = mm_readA(globalRow + innerRow, t * TileInner + inputCol);
|
||||||
}
|
}
|
||||||
@ -128,7 +133,32 @@ namespace {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow = innerRow + 1u) {
|
for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow = innerRow + 1u) {
|
||||||
ACached = mm_Asub[(tileRow + innerRow) * TileInner + k];
|
ACached = mm_Asub[(tileRow + innerRow) * TileInner + k];)";
|
||||||
|
const std::string& kMatMulFloatBodyPart2Array2D = R"(
|
||||||
|
mm_Asub[inputRow][inputCol] = mm_readA(globalRow + innerRow, t * TileInner + inputCol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Load one tile of B into local memory.
|
||||||
|
for (var innerRow : u32 = 0u; innerRow < RowPerThreadB; innerRow = innerRow + 1u) {
|
||||||
|
for (var innerCol : u32 = 0u; innerCol < ColPerThread; innerCol = innerCol + 1u) {
|
||||||
|
let inputRow : u32 = tileRowB + innerRow;
|
||||||
|
let inputCol : u32 = tileCol + innerCol;
|
||||||
|
|
||||||
|
mm_Bsub[innerCol][inputCol] = mm_readB(t * TileInner + inputRow, globalCol + innerCol);;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
workgroupBarrier();
|
||||||
|
|
||||||
|
// Compute acc values for a single thread.
|
||||||
|
for (var k : u32 = 0u; k < TileInner; k = k + 1u) {
|
||||||
|
for (var inner : u32 = 0u; inner < ColPerThread; inner = inner + 1u) {
|
||||||
|
BCached[inner] = mm_Bsub[k][tileCol + inner];
|
||||||
|
}
|
||||||
|
|
||||||
|
for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow = innerRow + 1u) {
|
||||||
|
ACached = mm_Asub[tileRow + innerRow][k];)";
|
||||||
|
const std::string& kMatMulFloatBodyPart3 = R"(
|
||||||
for (var innerCol : u32 = 0u; innerCol < ColPerThread; innerCol = innerCol + 1u) {
|
for (var innerCol : u32 = 0u; innerCol < ColPerThread; innerCol = innerCol + 1u) {
|
||||||
let index : u32 = innerRow * ColPerThread + innerCol;
|
let index : u32 = innerRow * ColPerThread + innerCol;
|
||||||
acc[index] = acc[index] + ACached * BCached[innerCol];
|
acc[index] = acc[index] + ACached * BCached[innerCol];
|
||||||
@ -148,9 +178,16 @@ namespace {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
})";
|
})";
|
||||||
|
const std::string& kMatMulFloatOneDimensionalSharedArray =
|
||||||
|
kMatMulFloatHeader + kMatMulFloatSharedArray1D + kMatMulFloatBodyPart1 +
|
||||||
|
kMatMulFloatBodyPart2Array1D + kMatMulFloatBodyPart3;
|
||||||
|
|
||||||
|
const std::string& kMatMulFloatTwoDimensionalSharedArray =
|
||||||
|
kMatMulFloatHeader + kMatMulFloatSharedArray2D + kMatMulFloatBodyPart1 +
|
||||||
|
kMatMulFloatBodyPart2Array2D + kMatMulFloatBodyPart3;
|
||||||
|
|
||||||
// The vec4 version requires that dimInner and dimBOuter are divisible by 4.
|
// The vec4 version requires that dimInner and dimBOuter are divisible by 4.
|
||||||
constexpr char kMatMulVec4[] = R"(
|
const std::string& kMatMulVec4Header = R"(
|
||||||
[[block]] struct Uniforms {
|
[[block]] struct Uniforms {
|
||||||
dimAOuter : u32;
|
dimAOuter : u32;
|
||||||
dimInner : u32;
|
dimInner : u32;
|
||||||
@ -195,11 +232,14 @@ namespace {
|
|||||||
let ColPerThread : u32 = 4u;
|
let ColPerThread : u32 = 4u;
|
||||||
let TileAOuter : u32 = 64u;
|
let TileAOuter : u32 = 64u;
|
||||||
let TileBOuter : u32 = 64u;
|
let TileBOuter : u32 = 64u;
|
||||||
let TileInner : u32 = 64u;
|
let TileInner : u32 = 64u;)";
|
||||||
|
const std::string& kMatMulVec4SharedArray1D = R"(
|
||||||
var<workgroup> mm_Asub : array<vec4<f32>, 1024>;
|
var<workgroup> mm_Asub : array<vec4<f32>, 1024>;
|
||||||
var<workgroup> mm_Bsub : array<vec4<f32>, 1024>;
|
var<workgroup> mm_Bsub : array<vec4<f32>, 1024>;)";
|
||||||
|
const std::string& kMatMulVec4SharedArray2D = R"(
|
||||||
|
var<workgroup> mm_Asub : array<array<vec4<f32>, 16>, 64>;
|
||||||
|
var<workgroup> mm_Bsub : array<array<vec4<f32>, 16>, 64>;)";
|
||||||
|
const std::string& kMatMulVec4BodyPart1 = R"(
|
||||||
[[stage(compute), workgroup_size(16, 16, 1)]]
|
[[stage(compute), workgroup_size(16, 16, 1)]]
|
||||||
fn main([[builtin(local_invocation_id)]] local_id : vec3<u32>,
|
fn main([[builtin(local_invocation_id)]] local_id : vec3<u32>,
|
||||||
[[builtin(global_invocation_id)]] global_id : vec3<u32>) {
|
[[builtin(global_invocation_id)]] global_id : vec3<u32>) {
|
||||||
@ -231,7 +271,8 @@ namespace {
|
|||||||
// Load one tile of A into local memory.
|
// Load one tile of A into local memory.
|
||||||
for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow = innerRow + 1u) {
|
for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow = innerRow + 1u) {
|
||||||
let inputRow : u32 = tileRow + innerRow;
|
let inputRow : u32 = tileRow + innerRow;
|
||||||
let inputCol : u32 = tileCol;
|
let inputCol : u32 = tileCol;)";
|
||||||
|
const std::string& kMatMulVec4BodyPart2Array1D = R"(
|
||||||
let index : u32 = inputRow * TileInner / ColPerThread + inputCol;
|
let index : u32 = inputRow * TileInner / ColPerThread + inputCol;
|
||||||
mm_Asub[index] = mm_readA(globalRow + innerRow, globalColA);
|
mm_Asub[index] = mm_readA(globalRow + innerRow, globalColA);
|
||||||
}
|
}
|
||||||
@ -255,7 +296,31 @@ namespace {
|
|||||||
BCached[3] = mm_Bsub[(k * ColPerThread + 3u) * (TileBOuter / ColPerThread) + tileCol];
|
BCached[3] = mm_Bsub[(k * ColPerThread + 3u) * (TileBOuter / ColPerThread) + tileCol];
|
||||||
|
|
||||||
for (var i : u32 = 0u; i < RowPerThread; i = i + 1u) {
|
for (var i : u32 = 0u; i < RowPerThread; i = i + 1u) {
|
||||||
ACached = mm_Asub[(tileRow + i) * (TileInner / ColPerThread) + k];
|
ACached = mm_Asub[(tileRow + i) * (TileInner / ColPerThread) + k];)";
|
||||||
|
const std::string& kMatMulVec4BodyPart2Array2D = R"(
|
||||||
|
mm_Asub[inputRow][inputCol] = mm_readA(globalRow + innerRow, globalColA);
|
||||||
|
}
|
||||||
|
globalColA = globalColA + TileInner / ColPerThread;
|
||||||
|
|
||||||
|
// Load one tile of B into local memory.
|
||||||
|
for (var innerRow : u32 = 0u; innerRow < RowPerThreadB; innerRow = innerRow + 1u) {
|
||||||
|
let inputRow : u32 = tileRowB + innerRow;
|
||||||
|
let inputCol : u32 = tileCol;
|
||||||
|
mm_Bsub[inputRow][inputCol] = mm_readB(t * TileInner + inputRow, globalCol);;
|
||||||
|
}
|
||||||
|
|
||||||
|
workgroupBarrier();
|
||||||
|
|
||||||
|
// Compute acc values for a single thread.
|
||||||
|
for (var k : u32 = 0u; k < TileInner / ColPerThread; k = k + 1u) {
|
||||||
|
BCached[0] = mm_Bsub[k * ColPerThread][tileCol];
|
||||||
|
BCached[1] = mm_Bsub[k * ColPerThread + 1u][tileCol];
|
||||||
|
BCached[2] = mm_Bsub[k * ColPerThread + 2u][tileCol];
|
||||||
|
BCached[3] = mm_Bsub[k * ColPerThread + 3u][tileCol];
|
||||||
|
|
||||||
|
for (var i : u32 = 0u; i < RowPerThread; i = i + 1u) {
|
||||||
|
ACached = mm_Asub[tileRow + i][k];)";
|
||||||
|
const std::string& kMatMulVec4BodyPart3 = R"(
|
||||||
acc[i] = BCached[0] * ACached.x + acc[i];
|
acc[i] = BCached[0] * ACached.x + acc[i];
|
||||||
acc[i] = BCached[1] * ACached.y + acc[i];
|
acc[i] = BCached[1] * ACached.y + acc[i];
|
||||||
acc[i] = BCached[2] * ACached.z + acc[i];
|
acc[i] = BCached[2] * ACached.z + acc[i];
|
||||||
@ -272,11 +337,22 @@ namespace {
|
|||||||
acc[innerRow]);
|
acc[innerRow]);
|
||||||
}
|
}
|
||||||
})";
|
})";
|
||||||
|
|
||||||
|
const std::string& kMatMulVec4OneDimensionalSharedArray =
|
||||||
|
kMatMulVec4Header + kMatMulVec4SharedArray1D + kMatMulVec4BodyPart1 +
|
||||||
|
kMatMulVec4BodyPart2Array1D + kMatMulVec4BodyPart3;
|
||||||
|
|
||||||
|
const std::string& kMatMulVec4TwoDimensionalSharedArray =
|
||||||
|
kMatMulVec4Header + kMatMulVec4SharedArray2D + kMatMulVec4BodyPart1 +
|
||||||
|
kMatMulVec4BodyPart2Array2D + kMatMulVec4BodyPart3;
|
||||||
|
|
||||||
constexpr unsigned int kNumIterations = 50;
|
constexpr unsigned int kNumIterations = 50;
|
||||||
|
|
||||||
enum class MatMulMethod {
|
enum class MatMulMethod {
|
||||||
MatMulFloat,
|
MatMulFloatOneDimSharedArray,
|
||||||
MatMulVec4,
|
MatMulFloatTwoDimSharedArray,
|
||||||
|
MatMulVec4OneDimSharedArray,
|
||||||
|
MatMulVec4TwoDimSharedArray
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ShaderRobustnessParams : AdapterTestParam {
|
struct ShaderRobustnessParams : AdapterTestParam {
|
||||||
@ -301,11 +377,17 @@ namespace {
|
|||||||
std::ostream& operator<<(std::ostream& ostream, const ShaderRobustnessParams& param) {
|
std::ostream& operator<<(std::ostream& ostream, const ShaderRobustnessParams& param) {
|
||||||
ostream << static_cast<const AdapterTestParam&>(param);
|
ostream << static_cast<const AdapterTestParam&>(param);
|
||||||
switch (param.matmulMethod) {
|
switch (param.matmulMethod) {
|
||||||
case MatMulMethod::MatMulFloat:
|
case MatMulMethod::MatMulFloatOneDimSharedArray:
|
||||||
ostream << "_MatMulFloat";
|
ostream << "_MatMulFloatOneDimSharedArray";
|
||||||
break;
|
break;
|
||||||
case MatMulMethod::MatMulVec4:
|
case MatMulMethod::MatMulFloatTwoDimSharedArray:
|
||||||
ostream << "_MatMulVec4";
|
ostream << "_MatMulFloatTwoDimSharedArray";
|
||||||
|
break;
|
||||||
|
case MatMulMethod::MatMulVec4OneDimSharedArray:
|
||||||
|
ostream << "_MatMulVec4OneDimSharedArray";
|
||||||
|
break;
|
||||||
|
case MatMulMethod::MatMulVec4TwoDimSharedArray:
|
||||||
|
ostream << "_MatMulVec4TwoDimSharedArray";
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -342,6 +424,10 @@ class ShaderRobustnessPerf : public DawnPerfTestWithParams<ShaderRobustnessParam
|
|||||||
|
|
||||||
void ShaderRobustnessPerf::SetUp() {
|
void ShaderRobustnessPerf::SetUp() {
|
||||||
DawnPerfTestWithParams<ShaderRobustnessParams>::SetUp();
|
DawnPerfTestWithParams<ShaderRobustnessParams>::SetUp();
|
||||||
|
|
||||||
|
// TODO(crbug.com/dawn/786): D3D12_Microsoft_Basic_Render_Driver_CPU
|
||||||
|
DAWN_SKIP_TEST_IF(IsD3D12() && IsWARP());
|
||||||
|
|
||||||
const size_t dataASize = mDimAOuter * mDimInner;
|
const size_t dataASize = mDimAOuter * mDimInner;
|
||||||
std::vector<float> dataA(dataASize);
|
std::vector<float> dataA(dataASize);
|
||||||
uint64_t byteASize = sizeof(float) * dataA.size();
|
uint64_t byteASize = sizeof(float) * dataA.size();
|
||||||
@ -367,13 +453,27 @@ void ShaderRobustnessPerf::SetUp() {
|
|||||||
|
|
||||||
wgpu::ShaderModule module;
|
wgpu::ShaderModule module;
|
||||||
switch (GetParam().matmulMethod) {
|
switch (GetParam().matmulMethod) {
|
||||||
case MatMulMethod::MatMulFloat: {
|
case MatMulMethod::MatMulFloatOneDimSharedArray: {
|
||||||
module = utils::CreateShaderModule(device, kMatMulFloat);
|
module =
|
||||||
|
utils::CreateShaderModule(device, kMatMulFloatOneDimensionalSharedArray.c_str());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
case MatMulMethod::MatMulVec4: {
|
case MatMulMethod::MatMulFloatTwoDimSharedArray: {
|
||||||
module = utils::CreateShaderModule(device, kMatMulVec4);
|
module =
|
||||||
|
utils::CreateShaderModule(device, kMatMulFloatTwoDimensionalSharedArray.c_str());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case MatMulMethod::MatMulVec4OneDimSharedArray: {
|
||||||
|
module =
|
||||||
|
utils::CreateShaderModule(device, kMatMulVec4OneDimensionalSharedArray.c_str());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case MatMulMethod::MatMulVec4TwoDimSharedArray: {
|
||||||
|
module =
|
||||||
|
utils::CreateShaderModule(device, kMatMulVec4TwoDimensionalSharedArray.c_str());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -423,7 +523,10 @@ DAWN_INSTANTIATE_PERF_TEST_SUITE_P(ShaderRobustnessPerf,
|
|||||||
MetalBackend(), MetalBackend({"disable_robustness"}, {}),
|
MetalBackend(), MetalBackend({"disable_robustness"}, {}),
|
||||||
OpenGLBackend(), OpenGLBackend({"disable_robustness"}, {}),
|
OpenGLBackend(), OpenGLBackend({"disable_robustness"}, {}),
|
||||||
VulkanBackend(), VulkanBackend({"disable_robustness"}, {})},
|
VulkanBackend(), VulkanBackend({"disable_robustness"}, {})},
|
||||||
{MatMulMethod::MatMulFloat, MatMulMethod::MatMulVec4},
|
{MatMulMethod::MatMulFloatOneDimSharedArray,
|
||||||
|
MatMulMethod::MatMulFloatTwoDimSharedArray,
|
||||||
|
MatMulMethod::MatMulVec4OneDimSharedArray,
|
||||||
|
MatMulMethod::MatMulVec4TwoDimSharedArray},
|
||||||
{512},
|
{512},
|
||||||
{512},
|
{512},
|
||||||
{512});
|
{512});
|
||||||
|
Loading…
x
Reference in New Issue
Block a user