Add MatMul with 2-dimensional shared array shader

The previous matmul is using 1-dimensional shared array. This PR
adds the 2-dimensional shared array. The perf result shows that:
1. For 1-dimensional shared array, enable-robustness will bring almost 2x
regression both for matmulFloat and matmulVec4 on Intel CFL.
2. For 2-dimensional shared array, enable-robustness will bring almost 2x
regression on matmulFloat. But have little impact on matmulVec4 on Intel
CFL.

Tested on Intel_R_UHD_Graphics_630.
shader                     enable robustness    disable robustness
MatMulFloatOneDimSharedArray    5383 us            3105 us
MatMulFloatTwoDimSharedArray    4788 us            2608 us
MatMulVec4OneDimSharedArray     3070 us            1743 us
MatMulVec4TwoDimSharedArray     1840 us            1802 us

Bug: dawn:594
Change-Id: Ia29a78cf70649ef8d3ba8476db1ad4d6ded80840
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/50481
Reviewed-by: Jiawei Shao <jiawei.shao@intel.com>
Reviewed-by: Austin Eng <enga@chromium.org>
Commit-Queue: Jiajia Qin <jiajia.qin@intel.com>
This commit is contained in:
Qin Jiajia 2021-05-13 00:55:41 +00:00 committed by Commit Bot service account
parent 044b188990
commit 72ee6ade31
1 changed file with 127 additions and 24 deletions

View File

@ -20,7 +20,7 @@
namespace {
constexpr uint32_t kTileSize = 64u;
constexpr char kMatMulFloat[] = R"(
const std::string& kMatMulFloatHeader = R"(
[[block]] struct Uniforms {
dimAOuter : u32;
dimInner : u32;
@ -65,11 +65,15 @@ namespace {
let ColPerThread : u32 = 4u;
let TileAOuter : u32 = 64u;
let TileBOuter : u32 = 64u;
let TileInner : u32 = 64u;
let TileInner : u32 = 64u;)";
// Workgroup tiles of A and B for the float shader, stored as flat 1-D arrays
// (a 64x64 f32 tile, indexed manually as row * TileInner + col).
// NOTE(review): the previous rendering declared mm_Bsub twice — a stale
// pre-image diff line; WGSL forbids redeclaring a module-scope variable.
const std::string& kMatMulFloatSharedArray1D = R"(
var<workgroup> mm_Asub : array<f32, 4096>;
var<workgroup> mm_Bsub : array<f32, 4096>;)";
// Workgroup tiles of A and B for the float shader, stored as 2-D arrays:
// mm_Xsub[row][col] over a 64x64 f32 tile, so the shader indexes directly
// instead of flattening to row * TileInner + col by hand.
const std::string& kMatMulFloatSharedArray2D = R"(
var<workgroup> mm_Asub : array<array<f32, 64>, 64>;
var<workgroup> mm_Bsub : array<array<f32, 64>, 64>;)";
const std::string& kMatMulFloatBodyPart1 = R"(
[[stage(compute), workgroup_size(16, 16, 1)]]
fn main([[builtin(local_invocation_id)]] local_id : vec3<u32>,
[[builtin(global_invocation_id)]] global_id : vec3<u32>) {
@ -103,7 +107,8 @@ namespace {
for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow = innerRow + 1u) {
for (var innerCol : u32 = 0u; innerCol < ColPerThreadA; innerCol = innerCol + 1u) {
let inputRow : u32 = tileRow + innerRow;
let inputCol : u32 = tileColA + innerCol;
let inputCol : u32 = tileColA + innerCol;)";
const std::string& kMatMulFloatBodyPart2Array1D = R"(
let index : u32 = inputRow * TileInner + inputCol;
mm_Asub[index] = mm_readA(globalRow + innerRow, t * TileInner + inputCol);
}
@ -128,7 +133,32 @@ namespace {
}
for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow = innerRow + 1u) {
ACached = mm_Asub[(tileRow + innerRow) * TileInner + k];
ACached = mm_Asub[(tileRow + innerRow) * TileInner + k];)";
// Inner loops of the float matmul using the 2-D shared arrays: load one tile
// of A and one tile of B, barrier, then accumulate partial products.
// Fixes vs. previous version:
//  - The B tile was written as mm_Bsub[innerCol][inputCol], but the compute
//    loop reads mm_Bsub[k][...] with k in [0, TileInner); innerCol only spans
//    [0, ColPerThread), leaving rows 4..63 unwritten. The row index must be
//    inputRow, mirroring the A-tile store.
//  - Removed a stray double semicolon after the mm_readB call.
const std::string& kMatMulFloatBodyPart2Array2D = R"(
mm_Asub[inputRow][inputCol] = mm_readA(globalRow + innerRow, t * TileInner + inputCol);
}
}
// Load one tile of B into local memory.
for (var innerRow : u32 = 0u; innerRow < RowPerThreadB; innerRow = innerRow + 1u) {
for (var innerCol : u32 = 0u; innerCol < ColPerThread; innerCol = innerCol + 1u) {
let inputRow : u32 = tileRowB + innerRow;
let inputCol : u32 = tileCol + innerCol;
mm_Bsub[inputRow][inputCol] = mm_readB(t * TileInner + inputRow, globalCol + innerCol);
}
}
workgroupBarrier();
// Compute acc values for a single thread.
for (var k : u32 = 0u; k < TileInner; k = k + 1u) {
for (var inner : u32 = 0u; inner < ColPerThread; inner = inner + 1u) {
BCached[inner] = mm_Bsub[k][tileCol + inner];
}
for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow = innerRow + 1u) {
ACached = mm_Asub[tileRow + innerRow][k];)";
const std::string& kMatMulFloatBodyPart3 = R"(
for (var innerCol : u32 = 0u; innerCol < ColPerThread; innerCol = innerCol + 1u) {
let index : u32 = innerRow * ColPerThread + innerCol;
acc[index] = acc[index] + ACached * BCached[innerCol];
@ -148,9 +178,16 @@ namespace {
}
}
})";
// Complete WGSL source for the float matmul built from the shared header/body
// fragments, using the flat 1-D workgroup shared array.
const std::string& kMatMulFloatOneDimensionalSharedArray =
kMatMulFloatHeader + kMatMulFloatSharedArray1D + kMatMulFloatBodyPart1 +
kMatMulFloatBodyPart2Array1D + kMatMulFloatBodyPart3;
// Same shader, but with the 2-D workgroup shared array variant spliced in.
const std::string& kMatMulFloatTwoDimensionalSharedArray =
kMatMulFloatHeader + kMatMulFloatSharedArray2D + kMatMulFloatBodyPart1 +
kMatMulFloatBodyPart2Array2D + kMatMulFloatBodyPart3;
// The vec4 version requires that dimInner and dimBOuter are divisible by 4.
constexpr char kMatMulVec4[] = R"(
const std::string& kMatMulVec4Header = R"(
[[block]] struct Uniforms {
dimAOuter : u32;
dimInner : u32;
@ -195,11 +232,14 @@ namespace {
let ColPerThread : u32 = 4u;
let TileAOuter : u32 = 64u;
let TileBOuter : u32 = 64u;
let TileInner : u32 = 64u;
let TileInner : u32 = 64u;)";
// Workgroup tiles of A and B for the vec4 shader, stored as flat 1-D arrays
// (1024 vec4<f32> = a 64x64 f32 tile packed four floats per element).
// NOTE(review): the previous rendering declared mm_Bsub twice — a stale
// pre-image diff line; WGSL forbids redeclaring a module-scope variable.
const std::string& kMatMulVec4SharedArray1D = R"(
var<workgroup> mm_Asub : array<vec4<f32>, 1024>;
var<workgroup> mm_Bsub : array<vec4<f32>, 1024>;)";
// Workgroup tiles of A and B for the vec4 shader, stored as 2-D arrays:
// 64 rows x 16 vec4<f32> columns (= 64 floats per row), indexed [row][col]
// without manual flattening.
const std::string& kMatMulVec4SharedArray2D = R"(
var<workgroup> mm_Asub : array<array<vec4<f32>, 16>, 64>;
var<workgroup> mm_Bsub : array<array<vec4<f32>, 16>, 64>;)";
const std::string& kMatMulVec4BodyPart1 = R"(
[[stage(compute), workgroup_size(16, 16, 1)]]
fn main([[builtin(local_invocation_id)]] local_id : vec3<u32>,
[[builtin(global_invocation_id)]] global_id : vec3<u32>) {
@ -231,7 +271,8 @@ namespace {
// Load one tile of A into local memory.
for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow = innerRow + 1u) {
let inputRow : u32 = tileRow + innerRow;
let inputCol : u32 = tileCol;
let inputCol : u32 = tileCol;)";
const std::string& kMatMulVec4BodyPart2Array1D = R"(
let index : u32 = inputRow * TileInner / ColPerThread + inputCol;
mm_Asub[index] = mm_readA(globalRow + innerRow, globalColA);
}
@ -255,7 +296,31 @@ namespace {
BCached[3] = mm_Bsub[(k * ColPerThread + 3u) * (TileBOuter / ColPerThread) + tileCol];
for (var i : u32 = 0u; i < RowPerThread; i = i + 1u) {
ACached = mm_Asub[(tileRow + i) * (TileInner / ColPerThread) + k];
ACached = mm_Asub[(tileRow + i) * (TileInner / ColPerThread) + k];)";
// Inner loops of the vec4 matmul using the 2-D shared arrays: load one tile
// of A and one tile of B, barrier, then accumulate partial products.
// Fix vs. previous version: removed a stray double semicolon after the
// mm_readB call.
const std::string& kMatMulVec4BodyPart2Array2D = R"(
mm_Asub[inputRow][inputCol] = mm_readA(globalRow + innerRow, globalColA);
}
globalColA = globalColA + TileInner / ColPerThread;
// Load one tile of B into local memory.
for (var innerRow : u32 = 0u; innerRow < RowPerThreadB; innerRow = innerRow + 1u) {
let inputRow : u32 = tileRowB + innerRow;
let inputCol : u32 = tileCol;
mm_Bsub[inputRow][inputCol] = mm_readB(t * TileInner + inputRow, globalCol);
}
workgroupBarrier();
// Compute acc values for a single thread.
for (var k : u32 = 0u; k < TileInner / ColPerThread; k = k + 1u) {
BCached[0] = mm_Bsub[k * ColPerThread][tileCol];
BCached[1] = mm_Bsub[k * ColPerThread + 1u][tileCol];
BCached[2] = mm_Bsub[k * ColPerThread + 2u][tileCol];
BCached[3] = mm_Bsub[k * ColPerThread + 3u][tileCol];
for (var i : u32 = 0u; i < RowPerThread; i = i + 1u) {
ACached = mm_Asub[tileRow + i][k];)";
const std::string& kMatMulVec4BodyPart3 = R"(
acc[i] = BCached[0] * ACached.x + acc[i];
acc[i] = BCached[1] * ACached.y + acc[i];
acc[i] = BCached[2] * ACached.z + acc[i];
@ -272,11 +337,22 @@ namespace {
acc[innerRow]);
}
})";
// Complete WGSL source for the vec4 matmul built from the shared header/body
// fragments, using the flat 1-D workgroup shared array.
const std::string& kMatMulVec4OneDimensionalSharedArray =
kMatMulVec4Header + kMatMulVec4SharedArray1D + kMatMulVec4BodyPart1 +
kMatMulVec4BodyPart2Array1D + kMatMulVec4BodyPart3;
// Same shader, but with the 2-D workgroup shared array variant spliced in.
const std::string& kMatMulVec4TwoDimensionalSharedArray =
kMatMulVec4Header + kMatMulVec4SharedArray2D + kMatMulVec4BodyPart1 +
kMatMulVec4BodyPart2Array2D + kMatMulVec4BodyPart3;
// Number of matmul dispatches recorded per perf-test step.
constexpr unsigned int kNumIterations = 50;
// Shader variant under test: float vs. vec4 element type, crossed with the
// 1-D (manually flattened) vs. 2-D workgroup shared-array layout.
// NOTE(review): the old MatMulFloat/MatMulVec4 enumerators were pre-image
// diff lines; the switch statements and test instantiation visible elsewhere
// in this patch only handle the four values below, so they are dropped here.
enum class MatMulMethod {
MatMulFloatOneDimSharedArray,
MatMulFloatTwoDimSharedArray,
MatMulVec4OneDimSharedArray,
MatMulVec4TwoDimSharedArray
};
struct ShaderRobustnessParams : AdapterTestParam {
@ -301,11 +377,17 @@ namespace {
std::ostream& operator<<(std::ostream& ostream, const ShaderRobustnessParams& param) {
ostream << static_cast<const AdapterTestParam&>(param);
switch (param.matmulMethod) {
case MatMulMethod::MatMulFloat:
ostream << "_MatMulFloat";
case MatMulMethod::MatMulFloatOneDimSharedArray:
ostream << "_MatMulFloatOneDimSharedArray";
break;
case MatMulMethod::MatMulVec4:
ostream << "_MatMulVec4";
case MatMulMethod::MatMulFloatTwoDimSharedArray:
ostream << "_MatMulFloatTwoDimSharedArray";
break;
case MatMulMethod::MatMulVec4OneDimSharedArray:
ostream << "_MatMulVec4OneDimSharedArray";
break;
case MatMulMethod::MatMulVec4TwoDimSharedArray:
ostream << "_MatMulVec4TwoDimSharedArray";
break;
}
@ -342,6 +424,10 @@ class ShaderRobustnessPerf : public DawnPerfTestWithParams<ShaderRobustnessParam
void ShaderRobustnessPerf::SetUp() {
DawnPerfTestWithParams<ShaderRobustnessParams>::SetUp();
// TODO(crbug.com/dawn/786): D3D12_Microsoft_Basic_Render_Driver_CPU
DAWN_SKIP_TEST_IF(IsD3D12() && IsWARP());
const size_t dataASize = mDimAOuter * mDimInner;
std::vector<float> dataA(dataASize);
uint64_t byteASize = sizeof(float) * dataA.size();
@ -367,13 +453,27 @@ void ShaderRobustnessPerf::SetUp() {
wgpu::ShaderModule module;
switch (GetParam().matmulMethod) {
case MatMulMethod::MatMulFloat: {
module = utils::CreateShaderModule(device, kMatMulFloat);
case MatMulMethod::MatMulFloatOneDimSharedArray: {
module =
utils::CreateShaderModule(device, kMatMulFloatOneDimensionalSharedArray.c_str());
break;
}
case MatMulMethod::MatMulVec4: {
module = utils::CreateShaderModule(device, kMatMulVec4);
case MatMulMethod::MatMulFloatTwoDimSharedArray: {
module =
utils::CreateShaderModule(device, kMatMulFloatTwoDimensionalSharedArray.c_str());
break;
}
case MatMulMethod::MatMulVec4OneDimSharedArray: {
module =
utils::CreateShaderModule(device, kMatMulVec4OneDimensionalSharedArray.c_str());
break;
}
case MatMulMethod::MatMulVec4TwoDimSharedArray: {
module =
utils::CreateShaderModule(device, kMatMulVec4TwoDimensionalSharedArray.c_str());
break;
}
}
@ -423,7 +523,10 @@ DAWN_INSTANTIATE_PERF_TEST_SUITE_P(ShaderRobustnessPerf,
MetalBackend(), MetalBackend({"disable_robustness"}, {}),
OpenGLBackend(), OpenGLBackend({"disable_robustness"}, {}),
VulkanBackend(), VulkanBackend({"disable_robustness"}, {})},
{MatMulMethod::MatMulFloat, MatMulMethod::MatMulVec4},
{MatMulMethod::MatMulFloatOneDimSharedArray,
MatMulMethod::MatMulFloatTwoDimSharedArray,
MatMulMethod::MatMulVec4OneDimSharedArray,
MatMulMethod::MatMulVec4TwoDimSharedArray},
{512},
{512},
{512});