[hlsl] transform: Zero init arrays with a loop

If the array size is greater than a threshold.
This is a work around for FXC stalling when initializing large arrays
with a single zero-init assignment.

Bug: tint:936
Fixed: tint:943
Fixed: tint:942
Change-Id: Ie93c8f373874b8d6d020d041fa48b38fb1352f71
Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/56775
Kokoro: Kokoro <noreply+kokoro@google.com>
Commit-Queue: Ben Clayton <bclayton@google.com>
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
This commit is contained in:
Ben Clayton
2021-07-05 17:18:16 +00:00
committed by Tint LUCI CQ
parent b0455217fa
commit b4ff73e250
11 changed files with 905 additions and 23 deletions

View File

@@ -66,6 +66,10 @@ Output Hlsl::Run(const Program* in, const DataMap&) {
manager.Add<ExternalTextureTransform>();
manager.Add<PromoteInitializersToConstVar>();
manager.Add<PadArrayElements>();
ZeroInitWorkgroupMemory::Config zero_init_cfg;
zero_init_cfg.init_arrays_with_loop_size_threshold = 32; // 8 scalars
data.Add<ZeroInitWorkgroupMemory::Config>(zero_init_cfg);
data.Add<CanonicalizeEntryPointIO::Config>(
CanonicalizeEntryPointIO::BuiltinStyle::kStructMember);
auto out = manager.Run(in, data);

View File

@@ -24,6 +24,7 @@
#include "src/utils/get_or_create.h"
TINT_INSTANTIATE_TYPEINFO(tint::transform::ZeroInitWorkgroupMemory);
TINT_INSTANTIATE_TYPEINFO(tint::transform::ZeroInitWorkgroupMemory::Config);
namespace tint {
namespace transform {
@@ -32,14 +33,16 @@ namespace transform {
struct ZeroInitWorkgroupMemory::State {
/// The clone context
CloneContext& ctx;
/// The built statements
ast::StatementList& stmts;
/// The config
Config cfg;
/// Zero() generates the statements required to zero initialize the workgroup
/// storage expression of type `ty`.
/// @param ty the expression type
/// @param stmts the built statements
/// @param get_expr a function that builds the AST nodes for the expression
void Zero(const sem::Type* ty,
ast::StatementList& stmts,
const std::function<ast::Expression*()>& get_expr) {
if (CanZero(ty)) {
auto* var = get_expr();
@@ -61,21 +64,32 @@ struct ZeroInitWorkgroupMemory::State {
if (auto* str = ty->As<sem::Struct>()) {
for (auto* member : str->Members()) {
auto name = ctx.Clone(member->Declaration()->symbol());
Zero(member->Type(),
Zero(member->Type(), stmts,
[&] { return ctx.dst->MemberAccessor(get_expr(), name); });
}
return;
}
if (auto* arr = ty->As<sem::Array>()) {
// TODO(bclayton): If array sizes become pipeline-overridable then this
// will need to emit code for a loop.
// See https://github.com/gpuweb/gpuweb/pull/1792
for (size_t i = 0; i < arr->Count(); i++) {
Zero(arr->ElemType(), [&] {
return ctx.dst->IndexAccessor(get_expr(),
static_cast<ProgramBuilder::u32>(i));
});
if (ShouldEmitForLoop(arr)) {
auto i = ctx.dst->Symbols().New("i");
auto* i_decl = ctx.dst->Decl(ctx.dst->Var(i, ctx.dst->ty.i32()));
auto* cond = ctx.dst->create<ast::BinaryExpression>(
ast::BinaryOp::kLessThan, ctx.dst->Expr(i),
ctx.dst->Expr(static_cast<int>(arr->Count())));
auto* inc = ctx.dst->Assign(i, ctx.dst->Add(i, 1));
ast::StatementList for_stmts;
Zero(arr->ElemType(), for_stmts,
[&] { return ctx.dst->IndexAccessor(get_expr(), i); });
auto* body = ctx.dst->Block(for_stmts);
stmts.emplace_back(ctx.dst->For(i_decl, cond, inc, body));
} else {
for (size_t i = 0; i < arr->Count(); i++) {
Zero(arr->ElemType(), stmts, [&] {
return ctx.dst->IndexAccessor(get_expr(),
static_cast<ProgramBuilder::u32>(i));
});
}
}
return;
}
@@ -89,7 +103,7 @@ struct ZeroInitWorkgroupMemory::State {
/// CanZero() returns false, then the type needs to be initialized by
/// decomposing the initialization into multiple sub-initializations.
/// @param ty the type to inspect
static bool CanZero(const sem::Type* ty) {
bool CanZero(const sem::Type* ty) {
if (ty->Is<sem::Atomic>()) {
return false;
}
@@ -101,21 +115,39 @@ struct ZeroInitWorkgroupMemory::State {
}
}
if (auto* arr = ty->As<sem::Array>()) {
if (!CanZero(arr->ElemType())) {
if (ShouldEmitForLoop(arr) || !CanZero(arr->ElemType())) {
return false;
}
}
return true;
}
/// @returns true if the array should be emitted as a for-loop instead of
/// using zero-initializer statements.
/// @param array the array
bool ShouldEmitForLoop(const sem::Array* array) {
// TODO(bclayton): If array sizes become pipeline-overridable then this
// we need to return true for these arrays.
// See https://github.com/gpuweb/gpuweb/pull/1792
return (cfg.init_arrays_with_loop_size_threshold != 0) &&
(array->SizeInBytes() >= cfg.init_arrays_with_loop_size_threshold);
}
};
ZeroInitWorkgroupMemory::ZeroInitWorkgroupMemory() = default;
ZeroInitWorkgroupMemory::~ZeroInitWorkgroupMemory() = default;
void ZeroInitWorkgroupMemory::Run(CloneContext& ctx, const DataMap&, DataMap&) {
void ZeroInitWorkgroupMemory::Run(CloneContext& ctx,
const DataMap& inputs,
DataMap&) {
auto& sem = ctx.src->Sem();
Config cfg;
if (auto* c = inputs.Get<Config>()) {
cfg = *c;
}
for (auto* ast_func : ctx.src->AST().Functions()) {
if (!ast_func->IsEntryPoint()) {
continue;
@@ -129,7 +161,7 @@ void ZeroInitWorkgroupMemory::Run(CloneContext& ctx, const DataMap&, DataMap&) {
if (var->StorageClass() != ast::StorageClass::kWorkgroup) {
continue;
}
State{ctx, stmts}.Zero(var->Type()->UnwrapRef(), [&] {
State{ctx, cfg}.Zero(var->Type()->UnwrapRef(), stmts, [&] {
auto var_name = ctx.Clone(var->Declaration()->symbol());
return ctx.dst->Expr(var_name);
});
@@ -193,5 +225,11 @@ void ZeroInitWorkgroupMemory::Run(CloneContext& ctx, const DataMap&, DataMap&) {
ctx.Clone();
}
ZeroInitWorkgroupMemory::Config::Config() = default;
ZeroInitWorkgroupMemory::Config::Config(const Config&) = default;
ZeroInitWorkgroupMemory::Config::~Config() = default;
ZeroInitWorkgroupMemory::Config& ZeroInitWorkgroupMemory::Config::operator=(
const Config&) = default;
} // namespace transform
} // namespace tint

View File

@@ -26,6 +26,27 @@ namespace transform {
class ZeroInitWorkgroupMemory
: public Castable<ZeroInitWorkgroupMemory, Transform> {
public:
/// Configuration options for the transform
struct Config : public Castable<Config, Data> {
/// Constructor
Config();
/// Copy constructor
Config(const Config&);
/// Destructor
~Config() override;
/// Assignment operator
/// @returns this Config
Config& operator=(const Config&);
/// If greater than 0, then arrays of at least this size in bytes will be
/// zero initialized using a for loop. If 0, then the array is assigned a
/// zero initialized array with a single statement.
uint32_t init_arrays_with_loop_size_threshold = 0;
};
/// Constructor
ZeroInitWorkgroupMemory();

View File

@@ -558,6 +558,56 @@ fn f([[builtin(local_invocation_index)]] local_invocation_index : u32) {
EXPECT_EQ(expect, str(got));
}
TEST_F(ZeroInitWorkgroupMemoryTest, WorkgroupArray_InitWithLoop) {
auto* src = R"(
struct S {
a : array<i32, 3>; // size: 12, less than the loop threshold
b : array<i32, 4>; // size: 16, equal to the loop threshold
c : array<i32, 5>; // size: 20, greater than the loop threshold
};
var<workgroup> w : S;
[[stage(compute), workgroup_size(1)]]
fn f() {
ignore(w); // Initialization should be inserted above this statement
}
)";
auto* expect = R"(
struct S {
a : array<i32, 3>;
b : array<i32, 4>;
c : array<i32, 5>;
};
var<workgroup> w : S;
[[stage(compute), workgroup_size(1)]]
fn f([[builtin(local_invocation_index)]] local_invocation_index : u32) {
if ((local_invocation_index == 0u)) {
w.a = array<i32, 3>();
for(var i : i32; (i < 4); i = (i + 1)) {
w.b[i] = i32();
}
for(var i_1 : i32; (i_1 < 5); i_1 = (i_1 + 1)) {
w.c[i_1] = i32();
}
}
workgroupBarrier();
ignore(w);
}
)";
ZeroInitWorkgroupMemory::Config cfg;
cfg.init_arrays_with_loop_size_threshold = 16;
DataMap data;
data.Add<ZeroInitWorkgroupMemory::Config>(cfg);
auto got = Run<ZeroInitWorkgroupMemory>(src, data);
EXPECT_EQ(expect, str(got));
}
} // namespace
} // namespace transform
} // namespace tint