Support higher limits for maxDynamicUniform/StorageBuffers

The higher tier currently supports all D3D12 devices, all Metal,
and most Vulkan devices.

Bug: dawn:685
Change-Id: I5bcb778b92a073c9c1af943acee193073c0741ff
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/121101
Kokoro: Kokoro <noreply+kokoro@google.com>
Commit-Queue: Austin Eng <enga@chromium.org>
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
This commit is contained in:
Austin Eng
2023-02-25 02:10:21 +00:00
committed by Dawn LUCI CQ
parent 8ef7311413
commit 2ea4aac080
20 changed files with 247 additions and 119 deletions

View File

@@ -70,12 +70,6 @@ MaybeError AdapterBase::Initialize() {
std::min(mLimits.v1.maxStorageTexturesPerShaderStage, kMaxStorageTexturesPerShaderStage);
mLimits.v1.maxUniformBuffersPerShaderStage =
std::min(mLimits.v1.maxUniformBuffersPerShaderStage, kMaxUniformBuffersPerShaderStage);
mLimits.v1.maxDynamicUniformBuffersPerPipelineLayout =
std::min(mLimits.v1.maxDynamicUniformBuffersPerPipelineLayout,
kMaxDynamicUniformBuffersPerPipelineLayout);
mLimits.v1.maxDynamicStorageBuffersPerPipelineLayout =
std::min(mLimits.v1.maxDynamicStorageBuffersPerPipelineLayout,
kMaxDynamicStorageBuffersPerPipelineLayout);
return {};
}

View File

@@ -271,7 +271,8 @@ MaybeError ValidateBindGroupLayoutDescriptor(DeviceBase* device,
bindingsSet.insert(bindingNumber);
}
DAWN_TRY_CONTEXT(ValidateBindingCounts(bindingCounts), "validating binding counts");
DAWN_TRY_CONTEXT(ValidateBindingCounts(device->GetLimits(), bindingCounts),
"validating binding counts");
return {};
}

View File

@@ -15,6 +15,7 @@
#ifndef SRC_DAWN_NATIVE_BINDGROUPTRACKER_H_
#define SRC_DAWN_NATIVE_BINDGROUPTRACKER_H_
#include <algorithm>
#include <array>
#include <bitset>
@@ -54,8 +55,9 @@ class BindGroupTrackerBase {
}
mBindGroups[index] = bindGroup;
mDynamicOffsetCounts[index] = dynamicOffsetCount;
SetDynamicOffsets(mDynamicOffsets[index].data(), dynamicOffsetCount, dynamicOffsets);
mDynamicOffsets[index].resize(BindingIndex(dynamicOffsetCount));
std::copy(dynamicOffsets, dynamicOffsets + dynamicOffsetCount,
mDynamicOffsets[index].begin());
}
void OnSetPipeline(PipelineBase* pipeline) { mPipelineLayout = pipeline->GetLayout(); }
@@ -105,10 +107,7 @@ class BindGroupTrackerBase {
BindGroupLayoutMask mDirtyBindGroupsObjectChangedOrIsDynamic = 0;
BindGroupLayoutMask mBindGroupLayoutsMask = 0;
ityp::array<BindGroupIndex, BindGroupBase*, kMaxBindGroups> mBindGroups = {};
ityp::array<BindGroupIndex, uint32_t, kMaxBindGroups> mDynamicOffsetCounts = {};
ityp::array<BindGroupIndex,
std::array<DynamicOffset, kMaxDynamicBuffersPerPipelineLayout>,
kMaxBindGroups>
ityp::array<BindGroupIndex, ityp::vector<BindingIndex, DynamicOffset>, kMaxBindGroups>
mDynamicOffsets = {};
// |mPipelineLayout| is the current pipeline layout set on the command buffer.
@@ -116,25 +115,6 @@ class BindGroupTrackerBase {
// to the bind group bindings.
PipelineLayoutBase* mPipelineLayout = nullptr;
PipelineLayoutBase* mLastAppliedPipelineLayout = nullptr;
private:
// We have two overloads here because offsets in Vulkan are uint32_t but uint64_t
// in other backends.
static void SetDynamicOffsets(uint64_t* data,
uint32_t dynamicOffsetCount,
uint32_t* dynamicOffsets) {
for (uint32_t i = 0; i < dynamicOffsetCount; ++i) {
data[i] = static_cast<uint64_t>(dynamicOffsets[i]);
}
}
static void SetDynamicOffsets(uint32_t* data,
uint32_t dynamicOffsetCount,
uint32_t* dynamicOffsets) {
if (dynamicOffsetCount > 0) {
memcpy(data, dynamicOffsets, sizeof(uint32_t) * dynamicOffsetCount);
}
}
};
} // namespace dawn::native

View File

@@ -15,6 +15,7 @@
#include "dawn/native/BindingInfo.h"
#include "dawn/native/ChainUtils_autogen.h"
#include "dawn/native/Limits.h"
namespace dawn::native {
@@ -93,18 +94,22 @@ void AccumulateBindingCounts(BindingCounts* bindingCounts, const BindingCounts&
}
}
MaybeError ValidateBindingCounts(const BindingCounts& bindingCounts) {
MaybeError ValidateBindingCounts(const CombinedLimits& limits, const BindingCounts& bindingCounts) {
DAWN_INVALID_IF(
bindingCounts.dynamicUniformBufferCount > kMaxDynamicUniformBuffersPerPipelineLayout,
bindingCounts.dynamicUniformBufferCount >
limits.v1.maxDynamicUniformBuffersPerPipelineLayout,
"The number of dynamic uniform buffers (%u) exceeds the maximum per-pipeline-layout "
"limit (%u).",
bindingCounts.dynamicUniformBufferCount, kMaxDynamicUniformBuffersPerPipelineLayout);
bindingCounts.dynamicUniformBufferCount,
limits.v1.maxDynamicUniformBuffersPerPipelineLayout);
DAWN_INVALID_IF(
bindingCounts.dynamicStorageBufferCount > kMaxDynamicStorageBuffersPerPipelineLayout,
bindingCounts.dynamicStorageBufferCount >
limits.v1.maxDynamicStorageBuffersPerPipelineLayout,
"The number of dynamic storage buffers (%u) exceeds the maximum per-pipeline-layout "
"limit (%u).",
bindingCounts.dynamicStorageBufferCount, kMaxDynamicStorageBuffersPerPipelineLayout);
bindingCounts.dynamicStorageBufferCount,
limits.v1.maxDynamicStorageBuffersPerPipelineLayout);
for (SingleShaderStage stage : IterateStages(kAllStages)) {
DAWN_INVALID_IF(

View File

@@ -29,13 +29,6 @@
namespace dawn::native {
// Not a real WebGPU limit, but the sum of the two limits is useful for internal optimizations.
static constexpr uint32_t kMaxDynamicBuffersPerPipelineLayout =
kMaxDynamicUniformBuffersPerPipelineLayout + kMaxDynamicStorageBuffersPerPipelineLayout;
static constexpr BindingIndex kMaxDynamicBuffersPerPipelineLayoutTyped =
BindingIndex(kMaxDynamicBuffersPerPipelineLayout);
// Not a real WebGPU limit, but used to optimize parts of Dawn which expect valid usage of the
// API. There should never be more bindings than the max per stage, for each stage.
static constexpr uint32_t kMaxBindingsPerPipelineLayout =
@@ -87,9 +80,11 @@ struct BindingCounts {
PerStage<PerStageBindingCounts> perStage;
};
struct CombinedLimits;
void IncrementBindingCounts(BindingCounts* bindingCounts, const BindGroupLayoutEntry& entry);
void AccumulateBindingCounts(BindingCounts* bindingCounts, const BindingCounts& rhs);
MaybeError ValidateBindingCounts(const BindingCounts& bindingCounts);
MaybeError ValidateBindingCounts(const CombinedLimits& limits, const BindingCounts& bindingCounts);
// For buffer size validation
using RequiredBufferSizes = ityp::array<BindGroupIndex, std::vector<uint64_t>, kMaxBindGroups>;

View File

@@ -32,6 +32,13 @@
#define LIMITS_MAX_BUFFER_SIZE(X) \
X(Maximum, maxBufferSize, 0x10000000, 0x40000000, 0x80000000)
// Tiers for limits related to resource bindings.
// TODO(crbug.com/dawn/685): Define these better. For now, use two tiers where one
// offers slightly better than default limits.
#define LIMITS_RESOURCE_BINDINGS(X) \
X(Maximum, maxDynamicUniformBuffersPerPipelineLayout, 8, 10) \
X(Maximum, maxDynamicStorageBuffersPerPipelineLayout, 4, 8) \
// TODO(crbug.com/dawn/685):
// These limits don't have tiers yet. Define two tiers with the same values since the macros
// in this file expect more than one tier.
@@ -42,8 +49,6 @@
X(Maximum, maxTextureArrayLayers, 256, 256) \
X(Maximum, maxBindGroups, 4, 4) \
X(Maximum, maxBindingsPerBindGroup, 640, 640) \
X(Maximum, maxDynamicUniformBuffersPerPipelineLayout, 8, 8) \
X(Maximum, maxDynamicStorageBuffersPerPipelineLayout, 4, 4) \
X(Maximum, maxSampledTexturesPerShaderStage, 16, 16) \
X(Maximum, maxSamplersPerShaderStage, 16, 16) \
X(Maximum, maxStorageBuffersPerShaderStage, 8, 8) \
@@ -71,12 +76,14 @@
X(LIMITS_WORKGROUP_STORAGE_SIZE) \
X(LIMITS_STORAGE_BUFFER_BINDING_SIZE) \
X(LIMITS_MAX_BUFFER_SIZE) \
X(LIMITS_RESOURCE_BINDINGS) \
X(LIMITS_OTHER)
#define LIMITS(X) \
LIMITS_WORKGROUP_STORAGE_SIZE(X) \
LIMITS_STORAGE_BUFFER_BINDING_SIZE(X) \
LIMITS_MAX_BUFFER_SIZE(X) \
LIMITS_RESOURCE_BINDINGS(X) \
LIMITS_OTHER(X)
namespace dawn::native {

View File

@@ -50,7 +50,7 @@ MaybeError ValidatePipelineLayoutDescriptor(DeviceBase* device,
descriptor->bindGroupLayouts[i]->GetBindingCountInfo());
}
DAWN_TRY(ValidateBindingCounts(bindingCounts));
DAWN_TRY(ValidateBindingCounts(device->GetLimits(), bindingCounts));
return {};
}

View File

@@ -269,36 +269,45 @@ MaybeError Adapter::InitializeSupportedLimitsImpl(CombinedLimits* limits) {
// CBVs/UAVs/SRVs for bind group are a root descriptor table
// - (maxBindGroups)
// Samplers for each bind group are a root descriptor table
// - (2 * maxDynamicBuffers)
// Each dynamic buffer is a root descriptor
// - dynamic uniform buffers - root descriptor
// - dynamic storage buffers - root descriptor plus a root constant for the size
// RESERVED:
// - 3 = max of:
// - 2 root constants for the baseVertex/baseInstance constants.
// - 3 root constants for num workgroups X, Y, Z
// - 4 root constants (kMaxDynamicStorageBuffersPerPipelineLayout) for dynamic storage
// buffer lengths.
static constexpr uint32_t kReservedSlots = 7;
static constexpr uint32_t kReservedSlots = 3;
// Costs:
// - bind group: 2 = 1 cbv/uav/srv table + 1 sampler table
// - dynamic uniform buffer: 2 slots for a root descriptor
// - dynamic storage buffer: 3 slots for a root descriptor + root constant
// Available slots after base limits considered.
uint32_t availableRootSignatureSlots =
kMaxRootSignatureSize - kReservedSlots -
2 * (limits->v1.maxBindGroups + limits->v1.maxDynamicUniformBuffersPerPipelineLayout +
limits->v1.maxDynamicStorageBuffersPerPipelineLayout);
kMaxRootSignatureSize - kReservedSlots - 2 * limits->v1.maxBindGroups -
2 * limits->v1.maxDynamicUniformBuffersPerPipelineLayout -
3 * limits->v1.maxDynamicStorageBuffersPerPipelineLayout;
// Because we need either:
// - 1 cbv/uav/srv table + 1 sampler table
// - 2 slots for a root descriptor
uint32_t availableDynamicBufferOrBindGroup = availableRootSignatureSlots / 2;
while (availableRootSignatureSlots >= 2) {
// Start by incrementing maxDynamicStorageBuffersPerPipelineLayout since the
// default is just 4 and developers likely want more. This scheme currently
// gets us to 8.
if (availableRootSignatureSlots >= 3) {
limits->v1.maxDynamicStorageBuffersPerPipelineLayout += 1;
availableRootSignatureSlots -= 3;
}
if (availableRootSignatureSlots >= 2) {
limits->v1.maxBindGroups += 1;
availableRootSignatureSlots -= 2;
}
if (availableRootSignatureSlots >= 2) {
limits->v1.maxDynamicUniformBuffersPerPipelineLayout += 1;
availableRootSignatureSlots -= 2;
}
}
// We can either have a bind group, a dyn uniform buffer or a dyn storage buffer.
// Distribute evenly.
limits->v1.maxBindGroups += availableDynamicBufferOrBindGroup / 3;
limits->v1.maxDynamicUniformBuffersPerPipelineLayout += availableDynamicBufferOrBindGroup / 3;
limits->v1.maxDynamicStorageBuffersPerPipelineLayout +=
(availableDynamicBufferOrBindGroup - 2 * (availableDynamicBufferOrBindGroup / 3));
ASSERT(2 * (limits->v1.maxBindGroups + limits->v1.maxDynamicUniformBuffersPerPipelineLayout +
limits->v1.maxDynamicStorageBuffersPerPipelineLayout) <=
ASSERT(2 * limits->v1.maxBindGroups + 2 * limits->v1.maxDynamicUniformBuffersPerPipelineLayout +
3 * limits->v1.maxDynamicStorageBuffersPerPipelineLayout <=
kMaxRootSignatureSize - kReservedSlots);
// https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sm5-attributes-numthreads

View File

@@ -47,8 +47,7 @@ class BindGroup final : public BindGroupBase, public PlacementAllocated {
void SetSamplerAllocationEntry(Ref<SamplerHeapCacheEntry> entry);
using DynamicStorageBufferLengths =
ityp::stack_vec<uint32_t, uint32_t, kMaxDynamicStorageBuffersPerPipelineLayout>;
using DynamicStorageBufferLengths = ityp::stack_vec<uint32_t, uint32_t, 4u>;
const DynamicStorageBufferLengths& GetDynamicStorageBufferLengths() const;
private:

View File

@@ -442,7 +442,7 @@ class BindGroupStateTracker : public BindGroupTrackerBase<false, uint64_t> {
for (BindGroupIndex index : IterateBitSet(mDirtyBindGroupsObjectChangedOrIsDynamic)) {
BindGroup* group = ToBackend(mBindGroups[index]);
ApplyBindGroup(commandList, ToBackend(mPipelineLayout), index, group,
mDynamicOffsetCounts[index], mDynamicOffsets[index].data());
mDynamicOffsets[index]);
}
AfterApply();
@@ -484,10 +484,7 @@ class BindGroupStateTracker : public BindGroupTrackerBase<false, uint64_t> {
const PipelineLayout* pipelineLayout,
BindGroupIndex index,
BindGroup* group,
uint32_t dynamicOffsetCountIn,
const uint64_t* dynamicOffsetsIn) {
ityp::span<BindingIndex, const uint64_t> dynamicOffsets(dynamicOffsetsIn,
BindingIndex(dynamicOffsetCountIn));
const ityp::vector<BindingIndex, uint64_t>& dynamicOffsets) {
ASSERT(dynamicOffsets.size() == group->GetLayout()->GetDynamicBufferCount());
// Usually, the application won't set the same offsets many times,

View File

@@ -139,6 +139,7 @@ MaybeError PipelineLayout::Initialize() {
// Init root descriptors in root signatures for dynamic buffer bindings.
// These are packed at the beginning of the layout binding info.
mDynamicRootParameterIndices[group].resize(bindGroupLayout->GetDynamicBufferCount());
for (BindingIndex dynamicBindingIndex{0};
dynamicBindingIndex < bindGroupLayout->GetDynamicBufferCount();
++dynamicBindingIndex) {
@@ -224,8 +225,6 @@ MaybeError PipelineLayout::Initialize() {
ASSERT(mDynamicStorageBufferLengthInfo[group].bindingAndRegisterOffsets.size() ==
bgl->GetBindingCountInfo().dynamicStorageBufferCount);
}
ASSERT(dynamicStorageBufferLengthsShaderRegisterOffset <=
kMaxDynamicStorageBuffersPerPipelineLayout);
if (dynamicStorageBufferLengthsShaderRegisterOffset > 0) {
D3D12_ROOT_PARAMETER dynamicStorageBufferLengthConstants{};
@@ -322,7 +321,6 @@ PipelineLayout::GetDynamicStorageBufferLengthInfo() const {
uint32_t PipelineLayout::GetDynamicRootParameterIndex(BindGroupIndex group,
BindingIndex bindingIndex) const {
ASSERT(group < kMaxBindGroupsTyped);
ASSERT(bindingIndex < kMaxDynamicBuffersPerPipelineLayoutTyped);
ASSERT(GetBindGroupLayout(group)->GetBindingInfo(bindingIndex).buffer.hasDynamicOffset);
ASSERT(GetBindGroupLayout(group)->GetBindingInfo(bindingIndex).visibility !=
wgpu::ShaderStage::None);

View File

@@ -19,6 +19,7 @@
#include "dawn/common/Constants.h"
#include "dawn/common/ityp_array.h"
#include "dawn/common/ityp_vector.h"
#include "dawn/native/BindingInfo.h"
#include "dawn/native/PipelineLayout.h"
#include "dawn/native/d3d12/d3d12_platform.h"
@@ -91,9 +92,7 @@ class PipelineLayout final : public PipelineLayoutBase {
ityp::array<BindGroupIndex, uint32_t, kMaxBindGroups> mCbvUavSrvRootParameterInfo;
ityp::array<BindGroupIndex, uint32_t, kMaxBindGroups> mSamplerRootParameterInfo;
ityp::array<BindGroupIndex,
ityp::array<BindingIndex, uint32_t, kMaxDynamicBuffersPerPipelineLayout>,
kMaxBindGroups>
ityp::array<BindGroupIndex, ityp::vector<BindingIndex, uint32_t>, kMaxBindGroups>
mDynamicRootParameterIndices;
DynamicStorageBufferLengthInfo mDynamicStorageBufferLengthInfo;
uint32_t mFirstIndexOffsetParameterIndex;

View File

@@ -716,10 +716,10 @@ class Adapter : public AdapterBase {
// buffers, 128 textures, and 16 samplers. Mac GPU families
// with tier 2 argument buffers support 500000 buffers and
// textures, and 1024 unique samplers
limits->v1.maxDynamicUniformBuffersPerPipelineLayout =
limits->v1.maxUniformBuffersPerShaderStage;
limits->v1.maxDynamicStorageBuffersPerPipelineLayout =
limits->v1.maxStorageBuffersPerShaderStage;
// Without argument buffers, we have slots [0 -> 29], inclusive, which is 30 total.
// 8 are used by maxVertexBuffers.
limits->v1.maxDynamicUniformBuffersPerPipelineLayout = 11u;
limits->v1.maxDynamicStorageBuffersPerPipelineLayout = 11u;
// The WebGPU limit is the limit across all vertex buffers, combined.
limits->v1.maxVertexAttributes =

View File

@@ -442,8 +442,7 @@ class BindGroupTracker : public BindGroupTrackerBase<true, uint64_t> {
void Apply(Encoder encoder) {
BeforeApply();
for (BindGroupIndex index : IterateBitSet(mDirtyBindGroupsObjectChangedOrIsDynamic)) {
ApplyBindGroup(encoder, index, ToBackend(mBindGroups[index]),
mDynamicOffsetCounts[index], mDynamicOffsets[index].data(),
ApplyBindGroup(encoder, index, ToBackend(mBindGroups[index]), mDynamicOffsets[index],
ToBackend(mPipelineLayout));
}
AfterApply();
@@ -458,11 +457,8 @@ class BindGroupTracker : public BindGroupTrackerBase<true, uint64_t> {
id<MTLComputeCommandEncoder> compute,
BindGroupIndex index,
BindGroup* group,
uint32_t dynamicOffsetCount,
uint64_t* dynamicOffsets,
const ityp::vector<BindingIndex, uint64_t>& dynamicOffsets,
PipelineLayout* pipelineLayout) {
uint32_t currentDynamicBufferIndex = 0;
// TODO(crbug.com/dawn/854): Maintain buffers and offsets arrays in BindGroup
// so that we only have to do one setVertexBuffers and one setFragmentBuffers
// call here.
@@ -504,8 +500,8 @@ class BindGroupTracker : public BindGroupTrackerBase<true, uint64_t> {
// TODO(crbug.com/dawn/854): Record bound buffer status to use
// setBufferOffset to achieve better performance.
if (bindingInfo.buffer.hasDynamicOffset) {
offset += dynamicOffsets[currentDynamicBufferIndex];
currentDynamicBufferIndex++;
// Dynamic buffers are packed at the front of BindingIndices.
offset += dynamicOffsets[bindingIndex];
}
if (hasVertStage) {

View File

@@ -231,8 +231,7 @@ class BindGroupTracker : public BindGroupTrackerBase<false, uint64_t> {
void Apply(const OpenGLFunctions& gl) {
BeforeApply();
for (BindGroupIndex index : IterateBitSet(mDirtyBindGroupsObjectChangedOrIsDynamic)) {
ApplyBindGroup(gl, index, mBindGroups[index], mDynamicOffsetCounts[index],
mDynamicOffsets[index].data());
ApplyBindGroup(gl, index, mBindGroups[index], mDynamicOffsets[index]);
}
AfterApply();
}
@@ -241,10 +240,8 @@ class BindGroupTracker : public BindGroupTrackerBase<false, uint64_t> {
void ApplyBindGroup(const OpenGLFunctions& gl,
BindGroupIndex index,
BindGroupBase* group,
uint32_t dynamicOffsetCount,
uint64_t* dynamicOffsets) {
const ityp::vector<BindingIndex, uint64_t>& dynamicOffsets) {
const auto& indices = ToBackend(mPipelineLayout)->GetBindingIndexInfo()[index];
uint32_t currentDynamicOffsetIndex = 0;
for (BindingIndex bindingIndex{0}; bindingIndex < group->GetLayout()->GetBindingCount();
++bindingIndex) {
@@ -268,8 +265,8 @@ class BindGroupTracker : public BindGroupTrackerBase<false, uint64_t> {
GLuint offset = binding.offset;
if (bindingInfo.buffer.hasDynamicOffset) {
offset += dynamicOffsets[currentDynamicOffsetIndex];
++currentDynamicOffsetIndex;
// Dynamic buffers are packed at the front of BindingIndices.
offset += dynamicOffsets[bindingIndex];
}
GLenum target;

View File

@@ -141,12 +141,12 @@ class DescriptorSetTracker : public BindGroupTrackerBase<true, uint32_t> {
BeforeApply();
for (BindGroupIndex dirtyIndex : IterateBitSet(mDirtyBindGroupsObjectChangedOrIsDynamic)) {
VkDescriptorSet set = ToBackend(mBindGroups[dirtyIndex])->GetHandle();
uint32_t count = static_cast<uint32_t>(mDynamicOffsets[dirtyIndex].size());
const uint32_t* dynamicOffset =
mDynamicOffsetCounts[dirtyIndex] > 0 ? mDynamicOffsets[dirtyIndex].data() : nullptr;
device->fn.CmdBindDescriptorSets(recordingContext->commandBuffer, bindPoint,
ToBackend(mPipelineLayout)->GetHandle(),
static_cast<uint32_t>(dirtyIndex), 1, &*set,
mDynamicOffsetCounts[dirtyIndex], dynamicOffset);
count > 0 ? mDynamicOffsets[dirtyIndex].data() : nullptr;
device->fn.CmdBindDescriptorSets(
recordingContext->commandBuffer, bindPoint, ToBackend(mPipelineLayout)->GetHandle(),
static_cast<uint32_t>(dirtyIndex), 1, &*set, count, dynamicOffset);
}
AfterApply();
}