Disable frontend cache when implicit device sync is on.
Normal behavior of ApiObjectBase's APIRelease() which only locks the device when last ref dropped is not thread safe if the object is cached as raw pointers by the device. Example of cached objects: bind group layout, pipeline, sampler, shader module. The following scenario could happen: - thread A: - shaderModuleA.APIRelease() - shaderModuleA.refCount.Decrement() == true (ref count has reached zero) - going to call shaderModuleA.LockAndDeleteThis(). - thread B: - device.CreateShaderModule(). - lock() - device.GetOrCreateShaderModule() - shaderModuleA is in the cache, so return it. - unlock() - thread A: - starting to call shaderModuleA.LockAndDeleteThis() - lock() - erase shaderModuleA from the cache. - delete shaderModuleA. - unlock() This CL disables caching when ImplicitDeviceSynchronization is turned on until we find a better solution. Bug: dawn:1769 Change-Id: Ideb2a717ece0a40e18bd1c2bef00817262bd25da Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/127900 Commit-Queue: Quyen Le <lehoangquyen@chromium.org> Reviewed-by: Austin Eng <enga@chromium.org> Kokoro: Kokoro <noreply+kokoro@google.com>
This commit is contained in:
parent
335573116c
commit
8cc6205bf7
|
@ -0,0 +1,31 @@
|
|||
# Implicit Device Synchronization
|
||||
|
||||
`implicit-device-sync` is an experimental feature that offers multithreading support for `dawn_native`.
|
||||
|
||||
Additional functionality:
|
||||
- `wgpu::Device` and most of its child objects are safe to be used on multiple threads, including:
|
||||
- `wgpu::Queue`.
|
||||
- `wgpu::BindGroup`.
|
||||
- `wgpu::BindGroupLayout`.
|
||||
- `wgpu::Buffer`. See Notes.
|
||||
- `wgpu::Texture`.
|
||||
- `wgpu::TextureView`.
|
||||
- `wgpu::ComputePipeline`.
|
||||
- `wgpu::RenderPipeline`.
|
||||
- `wgpu::PipelineLayout`.
|
||||
- `wgpu::Sampler`.
|
||||
- `wgpu::ShaderModule`.
|
||||
- `wgpu::SwapChain`.
|
||||
- These objects are *not* safe to be used concurrently:
|
||||
- `wgpu::CommandEncoder`.
|
||||
- `wgpu::ComputePassEncoder`.
|
||||
- `wgpu::RenderPassEncoder`.
|
||||
- `wgpu::RenderBundleEncoder`.
|
||||
- Except that the creation, referencing, releasing and destruction of these objects are guaranteed to be thread safe.
|
||||
|
||||
|
||||
Notes:
|
||||
- This feature is experimental, meaning currently it has some limitations:
|
||||
- For `wgpu::Buffer` to be safe on multiple threads, proper manual synchronization must be done by users to ensure that the buffer is not currently being mapped and being used by any render/compute pass encoder at the same time on different threads.
|
||||
- Enabling this feature will disable compatibility between distinct `wgpu::BindGroupLayout` objects. That means two different `wgpu::BindGroupLayout` objects created with equivalent `wgpu::BindGroupLayoutDescriptor`s won't be considered "group-equivalent" anymore. You can only use a `wgpu::BindGroup` with a `wgpu::RenderPipeline`/`wgpu::ComputePipeline` if they are both created with the same `wgpu::BindGroupLayout`.
|
||||
- Consider using `wgpu::RenderPipeline::GetBindGroupLayout()`/`wgpu::ComputePipeline::GetBindGroupLayout()` when creating a `wgpu::BindGroup`.
|
|
@ -818,9 +818,14 @@ ResultOrError<Ref<BindGroupLayoutBase>> DeviceBase::GetOrCreateBindGroupLayout(
|
|||
result = *iter;
|
||||
} else {
|
||||
DAWN_TRY_ASSIGN(result, CreateBindGroupLayoutImpl(descriptor, pipelineCompatibilityToken));
|
||||
result->SetIsCachedReference();
|
||||
result->SetContentHash(blueprintHash);
|
||||
mCaches->bindGroupLayouts.insert(result.Get());
|
||||
// TODO(crbug.com/dawn/1769): We disable caching if multithreading is enabled. The cache
|
||||
// would have been racing with the object's ref counting since it only stores raw pointers
|
||||
// to the objects.
|
||||
if (!HasFeature(Feature::ImplicitDeviceSynchronization)) {
|
||||
result->SetIsCachedReference();
|
||||
mCaches->bindGroupLayouts.insert(result.Get());
|
||||
}
|
||||
}
|
||||
|
||||
return std::move(result);
|
||||
|
@ -870,6 +875,13 @@ Ref<RenderPipelineBase> DeviceBase::GetCachedRenderPipeline(
|
|||
Ref<ComputePipelineBase> DeviceBase::AddOrGetCachedComputePipeline(
|
||||
Ref<ComputePipelineBase> computePipeline) {
|
||||
ASSERT(IsLockedByCurrentThreadIfNeeded());
|
||||
// TODO(crbug.com/dawn/1769): We disable caching if multithreading is enabled. The cache would
|
||||
// have been racing with the object's ref counting since it only stores raw pointers to the
|
||||
// objects.
|
||||
if (HasFeature(Feature::ImplicitDeviceSynchronization)) {
|
||||
return computePipeline;
|
||||
}
|
||||
|
||||
auto [cachedPipeline, inserted] = mCaches->computePipelines.insert(computePipeline.Get());
|
||||
if (inserted) {
|
||||
computePipeline->SetIsCachedReference();
|
||||
|
@ -882,6 +894,13 @@ Ref<ComputePipelineBase> DeviceBase::AddOrGetCachedComputePipeline(
|
|||
Ref<RenderPipelineBase> DeviceBase::AddOrGetCachedRenderPipeline(
|
||||
Ref<RenderPipelineBase> renderPipeline) {
|
||||
ASSERT(IsLockedByCurrentThreadIfNeeded());
|
||||
// TODO(crbug.com/dawn/1769): We disable caching if multithreading is enabled. The cache would
|
||||
// have been racing with the object's ref counting since it only stores raw pointers to the
|
||||
// objects.
|
||||
if (HasFeature(Feature::ImplicitDeviceSynchronization)) {
|
||||
return renderPipeline;
|
||||
}
|
||||
|
||||
auto [cachedPipeline, inserted] = mCaches->renderPipelines.insert(renderPipeline.Get());
|
||||
if (inserted) {
|
||||
renderPipeline->SetIsCachedReference();
|
||||
|
@ -939,9 +958,14 @@ ResultOrError<Ref<PipelineLayoutBase>> DeviceBase::GetOrCreatePipelineLayout(
|
|||
result = *iter;
|
||||
} else {
|
||||
DAWN_TRY_ASSIGN(result, CreatePipelineLayoutImpl(descriptor));
|
||||
result->SetIsCachedReference();
|
||||
result->SetContentHash(blueprintHash);
|
||||
mCaches->pipelineLayouts.insert(result.Get());
|
||||
// TODO(crbug.com/dawn/1769): We disable caching if multithreading is enabled. The cache
|
||||
// would have been racing with the object's ref counting since it only stores raw pointers
|
||||
// to the objects.
|
||||
if (!HasFeature(Feature::ImplicitDeviceSynchronization)) {
|
||||
result->SetIsCachedReference();
|
||||
mCaches->pipelineLayouts.insert(result.Get());
|
||||
}
|
||||
}
|
||||
|
||||
return std::move(result);
|
||||
|
@ -972,9 +996,14 @@ ResultOrError<Ref<SamplerBase>> DeviceBase::GetOrCreateSampler(
|
|||
result = *iter;
|
||||
} else {
|
||||
DAWN_TRY_ASSIGN(result, CreateSamplerImpl(descriptor));
|
||||
result->SetIsCachedReference();
|
||||
result->SetContentHash(blueprintHash);
|
||||
mCaches->samplers.insert(result.Get());
|
||||
// TODO(crbug.com/dawn/1769): We disable caching if multithreading is enabled. The cache
|
||||
// would have been racing with the object's ref counting since it only stores raw pointers
|
||||
// to the objects.
|
||||
if (!HasFeature(Feature::ImplicitDeviceSynchronization)) {
|
||||
result->SetIsCachedReference();
|
||||
mCaches->samplers.insert(result.Get());
|
||||
}
|
||||
}
|
||||
|
||||
return std::move(result);
|
||||
|
@ -1012,9 +1041,14 @@ ResultOrError<Ref<ShaderModuleBase>> DeviceBase::GetOrCreateShaderModule(
|
|||
}
|
||||
DAWN_TRY_ASSIGN(result,
|
||||
CreateShaderModuleImpl(descriptor, parseResult, compilationMessages));
|
||||
result->SetIsCachedReference();
|
||||
result->SetContentHash(blueprintHash);
|
||||
mCaches->shaderModules.insert(result.Get());
|
||||
// TODO(crbug.com/dawn/1769): We disable caching if multithreading is enabled. The cache
|
||||
// would have been racing with the object's ref counting since it only stores raw pointers
|
||||
// to the objects.
|
||||
if (!HasFeature(Feature::ImplicitDeviceSynchronization)) {
|
||||
result->SetIsCachedReference();
|
||||
mCaches->shaderModules.insert(result.Get());
|
||||
}
|
||||
}
|
||||
|
||||
return std::move(result);
|
||||
|
|
|
@ -91,6 +91,108 @@ class MultithreadTests : public DawnTest {
|
|||
dawn::Mutex mutex;
|
||||
};
|
||||
|
||||
class MultithreadCachingTests : public MultithreadTests {
|
||||
protected:
|
||||
wgpu::ShaderModule CreateComputeShaderModule() const {
|
||||
return utils::CreateShaderModule(device, R"(
|
||||
struct SSBO {
|
||||
value : u32
|
||||
}
|
||||
@group(0) @binding(0) var<storage, read_write> ssbo : SSBO;
|
||||
|
||||
@compute @workgroup_size(1) fn main() {
|
||||
ssbo.value = 1;
|
||||
})");
|
||||
}
|
||||
|
||||
wgpu::BindGroupLayout CreateComputeBindGroupLayout() const {
|
||||
return utils::MakeBindGroupLayout(
|
||||
device, {
|
||||
{0, wgpu::ShaderStage::Compute, wgpu::BufferBindingType::Storage},
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
// Test that creating a same shader module (which will return the cached shader module) and release
|
||||
// it on multiple threads won't race.
|
||||
TEST_P(MultithreadCachingTests, RefAndReleaseCachedShaderModulesInParallel) {
|
||||
RunInParallel(100, [this](uint32_t) {
|
||||
wgpu::ShaderModule csModule = CreateComputeShaderModule();
|
||||
EXPECT_NE(nullptr, csModule.Get());
|
||||
});
|
||||
}
|
||||
|
||||
// Test that creating a same compute pipeline (which will return the cached pipeline) and release it
|
||||
// on multiple threads won't race.
|
||||
TEST_P(MultithreadCachingTests, RefAndReleaseCachedComputePipelinesInParallel) {
|
||||
wgpu::ShaderModule csModule = CreateComputeShaderModule();
|
||||
wgpu::BindGroupLayout bglayout = CreateComputeBindGroupLayout();
|
||||
wgpu::PipelineLayout pipelineLayout = utils::MakePipelineLayout(device, {bglayout});
|
||||
|
||||
wgpu::ComputePipelineDescriptor csDesc;
|
||||
csDesc.compute.module = csModule;
|
||||
csDesc.compute.entryPoint = "main";
|
||||
csDesc.layout = pipelineLayout;
|
||||
|
||||
RunInParallel(100, [&, this](uint32_t) {
|
||||
wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&csDesc);
|
||||
EXPECT_NE(nullptr, pipeline.Get());
|
||||
});
|
||||
}
|
||||
|
||||
// Test that creating a same bind group layout (which will return the cached layout) and
|
||||
// release it on multiple threads won't race.
|
||||
TEST_P(MultithreadCachingTests, RefAndReleaseCachedBindGroupLayoutsInParallel) {
|
||||
RunInParallel(100, [&, this](uint32_t) {
|
||||
wgpu::BindGroupLayout layout = CreateComputeBindGroupLayout();
|
||||
EXPECT_NE(nullptr, layout.Get());
|
||||
});
|
||||
}
|
||||
|
||||
// Test that creating a same pipeline layout (which will return the cached layout) and
|
||||
// release it on multiple threads won't race.
|
||||
TEST_P(MultithreadCachingTests, RefAndReleaseCachedPipelineLayoutsInParallel) {
|
||||
wgpu::BindGroupLayout bglayout = CreateComputeBindGroupLayout();
|
||||
|
||||
RunInParallel(100, [&, this](uint32_t) {
|
||||
wgpu::PipelineLayout pipelineLayout = utils::MakePipelineLayout(device, {bglayout});
|
||||
EXPECT_NE(nullptr, pipelineLayout.Get());
|
||||
});
|
||||
}
|
||||
|
||||
// Test that creating a same render pipeline (which will return the cached pipeline) and release it
|
||||
// on multiple threads won't race.
|
||||
TEST_P(MultithreadCachingTests, RefAndReleaseCachedRenderPipelinesInParallel) {
|
||||
utils::ComboRenderPipelineDescriptor renderPipelineDescriptor;
|
||||
wgpu::ShaderModule vsModule = utils::CreateShaderModule(device, R"(
|
||||
@vertex fn main() -> @builtin(position) vec4f {
|
||||
return vec4f(0.0, 0.0, 0.0, 1.0);
|
||||
})");
|
||||
wgpu::ShaderModule fsModule = utils::CreateShaderModule(device, R"(
|
||||
@fragment fn main() -> @location(0) vec4f {
|
||||
return vec4f(0.0, 1.0, 0.0, 1.0);
|
||||
})");
|
||||
renderPipelineDescriptor.vertex.module = vsModule;
|
||||
renderPipelineDescriptor.cFragment.module = fsModule;
|
||||
renderPipelineDescriptor.cTargets[0].format = wgpu::TextureFormat::RGBA8Unorm;
|
||||
renderPipelineDescriptor.primitive.topology = wgpu::PrimitiveTopology::PointList;
|
||||
|
||||
RunInParallel(100, [&, this](uint32_t) {
|
||||
wgpu::RenderPipeline pipeline = device.CreateRenderPipeline(&renderPipelineDescriptor);
|
||||
EXPECT_NE(nullptr, pipeline.Get());
|
||||
});
|
||||
}
|
||||
|
||||
// Test that creating a same sampler pipeline (which will return the cached sampler) and release it
|
||||
// on multiple threads won't race.
|
||||
TEST_P(MultithreadCachingTests, RefAndReleaseCachedSamplersInParallel) {
|
||||
wgpu::SamplerDescriptor desc = {};
|
||||
RunInParallel(100, [&, this](uint32_t) {
|
||||
wgpu::Sampler sampler = device.CreateSampler(&desc);
|
||||
EXPECT_NE(nullptr, sampler.Get());
|
||||
});
|
||||
}
|
||||
|
||||
class MultithreadEncodingTests : public MultithreadTests {};
|
||||
|
||||
// Test that encoding render passes in parallel should work
|
||||
|
@ -388,6 +490,13 @@ TEST_P(MultithreadTimestampQueryTests, ResolveQuerySets_InParallel) {
|
|||
|
||||
} // namespace
|
||||
|
||||
DAWN_INSTANTIATE_TEST(MultithreadCachingTests,
|
||||
D3D12Backend(),
|
||||
MetalBackend(),
|
||||
OpenGLBackend(),
|
||||
OpenGLESBackend(),
|
||||
VulkanBackend());
|
||||
|
||||
DAWN_INSTANTIATE_TEST(MultithreadEncodingTests,
|
||||
D3D12Backend(),
|
||||
MetalBackend(),
|
||||
|
|
Loading…
Reference in New Issue