From e1219315635a0aa2a92c56c525154d1b6e962af0 Mon Sep 17 00:00:00 2001 From: shrekshao Date: Wed, 17 May 2023 22:25:34 +0000 Subject: [PATCH] Compat GL/GLES: blit a depth texture to a buffer using compute Add compute blit emulation path for Depth16Unorm/Depth32Float depth textures in OpenGL/OpenGLES backend. Extend DepthCopyTests for better test coverage, especially for testing Depth16Unorm compute emulation path. Bug: dawn:1782, dawn:1291, dawn:1237 Change-Id: I02f6e10d13e8b0080b412a9f9a6d62b1e470ac9c Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/130820 Auto-Submit: Shrek Shao Reviewed-by: Stephen White Kokoro: Kokoro Reviewed-by: Austin Eng Commit-Queue: Shrek Shao --- src/dawn/native/BUILD.gn | 2 + src/dawn/native/BlitDepthStencilToBuffer.cpp | 361 ++++++++++++++ src/dawn/native/BlitDepthStencilToBuffer.h | 38 ++ src/dawn/native/Buffer.cpp | 7 + src/dawn/native/Buffer.h | 1 + src/dawn/native/CMakeLists.txt | 2 + src/dawn/native/CommandEncoder.cpp | 79 +++ src/dawn/native/CommandEncoder.h | 5 + src/dawn/native/CommandValidation.cpp | 13 +- src/dawn/native/CommandValidation.h | 5 +- src/dawn/native/InternalPipelineStore.h | 3 + src/dawn/native/Texture.cpp | 7 + src/dawn/native/Toggles.cpp | 10 + src/dawn/native/Toggles.h | 2 + src/dawn/native/opengl/BufferGL.cpp | 4 +- src/dawn/native/opengl/PhysicalDeviceGL.cpp | 13 + .../tests/end2end/DepthStencilCopyTests.cpp | 452 +++++++++++++----- 17 files changed, 888 insertions(+), 116 deletions(-) create mode 100644 src/dawn/native/BlitDepthStencilToBuffer.cpp create mode 100644 src/dawn/native/BlitDepthStencilToBuffer.h diff --git a/src/dawn/native/BUILD.gn b/src/dawn/native/BUILD.gn index e56d9d9515..24853ced15 100644 --- a/src/dawn/native/BUILD.gn +++ b/src/dawn/native/BUILD.gn @@ -203,6 +203,8 @@ source_set("sources") { "BindingInfo.h", "BlitBufferToDepthStencil.cpp", "BlitBufferToDepthStencil.h", + "BlitDepthStencilToBuffer.cpp", + "BlitDepthStencilToBuffer.h", "BlitDepthToDepth.cpp", "BlitDepthToDepth.h", "Blob.cpp", diff --git a/src/dawn/native/BlitDepthStencilToBuffer.cpp b/src/dawn/native/BlitDepthStencilToBuffer.cpp new file mode 100644 index 0000000000..a6795ff1c5 --- /dev/null +++ b/src/dawn/native/BlitDepthStencilToBuffer.cpp @@ -0,0 +1,361 @@ +// Copyright 2023 The Dawn Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "dawn/native/BlitDepthStencilToBuffer.h" + +#include + +#include "dawn/common/Assert.h" +#include "dawn/native/BindGroup.h" +#include "dawn/native/CommandBuffer.h" +#include "dawn/native/CommandEncoder.h" +#include "dawn/native/ComputePassEncoder.h" +#include "dawn/native/ComputePipeline.h" +#include "dawn/native/Device.h" +#include "dawn/native/InternalPipelineStore.h" +#include "dawn/native/Queue.h" +#include "dawn/native/utils/WGPUHelpers.h" + +namespace dawn::native { + +namespace { + +constexpr uint32_t kWorkgroupSizeX = 8; +constexpr uint32_t kWorkgroupSizeY = 8; + +constexpr char kBlitDepth32FloatToBufferShaders[] = R"( +@group(0) @binding(0) var src_tex : texture_depth_2d_array; +@group(0) @binding(1) var dst_buf : array; + +struct Params { + // copyExtent + srcOrigin: vec3u, + pad0: u32, + srcExtent: vec3u, + pad1: u32, + + // GPUImageDataLayout + indicesPerRow: u32, + rowsPerImage: u32, + indicesOffset: u32, +}; + +@group(0) @binding(2) var params : Params; + +override workgroupSizeX: u32; +override workgroupSizeY: u32; + +// Load the depth value and write to storage buffer. +@compute @workgroup_size(workgroupSizeX, workgroupSizeY, 1) fn blit_depth_to_buffer(@builtin(global_invocation_id) id : vec3u) { + let srcBoundary = params.srcOrigin + params.srcExtent; + let coord = id + params.srcOrigin; + if (any(coord >= srcBoundary)) { + return; + } + + let dstOffset = params.indicesOffset + id.x + id.y * params.indicesPerRow + id.z * params.indicesPerRow * params.rowsPerImage; + dst_buf[dstOffset] = textureLoad(src_tex, coord.xy, coord.z, 0); +} + +)"; + +// ShaderF16 extension is only enabled by GL_AMD_gpu_shader_half_float for GL +// so we should not use it generally for the emulation. +// As a result we are using f32 and array to do all the math and byte manipulation. +// If we have 2-byte scalar type (f16, u16) it can be a bit easier when writing to the storage +// buffer. + +constexpr char kBlitDepth16UnormToBufferShaders[] = R"( +@group(0) @binding(0) var src_tex : texture_depth_2d_array; +@group(0) @binding(1) var dst_buf : array; + +struct Params { + // copyExtent + srcOrigin: vec3u, + pad0: u32, + srcExtent: vec3u, + pad1: u32, + + // GPUImageDataLayout + indicesPerRow: u32, + rowsPerImage: u32, + indicesOffset: u32, +}; + +@group(0) @binding(2) var params : Params; + +// Range of v is [0.0, 1.0] +fn getUnorm16Bits(v: f32) -> u32 { + var bits: u32 = u32(v * 65535.0); + return bits; +} + +override workgroupSizeX: u32; +override workgroupSizeY: u32; + +// Load the depth value and write to storage buffer. +// Each thread is responsible for reading 2 u16 values and packing them into 1 u32 value. +@compute @workgroup_size(workgroupSizeX, workgroupSizeY, 1) fn blit_depth_to_buffer(@builtin(global_invocation_id) id : vec3u) { + let srcBoundary = params.srcOrigin + params.srcExtent; + let coord0 = vec3u(id.x * 2, id.y, id.z) + params.srcOrigin; + + if (any(coord0 >= srcBoundary)) { + return; + } + + let v0: f32 = textureLoad(src_tex, coord0.xy, coord0.z, 0); + let r0: u32 = getUnorm16Bits(v0); + + let dstOffset = params.indicesOffset + id.x + id.y * params.indicesPerRow + id.z * params.indicesPerRow * params.rowsPerImage; + + var result: u32 = r0; + let coord1 = coord0 + vec3u(1, 0, 0); + if (coord1.x < srcBoundary.x) { + // Make sure coord1 is still within the copy boundary + // then read and write this value. + let v1: f32 = textureLoad(src_tex, coord1.xy, coord1.z, 0); + let r1: u32 = getUnorm16Bits(v1); + result += (r1 << 16); + } else { + // Otherwise, srcExtent.x is an odd number and this thread is at right edge of the texture + // To preserve the original buffer content, we need to read from the buffer and pack it + // together with r0 to avoid it being overwritten. + // TODO(dawn:1782): profiling against making a separate pass for this edge case + // as it require reading from dst_buf. + let original: u32 = dst_buf[dstOffset]; + result += original & 0xffff0000; + } + + dst_buf[dstOffset] = result; +} +)"; + +ResultOrError> CreateDepthBlitComputePipeline(DeviceBase* device, + InternalPipelineStore* store, + wgpu::TextureFormat format) { + ShaderModuleWGSLDescriptor wgslDesc = {}; + ShaderModuleDescriptor shaderModuleDesc = {}; + shaderModuleDesc.nextInChain = &wgslDesc; + switch (format) { + case wgpu::TextureFormat::Depth16Unorm: + wgslDesc.source = kBlitDepth16UnormToBufferShaders; + break; + case wgpu::TextureFormat::Depth32Float: + wgslDesc.source = kBlitDepth32FloatToBufferShaders; + break; + default: + UNREACHABLE(); + break; + } + + Ref shaderModule; + DAWN_TRY_ASSIGN(shaderModule, device->CreateShaderModule(&shaderModuleDesc)); + + Ref bindGroupLayout; + DAWN_TRY_ASSIGN(bindGroupLayout, + utils::MakeBindGroupLayout( + device, + { + {0, wgpu::ShaderStage::Compute, wgpu::TextureSampleType::Depth, + wgpu::TextureViewDimension::e2DArray}, + {1, wgpu::ShaderStage::Compute, kInternalStorageBufferBinding}, + {2, wgpu::ShaderStage::Compute, wgpu::BufferBindingType::Uniform}, + }, + /* allowInternalBinding */ true)); + + Ref pipelineLayout; + DAWN_TRY_ASSIGN(pipelineLayout, utils::MakeBasicPipelineLayout(device, bindGroupLayout)); + + ComputePipelineDescriptor computePipelineDescriptor = {}; + computePipelineDescriptor.layout = pipelineLayout.Get(); + computePipelineDescriptor.compute.module = shaderModule.Get(); + computePipelineDescriptor.compute.entryPoint = "blit_depth_to_buffer"; + + constexpr std::array constants = {{ + {nullptr, "workgroupSizeX", kWorkgroupSizeX}, + {nullptr, "workgroupSizeY", kWorkgroupSizeY}, + }}; + computePipelineDescriptor.compute.constantCount = constants.size(); + computePipelineDescriptor.compute.constants = constants.data(); + + Ref pipeline; + DAWN_TRY_ASSIGN(pipeline, device->CreateComputePipeline(&computePipelineDescriptor)); + + switch (format) { + case wgpu::TextureFormat::Depth16Unorm: + store->blitDepth16UnormToBufferComputePipeline = pipeline; + break; + case wgpu::TextureFormat::Depth32Float: + store->blitDepth32FloatToBufferComputePipeline = pipeline; + break; + default: + UNREACHABLE(); + break; + } + return pipeline; +} + +ResultOrError> GetOrCreateDepth32FloatToBufferPipeline( + DeviceBase* device) { + InternalPipelineStore* store = device->GetInternalPipelineStore(); + if (store->blitDepth32FloatToBufferComputePipeline != nullptr) { + return store->blitDepth32FloatToBufferComputePipeline; + } + + Ref pipeline; + DAWN_TRY_ASSIGN( + pipeline, CreateDepthBlitComputePipeline(device, store, wgpu::TextureFormat::Depth32Float)); + + return pipeline; +} + +ResultOrError> GetOrCreateDepth16UnormToBufferPipeline( + DeviceBase* device) { + InternalPipelineStore* store = device->GetInternalPipelineStore(); + if (store->blitDepth16UnormToBufferComputePipeline != nullptr) { + return store->blitDepth16UnormToBufferComputePipeline; + } + + Ref pipeline; + DAWN_TRY_ASSIGN( + pipeline, CreateDepthBlitComputePipeline(device, store, wgpu::TextureFormat::Depth16Unorm)); + return pipeline; +} + +} // anonymous namespace + +MaybeError BlitDepthToBuffer(DeviceBase* device, + CommandEncoder* commandEncoder, + const TextureCopy& src, + const BufferCopy& dst, + const Extent3D& copyExtent) { + const Format& format = src.texture->GetFormat(); + + Ref destinationBuffer = dst.buffer; + bool useIntermediateCopyBuffer = false; + if (format.format == wgpu::TextureFormat::Depth16Unorm && dst.buffer->GetSize() % 4 != 0 && + copyExtent.width % 2 != 0) { + // This path is made for OpenGL/GLES depth16unorm bliting a texture with an odd width, + // to a compact buffer. When we copy the last texel, we inevitably need to access an + // out of bounds location given by dst.buffer.size as we use array in the shader for + // the storage buffer. Although the allocated size of dst.buffer is aligned to 4 bytes for + // OpenGL/GLES backend, the size of the storage buffer binding for the shader is not. Thus + // we make an intermediate buffer aligned to 4 bytes for the compute shader to safely + // access, and perform an additional buffer to buffer copy at the end. This path should be + // hit rarely. + useIntermediateCopyBuffer = true; + BufferDescriptor descriptor = {}; + descriptor.size = Align(dst.buffer->GetSize(), 4); + // TODO(dawn:1485): adding CopyDst usage to add kInternalStorageBuffer usage internally. + descriptor.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst; + DAWN_TRY_ASSIGN(destinationBuffer, device->CreateBuffer(&descriptor)); + } + + Ref pipeline; + uint32_t workgroupCountX = 1; + uint32_t workgroupCountY = 1; + uint32_t workgroupCountZ = copyExtent.depthOrArrayLayers; + switch (format.format) { + case wgpu::TextureFormat::Depth16Unorm: + // One thread is responsible for writing two texel values (x, y) and (x+1, y). + workgroupCountX = (copyExtent.width + 2 * kWorkgroupSizeX - 1) / (2 * kWorkgroupSizeX); + workgroupCountY = (copyExtent.height + kWorkgroupSizeY - 1) / kWorkgroupSizeY; + DAWN_TRY_ASSIGN(pipeline, GetOrCreateDepth16UnormToBufferPipeline(device)); + break; + case wgpu::TextureFormat::Depth32Float: + workgroupCountX = (copyExtent.width + kWorkgroupSizeX - 1) / kWorkgroupSizeX; + workgroupCountY = (copyExtent.height + kWorkgroupSizeY - 1) / kWorkgroupSizeY; + DAWN_TRY_ASSIGN(pipeline, GetOrCreateDepth32FloatToBufferPipeline(device)); + break; + default: + // Other formats (e.g. Depth32FloatStencil8) are not supported on OpenGL/OpenGLES where + // we enabled this workaround. They only support Depth24PlusStencil8. + UNREACHABLE(); + } + + // Allow internal usages since we need to use the source as a texture binding + // and buffer as a storage binding. + auto scope = commandEncoder->MakeInternalUsageScope(); + + Ref bindGroupLayout; + DAWN_TRY_ASSIGN(bindGroupLayout, pipeline->GetBindGroupLayout(0)); + + Ref uniformBuffer; + { + BufferDescriptor bufferDesc = {}; + // Uniform buffer size needs to be multiple of 16 bytes + bufferDesc.size = sizeof(uint32_t) * 12; + bufferDesc.usage = wgpu::BufferUsage::Uniform; + bufferDesc.mappedAtCreation = true; + DAWN_TRY_ASSIGN(uniformBuffer, device->CreateBuffer(&bufferDesc)); + + uint32_t* params = + static_cast(uniformBuffer->GetMappedRange(0, bufferDesc.size)); + // srcOrigin: vec3u + params[0] = src.origin.x; + params[1] = src.origin.y; + // src.origin.z is set at textureView.baseArrayLayer + params[2] = 0; + // srcExtent: vec3u + params[4] = copyExtent.width; + params[5] = copyExtent.height; + params[6] = copyExtent.depthOrArrayLayers; + + // Turn bytesPerRow, (bytes)offset to use array index as unit + // We use array for depth16unorm copy and array for depth32float copy + // Both array element sizes are 4 bytes. + params[8] = dst.bytesPerRow / 4; + params[9] = dst.rowsPerImage; + params[10] = dst.offset / 4; + + DAWN_TRY(uniformBuffer->Unmap()); + } + + TextureViewDescriptor viewDesc = {}; + viewDesc.aspect = wgpu::TextureAspect::DepthOnly; + viewDesc.dimension = wgpu::TextureViewDimension::e2DArray; + viewDesc.baseMipLevel = src.mipLevel; + viewDesc.mipLevelCount = 1; + viewDesc.baseArrayLayer = src.origin.z; + viewDesc.arrayLayerCount = copyExtent.depthOrArrayLayers; + + Ref srcView; + DAWN_TRY_ASSIGN(srcView, src.texture->CreateView(&viewDesc)); + + Ref bindGroup; + DAWN_TRY_ASSIGN(bindGroup, utils::MakeBindGroup(device, bindGroupLayout, + { + {0, srcView}, + {1, destinationBuffer}, + {2, uniformBuffer}, + }, + UsageValidationMode::Internal)); + + Ref pass = commandEncoder->BeginComputePass(); + pass->APISetPipeline(pipeline.Get()); + pass->APISetBindGroup(0, bindGroup.Get()); + pass->APIDispatchWorkgroups(workgroupCountX, workgroupCountY, workgroupCountZ); + + pass->APIEnd(); + + if (useIntermediateCopyBuffer) { + ASSERT(destinationBuffer->GetSize() <= dst.buffer->GetAllocatedSize()); + commandEncoder->InternalCopyBufferToBufferWithAllocatedSize( + destinationBuffer.Get(), 0, dst.buffer.Get(), 0, destinationBuffer->GetSize()); + } + + return {}; +} + +} // namespace dawn::native diff --git a/src/dawn/native/BlitDepthStencilToBuffer.h b/src/dawn/native/BlitDepthStencilToBuffer.h new file mode 100644 index 0000000000..ca60b9e929 --- /dev/null +++ b/src/dawn/native/BlitDepthStencilToBuffer.h @@ -0,0 +1,38 @@ +// Copyright 2023 The Dawn Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SRC_DAWN_NATIVE_BLITDEPTHSTENCILTOBUFFER_H_ +#define SRC_DAWN_NATIVE_BLITDEPTHSTENCILTOBUFFER_H_ + +#include "dawn/native/Error.h" + +namespace dawn::native { + +struct TextureCopy; +struct BufferCopy; + +// BlitDepthToBuffer works around OpenGL/GLES issues of copying depth textures to a buffer. +// Supported depth texture format: depth16unorm, depth32float +// It dispatches a compute shader textureLoad from the depth texture and writes to the buffer as a +// storage buffer. + +MaybeError BlitDepthToBuffer(DeviceBase* device, + CommandEncoder* commandEncoder, + const TextureCopy& src, + const BufferCopy& dst, + const Extent3D& copyExtent); + +} // namespace dawn::native + +#endif // SRC_DAWN_NATIVE_BLITDEPTHSTENCILTOBUFFER_H_ diff --git a/src/dawn/native/Buffer.cpp b/src/dawn/native/Buffer.cpp index fe79de4825..92acd77095 100644 --- a/src/dawn/native/Buffer.cpp +++ b/src/dawn/native/Buffer.cpp @@ -172,6 +172,13 @@ BufferBase::BufferBase(DeviceBase* device, const BufferDescriptor* descriptor) mUsage |= kInternalStorageBuffer; } + if (mUsage & wgpu::BufferUsage::CopyDst) { + if (device->IsToggleEnabled(Toggle::UseBlitForDepth16UnormTextureToBufferCopy) || + device->IsToggleEnabled(Toggle::UseBlitForDepth32FloatTextureToBufferCopy)) { + mUsage |= kInternalStorageBuffer; + } + } + GetObjectTrackingList()->Track(this); } diff --git a/src/dawn/native/Buffer.h b/src/dawn/native/Buffer.h index 3457ed5f41..aa2050e727 100644 --- a/src/dawn/native/Buffer.h +++ b/src/dawn/native/Buffer.h @@ -24,6 +24,7 @@ #include "dawn/native/Forward.h" #include "dawn/native/IntegerTypes.h" #include "dawn/native/ObjectBase.h" +#include "dawn/native/UsageValidationMode.h" #include "dawn/native/dawn_platform.h" diff --git a/src/dawn/native/CMakeLists.txt b/src/dawn/native/CMakeLists.txt index deda9b0591..8425f21935 100644 --- a/src/dawn/native/CMakeLists.txt +++ b/src/dawn/native/CMakeLists.txt @@ -49,6 +49,8 @@ target_sources(dawn_native PRIVATE "BindingInfo.h" "BlitBufferToDepthStencil.cpp" "BlitBufferToDepthStencil.h" + "BlitDepthStencilToBuffer.cpp" + "BlitDepthStencilToBuffer.h" "BlitDepthToDepth.cpp" "BlitDepthToDepth.h" "Blob.cpp" diff --git a/src/dawn/native/CommandEncoder.cpp b/src/dawn/native/CommandEncoder.cpp index 97e8a82e81..476248f9cf 100644 --- a/src/dawn/native/CommandEncoder.cpp +++ b/src/dawn/native/CommandEncoder.cpp @@ -23,6 +23,7 @@ #include "dawn/native/ApplyClearColorValueWithDrawHelper.h" #include "dawn/native/BindGroup.h" #include "dawn/native/BlitBufferToDepthStencil.h" +#include "dawn/native/BlitDepthStencilToBuffer.h" #include "dawn/native/BlitDepthToDepth.h" #include "dawn/native/Buffer.h" #include "dawn/native/ChainUtils_autogen.h" @@ -1159,6 +1160,55 @@ void CommandEncoder::APICopyBufferToBuffer(BufferBase* source, destination, destinationOffset, size); } +// The internal version of APICopyBufferToBuffer which validates against mAllocatedSize instead of +// mSize of buffers. +void CommandEncoder::InternalCopyBufferToBufferWithAllocatedSize(BufferBase* source, + uint64_t sourceOffset, + BufferBase* destination, + uint64_t destinationOffset, + uint64_t size) { + mEncodingContext.TryEncode( + this, + [&](CommandAllocator* allocator) -> MaybeError { + if (GetDevice()->IsValidationEnabled()) { + DAWN_TRY(GetDevice()->ValidateObject(source)); + DAWN_TRY(GetDevice()->ValidateObject(destination)); + + DAWN_INVALID_IF(source == destination, + "Source and destination are the same buffer (%s).", source); + + DAWN_TRY_CONTEXT(ValidateCopySizeFitsInBuffer(source, sourceOffset, size, + BufferSizeType::AllocatedSize), + "validating source %s copy size against allocated size.", source); + DAWN_TRY_CONTEXT(ValidateCopySizeFitsInBuffer(destination, destinationOffset, size, + BufferSizeType::AllocatedSize), + "validating destination %s copy size against allocated size.", + destination); + DAWN_TRY(ValidateB2BCopyAlignment(size, sourceOffset, destinationOffset)); + + DAWN_TRY_CONTEXT(ValidateCanUseAs(source, wgpu::BufferUsage::CopySrc), + "validating source %s usage.", source); + DAWN_TRY_CONTEXT(ValidateCanUseAs(destination, wgpu::BufferUsage::CopyDst), + "validating destination %s usage.", destination); + } + + mTopLevelBuffers.insert(source); + mTopLevelBuffers.insert(destination); + + CopyBufferToBufferCmd* copy = + allocator->Allocate(Command::CopyBufferToBuffer); + copy->source = source; + copy->sourceOffset = sourceOffset; + copy->destination = destination; + copy->destinationOffset = destinationOffset; + copy->size = size; + + return {}; + }, + "encoding internal %s.CopyBufferToBuffer(%s, %u, %s, %u, %u).", this, source, sourceOffset, + destination, destinationOffset, size); +} + void CommandEncoder::APICopyBufferToTexture(const ImageCopyBuffer* source, const ImageCopyTexture* destination, const Extent3D* copySize) { @@ -1286,6 +1336,35 @@ void CommandEncoder::APICopyTextureToBuffer(const ImageCopyTexture* source, TextureDataLayout dstLayout = destination->layout; ApplyDefaultTextureDataLayoutOptions(&dstLayout, blockInfo, *copySize); + auto format = source->texture->GetFormat(); + auto aspect = ConvertAspect(format, source->aspect); + if (aspect == Aspect::Depth) { + if ((format.format == wgpu::TextureFormat::Depth16Unorm && + GetDevice()->IsToggleEnabled( + Toggle::UseBlitForDepth16UnormTextureToBufferCopy)) || + (format.format == wgpu::TextureFormat::Depth32Float && + GetDevice()->IsToggleEnabled( + Toggle::UseBlitForDepth32FloatTextureToBufferCopy))) { + TextureCopy src; + src.texture = source->texture; + src.origin = source->origin; + src.mipLevel = source->mipLevel; + src.aspect = aspect; + + BufferCopy dst; + dst.buffer = destination->buffer; + dst.bytesPerRow = destination->layout.bytesPerRow; + dst.rowsPerImage = destination->layout.rowsPerImage; + dst.offset = destination->layout.offset; + DAWN_TRY_CONTEXT(BlitDepthToBuffer(GetDevice(), this, src, dst, *copySize), + "copying depth aspect from %s to %s using blit workaround.", + src.texture.Get(), destination->buffer); + + return {}; + } + } + // TODO(crbug.com/dawn/1782): implement emulation for stencil + CopyTextureToBufferCmd* t2b = allocator->Allocate(Command::CopyTextureToBuffer); t2b->source.texture = source->texture; diff --git a/src/dawn/native/CommandEncoder.h b/src/dawn/native/CommandEncoder.h index 6b9d4a5404..2b6a4e4285 100644 --- a/src/dawn/native/CommandEncoder.h +++ b/src/dawn/native/CommandEncoder.h @@ -59,6 +59,11 @@ class CommandEncoder final : public ApiObjectBase { BufferBase* destination, uint64_t destinationOffset, uint64_t size); + void InternalCopyBufferToBufferWithAllocatedSize(BufferBase* source, + uint64_t sourceOffset, + BufferBase* destination, + uint64_t destinationOffset, + uint64_t size); void APICopyBufferToTexture(const ImageCopyBuffer* source, const ImageCopyTexture* destination, const Extent3D* copySize); diff --git a/src/dawn/native/CommandValidation.cpp b/src/dawn/native/CommandValidation.cpp index 7d5a34a1cd..86abb419d5 100644 --- a/src/dawn/native/CommandValidation.cpp +++ b/src/dawn/native/CommandValidation.cpp @@ -175,8 +175,17 @@ ResultOrError ComputeRequiredBytesInCopy(const TexelBlockInfo& blockIn MaybeError ValidateCopySizeFitsInBuffer(const Ref& buffer, uint64_t offset, - uint64_t size) { - uint64_t bufferSize = buffer->GetSize(); + uint64_t size, + BufferSizeType checkBufferSizeType) { + uint64_t bufferSize = 0; + switch (checkBufferSizeType) { + case BufferSizeType::Size: + bufferSize = buffer->GetSize(); + break; + case BufferSizeType::AllocatedSize: + bufferSize = buffer->GetAllocatedSize(); + break; + } bool fitsInBuffer = offset <= bufferSize && (size <= (bufferSize - offset)); DAWN_INVALID_IF(!fitsInBuffer, "Copy range (offset: %u, size: %u) does not fit in %s size (%u).", offset, size, diff --git a/src/dawn/native/CommandValidation.h b/src/dawn/native/CommandValidation.h index 53ecfc24e8..4417d6ff12 100644 --- a/src/dawn/native/CommandValidation.h +++ b/src/dawn/native/CommandValidation.h @@ -27,6 +27,8 @@ namespace dawn::native { +enum class BufferSizeType { Size, AllocatedSize }; + class QuerySetBase; struct SyncScopeResourceUsage; struct TexelBlockInfo; @@ -76,7 +78,8 @@ MaybeError ValidateImageCopyTexture(DeviceBase const* device, MaybeError ValidateCopySizeFitsInBuffer(const Ref& buffer, uint64_t offset, - uint64_t size); + uint64_t size, + BufferSizeType checkBufferSizeType = BufferSizeType::Size); bool IsRangeOverlapped(uint32_t startA, uint32_t startB, uint32_t length); diff --git a/src/dawn/native/InternalPipelineStore.h b/src/dawn/native/InternalPipelineStore.h index dd0bd69550..3500ed215f 100644 --- a/src/dawn/native/InternalPipelineStore.h +++ b/src/dawn/native/InternalPipelineStore.h @@ -60,6 +60,9 @@ struct InternalPipelineStore { Ref blitRG8ToDepth16UnormPipeline; + Ref blitDepth16UnormToBufferComputePipeline; + Ref blitDepth32FloatToBufferComputePipeline; + struct BlitR8ToStencilPipelines { Ref clearPipeline; std::array, 8> setStencilPipelines; diff --git a/src/dawn/native/Texture.cpp b/src/dawn/native/Texture.cpp index 98703e8965..fece48f445 100644 --- a/src/dawn/native/Texture.cpp +++ b/src/dawn/native/Texture.cpp @@ -592,6 +592,13 @@ TextureBase::TextureBase(DeviceBase* device, AddInternalUsage(wgpu::TextureUsage::RenderAttachment); } } + if (mFormat.HasDepth() && + (device->IsToggleEnabled(Toggle::UseBlitForDepth16UnormTextureToBufferCopy) || + device->IsToggleEnabled(Toggle::UseBlitForDepth32FloatTextureToBufferCopy))) { + if (mInternalUsage & wgpu::TextureUsage::CopySrc) { + AddInternalUsage(wgpu::TextureUsage::TextureBinding); + } + } } TextureBase::~TextureBase() = default; diff --git a/src/dawn/native/Toggles.cpp b/src/dawn/native/Toggles.cpp index f806c773c7..40765de7c2 100644 --- a/src/dawn/native/Toggles.cpp +++ b/src/dawn/native/Toggles.cpp @@ -386,6 +386,16 @@ static constexpr ToggleEnumAndInfoList kToggleNameAndInfoList = {{ "Use a blit to copy from a depth texture to the nonzero subresource of a depth texture. " "Works around an issue where nonzero layers are not written.", "https://crbug.com/dawn/1083", ToggleStage::Device}}, + {Toggle::UseBlitForDepth16UnormTextureToBufferCopy, + {"use_blit_for_depth16unorm_texture_to_buffer_copy", + "Use a blit instead of a copy command to copy depth aspect of a texture to a buffer." + "Workaround for OpenGL and OpenGLES.", + "https://crbug.com/dawn/1782", ToggleStage::Device}}, + {Toggle::UseBlitForDepth32FloatTextureToBufferCopy, + {"use_blit_for_depth32float_texture_to_buffer_copy", + "Use a blit instead of a copy command to copy depth aspect of a texture to a buffer." + "Workaround for OpenGLES.", + "https://crbug.com/dawn/1782", ToggleStage::Device}}, {Toggle::D3D12ReplaceAddWithMinusWhenDstFactorIsZeroAndSrcFactorIsDstAlpha, {"d3d12_replace_add_with_minus_when_dst_factor_is_zero_and_src_factor_is_dst_alpha", "Replace the blending operation 'Add' with 'Minus' when dstBlendFactor is 'Zero' and " diff --git a/src/dawn/native/Toggles.h b/src/dawn/native/Toggles.h index a8c4e10c13..d0b02127ee 100644 --- a/src/dawn/native/Toggles.h +++ b/src/dawn/native/Toggles.h @@ -92,6 +92,8 @@ enum class Toggle { UseBlitForBufferToDepthTextureCopy, UseBlitForBufferToStencilTextureCopy, UseBlitForDepthTextureToTextureCopyToNonzeroSubresource, + UseBlitForDepth16UnormTextureToBufferCopy, + UseBlitForDepth32FloatTextureToBufferCopy, D3D12ReplaceAddWithMinusWhenDstFactorIsZeroAndSrcFactorIsDstAlpha, D3D12PolyfillReflectVec2F32, VulkanClearGen12TextureWithCCSAmbiguateOnCreation, diff --git a/src/dawn/native/opengl/BufferGL.cpp b/src/dawn/native/opengl/BufferGL.cpp index db743f1a39..72544b54f0 100644 --- a/src/dawn/native/opengl/BufferGL.cpp +++ b/src/dawn/native/opengl/BufferGL.cpp @@ -41,7 +41,9 @@ Buffer::Buffer(Device* device, const BufferDescriptor* descriptor) : BufferBase(device, descriptor) { const OpenGLFunctions& gl = device->GetGL(); // Allocate at least 4 bytes so clamped accesses are always in bounds. - mAllocatedSize = std::max(GetSize(), uint64_t(4u)); + // Align with 4 byte to avoid out-of-bounds access issue in compute emulation for 2 byte + // element. + mAllocatedSize = Align(std::max(GetSize(), uint64_t(4u)), uint64_t(4u)); gl.GenBuffers(1, &mBuffer); gl.BindBuffer(GL_ARRAY_BUFFER, mBuffer); diff --git a/src/dawn/native/opengl/PhysicalDeviceGL.cpp b/src/dawn/native/opengl/PhysicalDeviceGL.cpp index b17bccd0ad..e12b4420a6 100644 --- a/src/dawn/native/opengl/PhysicalDeviceGL.cpp +++ b/src/dawn/native/opengl/PhysicalDeviceGL.cpp @@ -213,6 +213,19 @@ void PhysicalDevice::SetupBackendDeviceToggles(TogglesState* deviceToggles) cons // For OpenGL ES, we must use a placeholder fragment shader for vertex-only render pipeline. deviceToggles->Default(Toggle::UsePlaceholderFragmentInVertexOnlyPipeline, gl.GetVersion().IsES()); + // For OpenGL/OpenGL ES, use compute shader blit to emulate depth16unorm texture to buffer + // copies. + // Disable Angle on windows as it seems to have side-effect. +#if DAWN_PLATFORM_IS(WINDOWS) + const bool kIsAngleOnWindows = mName.find("ANGLE") != std::string::npos; +#else + constexpr bool kIsAngleOnWindows = false; +#endif + deviceToggles->Default(Toggle::UseBlitForDepth16UnormTextureToBufferCopy, !kIsAngleOnWindows); + + // For OpenGL ES, use compute shader blit to emulate depth32float texture to buffer copies. + deviceToggles->Default(Toggle::UseBlitForDepth32FloatTextureToBufferCopy, + gl.GetVersion().IsES() && !kIsAngleOnWindows); } ResultOrError> PhysicalDevice::CreateDeviceImpl(AdapterBase* adapter, diff --git a/src/dawn/tests/end2end/DepthStencilCopyTests.cpp b/src/dawn/tests/end2end/DepthStencilCopyTests.cpp index 261909f2aa..654369e48e 100644 --- a/src/dawn/tests/end2end/DepthStencilCopyTests.cpp +++ b/src/dawn/tests/end2end/DepthStencilCopyTests.cpp @@ -41,8 +41,63 @@ constexpr std::array kValidDepthCopyFromBufferFormats = wgpu::TextureFormat::Depth16Unorm, }; +uint32_t GetBytesPerPixel(wgpu::TextureFormat format, wgpu::TextureAspect aspect) { + uint32_t bytesPerPixel = 0; + switch (format) { + case wgpu::TextureFormat::Depth24PlusStencil8: { + ASSERT(aspect == wgpu::TextureAspect::StencilOnly); + bytesPerPixel = 1; + break; + } + case wgpu::TextureFormat::Depth32FloatStencil8: { + switch (aspect) { + case wgpu::TextureAspect::DepthOnly: + bytesPerPixel = 4; + break; + case wgpu::TextureAspect::StencilOnly: + bytesPerPixel = 1; + break; + default: + UNREACHABLE(); + break; + } + break; + } + default: + bytesPerPixel = utils::GetTexelBlockSizeInBytes(format); + break; + } + return bytesPerPixel; +} + +// Bytes of unorm 16 of 0.23 is 0x3AE1, of which the 2 bytes are different. +// This helps better test unorm 16 compute emulation path. +constexpr float kInitDepth = 0.23f; + +// Bytes of unorm 16 of 0.23 is 0xB0A3. +// Use a non-zero clear depth to better test unorm16 compute emulation path. +constexpr float kClearDepth = 0.69f; + class DepthStencilCopyTests : public DawnTestWithParams { protected: + void MapAsyncAndWait(const wgpu::Buffer& buffer, + wgpu::MapMode mode, + size_t offset, + size_t size) { + bool done = false; + buffer.MapAsync( + mode, offset, size, + [](WGPUBufferMapAsyncStatus status, void* userdata) { + ASSERT_EQ(WGPUBufferMapAsyncStatus_Success, status); + *static_cast(userdata) = true; + }, + &done); + + while (!done) { + WaitABit(); + } + } + void SetUp() override { DawnTestWithParams::SetUp(); @@ -91,9 +146,10 @@ class DepthStencilCopyTests : public DawnTestWithParams> testLevel; uint32_t copyHeight = textureHeight >> testLevel; - wgpu::Extent3D copySize = {copyWidth, copyHeight, 1}; - - constexpr uint32_t kBytesPerRow = kTextureBytesPerRowAlignment; wgpu::BufferDescriptor bufferDescriptor = {}; bufferDescriptor.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst; bufferDescriptor.size = - bufferCopyOffset + BufferSizeForTextureCopy(copyWidth, copyHeight, 1, - GetParam().mTextureFormat, - wgpu::TextureAspect::DepthOnly); + bufferCopyOffset + + BufferSizeForTextureCopy(copyWidth, copyHeight, textureArrayLayerCount, + GetParam().mTextureFormat, wgpu::TextureAspect::DepthOnly); wgpu::Buffer destinationBuffer = device.CreateBuffer(&bufferDescriptor); + DoCopyFromDepthTestWithBuffer(destinationBuffer, bufferCopyOffset, initDepth, textureWidth, + textureHeight, textureArrayLayerCount, testLevel, true); + } + + void DoCopyFromDepthTestWithBuffer(wgpu::Buffer destinationBuffer, + uint32_t bufferCopyOffset, + float initDepth, + uint32_t textureWidth, + uint32_t textureHeight, + uint32_t textureArrayLayerCount, + uint32_t testLevel, + bool checkBufferContent) { + // TODO(crbug.com/dawn/1291): Compute emulation path fails for Angle on Windows. + DAWN_SUPPRESS_TEST_IF(IsANGLE() && IsWindows()); + + uint32_t mipLevelCount = testLevel + 1; + wgpu::Texture texture = + CreateTexture(textureWidth, textureHeight, + wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopySrc, + mipLevelCount, textureArrayLayerCount); + + // Initialize other mip levels with different init values for better testing + constexpr float garbageDepth = 0.123456789f; + ASSERT(initDepth != garbageDepth); + + for (uint32_t level = 0; level < mipLevelCount; level++) { + float regionDepth = (level == testLevel) ? initDepth : garbageDepth; + InitializeDepthStencilTextureRegion(texture, kClearDepth, regionDepth, 0, 0, level, 0, + textureArrayLayerCount); + } + + uint32_t copyWidth = textureWidth >> testLevel; + uint32_t copyHeight = textureHeight >> testLevel; + wgpu::Extent3D copySize = {copyWidth, copyHeight, textureArrayLayerCount}; + + wgpu::TextureFormat format = GetParam().mTextureFormat; + constexpr wgpu::TextureAspect aspect = wgpu::TextureAspect::DepthOnly; + uint32_t bytesPerPixel = GetBytesPerPixel(format, aspect); + uint32_t bytesPerRow = Align(copyWidth * bytesPerPixel, kTextureBytesPerRowAlignment); + uint32_t bytesPerImage = bytesPerRow * copyHeight; + wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); - wgpu::ImageCopyTexture imageCopyTexture = utils::CreateImageCopyTexture( - texture, testLevel, {0, 0, 0}, wgpu::TextureAspect::DepthOnly); + wgpu::ImageCopyTexture imageCopyTexture = + utils::CreateImageCopyTexture(texture, testLevel, {0, 0, 0}, aspect); wgpu::ImageCopyBuffer imageCopyBuffer = utils::CreateImageCopyBuffer( - destinationBuffer, bufferCopyOffset, kBytesPerRow, copyHeight); + destinationBuffer, bufferCopyOffset, bytesPerRow, copyHeight); encoder.CopyTextureToBuffer(&imageCopyTexture, &imageCopyBuffer, ©Size); wgpu::CommandBuffer commandBuffer = encoder.Finish(); queue.Submit(1, &commandBuffer); - if (GetParam().mTextureFormat == wgpu::TextureFormat::Depth16Unorm) { - uint16_t expected = FloatToUnorm(initDepth); - std::vector expectedData = { - 0, 0, 0, 0, // - 0, 0, 0, 0, // - expected, expected, 0, 0, // - expected, expected, 0, 0, // - }; + if (checkBufferContent) { + // Expected data pattern is that initDepth value at bottom left corner, while other + // region is kClearDepth. Data of each layer is the same. + if (format == wgpu::TextureFormat::Depth16Unorm) { + uint16_t expected = FloatToUnorm(initDepth); + uint16_t cleared = FloatToUnorm(kClearDepth); + std::vector expectedData(copyWidth * copyHeight, cleared); + for (uint32_t y = copyHeight / 2; y < copyHeight; y++) { + auto rowStart = expectedData.data() + y * copyWidth; + std::fill(rowStart, rowStart + copyWidth / 2, expected); + } - for (uint32_t y = 0; y < copyHeight; ++y) { - EXPECT_BUFFER_U16_RANGE_EQ(expectedData.data() + copyWidth * y, destinationBuffer, - bufferCopyOffset + y * kBytesPerRow, copyWidth); - } + for (uint32_t z = 0; z < textureArrayLayerCount; ++z) { + uint32_t bufferOffsetPerArrayLayer = bytesPerImage * z; + for (uint32_t y = 0; y < copyHeight; ++y) { + EXPECT_BUFFER_U16_RANGE_EQ( + expectedData.data() + copyWidth * y, destinationBuffer, + bufferCopyOffset + bufferOffsetPerArrayLayer + y * bytesPerRow, + copyWidth); + } + } + } else { + std::vector expectedData(copyWidth * copyHeight, kClearDepth); + for (uint32_t y = copyHeight / 2; y < copyHeight; y++) { + auto rowStart = expectedData.data() + y * copyWidth; + std::fill(rowStart, rowStart + copyWidth / 2, initDepth); + } - } else { - std::vector expectedData = { - 0.0, 0.0, 0.0, 0.0, // - 0.0, 0.0, 0.0, 0.0, // - initDepth, initDepth, 0.0, 0.0, // - initDepth, initDepth, 0.0, 0.0, // - }; - - for (uint32_t y = 0; y < copyHeight; ++y) { - EXPECT_BUFFER_FLOAT_RANGE_EQ(expectedData.data() + copyWidth * y, destinationBuffer, - bufferCopyOffset + y * kBytesPerRow, copyWidth); + for (uint32_t z = 0; z < textureArrayLayerCount; ++z) { + uint32_t bufferOffsetPerArrayLayer = bytesPerImage * z; + for (uint32_t y = 0; y < copyHeight; ++y) { + EXPECT_BUFFER_FLOAT_RANGE_EQ( + expectedData.data() + copyWidth * y, destinationBuffer, + bufferCopyOffset + bufferOffsetPerArrayLayer + y * bytesPerRow, + copyWidth); + } + } } } } @@ -518,34 +597,185 @@ class DepthCopyTests : public DepthStencilCopyTests { // Test copying the depth-only aspect into a buffer. TEST_P(DepthCopyTests, FromDepthAspect) { - constexpr float kInitDepth = 0.2f; constexpr uint32_t kBufferCopyOffset = 0; - constexpr uint32_t kWidth = 4; - constexpr uint32_t kHeight = 4; constexpr uint32_t kTestLevel = 0; - DoCopyFromDepthTest(kBufferCopyOffset, kInitDepth, kWidth, kHeight, kTestLevel); + constexpr uint32_t kTestTextureSizes[][2] = { + // Original test parameter + {4, 4}, + // Only 1 pixel at bottom left has value, test compute emulation path for unorm 16 + {2, 2}, + // Odd number texture width to test compute emulation path for unorm 16 + {3, 3}, + // float 32 needs bytesPerRow alignment + {65, 1}, + // unorm 16 and float 32 need bytesPerRow alignment + {129, 1}, + }; + constexpr uint32_t kTestTextureArrayLayerCounts[] = { + 1, + 2, + }; + + for (const uint32_t sizeZ : kTestTextureArrayLayerCounts) { + for (const auto& size : kTestTextureSizes) { + DoCopyFromDepthTest(kBufferCopyOffset, kInitDepth, size[0], size[1], sizeZ, kTestLevel); + } + } } // Test copying the depth-only aspect into a buffer at a non-zero offset. TEST_P(DepthCopyTests, FromDepthAspectToBufferAtNonZeroOffset) { - constexpr float kInitDepth = 0.2f; - constexpr uint32_t kWidth = 4; - constexpr uint32_t kHeight = 4; constexpr uint32_t kTestLevel = 0; - constexpr std::array kBufferCopyOffsets = {4u, 512u}; - for (uint32_t offset : kBufferCopyOffsets) { - DoCopyFromDepthTest(offset, kInitDepth, kWidth, kHeight, kTestLevel); + constexpr uint32_t kBufferCopyOffsets[] = {4u, 512u}; + constexpr uint32_t kTestTextureSizes[][2] = { + // Original test parameter + {4, 4}, + // Only 1 pixel at bottom left has value, test compute emulation path for unorm 16 + {2, 2}, + // Odd number texture width to test compute emulation path for unorm 16 + {3, 3}, + // float 32 needs bytesPerRow alignment + {65, 1}, + // unorm 16 and float 32 need bytesPerRow alignment + {129, 1}, + }; + constexpr uint32_t kTestTextureArrayLayerCounts[] = { + 1, + 2, + }; + for (const uint32_t sizeZ : kTestTextureArrayLayerCounts) { + for (uint32_t offset : kBufferCopyOffsets) { + for (const auto& size : kTestTextureSizes) { + DoCopyFromDepthTest(offset, kInitDepth, size[0], size[1], sizeZ, kTestLevel); + } + } } } // Test copying the non-zero mip, depth-only aspect into a buffer. TEST_P(DepthCopyTests, FromNonZeroMipDepthAspect) { - constexpr float kInitDepth = 0.2f; constexpr uint32_t kBufferCopyOffset = 0; constexpr uint32_t kWidth = 9; constexpr uint32_t kHeight = 9; - constexpr uint32_t kTestLevel = 1; - DoCopyFromDepthTest(kBufferCopyOffset, kInitDepth, kWidth, kHeight, kTestLevel); + DoCopyFromDepthTest(kBufferCopyOffset, kInitDepth, kWidth, kHeight, 1, 1); + DoCopyFromDepthTest(kBufferCopyOffset, kInitDepth, kWidth, kHeight, 2, 2); +} + +// Test buffer content outside of copy extent is preserved. +// This test is made specifially for compute blit for depth16unorm emulation path. +// The texel size is 2 byte, while in the compute shader we have to write 4 byte at a time. +// When the copy extent width is an odd number, buffer content outside of the copy range is +// inevitably written. So we need to make sure the original content of the buffer that's outside of +// the copy extent is still correctly preserved. +TEST_P(DepthCopyTests, PreserveBufferContent) { + constexpr uint32_t kBufferCopyOffsets[] = {0u, 4u, 512u}; + constexpr uint32_t kTestTextureSizes[][2] = { + {1, 1}, + {1, 2}, + {3, 3}, + }; + constexpr uint32_t kExtraBufferSize[] = {0u, 4u}; + const uint32_t kSizeZ = 1; + constexpr uint32_t kTestLevel = 0; + + wgpu::TextureFormat format = GetParam().mTextureFormat; + constexpr wgpu::TextureAspect aspect = wgpu::TextureAspect::DepthOnly; + + for (uint32_t extraBufferSize : kExtraBufferSize) { + for (uint32_t offset : kBufferCopyOffsets) { + for (const auto& size : kTestTextureSizes) { + wgpu::BufferDescriptor bufferDescriptor = {}; + // Add wgpu::BufferUsage::MapRead to check the buffer content with mapAsync + bufferDescriptor.usage = wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead; + bufferDescriptor.size = + extraBufferSize + offset + + BufferSizeForTextureCopy(size[0], size[1], kSizeZ, format, aspect); + bufferDescriptor.mappedAtCreation = true; + wgpu::Buffer buffer = device.CreateBuffer(&bufferDescriptor); + constexpr uint8_t kOriginalValue = 0xff; + { + // Fill the buffer with an original value other than 0 to check they are + // incorrectly overwritten outside of the copy range. + uint8_t* ptr = static_cast(buffer.GetMappedRange()); + std::fill(ptr, ptr + bufferDescriptor.size, kOriginalValue); + buffer.Unmap(); + } + + // Don't check copy region content because the buffer doesn't have + // wgpu::BufferUsage::CopySrc usage. + DoCopyFromDepthTestWithBuffer(buffer, offset, kInitDepth, size[0], size[1], kSizeZ, + kTestLevel, false); + + std::vector expected(bufferDescriptor.size, kOriginalValue); + // Get the offset of the end of the copy range (without aligning with 4 bytes) + uint32_t bufferEndOffset = + offset + + BufferSizeForTextureCopy(size[0], size[1], kSizeZ, format, aspect, false); + if (bufferDescriptor.size > bufferEndOffset) { + // Cannot use EXPECT_BUFFER_* helper here because it needs to align the copy + // size to a multiple of 4 bytes to call CopyBufferToBuffer. We are checking + // against format like Depth16Unorm so we need offset to be multiple of 2. + MapAsyncAndWait(buffer, wgpu::MapMode::Read, 0, wgpu::kWholeMapSize); + const uint8_t* ptr = static_cast(buffer.GetConstMappedRange()); + + // Check the content before copy range. + for (uint32_t i = 0; i < offset; i++) { + EXPECT_EQ(ptr[i], kOriginalValue); + } + + // Check the content after copy range. + uint32_t checkSize = bufferDescriptor.size - bufferEndOffset; + for (uint32_t i = 0; i < checkSize; i++) { + EXPECT_EQ(ptr[bufferEndOffset + i], kOriginalValue); + } + buffer.Unmap(); + } + } + } + } +} + +// Test compact buffer size edge case. +// This test is made specifially for compute blit for depth16unorm emulation path. +// When format is depth16unorm and width is an odd number, the size of the most compact buffer copy +// target can be something that's not a multiple of 4. We need to make sure access don't go out of +// bounds in the shader, when still writing to array in the compute shader. +TEST_P(DepthCopyTests, BufferCopySizeEdgeCase) { + constexpr uint32_t kBufferCopyOffsets[] = {0u, 4u, 512u}; + constexpr uint32_t kTestTextureSizes[][2] = { + // Cannot create compact copy buffer for {1, 1} here as storage buffer binding requires size + // of at least 4 bytes. + {3, 1}, + {1, 2}, + {3, 3}, + }; + const uint32_t kSizeZ = 1; + constexpr uint32_t kTestLevel = 0; + + wgpu::TextureFormat format = GetParam().mTextureFormat; + constexpr wgpu::TextureAspect aspect = wgpu::TextureAspect::DepthOnly; + + for (uint32_t offset : kBufferCopyOffsets) { + for (const auto& size : kTestTextureSizes) { + wgpu::BufferDescriptor bufferDescriptor = {}; + // Add wgpu::BufferUsage::MapRead to check the buffer content with mapAsync + bufferDescriptor.usage = wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead; + bufferDescriptor.size = + offset + + // Don't align for 4 bytes to get smallest possible buffer for depth16unorm. + BufferSizeForTextureCopy(size[0], size[1], kSizeZ, format, aspect, false); + wgpu::Buffer buffer = device.CreateBuffer(&bufferDescriptor); + + // Don't check copy region content because the buffer doesn't have + // wgpu::BufferUsage::CopySrc usage. + DoCopyFromDepthTestWithBuffer(buffer, offset, kInitDepth, size[0], size[1], kSizeZ, + kTestLevel, false); + + // Unable to check the result since either MapAsync and CopyBufferToBuffer requires size + // to be multiple of 4 bytes. + // Just run and don't crash on ASSERT. + } + } } class DepthCopyFromBufferTests : public DepthStencilCopyTests { @@ -582,8 +812,6 @@ class DepthCopyFromBufferTests : public DepthStencilCopyTests { destTexture, 0, {0, 0, 0}, wgpu::TextureAspect::DepthOnly); wgpu::Extent3D extent = {kWidth, kHeight, 1}; - constexpr float kInitDepth = 0.2f; - // This expectation is the test as it performs the CopyTextureToBuffer. if (GetParam().mTextureFormat == wgpu::TextureFormat::Depth16Unorm) { uint16_t expected = FloatToUnorm(kInitDepth);