From e1219315635a0aa2a92c56c525154d1b6e962af0 Mon Sep 17 00:00:00 2001
From: shrekshao <shrekshao@google.com>
Date: Wed, 17 May 2023 22:25:34 +0000
Subject: [PATCH] Compat GL/GLES: blit a depth texture to a buffer using
 compute

Add compute blit emulation path for Depth16Unorm/Depth32Float
depth textures in OpenGL/OpenGLES backend.

Extend DepthCopyTests for better test coverage, especially for
testing Depth16Unorm compute emulation path.

Bug: dawn:1782, dawn:1291, dawn:1237
Change-Id: I02f6e10d13e8b0080b412a9f9a6d62b1e470ac9c
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/130820
Auto-Submit: Shrek Shao <shrekshao@google.com>
Reviewed-by: Stephen White <senorblanco@chromium.org>
Kokoro: Kokoro <noreply+kokoro@google.com>
Reviewed-by: Austin Eng <enga@chromium.org>
Commit-Queue: Shrek Shao <shrekshao@google.com>
---
 src/dawn/native/BUILD.gn                      |   2 +
 src/dawn/native/BlitDepthStencilToBuffer.cpp  | 361 ++++++++++++++
 src/dawn/native/BlitDepthStencilToBuffer.h    |  38 ++
 src/dawn/native/Buffer.cpp                    |   7 +
 src/dawn/native/Buffer.h                      |   1 +
 src/dawn/native/CMakeLists.txt                |   2 +
 src/dawn/native/CommandEncoder.cpp            |  79 +++
 src/dawn/native/CommandEncoder.h              |   5 +
 src/dawn/native/CommandValidation.cpp         |  13 +-
 src/dawn/native/CommandValidation.h           |   5 +-
 src/dawn/native/InternalPipelineStore.h       |   3 +
 src/dawn/native/Texture.cpp                   |   7 +
 src/dawn/native/Toggles.cpp                   |  10 +
 src/dawn/native/Toggles.h                     |   2 +
 src/dawn/native/opengl/BufferGL.cpp           |   4 +-
 src/dawn/native/opengl/PhysicalDeviceGL.cpp   |  13 +
 .../tests/end2end/DepthStencilCopyTests.cpp   | 452 +++++++++++++-----
 17 files changed, 888 insertions(+), 116 deletions(-)
 create mode 100644 src/dawn/native/BlitDepthStencilToBuffer.cpp
 create mode 100644 src/dawn/native/BlitDepthStencilToBuffer.h
diff --git a/src/dawn/native/BUILD.gn b/src/dawn/native/BUILD.gn
index e56d9d9515..24853ced15 100644
--- a/src/dawn/native/BUILD.gn
+++ b/src/dawn/native/BUILD.gn
@@ -203,6 +203,8 @@ source_set("sources") {
     "BindingInfo.h",
     "BlitBufferToDepthStencil.cpp",
     "BlitBufferToDepthStencil.h",
+    "BlitDepthStencilToBuffer.cpp",
+    "BlitDepthStencilToBuffer.h",
     "BlitDepthToDepth.cpp",
     "BlitDepthToDepth.h",
     "Blob.cpp",
diff --git a/src/dawn/native/BlitDepthStencilToBuffer.cpp b/src/dawn/native/BlitDepthStencilToBuffer.cpp
new file mode 100644
index 0000000000..a6795ff1c5
--- /dev/null
+++ b/src/dawn/native/BlitDepthStencilToBuffer.cpp
@@ -0,0 +1,361 @@
+// Copyright 2023 The Dawn Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dawn/native/BlitDepthStencilToBuffer.h"
+
+#include <utility>
+
+#include "dawn/common/Assert.h"
+#include "dawn/native/BindGroup.h"
+#include "dawn/native/CommandBuffer.h"
+#include "dawn/native/CommandEncoder.h"
+#include "dawn/native/ComputePassEncoder.h"
+#include "dawn/native/ComputePipeline.h"
+#include "dawn/native/Device.h"
+#include "dawn/native/InternalPipelineStore.h"
+#include "dawn/native/Queue.h"
+#include "dawn/native/utils/WGPUHelpers.h"
+
+namespace dawn::native {
+
+namespace {
+
+constexpr uint32_t kWorkgroupSizeX = 8;
+constexpr uint32_t kWorkgroupSizeY = 8;
+
+constexpr char kBlitDepth32FloatToBufferShaders[] = R"(
+@group(0) @binding(0) var src_tex : texture_depth_2d_array;
+@group(0) @binding(1) var<storage, read_write> dst_buf : array<f32>;
+
+struct Params {
+    // copyExtent
+    srcOrigin: vec3u,
+    pad0: u32,
+    srcExtent: vec3u,
+    pad1: u32,
+
+    // GPUImageDataLayout
+    indicesPerRow: u32,
+    rowsPerImage: u32,
+    indicesOffset: u32,
+};
+
+@group(0) @binding(2) var<uniform> params : Params;
+
+override workgroupSizeX: u32;
+override workgroupSizeY: u32;
+
+// Load the depth value and write to storage buffer.
+@compute @workgroup_size(workgroupSizeX, workgroupSizeY, 1) fn blit_depth_to_buffer(@builtin(global_invocation_id) id : vec3u) {
+    let srcBoundary = params.srcOrigin + params.srcExtent;
+    let coord = id + params.srcOrigin;
+    if (any(coord >= srcBoundary)) {
+        return;
+    }
+
+    let dstOffset = params.indicesOffset + id.x + id.y * params.indicesPerRow + id.z * params.indicesPerRow * params.rowsPerImage;
+    dst_buf[dstOffset] = textureLoad(src_tex, coord.xy, coord.z, 0);
+}
+
+)";
+
+// ShaderF16 extension is only enabled by GL_AMD_gpu_shader_half_float for GL
+// so we should not use it generally for the emulation.
+// As a result we are using f32 and array<u32> to do all the math and byte manipulation.
+// If we have 2-byte scalar type (f16, u16) it can be a bit easier when writing to the storage
+// buffer.
+
+constexpr char kBlitDepth16UnormToBufferShaders[] = R"(
+@group(0) @binding(0) var src_tex : texture_depth_2d_array;
+@group(0) @binding(1) var<storage, read_write> dst_buf : array<u32>;
+
+struct Params {
+    // copyExtent
+    srcOrigin: vec3u,
+    pad0: u32,
+    srcExtent: vec3u,
+    pad1: u32,
+
+    // GPUImageDataLayout
+    indicesPerRow: u32,
+    rowsPerImage: u32,
+    indicesOffset: u32,
+};
+
+@group(0) @binding(2) var<uniform> params : Params;
+
+// Range of v is [0.0, 1.0]
+fn getUnorm16Bits(v: f32) -> u32 {
+    var bits: u32 = u32(v * 65535.0);
+    return bits;
+}
+
+override workgroupSizeX: u32;
+override workgroupSizeY: u32;
+
+// Load the depth value and write to storage buffer.
+// Each thread is responsible for reading 2 u16 values and packing them into 1 u32 value.
+@compute @workgroup_size(workgroupSizeX, workgroupSizeY, 1) fn blit_depth_to_buffer(@builtin(global_invocation_id) id : vec3u) {
+    let srcBoundary = params.srcOrigin + params.srcExtent;
+    let coord0 = vec3u(id.x * 2, id.y, id.z) + params.srcOrigin;
+
+    if (any(coord0 >= srcBoundary)) {
+        return;
+    }
+
+    let v0: f32 = textureLoad(src_tex, coord0.xy, coord0.z, 0);
+    let r0: u32 = getUnorm16Bits(v0);
+
+    let dstOffset = params.indicesOffset + id.x + id.y * params.indicesPerRow + id.z * params.indicesPerRow * params.rowsPerImage;
+
+    var result: u32 = r0;
+    let coord1 = coord0 + vec3u(1, 0, 0);
+    if (coord1.x < srcBoundary.x) {
+        // Make sure coord1 is still within the copy boundary
+        // then read and write this value.
+        let v1: f32 = textureLoad(src_tex, coord1.xy, coord1.z, 0);
+        let r1: u32 = getUnorm16Bits(v1);
+        result += (r1 << 16);
+    } else {
+        // Otherwise, srcExtent.x is an odd number and this thread is at right edge of the texture
+        // To preserve the original buffer content, we need to read from the buffer and pack it
+        // together with r0 to avoid it being overwritten.
+        // TODO(dawn:1782): profiling against making a separate pass for this edge case
+        // as it require reading from dst_buf.
+        let original: u32 = dst_buf[dstOffset];
+        result += original & 0xffff0000;
+    }
+
+    dst_buf[dstOffset] = result;
+}
+)";
+
+ResultOrError<Ref<ComputePipelineBase>> CreateDepthBlitComputePipeline(DeviceBase* device,
+                                                                       InternalPipelineStore* store,
+                                                                       wgpu::TextureFormat format) {
+    ShaderModuleWGSLDescriptor wgslDesc = {};
+    ShaderModuleDescriptor shaderModuleDesc = {};
+    shaderModuleDesc.nextInChain = &wgslDesc;
+    switch (format) {
+        case wgpu::TextureFormat::Depth16Unorm:
+            wgslDesc.source = kBlitDepth16UnormToBufferShaders;
+            break;
+        case wgpu::TextureFormat::Depth32Float:
+            wgslDesc.source = kBlitDepth32FloatToBufferShaders;
+            break;
+        default:
+            UNREACHABLE();
+            break;
+    }
+
+    Ref<ShaderModuleBase> shaderModule;
+    DAWN_TRY_ASSIGN(shaderModule, device->CreateShaderModule(&shaderModuleDesc));
+
+    Ref<BindGroupLayoutBase> bindGroupLayout;
+    DAWN_TRY_ASSIGN(bindGroupLayout,
+                    utils::MakeBindGroupLayout(
+                        device,
+                        {
+                            {0, wgpu::ShaderStage::Compute, wgpu::TextureSampleType::Depth,
+                             wgpu::TextureViewDimension::e2DArray},
+                            {1, wgpu::ShaderStage::Compute, kInternalStorageBufferBinding},
+                            {2, wgpu::ShaderStage::Compute, wgpu::BufferBindingType::Uniform},
+                        },
+                        /* allowInternalBinding */ true));
+
+    Ref<PipelineLayoutBase> pipelineLayout;
+    DAWN_TRY_ASSIGN(pipelineLayout, utils::MakeBasicPipelineLayout(device, bindGroupLayout));
+
+    ComputePipelineDescriptor computePipelineDescriptor = {};
+    computePipelineDescriptor.layout = pipelineLayout.Get();
+    computePipelineDescriptor.compute.module = shaderModule.Get();
+    computePipelineDescriptor.compute.entryPoint = "blit_depth_to_buffer";
+
+    constexpr std::array<ConstantEntry, 2> constants = {{
+        {nullptr, "workgroupSizeX", kWorkgroupSizeX},
+        {nullptr, "workgroupSizeY", kWorkgroupSizeY},
+    }};
+    computePipelineDescriptor.compute.constantCount = constants.size();
+    computePipelineDescriptor.compute.constants = constants.data();
+
+    Ref<ComputePipelineBase> pipeline;
+    DAWN_TRY_ASSIGN(pipeline, device->CreateComputePipeline(&computePipelineDescriptor));
+
+    switch (format) {
+        case wgpu::TextureFormat::Depth16Unorm:
+            store->blitDepth16UnormToBufferComputePipeline = pipeline;
+            break;
+        case wgpu::TextureFormat::Depth32Float:
+            store->blitDepth32FloatToBufferComputePipeline = pipeline;
+            break;
+        default:
+            UNREACHABLE();
+            break;
+    }
+    return pipeline;
+}
+
+ResultOrError<Ref<ComputePipelineBase>> GetOrCreateDepth32FloatToBufferPipeline(
+    DeviceBase* device) {
+    InternalPipelineStore* store = device->GetInternalPipelineStore();
+    if (store->blitDepth32FloatToBufferComputePipeline != nullptr) {
+        return store->blitDepth32FloatToBufferComputePipeline;
+    }
+
+    Ref<ComputePipelineBase> pipeline;
+    DAWN_TRY_ASSIGN(
+        pipeline, CreateDepthBlitComputePipeline(device, store, wgpu::TextureFormat::Depth32Float));
+
+    return pipeline;
+}
+
+ResultOrError<Ref<ComputePipelineBase>> GetOrCreateDepth16UnormToBufferPipeline(
+    DeviceBase* device) {
+    InternalPipelineStore* store = device->GetInternalPipelineStore();
+    if (store->blitDepth16UnormToBufferComputePipeline != nullptr) {
+        return store->blitDepth16UnormToBufferComputePipeline;
+    }
+
+    Ref<ComputePipelineBase> pipeline;
+    DAWN_TRY_ASSIGN(
+        pipeline, CreateDepthBlitComputePipeline(device, store, wgpu::TextureFormat::Depth16Unorm));
+    return pipeline;
+}
+
+}  // anonymous namespace
+
+MaybeError BlitDepthToBuffer(DeviceBase* device,
+                             CommandEncoder* commandEncoder,
+                             const TextureCopy& src,
+                             const BufferCopy& dst,
+                             const Extent3D& copyExtent) {
+    const Format& format = src.texture->GetFormat();
+
+    Ref<BufferBase> destinationBuffer = dst.buffer;
+    bool useIntermediateCopyBuffer = false;
+    if (format.format == wgpu::TextureFormat::Depth16Unorm && dst.buffer->GetSize() % 4 != 0 &&
+        copyExtent.width % 2 != 0) {
+        // This path is made for OpenGL/GLES depth16unorm bliting a texture with an odd width,
+        // to a compact buffer. When we copy the last texel, we inevitably need to access an
+        // out of bounds location given by dst.buffer.size as we use array<u32> in the shader for
+        // the storage buffer. Although the allocated size of dst.buffer is aligned to 4 bytes for
+        // OpenGL/GLES backend, the size of the storage buffer binding for the shader is not. Thus
+        // we make an intermediate buffer aligned to 4 bytes for the compute shader to safely
+        // access, and perform an additional buffer to buffer copy at the end. This path should be
+        // hit rarely.
+        useIntermediateCopyBuffer = true;
+        BufferDescriptor descriptor = {};
+        descriptor.size = Align(dst.buffer->GetSize(), 4);
+        // TODO(dawn:1485): adding CopyDst usage to add kInternalStorageBuffer usage internally.
+        descriptor.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
+        DAWN_TRY_ASSIGN(destinationBuffer, device->CreateBuffer(&descriptor));
+    }
+
+    Ref<ComputePipelineBase> pipeline;
+    uint32_t workgroupCountX = 1;
+    uint32_t workgroupCountY = 1;
+    uint32_t workgroupCountZ = copyExtent.depthOrArrayLayers;
+    switch (format.format) {
+        case wgpu::TextureFormat::Depth16Unorm:
+            // One thread is responsible for writing two texel values (x, y) and (x+1, y).
+            workgroupCountX = (copyExtent.width + 2 * kWorkgroupSizeX - 1) / (2 * kWorkgroupSizeX);
+            workgroupCountY = (copyExtent.height + kWorkgroupSizeY - 1) / kWorkgroupSizeY;
+            DAWN_TRY_ASSIGN(pipeline, GetOrCreateDepth16UnormToBufferPipeline(device));
+            break;
+        case wgpu::TextureFormat::Depth32Float:
+            workgroupCountX = (copyExtent.width + kWorkgroupSizeX - 1) / kWorkgroupSizeX;
+            workgroupCountY = (copyExtent.height + kWorkgroupSizeY - 1) / kWorkgroupSizeY;
+            DAWN_TRY_ASSIGN(pipeline, GetOrCreateDepth32FloatToBufferPipeline(device));
+            break;
+        default:
+            // Other formats (e.g. Depth32FloatStencil8) are not supported on OpenGL/OpenGLES where
+            // we enabled this workaround. They only support Depth24PlusStencil8.
+            UNREACHABLE();
+    }
+
+    // Allow internal usages since we need to use the source as a texture binding
+    // and buffer as a storage binding.
+    auto scope = commandEncoder->MakeInternalUsageScope();
+
+    Ref<BindGroupLayoutBase> bindGroupLayout;
+    DAWN_TRY_ASSIGN(bindGroupLayout, pipeline->GetBindGroupLayout(0));
+
+    Ref<BufferBase> uniformBuffer;
+    {
+        BufferDescriptor bufferDesc = {};
+        // Uniform buffer size needs to be multiple of 16 bytes
+        bufferDesc.size = sizeof(uint32_t) * 12;
+        bufferDesc.usage = wgpu::BufferUsage::Uniform;
+        bufferDesc.mappedAtCreation = true;
+        DAWN_TRY_ASSIGN(uniformBuffer, device->CreateBuffer(&bufferDesc));
+
+        uint32_t* params =
+            static_cast<uint32_t*>(uniformBuffer->GetMappedRange(0, bufferDesc.size));
+        // srcOrigin: vec3u
+        params[0] = src.origin.x;
+        params[1] = src.origin.y;
+        // src.origin.z is set at textureView.baseArrayLayer
+        params[2] = 0;
+        // srcExtent: vec3u
+        params[4] = copyExtent.width;
+        params[5] = copyExtent.height;
+        params[6] = copyExtent.depthOrArrayLayers;
+
+        // Turn bytesPerRow, (bytes)offset to use array index as unit
+        // We use array<u32> for depth16unorm copy and array<f32> for depth32float copy
+        // Both array element sizes are 4 bytes.
+        params[8] = dst.bytesPerRow / 4;
+        params[9] = dst.rowsPerImage;
+        params[10] = dst.offset / 4;
+
+        DAWN_TRY(uniformBuffer->Unmap());
+    }
+
+    TextureViewDescriptor viewDesc = {};
+    viewDesc.aspect = wgpu::TextureAspect::DepthOnly;
+    viewDesc.dimension = wgpu::TextureViewDimension::e2DArray;
+    viewDesc.baseMipLevel = src.mipLevel;
+    viewDesc.mipLevelCount = 1;
+    viewDesc.baseArrayLayer = src.origin.z;
+    viewDesc.arrayLayerCount = copyExtent.depthOrArrayLayers;
+
+    Ref<TextureViewBase> srcView;
+    DAWN_TRY_ASSIGN(srcView, src.texture->CreateView(&viewDesc));
+
+    Ref<BindGroupBase> bindGroup;
+    DAWN_TRY_ASSIGN(bindGroup, utils::MakeBindGroup(device, bindGroupLayout,
+                                                    {
+                                                        {0, srcView},
+                                                        {1, destinationBuffer},
+                                                        {2, uniformBuffer},
+                                                    },
+                                                    UsageValidationMode::Internal));
+
+    Ref<ComputePassEncoder> pass = commandEncoder->BeginComputePass();
+    pass->APISetPipeline(pipeline.Get());
+    pass->APISetBindGroup(0, bindGroup.Get());
+    pass->APIDispatchWorkgroups(workgroupCountX, workgroupCountY, workgroupCountZ);
+
+    pass->APIEnd();
+
+    if (useIntermediateCopyBuffer) {
+        ASSERT(destinationBuffer->GetSize() <= dst.buffer->GetAllocatedSize());
+        commandEncoder->InternalCopyBufferToBufferWithAllocatedSize(
+            destinationBuffer.Get(), 0, dst.buffer.Get(), 0, destinationBuffer->GetSize());
+    }
+
+    return {};
+}
+
+}  // namespace dawn::native
diff --git a/src/dawn/native/BlitDepthStencilToBuffer.h b/src/dawn/native/BlitDepthStencilToBuffer.h
new file mode 100644
index 0000000000..ca60b9e929
--- /dev/null
+++ b/src/dawn/native/BlitDepthStencilToBuffer.h
@@ -0,0 +1,38 @@
+// Copyright 2023 The Dawn Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SRC_DAWN_NATIVE_BLITDEPTHSTENCILTOBUFFER_H_
+#define SRC_DAWN_NATIVE_BLITDEPTHSTENCILTOBUFFER_H_
+
+#include "dawn/native/Error.h"
+
+namespace dawn::native {
+
+struct TextureCopy;
+struct BufferCopy;
+
+// BlitDepthToBuffer works around OpenGL/GLES issues of copying depth textures to a buffer.
+// Supported depth texture format: depth16unorm, depth32float
+// It dispatches a compute shader textureLoad from the depth texture and writes to the buffer as a
+// storage buffer.
+
+MaybeError BlitDepthToBuffer(DeviceBase* device,
+                             CommandEncoder* commandEncoder,
+                             const TextureCopy& src,
+                             const BufferCopy& dst,
+                             const Extent3D& copyExtent);
+
+}  // namespace dawn::native
+
+#endif  // SRC_DAWN_NATIVE_BLITDEPTHSTENCILTOBUFFER_H_
diff --git a/src/dawn/native/Buffer.cpp b/src/dawn/native/Buffer.cpp
index fe79de4825..92acd77095 100644
--- a/src/dawn/native/Buffer.cpp
+++ b/src/dawn/native/Buffer.cpp
@@ -172,6 +172,13 @@ BufferBase::BufferBase(DeviceBase* device, const BufferDescriptor* descriptor)
         mUsage |= kInternalStorageBuffer;
     }
 
+    if (mUsage & wgpu::BufferUsage::CopyDst) {
+        if (device->IsToggleEnabled(Toggle::UseBlitForDepth16UnormTextureToBufferCopy) ||
+            device->IsToggleEnabled(Toggle::UseBlitForDepth32FloatTextureToBufferCopy)) {
+            mUsage |= kInternalStorageBuffer;
+        }
+    }
+
     GetObjectTrackingList()->Track(this);
 }
 
diff --git a/src/dawn/native/Buffer.h b/src/dawn/native/Buffer.h
index 3457ed5f41..aa2050e727 100644
--- a/src/dawn/native/Buffer.h
+++ b/src/dawn/native/Buffer.h
@@ -24,6 +24,7 @@
 #include "dawn/native/Forward.h"
 #include "dawn/native/IntegerTypes.h"
 #include "dawn/native/ObjectBase.h"
+#include "dawn/native/UsageValidationMode.h"
 
 #include "dawn/native/dawn_platform.h"
 
diff --git a/src/dawn/native/CMakeLists.txt b/src/dawn/native/CMakeLists.txt
index deda9b0591..8425f21935 100644
--- a/src/dawn/native/CMakeLists.txt
+++ b/src/dawn/native/CMakeLists.txt
@@ -49,6 +49,8 @@ target_sources(dawn_native PRIVATE
     "BindingInfo.h"
     "BlitBufferToDepthStencil.cpp"
     "BlitBufferToDepthStencil.h"
+    "BlitDepthStencilToBuffer.cpp"
+    "BlitDepthStencilToBuffer.h"
     "BlitDepthToDepth.cpp"
     "BlitDepthToDepth.h"
     "Blob.cpp"
diff --git a/src/dawn/native/CommandEncoder.cpp b/src/dawn/native/CommandEncoder.cpp
index 97e8a82e81..476248f9cf 100644
--- a/src/dawn/native/CommandEncoder.cpp
+++ b/src/dawn/native/CommandEncoder.cpp
@@ -23,6 +23,7 @@
 #include "dawn/native/ApplyClearColorValueWithDrawHelper.h"
 #include "dawn/native/BindGroup.h"
 #include "dawn/native/BlitBufferToDepthStencil.h"
+#include "dawn/native/BlitDepthStencilToBuffer.h"
 #include "dawn/native/BlitDepthToDepth.h"
 #include "dawn/native/Buffer.h"
 #include "dawn/native/ChainUtils_autogen.h"
@@ -1159,6 +1160,55 @@ void CommandEncoder::APICopyBufferToBuffer(BufferBase* source,
         destination, destinationOffset, size);
 }
 
+// The internal version of APICopyBufferToBuffer which validates against mAllocatedSize instead of
+// mSize of buffers.
+void CommandEncoder::InternalCopyBufferToBufferWithAllocatedSize(BufferBase* source,
+                                                                 uint64_t sourceOffset,
+                                                                 BufferBase* destination,
+                                                                 uint64_t destinationOffset,
+                                                                 uint64_t size) {
+    mEncodingContext.TryEncode(
+        this,
+        [&](CommandAllocator* allocator) -> MaybeError {
+            if (GetDevice()->IsValidationEnabled()) {
+                DAWN_TRY(GetDevice()->ValidateObject(source));
+                DAWN_TRY(GetDevice()->ValidateObject(destination));
+
+                DAWN_INVALID_IF(source == destination,
+                                "Source and destination are the same buffer (%s).", source);
+
+                DAWN_TRY_CONTEXT(ValidateCopySizeFitsInBuffer(source, sourceOffset, size,
+                                                              BufferSizeType::AllocatedSize),
+                                 "validating source %s copy size against allocated size.", source);
+                DAWN_TRY_CONTEXT(ValidateCopySizeFitsInBuffer(destination, destinationOffset, size,
+                                                              BufferSizeType::AllocatedSize),
+                                 "validating destination %s copy size against allocated size.",
+                                 destination);
+                DAWN_TRY(ValidateB2BCopyAlignment(size, sourceOffset, destinationOffset));
+
+                DAWN_TRY_CONTEXT(ValidateCanUseAs(source, wgpu::BufferUsage::CopySrc),
+                                 "validating source %s usage.", source);
+                DAWN_TRY_CONTEXT(ValidateCanUseAs(destination, wgpu::BufferUsage::CopyDst),
+                                 "validating destination %s usage.", destination);
+            }
+
+            mTopLevelBuffers.insert(source);
+            mTopLevelBuffers.insert(destination);
+
+            CopyBufferToBufferCmd* copy =
+                allocator->Allocate<CopyBufferToBufferCmd>(Command::CopyBufferToBuffer);
+            copy->source = source;
+            copy->sourceOffset = sourceOffset;
+            copy->destination = destination;
+            copy->destinationOffset = destinationOffset;
+            copy->size = size;
+
+            return {};
+        },
+        "encoding internal %s.CopyBufferToBuffer(%s, %u, %s, %u, %u).", this, source, sourceOffset,
+        destination, destinationOffset, size);
+}
+
 void CommandEncoder::APICopyBufferToTexture(const ImageCopyBuffer* source,
                                             const ImageCopyTexture* destination,
                                             const Extent3D* copySize) {
@@ -1286,6 +1336,35 @@ void CommandEncoder::APICopyTextureToBuffer(const ImageCopyTexture* source,
             TextureDataLayout dstLayout = destination->layout;
             ApplyDefaultTextureDataLayoutOptions(&dstLayout, blockInfo, *copySize);
 
+            auto format = source->texture->GetFormat();
+            auto aspect = ConvertAspect(format, source->aspect);
+            if (aspect == Aspect::Depth) {
+                if ((format.format == wgpu::TextureFormat::Depth16Unorm &&
+                     GetDevice()->IsToggleEnabled(
+                         Toggle::UseBlitForDepth16UnormTextureToBufferCopy)) ||
+                    (format.format == wgpu::TextureFormat::Depth32Float &&
+                     GetDevice()->IsToggleEnabled(
+                         Toggle::UseBlitForDepth32FloatTextureToBufferCopy))) {
+                    TextureCopy src;
+                    src.texture = source->texture;
+                    src.origin = source->origin;
+                    src.mipLevel = source->mipLevel;
+                    src.aspect = aspect;
+
+                    BufferCopy dst;
+                    dst.buffer = destination->buffer;
+                    dst.bytesPerRow = destination->layout.bytesPerRow;
+                    dst.rowsPerImage = destination->layout.rowsPerImage;
+                    dst.offset = destination->layout.offset;
+                    DAWN_TRY_CONTEXT(BlitDepthToBuffer(GetDevice(), this, src, dst, *copySize),
+                                     "copying depth aspect from %s to %s using blit workaround.",
+                                     src.texture.Get(), destination->buffer);
+
+                    return {};
+                }
+            }
+            // TODO(crbug.com/dawn/1782): implement emulation for stencil
+
             CopyTextureToBufferCmd* t2b =
                 allocator->Allocate<CopyTextureToBufferCmd>(Command::CopyTextureToBuffer);
             t2b->source.texture = source->texture;
diff --git a/src/dawn/native/CommandEncoder.h b/src/dawn/native/CommandEncoder.h
index 6b9d4a5404..2b6a4e4285 100644
--- a/src/dawn/native/CommandEncoder.h
+++ b/src/dawn/native/CommandEncoder.h
@@ -59,6 +59,11 @@ class CommandEncoder final : public ApiObjectBase {
                                BufferBase* destination,
                                uint64_t destinationOffset,
                                uint64_t size);
+    void InternalCopyBufferToBufferWithAllocatedSize(BufferBase* source,
+                                                     uint64_t sourceOffset,
+                                                     BufferBase* destination,
+                                                     uint64_t destinationOffset,
+                                                     uint64_t size);
     void APICopyBufferToTexture(const ImageCopyBuffer* source,
                                 const ImageCopyTexture* destination,
                                 const Extent3D* copySize);
diff --git a/src/dawn/native/CommandValidation.cpp b/src/dawn/native/CommandValidation.cpp
index 7d5a34a1cd..86abb419d5 100644
--- a/src/dawn/native/CommandValidation.cpp
+++ b/src/dawn/native/CommandValidation.cpp
@@ -175,8 +175,17 @@ ResultOrError<uint64_t> ComputeRequiredBytesInCopy(const TexelBlockInfo& blockIn
 
 MaybeError ValidateCopySizeFitsInBuffer(const Ref<BufferBase>& buffer,
                                         uint64_t offset,
-                                        uint64_t size) {
-    uint64_t bufferSize = buffer->GetSize();
+                                        uint64_t size,
+                                        BufferSizeType checkBufferSizeType) {
+    uint64_t bufferSize = 0;
+    switch (checkBufferSizeType) {
+        case BufferSizeType::Size:
+            bufferSize = buffer->GetSize();
+            break;
+        case BufferSizeType::AllocatedSize:
+            bufferSize = buffer->GetAllocatedSize();
+            break;
+    }
     bool fitsInBuffer = offset <= bufferSize && (size <= (bufferSize - offset));
     DAWN_INVALID_IF(!fitsInBuffer,
                     "Copy range (offset: %u, size: %u) does not fit in %s size (%u).", offset, size,
diff --git a/src/dawn/native/CommandValidation.h b/src/dawn/native/CommandValidation.h
index 53ecfc24e8..4417d6ff12 100644
--- a/src/dawn/native/CommandValidation.h
+++ b/src/dawn/native/CommandValidation.h
@@ -27,6 +27,8 @@
 
 namespace dawn::native {
 
+enum class BufferSizeType { Size, AllocatedSize };
+
 class QuerySetBase;
 struct SyncScopeResourceUsage;
 struct TexelBlockInfo;
@@ -76,7 +78,8 @@ MaybeError ValidateImageCopyTexture(DeviceBase const* device,
 
 MaybeError ValidateCopySizeFitsInBuffer(const Ref<BufferBase>& buffer,
                                         uint64_t offset,
-                                        uint64_t size);
+                                        uint64_t size,
+                                        BufferSizeType checkBufferSizeType = BufferSizeType::Size);
 
 bool IsRangeOverlapped(uint32_t startA, uint32_t startB, uint32_t length);
 
diff --git a/src/dawn/native/InternalPipelineStore.h b/src/dawn/native/InternalPipelineStore.h
index dd0bd69550..3500ed215f 100644
--- a/src/dawn/native/InternalPipelineStore.h
+++ b/src/dawn/native/InternalPipelineStore.h
@@ -60,6 +60,9 @@ struct InternalPipelineStore {
 
     Ref<RenderPipelineBase> blitRG8ToDepth16UnormPipeline;
 
+    Ref<ComputePipelineBase> blitDepth16UnormToBufferComputePipeline;
+    Ref<ComputePipelineBase> blitDepth32FloatToBufferComputePipeline;
+
     struct BlitR8ToStencilPipelines {
         Ref<RenderPipelineBase> clearPipeline;
         std::array<Ref<RenderPipelineBase>, 8> setStencilPipelines;
diff --git a/src/dawn/native/Texture.cpp b/src/dawn/native/Texture.cpp
index 98703e8965..fece48f445 100644
--- a/src/dawn/native/Texture.cpp
+++ b/src/dawn/native/Texture.cpp
@@ -592,6 +592,13 @@ TextureBase::TextureBase(DeviceBase* device,
             AddInternalUsage(wgpu::TextureUsage::RenderAttachment);
         }
     }
+    if (mFormat.HasDepth() &&
+        (device->IsToggleEnabled(Toggle::UseBlitForDepth16UnormTextureToBufferCopy) ||
+         device->IsToggleEnabled(Toggle::UseBlitForDepth32FloatTextureToBufferCopy))) {
+        if (mInternalUsage & wgpu::TextureUsage::CopySrc) {
+            AddInternalUsage(wgpu::TextureUsage::TextureBinding);
+        }
+    }
 }
 
 TextureBase::~TextureBase() = default;
diff --git a/src/dawn/native/Toggles.cpp b/src/dawn/native/Toggles.cpp
index f806c773c7..40765de7c2 100644
--- a/src/dawn/native/Toggles.cpp
+++ b/src/dawn/native/Toggles.cpp
@@ -386,6 +386,16 @@ static constexpr ToggleEnumAndInfoList kToggleNameAndInfoList = {{
       "Use a blit to copy from a depth texture to the nonzero subresource of a depth texture. "
       "Works around an issue where nonzero layers are not written.",
       "https://crbug.com/dawn/1083", ToggleStage::Device}},
+    {Toggle::UseBlitForDepth16UnormTextureToBufferCopy,
+     {"use_blit_for_depth16unorm_texture_to_buffer_copy",
+      "Use a blit instead of a copy command to copy depth aspect of a texture to a buffer."
+      "Workaround for OpenGL and OpenGLES.",
+      "https://crbug.com/dawn/1782", ToggleStage::Device}},
+    {Toggle::UseBlitForDepth32FloatTextureToBufferCopy,
+     {"use_blit_for_depth32float_texture_to_buffer_copy",
+      "Use a blit instead of a copy command to copy depth aspect of a texture to a buffer."
+      "Workaround for OpenGLES.",
+      "https://crbug.com/dawn/1782", ToggleStage::Device}},
     {Toggle::D3D12ReplaceAddWithMinusWhenDstFactorIsZeroAndSrcFactorIsDstAlpha,
      {"d3d12_replace_add_with_minus_when_dst_factor_is_zero_and_src_factor_is_dst_alpha",
       "Replace the blending operation 'Add' with 'Minus' when dstBlendFactor is 'Zero' and "
diff --git a/src/dawn/native/Toggles.h b/src/dawn/native/Toggles.h
index a8c4e10c13..d0b02127ee 100644
--- a/src/dawn/native/Toggles.h
+++ b/src/dawn/native/Toggles.h
@@ -92,6 +92,8 @@ enum class Toggle {
     UseBlitForBufferToDepthTextureCopy,
     UseBlitForBufferToStencilTextureCopy,
     UseBlitForDepthTextureToTextureCopyToNonzeroSubresource,
+    UseBlitForDepth16UnormTextureToBufferCopy,
+    UseBlitForDepth32FloatTextureToBufferCopy,
     D3D12ReplaceAddWithMinusWhenDstFactorIsZeroAndSrcFactorIsDstAlpha,
     D3D12PolyfillReflectVec2F32,
     VulkanClearGen12TextureWithCCSAmbiguateOnCreation,
diff --git a/src/dawn/native/opengl/BufferGL.cpp b/src/dawn/native/opengl/BufferGL.cpp
index db743f1a39..72544b54f0 100644
--- a/src/dawn/native/opengl/BufferGL.cpp
+++ b/src/dawn/native/opengl/BufferGL.cpp
@@ -41,7 +41,9 @@ Buffer::Buffer(Device* device, const BufferDescriptor* descriptor)
     : BufferBase(device, descriptor) {
     const OpenGLFunctions& gl = device->GetGL();
     // Allocate at least 4 bytes so clamped accesses are always in bounds.
-    mAllocatedSize = std::max(GetSize(), uint64_t(4u));
+    // Align with 4 byte to avoid out-of-bounds access issue in compute emulation for 2 byte
+    // element.
+    mAllocatedSize = Align(std::max(GetSize(), uint64_t(4u)), uint64_t(4u));
 
     gl.GenBuffers(1, &mBuffer);
     gl.BindBuffer(GL_ARRAY_BUFFER, mBuffer);
diff --git a/src/dawn/native/opengl/PhysicalDeviceGL.cpp b/src/dawn/native/opengl/PhysicalDeviceGL.cpp
index b17bccd0ad..e12b4420a6 100644
--- a/src/dawn/native/opengl/PhysicalDeviceGL.cpp
+++ b/src/dawn/native/opengl/PhysicalDeviceGL.cpp
@@ -213,6 +213,19 @@ void PhysicalDevice::SetupBackendDeviceToggles(TogglesState* deviceToggles) cons
     // For OpenGL ES, we must use a placeholder fragment shader for vertex-only render pipeline.
     deviceToggles->Default(Toggle::UsePlaceholderFragmentInVertexOnlyPipeline,
                            gl.GetVersion().IsES());
+    // For OpenGL/OpenGL ES, use compute shader blit to emulate depth16unorm texture to buffer
+    // copies.
+    // Disable Angle on windows as it seems to have side-effect.
+#if DAWN_PLATFORM_IS(WINDOWS)
+    const bool kIsAngleOnWindows = mName.find("ANGLE") != std::string::npos;
+#else
+    constexpr bool kIsAngleOnWindows = false;
+#endif
+    deviceToggles->Default(Toggle::UseBlitForDepth16UnormTextureToBufferCopy, !kIsAngleOnWindows);
+
+    // For OpenGL ES, use compute shader blit to emulate depth32float texture to buffer copies.
+    deviceToggles->Default(Toggle::UseBlitForDepth32FloatTextureToBufferCopy,
+                           gl.GetVersion().IsES() && !kIsAngleOnWindows);
 }
 
 ResultOrError<Ref<DeviceBase>> PhysicalDevice::CreateDeviceImpl(AdapterBase* adapter,
diff --git a/src/dawn/tests/end2end/DepthStencilCopyTests.cpp b/src/dawn/tests/end2end/DepthStencilCopyTests.cpp
index 261909f2aa..654369e48e 100644
--- a/src/dawn/tests/end2end/DepthStencilCopyTests.cpp
+++ b/src/dawn/tests/end2end/DepthStencilCopyTests.cpp
@@ -41,8 +41,63 @@ constexpr std::array<wgpu::TextureFormat, 1> kValidDepthCopyFromBufferFormats =
     wgpu::TextureFormat::Depth16Unorm,
 };
 
+uint32_t GetBytesPerPixel(wgpu::TextureFormat format, wgpu::TextureAspect aspect) {
+    uint32_t bytesPerPixel = 0;
+    switch (format) {
+        case wgpu::TextureFormat::Depth24PlusStencil8: {
+            ASSERT(aspect == wgpu::TextureAspect::StencilOnly);
+            bytesPerPixel = 1;
+            break;
+        }
+        case wgpu::TextureFormat::Depth32FloatStencil8: {
+            switch (aspect) {
+                case wgpu::TextureAspect::DepthOnly:
+                    bytesPerPixel = 4;
+                    break;
+                case wgpu::TextureAspect::StencilOnly:
+                    bytesPerPixel = 1;
+                    break;
+                default:
+                    UNREACHABLE();
+                    break;
+            }
+            break;
+        }
+        default:
+            bytesPerPixel = utils::GetTexelBlockSizeInBytes(format);
+            break;
+    }
+    return bytesPerPixel;
+}
+
+// Bytes of unorm 16 of 0.23 is 0x3AE1, of which the 2 bytes are different.
+// This helps better test unorm 16 compute emulation path.
+constexpr float kInitDepth = 0.23f;
+
+// Bytes of unorm 16 of 0.23 is 0xB0A3.
+// Use a non-zero clear depth to better test unorm16 compute emulation path.
+constexpr float kClearDepth = 0.69f;
+
 class DepthStencilCopyTests : public DawnTestWithParams<DepthStencilCopyTestParams> {
   protected:
+    void MapAsyncAndWait(const wgpu::Buffer& buffer,
+                         wgpu::MapMode mode,
+                         size_t offset,
+                         size_t size) {
+        bool done = false;
+        buffer.MapAsync(
+            mode, offset, size,
+            [](WGPUBufferMapAsyncStatus status, void* userdata) {
+                ASSERT_EQ(WGPUBufferMapAsyncStatus_Success, status);
+                *static_cast<bool*>(userdata) = true;
+            },
+            &done);
+
+        while (!done) {
+            WaitABit();
+        }
+    }
+
     void SetUp() override {
         DawnTestWithParams<DepthStencilCopyTestParams>::SetUp();
 
@@ -91,9 +146,10 @@ class DepthStencilCopyTests : public DawnTestWithParams<DepthStencilCopyTestPara
     wgpu::Texture CreateTexture(uint32_t width,
                                 uint32_t height,
                                 wgpu::TextureUsage usage,
-                                uint32_t mipLevelCount = 1) {
+                                uint32_t mipLevelCount = 1,
+                                uint32_t arrayLayerCount = 1) {
         wgpu::TextureDescriptor texDescriptor = {};
-        texDescriptor.size = {width, height, 1};
+        texDescriptor.size = {width, height, arrayLayerCount};
         texDescriptor.format = GetParam().mTextureFormat;
         texDescriptor.usage = usage;
         texDescriptor.mipLevelCount = mipLevelCount;
@@ -133,7 +189,8 @@ class DepthStencilCopyTests : public DawnTestWithParams<DepthStencilCopyTestPara
                                              uint8_t clearStencil,
                                              uint8_t regionStencil,
                                              uint32_t mipLevel = 0,
-                                             uint32_t arrayLayer = 0) {
+                                             uint32_t arrayLayer = 0,
+                                             uint32_t arrayLayerCount = 1) {
         wgpu::TextureFormat format = GetParam().mTextureFormat;
         // Create the render pass used for the initialization.
         utils::ComboRenderPipelineDescriptor renderPipelineDesc;
@@ -160,26 +217,29 @@ class DepthStencilCopyTests : public DawnTestWithParams<DepthStencilCopyTestPara
         }
 
         wgpu::RenderPipeline pipeline = device.CreateRenderPipeline(&renderPipelineDesc);
-
-        // Build the render pass used for initialization.
-        wgpu::TextureViewDescriptor viewDesc = {};
-        viewDesc.baseMipLevel = mipLevel;
-        viewDesc.mipLevelCount = 1;
-        viewDesc.baseArrayLayer = arrayLayer;
-        viewDesc.arrayLayerCount = 1;
-
-        utils::ComboRenderPassDescriptor renderPassDesc({}, texture.CreateView(&viewDesc));
-        renderPassDesc.UnsetDepthStencilLoadStoreOpsForFormat(format);
-        renderPassDesc.cDepthStencilAttachmentInfo.depthClearValue = clearDepth;
-        renderPassDesc.cDepthStencilAttachmentInfo.stencilClearValue = clearStencil;
-
-        // Draw the quad (two triangles)
         wgpu::CommandEncoder commandEncoder = device.CreateCommandEncoder();
-        wgpu::RenderPassEncoder pass = commandEncoder.BeginRenderPass(&renderPassDesc);
-        pass.SetPipeline(pipeline);
-        pass.SetStencilReference(regionStencil);
-        pass.Draw(6);
-        pass.End();
+
+        for (uint32_t curArrayLayer = 0; curArrayLayer < arrayLayerCount; curArrayLayer++) {
+            // Build the render pass used for initialization.
+            wgpu::TextureViewDescriptor viewDesc = {};
+            viewDesc.baseMipLevel = mipLevel;
+            viewDesc.mipLevelCount = 1;
+            viewDesc.baseArrayLayer = arrayLayer + curArrayLayer;
+            viewDesc.arrayLayerCount = 1;
+
+            utils::ComboRenderPassDescriptor renderPassDesc({}, texture.CreateView(&viewDesc));
+            renderPassDesc.UnsetDepthStencilLoadStoreOpsForFormat(format);
+            // TODO(dawn:1782): use different clear values for each array layer.
+            renderPassDesc.cDepthStencilAttachmentInfo.depthClearValue = clearDepth;
+            renderPassDesc.cDepthStencilAttachmentInfo.stencilClearValue = clearStencil;
+
+            // Draw the quad (two triangles)
+            wgpu::RenderPassEncoder pass = commandEncoder.BeginRenderPass(&renderPassDesc);
+            pass.SetPipeline(pipeline);
+            pass.SetStencilReference(regionStencil);
+            pass.Draw(6);
+            pass.End();
+        }
 
         wgpu::CommandBuffer commands = commandEncoder.Finish();
         queue.Submit(1, &commands);
@@ -224,35 +284,19 @@ class DepthStencilCopyTests : public DawnTestWithParams<DepthStencilCopyTestPara
                                       uint32_t height,
                                       uint32_t depth,
                                       wgpu::TextureFormat format = wgpu::TextureFormat::RGBA8Unorm,
-                                      wgpu::TextureAspect aspect = wgpu::TextureAspect::All) {
-        uint32_t bytesPerPixel = 0;
-        switch (format) {
-            case wgpu::TextureFormat::Depth24PlusStencil8: {
-                ASSERT(aspect == wgpu::TextureAspect::StencilOnly);
-                bytesPerPixel = 1;
-                break;
-            }
-            case wgpu::TextureFormat::Depth32FloatStencil8: {
-                switch (aspect) {
-                    case wgpu::TextureAspect::DepthOnly:
-                        bytesPerPixel = 4;
-                        break;
-                    case wgpu::TextureAspect::StencilOnly:
-                        bytesPerPixel = 1;
-                        break;
-                    default:
-                        UNREACHABLE();
-                        break;
-                }
-                break;
-            }
-            default:
-                bytesPerPixel = utils::GetTexelBlockSizeInBytes(format);
-                break;
-        }
+                                      wgpu::TextureAspect aspect = wgpu::TextureAspect::All,
+                                      bool alignForMultipleOf4Bytes = true) {
+        uint32_t bytesPerPixel = GetBytesPerPixel(format, aspect);
 
         uint32_t bytesPerRow = Align(width * bytesPerPixel, kTextureBytesPerRowAlignment);
-        return (bytesPerRow * (height - 1) + width * bytesPerPixel) * depth;
+
+        // Bytes per image before last array layer
+        uint32_t bytesPerImage = bytesPerRow * height;
+
+        uint32_t result =
+            bytesPerImage * (depth - 1) + (bytesPerRow * (height - 1) + width * bytesPerPixel);
+
+        return alignForMultipleOf4Bytes ? Align(result, uint64_t(4)) : result;
     }
 
     wgpu::ShaderModule mVertexModule;
@@ -447,70 +491,105 @@ class DepthCopyTests : public DepthStencilCopyTests {
                              float initDepth,
                              uint32_t textureWidth,
                              uint32_t textureHeight,
+                             uint32_t textureArrayLayerCount,
                              uint32_t testLevel) {
-        // TODO(crbug.com/dawn/1237): Depth16Unorm test failed on OpenGL and OpenGLES which says
-        // Invalid format and type combination in glReadPixels
-        DAWN_TEST_UNSUPPORTED_IF(GetParam().mTextureFormat == wgpu::TextureFormat::Depth16Unorm &&
-                                 (IsOpenGL() || IsOpenGLES()));
-
-        // TODO(crbug.com/dawn/1291): These tests are failing on GLES (both native and ANGLE)
-        // when using Tint/GLSL.
-        DAWN_TEST_UNSUPPORTED_IF(IsOpenGLES());
-
-        uint32_t mipLevelCount = testLevel + 1;
-        wgpu::Texture texture = CreateTexture(
-            textureWidth, textureHeight,
-            wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopySrc, mipLevelCount);
-
-        InitializeDepthStencilTextureRegion(texture, 0.f, initDepth, 0, 0, testLevel);
-
         uint32_t copyWidth = textureWidth >> testLevel;
         uint32_t copyHeight = textureHeight >> testLevel;
-        wgpu::Extent3D copySize = {copyWidth, copyHeight, 1};
-
-        constexpr uint32_t kBytesPerRow = kTextureBytesPerRowAlignment;
         wgpu::BufferDescriptor bufferDescriptor = {};
         bufferDescriptor.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
         bufferDescriptor.size =
-            bufferCopyOffset + BufferSizeForTextureCopy(copyWidth, copyHeight, 1,
-                                                        GetParam().mTextureFormat,
-                                                        wgpu::TextureAspect::DepthOnly);
+            bufferCopyOffset +
+            BufferSizeForTextureCopy(copyWidth, copyHeight, textureArrayLayerCount,
+                                     GetParam().mTextureFormat, wgpu::TextureAspect::DepthOnly);
         wgpu::Buffer destinationBuffer = device.CreateBuffer(&bufferDescriptor);
 
+        DoCopyFromDepthTestWithBuffer(destinationBuffer, bufferCopyOffset, initDepth, textureWidth,
+                                      textureHeight, textureArrayLayerCount, testLevel, true);
+    }
+
+    void DoCopyFromDepthTestWithBuffer(wgpu::Buffer destinationBuffer,
+                                       uint32_t bufferCopyOffset,
+                                       float initDepth,
+                                       uint32_t textureWidth,
+                                       uint32_t textureHeight,
+                                       uint32_t textureArrayLayerCount,
+                                       uint32_t testLevel,
+                                       bool checkBufferContent) {
+        // TODO(crbug.com/dawn/1291): Compute emulation path fails for Angle on Windows.
+        DAWN_SUPPRESS_TEST_IF(IsANGLE() && IsWindows());
+
+        uint32_t mipLevelCount = testLevel + 1;
+        wgpu::Texture texture =
+            CreateTexture(textureWidth, textureHeight,
+                          wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopySrc,
+                          mipLevelCount, textureArrayLayerCount);
+
+        // Initialize other mip levels with different init values for better testing
+        constexpr float garbageDepth = 0.123456789f;
+        ASSERT(initDepth != garbageDepth);
+
+        for (uint32_t level = 0; level < mipLevelCount; level++) {
+            float regionDepth = (level == testLevel) ? initDepth : garbageDepth;
+            InitializeDepthStencilTextureRegion(texture, kClearDepth, regionDepth, 0, 0, level, 0,
+                                                textureArrayLayerCount);
+        }
+
+        uint32_t copyWidth = textureWidth >> testLevel;
+        uint32_t copyHeight = textureHeight >> testLevel;
+        wgpu::Extent3D copySize = {copyWidth, copyHeight, textureArrayLayerCount};
+
+        wgpu::TextureFormat format = GetParam().mTextureFormat;
+        constexpr wgpu::TextureAspect aspect = wgpu::TextureAspect::DepthOnly;
+        uint32_t bytesPerPixel = GetBytesPerPixel(format, aspect);
+        uint32_t bytesPerRow = Align(copyWidth * bytesPerPixel, kTextureBytesPerRowAlignment);
+        uint32_t bytesPerImage = bytesPerRow * copyHeight;
+
         wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
-        wgpu::ImageCopyTexture imageCopyTexture = utils::CreateImageCopyTexture(
-            texture, testLevel, {0, 0, 0}, wgpu::TextureAspect::DepthOnly);
+        wgpu::ImageCopyTexture imageCopyTexture =
+            utils::CreateImageCopyTexture(texture, testLevel, {0, 0, 0}, aspect);
         wgpu::ImageCopyBuffer imageCopyBuffer = utils::CreateImageCopyBuffer(
-            destinationBuffer, bufferCopyOffset, kBytesPerRow, copyHeight);
+            destinationBuffer, bufferCopyOffset, bytesPerRow, copyHeight);
         encoder.CopyTextureToBuffer(&imageCopyTexture, &imageCopyBuffer, &copySize);
         wgpu::CommandBuffer commandBuffer = encoder.Finish();
         queue.Submit(1, &commandBuffer);
 
-        if (GetParam().mTextureFormat == wgpu::TextureFormat::Depth16Unorm) {
-            uint16_t expected = FloatToUnorm<uint16_t>(initDepth);
-            std::vector<uint16_t> expectedData = {
-                0,        0,        0, 0,  //
-                0,        0,        0, 0,  //
-                expected, expected, 0, 0,  //
-                expected, expected, 0, 0,  //
-            };
+        if (checkBufferContent) {
+            // Expected data pattern is that initDepth value at bottom left corner, while other
+            // region is kClearDepth. Data of each layer is the same.
+            if (format == wgpu::TextureFormat::Depth16Unorm) {
+                uint16_t expected = FloatToUnorm<uint16_t>(initDepth);
+                uint16_t cleared = FloatToUnorm<uint16_t>(kClearDepth);
+                std::vector<uint16_t> expectedData(copyWidth * copyHeight, cleared);
+                for (uint32_t y = copyHeight / 2; y < copyHeight; y++) {
+                    auto rowStart = expectedData.data() + y * copyWidth;
+                    std::fill(rowStart, rowStart + copyWidth / 2, expected);
+                }
 
-            for (uint32_t y = 0; y < copyHeight; ++y) {
-                EXPECT_BUFFER_U16_RANGE_EQ(expectedData.data() + copyWidth * y, destinationBuffer,
-                                           bufferCopyOffset + y * kBytesPerRow, copyWidth);
-            }
+                for (uint32_t z = 0; z < textureArrayLayerCount; ++z) {
+                    uint32_t bufferOffsetPerArrayLayer = bytesPerImage * z;
+                    for (uint32_t y = 0; y < copyHeight; ++y) {
+                        EXPECT_BUFFER_U16_RANGE_EQ(
+                            expectedData.data() + copyWidth * y, destinationBuffer,
+                            bufferCopyOffset + bufferOffsetPerArrayLayer + y * bytesPerRow,
+                            copyWidth);
+                    }
+                }
+            } else {
+                std::vector<float> expectedData(copyWidth * copyHeight, kClearDepth);
+                for (uint32_t y = copyHeight / 2; y < copyHeight; y++) {
+                    auto rowStart = expectedData.data() + y * copyWidth;
+                    std::fill(rowStart, rowStart + copyWidth / 2, initDepth);
+                }
 
-        } else {
-            std::vector<float> expectedData = {
-                0.0,       0.0,       0.0, 0.0,  //
-                0.0,       0.0,       0.0, 0.0,  //
-                initDepth, initDepth, 0.0, 0.0,  //
-                initDepth, initDepth, 0.0, 0.0,  //
-            };
-
-            for (uint32_t y = 0; y < copyHeight; ++y) {
-                EXPECT_BUFFER_FLOAT_RANGE_EQ(expectedData.data() + copyWidth * y, destinationBuffer,
-                                             bufferCopyOffset + y * kBytesPerRow, copyWidth);
+                for (uint32_t z = 0; z < textureArrayLayerCount; ++z) {
+                    uint32_t bufferOffsetPerArrayLayer = bytesPerImage * z;
+                    for (uint32_t y = 0; y < copyHeight; ++y) {
+                        EXPECT_BUFFER_FLOAT_RANGE_EQ(
+                            expectedData.data() + copyWidth * y, destinationBuffer,
+                            bufferCopyOffset + bufferOffsetPerArrayLayer + y * bytesPerRow,
+                            copyWidth);
+                    }
+                }
             }
         }
     }
@@ -518,34 +597,185 @@ class DepthCopyTests : public DepthStencilCopyTests {
 
 // Test copying the depth-only aspect into a buffer.
 TEST_P(DepthCopyTests, FromDepthAspect) {
-    constexpr float kInitDepth = 0.2f;
     constexpr uint32_t kBufferCopyOffset = 0;
-    constexpr uint32_t kWidth = 4;
-    constexpr uint32_t kHeight = 4;
     constexpr uint32_t kTestLevel = 0;
-    DoCopyFromDepthTest(kBufferCopyOffset, kInitDepth, kWidth, kHeight, kTestLevel);
+    constexpr uint32_t kTestTextureSizes[][2] = {
+        // Original test parameter
+        {4, 4},
+        // Only 1 pixel at bottom left has value, test compute emulation path for unorm 16
+        {2, 2},
+        // Odd number texture width to test compute emulation path for unorm 16
+        {3, 3},
+        // float 32 needs bytesPerRow alignment
+        {65, 1},
+        // unorm 16 and float 32 need bytesPerRow alignment
+        {129, 1},
+    };
+    constexpr uint32_t kTestTextureArrayLayerCounts[] = {
+        1,
+        2,
+    };
+
+    for (const uint32_t sizeZ : kTestTextureArrayLayerCounts) {
+        for (const auto& size : kTestTextureSizes) {
+            DoCopyFromDepthTest(kBufferCopyOffset, kInitDepth, size[0], size[1], sizeZ, kTestLevel);
+        }
+    }
 }
 
 // Test copying the depth-only aspect into a buffer at a non-zero offset.
 TEST_P(DepthCopyTests, FromDepthAspectToBufferAtNonZeroOffset) {
-    constexpr float kInitDepth = 0.2f;
-    constexpr uint32_t kWidth = 4;
-    constexpr uint32_t kHeight = 4;
     constexpr uint32_t kTestLevel = 0;
-    constexpr std::array<uint32_t, 2> kBufferCopyOffsets = {4u, 512u};
-    for (uint32_t offset : kBufferCopyOffsets) {
-        DoCopyFromDepthTest(offset, kInitDepth, kWidth, kHeight, kTestLevel);
+    constexpr uint32_t kBufferCopyOffsets[] = {4u, 512u};
+    constexpr uint32_t kTestTextureSizes[][2] = {
+        // Original test parameter
+        {4, 4},
+        // Only 1 pixel at bottom left has value, test compute emulation path for unorm 16
+        {2, 2},
+        // Odd number texture width to test compute emulation path for unorm 16
+        {3, 3},
+        // float 32 needs bytesPerRow alignment
+        {65, 1},
+        // unorm 16 and float 32 need bytesPerRow alignment
+        {129, 1},
+    };
+    constexpr uint32_t kTestTextureArrayLayerCounts[] = {
+        1,
+        2,
+    };
+    for (const uint32_t sizeZ : kTestTextureArrayLayerCounts) {
+        for (uint32_t offset : kBufferCopyOffsets) {
+            for (const auto& size : kTestTextureSizes) {
+                DoCopyFromDepthTest(offset, kInitDepth, size[0], size[1], sizeZ, kTestLevel);
+            }
+        }
     }
 }
 
 // Test copying the non-zero mip, depth-only aspect into a buffer.
 TEST_P(DepthCopyTests, FromNonZeroMipDepthAspect) {
-    constexpr float kInitDepth = 0.2f;
     constexpr uint32_t kBufferCopyOffset = 0;
     constexpr uint32_t kWidth = 9;
     constexpr uint32_t kHeight = 9;
-    constexpr uint32_t kTestLevel = 1;
-    DoCopyFromDepthTest(kBufferCopyOffset, kInitDepth, kWidth, kHeight, kTestLevel);
+    DoCopyFromDepthTest(kBufferCopyOffset, kInitDepth, kWidth, kHeight, 1, 1);
+    DoCopyFromDepthTest(kBufferCopyOffset, kInitDepth, kWidth, kHeight, 2, 2);
+}
+
+// Test buffer content outside of copy extent is preserved.
+// This test is made specifially for compute blit for depth16unorm emulation path.
+// The texel size is 2 byte, while in the compute shader we have to write 4 byte at a time.
+// When the copy extent width is an odd number, buffer content outside of the copy range is
+// inevitably written. So we need to make sure the original content of the buffer that's outside of
+// the copy extent is still correctly preserved.
+TEST_P(DepthCopyTests, PreserveBufferContent) {
+    constexpr uint32_t kBufferCopyOffsets[] = {0u, 4u, 512u};
+    constexpr uint32_t kTestTextureSizes[][2] = {
+        {1, 1},
+        {1, 2},
+        {3, 3},
+    };
+    constexpr uint32_t kExtraBufferSize[] = {0u, 4u};
+    const uint32_t kSizeZ = 1;
+    constexpr uint32_t kTestLevel = 0;
+
+    wgpu::TextureFormat format = GetParam().mTextureFormat;
+    constexpr wgpu::TextureAspect aspect = wgpu::TextureAspect::DepthOnly;
+
+    for (uint32_t extraBufferSize : kExtraBufferSize) {
+        for (uint32_t offset : kBufferCopyOffsets) {
+            for (const auto& size : kTestTextureSizes) {
+                wgpu::BufferDescriptor bufferDescriptor = {};
+                // Add wgpu::BufferUsage::MapRead to check the buffer content with mapAsync
+                bufferDescriptor.usage = wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead;
+                bufferDescriptor.size =
+                    extraBufferSize + offset +
+                    BufferSizeForTextureCopy(size[0], size[1], kSizeZ, format, aspect);
+                bufferDescriptor.mappedAtCreation = true;
+                wgpu::Buffer buffer = device.CreateBuffer(&bufferDescriptor);
+                constexpr uint8_t kOriginalValue = 0xff;
+                {
+                    // Fill the buffer with an original value other than 0 to check they are
+                    // incorrectly overwritten outside of the copy range.
+                    uint8_t* ptr = static_cast<uint8_t*>(buffer.GetMappedRange());
+                    std::fill(ptr, ptr + bufferDescriptor.size, kOriginalValue);
+                    buffer.Unmap();
+                }
+
+                // Don't check copy region content because the buffer doesn't have
+                // wgpu::BufferUsage::CopySrc usage.
+                DoCopyFromDepthTestWithBuffer(buffer, offset, kInitDepth, size[0], size[1], kSizeZ,
+                                              kTestLevel, false);
+
+                std::vector<uint8_t> expected(bufferDescriptor.size, kOriginalValue);
+                // Get the offset of the end of the copy range (without aligning with 4 bytes)
+                uint32_t bufferEndOffset =
+                    offset +
+                    BufferSizeForTextureCopy(size[0], size[1], kSizeZ, format, aspect, false);
+                if (bufferDescriptor.size > bufferEndOffset) {
+                    // Cannot use EXPECT_BUFFER_* helper here because it needs to align the copy
+                    // size to a multiple of 4 bytes to call CopyBufferToBuffer. We are checking
+                    // against format like Depth16Unorm so we need offset to be multiple of 2.
+                    MapAsyncAndWait(buffer, wgpu::MapMode::Read, 0, wgpu::kWholeMapSize);
+                    const uint8_t* ptr = static_cast<const uint8_t*>(buffer.GetConstMappedRange());
+
+                    // Check the content before copy range.
+                    for (uint32_t i = 0; i < offset; i++) {
+                        EXPECT_EQ(ptr[i], kOriginalValue);
+                    }
+
+                    // Check the content after copy range.
+                    uint32_t checkSize = bufferDescriptor.size - bufferEndOffset;
+                    for (uint32_t i = 0; i < checkSize; i++) {
+                        EXPECT_EQ(ptr[bufferEndOffset + i], kOriginalValue);
+                    }
+                    buffer.Unmap();
+                }
+            }
+        }
+    }
+}
+
+// Test compact buffer size edge case.
+// This test is made specifially for compute blit for depth16unorm emulation path.
+// When format is depth16unorm and width is an odd number, the size of the most compact buffer copy
+// target can be something that's not a multiple of 4. We need to make sure access don't go out of
+// bounds in the shader, when still writing to array<u32> in the compute shader.
+TEST_P(DepthCopyTests, BufferCopySizeEdgeCase) {
+    constexpr uint32_t kBufferCopyOffsets[] = {0u, 4u, 512u};
+    constexpr uint32_t kTestTextureSizes[][2] = {
+        // Cannot create compact copy buffer for {1, 1} here as storage buffer binding requires size
+        // of at least 4 bytes.
+        {3, 1},
+        {1, 2},
+        {3, 3},
+    };
+    const uint32_t kSizeZ = 1;
+    constexpr uint32_t kTestLevel = 0;
+
+    wgpu::TextureFormat format = GetParam().mTextureFormat;
+    constexpr wgpu::TextureAspect aspect = wgpu::TextureAspect::DepthOnly;
+
+    for (uint32_t offset : kBufferCopyOffsets) {
+        for (const auto& size : kTestTextureSizes) {
+            wgpu::BufferDescriptor bufferDescriptor = {};
+            // Add wgpu::BufferUsage::MapRead to check the buffer content with mapAsync
+            bufferDescriptor.usage = wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead;
+            bufferDescriptor.size =
+                offset +
+                // Don't align for 4 bytes to get smallest possible buffer for depth16unorm.
+                BufferSizeForTextureCopy(size[0], size[1], kSizeZ, format, aspect, false);
+            wgpu::Buffer buffer = device.CreateBuffer(&bufferDescriptor);
+
+            // Don't check copy region content because the buffer doesn't have
+            // wgpu::BufferUsage::CopySrc usage.
+            DoCopyFromDepthTestWithBuffer(buffer, offset, kInitDepth, size[0], size[1], kSizeZ,
+                                          kTestLevel, false);
+
+            // Unable to check the result since either MapAsync and CopyBufferToBuffer requires size
+            // to be multiple of 4 bytes.
+            // Just run and don't crash on ASSERT.
+        }
+    }
 }
 
 class DepthCopyFromBufferTests : public DepthStencilCopyTests {
@@ -582,8 +812,6 @@ class DepthCopyFromBufferTests : public DepthStencilCopyTests {
             destTexture, 0, {0, 0, 0}, wgpu::TextureAspect::DepthOnly);
         wgpu::Extent3D extent = {kWidth, kHeight, 1};
 
-        constexpr float kInitDepth = 0.2f;
-
         // This expectation is the test as it performs the CopyTextureToBuffer.
         if (GetParam().mTextureFormat == wgpu::TextureFormat::Depth16Unorm) {
             uint16_t expected = FloatToUnorm<uint16_t>(kInitDepth);