Initialize padding on buffer allocations
Buffer allocations in Dawn may be padded. This padding could be visible, depending on how the backend implements robust vertex buffer access. This commit updates buffer creation to clear all padding bytes immediately after creation. It is not counted as a lazy clear. And, add a test which reads off the end of a padded vertex buffer to check that the padding bytes are also initialized. Also: Update Metal buffers to always allocate enough space for Tint's vertex pulling, and pass the full allocated size to Tint. While writing the test in this commit, a bug was found where Tint assumes there is at least 4 bytes in the vertex pulling buffer. The WebGPU API currently allows zero-sized bindings, so Dawn needs to always allocate enough space for one element. Also: Update Vulkan vertex/index buffers to allocate at least one more byte than requested because vkCmdSetVertexBuffers and vkCmdSetIndexBuffer disallow the offset to be equal to the buffer size. We need at least one more byte to make zero-sized bindings at the end of the buffer valid. Lastly: Update helper so that a T2B copy of a single row that touches all bytes in a buffer updates the buffer state to be fully initialized. This is a small optimization that is necessary to write the test. Bug: dawn:837 Change-Id: I829f4764509c4ec784c5aeaaf40b6dcbd4be6866 Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/62161 Commit-Queue: Austin Eng <enga@chromium.org> Reviewed-by: Kai Ninomiya <kainino@chromium.org> Reviewed-by: Corentin Wallez <cwallez@chromium.org>
This commit is contained in:
parent
663a16e2fe
commit
ddd0a0a856
|
@ -17,6 +17,7 @@
|
|||
#include "common/BitSetIterator.h"
|
||||
#include "dawn_native/Buffer.h"
|
||||
#include "dawn_native/CommandEncoder.h"
|
||||
#include "dawn_native/CommandValidation.h"
|
||||
#include "dawn_native/Commands.h"
|
||||
#include "dawn_native/Format.h"
|
||||
#include "dawn_native/Texture.h"
|
||||
|
@ -171,27 +172,37 @@ namespace dawn_native {
|
|||
ASSERT(copy != nullptr);
|
||||
|
||||
if (copy->destination.offset > 0) {
|
||||
// The copy doesn't touch the start of the buffer.
|
||||
return false;
|
||||
}
|
||||
|
||||
const TextureBase* texture = copy->source.texture.Get();
|
||||
const TexelBlockInfo& blockInfo =
|
||||
texture->GetFormat().GetAspectInfo(copy->source.aspect).block;
|
||||
const uint64_t widthInBlocks = copy->copySize.width / blockInfo.width;
|
||||
const uint64_t heightInBlocks = copy->copySize.height / blockInfo.height;
|
||||
const bool multiSlice = copy->copySize.depthOrArrayLayers > 1;
|
||||
const bool multiRow = multiSlice || heightInBlocks > 1;
|
||||
|
||||
if (copy->destination.rowsPerImage > heightInBlocks) {
|
||||
if (multiSlice && copy->destination.rowsPerImage > heightInBlocks) {
|
||||
// There are gaps between slices that aren't overwritten
|
||||
return false;
|
||||
}
|
||||
|
||||
const uint64_t copyTextureDataSizePerRow =
|
||||
copy->copySize.width / blockInfo.width * blockInfo.byteSize;
|
||||
if (copy->destination.bytesPerRow > copyTextureDataSizePerRow) {
|
||||
const uint64_t copyTextureDataSizePerRow = widthInBlocks * blockInfo.byteSize;
|
||||
if (multiRow && copy->destination.bytesPerRow > copyTextureDataSizePerRow) {
|
||||
// There are gaps between rows that aren't overwritten
|
||||
return false;
|
||||
}
|
||||
|
||||
const uint64_t overwrittenRangeSize =
|
||||
copyTextureDataSizePerRow * heightInBlocks * copy->copySize.depthOrArrayLayers;
|
||||
if (copy->destination.buffer->GetSize() > overwrittenRangeSize) {
|
||||
// After the above checks, we're sure the copy has no gaps.
|
||||
// Now, compute the total number of bytes written.
|
||||
const uint64_t writtenBytes =
|
||||
ComputeRequiredBytesInCopy(blockInfo, copy->copySize, copy->destination.bytesPerRow,
|
||||
copy->destination.rowsPerImage)
|
||||
.AcquireSuccess();
|
||||
if (!copy->destination.buffer->IsFullBufferRange(copy->destination.offset, writtenBytes)) {
|
||||
// The written bytes don't cover the whole buffer.
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
@ -168,6 +168,21 @@ namespace dawn_native { namespace d3d12 {
|
|||
DAWN_TRY(ClearBuffer(commandRecordingContext, uint8_t(1u)));
|
||||
}
|
||||
|
||||
// Initialize the padding bytes to zero.
|
||||
if (GetDevice()->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse) &&
|
||||
!mappedAtCreation) {
|
||||
uint32_t paddingBytes = GetAllocatedSize() - GetSize();
|
||||
if (paddingBytes > 0) {
|
||||
CommandRecordingContext* commandRecordingContext;
|
||||
DAWN_TRY_ASSIGN(commandRecordingContext,
|
||||
ToBackend(GetDevice())->GetPendingCommandContext());
|
||||
|
||||
uint32_t clearSize = paddingBytes;
|
||||
uint64_t clearOffset = GetSize();
|
||||
DAWN_TRY(ClearBuffer(commandRecordingContext, 0, clearOffset, clearSize));
|
||||
}
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
|
@ -444,29 +459,33 @@ namespace dawn_native { namespace d3d12 {
|
|||
return {};
|
||||
}
|
||||
|
||||
MaybeError Buffer::ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue) {
|
||||
MaybeError Buffer::ClearBuffer(CommandRecordingContext* commandContext,
|
||||
uint8_t clearValue,
|
||||
uint64_t offset,
|
||||
uint64_t size) {
|
||||
Device* device = ToBackend(GetDevice());
|
||||
size = size > 0 ? size : GetAllocatedSize();
|
||||
|
||||
// The state of the buffers on UPLOAD heap must always be GENERIC_READ and cannot be
|
||||
// changed away, so we can only clear such buffer with buffer mapping.
|
||||
if (D3D12HeapType(GetUsage()) == D3D12_HEAP_TYPE_UPLOAD) {
|
||||
DAWN_TRY(MapInternal(true, 0, size_t(GetAllocatedSize()), "D3D12 map at clear buffer"));
|
||||
memset(mMappedData, clearValue, GetAllocatedSize());
|
||||
DAWN_TRY(MapInternal(true, static_cast<size_t>(offset), static_cast<size_t>(size),
|
||||
"D3D12 map at clear buffer"));
|
||||
memset(mMappedData, clearValue, size);
|
||||
UnmapImpl();
|
||||
} else {
|
||||
// TODO(crbug.com/dawn/852): use ClearUnorderedAccessView*() when the buffer usage
|
||||
// includes STORAGE.
|
||||
DynamicUploader* uploader = device->GetDynamicUploader();
|
||||
UploadHandle uploadHandle;
|
||||
DAWN_TRY_ASSIGN(uploadHandle, uploader->Allocate(GetAllocatedSize(),
|
||||
device->GetPendingCommandSerial(),
|
||||
DAWN_TRY_ASSIGN(uploadHandle,
|
||||
uploader->Allocate(size, device->GetPendingCommandSerial(),
|
||||
kCopyBufferToBufferOffsetAlignment));
|
||||
|
||||
memset(uploadHandle.mappedBuffer, clearValue, GetAllocatedSize());
|
||||
memset(uploadHandle.mappedBuffer, clearValue, size);
|
||||
|
||||
device->CopyFromStagingToBufferImpl(commandContext, uploadHandle.stagingBuffer,
|
||||
uploadHandle.startOffset, this, 0,
|
||||
GetAllocatedSize());
|
||||
uploadHandle.startOffset, this, offset, size);
|
||||
}
|
||||
|
||||
return {};
|
||||
|
|
|
@ -68,7 +68,10 @@ namespace dawn_native { namespace d3d12 {
|
|||
wgpu::BufferUsage newUsage);
|
||||
|
||||
MaybeError InitializeToZero(CommandRecordingContext* commandContext);
|
||||
MaybeError ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue);
|
||||
MaybeError ClearBuffer(CommandRecordingContext* commandContext,
|
||||
uint8_t clearValue,
|
||||
uint64_t offset = 0,
|
||||
uint64_t size = 0);
|
||||
|
||||
ResourceHeapAllocation mResourceAllocation;
|
||||
bool mFixedResourceState = false;
|
||||
|
|
|
@ -52,7 +52,10 @@ namespace dawn_native { namespace metal {
|
|||
MaybeError MapAtCreationImpl() override;
|
||||
|
||||
void InitializeToZero(CommandRecordingContext* commandContext);
|
||||
void ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue);
|
||||
void ClearBuffer(CommandRecordingContext* commandContext,
|
||||
uint8_t clearValue,
|
||||
uint64_t offset = 0,
|
||||
uint64_t size = 0);
|
||||
|
||||
NSPRef<id<MTLBuffer>> mMtlBuffer;
|
||||
};
|
||||
|
|
|
@ -45,10 +45,6 @@ namespace dawn_native { namespace metal {
|
|||
storageMode = MTLResourceStorageModePrivate;
|
||||
}
|
||||
|
||||
if (GetSize() > std::numeric_limits<NSUInteger>::max()) {
|
||||
return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
|
||||
}
|
||||
|
||||
uint32_t alignment = 1;
|
||||
#ifdef DAWN_PLATFORM_MACOS
|
||||
// [MTLBlitCommandEncoder fillBuffer] requires the size to be a multiple of 4 on MacOS.
|
||||
|
@ -64,13 +60,25 @@ namespace dawn_native { namespace metal {
|
|||
alignment = kMinUniformOrStorageBufferAlignment;
|
||||
}
|
||||
|
||||
// Allocate at least 4 bytes so clamped accesses are always in bounds.
|
||||
NSUInteger currentSize = static_cast<NSUInteger>(std::max(GetSize(), uint64_t(4u)));
|
||||
// The vertex pulling transform requires at least 4 bytes in the buffer.
|
||||
// 0-sized vertex buffer bindings are allowed, so we always need an additional 4 bytes
|
||||
// after the end.
|
||||
NSUInteger extraBytes = 0u;
|
||||
if ((GetUsage() & wgpu::BufferUsage::Vertex) != 0) {
|
||||
extraBytes = 4u;
|
||||
}
|
||||
|
||||
if (GetSize() > std::numeric_limits<NSUInteger>::max() - extraBytes) {
|
||||
return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
|
||||
}
|
||||
NSUInteger currentSize =
|
||||
std::max(static_cast<NSUInteger>(GetSize()) + extraBytes, NSUInteger(4));
|
||||
|
||||
if (currentSize > std::numeric_limits<NSUInteger>::max() - alignment) {
|
||||
// Alignment would overflow.
|
||||
return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
|
||||
}
|
||||
currentSize = Align(currentSize, kMinUniformOrStorageBufferAlignment);
|
||||
currentSize = Align(currentSize, alignment);
|
||||
|
||||
if (@available(iOS 12, macOS 10.14, *)) {
|
||||
NSUInteger maxBufferSize = [ToBackend(GetDevice())->GetMTLDevice() maxBufferLength];
|
||||
|
@ -109,6 +117,19 @@ namespace dawn_native { namespace metal {
|
|||
ClearBuffer(commandContext, uint8_t(1u));
|
||||
}
|
||||
|
||||
// Initialize the padding bytes to zero.
|
||||
if (GetDevice()->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse) &&
|
||||
!mappedAtCreation) {
|
||||
uint32_t paddingBytes = GetAllocatedSize() - GetSize();
|
||||
if (paddingBytes > 0) {
|
||||
uint32_t clearSize = Align(paddingBytes, 4);
|
||||
uint64_t clearOffset = GetAllocatedSize() - clearSize;
|
||||
|
||||
CommandRecordingContext* commandContext =
|
||||
ToBackend(GetDevice())->GetPendingCommandContext();
|
||||
ClearBuffer(commandContext, 0, clearOffset, clearSize);
|
||||
}
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
|
@ -197,11 +218,15 @@ namespace dawn_native { namespace metal {
|
|||
GetDevice()->IncrementLazyClearCountForTesting();
|
||||
}
|
||||
|
||||
void Buffer::ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue) {
|
||||
void Buffer::ClearBuffer(CommandRecordingContext* commandContext,
|
||||
uint8_t clearValue,
|
||||
uint64_t offset,
|
||||
uint64_t size) {
|
||||
ASSERT(commandContext != nullptr);
|
||||
ASSERT(GetAllocatedSize() > 0);
|
||||
size = size > 0 ? size : GetAllocatedSize();
|
||||
ASSERT(size > 0);
|
||||
[commandContext->EnsureBlit() fillBuffer:mMtlBuffer.Get()
|
||||
range:NSMakeRange(0, GetAllocatedSize())
|
||||
range:NSMakeRange(offset, size)
|
||||
value:clearValue];
|
||||
}
|
||||
|
||||
|
|
|
@ -548,7 +548,8 @@ namespace dawn_native { namespace metal {
|
|||
mVertexBufferOffsets[slot] = offset;
|
||||
|
||||
ASSERT(buffer->GetSize() < std::numeric_limits<uint32_t>::max());
|
||||
mVertexBufferBindingSizes[slot] = static_cast<uint32_t>(buffer->GetSize() - offset);
|
||||
mVertexBufferBindingSizes[slot] =
|
||||
static_cast<uint32_t>(buffer->GetAllocatedSize() - offset);
|
||||
mDirtyVertexBuffers.set(slot);
|
||||
}
|
||||
|
||||
|
|
|
@ -49,6 +49,7 @@ namespace dawn_native { namespace opengl {
|
|||
device->gl.BufferData(GL_ARRAY_BUFFER, mAllocatedSize, clearValues.data(),
|
||||
GL_STATIC_DRAW);
|
||||
} else {
|
||||
// Buffers start zeroed if you pass nullptr to glBufferData.
|
||||
device->gl.BufferData(GL_ARRAY_BUFFER, mAllocatedSize, nullptr, GL_STATIC_DRAW);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -137,16 +137,34 @@ namespace dawn_native { namespace vulkan {
|
|||
}
|
||||
|
||||
MaybeError Buffer::Initialize(bool mappedAtCreation) {
|
||||
// vkCmdFillBuffer requires the size to be a multiple of 4.
|
||||
constexpr size_t kAlignment = 4u;
|
||||
|
||||
uint32_t extraBytes = 0u;
|
||||
if (GetUsage() & (wgpu::BufferUsage::Vertex | wgpu::BufferUsage::Index)) {
|
||||
// vkCmdSetIndexBuffer and vkCmdSetVertexBuffer are invalid if the offset
|
||||
// is equal to the whole buffer size. Allocate at least one more byte so it
|
||||
// is valid to setVertex/IndexBuffer with a zero-sized range at the end
|
||||
// of the buffer with (offset=buffer.size, size=0).
|
||||
extraBytes = 1u;
|
||||
}
|
||||
|
||||
uint64_t size = GetSize();
|
||||
if (size > std::numeric_limits<uint64_t>::max() - extraBytes) {
|
||||
return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
|
||||
}
|
||||
|
||||
size += extraBytes;
|
||||
|
||||
// Allocate at least 4 bytes so clamped accesses are always in bounds.
|
||||
// Also, Vulkan requires the size to be non-zero.
|
||||
uint64_t size = std::max(GetSize(), uint64_t(4u));
|
||||
// vkCmdFillBuffer requires the size to be a multiple of 4.
|
||||
size_t alignment = 4u;
|
||||
if (size > std::numeric_limits<uint64_t>::max() - alignment) {
|
||||
size = std::max(size, uint64_t(4u));
|
||||
|
||||
if (size > std::numeric_limits<uint64_t>::max() - kAlignment) {
|
||||
// Alignment would overflow.
|
||||
return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
|
||||
}
|
||||
mAllocatedSize = Align(size, alignment);
|
||||
mAllocatedSize = Align(size, kAlignment);
|
||||
|
||||
// Avoid passing ludicrously large sizes to drivers because it causes issues: drivers add
|
||||
// some constants to the size passed and align it, but for values close to the maximum
|
||||
|
@ -200,6 +218,17 @@ namespace dawn_native { namespace vulkan {
|
|||
ClearBuffer(device->GetPendingRecordingContext(), 0x01010101);
|
||||
}
|
||||
|
||||
// Initialize the padding bytes to zero.
|
||||
if (device->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse) && !mappedAtCreation) {
|
||||
uint32_t paddingBytes = GetAllocatedSize() - GetSize();
|
||||
if (paddingBytes > 0) {
|
||||
uint32_t clearSize = Align(paddingBytes, 4);
|
||||
uint64_t clearOffset = GetAllocatedSize() - clearSize;
|
||||
|
||||
CommandRecordingContext* recordingContext = device->GetPendingRecordingContext();
|
||||
ClearBuffer(recordingContext, 0, clearOffset, clearSize);
|
||||
}
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
|
@ -354,17 +383,21 @@ namespace dawn_native { namespace vulkan {
|
|||
SetIsDataInitialized();
|
||||
}
|
||||
|
||||
void Buffer::ClearBuffer(CommandRecordingContext* recordingContext, uint32_t clearValue) {
|
||||
void Buffer::ClearBuffer(CommandRecordingContext* recordingContext,
|
||||
uint32_t clearValue,
|
||||
uint64_t offset,
|
||||
uint64_t size) {
|
||||
ASSERT(recordingContext != nullptr);
|
||||
ASSERT(GetAllocatedSize() > 0);
|
||||
size = size > 0 ? size : GetAllocatedSize();
|
||||
ASSERT(size > 0);
|
||||
|
||||
TransitionUsageNow(recordingContext, wgpu::BufferUsage::CopyDst);
|
||||
|
||||
Device* device = ToBackend(GetDevice());
|
||||
// VK_WHOLE_SIZE doesn't work on old Windows Intel Vulkan drivers, so we don't use it.
|
||||
// Note: Allocated size must be a multiple of 4.
|
||||
ASSERT(GetAllocatedSize() % 4 == 0);
|
||||
device->fn.CmdFillBuffer(recordingContext->commandBuffer, mHandle, 0, GetAllocatedSize(),
|
||||
ASSERT(size % 4 == 0);
|
||||
device->fn.CmdFillBuffer(recordingContext->commandBuffer, mHandle, offset, size,
|
||||
clearValue);
|
||||
}
|
||||
}} // namespace dawn_native::vulkan
|
||||
|
|
|
@ -55,7 +55,10 @@ namespace dawn_native { namespace vulkan {
|
|||
|
||||
MaybeError Initialize(bool mappedAtCreation);
|
||||
void InitializeToZero(CommandRecordingContext* recordingContext);
|
||||
void ClearBuffer(CommandRecordingContext* recordingContext, uint32_t clearValue);
|
||||
void ClearBuffer(CommandRecordingContext* recordingContext,
|
||||
uint32_t clearValue,
|
||||
uint64_t offset = 0,
|
||||
uint64_t size = 0);
|
||||
|
||||
MaybeError MapAsyncImpl(wgpu::MapMode mode, size_t offset, size_t size) override;
|
||||
void UnmapImpl() override;
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
|
||||
#include "tests/DawnTest.h"
|
||||
|
||||
#include "common/Math.h"
|
||||
#include "utils/ComboRenderPipelineDescriptor.h"
|
||||
#include "utils/TestUtils.h"
|
||||
#include "utils/WGPUHelpers.h"
|
||||
|
@ -201,8 +202,10 @@ class BufferZeroInitTest : public DawnTest {
|
|||
EXPECT_PIXEL_RGBA8_EQ(kExpectedColor, outputTexture, 0u, 0u);
|
||||
}
|
||||
|
||||
wgpu::RenderPipeline CreateRenderPipelineForTest(const char* vertexShader,
|
||||
uint32_t vertexBufferCount = 1u) {
|
||||
wgpu::RenderPipeline CreateRenderPipelineForTest(
|
||||
const char* vertexShader,
|
||||
uint32_t vertexBufferCount = 1u,
|
||||
wgpu::VertexFormat vertexFormat = wgpu::VertexFormat::Float32x4) {
|
||||
constexpr wgpu::TextureFormat kColorAttachmentFormat = wgpu::TextureFormat::RGBA8Unorm;
|
||||
|
||||
wgpu::ShaderModule vsModule = utils::CreateShaderModule(device, vertexShader);
|
||||
|
@ -219,9 +222,9 @@ class BufferZeroInitTest : public DawnTest {
|
|||
descriptor.cFragment.module = fsModule;
|
||||
descriptor.primitive.topology = wgpu::PrimitiveTopology::PointList;
|
||||
descriptor.vertex.bufferCount = vertexBufferCount;
|
||||
descriptor.cBuffers[0].arrayStride = 4 * sizeof(float);
|
||||
descriptor.cBuffers[0].arrayStride = Align(utils::VertexFormatSize(vertexFormat), 4);
|
||||
descriptor.cBuffers[0].attributeCount = 1;
|
||||
descriptor.cAttributes[0].format = wgpu::VertexFormat::Float32x4;
|
||||
descriptor.cAttributes[0].format = vertexFormat;
|
||||
descriptor.cTargets[0].format = kColorAttachmentFormat;
|
||||
return device.CreateRenderPipeline(&descriptor);
|
||||
}
|
||||
|
@ -1128,6 +1131,114 @@ TEST_P(BufferZeroInitTest, SetVertexBuffer) {
|
|||
}
|
||||
}
|
||||
|
||||
// Test for crbug.com/dawn/837.
|
||||
// Test that the padding after a buffer allocation is initialized to 0.
|
||||
// This test makes an unaligned vertex buffer which should be padded in the backend
|
||||
// allocation. It then tries to index off the end of the vertex buffer in an indexed
|
||||
// draw call. A backend which implements robust buffer access via clamping should
|
||||
// still see zeros at the end of the buffer.
|
||||
TEST_P(BufferZeroInitTest, PaddingInitialized) {
|
||||
DAWN_SUPPRESS_TEST_IF(IsANGLE()); // TODO(crbug.com/dawn/1084).
|
||||
|
||||
constexpr wgpu::TextureFormat kColorAttachmentFormat = wgpu::TextureFormat::RGBA8Unorm;
|
||||
// A small sub-4-byte format means a single vertex can fit entirely within the padded buffer,
|
||||
// touching some of the padding. Test a small format, as well as larger formats.
|
||||
for (wgpu::VertexFormat vertexFormat :
|
||||
{wgpu::VertexFormat::Unorm8x2, wgpu::VertexFormat::Float16x2,
|
||||
wgpu::VertexFormat::Float32x2}) {
|
||||
wgpu::RenderPipeline renderPipeline =
|
||||
CreateRenderPipelineForTest(R"(
|
||||
struct VertexOut {
|
||||
[[location(0)]] color : vec4<f32>;
|
||||
[[builtin(position)]] position : vec4<f32>;
|
||||
};
|
||||
|
||||
[[stage(vertex)]] fn main([[location(0)]] pos : vec2<f32>) -> VertexOut {
|
||||
var output : VertexOut;
|
||||
if (all(pos == vec2<f32>(0.0, 0.0))) {
|
||||
output.color = vec4<f32>(0.0, 1.0, 0.0, 1.0);
|
||||
} else {
|
||||
output.color = vec4<f32>(1.0, 0.0, 0.0, 1.0);
|
||||
}
|
||||
output.position = vec4<f32>(0.0, 0.0, 0.0, 1.0);
|
||||
return output;
|
||||
})",
|
||||
/* vertexBufferCount */ 1u, vertexFormat);
|
||||
|
||||
// Create an index buffer that indexes off the end of the vertex buffer.
|
||||
wgpu::Buffer indexBuffer =
|
||||
utils::CreateBufferFromData<uint32_t>(device, wgpu::BufferUsage::Index, {1});
|
||||
|
||||
const uint32_t vertexFormatSize = utils::VertexFormatSize(vertexFormat);
|
||||
|
||||
// Create an 8-bit texture to use to initialize buffer contents.
|
||||
wgpu::TextureDescriptor initTextureDesc = {};
|
||||
initTextureDesc.size = {vertexFormatSize + 4, 1, 1};
|
||||
initTextureDesc.format = wgpu::TextureFormat::R8Unorm;
|
||||
initTextureDesc.usage = wgpu::TextureUsage::CopySrc | wgpu::TextureUsage::CopyDst;
|
||||
wgpu::ImageCopyTexture zeroTextureSrc =
|
||||
utils::CreateImageCopyTexture(device.CreateTexture(&initTextureDesc), 0, {0, 0, 0});
|
||||
{
|
||||
wgpu::TextureDataLayout layout =
|
||||
utils::CreateTextureDataLayout(0, wgpu::kCopyStrideUndefined);
|
||||
std::vector<uint8_t> data(initTextureDesc.size.width);
|
||||
queue.WriteTexture(&zeroTextureSrc, data.data(), data.size(), &layout,
|
||||
&initTextureDesc.size);
|
||||
}
|
||||
|
||||
for (uint32_t extraBytes : {0, 1, 2, 3, 4}) {
|
||||
// Create a vertex buffer to hold a single vertex attribute.
|
||||
// Uniform usage is added to force even more padding on D3D12.
|
||||
// The buffer is internally padded and allocated as a larger buffer.
|
||||
const uint32_t vertexBufferSize = vertexFormatSize + extraBytes;
|
||||
for (uint32_t vertexBufferOffset = 0; vertexBufferOffset <= vertexBufferSize;
|
||||
vertexBufferOffset += 4u) {
|
||||
wgpu::Buffer vertexBuffer = CreateBuffer(
|
||||
vertexBufferSize, wgpu::BufferUsage::Vertex | wgpu::BufferUsage::Uniform |
|
||||
wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst);
|
||||
|
||||
// "Fully" initialize the buffer with a copy from an 8-bit texture, touching
|
||||
// everything except the padding. From the point-of-view of the API, all
|
||||
// |vertexBufferSize| bytes are initialized. Note: Uses CopyTextureToBuffer because
|
||||
// it does not require 4-byte alignment.
|
||||
{
|
||||
wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
|
||||
wgpu::ImageCopyBuffer dst =
|
||||
utils::CreateImageCopyBuffer(vertexBuffer, 0, wgpu::kCopyStrideUndefined);
|
||||
wgpu::Extent3D extent = {vertexBufferSize, 1, 1};
|
||||
encoder.CopyTextureToBuffer(&zeroTextureSrc, &dst, &extent);
|
||||
|
||||
wgpu::CommandBuffer commandBuffer = encoder.Finish();
|
||||
EXPECT_LAZY_CLEAR(0u, queue.Submit(1, &commandBuffer));
|
||||
}
|
||||
|
||||
wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
|
||||
|
||||
wgpu::Texture colorAttachment =
|
||||
CreateAndInitializeTexture({1, 1, 1}, kColorAttachmentFormat);
|
||||
utils::ComboRenderPassDescriptor renderPassDescriptor(
|
||||
{colorAttachment.CreateView()});
|
||||
|
||||
wgpu::RenderPassEncoder renderPass = encoder.BeginRenderPass(&renderPassDescriptor);
|
||||
|
||||
renderPass.SetVertexBuffer(0, vertexBuffer, vertexBufferOffset);
|
||||
renderPass.SetIndexBuffer(indexBuffer, wgpu::IndexFormat::Uint32);
|
||||
|
||||
renderPass.SetPipeline(renderPipeline);
|
||||
renderPass.DrawIndexed(1);
|
||||
renderPass.EndPass();
|
||||
|
||||
wgpu::CommandBuffer commandBuffer = encoder.Finish();
|
||||
|
||||
EXPECT_LAZY_CLEAR(0u, queue.Submit(1, &commandBuffer));
|
||||
|
||||
constexpr RGBA8 kExpectedPixelValue = {0, 255, 0, 255};
|
||||
EXPECT_PIXEL_RGBA8_EQ(kExpectedPixelValue, colorAttachment, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test the buffer will be lazily initialized correctly when its first use is in SetIndexBuffer.
|
||||
TEST_P(BufferZeroInitTest, SetIndexBuffer) {
|
||||
// Bind the whole buffer as an index buffer.
|
||||
|
|
|
@ -2060,6 +2060,84 @@ TEST_P(CompressedTextureZeroInitTest, HalfCopyTextureToTextureMipLevel) {
|
|||
kViewMipLevel, 0, true);
|
||||
}
|
||||
|
||||
// Test uploading then reading back from a 2D array compressed texture.
|
||||
// This is a regression test for a bug where the final destination buffer
|
||||
// was considered fully initialized even though there was a 256-byte
|
||||
// stride between images.
|
||||
TEST_P(CompressedTextureZeroInitTest, Copy2DArrayCompressedB2T2B) {
|
||||
// TODO(crbug.com/dawn/643): diagnose and fix this failure on OpenGL.
|
||||
DAWN_SUPPRESS_TEST_IF(IsOpenGL() || IsOpenGLES());
|
||||
|
||||
// create srcTexture with data
|
||||
wgpu::TextureDescriptor textureDescriptor = CreateTextureDescriptor(
|
||||
4, 5, wgpu::TextureUsage::CopySrc | wgpu::TextureUsage::CopyDst, utils::kBCFormats[0]);
|
||||
textureDescriptor.size = {8, 8, 5};
|
||||
wgpu::Texture srcTexture = device.CreateTexture(&textureDescriptor);
|
||||
|
||||
uint32_t mipLevel = 2;
|
||||
wgpu::Extent3D copyExtent3D = {4, 4, 5};
|
||||
|
||||
uint32_t copyWidthInBlock = copyExtent3D.width / kFormatBlockByteSize;
|
||||
uint32_t copyHeightInBlock = copyExtent3D.height / kFormatBlockByteSize;
|
||||
uint32_t copyRowsPerImage = copyHeightInBlock;
|
||||
uint32_t copyBytesPerRow =
|
||||
Align(copyWidthInBlock * utils::GetTexelBlockSizeInBytes(textureDescriptor.format),
|
||||
kTextureBytesPerRowAlignment);
|
||||
|
||||
// Generate data to upload
|
||||
std::vector<uint8_t> data(utils::RequiredBytesInCopy(copyBytesPerRow, copyRowsPerImage,
|
||||
copyExtent3D, textureDescriptor.format));
|
||||
for (size_t i = 0; i < data.size(); ++i) {
|
||||
data[i] = i % 255;
|
||||
}
|
||||
|
||||
// Copy texture data from a staging buffer to the destination texture.
|
||||
wgpu::Buffer stagingBuffer =
|
||||
utils::CreateBufferFromData(device, data.data(), data.size(), wgpu::BufferUsage::CopySrc);
|
||||
wgpu::ImageCopyBuffer imageCopyBufferSrc =
|
||||
utils::CreateImageCopyBuffer(stagingBuffer, 0, copyBytesPerRow, copyRowsPerImage);
|
||||
|
||||
wgpu::ImageCopyTexture imageCopyTexture =
|
||||
utils::CreateImageCopyTexture(srcTexture, mipLevel, {0, 0, 0});
|
||||
|
||||
{
|
||||
wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
|
||||
encoder.CopyBufferToTexture(&imageCopyBufferSrc, &imageCopyTexture, ©Extent3D);
|
||||
wgpu::CommandBuffer copy = encoder.Finish();
|
||||
EXPECT_LAZY_CLEAR(0u, queue.Submit(1, ©));
|
||||
}
|
||||
|
||||
// Create a buffer to read back the data. It is the same size as the upload buffer.
|
||||
wgpu::BufferDescriptor readbackDesc = {};
|
||||
readbackDesc.size = data.size();
|
||||
readbackDesc.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
|
||||
wgpu::Buffer readbackBuffer = device.CreateBuffer(&readbackDesc);
|
||||
|
||||
// Copy the texture to the readback buffer.
|
||||
wgpu::ImageCopyBuffer imageCopyBufferDst =
|
||||
utils::CreateImageCopyBuffer(readbackBuffer, 0, copyBytesPerRow, copyRowsPerImage);
|
||||
{
|
||||
wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
|
||||
encoder.CopyTextureToBuffer(&imageCopyTexture, &imageCopyBufferDst, ©Extent3D);
|
||||
wgpu::CommandBuffer copy = encoder.Finish();
|
||||
|
||||
// Expect a lazy clear because the padding in the copy is not touched.
|
||||
EXPECT_LAZY_CLEAR(1u, queue.Submit(1, ©));
|
||||
}
|
||||
|
||||
// Generate expected data. It is the same as the upload data, but padding is zero.
|
||||
std::vector<uint8_t> expected(data.size(), 0);
|
||||
for (uint32_t z = 0; z < copyExtent3D.depthOrArrayLayers; ++z) {
|
||||
for (uint32_t y = 0; y < copyHeightInBlock; ++y) {
|
||||
memcpy(&expected[copyBytesPerRow * y + copyBytesPerRow * copyRowsPerImage * z],
|
||||
&data[copyBytesPerRow * y + copyBytesPerRow * copyRowsPerImage * z],
|
||||
copyWidthInBlock * utils::GetTexelBlockSizeInBytes(textureDescriptor.format));
|
||||
}
|
||||
}
|
||||
// Check final contents
|
||||
EXPECT_BUFFER_U8_RANGE_EQ(expected.data(), readbackBuffer, 0, expected.size());
|
||||
}
|
||||
|
||||
DAWN_INSTANTIATE_TEST(CompressedTextureZeroInitTest,
|
||||
D3D12Backend({"nonzero_clear_resources_on_creation_for_testing"}),
|
||||
MetalBackend({"nonzero_clear_resources_on_creation_for_testing"}),
|
||||
|
|
|
@ -134,4 +134,47 @@ namespace utils {
|
|||
device.GetQueue().WriteTexture(&imageCopyTexture, data.data(), 1, &textureDataLayout,
|
||||
©Extent);
|
||||
}
|
||||
|
||||
uint32_t VertexFormatSize(wgpu::VertexFormat format) {
|
||||
switch (format) {
|
||||
case wgpu::VertexFormat::Uint8x2:
|
||||
case wgpu::VertexFormat::Sint8x2:
|
||||
case wgpu::VertexFormat::Unorm8x2:
|
||||
case wgpu::VertexFormat::Snorm8x2:
|
||||
return 2;
|
||||
case wgpu::VertexFormat::Uint8x4:
|
||||
case wgpu::VertexFormat::Sint8x4:
|
||||
case wgpu::VertexFormat::Unorm8x4:
|
||||
case wgpu::VertexFormat::Snorm8x4:
|
||||
case wgpu::VertexFormat::Uint16x2:
|
||||
case wgpu::VertexFormat::Sint16x2:
|
||||
case wgpu::VertexFormat::Unorm16x2:
|
||||
case wgpu::VertexFormat::Snorm16x2:
|
||||
case wgpu::VertexFormat::Float16x2:
|
||||
case wgpu::VertexFormat::Float32:
|
||||
case wgpu::VertexFormat::Uint32:
|
||||
case wgpu::VertexFormat::Sint32:
|
||||
return 4;
|
||||
case wgpu::VertexFormat::Uint16x4:
|
||||
case wgpu::VertexFormat::Sint16x4:
|
||||
case wgpu::VertexFormat::Unorm16x4:
|
||||
case wgpu::VertexFormat::Snorm16x4:
|
||||
case wgpu::VertexFormat::Float16x4:
|
||||
case wgpu::VertexFormat::Float32x2:
|
||||
case wgpu::VertexFormat::Uint32x2:
|
||||
case wgpu::VertexFormat::Sint32x2:
|
||||
return 8;
|
||||
case wgpu::VertexFormat::Float32x3:
|
||||
case wgpu::VertexFormat::Uint32x3:
|
||||
case wgpu::VertexFormat::Sint32x3:
|
||||
return 12;
|
||||
case wgpu::VertexFormat::Float32x4:
|
||||
case wgpu::VertexFormat::Uint32x4:
|
||||
case wgpu::VertexFormat::Sint32x4:
|
||||
return 16;
|
||||
case wgpu::VertexFormat::Undefined:
|
||||
UNREACHABLE();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace utils
|
||||
|
|
|
@ -60,6 +60,8 @@ namespace utils {
|
|||
// in it will contain 1 byte of data.
|
||||
void UnalignDynamicUploader(wgpu::Device device);
|
||||
|
||||
uint32_t VertexFormatSize(wgpu::VertexFormat format);
|
||||
|
||||
} // namespace utils
|
||||
|
||||
#endif // UTILS_TESTHELPERS_H_
|
||||
|
|
Loading…
Reference in New Issue