diff --git a/src/dawn_native/CommandBuffer.cpp b/src/dawn_native/CommandBuffer.cpp index 19b80103f9..b6ba4f05f3 100644 --- a/src/dawn_native/CommandBuffer.cpp +++ b/src/dawn_native/CommandBuffer.cpp @@ -17,6 +17,7 @@ #include "common/BitSetIterator.h" #include "dawn_native/Buffer.h" #include "dawn_native/CommandEncoder.h" +#include "dawn_native/CommandValidation.h" #include "dawn_native/Commands.h" #include "dawn_native/Format.h" #include "dawn_native/Texture.h" @@ -171,27 +172,37 @@ namespace dawn_native { ASSERT(copy != nullptr); if (copy->destination.offset > 0) { + // The copy doesn't touch the start of the buffer. return false; } const TextureBase* texture = copy->source.texture.Get(); const TexelBlockInfo& blockInfo = texture->GetFormat().GetAspectInfo(copy->source.aspect).block; + const uint64_t widthInBlocks = copy->copySize.width / blockInfo.width; const uint64_t heightInBlocks = copy->copySize.height / blockInfo.height; + const bool multiSlice = copy->copySize.depthOrArrayLayers > 1; + const bool multiRow = multiSlice || heightInBlocks > 1; - if (copy->destination.rowsPerImage > heightInBlocks) { + if (multiSlice && copy->destination.rowsPerImage > heightInBlocks) { + // There are gaps between slices that aren't overwritten return false; } - const uint64_t copyTextureDataSizePerRow = - copy->copySize.width / blockInfo.width * blockInfo.byteSize; - if (copy->destination.bytesPerRow > copyTextureDataSizePerRow) { + const uint64_t copyTextureDataSizePerRow = widthInBlocks * blockInfo.byteSize; + if (multiRow && copy->destination.bytesPerRow > copyTextureDataSizePerRow) { + // There are gaps between rows that aren't overwritten return false; } - const uint64_t overwrittenRangeSize = - copyTextureDataSizePerRow * heightInBlocks * copy->copySize.depthOrArrayLayers; - if (copy->destination.buffer->GetSize() > overwrittenRangeSize) { + // After the above checks, we're sure the copy has no gaps. + // Now, compute the total number of bytes written. 
+ const uint64_t writtenBytes = + ComputeRequiredBytesInCopy(blockInfo, copy->copySize, copy->destination.bytesPerRow, + copy->destination.rowsPerImage) + .AcquireSuccess(); + if (!copy->destination.buffer->IsFullBufferRange(copy->destination.offset, writtenBytes)) { + // The written bytes don't cover the whole buffer. return false; } diff --git a/src/dawn_native/d3d12/BufferD3D12.cpp b/src/dawn_native/d3d12/BufferD3D12.cpp index 97a90bed19..399a02c39a 100644 --- a/src/dawn_native/d3d12/BufferD3D12.cpp +++ b/src/dawn_native/d3d12/BufferD3D12.cpp @@ -168,6 +168,21 @@ namespace dawn_native { namespace d3d12 { DAWN_TRY(ClearBuffer(commandRecordingContext, uint8_t(1u))); } + // Initialize the padding bytes to zero. + if (GetDevice()->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse) && + !mappedAtCreation) { + uint32_t paddingBytes = GetAllocatedSize() - GetSize(); + if (paddingBytes > 0) { + CommandRecordingContext* commandRecordingContext; + DAWN_TRY_ASSIGN(commandRecordingContext, + ToBackend(GetDevice())->GetPendingCommandContext()); + + uint32_t clearSize = paddingBytes; + uint64_t clearOffset = GetSize(); + DAWN_TRY(ClearBuffer(commandRecordingContext, 0, clearOffset, clearSize)); + } + } + return {}; } @@ -444,29 +459,33 @@ namespace dawn_native { namespace d3d12 { return {}; } - MaybeError Buffer::ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue) { + MaybeError Buffer::ClearBuffer(CommandRecordingContext* commandContext, + uint8_t clearValue, + uint64_t offset, + uint64_t size) { Device* device = ToBackend(GetDevice()); + size = size > 0 ? size : GetAllocatedSize(); // The state of the buffers on UPLOAD heap must always be GENERIC_READ and cannot be // changed away, so we can only clear such buffer with buffer mapping. 
if (D3D12HeapType(GetUsage()) == D3D12_HEAP_TYPE_UPLOAD) {
- DAWN_TRY(MapInternal(true, 0, size_t(GetAllocatedSize()), "D3D12 map at clear buffer"));
- memset(mMappedData, clearValue, GetAllocatedSize());
+ DAWN_TRY(MapInternal(true, static_cast<size_t>(offset), static_cast<size_t>(size),
+ "D3D12 map at clear buffer"));
+ memset(mMappedData, clearValue, size);
UnmapImpl();
} else {
// TODO(crbug.com/dawn/852): use ClearUnorderedAccessView*() when the buffer usage
// includes STORAGE.
DynamicUploader* uploader = device->GetDynamicUploader();
UploadHandle uploadHandle;
- DAWN_TRY_ASSIGN(uploadHandle, uploader->Allocate(GetAllocatedSize(),
- device->GetPendingCommandSerial(),
- kCopyBufferToBufferOffsetAlignment));
+ DAWN_TRY_ASSIGN(uploadHandle,
+ uploader->Allocate(size, device->GetPendingCommandSerial(),
+ kCopyBufferToBufferOffsetAlignment));
- memset(uploadHandle.mappedBuffer, clearValue, GetAllocatedSize());
+ memset(uploadHandle.mappedBuffer, clearValue, size);
device->CopyFromStagingToBufferImpl(commandContext, uploadHandle.stagingBuffer,
- uploadHandle.startOffset, this, 0,
- GetAllocatedSize());
+ uploadHandle.startOffset, this, offset, size);
}
return {};
diff --git a/src/dawn_native/d3d12/BufferD3D12.h b/src/dawn_native/d3d12/BufferD3D12.h index 4cd3d512ca..ed19efac67 100644 --- a/src/dawn_native/d3d12/BufferD3D12.h +++ b/src/dawn_native/d3d12/BufferD3D12.h @@ -68,7 +68,10 @@ namespace dawn_native { namespace d3d12 {
wgpu::BufferUsage newUsage);
MaybeError InitializeToZero(CommandRecordingContext* commandContext);
- MaybeError ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue);
+ MaybeError ClearBuffer(CommandRecordingContext* commandContext,
+ uint8_t clearValue,
+ uint64_t offset = 0,
+ uint64_t size = 0);
ResourceHeapAllocation mResourceAllocation;
bool mFixedResourceState = false;
diff --git a/src/dawn_native/metal/BufferMTL.h b/src/dawn_native/metal/BufferMTL.h index 409b89b5ab..0c7c5e28b4 100644 --- a/src/dawn_native/metal/BufferMTL.h +++ 
b/src/dawn_native/metal/BufferMTL.h @@ -52,7 +52,10 @@ namespace dawn_native { namespace metal {
MaybeError MapAtCreationImpl() override;
void InitializeToZero(CommandRecordingContext* commandContext);
- void ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue);
+ void ClearBuffer(CommandRecordingContext* commandContext,
+ uint8_t clearValue,
+ uint64_t offset = 0,
+ uint64_t size = 0);
NSPRef<id<MTLBuffer>> mMtlBuffer;
};
diff --git a/src/dawn_native/metal/BufferMTL.mm b/src/dawn_native/metal/BufferMTL.mm index 237fcd01f6..c2c5a313ad 100644 --- a/src/dawn_native/metal/BufferMTL.mm +++ b/src/dawn_native/metal/BufferMTL.mm @@ -45,10 +45,6 @@ namespace dawn_native { namespace metal {
storageMode = MTLResourceStorageModePrivate;
}
- if (GetSize() > std::numeric_limits<NSUInteger>::max()) {
- return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
- }
-
uint32_t alignment = 1;
#ifdef DAWN_PLATFORM_MACOS
// [MTLBlitCommandEncoder fillBuffer] requires the size to be a multiple of 4 on MacOS.
@@ -64,13 +60,25 @@ namespace dawn_native { namespace metal {
alignment = kMinUniformOrStorageBufferAlignment;
}
- // Allocate at least 4 bytes so clamped accesses are always in bounds.
- NSUInteger currentSize = static_cast<NSUInteger>(std::max(GetSize(), uint64_t(4u)));
+ // The vertex pulling transform requires at least 4 bytes in the buffer.
+ // 0-sized vertex buffer bindings are allowed, so we always need an additional 4 bytes
+ // after the end.
+ NSUInteger extraBytes = 0u;
+ if ((GetUsage() & wgpu::BufferUsage::Vertex) != 0) {
+ extraBytes = 4u;
+ }
+
+ if (GetSize() > std::numeric_limits<NSUInteger>::max() - extraBytes) {
+ return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
+ }
+ NSUInteger currentSize =
+ std::max(static_cast<NSUInteger>(GetSize()) + extraBytes, NSUInteger(4));
+
if (currentSize > std::numeric_limits<NSUInteger>::max() - alignment) {
// Alignment would overflow. 
return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large"); } - currentSize = Align(currentSize, kMinUniformOrStorageBufferAlignment); + currentSize = Align(currentSize, alignment); if (@available(iOS 12, macOS 10.14, *)) { NSUInteger maxBufferSize = [ToBackend(GetDevice())->GetMTLDevice() maxBufferLength]; @@ -109,6 +117,19 @@ namespace dawn_native { namespace metal { ClearBuffer(commandContext, uint8_t(1u)); } + // Initialize the padding bytes to zero. + if (GetDevice()->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse) && + !mappedAtCreation) { + uint32_t paddingBytes = GetAllocatedSize() - GetSize(); + if (paddingBytes > 0) { + uint32_t clearSize = Align(paddingBytes, 4); + uint64_t clearOffset = GetAllocatedSize() - clearSize; + + CommandRecordingContext* commandContext = + ToBackend(GetDevice())->GetPendingCommandContext(); + ClearBuffer(commandContext, 0, clearOffset, clearSize); + } + } return {}; } @@ -197,11 +218,15 @@ namespace dawn_native { namespace metal { GetDevice()->IncrementLazyClearCountForTesting(); } - void Buffer::ClearBuffer(CommandRecordingContext* commandContext, uint8_t clearValue) { + void Buffer::ClearBuffer(CommandRecordingContext* commandContext, + uint8_t clearValue, + uint64_t offset, + uint64_t size) { ASSERT(commandContext != nullptr); - ASSERT(GetAllocatedSize() > 0); + size = size > 0 ? 
size : GetAllocatedSize();
+ ASSERT(size > 0);
[commandContext->EnsureBlit() fillBuffer:mMtlBuffer.Get()
- range:NSMakeRange(0, GetAllocatedSize())
+ range:NSMakeRange(offset, size)
value:clearValue];
}
diff --git a/src/dawn_native/metal/CommandBufferMTL.mm b/src/dawn_native/metal/CommandBufferMTL.mm index e49ab1052f..f41cff71ee 100644 --- a/src/dawn_native/metal/CommandBufferMTL.mm +++ b/src/dawn_native/metal/CommandBufferMTL.mm @@ -548,7 +548,8 @@ namespace dawn_native { namespace metal {
mVertexBufferOffsets[slot] = offset;
ASSERT(buffer->GetSize() < std::numeric_limits<uint32_t>::max());
- mVertexBufferBindingSizes[slot] = static_cast<uint32_t>(buffer->GetSize() - offset);
+ mVertexBufferBindingSizes[slot] =
+ static_cast<uint32_t>(buffer->GetAllocatedSize() - offset);
mDirtyVertexBuffers.set(slot);
}
diff --git a/src/dawn_native/opengl/BufferGL.cpp b/src/dawn_native/opengl/BufferGL.cpp index dd90fccb08..1e102d63c5 100644 --- a/src/dawn_native/opengl/BufferGL.cpp +++ b/src/dawn_native/opengl/BufferGL.cpp @@ -49,6 +49,7 @@ namespace dawn_native { namespace opengl {
device->gl.BufferData(GL_ARRAY_BUFFER, mAllocatedSize, clearValues.data(), GL_STATIC_DRAW);
} else {
+ // Buffers start zeroed if you pass nullptr to glBufferData.
device->gl.BufferData(GL_ARRAY_BUFFER, mAllocatedSize, nullptr, GL_STATIC_DRAW);
}
}
diff --git a/src/dawn_native/vulkan/BufferVk.cpp b/src/dawn_native/vulkan/BufferVk.cpp index 7b796f96a7..8139b29062 100644 --- a/src/dawn_native/vulkan/BufferVk.cpp +++ b/src/dawn_native/vulkan/BufferVk.cpp @@ -137,16 +137,34 @@ namespace dawn_native { namespace vulkan {
}
MaybeError Buffer::Initialize(bool mappedAtCreation) {
+ // vkCmdFillBuffer requires the size to be a multiple of 4.
+ constexpr size_t kAlignment = 4u;
+
+ uint32_t extraBytes = 0u;
+ if (GetUsage() & (wgpu::BufferUsage::Vertex | wgpu::BufferUsage::Index)) {
+ // vkCmdSetIndexBuffer and vkCmdSetVertexBuffer are invalid if the offset
+ // is equal to the whole buffer size. 
Allocate at least one more byte so it
+ // is valid to setVertex/IndexBuffer with a zero-sized range at the end
+ // of the buffer with (offset=buffer.size, size=0).
+ extraBytes = 1u;
+ }
+
+ uint64_t size = GetSize();
+ if (size > std::numeric_limits<uint64_t>::max() - extraBytes) {
+ return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
+ }
+
+ size += extraBytes;
+
// Allocate at least 4 bytes so clamped accesses are always in bounds.
// Also, Vulkan requires the size to be non-zero.
- uint64_t size = std::max(GetSize(), uint64_t(4u));
- // vkCmdFillBuffer requires the size to be a multiple of 4.
- size_t alignment = 4u;
- if (size > std::numeric_limits<uint64_t>::max() - alignment) {
+ size = std::max(size, uint64_t(4u));
+
+ if (size > std::numeric_limits<uint64_t>::max() - kAlignment) {
// Alignment would overflow.
return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
}
- mAllocatedSize = Align(size, alignment);
+ mAllocatedSize = Align(size, kAlignment);
// Avoid passing ludicrously large sizes to drivers because it causes issues: drivers add
// some constants to the size passed and align it, but for values close to the maximum
@@ -200,6 +218,17 @@ namespace dawn_native { namespace vulkan {
ClearBuffer(device->GetPendingRecordingContext(), 0x01010101);
}
+ // Initialize the padding bytes to zero. 
+ if (device->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse) && !mappedAtCreation) { + uint32_t paddingBytes = GetAllocatedSize() - GetSize(); + if (paddingBytes > 0) { + uint32_t clearSize = Align(paddingBytes, 4); + uint64_t clearOffset = GetAllocatedSize() - clearSize; + + CommandRecordingContext* recordingContext = device->GetPendingRecordingContext(); + ClearBuffer(recordingContext, 0, clearOffset, clearSize); + } + } return {}; } @@ -354,17 +383,21 @@ namespace dawn_native { namespace vulkan { SetIsDataInitialized(); } - void Buffer::ClearBuffer(CommandRecordingContext* recordingContext, uint32_t clearValue) { + void Buffer::ClearBuffer(CommandRecordingContext* recordingContext, + uint32_t clearValue, + uint64_t offset, + uint64_t size) { ASSERT(recordingContext != nullptr); - ASSERT(GetAllocatedSize() > 0); + size = size > 0 ? size : GetAllocatedSize(); + ASSERT(size > 0); TransitionUsageNow(recordingContext, wgpu::BufferUsage::CopyDst); Device* device = ToBackend(GetDevice()); // VK_WHOLE_SIZE doesn't work on old Windows Intel Vulkan drivers, so we don't use it. // Note: Allocated size must be a multiple of 4. 
- ASSERT(GetAllocatedSize() % 4 == 0); - device->fn.CmdFillBuffer(recordingContext->commandBuffer, mHandle, 0, GetAllocatedSize(), + ASSERT(size % 4 == 0); + device->fn.CmdFillBuffer(recordingContext->commandBuffer, mHandle, offset, size, clearValue); } }} // namespace dawn_native::vulkan diff --git a/src/dawn_native/vulkan/BufferVk.h b/src/dawn_native/vulkan/BufferVk.h index e9a8951e30..1c40140f12 100644 --- a/src/dawn_native/vulkan/BufferVk.h +++ b/src/dawn_native/vulkan/BufferVk.h @@ -55,7 +55,10 @@ namespace dawn_native { namespace vulkan { MaybeError Initialize(bool mappedAtCreation); void InitializeToZero(CommandRecordingContext* recordingContext); - void ClearBuffer(CommandRecordingContext* recordingContext, uint32_t clearValue); + void ClearBuffer(CommandRecordingContext* recordingContext, + uint32_t clearValue, + uint64_t offset = 0, + uint64_t size = 0); MaybeError MapAsyncImpl(wgpu::MapMode mode, size_t offset, size_t size) override; void UnmapImpl() override; diff --git a/src/tests/end2end/BufferZeroInitTests.cpp b/src/tests/end2end/BufferZeroInitTests.cpp index 57be4d566a..04c961b839 100644 --- a/src/tests/end2end/BufferZeroInitTests.cpp +++ b/src/tests/end2end/BufferZeroInitTests.cpp @@ -14,6 +14,7 @@ #include "tests/DawnTest.h" +#include "common/Math.h" #include "utils/ComboRenderPipelineDescriptor.h" #include "utils/TestUtils.h" #include "utils/WGPUHelpers.h" @@ -201,8 +202,10 @@ class BufferZeroInitTest : public DawnTest { EXPECT_PIXEL_RGBA8_EQ(kExpectedColor, outputTexture, 0u, 0u); } - wgpu::RenderPipeline CreateRenderPipelineForTest(const char* vertexShader, - uint32_t vertexBufferCount = 1u) { + wgpu::RenderPipeline CreateRenderPipelineForTest( + const char* vertexShader, + uint32_t vertexBufferCount = 1u, + wgpu::VertexFormat vertexFormat = wgpu::VertexFormat::Float32x4) { constexpr wgpu::TextureFormat kColorAttachmentFormat = wgpu::TextureFormat::RGBA8Unorm; wgpu::ShaderModule vsModule = utils::CreateShaderModule(device, vertexShader); @@ 
-219,9 +222,9 @@ class BufferZeroInitTest : public DawnTest { descriptor.cFragment.module = fsModule; descriptor.primitive.topology = wgpu::PrimitiveTopology::PointList; descriptor.vertex.bufferCount = vertexBufferCount; - descriptor.cBuffers[0].arrayStride = 4 * sizeof(float); + descriptor.cBuffers[0].arrayStride = Align(utils::VertexFormatSize(vertexFormat), 4); descriptor.cBuffers[0].attributeCount = 1; - descriptor.cAttributes[0].format = wgpu::VertexFormat::Float32x4; + descriptor.cAttributes[0].format = vertexFormat; descriptor.cTargets[0].format = kColorAttachmentFormat; return device.CreateRenderPipeline(&descriptor); } @@ -1128,6 +1131,114 @@ TEST_P(BufferZeroInitTest, SetVertexBuffer) { } } +// Test for crbug.com/dawn/837. +// Test that the padding after a buffer allocation is initialized to 0. +// This test makes an unaligned vertex buffer which should be padded in the backend +// allocation. It then tries to index off the end of the vertex buffer in an indexed +// draw call. A backend which implements robust buffer access via clamping should +// still see zeros at the end of the buffer. +TEST_P(BufferZeroInitTest, PaddingInitialized) { + DAWN_SUPPRESS_TEST_IF(IsANGLE()); // TODO(crbug.com/dawn/1084). + + constexpr wgpu::TextureFormat kColorAttachmentFormat = wgpu::TextureFormat::RGBA8Unorm; + // A small sub-4-byte format means a single vertex can fit entirely within the padded buffer, + // touching some of the padding. Test a small format, as well as larger formats. 
+ for (wgpu::VertexFormat vertexFormat :
+ {wgpu::VertexFormat::Unorm8x2, wgpu::VertexFormat::Float16x2,
+ wgpu::VertexFormat::Float32x2}) {
+ wgpu::RenderPipeline renderPipeline =
+ CreateRenderPipelineForTest(R"(
+ struct VertexOut {
+ [[location(0)]] color : vec4<f32>;
+ [[builtin(position)]] position : vec4<f32>;
+ };
+
+ [[stage(vertex)]] fn main([[location(0)]] pos : vec2<f32>) -> VertexOut {
+ var output : VertexOut;
+ if (all(pos == vec2<f32>(0.0, 0.0))) {
+ output.color = vec4<f32>(0.0, 1.0, 0.0, 1.0);
+ } else {
+ output.color = vec4<f32>(1.0, 0.0, 0.0, 1.0);
+ }
+ output.position = vec4<f32>(0.0, 0.0, 0.0, 1.0);
+ return output;
+ })",
+ /* vertexBufferCount */ 1u, vertexFormat);
+
+ // Create an index buffer that indexes off the end of the vertex buffer.
+ wgpu::Buffer indexBuffer =
+ utils::CreateBufferFromData(device, wgpu::BufferUsage::Index, {1});
+
+ const uint32_t vertexFormatSize = utils::VertexFormatSize(vertexFormat);
+
+ // Create an 8-bit texture to use to initialize buffer contents.
+ wgpu::TextureDescriptor initTextureDesc = {};
+ initTextureDesc.size = {vertexFormatSize + 4, 1, 1};
+ initTextureDesc.format = wgpu::TextureFormat::R8Unorm;
+ initTextureDesc.usage = wgpu::TextureUsage::CopySrc | wgpu::TextureUsage::CopyDst;
+ wgpu::ImageCopyTexture zeroTextureSrc =
+ utils::CreateImageCopyTexture(device.CreateTexture(&initTextureDesc), 0, {0, 0, 0});
+ {
+ wgpu::TextureDataLayout layout =
+ utils::CreateTextureDataLayout(0, wgpu::kCopyStrideUndefined);
+ std::vector<uint8_t> data(initTextureDesc.size.width);
+ queue.WriteTexture(&zeroTextureSrc, data.data(), data.size(), &layout,
+ &initTextureDesc.size);
+ }
+
+ for (uint32_t extraBytes : {0, 1, 2, 3, 4}) {
+ // Create a vertex buffer to hold a single vertex attribute.
+ // Uniform usage is added to force even more padding on D3D12.
+ // The buffer is internally padded and allocated as a larger buffer. 
+ const uint32_t vertexBufferSize = vertexFormatSize + extraBytes; + for (uint32_t vertexBufferOffset = 0; vertexBufferOffset <= vertexBufferSize; + vertexBufferOffset += 4u) { + wgpu::Buffer vertexBuffer = CreateBuffer( + vertexBufferSize, wgpu::BufferUsage::Vertex | wgpu::BufferUsage::Uniform | + wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst); + + // "Fully" initialize the buffer with a copy from an 8-bit texture, touching + // everything except the padding. From the point-of-view of the API, all + // |vertexBufferSize| bytes are initialized. Note: Uses CopyTextureToBuffer because + // it does not require 4-byte alignment. + { + wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); + wgpu::ImageCopyBuffer dst = + utils::CreateImageCopyBuffer(vertexBuffer, 0, wgpu::kCopyStrideUndefined); + wgpu::Extent3D extent = {vertexBufferSize, 1, 1}; + encoder.CopyTextureToBuffer(&zeroTextureSrc, &dst, &extent); + + wgpu::CommandBuffer commandBuffer = encoder.Finish(); + EXPECT_LAZY_CLEAR(0u, queue.Submit(1, &commandBuffer)); + } + + wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); + + wgpu::Texture colorAttachment = + CreateAndInitializeTexture({1, 1, 1}, kColorAttachmentFormat); + utils::ComboRenderPassDescriptor renderPassDescriptor( + {colorAttachment.CreateView()}); + + wgpu::RenderPassEncoder renderPass = encoder.BeginRenderPass(&renderPassDescriptor); + + renderPass.SetVertexBuffer(0, vertexBuffer, vertexBufferOffset); + renderPass.SetIndexBuffer(indexBuffer, wgpu::IndexFormat::Uint32); + + renderPass.SetPipeline(renderPipeline); + renderPass.DrawIndexed(1); + renderPass.EndPass(); + + wgpu::CommandBuffer commandBuffer = encoder.Finish(); + + EXPECT_LAZY_CLEAR(0u, queue.Submit(1, &commandBuffer)); + + constexpr RGBA8 kExpectedPixelValue = {0, 255, 0, 255}; + EXPECT_PIXEL_RGBA8_EQ(kExpectedPixelValue, colorAttachment, 0, 0); + } + } + } +} + // Test the buffer will be lazily initialized correctly when its first use is in SetIndexBuffer. 
TEST_P(BufferZeroInitTest, SetIndexBuffer) {
// Bind the whole buffer as an index buffer.
diff --git a/src/tests/end2end/TextureZeroInitTests.cpp b/src/tests/end2end/TextureZeroInitTests.cpp index 56084a35fd..e43a5305fa 100644 --- a/src/tests/end2end/TextureZeroInitTests.cpp +++ b/src/tests/end2end/TextureZeroInitTests.cpp @@ -2060,6 +2060,84 @@ TEST_P(CompressedTextureZeroInitTest, HalfCopyTextureToTextureMipLevel) {
kViewMipLevel, 0, true);
}
+// Test uploading then reading back from a 2D array compressed texture.
+// This is a regression test for a bug where the final destination buffer
+// was considered fully initialized even though there was a 256-byte
+// stride between images.
+TEST_P(CompressedTextureZeroInitTest, Copy2DArrayCompressedB2T2B) {
+ // TODO(crbug.com/dawn/643): diagnose and fix this failure on OpenGL.
+ DAWN_SUPPRESS_TEST_IF(IsOpenGL() || IsOpenGLES());
+
+ // create srcTexture with data
+ wgpu::TextureDescriptor textureDescriptor = CreateTextureDescriptor(
+ 4, 5, wgpu::TextureUsage::CopySrc | wgpu::TextureUsage::CopyDst, utils::kBCFormats[0]);
+ textureDescriptor.size = {8, 8, 5};
+ wgpu::Texture srcTexture = device.CreateTexture(&textureDescriptor);
+
+ uint32_t mipLevel = 2;
+ wgpu::Extent3D copyExtent3D = {4, 4, 5};
+
+ uint32_t copyWidthInBlock = copyExtent3D.width / kFormatBlockByteSize;
+ uint32_t copyHeightInBlock = copyExtent3D.height / kFormatBlockByteSize;
+ uint32_t copyRowsPerImage = copyHeightInBlock;
+ uint32_t copyBytesPerRow =
+ Align(copyWidthInBlock * utils::GetTexelBlockSizeInBytes(textureDescriptor.format),
+ kTextureBytesPerRowAlignment);
+
+ // Generate data to upload
+ std::vector<uint8_t> data(utils::RequiredBytesInCopy(copyBytesPerRow, copyRowsPerImage,
+ copyExtent3D, textureDescriptor.format));
+ for (size_t i = 0; i < data.size(); ++i) {
+ data[i] = i % 255;
+ }
+
+ // Copy texture data from a staging buffer to the destination texture. 
+ wgpu::Buffer stagingBuffer =
+ utils::CreateBufferFromData(device, data.data(), data.size(), wgpu::BufferUsage::CopySrc);
+ wgpu::ImageCopyBuffer imageCopyBufferSrc =
+ utils::CreateImageCopyBuffer(stagingBuffer, 0, copyBytesPerRow, copyRowsPerImage);
+
+ wgpu::ImageCopyTexture imageCopyTexture =
+ utils::CreateImageCopyTexture(srcTexture, mipLevel, {0, 0, 0});
+
+ {
+ wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+ encoder.CopyBufferToTexture(&imageCopyBufferSrc, &imageCopyTexture, &copyExtent3D);
+ wgpu::CommandBuffer copy = encoder.Finish();
+ EXPECT_LAZY_CLEAR(0u, queue.Submit(1, &copy));
+ }
+
+ // Create a buffer to read back the data. It is the same size as the upload buffer.
+ wgpu::BufferDescriptor readbackDesc = {};
+ readbackDesc.size = data.size();
+ readbackDesc.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
+ wgpu::Buffer readbackBuffer = device.CreateBuffer(&readbackDesc);
+
+ // Copy the texture to the readback buffer.
+ wgpu::ImageCopyBuffer imageCopyBufferDst =
+ utils::CreateImageCopyBuffer(readbackBuffer, 0, copyBytesPerRow, copyRowsPerImage);
+ {
+ wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+ encoder.CopyTextureToBuffer(&imageCopyTexture, &imageCopyBufferDst, &copyExtent3D);
+ wgpu::CommandBuffer copy = encoder.Finish();
+
+ // Expect a lazy clear because the padding in the copy is not touched.
+ EXPECT_LAZY_CLEAR(1u, queue.Submit(1, &copy));
+ }
+
+ // Generate expected data. It is the same as the upload data, but padding is zero. 
+ std::vector<uint8_t> expected(data.size(), 0);
+ for (uint32_t z = 0; z < copyExtent3D.depthOrArrayLayers; ++z) {
+ for (uint32_t y = 0; y < copyHeightInBlock; ++y) {
+ memcpy(&expected[copyBytesPerRow * y + copyBytesPerRow * copyRowsPerImage * z],
+ &data[copyBytesPerRow * y + copyBytesPerRow * copyRowsPerImage * z],
+ copyWidthInBlock * utils::GetTexelBlockSizeInBytes(textureDescriptor.format));
+ }
+ }
+ // Check final contents
+ EXPECT_BUFFER_U8_RANGE_EQ(expected.data(), readbackBuffer, 0, expected.size());
+}
+
 DAWN_INSTANTIATE_TEST(CompressedTextureZeroInitTest,
D3D12Backend({"nonzero_clear_resources_on_creation_for_testing"}),
MetalBackend({"nonzero_clear_resources_on_creation_for_testing"}),
diff --git a/src/utils/TestUtils.cpp b/src/utils/TestUtils.cpp index adbdd28be1..8f359abe57 100644 --- a/src/utils/TestUtils.cpp +++ b/src/utils/TestUtils.cpp @@ -134,4 +134,47 @@ namespace utils {
device.GetQueue().WriteTexture(&imageCopyTexture, data.data(), 1, &textureDataLayout,
&copyExtent);
}
+
+ uint32_t VertexFormatSize(wgpu::VertexFormat format) {
+ switch (format) {
+ case wgpu::VertexFormat::Uint8x2:
+ case wgpu::VertexFormat::Sint8x2:
+ case wgpu::VertexFormat::Unorm8x2:
+ case wgpu::VertexFormat::Snorm8x2:
+ return 2;
+ case wgpu::VertexFormat::Uint8x4:
+ case wgpu::VertexFormat::Sint8x4:
+ case wgpu::VertexFormat::Unorm8x4:
+ case wgpu::VertexFormat::Snorm8x4:
+ case wgpu::VertexFormat::Uint16x2:
+ case wgpu::VertexFormat::Sint16x2:
+ case wgpu::VertexFormat::Unorm16x2:
+ case wgpu::VertexFormat::Snorm16x2:
+ case wgpu::VertexFormat::Float16x2:
+ case wgpu::VertexFormat::Float32:
+ case wgpu::VertexFormat::Uint32:
+ case wgpu::VertexFormat::Sint32:
+ return 4;
+ case wgpu::VertexFormat::Uint16x4:
+ case wgpu::VertexFormat::Sint16x4:
+ case wgpu::VertexFormat::Unorm16x4:
+ case wgpu::VertexFormat::Snorm16x4:
+ case wgpu::VertexFormat::Float16x4:
+ case wgpu::VertexFormat::Float32x2:
+ case wgpu::VertexFormat::Uint32x2:
+ case wgpu::VertexFormat::Sint32x2:
+ return 8; 
+ case wgpu::VertexFormat::Float32x3: + case wgpu::VertexFormat::Uint32x3: + case wgpu::VertexFormat::Sint32x3: + return 12; + case wgpu::VertexFormat::Float32x4: + case wgpu::VertexFormat::Uint32x4: + case wgpu::VertexFormat::Sint32x4: + return 16; + case wgpu::VertexFormat::Undefined: + UNREACHABLE(); + } + } + } // namespace utils diff --git a/src/utils/TestUtils.h b/src/utils/TestUtils.h index d611124175..02b0dafb31 100644 --- a/src/utils/TestUtils.h +++ b/src/utils/TestUtils.h @@ -60,6 +60,8 @@ namespace utils { // in it will contain 1 byte of data. void UnalignDynamicUploader(wgpu::Device device); + uint32_t VertexFormatSize(wgpu::VertexFormat format); + } // namespace utils #endif // UTILS_TESTHELPERS_H_