diff --git a/src/dawn/native/Toggles.cpp b/src/dawn/native/Toggles.cpp
index 3816f20a57..ae64158bf1 100644
--- a/src/dawn/native/Toggles.cpp
+++ b/src/dawn/native/Toggles.cpp
@@ -306,6 +306,12 @@ static constexpr ToggleEnumAndInfoList kToggleNameAndInfoList = {{
       "integer that is greater than 2^24 or smaller than -2^24). This toggle is also enabled on "
       "Intel GPUs on Metal backend due to a driver issue on Intel Metal driver.",
       "https://crbug.com/dawn/537"}},
+    {Toggle::MetalUseDummyBlitEncoderForWriteTimestamp,
+     {"metal_use_dummy_blit_encoder_for_write_timestamp",
+      "Add dummy blit command to blit encoder when encoding writeTimestamp as workaround on Metal."
+      "This toggle is enabled by default on Metal backend where GPU counters cannot be stored to"
+      "sampleBufferAttachments on empty blit encoder.",
+      "https://crbug.com/dawn/1473"}},
     // Comment to separate the }} so it is clearer what to copy-paste to add a toggle.
 }};
 }  // anonymous namespace
diff --git a/src/dawn/native/Toggles.h b/src/dawn/native/Toggles.h
index 981aa94e0c..942f76a167 100644
--- a/src/dawn/native/Toggles.h
+++ b/src/dawn/native/Toggles.h
@@ -79,6 +79,7 @@ enum class Toggle {
     D3D12AllocateExtraMemoryFor2DArrayTexture,
     D3D12UseTempBufferInDepthStencilTextureAndBufferCopyWithNonZeroBufferOffset,
     ApplyClearBigIntegerColorValueWithDraw,
+    MetalUseDummyBlitEncoderForWriteTimestamp,
 
     EnumCount,
     InvalidEnum = EnumCount,
diff --git a/src/dawn/native/metal/BackendMTL.mm b/src/dawn/native/metal/BackendMTL.mm
index 9415e52828..9d5648033c 100644
--- a/src/dawn/native/metal/BackendMTL.mm
+++ b/src/dawn/native/metal/BackendMTL.mm
@@ -24,6 +24,7 @@
 #include "dawn/native/MetalBackend.h"
 #include "dawn/native/metal/BufferMTL.h"
 #include "dawn/native/metal/DeviceMTL.h"
+#include "dawn/native/metal/UtilsMetal.h"
 
 #if DAWN_PLATFORM_IS(MACOS)
 #import <IOKit/IOKitLib.h>
@@ -170,18 +171,6 @@ MaybeError GetDevicePCIInfo(id<MTLDevice> device, PCIIDs* ids) {
 #error "Unsupported Apple platform."
 #endif
 
-DAWN_NOINLINE bool IsCounterSamplingBoundarySupport(id<MTLDevice> device)
-    API_AVAILABLE(macos(11.0), ios(14.0)) {
-    bool isBlitBoundarySupported =
-        [device supportsCounterSampling:MTLCounterSamplingPointAtBlitBoundary];
-    bool isDispatchBoundarySupported =
-        [device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary];
-    bool isDrawBoundarySupported =
-        [device supportsCounterSampling:MTLCounterSamplingPointAtDrawBoundary];
-
-    return isBlitBoundarySupported && isDispatchBoundarySupported && isDrawBoundarySupported;
-}
-
 // This method has seen hard-to-debug crashes. See crbug.com/dawn/1102.
 // For now, it is written defensively, with many potentially unnecessary guards until
 // we narrow down the cause of the problem.
@@ -246,11 +235,13 @@ DAWN_NOINLINE bool IsGPUCounterSupported(id<MTLDevice> device,
     }
 
     if (@available(macOS 11.0, iOS 14.0, *)) {
-        // Check whether it can read GPU counters at the specified command boundary. Apple
-        // family GPUs do not support sampling between different Metal commands, because
-        // they defer fragment processing until after the GPU processes all the primitives
-        // in the render pass.
-        if (!IsCounterSamplingBoundarySupport(device)) {
+        // Check whether it can read GPU counters at the specified command boundary or stage
+        // boundary. Apple family GPUs do not support sampling between different Metal commands,
+        // because they defer fragment processing until after the GPU processes all the primitives
+        // in the render pass. GPU counters are only available if sampling at least one of the
+        // command or stage boundaries is supported.
+        if (!SupportCounterSamplingAtCommandBoundary(device) &&
+            !SupportCounterSamplingAtStageBoundary(device)) {
             return false;
         }
     }
diff --git a/src/dawn/native/metal/CommandBufferMTL.h b/src/dawn/native/metal/CommandBufferMTL.h
index 0f95ef4243..2976ec81dd 100644
--- a/src/dawn/native/metal/CommandBufferMTL.h
+++ b/src/dawn/native/metal/CommandBufferMTL.h
@@ -16,12 +16,15 @@
 #define SRC_DAWN_NATIVE_METAL_COMMANDBUFFERMTL_H_
 
 #include "dawn/native/CommandBuffer.h"
+#include "dawn/native/Commands.h"
 #include "dawn/native/Error.h"
 
 #import <Metal/Metal.h>
 
 namespace dawn::native {
 class CommandEncoder;
+struct BeginComputePassCmd;
+struct BeginRenderPassCmd;
 }
 
 namespace dawn::native::metal {
@@ -55,8 +58,10 @@ class CommandBuffer final : public CommandBufferBase {
   private:
     using CommandBufferBase::CommandBufferBase;
 
-    MaybeError EncodeComputePass(CommandRecordingContext* commandContext);
-    MaybeError EncodeRenderPass(id<MTLRenderCommandEncoder> encoder);
+    MaybeError EncodeComputePass(CommandRecordingContext* commandContext,
+                                 BeginComputePassCmd* computePassCmd);
+    MaybeError EncodeRenderPass(id<MTLRenderCommandEncoder> encoder,
+                                BeginRenderPassCmd* renderPassCmd);
 };
 
 }  // namespace dawn::native::metal
diff --git a/src/dawn/native/metal/CommandBufferMTL.mm b/src/dawn/native/metal/CommandBufferMTL.mm
index 66a4569277..a5e5fbe52e 100644
--- a/src/dawn/native/metal/CommandBufferMTL.mm
+++ b/src/dawn/native/metal/CommandBufferMTL.mm
@@ -49,7 +49,129 @@ MTLIndexType MTLIndexFormat(wgpu::IndexFormat format) {
     }
 }
 
-NSRef<MTLRenderPassDescriptor> CreateMTLRenderPassDescriptor(BeginRenderPassCmd* renderPass) {
+template <typename PassDescriptor>
+class SampleBufferAttachment {
+  public:
+    void SetSampleBuffer(PassDescriptor* descriptor, id<MTLCounterSampleBuffer> sampleBuffer)
+        API_AVAILABLE(macos(11.0), ios(14.0));
+    void SetStartSampleIndex(PassDescriptor* descriptor, NSUInteger sampleIndex)
+        API_AVAILABLE(macos(11.0), ios(14.0));
+    void SetEndSampleIndex(PassDescriptor* descriptor, NSUInteger sampleIndex)
+        API_AVAILABLE(macos(11.0), ios(14.0));
+
+  private:
+    // Initialized to the maximum value, in order to start from 0 after the first increment.
+    NSUInteger attachmentIndex = NSUIntegerMax;
+    // TODO(dawn:1473): The maximum of sampleBufferAttachments depends on the length of MTLDevice's
+    // counterSets, but Metal does not match the allowed maximum of sampleBufferAttachments with the
+    // length of counterSets on AGX family. Hardcode as a constant and check this whenever Metal
+    // could get the matched value.
+    static constexpr NSUInteger kMaxSampleBufferAttachments = 4;
+};
+
+template <typename PassDescriptor>
+void SampleBufferAttachment<PassDescriptor>::SetSampleBuffer(
+    PassDescriptor* descriptor,
+    id<MTLCounterSampleBuffer> sampleBuffer) API_AVAILABLE(macos(11.0), ios(14.0)) {
+    attachmentIndex++;
+    ASSERT(attachmentIndex < kMaxSampleBufferAttachments);
+    descriptor.sampleBufferAttachments[attachmentIndex].sampleBuffer = sampleBuffer;
+}
+
+// Must be called after SetSampleBuffer
+template <>
+void SampleBufferAttachment<MTLRenderPassDescriptor>::SetStartSampleIndex(
+    MTLRenderPassDescriptor* descriptor,
+    NSUInteger sampleIndex) API_AVAILABLE(macos(11.0), ios(14.0)) {
+    ASSERT(attachmentIndex < kMaxSampleBufferAttachments);
+    descriptor.sampleBufferAttachments[attachmentIndex].startOfVertexSampleIndex = sampleIndex;
+}
+
+// Must be called after SetSampleBuffer
+template <>
+void SampleBufferAttachment<MTLRenderPassDescriptor>::SetEndSampleIndex(
+    MTLRenderPassDescriptor* descriptor,
+    NSUInteger sampleIndex) API_AVAILABLE(macos(11.0), ios(14.0)) {
+    ASSERT(attachmentIndex < kMaxSampleBufferAttachments);
+    descriptor.sampleBufferAttachments[attachmentIndex].endOfFragmentSampleIndex = sampleIndex;
+}
+
+// Must be called after SetSampleBuffer
+template <>
+void SampleBufferAttachment<MTLComputePassDescriptor>::SetStartSampleIndex(
+    MTLComputePassDescriptor* descriptor,
+    NSUInteger sampleIndex) API_AVAILABLE(macos(11.0), ios(14.0)) {
+    ASSERT(attachmentIndex < kMaxSampleBufferAttachments);
+    descriptor.sampleBufferAttachments[attachmentIndex].startOfEncoderSampleIndex = sampleIndex;
+}
+
+// Must be called after SetSampleBuffer
+template <>
+void SampleBufferAttachment<MTLComputePassDescriptor>::SetEndSampleIndex(
+    MTLComputePassDescriptor* descriptor,
+    NSUInteger sampleIndex) API_AVAILABLE(macos(11.0), ios(14.0)) {
+    // TODO(dawn:1473): Use MTLComputePassSampleBuffers or query method instead of the magic number
+    // 4 when Metal could get the maximum of sampleBufferAttachments on compute pass
+    ASSERT(attachmentIndex < kMaxSampleBufferAttachments);
+    descriptor.sampleBufferAttachments[attachmentIndex].endOfEncoderSampleIndex = sampleIndex;
+}
+
+template <typename PassDescriptor, typename BeginPass>
+void SetSampleBufferAttachments(PassDescriptor* descriptor, BeginPass* cmd) {
+    // Use @available instead of API_AVAILABLE because GetCounterSampleBuffer() also needs checking
+    // API availability.
+    if (@available(macOS 11.0, iOS 14.0, *)) {
+        QuerySetBase* beginQuerySet = cmd->beginTimestamp.querySet.Get();
+        QuerySetBase* endQuerySet = cmd->endTimestamp.querySet.Get();
+
+        SampleBufferAttachment<PassDescriptor> sampleBufferAttachment;
+
+        if (beginQuerySet != nullptr) {
+            sampleBufferAttachment.SetSampleBuffer(
+                descriptor, ToBackend(beginQuerySet)->GetCounterSampleBuffer());
+            sampleBufferAttachment.SetStartSampleIndex(descriptor,
+                                                       NSUInteger(cmd->beginTimestamp.queryIndex));
+
+            if (beginQuerySet == endQuerySet) {
+                sampleBufferAttachment.SetEndSampleIndex(descriptor,
+                                                         NSUInteger(cmd->endTimestamp.queryIndex));
+            } else {
+                sampleBufferAttachment.SetEndSampleIndex(descriptor, MTLCounterDontSample);
+            }
+        }
+
+        // Set to other sampleBufferAttachment if the endQuerySet is different with beginQuerySet.
+        if (endQuerySet != nullptr && beginQuerySet != endQuerySet) {
+            sampleBufferAttachment.SetSampleBuffer(
+                descriptor, ToBackend(endQuerySet)->GetCounterSampleBuffer());
+            sampleBufferAttachment.SetStartSampleIndex(descriptor, MTLCounterDontSample);
+            sampleBufferAttachment.SetEndSampleIndex(descriptor,
+                                                     NSUInteger(cmd->endTimestamp.queryIndex));
+        }
+    } else {
+        UNREACHABLE();
+    }
+}
+
+NSRef<MTLComputePassDescriptor> CreateMTLComputePassDescriptor(BeginComputePassCmd* computePass)
+    API_AVAILABLE(macos(11.0), ios(14.0)) {
+    // Note that this creates a descriptor that's autoreleased so we don't use AcquireNSRef
+    NSRef<MTLComputePassDescriptor> descriptorRef =
+        [MTLComputePassDescriptor computePassDescriptor];
+    MTLComputePassDescriptor* descriptor = descriptorRef.Get();
+    // MTLDispatchTypeSerial is the same dispatch type as the deafult MTLComputeCommandEncoder.
+    // MTLDispatchTypeConcurrent requires memory barriers to ensure multiple commands synchronize
+    // access to the same resources, which we may support it later.
+    descriptor.dispatchType = MTLDispatchTypeSerial;
+
+    SetSampleBufferAttachments(descriptor, computePass);
+
+    return descriptorRef;
+}
+
+NSRef<MTLRenderPassDescriptor> CreateMTLRenderPassDescriptor(
+    BeginRenderPassCmd* renderPass,
+    bool useCounterSamplingAtStageBoundary) {
     // Note that this creates a descriptor that's autoreleased so we don't use AcquireNSRef
     NSRef<MTLRenderPassDescriptor> descriptorRef = [MTLRenderPassDescriptor renderPassDescriptor];
     MTLRenderPassDescriptor* descriptor = descriptorRef.Get();
@@ -197,9 +319,36 @@ NSRef<MTLRenderPassDescriptor> CreateMTLRenderPassDescriptor(BeginRenderPassCmd*
             ToBackend(renderPass->occlusionQuerySet.Get())->GetVisibilityBuffer();
     }
 
+    if (@available(macOS 11.0, iOS 14.0, *)) {
+        if (useCounterSamplingAtStageBoundary) {
+            SetSampleBufferAttachments(descriptor, renderPass);
+        }
+    }
+
     return descriptorRef;
 }
 
+void EncodeEmptyBlitEncoderForWriteTimestamp(Device* device,
+                                             CommandRecordingContext* commandContext,
+                                             WriteTimestampCmd* cmd)
+    API_AVAILABLE(macos(11.0), ios(14.0)) {
+    commandContext->EndBlit();
+
+    MTLBlitPassDescriptor* descriptor = [[MTLBlitPassDescriptor alloc] init];
+    if (cmd->querySet.Get() != nullptr) {
+        descriptor.sampleBufferAttachments[0].sampleBuffer =
+            ToBackend(cmd->querySet.Get())->GetCounterSampleBuffer();
+        descriptor.sampleBufferAttachments[0].startOfEncoderSampleIndex = MTLCounterDontSample;
+        descriptor.sampleBufferAttachments[0].endOfEncoderSampleIndex = NSUInteger(cmd->queryIndex);
+
+        id<MTLBlitCommandEncoder> blit = commandContext->BeginBlit(descriptor);
+        if (device->IsToggleEnabled(Toggle::MetalUseDummyBlitEncoderForWriteTimestamp)) {
+            [blit fillBuffer:device->GetDummyBlitMtlBuffer() range:NSMakeRange(0, 1) value:0];
+        }
+        commandContext->EndBlit();
+    }
+}
+
 // Metal uses a physical addressing mode which means buffers in the shading language are
 // just pointers to the virtual address of their start. This means there is no way to know
 // the length of a buffer to compute the length() of unsized arrays at the end of storage
@@ -612,18 +761,13 @@ MaybeError CommandBuffer::FillCommands(CommandRecordingContext* commandContext)
             case Command::BeginComputePass: {
                 BeginComputePassCmd* cmd = mCommands.NextCommand<BeginComputePassCmd>();
 
-                if (cmd->beginTimestamp.querySet.Get() != nullptr ||
-                    cmd->endTimestamp.querySet.Get() != nullptr) {
-                    return DAWN_UNIMPLEMENTED_ERROR("timestampWrites unimplemented.");
-                }
-
                 for (const SyncScopeResourceUsage& scope :
                      GetResourceUsages().computePasses[nextComputePassNumber].dispatchUsages) {
                     LazyClearSyncScope(scope, commandContext);
                 }
                 commandContext->EndBlit();
 
-                DAWN_TRY(EncodeComputePass(commandContext));
+                DAWN_TRY(EncodeComputePass(commandContext, cmd));
 
                 nextComputePassNumber++;
                 break;
@@ -632,22 +776,19 @@ MaybeError CommandBuffer::FillCommands(CommandRecordingContext* commandContext)
             case Command::BeginRenderPass: {
                 BeginRenderPassCmd* cmd = mCommands.NextCommand<BeginRenderPassCmd>();
 
-                if (cmd->beginTimestamp.querySet.Get() != nullptr ||
-                    cmd->endTimestamp.querySet.Get() != nullptr) {
-                    return DAWN_UNIMPLEMENTED_ERROR("timestampWrites unimplemented.");
-                }
-
                 LazyClearSyncScope(GetResourceUsages().renderPasses[nextRenderPassNumber],
                                    commandContext);
                 commandContext->EndBlit();
 
                 LazyClearRenderPassAttachments(cmd);
-                NSRef<MTLRenderPassDescriptor> descriptor = CreateMTLRenderPassDescriptor(cmd);
+                NSRef<MTLRenderPassDescriptor> descriptor = CreateMTLRenderPassDescriptor(
+                    cmd, ToBackend(GetDevice())->UseCounterSamplingAtStageBoundary());
                 DAWN_TRY(EncodeMetalRenderPass(
                     ToBackend(GetDevice()), commandContext, descriptor.Get(), cmd->width,
-                    cmd->height, [this](id<MTLRenderCommandEncoder> encoder) -> MaybeError {
-                        return this->EncodeRenderPass(encoder);
-                    }));
+                    cmd->height,
+                    [this](id<MTLRenderCommandEncoder> encoder, BeginRenderPassCmd* cmd)
+                        -> MaybeError { return this->EncodeRenderPass(encoder, cmd); },
+                    cmd));
 
                 nextRenderPassNumber++;
                 break;
@@ -905,16 +1046,29 @@ MaybeError CommandBuffer::FillCommands(CommandRecordingContext* commandContext)
 
             case Command::WriteTimestamp: {
                 WriteTimestampCmd* cmd = mCommands.NextCommand<WriteTimestampCmd>();
-                QuerySet* querySet = ToBackend(cmd->querySet.Get());
 
-                if (@available(macos 10.15, iOS 14.0, *)) {
-                    [commandContext->EnsureBlit()
-                        sampleCountersInBuffer:querySet->GetCounterSampleBuffer()
-                                 atSampleIndex:NSUInteger(cmd->queryIndex)
-                                   withBarrier:YES];
+                if (ToBackend(GetDevice())->UseCounterSamplingAtStageBoundary()) {
+                    if (@available(macos 11.0, iOS 14.0, *)) {
+                        // Simulate writeTimestamp cmd between blit commands on the devices which
+                        // supports counter sampling at stage boundary.
+                        EncodeEmptyBlitEncoderForWriteTimestamp(ToBackend(GetDevice()),
+                                                                commandContext, cmd);
+                    } else {
+                        UNREACHABLE();
+                    }
                 } else {
-                    UNREACHABLE();
+                    if (@available(macos 10.15, iOS 14.0, *)) {
+                        ASSERT(ToBackend(GetDevice())->UseCounterSamplingAtCommandBoundary());
+                        [commandContext->EnsureBlit()
+                            sampleCountersInBuffer:ToBackend(cmd->querySet.Get())
+                                                       ->GetCounterSampleBuffer()
+                                     atSampleIndex:NSUInteger(cmd->queryIndex)
+                                       withBarrier:YES];
+                    } else {
+                        UNREACHABLE();
+                    }
                 }
+
                 break;
             }
 
@@ -985,18 +1139,64 @@ MaybeError CommandBuffer::FillCommands(CommandRecordingContext* commandContext)
     return {};
 }
 
-MaybeError CommandBuffer::EncodeComputePass(CommandRecordingContext* commandContext) {
+MaybeError CommandBuffer::EncodeComputePass(CommandRecordingContext* commandContext,
+                                            BeginComputePassCmd* computePassCmd) {
     ComputePipeline* lastPipeline = nullptr;
     StorageBufferLengthTracker storageBufferLengths = {};
     BindGroupTracker bindGroups(&storageBufferLengths);
 
-    id<MTLComputeCommandEncoder> encoder = commandContext->BeginCompute();
+    id<MTLComputeCommandEncoder> encoder;
+    // When counter sampling is supported at stage boundary, begin a configurable compute pass
+    // encoder which is supported since macOS 11.0+ and iOS 14.0+ and set timestamp writes to
+    // compute pass descriptor, otherwise begin a default compute pass encoder, and simulate
+    // timestamp writes using sampleCountersInBuffer API at the beginning and end of compute pass.
+    if (ToBackend(GetDevice())->UseCounterSamplingAtStageBoundary()) {
+        if (@available(macOS 11.0, iOS 14.0, *)) {
+            NSRef<MTLComputePassDescriptor> descriptor =
+                CreateMTLComputePassDescriptor(computePassCmd);
+            encoder = commandContext->BeginCompute(descriptor.Get());
+        } else {
+            UNREACHABLE();
+        }
+    } else {
+        encoder = commandContext->BeginCompute();
+
+        if (@available(macos 10.15, iOS 14.0, *)) {
+            if (computePassCmd->beginTimestamp.querySet.Get() != nullptr) {
+                ASSERT(ToBackend(GetDevice())->UseCounterSamplingAtCommandBoundary());
+
+                [encoder
+                    sampleCountersInBuffer:ToBackend(computePassCmd->beginTimestamp.querySet.Get())
+                                               ->GetCounterSampleBuffer()
+                             atSampleIndex:NSUInteger(computePassCmd->beginTimestamp.queryIndex)
+                               withBarrier:YES];
+            }
+        }
+    }
 
     Command type;
     while (mCommands.NextCommandId(&type)) {
         switch (type) {
             case Command::EndComputePass: {
                 mCommands.NextCommand<EndComputePassCmd>();
+
+                if (@available(macos 10.15, iOS 14.0, *)) {
+                    // Simulate timestamp write at the end of render pass if it does not support
+                    // counter sampling at stage boundary.
+                    if (ToBackend(GetDevice())->UseCounterSamplingAtCommandBoundary() &&
+                        computePassCmd->endTimestamp.querySet.Get() != nullptr) {
+                        ASSERT(!ToBackend(GetDevice())->UseCounterSamplingAtStageBoundary());
+
+                        [encoder
+                            sampleCountersInBuffer:ToBackend(
+                                                       computePassCmd->endTimestamp.querySet.Get())
+                                                       ->GetCounterSampleBuffer()
+                                     atSampleIndex:NSUInteger(
+                                                       computePassCmd->endTimestamp.queryIndex)
+                                       withBarrier:YES];
+                    }
+                }
+
                 commandContext->EndCompute();
                 return {};
             }
@@ -1104,7 +1304,8 @@ MaybeError CommandBuffer::EncodeComputePass(CommandRecordingContext* commandCont
     UNREACHABLE();
 }
 
-MaybeError CommandBuffer::EncodeRenderPass(id<MTLRenderCommandEncoder> encoder) {
+MaybeError CommandBuffer::EncodeRenderPass(id<MTLRenderCommandEncoder> encoder,
+                                           BeginRenderPassCmd* renderPassCmd) {
     bool enableVertexPulling = GetDevice()->IsToggleEnabled(Toggle::MetalEnableVertexPulling);
     RenderPipeline* lastPipeline = nullptr;
     id<MTLBuffer> indexBuffer = nullptr;
@@ -1116,6 +1317,20 @@ MaybeError CommandBuffer::EncodeRenderPass(id<MTLRenderCommandEncoder> encoder)
     VertexBufferTracker vertexBuffers(&storageBufferLengths);
     BindGroupTracker bindGroups(&storageBufferLengths);
 
+    if (@available(macos 10.15, iOS 14.0, *)) {
+        // Simulate timestamp write at the beginning of render pass by
+        // sampleCountersInBuffer if it does not support counter sampling at stage boundary.
+        if (ToBackend(GetDevice())->UseCounterSamplingAtCommandBoundary() &&
+            renderPassCmd->beginTimestamp.querySet.Get() != nullptr) {
+            ASSERT(!ToBackend(GetDevice())->UseCounterSamplingAtStageBoundary());
+
+            [encoder sampleCountersInBuffer:ToBackend(renderPassCmd->beginTimestamp.querySet.Get())
+                                                ->GetCounterSampleBuffer()
+                              atSampleIndex:NSUInteger(renderPassCmd->beginTimestamp.queryIndex)
+                                withBarrier:YES];
+        }
+    }
+
     auto EncodeRenderBundleCommand = [&](CommandIterator* iter, Command type) {
         switch (type) {
             case Command::Draw: {
@@ -1304,6 +1519,24 @@ MaybeError CommandBuffer::EncodeRenderPass(id<MTLRenderCommandEncoder> encoder)
         switch (type) {
             case Command::EndRenderPass: {
                 mCommands.NextCommand<EndRenderPassCmd>();
+
+                if (@available(macos 10.15, iOS 14.0, *)) {
+                    // Simulate timestamp write at the end of render pass if it does not support
+                    // counter sampling at stage boundary.
+                    if (ToBackend(GetDevice())->UseCounterSamplingAtCommandBoundary() &&
+                        renderPassCmd->endTimestamp.querySet.Get() != nullptr) {
+                        ASSERT(!ToBackend(GetDevice())->UseCounterSamplingAtStageBoundary());
+
+                        [encoder
+                            sampleCountersInBuffer:ToBackend(
+                                                       renderPassCmd->endTimestamp.querySet.Get())
+                                                       ->GetCounterSampleBuffer()
+                                     atSampleIndex:NSUInteger(
+                                                       renderPassCmd->endTimestamp.queryIndex)
+                                       withBarrier:YES];
+                    }
+                }
+
                 return {};
             }
 
diff --git a/src/dawn/native/metal/CommandRecordingContext.h b/src/dawn/native/metal/CommandRecordingContext.h
index b5ec3defaf..025c88bbe0 100644
--- a/src/dawn/native/metal/CommandRecordingContext.h
+++ b/src/dawn/native/metal/CommandRecordingContext.h
@@ -36,10 +36,18 @@ class CommandRecordingContext : NonMovable {
     MaybeError PrepareNextCommandBuffer(id<MTLCommandQueue> queue);
     NSPRef<id<MTLCommandBuffer>> AcquireCommands();
 
+    // Create blit pass encoder from blit pass descriptor
+    id<MTLBlitCommandEncoder> BeginBlit(MTLBlitPassDescriptor* descriptor)
+        API_AVAILABLE(macos(11.0), ios(14.0));
     id<MTLBlitCommandEncoder> EnsureBlit();
     void EndBlit();
 
+    // Create a sequential compute pass by default.
     id<MTLComputeCommandEncoder> BeginCompute();
+    // Create configurable compute pass from a descriptor with serial dispatch type which commands
+    // are executed sequentially.
+    id<MTLComputeCommandEncoder> BeginCompute(MTLComputePassDescriptor* descriptor)
+        API_AVAILABLE(macos(11.0), ios(14.0));
     void EndCompute();
 
     id<MTLRenderCommandEncoder> BeginRender(MTLRenderPassDescriptor* descriptor);
diff --git a/src/dawn/native/metal/CommandRecordingContext.mm b/src/dawn/native/metal/CommandRecordingContext.mm
index d4bbef068d..294e53f8ea 100644
--- a/src/dawn/native/metal/CommandRecordingContext.mm
+++ b/src/dawn/native/metal/CommandRecordingContext.mm
@@ -62,6 +62,20 @@ NSPRef<id<MTLCommandBuffer>> CommandRecordingContext::AcquireCommands() {
     return std::move(mCommands);
 }
 
+id<MTLBlitCommandEncoder> CommandRecordingContext::BeginBlit(MTLBlitPassDescriptor* descriptor)
+    API_AVAILABLE(macos(11.0), ios(14.0)) {
+    ASSERT(descriptor);
+    ASSERT(mCommands != nullptr);
+    ASSERT(mBlit == nullptr);
+    ASSERT(!mInEncoder);
+
+    mInEncoder = true;
+    // The encoder is created autoreleased. Retain it to avoid the autoreleasepool from
+    // draining from under us.
+    mBlit.Acquire([[*mCommands blitCommandEncoderWithDescriptor:descriptor] retain]);
+    return mBlit.Get();
+}
+
 id<MTLBlitCommandEncoder> CommandRecordingContext::EnsureBlit() {
     ASSERT(mCommands != nullptr);
 
@@ -98,6 +112,20 @@ id<MTLComputeCommandEncoder> CommandRecordingContext::BeginCompute() {
     return mCompute.Get();
 }
 
+id<MTLComputeCommandEncoder> CommandRecordingContext::BeginCompute(
+    MTLComputePassDescriptor* descriptor) API_AVAILABLE(macos(11.0), ios(14.0)) {
+    ASSERT(descriptor);
+    ASSERT(mCommands != nullptr);
+    ASSERT(mCompute == nullptr);
+    ASSERT(!mInEncoder);
+
+    mInEncoder = true;
+    // The encoder is created autoreleased. Retain it to avoid the autoreleasepool from
+    // draining from under us.
+    mCompute.Acquire([[*mCommands computeCommandEncoderWithDescriptor:descriptor] retain]);
+    return mCompute.Get();
+}
+
 void CommandRecordingContext::EndCompute() {
     ASSERT(mCommands != nullptr);
     ASSERT(mCompute != nullptr);
diff --git a/src/dawn/native/metal/DeviceMTL.h b/src/dawn/native/metal/DeviceMTL.h
index 62d4f909ba..dca0033d16 100644
--- a/src/dawn/native/metal/DeviceMTL.h
+++ b/src/dawn/native/metal/DeviceMTL.h
@@ -72,6 +72,13 @@ class Device final : public DeviceBase {
 
     float GetTimestampPeriodInNS() const override;
 
+    bool UseCounterSamplingAtCommandBoundary() const;
+    bool UseCounterSamplingAtStageBoundary() const;
+
+    // Get a MTLBuffer that can be used as a dummy in a no-op blit encoder based on filling this
+    // single-byte buffer
+    id<MTLBuffer> GetDummyBlitMtlBuffer();
+
   private:
     Device(AdapterBase* adapter,
            NSPRef<id<MTLDevice>> mtlDevice,
@@ -144,6 +151,13 @@ class Device final : public DeviceBase {
     MTLTimestamp mGpuTimestamp API_AVAILABLE(macos(10.15), ios(14.0)) = 0;
     // The parameters for kalman filter
     std::unique_ptr<KalmanInfo> mKalmanInfo;
+
+    // Support counter sampling between blit commands, dispatches and draw calls
+    bool mCounterSamplingAtCommandBoundary;
+    // Support counter sampling at the begin and end of blit pass, compute pass and render pass's
+    // vertex/fragement stage
+    bool mCounterSamplingAtStageBoundary;
+    NSPRef<id<MTLBuffer>> mDummyBlitMtlBuffer;
 };
 
 }  // namespace dawn::native::metal
diff --git a/src/dawn/native/metal/DeviceMTL.mm b/src/dawn/native/metal/DeviceMTL.mm
index a3fdac1e51..e5e592b6c2 100644
--- a/src/dawn/native/metal/DeviceMTL.mm
+++ b/src/dawn/native/metal/DeviceMTL.mm
@@ -121,7 +121,18 @@ Device::Device(AdapterBase* adapter,
                const TripleStateTogglesSet& userProvidedToggles)
     : DeviceBase(adapter, descriptor, userProvidedToggles),
       mMtlDevice(std::move(mtlDevice)),
-      mCompletedSerial(0) {}
+      mCompletedSerial(0) {
+    // On macOS < 11.0, we only can check whether counter sampling is supported, and the counter
+    // only can be sampled between command boundary using sampleCountersInBuffer API if it's
+    // supported.
+    if (@available(macOS 11.0, iOS 14.0, *)) {
+        mCounterSamplingAtCommandBoundary = SupportCounterSamplingAtCommandBoundary(GetMTLDevice());
+        mCounterSamplingAtStageBoundary = SupportCounterSamplingAtStageBoundary(GetMTLDevice());
+    } else {
+        mCounterSamplingAtCommandBoundary = true;
+        mCounterSamplingAtStageBoundary = false;
+    }
+}
 
 Device::~Device() {
     Destroy();
@@ -234,6 +245,14 @@ void Device::InitTogglesFromDriver() {
     if (gpu_info::IsIntel(vendorId)) {
         SetToggle(Toggle::ApplyClearBigIntegerColorValueWithDraw, true);
     }
+
+    // TODO(dawn:1473): Metal fails to store GPU counters to sampleBufferAttachments on empty
+    // encoders on macOS 11.0+, we need to add dummy blit command to blit encoder when encoding
+    // writeTimestamp as workaround by enabling the toggle
+    // "metal_use_dummy_blit_encoder_for_write_timestamp".
+    if (@available(macos 11.0, iOS 14.0, *)) {
+        SetToggle(Toggle::MetalUseDummyBlitEncoderForWriteTimestamp, true);
+    }
 }
 
 ResultOrError<Ref<BindGroupBase>> Device::CreateBindGroupImpl(
@@ -498,6 +517,7 @@ void Device::DestroyImpl() {
 
     mCommandQueue = nullptr;
     mMtlDevice = nullptr;
+    mDummyBlitMtlBuffer = nullptr;
 }
 
 uint32_t Device::GetOptimalBytesPerRowAlignment() const {
@@ -512,4 +532,21 @@ float Device::GetTimestampPeriodInNS() const {
     return mTimestampPeriod;
 }
 
+bool Device::UseCounterSamplingAtCommandBoundary() const {
+    return mCounterSamplingAtCommandBoundary;
+}
+
+bool Device::UseCounterSamplingAtStageBoundary() const {
+    return mCounterSamplingAtStageBoundary;
+}
+
+id<MTLBuffer> Device::GetDummyBlitMtlBuffer() {
+    if (mDummyBlitMtlBuffer == nullptr) {
+        mDummyBlitMtlBuffer.Acquire(
+            [GetMTLDevice() newBufferWithLength:1 options:MTLResourceStorageModePrivate]);
+    }
+
+    return mDummyBlitMtlBuffer.Get();
+}
+
 }  // namespace dawn::native::metal
diff --git a/src/dawn/native/metal/UtilsMetal.h b/src/dawn/native/metal/UtilsMetal.h
index 418c4a6864..d491ecbd62 100644
--- a/src/dawn/native/metal/UtilsMetal.h
+++ b/src/dawn/native/metal/UtilsMetal.h
@@ -23,6 +23,7 @@
 #import <Metal/Metal.h>
 
 namespace dawn::native {
+struct BeginRenderPassCmd;
 struct ProgrammableStage;
 struct EntryPointMetadata;
 enum class SingleShaderStage;
@@ -81,19 +82,26 @@ constexpr MTLStoreAction kMTLStoreActionStoreAndMultisampleResolve =
 // happen at the render pass start and end. Because workarounds wrap the encoding of the render
 // pass, the encoding must be entirely done by the `encodeInside` callback.
 // At the end of this function, `commandContext` will have no encoder open.
-using EncodeInsideRenderPass = std::function<MaybeError(id<MTLRenderCommandEncoder>)>;
+using EncodeInsideRenderPass =
+    std::function<MaybeError(id<MTLRenderCommandEncoder>, BeginRenderPassCmd* renderPassCmd)>;
 MaybeError EncodeMetalRenderPass(Device* device,
                                  CommandRecordingContext* commandContext,
                                  MTLRenderPassDescriptor* mtlRenderPass,
                                  uint32_t width,
                                  uint32_t height,
-                                 EncodeInsideRenderPass encodeInside);
+                                 EncodeInsideRenderPass encodeInside,
+                                 BeginRenderPassCmd* renderPassCmd = nullptr);
 
 MaybeError EncodeEmptyMetalRenderPass(Device* device,
                                       CommandRecordingContext* commandContext,
                                       MTLRenderPassDescriptor* mtlRenderPass,
                                       Extent3D size);
 
+bool SupportCounterSamplingAtCommandBoundary(id<MTLDevice> device)
+    API_AVAILABLE(macos(11.0), ios(14.0));
+bool SupportCounterSamplingAtStageBoundary(id<MTLDevice> device)
+    API_AVAILABLE(macos(11.0), ios(14.0));
+
 }  // namespace dawn::native::metal
 
 #endif  // SRC_DAWN_NATIVE_METAL_UTILSMETAL_H_
diff --git a/src/dawn/native/metal/UtilsMetal.mm b/src/dawn/native/metal/UtilsMetal.mm
index 4eb5fb6e81..9a091b88fa 100644
--- a/src/dawn/native/metal/UtilsMetal.mm
+++ b/src/dawn/native/metal/UtilsMetal.mm
@@ -328,7 +328,8 @@ MaybeError EncodeMetalRenderPass(Device* device,
                                  MTLRenderPassDescriptor* mtlRenderPass,
                                  uint32_t width,
                                  uint32_t height,
-                                 EncodeInsideRenderPass encodeInside) {
+                                 EncodeInsideRenderPass encodeInside,
+                                 BeginRenderPassCmd* renderPassCmd) {
     // This function handles multiple workarounds. Because some cases requires multiple
     // workarounds to happen at the same time, it handles workarounds one by one and calls
     // itself recursively to handle the next workaround if needed.
@@ -359,7 +360,7 @@ MaybeError EncodeMetalRenderPass(Device* device,
         // resolve back to the true resolve targets.
         if (workaroundUsed) {
             DAWN_TRY(EncodeMetalRenderPass(device, commandContext, mtlRenderPass, width, height,
-                                           std::move(encodeInside)));
+                                           std::move(encodeInside), renderPassCmd));
 
             for (uint32_t i = 0; i < kMaxColorAttachments; ++i) {
                 if (trueResolveAttachments[i].texture == nullptr) {
@@ -403,7 +404,7 @@ MaybeError EncodeMetalRenderPass(Device* device,
 
         if (workaroundUsed) {
             DAWN_TRY(EncodeMetalRenderPass(device, commandContext, mtlRenderPass, width, height,
-                                           std::move(encodeInside)));
+                                           std::move(encodeInside), renderPassCmd));
 
             for (uint32_t i = 0; i < kMaxColorAttachments; ++i) {
                 if (originalAttachments[i].texture == nullptr) {
@@ -439,7 +440,7 @@ MaybeError EncodeMetalRenderPass(Device* device,
         // If we found a store + MSAA resolve we need to resolve in a different render pass.
         if (hasStoreAndMSAAResolve) {
             DAWN_TRY(EncodeMetalRenderPass(device, commandContext, mtlRenderPass, width, height,
-                                           std::move(encodeInside)));
+                                           std::move(encodeInside), renderPassCmd));
 
             ResolveInAnotherRenderPass(commandContext, mtlRenderPass, resolveTextures);
             return {};
@@ -448,7 +449,7 @@ MaybeError EncodeMetalRenderPass(Device* device,
 
     // No (more) workarounds needed! We can finally encode the actual render pass.
     commandContext->EndBlit();
-    DAWN_TRY(encodeInside(commandContext->BeginRender(mtlRenderPass)));
+    DAWN_TRY(encodeInside(commandContext->BeginRender(mtlRenderPass), renderPassCmd));
     commandContext->EndRender();
     return {};
 }
@@ -457,8 +458,26 @@ MaybeError EncodeEmptyMetalRenderPass(Device* device,
                                       CommandRecordingContext* commandContext,
                                       MTLRenderPassDescriptor* mtlRenderPass,
                                       Extent3D size) {
-    return EncodeMetalRenderPass(device, commandContext, mtlRenderPass, size.width, size.height,
-                                 [&](id<MTLRenderCommandEncoder>) -> MaybeError { return {}; });
+    return EncodeMetalRenderPass(
+        device, commandContext, mtlRenderPass, size.width, size.height,
+        [&](id<MTLRenderCommandEncoder>, BeginRenderPassCmd*) -> MaybeError { return {}; });
+}
+
+DAWN_NOINLINE bool SupportCounterSamplingAtCommandBoundary(id<MTLDevice> device)
+    API_AVAILABLE(macos(11.0), ios(14.0)) {
+    bool isBlitBoundarySupported =
+        [device supportsCounterSampling:MTLCounterSamplingPointAtBlitBoundary];
+    bool isDispatchBoundarySupported =
+        [device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary];
+    bool isDrawBoundarySupported =
+        [device supportsCounterSampling:MTLCounterSamplingPointAtDrawBoundary];
+
+    return isBlitBoundarySupported && isDispatchBoundarySupported && isDrawBoundarySupported;
+}
+
+DAWN_NOINLINE bool SupportCounterSamplingAtStageBoundary(id<MTLDevice> device)
+    API_AVAILABLE(macos(11.0), ios(14.0)) {
+    return [device supportsCounterSampling:MTLCounterSamplingPointAtStageBoundary];
 }
 
 }  // namespace dawn::native::metal
diff --git a/src/dawn/tests/DawnTest.cpp b/src/dawn/tests/DawnTest.cpp
index f1c3fb49ca..ff72d3482c 100644
--- a/src/dawn/tests/DawnTest.cpp
+++ b/src/dawn/tests/DawnTest.cpp
@@ -667,6 +667,10 @@ bool DawnTestBase::IsAMD() const {
     return gpu_info::IsAMD(mParam.adapterProperties.vendorID);
 }
 
+bool DawnTestBase::IsApple() const {
+    return gpu_info::IsApple(mParam.adapterProperties.vendorID);
+}
+
 bool DawnTestBase::IsARM() const {
     return gpu_info::IsARM(mParam.adapterProperties.vendorID);
 }
diff --git a/src/dawn/tests/DawnTest.h b/src/dawn/tests/DawnTest.h
index 7622dd23d5..0a95873392 100644
--- a/src/dawn/tests/DawnTest.h
+++ b/src/dawn/tests/DawnTest.h
@@ -225,6 +225,7 @@ class DawnTestBase {
     bool IsVulkan() const;
 
     bool IsAMD() const;
+    bool IsApple() const;
     bool IsARM() const;
     bool IsImgTec() const;
     bool IsIntel() const;
diff --git a/src/dawn/tests/end2end/QueryTests.cpp b/src/dawn/tests/end2end/QueryTests.cpp
index ff195376c6..5c641ad8f4 100644
--- a/src/dawn/tests/end2end/QueryTests.cpp
+++ b/src/dawn/tests/end2end/QueryTests.cpp
@@ -18,6 +18,15 @@
 #include "dawn/utils/ComboRenderPipelineDescriptor.h"
 #include "dawn/utils/WGPUHelpers.h"
 
+// Clear the content of the result buffer into 0xFFFFFFFF.
+constexpr static uint64_t kSentinelValue = ~uint64_t(0u);
+constexpr static uint64_t kZero = 0u;
+constexpr static unsigned int kRTSize = 4;
+constexpr uint64_t kMinDestinationOffset = 256;
+constexpr uint64_t kMinCount = kMinDestinationOffset / sizeof(uint64_t);
+constexpr wgpu::TextureFormat kColorFormat = wgpu::TextureFormat::RGBA8Unorm;
+constexpr wgpu::TextureFormat kDepthStencilFormat = wgpu::TextureFormat::Depth24PlusStencil8;
+
 class QueryTests : public DawnTest {
   protected:
     wgpu::Buffer CreateResolveBuffer(uint64_t size) {
@@ -27,13 +36,15 @@ class QueryTests : public DawnTest {
                            wgpu::BufferUsage::CopyDst;
         return device.CreateBuffer(&descriptor);
     }
-};
 
-// Clear the content of the result buffer into 0xFFFFFFFF.
-constexpr static uint64_t kSentinelValue = ~uint64_t(0u);
-constexpr static uint64_t kZero = 0u;
-constexpr uint64_t kMinDestinationOffset = 256;
-constexpr uint64_t kMinCount = kMinDestinationOffset / sizeof(uint64_t);
+    wgpu::Texture CreateRenderTexture(wgpu::TextureFormat format) {
+        wgpu::TextureDescriptor descriptor;
+        descriptor.size = {kRTSize, kRTSize, 1};
+        descriptor.format = format;
+        descriptor.usage = wgpu::TextureUsage::RenderAttachment;
+        return device.CreateTexture(&descriptor);
+    }
+};
 
 class OcclusionExpectation : public detail::Expectation {
   public:
@@ -112,14 +123,6 @@ class OcclusionQueryTests : public QueryTests {
         return device.CreateQuerySet(&descriptor);
     }
 
-    wgpu::Texture CreateRenderTexture(wgpu::TextureFormat format) {
-        wgpu::TextureDescriptor descriptor;
-        descriptor.size = {kRTSize, kRTSize, 1};
-        descriptor.format = format;
-        descriptor.usage = wgpu::TextureUsage::RenderAttachment;
-        return device.CreateTexture(&descriptor);
-    }
-
     void TestOcclusionQueryWithDepthStencilTest(bool depthTestEnabled,
                                                 bool stencilTestEnabled,
                                                 OcclusionExpectation::Result expected) {
@@ -130,8 +133,7 @@ class OcclusionQueryTests : public QueryTests {
         descriptor.cFragment.module = fsModule;
 
         // Enable depth and stencil tests and set comparison tests never pass.
-        wgpu::DepthStencilState* depthStencil =
-            descriptor.EnableDepthStencil(wgpu::TextureFormat::Depth24PlusStencil8);
+        wgpu::DepthStencilState* depthStencil = descriptor.EnableDepthStencil(kDepthStencilFormat);
         depthStencil->depthCompare =
             depthTestEnabled ? wgpu::CompareFunction::Never : wgpu::CompareFunction::Always;
         depthStencil->stencilFront.compare =
@@ -141,10 +143,10 @@ class OcclusionQueryTests : public QueryTests {
 
         wgpu::RenderPipeline renderPipeline = device.CreateRenderPipeline(&descriptor);
 
-        wgpu::Texture renderTarget = CreateRenderTexture(wgpu::TextureFormat::RGBA8Unorm);
+        wgpu::Texture renderTarget = CreateRenderTexture(kColorFormat);
         wgpu::TextureView renderTargetView = renderTarget.CreateView();
 
-        wgpu::Texture depthTexture = CreateRenderTexture(wgpu::TextureFormat::Depth24PlusStencil8);
+        wgpu::Texture depthTexture = CreateRenderTexture(kDepthStencilFormat);
         wgpu::TextureView depthTextureView = depthTexture.CreateView();
 
         wgpu::QuerySet querySet = CreateOcclusionQuerySet(kQueryCount);
@@ -205,8 +207,6 @@ class OcclusionQueryTests : public QueryTests {
     wgpu::ShaderModule fsModule;
 
     wgpu::RenderPipeline pipeline;
-
-    constexpr static unsigned int kRTSize = 4;
 };
 
 // Test creating query set with the type of Occlusion
@@ -551,23 +551,87 @@ class TimestampQueryTests : public QueryTests {
         return device.CreateQuerySet(&descriptor);
     }
 
+    wgpu::RenderPipeline CreateRenderPipeline(bool hasFragmentStage = true) {
+        utils::ComboRenderPipelineDescriptor descriptor;
+
+        descriptor.vertex.module = utils::CreateShaderModule(device, R"(
+                @vertex
+                fn main(@builtin(vertex_index) VertexIndex : u32) -> @builtin(position) vec4<f32> {
+                    var pos = array<vec2<f32>, 3>(
+                        vec2<f32>( 1.0,  1.0),
+                        vec2<f32>(-1.0, -1.0),
+                        vec2<f32>( 1.0, -1.0));
+                    return vec4<f32>(pos[VertexIndex], 0.0, 1.0);
+                })");
+
+        if (hasFragmentStage) {
+            descriptor.cFragment.module = utils::CreateShaderModule(device, R"(
+                @fragment fn main() -> @location(0) vec4<f32> {
+                    return vec4<f32>(0.0, 1.0, 0.0, 1.0);
+                })");
+        } else {
+            descriptor.fragment = nullptr;
+            descriptor.EnableDepthStencil(kDepthStencilFormat);
+        }
+
+        return device.CreateRenderPipeline(&descriptor);
+    }
+
+    void EncodeComputeTimestampWrites(
+        const wgpu::CommandEncoder& encoder,
+        const std::vector<wgpu::ComputePassTimestampWrite>& timestampWrites,
+        bool hasPipeline = true) {
+        wgpu::ComputePassDescriptor descriptor;
+        descriptor.timestampWriteCount = timestampWrites.size();
+        descriptor.timestampWrites = timestampWrites.data();
+
+        wgpu::ComputePassEncoder pass = encoder.BeginComputePass(&descriptor);
+        if (hasPipeline) {
+            pass.SetPipeline(computePipeline);
+            pass.DispatchWorkgroups(1, 1, 1);
+        }
+        pass.End();
+    }
+
+    void EncodeRenderTimestampWrites(
+        const wgpu::CommandEncoder& encoder,
+        const std::vector<wgpu::RenderPassTimestampWrite>& timestampWrites,
+        bool hasPipeline = true,
+        bool hasFragmentStage = true) {
+        wgpu::Texture depthTexture = CreateRenderTexture(kDepthStencilFormat);
+        utils::ComboRenderPassDescriptor renderPassDesc =
+            hasFragmentStage
+                ? utils::ComboRenderPassDescriptor({CreateRenderTexture(kColorFormat).CreateView()})
+                : utils::ComboRenderPassDescriptor(
+                      {}, CreateRenderTexture(kDepthStencilFormat).CreateView());
+        renderPassDesc.timestampWriteCount = timestampWrites.size();
+        renderPassDesc.timestampWrites = timestampWrites.data();
+
+        wgpu::RenderPassEncoder pass = encoder.BeginRenderPass(&renderPassDesc);
+        if (hasPipeline) {
+            wgpu::RenderPipeline renderPipeline = CreateRenderPipeline(hasFragmentStage);
+            pass.SetPipeline(renderPipeline);
+            pass.Draw(3);
+        }
+        pass.End();
+    }
+
     void TestTimestampWritesOnComputePass(
         const std::vector<wgpu::ComputePassTimestampWrite>& timestampWrites,
-        const std::vector<wgpu::ComputePassTimestampWrite>& timestampWritesOnAnotherPass = {}) {
+        const std::vector<wgpu::ComputePassTimestampWrite>& timestampWritesOnAnotherPass = {},
+        bool hasPipeline = true) {
         size_t queryCount = timestampWrites.size() + timestampWritesOnAnotherPass.size();
         // The destination buffer offset must be a multiple of 256.
         wgpu::Buffer destination =
             CreateResolveBuffer(queryCount * kMinDestinationOffset + sizeof(uint64_t));
 
-        wgpu::ComputePassDescriptor descriptor;
-        descriptor.timestampWriteCount = timestampWrites.size();
-        descriptor.timestampWrites = timestampWrites.data();
-
         wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
-        wgpu::ComputePassEncoder pass = encoder.BeginComputePass(&descriptor);
-        pass.SetPipeline(computePipeline);
-        pass.DispatchWorkgroups(1, 1, 1);
-        pass.End();
+        EncodeComputeTimestampWrites(encoder, timestampWrites, hasPipeline);
+
+        // Begin another compute pass if the timestampWritesOnAnotherPass is set.
+        if (!timestampWritesOnAnotherPass.empty()) {
+            EncodeComputeTimestampWrites(encoder, timestampWritesOnAnotherPass, hasPipeline);
+        }
 
         // Resolve queries one by one because the query set at the beginning of pass may be
         // different with the one at the end of pass.
@@ -575,25 +639,10 @@ class TimestampQueryTests : public QueryTests {
             encoder.ResolveQuerySet(timestampWrites[i].querySet, timestampWrites[i].queryIndex, 1,
                                     destination, i * kMinDestinationOffset);
         }
-
-        // Begin another compute pass if the timestampWritesOnAnotherPass is set.
-        if (!timestampWritesOnAnotherPass.empty()) {
-            wgpu::ComputePassDescriptor descriptor2;
-            descriptor2.timestampWriteCount = timestampWritesOnAnotherPass.size();
-            descriptor2.timestampWrites = timestampWritesOnAnotherPass.data();
-
-            wgpu::ComputePassEncoder pass2 = encoder.BeginComputePass(&descriptor2);
-            pass2.SetPipeline(computePipeline);
-            pass2.DispatchWorkgroups(1, 1, 1);
-            pass2.End();
-
-            for (size_t i = 0; i < timestampWritesOnAnotherPass.size(); i++) {
-                // Resolve queries one by one because the query set at the beginning of pass may be
-                // different with the one at the end of pass.
-                encoder.ResolveQuerySet(timestampWritesOnAnotherPass[i].querySet,
-                                        timestampWritesOnAnotherPass[i].queryIndex, 1, destination,
-                                        (timestampWrites.size() + i) * kMinDestinationOffset);
-            }
+        for (size_t i = 0; i < timestampWritesOnAnotherPass.size(); i++) {
+            encoder.ResolveQuerySet(timestampWritesOnAnotherPass[i].querySet,
+                                    timestampWritesOnAnotherPass[i].queryIndex, 1, destination,
+                                    (timestampWrites.size() + i) * kMinDestinationOffset);
         }
 
         wgpu::CommandBuffer commands = encoder.Finish();
@@ -607,19 +656,22 @@ class TimestampQueryTests : public QueryTests {
 
     void TestTimestampWritesOnRenderPass(
         const std::vector<wgpu::RenderPassTimestampWrite>& timestampWrites,
-        const std::vector<wgpu::RenderPassTimestampWrite>& timestampWritesOnAnotherPass = {}) {
+        const std::vector<wgpu::RenderPassTimestampWrite>& timestampWritesOnAnotherPass = {},
+        bool hasPipeline = true,
+        bool hasFragmentStage = true) {
         size_t queryCount = timestampWrites.size() + timestampWritesOnAnotherPass.size();
         // The destination buffer offset must be a multiple of 256.
         wgpu::Buffer destination =
             CreateResolveBuffer(queryCount * kMinDestinationOffset + sizeof(uint64_t));
 
-        utils::BasicRenderPass renderPass = utils::CreateBasicRenderPass(device, 1, 1);
-        renderPass.renderPassInfo.timestampWriteCount = timestampWrites.size();
-        renderPass.renderPassInfo.timestampWrites = timestampWrites.data();
-
         wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
-        wgpu::RenderPassEncoder pass = encoder.BeginRenderPass(&renderPass.renderPassInfo);
-        pass.End();
+        EncodeRenderTimestampWrites(encoder, timestampWrites, hasPipeline, hasFragmentStage);
+
+        // Begin another render pass if the timestampWritesOnAnotherPass is set.
+        if (!timestampWritesOnAnotherPass.empty()) {
+            EncodeRenderTimestampWrites(encoder, timestampWritesOnAnotherPass, hasPipeline,
+                                        hasFragmentStage);
+        }
 
         // Resolve queries one by one because the query set at the beginning of pass may be
         // different with the one at the end of pass.
@@ -628,22 +680,10 @@ class TimestampQueryTests : public QueryTests {
                                     destination, i * kMinDestinationOffset);
         }
 
-        // Begin another render pass if the timestampWritesOnAnotherPass is set.
-        if (!timestampWritesOnAnotherPass.empty()) {
-            utils::BasicRenderPass renderPass2 = utils::CreateBasicRenderPass(device, 1, 1);
-            renderPass2.renderPassInfo.timestampWriteCount = timestampWritesOnAnotherPass.size();
-            renderPass2.renderPassInfo.timestampWrites = timestampWritesOnAnotherPass.data();
-
-            wgpu::RenderPassEncoder pass2 = encoder.BeginRenderPass(&renderPass2.renderPassInfo);
-            pass2.End();
-
-            for (size_t i = 0; i < timestampWritesOnAnotherPass.size(); i++) {
-                // Resolve queries one by one because the query set at the beginning of pass may be
-                // different with the one at the end of pass.
-                encoder.ResolveQuerySet(timestampWritesOnAnotherPass[i].querySet,
-                                        timestampWritesOnAnotherPass[i].queryIndex, 1, destination,
-                                        (timestampWrites.size() + i) * kMinDestinationOffset);
-            }
+        for (size_t i = 0; i < timestampWritesOnAnotherPass.size(); i++) {
+            encoder.ResolveQuerySet(timestampWritesOnAnotherPass[i].querySet,
+                                    timestampWritesOnAnotherPass[i].queryIndex, 1, destination,
+                                    (timestampWrites.size() + i) * kMinDestinationOffset);
         }
 
         wgpu::CommandBuffer commands = encoder.Finish();
@@ -669,9 +709,6 @@ TEST_P(TimestampQueryTests, QuerySetCreation) {
 
 // Test calling timestamp query from command encoder
 TEST_P(TimestampQueryTests, TimestampOnCommandEncoder) {
-    // TODO (dawn:1250): Still not implemented on Metal backend.
-    DAWN_TEST_UNSUPPORTED_IF(IsMetal());
-
     constexpr uint32_t kQueryCount = 2;
 
     // Write timestamp with different query indexes
@@ -709,6 +746,10 @@ TEST_P(TimestampQueryTests, TimestampOnCommandEncoder) {
 
 // Test calling timestamp query from render pass encoder
 TEST_P(TimestampQueryTests, TimestampOnRenderPass) {
+    // TODO (dawn:1250): Split writeTimestamp() to another extension which is not supported on Apple
+    // devices
+    DAWN_TEST_UNSUPPORTED_IF(IsMacOS() && IsMetal() && IsApple());
+
     constexpr uint32_t kQueryCount = 2;
 
     // Write timestamp with different query indexes
@@ -754,6 +795,10 @@ TEST_P(TimestampQueryTests, TimestampOnRenderPass) {
 
 // Test calling timestamp query from compute pass encoder
 TEST_P(TimestampQueryTests, TimestampOnComputePass) {
+    // TODO (dawn:1250): Split writeTimestamp() to another extension which is not supported on Apple
+    // devices
+    DAWN_TEST_UNSUPPORTED_IF(IsMacOS() && IsMetal() && IsApple());
+
     constexpr uint32_t kQueryCount = 2;
 
     // Write timestamp with different query indexes
@@ -815,19 +860,34 @@ TEST_P(TimestampQueryTests, TimestampOnComputePass) {
     }
 }
 
-// Test timestampWrites setting in compute pass descriptor
-TEST_P(TimestampQueryTests, TimestampWritesOnComputePass) {
+// Test timestampWrites with query set in compute pass descriptor
+TEST_P(TimestampQueryTests, TimestampWritesQuerySetOnComputePass) {
     // TODO(dawn:1489): Fails on Intel Windows Vulkan due to a driver issue that
     // vkCmdFillBuffer and vkCmdCopyQueryPoolResults are not executed in order, skip it until
     // the issue is fixed.
     DAWN_SUPPRESS_TEST_IF(IsWindows() && IsVulkan() && IsIntel());
 
-    // TODO (dawn:1250): Still not implemented on Metal backend.
-    DAWN_TEST_UNSUPPORTED_IF(IsMetal());
+    // TODO (dawn:1473): Metal bug which fails to store GPU counters to different sample buffer.
+    DAWN_SUPPRESS_TEST_IF(IsMacOS() && IsMetal() && IsApple());
+
+    // Set timestampWrites with different query set on same compute pass
+    wgpu::QuerySet querySet0 = CreateQuerySetForTimestamp(1);
+    wgpu::QuerySet querySet1 = CreateQuerySetForTimestamp(1);
+
+    TestTimestampWritesOnComputePass({{querySet0, 0, wgpu::ComputePassTimestampLocation::Beginning},
+                                      {querySet1, 0, wgpu::ComputePassTimestampLocation::End}});
+}
+
+// Test timestampWrites with query index in compute pass descriptor
+TEST_P(TimestampQueryTests, TimestampWritesQueryIndexOnComputePass) {
+    // TODO(dawn:1489): Fails on Intel Windows Vulkan due to a driver issue that
+    // vkCmdFillBuffer and vkCmdCopyQueryPoolResults are not executed in order, skip it until
+    // the issue is fixed.
+    DAWN_SUPPRESS_TEST_IF(IsWindows() && IsVulkan() && IsIntel());
 
     constexpr uint32_t kQueryCount = 2;
 
-    // Set timestampWrites with different query indexes and locations on same compute pass
+    // Set timestampWrites with different query indexes on same compute pass
     {
         wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
 
@@ -836,16 +896,35 @@ TEST_P(TimestampQueryTests, TimestampWritesOnComputePass) {
              {querySet, 1, wgpu::ComputePassTimestampLocation::End}});
     }
 
-    // Set timestampWrites with different query set on same compute pass
+    // Set timestampWrites with same query index on same compute pass
     {
-        wgpu::QuerySet querySet0 = CreateQuerySetForTimestamp(1);
-        wgpu::QuerySet querySet1 = CreateQuerySetForTimestamp(1);
+        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
 
         TestTimestampWritesOnComputePass(
-            {{querySet0, 0, wgpu::ComputePassTimestampLocation::Beginning},
-             {querySet1, 0, wgpu::ComputePassTimestampLocation::End}});
+            {{querySet, 0, wgpu::ComputePassTimestampLocation::Beginning},
+             {querySet, 0, wgpu::ComputePassTimestampLocation::End}});
     }
 
+    // Set timestampWrites with same query indexes on different compute pass
+    {
+        wgpu::QuerySet querySet0 = CreateQuerySetForTimestamp(kQueryCount);
+        wgpu::QuerySet querySet1 = CreateQuerySetForTimestamp(kQueryCount);
+
+        TestTimestampWritesOnComputePass(
+            {{querySet0, 0, wgpu::ComputePassTimestampLocation::Beginning}},
+            {{querySet1, 0, wgpu::ComputePassTimestampLocation::End}});
+    }
+}
+
+// Test timestampWrites with timestamp location in compute pass descriptor
+TEST_P(TimestampQueryTests, TimestampWritesLocationOnComputePass) {
+    // TODO(dawn:1489): Fails on Intel Windows Vulkan due to a driver issue that
+    // vkCmdFillBuffer and vkCmdCopyQueryPoolResults are not executed in order, skip it until
+    // the issue is fixed.
+    DAWN_SUPPRESS_TEST_IF(IsWindows() && IsVulkan() && IsIntel());
+
+    constexpr uint32_t kQueryCount = 2;
+
     // Set timestampWrites with only one value of ComputePassTimestampLocation
     {
         wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
@@ -856,63 +935,78 @@ TEST_P(TimestampQueryTests, TimestampWritesOnComputePass) {
         TestTimestampWritesOnComputePass({{querySet, 1, wgpu::ComputePassTimestampLocation::End}});
     }
 
-    // Set timestampWrites with same query set and query index on same compute pass
+    // Set timestampWrites with same location on different compute pass
     {
-        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
+        wgpu::QuerySet querySet0 = CreateQuerySetForTimestamp(1);
+        wgpu::QuerySet querySet1 = CreateQuerySetForTimestamp(1);
 
         TestTimestampWritesOnComputePass(
-            {{querySet, 0, wgpu::ComputePassTimestampLocation::Beginning},
-             {querySet, 0, wgpu::ComputePassTimestampLocation::End}});
-    }
-
-    // Set timestampWrites with same query indexes and locations on different compute pass
-    {
-        wgpu::QuerySet querySet0 = CreateQuerySetForTimestamp(kQueryCount);
-        wgpu::QuerySet querySet1 = CreateQuerySetForTimestamp(kQueryCount);
-
-        TestTimestampWritesOnComputePass(
-            {{querySet0, 0, wgpu::ComputePassTimestampLocation::Beginning},
-             {querySet0, 1, wgpu::ComputePassTimestampLocation::End}},
-            {{querySet1, 0, wgpu::ComputePassTimestampLocation::Beginning},
-             {querySet1, 1, wgpu::ComputePassTimestampLocation::End}});
+            {{querySet0, 0, wgpu::ComputePassTimestampLocation::Beginning}},
+            {{querySet1, 0, wgpu::ComputePassTimestampLocation::Beginning}});
     }
 }
 
-// Test timestampWrites setting in render pass descriptor
-TEST_P(TimestampQueryTests, TimestampWritesOnRenderPass) {
+// Test timestampWrites on compute pass without pipeline
+TEST_P(TimestampQueryTests, TimestampWritesOnComputePassWithNoPipline) {
     // TODO(dawn:1489): Fails on Intel Windows Vulkan due to a driver issue that
     // vkCmdFillBuffer and vkCmdCopyQueryPoolResults are not executed in order, skip it until
     // the issue is fixed.
     DAWN_SUPPRESS_TEST_IF(IsWindows() && IsVulkan() && IsIntel());
 
-    // TODO (dawn:1250): Still not implemented on Metal backend.
-    DAWN_TEST_UNSUPPORTED_IF(IsMetal());
+    // TODO (dawn:1473): Metal fails to store GPU counters to sampleBufferAttachments on empty
+    // encoders.
+    DAWN_SUPPRESS_TEST_IF(IsMacOS() && IsMetal() && IsApple());
 
-    constexpr uint32_t kQueryCount = 2;
+    wgpu::QuerySet querySet = CreateQuerySetForTimestamp(2);
+
+    TestTimestampWritesOnComputePass({{querySet, 0, wgpu::ComputePassTimestampLocation::Beginning},
+                                      {querySet, 1, wgpu::ComputePassTimestampLocation::End}},
+                                     {}, false);
+}
+
+// Test timestampWrites with query set in render pass descriptor
+TEST_P(TimestampQueryTests, TimestampWritesQuerySetOnRenderPass) {
+    // TODO(dawn:1489): Fails on Intel Windows Vulkan due to a driver issue that
+    // vkCmdFillBuffer and vkCmdCopyQueryPoolResults are not executed in order, skip it until
+    // the issue is fixed.
+    DAWN_SUPPRESS_TEST_IF(IsWindows() && IsVulkan() && IsIntel());
+
+    // TODO (dawn:1473): Metal bug which fails to store GPU counters to different sample buffer.
+    DAWN_SUPPRESS_TEST_IF(IsMacOS() && IsMetal() && IsApple());
+
+    // Set timestampWrites with different query set on same render pass
+    wgpu::QuerySet querySet0 = CreateQuerySetForTimestamp(1);
+    wgpu::QuerySet querySet1 = CreateQuerySetForTimestamp(1);
+
+    TestTimestampWritesOnRenderPass({{querySet0, 0, wgpu::RenderPassTimestampLocation::Beginning},
+                                     {querySet1, 0, wgpu::RenderPassTimestampLocation::End}});
+}
+
+// Test timestampWrites with query index in compute pass descriptor
+TEST_P(TimestampQueryTests, TimestampWritesQueryIndexOnRenderPass) {
+    // TODO(dawn:1489): Fails on Intel Windows Vulkan due to a driver issue that
+    // vkCmdFillBuffer and vkCmdCopyQueryPoolResults are not executed in order, skip it until
+    // the issue is fixed.
+    DAWN_SUPPRESS_TEST_IF(IsWindows() && IsVulkan() && IsIntel());
 
     // Set timestampWrites with different query indexes and locations, not need test write same
     // query index due to it's not allowed on render pass.
-    {
-        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
+    wgpu::QuerySet querySet = CreateQuerySetForTimestamp(2);
 
-        TestTimestampWritesOnRenderPass(
-            {{querySet, 0, wgpu::RenderPassTimestampLocation::Beginning},
-             {querySet, 1, wgpu::RenderPassTimestampLocation::End}});
-    }
+    TestTimestampWritesOnRenderPass({{querySet, 0, wgpu::RenderPassTimestampLocation::Beginning},
+                                     {querySet, 1, wgpu::RenderPassTimestampLocation::End}});
+}
 
-    // Set timestampWrites with different query set on same render pass
-    {
-        wgpu::QuerySet querySet0 = CreateQuerySetForTimestamp(1);
-        wgpu::QuerySet querySet1 = CreateQuerySetForTimestamp(1);
-
-        TestTimestampWritesOnRenderPass(
-            {{querySet0, 0, wgpu::RenderPassTimestampLocation::Beginning},
-             {querySet1, 0, wgpu::RenderPassTimestampLocation::End}});
-    }
+// Test timestampWrites with timestamp location in render pass descriptor
+TEST_P(TimestampQueryTests, TimestampWritesLocationOnRenderPass) {
+    // TODO(dawn:1489): Fails on Intel Windows Vulkan due to a driver issue that
+    // vkCmdFillBuffer and vkCmdCopyQueryPoolResults are not executed in order, skip it until
+    // the issue is fixed.
+    DAWN_SUPPRESS_TEST_IF(IsWindows() && IsVulkan() && IsIntel());
 
     // Set timestampWrites with only one value of RenderPassTimestampLocation
     {
-        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
+        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(2);
 
         TestTimestampWritesOnRenderPass(
             {{querySet, 0, wgpu::RenderPassTimestampLocation::Beginning}});
@@ -920,19 +1014,45 @@ TEST_P(TimestampQueryTests, TimestampWritesOnRenderPass) {
         TestTimestampWritesOnRenderPass({{querySet, 1, wgpu::RenderPassTimestampLocation::End}});
     }
 
-    // Set timestampWrites with same query indexes and locations on different render pass
+    // Set timestampWrites with same location on different render pass
     {
-        wgpu::QuerySet querySet0 = CreateQuerySetForTimestamp(kQueryCount);
-        wgpu::QuerySet querySet1 = CreateQuerySetForTimestamp(kQueryCount);
+        wgpu::QuerySet querySet0 = CreateQuerySetForTimestamp(1);
+        wgpu::QuerySet querySet1 = CreateQuerySetForTimestamp(1);
 
         TestTimestampWritesOnRenderPass(
-            {{querySet0, 0, wgpu::RenderPassTimestampLocation::Beginning},
-             {querySet0, 1, wgpu::RenderPassTimestampLocation::End}},
-            {{querySet1, 0, wgpu::RenderPassTimestampLocation::Beginning},
-             {querySet1, 1, wgpu::RenderPassTimestampLocation::End}});
+            {{querySet0, 0, wgpu::RenderPassTimestampLocation::Beginning}},
+            {{querySet1, 0, wgpu::RenderPassTimestampLocation::Beginning}});
     }
 }
 
+// Test timestampWrites on render pass without pipeline
+TEST_P(TimestampQueryTests, TimestampWritesOnRenderPassWithNoPipline) {
+    // TODO(dawn:1489): Fails on Intel Windows Vulkan due to a driver issue that
+    // vkCmdFillBuffer and vkCmdCopyQueryPoolResults are not executed in order, skip it until
+    // the issue is fixed.
+    DAWN_SUPPRESS_TEST_IF(IsWindows() && IsVulkan() && IsIntel());
+
+    wgpu::QuerySet querySet = CreateQuerySetForTimestamp(2);
+    TestTimestampWritesOnRenderPass({{querySet, 0, wgpu::RenderPassTimestampLocation::Beginning},
+                                     {querySet, 1, wgpu::RenderPassTimestampLocation::End}},
+                                    {}, false);
+}
+
+// Test timestampWrites on render pass with pipeline but no fragment stage
+TEST_P(TimestampQueryTests, TimestampWritesOnRenderPassWithOnlyVertexStage) {
+    DAWN_TEST_UNSUPPORTED_IF(HasToggleEnabled("use_placeholder_fragment_in_vertex_only_pipeline"));
+
+    // TODO(dawn:1489): Fails on Intel Windows Vulkan due to a driver issue that
+    // vkCmdFillBuffer and vkCmdCopyQueryPoolResults are not executed in order, skip it until
+    // the issue is fixed.
+    DAWN_SUPPRESS_TEST_IF(IsWindows() && IsVulkan() && IsIntel());
+
+    wgpu::QuerySet querySet = CreateQuerySetForTimestamp(2);
+    TestTimestampWritesOnRenderPass({{querySet, 0, wgpu::RenderPassTimestampLocation::Beginning},
+                                     {querySet, 1, wgpu::RenderPassTimestampLocation::End}},
+                                    {}, true, false);
+}
+
 // Test resolving timestamp query from another different encoder
 TEST_P(TimestampQueryTests, ResolveFromAnotherEncoder) {
     constexpr uint32_t kQueryCount = 2;