Implement timestampWrites on Metal

Metal configures the query set and query index for the beginning and end of a pass in the pass descriptor when beginning a pass encoder, so we need to record all timestamp writes information in BeginXxxPassCmd. Platforms that support timestamp queries must support timestamp writes at either the command boundary or the stage boundary. If the stage boundary is supported, use the sampleBufferAttachments API for the Metal implementation; otherwise simulate timestamp writes using the sampleCountersInBuffer API after beginning a pass and before ending a pass. Bug: dawn:1250 Change-Id: I462cb05a0102521cd2df4db3ac6f71863419b933 Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/93940 Kokoro: Kokoro <noreply+kokoro@google.com> Reviewed-by: Corentin Wallez <cwallez@chromium.org> Commit-Queue: Hao Li <hao.x.li@intel.com>
2025-12-09 21:47:47 +00:00 · 2022-09-16 15:00:19 +00:00
parent ba68620f6f
commit 92182412b8
14 changed files with 658 additions and 183 deletions
--- a/src/dawn/native/Toggles.cpp
+++ b/src/dawn/native/Toggles.cpp
@@ -306,6 +306,12 @@ static constexpr ToggleEnumAndInfoList kToggleNameAndInfoList = {{
      "integer that is greater than 2^24 or smaller than -2^24). This toggle is also enabled on "
      "Intel GPUs on Metal backend due to a driver issue on Intel Metal driver.",
      "https://crbug.com/dawn/537"}},
+    {Toggle::MetalUseDummyBlitEncoderForWriteTimestamp,
+     {"metal_use_dummy_blit_encoder_for_write_timestamp",
+      "Add dummy blit command to blit encoder when encoding writeTimestamp as workaround on Metal. "
+      "This toggle is enabled by default on Metal backend where GPU counters cannot be stored to "
+      "sampleBufferAttachments on empty blit encoder.",
+      "https://crbug.com/dawn/1473"}},
    // Comment to separate the }} so it is clearer what to copy-paste to add a toggle.
 }};
 }  // anonymous namespace
--- a/src/dawn/native/Toggles.h
+++ b/src/dawn/native/Toggles.h
@@ -79,6 +79,7 @@ enum class Toggle {
    D3D12AllocateExtraMemoryFor2DArrayTexture,
    D3D12UseTempBufferInDepthStencilTextureAndBufferCopyWithNonZeroBufferOffset,
    ApplyClearBigIntegerColorValueWithDraw,
+    MetalUseDummyBlitEncoderForWriteTimestamp,

    EnumCount,
    InvalidEnum = EnumCount,
--- a/src/dawn/native/metal/BackendMTL.mm
+++ b/src/dawn/native/metal/BackendMTL.mm
@@ -24,6 +24,7 @@
 #include "dawn/native/MetalBackend.h"
 #include "dawn/native/metal/BufferMTL.h"
 #include "dawn/native/metal/DeviceMTL.h"
+#include "dawn/native/metal/UtilsMetal.h"

 #if DAWN_PLATFORM_IS(MACOS)
 #import <IOKit/IOKitLib.h>
@@ -170,18 +171,6 @@ MaybeError GetDevicePCIInfo(id<MTLDevice> device, PCIIDs* ids) {
 #error "Unsupported Apple platform."
 #endif

-DAWN_NOINLINE bool IsCounterSamplingBoundarySupport(id<MTLDevice> device)
-    API_AVAILABLE(macos(11.0), ios(14.0)) {
-    bool isBlitBoundarySupported =
-        [device supportsCounterSampling:MTLCounterSamplingPointAtBlitBoundary];
-    bool isDispatchBoundarySupported =
-        [device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary];
-    bool isDrawBoundarySupported =
-        [device supportsCounterSampling:MTLCounterSamplingPointAtDrawBoundary];
-
-    return isBlitBoundarySupported && isDispatchBoundarySupported && isDrawBoundarySupported;
-}
-
 // This method has seen hard-to-debug crashes. See crbug.com/dawn/1102.
 // For now, it is written defensively, with many potentially unnecessary guards until
 // we narrow down the cause of the problem.
@@ -246,11 +235,13 @@ DAWN_NOINLINE bool IsGPUCounterSupported(id<MTLDevice> device,
    }

    if (@available(macOS 11.0, iOS 14.0, *)) {
-        // Check whether it can read GPU counters at the specified command boundary. Apple
-        // family GPUs do not support sampling between different Metal commands, because
-        // they defer fragment processing until after the GPU processes all the primitives
-        // in the render pass.
-        if (!IsCounterSamplingBoundarySupport(device)) {
+        // Check whether it can read GPU counters at the specified command boundary or stage
+        // boundary. Apple family GPUs do not support sampling between different Metal commands,
+        // because they defer fragment processing until after the GPU processes all the primitives
+        // in the render pass. GPU counters are only available if sampling at least one of the
+        // command or stage boundaries is supported.
+        if (!SupportCounterSamplingAtCommandBoundary(device) &&
+            !SupportCounterSamplingAtStageBoundary(device)) {
            return false;
        }
    }
--- a/src/dawn/native/metal/CommandBufferMTL.h
+++ b/src/dawn/native/metal/CommandBufferMTL.h
@@ -16,12 +16,15 @@
 #define SRC_DAWN_NATIVE_METAL_COMMANDBUFFERMTL_H_

 #include "dawn/native/CommandBuffer.h"
+#include "dawn/native/Commands.h"
 #include "dawn/native/Error.h"

 #import <Metal/Metal.h>

 namespace dawn::native {
 class CommandEncoder;
+struct BeginComputePassCmd;
+struct BeginRenderPassCmd;
 }

 namespace dawn::native::metal {
@@ -55,8 +58,10 @@ class CommandBuffer final : public CommandBufferBase {
  private:
    using CommandBufferBase::CommandBufferBase;

-    MaybeError EncodeComputePass(CommandRecordingContext* commandContext);
-    MaybeError EncodeRenderPass(id<MTLRenderCommandEncoder> encoder);
+    MaybeError EncodeComputePass(CommandRecordingContext* commandContext,
+                                 BeginComputePassCmd* computePassCmd);
+    MaybeError EncodeRenderPass(id<MTLRenderCommandEncoder> encoder,
+                                BeginRenderPassCmd* renderPassCmd);
 };

 }  // namespace dawn::native::metal
--- a/src/dawn/native/metal/CommandBufferMTL.mm
+++ b/src/dawn/native/metal/CommandBufferMTL.mm
@@ -49,7 +49,129 @@ MTLIndexType MTLIndexFormat(wgpu::IndexFormat format) {
    }
 }

-NSRef<MTLRenderPassDescriptor> CreateMTLRenderPassDescriptor(BeginRenderPassCmd* renderPass) {
+template <typename PassDescriptor>
+class SampleBufferAttachment {
+  public:
+    void SetSampleBuffer(PassDescriptor* descriptor, id<MTLCounterSampleBuffer> sampleBuffer)
+        API_AVAILABLE(macos(11.0), ios(14.0));
+    void SetStartSampleIndex(PassDescriptor* descriptor, NSUInteger sampleIndex)
+        API_AVAILABLE(macos(11.0), ios(14.0));
+    void SetEndSampleIndex(PassDescriptor* descriptor, NSUInteger sampleIndex)
+        API_AVAILABLE(macos(11.0), ios(14.0));
+
+  private:
+    // Initialized to the maximum value, in order to start from 0 after the first increment.
+    NSUInteger attachmentIndex = NSUIntegerMax;
+    // TODO(dawn:1473): The maximum of sampleBufferAttachments depends on the length of MTLDevice's
+    // counterSets, but Metal does not match the allowed maximum of sampleBufferAttachments with the
+    // length of counterSets on AGX family. Hardcode as a constant and check this whenever Metal
+    // could get the matched value.
+    static constexpr NSUInteger kMaxSampleBufferAttachments = 4;
+};
+
+template <typename PassDescriptor>
+void SampleBufferAttachment<PassDescriptor>::SetSampleBuffer(
+    PassDescriptor* descriptor,
+    id<MTLCounterSampleBuffer> sampleBuffer) API_AVAILABLE(macos(11.0), ios(14.0)) {
+    attachmentIndex++;
+    ASSERT(attachmentIndex < kMaxSampleBufferAttachments);
+    descriptor.sampleBufferAttachments[attachmentIndex].sampleBuffer = sampleBuffer;
+}
+
+// Must be called after SetSampleBuffer
+template <>
+void SampleBufferAttachment<MTLRenderPassDescriptor>::SetStartSampleIndex(
+    MTLRenderPassDescriptor* descriptor,
+    NSUInteger sampleIndex) API_AVAILABLE(macos(11.0), ios(14.0)) {
+    ASSERT(attachmentIndex < kMaxSampleBufferAttachments);
+    descriptor.sampleBufferAttachments[attachmentIndex].startOfVertexSampleIndex = sampleIndex;
+}
+
+// Must be called after SetSampleBuffer
+template <>
+void SampleBufferAttachment<MTLRenderPassDescriptor>::SetEndSampleIndex(
+    MTLRenderPassDescriptor* descriptor,
+    NSUInteger sampleIndex) API_AVAILABLE(macos(11.0), ios(14.0)) {
+    ASSERT(attachmentIndex < kMaxSampleBufferAttachments);
+    descriptor.sampleBufferAttachments[attachmentIndex].endOfFragmentSampleIndex = sampleIndex;
+}
+
+// Must be called after SetSampleBuffer
+template <>
+void SampleBufferAttachment<MTLComputePassDescriptor>::SetStartSampleIndex(
+    MTLComputePassDescriptor* descriptor,
+    NSUInteger sampleIndex) API_AVAILABLE(macos(11.0), ios(14.0)) {
+    ASSERT(attachmentIndex < kMaxSampleBufferAttachments);
+    descriptor.sampleBufferAttachments[attachmentIndex].startOfEncoderSampleIndex = sampleIndex;
+}
+
+// Must be called after SetSampleBuffer
+template <>
+void SampleBufferAttachment<MTLComputePassDescriptor>::SetEndSampleIndex(
+    MTLComputePassDescriptor* descriptor,
+    NSUInteger sampleIndex) API_AVAILABLE(macos(11.0), ios(14.0)) {
+    // TODO(dawn:1473): Use MTLComputePassSampleBuffers or query method instead of the magic number
+    // 4 when Metal could get the maximum of sampleBufferAttachments on compute pass
+    ASSERT(attachmentIndex < kMaxSampleBufferAttachments);
+    descriptor.sampleBufferAttachments[attachmentIndex].endOfEncoderSampleIndex = sampleIndex;
+}
+
+template <typename PassDescriptor, typename BeginPass>
+void SetSampleBufferAttachments(PassDescriptor* descriptor, BeginPass* cmd) {
+    // Use @available instead of API_AVAILABLE because GetCounterSampleBuffer() also needs checking
+    // API availability.
+    if (@available(macOS 11.0, iOS 14.0, *)) {
+        QuerySetBase* beginQuerySet = cmd->beginTimestamp.querySet.Get();
+        QuerySetBase* endQuerySet = cmd->endTimestamp.querySet.Get();
+
+        SampleBufferAttachment<PassDescriptor> sampleBufferAttachment;
+
+        if (beginQuerySet != nullptr) {
+            sampleBufferAttachment.SetSampleBuffer(
+                descriptor, ToBackend(beginQuerySet)->GetCounterSampleBuffer());
+            sampleBufferAttachment.SetStartSampleIndex(descriptor,
+                                                       NSUInteger(cmd->beginTimestamp.queryIndex));
+
+            if (beginQuerySet == endQuerySet) {
+                sampleBufferAttachment.SetEndSampleIndex(descriptor,
+                                                         NSUInteger(cmd->endTimestamp.queryIndex));
+            } else {
+                sampleBufferAttachment.SetEndSampleIndex(descriptor, MTLCounterDontSample);
+            }
+        }
+
+        // Set to other sampleBufferAttachment if the endQuerySet is different with beginQuerySet.
+        if (endQuerySet != nullptr && beginQuerySet != endQuerySet) {
+            sampleBufferAttachment.SetSampleBuffer(
+                descriptor, ToBackend(endQuerySet)->GetCounterSampleBuffer());
+            sampleBufferAttachment.SetStartSampleIndex(descriptor, MTLCounterDontSample);
+            sampleBufferAttachment.SetEndSampleIndex(descriptor,
+                                                     NSUInteger(cmd->endTimestamp.queryIndex));
+        }
+    } else {
+        UNREACHABLE();
+    }
+}
+
+NSRef<MTLComputePassDescriptor> CreateMTLComputePassDescriptor(BeginComputePassCmd* computePass)
+    API_AVAILABLE(macos(11.0), ios(14.0)) {
+    // Note that this creates a descriptor that's autoreleased so we don't use AcquireNSRef
+    NSRef<MTLComputePassDescriptor> descriptorRef =
+        [MTLComputePassDescriptor computePassDescriptor];
+    MTLComputePassDescriptor* descriptor = descriptorRef.Get();
+    // MTLDispatchTypeSerial is the same dispatch type as the default MTLComputeCommandEncoder.
+    // MTLDispatchTypeConcurrent requires memory barriers to ensure multiple commands synchronize
+    // access to the same resources, which we may support later.
+    descriptor.dispatchType = MTLDispatchTypeSerial;
+
+    SetSampleBufferAttachments(descriptor, computePass);
+
+    return descriptorRef;
+}
+
+NSRef<MTLRenderPassDescriptor> CreateMTLRenderPassDescriptor(
+    BeginRenderPassCmd* renderPass,
+    bool useCounterSamplingAtStageBoundary) {
    // Note that this creates a descriptor that's autoreleased so we don't use AcquireNSRef
    NSRef<MTLRenderPassDescriptor> descriptorRef = [MTLRenderPassDescriptor renderPassDescriptor];
    MTLRenderPassDescriptor* descriptor = descriptorRef.Get();
@@ -197,9 +319,36 @@ NSRef<MTLRenderPassDescriptor> CreateMTLRenderPassDescriptor(BeginRenderPassCmd*
            ToBackend(renderPass->occlusionQuerySet.Get())->GetVisibilityBuffer();
    }

+    if (@available(macOS 11.0, iOS 14.0, *)) {
+        if (useCounterSamplingAtStageBoundary) {
+            SetSampleBufferAttachments(descriptor, renderPass);
+        }
+    }
+
    return descriptorRef;
 }

+void EncodeEmptyBlitEncoderForWriteTimestamp(Device* device,
+                                             CommandRecordingContext* commandContext,
+                                             WriteTimestampCmd* cmd)
+    API_AVAILABLE(macos(11.0), ios(14.0)) {
+    commandContext->EndBlit();
+    // Use the autoreleased factory; the previous alloc/init result was never released (leak).
+    MTLBlitPassDescriptor* descriptor = [MTLBlitPassDescriptor blitPassDescriptor];
+    if (cmd->querySet.Get() != nullptr) {
+        descriptor.sampleBufferAttachments[0].sampleBuffer =
+            ToBackend(cmd->querySet.Get())->GetCounterSampleBuffer();
+        descriptor.sampleBufferAttachments[0].startOfEncoderSampleIndex = MTLCounterDontSample;
+        descriptor.sampleBufferAttachments[0].endOfEncoderSampleIndex = NSUInteger(cmd->queryIndex);
+        // The timestamp is sampled at the end of a dedicated blit encoder opened just for it.
+        id<MTLBlitCommandEncoder> blit = commandContext->BeginBlit(descriptor);
+        if (device->IsToggleEnabled(Toggle::MetalUseDummyBlitEncoderForWriteTimestamp)) {
+            [blit fillBuffer:device->GetDummyBlitMtlBuffer() range:NSMakeRange(0, 1) value:0];
+        }
+        commandContext->EndBlit();
+    }
+}
+
 // Metal uses a physical addressing mode which means buffers in the shading language are
 // just pointers to the virtual address of their start. This means there is no way to know
 // the length of a buffer to compute the length() of unsized arrays at the end of storage
@@ -612,18 +761,13 @@ MaybeError CommandBuffer::FillCommands(CommandRecordingContext* commandContext)
            case Command::BeginComputePass: {
                BeginComputePassCmd* cmd = mCommands.NextCommand<BeginComputePassCmd>();

-                if (cmd->beginTimestamp.querySet.Get() != nullptr ||
-                    cmd->endTimestamp.querySet.Get() != nullptr) {
-                    return DAWN_UNIMPLEMENTED_ERROR("timestampWrites unimplemented.");
-                }
-
                for (const SyncScopeResourceUsage& scope :
                     GetResourceUsages().computePasses[nextComputePassNumber].dispatchUsages) {
                    LazyClearSyncScope(scope, commandContext);
                }
                commandContext->EndBlit();

-                DAWN_TRY(EncodeComputePass(commandContext));
+                DAWN_TRY(EncodeComputePass(commandContext, cmd));

                nextComputePassNumber++;
                break;
@@ -632,22 +776,19 @@ MaybeError CommandBuffer::FillCommands(CommandRecordingContext* commandContext)
            case Command::BeginRenderPass: {
                BeginRenderPassCmd* cmd = mCommands.NextCommand<BeginRenderPassCmd>();

-                if (cmd->beginTimestamp.querySet.Get() != nullptr ||
-                    cmd->endTimestamp.querySet.Get() != nullptr) {
-                    return DAWN_UNIMPLEMENTED_ERROR("timestampWrites unimplemented.");
-                }
-
                LazyClearSyncScope(GetResourceUsages().renderPasses[nextRenderPassNumber],
                                   commandContext);
                commandContext->EndBlit();

                LazyClearRenderPassAttachments(cmd);
-                NSRef<MTLRenderPassDescriptor> descriptor = CreateMTLRenderPassDescriptor(cmd);
+                NSRef<MTLRenderPassDescriptor> descriptor = CreateMTLRenderPassDescriptor(
+                    cmd, ToBackend(GetDevice())->UseCounterSamplingAtStageBoundary());
                DAWN_TRY(EncodeMetalRenderPass(
                    ToBackend(GetDevice()), commandContext, descriptor.Get(), cmd->width,
-                    cmd->height, [this](id<MTLRenderCommandEncoder> encoder) -> MaybeError {
-                        return this->EncodeRenderPass(encoder);
-                    }));
+                    cmd->height,
+                    [this](id<MTLRenderCommandEncoder> encoder, BeginRenderPassCmd* cmd)
+                        -> MaybeError { return this->EncodeRenderPass(encoder, cmd); },
+                    cmd));

                nextRenderPassNumber++;
                break;
@@ -905,16 +1046,29 @@ MaybeError CommandBuffer::FillCommands(CommandRecordingContext* commandContext)

            case Command::WriteTimestamp: {
                WriteTimestampCmd* cmd = mCommands.NextCommand<WriteTimestampCmd>();
-                QuerySet* querySet = ToBackend(cmd->querySet.Get());

+                if (ToBackend(GetDevice())->UseCounterSamplingAtStageBoundary()) {
+                    if (@available(macos 11.0, iOS 14.0, *)) {
+                        // Simulate writeTimestamp cmd between blit commands on the devices which
+                        // supports counter sampling at stage boundary.
+                        EncodeEmptyBlitEncoderForWriteTimestamp(ToBackend(GetDevice()),
+                                                                commandContext, cmd);
+                    } else {
+                        UNREACHABLE();
+                    }
+                } else {
                    if (@available(macos 10.15, iOS 14.0, *)) {
+                        ASSERT(ToBackend(GetDevice())->UseCounterSamplingAtCommandBoundary());
                        [commandContext->EnsureBlit()
-                        sampleCountersInBuffer:querySet->GetCounterSampleBuffer()
+                            sampleCountersInBuffer:ToBackend(cmd->querySet.Get())
+                                                       ->GetCounterSampleBuffer()
                                     atSampleIndex:NSUInteger(cmd->queryIndex)
                                       withBarrier:YES];
                    } else {
                        UNREACHABLE();
                    }
+                }
+
                break;
            }

@@ -985,18 +1139,64 @@ MaybeError CommandBuffer::FillCommands(CommandRecordingContext* commandContext)
    return {};
 }

-MaybeError CommandBuffer::EncodeComputePass(CommandRecordingContext* commandContext) {
+MaybeError CommandBuffer::EncodeComputePass(CommandRecordingContext* commandContext,
+                                            BeginComputePassCmd* computePassCmd) {
    ComputePipeline* lastPipeline = nullptr;
    StorageBufferLengthTracker storageBufferLengths = {};
    BindGroupTracker bindGroups(&storageBufferLengths);

-    id<MTLComputeCommandEncoder> encoder = commandContext->BeginCompute();
+    id<MTLComputeCommandEncoder> encoder;
+    // When counter sampling is supported at stage boundary, begin a configurable compute pass
+    // encoder which is supported since macOS 11.0+ and iOS 14.0+ and set timestamp writes to
+    // compute pass descriptor, otherwise begin a default compute pass encoder, and simulate
+    // timestamp writes using sampleCountersInBuffer API at the beginning and end of compute pass.
+    if (ToBackend(GetDevice())->UseCounterSamplingAtStageBoundary()) {
+        if (@available(macOS 11.0, iOS 14.0, *)) {
+            NSRef<MTLComputePassDescriptor> descriptor =
+                CreateMTLComputePassDescriptor(computePassCmd);
+            encoder = commandContext->BeginCompute(descriptor.Get());
+        } else {
+            UNREACHABLE();
+        }
+    } else {
+        encoder = commandContext->BeginCompute();
+
+        if (@available(macos 10.15, iOS 14.0, *)) {
+            if (computePassCmd->beginTimestamp.querySet.Get() != nullptr) {
+                ASSERT(ToBackend(GetDevice())->UseCounterSamplingAtCommandBoundary());
+
+                [encoder
+                    sampleCountersInBuffer:ToBackend(computePassCmd->beginTimestamp.querySet.Get())
+                                               ->GetCounterSampleBuffer()
+                             atSampleIndex:NSUInteger(computePassCmd->beginTimestamp.queryIndex)
+                               withBarrier:YES];
+            }
+        }
+    }

    Command type;
    while (mCommands.NextCommandId(&type)) {
        switch (type) {
            case Command::EndComputePass: {
                mCommands.NextCommand<EndComputePassCmd>();
+
+                if (@available(macos 10.15, iOS 14.0, *)) {
+                    // Simulate timestamp write at the end of compute pass if it does not support
+                    // counter sampling at stage boundary.
+                    if (ToBackend(GetDevice())->UseCounterSamplingAtCommandBoundary() &&
+                        computePassCmd->endTimestamp.querySet.Get() != nullptr) {
+                        ASSERT(!ToBackend(GetDevice())->UseCounterSamplingAtStageBoundary());
+
+                        [encoder
+                            sampleCountersInBuffer:ToBackend(
+                                                       computePassCmd->endTimestamp.querySet.Get())
+                                                       ->GetCounterSampleBuffer()
+                                     atSampleIndex:NSUInteger(
+                                                       computePassCmd->endTimestamp.queryIndex)
+                                       withBarrier:YES];
+                    }
+                }
+
                commandContext->EndCompute();
                return {};
            }
@@ -1104,7 +1304,8 @@ MaybeError CommandBuffer::EncodeComputePass(CommandRecordingContext* commandCont
    UNREACHABLE();
 }

-MaybeError CommandBuffer::EncodeRenderPass(id<MTLRenderCommandEncoder> encoder) {
+MaybeError CommandBuffer::EncodeRenderPass(id<MTLRenderCommandEncoder> encoder,
+                                           BeginRenderPassCmd* renderPassCmd) {
    bool enableVertexPulling = GetDevice()->IsToggleEnabled(Toggle::MetalEnableVertexPulling);
    RenderPipeline* lastPipeline = nullptr;
    id<MTLBuffer> indexBuffer = nullptr;
@@ -1116,6 +1317,20 @@ MaybeError CommandBuffer::EncodeRenderPass(id<MTLRenderCommandEncoder> encoder)
    VertexBufferTracker vertexBuffers(&storageBufferLengths);
    BindGroupTracker bindGroups(&storageBufferLengths);

+    if (@available(macos 10.15, iOS 14.0, *)) {
+        // Simulate timestamp write at the beginning of render pass by
+        // sampleCountersInBuffer if it does not support counter sampling at stage boundary.
+        if (ToBackend(GetDevice())->UseCounterSamplingAtCommandBoundary() &&
+            renderPassCmd->beginTimestamp.querySet.Get() != nullptr) {
+            ASSERT(!ToBackend(GetDevice())->UseCounterSamplingAtStageBoundary());
+
+            [encoder sampleCountersInBuffer:ToBackend(renderPassCmd->beginTimestamp.querySet.Get())
+                                                ->GetCounterSampleBuffer()
+                              atSampleIndex:NSUInteger(renderPassCmd->beginTimestamp.queryIndex)
+                                withBarrier:YES];
+        }
+    }
+
    auto EncodeRenderBundleCommand = [&](CommandIterator* iter, Command type) {
        switch (type) {
            case Command::Draw: {
@@ -1304,6 +1519,24 @@ MaybeError CommandBuffer::EncodeRenderPass(id<MTLRenderCommandEncoder> encoder)
        switch (type) {
            case Command::EndRenderPass: {
                mCommands.NextCommand<EndRenderPassCmd>();
+
+                if (@available(macos 10.15, iOS 14.0, *)) {
+                    // Simulate timestamp write at the end of render pass if it does not support
+                    // counter sampling at stage boundary.
+                    if (ToBackend(GetDevice())->UseCounterSamplingAtCommandBoundary() &&
+                        renderPassCmd->endTimestamp.querySet.Get() != nullptr) {
+                        ASSERT(!ToBackend(GetDevice())->UseCounterSamplingAtStageBoundary());
+
+                        [encoder
+                            sampleCountersInBuffer:ToBackend(
+                                                       renderPassCmd->endTimestamp.querySet.Get())
+                                                       ->GetCounterSampleBuffer()
+                                     atSampleIndex:NSUInteger(
+                                                       renderPassCmd->endTimestamp.queryIndex)
+                                       withBarrier:YES];
+                    }
+                }
+
                return {};
            }

--- a/src/dawn/native/metal/CommandRecordingContext.h
+++ b/src/dawn/native/metal/CommandRecordingContext.h
@@ -36,10 +36,18 @@ class CommandRecordingContext : NonMovable {
    MaybeError PrepareNextCommandBuffer(id<MTLCommandQueue> queue);
    NSPRef<id<MTLCommandBuffer>> AcquireCommands();

+    // Create blit pass encoder from blit pass descriptor
+    id<MTLBlitCommandEncoder> BeginBlit(MTLBlitPassDescriptor* descriptor)
+        API_AVAILABLE(macos(11.0), ios(14.0));
    id<MTLBlitCommandEncoder> EnsureBlit();
    void EndBlit();

+    // Create a sequential compute pass by default.
    id<MTLComputeCommandEncoder> BeginCompute();
+    // Create configurable compute pass from a descriptor with serial dispatch type which commands
+    // are executed sequentially.
+    id<MTLComputeCommandEncoder> BeginCompute(MTLComputePassDescriptor* descriptor)
+        API_AVAILABLE(macos(11.0), ios(14.0));
    void EndCompute();

    id<MTLRenderCommandEncoder> BeginRender(MTLRenderPassDescriptor* descriptor);
--- a/src/dawn/native/metal/CommandRecordingContext.mm
+++ b/src/dawn/native/metal/CommandRecordingContext.mm
@@ -62,6 +62,20 @@ NSPRef<id<MTLCommandBuffer>> CommandRecordingContext::AcquireCommands() {
    return std::move(mCommands);
 }

+id<MTLBlitCommandEncoder> CommandRecordingContext::BeginBlit(MTLBlitPassDescriptor* descriptor)
+    API_AVAILABLE(macos(11.0), ios(14.0)) {
+    ASSERT(descriptor);
+    ASSERT(mCommands != nullptr);
+    ASSERT(mBlit == nullptr);
+    ASSERT(!mInEncoder);
+
+    mInEncoder = true;
+    // The encoder is created autoreleased. Retain it to avoid the autoreleasepool from
+    // draining from under us.
+    mBlit.Acquire([[*mCommands blitCommandEncoderWithDescriptor:descriptor] retain]);
+    return mBlit.Get();
+}
+
 id<MTLBlitCommandEncoder> CommandRecordingContext::EnsureBlit() {
    ASSERT(mCommands != nullptr);

@@ -98,6 +112,20 @@ id<MTLComputeCommandEncoder> CommandRecordingContext::BeginCompute() {
    return mCompute.Get();
 }

+id<MTLComputeCommandEncoder> CommandRecordingContext::BeginCompute(
+    MTLComputePassDescriptor* descriptor) API_AVAILABLE(macos(11.0), ios(14.0)) {
+    ASSERT(descriptor);
+    ASSERT(mCommands != nullptr);
+    ASSERT(mCompute == nullptr);
+    ASSERT(!mInEncoder);
+
+    mInEncoder = true;
+    // The encoder is created autoreleased. Retain it to avoid the autoreleasepool from
+    // draining from under us.
+    mCompute.Acquire([[*mCommands computeCommandEncoderWithDescriptor:descriptor] retain]);
+    return mCompute.Get();
+}
+
 void CommandRecordingContext::EndCompute() {
    ASSERT(mCommands != nullptr);
    ASSERT(mCompute != nullptr);
--- a/src/dawn/native/metal/DeviceMTL.h
+++ b/src/dawn/native/metal/DeviceMTL.h
@@ -72,6 +72,13 @@ class Device final : public DeviceBase {

    float GetTimestampPeriodInNS() const override;

+    bool UseCounterSamplingAtCommandBoundary() const;
+    bool UseCounterSamplingAtStageBoundary() const;
+
+    // Get a MTLBuffer that can be used as a dummy in a no-op blit encoder based on filling this
+    // single-byte buffer
+    id<MTLBuffer> GetDummyBlitMtlBuffer();
+
  private:
    Device(AdapterBase* adapter,
           NSPRef<id<MTLDevice>> mtlDevice,
@@ -144,6 +151,13 @@ class Device final : public DeviceBase {
    MTLTimestamp mGpuTimestamp API_AVAILABLE(macos(10.15), ios(14.0)) = 0;
    // The parameters for kalman filter
    std::unique_ptr<KalmanInfo> mKalmanInfo;
+
+    // Support counter sampling between blit commands, dispatches and draw calls
+    bool mCounterSamplingAtCommandBoundary;
+    // Support counter sampling at the beginning and end of blit pass, compute pass and render
+    // pass's vertex/fragment stage
+    bool mCounterSamplingAtStageBoundary;
+    NSPRef<id<MTLBuffer>> mDummyBlitMtlBuffer;
 };

 }  // namespace dawn::native::metal
--- a/src/dawn/native/metal/DeviceMTL.mm
+++ b/src/dawn/native/metal/DeviceMTL.mm
@@ -121,7 +121,18 @@ Device::Device(AdapterBase* adapter,
               const TripleStateTogglesSet& userProvidedToggles)
    : DeviceBase(adapter, descriptor, userProvidedToggles),
      mMtlDevice(std::move(mtlDevice)),
-      mCompletedSerial(0) {}
+      mCompletedSerial(0) {
+    // On macOS < 11.0, we can only check whether counter sampling is supported, and counters
+    // can only be sampled at command boundaries using the sampleCountersInBuffer API if it is
+    // supported.
+    if (@available(macOS 11.0, iOS 14.0, *)) {
+        mCounterSamplingAtCommandBoundary = SupportCounterSamplingAtCommandBoundary(GetMTLDevice());
+        mCounterSamplingAtStageBoundary = SupportCounterSamplingAtStageBoundary(GetMTLDevice());
+    } else {
+        mCounterSamplingAtCommandBoundary = true;
+        mCounterSamplingAtStageBoundary = false;
+    }
+}

 Device::~Device() {
    Destroy();
@@ -234,6 +245,14 @@ void Device::InitTogglesFromDriver() {
    if (gpu_info::IsIntel(vendorId)) {
        SetToggle(Toggle::ApplyClearBigIntegerColorValueWithDraw, true);
    }
+
+    // TODO(dawn:1473): Metal fails to store GPU counters to sampleBufferAttachments on empty
+    // encoders on macOS 11.0+, we need to add dummy blit command to blit encoder when encoding
+    // writeTimestamp as workaround by enabling the toggle
+    // "metal_use_dummy_blit_encoder_for_write_timestamp".
+    if (@available(macos 11.0, iOS 14.0, *)) {
+        SetToggle(Toggle::MetalUseDummyBlitEncoderForWriteTimestamp, true);
+    }
 }

 ResultOrError<Ref<BindGroupBase>> Device::CreateBindGroupImpl(
@@ -498,6 +517,7 @@ void Device::DestroyImpl() {

    mCommandQueue = nullptr;
    mMtlDevice = nullptr;
+    mDummyBlitMtlBuffer = nullptr;
 }

 uint32_t Device::GetOptimalBytesPerRowAlignment() const {
@@ -512,4 +532,21 @@ float Device::GetTimestampPeriodInNS() const {
    return mTimestampPeriod;
 }

+bool Device::UseCounterSamplingAtCommandBoundary() const {
+    return mCounterSamplingAtCommandBoundary;
+}
+
+bool Device::UseCounterSamplingAtStageBoundary() const {
+    return mCounterSamplingAtStageBoundary;
+}
+
+id<MTLBuffer> Device::GetDummyBlitMtlBuffer() {
+    if (mDummyBlitMtlBuffer == nullptr) {
+        mDummyBlitMtlBuffer.Acquire(
+            [GetMTLDevice() newBufferWithLength:1 options:MTLResourceStorageModePrivate]);
+    }
+
+    return mDummyBlitMtlBuffer.Get();
+}
+
 }  // namespace dawn::native::metal
--- a/src/dawn/native/metal/UtilsMetal.h
+++ b/src/dawn/native/metal/UtilsMetal.h
@@ -23,6 +23,7 @@
 #import <Metal/Metal.h>

 namespace dawn::native {
+struct BeginRenderPassCmd;
 struct ProgrammableStage;
 struct EntryPointMetadata;
 enum class SingleShaderStage;
@@ -81,19 +82,26 @@ constexpr MTLStoreAction kMTLStoreActionStoreAndMultisampleResolve =
 // happen at the render pass start and end. Because workarounds wrap the encoding of the render
 // pass, the encoding must be entirely done by the `encodeInside` callback.
 // At the end of this function, `commandContext` will have no encoder open.
-using EncodeInsideRenderPass = std::function<MaybeError(id<MTLRenderCommandEncoder>)>;
+using EncodeInsideRenderPass =
+    std::function<MaybeError(id<MTLRenderCommandEncoder>, BeginRenderPassCmd* renderPassCmd)>;
 MaybeError EncodeMetalRenderPass(Device* device,
                                 CommandRecordingContext* commandContext,
                                 MTLRenderPassDescriptor* mtlRenderPass,
                                 uint32_t width,
                                 uint32_t height,
-                                 EncodeInsideRenderPass encodeInside);
+                                 EncodeInsideRenderPass encodeInside,
+                                 BeginRenderPassCmd* renderPassCmd = nullptr);

 MaybeError EncodeEmptyMetalRenderPass(Device* device,
                                      CommandRecordingContext* commandContext,
                                      MTLRenderPassDescriptor* mtlRenderPass,
                                      Extent3D size);

+bool SupportCounterSamplingAtCommandBoundary(id<MTLDevice> device)
+    API_AVAILABLE(macos(11.0), ios(14.0));
+bool SupportCounterSamplingAtStageBoundary(id<MTLDevice> device)
+    API_AVAILABLE(macos(11.0), ios(14.0));
+
 }  // namespace dawn::native::metal

 #endif  // SRC_DAWN_NATIVE_METAL_UTILSMETAL_H_
--- a/src/dawn/native/metal/UtilsMetal.mm
+++ b/src/dawn/native/metal/UtilsMetal.mm
@@ -328,7 +328,8 @@ MaybeError EncodeMetalRenderPass(Device* device,
                                 MTLRenderPassDescriptor* mtlRenderPass,
                                 uint32_t width,
                                 uint32_t height,
-                                 EncodeInsideRenderPass encodeInside) {
+                                 EncodeInsideRenderPass encodeInside,
+                                 BeginRenderPassCmd* renderPassCmd) {
    // This function handles multiple workarounds. Because some cases requires multiple
    // workarounds to happen at the same time, it handles workarounds one by one and calls
    // itself recursively to handle the next workaround if needed.
@@ -359,7 +360,7 @@ MaybeError EncodeMetalRenderPass(Device* device,
        // resolve back to the true resolve targets.
        if (workaroundUsed) {
            DAWN_TRY(EncodeMetalRenderPass(device, commandContext, mtlRenderPass, width, height,
-                                           std::move(encodeInside)));
+                                           std::move(encodeInside), renderPassCmd));

            for (uint32_t i = 0; i < kMaxColorAttachments; ++i) {
                if (trueResolveAttachments[i].texture == nullptr) {
@@ -403,7 +404,7 @@ MaybeError EncodeMetalRenderPass(Device* device,

        if (workaroundUsed) {
            DAWN_TRY(EncodeMetalRenderPass(device, commandContext, mtlRenderPass, width, height,
-                                           std::move(encodeInside)));
+                                           std::move(encodeInside), renderPassCmd));

            for (uint32_t i = 0; i < kMaxColorAttachments; ++i) {
                if (originalAttachments[i].texture == nullptr) {
@@ -439,7 +440,7 @@ MaybeError EncodeMetalRenderPass(Device* device,
        // If we found a store + MSAA resolve we need to resolve in a different render pass.
        if (hasStoreAndMSAAResolve) {
            DAWN_TRY(EncodeMetalRenderPass(device, commandContext, mtlRenderPass, width, height,
-                                           std::move(encodeInside)));
+                                           std::move(encodeInside), renderPassCmd));

            ResolveInAnotherRenderPass(commandContext, mtlRenderPass, resolveTextures);
            return {};
@@ -448,7 +449,7 @@ MaybeError EncodeMetalRenderPass(Device* device,

    // No (more) workarounds needed! We can finally encode the actual render pass.
    commandContext->EndBlit();
-    DAWN_TRY(encodeInside(commandContext->BeginRender(mtlRenderPass)));
+    DAWN_TRY(encodeInside(commandContext->BeginRender(mtlRenderPass), renderPassCmd));
    commandContext->EndRender();
    return {};
 }
@@ -457,8 +458,26 @@ MaybeError EncodeEmptyMetalRenderPass(Device* device,
                                      CommandRecordingContext* commandContext,
                                      MTLRenderPassDescriptor* mtlRenderPass,
                                      Extent3D size) {
-    return EncodeMetalRenderPass(device, commandContext, mtlRenderPass, size.width, size.height,
-                                 [&](id<MTLRenderCommandEncoder>) -> MaybeError { return {}; });
+    return EncodeMetalRenderPass(
+        device, commandContext, mtlRenderPass, size.width, size.height,
+        [&](id<MTLRenderCommandEncoder>, BeginRenderPassCmd*) -> MaybeError { return {}; });
+}
+
+DAWN_NOINLINE bool SupportCounterSamplingAtCommandBoundary(id<MTLDevice> device)
+    API_AVAILABLE(macos(11.0), ios(14.0)) {
+    bool isBlitBoundarySupported =
+        [device supportsCounterSampling:MTLCounterSamplingPointAtBlitBoundary];
+    bool isDispatchBoundarySupported =
+        [device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary];
+    bool isDrawBoundarySupported =
+        [device supportsCounterSampling:MTLCounterSamplingPointAtDrawBoundary];
+
+    return isBlitBoundarySupported && isDispatchBoundarySupported && isDrawBoundarySupported;
+}
+
+DAWN_NOINLINE bool SupportCounterSamplingAtStageBoundary(id<MTLDevice> device)
+    API_AVAILABLE(macos(11.0), ios(14.0)) {
+    return [device supportsCounterSampling:MTLCounterSamplingPointAtStageBoundary];
 }

 }  // namespace dawn::native::metal
--- a/src/dawn/tests/DawnTest.cpp
+++ b/src/dawn/tests/DawnTest.cpp
@@ -667,6 +667,10 @@ bool DawnTestBase::IsAMD() const {
    return gpu_info::IsAMD(mParam.adapterProperties.vendorID);
 }

+bool DawnTestBase::IsApple() const {
+    return gpu_info::IsApple(mParam.adapterProperties.vendorID);
+}
+
 bool DawnTestBase::IsARM() const {
    return gpu_info::IsARM(mParam.adapterProperties.vendorID);
 }
--- a/src/dawn/tests/DawnTest.h
+++ b/src/dawn/tests/DawnTest.h
@@ -225,6 +225,7 @@ class DawnTestBase {
    bool IsVulkan() const;

    bool IsAMD() const;
+    bool IsApple() const;
    bool IsARM() const;
    bool IsImgTec() const;
    bool IsIntel() const;
--- a/src/dawn/tests/end2end/QueryTests.cpp
+++ b/src/dawn/tests/end2end/QueryTests.cpp
@@ -18,6 +18,15 @@
 #include "dawn/utils/ComboRenderPipelineDescriptor.h"
 #include "dawn/utils/WGPUHelpers.h"

+// Sentinel for clearing the content of the result buffer: a uint64_t with all bits set.
+constexpr static uint64_t kSentinelValue = ~uint64_t(0u);
+constexpr static uint64_t kZero = 0u;
+constexpr static unsigned int kRTSize = 4;
+constexpr uint64_t kMinDestinationOffset = 256;
+constexpr uint64_t kMinCount = kMinDestinationOffset / sizeof(uint64_t);
+constexpr wgpu::TextureFormat kColorFormat = wgpu::TextureFormat::RGBA8Unorm;
+constexpr wgpu::TextureFormat kDepthStencilFormat = wgpu::TextureFormat::Depth24PlusStencil8;
+
 class QueryTests : public DawnTest {
  protected:
    wgpu::Buffer CreateResolveBuffer(uint64_t size) {
@@ -27,13 +36,15 @@ class QueryTests : public DawnTest {
                           wgpu::BufferUsage::CopyDst;
        return device.CreateBuffer(&descriptor);
    }
-};

-// Clear the content of the result buffer into 0xFFFFFFFF.
-constexpr static uint64_t kSentinelValue = ~uint64_t(0u);
-constexpr static uint64_t kZero = 0u;
-constexpr uint64_t kMinDestinationOffset = 256;
-constexpr uint64_t kMinCount = kMinDestinationOffset / sizeof(uint64_t);
+    wgpu::Texture CreateRenderTexture(wgpu::TextureFormat format) {
+        wgpu::TextureDescriptor descriptor;
+        descriptor.size = {kRTSize, kRTSize, 1};
+        descriptor.format = format;
+        descriptor.usage = wgpu::TextureUsage::RenderAttachment;
+        return device.CreateTexture(&descriptor);
+    }
+};

 class OcclusionExpectation : public detail::Expectation {
  public:
@@ -112,14 +123,6 @@ class OcclusionQueryTests : public QueryTests {
        return device.CreateQuerySet(&descriptor);
    }

-    wgpu::Texture CreateRenderTexture(wgpu::TextureFormat format) {
-        wgpu::TextureDescriptor descriptor;
-        descriptor.size = {kRTSize, kRTSize, 1};
-        descriptor.format = format;
-        descriptor.usage = wgpu::TextureUsage::RenderAttachment;
-        return device.CreateTexture(&descriptor);
-    }
-
    void TestOcclusionQueryWithDepthStencilTest(bool depthTestEnabled,
                                                bool stencilTestEnabled,
                                                OcclusionExpectation::Result expected) {
@@ -130,8 +133,7 @@ class OcclusionQueryTests : public QueryTests {
        descriptor.cFragment.module = fsModule;

        // Enable depth and stencil tests and set comparison tests never pass.
-        wgpu::DepthStencilState* depthStencil =
-            descriptor.EnableDepthStencil(wgpu::TextureFormat::Depth24PlusStencil8);
+        wgpu::DepthStencilState* depthStencil = descriptor.EnableDepthStencil(kDepthStencilFormat);
        depthStencil->depthCompare =
            depthTestEnabled ? wgpu::CompareFunction::Never : wgpu::CompareFunction::Always;
        depthStencil->stencilFront.compare =
@@ -141,10 +143,10 @@ class OcclusionQueryTests : public QueryTests {

        wgpu::RenderPipeline renderPipeline = device.CreateRenderPipeline(&descriptor);

-        wgpu::Texture renderTarget = CreateRenderTexture(wgpu::TextureFormat::RGBA8Unorm);
+        wgpu::Texture renderTarget = CreateRenderTexture(kColorFormat);
        wgpu::TextureView renderTargetView = renderTarget.CreateView();

-        wgpu::Texture depthTexture = CreateRenderTexture(wgpu::TextureFormat::Depth24PlusStencil8);
+        wgpu::Texture depthTexture = CreateRenderTexture(kDepthStencilFormat);
        wgpu::TextureView depthTextureView = depthTexture.CreateView();

        wgpu::QuerySet querySet = CreateOcclusionQuerySet(kQueryCount);
@@ -205,8 +207,6 @@ class OcclusionQueryTests : public QueryTests {
    wgpu::ShaderModule fsModule;

    wgpu::RenderPipeline pipeline;
-
-    constexpr static unsigned int kRTSize = 4;
 };

 // Test creating query set with the type of Occlusion
@@ -551,23 +551,87 @@ class TimestampQueryTests : public QueryTests {
        return device.CreateQuerySet(&descriptor);
    }

+    wgpu::RenderPipeline CreateRenderPipeline(bool hasFragmentStage = true) {
+        utils::ComboRenderPipelineDescriptor descriptor;
+
+        descriptor.vertex.module = utils::CreateShaderModule(device, R"(
+                @vertex
+                fn main(@builtin(vertex_index) VertexIndex : u32) -> @builtin(position) vec4<f32> {
+                    var pos = array<vec2<f32>, 3>(
+                        vec2<f32>( 1.0,  1.0),
+                        vec2<f32>(-1.0, -1.0),
+                        vec2<f32>( 1.0, -1.0));
+                    return vec4<f32>(pos[VertexIndex], 0.0, 1.0);
+                })");
+
+        if (hasFragmentStage) {
+            descriptor.cFragment.module = utils::CreateShaderModule(device, R"(
+                @fragment fn main() -> @location(0) vec4<f32> {
+                    return vec4<f32>(0.0, 1.0, 0.0, 1.0);
+                })");
+        } else {
+            descriptor.fragment = nullptr;
+            descriptor.EnableDepthStencil(kDepthStencilFormat);
+        }
+
+        return device.CreateRenderPipeline(&descriptor);
+    }
+
+    void EncodeComputeTimestampWrites(
+        const wgpu::CommandEncoder& encoder,
+        const std::vector<wgpu::ComputePassTimestampWrite>& timestampWrites,
+        bool hasPipeline = true) {
+        wgpu::ComputePassDescriptor descriptor;
+        descriptor.timestampWriteCount = timestampWrites.size();
+        descriptor.timestampWrites = timestampWrites.data();
+
+        wgpu::ComputePassEncoder pass = encoder.BeginComputePass(&descriptor);
+        if (hasPipeline) {
+            pass.SetPipeline(computePipeline);
+            pass.DispatchWorkgroups(1, 1, 1);
+        }
+        pass.End();
+    }
+
+    void EncodeRenderTimestampWrites(
+        const wgpu::CommandEncoder& encoder,
+        const std::vector<wgpu::RenderPassTimestampWrite>& timestampWrites,
+        bool hasPipeline = true,
+        bool hasFragmentStage = true) {
+        // Color-only target when a fragment stage runs; depth/stencil-only target otherwise.
+        utils::ComboRenderPassDescriptor renderPassDesc =
+            hasFragmentStage
+                ? utils::ComboRenderPassDescriptor({CreateRenderTexture(kColorFormat).CreateView()})
+                : utils::ComboRenderPassDescriptor(
+                      {}, CreateRenderTexture(kDepthStencilFormat).CreateView());
+        renderPassDesc.timestampWriteCount = timestampWrites.size();
+        renderPassDesc.timestampWrites = timestampWrites.data();
+
+        wgpu::RenderPassEncoder pass = encoder.BeginRenderPass(&renderPassDesc);
+        if (hasPipeline) {
+            wgpu::RenderPipeline renderPipeline = CreateRenderPipeline(hasFragmentStage);
+            pass.SetPipeline(renderPipeline);
+            pass.Draw(3);
+        }
+        pass.End();
+    }
+
    void TestTimestampWritesOnComputePass(
        const std::vector<wgpu::ComputePassTimestampWrite>& timestampWrites,
-        const std::vector<wgpu::ComputePassTimestampWrite>& timestampWritesOnAnotherPass = {}) {
+        const std::vector<wgpu::ComputePassTimestampWrite>& timestampWritesOnAnotherPass = {},
+        bool hasPipeline = true) {
        size_t queryCount = timestampWrites.size() + timestampWritesOnAnotherPass.size();
        // The destination buffer offset must be a multiple of 256.
        wgpu::Buffer destination =
            CreateResolveBuffer(queryCount * kMinDestinationOffset + sizeof(uint64_t));

-        wgpu::ComputePassDescriptor descriptor;
-        descriptor.timestampWriteCount = timestampWrites.size();
-        descriptor.timestampWrites = timestampWrites.data();
-
        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
-        wgpu::ComputePassEncoder pass = encoder.BeginComputePass(&descriptor);
-        pass.SetPipeline(computePipeline);
-        pass.DispatchWorkgroups(1, 1, 1);
-        pass.End();
+        EncodeComputeTimestampWrites(encoder, timestampWrites, hasPipeline);
+
+        // Begin another compute pass if the timestampWritesOnAnotherPass is set.
+        if (!timestampWritesOnAnotherPass.empty()) {
+            EncodeComputeTimestampWrites(encoder, timestampWritesOnAnotherPass, hasPipeline);
+        }

        // Resolve queries one by one because the query set at the beginning of pass may be
        // different with the one at the end of pass.
@@ -575,26 +639,11 @@ class TimestampQueryTests : public QueryTests {
            encoder.ResolveQuerySet(timestampWrites[i].querySet, timestampWrites[i].queryIndex, 1,
                                    destination, i * kMinDestinationOffset);
        }
-
-        // Begin another compute pass if the timestampWritesOnAnotherPass is set.
-        if (!timestampWritesOnAnotherPass.empty()) {
-            wgpu::ComputePassDescriptor descriptor2;
-            descriptor2.timestampWriteCount = timestampWritesOnAnotherPass.size();
-            descriptor2.timestampWrites = timestampWritesOnAnotherPass.data();
-
-            wgpu::ComputePassEncoder pass2 = encoder.BeginComputePass(&descriptor2);
-            pass2.SetPipeline(computePipeline);
-            pass2.DispatchWorkgroups(1, 1, 1);
-            pass2.End();
-
        for (size_t i = 0; i < timestampWritesOnAnotherPass.size(); i++) {
-                // Resolve queries one by one because the query set at the beginning of pass may be
-                // different with the one at the end of pass.
            encoder.ResolveQuerySet(timestampWritesOnAnotherPass[i].querySet,
                                    timestampWritesOnAnotherPass[i].queryIndex, 1, destination,
                                    (timestampWrites.size() + i) * kMinDestinationOffset);
        }
-        }

        wgpu::CommandBuffer commands = encoder.Finish();
        queue.Submit(1, &commands);
@@ -607,19 +656,22 @@ class TimestampQueryTests : public QueryTests {

    void TestTimestampWritesOnRenderPass(
        const std::vector<wgpu::RenderPassTimestampWrite>& timestampWrites,
-        const std::vector<wgpu::RenderPassTimestampWrite>& timestampWritesOnAnotherPass = {}) {
+        const std::vector<wgpu::RenderPassTimestampWrite>& timestampWritesOnAnotherPass = {},
+        bool hasPipeline = true,
+        bool hasFragmentStage = true) {
        size_t queryCount = timestampWrites.size() + timestampWritesOnAnotherPass.size();
        // The destination buffer offset must be a multiple of 256.
        wgpu::Buffer destination =
            CreateResolveBuffer(queryCount * kMinDestinationOffset + sizeof(uint64_t));

-        utils::BasicRenderPass renderPass = utils::CreateBasicRenderPass(device, 1, 1);
-        renderPass.renderPassInfo.timestampWriteCount = timestampWrites.size();
-        renderPass.renderPassInfo.timestampWrites = timestampWrites.data();
-
        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
-        wgpu::RenderPassEncoder pass = encoder.BeginRenderPass(&renderPass.renderPassInfo);
-        pass.End();
+        EncodeRenderTimestampWrites(encoder, timestampWrites, hasPipeline, hasFragmentStage);
+
+        // Begin another render pass if the timestampWritesOnAnotherPass is set.
+        if (!timestampWritesOnAnotherPass.empty()) {
+            EncodeRenderTimestampWrites(encoder, timestampWritesOnAnotherPass, hasPipeline,
+                                        hasFragmentStage);
+        }

        // Resolve queries one by one because the query set at the beginning of pass may be
        // different with the one at the end of pass.
@@ -628,23 +680,11 @@ class TimestampQueryTests : public QueryTests {
                                    destination, i * kMinDestinationOffset);
        }

-        // Begin another render pass if the timestampWritesOnAnotherPass is set.
-        if (!timestampWritesOnAnotherPass.empty()) {
-            utils::BasicRenderPass renderPass2 = utils::CreateBasicRenderPass(device, 1, 1);
-            renderPass2.renderPassInfo.timestampWriteCount = timestampWritesOnAnotherPass.size();
-            renderPass2.renderPassInfo.timestampWrites = timestampWritesOnAnotherPass.data();
-
-            wgpu::RenderPassEncoder pass2 = encoder.BeginRenderPass(&renderPass2.renderPassInfo);
-            pass2.End();
-
        for (size_t i = 0; i < timestampWritesOnAnotherPass.size(); i++) {
-                // Resolve queries one by one because the query set at the beginning of pass may be
-                // different with the one at the end of pass.
            encoder.ResolveQuerySet(timestampWritesOnAnotherPass[i].querySet,
                                    timestampWritesOnAnotherPass[i].queryIndex, 1, destination,
                                    (timestampWrites.size() + i) * kMinDestinationOffset);
        }
-        }

        wgpu::CommandBuffer commands = encoder.Finish();
        queue.Submit(1, &commands);
@@ -669,9 +709,6 @@ TEST_P(TimestampQueryTests, QuerySetCreation) {

 // Test calling timestamp query from command encoder
 TEST_P(TimestampQueryTests, TimestampOnCommandEncoder) {
-    // TODO (dawn:1250): Still not implemented on Metal backend.
-    DAWN_TEST_UNSUPPORTED_IF(IsMetal());
-
    constexpr uint32_t kQueryCount = 2;

    // Write timestamp with different query indexes
@@ -709,6 +746,10 @@ TEST_P(TimestampQueryTests, TimestampOnCommandEncoder) {

 // Test calling timestamp query from render pass encoder
 TEST_P(TimestampQueryTests, TimestampOnRenderPass) {
+    // TODO (dawn:1250): Split writeTimestamp() to another extension which is not supported on Apple
+    // devices
+    DAWN_TEST_UNSUPPORTED_IF(IsMacOS() && IsMetal() && IsApple());
+
    constexpr uint32_t kQueryCount = 2;

    // Write timestamp with different query indexes
@@ -754,6 +795,10 @@ TEST_P(TimestampQueryTests, TimestampOnRenderPass) {

 // Test calling timestamp query from compute pass encoder
 TEST_P(TimestampQueryTests, TimestampOnComputePass) {
+    // TODO (dawn:1250): Split writeTimestamp() to another extension which is not supported on Apple
+    // devices
+    DAWN_TEST_UNSUPPORTED_IF(IsMacOS() && IsMetal() && IsApple());
+
    constexpr uint32_t kQueryCount = 2;

    // Write timestamp with different query indexes
@@ -815,19 +860,34 @@ TEST_P(TimestampQueryTests, TimestampOnComputePass) {
    }
 }

-// Test timestampWrites setting in compute pass descriptor
-TEST_P(TimestampQueryTests, TimestampWritesOnComputePass) {
+// Test timestampWrites with query set in compute pass descriptor
+TEST_P(TimestampQueryTests, TimestampWritesQuerySetOnComputePass) {
    // TODO(dawn:1489): Fails on Intel Windows Vulkan due to a driver issue that
    // vkCmdFillBuffer and vkCmdCopyQueryPoolResults are not executed in order, skip it until
    // the issue is fixed.
    DAWN_SUPPRESS_TEST_IF(IsWindows() && IsVulkan() && IsIntel());

-    // TODO (dawn:1250): Still not implemented on Metal backend.
-    DAWN_TEST_UNSUPPORTED_IF(IsMetal());
+    // TODO (dawn:1473): Metal bug which fails to store GPU counters to different sample buffer.
+    DAWN_SUPPRESS_TEST_IF(IsMacOS() && IsMetal() && IsApple());
+
+    // Set timestampWrites with different query set on same compute pass
+    wgpu::QuerySet querySet0 = CreateQuerySetForTimestamp(1);
+    wgpu::QuerySet querySet1 = CreateQuerySetForTimestamp(1);
+
+    TestTimestampWritesOnComputePass({{querySet0, 0, wgpu::ComputePassTimestampLocation::Beginning},
+                                      {querySet1, 0, wgpu::ComputePassTimestampLocation::End}});
+}
+
+// Test timestampWrites with query index in compute pass descriptor
+TEST_P(TimestampQueryTests, TimestampWritesQueryIndexOnComputePass) {
+    // TODO(dawn:1489): Fails on Intel Windows Vulkan due to a driver issue that
+    // vkCmdFillBuffer and vkCmdCopyQueryPoolResults are not executed in order, skip it until
+    // the issue is fixed.
+    DAWN_SUPPRESS_TEST_IF(IsWindows() && IsVulkan() && IsIntel());

    constexpr uint32_t kQueryCount = 2;

-    // Set timestampWrites with different query indexes and locations on same compute pass
+    // Set timestampWrites with different query indexes on same compute pass
    {
        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);

@@ -836,16 +896,35 @@ TEST_P(TimestampQueryTests, TimestampWritesOnComputePass) {
             {querySet, 1, wgpu::ComputePassTimestampLocation::End}});
    }

-    // Set timestampWrites with different query set on same compute pass
+    // Set timestampWrites with same query index on same compute pass
    {
-        wgpu::QuerySet querySet0 = CreateQuerySetForTimestamp(1);
-        wgpu::QuerySet querySet1 = CreateQuerySetForTimestamp(1);
+        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);

        TestTimestampWritesOnComputePass(
-            {{querySet0, 0, wgpu::ComputePassTimestampLocation::Beginning},
-             {querySet1, 0, wgpu::ComputePassTimestampLocation::End}});
+            {{querySet, 0, wgpu::ComputePassTimestampLocation::Beginning},
+             {querySet, 0, wgpu::ComputePassTimestampLocation::End}});
    }

+    // Set timestampWrites with same query indexes on different compute pass
+    {
+        wgpu::QuerySet querySet0 = CreateQuerySetForTimestamp(kQueryCount);
+        wgpu::QuerySet querySet1 = CreateQuerySetForTimestamp(kQueryCount);
+
+        TestTimestampWritesOnComputePass(
+            {{querySet0, 0, wgpu::ComputePassTimestampLocation::Beginning}},
+            {{querySet1, 0, wgpu::ComputePassTimestampLocation::End}});
+    }
+}
+
+// Test timestampWrites with timestamp location in compute pass descriptor
+TEST_P(TimestampQueryTests, TimestampWritesLocationOnComputePass) {
+    // TODO(dawn:1489): Fails on Intel Windows Vulkan due to a driver issue that
+    // vkCmdFillBuffer and vkCmdCopyQueryPoolResults are not executed in order, skip it until
+    // the issue is fixed.
+    DAWN_SUPPRESS_TEST_IF(IsWindows() && IsVulkan() && IsIntel());
+
+    constexpr uint32_t kQueryCount = 2;
+
    // Set timestampWrites with only one value of ComputePassTimestampLocation
    {
        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
@@ -856,63 +935,78 @@ TEST_P(TimestampQueryTests, TimestampWritesOnComputePass) {
        TestTimestampWritesOnComputePass({{querySet, 1, wgpu::ComputePassTimestampLocation::End}});
    }

-    // Set timestampWrites with same query set and query index on same compute pass
+    // Set timestampWrites with same location on different compute pass
    {
-        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
+        wgpu::QuerySet querySet0 = CreateQuerySetForTimestamp(1);
+        wgpu::QuerySet querySet1 = CreateQuerySetForTimestamp(1);

        TestTimestampWritesOnComputePass(
-            {{querySet, 0, wgpu::ComputePassTimestampLocation::Beginning},
-             {querySet, 0, wgpu::ComputePassTimestampLocation::End}});
-    }
-
-    // Set timestampWrites with same query indexes and locations on different compute pass
-    {
-        wgpu::QuerySet querySet0 = CreateQuerySetForTimestamp(kQueryCount);
-        wgpu::QuerySet querySet1 = CreateQuerySetForTimestamp(kQueryCount);
-
-        TestTimestampWritesOnComputePass(
-            {{querySet0, 0, wgpu::ComputePassTimestampLocation::Beginning},
-             {querySet0, 1, wgpu::ComputePassTimestampLocation::End}},
-            {{querySet1, 0, wgpu::ComputePassTimestampLocation::Beginning},
-             {querySet1, 1, wgpu::ComputePassTimestampLocation::End}});
+            {{querySet0, 0, wgpu::ComputePassTimestampLocation::Beginning}},
+            {{querySet1, 0, wgpu::ComputePassTimestampLocation::Beginning}});
    }
 }

-// Test timestampWrites setting in render pass descriptor
-TEST_P(TimestampQueryTests, TimestampWritesOnRenderPass) {
+// Test timestampWrites on compute pass without pipeline
+TEST_P(TimestampQueryTests, TimestampWritesOnComputePassWithNoPipline) {
    // TODO(dawn:1489): Fails on Intel Windows Vulkan due to a driver issue that
    // vkCmdFillBuffer and vkCmdCopyQueryPoolResults are not executed in order, skip it until
    // the issue is fixed.
    DAWN_SUPPRESS_TEST_IF(IsWindows() && IsVulkan() && IsIntel());

-    // TODO (dawn:1250): Still not implemented on Metal backend.
-    DAWN_TEST_UNSUPPORTED_IF(IsMetal());
+    // TODO (dawn:1473): Metal fails to store GPU counters to sampleBufferAttachments on empty
+    // encoders.
+    DAWN_SUPPRESS_TEST_IF(IsMacOS() && IsMetal() && IsApple());

-    constexpr uint32_t kQueryCount = 2;
+    wgpu::QuerySet querySet = CreateQuerySetForTimestamp(2);

-    // Set timestampWrites with different query indexes and locations, not need test write same
-    // query index due to it's not allowed on render pass.
-    {
-        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
+    TestTimestampWritesOnComputePass({{querySet, 0, wgpu::ComputePassTimestampLocation::Beginning},
+                                      {querySet, 1, wgpu::ComputePassTimestampLocation::End}},
+                                     {}, false);
+}

-        TestTimestampWritesOnRenderPass(
-            {{querySet, 0, wgpu::RenderPassTimestampLocation::Beginning},
-             {querySet, 1, wgpu::RenderPassTimestampLocation::End}});
-    }
+// Test timestampWrites with query set in render pass descriptor
+TEST_P(TimestampQueryTests, TimestampWritesQuerySetOnRenderPass) {
+    // TODO(dawn:1489): Fails on Intel Windows Vulkan due to a driver issue that
+    // vkCmdFillBuffer and vkCmdCopyQueryPoolResults are not executed in order, skip it until
+    // the issue is fixed.
+    DAWN_SUPPRESS_TEST_IF(IsWindows() && IsVulkan() && IsIntel());
+
+    // TODO (dawn:1473): Metal bug which fails to store GPU counters to different sample buffer.
+    DAWN_SUPPRESS_TEST_IF(IsMacOS() && IsMetal() && IsApple());

    // Set timestampWrites with different query set on same render pass
-    {
    wgpu::QuerySet querySet0 = CreateQuerySetForTimestamp(1);
    wgpu::QuerySet querySet1 = CreateQuerySetForTimestamp(1);

-        TestTimestampWritesOnRenderPass(
-            {{querySet0, 0, wgpu::RenderPassTimestampLocation::Beginning},
+    TestTimestampWritesOnRenderPass({{querySet0, 0, wgpu::RenderPassTimestampLocation::Beginning},
                                     {querySet1, 0, wgpu::RenderPassTimestampLocation::End}});
-    }
+}
+
+// Test timestampWrites with query index in render pass descriptor
+TEST_P(TimestampQueryTests, TimestampWritesQueryIndexOnRenderPass) {
+    // TODO(dawn:1489): Fails on Intel Windows Vulkan due to a driver issue that
+    // vkCmdFillBuffer and vkCmdCopyQueryPoolResults are not executed in order, skip it until
+    // the issue is fixed.
+    DAWN_SUPPRESS_TEST_IF(IsWindows() && IsVulkan() && IsIntel());
+
+    // Set timestampWrites with different query indexes and locations. There is no need to test
+    // writing the same query index because that is not allowed on a render pass.
+    wgpu::QuerySet querySet = CreateQuerySetForTimestamp(2);
+
+    TestTimestampWritesOnRenderPass({{querySet, 0, wgpu::RenderPassTimestampLocation::Beginning},
+                                     {querySet, 1, wgpu::RenderPassTimestampLocation::End}});
+}
+
+// Test timestampWrites with timestamp location in render pass descriptor
+TEST_P(TimestampQueryTests, TimestampWritesLocationOnRenderPass) {
+    // TODO(dawn:1489): Fails on Intel Windows Vulkan due to a driver issue that
+    // vkCmdFillBuffer and vkCmdCopyQueryPoolResults are not executed in order, skip it until
+    // the issue is fixed.
+    DAWN_SUPPRESS_TEST_IF(IsWindows() && IsVulkan() && IsIntel());

    // Set timestampWrites with only one value of RenderPassTimestampLocation
    {
-        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(kQueryCount);
+        wgpu::QuerySet querySet = CreateQuerySetForTimestamp(2);

        TestTimestampWritesOnRenderPass(
            {{querySet, 0, wgpu::RenderPassTimestampLocation::Beginning}});
@@ -920,19 +1014,45 @@ TEST_P(TimestampQueryTests, TimestampWritesOnRenderPass) {
        TestTimestampWritesOnRenderPass({{querySet, 1, wgpu::RenderPassTimestampLocation::End}});
    }

-    // Set timestampWrites with same query indexes and locations on different render pass
+    // Set timestampWrites with same location on different render pass
    {
-        wgpu::QuerySet querySet0 = CreateQuerySetForTimestamp(kQueryCount);
-        wgpu::QuerySet querySet1 = CreateQuerySetForTimestamp(kQueryCount);
+        wgpu::QuerySet querySet0 = CreateQuerySetForTimestamp(1);
+        wgpu::QuerySet querySet1 = CreateQuerySetForTimestamp(1);

        TestTimestampWritesOnRenderPass(
-            {{querySet0, 0, wgpu::RenderPassTimestampLocation::Beginning},
-             {querySet0, 1, wgpu::RenderPassTimestampLocation::End}},
-            {{querySet1, 0, wgpu::RenderPassTimestampLocation::Beginning},
-             {querySet1, 1, wgpu::RenderPassTimestampLocation::End}});
+            {{querySet0, 0, wgpu::RenderPassTimestampLocation::Beginning}},
+            {{querySet1, 0, wgpu::RenderPassTimestampLocation::Beginning}});
    }
 }

+// Test timestampWrites on a render pass without a pipeline
+TEST_P(TimestampQueryTests, TimestampWritesOnRenderPassWithNoPipline) {
+    // TODO(dawn:1489): Fails on Intel Windows Vulkan due to a driver issue where
+    // vkCmdFillBuffer and vkCmdCopyQueryPoolResults are not executed in order. Skip it until
+    // the issue is fixed.
+    DAWN_SUPPRESS_TEST_IF(IsWindows() && IsVulkan() && IsIntel());
+
+    wgpu::QuerySet querySet = CreateQuerySetForTimestamp(2);
+    TestTimestampWritesOnRenderPass({{querySet, 0, wgpu::RenderPassTimestampLocation::Beginning},
+                                     {querySet, 1, wgpu::RenderPassTimestampLocation::End}},
+                                    {}, false);
+}
+
+// Test timestampWrites on a render pass with a pipeline but no fragment stage
+TEST_P(TimestampQueryTests, TimestampWritesOnRenderPassWithOnlyVertexStage) {
+    DAWN_TEST_UNSUPPORTED_IF(HasToggleEnabled("use_placeholder_fragment_in_vertex_only_pipeline"));
+
+    // TODO(dawn:1489): Fails on Intel Windows Vulkan due to a driver issue where
+    // vkCmdFillBuffer and vkCmdCopyQueryPoolResults are not executed in order. Skip it until
+    // the issue is fixed.
+    DAWN_SUPPRESS_TEST_IF(IsWindows() && IsVulkan() && IsIntel());
+
+    wgpu::QuerySet querySet = CreateQuerySetForTimestamp(2);
+    TestTimestampWritesOnRenderPass({{querySet, 0, wgpu::RenderPassTimestampLocation::Beginning},
+                                     {querySet, 1, wgpu::RenderPassTimestampLocation::End}},
+                                    {}, true, false);
+}
+
 // Test resolving timestamp query from another different encoder
 TEST_P(TimestampQueryTests, ResolveFromAnotherEncoder) {
    constexpr uint32_t kQueryCount = 2;