From ff8b3f43972342de3ea48cb94c036d8a7b7089c5 Mon Sep 17 00:00:00 2001 From: Austin Eng Date: Tue, 10 Dec 2019 01:10:27 +0000 Subject: [PATCH] Inline CommandAllocator/Iterator Inlining these hot functions decreases CPU time in perf tests for DrawCallPerf.Run/Vulkan by roughly 12% (55 to 47ns) and increases binary size by about 0.16% (~4kB). Bug: dawn:304 Change-Id: I84e5d011defe88d6f1492dcb54e421c3d1bf099f Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/14000 Commit-Queue: Austin Eng Reviewed-by: Corentin Wallez --- src/common/Compiler.h | 9 ++ src/common/Math.cpp | 7 -- src/common/Math.h | 16 +++- src/dawn_native/CommandAllocator.cpp | 118 +++++---------------------- src/dawn_native/CommandAllocator.h | 102 +++++++++++++++++++++-- 5 files changed, 138 insertions(+), 114 deletions(-) diff --git a/src/common/Compiler.h b/src/common/Compiler.h index 3bbcee6a18..8e425c9058 100644 --- a/src/common/Compiler.h +++ b/src/common/Compiler.h @@ -61,6 +61,9 @@ # endif # define DAWN_DECLARE_UNUSED __attribute__((unused)) +# if defined(NDEBUG) +# define DAWN_FORCE_INLINE inline __attribute__((always_inline)) +# endif // MSVC #elif defined(_MSC_VER) @@ -77,6 +80,9 @@ extern void __cdecl __debugbreak(void); # endif # define DAWN_DECLARE_UNUSED +# if defined(NDEBUG) +# define DAWN_FORCE_INLINE __forceinline +# endif #else # error "Unsupported compiler" @@ -97,5 +103,8 @@ extern void __cdecl __debugbreak(void); #if !defined(DAWN_NO_DISCARD) # define DAWN_NO_DISCARD #endif +#if !defined(DAWN_FORCE_INLINE) +# define DAWN_FORCE_INLINE inline +#endif #endif // COMMON_COMPILER_H_ diff --git a/src/common/Math.cpp b/src/common/Math.cpp index a8823e5429..2ae977b1e8 100644 --- a/src/common/Math.cpp +++ b/src/common/Math.cpp @@ -85,13 +85,6 @@ bool IsPtrAligned(const void* ptr, size_t alignment) { return (reinterpret_cast(ptr) & (alignment - 1)) == 0; } -void* AlignVoidPtr(void* ptr, size_t alignment) { - ASSERT(IsPowerOfTwo(alignment)); - ASSERT(alignment != 0); - return reinterpret_cast((reinterpret_cast(ptr) + (alignment - 1)) & - ~(alignment - 1)); -} - bool IsAligned(uint32_t value, size_t alignment) { ASSERT(alignment <= UINT32_MAX); ASSERT(IsPowerOfTwo(alignment)); diff --git a/src/common/Math.h b/src/common/Math.h index ac40dd9672..5ee915ef73 100644 --- a/src/common/Math.h +++ b/src/common/Math.h @@ -15,6 +15,8 @@ #ifndef COMMON_MATH_H_ #define COMMON_MATH_H_ +#include "common/Assert.h" + #include #include #include @@ -35,13 +37,19 @@ bool IsAligned(uint32_t value, size_t alignment); uint32_t Align(uint32_t value, size_t alignment); template -T* AlignPtr(T* ptr, size_t alignment) { - return static_cast(AlignVoidPtr(ptr, alignment)); +DAWN_FORCE_INLINE T* AlignPtr(T* ptr, size_t alignment) { + ASSERT(IsPowerOfTwo(alignment)); + ASSERT(alignment != 0); + return reinterpret_cast((reinterpret_cast(ptr) + (alignment - 1)) & + ~(alignment - 1)); } template -const T* AlignPtr(const T* ptr, size_t alignment) { - return static_cast(AlignVoidPtr(const_cast(ptr), alignment)); +DAWN_FORCE_INLINE const T* AlignPtr(const T* ptr, size_t alignment) { + ASSERT(IsPowerOfTwo(alignment)); + ASSERT(alignment != 0); + return reinterpret_cast((reinterpret_cast(ptr) + (alignment - 1)) & + ~(alignment - 1)); } template diff --git a/src/dawn_native/CommandAllocator.cpp b/src/dawn_native/CommandAllocator.cpp index 990c1c5990..553f889657 100644 --- a/src/dawn_native/CommandAllocator.cpp +++ b/src/dawn_native/CommandAllocator.cpp @@ -23,12 +23,9 @@ namespace dawn_native { - constexpr uint32_t EndOfBlock = UINT_MAX; // std::numeric_limits::max(); - constexpr uint32_t AdditionalData = UINT_MAX - 1; // std::numeric_limits::max() - 1; - // TODO(cwallez@chromium.org): figure out a way to have more type safety for the iterator - CommandIterator::CommandIterator() : mEndOfBlock(EndOfBlock) { + CommandIterator::CommandIterator() { Reset(); } @@ -42,7 +39,7 @@ namespace dawn_native { } } - CommandIterator::CommandIterator(CommandIterator&& other) : mEndOfBlock(EndOfBlock) { + CommandIterator::CommandIterator(CommandIterator&& other) { if (!other.IsEmpty()) { mBlocks = std::move(other.mBlocks); other.Reset(); @@ -64,7 +61,7 @@ namespace dawn_native { } CommandIterator::CommandIterator(CommandAllocator&& allocator) - : mBlocks(allocator.AcquireBlocks()), mEndOfBlock(EndOfBlock) { + : mBlocks(allocator.AcquireBlocks()) { Reset(); } @@ -74,6 +71,17 @@ namespace dawn_native { return *this; } + bool CommandIterator::NextCommandIdInNewBlock(uint32_t* commandId) { + mCurrentBlock++; + if (mCurrentBlock >= mBlocks.size()) { + Reset(); + *commandId = detail::kEndOfBlock; + return false; + } + mCurrentPtr = AlignPtr(mBlocks[mCurrentBlock].block, alignof(uint32_t)); + return NextCommandId(commandId); + } + void CommandIterator::Reset() { mCurrentBlock = 0; @@ -97,47 +105,6 @@ namespace dawn_native { return mBlocks[0].block == reinterpret_cast(&mEndOfBlock); } - bool CommandIterator::NextCommandId(uint32_t* commandId) { - uint8_t* idPtr = AlignPtr(mCurrentPtr, alignof(uint32_t)); - ASSERT(idPtr + sizeof(uint32_t) <= - mBlocks[mCurrentBlock].block + mBlocks[mCurrentBlock].size); - - uint32_t id = *reinterpret_cast(idPtr); - - if (id == EndOfBlock) { - mCurrentBlock++; - if (mCurrentBlock >= mBlocks.size()) { - Reset(); - *commandId = EndOfBlock; - return false; - } - mCurrentPtr = AlignPtr(mBlocks[mCurrentBlock].block, alignof(uint32_t)); - return NextCommandId(commandId); - } - - mCurrentPtr = idPtr + sizeof(uint32_t); - *commandId = id; - return true; - } - - void* CommandIterator::NextCommand(size_t commandSize, size_t commandAlignment) { - uint8_t* commandPtr = AlignPtr(mCurrentPtr, commandAlignment); - ASSERT(commandPtr + sizeof(commandSize) <= - mBlocks[mCurrentBlock].block + mBlocks[mCurrentBlock].size); - - mCurrentPtr = commandPtr + commandSize; - return commandPtr; - } - - void* CommandIterator::NextData(size_t dataSize, size_t dataAlignment) { - uint32_t id; - bool hasId = NextCommandId(&id); - ASSERT(hasId); - ASSERT(id == AdditionalData); - - return NextCommand(dataSize, dataAlignment); - } - // Potential TODO(cwallez@chromium.org): // - Host the size and pointer to next block in the block itself to avoid having an allocation // in the vector @@ -161,60 +128,23 @@ namespace dawn_native { ASSERT(mCurrentPtr != nullptr && mEndPtr != nullptr); ASSERT(IsPtrAligned(mCurrentPtr, alignof(uint32_t))); ASSERT(mCurrentPtr + sizeof(uint32_t) <= mEndPtr); - *reinterpret_cast(mCurrentPtr) = EndOfBlock; + *reinterpret_cast(mCurrentPtr) = detail::kEndOfBlock; mCurrentPtr = nullptr; mEndPtr = nullptr; return std::move(mBlocks); } - uint8_t* CommandAllocator::Allocate(uint32_t commandId, - size_t commandSize, - size_t commandAlignment) { - ASSERT(mCurrentPtr != nullptr); - ASSERT(mEndPtr != nullptr); - ASSERT(commandId != EndOfBlock); - - // It should always be possible to allocate one id, for EndOfBlock tagging, - ASSERT(IsPtrAligned(mCurrentPtr, alignof(uint32_t))); - ASSERT(mEndPtr >= mCurrentPtr); - ASSERT(static_cast(mEndPtr - mCurrentPtr) >= sizeof(uint32_t)); - - // The memory after the ID will contain the following: - // - the current ID - // - padding to align the command, maximum kMaxSupportedAlignment - // - the command of size commandSize - // - padding to align the next ID, maximum alignof(uint32_t) - // - the next ID of size sizeof(uint32_t) - // - // To avoid checking for overflows at every step of the computations we compute an upper - // bound of the space that will be needed in addition to the command data. - static constexpr size_t kWorstCaseAdditionalSize = - sizeof(uint32_t) + kMaxSupportedAlignment + alignof(uint32_t) + sizeof(uint32_t); - - // This can't overflow because by construction mCurrentPtr always has space for the next ID. - size_t remainingSize = static_cast(mEndPtr - mCurrentPtr); - - // The good case were we have enough space for the command data and upper bound of the - // extra required space. - if ((remainingSize >= kWorstCaseAdditionalSize) && - (remainingSize - kWorstCaseAdditionalSize >= commandSize)) { - uint32_t* idAlloc = reinterpret_cast(mCurrentPtr); - *idAlloc = commandId; - - uint8_t* commandAlloc = AlignPtr(mCurrentPtr + sizeof(uint32_t), commandAlignment); - mCurrentPtr = AlignPtr(commandAlloc + commandSize, alignof(uint32_t)); - - return commandAlloc; - } - - // When there is not enough space, we signal the EndOfBlock, so that the iterator knows to - // move to the next one. EndOfBlock on the last block means the end of the commands. + uint8_t* CommandAllocator::AllocateInNewBlock(uint32_t commandId, + size_t commandSize, + size_t commandAlignment) { + // When there is not enough space, we signal the kEndOfBlock, so that the iterator knows + // to move to the next one. kEndOfBlock on the last block means the end of the commands. uint32_t* idAlloc = reinterpret_cast(mCurrentPtr); - *idAlloc = EndOfBlock; + *idAlloc = detail::kEndOfBlock; // We'll request a block that can contain at least the command ID, the command and an - // additional ID to contain the EndOfBlock tag. + // additional ID to contain the kEndOfBlock tag. size_t requestedBlockSize = commandSize + kWorstCaseAdditionalSize; // The computation of the request could overflow. @@ -228,10 +158,6 @@ namespace dawn_native { return Allocate(commandId, commandSize, commandAlignment); } - uint8_t* CommandAllocator::AllocateData(size_t commandSize, size_t commandAlignment) { - return Allocate(AdditionalData, commandSize, commandAlignment); - } - bool CommandAllocator::GetNewBlock(size_t minimumSize) { // Allocate blocks doubling sizes each time, to a maximum of 16k (or at least minimumSize). mLastAllocationSize = diff --git a/src/dawn_native/CommandAllocator.h b/src/dawn_native/CommandAllocator.h index 504ba7a6a3..82de05c1a4 100644 --- a/src/dawn_native/CommandAllocator.h +++ b/src/dawn_native/CommandAllocator.h @@ -15,6 +15,9 @@ #ifndef DAWNNATIVE_COMMAND_ALLOCATOR_H_ #define DAWNNATIVE_COMMAND_ALLOCATOR_H_ +#include "common/Assert.h" +#include "common/Math.h" + #include #include #include @@ -56,6 +59,11 @@ namespace dawn_native { }; using CommandBlocks = std::vector; + namespace detail { + constexpr uint32_t kEndOfBlock = std::numeric_limits::max(); + constexpr uint32_t kAdditionalData = std::numeric_limits::max() - 1; + } // namespace detail + class CommandAllocator; // TODO(cwallez@chromium.org): prevent copy for both iterator and allocator @@ -91,15 +99,46 @@ namespace dawn_native { private: bool IsEmpty() const; - bool NextCommandId(uint32_t* commandId); - void* NextCommand(size_t commandSize, size_t commandAlignment); - void* NextData(size_t dataSize, size_t dataAlignment); + DAWN_FORCE_INLINE bool NextCommandId(uint32_t* commandId) { + uint8_t* idPtr = AlignPtr(mCurrentPtr, alignof(uint32_t)); + ASSERT(idPtr + sizeof(uint32_t) <= + mBlocks[mCurrentBlock].block + mBlocks[mCurrentBlock].size); + + uint32_t id = *reinterpret_cast(idPtr); + + if (id != detail::kEndOfBlock) { + mCurrentPtr = idPtr + sizeof(uint32_t); + *commandId = id; + return true; + } + return NextCommandIdInNewBlock(commandId); + } + + bool NextCommandIdInNewBlock(uint32_t* commandId); + + DAWN_FORCE_INLINE void* NextCommand(size_t commandSize, size_t commandAlignment) { + uint8_t* commandPtr = AlignPtr(mCurrentPtr, commandAlignment); + ASSERT(commandPtr + sizeof(commandSize) <= + mBlocks[mCurrentBlock].block + mBlocks[mCurrentBlock].size); + + mCurrentPtr = commandPtr + commandSize; + return commandPtr; + } + + DAWN_FORCE_INLINE void* NextData(size_t dataSize, size_t dataAlignment) { + uint32_t id; + bool hasId = NextCommandId(&id); + ASSERT(hasId); + ASSERT(id == detail::kAdditionalData); + + return NextCommand(dataSize, dataAlignment); + } CommandBlocks mBlocks; uint8_t* mCurrentPtr = nullptr; size_t mCurrentBlock = 0; // Used to avoid a special case for empty iterators. - uint32_t mEndOfBlock; + uint32_t mEndOfBlock = detail::kEndOfBlock; bool mDataWasDestroyed = false; }; @@ -140,18 +179,67 @@ namespace dawn_native { // using the CommandAllocator passes the static_asserts. static constexpr size_t kMaxSupportedAlignment = 8; + // To avoid checking for overflows at every step of the computations we compute an upper + // bound of the space that will be needed in addition to the command data. + static constexpr size_t kWorstCaseAdditionalSize = + sizeof(uint32_t) + kMaxSupportedAlignment + alignof(uint32_t) + sizeof(uint32_t); + friend CommandIterator; CommandBlocks&& AcquireBlocks(); - uint8_t* Allocate(uint32_t commandId, size_t commandSize, size_t commandAlignment); - uint8_t* AllocateData(size_t dataSize, size_t dataAlignment); + DAWN_FORCE_INLINE uint8_t* Allocate(uint32_t commandId, + size_t commandSize, + size_t commandAlignment) { + ASSERT(mCurrentPtr != nullptr); + ASSERT(mEndPtr != nullptr); + ASSERT(commandId != detail::kEndOfBlock); + + // It should always be possible to allocate one id, for kEndOfBlock tagging, + ASSERT(IsPtrAligned(mCurrentPtr, alignof(uint32_t))); + ASSERT(mEndPtr >= mCurrentPtr); + ASSERT(static_cast(mEndPtr - mCurrentPtr) >= sizeof(uint32_t)); + + // The memory after the ID will contain the following: + // - the current ID + // - padding to align the command, maximum kMaxSupportedAlignment + // - the command of size commandSize + // - padding to align the next ID, maximum alignof(uint32_t) + // - the next ID of size sizeof(uint32_t) + + // This can't overflow because by construction mCurrentPtr always has space for the next + // ID. + size_t remainingSize = static_cast(mEndPtr - mCurrentPtr); + + // The good case were we have enough space for the command data and upper bound of the + // extra required space. + if ((remainingSize >= kWorstCaseAdditionalSize) && + (remainingSize - kWorstCaseAdditionalSize >= commandSize)) { + uint32_t* idAlloc = reinterpret_cast(mCurrentPtr); + *idAlloc = commandId; + + uint8_t* commandAlloc = AlignPtr(mCurrentPtr + sizeof(uint32_t), commandAlignment); + mCurrentPtr = AlignPtr(commandAlloc + commandSize, alignof(uint32_t)); + + return commandAlloc; + } + return AllocateInNewBlock(commandId, commandSize, commandAlignment); + } + + uint8_t* AllocateInNewBlock(uint32_t commandId, + size_t commandSize, + size_t commandAlignment); + + DAWN_FORCE_INLINE uint8_t* AllocateData(size_t commandSize, size_t commandAlignment) { + return Allocate(detail::kAdditionalData, commandSize, commandAlignment); + } + bool GetNewBlock(size_t minimumSize); CommandBlocks mBlocks; size_t mLastAllocationSize = 2048; // Pointers to the current range of allocation in the block. Guaranteed to allow for at - // least one uint32_t if not nullptr, so that the special EndOfBlock command id can always + // least one uint32_t if not nullptr, so that the special kEndOfBlock command id can always // be written. Nullptr iff the blocks were moved out. uint8_t* mCurrentPtr = nullptr; uint8_t* mEndPtr = nullptr;