commit 9a725c89cfaebafa27921c95fbae59b622843329 Author: Luke Street Date: Wed Jul 27 11:25:25 2022 -0400 Initial commit diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..c144259 --- /dev/null +++ b/.clang-format @@ -0,0 +1,29 @@ +--- +BasedOnStyle: LLVM +ColumnLimit: 120 +UseTab: Never +TabWidth: 2 +--- +Language: Cpp +DerivePointerAlignment: false +PointerAlignment: Left +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +IndentCaseLabels: false +AllowShortBlocksOnASingleLine: Always +AlignOperands: true +AlignTrailingComments: true +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes +BreakConstructorInitializersBeforeComma: true +AlwaysBreakAfterReturnType: None +AlwaysBreakAfterDefinitionReturnType: None +AllowShortFunctionsOnASingleLine: All +Cpp11BracedListStyle: true +NamespaceIndentation: None +BinPackArguments: true +BinPackParameters: true +SortIncludes: false +AccessModifierOffset: -2 +ConstructorInitializerIndentWidth: 0 +ConstructorInitializerAllOnOneLineOrOnePerLine: true diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..103fed4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +.buildcache/ +.DS_Store +.idea/ +.vs/ +build/ +cmake-build-*/ +CMakeUserPresets.json +out/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..5ca641c --- /dev/null +++ b/.gitmodules @@ -0,0 +1,13 @@ +[submodule "extern/dawn"] + path = extern/dawn + url = https://github.com/encounter/dawn-cmake.git +[submodule "extern/SDL"] + path = extern/SDL + url = https://github.com/encounter/SDL.git + branch = merged +[submodule "extern/imgui"] + path = extern/imgui + url = https://github.com/ocornut/imgui.git +[submodule "extern/fmt"] + path = extern/fmt + url = https://github.com/fmtlib/fmt.git diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..4cc5577 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,86 @@ +cmake_minimum_required(VERSION 3.13) +project(aurora LANGUAGES C CXX) +set(CMAKE_C_STANDARD 11) +set(CMAKE_CXX_STANDARD 20) + +option(AURORA_NATIVE_MATRIX "Assume OpenGL-layout matrices, disables transposing" OFF) + +add_subdirectory(extern) +add_library(aurora STATIC + lib/aurora.cpp + lib/webgpu/gpu.cpp + lib/imgui.cpp + lib/input.cpp + lib/window.cpp + lib/dawn/BackendBinding.cpp + lib/gfx/common.cpp + lib/gfx/texture.cpp + lib/gfx/gx.cpp + lib/gfx/gx_shader.cpp + lib/gfx/texture_convert.cpp + lib/gfx/stream/shader.cpp + lib/gfx/model/shader.cpp + lib/dolphin/GXBump.cpp + lib/dolphin/GXCull.cpp + lib/dolphin/GXDispList.cpp + lib/dolphin/GXDraw.cpp + lib/dolphin/GXExtra.cpp + lib/dolphin/GXFifo.cpp + lib/dolphin/GXFrameBuffer.cpp + lib/dolphin/GXGeometry.cpp + lib/dolphin/GXGet.cpp + lib/dolphin/GXLighting.cpp + lib/dolphin/GXManage.cpp + lib/dolphin/GXPerf.cpp + lib/dolphin/GXPixel.cpp + lib/dolphin/GXTev.cpp + lib/dolphin/GXTexture.cpp + lib/dolphin/GXTransform.cpp + lib/dolphin/GXVert.cpp + lib/dolphin/vi.cpp + ) +add_library(aurora::aurora ALIAS aurora) +target_compile_definitions(aurora PUBLIC AURORA TARGET_PC) +if (AURORA_NATIVE_MATRIX) + target_compile_definitions(aurora PRIVATE AURORA_NATIVE_MATRIX) +endif () +target_include_directories(aurora PUBLIC include) +target_include_directories(aurora PRIVATE ../imgui) +if (NOT TARGET SDL2::SDL2-static) + find_package(SDL2 REQUIRED) +endif () +target_link_libraries(aurora PUBLIC SDL2::SDL2-static fmt::fmt imgui xxhash) +target_link_libraries(aurora PRIVATE dawn_native dawncpp webgpu_dawn absl::btree 
absl::flat_hash_map) +if (DAWN_ENABLE_VULKAN) + target_compile_definitions(aurora PRIVATE DAWN_ENABLE_BACKEND_VULKAN) + target_sources(aurora PRIVATE lib/dawn/VulkanBinding.cpp) +endif () +if (DAWN_ENABLE_METAL) + target_compile_definitions(aurora PRIVATE DAWN_ENABLE_BACKEND_METAL) + target_sources(aurora PRIVATE lib/dawn/MetalBinding.mm) + set_source_files_properties(lib/dawn/MetalBinding.mm PROPERTIES COMPILE_FLAGS -fobjc-arc) +endif () +if (DAWN_ENABLE_D3D12) + target_compile_definitions(aurora PRIVATE DAWN_ENABLE_BACKEND_D3D12) + target_sources(aurora PRIVATE lib/dawn/D3D12Binding.cpp) +endif () +if (DAWN_ENABLE_DESKTOP_GL OR DAWN_ENABLE_OPENGLES) + target_compile_definitions(aurora PRIVATE DAWN_ENABLE_BACKEND_OPENGL) + if (DAWN_ENABLE_DESKTOP_GL) + target_compile_definitions(aurora PRIVATE DAWN_ENABLE_BACKEND_DESKTOP_GL) + endif () + if (DAWN_ENABLE_OPENGLES) + target_compile_definitions(aurora PRIVATE DAWN_ENABLE_BACKEND_OPENGLES) + endif () + target_sources(aurora PRIVATE lib/dawn/OpenGLBinding.cpp) +endif () +if (DAWN_ENABLE_NULL) + target_compile_definitions(aurora PRIVATE DAWN_ENABLE_BACKEND_NULL) + target_sources(aurora PRIVATE lib/dawn/NullBinding.cpp) +endif () + +# Optional +add_library(aurora_main STATIC lib/main.cpp) +target_include_directories(aurora_main PUBLIC include) +target_link_libraries(aurora_main PUBLIC SDL2::SDL2main) +add_library(aurora::main ALIAS aurora_main) diff --git a/GX.md b/GX.md new file mode 100644 index 0000000..370d986 --- /dev/null +++ b/GX.md @@ -0,0 +1,266 @@ +# GX API Support + +- GXBump + - [x] GXSetNumIndStages + - [x] GXSetIndTexOrder + - [x] GXSetIndTexCoordScale + - [x] GXSetIndTexMtx + - [x] GXSetTevIndirect + - [x] GXSetTevDirect + - [x] GXSetTevIndWarp + - [ ] GXSetTevIndTile + - [ ] GXSetTevIndBumpST + - [ ] GXSetTevIndBumpXYZ + - [ ] GXSetTevIndRepeat +- GXCull + - [x] GXSetScissor + - [x] GXSetCullMode + - [ ] GXSetCoPlanar +- GXDispList + - [x] GXBeginDisplayList (stub) + - [x] GXEndDisplayList (stub) + - [x] GXCallDisplayList +- GXDraw + - [ ] GXDrawCylinder + - [ ] GXDrawTorus + - [ ] GXDrawSphere + - [ ] GXDrawCube + - [ ] GXDrawDodeca + - [ ] GXDrawOctahedron + - [ ] GXDrawIcosahedron + - [ ] GXDrawSphere1 + - [ ] GXGenNormalTable +- GXFifo + - [x] GXGetGPStatus (stub) + - [ ] GXGetFifoStatus + - [x] GXGetFifoPtrs (stub) + - [x] GXGetCPUFifo (stub) + - [x] GXGetGPFifo (stub) + - [ ] GXGetFifoBase + - [ ] GXGetFifoSize + - [ ] GXGetFifoLimits + - [ ] GXSetBreakPtCallback + - [ ] GXEnableBreakPt + - [ ] GXDisableBreakPt + - [x] GXInitFifoBase (stub) + - [x] GXInitFifoPtrs (stub) + - [ ] GXInitFifoLimits + - [x] GXSetCPUFifo (stub) + - [x] GXSetGPFifo (stub) + - [x] GXSaveCPUFifo (stub) + - [ ] GXSaveGPFifo + - [ ] GXRedirectWriteGatherPipe + - [ ] GXRestoreWriteGatherPipe + - [ ] GXSetCurrentGXThread + - [ ] GXGetCurrentGXThread + - [ ] GXGetOverflowCount + - [ ] GXResetOverflowCount +- GXFrameBuffer + - [x] GXAdjustForOverscan + - [x] GXSetDispCopySrc (stub) + - [x] GXSetTexCopySrc + - [x] GXSetDispCopyDst (stub) + - [x] GXSetTexCopyDst + - [ ] GXSetDispCopyFrame2Field + - [ ] GXSetCopyClamp + - [x] GXSetDispCopyYScale (stub) + - [x] GXSetCopyClear + - [x] GXSetCopyFilter (stub) + - [x] GXSetDispCopyGamma (stub) + - [x] GXCopyDisp (stub) + - [x] GXCopyTex + - [ ] GXGetYScaleFactor + - [ ] GXGetNumXfbLines + - [ ] GXClearBoundingBox + - [ ] GXReadBoundingBox +- GXGeometry + - [x] GXSetVtxDesc + - [x] GXSetVtxDescv + - [x] GXClearVtxDesc + - [x] GXSetVtxAttrFmt + - [ ] GXSetVtxAttrFmtv + - [x] GXSetArray + - [x] GXBegin + - [x] GXEnd 
+ - [x] GXSetTexCoordGen2 + - [x] GXSetNumTexGens + - [ ] GXInvalidateVtxCache + - [ ] GXSetLineWidth + - [ ] GXSetPointSize + - [ ] GXEnableTexOffsets +- GXGet + - [ ] GXGetVtxDesc + - [ ] GXGetVtxDescv + - [ ] GXGetVtxAttrFmtv + - [ ] GXGetLineWidth + - [ ] GXGetPointSize + - [x] GXGetVtxAttrFmt + - [ ] GXGetViewportv + - [x] GXGetProjectionv + - [ ] GXGetScissor + - [ ] GXGetCullMode + - [x] GXGetLightAttnA + - [x] GXGetLightAttnK + - [x] GXGetLightPos + - [x] GXGetLightDir + - [x] GXGetLightColor + - [x] GXGetTexObjData + - [x] GXGetTexObjWidth + - [x] GXGetTexObjHeight + - [x] GXGetTexObjFmt + - [x] GXGetTexObjWrapS + - [x] GXGetTexObjWrapT + - [x] GXGetTexObjMipMap + - [ ] GXGetTexObjAll + - [ ] GXGetTexObjMinFilt + - [ ] GXGetTexObjMagFilt + - [ ] GXGetTexObjMinLOD + - [ ] GXGetTexObjMaxLOD + - [ ] GXGetTexObjLODBias + - [ ] GXGetTexObjBiasClamp + - [ ] GXGetTexObjEdgeLOD + - [ ] GXGetTexObjMaxAniso + - [ ] GXGetTexObjLODAll + - [ ] GXGetTexObjTlut + - [ ] GXGetTlutObjData + - [ ] GXGetTlutObjFmt + - [ ] GXGetTlutObjNumEntries + - [ ] GXGetTlutObjAll + - [ ] GXGetTexRegionAll + - [ ] GXGetTlutRegionAll +- GXLighting + - [x] GXInitLightAttn + - [x] GXInitLightAttnA + - [x] GXInitLightAttnK + - [x] GXInitLightSpot + - [x] GXInitLightDistAttn + - [x] GXInitLightPos + - [x] GXInitLightColor + - [x] GXLoadLightObjImm + - [ ] GXLoadLightObjIndx + - [x] GXSetChanAmbColor + - [x] GXSetChanMatColor + - [x] GXSetNumChans + - [x] GXInitLightDir + - [x] GXInitSpecularDir + - [x] GXInitSpecularDirHA + - [x] GXSetChanCtrl +- GXManage + - [x] GXInit (stub) + - [ ] GXAbortFrame + - [ ] GXSetDrawSync + - [ ] GXReadDrawSync + - [ ] GXSetDrawSyncCallback + - [x] GXDrawDone (stub) + - [x] GXSetDrawDone (stub) + - [ ] GXWaitDrawDone + - [x] GXSetDrawDoneCallback (stub) + - [ ] GXSetResetWritePipe + - [x] GXFlush (stub) + - [ ] GXResetWriteGatherPipe + - [x] GXPixModeSync (stub) + - [x] GXTexModeSync (stub) + - [ ] IsWriteGatherBufferEmpty + - [ ] GXSetMisc +- GXPerf + - [ ] GXSetGPMetric + - [ ] GXClearGPMetric + - [ ] GXReadGPMetric + - [ ] GXReadGP0Metric + - [ ] GXReadGP1Metric + - [ ] GXReadMemMetric + - [ ] GXClearMemMetric + - [ ] GXReadPixMetric + - [ ] GXClearPixMetric + - [ ] GXSetVCacheMetric + - [ ] GXReadVCacheMetric + - [ ] GXClearVCacheMetric + - [ ] GXReadXfRasMetric + - [ ] GXInitXfRasMetric + - [ ] GXReadClksPerVtx +- GXPixel + - [x] GXSetFog + - [x] GXSetFogColor + - [ ] GXInitFogAdjTable + - [ ] GXSetFogRangeAdj + - [x] GXSetBlendMode + - [x] GXSetColorUpdate + - [x] GXSetAlphaUpdate + - [x] GXSetZMode + - [ ] GXSetZCompLoc + - [x] GXSetPixelFmt (stub) + - [x] GXSetDither (stub) + - [x] GXSetDstAlpha + - [ ] GXSetFieldMask + - [ ] GXSetFieldMode +- GXTev + - [x] GXSetTevOp + - [x] GXSetTevColorIn + - [x] GXSetTevAlphaIn + - [x] GXSetTevColorOp + - [x] GXSetTevAlphaOp + - [x] GXSetTevColor + - [x] GXSetTevColorS10 + - [x] GXSetAlphaCompare + - [x] GXSetTevOrder + - [ ] GXSetZTexture + - [x] GXSetNumTevStages + - [x] GXSetTevKColor + - [x] GXSetTevKColorSel + - [x] GXSetTevKAlphaSel + - [x] GXSetTevSwapMode + - [x] GXSetTevSwapModeTable +- GXTexture + - [x] GXInitTexObj + - [x] GXInitTexObjCI + - [x] GXInitTexObjLOD + - [x] GXInitTexObjData + - [x] GXInitTexObjWrapMode + - [x] GXInitTexObjTlut + - [ ] GXInitTexObjFilter + - [ ] GXInitTexObjMaxLOD + - [ ] GXInitTexObjMinLOD + - [ ] GXInitTexObjLODBias + - [ ] GXInitTexObjBiasClamp + - [ ] GXInitTexObjEdgeLOD + - [ ] GXInitTexObjMaxAniso + - [ ] GXInitTexObjUserData + - [ ] GXGetTexObjUserData + - [x] GXLoadTexObj + - [x] 
GXGetTexBufferSize + - [x] GXInitTlutObj + - [x] GXLoadTlut + - [ ] GXInitTexCacheRegion + - [ ] GXInitTexPreLoadRegion + - [ ] GXInitTlutRegion + - [ ] GXInvalidateTexRegion + - [x] GXInvalidateTexAll (stub) + - [ ] GXPreLoadEntireTexture + - [ ] GXSetTexRegionCallback + - [ ] GXSetTlutRegionCallback + - [ ] GXLoadTexObjPreLoaded + - [ ] GXSetTexCoordScaleManually + - [ ] GXSetTexCoordCylWrap + - [ ] GXSetTexCoordBias +- GXTransform + - [x] GXSetProjection + - [ ] GXSetProjectionv + - [x] GXLoadPosMtxImm + - [ ] GXLoadPosMtxIndx + - [x] GXLoadNrmMtxImm + - [ ] GXLoadNrmMtxImm3x3 + - [ ] GXLoadNrmMtxIndx3x3 + - [x] GXSetCurrentMtx + - [x] GXLoadTexMtxImm + - [ ] GXLoadTexMtxIndx + - [ ] GXProject + - [x] GXSetViewport + - [x] GXSetViewportJitter + - [ ] GXSetZScaleOffset + - [ ] GXSetScissorBoxOffset + - [ ] GXSetClipMode +- GXVert + - [x] GXPosition\[n]\[t] + - [x] GXNormal\[n]\[t] + - [x] GXColor\[n]\[t] + - [x] GXTexCoord\[n]\[t] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..baf2790 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) 2022 Luke Street + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..84d25f9 --- /dev/null +++ b/README.md @@ -0,0 +1,33 @@ +# Aurora + +Aurora is a source-level GameCube & Wii compatibility layer intended for use with game reverse engineering projects. + +Originally developed for use in [Metaforce](https://github.com/AxioDL/metaforce), a Metroid Prime reverse engineering +project. + +### Features + +- GX compatibility layer + - Graphics API support: D3D12, Vulkan, Metal, OpenGL 4.4+ and OpenGL ES 3.1+ + - *Planned: deko3d backend for Switch* +- Application layer using SDL + - Runs on Windows, Linux, macOS, iOS, tvOS (Android coming soon) + - Audio support with SDL_audio +- PAD compatibility layer + - Utilizes SDL_GameController for wide controller support, including GameCube controllers. + - *Planned: Wii remote support* +- [Dear ImGui](https://github.com/ocornut/imgui) built-in for UI + +### GX + +The GX compatibility layer is built on top of [WebGPU](https://www.w3.org/TR/webgpu/), a cross-platform graphics API +abstraction layer. WebGPU allows targeting all major platforms simultaneously with minimal overhead. + +Currently, the WebGPU implementation used is Chromium's [Dawn](https://dawn.googlesource.com/dawn/). + +See [GX API support](GX.md) for more information. 
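
For a rough sense of what this looks like in practice, here is a minimal sketch of immediate-mode drawing through the layer, using only calls that GX.md lists as implemented. The `dolphin/gx.h` include path is an assumption (SDK-style layout); use whatever GX header your project already builds against.

```c
#include <dolphin/gx.h> /* assumed SDK-style header path; adjust to your project */

/* Draw one vertex-colored triangle through the GX compatibility layer. */
static void draw_triangle(void) {
  /* One color channel, no texgens, single pass-through TEV stage. */
  GXSetNumChans(1);
  GXSetNumTexGens(0);
  GXSetNumTevStages(1);
  GXSetTevOrder(GX_TEVSTAGE0, GX_TEXCOORD_NULL, GX_TEXMAP_NULL, GX_COLOR0A0);
  GXSetTevOp(GX_TEVSTAGE0, GX_PASSCLR);

  /* Direct (immediate) position + color vertex format. */
  GXClearVtxDesc();
  GXSetVtxDesc(GX_VA_POS, GX_DIRECT);
  GXSetVtxDesc(GX_VA_CLR0, GX_DIRECT);
  GXSetVtxAttrFmt(GX_VTXFMT0, GX_VA_POS, GX_POS_XYZ, GX_F32, 0);
  GXSetVtxAttrFmt(GX_VTXFMT0, GX_VA_CLR0, GX_CLR_RGBA, GX_RGBA8, 0);

  GXBegin(GX_TRIANGLES, GX_VTXFMT0, 3);
  GXPosition3f32(0.f, 1.f, 0.f);   GXColor4u8(255, 0, 0, 255);
  GXPosition3f32(-1.f, -1.f, 0.f); GXColor4u8(0, 255, 0, 255);
  GXPosition3f32(1.f, -1.f, 0.f);  GXColor4u8(0, 0, 255, 255);
  GXEnd();
}
```

Each call used above appears checked off in the GX.md list; unchecked entries there are not yet implemented.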
+ +### PAD + +The PAD compatibility layer utilizes SDL_GameController to automatically support & provide mappings for hundreds of +controllers across all platforms. diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt new file mode 100644 index 0000000..726a13d --- /dev/null +++ b/extern/CMakeLists.txt @@ -0,0 +1,38 @@ +if (CMAKE_SYSTEM_NAME STREQUAL Windows) + set(DAWN_ENABLE_DESKTOP_GL ON CACHE BOOL "Enable compilation of the OpenGL backend" FORCE) +endif () +if (CMAKE_SYSTEM_NAME STREQUAL Linux) + set(DAWN_ENABLE_OPENGLES ON CACHE BOOL "Enable compilation of the OpenGL ES backend" FORCE) +endif () +add_subdirectory(dawn EXCLUDE_FROM_ALL) +if (DAWN_ENABLE_VULKAN) + target_compile_definitions(dawn_native PRIVATE + DAWN_ENABLE_VULKAN_VALIDATION_LAYERS + DAWN_VK_DATA_DIR="vulkandata") +endif () +if (MSVC) + target_compile_options(dawn_native PRIVATE /bigobj) +else () + target_compile_options(SPIRV-Tools-static PRIVATE -Wno-implicit-fallthrough) + target_compile_options(SPIRV-Tools-opt PRIVATE -Wno-implicit-fallthrough) +endif () + +if (WIN32) + set(SDL_LIBC ON CACHE BOOL "Use the system C library" FORCE) +endif () +add_subdirectory(SDL EXCLUDE_FROM_ALL) +if (NOT MSVC) + target_compile_options(SDL2-static PRIVATE -Wno-implicit-fallthrough -Wno-shadow) +endif () + +add_subdirectory(xxhash EXCLUDE_FROM_ALL) +add_subdirectory(fmt EXCLUDE_FROM_ALL) + +add_library(imgui + imgui/imgui.cpp + imgui/imgui_demo.cpp + imgui/imgui_draw.cpp + imgui/imgui_tables.cpp + imgui/imgui_widgets.cpp + imgui/misc/cpp/imgui_stdlib.cpp) +target_include_directories(imgui PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/imgui) diff --git a/extern/SDL b/extern/SDL new file mode 160000 index 0000000..34458fe --- /dev/null +++ b/extern/SDL @@ -0,0 +1 @@ +Subproject commit 34458fe9f4a11a5b15ced6193e4c5d7ef97cca36 diff --git a/extern/dawn b/extern/dawn new file mode 160000 index 0000000..64a23ce --- /dev/null +++ b/extern/dawn @@ -0,0 +1 @@ +Subproject commit 64a23ce0ede5f232cc209b69d64164ede6810b65 diff --git a/extern/fmt b/extern/fmt new file mode 160000 index 0000000..81f1cc7 --- /dev/null +++ b/extern/fmt @@ -0,0 +1 @@ +Subproject commit 81f1cc74a776581cdef8659d176049d3aeb743c6 diff --git a/extern/imgui b/extern/imgui new file mode 160000 index 0000000..e99c4fc --- /dev/null +++ b/extern/imgui @@ -0,0 +1 @@ +Subproject commit e99c4fc6688e218a0e5da50f56638aebab45da9b diff --git a/extern/xxhash/CMakeLists.txt b/extern/xxhash/CMakeLists.txt new file mode 100644 index 0000000..2fa4353 --- /dev/null +++ b/extern/xxhash/CMakeLists.txt @@ -0,0 +1,3 @@ +add_library(xxhash xxhash_impl.c) +target_include_directories(xxhash PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_compile_definitions(xxhash INTERFACE XXH_STATIC_LINKING_ONLY) diff --git a/extern/xxhash/LICENSE b/extern/xxhash/LICENSE new file mode 100644 index 0000000..e4c5da7 --- /dev/null +++ b/extern/xxhash/LICENSE @@ -0,0 +1,26 @@ +xxHash Library +Copyright (c) 2012-2021 Yann Collet +All rights reserved. + +BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/extern/xxhash/xxh_x86dispatch.c b/extern/xxhash/xxh_x86dispatch.c new file mode 100644 index 0000000..bec93bf --- /dev/null +++ b/extern/xxhash/xxh_x86dispatch.c @@ -0,0 +1,770 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2020-2021 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + + +/*! + * @file xxh_x86dispatch.c + * + * Automatic dispatcher code for the @ref XXH3_family on x86-based targets. + * + * Optional add-on. + * + * **Compile this file with the default flags for your target.** Do not compile + * with flags like `-mavx*`, `-march=native`, or `/arch:AVX*`, there will be + * an error. See @ref XXH_X86DISPATCH_ALLOW_AVX for details. + * + * @defgroup dispatch x86 Dispatcher + * @{ + */ + +#if defined (__cplusplus) +extern "C" { +#endif + +#if !(defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)) +# error "Dispatching is currently only supported on x86 and x86_64." +#endif + +/*! + * @def XXH_X86DISPATCH_ALLOW_AVX + * @brief Disables the AVX sanity check. + * + * Don't compile xxh_x86dispatch.c with options like `-mavx*`, `-march=native`, + * or `/arch:AVX*`. 
It is intended to be compiled for the minimum target, and + * it selectively enables SSE2, AVX2, and AVX512 when it is needed. + * + * Using this option _globally_ allows this feature, and therefore makes it + * undefined behavior to execute on any CPU without said feature. + * + * Even if the source code isn't directly using AVX intrinsics in a function, + * the compiler can still generate AVX code from autovectorization and by + * "upgrading" SSE2 intrinsics to use the VEX prefixes (a.k.a. AVX128). + * + * Use the same flags that you use to compile the rest of the program; this + * file will safely generate SSE2, AVX2, and AVX512 without these flags. + * + * Define XXH_X86DISPATCH_ALLOW_AVX to ignore this check, and feel free to open + * an issue if there is a target in the future where AVX is a default feature. + */ +#ifdef XXH_DOXYGEN +# define XXH_X86DISPATCH_ALLOW_AVX +#endif + +#if defined(__AVX__) && !defined(XXH_X86DISPATCH_ALLOW_AVX) +# error "Do not compile xxh_x86dispatch.c with AVX enabled! See the comment above." +#endif + +#ifdef __has_include +# define XXH_HAS_INCLUDE(header) __has_include(header) +#else +# define XXH_HAS_INCLUDE(header) 0 +#endif + +/*! + * @def XXH_DISPATCH_SCALAR + * @brief Enables/dispatching the scalar code path. + * + * If this is defined to 0, SSE2 support is assumed. This reduces code size + * when the scalar path is not needed. + * + * This is automatically defined to 0 when... + * - SSE2 support is enabled in the compiler + * - Targeting x86_64 + * - Targeting Android x86 + * - Targeting macOS + */ +#ifndef XXH_DISPATCH_SCALAR +# if defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) /* SSE2 on by default */ \ + || defined(__x86_64__) || defined(_M_X64) /* x86_64 */ \ + || defined(__ANDROID__) || defined(__APPLEv__) /* Android or macOS */ +# define XXH_DISPATCH_SCALAR 0 /* disable */ +# else +# define XXH_DISPATCH_SCALAR 1 +# endif +#endif +/*! + * @def XXH_DISPATCH_AVX2 + * @brief Enables/disables dispatching for AVX2. + * + * This is automatically detected if it is not defined. + * - GCC 4.7 and later are known to support AVX2, but >4.9 is required for + * to get the AVX2 intrinsics and typedefs without -mavx -mavx2. + * - Visual Studio 2013 Update 2 and later are known to support AVX2. + * - The GCC/Clang internal header `` is detected. While this is + * not allowed to be included directly, it still appears in the builtin + * include path and is detectable with `__has_include`. + * + * @see XXH_AVX2 + */ +#ifndef XXH_DISPATCH_AVX2 +# if (defined(__GNUC__) && (__GNUC__ > 4)) /* GCC 5.0+ */ \ + || (defined(_MSC_VER) && _MSC_VER >= 1900) /* VS 2015+ */ \ + || (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 180030501) /* VS 2013 Update 2 */ \ + || XXH_HAS_INCLUDE() /* GCC/Clang internal header */ +# define XXH_DISPATCH_AVX2 1 /* enable dispatch towards AVX2 */ +# else +# define XXH_DISPATCH_AVX2 0 +# endif +#endif /* XXH_DISPATCH_AVX2 */ + +/*! + * @def XXH_DISPATCH_AVX512 + * @brief Enables/disables dispatching for AVX512. + * + * Automatically detected if one of the following conditions is met: + * - GCC 4.9 and later are known to support AVX512. + * - Visual Studio 2017 and later are known to support AVX2. + * - The GCC/Clang internal header `` is detected. While this + * is not allowed to be included directly, it still appears in the builtin + * include path and is detectable with `__has_include`. 
+ * + * @see XXH_AVX512 + */ +#ifndef XXH_DISPATCH_AVX512 +# if (defined(__GNUC__) \ + && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9))) /* GCC 4.9+ */ \ + || (defined(_MSC_VER) && _MSC_VER >= 1910) /* VS 2017+ */ \ + || XXH_HAS_INCLUDE() /* GCC/Clang internal header */ +# define XXH_DISPATCH_AVX512 1 /* enable dispatch towards AVX512 */ +# else +# define XXH_DISPATCH_AVX512 0 +# endif +#endif /* XXH_DISPATCH_AVX512 */ + +/*! + * @def XXH_TARGET_SSE2 + * @brief Allows a function to be compiled with SSE2 intrinsics. + * + * Uses `__attribute__((__target__("sse2")))` on GCC to allow SSE2 to be used + * even with `-mno-sse2`. + * + * @def XXH_TARGET_AVX2 + * @brief Like @ref XXH_TARGET_SSE2, but for AVX2. + * + * @def XXH_TARGET_AVX512 + * @brief Like @ref XXH_TARGET_SSE2, but for AVX512. + */ +#if defined(__GNUC__) +# include /* SSE2 */ +# if XXH_DISPATCH_AVX2 || XXH_DISPATCH_AVX512 +# include /* AVX2, AVX512F */ +# endif +# define XXH_TARGET_SSE2 __attribute__((__target__("sse2"))) +# define XXH_TARGET_AVX2 __attribute__((__target__("avx2"))) +# define XXH_TARGET_AVX512 __attribute__((__target__("avx512f"))) +#elif defined(_MSC_VER) +# include +# define XXH_TARGET_SSE2 +# define XXH_TARGET_AVX2 +# define XXH_TARGET_AVX512 +#else +# error "Dispatching is currently not supported for your compiler." +#endif + +#ifdef XXH_DISPATCH_DEBUG +/* debug logging */ +# include +# define XXH_debugPrint(str) { fprintf(stderr, "DEBUG: xxHash dispatch: %s \n", str); fflush(NULL); } +#else +# define XXH_debugPrint(str) ((void)0) +# undef NDEBUG /* avoid redefinition */ +# define NDEBUG +#endif +#include + +#define XXH_INLINE_ALL +#define XXH_X86DISPATCH +#include "xxhash.h" + +/* + * Support both AT&T and Intel dialects + * + * GCC doesn't convert AT&T syntax to Intel syntax, and will error out if + * compiled with -masm=intel. Instead, it supports dialect switching with + * curly braces: { AT&T syntax | Intel syntax } + * + * Clang's integrated assembler automatically converts AT&T syntax to Intel if + * needed, making the dialect switching useless (it isn't even supported). + * + * Note: Comments are written in the inline assembly itself. + */ +#ifdef __clang__ +# define XXH_I_ATT(intel, att) att "\n\t" +#else +# define XXH_I_ATT(intel, att) "{" att "|" intel "}\n\t" +#endif + +/*! + * @internal + * @brief Runs CPUID. + * + * @param eax , ecx The parameters to pass to CPUID, %eax and %ecx respectively. + * @param abcd The array to store the result in, `{ eax, ebx, ecx, edx }` + */ +static void XXH_cpuid(xxh_u32 eax, xxh_u32 ecx, xxh_u32* abcd) +{ +#if defined(_MSC_VER) + __cpuidex(abcd, eax, ecx); +#else + xxh_u32 ebx, edx; +# if defined(__i386__) && defined(__PIC__) + __asm__( + "# Call CPUID\n\t" + "#\n\t" + "# On 32-bit x86 with PIC enabled, we are not allowed to overwrite\n\t" + "# EBX, so we use EDI instead.\n\t" + XXH_I_ATT("mov edi, ebx", "movl %%ebx, %%edi") + XXH_I_ATT("cpuid", "cpuid" ) + XXH_I_ATT("xchg edi, ebx", "xchgl %%ebx, %%edi") + : "=D" (ebx), +# else + __asm__( + "# Call CPUID\n\t" + XXH_I_ATT("cpuid", "cpuid") + : "=b" (ebx), +# endif + "+a" (eax), "+c" (ecx), "=d" (edx)); + abcd[0] = eax; + abcd[1] = ebx; + abcd[2] = ecx; + abcd[3] = edx; +#endif +} + +/* + * Modified version of Intel's guide + * https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family + */ + +#if XXH_DISPATCH_AVX2 || XXH_DISPATCH_AVX512 +/*! + * @internal + * @brief Runs `XGETBV`. 
+ * + * While the CPU may support AVX2, the operating system might not properly save + * the full YMM/ZMM registers. + * + * xgetbv is used for detecting this: Any compliant operating system will define + * a set of flags in the xcr0 register indicating how it saves the AVX registers. + * + * You can manually disable this flag on Windows by running, as admin: + * + * bcdedit.exe /set xsavedisable 1 + * + * and rebooting. Run the same command with 0 to re-enable it. + */ +static xxh_u64 XXH_xgetbv(void) +{ +#if defined(_MSC_VER) + return _xgetbv(0); /* min VS2010 SP1 compiler is required */ +#else + xxh_u32 xcr0_lo, xcr0_hi; + __asm__( + "# Call XGETBV\n\t" + "#\n\t" + "# Older assemblers (e.g. macOS's ancient GAS version) don't support\n\t" + "# the XGETBV opcode, so we encode it by hand instead.\n\t" + "# See for details.\n\t" + ".byte 0x0f, 0x01, 0xd0\n\t" + : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0)); + return xcr0_lo | ((xxh_u64)xcr0_hi << 32); +#endif +} +#endif + +#define XXH_SSE2_CPUID_MASK (1 << 26) +#define XXH_OSXSAVE_CPUID_MASK ((1 << 26) | (1 << 27)) +#define XXH_AVX2_CPUID_MASK (1 << 5) +#define XXH_AVX2_XGETBV_MASK ((1 << 2) | (1 << 1)) +#define XXH_AVX512F_CPUID_MASK (1 << 16) +#define XXH_AVX512F_XGETBV_MASK ((7 << 5) | (1 << 2) | (1 << 1)) + +/*! + * @internal + * @brief Returns the best XXH3 implementation. + * + * Runs various CPUID/XGETBV tests to try and determine the best implementation. + * + * @return The best @ref XXH_VECTOR implementation. + * @see XXH_VECTOR_TYPES + */ +static int XXH_featureTest(void) +{ + xxh_u32 abcd[4]; + xxh_u32 max_leaves; + int best = XXH_SCALAR; +#if XXH_DISPATCH_AVX2 || XXH_DISPATCH_AVX512 + xxh_u64 xgetbv_val; +#endif +#if defined(__GNUC__) && defined(__i386__) + xxh_u32 cpuid_supported; + __asm__( + "# For the sake of ruthless backwards compatibility, check if CPUID\n\t" + "# is supported in the EFLAGS on i386.\n\t" + "# This is not necessary on x86_64 - CPUID is mandatory.\n\t" + "# The ID flag (bit 21) in the EFLAGS register indicates support\n\t" + "# for the CPUID instruction. If a software procedure can set and\n\t" + "# clear this flag, the processor executing the procedure supports\n\t" + "# the CPUID instruction.\n\t" + "# \n\t" + "#\n\t" + "# Routine is from .\n\t" + + "# Save EFLAGS\n\t" + XXH_I_ATT("pushfd", "pushfl" ) + "# Store EFLAGS\n\t" + XXH_I_ATT("pushfd", "pushfl" ) + "# Invert the ID bit in stored EFLAGS\n\t" + XXH_I_ATT("xor dword ptr[esp], 0x200000", "xorl $0x200000, (%%esp)") + "# Load stored EFLAGS (with ID bit inverted)\n\t" + XXH_I_ATT("popfd", "popfl" ) + "# Store EFLAGS again (ID bit may or not be inverted)\n\t" + XXH_I_ATT("pushfd", "pushfl" ) + "# eax = modified EFLAGS (ID bit may or may not be inverted)\n\t" + XXH_I_ATT("pop eax", "popl %%eax" ) + "# eax = whichever bits were changed\n\t" + XXH_I_ATT("xor eax, dword ptr[esp]", "xorl (%%esp), %%eax" ) + "# Restore original EFLAGS\n\t" + XXH_I_ATT("popfd", "popfl" ) + "# eax = zero if ID bit can't be changed, else non-zero\n\t" + XXH_I_ATT("and eax, 0x200000", "andl $0x200000, %%eax" ) + : "=a" (cpuid_supported) :: "cc"); + + if (XXH_unlikely(!cpuid_supported)) { + XXH_debugPrint("CPUID support is not detected!"); + return best; + } + +#endif + /* Check how many CPUID pages we have */ + XXH_cpuid(0, 0, abcd); + max_leaves = abcd[0]; + + /* Shouldn't happen on hardware, but happens on some QEMU configs. 
*/ + if (XXH_unlikely(max_leaves == 0)) { + XXH_debugPrint("Max CPUID leaves == 0!"); + return best; + } + + /* Check for SSE2, OSXSAVE and xgetbv */ + XXH_cpuid(1, 0, abcd); + + /* + * Test for SSE2. The check is redundant on x86_64, but it doesn't hurt. + */ + if (XXH_unlikely((abcd[3] & XXH_SSE2_CPUID_MASK) != XXH_SSE2_CPUID_MASK)) + return best; + + XXH_debugPrint("SSE2 support detected."); + + best = XXH_SSE2; +#if XXH_DISPATCH_AVX2 || XXH_DISPATCH_AVX512 + /* Make sure we have enough leaves */ + if (XXH_unlikely(max_leaves < 7)) + return best; + + /* Test for OSXSAVE and XGETBV */ + if ((abcd[2] & XXH_OSXSAVE_CPUID_MASK) != XXH_OSXSAVE_CPUID_MASK) + return best; + + /* CPUID check for AVX features */ + XXH_cpuid(7, 0, abcd); + + xgetbv_val = XXH_xgetbv(); +#if XXH_DISPATCH_AVX2 + /* Validate that AVX2 is supported by the CPU */ + if ((abcd[1] & XXH_AVX2_CPUID_MASK) != XXH_AVX2_CPUID_MASK) + return best; + + /* Validate that the OS supports YMM registers */ + if ((xgetbv_val & XXH_AVX2_XGETBV_MASK) != XXH_AVX2_XGETBV_MASK) { + XXH_debugPrint("AVX2 supported by the CPU, but not the OS."); + return best; + } + + /* AVX2 supported */ + XXH_debugPrint("AVX2 support detected."); + best = XXH_AVX2; +#endif +#if XXH_DISPATCH_AVX512 + /* Check if AVX512F is supported by the CPU */ + if ((abcd[1] & XXH_AVX512F_CPUID_MASK) != XXH_AVX512F_CPUID_MASK) { + XXH_debugPrint("AVX512F not supported by CPU"); + return best; + } + + /* Validate that the OS supports ZMM registers */ + if ((xgetbv_val & XXH_AVX512F_XGETBV_MASK) != XXH_AVX512F_XGETBV_MASK) { + XXH_debugPrint("AVX512F supported by the CPU, but not the OS."); + return best; + } + + /* AVX512F supported */ + XXH_debugPrint("AVX512F support detected."); + best = XXH_AVX512; +#endif +#endif + return best; +} + + +/* === Vector implementations === */ + +/*! + * @internal + * @brief Defines the various dispatch functions. + * + * TODO: Consolidate? + * + * @param suffix The suffix for the functions, e.g. sse2 or scalar + * @param target XXH_TARGET_* or empty. 
+ */ +#define XXH_DEFINE_DISPATCH_FUNCS(suffix, target) \ + \ +/* === XXH3, default variants === */ \ + \ +XXH_NO_INLINE target XXH64_hash_t \ +XXHL64_default_##suffix(const void* XXH_RESTRICT input, size_t len) \ +{ \ + return XXH3_hashLong_64b_internal( \ + input, len, XXH3_kSecret, sizeof(XXH3_kSecret), \ + XXH3_accumulate_512_##suffix, XXH3_scrambleAcc_##suffix \ + ); \ +} \ + \ +/* === XXH3, Seeded variants === */ \ + \ +XXH_NO_INLINE target XXH64_hash_t \ +XXHL64_seed_##suffix(const void* XXH_RESTRICT input, size_t len, \ + XXH64_hash_t seed) \ +{ \ + return XXH3_hashLong_64b_withSeed_internal( \ + input, len, seed, XXH3_accumulate_512_##suffix, \ + XXH3_scrambleAcc_##suffix, XXH3_initCustomSecret_##suffix \ + ); \ +} \ + \ +/* === XXH3, Secret variants === */ \ + \ +XXH_NO_INLINE target XXH64_hash_t \ +XXHL64_secret_##suffix(const void* XXH_RESTRICT input, size_t len, \ + const void* secret, size_t secretLen) \ +{ \ + return XXH3_hashLong_64b_internal( \ + input, len, secret, secretLen, \ + XXH3_accumulate_512_##suffix, XXH3_scrambleAcc_##suffix \ + ); \ +} \ + \ +/* === XXH3 update variants === */ \ + \ +XXH_NO_INLINE target XXH_errorcode \ +XXH3_update_##suffix(XXH3_state_t* state, const void* input, size_t len) \ +{ \ + return XXH3_update(state, (const xxh_u8*)input, len, \ + XXH3_accumulate_512_##suffix, XXH3_scrambleAcc_##suffix); \ +} \ + \ +/* === XXH128 default variants === */ \ + \ +XXH_NO_INLINE target XXH128_hash_t \ +XXHL128_default_##suffix(const void* XXH_RESTRICT input, size_t len) \ +{ \ + return XXH3_hashLong_128b_internal( \ + input, len, XXH3_kSecret, sizeof(XXH3_kSecret), \ + XXH3_accumulate_512_##suffix, XXH3_scrambleAcc_##suffix \ + ); \ +} \ + \ +/* === XXH128 Secret variants === */ \ + \ +XXH_NO_INLINE target XXH128_hash_t \ +XXHL128_secret_##suffix(const void* XXH_RESTRICT input, size_t len, \ + const void* XXH_RESTRICT secret, size_t secretLen) \ +{ \ + return XXH3_hashLong_128b_internal( \ + input, len, (const xxh_u8*)secret, secretLen, \ + XXH3_accumulate_512_##suffix, XXH3_scrambleAcc_##suffix); \ +} \ + \ +/* === XXH128 Seeded variants === */ \ + \ +XXH_NO_INLINE target XXH128_hash_t \ +XXHL128_seed_##suffix(const void* XXH_RESTRICT input, size_t len, \ + XXH64_hash_t seed) \ +{ \ + return XXH3_hashLong_128b_withSeed_internal(input, len, seed, \ + XXH3_accumulate_512_##suffix, XXH3_scrambleAcc_##suffix, \ + XXH3_initCustomSecret_##suffix); \ +} + +/* End XXH_DEFINE_DISPATCH_FUNCS */ + +#if XXH_DISPATCH_SCALAR +XXH_DEFINE_DISPATCH_FUNCS(scalar, /* nothing */) +#endif +XXH_DEFINE_DISPATCH_FUNCS(sse2, XXH_TARGET_SSE2) +#if XXH_DISPATCH_AVX2 +XXH_DEFINE_DISPATCH_FUNCS(avx2, XXH_TARGET_AVX2) +#endif +#if XXH_DISPATCH_AVX512 +XXH_DEFINE_DISPATCH_FUNCS(avx512, XXH_TARGET_AVX512) +#endif +#undef XXH_DEFINE_DISPATCH_FUNCS + +/* ==== Dispatchers ==== */ + +typedef XXH64_hash_t (*XXH3_dispatchx86_hashLong64_default)(const void* XXH_RESTRICT, size_t); + +typedef XXH64_hash_t (*XXH3_dispatchx86_hashLong64_withSeed)(const void* XXH_RESTRICT, size_t, XXH64_hash_t); + +typedef XXH64_hash_t (*XXH3_dispatchx86_hashLong64_withSecret)(const void* XXH_RESTRICT, size_t, const void* XXH_RESTRICT, size_t); + +typedef XXH_errorcode (*XXH3_dispatchx86_update)(XXH3_state_t*, const void*, size_t); + +typedef struct { + XXH3_dispatchx86_hashLong64_default hashLong64_default; + XXH3_dispatchx86_hashLong64_withSeed hashLong64_seed; + XXH3_dispatchx86_hashLong64_withSecret hashLong64_secret; + XXH3_dispatchx86_update update; +} XXH_dispatchFunctions_s; + +#define XXH_NB_DISPATCHES 4 + 
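
The typedefs above feed the per-width dispatch tables that follow; the dispatch wrappers check whether the selected table (`XXH_g_dispatch` / `XXH_g_dispatch128`) has been filled and call `XXH_setDispatch()` on first use. A minimal consumer sketch, assuming `xxhash.c` and `xxh_x86dispatch.c` are compiled with baseline flags as the comment at the top of this file requires; the 1 KiB buffer is only there to force the long-input path that actually goes through the table:

```c
#include <stdio.h>
#include <string.h>
#include "xxh_x86dispatch.h" /* remaps XXH3_64bits() to XXH3_64bits_dispatch() */

int main(void)
{
    char buf[1024];
    memset(buf, 0xA5, sizeof(buf)); /* >240 bytes, so the long-input path is used */

    /* On the first long-input call, XXH_setDispatch() runs the CPUID/XGETBV
       checks and fills the dispatch tables with the best implementation
       (scalar, SSE2, AVX2, or AVX512). Later calls reuse the cached pointers. */
    XXH64_hash_t h = XXH3_64bits(buf, sizeof(buf));
    printf("XXH3_64bits = %016llx\n", (unsigned long long)h);
    return 0;
}
```

Because selection happens at runtime, a single binary can ship SSE2, AVX2, and AVX512 code paths while still running on a baseline x86/x86_64 CPU.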
+/*! + * @internal + * @brief Table of dispatchers for @ref XXH3_64bits(). + * + * @pre The indices must match @ref XXH_VECTOR_TYPE. + */ +static const XXH_dispatchFunctions_s XXH_kDispatch[XXH_NB_DISPATCHES] = { +#if XXH_DISPATCH_SCALAR + /* Scalar */ { XXHL64_default_scalar, XXHL64_seed_scalar, XXHL64_secret_scalar, XXH3_update_scalar }, +#else + /* Scalar */ { NULL, NULL, NULL, NULL }, +#endif + /* SSE2 */ { XXHL64_default_sse2, XXHL64_seed_sse2, XXHL64_secret_sse2, XXH3_update_sse2 }, +#if XXH_DISPATCH_AVX2 + /* AVX2 */ { XXHL64_default_avx2, XXHL64_seed_avx2, XXHL64_secret_avx2, XXH3_update_avx2 }, +#else + /* AVX2 */ { NULL, NULL, NULL, NULL }, +#endif +#if XXH_DISPATCH_AVX512 + /* AVX512 */ { XXHL64_default_avx512, XXHL64_seed_avx512, XXHL64_secret_avx512, XXH3_update_avx512 } +#else + /* AVX512 */ { NULL, NULL, NULL, NULL } +#endif +}; +/*! + * @internal + * @brief The selected dispatch table for @ref XXH3_64bits(). + */ +static XXH_dispatchFunctions_s XXH_g_dispatch = { NULL, NULL, NULL, NULL }; + + +typedef XXH128_hash_t (*XXH3_dispatchx86_hashLong128_default)(const void* XXH_RESTRICT, size_t); + +typedef XXH128_hash_t (*XXH3_dispatchx86_hashLong128_withSeed)(const void* XXH_RESTRICT, size_t, XXH64_hash_t); + +typedef XXH128_hash_t (*XXH3_dispatchx86_hashLong128_withSecret)(const void* XXH_RESTRICT, size_t, const void* XXH_RESTRICT, size_t); + +typedef struct { + XXH3_dispatchx86_hashLong128_default hashLong128_default; + XXH3_dispatchx86_hashLong128_withSeed hashLong128_seed; + XXH3_dispatchx86_hashLong128_withSecret hashLong128_secret; + XXH3_dispatchx86_update update; +} XXH_dispatch128Functions_s; + + +/*! + * @internal + * @brief Table of dispatchers for @ref XXH3_128bits(). + * + * @pre The indices must match @ref XXH_VECTOR_TYPE. + */ +static const XXH_dispatch128Functions_s XXH_kDispatch128[XXH_NB_DISPATCHES] = { +#if XXH_DISPATCH_SCALAR + /* Scalar */ { XXHL128_default_scalar, XXHL128_seed_scalar, XXHL128_secret_scalar, XXH3_update_scalar }, +#else + /* Scalar */ { NULL, NULL, NULL, NULL }, +#endif + /* SSE2 */ { XXHL128_default_sse2, XXHL128_seed_sse2, XXHL128_secret_sse2, XXH3_update_sse2 }, +#if XXH_DISPATCH_AVX2 + /* AVX2 */ { XXHL128_default_avx2, XXHL128_seed_avx2, XXHL128_secret_avx2, XXH3_update_avx2 }, +#else + /* AVX2 */ { NULL, NULL, NULL, NULL }, +#endif +#if XXH_DISPATCH_AVX512 + /* AVX512 */ { XXHL128_default_avx512, XXHL128_seed_avx512, XXHL128_secret_avx512, XXH3_update_avx512 } +#else + /* AVX512 */ { NULL, NULL, NULL, NULL } +#endif +}; + +/*! + * @internal + * @brief The selected dispatch table for @ref XXH3_64bits(). + */ +static XXH_dispatch128Functions_s XXH_g_dispatch128 = { NULL, NULL, NULL, NULL }; + +/*! + * @internal + * @brief Runs a CPUID check and sets the correct dispatch tables. 
+ */ +static void XXH_setDispatch(void) +{ + int vecID = XXH_featureTest(); + XXH_STATIC_ASSERT(XXH_AVX512 == XXH_NB_DISPATCHES-1); + assert(XXH_SCALAR <= vecID && vecID <= XXH_AVX512); +#if !XXH_DISPATCH_SCALAR + assert(vecID != XXH_SCALAR); +#endif +#if !XXH_DISPATCH_AVX512 + assert(vecID != XXH_AVX512); +#endif +#if !XXH_DISPATCH_AVX2 + assert(vecID != XXH_AVX2); +#endif + XXH_g_dispatch = XXH_kDispatch[vecID]; + XXH_g_dispatch128 = XXH_kDispatch128[vecID]; +} + + +/* ==== XXH3 public functions ==== */ + +static XXH64_hash_t +XXH3_hashLong_64b_defaultSecret_selection(const void* input, size_t len, + XXH64_hash_t seed64, const xxh_u8* secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + if (XXH_g_dispatch.hashLong64_default == NULL) XXH_setDispatch(); + return XXH_g_dispatch.hashLong64_default(input, len); +} + +XXH64_hash_t XXH3_64bits_dispatch(const void* input, size_t len) +{ + return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_defaultSecret_selection); +} + +static XXH64_hash_t +XXH3_hashLong_64b_withSeed_selection(const void* input, size_t len, + XXH64_hash_t seed64, const xxh_u8* secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + if (XXH_g_dispatch.hashLong64_seed == NULL) XXH_setDispatch(); + return XXH_g_dispatch.hashLong64_seed(input, len, seed64); +} + +XXH64_hash_t XXH3_64bits_withSeed_dispatch(const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed_selection); +} + +static XXH64_hash_t +XXH3_hashLong_64b_withSecret_selection(const void* input, size_t len, + XXH64_hash_t seed64, const xxh_u8* secret, size_t secretLen) +{ + (void)seed64; + if (XXH_g_dispatch.hashLong64_secret == NULL) XXH_setDispatch(); + return XXH_g_dispatch.hashLong64_secret(input, len, secret, secretLen); +} + +XXH64_hash_t XXH3_64bits_withSecret_dispatch(const void* input, size_t len, const void* secret, size_t secretLen) +{ + return XXH3_64bits_internal(input, len, 0, secret, secretLen, XXH3_hashLong_64b_withSecret_selection); +} + +XXH_errorcode +XXH3_64bits_update_dispatch(XXH3_state_t* state, const void* input, size_t len) +{ + if (XXH_g_dispatch.update == NULL) XXH_setDispatch(); + return XXH_g_dispatch.update(state, (const xxh_u8*)input, len); +} + + +/* ==== XXH128 public functions ==== */ + +static XXH128_hash_t +XXH3_hashLong_128b_defaultSecret_selection(const void* input, size_t len, + XXH64_hash_t seed64, const void* secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + if (XXH_g_dispatch128.hashLong128_default == NULL) XXH_setDispatch(); + return XXH_g_dispatch128.hashLong128_default(input, len); +} + +XXH128_hash_t XXH3_128bits_dispatch(const void* input, size_t len) +{ + return XXH3_128bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_128b_defaultSecret_selection); +} + +static XXH128_hash_t +XXH3_hashLong_128b_withSeed_selection(const void* input, size_t len, + XXH64_hash_t seed64, const void* secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + if (XXH_g_dispatch128.hashLong128_seed == NULL) XXH_setDispatch(); + return XXH_g_dispatch128.hashLong128_seed(input, len, seed64); +} + +XXH128_hash_t XXH3_128bits_withSeed_dispatch(const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_128b_withSeed_selection); +} + +static XXH128_hash_t 
+XXH3_hashLong_128b_withSecret_selection(const void* input, size_t len, + XXH64_hash_t seed64, const void* secret, size_t secretLen) +{ + (void)seed64; + if (XXH_g_dispatch128.hashLong128_secret == NULL) XXH_setDispatch(); + return XXH_g_dispatch128.hashLong128_secret(input, len, secret, secretLen); +} + +XXH128_hash_t XXH3_128bits_withSecret_dispatch(const void* input, size_t len, const void* secret, size_t secretLen) +{ + return XXH3_128bits_internal(input, len, 0, secret, secretLen, XXH3_hashLong_128b_withSecret_selection); +} + +XXH_errorcode +XXH3_128bits_update_dispatch(XXH3_state_t* state, const void* input, size_t len) +{ + if (XXH_g_dispatch128.update == NULL) XXH_setDispatch(); + return XXH_g_dispatch128.update(state, (const xxh_u8*)input, len); +} + +#if defined (__cplusplus) +} +#endif +/*! @} */ diff --git a/extern/xxhash/xxh_x86dispatch.h b/extern/xxhash/xxh_x86dispatch.h new file mode 100644 index 0000000..417ef08 --- /dev/null +++ b/extern/xxhash/xxh_x86dispatch.h @@ -0,0 +1,85 @@ +/* + * xxHash - XXH3 Dispatcher for x86-based targets + * Copyright (C) 2020-2021 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +#ifndef XXH_X86DISPATCH_H_13563687684 +#define XXH_X86DISPATCH_H_13563687684 + +#include "xxhash.h" /* XXH64_hash_t, XXH3_state_t */ + +#if defined (__cplusplus) +extern "C" { +#endif + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_dispatch(const void* input, size_t len); +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed_dispatch(const void* input, size_t len, XXH64_hash_t seed); +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret_dispatch(const void* input, size_t len, const void* secret, size_t secretLen); +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update_dispatch(XXH3_state_t* state, const void* input, size_t len); + +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_dispatch(const void* input, size_t len); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed_dispatch(const void* input, size_t len, XXH64_hash_t seed); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret_dispatch(const void* input, size_t len, const void* secret, size_t secretLen); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update_dispatch(XXH3_state_t* state, const void* input, size_t len); + +#if defined (__cplusplus) +} +#endif + + +/* automatic replacement of XXH3 functions. + * can be disabled by setting XXH_DISPATCH_DISABLE_REPLACE */ +#ifndef XXH_DISPATCH_DISABLE_REPLACE + +# undef XXH3_64bits +# define XXH3_64bits XXH3_64bits_dispatch +# undef XXH3_64bits_withSeed +# define XXH3_64bits_withSeed XXH3_64bits_withSeed_dispatch +# undef XXH3_64bits_withSecret +# define XXH3_64bits_withSecret XXH3_64bits_withSecret_dispatch +# undef XXH3_64bits_update +# define XXH3_64bits_update XXH3_64bits_update_dispatch + +# undef XXH128 +# define XXH128 XXH3_128bits_withSeed_dispatch +# undef XXH3_128bits +# define XXH3_128bits XXH3_128bits_dispatch +# undef XXH3_128bits_withSeed +# define XXH3_128bits_withSeed XXH3_128bits_withSeed_dispatch +# undef XXH3_128bits_withSecret +# define XXH3_128bits_withSecret XXH3_128bits_withSecret_dispatch +# undef XXH3_128bits_update +# define XXH3_128bits_update XXH3_128bits_update_dispatch + +#endif /* XXH_DISPATCH_DISABLE_REPLACE */ + +#endif /* XXH_X86DISPATCH_H_13563687684 */ diff --git a/extern/xxhash/xxhash.c b/extern/xxhash/xxhash.c new file mode 100644 index 0000000..083b039 --- /dev/null +++ b/extern/xxhash/xxhash.c @@ -0,0 +1,43 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2021 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + + +/* + * xxhash.c instantiates functions defined in xxhash.h + */ + +#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ +#define XXH_IMPLEMENTATION /* access definitions */ + +#include "xxhash.h" diff --git a/extern/xxhash/xxhash.h b/extern/xxhash/xxhash.h new file mode 100644 index 0000000..962ef76 --- /dev/null +++ b/extern/xxhash/xxhash.h @@ -0,0 +1,6074 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (C) 2012-2021 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/*! + * @mainpage xxHash + * + * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed + * limits. + * + * It is proposed in four flavors, in three families: + * 1. @ref XXH32_family + * - Classic 32-bit hash function. Simple, compact, and runs on almost all + * 32-bit and 64-bit systems. + * 2. @ref XXH64_family + * - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most + * 64-bit systems (but _not_ 32-bit systems). + * 3. @ref XXH3_family + * - Modern 64-bit and 128-bit hash function family which features improved + * strength and performance across the board, especially on smaller data. + * It benefits greatly from SIMD and 64-bit without requiring it. 
+ * + * Benchmarks + * --- + * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04. + * The open source benchmark program is compiled with clang v10.0 using -O3 flag. + * + * | Hash Name | ISA ext | Width | Large Data Speed | Small Data Velocity | + * | -------------------- | ------- | ----: | ---------------: | ------------------: | + * | XXH3_64bits() | @b AVX2 | 64 | 59.4 GB/s | 133.1 | + * | MeowHash | AES-NI | 128 | 58.2 GB/s | 52.5 | + * | XXH3_128bits() | @b AVX2 | 128 | 57.9 GB/s | 118.1 | + * | CLHash | PCLMUL | 64 | 37.1 GB/s | 58.1 | + * | XXH3_64bits() | @b SSE2 | 64 | 31.5 GB/s | 133.1 | + * | XXH3_128bits() | @b SSE2 | 128 | 29.6 GB/s | 118.1 | + * | RAM sequential read | | N/A | 28.0 GB/s | N/A | + * | ahash | AES-NI | 64 | 22.5 GB/s | 107.2 | + * | City64 | | 64 | 22.0 GB/s | 76.6 | + * | T1ha2 | | 64 | 22.0 GB/s | 99.0 | + * | City128 | | 128 | 21.7 GB/s | 57.7 | + * | FarmHash | AES-NI | 64 | 21.3 GB/s | 71.9 | + * | XXH64() | | 64 | 19.4 GB/s | 71.0 | + * | SpookyHash | | 64 | 19.3 GB/s | 53.2 | + * | Mum | | 64 | 18.0 GB/s | 67.0 | + * | CRC32C | SSE4.2 | 32 | 13.0 GB/s | 57.9 | + * | XXH32() | | 32 | 9.7 GB/s | 71.9 | + * | City32 | | 32 | 9.1 GB/s | 66.0 | + * | Blake3* | @b AVX2 | 256 | 4.4 GB/s | 8.1 | + * | Murmur3 | | 32 | 3.9 GB/s | 56.1 | + * | SipHash* | | 64 | 3.0 GB/s | 43.2 | + * | Blake3* | @b SSE2 | 256 | 2.4 GB/s | 8.1 | + * | HighwayHash | | 64 | 1.4 GB/s | 6.0 | + * | FNV64 | | 64 | 1.2 GB/s | 62.7 | + * | Blake2* | | 256 | 1.1 GB/s | 5.1 | + * | SHA1* | | 160 | 0.8 GB/s | 5.6 | + * | MD5* | | 128 | 0.6 GB/s | 7.8 | + * @note + * - Hashes which require a specific ISA extension are noted. SSE2 is also noted, + * even though it is mandatory on x64. + * - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic + * by modern standards. + * - Small data velocity is a rough average of algorithm's efficiency for small + * data. For more accurate information, see the wiki. + * - More benchmarks and strength tests are found on the wiki: + * https://github.com/Cyan4973/xxHash/wiki + * + * Usage + * ------ + * All xxHash variants use a similar API. Changing the algorithm is a trivial + * substitution. + * + * @pre + * For functions which take an input and length parameter, the following + * requirements are assumed: + * - The range from [`input`, `input + length`) is valid, readable memory. + * - The only exception is if the `length` is `0`, `input` may be `NULL`. + * - For C++, the objects must have the *TriviallyCopyable* property, as the + * functions access bytes directly as if it was an array of `unsigned char`. + * + * @anchor single_shot_example + * **Single Shot** + * + * These functions are stateless functions which hash a contiguous block of memory, + * immediately returning the result. They are the easiest and usually the fastest + * option. + * + * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits() + * + * @code{.c} + * #include + * #include "xxhash.h" + * + * // Example for a function which hashes a null terminated string with XXH32(). + * XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed) + * { + * // NULL pointers are only valid if the length is zero + * size_t length = (string == NULL) ? 0 : strlen(string); + * return XXH32(string, length, seed); + * } + * @endcode + * + * @anchor streaming_example + * **Streaming** + * + * These groups of functions allow incremental hashing of unknown size, even + * more than what would fit in a size_t. 
+ * + * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset() + * + * @code{.c} + * #include + * #include + * #include "xxhash.h" + * // Example for a function which hashes a FILE incrementally with XXH3_64bits(). + * XXH64_hash_t hashFile(FILE* f) + * { + * // Allocate a state struct. Do not just use malloc() or new. + * XXH3_state_t* state = XXH3_createState(); + * assert(state != NULL && "Out of memory!"); + * // Reset the state to start a new hashing session. + * XXH3_64bits_reset(state); + * char buffer[4096]; + * size_t count; + * // Read the file in chunks + * while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) { + * // Run update() as many times as necessary to process the data + * XXH3_64bits_update(state, buffer, count); + * } + * // Retrieve the finalized hash. This will not change the state. + * XXH64_hash_t result = XXH3_64bits_digest(state); + * // Free the state. Do not use free(). + * XXH3_freeState(state); + * return result; + * } + * @endcode + * + * @file xxhash.h + * xxHash prototypes and implementation + */ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* **************************** + * INLINE mode + ******************************/ +/*! + * @defgroup public Public API + * Contains details on the public xxHash functions. + * @{ + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Exposes the implementation and marks all functions as `inline`. + * + * Use these build macros to inline xxhash into the target unit. + * Inlining improves performance on small inputs, especially when the length is + * expressed as a compile-time constant: + * + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * + * It also keeps xxHash symbols private to the unit, so they are not exported. + * + * Usage: + * @code{.c} + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * @endcode + * Do not compile and link xxhash.o as a separate object, as it is not useful. + */ +# define XXH_INLINE_ALL +# undef XXH_INLINE_ALL +/*! + * @brief Exposes the implementation without marking functions as inline. + */ +# define XXH_PRIVATE_API +# undef XXH_PRIVATE_API +/*! + * @brief Emulate a namespace by transparently prefixing all symbols. + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. 
+ */ +# define XXH_NAMESPACE /* YOUR NAME HERE */ +# undef XXH_NAMESPACE +#endif + +#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ + && !defined(XXH_INLINE_ALL_31684351384) + /* this section should be traversed only once */ +# define XXH_INLINE_ALL_31684351384 + /* give access to the advanced API, required to compile implementations */ +# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ +# define XXH_STATIC_LINKING_ONLY + /* make all functions private */ +# undef XXH_PUBLIC_API +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* note: this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif + + /* + * This part deals with the special case where a unit wants to inline xxHash, + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, + * such as part of some previously included *.h header file. + * Without further action, the new include would just be ignored, + * and functions would effectively _not_ be inlined (silent failure). + * The following macros solve this situation by prefixing all inlined names, + * avoiding naming collision with previous inclusions. + */ + /* Before that, we unconditionally #undef all symbols, + * in case they were already defined with XXH_NAMESPACE. + * They will then be redefined for XXH_INLINE_ALL + */ +# undef XXH_versionNumber + /* XXH32 */ +# undef XXH32 +# undef XXH32_createState +# undef XXH32_freeState +# undef XXH32_reset +# undef XXH32_update +# undef XXH32_digest +# undef XXH32_copyState +# undef XXH32_canonicalFromHash +# undef XXH32_hashFromCanonical + /* XXH64 */ +# undef XXH64 +# undef XXH64_createState +# undef XXH64_freeState +# undef XXH64_reset +# undef XXH64_update +# undef XXH64_digest +# undef XXH64_copyState +# undef XXH64_canonicalFromHash +# undef XXH64_hashFromCanonical + /* XXH3_64bits */ +# undef XXH3_64bits +# undef XXH3_64bits_withSecret +# undef XXH3_64bits_withSeed +# undef XXH3_64bits_withSecretandSeed +# undef XXH3_createState +# undef XXH3_freeState +# undef XXH3_copyState +# undef XXH3_64bits_reset +# undef XXH3_64bits_reset_withSeed +# undef XXH3_64bits_reset_withSecret +# undef XXH3_64bits_update +# undef XXH3_64bits_digest +# undef XXH3_generateSecret + /* XXH3_128bits */ +# undef XXH128 +# undef XXH3_128bits +# undef XXH3_128bits_withSeed +# undef XXH3_128bits_withSecret +# undef XXH3_128bits_reset +# undef XXH3_128bits_reset_withSeed +# undef XXH3_128bits_reset_withSecret +# undef XXH3_128bits_reset_withSecretandSeed +# undef XXH3_128bits_update +# undef XXH3_128bits_digest +# undef XXH128_isEqual +# undef XXH128_cmp +# undef XXH128_canonicalFromHash +# undef XXH128_hashFromCanonical + /* Finally, free the namespace itself */ +# undef XXH_NAMESPACE + + /* employ the namespace for XXH_INLINE_ALL */ +# define XXH_NAMESPACE XXH_INLINE_ + /* + * Some identifiers (enums, type names) are not symbols, + * but they must nonetheless be renamed to avoid redeclaration. + * Alternative solution: do not redeclare them. + * However, this requires some #ifdefs, and has a more dispersed impact. + * Meanwhile, renaming can be achieved in a single place. 
+ */ +# define XXH_IPREF(Id) XXH_NAMESPACE ## Id +# define XXH_OK XXH_IPREF(XXH_OK) +# define XXH_ERROR XXH_IPREF(XXH_ERROR) +# define XXH_errorcode XXH_IPREF(XXH_errorcode) +# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) +# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) +# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) +# define XXH32_state_s XXH_IPREF(XXH32_state_s) +# define XXH32_state_t XXH_IPREF(XXH32_state_t) +# define XXH64_state_s XXH_IPREF(XXH64_state_s) +# define XXH64_state_t XXH_IPREF(XXH64_state_t) +# define XXH3_state_s XXH_IPREF(XXH3_state_s) +# define XXH3_state_t XXH_IPREF(XXH3_state_t) +# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) + /* Ensure the header is parsed again, even if it was previously included */ +# undef XXHASH_H_5627135585666179 +# undef XXHASH_H_STATIC_13879238742 +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + +/* **************************************************************** + * Stable API + *****************************************************************/ +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +/*! @brief Marks a global symbol. */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +/* XXH32 */ +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +/* XXH64 */ +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +/* XXH3_64bits */ +# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) +# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) +# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) +# define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed) +# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) +# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) +# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) +# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) +# define 
XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) +# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) +# define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed) +# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) +# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) +# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret) +# define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed) +/* XXH3_128bits */ +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) +# define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed) +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) +# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + + +/* ************************************* +* Compiler specifics +***************************************/ + +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#if defined (__GNUC__) +# define XXH_CONSTF __attribute__((const)) +# define XXH_PUREF __attribute__((pure)) +# define XXH_MALLOCF __attribute__((malloc)) +#else +# define XXH_CONSTF /* disable */ +# define XXH_PUREF +# define XXH_MALLOCF +#endif + +/* ************************************* +* Version +***************************************/ +#define XXH_VERSION_MAJOR 0 +#define XXH_VERSION_MINOR 8 +#define XXH_VERSION_RELEASE 1 +/*! @brief Version number, encoded as two digits each */ +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) + +/*! + * @brief Obtains the xxHash version. + * + * This is mostly useful when xxHash is compiled as a shared library, + * since the returned value comes from the library, as opposed to header file. + * + * @return @ref XXH_VERSION_NUMBER of the invoked library. + */ +XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void); + + +/* **************************** +* Common basic types +******************************/ +#include /* size_t */ +/*! + * @brief Exit code for the streaming API. 
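+ *
+ * Most streaming functions return this code. A minimal sketch of the usual
+ * check (state creation and reset omitted for brevity):
+ * @code{.c}
+ *   if (XXH32_update(statePtr, buffer, length) != XXH_OK) {
+ *       // handle the error, e.g. abandon the hashing session
+ *   }
+ * @endcode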
+ */ +typedef enum { + XXH_OK = 0, /*!< OK */ + XXH_ERROR /*!< Error */ +} XXH_errorcode; + + +/*-********************************************************************** +* 32-bit hash +************************************************************************/ +#if defined(XXH_DOXYGEN) /* Don't show include */ +/*! + * @brief An unsigned 32-bit integer. + * + * Not necessarily defined to `uint32_t` but functionally equivalent. + */ +typedef uint32_t XXH32_hash_t; + +#elif !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + typedef uint32_t XXH32_hash_t; + +#else +# include +# if UINT_MAX == 0xFFFFFFFFUL + typedef unsigned int XXH32_hash_t; +# elif ULONG_MAX == 0xFFFFFFFFUL + typedef unsigned long XXH32_hash_t; +# else +# error "unsupported platform: need a 32-bit type" +# endif +#endif + +/*! + * @} + * + * @defgroup XXH32_family XXH32 family + * @ingroup public + * Contains functions used in the classic 32-bit xxHash algorithm. + * + * @note + * XXH32 is useful for older platforms, with no or poor 64-bit performance. + * Note that the @ref XXH3_family provides competitive speed for both 32-bit + * and 64-bit systems, and offers true 64/128 bit hash results. + * + * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families + * @see @ref XXH32_impl for implementation details + * @{ + */ + +/*! + * @brief Calculates the 32-bit hash of @p input using xxHash32. + * + * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s + * + * See @ref single_shot_example "Single Shot Example" for an example. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 32-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 32-bit hash value. + * + * @see + * XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): + * Direct equivalents for the other variants of xxHash. + * @see + * XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); + +#ifndef XXH_NO_STREAM +/*! + * Streaming functions generate the xxHash value from an incremental input. + * This method is slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * An XXH state must first be allocated using `XXH*_createState()`. + * + * Start a new hash by initializing the state with a seed using `XXH*_reset()`. + * + * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. + * + * The function returns an error code, with 0 meaning OK, and any other value + * meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. + * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a + * digest, and generate new hash values later on by invoking `XXH*_digest()`. + * + * When done, release the state using `XXH*_freeState()`. + * + * @see streaming_example at the top of @ref xxhash.h for an example. + */ + +/*! 
+ * @typedef struct XXH32_state_s XXH32_state_t + * @brief The opaque state struct for the XXH32 streaming API. + * + * @see XXH32_state_s for details. + */ +typedef struct XXH32_state_s XXH32_state_t; + +/*! + * @brief Allocates an @ref XXH32_state_t. + * + * Must be freed with XXH32_freeState(). + * @return An allocated XXH32_state_t on success, `NULL` on failure. + */ +XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void); +/*! + * @brief Frees an @ref XXH32_state_t. + * + * Must be allocated with XXH32_createState(). + * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState(). + * @return XXH_OK. + */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); +/*! + * @brief Copies one @ref XXH32_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); + +/*! + * @brief Resets an @ref XXH32_state_t to begin a new hash. + * + * This function resets and seeds a state. Call it before @ref XXH32_update(). + * + * @param statePtr The state struct to reset. + * @param seed The 32-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed); + +/*! + * @brief Consumes a block of @p input to an @ref XXH32_state_t. + * + * Call this to incrementally consume blocks of data. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); + +/*! + * @brief Returns the calculated hash value from an @ref XXH32_state_t. + * + * @note + * Calling XXH32_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated xxHash32 value from that state. + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/* + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * This the simplest and fastest format for further post-processing. + * + * However, this leaves open the question of what is the order on the byte level, + * since little and big endian conventions will store the same number differently. + * + * The canonical representation settles this issue by mandating big-endian + * convention, the same convention as human-readable numbers (large digits first). 
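+ *
+ * As a short sketch (assuming <stdio.h>), storing a single XXH32 value in
+ * canonical form could look like this:
+ *
+ * @code{.c}
+ *   void write_hash(FILE* f, XXH32_hash_t hash)
+ *   {
+ *       XXH32_canonical_t canonical;
+ *       XXH32_canonicalFromHash(&canonical, hash);      // to big endian bytes
+ *       fwrite(canonical.digest, 1, sizeof(canonical.digest), f);
+ *   }
+ * @endcode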
+ * + * When writing hash values to storage, sending them over a network, or printing + * them, it's highly recommended to use the canonical representation to ensure + * portability across a wider range of systems, present and future. + * + * The following functions allow transformation of hash values to and from + * canonical format. + */ + +/*! + * @brief Canonical (big endian) representation of @ref XXH32_hash_t. + */ +typedef struct { + unsigned char digest[4]; /*!< Hash bytes, big endian */ +} XXH32_canonical_t; + +/*! + * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t. + * + * @param dst The @ref XXH32_canonical_t pointer to be stored to. + * @param hash The @ref XXH32_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); + +/*! + * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t. + * + * @param src The @ref XXH32_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); + + +#ifdef __has_attribute +# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x) +#else +# define XXH_HAS_ATTRIBUTE(x) 0 +#endif + +/* C-language Attributes are added in C23. */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute) +# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x) +#else +# define XXH_HAS_C_ATTRIBUTE(x) 0 +#endif + +#if defined(__cplusplus) && defined(__has_cpp_attribute) +# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) +#else +# define XXH_HAS_CPP_ATTRIBUTE(x) 0 +#endif + +/* + * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute + * introduced in CPP17 and C23. + * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough + * C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough + */ +#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough) +# define XXH_FALLTHROUGH [[fallthrough]] +#elif XXH_HAS_ATTRIBUTE(__fallthrough__) +# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__)) +#else +# define XXH_FALLTHROUGH /* fallthrough */ +#endif + +/*! + * @} + * @ingroup public + * @{ + */ + +#ifndef XXH_NO_LONG_LONG +/*-********************************************************************** +* 64-bit hash +************************************************************************/ +#if defined(XXH_DOXYGEN) /* don't include */ +/*! + * @brief An unsigned 64-bit integer. + * + * Not necessarily defined to `uint64_t` but functionally equivalent. + */ +typedef uint64_t XXH64_hash_t; +#elif !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + typedef uint64_t XXH64_hash_t; +#else +# include +# if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL + /* LP64 ABI says uint64_t is unsigned long */ + typedef unsigned long XXH64_hash_t; +# else + /* the following type must have a width of 64-bit */ + typedef unsigned long long XXH64_hash_t; +# endif +#endif + +/*! + * @} + * + * @defgroup XXH64_family XXH64 family + * @ingroup public + * @{ + * Contains functions used in the classic 64-bit xxHash algorithm. + * + * @note + * XXH3 provides competitive speed for both 32-bit and 64-bit systems, + * and offers true 64/128 bit hash results. 
+ * It provides better speed for systems with vector processing capabilities. + */ + +/*! + * @brief Calculates the 64-bit hash of @p input using xxHash64. + * + * This function usually runs faster on 64-bit systems, but slower on 32-bit + * systems (see benchmark). + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit hash. + * + * @see + * XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): + * Direct equivalents for the other variants of xxHash. + * @see + * XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/*! + * @brief The opaque state struct for the XXH64 streaming API. + * + * @see XXH64_state_s for details. + */ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ +/******* Canonical representation *******/ +typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t; +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + +#ifndef XXH_NO_XXH3 + +/*! + * @} + * ************************************************************************ + * @defgroup XXH3_family XXH3 family + * @ingroup public + * @{ + * + * XXH3 is a more recent hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * Compared to XXH64, expect XXH3 to run approximately + * ~2x faster on large inputs and >3x faster on small ones, + * exact differences vary depending on platform. + * + * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic, + * but does not require it. + * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3 + * at competitive speeds, even without vector support. Further details are + * explained in the implementation. + * + * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8, + * ZVector and scalar targets. This can be controlled via the @ref XXH_VECTOR + * macro. For the x86 family, an automatic dispatcher is included separately + * in @ref xxh_x86dispatch.c. 
+ * + * XXH3 implementation is portable: + * it has a generic C90 formulation that can be compiled on any platform, + * all implementations generage exactly the same hash value on all platforms. + * Starting from v0.8.0, it's also labelled "stable", meaning that + * any future version will also generate the same hash value. + * + * XXH3 offers 2 variants, _64bits and _128bits. + * + * When only 64 bits are needed, prefer invoking the _64bits variant, as it + * reduces the amount of mixing, resulting in faster speed on small inputs. + * It's also generally simpler to manipulate a scalar return type than a struct. + * + * The API supports one-shot hashing, streaming mode, and custom secrets. + */ +/*-********************************************************************** +* XXH3 64-bit variant +************************************************************************/ + +/*! + * @brief 64-bit unseeded variant of XXH3. + * + * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of 0, however + * it may have slightly better performance due to constant propagation of the + * defaults. + * + * @see + * XXH32(), XXH64(), XXH3_128bits(): equivalent for the other xxHash algorithms + * @see + * XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants + * @see + * XXH3_64bits_reset(), XXH3_64bits_update(), XXH3_64bits_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(const void* input, size_t length); + +/*! + * @brief 64-bit seeded variant of XXH3 + * + * This variant generates a custom secret on the fly based on default secret + * altered using the `seed` value. + * + * While this operation is decently fast, note that it's not completely free. + * + * @note + * seed == 0 produces the same results as @ref XXH3_64bits(). + * + * @param input The data to hash + * @param length The length + * @param seed The 64-bit seed to alter the state. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(const void* input, size_t length, XXH64_hash_t seed); + +/*! + * The bare minimum size for a custom secret. + * + * @see + * XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(), + * XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret(). + */ +#define XXH3_SECRET_SIZE_MIN 136 + +/*! + * @brief 64-bit variant of XXH3 with a custom "secret". + * + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional collision. + * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN). + * However, the quality of the secret impacts the dispersion of the hash algorithm. + * Therefore, the secret _must_ look like a bunch of random bytes. + * Avoid "trivial" or structured data such as repeated sequences or a text document. + * Whenever in doubt about the "randomness" of the blob of bytes, + * consider employing "XXH3_generateSecret()" instead (see below). + * It will generate a proper high entropy secret derived from the blob of bytes. + * Another advantage of using XXH3_generateSecret() is that + * it guarantees that all bits within the initial blob of bytes + * will impact every bit of the output. + * This is not necessarily the case when using the blob of bytes directly + * because, when hashing _small_ inputs, only a portion of the secret is employed. 
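+ *
+ * A minimal sketch, assuming the secret was produced earlier (for example by
+ * XXH3_generateSecret()) and is kept around by the caller:
+ * @code{.c}
+ *   XXH64_hash_t hash_with_secret(const void* data, size_t len,
+ *                                 const unsigned char* secret, size_t secretSize)
+ *   {
+ *       // secretSize must be >= XXH3_SECRET_SIZE_MIN
+ *       return XXH3_64bits_withSecret(data, len, secret, secretSize);
+ *   }
+ * @endcode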
+ */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever applicable. + */ + +/*! + * @brief The state struct for the XXH3 streaming API. + * + * @see XXH3_state_s for details. + */ +typedef struct XXH3_state_s XXH3_state_t; +XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); +XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state); + +/* + * XXH3_64bits_reset(): + * Initialize with default parameters. + * digest will be equivalent to `XXH3_64bits()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr); +/* + * XXH3_64bits_reset_withSeed(): + * Generate a custom secret from `seed`, and store it into `statePtr`. + * digest will be equivalent to `XXH3_64bits_withSeed()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +/*! + * XXH3_64bits_reset_withSecret(): + * `secret` is referenced, it _must outlive_ the hash streaming session. + * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`, + * and the quality of produced hash values depends on secret's entropy + * (secret's content should look like a bunch of random bytes). + * When in doubt about the randomness of a candidate `secret`, + * consider employing `XXH3_generateSecret()` instead (see below). + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/* note : canonical representation of XXH3 is the same as XXH64 + * since they both produce XXH64_hash_t values */ + + +/*-********************************************************************** +* XXH3 128-bit variant +************************************************************************/ + +/*! + * @brief The return value from 128-bit hashes. + * + * Stored in little endian order, although the fields themselves are in native + * endianness. + */ +typedef struct { + XXH64_hash_t low64; /*!< `value & 0xFFFFFFFFFFFFFFFF` */ + XXH64_hash_t high64; /*!< `value >> 64` */ +} XXH128_hash_t; + +/*! + * @brief Unseeded 128-bit variant of XXH3 + * + * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead + * for shorter inputs. + * + * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of 0, however + * it may have slightly better performance due to constant propagation of the + * defaults. + * + * @see + * XXH32(), XXH64(), XXH3_64bits(): equivalent for the other xxHash algorithms + * @see + * XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants + * @see + * XXH3_128bits_reset(), XXH3_128bits_update(), XXH3_128bits_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(const void* data, size_t len); +/*! @brief Seeded 128-bit variant of XXH3. @see XXH3_64bits_withSeed(). 
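+ *
+ * A short sketch of consuming the two 64-bit halves of the result
+ * (assuming <stdio.h>):
+ * @code{.c}
+ *   void print_hash128(const void* data, size_t len, XXH64_hash_t seed)
+ *   {
+ *       XXH128_hash_t h = XXH3_128bits_withSeed(data, len, seed);
+ *       // h.high64 holds the upper half, h.low64 the lower half
+ *       printf("%016llx%016llx\n",
+ *              (unsigned long long)h.high64, (unsigned long long)h.low64);
+ *   }
+ * @endcode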
*/ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); +/*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_withSecret(). */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever applicable. + * + * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits(). + * Use already declared XXH3_createState() and XXH3_freeState(). + * + * All reset and streaming functions have same meaning as their 64-bit counterpart. + */ + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/* Following helper functions make it possible to compare XXH128_hast_t values. + * Since XXH128_hash_t is a structure, this capability is not offered by the language. + * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ + +/*! + * XXH128_isEqual(): + * Return: 1 if `h1` and `h2` are equal, 0 if they are not. + */ +XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); + +/*! + * @brief Compares two @ref XXH128_hash_t + * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. + * + * @return: >0 if *h128_1 > *h128_2 + * =0 if *h128_1 == *h128_2 + * <0 if *h128_1 < *h128_2 + */ +XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(const void* h128_1, const void* h128_2); + + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t; +XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash); +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src); + + +#endif /* !XXH_NO_XXH3 */ +#endif /* XXH_NO_LONG_LONG */ + +/*! + * @} + */ +#endif /* XXHASH_H_5627135585666179 */ + + + +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) +#define XXHASH_H_STATIC_13879238742 +/* **************************************************************************** + * This section contains declarations which are not guaranteed to remain stable. + * They may change in future versions, becoming incompatible with a different + * version of the library. + * These declarations should only be used with static linking. + * Never use them in association with dynamic linking! + ***************************************************************************** */ + +/* + * These definitions are only present to allow static allocation + * of XXH states, on stack or in a struct, for example. + * Never **ever** access their members directly. + */ + +/*! + * @internal + * @brief Structure for XXH32 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. 
Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH32_state_t. + * Do not access the members of this struct directly. + * @see XXH64_state_s, XXH3_state_s + */ +struct XXH32_state_s { + XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */ + XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */ + XXH32_hash_t v[4]; /*!< Accumulator lanes */ + XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */ + XXH32_hash_t reserved; /*!< Reserved field. Do not read nor write to it. */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +/*! + * @internal + * @brief Structure for XXH64 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH64_state_t. + * Do not access the members of this struct directly. + * @see XXH32_state_s, XXH3_state_s + */ +struct XXH64_state_s { + XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */ + XXH64_hash_t v[4]; /*!< Accumulator lanes */ + XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */ + XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/ + XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. */ +}; /* typedef'd to XXH64_state_t */ + +#ifndef XXH_NO_XXH3 + +#if defined(_MSC_VER) +# define XXH_ALIGN(n) __declspec(align(n)) +#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */ +# include +# define XXH_ALIGN(n) alignas(n) +#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */ +/* In C++ alignas() is a keyword */ +# define XXH_ALIGN(n) alignas(n) +#elif defined(__GNUC__) +# define XXH_ALIGN(n) __attribute__ ((aligned(n))) +#else +# define XXH_ALIGN(n) /* disabled */ +#endif + +/* Old GCC versions only accept the attribute after the type in structures. */ +#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \ + && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \ + && defined(__GNUC__) +# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align) +#else +# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type +#endif + +/*! + * @brief The size of the internal XXH3 buffer. + * + * This is the optimal update size for incremental hashing. + * + * @see XXH3_64b_update(), XXH3_128b_update(). + */ +#define XXH3_INTERNALBUFFER_SIZE 256 + +/*! + * @brief Default size of the secret buffer (and @ref XXH3_kSecret). + * + * This is the size used in @ref XXH3_kSecret and the seeded functions. + * + * Not to be confused with @ref XXH3_SECRET_SIZE_MIN. + */ +#define XXH3_SECRET_DEFAULT_SIZE 192 + +/*! + * @internal + * @brief Structure for XXH3 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. + * Otherwise it is an opaque type. + * Never use this definition in combination with dynamic library. + * This allows fields to safely be changed in the future. + * + * @note ** This structure has a strict alignment requirement of 64 bytes!! 
** + * Do not allocate this with `malloc()` or `new`, + * it will not be sufficiently aligned. + * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation. + * + * Typedef'd to @ref XXH3_state_t. + * Do never access the members of this struct directly. + * + * @see XXH3_INITSTATE() for stack initialization. + * @see XXH3_createState(), XXH3_freeState(). + * @see XXH32_state_s, XXH64_state_s + */ +struct XXH3_state_s { + XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]); + /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */ + XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]); + /*!< Used to store a custom secret generated from a seed. */ + XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]); + /*!< The internal buffer. @see XXH32_state_s::mem32 */ + XXH32_hash_t bufferedSize; + /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */ + XXH32_hash_t useSeed; + /*!< Reserved field. Needed for padding on 64-bit. */ + size_t nbStripesSoFar; + /*!< Number or stripes processed. */ + XXH64_hash_t totalLen; + /*!< Total length hashed. 64-bit even on 32-bit targets. */ + size_t nbStripesPerBlock; + /*!< Number of stripes per block. */ + size_t secretLimit; + /*!< Size of @ref customSecret or @ref extSecret */ + XXH64_hash_t seed; + /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */ + XXH64_hash_t reserved64; + /*!< Reserved field. */ + const unsigned char* extSecret; + /*!< Reference to an external secret for the _withSecret variants, NULL + * for other variants. */ + /* note: there may be some padding at the end due to alignment on 64 bytes */ +}; /* typedef'd to XXH3_state_t */ + +#undef XXH_ALIGN_MEMBER + +/*! + * @brief Initializes a stack-allocated `XXH3_state_s`. + * + * When the @ref XXH3_state_t structure is merely emplaced on stack, + * it should be initialized with XXH3_INITSTATE() or a memset() + * in case its first reset uses XXH3_NNbits_reset_withSeed(). + * This init can be omitted if the first reset uses default or _withSecret mode. + * This operation isn't necessary when the state is created with XXH3_createState(). + * Note that this doesn't prepare the state for a streaming operation, + * it's still necessary to use XXH3_NNbits_reset*() afterwards. + */ +#define XXH3_INITSTATE(XXH3_state_ptr) { (XXH3_state_ptr)->seed = 0; } + + +/*! + * simple alias to pre-selected XXH3_128bits variant + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed); + + +/* === Experimental API === */ +/* Symbols defined below must be considered tied to a specific library version. */ + +/*! + * XXH3_generateSecret(): + * + * Derive a high-entropy secret from any user-defined content, named customSeed. + * The generated secret can be used in combination with `*_withSecret()` functions. + * The `_withSecret()` variants are useful to provide a higher level of protection + * than 64-bit seed, as it becomes much more difficult for an external actor to + * guess how to impact the calculation logic. + * + * The function accepts as input a custom seed of any length and any content, + * and derives from it a high-entropy secret of length @p secretSize into an + * already allocated buffer @p secretBuffer. + * + * The generated secret can then be used with any `*_withSecret()` variant. 
+ * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(),
+ * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret()
+ * are part of this list. They all accept a `secret` parameter
+ * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN)
+ * _and_ feature very high entropy (consist of random-looking bytes).
+ * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can
+ * be employed to ensure proper quality.
+ *
+ * @p customSeed can be anything. It can have any size, even small ones,
+ * and its content can be anything, even "poor entropy" sources such as a bunch
+ * of zeroes. The resulting `secret` will nonetheless provide all required qualities.
+ *
+ * @pre
+ *   - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN
+ *   - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
+ *
+ * Example code:
+ * @code{.c}
+ *   #include <stdio.h>
+ *   #include <string.h>
+ *   #include <stdlib.h>
+ *   #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ *   #include "xxhash.h"
+ *   // Hashes argv[2] using the entropy from argv[1].
+ *   int main(int argc, char* argv[])
+ *   {
+ *       char secret[XXH3_SECRET_SIZE_MIN];
+ *       if (argc != 3) { return 1; }
+ *       XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1]));
+ *       XXH64_hash_t h = XXH3_64bits_withSecret(
+ *            argv[2], strlen(argv[2]),
+ *            secret, sizeof(secret)
+ *       );
+ *       printf("%016llx\n", (unsigned long long) h);
+ *   }
+ * @endcode
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize);
+
+/*!
+ * @brief Generate the same secret as the _withSeed() variants.
+ *
+ * The generated secret can be used in combination with
+ * `*_withSecret()` and `_withSecretandSeed()` variants.
+ *
+ * Example C++ `std::string` hash class:
+ * @code{.cpp}
+ *   #include <string>
+ *   #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ *   #include "xxhash.h"
+ *   // Slow, seeds each time
+ *   class HashSlow {
+ *       XXH64_hash_t seed;
+ *   public:
+ *       HashSlow(XXH64_hash_t s) : seed{s} {}
+ *       size_t operator()(const std::string& x) const {
+ *           return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)};
+ *       }
+ *   };
+ *   // Fast, caches the seeded secret for future uses.
+ *   class HashFast {
+ *       unsigned char secret[XXH3_SECRET_SIZE_MIN];
+ *   public:
+ *       HashFast(XXH64_hash_t s) {
+ *           XXH3_generateSecret_fromSeed(secret, s);
+ *       }
+ *       size_t operator()(const std::string& x) const {
+ *           return size_t{
+ *               XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret))
+ *           };
+ *       }
+ *   };
+ * @endcode
+ * @param secretBuffer A writable buffer of @ref XXH3_SECRET_SIZE_MIN bytes
+ * @param seed The 64-bit seed to derive the secret from.
+ */
+XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed);
+
+/*!
+ * These variants generate hash values using either
+ * @p seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes)
+ * or @p secret for "large" keys (>= XXH3_MIDSIZE_MAX).
+ *
+ * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
+ * `_withSeed()` has to generate the secret on the fly for "large" keys.
+ * It's fast, but can be perceptible for "not so large" keys (< 1 KB).
+ * `_withSecret()` has to generate the masks on the fly for "small" keys,
+ * which requires more instructions than _withSeed() variants.
+ * Therefore, the _withSecretandSeed variant combines the best of both worlds.
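+ *
+ * A sketch of the intended pairing, assuming `seed`, `data` and `len` are
+ * supplied by the caller (XXH3_SECRET_DEFAULT_SIZE is the size used by the
+ * seeded variants, see above):
+ * @code{.c}
+ *   unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
+ *   XXH3_generateSecret_fromSeed(secret, seed);      // derive the secret once
+ *   // ... then, for each input:
+ *   XXH64_hash_t h = XXH3_64bits_withSecretandSeed(data, len,
+ *                                                  secret, sizeof(secret), seed);
+ * @endcode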
+ * + * When @p secret has been generated by XXH3_generateSecret_fromSeed(), + * this variant produces *exactly* the same results as `_withSeed()` variant, + * hence offering only a pure speed benefit on "large" input, + * by skipping the need to regenerate the secret for every large input. + * + * Another usage scenario is to hash the secret to a 64-bit hash value, + * for example with XXH3_64bits(), which then becomes the seed, + * and then employ both the seed and the secret in _withSecretandSeed(). + * On top of speed, an added benefit is that each bit in the secret + * has a 50% chance to swap each bit in the output, via its impact to the seed. + * + * This is not guaranteed when using the secret directly in "small data" scenarios, + * because only portions of the secret are employed for small data. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t +XXH3_64bits_withSecretandSeed(const void* data, size_t len, + const void* secret, size_t secretSize, + XXH64_hash_t seed); +/*! @copydoc XXH3_64bits_withSecretandSeed() */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t +XXH3_128bits_withSecretandSeed(const void* input, size_t length, + const void* secret, size_t secretSize, + XXH64_hash_t seed64); +#ifndef XXH_NO_STREAM +/*! @copydoc XXH3_64bits_withSecretandSeed() */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, + const void* secret, size_t secretSize, + XXH64_hash_t seed64); +/*! @copydoc XXH3_64bits_withSecretandSeed() */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, + const void* secret, size_t secretSize, + XXH64_hash_t seed64); +#endif /* !XXH_NO_STREAM */ + +#endif /* !XXH_NO_XXH3 */ +#endif /* XXH_NO_LONG_LONG */ +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# define XXH_IMPLEMENTATION +#endif + +#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */ + + +/* ======================================================================== */ +/* ======================================================================== */ +/* ======================================================================== */ + + +/*-********************************************************************** + * xxHash implementation + *-********************************************************************** + * xxHash's implementation used to be hosted inside xxhash.c. + * + * However, inlining requires implementation to be visible to the compiler, + * hence be included alongside the header. + * Previously, implementation was hosted inside xxhash.c, + * which was then #included when inlining was activated. + * This construction created issues with a few build and install systems, + * as it required xxhash.c to be stored in /include directory. + * + * xxHash implementation is now directly integrated within xxhash.h. + * As a consequence, xxhash.c is no longer needed in /include. + * + * xxhash.c is still available and is still useful. + * In a "normal" setup, when xxhash is not inlined, + * xxhash.h only exposes the prototypes and public symbols, + * while xxhash.c can be built into an object file xxhash.o + * which can then be linked into the final binary. 
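+ *
+ * When xxhash.c cannot be shipped, an equivalent setup is to dedicate one
+ * translation unit of the project to emitting the implementation. A sketch
+ * (the file name is only illustrative):
+ * @code{.c}
+ *   // xxhash_impl.c : compiled once, linked into the final binary
+ *   #define XXH_STATIC_LINKING_ONLY
+ *   #define XXH_IMPLEMENTATION
+ *   #include "xxhash.h"
+ * @endcode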
+ ************************************************************************/ + +#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \ + || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387) +# define XXH_IMPLEM_13a8737387 + +/* ************************************* +* Tuning parameters +***************************************/ + +/*! + * @defgroup tuning Tuning parameters + * @{ + * + * Various macros to control xxHash's behavior. + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Define this to disable 64-bit code. + * + * Useful if only using the @ref XXH32_family and you have a strict C90 compiler. + */ +# define XXH_NO_LONG_LONG +# undef XXH_NO_LONG_LONG /* don't actually */ +/*! + * @brief Controls how unaligned memory is accessed. + * + * By default, access to unaligned memory is controlled by `memcpy()`, which is + * safe and portable. + * + * Unfortunately, on some target/compiler combinations, the generated assembly + * is sub-optimal. + * + * The below switch allow selection of a different access method + * in the search for improved performance. + * + * @par Possible options: + * + * - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy` + * @par + * Use `memcpy()`. Safe and portable. Note that most modern compilers will + * eliminate the function call and treat it as an unaligned access. + * + * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))` + * @par + * Depends on compiler extensions and is therefore not portable. + * This method is safe _if_ your compiler supports it, + * and *generally* as fast or faster than `memcpy`. + * + * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast + * @par + * Casts directly and dereferences. This method doesn't depend on the + * compiler, but it violates the C standard as it directly dereferences an + * unaligned pointer. It can generate buggy code on targets which do not + * support unaligned memory accesses, but in some circumstances, it's the + * only known way to get the most performance. + * + * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift + * @par + * Also portable. This can generate the best code on old compilers which don't + * inline small `memcpy()` calls, and it might also be faster on big-endian + * systems which lack a native byteswap instruction. However, some compilers + * will emit literal byteshifts even if the target supports unaligned access. + * . + * + * @warning + * Methods 1 and 2 rely on implementation-defined behavior. Use these with + * care, as what works on one compiler/platform/optimization level may cause + * another to read garbage data or even crash. + * + * See http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details. + * + * Prefer these methods in priority order (0 > 3 > 1 > 2) + */ +# define XXH_FORCE_MEMORY_ACCESS 0 + +/*! + * @def XXH_SIZE_OPT + * @brief Controls how much xxHash optimizes for size. + * + * xxHash, when compiled, tends to result in a rather large binary size. This + * is mostly due to heavy usage to forced inlining and constant folding of the + * @ref XXH3_family to increase performance. + * + * However, some developers prefer size over speed. This option can + * significantly reduce the size of the generated code. When using the `-Os` + * or `-Oz` options on GCC or Clang, this is defined to 1 by default, + * otherwise it is defined to 0. + * + * Most of these size optimizations can be controlled manually. + * + * This is a number from 0-2. + * - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed + * comes first. 
+ * - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more + * conservative and disables hacks that increase code size. It implies the + * options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0, + * and @ref XXH3_NEON_LANES == 8 if they are not already defined. + * - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible. + * Performance may cry. For example, the single shot functions just use the + * streaming API. + */ +# define XXH_SIZE_OPT 0 + +/*! + * @def XXH_FORCE_ALIGN_CHECK + * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32() + * and XXH64() only). + * + * This is an important performance trick for architectures without decent + * unaligned memory access performance. + * + * It checks for input alignment, and when conditions are met, uses a "fast + * path" employing direct 32-bit/64-bit reads, resulting in _dramatically + * faster_ read speed. + * + * The check costs one initial branch per hash, which is generally negligible, + * but not zero. + * + * Moreover, it's not useful to generate an additional code path if memory + * access uses the same instruction for both aligned and unaligned + * addresses (e.g. x86 and aarch64). + * + * In these cases, the alignment check can be removed by setting this macro to 0. + * Then the code will always use unaligned memory access. + * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips + * which are platforms known to offer good unaligned memory accesses performance. + * + * It is also disabled by default when @ref XXH_SIZE_OPT >= 1. + * + * This option does not affect XXH3 (only XXH32 and XXH64). + */ +# define XXH_FORCE_ALIGN_CHECK 0 + +/*! + * @def XXH_NO_INLINE_HINTS + * @brief When non-zero, sets all functions to `static`. + * + * By default, xxHash tries to force the compiler to inline almost all internal + * functions. + * + * This can usually improve performance due to reduced jumping and improved + * constant folding, but significantly increases the size of the binary which + * might not be favorable. + * + * Additionally, sometimes the forced inlining can be detrimental to performance, + * depending on the architecture. + * + * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the + * compiler full control on whether to inline or not. + * + * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if + * @ref XXH_SIZE_OPT >= 1, this will automatically be defined. + */ +# define XXH_NO_INLINE_HINTS 0 + +/*! + * @def XXH32_ENDJMP + * @brief Whether to use a jump for `XXH32_finalize`. + * + * For performance, `XXH32_finalize` uses multiple branches in the finalizer. + * This is generally preferable for performance, + * but depending on exact architecture, a jmp may be preferable. + * + * This setting is only possibly making a difference for very small inputs. + */ +# define XXH32_ENDJMP 0 + +/*! + * @internal + * @brief Redefines old internal names. + * + * For compatibility with code that uses xxHash's internals before the names + * were changed to improve namespacing. There is no other reason to use this. + */ +# define XXH_OLD_NAMES +# undef XXH_OLD_NAMES /* don't actually use, it is ugly. */ + +/*! + * @def XXH_NO_STREAM + * @brief Disables the streaming API. + * + * When xxHash is not inlined and the streaming functions are not used, disabling + * the streaming functions can improve code size significantly, especially with + * the @ref XXH3_family which tends to make constant folded copies of itself. 
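+ *
+ * One possible size-focused configuration, purely illustrative, combining the
+ * build macros described above:
+ * @code{.c}
+ *   #define XXH_INLINE_ALL        // keep symbols private to this unit
+ *   #define XXH_NO_STREAM         // drop the streaming API entirely
+ *   #define XXH_SIZE_OPT 2        // favor the smallest code paths
+ *   #define XXH_NO_INLINE_HINTS 1 // let the compiler decide about inlining
+ *   #include "xxhash.h"
+ * @endcode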
+ */
+# define XXH_NO_STREAM
+# undef XXH_NO_STREAM /* don't actually */
+#endif /* XXH_DOXYGEN */
+/*!
+ * @}
+ */
+
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+   /* prefer __packed__ structures (method 1) for GCC
+    * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy
+    * which for some reason does unaligned loads. */
+#  if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+#ifndef XXH_SIZE_OPT
+   /* default to 1 for -Os or -Oz */
+#  if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__)
+#    define XXH_SIZE_OPT 1
+#  else
+#    define XXH_SIZE_OPT 0
+#  endif
+#endif
+
+#ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
+   /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */
+#  if XXH_SIZE_OPT >= 1 || \
+      defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \
+   || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) /* visual */
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+#ifndef XXH_NO_INLINE_HINTS
+#  if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__)  /* -O0, -fno-inline */
+#    define XXH_NO_INLINE_HINTS 1
+#  else
+#    define XXH_NO_INLINE_HINTS 0
+#  endif
+#endif
+
+#ifndef XXH32_ENDJMP
+/* generally preferable for performance */
+#  define XXH32_ENDJMP 0
+#endif
+
+/*!
+ * @defgroup impl Implementation
+ * @{
+ */
+
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+#if defined(XXH_NO_STREAM)
+/* nothing */
+#elif defined(XXH_NO_STDLIB)
+
+/* When requesting to disable any mention of stdlib,
+ * the library loses the ability to invoke malloc / free.
+ * In practice, it means that functions like `XXH*_createState()`
+ * will always fail, and return NULL.
+ * This flag is useful in situations where
+ * xxhash.h is integrated into some kernel, embedded or limited environment
+ * without access to dynamic allocation.
+ */
+
+static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; }
+static void XXH_free(void* p) { (void)p; }
+
+#else
+
+/*
+ * Modify the local functions below should you wish to use
+ * different memory routines for malloc() and free()
+ */
+#include <stdlib.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than malloc().
+ */
+static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); }
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than free().
+ */
+static void XXH_free(void* p) { free(p); }
+
+#endif  /* XXH_NO_STDLIB */
+
+#include <string.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than memcpy().
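+ *
+ * For example, a freestanding build without <string.h> could substitute a
+ * trivial byte copy (a sketch only; the default memcpy() is preferable
+ * whenever available):
+ * @code{.c}
+ *   static void* XXH_memcpy(void* dest, const void* src, size_t size)
+ *   {
+ *       unsigned char* d = (unsigned char*)dest;
+ *       const unsigned char* s = (const unsigned char*)src;
+ *       while (size--) *d++ = *s++;
+ *       return dest;
+ *   }
+ * @endcode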
+ */ +static void* XXH_memcpy(void* dest, const void* src, size_t size) +{ + return memcpy(dest,src,size); +} + +#include /* ULLONG_MAX */ + + +/* ************************************* +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio warning fix */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + +#if XXH_NO_INLINE_HINTS /* disable inlining hints */ +# if defined(__GNUC__) || defined(__clang__) +# define XXH_FORCE_INLINE static __attribute__((unused)) +# else +# define XXH_FORCE_INLINE static +# endif +# define XXH_NO_INLINE static +/* enable inlining hints */ +#elif defined(__GNUC__) || defined(__clang__) +# define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused)) +# define XXH_NO_INLINE static __attribute__((noinline)) +#elif defined(_MSC_VER) /* Visual Studio */ +# define XXH_FORCE_INLINE static __forceinline +# define XXH_NO_INLINE static __declspec(noinline) +#elif defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */ +# define XXH_FORCE_INLINE static inline +# define XXH_NO_INLINE static +#else +# define XXH_FORCE_INLINE static +# define XXH_NO_INLINE static +#endif + + + +/* ************************************* +* Debug +***************************************/ +/*! + * @ingroup tuning + * @def XXH_DEBUGLEVEL + * @brief Sets the debugging level. + * + * XXH_DEBUGLEVEL is expected to be defined externally, typically via the + * compiler's command line options. The value must be a number. + */ +#ifndef XXH_DEBUGLEVEL +# ifdef DEBUGLEVEL /* backwards compat */ +# define XXH_DEBUGLEVEL DEBUGLEVEL +# else +# define XXH_DEBUGLEVEL 0 +# endif +#endif + +#if (XXH_DEBUGLEVEL>=1) +# include /* note: can still be disabled with NDEBUG */ +# define XXH_ASSERT(c) assert(c) +#else +# define XXH_ASSERT(c) ((void)0) +#endif + +/* note: use after variable declarations */ +#ifndef XXH_STATIC_ASSERT +# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */ +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0) +# elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */ +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0) +# else +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0) +# endif +# define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c) +#endif + +/*! + * @internal + * @def XXH_COMPILER_GUARD(var) + * @brief Used to prevent unwanted optimizations for @p var. + * + * It uses an empty GCC inline assembly statement with a register constraint + * which forces @p var into a general purpose register (eg eax, ebx, ecx + * on x86) and marks it as modified. + * + * This is used in a few places to avoid unwanted autovectorization (e.g. + * XXH32_round()). All vectorization we want is explicit via intrinsics, + * and _usually_ isn't wanted elsewhere. + * + * We also use it to prevent unwanted constant folding for AArch64 in + * XXH3_initCustomSecret_scalar(). 
+ */ +#if defined(__GNUC__) || defined(__clang__) +# define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var)) +#else +# define XXH_COMPILER_GUARD(var) ((void)0) +#endif + +/* ************************************* +* Basic Types +***************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + typedef uint8_t xxh_u8; +#else + typedef unsigned char xxh_u8; +#endif +typedef XXH32_hash_t xxh_u32; + +#ifdef XXH_OLD_NAMES +# define BYTE xxh_u8 +# define U8 xxh_u8 +# define U32 xxh_u32 +#endif + +/* *** Memory access *** */ + +/*! + * @internal + * @fn xxh_u32 XXH_read32(const void* ptr) + * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit native endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readLE32(const void* ptr) + * @brief Reads an unaligned 32-bit little endian integer from @p ptr. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readBE32(const void* ptr) + * @brief Reads an unaligned 32-bit big endian integer from @p ptr. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit big endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align) + * @brief Like @ref XXH_readLE32(), but has an option for aligned reads. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is + * always @ref XXH_alignment::XXH_unaligned. + * + * @param ptr The pointer to read from. + * @param align Whether @p ptr is aligned. + * @pre + * If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte + * aligned. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE32 and XXH_readBE32. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* + * Force direct memory access. Only works on CPU which support unaligned memory + * access in hardware. + */ +static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __attribute__((aligned(1))) is supported by gcc and clang. Originally the + * documentation claimed that it only increased the alignment, but actually it + * can decrease it on gcc, clang, and icc: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, + * https://gcc.godbolt.org/z/xYez1j67Y. + */ +#ifdef XXH_OLD_NAMES +typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; +#endif +static xxh_u32 XXH_read32(const void* ptr) +{ + typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32; + return *((const xxh_unalign32*)ptr); +} + +#else + +/* + * Portable and safe solution. Generally efficient. 
+ * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html + */ +static xxh_u32 XXH_read32(const void* memPtr) +{ + xxh_u32 val; + XXH_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + + +/* *** Endianness *** */ + +/*! + * @ingroup tuning + * @def XXH_CPU_LITTLE_ENDIAN + * @brief Whether the target is little endian. + * + * Defined to 1 if the target is little endian, or 0 if it is big endian. + * It can be defined externally, for example on the compiler command line. + * + * If it is not defined, + * a runtime check (which is usually constant folded) is used instead. + * + * @note + * This is not necessarily defined to an integer constant. + * + * @see XXH_isLittleEndian() for the runtime check. + */ +#ifndef XXH_CPU_LITTLE_ENDIAN +/* + * Try to detect endianness automatically, to avoid the nonstandard behavior + * in `XXH_isLittleEndian()` + */ +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 0 +# else +/*! + * @internal + * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN. + * + * Most compilers will constant fold this. + */ +static int XXH_isLittleEndian(void) +{ + /* + * Portable and well-defined behavior. + * Don't use static: it is detrimental to performance. + */ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifdef __has_builtin +# define XXH_HAS_BUILTIN(x) __has_builtin(x) +#else +# define XXH_HAS_BUILTIN(x) 0 +#endif + +/*! + * @internal + * @def XXH_rotl32(x,r) + * @brief 32-bit rotate left. + * + * @param x The 32-bit integer to be rotated. + * @param r The number of bits to rotate. + * @pre + * @p r > 0 && @p r < 32 + * @note + * @p x and @p r may be evaluated multiple times. + * @return The rotated result. + */ +#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \ + && XXH_HAS_BUILTIN(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +/*! + * @internal + * @fn xxh_u32 XXH_swap32(xxh_u32 x) + * @brief A 32-bit byteswap. + * + * @param x The 32-bit integer to bswap. + * @return @p x, byteswapped. + */ +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ + +/*! + * @internal + * @brief Enum to indicate whether a pointer is aligned. 
+ */ +typedef enum { + XXH_aligned, /*!< Aligned */ + XXH_unaligned /*!< Possibly unaligned */ +} XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. + */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); + } +} + + +/* ************************************* +* Misc +***************************************/ +/*! @ingroup public */ +XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } + + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ +/*! + * @} + * @defgroup XXH32_impl XXH32 implementation + * @ingroup impl + * + * Details on the XXH32 implementation. + * @{ + */ + /* #define instead of static const, to be used as initializers */ +#define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */ +#define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */ +#define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */ +#define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */ +#define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */ + +#ifdef XXH_OLD_NAMES +# define PRIME32_1 XXH_PRIME32_1 +# define PRIME32_2 XXH_PRIME32_2 +# define PRIME32_3 XXH_PRIME32_3 +# define PRIME32_4 XXH_PRIME32_4 +# define PRIME32_5 XXH_PRIME32_5 +#endif + +/*! + * @internal + * @brief Normal stripe processing routine. + * + * This shuffles the bits so that any bit from @p input impacts several bits in + * @p acc. + * + * @param acc The accumulator lane. + * @param input The stripe of input to mix. + * @return The mixed accumulator lane. + */ +static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) +{ + acc += input * XXH_PRIME32_2; + acc = XXH_rotl32(acc, 13); + acc *= XXH_PRIME32_1; +#if (defined(__SSE4_1__) || defined(__aarch64__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) + /* + * UGLY HACK: + * A compiler fence is the only thing that prevents GCC and Clang from + * autovectorizing the XXH32 loop (pragmas and attributes don't work for some + * reason) without globally disabling SSE4.1. + * + * The reason we want to avoid vectorization is because despite working on + * 4 integers at a time, there are multiple factors slowing XXH32 down on + * SSE4: + * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on + * newer chips!) 
making it slightly slower to multiply four integers at + * once compared to four integers independently. Even when pmulld was + * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE + * just to multiply unless doing a long operation. + * + * - Four instructions are required to rotate, + * movqda tmp, v // not required with VEX encoding + * pslld tmp, 13 // tmp <<= 13 + * psrld v, 19 // x >>= 19 + * por v, tmp // x |= tmp + * compared to one for scalar: + * roll v, 13 // reliably fast across the board + * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason + * + * - Instruction level parallelism is actually more beneficial here because + * the SIMD actually serializes this operation: While v1 is rotating, v2 + * can load data, while v3 can multiply. SSE forces them to operate + * together. + * + * This is also enabled on AArch64, as Clang autovectorizes it incorrectly + * and it is pointless writing a NEON implementation that is basically the + * same speed as scalar for XXH32. + */ + XXH_COMPILER_GUARD(acc); +#endif + return acc; +} + +/*! + * @internal + * @brief Mixes all bits to finalize the hash. + * + * The final mix ensures that all input bits have a chance to impact any bit in + * the output digest, resulting in an unbiased distribution. + * + * @param hash The hash to avalanche. + * @return The avalanched hash. + */ +static xxh_u32 XXH32_avalanche(xxh_u32 hash) +{ + hash ^= hash >> 15; + hash *= XXH_PRIME32_2; + hash ^= hash >> 13; + hash *= XXH_PRIME32_3; + hash ^= hash >> 16; + return hash; +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +/*! + * @internal + * @brief Processes the last 0-15 bytes of @p ptr. + * + * There may be up to 15 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param hash The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 16. + * @param align Whether @p ptr is aligned. + * @return The finalized hash. + * @see XXH64_finalize(). 
+ */ +static XXH_PUREF xxh_u32 +XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define XXH_PROCESS1 do { \ + hash += (*ptr++) * XXH_PRIME32_5; \ + hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1; \ +} while (0) + +#define XXH_PROCESS4 do { \ + hash += XXH_get32bits(ptr) * XXH_PRIME32_3; \ + ptr += 4; \ + hash = XXH_rotl32(hash, 17) * XXH_PRIME32_4; \ +} while (0) + + if (ptr==NULL) XXH_ASSERT(len == 0); + + /* Compact rerolled version; generally faster */ + if (!XXH32_ENDJMP) { + len &= 15; + while (len >= 4) { + XXH_PROCESS4; + len -= 4; + } + while (len > 0) { + XXH_PROCESS1; + --len; + } + return XXH32_avalanche(hash); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: XXH_PROCESS4; + XXH_FALLTHROUGH; + case 8: XXH_PROCESS4; + XXH_FALLTHROUGH; + case 4: XXH_PROCESS4; + return XXH32_avalanche(hash); + + case 13: XXH_PROCESS4; + XXH_FALLTHROUGH; + case 9: XXH_PROCESS4; + XXH_FALLTHROUGH; + case 5: XXH_PROCESS4; + XXH_PROCESS1; + return XXH32_avalanche(hash); + + case 14: XXH_PROCESS4; + XXH_FALLTHROUGH; + case 10: XXH_PROCESS4; + XXH_FALLTHROUGH; + case 6: XXH_PROCESS4; + XXH_PROCESS1; + XXH_PROCESS1; + return XXH32_avalanche(hash); + + case 15: XXH_PROCESS4; + XXH_FALLTHROUGH; + case 11: XXH_PROCESS4; + XXH_FALLTHROUGH; + case 7: XXH_PROCESS4; + XXH_FALLTHROUGH; + case 3: XXH_PROCESS1; + XXH_FALLTHROUGH; + case 2: XXH_PROCESS1; + XXH_FALLTHROUGH; + case 1: XXH_PROCESS1; + XXH_FALLTHROUGH; + case 0: return XXH32_avalanche(hash); + } + XXH_ASSERT(0); + return hash; /* reaching this point is deemed impossible */ + } +} + +#ifdef XXH_OLD_NAMES +# define PROCESS1 XXH_PROCESS1 +# define PROCESS4 XXH_PROCESS4 +#else +# undef XXH_PROCESS1 +# undef XXH_PROCESS4 +#endif + +/*! + * @internal + * @brief The implementation for @ref XXH32(). + * + * @param input , len , seed Directly passed from @ref XXH32(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + xxh_u32 h32; + + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=16) { + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 15; + xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + xxh_u32 v2 = seed + XXH_PRIME32_2; + xxh_u32 v3 = seed + 0; + xxh_u32 v4 = seed - XXH_PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; + v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; + v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; + v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; + } while (input < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + XXH_PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + +/*! 
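+ * Illustrative usage (example added for clarity; the buffer name is a
+ * placeholder and this snippet is not part of the upstream xxHash docs):
+ * @code
+ *   const char data[] = "hello world";
+ *   XXH32_hash_t const h = XXH32(data, sizeof(data) - 1, 0);  // seed = 0
+ * @endcode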
@ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ +#ifndef XXH_NO_STREAM +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + statePtr->v[1] = seed + XXH_PRIME32_2; + statePtr->v[2] = seed + 0; + statePtr->v[3] = seed - XXH_PRIME32_1; + return XXH_OK; +} + + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); + state->memsize += (XXH32_hash_t)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const xxh_u32* p32 = state->mem32; + state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++; + state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++; + state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++; + state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const xxh_u8* const limit = bEnd - 16; + + do { + state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4; + state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4; + state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4; + state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4; + } while (p<=limit); + + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +/*! 
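+ * A typical streaming sequence looks like the sketch below (added example;
+ * `chunk1`/`chunk2` and their sizes are placeholders, error checking omitted):
+ * @code
+ *   XXH32_state_t* const state = XXH32_createState();
+ *   XXH32_reset(state, 0);                        // seed = 0
+ *   XXH32_update(state, chunk1, chunk1Size);
+ *   XXH32_update(state, chunk2, chunk2Size);
+ *   XXH32_hash_t const h = XXH32_digest(state);   // same result as one-shot XXH32()
+ *   XXH32_freeState(state);
+ * @endcode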
@ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v[0], 1) + + XXH_rotl32(state->v[1], 7) + + XXH_rotl32(state->v[2], 12) + + XXH_rotl32(state->v[3], 18); + } else { + h32 = state->v[2] /* == seed */ + XXH_PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); +} +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! + * @ingroup XXH32_family + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * + * The canonical representation uses big endian convention, the same convention + * as human-readable numbers (large digits first). + * + * This way, hash values can be written into a file or buffer, remaining + * comparable across different systems. + * + * The following functions allow transformation of hash values to and from their + * canonical format. + */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + XXH_memcpy(dst, &hash, sizeof(*dst)); +} +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ +/*! + * @} + * @ingroup impl + * @{ + */ +/******* Memory access *******/ + +typedef XXH64_hash_t xxh_u64; + +#ifdef XXH_OLD_NAMES +# define U64 xxh_u64 +#endif + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE64 and XXH_readBE64. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + return *(const xxh_u64*) memPtr; +} + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __attribute__((aligned(1))) is supported by gcc and clang. Originally the + * documentation claimed that it only increased the alignment, but actually it + * can decrease it on gcc, clang, and icc: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, + * https://gcc.godbolt.org/z/xYez1j67Y. + */ +#ifdef XXH_OLD_NAMES +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; +#endif +static xxh_u64 XXH_read64(const void* ptr) +{ + typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64; + return *((const xxh_unalign64*)ptr); +} + +#else + +/* + * Portable and safe solution. Generally efficient. 
+ * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html + */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + xxh_u64 val; + XXH_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static xxh_u64 XXH_swap64(xxh_u64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u64)bytePtr[1] << 8) + | ((xxh_u64)bytePtr[2] << 16) + | ((xxh_u64)bytePtr[3] << 24) + | ((xxh_u64)bytePtr[4] << 32) + | ((xxh_u64)bytePtr[5] << 40) + | ((xxh_u64)bytePtr[6] << 48) + | ((xxh_u64)bytePtr[7] << 56); +} + +XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[7] + | ((xxh_u64)bytePtr[6] << 8) + | ((xxh_u64)bytePtr[5] << 16) + | ((xxh_u64)bytePtr[4] << 24) + | ((xxh_u64)bytePtr[3] << 32) + | ((xxh_u64)bytePtr[2] << 40) + | ((xxh_u64)bytePtr[1] << 48) + | ((xxh_u64)bytePtr[0] << 56); +} + +#else +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); +} + +static xxh_u64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u64 +XXH_readLE64_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) + return XXH_readLE64(ptr); + else + return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); +} + + +/******* xxh64 *******/ +/*! + * @} + * @defgroup XXH64_impl XXH64 implementation + * @ingroup impl + * + * Details on the XXH64 implementation. + * @{ + */ +/* #define rather that static const, to be used as initializers */ +#define XXH_PRIME64_1 0x9E3779B185EBCA87ULL /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */ +#define XXH_PRIME64_2 0xC2B2AE3D27D4EB4FULL /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */ +#define XXH_PRIME64_3 0x165667B19E3779F9ULL /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */ +#define XXH_PRIME64_4 0x85EBCA77C2B2AE63ULL /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */ +#define XXH_PRIME64_5 0x27D4EB2F165667C5ULL /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +#ifdef XXH_OLD_NAMES +# define PRIME64_1 XXH_PRIME64_1 +# define PRIME64_2 XXH_PRIME64_2 +# define PRIME64_3 XXH_PRIME64_3 +# define PRIME64_4 XXH_PRIME64_4 +# define PRIME64_5 XXH_PRIME64_5 +#endif + +/*! 
@copydoc XXH32_round */ +static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) +{ + acc += input * XXH_PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= XXH_PRIME64_1; + return acc; +} + +static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4; + return acc; +} + +/*! @copydoc XXH32_avalanche */ +static xxh_u64 XXH64_avalanche(xxh_u64 hash) +{ + hash ^= hash >> 33; + hash *= XXH_PRIME64_2; + hash ^= hash >> 29; + hash *= XXH_PRIME64_3; + hash ^= hash >> 32; + return hash; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, align) + +/*! + * @internal + * @brief Processes the last 0-31 bytes of @p ptr. + * + * There may be up to 31 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param hash The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 32. + * @param align Whether @p ptr is aligned. + * @return The finalized hash + * @see XXH32_finalize(). + */ +static XXH_PUREF xxh_u64 +XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ + if (ptr==NULL) XXH_ASSERT(len == 0); + len &= 31; + while (len >= 8) { + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); + ptr += 8; + hash ^= k1; + hash = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4; + len -= 8; + } + if (len >= 4) { + hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; + ptr += 4; + hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; + len -= 4; + } + while (len > 0) { + hash ^= (*ptr++) * XXH_PRIME64_5; + hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1; + --len; + } + return XXH64_avalanche(hash); +} + +#ifdef XXH_OLD_NAMES +# define PROCESS1_64 XXH_PROCESS1_64 +# define PROCESS4_64 XXH_PROCESS4_64 +# define PROCESS8_64 XXH_PROCESS8_64 +#else +# undef XXH_PROCESS1_64 +# undef XXH_PROCESS4_64 +# undef XXH_PROCESS8_64 +#endif + +/*! + * @internal + * @brief The implementation for @ref XXH64(). + * + * @param input , len , seed Directly passed from @ref XXH64(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + xxh_u64 h64; + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=32) { + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 31; + xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + xxh_u64 v2 = seed + XXH_PRIME64_2; + xxh_u64 v3 = seed + 0; + xxh_u64 v4 = seed - XXH_PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; + v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; + v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; + v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; + } while (input= 2 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ +#ifndef XXH_NO_STREAM +/*! 
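+ * Added note (not upstream documentation): a state allocated here, fed through
+ * XXH64_reset()/XXH64_update() and read back with XXH64_digest(), produces the
+ * same hash as a single XXH64() call over the same bytes and seed. For example
+ * (`buf`, `bufSize` and `seed` are placeholders):
+ * @code
+ *   XXH64_state_t* const s = XXH64_createState();
+ *   XXH64_reset(s, seed);
+ *   XXH64_update(s, buf, bufSize);
+ *   XXH64_hash_t const h = XXH64_digest(s);   // equal to XXH64(buf, bufSize, seed)
+ *   XXH64_freeState(s);
+ * @endcode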
@ingroup XXH64_family*/ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + statePtr->v[1] = seed + XXH_PRIME64_2; + statePtr->v[2] = seed + 0; + statePtr->v[3] = seed - XXH_PRIME64_1; + return XXH_OK; +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH64_state_t* state, const void* input, size_t len) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); + state->memsize += (xxh_u32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0)); + state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1)); + state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2)); + state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3)); + p += 32 - state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const xxh_u8* const limit = bEnd - 32; + + do { + state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8; + state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8; + state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8; + state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8; + } while (p<=limit); + + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { + h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18); + h64 = XXH64_mergeRound(h64, state->v[0]); + h64 = XXH64_mergeRound(h64, state->v[1]); + h64 = XXH64_mergeRound(h64, state->v[2]); + h64 = XXH64_mergeRound(h64, state->v[3]); + } else { + h64 = state->v[2] /*seed*/ + XXH_PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); +} +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + XXH_memcpy(dst, &hash, sizeof(*dst)); +} + +/*! 
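+ * Illustrative round-trip (added example): converting a hash to its canonical
+ * big-endian byte representation and back yields the original value.
+ * @code
+ *   XXH64_canonical_t c;
+ *   XXH64_canonicalFromHash(&c, h);                        // serialize
+ *   XXH64_hash_t const h2 = XXH64_hashFromCanonical(&c);   // h2 == h
+ * @endcode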
@ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + +#ifndef XXH_NO_XXH3 + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ +/*! + * @} + * @defgroup XXH3_impl XXH3 implementation + * @ingroup impl + * @{ + */ + +/* === Compiler specifics === */ + +#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */ +# define XXH_RESTRICT /* disable */ +#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define XXH_RESTRICT restrict +#else +/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */ +# define XXH_RESTRICT /* disable */ +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ + || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ + || defined(__clang__) +# define XXH_likely(x) __builtin_expect(x, 1) +# define XXH_unlikely(x) __builtin_expect(x, 0) +#else +# define XXH_likely(x) (x) +# define XXH_unlikely(x) (x) +#endif + +#if defined(__GNUC__) || defined(__clang__) +# if defined(__ARM_NEON__) || defined(__ARM_NEON) \ + || defined(__aarch64__) || defined(_M_ARM) \ + || defined(_M_ARM64) || defined(_M_ARM64EC) +# define inline __inline__ /* circumvent a clang bug */ +# include +# undef inline +# elif defined(__AVX2__) +# include +# elif defined(__SSE2__) +# include +# endif +#endif + +#if defined(_MSC_VER) +# include +#endif + +/* + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. + * + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. + * + * For example, these two lines seem similar, and run equally fast on 64-bit: + * + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad + * + * However, to a 32-bit machine, there is a major difference. + * + * x ^= (x >> 47) looks like this: + * + * x.lo ^= (x.hi >> (47 - 32)); + * + * while x ^= (x >> 13) looks like this: + * + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); + * + * The first one is significantly faster than the second, simply because the + * shift is larger than 32. This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. + * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent bswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. + * + * Second of all, the 8 functional registers are just not enough. 
+ * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + * + * Credit: large sections of the vectorial and asm source code paths + * have been contributed by @easyaspi314 + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ + +#ifdef XXH_DOXYGEN +/*! + * @ingroup tuning + * @brief Overrides the vectorization implementation chosen for XXH3. + * + * Can be defined to 0 to disable SIMD or any of the values mentioned in + * @ref XXH_VECTOR_TYPE. + * + * If this is not defined, it uses predefined macros to determine the best + * implementation. + */ +# define XXH_VECTOR XXH_SCALAR +/*! + * @ingroup tuning + * @brief Possible values for @ref XXH_VECTOR. + * + * Note that these are actually implemented as macros. + * + * If this is not defined, it is detected automatically. + * @ref XXH_X86DISPATCH overrides this. + */ +enum XXH_VECTOR_TYPE /* fake enum */ { + XXH_SCALAR = 0, /*!< Portable scalar version */ + XXH_SSE2 = 1, /*!< + * SSE2 for Pentium 4, Opteron, all x86_64. + * + * @note SSE2 is also guaranteed on Windows 10, macOS, and + * Android x86. + */ + XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */ + XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ + XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */ + XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ +}; +/*! + * @ingroup tuning + * @brief Selects the minimum alignment for XXH3's accumulators. + * + * When using SIMD, this should match the alignment reqired for said vector + * type, so, for example, 32 for AVX2. + * + * Default: Auto detected. 
+ */ +# define XXH_ACC_ALIGN 8 +#endif + +/* Actual definition */ +#ifndef XXH_DOXYGEN +# define XXH_SCALAR 0 +# define XXH_SSE2 1 +# define XXH_AVX2 2 +# define XXH_AVX512 3 +# define XXH_NEON 4 +# define XXH_VSX 5 +#endif + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if ( \ + defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \ + || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \ + ) && ( \ + defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ + ) +# define XXH_VECTOR XXH_NEON +# elif defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* + * Controls the alignment of the accumulator, + * for compatibility with aligned vector loads, which are usually faster. + */ +#ifndef XXH_ACC_ALIGN +# if defined(XXH_X86DISPATCH) +# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ +# elif XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ + || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#else +# define XXH_SEC_ALIGN 8 +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. + */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + + +#if XXH_VECTOR == XXH_NEON +/* + * NEON's setup for vmlal_u32 is a little more complicated than it is on + * SSE2, AVX2, and VSX. + * + * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. + * + * To do the same operation, the 128-bit 'Q' register needs to be split into + * two 64-bit 'D' registers, performing this operation:: + * + * [ a | b ] + * | '---------. 
.--------' | + * | x | + * | .---------' '--------. | + * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] + * + * Due to significant changes in aarch64, the fastest method for aarch64 is + * completely different than the fastest method for ARMv7-A. + * + * ARMv7-A treats D registers as unions overlaying Q registers, so modifying + * D11 will modify the high half of Q5. This is similar to how modifying AH + * will only affect bits 8-15 of AX on x86. + * + * VZIP takes two registers, and puts even lanes in one register and odd lanes + * in the other. + * + * On ARMv7-A, this strangely modifies both parameters in place instead of + * taking the usual 3-operand form. + * + * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the + * lower and upper halves of the Q register to end up with the high and low + * halves where we want - all in one instruction. + * + * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } + * + * Unfortunately we need inline assembly for this: Instructions modifying two + * registers at once is not possible in GCC or Clang's IR, and they have to + * create a copy. + * + * aarch64 requires a different approach. + * + * In order to make it easier to write a decent compiler for aarch64, many + * quirks were removed, such as conditional execution. + * + * NEON was also affected by this. + * + * aarch64 cannot access the high bits of a Q-form register, and writes to a + * D-form register zero the high bits, similar to how writes to W-form scalar + * registers (or DWORD registers on x86_64) work. + * + * The formerly free vget_high intrinsics now require a vext (with a few + * exceptions) + * + * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent + * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one + * operand. + * + * The equivalent of the VZIP.32 on the lower and upper halves would be this + * mess: + * + * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } + * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } + * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } + * + * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN): + * + * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); + * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); + * + * This is available on ARMv7-A, but is less efficient than a single VZIP.32. + */ + +/*! + * Function-like macro: + * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi) + * { + * outLo = (uint32x2_t)(in & 0xFFFFFFFF); + * outHi = (uint32x2_t)(in >> 32); + * in = UNDEFINED; + * } + */ +# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ + && (defined(__GNUC__) || defined(__clang__)) \ + && (defined(__arm__) || defined(__thumb__) || defined(_M_ARM)) +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \ + /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \ + /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \ + __asm__("vzip.32 %e0, %f0" : "+w" (in)); \ + (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \ + (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ + } while (0) +# else +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + (outLo) = vmovn_u64 (in); \ + (outHi) = vshrn_n_u64 ((in), 32); \ + } while (0) +# endif + +/*! 
+ * @internal + * @brief `vld1q_u64` but faster and alignment-safe. + * + * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only + * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86). + * + * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it + * prohibits load-store optimizations. Therefore, a direct dereference is used. + * + * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe + * unaligned load. + */ +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */ +{ + return *(uint64x2_t const*)ptr; +} +#else +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) +{ + return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr)); +} +#endif +/*! + * @ingroup tuning + * @brief Controls the NEON to scalar ratio for XXH3 + * + * On AArch64 when not optimizing for size, XXH3 will run 6 lanes using NEON and + * 2 lanes on scalar by default. + * + * This can be set to 2, 4, 6, or 8. ARMv7 will default to all 8 NEON lanes, as the + * emulated 64-bit arithmetic is too slow. + * + * Modern ARM CPUs are _very_ sensitive to how their pipelines are used. + * + * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but it can't + * have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions, + * you are only using 2/3 of the CPU bandwidth. + * + * This is even more noticable on the more advanced cores like the A76 which + * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once. + * + * Therefore, @ref XXH3_NEON_LANES lanes will be processed using NEON, and the + * remaining lanes will use scalar instructions. This improves the bandwidth + * and also gives the integer pipelines something to do besides twiddling loop + * counters and pointers. + * + * This change benefits CPUs with large micro-op buffers without negatively affecting + * other CPUs: + * + * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. | + * |:----------------------|:--------------------|----------:|-----------:|------:| + * | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% | + * | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% | + * | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% | + * + * It also seems to fix some bad codegen on GCC, making it almost as fast as clang. + * + * @see XXH3_accumulate_512_neon() + */ +# ifndef XXH3_NEON_LANES +# if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \ + && XXH_SIZE_OPT <= 0 +# define XXH3_NEON_LANES 6 +# else +# define XXH3_NEON_LANES XXH_ACC_NB +# endif +# endif +#endif /* XXH_VECTOR == XXH_NEON */ + +/* + * VSX and Z Vector helpers. + * + * This is very messy, and any pull requests to clean this up are welcome. + * + * There are a lot of problems with supporting VSX and s390x, due to + * inconsistent intrinsics, spotty coverage, and multiple endiannesses. + */ +#if XXH_VECTOR == XXH_VSX +/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`, + * and `pixel`. This is a problem for obvious reasons. + * + * These keywords are unnecessary; the spec literally says they are + * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd + * after including the header. + * + * We use pragma push_macro/pop_macro to keep the namespace clean. 
*/ +# pragma push_macro("bool") +# pragma push_macro("vector") +# pragma push_macro("pixel") +/* silence potential macro redefined warnings */ +# undef bool +# undef vector +# undef pixel + +# if defined(__s390x__) +# include +# else +# include +# endif + +/* Restore the original macro values, if applicable. */ +# pragma pop_macro("pixel") +# pragma pop_macro("vector") +# pragma pop_macro("bool") + +typedef __vector unsigned long long xxh_u64x2; +typedef __vector unsigned char xxh_u8x16; +typedef __vector unsigned xxh_u32x4; + +# ifndef XXH_VSX_BE +# if defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_VSX_BE 1 +# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ +# warning "-maltivec=be is not recommended. Please use native endianness." +# define XXH_VSX_BE 1 +# else +# define XXH_VSX_BE 0 +# endif +# endif /* !defined(XXH_VSX_BE) */ + +# if XXH_VSX_BE +# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) +# define XXH_vec_revb vec_revb +# else +/*! + * A polyfill for POWER9's vec_revb(). + */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) +{ + xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, + 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + return vec_perm(val, val, vByteSwap); +} +# endif +# endif /* XXH_VSX_BE */ + +/*! + * Performs an unaligned vector load and byte swaps it on big endian. + */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) +{ + xxh_u64x2 ret; + XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2)); +# if XXH_VSX_BE + ret = XXH_vec_revb(ret); +# endif + return ret; +} + +/* + * vec_mulo and vec_mule are very problematic intrinsics on PowerPC + * + * These intrinsics weren't added until GCC 8, despite existing for a while, + * and they are endian dependent. Also, their meaning swap depending on version. + * */ +# if defined(__s390x__) + /* s390x is always big endian, no issue on this platform */ +# define XXH_vec_mulo vec_mulo +# define XXH_vec_mule vec_mule +# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) +/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ +# define XXH_vec_mulo __builtin_altivec_vmulouw +# define XXH_vec_mule __builtin_altivec_vmuleuw +# else +/* gcc needs inline assembly */ +/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. 
*/ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +# endif /* XXH_vec_mulo, XXH_vec_mule */ +#endif /* XXH_VECTOR == XXH_VSX */ + + +/* prefetch + * can be disabled, by declaring XXH_NO_PREFETCH build macro */ +#if defined(XXH_NO_PREFETCH) +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +#else +# if XXH_SIZE_OPT >= 1 +# define XXH_PREFETCH(ptr) (void)(ptr) +# elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ +# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# else +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* XXH_NO_PREFETCH */ + + +/* ========================================== + * XXH3 default settings + * ========================================== */ + +#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ + +#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) +# error "default keyset is not large enough" +#endif + +/*! Pseudorandom secret taken directly from FARSH. */ +XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +}; + + +#ifdef XXH_OLD_NAMES +# define kSecret XXH3_kSecret +#endif + +#ifdef XXH_DOXYGEN +/*! + * @brief Calculates a 32-bit to 64-bit long multiply. + * + * Implemented as a macro. + * + * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't + * need to (but it shouldn't need to anyways, it is about 7 instructions to do + * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we + * use that instead of the normal method. + * + * If you are compiling for platforms like Thumb-1 and don't have a better option, + * you may also want to write your own long multiply routine here. 
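+ *
+ * As a concrete (illustrative) example, XXH_mult32to64(0xFFFFFFFF, 0xFFFFFFFF)
+ * evaluates to 0xFFFFFFFE00000001: only the low 32 bits of each argument
+ * contribute to the 64-bit product.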
+ * + * @param x, y Numbers to be multiplied + * @return 64-bit product of the low 32 bits of @p x and @p y. + */ +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64(xxh_u64 x, xxh_u64 y) +{ + return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); +} +#elif defined(_MSC_VER) && defined(_M_IX86) +# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) +#else +/* + * Downcast + upcast is usually better than masking on older compilers like + * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. + * + * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands + * and perform a full 64x64 multiply -- entirely redundant on 32-bit. + */ +# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) +#endif + +/*! + * @brief Calculates a 64->128-bit long multiply. + * + * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar + * version. + * + * @param lhs , rhs The 64-bit integers to be multiplied + * @return The 128-bit result represented in an @ref XXH128_hash_t. + */ +static XXH128_hash_t +XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) +{ + /* + * GCC/Clang __uint128_t method. + * + * On most 64-bit targets, GCC and Clang define a __uint128_t type. + * This is usually the best way as it usually uses a native long 64-bit + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. + * + * Usually. + * + * Despite being a 32-bit platform, Clang (and emscripten) define this type + * despite not having the arithmetic for it. This results in a laggy + * compiler builtin call which calculates a full 128-bit multiply. + * In that case it is best to use the portable one. + * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 + */ +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \ + && defined(__SIZEOF_INT128__) \ + || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + + __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; + XXH128_hash_t r128; + r128.low64 = (xxh_u64)(product); + r128.high64 = (xxh_u64)(product >> 64); + return r128; + + /* + * MSVC for x64's _umul128 method. + * + * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); + * + * This compiles to single operand MUL on x64. + */ +#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC) + +#ifndef _MSC_VER +# pragma intrinsic(_umul128) +#endif + xxh_u64 product_high; + xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); + XXH128_hash_t r128; + r128.low64 = product_low; + r128.high64 = product_high; + return r128; + + /* + * MSVC for ARM64's __umulh method. + * + * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method. + */ +#elif defined(_M_ARM64) || defined(_M_ARM64EC) + +#ifndef _MSC_VER +# pragma intrinsic(__umulh) +#endif + XXH128_hash_t r128; + r128.low64 = lhs * rhs; + r128.high64 = __umulh(lhs, rhs); + return r128; + +#else + /* + * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. + * + * This is a fast and simple grade school multiply, which is shown below + * with base 10 arithmetic instead of base 0x100000000. 
+ * + * 9 3 // D2 lhs = 93 + * x 7 5 // D2 rhs = 75 + * ---------- + * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 + * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 + * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 + * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 + * --------- + * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 + * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 + * --------- + * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 + * + * The reasons for adding the products like this are: + * 1. It avoids manual carry tracking. Just like how + * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. + * This avoids a lot of complexity. + * + * 2. It hints for, and on Clang, compiles to, the powerful UMAAL + * instruction available in ARM's Digital Signal Processing extension + * in 32-bit ARMv6 and later, which is shown below: + * + * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) + * { + * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; + * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); + * *RdHi = (xxh_u32)(product >> 32); + * } + * + * This instruction was designed for efficient long multiplication, and + * allows this to be calculated in only 4 instructions at speeds + * comparable to some 64-bit ALUs. + * + * 3. It isn't terrible on other platforms. Usually this will be a couple + * of 32-bit ADD/ADCs. + */ + + /* First calculate all of the cross products. */ + xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); + xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); + xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); + xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); + + /* Now add the products together. These will never overflow. */ + xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; + xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; + xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); + + XXH128_hash_t r128; + r128.low64 = lower; + r128.high64 = upper; + return r128; +#endif +} + +/*! + * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it. + * + * The reason for the separate function is to prevent passing too many structs + * around by value. This will hopefully inline the multiply, but we don't force it. + * + * @param lhs , rhs The 64-bit integers to multiply + * @return The low 64 bits of the product XOR'd by the high 64 bits. + * @see XXH_mult64to128() + */ +static xxh_u64 +XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) +{ + XXH128_hash_t product = XXH_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; +} + +/*! Seems to produce slightly better code on GCC for some reason. 
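The portable path above is easier to trust after checking it once against the compiler. The following standalone sketch is an illustration only (not part of the vendored xxHash file): it redoes one 64x64 to 128-bit product with the same four partial products and compares the result against a native __uint128_t multiply, so it assumes a GCC/Clang-style compiler; the two input constants are arbitrary.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t const lhs = 0x9E3779B185EBCA87ULL, rhs = 0xC2B2AE3D27D4EB4FULL;

    /* Same decomposition as the scalar branch of XXH_mult64to128(). */
    uint64_t const lo_lo = (lhs & 0xFFFFFFFF) * (rhs & 0xFFFFFFFF);
    uint64_t const hi_lo = (lhs >> 32)        * (rhs & 0xFFFFFFFF);
    uint64_t const lo_hi = (lhs & 0xFFFFFFFF) * (rhs >> 32);
    uint64_t const hi_hi = (lhs >> 32)        * (rhs >> 32);
    uint64_t const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
    uint64_t const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
    uint64_t const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);

    /* Reference result from the compiler's 128-bit integer support. */
    __uint128_t const ref = (__uint128_t)lhs * rhs;
    printf("low  matches: %d\n", lower == (uint64_t)ref);
    printf("high matches: %d\n", upper == (uint64_t)(ref >> 64));
    return 0;
}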
*/ +XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) +{ + XXH_ASSERT(0 <= shift && shift < 64); + return v64 ^ (v64 >> shift); +} + +/* + * This is a fast avalanche stage, + * suitable when input bits are already partially mixed + */ +static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) +{ + h64 = XXH_xorshift64(h64, 37); + h64 *= 0x165667919E3779F9ULL; + h64 = XXH_xorshift64(h64, 32); + return h64; +} + +/* + * This is a stronger avalanche, + * inspired by Pelle Evensen's rrmxmx + * preferable when input has not been previously mixed + */ +static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) +{ + /* this mix is inspired by Pelle Evensen's rrmxmx */ + h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24); + h64 *= 0x9FB21C651E98DF25ULL; + h64 ^= (h64 >> 35) + len ; + h64 *= 0x9FB21C651E98DF25ULL; + return XXH_xorshift64(h64, 28); +} + + +/* ========================================== + * Short keys + * ========================================== + * One of the shortcomings of XXH32 and XXH64 was that their performance was + * sub-optimal on short lengths. It used an iterative algorithm which strongly + * favored lengths that were a multiple of 4 or 8. + * + * Instead of iterating over individual inputs, we use a set of single shot + * functions which piece together a range of lengths and operate in constant time. + * + * Additionally, the number of multiplies has been significantly reduced. This + * reduces latency, especially when emulating 64-bit multiplies on 32-bit. + * + * Depending on the platform, this may or may not be faster than XXH32, but it + * is almost guaranteed to be faster than XXH64. + */ + +/* + * At very short lengths, there isn't enough input to fully hide secrets, or use + * the entire secret. + * + * There is also only a limited amount of mixing we can do before significantly + * impacting performance. + * + * Therefore, we use different sections of the secret and always mix two secret + * samples with an XOR. This should have no effect on performance on the + * seedless or withSeed variants because everything _should_ be constant folded + * by modern compilers. + * + * The XOR mixing hides individual parts of the secret and increases entropy. + * + * This adds an extra layer of strength for custom secrets. 
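To see what the avalanche stages above buy, here is a standalone sketch (illustration only, not part of the vendored file; the helper name avalanche_sketch is made up for this example). It restates the XXH3_avalanche() steps with plain integers and counts how many output bits change when a single input bit flips; it assumes a GCC/Clang __builtin_popcountll.

#include <stdint.h>
#include <stdio.h>

/* Plain-C restatement of the XXH3_avalanche() steps above:
 * xorshift by 37, multiply by the odd constant, xorshift by 32. */
static uint64_t avalanche_sketch(uint64_t h)
{
    h ^= h >> 37;
    h *= 0x165667919E3779F9ULL;
    h ^= h >> 32;
    return h;
}

int main(void)
{
    uint64_t const a = avalanche_sketch(0x1234);
    uint64_t const b = avalanche_sketch(0x1234 ^ 1);   /* flip one input bit */
    printf("output bits changed: %d of 64\n", __builtin_popcountll(a ^ b));
    return 0;
}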
+ */ +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combined = { input[0], 0x01, input[0], input[0] } + * len = 2: combined = { input[1], 0x02, input[0], input[1] } + * len = 3: combined = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; + return XXH64_avalanche(keyed); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input1 = XXH_readLE32(input); + xxh_u32 const input2 = XXH_readLE32(input + len - 4); + xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; + xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); + xxh_u64 const keyed = input64 ^ bitflip; + return XXH3_rrmxmx(keyed, len); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; + xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; + xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; + xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; + xxh_u64 const acc = len + + XXH_swap64(input_lo) + input_hi + + XXH3_mul128_fold64(input_lo, input_hi); + return XXH3_avalanche(acc); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); + if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXH3_len_1to3_64b(input, len, secret, seed); + return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); + } +} + +/* + * DISCLAIMER: There are known *seed-dependent* multicollisions here due to + * multiplication by zero, affecting hashes of lengths 17 to 240. + * + * However, they are very unlikely. + * + * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all + * unseeded non-cryptographic hashes, it does not attempt to defend itself + * against specially crafted inputs, only random inputs. + * + * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes + * cancelling out the secret is taken an arbitrary number of times (addressed + * in XXH3_accumulate_512), this collision is very unlikely with random inputs + * and/or proper seeding: + * + * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a + * function that is only called up to 16 times per hash with up to 240 bytes of + * input. 
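The `combined` layout used by XXH3_len_1to3_64b() above is easiest to see with concrete bytes. This standalone sketch (illustration only, not from the vendored file) unrolls the packing for a two-byte input { 0xAA, 0xBB }; read low byte to high byte, the packed word is { input[1], 0x02, input[0], input[1] }, exactly as the comment describes.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint8_t const input[2] = { 0xAA, 0xBB };
    size_t  const len = 2;

    uint8_t const c1 = input[0];          /* 0xAA */
    uint8_t const c2 = input[len >> 1];   /* input[1] == 0xBB */
    uint8_t const c3 = input[len - 1];    /* input[1] == 0xBB */
    uint32_t const combined = ((uint32_t)c1 << 16) | ((uint32_t)c2 << 24)
                            | ((uint32_t)c3 <<  0) | ((uint32_t)len << 8);

    printf("combined = 0x%08X\n", (unsigned)combined);   /* prints 0xBBAA02BB */
    return 0;
}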
+ * + * This is not too bad for a non-cryptographic hash function, especially with + * only 64 bit outputs. + * + * The 128-bit variant (which trades some speed for strength) is NOT affected + * by this, although it is always a good idea to use a proper seed if you care + * about strength. + */ +XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) +{ +#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ + /* + * UGLY HACK: + * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in + * slower code. + * + * By forcing seed64 into a register, we disrupt the cost model and + * cause it to scalarize. See `XXH32_round()` + * + * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, + * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on + * GCC 9.2, despite both emitting scalar code. + * + * GCC generates much better scalar code than Clang for the rest of XXH3, + * which is why finding a more optimal codepath is an interest. + */ + XXH_COMPILER_GUARD(seed64); +#endif + { xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 const input_hi = XXH_readLE64(input+8); + return XXH3_mul128_fold64( + input_lo ^ (XXH_readLE64(secret) + seed64), + input_hi ^ (XXH_readLE64(secret+8) - seed64) + ); + } +} + +/* For mid range keys, XXH3 uses a Mum-hash variant. */ +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { xxh_u64 acc = len * XXH_PRIME64_1; +#if XXH_SIZE_OPT >= 1 + /* Smaller and cleaner, but slightly slower. 
*/ + size_t i = (len - 1) / 32; + do { + acc += XXH3_mix16B(input+16 * i, secret+32*i, seed); + acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed); + } while (i-- != 0); +#else + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc += XXH3_mix16B(input+48, secret+96, seed); + acc += XXH3_mix16B(input+len-64, secret+112, seed); + } + acc += XXH3_mix16B(input+32, secret+64, seed); + acc += XXH3_mix16B(input+len-48, secret+80, seed); + } + acc += XXH3_mix16B(input+16, secret+32, seed); + acc += XXH3_mix16B(input+len-32, secret+48, seed); + } + acc += XXH3_mix16B(input+0, secret+0, seed); + acc += XXH3_mix16B(input+len-16, secret+16, seed); +#endif + return XXH3_avalanche(acc); + } +} + +#define XXH3_MIDSIZE_MAX 240 + +XXH_NO_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + #define XXH3_MIDSIZE_STARTOFFSET 3 + #define XXH3_MIDSIZE_LASTOFFSET 17 + + { xxh_u64 acc = len * XXH_PRIME64_1; + int const nbRounds = (int)len / 16; + int i; + for (i=0; i<8; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); + } + acc = XXH3_avalanche(acc); + XXH_ASSERT(nbRounds >= 8); +#if defined(__clang__) /* Clang */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. + * In everywhere else, it uses scalar code. + * + * For 64->128-bit multiplies, even if the NEON was 100% optimal, it + * would still be slower than UMAAL (see XXH_mult64to128). + * + * Unfortunately, Clang doesn't handle the long multiplies properly and + * converts them to the nonexistent "vmulq_u64" intrinsic, which is then + * scalarized into an ugly mess of VMOV.32 instructions. + * + * This mess is difficult to avoid without turning autovectorization + * off completely, but they are usually relatively minor and/or not + * worth it to fix. + * + * This loop is the easiest to fix, as unlike XXH32, this pragma + * _actually works_ because it is a loop vectorization instead of an + * SLP vectorization. + */ + #pragma clang loop vectorize(disable) +#endif + for (i=8 ; i < nbRounds; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); + } + /* last bytes */ + acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); + return XXH3_avalanche(acc); + } +} + + +/* ======= Long Keys ======= */ + +#define XXH_STRIPE_LEN 64 +#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ +#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) + +#ifdef XXH_OLD_NAMES +# define STRIPE_LEN XXH_STRIPE_LEN +# define ACC_NB XXH_ACC_NB +#endif + +XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); + XXH_memcpy(dst, &v64, sizeof(v64)); +} + +/* Several intrinsic functions below are supposed to accept __int64 as argument, + * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . + * However, several environments do not define __int64 type, + * requiring a workaround. 
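The branch ladder in XXH3_len_17to128_64b() above always mixes 16-byte windows in symmetric pairs, one taken from the front of the input and one from the back, each paired with its own 16 bytes of secret. The standalone sketch below (illustration only, not part of the vendored file) just prints which windows get mixed for one example length; the offsets mirror the ladder above.

#include <stddef.h>
#include <stdio.h>

int main(void)
{
    size_t const len = 100;   /* any length in (96, 128] exercises all four levels */
    if (len > 32) {
        if (len > 64) {
            if (len > 96) {
                printf("mix input[48..63]    with secret[96..111]\n");
                printf("mix input[%zu..%zu]  with secret[112..127]\n", len - 64, len - 49);
            }
            printf("mix input[32..47]    with secret[64..79]\n");
            printf("mix input[%zu..%zu]  with secret[80..95]\n", len - 48, len - 33);
        }
        printf("mix input[16..31]    with secret[32..47]\n");
        printf("mix input[%zu..%zu]  with secret[48..63]\n", len - 32, len - 17);
    }
    printf("mix input[0..15]     with secret[0..15]\n");
    printf("mix input[%zu..%zu]  with secret[16..31]\n", len - 16, len - 1);
    return 0;
}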
+ */ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) + typedef int64_t xxh_i64; +#else + /* the following type must have a width of 64-bit */ + typedef long long xxh_i64; +#endif + + +/* + * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. + * + * It is a hardened version of UMAC, based off of FARSH's implementation. + * + * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD + * implementations, and it is ridiculously fast. + * + * We harden it by mixing the original input to the accumulators as well as the product. + * + * This means that in the (relatively likely) case of a multiply by zero, the + * original input is preserved. + * + * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve + * cross-pollination, as otherwise the upper and lower halves would be + * essentially independent. + * + * This doesn't matter on 64-bit hashes since they all get merged together in + * the end, so we skip the extra step. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ + +#if (XXH_VECTOR == XXH_AVX512) \ + || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0) + +#ifndef XXH_TARGET_AVX512 +# define XXH_TARGET_AVX512 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + __m512i* const xacc = (__m512i *) acc; + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + + { + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } +} + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. + * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. 
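Stripped of the SIMD, the scramble just described is a short per-lane operation. The sketch below (illustration only; the name scramble_lane_sketch is made up) restates it with plain 64-bit arithmetic and matches the scalar variant, XXH3_scalarScrambleRound(), that appears further down in this file.

#include <stdint.h>
#include <stdio.h>

/* One accumulator lane of the scramble: xorshift by 47, XOR in the secret
 * word, then multiply by XXH_PRIME32_1 (0x9E3779B1). */
static uint64_t scramble_lane_sketch(uint64_t acc, uint64_t secret_word)
{
    acc ^= acc >> 47;
    acc ^= secret_word;
    acc *= 0x9E3779B1U;
    return acc;
}

int main(void)
{
    printf("%016llx\n",
           (unsigned long long)scramble_lane_sketch(0x0123456789ABCDEFULL, 0x42));
    return 0;
}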
+ */ + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + { __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + + /* xacc[0] *= XXH_PRIME32_1; */ + __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1)); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); + XXH_ASSERT(((size_t)customSecret & 63) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); + __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64)); + + const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret); + __m512i* const dest = ( __m512i*) customSecret; + int i; + XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 63) == 0); + for (i=0; i < nbRounds; ++i) { + /* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void const*', + * this will warn "discards 'const' qualifier". */ + union { + const __m512i* cp; + void* p; + } remote_const_void; + remote_const_void.cp = src + i; + dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_AVX2) \ + || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0) + +#ifndef XXH_TARGET_AVX2 +# define XXH_TARGET_AVX2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { __m256i* const xacc = (__m256i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0); + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64); + (void)(&XXH_writeLE64); + XXH_PREFETCH(customSecret); + { __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64); + + const __m256i* const src = (const __m256i*) ((const void*) XXH3_kSecret); + __m256i* dest = ( __m256i*) customSecret; + +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dest' as modified will cause the compiler: + * - do not extract the secret from sse registers in the internal loop + * - use less common registers, and avoid pushing these reg into stack + */ + XXH_COMPILER_GUARD(dest); +# endif + XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 31) == 0); + + /* GCC -O2 need unroll loop manually */ + dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed); + dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed); + dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed); + dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed); + dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), 
seed); + dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed); + } +} + +#endif + +/* x86dispatch always generates SSE2 */ +#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) + +#ifndef XXH_TARGET_SSE2 +# define XXH_TARGET_SSE2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } } +} + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } +} + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); + +# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 + /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */ + XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) }; + __m128i const seed = _mm_load_si128((__m128i const*)seed64x2); +# else + __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64); +# endif + int i; + + const void* const src16 = XXH3_kSecret; + __m128i* dst16 = (__m128i*) customSecret; +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dest' as modified will cause the compiler: + * - do not extract the secret from sse registers in the internal loop + * - use less common registers, and avoid pushing these reg into stack + */ + XXH_COMPILER_GUARD(dst16); +# endif + XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dst16 & 15) == 0); + + for (i=0; i < nbRounds; ++i) { + dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_NEON) + +/* forward declarations for the scalar routines */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, size_t lane); + +XXH_FORCE_INLINE void +XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT secret, size_t lane); + +/*! + * @internal + * @brief The bulk processing loop for NEON. + * + * The NEON code path is actually partially scalar when running on AArch64. This + * is to optimize the pipelining and can have up to 15% speedup depending on the + * CPU, and it also mitigates some GCC codegen issues. + * + * @see XXH3_NEON_LANES for configuring this and details about this optimization. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512_neon( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0); + { + uint64x2_t* const xacc = (uint64x2_t *) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
*/ + uint8_t const* const xinput = (const uint8_t *) input; + uint8_t const* const xsecret = (const uint8_t *) secret; + + size_t i; + /* AArch64 uses both scalar and neon at the same time */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } + for (i=0; i < XXH3_NEON_LANES / 2; i++) { + uint64x2_t acc_vec = xacc[i]; + /* data_vec = xinput[i]; */ + uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); + uint64x2_t data_key; + uint32x2_t data_key_lo, data_key_hi; + /* acc_vec_2 = swap(data_vec) */ + uint64x2_t acc_vec_2 = vextq_u64(data_vec, data_vec, 1); + /* data_key = data_vec ^ key_vec; */ + data_key = veorq_u64(data_vec, key_vec); + /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (data_key >> 32); + * data_key = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + acc_vec_2 = vmlal_u32 (acc_vec_2, data_key_lo, data_key_hi); + /* xacc[i] += acc_vec_2; */ + acc_vec = vaddq_u64 (acc_vec, acc_vec_2); + xacc[i] = acc_vec; + } + + } +} + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { uint64x2_t* xacc = (uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1); + + size_t i; + /* AArch64 uses both scalar and neon at the same time */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); + } + for (i=0; i < XXH3_NEON_LANES / 2; i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); + uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64 (xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1 */ + uint32x2_t data_key_lo, data_key_hi; + /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (xacc[i] >> 32); + * xacc[i] = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + { /* + * prod_hi = (data_key >> 32) * XXH_PRIME32_1; + * + * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will + * incorrectly "optimize" this: + * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); + * shifted = vshll_n_u32(tmp, 32); + * to this: + * tmp = "vmulq_u64"(a, b); // no such thing! + * shifted = vshlq_n_u64(tmp, 32); + * + * However, unlike SSE, Clang lacks a 64-bit multiply routine + * for NEON, and it scalarizes two 64-bit multiplies instead. + * + * vmull_u32 has the same timing as vmul_u32, and it avoids + * this bug completely. 
+ * See https://bugs.llvm.org/show_bug.cgi?id=39967 + */ + uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); + /* xacc[i] = prod_hi << 32; */ + prod_hi = vshlq_n_u64(prod_hi, 32); + /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */ + xacc[i] = vmlal_u32(prod_hi, data_key_lo, prime); + } + } + } +} + +#endif + +#if (XXH_VECTOR == XXH_VSX) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* presumed aligned */ + unsigned int* const xacc = (unsigned int*) acc; + xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ + xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + /* acc_vec = xacc[i]; */ + xxh_u64x2 acc_vec = (xxh_u64x2)vec_xl(0, xacc + 4 * i); + acc_vec += product; + + /* swap high and low halves */ +#ifdef __s390x__ + acc_vec += vec_permi(data_vec, data_vec, 2); +#else + acc_vec += vec_xxpermdi(data_vec, data_vec, 2); +#endif + /* xacc[i] = acc_vec; */ + vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i); + } +} + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_u64x2* const xacc = (xxh_u64x2*) acc; + const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= XXH_PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } +} + +#endif + +/* scalar variants - universal */ + +/*! + * @internal + * @brief Scalar round for @ref XXH3_accumulate_512_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. 
+ */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, + size_t lane) +{ + xxh_u64* xacc = (xxh_u64*) acc; + xxh_u8 const* xinput = (xxh_u8 const*) input; + xxh_u8 const* xsecret = (xxh_u8 const*) secret; + XXH_ASSERT(lane < XXH_ACC_NB); + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + { + xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8); + xacc[lane ^ 1] += data_val; /* swap adjacent lanes */ + xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + } +} + +/*! + * @internal + * @brief Processes a 64 byte block of data using the scalar path. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + size_t i; + /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */ +#if defined(__GNUC__) && !defined(__clang__) \ + && (defined(__arm__) || defined(__thumb2__)) \ + && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \ + && XXH_SIZE_OPT <= 0 +# pragma GCC unroll 8 +#endif + for (i=0; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } +} + +/*! + * @internal + * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. + */ +XXH_FORCE_INLINE void +XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT secret, + size_t lane) +{ + xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); + XXH_ASSERT(lane < XXH_ACC_NB); + { + xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8); + xxh_u64 acc64 = xacc[lane]; + acc64 = XXH_xorshift64(acc64, 47); + acc64 ^= key64; + acc64 *= XXH_PRIME32_1; + xacc[lane] = acc64; + } +} + +/*! + * @internal + * @brief Scrambles the accumulators after a large chunk has been read + */ +XXH_FORCE_INLINE void +XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + size_t i; + for (i=0; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); + } +} + +XXH_FORCE_INLINE void +XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + /* + * We need a separate pointer for the hack below, + * which requires a non-const pointer. + * Any decent compiler will optimize this out otherwise. + */ + const xxh_u8* kSecretPtr = XXH3_kSecret; + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + +#if defined(__clang__) && defined(__aarch64__) + /* + * UGLY HACK: + * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are + * placed sequentially, in order, at the top of the unrolled loop. + * + * While MOVK is great for generating constants (2 cycles for a 64-bit + * constant compared to 4 cycles for LDR), it fights for bandwidth with + * the arithmetic instructions. + * + * I L S + * MOVK + * MOVK + * MOVK + * MOVK + * ADD + * SUB STR + * STR + * By forcing loads from memory (as the asm line causes Clang to assume + * that XXH3_kSecretPtr has been changed), the pipelines are used more + * efficiently: + * I L S + * LDR + * ADD LDR + * SUB STR + * STR + * + * See XXH3_NEON_LANES for details on the pipsline. 
+ * + * XXH3_64bits_withSeed, len == 256, Snapdragon 835 + * without hack: 2654.4 MB/s + * with hack: 3202.9 MB/s + */ + XXH_COMPILER_GUARD(kSecretPtr); +#endif + /* + * Note: in debug mode, this overrides the asm optimization + * and Clang will emit MOVK chains again. + */ + XXH_ASSERT(kSecretPtr == XXH3_kSecret); + + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + for (i=0; i < nbRounds; i++) { + /* + * The asm hack causes Clang to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. + */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo); + XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi); + } } +} + + +typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*); +typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*); +typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); + + +#if (XXH_VECTOR == XXH_AVX512) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx512 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 + +#elif (XXH_VECTOR == XXH_AVX2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 + +#elif (XXH_VECTOR == XXH_SSE2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_sse2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 + +#elif (XXH_VECTOR == XXH_NEON) + +#define XXH3_accumulate_512 XXH3_accumulate_512_neon +#define XXH3_scrambleAcc XXH3_scrambleAcc_neon +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_VSX) + +#define XXH3_accumulate_512 XXH3_accumulate_512_vsx +#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#else /* scalar */ + +#define XXH3_accumulate_512 XXH3_accumulate_512_scalar +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#endif + +#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */ +# undef XXH3_initCustomSecret +# define XXH3_initCustomSecret XXH3_initCustomSecret_scalar +#endif + +#ifndef XXH_PREFETCH_DIST +# ifdef __clang__ +# define XXH_PREFETCH_DIST 320 +# else +# if (XXH_VECTOR == XXH_AVX512) +# define XXH_PREFETCH_DIST 512 +# else +# define XXH_PREFETCH_DIST 384 +# endif +# endif /* __clang__ */ +#endif /* XXH_PREFETCH_DIST */ + +/* + * XXH3_accumulate() + * Loops over XXH3_accumulate_512(). 
+ * Assumption: nbStripes will not overflow the secret size + */ +XXH_FORCE_INLINE void +XXH3_accumulate( xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes, + XXH3_f_accumulate_512 f_acc512) +{ + size_t n; + for (n = 0; n < nbStripes; n++ ) { + const xxh_u8* const in = input + n*XXH_STRIPE_LEN; + XXH_PREFETCH(in + XXH_PREFETCH_DIST); + f_acc512(acc, + in, + secret + n*XXH_SECRET_CONSUME_RATE); + } +} + +XXH_FORCE_INLINE void +XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble) +{ + size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; + size_t const nb_blocks = (len - 1) / block_len; + + size_t n; + + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512); + f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); + } + + /* last partial block */ + XXH_ASSERT(len > XXH_STRIPE_LEN); + { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512); + + /* last stripe */ + { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; +#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */ + f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); + } } +} + +XXH_FORCE_INLINE xxh_u64 +XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) +{ + return XXH3_mul128_fold64( + acc[0] ^ XXH_readLE64(secret), + acc[1] ^ XXH_readLE64(secret+8) ); +} + +static XXH64_hash_t +XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) +{ + xxh_u64 result64 = start; + size_t i = 0; + + for (i = 0; i < 4; i++) { + result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); +#if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Prevent autovectorization on Clang ARMv7-a. Exact same problem as + * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. 
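Plugging the default sizes into the formulas above makes the long-input structure concrete: with the 192-byte default secret, a stripe is 64 bytes and the secret advances 8 bytes per stripe, so a block is 16 stripes (1024 bytes) and the accumulators are scrambled once per block. A standalone sketch of that arithmetic (illustration only, not part of the vendored file):

#include <stdio.h>

int main(void)
{
    int const stripe_len        = 64;    /* XXH_STRIPE_LEN */
    int const consume_rate      = 8;     /* XXH_SECRET_CONSUME_RATE */
    int const secret_size       = 192;   /* XXH_SECRET_DEFAULT_SIZE */
    int const stripes_per_block = (secret_size - stripe_len) / consume_rate;
    int const block_len         = stripe_len * stripes_per_block;

    printf("stripes per block = %d\n", stripes_per_block);   /* 16 */
    printf("block length      = %d bytes\n", block_len);     /* 1024 */
    return 0;
}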
+ * XXH3_64bits, len == 256, Snapdragon 835: + * without hack: 2063.7 MB/s + * with hack: 2560.7 MB/s + */ + XXH_COMPILER_GUARD(result64); +#endif + } + + return XXH3_avalanche(result64); +} + +#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ + XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 } + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, + const void* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + /* do not align on 8, so that the secret is different from the accumulator */ +#define XXH_SECRET_MERGEACCS_START 11 + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1); +} + +/* + * It's important for performance to transmit secret's size (when it's static) + * so that the compiler can properly optimize the vectorized loop. + * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set. + */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc); +} + +/* + * It's preferable for performance that XXH3_hashLong is not inlined, + * as it results in a smaller function for small data, easier to the instruction cache. + * Note that inside this no_inline function, we do inline the internal loop, + * and provide a statically defined secret size to allow optimization of vector loop. + */ +XXH_NO_INLINE XXH_PUREF XXH64_hash_t +XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default XXH3_kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. 
+ */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, + XXH64_hash_t seed, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ +#if XXH_SIZE_OPT <= 0 + if (seed == 0) + return XXH3_hashLong_64b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc512, f_scramble); +#endif + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), + f_acc512, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const void* input, size_t len, + XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_64b_withSeed_internal(input, len, seed, + XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + + +typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong64_f f_hashLong) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secretLen` condition is not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + * Also, note that function signature doesn't offer room to return an error. + */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen); +} + + +/* === Public entry point === */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t length) +{ + return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecret(const void* input, size_t length, const void* secret, size_t secretSize) +{ + return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSeed(const void* input, size_t length, XXH64_hash_t seed) +{ + return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecretandSeed(const void* input, size_t length, const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + if (length <= XXH3_MIDSIZE_MAX) + return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); + return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize); +} + + +/* === XXH3 streaming === */ +#ifndef XXH_NO_STREAM +/* + * Malloc's a pointer that is always aligned to align. + * + * This must be freed with `XXH_alignedFree()`. 
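For completeness, here is a minimal caller of the one-shot entry points defined above. It is an illustration only, not part of the vendored file, and it assumes the header is reachable as "xxhash.h" on the include path with its implementation compiled into the build.

#include <stdio.h>
#include <string.h>
#include "xxhash.h"

int main(void)
{
    char const msg[] = "hello aurora";

    XXH64_hash_t const h1 = XXH3_64bits(msg, strlen(msg));
    XXH64_hash_t const h2 = XXH3_64bits_withSeed(msg, strlen(msg), 42);

    printf("XXH3_64bits          = %016llx\n", (unsigned long long)h1);
    printf("XXH3_64bits_withSeed = %016llx\n", (unsigned long long)h2);
    return 0;
}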
+ * + * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte + * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 + * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. + * + * This underalignment previously caused a rather obvious crash which went + * completely unnoticed due to XXH3_createState() not actually being tested. + * Credit to RedSpah for noticing this bug. + * + * The alignment is done manually: Functions like posix_memalign or _mm_malloc + * are avoided: To maintain portability, we would have to write a fallback + * like this anyways, and besides, testing for the existence of library + * functions without relying on external build tools is impossible. + * + * The method is simple: Overallocate, manually align, and store the offset + * to the original behind the returned pointer. + * + * Align must be a power of 2 and 8 <= align <= 128. + */ +static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align) +{ + XXH_ASSERT(align <= 128 && align >= 8); /* range check */ + XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ + XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ + { /* Overallocate to make room for manual realignment and an offset byte */ + xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); + if (base != NULL) { + /* + * Get the offset needed to align this pointer. + * + * Even if the returned pointer is aligned, there will always be + * at least one byte to store the offset to the original pointer. + */ + size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ + /* Add the offset for the now-aligned pointer */ + xxh_u8* ptr = base + offset; + + XXH_ASSERT((size_t)ptr % align == 0); + + /* Store the offset immediately before the returned pointer. */ + ptr[-1] = (xxh_u8)offset; + return ptr; + } + return NULL; + } +} +/* + * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass + * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. + */ +static void XXH_alignedFree(void* p) +{ + if (p != NULL) { + xxh_u8* ptr = (xxh_u8*)p; + /* Get the offset byte we added in XXH_malloc. */ + xxh_u8 offset = ptr[-1]; + /* Free the original malloc'd pointer */ + xxh_u8* base = ptr - offset; + XXH_free(base); + } +} +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) +{ + XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); + if (state==NULL) return NULL; + XXH3_INITSTATE(state); + return state; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) +{ + XXH_alignedFree(statePtr); + return XXH_OK; +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API void +XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state) +{ + XXH_memcpy(dst_state, src_state, sizeof(*dst_state)); +} + +static void +XXH3_reset_internal(XXH3_state_t* statePtr, + XXH64_hash_t seed, + const void* secret, size_t secretSize) +{ + size_t const initStart = offsetof(XXH3_state_t, bufferedSize); + size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart; + XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart); + XXH_ASSERT(statePtr != NULL); + /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */ + memset((char*)statePtr + initStart, 0, initLength); + statePtr->acc[0] = XXH_PRIME32_3; + statePtr->acc[1] = XXH_PRIME64_1; + statePtr->acc[2] = XXH_PRIME64_2; + statePtr->acc[3] = XXH_PRIME64_3; + statePtr->acc[4] = XXH_PRIME64_4; + statePtr->acc[5] = XXH_PRIME32_2; + statePtr->acc[6] = XXH_PRIME64_5; + statePtr->acc[7] = XXH_PRIME32_1; + statePtr->seed = seed; + statePtr->useSeed = (seed != 0); + statePtr->extSecret = (const unsigned char*)secret; + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; + statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset(XXH3_state_t* statePtr) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, secret, secretSize); + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + if (seed==0) return XXH3_64bits_reset(statePtr); + if ((seed != statePtr->seed) || (statePtr->extSecret != NULL)) + XXH3_initCustomSecret(statePtr->customSecret, seed); + XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed64) +{ + if (statePtr == NULL) return XXH_ERROR; + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + XXH3_reset_internal(statePtr, seed64, secret, secretSize); + statePtr->useSeed = 1; /* always, even if seed64==0 */ + return XXH_OK; +} + +/* Note : when XXH3_consumeStripes() is invoked, + * there must be a guarantee that at least one more byte must be consumed from input + * so that the function can blindly consume all stripes using the "normal" secret segment */ +XXH_FORCE_INLINE void +XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, + size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock, + const xxh_u8* XXH_RESTRICT input, size_t nbStripes, + const xxh_u8* XXH_RESTRICT secret, size_t secretLimit, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */ + XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); + if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) { + /* need a scrambling operation */ + size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr; + size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock; + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512); + f_scramble(acc, secret + secretLimit); + XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512); + *nbStripesSoFarPtr = nbStripesAfterBlock; + } else { + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512); + *nbStripesSoFarPtr += nbStripes; + } +} + +#ifndef XXH3_STREAM_USE_STACK +# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */ +# define XXH3_STREAM_USE_STACK 1 +# endif +#endif +/* + * Both XXH3_64bits_update and XXH3_128bits_update use this routine. + */ +XXH_FORCE_INLINE XXH_errorcode +XXH3_update(XXH3_state_t* XXH_RESTRICT const state, + const xxh_u8* XXH_RESTRICT input, size_t len, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + XXH_ASSERT(state != NULL); + { const xxh_u8* const bEnd = input + len; + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; +#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 + /* For some reason, gcc and MSVC seem to suffer greatly + * when operating accumulators directly into state. + * Operating into stack space seems to enable proper optimization. 
+ * clang, on the other hand, doesn't seem to need this trick */
+ XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc));
+#else
+ xxh_u64* XXH_RESTRICT const acc = state->acc;
+#endif
+ state->totalLen += len;
+ XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
+
+ /* small input : just fill in tmp buffer */
+ if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {
+ XXH_memcpy(state->buffer + state->bufferedSize, input, len);
+ state->bufferedSize += (XXH32_hash_t)len;
+ return XXH_OK;
+ }
+
+ /* total input is now > XXH3_INTERNALBUFFER_SIZE */
+ #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
+ XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */
+
+ /*
+ * Internal buffer is partially filled (always, except at beginning)
+ * Complete it, then consume it.
+ */
+ if (state->bufferedSize) {
+ size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
+ XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
+ input += loadSize;
+ XXH3_consumeStripes(acc,
+ &state->nbStripesSoFar, state->nbStripesPerBlock,
+ state->buffer, XXH3_INTERNALBUFFER_STRIPES,
+ secret, state->secretLimit,
+ f_acc512, f_scramble);
+ state->bufferedSize = 0;
+ }
+ XXH_ASSERT(input < bEnd);
+
+ /* large input to consume : ingest per full block */
+ if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) {
+ size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
+ XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar);
+ /* join to current block's end */
+ { size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar;
+ XXH_ASSERT(nbStripesToEnd <= nbStripes);
+ XXH3_accumulate(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd, f_acc512);
+ f_scramble(acc, secret + state->secretLimit);
+ state->nbStripesSoFar = 0;
+ input += nbStripesToEnd * XXH_STRIPE_LEN;
+ nbStripes -= nbStripesToEnd;
+ }
+ /* consume per entire blocks */
+ while(nbStripes >= state->nbStripesPerBlock) {
+ XXH3_accumulate(acc, input, secret, state->nbStripesPerBlock, f_acc512);
+ f_scramble(acc, secret + state->secretLimit);
+ input += state->nbStripesPerBlock * XXH_STRIPE_LEN;
+ nbStripes -= state->nbStripesPerBlock;
+ }
+ /* consume last partial block */
+ XXH3_accumulate(acc, input, secret, nbStripes, f_acc512);
+ input += nbStripes * XXH_STRIPE_LEN;
+ XXH_ASSERT(input < bEnd); /* at least some bytes left */
+ state->nbStripesSoFar = nbStripes;
+ /* buffer predecessor of last partial stripe */
+ XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
+ XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN);
+ } else {
+ /* content to consume <= block size */
+ /* Consume input by a multiple of internal buffer size */
+ if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
+ const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
+ do {
+ XXH3_consumeStripes(acc,
+ &state->nbStripesSoFar, state->nbStripesPerBlock,
+ input, XXH3_INTERNALBUFFER_STRIPES,
+ secret, state->secretLimit,
+ f_acc512, f_scramble);
+ input += XXH3_INTERNALBUFFER_SIZE;
+ } while (input < limit);
+ /* buffer predecessor of last partial stripe */
+ XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
+ }
+ }
+
+ /* Some remaining input (always) : buffer it */
+ XXH_ASSERT(input < bEnd);
+ XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
+ XXH_ASSERT(state->bufferedSize == 0);
+ XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
+
state->bufferedSize = (XXH32_hash_t)(bEnd-input); +#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 + /* save stack accumulators into state */ + memcpy(state->acc, acc, sizeof(acc)); +#endif + } + + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, + XXH3_accumulate_512, XXH3_scrambleAcc); +} + + +XXH_FORCE_INLINE void +XXH3_digest_long (XXH64_hash_t* acc, + const XXH3_state_t* state, + const unsigned char* secret) +{ + /* + * Digest on a local copy. This way, the state remains unaltered, and it can + * continue ingesting more input afterwards. + */ + XXH_memcpy(acc, state->acc, sizeof(state->acc)); + if (state->bufferedSize >= XXH_STRIPE_LEN) { + size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN; + size_t nbStripesSoFar = state->nbStripesSoFar; + XXH3_consumeStripes(acc, + &nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, nbStripes, + secret, state->secretLimit, + XXH3_accumulate_512, XXH3_scrambleAcc); + /* last stripe */ + XXH3_accumulate_512(acc, + state->buffer + state->bufferedSize - XXH_STRIPE_LEN, + secret + state->secretLimit - XXH_SECRET_LASTACC_START); + } else { /* bufferedSize < XXH_STRIPE_LEN */ + xxh_u8 lastStripe[XXH_STRIPE_LEN]; + size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; + XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */ + XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + XXH3_accumulate_512(acc, + lastStripe, + secret + state->secretLimit - XXH_SECRET_LASTACC_START); + } +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) +{ + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; + XXH3_digest_long(acc, state, secret); + return XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * XXH_PRIME64_1); + } + /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ + if (state->useSeed) + return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), + secret, state->secretLimit + XXH_STRIPE_LEN); +} +#endif /* !XXH_NO_STREAM */ + + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. + * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). 
+ */ + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. */ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); + xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; + xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; + xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; + XXH128_hash_t h128; + h128.low64 = XXH64_avalanche(keyed_lo); + h128.high64 = XXH64_avalanche(keyed_hi); + return h128; + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input_lo = XXH_readLE32(input); + xxh_u32 const input_hi = XXH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); + xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; + xxh_u64 const keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. */ + XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = XXH_xorshift64(m128.low64, 35); + m128.low64 *= 0x9FB21C651E98DF25ULL; + m128.low64 = XXH_xorshift64(m128.low64, 28); + m128.high64 = XXH3_avalanche(m128.high64); + return m128; + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; + xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; + xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 input_hi = XXH_readLE64(input + len - 8); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. + */ + m128.low64 += (xxh_u64)(len - 1) << 54; + input_hi ^= bitfliph; + /* + * Add the high 32 bits of input_hi to the high 32 bits of m128, then + * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to + * the high 64 bits of m128. + * + * The best approach to this operation is different on 32-bit and 64-bit. + */ + if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ + /* + * 32-bit optimized version, which is more readable. 
+ * + * On 32-bit, it removes an ADC and delays a dependency between the two + * halves of m128.high64, but it generates an extra mask on 64-bit. + */ + m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2); + } else { + /* + * 64-bit optimized (albeit more confusing) version. + * + * Uses some properties of addition and multiplication to remove the mask: + * + * Let: + * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) + * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) + * c = XXH_PRIME32_2 + * + * a + (b * c) + * Inverse Property: x + y - x == y + * a + (b * (1 + c - 1)) + * Distributive Property: x * (y + z) == (x * y) + (x * z) + * a + (b * 1) + (b * (c - 1)) + * Identity Property: x * 1 == x + * a + b + (b * (c - 1)) + * + * Substitute a, b, and c: + * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + * + * Since input_hi.hi + input_hi.lo == input_hi, we get this: + * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + */ + m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); + } + /* m128 ^= XXH_swap64(m128 >> 64); */ + m128.low64 ^= XXH_swap64(m128.high64); + + { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ + XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); + h128.high64 += m128.high64 * XXH_PRIME64_2; + + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = XXH3_avalanche(h128.high64); + return h128; + } } +} + +/* + * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN + */ +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); + if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); + if (len) return XXH3_len_1to3_128b(input, len, secret, seed); + { XXH128_hash_t h128; + xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); + xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); + h128.low64 = XXH64_avalanche(seed ^ bitflipl); + h128.high64 = XXH64_avalanche( seed ^ bitfliph); + return h128; + } } +} + +/* + * A bit slower than XXH3_mix16B, but handles multiply by zero better. + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + +#if XXH_SIZE_OPT >= 1 + { + /* Smaller, but slightly slower. 
*/ + size_t i = (len - 1) / 32; + do { + acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed); + } while (i-- != 0); + } +#else + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); + } + acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); + } + acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); + } + acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); +#endif + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ((len - seed) * XXH_PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_NO_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + { XXH128_hash_t acc; + int const nbRounds = (int)len / 32; + int i; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + for (i=0; i<4; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + (32 * i), + seed); + } + acc.low64 = XXH3_avalanche(acc.low64); + acc.high64 = XXH3_avalanche(acc.high64); + XXH_ASSERT(nbRounds >= 4); + for (i=4 ; i < nbRounds; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), + seed); + } + /* last bytes */ + acc = XXH128_mix32B(acc, + input + len - 16, + input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, + 0ULL - seed); + + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ((len - seed) * XXH_PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)len * XXH_PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + secretSize + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)len * XXH_PRIME64_2)); + return h128; + } +} + +/* + * It's important for performance that XXH3_hashLong() is not inlined. 
+ */ +XXH_NO_INLINE XXH_PUREF XXH128_hash_t +XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_accumulate_512, XXH3_scrambleAcc); +} + +/* + * It's important for performance to pass @p secretLen (when it's static) + * to the compiler, so that it can properly optimize the vectorized loop. + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen, + XXH3_accumulate_512, XXH3_scrambleAcc); +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ + if (seed64 == 0) + return XXH3_hashLong_128b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc512, f_scramble); + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed64); + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret), + f_acc512, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_128b_withSeed_internal(input, len, seed64, + XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + +typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const void* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_128bits_internal(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong128_f f_hl128) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hl128(input, len, seed64, secret, secretLen); +} + + +/* === Public XXH128 API === */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len) +{ + return XXH3_128bits_internal(input, len, 0, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_default); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + return XXH3_128bits_internal(input, len, 0, + (const xxh_u8*)secret, secretSize, + XXH3_hashLong_128b_withSecret); +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_internal(input, len, seed, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_withSeed); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); + return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH128(const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_withSeed(input, len, seed); +} + + +/* === XXH3 128-bit streaming === */ +#ifndef XXH_NO_STREAM +/* + * All initialization and update functions are identical to 64-bit streaming variant. + * The only difference is the finalization routine. + */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset(XXH3_state_t* statePtr) +{ + return XXH3_64bits_reset(statePtr); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +{ + return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + return XXH3_64bits_reset_withSeed(statePtr, seed); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, + XXH3_accumulate_512, XXH3_scrambleAcc); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state) +{ + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; + XXH3_digest_long(acc, state, secret); + XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * XXH_PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + state->secretLimit + XXH_STRIPE_LEN + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)state->totalLen * XXH_PRIME64_2)); + return h128; + } + } + /* len <= XXH3_MIDSIZE_MAX : short code */ + if (state->seed) + return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), + secret, state->secretLimit + XXH_STRIPE_LEN); +} +#endif /* !XXH_NO_STREAM */ +/* 128-bit utility functions */ + +#include /* memcmp, memcpy */ + +/* return : 1 is equal, 0 if different */ +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) +{ + /* note : XXH128_hash_t is compact, it has no padding byte */ + return !(memcmp(&h1, &h2, sizeof(h1))); +} + +/* This prototype is compatible with stdlib's qsort(). + * @return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) +{ + XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; + XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; + int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); + /* note : bets that, in most cases, hash values are different */ + if (hcmp) return hcmp; + return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); +} + + +/*====== Canonical representation ======*/ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API void +XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) { + hash.high64 = XXH_swap64(hash.high64); + hash.low64 = XXH_swap64(hash.low64); + } + XXH_memcpy(dst, &hash.high64, sizeof(hash.high64)); + XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH128_hashFromCanonical(const XXH128_canonical_t* src) +{ + XXH128_hash_t h; + h.high64 = XXH_readBE64(src); + h.low64 = XXH_readBE64(src->digest + 8); + return h; +} + + + +/* ========================================== + * Secret generators + * ========================================== + */ +#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x)) + +XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128) +{ + XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 ); + XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 ); +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize) +{ +#if (XXH_DEBUGLEVEL >= 1) + XXH_ASSERT(secretBuffer != NULL); + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); +#else + /* production mode, assert() are disabled */ + if (secretBuffer == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; +#endif + + if (customSeedSize == 0) { + customSeed = XXH3_kSecret; + customSeedSize = XXH_SECRET_DEFAULT_SIZE; + } +#if (XXH_DEBUGLEVEL >= 1) + XXH_ASSERT(customSeed != NULL); +#else + if (customSeed == NULL) return XXH_ERROR; +#endif + + /* Fill secretBuffer with a copy of customSeed - repeat as needed */ + { size_t pos = 0; + while (pos < secretSize) { + size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize); + memcpy((char*)secretBuffer + pos, customSeed, toCopy); + pos += toCopy; + } } + + { size_t const nbSeg16 = secretSize / 16; + size_t n; + XXH128_canonical_t scrambler; + XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0)); + for (n=0; n + +extern "C" { +#else +#include "stdint.h" +#include "stdbool.h" +#endif + +typedef enum { + BACKEND_AUTO, + BACKEND_D3D12, + BACKEND_METAL, + BACKEND_VULKAN, + BACKEND_OPENGL, + BACKEND_OPENGLES, + BACKEND_WEBGPU, + BACKEND_NULL, +} AuroraBackend; + +typedef enum { + LOG_DEBUG, + LOG_INFO, + LOG_WARNING, + LOG_ERROR, + LOG_FATAL, +} AuroraLogLevel; + +typedef struct { + uint32_t width; + uint32_t height; + uint32_t fb_width; + uint32_t fb_height; + float scale; +} AuroraWindowSize; + +typedef struct AuroraEvent AuroraEvent; + +typedef void (*AuroraLogCallback)(AuroraLogLevel level, const char* message, unsigned int len); +typedef void (*AuroraImGuiInitCallback)(const AuroraWindowSize* size); + +typedef struct { + const char* appName; + const char* configPath; + AuroraBackend desiredBackend; + uint32_t msaa; + uint16_t maxTextureAnisotropy; + bool startFullscreen; + uint32_t windowWidth; + uint32_t windowHeight; + void* iconRGBA8; + uint32_t iconWidth; + uint32_t iconHeight; + AuroraLogCallback logCallback; + AuroraImGuiInitCallback imGuiInitCallback; +} AuroraConfig; + +typedef struct { + AuroraBackend backend; + const char* configPath; + AuroraWindowSize windowSize; +} AuroraInfo; + +AuroraInfo aurora_initialize(int argc, char* argv[], const AuroraConfig* config); +void aurora_shutdown(); +const AuroraEvent* aurora_update(); +bool aurora_begin_frame(); +void aurora_end_frame(); + +#ifndef NDEBUG +#define AURORA_GFX_DEBUG_GROUPS +#endif + +void push_debug_group(const char* label); +void pop_debug_group(); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/aurora/event.h b/include/aurora/event.h new file mode 100644 index 0000000..0cb7146 --- /dev/null +++ b/include/aurora/event.h @@ -0,0 +1,40 @@ +#ifndef AURORA_EVENT_H +#define AURORA_EVENT_H + +#include "aurora.h" + +#include + +#ifdef __cplusplus +#include + +extern "C" { +#else +#include "stdint.h" +#endif + +typedef enum { + AURORA_NONE, + AURORA_EXIT, + AURORA_SDL_EVENT, + AURORA_WINDOW_RESIZED, + AURORA_CONTROLLER_ADDED, + AURORA_CONTROLLER_REMOVED, + AURORA_PAUSED, + AURORA_UNPAUSED, +} AuroraEventType; + +struct AuroraEvent { + AuroraEventType type; + union { + SDL_Event sdl; + AuroraWindowSize windowSize; + int32_t controller; + }; +}; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/aurora/imgui.h b/include/aurora/imgui.h new file mode 100644 index 0000000..77eba9a --- /dev/null +++ 
b/include/aurora/imgui.h @@ -0,0 +1,20 @@ +#ifndef AURORA_IMGUI_H +#define AURORA_IMGUI_H + +#include + +#ifdef __cplusplus +#include + +extern "C" { +#else +#include "stdint.h" +#endif + +ImTextureID aurora_imgui_add_texture(uint32_t width, uint32_t height, const void* rgba8); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/aurora/main.h b/include/aurora/main.h new file mode 100644 index 0000000..68ee67c --- /dev/null +++ b/include/aurora/main.h @@ -0,0 +1,16 @@ +#ifndef AURORA_MAIN_H +#define AURORA_MAIN_H + +#ifdef __cplusplus +extern "C" { +#endif + +int aurora_main(int argc, char* argv[]); + +#ifdef __cplusplus +} +#endif + +#define main aurora_main + +#endif diff --git a/include/aurora/math.hpp b/include/aurora/math.hpp new file mode 100644 index 0000000..4f4601c --- /dev/null +++ b/include/aurora/math.hpp @@ -0,0 +1,254 @@ +#pragma once + +#include +#include + +#ifndef AURORA_VEC2_EXTRA +#define AURORA_VEC2_EXTRA +#endif +#ifndef AURORA_VEC3_EXTRA +#define AURORA_VEC3_EXTRA +#endif +#ifndef AURORA_VEC4_EXTRA +#define AURORA_VEC4_EXTRA +#endif +#ifndef AURORA_MAT4X4_EXTRA +#define AURORA_MAT4X4_EXTRA +#endif + +#ifndef __has_attribute +#define __has_attribute(x) 0 +#endif +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif +#if __has_attribute(vector_size) +//#define USE_GCC_VECTOR_EXTENSIONS +#endif + +namespace aurora { +template +struct Vec2 { + T x{}; + T y{}; + + constexpr Vec2() = default; + constexpr Vec2(T x, T y) : x(x), y(y) {} + AURORA_VEC2_EXTRA +#ifdef METAFORCE + constexpr Vec2(const zeus::CVector2f& vec) : x(vec.x()), y(vec.y()) {} +#endif + + bool operator==(const Vec2& rhs) const { return x == rhs.x && y == rhs.y; } +}; +template +struct Vec3 { + T x{}; + T y{}; + T z{}; + + constexpr Vec3() = default; + constexpr Vec3(T x, T y, T z) : x(x), y(y), z(z) {} + AURORA_VEC3_EXTRA +#ifdef METAFORCE + constexpr Vec3(const zeus::CVector3f& vec) : x(vec.x()), y(vec.y()), z(vec.z()) {} + operator zeus::CVector3f() const { return {x, y, z}; } +#endif + + bool operator==(const Vec3& rhs) const { return x == rhs.x && y == rhs.y && z == rhs.z; } +}; +template +struct Vec4 { +#ifdef USE_GCC_VECTOR_EXTENSIONS + typedef T Vt __attribute__((vector_size(sizeof(T) * 4))); + Vt m; +#else + using Vt = T[4]; + Vt m; +#endif + + constexpr Vec4() = default; + constexpr Vec4(Vt m) : m(m) {} + constexpr Vec4(T x, T y, T z, T w) : m{x, y, z, w} {} + // For Vec3 with padding + constexpr Vec4(T x, T y, T z) : m{x, y, z, {}} {} + // For Vec3 -> Vec4 + constexpr Vec4(Vec3 v, T w) : m{v.x, v.y, v.z, w} {} + AURORA_VEC4_EXTRA +#ifdef METAFORCE + constexpr Vec4(const zeus::CVector4f& vec) : x(vec.x()), y(vec.y()), z(vec.z()), w(vec.w()) {} + constexpr Vec4(const zeus::CColor& color) : x(color.r()), y(color.g()), z(color.b()), w(color.a()) {} +#endif + + inline Vec4& operator=(const Vec4& other) { + memcpy(&m, &other.m, sizeof(Vt)); + return *this; + } + + [[nodiscard]] inline T& x() { return m[0]; } + [[nodiscard]] inline T x() const { return m[0]; } + [[nodiscard]] inline T& y() { return m[1]; } + [[nodiscard]] inline T y() const { return m[1]; } + [[nodiscard]] inline T& z() { return m[2]; } + [[nodiscard]] inline T z() const { return m[2]; } + [[nodiscard]] inline T& w() { return m[3]; } + [[nodiscard]] inline T w() const { return m[3]; } + [[nodiscard]] inline T& operator[](size_t i) { return m[i]; } + [[nodiscard]] inline T operator[](size_t i) const { return m[i]; } + + template + [[nodiscard]] constexpr Vec4 shuffle() const { + static_assert(x < 4 && y < 4 && z < 4 && w < 
4); +#if defined(USE_GCC_VECTOR_EXTENSIONS) && __has_builtin(__builtin_shuffle) + typedef int Vi __attribute__((vector_size(16))); + return __builtin_shuffle(m, Vi{x, y, z, w}); +#else + return {m[x], m[y], m[z], m[w]}; +#endif + } + + bool operator==(const Vec4& rhs) const { +#if defined(USE_GCC_VECTOR_EXTENSIONS) && __has_builtin(__builtin_reduce_and) + return __builtin_reduce_and(m == rhs.m) != 0; +#else + return m[0] == rhs.m[0] && m[1] == rhs.m[1] && m[2] == rhs.m[2] && m[3] == rhs.m[3]; +#endif + } +}; +template +[[nodiscard]] inline Vec4 operator+(const Vec4& a, const Vec4& b) { +#ifdef USE_GCC_VECTOR_EXTENSIONS + return a.m + b.m; +#else + return {a.m[0] + b.m[0], a.m[1] + b.m[1], a.m[2] + b.m[2], a.m[3] + b.m[3]}; +#endif +} +template +[[nodiscard]] inline Vec4 operator*(const Vec4& a, const Vec4& b) { +#ifdef USE_GCC_VECTOR_EXTENSIONS + return a.m * b.m; +#else + return {a.m[0] * b.m[0], a.m[1] * b.m[1], a.m[2] * b.m[2], a.m[3] * b.m[3]}; +#endif +} +template +struct Mat3x2 { + Vec2 m0{}; + Vec2 m1{}; + Vec2 m2{}; + + constexpr Mat3x2() = default; + constexpr Mat3x2(const Vec2& m0, const Vec2& m1, const Vec2& m2) : m0(m0), m1(m1), m2(m2) {} + + bool operator==(const Mat3x2& rhs) const { return m0 == rhs.m0 && m1 == rhs.m1 && m2 == rhs.m2; } +}; +template +struct Mat4x2 { + Vec2 m0{}; + Vec2 m1{}; + Vec2 m2{}; + Vec2 m3{}; + + constexpr Mat4x2() = default; + constexpr Mat4x2(const Vec2& m0, const Vec2& m1, const Vec2& m2, const Vec2& m3) + : m0(m0), m1(m1), m2(m2), m3(m3) {} + + inline Mat4x2 transpose() const { + return { + {m0.x, m2.x}, + {m0.y, m2.y}, + {m1.x, m3.x}, + {m1.y, m3.y}, + }; + } + + bool operator==(const Mat4x2& rhs) const { return m0 == rhs.m0 && m1 == rhs.m1 && m2 == rhs.m2 && m3 == rhs.m3; } +}; +template +struct Mat4x4; +template +struct Mat3x4 { + Vec4 m0{}; + Vec4 m1{}; + Vec4 m2{}; + + constexpr Mat3x4() = default; + constexpr Mat3x4(const Vec4& m0, const Vec4& m1, const Vec4& m2) : m0(m0), m1(m1), m2(m2) {} + + inline Mat4x4 to4x4() const; + inline Mat4x4 toTransposed4x4() const; +}; +static_assert(sizeof(Mat3x4) == sizeof(float[3][4])); +template +struct Mat4x4 { + Vec4 m0{}; + Vec4 m1{}; + Vec4 m2{}; + Vec4 m3{}; + + constexpr Mat4x4() = default; + constexpr Mat4x4(const Vec4& m0, const Vec4& m1, const Vec4& m2, const Vec4& m3) + : m0(m0), m1(m1), m2(m2), m3(m3) {} + AURORA_MAT4X4_EXTRA +#ifdef METAFORCE + constexpr Mat4x4(const zeus::CMatrix4f& m) : m0(m[0]), m1(m[1]), m2(m[2]), m3(m[3]) {} + constexpr Mat4x4(const zeus::CTransform& m) : Mat4x4(m.toMatrix4f()) {} +#endif + + [[nodiscard]] Mat4x4 transpose() const { + return { + {m0[0], m1[0], m2[0], m3[0]}, + {m0[1], m1[1], m2[1], m3[1]}, + {m0[2], m1[2], m2[2], m3[2]}, + {m0[3], m1[3], m2[3], m3[3]}, + }; + } + inline Mat4x4& operator=(const Mat4x4& other) { + m0 = other.m0; + m1 = other.m1; + m2 = other.m2; + m3 = other.m3; + return *this; + } + + inline Vec4& operator[](size_t i) { return *(&m0 + i); } + inline const Vec4& operator[](size_t i) const { return *(&m0 + i); } + + bool operator==(const Mat4x4& rhs) const { return m0 == rhs.m0 && m1 == rhs.m1 && m2 == rhs.m2 && m3 == rhs.m3; } +}; +static_assert(sizeof(Mat4x4) == sizeof(float[4][4])); +template +[[nodiscard]] inline Mat4x4 operator*(const Mat4x4& a, const Mat4x4& b) { + Mat4x4 out; + for (size_t i = 0; i < 4; ++i) { + *(&out.m0 + i) = a.m0 * b[i].template shuffle<0, 0, 0, 0>() + a.m1 * b[i].template shuffle<1, 1, 1, 1>() + + a.m2 * b[i].template shuffle<2, 2, 2, 2>() + a.m3 * b[i].template shuffle<3, 3, 3, 3>(); + } + return out; +} 
+template +[[nodiscard]] inline Mat4x4 Mat3x4::to4x4() const { + return { + {m0.m[0], m0.m[1], m0.m[2], 0.f}, + {m1.m[0], m1.m[1], m1.m[2], 0.f}, + {m2.m[0], m2.m[1], m2.m[2], 0.f}, + {m0.m[3], m1.m[3], m2.m[3], 1.f}, + }; +} +template +[[nodiscard]] inline Mat4x4 Mat3x4::toTransposed4x4() const { + return Mat4x4{ + m0, + m1, + m2, + {0.f, 0.f, 0.f, 1.f}, + } + .transpose(); +} +constexpr Mat4x4 Mat4x4_Identity{ + Vec4{1.f, 0.f, 0.f, 0.f}, + Vec4{0.f, 1.f, 0.f, 0.f}, + Vec4{0.f, 0.f, 1.f, 0.f}, + Vec4{0.f, 0.f, 0.f, 1.f}, +}; +} // namespace aurora diff --git a/include/dolphin/gx.h b/include/dolphin/gx.h new file mode 100644 index 0000000..91838c2 --- /dev/null +++ b/include/dolphin/gx.h @@ -0,0 +1,32 @@ +#ifndef DOLPHIN_GX_H +#define DOLPHIN_GX_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/gx/GXBump.h b/include/dolphin/gx/GXBump.h new file mode 100644 index 0000000..aee5c66 --- /dev/null +++ b/include/dolphin/gx/GXBump.h @@ -0,0 +1,28 @@ +#ifndef DOLPHIN_GXBUMP_H +#define DOLPHIN_GXBUMP_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void GXSetTevDirect(GXTevStageID tev_stage); +void GXSetNumIndStages(u8 nIndStages); +#ifdef TARGET_PC +void GXSetIndTexMtx(GXIndTexMtxID mtx_sel, const void* offset, s8 scale_exp); +#else +void GXSetIndTexMtx(GXIndTexMtxID mtx_sel, f32 offset[2][3], s8 scale_exp); +#endif +void GXSetIndTexOrder(GXIndTexStageID ind_stage, GXTexCoordID tex_coord, GXTexMapID tex_map); +void GXSetTevIndirect(GXTevStageID tev_stage, GXIndTexStageID ind_stage, GXIndTexFormat format, + GXIndTexBiasSel bias_sel, GXIndTexMtxID matrix_sel, GXIndTexWrap wrap_s, GXIndTexWrap wrap_t, + GXBool add_prev, GXBool ind_lod, GXIndTexAlphaSel alpha_sel); +void GXSetTevIndWarp(GXTevStageID tev_stage, GXIndTexStageID ind_stage, GXBool signed_offsets, GXBool replace_mode, + GXIndTexMtxID matrix_sel); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/gx/GXCommandList.h b/include/dolphin/gx/GXCommandList.h new file mode 100644 index 0000000..9c8352a --- /dev/null +++ b/include/dolphin/gx/GXCommandList.h @@ -0,0 +1,35 @@ +#ifndef DOLPHIN_GXCOMMANDLIST_H +#define DOLPHIN_GXCOMMANDLIST_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define GX_NOP 0x00 +#define GX_DRAW_QUADS 0x80 +#define GX_DRAW_TRIANGLES 0x90 +#define GX_DRAW_TRIANGLE_STRIP 0x98 +#define GX_DRAW_TRIANGLE_FAN 0xA0 +#define GX_DRAW_LINES 0xA8 +#define GX_DRAW_LINE_STRIP 0xB0 +#define GX_DRAW_POINTS 0xB8 + +#define GX_LOAD_BP_REG 0x61 +#define GX_LOAD_CP_REG 0x08 +#define GX_LOAD_XF_REG 0x10 +#define GX_LOAD_INDX_A 0x20 +#define GX_LOAD_INDX_B 0x28 +#define GX_LOAD_INDX_C 0x30 +#define GX_LOAD_INDX_D 0x38 + +#define GX_CMD_CALL_DL 0x40 +#define GX_CMD_INVL_VC 0x48 + +#define GX_OPCODE_MASK 0xF8 +#define GX_VAT_MASK 0x07 + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/gx/GXCull.h b/include/dolphin/gx/GXCull.h new file mode 100644 index 0000000..bf2d922 --- /dev/null +++ b/include/dolphin/gx/GXCull.h @@ -0,0 +1,18 @@ +#ifndef DOLPHIN_GXCULL_H +#define DOLPHIN_GXCULL_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void GXSetScissor(u32 left, u32 top, u32 wd, u32 ht); +void GXSetCullMode(GXCullMode mode); +void GXSetCoPlanar(GXBool enable); + +#ifdef __cplusplus +} +#endif + +#endif diff --git 
a/include/dolphin/gx/GXDispList.h b/include/dolphin/gx/GXDispList.h new file mode 100644 index 0000000..b9a664d --- /dev/null +++ b/include/dolphin/gx/GXDispList.h @@ -0,0 +1,18 @@ +#ifndef DOLPHIN_GXDISPLIST_H +#define DOLPHIN_GXDISPLIST_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void GXBeginDisplayList(void* list, u32 size); +u32 GXEndDisplayList(void); +void GXCallDisplayList(const void* list, u32 nbytes); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/gx/GXDraw.h b/include/dolphin/gx/GXDraw.h new file mode 100644 index 0000000..662ec40 --- /dev/null +++ b/include/dolphin/gx/GXDraw.h @@ -0,0 +1,16 @@ +#ifndef DOLPHIN_GXDRAW_H +#define DOLPHIN_GXDRAW_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void GXDrawSphere(u8 numMajor, u8 numMinor); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/gx/GXEnum.h b/include/dolphin/gx/GXEnum.h new file mode 100644 index 0000000..902ee64 --- /dev/null +++ b/include/dolphin/gx/GXEnum.h @@ -0,0 +1,758 @@ +#ifndef DOLPHIN_GXENUM_H +#define DOLPHIN_GXENUM_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef TARGET_PC +#include + +typedef bool GXBool; +#else +typedef u8 GXBool; +#endif + +#define GX_FALSE ((GXBool)0) +#define GX_TRUE ((GXBool)1) + +#define GX_ENABLE ((GXBool)1) +#define GX_DISABLE ((GXBool)0) + +typedef enum { + GX_PERSPECTIVE, + GX_ORTHOGRAPHIC, +} GXProjectionType; + +typedef enum { + GX_NEVER, + GX_LESS, + GX_EQUAL, + GX_LEQUAL, + GX_GREATER, + GX_NEQUAL, + GX_GEQUAL, + GX_ALWAYS, +} GXCompare; + +typedef enum { + GX_AOP_AND, + GX_AOP_OR, + GX_AOP_XOR, + GX_AOP_XNOR, + GX_MAX_ALPHAOP, +} GXAlphaOp; + +typedef enum { + GX_ZC_LINEAR, + GX_ZC_NEAR, + GX_ZC_MID, + GX_ZC_FAR, +} GXZFmt16; + +typedef enum { + GX_GM_1_0, + GX_GM_1_7, + GX_GM_2_2, +} GXGamma; + +typedef enum { + GX_PF_RGB8_Z24, + GX_PF_RGBA6_Z24, + GX_PF_RGB565_Z16, + GX_PF_Z24, + GX_PF_Y8, + GX_PF_U8, + GX_PF_V8, + GX_PF_YUV420, +} GXPixelFmt; + +typedef enum { + GX_QUADS = 0x80, + GX_TRIANGLES = 0x90, + GX_TRIANGLESTRIP = 0x98, + GX_TRIANGLEFAN = 0xA0, + GX_LINES = 0xA8, + GX_LINESTRIP = 0xB0, + GX_POINTS = 0xB8, +} GXPrimitive; + +typedef enum { + GX_VTXFMT0, + GX_VTXFMT1, + GX_VTXFMT2, + GX_VTXFMT3, + GX_VTXFMT4, + GX_VTXFMT5, + GX_VTXFMT6, + GX_VTXFMT7, + GX_MAX_VTXFMT, +} GXVtxFmt; + +typedef enum { + GX_VA_PNMTXIDX, + GX_VA_TEX0MTXIDX, + GX_VA_TEX1MTXIDX, + GX_VA_TEX2MTXIDX, + GX_VA_TEX3MTXIDX, + GX_VA_TEX4MTXIDX, + GX_VA_TEX5MTXIDX, + GX_VA_TEX6MTXIDX, + GX_VA_TEX7MTXIDX, + GX_VA_POS, + GX_VA_NRM, + GX_VA_CLR0, + GX_VA_CLR1, + GX_VA_TEX0, + GX_VA_TEX1, + GX_VA_TEX2, + GX_VA_TEX3, + GX_VA_TEX4, + GX_VA_TEX5, + GX_VA_TEX6, + GX_VA_TEX7, + GX_POS_MTX_ARRAY, + GX_NRM_MTX_ARRAY, + GX_TEX_MTX_ARRAY, + GX_LIGHT_ARRAY, + GX_VA_NBT, + GX_VA_MAX_ATTR, + GX_VA_NULL = 0xFF, +} GXAttr; + +typedef enum { + GX_NONE, + GX_DIRECT, + GX_INDEX8, + GX_INDEX16, +} GXAttrType; + +#define _GX_TF_CTF 0x20 +#define _GX_TF_ZTF 0x10 + +typedef enum { + GX_TF_I4 = 0x0, + GX_TF_I8 = 0x1, + GX_TF_IA4 = 0x2, + GX_TF_IA8 = 0x3, + GX_TF_RGB565 = 0x4, + GX_TF_RGB5A3 = 0x5, + GX_TF_RGBA8 = 0x6, + GX_TF_CMPR = 0xE, + + GX_CTF_R4 = 0x0 | _GX_TF_CTF, + GX_CTF_RA4 = 0x2 | _GX_TF_CTF, + GX_CTF_RA8 = 0x3 | _GX_TF_CTF, + GX_CTF_YUVA8 = 0x6 | _GX_TF_CTF, + GX_CTF_A8 = 0x7 | _GX_TF_CTF, + GX_CTF_R8 = 0x8 | _GX_TF_CTF, + GX_CTF_G8 = 0x9 | _GX_TF_CTF, + GX_CTF_B8 = 0xA | _GX_TF_CTF, + GX_CTF_RG8 = 0xB | _GX_TF_CTF, + GX_CTF_GB8 = 0xC | _GX_TF_CTF, + + GX_TF_Z8 = 0x1 | _GX_TF_ZTF, + GX_TF_Z16 = 0x3 | _GX_TF_ZTF, + 
GX_TF_Z24X8 = 0x6 | _GX_TF_ZTF, + + GX_CTF_Z4 = 0x0 | _GX_TF_ZTF | _GX_TF_CTF, + GX_CTF_Z8M = 0x9 | _GX_TF_ZTF | _GX_TF_CTF, + GX_CTF_Z8L = 0xA | _GX_TF_ZTF | _GX_TF_CTF, + GX_CTF_Z16L = 0xC | _GX_TF_ZTF | _GX_TF_CTF, + + GX_TF_A8 = GX_CTF_A8, +} GXTexFmt; + +typedef enum { + GX_TF_C4 = 0x8, + GX_TF_C8 = 0x9, + GX_TF_C14X2 = 0xa, +} GXCITexFmt; + +typedef enum { + GX_CLAMP, + GX_REPEAT, + GX_MIRROR, + GX_MAX_TEXWRAPMODE, +} GXTexWrapMode; + +typedef enum { + GX_NEAR, + GX_LINEAR, + GX_NEAR_MIP_NEAR, + GX_LIN_MIP_NEAR, + GX_NEAR_MIP_LIN, + GX_LIN_MIP_LIN, +} GXTexFilter; + +typedef enum { + GX_ANISO_1, + GX_ANISO_2, + GX_ANISO_4, + GX_MAX_ANISOTROPY, +} GXAnisotropy; + +typedef enum { + GX_TEXMAP0, + GX_TEXMAP1, + GX_TEXMAP2, + GX_TEXMAP3, + GX_TEXMAP4, + GX_TEXMAP5, + GX_TEXMAP6, + GX_TEXMAP7, + GX_MAX_TEXMAP, + GX_TEXMAP_NULL = 0xFF, + GX_TEX_DISABLE = 0x100, +} GXTexMapID; + +typedef enum { + GX_TEXCOORD0, + GX_TEXCOORD1, + GX_TEXCOORD2, + GX_TEXCOORD3, + GX_TEXCOORD4, + GX_TEXCOORD5, + GX_TEXCOORD6, + GX_TEXCOORD7, + GX_MAX_TEXCOORD, + GX_TEXCOORD_NULL = 0xFF, +} GXTexCoordID; + +typedef enum { + GX_TEVSTAGE0, + GX_TEVSTAGE1, + GX_TEVSTAGE2, + GX_TEVSTAGE3, + GX_TEVSTAGE4, + GX_TEVSTAGE5, + GX_TEVSTAGE6, + GX_TEVSTAGE7, + GX_TEVSTAGE8, + GX_TEVSTAGE9, + GX_TEVSTAGE10, + GX_TEVSTAGE11, + GX_TEVSTAGE12, + GX_TEVSTAGE13, + GX_TEVSTAGE14, + GX_TEVSTAGE15, + GX_MAX_TEVSTAGE, +} GXTevStageID; + +typedef enum { + GX_MODULATE, + GX_DECAL, + GX_BLEND, + GX_REPLACE, + GX_PASSCLR, +} GXTevMode; + +typedef enum { + GX_MTX3x4, + GX_MTX2x4, +} GXTexMtxType; + +typedef enum { + GX_TG_MTX3x4, + GX_TG_MTX2x4, + GX_TG_BUMP0, + GX_TG_BUMP1, + GX_TG_BUMP2, + GX_TG_BUMP3, + GX_TG_BUMP4, + GX_TG_BUMP5, + GX_TG_BUMP6, + GX_TG_BUMP7, + GX_TG_SRTG, +} GXTexGenType; + +typedef enum { + GX_PNMTX0 = 0, + GX_PNMTX1 = 3, + GX_PNMTX2 = 6, + GX_PNMTX3 = 9, + GX_PNMTX4 = 12, + GX_PNMTX5 = 15, + GX_PNMTX6 = 18, + GX_PNMTX7 = 21, + GX_PNMTX8 = 24, + GX_PNMTX9 = 27, +} GXPosNrmMtx; + +typedef enum { + GX_TEXMTX0 = 30, + GX_TEXMTX1 = 33, + GX_TEXMTX2 = 36, + GX_TEXMTX3 = 39, + GX_TEXMTX4 = 42, + GX_TEXMTX5 = 45, + GX_TEXMTX6 = 48, + GX_TEXMTX7 = 51, + GX_TEXMTX8 = 54, + GX_TEXMTX9 = 57, + GX_IDENTITY = 60, +} GXTexMtx; + +typedef enum { + GX_COLOR0, + GX_COLOR1, + GX_ALPHA0, + GX_ALPHA1, + GX_COLOR0A0, + GX_COLOR1A1, + GX_COLOR_ZERO, + GX_ALPHA_BUMP, + GX_ALPHA_BUMPN, + GX_COLOR_NULL = 0xFF, +} GXChannelID; + +typedef enum { + GX_TG_POS, + GX_TG_NRM, + GX_TG_BINRM, + GX_TG_TANGENT, + GX_TG_TEX0, + GX_TG_TEX1, + GX_TG_TEX2, + GX_TG_TEX3, + GX_TG_TEX4, + GX_TG_TEX5, + GX_TG_TEX6, + GX_TG_TEX7, + GX_TG_TEXCOORD0, + GX_TG_TEXCOORD1, + GX_TG_TEXCOORD2, + GX_TG_TEXCOORD3, + GX_TG_TEXCOORD4, + GX_TG_TEXCOORD5, + GX_TG_TEXCOORD6, + GX_TG_COLOR0, + GX_TG_COLOR1, + GX_MAX_TEXGENSRC, +} GXTexGenSrc; + +typedef enum { + GX_BM_NONE, + GX_BM_BLEND, + GX_BM_LOGIC, + GX_BM_SUBTRACT, + GX_MAX_BLENDMODE, +} GXBlendMode; + +typedef enum { + GX_BL_ZERO, + GX_BL_ONE, + GX_BL_SRCCLR, + GX_BL_INVSRCCLR, + GX_BL_SRCALPHA, + GX_BL_INVSRCALPHA, + GX_BL_DSTALPHA, + GX_BL_INVDSTALPHA, + GX_BL_DSTCLR = GX_BL_SRCCLR, + GX_BL_INVDSTCLR = GX_BL_INVSRCCLR, +} GXBlendFactor; + +typedef enum { + GX_LO_CLEAR, + GX_LO_AND, + GX_LO_REVAND, + GX_LO_COPY, + GX_LO_INVAND, + GX_LO_NOOP, + GX_LO_XOR, + GX_LO_OR, + GX_LO_NOR, + GX_LO_EQUIV, + GX_LO_INV, + GX_LO_REVOR, + GX_LO_INVCOPY, + GX_LO_INVOR, + GX_LO_NAND, + GX_LO_SET, +} GXLogicOp; + +typedef enum { + GX_POS_XY = 0, + GX_POS_XYZ = 1, + GX_NRM_XYZ = 0, + GX_NRM_NBT = 1, + GX_NRM_NBT3 = 2, + GX_CLR_RGB = 0, + 
GX_CLR_RGBA = 1, + GX_TEX_S = 0, + GX_TEX_ST = 1, +} GXCompCnt; + +typedef enum { + GX_U8 = 0, + GX_S8 = 1, + GX_U16 = 2, + GX_S16 = 3, + GX_F32 = 4, + GX_RGB565 = 0, + GX_RGB8 = 1, + GX_RGBX8 = 2, + GX_RGBA4 = 3, + GX_RGBA6 = 4, + GX_RGBA8 = 5, +} GXCompType; + +typedef enum { + GX_PTTEXMTX0 = 64, + GX_PTTEXMTX1 = 67, + GX_PTTEXMTX2 = 70, + GX_PTTEXMTX3 = 73, + GX_PTTEXMTX4 = 76, + GX_PTTEXMTX5 = 79, + GX_PTTEXMTX6 = 82, + GX_PTTEXMTX7 = 85, + GX_PTTEXMTX8 = 88, + GX_PTTEXMTX9 = 91, + GX_PTTEXMTX10 = 94, + GX_PTTEXMTX11 = 97, + GX_PTTEXMTX12 = 100, + GX_PTTEXMTX13 = 103, + GX_PTTEXMTX14 = 106, + GX_PTTEXMTX15 = 109, + GX_PTTEXMTX16 = 112, + GX_PTTEXMTX17 = 115, + GX_PTTEXMTX18 = 118, + GX_PTTEXMTX19 = 121, + GX_PTIDENTITY = 125, +} GXPTTexMtx; + +typedef enum { + GX_TEVPREV, + GX_TEVREG0, + GX_TEVREG1, + GX_TEVREG2, + GX_MAX_TEVREG, +} GXTevRegID; + +typedef enum { + GX_DF_NONE, + GX_DF_SIGN, + GX_DF_CLAMP, +} GXDiffuseFn; + +typedef enum { + GX_SRC_REG, + GX_SRC_VTX, +} GXColorSrc; + +typedef enum { + GX_AF_SPEC, + GX_AF_SPOT, + GX_AF_NONE, +} GXAttnFn; + +typedef enum { + GX_LIGHT0 = 0x001, + GX_LIGHT1 = 0x002, + GX_LIGHT2 = 0x004, + GX_LIGHT3 = 0x008, + GX_LIGHT4 = 0x010, + GX_LIGHT5 = 0x020, + GX_LIGHT6 = 0x040, + GX_LIGHT7 = 0x080, + GX_MAX_LIGHT = 0x100, + GX_LIGHT_NULL = 0, +} GXLightID; + +typedef enum { + GX_TO_ZERO, + GX_TO_SIXTEENTH, + GX_TO_EIGHTH, + GX_TO_FOURTH, + GX_TO_HALF, + GX_TO_ONE, + GX_MAX_TEXOFFSET, +} GXTexOffset; + +typedef enum { + GX_SP_OFF, + GX_SP_FLAT, + GX_SP_COS, + GX_SP_COS2, + GX_SP_SHARP, + GX_SP_RING1, + GX_SP_RING2, +} GXSpotFn; + +typedef enum { + GX_DA_OFF, + GX_DA_GENTLE, + GX_DA_MEDIUM, + GX_DA_STEEP, +} GXDistAttnFn; + +typedef enum { + GX_CULL_NONE, + GX_CULL_FRONT, + GX_CULL_BACK, + GX_CULL_ALL + +} GXCullMode; + +typedef enum { GX_TEV_SWAP0 = 0, GX_TEV_SWAP1, GX_TEV_SWAP2, GX_TEV_SWAP3, GX_MAX_TEVSWAP } GXTevSwapSel; + +typedef enum { GX_CH_RED = 0, GX_CH_GREEN, GX_CH_BLUE, GX_CH_ALPHA } GXTevColorChan; + +typedef enum _GXFogType { + GX_FOG_NONE = 0, + GX_FOG_PERSP_LIN = 2, + GX_FOG_PERSP_EXP = 4, + GX_FOG_PERSP_EXP2 = 5, + GX_FOG_PERSP_REVEXP = 6, + GX_FOG_PERSP_REVEXP2 = 7, + GX_FOG_ORTHO_LIN = 10, + GX_FOG_ORTHO_EXP = 12, + GX_FOG_ORTHO_EXP2 = 13, + GX_FOG_ORTHO_REVEXP = 14, + GX_FOG_ORTHO_REVEXP2 = 15, + GX_FOG_LIN = GX_FOG_PERSP_LIN, + GX_FOG_EXP = GX_FOG_PERSP_EXP, + GX_FOG_EXP2 = GX_FOG_PERSP_EXP2, + GX_FOG_REVEXP = GX_FOG_PERSP_REVEXP, + GX_FOG_REVEXP2 = GX_FOG_PERSP_REVEXP2, +} GXFogType; + +typedef enum { + GX_CC_CPREV, + GX_CC_APREV, + GX_CC_C0, + GX_CC_A0, + GX_CC_C1, + GX_CC_A1, + GX_CC_C2, + GX_CC_A2, + GX_CC_TEXC, + GX_CC_TEXA, + GX_CC_RASC, + GX_CC_RASA, + GX_CC_ONE, + GX_CC_HALF, + GX_CC_KONST, + GX_CC_ZERO +} GXTevColorArg; + +typedef enum { + GX_CA_APREV, + GX_CA_A0, + GX_CA_A1, + GX_CA_A2, + GX_CA_TEXA, + GX_CA_RASA, + GX_CA_KONST, + GX_CA_ZERO +} GXTevAlphaArg; + +typedef enum { + GX_TEV_ADD = 0, + GX_TEV_SUB = 1, + GX_TEV_COMP_R8_GT = 8, + GX_TEV_COMP_R8_EQ = 9, + GX_TEV_COMP_GR16_GT = 10, + GX_TEV_COMP_GR16_EQ = 11, + GX_TEV_COMP_BGR24_GT = 12, + GX_TEV_COMP_BGR24_EQ = 13, + GX_TEV_COMP_RGB8_GT = 14, + GX_TEV_COMP_RGB8_EQ = 15, + GX_TEV_COMP_A8_GT = GX_TEV_COMP_RGB8_GT, + GX_TEV_COMP_A8_EQ = GX_TEV_COMP_RGB8_EQ +} GXTevOp; + +typedef enum { GX_TB_ZERO, GX_TB_ADDHALF, GX_TB_SUBHALF, GX_MAX_TEVBIAS } GXTevBias; + +typedef enum { GX_CS_SCALE_1, GX_CS_SCALE_2, GX_CS_SCALE_4, GX_CS_DIVIDE_2, GX_MAX_TEVSCALE } GXTevScale; + +typedef enum { + GX_TEV_KCSEL_8_8 = 0x00, + GX_TEV_KCSEL_7_8 = 0x01, + GX_TEV_KCSEL_6_8 = 0x02, + 
GX_TEV_KCSEL_5_8 = 0x03, + GX_TEV_KCSEL_4_8 = 0x04, + GX_TEV_KCSEL_3_8 = 0x05, + GX_TEV_KCSEL_2_8 = 0x06, + GX_TEV_KCSEL_1_8 = 0x07, + GX_TEV_KCSEL_1 = GX_TEV_KCSEL_8_8, + GX_TEV_KCSEL_3_4 = GX_TEV_KCSEL_6_8, + GX_TEV_KCSEL_1_2 = GX_TEV_KCSEL_4_8, + GX_TEV_KCSEL_1_4 = GX_TEV_KCSEL_2_8, + GX_TEV_KCSEL_K0 = 0x0C, + GX_TEV_KCSEL_K1 = 0x0D, + GX_TEV_KCSEL_K2 = 0x0E, + GX_TEV_KCSEL_K3 = 0x0F, + GX_TEV_KCSEL_K0_R = 0x10, + GX_TEV_KCSEL_K1_R = 0x11, + GX_TEV_KCSEL_K2_R = 0x12, + GX_TEV_KCSEL_K3_R = 0x13, + GX_TEV_KCSEL_K0_G = 0x14, + GX_TEV_KCSEL_K1_G = 0x15, + GX_TEV_KCSEL_K2_G = 0x16, + GX_TEV_KCSEL_K3_G = 0x17, + GX_TEV_KCSEL_K0_B = 0x18, + GX_TEV_KCSEL_K1_B = 0x19, + GX_TEV_KCSEL_K2_B = 0x1A, + GX_TEV_KCSEL_K3_B = 0x1B, + GX_TEV_KCSEL_K0_A = 0x1C, + GX_TEV_KCSEL_K1_A = 0x1D, + GX_TEV_KCSEL_K2_A = 0x1E, + GX_TEV_KCSEL_K3_A = 0x1F +} GXTevKColorSel; + +typedef enum { + GX_TEV_KASEL_8_8 = 0x00, + GX_TEV_KASEL_7_8 = 0x01, + GX_TEV_KASEL_6_8 = 0x02, + GX_TEV_KASEL_5_8 = 0x03, + GX_TEV_KASEL_4_8 = 0x04, + GX_TEV_KASEL_3_8 = 0x05, + GX_TEV_KASEL_2_8 = 0x06, + GX_TEV_KASEL_1_8 = 0x07, + GX_TEV_KASEL_1 = GX_TEV_KASEL_8_8, + GX_TEV_KASEL_3_4 = GX_TEV_KASEL_6_8, + GX_TEV_KASEL_1_2 = GX_TEV_KASEL_4_8, + GX_TEV_KASEL_1_4 = GX_TEV_KASEL_2_8, + GX_TEV_KASEL_K0_R = 0x10, + GX_TEV_KASEL_K1_R = 0x11, + GX_TEV_KASEL_K2_R = 0x12, + GX_TEV_KASEL_K3_R = 0x13, + GX_TEV_KASEL_K0_G = 0x14, + GX_TEV_KASEL_K1_G = 0x15, + GX_TEV_KASEL_K2_G = 0x16, + GX_TEV_KASEL_K3_G = 0x17, + GX_TEV_KASEL_K0_B = 0x18, + GX_TEV_KASEL_K1_B = 0x19, + GX_TEV_KASEL_K2_B = 0x1A, + GX_TEV_KASEL_K3_B = 0x1B, + GX_TEV_KASEL_K0_A = 0x1C, + GX_TEV_KASEL_K1_A = 0x1D, + GX_TEV_KASEL_K2_A = 0x1E, + GX_TEV_KASEL_K3_A = 0x1F +} GXTevKAlphaSel; + +typedef enum { GX_KCOLOR0 = 0, GX_KCOLOR1, GX_KCOLOR2, GX_KCOLOR3, GX_MAX_KCOLOR } GXTevKColorID; + +typedef enum { + GX_ZT_DISABLE, + GX_ZT_ADD, + GX_ZT_REPLACE, + GX_MAX_ZTEXOP, +} GXZTexOp; + +typedef enum { + GX_ITF_8, + GX_ITF_5, + GX_ITF_4, + GX_ITF_3, + GX_MAX_ITFORMAT, +} GXIndTexFormat; + +typedef enum { + GX_ITB_NONE, + GX_ITB_S, + GX_ITB_T, + GX_ITB_ST, + GX_ITB_U, + GX_ITB_SU, + GX_ITB_TU, + GX_ITB_STU, + GX_MAX_ITBIAS, +} GXIndTexBiasSel; + +typedef enum { + GX_ITBA_OFF, + GX_ITBA_S, + GX_ITBA_T, + GX_ITBA_U, + GX_MAX_ITBALPHA, +} GXIndTexAlphaSel; + +typedef enum { + GX_ITM_OFF, + GX_ITM_0, + GX_ITM_1, + GX_ITM_2, + GX_ITM_S0 = 5, + GX_ITM_S1, + GX_ITM_S2, + GX_ITM_T0 = 9, + GX_ITM_T1, + GX_ITM_T2, +} GXIndTexMtxID; + +typedef enum { + GX_ITW_OFF, + GX_ITW_256, + GX_ITW_128, + GX_ITW_64, + GX_ITW_32, + GX_ITW_16, + GX_ITW_0, + GX_MAX_ITWRAP, +} GXIndTexWrap; + +typedef enum { + GX_INDTEXSTAGE0, + GX_INDTEXSTAGE1, + GX_INDTEXSTAGE2, + GX_INDTEXSTAGE3, + GX_MAX_INDTEXSTAGE, +} GXIndTexStageID; + +typedef enum { + GX_ITS_1, + GX_ITS_2, + GX_ITS_4, + GX_ITS_8, + GX_ITS_16, + GX_ITS_32, + GX_ITS_64, + GX_ITS_128, + GX_ITS_256, + GX_MAX_ITSCALE, +} GXIndTexScale; + +typedef enum { + GX_CLIP_ENABLE = 0, + GX_CLIP_DISABLE = 1, +} GXClipMode; + +typedef enum { + GX_TLUT0 = 0, + GX_TLUT1 = 1, + GX_TLUT2 = 2, + GX_TLUT3 = 3, + GX_TLUT4 = 4, + GX_TLUT5 = 5, + GX_TLUT6 = 6, + GX_TLUT7 = 7, + GX_TLUT8 = 8, + GX_TLUT9 = 9, + GX_TLUT10 = 10, + GX_TLUT11 = 11, + GX_TLUT12 = 12, + GX_TLUT13 = 13, + GX_TLUT14 = 14, + GX_TLUT15 = 15, + GX_BIGTLUT0 = 16, + GX_BIGTLUT1 = 17, + GX_BIGTLUT2 = 18, + GX_BIGTLUT3 = 19, +} GXTlut; + +typedef enum { + GX_TL_IA8, + GX_TL_RGB565, + GX_TL_RGB5A3, + GX_MAX_TLUTFMT, +} GXTlutFmt; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/gx/GXExtra.h 
b/include/dolphin/gx/GXExtra.h new file mode 100644 index 0000000..08c2f8d --- /dev/null +++ b/include/dolphin/gx/GXExtra.h @@ -0,0 +1,33 @@ +#ifndef DOLPHIN_GXEXTRA_H +#define DOLPHIN_GXEXTRA_H +// Extra types for PC +#ifdef TARGET_PC +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + float r; + float g; + float b; + float a; +} GXColorF32; + +typedef enum { + GX_TF_R8_PC = 0x60, + GX_TF_RGBA8_PC = 0x61, +} GXPCTexFmt; + +void GXDestroyTexObj(GXTexObj* obj); + +void GXColor4f32(float r, float g, float b, float a); + +#ifdef __cplusplus +} +#endif +#endif + +#endif diff --git a/include/dolphin/gx/GXFifo.h b/include/dolphin/gx/GXFifo.h new file mode 100644 index 0000000..b6763c1 --- /dev/null +++ b/include/dolphin/gx/GXFifo.h @@ -0,0 +1,31 @@ +#ifndef DOLPHIN_GXFIFO_H +#define DOLPHIN_GXFIFO_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + u8 pad[128]; +} GXFifoObj; + +void GXInitFifoBase(GXFifoObj* fifo, void* base, u32 size); +void GXInitFifoPtrs(GXFifoObj* fifo, void* readPtr, void* writePtr); +void GXGetFifoPtrs(GXFifoObj* fifo, void** readPtr, void** writePtr); +GXFifoObj* GXGetCPUFifo(void); +GXFifoObj* GXGetGPFifo(void); +void GXSetCPUFifo(GXFifoObj* fifo); +void GXSetGPFifo(GXFifoObj* fifo); +void GXSaveCPUFifo(GXFifoObj* fifo); +void GXGetFifoStatus(GXFifoObj* fifo, GXBool* overhi, GXBool* underlow, u32* fifoCount, GXBool* cpu_write, + GXBool* gp_read, GXBool* fifowrap); +void GXGetGPStatus(GXBool* overhi, GXBool* underlow, GXBool* readIdle, GXBool* cmdIdle, GXBool* brkpt); +void GXInitFifoLimits(GXFifoObj* fifo, u32 hiWaterMark, u32 loWaterMark); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/gx/GXFrameBuffer.h b/include/dolphin/gx/GXFrameBuffer.h new file mode 100644 index 0000000..174c2c6 --- /dev/null +++ b/include/dolphin/gx/GXFrameBuffer.h @@ -0,0 +1,30 @@ +#ifndef DOLPHIN_GXFRAMEBUFFER_H +#define DOLPHIN_GXFRAMEBUFFER_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define GX_MAX_Z24 0x00FFFFFF + +void GXSetCopyClear(GXColor clear_clr, u32 clear_z); +void GXAdjustForOverscan(GXRenderModeObj* rmin, GXRenderModeObj* rmout, u16 hor, u16 ver); +void GXCopyDisp(void* dest, GXBool clear); +void GXSetDispCopyGamma(GXGamma gamma); +void GXSetDispCopySrc(u16 left, u16 top, u16 wd, u16 ht); +void GXSetDispCopyDst(u16 wd, u16 ht); +u32 GXSetDispCopyYScale(f32 vscale); +void GXSetCopyFilter(GXBool aa, u8 sample_pattern[12][2], GXBool vf, u8 vfilter[7]); +void GXSetPixelFmt(GXPixelFmt pix_fmt, GXZFmt16 z_fmt); +void GXSetTexCopySrc(u16 left, u16 top, u16 wd, u16 ht); +void GXSetTexCopyDst(u16 wd, u16 ht, GXTexFmt fmt, GXBool mipmap); +void GXCopyTex(void* dest, GXBool clear); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/gx/GXGeometry.h b/include/dolphin/gx/GXGeometry.h new file mode 100644 index 0000000..27de27a --- /dev/null +++ b/include/dolphin/gx/GXGeometry.h @@ -0,0 +1,35 @@ +#ifndef DOLPHIN_GXGEOMETRY_H +#define DOLPHIN_GXGEOMETRY_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void GXSetVtxDesc(GXAttr attr, GXAttrType type); +void GXSetVtxDescv(GXVtxDescList* list); +void GXClearVtxDesc(void); +void GXSetVtxAttrFmt(GXVtxFmt vtxfmt, GXAttr attr, GXCompCnt cnt, GXCompType type, u8 frac); +void GXSetNumTexGens(u8 nTexGens); +void GXBegin(GXPrimitive type, GXVtxFmt vtxfmt, u16 nverts); +void GXSetTexCoordGen2(GXTexCoordID dst_coord, GXTexGenType func, GXTexGenSrc src_param, u32 mtx, GXBool normalize, + u32 postmtx); +void 
GXSetLineWidth(u8 width, GXTexOffset texOffsets); +void GXSetPointSize(u8 pointSize, GXTexOffset texOffsets); +void GXEnableTexOffsets(GXTexCoordID coord, GXBool line_enable, GXBool point_enable); +#ifdef TARGET_PC +void GXSetArray(GXAttr attr, const void* data, u32 size, u8 stride); +#else +void GXSetArray(GXAttr attr, const void* data, u8 stride); +#endif + +static inline void GXSetTexCoordGen(GXTexCoordID dst_coord, GXTexGenType func, GXTexGenSrc src_param, u32 mtx) { + GXSetTexCoordGen2(dst_coord, func, src_param, mtx, GX_FALSE, GX_PTIDENTITY); +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/gx/GXGet.h b/include/dolphin/gx/GXGet.h new file mode 100644 index 0000000..ac6b80b --- /dev/null +++ b/include/dolphin/gx/GXGet.h @@ -0,0 +1,27 @@ +#ifndef DOLPHIN_GXGET_H +#define DOLPHIN_GXGET_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +GXBool GXGetTexObjMipMap(GXTexObj* tex_obj); +GXTexFmt GXGetTexObjFmt(GXTexObj* tex_obj); +u16 GXGetTexObjHeight(GXTexObj* tex_obj); +u16 GXGetTexObjWidth(GXTexObj* tex_obj); +GXTexWrapMode GXGetTexObjWrapS(GXTexObj* tex_obj); +GXTexWrapMode GXGetTexObjWrapT(GXTexObj* tex_obj); +void* GXGetTexObjData(GXTexObj* tex_obj); +void GXGetProjectionv(f32* p); +void GXGetLightPos(GXLightObj* lt_obj, f32* x, f32* y, f32* z); +void GXGetLightColor(GXLightObj* lt_obj, GXColor* color); +void GXGetVtxAttrFmt(GXVtxFmt idx, GXAttr attr, GXCompCnt* compCnt, GXCompType* compType, u8* shift); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/gx/GXLighting.h b/include/dolphin/gx/GXLighting.h new file mode 100644 index 0000000..648853b --- /dev/null +++ b/include/dolphin/gx/GXLighting.h @@ -0,0 +1,31 @@ +#ifndef DOLPHIN_GXLIGHTING_H +#define DOLPHIN_GXLIGHTING_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void GXSetNumChans(u8 nChans); +void GXSetChanCtrl(GXChannelID chan, GXBool enable, GXColorSrc amb_src, GXColorSrc mat_src, u32 light_mask, + GXDiffuseFn diff_fn, GXAttnFn attn_fn); +void GXSetChanAmbColor(GXChannelID chan, GXColor amb_color); +void GXSetChanMatColor(GXChannelID chan, GXColor mat_color); + +void GXInitLightSpot(GXLightObj* lt_obj, f32 cutoff, GXSpotFn spot_func); +void GXInitLightDistAttn(GXLightObj* lt_obj, f32 ref_distance, f32 ref_brightness, GXDistAttnFn dist_func); +void GXInitLightPos(GXLightObj* lt_obj, f32 x, f32 y, f32 z); +void GXInitLightDir(GXLightObj* lt_obj, f32 nx, f32 ny, f32 nz); +void GXInitLightColor(GXLightObj* lt_obj, GXColor color); +void GXInitLightAttn(GXLightObj* lt_obj, f32 a0, f32 a1, f32 a2, f32 k0, f32 k1, f32 k2); +void GXInitLightAttnA(GXLightObj* lt_obj, f32 a0, f32 a1, f32 a2); +void GXInitLightAttnK(GXLightObj* lt_obj, f32 k0, f32 k1, f32 k2); +void GXLoadLightObjImm(GXLightObj* lt_obj, GXLightID light); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/gx/GXManage.h b/include/dolphin/gx/GXManage.h new file mode 100644 index 0000000..f590b8d --- /dev/null +++ b/include/dolphin/gx/GXManage.h @@ -0,0 +1,23 @@ +#ifndef DOLPHIN_GXMANAGE_H +#define DOLPHIN_GXMANAGE_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*GXDrawDoneCallback)(void); + +GXFifoObj* GXInit(void* base, u32 size); +GXDrawDoneCallback GXSetDrawDoneCallback(GXDrawDoneCallback cb); +void GXDrawDone(void); +void GXSetDrawDone(void); +void GXFlush(void); +void GXPixModeSync(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/gx/GXPerf.h b/include/dolphin/gx/GXPerf.h new file mode 100644 index 
0000000..7859e18 --- /dev/null +++ b/include/dolphin/gx/GXPerf.h @@ -0,0 +1,16 @@ +#ifndef DOLPHIN_GXPERF_H +#define DOLPHIN_GXPERF_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void GXReadXfRasMetric(u32* xf_wait_in, u32* xf_wait_out, u32* ras_busy, u32* clocks); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/gx/GXPixel.h b/include/dolphin/gx/GXPixel.h new file mode 100644 index 0000000..c2536b6 --- /dev/null +++ b/include/dolphin/gx/GXPixel.h @@ -0,0 +1,28 @@ +#ifndef DOLPHIN_GXPIXEL_H +#define DOLPHIN_GXPIXEL_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void GXSetFog(GXFogType type, f32 startz, f32 endz, f32 nearz, f32 farz, GXColor color); +void GXSetFogColor(GXColor color); +// ? GXSetFogRangeAdj(); +void GXSetBlendMode(GXBlendMode type, GXBlendFactor src_factor, GXBlendFactor dst_factor, GXLogicOp op); +void GXSetColorUpdate(GXBool update_enable); +void GXSetAlphaUpdate(GXBool update_enable); +void GXSetZMode(GXBool compare_enable, GXCompare func, GXBool update_enable); +void GXSetZCompLoc(GXBool before_tex); +void GXSetPixelFmt(GXPixelFmt pix_fmt, GXZFmt16 z_fmt); +void GXSetDither(GXBool dither); +void GXSetDstAlpha(GXBool enable, u8 alpha); +// ? GXSetFieldMask(); +// ? GXSetFieldMode(); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/gx/GXStruct.h b/include/dolphin/gx/GXStruct.h new file mode 100644 index 0000000..ecb650c --- /dev/null +++ b/include/dolphin/gx/GXStruct.h @@ -0,0 +1,100 @@ +#ifndef DOLPHIN_GXSTRUCT_H +#define DOLPHIN_GXSTRUCT_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define VI_TVMODE(format, interlace) (((format) << 2) + (interlace)) + +#define VI_INTERLACE 0 +#define VI_NON_INTERLACE 1 +#define VI_PROGRESSIVE 2 + +#define VI_NTSC 0 +#define VI_PAL 1 +#define VI_MPAL 2 +#define VI_DEBUG 3 +#define VI_DEBUG_PAL 4 +#define VI_EURGB60 5 + +typedef enum { + VI_TVMODE_NTSC_INT = VI_TVMODE(VI_NTSC, VI_INTERLACE), + VI_TVMODE_NTSC_DS = VI_TVMODE(VI_NTSC, VI_NON_INTERLACE), + VI_TVMODE_NTSC_PROG = VI_TVMODE(VI_NTSC, VI_PROGRESSIVE), + VI_TVMODE_PAL_INT = VI_TVMODE(VI_PAL, VI_INTERLACE), + VI_TVMODE_PAL_DS = VI_TVMODE(VI_PAL, VI_NON_INTERLACE), + VI_TVMODE_EURGB60_INT = VI_TVMODE(VI_EURGB60, VI_INTERLACE), + VI_TVMODE_EURGB60_DS = VI_TVMODE(VI_EURGB60, VI_NON_INTERLACE), + VI_TVMODE_MPAL_INT = VI_TVMODE(VI_MPAL, VI_INTERLACE), + VI_TVMODE_MPAL_DS = VI_TVMODE(VI_MPAL, VI_NON_INTERLACE), + VI_TVMODE_DEBUG_INT = VI_TVMODE(VI_DEBUG, VI_INTERLACE), + VI_TVMODE_DEBUG_PAL_INT = VI_TVMODE(VI_DEBUG_PAL, VI_INTERLACE), + VI_TVMODE_DEBUG_PAL_DS = VI_TVMODE(VI_DEBUG_PAL, VI_NON_INTERLACE) +} VITVMode; + +typedef enum { VI_XFBMODE_SF = 0, VI_XFBMODE_DF } VIXFBMode; + +typedef struct { + /*0x00*/ VITVMode viTVmode; + /*0x04*/ u16 fbWidth; + /*0x06*/ u16 efbHeight; + /*0x08*/ u16 xfbHeight; + /*0x0A*/ u16 viXOrigin; + /*0x0C*/ u16 viYOrigin; + /*0x0E*/ u16 viWidth; + /*0x10*/ u16 viHeight; + /*0x14*/ VIXFBMode xFBmode; + /*0x18*/ u8 field_rendering; + u8 aa; + u8 sample_pattern[12][2]; + u8 vfilter[7]; +} GXRenderModeObj; + +typedef struct { + u8 r; + u8 g; + u8 b; + u8 a; +} GXColor; + +typedef struct { +#ifdef TARGET_PC + u32 dummy[22]; +#else + u32 dummy[8]; +#endif +} GXTexObj; + +typedef struct { +#ifdef TARGET_PC + u32 dummy[4]; +#else + u32 dummy[3]; +#endif +} GXTlutObj; + +typedef struct { + u32 dummy[16]; +} GXLightObj; + +typedef struct { + GXAttr attr; + GXAttrType type; +} GXVtxDescList; + +typedef struct { + s16 r; + s16 g; + s16 b; + s16 a; +} GXColorS10; + +#ifdef 
__cplusplus +} +#endif + +#endif diff --git a/include/dolphin/gx/GXTev.h b/include/dolphin/gx/GXTev.h new file mode 100644 index 0000000..dccd4d5 --- /dev/null +++ b/include/dolphin/gx/GXTev.h @@ -0,0 +1,35 @@ +#ifndef DOLPHIN_GXTEV_H +#define DOLPHIN_GXTEV_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void GXSetTevOp(GXTevStageID id, GXTevMode mode); +void GXSetTevColorIn(GXTevStageID stage, GXTevColorArg a, GXTevColorArg b, GXTevColorArg c, GXTevColorArg d); +void GXSetTevAlphaIn(GXTevStageID stage, GXTevAlphaArg a, GXTevAlphaArg b, GXTevAlphaArg c, GXTevAlphaArg d); +void GXSetTevColorOp(GXTevStageID stage, GXTevOp op, GXTevBias bias, GXTevScale scale, GXBool clamp, + GXTevRegID out_reg); +void GXSetTevAlphaOp(GXTevStageID stage, GXTevOp op, GXTevBias bias, GXTevScale scale, GXBool clamp, + GXTevRegID out_reg); +void GXSetTevColor(GXTevRegID id, GXColor color); +void GXSetTevColorS10(GXTevRegID id, GXColorS10 color); +void GXSetTevKColor(GXTevKColorID id, GXColor color); +void GXSetTevKColorSel(GXTevStageID stage, GXTevKColorSel sel); +void GXSetTevKAlphaSel(GXTevStageID stage, GXTevKAlphaSel sel); +void GXSetTevSwapMode(GXTevStageID stage, GXTevSwapSel ras_sel, GXTevSwapSel tex_sel); +void GXSetTevSwapModeTable(GXTevSwapSel table, GXTevColorChan red, GXTevColorChan green, GXTevColorChan blue, + GXTevColorChan alpha); +void GXSetAlphaCompare(GXCompare comp0, u8 ref0, GXAlphaOp op, GXCompare comp1, u8 ref1); +void GXSetZTexture(GXZTexOp op, GXTexFmt fmt, u32 bias); +void GXSetTevOrder(GXTevStageID stage, GXTexCoordID coord, GXTexMapID map, GXChannelID color); +void GXSetNumTevStages(u8 nStages); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/gx/GXTexture.h b/include/dolphin/gx/GXTexture.h new file mode 100644 index 0000000..942811b --- /dev/null +++ b/include/dolphin/gx/GXTexture.h @@ -0,0 +1,29 @@ +#ifndef DOLPHIN_GXTEXTURE_H +#define DOLPHIN_GXTEXTURE_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void GXInitTexObj(GXTexObj* obj, const void* data, u16 width, u16 height, u32 format, GXTexWrapMode wrapS, + GXTexWrapMode wrapT, GXBool mipmap); +void GXInitTexObjCI(GXTexObj* obj, const void* data, u16 width, u16 height, GXCITexFmt format, GXTexWrapMode wrapS, + GXTexWrapMode wrapT, GXBool mipmap, u32 tlut); +void GXInitTexObjData(GXTexObj* obj, const void* data); +void GXInitTexObjLOD(GXTexObj* obj, GXTexFilter min_filt, GXTexFilter mag_filt, f32 min_lod, f32 max_lod, f32 lod_bias, + GXBool bias_clamp, GXBool do_edge_lod, GXAnisotropy max_aniso); +void GXLoadTexObj(GXTexObj* obj, GXTexMapID id); +u32 GXGetTexBufferSize(u16 width, u16 height, u32 format, GXBool mipmap, u8 max_lod); +void GXInvalidateTexAll(); +void GXInitTexObjWrapMode(GXTexObj* obj, GXTexWrapMode s, GXTexWrapMode t); +void GXInitTlutObj(GXTlutObj* obj, const void* data, GXTlutFmt format, u16 entries); +void GXLoadTlut(const GXTlutObj* obj, GXTlut idx); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/gx/GXTransform.h b/include/dolphin/gx/GXTransform.h new file mode 100644 index 0000000..73f5e83 --- /dev/null +++ b/include/dolphin/gx/GXTransform.h @@ -0,0 +1,33 @@ +#ifndef DOLPHIN_GXTRANSFORM_H +#define DOLPHIN_GXTRANSFORM_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define GX_PROJECTION_SZ 7 + +#ifdef TARGET_PC +void GXSetProjection(const void* mtx, GXProjectionType type); +void GXLoadPosMtxImm(const void* mtx, u32 id); +void GXLoadNrmMtxImm(const void* mtx, u32 id); +void GXLoadTexMtxImm(const void* mtx, u32 
id, GXTexMtxType type); +#else +void GXSetProjection(f32 mtx[4][4], GXProjectionType type); +void GXLoadPosMtxImm(f32 mtx[3][4], u32 id); +void GXLoadNrmMtxImm(f32 mtx[3][4], u32 id); +void GXLoadTexMtxImm(f32 mtx[][4], u32 id, GXTexMtxType type); +#endif +void GXSetViewport(f32 left, f32 top, f32 wd, f32 ht, f32 nearz, f32 farz); +void GXSetCurrentMtx(u32 id); +void GXSetViewportJitter(f32 left, f32 top, f32 wd, f32 ht, f32 nearz, f32 farz, u32 field); +void GXSetScissorBoxOffset(s32 x_off, s32 y_off); +void GXSetClipMode(GXClipMode mode); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/gx/GXVert.h b/include/dolphin/gx/GXVert.h new file mode 100644 index 0000000..8af0cde --- /dev/null +++ b/include/dolphin/gx/GXVert.h @@ -0,0 +1,132 @@ +#ifndef DOLPHIN_GXVERT_H +#define DOLPHIN_GXVERT_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define GXFIFO_ADDR 0xCC008000 + +typedef union { + u8 u8; + u16 u16; + u32 u32; + u64 u64; + s8 s8; + s16 s16; + s32 s32; + s64 s64; + f32 f32; + f64 f64; +} PPCWGPipe; + +#ifdef __MWERKS__ +volatile PPCWGPipe GXWGFifo : GXFIFO_ADDR; +#else +#define GXWGFifo (*(volatile PPCWGPipe*)GXFIFO_ADDR) +#endif + +#ifdef TARGET_PC + +void GXPosition3f32(f32 x, f32 y, f32 z); +void GXPosition3u16(u16 x, u16 y, u16 z); +void GXPosition3s16(s16 x, s16 y, s16 z); +void GXPosition3u8(u8 x, u8 y, u8 z); +void GXPosition3s8(s8 x, s8 y, s8 z); + +void GXPosition2f32(f32 x, f32 y); +void GXPosition2u16(u16 x, u16 y); +void GXPosition2s16(s16 x, s16 y); +void GXPosition2u8(u8 x, u8 y); +void GXPosition2s8(s8 x, s8 y); + +void GXPosition1x16(u16 index); +void GXPosition1x8(u8 index); + +void GXNormal3f32(f32 x, f32 y, f32 z); +void GXNormal3s16(s16 x, s16 y, s16 z); +void GXNormal3s8(s8 x, s8 y, s8 z); + +void GXNormal1x16(u16 index); +void GXNormal1x8(u8 index); + +void GXColor4u8(u8 r, u8 g, u8 b, u8 a); + +void GXColor3u8(u8 r, u8 g, u8 b); + +void GXColor1u32(u32 clr); +void GXColor1u16(u16 clr); + +void GXColor1x16(u16 index); +void GXColor1x8(u8 index); + +void GXTexCoord2f32(f32 s, f32 t); +void GXTexCoord2u16(u16 s, u16 t); +void GXTexCoord2s16(s16 s, s16 t); +void GXTexCoord2u8(u8 s, u8 t); +void GXTexCoord2s8(s8 s, s8 t); + +void GXTexCoord1f32(f32 s, f32 t); +void GXTexCoord1u16(u16 s, u16 t); +void GXTexCoord1s16(s16 s, s16 t); +void GXTexCoord1u8(u8 s, u8 t); +void GXTexCoord1s8(s8 s, s8 t); + +void GXTexCoord1x16(u16 index); +void GXTexCoord1x8(u8 index); + +extern void GXEnd(void); + +#else + +static inline void GXPosition2f32(const f32 x, const f32 y) { + GXWGFifo.f32 = x; + GXWGFifo.f32 = y; +} + +static inline void GXPosition3s16(const s16 x, const s16 y, const s16 z) { + GXWGFifo.s16 = x; + GXWGFifo.s16 = y; + GXWGFifo.s16 = z; +} + +static inline void GXPosition3f32(const f32 x, const f32 y, const f32 z) { + GXWGFifo.f32 = x; + GXWGFifo.f32 = y; + GXWGFifo.f32 = z; +} + +static inline void GXNormal3f32(const f32 x, const f32 y, const f32 z) { + GXWGFifo.f32 = x; + GXWGFifo.f32 = y; + GXWGFifo.f32 = z; +} + +static inline void GXColor4u8(const u8 r, const u8 g, const u8 b, const u8 a) { + GXWGFifo.u8 = r; + GXWGFifo.u8 = g; + GXWGFifo.u8 = b; + GXWGFifo.u8 = a; +} + +static inline void GXTexCoord2s16(const s16 u, const s16 v) { + GXWGFifo.s16 = u; + GXWGFifo.s16 = v; +} + +static inline void GXTexCoord2f32(const f32 u, const f32 v) { + GXWGFifo.f32 = u; + GXWGFifo.f32 = v; +} + +static inline void GXEnd(void) {} + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/pad.h b/include/dolphin/pad.h new 
file mode 100644 index 0000000..2422d2d --- /dev/null +++ b/include/dolphin/pad.h @@ -0,0 +1,114 @@ +#ifndef DOLPHIN_PAD_H +#define DOLPHIN_PAD_H + +#include + +#define PAD_CHAN0 0 +#define PAD_CHAN1 1 +#define PAD_CHAN2 2 +#define PAD_CHAN3 3 +#define PAD_CHANMAX 4 + +#define PAD_MOTOR_STOP 0 +#define PAD_MOTOR_RUMBLE 1 +#define PAD_MOTOR_STOP_HARD 2 + +#define PAD_ERR_NONE 0 +#define PAD_ERR_NO_CONTROLLER -1 +#define PAD_ERR_NOT_READY -2 +#define PAD_ERR_TRANSFER -3 + +#define PAD_BUTTON_LEFT 0x0001 +#define PAD_BUTTON_RIGHT 0x0002 +#define PAD_BUTTON_DOWN 0x0004 +#define PAD_BUTTON_UP 0x0008 +#define PAD_TRIGGER_Z 0x0010 +#define PAD_TRIGGER_R 0x0020 +#define PAD_TRIGGER_L 0x0040 +#define PAD_BUTTON_A 0x0100 +#define PAD_BUTTON_B 0x0200 +#define PAD_BUTTON_X 0x0400 +#define PAD_BUTTON_Y 0x0800 +#define PAD_BUTTON_MENU 0x1000 +#define PAD_BUTTON_START 0x1000 + +#define PAD_CHAN0_BIT 0x80000000 +#define PAD_CHAN1_BIT 0x40000000 +#define PAD_CHAN2_BIT 0x20000000 +#define PAD_CHAN3_BIT 0x10000000 + +#define PADButtonDown(buttonLast, button) (((buttonLast) ^ (button)) & (button)) +#define PADButtonUp(buttonLast, button) (((buttonLast) ^ (button)) & (buttonLast)) + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct PADStatus { + u16 button; + s8 stickX; + s8 stickY; + s8 substickX; + s8 substickY; + u8 triggerL; + u8 triggerR; + u8 analogA; + u8 analogB; + s8 err; +} PADStatus; + +BOOL PADInit(); +u32 PADRead(PADStatus* status); +BOOL PADReset(u32 mask); +BOOL PADRecalibrate(u32 mask); +void PADClamp(PADStatus* status); +void PADClampCircle(PADStatus* status); +void PADControlMotor(s32 chan, u32 cmd); +void PADSetSpec(u32 spec); +void PADControlAllMotors(const u32* cmdArr); + +#ifdef TARGET_PC +/* New API to facilitate controller interactions */ +typedef struct PADDeadZones { + bool emulateTriggers; + bool useDeadzones; + u16 stickDeadZone; + u16 substickDeadZone; + u16 leftTriggerActivationZone; + u16 rightTriggerActivationZone; +} PADDeadZones; + +typedef u16 PADButton; + +typedef struct PADButtonMapping { + u32 nativeButton; + PADButton padButton; +} PADButtonMapping; + +/* Returns the total number of controllers */ +u32 PADCount(); +/* Returns the controller name for the given index into the controller map */ +const char* PADGetNameForControllerIndex(u32 idx); +void PADSetPortForIndex(u32 index, s32 port); +s32 PADGetIndexForPort(u32 port); +void PADGetVidPid(u32 port, u32* vid, u32* pid); +void PADClearPort(u32 port); +const char* PADGetName(u32 port); +void PADSetButtonMapping(u32 port, PADButtonMapping mapping); +void PADSetAllButtonMappings(u32 port, PADButtonMapping buttons[12]); +PADButtonMapping* PADGetButtonMappings(u32 port, u32* buttonCount); +void PADSerializeMappings(); +PADDeadZones* PADGetDeadZones(u32 port); +const char* PADGetButtonName(PADButton); +const char* PADGetNativeButtonName(u32 button); +/* Returns any pressed native button */ +s32 PADGetNativeButtonPressed(u32 port); +void PADRestoreDefaultMapping(u32 port); +void PADBlockInput(bool block); +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/si.h b/include/dolphin/si.h new file mode 100644 index 0000000..02eb65a --- /dev/null +++ b/include/dolphin/si.h @@ -0,0 +1,72 @@ +#ifndef DOLPHIN_SI_H +#define DOLPHIN_SI_H + +#include + +#define SI_CHAN0 0 +#define SI_CHAN1 1 +#define SI_CHAN2 2 +#define SI_CHAN3 3 +#define SI_MAX_CHAN 4 + +#define SI_CHAN0_BIT 0x80000000 +#define SI_CHAN1_BIT 0x40000000 +#define SI_CHAN2_BIT 0x20000000 +#define SI_CHAN3_BIT 0x10000000 +#define 
SI_CHAN_BIT(chn) (SI_CHAN0_BIT >> (chn)) + +#define SI_ERROR_UNDER_RUN 0x0001 +#define SI_ERROR_OVER_RUN 0x0002 +#define SI_ERROR_COLLISION 0x0004 +#define SI_ERROR_NO_RESPONSE 0x0008 +#define SI_ERROR_WRST 0x0010 +#define SI_ERROR_RDST 0x0020 +#define SI_ERROR_UNKNOWN 0x0040 +#define SI_ERROR_BUSY 0x0080 + +#define SI_TYPE_MASK 0x18000000u +#define SI_TYPE_N64 0x00000000u +#define SI_TYPE_DOLPHIN 0x08000000u +#define SI_TYPE_GC SI_TYPE_DOLPHIN + +// GameCube specific +#define SI_GC_WIRELESS 0x80000000u +#define SI_GC_NOMOTOR 0x20000000u // no rumble motor +#define SI_GC_STANDARD 0x01000000u // dolphin standard controller + +// WaveBird specific +#define SI_WIRELESS_RECEIVED 0x40000000u // 0: no wireless unit +#define SI_WIRELESS_IR 0x04000000u // 0: IR 1: RF +#define SI_WIRELESS_STATE 0x02000000u // 0: variable 1: fixed +#define SI_WIRELESS_ORIGIN 0x00200000u // 0: invalid 1: valid +#define SI_WIRELESS_FIX_ID 0x00100000u // 0: not fixed 1: fixed +#define SI_WIRELESS_TYPE 0x000f0000u +#define SI_WIRELESS_LITE_MASK 0x000c0000u // 0: normal 1: lite controller +#define SI_WIRELESS_LITE 0x00040000u // 0: normal 1: lite controller +#define SI_WIRELESS_CONT_MASK 0x00080000u // 0: non-controller 1: non-controller +#define SI_WIRELESS_CONT 0x00000000u +#define SI_WIRELESS_ID 0x00c0ff00u +#define SI_WIRELESS_TYPE_ID (SI_WIRELESS_TYPE | SI_WIRELESS_ID) + +#define SI_N64_CONTROLLER (SI_TYPE_N64 | 0x05000000) +#define SI_N64_MIC (SI_TYPE_N64 | 0x00010000) +#define SI_N64_KEYBOARD (SI_TYPE_N64 | 0x00020000) +#define SI_N64_MOUSE (SI_TYPE_N64 | 0x02000000) +#define SI_GBA (SI_TYPE_N64 | 0x00040000) +#define SI_GC_CONTROLLER (SI_TYPE_GC | SI_GC_STANDARD) +#define SI_GC_RECEIVER (SI_TYPE_GC | SI_GC_WIRELESS) +#define SI_GC_WAVEBIRD (SI_TYPE_GC | SI_GC_WIRELESS | SI_GC_STANDARD | SI_WIRELESS_STATE | SI_WIRELESS_FIX_ID) +#define SI_GC_KEYBOARD (SI_TYPE_GC | 0x00200000) +#define SI_GC_STEERING (SI_TYPE_GC | 0x00000000) + +#ifdef __cplusplus +extern "C" { +#endif + +u32 SIProbe(s32 chan); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/types.h b/include/dolphin/types.h new file mode 100644 index 0000000..4411a0d --- /dev/null +++ b/include/dolphin/types.h @@ -0,0 +1,75 @@ +#ifndef DOLPHIN_TYPES_H +#define DOLPHIN_TYPES_H + +#ifdef TARGET_PC +#include +#endif + +typedef signed char s8; +typedef signed short int s16; +typedef signed int s32; +#if __WORDSIZE == 64 +typedef signed long int s64; +#else +typedef signed long long int s64; +#endif + +typedef unsigned char u8; +typedef unsigned short int u16; +typedef unsigned int u32; +#if __WORDSIZE == 64 +typedef unsigned long int u64; +#else +typedef unsigned long long int u64; +#endif + +typedef volatile u8 vu8; +typedef volatile u16 vu16; +typedef volatile u32 vu32; +typedef volatile u64 vu64; + +typedef volatile s8 vs8; +typedef volatile s16 vs16; +typedef volatile s32 vs32; +typedef volatile s64 vs64; + +typedef float f32; +typedef double f64; + +typedef volatile f32 vf32; +typedef volatile f64 vf64; + +#ifdef TARGET_PC +#include +typedef bool BOOL; +#define FALSE false +#define TRUE true +#else +typedef int BOOL; +#define FALSE 0 +#define TRUE 1 +#endif + +#ifdef TARGET_PC +#include +#else +#define NULL 0 +#endif +#ifndef __cplusplus +#define nullptr NULL +#endif + +#if defined(__MWERKS__) +#define AT_ADDRESS(addr) : (addr) +#define ATTRIBUTE_ALIGN(num) __attribute__((aligned(num))) +#elif defined(__GNUC__) +#define AT_ADDRESS(addr) // was removed in GCC. define in linker script instead. 
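/*
 * Illustrative sketch only (not part of this commit): a minimal per-frame
 * polling loop for the PAD API declared in include/dolphin/pad.h above.
 * It assumes PADInit() was called once at startup and that PADRead() fills
 * one PADStatus per channel, as in the stock Dolphin SDK.
 */
static void poll_pads_example(void) {
  static u16 lastButtons[PAD_CHANMAX];
  PADStatus status[PAD_CHANMAX];
  PADRead(status);
  for (int chan = 0; chan < PAD_CHANMAX; ++chan) {
    if (status[chan].err != PAD_ERR_NONE) {
      continue; /* no controller or a transfer error on this channel */
    }
    /* PADButtonDown() reports buttons that went from released to pressed */
    const u16 pressed = PADButtonDown(lastButtons[chan], status[chan].button);
    if (pressed & PAD_BUTTON_START) {
      /* handle Start being pressed this frame */
    }
    lastButtons[chan] = status[chan].button;
  }
}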
+#define ATTRIBUTE_ALIGN(num) __attribute__((aligned(num))) +#elif defined(_MSC_VER) +#define AT_ADDRESS(addr) +#define ATTRIBUTE_ALIGN(num) +#else +#error unknown compiler +#endif + +#endif diff --git a/include/dolphin/vi.h b/include/dolphin/vi.h new file mode 100644 index 0000000..7ac0a13 --- /dev/null +++ b/include/dolphin/vi.h @@ -0,0 +1,23 @@ +#ifndef DOLPHIN_VI_H +#define DOLPHIN_VI_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void VIInit(void); +void VIConfigure(GXRenderModeObj *rm); +void VIFlush(void); +u32 VIGetTvFormat(void); +void VISetNextFrameBuffer(void *fb); +void VIWaitForRetrace(void); +void VISetBlack(BOOL black); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/dolphin/vifuncs.h b/include/dolphin/vifuncs.h new file mode 100644 index 0000000..acc6ae9 --- /dev/null +++ b/include/dolphin/vifuncs.h @@ -0,0 +1,16 @@ +#ifndef DOLPHIN_VIFUNCS_H +#define DOLPHIN_VIFUNCS_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +u32 VIGetNextField(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/magic_enum.hpp b/include/magic_enum.hpp new file mode 100644 index 0000000..18c2804 --- /dev/null +++ b/include/magic_enum.hpp @@ -0,0 +1,1135 @@ +// __ __ _ ______ _____ +// | \/ | (_) | ____| / ____|_ _ +// | \ / | __ _ __ _ _ ___ | |__ _ __ _ _ _ __ ___ | | _| |_ _| |_ +// | |\/| |/ _` |/ _` | |/ __| | __| | '_ \| | | | '_ ` _ \ | | |_ _|_ _| +// | | | | (_| | (_| | | (__ | |____| | | | |_| | | | | | | | |____|_| |_| +// |_| |_|\__,_|\__, |_|\___| |______|_| |_|\__,_|_| |_| |_| \_____| +// __/ | https://github.com/Neargye/magic_enum +// |___/ version 0.7.2 +// +// Licensed under the MIT License . +// SPDX-License-Identifier: MIT +// Copyright (c) 2019 - 2021 Daniil Goncharov . +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#ifndef NEARGYE_MAGIC_ENUM_HPP +#define NEARGYE_MAGIC_ENUM_HPP + +#define MAGIC_ENUM_VERSION_MAJOR 0 +#define MAGIC_ENUM_VERSION_MINOR 7 +#define MAGIC_ENUM_VERSION_PATCH 2 + +#include +#include +#include +#include +#include +#include +#include +#include + +#if !defined(MAGIC_ENUM_USING_ALIAS_OPTIONAL) +#include +#endif +#if !defined(MAGIC_ENUM_USING_ALIAS_STRING) +#include +#endif +#if !defined(MAGIC_ENUM_USING_ALIAS_STRING_VIEW) +#include +#endif + +#if defined(__clang__) +# pragma clang diagnostic push +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wmaybe-uninitialized" // May be used uninitialized 'return {};'. 
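/*
 * Illustrative sketch only (not part of this commit): the conventional VI
 * bring-up sequence that the declarations in include/dolphin/vi.h above
 * mirror from the Dolphin SDK. The render mode and framebuffer come from
 * the caller; the exact call order shown here is an assumption about
 * typical usage, not code from this repository.
 */
static void vi_setup_example(GXRenderModeObj* rmode, void* framebuffer) {
  VIInit();
  VIConfigure(rmode);
  VISetNextFrameBuffer(framebuffer);
  VISetBlack(FALSE);
  VIFlush();
  VIWaitForRetrace(); /* let the new configuration take effect */
}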
+#elif defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable : 26495) // Variable 'static_string::chars_' is uninitialized. +# pragma warning(disable : 28020) // Arithmetic overflow: Using operator '-' on a 4 byte value and then casting the result to a 8 byte value. +# pragma warning(disable : 26451) // The expression '0<=_Param_(1)&&_Param_(1)<=1-1' is not true at this call. +#endif + +// Checks magic_enum compiler compatibility. +#if defined(__clang__) && __clang_major__ >= 5 || defined(__GNUC__) && __GNUC__ >= 9 || defined(_MSC_VER) && _MSC_VER >= 1910 +# undef MAGIC_ENUM_SUPPORTED +# define MAGIC_ENUM_SUPPORTED 1 +#endif + +// Checks magic_enum compiler aliases compatibility. +#if defined(__clang__) && __clang_major__ >= 5 || defined(__GNUC__) && __GNUC__ >= 9 || defined(_MSC_VER) && _MSC_VER >= 1920 +# undef MAGIC_ENUM_SUPPORTED_ALIASES +# define MAGIC_ENUM_SUPPORTED_ALIASES 1 +#endif + +// Enum value must be greater or equals than MAGIC_ENUM_RANGE_MIN. By default MAGIC_ENUM_RANGE_MIN = -128. +// If need another min range for all enum types by default, redefine the macro MAGIC_ENUM_RANGE_MIN. +#if !defined(MAGIC_ENUM_RANGE_MIN) +# define MAGIC_ENUM_RANGE_MIN -128 +#endif + +// Enum value must be less or equals than MAGIC_ENUM_RANGE_MAX. By default MAGIC_ENUM_RANGE_MAX = 128. +// If need another max range for all enum types by default, redefine the macro MAGIC_ENUM_RANGE_MAX. +#if !defined(MAGIC_ENUM_RANGE_MAX) +# define MAGIC_ENUM_RANGE_MAX 128 +#endif + +namespace magic_enum { + +// If need another optional type, define the macro MAGIC_ENUM_USING_ALIAS_OPTIONAL. +#if defined(MAGIC_ENUM_USING_ALIAS_OPTIONAL) +MAGIC_ENUM_USING_ALIAS_OPTIONAL +#else +using std::optional; +#endif + +// If need another string_view type, define the macro MAGIC_ENUM_USING_ALIAS_STRING_VIEW. +#if defined(MAGIC_ENUM_USING_ALIAS_STRING_VIEW) +MAGIC_ENUM_USING_ALIAS_STRING_VIEW +#else +using std::string_view; +#endif + +// If need another string type, define the macro MAGIC_ENUM_USING_ALIAS_STRING. +#if defined(MAGIC_ENUM_USING_ALIAS_STRING) +MAGIC_ENUM_USING_ALIAS_STRING +#else +using std::string; +#endif + +namespace customize { + +// Enum value must be in range [MAGIC_ENUM_RANGE_MIN, MAGIC_ENUM_RANGE_MAX]. By default MAGIC_ENUM_RANGE_MIN = -128, MAGIC_ENUM_RANGE_MAX = 128. +// If need another range for all enum types by default, redefine the macro MAGIC_ENUM_RANGE_MIN and MAGIC_ENUM_RANGE_MAX. +// If need another range for specific enum type, add specialization enum_range for necessary enum type. +template +struct enum_range { + static_assert(std::is_enum_v, "magic_enum::customize::enum_range requires enum type."); + inline static constexpr int min = MAGIC_ENUM_RANGE_MIN; + inline static constexpr int max = MAGIC_ENUM_RANGE_MAX; + static_assert(max > min, "magic_enum::customize::enum_range requires max > min."); +}; + +static_assert(MAGIC_ENUM_RANGE_MIN <= 0, "MAGIC_ENUM_RANGE_MIN must be less or equals than 0."); +static_assert(MAGIC_ENUM_RANGE_MIN > (std::numeric_limits::min)(), "MAGIC_ENUM_RANGE_MIN must be greater than INT16_MIN."); + +static_assert(MAGIC_ENUM_RANGE_MAX > 0, "MAGIC_ENUM_RANGE_MAX must be greater than 0."); +static_assert(MAGIC_ENUM_RANGE_MAX < (std::numeric_limits::max)(), "MAGIC_ENUM_RANGE_MAX must be less than INT16_MAX."); + +static_assert(MAGIC_ENUM_RANGE_MAX > MAGIC_ENUM_RANGE_MIN, "MAGIC_ENUM_RANGE_MAX must be greater than MAGIC_ENUM_RANGE_MIN."); + +// If need custom names for enum, add specialization enum_name for necessary enum type. 
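// Illustrative sketch only (not part of the vendored header): what a custom
// name specialization, as suggested by the comment above, can look like. The
// "Color" enum is a hypothetical example type, not anything defined in this
// repository.
//
//   enum class Color { RED, GREEN, BLUE };
//
//   template <>
//   constexpr magic_enum::string_view
//   magic_enum::customize::enum_name<Color>(Color value) noexcept {
//     if (value == Color::RED) {
//       return "the red color"; // overrides the reflected name
//     }
//     return {}; // empty result falls back to the default reflected name
//   }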
+template +constexpr string_view enum_name(E) noexcept { + static_assert(std::is_enum_v, "magic_enum::customize::enum_name requires enum type."); + + return {}; +} + +} // namespace magic_enum::customize + +namespace detail { + +template +struct supported +#if defined(MAGIC_ENUM_SUPPORTED) && MAGIC_ENUM_SUPPORTED || defined(MAGIC_ENUM_NO_CHECK_SUPPORT) + : std::true_type {}; +#else + : std::false_type {}; +#endif + +struct char_equal_to { + constexpr bool operator()(char lhs, char rhs) const noexcept { + return lhs == rhs; + } +}; + +template +class static_string { + public: + constexpr explicit static_string(string_view str) noexcept : static_string{str, std::make_index_sequence{}} { + assert(str.size() == N); + } + + constexpr const char* data() const noexcept { return chars_; } + + constexpr std::size_t size() const noexcept { return N; } + + constexpr operator string_view() const noexcept { return {data(), size()}; } + + private: + template + constexpr static_string(string_view str, std::index_sequence) noexcept : chars_{str[I]..., '\0'} {} + + char chars_[N + 1]; +}; + +template <> +class static_string<0> { + public: + constexpr explicit static_string(string_view) noexcept {} + + constexpr const char* data() const noexcept { return nullptr; } + + constexpr std::size_t size() const noexcept { return 0; } + + constexpr operator string_view() const noexcept { return {}; } +}; + +constexpr string_view pretty_name(string_view name) noexcept { + for (std::size_t i = name.size(); i > 0; --i) { + if (!((name[i - 1] >= '0' && name[i - 1] <= '9') || + (name[i - 1] >= 'a' && name[i - 1] <= 'z') || + (name[i - 1] >= 'A' && name[i - 1] <= 'Z') || + (name[i - 1] == '_'))) { + name.remove_prefix(i); + break; + } + } + + if (name.size() > 0 && ((name.front() >= 'a' && name.front() <= 'z') || + (name.front() >= 'A' && name.front() <= 'Z') || + (name.front() == '_'))) { + return name; + } + + return {}; // Invalid name. 
+} + +constexpr std::size_t find(string_view str, char c) noexcept { +#if defined(__clang__) && __clang_major__ < 9 && defined(__GLIBCXX__) || defined(_MSC_VER) && _MSC_VER < 1920 && !defined(__clang__) +// https://stackoverflow.com/questions/56484834/constexpr-stdstring-viewfind-last-of-doesnt-work-on-clang-8-with-libstdc +// https://developercommunity.visualstudio.com/content/problem/360432/vs20178-regression-c-failed-in-test.html + constexpr bool workaround = true; +#else + constexpr bool workaround = false; +#endif + if constexpr (workaround) { + for (std::size_t i = 0; i < str.size(); ++i) { + if (str[i] == c) { + return i; + } + } + + return string_view::npos; + } else { + return str.find_first_of(c); + } +} + +template +constexpr std::array, N> to_array(T (&a)[N], std::index_sequence) { + return {{a[I]...}}; +} + +template +constexpr bool cmp_equal(string_view lhs, string_view rhs, BinaryPredicate&& p) noexcept(std::is_nothrow_invocable_r_v) { +#if defined(_MSC_VER) && _MSC_VER < 1920 && !defined(__clang__) + // https://developercommunity.visualstudio.com/content/problem/360432/vs20178-regression-c-failed-in-test.html + // https://developercommunity.visualstudio.com/content/problem/232218/c-constexpr-string-view.html + constexpr bool workaround = true; +#else + constexpr bool workaround = false; +#endif + constexpr bool default_predicate = std::is_same_v, char_equal_to>; + + if constexpr (default_predicate && !workaround) { + static_cast(p); + return lhs == rhs; + } else { + if (lhs.size() != rhs.size()) { + return false; + } + + const auto size = lhs.size(); + for (std::size_t i = 0; i < size; ++i) { + if (!p(lhs[i], rhs[i])) { + return false; + } + } + + return true; + } +} + +template +constexpr bool cmp_less(L lhs, R rhs) noexcept { + static_assert(std::is_integral_v && std::is_integral_v, "magic_enum::detail::cmp_less requires integral type."); + + if constexpr (std::is_signed_v == std::is_signed_v) { + // If same signedness (both signed or both unsigned). + return lhs < rhs; + } else if constexpr (std::is_signed_v) { + // If 'right' is negative, then result is 'false', otherwise cast & compare. + return rhs > 0 && lhs < static_cast>(rhs); + } else { + // If 'left' is negative, then result is 'true', otherwise cast & compare. + return lhs < 0 || static_cast>(lhs) < rhs; + } +} + +template +constexpr I log2(I value) noexcept { + static_assert(std::is_integral_v, "magic_enum::detail::log2 requires integral type."); + + auto ret = I{0}; + for (; value > I{1}; value >>= I{1}, ++ret) {} + + return ret; +} + +template +constexpr bool is_pow2(I x) noexcept { + static_assert(std::is_integral_v, "magic_enum::detail::is_pow2 requires integral type."); + + return x != 0 && (x & (x - 1)) == 0; +} + +template +inline constexpr bool is_enum_v = std::is_enum_v && std::is_same_v>; + +template +constexpr auto n() noexcept { + static_assert(is_enum_v, "magic_enum::detail::n requires enum type."); +#if defined(MAGIC_ENUM_SUPPORTED) && MAGIC_ENUM_SUPPORTED +# if defined(__clang__) + constexpr string_view name{__PRETTY_FUNCTION__ + 34, sizeof(__PRETTY_FUNCTION__) - 36}; +# elif defined(__GNUC__) + constexpr string_view name{__PRETTY_FUNCTION__ + 49, sizeof(__PRETTY_FUNCTION__) - 51}; +# elif defined(_MSC_VER) + constexpr string_view name{__FUNCSIG__ + 40, sizeof(__FUNCSIG__) - 57}; +# endif + return static_string{name}; +#else + return string_view{}; // Unsupported compiler. 
+#endif +} + +template +inline constexpr auto type_name_v = n(); + +template +constexpr auto n() noexcept { + static_assert(is_enum_v, "magic_enum::detail::n requires enum type."); + constexpr auto custom_name = customize::enum_name(V); + + if constexpr (custom_name.empty()) { + static_cast(custom_name); +#if defined(MAGIC_ENUM_SUPPORTED) && MAGIC_ENUM_SUPPORTED +# if defined(__clang__) || defined(__GNUC__) + constexpr auto name = pretty_name({__PRETTY_FUNCTION__, sizeof(__PRETTY_FUNCTION__) - 2}); +# elif defined(_MSC_VER) + constexpr auto name = pretty_name({__FUNCSIG__, sizeof(__FUNCSIG__) - 17}); +# endif + return static_string{name}; +#else + return string_view{}; // Unsupported compiler. +#endif + } else { + return static_string{custom_name}; + } +} + +template +inline constexpr auto enum_name_v = n(); + +template +constexpr bool is_valid() noexcept { + static_assert(is_enum_v, "magic_enum::detail::is_valid requires enum type."); + + return n(V)>().size() != 0; +} + +template > +constexpr E value(std::size_t i) noexcept { + static_assert(is_enum_v, "magic_enum::detail::value requires enum type."); + + if constexpr (IsFlags) { + return static_cast(U{1} << static_cast(static_cast(i) + O)); + } else { + return static_cast(static_cast(i) + O); + } +} + +template > +constexpr int reflected_min() noexcept { + static_assert(is_enum_v, "magic_enum::detail::reflected_min requires enum type."); + + if constexpr (IsFlags) { + return 0; + } else { + constexpr auto lhs = customize::enum_range::min; + static_assert(lhs > (std::numeric_limits::min)(), "magic_enum::enum_range requires min must be greater than INT16_MIN."); + constexpr auto rhs = (std::numeric_limits::min)(); + + if constexpr (cmp_less(lhs, rhs)) { + return rhs; + } else { + static_assert(!is_valid(0)>(), "magic_enum::enum_range detects enum value smaller than min range size."); + return lhs; + } + } +} + +template > +constexpr int reflected_max() noexcept { + static_assert(is_enum_v, "magic_enum::detail::reflected_max requires enum type."); + + if constexpr (IsFlags) { + return std::numeric_limits::digits - 1; + } else { + constexpr auto lhs = customize::enum_range::max; + static_assert(lhs < (std::numeric_limits::max)(), "magic_enum::enum_range requires max must be less than INT16_MAX."); + constexpr auto rhs = (std::numeric_limits::max)(); + + if constexpr (cmp_less(lhs, rhs)) { + static_assert(!is_valid(0)>(), "magic_enum::enum_range detects enum value larger than max range size."); + return lhs; + } else { + return rhs; + } + } +} + +template +inline constexpr auto reflected_min_v = reflected_min(); + +template +inline constexpr auto reflected_max_v = reflected_max(); + +template +constexpr std::size_t values_count(const bool (&valid)[N]) noexcept { + auto count = std::size_t{0}; + for (std::size_t i = 0; i < N; ++i) { + if (valid[i]) { + ++count; + } + } + + return count; +} + +template +constexpr auto values(std::index_sequence) noexcept { + static_assert(is_enum_v, "magic_enum::detail::values requires enum type."); + constexpr bool valid[sizeof...(I)] = {is_valid(I)>()...}; + constexpr std::size_t count = values_count(valid); + + if constexpr (count > 0) { + E values[count] = {}; + for (std::size_t i = 0, v = 0; v < count; ++i) { + if (valid[i]) { + values[v++] = value(i); + } + } + + return to_array(values, std::make_index_sequence{}); + } else { + return std::array{}; + } +} + +template > +constexpr auto values() noexcept { + static_assert(is_enum_v, "magic_enum::detail::values requires enum type."); + constexpr auto min = 
reflected_min_v; + constexpr auto max = reflected_max_v; + constexpr auto range_size = max - min + 1; + static_assert(range_size > 0, "magic_enum::enum_range requires valid size."); + static_assert(range_size < (std::numeric_limits::max)(), "magic_enum::enum_range requires valid size."); + + return values>(std::make_index_sequence{}); +} + +template +inline constexpr auto values_v = values(); + +template > +using values_t = decltype((values_v)); + +template +inline constexpr auto count_v = values_v.size(); + +template > +inline constexpr auto min_v = (count_v > 0) ? static_cast(values_v.front()) : U{0}; + +template > +inline constexpr auto max_v = (count_v > 0) ? static_cast(values_v.back()) : U{0}; + +template > +constexpr std::size_t range_size() noexcept { + static_assert(is_enum_v, "magic_enum::detail::range_size requires enum type."); + constexpr auto max = IsFlags ? log2(max_v) : max_v; + constexpr auto min = IsFlags ? log2(min_v) : min_v; + constexpr auto range_size = max - min + U{1}; + static_assert(range_size > 0, "magic_enum::enum_range requires valid size."); + static_assert(range_size < (std::numeric_limits::max)(), "magic_enum::enum_range requires valid size."); + + return static_cast(range_size); +} + +template +inline constexpr auto range_size_v = range_size(); + +template +using index_t = std::conditional_t < (std::numeric_limits::max)(), std::uint8_t, std::uint16_t>; + +template +inline constexpr auto invalid_index_v = (std::numeric_limits>::max)(); + +template +constexpr auto indexes(std::index_sequence) noexcept { + static_assert(is_enum_v, "magic_enum::detail::indexes requires enum type."); + constexpr auto min = IsFlags ? log2(min_v) : min_v; + [[maybe_unused]] auto i = index_t{0}; + + return std::array{{(is_valid(I)>() ? i++ : invalid_index_v)...}}; +} + +template +inline constexpr auto indexes_v = indexes(std::make_index_sequence>{}); + +template +constexpr auto names(std::index_sequence) noexcept { + static_assert(is_enum_v, "magic_enum::detail::names requires enum type."); + + return std::array{{enum_name_v[I]>...}}; +} + +template +inline constexpr auto names_v = names(std::make_index_sequence>{}); + +template > +using names_t = decltype((names_v)); + +template +constexpr auto entries(std::index_sequence) noexcept { + static_assert(is_enum_v, "magic_enum::detail::entries requires enum type."); + + return std::array, sizeof...(I)>{{{values_v[I], enum_name_v[I]>}...}}; +} + +template +inline constexpr auto entries_v = entries(std::make_index_sequence>{}); + +template > +using entries_t = decltype((entries_v)); + +template > +constexpr bool is_sparse() noexcept { + static_assert(is_enum_v, "magic_enum::detail::is_sparse requires enum type."); + + return range_size_v != count_v; +} + +template +inline constexpr bool is_sparse_v = is_sparse(); + +template > +constexpr std::size_t undex(U value) noexcept { + static_assert(is_enum_v, "magic_enum::detail::undex requires enum type."); + + if (const auto i = static_cast(value - min_v); value >= min_v && value <= max_v) { + if constexpr (is_sparse_v) { + if (const auto idx = indexes_v[i]; idx != invalid_index_v) { + return idx; + } + } else { + return i; + } + } + + return invalid_index_v; // Value out of range. 
+} + +template > +constexpr std::size_t endex(E value) noexcept { + static_assert(is_enum_v, "magic_enum::detail::endex requires enum type."); + + return undex(static_cast(value)); +} + +template > +constexpr U value_ors() noexcept { + static_assert(is_enum_v, "magic_enum::detail::endex requires enum type."); + + auto value = U{0}; + for (std::size_t i = 0; i < count_v; ++i) { + value |= static_cast(values_v[i]); + } + + return value; +} + +template +struct enable_if_enum {}; + +template +struct enable_if_enum { + using type = R; + using D = std::decay_t; + static_assert(supported::value, "magic_enum unsupported compiler (https://github.com/Neargye/magic_enum#compiler-compatibility)."); +}; + +template +using enable_if_enum_t = std::enable_if_t>, R>; + +template >>> +using enum_concept = T; + +template > +struct is_scoped_enum : std::false_type {}; + +template +struct is_scoped_enum : std::bool_constant>> {}; + +template > +struct is_unscoped_enum : std::false_type {}; + +template +struct is_unscoped_enum : std::bool_constant>> {}; + +template >> +struct underlying_type {}; + +template +struct underlying_type : std::underlying_type> {}; + +} // namespace magic_enum::detail + +// Checks is magic_enum supported compiler. +inline constexpr bool is_magic_enum_supported = detail::supported::value; + +template +using Enum = detail::enum_concept; + +// Checks whether T is an Unscoped enumeration type. +// Provides the member constant value which is equal to true, if T is an [Unscoped enumeration](https://en.cppreference.com/w/cpp/language/enum#Unscoped_enumeration) type. Otherwise, value is equal to false. +template +struct is_unscoped_enum : detail::is_unscoped_enum {}; + +template +inline constexpr bool is_unscoped_enum_v = is_unscoped_enum::value; + +// Checks whether T is an Scoped enumeration type. +// Provides the member constant value which is equal to true, if T is an [Scoped enumeration](https://en.cppreference.com/w/cpp/language/enum#Scoped_enumerations) type. Otherwise, value is equal to false. +template +struct is_scoped_enum : detail::is_scoped_enum {}; + +template +inline constexpr bool is_scoped_enum_v = is_scoped_enum::value; + +// If T is a complete enumeration type, provides a member typedef type that names the underlying type of T. +// Otherwise, if T is not an enumeration type, there is no member type. Otherwise (T is an incomplete enumeration type), the program is ill-formed. +template +struct underlying_type : detail::underlying_type {}; + +template +using underlying_type_t = typename underlying_type::type; + +// Returns type name of enum. +template +[[nodiscard]] constexpr auto enum_type_name() noexcept -> detail::enable_if_enum_t { + using D = std::decay_t; + constexpr string_view name = detail::type_name_v; + static_assert(name.size() > 0, "Enum type does not have a name."); + + return name; +} + +// Returns number of enum values. +template +[[nodiscard]] constexpr auto enum_count() noexcept -> detail::enable_if_enum_t { + using D = std::decay_t; + + return detail::count_v; +} + +// Returns enum value at specified index. +// No bounds checking is performed: the behavior is undefined if index >= number of enum values. 
+template +[[nodiscard]] constexpr auto enum_value(std::size_t index) noexcept -> detail::enable_if_enum_t> { + using D = std::decay_t; + static_assert(detail::count_v > 0, "magic_enum requires enum implementation and valid max and min."); + + if constexpr (detail::is_sparse_v) { + return assert((index < detail::count_v)), detail::values_v[index]; + } else { + return assert((index < detail::count_v)), detail::value>(index); + } +} + +// Returns std::array with enum values, sorted by enum value. +template +[[nodiscard]] constexpr auto enum_values() noexcept -> detail::enable_if_enum_t> { + using D = std::decay_t; + static_assert(detail::count_v > 0, "magic_enum requires enum implementation and valid max and min."); + + return detail::values_v; +} + +// Returns name from static storage enum variable. +// This version is much lighter on the compile times and is not restricted to the enum_range limitation. +template +[[nodiscard]] constexpr auto enum_name() noexcept -> detail::enable_if_enum_t { + using D = std::decay_t; + constexpr string_view name = detail::enum_name_v; + static_assert(name.size() > 0, "Enum value does not have a name."); + + return name; +} + +// Returns name from enum value. +// If enum value does not have name or value out of range, returns empty string. +template +[[nodiscard]] constexpr auto enum_name(E value) noexcept -> detail::enable_if_enum_t { + using D = std::decay_t; + + if (const auto i = detail::endex(value); i != detail::invalid_index_v) { + return detail::names_v[i]; + } + + return {}; // Invalid value or out of range. +} + +// Returns std::array with names, sorted by enum value. +template +[[nodiscard]] constexpr auto enum_names() noexcept -> detail::enable_if_enum_t> { + using D = std::decay_t; + static_assert(detail::count_v > 0, "magic_enum requires enum implementation and valid max and min."); + + return detail::names_v; +} + +// Returns std::array with pairs (value, name), sorted by enum value. +template +[[nodiscard]] constexpr auto enum_entries() noexcept -> detail::enable_if_enum_t> { + using D = std::decay_t; + static_assert(detail::count_v > 0, "magic_enum requires enum implementation and valid max and min."); + + return detail::entries_v; +} + +// Obtains enum value from integer value. +// Returns optional with enum value. +template +[[nodiscard]] constexpr auto enum_cast(underlying_type_t value) noexcept -> detail::enable_if_enum_t>> { + using D = std::decay_t; + + if (detail::undex(value) != detail::invalid_index_v) { + return static_cast(value); + } + + return {}; // Invalid value or out of range. +} + +// Obtains enum value from name. +// Returns optional with enum value. +template +[[nodiscard]] constexpr auto enum_cast(string_view value, BinaryPredicate p) noexcept(std::is_nothrow_invocable_r_v) -> detail::enable_if_enum_t>> { + static_assert(std::is_invocable_r_v, "magic_enum::enum_cast requires bool(char, char) invocable predicate."); + using D = std::decay_t; + + for (std::size_t i = 0; i < detail::count_v; ++i) { + if (detail::cmp_equal(value, detail::names_v[i], p)) { + return enum_value(i); + } + } + + return {}; // Invalid value or out of range. +} + +// Obtains enum value from name. +// Returns optional with enum value. +template +[[nodiscard]] constexpr auto enum_cast(string_view value) noexcept -> detail::enable_if_enum_t>> { + using D = std::decay_t; + + return enum_cast(value, detail::char_equal_to{}); +} + +// Returns integer value from enum value. 
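// Illustrative usage sketch only (not part of the vendored header), showing
// the reflection calls defined above on a hypothetical "Color" enum:
//
//   enum class Color { RED, GREEN, BLUE };
//
//   Color c = Color::BLUE;
//   auto name   = magic_enum::enum_name(c);               // "BLUE"
//   auto parsed = magic_enum::enum_cast<Color>("GREEN");  // optional<Color>
//   auto all    = magic_enum::enum_values<Color>();       // std::array of 3
//   auto count  = magic_enum::enum_count<Color>();        // 3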
+template +[[nodiscard]] constexpr auto enum_integer(E value) noexcept -> detail::enable_if_enum_t> { + return static_cast>(value); +} + +// Obtains index in enum values from enum value. +// Returns optional with index. +template +[[nodiscard]] constexpr auto enum_index(E value) noexcept -> detail::enable_if_enum_t> { + using D = std::decay_t; + + if (const auto i = detail::endex(value); i != detail::invalid_index_v) { + return i; + } + + return {}; // Invalid value or out of range. +} + +// Checks whether enum contains enumerator with such enum value. +template +[[nodiscard]] constexpr auto enum_contains(E value) noexcept -> detail::enable_if_enum_t { + using D = std::decay_t; + + return detail::endex(value) != detail::invalid_index_v; +} + +// Checks whether enum contains enumerator with such integer value. +template +[[nodiscard]] constexpr auto enum_contains(underlying_type_t value) noexcept -> detail::enable_if_enum_t { + using D = std::decay_t; + + return detail::undex(value) != detail::invalid_index_v; +} + +// Checks whether enum contains enumerator with such name. +template +[[nodiscard]] constexpr auto enum_contains(string_view value, BinaryPredicate p) noexcept(std::is_nothrow_invocable_r_v) -> detail::enable_if_enum_t { + static_assert(std::is_invocable_r_v, "magic_enum::enum_contains requires bool(char, char) invocable predicate."); + using D = std::decay_t; + + return enum_cast(value, std::move_if_noexcept(p)).has_value(); +} + +// Checks whether enum contains enumerator with such name. +template +[[nodiscard]] constexpr auto enum_contains(string_view value) noexcept -> detail::enable_if_enum_t { + using D = std::decay_t; + + return enum_cast(value).has_value(); +} + +namespace ostream_operators { + +template , int> = 0> +std::basic_ostream& operator<<(std::basic_ostream& os, E value) { + using D = std::decay_t; + using U = underlying_type_t; +#if defined(MAGIC_ENUM_SUPPORTED) && MAGIC_ENUM_SUPPORTED + if (const auto name = magic_enum::enum_name(value); !name.empty()) { + for (const auto c : name) { + os.put(c); + } + return os; + } +#endif + return (os << static_cast(value)); +} + +template , int> = 0> +std::basic_ostream& operator<<(std::basic_ostream& os, optional value) { + return value.has_value() ? (os << value.value()) : os; +} + +} // namespace magic_enum::ostream_operators + +namespace bitwise_operators { + +template , int> = 0> +constexpr E operator~(E rhs) noexcept { + return static_cast(~static_cast>(rhs)); +} + +template , int> = 0> +constexpr E operator|(E lhs, E rhs) noexcept { + return static_cast(static_cast>(lhs) | static_cast>(rhs)); +} + +template , int> = 0> +constexpr E operator&(E lhs, E rhs) noexcept { + return static_cast(static_cast>(lhs) & static_cast>(rhs)); +} + +template , int> = 0> +constexpr E operator^(E lhs, E rhs) noexcept { + return static_cast(static_cast>(lhs) ^ static_cast>(rhs)); +} + +template , int> = 0> +constexpr E& operator|=(E& lhs, E rhs) noexcept { + return lhs = (lhs | rhs); +} + +template , int> = 0> +constexpr E& operator&=(E& lhs, E rhs) noexcept { + return lhs = (lhs & rhs); +} + +template , int> = 0> +constexpr E& operator^=(E& lhs, E rhs) noexcept { + return lhs = (lhs ^ rhs); +} + +} // namespace magic_enum::bitwise_operators + +namespace flags { + +// Returns type name of enum. +using magic_enum::enum_type_name; + +// Returns number of enum-flags values. 
+template +[[nodiscard]] constexpr auto enum_count() noexcept -> detail::enable_if_enum_t { + using D = std::decay_t; + + return detail::count_v; +} + +// Returns enum-flags value at specified index. +// No bounds checking is performed: the behavior is undefined if index >= number of enum-flags values. +template +[[nodiscard]] constexpr auto enum_value(std::size_t index) noexcept -> detail::enable_if_enum_t> { + using D = std::decay_t; + static_assert(detail::count_v > 0, "magic_enum::flags requires enum-flags implementation."); + + if constexpr (detail::is_sparse_v) { + return assert((index < detail::count_v)), detail::values_v[index]; + } else { + constexpr auto min = detail::log2(detail::min_v); + + return assert((index < detail::count_v)), detail::value(index); + } +} + +// Returns std::array with enum-flags values, sorted by enum-flags value. +template +[[nodiscard]] constexpr auto enum_values() noexcept -> detail::enable_if_enum_t> { + using D = std::decay_t; + static_assert(detail::count_v > 0, "magic_enum::flags requires enum-flags implementation."); + + return detail::values_v; +} + +// Returns name from enum-flags value. +// If enum-flags value does not have name or value out of range, returns empty string. +template +[[nodiscard]] auto enum_name(E value) -> detail::enable_if_enum_t { + using D = std::decay_t; + using U = underlying_type_t; + + string name; + auto check_value = U{0}; + for (std::size_t i = 0; i < detail::count_v; ++i) { + if (const auto v = static_cast(enum_value(i)); (static_cast(value) & v) != 0) { + check_value |= v; + const auto n = detail::names_v[i]; + if (!name.empty()) { + name.append(1, '|'); + } + name.append(n.data(), n.size()); + } + } + + if (check_value != 0 && check_value == static_cast(value)) { + return name; + } + + return {}; // Invalid value or out of range. +} + +// Returns std::array with string names, sorted by enum-flags value. +template +[[nodiscard]] constexpr auto enum_names() noexcept -> detail::enable_if_enum_t> { + using D = std::decay_t; + static_assert(detail::count_v > 0, "magic_enum::flags requires enum-flags implementation."); + + return detail::names_v; +} + +// Returns std::array with pairs (value, name), sorted by enum-flags value. +template +[[nodiscard]] constexpr auto enum_entries() noexcept -> detail::enable_if_enum_t> { + using D = std::decay_t; + static_assert(detail::count_v > 0, "magic_enum::flags requires enum-flags implementation."); + + return detail::entries_v; +} + +// Obtains enum-flags value from integer value. +// Returns optional with enum-flags value. +template +[[nodiscard]] constexpr auto enum_cast(underlying_type_t value) noexcept -> detail::enable_if_enum_t>> { + using D = std::decay_t; + using U = underlying_type_t; + + if constexpr (detail::is_sparse_v) { + auto check_value = U{0}; + for (std::size_t i = 0; i < detail::count_v; ++i) { + if (const auto v = static_cast(enum_value(i)); (value & v) != 0) { + check_value |= v; + } + } + + if (check_value != 0 && check_value == value) { + return static_cast(value); + } + } else { + constexpr auto min = detail::min_v; + constexpr auto max = detail::value_ors(); + + if (value >= min && value <= max) { + return static_cast(value); + } + } + + return {}; // Invalid value or out of range. +} + +// Obtains enum-flags value from name. +// Returns optional with enum-flags value. 
+template +[[nodiscard]] constexpr auto enum_cast(string_view value, BinaryPredicate p) noexcept(std::is_nothrow_invocable_r_v) -> detail::enable_if_enum_t>> { + static_assert(std::is_invocable_r_v, "magic_enum::flags::enum_cast requires bool(char, char) invocable predicate."); + using D = std::decay_t; + using U = underlying_type_t; + + auto result = U{0}; + while (!value.empty()) { + const auto d = detail::find(value, '|'); + const auto s = (d == string_view::npos) ? value : value.substr(0, d); + auto f = U{0}; + for (std::size_t i = 0; i < detail::count_v; ++i) { + if (detail::cmp_equal(s, detail::names_v[i], p)) { + f = static_cast(enum_value(i)); + result |= f; + break; + } + } + if (f == U{0}) { + return {}; // Invalid value or out of range. + } + value.remove_prefix((d == string_view::npos) ? value.size() : d + 1); + } + + if (result == U{0}) { + return {}; // Invalid value or out of range. + } else { + return static_cast(result); + } +} + +// Obtains enum-flags value from name. +// Returns optional with enum-flags value. +template +[[nodiscard]] constexpr auto enum_cast(string_view value) noexcept -> detail::enable_if_enum_t>> { + using D = std::decay_t; + + return enum_cast(value, detail::char_equal_to{}); +} + +// Returns integer value from enum value. +using magic_enum::enum_integer; + +// Obtains index in enum-flags values from enum-flags value. +// Returns optional with index. +template +[[nodiscard]] constexpr auto enum_index(E value) noexcept -> detail::enable_if_enum_t> { + using D = std::decay_t; + using U = underlying_type_t; + + if (detail::is_pow2(static_cast(value))) { + for (std::size_t i = 0; i < detail::count_v; ++i) { + if (enum_value(i) == value) { + return i; + } + } + } + + return {}; // Invalid value or out of range. +} + +// Checks whether enum-flags contains enumerator with such enum-flags value. +template +[[nodiscard]] constexpr auto enum_contains(E value) noexcept -> detail::enable_if_enum_t { + using D = std::decay_t; + using U = underlying_type_t; + + return enum_cast(static_cast(value)).has_value(); +} + +// Checks whether enum-flags contains enumerator with such integer value. +template +[[nodiscard]] constexpr auto enum_contains(underlying_type_t value) noexcept -> detail::enable_if_enum_t { + using D = std::decay_t; + + return enum_cast(value).has_value(); +} + +// Checks whether enum-flags contains enumerator with such name. +template +[[nodiscard]] constexpr auto enum_contains(string_view value, BinaryPredicate p) noexcept(std::is_nothrow_invocable_r_v) -> detail::enable_if_enum_t { + static_assert(std::is_invocable_r_v, "magic_enum::flags::enum_contains requires bool(char, char) invocable predicate."); + using D = std::decay_t; + + return enum_cast(value, std::move_if_noexcept(p)).has_value(); +} + +// Checks whether enum-flags contains enumerator with such name. 
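// Illustrative usage sketch only (not part of the vendored header), showing
// the flags variants defined above on a hypothetical "Perms" flags enum:
//
//   enum class Perms { READ = 1 << 0, WRITE = 1 << 1, EXEC = 1 << 2 };
//
//   using namespace magic_enum::bitwise_operators;
//   auto n = magic_enum::flags::enum_name(Perms::READ | Perms::WRITE); // "READ|WRITE"
//   auto v = magic_enum::flags::enum_cast<Perms>("READ|EXEC");         // optional<Perms>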
+template +[[nodiscard]] constexpr auto enum_contains(string_view value) noexcept -> detail::enable_if_enum_t { + using D = std::decay_t; + + return enum_cast(value).has_value(); +} + +} // namespace magic_enum::flags + +namespace flags::ostream_operators { + +template , int> = 0> +std::basic_ostream& operator<<(std::basic_ostream& os, E value) { + using D = std::decay_t; + using U = underlying_type_t; +#if defined(MAGIC_ENUM_SUPPORTED) && MAGIC_ENUM_SUPPORTED + if (const auto name = magic_enum::flags::enum_name(value); !name.empty()) { + for (const auto c : name) { + os.put(c); + } + return os; + } +#endif + return (os << static_cast(value)); +} + +template , int> = 0> +std::basic_ostream& operator<<(std::basic_ostream& os, optional value) { + return value.has_value() ? (os << value.value()) : os; +} + +} // namespace magic_enum::flags::ostream_operators + +namespace flags::bitwise_operators { + +using namespace magic_enum::bitwise_operators; + +} // namespace magic_enum::flags::bitwise_operators + +} // namespace magic_enum + +#if defined(__clang__) +# pragma clang diagnostic pop +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#elif defined(_MSC_VER) +# pragma warning(pop) +#endif + +#endif // NEARGYE_MAGIC_ENUM_HPP diff --git a/lib/aurora.cpp b/lib/aurora.cpp new file mode 100644 index 0000000..37113fb --- /dev/null +++ b/lib/aurora.cpp @@ -0,0 +1,210 @@ +#include + +#include "gfx/common.hpp" +#include "imgui.hpp" +#include "internal.hpp" +#include "webgpu/gpu.hpp" +#include "window.hpp" + +#include +#include + +namespace aurora { +static Module Log("aurora"); + +AuroraConfig g_config; + +// GPU +using webgpu::g_device; +using webgpu::g_queue; +using webgpu::g_swapChain; + +constexpr std::array PreferredBackendOrder{ +#ifdef DAWN_ENABLE_BACKEND_D3D12 +// BACKEND_D3D12, +#endif +#ifdef DAWN_ENABLE_BACKEND_METAL + BACKEND_METAL, +#endif +#ifdef DAWN_ENABLE_BACKEND_VULKAN + BACKEND_VULKAN, +#endif +#ifdef DAWN_ENABLE_BACKEND_DESKTOP_GL + BACKEND_OPENGL, +#endif +#ifdef DAWN_ENABLE_BACKEND_OPENGLES + BACKEND_OPENGLES, +#endif +#ifdef DAWN_ENABLE_BACKEND_NULL + BACKEND_NULL, +#endif +}; + +static bool g_initialFrame = false; + +static AuroraInfo initialize(int argc, char* argv[], const AuroraConfig& config) noexcept { + g_config = config; + if (g_config.appName == nullptr) { + g_config.appName = "Aurora"; + } + if (g_config.configPath == nullptr) { + g_config.configPath = SDL_GetPrefPath(nullptr, g_config.appName); + } + if (g_config.msaa == 0) { + g_config.msaa = 1; + } + if (g_config.maxTextureAnisotropy == 0) { + g_config.maxTextureAnisotropy = 16; + } + window::initialize(); + + /* Attempt to create a window using the calling application's desired backend */ + AuroraBackend selectedBackend = config.desiredBackend; + bool windowCreated = false; + if (selectedBackend != BACKEND_AUTO && window::create_window(selectedBackend)) { + if (webgpu::initialize(selectedBackend)) { + windowCreated = true; + } else { + window::destroy_window(); + } + } + + if (!windowCreated) { + for (const auto backendType : PreferredBackendOrder) { + selectedBackend = backendType; + if (!window::create_window(selectedBackend)) { + continue; + } + if (webgpu::initialize(selectedBackend)) { + windowCreated = true; + break; + } else { + window::destroy_window(); + } + } + } + + if (!windowCreated) { + Log.report(LOG_FATAL, FMT_STRING("Error creating window: {}"), SDL_GetError()); + unreachable(); + } + + // Initialize SDL_Renderer for ImGui when we can't use a Dawn backend + if (webgpu::g_backendType == 
WGPUBackendType_Null) { + if (!window::create_renderer()) { + Log.report(LOG_FATAL, FMT_STRING("Failed to initialize SDL renderer: {}"), SDL_GetError()); + unreachable(); + } + } + + window::show_window(); + gfx::initialize(); + + imgui::create_context(); + const auto size = window::get_window_size(); + Log.report(LOG_INFO, FMT_STRING("Using framebuffer size {}x{} scale {}"), size.fb_width, size.fb_height, size.scale); + if (g_config.imGuiInitCallback != nullptr) { + g_config.imGuiInitCallback(&size); + } + imgui::initialize(); + + if (aurora_begin_frame()) { + g_initialFrame = true; + } + return { + .backend = selectedBackend, + .configPath = g_config.configPath, + .windowSize = size, + }; +} + +static WGPUTextureView g_currentView = nullptr; + +static void shutdown() noexcept { + if (g_currentView != nullptr) { + wgpuTextureViewRelease(g_currentView); + g_currentView = nullptr; + } + imgui::shutdown(); + gfx::shutdown(); + webgpu::shutdown(); + window::shutdown(); +} + +static const AuroraEvent* update() noexcept { + if (g_initialFrame) { + aurora_end_frame(); + g_initialFrame = false; + } + const auto* events = window::poll_events(); + imgui::new_frame(window::get_window_size()); + return events; +} + +static bool begin_frame() noexcept { + g_currentView = wgpuSwapChainGetCurrentTextureView(g_swapChain); + if (!g_currentView) { + ImGui::EndFrame(); + // Force swapchain recreation + const auto size = window::get_window_size(); + webgpu::resize_swapchain(size.fb_width, size.fb_height, true); + return false; + } + gfx::begin_frame(); + return true; +} + +static void end_frame() noexcept { + const auto encoderDescriptor = WGPUCommandEncoderDescriptor{ + .label = "Redraw encoder", + }; + auto encoder = wgpuDeviceCreateCommandEncoder(g_device, &encoderDescriptor); + gfx::end_frame(encoder); + gfx::render(encoder); + { + const std::array attachments{ + WGPURenderPassColorAttachment{ + .view = g_currentView, + .loadOp = WGPULoadOp_Clear, + .storeOp = WGPUStoreOp_Store, + }, + }; + const WGPURenderPassDescriptor renderPassDescriptor{ + .label = "Post render pass", + .colorAttachmentCount = attachments.size(), + .colorAttachments = attachments.data(), + }; + auto pass = wgpuCommandEncoderBeginRenderPass(encoder, &renderPassDescriptor); + // Copy EFB -> XFB (swapchain) + wgpuRenderPassEncoderSetPipeline(pass, webgpu::g_CopyPipeline); + wgpuRenderPassEncoderSetBindGroup(pass, 0, webgpu::g_CopyBindGroup, 0, nullptr); + wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0); + if (!g_initialFrame) { + // Render ImGui + imgui::render(pass); + } + wgpuRenderPassEncoderEnd(pass); + wgpuRenderPassEncoderRelease(pass); + } + const WGPUCommandBufferDescriptor cmdBufDescriptor{.label = "Redraw command buffer"}; + const auto buffer = wgpuCommandEncoderFinish(encoder, &cmdBufDescriptor); + wgpuQueueSubmit(g_queue, 1, &buffer); + wgpuCommandBufferRelease(buffer); + wgpuCommandEncoderRelease(encoder); + wgpuSwapChainPresent(g_swapChain); + wgpuTextureViewRelease(g_currentView); + g_currentView = nullptr; + if (!g_initialFrame) { + ImGui::EndFrame(); + } +} +} // namespace aurora + +// C API bindings +AuroraInfo aurora_initialize(int argc, char* argv[], const AuroraConfig* config) { + return aurora::initialize(argc, argv, *config); +} +void aurora_shutdown() { aurora::shutdown(); } +const AuroraEvent* aurora_update() { return aurora::update(); } +bool aurora_begin_frame() { return aurora::begin_frame(); } +void aurora_end_frame() { aurora::end_frame(); } diff --git a/lib/dawn/BackendBinding.cpp 
b/lib/dawn/BackendBinding.cpp new file mode 100644 index 0000000..89c9ae4 --- /dev/null +++ b/lib/dawn/BackendBinding.cpp @@ -0,0 +1,127 @@ +#include "BackendBinding.hpp" + +#if defined(DAWN_ENABLE_BACKEND_D3D12) +#include +#endif +#if defined(DAWN_ENABLE_BACKEND_METAL) +#include +#endif +#if defined(DAWN_ENABLE_BACKEND_VULKAN) +#include +#endif +#if defined(DAWN_ENABLE_BACKEND_OPENGL) +#include +#include +#endif +#if defined(DAWN_ENABLE_BACKEND_NULL) +#include +#endif + +namespace aurora::webgpu::utils { + +#if defined(DAWN_ENABLE_BACKEND_D3D12) +BackendBinding* CreateD3D12Binding(SDL_Window* window, WGPUDevice device); +#endif +#if defined(DAWN_ENABLE_BACKEND_METAL) +BackendBinding* CreateMetalBinding(SDL_Window* window, WGPUDevice device); +#endif +#if defined(DAWN_ENABLE_BACKEND_NULL) +BackendBinding* CreateNullBinding(SDL_Window* window, WGPUDevice device); +#endif +#if defined(DAWN_ENABLE_BACKEND_OPENGL) +BackendBinding* CreateOpenGLBinding(SDL_Window* window, WGPUDevice device); +#endif +#if defined(DAWN_ENABLE_BACKEND_VULKAN) +BackendBinding* CreateVulkanBinding(SDL_Window* window, WGPUDevice device); +#endif + +BackendBinding::BackendBinding(SDL_Window* window, WGPUDevice device) : m_window(window), m_device(device) {} + +bool DiscoverAdapter(dawn::native::Instance* instance, SDL_Window* window, WGPUBackendType type) { + switch (type) { +#if defined(DAWN_ENABLE_BACKEND_D3D12) + case WGPUBackendType_D3D12: { + dawn::native::d3d12::AdapterDiscoveryOptions options; + return instance->DiscoverAdapters(&options); + } +#endif +#if defined(DAWN_ENABLE_BACKEND_METAL) + case WGPUBackendType_Metal: { + dawn::native::metal::AdapterDiscoveryOptions options; + return instance->DiscoverAdapters(&options); + } +#endif +#if defined(DAWN_ENABLE_BACKEND_VULKAN) + case WGPUBackendType_Vulkan: { + dawn::native::vulkan::AdapterDiscoveryOptions options; + return instance->DiscoverAdapters(&options); + } +#endif +#if defined(DAWN_ENABLE_BACKEND_DESKTOP_GL) + case WGPUBackendType_OpenGL: { + SDL_GL_ResetAttributes(); + SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE); + SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4); + SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 4); + SDL_GL_CreateContext(window); + auto getProc = reinterpret_cast(SDL_GL_GetProcAddress); + dawn::native::opengl::AdapterDiscoveryOptions adapterOptions; + adapterOptions.getProc = getProc; + return instance->DiscoverAdapters(&adapterOptions); + } +#endif +#if defined(DAWN_ENABLE_BACKEND_OPENGLES) + case WGPUBackendType_OpenGLES: { + SDL_GL_ResetAttributes(); + SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_ES); + SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 3); + SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 0); + SDL_GL_CreateContext(window); + auto getProc = reinterpret_cast(SDL_GL_GetProcAddress); + dawn::native::opengl::AdapterDiscoveryOptionsES adapterOptions; + adapterOptions.getProc = getProc; + return instance->DiscoverAdapters(&adapterOptions); + } +#endif +#if defined(DAWN_ENABLE_BACKEND_NULL) + case WGPUBackendType_Null: + instance->DiscoverDefaultAdapters(); + return true; +#endif + default: + return false; + } +} + +BackendBinding* CreateBinding(WGPUBackendType type, SDL_Window* window, WGPUDevice device) { + switch (type) { +#if defined(DAWN_ENABLE_BACKEND_D3D12) + case WGPUBackendType_D3D12: + return CreateD3D12Binding(window, device); +#endif +#if defined(DAWN_ENABLE_BACKEND_METAL) + case WGPUBackendType_Metal: + return CreateMetalBinding(window, 
device); +#endif +#if defined(DAWN_ENABLE_BACKEND_NULL) + case WGPUBackendType_Null: + return CreateNullBinding(window, device); +#endif +#if defined(DAWN_ENABLE_BACKEND_DESKTOP_GL) + case WGPUBackendType_OpenGL: + return CreateOpenGLBinding(window, device); +#endif +#if defined(DAWN_ENABLE_BACKEND_OPENGLES) + case WGPUBackendType_OpenGLES: + return CreateOpenGLBinding(window, device); +#endif +#if defined(DAWN_ENABLE_BACKEND_VULKAN) + case WGPUBackendType_Vulkan: + return CreateVulkanBinding(window, device); +#endif + default: + return nullptr; + } +} + +} // namespace aurora::webgpu::utils diff --git a/lib/dawn/BackendBinding.hpp b/lib/dawn/BackendBinding.hpp new file mode 100644 index 0000000..3b638fc --- /dev/null +++ b/lib/dawn/BackendBinding.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include +#include + +struct SDL_Window; + +namespace aurora::webgpu::utils { + +class BackendBinding { +public: + virtual ~BackendBinding() = default; + + virtual uint64_t GetSwapChainImplementation() = 0; + virtual WGPUTextureFormat GetPreferredSwapChainTextureFormat() = 0; + +protected: + BackendBinding(SDL_Window* window, WGPUDevice device); + + SDL_Window* m_window = nullptr; + WGPUDevice m_device = nullptr; +}; + +bool DiscoverAdapter(dawn::native::Instance* instance, SDL_Window* window, WGPUBackendType type); +BackendBinding* CreateBinding(WGPUBackendType type, SDL_Window* window, WGPUDevice device); + +} // namespace aurora::webgpu::utils diff --git a/lib/dawn/D3D12Binding.cpp b/lib/dawn/D3D12Binding.cpp new file mode 100644 index 0000000..dbd2af4 --- /dev/null +++ b/lib/dawn/D3D12Binding.cpp @@ -0,0 +1,37 @@ +#include "BackendBinding.hpp" + +#include +#include + +namespace aurora::webgpu::utils { +class D3D12Binding : public BackendBinding { +public: + D3D12Binding(SDL_Window* window, WGPUDevice device) : BackendBinding(window, device) {} + + uint64_t GetSwapChainImplementation() override { + if (m_swapChainImpl.userData == nullptr) { + CreateSwapChainImpl(); + } + return reinterpret_cast(&m_swapChainImpl); + } + + WGPUTextureFormat GetPreferredSwapChainTextureFormat() override { + if (m_swapChainImpl.userData == nullptr) { + CreateSwapChainImpl(); + } + return dawn::native::d3d12::GetNativeSwapChainPreferredFormat(&m_swapChainImpl); + } + +private: + DawnSwapChainImplementation m_swapChainImpl{}; + + void CreateSwapChainImpl() { + SDL_SysWMinfo wmInfo; + SDL_VERSION(&wmInfo.version); + SDL_GetWindowWMInfo(m_window, &wmInfo); + m_swapChainImpl = dawn::native::d3d12::CreateNativeSwapChainImpl(m_device, wmInfo.info.win.window); + } +}; + +BackendBinding* CreateD3D12Binding(SDL_Window* window, WGPUDevice device) { return new D3D12Binding(window, device); } +} // namespace aurora::webgpu::utils diff --git a/lib/dawn/MetalBinding.mm b/lib/dawn/MetalBinding.mm new file mode 100644 index 0000000..151ed91 --- /dev/null +++ b/lib/dawn/MetalBinding.mm @@ -0,0 +1,108 @@ +#include "BackendBinding.hpp" + +#include +#include + +#import + +template DawnSwapChainImplementation CreateSwapChainImplementation(T *swapChain) { + DawnSwapChainImplementation impl = {}; + impl.userData = swapChain; + impl.Init = [](void *userData, void *wsiContext) { + auto *ctx = static_cast(wsiContext); + reinterpret_cast(userData)->Init(ctx); + }; + impl.Destroy = [](void *userData) { delete reinterpret_cast(userData); }; + impl.Configure = [](void *userData, WGPUTextureFormat format, WGPUTextureUsage allowedUsage, uint32_t width, + uint32_t height) { + return static_cast(userData)->Configure(format, allowedUsage, width, height); + }; + 
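  // Annotation (not part of the original commit): DawnSwapChainImplementation is a plain C struct
  // of function pointers plus a userData pointer, so the capture-less lambdas assigned above and
  // below decay to ordinary function pointers. Each one recovers the C++ swap-chain object from
  // userData and forwards the call, which is how the Objective-C++ implementation further down
  // plugs into Dawn's C-style swap chain interface.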
impl.GetNextTexture = [](void *userData, DawnSwapChainNextTexture *nextTexture) { + return static_cast(userData)->GetNextTexture(nextTexture); + }; + impl.Present = [](void *userData) { return static_cast(userData)->Present(); }; + return impl; +} + +namespace aurora::webgpu::utils { +class SwapChainImplMTL { +public: + using WSIContext = DawnWSIContextMetal; + + explicit SwapChainImplMTL(SDL_Window *window) : m_view(SDL_Metal_CreateView(window)) {} + + ~SwapChainImplMTL() { SDL_Metal_DestroyView(m_view); } + + void Init(DawnWSIContextMetal *ctx) { + mMtlDevice = ctx->device; + mCommandQueue = ctx->queue; + } + + DawnSwapChainError Configure(WGPUTextureFormat format, WGPUTextureUsage usage, uint32_t width, uint32_t height) { + if (format != WGPUTextureFormat_BGRA8Unorm) { + return "unsupported format"; + } + assert(width > 0); + assert(height > 0); + + CGSize size = {}; + size.width = width; + size.height = height; + + mLayer = (__bridge CAMetalLayer *)(SDL_Metal_GetLayer(m_view)); + [mLayer setDevice:mMtlDevice]; + [mLayer setPixelFormat:MTLPixelFormatBGRA8Unorm]; + [mLayer setDrawableSize:size]; + + constexpr uint32_t kFramebufferOnlyTextureUsages = WGPUTextureUsage_RenderAttachment | WGPUTextureUsage_Present; + bool hasOnlyFramebufferUsages = (usage & (~kFramebufferOnlyTextureUsages)) == 0u; + if (hasOnlyFramebufferUsages) { + [mLayer setFramebufferOnly:YES]; + } + + return DAWN_SWAP_CHAIN_NO_ERROR; + } + + DawnSwapChainError GetNextTexture(DawnSwapChainNextTexture *nextTexture) { + mCurrentDrawable = [mLayer nextDrawable]; + mCurrentTexture = mCurrentDrawable.texture; + nextTexture->texture.ptr = (__bridge void *)(mCurrentTexture); + return DAWN_SWAP_CHAIN_NO_ERROR; + } + + DawnSwapChainError Present() { + id commandBuffer = [mCommandQueue commandBuffer]; + [commandBuffer presentDrawable:mCurrentDrawable]; + [commandBuffer commit]; + return DAWN_SWAP_CHAIN_NO_ERROR; + } + +private: + SDL_MetalView m_view = nil; + id mMtlDevice = nil; + id mCommandQueue = nil; + + CAMetalLayer *mLayer = nullptr; + id mCurrentDrawable = nil; + id mCurrentTexture = nil; +}; + +class MetalBinding : public BackendBinding { +public: + MetalBinding(SDL_Window *window, WGPUDevice device) : BackendBinding(window, device) {} + + uint64_t GetSwapChainImplementation() override { + if (m_swapChainImpl.userData == nullptr) { + m_swapChainImpl = CreateSwapChainImplementation(new SwapChainImplMTL(m_window)); + } + return reinterpret_cast(&m_swapChainImpl); + } + + WGPUTextureFormat GetPreferredSwapChainTextureFormat() override { return WGPUTextureFormat_BGRA8Unorm; } + +private: + DawnSwapChainImplementation m_swapChainImpl{}; +}; + +BackendBinding *CreateMetalBinding(SDL_Window *window, WGPUDevice device) { return new MetalBinding(window, device); } +} // namespace aurora::webgpu::utils diff --git a/lib/dawn/NullBinding.cpp b/lib/dawn/NullBinding.cpp new file mode 100644 index 0000000..343d233 --- /dev/null +++ b/lib/dawn/NullBinding.cpp @@ -0,0 +1,26 @@ +#include "BackendBinding.hpp" + +#include + +namespace aurora::webgpu::utils { +class NullBinding : public BackendBinding { +public: + NullBinding(SDL_Window* window, WGPUDevice device) : BackendBinding(window, device) {} + + uint64_t GetSwapChainImplementation() override { + if (m_swapChainImpl.userData == nullptr) { + m_swapChainImpl = dawn::native::null::CreateNativeSwapChainImpl(); + } + return reinterpret_cast(&m_swapChainImpl); + } + + WGPUTextureFormat GetPreferredSwapChainTextureFormat() override { + return WGPUTextureFormat_RGBA8Unorm; + } + +private: + 
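  // Annotation (not part of the original commit): the backing swap chain implementation is
  // created lazily on the first GetSwapChainImplementation() call and reused afterwards.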
DawnSwapChainImplementation m_swapChainImpl{}; +}; + +BackendBinding* CreateNullBinding(SDL_Window* window, WGPUDevice device) { return new NullBinding(window, device); } +} // namespace aurora::webgpu::utils diff --git a/lib/dawn/OpenGLBinding.cpp b/lib/dawn/OpenGLBinding.cpp new file mode 100644 index 0000000..5e0e7ed --- /dev/null +++ b/lib/dawn/OpenGLBinding.cpp @@ -0,0 +1,35 @@ +#include "BackendBinding.hpp" + +#include +#include + +namespace aurora::webgpu::utils { +class OpenGLBinding : public BackendBinding { +public: + OpenGLBinding(SDL_Window* window, WGPUDevice device) : BackendBinding(window, device) {} + + uint64_t GetSwapChainImplementation() override { + if (m_swapChainImpl.userData == nullptr) { + CreateSwapChainImpl(); + } + return reinterpret_cast(&m_swapChainImpl); + } + + WGPUTextureFormat GetPreferredSwapChainTextureFormat() override { + if (m_swapChainImpl.userData == nullptr) { + CreateSwapChainImpl(); + } + return dawn::native::opengl::GetNativeSwapChainPreferredFormat(&m_swapChainImpl); + } + +private: + DawnSwapChainImplementation m_swapChainImpl{}; + + void CreateSwapChainImpl() { + m_swapChainImpl = dawn::native::opengl::CreateNativeSwapChainImpl( + m_device, [](void* userdata) { SDL_GL_SwapWindow(static_cast(userdata)); }, m_window); + } +}; + +BackendBinding* CreateOpenGLBinding(SDL_Window* window, WGPUDevice device) { return new OpenGLBinding(window, device); } +} // namespace aurora::webgpu::utils diff --git a/lib/dawn/VulkanBinding.cpp b/lib/dawn/VulkanBinding.cpp new file mode 100644 index 0000000..8e37d5f --- /dev/null +++ b/lib/dawn/VulkanBinding.cpp @@ -0,0 +1,42 @@ +#include "BackendBinding.hpp" + +#include "../internal.hpp" + +#include +#include + +namespace aurora::webgpu::utils { +static Module Log("aurora::webgpu::utils::VulkanBinding"); + +class VulkanBinding : public BackendBinding { +public: + VulkanBinding(SDL_Window* window, WGPUDevice device) : BackendBinding(window, device) {} + + uint64_t GetSwapChainImplementation() override { + if (m_swapChainImpl.userData == nullptr) { + CreateSwapChainImpl(); + } + return reinterpret_cast(&m_swapChainImpl); + } + + WGPUTextureFormat GetPreferredSwapChainTextureFormat() override { + if (m_swapChainImpl.userData == nullptr) { + CreateSwapChainImpl(); + } + return dawn::native::vulkan::GetNativeSwapChainPreferredFormat(&m_swapChainImpl); + } + +private: + DawnSwapChainImplementation m_swapChainImpl{}; + + void CreateSwapChainImpl() { + VkSurfaceKHR surface = VK_NULL_HANDLE; + if (SDL_Vulkan_CreateSurface(m_window, dawn::native::vulkan::GetInstance(m_device), &surface) != SDL_TRUE) { + Log.report(LOG_FATAL, FMT_STRING("Failed to create Vulkan surface: {}"), SDL_GetError()); + } + m_swapChainImpl = dawn::native::vulkan::CreateNativeSwapChainImpl(m_device, surface); + } +}; + +BackendBinding* CreateVulkanBinding(SDL_Window* window, WGPUDevice device) { return new VulkanBinding(window, device); } +} // namespace aurora::webgpu::utils diff --git a/lib/dolphin/GXBump.cpp b/lib/dolphin/GXBump.cpp new file mode 100644 index 0000000..d54c0ad --- /dev/null +++ b/lib/dolphin/GXBump.cpp @@ -0,0 +1,63 @@ +#include "gx.hpp" + +void GXSetNumIndStages(u8 num) { update_gx_state(g_gxState.numIndStages, num); } + +void GXSetIndTexOrder(GXIndTexStageID indStage, GXTexCoordID texCoord, GXTexMapID texMap) { + auto& stage = g_gxState.indStages[indStage]; + update_gx_state(stage.texCoordId, texCoord); + update_gx_state(stage.texMapId, texMap); +} + +void GXSetIndTexCoordScale(GXIndTexStageID indStage, GXIndTexScale scaleS, 
GXIndTexScale scaleT) { + auto& stage = g_gxState.indStages[indStage]; + update_gx_state(stage.scaleS, scaleS); + update_gx_state(stage.scaleT, scaleT); +} + +void GXSetIndTexMtx(GXIndTexMtxID id, const void* offset, s8 scaleExp) { + if (id < GX_ITM_0 || id > GX_ITM_2) { + Log.report(LOG_FATAL, FMT_STRING("invalid ind tex mtx ID {}"), id); + } + update_gx_state(g_gxState.indTexMtxs[id - 1], {*reinterpret_cast*>(offset), scaleExp}); +} + +void GXSetTevIndirect(GXTevStageID tevStage, GXIndTexStageID indStage, GXIndTexFormat fmt, GXIndTexBiasSel biasSel, + GXIndTexMtxID matrixSel, GXIndTexWrap wrapS, GXIndTexWrap wrapT, GXBool addPrev, GXBool indLod, + GXIndTexAlphaSel alphaSel) { + auto& stage = g_gxState.tevStages[tevStage]; + update_gx_state(stage.indTexStage, indStage); + update_gx_state(stage.indTexFormat, fmt); + update_gx_state(stage.indTexBiasSel, biasSel); + update_gx_state(stage.indTexAlphaSel, alphaSel); + update_gx_state(stage.indTexMtxId, matrixSel); + update_gx_state(stage.indTexWrapS, wrapS); + update_gx_state(stage.indTexWrapT, wrapT); + update_gx_state(stage.indTexAddPrev, addPrev); + update_gx_state(stage.indTexUseOrigLOD, indLod); +} + +void GXSetTevDirect(GXTevStageID stageId) { + auto& stage = g_gxState.tevStages[stageId]; + // TODO is this right? + update_gx_state(stage.indTexStage, GX_INDTEXSTAGE0); + update_gx_state(stage.indTexFormat, GX_ITF_8); + update_gx_state(stage.indTexBiasSel, GX_ITB_NONE); + update_gx_state(stage.indTexAlphaSel, GX_ITBA_OFF); + update_gx_state(stage.indTexMtxId, GX_ITM_OFF); + update_gx_state(stage.indTexWrapS, GX_ITW_OFF); + update_gx_state(stage.indTexWrapT, GX_ITW_OFF); + update_gx_state(stage.indTexUseOrigLOD, false); + update_gx_state(stage.indTexAddPrev, false); +} + +void GXSetTevIndWarp(GXTevStageID tevStage, GXIndTexStageID indStage, GXBool signedOffsets, GXBool replaceMode, + GXIndTexMtxID matrixSel) { + const auto wrap = replaceMode ? GX_ITW_0 : GX_ITW_OFF; + const auto biasSel = signedOffsets ? 
GX_ITB_STU : GX_ITB_NONE; + GXSetTevIndirect(tevStage, indStage, GX_ITF_8, biasSel, matrixSel, wrap, wrap, false, false, GX_ITBA_OFF); +} + +// TODO GXSetTevIndTile +// TODO GXSetTevIndBumpST +// TODO GXSetTevIndBumpXYZ +// TODO GXSetTevIndRepeat diff --git a/lib/dolphin/GXCull.cpp b/lib/dolphin/GXCull.cpp new file mode 100644 index 0000000..7dfae7b --- /dev/null +++ b/lib/dolphin/GXCull.cpp @@ -0,0 +1,7 @@ +#include "gx.hpp" + +void GXSetScissor(u32 left, u32 top, u32 width, u32 height) { aurora::gfx::set_scissor(left, top, width, height); } + +void GXSetCullMode(GXCullMode mode) { update_gx_state(g_gxState.cullMode, mode); } + +// TODO GXSetCoPlanar diff --git a/lib/dolphin/GXDispList.cpp b/lib/dolphin/GXDispList.cpp new file mode 100644 index 0000000..a991aca --- /dev/null +++ b/lib/dolphin/GXDispList.cpp @@ -0,0 +1,23 @@ +#include "gx.hpp" + +#include "../gfx/model/shader.hpp" + +void GXBeginDisplayList(void* list, u32 size) { + // TODO +} + +u32 GXEndDisplayList() { + // TODO + return 0; +} + +void GXCallDisplayList(const void* data, u32 nbytes) { + // TODO CElementGen needs fixing + for (const auto& type : aurora::gfx::gx::g_gxState.vtxDesc) { + if (type == GX_DIRECT) { + Log.report(LOG_WARNING, FMT_STRING("Direct attributes in surface config!")); + return; + } + } + aurora::gfx::model::queue_surface(static_cast(data), nbytes); +} diff --git a/lib/dolphin/GXDraw.cpp b/lib/dolphin/GXDraw.cpp new file mode 100644 index 0000000..cd706b2 --- /dev/null +++ b/lib/dolphin/GXDraw.cpp @@ -0,0 +1,13 @@ +#include "gx.hpp" + +// TODO GXDrawCylinder +// TODO GXDrawTorus + +void GXDrawSphere(u8 numMajor, u8 numMinor) { puts("GXDrawSphere is a stub"); } + +// TODO GXDrawCube +// TODO GXDrawDodeca +// TODO GXDrawOctahedron +// TODO GXDrawIcosahedron +// TODO GXDrawSphere1 +// TODO GXGenNormalTable diff --git a/lib/dolphin/GXExtra.cpp b/lib/dolphin/GXExtra.cpp new file mode 100644 index 0000000..22ba9ac --- /dev/null +++ b/lib/dolphin/GXExtra.cpp @@ -0,0 +1,6 @@ +#include "gx.hpp" + +void GXDestroyTexObj(GXTexObj* obj_) { + auto* obj = reinterpret_cast(obj_); + obj->ref.reset(); +} diff --git a/lib/dolphin/GXFifo.cpp b/lib/dolphin/GXFifo.cpp new file mode 100644 index 0000000..a969ac0 --- /dev/null +++ b/lib/dolphin/GXFifo.cpp @@ -0,0 +1,47 @@ +#include "gx.hpp" + +static GXFifoObj* GPFifo; +static GXFifoObj* CPUFifo; + +void GXGetGPStatus(GXBool* overhi, GXBool* underlow, GXBool* readIdle, GXBool* cmdIdle, GXBool* brkpt) { + *overhi = *underlow = *readIdle = *cmdIdle = *brkpt = false; + *readIdle = true; +} + +// TODO GXGetFifoStatus + +void GXGetFifoPtrs(GXFifoObj* fifo, void** readPtr, void** writePtr) { + *readPtr = NULL; + *writePtr = NULL; +} + +GXFifoObj* GXGetCPUFifo() { return CPUFifo; } + +GXFifoObj* GXGetGPFifo() { return GPFifo; } + +// TODO GXGetFifoBase +// TODO GXGetFifoSize +// TODO GXGetFifoLimits +// TODO GXSetBreakPtCallback +// TODO GXEnableBreakPt +// TODO GXDisableBreakPt + +void GXInitFifoBase(GXFifoObj* fifo, void* base, u32 size) {} + +void GXInitFifoPtrs(GXFifoObj* fifo, void* readPtr, void* writePtr) {} + +// TODO GXInitFifoLimits + +void GXSetCPUFifo(GXFifoObj* fifo) { CPUFifo = fifo; } + +void GXSetGPFifo(GXFifoObj* fifo) { GPFifo = fifo; } + +void GXSaveCPUFifo(GXFifoObj* fifo) {} + +// TODO GXSaveGPFifo +// TODO GXRedirectWriteGatherPipe +// TODO GXRestoreWriteGatherPipe +// TODO GXSetCurrentGXThread +// TODO GXGetCurrentGXThread +// TODO GXGetOverflowCount +// TODO GXResetOverflowCount diff --git a/lib/dolphin/GXFrameBuffer.cpp b/lib/dolphin/GXFrameBuffer.cpp new file 
mode 100644 index 0000000..dfb7aa5 --- /dev/null +++ b/lib/dolphin/GXFrameBuffer.cpp @@ -0,0 +1,55 @@ +#include "gx.hpp" + +#include "../window.hpp" + +extern "C" { +GXRenderModeObj GXNtsc480IntDf = { + VI_TVMODE_NTSC_INT, 640, 480, 480, 40, 0, 640, 480, VI_XFBMODE_DF, 0, 0, +}; +GXRenderModeObj GXPal528IntDf = { + VI_TVMODE_PAL_INT, 704, 528, 480, 40, 0, 640, 480, VI_XFBMODE_DF, 0, 0, +}; +GXRenderModeObj GXMpal480IntDf = { + VI_TVMODE_PAL_INT, 640, 480, 480, 40, 0, 640, 480, VI_XFBMODE_DF, 0, 0, +}; +} + +void GXAdjustForOverscan(GXRenderModeObj* rmin, GXRenderModeObj* rmout, u16 hor, u16 ver) { + *rmout = *rmin; + const auto size = aurora::window::get_window_size(); + rmout->fbWidth = size.fb_width; + rmout->efbHeight = size.fb_height; + rmout->xfbHeight = size.fb_height; +} + +void GXSetDispCopySrc(u16 left, u16 top, u16 wd, u16 ht) {} + +void GXSetTexCopySrc(u16 left, u16 top, u16 wd, u16 ht) { + // TODO +} + +void GXSetDispCopyDst(u16 wd, u16 ht) {} + +void GXSetTexCopyDst(u16 wd, u16 ht, GXTexFmt fmt, GXBool mipmap) { + // TODO +} + +// TODO GXSetDispCopyFrame2Field +// TODO GXSetCopyClamp + +u32 GXSetDispCopyYScale(f32 vscale) { return 0; } + +void GXSetCopyClear(GXColor color, u32 depth) { update_gx_state(g_gxState.clearColor, from_gx_color(color)); } + +void GXSetCopyFilter(GXBool aa, u8 sample_pattern[12][2], GXBool vf, u8 vfilter[7]) {} + +void GXSetDispCopyGamma(GXGamma gamma) {} + +void GXCopyDisp(void* dest, GXBool clear) {} + +// TODO move GXCopyTex here + +// TODO GXGetYScaleFactor +// TODO GXGetNumXfbLines +// TODO GXClearBoundingBox +// TODO GXReadBoundingBox diff --git a/lib/dolphin/GXGeometry.cpp b/lib/dolphin/GXGeometry.cpp new file mode 100644 index 0000000..f01b1e0 --- /dev/null +++ b/lib/dolphin/GXGeometry.cpp @@ -0,0 +1,62 @@ +#include "gx.hpp" + +#include + +void GXSetVtxDesc(GXAttr attr, GXAttrType type) { update_gx_state(g_gxState.vtxDesc[attr], type); } + +void GXSetVtxDescv(GXVtxDescList* list) { + g_gxState.vtxDesc.fill({}); + while (list->attr != GX_VA_NULL) { + update_gx_state(g_gxState.vtxDesc[list->attr], list->type); + ++list; + } +} + +void GXClearVtxDesc() { g_gxState.vtxDesc.fill({}); } + +void GXSetVtxAttrFmt(GXVtxFmt vtxfmt, GXAttr attr, GXCompCnt cnt, GXCompType type, u8 frac) { + if (vtxfmt < GX_VTXFMT0 || vtxfmt >= GX_MAX_VTXFMT) { + Log.report(LOG_FATAL, FMT_STRING("invalid vtxfmt {}"), vtxfmt); + unreachable(); + } + if (attr < GX_VA_PNMTXIDX || attr >= GX_VA_MAX_ATTR) { + Log.report(LOG_FATAL, FMT_STRING("invalid attr {}"), attr); + unreachable(); + } + auto& fmt = g_gxState.vtxFmts[vtxfmt].attrs[attr]; + update_gx_state(fmt.cnt, cnt); + update_gx_state(fmt.type, type); + update_gx_state(fmt.frac, frac); +} + +// TODO GXSetVtxAttrFmtv + +void GXSetArray(GXAttr attr, const void* data, u32 size, u8 stride) { + auto& array = g_gxState.arrays[attr]; + array.data = data; + array.size = size; + array.stride = stride; + array.cachedRange = {}; +} + +// TODO move GXBegin, GXEnd here + +void GXSetTexCoordGen2(GXTexCoordID dst, GXTexGenType type, GXTexGenSrc src, u32 mtx, GXBool normalize, u32 postMtx) { + if (dst < GX_TEXCOORD0 || dst > GX_TEXCOORD7) { + Log.report(LOG_FATAL, FMT_STRING("invalid tex coord {}"), dst); + unreachable(); + } + update_gx_state(g_gxState.tcgs[dst], + {type, src, static_cast(mtx), static_cast(postMtx), normalize}); +} + +void GXSetNumTexGens(u8 num) { update_gx_state(g_gxState.numTexGens, num); } + +// TODO GXInvalidateVtxCache + +void GXSetLineWidth(u8 width, GXTexOffset offs) { + // TODO +} + +// TODO GXSetPointSize +// 
TODO GXEnableTexOffsets diff --git a/lib/dolphin/GXGet.cpp b/lib/dolphin/GXGet.cpp new file mode 100644 index 0000000..955039c --- /dev/null +++ b/lib/dolphin/GXGet.cpp @@ -0,0 +1,106 @@ +#include "gx.hpp" + +#include "../gfx/texture.hpp" + +// TODO GXGetVtxDesc +// TODO GXGetVtxDescv +// TODO GXGetVtxAttrFmtv +// TODO GXGetLineWidth +// TODO GXGetPointSize + +void GXGetVtxAttrFmt(GXVtxFmt idx, GXAttr attr, GXCompCnt* compCnt, GXCompType* compType, u8* shift) { + const auto& fmt = g_gxState.vtxFmts[idx].attrs[attr]; + *compCnt = fmt.cnt; + *compType = fmt.type; + *shift = fmt.frac; +} + +// TODO GXGetViewportv + +void GXGetProjectionv(f32* p) { + const auto& mtx = g_gxState.origProj; + p[0] = static_cast(g_gxState.projType); + p[1] = mtx.m0[0]; + p[3] = mtx.m1[1]; + p[5] = mtx.m2[2]; + p[6] = mtx.m2[3]; + if (g_gxState.projType == GX_ORTHOGRAPHIC) { + p[2] = mtx.m0[3]; + p[4] = mtx.m1[3]; + } else { + p[2] = mtx.m0[2]; + p[4] = mtx.m1[2]; + } +} + +// TODO GXGetScissor +// TODO GXGetCullMode + +void GXGetLightAttnA(GXLightObj* light_, float* a0, float* a1, float* a2) { + auto* light = reinterpret_cast(light_); + *a0 = light->a0; + *a1 = light->a1; + *a2 = light->a2; +} + +void GXGetLightAttnK(GXLightObj* light_, float* k0, float* k1, float* k2) { + auto* light = reinterpret_cast(light_); + *k0 = light->k0; + *k1 = light->k1; + *k2 = light->k2; +} + +void GXGetLightPos(GXLightObj* light_, float* x, float* y, float* z) { + auto* light = reinterpret_cast(light_); + *x = light->px; + *z = light->py; + *z = light->pz; +} + +void GXGetLightDir(GXLightObj* light_, float* nx, float* ny, float* nz) { + auto* light = reinterpret_cast(light_); + *nx = -light->nx; + *ny = -light->ny; + *nz = -light->nz; +} + +void GXGetLightColor(GXLightObj* light_, GXColor* col) { + auto* light = reinterpret_cast(light_); + *col = light->color; +} + +void* GXGetTexObjData(GXTexObj* tex_obj) { + return const_cast(reinterpret_cast(tex_obj)->data); +} + +u16 GXGetTexObjWidth(GXTexObj* tex_obj) { return reinterpret_cast(tex_obj)->width; } + +u16 GXGetTexObjHeight(GXTexObj* tex_obj) { return reinterpret_cast(tex_obj)->height; } + +GXTexFmt GXGetTexObjFmt(GXTexObj* tex_obj) { + return static_cast(reinterpret_cast(tex_obj)->fmt); +} + +GXTexWrapMode GXGetTexObjWrapS(GXTexObj* tex_obj) { return reinterpret_cast(tex_obj)->wrapS; } + +GXTexWrapMode GXGetTexObjWrapT(GXTexObj* tex_obj) { return reinterpret_cast(tex_obj)->wrapT; } + +GXBool GXGetTexObjMipMap(GXTexObj* tex_obj) { return reinterpret_cast(tex_obj)->hasMips; } + +// TODO GXGetTexObjAll +// TODO GXGetTexObjMinFilt +// TODO GXGetTexObjMagFilt +// TODO GXGetTexObjMinLOD +// TODO GXGetTexObjMaxLOD +// TODO GXGetTexObjLODBias +// TODO GXGetTexObjBiasClamp +// TODO GXGetTexObjEdgeLOD +// TODO GXGetTexObjMaxAniso +// TODO GXGetTexObjLODAll +// TODO GXGetTexObjTlut +// TODO GXGetTlutObjData +// TODO GXGetTlutObjFmt +// TODO GXGetTlutObjNumEntries +// TODO GXGetTlutObjAll +// TODO GXGetTexRegionAll +// TODO GXGetTlutRegionAll diff --git a/lib/dolphin/GXLighting.cpp b/lib/dolphin/GXLighting.cpp new file mode 100644 index 0000000..1fa2219 --- /dev/null +++ b/lib/dolphin/GXLighting.cpp @@ -0,0 +1,238 @@ +#include "gx.hpp" + +void GXInitLightAttn(GXLightObj* light_, float a0, float a1, float a2, float k0, float k1, float k2) { + auto* light = reinterpret_cast(light_); + light->a0 = a0; + light->a1 = a1; + light->a2 = a2; + light->k0 = k0; + light->k1 = k1; + light->k2 = k2; +} + +void GXInitLightAttnA(GXLightObj* light_, float a0, float a1, float a2) { + auto* light = 
reinterpret_cast(light_); + light->a0 = a0; + light->a1 = a1; + light->a2 = a2; +} + +void GXInitLightAttnK(GXLightObj* light_, float k0, float k1, float k2) { + auto* light = reinterpret_cast(light_); + light->k0 = k0; + light->k1 = k1; + light->k2 = k2; +} + +void GXInitLightSpot(GXLightObj* light_, float cutoff, GXSpotFn spotFn) { + if (cutoff <= 0.f || cutoff > 90.f) { + spotFn = GX_SP_OFF; + } + + float cr = std::cos((cutoff * M_PIF) / 180.f); + float a0 = 1.f; + float a1 = 0.f; + float a2 = 0.f; + switch (spotFn) { + default: + break; + case GX_SP_FLAT: + a0 = -1000.f * cr; + a1 = 1000.f; + a2 = 0.f; + break; + case GX_SP_COS: + a0 = -cr / (1.f - cr); + a1 = 1.f / (1.f - cr); + a2 = 0.f; + break; + case GX_SP_COS2: + a0 = 0.f; + a1 = -cr / (1.f - cr); + a2 = 1.f / (1.f - cr); + break; + case GX_SP_SHARP: { + const float d = (1.f - cr) * (1.f - cr); + a0 = cr * (cr - 2.f); + a1 = 2.f / d; + a2 = -1.f / d; + break; + } + case GX_SP_RING1: { + const float d = (1.f - cr) * (1.f - cr); + a0 = 4.f * cr / d; + a1 = 4.f * (1.f + cr) / d; + a2 = -4.f / d; + break; + } + case GX_SP_RING2: { + const float d = (1.f - cr) * (1.f - cr); + a0 = 1.f - 2.f * cr * cr / d; + a1 = 4.f * cr / d; + a2 = -2.f / d; + break; + } + } + + auto* light = reinterpret_cast(light_); + light->a0 = a0; + light->a1 = a1; + light->a2 = a2; +} + +void GXInitLightDistAttn(GXLightObj* light_, float refDistance, float refBrightness, GXDistAttnFn distFunc) { + if (refDistance < 0.f || refBrightness < 0.f || refBrightness >= 1.f) { + distFunc = GX_DA_OFF; + } + float k0 = 1.f; + float k1 = 0.f; + float k2 = 0.f; + switch (distFunc) { + case GX_DA_GENTLE: + k0 = 1.0f; + k1 = (1.0f - refBrightness) / (refBrightness * refDistance); + k2 = 0.0f; + break; + case GX_DA_MEDIUM: + k0 = 1.0f; + k1 = 0.5f * (1.0f - refBrightness) / (refBrightness * refDistance); + k2 = 0.5f * (1.0f - refBrightness) / (refBrightness * refDistance * refDistance); + break; + case GX_DA_STEEP: + k0 = 1.0f; + k1 = 0.0f; + k2 = (1.0f - refBrightness) / (refBrightness * refDistance * refDistance); + break; + case GX_DA_OFF: + k0 = 1.0f; + k1 = 0.0f; + k2 = 0.0f; + break; + } + + auto* light = reinterpret_cast(light_); + light->k0 = k0; + light->k1 = k1; + light->k2 = k2; +} + +void GXInitLightPos(GXLightObj* light_, float x, float y, float z) { + auto* light = reinterpret_cast(light_); + light->px = x; + light->py = y; + light->pz = z; +} + +void GXInitLightColor(GXLightObj* light_, GXColor col) { + auto* light = reinterpret_cast(light_); + light->color = col; +} + +void GXLoadLightObjImm(GXLightObj* light_, GXLightID id) { + u32 idx = std::log2(id); + aurora::gfx::gx::Light realLight; + auto* light = reinterpret_cast(light_); + realLight.pos = {light->px, light->py, light->pz}; + realLight.dir = {light->nx, light->ny, light->nz}; + realLight.cosAtt = {light->a0, light->a1, light->a2}; + realLight.distAtt = {light->k0, light->k1, light->k2}; + realLight.color = from_gx_color(light->color); + update_gx_state(g_gxState.lights[idx], realLight); +} + +// TODO GXLoadLightObjIndx + +void GXSetChanAmbColor(GXChannelID id, GXColor color) { + if (id == GX_COLOR0A0) { + GXSetChanAmbColor(GX_COLOR0, color); + GXSetChanAmbColor(GX_ALPHA0, color); + return; + } else if (id == GX_COLOR1A1) { + GXSetChanAmbColor(GX_COLOR1, color); + GXSetChanAmbColor(GX_ALPHA1, color); + return; + } + if (id < GX_COLOR0 || id > GX_ALPHA1) { + Log.report(LOG_FATAL, FMT_STRING("bad channel {}"), id); + unreachable(); + } + update_gx_state(g_gxState.colorChannelState[id].ambColor, 
from_gx_color(color)); +} + +void GXSetChanMatColor(GXChannelID id, GXColor color) { + if (id == GX_COLOR0A0) { + GXSetChanMatColor(GX_COLOR0, color); + GXSetChanMatColor(GX_ALPHA0, color); + return; + } else if (id == GX_COLOR1A1) { + GXSetChanMatColor(GX_COLOR1, color); + GXSetChanMatColor(GX_ALPHA1, color); + return; + } + if (id < GX_COLOR0 || id > GX_ALPHA1) { + Log.report(LOG_FATAL, FMT_STRING("bad channel {}"), id); + unreachable(); + } + update_gx_state(g_gxState.colorChannelState[id].matColor, from_gx_color(color)); +} + +void GXSetNumChans(u8 num) { update_gx_state(g_gxState.numChans, num); } + +void GXInitLightDir(GXLightObj* light_, float nx, float ny, float nz) { + auto* light = reinterpret_cast(light_); + light->nx = -nx; + light->ny = -ny; + light->nz = -nz; +} + +void GXInitSpecularDir(GXLightObj* light_, float nx, float ny, float nz) { + float hx = -nx; + float hy = -ny; + float hz = (-nz + 1.0f); + float mag = ((hx * hx) + (hy * hy) + (hz * hz)); + if (mag != 0.0f) { + mag = 1.0f / sqrtf(mag); + } + + auto* light = reinterpret_cast(light_); + light->px = (nx * GX_LARGE_NUMBER); + light->py = (ny * GX_LARGE_NUMBER); + light->pz = (nz * GX_LARGE_NUMBER); + light->nx = hx * mag; + light->ny = hy * mag; + light->nz = hz * mag; +} + +void GXInitSpecularDirHA(GXLightObj* light_, float nx, float ny, float nz, float hx, float hy, float hz) { + auto* light = reinterpret_cast(light_); + light->px = (nx * GX_LARGE_NUMBER); + light->py = (ny * GX_LARGE_NUMBER); + light->pz = (nz * GX_LARGE_NUMBER); + light->nx = hx; + light->ny = hy; + light->nz = hz; +} + +void GXSetChanCtrl(GXChannelID id, bool lightingEnabled, GXColorSrc ambSrc, GXColorSrc matSrc, u32 lightState, + GXDiffuseFn diffFn, GXAttnFn attnFn) { + if (id == GX_COLOR0A0) { + GXSetChanCtrl(GX_COLOR0, lightingEnabled, ambSrc, matSrc, lightState, diffFn, attnFn); + GXSetChanCtrl(GX_ALPHA0, lightingEnabled, ambSrc, matSrc, lightState, diffFn, attnFn); + return; + } else if (id == GX_COLOR1A1) { + GXSetChanCtrl(GX_COLOR1, lightingEnabled, ambSrc, matSrc, lightState, diffFn, attnFn); + GXSetChanCtrl(GX_ALPHA1, lightingEnabled, ambSrc, matSrc, lightState, diffFn, attnFn); + return; + } + if (id < GX_COLOR0 || id > GX_ALPHA1) { + Log.report(LOG_FATAL, FMT_STRING("bad channel {}"), id); + unreachable(); + } + auto& chan = g_gxState.colorChannelConfig[id]; + update_gx_state(chan.lightingEnabled, lightingEnabled); + update_gx_state(chan.ambSrc, ambSrc); + update_gx_state(chan.matSrc, matSrc); + update_gx_state(chan.diffFn, diffFn); + update_gx_state(chan.attnFn, attnFn); + update_gx_state(g_gxState.colorChannelState[id].lightMask, GX::LightMask{lightState}); +} diff --git a/lib/dolphin/GXManage.cpp b/lib/dolphin/GXManage.cpp new file mode 100644 index 0000000..93577c8 --- /dev/null +++ b/lib/dolphin/GXManage.cpp @@ -0,0 +1,35 @@ +#include "gx.hpp" + +static GXDrawDoneCallback DrawDoneCB = nullptr; + +GXFifoObj* GXInit(void* base, u32 size) { return NULL; } + +// TODO GXAbortFrame +// TODO GXSetDrawSync +// TODO GXReadDrawSync +// TODO GXSetDrawSyncCallback + +void GXDrawDone() { DrawDoneCB(); } + +void GXSetDrawDone() { DrawDoneCB(); } + +// TODO GXWaitDrawDone + +GXDrawDoneCallback GXSetDrawDoneCallback(GXDrawDoneCallback cb) { + GXDrawDoneCallback old = DrawDoneCB; + DrawDoneCB = cb; + return old; +} + +// TODO GXSetResetWritePipe + +void GXFlush() {} + +// TODO GXResetWriteGatherPipe + +void GXPixModeSync() {} + +void GXTexModeSync() {} + +// TODO IsWriteGatherBufferEmpty +// TODO GXSetMisc diff --git a/lib/dolphin/GXPerf.cpp 
b/lib/dolphin/GXPerf.cpp new file mode 100644 index 0000000..8e6ea69 --- /dev/null +++ b/lib/dolphin/GXPerf.cpp @@ -0,0 +1,17 @@ +#include "gx.hpp" + +// TODO GXSetGPMetric +// TODO GXClearGPMetric +// TODO GXReadGPMetric +// TODO GXReadGP0Metric +// TODO GXReadGP1Metric +// TODO GXReadMemMetric +// TODO GXClearMemMetric +// TODO GXReadPixMetric +// TODO GXClearPixMetric +// TODO GXSetVCacheMetric +// TODO GXReadVCacheMetric +// TODO GXClearVCacheMetric +// TODO GXReadXfRasMetric +// TODO GXInitXfRasMetric +// TODO GXReadClksPerVtx diff --git a/lib/dolphin/GXPixel.cpp b/lib/dolphin/GXPixel.cpp new file mode 100644 index 0000000..53d9c88 --- /dev/null +++ b/lib/dolphin/GXPixel.cpp @@ -0,0 +1,46 @@ +#include "gx.hpp" + +void GXSetFog(GXFogType type, float startZ, float endZ, float nearZ, float farZ, GXColor color) { + update_gx_state(g_gxState.fog, {type, startZ, endZ, nearZ, farZ, from_gx_color(color)}); +} + +void GXSetFogColor(GXColor color) { update_gx_state(g_gxState.fog.color, from_gx_color(color)); } + +// TODO GXInitFogAdjTable +// TODO GXSetFogRangeAdj + +void GXSetBlendMode(GXBlendMode mode, GXBlendFactor src, GXBlendFactor dst, GXLogicOp op) { + update_gx_state(g_gxState.blendMode, mode); + update_gx_state(g_gxState.blendFacSrc, src); + update_gx_state(g_gxState.blendFacDst, dst); + update_gx_state(g_gxState.blendOp, op); +} + +void GXSetColorUpdate(GXBool enabled) { update_gx_state(g_gxState.colorUpdate, enabled); } + +void GXSetAlphaUpdate(bool enabled) { update_gx_state(g_gxState.alphaUpdate, enabled); } + +void GXSetZMode(bool compare_enable, GXCompare func, bool update_enable) { + update_gx_state(g_gxState.depthCompare, compare_enable); + update_gx_state(g_gxState.depthFunc, func); + update_gx_state(g_gxState.depthUpdate, update_enable); +} + +void GXSetZCompLoc(GXBool before_tex) { + // TODO +} + +void GXSetPixelFmt(GXPixelFmt pix_fmt, GXZFmt16 z_fmt) {} + +void GXSetDither(GXBool dither) {} + +void GXSetDstAlpha(bool enabled, u8 value) { + if (enabled) { + update_gx_state(g_gxState.dstAlpha, value); + } else { + update_gx_state(g_gxState.dstAlpha, UINT32_MAX); + } +} + +// TODO GXSetFieldMask +// TODO GXSetFieldMode diff --git a/lib/dolphin/GXTev.cpp b/lib/dolphin/GXTev.cpp new file mode 100644 index 0000000..d102ffe --- /dev/null +++ b/lib/dolphin/GXTev.cpp @@ -0,0 +1,111 @@ +#include "gx.hpp" + +void GXSetTevOp(GXTevStageID id, GXTevMode mode) { + GXTevColorArg inputColor = GX_CC_RASC; + GXTevAlphaArg inputAlpha = GX_CA_RASA; + if (id != GX_TEVSTAGE0) { + inputColor = GX_CC_CPREV; + inputAlpha = GX_CA_APREV; + } + switch (mode) { + case GX_MODULATE: + GXSetTevColorIn(id, GX_CC_ZERO, GX_CC_TEXC, inputColor, GX_CC_ZERO); + GXSetTevAlphaIn(id, GX_CA_ZERO, GX_CA_TEXA, inputAlpha, GX_CA_ZERO); + break; + case GX_DECAL: + GXSetTevColorIn(id, inputColor, GX_CC_TEXC, GX_CC_TEXA, GX_CC_ZERO); + GXSetTevAlphaIn(id, GX_CA_ZERO, GX_CA_ZERO, GX_CA_ZERO, inputAlpha); + break; + case GX_BLEND: + GXSetTevColorIn(id, inputColor, GX_CC_ONE, GX_CC_TEXC, GX_CC_ZERO); + GXSetTevAlphaIn(id, GX_CA_ZERO, GX_CA_TEXA, inputAlpha, GX_CA_ZERO); + break; + case GX_REPLACE: + GXSetTevColorIn(id, GX_CC_ZERO, GX_CC_ZERO, GX_CC_ZERO, GX_CC_TEXC); + GXSetTevAlphaIn(id, GX_CA_ZERO, GX_CA_ZERO, GX_CA_ZERO, GX_CA_TEXA); + break; + case GX_PASSCLR: + GXSetTevColorIn(id, GX_CC_ZERO, GX_CC_ZERO, GX_CC_ZERO, inputColor); + GXSetTevAlphaIn(id, GX_CA_ZERO, GX_CA_ZERO, GX_CA_ZERO, inputAlpha); + break; + } + GXSetTevColorOp(id, GX_TEV_ADD, GX_TB_ZERO, GX_CS_SCALE_1, GX_TRUE, GX_TEVPREV); + GXSetTevAlphaOp(id, 
GX_TEV_ADD, GX_TB_ZERO, GX_CS_SCALE_1, GX_TRUE, GX_TEVPREV); +} + +void GXSetTevColorIn(GXTevStageID stageId, GXTevColorArg a, GXTevColorArg b, GXTevColorArg c, GXTevColorArg d) { + update_gx_state(g_gxState.tevStages[stageId].colorPass, {a, b, c, d}); +} + +void GXSetTevAlphaIn(GXTevStageID stageId, GXTevAlphaArg a, GXTevAlphaArg b, GXTevAlphaArg c, GXTevAlphaArg d) { + update_gx_state(g_gxState.tevStages[stageId].alphaPass, {a, b, c, d}); +} + +void GXSetTevColorOp(GXTevStageID stageId, GXTevOp op, GXTevBias bias, GXTevScale scale, bool clamp, + GXTevRegID outReg) { + update_gx_state(g_gxState.tevStages[stageId].colorOp, {op, bias, scale, outReg, clamp}); +} + +void GXSetTevAlphaOp(GXTevStageID stageId, GXTevOp op, GXTevBias bias, GXTevScale scale, bool clamp, + GXTevRegID outReg) { + update_gx_state(g_gxState.tevStages[stageId].alphaOp, {op, bias, scale, outReg, clamp}); +} + +void GXSetTevColor(GXTevRegID id, GXColor color) { + if (id < GX_TEVPREV || id > GX_TEVREG2) { + Log.report(LOG_FATAL, FMT_STRING("bad tevreg {}"), id); + unreachable(); + } + update_gx_state(g_gxState.colorRegs[id], from_gx_color(color)); +} + +void GXSetTevColorS10(GXTevRegID id, GXColorS10 color) { + update_gx_state(g_gxState.colorRegs[id], aurora::Vec4{ + static_cast(color.r) / 1023.f, + static_cast(color.g) / 1023.f, + static_cast(color.b) / 1023.f, + static_cast(color.a) / 1023.f, + }); +} + +void GXSetAlphaCompare(GXCompare comp0, u8 ref0, GXAlphaOp op, GXCompare comp1, u8 ref1) { + update_gx_state(g_gxState.alphaCompare, {comp0, ref0, op, comp1, ref1}); +} + +void GXSetTevOrder(GXTevStageID id, GXTexCoordID tcid, GXTexMapID tmid, GXChannelID cid) { + auto& stage = g_gxState.tevStages[id]; + update_gx_state(stage.texCoordId, tcid); + update_gx_state(stage.texMapId, tmid); + update_gx_state(stage.channelId, cid); +} + +// TODO GXSetZTexture + +void GXSetNumTevStages(u8 num) { update_gx_state(g_gxState.numTevStages, num); } + +void GXSetTevKColor(GXTevKColorID id, GXColor color) { + if (id >= GX_MAX_KCOLOR) { + Log.report(LOG_FATAL, FMT_STRING("bad kcolor {}"), id); + unreachable(); + } + update_gx_state(g_gxState.kcolors[id], from_gx_color(color)); +} + +void GXSetTevKColorSel(GXTevStageID id, GXTevKColorSel sel) { update_gx_state(g_gxState.tevStages[id].kcSel, sel); } + +void GXSetTevKAlphaSel(GXTevStageID id, GXTevKAlphaSel sel) { update_gx_state(g_gxState.tevStages[id].kaSel, sel); } + +void GXSetTevSwapMode(GXTevStageID stageId, GXTevSwapSel rasSel, GXTevSwapSel texSel) { + auto& stage = g_gxState.tevStages[stageId]; + update_gx_state(stage.tevSwapRas, rasSel); + update_gx_state(stage.tevSwapTex, texSel); +} + +void GXSetTevSwapModeTable(GXTevSwapSel id, GXTevColorChan red, GXTevColorChan green, GXTevColorChan blue, + GXTevColorChan alpha) { + if (id < GX_TEV_SWAP0 || id >= GX_MAX_TEVSWAP) { + Log.report(LOG_FATAL, FMT_STRING("invalid tev swap sel {}"), id); + unreachable(); + } + update_gx_state(g_gxState.tevSwapTable[id], {red, green, blue, alpha}); +} diff --git a/lib/dolphin/GXTexture.cpp b/lib/dolphin/GXTexture.cpp new file mode 100644 index 0000000..784a2a4 --- /dev/null +++ b/lib/dolphin/GXTexture.cpp @@ -0,0 +1,231 @@ +#include "gx.hpp" + +#include "../gfx/texture.hpp" + +#include + +static absl::flat_hash_map g_resolvedTexMap; + +void GXInitTexObj(GXTexObj* obj_, const void* data, u16 width, u16 height, u32 format, GXTexWrapMode wrapS, + GXTexWrapMode wrapT, GXBool mipmap) { + memset(obj_, 0, sizeof(GXTexObj)); + auto* obj = reinterpret_cast(obj_); + obj->data = data; + obj->width = width; + 
obj->height = height; + obj->fmt = format; + obj->wrapS = wrapS; + obj->wrapT = wrapT; + obj->hasMips = mipmap; + // TODO default values? + obj->minFilter = GX_LINEAR; + obj->magFilter = GX_LINEAR; + obj->minLod = 0.f; + obj->maxLod = 0.f; + obj->lodBias = 0.f; + obj->biasClamp = false; + obj->doEdgeLod = false; + obj->maxAniso = GX_ANISO_4; + obj->tlut = GX_TLUT0; + if (g_resolvedTexMap.contains(data)) { + obj->dataInvalidated = false; // TODO hack + } else { + obj->dataInvalidated = true; + } +} + +void GXInitTexObjCI(GXTexObj* obj_, const void* data, u16 width, u16 height, GXCITexFmt format, GXTexWrapMode wrapS, + GXTexWrapMode wrapT, GXBool mipmap, u32 tlut) { + memset(obj_, 0, sizeof(GXTexObj)); + auto* obj = reinterpret_cast(obj_); + obj->data = data; + obj->width = width; + obj->height = height; + obj->fmt = static_cast(format); + obj->wrapS = wrapS; + obj->wrapT = wrapT; + obj->hasMips = mipmap; + obj->tlut = static_cast(tlut); + // TODO default values? + obj->minFilter = GX_LINEAR; + obj->magFilter = GX_LINEAR; + obj->minLod = 0.f; + obj->maxLod = 0.f; + obj->lodBias = 0.f; + obj->biasClamp = false; + obj->doEdgeLod = false; + obj->maxAniso = GX_ANISO_4; + obj->dataInvalidated = true; +} + +void GXInitTexObjLOD(GXTexObj* obj_, GXTexFilter minFilt, GXTexFilter magFilt, float minLod, float maxLod, + float lodBias, GXBool biasClamp, GXBool doEdgeLod, GXAnisotropy maxAniso) { + auto* obj = reinterpret_cast(obj_); + obj->minFilter = minFilt; + obj->magFilter = magFilt; + obj->minLod = minLod; + obj->maxLod = maxLod; + obj->lodBias = lodBias; + obj->biasClamp = biasClamp; + obj->doEdgeLod = doEdgeLod; + obj->maxAniso = maxAniso; +} + +void GXInitTexObjData(GXTexObj* obj_, const void* data) { + auto* obj = reinterpret_cast(obj_); + obj->data = data; + obj->dataInvalidated = true; +} + +void GXInitTexObjWrapMode(GXTexObj* obj_, GXTexWrapMode wrapS, GXTexWrapMode wrapT) { + auto* obj = reinterpret_cast(obj_); + obj->wrapS = wrapS; + obj->wrapT = wrapT; +} + +void GXInitTexObjTlut(GXTexObj* obj_, u32 tlut) { + auto* obj = reinterpret_cast(obj_); + obj->tlut = static_cast(tlut); +} + +// TODO GXInitTexObjFilter +// TODO GXInitTexObjMaxLOD +// TODO GXInitTexObjMinLOD +// TODO GXInitTexObjLODBias +// TODO GXInitTexObjBiasClamp +// TODO GXInitTexObjEdgeLOD +// TODO GXInitTexObjMaxAniso +// TODO GXInitTexObjUserData +// TODO GXGetTexObjUserData + +void GXLoadTexObj(GXTexObj* obj_, GXTexMapID id) { + auto* obj = reinterpret_cast(obj_); + if (!obj->ref) { + obj->ref = aurora::gfx::new_dynamic_texture_2d(obj->width, obj->height, u32(obj->maxLod) + 1, obj->fmt, + fmt::format(FMT_STRING("GXLoadTexObj_{}"), obj->fmt).c_str()); + } + if (obj->dataInvalidated) { + aurora::gfx::write_texture(*obj->ref, {static_cast(obj->data), UINT32_MAX /* TODO */}); + obj->dataInvalidated = false; + } + g_gxState.textures[id] = {*obj}; + // TODO stateDirty? 
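  // Annotation (not part of the original commit): texture upload is lazy here. The GPU texture is
  // created only the first time the object is bound, and the CPU-side data is re-uploaded solely
  // when dataInvalidated was set by GXInitTexObj/GXInitTexObjData (it is skipped for destinations
  // recorded in g_resolvedTexMap by GXCopyTex).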
+} + +u32 GXGetTexBufferSize(u16 width, u16 height, u32 fmt, GXBool mips, u8 maxLod) { + s32 shiftX = 0; + s32 shiftY = 0; + switch (fmt) { + case GX_TF_I4: + case GX_TF_C4: + case GX_TF_CMPR: + case GX_CTF_R4: + case GX_CTF_Z4: + shiftX = 3; + shiftY = 3; + break; + case GX_TF_I8: + case GX_TF_IA4: + case GX_TF_C8: + case GX_TF_Z8: + case GX_CTF_RA4: + case GX_CTF_A8: + case GX_CTF_R8: + case GX_CTF_G8: + case GX_CTF_B8: + case GX_CTF_Z8M: + case GX_CTF_Z8L: + shiftX = 3; + shiftY = 2; + break; + case GX_TF_IA8: + case GX_TF_RGB565: + case GX_TF_RGB5A3: + case GX_TF_RGBA8: + case GX_TF_C14X2: + case GX_TF_Z16: + case GX_TF_Z24X8: + case GX_CTF_RA8: + case GX_CTF_RG8: + case GX_CTF_GB8: + case GX_CTF_Z16L: + shiftX = 2; + shiftY = 2; + break; + default: + break; + } + u32 bitSize = fmt == GX_TF_RGBA8 || fmt == GX_TF_Z24X8 ? 64 : 32; + u32 bufLen = 0; + if (mips) { + while (maxLod != 0) { + const u32 tileX = ((width + (1 << shiftX) - 1) >> shiftX); + const u32 tileY = ((height + (1 << shiftY) - 1) >> shiftY); + bufLen += bitSize * tileX * tileY; + + if (width == 1 && height == 1) { + return bufLen; + } + + width = (width < 2) ? 1 : width / 2; + height = (height < 2) ? 1 : height / 2; + --maxLod; + }; + } else { + const u32 tileX = ((width + (1 << shiftX) - 1) >> shiftX); + const u32 tileY = ((height + (1 << shiftY) - 1) >> shiftY); + bufLen = bitSize * tileX * tileY; + } + + return bufLen; +} + +void GXInitTlutObj(GXTlutObj* obj_, const void* data, GXTlutFmt format, u16 entries) { + memset(obj_, 0, sizeof(GXTlutObj)); + GXTexFmt texFmt; + switch (format) { + case GX_TL_IA8: + texFmt = GX_TF_IA8; + break; + case GX_TL_RGB565: + texFmt = GX_TF_RGB565; + break; + case GX_TL_RGB5A3: + texFmt = GX_TF_RGB5A3; + break; + default: + Log.report(LOG_FATAL, FMT_STRING("invalid tlut format {}"), format); + unreachable(); + } + auto* obj = reinterpret_cast(obj_); + obj->ref = aurora::gfx::new_static_texture_2d( + entries, 1, 1, texFmt, aurora::ArrayRef{static_cast(data), static_cast(entries) * 2}, + "GXInitTlutObj"); +} + +void GXLoadTlut(const GXTlutObj* obj_, GXTlut idx) { + g_gxState.tluts[idx] = *reinterpret_cast(obj_); + // TODO stateDirty? +} + +// TODO GXInitTexCacheRegion +// TODO GXInitTexPreLoadRegion +// TODO GXInitTlutRegion +// TODO GXInvalidateTexRegion + +void GXInvalidateTexAll() { + // no-op? 
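  // Annotation (not part of the original commit): a no-op appears safe here because re-uploads
  // are driven by the per-object dataInvalidated flag (see GXInitTexObjData and GXLoadTexObj
  // above) rather than by the hardware texture cache this call would normally flush.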
+} + +// TODO GXPreLoadEntireTexture +// TODO GXSetTexRegionCallback +// TODO GXSetTlutRegionCallback +// TODO GXLoadTexObjPreLoaded +// TODO GXSetTexCoordScaleManually +// TODO GXSetTexCoordCylWrap +// TODO GXSetTexCoordBias + +void GXCopyTex(void* dest, GXBool clear) { + // TODO + g_resolvedTexMap.emplace(dest, 0); +} diff --git a/lib/dolphin/GXTransform.cpp b/lib/dolphin/GXTransform.cpp new file mode 100644 index 0000000..efbdf89 --- /dev/null +++ b/lib/dolphin/GXTransform.cpp @@ -0,0 +1,123 @@ +#include "gx.hpp" + +constexpr aurora::Mat4x4 DepthCorrect{ + {1.f, 0.f, 0.f, 0.f}, + {0.f, 1.f, 0.f, 0.f}, + {0.f, 0.f, 1.f, 0.f}, + {0.f, 0.f, 1.f, 1.f}, +}; + +void GXSetProjection(const void* mtx_, GXProjectionType type) { + const auto& mtx = *reinterpret_cast*>(mtx_); + g_gxState.origProj = mtx; + g_gxState.projType = type; +#ifdef AURORA_NATIVE_MATRIX + update_gx_state(g_gxState.proj, DepthCorrect * mtx); +#else + update_gx_state(g_gxState.proj, DepthCorrect * mtx.transpose()); +#endif +} + +// TODO GXSetProjectionv + +void GXLoadPosMtxImm(const void* mtx_, u32 id) { + if (id < GX_PNMTX0 || id > GX_PNMTX9) { + Log.report(LOG_FATAL, FMT_STRING("invalid pn mtx {}"), id); + unreachable(); + } + auto& state = g_gxState.pnMtx[id / 3]; +#ifdef AURORA_NATIVE_MATRIX + const auto& mtx = *reinterpret_cast*>(mtx_); + update_gx_state(state.pos, mtx); +#else + const auto* mtx = reinterpret_cast*>(mtx_); + update_gx_state(state.pos, mtx->toTransposed4x4()); +#endif +} + +// TODO GXLoadPosMtxIndx + +void GXLoadNrmMtxImm(const void* mtx_, u32 id) { + if (id < GX_PNMTX0 || id > GX_PNMTX9) { + Log.report(LOG_FATAL, FMT_STRING("invalid pn mtx {}"), id); + unreachable(); + } + auto& state = g_gxState.pnMtx[id / 3]; +#ifdef AURORA_NATIVE_MATRIX + const auto& mtx = *reinterpret_cast*>(mtx_); + update_gx_state(state.nrm, mtx); +#else + const auto* mtx = reinterpret_cast*>(mtx_); + update_gx_state(state.nrm, mtx->toTransposed4x4()); +#endif +} + +// TODO GXLoadNrmMtxImm3x3 +// TODO GXLoadNrmMtxIndx3x3 + +void GXSetCurrentMtx(u32 id) { + if (id < GX_PNMTX0 || id > GX_PNMTX9) { + Log.report(LOG_FATAL, FMT_STRING("invalid pn mtx {}"), id); + unreachable(); + } + update_gx_state(g_gxState.currentPnMtx, id / 3); +} + +void GXLoadTexMtxImm(const void* mtx_, u32 id, GXTexMtxType type) { + if ((id < GX_TEXMTX0 || id > GX_IDENTITY) && (id < GX_PTTEXMTX0 || id > GX_PTIDENTITY)) { + Log.report(LOG_FATAL, FMT_STRING("invalid tex mtx {}"), id); + unreachable(); + } + if (id >= GX_PTTEXMTX0) { + if (type != GX_MTX3x4) { + Log.report(LOG_FATAL, FMT_STRING("invalid pt mtx type {}"), type); + unreachable(); + } + const auto idx = (id - GX_PTTEXMTX0) / 3; +#ifdef AURORA_NATIVE_MATRIX + const auto& mtx = *reinterpret_cast*>(mtx_); + update_gx_state>(g_gxState.ptTexMtxs[idx], mtx); +#else + const auto& mtx = *reinterpret_cast*>(mtx_); + update_gx_state>(g_gxState.ptTexMtxs[idx], mtx.toTransposed4x4()); +#endif + } else { + const auto idx = (id - GX_TEXMTX0) / 3; + switch (type) { + case GX_MTX3x4: { +#ifdef AURORA_NATIVE_MATRIX + const auto& mtx = *reinterpret_cast*>(mtx_); + update_gx_state(g_gxState.texMtxs[idx], mtx); +#else + const auto& mtx = *reinterpret_cast*>(mtx_); + update_gx_state(g_gxState.texMtxs[idx], mtx.toTransposed4x4()); +#endif + break; + } + case GX_MTX2x4: { + const auto& mtx = *reinterpret_cast*>(mtx_); +#ifdef AURORA_NATIVE_MATRIX + update_gx_state(g_gxState.texMtxs[idx], mtx); +#else + update_gx_state(g_gxState.texMtxs[idx], mtx.transpose()); +#endif + break; + } + } + } +} + +// TODO GXLoadTexMtxIndx +// 
TODO GXProject + +void GXSetViewport(float left, float top, float width, float height, float nearZ, float farZ) { + aurora::gfx::set_viewport(left, top, width, height, nearZ, farZ); +} + +void GXSetViewportJitter(float left, float top, float width, float height, float nearZ, float farZ, u32 field) { + aurora::gfx::set_viewport(left, top, width, height, nearZ, farZ); +} + +// TODO GXSetZScaleOffset +// TODO GXSetScissorBoxOffset +// TODO GXSetClipMode diff --git a/lib/dolphin/GXVert.cpp b/lib/dolphin/GXVert.cpp new file mode 100644 index 0000000..52d610d --- /dev/null +++ b/lib/dolphin/GXVert.cpp @@ -0,0 +1,188 @@ +#include "gx.hpp" + +#include "../gfx/stream/shader.hpp" + +#include +#include + +#ifndef NDEBUG +static inline GXAttr next_attr(size_t begin) { + auto iter = std::find_if(g_gxState.vtxDesc.begin() + begin, g_gxState.vtxDesc.end(), + [](const auto type) { return type != GX_NONE; }); + if (begin > 0 && iter == g_gxState.vtxDesc.end()) { + // wrap around + iter = std::find_if(g_gxState.vtxDesc.begin(), g_gxState.vtxDesc.end(), + [](const auto type) { return type != GX_NONE; }); + } + return GXAttr(iter - g_gxState.vtxDesc.begin()); +} +#endif + +struct SStreamState { + GXPrimitive primitive; + u16 vertexCount = 0; + aurora::ByteBuffer vertexBuffer; + std::vector indices; +#ifndef NDEBUG + GXAttr nextAttr; +#endif + + explicit SStreamState(GXPrimitive primitive, u16 numVerts, u16 vertexSize) noexcept : primitive(primitive) { + vertexBuffer.reserve_extra(size_t(numVerts) * vertexSize); + if (numVerts > 3 && (primitive == GX_TRIANGLEFAN || primitive == GX_TRIANGLESTRIP)) { + indices.reserve((u32(numVerts) - 3) * 3 + 3); + } else if (numVerts > 4 && primitive == GX_QUADS) { + indices.reserve(u32(numVerts) / 4 * 6); + } else { + indices.reserve(numVerts); + } +#ifndef NDEBUG + nextAttr = next_attr(0); +#endif + } +}; + +static std::optional sStreamState; + +void GXBegin(GXPrimitive primitive, GXVtxFmt vtxFmt, u16 nVerts) { +#ifndef NDEBUG + if (sStreamState) { + Log.report(LOG_FATAL, FMT_STRING("Stream began twice!")); + unreachable(); + } +#endif + uint16_t vertexSize = 0; + for (GXAttr attr{}; const auto type : g_gxState.vtxDesc) { + if (type == GX_DIRECT) { + if (attr == GX_VA_POS || attr == GX_VA_NRM) { + vertexSize += 12; + } else if (attr == GX_VA_CLR0 || attr == GX_VA_CLR1) { + vertexSize += 16; + } else if (attr >= GX_VA_TEX0 && attr <= GX_VA_TEX7) { + vertexSize += 8; + } else { + Log.report(LOG_FATAL, FMT_STRING("don't know how to handle attr {}"), attr); + unreachable(); + } + } else if (type == GX_INDEX8 || type == GX_INDEX16) { + vertexSize += 2; + } + attr = GXAttr(attr + 1); + } + if (vertexSize == 0) { + Log.report(LOG_FATAL, FMT_STRING("no vtx attributes enabled?")); + unreachable(); + } + sStreamState.emplace(primitive, nVerts, vertexSize); +} + +static inline void check_attr_order(GXAttr attr) noexcept { +#ifndef NDEBUG + if (!sStreamState) { + Log.report(LOG_FATAL, FMT_STRING("Stream not started!")); + unreachable(); + } + if (sStreamState->nextAttr != attr) { + Log.report(LOG_FATAL, FMT_STRING("bad attribute order: {}, expected {}"), attr, sStreamState->nextAttr); + unreachable(); + } + sStreamState->nextAttr = next_attr(attr + 1); +#endif +} + +void GXPosition3f32(float x, float y, float z) { + check_attr_order(GX_VA_POS); + auto& state = *sStreamState; + state.vertexBuffer.append(&x, sizeof(float)); + state.vertexBuffer.append(&y, sizeof(float)); + state.vertexBuffer.append(&z, sizeof(float)); + if (state.primitive == GX_TRIANGLES || state.vertexCount < 3) { + // 
pass + } else if (state.primitive == GX_TRIANGLEFAN) { + state.indices.push_back(0); + state.indices.push_back(state.vertexCount - 1); + } else if (state.primitive == GX_TRIANGLESTRIP) { + if ((state.vertexCount & 1) == 0) { + state.indices.push_back(state.vertexCount - 2); + state.indices.push_back(state.vertexCount - 1); + } else { + state.indices.push_back(state.vertexCount - 1); + state.indices.push_back(state.vertexCount - 2); + } + } else if (state.primitive == GX_QUADS) { + if ((state.vertexCount & 3) == 3) { + state.indices.push_back(state.vertexCount - 3); + state.indices.push_back(state.vertexCount - 1); + } + } + state.indices.push_back(state.vertexCount); + ++state.vertexCount; +} + +void GXPosition3s16(s16 x, s16 y, s16 z) { + // TODO frac + GXPosition3f32(x, y, z); +} + +void GXNormal3f32(float x, float y, float z) { + check_attr_order(GX_VA_NRM); + sStreamState->vertexBuffer.append(&x, 4); + sStreamState->vertexBuffer.append(&y, 4); + sStreamState->vertexBuffer.append(&z, 4); +} + +void GXColor4f32(float r, float g, float b, float a) { + check_attr_order(GX_VA_CLR0); + sStreamState->vertexBuffer.append(&r, 4); + sStreamState->vertexBuffer.append(&g, 4); + sStreamState->vertexBuffer.append(&b, 4); + sStreamState->vertexBuffer.append(&a, 4); +} + +void GXColor4u8(u8 r, u8 g, u8 b, u8 a) { + GXColor4f32(static_cast(r) / 255.f, static_cast(g) / 255.f, static_cast(b) / 255.f, + static_cast(a) / 255.f); +} + +void GXTexCoord2f32(float u, float v) { + check_attr_order(GX_VA_TEX0); + sStreamState->vertexBuffer.append(&u, 4); + sStreamState->vertexBuffer.append(&v, 4); +} + +void GXTexCoord2s16(s16 s, s16 t) { + // TODO frac + GXTexCoord2f32(s, t); +} + +void GXPosition1x16(u16 idx) { + check_attr_order(GX_VA_POS); + // keep aligned + if (sStreamState->vertexBuffer.size() % 4 != 0) { + sStreamState->vertexBuffer.append_zeroes(4 - (sStreamState->vertexBuffer.size() % 4)); + } + sStreamState->vertexBuffer.append(&idx, 2); +} + +void GXEnd() { + if (sStreamState->vertexCount == 0) { + sStreamState.reset(); + return; + } + const auto vertRange = aurora::gfx::push_verts(sStreamState->vertexBuffer.data(), sStreamState->vertexBuffer.size()); + const auto indexRange = aurora::gfx::push_indices(aurora::ArrayRef{sStreamState->indices}); + aurora::gfx::stream::PipelineConfig config{}; + populate_pipeline_config(config, GX_TRIANGLES); + const auto info = aurora::gfx::gx::build_shader_info(config.shaderConfig); + const auto pipeline = aurora::gfx::pipeline_ref(config); + aurora::gfx::push_draw_command(aurora::gfx::stream::DrawData{ + .pipeline = pipeline, + .vertRange = vertRange, + .uniformRange = build_uniform(info), + .indexRange = indexRange, + .indexCount = static_cast(sStreamState->indices.size()), + .bindGroups = aurora::gfx::gx::build_bind_groups(info, config.shaderConfig, {}), + .dstAlpha = g_gxState.dstAlpha, + }); + sStreamState.reset(); +} diff --git a/lib/dolphin/gx.hpp b/lib/dolphin/gx.hpp new file mode 100644 index 0000000..2183b0f --- /dev/null +++ b/lib/dolphin/gx.hpp @@ -0,0 +1,25 @@ +#pragma once + +#include "../internal.hpp" +#include "../gfx/gx.hpp" + +static aurora::Module Log("aurora::gx"); + +using aurora::gfx::gx::g_gxState; + +template +static inline void update_gx_state(T& val, T newVal) { + if (val != newVal) { + val = std::move(newVal); + g_gxState.stateDirty = true; + } +} + +static inline aurora::Vec4 from_gx_color(GXColor color) { + return { + static_cast(color.r) / 255.f, + static_cast(color.g) / 255.f, + static_cast(color.b) / 255.f, + static_cast(color.a) / 
255.f, + }; +} diff --git a/lib/dolphin/vi.cpp b/lib/dolphin/vi.cpp new file mode 100644 index 0000000..8c08241 --- /dev/null +++ b/lib/dolphin/vi.cpp @@ -0,0 +1,7 @@ +extern "C" { +#include +} + +void VIInit() {} +u32 VIGetTvFormat() { return 0; } +void VIFlush() {} diff --git a/lib/gfx/common.cpp b/lib/gfx/common.cpp new file mode 100644 index 0000000..ad9cb78 --- /dev/null +++ b/lib/gfx/common.cpp @@ -0,0 +1,813 @@ +#include "common.hpp" + +#include "../internal.hpp" +#include "../webgpu/gpu.hpp" +#include "model/shader.hpp" +#include "stream/shader.hpp" +#include "texture.hpp" + +#include +#include +#include +#include +#include +#include +#include + +namespace aurora::gfx { +static Module Log("aurora::gfx"); + +using webgpu::g_device; +using webgpu::g_queue; + +#ifdef AURORA_GFX_DEBUG_GROUPS +std::vector g_debugGroupStack; +#endif + +constexpr uint64_t UniformBufferSize = 3145728; // 3mb +constexpr uint64_t VertexBufferSize = 3145728; // 3mb +constexpr uint64_t IndexBufferSize = 1048576; // 1mb +constexpr uint64_t StorageBufferSize = 8388608; // 8mb +constexpr uint64_t TextureUploadSize = 25165824; // 24mb + +constexpr uint64_t StagingBufferSize = + UniformBufferSize + VertexBufferSize + IndexBufferSize + StorageBufferSize + TextureUploadSize; + +struct ShaderState { + stream::State stream; + model::State model; +}; +struct ShaderDrawCommand { + ShaderType type; + union { + stream::DrawData stream; + model::DrawData model; + }; +}; +enum class CommandType { + SetViewport, + SetScissor, + Draw, +}; +struct Command { + CommandType type; +#ifdef AURORA_GFX_DEBUG_GROUPS + std::vector debugGroupStack; +#endif + union Data { + struct SetViewportCommand { + float left; + float top; + float width; + float height; + float znear; + float zfar; + + bool operator==(const SetViewportCommand& rhs) const { + return left == rhs.left && top == rhs.top && width == rhs.width && height == rhs.height && znear == rhs.znear && + zfar == rhs.zfar; + } + bool operator!=(const SetViewportCommand& rhs) const { return !(*this == rhs); } + } setViewport; + struct SetScissorCommand { + uint32_t x; + uint32_t y; + uint32_t w; + uint32_t h; + + bool operator==(const SetScissorCommand& rhs) const { + return x == rhs.x && y == rhs.y && w == rhs.w && h == rhs.h; + } + bool operator!=(const SetScissorCommand& rhs) const { return !(*this == rhs); } + } setScissor; + ShaderDrawCommand draw; + } data; +}; +} // namespace aurora::gfx + +namespace aurora { +// For types that we can't ensure are safe to hash with has_unique_object_representations, +// we create specialized methods to handle them. Note that these are highly dependent on +// the structure definition, which could easily change with Dawn updates. 
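// Illustrative sketch (editor's addition, not part of this commit): the same
// "skip the leading pointer members, hash the rest" pattern applied to a made-up
// descriptor type. It only assumes the WGPU convention used below, where the
// struct begins with nextInChain followed by label.
struct ExampleDescriptor {
  const void* nextInChain;
  const char* label;
  uint32_t width;
  uint32_t height;
};
template <>
inline HashType xxh3_hash(const ExampleDescriptor& input, HashType seed) {
  constexpr auto offset = sizeof(void*) * 2; // skip nextInChain, label
  return xxh3_hash_s(reinterpret_cast<const uint8_t*>(&input) + offset,
                     sizeof(ExampleDescriptor) - offset, seed);
}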
+template <> +inline HashType xxh3_hash(const WGPUBindGroupDescriptor& input, HashType seed) { + constexpr auto offset = sizeof(void*) * 2; // skip nextInChain, label + const auto hash = xxh3_hash_s(reinterpret_cast(&input) + offset, + sizeof(WGPUBindGroupDescriptor) - offset - sizeof(void*) /* skip entries */, seed); + return xxh3_hash_s(input.entries, sizeof(WGPUBindGroupEntry) * input.entryCount, hash); +} +template <> +inline HashType xxh3_hash(const WGPUSamplerDescriptor& input, HashType seed) { + constexpr auto offset = sizeof(void*) * 2; // skip nextInChain, label + return xxh3_hash_s(reinterpret_cast(&input) + offset, + sizeof(WGPUSamplerDescriptor) - offset - 2 /* skip padding */, seed); +} +} // namespace aurora + +namespace aurora::gfx { +using NewPipelineCallback = std::function; +std::mutex g_pipelineMutex; +static bool g_hasPipelineThread = false; +static std::thread g_pipelineThread; +static std::atomic_bool g_pipelineThreadEnd; +static std::condition_variable g_pipelineCv; +static absl::flat_hash_map g_pipelines; +static std::deque> g_queuedPipelines; +static absl::flat_hash_map g_cachedBindGroups; +static absl::flat_hash_map g_cachedSamplers; +std::atomic_uint32_t queuedPipelines; +std::atomic_uint32_t createdPipelines; + +static ByteBuffer g_verts; +static ByteBuffer g_uniforms; +static ByteBuffer g_indices; +static ByteBuffer g_storage; +static ByteBuffer g_staticStorage; +static ByteBuffer g_textureUpload; +WGPUBuffer g_vertexBuffer; +WGPUBuffer g_uniformBuffer; +WGPUBuffer g_indexBuffer; +WGPUBuffer g_storageBuffer; +size_t g_staticStorageLastSize = 0; +static std::array g_stagingBuffers; +static WGPUSupportedLimits g_cachedLimits; + +static ShaderState g_state; +static PipelineRef g_currentPipeline; + +using CommandList = std::vector; +struct ClipRect { + int32_t x; + int32_t y; + int32_t width; + int32_t height; +}; +struct RenderPass { + u32 resolveTarget = UINT32_MAX; + ClipRect resolveRect; + Vec4 clearColor{0.f, 0.f, 0.f, 0.f}; + CommandList commands; + bool clear = true; +}; +static std::vector g_renderPasses; +static u32 g_currentRenderPass = UINT32_MAX; +std::vector g_resolvedTextures; +std::vector g_textureUploads; + +static ByteBuffer g_serializedPipelines{}; +static u32 g_serializedPipelineCount = 0; + +template +static void serialize_pipeline_config(ShaderType type, const PipelineConfig& config) { + static_assert(std::has_unique_object_representations_v); + g_serializedPipelines.append(&type, sizeof(type)); + const u32 configSize = sizeof(config); + g_serializedPipelines.append(&configSize, sizeof(configSize)); + g_serializedPipelines.append(&config, configSize); + ++g_serializedPipelineCount; +} + +template +static PipelineRef find_pipeline(ShaderType type, const PipelineConfig& config, NewPipelineCallback&& cb, + bool serialize = true) { + PipelineRef hash = xxh3_hash(config, static_cast(type)); + bool found = false; + { + std::scoped_lock guard{g_pipelineMutex}; + found = g_pipelines.contains(hash); + if (!found) { + if (g_hasPipelineThread) { + const auto ref = + std::find_if(g_queuedPipelines.begin(), g_queuedPipelines.end(), [=](auto v) { return v.first == hash; }); + if (ref != g_queuedPipelines.end()) { + found = true; + } + } else { + g_pipelines.try_emplace(hash, cb()); + if (serialize) { + serialize_pipeline_config(type, config); + } + found = true; + } + } + if (!found) { + g_queuedPipelines.emplace_back(std::pair{hash, std::move(cb)}); + if (serialize) { + serialize_pipeline_config(type, config); + } + } + } + if (!found) { + 
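    // A compile job was just queued above (async path); wake the pipeline worker
    // now that g_pipelineMutex has been released so it can start building.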
g_pipelineCv.notify_one(); + queuedPipelines++; + } + return hash; +} + +static inline void push_command(CommandType type, const Command::Data& data) { + if (g_currentRenderPass == UINT32_MAX) { + Log.report(LOG_WARNING, FMT_STRING("Dropping command {}"), magic_enum::enum_name(type)); + return; + } + g_renderPasses[g_currentRenderPass].commands.push_back({ + .type = type, +#ifdef AURORA_GFX_DEBUG_GROUPS + .debugGroupStack = g_debugGroupStack, +#endif + .data = data, + }); +} + +static void push_draw_command(ShaderDrawCommand data) { push_command(CommandType::Draw, Command::Data{.draw = data}); } + +static Command::Data::SetViewportCommand g_cachedViewport; +void set_viewport(float left, float top, float width, float height, float znear, float zfar) noexcept { + Command::Data::SetViewportCommand cmd{left, top, width, height, znear, zfar}; + if (cmd != g_cachedViewport) { + push_command(CommandType::SetViewport, Command::Data{.setViewport = cmd}); + g_cachedViewport = cmd; + } +} +static Command::Data::SetScissorCommand g_cachedScissor; +void set_scissor(uint32_t x, uint32_t y, uint32_t w, uint32_t h) noexcept { + Command::Data::SetScissorCommand cmd{x, y, w, h}; + if (cmd != g_cachedScissor) { + push_command(CommandType::SetScissor, Command::Data{.setScissor = cmd}); + g_cachedScissor = cmd; + } +} + +static inline bool operator==(const WGPUExtent3D& lhs, const WGPUExtent3D& rhs) { + return lhs.width == rhs.width && lhs.height == rhs.height && lhs.depthOrArrayLayers == rhs.depthOrArrayLayers; +} +static inline bool operator!=(const WGPUExtent3D& lhs, const WGPUExtent3D& rhs) { return !(lhs == rhs); } + +void resolve_color(const ClipRect& rect, uint32_t bind, GXTexFmt fmt, bool clear_depth) noexcept { + if (g_resolvedTextures.size() < bind + 1) { + g_resolvedTextures.resize(bind + 1); + } + const WGPUExtent3D size{ + .width = static_cast(rect.width), + .height = static_cast(rect.height), + .depthOrArrayLayers = 1, + }; + if (!g_resolvedTextures[bind] || g_resolvedTextures[bind]->size != size) { + g_resolvedTextures[bind] = new_render_texture(rect.width, rect.height, fmt, "Resolved Texture"); + } + auto& currentPass = g_renderPasses[g_currentRenderPass]; + currentPass.resolveTarget = bind; + currentPass.resolveRect = rect; + auto& newPass = g_renderPasses.emplace_back(); + newPass.clearColor = gx::g_gxState.clearColor; + newPass.clear = false; // TODO + ++g_currentRenderPass; +} + +template <> +const stream::State& get_state() { + return g_state.stream; +} +template <> +void push_draw_command(stream::DrawData data) { + push_draw_command(ShaderDrawCommand{.type = ShaderType::Stream, .stream = data}); +} +template <> +PipelineRef pipeline_ref(stream::PipelineConfig config) { + return find_pipeline(ShaderType::Stream, config, [=]() { return create_pipeline(g_state.stream, config); }); +} + +template <> +void push_draw_command(model::DrawData data) { + push_draw_command(ShaderDrawCommand{.type = ShaderType::Model, .model = data}); +} +template <> +PipelineRef pipeline_ref(model::PipelineConfig config) { + return find_pipeline(ShaderType::Model, config, [=]() { return create_pipeline(g_state.model, config); }); +} + +static void pipeline_worker() { + bool hasMore = false; + while (true) { + std::pair cb; + { + std::unique_lock lock{g_pipelineMutex}; + if (!hasMore) { + g_pipelineCv.wait(lock, [] { return !g_queuedPipelines.empty() || g_pipelineThreadEnd; }); + } + if (g_pipelineThreadEnd) { + break; + } + cb = std::move(g_queuedPipelines.front()); + } + auto result = cb.second(); + // 
std::this_thread::sleep_for(std::chrono::milliseconds{1500}); + { + std::scoped_lock lock{g_pipelineMutex}; + if (!g_pipelines.try_emplace(cb.first, std::move(result)).second) { + Log.report(LOG_FATAL, FMT_STRING("Duplicate pipeline {}"), cb.first); + unreachable(); + } + g_queuedPipelines.pop_front(); + hasMore = !g_queuedPipelines.empty(); + } + createdPipelines++; + queuedPipelines--; + } +} + +void initialize() { + // No async pipelines for OpenGL (ES) + if (webgpu::g_backendType == WGPUBackendType_OpenGL || webgpu::g_backendType == WGPUBackendType_OpenGLES) { + g_hasPipelineThread = false; + } else { + g_pipelineThreadEnd = false; + g_pipelineThread = std::thread(pipeline_worker); + g_hasPipelineThread = true; + } + + // For uniform & storage buffer offset alignments + wgpuDeviceGetLimits(g_device, &g_cachedLimits); + + const auto createBuffer = [](WGPUBuffer& out, WGPUBufferUsageFlags usage, uint64_t size, const char* label) { + if (size <= 0) { + return; + } + const WGPUBufferDescriptor descriptor{ + .label = label, + .usage = usage, + .size = size, + }; + out = wgpuDeviceCreateBuffer(g_device, &descriptor); + }; + createBuffer(g_uniformBuffer, WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst, UniformBufferSize, + "Shared Uniform Buffer"); + createBuffer(g_vertexBuffer, WGPUBufferUsage_Vertex | WGPUBufferUsage_CopyDst, VertexBufferSize, + "Shared Vertex Buffer"); + createBuffer(g_indexBuffer, WGPUBufferUsage_Index | WGPUBufferUsage_CopyDst, IndexBufferSize, "Shared Index Buffer"); + createBuffer(g_storageBuffer, WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst, StorageBufferSize, + "Shared Storage Buffer"); + for (int i = 0; i < g_stagingBuffers.size(); ++i) { + const auto label = fmt::format(FMT_STRING("Staging Buffer {}"), i); + createBuffer(g_stagingBuffers[i], WGPUBufferUsage_MapWrite | WGPUBufferUsage_CopySrc, StagingBufferSize, + label.c_str()); + } + map_staging_buffer(); + + g_state.stream = stream::construct_state(); + g_state.model = model::construct_state(); + + { + // Load serialized pipeline cache + std::string path = std::string{g_config.configPath} + "/pipeline_cache.bin"; + std::ifstream file(path, std::ios::in | std::ios::binary | std::ios::ate); + if (file) { + const auto size = file.tellg(); + file.seekg(0, std::ios::beg); + constexpr size_t headerSize = sizeof(g_serializedPipelineCount); + if (size != -1 && size > headerSize) { + g_serializedPipelines.append_zeroes(size_t(size) - headerSize); + file.read(reinterpret_cast(&g_serializedPipelineCount), headerSize); + file.read(reinterpret_cast(g_serializedPipelines.data()), size_t(size) - headerSize); + } + } + } + if (g_serializedPipelineCount > 0) { + size_t offset = 0; + while (offset < g_serializedPipelines.size()) { + ShaderType type = *reinterpret_cast(g_serializedPipelines.data() + offset); + offset += sizeof(ShaderType); + u32 size = *reinterpret_cast(g_serializedPipelines.data() + offset); + offset += sizeof(u32); + switch (type) { + case ShaderType::Stream: { + if (size != sizeof(stream::PipelineConfig)) { + break; + } + const auto config = *reinterpret_cast(g_serializedPipelines.data() + offset); + if (config.version != gx::GXPipelineConfigVersion) { + break; + } + find_pipeline( + type, config, [=]() { return stream::create_pipeline(g_state.stream, config); }, false); + } break; + case ShaderType::Model: { + if (size != sizeof(model::PipelineConfig)) { + break; + } + const auto config = *reinterpret_cast(g_serializedPipelines.data() + offset); + if (config.version != gx::GXPipelineConfigVersion) { + 
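          // Cache entry was written with an older shader config layout; skip it here
          // and let find_pipeline() rebuild and re-serialize it on first use.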
break; + } + find_pipeline( + type, config, [=]() { return model::create_pipeline(g_state.model, config); }, false); + } break; + default: + Log.report(LOG_WARNING, FMT_STRING("Unknown pipeline type {}"), static_cast(type)); + break; + } + offset += size; + } + } +} + +void shutdown() { + if (g_hasPipelineThread) { + g_pipelineThreadEnd = true; + g_pipelineCv.notify_all(); + g_pipelineThread.join(); + } + + { + // Write serialized pipelines to file + const auto path = std::string{g_config.configPath} + "pipeline_cache.bin"; + std::ofstream file(path, std::ios::out | std::ios::trunc | std::ios::binary); + if (file) { + file.write(reinterpret_cast(&g_serializedPipelineCount), sizeof(g_serializedPipelineCount)); + file.write(reinterpret_cast(g_serializedPipelines.data()), g_serializedPipelines.size()); + } + g_serializedPipelines.clear(); + g_serializedPipelineCount = 0; + } + + gx::shutdown(); + + g_resolvedTextures.clear(); + g_textureUploads.clear(); + for (const auto& item : g_cachedBindGroups) { + wgpuBindGroupRelease(item.second); + } + g_cachedBindGroups.clear(); + for (const auto& item : g_cachedSamplers) { + wgpuSamplerRelease(item.second); + } + g_cachedSamplers.clear(); + for (const auto& item : g_pipelines) { + wgpuRenderPipelineRelease(item.second); + } + g_pipelines.clear(); + g_queuedPipelines.clear(); + if (g_vertexBuffer != nullptr) { + wgpuBufferDestroy(g_vertexBuffer); + g_vertexBuffer = nullptr; + } + if (g_uniformBuffer != nullptr) { + wgpuBufferDestroy(g_uniformBuffer); + g_uniformBuffer = nullptr; + } + if (g_indexBuffer != nullptr) { + wgpuBufferDestroy(g_indexBuffer); + g_indexBuffer = nullptr; + } + if (g_storageBuffer != nullptr) { + wgpuBufferDestroy(g_storageBuffer); + g_storageBuffer = nullptr; + } + for (auto& item : g_stagingBuffers) { + if (item != nullptr) { + wgpuBufferDestroy(item); + } + item = nullptr; + } + g_renderPasses.clear(); + g_currentRenderPass = UINT32_MAX; + + g_state = {}; + + queuedPipelines = 0; + createdPipelines = 0; +} + +static size_t currentStagingBuffer = 0; +static bool bufferMapped = false; +void map_staging_buffer() { + bufferMapped = false; + wgpuBufferMapAsync( + g_stagingBuffers[currentStagingBuffer], WGPUMapMode_Write, 0, StagingBufferSize, + [](WGPUBufferMapAsyncStatus status, void* userdata) { + if (status == WGPUBufferMapAsyncStatus_DestroyedBeforeCallback) { + return; + } else if (status != WGPUBufferMapAsyncStatus_Success) { + Log.report(LOG_FATAL, FMT_STRING("Buffer mapping failed: {}"), status); + unreachable(); + } + *static_cast(userdata) = true; + }, + &bufferMapped); +} + +void begin_frame() { + while (!bufferMapped) { + wgpuDeviceTick(g_device); + } + size_t bufferOffset = 0; + auto& stagingBuf = g_stagingBuffers[currentStagingBuffer]; + const auto mapBuffer = [&](ByteBuffer& buf, uint64_t size) { + if (size <= 0) { + return; + } + buf = ByteBuffer{static_cast(wgpuBufferGetMappedRange(stagingBuf, bufferOffset, size)), + static_cast(size)}; + bufferOffset += size; + }; + mapBuffer(g_verts, VertexBufferSize); + mapBuffer(g_uniforms, UniformBufferSize); + mapBuffer(g_indices, IndexBufferSize); + mapBuffer(g_storage, StorageBufferSize); + mapBuffer(g_textureUpload, TextureUploadSize); + + g_renderPasses.emplace_back(); + g_renderPasses[0].clearColor = gx::g_gxState.clearColor; + g_currentRenderPass = 0; +// push_command(CommandType::SetViewport, Command::Data{.setViewport = g_cachedViewport}); +// push_command(CommandType::SetScissor, Command::Data{.setScissor = g_cachedScissor}); +} + +// for imgui debug +size_t 
g_lastVertSize; +size_t g_lastUniformSize; +size_t g_lastIndexSize; +size_t g_lastStorageSize; + +void end_frame(WGPUCommandEncoder cmd) { + uint64_t bufferOffset = 0; + const auto writeBuffer = [&](ByteBuffer& buf, WGPUBuffer& out, uint64_t size, std::string_view label) { + const auto writeSize = buf.size(); // Only need to copy this many bytes + if (writeSize > 0) { + wgpuCommandEncoderCopyBufferToBuffer(cmd, g_stagingBuffers[currentStagingBuffer], bufferOffset, out, 0, + writeSize); + buf.clear(); + } + bufferOffset += size; + return writeSize; + }; + wgpuBufferUnmap(g_stagingBuffers[currentStagingBuffer]); + g_lastVertSize = writeBuffer(g_verts, g_vertexBuffer, VertexBufferSize, "Vertex"); + g_lastUniformSize = writeBuffer(g_uniforms, g_uniformBuffer, UniformBufferSize, "Uniform"); + g_lastIndexSize = writeBuffer(g_indices, g_indexBuffer, IndexBufferSize, "Index"); + g_lastStorageSize = writeBuffer(g_storage, g_storageBuffer, StorageBufferSize, "Storage"); + { + // Perform texture copies + for (const auto& item : g_textureUploads) { + const WGPUImageCopyBuffer buf{ + .layout = + WGPUTextureDataLayout{ + .offset = item.layout.offset + bufferOffset, + .bytesPerRow = ALIGN(item.layout.bytesPerRow, 256), + .rowsPerImage = item.layout.rowsPerImage, + }, + .buffer = g_stagingBuffers[currentStagingBuffer], + }; + wgpuCommandEncoderCopyBufferToTexture(cmd, &buf, &item.tex, &item.size); + } + g_textureUploads.clear(); + g_textureUpload.clear(); + } + currentStagingBuffer = (currentStagingBuffer + 1) % g_stagingBuffers.size(); + map_staging_buffer(); + g_currentRenderPass = UINT32_MAX; +} + +void render(WGPUCommandEncoder cmd) { + for (u32 i = 0; i < g_renderPasses.size(); ++i) { + const auto& passInfo = g_renderPasses[i]; + bool finalPass = i == g_renderPasses.size() - 1; + if (finalPass && passInfo.resolveTarget != UINT32_MAX) { + Log.report(LOG_FATAL, FMT_STRING("Final render pass must not have resolve target")); + unreachable(); + } + const std::array attachments{ + WGPURenderPassColorAttachment{ + .view = webgpu::g_frameBuffer.view, + .resolveTarget = webgpu::g_graphicsConfig.msaaSamples > 1 ? webgpu::g_frameBufferResolved.view : nullptr, + .loadOp = passInfo.clear ? WGPULoadOp_Clear : WGPULoadOp_Load, + .storeOp = WGPUStoreOp_Store, + .clearColor = {NAN, NAN, NAN, NAN}, + .clearValue = + { + .r = passInfo.clearColor.x(), + .g = passInfo.clearColor.y(), + .b = passInfo.clearColor.z(), + .a = passInfo.clearColor.w(), + }, + }, + }; + const WGPURenderPassDepthStencilAttachment depthStencilAttachment{ + .view = webgpu::g_depthBuffer.view, + .depthLoadOp = passInfo.clear ? 
WGPULoadOp_Clear : WGPULoadOp_Load, + .depthStoreOp = WGPUStoreOp_Store, + .clearDepth = NAN, + .depthClearValue = 1.f, + }; + const auto label = fmt::format(FMT_STRING("Render pass {}"), i); + const WGPURenderPassDescriptor renderPassDescriptor{ + .label = label.c_str(), + .colorAttachmentCount = attachments.size(), + .colorAttachments = attachments.data(), + .depthStencilAttachment = &depthStencilAttachment, + }; + auto pass = wgpuCommandEncoderBeginRenderPass(cmd, &renderPassDescriptor); + render_pass(pass, i); + wgpuRenderPassEncoderEnd(pass); + wgpuRenderPassEncoderRelease(pass); + + if (passInfo.resolveTarget != UINT32_MAX) { + WGPUImageCopyTexture src{ + .origin = + WGPUOrigin3D{ + .x = static_cast(passInfo.resolveRect.x), + .y = static_cast(passInfo.resolveRect.y), + }, + }; + if (webgpu::g_graphicsConfig.msaaSamples > 1) { + src.texture = webgpu::g_frameBufferResolved.texture; + } else { + src.texture = webgpu::g_frameBuffer.texture; + } + auto& target = g_resolvedTextures[passInfo.resolveTarget]; + const WGPUImageCopyTexture dst{ + .texture = target->texture, + }; + const WGPUExtent3D size{ + .width = static_cast(passInfo.resolveRect.width), + .height = static_cast(passInfo.resolveRect.height), + .depthOrArrayLayers = 1, + }; + wgpuCommandEncoderCopyTextureToTexture(cmd, &src, &dst, &size); + } + } + g_renderPasses.clear(); +} + +void render_pass(WGPURenderPassEncoder pass, u32 idx) { + g_currentPipeline = UINTPTR_MAX; +#ifdef AURORA_GFX_DEBUG_GROUPS + std::vector lastDebugGroupStack; +#endif + + for (const auto& cmd : g_renderPasses[idx].commands) { +#ifdef AURORA_GFX_DEBUG_GROUPS + { + size_t firstDiff = lastDebugGroupStack.size(); + for (size_t i = 0; i < lastDebugGroupStack.size(); ++i) { + if (i >= cmd.debugGroupStack.size() || cmd.debugGroupStack[i] != lastDebugGroupStack[i]) { + firstDiff = i; + break; + } + } + for (size_t i = firstDiff; i < lastDebugGroupStack.size(); ++i) { + wgpuRenderPassEncoderPopDebugGroup(pass); + } + for (size_t i = firstDiff; i < cmd.debugGroupStack.size(); ++i) { + wgpuRenderPassEncoderPushDebugGroup(pass, cmd.debugGroupStack[i].c_str()); + } + lastDebugGroupStack = cmd.debugGroupStack; + } +#endif + switch (cmd.type) { + case CommandType::SetViewport: { + const auto& vp = cmd.data.setViewport; + wgpuRenderPassEncoderSetViewport(pass, vp.left, vp.top, vp.width, vp.height, vp.znear, vp.zfar); + } break; + case CommandType::SetScissor: { + const auto& sc = cmd.data.setScissor; + wgpuRenderPassEncoderSetScissorRect(pass, sc.x, sc.y, sc.w, sc.h); + } break; + case CommandType::Draw: { + const auto& draw = cmd.data.draw; + switch (draw.type) { + case ShaderType::Stream: + stream::render(g_state.stream, draw.stream, pass); + break; + case ShaderType::Model: + model::render(g_state.model, draw.model, pass); + break; + } + } break; + } + } + +#ifdef AURORA_GFX_DEBUG_GROUPS + for (size_t i = 0; i < lastDebugGroupStack.size(); ++i) { + wgpuRenderPassEncoderPopDebugGroup(pass); + } +#endif +} + +bool bind_pipeline(PipelineRef ref, WGPURenderPassEncoder pass) { + if (ref == g_currentPipeline) { + return true; + } + std::lock_guard guard{g_pipelineMutex}; + const auto it = g_pipelines.find(ref); + if (it == g_pipelines.end()) { + return false; + } + wgpuRenderPassEncoderSetPipeline(pass, it->second); + g_currentPipeline = ref; + return true; +} + +static inline Range push(ByteBuffer& target, const uint8_t* data, size_t length, size_t alignment) { + size_t padding = 0; + if (alignment != 0) { + padding = alignment - length % alignment; + } + auto begin = 
target.size(); + if (length == 0) { + length = alignment; + target.append_zeroes(alignment); + } else { + target.append(data, length); + if (padding > 0) { + target.append_zeroes(padding); + } + } + return {static_cast(begin), static_cast(length + padding)}; +} +static inline Range map(ByteBuffer& target, size_t length, size_t alignment) { + size_t padding = 0; + if (alignment != 0) { + padding = alignment - length % alignment; + } + if (length == 0) { + length = alignment; + } + auto begin = target.size(); + target.append_zeroes(length + padding); + return {static_cast(begin), static_cast(length + padding)}; +} +Range push_verts(const uint8_t* data, size_t length) { return push(g_verts, data, length, 4); } +Range push_indices(const uint8_t* data, size_t length) { return push(g_indices, data, length, 4); } +Range push_uniform(const uint8_t* data, size_t length) { + return push(g_uniforms, data, length, g_cachedLimits.limits.minUniformBufferOffsetAlignment); +} +Range push_storage(const uint8_t* data, size_t length) { + return push(g_storage, data, length, g_cachedLimits.limits.minStorageBufferOffsetAlignment); +} +Range push_texture_data(const uint8_t* data, size_t length, u32 bytesPerRow, u32 rowsPerImage) { + // For CopyBufferToTexture, we need an alignment of 256 per row (see Dawn kTextureBytesPerRowAlignment) + const auto copyBytesPerRow = ALIGN(bytesPerRow, 256); + const auto range = map(g_textureUpload, copyBytesPerRow * rowsPerImage, 0); + u8* dst = g_textureUpload.data() + range.offset; + for (u32 i = 0; i < rowsPerImage; ++i) { + memcpy(dst, data, bytesPerRow); + data += bytesPerRow; + dst += copyBytesPerRow; + } + return range; +} +std::pair map_verts(size_t length) { + const auto range = map(g_verts, length, 4); + return {ByteBuffer{g_verts.data() + range.offset, range.size}, range}; +} +std::pair map_indices(size_t length) { + const auto range = map(g_indices, length, 4); + return {ByteBuffer{g_indices.data() + range.offset, range.size}, range}; +} +std::pair map_uniform(size_t length) { + const auto range = map(g_uniforms, length, g_cachedLimits.limits.minUniformBufferOffsetAlignment); + return {ByteBuffer{g_uniforms.data() + range.offset, range.size}, range}; +} +std::pair map_storage(size_t length) { + const auto range = map(g_storage, length, g_cachedLimits.limits.minStorageBufferOffsetAlignment); + return {ByteBuffer{g_storage.data() + range.offset, range.size}, range}; +} + +BindGroupRef bind_group_ref(const WGPUBindGroupDescriptor& descriptor) { + const auto id = xxh3_hash(descriptor); + if (!g_cachedBindGroups.contains(id)) { + g_cachedBindGroups.try_emplace(id, wgpuDeviceCreateBindGroup(g_device, &descriptor)); + } + return id; +} +WGPUBindGroup find_bind_group(BindGroupRef id) { + const auto it = g_cachedBindGroups.find(id); + if (it == g_cachedBindGroups.end()) { + Log.report(LOG_FATAL, FMT_STRING("get_bind_group: failed to locate {}"), id); + unreachable(); + } + return it->second; +} + +WGPUSampler sampler_ref(const WGPUSamplerDescriptor& descriptor) { + const auto id = xxh3_hash(descriptor); + auto it = g_cachedSamplers.find(id); + if (it == g_cachedSamplers.end()) { + it = g_cachedSamplers.try_emplace(id, wgpuDeviceCreateSampler(g_device, &descriptor)).first; + } + return it->second; +} + +uint32_t align_uniform(uint32_t value) { return ALIGN(value, g_cachedLimits.limits.minUniformBufferOffsetAlignment); } +} // namespace aurora::gfx + +void push_debug_group(const char* label) { +#ifdef AURORA_GFX_DEBUG_GROUPS + aurora::gfx::g_debugGroupStack.emplace_back(label); 
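  // The label is only recorded here; render_pass() later diffs each command's
  // captured stack against the previous one and emits the matching
  // wgpuRenderPassEncoderPush/PopDebugGroup calls on the encoder.
  // Typical usage (illustrative):
  //   push_debug_group("Draw HUD");
  //   /* ...draw calls... */
  //   pop_debug_group();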
+#endif +} +void pop_debug_group() { +#ifdef AURORA_GFX_DEBUG_GROUPS + aurora::gfx::g_debugGroupStack.pop_back(); +#endif +} diff --git a/lib/gfx/common.hpp b/lib/gfx/common.hpp new file mode 100644 index 0000000..732db6c --- /dev/null +++ b/lib/gfx/common.hpp @@ -0,0 +1,205 @@ +#pragma once + +#include "../internal.hpp" + +#include +#include +#include + +#include +#include + +namespace aurora { +#if INTPTR_MAX == INT32_MAX +using HashType = XXH32_hash_t; +#else +using HashType = XXH64_hash_t; +#endif +static inline HashType xxh3_hash_s(const void* input, size_t len, HashType seed = 0) { + return static_cast(XXH3_64bits_withSeed(input, len, seed)); +} +template +static inline HashType xxh3_hash(const T& input, HashType seed = 0) { + // Validate that the type has no padding bytes, which can easily cause + // hash mismatches. This also disallows floats, but that's okay for us. + static_assert(std::has_unique_object_representations_v); + return xxh3_hash_s(&input, sizeof(T), seed); +} + +class ByteBuffer { +public: + ByteBuffer() noexcept = default; + explicit ByteBuffer(size_t size) noexcept + : m_data(static_cast(calloc(1, size))), m_length(size), m_capacity(size) {} + explicit ByteBuffer(uint8_t* data, size_t size) noexcept + : m_data(data), m_length(0), m_capacity(size), m_owned(false) {} + ~ByteBuffer() noexcept { + if (m_data != nullptr && m_owned) { + free(m_data); + } + } + ByteBuffer(ByteBuffer&& rhs) noexcept + : m_data(rhs.m_data), m_length(rhs.m_length), m_capacity(rhs.m_capacity), m_owned(rhs.m_owned) { + rhs.m_data = nullptr; + rhs.m_length = 0; + rhs.m_capacity = 0; + rhs.m_owned = true; + } + ByteBuffer& operator=(ByteBuffer&& rhs) noexcept { + if (m_data != nullptr && m_owned) { + free(m_data); + } + m_data = rhs.m_data; + m_length = rhs.m_length; + m_capacity = rhs.m_capacity; + m_owned = rhs.m_owned; + rhs.m_data = nullptr; + rhs.m_length = 0; + rhs.m_capacity = 0; + rhs.m_owned = true; + return *this; + } + ByteBuffer(ByteBuffer const&) = delete; + ByteBuffer& operator=(ByteBuffer const&) = delete; + + [[nodiscard]] uint8_t* data() noexcept { return m_data; } + [[nodiscard]] const uint8_t* data() const noexcept { return m_data; } + [[nodiscard]] size_t size() const noexcept { return m_length; } + [[nodiscard]] bool empty() const noexcept { return m_length == 0; } + + void append(const void* data, size_t size) { + resize(m_length + size, false); + memcpy(m_data + m_length, data, size); + m_length += size; + } + + void append_zeroes(size_t size) { + resize(m_length + size, true); + m_length += size; + } + + void clear() { + if (m_data != nullptr && m_owned) { + free(m_data); + } + m_data = nullptr; + m_length = 0; + m_capacity = 0; + m_owned = true; + } + + void reserve_extra(size_t size) { resize(m_length + size, true); } + +private: + uint8_t* m_data = nullptr; + size_t m_length = 0; + size_t m_capacity = 0; + bool m_owned = true; + + void resize(size_t size, bool zeroed) { + if (size == 0) { + clear(); + } else if (m_data == nullptr) { + if (zeroed) { + m_data = static_cast(calloc(1, size)); + } else { + m_data = static_cast(malloc(size)); + } + m_owned = true; + } else if (size > m_capacity) { + if (!m_owned) { + abort(); + } + m_data = static_cast(realloc(m_data, size)); + if (zeroed) { + memset(m_data + m_capacity, 0, size - m_capacity); + } + } else { + return; + } + m_capacity = size; + } +}; +} // namespace aurora + +namespace aurora::gfx { +extern WGPUBuffer g_vertexBuffer; +extern WGPUBuffer g_uniformBuffer; +extern WGPUBuffer g_indexBuffer; +extern WGPUBuffer 
g_storageBuffer; +extern size_t g_staticStorageLastSize; + +using BindGroupRef = HashType; +using PipelineRef = HashType; +using SamplerRef = HashType; +using ShaderRef = HashType; +struct Range { + uint32_t offset = 0; + uint32_t size = 0; + + inline bool operator==(const Range& rhs) { return offset == rhs.offset && size == rhs.size; } +}; + +enum class ShaderType { + Stream, + Model, +}; + +void initialize(); +void shutdown(); + +void begin_frame(); +void end_frame(WGPUCommandEncoder cmd); +void render(WGPUCommandEncoder cmd); +void render_pass(WGPURenderPassEncoder pass, uint32_t idx); +void map_staging_buffer(); + +Range push_verts(const uint8_t* data, size_t length); +template +static inline Range push_verts(ArrayRef data) { + return push_verts(reinterpret_cast(data.data()), data.size() * sizeof(T)); +} +Range push_indices(const uint8_t* data, size_t length); +template +static inline Range push_indices(ArrayRef data) { + return push_indices(reinterpret_cast(data.data()), data.size() * sizeof(T)); +} +Range push_uniform(const uint8_t* data, size_t length); +template +static inline Range push_uniform(const T& data) { + return push_uniform(reinterpret_cast(&data), sizeof(T)); +} +Range push_storage(const uint8_t* data, size_t length); +template +static inline Range push_storage(ArrayRef data) { + return push_storage(reinterpret_cast(data.data()), data.size() * sizeof(T)); +} +template +static inline Range push_storage(const T& data) { + return push_storage(reinterpret_cast(&data), sizeof(T)); +} +Range push_texture_data(const uint8_t* data, size_t length, uint32_t bytesPerRow, uint32_t rowsPerImage); +std::pair map_verts(size_t length); +std::pair map_indices(size_t length); +std::pair map_uniform(size_t length); +std::pair map_storage(size_t length); + +template +const State& get_state(); +template +void push_draw_command(DrawData data); + +template +PipelineRef pipeline_ref(PipelineConfig config); +bool bind_pipeline(PipelineRef ref, WGPURenderPassEncoder pass); + +BindGroupRef bind_group_ref(const WGPUBindGroupDescriptor& descriptor); +WGPUBindGroup find_bind_group(BindGroupRef id); + +WGPUSampler sampler_ref(const WGPUSamplerDescriptor& descriptor); + +uint32_t align_uniform(uint32_t value); + +void set_viewport(float left, float top, float width, float height, float znear, float zfar) noexcept; +void set_scissor(uint32_t x, uint32_t y, uint32_t w, uint32_t h) noexcept; +} // namespace aurora::gfx diff --git a/lib/gfx/gx.cpp b/lib/gfx/gx.cpp new file mode 100644 index 0000000..1a12e6a --- /dev/null +++ b/lib/gfx/gx.cpp @@ -0,0 +1,794 @@ +#include "gx.hpp" + +#include "../webgpu/gpu.hpp" +#include "../window.hpp" +#include "../internal.hpp" +#include "common.hpp" + +#include +#include +#include + +using aurora::gfx::gx::g_gxState; +static aurora::Module Log("aurora::gx"); + +namespace aurora::gfx { +static Module Log("aurora::gfx::gx"); + +namespace gx { +using webgpu::g_device; +using webgpu::g_graphicsConfig; + +GXState g_gxState{}; + +const TextureBind& get_texture(GXTexMapID id) noexcept { return g_gxState.textures[static_cast(id)]; } + +static inline WGPUBlendFactor to_blend_factor(GXBlendFactor fac, bool isDst) { + switch (fac) { + case GX_BL_ZERO: + return WGPUBlendFactor_Zero; + case GX_BL_ONE: + return WGPUBlendFactor_One; + case GX_BL_SRCCLR: // + GX_BL_DSTCLR + if (isDst) { + return WGPUBlendFactor_Src; + } else { + return WGPUBlendFactor_Dst; + } + case GX_BL_INVSRCCLR: // + GX_BL_INVDSTCLR + if (isDst) { + return WGPUBlendFactor_OneMinusSrc; + } else { + return 
WGPUBlendFactor_OneMinusDst; + } + case GX_BL_SRCALPHA: + return WGPUBlendFactor_SrcAlpha; + case GX_BL_INVSRCALPHA: + return WGPUBlendFactor_OneMinusSrcAlpha; + case GX_BL_DSTALPHA: + return WGPUBlendFactor_DstAlpha; + case GX_BL_INVDSTALPHA: + return WGPUBlendFactor_OneMinusDstAlpha; + default: + Log.report(LOG_FATAL, FMT_STRING("invalid blend factor {}"), fac); + unreachable(); + } +} + +static inline WGPUCompareFunction to_compare_function(GXCompare func) { + switch (func) { + case GX_NEVER: + return WGPUCompareFunction_Never; + case GX_LESS: + return WGPUCompareFunction_Less; + case GX_EQUAL: + return WGPUCompareFunction_Equal; + case GX_LEQUAL: + return WGPUCompareFunction_LessEqual; + case GX_GREATER: + return WGPUCompareFunction_Greater; + case GX_NEQUAL: + return WGPUCompareFunction_NotEqual; + case GX_GEQUAL: + return WGPUCompareFunction_GreaterEqual; + case GX_ALWAYS: + return WGPUCompareFunction_Always; + default: + Log.report(LOG_FATAL, FMT_STRING("invalid depth fn {}"), func); + unreachable(); + } +} + +static inline WGPUBlendState to_blend_state(GXBlendMode mode, GXBlendFactor srcFac, GXBlendFactor dstFac, GXLogicOp op, + u32 dstAlpha) { + WGPUBlendComponent colorBlendComponent; + switch (mode) { + case GX_BM_NONE: + colorBlendComponent = { + .operation = WGPUBlendOperation_Add, + .srcFactor = WGPUBlendFactor_One, + .dstFactor = WGPUBlendFactor_Zero, + }; + break; + case GX_BM_BLEND: + colorBlendComponent = { + .operation = WGPUBlendOperation_Add, + .srcFactor = to_blend_factor(srcFac, false), + .dstFactor = to_blend_factor(dstFac, true), + }; + break; + case GX_BM_SUBTRACT: + colorBlendComponent = { + .operation = WGPUBlendOperation_ReverseSubtract, + .srcFactor = WGPUBlendFactor_One, + .dstFactor = WGPUBlendFactor_One, + }; + break; + case GX_BM_LOGIC: + switch (op) { + case GX_LO_CLEAR: + colorBlendComponent = { + .operation = WGPUBlendOperation_Add, + .srcFactor = WGPUBlendFactor_Zero, + .dstFactor = WGPUBlendFactor_Zero, + }; + break; + case GX_LO_COPY: + colorBlendComponent = { + .operation = WGPUBlendOperation_Add, + .srcFactor = WGPUBlendFactor_One, + .dstFactor = WGPUBlendFactor_Zero, + }; + break; + case GX_LO_NOOP: + colorBlendComponent = { + .operation = WGPUBlendOperation_Add, + .srcFactor = WGPUBlendFactor_Zero, + .dstFactor = WGPUBlendFactor_One, + }; + break; + default: + Log.report(LOG_FATAL, FMT_STRING("unsupported logic op {}"), op); + unreachable(); + } + break; + default: + Log.report(LOG_FATAL, FMT_STRING("unsupported blend mode {}"), mode); + unreachable(); + } + WGPUBlendComponent alphaBlendComponent{ + .operation = WGPUBlendOperation_Add, + .srcFactor = WGPUBlendFactor_One, + .dstFactor = WGPUBlendFactor_Zero, + }; + if (dstAlpha != UINT32_MAX) { + alphaBlendComponent = WGPUBlendComponent{ + .operation = WGPUBlendOperation_Add, + .srcFactor = WGPUBlendFactor_Constant, + .dstFactor = WGPUBlendFactor_Zero, + }; + } + return { + .color = colorBlendComponent, + .alpha = alphaBlendComponent, + }; +} + +static inline WGPUColorWriteMaskFlags to_write_mask(bool colorUpdate, bool alphaUpdate) { + WGPUColorWriteMaskFlags writeMask = WGPUColorWriteMask_None; + if (colorUpdate) { + writeMask |= WGPUColorWriteMask_Red | WGPUColorWriteMask_Green | WGPUColorWriteMask_Blue; + } + if (alphaUpdate) { + writeMask |= WGPUColorWriteMask_Alpha; + } + return writeMask; +} + +static inline WGPUPrimitiveState to_primitive_state(GXPrimitive gx_prim, GXCullMode gx_cullMode) { + WGPUPrimitiveTopology primitive = WGPUPrimitiveTopology_TriangleList; + switch (gx_prim) { + case 
GX_TRIANGLES: + break; + case GX_TRIANGLESTRIP: + primitive = WGPUPrimitiveTopology_TriangleStrip; + break; + default: + Log.report(LOG_FATAL, FMT_STRING("Unsupported primitive type {}"), gx_prim); + unreachable(); + } + WGPUCullMode cullMode = WGPUCullMode_None; + switch (gx_cullMode) { + case GX_CULL_FRONT: + cullMode = WGPUCullMode_Front; + break; + case GX_CULL_BACK: + cullMode = WGPUCullMode_Back; + break; + case GX_CULL_NONE: + break; + default: + Log.report(LOG_FATAL, FMT_STRING("Unsupported cull mode {}"), gx_cullMode); + unreachable(); + } + return { + .topology = primitive, + .frontFace = WGPUFrontFace_CW, + .cullMode = cullMode, + }; +} + +WGPURenderPipeline build_pipeline(const PipelineConfig& config, const ShaderInfo& info, + ArrayRef vtxBuffers, WGPUShaderModule shader, + const char* label) noexcept { + const WGPUDepthStencilState depthStencil{ + .format = g_graphicsConfig.depthFormat, + .depthWriteEnabled = config.depthUpdate, + .depthCompare = to_compare_function(config.depthFunc), + .stencilFront = + WGPUStencilFaceState{ + .compare = WGPUCompareFunction_Always, + }, + .stencilBack = + WGPUStencilFaceState{ + .compare = WGPUCompareFunction_Always, + }, + }; + const auto blendState = + to_blend_state(config.blendMode, config.blendFacSrc, config.blendFacDst, config.blendOp, config.dstAlpha); + const std::array colorTargets{WGPUColorTargetState{ + .format = g_graphicsConfig.colorFormat, + .blend = &blendState, + .writeMask = to_write_mask(config.colorUpdate, config.alphaUpdate), + }}; + const WGPUFragmentState fragmentState{ + .module = shader, + .entryPoint = "fs_main", + .targetCount = colorTargets.size(), + .targets = colorTargets.data(), + }; + auto layouts = build_bind_group_layouts(info, config.shaderConfig); + const std::array bindGroupLayouts{ + layouts.uniformLayout, + layouts.samplerLayout, + layouts.textureLayout, + }; + const WGPUPipelineLayoutDescriptor pipelineLayoutDescriptor{ + .label = "GX Pipeline Layout", + .bindGroupLayoutCount = static_cast(info.sampledTextures.any() ? 
bindGroupLayouts.size() : 1), + .bindGroupLayouts = bindGroupLayouts.data(), + }; + auto pipelineLayout = wgpuDeviceCreatePipelineLayout(g_device, &pipelineLayoutDescriptor); + const WGPURenderPipelineDescriptor descriptor{ + .label = label, + .layout = pipelineLayout, + .vertex = + { + .module = shader, + .entryPoint = "vs_main", + .bufferCount = static_cast(vtxBuffers.size()), + .buffers = vtxBuffers.data(), + }, + .primitive = to_primitive_state(config.primitive, config.cullMode), + .depthStencil = &depthStencil, + .multisample = + WGPUMultisampleState{ + .count = g_graphicsConfig.msaaSamples, + .mask = UINT32_MAX, + }, + .fragment = &fragmentState, + }; + auto pipeline = wgpuDeviceCreateRenderPipeline(g_device, &descriptor); + wgpuPipelineLayoutRelease(pipelineLayout); + return pipeline; +} + +void populate_pipeline_config(PipelineConfig& config, GXPrimitive primitive) noexcept { + config.shaderConfig.fogType = g_gxState.fog.type; + config.shaderConfig.vtxAttrs = g_gxState.vtxDesc; + int lastIndexedAttr = -1; + for (int i = 0; i < GX_VA_MAX_ATTR; ++i) { + const auto type = g_gxState.vtxDesc[i]; + if (type != GX_INDEX8 && type != GX_INDEX16) { + config.shaderConfig.attrMapping[i] = GX_VA_NULL; + continue; + } + const auto& array = g_gxState.arrays[i]; + if (lastIndexedAttr >= 0 && array == g_gxState.arrays[lastIndexedAttr]) { + // Map attribute to previous attribute + config.shaderConfig.attrMapping[i] = config.shaderConfig.attrMapping[lastIndexedAttr]; + } else { + // Map attribute to its own storage + config.shaderConfig.attrMapping[i] = static_cast(i); + } + lastIndexedAttr = i; + } + config.shaderConfig.tevSwapTable = g_gxState.tevSwapTable; + for (u8 i = 0; i < g_gxState.numTevStages; ++i) { + config.shaderConfig.tevStages[i] = g_gxState.tevStages[i]; + } + config.shaderConfig.tevStageCount = g_gxState.numTevStages; + for (u8 i = 0; i < g_gxState.numChans * 2; ++i) { + const auto& cc = g_gxState.colorChannelConfig[i]; + if (cc.lightingEnabled) { + config.shaderConfig.colorChannels[i] = cc; + } else { + // Only matSrc matters when lighting disabled + config.shaderConfig.colorChannels[i] = { + .matSrc = cc.matSrc, + }; + } + } + for (u8 i = 0; i < g_gxState.numTexGens; ++i) { + config.shaderConfig.tcgs[i] = g_gxState.tcgs[i]; + } + if (g_gxState.alphaCompare) { + config.shaderConfig.alphaCompare = g_gxState.alphaCompare; + } + config.shaderConfig.indexedAttributeCount = + std::count_if(config.shaderConfig.vtxAttrs.begin(), config.shaderConfig.vtxAttrs.end(), + [](const auto type) { return type == GX_INDEX8 || type == GX_INDEX16; }); + for (u8 i = 0; i < MaxTextures; ++i) { + const auto& bind = g_gxState.textures[i]; + TextureConfig texConfig{}; + if (bind.texObj.ref) { + if (requires_copy_conversion(bind.texObj)) { + texConfig.copyFmt = bind.texObj.ref->gxFormat; + } + if (requires_load_conversion(bind.texObj)) { + texConfig.loadFmt = bind.texObj.fmt; + } + texConfig.renderTex = bind.texObj.ref->isRenderTexture; + } + config.shaderConfig.textureConfig[i] = texConfig; + } + config = { + .shaderConfig = config.shaderConfig, + .primitive = primitive, + .depthFunc = g_gxState.depthFunc, + .cullMode = g_gxState.cullMode, + .blendMode = g_gxState.blendMode, + .blendFacSrc = g_gxState.blendFacSrc, + .blendFacDst = g_gxState.blendFacDst, + .blendOp = g_gxState.blendOp, + .dstAlpha = g_gxState.dstAlpha, + .depthCompare = g_gxState.depthCompare, + .depthUpdate = g_gxState.depthUpdate, + .alphaUpdate = g_gxState.alphaUpdate, + .colorUpdate = g_gxState.colorUpdate, + }; +} + +Range 
build_uniform(const ShaderInfo& info) noexcept { + auto [buf, range] = map_uniform(info.uniformSize); + { + buf.append(&g_gxState.pnMtx[g_gxState.currentPnMtx], 128); + buf.append(&g_gxState.proj, 64); + } + for (int i = 0; i < info.loadsTevReg.size(); ++i) { + if (!info.loadsTevReg.test(i)) { + continue; + } + buf.append(&g_gxState.colorRegs[i], 16); + } + bool lightingEnabled = false; + for (int i = 0; i < info.sampledColorChannels.size(); ++i) { + if (!info.sampledColorChannels.test(i)) { + continue; + } + const auto& ccc = g_gxState.colorChannelConfig[i * 2]; + const auto& ccca = g_gxState.colorChannelConfig[i * 2 + 1]; + if (ccc.lightingEnabled || ccca.lightingEnabled) { + lightingEnabled = true; + break; + } + } + if (lightingEnabled) { + // Lights + static_assert(sizeof(g_gxState.lights) == 80 * GX::MaxLights); + buf.append(&g_gxState.lights, 80 * GX::MaxLights); + // Light state for all channels + for (int i = 0; i < 4; ++i) { + u32 lightState = g_gxState.colorChannelState[i].lightMask.to_ulong(); + buf.append(&lightState, 4); + } + } + for (int i = 0; i < info.sampledColorChannels.size(); ++i) { + if (!info.sampledColorChannels.test(i)) { + continue; + } + const auto& ccc = g_gxState.colorChannelConfig[i * 2]; + const auto& ccs = g_gxState.colorChannelState[i * 2]; + if (ccc.lightingEnabled && ccc.ambSrc == GX_SRC_REG) { + buf.append(&ccs.ambColor, 16); + } + if (ccc.matSrc == GX_SRC_REG) { + buf.append(&ccs.matColor, 16); + } + const auto& ccca = g_gxState.colorChannelConfig[i * 2 + 1]; + const auto& ccsa = g_gxState.colorChannelState[i * 2 + 1]; + if (ccca.lightingEnabled && ccca.ambSrc == GX_SRC_REG) { + buf.append(&ccsa.ambColor, 16); + } + if (ccca.matSrc == GX_SRC_REG) { + buf.append(&ccsa.matColor, 16); + } + } + for (int i = 0; i < info.sampledKColors.size(); ++i) { + if (!info.sampledKColors.test(i)) { + continue; + } + buf.append(&g_gxState.kcolors[i], 16); + } + for (int i = 0; i < info.usesTexMtx.size(); ++i) { + if (!info.usesTexMtx.test(i)) { + continue; + } + const auto& state = g_gxState; + switch (info.texMtxTypes[i]) { + case GX_TG_MTX2x4: + if (std::holds_alternative>(state.texMtxs[i])) { + buf.append(&std::get>(state.texMtxs[i]), 32); + } else if (std::holds_alternative>(g_gxState.texMtxs[i])) { + // TODO: SMB hits this? 
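          // Fall back to a 2x4 identity so the shader still receives a matrix of the
          // size it expects; the bound 3x4 matrix is effectively ignored in this case.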
+ Mat4x2 mtx{ + {1.f, 0.f}, + {0.f, 1.f}, + {0.f, 0.f}, + {0.f, 0.f}, + }; + buf.append(&mtx, 32); + } else { + Log.report(LOG_FATAL, FMT_STRING("expected 2x4 mtx in idx {}"), i); + unreachable(); + } + break; + case GX_TG_MTX3x4: + if (std::holds_alternative>(g_gxState.texMtxs[i])) { + const auto& mat = std::get>(g_gxState.texMtxs[i]); + buf.append(&mat, 64); + } else { + Log.report(LOG_FATAL, FMT_STRING("expected 3x4 mtx in idx {}"), i); + buf.append(&Mat4x4_Identity, 64); + } + break; + default: + Log.report(LOG_FATAL, FMT_STRING("unhandled tex mtx type {}"), info.texMtxTypes[i]); + unreachable(); + } + } + for (int i = 0; i < info.usesPTTexMtx.size(); ++i) { + if (!info.usesPTTexMtx.test(i)) { + continue; + } + buf.append(&g_gxState.ptTexMtxs[i], 64); + } + if (info.usesFog) { + const auto& state = g_gxState.fog; + struct Fog { + Vec4 color = state.color; + float a = 0.f; + float b = 0.5f; + float c = 0.f; + float pad = FLT_MAX; + } fog{}; + static_assert(sizeof(Fog) == 32); + if (state.nearZ != state.farZ && state.startZ != state.endZ) { + const float depthRange = state.farZ - state.nearZ; + const float fogRange = state.endZ - state.startZ; + fog.a = (state.farZ * state.nearZ) / (depthRange * fogRange); + fog.b = state.farZ / depthRange; + fog.c = state.startZ / fogRange; + } + buf.append(&fog, 32); + } + for (int i = 0; i < info.sampledTextures.size(); ++i) { + if (!info.sampledTextures.test(i)) { + continue; + } + const auto& tex = get_texture(static_cast(i)); + if (!tex) { + Log.report(LOG_FATAL, FMT_STRING("unbound texture {}"), i); + unreachable(); + } + buf.append(&tex.texObj.lodBias, 4); + } + return range; +} + +static absl::flat_hash_map sUniformBindGroupLayouts; +static absl::flat_hash_map> sTextureBindGroupLayouts; + +GXBindGroups build_bind_groups(const ShaderInfo& info, const ShaderConfig& config, + const BindGroupRanges& ranges) noexcept { + const auto layouts = build_bind_group_layouts(info, config); + + std::array uniformEntries{ + WGPUBindGroupEntry{ + .binding = 0, + .buffer = g_uniformBuffer, + .size = info.uniformSize, + }, + }; + u32 uniformBindIdx = 1; + for (u32 i = 0; i < GX_VA_MAX_ATTR; ++i) { + const Range& range = ranges.vaRanges[i]; + if (range.size <= 0) { + continue; + } + uniformEntries[uniformBindIdx] = WGPUBindGroupEntry{ + .binding = uniformBindIdx, + .buffer = g_storageBuffer, + .size = range.size, + }; + ++uniformBindIdx; + } + + std::array samplerEntries; + std::array textureEntries; + u32 samplerCount = 0; + u32 textureCount = 0; + for (u32 i = 0; i < info.sampledTextures.size(); ++i) { + if (!info.sampledTextures.test(i)) { + continue; + } + const auto& tex = g_gxState.textures[i]; + if (!tex) { + Log.report(LOG_FATAL, FMT_STRING("unbound texture {}"), i); + unreachable(); + } + samplerEntries[samplerCount] = { + .binding = samplerCount, + .sampler = sampler_ref(tex.get_descriptor()), + }; + ++samplerCount; + textureEntries[textureCount] = { + .binding = textureCount, + .textureView = tex.texObj.ref->view, + }; + ++textureCount; + // Load palette + const auto& texConfig = config.textureConfig[i]; + if (is_palette_format(texConfig.loadFmt)) { + u32 tlut = tex.texObj.tlut; + if (tlut < GX_TLUT0 || tlut > GX_TLUT7) { + Log.report(LOG_FATAL, FMT_STRING("tlut out of bounds {}"), tlut); + unreachable(); + } else if (!g_gxState.tluts[tlut].ref) { + Log.report(LOG_FATAL, FMT_STRING("tlut unbound {}"), tlut); + unreachable(); + } + textureEntries[textureCount] = { + .binding = textureCount, + .textureView = g_gxState.tluts[tlut].ref->view, + }; + 
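      // The TLUT (palette) view takes the binding slot immediately after the indexed
      // texture; build_bind_group_layouts reserves a second texture entry for
      // palette-loaded formats so the layout matches.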
++textureCount; + } + } + return { + .uniformBindGroup = bind_group_ref(WGPUBindGroupDescriptor{ + .label = "GX Uniform Bind Group", + .layout = layouts.uniformLayout, + .entryCount = uniformBindIdx, + .entries = uniformEntries.data(), + }), + .samplerBindGroup = bind_group_ref(WGPUBindGroupDescriptor{ + .label = "GX Sampler Bind Group", + .layout = layouts.samplerLayout, + .entryCount = samplerCount, + .entries = samplerEntries.data(), + }), + .textureBindGroup = bind_group_ref(WGPUBindGroupDescriptor{ + .label = "GX Texture Bind Group", + .layout = layouts.textureLayout, + .entryCount = textureCount, + .entries = textureEntries.data(), + }), + }; +} + +GXBindGroupLayouts build_bind_group_layouts(const ShaderInfo& info, const ShaderConfig& config) noexcept { + GXBindGroupLayouts out; + u32 uniformSizeKey = info.uniformSize + (config.indexedAttributeCount > 0 ? 1 : 0); + const auto uniformIt = sUniformBindGroupLayouts.find(uniformSizeKey); + if (uniformIt != sUniformBindGroupLayouts.end()) { + out.uniformLayout = uniformIt->second; + } else { + std::array uniformLayoutEntries{ + WGPUBindGroupLayoutEntry{ + .binding = 0, + .visibility = WGPUShaderStage_Vertex | WGPUShaderStage_Fragment, + .buffer = + WGPUBufferBindingLayout{ + .type = WGPUBufferBindingType_Uniform, + .hasDynamicOffset = true, + .minBindingSize = info.uniformSize, + }, + }, + }; + u32 bindIdx = 1; + for (int i = 0; i < GX_VA_MAX_ATTR; ++i) { + if (config.attrMapping[i] == static_cast(i)) { + uniformLayoutEntries[bindIdx] = WGPUBindGroupLayoutEntry{ + .binding = bindIdx, + .visibility = WGPUShaderStage_Vertex, + .buffer = + WGPUBufferBindingLayout{ + .type = WGPUBufferBindingType_ReadOnlyStorage, + .hasDynamicOffset = true, + }, + }; + ++bindIdx; + } + } + const auto uniformLayoutDescriptor = WGPUBindGroupLayoutDescriptor{ + .label = "GX Uniform Bind Group Layout", + .entryCount = bindIdx, + .entries = uniformLayoutEntries.data(), + }; + out.uniformLayout = wgpuDeviceCreateBindGroupLayout(g_device, &uniformLayoutDescriptor); + // sUniformBindGroupLayouts.try_emplace(uniformSizeKey, out.uniformLayout); + } + + // u32 textureCount = info.sampledTextures.count(); + // const auto textureIt = sTextureBindGroupLayouts.find(textureCount); + // if (textureIt != sTextureBindGroupLayouts.end()) { + // const auto& [sl, tl] = textureIt->second; + // out.samplerLayout = sl; + // out.textureLayout = tl; + // } else { + u32 numSamplers = 0; + u32 numTextures = 0; + std::array samplerEntries; + std::array textureEntries; + for (u32 i = 0; i < info.sampledTextures.size(); ++i) { + if (!info.sampledTextures.test(i)) { + continue; + } + const auto& texConfig = config.textureConfig[i]; + bool copyAsPalette = is_palette_format(texConfig.copyFmt); + bool loadAsPalette = is_palette_format(texConfig.loadFmt); + samplerEntries[numSamplers] = { + .binding = numSamplers, + .visibility = WGPUShaderStage_Fragment, + .sampler = {.type = copyAsPalette && loadAsPalette ? WGPUSamplerBindingType_NonFiltering + : WGPUSamplerBindingType_Filtering}, + }; + ++numSamplers; + if (loadAsPalette) { + textureEntries[numTextures] = { + .binding = numTextures, + .visibility = WGPUShaderStage_Fragment, + .texture = + { + .sampleType = copyAsPalette ? 
WGPUTextureSampleType_Sint : WGPUTextureSampleType_Float, + .viewDimension = WGPUTextureViewDimension_2D, + }, + }; + ++numTextures; + textureEntries[numTextures] = { + .binding = numTextures, + .visibility = WGPUShaderStage_Fragment, + .texture = + { + .sampleType = WGPUTextureSampleType_Float, + .viewDimension = WGPUTextureViewDimension_2D, + }, + }; + ++numTextures; + } else { + textureEntries[numTextures] = { + .binding = numTextures, + .visibility = WGPUShaderStage_Fragment, + .texture = + { + .sampleType = WGPUTextureSampleType_Float, + .viewDimension = WGPUTextureViewDimension_2D, + }, + }; + ++numTextures; + } + } + { + const WGPUBindGroupLayoutDescriptor descriptor{ + .label = "GX Sampler Bind Group Layout", + .entryCount = numSamplers, + .entries = samplerEntries.data(), + }; + out.samplerLayout = wgpuDeviceCreateBindGroupLayout(g_device, &descriptor); + } + { + const WGPUBindGroupLayoutDescriptor descriptor{ + .label = "GX Texture Bind Group Layout", + .entryCount = numTextures, + .entries = textureEntries.data(), + }; + out.textureLayout = wgpuDeviceCreateBindGroupLayout(g_device, &descriptor); + } + // sTextureBindGroupLayouts.try_emplace(textureCount, out.samplerLayout, out.textureLayout); + // } + return out; +} + +// TODO this is awkward +extern absl::flat_hash_map> g_gxCachedShaders; +void shutdown() noexcept { + // TODO we should probably store this all in g_state.gx instead + for (const auto& item : sUniformBindGroupLayouts) { + wgpuBindGroupLayoutRelease(item.second); + } + sUniformBindGroupLayouts.clear(); + for (const auto& item : sTextureBindGroupLayouts) { + wgpuBindGroupLayoutRelease(item.second.first); + wgpuBindGroupLayoutRelease(item.second.second); + } + sTextureBindGroupLayouts.clear(); + for (auto& item : g_gxState.textures) { + item.texObj.ref.reset(); + } + for (auto& item : g_gxState.tluts) { + item.ref.reset(); + } + for (const auto& item : g_gxCachedShaders) { + wgpuShaderModuleRelease(item.second.first); + } + g_gxCachedShaders.clear(); +} +} // namespace gx + +static WGPUAddressMode wgpu_address_mode(GXTexWrapMode mode) { + switch (mode) { + case GX_CLAMP: + return WGPUAddressMode_ClampToEdge; + case GX_REPEAT: + return WGPUAddressMode_Repeat; + case GX_MIRROR: + return WGPUAddressMode_MirrorRepeat; + default: + Log.report(LOG_FATAL, FMT_STRING("invalid wrap mode {}"), mode); + unreachable(); + } +} +static std::pair wgpu_filter_mode(GXTexFilter filter) { + switch (filter) { + case GX_NEAR: + return {WGPUFilterMode_Nearest, WGPUFilterMode_Linear}; + case GX_LINEAR: + return {WGPUFilterMode_Linear, WGPUFilterMode_Linear}; + case GX_NEAR_MIP_NEAR: + return {WGPUFilterMode_Nearest, WGPUFilterMode_Nearest}; + case GX_LIN_MIP_NEAR: + return {WGPUFilterMode_Linear, WGPUFilterMode_Nearest}; + case GX_NEAR_MIP_LIN: + return {WGPUFilterMode_Nearest, WGPUFilterMode_Linear}; + case GX_LIN_MIP_LIN: + return {WGPUFilterMode_Linear, WGPUFilterMode_Linear}; + default: + Log.report(LOG_FATAL, FMT_STRING("invalid filter mode {}"), filter); + unreachable(); + } +} +static u16 wgpu_aniso(GXAnisotropy aniso) { + switch (aniso) { + case GX_ANISO_1: + return 1; + case GX_ANISO_2: + return std::max(webgpu::g_graphicsConfig.textureAnisotropy / 2, 1); + case GX_ANISO_4: + return std::max(webgpu::g_graphicsConfig.textureAnisotropy, 1); + default: + Log.report(LOG_FATAL, FMT_STRING("invalid aniso mode {}"), aniso); + unreachable(); + } +} +WGPUSamplerDescriptor TextureBind::get_descriptor() const noexcept { + if (gx::requires_copy_conversion(texObj) && 
gx::is_palette_format(texObj.ref->gxFormat)) { + return { + .label = "Generated Non-Filtering Sampler", + .addressModeU = wgpu_address_mode(texObj.wrapS), + .addressModeV = wgpu_address_mode(texObj.wrapT), + .addressModeW = WGPUAddressMode_Repeat, + .magFilter = WGPUFilterMode_Nearest, + .minFilter = WGPUFilterMode_Nearest, + .mipmapFilter = WGPUFilterMode_Nearest, + .lodMinClamp = 0.f, + .lodMaxClamp = 1000.f, + .maxAnisotropy = 1, + }; + } + const auto [minFilter, mipFilter] = wgpu_filter_mode(texObj.minFilter); + const auto [magFilter, _] = wgpu_filter_mode(texObj.magFilter); + return { + .label = "Generated Filtering Sampler", + .addressModeU = wgpu_address_mode(texObj.wrapS), + .addressModeV = wgpu_address_mode(texObj.wrapT), + .addressModeW = WGPUAddressMode_Repeat, + .magFilter = magFilter, + .minFilter = minFilter, + .mipmapFilter = mipFilter, + .lodMinClamp = 0.f, + .lodMaxClamp = 1000.f, + .maxAnisotropy = wgpu_aniso(texObj.maxAniso), + }; +} +} // namespace aurora::gfx diff --git a/lib/gfx/gx.hpp b/lib/gfx/gx.hpp new file mode 100644 index 0000000..0b7cc63 --- /dev/null +++ b/lib/gfx/gx.hpp @@ -0,0 +1,402 @@ +#pragma once +#include +#include + +#include "common.hpp" +#include "../internal.hpp" +#include "texture.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#define M_PIF 3.14159265358979323846f + +namespace GX { +constexpr u8 MaxLights = 8; +using LightMask = std::bitset; +} // namespace GX + +struct GXLightObj_ { + GXColor color; + float a0 = 1.f; + float a1 = 0.f; + float a2 = 0.f; + float k0 = 1.f; + float k1 = 0.f; + float k2 = 0.f; + float px = 0.f; + float py = 0.f; + float pz = 0.f; + float nx = 0.f; + float ny = 0.f; + float nz = 0.f; +}; +static_assert(sizeof(GXLightObj_) <= sizeof(GXLightObj), "GXLightObj too small!"); + +#if GX_IS_WII +constexpr float GX_LARGE_NUMBER = -1.0e+18f; +#else +constexpr float GX_LARGE_NUMBER = -1048576.0f; +#endif + +namespace aurora::gfx::gx { +constexpr u32 MaxTextures = GX_MAX_TEXMAP; +constexpr u32 MaxTevStages = GX_MAX_TEVSTAGE; +constexpr u32 MaxColorChannels = 4; +constexpr u32 MaxTevRegs = 4; // TEVPREV, TEVREG0-2 +constexpr u32 MaxKColors = GX_MAX_KCOLOR; +constexpr u32 MaxTexMtx = 10; +constexpr u32 MaxPTTexMtx = 20; +constexpr u32 MaxTexCoord = GX_MAX_TEXCOORD; +constexpr u32 MaxVtxAttr = GX_VA_MAX_ATTR; +constexpr u32 MaxTevSwap = GX_MAX_TEVSWAP; +constexpr u32 MaxIndStages = GX_MAX_INDTEXSTAGE; +constexpr u32 MaxIndTexMtxs = 3; +constexpr u32 MaxVtxFmt = GX_MAX_VTXFMT; +constexpr u32 MaxPnMtx = (GX_PNMTX9 / 3) + 1; + +template +struct TevPass { + Arg a = Default; + Arg b = Default; + Arg c = Default; + Arg d = Default; + + bool operator==(const TevPass& rhs) const { return memcmp(this, &rhs, sizeof(*this)) == 0; } +}; +static_assert(std::has_unique_object_representations_v>); +static_assert(std::has_unique_object_representations_v>); +struct TevOp { + GXTevOp op = GX_TEV_ADD; + GXTevBias bias = GX_TB_ZERO; + GXTevScale scale = GX_CS_SCALE_1; + GXTevRegID outReg = GX_TEVPREV; + bool clamp = true; + u8 _p1 = 0; + u8 _p2 = 0; + u8 _p3 = 0; + + bool operator==(const TevOp& rhs) const { return memcmp(this, &rhs, sizeof(*this)) == 0; } +}; +static_assert(std::has_unique_object_representations_v); +struct TevStage { + TevPass colorPass; + TevPass alphaPass; + TevOp colorOp; + TevOp alphaOp; + GXTevKColorSel kcSel = GX_TEV_KCSEL_1; + GXTevKAlphaSel kaSel = GX_TEV_KASEL_1; + GXTexCoordID texCoordId = GX_TEXCOORD_NULL; + GXTexMapID texMapId = GX_TEXMAP_NULL; + GXChannelID channelId = GX_COLOR_NULL; + 
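// Why the explicit _p1.._p3 padding fields and the
// has_unique_object_representations_v static_asserts matter: these config
// structs are compared with memcmp and hashed as raw bytes, so every byte must
// be deterministic. A minimal sketch of such a byte-wise hash (this codebase's
// xxh3_hash is assumed to be the xxhash-based equivalent):
template <typename T>
std::uint64_t hash_object_bits(const T& value) {
  static_assert(std::has_unique_object_representations_v<T>, "padding bytes would poison the hash");
  return XXH3_64bits(&value, sizeof(T)); // from <xxhash.h>
}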
GXTevSwapSel tevSwapRas = GX_TEV_SWAP0; + GXTevSwapSel tevSwapTex = GX_TEV_SWAP0; + GXIndTexStageID indTexStage = GX_INDTEXSTAGE0; + GXIndTexFormat indTexFormat = GX_ITF_8; + GXIndTexBiasSel indTexBiasSel = GX_ITB_NONE; + GXIndTexAlphaSel indTexAlphaSel = GX_ITBA_OFF; + GXIndTexMtxID indTexMtxId = GX_ITM_OFF; + GXIndTexWrap indTexWrapS = GX_ITW_OFF; + GXIndTexWrap indTexWrapT = GX_ITW_OFF; + bool indTexUseOrigLOD = false; + bool indTexAddPrev = false; + u8 _p1 = 0; + u8 _p2 = 0; + + bool operator==(const TevStage& rhs) const { return memcmp(this, &rhs, sizeof(*this)) == 0; } +}; +static_assert(std::has_unique_object_representations_v); +struct IndStage { + GXTexCoordID texCoordId; + GXTexMapID texMapId; + GXIndTexScale scaleS; + GXIndTexScale scaleT; +}; +static_assert(std::has_unique_object_representations_v); +// For shader generation +struct ColorChannelConfig { + GXColorSrc matSrc = GX_SRC_REG; + GXColorSrc ambSrc = GX_SRC_REG; + GXDiffuseFn diffFn = GX_DF_NONE; + GXAttnFn attnFn = GX_AF_NONE; + bool lightingEnabled = false; + u8 _p1 = 0; + u8 _p2 = 0; + u8 _p3 = 0; + + bool operator==(const ColorChannelConfig& rhs) const { return memcmp(this, &rhs, sizeof(*this)) == 0; } +}; +static_assert(std::has_unique_object_representations_v); +// For uniform generation +struct ColorChannelState { + Vec4 matColor; + Vec4 ambColor; + GX::LightMask lightMask; +}; +// Mat4x4 used instead of Mat4x3 for padding purposes +using TexMtxVariant = std::variant, Mat4x4>; +struct TcgConfig { + GXTexGenType type = GX_TG_MTX2x4; + GXTexGenSrc src = GX_MAX_TEXGENSRC; + GXTexMtx mtx = GX_IDENTITY; + GXPTTexMtx postMtx = GX_PTIDENTITY; + bool normalize = false; + u8 _p1 = 0; + u8 _p2 = 0; + u8 _p3 = 0; + + bool operator==(const TcgConfig& rhs) const { return memcmp(this, &rhs, sizeof(*this)) == 0; } +}; +static_assert(std::has_unique_object_representations_v); +struct FogState { + GXFogType type = GX_FOG_NONE; + float startZ = 0.f; + float endZ = 0.f; + float nearZ = 0.f; + float farZ = 0.f; + Vec4 color; + + bool operator==(const FogState& rhs) const { + return type == rhs.type && startZ == rhs.startZ && endZ == rhs.endZ && nearZ == rhs.nearZ && farZ == rhs.farZ && + color == rhs.color; + } +}; +struct TevSwap { + GXTevColorChan red = GX_CH_RED; + GXTevColorChan green = GX_CH_GREEN; + GXTevColorChan blue = GX_CH_BLUE; + GXTevColorChan alpha = GX_CH_ALPHA; + + bool operator==(const TevSwap& rhs) const { return memcmp(this, &rhs, sizeof(*this)) == 0; } + explicit operator bool() const { return !(*this == TevSwap{}); } +}; +static_assert(std::has_unique_object_representations_v); +struct AlphaCompare { + GXCompare comp0 = GX_ALWAYS; + u32 ref0; // would be u8 but extended to avoid padding bytes + GXAlphaOp op = GX_AOP_AND; + GXCompare comp1 = GX_ALWAYS; + u32 ref1; + + bool operator==(const AlphaCompare& rhs) const { return memcmp(this, &rhs, sizeof(*this)) == 0; } + explicit operator bool() const { return comp0 != GX_ALWAYS || comp1 != GX_ALWAYS; } +}; +static_assert(std::has_unique_object_representations_v); +struct IndTexMtxInfo { + aurora::Mat3x2 mtx; + s8 scaleExp; + + bool operator==(const IndTexMtxInfo& rhs) const { return mtx == rhs.mtx && scaleExp == rhs.scaleExp; } +}; +struct VtxAttrFmt { + GXCompCnt cnt; + GXCompType type; + u8 frac; +}; +struct VtxFmt { + std::array attrs; +}; +struct PnMtx { + Mat4x4 pos; + Mat4x4 nrm; +}; +static_assert(sizeof(PnMtx) == sizeof(Mat4x4) * 2); +struct Light { + Vec4 pos{0.f, 0.f, 0.f}; + Vec4 dir{0.f, 0.f, 0.f}; + Vec4 color{0.f, 0.f, 0.f, 0.f}; + Vec4 cosAtt{0.f, 0.f, 
0.f}; + Vec4 distAtt{0.f, 0.f, 0.f}; + + bool operator==(const Light& rhs) const { + return pos == rhs.pos && dir == rhs.dir && color == rhs.color && cosAtt == rhs.cosAtt && distAtt == rhs.distAtt; + } +}; +static_assert(sizeof(Light) == 80); +struct AttrArray { + const void* data; + u32 size; + u8 stride; + Range cachedRange; +}; +inline bool operator==(const AttrArray& lhs, const AttrArray& rhs) { + return lhs.data == rhs.data && lhs.size == rhs.size && lhs.stride == rhs.stride; +} + +struct GXState { + std::array pnMtx; + u32 currentPnMtx; + Mat4x4 proj; + Mat4x4 origProj; // for GXGetProjectionv + GXProjectionType projType; // for GXGetProjectionv + FogState fog; + GXCullMode cullMode = GX_CULL_BACK; + GXBlendMode blendMode = GX_BM_NONE; + GXBlendFactor blendFacSrc = GX_BL_SRCALPHA; + GXBlendFactor blendFacDst = GX_BL_INVSRCALPHA; + GXLogicOp blendOp = GX_LO_CLEAR; + GXCompare depthFunc = GX_LEQUAL; + Vec4 clearColor{0.f, 0.f, 0.f, 1.f}; + u32 dstAlpha; // u8; UINT32_MAX = disabled + AlphaCompare alphaCompare; + std::array, MaxTevRegs> colorRegs; + std::array, GX_MAX_KCOLOR> kcolors; + std::array colorChannelConfig; + std::array colorChannelState; + std::array lights; + std::array tevStages; + std::array textures; + std::array tluts; + std::array texMtxs; + std::array, MaxPTTexMtx> ptTexMtxs; + std::array tcgs; + std::array vtxDesc; + std::array vtxFmts; + std::array tevSwapTable{ + TevSwap{}, + TevSwap{GX_CH_RED, GX_CH_RED, GX_CH_RED, GX_CH_ALPHA}, + TevSwap{GX_CH_GREEN, GX_CH_GREEN, GX_CH_GREEN, GX_CH_ALPHA}, + TevSwap{GX_CH_BLUE, GX_CH_BLUE, GX_CH_BLUE, GX_CH_ALPHA}, + }; + std::array indStages; + std::array indTexMtxs; + std::array arrays; + bool depthCompare = true; + bool depthUpdate = true; + bool colorUpdate = true; + bool alphaUpdate = true; + u8 numChans = 0; + u8 numIndStages = 0; + u8 numTevStages = 0; + u8 numTexGens = 0; + bool stateDirty = true; +}; +extern GXState g_gxState; + +void shutdown() noexcept; +const TextureBind& get_texture(GXTexMapID id) noexcept; + +static inline bool requires_copy_conversion(const GXTexObj_& obj) { + if (!obj.ref) { + return false; + } + if (obj.ref->isRenderTexture) { + return true; + } + switch (obj.ref->gxFormat) { + // case GX_TF_RGB565: + // case GX_TF_I4: + // case GX_TF_I8: + case GX_TF_C4: + case GX_TF_C8: + case GX_TF_C14X2: + return true; + default: + return false; + } +} +static inline bool requires_load_conversion(const GXTexObj_& obj) { + if (!obj.ref) { + return false; + } + switch (obj.fmt) { + case GX_TF_I4: + case GX_TF_I8: + case GX_TF_C4: + case GX_TF_C8: + case GX_TF_C14X2: + return true; + default: + return false; + } +} +static inline bool is_palette_format(u32 fmt) { return fmt == GX_TF_C4 || fmt == GX_TF_C8 || fmt == GX_TF_C14X2; } + +struct TextureConfig { + u32 copyFmt = InvalidTextureFormat; // Underlying texture format + u32 loadFmt = InvalidTextureFormat; // Texture format being bound + bool renderTex = false; // Perform conversion + u8 _p1 = 0; + u8 _p2 = 0; + u8 _p3 = 0; + + bool operator==(const TextureConfig& rhs) const { return memcmp(this, &rhs, sizeof(*this)) == 0; } +}; +static_assert(std::has_unique_object_representations_v); +struct ShaderConfig { + GXFogType fogType; + std::array vtxAttrs; + // Mapping for indexed attributes -> storage buffer + std::array attrMapping; + std::array tevSwapTable; + std::array tevStages; + u32 tevStageCount = 0; + std::array colorChannels; + std::array tcgs; + AlphaCompare alphaCompare; + u32 indexedAttributeCount = 0; + std::array textureConfig; + + bool 
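// Sketch of how the `arrays` member above is consumed (the locals here are
// hypothetical): GXSetArray is assumed to fill in {data, size, stride}, and an
// indexed vertex attribute with index `idx` reads its element at
// data + idx * stride; the storage-buffer path in the shader generator
// reproduces the same lookup on the GPU.
//   const AttrArray& arr = g_gxState.arrays[GX_VA_POS];
//   const u8* element = static_cast<const u8*>(arr.data) + size_t(idx) * arr.stride;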
operator==(const ShaderConfig& rhs) const { return memcmp(this, &rhs, sizeof(*this)) == 0; } +}; +static_assert(std::has_unique_object_representations_v); + +constexpr u32 GXPipelineConfigVersion = 4; +struct PipelineConfig { + u32 version = GXPipelineConfigVersion; + ShaderConfig shaderConfig; + GXPrimitive primitive; + GXCompare depthFunc; + GXCullMode cullMode; + GXBlendMode blendMode; + GXBlendFactor blendFacSrc, blendFacDst; + GXLogicOp blendOp; + u32 dstAlpha; + bool depthCompare, depthUpdate, alphaUpdate, colorUpdate; +}; +static_assert(std::has_unique_object_representations_v); + +struct GXBindGroupLayouts { + WGPUBindGroupLayout uniformLayout; + WGPUBindGroupLayout samplerLayout; + WGPUBindGroupLayout textureLayout; +}; +struct GXBindGroups { + BindGroupRef uniformBindGroup; + BindGroupRef samplerBindGroup; + BindGroupRef textureBindGroup; +}; +// Output info from shader generation +struct ShaderInfo { + std::bitset sampledTexCoords; + std::bitset sampledTextures; + std::bitset sampledKColors; + std::bitset sampledColorChannels; + std::bitset loadsTevReg; + std::bitset writesTevReg; + std::bitset usesTexMtx; + std::bitset usesPTTexMtx; + std::array texMtxTypes{}; + u32 uniformSize = 0; + bool usesFog : 1 = false; +}; +struct BindGroupRanges { + std::array vaRanges{}; +}; +void populate_pipeline_config(PipelineConfig& config, GXPrimitive primitive) noexcept; +WGPURenderPipeline build_pipeline(const PipelineConfig& config, const ShaderInfo& info, + ArrayRef vtxBuffers, WGPUShaderModule shader, + const char* label) noexcept; +ShaderInfo build_shader_info(const ShaderConfig& config) noexcept; +WGPUShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& info) noexcept; +// Range build_vertex_buffer(const GXShaderInfo& info) noexcept; +Range build_uniform(const ShaderInfo& info) noexcept; +GXBindGroupLayouts build_bind_group_layouts(const ShaderInfo& info, const ShaderConfig& config) noexcept; +GXBindGroups build_bind_groups(const ShaderInfo& info, const ShaderConfig& config, + const BindGroupRanges& ranges) noexcept; +} // namespace aurora::gfx::gx diff --git a/lib/gfx/gx_shader.cpp b/lib/gfx/gx_shader.cpp new file mode 100644 index 0000000..c6aa18e --- /dev/null +++ b/lib/gfx/gx_shader.cpp @@ -0,0 +1,1392 @@ +#include "common.hpp" + +#include "../webgpu/gpu.hpp" +#include "gx.hpp" + +#include + +constexpr bool EnableNormalVisualization = false; +constexpr bool EnableDebugPrints = true; +constexpr bool UsePerPixelLighting = true; + +namespace aurora::gfx::gx { +using namespace fmt::literals; +using namespace std::string_literals; +using namespace std::string_view_literals; + +static Module Log("aurora::gfx::gx"); + +absl::flat_hash_map> g_gxCachedShaders; +#ifndef NDEBUG +static absl::flat_hash_map g_gxCachedShaderConfigs; +#endif + +static inline std::string_view chan_comp(GXTevColorChan chan) noexcept { + switch (chan) { + case GX_CH_RED: + return "r"; + case GX_CH_GREEN: + return "g"; + case GX_CH_BLUE: + return "b"; + case GX_CH_ALPHA: + return "a"; + default: + return "?"; + } +} + +static void color_arg_reg_info(GXTevColorArg arg, const TevStage& stage, ShaderInfo& info) { + switch (arg) { + case GX_CC_CPREV: + case GX_CC_APREV: + if (!info.writesTevReg.test(GX_TEVPREV)) { + info.loadsTevReg.set(GX_TEVPREV); + } + break; + case GX_CC_C0: + case GX_CC_A0: + if (!info.writesTevReg.test(GX_TEVREG0)) { + info.loadsTevReg.set(GX_TEVREG0); + } + break; + case GX_CC_C1: + case GX_CC_A1: + if (!info.writesTevReg.test(GX_TEVREG1)) { + info.loadsTevReg.set(GX_TEVREG1); + } 
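// Minimal sketch of the load/write bookkeeping in this switch: a TEV register is
// only loaded from the uniform buffer when a stage reads it before any stage has
// written it.
//   ShaderInfo info{};
//   TevStage stage{};
//   info.writesTevReg.set(GX_TEVREG0);          // pretend an earlier stage wrote REG0
//   color_arg_reg_info(GX_CC_C0, stage, info);  // this read is satisfied in-shader...
//   assert(!info.loadsTevReg.test(GX_TEVREG0)); // ...so no uniform load is recorded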
+ break; + case GX_CC_C2: + case GX_CC_A2: + if (!info.writesTevReg.test(GX_TEVREG2)) { + info.loadsTevReg.set(GX_TEVREG2); + } + break; + case GX_CC_TEXC: + case GX_CC_TEXA: + if (stage.texCoordId == GX_TEXCOORD_NULL) { + Log.report(LOG_FATAL, FMT_STRING("texCoord not bound")); + } + if (stage.texMapId == GX_TEXMAP_NULL) { + Log.report(LOG_FATAL, FMT_STRING("texMap not bound")); + } + info.sampledTexCoords.set(stage.texCoordId); + info.sampledTextures.set(stage.texMapId); + break; + case GX_CC_RASC: + case GX_CC_RASA: + if (stage.channelId >= GX_COLOR0A0 && stage.channelId <= GX_COLOR1A1) { + info.sampledColorChannels.set(stage.channelId - GX_COLOR0A0); + } + break; + case GX_CC_KONST: + switch (stage.kcSel) { + case GX_TEV_KCSEL_K0: + case GX_TEV_KCSEL_K0_R: + case GX_TEV_KCSEL_K0_G: + case GX_TEV_KCSEL_K0_B: + case GX_TEV_KCSEL_K0_A: + info.sampledKColors.set(0); + break; + case GX_TEV_KCSEL_K1: + case GX_TEV_KCSEL_K1_R: + case GX_TEV_KCSEL_K1_G: + case GX_TEV_KCSEL_K1_B: + case GX_TEV_KCSEL_K1_A: + info.sampledKColors.set(1); + break; + case GX_TEV_KCSEL_K2: + case GX_TEV_KCSEL_K2_R: + case GX_TEV_KCSEL_K2_G: + case GX_TEV_KCSEL_K2_B: + case GX_TEV_KCSEL_K2_A: + info.sampledKColors.set(2); + break; + case GX_TEV_KCSEL_K3: + case GX_TEV_KCSEL_K3_R: + case GX_TEV_KCSEL_K3_G: + case GX_TEV_KCSEL_K3_B: + case GX_TEV_KCSEL_K3_A: + info.sampledKColors.set(3); + break; + default: + break; + } + break; + default: + break; + } +} + +static bool formatHasAlpha(u32 format) { + switch (format) { + case GX_TF_IA4: + case GX_TF_IA8: + case GX_TF_RGB5A3: + case GX_TF_RGBA8: + case GX_TF_CMPR: + case GX_CTF_RA4: + case GX_CTF_RA8: + case GX_CTF_YUVA8: + case GX_CTF_A8: + case GX_TF_RGBA8_PC: + return true; + default: + return false; + } +} + +static std::string color_arg_reg(GXTevColorArg arg, size_t stageIdx, const ShaderConfig& config, + const TevStage& stage) { + switch (arg) { + case GX_CC_CPREV: + return "prev.rgb"; + case GX_CC_APREV: + return "vec3(prev.a)"; + case GX_CC_C0: + return "tevreg0.rgb"; + case GX_CC_A0: + return "vec3(tevreg0.a)"; + case GX_CC_C1: + return "tevreg1.rgb"; + case GX_CC_A1: + return "vec3(tevreg1.a)"; + case GX_CC_C2: + return "tevreg2.rgb"; + case GX_CC_A2: + return "vec3(tevreg2.a)"; + case GX_CC_TEXC: { + if (stage.texMapId == GX_TEXMAP_NULL) { + Log.report(LOG_FATAL, FMT_STRING("unmapped texture for stage {}"), stageIdx); + unreachable(); + } else if (stage.texMapId < GX_TEXMAP0 || stage.texMapId > GX_TEXMAP7) { + Log.report(LOG_FATAL, FMT_STRING("invalid texture {} for stage {}"), stage.texMapId, stageIdx); + unreachable(); + } + const auto& swap = config.tevSwapTable[stage.tevSwapTex]; + return fmt::format(FMT_STRING("sampled{}.{}{}{}"), stageIdx, chan_comp(swap.red), chan_comp(swap.green), + chan_comp(swap.blue)); + } + case GX_CC_TEXA: { + if (stage.texMapId == GX_TEXMAP_NULL) { + Log.report(LOG_FATAL, FMT_STRING("unmapped texture for stage {}"), stageIdx); + unreachable(); + } else if (stage.texMapId < GX_TEXMAP0 || stage.texMapId > GX_TEXMAP7) { + Log.report(LOG_FATAL, FMT_STRING("invalid texture {} for stage {}"), stage.texMapId, stageIdx); + unreachable(); + } + const auto& swap = config.tevSwapTable[stage.tevSwapTex]; + return fmt::format(FMT_STRING("vec3(sampled{}.{})"), stageIdx, chan_comp(swap.alpha)); + } + case GX_CC_RASC: { + if (stage.channelId == GX_COLOR_NULL) { + Log.report(LOG_FATAL, FMT_STRING("unmapped color channel for stage {}"), stageIdx); + unreachable(); + } else if (stage.channelId == GX_COLOR_ZERO) { + return "vec3(0.0)"; + } else if 
(stage.channelId < GX_COLOR0A0 || stage.channelId > GX_COLOR1A1) { + Log.report(LOG_FATAL, FMT_STRING("invalid color channel {} for stage {}"), stage.channelId, stageIdx); + unreachable(); + } + u32 idx = stage.channelId - GX_COLOR0A0; + const auto& swap = config.tevSwapTable[stage.tevSwapRas]; + return fmt::format(FMT_STRING("rast{}.{}{}{}"), idx, chan_comp(swap.red), chan_comp(swap.green), + chan_comp(swap.blue)); + } + case GX_CC_RASA: { + if (stage.channelId == GX_COLOR_NULL) { + Log.report(LOG_FATAL, FMT_STRING("unmapped color channel for stage {}"), stageIdx); + unreachable(); + } else if (stage.channelId == GX_COLOR_ZERO) { + return "vec3(0.0)"; + } else if (stage.channelId < GX_COLOR0A0 || stage.channelId > GX_COLOR1A1) { + Log.report(LOG_FATAL, FMT_STRING("invalid color channel {} for stage {}"), stage.channelId, stageIdx); + unreachable(); + } + u32 idx = stage.channelId - GX_COLOR0A0; + const auto& swap = config.tevSwapTable[stage.tevSwapRas]; + return fmt::format(FMT_STRING("vec3(rast{}.{})"), idx, chan_comp(swap.alpha)); + } + case GX_CC_ONE: + return "vec3(1.0)"; + case GX_CC_HALF: + return "vec3(0.5)"; + case GX_CC_KONST: { + switch (stage.kcSel) { + case GX_TEV_KCSEL_8_8: + return "vec3(1.0)"; + case GX_TEV_KCSEL_7_8: + return "vec3(7.0/8.0)"; + case GX_TEV_KCSEL_6_8: + return "vec3(6.0/8.0)"; + case GX_TEV_KCSEL_5_8: + return "vec3(5.0/8.0)"; + case GX_TEV_KCSEL_4_8: + return "vec3(4.0/8.0)"; + case GX_TEV_KCSEL_3_8: + return "vec3(3.0/8.0)"; + case GX_TEV_KCSEL_2_8: + return "vec3(2.0/8.0)"; + case GX_TEV_KCSEL_1_8: + return "vec3(1.0/8.0)"; + case GX_TEV_KCSEL_K0: + return "ubuf.kcolor0.rgb"; + case GX_TEV_KCSEL_K1: + return "ubuf.kcolor1.rgb"; + case GX_TEV_KCSEL_K2: + return "ubuf.kcolor2.rgb"; + case GX_TEV_KCSEL_K3: + return "ubuf.kcolor3.rgb"; + case GX_TEV_KCSEL_K0_R: + return "vec3(ubuf.kcolor0.r)"; + case GX_TEV_KCSEL_K1_R: + return "vec3(ubuf.kcolor1.r)"; + case GX_TEV_KCSEL_K2_R: + return "vec3(ubuf.kcolor2.r)"; + case GX_TEV_KCSEL_K3_R: + return "vec3(ubuf.kcolor3.r)"; + case GX_TEV_KCSEL_K0_G: + return "vec3(ubuf.kcolor0.g)"; + case GX_TEV_KCSEL_K1_G: + return "vec3(ubuf.kcolor1.g)"; + case GX_TEV_KCSEL_K2_G: + return "vec3(ubuf.kcolor2.g)"; + case GX_TEV_KCSEL_K3_G: + return "vec3(ubuf.kcolor3.g)"; + case GX_TEV_KCSEL_K0_B: + return "vec3(ubuf.kcolor0.b)"; + case GX_TEV_KCSEL_K1_B: + return "vec3(ubuf.kcolor1.b)"; + case GX_TEV_KCSEL_K2_B: + return "vec3(ubuf.kcolor2.b)"; + case GX_TEV_KCSEL_K3_B: + return "vec3(ubuf.kcolor3.b)"; + case GX_TEV_KCSEL_K0_A: + return "vec3(ubuf.kcolor0.a)"; + case GX_TEV_KCSEL_K1_A: + return "vec3(ubuf.kcolor1.a)"; + case GX_TEV_KCSEL_K2_A: + return "vec3(ubuf.kcolor2.a)"; + case GX_TEV_KCSEL_K3_A: + return "vec3(ubuf.kcolor3.a)"; + default: + Log.report(LOG_FATAL, FMT_STRING("invalid kcSel {}"), stage.kcSel); + unreachable(); + } + } + case GX_CC_ZERO: + return "vec3(0.0)"; + default: + Log.report(LOG_FATAL, FMT_STRING("invalid color arg {}"), arg); + unreachable(); + } +} + +static void alpha_arg_reg_info(GXTevAlphaArg arg, const TevStage& stage, ShaderInfo& info) { + switch (arg) { + case GX_CA_APREV: + if (!info.writesTevReg.test(GX_TEVPREV)) { + info.loadsTevReg.set(GX_TEVPREV); + } + break; + case GX_CA_A0: + if (!info.writesTevReg.test(GX_TEVREG0)) { + info.loadsTevReg.set(GX_TEVREG0); + } + break; + case GX_CA_A1: + if (!info.writesTevReg.test(GX_TEVREG1)) { + info.loadsTevReg.set(GX_TEVREG1); + } + break; + case GX_CA_A2: + if (!info.writesTevReg.test(GX_TEVREG2)) { + info.loadsTevReg.set(GX_TEVREG2); + } + break; + 
case GX_CA_TEXA: + if (stage.texCoordId == GX_TEXCOORD_NULL) { + Log.report(LOG_FATAL, FMT_STRING("texCoord not bound")); + } + if (stage.texMapId == GX_TEXMAP_NULL) { + Log.report(LOG_FATAL, FMT_STRING("texMap not bound")); + } + info.sampledTexCoords.set(stage.texCoordId); + info.sampledTextures.set(stage.texMapId); + break; + case GX_CA_RASA: + if (stage.channelId >= GX_COLOR0A0 && stage.channelId <= GX_COLOR1A1) { + info.sampledColorChannels.set(stage.channelId - GX_COLOR0A0); + } + break; + case GX_CA_KONST: + switch (stage.kaSel) { + case GX_TEV_KASEL_K0_R: + case GX_TEV_KASEL_K0_G: + case GX_TEV_KASEL_K0_B: + case GX_TEV_KASEL_K0_A: + info.sampledKColors.set(0); + break; + case GX_TEV_KASEL_K1_R: + case GX_TEV_KASEL_K1_G: + case GX_TEV_KASEL_K1_B: + case GX_TEV_KASEL_K1_A: + info.sampledKColors.set(1); + break; + case GX_TEV_KASEL_K2_R: + case GX_TEV_KASEL_K2_G: + case GX_TEV_KASEL_K2_B: + case GX_TEV_KASEL_K2_A: + info.sampledKColors.set(2); + break; + case GX_TEV_KASEL_K3_R: + case GX_TEV_KASEL_K3_G: + case GX_TEV_KASEL_K3_B: + case GX_TEV_KASEL_K3_A: + info.sampledKColors.set(3); + break; + default: + break; + } + break; + default: + break; + } +} + +static std::string alpha_arg_reg(GXTevAlphaArg arg, size_t stageIdx, const ShaderConfig& config, + const TevStage& stage) { + switch (arg) { + case GX_CA_APREV: + return "prev.a"; + case GX_CA_A0: + return "tevreg0.a"; + case GX_CA_A1: + return "tevreg1.a"; + case GX_CA_A2: + return "tevreg2.a"; + case GX_CA_TEXA: { + if (stage.texMapId == GX_TEXMAP_NULL) { + Log.report(LOG_FATAL, FMT_STRING("unmapped texture for stage {}"), stageIdx); + unreachable(); + } else if (stage.texMapId < GX_TEXMAP0 || stage.texMapId > GX_TEXMAP7) { + Log.report(LOG_FATAL, FMT_STRING("invalid texture {} for stage {}"), stage.texMapId, stageIdx); + unreachable(); + } + const auto& swap = config.tevSwapTable[stage.tevSwapTex]; + return fmt::format(FMT_STRING("sampled{}.{}"), stageIdx, chan_comp(swap.alpha)); + } + case GX_CA_RASA: { + if (stage.channelId == GX_COLOR_NULL) { + Log.report(LOG_FATAL, FMT_STRING("unmapped color channel for stage {}"), stageIdx); + unreachable(); + } else if (stage.channelId == GX_COLOR_ZERO) { + return "0.0"; + } else if (stage.channelId < GX_COLOR0A0 || stage.channelId > GX_COLOR1A1) { + Log.report(LOG_FATAL, FMT_STRING("invalid color channel {} for stage {}"), stage.channelId, stageIdx); + unreachable(); + } + u32 idx = stage.channelId - GX_COLOR0A0; + const auto& swap = config.tevSwapTable[stage.tevSwapRas]; + return fmt::format(FMT_STRING("rast{}.{}"), idx, chan_comp(swap.alpha)); + } + case GX_CA_KONST: { + switch (stage.kaSel) { + case GX_TEV_KASEL_8_8: + return "1.0"; + case GX_TEV_KASEL_7_8: + return "(7.0/8.0)"; + case GX_TEV_KASEL_6_8: + return "(6.0/8.0)"; + case GX_TEV_KASEL_5_8: + return "(5.0/8.0)"; + case GX_TEV_KASEL_4_8: + return "(4.0/8.0)"; + case GX_TEV_KASEL_3_8: + return "(3.0/8.0)"; + case GX_TEV_KASEL_2_8: + return "(2.0/8.0)"; + case GX_TEV_KASEL_1_8: + return "(1.0/8.0)"; + case GX_TEV_KASEL_K0_R: + return "ubuf.kcolor0.r"; + case GX_TEV_KASEL_K1_R: + return "ubuf.kcolor1.r"; + case GX_TEV_KASEL_K2_R: + return "ubuf.kcolor2.r"; + case GX_TEV_KASEL_K3_R: + return "ubuf.kcolor3.r"; + case GX_TEV_KASEL_K0_G: + return "ubuf.kcolor0.g"; + case GX_TEV_KASEL_K1_G: + return "ubuf.kcolor1.g"; + case GX_TEV_KASEL_K2_G: + return "ubuf.kcolor2.g"; + case GX_TEV_KASEL_K3_G: + return "ubuf.kcolor3.g"; + case GX_TEV_KASEL_K0_B: + return "ubuf.kcolor0.b"; + case GX_TEV_KASEL_K1_B: + return "ubuf.kcolor1.b"; + case 
GX_TEV_KASEL_K2_B: + return "ubuf.kcolor2.b"; + case GX_TEV_KASEL_K3_B: + return "ubuf.kcolor3.b"; + case GX_TEV_KASEL_K0_A: + return "ubuf.kcolor0.a"; + case GX_TEV_KASEL_K1_A: + return "ubuf.kcolor1.a"; + case GX_TEV_KASEL_K2_A: + return "ubuf.kcolor2.a"; + case GX_TEV_KASEL_K3_A: + return "ubuf.kcolor3.a"; + default: + Log.report(LOG_FATAL, FMT_STRING("invalid kaSel {}"), stage.kaSel); + unreachable(); + } + } + case GX_CA_ZERO: + return "0.0"; + default: + Log.report(LOG_FATAL, FMT_STRING("invalid alpha arg {}"), arg); + unreachable(); + } +} + +static std::string_view tev_op(GXTevOp op) { + switch (op) { + case GX_TEV_ADD: + return ""sv; + case GX_TEV_SUB: + return "-"sv; + default: + Log.report(LOG_FATAL, FMT_STRING("TODO {}"), op); + unreachable(); + } +} + +static std::string_view tev_bias(GXTevBias bias) { + switch (bias) { + case GX_TB_ZERO: + return ""sv; + case GX_TB_ADDHALF: + return " + 0.5"sv; + case GX_TB_SUBHALF: + return " - 0.5"sv; + default: + Log.report(LOG_FATAL, FMT_STRING("invalid bias {}"), bias); + unreachable(); + } +} + +static std::string alpha_compare(GXCompare comp, u8 ref, bool& valid) { + const float fref = ref / 255.f; + switch (comp) { + case GX_NEVER: + return "false"s; + case GX_LESS: + return fmt::format(FMT_STRING("(prev.a < {}f)"), fref); + case GX_LEQUAL: + return fmt::format(FMT_STRING("(prev.a <= {}f)"), fref); + case GX_EQUAL: + return fmt::format(FMT_STRING("(prev.a == {}f)"), fref); + case GX_NEQUAL: + return fmt::format(FMT_STRING("(prev.a != {}f)"), fref); + case GX_GEQUAL: + return fmt::format(FMT_STRING("(prev.a >= {}f)"), fref); + case GX_GREATER: + return fmt::format(FMT_STRING("(prev.a > {}f)"), fref); + case GX_ALWAYS: + valid = false; + return "true"s; + default: + Log.report(LOG_FATAL, FMT_STRING("invalid compare {}"), comp); + unreachable(); + } +} + +static std::string_view tev_scale(GXTevScale scale) { + switch (scale) { + case GX_CS_SCALE_1: + return ""sv; + case GX_CS_SCALE_2: + return " * 2.0"sv; + case GX_CS_SCALE_4: + return " * 4.0"sv; + case GX_CS_DIVIDE_2: + return " / 2.0"sv; + default: + Log.report(LOG_FATAL, FMT_STRING("invalid scale {}"), scale); + unreachable(); + } +} + +static inline std::string vtx_attr(const ShaderConfig& config, GXAttr attr) { + const auto type = config.vtxAttrs[attr]; + if (type == GX_NONE) { + if (attr == GX_VA_NRM) { + // Default normal + return "vec3(1.0, 0.0, 0.0)"s; + } + Log.report(LOG_FATAL, FMT_STRING("unmapped attr {}"), attr); + unreachable(); + } + if (attr == GX_VA_POS) { + return "in_pos"s; + } + if (attr == GX_VA_NRM) { + return "in_nrm"s; + } + if (attr == GX_VA_CLR0 || attr == GX_VA_CLR1) { + const auto idx = attr - GX_VA_CLR0; + return fmt::format(FMT_STRING("in_clr{}"), idx); + } + if (attr >= GX_VA_TEX0 && attr <= GX_VA_TEX7) { + const auto idx = attr - GX_VA_TEX0; + return fmt::format(FMT_STRING("in_tex{}_uv"), idx); + } + Log.report(LOG_FATAL, FMT_STRING("unhandled attr {}"), attr); + unreachable(); +} + +static inline std::string texture_conversion(const TextureConfig& tex, u32 stageIdx, u32 texMapId) { + std::string out; + if (tex.renderTex) + switch (tex.copyFmt) { + default: + break; + case GX_TF_RGB565: + // Set alpha channel to 1.0 + out += fmt::format(FMT_STRING("\n sampled{0}.a = 1.0;"), stageIdx); + break; + case GX_TF_I4: + case GX_TF_I8: + // FIXME HACK + if (!is_palette_format(tex.loadFmt)) { + // Perform intensity conversion + out += fmt::format(FMT_STRING("\n sampled{0} = vec4(intensityF32(sampled{0}.rgb), 0.f, 0.f, 1.f);"), + stageIdx); + } + break; + } + 
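// The intensity conversion referenced above, restated as plain C++ (a sketch;
// it mirrors the intensityF32 helper emitted into the generated WGSL further
// down, which follows Dolphin's RGB-to-intensity coefficients):
inline float rgb_to_intensity(float r, float g, float b) {
  return 0.257f * r + 0.504f * g + 0.098f * b + 16.f / 255.f;
}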
switch (tex.loadFmt) { + default: + break; + case GX_TF_I4: + case GX_TF_I8: + // Splat R to RGBA + out += fmt::format(FMT_STRING("\n sampled{0} = vec4(sampled{0}.r);"), stageIdx); + break; + } + return out; +} + +constexpr std::array TevColorArgNames{ + "CPREV"sv, "APREV"sv, "C0"sv, "A0"sv, "C1"sv, "A1"sv, "C2"sv, "A2"sv, + "TEXC"sv, "TEXA"sv, "RASC"sv, "RASA"sv, "ONE"sv, "HALF"sv, "KONST"sv, "ZERO"sv, +}; +constexpr std::array TevAlphaArgNames{ + "APREV"sv, "A0"sv, "A1"sv, "A2"sv, "TEXA"sv, "RASA"sv, "KONST"sv, "ZERO"sv, +}; + +constexpr std::array VtxAttributeNames{ + "pn_mtx", "tex0_mtx", "tex1_mtx", "tex2_mtx", "tex3_mtx", "tex4_mtx", "tex5_mtx", + "tex6_mtx", "tex7_mtx", "pos", "nrm", "clr0", "clr1", "tex0_uv", + "tex1_uv", "tex2_uv", "tex3_uv", "tex4_uv", "tex5_uv", "tex6_uv", "tex7_uv", + "pos_mtx_array", "nrm_mtx_array", "tex_mtx_array", "light_array", "nbt", +}; + +ShaderInfo build_shader_info(const ShaderConfig& config) noexcept { + // const auto hash = xxh3_hash(config); + // const auto it = g_gxCachedShaders.find(hash); + // if (it != g_gxCachedShaders.end()) { + // return it->second.second; + // } + + ShaderInfo info{ + .uniformSize = 64 * 3, // mv, mvInv, proj + }; + for (int i = 0; i < config.tevStageCount; ++i) { + const auto& stage = config.tevStages[i]; + // Color pass + color_arg_reg_info(stage.colorPass.a, stage, info); + color_arg_reg_info(stage.colorPass.b, stage, info); + color_arg_reg_info(stage.colorPass.c, stage, info); + color_arg_reg_info(stage.colorPass.d, stage, info); + info.writesTevReg.set(stage.colorOp.outReg); + + // Alpha pass + alpha_arg_reg_info(stage.alphaPass.a, stage, info); + alpha_arg_reg_info(stage.alphaPass.b, stage, info); + alpha_arg_reg_info(stage.alphaPass.c, stage, info); + alpha_arg_reg_info(stage.alphaPass.d, stage, info); + if (!info.writesTevReg.test(stage.alphaOp.outReg)) { + // If we're writing alpha to a register that's not been + // written to in the shader, load from uniform buffer + info.loadsTevReg.set(stage.alphaOp.outReg); + info.writesTevReg.set(stage.alphaOp.outReg); + } + } + info.uniformSize += info.loadsTevReg.count() * 16; + bool lightingEnabled = false; + for (int i = 0; i < info.sampledColorChannels.size(); ++i) { + if (info.sampledColorChannels.test(i)) { + const auto& cc = config.colorChannels[i * 2]; + const auto& cca = config.colorChannels[i * 2 + 1]; + if (cc.lightingEnabled || cca.lightingEnabled) { + lightingEnabled = true; + } + } + } + if (lightingEnabled) { + // Lights + light state for all channels + info.uniformSize += 16 + (80 * GX::MaxLights); + } + for (int i = 0; i < info.sampledColorChannels.size(); ++i) { + if (info.sampledColorChannels.test(i)) { + const auto& cc = config.colorChannels[i * 2]; + if (cc.lightingEnabled && cc.ambSrc == GX_SRC_REG) { + info.uniformSize += 16; + } + if (cc.matSrc == GX_SRC_REG) { + info.uniformSize += 16; + } + const auto& cca = config.colorChannels[i * 2 + 1]; + if (cca.lightingEnabled && cca.ambSrc == GX_SRC_REG) { + info.uniformSize += 16; + } + if (cca.matSrc == GX_SRC_REG) { + info.uniformSize += 16; + } + } + } + info.uniformSize += info.sampledKColors.count() * 16; + for (int i = 0; i < info.sampledTexCoords.size(); ++i) { + if (!info.sampledTexCoords.test(i)) { + continue; + } + const auto& tcg = config.tcgs[i]; + if (tcg.mtx != GX_IDENTITY) { + u32 texMtxIdx = (tcg.mtx - GX_TEXMTX0) / 3; + info.usesTexMtx.set(texMtxIdx); + info.texMtxTypes[texMtxIdx] = tcg.type; + } + if (tcg.postMtx != GX_PTIDENTITY) { + u32 postMtxIdx = (tcg.postMtx - GX_PTTEXMTX0) / 3; + 
info.usesPTTexMtx.set(postMtxIdx); + } + } + for (int i = 0; i < info.usesTexMtx.size(); ++i) { + if (info.usesTexMtx.test(i)) { + switch (info.texMtxTypes[i]) { + case GX_TG_MTX2x4: + info.uniformSize += 32; + break; + case GX_TG_MTX3x4: + info.uniformSize += 64; + break; + default: + break; + } + } + } + info.uniformSize += info.usesPTTexMtx.count() * 64; + if (config.fogType != GX_FOG_NONE) { + info.usesFog = true; + info.uniformSize += 32; + } + info.uniformSize += info.sampledTextures.count() * 4; + info.uniformSize = align_uniform(info.uniformSize); + return info; +} + +WGPUShaderModule build_shader(const ShaderConfig& config, const ShaderInfo& info) noexcept { + const auto hash = xxh3_hash(config); + const auto it = g_gxCachedShaders.find(hash); + if (it != g_gxCachedShaders.end()) { +#ifndef NDEBUG + if (g_gxCachedShaderConfigs[hash] != config) { + Log.report(LOG_FATAL, FMT_STRING("Shader collision!")); + unreachable(); + } +#endif + return it->second.first; + } + + if (EnableDebugPrints) { + Log.report(LOG_INFO, FMT_STRING("Shader config (hash {:x}):"), hash); + { + for (int i = 0; i < config.tevStageCount; ++i) { + const auto& stage = config.tevStages[i]; + Log.report(LOG_INFO, FMT_STRING(" tevStages[{}]:"), i); + Log.report(LOG_INFO, FMT_STRING(" color_a: {}"), TevColorArgNames[stage.colorPass.a]); + Log.report(LOG_INFO, FMT_STRING(" color_b: {}"), TevColorArgNames[stage.colorPass.b]); + Log.report(LOG_INFO, FMT_STRING(" color_c: {}"), TevColorArgNames[stage.colorPass.c]); + Log.report(LOG_INFO, FMT_STRING(" color_d: {}"), TevColorArgNames[stage.colorPass.d]); + Log.report(LOG_INFO, FMT_STRING(" alpha_a: {}"), TevAlphaArgNames[stage.alphaPass.a]); + Log.report(LOG_INFO, FMT_STRING(" alpha_b: {}"), TevAlphaArgNames[stage.alphaPass.b]); + Log.report(LOG_INFO, FMT_STRING(" alpha_c: {}"), TevAlphaArgNames[stage.alphaPass.c]); + Log.report(LOG_INFO, FMT_STRING(" alpha_d: {}"), TevAlphaArgNames[stage.alphaPass.d]); + Log.report(LOG_INFO, FMT_STRING(" color_op_clamp: {}"), stage.colorOp.clamp); + Log.report(LOG_INFO, FMT_STRING(" color_op_op: {}"), stage.colorOp.op); + Log.report(LOG_INFO, FMT_STRING(" color_op_bias: {}"), stage.colorOp.bias); + Log.report(LOG_INFO, FMT_STRING(" color_op_scale: {}"), stage.colorOp.scale); + Log.report(LOG_INFO, FMT_STRING(" color_op_reg_id: {}"), stage.colorOp.outReg); + Log.report(LOG_INFO, FMT_STRING(" alpha_op_clamp: {}"), stage.alphaOp.clamp); + Log.report(LOG_INFO, FMT_STRING(" alpha_op_op: {}"), stage.alphaOp.op); + Log.report(LOG_INFO, FMT_STRING(" alpha_op_bias: {}"), stage.alphaOp.bias); + Log.report(LOG_INFO, FMT_STRING(" alpha_op_scale: {}"), stage.alphaOp.scale); + Log.report(LOG_INFO, FMT_STRING(" alpha_op_reg_id: {}"), stage.alphaOp.outReg); + Log.report(LOG_INFO, FMT_STRING(" kc_sel: {}"), stage.kcSel); + Log.report(LOG_INFO, FMT_STRING(" ka_sel: {}"), stage.kaSel); + Log.report(LOG_INFO, FMT_STRING(" texCoordId: {}"), stage.texCoordId); + Log.report(LOG_INFO, FMT_STRING(" texMapId: {}"), stage.texMapId); + Log.report(LOG_INFO, FMT_STRING(" channelId: {}"), stage.channelId); + } + for (int i = 0; i < config.colorChannels.size(); ++i) { + const auto& chan = config.colorChannels[i]; + Log.report(LOG_INFO, FMT_STRING(" colorChannels[{}]: enabled {} mat {} amb {}"), i, chan.lightingEnabled, + chan.matSrc, chan.ambSrc); + } + for (int i = 0; i < config.tcgs.size(); ++i) { + const auto& tcg = config.tcgs[i]; + if (tcg.src != GX_MAX_TEXGENSRC) { + Log.report(LOG_INFO, FMT_STRING(" tcg[{}]: src {} mtx {} post {} type {} norm {}"), i, tcg.src, 
tcg.mtx, + tcg.postMtx, tcg.type, tcg.normalize); + } + } + Log.report(LOG_INFO, FMT_STRING(" alphaCompare: comp0 {} ref0 {} op {} comp1 {} ref1 {}"), + config.alphaCompare.comp0, config.alphaCompare.ref0, config.alphaCompare.op, config.alphaCompare.comp1, + config.alphaCompare.ref1); + Log.report(LOG_INFO, FMT_STRING(" indexedAttributeCount: {}"), config.indexedAttributeCount); + Log.report(LOG_INFO, FMT_STRING(" fogType: {}"), config.fogType); + } + } + + std::string uniformPre; + std::string uniBufAttrs; + std::string uniformBindings; + std::string sampBindings; + std::string texBindings; + std::string vtxOutAttrs; + std::string vtxInAttrs; + std::string vtxXfrAttrsPre; + std::string vtxXfrAttrs; + size_t locIdx = 0; + size_t vtxOutIdx = 0; + size_t uniBindingIdx = 1; + if (config.indexedAttributeCount > 0) { + // Display list attributes + int currAttrIdx = 0; + for (GXAttr attr{}; attr < MaxVtxAttr; attr = GXAttr(attr + 1)) { + // Indexed attributes + if (config.vtxAttrs[attr] != GX_INDEX8 && config.vtxAttrs[attr] != GX_INDEX16) { + continue; + } + const auto [div, rem] = std::div(currAttrIdx, 4); + std::string_view attrName; + bool addUniformBinding = true; + if (config.attrMapping[attr] != attr) { + attrName = VtxAttributeNames[config.attrMapping[attr]]; + addUniformBinding = false; + } else { + attrName = VtxAttributeNames[attr]; + } + vtxXfrAttrsPre += + fmt::format(FMT_STRING("\n var {} = v_arr_{}[in_dl{}[{}]];"), vtx_attr(config, attr), attrName, div, rem); + if (addUniformBinding) { + std::string_view arrType; + if (attr == GX_VA_POS || attr == GX_VA_NRM) { + arrType = "vec3"; + } else if (attr >= GX_VA_TEX0 && attr <= GX_VA_TEX7) { + arrType = "vec2"; + } + uniformBindings += fmt::format(FMT_STRING("\n@group(0) @binding({})" + "\nvar v_arr_{}: array<{}>;"), + uniBindingIdx++, attrName, arrType); + } + ++currAttrIdx; + } + auto [num4xAttrArrays, rem] = std::div(currAttrIdx, 4); + u32 num2xAttrArrays = 0; + if (rem > 2) { + ++num4xAttrArrays; + } else if (rem > 0) { + num2xAttrArrays = 1; + } + for (u32 i = 0; i < num4xAttrArrays; ++i) { + if (locIdx > 0) { + vtxInAttrs += "\n , "; + } else { + vtxInAttrs += "\n "; + } + vtxInAttrs += fmt::format(FMT_STRING("@location({}) in_dl{}: vec4"), locIdx++, i); + } + for (u32 i = 0; i < num2xAttrArrays; ++i) { + if (locIdx > 0) { + vtxInAttrs += "\n , "; + } else { + vtxInAttrs += "\n "; + } + vtxInAttrs += fmt::format(FMT_STRING("@location({}) in_dl{}: vec2"), locIdx++, num4xAttrArrays + i); + } + } + for (GXAttr attr{}; attr < MaxVtxAttr; attr = GXAttr(attr + 1)) { + // Direct attributes + if (config.vtxAttrs[attr] != GX_DIRECT) { + continue; + } + if (locIdx > 0) { + vtxInAttrs += "\n , "; + } else { + vtxInAttrs += "\n "; + } + if (attr == GX_VA_POS) { + vtxInAttrs += fmt::format(FMT_STRING("@location({}) in_pos: vec3"), locIdx++); + } else if (attr == GX_VA_NRM) { + vtxInAttrs += fmt::format(FMT_STRING("@location({}) in_nrm: vec3"), locIdx++); + } else if (attr == GX_VA_CLR0 || attr == GX_VA_CLR1) { + vtxInAttrs += fmt::format(FMT_STRING("@location({}) in_clr{}: vec4"), locIdx++, attr - GX_VA_CLR0); + } else if (attr >= GX_VA_TEX0 && attr <= GX_VA_TEX7) { + vtxInAttrs += fmt::format(FMT_STRING("@location({}) in_tex{}_uv: vec2"), locIdx++, attr - GX_VA_TEX0); + } + } + vtxXfrAttrsPre += fmt::format(FMT_STRING("\n var mv_pos = ubuf.pos_mtx * vec4({}, 1.0);" + "\n var mv_nrm = ubuf.nrm_mtx * vec4({}, 0.0);" + "\n out.pos = ubuf.proj * vec4(mv_pos, 1.0);"), + vtx_attr(config, GX_VA_POS), vtx_attr(config, GX_VA_NRM)); + if constexpr 
(EnableNormalVisualization) { + vtxOutAttrs += fmt::format(FMT_STRING("\n @location({}) nrm: vec3,"), vtxOutIdx++); + vtxXfrAttrsPre += "\n out.nrm = mv_nrm;"; + } + + std::string fragmentFnPre; + std::string fragmentFn; + for (u32 idx = 0; idx < config.tevStageCount; ++idx) { + const auto& stage = config.tevStages[idx]; + { + std::string outReg; + switch (stage.colorOp.outReg) { + case GX_TEVPREV: + outReg = "prev"; + break; + case GX_TEVREG0: + outReg = "tevreg0"; + break; + case GX_TEVREG1: + outReg = "tevreg1"; + break; + case GX_TEVREG2: + outReg = "tevreg2"; + break; + default: + Log.report(LOG_FATAL, FMT_STRING("invalid colorOp outReg {}"), stage.colorOp.outReg); + } + std::string op = fmt::format( + FMT_STRING("(({4}mix({0}, {1}, {2}) + {3}){5}){6}"), color_arg_reg(stage.colorPass.a, idx, config, stage), + color_arg_reg(stage.colorPass.b, idx, config, stage), color_arg_reg(stage.colorPass.c, idx, config, stage), + color_arg_reg(stage.colorPass.d, idx, config, stage), tev_op(stage.colorOp.op), tev_bias(stage.colorOp.bias), + tev_scale(stage.colorOp.scale)); + if (stage.colorOp.clamp) { + op = fmt::format(FMT_STRING("clamp({}, vec3(0.0), vec3(1.0))"), op); + } + fragmentFn += + fmt::format(FMT_STRING("\n // TEV stage {2}\n {0} = vec4({1}, {0}.a);"), outReg, op, idx); + } + { + std::string outReg; + switch (stage.alphaOp.outReg) { + case GX_TEVPREV: + outReg = "prev.a"; + break; + case GX_TEVREG0: + outReg = "tevreg0.a"; + break; + case GX_TEVREG1: + outReg = "tevreg1.a"; + break; + case GX_TEVREG2: + outReg = "tevreg2.a"; + break; + default: + Log.report(LOG_FATAL, FMT_STRING("invalid alphaOp outReg {}"), stage.alphaOp.outReg); + } + std::string op = fmt::format( + FMT_STRING("(({4}mix({0}, {1}, {2}) + {3}){5}){6}"), alpha_arg_reg(stage.alphaPass.a, idx, config, stage), + alpha_arg_reg(stage.alphaPass.b, idx, config, stage), alpha_arg_reg(stage.alphaPass.c, idx, config, stage), + alpha_arg_reg(stage.alphaPass.d, idx, config, stage), tev_op(stage.alphaOp.op), tev_bias(stage.alphaOp.bias), + tev_scale(stage.alphaOp.scale)); + if (stage.alphaOp.clamp) { + op = fmt::format(FMT_STRING("clamp({}, 0.0, 1.0)"), op); + } + fragmentFn += fmt::format(FMT_STRING("\n {0} = {1};"), outReg, op); + } + } + if (info.loadsTevReg.test(0)) { + uniBufAttrs += "\n tevprev: vec4,"; + fragmentFnPre += "\n var prev = ubuf.tevprev;"; + } else { + fragmentFnPre += "\n var prev: vec4;"; + } + for (int i = 1 /* Skip TEVPREV */; i < info.loadsTevReg.size(); ++i) { + if (info.loadsTevReg.test(i)) { + uniBufAttrs += fmt::format(FMT_STRING("\n tevreg{}: vec4,"), i - 1); + fragmentFnPre += fmt::format(FMT_STRING("\n var tevreg{0} = ubuf.tevreg{0};"), i - 1); + } else if (info.writesTevReg.test(i)) { + fragmentFnPre += fmt::format(FMT_STRING("\n var tevreg{0}: vec4;"), i - 1); + } + } + bool addedLightStruct = false; + int vtxColorIdx = 0; + for (int i = 0; i < info.sampledColorChannels.size(); ++i) { + if (!info.sampledColorChannels.test(i)) { + continue; + } + const auto& cc = config.colorChannels[i * 2]; + const auto& cca = config.colorChannels[i * 2 + 1]; + + if (!addedLightStruct && (cc.lightingEnabled || cca.lightingEnabled)) { + uniBufAttrs += fmt::format(FMT_STRING("\n lights: array," + "\n lightState0: u32," + "\n lightState0a: u32," + "\n lightState1: u32," + "\n lightState1a: u32,"), + GX::MaxLights); + uniformPre += + "\n" + "struct Light {\n" + " pos: vec3,\n" + " dir: vec3,\n" + " color: vec4,\n" + " cos_att: vec3,\n" + " dist_att: vec3,\n" + "};"; + if (UsePerPixelLighting) { + vtxOutAttrs += 
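// What the combiner format string above expands to, written as scalar C++ for
// clarity (a sketch; the generated WGSL operates on vec3/f32 values):
// out = (d ± mix(a, b, c) + bias) * scale, optionally clamped to [0, 1].
inline float tev_combine(float a, float b, float c, float d, bool subtract, float bias, float scale,
                         bool clampResult) {
  const float result = (d + (subtract ? -1.f : 1.f) * std::lerp(a, b, c) + bias) * scale;
  return clampResult ? std::clamp(result, 0.f, 1.f) : result;
}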
fmt::format(FMT_STRING("\n @location({}) mv_pos: vec3,"), vtxOutIdx++); + vtxOutAttrs += fmt::format(FMT_STRING("\n @location({}) mv_nrm: vec3,"), vtxOutIdx++); + vtxXfrAttrs += fmt::format(FMT_STRING(R"""( + out.mv_pos = mv_pos; + out.mv_nrm = mv_nrm;)""")); + } + addedLightStruct = true; + } + + if (cc.lightingEnabled && cc.ambSrc == GX_SRC_REG) { + uniBufAttrs += fmt::format(FMT_STRING("\n cc{0}_amb: vec4,"), i); + } + if (cc.matSrc == GX_SRC_REG) { + uniBufAttrs += fmt::format(FMT_STRING("\n cc{0}_mat: vec4,"), i); + } + if (cca.lightingEnabled && cca.ambSrc == GX_SRC_REG) { + uniBufAttrs += fmt::format(FMT_STRING("\n cc{0}a_amb: vec4,"), i); + } + if (cca.matSrc == GX_SRC_REG) { + uniBufAttrs += fmt::format(FMT_STRING("\n cc{0}a_mat: vec4,"), i); + } + + // Output vertex color if necessary + bool usesVtxColor = false; + if (((cc.lightingEnabled && cc.ambSrc == GX_SRC_VTX) || cc.matSrc == GX_SRC_VTX || + (cca.lightingEnabled && cca.matSrc == GX_SRC_VTX) || cca.matSrc == GX_SRC_VTX)) { + if (UsePerPixelLighting) { + vtxOutAttrs += fmt::format(FMT_STRING("\n @location({}) clr{}: vec4,"), vtxOutIdx++, vtxColorIdx); + vtxXfrAttrs += fmt::format(FMT_STRING("\n out.clr{} = {};"), vtxColorIdx, + vtx_attr(config, static_cast(GX_VA_CLR0 + vtxColorIdx))); + } + usesVtxColor = true; + } + + // TODO handle alpha lighting + if (cc.lightingEnabled) { + std::string ambSrc, matSrc, lightAttnFn, lightDiffFn; + if (cc.ambSrc == GX_SRC_VTX) { + if (UsePerPixelLighting) { + ambSrc = fmt::format(FMT_STRING("in.clr{}"), vtxColorIdx); + } else { + ambSrc = vtx_attr(config, static_cast(GX_VA_CLR0 + vtxColorIdx)); + } + } else if (cc.ambSrc == GX_SRC_REG) { + ambSrc = fmt::format(FMT_STRING("ubuf.cc{0}_amb"), i); + } + if (cc.matSrc == GX_SRC_VTX) { + if (UsePerPixelLighting) { + matSrc = fmt::format(FMT_STRING("in.clr{}"), vtxColorIdx); + } else { + matSrc = vtx_attr(config, static_cast(GX_VA_CLR0 + vtxColorIdx)); + } + } else if (cc.matSrc == GX_SRC_REG) { + matSrc = fmt::format(FMT_STRING("ubuf.cc{0}_mat"), i); + } + GXDiffuseFn diffFn = cc.diffFn; + if (cc.attnFn == GX_AF_NONE) { + lightAttnFn = "attn = 1.0;"; + } else if (cc.attnFn == GX_AF_SPOT) { + lightAttnFn = fmt::format(FMT_STRING(R"""( + var cosine = max(0.0, dot(ldir, light.dir)); + var cos_attn = dot(light.cos_att, vec3(1.0, cosine, cosine * cosine)); + var dist_attn = dot(light.dist_att, vec3(1.0, dist, dist2)); + attn = max(0.0, cos_attn / dist_attn);)""")); + } else if (cc.attnFn == GX_AF_SPEC) { + diffFn = GX_DF_NONE; + Log.report(LOG_FATAL, FMT_STRING("AF_SPEC unimplemented")); + } + if (diffFn == GX_DF_NONE) { + lightDiffFn = "1.0"; + } else if (diffFn == GX_DF_SIGN) { + if (UsePerPixelLighting) { + lightDiffFn = "dot(ldir, in.mv_nrm)"; + } else { + lightDiffFn = "dot(ldir, mv_nrm)"; + } + } else if (diffFn == GX_DF_CLAMP) { + if (UsePerPixelLighting) { + lightDiffFn = "max(0.0, dot(ldir, in.mv_nrm))"; + } else { + lightDiffFn = "max(0.0, dot(ldir, mv_nrm))"; + } + } + std::string outVar, posVar; + if (UsePerPixelLighting) { + outVar = fmt::format(FMT_STRING("rast{}"), i); + posVar = "in.mv_pos"; + } else { + outVar = fmt::format(FMT_STRING("out.cc{}"), i); + posVar = "mv_pos"; + } + auto lightFunc = fmt::format(FMT_STRING(R"""( + {{ + var lighting = {5}; + for (var i = 0u; i < {1}u; i++) {{ + if ((ubuf.lightState{0} & (1u << i)) == 0u) {{ continue; }} + var light = ubuf.lights[i]; + var ldir = light.pos - {7}; + var dist2 = dot(ldir, ldir); + var dist = sqrt(dist2); + ldir = ldir / dist; + var attn: f32;{2} + var diff = {3}; + lighting = 
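// The per-light math inside the WGSL loop above, restated as C++ for one light
// (a sketch; Vec3f is a stand-in struct): GX_AF_SPOT attenuation evaluates the
// cosine and distance quadratics stored in cos_att / dist_att, and the diffuse
// term is dot(ldir, normal), clamped for GX_DF_CLAMP. The loop accumulates
// attn * diff * light.color, and the material color is finally scaled by
// clamp(lighting, 0, 1).
struct Vec3f {
  float x, y, z;
};
inline float spot_attenuation(const Vec3f& cosAtt, const Vec3f& distAtt, float cosine, float dist) {
  const float cosAttn = cosAtt.x + cosAtt.y * cosine + cosAtt.z * cosine * cosine;
  const float distAttn = distAtt.x + distAtt.y * dist + distAtt.z * dist * dist;
  return std::max(0.f, cosAttn / distAttn);
}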
lighting + (attn * diff * light.color); + }} + // TODO alpha lighting + {6} = vec4(({4} * clamp(lighting, vec4(0.0), vec4(1.0))).xyz, {4}.a); + }})"""), + i, GX::MaxLights, lightAttnFn, lightDiffFn, matSrc, ambSrc, outVar, posVar); + if (UsePerPixelLighting) { + fragmentFnPre += fmt::format(FMT_STRING("\n var rast{}: vec4;"), i); + fragmentFnPre += lightFunc; + } else { + vtxOutAttrs += fmt::format(FMT_STRING("\n @location({}) cc{}: vec4,"), vtxOutIdx++, i); + vtxXfrAttrs += lightFunc; + fragmentFnPre += fmt::format(FMT_STRING("\n var rast{0} = in.cc{0};"), i); + } + } else if (cc.matSrc == GX_SRC_VTX) { + if (UsePerPixelLighting) { + // Color will already be written to clr{} + fragmentFnPre += fmt::format(FMT_STRING("\n var rast{0} = in.clr{0};"), vtxColorIdx); + } else { + vtxOutAttrs += fmt::format(FMT_STRING("\n @location({}) cc{}: vec4,"), vtxOutIdx++, i); + vtxXfrAttrs += + fmt::format(FMT_STRING("\n out.cc{} = {};"), i, vtx_attr(config, GXAttr(GX_VA_CLR0 + vtxColorIdx))); + fragmentFnPre += fmt::format(FMT_STRING("\n var rast{0} = in.cc{0};"), i); + } + } else { + fragmentFnPre += fmt::format(FMT_STRING("\n var rast{0} = ubuf.cc{0}_mat;"), i); + } + + if (usesVtxColor) { + ++vtxColorIdx; + } + } + for (int i = 0; i < info.sampledKColors.size(); ++i) { + if (info.sampledKColors.test(i)) { + uniBufAttrs += fmt::format(FMT_STRING("\n kcolor{}: vec4,"), i); + } + } + for (int i = 0; i < info.sampledTexCoords.size(); ++i) { + if (!info.sampledTexCoords.test(i)) { + continue; + } + const auto& tcg = config.tcgs[i]; + vtxOutAttrs += fmt::format(FMT_STRING("\n @location({}) tex{}_uv: vec2,"), vtxOutIdx++, i); + if (tcg.src >= GX_TG_TEX0 && tcg.src <= GX_TG_TEX7) { + vtxXfrAttrs += fmt::format(FMT_STRING("\n var tc{} = vec4({}, 0.0, 1.0);"), i, + vtx_attr(config, GXAttr(GX_VA_TEX0 + (tcg.src - GX_TG_TEX0)))); + } else if (tcg.src == GX_TG_POS) { + vtxXfrAttrs += fmt::format(FMT_STRING("\n var tc{} = vec4(in_pos, 1.0);"), i); + } else if (tcg.src == GX_TG_NRM) { + vtxXfrAttrs += fmt::format(FMT_STRING("\n var tc{} = vec4(in_nrm, 1.0);"), i); + } else { + Log.report(LOG_FATAL, FMT_STRING("unhandled tcg src {} for "), tcg.src); + unreachable(); + } + if (tcg.mtx == GX_IDENTITY) { + vtxXfrAttrs += fmt::format(FMT_STRING("\n var tc{0}_tmp = tc{0}.xyz;"), i); + } else { + u32 texMtxIdx = (tcg.mtx - GX_TEXMTX0) / 3; + vtxXfrAttrs += fmt::format(FMT_STRING("\n var tc{0}_tmp = ubuf.texmtx{1} * tc{0};"), i, texMtxIdx); + } + if (tcg.normalize) { + vtxXfrAttrs += fmt::format(FMT_STRING("\n tc{0}_tmp = normalize(tc{0}_tmp);"), i); + } + if (tcg.postMtx == GX_PTIDENTITY) { + vtxXfrAttrs += fmt::format(FMT_STRING("\n var tc{0}_proj = tc{0}_tmp;"), i); + } else { + u32 postMtxIdx = (tcg.postMtx - GX_PTTEXMTX0) / 3; + vtxXfrAttrs += fmt::format(FMT_STRING("\n var tc{0}_proj = ubuf.postmtx{1} * vec4(tc{0}_tmp.xyz, 1.0);"), + i, postMtxIdx); + } + vtxXfrAttrs += fmt::format(FMT_STRING("\n out.tex{0}_uv = tc{0}_proj.xy;"), i); + } + for (int i = 0; i < config.tevStages.size(); ++i) { + const auto& stage = config.tevStages[i]; + if (stage.texMapId == GX_TEXMAP_NULL || + stage.texCoordId == GX_TEXCOORD_NULL + // TODO should check this per-stage probably + || !info.sampledTextures.test(stage.texMapId)) { + continue; + } + std::string uvIn = fmt::format(FMT_STRING("in.tex{0}_uv"), stage.texCoordId); + const auto& texConfig = config.textureConfig[stage.texMapId]; + if (is_palette_format(texConfig.loadFmt)) { + std::string_view suffix; + if (!is_palette_format(texConfig.copyFmt)) { + switch (texConfig.loadFmt) { + 
case GX_TF_C4: + suffix = "I4"sv; + break; + // case GX_TF_C8: + // suffix = "I8"; + // break; + // case GX_TF_C14X2: + // suffix = "I14X2"; + // break; + default: + Log.report(LOG_FATAL, FMT_STRING("Unsupported palette format {}"), texConfig.loadFmt); + unreachable(); + } + } + fragmentFnPre += + fmt::format(FMT_STRING("\n var sampled{0} = textureSamplePalette{3}(tex{1}, tex{1}_samp, {2}, tlut{1});"), + i, stage.texMapId, uvIn, suffix); + } else { + fragmentFnPre += fmt::format( + FMT_STRING("\n var sampled{0} = textureSampleBias(tex{1}, tex{1}_samp, {2}, ubuf.tex{1}_lod);"), i, + stage.texMapId, uvIn); + } + fragmentFnPre += texture_conversion(texConfig, i, stage.texMapId); + } + for (int i = 0; i < info.usesTexMtx.size(); ++i) { + if (info.usesTexMtx.test(i)) { + switch (info.texMtxTypes[i]) { + case GX_TG_MTX2x4: + uniBufAttrs += fmt::format(FMT_STRING("\n texmtx{}: mat4x2,"), i); + break; + case GX_TG_MTX3x4: + uniBufAttrs += fmt::format(FMT_STRING("\n texmtx{}: mat4x3,"), i); + break; + default: + Log.report(LOG_FATAL, FMT_STRING("unhandled tex mtx type {}"), info.texMtxTypes[i]); + unreachable(); + } + } + } + for (int i = 0; i < info.usesPTTexMtx.size(); ++i) { + if (info.usesPTTexMtx.test(i)) { + uniBufAttrs += fmt::format(FMT_STRING("\n postmtx{}: mat4x3,"), i); + } + } + if (info.usesFog) { + uniformPre += + "\n" + "struct Fog {\n" + " color: vec4,\n" + " a: f32,\n" + " b: f32,\n" + " c: f32,\n" + " pad: f32,\n" + "}"; + uniBufAttrs += "\n fog: Fog,"; + + fragmentFn += "\n // Fog\n var fogF = clamp((ubuf.fog.a / (ubuf.fog.b - in.pos.z)) - ubuf.fog.c, 0.0, 1.0);"; + switch (config.fogType) { + case GX_FOG_PERSP_LIN: + case GX_FOG_ORTHO_LIN: + fragmentFn += "\n var fogZ = fogF;"; + break; + case GX_FOG_PERSP_EXP: + case GX_FOG_ORTHO_EXP: + fragmentFn += "\n var fogZ = 1.0 - exp2(-8.0 * fogF);"; + break; + case GX_FOG_PERSP_EXP2: + case GX_FOG_ORTHO_EXP2: + fragmentFn += "\n var fogZ = 1.0 - exp2(-8.0 * fogF * fogF);"; + break; + case GX_FOG_PERSP_REVEXP: + case GX_FOG_ORTHO_REVEXP: + fragmentFn += "\n var fogZ = exp2(-8.0 * (1.0 - fogF));"; + break; + case GX_FOG_PERSP_REVEXP2: + case GX_FOG_ORTHO_REVEXP2: + fragmentFn += + "\n fogF = 1.0 - fogF;" + "\n var fogZ = exp2(-8.0 * fogF * fogF);"; + break; + default: + Log.report(LOG_FATAL, FMT_STRING("invalid fog type {}"), config.fogType); + unreachable(); + } + fragmentFn += "\n prev = vec4(mix(prev.rgb, ubuf.fog.color.rgb, clamp(fogZ, 0.0, 1.0)), prev.a);"; + } + size_t texBindIdx = 0; + for (int i = 0; i < info.sampledTextures.size(); ++i) { + if (!info.sampledTextures.test(i)) { + continue; + } + uniBufAttrs += fmt::format(FMT_STRING("\n tex{}_lod: f32,"), i); + + sampBindings += fmt::format(FMT_STRING("\n@group(1) @binding({})\n" + "var tex{}_samp: sampler;"), + texBindIdx, i); + + const auto& texConfig = config.textureConfig[i]; + if (is_palette_format(texConfig.loadFmt)) { + texBindings += fmt::format(FMT_STRING("\n@group(2) @binding({})\n" + "var tex{}: texture_2d<{}>;"), + texBindIdx, i, is_palette_format(texConfig.copyFmt) ? 
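// The fog remap above in plain C++ (a sketch): fogF is the linear value
// (a / (b - z) - c) clamped to [0, 1], and each GX fog type reshapes it before
// the fragment color is blended toward the fog color.
inline float fog_remap(GXFogType type, float fogF) {
  switch (type) {
  case GX_FOG_PERSP_LIN:
  case GX_FOG_ORTHO_LIN:
    return fogF;
  case GX_FOG_PERSP_EXP:
  case GX_FOG_ORTHO_EXP:
    return 1.f - std::exp2(-8.f * fogF);
  case GX_FOG_PERSP_EXP2:
  case GX_FOG_ORTHO_EXP2:
    return 1.f - std::exp2(-8.f * fogF * fogF);
  case GX_FOG_PERSP_REVEXP:
  case GX_FOG_ORTHO_REVEXP:
    return std::exp2(-8.f * (1.f - fogF));
  case GX_FOG_PERSP_REVEXP2:
  case GX_FOG_ORTHO_REVEXP2:
    return std::exp2(-8.f * (1.f - fogF) * (1.f - fogF));
  default:
    return 0.f;
  }
}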
"i32"sv : "f32"sv); + ++texBindIdx; + texBindings += fmt::format(FMT_STRING("\n@group(2) @binding({})\n" + "var tlut{}: texture_2d;"), + texBindIdx, i); + } else { + texBindings += fmt::format(FMT_STRING("\n@group(2) @binding({})\n" + "var tex{}: texture_2d;"), + texBindIdx, i); + } + ++texBindIdx; + } + + if (config.alphaCompare) { + bool comp0Valid = true; + bool comp1Valid = true; + std::string comp0 = alpha_compare(config.alphaCompare.comp0, config.alphaCompare.ref0, comp0Valid); + std::string comp1 = alpha_compare(config.alphaCompare.comp1, config.alphaCompare.ref1, comp1Valid); + if (comp0Valid || comp1Valid) { + fragmentFn += "\n // Alpha compare"; + switch (config.alphaCompare.op) { + case GX_AOP_AND: + fragmentFn += fmt::format(FMT_STRING("\n if (!({} && {})) {{ discard; }}"), comp0, comp1); + break; + case GX_AOP_OR: + fragmentFn += fmt::format(FMT_STRING("\n if (!({} || {})) {{ discard; }}"), comp0, comp1); + break; + case GX_AOP_XOR: + fragmentFn += fmt::format(FMT_STRING("\n if (!({} ^^ {})) {{ discard; }}"), comp0, comp1); + break; + case GX_AOP_XNOR: + fragmentFn += fmt::format(FMT_STRING("\n if (({} ^^ {})) {{ discard; }}"), comp0, comp1); + break; + default: + Log.report(LOG_FATAL, FMT_STRING("invalid alpha compare op {}"), config.alphaCompare.op); + unreachable(); + } + } + } + if constexpr (EnableNormalVisualization) { + fragmentFn += "\n prev = vec4(in.nrm, prev.a);"; + } + + const auto shaderSource = fmt::format(FMT_STRING(R"""({10} +struct Uniform {{ + pos_mtx: mat4x3, + nrm_mtx: mat4x3, + proj: mat4x4,{0} +}}; +@group(0) @binding(0) +var ubuf: Uniform;{3}{1}{2} + +struct VertexOutput {{ + @builtin(position) pos: vec4,{4} +}}; + +fn intensityF32(rgb: vec3) -> f32 {{ + // RGB to intensity conversion + // https://github.com/dolphin-emu/dolphin/blob/4cd48e609c507e65b95bca5afb416b59eaf7f683/Source/Core/VideoCommon/TextureConverterShaderGen.cpp#L237-L241 + return dot(rgb, vec3(0.257, 0.504, 0.098)) + 16.0 / 255.0; +}} +fn intensityI4(rgb: vec3) -> i32 {{ + return i32(intensityF32(rgb) * 16.f); +}} +fn textureSamplePalette(tex: texture_2d, samp: sampler, uv: vec2, tlut: texture_2d) -> vec4 {{ + // Gather index values + var i = textureGather(0, tex, samp, uv); + // Load palette colors + var c0 = textureLoad(tlut, vec2(i[0], 0), 0); + var c1 = textureLoad(tlut, vec2(i[1], 0), 0); + var c2 = textureLoad(tlut, vec2(i[2], 0), 0); + var c3 = textureLoad(tlut, vec2(i[3], 0), 0); + // Perform bilinear filtering + var f = fract(uv * vec2(textureDimensions(tex)) + 0.5); + var t0 = mix(c3, c2, f.x); + var t1 = mix(c0, c1, f.x); + return mix(t0, t1, f.y); +}} +fn textureSamplePaletteI4(tex: texture_2d, samp: sampler, uv: vec2, tlut: texture_2d) -> vec4 {{ + // Gather RGB channels + var iR = textureGather(0, tex, samp, uv); + var iG = textureGather(1, tex, samp, uv); + var iB = textureGather(2, tex, samp, uv); + // Perform intensity conversion + var i0 = intensityI4(vec3(iR[0], iG[0], iB[0])); + var i1 = intensityI4(vec3(iR[1], iG[1], iB[1])); + var i2 = intensityI4(vec3(iR[2], iG[2], iB[2])); + var i3 = intensityI4(vec3(iR[3], iG[3], iB[3])); + // Load palette colors + var c0 = textureLoad(tlut, vec2(i0, 0), 0); + var c1 = textureLoad(tlut, vec2(i1, 0), 0); + var c2 = textureLoad(tlut, vec2(i2, 0), 0); + var c3 = textureLoad(tlut, vec2(i3, 0), 0); + // Perform bilinear filtering + var f = fract(uv * vec2(textureDimensions(tex)) + 0.5); + var t0 = mix(c3, c2, f.x); + var t1 = mix(c0, c1, f.x); + return mix(t0, t1, f.y); +}} + +@stage(vertex) +fn vs_main({5} +) -> VertexOutput {{ + var 
out: VertexOutput;{9}{6} + return out; +}} + +@stage(fragment) +fn fs_main(in: VertexOutput) -> @location(0) vec4 {{{8}{7} + return prev; +}} +)"""), + uniBufAttrs, sampBindings, texBindings, uniformBindings, vtxOutAttrs, + vtxInAttrs, vtxXfrAttrs, fragmentFn, fragmentFnPre, vtxXfrAttrsPre, uniformPre); + if (EnableDebugPrints) { + Log.report(LOG_INFO, FMT_STRING("Generated shader: {}"), shaderSource); + } + + const WGPUShaderModuleWGSLDescriptor wgslDescriptor{ + .chain = {.sType = WGPUSType_ShaderModuleWGSLDescriptor}, + .source = shaderSource.c_str(), + }; + const auto label = fmt::format(FMT_STRING("GX Shader {:x}"), hash); + const auto shaderDescriptor = WGPUShaderModuleDescriptor{ + .nextInChain = &wgslDescriptor.chain, + .label = label.c_str(), + }; + auto shader = wgpuDeviceCreateShaderModule(webgpu::g_device, &shaderDescriptor); + + auto pair = std::make_pair(shader, info); + g_gxCachedShaders.emplace(hash, pair); +#ifndef NDEBUG + g_gxCachedShaderConfigs.emplace(hash, config); +#endif + + return pair.first; +} +} // namespace aurora::gfx::gx diff --git a/lib/gfx/model/shader.cpp b/lib/gfx/model/shader.cpp new file mode 100644 index 0000000..2b0300a --- /dev/null +++ b/lib/gfx/model/shader.cpp @@ -0,0 +1,499 @@ +#include "shader.hpp" + +#include "../../webgpu/gpu.hpp" + +#include + +namespace aurora::gfx::model { +static Module Log("aurora::gfx::model"); + +template +constexpr T bswap16(T val) noexcept { + static_assert(sizeof(T) == sizeof(u16)); + union { + u16 u; + T t; + } v{.t = val}; +#if __GNUC__ + v.u = __builtin_bswap16(v.u); +#elif _WIN32 + v.u = _byteswap_ushort(v.u); +#else + v.u = (v.u << 8) | ((v.u >> 8) & 0xFF); +#endif + return v.t; +} +template +constexpr T bswap32(T val) noexcept { + static_assert(sizeof(T) == sizeof(u32)); + union { + u32 u; + T t; + } v{.t = val}; +#if __GNUC__ + v.u = __builtin_bswap32(v.u); +#elif _WIN32 + v.u = _byteswap_ulong(v.u); +#else + v.u = ((v.u & 0x0000FFFF) << 16) | ((v.u & 0xFFFF0000) >> 16) | ((v.u & 0x00FF00FF) << 8) | ((v.u & 0xFF00FF00) >> 8); +#endif + return v.t; +} + +using IndexedAttrs = std::array; +struct DisplayListCache { + ByteBuffer vtxBuf; + ByteBuffer idxBuf; + IndexedAttrs indexedAttrs; + + DisplayListCache(ByteBuffer&& vtxBuf, ByteBuffer&& idxBuf, IndexedAttrs indexedAttrs) + : vtxBuf(std::move(vtxBuf)), idxBuf(std::move(idxBuf)), indexedAttrs(indexedAttrs) {} +}; + +static absl::flat_hash_map sCachedDisplayLists; + +static u32 prepare_vtx_buffer(ByteBuffer& buf, GXVtxFmt vtxfmt, const u8* ptr, u16 vtxCount, + IndexedAttrs& indexedAttrs) { + using aurora::gfx::gx::g_gxState; + struct { + u8 count; + GXCompType type; + } attrArrays[GX_VA_MAX_ATTR] = {}; + u32 vtxSize = 0; + u32 outVtxSize = 0; + + // Calculate attribute offsets and vertex size + for (int attr = 0; attr < GX_VA_MAX_ATTR; attr++) { + const auto& attrFmt = g_gxState.vtxFmts[vtxfmt].attrs[attr]; + switch (g_gxState.vtxDesc[attr]) { + case GX_NONE: + break; + case GX_DIRECT: +#define COMBINE(val1, val2, val3) (((val1) << 16) | ((val2) << 8) | (val3)) + switch (COMBINE(attr, attrFmt.cnt, attrFmt.type)) { + case COMBINE(GX_VA_POS, GX_POS_XYZ, GX_F32): + case COMBINE(GX_VA_NRM, GX_NRM_XYZ, GX_F32): + attrArrays[attr].count = 3; + attrArrays[attr].type = GX_F32; + vtxSize += 12; + outVtxSize += 12; + break; + case COMBINE(GX_VA_POS, GX_POS_XYZ, GX_S16): + case COMBINE(GX_VA_NRM, GX_NRM_XYZ, GX_S16): + attrArrays[attr].count = 3; + attrArrays[attr].type = GX_S16; + vtxSize += 6; + outVtxSize += 12; + break; + case COMBINE(GX_VA_TEX0, GX_TEX_ST, GX_F32): + 
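// COMBINE above packs (attribute, component count, component type) into a single
// integer so one switch can enumerate the supported direct-attribute layouts;
// anything else falls through to the LOG_FATAL default. The same idea without
// the macro (a sketch):
constexpr u32 combine_attr(u32 attr, u32 cnt, u32 type) { return (attr << 16) | (cnt << 8) | type; }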
case COMBINE(GX_VA_TEX1, GX_TEX_ST, GX_F32): + case COMBINE(GX_VA_TEX2, GX_TEX_ST, GX_F32): + case COMBINE(GX_VA_TEX3, GX_TEX_ST, GX_F32): + case COMBINE(GX_VA_TEX4, GX_TEX_ST, GX_F32): + case COMBINE(GX_VA_TEX5, GX_TEX_ST, GX_F32): + case COMBINE(GX_VA_TEX6, GX_TEX_ST, GX_F32): + case COMBINE(GX_VA_TEX7, GX_TEX_ST, GX_F32): + attrArrays[attr].count = 2; + attrArrays[attr].type = GX_F32; + vtxSize += 8; + outVtxSize += 8; + break; + case COMBINE(GX_VA_TEX0, GX_TEX_ST, GX_S16): + case COMBINE(GX_VA_TEX1, GX_TEX_ST, GX_S16): + case COMBINE(GX_VA_TEX2, GX_TEX_ST, GX_S16): + case COMBINE(GX_VA_TEX3, GX_TEX_ST, GX_S16): + case COMBINE(GX_VA_TEX4, GX_TEX_ST, GX_S16): + case COMBINE(GX_VA_TEX5, GX_TEX_ST, GX_S16): + case COMBINE(GX_VA_TEX6, GX_TEX_ST, GX_S16): + case COMBINE(GX_VA_TEX7, GX_TEX_ST, GX_S16): + attrArrays[attr].count = 2; + attrArrays[attr].type = GX_S16; + vtxSize += 4; + outVtxSize += 8; + break; + case COMBINE(GX_VA_CLR0, GX_CLR_RGBA, GX_RGBA8): + case COMBINE(GX_VA_CLR1, GX_CLR_RGBA, GX_RGBA8): + attrArrays[attr].count = 4; + attrArrays[attr].type = GX_RGBA8; + vtxSize += 4; + outVtxSize += 16; + break; + default: + Log.report(LOG_FATAL, FMT_STRING("not handled: attr {}, cnt {}, type {}"), attr, attrFmt.cnt, attrFmt.type); + break; + } +#undef COMBINE + break; + case GX_INDEX8: + ++vtxSize; + outVtxSize += 2; + indexedAttrs[attr] = true; + break; + case GX_INDEX16: + vtxSize += 2; + outVtxSize += 2; + indexedAttrs[attr] = true; + break; + default: + Log.report(LOG_FATAL, FMT_STRING("unhandled attribute type {}"), g_gxState.vtxDesc[attr]); + } + } + // Align to 4 + int rem = outVtxSize % 4; + int padding = 0; + if (rem != 0) { + padding = 4 - rem; + outVtxSize += padding; + } + + // Build vertex buffer + buf.reserve_extra(vtxCount * outVtxSize); + std::array out{}; + for (u32 v = 0; v < vtxCount; ++v) { + for (int attr = 0; attr < GX_VA_MAX_ATTR; attr++) { + if (g_gxState.vtxDesc[attr] == GX_INDEX8) { + u16 index = *ptr; + buf.append(&index, 2); + ++ptr; + } else if (g_gxState.vtxDesc[attr] == GX_INDEX16) { + u16 index = bswap16(*reinterpret_cast(ptr)); + buf.append(&index, 2); + ptr += 2; + } + if (g_gxState.vtxDesc[attr] != GX_DIRECT) { + continue; + } + const auto& attrFmt = g_gxState.vtxFmts[vtxfmt].attrs[attr]; + u8 count = attrArrays[attr].count; + switch (attrArrays[attr].type) { + case GX_U8: + for (int i = 0; i < count; ++i) { + const auto value = reinterpret_cast(ptr)[i]; + out[i] = static_cast(value) / static_cast(1 << attrFmt.frac); + } + buf.append(out.data(), sizeof(f32) * count); + ptr += count; + break; + case GX_S8: + for (int i = 0; i < count; ++i) { + const auto value = reinterpret_cast(ptr)[i]; + out[i] = static_cast(value) / static_cast(1 << attrFmt.frac); + } + buf.append(out.data(), sizeof(f32) * count); + ptr += count; + break; + case GX_U16: + for (int i = 0; i < count; ++i) { + const auto value = bswap16(reinterpret_cast(ptr)[i]); + out[i] = static_cast(value) / static_cast(1 << attrFmt.frac); + } + buf.append(out.data(), sizeof(f32) * count); + ptr += count * sizeof(u16); + break; + case GX_S16: + for (int i = 0; i < count; ++i) { + const auto value = bswap16(reinterpret_cast(ptr)[i]); + out[i] = static_cast(value) / static_cast(1 << attrFmt.frac); + } + buf.append(out.data(), sizeof(f32) * count); + ptr += count * sizeof(s16); + break; + case GX_F32: + for (int i = 0; i < count; ++i) { + out[i] = bswap32(reinterpret_cast(ptr)[i]); + } + buf.append(out.data(), sizeof(f32) * count); + ptr += count * sizeof(f32); + break; + case GX_RGBA8: + out[0] = 
static_cast(ptr[0]) / 255.f; + out[1] = static_cast(ptr[1]) / 255.f; + out[2] = static_cast(ptr[2]) / 255.f; + out[3] = static_cast(ptr[3]) / 255.f; + buf.append(out.data(), sizeof(f32) * 4); + ptr += sizeof(u32); + break; + } + } + if (padding > 0) { + buf.append_zeroes(padding); + } + } + + return vtxSize; +} + +static u16 prepare_idx_buffer(ByteBuffer& buf, GXPrimitive prim, u16 vtxStart, u16 vtxCount) { + u16 numIndices = 0; + if (prim == GX_TRIANGLES) { + buf.reserve_extra(vtxCount * sizeof(u16)); + for (u16 v = 0; v < vtxCount; ++v) { + const u16 idx = vtxStart + v; + buf.append(&idx, sizeof(u16)); + ++numIndices; + } + } else if (prim == GX_TRIANGLEFAN) { + buf.reserve_extra(((u32(vtxCount) - 3) * 3 + 3) * sizeof(u16)); + for (u16 v = 0; v < vtxCount; ++v) { + const u16 idx = vtxStart + v; + if (v < 3) { + buf.append(&idx, sizeof(u16)); + ++numIndices; + continue; + } + const std::array idxs{vtxStart, u16(idx - 1), idx}; + buf.append(idxs.data(), sizeof(u16) * 3); + numIndices += 3; + } + } else if (prim == GX_TRIANGLESTRIP) { + buf.reserve_extra(((u32(vtxCount) - 3) * 3 + 3) * sizeof(u16)); + for (u16 v = 0; v < vtxCount; ++v) { + const u16 idx = vtxStart + v; + if (v < 3) { + buf.append(&idx, sizeof(u16)); + ++numIndices; + continue; + } + if ((v & 1) == 0) { + const std::array idxs{u16(idx - 2), u16(idx - 1), idx}; + buf.append(idxs.data(), sizeof(u16) * 3); + } else { + const std::array idxs{u16(idx - 1), u16(idx - 2), idx}; + buf.append(idxs.data(), sizeof(u16) * 3); + } + numIndices += 3; + } + } else { + Log.report(LOG_FATAL, FMT_STRING("Unsupported primitive type {}"), static_cast(prim)); + } + return numIndices; +} + +void queue_surface(const u8* dlStart, u32 dlSize) noexcept { + const auto hash = xxh3_hash_s(dlStart, dlSize, 0); + Range vertRange, idxRange; + u32 numIndices = 0; + IndexedAttrs indexedAttrs{}; + auto it = sCachedDisplayLists.find(hash); + if (it != sCachedDisplayLists.end()) { + const auto& cache = it->second; + numIndices = cache.idxBuf.size() / 2; + vertRange = push_verts(cache.vtxBuf.data(), cache.vtxBuf.size()); + idxRange = push_indices(cache.idxBuf.data(), cache.idxBuf.size()); + indexedAttrs = cache.indexedAttrs; + } else { + const u8* data = dlStart; + u32 pos = 0; + ByteBuffer vtxBuf; + ByteBuffer idxBuf; + u16 vtxStart = 0; + + while (pos < dlSize) { + u8 cmd = data[pos++]; + + u8 opcode = cmd & GX_OPCODE_MASK; + switch (opcode) { + case GX_NOP: + continue; + case GX_LOAD_BP_REG: + // TODO? 
+ pos += 4; + break; + case GX_DRAW_QUADS: + case GX_DRAW_TRIANGLES: + case GX_DRAW_TRIANGLE_STRIP: + case GX_DRAW_TRIANGLE_FAN: { + const auto prim = static_cast(opcode); + const auto fmt = static_cast(cmd & GX_VAT_MASK); + u16 vtxCount = bswap16(*reinterpret_cast(data + pos)); + pos += 2; + pos += vtxCount * prepare_vtx_buffer(vtxBuf, fmt, data + pos, vtxCount, indexedAttrs); + numIndices += prepare_idx_buffer(idxBuf, prim, vtxStart, vtxCount); + vtxStart += vtxCount; + break; + } + case GX_DRAW_LINES: + case GX_DRAW_LINE_STRIP: + case GX_DRAW_POINTS: + Log.report(LOG_FATAL, FMT_STRING("unimplemented prim type: {}"), opcode); + break; + default: + Log.report(LOG_FATAL, FMT_STRING("unimplemented opcode: {}"), opcode); + break; + } + } + vertRange = push_verts(vtxBuf.data(), vtxBuf.size()); + idxRange = push_indices(idxBuf.data(), idxBuf.size()); + sCachedDisplayLists.try_emplace(hash, std::move(vtxBuf), std::move(idxBuf), indexedAttrs); + } + + gx::BindGroupRanges ranges{}; + int lastIndexedAttr = -1; + for (int i = 0; i < GX_VA_MAX_ATTR; ++i) { + if (!indexedAttrs[i]) { + continue; + } + auto& array = gx::g_gxState.arrays[i]; + if (lastIndexedAttr >= 0 && array == gx::g_gxState.arrays[lastIndexedAttr]) { + // Reuse range from last attribute in shader + // Don't set the output range, so it remains unbound + const auto range = gx::g_gxState.arrays[lastIndexedAttr].cachedRange; + array.cachedRange = range; + } else if (array.cachedRange.size > 0) { + // Use the currently cached range + ranges.vaRanges[i] = array.cachedRange; + } else { + // Push array data to storage and cache range + const auto range = push_storage(static_cast(array.data), array.size); + ranges.vaRanges[i] = range; + array.cachedRange = range; + } + lastIndexedAttr = i; + } + + model::PipelineConfig config{}; + populate_pipeline_config(config, GX_TRIANGLES); + const auto info = gx::build_shader_info(config.shaderConfig); + const auto bindGroups = gx::build_bind_groups(info, config.shaderConfig, ranges); + const auto pipeline = pipeline_ref(config); + + push_draw_command(model::DrawData{ + .pipeline = pipeline, + .vertRange = vertRange, + .idxRange = idxRange, + .dataRanges = ranges, + .uniformRange = build_uniform(info), + .indexCount = numIndices, + .bindGroups = bindGroups, + .dstAlpha = gx::g_gxState.dstAlpha, + }); +} + +State construct_state() { return {}; } + +WGPURenderPipeline create_pipeline(const State& state, [[maybe_unused]] const PipelineConfig& config) { + const auto info = build_shader_info(config.shaderConfig); // TODO remove + const auto shader = build_shader(config.shaderConfig, info); + + std::array vtxAttrs{}; + auto [num4xAttr, rem] = std::div(config.shaderConfig.indexedAttributeCount, 4); + u32 num2xAttr = 0; + if (rem > 2) { + ++num4xAttr; + } else if (rem > 0) { + ++num2xAttr; + } + + u32 offset = 0; + u32 shaderLocation = 0; + + // Indexed attributes + for (u32 i = 0; i < num4xAttr; ++i) { + vtxAttrs[shaderLocation] = { + .format = WGPUVertexFormat_Sint16x4, + .offset = offset, + .shaderLocation = shaderLocation, + }; + offset += 8; + ++shaderLocation; + } + for (u32 i = 0; i < num2xAttr; ++i) { + vtxAttrs[shaderLocation] = { + .format = WGPUVertexFormat_Sint16x2, + .offset = offset, + .shaderLocation = shaderLocation, + }; + offset += 4; + ++shaderLocation; + } + + // Direct attributes + for (int i = 0; i < gx::MaxVtxAttr; ++i) { + const auto attrType = config.shaderConfig.vtxAttrs[i]; + if (attrType != GX_DIRECT) { + continue; + } + const auto attr = static_cast(i); + switch (attr) { + case 
GX_VA_POS: + case GX_VA_NRM: + vtxAttrs[shaderLocation] = WGPUVertexAttribute{ + .format = WGPUVertexFormat_Float32x3, + .offset = offset, + .shaderLocation = shaderLocation, + }; + offset += 12; + break; + case GX_VA_CLR0: + case GX_VA_CLR1: + vtxAttrs[shaderLocation] = WGPUVertexAttribute{ + .format = WGPUVertexFormat_Float32x4, + .offset = offset, + .shaderLocation = shaderLocation, + }; + offset += 16; + break; + case GX_VA_TEX0: + case GX_VA_TEX1: + case GX_VA_TEX2: + case GX_VA_TEX3: + case GX_VA_TEX4: + case GX_VA_TEX5: + case GX_VA_TEX6: + case GX_VA_TEX7: + vtxAttrs[shaderLocation] = WGPUVertexAttribute{ + .format = WGPUVertexFormat_Float32x2, + .offset = offset, + .shaderLocation = shaderLocation, + }; + offset += 8; + break; + default: + Log.report(LOG_FATAL, FMT_STRING("unhandled direct attr {}"), i); + } + ++shaderLocation; + } + + const std::array vtxBuffers{WGPUVertexBufferLayout{ + .arrayStride = offset, + .stepMode = WGPUVertexStepMode_Vertex, + .attributeCount = shaderLocation, + .attributes = vtxAttrs.data(), + }}; + + return build_pipeline(config, info, vtxBuffers, shader, "GX Pipeline"); +} + +void render(const State& state, const DrawData& data, const WGPURenderPassEncoder& pass) { + if (!bind_pipeline(data.pipeline, pass)) { + return; + } + + std::array offsets{data.uniformRange.offset}; + uint32_t bindIdx = 1; + for (uint32_t i = 0; i < GX_VA_MAX_ATTR; ++i) { + const auto& range = data.dataRanges.vaRanges[i]; + if (range.size <= 0) { + continue; + } + offsets[bindIdx] = range.offset; + ++bindIdx; + } + wgpuRenderPassEncoderSetBindGroup(pass, 0, find_bind_group(data.bindGroups.uniformBindGroup), bindIdx, + offsets.data()); + if (data.bindGroups.samplerBindGroup && data.bindGroups.textureBindGroup) { + wgpuRenderPassEncoderSetBindGroup(pass, 1, find_bind_group(data.bindGroups.samplerBindGroup), 0, nullptr); + wgpuRenderPassEncoderSetBindGroup(pass, 2, find_bind_group(data.bindGroups.textureBindGroup), 0, nullptr); + } + wgpuRenderPassEncoderSetVertexBuffer(pass, 0, g_vertexBuffer, data.vertRange.offset, data.vertRange.size); + wgpuRenderPassEncoderSetIndexBuffer(pass, g_indexBuffer, WGPUIndexFormat_Uint16, data.idxRange.offset, + data.idxRange.size); + if (data.dstAlpha != UINT32_MAX) { + const WGPUColor color{0.f, 0.f, 0.f, data.dstAlpha / 255.f}; + wgpuRenderPassEncoderSetBlendConstant(pass, &color); + } + wgpuRenderPassEncoderDrawIndexed(pass, data.indexCount, 1, 0, 0, 0); +} +} // namespace aurora::gfx::model + +static absl::flat_hash_map sCachedRanges; diff --git a/lib/gfx/model/shader.hpp b/lib/gfx/model/shader.hpp new file mode 100644 index 0000000..871efa5 --- /dev/null +++ b/lib/gfx/model/shader.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include "../common.hpp" +#include "../gx.hpp" + +namespace aurora::gfx::model { +struct DrawData { + PipelineRef pipeline; + Range vertRange; + Range idxRange; + gx::BindGroupRanges dataRanges; + Range uniformRange; + uint32_t indexCount; + gx::GXBindGroups bindGroups; + u32 dstAlpha; +}; + +struct PipelineConfig : gx::PipelineConfig {}; + +struct State {}; + +State construct_state(); +WGPURenderPipeline create_pipeline(const State& state, [[maybe_unused]] const PipelineConfig& config); +void render(const State& state, const DrawData& data, const WGPURenderPassEncoder& pass); + +void queue_surface(const u8* dlStart, u32 dlSize) noexcept; +} // namespace aurora::gfx::model diff --git a/lib/gfx/stream/shader.cpp b/lib/gfx/stream/shader.cpp new file mode 100644 index 0000000..9a56ada --- /dev/null +++ b/lib/gfx/stream/shader.cpp @@ 
-0,0 +1,84 @@ +#include "shader.hpp" + +#include "../../webgpu/gpu.hpp" + +namespace aurora::gfx::stream { +static Module Log("aurora::gfx::stream"); + +using webgpu::g_device; + +WGPURenderPipeline create_pipeline(const State& state, [[maybe_unused]] const PipelineConfig& config) { + const auto info = build_shader_info(config.shaderConfig); // TODO remove + const auto shader = build_shader(config.shaderConfig, info); + + std::array attributes{}; + attributes[0] = WGPUVertexAttribute{ + .format = WGPUVertexFormat_Float32x3, + .offset = 0, + .shaderLocation = 0, + }; + uint64_t offset = 12; + uint32_t shaderLocation = 1; + if (config.shaderConfig.vtxAttrs[GX_VA_NRM] == GX_DIRECT) { + attributes[shaderLocation] = WGPUVertexAttribute{ + .format = WGPUVertexFormat_Float32x3, + .offset = offset, + .shaderLocation = shaderLocation, + }; + offset += 12; + shaderLocation++; + } + if (config.shaderConfig.vtxAttrs[GX_VA_CLR0] == GX_DIRECT) { + attributes[shaderLocation] = WGPUVertexAttribute{ + .format = WGPUVertexFormat_Float32x4, + .offset = offset, + .shaderLocation = shaderLocation, + }; + offset += 16; + shaderLocation++; + } + for (int i = GX_VA_TEX0; i < GX_VA_TEX7; ++i) { + if (config.shaderConfig.vtxAttrs[i] != GX_DIRECT) { + continue; + } + attributes[shaderLocation] = WGPUVertexAttribute{ + .format = WGPUVertexFormat_Float32x2, + .offset = offset, + .shaderLocation = shaderLocation, + }; + offset += 8; + shaderLocation++; + } + const std::array vertexBuffers{WGPUVertexBufferLayout{ + .arrayStride = offset, + .attributeCount = shaderLocation, + .attributes = attributes.data(), + }}; + + return build_pipeline(config, info, vertexBuffers, shader, "Stream Pipeline"); +} + +State construct_state() { return {}; } + +void render(const State& state, const DrawData& data, const WGPURenderPassEncoder& pass) { + if (!bind_pipeline(data.pipeline, pass)) { + return; + } + + const std::array offsets{data.uniformRange.offset}; + wgpuRenderPassEncoderSetBindGroup(pass, 0, find_bind_group(data.bindGroups.uniformBindGroup), offsets.size(), + offsets.data()); + if (data.bindGroups.samplerBindGroup && data.bindGroups.textureBindGroup) { + wgpuRenderPassEncoderSetBindGroup(pass, 1, find_bind_group(data.bindGroups.samplerBindGroup), 0, nullptr); + wgpuRenderPassEncoderSetBindGroup(pass, 2, find_bind_group(data.bindGroups.textureBindGroup), 0, nullptr); + } + wgpuRenderPassEncoderSetVertexBuffer(pass, 0, g_vertexBuffer, data.vertRange.offset, data.vertRange.size); + wgpuRenderPassEncoderSetIndexBuffer(pass, g_indexBuffer, WGPUIndexFormat_Uint16, data.indexRange.offset, + data.indexRange.size); + if (data.dstAlpha != UINT32_MAX) { + const WGPUColor color{0.f, 0.f, 0.f, data.dstAlpha / 255.f}; + wgpuRenderPassEncoderSetBlendConstant(pass, &color); + } + wgpuRenderPassEncoderDrawIndexed(pass, data.indexCount, 1, 0, 0, 0); +} +} // namespace aurora::gfx::stream diff --git a/lib/gfx/stream/shader.hpp b/lib/gfx/stream/shader.hpp new file mode 100644 index 0000000..3c7dbd1 --- /dev/null +++ b/lib/gfx/stream/shader.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include "../common.hpp" +#include "../gx.hpp" + +namespace aurora::gfx::stream { +struct DrawData { + PipelineRef pipeline; + Range vertRange; + Range uniformRange; + Range indexRange; + uint32_t indexCount; + gx::GXBindGroups bindGroups; + u32 dstAlpha; +}; + +struct PipelineConfig : public gx::PipelineConfig {}; + +struct State {}; + +State construct_state(); +WGPURenderPipeline create_pipeline(const State& state, [[maybe_unused]] const PipelineConfig& config); +void 
render(const State& state, const DrawData& data, const WGPURenderPassEncoder& pass); +} // namespace aurora::gfx::stream diff --git a/lib/gfx/texture.cpp b/lib/gfx/texture.cpp new file mode 100644 index 0000000..fe2eb6b --- /dev/null +++ b/lib/gfx/texture.cpp @@ -0,0 +1,210 @@ +#include "common.hpp" + +#include "../webgpu/gpu.hpp" +#include "../internal.hpp" +#include "texture.hpp" +#include "texture_convert.hpp" + +#include + +namespace aurora::gfx { +static Module Log("aurora::gfx"); + +using webgpu::g_device; +using webgpu::g_queue; + +struct TextureFormatInfo { + uint8_t blockWidth; + uint8_t blockHeight; + uint8_t blockSize; + bool compressed; +}; +static TextureFormatInfo format_info(WGPUTextureFormat format) { + switch (format) { + case WGPUTextureFormat_R8Unorm: + return {1, 1, 1, false}; + case WGPUTextureFormat_R16Sint: + return {1, 1, 2, false}; + case WGPUTextureFormat_RGBA8Unorm: + case WGPUTextureFormat_R32Float: + return {1, 1, 4, false}; + case WGPUTextureFormat_BC1RGBAUnorm: + return {4, 4, 8, true}; + default: + Log.report(LOG_FATAL, FMT_STRING("format_info: unimplemented format {}"), magic_enum::enum_name(format)); + unreachable(); + } +} +static WGPUExtent3D physical_size(WGPUExtent3D size, TextureFormatInfo info) { + const uint32_t width = ((size.width + info.blockWidth - 1) / info.blockWidth) * info.blockWidth; + const uint32_t height = ((size.height + info.blockHeight - 1) / info.blockHeight) * info.blockHeight; + return {width, height, size.depthOrArrayLayers}; +} + +TextureHandle new_static_texture_2d(uint32_t width, uint32_t height, uint32_t mips, u32 format, + ArrayRef data, const char* label) noexcept { + auto handle = new_dynamic_texture_2d(width, height, mips, format, label); + const auto& ref = *handle; + + ByteBuffer buffer; + if (ref.gxFormat != InvalidTextureFormat) { + buffer = convert_texture(ref.gxFormat, ref.size.width, ref.size.height, ref.mipCount, data); + if (!buffer.empty()) { + data = {buffer.data(), buffer.size()}; + } + } + + uint32_t offset = 0; + for (uint32_t mip = 0; mip < mips; ++mip) { + const WGPUExtent3D mipSize{ + .width = std::max(ref.size.width >> mip, 1u), + .height = std::max(ref.size.height >> mip, 1u), + .depthOrArrayLayers = ref.size.depthOrArrayLayers, + }; + const auto info = format_info(ref.format); + const auto physicalSize = physical_size(mipSize, info); + const uint32_t widthBlocks = physicalSize.width / info.blockWidth; + const uint32_t heightBlocks = physicalSize.height / info.blockHeight; + const uint32_t bytesPerRow = widthBlocks * info.blockSize; + const uint32_t dataSize = bytesPerRow * heightBlocks * mipSize.depthOrArrayLayers; + if (offset + dataSize > data.size()) { + Log.report(LOG_FATAL, FMT_STRING("new_static_texture_2d[{}]: expected at least {} bytes, got {}"), label, + offset + dataSize, data.size()); + unreachable(); + } + const WGPUImageCopyTexture dstView{ + .texture = ref.texture, + .mipLevel = mip, + }; + // const auto range = push_texture_data(data.data() + offset, dataSize, bytesPerRow, heightBlocks); + const WGPUTextureDataLayout dataLayout{ + // .offset = range.offset, + .bytesPerRow = bytesPerRow, + .rowsPerImage = heightBlocks, + }; + // TODO + // g_textureUploads.emplace_back(dataLayout, std::move(dstView), physicalSize); + wgpuQueueWriteTexture(g_queue, &dstView, data.data() + offset, dataSize, &dataLayout, &physicalSize); + offset += dataSize; + } + if (data.size() != UINT32_MAX && offset < data.size()) { + Log.report(LOG_WARNING, FMT_STRING("new_static_texture_2d[{}]: texture used {} bytes, but 
given {} bytes"), label, + offset, data.size()); + } + return handle; +} + +TextureHandle new_dynamic_texture_2d(uint32_t width, uint32_t height, uint32_t mips, u32 format, + const char* label) noexcept { + const auto wgpuFormat = to_wgpu(format); + const WGPUExtent3D size{ + .width = width, + .height = height, + .depthOrArrayLayers = 1, + }; + const WGPUTextureDescriptor textureDescriptor{ + .label = label, + .usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst, + .dimension = WGPUTextureDimension_2D, + .size = size, + .format = wgpuFormat, + .mipLevelCount = mips, + .sampleCount = 1, + }; + const auto viewLabel = fmt::format(FMT_STRING("{} view"), label); + const WGPUTextureViewDescriptor textureViewDescriptor{ + .label = viewLabel.c_str(), + .format = wgpuFormat, + .dimension = WGPUTextureViewDimension_2D, + .mipLevelCount = mips, + .arrayLayerCount = WGPU_ARRAY_LAYER_COUNT_UNDEFINED, + }; + auto texture = wgpuDeviceCreateTexture(g_device, &textureDescriptor); + auto textureView = wgpuTextureCreateView(texture, &textureViewDescriptor); + return std::make_shared(texture, textureView, size, wgpuFormat, mips, format, false); +} + +TextureHandle new_render_texture(uint32_t width, uint32_t height, u32 fmt, const char* label) noexcept { + const auto wgpuFormat = webgpu::g_graphicsConfig.colorFormat; + const WGPUExtent3D size{ + .width = width, + .height = height, + .depthOrArrayLayers = 1, + }; + const WGPUTextureDescriptor textureDescriptor{ + .label = label, + .usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst, + .dimension = WGPUTextureDimension_2D, + .size = size, + .format = wgpuFormat, + .mipLevelCount = 1, + .sampleCount = 1, + }; + const auto viewLabel = fmt::format(FMT_STRING("{} view"), label); + const WGPUTextureViewDescriptor textureViewDescriptor{ + .label = viewLabel.c_str(), + .format = wgpuFormat, + .dimension = WGPUTextureViewDimension_2D, + .mipLevelCount = WGPU_MIP_LEVEL_COUNT_UNDEFINED, + .arrayLayerCount = WGPU_ARRAY_LAYER_COUNT_UNDEFINED, + }; + auto texture = wgpuDeviceCreateTexture(g_device, &textureDescriptor); + auto textureView = wgpuTextureCreateView(texture, &textureViewDescriptor); + return std::make_shared(texture, textureView, size, wgpuFormat, 1, fmt, true); +} + +void write_texture(const TextureRef& ref, ArrayRef data) noexcept { + ByteBuffer buffer; + if (ref.gxFormat != InvalidTextureFormat) { + buffer = convert_texture(ref.gxFormat, ref.size.width, ref.size.height, ref.mipCount, data); + if (!buffer.empty()) { + data = {buffer.data(), buffer.size()}; + } + } + + uint32_t offset = 0; + for (uint32_t mip = 0; mip < ref.mipCount; ++mip) { + const WGPUExtent3D mipSize{ + .width = std::max(ref.size.width >> mip, 1u), + .height = std::max(ref.size.height >> mip, 1u), + .depthOrArrayLayers = ref.size.depthOrArrayLayers, + }; + const auto info = format_info(ref.format); + const auto physicalSize = physical_size(mipSize, info); + const uint32_t widthBlocks = physicalSize.width / info.blockWidth; + const uint32_t heightBlocks = physicalSize.height / info.blockHeight; + const uint32_t bytesPerRow = widthBlocks * info.blockSize; + const uint32_t dataSize = bytesPerRow * heightBlocks * mipSize.depthOrArrayLayers; + if (offset + dataSize > data.size()) { + Log.report(LOG_FATAL, FMT_STRING("write_texture: expected at least {} bytes, got {}"), offset + dataSize, + data.size()); + unreachable(); + } + // auto dstView = WGPUImageCopyTexture{ + // .texture = ref.texture, + // .mipLevel = mip, + // }; + // const auto range = 
push_texture_data(data.data() + offset, dataSize, bytesPerRow, heightBlocks); + // const auto dataLayout = WGPUTextureDataLayout{ + // .offset = range.offset, + // .bytesPerRow = bytesPerRow, + // .rowsPerImage = heightBlocks, + // }; + // g_textureUploads.emplace_back(dataLayout, std::move(dstView), physicalSize); + const WGPUImageCopyTexture dstView{ + .texture = ref.texture, + .mipLevel = mip, + }; + const WGPUTextureDataLayout dataLayout{ + .bytesPerRow = bytesPerRow, + .rowsPerImage = heightBlocks, + }; + wgpuQueueWriteTexture(g_queue, &dstView, data.data() + offset, dataSize, &dataLayout, &physicalSize); + offset += dataSize; + } + if (data.size() != UINT32_MAX && offset < data.size()) { + Log.report(LOG_WARNING, FMT_STRING("write_texture: texture used {} bytes, but given {} bytes"), offset, + data.size()); + } +} +} // namespace aurora::gfx diff --git a/lib/gfx/texture.hpp b/lib/gfx/texture.hpp new file mode 100644 index 0000000..90a8ff6 --- /dev/null +++ b/lib/gfx/texture.hpp @@ -0,0 +1,90 @@ +#pragma once +#include + +#include "common.hpp" + +namespace aurora::gfx { +struct TextureUpload { + WGPUTextureDataLayout layout; + WGPUImageCopyTexture tex; + WGPUExtent3D size; + + TextureUpload(WGPUTextureDataLayout layout, WGPUImageCopyTexture tex, WGPUExtent3D size) noexcept + : layout(layout), tex(tex), size(size) {} +}; +extern std::vector g_textureUploads; + +constexpr u32 InvalidTextureFormat = -1; +struct TextureRef { + WGPUTexture texture; + WGPUTextureView view; + WGPUExtent3D size; + WGPUTextureFormat format; + uint32_t mipCount; + u32 gxFormat; + bool isRenderTexture; // :shrug: for now + + TextureRef(WGPUTexture texture, WGPUTextureView view, WGPUExtent3D size, WGPUTextureFormat format, uint32_t mipCount, + u32 gxFormat, bool isRenderTexture) + : texture(texture) + , view(view) + , size(size) + , format(format) + , mipCount(mipCount) + , gxFormat(gxFormat) + , isRenderTexture(isRenderTexture) {} + + ~TextureRef() { + wgpuTextureViewRelease(view); + wgpuTextureDestroy(texture); + } +}; + +using TextureHandle = std::shared_ptr; + +TextureHandle new_static_texture_2d(uint32_t width, uint32_t height, uint32_t mips, u32 format, + ArrayRef data, const char* label) noexcept; +TextureHandle new_dynamic_texture_2d(uint32_t width, uint32_t height, uint32_t mips, u32 format, + const char* label) noexcept; +TextureHandle new_render_texture(uint32_t width, uint32_t height, u32 fmt, const char* label) noexcept; +void write_texture(const TextureRef& ref, ArrayRef data) noexcept; +}; // namespace aurora::gfx + +struct GXTexObj_ { + aurora::gfx::TextureHandle ref; + const void* data; + u32 dataSize; + u16 width; + u16 height; + u32 fmt; + GXTexWrapMode wrapS; + GXTexWrapMode wrapT; + GXBool hasMips; + GXTexFilter minFilter; + GXTexFilter magFilter; + float minLod; + float maxLod; + float lodBias; + GXBool biasClamp; + GXBool doEdgeLod; + GXAnisotropy maxAniso; + GXTlut tlut; + bool dataInvalidated; +}; +static_assert(sizeof(GXTexObj_) <= sizeof(GXTexObj), "GXTexObj too small!"); +struct GXTlutObj_ { + aurora::gfx::TextureHandle ref; +}; +static_assert(sizeof(GXTlutObj_) <= sizeof(GXTlutObj), "GXTlutObj too small!"); + +namespace aurora::gfx { +struct TextureBind { + GXTexObj_ texObj; + + TextureBind() noexcept = default; + TextureBind(GXTexObj_ obj) noexcept : texObj(std::move(obj)) {} + void reset() noexcept { texObj.ref.reset(); }; + [[nodiscard]] WGPUSamplerDescriptor get_descriptor() const noexcept; + operator bool() const noexcept { return texObj.ref.operator bool(); } +}; +} // namespace 
aurora::gfx diff --git a/lib/gfx/texture_convert.cpp b/lib/gfx/texture_convert.cpp new file mode 100644 index 0000000..ad755d9 --- /dev/null +++ b/lib/gfx/texture_convert.cpp @@ -0,0 +1,607 @@ +#include "texture_convert.hpp" + +#include "../internal.hpp" + +namespace aurora::gfx { +static Module Log("aurora::gfx"); + +struct RGBA8 { + uint8_t r; + uint8_t g; + uint8_t b; + uint8_t a; +}; +struct DXT1Block { + uint16_t color1; + uint16_t color2; + std::array lines; +}; + +// http://www.mindcontrol.org/~hplus/graphics/expand-bits.html +template +constexpr uint8_t ExpandTo8(uint8_t n) { + if constexpr (v == 3) { + return (n << (8 - 3)) | (n << (8 - 6)) | (n >> (9 - 8)); + } else { + return (n << (8 - v)) | (n >> ((v * 2) - 8)); + } +} + +constexpr uint8_t S3TCBlend(uint32_t a, uint32_t b) { + return static_cast((((a << 1) + a) + ((b << 2) + b)) >> 3); +} + +constexpr uint8_t HalfBlend(uint8_t a, uint8_t b) { + return static_cast((static_cast(a) + static_cast(b)) >> 1); +} + +static size_t ComputeMippedTexelCount(uint32_t w, uint32_t h, uint32_t mips) { + size_t ret = w * h; + for (uint32_t i = mips; i > 1; --i) { + if (w > 1) { + w /= 2; + } + if (h > 1) { + h /= 2; + } + ret += w * h; + } + return ret; +} + +static size_t ComputeMippedBlockCountDXT1(uint32_t w, uint32_t h, uint32_t mips) { + w /= 4; + h /= 4; + size_t ret = w * h; + for (uint32_t i = mips; i > 1; --i) { + if (w > 1) { + w /= 2; + } + if (h > 1) { + h /= 2; + } + ret += w * h; + } + return ret; +} + +template +constexpr T bswap16(T val) noexcept { +#if __GNUC__ + return __builtin_bswap16(val); +#elif _WIN32 + return _byteswap_ushort(val); +#else + return (val = (val << 8) | ((val >> 8) & 0xFF)); +#endif +} + +static ByteBuffer BuildI4FromGCN(uint32_t width, uint32_t height, uint32_t mips, ArrayRef data) { + const size_t texelCount = ComputeMippedTexelCount(width, height, mips); + ByteBuffer buf{texelCount}; + + uint32_t w = width; + uint32_t h = height; + uint8_t* targetMip = buf.data(); + const uint8_t* in = data.data(); + for (uint32_t mip = 0; mip < mips; ++mip) { + const uint32_t bwidth = (w + 7) / 8; + const uint32_t bheight = (h + 7) / 8; + for (uint32_t by = 0; by < bheight; ++by) { + const uint32_t baseY = by * 8; + for (uint32_t bx = 0; bx < bwidth; ++bx) { + const uint32_t baseX = bx * 8; + for (uint32_t y = 0; y < std::min(h, 8u); ++y) { + uint8_t* target = targetMip + (baseY + y) * w + baseX; + for (uint32_t x = 0; x < std::min(w, 8u); ++x) { + target[x] = ExpandTo8<4>(in[x / 2] >> ((x & 1) ? 
0 : 4) & 0xf); + } + in += std::min(w / 4, 4); + } + } + } + targetMip += w * h; + if (w > 1) { + w /= 2; + } + if (h > 1) { + h /= 2; + } + } + + return buf; +} + +static ByteBuffer BuildI8FromGCN(uint32_t width, uint32_t height, uint32_t mips, ArrayRef data) { + const size_t texelCount = ComputeMippedTexelCount(width, height, mips); + ByteBuffer buf{texelCount}; + + uint32_t w = width; + uint32_t h = height; + auto* targetMip = buf.data(); + const uint8_t* in = data.data(); + for (uint32_t mip = 0; mip < mips; ++mip) { + const uint32_t bwidth = (w + 7) / 8; + const uint32_t bheight = (h + 3) / 4; + for (uint32_t by = 0; by < bheight; ++by) { + const uint32_t baseY = by * 4; + for (uint32_t bx = 0; bx < bwidth; ++bx) { + const uint32_t baseX = bx * 8; + for (uint32_t y = 0; y < 4; ++y) { + uint8_t* target = targetMip + (baseY + y) * w + baseX; + const auto n = std::min(w, 8u); + for (size_t x = 0; x < n; ++x) { + target[x] = in[x]; + } + in += n; + } + } + } + targetMip += w * h; + if (w > 1) { + w /= 2; + } + if (h > 1) { + h /= 2; + } + } + + return buf; +} + +ByteBuffer BuildIA4FromGCN(uint32_t width, uint32_t height, uint32_t mips, ArrayRef data) { + const size_t texelCount = ComputeMippedTexelCount(width, height, mips); + ByteBuffer buf{sizeof(RGBA8) * texelCount}; + + uint32_t w = width; + uint32_t h = height; + RGBA8* targetMip = reinterpret_cast(buf.data()); + const uint8_t* in = data.data(); + for (uint32_t mip = 0; mip < mips; ++mip) { + const uint32_t bwidth = (w + 7) / 8; + const uint32_t bheight = (h + 3) / 4; + for (uint32_t by = 0; by < bheight; ++by) { + const uint32_t baseY = by * 4; + for (uint32_t bx = 0; bx < bwidth; ++bx) { + const uint32_t baseX = bx * 8; + for (uint32_t y = 0; y < 4; ++y) { + RGBA8* target = targetMip + (baseY + y) * w + baseX; + const auto n = std::min(w, 8u); + for (size_t x = 0; x < n; ++x) { + const uint8_t intensity = ExpandTo8<4>(in[x] & 0xf); + target[x].r = intensity; + target[x].g = intensity; + target[x].b = intensity; + target[x].a = ExpandTo8<4>(in[x] >> 4); + } + in += n; + } + } + } + targetMip += w * h; + if (w > 1) { + w /= 2; + } + if (h > 1) { + h /= 2; + } + } + + return buf; +} + +ByteBuffer BuildIA8FromGCN(uint32_t width, uint32_t height, uint32_t mips, ArrayRef data) { + const size_t texelCount = ComputeMippedTexelCount(width, height, mips); + ByteBuffer buf{sizeof(RGBA8) * texelCount}; + + uint32_t w = width; + uint32_t h = height; + auto* targetMip = reinterpret_cast(buf.data()); + const auto* in = reinterpret_cast(data.data()); + for (uint32_t mip = 0; mip < mips; ++mip) { + const uint32_t bwidth = (w + 3) / 4; + const uint32_t bheight = (h + 3) / 4; + for (uint32_t by = 0; by < bheight; ++by) { + const uint32_t baseY = by * 4; + for (uint32_t bx = 0; bx < bwidth; ++bx) { + const uint32_t baseX = bx * 4; + for (uint32_t y = 0; y < 4; ++y) { + RGBA8* target = targetMip + (baseY + y) * w + baseX; + for (size_t x = 0; x < 4; ++x) { + const auto texel = bswap16(in[x]); + const uint8_t intensity = texel >> 8; + target[x].r = intensity; + target[x].g = intensity; + target[x].b = intensity; + target[x].a = texel & 0xff; + } + in += 4; + } + } + } + targetMip += w * h; + if (w > 1) { + w /= 2; + } + if (h > 1) { + h /= 2; + } + } + + return buf; +} + +ByteBuffer BuildC4FromGCN(uint32_t width, uint32_t height, uint32_t mips, ArrayRef data) { + const size_t texelCount = ComputeMippedTexelCount(width, height, mips); + ByteBuffer buf{texelCount * 2}; + + uint32_t w = width; + uint32_t h = height; + uint16_t* targetMip = 
reinterpret_cast(buf.data()); + const uint8_t* in = data.data(); + for (uint32_t mip = 0; mip < mips; ++mip) { + const uint32_t bwidth = (w + 7) / 8; + const uint32_t bheight = (h + 7) / 8; + for (uint32_t by = 0; by < bheight; ++by) { + const uint32_t baseY = by * 8; + for (uint32_t bx = 0; bx < bwidth; ++bx) { + const uint32_t baseX = bx * 8; + for (uint32_t y = 0; y < std::min(8u, h); ++y) { + uint16_t* target = targetMip + (baseY + y) * w + baseX; + const auto n = std::min(w, 8u); + for (size_t x = 0; x < n; ++x) { + target[x] = in[x / 2] >> ((x & 1) ? 0 : 4) & 0xf; + } + in += n / 2; + } + } + } + targetMip += w * h; + if (w > 1) { + w /= 2; + } + if (h > 1) { + h /= 2; + } + } + + return buf; +} + +ByteBuffer BuildC8FromGCN(uint32_t width, uint32_t height, uint32_t mips, ArrayRef data) { + const size_t texelCount = ComputeMippedTexelCount(width, height, mips); + ByteBuffer buf{texelCount * 2}; + + uint32_t w = width; + uint32_t h = height; + uint16_t* targetMip = reinterpret_cast(buf.data()); + const uint8_t* in = data.data(); + for (uint32_t mip = 0; mip < mips; ++mip) { + const uint32_t bwidth = (w + 7) / 8; + const uint32_t bheight = (h + 3) / 4; + for (uint32_t by = 0; by < bheight; ++by) { + const uint32_t baseY = by * 4; + for (uint32_t bx = 0; bx < bwidth; ++bx) { + const uint32_t baseX = bx * 8; + for (uint32_t y = 0; y < 4; ++y) { + uint16_t* target = targetMip + (baseY + y) * w + baseX; + const auto n = std::min(w, 8u); + for (size_t x = 0; x < n; ++x) { + target[x] = in[x]; + } + in += n; + } + } + } + targetMip += w * h; + if (w > 1) { + w /= 2; + } + if (h > 1) { + h /= 2; + } + } + + return buf; +} + +ByteBuffer BuildRGB565FromGCN(uint32_t width, uint32_t height, uint32_t mips, ArrayRef data) { + const size_t texelCount = ComputeMippedTexelCount(width, height, mips); + ByteBuffer buf{sizeof(RGBA8) * texelCount}; + + uint32_t w = width; + uint32_t h = height; + auto* targetMip = reinterpret_cast(buf.data()); + const auto* in = reinterpret_cast(data.data()); + for (uint32_t mip = 0; mip < mips; ++mip) { + const uint32_t bwidth = (w + 3) / 4; + const uint32_t bheight = (h + 3) / 4; + for (uint32_t by = 0; by < bheight; ++by) { + const uint32_t baseY = by * 4; + for (uint32_t bx = 0; bx < bwidth; ++bx) { + const uint32_t baseX = bx * 4; + for (uint32_t y = 0; y < std::min(4u, h); ++y) { + RGBA8* target = targetMip + (baseY + y) * w + baseX; + for (size_t x = 0; x < std::min(4u, w); ++x) { + const auto texel = bswap16(in[x]); + target[x].r = ExpandTo8<5>(texel >> 11 & 0x1f); + target[x].g = ExpandTo8<6>(texel >> 5 & 0x3f); + target[x].b = ExpandTo8<5>(texel & 0x1f); + target[x].a = 0xff; + } + in += 4; + } + } + } + targetMip += w * h; + if (w > 1) { + w /= 2; + } + if (h > 1) { + h /= 2; + } + } + + return buf; +} + +ByteBuffer BuildRGB5A3FromGCN(uint32_t width, uint32_t height, uint32_t mips, ArrayRef data) { + size_t texelCount = ComputeMippedTexelCount(width, height, mips); + ByteBuffer buf{sizeof(RGBA8) * texelCount}; + + uint32_t w = width; + uint32_t h = height; + auto* targetMip = reinterpret_cast(buf.data()); + const auto* in = reinterpret_cast(data.data()); + for (uint32_t mip = 0; mip < mips; ++mip) { + const uint32_t bwidth = (w + 3) / 4; + const uint32_t bheight = (h + 3) / 4; + for (uint32_t by = 0; by < bheight; ++by) { + const uint32_t baseY = by * 4; + for (uint32_t bx = 0; bx < bwidth; ++bx) { + const uint32_t baseX = bx * 4; + for (uint32_t y = 0; y < std::min(4u, h); ++y) { + RGBA8* target = targetMip + (baseY + y) * w + baseX; + for (size_t x = 0; x < 
std::min(4u, w); ++x) { + const auto texel = bswap16(in[x]); + if ((texel & 0x8000) != 0) { + target[x].r = ExpandTo8<5>(texel >> 10 & 0x1f); + target[x].g = ExpandTo8<5>(texel >> 5 & 0x1f); + target[x].b = ExpandTo8<5>(texel & 0x1f); + target[x].a = 0xff; + } else { + target[x].r = ExpandTo8<4>(texel >> 8 & 0xf); + target[x].g = ExpandTo8<4>(texel >> 4 & 0xf); + target[x].b = ExpandTo8<4>(texel & 0xf); + target[x].a = ExpandTo8<3>(texel >> 12 & 0x7); + } + } + in += 4; + } + } + } + targetMip += w * h; + if (w > 1) { + w /= 2; + } + if (h > 1) { + h /= 2; + } + } + + return buf; +} + +ByteBuffer BuildRGBA8FromGCN(uint32_t width, uint32_t height, uint32_t mips, ArrayRef data) { + const size_t texelCount = ComputeMippedTexelCount(width, height, mips); + ByteBuffer buf{sizeof(RGBA8) * texelCount}; + + uint32_t w = width; + uint32_t h = height; + auto* targetMip = reinterpret_cast(buf.data()); + const uint8_t* in = data.data(); + for (uint32_t mip = 0; mip < mips; ++mip) { + const uint32_t bwidth = (w + 3) / 4; + const uint32_t bheight = (h + 3) / 4; + for (uint32_t by = 0; by < bheight; ++by) { + const uint32_t baseY = by * 4; + for (uint32_t bx = 0; bx < bwidth; ++bx) { + const uint32_t baseX = bx * 4; + for (uint32_t c = 0; c < 2; ++c) { + for (uint32_t y = 0; y < 4; ++y) { + RGBA8* target = targetMip + (baseY + y) * w + baseX; + for (size_t x = 0; x < 4; ++x) { + if (c != 0) { + target[x].g = in[x * 2]; + target[x].b = in[x * 2 + 1]; + } else { + target[x].a = in[x * 2]; + target[x].r = in[x * 2 + 1]; + } + } + in += 8; + } + } + } + } + targetMip += w * h; + if (w > 1) { + w /= 2; + } + if (h > 1) { + h /= 2; + } + } + + return buf; +} + +ByteBuffer BuildDXT1FromGCN(uint32_t width, uint32_t height, uint32_t mips, ArrayRef data) { + const size_t blockCount = ComputeMippedBlockCountDXT1(width, height, mips); + ByteBuffer buf{sizeof(DXT1Block) * blockCount}; + + uint32_t w = width / 4; + uint32_t h = height / 4; + auto* targetMip = reinterpret_cast(buf.data()); + const auto* in = reinterpret_cast(data.data()); + for (uint32_t mip = 0; mip < mips; ++mip) { + const uint32_t bwidth = (w + 1) / 2; + const uint32_t bheight = (h + 1) / 2; + for (uint32_t by = 0; by < bheight; ++by) { + const uint32_t baseY = by * 2; + for (uint32_t bx = 0; bx < bwidth; ++bx) { + const uint32_t baseX = bx * 2; + for (uint32_t y = 0; y < 2; ++y) { + DXT1Block* target = targetMip + (baseY + y) * w + baseX; + for (size_t x = 0; x < 2; ++x) { + target[x].color1 = bswap16(in[x].color1); + target[x].color2 = bswap16(in[x].color2); + for (size_t i = 0; i < 4; ++i) { + std::array ind; + const uint8_t packed = in[x].lines[i]; + ind[3] = packed & 0x3; + ind[2] = (packed >> 2) & 0x3; + ind[1] = (packed >> 4) & 0x3; + ind[0] = (packed >> 6) & 0x3; + target[x].lines[i] = ind[0] | (ind[1] << 2) | (ind[2] << 4) | (ind[3] << 6); + } + } + in += 2; + } + } + } + targetMip += w * h; + + if (w > 1) { + w /= 2; + } + if (h > 1) { + h /= 2; + } + } + + return buf; +} + +ByteBuffer BuildRGBA8FromCMPR(uint32_t width, uint32_t height, uint32_t mips, ArrayRef data) { + const size_t texelCount = ComputeMippedTexelCount(width, height, mips); + const size_t blockCount = ComputeMippedBlockCountDXT1(width, height, mips); + ByteBuffer buf{sizeof(RGBA8) * texelCount}; + + uint32_t h = height; + uint32_t w = width; + uint8_t* dst = buf.data(); + const uint8_t* src = data.data(); + for (uint32_t mip = 0; mip < mips; ++mip) { + for (uint32_t yy = 0; yy < h; yy += 8) { + for (uint32_t xx = 0; xx < w; xx += 8) { + for (uint32_t yb = 0; yb < 8; yb += 
4) { + for (uint32_t xb = 0; xb < 8; xb += 4) { + // CMPR difference: Big-endian color1/2 + const uint16_t color1 = bswap16(*reinterpret_cast(src)); + const uint16_t color2 = bswap16(*reinterpret_cast(src + 2)); + src += 4; + + // Fill in first two colors in color table. + std::array color_table{}; + + color_table[0] = ExpandTo8<5>(static_cast((color1 >> 11) & 0x1F)); + color_table[1] = ExpandTo8<6>(static_cast((color1 >> 5) & 0x3F)); + color_table[2] = ExpandTo8<5>(static_cast(color1 & 0x1F)); + color_table[3] = 0xFF; + + color_table[4] = ExpandTo8<5>(static_cast((color2 >> 11) & 0x1F)); + color_table[5] = ExpandTo8<6>(static_cast((color2 >> 5) & 0x3F)); + color_table[6] = ExpandTo8<5>(static_cast(color2 & 0x1F)); + color_table[7] = 0xFF; + if (color1 > color2) { + // Predict gradients. + color_table[8] = S3TCBlend(color_table[4], color_table[0]); + color_table[9] = S3TCBlend(color_table[5], color_table[1]); + color_table[10] = S3TCBlend(color_table[6], color_table[2]); + color_table[11] = 0xFF; + + color_table[12] = S3TCBlend(color_table[0], color_table[4]); + color_table[13] = S3TCBlend(color_table[1], color_table[5]); + color_table[14] = S3TCBlend(color_table[2], color_table[6]); + color_table[15] = 0xFF; + } else { + color_table[8] = HalfBlend(color_table[0], color_table[4]); + color_table[9] = HalfBlend(color_table[1], color_table[5]); + color_table[10] = HalfBlend(color_table[2], color_table[6]); + color_table[11] = 0xFF; + + // CMPR difference: GX fills with an alpha 0 midway point here. + color_table[12] = color_table[8]; + color_table[13] = color_table[9]; + color_table[14] = color_table[10]; + color_table[15] = 0; + } + + for (uint32_t y = 0; y < 4; ++y) { + uint8_t bits = src[y]; + for (uint32_t x = 0; x < 4; ++x) { + if (xx + xb + x >= w || yy + yb + y >= h) { + continue; + } + uint8_t* dstOffs = dst + ((yy + yb + y) * w + (xx + xb + x)) * 4; + const uint8_t* colorTableOffs = &color_table[static_cast((bits >> 6) & 3) * 4]; + memcpy(dstOffs, colorTableOffs, 4); + bits <<= 2; + } + } + src += 4; + } + } + } + } + dst += w * h * 4; + if (w > 1) { + w /= 2; + } + if (h > 1) { + h /= 2; + } + } + + return buf; +} + +ByteBuffer convert_texture(u32 format, uint32_t width, uint32_t height, uint32_t mips, ArrayRef data) { + switch (format) { + default: + Log.report(LOG_FATAL, FMT_STRING("convert_texture: unknown format supplied {}"), format); + unreachable(); + case GX_TF_R8_PC: + case GX_TF_RGBA8_PC: + return {}; // No conversion + case GX_TF_I4: + return BuildI4FromGCN(width, height, mips, data); + case GX_TF_I8: + return BuildI8FromGCN(width, height, mips, data); + case GX_TF_IA4: + return BuildIA4FromGCN(width, height, mips, data); + case GX_TF_IA8: + return BuildIA8FromGCN(width, height, mips, data); + case GX_TF_C4: + return BuildC4FromGCN(width, height, mips, data); + case GX_TF_C8: + return BuildC8FromGCN(width, height, mips, data); + case GX_TF_C14X2: + Log.report(LOG_FATAL, FMT_STRING("convert_texture: C14X2 unimplemented")); + unreachable(); + case GX_TF_RGB565: + return BuildRGB565FromGCN(width, height, mips, data); + case GX_TF_RGB5A3: + return BuildRGB5A3FromGCN(width, height, mips, data); + case GX_TF_RGBA8: + return BuildRGBA8FromGCN(width, height, mips, data); + case GX_TF_CMPR: + if (wgpuDeviceHasFeature(webgpu::g_device, WGPUFeatureName_TextureCompressionBC)) { + return BuildDXT1FromGCN(width, height, mips, data); + } else { + return BuildRGBA8FromCMPR(width, height, mips, data); + } + } +} +} // namespace aurora::gfx diff --git a/lib/gfx/texture_convert.hpp 
b/lib/gfx/texture_convert.hpp new file mode 100644 index 0000000..f69a561 --- /dev/null +++ b/lib/gfx/texture_convert.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include "common.hpp" +#include "texture.hpp" +#include "../webgpu/gpu.hpp" + +namespace aurora::gfx { +static WGPUTextureFormat to_wgpu(u32 format) { + switch (format) { + case GX_TF_I4: + case GX_TF_I8: + case GX_TF_R8_PC: + return WGPUTextureFormat_R8Unorm; + case GX_TF_C4: + case GX_TF_C8: + case GX_TF_C14X2: + return WGPUTextureFormat_R16Sint; + case GX_TF_CMPR: + if (wgpuDeviceHasFeature(webgpu::g_device, WGPUFeatureName_TextureCompressionBC)) { + return WGPUTextureFormat_BC1RGBAUnorm; + } + [[fallthrough]]; + default: + return WGPUTextureFormat_RGBA8Unorm; + } +} + +ByteBuffer convert_texture(u32 format, uint32_t width, uint32_t height, uint32_t mips, ArrayRef data); +} // namespace aurora::gfx diff --git a/lib/imgui.cpp b/lib/imgui.cpp new file mode 100644 index 0000000..c4c3753 --- /dev/null +++ b/lib/imgui.cpp @@ -0,0 +1,170 @@ +#include "imgui.hpp" + +#include "webgpu/gpu.hpp" +#include "internal.hpp" +#include "window.hpp" + +#include +#include + +#include "../imgui/backends/imgui_impl_sdl.cpp" // NOLINT(bugprone-suspicious-include) +#include "../imgui/backends/imgui_impl_sdlrenderer.cpp" // NOLINT(bugprone-suspicious-include) +#include "../imgui/backends/imgui_impl_wgpu.cpp" // NOLINT(bugprone-suspicious-include) + +namespace aurora::imgui { +static float g_scale; +static std::string g_imguiSettings{}; +static std::string g_imguiLog{}; +static bool g_useSdlRenderer = false; + +static std::vector g_sdlTextures; +static std::vector g_wgpuTextures; + +void create_context() noexcept { + IMGUI_CHECKVERSION(); + ImGui::CreateContext(); + ImGuiIO& io = ImGui::GetIO(); + g_imguiSettings = std::string{g_config.configPath} + "/imgui.ini"; + g_imguiLog = std::string{g_config.configPath} + "/imgui.log"; + io.IniFilename = g_imguiSettings.c_str(); + io.LogFilename = g_imguiLog.c_str(); +} + +void initialize() noexcept { + SDL_Renderer* renderer = window::get_sdl_renderer(); + ImGui_ImplSDL2_Init(window::get_sdl_window(), renderer); +#ifdef __APPLE__ + // Disable MouseCanUseGlobalState for scaling purposes + ImGui_ImplSDL2_GetBackendData()->MouseCanUseGlobalState = false; +#endif + g_useSdlRenderer = renderer != nullptr; + if (g_useSdlRenderer) { + ImGui_ImplSDLRenderer_Init(renderer); + } else { + ImGui_ImplWGPU_Init(webgpu::g_device, 1, webgpu::g_graphicsConfig.colorFormat); + } +} + +void shutdown() noexcept { + if (g_useSdlRenderer) { + ImGui_ImplSDLRenderer_Shutdown(); + } else { + ImGui_ImplWGPU_Shutdown(); + } + ImGui_ImplSDL2_Shutdown(); + ImGui::DestroyContext(); + for (const auto& texture : g_sdlTextures) { + SDL_DestroyTexture(texture); + } + g_sdlTextures.clear(); + for (const auto& texture : g_wgpuTextures) { + wgpuTextureDestroy(texture); + } + g_wgpuTextures.clear(); +} + +void process_event(const SDL_Event& event) noexcept { +#ifdef __APPLE__ + if (event.type == SDL_MOUSEMOTION) { + auto& io = ImGui::GetIO(); + // Scale up mouse coordinates + io.AddMousePosEvent(static_cast(event.motion.x) * g_scale, static_cast(event.motion.y) * g_scale); + return; + } +#endif + ImGui_ImplSDL2_ProcessEvent(&event); +} + +void new_frame(const AuroraWindowSize& size) noexcept { + if (g_useSdlRenderer) { + ImGui_ImplSDLRenderer_NewFrame(); + } else { + if (g_scale != size.scale) { + if (g_scale > 0.f) { + // TODO wgpu backend bug: doesn't clear bind groups on invalidate + g_resources.ImageBindGroups.Clear(); + 
ImGui_ImplWGPU_CreateDeviceObjects(); + } + g_scale = size.scale; + } + ImGui_ImplWGPU_NewFrame(); + } + ImGui_ImplSDL2_NewFrame(); + + // Render at full DPI + ImGui::GetIO().DisplaySize = { + static_cast(size.fb_width), + static_cast(size.fb_height), + }; + ImGui::NewFrame(); +} + +void render(WGPURenderPassEncoder pass) noexcept { + ImGui::Render(); + + auto* data = ImGui::GetDrawData(); + // io.DisplayFramebufferScale is informational; we're rendering at full DPI + data->FramebufferScale = {1.f, 1.f}; + if (g_useSdlRenderer) { + SDL_Renderer* renderer = ImGui_ImplSDLRenderer_GetBackendData()->SDLRenderer; + SDL_RenderClear(renderer); + ImGui_ImplSDLRenderer_RenderDrawData(data); + SDL_RenderPresent(renderer); + } else { + ImGui_ImplWGPU_RenderDrawData(data, pass); + } +} + +ImTextureID add_texture(uint32_t width, uint32_t height, const uint8_t* data) noexcept { + if (g_useSdlRenderer) { + SDL_Renderer* renderer = ImGui_ImplSDLRenderer_GetBackendData()->SDLRenderer; + SDL_Texture* texture = SDL_CreateTexture(renderer, SDL_PIXELFORMAT_RGBA32, SDL_TEXTUREACCESS_STATIC, width, height); + SDL_UpdateTexture(texture, nullptr, data, width * 4); + SDL_SetTextureScaleMode(texture, SDL_ScaleModeLinear); + g_sdlTextures.push_back(texture); + return texture; + } + const auto size = WGPUExtent3D{ + .width = width, + .height = height, + .depthOrArrayLayers = 1, + }; + const auto textureDescriptor = WGPUTextureDescriptor{ + .label = "imgui texture", + .usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst, + .dimension = WGPUTextureDimension_2D, + .size = size, + .format = WGPUTextureFormat_RGBA8Unorm, + .mipLevelCount = 1, + .sampleCount = 1, + }; + const auto textureViewDescriptor = WGPUTextureViewDescriptor{ + .label = "imgui texture view", + .format = WGPUTextureFormat_RGBA8Unorm, + .dimension = WGPUTextureViewDimension_2D, + .mipLevelCount = WGPU_MIP_LEVEL_COUNT_UNDEFINED, + .arrayLayerCount = WGPU_ARRAY_LAYER_COUNT_UNDEFINED, + }; + auto texture = wgpuDeviceCreateTexture(webgpu::g_device, &textureDescriptor); + auto textureView = wgpuTextureCreateView(texture, &textureViewDescriptor); + { + const auto dstView = WGPUImageCopyTexture{ + .texture = texture, + }; + const auto dataLayout = WGPUTextureDataLayout{ + .bytesPerRow = 4 * width, + .rowsPerImage = height, + }; + wgpuQueueWriteTexture(webgpu::g_queue, &dstView, data, width * height * 4, &dataLayout, &size); + } + g_wgpuTextures.push_back(texture); + return textureView; +} +} // namespace aurora::imgui + +// C bindings +extern "C" { +ImTextureID aurora_imgui_add_texture(uint32_t width, uint32_t height, const void* rgba8) { + return aurora::imgui::add_texture(width, height, static_cast(rgba8)); +} +} diff --git a/lib/imgui.hpp b/lib/imgui.hpp new file mode 100644 index 0000000..2b34ed3 --- /dev/null +++ b/lib/imgui.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include + +#include + +union SDL_Event; +typedef struct WGPURenderPassEncoderImpl* WGPURenderPassEncoder; + +namespace aurora::imgui { +void create_context() noexcept; +void initialize() noexcept; +void shutdown() noexcept; + +void process_event(const SDL_Event& event) noexcept; +void new_frame(const AuroraWindowSize& size) noexcept; +void render(WGPURenderPassEncoder pass) noexcept; +} // namespace aurora::imgui diff --git a/lib/input.cpp b/lib/input.cpp new file mode 100644 index 0000000..910ed01 --- /dev/null +++ b/lib/input.cpp @@ -0,0 +1,815 @@ +#include "input.hpp" +#include "internal.hpp" +#include "pad.hpp" + +#include "magic_enum.hpp" + +#include +#include + +#include 
+#include +#include +#include + +using namespace std::string_view_literals; + +namespace aurora::input { +static Module Log("aurora::input"); + +struct GameController { + SDL_GameController* m_controller = nullptr; + bool m_isGameCube = false; + Sint32 m_index = -1; + bool m_hasRumble = false; + PADDeadZones m_deadZones{ + .emulateTriggers = true, + .useDeadzones = true, + .stickDeadZone = 8000, + .substickDeadZone = 8000, + .leftTriggerActivationZone = 31150, + .rightTriggerActivationZone = 31150, + }; + uint16_t m_vid = 0; + uint16_t m_pid = 0; + std::array m_mapping{}; + bool m_mappingLoaded = false; + constexpr bool operator==(const GameController& other) const { + return m_controller == other.m_controller && m_index == other.m_index; + } +}; +absl::flat_hash_map g_GameControllers; + +GameController* get_controller_for_player(uint32_t player) noexcept { + for (auto& [which, controller] : g_GameControllers) { + if (player_index(which) == player) { + return &controller; + } + } + +#if 0 + /* If we don't have a controller assigned to this port use the first unassigned controller */ + if (!g_GameControllers.empty()) { + int32_t availIndex = -1; + GameController* ct = nullptr; + for (auto& controller : g_GameControllers) { + if (player_index(controller.first) == -1) { + availIndex = controller.first; + ct = &controller.second; + break; + } + } + if (availIndex != -1) { + set_player_index(availIndex, player); + return ct; + } + } +#endif + return nullptr; +} + +Sint32 get_instance_for_player(uint32_t player) noexcept { + for (const auto& [which, controller] : g_GameControllers) { + if (player_index(which) == player) { + return which; + } + } + + return {}; +} + +static std::optional remap_controller_layout(std::string_view mapping) { + std::string newMapping; + newMapping.reserve(mapping.size()); + absl::btree_map entries; + for (size_t idx = 0; const auto value : absl::StrSplit(mapping, ',')) { + if (idx < 2) { + if (idx > 0) { + newMapping.push_back(','); + } + newMapping.append(value); + } else { + const auto split = absl::StrSplit(value, absl::MaxSplits(':', 2)); + auto iter = split.begin(); + entries.emplace(*iter++, *iter); + } + idx++; + } + if (entries.contains("rightshoulder"sv) && !entries.contains("leftshoulder"sv)) { + Log.report(LOG_INFO, FMT_STRING("Remapping GameCube controller layout")); + entries.insert_or_assign("back"sv, entries["rightshoulder"sv]); + // TODO trigger buttons may differ per platform + entries.insert_or_assign("leftshoulder"sv, "b11"sv); + entries.insert_or_assign("rightshoulder"sv, "b10"sv); + } else if (entries.contains("leftshoulder"sv) && entries.contains("rightshoulder"sv) && entries.contains("back"sv)) { + Log.report(LOG_INFO, FMT_STRING("Controller has standard layout")); + auto a = entries["a"sv]; + entries.insert_or_assign("a"sv, entries["b"sv]); + entries.insert_or_assign("b"sv, a); + auto x = entries["x"sv]; + entries.insert_or_assign("x"sv, entries["y"sv]); + entries.insert_or_assign("y"sv, x); + } else { + Log.report(LOG_ERROR, FMT_STRING("Controller has unsupported layout: {}"), mapping); + return {}; + } + for (auto [k, v] : entries) { + newMapping.push_back(','); + newMapping.append(k); + newMapping.push_back(':'); + newMapping.append(v); + } + return newMapping; +} + +Sint32 add_controller(Sint32 which) noexcept { + auto* ctrl = SDL_GameControllerOpen(which); + if (ctrl != nullptr) { + { + char* mapping = SDL_GameControllerMapping(ctrl); + if (mapping != nullptr) { + auto newMapping = remap_controller_layout(mapping); + SDL_free(mapping); + 
if (newMapping) { + if (SDL_GameControllerAddMapping(newMapping->c_str()) == -1) { + Log.report(LOG_ERROR, FMT_STRING("Failed to update controller mapping: {}"), SDL_GetError()); + } + } + } else { + Log.report(LOG_ERROR, FMT_STRING("Failed to retrieve mapping for controller")); + } + } + GameController controller; + controller.m_controller = ctrl; + controller.m_index = which; + controller.m_vid = SDL_GameControllerGetVendor(ctrl); + controller.m_pid = SDL_GameControllerGetProduct(ctrl); + if (controller.m_vid == 0x05ac /* USB_VENDOR_APPLE */ && controller.m_pid == 3) { + // Ignore Apple TV remote + SDL_GameControllerClose(ctrl); + return -1; + } + controller.m_isGameCube = controller.m_vid == 0x057E && controller.m_pid == 0x0337; +#if SDL_VERSION_ATLEAST(2, 0, 18) + controller.m_hasRumble = (SDL_GameControllerHasRumble(ctrl) != 0u); +#else + controller.m_hasRumble = true; +#endif + Sint32 instance = SDL_JoystickInstanceID(SDL_GameControllerGetJoystick(ctrl)); + g_GameControllers[instance] = controller; + return instance; + } + + return -1; +} + +void remove_controller(Uint32 instance) noexcept { + if (g_GameControllers.find(instance) != g_GameControllers.end()) { + SDL_GameControllerClose(g_GameControllers[instance].m_controller); + g_GameControllers.erase(instance); + } +} + +bool is_gamecube(Uint32 instance) noexcept { + if (g_GameControllers.find(instance) != g_GameControllers.end()) { + return g_GameControllers[instance].m_isGameCube; + } + return false; +} + +int32_t player_index(Uint32 instance) noexcept { + if (g_GameControllers.find(instance) != g_GameControllers.end()) { + return SDL_GameControllerGetPlayerIndex(g_GameControllers[instance].m_controller); + } + return -1; +} + +void set_player_index(Uint32 instance, Sint32 index) noexcept { + if (g_GameControllers.find(instance) != g_GameControllers.end()) { + SDL_GameControllerSetPlayerIndex(g_GameControllers[instance].m_controller, index); + } +} + +std::string controller_name(Uint32 instance) noexcept { + if (g_GameControllers.find(instance) != g_GameControllers.end()) { + const auto* name = SDL_GameControllerName(g_GameControllers[instance].m_controller); + if (name != nullptr) { + return {name}; + } + } + return {}; +} + +bool controller_has_rumble(Uint32 instance) noexcept { + if (g_GameControllers.find(instance) != g_GameControllers.end()) { + return g_GameControllers[instance].m_hasRumble; + } + + return false; +} + +void controller_rumble(uint32_t instance, uint16_t low_freq_intensity, uint16_t high_freq_intensity, + uint16_t duration_ms) noexcept { + + if (g_GameControllers.find(instance) != g_GameControllers.end()) { + SDL_GameControllerRumble(g_GameControllers[instance].m_controller, low_freq_intensity, high_freq_intensity, + duration_ms); + } +} + +uint32_t controller_count() noexcept { return g_GameControllers.size(); } + +} // namespace aurora::input + +static const std::array mDefaultButtons{{ + {SDL_CONTROLLER_BUTTON_A, PAD_BUTTON_A}, + {SDL_CONTROLLER_BUTTON_B, PAD_BUTTON_B}, + {SDL_CONTROLLER_BUTTON_X, PAD_BUTTON_X}, + {SDL_CONTROLLER_BUTTON_Y, PAD_BUTTON_Y}, + {SDL_CONTROLLER_BUTTON_START, PAD_BUTTON_START}, + {SDL_CONTROLLER_BUTTON_BACK, PAD_TRIGGER_Z}, + {SDL_CONTROLLER_BUTTON_LEFTSHOULDER, PAD_TRIGGER_L}, + {SDL_CONTROLLER_BUTTON_RIGHTSHOULDER, PAD_TRIGGER_R}, + {SDL_CONTROLLER_BUTTON_DPAD_UP, PAD_BUTTON_UP}, + {SDL_CONTROLLER_BUTTON_DPAD_DOWN, PAD_BUTTON_DOWN}, + {SDL_CONTROLLER_BUTTON_DPAD_LEFT, PAD_BUTTON_LEFT}, + {SDL_CONTROLLER_BUTTON_DPAD_RIGHT, PAD_BUTTON_RIGHT}, +}}; + +void PADSetSpec(u32 spec) {} 
+BOOL PADInit() { return true; } +BOOL PADRecalibrate(u32 mask) { return true; } +BOOL PADReset(u32 mask) { return true; } +void PADSetAnalogMode(u32 mode) {} + +aurora::input::GameController* __PADGetControllerForIndex(uint32_t idx) { + if (idx >= aurora::input::g_GameControllers.size()) { + return nullptr; + } + + uint32_t tmp = 0; + auto iter = aurora::input::g_GameControllers.begin(); + while (tmp < idx) { + ++iter; + ++tmp; + } + if (iter == aurora::input::g_GameControllers.end()) { + return nullptr; + } + + return &iter->second; +} + +uint32_t PADCount() { return aurora::input::g_GameControllers.size(); } + +const char* PADGetNameForControllerIndex(uint32_t idx) { + auto* ctrl = __PADGetControllerForIndex(idx); + if (ctrl == nullptr) { + return nullptr; + } + + return SDL_GameControllerName(ctrl->m_controller); +} + +void PADSetPortForIndex(uint32_t idx, int32_t port) { + auto* ctrl = __PADGetControllerForIndex(idx); + auto* dest = aurora::input::get_controller_for_player(port); + if (ctrl == nullptr) { + return; + } + if (dest != nullptr) { + SDL_GameControllerSetPlayerIndex(dest->m_controller, -1); + } + SDL_GameControllerSetPlayerIndex(ctrl->m_controller, port); +} + +int32_t PADGetIndexForPort(uint32_t port) { + auto* ctrl = aurora::input::get_controller_for_player(port); + if (ctrl == nullptr) { + return -1; + } + int32_t index = 0; + for (auto iter = aurora::input::g_GameControllers.begin(); iter != aurora::input::g_GameControllers.end(); + ++iter, ++index) { + if (&iter->second == ctrl) { + break; + } + } + + return index; +} + +void PADClearPort(uint32_t port) { + auto* ctrl = aurora::input::get_controller_for_player(port); + if (ctrl == nullptr) { + return; + } + SDL_GameControllerSetPlayerIndex(ctrl->m_controller, -1); +} + +void __PADLoadMapping(aurora::input::GameController* controller) { + int32_t playerIndex = SDL_GameControllerGetPlayerIndex(controller->m_controller); + if (playerIndex == -1) { + return; + } + + std::string basePath{aurora::g_config.configPath}; + if (!controller->m_mappingLoaded) { + controller->m_mapping = mDefaultButtons; + } + + controller->m_mappingLoaded = true; + + auto path = fmt::format(FMT_STRING("{}/{}_{:04X}_{:04X}.controller"), basePath, PADGetName(playerIndex), + controller->m_vid, controller->m_pid); + FILE* file = fopen(path.c_str(), "rb"); + if (file == nullptr) { + return; + } + + uint32_t magic = 0; + fread(&magic, 1, sizeof(uint32_t), file); + if (magic != SBIG('CTRL')) { + fmt::print(FMT_STRING("Invalid controller mapping magic!\n")); + return; + } + + uint32_t version = 0; + fread(&version, 1, sizeof(uint32_t), file); + if (version != 1) { + fmt::print(FMT_STRING("Invalid controller mapping version!\n")); + return; + } + + bool isGameCube = false; + fread(&isGameCube, 1, 1, file); + fseek(file, (ftell(file) + 31) & ~31, SEEK_SET); + uint32_t dataStart = ftell(file); + if (isGameCube) { + fseek(file, dataStart + ((sizeof(PADDeadZones) + sizeof(PADButtonMapping)) * playerIndex), SEEK_SET); + } + + fread(&controller->m_deadZones, 1, sizeof(PADDeadZones), file); + fread(&controller->m_mapping, 1, sizeof(PADButtonMapping) * controller->m_mapping.size(), file); + fclose(file); +} + +bool gBlockPAD = false; +uint32_t PADRead(PADStatus* status) { + if (gBlockPAD) { + return 0; + } + + uint32_t rumbleSupport = 0; + for (uint32_t i = 0; i < 4; ++i) { + memset(&status[i], 0, sizeof(PADStatus)); + auto controller = aurora::input::get_controller_for_player(i); + if (controller == nullptr) { + status[i].err = PAD_ERR_NO_CONTROLLER; + continue; 
+ } + + if (!controller->m_mappingLoaded) { + __PADLoadMapping(controller); + } + status[i].err = PAD_ERR_NONE; + std::for_each(controller->m_mapping.begin(), controller->m_mapping.end(), + [&controller, &i, &status](const auto& mapping) { + if (SDL_GameControllerGetButton(controller->m_controller, + static_cast(mapping.nativeButton))) { + status[i].button |= mapping.padButton; + } + }); + + Sint16 x = SDL_GameControllerGetAxis(controller->m_controller, SDL_CONTROLLER_AXIS_LEFTX); + Sint16 y = SDL_GameControllerGetAxis(controller->m_controller, SDL_CONTROLLER_AXIS_LEFTY); + if (controller->m_deadZones.useDeadzones) { + if (std::abs(x) > controller->m_deadZones.stickDeadZone) { + x /= 256; + } else { + x = 0; + } + if (std::abs(y) > controller->m_deadZones.stickDeadZone) { + y = (-(y + 1u)) / 256u; + } else { + y = 0; + } + } else { + x /= 256; + y = (-(y + 1u)) / 256u; + } + + status[i].stickX = static_cast(x); + status[i].stickY = static_cast(y); + + x = SDL_GameControllerGetAxis(controller->m_controller, SDL_CONTROLLER_AXIS_RIGHTX); + y = SDL_GameControllerGetAxis(controller->m_controller, SDL_CONTROLLER_AXIS_RIGHTY); + if (controller->m_deadZones.useDeadzones) { + if (std::abs(x) > controller->m_deadZones.substickDeadZone) { + x /= 256; + } else { + x = 0; + } + + if (std::abs(y) > controller->m_deadZones.substickDeadZone) { + y = (-(y + 1u)) / 256u; + } else { + y = 0; + } + } else { + x /= 256; + y = (-(y + 1u)) / 256u; + } + + status[i].substickX = static_cast(x); + status[i].substickY = static_cast(y); + + x = SDL_GameControllerGetAxis(controller->m_controller, SDL_CONTROLLER_AXIS_TRIGGERLEFT); + y = SDL_GameControllerGetAxis(controller->m_controller, SDL_CONTROLLER_AXIS_TRIGGERRIGHT); + if (/*!controller->m_isGameCube && */ controller->m_deadZones.emulateTriggers) { + if (x > controller->m_deadZones.leftTriggerActivationZone) { + status[i].button |= PAD_TRIGGER_L; + } + if (y > controller->m_deadZones.rightTriggerActivationZone) { + status[i].button |= PAD_TRIGGER_R; + } + } + x /= 128; + y /= 128; + + status[i].triggerL = static_cast(x); + status[i].triggerR = static_cast(y); + + if (controller->m_hasRumble) { + rumbleSupport |= PAD_CHAN0_BIT >> i; + } + } + return rumbleSupport; +} + +void PADControlAllMotors(const uint32_t* commands) { + for (uint32_t i = 0; i < 4; ++i) { + auto controller = aurora::input::get_controller_for_player(i); + auto instance = aurora::input::get_instance_for_player(i); + if (controller == nullptr) { + continue; + } + + if (controller->m_isGameCube) { + if (commands[i] == PAD_MOTOR_STOP) { + aurora::input::controller_rumble(instance, 0, 1, 0); + } else if (commands[i] == PAD_MOTOR_RUMBLE) { + aurora::input::controller_rumble(instance, 1, 1, 0); + } else if (commands[i] == PAD_MOTOR_STOP_HARD) { + aurora::input::controller_rumble(instance, 0, 0, 0); + } + } else { + if (commands[i] == PAD_MOTOR_STOP) { + aurora::input::controller_rumble(instance, 0, 0, 1); + } else if (commands[i] == PAD_MOTOR_RUMBLE) { + aurora::input::controller_rumble(instance, 32767, 32767, 0); + } else if (commands[i] == PAD_MOTOR_STOP_HARD) { + aurora::input::controller_rumble(instance, 0, 0, 0); + } + } + } +} + +uint32_t SIProbe(int32_t chan) { + auto* const controller = aurora::input::get_controller_for_player(chan); + if (controller == nullptr) { + return SI_ERROR_NO_RESPONSE; + } + + if (controller->m_isGameCube) { + auto level = SDL_JoystickCurrentPowerLevel(SDL_GameControllerGetJoystick(controller->m_controller)); + if (level == SDL_JOYSTICK_POWER_UNKNOWN) { + return 
SI_GC_WAVEBIRD; + } + } + + return SI_GC_CONTROLLER; +} + +struct PADCLampRegion { + uint8_t minTrigger; + uint8_t maxTrigger; + int8_t minStick; + int8_t maxStick; + int8_t xyStick; + int8_t minSubstick; + int8_t maxSubstick; + int8_t xySubstick; + int8_t radStick; + int8_t radSubstick; +}; + +static constexpr PADCLampRegion ClampRegion{ + // Triggers + 30, + 180, + + // Left stick + 15, + 72, + 40, + + // Right stick + 15, + 59, + 31, + + // Stick radii + 56, + 44, +}; + +void ClampTrigger(uint8_t* trigger, uint8_t min, uint8_t max) { + if (*trigger <= min) { + *trigger = 0; + } else { + if (*trigger > max) { + *trigger = max; + } + *trigger -= min; + } +} + +void ClampCircle(int8_t* px, int8_t* py, int8_t radius, int8_t min) { + int x = *px; + int y = *py; + + if (-min < x && x < min) { + x = 0; + } else if (0 < x) { + x -= min; + } else { + x += min; + } + + if (-min < y && y < min) { + y = 0; + } else if (0 < y) { + y -= min; + } else { + y += min; + } + + int squared = x * x + y * y; + if (radius * radius < squared) { + int32_t length = static_cast(std::sqrt(squared)); + x = (x * radius) / length; + y = (y * radius) / length; + } + + *px = static_cast(x); + *py = static_cast(y); +} + +void ClampStick(int8_t* px, int8_t* py, int8_t max, int8_t xy, int8_t min) { + int32_t x = *px; + int32_t y = *py; + + int32_t signX = 0; + if (0 <= x) { + signX = 1; + } else { + signX = -1; + x = -x; + } + + int8_t signY = 0; + if (0 <= y) { + signY = 1; + } else { + signY = -1; + y = -y; + } + + if (x <= min) { + x = 0; + } else { + x -= min; + } + if (y <= min) { + y = 0; + } else { + y -= min; + } + + if (x == 0 && y == 0) { + *px = *py = 0; + return; + } + + if (xy * y <= xy * x) { + int32_t d = xy * x + (max - xy) * y; + if (xy * max < d) { + x = (xy * max * x / d); + y = (xy * max * y / d); + } + } else { + int32_t d = xy * y + (max - xy) * x; + if (xy * max < d) { + x = (xy * max * x / d); + y = (xy * max * y / d); + } + } + + *px = (signX * x); + *py = (signY * y); +} + +void PADClamp(PADStatus* status) { + for (uint32_t i = 0; i < 4; ++i) { + if (status[i].err != PAD_ERR_NONE) { + continue; + } + + ClampStick(&status[i].stickX, &status[i].stickY, ClampRegion.maxStick, ClampRegion.xyStick, ClampRegion.minStick); + ClampStick(&status[i].substickX, &status[i].substickY, ClampRegion.maxSubstick, ClampRegion.xySubstick, + ClampRegion.minSubstick); + ClampTrigger(&status[i].triggerL, ClampRegion.minTrigger, ClampRegion.maxTrigger); + ClampTrigger(&status[i].triggerR, ClampRegion.minTrigger, ClampRegion.maxTrigger); + } +} + +void PADClampCircle(PADStatus* status) { + for (uint32_t i = 0; i < 4; ++i) { + if (status[i].err != PAD_ERR_NONE) { + continue; + } + + ClampCircle(&status[i].stickX, &status[i].stickY, ClampRegion.radStick, ClampRegion.minStick); + ClampCircle(&status[i].substickX, &status[i].substickY, ClampRegion.radSubstick, ClampRegion.minSubstick); + ClampTrigger(&status[i].triggerL, ClampRegion.minTrigger, ClampRegion.maxTrigger); + ClampTrigger(&status[i].triggerR, ClampRegion.minTrigger, ClampRegion.maxTrigger); + } +} + +void PADGetVidPid(uint32_t port, uint32_t* vid, uint32_t* pid) { + *vid = 0; + *pid = 0; + auto* controller = aurora::input::get_controller_for_player(port); + if (controller == nullptr) { + return; + } + + *vid = controller->m_vid; + *pid = controller->m_pid; +} + +const char* PADGetName(uint32_t port) { + auto* controller = aurora::input::get_controller_for_player(port); + if (controller == nullptr) { + return nullptr; + } + + return 
SDL_GameControllerName(controller->m_controller); +} + +void PADSetButtonMapping(uint32_t port, PADButtonMapping mapping) { + auto* controller = aurora::input::get_controller_for_player(port); + if (controller == nullptr) { + return; + } + + auto iter = std::find_if(controller->m_mapping.begin(), controller->m_mapping.end(), + [mapping](const auto& pair) { return mapping.padButton == pair.padButton; }); + if (iter == controller->m_mapping.end()) { + return; + } + + *iter = mapping; +} + +void PADSetAllButtonMappings(uint32_t port, PADButtonMapping buttons[12]) { + for (uint32_t i = 0; i < 12; ++i) { + PADSetButtonMapping(port, buttons[i]); + } +} + +PADButtonMapping* PADGetButtonMappings(uint32_t port, uint32_t* buttonCount) { + auto* controller = aurora::input::get_controller_for_player(port); + if (controller == nullptr) { + *buttonCount = 0; + return nullptr; + } + + *buttonCount = controller->m_mapping.size(); + return controller->m_mapping.data(); +} + +void __PADWriteDeadZones(FILE* file, aurora::input::GameController& controller) { + fwrite(&controller.m_deadZones, 1, sizeof(PADDeadZones), file); +} + +void PADSerializeMappings() { + std::string basePath{aurora::g_config.configPath}; + + bool wroteGameCubeAlready = false; + for (auto& controller : aurora::input::g_GameControllers) { + if (!controller.second.m_mappingLoaded) { + __PADLoadMapping(&controller.second); + } + FILE* file = fopen(fmt::format(FMT_STRING("{}/{}_{:04X}_{:04X}.controller"), basePath, + aurora::input::controller_name(controller.second.m_index), controller.second.m_vid, + controller.second.m_pid) + .c_str(), + "wbe"); + if (file == nullptr) { + return; + } + + uint32_t magic = SBIG('CTRL'); + uint32_t version = 1; + fwrite(&magic, 1, sizeof(magic), file); + fwrite(&version, 1, sizeof(magic), file); + fwrite(&controller.second.m_isGameCube, 1, 1, file); + fseek(file, (ftell(file) + 31) & ~31, SEEK_SET); + int32_t dataStart = ftell(file); + if (!controller.second.m_isGameCube) { + __PADWriteDeadZones(file, controller.second); + fwrite(controller.second.m_mapping.data(), 1, sizeof(PADButtonMapping) * controller.second.m_mapping.size(), + file); + } else { + if (!wroteGameCubeAlready) { + for (uint32_t i = 0; i < 4; ++i) { + /* Just use the current controller's configs for this */ + __PADWriteDeadZones(file, controller.second); + fwrite(mDefaultButtons.data(), 1, sizeof(PADButtonMapping) * mDefaultButtons.size(), file); + } + fflush(file); + wroteGameCubeAlready = true; + } + uint32_t port = aurora::input::player_index(controller.second.m_index); + fseek(file, dataStart + ((sizeof(PADDeadZones) + sizeof(PADButtonMapping)) * port), SEEK_SET); + __PADWriteDeadZones(file, controller.second); + fwrite(controller.second.m_mapping.data(), 1, sizeof(PADButtonMapping) * controller.second.m_mapping.size(), + file); + } + fclose(file); + } +} + +PADDeadZones* PADGetDeadZones(uint32_t port) { + auto* controller = aurora::input::get_controller_for_player(port); + if (controller == nullptr) { + return nullptr; + } + return &controller->m_deadZones; +} + +static constexpr std::array, 12> skButtonNames = {{ + {PAD_BUTTON_LEFT, "Left"sv}, + {PAD_BUTTON_RIGHT, "Right"sv}, + {PAD_BUTTON_DOWN, "Down"sv}, + {PAD_BUTTON_UP, "Up"sv}, + {PAD_TRIGGER_Z, "Z"sv}, + {PAD_TRIGGER_R, "R"sv}, + {PAD_TRIGGER_L, "L"sv}, + {PAD_BUTTON_A, "A"sv}, + {PAD_BUTTON_B, "B"sv}, + {PAD_BUTTON_X, "X"sv}, + {PAD_BUTTON_Y, "Y"sv}, + {PAD_BUTTON_START, "Start"sv}, +}}; + +const char* PADGetButtonName(PADButton button) { + auto it = 
std::find_if(skButtonNames.begin(), skButtonNames.end(), + [&button](const auto& pair) { return button == pair.first; }); + + if (it != skButtonNames.end()) { + return it->second.data(); + } + + return nullptr; +} + +const char* PADGetNativeButtonName(uint32_t button) { + return SDL_GameControllerGetStringForButton(static_cast(button)); +} + +int32_t PADGetNativeButtonPressed(uint32_t port) { + auto* controller = aurora::input::get_controller_for_player(port); + if (controller == nullptr) { + return -1; + } + + for (uint32_t i = 0; i < SDL_CONTROLLER_BUTTON_MAX; ++i) { + if (SDL_GameControllerGetButton(controller->m_controller, static_cast(i)) != 0u) { + return i; + } + } + return -1; +} + +void PADRestoreDefaultMapping(uint32_t port) { + auto* controller = aurora::input::get_controller_for_player(port); + if (controller == nullptr) { + return; + } + controller->m_mapping = mDefaultButtons; +} + +void PADBlockInput(bool block) { gBlockPAD = block; } diff --git a/lib/input.hpp b/lib/input.hpp new file mode 100644 index 0000000..3d8fa19 --- /dev/null +++ b/lib/input.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include + +#include "SDL_gamecontroller.h" +#include "SDL_keyboard.h" +#include "SDL_keycode.h" +#include "SDL_mouse.h" + +namespace aurora::input { +Sint32 get_instance_for_player(uint32_t player) noexcept; +Sint32 add_controller(Sint32 which) noexcept; +void remove_controller(Uint32 instance) noexcept; +Sint32 player_index(Uint32 instance) noexcept; +void set_player_index(Uint32 instance, Sint32 index) noexcept; +std::string controller_name(Uint32 instance) noexcept; +bool is_gamecube(Uint32 instance) noexcept; +bool controller_has_rumble(Uint32 instance) noexcept; +void controller_rumble(uint32_t instance, uint16_t low_freq_intensity, uint16_t high_freq_intensity, + uint16_t duration_ms) noexcept; +uint32_t controller_count() noexcept; +} // namespace aurora::input diff --git a/lib/internal.hpp b/lib/internal.hpp new file mode 100644 index 0000000..43ae6b4 --- /dev/null +++ b/lib/internal.hpp @@ -0,0 +1,113 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include + +using namespace std::string_view_literals; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#ifndef SBIG +#define SBIG(q) (((q)&0x000000FF) << 24 | ((q)&0x0000FF00) << 8 | ((q)&0x00FF0000) >> 8 | ((q)&0xFF000000) >> 24) +#endif +#else +#ifndef SBIG +#define SBIG(q) (q) +#endif +#endif + +#ifdef __GNUC__ +[[noreturn]] inline __attribute__((always_inline)) void unreachable() { __builtin_unreachable(); } +#elif defined(_MSC_VER) +[[noreturn]] __forceinline void unreachable() { __assume(false); } +#else +#error Unknown compiler +#endif + +#ifndef ALIGN +#define ALIGN(x, a) (((x) + ((a)-1)) & ~((a)-1)) +#endif + +namespace aurora { +extern AuroraConfig g_config; + +struct Module { + const char* name; + explicit Module(const char* name) noexcept : name(name) {} + + template + inline void report(AuroraLogLevel level, fmt::format_string fmt, T&&... 
args) noexcept {
+    auto message = fmt::format(fmt, std::forward<T>(args)...);
+    if (g_config.logCallback != nullptr) {
+      g_config.logCallback(level, message.c_str(), message.size());
+    }
+  }
+};
+
+template <typename T>
+class ArrayRef {
+public:
+  using value_type = std::remove_cvref_t<T>;
+  using pointer = value_type*;
+  using const_pointer = const value_type*;
+  using reference = value_type&;
+  using const_reference = const value_type&;
+  using iterator = const_pointer;
+  using const_iterator = const_pointer;
+  using reverse_iterator = std::reverse_iterator<iterator>;
+  using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+  using size_type = std::size_t;
+  using difference_type = std::ptrdiff_t;
+
+  ArrayRef() = default;
+  explicit ArrayRef(const T& one) : ptr(&one), length(1) {}
+  ArrayRef(const T* data, size_t length) : ptr(data), length(length) {}
+  ArrayRef(const T* begin, const T* end) : ptr(begin), length(end - begin) {}
+  template <size_t N>
+  constexpr ArrayRef(const T (&arr)[N]) : ptr(arr), length(N) {}
+  template <size_t N>
+  constexpr ArrayRef(const std::array<T, N>& arr) : ptr(arr.data()), length(arr.size()) {}
+  ArrayRef(const std::vector<T>& vec) : ptr(vec.data()), length(vec.size()) {}
+//  template <size_t N>
+//  ArrayRef(const rstl::reserved_vector<T, N>& vec) : ptr(vec.data()), length(vec.size()) {}
+
+  const T* data() const { return ptr; }
+  size_t size() const { return length; }
+  bool empty() const { return length == 0; }
+
+  const T& front() const {
+    assert(!empty());
+    return ptr[0];
+  }
+  const T& back() const {
+    assert(!empty());
+    return ptr[length - 1];
+  }
+  const T& operator[](size_t i) const {
+    assert(i < length && "Invalid index!");
+    return ptr[i];
+  }
+
+  iterator begin() const { return ptr; }
+  iterator end() const { return ptr + length; }
+
+  reverse_iterator rbegin() const { return reverse_iterator(end()); }
+  reverse_iterator rend() const { return reverse_iterator(begin()); }
+
+  /// Disallow accidental assignment from a temporary.
+  template <typename U>
+  std::enable_if_t<std::is_same<U, T>::value, ArrayRef<T>>& operator=(U&& Temporary) = delete;
+
+  /// Disallow accidental assignment from a temporary.
+ template + std::enable_if_t::value, ArrayRef>& operator=(std::initializer_list) = delete; + +private: + const T* ptr = nullptr; + size_t length = 0; +}; +} // namespace aurora diff --git a/lib/main.cpp b/lib/main.cpp new file mode 100644 index 0000000..8dc1871 --- /dev/null +++ b/lib/main.cpp @@ -0,0 +1,6 @@ +#include +#undef main + +#include + +int main(int argc, char** argv) { return aurora_main(argc, argv); } diff --git a/lib/pad.hpp b/lib/pad.hpp new file mode 100644 index 0000000..1b27e5d --- /dev/null +++ b/lib/pad.hpp @@ -0,0 +1,5 @@ +#pragma once + +#include +#include + diff --git a/lib/webgpu/gpu.cpp b/lib/webgpu/gpu.cpp new file mode 100644 index 0000000..f7999c7 --- /dev/null +++ b/lib/webgpu/gpu.cpp @@ -0,0 +1,466 @@ +#include "gpu.hpp" + +#include + +#include "../window.hpp" +#include "../internal.hpp" + +#include +#include +#include +#include +#include + +#include "../dawn/BackendBinding.hpp" + +namespace aurora::webgpu { +static Module Log("aurora::gpu"); + +WGPUDevice g_device; +WGPUQueue g_queue; +WGPUSwapChain g_swapChain; +WGPUBackendType g_backendType; +GraphicsConfig g_graphicsConfig; +TextureWithSampler g_frameBuffer; +TextureWithSampler g_frameBufferResolved; +TextureWithSampler g_depthBuffer; + +// EFB -> XFB copy pipeline +static WGPUBindGroupLayout g_CopyBindGroupLayout; +WGPURenderPipeline g_CopyPipeline; +WGPUBindGroup g_CopyBindGroup; + +static std::unique_ptr g_Instance; +static dawn::native::Adapter g_Adapter; +static WGPUAdapterProperties g_AdapterProperties; +static std::unique_ptr g_BackendBinding; + +TextureWithSampler create_render_texture(bool multisampled) { + const WGPUExtent3D size{ + .width = g_graphicsConfig.width, + .height = g_graphicsConfig.height, + .depthOrArrayLayers = 1, + }; + const auto format = g_graphicsConfig.colorFormat; + uint32_t sampleCount = 1; + if (multisampled) { + sampleCount = g_graphicsConfig.msaaSamples; + } + const WGPUTextureDescriptor textureDescriptor{ + .label = "Render texture", + .usage = WGPUTextureUsage_RenderAttachment | WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopySrc | + WGPUTextureUsage_CopyDst, + .dimension = WGPUTextureDimension_2D, + .size = size, + .format = format, + .mipLevelCount = 1, + .sampleCount = sampleCount, + }; + auto texture = wgpuDeviceCreateTexture(g_device, &textureDescriptor); + + const WGPUTextureViewDescriptor viewDescriptor{ + .dimension = WGPUTextureViewDimension_2D, + .mipLevelCount = WGPU_MIP_LEVEL_COUNT_UNDEFINED, + .arrayLayerCount = WGPU_ARRAY_LAYER_COUNT_UNDEFINED, + }; + auto view = wgpuTextureCreateView(texture, &viewDescriptor); + + const WGPUSamplerDescriptor samplerDescriptor{ + .label = "Render sampler", + .addressModeU = WGPUAddressMode_ClampToEdge, + .addressModeV = WGPUAddressMode_ClampToEdge, + .addressModeW = WGPUAddressMode_ClampToEdge, + .magFilter = WGPUFilterMode_Linear, + .minFilter = WGPUFilterMode_Linear, + .mipmapFilter = WGPUFilterMode_Linear, + .lodMinClamp = 0.f, + .lodMaxClamp = 1000.f, + .maxAnisotropy = 1, + }; + auto sampler = wgpuDeviceCreateSampler(g_device, &samplerDescriptor); + + return { + .texture{texture}, + .view{view}, + .size = size, + .format = format, + .sampler{sampler}, + }; +} + +static TextureWithSampler create_depth_texture() { + const WGPUExtent3D size{ + .width = g_graphicsConfig.width, + .height = g_graphicsConfig.height, + .depthOrArrayLayers = 1, + }; + const auto format = g_graphicsConfig.depthFormat; + const WGPUTextureDescriptor textureDescriptor{ + .label = "Depth texture", + .usage = WGPUTextureUsage_RenderAttachment 
| WGPUTextureUsage_TextureBinding, + .dimension = WGPUTextureDimension_2D, + .size = size, + .format = format, + .mipLevelCount = 1, + .sampleCount = g_graphicsConfig.msaaSamples, + }; + auto texture = wgpuDeviceCreateTexture(g_device, &textureDescriptor); + + const WGPUTextureViewDescriptor viewDescriptor{ + .dimension = WGPUTextureViewDimension_2D, + .mipLevelCount = WGPU_MIP_LEVEL_COUNT_UNDEFINED, + .arrayLayerCount = WGPU_ARRAY_LAYER_COUNT_UNDEFINED, + }; + auto view = wgpuTextureCreateView(texture, &viewDescriptor); + + const WGPUSamplerDescriptor samplerDescriptor{ + .label = "Depth sampler", + .addressModeU = WGPUAddressMode_ClampToEdge, + .addressModeV = WGPUAddressMode_ClampToEdge, + .addressModeW = WGPUAddressMode_ClampToEdge, + .magFilter = WGPUFilterMode_Linear, + .minFilter = WGPUFilterMode_Linear, + .mipmapFilter = WGPUFilterMode_Linear, + .lodMinClamp = 0.f, + .lodMaxClamp = 1000.f, + .maxAnisotropy = 1, + }; + auto sampler = wgpuDeviceCreateSampler(g_device, &samplerDescriptor); + + return { + .texture{texture}, + .view{view}, + .size = size, + .format = format, + .sampler{sampler}, + }; +} + +void create_copy_pipeline() { + WGPUShaderModuleWGSLDescriptor sourceDescriptor{ + .chain = {.sType = WGPUSType_ShaderModuleWGSLDescriptor}, + .source = R"""( +@group(0) @binding(0) +var efb_sampler: sampler; +@group(0) @binding(1) +var efb_texture: texture_2d; + +struct VertexOutput { + @builtin(position) pos: vec4, + @location(0) uv: vec2, +}; + +var pos: array, 3> = array, 3>( + vec2(-1.0, 1.0), + vec2(-1.0, -3.0), + vec2(3.0, 1.0), +); +var uvs: array, 3> = array, 3>( + vec2(0.0, 0.0), + vec2(0.0, 2.0), + vec2(2.0, 0.0), +); + +@stage(vertex) +fn vs_main(@builtin(vertex_index) vtxIdx: u32) -> VertexOutput { + var out: VertexOutput; + out.pos = vec4(pos[vtxIdx], 0.0, 1.0); + out.uv = uvs[vtxIdx]; + return out; +} + +@stage(fragment) +fn fs_main(in: VertexOutput) -> @location(0) vec4 { + return textureSample(efb_texture, efb_sampler, in.uv); +} +)""", + }; + const WGPUShaderModuleDescriptor moduleDescriptor{ + .nextInChain = &sourceDescriptor.chain, + .label = "XFB Copy Module", + }; + auto module = wgpuDeviceCreateShaderModule(g_device, &moduleDescriptor); + const std::array colorTargets{WGPUColorTargetState{ + .format = g_graphicsConfig.colorFormat, + .writeMask = WGPUColorWriteMask_All, + }}; + const WGPUFragmentState fragmentState{ + .module = module, + .entryPoint = "fs_main", + .targetCount = colorTargets.size(), + .targets = colorTargets.data(), + }; + const std::array bindGroupLayoutEntries{ + WGPUBindGroupLayoutEntry{ + .binding = 0, + .visibility = WGPUShaderStage_Fragment, + .sampler = + WGPUSamplerBindingLayout{ + .type = WGPUSamplerBindingType_Filtering, + }, + }, + WGPUBindGroupLayoutEntry{ + .binding = 1, + .visibility = WGPUShaderStage_Fragment, + .texture = + WGPUTextureBindingLayout{ + .sampleType = WGPUTextureSampleType_Float, + .viewDimension = WGPUTextureViewDimension_2D, + }, + }, + }; + const WGPUBindGroupLayoutDescriptor bindGroupLayoutDescriptor{ + .entryCount = bindGroupLayoutEntries.size(), + .entries = bindGroupLayoutEntries.data(), + }; + g_CopyBindGroupLayout = wgpuDeviceCreateBindGroupLayout(g_device, &bindGroupLayoutDescriptor); + const WGPUPipelineLayoutDescriptor layoutDescriptor{ + .bindGroupLayoutCount = 1, + .bindGroupLayouts = &g_CopyBindGroupLayout, + }; + auto pipelineLayout = wgpuDeviceCreatePipelineLayout(g_device, &layoutDescriptor); + const WGPURenderPipelineDescriptor pipelineDescriptor{ + .layout = pipelineLayout, + .vertex = + 
WGPUVertexState{ + .module = module, + .entryPoint = "vs_main", + }, + .primitive = + WGPUPrimitiveState{ + .topology = WGPUPrimitiveTopology_TriangleList, + }, + .multisample = + WGPUMultisampleState{ + .count = 1, + .mask = UINT32_MAX, + }, + .fragment = &fragmentState, + }; + g_CopyPipeline = wgpuDeviceCreateRenderPipeline(g_device, &pipelineDescriptor); + wgpuPipelineLayoutRelease(pipelineLayout); +} + +void create_copy_bind_group() { + const std::array bindGroupEntries{ + WGPUBindGroupEntry{ + .binding = 0, + .sampler = g_graphicsConfig.msaaSamples > 1 ? g_frameBufferResolved.sampler : g_frameBuffer.sampler, + }, + WGPUBindGroupEntry{ + .binding = 1, + .textureView = g_graphicsConfig.msaaSamples > 1 ? g_frameBufferResolved.view : g_frameBuffer.view, + }, + }; + const WGPUBindGroupDescriptor bindGroupDescriptor{ + .layout = g_CopyBindGroupLayout, + .entryCount = bindGroupEntries.size(), + .entries = bindGroupEntries.data(), + }; + g_CopyBindGroup = wgpuDeviceCreateBindGroup(g_device, &bindGroupDescriptor); +} + +static void error_callback(WGPUErrorType type, char const* message, void* userdata) { + Log.report(LOG_FATAL, FMT_STRING("Dawn error {}: {}"), magic_enum::enum_name(static_cast(type)), + message); +} + +static void device_callback(WGPURequestDeviceStatus status, WGPUDevice device, char const* message, void* userdata) { + if (status == WGPURequestDeviceStatus_Success) { + g_device = device; + } else { + Log.report(LOG_WARNING, FMT_STRING("Device request failed with message: {}"), message); + } + *static_cast(userdata) = true; +} + +static WGPUBackendType to_wgpu_backend(AuroraBackend backend) { + switch (backend) { + case BACKEND_WEBGPU: + return WGPUBackendType_WebGPU; + case BACKEND_D3D12: + return WGPUBackendType_D3D12; + case BACKEND_METAL: + return WGPUBackendType_Metal; + case BACKEND_VULKAN: + return WGPUBackendType_Vulkan; + case BACKEND_OPENGL: + return WGPUBackendType_OpenGL; + case BACKEND_OPENGLES: + return WGPUBackendType_OpenGLES; + default: + return WGPUBackendType_Null; + } +} + +bool initialize(AuroraBackend auroraBackend) { + if (!g_Instance) { + Log.report(LOG_INFO, FMT_STRING("Creating Dawn instance")); + g_Instance = std::make_unique(); + } + WGPUBackendType backend = to_wgpu_backend(auroraBackend); + Log.report(LOG_INFO, FMT_STRING("Attempting to initialize {}"), magic_enum::enum_name(backend)); +#if 0 + // D3D12's debug layer is very slow + g_Instance->EnableBackendValidation(backend != WGPUBackendType::D3D12); +#endif + SDL_Window* window = window::get_sdl_window(); + if (!utils::DiscoverAdapter(g_Instance.get(), window, backend)) { + return false; + } + + { + std::vector adapters = g_Instance->GetAdapters(); + std::sort(adapters.begin(), adapters.end(), [&](const auto& a, const auto& b) { + WGPUAdapterProperties propertiesA; + WGPUAdapterProperties propertiesB; + a.GetProperties(&propertiesA); + b.GetProperties(&propertiesB); + constexpr std::array PreferredTypeOrder{ + WGPUAdapterType_DiscreteGPU, + WGPUAdapterType_IntegratedGPU, + WGPUAdapterType_CPU, + }; + const auto typeItA = std::find(PreferredTypeOrder.begin(), PreferredTypeOrder.end(), propertiesA.adapterType); + const auto typeItB = std::find(PreferredTypeOrder.begin(), PreferredTypeOrder.end(), propertiesB.adapterType); + return typeItA < typeItB; + }); + const auto adapterIt = std::find_if(adapters.begin(), adapters.end(), [=](const auto& adapter) -> bool { + WGPUAdapterProperties properties; + adapter.GetProperties(&properties); + return properties.backendType == backend; + }); + if 
(adapterIt == adapters.end()) { + return false; + } + g_Adapter = *adapterIt; + } + g_Adapter.GetProperties(&g_AdapterProperties); + g_backendType = g_AdapterProperties.backendType; + const auto backendName = magic_enum::enum_name(g_backendType); + Log.report(LOG_INFO, FMT_STRING("Graphics adapter information\n API: {}\n Device: {} ({})\n Driver: {}"), + backendName, g_AdapterProperties.name, magic_enum::enum_name(g_AdapterProperties.adapterType), + g_AdapterProperties.driverDescription); + + { + WGPUSupportedLimits supportedLimits{}; + g_Adapter.GetLimits(&supportedLimits); + const WGPURequiredLimits requiredLimits{ + .limits = + { + // Use "best" supported alignments + .minUniformBufferOffsetAlignment = supportedLimits.limits.minUniformBufferOffsetAlignment == 0 + ? static_cast(WGPU_LIMIT_U32_UNDEFINED) + : supportedLimits.limits.minUniformBufferOffsetAlignment, + .minStorageBufferOffsetAlignment = supportedLimits.limits.minStorageBufferOffsetAlignment == 0 + ? static_cast(WGPU_LIMIT_U32_UNDEFINED) + : supportedLimits.limits.minStorageBufferOffsetAlignment, + }, + }; + std::vector features; + const auto supportedFeatures = g_Adapter.GetSupportedFeatures(); + for (const auto* const feature : supportedFeatures) { + if (strcmp(feature, "texture-compression-bc") == 0) { + features.push_back(WGPUFeatureName_TextureCompressionBC); + } + } + const std::array enableToggles { + /* clang-format off */ +#if _WIN32 + "use_dxc", +#endif +#ifdef NDEBUG + "skip_validation", + "disable_robustness", +#endif + "use_user_defined_labels_in_backend", + "disable_symbol_renaming", + /* clang-format on */ + }; + const WGPUDawnTogglesDeviceDescriptor togglesDescriptor{ + .chain = {.sType = WGPUSType_DawnTogglesDeviceDescriptor}, + .forceEnabledTogglesCount = enableToggles.size(), + .forceEnabledToggles = enableToggles.data(), + }; + const WGPUDeviceDescriptor deviceDescriptor{ + .nextInChain = &togglesDescriptor.chain, + .requiredFeaturesCount = static_cast(features.size()), + .requiredFeatures = features.data(), + .requiredLimits = &requiredLimits, + }; + bool deviceCallbackReceived = false; + g_Adapter.RequestDevice(&deviceDescriptor, &device_callback, &deviceCallbackReceived); + // while (!deviceCallbackReceived) { + // TODO wgpuInstanceProcessEvents + // } + if (!g_device) { + return false; + } + wgpuDeviceSetUncapturedErrorCallback(g_device, &error_callback, nullptr); + } + wgpuDeviceSetDeviceLostCallback(g_device, nullptr, nullptr); + g_queue = wgpuDeviceGetQueue(g_device); + + g_BackendBinding = std::unique_ptr(utils::CreateBinding(g_backendType, window, g_device)); + if (!g_BackendBinding) { + return false; + } + + auto swapChainFormat = static_cast(g_BackendBinding->GetPreferredSwapChainTextureFormat()); + if (swapChainFormat == WGPUTextureFormat_RGBA8UnormSrgb) { + swapChainFormat = WGPUTextureFormat_RGBA8Unorm; + } else if (swapChainFormat == WGPUTextureFormat_BGRA8UnormSrgb) { + swapChainFormat = WGPUTextureFormat_BGRA8Unorm; + } + Log.report(LOG_INFO, FMT_STRING("Using swapchain format {}"), magic_enum::enum_name(swapChainFormat)); + { + const WGPUSwapChainDescriptor descriptor{ + .format = swapChainFormat, + .implementation = g_BackendBinding->GetSwapChainImplementation(), + }; + g_swapChain = wgpuDeviceCreateSwapChain(g_device, nullptr, &descriptor); + } + { + const auto size = window::get_window_size(); + g_graphicsConfig = GraphicsConfig{ + .width = size.fb_width, + .height = size.fb_height, + .colorFormat = swapChainFormat, + .depthFormat = WGPUTextureFormat_Depth32Float, + .msaaSamples = 
g_config.msaa, + .textureAnisotropy = g_config.maxTextureAnisotropy, + }; + create_copy_pipeline(); + resize_swapchain(size.fb_width, size.fb_height, true); + // g_windowSize = size; + } + return true; +} + +void shutdown() { + wgpuBindGroupLayoutRelease(g_CopyBindGroupLayout); + wgpuRenderPipelineRelease(g_CopyPipeline); + wgpuBindGroupRelease(g_CopyBindGroup); + g_frameBuffer = {}; + g_frameBufferResolved = {}; + g_depthBuffer = {}; + wgpuSwapChainRelease(g_swapChain); + wgpuQueueRelease(g_queue); + g_BackendBinding.reset(); + wgpuDeviceDestroy(g_device); + g_Instance.reset(); +} + +void resize_swapchain(uint32_t width, uint32_t height, bool force) { + if (!force && g_graphicsConfig.width == width && g_graphicsConfig.height == height) { + return; + } + g_graphicsConfig.width = width; + g_graphicsConfig.height = height; + wgpuSwapChainConfigure(g_swapChain, g_graphicsConfig.colorFormat, WGPUTextureUsage_RenderAttachment, width, height); + g_frameBuffer = create_render_texture(true); + g_frameBufferResolved = create_render_texture(false); + g_depthBuffer = create_depth_texture(); + create_copy_bind_group(); +} +} // namespace aurora::webgpu diff --git a/lib/webgpu/gpu.hpp b/lib/webgpu/gpu.hpp new file mode 100644 index 0000000..79b2b3a --- /dev/null +++ b/lib/webgpu/gpu.hpp @@ -0,0 +1,88 @@ +#pragma once + +#include + +#include "wgpu.hpp" + +#include +#include + +struct SDL_Window; + +namespace aurora::webgpu { +struct GraphicsConfig { + uint32_t width; + uint32_t height; + WGPUTextureFormat colorFormat; + WGPUTextureFormat depthFormat; + uint32_t msaaSamples; + uint16_t textureAnisotropy; +}; +struct TextureWithSampler { + wgpu::Texture texture; + wgpu::TextureView view; + WGPUExtent3D size; + WGPUTextureFormat format; + wgpu::Sampler sampler; + + // TextureWithSampler() = default; + // TextureWithSampler(WGPUTexture texture, WGPUTextureView view, WGPUExtent3D size, WGPUTextureFormat format, + // WGPUSampler sampler) noexcept + // : texture(texture), view(view), size(size), format(format), sampler(sampler) {} + // TextureWithSampler(const TextureWithSampler& rhs) noexcept + // : texture(rhs.texture), view(rhs.view), size(rhs.size), format(rhs.format), sampler(rhs.sampler) { + // wgpuTextureReference(texture); + // wgpuTextureViewReference(view); + // wgpuSamplerReference(sampler); + // } + // TextureWithSampler(TextureWithSampler&& rhs) noexcept + // : texture(rhs.texture), view(rhs.view), size(rhs.size), format(rhs.format), sampler(rhs.sampler) { + // rhs.texture = nullptr; + // rhs.view = nullptr; + // rhs.sampler = nullptr; + // } + // ~TextureWithSampler() { reset(); } + // TextureWithSampler& operator=(const TextureWithSampler& rhs) noexcept { + // reset(); + // texture = rhs.texture; + // view = rhs.view; + // size = rhs.size; + // format = rhs.format; + // sampler = rhs.sampler; + // wgpuTextureReference(texture); + // wgpuTextureViewReference(view); + // wgpuSamplerReference(sampler); + // return *this; + // } + // void reset() { + // if (texture != nullptr) { + // wgpuTextureRelease(texture); + // texture = nullptr; + // } + // if (view != nullptr) { + // wgpuTextureViewRelease(view); + // view = nullptr; + // } + // if (sampler != nullptr) { + // wgpuSamplerRelease(sampler); + // sampler = nullptr; + // } + // } +}; + +extern WGPUDevice g_device; +extern WGPUQueue g_queue; +extern WGPUSwapChain g_swapChain; +extern WGPUBackendType g_backendType; +extern GraphicsConfig g_graphicsConfig; +extern TextureWithSampler g_frameBuffer; +extern TextureWithSampler g_frameBufferResolved; 
+extern TextureWithSampler g_depthBuffer; +extern WGPURenderPipeline g_CopyPipeline; +extern WGPUBindGroup g_CopyBindGroup; + +bool initialize(AuroraBackend backend); +void shutdown(); +void resize_swapchain(uint32_t width, uint32_t height, bool force = false); +TextureWithSampler create_render_texture(bool multisampled); +} // namespace aurora::webgpu diff --git a/lib/webgpu/wgpu.hpp b/lib/webgpu/wgpu.hpp new file mode 100644 index 0000000..8dd2d28 --- /dev/null +++ b/lib/webgpu/wgpu.hpp @@ -0,0 +1,102 @@ +#include + +namespace wgpu { +template +struct ObjectBase { + ObjectBase() = default; + ObjectBase(CType handle) : mHandle(handle) {} + ~ObjectBase() { Reset(); } + + ObjectBase(ObjectBase const& other) : ObjectBase(other.Get()) {} + Derived& operator=(ObjectBase const& other) { + if (&other != this) { + if (mHandle) { + Derived::WGPURelease(mHandle); + } + mHandle = other.mHandle; + if (mHandle) { + Derived::WGPUReference(mHandle); + } + } + return static_cast(*this); + } + + ObjectBase(ObjectBase&& other) noexcept { + mHandle = other.mHandle; + other.mHandle = 0; + } + Derived& operator=(ObjectBase&& other) noexcept { + if (&other != this) { + if (mHandle) { + Derived::WGPURelease(mHandle); + } + mHandle = other.mHandle; + other.mHandle = nullptr; + } + return static_cast(*this); + } + + ObjectBase(std::nullptr_t) {} + Derived& operator=(std::nullptr_t) { + if (mHandle != nullptr) { + Derived::WGPURelease(mHandle); + mHandle = nullptr; + } + return static_cast(*this); + } + + bool operator==(std::nullptr_t) const { return mHandle == nullptr; } + bool operator!=(std::nullptr_t) const { return mHandle != nullptr; } + + explicit operator bool() const { return mHandle != nullptr; } + operator CType() { return mHandle; } + [[nodiscard]] CType Get() const { return mHandle; } + CType Release() { + CType result = mHandle; + mHandle = 0; + return result; + } + void Reset() { + if (mHandle) { + Derived::WGPURelease(mHandle); + mHandle = nullptr; + } + } + +protected: + CType mHandle = nullptr; +}; + +class Texture : public ObjectBase { +public: + using ObjectBase::ObjectBase; + using ObjectBase::operator=; + +private: + friend ObjectBase; + static void WGPUReference(WGPUTexture handle) { wgpuTextureReference(handle); } + static void WGPURelease(WGPUTexture handle) { wgpuTextureRelease(handle); } +}; + +class TextureView : public ObjectBase { +public: + using ObjectBase::ObjectBase; + using ObjectBase::operator=; + +private: + friend ObjectBase; + static void WGPUReference(WGPUTextureView handle) { wgpuTextureViewReference(handle); } + static void WGPURelease(WGPUTextureView handle) { wgpuTextureViewRelease(handle); } +}; + +class Sampler : public ObjectBase { +public: + using ObjectBase::ObjectBase; + using ObjectBase::operator=; + +private: + friend ObjectBase; + static void WGPUReference(WGPUSampler handle) { wgpuSamplerReference(handle); } + static void WGPURelease(WGPUSampler handle) { wgpuSamplerRelease(handle); } +}; +} // namespace wgpu diff --git a/lib/window.cpp b/lib/window.cpp new file mode 100644 index 0000000..7891684 --- /dev/null +++ b/lib/window.cpp @@ -0,0 +1,250 @@ +#include "window.hpp" + +#include "imgui.hpp" +#include "webgpu/gpu.hpp" +#include "input.hpp" +#include "internal.hpp" + +#include +#include + +namespace aurora::window { +static Module Log("aurora::window"); + +static SDL_Window* g_window; +static SDL_Renderer* g_renderer; +static AuroraWindowSize g_windowSize; +static std::vector g_events; + +static inline bool operator==(const AuroraWindowSize& lhs, const 
AuroraWindowSize& rhs) { + return lhs.width == rhs.width && lhs.height == rhs.height && lhs.fb_width == rhs.fb_width && + lhs.fb_height == rhs.fb_height && lhs.scale == rhs.scale; +} + +static void resize_swapchain(bool force) noexcept { + const auto size = get_window_size(); + if (!force && size == g_windowSize) { + return; + } + if (size.scale != g_windowSize.scale) { + if (g_windowSize.scale > 0.f) { + Log.report(LOG_INFO, FMT_STRING("Display scale changed to {}"), size.scale); + } + } + g_windowSize = size; + webgpu::resize_swapchain(size.fb_width, size.fb_height); +} + +const AuroraEvent* poll_events() { + g_events.clear(); + + SDL_Event event; + while (SDL_PollEvent(&event) != 0) { + imgui::process_event(event); + + switch (event.type) { + case SDL_WINDOWEVENT: { + switch (event.window.event) { + case SDL_WINDOWEVENT_MINIMIZED: { + // Android/iOS: Application backgrounded + g_events.push_back(AuroraEvent{ + .type = AURORA_PAUSED, + }); + break; + } + case SDL_WINDOWEVENT_RESTORED: { + // Android/iOS: Application focused + g_events.push_back(AuroraEvent{ + .type = AURORA_UNPAUSED, + }); + break; + } + case SDL_WINDOWEVENT_SIZE_CHANGED: { + resize_swapchain(false); + g_events.push_back(AuroraEvent{ + .type = AURORA_WINDOW_RESIZED, + .windowSize = get_window_size(), + }); + break; + } + } + break; + } + case SDL_CONTROLLERDEVICEADDED: { + auto instance = input::add_controller(event.cdevice.which); + g_events.push_back(AuroraEvent{ + .type = AURORA_CONTROLLER_ADDED, + .controller = instance, + }); + break; + } + case SDL_CONTROLLERDEVICEREMOVED: { + input::remove_controller(event.cdevice.which); + g_events.push_back(AuroraEvent{ + .type = AURORA_CONTROLLER_REMOVED, + .controller = event.cdevice.which, + }); + break; + } + case SDL_QUIT: + g_events.push_back(AuroraEvent{ + .type = AURORA_EXIT, + }); + } + } + g_events.push_back(AuroraEvent{ + .type = AURORA_NONE, + }); + return g_events.data(); +} + +static void set_window_icon() noexcept { + if (g_config.iconRGBA8 == nullptr) { + return; + } + auto* iconSurface = SDL_CreateRGBSurfaceFrom(g_config.iconRGBA8, g_config.iconWidth, g_config.iconHeight, 32, + 4 * g_config.iconWidth, 0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000); + if (iconSurface == nullptr) { + Log.report(LOG_FATAL, FMT_STRING("Failed to create icon surface: {}"), SDL_GetError()); + unreachable(); + } + SDL_SetWindowIcon(g_window, iconSurface); + SDL_FreeSurface(iconSurface); +} + +bool create_window(AuroraBackend backend) { + Uint32 flags = SDL_WINDOW_ALLOW_HIGHDPI; +#if TARGET_OS_IOS || TARGET_OS_TV + flags |= SDL_WINDOW_FULLSCREEN; +#else + flags |= SDL_WINDOW_HIDDEN | SDL_WINDOW_RESIZABLE; + if (g_config.startFullscreen) { + flags |= SDL_WINDOW_FULLSCREEN; + } +#endif + switch (backend) { +#ifdef DAWN_ENABLE_BACKEND_VULKAN + case BACKEND_VULKAN: + flags |= SDL_WINDOW_VULKAN; + break; +#endif +#ifdef DAWN_ENABLE_BACKEND_METAL + case BACKEND_METAL: + flags |= SDL_WINDOW_METAL; + break; +#endif +#ifdef DAWN_ENABLE_BACKEND_OPENGL + case BACKEND_OPENGL: + case BACKEND_OPENGLES: + flags |= SDL_WINDOW_OPENGL; + break; +#endif + default: + break; + } +#ifdef __SWITCH__ + uint32_t width = 1280; + uint32_t height = 720; +#else + uint32_t width = g_config.windowWidth; + uint32_t height = g_config.windowHeight; + if (width == 0 || height == 0) { + width = 1280; + height = 960; + } +#endif + g_window = SDL_CreateWindow(g_config.appName, SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, width, height, flags); + if (g_window == nullptr) { + return false; + } + set_window_icon(); + 
return true; +} + +bool create_renderer() { + if (g_window == nullptr) { + return false; + } + const auto flags = SDL_RENDERER_PRESENTVSYNC; + g_renderer = SDL_CreateRenderer(g_window, -1, flags | SDL_RENDERER_ACCELERATED); + if (g_renderer == nullptr) { + // Attempt fallback to SW renderer + g_renderer = SDL_CreateRenderer(g_window, -1, flags); + } + return g_renderer != nullptr; +} + +void destroy_window() { + if (g_renderer != nullptr) { + SDL_DestroyRenderer(g_renderer); + g_renderer = nullptr; + } + if (g_window != nullptr) { + SDL_DestroyWindow(g_window); + g_window = nullptr; + } +} + +void show_window() { + if (g_window != nullptr) { + SDL_ShowWindow(g_window); + } +} + +bool initialize() { + if (SDL_Init(SDL_INIT_EVERYTHING) < 0) { + Log.report(LOG_FATAL, FMT_STRING("Error initializing SDL: {}"), SDL_GetError()); + unreachable(); + } + +#if !defined(_WIN32) && !defined(__APPLE__) + SDL_SetHint(SDL_HINT_VIDEO_X11_NET_WM_BYPASS_COMPOSITOR, "0"); +#endif +#if SDL_VERSION_ATLEAST(2, 0, 18) + SDL_SetHint(SDL_HINT_SCREENSAVER_INHIBIT_ACTIVITY_NAME, g_config.appName); +#endif +#ifdef SDL_HINT_JOYSTICK_GAMECUBE_RUMBLE_BRAKE + SDL_SetHint(SDL_HINT_JOYSTICK_GAMECUBE_RUMBLE_BRAKE, "1"); +#endif + + SDL_DisableScreenSaver(); + /* TODO: Make this an option rather than hard coding it */ + SDL_SetHint(SDL_HINT_JOYSTICK_ALLOW_BACKGROUND_EVENTS, "1"); + + return true; +} + +void shutdown() { + destroy_window(); + SDL_EnableScreenSaver(); + SDL_Quit(); +} + +AuroraWindowSize get_window_size() { + int width, height, fb_w, fb_h; + SDL_GetWindowSize(g_window, &width, &height); +#if DAWN_ENABLE_BACKEND_METAL + SDL_Metal_GetDrawableSize(g_window, &fb_w, &fb_h); +#else + SDL_GL_GetDrawableSize(g_window, &fb_w, &fb_h); +#endif + float scale = static_cast(fb_w) / static_cast(width); +#ifndef __APPLE__ + if (SDL_GetDisplayDPI(SDL_GetWindowDisplayIndex(g_window), nullptr, &scale, nullptr) == 0) { + scale /= 96.f; + } +#endif + return { + .width = static_cast(width), + .height = static_cast(height), + .fb_width = static_cast(fb_w), + .fb_height = static_cast(fb_h), + .scale = scale, + }; +} + +SDL_Window* get_sdl_window() { return g_window; } + +SDL_Renderer* get_sdl_renderer() { return g_renderer; } + +} // namespace aurora::window diff --git a/lib/window.hpp b/lib/window.hpp new file mode 100644 index 0000000..553405d --- /dev/null +++ b/lib/window.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include +#include + +struct SDL_Window; +struct SDL_Renderer; + +namespace aurora::window { +bool initialize(); +void shutdown(); +bool create_window(AuroraBackend backend); +bool create_renderer(); +void destroy_window(); +void show_window(); +AuroraWindowSize get_window_size(); +const AuroraEvent* poll_events(); +SDL_Window* get_sdl_window(); +SDL_Renderer* get_sdl_renderer(); +}; // namespace aurora::window
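// Illustrative sketch (not part of the diff above) of how the window and WebGPU
// modules are driven, using only functions declared in lib/window.hpp,
// lib/webgpu/gpu.hpp and lib/internal.hpp. The backend choice and the fields
// written to aurora::g_config here are assumptions made for the example; the
// library's real entry points live elsewhere in the repository.
#include "internal.hpp"
#include "webgpu/gpu.hpp"
#include "window.hpp"

int example_main() {
  aurora::g_config.appName = "aurora example"; // used for the SDL window title and hints
  aurora::g_config.msaa = 1;                   // consumed by webgpu::initialize via GraphicsConfig

  if (!aurora::window::initialize()) { // SDL_Init + global hints
    return 1;
  }
  constexpr AuroraBackend backend = BACKEND_VULKAN; // any compiled-in backend
  if (!aurora::window::create_window(backend) || !aurora::webgpu::initialize(backend)) {
    return 1;
  }
  aurora::window::show_window();

  bool running = true;
  while (running) {
    // poll_events() returns an array terminated by an AURORA_NONE entry.
    for (const AuroraEvent* ev = aurora::window::poll_events(); ev->type != AURORA_NONE; ++ev) {
      if (ev->type == AURORA_EXIT) {
        running = false;
      }
    }
    // ... record and submit a frame against aurora::webgpu::g_device here ...
  }

  aurora::webgpu::shutdown();
  aurora::window::shutdown();
  return 0;
}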