From 234f474679141d16019d5b4eaf74c64b23a762b8 Mon Sep 17 00:00:00 2001 From: Luke Street Date: Tue, 31 Jan 2023 00:41:48 -0500 Subject: [PATCH] Add frank.py; configure.py --frank (non-matching) Former-commit-id: 0f3111f58a125e9bb31f865e2c0df62aede07a7d --- .gitignore | 2 + configure.py | 31 ++++- src/Dolphin/GBA/GBAXfer.c | 2 +- src/Dolphin/ai.c | 4 +- src/Dolphin/ar/ar.c | 6 +- src/Dolphin/card/CARDBios.c | 6 +- src/Dolphin/card/CARDCheck.c | 4 +- src/Dolphin/dsp/dsp.c | 2 +- src/Dolphin/dvd/dvdfatal.c | 2 +- src/Dolphin/dvd/dvdfs.c | 2 +- src/Dolphin/os/OSError.c | 2 +- src/Dolphin/os/OSMemory.c | 2 +- src/Dolphin/os/OSResetSW.c | 2 +- src/Dolphin/os/OSRtc.c | 10 +- src/Dolphin/os/OSThread.c | 8 +- src/Dolphin/pad/pad.c | 4 +- tools/frank.py | 214 +++++++++++++++++++++++++++++++++++ 17 files changed, 271 insertions(+), 32 deletions(-) create mode 100644 tools/frank.py diff --git a/.gitignore b/.gitignore index 756bfa87..4fa9ca45 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,5 @@ include/lmgr326b.dll .idea/ versions/ build.ninja +.ninja_deps +.ninja_log diff --git a/configure.py b/configure.py index 9cff61b1..6422b078 100755 --- a/configure.py +++ b/configure.py @@ -1110,6 +1110,12 @@ if __name__ == "__main__": default=Path("build"), help="base build directory", ) + parser.add_argument( + "--frank", + dest="frank", + action="store_true", + help="use full frank.py instead of franklite.py (non-matching)", + ) args = parser.parse_args() # On Windows, we need this to use && in commands @@ -1152,11 +1158,13 @@ if __name__ == "__main__": else: dkp_path = Path("/opt/devkitpro/devkitPPC") - cflags_base = f"-proc gekko -nodefaults -Cpp_exceptions off -RTTI off -fp hard -fp_contract on -O4,p -maxerrors 1 -enum int -inline auto -str reuse -nosyspath -MMD -DPRIME1 -DVERSION={version_num} -DNONMATCHING=0 -i include -i libc" + cflags_base = f"-proc gekko -nodefaults -Cpp_exceptions off -RTTI off -fp hard -fp_contract on -O4,p -maxerrors 1 -enum int -inline auto -str reuse -nosyspath -DPRIME1 -DVERSION={version_num} -DNONMATCHING=0 -i include -i libc" if args.debug: cflags_base += " -sym on -D_DEBUG" else: cflags_base += " -DNDEBUG" + if args.frank: + cflags_base += " -DFULL_FRANK" n.variable("cflags_base", cflags_base) n.variable( "cflags_retro", @@ -1249,11 +1257,20 @@ if __name__ == "__main__": compiler_path = args.compilers / "$mw_version" mwcc = compiler_path / "mwcceppc.exe" mwld = compiler_path / "mwldeppc.exe" + frank = tools_path / "frank.py" franklite = tools_path / "franklite.py" gnu_as = dkp_path / "bin" / f"powerpc-eabi-as{exe}" - mwcc_cmd = f"{chain}{wine}{mwcc} $cflags -c $in -o $basedir" - mwcc_frank_cmd = f"{mwcc_cmd} && $python {franklite} $out $out" + mwcc_cmd = f"{chain}{wine}{mwcc} $cflags -MMD -c $in -o $basedir" + if args.frank: + profile_mwcc = args.compilers / "1.2.5e" / "mwcceppc.exe" + mwcc_frank_cmd = ( + f"{chain}{wine}{mwcc} $cflags -MMD -c $in -o $basedir" + + f" && {wine}{profile_mwcc} $cflags -c $in -o $out.profile" + + f" && $python {frank} $out $out.profile $out" + ) + else: + mwcc_frank_cmd = f"{mwcc_cmd} && $python {franklite} $out $out" mwld_cmd = f"{wine}{mwld} $ldflags -o $out @$out.rsp" as_cmd = ( f"{chain}{gnu_as} $asflags -o $out $in -MD $out.d" @@ -1379,10 +1396,15 @@ if __name__ == "__main__": if completed is None: print(f"Mark as incomplete: {c_file}") rule = "mwcc" + implicit = [] if mw_version == "1.2.5e": mw_version = "1.2.5" if no_frank is False: rule = "mwcc_frank" + if args.frank: + implicit.append(frank) + else: + implicit.append(franklite) n.build( outputs=path(build_src_path / f"{object}.o"), rule=rule, @@ -1393,6 +1415,7 @@ if __name__ == "__main__": "basedir": os.path.dirname(build_src_path / f"{object}"), "basefile": path(build_src_path / f"{object}"), }, + implicit=path(implicit), ) if lib["host"]: n.build( @@ -1540,7 +1563,7 @@ if __name__ == "__main__": n.build( outputs="build.ninja", rule="configure", - implicit=["configure.py", "tools/ninja_syntax.py"], + implicit=path(["configure.py", tools_path / "ninja_syntax.py"]), ) n.newline() diff --git a/src/Dolphin/GBA/GBAXfer.c b/src/Dolphin/GBA/GBAXfer.c index 9fc6c25c..c88ffd1d 100644 --- a/src/Dolphin/GBA/GBAXfer.c +++ b/src/Dolphin/GBA/GBAXfer.c @@ -39,7 +39,7 @@ void __GBAHandler(s32 chan, u32 sr, OSContext* context) { void __GBASyncCallback(s32 chan, s32 ret) { OSWakeupThread(&__GBA[chan].thread_queue); } -#if NONMATCHING +#ifdef FULL_FRANK /* This actually does match, but has an epilogue swap */ s32 __GBASync(s32 chan) { GBA* gba; diff --git a/src/Dolphin/ai.c b/src/Dolphin/ai.c index 62e350a2..fb39c789 100644 --- a/src/Dolphin/ai.c +++ b/src/Dolphin/ai.c @@ -22,7 +22,7 @@ void __AIDHandler(s16 interrupt, OSContext* context); void __AICallbackStackSwitch(register AIDCallback cb); void __AI_SRC_INIT(void); -#if NONMATCHING +#ifdef FULL_FRANK AIDCallback AIRegisterDMACallback(AIDCallback callback) { s32 oldInts; AIDCallback ret; @@ -79,7 +79,7 @@ u32 AIGetDMAStartAddr(void) { return (u32)((__DSPRegs[24] & 0x03ff) << 16) | (__DSPRegs[25] & 0xffe0); } -#if NONMATCHING +#ifdef FULL_FRANK AISCallback AIRegisterStreamCallback(AISCallback callback) { AISCallback ret; s32 oldInts; diff --git a/src/Dolphin/ar/ar.c b/src/Dolphin/ar/ar.c index e5a487f9..1d59b303 100644 --- a/src/Dolphin/ar/ar.c +++ b/src/Dolphin/ar/ar.c @@ -21,7 +21,7 @@ static void __ARHandler(__OSInterrupt interrupt, OSContext* context); static void __ARChecksize(void); static void __ARClearArea(u32 start_addr, u32 length); -#if NONMATCHING +#ifdef FULL_FRANK ARCallback ARRegisterDMACallback(ARCallback callback) { ARCallback oldCb; BOOL enabled; @@ -60,7 +60,7 @@ asm ARCallback ARRegisterDMACallback(ARCallback callback) { #pragma pop /* clang-format on */ #endif -#if NONMATCHING +#ifdef FULL_FRANK u32 ARGetDMAStatus() { BOOL enabled; u32 val; @@ -110,7 +110,7 @@ void ARStartDMA(u32 type, u32 mainmem_addr, u32 aram_addr, u32 length) { OSRestoreInterrupts(enabled); } -#if NONMATCHING +#ifdef FULL_FRANK u32 ARAlloc(u32 length) { u32 tmp; BOOL enabled; diff --git a/src/Dolphin/card/CARDBios.c b/src/Dolphin/card/CARDBios.c index 129960a1..0a1fb45b 100644 --- a/src/Dolphin/card/CARDBios.c +++ b/src/Dolphin/card/CARDBios.c @@ -397,7 +397,7 @@ s32 __CARDWritePage(s32 chan, CARDCallback callback) { return result; } -#if NONMATCHING +#ifdef FULL_FRANK /* TODO: Needs frank fix for disconnected stack epilogue */ s32 __CARDEraseSector(s32 chan, u32 addr, CARDCallback callback) { CARDControl* card; @@ -557,7 +557,7 @@ s32 __CARDGetControlBlock(s32 chan, CARDControl** pcard) { return result; } -#if NONMATCHING +#ifdef FULL_FRANK /* TODO: Needs frank fix for disconnected stack epilogue */ s32 __CARDPutControlBlock(CARDControl* card, s32 result) { BOOL enabled; @@ -619,7 +619,7 @@ s32 CARDGetResultCode(s32 chan) { return card->result; } -#if NONMATCHING +#ifdef FULL_FRANK s32 CARDFreeBlocks(s32 chan, s32* byteNotUsed, s32* filesNotUsed) { CARDControl* card; s32 result; diff --git a/src/Dolphin/card/CARDCheck.c b/src/Dolphin/card/CARDCheck.c index b926a8ca..2253d556 100644 --- a/src/Dolphin/card/CARDCheck.c +++ b/src/Dolphin/card/CARDCheck.c @@ -67,7 +67,7 @@ static s32 VerifyID(CARDControl* card) { return CARD_RESULT_READY; } -#if NONMATCHING +#ifdef FULL_FRANK static s32 VerifyDir(CARDControl* card, int* outCurrent) { CARDDir* dir[2]; CARDDirCheck* check[2]; @@ -279,7 +279,7 @@ lbl_803BB228: #pragma pop #endif -#if NONMATCHING +#ifdef FULL_FRANK static s32 VerifyFAT(CARDControl* card, int* outCurrent) { u16* fat[2]; u16* fatp; diff --git a/src/Dolphin/dsp/dsp.c b/src/Dolphin/dsp/dsp.c index ebc26293..cab7a390 100644 --- a/src/Dolphin/dsp/dsp.c +++ b/src/Dolphin/dsp/dsp.c @@ -84,7 +84,7 @@ void DSPHalt(void) { u32 DSPGetDMAStatus(void) { return __DSPRegs[5] & 0x200; } -#if NONMATCHING +#ifdef FULL_FRANK DSPTaskInfo* DSPAddTask(DSPTaskInfo* task) { u32 oldInt; oldInt = OSDisableInterrupts(); diff --git a/src/Dolphin/dvd/dvdfatal.c b/src/Dolphin/dvd/dvdfatal.c index 1e70c155..ea8093c2 100644 --- a/src/Dolphin/dvd/dvdfatal.c +++ b/src/Dolphin/dvd/dvdfatal.c @@ -73,7 +73,7 @@ static void ShowMessage(void) { OSFatal(fg, bg, message); } -#if NONMATCHING +#ifdef FULL_FRANK BOOL DVDSetAutoFatalMessaging(BOOL enable) { BOOL enabled; BOOL prev; diff --git a/src/Dolphin/dvd/dvdfs.c b/src/Dolphin/dvd/dvdfs.c index 6bca6d6f..d0dc27cc 100644 --- a/src/Dolphin/dvd/dvdfs.c +++ b/src/Dolphin/dvd/dvdfs.c @@ -193,7 +193,7 @@ BOOL DVDOpen(char* fileName, DVDFileInfo* fileInfo) { return TRUE; } -#if NONMATCHING +#ifdef FULL_FRANK BOOL DVDClose(DVDFileInfo* fileInfo) { DVDCancel(&(fileInfo->cb)); return TRUE; diff --git a/src/Dolphin/os/OSError.c b/src/Dolphin/os/OSError.c index 7ab0ec5d..3859a62d 100644 --- a/src/Dolphin/os/OSError.c +++ b/src/Dolphin/os/OSError.c @@ -40,7 +40,7 @@ __declspec(weak) void OSPanic(const char* file, int line, const char* msg, ...) PPCHalt(); } -#if NONMATCHING +#ifdef FULL_FRANK OSErrorHandler OSSetErrorHandler(OSError error, OSErrorHandler handler) { OSErrorHandler oldHandler; BOOL enabled; diff --git a/src/Dolphin/os/OSMemory.c b/src/Dolphin/os/OSMemory.c index 996d3479..8f76aa89 100644 --- a/src/Dolphin/os/OSMemory.c +++ b/src/Dolphin/os/OSMemory.c @@ -13,7 +13,7 @@ static OSResetFunctionInfo ResetFunctionInfo = { 127, }; -#if NONMATCHING +#ifdef FULL_FRANK static BOOL OnReset(BOOL final) { if (final != FALSE) { __MEMRegs[8] = 0xFF; diff --git a/src/Dolphin/os/OSResetSW.c b/src/Dolphin/os/OSResetSW.c index 8abb8b87..97522133 100644 --- a/src/Dolphin/os/OSResetSW.c +++ b/src/Dolphin/os/OSResetSW.c @@ -34,7 +34,7 @@ void __OSResetSWInterruptHandler(__OSInterrupt interrupt, OSContext* context) { __PIRegs[0] = 2; } -#if NONMATCHING +#ifdef FULL_FRANK BOOL OSGetResetButtonState(void) { BOOL enabled; BOOL state; diff --git a/src/Dolphin/os/OSRtc.c b/src/Dolphin/os/OSRtc.c index 4fff32a8..be146706 100644 --- a/src/Dolphin/os/OSRtc.c +++ b/src/Dolphin/os/OSRtc.c @@ -172,7 +172,7 @@ static void* LockSram(u32 offset) { return Scb.sram + offset; } -#if NONMATCHING +#ifdef FULL_FRANK OSSram* __OSLockSram() { return LockSram(0); } #else /* clang-format off */ @@ -278,7 +278,7 @@ BOOL __OSReadROM(void* buffer, s32 length, s32 offset) { } inline OSSram* __OSLockSramHACK() { return LockSram(0); } -#if NONMATCHING +#ifdef FULL_FRANK u32 OSGetSoundMode() { OSSram* sram; u32 mode; @@ -352,7 +352,7 @@ void OSSetSoundMode(u32 mode) { __OSUnlockSram(TRUE); } -#if NONMATCHING +#ifdef FULL_FRANK u32 OSGetProgressiveMode() { OSSram* sram; u32 mode; @@ -420,7 +420,7 @@ void OSSetProgressiveMode(u32 mode) { __OSUnlockSram(TRUE); } -#if NONMATCHING +#ifdef FULL_FRANK u8 OSGetLanguage() { OSSram* sram; u8 language; @@ -471,7 +471,7 @@ lbl_8038428C: /* clang-format on */ #endif -#if NONMATCHING +#ifdef FULL_FRANK u16 OSGetWirelessID(s32 channel) { OSSramEx* sram; u16 id; diff --git a/src/Dolphin/os/OSThread.c b/src/Dolphin/os/OSThread.c index dfb392f1..07afd9b0 100644 --- a/src/Dolphin/os/OSThread.c +++ b/src/Dolphin/os/OSThread.c @@ -129,7 +129,7 @@ void OSInitThreadQueue(OSThreadQueue* queue) { queue->head = queue->tail = NULL; OSThread* OSGetCurrentThread() { return __OSCurrentThread; } -#if NONMATCHING +#ifdef FULL_FRANK /* Code matches, stack epilogue bug*/ s32 OSDisableScheduler() { BOOL enabled; @@ -168,7 +168,7 @@ asm s32 OSDisableScheduler() { #pragma pop #endif -#if NONMATCHING +#ifdef FULL_FRANK /* Code matches, stack epilogue bug*/ s32 OSEnableScheduler() { BOOL enabled; @@ -405,7 +405,7 @@ void OSCancelThread(OSThread* thread) { return; } -#if NONMATCHING +#ifdef FULL_FRANK /* Code matches, stack epilogue bug*/ s32 OSResumeThread(OSThread* thread) { BOOL enabled; @@ -636,7 +636,7 @@ lbl_80384F74: #endif -#if NONMATCHING +#ifdef FULL_FRANK /* Code matches, stack epilogue bug*/ s32 OSSuspendThread(OSThread* thread) { BOOL enabled; diff --git a/src/Dolphin/pad/pad.c b/src/Dolphin/pad/pad.c index 4cbabe4d..3ccf7d15 100644 --- a/src/Dolphin/pad/pad.c +++ b/src/Dolphin/pad/pad.c @@ -730,7 +730,7 @@ static void SamplingHandler(__OSInterrupt interrupt, OSContext* context) { } } -#if NONMATCHING +#ifdef FULL_FRANK PADSamplingCallback PADSetSamplingCallback(PADSamplingCallback callback) { PADSamplingCallback prev; @@ -777,7 +777,7 @@ lbl_803875F0: #pragma pop #endif -#if NONMATCHING +#ifdef FULL_FRANK BOOL __PADDisableRecalibration(BOOL disable) { BOOL enabled; BOOL prev; diff --git a/tools/frank.py b/tools/frank.py new file mode 100644 index 00000000..442a3576 --- /dev/null +++ b/tools/frank.py @@ -0,0 +1,214 @@ +#! /usr/bin/env python3 + +# Written by Ethan Roseman (ethteck) +# MIT License +# Copyright 2021 + +# Modified by EpochFlame + +import argparse +import sys + +# Byte sequence that marks code size +CODESIZE_MAGIC = b"\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00\x00\x34" +BLR_BYTE_SEQ = b"\x4E\x80\x00\x20" +MTLR_BYTE_SEQ = b"\x7C\x08\x03\xA6" +PROFILE_EXTRA_BYTES = b"\x48\x00\x00\x01\x60\x00\x00\x00" + +LWZ_BYTE = b"\x80" + +# Byte sequence array for branches to link register +BLR_BYTE_SEQ_ARRAY = [BLR_BYTE_SEQ, +b"\x4D\x80\x00\x20", b"\x4D\x80\x00\x21", b"\x4C\x81\x00\x20", b"\x4C\x81\x00\x21", +b"\x4D\x82\x00\x20", b"\x4D\x82\x00\x21", b"\x4C\x80\x00\x20", b"\x4C\x80\x00\x21", +b"\x4D\x81\x00\x20", b"\x4D\x81\x00\x21", b"\x4C\x80\x00\x20", b"\x4C\x80\x00\x21", +b"\x4C\x82\x00\x20", b"\x4C\x82\x00\x21", b"\x4C\x81\x00\x20", b"\x4C\x81\x00\x21", +b"\x4D\x83\x00\x20", b"\x4D\x83\x00\x21", b"\x4C\x83\x00\x20", b"\x4C\x83\x00\x21", +b"\x4D\x83\x00\x20", b"\x4D\x83\x00\x21", b"\x4C\x83\x00\x20", b"\x4C\x83\x00\x21"] + +# Example invocation: ./frank.py vanilla.o profile.o output.o +parser = argparse.ArgumentParser() +parser.add_argument("vanilla", help="Path to the vanilla object", type=argparse.FileType('rb')) +parser.add_argument("profile", help="Path to the profile object", type=argparse.FileType('rb')) +parser.add_argument("target", help="Path to the target object (to write)") + +args = parser.parse_args() + +# Read contents into bytearrays and close files +vanilla_bytes = args.vanilla.read() +args.vanilla.close() + +# If the file contains no code, the codesize magic will not be found. +# The vanilla object requires no modification. +code_size_magic_idx = vanilla_bytes.find(CODESIZE_MAGIC) +if code_size_magic_idx == -1: + with open(args.target, "wb") as f: + f.write(vanilla_bytes) + sys.exit(0) + +profile_bytes = args.profile.read() +args.profile.close() + +# Peephole rescheduling +# +# This is the pattern we will detect: +# (A) lwz <--. .--> (A) li +# (B) li <---\-' bl +# \ nop +# '---> (B) lwz +# +# If the profiled schedule swaps the +# instructions around the bl/nop, we +# instead use the vanilla schedule. +# +idx = 8 +shift = 0 # difference between vanilla and profile code, due to bl/nops +while idx < len(profile_bytes) - 16: + # Find next epilogue + epi_pos = profile_bytes.find(PROFILE_EXTRA_BYTES, idx) + if epi_pos == -1: + break # break while loop when no targets remain + if epi_pos % 4 != 0: # check 4-byte alignment + idx += 4 + continue + + v_pos = epi_pos - shift + shift += 8 + + vanilla_inst_a = vanilla_bytes[v_pos-4:v_pos] + vanilla_inst_b = vanilla_bytes[v_pos:v_pos+4] + vanilla_inst_c = vanilla_bytes[v_pos+4:v_pos+8] + profile_inst_a = profile_bytes[epi_pos-4:epi_pos] + profile_inst_b = profile_bytes[epi_pos+8:epi_pos+12] + profile_inst_c = profile_bytes[epi_pos+12:epi_pos+16] + + opcode_a = vanilla_inst_a[0] >> 2 + opcode_b = vanilla_inst_b[0] >> 2 + opcode_c = vanilla_inst_c[0] >> 2 + + LWZ = 0x80 >> 2 + LFS = 0xC0 >> 2 + ADDI = 0x38 >> 2 + LI = ADDI # an LI instruction is just an ADDI with RA=0 + LMW = 0xB8 >> 2 + FDIVS = 0xEC >> 2 + + if opcode_a == LWZ and \ + opcode_b in [LI, LFS, FDIVS] and \ + vanilla_inst_a == profile_inst_b and \ + vanilla_inst_b == profile_inst_a and \ + vanilla_inst_c == profile_inst_c and \ + opcode_c != ADDI: # <- don't reorder if at the very end of the epilogue + + # Swap instructions (A) and (B) + profile_bytes = profile_bytes[:epi_pos-4] \ + + vanilla_inst_a \ + + PROFILE_EXTRA_BYTES \ + + vanilla_inst_b \ + + profile_bytes[epi_pos+12:] + + # Similar reordering for lwz/lmw, except both insns follow the bl/nop + elif opcode_b == LWZ and \ + opcode_c == LMW and \ + vanilla_inst_b == profile_inst_c and \ + vanilla_inst_c == profile_inst_b: + + profile_bytes = profile_bytes[:epi_pos+8] \ + + vanilla_inst_b \ + + vanilla_inst_c \ + + profile_bytes[epi_pos+16:] + + idx = epi_pos + 8 + +# Remove byte sequence +stripped_bytes = profile_bytes.replace(PROFILE_EXTRA_BYTES, b"") + +# Find end of code sections in vanilla and stripped bytes +code_size_offset = code_size_magic_idx + len(CODESIZE_MAGIC) +code_size_bytes = vanilla_bytes[code_size_offset:code_size_offset+4] +code_size = int.from_bytes(code_size_bytes, byteorder='big') + +eoc_offset = 0x34 + code_size + +# Break if the eoc is not found +assert(eoc_offset != len(vanilla_bytes)) + +# Replace 0x34 - eoc in vanilla with bytes from stripped +final_bytes = vanilla_bytes[:0x34] + stripped_bytes[0x34:eoc_offset] + vanilla_bytes[eoc_offset:] + +# Fix branches to link register +for seq in BLR_BYTE_SEQ_ARRAY: + idx = 0 + + while idx < len(vanilla_bytes): + found_pos = vanilla_bytes.find(seq, idx) + if found_pos == -1: + break # break while loop when no targets remain + if found_pos % 4 != 0: # check 4-byte alignment + idx += 4 + continue + final_bytes = final_bytes[:found_pos] + vanilla_bytes[found_pos:found_pos+4] + final_bytes[found_pos+4:] + idx = found_pos + len(seq) + +# Reunify mtlr/blr instructions, shifting intermediary instructions up +idx = 0 + +while idx < len(final_bytes): + # Find mtlr position + mtlr_found_pos = final_bytes.find(MTLR_BYTE_SEQ, idx) + if mtlr_found_pos == -1: + break # break while loop when no targets remain + if mtlr_found_pos % 4 != 0: # check 4-byte alignment + idx += 4 + continue + # Find paired blr position + blr_found_pos = final_bytes.find(BLR_BYTE_SEQ, mtlr_found_pos) + if blr_found_pos == -1: + break # break while loop when no targets remain + if blr_found_pos % 4 != 0: # check 4-byte alignment + idx += 4 + continue + if mtlr_found_pos + 4 == blr_found_pos: + idx += 4 + continue # continue if mtlr is followed directly by blr + + final_bytes = final_bytes[:mtlr_found_pos] + final_bytes[mtlr_found_pos+4:blr_found_pos] + final_bytes[mtlr_found_pos:mtlr_found_pos+4] + final_bytes[blr_found_pos:] + idx = mtlr_found_pos + len(MTLR_BYTE_SEQ) + +# Reorder lmw/lwz/lfd instructions, if needed (@Altafen) +# Specifically, if this sequence shows up in the stripped profiler code: "LMW, LWZ, LFD*" +# And this sequence shows up in the vanilla code: "LWZ, LFD*, LMW" +# (LFD* = any number of LFDs, including zero) +# If all bytes match between the two (except for the reordering), then use the vanilla ordering. +# This could be written to anchor around the "BL, NOP" instructions in unstripped profiler code, +# or to check for the presence of "ADDI, MTLR, BLR" soon after. +# This also could be written to decode the operands of each instruction to make sure the reorder is harmless. +# Neither of these safeguards are necessary at the moment. +LWZ = 32 +LMW = 46 +LFD = 50 +idx = 0 +while idx+4 < len(final_bytes): + if final_bytes[idx] >> 2 == LMW and final_bytes[idx+4] >> 2 == LWZ and vanilla_bytes[idx] >> 2 == LWZ: + start_idx = idx + lmw_bytes = final_bytes[idx:idx+4] + lwz_bytes = final_bytes[idx+4:idx+8] + if vanilla_bytes[idx:idx+4] != lwz_bytes: + idx += 4 + continue + lfd_bytes = b"" + idx += 4 + while vanilla_bytes[idx] >> 2 == LFD: + lfd_bytes += vanilla_bytes[idx:idx+4] + idx += 4 + if vanilla_bytes[idx:idx+4] != lmw_bytes: + continue + if final_bytes[start_idx+8:start_idx+8+len(lfd_bytes)] != lfd_bytes: + continue + idx += 4 + final_bytes = final_bytes[:start_idx] + lwz_bytes + lfd_bytes + lmw_bytes + final_bytes[idx:] + continue + idx += 4 + +with open(args.target, "wb") as f: + f.write(final_bytes)