#!/usr/bin/env python3
# Written by Ethan Roseman (ethteck)
# MIT License
# Copyright 2021
# Modified by EpochFlame

import argparse
import sys
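
# What this script does, mechanically (summary inferred from the logic below):
# it takes a vanilla object and a build of the same code compiled with
# profiling, strips the profiler's injected bl/nop pairs from the profiled
# code, splices the stripped code into the vanilla object's code section, and
# then applies a few fixups (restoring blr variants, reuniting mtlr/blr pairs,
# reordering loads) where the profiled schedule still differs from vanilla.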

# Byte sequence that marks code size
CODESIZE_MAGIC = b"\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00\x00\x34"
BLR_BYTE_SEQ = b"\x4E\x80\x00\x20"   # blr
MTLR_BYTE_SEQ = b"\x7C\x08\x03\xA6"  # mtlr r0
# bl (offset to be filled by relocation) followed by nop; the pair the profiler injects
PROFILE_EXTRA_BYTES = b"\x48\x00\x00\x01\x60\x00\x00\x00"

LWZ_BYTE = b"\x80"  # first byte of an lwz targeting r0-r7 (primary opcode 32)

# Byte sequence array for branches to link register
|
|
|
|
BLR_BYTE_SEQ_ARRAY = [BLR_BYTE_SEQ,
|
|
|
|
b"\x4D\x80\x00\x20", b"\x4D\x80\x00\x21", b"\x4C\x81\x00\x20", b"\x4C\x81\x00\x21",
|
|
|
|
b"\x4D\x82\x00\x20", b"\x4D\x82\x00\x21", b"\x4C\x80\x00\x20", b"\x4C\x80\x00\x21",
|
|
|
|
b"\x4D\x81\x00\x20", b"\x4D\x81\x00\x21", b"\x4C\x80\x00\x20", b"\x4C\x80\x00\x21",
|
|
|
|
b"\x4C\x82\x00\x20", b"\x4C\x82\x00\x21", b"\x4C\x81\x00\x20", b"\x4C\x81\x00\x21",
|
|
|
|
b"\x4D\x83\x00\x20", b"\x4D\x83\x00\x21", b"\x4C\x83\x00\x20", b"\x4C\x83\x00\x21",
|
|
|
|
b"\x4D\x83\x00\x20", b"\x4D\x83\x00\x21", b"\x4C\x83\x00\x20", b"\x4C\x83\x00\x21"]

# Example invocation: ./frank.py vanilla.o profile.o output.o
parser = argparse.ArgumentParser()
parser.add_argument("vanilla", help="Path to the vanilla object", type=argparse.FileType('rb'))
parser.add_argument("profile", help="Path to the profile object", type=argparse.FileType('rb'))
parser.add_argument("target", help="Path to the target object (to write)")

args = parser.parse_args()

# Read contents and close files
vanilla_bytes = args.vanilla.read()
args.vanilla.close()

# If the file contains no code, the codesize magic will not be found
# and the vanilla object requires no modification.
code_size_magic_idx = vanilla_bytes.find(CODESIZE_MAGIC)
if code_size_magic_idx == -1:
    with open(args.target, "wb") as f:
        f.write(vanilla_bytes)
    sys.exit(0)

profile_bytes = args.profile.read()
args.profile.close()

# Peephole rescheduling
#
# This is the pattern we will detect:
#   (A) lwz <--.  .--> (A) li
#   (B) li  <---\-'        bl
#                \         nop
#                 '--> (B) lwz
#
# If the profiled schedule swaps the
# instructions around the bl/nop, we
# instead use the vanilla schedule.
#
idx = 8
shift = 0  # difference between vanilla and profile code, due to bl/nops
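
# Every bl/nop pair exists only in the profile object, so for each epilogue
# already processed the vanilla file sits 8 more bytes behind the profile
# file; v_pos below rebases a profile offset into vanilla coordinates using
# this running shift.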
while idx < len(profile_bytes) - 16:
    # Find next epilogue
    epi_pos = profile_bytes.find(PROFILE_EXTRA_BYTES, idx)
    if epi_pos == -1:
        break  # break while loop when no targets remain
    if epi_pos % 4 != 0:  # check 4-byte alignment
        idx += 4
        continue

    v_pos = epi_pos - shift
    shift += 8

    vanilla_inst_a = vanilla_bytes[v_pos-4:v_pos]
    vanilla_inst_b = vanilla_bytes[v_pos:v_pos+4]
    vanilla_inst_c = vanilla_bytes[v_pos+4:v_pos+8]
    profile_inst_a = profile_bytes[epi_pos-4:epi_pos]
    profile_inst_b = profile_bytes[epi_pos+8:epi_pos+12]
    profile_inst_c = profile_bytes[epi_pos+12:epi_pos+16]
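
    # The primary opcode of a PowerPC instruction is its top 6 bits, so
    # shifting an instruction's first byte right by 2 recovers it
    # (e.g. 0x80 >> 2 == 32 == lwz).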
    opcode_a = vanilla_inst_a[0] >> 2
    opcode_b = vanilla_inst_b[0] >> 2
    opcode_c = vanilla_inst_c[0] >> 2

    LWZ = 0x80 >> 2
    LFS = 0xC0 >> 2
    ADDI = 0x38 >> 2
    LI = ADDI  # an LI instruction is just an ADDI with RA=0
    LMW = 0xB8 >> 2
    FDIVS = 0xEC >> 2

    # Adjust LWZ and LMW loading from r1; the vanilla_inst_a[2] == 0x00 check
    # limits this to loads whose displacement fits in the low byte.
    if opcode_a in [LWZ, LMW] and vanilla_inst_a[2] == 0x00 and \
            opcode_b in [LI, LFS, FDIVS] and \
            vanilla_inst_a == profile_inst_b and \
            vanilla_inst_b == profile_inst_a and \
            vanilla_inst_c == profile_inst_c and \
            opcode_c != ADDI:  # <- don't reorder if at the very end of the epilogue

        # Swap instructions (A) and (B)
        profile_bytes = profile_bytes[:epi_pos-4] \
            + vanilla_inst_a \
            + PROFILE_EXTRA_BYTES \
            + vanilla_inst_b \
            + profile_bytes[epi_pos+12:]

    # Similar reordering for lwz/lmw, except both insns follow the bl/nop
    elif opcode_b == LWZ and \
            opcode_c == LMW and \
            vanilla_inst_b == profile_inst_c and \
            vanilla_inst_c == profile_inst_b:

        profile_bytes = profile_bytes[:epi_pos+8] \
            + vanilla_inst_b \
            + vanilla_inst_c \
            + profile_bytes[epi_pos+16:]

    idx = epi_pos + 8

# Remove byte sequence
stripped_bytes = profile_bytes.replace(PROFILE_EXTRA_BYTES, b"")
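# Dropping each 8-byte bl/nop pair shrinks the profiled code back toward the
# vanilla layout; the splice below relies on the two layouts lining up.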

# Find end of code sections in vanilla and stripped bytes
code_size_offset = code_size_magic_idx + len(CODESIZE_MAGIC)
code_size_bytes = vanilla_bytes[code_size_offset:code_size_offset+4]
code_size = int.from_bytes(code_size_bytes, byteorder='big')

eoc_offset = 0x34 + code_size
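# 0x34 appears to be the fixed header size before the code section in these
# objects: the codesize magic itself ends in 0x34, and the splice below also
# starts copying at that offset.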

# Bail out if the eoc lands exactly at end-of-file
assert eoc_offset != len(vanilla_bytes)

# Replace 0x34 - eoc in vanilla with bytes from stripped
final_bytes = vanilla_bytes[:0x34] + stripped_bytes[0x34:eoc_offset] + vanilla_bytes[eoc_offset:]
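
# The fixups below re-copy individual vanilla instructions into final_bytes
# wherever the profiled compile is expected to differ only by scheduling or by
# the flavor of its return instruction.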

# Fix branches to link register: wherever the vanilla object has a (possibly
# conditional) blr at an aligned offset, copy the vanilla bytes back over the
# spliced code at the same position.
for seq in BLR_BYTE_SEQ_ARRAY:
    idx = 0

    while idx < len(vanilla_bytes):
        found_pos = vanilla_bytes.find(seq, idx)
        if found_pos == -1:
            break  # break while loop when no targets remain
        if found_pos % 4 != 0:  # check 4-byte alignment
            idx += 4
            continue
        final_bytes = final_bytes[:found_pos] + vanilla_bytes[found_pos:found_pos+4] + final_bytes[found_pos+4:]
        idx = found_pos + len(seq)

# Reunify mtlr/blr instructions, shifting intermediary instructions up
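# e.g. (illustrative registers) "mtlr r0; lwz r31, 0xc(r1); addi r1, r1, 0x10; blr"
# becomes "lwz r31, 0xc(r1); addi r1, r1, 0x10; mtlr r0; blr"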
idx = 0

while idx < len(final_bytes):
    # Find mtlr position
    mtlr_found_pos = final_bytes.find(MTLR_BYTE_SEQ, idx)
    if mtlr_found_pos == -1:
        break  # break while loop when no targets remain
    if mtlr_found_pos % 4 != 0:  # check 4-byte alignment
        idx += 4
        continue
    # Find paired blr position
    blr_found_pos = final_bytes.find(BLR_BYTE_SEQ, mtlr_found_pos)
    if blr_found_pos == -1:
        break  # break while loop when no targets remain
    if blr_found_pos % 4 != 0:  # check 4-byte alignment
        idx += 4
        continue
    if mtlr_found_pos + 4 == blr_found_pos:
        idx += 4
        continue  # continue if mtlr is followed directly by blr

    # Rotate the mtlr down so it sits immediately before its blr
    final_bytes = final_bytes[:mtlr_found_pos] + final_bytes[mtlr_found_pos+4:blr_found_pos] + final_bytes[mtlr_found_pos:mtlr_found_pos+4] + final_bytes[blr_found_pos:]
    idx = mtlr_found_pos + len(MTLR_BYTE_SEQ)

# Reorder lmw/lwz/lfd instructions, if needed (@Altafen)
# Specifically: if the stripped profiler code contains the sequence
# "LMW, LWZ, LFD*" where the vanilla code contains "LWZ, LFD*, LMW"
# (LFD* = any number of LFDs, including zero), and all bytes match between
# the two except for the reordering, then use the vanilla ordering.
# This could be written to anchor around the "BL, NOP" instructions in
# unstripped profiler code, or to check for the presence of
# "ADDI, MTLR, BLR" soon after. It could also decode the operands of each
# instruction to make sure the reorder is harmless. Neither of these
# safeguards is necessary at the moment.
LWZ = 32
LMW = 46
LFD = 50
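# These are primary opcode numbers (first byte >> 2): 32 = lwz, 46 = lmw,
# 50 = lfd. They match LWZ = 0x80 >> 2 and LMW = 0xB8 >> 2 used earlier;
# lfd's first byte is 0xC8.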
idx = 0
while idx+4 < len(final_bytes):
    # Candidate: stripped code begins "lmw, lwz" where vanilla begins "lwz"
    if final_bytes[idx] >> 2 == LMW and final_bytes[idx+4] >> 2 == LWZ and vanilla_bytes[idx] >> 2 == LWZ:
        start_idx = idx
        lmw_bytes = final_bytes[idx:idx+4]
        lwz_bytes = final_bytes[idx+4:idx+8]
        if vanilla_bytes[idx:idx+4] != lwz_bytes:
            idx += 4
            continue
        # Collect the run of lfd instructions from the vanilla ordering
        lfd_bytes = b""
        idx += 4
        while vanilla_bytes[idx] >> 2 == LFD:
            lfd_bytes += vanilla_bytes[idx:idx+4]
            idx += 4
        if vanilla_bytes[idx:idx+4] != lmw_bytes:
            continue
        if final_bytes[start_idx+8:start_idx+8+len(lfd_bytes)] != lfd_bytes:
            continue
        idx += 4
        # All bytes match modulo the reordering; adopt the vanilla order
        final_bytes = final_bytes[:start_idx] + lwz_bytes + lfd_bytes + lmw_bytes + final_bytes[idx:]
        continue
    idx += 4

with open(args.target, "wb") as f:
    f.write(final_bytes)