prime/tools/reldisasm.py

#!/usr/bin/env python3

from capstone import *
from capstone.ppc import *
from elftools.elf.elffile import *
from elftools.elf.sections import *
import sys

# Shared state filled in while the input file is parsed.

# addr -> name
labels = {}

# fileOffset -> {addr, type}
relocations = {}

# index -> {offset, flags, length, is_bss, name}
sectionInfo = []

# Relocation type numbers. The R_PPC_* values follow the PowerPC ELF ABI
# numbering; the R_DOLPHIN_* values (>= 200) are control entries.
R_PPC_NONE            = 0
R_PPC_ADDR32          = 1
R_PPC_ADDR24          = 2
R_PPC_ADDR16          = 3
R_PPC_ADDR16_LO       = 4
R_PPC_ADDR16_HI       = 5
R_PPC_ADDR16_HA       = 6
R_PPC_ADDR14          = 7
R_PPC_ADDR14_BRTAKEN  = 8
R_PPC_ADDR14_BRNTAKEN = 9
R_PPC_REL24           = 10
R_PPC_REL14           = 11
R_PPC_REL14_BRTAKEN   = 12
R_PPC_REL14_BRNTAKEN  = 13
R_DOLPHIN_NOP         = 201
R_DOLPHIN_SECTION     = 202
R_DOLPHIN_END         = 203
R_DOLPHIN_MRKREF      = 204

# type number -> printable name. Every type the parser can store must have
# an entry here, otherwise building a relocation comment raises KeyError.
relocationTypeNames = {
    R_PPC_NONE:            'R_PPC_NONE',
    R_PPC_ADDR32:          'R_PPC_ADDR32',
    R_PPC_ADDR24:          'R_PPC_ADDR24',
    R_PPC_ADDR16:          'R_PPC_ADDR16',
    R_PPC_ADDR16_LO:       'R_PPC_ADDR16_LO',
    R_PPC_ADDR16_HI:       'R_PPC_ADDR16_HI',
    R_PPC_ADDR16_HA:       'R_PPC_ADDR16_HA',
    R_PPC_ADDR14:          'R_PPC_ADDR14',
    R_PPC_ADDR14_BRTAKEN:  'R_PPC_ADDR14_BRTAKEN',
    R_PPC_ADDR14_BRNTAKEN: 'R_PPC_ADDR14_BRNTAKEN',
    R_PPC_REL24:           'R_PPC_REL24',
    R_PPC_REL14:           'R_PPC_REL14',
    R_PPC_REL14_BRTAKEN:   'R_PPC_REL14_BRTAKEN',
    R_PPC_REL14_BRNTAKEN:  'R_PPC_REL14_BRNTAKEN',
    R_DOLPHIN_NOP:         'R_DOLPHIN_NOP',
    R_DOLPHIN_SECTION:     'R_DOLPHIN_SECTION',
    R_DOLPHIN_END:         'R_DOLPHIN_END',
    R_DOLPHIN_MRKREF:      'R_DOLPHIN_MRKREF',
}

def read_u8(offset):
    """Return the byte stored at `offset` in the loaded file image."""
    byte = filecontent[offset]
    return byte

def read_u16(offset):
    """Return the big-endian 16-bit value at `offset` in the loaded file image."""
    high = filecontent[offset + 0]
    low = filecontent[offset + 1]
    return (high << 8) | low

def read_u32(offset):
    """Return the big-endian 32-bit value at `offset` in the loaded file image."""
    value = 0
    # Fold the four bytes in, most significant first.
    for index in range(4):
        value = (value << 8) | filecontent[offset + index]
    return value

def add_label(addr, name=None):
    """Register a label for `addr` and return the name in effect.

    An address that already has a label keeps it, and that existing name is
    returned. Otherwise `name` is stored, defaulting to 'lbl_XXXXXXXX' (the
    address as eight upper-case hex digits) when no name is given.
    """
    try:
        return labels[addr]
    except KeyError:
        pass
    chosen = name if name is not None else 'lbl_%08X' % addr
    labels[addr] = chosen
    return chosen

# Load the whole input file into memory; the read_u8/read_u16/read_u32
# helpers index into this buffer.
if len(sys.argv) < 2:
    sys.exit('usage: %s <rel file> [elf file]' % sys.argv[0])
with open(sys.argv[1], 'rb') as file:
    filecontent = bytearray(file.read())

if len(sys.argv) >= 3:
    # Optional second argument: an ELF file whose .symtab provides names
    # for addresses.
    # Why is this so slow?
    with open(sys.argv[2], 'rb') as f:
        elf = ELFFile(f)
        elfsymtab = elf.get_section_by_name('.symtab')
        for i in range(0, elfsymtab.num_symbols()):
            sym = elfsymtab.get_symbol(i)
            # Skip unnamed symbols and names starting with '.' or '@'.
            if sym.name and sym.name[0] not in {'.', '@'}:
                add_label(sym['st_value'], sym.name)

# Fixed-position header fields (all big-endian).
# NOTE: `id` shadows the builtin; the name is kept because it is a
# module-level name of this script.
id = read_u32(0)
numSections = read_u32(0x0C)
sectionInfoOffset = read_u32(0x10)
nameOffset = read_u32(0x14)
nameSize = read_u32(0x18)
version = read_u32(0x1C)
bssSize = read_u32(0x20)
relOffset = read_u32(0x24)
impOffset = read_u32(0x28)
impSize = read_u32(0x2C)
prologSection = read_u8(0x30)
epilogSection = read_u8(0x31)
unresolvedSection = read_u8(0x32)
prolog = read_u32(0x34)
epilog = read_u32(0x38)
unresolved = read_u32(0x3C)

print("# id: %i" % id)
print("# version: %i" % version)
print("# nameoffset: 0x%X, size: 0x%X" % (nameOffset, nameSize))
print("# section table: 0x%X, size: 0x%X" % (sectionInfoOffset, numSections*8))
print("# imp table: 0x%X" % impOffset)
print("# relocs offset: 0x%X" % relOffset)
print("# _prolog:     %i:0x%X" % (prologSection, prolog))
print("# _epilog:     %i:0x%X" % (epilogSection, epilog))
print("# _unresolved: %i:0x%X" % (unresolvedSection, unresolved))
print("# num sections: %i" % numSections)
print('.include "macros.inc"')

# Read sections. Each table entry is 8 bytes: a word holding the file offset
# with flags in its low two bits, followed by the section length.
for i in range(0, numSections):
    o = sectionInfoOffset + i * 8
    offsetAndFlags = read_u32(o + 0)
    section = {
        'offset': offsetAndFlags & ~3,
        'flags': offsetAndFlags & 3,
        'length': read_u32(o + 4)
    }
    # A section with no file offset but a non-zero length is treated as bss.
    section['is_bss'] = section['offset'] == 0 and section['length'] > 0
    if section['is_bss']:
        # Hack: if bss, then set file offset to something unique as to not
        # clash with other symbols
        section['offset'] = 0x10000000
        section['name'] = '.bss%i' % i
    elif section['flags'] & 1:
        # Flag bit 0 marks a code section.
        section['name'] = '.text%i' % i
    else:
        section['name'] = '.data%i' % i
    sectionInfo.append(section)
    print("# offset: 0x%08X\tlength: 0x%08X\tflags: %i" %
        (section['offset'], section['length'], section['flags']))


# Sections 1..6 get fixed names. Only sections that actually exist are
# renamed, so a file with a shorter section table no longer raises IndexError.
for sectionIndex, sectionName in enumerate(
        ('.text', '.ctors', '.dtors', '.rodata', '.data', '.bss'), start=1):
    if sectionIndex < len(sectionInfo):
        sectionInfo[sectionIndex]['name'] = sectionName

# Add labels for prologue and epilogue
if prologSection != 0:
    labels[sectionInfo[prologSection]['offset'] + prolog] = '_prolog'
if epilogSection != 0:
    labels[sectionInfo[epilogSection]['offset'] + epilog] = '_epilog'
if unresolvedSection != 0:
    labels[sectionInfo[unresolvedSection]['offset'] + unresolved] = '_unresolved'

def read_relocation_info(module, o):
    """Parse one relocation list starting at file offset `o`.

    Each entry is 8 bytes: u16 offset delta, u8 type, u8 section, u32 addend.
    R_DOLPHIN_SECTION entries select the section being patched,
    R_DOLPHIN_END terminates the list, and every entry's offset delta
    advances the current patch position. Entries with a type below 200 are
    recorded in `relocations`, keyed by the file offset they patch.

    `module` is the id from the import table: 0 means the target is an
    absolute address in the dol (which must already have a label), anything
    else means the target is `addend` bytes into section `section` of this
    file.

    Exits with status 1 after the whole list is read if any dol symbol was
    missing. Raises ValueError if an entry needs a patch position before any
    R_DOLPHIN_SECTION entry has been seen.
    """
    relocOffset = None  # current patch position; set by R_DOLPHIN_SECTION
    missingSymbols = False
    while True:
        offset = read_u16(o + 0)
        relocType = read_u8(o + 2)
        section = read_u8(o + 3)
        addend = read_u32(o + 4)

        # The terminator carries no data, so stop before touching the patch
        # position (a list consisting only of R_DOLPHIN_END is valid).
        if relocType == R_DOLPHIN_END:
            break

        if relocType == R_DOLPHIN_SECTION:
            relocOffset = sectionInfo[section]['offset']
        elif relocOffset is None:
            raise ValueError(
                'relocation entry at 0x%X precedes any R_DOLPHIN_SECTION entry' % o)

        # Get file offset for relocation
        relocOffset += offset

        if relocType < 200:
            # Get address of symbol and add label
            if module == 0:  # dol
                symAddr = addend
                if symAddr not in labels:
                    print('error: symbol for 0x%08X not found' % symAddr)
                    missingSymbols = True
            else:  # rel
                symAddr = sectionInfo[section]['offset'] + addend
                # NOTE: this replaces any label already stored for symAddr.
                labels[symAddr] = 'lbl_%08X' % symAddr

            relocations[relocOffset] = {
                'addr': symAddr,
                'type': relocType,
            }

        o += 8
    if missingSymbols:
        sys.exit(1)

# The import table is an array of 8-byte entries: a module id followed by the
# file offset of that module's relocation list. Floor division keeps the
# count an integer.
numImpEntries = impSize // 8
for i in range(numImpEntries):
    o = impOffset + i * 8
    module = read_u32(o + 0)
    relocation = read_u32(o + 4)
    read_relocation_info(module, relocation)


# 32-bit big-endian PowerPC disassembler. Detail mode is required because the
# disassembly code inspects instruction operands.
cs = Cs(CS_ARCH_PPC, CS_MODE_32 | CS_MODE_BIG_ENDIAN)
cs.detail = True
cs.imm_unsigned = False

def get_relocation_for_offset(o):
    """Return the relocation recorded for any of the four bytes starting at `o`.

    Offsets are checked in ascending order and the first hit wins; None is
    returned when none of o..o+3 has a relocation.
    """
    hits = (relocations[pos] for pos in range(o, o + 4) if pos in relocations)
    return next(hits, None)


def get_label(addr):
    """Return the label for `addr`, or the address as '0xXXXXXXXX' if it has none."""
    try:
        return labels[addr]
    except KeyError:
        return '0x%08X' % addr

def print_label(label):
    """Print `label` as an assembly label definition.

    The three entry points (_prolog, _epilog, _unresolved) are additionally
    declared with a `.global` directive on the line before the label.
    """
    if label in ('_prolog', '_epilog', '_unresolved'):
        print('.global %s' % label)
    print('%s:' % label)

def sign_extend_16(value):
    """Reinterpret a positive value with bit 15 set as a negative number.

    Exactly 0x10000 is subtracted from such values; zero, negative values
    and values with bit 15 clear are returned unchanged.
    """
    wraps = value > 0 and (value & 0x8000) != 0
    return value - 0x10000 if wraps else value

def disasm_fcmp(inst):
    """Format the 32-bit word `inst` as an `fcmpo` instruction.

    The condition-register field is taken from bits 23-25 and the two
    float-register fields from bits 16-20 and 11-15 (bit 0 = least
    significant).
    """
    cr_field = (inst >> 23) & 0x7
    fr_a = (inst >> 16) & 0x1f
    fr_b = (inst >> 11) & 0x1f
    return 'fcmpo cr%i, f%i, f%i' % (cr_field, fr_a, fr_b)

def disasm_mspr(inst, mode):
    """Format the word `inst` as `mtspr` (truthy `mode`) or `mfspr` (falsy `mode`).

    Returns None when bit 0 of the word is set. The SPR number is assembled
    from two 5-bit fields: bits 11-15 supply the upper half and bits 16-20
    the lower half.
    """
    if inst & 1:
        return None
    gpr = (inst >> 21) & 0x1f
    spr_low = (inst >> 16) & 0x1f
    spr_high = (inst >> 11) & 0x1f
    spr = (spr_high << 5) | spr_low
    if not mode:
        return 'mfspr r%i, 0x%X' % (gpr, spr)
    return 'mtspr 0x%X, r%i' % (spr, gpr)

def disasm_mcrxr(inst):
    """Format the word `inst` as an `mcrxr` instruction.

    Returns None if any bit selected by the mask 0x007ff801 is set;
    otherwise the condition-register field comes from bits 23-25.
    """
    must_be_zero = inst & 0x007ff801
    if must_be_zero != 0:
        return None
    return 'mcrxr cr%i' % ((inst >> 23) & 0x7)

def disassemble_insn_that_capstone_cant_handle(o, reloc):
    """Decode the word at file offset `o` by hand.

    Handles mtspr, mfspr, mcrxr and fcmpo, selected by the primary opcode
    (top six bits) and the extended opcode (bits 1-10). Anything else, or a
    word the selected decoder rejects, is emitted as a raw `.4byte` with an
    error comment followed by the relocation comment for `reloc`, if any.
    Paired-single opcodes are not decoded here.
    """
    # Build the relocation comment up front so an unknown relocation type
    # fails the same way regardless of what the instruction turns out to be.
    relocComment = ''
    if reloc:
        relocComment = '\t;# %s:%s' % (get_label(reloc['addr']), relocationTypeNames[reloc['type']])

    word = read_u32(o)
    primary = (word >> 26) & 0x3f
    extended = (word >> 1) & 0x3ff

    decoders = {
        (31, 467): lambda w: disasm_mspr(w, 1),  # mtspr
        (31, 339): lambda w: disasm_mspr(w, 0),  # mfspr
        (31, 512): disasm_mcrxr,                 # mcrxr
        (63, 32): disasm_fcmp,                   # fcmpo
    }
    decode = decoders.get((primary, extended))
    text = decode(word) if decode is not None else None
    if text:
        return text
    return '.4byte 0x%08X  ;# (error: unknown instruction) %s' % (word, relocComment)

def disassemble_insn(o, reloc):
    """Return the assembly text for the instruction at file offset `o`.

    `reloc` is the relocation entry covering this instruction (a dict with
    'addr' and 'type') or None. Relocated operands are replaced with the
    target label (using @ha / @l where the relocation type calls for it),
    branch targets are replaced with labels, and 16-bit immediates with the
    sign bit set are printed as negative numbers. Words capstone cannot
    decode are passed to disassemble_insn_that_capstone_cant_handle().
    """
    if reloc:
        relocComment = '\t;# %s:%s' % (get_label(reloc['addr']), relocationTypeNames[reloc['type']])
        relocType = reloc['type']
    else:
        relocComment = ''
        relocType = -1
    try:
        insn = next(cs.disasm(filecontent[o : o+4], o))
    except StopIteration:
        return disassemble_insn_that_capstone_cant_handle(o, reloc)

    # handle relocs label
    if insn.id in {PPC_INS_BL, PPC_INS_BC} and relocType == R_PPC_REL24:
        return '%s %s' % (insn.mnemonic, get_label(reloc['addr']))
    if insn.id == PPC_INS_LIS and relocType == R_PPC_ADDR16_HA:
        return '%s %s, %s@ha' % (insn.mnemonic, insn.reg_name(insn.operands[0].reg), get_label(reloc['addr']))
    if insn.id == PPC_INS_ADDI and relocType == R_PPC_ADDR16_LO:
        return '%s %s, %s, %s@l' % (insn.mnemonic, insn.reg_name(insn.operands[0].reg), insn.reg_name(insn.operands[1].reg), get_label(reloc['addr']))
    if insn.id in {
        PPC_INS_LWZ,  PPC_INS_LHZ,  PPC_INS_LHA,  PPC_INS_LBZ,
        PPC_INS_LWZU, PPC_INS_LHZU, PPC_INS_LHAU, PPC_INS_LBZU,
        PPC_INS_LFS,  PPC_INS_LFD,
        PPC_INS_LFSU, PPC_INS_LFDU,
        PPC_INS_STW,  PPC_INS_STH,  PPC_INS_STB,
        PPC_INS_STWU, PPC_INS_STHU, PPC_INS_STBU,
        PPC_INS_STFS, PPC_INS_STFD,
        PPC_INS_STFSU, PPC_INS_STFDU} \
        and relocType == R_PPC_ADDR16_LO:
        return '%s %s, %s@l(%s)' % (insn.mnemonic, insn.reg_name(insn.operands[0].reg), get_label(reloc['addr']), insn.reg_name(insn.operands[1].mem.base))

    # branch target labels
    if insn.id in {PPC_INS_B, PPC_INS_BL, PPC_INS_BDZ, PPC_INS_BDNZ, PPC_INS_BC}:
        if reloc:
            return '%s %s' % (insn.mnemonic, get_label(reloc['addr']))
        # Hard-coded workaround for one specific offset where the operand
        # reported by capstone is not usable. NOTE(review): this is tied to
        # a particular input file.
        if o == 0xAD8C:
            return '%s lbl_0000ADB0' % insn.mnemonic
        return '%s %s' % (insn.mnemonic, get_label(insn.operands[0].imm))

    # misc. fixes

    # Print 16-bit immediates with the sign bit set as negative numbers.
    # Only the low 16 bits are sign-extended, so the result is correct
    # whether capstone reports the operand as unsigned (0x8000..0xFFFF) or
    # as an already-negative value; the old `imm - 0x10000` turned -1 into
    # -65537 in the latter case.
    if insn.id in {PPC_INS_ADDI, PPC_INS_ADDIC, PPC_INS_SUBFIC, PPC_INS_MULLI}:
        imm = insn.operands[2].imm
        if imm & 0x8000:
            return "%s %s, %s, %i  ;# fixed addi" % (insn.mnemonic, insn.reg_name(insn.operands[0].reg), insn.reg_name(insn.operands[1].value.reg), (imm & 0xFFFF) - 0x10000)
    if insn.id == PPC_INS_LI or insn.id == PPC_INS_CMPWI:
        imm = insn.operands[1].imm
        if imm & 0x8000:
            return "%s %s, %i" % (insn.mnemonic, insn.reg_name(insn.operands[0].reg), (imm & 0xFFFF) - 0x10000)
    # cntlz -> cntlzw
    if insn.id == PPC_INS_CNTLZW:
        return "cntlzw %s" % insn.op_str

    return '%s %s%s' % (insn.mnemonic, insn.op_str, relocComment)

def scan_local_labels(o, size):
    """Create labels for branch targets in the code range [o, o + size).

    The range is walked one 4-byte word at a time. Words covered by a
    relocation are skipped, as are words capstone cannot decode. For the
    branch instructions b, bl, bc, bdz and bdnz, every immediate operand is
    registered through add_label().
    """
    branch_ids = {PPC_INS_B, PPC_INS_BL, PPC_INS_BC, PPC_INS_BDZ, PPC_INS_BDNZ}
    for pos in range(o, o + size, 4):
        if get_relocation_for_offset(pos):
            continue
        # next() with a default stands in for catching StopIteration when
        # capstone yields nothing for this word.
        insn = next(cs.disasm(filecontent[pos:pos + 4], pos), None)
        if insn is None or insn.id not in branch_ids:
            continue
        for operand in insn.operands:
            if operand.type == PPC_OP_IMM:
                add_label(operand.imm)

def dump_code(o, size):
    """Print the disassembly of the code range [o, o + size).

    Local branch labels are collected first. Each complete 4-byte word is
    then printed as '/* offset rawword */ text', preceded by its label when
    the offset has one. If the range does not end on a word boundary, the
    leftover bytes are not decoded and 'incomplete' is printed instead.
    """
    scan_local_labels(o, size)
    end = o + size
    # Stop before a trailing partial word so the check below can fire and
    # no bytes beyond the section are read as part of an instruction.
    while o + 4 <= end:
        if o in labels:
            print_label(labels[o])
        asm = disassemble_insn(o, get_relocation_for_offset(o))
        print('/* %08X %08X */ %s' % (o, read_u32(o), asm))
        o += 4
    if o < end:
        print('incomplete')

def is_aligned(num):
    """Return True if `num` is a multiple of 4."""
    remainder = num % 4
    return remainder == 0

def align(num):
    """Round `num` up to the next multiple of 4 (unchanged if already aligned)."""
    padding = -num % 4
    return num + padding

def is_ascii(code):
    """Return True if `code` is a printable ASCII character, a tab or a newline."""
    printable = 0x20 <= code <= 0x7E  # normal characters
    return printable or code in (0x09, 0x0A)  # tab, newline

def is_all_zero(arr):
    """Return True if every element of `arr` equals zero (True for an empty `arr`)."""
    return not any(val != 0 for val in arr)

def hex_bytes(data):
    """Return the values in `data` as comma-separated '0xNN' hex literals."""
    formatted = []
    for byte in data:
        formatted.append('0x%02X' % byte)
    return ', '.join(formatted)

def read_string(data, pos):
    """Read a NUL-terminated string from `data` starting at `pos`.

    Characters are consumed while is_ascii() accepts them. The text is
    returned only if the run is immediately followed by a zero byte;
    otherwise (other byte, or end of data) an empty string is returned.
    """
    chars = []
    limit = len(data)
    while pos < limit and is_ascii(data[pos]):
        chars.append(chr(data[pos]))
        pos += 1
    terminated = pos < limit and data[pos] == 0
    return ''.join(chars) if terminated else ''

def escape_string(text):
    """Escape backslash, double quote, newline and tab for a C string literal."""
    # Backslash must be handled first so the backslashes introduced by the
    # later replacements are not escaped again.
    substitutions = (
        ('\\', '\\\\'),
        ('"', '\\"'),
        ('\n', '\\n'),
        ('\t', '\\t'),
    )
    escaped = text
    for raw, replacement in substitutions:
        escaped = escaped.replace(raw, replacement)
    return escaped

def output_data_range(secNum, o, end):
    """Print the data bytes in [o, end) as assembler directives.

    An unaligned head is emitted as `.byte`. After that, each 4-byte word is
    emitted as `.4byte`, using the target label when an R_PPC_ADDR32
    relocation covers it. In section 5 only, runs of at least four ASCII
    characters terminated by NUL are emitted as `.asciz`, provided the
    string and its padding to the next 4-byte boundary lie inside the range
    and the padding is all zero. Leftover tail bytes are emitted as `.byte`.
    """
    print('    # 0x%X' % o)
    if not is_aligned(o):
        # Never run past `end`: the bytes beyond it belong to the next range
        # and would otherwise be printed twice.
        headEnd = min(align(o), end)
        print('    .byte ' + hex_bytes(filecontent[o:headEnd]))
        o = headEnd
    while o < (end & ~3):
        # Try to see if this is a string.
        if secNum == 5:  # strings are only in .data
            string = read_string(filecontent, o)
            if len(string) >= 4:
                strEnd = o + len(string) + 1
                padEnd = align(strEnd)
                # The padding slice is empty when strEnd is already aligned.
                # The slice end must be an absolute offset; a length here
                # made the slice empty and the zero check always pass.
                if padEnd <= end and is_all_zero(filecontent[strEnd:padEnd]):
                    print('    .asciz "%s"' % escape_string(string))
                    if not is_aligned(strEnd):
                        print('    .balign 4')
                    o = padEnd
                    continue
        # Not a string
        reloc = get_relocation_for_offset(o)
        if reloc is None:
            value = '0x%08X' % read_u32(o)
        elif reloc['type'] == R_PPC_ADDR32:
            value = labels[reloc['addr']]
        else:
            print('dunno what to do about %s here' % relocationTypeNames[reloc['type']])
            # Fall back to the raw word so `value` is never unbound or left
            # over from a previous iteration.
            value = '0x%08X' % read_u32(o)
        print('    .4byte %s' % value)
        o += 4
    if o < end:
        print('    .byte ' + hex_bytes(filecontent[o:end]))
    return


def dump_data(secNum, o, size):
    """Print the data section [o, o + size), split at every labelled offset.

    Each stretch between two labels (or a section boundary) is handed to
    output_data_range(); labels are printed just before the data they name.
    """
    end = o + size
    rangeStart = o
    for pos in range(o, end):
        if pos not in labels:
            continue
        # Flush whatever precedes this label, then start a new stretch here.
        if pos > rangeStart:
            output_data_range(secNum, rangeStart, pos)
        print_label(labels[pos])
        rangeStart = pos
    if end > rangeStart:
        output_data_range(secNum, rangeStart, end)


def output_bss_range(start, end):
    """Print a `.skip` directive reserving the `end - start` bytes of a bss range."""
    length = end - start
    print('    .skip 0x{:X}'.format(length))

def dump_bss(o, size):
    """Print the bss section [o, o + size), split at every labelled offset.

    Each stretch between two labels (or a section boundary) becomes one
    `.skip` directive via output_bss_range(); labels are printed just before
    the space they name.
    """
    end = o + size
    rangeStart = o
    for pos in range(o, end):
        if pos not in labels:
            continue
        # Flush whatever precedes this label, then start a new stretch here.
        if pos > rangeStart:
            output_bss_range(rangeStart, pos)
        print_label(labels[pos])
        rangeStart = pos
    if end > rangeStart:
        output_bss_range(rangeStart, end)


# Emit every non-empty section, picking the dumper that matches its kind.
for i, section in enumerate(sectionInfo):
    # Table entries with neither an offset nor a length are unused.
    if section['offset'] == 0 and section['length'] == 0:
        continue
    print('# %i' % i)
    print('.section %s' % section['name'])
    if section['is_bss']:
        # bss section
        dump_bss(section['offset'], section['length'])
    elif section['flags'] & 1:
        # code section
        dump_code(section['offset'], section['length'])
    elif section['offset'] != 0:
        # data section
        dump_data(i, section['offset'], section['length'])
    print('')