#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.10"
# dependencies = ["clang==17.0.6"]
# ///
"""
Generate Windows ABI trampolines by scanning C++ prototypes using libclang.

This emits x86 trampolines for guest<->host calls.
"""

if __name__ == "__main__":
    import script_venv

    script_venv.bootstrap_venv(__file__)

import argparse
import ctypes
import os
import sys
import tempfile
from dataclasses import dataclass, field
from enum import Enum, IntEnum
from pathlib import Path
from typing import Iterable, List, Optional

from clang.cindex import (
    Config,
    Cursor,
    CursorKind,
    Index,
    StorageClass,
    TranslationUnit,
    Type,
    TypeKind,
    conf,
)
from clang.cindex import Type as CXType

# Optional override: LIBCLANG_PATH may name either the libclang shared library
# itself or the directory containing it. Anything else only produces a warning.
if "LIBCLANG_PATH" in os.environ:
    libclang_path = os.environ["LIBCLANG_PATH"]
    if not (os.path.isfile(libclang_path) or os.path.isdir(libclang_path)):
        sys.stderr.write(
            f"Warning: LIBCLANG_PATH={libclang_path} is not a file or directory\n"
        )
    elif os.path.isfile(libclang_path):
        Config.set_library_file(libclang_path)
    else:
        Config.set_library_path(libclang_path)


class Arch(str, Enum):
    """Architectures for which argument layouts and thunks can be generated."""

    X86 = "x86"
    X86_64 = "x86_64"


class CallingConv(IntEnum):
    """CXCallingConv enum values from clang-c/Index.h.

    The numeric values must match libclang exactly because raw integers
    returned by ``clang_getFunctionTypeCallingConv`` are converted with
    ``CallingConv(value)``.

    In the C header ``CXCallingConv_X86_64Win64`` is an alias of
    ``CXCallingConv_Win64`` (both 10), so the members that follow continue
    from 11. ``X86_64_WIN64`` is therefore an enum alias here:
    ``CallingConv(10)`` yields ``WIN64``.
    """

    DEFAULT = 0
    C = 1
    X86_STDCALL = 2
    X86_FASTCALL = 3
    X86_THISCALL = 4
    X86_PASCAL = 5
    AAPCS = 6
    AAPCS_VFP = 7
    X86_REGCALL = 8
    INTELOCLBICC = 9
    WIN64 = 10
    X86_64_WIN64 = 10  # Alias of WIN64, as in the C header
    X86_64_SYSV = 11
    X86_VECTORCALL = 12
    SWIFT = 13
    PRESERVEMOST = 14
    PRESERVEALL = 15
    AARCH64_VECTORCALL = 16
    SWIFTASYNC = 17
    AARCH64_SVEPCS = 18
    M68K_RTD = 19
    INVALID = 100
    UNEXPOSED = 200


# Bind libclang's clang_getFunctionTypeCallingConv by hand and declare its
# ctypes signature: it takes a CXType and returns the calling convention as a
# C int, which _get_function_calling_conv converts to a CallingConv member.
_get_calling_conv = conf.lib.clang_getFunctionTypeCallingConv
_get_calling_conv.argtypes = [CXType]
_get_calling_conv.restype = ctypes.c_int


def _get_function_calling_conv(func_type: CXType) -> CallingConv:
    """Return the calling convention libclang reports for ``func_type``.

    Raises:
        ValueError: if the raw value returned by libclang is not a member of
            ``CallingConv``.
    """
    raw_value = _get_calling_conv(func_type)
    return CallingConv(raw_value)


class ArgClass(str, Enum):
    """How a value is passed: as an integer-like value or via memory."""

    INT = "int"
    MEMORY = "memory"


@dataclass
class ArgInfo:
    """Classification of one argument type or return type."""

    # The libclang type as written in the declaration (not canonicalized).
    type: Type
    # Passing class assigned by _calculate_arg_info.
    arg_class: ArgClass
    # True for signed integer kinds and HANDLE-like typedefs; consulted by the
    # 64-bit thunk emitter when widening a 32-bit value to 64 bits.
    sign_extended: bool


@dataclass(init=False)
class ArgPlacement:
    """Where one argument lives for a particular calling convention.

    A fresh placement is unassigned: both ``register`` and ``stack_offset``
    are ``None`` until the layout code fills in one of them.
    """

    # Size in bytes of the argument's canonical type.
    size: int
    # Bytes the argument occupies when passed on the given architecture.
    slot_size: int
    # Byte offset on the stack, when the argument is passed on the stack.
    stack_offset: Optional[int] = None
    # Register name, when the argument is passed in a register.
    register: Optional[str] = None

    def __init__(self, arg: ArgInfo, arch: Arch):
        """Derive the sizes for ``arg`` on ``arch``; the location stays unset."""
        canonical = arg.type.get_canonical()
        self.size = canonical.get_size()
        self.slot_size = _slot_size_for_arch(arg, arch)
        self.register = self.stack_offset = None


@dataclass
class FuncInfo:
    """A function declaration for which a guest-to-host thunk is generated."""

    # Enclosing namespaces joined with "::" (empty string at global scope).
    qualified_ns: str
    # Unqualified function name.
    name: str
    # Mangled symbol name; falls back to `name` when libclang reports none.
    mangled: str
    # Calling convention taken from the "CC:..." annotation on the declaration.
    source_cc: CallingConv
    # Calling convention libclang reports for the function's own type.
    target_cc: CallingConv
    # True when the function type is variadic.
    variadic: bool
    # Classification of the return type.
    return_type: ArgInfo
    # Classification of each parameter, in declaration order.
    args: List[ArgInfo] = field(default_factory=list)


@dataclass
class TypedefInfo:
    """A function-pointer typedef for which a host-to-guest thunk is generated."""

    # Name of the typedef.
    name: str
    # Calling convention libclang reports for the pointed-to function type.
    source_cc: CallingConv
    # Calling convention taken from the "CC:..." annotation on the typedef.
    target_cc: CallingConv
    # True when the pointed-to function type is variadic.
    variadic: bool
    # Classification of the return type.
    return_type: ArgInfo
    # Classification of each parameter, in declaration order.
    args: List[ArgInfo] = field(default_factory=list)


@dataclass
class VarInfo:
    """An extern variable declaration found in the translation unit."""

    # Enclosing namespaces joined with "::" (empty string at global scope).
    qualified_ns: str
    # Unqualified variable name.
    name: str


def parse_tu(
    headers: List[str], include_dirs: List[str], target: str
) -> TranslationUnit:
    """Parse a synthetic C++17 translation unit that includes ``headers``.

    A temporary ``.cpp`` file containing one ``#include "..."`` line per header
    is parsed for the given ``target`` triple with ``WIBO_CODEGEN=1`` defined
    and function bodies skipped. Diagnostics of warning severity or higher are
    written to stderr; parsing problems do not raise here.

    Args:
        headers: Header paths to include, in order.
        include_dirs: Directories passed to clang as ``-I`` options.
        target: Target triple passed to clang via ``-target``.

    Returns:
        The parsed translation unit.
    """
    include_lines = [f'#include "{h}"' for h in headers]

    clang_args = [
        "-x",
        "c++",
        "-std=c++17",
        "-target",
        target,
        "-DWIBO_CODEGEN=1",
    ]
    for inc in include_dirs:
        clang_args.extend(("-I", inc))

    with tempfile.NamedTemporaryFile("w", suffix=".cpp") as stub:
        stub.write("\n".join(include_lines) + "\n")
        # Make sure the contents are on disk before libclang opens the file.
        stub.flush()

        tu = Index.create().parse(
            stub.name,
            args=clang_args,
            options=TranslationUnit.PARSE_SKIP_FUNCTION_BODIES,
        )
        for diag in tu.diagnostics:
            if diag.severity >= diag.Warning:
                sys.stderr.write(str(diag) + "\n")
        return tu


def _cursor_namespace(cursor: Cursor) -> List[str]:
    """Return the named namespaces enclosing ``cursor``, outermost first.

    The walk follows semantic parents up to the translation unit; anonymous
    namespaces (empty spelling) are left out.
    """
    parts: List[str] = []
    node = cursor
    while node is not None and node.kind != CursorKind.TRANSLATION_UNIT:
        if node.kind == CursorKind.NAMESPACE and node.spelling:
            # Walking inside-out, so each outer namespace goes in front.
            parts.insert(0, node.spelling)
        node = node.semantic_parent
    return parts


def _source_cc_from_annotations(func: Cursor) -> CallingConv:
    """Return the calling convention named by a ``CC:...`` annotation.

    The direct children of ``func`` are scanned for annotate attributes; the
    first one spelling a recognized convention wins. ``CallingConv.DEFAULT`` is
    returned when no such annotation is present.
    """
    recognized = {
        "CC:fastcall": CallingConv.X86_FASTCALL,
        "CC:stdcall": CallingConv.X86_STDCALL,
        "CC:cdecl": CallingConv.C,
    }
    for child in func.get_children():
        if child.kind != CursorKind.ANNOTATE_ATTR:
            continue
        cc = recognized.get(child.spelling)
        if cc is not None:
            return cc
    return CallingConv.DEFAULT


def _is_handle_typedef(arg_type: CXType) -> bool:
    """Check if a type is a HANDLE-like typedef (HWND, HINSTANCE, etc.).

    The type is unwrapped through ELABORATED and TYPEDEF layers. It counts as
    HANDLE-like as soon as a typedef in that chain has an all-uppercase name
    starting with 'H'.
    """
    current = arg_type
    while True:
        if current.kind == TypeKind.TYPEDEF:
            decl = current.get_declaration()
            alias = decl.spelling
            # Windows HANDLE types conventionally start with 'H'
            if alias and alias.startswith("H") and alias.isupper():
                return True
            current = decl.underlying_typedef_type
        elif current.kind == TypeKind.ELABORATED:
            inner = current.get_named_type()
            if inner is None:
                return False
            current = inner
        else:
            # Neither a typedef nor an elaborated type: nothing left to unwrap.
            return False


# Canonical type kinds that _calculate_arg_info treats as signed integers
# (and therefore marks as sign-extended).
SIGNED_KINDS = [
    TypeKind.SCHAR,
    TypeKind.CHAR_S,
    TypeKind.SHORT,
    TypeKind.INT,
    TypeKind.LONG,
    TypeKind.LONGLONG,
    TypeKind.INT128,
]


def _calculate_arg_info(t: Type) -> ArgInfo:
    """Classify ``t`` for argument passing.

    Every type is currently classified as ``ArgClass.INT``; record types are
    not routed to ``ArgClass.MEMORY``. A pointer-to-pointer type prints a
    "Bugprone" notice to stdout. The result is marked sign-extended when the
    canonical kind is a signed integer kind or when ``t`` is a HANDLE-like
    typedef.
    """
    canonical = t.get_canonical()

    if (
        canonical.kind == TypeKind.POINTER
        and canonical.get_pointee().kind == TypeKind.POINTER
    ):
        print(f"Bugprone: Pointer to pointer ({_type_to_string(t)})")

    # The typedef chain is only walked when the canonical kind is not signed.
    needs_sign_extension = canonical.kind in SIGNED_KINDS
    if not needs_sign_extension:
        needs_sign_extension = _is_handle_typedef(t)

    return ArgInfo(
        type=t,
        arg_class=ArgClass.INT,
        sign_extended=needs_sign_extension,
    )


def _collect_args(func_type: CXType) -> List[ArgInfo]:
    """Classify every parameter type of ``func_type``, in declaration order."""
    return [_calculate_arg_info(param) for param in func_type.argument_types()]


def _slot_size_for_arch(arg: ArgInfo, arch: Arch) -> int:
    """Return the slot size in bytes used to pass ``arg`` on ``arch``.

    Pointers take one machine word (4 bytes on x86, 8 on x86_64). Other types
    are rounded up to the smallest slot the architecture offers: 4 or 8 bytes
    on x86, 8 bytes on x86_64.

    Raises:
        NotImplementedError: if the canonical type is larger than the largest
            supported slot.
    """
    canonical = arg.type.get_canonical()
    is_64bit = arch == Arch.X86_64
    if canonical.kind == TypeKind.POINTER:
        return 8 if is_64bit else 4

    size = canonical.get_size()
    if arch == Arch.X86:
        available_slots = (4, 8)
    elif is_64bit:
        available_slots = (8,)
    else:
        available_slots = ()

    for slot in available_slots:
        if size <= slot:
            return slot
    raise NotImplementedError(
        f"Argument size {size} not supported for architecture {arch.value}"
    )


@dataclass
class ArgLayout:
    """Result of compute_arg_layout: one placement per argument plus stack use."""

    # Placement of each argument, in the same order as the input arguments.
    args: List[ArgPlacement]
    # Total bytes of stack consumed (includes stack slots of skipped args).
    stack_size: int


def compute_arg_layout(
    args: List[ArgInfo],
    cc: CallingConv,
    arch: Arch,
    stack_offset: int = 0,
    skip_args: int = 0,
) -> ArgLayout:
    """Compute how each argument is passed for the given calling convention and arch.

    Registers are only used for x86 fastcall (ecx, edx) and x86_64 with the C
    convention (rdi, rsi, rdx, rcx, r8, r9); every other combination passes all
    arguments on the stack.

    Args:
        args: Arguments to place, in declaration order.
        cc: Calling convention to lay the arguments out for.
        arch: Architecture the convention applies to.
        stack_offset: Offset assigned to the first stack slot.
        skip_args: Number of leading extra arguments that are not in ``args``
            but occupy registers (or, once registers run out, one
            register-sized stack slot each).

    Returns:
        One placement per entry of ``args`` and the total stack bytes used.

    Raises:
        NotImplementedError: if an argument's size is unsupported on ``arch``.
    """
    uses_x86_fastcall = arch == Arch.X86 and cc == CallingConv.X86_FASTCALL
    if uses_x86_fastcall:
        registers: List[str] = ["ecx", "edx"]
    elif arch == Arch.X86_64 and cc == CallingConv.C:
        registers = ["rdi", "rsi", "rdx", "rcx", "r8", "r9"]
    else:
        registers = []

    # Skipped args are assumed to fit in a register each; those that do not
    # get a register take one register-sized stack slot ahead of the real args.
    word_size = 8 if arch == Arch.X86_64 else 4
    skipped_stack_bytes = max(0, skip_args - len(registers)) * word_size
    next_stack_offset = stack_offset + skipped_stack_bytes
    stack_bytes = skipped_stack_bytes
    next_register = skip_args
    result: List[ArgPlacement] = []

    def place_on_stack(arg: ArgInfo) -> None:
        nonlocal next_stack_offset, stack_bytes
        slot = ArgPlacement(arg, arch)
        slot.stack_offset = next_stack_offset
        result.append(slot)
        next_stack_offset += slot.slot_size
        stack_bytes += slot.slot_size

    def place_in_register(arg: ArgInfo) -> None:
        nonlocal next_register
        slot = ArgPlacement(arg, arch)
        slot.register = registers[next_register]
        result.append(slot)
        next_register += 1

    if uses_x86_fastcall:
        # Special case for x86 fastcall: stop using registers if any spill onto
        # the stack. Only a leading run of 4-byte INT args goes into registers.
        in_registers = 0
        for arg in args[: len(registers)]:
            if next_register >= len(registers):
                break
            fits_register = _slot_size_for_arch(arg, arch) == 4
            if arg.arg_class != ArgClass.INT or not fits_register:
                break
            place_in_register(arg)
            in_registers += 1

        for arg in args[in_registers:]:
            place_on_stack(arg)
    else:
        for arg in args:
            slot_size = _slot_size_for_arch(arg, arch)
            register_eligible = (
                arg.arg_class == ArgClass.INT
                and slot_size <= word_size
                and next_register < len(registers)
            )
            if register_eligible:
                place_in_register(arg)
            else:
                place_on_stack(arg)

    return ArgLayout(args=result, stack_size=stack_bytes)


def describe_arg_placement(placement: ArgPlacement) -> str:
    """Format a placement as ``<register>[<slot>]`` or ``stack+<offset>[<slot>]``.

    A register takes precedence over a stack offset.

    Raises:
        ValueError: if the placement has neither a register nor a stack offset.
    """
    if placement.register is not None:
        location = placement.register
    elif placement.stack_offset is not None:
        location = f"stack+{placement.stack_offset}"
    else:
        raise ValueError(f"Unassigned placement {placement}")
    return f"{location}[{placement.slot_size}]"


def collect_functions(
    tu: TranslationUnit, ns_filter: Optional[str], arch: Arch
) -> List[FuncInfo]:
    """Collect annotated function declarations from the translation unit.

    Only declarations at translation-unit or namespace scope are visited, and
    only those carrying a recognized ``CC:...`` annotation are kept. When
    ``ns_filter`` is given, the enclosing namespace path must match it exactly.
    Results are keyed by unqualified name, so a later declaration with the
    same name replaces an earlier one.

    Args:
        tu: Parsed translation unit to scan.
        ns_filter: ``::``-separated namespace path, or ``None``/empty for no
            filtering.
        arch: Not used by this function.

    Returns:
        The collected functions sorted by name.
    """
    wanted_ns = ns_filter.split("::") if ns_filter else None
    found: dict[str, FuncInfo] = {}

    def record(decl: Cursor) -> None:
        namespaces = _cursor_namespace(decl)
        if wanted_ns is not None and namespaces != wanted_ns:
            return
        func_name = decl.spelling
        if not func_name:
            return
        annotated_cc = _source_cc_from_annotations(decl)
        if annotated_cc == CallingConv.DEFAULT:
            return  # No CC annotation; skip
        func_type = decl.type
        found[func_name] = FuncInfo(
            qualified_ns="::".join(namespaces),
            name=func_name,
            mangled=decl.mangled_name or func_name,
            source_cc=annotated_cc,
            target_cc=_get_function_calling_conv(func_type),
            variadic=func_type.is_function_variadic(),
            return_type=_calculate_arg_info(func_type.get_result()),
            args=_collect_args(func_type),
        )

    def walk(node: Cursor) -> None:
        if node.kind == CursorKind.FUNCTION_DECL:
            record(node)
        elif node.kind in (CursorKind.TRANSLATION_UNIT, CursorKind.NAMESPACE):
            # Only these containers are descended into.
            for child in node.get_children():
                walk(child)

    root = tu.cursor
    if root is not None:
        walk(root)
    return sorted(found.values(), key=lambda info: info.name)


def _type_to_string(t: CXType) -> str:
    """Return the spelling of ``t`` with ``struct``/``union``/``enum`` tags removed."""
    text = t.spelling
    for tag in ("struct ", "union ", "enum "):
        text = text.replace(tag, "")
    return text


def collect_typedefs(tu: TranslationUnit, arch: Arch) -> List[TypedefInfo]:
    """Collect annotated function-pointer typedefs from the translation unit.

    Only typedefs at translation-unit or namespace scope are visited. A typedef
    is kept when its underlying type is a pointer to a prototyped function and
    it carries a recognized ``CC:...`` annotation. Here ``source_cc`` is the
    convention libclang reports for the function type and ``target_cc`` is the
    annotated one. Results are keyed by typedef name, so a later typedef with
    the same name replaces an earlier one.

    Args:
        tu: Parsed translation unit to scan.
        arch: Not used by this function.

    Returns:
        The collected typedefs sorted by name.
    """
    found: dict[str, TypedefInfo] = {}

    def record(decl: Cursor) -> None:
        alias = decl.spelling
        if not alias:
            return
        underlying = decl.underlying_typedef_type
        if underlying.kind != TypeKind.POINTER:
            return
        func_type = underlying.get_pointee()
        if func_type.kind != TypeKind.FUNCTIONPROTO:
            return

        declared_cc = _get_function_calling_conv(func_type)
        annotated_cc = _source_cc_from_annotations(decl)
        if annotated_cc == CallingConv.DEFAULT:
            return  # No CC annotation; skip

        found[alias] = TypedefInfo(
            name=alias,
            source_cc=declared_cc,
            target_cc=annotated_cc,
            variadic=func_type.is_function_variadic(),
            return_type=_calculate_arg_info(func_type.get_result()),
            args=_collect_args(func_type),
        )

    def walk(node: Cursor) -> None:
        if node.kind == CursorKind.TYPEDEF_DECL:
            record(node)
        elif node.kind in (CursorKind.TRANSLATION_UNIT, CursorKind.NAMESPACE):
            # Only these containers are descended into.
            for child in node.get_children():
                walk(child)

    root = tu.cursor
    if root is not None:
        walk(root)
    return sorted(found.values(), key=lambda info: info.name)


def collect_variables(tu: TranslationUnit, ns_filter: Optional[str]) -> List[VarInfo]:
    """Collect extern variable declarations from the translation unit.

    Only declarations at translation-unit or namespace scope are visited.
    Variables that are definitions or do not have extern storage are ignored.
    When ``ns_filter`` is given, the enclosing namespace path must match it
    exactly. Results are keyed by unqualified name and returned sorted by name.
    """
    wanted_ns = ns_filter.split("::") if ns_filter else None
    found: dict[str, VarInfo] = {}

    def record(decl: Cursor) -> None:
        if decl.storage_class != StorageClass.EXTERN or decl.is_definition():
            return
        namespaces = _cursor_namespace(decl)
        if wanted_ns is not None and namespaces != wanted_ns:
            return
        var_name = decl.spelling
        if not var_name:
            return
        found[var_name] = VarInfo(
            qualified_ns="::".join(namespaces),
            name=var_name,
        )

    def walk(node: Cursor) -> None:
        if node.kind == CursorKind.VAR_DECL:
            record(node)
        elif node.kind in (CursorKind.TRANSLATION_UNIT, CursorKind.NAMESPACE):
            for child in node.get_children():
                walk(child)

    root = tu.cursor
    if root is not None:
        walk(root)
    return sorted(found.values(), key=lambda info: info.name)


def emit_cc_thunk32(f: FuncInfo | TypedefInfo, lines: List[str]) -> None:
    """Append the body of a 32-bit calling-convention thunk to ``lines``.

    A ``TypedefInfo`` produces a host-to-guest thunk that calls through
    ``[eax+4]`` (the first stack argument of the thunk itself, which is why the
    source layout skips one argument). A ``FuncInfo`` produces a guest-to-host
    thunk that calls the function's mangled symbol with a 16-byte aligned
    stack.

    The emitted sequence is: load the TEB pointer, swap the fs/gs selectors
    with the ones saved in the TEB, switch to the stack pointer saved in the
    TEB, copy/load arguments for the target convention, call, then undo the
    segment and stack swaps and return. ``GET_TEB_HOST`` and the ``TEB_*``
    names are macros/constants expected to be provided by the assembly that
    includes this output (not defined in this file).

    Variadic functions are not converted: both conventions must be cdecl and a
    bare ``jmp`` to the call target is emitted.

    Raises:
        NotImplementedError: if the source convention passes a needed argument
            in a register, if source and target slot sizes differ, if the
            return class is not INT, or if the return value exceeds 8 bytes.
    """
    if isinstance(f, TypedefInfo):
        # Host-to-guest
        call_target = "[eax+4]"
        align = 0
        host_to_guest = True
    elif isinstance(f, FuncInfo):
        # Guest-to-host
        call_target = f.mangled
        align = 16
        host_to_guest = False

    if f.variadic:
        # Variadic functions are not yet supported for calling convention conversion.
        assert f.source_cc == CallingConv.C and f.target_cc == CallingConv.C, (
            "Variadic functions must be cdecl"
        )
        lines.append(f"\tjmp {call_target}")
        return

    # Source offsets are relative to the caller's esp at thunk entry: offset 0
    # holds the return address, so the first stack argument starts at 4.
    source_layout = compute_arg_layout(
        f.args,
        f.source_cc,
        Arch.X86,
        stack_offset=4,
        skip_args=1 if host_to_guest else 0,
    )
    target_layout = compute_arg_layout(f.args, f.target_cc, Arch.X86)

    # Get current TEB
    if host_to_guest:
        lines.append("\tGET_TEB_HOST ecx")
    else:
        lines.append("\tmov ecx, fs:[TEB_SELF]")

    # Swap fs and gs
    lines.append("\tmov ax, fs")
    lines.append("\tmov dx, word ptr [ecx+TEB_FS_SEL]")
    lines.append("\tmov word ptr [ecx+TEB_FS_SEL], ax")
    lines.append("\tmov fs, dx")
    lines.append("\tmov ax, gs")
    lines.append("\tmov dx, word ptr [ecx+TEB_GS_SEL]")
    lines.append("\tmov word ptr [ecx+TEB_GS_SEL], ax")
    lines.append("\tmov gs, dx")

    # Store guest stack pointer in eax for arg access
    # (host-to-guest always needs it because the call target is [eax+4])
    if len(f.args) > 0 or host_to_guest:
        lines.append("\tmov eax, esp")

    # Swap stack pointer
    lines.append("\tpush ebp")
    lines.append("\tmov ebp, dword ptr [ecx+TEB_SP]")
    lines.append("\tmov dword ptr [ecx+TEB_SP], esp")
    lines.append("\tmov esp, ebp")

    # Allocate stack space for arguments
    if target_layout.stack_size > 0:
        lines.append(f"\tsub esp, {target_layout.stack_size}")

    # Align stack if needed (must be done after allocating args)
    if align > 0:
        lines.append(f"\tand esp, ~{align - 1}")

    # Copy args onto stack for the callee
    for idx, target in enumerate(target_layout.args):
        if target.stack_offset is None:
            continue

        source = source_layout.args[idx]
        if source.stack_offset is None:
            raise NotImplementedError(
                f"Source calling convention {f.source_cc.name} requires register argument {idx}; not implemented"
            )

        if source.slot_size != target.slot_size:
            raise NotImplementedError(
                f"Argument {idx} requires size conversion {source.slot_size}->{target.slot_size}; not implemented"
            )

        # Copy one dword at a time through ecx (8-byte slots take two moves)
        for off in range(0, target.slot_size, 4):
            lines.append(f"\tmov ecx, [eax+{source.stack_offset + off}]")
            lines.append(f"\tmov [esp+{target.stack_offset + off}], ecx")

    # Load args into registers as needed
    # (done after the stack copies, which use ecx as scratch)
    for idx, target in enumerate(target_layout.args):
        if target.register is None:
            continue

        source = source_layout.args[idx]
        if source.stack_offset is None:
            raise NotImplementedError(
                f"Source calling convention {f.source_cc.name} requires register argument {idx}; not implemented"
            )

        lines.append(f"\tmov {target.register}, [eax+{source.stack_offset}]")

    # Call into target
    lines.append(f"\tcall {call_target}")

    # Determine if we can clobber eax/edx
    if f.return_type.arg_class != ArgClass.INT:
        raise NotImplementedError(
            f"Unsupported return type class {f.return_type.arg_class.value} for function {f.name}"
        )
    return_size = f.return_type.type.get_size()
    # eax carries the low 4 bytes of the result and edx the next 4; each is
    # preserved across the segment swap below only when the result uses it.
    save_eax = return_size > 0
    save_edx = return_size > 4
    if return_size > 8:
        raise NotImplementedError(
            f"Return size {return_size} not supported for function {f.name}"
        )

    # Restore segment registers
    if save_eax:
        lines.append("\tpush eax")
    if save_edx:
        lines.append("\tpush edx")
    if host_to_guest:
        lines.append("\tmov ecx, fs:[TEB_SELF]")
    else:
        lines.append("\tGET_TEB_HOST ecx")
    lines.append("\tmov ax, fs")
    lines.append("\tmov dx, word ptr [ecx+TEB_FS_SEL]")
    lines.append("\tmov word ptr [ecx+TEB_FS_SEL], ax")
    lines.append("\tmov fs, dx")
    lines.append("\tmov ax, gs")
    lines.append("\tmov dx, word ptr [ecx+TEB_GS_SEL]")
    lines.append("\tmov word ptr [ecx+TEB_GS_SEL], ax")
    lines.append("\tmov gs, dx")
    if save_edx:
        lines.append("\tpop edx")
    if save_eax:
        lines.append("\tpop eax")

    # Swap stack pointer
    lines.append("\tmov esp, ebp")  # Clean up arg space
    lines.append("\tmov ebp, dword ptr [ecx+TEB_SP]")
    lines.append("\tmov dword ptr [ecx+TEB_SP], esp")

    # Restore stack and frame pointer
    lines.append("\tleave")

    # Return to guest
    # (a stdcall source expects the thunk to pop its stack arguments)
    if f.source_cc == CallingConv.X86_STDCALL and source_layout.stack_size > 0:
        lines.append(f"\tret {source_layout.stack_size}")
    else:
        lines.append("\tret")


def _x64_register_by_slot_size(reg: str, slot_size: int) -> str:
    if slot_size == 8:
        return reg
    if reg in ["rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "rsp"]:
        if slot_size == 4:
            return f"e{reg[1:]}"
        elif slot_size == 2:
            return reg[1:]
        elif slot_size == 1:
            if reg in ["rax", "rbx", "rcx", "rdx"]:
                return f"{reg[1]}l"
            elif reg in ["rsi", "rdi"]:
                return f"{reg[1]}il"
            else:
                return f"{reg[1]}pl"
    if slot_size == 4:
        return f"{reg}d"
    if slot_size == 2:
        return f"{reg}w"
    if slot_size == 1:
        return f"{reg}b"
    raise NotImplementedError(f"Unsupported register {reg} for slot size {slot_size}")


def _x64_ptr_type_by_slot_size(slot_size) -> str:
    if slot_size == 4:
        return "dword ptr"
    elif slot_size == 8:
        return "qword ptr"
    else:
        raise ValueError(f"Unsupported slot size {slot_size}")


def emit_cc_thunk64(f: FuncInfo | TypedefInfo, lines: List[str]):
    """Append the body of a thunk bridging 64-bit host and 32-bit guest code.

    A ``TypedefInfo`` produces a host-to-guest thunk: it starts in 64-bit
    mode, switches to the stack saved in the TEB, writes the arguments to that
    stack using the x86 layout, drops to 32-bit mode, calls through ``edi``
    and returns to 64-bit mode (sign-extending eax into rax when the return
    type is marked sign-extended).

    A ``FuncInfo`` produces a guest-to-host thunk: it starts in 32-bit mode,
    switches to 64-bit mode and the stack saved in the TEB, moves the
    arguments from the guest stack into the x86_64 layout (sign-extending
    32-bit values where flagged), calls the mangled symbol with a 16-byte
    aligned stack and returns to 32-bit mode.

    ``GET_TEB_HOST``, ``LJMP32``, ``LJMP64`` and the ``TEB_*`` names are
    macros/constants expected to be provided by the assembly that includes
    this output (not defined in this file). FS base/selector save and restore
    is skipped when the generator itself runs on darwin.

    Variadic functions are not converted: both conventions must be cdecl and a
    bare ``jmp`` to the call target is emitted.

    Raises:
        NotImplementedError: for unsupported argument placements or slot-size
            conversions.
        ValueError: if a source placement has neither a register nor a stack
            offset, or a slot size has no pointer-type keyword.
    """
    if isinstance(f, TypedefInfo):
        # Host-to-guest
        call_target = "edi"
        align = 0
        host_to_guest = True
    elif isinstance(f, FuncInfo):
        # Guest-to-host
        call_target = f.mangled
        align = 16
        host_to_guest = False

    if f.variadic:
        # Variadic functions are not yet supported for calling convention conversion.
        assert f.source_cc == CallingConv.C and f.target_cc == CallingConv.C, (
            "Variadic functions must be cdecl"
        )
        lines.append(f"\tjmp {call_target}")
        return

    # Source stack offsets are relative to r10, which holds the incoming stack
    # pointer after the thunk's own register pushes: return address plus two
    # 8-byte pushes (host-to-guest) or four 4-byte pushes (guest-to-host).
    source_layout = compute_arg_layout(
        f.args,
        f.source_cc,
        Arch.X86_64 if host_to_guest else Arch.X86,
        stack_offset=24 if host_to_guest else 20,
        skip_args=1 if host_to_guest else 0,
    )
    target_layout = compute_arg_layout(
        f.args, f.target_cc, Arch.X86 if host_to_guest else Arch.X86_64
    )

    if host_to_guest:
        lines.append(".code64")

        # Save rbx and rbp
        lines.append("\tpush rbp")
        lines.append("\tpush rbx")

        # Stash host stack in r10
        lines.append("\tmov r10, rsp")

        # Get current TEB
        lines.append("\tGET_TEB_HOST rbx")

        if sys.platform != "darwin":
            # Save FS base
            lines.append("\trdfsbase r9")
            lines.append("\tmov qword ptr [rbx+TEB_FSBASE], r9")

        # Save RSP and load guest stack
        lines.append("\tmov rbp, qword ptr [rbx+TEB_SP]")
        lines.append("\tmov qword ptr [rbx+TEB_SP], rsp")
        lines.append("\tmov rsp, rbp")

        # Allocate stack space for arguments
        if target_layout.stack_size > 0:
            lines.append(f"\tsub rsp, {target_layout.stack_size}")

        # Align stack if needed (must be done after allocating args)
        if align > 0:
            lines.append(f"\tand rsp, ~{align - 1}")

        # Transfer arguments
        for i, target in enumerate(target_layout.args):
            if target.stack_offset is None:
                raise NotImplementedError(f"Unexpected register argument {target}")

            source = source_layout.args[i]
            if source.stack_offset is not None:
                # Load using the SOURCE slot width for both the register and
                # the memory operand so the operand sizes agree, then store
                # the target-width view of rax below.
                ptr_type = _x64_ptr_type_by_slot_size(source.slot_size)
                register = _x64_register_by_slot_size("rax", source.slot_size)
                lines.append(
                    f"\tmov {register}, {ptr_type} [r10+{source.stack_offset}]"
                )
                ptr_type = _x64_ptr_type_by_slot_size(target.slot_size)
                register = _x64_register_by_slot_size("rax", target.slot_size)
            elif source.register is not None:
                ptr_type = _x64_ptr_type_by_slot_size(target.slot_size)
                register = _x64_register_by_slot_size(source.register, target.slot_size)
            else:
                raise ValueError(f"Argument {i} is not a register or stack offset")
            lines.append(f"\tmov {ptr_type} [rsp+{target.stack_offset}], {register}")

        # Jump to 32-bit mode
        lines.append("\tLJMP32 rbx")

        # Setup FS selector
        lines.append("\tmov ax, word ptr [ebx+TEB_FS_SEL]")
        lines.append("\tmov fs, ax")

        # Call into target
        lines.append(f"\tcall {call_target}")

        # Get current TEB (32-bit code may clobber ebx)
        lines.append("\tmov ebx, fs:[TEB_SELF]")

        # Jump back to 64-bit
        lines.append("\tLJMP64 ebx")

        # Sign extend return value if necessary
        if f.return_type.sign_extended:
            lines.append("\tcdqe")

        if sys.platform != "darwin":
            # Restore FS base
            lines.append("\tmov r9, qword ptr [rbx+TEB_FSBASE]")
            lines.append("\twrfsbase r9")

        # Restore host stack
        lines.append("\tmov rsp, qword ptr [rbx+TEB_SP]")
        lines.append("\tmov qword ptr [rbx+TEB_SP], rbp")

        # Restore rbp, rbx and return
        lines.append("\tpop rbx")
        lines.append("\tpop rbp")
        lines.append("\tret")
    else:
        lines.append(".code32")

        # Save registers
        lines.append("\tpush ebp")
        lines.append("\tpush esi")
        lines.append("\tpush edi")
        lines.append("\tpush ebx")

        # Get current TEB
        lines.append("\tmov ebx, fs:[TEB_SELF]")

        if sys.platform != "darwin":
            # Save fs segment
            lines.append("\tmov di, fs")
            lines.append("\tmov word ptr [ebx+TEB_FS_SEL], di")

        # Jump back to 64-bit
        lines.append("\tLJMP64 ebx")

        if sys.platform != "darwin":
            # Restore FS base
            lines.append("\tmov r9, qword ptr [rbx+TEB_FSBASE]")
            lines.append("\twrfsbase r9")

        # Stash guest stack in r10
        lines.append("\tmov r10, rsp")

        # Restore host stack
        lines.append("\tmov rbp, qword ptr [rbx+TEB_SP]")
        lines.append("\tmov qword ptr [rbx+TEB_SP], rsp")
        lines.append("\tmov rsp, rbp")

        # Allocate stack space for arguments
        if target_layout.stack_size > 0:
            lines.append(f"\tsub rsp, {target_layout.stack_size}")

        # Align stack if needed (must be done after allocating args)
        if align > 0:
            lines.append(f"\tand rsp, ~{align - 1}")

        # Transfer args
        for i, target in enumerate(target_layout.args):
            arg = f.args[i]
            source = source_layout.args[i]

            if target.stack_offset is not None:
                if source.stack_offset is not None:
                    ptr_type = _x64_ptr_type_by_slot_size(source.slot_size)
                    register = _x64_register_by_slot_size("rax", source.slot_size)
                    lines.append(
                        f"\tmov {register}, {ptr_type} [r10+{source.stack_offset}]"
                    )
                    ptr_type = _x64_ptr_type_by_slot_size(target.slot_size)
                    register = _x64_register_by_slot_size("rax", target.slot_size)
                elif source.register is not None:
                    ptr_type = _x64_ptr_type_by_slot_size(target.slot_size)
                    register = _x64_register_by_slot_size(
                        source.register, target.slot_size
                    )
                else:
                    raise ValueError(f"Argument {i} is not a register or stack offset")
                lines.append(
                    f"\tmov {ptr_type} [rsp+{target.stack_offset}], {register}"
                )
            elif target.register is not None:
                # The loads below read from the guest stack; a source argument
                # that lives in a register has no stack offset to read from.
                # Fail here instead of emitting "[r10+None]".
                if source.stack_offset is None:
                    raise NotImplementedError(
                        f"Source calling convention {f.source_cc.name} requires register argument {i}; not implemented"
                    )
                ptr_type = _x64_ptr_type_by_slot_size(source.slot_size)
                if source.slot_size == 4 and target.slot_size == 8:
                    if arg.sign_extended:
                        register = _x64_register_by_slot_size(
                            target.register, source.slot_size
                        )
                        lines.append(
                            f"\tmov {register}, {ptr_type} [r10+{source.stack_offset}]"
                        )
                        lines.append(f"\tmovsxd {target.register}, {register}")
                    else:
                        # No explicit extension is emitted for this case; only
                        # the 32-bit view of the register is loaded.
                        register = _x64_register_by_slot_size(
                            target.register, source.slot_size
                        )
                        lines.append(
                            f"\tmov {register}, {ptr_type} [r10+{source.stack_offset}]"
                        )
                elif source.slot_size == 8 and target.slot_size == 8:
                    lines.append(
                        f"\tmov {target.register}, {ptr_type} [r10+{source.stack_offset}]"
                    )
                else:
                    raise NotImplementedError(
                        f"Unsupported conversion from {source.slot_size} to {target.slot_size}"
                    )

        # Call into target
        lines.append(f"\tcall {call_target}")

        # Restore host stack
        lines.append("\tmov rsp, qword ptr [rbx+TEB_SP]")
        lines.append("\tmov qword ptr [rbx+TEB_SP], rbp")

        # Jump to 32-bit mode
        lines.append("\tLJMP32 rbx")

        if sys.platform != "darwin":
            # Setup FS selector
            lines.append("\tmov di, word ptr [ebx+TEB_FS_SEL]")
            lines.append("\tmov fs, di")

        # Restore registers
        lines.append("\tpop ebx")
        lines.append("\tpop edi")
        lines.append("\tpop esi")
        lines.append("\tpop ebp")

        # Return to guest
        # (a stdcall source expects the thunk to pop its stack arguments)
        if f.source_cc == CallingConv.X86_STDCALL and source_layout.stack_size > 0:
            lines.append(f"\tret {source_layout.stack_size}")
        else:
            lines.append("\tret")


def emit_cc_thunk(f: FuncInfo | TypedefInfo, lines: List[str], arch: Arch):
    """Emit the thunk body for ``f`` using the emitter that matches ``arch``.

    Nothing is emitted when ``arch`` is neither ``Arch.X86`` nor
    ``Arch.X86_64``.
    """
    if arch == Arch.X86:
        return emit_cc_thunk32(f, lines)
    if arch == Arch.X86_64:
        return emit_cc_thunk64(f, lines)


def emit_guest_to_host_thunks(
    lines: List[str], dll: str, funcs: Iterable[FuncInfo], arch: Arch
) -> None:
    """Append one ``thunk_<dll>_<name>`` guest-to-host thunk per function.

    Each thunk is preceded by a blank line, a comment describing the function
    and one comment per argument showing its source placement (laid out for
    x86) and destination placement (laid out for ``arch``). The body is wrapped
    in ``ASM_GLOBAL``/``ASM_END``.
    """
    for func in funcs:
        symbol = f"thunk_{dll}_{func.name}"
        lines.append("")
        lines.append(
            f"# {func.qualified_ns}::{func.name} (source_cc={func.source_cc.name}, target_cc={func.target_cc.name}, variadic={func.variadic})"
        )

        # These layouts are only used for the descriptive comments below.
        guest_layout = compute_arg_layout(func.args, func.source_cc, Arch.X86)
        host_layout = compute_arg_layout(func.args, func.target_cc, arch)
        placements = zip(func.args, guest_layout.args, host_layout.args)
        for index, (arg, src, dst) in enumerate(placements):
            summary = ", ".join(
                [
                    f"src={describe_arg_placement(src)}",
                    f"dst={describe_arg_placement(dst)}",
                    f"class={arg.arg_class.value}",
                    f"sign_extended={arg.sign_extended}",
                ]
            )
            lines.append(f"\t# Arg {index} ({summary})")

        lines.append(f"ASM_GLOBAL({symbol}, @function)")
        emit_cc_thunk(func, lines, arch)
        lines.append(f"ASM_END({symbol})")


def emit_host_to_guest_thunks(
    lines: List[str], typedefs: Iterable[TypedefInfo], arch: Arch
) -> None:
    """Append one weak ``call_<name>`` host-to-guest thunk per typedef.

    Each thunk is preceded by a blank line, a comment describing the typedef
    and one comment per argument showing its source placement (laid out for
    ``arch`` with one leading argument skipped) and destination placement
    (laid out for x86). The body is wrapped in ``ASM_WEAK``/``ASM_END``.
    Return-value details are not emitted.
    """
    for typedef in typedefs:
        symbol = f"call_{typedef.name}"
        lines.append("")
        lines.append(
            f"# {typedef.name} (target_cc={typedef.target_cc.name}, variadic={typedef.variadic})"
        )

        # These layouts are only used for the descriptive comments below.
        host_layout = compute_arg_layout(
            typedef.args, typedef.source_cc, arch, skip_args=1
        )
        guest_layout = compute_arg_layout(typedef.args, typedef.target_cc, Arch.X86)
        placements = zip(typedef.args, host_layout.args, guest_layout.args)
        for index, (arg, src, dst) in enumerate(placements):
            summary = ", ".join(
                [
                    f"src={describe_arg_placement(src)}",
                    f"dst={describe_arg_placement(dst)}",
                    f"class={arg.arg_class.value}",
                    f"sign_extended={arg.sign_extended}",
                ]
            )
            lines.append(f"\t# Arg {index} ({summary})")

        lines.append(f"ASM_WEAK({symbol}, @function)")
        emit_cc_thunk(typedef, lines, arch)
        lines.append(f"ASM_END({symbol})")


def emit_header_mapping(
    dll: str,
    funcs: Iterable[FuncInfo],
    typedefs: Iterable[TypedefInfo],
    variables: Iterable[VarInfo],
    arch: Arch,
) -> str:
    """Build the text of the generated header that accompanies the thunks.

    The header contains, in order:

    * an include guard named ``WIBO_GEN_<DLL>_THUNKS_H``;
    * an ``extern "C"`` section holding a best-effort prototype for every
      guest-to-host thunk (``thunk_<dll>_<name>``) followed by a prototype
      for every non-variadic host-to-guest thunk (``call_<typedef>``);
    * a ``static inline`` function ``<dll>ThunkByName`` that maps a name to
      the address of the matching thunk or variable using ``strcmp``.

    Args:
        dll: DLL name; used in the guard, the thunk symbols and the lookup
            function name.
        funcs: Functions to declare guest-to-host thunks for. May be any
            iterable, including a one-shot generator.
        typedefs: Function typedefs to declare host-to-guest thunks for;
            variadic ones are skipped.
        variables: Variables to expose through the by-name lookup.
        arch: Target architecture. For ``Arch.X86_64`` no calling-convention
            attribute is emitted on the guest-to-host prototypes.

    Returns:
        The complete header text, terminated by a newline.

    Raises:
        NotImplementedError: If ``arch`` is not ``Arch.X86_64`` and a
            function's source calling convention is neither ``X86_STDCALL``
            nor ``C``.
    """

    def _is_opaque(t: Type) -> bool:
        # Records, enums and function types are treated as opaque, as is any
        # pointer chain that ends in one of them.
        if t.kind in (
            TypeKind.RECORD,
            TypeKind.ENUM,
            TypeKind.FUNCTIONPROTO,
            TypeKind.FUNCTIONNOPROTO,
        ):
            return True
        return t.kind == TypeKind.POINTER and _is_opaque(
            t.get_pointee().get_canonical()
        )

    def _canonical_type_str(t: Type) -> str:
        # Opaque types are spelled "void *"; everything else uses the
        # canonical type's spelling.
        c = t.get_canonical()
        if _is_opaque(c):
            return "void *"
        return c.spelling

    # ``funcs`` is walked twice (prototypes, then the lookup table). Snapshot
    # it so that a one-shot iterable is not exhausted by the first pass, which
    # would silently drop every function from the lookup table.
    func_list: List[FuncInfo] = list(funcs)

    guard = f"WIBO_GEN_{dll.upper()}_THUNKS_H"
    lines: List[str] = [
        "/* Auto-generated; DO NOT EDIT. */",
        f"#ifndef {guard}",
        f"#define {guard}",
        "#include <stddef.h>",
        "#include <string.h>",
        '#ifdef __cplusplus\nextern "C" {\n#endif',
    ]

    # Guest-to-host thunk functions.
    # Generate best-effort function prototype so that simple thunks can be
    # called directly in special cases (e.g. thunk_entry_stubBase)
    for f in func_list:
        thunk = f"thunk_{dll}_{f.name}"
        param_list = ", ".join(
            [f"{_canonical_type_str(arg.type)} arg{i}" for i, arg in enumerate(f.args)]
        )
        return_type = _canonical_type_str(f.return_type.type)
        if arch == Arch.X86_64:
            cc_attr = ""
        elif f.source_cc == CallingConv.X86_STDCALL:
            cc_attr = "__attribute__((stdcall)) "
        elif f.source_cc == CallingConv.C:
            cc_attr = "__attribute__((cdecl)) "
        else:
            raise NotImplementedError(
                f"Unsupported calling convention {f.source_cc.name} for function {f.name}"
            )
        lines.append(f"{cc_attr}{return_type} {thunk}({param_list});")

    # Host-to-guest thunk functions (no prototype is emitted for variadic ones).
    for td in typedefs:
        if td.variadic:
            continue
        thunk = f"call_{td.name}"

        # The callee's function pointer is always the first parameter.
        params = [f"{td.name} fn"]
        params.extend(
            [f"{_type_to_string(arg.type)} arg{i}" for i, arg in enumerate(td.args)]
        )

        param_list = ", ".join(params)
        return_type = _type_to_string(td.return_type.type)
        lines.append(f"{return_type} {thunk}({param_list});")

    lines.append("#ifdef __cplusplus\n}\n#endif")
    lines.append("")
    # name->address helper for resolveByName
    lines.append(f"static inline void *{dll}ThunkByName(const char *name) {{")
    for f in func_list:
        lines.append(
            f'\tif (strcmp(name, "{f.name}") == 0) return (void*)&thunk_{dll}_{f.name};'
        )
    for v in variables:
        # Variables are looked up by bare name but referenced by their
        # namespace-qualified name when a namespace is recorded.
        qualified = f"{v.qualified_ns}::{v.name}" if v.qualified_ns else v.name
        lines.append(
            f'\tif (strcmp(name, "{v.name}") == 0) return (void*)&{qualified};'
        )
    lines.append("\treturn NULL;")
    lines.append("}")

    lines.append(f"#endif /* {guard} */\n")
    return "\n".join(lines)


def main() -> int:
    """Command-line entry point: scan headers and write the thunk outputs.

    Parses the command line, scans the given headers for functions, typedefs
    and variables, then writes the generated assembly and header files,
    creating their parent directories when needed.

    Returns:
        ``0`` once both output files are written, or ``1`` when the scan
        found no functions, typedefs, or variables.

    Raises:
        ValueError: If the architecture is neither ``x86`` nor ``x86_64``
            (argparse's ``choices`` normally rejects such values first).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dll", required=True, help="DLL name, e.g. kernel32")
    parser.add_argument(
        "--headers", nargs="+", required=True, help="Header files to scan"
    )
    parser.add_argument(
        "--namespace", dest="ns", default=None, help="Namespace filter, e.g. kernel32"
    )
    parser.add_argument("--arch", choices=["x86", "x86_64"], default="x86")
    parser.add_argument(
        "--out-asm", type=Path, required=True, help="Output assembly file (.S)"
    )
    parser.add_argument(
        "--out-hdr", type=Path, required=True, help="Output header file (.h)"
    )
    parser.add_argument("-I", dest="incs", action="append", default=[])
    opts = parser.parse_args()

    # Pick the generator architecture and the clang target triple to parse with.
    if opts.arch == "x86_64":
        arch = Arch.X86_64
        target = (
            "x86_64-apple-darwin" if sys.platform == "darwin" else "x86_64-linux-gnu"
        )
    elif opts.arch == "x86":
        arch, target = Arch.X86, "i686-linux-gnu"
    else:
        raise ValueError(f"Unsupported architecture: {opts.arch}")

    tu = parse_tu(opts.headers, opts.incs, target)
    funcs = collect_functions(tu, opts.ns, arch)
    typedefs = collect_typedefs(tu, arch)
    variables = collect_variables(tu, opts.ns)

    if not (funcs or typedefs or variables):
        sys.stderr.write("No functions, typedefs, or variables found for generation.\n")
        return 1

    asm_lines: List[str] = [
        "# Auto-generated thunks; DO NOT EDIT.",
        '#include "macros.S"',
        ".text",
    ]
    emit_guest_to_host_thunks(asm_lines, opts.dll, funcs, arch)
    emit_host_to_guest_thunks(asm_lines, typedefs, arch)
    asm_text = "\n".join(asm_lines) + "\n"

    hdr_text = emit_header_mapping(opts.dll, funcs, typedefs, variables, arch)

    # Create both output directories before writing either file.
    outputs = ((opts.out_asm, asm_text), (opts.out_hdr, hdr_text))
    for path, _ in outputs:
        path.parent.mkdir(parents=True, exist_ok=True)
    for path, text in outputs:
        path.write_text(text)
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())