Experimental 64-bit host support

This commit is contained in:
2025-11-04 22:07:51 -07:00
parent 463686d01a
commit 3dd9fb77ff
64 changed files with 1993 additions and 844 deletions

View File

@@ -20,7 +20,7 @@ import os
import sys
import tempfile
from dataclasses import dataclass, field
from enum import IntEnum
from enum import Enum, IntEnum
from pathlib import Path
from typing import Iterable, List, Optional
@@ -50,6 +50,11 @@ if "LIBCLANG_PATH" in os.environ:
)
class Arch(str, Enum):
    """Supported target architectures for thunk generation."""

    X86 = "x86"
    X86_64 = "x86_64"
class CallingConv(IntEnum):
"""CXCallingConv enum values from clang-c/Index.h"""
@@ -91,13 +96,30 @@ def _get_function_calling_conv(func_type: CXType) -> CallingConv:
return CallingConv(_get_calling_conv(func_type))
class ArgClass(str, Enum):
    """How an argument is classified for passing.

    Only INT (register/stack integer class) and MEMORY are modeled here.
    """

    INT = "int"
    MEMORY = "memory"
@dataclass
class ArgInfo:
    """Architecture-independent description of one function argument."""

    # Clang type of the argument as written in the source.
    type: Type
    # Passing classification (INT vs MEMORY).
    arg_class: ArgClass
    # True when the value must be sign-extended if widened: signed integer
    # kinds and HANDLE-like typedefs (see _calculate_arg_info).
    sign_extended: bool
@dataclass
class ArgPlacement:
    """Concrete placement of one argument for a given arch/calling convention.

    NOTE(review): the hand-written __init__ below replaces the
    dataclass-generated one and initializes only size, slot_size, register
    and stack_offset — primitive, sign_extended and type are never assigned
    here. Confirm those fields are dead or set elsewhere.
    """

    size: int  # byte size of the canonical argument type
    slot_size: int  # bytes the argument occupies in its slot on this arch
    primitive: bool
    sign_extended: bool
    type: Type
    stack_offset: Optional[int] = None  # set when the argument lives on the stack
    register: Optional[str] = None  # set when the argument lives in a register

    def __init__(self, arg: ArgInfo, arch: Arch):
        # Placements start unassigned; compute_arg_layout later fills in
        # exactly one of register / stack_offset.
        self.size = arg.type.get_canonical().get_size()
        self.slot_size = _slot_size_for_arch(arg, arch)
        self.register = None
        self.stack_offset = None
@dataclass
@@ -108,7 +130,7 @@ class FuncInfo:
source_cc: CallingConv
target_cc: CallingConv
variadic: bool
return_type: Type
return_type: ArgInfo
args: List[ArgInfo] = field(default_factory=list)
@@ -118,7 +140,7 @@ class TypedefInfo:
source_cc: CallingConv
target_cc: CallingConv
variadic: bool
return_type: Type
return_type: ArgInfo
args: List[ArgInfo] = field(default_factory=list)
@@ -210,43 +232,146 @@ SIGNED_KINDS = [
]
def _calculate_arg_info(t: Type) -> ArgInfo:
    """Build the architecture-independent ArgInfo for one argument type.

    Every argument is currently classified as ArgClass.INT; values that must
    be sign-extended when widened (signed integer kinds and HANDLE-like
    typedefs) are flagged.

    Args:
        t: Clang type of the argument as written in the source.

    Returns:
        A populated ArgInfo for *t*.
    """
    canonical = t.get_canonical()
    # TODO: classify RECORD (struct/union) arguments as ArgClass.MEMORY once
    # struct passing is supported.
    arg_class = ArgClass.INT
    if canonical.kind == TypeKind.POINTER:
        pointee = canonical.get_pointee()
        if pointee.kind == TypeKind.POINTER:
            # Pointer-to-pointer crossing the thunk boundary is suspect;
            # flag it for manual review.
            print(f"Bugprone: Pointer to pointer ({_type_to_string(t)})")
    # Sign-extend signed integers and HANDLE-like typedefs
    is_sign_extended = canonical.kind in SIGNED_KINDS or _is_handle_typedef(t)
    return ArgInfo(
        arg_class=arg_class,
        sign_extended=is_sign_extended,
        type=t,
    )
def _collect_args(func_type: CXType) -> List[ArgInfo]:
    """Collect argument information for a function.

    The previous body retained a stale copy of the pre-refactor
    implementation, which constructed ArgInfo with keyword arguments the
    current dataclass no longer accepts (size/slot_size/primitive — a
    TypeError at runtime), compared ``canonical`` instead of
    ``canonical.kind`` against SIGNED_KINDS, and then appended a second
    ArgInfo via _calculate_arg_info. This is the intended implementation.

    Args:
        func_type: Clang function type whose parameters are inspected.

    Returns:
        One ArgInfo per declared parameter, in declaration order.
    """
    return [_calculate_arg_info(t) for t in func_type.argument_types()]
def collect_functions(tu: TranslationUnit, ns_filter: Optional[str]) -> List[FuncInfo]:
def _slot_size_for_arch(arg: ArgInfo, arch: Arch) -> int:
    """Return the slot size (in bytes) used to pass an argument on the given architecture."""
    canonical = arg.type.get_canonical()
    # Pointers always take a full machine word on the given architecture.
    if canonical.kind == TypeKind.POINTER:
        return 8 if arch == Arch.X86_64 else 4
    size = canonical.get_size()
    if arch == Arch.X86 and size <= 8:
        # 32-bit: values up to 4 bytes use one slot, up to 8 bytes use two.
        return 4 if size <= 4 else 8
    if arch == Arch.X86_64 and size <= 8:
        # 64-bit: everything that fits in a register uses one 8-byte slot.
        return 8
    raise NotImplementedError(
        f"Argument size {size} not supported for architecture {arch.value}"
    )
@dataclass
class ArgLayout:
    """Computed placements for a full argument list."""

    # One placement per argument, in declaration order.
    args: List[ArgPlacement]
    # Total number of bytes the stack-passed arguments occupy.
    stack_size: int
def compute_arg_layout(
    args: List[ArgInfo],
    cc: CallingConv,
    arch: Arch,
    stack_offset: int = 0,
    skip_args: int = 0,
) -> ArgLayout:
    """Compute how each argument is passed for the given calling convention and arch.

    Args:
        args: Architecture-independent descriptions of every argument.
        cc: Calling convention of the function being thunked.
        arch: Architecture the layout is computed for.
        stack_offset: Byte offset of the first stack argument (e.g. past a
            saved return address or saved registers).
        skip_args: Number of leading implicit arguments already consumed
            before these args; each is assumed to fit in one register.

    Returns:
        An ArgLayout with one ArgPlacement per entry in *args* and the total
        stack bytes consumed.
    """
    placements: List[ArgPlacement] = []
    stack_size = 0
    # Register order depends on (arch, cc); an empty list means all-stack.
    gpr_order: List[str] = []
    gpr_index = skip_args
    if arch == Arch.X86 and cc == CallingConv.X86_FASTCALL:
        gpr_order = ["ecx", "edx"]
    elif arch == Arch.X86_64 and cc == CallingConv.C:
        gpr_order = ["rdi", "rsi", "rdx", "rcx", "r8", "r9"]
    # Offset our stack based on number of extra arguments
    # We assume that every arg represented by skip_args fits in a register
    register_size = 8 if arch == Arch.X86_64 else 4
    consumed_stack = max(0, skip_args - len(gpr_order)) * register_size
    stack_offset += consumed_stack
    stack_size += consumed_stack

    def _push_stack(arg: ArgInfo) -> None:
        # Place *arg* at the current stack offset and advance by its slot size.
        nonlocal stack_offset
        nonlocal stack_size
        placement = ArgPlacement(arg, arch)
        placement.stack_offset = stack_offset
        placements.append(placement)
        stack_offset += placement.slot_size
        stack_size += placement.slot_size

    def _push_register(arg: ArgInfo) -> None:
        # Place *arg* in the next available GPR from gpr_order.
        nonlocal gpr_index
        placement = ArgPlacement(arg, arch)
        placement.register = gpr_order[gpr_index]
        placements.append(placement)
        gpr_index += 1

    # Special case for x86 fastcall: stop using registers if any spill onto the stack
    if arch == Arch.X86 and cc == CallingConv.X86_FASTCALL:
        stack_args_start = 0
        for i in range(min(len(gpr_order), len(args))):
            if gpr_index >= len(gpr_order):
                break
            arg = args[i]
            slot_size = _slot_size_for_arch(arg, arch)
            # Only 4-byte integer-class args are eligible for ECX/EDX.
            if arg.arg_class == ArgClass.INT and slot_size == 4:
                _push_register(arg)
                stack_args_start += 1
            else:
                break
        # Everything after the first ineligible argument goes on the stack.
        for i in range(stack_args_start, len(args)):
            _push_stack(args[i])
    else:
        for arg in args:
            slot_size = _slot_size_for_arch(arg, arch)
            if (
                arg.arg_class == ArgClass.INT
                and slot_size <= register_size
                and gpr_index < len(gpr_order)
            ):
                _push_register(arg)
            else:
                _push_stack(arg)
    return ArgLayout(args=placements, stack_size=stack_size)
def describe_arg_placement(placement: ArgPlacement) -> str:
    """Render a short human-readable summary of where an argument lives.

    Register placements render as ``reg[size]``, stack placements as
    ``stack+off[size]``; an unassigned placement is a ValueError.
    """
    reg = placement.register
    if reg is not None:
        return f"{reg}[{placement.slot_size}]"
    offset = placement.stack_offset
    if offset is not None:
        return f"stack+{offset}[{placement.slot_size}]"
    raise ValueError(f"Unassigned placement {placement}")
def collect_functions(
tu: TranslationUnit, ns_filter: Optional[str], arch: Arch
) -> List[FuncInfo]:
want_ns = ns_filter.split("::") if ns_filter else None
out: dict[str, FuncInfo] = {}
@@ -268,7 +393,7 @@ def collect_functions(tu: TranslationUnit, ns_filter: Optional[str]) -> List[Fun
source_cc=source_cc,
target_cc=_get_function_calling_conv(node.type),
variadic=node.type.is_function_variadic(),
return_type=node.type.get_result(),
return_type=_calculate_arg_info(node.type.get_result()),
args=_collect_args(node.type),
)
@@ -292,7 +417,7 @@ def _type_to_string(t: CXType) -> str:
return spelling
def collect_typedefs(tu: TranslationUnit) -> List[TypedefInfo]:
def collect_typedefs(tu: TranslationUnit, arch: Arch) -> List[TypedefInfo]:
"""Collect function pointer typedefs and type aliases from the translation unit."""
out: dict[str, TypedefInfo] = {}
@@ -309,17 +434,13 @@ def collect_typedefs(tu: TranslationUnit) -> List[TypedefInfo]:
if target_cc == CallingConv.DEFAULT:
return # No CC annotation; skip
variadic = func_type.is_function_variadic()
args = _collect_args(func_type)
return_type = func_type.get_result()
out[name] = TypedefInfo(
name=name,
source_cc=source_cc,
target_cc=target_cc,
variadic=variadic,
return_type=return_type,
args=args,
variadic=func_type.is_function_variadic(),
return_type=_calculate_arg_info(func_type.get_result()),
args=_collect_args(func_type),
)
def visit(node: Cursor) -> None:
@@ -372,17 +493,15 @@ def collect_variables(tu: TranslationUnit, ns_filter: Optional[str]) -> List[Var
return sorted(out.values(), key=lambda v: v.name)
def emit_cc_thunk(f: FuncInfo | TypedefInfo, lines: List[str]):
def emit_cc_thunk32(f: FuncInfo | TypedefInfo, lines: List[str]):
if isinstance(f, TypedefInfo):
# Host-to-guest
target = "[eax+4]"
arg_off = 8
call_target = "[eax+4]"
align = 0
host_to_guest = True
elif isinstance(f, FuncInfo):
# Guest-to-host
target = f.mangled
arg_off = 4
call_target = f.mangled
align = 16
host_to_guest = False
@@ -391,49 +510,32 @@ def emit_cc_thunk(f: FuncInfo | TypedefInfo, lines: List[str]):
assert f.source_cc == CallingConv.C and f.target_cc == CallingConv.C, (
"Variadic functions must be cdecl"
)
lines.append(f"\tjmp {target}")
lines.append(f"\tjmp {call_target}")
return
# Compute argument stack offsets
offsets: List[int] = []
for arg in f.args:
offsets.append(arg_off)
arg_off += arg.slot_size
reg_indices: List[int] = []
if f.target_cc == CallingConv.X86_FASTCALL:
# Store the first two non-record 4-byte args in ECX/EDX for GCC/Clang x86 fastcall
if len(f.args) >= 1 and f.args[0].primitive and f.args[0].slot_size == 4:
reg_indices.append(0) # ECX
if len(f.args) >= 2 and f.args[1].primitive and f.args[1].slot_size == 4:
reg_indices.append(1) # EDX
elif f.target_cc == CallingConv.C or f.target_cc == CallingConv.X86_STDCALL:
# No register args for cdecl or stdcall
pass
else:
raise NotImplementedError(
f"Unsupported target calling convention {f.target_cc.name} for function {f.name}"
)
# Bytes we will push for the call (exclude args passed in registers)
stack_bytes = sum(
arg.slot_size for i, arg in enumerate(f.args) if i not in reg_indices
source_layout = compute_arg_layout(
f.args,
f.source_cc,
Arch.X86,
stack_offset=4,
skip_args=1 if host_to_guest else 0,
)
target_layout = compute_arg_layout(f.args, f.target_cc, Arch.X86)
# Get current TIB
# Get current TEB
if host_to_guest:
lines.append("\tmov ecx, gs:[currentThreadTeb@ntpoff]")
else:
lines.append("\tmov ecx, fs:[0x18]")
lines.append("\tmov ecx, fs:[TEB_SELF]")
# Swap fs and gs
lines.append("\tmov ax, fs")
lines.append("\tmov dx, word ptr [ecx+0xf98]")
lines.append("\tmov word ptr [ecx+0xf98], ax")
lines.append("\tmov dx, word ptr [ecx+TEB_FS_SEL]")
lines.append("\tmov word ptr [ecx+TEB_FS_SEL], ax")
lines.append("\tmov fs, dx")
lines.append("\tmov ax, gs")
lines.append("\tmov dx, word ptr [ecx+0xf9a]")
lines.append("\tmov word ptr [ecx+0xf9a], ax")
lines.append("\tmov dx, word ptr [ecx+TEB_GS_SEL]")
lines.append("\tmov word ptr [ecx+TEB_GS_SEL], ax")
lines.append("\tmov gs, dx")
# Store guest stack pointer in eax for arg access
@@ -442,48 +544,60 @@ def emit_cc_thunk(f: FuncInfo | TypedefInfo, lines: List[str]):
# Swap stack pointer
lines.append("\tpush ebp")
lines.append("\tmov ebp, dword ptr [ecx+0xf9c]")
lines.append("\tmov dword ptr [ecx+0xf9c], esp")
lines.append("\tmov ebp, dword ptr [ecx+TEB_SP]")
lines.append("\tmov dword ptr [ecx+TEB_SP], esp")
lines.append("\tmov esp, ebp")
# Allocate stack space for arguments
if stack_bytes > 0:
lines.append(f"\tsub esp, {stack_bytes}")
if target_layout.stack_size > 0:
lines.append(f"\tsub esp, {target_layout.stack_size}")
# Align stack if needed (must be done after allocating args)
if align > 0:
lines.append(f"\tand esp, ~{align - 1}")
# Copy args onto stack
cur_off = 0
for i, arg in enumerate(f.args):
if i in reg_indices:
# Copy args onto stack for the callee
for idx, target in enumerate(target_layout.args):
if target.stack_offset is None:
continue
base = offsets[i]
for part_off in range(0, arg.slot_size, 4):
lines.append(f"\tmov ecx, [eax+{base + part_off}]")
lines.append(f"\tmov [esp+{cur_off + part_off}], ecx")
cur_off += arg.slot_size
source = source_layout.args[idx]
if source.stack_offset is None:
raise NotImplementedError(
f"Source calling convention {f.source_cc.name} requires register argument {idx}; not implemented"
)
if source.slot_size != target.slot_size:
raise NotImplementedError(
f"Argument {idx} requires size conversion {source.slot_size}->{target.slot_size}; not implemented"
)
for off in range(0, target.slot_size, 4):
lines.append(f"\tmov ecx, [eax+{source.stack_offset + off}]")
lines.append(f"\tmov [esp+{target.stack_offset + off}], ecx")
# Load args into registers as needed
if len(reg_indices) > 0:
i = reg_indices[0]
offset = offsets[i]
lines.append(f"\tmov ecx, [eax+{offset}]")
if len(reg_indices) > 1:
i = reg_indices[1]
offset = offsets[i]
lines.append(f"\tmov edx, [eax+{offset}]")
for idx, target in enumerate(target_layout.args):
if target.register is None:
continue
source = source_layout.args[idx]
if source.stack_offset is None:
raise NotImplementedError(
f"Source calling convention {f.source_cc.name} requires register argument {idx}; not implemented"
)
lines.append(f"\tmov {target.register}, [eax+{source.stack_offset}]")
# Call into target
lines.append(f"\tcall {target}")
lines.append(f"\tcall {call_target}")
# Determine if we can clobber eax/edx
if f.return_type.kind == TypeKind.RECORD:
if f.return_type.arg_class != ArgClass.INT:
raise NotImplementedError(
f"Struct return type not supported for function {f.name}"
f"Unsupported return type class {f.return_type.arg_class.value} for function {f.name}"
)
return_size = f.return_type.get_size()
return_size = f.return_type.type.get_size()
save_eax = return_size > 0
save_edx = return_size > 4
if return_size > 8:
@@ -497,16 +611,16 @@ def emit_cc_thunk(f: FuncInfo | TypedefInfo, lines: List[str]):
if save_edx:
lines.append("\tpush edx")
if host_to_guest:
lines.append("\tmov ecx, fs:[0x18]")
lines.append("\tmov ecx, fs:[TEB_SELF]")
else:
lines.append("\tmov ecx, gs:[currentThreadTeb@ntpoff]")
lines.append("\tmov ax, fs")
lines.append("\tmov dx, word ptr [ecx+0xf98]")
lines.append("\tmov word ptr [ecx+0xf98], ax")
lines.append("\tmov dx, word ptr [ecx+TEB_FS_SEL]")
lines.append("\tmov word ptr [ecx+TEB_FS_SEL], ax")
lines.append("\tmov fs, dx")
lines.append("\tmov ax, gs")
lines.append("\tmov dx, word ptr [ecx+0xf9a]")
lines.append("\tmov word ptr [ecx+0xf9a], ax")
lines.append("\tmov dx, word ptr [ecx+TEB_GS_SEL]")
lines.append("\tmov word ptr [ecx+TEB_GS_SEL], ax")
lines.append("\tmov gs, dx")
if save_edx:
lines.append("\tpop edx")
@@ -515,29 +629,293 @@ def emit_cc_thunk(f: FuncInfo | TypedefInfo, lines: List[str]):
# Swap stack pointer
lines.append("\tmov esp, ebp") # Clean up arg space
lines.append("\tmov ebp, dword ptr [ecx+0xf9c]")
lines.append("\tmov dword ptr [ecx+0xf9c], esp")
lines.append("\tmov ebp, dword ptr [ecx+TEB_SP]")
lines.append("\tmov dword ptr [ecx+TEB_SP], esp")
# Restore stack and frame pointer
lines.append("\tleave")
# Return to guest
if f.source_cc == CallingConv.X86_STDCALL:
ret_bytes = sum(arg.slot_size for arg in f.args)
elif f.source_cc == CallingConv.C:
ret_bytes = 0
else:
raise NotImplementedError(
f"Unsupported source calling convention {f.source_cc.name} for function {f.name}"
)
if ret_bytes > 0:
lines.append(f"\tret {ret_bytes}")
if f.source_cc == CallingConv.X86_STDCALL and source_layout.stack_size > 0:
lines.append(f"\tret {source_layout.stack_size}")
else:
lines.append("\tret")
def _x64_register_by_slot_size(reg: str, slot_size: int) -> str:
if slot_size == 8:
return reg
if reg in ["rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "rsp"]:
if slot_size == 4:
return f"e{reg[1:]}"
elif slot_size == 2:
return reg[1:]
elif slot_size == 1:
if reg in ["rax", "rbx", "rcx", "rdx"]:
return f"{reg[1]}l"
elif reg in ["rsi", "rdi"]:
return f"{reg[1]}il"
else:
return f"{reg[1]}pl"
if slot_size == 4:
return f"{reg}d"
if slot_size == 2:
return f"{reg}w"
if slot_size == 1:
return f"{reg}b"
raise NotImplementedError(f"Unsupported register {reg} for slot size {slot_size}")
def _x64_ptr_type_by_slot_size(slot_size) -> str:
if slot_size == 4:
return "dword ptr"
elif slot_size == 8:
return "qword ptr"
else:
raise ValueError(f"Unsupported slot size {slot_size}")
def emit_cc_thunk64(f: FuncInfo | TypedefInfo, lines: List[str]):
    """Emit assembly for a 64-bit-host calling-convention thunk into *lines*.

    A TypedefInfo produces a host-to-guest thunk (64-bit host code calling a
    32-bit guest function pointer held in edi); a FuncInfo produces a
    guest-to-host thunk (32-bit guest entry point calling the 64-bit host
    implementation ``f.mangled``). Both swap stacks and FS state through the
    TEB and switch bitness via the LJMP32/LJMP64 assembler macros.
    """
    if isinstance(f, TypedefInfo):
        # Host-to-guest
        call_target = "edi"
        align = 0
        host_to_guest = True
    elif isinstance(f, FuncInfo):
        # Guest-to-host
        call_target = f.mangled
        align = 16
        host_to_guest = False
    if f.variadic:
        # Variadic functions are not yet supported for calling convention conversion.
        assert f.source_cc == CallingConv.C and f.target_cc == CallingConv.C, (
            "Variadic functions must be cdecl"
        )
        # Same convention on both sides: a plain tail-jump suffices.
        lines.append(f"\tjmp {call_target}")
        return
    # Source layout: how the caller passed the args (skipping the function
    # pointer argument for host-to-guest). Target layout: how the callee
    # expects them.
    source_layout = compute_arg_layout(
        f.args,
        f.source_cc,
        Arch.X86_64 if host_to_guest else Arch.X86,
        stack_offset=24 if host_to_guest else 16,
        skip_args=1 if host_to_guest else 0,
    )
    target_layout = compute_arg_layout(
        f.args, f.target_cc, Arch.X86 if host_to_guest else Arch.X86_64
    )
    if host_to_guest:
        lines.append(".code64")
        # Save rbx and rbp
        lines.append("\tpush rbx")
        lines.append("\tpush rbp")
        # Stash host stack in r10
        lines.append("\tmov r10, rsp")
        # Get current TEB
        lines.append("\tmov rcx, fs:[currentThreadTeb@tpoff]")
        # Save FS base
        lines.append("\trdfsbase r9")
        lines.append("\tmov qword ptr [rcx+TEB_FSBASE], r9")
        # Save RSP and load guest stack
        lines.append("\tmov rbp, qword ptr [rcx+TEB_SP]")
        lines.append("\tmov qword ptr [rcx+TEB_SP], rsp")
        lines.append("\tmov rsp, rbp")
        # Allocate stack space for arguments
        if target_layout.stack_size > 0:
            lines.append(f"\tsub rsp, {target_layout.stack_size}")
        # Align stack if needed (must be done after allocating args)
        if align > 0:
            lines.append(f"\tand rsp, ~{align - 1}")
        # Transfer arguments
        for i, target in enumerate(target_layout.args):
            if target.stack_offset is None:
                # 32-bit callee side is expected to be all-stack here.
                raise NotImplementedError(f"Unexpected register argument {target}")
            source = source_layout.args[i]
            if source.stack_offset is not None:
                # Load from the host stack (stashed in r10) into rax...
                # NOTE(review): the load pairs a source-sized ptr keyword with
                # a target-sized rax sub-register — confirm widths agree.
                ptr_type = _x64_ptr_type_by_slot_size(source.slot_size)
                register = _x64_register_by_slot_size("rax", target.slot_size)
                lines.append(
                    f"\tmov {register}, {ptr_type} [r10+{source.stack_offset}]"
                )
                # ...then store with target-sized operands.
                ptr_type = _x64_ptr_type_by_slot_size(target.slot_size)
                register = _x64_register_by_slot_size("rax", target.slot_size)
            elif source.register is not None:
                # Argument already sits in a host register; store it directly.
                ptr_type = _x64_ptr_type_by_slot_size(target.slot_size)
                register = _x64_register_by_slot_size(source.register, target.slot_size)
            else:
                raise ValueError(f"Argument {i} is not a register or stack offset")
            lines.append(f"\tmov {ptr_type} [rsp+{target.stack_offset}], {register}")
        # Jump to 32-bit mode
        lines.append("\tLJMP32")
        # Setup FS selector
        lines.append("\tmov ax, word ptr [ecx+TEB_FS_SEL]")
        lines.append("\tmov fs, ax")
        # Call into target
        lines.append(f"\tcall {call_target}")
        # Get current TEB
        lines.append("\tmov ecx, fs:[TEB_SELF]")
        # Jump back to 64-bit
        lines.append("\tLJMP64")
        # Sign extend return value if necessary
        if f.return_type.sign_extended:
            lines.append("\tcdqe")
        # Restore FS base
        lines.append("\tmov r9, qword ptr [rcx+TEB_FSBASE]")
        lines.append("\twrfsbase r9")
        # Restore host stack
        lines.append("\tmov rsp, qword ptr [rcx+TEB_SP]")
        lines.append("\tmov qword ptr [rcx+TEB_SP], rbp")
        # Restore rbp, rbx and return
        lines.append("\tpop rbp")
        lines.append("\tpop rbx")
        lines.append("\tret")
    else:
        lines.append(".code32")
        # Save registers
        lines.append("\tpush ebp")
        lines.append("\tpush esi")
        lines.append("\tpush edi")
        # Get current TEB
        lines.append("\tmov ecx, fs:[TEB_SELF]")
        # Save fs segment
        lines.append("\tmov di, fs")
        lines.append("\tmov word ptr [ecx+TEB_FS_SEL], di")
        # Jump back to 64-bit
        lines.append("\tLJMP64")
        # Restore FS base
        lines.append("\tmov r9, qword ptr [rcx+TEB_FSBASE]")
        lines.append("\twrfsbase r9")
        # Stash guest stack in r10
        lines.append("\tmov r10, rsp")
        # Restore host stack
        lines.append("\tmov rbp, qword ptr [rcx+TEB_SP]")
        lines.append("\tmov qword ptr [rcx+TEB_SP], rsp")
        lines.append("\tmov rsp, rbp")
        # Allocate stack space for arguments
        if target_layout.stack_size > 0:
            lines.append(f"\tsub rsp, {target_layout.stack_size}")
        # Align stack if needed (must be done after allocating args)
        if align > 0:
            lines.append(f"\tand rsp, ~{align - 1}")
        # Transfer args
        for i, target in enumerate(target_layout.args):
            arg = f.args[i]
            source = source_layout.args[i]
            if target.stack_offset is not None:
                if source.stack_offset is not None:
                    # Load from the guest stack (stashed in r10) via rax.
                    ptr_type = _x64_ptr_type_by_slot_size(source.slot_size)
                    register = _x64_register_by_slot_size("rax", source.slot_size)
                    lines.append(
                        f"\tmov {register}, {ptr_type} [r10+{source.stack_offset}]"
                    )
                    ptr_type = _x64_ptr_type_by_slot_size(target.slot_size)
                    register = _x64_register_by_slot_size("rax", target.slot_size)
                elif source.register is not None:
                    ptr_type = _x64_ptr_type_by_slot_size(target.slot_size)
                    register = _x64_register_by_slot_size(
                        source.register, target.slot_size
                    )
                else:
                    raise ValueError(f"Argument {i} is not a register or stack offset")
                lines.append(
                    f"\tmov {ptr_type} [rsp+{target.stack_offset}], {register}"
                )
            elif target.register is not None:
                ptr_type = _x64_ptr_type_by_slot_size(source.slot_size)
                if source.slot_size == 4 and target.slot_size == 8:
                    if arg.sign_extended:
                        # 32->64 widening of a signed value: load 32 bits then
                        # sign-extend into the full register.
                        register = _x64_register_by_slot_size(
                            target.register, source.slot_size
                        )
                        lines.append(
                            f"\tmov {register}, {ptr_type} [r10+{source.stack_offset}]"
                        )
                        lines.append(f"\tmovsxd {target.register}, {register}")
                    else:
                        # Unsigned widening: a 32-bit mov zero-extends the
                        # upper half implicitly.
                        register = _x64_register_by_slot_size(
                            target.register, source.slot_size
                        )
                        lines.append(
                            f"\tmov {register}, {ptr_type} [r10+{source.stack_offset}]"
                        )
                elif source.slot_size == 8 and target.slot_size == 8:
                    lines.append(
                        f"\tmov {target.register}, {ptr_type} [r10+{source.stack_offset}]"
                    )
                else:
                    raise NotImplementedError(
                        f"Unsupported conversion from {source.slot_size} to {target.slot_size}"
                    )
        # Call into target
        lines.append(f"\tcall {call_target}")
        # Get current TEB
        lines.append("\tmov rcx, fs:[currentThreadTeb@tpoff]")
        # Restore host stack
        lines.append("\tmov rsp, qword ptr [rcx+TEB_SP]")
        lines.append("\tmov qword ptr [rcx+TEB_SP], rbp")
        # Jump to 32-bit mode
        lines.append("\tLJMP32")
        # Setup FS selector
        lines.append("\tmov di, word ptr [ecx+TEB_FS_SEL]")
        lines.append("\tmov fs, di")
        # Restore registers
        lines.append("\tpop edi")
        lines.append("\tpop esi")
        lines.append("\tpop ebp")
        # Return to guest
        if f.source_cc == CallingConv.X86_STDCALL and source_layout.stack_size > 0:
            # stdcall: callee pops its stack arguments.
            lines.append(f"\tret {source_layout.stack_size}")
        else:
            lines.append("\tret")
def emit_cc_thunk(f: FuncInfo | TypedefInfo, lines: List[str], arch: Arch):
    """Emit the calling-convention thunk for *f*, dispatching on *arch*.

    Args:
        f: Function (guest-to-host) or typedef (host-to-guest) to thunk.
        lines: Output assembly lines, appended to in place.
        arch: Host architecture being generated for.

    Raises:
        NotImplementedError: if *arch* is not a supported architecture.
    """
    if arch == Arch.X86_64:
        return emit_cc_thunk64(f, lines)
    elif arch == Arch.X86:
        return emit_cc_thunk32(f, lines)
    # Previously an unknown arch silently emitted nothing; fail loudly instead.
    raise NotImplementedError(f"Unsupported architecture {arch}")
def emit_guest_to_host_thunks(
lines: List[str], dll: str, funcs: Iterable[FuncInfo]
lines: List[str], dll: str, funcs: Iterable[FuncInfo], arch: Arch
) -> None:
for f in funcs:
thunk = f"thunk_{dll}_{f.name}"
@@ -545,19 +923,24 @@ def emit_guest_to_host_thunks(
lines.append(
f"# {f.qualified_ns}::{f.name} (source_cc={f.source_cc.name}, target_cc={f.target_cc.name}, variadic={f.variadic})"
)
source_layout = compute_arg_layout(f.args, f.source_cc, Arch.X86)
target_layout = compute_arg_layout(f.args, f.target_cc, arch)
for i, arg in enumerate(f.args):
lines.append(
f"\t# Arg {i} (slot_size={arg.slot_size}, primitive={arg.primitive}, sign_extended={arg.sign_extended})"
)
details: List[str] = []
details.append(f"src={describe_arg_placement(source_layout.args[i])}")
details.append(f"dst={describe_arg_placement(target_layout.args[i])}")
details.append(f"class={arg.arg_class.value}")
details.append(f"sign_extended={arg.sign_extended}")
lines.append(f"\t# Arg {i} ({', '.join(details)})")
lines.append(f".globl {thunk}")
lines.append(f".type {thunk}, @function")
lines.append(f"{thunk}:")
emit_cc_thunk(f, lines)
emit_cc_thunk(f, lines, arch)
lines.append(f".size {thunk}, .-{thunk}")
def emit_host_to_guest_thunks(
lines: List[str], typedefs: Iterable[TypedefInfo]
lines: List[str], typedefs: Iterable[TypedefInfo], arch: Arch
) -> None:
for f in typedefs:
thunk = f"call_{f.name}"
@@ -565,14 +948,23 @@ def emit_host_to_guest_thunks(
lines.append(
f"# {f.name} (target_cc={f.target_cc.name}, variadic={f.variadic})"
)
source_layout = compute_arg_layout(f.args, f.source_cc, arch, skip_args=1)
target_layout = compute_arg_layout(f.args, f.target_cc, Arch.X86)
for i, arg in enumerate(f.args):
lines.append(
f"\t# Arg {i} (slot_size={arg.slot_size}, primitive={arg.primitive}, sign_extended={arg.sign_extended})"
)
details: List[str] = []
details.append(f"src={describe_arg_placement(source_layout.args[i])}")
details.append(f"dst={describe_arg_placement(target_layout.args[i])}")
details.append(f"class={arg.arg_class.value}")
details.append(f"sign_extended={arg.sign_extended}")
lines.append(f"\t# Arg {i} ({', '.join(details)})")
# details = []
# details.append(f"class={f.return_type.arg_class.value}")
# details.append(f"sign_extended={f.return_type.sign_extended}")
# lines.append(f"\t# Ret ({', '.join(details)})")
lines.append(f".weak {thunk}")
lines.append(f".type {thunk}, @function")
lines.append(f"{thunk}:")
emit_cc_thunk(f, lines)
emit_cc_thunk(f, lines, arch)
lines.append(f".size {thunk}, .-{thunk}")
@@ -581,6 +973,7 @@ def emit_header_mapping(
funcs: Iterable[FuncInfo],
typedefs: Iterable[TypedefInfo],
variables: Iterable[VarInfo],
arch: Arch,
) -> str:
guard = f"WIBO_GEN_{dll.upper()}_THUNKS_H"
lines: List[str] = []
@@ -619,16 +1012,18 @@ def emit_header_mapping(
type_str = _canonical_type_str(arg.type)
args.append(f"{type_str} arg{i}")
param_list = ", ".join(args)
return_type = _canonical_type_str(f.return_type)
if f.source_cc == CallingConv.X86_STDCALL:
cc_attr = "__attribute__((stdcall))"
return_type = _canonical_type_str(f.return_type.type)
if arch == Arch.X86_64:
cc_attr = ""
elif f.source_cc == CallingConv.X86_STDCALL:
cc_attr = "__attribute__((stdcall)) "
elif f.source_cc == CallingConv.C:
cc_attr = "__attribute__((cdecl))"
cc_attr = "__attribute__((cdecl)) "
else:
raise NotImplementedError(
f"Unsupported calling convention {f.source_cc.name} for function {f.name}"
)
lines.append(f"{cc_attr} {return_type} {thunk}({param_list});")
lines.append(f"{cc_attr}{return_type} {thunk}({param_list});")
# Host-to-guest thunk functions
for td in typedefs:
@@ -642,7 +1037,7 @@ def emit_header_mapping(
params.append(f"{type_str} arg{i}")
param_list = ", ".join(params)
return_type = _type_to_string(td.return_type)
return_type = _type_to_string(td.return_type.type)
lines.append(f"{return_type} {thunk}({param_list});")
lines.append("#ifdef __cplusplus\n}\n#endif")
@@ -672,7 +1067,7 @@ def main() -> int:
ap.add_argument(
"--namespace", dest="ns", default=None, help="Namespace filter, e.g. kernel32"
)
ap.add_argument("--arch", choices=["x86"], default="x86")
ap.add_argument("--arch", choices=["x86", "x86_64"], default="x86")
ap.add_argument(
"--out-asm", type=Path, required=True, help="Output assembly file (.S)"
)
@@ -682,10 +1077,17 @@ def main() -> int:
ap.add_argument("-I", dest="incs", action="append", default=[])
args = ap.parse_args()
if args.arch == "x86":
arch = Arch.X86
elif args.arch == "x86_64":
arch = Arch.X86_64
else:
raise ValueError(f"Unsupported architecture: {args.arch}")
target = "i686-pc-linux-gnu" if args.arch == "x86" else "x86_64-pc-linux-gnu"
tu = parse_tu(args.headers, args.incs, target)
funcs = collect_functions(tu, args.ns)
typedefs = collect_typedefs(tu)
funcs = collect_functions(tu, args.ns, arch)
typedefs = collect_typedefs(tu, arch)
variables = collect_variables(tu, args.ns)
if not funcs and not typedefs and not variables:
@@ -694,15 +1096,15 @@ def main() -> int:
lines: List[str] = []
lines.append("# Auto-generated thunks; DO NOT EDIT.")
lines.append(".intel_syntax noprefix")
lines.append('#include "macros.S"')
lines.append('.section .note.GNU-stack, "", @progbits')
lines.append(".text")
emit_guest_to_host_thunks(lines, args.dll, funcs)
emit_host_to_guest_thunks(lines, typedefs)
emit_guest_to_host_thunks(lines, args.dll, funcs, arch)
emit_host_to_guest_thunks(lines, typedefs, arch)
asm = "\n".join(lines) + "\n"
hdr = emit_header_mapping(args.dll, funcs, typedefs, variables)
hdr = emit_header_mapping(args.dll, funcs, typedefs, variables, arch)
args.out_asm.parent.mkdir(parents=True, exist_ok=True)
args.out_hdr.parent.mkdir(parents=True, exist_ok=True)