Reimplement x86 arch, MSVC section group combining

Plus display_row/DiffText refactoring
2025-12-17 00:47:12 +00:00 · 2025-02-28 00:17:32 -07:00
parent 506c251d68
commit 95868f1d19
29 changed files with 2580 additions and 1044 deletions
--- a/objdiff-core/src/obj/read.rs
+++ b/objdiff-core/src/obj/read.rs
@@ -4,6 +4,7 @@ use alloc::{
    string::{String, ToString},
    vec::Vec,
 };
+use core::cmp::Ordering;

 use anyhow::{bail, ensure, Context, Result};
 use object::{Object as _, ObjectSection as _, ObjectSymbol as _};
@@ -32,16 +33,22 @@ fn map_symbol(
    arch: &dyn Arch,
    file: &object::File,
    symbol: &object::Symbol,
+    section_indices: &[usize],
    split_meta: Option<&SplitMeta>,
 ) -> Result<Symbol> {
    let mut name = symbol.name().context("Failed to process symbol name")?.to_string();
-    let size = symbol.size();
+    let mut size = symbol.size();
    if let (object::SymbolKind::Section, Some(section)) =
        (symbol.kind(), symbol.section_index().and_then(|i| file.section_by_index(i).ok()))
    {
        let section_name = section.name().context("Failed to process section name")?;
        name = format!("[{}]", section_name);
-        // size = section.size();
+        // For section symbols, set the size to zero. If the size is non-zero, it will be included
+        // in the diff. Most of the time, this is duplicative, given that we'll have function or
+        // object symbols that cover the same range. In the case of an empty section, the size
+        // inference logic below will set the size back to the section size, thus acting as a
+        // placeholder symbol.
+        size = 0;
    }

    let mut flags = arch.extra_symbol_flags(symbol);
@@ -74,7 +81,7 @@ fn map_symbol(
    let virtual_address = split_meta
        .and_then(|m| m.virtual_addresses.as_ref())
        .and_then(|v| v.get(symbol.index().0).cloned());
-    let section = symbol.section_index().map(|i| map_section_index(file, i));
+    let section = symbol.section_index().and_then(|i| section_indices.get(i.0).copied());

    Ok(Symbol {
        name,
@@ -89,39 +96,123 @@ fn map_symbol(
    })
 }

-fn map_section_index(file: &object::File, idx: object::SectionIndex) -> usize {
-    match file.format() {
-        object::BinaryFormat::Elf => idx.0 - 1,
-        _ => idx.0,
-    }
-}
-
-fn map_symbol_index(file: &object::File, idx: object::SymbolIndex) -> usize {
-    match file.format() {
-        object::BinaryFormat::Elf => idx.0 - 1,
-        _ => idx.0,
-    }
-}
-
 fn map_symbols(
    arch: &dyn Arch,
    obj_file: &object::File,
+    sections: &[Section],
+    section_indices: &[usize],
    split_meta: Option<&SplitMeta>,
-) -> Result<Vec<Symbol>> {
-    let mut symbols = Vec::<Symbol>::with_capacity(obj_file.symbols().count());
-    for symbol in obj_file.symbols() {
-        symbols.push(map_symbol(arch, obj_file, &symbol, split_meta)?);
+) -> Result<(Vec<Symbol>, Vec<usize>)> {
+    let symbol_count = obj_file.symbols().count();
+    let mut symbols = Vec::<Symbol>::with_capacity(symbol_count);
+    let mut symbol_indices = Vec::<usize>::with_capacity(symbol_count + 1);
+    for obj_symbol in obj_file.symbols() {
+        if symbol_indices.len() <= obj_symbol.index().0 {
+            symbol_indices.resize(obj_symbol.index().0 + 1, usize::MAX);
+        }
+        let symbol = map_symbol(arch, obj_file, &obj_symbol, section_indices, split_meta)?;
+        symbol_indices[obj_symbol.index().0] = symbols.len();
+        symbols.push(symbol);
+    }
+
+    // Infer symbol sizes for 0-size symbols
+    infer_symbol_sizes(&mut symbols, sections);
+
+    Ok((symbols, symbol_indices))
+}
+
+fn infer_symbol_sizes(symbols: &mut [Symbol], sections: &[Section]) {
+    // Create a sorted list of symbol indices by section
+    let mut symbols_with_section = Vec::<usize>::with_capacity(symbols.len());
+    for (i, symbol) in symbols.iter().enumerate() {
+        if symbol.section.is_some() {
+            symbols_with_section.push(i);
+        }
+    }
+    symbols_with_section.sort_by(|a, b| {
+        let a = &symbols[*a];
+        let b = &symbols[*b];
+        a.section
+            .unwrap_or(usize::MAX)
+            .cmp(&b.section.unwrap_or(usize::MAX))
+            .then_with(|| {
+                // Sort section symbols first
+                if a.kind == SymbolKind::Section {
+                    Ordering::Less
+                } else if b.kind == SymbolKind::Section {
+                    Ordering::Greater
+                } else {
+                    Ordering::Equal
+                }
+            })
+            .then_with(|| a.address.cmp(&b.address))
+            .then_with(|| a.size.cmp(&b.size))
+    });
+
+    // Set symbol sizes based on the next symbol's address
+    let mut iter_idx = 0;
+    while iter_idx < symbols_with_section.len() {
+        let symbol_idx = symbols_with_section[iter_idx];
+        let symbol = &symbols[symbol_idx];
+        iter_idx += 1;
+        if symbol.size != 0 {
+            continue;
+        }
+        let section_idx = symbol.section.unwrap();
+        let next_symbol = match symbol.kind {
+            // For function/object symbols, find the next function/object symbol (in other words:
+            // skip over labels)
+            SymbolKind::Function | SymbolKind::Object => loop {
+                if iter_idx >= symbols_with_section.len() {
+                    break None;
+                }
+                let next_symbol = &symbols[symbols_with_section[iter_idx]];
+                if next_symbol.section != Some(section_idx) {
+                    break None;
+                }
+                if let SymbolKind::Function | SymbolKind::Object = next_symbol.kind {
+                    break Some(next_symbol);
+                }
+                iter_idx += 1;
+            },
+            // For labels (or anything else), simply use the next symbol's address
+            SymbolKind::Unknown | SymbolKind::Section => symbols_with_section
+                .get(iter_idx)
+                .map(|&i| &symbols[i])
+                .take_if(|s| s.section == Some(section_idx)),
+        };
+        let next_address = next_symbol.map(|s| s.address).unwrap_or_else(|| {
+            let section = &sections[section_idx];
+            section.address + section.size
+        });
+        let new_size = next_address.saturating_sub(symbol.address);
+        if new_size > 0 {
+            let symbol = &mut symbols[symbol_idx];
+            symbol.size = new_size;
+            if symbol.kind != SymbolKind::Section {
+                symbol.flags |= SymbolFlag::SizeInferred;
+            }
+            // Set symbol kind if unknown and size is non-zero
+            if symbol.kind == SymbolKind::Unknown {
+                symbol.kind = match sections[section_idx].kind {
+                    SectionKind::Code => SymbolKind::Function,
+                    SectionKind::Data | SectionKind::Bss => SymbolKind::Object,
+                    _ => SymbolKind::Unknown,
+                };
+            }
+        }
    }
-    Ok(symbols)
 }

 fn map_sections(
-    arch: &dyn Arch,
+    _arch: &dyn Arch,
    obj_file: &object::File,
    split_meta: Option<&SplitMeta>,
-) -> Result<Vec<Section>> {
+) -> Result<(Vec<Section>, Vec<usize>)> {
    let mut section_names = BTreeMap::<String, usize>::new();
-    let mut result = Vec::<Section>::with_capacity(obj_file.sections().count());
+    let section_count = obj_file.sections().count();
+    let mut result = Vec::<Section>::with_capacity(section_count);
+    let mut section_indices = Vec::<usize>::with_capacity(section_count + 1);
    for section in obj_file.sections() {
        let name = section.name().context("Failed to process section name")?;
        let kind = map_section_kind(&section);
@@ -142,12 +233,14 @@ fn map_sections(
                .and_then(|v| v.get(s.index().0).cloned())
        });

-        let relocations = map_relocations(arch, obj_file, &section)?;
-
        let unique_id = section_names.entry(name.to_string()).or_insert(0);
        let id = format!("{}-{}", name, unique_id);
        *unique_id += 1;

+        if section_indices.len() <= section.index().0 {
+            section_indices.resize(section.index().0 + 1, usize::MAX);
+        }
+        section_indices[section.index().0] = result.len();
        result.push(Section {
            id,
            name: name.to_string(),
@@ -156,33 +249,14 @@ fn map_sections(
            kind,
            data: SectionData(data),
            flags: Default::default(),
-            relocations,
+            relocations: Default::default(),
            virtual_address,
            line_info: Default::default(),
        });
    }
-    Ok(result)
+    Ok((result, section_indices))
 }

-//     result.sort_by(|a, b| a.address.cmp(&b.address).then(a.size.cmp(&b.size)));
-//     let mut iter = result.iter_mut().peekable();
-//     while let Some(symbol) = iter.next() {
-//         if symbol.size == 0 {
-//             if let Some(next_symbol) = iter.peek() {
-//                 symbol.size = next_symbol.address - symbol.address;
-//             } else {
-//                 symbol.size = (section.address + section.size) - symbol.address;
-//             }
-//             // Set symbol kind if we ended up with a non-zero size
-//             if symbol.kind == ObjSymbolKind::Unknown && symbol.size > 0 {
-//                 symbol.kind = match section.kind {
-//                     ObjSectionKind::Code => ObjSymbolKind::Function,
-//                     ObjSectionKind::Data | ObjSectionKind::Bss => ObjSymbolKind::Object,
-//                 };
-//             }
-//         }
-//     }
-
 const LOW_PRIORITY_SYMBOLS: &[&str] =
    &["__gnu_compiled_c", "__gnu_compiled_cplusplus", "gcc2_compiled."];

@@ -230,6 +304,7 @@ fn map_relocations(
    arch: &dyn Arch,
    obj_file: &object::File,
    obj_section: &object::Section,
+    symbol_indices: &[usize],
 ) -> Result<Vec<Relocation>> {
    let mut relocations = Vec::<Relocation>::with_capacity(obj_section.relocations().count());
    let mut ordered_symbols = None;
@@ -269,7 +344,13 @@ fn map_relocations(
                } else {
                    idx
                };
-                map_symbol_index(obj_file, idx)
+                match symbol_indices.get(idx.0).copied() {
+                    Some(i) => i,
+                    None => {
+                        log::warn!("Invalid symbol index {}", idx.0);
+                        continue;
+                    }
+                }
            }
            object::RelocationTarget::Absolute => {
                let section_name = obj_section.name()?;
@@ -299,6 +380,7 @@ fn map_relocations(
 fn parse_line_info(
    obj_file: &object::File,
    sections: &mut [Section],
+    section_indices: &[usize],
    obj_data: &[u8],
 ) -> Result<()> {
    // DWARF 1.1
@@ -326,7 +408,6 @@ fn parse_line_info(
                }
                let address_delta = read_u32(obj_file, &mut section_data)? as u64;
                out_section.line_info.insert(base_address + address_delta, line_number);
-                log::debug!("Line: {:#x} -> {}", base_address + address_delta, line_number);
            }
        }
    }
@@ -376,7 +457,7 @@ fn parse_line_info(

    // COFF
    if let object::File::Coff(coff) = obj_file {
-        parse_line_info_coff(coff, sections, obj_data)?;
+        parse_line_info_coff(coff, sections, section_indices, obj_data)?;
    }

    Ok(())
@@ -385,6 +466,7 @@ fn parse_line_info(
 fn parse_line_info_coff(
    coff: &object::coff::CoffFile,
    sections: &mut [Section],
+    section_indices: &[usize],
    obj_data: &[u8],
 ) -> Result<()> {
    use object::{
@@ -405,7 +487,9 @@ fn parse_line_info_coff(

        // Find this section in our out_section. If it's not in out_section,
        // skip it.
-        let Some(out_section) = sections.get_mut(sect.index().0) else {
+        let Some(out_section) =
+            section_indices.get(sect.index().0).and_then(|&i| sections.get_mut(i))
+        else {
            continue;
        };

@@ -514,7 +598,12 @@ fn combine_sections(
    for (i, section) in sections.iter().enumerate() {
        match section.kind {
            SectionKind::Data | SectionKind::Bss => {
-                data_sections.entry(section.name.clone()).or_default().push(i);
+                let base_name = if let Some(i) = section.name.rfind('$') {
+                    &section.name[..i]
+                } else {
+                    &section.name
+                };
+                data_sections.entry(base_name.to_string()).or_default().push(i);
            }
            SectionKind::Code => {
                text_sections.push(i);
@@ -523,12 +612,12 @@ fn combine_sections(
        }
    }
    if config.combine_data_sections {
-        for (_, section_indices) in data_sections {
-            do_combine_sections(sections, symbols, &section_indices)?;
+        for (combined_name, mut section_indices) in data_sections {
+            do_combine_sections(sections, symbols, &mut section_indices, combined_name)?;
        }
    }
    if config.combine_text_sections {
-        do_combine_sections(sections, symbols, &text_sections)?;
+        do_combine_sections(sections, symbols, &mut text_sections, ".text".to_string())?;
    }
    Ok(())
 }
@@ -536,11 +625,24 @@ fn combine_sections(
 fn do_combine_sections(
    sections: &mut [Section],
    symbols: &mut [Symbol],
-    section_indices: &[usize],
+    section_indices: &mut [usize],
+    combined_name: String,
 ) -> Result<()> {
    if section_indices.len() < 2 {
        return Ok(());
    }
+    // Sort sections lexicographically by name (for COFF section groups)
+    section_indices.sort_by(|&a, &b| {
+        let a_name = &sections[a].name;
+        let b_name = &sections[b].name;
+        // .text$di < .text$mn < .text
+        if a_name.contains('$') && !b_name.contains('$') {
+            return Ordering::Less;
+        } else if !a_name.contains('$') && b_name.contains('$') {
+            return Ordering::Greater;
+        }
+        a_name.cmp(b_name)
+    });
    let first_section_idx = section_indices[0];

    // Calculate the new offset for each section
@@ -548,7 +650,7 @@ fn do_combine_sections(
    let mut current_offset = 0;
    let mut data_size = 0;
    let mut num_relocations = 0;
-    for &i in section_indices {
+    for i in section_indices.iter().copied() {
        let section = &sections[i];
        if section.address != 0 {
            bail!("Section {} ({}) has non-zero address", i, section.name);
@@ -580,6 +682,8 @@ fn do_combine_sections(
    }
    {
        let first_section = &mut sections[first_section_idx];
+        first_section.id = format!("{combined_name}-combined");
+        first_section.name = combined_name;
        first_section.size = current_offset;
        first_section.data = SectionData(data);
        first_section.flags |= SectionFlag::Combined;
@@ -659,9 +763,18 @@ pub fn parse(data: &[u8], config: &DiffObjConfig) -> Result<Object> {
    let obj_file = object::File::parse(data)?;
    let arch = new_arch(&obj_file)?;
    let split_meta = parse_split_meta(&obj_file)?;
-    let mut symbols = map_symbols(arch.as_ref(), &obj_file, split_meta.as_ref())?;
-    let mut sections = map_sections(arch.as_ref(), &obj_file, split_meta.as_ref())?;
-    parse_line_info(&obj_file, &mut sections, data)?;
+    let (mut sections, section_indices) =
+        map_sections(arch.as_ref(), &obj_file, split_meta.as_ref())?;
+    let (mut symbols, symbol_indices) =
+        map_symbols(arch.as_ref(), &obj_file, &sections, &section_indices, split_meta.as_ref())?;
+    for obj_section in obj_file.sections() {
+        let section = &mut sections[section_indices[obj_section.index().0]];
+        if section.kind != SectionKind::Unknown {
+            section.relocations =
+                map_relocations(arch.as_ref(), &obj_file, &obj_section, &symbol_indices)?;
+        }
+    }
+    parse_line_info(&obj_file, &mut sections, &section_indices, data)?;
    if config.combine_data_sections || config.combine_text_sections {
        combine_sections(&mut sections, &mut symbols, config)?;
    }
@@ -803,7 +916,8 @@ mod test {
                ..Default::default()
            },
        ];
-        do_combine_sections(&mut sections, &mut symbols, &[1, 2, 3]).unwrap();
+        do_combine_sections(&mut sections, &mut symbols, &mut [1, 2, 3], ".data".to_string())
+            .unwrap();
        assert_eq!(sections[1].data.0, (1..=12).collect::<Vec<_>>());
        insta::assert_debug_snapshot!((sections, symbols));
    }