Version v2.0.0: Native Rust port

This commit is contained in:
Luke Street 2025-10-14 17:21:25 -06:00
parent e81a6d439b
commit 9dcfecc8b2
10 changed files with 895 additions and 241 deletions

View File

@ -8,31 +8,21 @@ jobs:
strategy:
matrix:
platform: [ ubuntu-latest, macos-latest, windows-latest ]
toolchain: [ stable, 1.71.0, nightly ]
toolchain: [ stable, 1.81.0, nightly ]
features:
- compress,alloc
- compress,decompress
- compress,decompress,std
fail-fast: false
runs-on: ${{ matrix.platform }}
steps:
- name: Install LLVM and Clang # required for bindgen to work, see https://github.com/rust-lang/rust-bindgen/issues/1797
uses: KyleMayes/install-llvm-action@v2.0.8
if: matrix.platform == 'windows-latest'
with:
version: 21.1.3
directory: ${{ runner.temp }}/llvm
- name: Set LIBCLANG_PATH
run: echo "LIBCLANG_PATH=$((gcm clang).source -replace "clang.exe")" >> $env:GITHUB_ENV
if: matrix.platform == 'windows-latest'
- uses: actions/checkout@v2
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
- uses: actions-rs/toolchain@v1
- name: Setup Rust toolchain
uses: dtolnay/rust-toolchain@master
with:
toolchain: ${{ matrix.toolchain }}
override: true
- uses: actions-rs/cargo@v1
with:
command: test
args: --release --no-default-features --features ${{ matrix.features }}
- name: Cargo test
run: cargo test --release --no-default-features --features ${{ matrix.features }}

.gitmodules
View File

@ -1,3 +0,0 @@
[submodule "lzokay"]
path = lzokay
url = https://github.com/jackoalan/lzokay.git

View File

@ -1,7 +1,7 @@
[package]
name = "lzokay"
version = "1.0.2"
edition = "2018"
version = "2.0.0"
edition = "2021"
license = "MIT"
repository = "https://github.com/encounter/lzokay-rs"
documentation = "https://docs.rs/lzokay"
@ -11,7 +11,7 @@ A minimal, MIT-licensed implementation of the LZO compression format.
"""
keywords = ["lzo", "compression", "no_std"]
categories = ["compression", "no-std", "api-bindings"]
rust-version = "1.71.0"
rust-version = "1.81.0"
[features]
alloc = []
@ -19,7 +19,3 @@ std = ["alloc"]
decompress = []
compress = []
default = ["compress", "decompress", "std"]
[build-dependencies]
bindgen = "0.72.1"
cc = "1.2.41"

View File

@ -6,9 +6,9 @@
[crates.io]: https://crates.io/crates/lzokay
[Api Rustdoc]: https://img.shields.io/badge/api-rustdoc-blue.svg
[rustdoc]: https://docs.rs/lzokay
[Rust Version]: https://img.shields.io/badge/rust-1.70+-blue.svg?maxAge=3600
[Rust Version]: https://img.shields.io/badge/rust-1.81+-blue.svg?maxAge=3600
Rust wrapper for [LZ👌](https://github.com/jackoalan/lzokay), a minimal, MIT-licensed implementation of the
Pure-Rust port of [LZ👌](https://github.com/jackoalan/lzokay), a minimal, MIT-licensed implementation of the
[LZO compression format](http://www.oberhumer.com/opensource/lzo/).
See the original [README](https://github.com/jackoalan/lzokay/blob/master/README.md) for more information.
@ -21,29 +21,29 @@ See the original [README](https://github.com/jackoalan/lzokay/blob/master/README
### Usage
See the [compress](https://docs.rs/lzokay/latest/lzokay/compress)
or [decompress](https://docs.rs/lzokay/latest/lzokay/decompress)
documentation for reference.
In `Cargo.toml`:
```toml
[dependencies]
lzokay = "1.0.1"
lzokay = "2.0.0"
```
Or, to only enable certain features:
```toml
[dependencies.lzokay]
version = "1.0.1"
version = "2.0.0"
default-features = false
features = ["decompress", "compress"]
```
- `decompress`: Enables decompression functions.
- `compress`: Enables compression functions.
- `alloc`: Enables optional compression functions that perform heap allocation.
Without `std`, this uses `extern crate alloc`.
- `std`: Enables use of `std`. Implies `alloc`.
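
A minimal round-trip sketch using the default features (`compress` and `decompress` are the documented entry points; buffer sizing and error handling kept simple):

```rust
use lzokay::{compress::compress, decompress::decompress};

fn main() -> Result<(), lzokay::Error> {
    let input = b"hello hello hello hello hello";
    // Compress into a freshly allocated Vec<u8> (requires the `alloc` feature).
    let packed = compress(input)?;
    // The caller must size the output buffer for the decompressed data.
    let mut out = vec![0u8; input.len()];
    let n = decompress(&packed, &mut out)?;
    assert_eq!(&out[..n], &input[..]);
    Ok(())
}
```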

View File

@ -1,39 +0,0 @@
use std::{env, path::PathBuf};
fn main() {
println!("cargo:rerun-if-changed=wrapper.hpp");
println!("cargo:rerun-if-changed=lzokay/lzokay.cpp");
println!("cargo:rerun-if-changed=lzokay/lzokay.hpp");
cc::Build::new()
.cpp(true)
.file("lzokay/lzokay.cpp")
.flag_if_supported("-std=c++14") // GCC/Clang
.flag_if_supported("/std:c++14") // MSVC
.compile("lzokay");
#[allow(unused_mut)]
let mut bindings = bindgen::Builder::default()
.header("wrapper.hpp")
.clang_arg("-Ilzokay")
.allowlist_function("lzokay::.*")
.size_t_is_usize(true)
.ctypes_prefix("types")
.derive_debug(false)
.clang_arg("-std=c++14")
.parse_callbacks(Box::new(bindgen::CargoCallbacks::new()));
#[cfg(not(feature = "std"))]
{
bindings = bindings.layout_tests(false);
}
if matches!(env::var("CARGO_CFG_TARGET_OS"), Result::Ok(v) if v == "android") {
if let Result::Ok(cc) = env::var("TARGET_CXX") {
let mut sysroot = PathBuf::from(cc).with_file_name("../sysroot");
sysroot = sysroot.canonicalize().unwrap_or_else(|err| {
panic!("Failed to locate {}: {}", sysroot.to_string_lossy(), err)
});
bindings = bindings.clang_arg(format!("--sysroot={}", sysroot.to_string_lossy()));
}
}
let result = bindings.generate().expect("Unable to generate bindings");
let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
result.write_to_file(out_path.join("bindings.rs")).expect("Couldn't write bindings!");
}

lzokay

@ -1 +0,0 @@
Subproject commit db2df1fcbebc2ed06c10f727f72567d40f06a2be

View File

@ -2,7 +2,7 @@
//!
//! Available with feature `compress`.
//!
//! [`compress`] and [`compress_with_dict`] available with features `std` and/or `alloc`.
//! [`compress`] and [`compress_with_dict`] are available when the `alloc` feature is enabled.
//!
//! # Examples
//!
@ -42,7 +42,7 @@
//! // Allocate dst on stack, with worst-case compression size
//! let mut dst = [0u8; compress_worst_size(input.len())];
//! // Allocate dictionary storage on stack
//! let mut storage = [0u8; dict_storage_size()];
//! let mut storage = DictStorage::new();
//! // Create dictionary from storage
//! let mut dict = dict_from_storage(&mut storage);
//! let size = compress_no_alloc(&input, &mut dst, &mut dict)?;
@ -50,120 +50,638 @@
//! # Ok::<(), lzokay::Error>(())
//! ```
#[cfg(all(not(feature = "std"), feature = "alloc"))]
#[cfg(all(feature = "alloc", not(feature = "std")))]
extern crate alloc;
#[cfg(all(not(feature = "std"), feature = "alloc"))]
use alloc::{boxed::Box, vec::Vec};
#[cfg(all(feature = "alloc", not(feature = "std")))]
use alloc::{boxed::Box, vec, vec::Vec};
use core::{cmp, mem::size_of};
#[cfg(all(feature = "alloc", feature = "std"))]
use std::{boxed::Box, vec, vec::Vec};
use crate::Error;
#[cfg(feature = "alloc")]
use core::ptr::null_mut;
use core::{marker::PhantomData, mem::size_of};
use crate::{bindings, lzokay_result, Error};
type DictStorage = bindings::lzokay_DictBase_storage_type;
/// Dictionary type
pub struct Dict<'a> {
base: bindings::lzokay_DictBase,
#[cfg(feature = "alloc")]
storage: Option<Box<[u8; dict_storage_size()]>>,
phantom: PhantomData<&'a DictStorage>,
/// Compress `src` into a freshly allocated `Vec<u8>` using a temporary dictionary.
pub fn compress(src: &[u8]) -> Result<Vec<u8>, Error> {
let mut dict = new_dict();
compress_with_dict(src, &mut dict)
}
/// Creates a new heap-allocated dictionary.
#[cfg(feature = "alloc")]
pub fn new_dict() -> Dict<'static> {
let mut dict = Dict {
base: bindings::lzokay_DictBase { _storage: null_mut() },
storage: Option::Some(Box::new([0u8; dict_storage_size()])),
phantom: PhantomData,
};
dict.base._storage = dict.storage.as_mut().unwrap().as_mut_ptr() as *mut DictStorage;
dict
/// Compress `src`, reusing the provided dictionary across calls.
pub fn compress_with_dict(src: &[u8], dict: &mut Dict) -> Result<Vec<u8>, Error> {
let capacity = compress_worst_size(src.len());
let mut buf = vec![0u8; capacity];
let size = compress_no_alloc(src, &mut buf, dict)?;
buf.truncate(size);
Ok(buf)
}
/// Dictionary storage size, for manual or stack allocation.
pub const fn dict_storage_size() -> usize { size_of::<DictStorage>() }
/// Creates a dictionary from the supplied storage.
///
/// Storage **must** be at least [`dict_storage_size()`] bytes,
/// otherwise this function will panic.
pub fn dict_from_storage(storage: &mut [u8]) -> Dict<'_> {
if storage.len() < dict_storage_size() {
panic!(
"Dictionary storage is not large enough: {}, expected {}",
storage.len(),
dict_storage_size()
);
}
Dict {
base: bindings::lzokay_DictBase { _storage: storage.as_mut_ptr() as *mut DictStorage },
#[cfg(feature = "alloc")]
storage: Option::None,
phantom: PhantomData,
}
}
/// Worst-case compression size.
/// Worst-case compressed size according to the LZO format guarantees.
pub const fn compress_worst_size(s: usize) -> usize { s + s / 16 + 64 + 3 }
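// e.g. a 64 KiB input reserves 65536 + 65536 / 16 + 64 + 3 = 69699 output bytes.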
/// Compress the supplied buffer into a heap-allocated vector.
///
/// Creates a new dictionary for each invocation.
#[cfg(feature = "alloc")]
pub fn compress(src: &[u8]) -> Result<Vec<u8>, Error> { compress_with_dict(src, &mut new_dict()) }
/// Compress the supplied buffer into a heap-allocated vector,
/// with the supplied pre-allocated dictionary.
#[cfg(feature = "alloc")]
pub fn compress_with_dict(src: &[u8], dict: &mut Dict) -> Result<Vec<u8>, Error> {
let mut out_size = 0usize;
let capacity = compress_worst_size(src.len());
let mut dst = Vec::with_capacity(capacity);
let result = unsafe {
let result = bindings::lzokay_compress(
src.as_ptr(),
src.len(),
dst.as_mut_ptr(),
capacity,
&mut out_size,
&mut dict.base,
);
if result == bindings::lzokay_EResult_Success {
dst.set_len(out_size as usize);
}
result
};
lzokay_result(dst, result)
/// Compress without heap allocations, writing the output into `dst`.
pub fn compress_no_alloc(src: &[u8], dst: &mut [u8], dict: &mut Dict) -> Result<usize, Error> {
let storage = dict.storage_mut();
compress_impl(src, dst, storage)
}
/// Compress the supplied buffer.
///
/// For sizing `dst`, use [`compress_worst_size`].
pub fn compress_no_alloc(src: &[u8], dst: &mut [u8], dict: &mut Dict) -> Result<usize, Error> {
let mut out_size = 0usize;
let result = unsafe {
bindings::lzokay_compress(
src.as_ptr(),
src.len(),
dst.as_mut_ptr(),
dst.len(),
&mut out_size,
&mut dict.base,
)
};
lzokay_result(out_size as usize, result)
const HASH_SIZE: usize = 0x4000;
const MAX_DIST: usize = 0xBFFF;
const MAX_MATCH_LEN: usize = 0x800;
const BUF_SIZE: usize = MAX_DIST + MAX_MATCH_LEN;
const MAX_MATCH_TABLE: usize = 34;
const BUF_GUARD: usize = BUF_SIZE + MAX_MATCH_LEN;
const M1_MAX_OFFSET: u32 = 0x0400;
const M2_MAX_OFFSET: u32 = 0x0800;
const M3_MAX_OFFSET: u32 = 0x4000;
const M4_BASE_OFFSET: u32 = 0x4000;
const M2_MIN_LEN: u32 = 3;
const M2_MAX_LEN: u32 = 8;
const M3_MAX_LEN: u32 = 33;
const M4_MAX_LEN: u32 = 9;
const M3_MARKER: u8 = 0x20;
const M4_MARKER: u8 = 0x10;
/// Hash chains tracking recent 3-byte sequences, keeping per-key chains and
/// remembering the best match length at each node.
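/// A lookup starts at `head[key]` and follows `chain` links for at most
/// `chain_sz[key]` (capped at `MAX_MATCH_LEN`) candidate positions.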
#[derive(Clone)]
struct Match3 {
head: [u16; HASH_SIZE],
chain_sz: [u16; HASH_SIZE],
chain: [u16; BUF_SIZE],
best_len: [u16; BUF_SIZE],
}
impl Match3 {
const fn new() -> Self {
Self {
head: [0; HASH_SIZE],
chain_sz: [0; HASH_SIZE],
chain: [0; BUF_SIZE],
best_len: [0; BUF_SIZE],
}
}
#[inline]
fn make_key(bytes: &[u8]) -> usize {
let a = bytes[0] as u32;
let b = bytes[1] as u32;
let c = bytes[2] as u32;
let mix = (((a << 5) ^ b).wrapping_shl(5)) ^ c;
let prod = 0x9f5f_u32.wrapping_mul(mix);
((prod >> 5) & 0x3fff) as usize
}
#[inline]
fn get_head(&self, key: usize) -> u16 {
if self.chain_sz[key] == 0 {
u16::MAX
} else {
self.head[key]
}
}
fn init(&mut self) { self.chain_sz.fill(0); }
fn remove(&mut self, pos: usize, buffer: &[u8; BUF_GUARD]) {
let key = Self::make_key(&buffer[pos..]);
self.chain_sz[key] = self.chain_sz[key].saturating_sub(1);
}
/// Insert the current position into the hash chains and return the head
/// position alongside the bounded chain length to inspect.
fn advance(&mut self, state: &State, buffer: &[u8; BUF_GUARD]) -> (u16, u32) {
let key = Self::make_key(&buffer[state.wind_b as usize..]);
let head = self.get_head(key);
self.chain[state.wind_b as usize] = head;
let mut count = self.chain_sz[key] as u32;
self.chain_sz[key] = self.chain_sz[key].wrapping_add(1);
if count > MAX_MATCH_LEN as u32 {
count = MAX_MATCH_LEN as u32;
}
self.head[key] = state.wind_b as u16;
(head, count)
}
/// Fast path for known matches: advance the hash chains without searching.
fn skip_advance(&mut self, state: &State, buffer: &[u8; BUF_GUARD]) {
let key = Self::make_key(&buffer[state.wind_b as usize..]);
self.chain[state.wind_b as usize] = self.get_head(key);
self.head[key] = state.wind_b as u16;
self.best_len[state.wind_b as usize] = (MAX_MATCH_LEN + 1) as u16;
self.chain_sz[key] = self.chain_sz[key].wrapping_add(1);
}
}
/// Direct lookup table for 2-byte prefixes used to seed matches quickly.
#[derive(Clone)]
struct Match2 {
head: [u16; 1 << 16],
}
impl Match2 {
const fn new() -> Self { Self { head: [u16::MAX; 1 << 16] } }
#[inline]
fn make_key(bytes: &[u8]) -> usize { (bytes[0] as usize) ^ ((bytes[1] as usize) << 8) }
fn init(&mut self) { self.head.fill(u16::MAX); }
fn add(&mut self, pos: u16, buffer: &[u8; BUF_GUARD]) {
let key = Self::make_key(&buffer[pos as usize..]);
self.head[key] = pos;
}
fn remove(&mut self, pos: usize, buffer: &[u8; BUF_GUARD]) {
let key = Self::make_key(&buffer[pos..]);
if self.head[key] as usize == pos {
self.head[key] = u16::MAX;
}
}
/// Try to find a 2-byte prefix match at the current window position.
fn search(
&self,
state: &State,
lb_pos: &mut u32,
lb_len: &mut u32,
best_pos: &mut [u32; MAX_MATCH_TABLE],
buffer: &[u8; BUF_GUARD],
) -> bool {
let key = Self::make_key(&buffer[state.wind_b as usize..]);
let pos = self.head[key];
if pos == u16::MAX {
return false;
}
if best_pos[2] == 0 {
best_pos[2] = pos as u32 + 1;
}
if *lb_len < 2 {
*lb_len = 2;
*lb_pos = pos as u32;
}
true
}
}
/// Concrete storage backing a dictionary instance. Buffers and match tables are
/// stored side by side so the encoder can share logic across heap and stack
/// configurations.
#[derive(Clone)]
pub struct DictStorage {
match3: Match3,
match2: Match2,
buffer: [u8; BUF_GUARD],
}
impl DictStorage {
pub const fn new() -> Self {
Self { match3: Match3::new(), match2: Match2::new(), buffer: [0; BUF_GUARD] }
}
/// Initialize dictionary tables and preload the first window from `state.src`.
fn init(&mut self, state: &mut State<'_>) {
self.match3.init();
self.match2.init();
state.cycle1_countdown = MAX_DIST as u32;
state.inp = 0;
state.wind_sz = cmp::min(state.src.len(), MAX_MATCH_LEN) as u32;
state.wind_b = 0;
state.wind_e = state.wind_sz;
if state.wind_sz > 0 {
let len = state.wind_sz as usize;
self.buffer[..len].copy_from_slice(&state.src[..len]);
}
state.inp += state.wind_sz as usize;
if state.wind_sz < 3 {
let start = state.wind_b as usize + state.wind_sz as usize;
let end = start + (3 - state.wind_sz as usize);
self.buffer[start..end].fill(0);
}
}
/// Remove stale entries before the sliding window overwrites them.
fn reset_next_input_entry(&mut self, state: &mut State) {
if state.cycle1_countdown == 0 {
let pos = state.wind_e as usize;
self.match3.remove(pos, &self.buffer);
self.match2.remove(pos, &self.buffer);
} else {
state.cycle1_countdown -= 1;
}
}
/// Advance the dictionary by one position, returning the best match offset
/// and length. When `skip` is true the already-emitted match bytes are
/// fast-forwarded first so the dictionary stays aligned with the encoded
/// output.
fn advance(
&mut self,
state: &mut State,
prev_len: u32,
best_off: &mut [u32; MAX_MATCH_TABLE],
skip: bool,
) -> (u32, u32) {
if skip {
// Skip phase: advance through already-encoded match bytes while
// keeping the dictionary in sync with the emitted output.
for _ in 0..prev_len.saturating_sub(1) {
self.reset_next_input_entry(state);
self.match3.skip_advance(state, &self.buffer);
self.match2.add(state.wind_b as u16, &self.buffer);
state.get_byte(&mut self.buffer);
}
}
let mut lb_len = 1u32;
let mut lb_off = 0u32;
let mut lb_pos = 0u32;
let mut best_pos = [0u32; MAX_MATCH_TABLE];
let (match_head, mut match_count) = self.match3.advance(state, &self.buffer);
if match_head == u16::MAX {
match_count = 0;
}
let mut should_terminate = false;
let best_len = lb_len;
if lb_len >= state.wind_sz {
// Window exhausted: no further matches possible once we reach EOF.
if state.wind_sz == 0 {
should_terminate = true;
}
self.match3.best_len[state.wind_b as usize] = (MAX_MATCH_LEN + 1) as u16;
} else {
if self.match2.search(state, &mut lb_pos, &mut lb_len, &mut best_pos, &self.buffer)
&& state.wind_sz >= 3
{
let mut match_pos = match_head as usize;
for _ in 0..match_count {
if match_pos >= BUF_SIZE {
break;
}
let ref_pos = state.wind_b as usize;
let window = state.wind_sz as usize;
let mut matched = 0usize;
while matched < window
&& self.buffer[ref_pos + matched] == self.buffer[match_pos + matched]
{
matched += 1;
}
if matched >= 2 {
if matched < MAX_MATCH_TABLE && best_pos[matched] == 0 {
// Remember first occurrence for potential match length tweaks.
best_pos[matched] = match_pos as u32 + 1;
}
let matched_u32 = matched as u32;
if matched_u32 > lb_len {
lb_len = matched_u32;
lb_pos = match_pos as u32;
if lb_len == state.wind_sz
|| lb_len > self.match3.best_len[match_pos] as u32
{
break;
}
}
}
match_pos = self.match3.chain[match_pos] as usize;
}
}
if lb_len > best_len {
lb_off = state.pos2off(lb_pos);
}
self.match3.best_len[state.wind_b as usize] = lb_len as u16;
for i in 2..MAX_MATCH_TABLE {
best_off[i] = if best_pos[i] != 0 { state.pos2off(best_pos[i] - 1) } else { 0 };
}
}
self.reset_next_input_entry(state);
self.match2.add(state.wind_b as u16, &self.buffer);
state.get_byte(&mut self.buffer);
if should_terminate {
state.buf_sz = 0;
lb_len = 0;
} else {
// Buffer size counts the current byte plus the lookahead window.
state.buf_sz = state.wind_sz + 1;
}
state.bufp = state.inp - state.buf_sz as usize;
(lb_off, lb_len)
}
}
/// Sliding window state tracked while searching for matches.
struct State<'a> {
src: &'a [u8],
inp: usize,
wind_sz: u32,
wind_b: u32,
wind_e: u32,
cycle1_countdown: u32,
bufp: usize,
buf_sz: u32,
}
impl<'a> State<'a> {
/// Create a new window over `src`.
fn new(src: &'a [u8]) -> Self {
Self {
src,
inp: 0,
wind_sz: 0,
wind_b: 0,
wind_e: 0,
cycle1_countdown: 0,
bufp: 0,
buf_sz: 0,
}
}
/// Advance the window by one byte, copying from `src` and maintaining the
/// duplicated tail used for wrap-around reads.
fn get_byte(&mut self, buffer: &mut [u8; BUF_GUARD]) {
if self.inp >= self.src.len() {
if self.wind_sz > 0 {
self.wind_sz -= 1;
}
let idx = self.wind_e as usize;
buffer[idx] = 0;
if idx < MAX_MATCH_LEN {
buffer[BUF_SIZE + idx] = 0;
}
} else {
let value = self.src[self.inp];
let idx = self.wind_e as usize;
buffer[idx] = value;
if idx < MAX_MATCH_LEN {
buffer[BUF_SIZE + idx] = value;
}
self.inp += 1;
}
self.wind_e = (self.wind_e + 1) % BUF_SIZE as u32;
self.wind_b = (self.wind_b + 1) % BUF_SIZE as u32;
}
/// Convert a buffer index into a backwards distance within the window.
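/// e.g. `wind_b = 100`, `pos = 40` gives 60; after wrap-around,
/// `wind_b = 40`, `pos = 100` gives `BUF_SIZE - 60`.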
#[inline]
fn pos2off(&self, pos: u32) -> u32 {
if self.wind_b > pos {
self.wind_b - pos
} else {
BUF_SIZE as u32 - (pos - self.wind_b)
}
}
}
/// Internal representation for dictionaries, either borrowed or owned.
enum DictInner<'a> {
Borrowed(&'a mut DictStorage),
#[cfg(feature = "alloc")]
Owned(Box<DictStorage>),
}
/// Compression dictionary used to retain the sliding window between calls.
pub struct Dict<'a> {
inner: DictInner<'a>,
}
impl<'a> Dict<'a> {
/// Return the mutable storage backing this dictionary, regardless of
/// whether it is owned or borrowed.
fn storage_mut(&mut self) -> &mut DictStorage {
match &mut self.inner {
DictInner::Borrowed(storage) => storage,
#[cfg(feature = "alloc")]
DictInner::Owned(storage) => storage.as_mut(),
}
}
}
#[cfg(feature = "alloc")]
/// Create a heap-allocated dictionary with the canonical storage layout.
pub fn new_dict() -> Dict<'static> {
Dict { inner: DictInner::Owned(Box::new(DictStorage::new())) }
}
/// Total number of bytes required to back a dictionary.
pub const fn dict_storage_size() -> usize { size_of::<DictStorage>() }
/// Wrap user-provided storage (e.g. stack-allocated) inside a dictionary.
pub fn dict_from_storage(storage: &mut DictStorage) -> Dict<'_> {
Dict { inner: DictInner::Borrowed(storage) }
}
/// Emit the repeated zero-byte encoding used for long literal/match lengths.
fn write_zero_byte_length(
dst: &mut [u8],
out_pos: &mut usize,
mut len: usize,
) -> Result<(), Error> {
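// e.g. len = 600 emits [0, 0, 90]: each zero marker stands for 255, plus the 90 remainder.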
while len > 255 {
write_dst(dst, out_pos, &[0])?;
len -= 255;
}
write_dst(dst, out_pos, &[len as u8])?;
Ok(())
}
/// Emit a literal run following the LZO opcode rules.
fn encode_literal_run(
dst: &mut [u8],
out_pos: &mut usize,
src: &[u8],
lit_ptr: usize,
lit_len: usize,
) -> Result<(), Error> {
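// Length encoding summary (mirrors the branches below):
//   first run, len <= 238 -> one byte: 17 + len
//   len <= 3              -> stored in the low 2 bits of the previous opcode
//   len <= 18             -> one byte: len - 3
//   len > 18              -> a zero byte, then (len - 18) as zero-run bytes plus remainder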
if *out_pos == 0 && lit_len <= 238 {
write_dst(dst, out_pos, &[17 + lit_len as u8])?;
} else if lit_len <= 3 {
let idx = out_pos.checked_sub(2).ok_or(Error::OutputOverrun)?;
*dst_byte_mut(dst, idx)? |= lit_len as u8;
} else if lit_len <= 18 {
write_dst(dst, out_pos, &[(lit_len - 3) as u8])?;
} else {
write_dst(dst, out_pos, &[0])?;
write_zero_byte_length(dst, out_pos, lit_len - 18)?;
}
let src_chunk = src.get(lit_ptr..lit_ptr + lit_len).ok_or(Error::InputOverrun)?;
write_dst(dst, out_pos, src_chunk)?;
Ok(())
}
/// Emit a back-reference according to the LZOKAY/LZO opcode encoding.
fn encode_lookback_match(
dst: &mut [u8],
out_pos: &mut usize,
lb_len: u32,
mut lb_off: u32,
last_lit_len: u32,
) -> Result<(), Error> {
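// e.g. lb_len = 6, lb_off = 500 fits the two-byte M2 class below:
//   byte0 = ((6 - 1) << 5) | ((499 & 0x7) << 2) = 0xAC, byte1 = 499 >> 3 = 62.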
if lb_len == 2 {
lb_off -= 1;
write_dst(dst, out_pos, &[((lb_off & 0x3) << 2) as u8, (lb_off >> 2) as u8])?;
} else if lb_len <= M2_MAX_LEN && lb_off <= M2_MAX_OFFSET {
lb_off -= 1;
write_dst(dst, out_pos, &[
(((lb_len - 1) << 5) | ((lb_off & 0x7) << 2)) as u8,
(lb_off >> 3) as u8,
])?;
} else if lb_len == M2_MIN_LEN && lb_off <= M1_MAX_OFFSET + M2_MAX_OFFSET && last_lit_len >= 4 {
lb_off -= 1 + M2_MAX_OFFSET;
write_dst(dst, out_pos, &[((lb_off & 0x3) << 2) as u8, (lb_off >> 2) as u8])?;
} else if lb_off <= M3_MAX_OFFSET {
lb_off -= 1;
if lb_len <= M3_MAX_LEN {
write_dst(dst, out_pos, &[M3_MARKER | (lb_len as u8 - 2)])?;
} else {
let extra = (lb_len - M3_MAX_LEN) as usize;
write_dst(dst, out_pos, &[M3_MARKER])?;
write_zero_byte_length(dst, out_pos, extra)?;
}
write_dst(dst, out_pos, &[(lb_off << 2) as u8, (lb_off >> 6) as u8])?;
} else {
lb_off -= M4_BASE_OFFSET;
if lb_len <= M4_MAX_LEN {
write_dst(dst, out_pos, &[M4_MARKER
| (((lb_off & 0x4000) >> 11) as u8)
| (lb_len as u8 - 2)])?;
} else {
let extra = (lb_len - M4_MAX_LEN) as usize;
write_dst(dst, out_pos, &[M4_MARKER | (((lb_off & 0x4000) >> 11) as u8)])?;
write_zero_byte_length(dst, out_pos, extra)?;
}
write_dst(dst, out_pos, &[(lb_off << 2) as u8, (lb_off >> 6) as u8])?;
}
Ok(())
}
/// Apply the heuristics that prefer cheaper opcodes when a shorter match can be
/// emitted at a closer distance.
fn find_better_match(best_off: &[u32; MAX_MATCH_TABLE], lb_len: &mut u32, lb_off: &mut u32) {
let len = *lb_len;
let off = *lb_off;
if len <= M2_MIN_LEN || off <= M2_MAX_OFFSET {
return;
}
// Prefer re-encoding long matches as cheaper opcodes whenever the distance
// permits switching to a shorter back-reference class.
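// e.g. len = 5 at off = 3000 needs a three-byte M3 opcode, but if
// best_off[4] = 500, a len-4 match fits the cheaper two-byte M2 class.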
if off > M2_MAX_OFFSET
&& len >= M2_MIN_LEN + 1
&& len <= M2_MAX_LEN + 1
&& best_off[len as usize - 1] != 0
&& best_off[len as usize - 1] <= M2_MAX_OFFSET
{
*lb_len = len - 1;
*lb_off = best_off[len as usize - 1];
} else if off > M3_MAX_OFFSET
&& len >= M4_MAX_LEN + 1
&& len <= M2_MAX_LEN + 2
&& best_off[len as usize - 2] != 0
&& best_off[len as usize - 2] <= M2_MAX_OFFSET
{
*lb_len = len - 2;
*lb_off = best_off[len as usize - 2];
} else if off > M3_MAX_OFFSET
&& len >= M4_MAX_LEN + 1
&& len <= M3_MAX_LEN + 1
&& best_off[len as usize - 1] != 0
&& best_off[len as usize - 1] <= M3_MAX_OFFSET
{
*lb_len = len - 1;
*lb_off = best_off[len as usize - 1];
}
}
/// Core compression routine shared by the heap-allocating and stack variants.
/// Maintains the window management and opcode selection heuristics required by
/// the LZO format while using safe Rust semantics.
fn compress_impl(src: &[u8], dst: &mut [u8], storage: &mut DictStorage) -> Result<usize, Error> {
let mut state = State::new(src);
storage.init(&mut state);
let mut out_pos = 0usize;
let mut lit_len = 0u32;
let mut best_off = [0u32; MAX_MATCH_TABLE];
let mut lit_ptr = state.inp;
let (mut lb_off, mut lb_len) = storage.advance(&mut state, 0, &mut best_off, false);
while state.buf_sz > 0 {
if lit_len == 0 {
// Capture the starting point for the next literal run.
lit_ptr = state.bufp;
}
if lb_len < 2
|| (lb_len == 2 && (lb_off > M1_MAX_OFFSET || lit_len == 0 || lit_len >= 4))
|| (lb_len == 2 && out_pos == 0)
|| (out_pos == 0 && lit_len == 0)
{
lb_len = 0;
} else if lb_len == M2_MIN_LEN && lb_off > M1_MAX_OFFSET + M2_MAX_OFFSET && lit_len >= 4 {
lb_len = 0;
}
if lb_len == 0 {
lit_len += 1;
// No match chosen: step forward by one literal byte.
let (next_off, next_len) = storage.advance(&mut state, 0, &mut best_off, false);
lb_off = next_off;
lb_len = next_len;
continue;
}
find_better_match(&best_off, &mut lb_len, &mut lb_off);
encode_literal_run(dst, &mut out_pos, src, lit_ptr, lit_len as usize)?;
encode_lookback_match(dst, &mut out_pos, lb_len, lb_off, lit_len)?;
let prev_len = lb_len;
lit_len = 0;
// Advance over the matched bytes, updating the search structures.
let (next_off, next_len) = storage.advance(&mut state, prev_len, &mut best_off, true);
lb_off = next_off;
lb_len = next_len;
}
// Flush any trailing literal bytes.
encode_literal_run(dst, &mut out_pos, src, lit_ptr, lit_len as usize)?;
// Emit terminating M4 instruction (distance 0x4000, length 3).
write_dst(dst, &mut out_pos, &[M4_MARKER | 1, 0, 0])?;
Ok(out_pos)
}
#[inline(always)]
fn dst_byte_mut<'a>(dst: &'a mut [u8], idx: usize) -> Result<&'a mut u8, Error> {
dst.get_mut(idx).ok_or(Error::OutputOverrun)
}
#[inline(always)]
fn write_dst(dst: &mut [u8], out_pos: &mut usize, slice: &[u8]) -> Result<(), Error> {
let pos = *out_pos;
let end = pos.checked_add(slice.len()).ok_or(Error::OutputOverrun)?;
let dst_chunk = dst.get_mut(pos..end).ok_or(Error::OutputOverrun)?;
dst_chunk.copy_from_slice(slice);
*out_pos = end;
Ok(())
}
#[cfg(test)]
mod tests {
#[cfg(feature = "alloc")]
use crate::compress::{compress, compress_with_dict, new_dict};
use crate::compress::{
compress_no_alloc, compress_worst_size, dict_from_storage, dict_storage_size,
};
use super::{compress, compress_with_dict, new_dict};
use super::{compress_no_alloc, compress_worst_size, dict_from_storage, DictStorage};
const INPUT_1: &[u8] = include_bytes!("test1.txt");
const EXPECTED_1: &[u8] = include_bytes!("test1.bin");
@ -183,7 +701,6 @@ mod tests {
let mut dict = new_dict();
let dst = compress_with_dict(INPUT_1, &mut dict).expect("Failed to compress (1)");
assert_eq!(dst, EXPECTED_1);
// Compress a second time to test dictionary reuse
let dst = compress_with_dict(INPUT_2, &mut dict).expect("Failed to compress (2)");
assert_eq!(dst, EXPECTED_2);
}
@ -191,12 +708,11 @@ mod tests {
#[test]
fn test_compress_no_alloc() {
let mut dst = [0u8; compress_worst_size(INPUT_1.len())];
let mut storage = [0u8; dict_storage_size()];
let mut storage = DictStorage::new();
let mut dict = dict_from_storage(&mut storage);
let out_size =
compress_no_alloc(INPUT_1, &mut dst, &mut dict).expect("Failed to compress (1)");
assert_eq!(&dst[0..out_size], EXPECTED_1);
// Compress a second time to test dictionary reuse
let out_size =
compress_no_alloc(INPUT_2, &mut dst, &mut dict).expect("Failed to compress (2)");
assert_eq!(&dst[0..out_size], EXPECTED_2);

View File

@ -16,23 +16,238 @@
//! # Ok::<(), lzokay::Error>(())
//! ```
use crate::{bindings, lzokay_result, Error};
use crate::Error;
/// Maximum repeat count representable via zero marker bytes when extending
/// literal or match lengths.
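/// Bounding the run keeps `offset * 255 + tail` from overflowing `usize`.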
const MAX255_COUNT: usize = usize::MAX / 255 - 2;
/// Opcode marker for mid-range matches (labelled "M3" in the LZO reference).
const M3_MARKER: u8 = 0x20;
/// Opcode marker for far matches ("M4") and the terminator instruction.
const M4_MARKER: u8 = 0x10;
/// Decompress `src` into `dst`.
///
/// `dst` must be large enough to hold the entire decompressed output.
/// `dst` must be large enough to hold the entire decompressed output. The
/// function follows the documented LZO opcode semantics and state transitions.
pub fn decompress(src: &[u8], dst: &mut [u8]) -> Result<usize, Error> {
let mut out_size = 0usize;
let result = unsafe {
bindings::lzokay_decompress(
src.as_ptr(),
src.len(),
dst.as_mut_ptr(),
dst.len(),
&mut out_size,
)
};
lzokay_result(out_size as usize, result)
if src.len() < 3 {
return Err(Error::InputOverrun);
}
let mut inp = 0usize;
let mut outp = 0usize;
let mut state = 0usize;
let mut nstate: usize;
let mut lblen: usize;
let mut lbcur: usize;
let first = input_byte(src, &mut inp)?;
// The LZO bitstream reserves the first byte for literal priming. Codes >= 22
// copy a literal block immediately; 18..21 seed the literal countdown (`state`).
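// e.g. first = 48: copy 48 - 17 = 31 literal bytes, then state = 4.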
if first >= 22 {
let len = (first as usize) - 17;
copy_slice(src, &mut inp, dst, &mut outp, len)?;
state = 4;
} else if first >= 18 {
nstate = (first as usize) - 17;
state = nstate;
copy_slice(src, &mut inp, dst, &mut outp, nstate)?;
}
loop {
let inst = input_byte(src, &mut inp)?;
if inst & 0xC0 != 0 {
// [M2]
// 1 L L D D D S S (128..255)
// Copy 5-8 bytes from block within 2kB distance
// state = S
// length = 5 + L
// 0 1 L D D D S S (64..127)
// Copy 3-4 bytes from block within 2kB distance
// length = 3 + L
// Always followed by one byte: distance = (next << 3) + D + 1
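// e.g. inst = 0xA9 (1 01 010 01), next = 0x02:
//   length = (0xA9 >> 5) + 1 = 6, distance = (2 << 3) + 2 + 1 = 19, state = 1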
let next = input_byte(src, &mut inp)?;
let distance = ((next as usize) << 3) + (((inst as usize) >> 2) & 0x7) + 1;
lbcur = outp.checked_sub(distance).ok_or(Error::LookbehindOverrun)?;
lblen = ((inst as usize) >> 5) + 1;
nstate = (inst as usize) & 0x3;
} else if inst & M3_MARKER != 0 {
// [M3]
// 0 0 1 L L L L L (32..63)
// Copy from <= 16kB distance
// length = 2 + L; if L == 0: 2 + 31 + 255 * zero-run count + tail byte
// Followed by LE16: distance = (value >> 2) + 1, state = value & 3
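// e.g. inst = 0x25, LE16 value = 262: length = 5 + 2 = 7,
// distance = (262 >> 2) + 1 = 66, state = 262 & 3 = 2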
lblen = ((inst as usize) & 0x1F) + 2;
if lblen == 2 {
let offset = consume_zero_byte_length(src, &mut inp)?;
let tail = input_byte(src, &mut inp)?;
lblen += offset * 255 + 31 + tail as usize;
}
let raw = read_le16(src, &mut inp)?;
let distance = ((raw as usize) >> 2) + 1;
lbcur = outp.checked_sub(distance).ok_or(Error::LookbehindOverrun)?;
nstate = (raw as usize) & 0x3;
} else if inst & M4_MARKER != 0 {
// [M4]
// 0 0 0 1 H L L L (16..31)
// Copy from 16..48kB distance
// length = 2 + L; if L == 0: 2 + 7 + 255 * zero-run count + tail byte
// Followed by LE16: distance = 16384 + (H << 14) + value, state = value & 3
// Terminating opcode when distance == 16384.
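// e.g. inst = 0x19 (H = 1, L = 1), LE16 value = 8: length = 1 + 2 = 3,
// distance = 16384 + (1 << 14) + 2 = 32770, state = 0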
lblen = ((inst as usize) & 0x7) + 2;
if lblen == 2 {
let offset = consume_zero_byte_length(src, &mut inp)?;
let tail = input_byte(src, &mut inp)?;
lblen += offset * 255 + 7 + tail as usize;
}
let raw = read_le16(src, &mut inp)?;
let base_dist = ((inst as usize & 0x8) << 11) + ((raw as usize) >> 2);
if base_dist == 0 {
// Stream finished
break;
}
let distance = base_dist + 16384;
lbcur = outp.checked_sub(distance).ok_or(Error::LookbehindOverrun)?;
nstate = (raw as usize) & 0x3;
} else {
if state == 0 {
// [Literal]
// 0 0 0 0 L L L L (0..15)
// Copy long literal string: length = 3 + L; if L == 0: 3 + 15 + 255 * zero-run count + tail byte.
let mut len = inst as usize + 3;
if len == 3 {
let offset = consume_zero_byte_length(src, &mut inp)?;
let tail = input_byte(src, &mut inp)?;
len += offset * 255 + 15 + tail as usize;
}
copy_slice(src, &mut inp, dst, &mut outp, len)?;
state = 4;
continue;
} else if state != 4 {
// [M1, short]
// state = 1..3
// 0 0 0 0 D D S S (0..15)
// Copy 2 bytes within 1kB distance, state = S afterwards.
let tail = input_byte(src, &mut inp)?;
let distance = ((inst as usize) >> 2) + ((tail as usize) << 2) + 1;
lbcur = outp.checked_sub(distance).ok_or(Error::LookbehindOverrun)?;
lblen = 2;
nstate = (inst as usize) & 0x3;
} else {
// [M1, long]
// state == 4
// 0 0 0 0 D D S S (0..15)
// Copy 3 bytes within 2..3kB distance, state = S afterwards.
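// e.g. inst = 0x06, tail = 0x10: distance = 1 + 64 + 2049 = 2114,
// copy 3 bytes, then state = 0x06 & 3 = 2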
let tail = input_byte(src, &mut inp)?;
let distance = ((inst as usize) >> 2) + ((tail as usize) << 2) + 2049;
lbcur = outp.checked_sub(distance).ok_or(Error::LookbehindOverrun)?;
lblen = 3;
nstate = (inst as usize) & 0x3;
}
}
// Copy the lookback run (source and destination may overlap).
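// e.g. distance 1 with length 10 re-reads bytes written earlier in this same
// run (an RLE-style fill), which is why a byte-wise copy is used.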
if lblen > 0 {
let out_end = outp.checked_add(lblen).ok_or(Error::OutputOverrun)?;
let lb_end = lbcur.checked_add(lblen).ok_or(Error::OutputOverrun)?;
if out_end > dst.len() || lb_end > dst.len() {
return Err(Error::OutputOverrun);
}
for i in 0..lblen {
dst[outp + i] = dst[lbcur + i];
}
outp = out_end;
}
// Copy the following literal run dictated by `nstate`.
copy_slice(src, &mut inp, dst, &mut outp, nstate)?;
state = nstate;
}
// The stream must end with the terminating M4 instruction (length == 3).
if lblen != 3 {
return Err(Error::Error);
}
if inp == src.len() {
Ok(outp)
} else if inp < src.len() {
Err(Error::InputNotConsumed)
} else {
Err(Error::InputOverrun)
}
}
/// Read a single byte from `src`.
#[inline(always)]
fn input_byte(src: &[u8], idx: &mut usize) -> Result<u8, Error> {
let n = src.get(*idx).copied().ok_or(Error::InputOverrun)?;
*idx += 1;
Ok(n)
}
/// Read a slice of length `len` starting at `start` from `src`.
#[inline(always)]
fn input_slice<'a>(src: &'a [u8], start: &mut usize, len: usize) -> Result<&'a [u8], Error> {
let end = start.checked_add(len).ok_or(Error::InputOverrun)?;
let slice = src.get(*start..end).ok_or(Error::InputOverrun)?;
*start = end;
Ok(slice)
}
/// Read a little-endian `u16` starting at `pos`.
#[inline(always)]
fn read_le16(bytes: &[u8], pos: &mut usize) -> Result<u16, Error> {
let slice = input_slice(bytes, pos, 2)?;
Ok(u16::from_le_bytes(slice.try_into().unwrap()))
}
/// Get a mutable slice of length `len` starting at `start` from `dst`.
#[inline(always)]
fn output_slice<'a>(
dst: &'a mut [u8],
start: &mut usize,
len: usize,
) -> Result<&'a mut [u8], Error> {
let end = start.checked_add(len).ok_or(Error::OutputOverrun)?;
let slice = dst.get_mut(*start..end).ok_or(Error::OutputOverrun)?;
*start = end;
Ok(slice)
}
/// Copy a slice from `src` to `dst`.
#[inline(always)]
fn copy_slice(
src: &[u8],
src_start: &mut usize,
dst: &mut [u8],
dst_start: &mut usize,
len: usize,
) -> Result<(), Error> {
if len == 0 {
return Ok(());
}
let src_slice = input_slice(src, src_start, len)?;
let dst_slice = output_slice(dst, dst_start, len)?;
dst_slice.copy_from_slice(src_slice);
Ok(())
}
/// Consume a run of zero marker bytes used for long length encodings.
#[inline(always)]
fn consume_zero_byte_length(src: &[u8], inp: &mut usize) -> Result<usize, Error> {
let start = *inp;
while src.get(*inp).copied() == Some(0) {
*inp += 1;
}
let offset = *inp - start;
if offset > MAX255_COUNT {
Err(Error::Error)
} else {
Ok(offset)
}
}
#[cfg(test)]

View File

@ -1,7 +1,7 @@
#![cfg_attr(not(feature = "std"), no_std)]
//! # LZ👌-rs
//!
//! Rust wrapper for [LZ👌](https://github.com/jackoalan/lzokay), a minimal, MIT-licensed
//! Pure-Rust port of [LZ👌](https://github.com/jackoalan/lzokay), a minimal, MIT-licensed
//! implementation of the [LZO compression format](http://www.oberhumer.com/opensource/lzo/).
//!
//! See the original [README](https://github.com/jackoalan/lzokay/blob/master/README.md) for more information.
@ -20,14 +20,14 @@
//!
//! ```toml
//! [dependencies]
//! lzokay = "1.0.1"
//! lzokay = "2.0.0"
//! ```
//!
//! Or, to only enable certain features:
//!
//! ```toml
//! [dependencies.lzokay]
//! version = "1.0.1"
//! version = "2.0.0"
//! default-features = false
//! features = ["decompress", "compress"]
//! ```
@ -49,36 +49,8 @@ pub mod compress;
#[cfg(feature = "decompress")]
pub mod decompress;
mod bindings {
#![allow(unknown_lints)]
#![allow(non_upper_case_globals)]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]
#![allow(deref_nullptr)]
#![allow(dead_code)]
#[cfg(not(feature = "std"))]
mod types {
pub type c_uchar = u8;
pub type c_ushort = u16;
pub type c_uint = u32;
pub type c_int = i32;
pub type c_ulong = usize;
pub type c_ulonglong = usize;
}
#[cfg(feature = "std")]
mod types {
pub type c_uchar = ::std::os::raw::c_uchar;
pub type c_ushort = ::std::os::raw::c_ushort;
pub type c_uint = ::std::os::raw::c_uint;
pub type c_int = ::std::os::raw::c_int;
pub type c_ulong = usize;
pub type c_ulonglong = usize;
}
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
}
/// Error result codes
#[derive(Debug, Eq, PartialEq)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Error {
/// Likely indicates bad compressed LZO input.
LookbehindOverrun,
@ -92,20 +64,20 @@ pub enum Error {
InputNotConsumed,
}
fn lzokay_result<T>(result: T, error: bindings::lzokay_EResult) -> Result<T, Error> {
if error == bindings::lzokay_EResult_Success {
Result::Ok(result)
} else {
Result::Err(match error {
bindings::lzokay_EResult_LookbehindOverrun => Error::LookbehindOverrun,
bindings::lzokay_EResult_OutputOverrun => Error::OutputOverrun,
bindings::lzokay_EResult_InputOverrun => Error::InputOverrun,
bindings::lzokay_EResult_InputNotConsumed => Error::InputNotConsumed,
_ => Error::Error,
})
impl core::fmt::Display for Error {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
match self {
Error::LookbehindOverrun => write!(f, "lookbehind overrun"),
Error::OutputOverrun => write!(f, "output overrun"),
Error::InputOverrun => write!(f, "input overrun"),
Error::Error => write!(f, "unknown error"),
Error::InputNotConsumed => write!(f, "input not consumed"),
}
}
}
impl core::error::Error for Error {}
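With `Display` and `core::error::Error` implemented (the latter stable since Rust 1.81, matching the MSRV bump above), `Error` now composes with `?` and boxed errors. A minimal sketch under `std`; the `roundtrip` helper is illustrative:

```rust
fn roundtrip(data: &[u8]) -> Result<Vec<u8>, Box<dyn std::error::Error>> {
    let packed = lzokay::compress::compress(data)?;
    let mut out = vec![0u8; data.len()];
    let n = lzokay::decompress::decompress(&packed, &mut out)?;
    out.truncate(n);
    Ok(out)
}
```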
#[cfg(test)]
#[cfg(all(feature = "compress", feature = "decompress", feature = "alloc"))]
mod tests {
@ -117,13 +89,22 @@ mod tests {
use super::{compress::compress, decompress::decompress};
const INPUT: &[u8] = include_bytes!("test1.txt");
const INPUT1: &[u8] = include_bytes!("test1.txt");
const INPUT2: &[u8] = include_bytes!("test2.txt");
#[test]
fn test_round_trip() {
let compressed = compress(INPUT).expect("Failed to compress");
let mut dst = vec![0u8; INPUT.len()];
fn test_round_trip1() {
let compressed = compress(INPUT1).expect("Failed to compress");
let mut dst = vec![0u8; INPUT1.len()];
decompress(&compressed, &mut dst).expect("Failed to decompress");
assert_eq!(INPUT, dst.as_slice());
assert_eq!(INPUT1, dst.as_slice());
}
#[test]
fn test_round_trip2() {
let compressed = compress(INPUT2).expect("Failed to compress");
let mut dst = vec![0u8; INPUT2.len()];
decompress(&compressed, &mut dst).expect("Failed to decompress");
assert_eq!(INPUT2, dst.as_slice());
}
}

View File

@ -1 +0,0 @@
#include <lzokay.hpp>