Compare commits

..

8 Commits

7 changed files with 555 additions and 202 deletions

View File

@@ -4,3 +4,4 @@ version = "0.1.0"
edition = "2021" edition = "2021"
[dependencies] [dependencies]
num = "0.4.3"

46
examples/example.rs Normal file
View File

@@ -0,0 +1,46 @@
use sac::model::Model;
use sac::modelA::ModelA;
/// Sample text that `main` compresses and then decompresses again.
// NOTE(review): several lines of this literal stop mid-word (e.g. "I've re" followed by
// "aken"), which suggests the text was clipped somewhere upstream — confirm against the
// intended source before relying on its exact contents.
const DATA: &[u8] = b"
I'd just like to interject for a moment. What you're refering to as Linux, is in fact, GNU/Linux, or as I've re
aken to calling it, GNU plus Linux. Linux is not an operating system unto itself, but rather another free compo
a fully functioning GNU system made useful by the GNU corelibs, shell utilities and vital system components com
a full OS as defined by POSIX.
Many computer users run a modified version of the GNU system every day, without realizing it. Through a peculia
f events, the version of GNU which is widely used today is often called Linux, and many of its users are not aw
it is basically the GNU system, developed by the GNU Project.
There really is a Linux, and these people are using it, but it is just a part of the system they use. Linux is
el: the program in the system that allocates the machine's resources to the other programs that you run. The ke
an essential part of an operating system, but useless by itself; it can only function in the context of a compl
ating system. Linux is normally used in combination with the GNU operating system: the whole system is basicall
th Linux added, or GNU/Linux. All the so-called Linux distributions are really distributions of GNU/Linux!
";
/// Compresses `DATA` with a fresh `ModelA`, reports the size and ratio, then
/// decompresses the result with a second fresh model and prints the text.
fn main() {
    type CodeValue = u32;

    println!(
        "Using model: ModelA<{}>",
        std::any::type_name::<CodeValue>()
    );
    let encoder: ModelA<CodeValue> = ModelA::default();
    encoder.print_metrics();
    println!();

    // `compress` consumes the model, so the decoder below builds its own.
    let mut compressed = Vec::new();
    println!("compressing...");
    encoder.compress(DATA, &mut compressed).unwrap();
    println!("ModelA compressed to {} bytes", compressed.len());
    let ratio = DATA.len() as f64 / compressed.len() as f64;
    println!("Compression Ratio: {}", ratio);
    println!();

    println!("decompressing...");
    let mut decompressed = Vec::new();
    let decoder: ModelA<CodeValue> = ModelA::default();
    decoder.decompress(&compressed, &mut decompressed).unwrap();
    println!("{}", String::from_utf8_lossy(&decompressed));
}

View File

@@ -1,87 +1,181 @@
#[derive(Debug)] use std::io::{self, Bytes, Cursor, Read, Write};
pub struct BitWriter {
data: Vec<u8>, pub struct BitWriter<'a, W: ?Sized + Write> {
bits: u8, bits: u8,
nextbit: usize, nextbit: usize,
output: &'a mut W,
} }
impl BitWriter { impl<'a, W: Write> From<&'a mut W> for BitWriter<'a, W> {
pub fn new() -> Self { fn from(value: &'a mut W) -> Self {
BitWriter::new(value)
}
}
impl<'a, W: Write> BitWriter<'a, W> {
pub fn new(writer: &'a mut W) -> Self {
return BitWriter { return BitWriter {
data: vec![],
bits: 0, bits: 0,
nextbit: 7, nextbit: 7,
output: writer,
}; };
//writer.into()
} }
pub fn write(&mut self, bit: bool) { pub fn write(&mut self, bit: bool) -> io::Result<()> {
if bit { if bit {
self.bits = 1 << self.nextbit | self.bits; self.bits = 1 << self.nextbit | self.bits;
} }
if self.nextbit == 0 { if self.nextbit == 0 {
self.data.push(self.bits); self.output.write(&[self.bits])?;
self.bits = 0; self.bits = 0;
self.nextbit = 7; self.nextbit = 7;
} else { } else {
self.nextbit -= 1; self.nextbit -= 1;
} }
Ok(())
} }
pub fn flush(mut self) -> Vec<u8> { pub fn flush(self) -> std::io::Result<()> {
if self.bits != 0 { if self.bits != 0 {
self.data.push(self.bits); self.output.write(&[self.bits])?;
} }
return self.data; return self.output.flush();
} }
} }
trait Poppable { pub struct BitReader<T> {
fn pop(&mut self) -> Option<u8>; next: u8,
bits: u8,
repeat_bits: i32,
input: Bytes<T>,
} }
impl Poppable for &[u8] {
fn pop(&mut self) -> Option<u8> { impl<R: Read> From<R> for BitReader<R> {
if self.len() == 0 { fn from(value: R) -> Self {
return None; BitReader::new(value.bytes())
}
}
impl<T: AsRef<[u8]>> From<T> for BitReader<Cursor<T>> {
fn from(value: T) -> Self {
let c = Cursor::new(value);
c.into()
}
}
impl<R: Read> BitReader<R> {
pub fn new(value: Bytes<R>) -> Self {
BitReader {
next: 0,
bits: 0,
repeat_bits: 0,
input: value,
} }
let out = self[0];
*self = &self[1..];
return Some(out);
} }
} pub fn get_bit(&mut self) -> io::Result<bool> {
if self.next == 0 {
pub struct BitReader<'a> { let next = self.input.next().transpose()?;
data: &'a [u8], if let Some(byte) = next {
// bits: u8, self.bits = byte;
nextbit: usize, } else if self.repeat_bits <= 0 {
} return Err(io::Error::other("No more bits"));
} else {
impl<'a> BitReader<'a> { self.repeat_bits -= 8;
pub fn new(data: &'a [u8]) -> Self { }
BitReader { data, nextbit: 7 } self.next = 1 << 7;
}
}
impl<'a> From<&'a [u8]> for BitReader<'a> {
fn from(value: &'a [u8]) -> Self {
BitReader::new(value)
}
}
impl BitReader<'_> {
pub fn pop(&mut self) -> Option<bool> {
if self.data.len() == 0 {
return None;
} }
let bit = (self.data[0] >> self.nextbit) & 1; let bit = (self.bits & self.next) > 0;
if self.nextbit == 0 { self.next = self.next >> 1;
self.data.pop(); return Ok(bit);
self.nextbit = 8; }
} }
self.nextbit -= 1; impl<T> BitReader<T> {
return Some(bit > 0); pub fn with_repeat_bits(mut self, n_bits: u16) -> Self {
self.repeat_bits = n_bits as i32;
self
} }
} }
impl Iterator for BitReader<'_> { #[cfg(test)]
type Item = bool; mod tests {
fn next(&mut self) -> Option<Self::Item> { use super::*;
self.pop() use crate::model::Metrics;
use crate::modelA::tests::COMPRESSED_BYTES;
/// Independent bit reader over a byte slice, used by the tests in this module as the
/// reference that `BitReader` is compared against.
struct InputBits<'a> {
/// Bytes that have not been consumed yet.
input: &'a [u8],
/// Byte the bits are currently being taken from.
current_byte: u32,
/// Mask of the bit handed out most recently; `1` means the next call must fetch a new byte.
last_mask: u32,
/// Remaining allowance of bits that may still be served once `input` is empty
/// (reduced by 8 each time a byte would have been needed).
code_value_bits: i32,
}
/// Something that can give up its front byte.
pub trait Poppable {
    /// Removes and returns the first byte, or `None` when nothing is left.
    fn pop(&mut self) -> Option<u8>;
}

impl Poppable for &[u8] {
    /// Returns the first byte and re-points the slice at the remainder.
    fn pop(&mut self) -> Option<u8> {
        let (&front, rest) = self.split_first()?;
        *self = rest;
        Some(front)
    }
}
impl<'a> InputBits<'a> {
    /// Starts reading at the first bit of `data`, with a padding allowance of
    /// `T::CODE_VALUE_BITS` bits for use after the data runs out.
    pub fn new<T: Metrics>(data: &'a [u8]) -> Self {
        let code_value_bits = T::CODE_VALUE_BITS as i32;
        Self {
            input: data,
            current_byte: 0,
            // A mask of 1 makes the very first `get_bit` call load a byte.
            last_mask: 1,
            code_value_bits,
        }
    }

    /// Returns the next bit, most significant bit of each byte first.
    ///
    /// Once the input is empty the last loaded byte keeps being replayed until the
    /// padding allowance is used up; after that `None` is returned.
    fn get_bit(&mut self) -> Option<bool> {
        if self.last_mask != 1 {
            self.last_mask >>= 1;
        } else {
            if let Some(byte) = self.input.pop() {
                self.current_byte = u32::from(byte);
            } else if self.code_value_bits > 0 {
                self.code_value_bits -= 8;
            } else {
                return None;
            }
            self.last_mask = 0x80;
        }
        Some(self.current_byte & self.last_mask != 0)
    }
}
// Each test below runs the shared `bit_reader_test_type` comparison for one code-value
// type, so that every supported `CODE_VALUE_BITS` padding allowance is exercised.
#[test]
fn bit_reader_test_i32() {
bit_reader_test_type::<i32>();
}
#[test]
fn bit_reader_test_u32() {
bit_reader_test_type::<u32>();
}
#[test]
fn bit_reader_test_i64() {
bit_reader_test_type::<i64>();
}
#[test]
fn bit_reader_test_u64() {
bit_reader_test_type::<u64>();
}
#[test]
fn bit_reader_test_i128() {
bit_reader_test_type::<i128>();
}
fn bit_reader_test_type<T: Metrics>() {
let mut br = BitReader::from(COMPRESSED_BYTES).with_repeat_bits(T::CODE_VALUE_BITS as u16);
let mut ib = InputBits::new::<T>(&COMPRESSED_BYTES);
while let Some(a) = ib.get_bit() {
let b = br.get_bit().unwrap();
assert_eq!(a, b);
}
let _ = br.get_bit().expect_err("Extra bits");
} }
} }

6
src/lib.rs Normal file
View File

@@ -0,0 +1,6 @@
// https://marknelson.us/posts/2014/10/19/data-compression-with-arithmetic-coding.html
/// The `Model` trait with its `compress`/`decompress` drivers, and the `Metrics` constants.
pub mod model;
/// `ModelA`, a `Model` implementation backed by a cumulative-frequency table.
// `modelA` is not snake_case, so the naming lint is silenced for this declaration.
#[allow(non_snake_case)]
pub mod modelA;
/// `BitReader` and `BitWriter`, the bit-level I/O used by the coder.
pub mod bit_buffer;

View File

@@ -1,152 +1,52 @@
// https://marknelson.us/posts/2014/10/19/data-compression-with-arithmetic-coding.html use std::{
use std::collections::HashMap; env,
fs::File,
io::{self, BufReader, Read},
path::Path,
};
mod bit_buffer; use sac::model::Model;
use bit_buffer::{BitReader, BitWriter}; use sac::modelA::ModelA;
type Model = HashMap<u8, (f64, f64)>; enum Mode {
Compress,
Decompress,
}
type CodeValue = u32;
fn get_symbol(model: &Model, low: f64, high: f64) -> Option<u8> { fn get_file<FilePath: AsRef<Path>>(filepath: FilePath) -> io::Result<Box<dyn Read>> {
for (symbol, (start, end)) in model { let f = File::open(&filepath)?;
if low >= *start && high < *end { Ok(Box::new(BufReader::new(f)))
return Some(*symbol); }
fn main() -> io::Result<()> {
let args: Vec<String> = env::args().collect();
let name = &args[0];
let (input, mode): (Box<dyn Read>, Mode) = match args.len() {
2 if args[1].to_lowercase() == "compress" => (Box::new(io::stdin()), Mode::Compress),
2 if args[1].to_lowercase() == "decompress" => (Box::new(io::stdin()), Mode::Decompress),
3 if args[1].to_lowercase() == "compress" => (get_file(&args[2])?, Mode::Compress),
3 if args[1].to_lowercase() == "decompress" => (get_file(&args[2])?, Mode::Decompress),
_ => {
eprintln!("Usage:");
eprintln!("{name} compress file > output # compress file and save to output");
eprintln!("{name} decompress file > output # decompress file and save to output");
return Err(io::Error::other(format!(
"Invalid command `{}`",
args.join(" ")
)));
} }
} };
return None;
} let model: ModelA<CodeValue> = ModelA::default();
let mut output = io::stdout().lock();
fn encode(input: &[u8], model: &Model) -> Vec<u8> { match mode {
const HALF: u64 = 1 << (u64::BITS - 1); Mode::Compress => model.compress(input, &mut output)?,
const LOW_CONVERGE: u64 = 0b10 << (u64::BITS - 2); Mode::Decompress => model.decompress(input, &mut output)?,
const HIGH_CONVERGE: u64 = 0b01 << (u64::BITS - 2); };
let mut output = BitWriter::new(); Ok(())
let mut high = u64::MAX;
let mut low = u64::MIN;
let mut pending_bits = 0;
for symbol in input {
let range = high - low;
let p = model.get(symbol).expect("Invalid/Unsupported data");
high = low + (range as f64 * p.1) as u64;
low = low + (range as f64 * p.0) as u64;
loop {
if high < HALF {
output.write(false);
print!("0");
while pending_bits > 0 {
output.write(true);
print!("1");
pending_bits -= 1;
}
} else if low >= HALF {
output.write(true);
print!("1");
while pending_bits > 0 {
output.write(true);
print!("0");
pending_bits -= 1;
}
} else if low >= LOW_CONVERGE && high < HIGH_CONVERGE {
println!("BET");
pending_bits += 1;
low <<= 1;
low &= HALF - 1;
high <<= 1;
high &= HALF + 1;
continue;
} else {
break;
}
low <<= 1;
high <<= 1;
high |= 1;
}
}
println!("");
return output.flush();
}
/// Decodes `input` bit by bit against `model` using a floating-point interval.
///
/// Every bit halves the current `[low, high)` interval. Whenever `get_symbol`
/// reports a symbol for the interval, that symbol is emitted and the interval is
/// rescaled relative to the symbol's own range in `model`.
fn decode(input: &[u8], model: &Model) -> Vec<u8> {
    let mut decoded = Vec::new();
    let (mut low, mut high) = (0.0, 1.0);

    for bit in BitReader::new(input) {
        // A set bit keeps the upper half of the interval, a clear bit the lower half.
        let width = high - low;
        if bit {
            low = low + (width / 2.0);
        } else {
            high = high - (width / 2.0);
        }

        if let Some(symbol) = get_symbol(model, low, high) {
            decoded.push(symbol);
            let &(sym_low, sym_high) = model.get(&symbol).unwrap();
            let sym_width = sym_high - sym_low;
            high = (high - sym_low) / sym_width;
            low = (low - sym_low) / sym_width;
        }
    }

    decoded
}
fn make_model(probabilities: &[(u8, f64)]) -> Model {
let mut model = HashMap::new();
let mut end: f64 = 0.0;
for (symbol, probability) in probabilities {
let start: f64 = end;
end = start + probability;
model.insert(*symbol, (start, end));
println!("{}: [{}, {})", *symbol as char, start, end);
}
return model;
}
/// Fixed symbol weights for lowercase `a` through `z`, space and `-`, in the
/// `(symbol, probability)` form that `make_model` takes.
///
/// The listed weights add up to 1.00.
const ENGLISH: &[(u8, f64)] = &[
(b'a', 0.08),
(b'b', 0.01),
(b'c', 0.02),
(b'd', 0.04),
(b'e', 0.12),
(b'f', 0.02),
(b'g', 0.02),
(b'h', 0.06),
(b'i', 0.07),
(b'j', 0.01),
(b'k', 0.01),
(b'l', 0.04),
(b'm', 0.02),
(b'n', 0.06),
(b'o', 0.07),
(b'p', 0.01),
(b'q', 0.01),
(b'r', 0.06),
(b's', 0.06),
(b't', 0.09),
(b'u', 0.02),
(b'v', 0.01),
(b'w', 0.02),
(b'x', 0.01),
(b'y', 0.02),
(b'z', 0.01),
(b' ', 0.01),
(b'-', 0.02),
];
fn main() {
let data = b"hello world-";
println!("MODEL:");
let model: Model = make_model(ENGLISH);
println!("");
let _enc = encode(data, &model);
let _dec = decode(&_enc, &model);
println!("{}", String::from_utf8(_dec).unwrap());
println!(
"Compression Ratio: {}",
data.len() as f64 / _enc.len() as f64
);
} }

207
src/model.rs Normal file
View File

@@ -0,0 +1,207 @@
use num::{FromPrimitive, Integer};
use std::{
io::{self, Read, Write},
ops::{BitAnd, Shl},
usize,
};
use crate::bit_buffer::{BitReader, BitWriter};
/// Number of value bits an integer type offers the coder.
trait Precision {
    const PRECISION: usize;
}

/// Implements [`Precision`] for each listed integer type as its bit width minus
/// the given number of bits to set aside (one for the sign of signed types).
macro_rules! impl_precision {
    ($reserved:expr => $($int:ty),* $(,)?) => {
        $(
            impl Precision for $int {
                const PRECISION: usize = <$int>::BITS as usize - $reserved;
            }
        )*
    };
}

impl_precision!(0 => u32, u64);
impl_precision!(1 => i32, i64, i128);
/// Compile-time sizing constants for an integer type used as the coder's code word.
///
/// Only `PRECISION` has to be supplied; every other constant has a default derived
/// from it and is expressed as a `usize`.
pub trait Metrics:
Integer + FromPrimitive + Copy + BitAnd<Output = Self> + Shl<Output = Self>
{
/// Number of value bits available in the implementing type.
const PRECISION: usize;
/// Bit width allotted to frequency counts: half of `PRECISION`, minus one.
const FREQUENCY_BITS: usize = (Self::PRECISION / 2) - 1;
/// Bit width of the coder's code values: two more than `FREQUENCY_BITS`.
const CODE_VALUE_BITS: usize = Self::FREQUENCY_BITS + 2;
/// Largest code value, i.e. `CODE_VALUE_BITS` one-bits.
// The 64-bit case is handled separately, presumably because `1 << 64` would overflow a
// 64-bit `usize`; it is reached for `i128`, whose PRECISION of 127 yields 64 code bits.
// NOTE(review): the wider types assume `usize` is at least 64 bits — confirm that 32-bit
// targets are out of scope.
const MAX_CODE: usize = if Self::CODE_VALUE_BITS == 64 {
u64::MAX as usize
} else {
(1 << Self::CODE_VALUE_BITS) - 1
};
/// Largest value representable in `FREQUENCY_BITS` bits.
const MAX_FREQ: usize = (1 << Self::FREQUENCY_BITS) - 1;
/// One quarter of the code range (`2^(CODE_VALUE_BITS - 2)`).
const ONE_FOURTH: usize = 1 << (Self::CODE_VALUE_BITS - 2);
/// Half of the code range.
const ONE_HALF: usize = 2 * Self::ONE_FOURTH;
/// Three quarters of the code range.
const THREE_FOURTHS: usize = 3 * Self::ONE_FOURTH;
/// Prints every constant to stdout, one per line, under a header line.
fn print_metrics() {
println!("--------- Metrics ---------");
println!(" PRECISION: {}", Self::PRECISION);
println!(" FREQUENCY_BITS: {}", Self::FREQUENCY_BITS);
println!("CODE_VALUE_BITS: {}", Self::CODE_VALUE_BITS);
println!(" MAX_CODE: {}", Self::MAX_CODE);
println!(" MAX_FREQ: {}", Self::MAX_FREQ);
println!(" ONE_FOURTH: {}", Self::ONE_FOURTH);
println!(" ONE_HALF: {}", Self::ONE_HALF);
println!(" THREE_FOURTHS: {}", Self::THREE_FOURTHS);
}
}
/// Blanket implementation: every type that has a `Precision` and meets the numeric
/// bounds gets `Metrics`, with `PRECISION` forwarded and all other constants left at
/// their defaults.
impl<
T: Precision + Integer + FromPrimitive + Copy + BitAnd<Output = Self> + Shl<Output = Self>,
> Metrics for T
{
const PRECISION: usize = T::PRECISION;
}
/// A symbol's interval as reported by a model, in cumulative-count units.
#[derive(Debug)]
pub struct Prob<T> {
/// Cumulative count at the start of the symbol's interval.
pub low: T,
/// Cumulative count at the end of the symbol's interval.
pub high: T,
/// Denominator that `low` and `high` are divided by when the coder narrows its range.
// NOTE(review): despite the name, `ModelA` stores its total count here, not a code value.
pub max_code: T,
}
/// A probability model that drives the arithmetic coder implemented by the provided
/// [`compress`](Model::compress) and [`decompress`](Model::decompress) methods.
///
/// Symbols are `i32` values: `0..=255` are literal bytes and `256` is the
/// end-of-stream marker that `compress` appends after the last input byte.
pub trait Model<CodeWord: Metrics> {
    /// Returns the interval assigned to symbol `c`.
    ///
    /// Takes `&mut self` so that an implementation may update its statistics.
    fn get_probability(&mut self, c: i32) -> Prob<CodeWord>;

    /// Returns the symbol whose interval contains `scaled_value`, together with that
    /// interval, or `None` when no symbol matches.
    ///
    /// Takes `&mut self` so that an implementation may update its statistics.
    fn get_char(&mut self, scaled_value: CodeWord) -> Option<(i32, Prob<CodeWord>)>;

    /// Returns the scale that `decompress` multiplies by when it maps the current code
    /// value onto the model's intervals.
    fn get_max_code(&self) -> CodeWord;

    /// Decodes the bit stream `input` and writes the recovered bytes to `output`.
    ///
    /// Decoding stops at the first symbol outside `0..=255`, which is how the
    /// end-of-stream marker written by `compress` is recognised.
    ///
    /// # Errors
    ///
    /// Returns any error raised while reading bits or writing bytes, and an
    /// [`io::ErrorKind::InvalidData`] error when the model cannot map the current code
    /// value to a symbol.
    ///
    /// # Panics
    ///
    /// Panics if a `Metrics` constant does not fit in `CodeWord`.
    // The upper-case locals mirror the `Metrics` constants they are converted from.
    #[allow(non_snake_case)]
    fn decompress<T: Read, O: Write, I: Into<BitReader<T>>>(
        mut self,
        input: I,
        output: &mut O,
    ) -> io::Result<()>
    where
        Self: Sized,
    {
        let ONE: CodeWord = CodeWord::one();
        let ZERO: CodeWord = CodeWord::zero();
        let ONE_HALF: CodeWord =
            CodeWord::from_usize(CodeWord::ONE_HALF).expect("ONE_HALF fits in CodeWord");
        let ONE_FOURTH: CodeWord =
            CodeWord::from_usize(CodeWord::ONE_FOURTH).expect("ONE_FOURTH fits in CodeWord");
        let THREE_FOURTHS: CodeWord =
            CodeWord::from_usize(CodeWord::THREE_FOURTHS).expect("THREE_FOURTHS fits in CodeWord");

        let mut input: BitReader<T> = input
            .into()
            .with_repeat_bits(CodeWord::CODE_VALUE_BITS as u16);

        let mut low: CodeWord = ZERO;
        let mut high: CodeWord =
            CodeWord::from_usize(CodeWord::MAX_CODE).expect("MAX_CODE fits in CodeWord");

        // Prime the code value with the first CODE_VALUE_BITS bits of the stream.
        let mut value: CodeWord = ZERO;
        for _ in 0..CodeWord::CODE_VALUE_BITS {
            value = (value << ONE) + if input.get_bit()? { ONE } else { ZERO };
        }

        loop {
            let range: CodeWord = high - low + ONE;
            let scaled_value = ((value - low + ONE) * self.get_max_code() - ONE) / range;
            // A value the model cannot place means the stream is not valid for this model;
            // report it instead of panicking.
            let (c, p) = self.get_char(scaled_value).ok_or_else(|| {
                io::Error::new(
                    io::ErrorKind::InvalidData,
                    "code value does not map to any symbol of the model",
                )
            })?;
            // Anything that is not a byte is treated as the end-of-stream marker.
            let Ok(byte) = u8::try_from(c) else {
                break;
            };
            // `write_all` rather than `write`: a plain `write` may accept zero bytes.
            output.write_all(&[byte])?;

            high = low + (range * p.high) / p.max_code - ONE;
            low = low + (range * p.low) / p.max_code;

            // Renormalise: drop the bits on which `low` and `high` already agree, or pull
            // the interval away from the midpoint, and shift in one new bit each time.
            loop {
                if high < ONE_HALF {
                    // Both bounds are in the lower half: nothing to subtract.
                } else if low >= ONE_HALF {
                    value = value - ONE_HALF;
                    low = low - ONE_HALF;
                    high = high - ONE_HALF;
                } else if low >= ONE_FOURTH && high < THREE_FOURTHS {
                    value = value - ONE_FOURTH;
                    low = low - ONE_FOURTH;
                    high = high - ONE_FOURTH;
                } else {
                    break;
                }
                low = low << ONE;
                high = (high << ONE) + ONE;
                value = (value << ONE) + if input.get_bit()? { ONE } else { ZERO };
            }
        }
        Ok(())
    }

    /// Encodes every byte of `input`, followed by the end-of-stream marker `256`, and
    /// writes the resulting bit stream to `output`.
    ///
    /// # Errors
    ///
    /// Returns any error raised while reading `input` or writing `output`. A read error
    /// aborts the encoding instead of being treated as the end of the input.
    ///
    /// # Panics
    ///
    /// Panics if a `Metrics` constant does not fit in `CodeWord`.
    // The upper-case locals mirror the `Metrics` constants they are converted from.
    #[allow(non_snake_case)]
    fn compress<IN: Read, OUT: Write>(mut self, input: IN, output: &mut OUT) -> std::io::Result<()>
    where
        Self: Sized,
    {
        /// Symbol appended after the last input byte.
        const EOF_SYMBOL: i32 = 256;

        let ONE: CodeWord = CodeWord::one();
        let ZERO: CodeWord = CodeWord::zero();
        let MAX_CODE: CodeWord =
            CodeWord::from_usize(CodeWord::MAX_CODE).expect("MAX_CODE fits in CodeWord");
        let ONE_HALF: CodeWord =
            CodeWord::from_usize(CodeWord::ONE_HALF).expect("ONE_HALF fits in CodeWord");
        let ONE_FOURTH: CodeWord =
            CodeWord::from_usize(CodeWord::ONE_FOURTH).expect("ONE_FOURTH fits in CodeWord");
        let THREE_FOURTHS: CodeWord =
            CodeWord::from_usize(CodeWord::THREE_FOURTHS).expect("THREE_FOURTHS fits in CodeWord");

        let mut output: BitWriter<OUT> = output.into();
        let mut pending_bits: i32 = 0;
        let mut low: CodeWord = ZERO;
        let mut high: CodeWord = MAX_CODE;

        let symbols = input
            .bytes()
            .map(|read| read.map(i32::from))
            .chain([Ok(EOF_SYMBOL)]);
        for symbol in symbols {
            // Stop with the read error rather than emitting a truncated stream.
            let c = symbol?;
            let p = self.get_probability(c);
            let range: CodeWord = high - low + ONE;
            high = low + (range * p.high / p.max_code) - ONE;
            low = low + (range * p.low / p.max_code);

            // Renormalise: emit the bits on which `low` and `high` agree; when the
            // interval straddles the midpoint, remember a pending bit instead.
            loop {
                if high < ONE_HALF {
                    write_with_pending(false, &mut pending_bits, &mut output)?;
                } else if low >= ONE_HALF {
                    write_with_pending(true, &mut pending_bits, &mut output)?;
                } else if low >= ONE_FOURTH && high < THREE_FOURTHS {
                    pending_bits += 1;
                    low = low - ONE_FOURTH;
                    high = high - ONE_FOURTH;
                } else {
                    break;
                }
                high = ((high << ONE) + ONE) & MAX_CODE;
                low = (low << ONE) & MAX_CODE;
            }
        }

        // Emit the final bit, chosen by which quarter `low` ended in, plus pending bits.
        pending_bits += 1;
        write_with_pending(low >= ONE_FOURTH, &mut pending_bits, &mut output)?;
        output.flush()
    }
}
/// Writes `bit`, then `*pending` copies of its complement, and resets `*pending` to 0.
///
/// If a write fails the error is returned immediately and `*pending` is left as it was.
fn write_with_pending<W: std::io::Write>(
    bit: bool,
    pending: &mut i32,
    output: &mut BitWriter<W>,
) -> std::io::Result<()> {
    let complement = !bit;
    output.write(bit)?;
    (0..*pending).try_for_each(|_| output.write(complement))?;
    *pending = 0;
    Ok(())
}

99
src/modelA.rs Normal file
View File

@@ -0,0 +1,99 @@
use crate::model::{Metrics, Model, Prob};
/// Frequency model over 257 symbols whose counts grow as symbols are seen, until the
/// model is frozen.
pub struct ModelA<T> {
/// Running totals: entry `i` is the combined count of all symbols below `i`, so symbol
/// `c` owns the interval from entry `c` to entry `c + 1`, and entry 257 is the grand total.
cumulative_frequency: [T; 258],
/// Set once the grand total reaches `MAX_FREQ`; from then on the counts stay fixed.
m_frozen: bool,
}
impl<T: Metrics> Default for ModelA<T> {
    /// Creates an unfrozen model in which entry `i` of the cumulative table equals `i`,
    /// i.e. every symbol starts with a count of one.
    fn default() -> Self {
        Self {
            cumulative_frequency: std::array::from_fn(|index| T::from_usize(index).unwrap()),
            m_frozen: false,
        }
    }
}
impl<T: Metrics> ModelA<T> {
    /// Prints the `Metrics` constants of the code-word type `T`.
    pub fn print_metrics(&self) {
        T::print_metrics();
    }

    /// Counts one more occurrence of symbol `c`: every cumulative entry above `c` grows
    /// by one. The model is frozen once the grand total reaches `MAX_FREQ`.
    fn update(&mut self, c: i32) {
        let first_affected = c as usize + 1;
        self.cumulative_frequency
            .iter_mut()
            .skip(first_affected)
            .for_each(|total| *total = *total + T::one());

        let grand_total = self.cumulative_frequency[257];
        if grand_total >= T::from_usize(T::MAX_FREQ).unwrap() {
            self.m_frozen = true;
        }
    }
}
impl<T: Metrics> Model<T> for ModelA<T> {
    /// Returns the interval of symbol `c` as it stood before this call, then counts the
    /// occurrence unless the model is frozen.
    ///
    /// # Panics
    ///
    /// Panics if `c` is not in `0..=256`.
    fn get_probability(&mut self, c: i32) -> crate::model::Prob<T> {
        let p = Prob {
            low: self.cumulative_frequency[c as usize],
            high: self.cumulative_frequency[c as usize + 1],
            max_code: self.cumulative_frequency[257],
        };
        if !self.m_frozen {
            self.update(c);
        }
        p
    }

    /// Returns the first symbol whose upper bound exceeds `scaled_value`, with its
    /// interval as it stood before this call, then counts the occurrence unless the
    /// model is frozen.
    ///
    /// Returns `None` when `scaled_value` is not below the grand total.
    fn get_char(&mut self, scaled_value: T) -> Option<(i32, crate::model::Prob<T>)> {
        // Adjacent pairs of the 258 entries give exactly the 257 symbol intervals, so the
        // search can never index past the end of the table.
        let symbol = self
            .cumulative_frequency
            .windows(2)
            .position(|bounds| scaled_value < bounds[1])?;
        let p = Prob {
            low: self.cumulative_frequency[symbol],
            high: self.cumulative_frequency[symbol + 1],
            max_code: self.cumulative_frequency[257],
        };
        // `symbol` is at most 256, so the conversion is lossless.
        let symbol = symbol as i32;
        if !self.m_frozen {
            self.update(symbol);
        }
        Some((symbol, p))
    }

    /// Returns the grand total of all symbol counts.
    fn get_max_code(&self) -> T {
        self.cumulative_frequency[257]
    }
}
#[cfg(test)]
pub mod tests {
    use super::*;

    /// Plain-text fixture.
    pub const UNCOMPRESSED_BYTES: &[u8; 13] = b"hello world-\n";

    /// Compressed bytes taken from output of the c++ version
    pub const COMPRESSED_BYTES: [u8; 14] = [
        0x67, 0xfc, 0x3e, 0x4a, 0x9d, 0x03, 0x7f, 0x35, 0xf1, 0x08, 0xd8, 0xa6, 0xbc, 0xda,
    ];

    /// Compressing the plain-text fixture must reproduce the reference bytes.
    #[test]
    fn compression_test() {
        let mut encoded = Vec::new();
        let encoder: ModelA<i32> = ModelA::default();
        encoder
            .compress(&UNCOMPRESSED_BYTES[..], &mut encoded)
            .unwrap();

        assert_eq!(COMPRESSED_BYTES.len(), encoded.len());
        encoded
            .iter()
            .zip(COMPRESSED_BYTES.iter())
            .for_each(|(got, want)| assert_eq!(got, want));
    }

    /// Decompressing the reference bytes must reproduce the plain-text fixture.
    #[test]
    fn decompression_test() {
        let mut decoded = Vec::new();
        let decoder: ModelA<i32> = ModelA::default();
        decoder.decompress(&COMPRESSED_BYTES, &mut decoded).unwrap();

        assert_eq!(UNCOMPRESSED_BYTES.len(), decoded.len());
        decoded
            .iter()
            .zip(UNCOMPRESSED_BYTES.iter())
            .for_each(|(got, want)| assert_eq!(got, want));
    }
}