Compare commits

..

12 Commits

8 changed files with 587 additions and 93 deletions

1
.gitignore vendored
View File

@@ -1 +1,2 @@
/target
Cargo.lock

View File

@@ -4,3 +4,4 @@ version = "0.1.0"
edition = "2021"
[dependencies]
num = "0.4.3"

46
examples/example.rs Normal file
View File

@@ -0,0 +1,46 @@
use sac::model::Model;
use sac::modelA::ModelA;
const DATA: &[u8] = b"
I'd just like to interject for a moment. What you're refering to as Linux, is in fact, GNU/Linux, or as I've re
aken to calling it, GNU plus Linux. Linux is not an operating system unto itself, but rather another free compo
a fully functioning GNU system made useful by the GNU corelibs, shell utilities and vital system components com
a full OS as defined by POSIX.
Many computer users run a modified version of the GNU system every day, without realizing it. Through a peculia
f events, the version of GNU which is widely used today is often called Linux, and many of its users are not aw
it is basically the GNU system, developed by the GNU Project.
There really is a Linux, and these people are using it, but it is just a part of the system they use. Linux is
el: the program in the system that allocates the machine's resources to the other programs that you run. The ke
an essential part of an operating system, but useless by itself; it can only function in the context of a compl
ating system. Linux is normally used in combination with the GNU operating system: the whole system is basicall
th Linux added, or GNU/Linux. All the so-called Linux distributions are really distributions of GNU/Linux!
";
/// Round-trips the embedded sample text through `ModelA<u32>`: prints the
/// model metrics, compresses `DATA`, reports the compression ratio, then
/// decompresses the result and prints it.
fn main() {
    type CodeValue = u32;

    println!(
        "Using model: ModelA<{}>",
        std::any::type_name::<CodeValue>()
    );

    let encoder: ModelA<CodeValue> = ModelA::default();
    encoder.print_metrics();
    println!();

    println!("compressing...");
    let mut compressed = Vec::new();
    encoder.compress(DATA, &mut compressed).unwrap();
    println!("ModelA compressed to {} bytes", compressed.len());

    let ratio = DATA.len() as f64 / compressed.len() as f64;
    println!("Compression Ratio: {}", ratio);
    println!();

    println!("decompressing...");
    // `compress` takes the model by value, so decoding starts from a fresh
    // default model.
    let decoder: ModelA<CodeValue> = ModelA::default();
    let mut decompressed = Vec::new();
    decoder.decompress(&compressed, &mut decompressed).unwrap();
    println!("{}", String::from_utf8_lossy(&decompressed));
}

181
src/bit_buffer.rs Normal file
View File

@@ -0,0 +1,181 @@
use std::io::{self, Bytes, Cursor, Read, Write};
/// Packs individual bits into bytes, most-significant bit first, and forwards
/// every completed byte to the wrapped writer.
pub struct BitWriter<'a, W: ?Sized + Write> {
    /// Bits collected so far for the byte currently being assembled.
    bits: u8,
    /// Bit position the next bit is stored at (7 = most significant, 0 = least).
    nextbit: usize,
    /// Destination that receives completed bytes.
    output: &'a mut W,
}

impl<'a, W: Write> From<&'a mut W> for BitWriter<'a, W> {
    fn from(value: &'a mut W) -> Self {
        BitWriter::new(value)
    }
}

impl<'a, W: Write> BitWriter<'a, W> {
    /// Creates a bit writer with an empty current byte on top of `writer`.
    pub fn new(writer: &'a mut W) -> Self {
        BitWriter {
            bits: 0,
            nextbit: 7,
            output: writer,
        }
    }

    /// Appends one bit; once eight bits have been collected the byte is written out.
    ///
    /// # Errors
    /// Returns any error reported by the underlying writer.
    pub fn write(&mut self, bit: bool) -> io::Result<()> {
        if bit {
            self.bits |= 1u8 << self.nextbit;
        }
        if self.nextbit == 0 {
            // `write_all` rather than `write`: `write` may accept zero bytes
            // and report success, which would silently drop the byte.
            self.output.write_all(&[self.bits])?;
            self.bits = 0;
            self.nextbit = 7;
        } else {
            self.nextbit -= 1;
        }
        Ok(())
    }

    /// Writes the partially filled final byte (unused low bits are zero) and
    /// flushes the underlying writer.
    ///
    /// # Errors
    /// Returns any error reported by the underlying writer.
    pub fn flush(self) -> std::io::Result<()> {
        // Decide on the bit position, not on the byte value: a partial byte
        // whose bits all happen to be zero still carries real bits and must
        // not be dropped.
        if self.nextbit != 7 {
            self.output.write_all(&[self.bits])?;
        }
        self.output.flush()
    }
}
/// Hands out the bits of a byte stream one at a time, most-significant bit
/// of each byte first.
pub struct BitReader<T> {
    /// Mask selecting the next bit of `bits`; zero means a new byte is needed.
    next: u8,
    /// The byte currently being consumed.
    bits: u8,
    /// Budget of extra bits that may still be served after the input ends.
    repeat_bits: i32,
    /// Underlying byte source.
    input: Bytes<T>,
}

impl<R: Read> From<R> for BitReader<R> {
    fn from(value: R) -> Self {
        Self::new(value.bytes())
    }
}

impl<T: AsRef<[u8]>> From<T> for BitReader<Cursor<T>> {
    fn from(value: T) -> Self {
        Cursor::new(value).into()
    }
}

impl<R: Read> BitReader<R> {
    /// Builds a reader over `value` with no repeat-bit budget.
    pub fn new(value: Bytes<R>) -> Self {
        Self {
            input: value,
            bits: 0,
            next: 0,
            repeat_bits: 0,
        }
    }

    /// Returns the next bit.
    ///
    /// Once the input is exhausted, the last byte read is served again (eight
    /// bits at a time) for as long as the repeat-bit budget is positive.
    ///
    /// # Errors
    /// Propagates read errors from the input, and fails with "No more bits"
    /// when the input is exhausted and the repeat-bit budget is used up.
    pub fn get_bit(&mut self) -> io::Result<bool> {
        if self.next == 0 {
            match self.input.next().transpose()? {
                Some(byte) => self.bits = byte,
                None if self.repeat_bits > 0 => self.repeat_bits -= 8,
                None => return Err(io::Error::other("No more bits")),
            }
            self.next = 0x80;
        }
        let mask = self.next;
        self.next >>= 1;
        Ok((self.bits & mask) != 0)
    }
}

impl<T> BitReader<T> {
    /// Sets how many bits may still be served after the input has ended.
    pub fn with_repeat_bits(mut self, n_bits: u16) -> Self {
        self.repeat_bits = i32::from(n_bits);
        self
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::model::Metrics;
    use crate::modelA::tests::COMPRESSED_BYTES;

    /// Independent bit reader used as the reference that `BitReader` is
    /// compared against.
    struct InputBits<'a> {
        input: &'a [u8],
        current_byte: u32,
        last_mask: u32,
        code_value_bits: i32,
    }

    /// Removes and returns the first byte of a byte source.
    pub trait Poppable {
        fn pop(&mut self) -> Option<u8>;
    }

    impl Poppable for &[u8] {
        fn pop(&mut self) -> Option<u8> {
            let (&head, tail) = self.split_first()?;
            *self = tail;
            Some(head)
        }
    }

    impl<'a> InputBits<'a> {
        pub fn new<T: Metrics>(data: &'a [u8]) -> Self {
            Self {
                code_value_bits: T::CODE_VALUE_BITS as i32,
                last_mask: 1,
                current_byte: 0,
                input: data,
            }
        }

        /// Next bit, or `None` once the data and the extra-bit budget are
        /// both exhausted.
        fn get_bit(&mut self) -> Option<bool> {
            if self.last_mask != 1 {
                self.last_mask >>= 1;
            } else {
                match self.input.pop() {
                    Some(byte) => self.current_byte = byte as u32,
                    None if self.code_value_bits <= 0 => return None,
                    None => self.code_value_bits -= 8,
                }
                self.last_mask = 0x80;
            }
            Some((self.current_byte & self.last_mask) != 0)
        }
    }

    #[test]
    fn bit_reader_test_i32() {
        bit_reader_test_type::<i32>();
    }

    #[test]
    fn bit_reader_test_u32() {
        bit_reader_test_type::<u32>();
    }

    #[test]
    fn bit_reader_test_i64() {
        bit_reader_test_type::<i64>();
    }

    #[test]
    fn bit_reader_test_u64() {
        bit_reader_test_type::<u64>();
    }

    #[test]
    fn bit_reader_test_i128() {
        bit_reader_test_type::<i128>();
    }

    /// Checks that `BitReader` yields exactly the reference bit sequence for
    /// the code width of `T`, and then reports an error.
    fn bit_reader_test_type<T: Metrics>() {
        let mut reader =
            BitReader::from(COMPRESSED_BYTES).with_repeat_bits(T::CODE_VALUE_BITS as u16);
        let mut reference = InputBits::new::<T>(&COMPRESSED_BYTES);
        while let Some(expected) = reference.get_bit() {
            let actual = reader.get_bit().unwrap();
            assert_eq!(expected, actual);
        }
        let _ = reader.get_bit().expect_err("Extra bits");
    }
}

6
src/lib.rs Normal file
View File

@@ -0,0 +1,6 @@
// https://marknelson.us/posts/2014/10/19/data-compression-with-arithmetic-coding.html
pub mod model;
#[allow(non_snake_case)]
pub mod modelA;
pub mod bit_buffer;

View File

@@ -1,99 +1,52 @@
// https://marknelson.us/posts/2014/10/19/data-compression-with-arithmetic-coding.html
use std::collections::HashMap;
use std::{
env,
fs::File,
io::{self, BufReader, Read},
path::Path,
};
type Model = HashMap<u8, (f64, f64)>;
use sac::model::Model;
use sac::modelA::ModelA;
fn get_symbol(model: &Model, d: f64) -> Option<u8> {
// Brute force
for (symbol, (start, end)) in model {
if d >= *start && d < *end {
return Some(*symbol);
/// Operation selected by the first command-line argument.
enum Mode {
// Run the model's `compress` over the input.
Compress,
// Run the model's `decompress` over the input.
Decompress,
}
type CodeValue = u32;
/// Opens `filepath` for reading and returns it as a boxed, buffered reader.
///
/// # Errors
/// Returns the error from `File::open` when the file cannot be opened.
fn get_file<FilePath: AsRef<Path>>(filepath: FilePath) -> io::Result<Box<dyn Read>> {
    let buffered = BufReader::new(File::open(filepath.as_ref())?);
    Ok(Box::new(buffered))
}
fn main() -> io::Result<()> {
let args: Vec<String> = env::args().collect();
let name = &args[0];
let (input, mode): (Box<dyn Read>, Mode) = match args.len() {
2 if args[1].to_lowercase() == "compress" => (Box::new(io::stdin()), Mode::Compress),
2 if args[1].to_lowercase() == "decompress" => (Box::new(io::stdin()), Mode::Decompress),
3 if args[1].to_lowercase() == "compress" => (get_file(&args[2])?, Mode::Compress),
3 if args[1].to_lowercase() == "decompress" => (get_file(&args[2])?, Mode::Decompress),
_ => {
eprintln!("Usage:");
eprintln!("{name} compress file > output # compress file and save to output");
eprintln!("{name} decompress file > output # decompress file and save to output");
return Err(io::Error::other(format!(
"Invalid command `{}`",
args.join(" ")
)));
}
}
return None;
}
};
/// Encodes `data` as a single number by repeatedly narrowing the interval
/// `[0, 1)` to the sub-range the model assigns to each symbol, and returns the
/// midpoint of the final interval.
///
/// # Panics
/// Panics with "Invalid/Unsupported data" if a symbol is missing from `model`.
fn encode(data: &[u8], model: &Model) -> f64 {
    let (low, high) = data
        .iter()
        .fold((0.0_f64, 1.0_f64), |(low, high), symbol| {
            let &(start, end) = model.get(symbol).expect("Invalid/Unsupported data");
            let width = high - low;
            (low + width * start, low + width * end)
        });
    low + (high - low) / 2.0
}
let model: ModelA<CodeValue> = ModelA::default();
let mut output = io::stdout().lock();
match mode {
Mode::Compress => model.compress(input, &mut output)?,
Mode::Decompress => model.decompress(input, &mut output)?,
};
/// Decodes `message` symbol by symbol and prints each symbol to stdout.
///
/// Decoding stops after a newline is printed when the symbol `-` is reached,
/// or — with an error line on stderr — when no symbol covers the current
/// position.
///
/// # Panics
/// Panics with "Decode error" if a symbol returned by `get_symbol` is missing
/// from `model`.
fn decode(message: f64, model: &Model) {
    let (mut low, mut high) = (0.0_f64, 1.0_f64);
    loop {
        let width = high - low;
        // Position of the message inside the current interval, scaled to [0, 1).
        let d = (message - low) / width;
        let symbol = match get_symbol(model, d) {
            Some(symbol) => symbol,
            None => {
                println!();
                eprintln!("Decode error: d={d}");
                return;
            }
        };
        if symbol == b'-' {
            println!();
            return;
        }
        print!("{}", symbol as char);
        let &(start, end) = model.get(&symbol).expect("Decode error");
        high = low + width * end;
        low = low + width * start;
    }
}
fn make_model(probabilities: &[(u8, f64)]) -> Model {
let mut model = HashMap::new();
let mut end: f64 = 0.0;
for (symbol, probability) in probabilities {
let start: f64 = end;
end = start + probability;
model.insert(*symbol, (start, end));
println!("{}: [{}, {})", *symbol as char, start, end);
}
return model;
}
/// Symbol/probability table handed to `make_model`: the lowercase letters,
/// the space character and `-`. The listed probabilities add up to 1.00.
/// `decode` stops when it reaches the `-` symbol.
const ENGLISH: &[(u8, f64)] = &[
(b'a', 0.08),
(b'b', 0.01),
(b'c', 0.02),
(b'd', 0.04),
(b'e', 0.12),
(b'f', 0.02),
(b'g', 0.02),
(b'h', 0.06),
(b'i', 0.07),
(b'j', 0.01),
(b'k', 0.01),
(b'l', 0.04),
(b'm', 0.02),
(b'n', 0.06),
(b'o', 0.07),
(b'p', 0.01),
(b'q', 0.01),
(b'r', 0.06),
(b's', 0.06),
(b't', 0.09),
(b'u', 0.02),
(b'v', 0.01),
(b'w', 0.02),
(b'x', 0.01),
(b'y', 0.02),
(b'z', 0.01),
(b' ', 0.01),
(b'-', 0.02),
];
fn main() {
let model: Model = make_model(ENGLISH);
let message = encode(b"hello world-", &model);
println!("{message}");
decode(message, &model);
Ok(())
}

207
src/model.rs Normal file
View File

@@ -0,0 +1,207 @@
use num::{FromPrimitive, Integer};
use std::{
io::{self, Read, Write},
ops::{BitAnd, Shl},
usize,
};
use crate::bit_buffer::{BitReader, BitWriter};
/// Number of value bits of an integer type: the full width for unsigned
/// types, one less (the sign bit) for signed types.
trait Precision {
    const PRECISION: usize;
}

/// Implements `Precision` for a list of primitive integer types.
macro_rules! impl_precision {
    (unsigned: $($ty:ty),+ $(,)?) => {
        $(
            impl Precision for $ty {
                const PRECISION: usize = <$ty>::BITS as usize;
            }
        )+
    };
    (signed: $($ty:ty),+ $(,)?) => {
        $(
            impl Precision for $ty {
                const PRECISION: usize = <$ty>::BITS as usize - 1;
            }
        )+
    };
}

impl_precision!(unsigned: u32, u64);
impl_precision!(signed: i32, i64, i128);
/// Bit-width constants derived from an integer type's `PRECISION`, used by the
/// `Model` compress/decompress routines.
pub trait Metrics:
Integer + FromPrimitive + Copy + BitAnd<Output = Self> + Shl<Output = Self>
{
/// Number of usable bits of the implementing integer type.
const PRECISION: usize;
/// Bits available for frequency counts: half the precision, minus one.
const FREQUENCY_BITS: usize = (Self::PRECISION / 2) - 1;
/// Bits used for code values: two more than `FREQUENCY_BITS`.
const CODE_VALUE_BITS: usize = Self::FREQUENCY_BITS + 2;
/// Largest code value, i.e. `CODE_VALUE_BITS` one-bits.
// A width of exactly 64 is special-cased because `1 << 64` is not
// representable in a 64-bit integer.
const MAX_CODE: usize = if Self::CODE_VALUE_BITS == 64 {
u64::MAX as usize
} else {
(1 << Self::CODE_VALUE_BITS) - 1
};
/// Largest frequency count, i.e. `FREQUENCY_BITS` one-bits.
const MAX_FREQ: usize = (1 << Self::FREQUENCY_BITS) - 1;
/// One quarter of the code range (`2^(CODE_VALUE_BITS - 2)`).
const ONE_FOURTH: usize = 1 << (Self::CODE_VALUE_BITS - 2);
/// Two times `ONE_FOURTH`.
const ONE_HALF: usize = 2 * Self::ONE_FOURTH;
/// Three times `ONE_FOURTH`.
const THREE_FOURTHS: usize = 3 * Self::ONE_FOURTH;
/// Prints every constant of this trait to stdout, one per line.
fn print_metrics() {
println!("--------- Metrics ---------");
println!(" PRECISION: {}", Self::PRECISION);
println!(" FREQUENCY_BITS: {}", Self::FREQUENCY_BITS);
println!("CODE_VALUE_BITS: {}", Self::CODE_VALUE_BITS);
println!(" MAX_CODE: {}", Self::MAX_CODE);
println!(" MAX_FREQ: {}", Self::MAX_FREQ);
println!(" ONE_FOURTH: {}", Self::ONE_FOURTH);
println!(" ONE_HALF: {}", Self::ONE_HALF);
println!(" THREE_FOURTHS: {}", Self::THREE_FOURTHS);
}
}
// Blanket implementation: every type that implements `Precision` and meets
// the numeric bounds gets `Metrics`, with `PRECISION` forwarded and all other
// constants taken from the trait defaults.
impl<
T: Precision + Integer + FromPrimitive + Copy + BitAnd<Output = Self> + Shl<Output = Self>,
> Metrics for T
{
const PRECISION: usize = T::PRECISION;
}
/// Range assigned to one symbol by a model, expressed as the fraction
/// `[low / max_code, high / max_code)` of the current coding interval.
#[derive(Debug)]
pub struct Prob<T> {
// Lower bound of the symbol's range (numerator).
pub low: T,
// Upper bound of the symbol's range (numerator).
pub high: T,
// Denominator that `low` and `high` are divided by when the interval is narrowed.
pub max_code: T,
}
/// A probability model that can drive arithmetic compression and
/// decompression over code words of type `CodeWord`.
///
/// Implementors supply the three lookup methods; `compress` and `decompress`
/// are provided on top of them. Symbols are `i32` values: `0..=255` are data
/// bytes and `256` marks the end of the stream.
pub trait Model<CodeWord: Metrics> {
    /// Returns the range assigned to symbol `c`.
    fn get_probability(&mut self, c: i32) -> Prob<CodeWord>;

    /// Returns the symbol whose range contains `scaled_value`, together with
    /// that range, or `None` when no symbol matches.
    fn get_char(&mut self, scaled_value: CodeWord) -> Option<(i32, Prob<CodeWord>)>;

    /// Returns the current denominator of the model's ranges.
    fn get_max_code(&self) -> CodeWord;

    /// Decodes the bit stream in `input` and writes the recovered bytes to
    /// `output`, stopping when a symbol outside `0..=255` is decoded.
    ///
    /// # Errors
    /// Returns read errors from `input` (including running out of bits),
    /// write errors from `output`, and an `InvalidData` error when the model
    /// finds no symbol for the decoded value.
    fn decompress<T: Read, O: Write, I: Into<BitReader<T>>>(
        mut self,
        input: I,
        output: &mut O,
    ) -> io::Result<()>
    where
        Self: Sized,
    {
        let constant = |value: usize| {
            CodeWord::from_usize(value).expect("metric constants must fit in the code word type")
        };
        let one: CodeWord = CodeWord::one();
        let zero: CodeWord = CodeWord::zero();
        let one_half = constant(CodeWord::ONE_HALF);
        let one_fourth = constant(CodeWord::ONE_FOURTH);
        let three_fourths = constant(CodeWord::THREE_FOURTHS);

        let mut input: BitReader<T> = input
            .into()
            .with_repeat_bits(CodeWord::CODE_VALUE_BITS as u16);
        let mut low: CodeWord = zero;
        let mut high: CodeWord = constant(CodeWord::MAX_CODE);

        // Prime `value` with the first CODE_VALUE_BITS bits of the stream.
        let mut value: CodeWord = zero;
        for _ in 0..CodeWord::CODE_VALUE_BITS {
            value = (value << one) + if input.get_bit()? { one } else { zero };
        }

        loop {
            let range: CodeWord = high - low + one;
            let scaled_value = ((value - low + one) * self.get_max_code() - one) / range;
            // Report a missing symbol as an error instead of panicking.
            let (c, p) = self.get_char(scaled_value).ok_or_else(|| {
                io::Error::new(
                    io::ErrorKind::InvalidData,
                    "decoded value does not map to any symbol of the model",
                )
            })?;
            if !(0..=255).contains(&c) {
                break;
            }
            // `write_all`: a bare `write` may accept zero bytes and drop the symbol.
            output.write_all(&[c as u8])?;

            high = low + (range * p.high) / p.max_code - one;
            low = low + (range * p.low) / p.max_code;
            loop {
                if high < one_half {
                    // Interval lies in the lower half: nothing to subtract.
                } else if low >= one_half {
                    value = value - one_half;
                    low = low - one_half;
                    high = high - one_half;
                } else if low >= one_fourth && high < three_fourths {
                    value = value - one_fourth;
                    low = low - one_fourth;
                    high = high - one_fourth;
                } else {
                    break;
                }
                // Double the interval and pull in the next input bit.
                low = low << one;
                high = (high << one) + one;
                value = (value << one) + if input.get_bit()? { one } else { zero };
            }
        }
        Ok(())
    }

    /// Encodes every byte of `input`, followed by the end-of-stream symbol
    /// `256`, and writes the resulting bit stream to `output`.
    ///
    /// # Errors
    /// Returns read errors from `input` and write errors from `output`.
    fn compress<IN: Read, OUT: Write>(mut self, input: IN, output: &mut OUT) -> std::io::Result<()>
    where
        Self: Sized,
    {
        const EOF_SYMBOL: i32 = 256;

        let constant = |value: usize| {
            CodeWord::from_usize(value).expect("metric constants must fit in the code word type")
        };
        let one: CodeWord = CodeWord::one();
        let zero: CodeWord = CodeWord::zero();
        let max_code = constant(CodeWord::MAX_CODE);
        let one_half = constant(CodeWord::ONE_HALF);
        let one_fourth = constant(CodeWord::ONE_FOURTH);
        let three_fourths = constant(CodeWord::THREE_FOURTHS);

        let mut output: BitWriter<OUT> = output.into();
        let mut pending_bits: i32 = 0;
        let mut low: CodeWord = zero;
        let mut high: CodeWord = max_code;

        let symbols = input
            .bytes()
            .map(|byte| byte.map(i32::from))
            .chain([Ok(EOF_SYMBOL)]);
        for symbol in symbols {
            // Propagate read failures: treating them as end of input would
            // return `Ok` for a truncated stream with no end marker.
            let c = symbol?;
            let p = self.get_probability(c);
            let range: CodeWord = high - low + one;
            high = low + (range * p.high / p.max_code) - one;
            low = low + (range * p.low / p.max_code);
            loop {
                if high < one_half {
                    write_with_pending(false, &mut pending_bits, &mut output)?;
                } else if low >= one_half {
                    write_with_pending(true, &mut pending_bits, &mut output)?;
                } else if low >= one_fourth && high < three_fourths {
                    // Interval straddles the midpoint: defer the bit decision.
                    pending_bits += 1;
                    low = low - one_fourth;
                    high = high - one_fourth;
                } else {
                    break;
                }
                high = ((high << one) + one) & max_code;
                low = (low << one) & max_code;
            }
            if c == EOF_SYMBOL {
                break;
            }
        }

        // Emit the final bits that pin down the last interval.
        pending_bits += 1;
        write_with_pending(low >= one_fourth, &mut pending_bits, &mut output)?;
        output.flush()
    }
}
/// Writes `bit`, then `*pending` copies of the opposite bit, and resets
/// `*pending` to zero once all of them have been written.
fn write_with_pending<W: std::io::Write>(
    bit: bool,
    pending: &mut i32,
    output: &mut BitWriter<W>,
) -> std::io::Result<()> {
    output.write(bit)?;
    let opposite = !bit;
    (0..*pending).try_for_each(|_| output.write(opposite))?;
    *pending = 0;
    Ok(())
}

99
src/modelA.rs Normal file
View File

@@ -0,0 +1,99 @@
use crate::model::{Metrics, Model, Prob};
/// Adaptive model over 257 symbols (byte values 0..=255 plus symbol 256)
/// backed by a cumulative frequency table.
pub struct ModelA<T> {
// Symbol `c` owns the range `[cumulative_frequency[c], cumulative_frequency[c + 1])`;
// entry 257 is the total count.
cumulative_frequency: [T; 258],
// Set once the total count reaches `MAX_FREQ`; the table is no longer updated afterwards.
m_frozen: bool,
}
impl<T: Metrics> Default for ModelA<T> {
    /// Starts with every symbol at a count of one (`cumulative_frequency[i] == i`)
    /// and the table not frozen.
    fn default() -> Self {
        let mut cumulative_frequency = [T::zero(); 258];
        for (index, slot) in cumulative_frequency.iter_mut().enumerate() {
            *slot = T::from_usize(index).unwrap();
        }
        Self {
            cumulative_frequency,
            m_frozen: false,
        }
    }
}
impl<T: Metrics> ModelA<T> {
    /// Prints the `Metrics` constants of the code word type `T`.
    pub fn print_metrics(&self) {
        T::print_metrics();
    }

    /// Counts one more occurrence of symbol `c` by incrementing every
    /// cumulative entry above it, and freezes the table once the total
    /// reaches `MAX_FREQ`.
    fn update(&mut self, c: i32) {
        let first_affected = c as usize + 1;
        for slot in self.cumulative_frequency.iter_mut().skip(first_affected) {
            *slot = *slot + T::one();
        }
        let ceiling = T::from_usize(T::MAX_FREQ).unwrap();
        if self.cumulative_frequency[257] >= ceiling {
            self.m_frozen = true;
        }
    }
}
impl<T: Metrics> Model<T> for ModelA<T> {
    /// Returns the range of symbol `c` as it was before this call, then
    /// (unless the table is frozen) counts the occurrence of `c`.
    fn get_probability(&mut self, c: i32) -> crate::model::Prob<T> {
        let index = c as usize;
        let p = Prob {
            low: self.cumulative_frequency[index],
            high: self.cumulative_frequency[index + 1],
            max_code: self.cumulative_frequency[257],
        };
        if !self.m_frozen {
            self.update(c);
        }
        p
    }

    /// Finds the first symbol whose upper bound is greater than
    /// `scaled_value` and returns it with its range as it was before this
    /// call; the occurrence is then counted unless the table is frozen.
    ///
    /// Returns `None` when `scaled_value` is not below the total count.
    fn get_char(&mut self, scaled_value: T) -> Option<(i32, crate::model::Prob<T>)> {
        // Only 257 symbols exist (indices 0..=256). Scanning up to 257 would
        // read `cumulative_frequency[258]` and panic instead of returning `None`.
        let index = (0..257).find(|&i| scaled_value < self.cumulative_frequency[i + 1])?;
        let p = Prob {
            low: self.cumulative_frequency[index],
            high: self.cumulative_frequency[index + 1],
            max_code: self.cumulative_frequency[257],
        };
        if !self.m_frozen {
            self.update(index as i32);
        }
        Some((index as i32, p))
    }

    /// Returns the total count, i.e. the last entry of the cumulative table.
    fn get_max_code(&self) -> T {
        self.cumulative_frequency[257]
    }
}
#[cfg(test)]
pub mod tests {
    use super::*;

    /// Plain-text fixture for the round-trip tests.
    pub const UNCOMPRESSED_BYTES: &[u8; 13] = b"hello world-\n";

    /// Compressed bytes taken from output of the c++ version
    pub const COMPRESSED_BYTES: [u8; 14] = [
        0x67, 0xfc, 0x3e, 0x4a, 0x9d, 0x03, 0x7f, 0x35, 0xf1, 0x08, 0xd8, 0xa6, 0xbc, 0xda,
    ];

    /// Compressing the fixture must reproduce the reference bytes exactly.
    #[test]
    fn compression_test() {
        let mut enc = Vec::new();
        let encoder: ModelA<i32> = ModelA::default();
        encoder.compress(&UNCOMPRESSED_BYTES[..], &mut enc).unwrap();

        assert_eq!(COMPRESSED_BYTES.len(), enc.len());
        enc.iter()
            .zip(COMPRESSED_BYTES.iter())
            .for_each(|(a, b)| assert_eq!(a, b));
    }

    /// Decompressing the reference bytes must reproduce the fixture exactly.
    #[test]
    fn decompression_test() {
        let mut dec = Vec::new();
        let decoder: ModelA<i32> = ModelA::default();
        decoder.decompress(&COMPRESSED_BYTES, &mut dec).unwrap();

        assert_eq!(UNCOMPRESSED_BYTES.len(), dec.len());
        dec.iter()
            .zip(UNCOMPRESSED_BYTES.iter())
            .for_each(|(a, b)| assert_eq!(a, b));
    }
}