New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
lz77 compression algorithm implemented #708
base: master
Are you sure you want to change the base?
Conversation
Codecov Report: All modified and coverable lines are covered by tests ✅
Additional details and impacted files: @@ Coverage Diff @@
## master #708 +/- ##
==========================================
+ Coverage 94.78% 94.81% +0.02%
==========================================
Files 297 298 +1
Lines 22149 22259 +110
==========================================
+ Hits 20994 21104 +110
Misses 1155 1155 ☔ View full report in Codecov by Sentry. |
Please describe the PR and the problem it addresses; this helps reviewers understand what the PR does. Could you explain in detail how you solve the problem? Please also add docstrings to explain your code. |
The overall logic was good, but there are some points we need to improve in the implementation:
|
#[cfg(test)]
mod test {
    use super::*;

    /// A token as a plain triple: (distance, length, next character).
    type Triple = (usize, usize, char);

    /// Shared fixtures: each entry pairs a plain string with the exact token
    /// sequence the encoder is expected to emit for it.
    fn cases() -> Vec<(&'static str, Vec<Triple>)> {
        vec![
            ("", vec![]),
            ("A", vec![(0, 0, 'A')]),
            ("AA", vec![(0, 0, 'A'), (0, 0, 'A')]),
            (
                "AAAABBBCCDAA",
                vec![
                    (0, 0, 'A'),
                    (1, 1, 'A'),
                    (3, 1, 'B'),
                    (1, 1, 'B'),
                    (0, 0, 'C'),
                    (1, 1, 'D'),
                    (10, 1, 'A'),
                ],
            ),
            (
                "Rust-Trends",
                vec![
                    (0, 0, 'R'),
                    (0, 0, 'u'),
                    (0, 0, 's'),
                    (0, 0, 't'),
                    (0, 0, '-'),
                    (0, 0, 'T'),
                    (0, 0, 'r'),
                    (0, 0, 'e'),
                    (0, 0, 'n'),
                    (0, 0, 'd'),
                    (0, 0, 's'),
                ],
            ),
        ]
    }

    /// Encoding every fixture string must yield its expected tokens.
    #[test]
    fn test_lz77_encode() {
        for (text, tokens) in cases() {
            assert_eq!(lz77_encode(text), tokens);
        }
    }

    /// Decoding every fixture token sequence must yield its original string.
    #[test]
    fn test_lz77_decode() {
        for (text, tokens) in cases() {
            assert_eq!(lz77_decode(tokens), text);
        }
    }
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#[cfg(test)]
mod test {
    use super::*;

    /// Asserts that encoding `input` produces exactly `expected`.
    fn assert_encodes(input: &str, expected: Vec<(usize, usize, char)>) {
        assert_eq!(lz77_encode(input), expected);
    }

    /// Asserts that decoding `tokens` produces exactly `expected`.
    fn assert_decodes(tokens: Vec<(usize, usize, char)>, expected: &str) {
        assert_eq!(lz77_decode(tokens), expected);
    }

    /// Expected tokens for "AAAABBBCCDAA".
    fn run_tokens() -> Vec<(usize, usize, char)> {
        vec![
            (0, 0, 'A'),
            (1, 1, 'A'),
            (3, 1, 'B'),
            (1, 1, 'B'),
            (0, 0, 'C'),
            (1, 1, 'D'),
            (10, 1, 'A'),
        ]
    }

    /// Expected tokens for "Rust-Trends": every character is a bare literal.
    fn literal_tokens() -> Vec<(usize, usize, char)> {
        "Rust-Trends".chars().map(|c| (0, 0, c)).collect()
    }

    #[test]
    fn test_lz77_encode() {
        assert_encodes("", vec![]);
        assert_encodes("A", vec![(0, 0, 'A')]);
        assert_encodes("AA", vec![(0, 0, 'A'), (0, 0, 'A')]);
        assert_encodes("AAAABBBCCDAA", run_tokens());
        assert_encodes("Rust-Trends", literal_tokens());
    }

    #[test]
    fn test_lz77_decode() {
        assert_decodes(vec![], "");
        assert_decodes(vec![(0, 0, 'A')], "A");
        assert_decodes(vec![(0, 0, 'A'), (0, 0, 'A')], "AA");
        assert_decodes(run_tokens(), "AAAABBBCCDAA");
        assert_decodes(literal_tokens(), "Rust-Trends");
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Expands every `name: input, [(distance, length, next_char), ...],` entry
    /// into its own `#[test]` function. Each generated test checks that the
    /// input survives an encode/decode round trip and that the encoder emits
    /// exactly the listed tokens.
    macro_rules! test_lz77_encode_decode {
        ($($case:ident: $text:expr, $triples:expr,)*) => {
            $(
                #[test]
                fn $case() {
                    let input_string = $text;
                    // Lift the plain triples into `Token` values for comparison.
                    let expected_tokens: Vec<_> = $triples
                        .iter()
                        .map(|&(distance, length, next_char)| {
                            Token::new(distance, length, next_char)
                        })
                        .collect();

                    let encoded_tokens = lz77_encode(&input_string);
                    // The decoder consumes its argument, so hand it a copy and
                    // keep the original for the token comparison below.
                    let decoded_string = lz77_decode(encoded_tokens.clone());

                    assert_eq!(input_string, decoded_string);
                    assert_eq!(encoded_tokens, expected_tokens);
                }
            )*
        };
    }

    test_lz77_encode_decode! {
        empty_string: "", [],
        single_character: "A", [(0, 0, 'A')],
        repeated_characters: "AA", [(0, 0, 'A'), (0, 0, 'A')],
        mixed_characters: "AAAABBBCCDAA", [
            (0, 0, 'A'),
            (1, 1, 'A'),
            (3, 1, 'B'),
            (1, 1, 'B'),
            (0, 0, 'C'),
            (1, 1, 'D'),
            (10, 1, 'A'),
        ],
        alphanumeric_characters_with_dash: "Rust-Trends", [
            (0, 0, 'R'),
            (0, 0, 'u'),
            (0, 0, 's'),
            (0, 0, 't'),
            (0, 0, '-'),
            (0, 0, 'T'),
            (0, 0, 'r'),
            (0, 0, 'e'),
            (0, 0, 'n'),
            (0, 0, 'd'),
            (0, 0, 's'),
        ],
    }
}
/// Rebuilds the original string from a sequence of LZ77 tokens.
///
/// Each token is `(distance, length, next_char)`. When `distance` is non-zero,
/// `length` characters are copied from `distance` characters before the end of
/// the output produced so far, and then `next_char` is appended. A token with
/// `distance == 0` appends only `next_char`.
///
/// Distances and lengths are counted in characters, not bytes, so non-ASCII
/// text decodes correctly. Copies are performed one character at a time, so a
/// token whose `length` exceeds its `distance` repeats the referenced run.
///
/// # Arguments
///
/// * `tokens` - The tokens to decode, in the order they were emitted.
///
/// # Returns
///
/// The decoded string.
///
/// # Panics
///
/// Panics if a token's `distance` is larger than the number of characters
/// decoded before it, because such a token stream is malformed.
pub fn lz77_decode(tokens: Vec<(usize, usize, char)>) -> String {
    // Work on characters so that offsets never fall inside a UTF-8 sequence.
    let mut decoded: Vec<char> = Vec::new();
    for (distance, length, next_char) in tokens {
        if distance != 0 {
            let start = decoded
                .len()
                .checked_sub(distance)
                .expect("LZ77 token distance exceeds the decoded output length");
            // `start + offset` is always in bounds: `start < decoded.len()`
            // initially and every iteration grows `decoded` by one.
            for offset in 0..length {
                let repeated = decoded[start + offset];
                decoded.push(repeated);
            }
        }
        decoded.push(next_char);
    }
    decoded.into_iter().collect()
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// Rebuilds the original string from a sequence of LZ77 tokens.
///
/// Each token is `(distance, length, next_char)`. When `distance` is non-zero,
/// `length` characters are copied from `distance` characters before the end of
/// the output produced so far, and then `next_char` is appended. A token with
/// `distance == 0` appends only `next_char`.
///
/// Offsets are counted in characters rather than bytes, and copies proceed one
/// character at a time so a `length` greater than `distance` repeats the run.
///
/// # Arguments
///
/// * `tokens` - The tokens to decode, in the order they were emitted.
///
/// # Returns
///
/// The decoded string.
///
/// # Panics
///
/// Panics if a token's `distance` is larger than the number of characters
/// decoded before it, because such a token stream is malformed.
pub fn lz77_decode(tokens: Vec<(usize, usize, char)>) -> String {
    // A char buffer keeps every offset aligned to a character boundary.
    let mut decoded: Vec<char> = Vec::new();
    for (distance, length, next_char) in tokens {
        if distance != 0 {
            let start = decoded
                .len()
                .checked_sub(distance)
                .expect("LZ77 token distance exceeds the decoded output length");
            // Reading from the growing buffer lets overlapping copies work.
            for offset in 0..length {
                let repeated = decoded[start + offset];
                decoded.push(repeated);
            }
        }
        decoded.push(next_char);
    }
    decoded.into_iter().collect()
}
/// Decompresses a vector of tokens generated by the LZ77 algorithm and returns the original input string. | |
/// | |
/// # Arguments | |
/// | |
/// * `tokens` - A vector of tokens generated by the LZ77 compression algorithm. | |
/// | |
/// # Returns | |
/// | |
/// The original input string before compression. | |
pub fn lz77_decode(tokens: Vec<Token>) -> String { | |
let mut result = String::new(); | |
for token in tokens { | |
if token.distance != 0 { | |
let start = result.len() - token.distance; | |
let length = token.length; | |
let substring: String = result.chars().skip(start).take(length).collect(); | |
result += &substring; | |
}; | |
result.push(token.next_char); | |
} | |
result | |
} |
/// Compresses `input` into a sequence of LZ77 tokens.
///
/// Each token is `(distance, length, next_char)`: `length` characters are
/// repeated from `distance` characters before the current position, followed
/// by the literal `next_char`. A token of `(0, 0, c)` is a bare literal.
///
/// The search window is everything already encoded. Among equally long
/// matches the earliest one (largest distance) is kept. The last character of
/// the input is never part of a match, so every token carries a literal.
///
/// All positions are counted in characters, not bytes, so input containing
/// multi-byte characters is handled without slicing inside a UTF-8 sequence.
///
/// # Arguments
///
/// * `input` - The string to compress.
///
/// # Returns
///
/// The tokens describing `input`; empty when `input` is empty.
pub fn lz77_encode(input: &str) -> Vec<(usize, usize, char)> {
    // Random access by character position; also avoids repeated `chars().nth`.
    let chars: Vec<char> = input.chars().collect();
    let mut tokens = Vec::new();
    let mut index = 0;

    while index < chars.len() {
        // Exclude the final character so a literal always follows the match.
        let candidate = &chars[index..chars.len() - 1];
        let mut best_match = (0, 0);

        for start in 0..index {
            let search_box = &chars[start..index];
            let match_length = candidate
                .iter()
                .zip(search_box)
                .take_while(|(a, b)| a == b)
                .count();

            // Strict comparison keeps the earliest of equally long matches.
            if match_length > best_match.1 {
                best_match = (index - start, match_length);
            }
        }

        // With no match this is `(0, 0, chars[index])`, i.e. a bare literal.
        tokens.push((best_match.0, best_match.1, chars[index + best_match.1]));
        index += best_match.1 + 1;
    }
    tokens
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// Compresses `input` into a sequence of LZ77 tokens.
///
/// Each token is `(distance, length, next_char)`: `length` characters are
/// repeated from `distance` characters before the current position, followed
/// by the literal `next_char`. A token of `(0, 0, c)` is a bare literal.
///
/// The search window is everything already encoded. Among equally long
/// matches the earliest one (largest distance) is kept. The last character of
/// the input is never part of a match, so every token carries a literal.
///
/// All positions are counted in characters, not bytes, so input containing
/// multi-byte characters is handled without slicing inside a UTF-8 sequence.
///
/// # Arguments
///
/// * `input` - The string to compress.
///
/// # Returns
///
/// The tokens describing `input`; empty when `input` is empty.
pub fn lz77_encode(input: &str) -> Vec<(usize, usize, char)> {
    // Collect once so positions are character indices with O(1) access.
    let chars: Vec<char> = input.chars().collect();
    let mut tokens = Vec::new();
    let mut index = 0;
    while index < chars.len() {
        // Exclude the final character so a literal always follows the match.
        let candidate = &chars[index..chars.len() - 1];
        let mut best_match = (0, 0);
        for start in 0..index {
            let search_box = &chars[start..index];
            let match_length = candidate
                .iter()
                .zip(search_box)
                .take_while(|(a, b)| a == b)
                .count();
            // Strict comparison keeps the earliest of equally long matches.
            if match_length > best_match.1 {
                best_match = (index - start, match_length);
            }
        }
        // With no match this is `(0, 0, chars[index])`, i.e. a bare literal.
        tokens.push((best_match.0, best_match.1, chars[index + best_match.1]));
        index += best_match.1 + 1;
    }
    tokens
}
//! This module provides an implementation of the LZ77 compression algorithm. | |
//! | |
//! LZ77 is a lossless data compression algorithm that replaces repeated occurrences | |
//! of data with references to a single copy of that data existing earlier in the uncompressed data stream. | |
//! Each reference records the distance back to the previous occurrence and the length of the match, | |
//! and is followed by the next literal character of the input. | |
//! # References | |
//! | |
//! - [LZ77 Compression Algorithm](https://en.wikipedia.org/wiki/LZ77_and_LZ78) | |
/// ## Token struct
///
/// The `Token` struct represents a token generated during the LZ77 compression process.
/// It consists of three fields: `distance`, `length`, and `next_char`.
///
/// - `distance`: Distance to the previous occurrence of the matched substring.
/// - `length`: Length of the matched substring.
/// - `next_char`: The next character in the input stream after the matched substring.
///
/// Two tokens are equal when all three fields are equal; the comparison is
/// derived rather than written by hand.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Token {
    distance: usize,
    length: usize,
    next_char: char,
}

impl Token {
    /// Builds a token from its distance, match length and following literal.
    fn new(distance: usize, length: usize, next_char: char) -> Self {
        Token {
            distance,
            length,
            next_char,
        }
    }
}
/// Compresses a given input string using the LZ77 algorithm and returns a vector of tokens. | |
/// | |
/// # Arguments | |
/// | |
/// * `input` - The input string to be compressed. | |
/// | |
/// # Returns | |
/// | |
/// A vector of tokens representing the compressed input string. | |
pub fn lz77_encode(input: &str) -> Vec<Token> { | |
let mut tokens = Vec::new(); | |
let mut index = 0; | |
while index < input.len() { | |
let longest_match = find_longest_match(input, index); | |
if longest_match.1 > 0 { | |
tokens.push(Token::new( | |
longest_match.0, | |
longest_match.1, | |
input.chars().nth(index + longest_match.1).unwrap(), | |
)); | |
index += longest_match.1 + 1; | |
} else { | |
tokens.push(Token::new(0, 0, input.chars().nth(index).unwrap())); | |
index += 1; | |
} | |
} | |
tokens | |
} | |
/// Finds the longest match for the current index in the input string. | |
/// | |
/// This function searches for the longest match of the substring starting from | |
/// the current index within the input string. It iterates through the sliding | |
/// window to find the longest matching sequence. | |
/// | |
/// # Arguments | |
/// | |
/// * `input` - The input string to search within. | |
/// * `index` - The current index in the input string to start the search from. | |
/// | |
/// # Returns | |
/// | |
/// A tuple containing the distance to the previous occurrence of the matched substring | |
/// and the length of the matched substring. | |
fn find_longest_match(input: &str, index: usize) -> (usize, usize) { | |
let candidate = &input[index..input.len() - 1]; | |
let mut longest_match = (0, 0); | |
for i in 0..index { | |
let search_box = &input[i..index]; | |
let match_length = get_match_length(candidate, search_box); | |
if match_length > longest_match.1 { | |
longest_match = (index - i, match_length); | |
} | |
} | |
longest_match | |
} | |
/// Determines the length of the match between two substrings. | |
/// | |
/// This function calculates the length of the match between the candidate | |
/// substring and the substring within the sliding window. | |
/// | |
/// # Arguments | |
/// | |
/// * `candidate` - The substring being compared from the current index. | |
/// * `search_box` - The substring within the sliding window to compare with. | |
/// | |
/// # Returns | |
/// | |
/// The length of the matching sequence between the candidate and search_box substrings. | |
fn get_match_length(candidate: &str, search_box: &str) -> usize { | |
candidate | |
.chars() | |
.zip(search_box.chars()) | |
.take_while(|&(a, b)| a == b) | |
.count() | |
} |
No description provided.