Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Make QGramIndex use less memory; add docs; fix bug #471

Merged
merged 1 commit into from Feb 8, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
93 changes: 90 additions & 3 deletions src/data_structures/qgram_index.rs
Expand Up @@ -36,10 +36,14 @@ use crate::alphabets::{Alphabet, RankTransform};
use crate::utils;

/// A classical, flexible, q-gram index implementation.
///
/// Uses |alphabet|^q + k words of memory, where k is the total number of
/// occurrences in the text of q-grams whose count is at most `max_count`
/// (if specified).
#[derive(Serialize, Deserialize)]
pub struct QGramIndex {
// Length of the indexed q-grams.
q: u32,
// For each q-gram, the position in `pos` where positions for this q-gram are stored.
// Holds one entry per possible q-gram plus a trailing end offset, so the positions
// of q-gram `g` are `pos[address[g]..address[g + 1]]`.
address: Vec<usize>,
// The positions in `text` where each q-gram occurs, grouped by q-gram.
// Q-grams masked for exceeding `max_count` contribute no entries.
pos: Vec<usize>,
// Rank transform of the alphabet; produces the integer q-gram codes used to
// index `address`.
ranks: RankTransform,
}
Expand Down Expand Up @@ -69,13 +73,12 @@ impl QGramIndex {

let qgram_count = alphabet.len().pow(q as u32);
let mut address = vec![0; qgram_count + 1];
let mut pos = vec![0; text.len()];

for qgram in ranks.qgrams(q, text.clone()) {
address[qgram] += 1;
}

for a in address.iter_mut().skip(1) {
for a in address.iter_mut() {
if *a > max_count {
// mask qgram
*a = 0;
Expand All @@ -84,6 +87,9 @@ impl QGramIndex {

utils::prescan(&mut address, 0, |a, b| a + b);

// Address has at least size 1, so unwrap is fine.
let mut pos = vec![0; *address.last().unwrap()];

{
let mut offset = vec![0; qgram_count];
for (i, qgram) in ranks.qgrams(q, text).enumerate() {
Expand Down Expand Up @@ -114,8 +120,10 @@ impl QGramIndex {
&self.pos[self.address[qgram]..self.address[qgram + 1]]
}

/// Return matches of the given pattern.
/// Return matches of the given pattern, matching in at least `min_count` q-grams.
/// Complexity O(m + k) for pattern of length m and k being the number of matching q-grams.
///
/// A match is a substring of `pattern` and a corresponding substring of the text that share at least `min_count` q-grams.
pub fn matches(&self, pattern: &[u8], min_count: usize) -> Vec<Match> {
let q = self.q as usize;
let mut diagonals = collections::HashMap::new();
Expand Down Expand Up @@ -153,6 +161,8 @@ impl QGramIndex {

/// Return exact matches (substrings) of the given pattern.
/// Complexity O(m + k) for pattern of length m and k being the number of matching q-grams.
///
/// An exact match is a substring of `pattern` of length at least `q` that also occurs in the text.
pub fn exact_matches(&self, pattern: &[u8]) -> Vec<ExactMatch> {
let q = self.q as usize;
let mut diagonals = collections::HashMap::new();
Expand Down Expand Up @@ -264,12 +274,60 @@ mod tests {
assert_eq!(matches, [5, 10]);
}

#[test]
fn test_qgram_with_max_count() {
    // Build the index over the shared fixture, keeping only q-grams seen at most once.
    let (text, alphabet) = setup();
    let q = 3;
    let index = QGramIndex::with_max_count(q, text, &alphabet, 1);

    // Encode the probe q-gram with a rank transform over the same alphabet.
    let transform = alphabets::RankTransform::new(&alphabet);
    let probe = transform.qgrams(q, b"TGA").next().unwrap();

    // "TGA" has a count of 2, which exceeds the max_count of 1, so it must be pruned.
    assert_eq!(index.qgram_matches(probe), []);
}

#[test]
fn test_qgram_with_max_count_index_0() {
    // Only the alphabet of the shared fixture is needed; the text is a run of one symbol.
    let (_, alphabet) = setup();
    let q = 3;
    let text = b"AAAAA";
    let index = QGramIndex::with_max_count(q, text, &alphabet, 1);

    // Encode the probe q-gram with a rank transform over the same alphabet.
    let transform = alphabets::RankTransform::new(&alphabet);
    let probe = transform.qgrams(q, b"AAA").next().unwrap();

    // "AAA" has a count of 3, which exceeds the max_count of 1, so it must be pruned.
    assert_eq!(index.qgram_matches(probe), []);
}

#[test]
fn test_qgram_sizeof_pos() {
    // Only the alphabet of the shared fixture is needed; the text is a run of one symbol.
    let (_, alphabet) = setup();
    let q = 3;
    let text = b"AAAAA";
    // Built without a max_count, so no q-gram is pruned.
    let index = QGramIndex::new(q, text, &alphabet);

    // Encode the probe q-gram with a rank transform over the same alphabet.
    let transform = alphabets::RankTransform::new(&alphabet);
    let probe = transform.qgrams(q, b"AAA").next().unwrap();

    // Nothing is pruned here: every occurrence of "AAA" in "AAAAA" must be reported.
    assert_eq!(index.qgram_matches(probe), [0, 1, 2]);
}

#[test]
fn test_matches() {
let (text, alphabet) = setup();
let q = 3;
let qgram_index = QGramIndex::new(q, text, &alphabet);

// A fully matching pattern.
let pattern = b"GCTG";
let matches = qgram_index.matches(pattern, 1);
assert_eq!(
Expand All @@ -280,6 +338,18 @@ mod tests {
count: 2,
}]
);

// A pattern that matches in one position on two disjoint q-grams.
let pattern = b"GCTAAGA";
let matches = qgram_index.matches(pattern, 2);
assert_eq!(
matches,
[Match {
pattern: Interval { start: 0, stop: 7 },
text: Interval { start: 3, stop: 10 },
count: 2,
}]
);
}

#[test]
Expand All @@ -294,6 +364,23 @@ mod tests {
for m in exact_matches {
assert_eq!(m.pattern.get(pattern), m.text.get(text));
}

// A pattern that matches in one position on two disjoint q-grams.
let pattern = b"GCTAAGA";
let matches = qgram_index.exact_matches(pattern);
assert_eq!(
matches,
[
ExactMatch {
pattern: Interval { start: 0, stop: 3 },
text: Interval { start: 3, stop: 6 }
},
ExactMatch {
pattern: Interval { start: 4, stop: 7 },
text: Interval { start: 7, stop: 10 }
}
]
);
}

#[test]
Expand Down