Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: One Simple Trick to significantly improve the speed and memory usage of Occ #448

Merged
merged 10 commits into from Aug 23, 2021
44 changes: 30 additions & 14 deletions src/data_structures/bwt.rs
Expand Up @@ -73,7 +73,7 @@ pub fn invert_bwt(bwt: &BWTSlice) -> Vec<u8> {
}

/// An occurrence array implementation.
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub struct Occ {
occ: Vec<Vec<usize>>,
k: u32,
Expand All @@ -83,8 +83,8 @@ impl Occ {
/// Calculate occ array with sampling from BWT of length n.
/// Time complexity: O(n).
/// Space complexity: O(n / k * A) with A being the alphabet size.
/// Alphabet size is determined on the fly from the BWT.
/// For large texts, it is therefore advisable to transform
/// The specified alphabet must match the alphabet of the text and its BWT.
/// For large texts, it is advisable to transform
/// the text before calculating the BWT (see alphabets::rank_transform).
///
/// # Arguments
Expand All @@ -97,12 +97,27 @@ impl Occ {
.max_symbol()
.expect("Expecting non-empty alphabet.") as usize
+ 1;
let mut occ = Vec::with_capacity(n / k as usize);
let mut curr_occ: Vec<usize> = repeat(0).take(m).collect();
let mut alpha = alphabet.symbols.iter().collect::<Vec<usize>>();
// include sentinel '$'
if (b'$' as usize) < m && !alphabet.is_word(b"$") {
alpha.push(b'$' as usize);
}
let mut occ: Vec<Vec<usize>> = vec![Vec::new(); m];
let mut curr_occ = vec![0usize; m];

// characters not in the alphabet won't take up much space
for &a in &alpha {
occ[a].reserve(n / k as usize);
}

for (i, &c) in bwt.iter().enumerate() {
curr_occ[c as usize] += 1;

if i % k as usize == 0 {
occ.push(curr_occ.clone());
// only visit characters in the alphabet
for &a in &alpha {
occ[a].push(curr_occ[a]);
}
}
}

Expand All @@ -122,7 +137,7 @@ impl Occ {
// .iter()
// .filter(|&&c| c == a)
// .count();
// self.occ[i][a as usize] + count
// self.occ[a as usize][i] + count
// ```
//
// But there are a couple of reasons to do this manually:
Expand All @@ -140,15 +155,13 @@ impl Occ {
// self.k is our sampling rate, so find the checkpoints either side of r.
let lo_checkpoint = r / self.k as usize;
// Get the occurences at the low checkpoint
let lo_occ = self.occ[lo_checkpoint][a as usize];
let lo_occ = self.occ[a as usize][lo_checkpoint];

// If the sampling rate is infrequent it is worth checking if there is a closer
// hi checkpoint.
if self.k > 64 {
let hi_checkpoint = lo_checkpoint + 1;
if let Some(hi_occs) = self.occ.get(hi_checkpoint) {
let hi_occ = hi_occs[a as usize];

if let Some(&hi_occ) = self.occ[a as usize].get(hi_checkpoint) {
// Its possible that there are no occurences between the low and high
// checkpoint in which case we bail early.
if lo_occ == hi_occ {
Expand All @@ -158,7 +171,6 @@ impl Occ {
// If r is closer to the high checkpoint, count backwards from there.
let hi_idx = hi_checkpoint * self.k as usize;
if (hi_idx - r) < (self.k as usize / 2) {
let hi_occ = hi_occs[a as usize];
return hi_occ - bytecount::count(&bwt[r + 1..=hi_idx], a) as usize;
}
}
Expand Down Expand Up @@ -232,15 +244,19 @@ mod tests {
let bwt = vec![1u8, 3u8, 3u8, 1u8, 2u8, 0u8];
let alphabet = Alphabet::new(&[0u8, 1u8, 2u8, 3u8]);
let occ = Occ::new(&bwt, 3, &alphabet);
assert_eq!(occ.occ, [[0, 1, 0, 0], [0, 2, 0, 2]]);
assert_eq!(occ.occ, [[0, 0], [1, 2], [0, 0], [0, 2]]);
assert_eq!(occ.get(&bwt, 4, 2u8), 1);
assert_eq!(occ.get(&bwt, 4, 3u8), 2);
}

#[test]
fn test_occwm() {
let text = b"GCCTTAACATTATTACGCCTA$";
let alphabet = dna::n_alphabet();
let alphabet = {
let mut a = dna::n_alphabet();
a.insert(b'$');
a
};
let sa = suffix_array(text);
let bwt = bwt(text, &sa);
let occ = Occ::new(&bwt, 3, &alphabet);
Expand Down
15 changes: 15 additions & 0 deletions src/data_structures/fmindex.rs
Expand Up @@ -480,6 +480,21 @@ impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> FMDIndex<DBWT, D

self.backward_ext(&interval.swapped(), comp_a).swapped()
}

/// Construct a new instance of the FMD index (see Heng Li (2012) Bioinformatics)
/// without checking whether the text is over the DNA alphabet with N.
/// This expects a BWT that was created from a text over the DNA alphabet with N
/// (`alphabets::dna::n_alphabet()`) consisting of the
/// concatenation with its reverse complement, separated by the sentinel symbol `$`.
/// I.e., let T be the original text and R be its reverse complement.
/// Then, the expected text is T$R$. Further, multiple concatenated texts are allowed, e.g.
/// T1$R1$T2$R2$T3$R3$.
/// It is unsafe to construct an FMD index from an FM index that is not built on the DNA alphabet.
pub unsafe fn from_fmindex_unchecked(
fmindex: FMIndex<DBWT, DLess, DOcc>,
) -> FMDIndex<DBWT, DLess, DOcc> {
FMDIndex { fmindex }
}
}

#[cfg(test)]
Expand Down