Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
feat: Backward search api (#457)
* Fix backward search on FM-Index (#454)

* add enum to track result type in fm_index backward search

* format

* address feedback on initial PR

* Update lib.rs

* fix doc examples

* cargo fmt

Co-authored-by: J. Christian Heide <christian_heide@eva.mpg.de>
Co-authored-by: Rob Patro <rob@newton>
Co-authored-by: Johannes Köster <johannes.koester@tu-dortmund.de>
  • Loading branch information
4 people committed Oct 19, 2021
1 parent 43d198e commit eb9d378
Show file tree
Hide file tree
Showing 2 changed files with 141 additions and 21 deletions.
129 changes: 114 additions & 15 deletions src/data_structures/fmindex.rs
Expand Up @@ -77,15 +77,35 @@ impl Interval {
}
}

/// This enum represents the potential result states
/// from a backward_search in the fm index. The
/// potential variants of the enum are:
/// Complete(Interval) — the query matched completely. The interval is the
/// range of suffix array indices matching the query string.
/// Partial(Intarval, usize) - some suffix of the query matched, but not the whole query.
/// The interval returned is the range of suffix array indices for the maximal
/// matching suffix, and the `usize` is the length of the maximal matching suffix.
/// Absent - None suffix of the pattern matched in the text.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum BackwardSearchResult {
Complete(Interval),
Partial(Interval, usize),
Absent,
}

pub trait FMIndexable {
/// Get occurrence count of symbol a in BWT[..r+1].
fn occ(&self, r: usize, a: u8) -> usize;
/// Also known as
fn less(&self, a: u8) -> usize;
fn bwt(&self) -> &BWT;

/// Perform backward search, yielding suffix array
/// interval denoting exact occurrences of the given pattern of length m in the text.
/// Perform backward search, yielding `BackwardSearchResult` enum that
/// contains the suffix array interval denoting exact occurrences of the given pattern
/// of length m in the text if it exists, or the suffix array interval denoting the
/// exact occurrences of a maximal matching suffix of the given pattern if it does
/// not exist. If none of the pattern can be matched, the `BackwardSearchResult` is
/// `Absent`.
/// Complexity: O(m).
///
/// # Arguments
Expand All @@ -97,7 +117,7 @@ pub trait FMIndexable {
/// ```
/// use bio::alphabets::dna;
/// use bio::data_structures::bwt::{bwt, less, Occ};
/// use bio::data_structures::fmindex::{FMIndex, FMIndexable};
/// use bio::data_structures::fmindex::{BackwardSearchResult, FMIndex, FMIndexable};
/// use bio::data_structures::suffix_array::suffix_array;
///
/// let text = b"GCCTTAACATTATTACGCCTA$";
Expand All @@ -109,32 +129,72 @@ pub trait FMIndexable {
/// let fm = FMIndex::new(&bwt, &less, &occ);
///
/// let pattern = b"TTA";
/// let sai = fm.backward_search(pattern.iter());
/// let bsr = fm.backward_search(pattern.iter());
///
/// let positions = sai.occ(&sa);
/// let positions = match bsr {
/// BackwardSearchResult::Complete(sai) => sai.occ(&sa),
/// BackwardSearchResult::Partial(sai, _l) => sai.occ(&sa),
/// BackwardSearchResult::Absent => Vec::<usize>::new()
/// };
///
/// assert_eq!(positions, [3, 12, 9]);
/// ```
fn backward_search<'b, P: Iterator<Item = &'b u8> + DoubleEndedIterator>(
&self,
pattern: P,
) -> Interval {
) -> BackwardSearchResult {
let (mut l, mut r) = (0, self.bwt().len() - 1);
// to keep track of the last "valid" search interval if
// there is any valid suffix match.
let (mut pl, mut pr) = (l, r);

// the length of the suffix we have been able to match
// successfully
let mut matched_len = 0;
// track if we exit early or not due to an empty
// search interval.
let mut complete_match = true;

for &a in pattern.rev() {
let less = self.less(a);
pl = l;
pr = r;
l = less + if l > 0 { self.occ(l - 1, a) } else { 0 };
r = less + self.occ(r, a) - 1;

// The symbol was not found if we end up with an empty interval.
// Terminate the LF-mapping process.
// Terminate the LF-mapping process. In this case, also mark that
// we do not have a complete match.
if l > r {
complete_match = false;
break;
}
matched_len += 1;
}

Interval {
lower: l,
upper: r + 1,
// if we matched at least 1 character
if matched_len > 0 {
// if we matched the full pattern length we
// have a complete match
if complete_match {
return BackwardSearchResult::Complete(Interval {
lower: l,
upper: r + 1,
});
} else {
// if we matched less than the full pattern length, we have
// a partial suffix match
return BackwardSearchResult::Partial(
Interval {
lower: pl,
upper: pr + 1,
},
matched_len,
);
}
} else {
// if we matched nothing we have an absent result
return BackwardSearchResult::Absent;
}
}
}
Expand Down Expand Up @@ -517,25 +577,33 @@ mod tests {
let pattern = b"TTA";
let sai = fm.backward_search(pattern.iter());

let positions = sai.occ(&sa);
let positions = match sai {
BackwardSearchResult::Complete(saint) => saint.occ(&sa),
BackwardSearchResult::Partial(saint, _l) => saint.occ(&sa),
BackwardSearchResult::Absent => Vec::<usize>::new(),
};

assert_eq!(positions, [3, 12, 9]);
}

#[test]
fn test_fmindex_not_found() {
let text = b"GCCTTAACATTATTACGCCTA$";
let text = b"TCCTTAACATTATTACTCCTA$";
let alphabet = dna::n_alphabet();
let sa = suffix_array(text);
let bwt = bwt(text, &sa);
let less = less(&bwt, &alphabet);
let occ = Occ::new(&bwt, 3, &alphabet);
let fm = FMIndex::new(&bwt, &less, &occ);

let pattern = b"TTT";
let pattern = b"TTG";
let sai = fm.backward_search(pattern.iter());

let positions = sai.occ(&sa);
let positions = match sai {
BackwardSearchResult::Complete(saint) => saint.occ(&sa),
BackwardSearchResult::Partial(saint, _l) => saint.occ(&sa),
BackwardSearchResult::Absent => Vec::<usize>::new(),
};

assert_eq!(positions, []);
}
Expand All @@ -553,11 +621,42 @@ mod tests {

let sai = fm.backward_search(pattern.iter());

let positions = sai.occ(&sa);
let positions = match sai {
BackwardSearchResult::Complete(saint) => saint.occ(&sa),
BackwardSearchResult::Partial(saint, _l) => saint.occ(&sa),
BackwardSearchResult::Absent => Vec::<usize>::new(),
};

assert_eq!(positions, [0]);
}

#[test]
fn test_fmindex_backward_search_partial_match() {
let text = b"GATTACA$";
let pattern = b"GTACA";
let alphabet = dna::n_alphabet();
let sa = suffix_array(text);
let bwt = bwt(text, &sa);
let less = less(&bwt, &alphabet);
let occ = Occ::new(&bwt, 3, &alphabet);
let fm = FMIndex::new(&bwt, &less, &occ);

let sai = fm.backward_search(pattern.iter());

let mut partial_match_len = 0;
let positions = match sai {
BackwardSearchResult::Complete(saint) => saint.occ(&sa),
BackwardSearchResult::Partial(saint, l) => {
partial_match_len = l;
saint.occ(&sa)
}
BackwardSearchResult::Absent => Vec::<usize>::new(),
};

assert_eq!(partial_match_len, 4);
assert_eq!(positions, [3]);
}

#[test]
fn test_smems() {
let orig_text = b"GCCTTAACAT";
Expand Down
33 changes: 27 additions & 6 deletions src/lib.rs
Expand Up @@ -99,7 +99,7 @@
//! // Import some modules
//! use bio::alphabets;
//! use bio::data_structures::bwt::{bwt, less, Occ};
//! use bio::data_structures::fmindex::{FMIndex, FMIndexable};
//! use bio::data_structures::fmindex::{BackwardSearchResult, FMIndex, FMIndexable};
//! use bio::data_structures::suffix_array::suffix_array;
//! use bio::io::fastq;
//! use bio::io::fastq::FastqRead;
Expand All @@ -124,14 +124,23 @@
//! let fmindex = FMIndex::new(&bwt, &less, &occ);
//! // do a backwards search for the pattern
//! let interval = fmindex.backward_search(pattern.iter());
//! let positions = interval.occ(&sa);
//!
//! let mut partial_match_len = 0;
//! // get the locations where the pattern matched (completely in this case).
//! let positions = match interval {
//! BackwardSearchResult::Complete(saint) => saint.occ(&sa),
//! BackwardSearchResult::Partial(saint, l) => {
//! partial_match_len = l;
//! saint.occ(&sa)
//! }
//! BackwardSearchResult::Absent => Vec::new(),
//! };
//! // Iterate over a FASTQ file, use the alphabet to validate read
//! // sequences and search for exact matches in the FM-Index.
//!
//! // create FASTQ reader
//! let mut reader = fastq::Reader::new(io::stdin());
//! let mut record = fastq::Record::new();
//! let mut partial_match_len = 0;
//! reader.read(&mut record).expect("Failed to parse record");
//! while !record.is_empty() {
//! let check = record.check();
Expand All @@ -143,7 +152,16 @@
//! // check, whether seq is in the expected alphabet
//! if alphabet.is_word(seq) {
//! let interval = fmindex.backward_search(seq.iter());
//! let positions = interval.occ(&positions);
//! // get the positions where seq matched completely
//! // or where the maximal matching suffix of seq occurred.
//! let positions = match interval {
//! BackwardSearchResult::Complete(saint) => saint.occ(&sa),
//! BackwardSearchResult::Partial(saint, l) => {
//! partial_match_len = l;
//! saint.occ(&sa)
//! }
//! BackwardSearchResult::Absent => Vec::new(),
//! };
//! }
//! reader.read(&mut record).expect("Failed to parse record");
//! }
Expand All @@ -157,7 +175,7 @@
//! ```rust
//! use bio::alphabets;
//! use bio::data_structures::bwt::{bwt, less, Occ};
//! use bio::data_structures::fmindex::{FMIndex, FMIndexable};
//! use bio::data_structures::fmindex::{BackwardSearchResult,FMIndex, FMIndexable};
//! use bio::data_structures::suffix_array::suffix_array;
//! use std::sync::Arc;
//! use std::thread;
Expand All @@ -184,7 +202,10 @@
//!
//! // Loop through the results, extracting the positions array for each pattern
//! for interval_calculator in interval_calculators {
//! let positions = interval_calculator.join().unwrap().occ(&sa);
//! let positions = match interval_calculator.join().unwrap() {
//! BackwardSearchResult::Complete(saint) => saint.occ(&sa),
//! _ => Vec::new()
//! };
//! }
//! ```
//!
Expand Down

0 comments on commit eb9d378

Please sign in to comment.