From eb9d378c70bc43d1dcf477e48cb167802799a546 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Tue, 19 Oct 2021 16:46:57 -0400 Subject: [PATCH] feat: Backward search api (#457) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix backward search on FM-Index (#454) * add enum to track result type in fm_index backward search * format * address feedback on initial PR * Update lib.rs * fix doc examples * cargo fmt Co-authored-by: J. Christian Heide Co-authored-by: Rob Patro Co-authored-by: Johannes Köster --- src/data_structures/fmindex.rs | 129 +++++++++++++++++++++++++++++---- src/lib.rs | 33 +++++++-- 2 files changed, 141 insertions(+), 21 deletions(-) diff --git a/src/data_structures/fmindex.rs b/src/data_structures/fmindex.rs index 370c9fc18d..d660366281 100644 --- a/src/data_structures/fmindex.rs +++ b/src/data_structures/fmindex.rs @@ -77,6 +77,22 @@ impl Interval { } } +/// This enum represents the potential result states +/// from a backward_search in the fm index. The +/// potential variants of the enum are: +/// Complete(Interval) — the query matched completely. The interval is the +/// range of suffix array indices matching the query string. +/// Partial(Intarval, usize) - some suffix of the query matched, but not the whole query. +/// The interval returned is the range of suffix array indices for the maximal +/// matching suffix, and the `usize` is the length of the maximal matching suffix. +/// Absent - None suffix of the pattern matched in the text. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum BackwardSearchResult { + Complete(Interval), + Partial(Interval, usize), + Absent, +} + pub trait FMIndexable { /// Get occurrence count of symbol a in BWT[..r+1]. fn occ(&self, r: usize, a: u8) -> usize; @@ -84,8 +100,12 @@ pub trait FMIndexable { fn less(&self, a: u8) -> usize; fn bwt(&self) -> &BWT; - /// Perform backward search, yielding suffix array - /// interval denoting exact occurrences of the given pattern of length m in the text. + /// Perform backward search, yielding `BackwardSearchResult` enum that + /// contains the suffix array interval denoting exact occurrences of the given pattern + /// of length m in the text if it exists, or the suffix array interval denoting the + /// exact occurrences of a maximal matching suffix of the given pattern if it does + /// not exist. If none of the pattern can be matched, the `BackwardSearchResult` is + /// `Absent`. /// Complexity: O(m). /// /// # Arguments @@ -97,7 +117,7 @@ pub trait FMIndexable { /// ``` /// use bio::alphabets::dna; /// use bio::data_structures::bwt::{bwt, less, Occ}; - /// use bio::data_structures::fmindex::{FMIndex, FMIndexable}; + /// use bio::data_structures::fmindex::{BackwardSearchResult, FMIndex, FMIndexable}; /// use bio::data_structures::suffix_array::suffix_array; /// /// let text = b"GCCTTAACATTATTACGCCTA$"; @@ -109,32 +129,72 @@ pub trait FMIndexable { /// let fm = FMIndex::new(&bwt, &less, &occ); /// /// let pattern = b"TTA"; - /// let sai = fm.backward_search(pattern.iter()); + /// let bsr = fm.backward_search(pattern.iter()); /// - /// let positions = sai.occ(&sa); + /// let positions = match bsr { + /// BackwardSearchResult::Complete(sai) => sai.occ(&sa), + /// BackwardSearchResult::Partial(sai, _l) => sai.occ(&sa), + /// BackwardSearchResult::Absent => Vec::::new() + /// }; /// /// assert_eq!(positions, [3, 12, 9]); /// ``` fn backward_search<'b, P: Iterator + DoubleEndedIterator>( &self, pattern: P, - ) -> Interval { + ) -> BackwardSearchResult { let (mut l, mut r) = (0, self.bwt().len() - 1); + // to keep track of the last "valid" search interval if + // there is any valid suffix match. + let (mut pl, mut pr) = (l, r); + + // the length of the suffix we have been able to match + // successfully + let mut matched_len = 0; + // track if we exit early or not due to an empty + // search interval. + let mut complete_match = true; + for &a in pattern.rev() { let less = self.less(a); + pl = l; + pr = r; l = less + if l > 0 { self.occ(l - 1, a) } else { 0 }; r = less + self.occ(r, a) - 1; // The symbol was not found if we end up with an empty interval. - // Terminate the LF-mapping process. + // Terminate the LF-mapping process. In this case, also mark that + // we do not have a complete match. if l > r { + complete_match = false; break; } + matched_len += 1; } - Interval { - lower: l, - upper: r + 1, + // if we matched at least 1 character + if matched_len > 0 { + // if we matched the full pattern length we + // have a complete match + if complete_match { + return BackwardSearchResult::Complete(Interval { + lower: l, + upper: r + 1, + }); + } else { + // if we matched less than the full pattern length, we have + // a partial suffix match + return BackwardSearchResult::Partial( + Interval { + lower: pl, + upper: pr + 1, + }, + matched_len, + ); + } + } else { + // if we matched nothing we have an absent result + return BackwardSearchResult::Absent; } } } @@ -517,14 +577,18 @@ mod tests { let pattern = b"TTA"; let sai = fm.backward_search(pattern.iter()); - let positions = sai.occ(&sa); + let positions = match sai { + BackwardSearchResult::Complete(saint) => saint.occ(&sa), + BackwardSearchResult::Partial(saint, _l) => saint.occ(&sa), + BackwardSearchResult::Absent => Vec::::new(), + }; assert_eq!(positions, [3, 12, 9]); } #[test] fn test_fmindex_not_found() { - let text = b"GCCTTAACATTATTACGCCTA$"; + let text = b"TCCTTAACATTATTACTCCTA$"; let alphabet = dna::n_alphabet(); let sa = suffix_array(text); let bwt = bwt(text, &sa); @@ -532,10 +596,14 @@ mod tests { let occ = Occ::new(&bwt, 3, &alphabet); let fm = FMIndex::new(&bwt, &less, &occ); - let pattern = b"TTT"; + let pattern = b"TTG"; let sai = fm.backward_search(pattern.iter()); - let positions = sai.occ(&sa); + let positions = match sai { + BackwardSearchResult::Complete(saint) => saint.occ(&sa), + BackwardSearchResult::Partial(saint, _l) => saint.occ(&sa), + BackwardSearchResult::Absent => Vec::::new(), + }; assert_eq!(positions, []); } @@ -553,11 +621,42 @@ mod tests { let sai = fm.backward_search(pattern.iter()); - let positions = sai.occ(&sa); + let positions = match sai { + BackwardSearchResult::Complete(saint) => saint.occ(&sa), + BackwardSearchResult::Partial(saint, _l) => saint.occ(&sa), + BackwardSearchResult::Absent => Vec::::new(), + }; assert_eq!(positions, [0]); } + #[test] + fn test_fmindex_backward_search_partial_match() { + let text = b"GATTACA$"; + let pattern = b"GTACA"; + let alphabet = dna::n_alphabet(); + let sa = suffix_array(text); + let bwt = bwt(text, &sa); + let less = less(&bwt, &alphabet); + let occ = Occ::new(&bwt, 3, &alphabet); + let fm = FMIndex::new(&bwt, &less, &occ); + + let sai = fm.backward_search(pattern.iter()); + + let mut partial_match_len = 0; + let positions = match sai { + BackwardSearchResult::Complete(saint) => saint.occ(&sa), + BackwardSearchResult::Partial(saint, l) => { + partial_match_len = l; + saint.occ(&sa) + } + BackwardSearchResult::Absent => Vec::::new(), + }; + + assert_eq!(partial_match_len, 4); + assert_eq!(positions, [3]); + } + #[test] fn test_smems() { let orig_text = b"GCCTTAACAT"; diff --git a/src/lib.rs b/src/lib.rs index deada07fb7..ed029dda6e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -99,7 +99,7 @@ //! // Import some modules //! use bio::alphabets; //! use bio::data_structures::bwt::{bwt, less, Occ}; -//! use bio::data_structures::fmindex::{FMIndex, FMIndexable}; +//! use bio::data_structures::fmindex::{BackwardSearchResult, FMIndex, FMIndexable}; //! use bio::data_structures::suffix_array::suffix_array; //! use bio::io::fastq; //! use bio::io::fastq::FastqRead; @@ -124,14 +124,23 @@ //! let fmindex = FMIndex::new(&bwt, &less, &occ); //! // do a backwards search for the pattern //! let interval = fmindex.backward_search(pattern.iter()); -//! let positions = interval.occ(&sa); -//! +//! let mut partial_match_len = 0; +//! // get the locations where the pattern matched (completely in this case). +//! let positions = match interval { +//! BackwardSearchResult::Complete(saint) => saint.occ(&sa), +//! BackwardSearchResult::Partial(saint, l) => { +//! partial_match_len = l; +//! saint.occ(&sa) +//! } +//! BackwardSearchResult::Absent => Vec::new(), +//! }; //! // Iterate over a FASTQ file, use the alphabet to validate read //! // sequences and search for exact matches in the FM-Index. //! //! // create FASTQ reader //! let mut reader = fastq::Reader::new(io::stdin()); //! let mut record = fastq::Record::new(); +//! let mut partial_match_len = 0; //! reader.read(&mut record).expect("Failed to parse record"); //! while !record.is_empty() { //! let check = record.check(); @@ -143,7 +152,16 @@ //! // check, whether seq is in the expected alphabet //! if alphabet.is_word(seq) { //! let interval = fmindex.backward_search(seq.iter()); -//! let positions = interval.occ(&positions); +//! // get the positions where seq matched completely +//! // or where the maximal matching suffix of seq occurred. +//! let positions = match interval { +//! BackwardSearchResult::Complete(saint) => saint.occ(&sa), +//! BackwardSearchResult::Partial(saint, l) => { +//! partial_match_len = l; +//! saint.occ(&sa) +//! } +//! BackwardSearchResult::Absent => Vec::new(), +//! }; //! } //! reader.read(&mut record).expect("Failed to parse record"); //! } @@ -157,7 +175,7 @@ //! ```rust //! use bio::alphabets; //! use bio::data_structures::bwt::{bwt, less, Occ}; -//! use bio::data_structures::fmindex::{FMIndex, FMIndexable}; +//! use bio::data_structures::fmindex::{BackwardSearchResult,FMIndex, FMIndexable}; //! use bio::data_structures::suffix_array::suffix_array; //! use std::sync::Arc; //! use std::thread; @@ -184,7 +202,10 @@ //! //! // Loop through the results, extracting the positions array for each pattern //! for interval_calculator in interval_calculators { -//! let positions = interval_calculator.join().unwrap().occ(&sa); +//! let positions = match interval_calculator.join().unwrap() { +//! BackwardSearchResult::Complete(saint) => saint.occ(&sa), +//! _ => Vec::new() +//! }; //! } //! ``` //!