Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Backward search api #457

Merged
merged 8 commits into from Oct 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
129 changes: 114 additions & 15 deletions src/data_structures/fmindex.rs
Expand Up @@ -77,15 +77,35 @@ impl Interval {
}
}

/// This enum represents the potential result states
/// from a backward_search in the fm index. The
/// potential variants of the enum are:
/// Complete(Interval) — the query matched completely. The interval is the
/// range of suffix array indices matching the query string.
/// Partial(Intarval, usize) - some suffix of the query matched, but not the whole query.
/// The interval returned is the range of suffix array indices for the maximal
/// matching suffix, and the `usize` is the length of the maximal matching suffix.
/// Absent - None suffix of the pattern matched in the text.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum BackwardSearchResult {
Complete(Interval),
Partial(Interval, usize),
Absent,
}

pub trait FMIndexable {
/// Get occurrence count of symbol a in BWT[..r+1].
fn occ(&self, r: usize, a: u8) -> usize;
/// Also known as
fn less(&self, a: u8) -> usize;
fn bwt(&self) -> &BWT;

/// Perform backward search, yielding suffix array
/// interval denoting exact occurrences of the given pattern of length m in the text.
/// Perform backward search, yielding `BackwardSearchResult` enum that
/// contains the suffix array interval denoting exact occurrences of the given pattern
/// of length m in the text if it exists, or the suffix array interval denoting the
/// exact occurrences of a maximal matching suffix of the given pattern if it does
/// not exist. If none of the pattern can be matched, the `BackwardSearchResult` is
/// `Absent`.
/// Complexity: O(m).
///
/// # Arguments
Expand All @@ -97,7 +117,7 @@ pub trait FMIndexable {
/// ```
/// use bio::alphabets::dna;
/// use bio::data_structures::bwt::{bwt, less, Occ};
/// use bio::data_structures::fmindex::{FMIndex, FMIndexable};
/// use bio::data_structures::fmindex::{BackwardSearchResult, FMIndex, FMIndexable};
/// use bio::data_structures::suffix_array::suffix_array;
///
/// let text = b"GCCTTAACATTATTACGCCTA$";
Expand All @@ -109,32 +129,72 @@ pub trait FMIndexable {
/// let fm = FMIndex::new(&bwt, &less, &occ);
///
/// let pattern = b"TTA";
/// let sai = fm.backward_search(pattern.iter());
/// let bsr = fm.backward_search(pattern.iter());
///
/// let positions = sai.occ(&sa);
/// let positions = match bsr {
/// BackwardSearchResult::Complete(sai) => sai.occ(&sa),
/// BackwardSearchResult::Partial(sai, _l) => sai.occ(&sa),
/// BackwardSearchResult::Absent => Vec::<usize>::new()
/// };
///
/// assert_eq!(positions, [3, 12, 9]);
/// ```
fn backward_search<'b, P: Iterator<Item = &'b u8> + DoubleEndedIterator>(
&self,
pattern: P,
) -> Interval {
) -> BackwardSearchResult {
let (mut l, mut r) = (0, self.bwt().len() - 1);
// to keep track of the last "valid" search interval if
// there is any valid suffix match.
let (mut pl, mut pr) = (l, r);

// the length of the suffix we have been able to match
// successfully
let mut matched_len = 0;
// track if we exit early or not due to an empty
// search interval.
let mut complete_match = true;

for &a in pattern.rev() {
let less = self.less(a);
pl = l;
pr = r;
l = less + if l > 0 { self.occ(l - 1, a) } else { 0 };
r = less + self.occ(r, a) - 1;

// The symbol was not found if we end up with an empty interval.
// Terminate the LF-mapping process.
// Terminate the LF-mapping process. In this case, also mark that
// we do not have a complete match.
if l > r {
complete_match = false;
break;
}
matched_len += 1;
}

Interval {
lower: l,
upper: r + 1,
// if we matched at least 1 character
if matched_len > 0 {
// if we matched the full pattern length we
// have a complete match
if complete_match {
return BackwardSearchResult::Complete(Interval {
lower: l,
upper: r + 1,
});
} else {
// if we matched less than the full pattern length, we have
// a partial suffix match
return BackwardSearchResult::Partial(
Interval {
lower: pl,
upper: pr + 1,
},
matched_len,
);
}
} else {
// if we matched nothing we have an absent result
return BackwardSearchResult::Absent;
}
}
}
Expand Down Expand Up @@ -517,25 +577,33 @@ mod tests {
let pattern = b"TTA";
let sai = fm.backward_search(pattern.iter());

let positions = sai.occ(&sa);
let positions = match sai {
BackwardSearchResult::Complete(saint) => saint.occ(&sa),
BackwardSearchResult::Partial(saint, _l) => saint.occ(&sa),
BackwardSearchResult::Absent => Vec::<usize>::new(),
};

assert_eq!(positions, [3, 12, 9]);
}

#[test]
fn test_fmindex_not_found() {
let text = b"GCCTTAACATTATTACGCCTA$";
let text = b"TCCTTAACATTATTACTCCTA$";
let alphabet = dna::n_alphabet();
let sa = suffix_array(text);
let bwt = bwt(text, &sa);
let less = less(&bwt, &alphabet);
let occ = Occ::new(&bwt, 3, &alphabet);
let fm = FMIndex::new(&bwt, &less, &occ);

let pattern = b"TTT";
let pattern = b"TTG";
let sai = fm.backward_search(pattern.iter());

let positions = sai.occ(&sa);
let positions = match sai {
BackwardSearchResult::Complete(saint) => saint.occ(&sa),
BackwardSearchResult::Partial(saint, _l) => saint.occ(&sa),
BackwardSearchResult::Absent => Vec::<usize>::new(),
};

assert_eq!(positions, []);
}
Expand All @@ -553,11 +621,42 @@ mod tests {

let sai = fm.backward_search(pattern.iter());

let positions = sai.occ(&sa);
let positions = match sai {
BackwardSearchResult::Complete(saint) => saint.occ(&sa),
BackwardSearchResult::Partial(saint, _l) => saint.occ(&sa),
BackwardSearchResult::Absent => Vec::<usize>::new(),
};

assert_eq!(positions, [0]);
}

#[test]
fn test_fmindex_backward_search_partial_match() {
let text = b"GATTACA$";
let pattern = b"GTACA";
let alphabet = dna::n_alphabet();
let sa = suffix_array(text);
let bwt = bwt(text, &sa);
let less = less(&bwt, &alphabet);
let occ = Occ::new(&bwt, 3, &alphabet);
let fm = FMIndex::new(&bwt, &less, &occ);

let sai = fm.backward_search(pattern.iter());

let mut partial_match_len = 0;
let positions = match sai {
BackwardSearchResult::Complete(saint) => saint.occ(&sa),
BackwardSearchResult::Partial(saint, l) => {
partial_match_len = l;
saint.occ(&sa)
}
BackwardSearchResult::Absent => Vec::<usize>::new(),
};
Comment on lines +647 to +654
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we're expecting a partial match here, I think having only the corresponding Partial match arm and a panic/unreachable/... for the other two should make this (and also the other) testcase(s) clearer

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, github showed this too late to me (after merging actually). Can you (when you find the time), modify this in a separate PR?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure


assert_eq!(partial_match_len, 4);
assert_eq!(positions, [3]);
}

#[test]
fn test_smems() {
let orig_text = b"GCCTTAACAT";
Expand Down
33 changes: 27 additions & 6 deletions src/lib.rs
Expand Up @@ -99,7 +99,7 @@
//! // Import some modules
//! use bio::alphabets;
//! use bio::data_structures::bwt::{bwt, less, Occ};
//! use bio::data_structures::fmindex::{FMIndex, FMIndexable};
//! use bio::data_structures::fmindex::{BackwardSearchResult, FMIndex, FMIndexable};
//! use bio::data_structures::suffix_array::suffix_array;
//! use bio::io::fastq;
//! use bio::io::fastq::FastqRead;
Expand All @@ -124,14 +124,23 @@
//! let fmindex = FMIndex::new(&bwt, &less, &occ);
//! // do a backwards search for the pattern
//! let interval = fmindex.backward_search(pattern.iter());
//! let positions = interval.occ(&sa);
//!
//! let mut partial_match_len = 0;
//! // get the locations where the pattern matched (completely in this case).
//! let positions = match interval {
//! BackwardSearchResult::Complete(saint) => saint.occ(&sa),
//! BackwardSearchResult::Partial(saint, l) => {
//! partial_match_len = l;
//! saint.occ(&sa)
//! }
//! BackwardSearchResult::Absent => Vec::new(),
//! };
//! // Iterate over a FASTQ file, use the alphabet to validate read
//! // sequences and search for exact matches in the FM-Index.
//!
//! // create FASTQ reader
//! let mut reader = fastq::Reader::new(io::stdin());
//! let mut record = fastq::Record::new();
//! let mut partial_match_len = 0;
//! reader.read(&mut record).expect("Failed to parse record");
//! while !record.is_empty() {
//! let check = record.check();
Expand All @@ -143,7 +152,16 @@
//! // check, whether seq is in the expected alphabet
//! if alphabet.is_word(seq) {
//! let interval = fmindex.backward_search(seq.iter());
//! let positions = interval.occ(&positions);
//! // get the positions where seq matched completely
//! // or where the maximal matching suffix of seq occurred.
//! let positions = match interval {
//! BackwardSearchResult::Complete(saint) => saint.occ(&sa),
//! BackwardSearchResult::Partial(saint, l) => {
//! partial_match_len = l;
//! saint.occ(&sa)
//! }
//! BackwardSearchResult::Absent => Vec::new(),
//! };
//! }
//! reader.read(&mut record).expect("Failed to parse record");
//! }
Expand All @@ -157,7 +175,7 @@
//! ```rust
//! use bio::alphabets;
//! use bio::data_structures::bwt::{bwt, less, Occ};
//! use bio::data_structures::fmindex::{FMIndex, FMIndexable};
//! use bio::data_structures::fmindex::{BackwardSearchResult,FMIndex, FMIndexable};
//! use bio::data_structures::suffix_array::suffix_array;
//! use std::sync::Arc;
//! use std::thread;
Expand All @@ -184,7 +202,10 @@
//!
//! // Loop through the results, extracting the positions array for each pattern
//! for interval_calculator in interval_calculators {
//! let positions = interval_calculator.join().unwrap().occ(&sa);
//! let positions = match interval_calculator.join().unwrap() {
//! BackwardSearchResult::Complete(saint) => saint.occ(&sa),
//! _ => Vec::new()
//! };
//! }
//! ```
//!
Expand Down