From f295ea437601ef71ea8daff4e6fa38f335267803 Mon Sep 17 00:00:00 2001
From: Daniel Liu <daniel.liu@10xgenomics.com>
Date: Thu, 19 Aug 2021 16:43:22 -0700
Subject: [PATCH 1/8] Fixed sampled suffix array sentinels bug and added more
 tests

---
 src/data_structures/suffix_array.rs | 307 +++++++++++++++-------------
 1 file changed, 168 insertions(+), 139 deletions(-)
diff --git a/src/data_structures/suffix_array.rs b/src/data_structures/suffix_array.rs
index b7aaabe26..44458d01e 100644
--- a/src/data_structures/suffix_array.rs
+++ b/src/data_structures/suffix_array.rs
@@ -25,6 +25,9 @@ use std::cmp;
 use std::fmt::Debug;
 use std::iter;
 use std::ops::Deref;
+use std::borrow::Borrow;
+use std::collections::HashMap;
+use std::hash::BuildHasherDefault;
 
 use num_integer::Integer;
 use num_traits::{cast, NumCast, Unsigned};
@@ -32,76 +35,95 @@ use num_traits::{cast, NumCast, Unsigned};
 use bv::{BitVec, Bits, BitsMut};
 use vec_map::VecMap;
 
+use fxhash::FxHasher;
+
 use crate::alphabets::{Alphabet, RankTransform};
 use crate::data_structures::smallints::SmallInts;
+use crate::data_structures::bwt::{BWT, Less, Occ};
 
 pub type LCPArray = SmallInts<i8, isize>;
 pub type RawSuffixArray = Vec<usize>;
 pub type RawSuffixArraySlice<'a> = &'a [usize];
 
+type HashMapFx<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher>>;
+
 /// A trait exposing general functionality of suffix arrays.
 pub trait SuffixArray {
     fn get(&self, index: usize) -> Option<usize>;
     fn len(&self) -> usize;
     fn is_empty(&self) -> bool;
 
-    // /// Sample the suffix array with the given sample rate.
-    // ///
-    // /// # Arguments
-    // ///
-    // /// * `bwt` - the corresponding BWT
-    // /// * `less` - the corresponding less array
-    // /// * `occ` - the corresponding occ table
-    // /// * `sampling_rate` - if sampling rate is k, every k-th entry will be kept
-    // ///
-    // /// # Example
-    // ///
-    // /// ```
-    // /// use bio::data_structures::suffix_array::{suffix_array, SuffixArray};
-    // /// use bio::data_structures::bwt::{bwt, less, Occ};
-    // /// use bio::alphabets::dna;
-    // ///
-    // /// let text = b"ACGCGAT$";
-    // /// let alphabet = dna::n_alphabet();
-    // /// let sa = suffix_array(text);
-    // /// let bwt = bwt(text, &sa);
-    // /// let less = less(&bwt, &alphabet);
-    // /// let occ = Occ::new(&bwt, 3, &alphabet);
-    // /// let sampled = sa.sample(&bwt, &less, &occ, 1);
-    // ///
-    // /// for i in 0..sa.len() {
-    // ///    assert_eq!(sa.get(i), sampled.get(i));
-    // /// }
-    // /// ```
-    // fn sample<DBWT: DerefBWT, DLess: DerefLess, DOcc: DerefOcc>
-    //     (&self, bwt: DBWT, less: DLess, occ: DOcc, sampling_rate: usize) ->
-    //     SampledSuffixArray<DBWT, DLess, DOcc> {
-    //
-    //     let mut sample = Vec::with_capacity((self.len() as f32 / sampling_rate as f32).ceil() as usize);
-    //     for i in 0..self.len() {
-    //         if (i % sampling_rate) == 0 {
-    //             sample.push(self.get(i).unwrap());
-    //         }
-    //     }
-    //
-    //     SampledSuffixArray {
-    //         bwt: bwt,
-    //         less: less,
-    //         occ: occ,
-    //         sample: sample,
-    //         s: sampling_rate,
-    //     }
-    // }
+    /// Sample the suffix array with the given sample rate.
+    ///
+    /// # Arguments
+    ///
+    /// * `text` - text that the suffix array is built on
+    /// * `bwt` - the corresponding BWT
+    /// * `less` - the corresponding less array
+    /// * `occ` - the corresponding occ table
+    /// * `sampling_rate` - if sampling rate is k, every k-th entry will be kept
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use bio::data_structures::suffix_array::{suffix_array, SuffixArray};
+    /// use bio::data_structures::bwt::{bwt, less, Occ};
+    /// use bio::alphabets::dna;
+    ///
+    /// let text = b"ACGCGAT$";
+    /// let alphabet = dna::n_alphabet();
+    /// let sa = suffix_array(text);
+    /// let bwt = bwt(text, &sa);
+    /// let less = less(&bwt, &alphabet);
+    /// let occ = Occ::new(&bwt, 3, &alphabet);
+    /// let sampled = sa.sample(text, &bwt, &less, &occ, 2);
+    ///
+    /// for i in 0..sa.len() {
+    ///    assert_eq!(sa.get(i), sampled.get(i));
+    /// }
+    /// ```
+    fn sample<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>>
+        (&self, text: &[u8], bwt: DBWT, less: DLess, occ: DOcc, sampling_rate: usize) ->
+        SampledSuffixArray<DBWT, DLess, DOcc> {
+
+        let mut sample = Vec::with_capacity((self.len() as f32 / sampling_rate as f32).ceil() as usize);
+        let mut extra_rows = HashMapFx::default();
+        let sentinel = sentinel(text);
+
+        for i in 0..self.len() {
+            let idx = self.get(i).unwrap();
+            if (i % sampling_rate) == 0 {
+                sample.push(idx);
+            } else if bwt.borrow()[i] == sentinel {
+                // If bwt lookup will return a sentinel
+                // Text suffixes that begin right after a sentinel are always saved as extra rows
+                // to help deal with bwt inaccuracy when there are many sentinels
+                extra_rows.insert(i, idx);
+            }
+        }
+
+        SampledSuffixArray {
+            bwt,
+            less,
+            occ,
+            sample,
+            s: sampling_rate,
+            extra_rows,
+            sentinel,
+        }
+    }
 }
 
-// /// A sampled suffix array.
-// pub struct SampledSuffixArray<DBWT: DerefBWT, DLess: DerefLess, DOcc: DerefOcc> {
-//     bwt: DBWT,
-//     less: DLess,
-//     occ: DOcc,
-//     sample: Vec<usize>,
-//     s: usize, // Rate of sampling
-// }
+/// A sampled suffix array.
+pub struct SampledSuffixArray<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> {
+    bwt: DBWT,
+    less: DLess,
+    occ: DOcc,
+    sample: Vec<usize>,
+    s: usize, // Rate of sampling
+    extra_rows: HashMapFx<usize, usize>,
+    sentinel: u8,
+}
 
 impl SuffixArray for RawSuffixArray {
     fn get(&self, index: usize) -> Option<usize> {
@@ -120,58 +142,51 @@ impl SuffixArray for RawSuffixArray {
     fn is_empty(&self) -> bool {
         Vec::is_empty(self)
     }
+}
+
+impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> SuffixArray for SampledSuffixArray<DBWT, DLess, DOcc> {
+    fn get(&self, index: usize) -> Option<usize> {
+        if index < self.len() {
+            let mut pos = index;
+            let mut offset = 0;
+            loop {
+                if pos % self.s == 0 {
+                    return Some(self.sample[pos / self.s] + offset);
+                }
+
+                let c = self.bwt.borrow()[pos];
+
+                if c == self.sentinel {
+                    // Check if next character is the sentinel
+                    // If so, there must be a cached result to workaround bwt lookup
+                    // inaccuracy when there are multiple sentinels
+                    // This branch should rarely be triggered so the performance impact
+                    // of hashmap lookups would be low
+                    return Some(self.extra_rows[&pos] + offset);
+                }
+
+                pos = self.less.borrow()[c as usize] + self.occ.borrow().get(self.bwt.borrow(), pos - 1, c);
+                offset += 1;
+            }
+        } else {
+            None
+        }
+    }
 
-    // fn sample<DBWT: DerefBWT, DLess: DerefLess, DOcc: DerefOcc>
-    //     (&self, bwt: DBWT, less: DLess, occ: DOcc, sampling_rate: usize) ->
-    //     SampledSuffixArray<DBWT, DLess, DOcc> {
-    //     // Provide a specialized, faster implementation using iterators.
-    //
-    //     let sample = self.iter().cloned().step(sampling_rate).collect();
-    //
-    //     SampledSuffixArray {
-    //         bwt: bwt,
-    //         less: less,
-    //         occ: occ,
-    //         sample: sample,
-    //         s: sampling_rate,
-    //     }
-    // }
+    fn len(&self) -> usize {
+        self.bwt.borrow().len()
+    }
+
+    fn is_empty(&self) -> bool {
+        self.bwt.borrow().is_empty()
+    }
 }
 
-// impl<DBWT: DerefBWT, DLess: DerefLess, DOcc: DerefOcc> SuffixArray for SampledSuffixArray<DBWT, DLess, DOcc> {
-//     fn get(&self, index: usize) -> Option<usize> {
-//         if index < self.len() {
-//             let mut pos = index;
-//             let mut offset = 0;
-//             loop {
-//                 if pos % self.s == 0 {
-//                     return Some(self.sample[pos / self.s] + offset);
-//                 }
-//
-//                 let c = self.bwt[pos];
-//                 pos = self.less[c as usize] + self.occ.get(&self.bwt, pos - 1, c);
-//                 offset += 1;
-//             }
-//         } else {
-//             None
-//         }
-//     }
-//
-//     fn len(&self) -> usize {
-//         self.bwt.len()
-//     }
-
-//     fn is_empty(&self) -> bool {
-//         self.bwt.is_empty()
-//     }
-// }
-//
-//
-// impl<DBWT: DerefBWT, DLess: DerefLess, DOcc: DerefOcc> SampledSuffixArray<DBWT, DLess, DOcc> {
-//     pub fn sampling_rate(&self) -> usize {
-//         self.s
-//     }
-// }
+impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> SampledSuffixArray<DBWT, DLess, DOcc> {
+    pub fn sampling_rate(&self) -> usize {
+        self.s
+    }
+}
 
 /// Construct suffix array for given text of length n.
 /// Complexity: O(n).
@@ -679,13 +694,11 @@ impl PosTypes {
 
 #[cfg(test)]
 mod tests {
-    // Commented-out imports waiting on re-enabling of sampled suffix array
-    // See issue #70
     use super::*;
     use super::{transform_text, PosTypes, SAIS};
-    use crate::alphabets::Alphabet;
+    use crate::alphabets::{dna, Alphabet};
+    use crate::data_structures::bwt::{bwt, less};
     use bv::{BitVec, BitsPush};
-    //use data_structures::bwt::{bwt, less, Occ};
     use std::str;
 
     #[test]
@@ -817,35 +830,51 @@ mod tests {
         }
     }
 
-    // #[test]
-    // fn test_sampled_matches() {
-    //     let test_cases =             [(&b"A$C$G$T$"[..], "simple"),
-    //          (&b"A$A$T$T$"[..], "duplicates"),
-    //          (&b"AA$GA$CA$TA$TC$TG$GT$GC$"[..], "two letter"),
-    //          (&b"AGCCAT$\
-    //             CAGCC$"[..],
-    //             "substring"),
-    //          (&b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\
-    //             AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$"[..],
-    //             "complex"),
-    //          (&b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\
-    //             TTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATTATAATTAGGCCTAC$\
-    //             AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$\
-    //             TATTGGAATAGCCATTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATT$"[..],
-    //             "complex with revcomps"),
-    //          ];
-    //
-    //     for &(text, _) in test_cases.into_iter() {
-    //         let alphabet = dna::n_alphabet();
-    //         let sa = suffix_array(text);
-    //         let bwt = bwt(text, &sa);
-    //         let less = less(&bwt, &alphabet);
-    //         let occ = Occ::new(&bwt, 3, &alphabet);
-    //         let sampled = sa.sample(&bwt, &less, &occ, 2);
-    //
-    //         for i in 0..sa.len() {
-    //             assert_eq!(sa.get(i), sampled.get(i));
-    //         }
-    //     }
-    // }
+    #[test]
+    fn test_sampled_matches() {
+        let test_cases =             [(&b"A$C$G$T$"[..], "simple"),
+             (&b"A$A$T$T$"[..], "duplicates"),
+             (&b"AA$GA$CA$TA$TC$TG$GT$GC$"[..], "two letter"),
+             (&b"AGCCAT$\
+                CAGCC$"[..],
+                "substring"),
+             (&b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\
+                AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$"[..],
+                "complex"),
+             (&b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\
+                TTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATTATAATTAGGCCTAC$\
+                AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$\
+                TATTGGAATAGCCATTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATT$"[..],
+                "complex with revcomps"),
+             (&b"GTAG$GCCTAAT$TATAATCAG$"[..], "issue70"),
+             (&b"TGTGTGTGTG$"[..], "repeating"),
+             (&b"TACTCCGCTAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTT$AAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTAGCGGAGTA$TAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTTGTCCAGTA$TACTGGACAAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTA$ACGCACCCCGGCATTCGTCGACTCTACACTTAGTGGAACATACAAATTCGCTCGCAGGAGCGCCTCATACATTCTAACGCAGTGATCTTCGGCTGAGACT$AGTCTCAGCCGAAGATCACTGCGTTAGAATGTATGAGGCGCTCCTGCGAGCGAATTTGTATGTTCCACTAAGTGTAGAGTCGACGAATGCCGGGGTGCGT$"[..], "complex sentinels"),
+             ];
+
+        for &(text, test_name) in test_cases.iter() {
+            for &sample_rate in &[2, 3, 5] {
+                let alphabet = dna::n_alphabet();
+                let sa = suffix_array(text);
+                let bwt = bwt(text, &sa);
+                let less = less(&bwt, &alphabet);
+                let occ = Occ::new(&bwt, 3, &alphabet);
+                let sampled = sa.sample(text, &bwt, &less, &occ, sample_rate);
+
+                for i in 0..sa.len() {
+                    let sa_idx = sa.get(i).unwrap();
+                    let sampled_idx = sampled.get(i).unwrap();
+                    assert_eq!(
+                        sa_idx,
+                        sampled_idx,
+                        "Failed:\n{}\n{}\nat index {} do not match in test: {} (sample rate: {})",
+                        str::from_utf8(&text[sa_idx..]).unwrap(),
+                        str::from_utf8(&text[sampled_idx..]).unwrap(),
+                        i,
+                        test_name,
+                        sample_rate
+                    );
+                }
+            }
+        }
+    }
 }

From c3b85f8c0bd5c5011b7bc5b1d4abae25f2131f96 Mon Sep 17 00:00:00 2001
From: Daniel Liu <daniel.liu@10xgenomics.com>
Date: Thu, 19 Aug 2021 17:01:27 -0700
Subject: [PATCH 2/8] Serde derive for sampled suffix array

---
 src/data_structures/suffix_array.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/data_structures/suffix_array.rs b/src/data_structures/suffix_array.rs
index 44458d01e..4bb3daf39 100644
--- a/src/data_structures/suffix_array.rs
+++ b/src/data_structures/suffix_array.rs
@@ -37,6 +37,8 @@ use vec_map::VecMap;
 
 use fxhash::FxHasher;
 
+use serde::{Serialize, Deserialize};
+
 use crate::alphabets::{Alphabet, RankTransform};
 use crate::data_structures::smallints::SmallInts;
 use crate::data_structures::bwt::{BWT, Less, Occ};
@@ -115,6 +117,7 @@ pub trait SuffixArray {
 }
 
 /// A sampled suffix array.
+#[derive(Clone, Serialize, Deserialize)]
 pub struct SampledSuffixArray<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> {
     bwt: DBWT,
     less: DLess,

From 2057ceb4a468524462b4e792eb8945e8865798ae Mon Sep 17 00:00:00 2001
From: Daniel Liu <daniel.liu@10xgenomics.com>
Date: Thu, 19 Aug 2021 17:33:09 -0700
Subject: [PATCH 3/8] Cargo fmt

---
 src/data_structures/suffix_array.rs | 42 ++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/src/data_structures/suffix_array.rs b/src/data_structures/suffix_array.rs
index 4bb3daf39..cf0cb3fcf 100644
--- a/src/data_structures/suffix_array.rs
+++ b/src/data_structures/suffix_array.rs
@@ -21,13 +21,13 @@
 //! );
 //! ```
 
+use std::borrow::Borrow;
 use std::cmp;
+use std::collections::HashMap;
 use std::fmt::Debug;
+use std::hash::BuildHasherDefault;
 use std::iter;
 use std::ops::Deref;
-use std::borrow::Borrow;
-use std::collections::HashMap;
-use std::hash::BuildHasherDefault;
 
 use num_integer::Integer;
 use num_traits::{cast, NumCast, Unsigned};
@@ -37,11 +37,11 @@ use vec_map::VecMap;
 
 use fxhash::FxHasher;
 
-use serde::{Serialize, Deserialize};
+use serde::{Deserialize, Serialize};
 
 use crate::alphabets::{Alphabet, RankTransform};
+use crate::data_structures::bwt::{Less, Occ, BWT};
 use crate::data_structures::smallints::SmallInts;
-use crate::data_structures::bwt::{BWT, Less, Occ};
 
 pub type LCPArray = SmallInts<i8, isize>;
 pub type RawSuffixArray = Vec<usize>;
@@ -68,9 +68,9 @@ pub trait SuffixArray {
     /// # Example
     ///
     /// ```
-    /// use bio::data_structures::suffix_array::{suffix_array, SuffixArray};
-    /// use bio::data_structures::bwt::{bwt, less, Occ};
     /// use bio::alphabets::dna;
+    /// use bio::data_structures::bwt::{bwt, less, Occ};
+    /// use bio::data_structures::suffix_array::{suffix_array, SuffixArray};
     ///
     /// let text = b"ACGCGAT$";
     /// let alphabet = dna::n_alphabet();
@@ -81,14 +81,19 @@ pub trait SuffixArray {
     /// let sampled = sa.sample(text, &bwt, &less, &occ, 2);
     ///
     /// for i in 0..sa.len() {
-    ///    assert_eq!(sa.get(i), sampled.get(i));
+    ///     assert_eq!(sa.get(i), sampled.get(i));
     /// }
     /// ```
-    fn sample<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>>
-        (&self, text: &[u8], bwt: DBWT, less: DLess, occ: DOcc, sampling_rate: usize) ->
-        SampledSuffixArray<DBWT, DLess, DOcc> {
-
-        let mut sample = Vec::with_capacity((self.len() as f32 / sampling_rate as f32).ceil() as usize);
+    fn sample<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>>(
+        &self,
+        text: &[u8],
+        bwt: DBWT,
+        less: DLess,
+        occ: DOcc,
+        sampling_rate: usize,
+    ) -> SampledSuffixArray<DBWT, DLess, DOcc> {
+        let mut sample =
+            Vec::with_capacity((self.len() as f32 / sampling_rate as f32).ceil() as usize);
         let mut extra_rows = HashMapFx::default();
         let sentinel = sentinel(text);
 
@@ -147,7 +152,9 @@ impl SuffixArray for RawSuffixArray {
     }
 }
 
-impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> SuffixArray for SampledSuffixArray<DBWT, DLess, DOcc> {
+impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> SuffixArray
+    for SampledSuffixArray<DBWT, DLess, DOcc>
+{
     fn get(&self, index: usize) -> Option<usize> {
         if index < self.len() {
             let mut pos = index;
@@ -168,7 +175,8 @@ impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> SuffixArray for
                     return Some(self.extra_rows[&pos] + offset);
                 }
 
-                pos = self.less.borrow()[c as usize] + self.occ.borrow().get(self.bwt.borrow(), pos - 1, c);
+                pos = self.less.borrow()[c as usize]
+                    + self.occ.borrow().get(self.bwt.borrow(), pos - 1, c);
                 offset += 1;
             }
         } else {
@@ -185,7 +193,9 @@ impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> SuffixArray for
     }
 }
 
-impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> SampledSuffixArray<DBWT, DLess, DOcc> {
+impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>>
+    SampledSuffixArray<DBWT, DLess, DOcc>
+{
     pub fn sampling_rate(&self) -> usize {
         self.s
     }

From 78b69bfcd8e1439c9d2e2ebfb78a0242184cb3a2 Mon Sep 17 00:00:00 2001
From: Daniel Liu <daniel.liu@10xgenomics.com>
Date: Thu, 19 Aug 2021 21:36:10 -0700
Subject: [PATCH 4/8] Clarified comments

---
 src/data_structures/suffix_array.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/data_structures/suffix_array.rs b/src/data_structures/suffix_array.rs
index cf0cb3fcf..7f4a80d78 100644
--- a/src/data_structures/suffix_array.rs
+++ b/src/data_structures/suffix_array.rs
@@ -104,7 +104,7 @@ pub trait SuffixArray {
             } else if bwt.borrow()[i] == sentinel {
                 // If bwt lookup will return a sentinel
                 // Text suffixes that begin right after a sentinel are always saved as extra rows
-                // to help deal with bwt inaccuracy when there are many sentinels
+                // to help deal with FM index last to front inaccuracy when there are many sentinels
                 extra_rows.insert(i, idx);
             }
         }
@@ -167,9 +167,9 @@ impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> SuffixArray
                 let c = self.bwt.borrow()[pos];
 
                 if c == self.sentinel {
-                    // Check if next character is the sentinel
-                    // If so, there must be a cached result to workaround bwt lookup
-                    // inaccuracy when there are multiple sentinels
+                    // Check if next character in the bwt is the sentinel
+                    // If so, there must be a cached result to workaround FM index last to front
+                    // mapping inaccuracy when there are multiple sentinels
                     // This branch should rarely be triggered so the performance impact
                     // of hashmap lookups would be low
                     return Some(self.extra_rows[&pos] + offset);

From db74992304bb88067a8300270864a529daf424ad Mon Sep 17 00:00:00 2001
From: Daniel Liu <daniel.liu@10xgenomics.com>
Date: Thu, 19 Aug 2021 22:18:51 -0700
Subject: [PATCH 5/8] Allow Occ to be clonable

---
 src/data_structures/bwt.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/data_structures/bwt.rs b/src/data_structures/bwt.rs
index 6147d1b53..477ee3bbf 100644
--- a/src/data_structures/bwt.rs
+++ b/src/data_structures/bwt.rs
@@ -73,7 +73,7 @@ pub fn invert_bwt(bwt: &BWTSlice) -> Vec<u8> {
 }
 
 /// An occurrence array implementation.
-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct Occ {
     occ: Vec<Vec<usize>>,
     k: u32,

From 330fc2ea20120e1a9858e1b187d1883eeb647c42 Mon Sep 17 00:00:00 2001
From: Daniel Liu <daniel.liu@10xgenomics.com>
Date: Fri, 20 Aug 2021 13:27:18 -0700
Subject: [PATCH 6/8] Make fm index accessible in sampled suffix array

---
 src/data_structures/suffix_array.rs | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/data_structures/suffix_array.rs b/src/data_structures/suffix_array.rs
index 7f4a80d78..8bf49a393 100644
--- a/src/data_structures/suffix_array.rs
+++ b/src/data_structures/suffix_array.rs
@@ -199,6 +199,18 @@ impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>>
     pub fn sampling_rate(&self) -> usize {
         self.s
     }
+
+    pub fn bwt(&self) -> &BWT {
+        self.bwt.borrow()
+    }
+
+    pub fn less(&self) -> &Less {
+        self.less.borrow()
+    }
+
+    pub fn occ(&self) -> &Occ {
+        self.occ.borrow()
+    }
 }
 
 /// Construct suffix array for given text of length n.

From f25e632e7dd54fdf5256f44668f0b0c58ff53c25 Mon Sep 17 00:00:00 2001
From: Daniel Liu <daniel.liu@10xgenomics.com>
Date: Fri, 20 Aug 2021 14:09:31 -0700
Subject: [PATCH 7/8] Added unsafe fmd index contruction useful for special
 cases

---
 src/data_structures/fmindex.rs | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/data_structures/fmindex.rs b/src/data_structures/fmindex.rs
index d51b883f8..124f5a898 100644
--- a/src/data_structures/fmindex.rs
+++ b/src/data_structures/fmindex.rs
@@ -480,6 +480,21 @@ impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> FMDIndex<DBWT, D
 
         self.backward_ext(&interval.swapped(), comp_a).swapped()
     }
+
+    /// Construct a new instance of the FMD index (see Heng Li (2012) Bioinformatics)
+    /// without checking whether the text is over the DNA alphabet with N.
+    /// This expects a BWT that was created from a text over the DNA alphabet with N
+    /// (`alphabets::dna::n_alphabet()`) consisting of the
+    /// concatenation with its reverse complement, separated by the sentinel symbol `$`.
+    /// I.e., let T be the original text and R be its reverse complement.
+    /// Then, the expected text is T$R$. Further, multiple concatenated texts are allowed, e.g.
+    /// T1$R1$T2$R2$T3$R3$.
+    /// It is unsafe to construct an FMD index from an FM index that is not built on the DNA alphabet.
+    pub unsafe fn from_fmindex_unchecked(
+        fmindex: FMIndex<DBWT, DLess, DOcc>,
+    ) -> FMDIndex<DBWT, DLess, DOcc> {
+        FMDIndex { fmindex }
+    }
 }
 
 #[cfg(test)]

From 9ff05fb1319d120404648cf79dae80806c6edb49 Mon Sep 17 00:00:00 2001
From: Daniel Liu <daniel.liu@10xgenomics.com>
Date: Fri, 20 Aug 2021 18:17:00 -0700
Subject: [PATCH 8/8] Added random test cases

---
 src/data_structures/suffix_array.rs | 46 +++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 3 deletions(-)

diff --git a/src/data_structures/suffix_array.rs b/src/data_structures/suffix_array.rs
index 8bf49a393..886529d24 100644
--- a/src/data_structures/suffix_array.rs
+++ b/src/data_structures/suffix_array.rs
@@ -196,18 +196,22 @@ impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>> SuffixArray
 impl<DBWT: Borrow<BWT>, DLess: Borrow<Less>, DOcc: Borrow<Occ>>
     SampledSuffixArray<DBWT, DLess, DOcc>
 {
+    /// Get the sampling rate of the suffix array.
     pub fn sampling_rate(&self) -> usize {
         self.s
     }
 
+    /// Get a reference to the internal BWT.
     pub fn bwt(&self) -> &BWT {
         self.bwt.borrow()
     }
 
+    /// Get a reference to the internal Less.
     pub fn less(&self) -> &Less {
         self.less.borrow()
     }
 
+    /// Get a reference to the internal Occ.
     pub fn occ(&self) -> &Occ {
         self.occ.borrow()
     }
@@ -724,6 +728,8 @@ mod tests {
     use crate::alphabets::{dna, Alphabet};
     use crate::data_structures::bwt::{bwt, less};
     use bv::{BitVec, BitsPush};
+    use rand;
+    use rand::prelude::*;
     use std::str;
 
     #[test]
@@ -816,9 +822,27 @@ mod tests {
         ) + "$"
     }
 
+    fn rand_seqs(num_seqs: usize, seq_len: usize) -> Vec<u8> {
+        let mut rng = rand::thread_rng();
+        let alpha = [b'A', b'T', b'C', b'G', b'N'];
+        let seqs = (0..num_seqs)
+            .into_iter()
+            .map(|_| {
+                let len = rng.gen_range((seq_len / 2)..=seq_len);
+                (0..len)
+                    .into_iter()
+                    .map(|_| *alpha.choose(&mut rng).unwrap())
+                    .collect::<Vec<u8>>()
+            })
+            .collect::<Vec<_>>();
+        let mut res = seqs.join(&b'$');
+        res.push(b'$');
+        res
+    }
+
     #[test]
     fn test_sorts_lexically() {
-        let test_cases =             [(&b"A$C$G$T$"[..], "simple"),
+        let mut test_cases = vec![(&b"A$C$G$T$"[..], "simple"),
              (&b"A$A$T$T$"[..], "duplicates"),
              (&b"AA$GA$CA$TA$TC$TG$GT$GC$"[..], "two letter"),
              (&b"AGCCAT$\
@@ -833,6 +857,14 @@ mod tests {
                 TATTGGAATAGCCATTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATT$"[..],
                 "complex with revcomps"),
              ];
+        let num_rand = 100;
+        let rand_cases = (0..num_rand)
+            .into_iter()
+            .map(|i| rand_seqs(10, i * 10))
+            .collect::<Vec<_>>();
+        for i in 0..num_rand {
+            test_cases.push((&rand_cases[i], "rand test case"));
+        }
 
         for &(text, test_name) in test_cases.iter() {
             let pos = suffix_array(text);
@@ -857,7 +889,7 @@ mod tests {
 
     #[test]
     fn test_sampled_matches() {
-        let test_cases =             [(&b"A$C$G$T$"[..], "simple"),
+        let mut test_cases = vec![(&b"A$C$G$T$"[..], "simple"),
              (&b"A$A$T$T$"[..], "duplicates"),
              (&b"AA$GA$CA$TA$TC$TG$GT$GC$"[..], "two letter"),
              (&b"AGCCAT$\
@@ -875,9 +907,17 @@ mod tests {
              (&b"TGTGTGTGTG$"[..], "repeating"),
              (&b"TACTCCGCTAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTT$AAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTAGCGGAGTA$TAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTTGTCCAGTA$TACTGGACAAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTA$ACGCACCCCGGCATTCGTCGACTCTACACTTAGTGGAACATACAAATTCGCTCGCAGGAGCGCCTCATACATTCTAACGCAGTGATCTTCGGCTGAGACT$AGTCTCAGCCGAAGATCACTGCGTTAGAATGTATGAGGCGCTCCTGCGAGCGAATTTGTATGTTCCACTAAGTGTAGAGTCGACGAATGCCGGGGTGCGT$"[..], "complex sentinels"),
              ];
+        let num_rand = 100;
+        let rand_cases = (0..num_rand)
+            .into_iter()
+            .map(|i| rand_seqs(10, i * 10))
+            .collect::<Vec<_>>();
+        for i in 0..num_rand {
+            test_cases.push((&rand_cases[i], "rand test case"));
+        }
 
         for &(text, test_name) in test_cases.iter() {
-            for &sample_rate in &[2, 3, 5] {
+            for &sample_rate in &[2, 3, 5, 16] {
                 let alphabet = dna::n_alphabet();
                 let sa = suffix_array(text);
                 let bwt = bwt(text, &sa);