fix: Improved parameter estimation for (H)PHMM (#371)

### Description This PR updates the (H)PHMM parameter estimation routine: - counts transitions a little differently then before: 1. conceptually expand cigar strings (i.e. `3M2D1I2M` → `1M1M1M1D1D1I1M1M`) 2. iterate consecutive pairs e.g. `(1M, 1M)` and look at rseq and qseq at the corresponding positions 3. classify that into any of the 82 transitions, e.g. `(1M, 1M)` together with `((T, T), (G, G))` corresponds to a `match T → match G` transition - this will only look at `N` records from the SAM/BAM/CRAM, where `N` is a number calculated as a multinomial sample size estimation as described in [https://www.jstor.org/stable/2683352](https://www.jstor.org/stable/2683352) When the number of alignments in the file is known (which is the case for indexed BAM files but not for indexed SAM/CRAM), can use a finite population correction version of the estimate instead. - currently, iterates through the whole file with a fixed step size. However, this isn't any faster than simply looking at every record, so do any combination of the following instead: 1. only use the *first* `N` records (possibly biased subsample, but definitely quicker) 2. for CRAM, set lowlevel htslib reading options to skip fields that aren't used for estimation purposes (potentially quicker, but only for CRAM?) 3. look at samtools subsample or simply use that as a preprocessing step 4. make better use of indices for random access ### QC  * [ ] The PR contains a test case for the changes or the changes are already covered by an existing test case. * [ ] The documentation at https://github.com/varlociraptor/varlociraptor.github.io is updated in a separate PR to reflect the changes or this is not necessary (e.g. if the change does neither modify the calling grammar nor the behavior or functionalities of Varlociraptor). --------- Co-authored-by: Johannes Köster <johannes.koester@tu-dortmund.de>
varlociraptor · Jun 12, 2023 · 508333e · 508333e
1 parent 2c95caa
commit 508333e
Show file tree

Hide file tree

Showing 12 changed files with 433 additions and 137 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -17,12 +17,12 @@ askama = "0.11.1"
 auto_ops = "0.3.0"
 bincode = "1.3"
 bio = "1.1.0"
-bio-types = {version = ">=0.12", features = ["serde"]}
+bio-types = { version = ">=0.12", features = ["serde"] }
 boolean_expression = "0.4"
 bv = "0.11"
 byteorder = "1.3"
 cached = "0.42"
-clap = {version = "2", features = ["yaml", "color", "suggestions"]}
+clap = { version = "2", features = ["yaml", "color", "suggestions"] }
 counter = "0.5"
 csv = "1"
 data-encoding = "2.3"
@@ -32,17 +32,17 @@ derive_builder = "0.12.0"
 eval = "0.4"
 fern = "0.6"
 getset = "0.1.0"
-half = {version = "1.4", features = ["serde"]}
+half = { version = "1.4", features = ["serde"] }
 itertools = "0.10"
 itertools-num = "0.1"
 jemallocator = "0.5.0"
 lazy_static = "1"
 log = "0.4"
 lru = "0.7"
 lru_time_cache = "0.11"
-ndarray = "0.15"
+ndarray = { version = "0.15", features = ["serde"] }
 num-traits = "0.2"
-ordered-float = {version = "1", features = ["serde"]}
+ordered-float = { version = "1", features = ["serde"] }
 paste = "1"
 pest = "2"
 pest_derive = "2"
@@ -67,7 +67,7 @@ tempfile = "3"
 thiserror = "1.0"
 time = "0.3"
 typed-builder = "0.12"
-uuid = {version = "0.8", features = ["v4"]}
+uuid = { version = "0.8", features = ["v4"] }
 vec_map = "0.8"
 yaml-rust = ">=0.4.1,<0.5"
 linear-map = "1.2"

diff --git a/src/cli.rs b/src/cli.rs
@@ -809,11 +809,10 @@ pub fn run(opt: Varlociraptor) -> Result<()> {
                         );
                     };
 
-                    let mut reference_buffer = Arc::new(reference::Buffer::new(
-                        fasta::IndexedReader::from_file(&reference)
+                    let mut reference_buffer = Arc::new(
+                        reference::Buffer::from_path(&reference, reference_buffer_size)
                             .context("Unable to read genome reference.")?,
-                        reference_buffer_size,
-                    ));
+                    );
 
                     let alignment_properties = est_or_load_alignment_properties(
                         &alignment_properties,
@@ -1231,8 +1230,7 @@ pub fn run(opt: Varlociraptor) -> Result<()> {
                 num_records,
                 epsilon_gap,
             } => {
-                let mut reference_buffer =
-                    reference::Buffer::new(fasta::IndexedReader::from_file(&reference)?, 1);
+                let mut reference_buffer = reference::Buffer::from_path(&reference, 1)?;
                 let alignment_properties = estimate_alignment_properties(
                     bam,
                     false,