From f45e91dfdc64ecb662d676f2996ed4f14c079995 Mon Sep 17 00:00:00 2001 From: Michael Hall Date: Tue, 6 Jul 2021 18:46:25 +1000 Subject: [PATCH] feat!: Improve bcf Record filter interface and improve docs (#306) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add docs for bcf Record::format * change signature for filter methods * fmt * add FilterId trait and switch has_filter implementation * update push_filter impl * update remove and set filter impl * fix format enum spelling for vcf Co-authored-by: Johannes Köster --- CHANGELOG.md | 8 +- src/bcf/mod.rs | 9 +- src/bcf/record.rs | 292 +++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 276 insertions(+), 33 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eec88833d..7888d7f07 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,13 @@ All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/). +## [Unreleased] +### Changes +- `bcf::Record` methods `has_filter`, `remove_filter`, `push_filter`, and `set_filter` + all now take a byte slice (`&[u8]`) or an `Id`. + +[Unreleased]: https://github.com/rust-bio/rust-htslib/compare/v0.37.0...HEAD + ## [0.37.0] - 2021-07-05 ### Added - `bcf::Record` methods `end`, `clear`, and `rlen` (@mbhall88). @@ -16,7 +23,6 @@ This project adheres to [Semantic Versioning](http://semver.org/). - Fixed compiler warnings (@fxwiegand). - BAM header representation is now always kept in sync between textual and binary (@jch-13). - ## [0.36.0] - 2020-11-23 ### Changes - Improved genotype API in VCF/BCF records (@MaltheSR). diff --git a/src/bcf/mod.rs b/src/bcf/mod.rs index a6b593512..3e3b0d6ac 100644 --- a/src/bcf/mod.rs +++ b/src/bcf/mod.rs @@ -1272,7 +1272,6 @@ mod tests { Format::Vcf, ) .expect("Error opening file."); - let header = writer.header().clone(); // Setup empty record, filled below. let mut record = writer.empty_record(); @@ -1296,10 +1295,10 @@ mod tests { record.push_id(b"first_id").unwrap(); assert!(record.filters().next().is_none()); - record.set_filters(&[header.name_to_id(b"q10").unwrap()]); - record.push_filter(header.name_to_id(b"s50").unwrap()); - record.remove_filter(header.name_to_id(b"q10").unwrap(), true); - record.push_filter(header.name_to_id(b"q10").unwrap()); + record.set_filters(&["q10".as_bytes()]).unwrap(); + record.push_filter("s50".as_bytes()).unwrap(); + record.remove_filter("q10".as_bytes(), true).unwrap(); + record.push_filter("q10".as_bytes()).unwrap(); record.set_alleles(&[b"C", b"T", b"G"]).unwrap(); diff --git a/src/bcf/record.rs b/src/bcf/record.rs index f253fe19d..12c43bc3f 100644 --- a/src/bcf/record.rs +++ b/src/bcf/record.rs @@ -79,6 +79,30 @@ impl NumericUtils for i32 { } } +/// A trait to allow for seamless use of bytes or integer identifiers for filters +pub trait FilterId { + fn id_from_header(&self, header: &HeaderView) -> Result; + fn is_pass(&self) -> bool; +} + +impl FilterId for [u8] { + fn id_from_header(&self, header: &HeaderView) -> Result { + header.name_to_id(self) + } + fn is_pass(&self) -> bool { + matches!(self, b"PASS" | b".") + } +} + +impl FilterId for Id { + fn id_from_header(&self, _header: &HeaderView) -> Result { + Ok(*self) + } + fn is_pass(&self) -> bool { + *self == Id(0) + } +} + /// A buffer for info or format data. #[derive(Debug)] pub struct Buffer { @@ -335,69 +359,187 @@ impl Record { /// Query whether the filter with the given ID has been set. /// - /// # Arguments + /// This method can be used to check if a record passes filtering by using either `Id(0)`, + /// `PASS` or `.` /// - /// - `flt_id` - The filter ID to query for. - pub fn has_filter(&self, flt_id: Id) -> bool { - if *flt_id == 0 && self.inner().d.n_flt == 0 { + /// # Example + /// ```rust + /// # use rust_htslib::bcf::{Format, Header, Writer}; + /// # use rust_htslib::bcf::header::Id; + /// # use tempfile::NamedTempFile; + /// # let tmp = tempfile::NamedTempFile::new().unwrap(); + /// # let path = tmp.path(); + /// let mut header = Header::new(); + /// header.push_record(br#"##FILTER="#); + /// # let vcf = Writer::from_path(path, &header, true, Format::Vcf).unwrap(); + /// # let mut record = vcf.empty_record(); + /// assert!(record.has_filter("PASS".as_bytes())); + /// assert!(record.has_filter(".".as_bytes())); + /// assert!(record.has_filter(&Id(0))); + /// + /// record.push_filter("foo".as_bytes()).unwrap(); + /// assert!(record.has_filter("foo".as_bytes())); + /// assert!(!record.has_filter("PASS".as_bytes())) + /// ``` + pub fn has_filter(&self, flt_id: &T) -> bool { + if flt_id.is_pass() && self.inner().d.n_flt == 0 { return true; } + let id = match flt_id.id_from_header(&self.header()) { + Ok(i) => *i, + Err(_) => return false, + }; for i in 0..(self.inner().d.n_flt as isize) { - if unsafe { *self.inner().d.flt.offset(i) } == *flt_id as i32 { + if unsafe { *self.inner().d.flt.offset(i) } == id as i32 { return true; } } false } - /// Set the given filters IDs to the FILTER column. + /// Set the given filter IDs to the FILTER column. /// - /// Setting an empty slice removes all filters. + /// Setting an empty slice removes all filters and sets `PASS`. /// - /// # Arguments + /// # Example + /// ```rust + /// # use rust_htslib::bcf::{Format, Header, Writer}; + /// # use rust_htslib::bcf::header::Id; + /// # use tempfile::NamedTempFile; + /// # let tmp = tempfile::NamedTempFile::new().unwrap(); + /// # let path = tmp.path(); + /// let mut header = Header::new(); + /// header.push_record(br#"##FILTER="#); + /// header.push_record(br#"##FILTER="#); + /// # let vcf = Writer::from_path(path, &header, true, Format::Vcf).unwrap(); + /// # let mut record = vcf.empty_record(); + /// let foo = record.header().name_to_id(b"foo").unwrap(); + /// let bar = record.header().name_to_id(b"bar").unwrap(); + /// assert!(record.has_filter("PASS".as_bytes())); + /// let mut filters = vec![&foo, &bar]; + /// record.set_filters(&filters).unwrap(); + /// assert!(record.has_filter(&foo)); + /// assert!(record.has_filter(&bar)); + /// assert!(!record.has_filter("PASS".as_bytes())); + /// filters.clear(); + /// record.set_filters(&filters).unwrap(); + /// assert!(record.has_filter("PASS".as_bytes())); + /// assert!(!record.has_filter("foo".as_bytes())); + /// // 'baz' isn't in the header + /// assert!(record.set_filters(&["baz".as_bytes()]).is_err()) + /// ``` /// - /// - `flt_ids` - The identifiers of the filter values to set. - pub fn set_filters(&mut self, flt_ids: &[Id]) { - let mut flt_ids: Vec = flt_ids.iter().map(|x| **x as i32).collect(); + /// # Errors + /// If any of the filter IDs do not exist in the header, an [`Error::BcfUnknownID`] is returned. + /// + pub fn set_filters(&mut self, flt_ids: &[&T]) -> Result<()> { + let mut ids: Vec = flt_ids + .iter() + .map(|id| id.id_from_header(self.header()).map(|id| *id as i32)) + .collect::>>()?; unsafe { htslib::bcf_update_filter( self.header().inner, self.inner, - flt_ids.as_mut_ptr(), - flt_ids.len() as i32, + ids.as_mut_ptr(), + ids.len() as i32, ); - } + }; + Ok(()) } /// Add the given filter to the FILTER column. /// - /// If `val` corresponds to `"PASS"` then all existing filters are removed first. If other than - /// `"PASS"`, then existing `"PASS"` is removed. + /// If `flt_id` is `PASS` or `.` then all existing filters are removed first. Otherwise, + /// any existing `PASS` filter is removed. /// - /// # Arguments + /// # Example + /// ```rust + /// # use rust_htslib::bcf::{Format, Header, Writer}; + /// # use tempfile::NamedTempFile; + /// # let tmp = tempfile::NamedTempFile::new().unwrap(); + /// # let path = tmp.path(); + /// let mut header = Header::new(); + /// header.push_record(br#"##FILTER="#); + /// header.push_record(br#"##FILTER="#); + /// # let vcf = Writer::from_path(path, &header, true, Format::Vcf).unwrap(); + /// # let mut record = vcf.empty_record(); + /// let foo = "foo".as_bytes(); + /// let bar = record.header().name_to_id(b"bar").unwrap(); + /// assert!(record.has_filter("PASS".as_bytes())); + /// + /// record.push_filter(foo).unwrap(); + /// record.push_filter(&bar).unwrap(); + /// assert!(record.has_filter(foo)); + /// assert!(record.has_filter(&bar)); + /// // filter must exist in the header + /// assert!(record.push_filter("baz".as_bytes()).is_err()) + /// ``` /// - /// - `flt_id` - The corresponding filter ID value to add. - pub fn push_filter(&mut self, flt_id: Id) { + /// # Errors + /// If the `flt_id` does not exist in the header, an [`Error::BcfUnknownID`] is returned. + /// + pub fn push_filter(&mut self, flt_id: &T) -> Result<()> { + let id = flt_id.id_from_header(self.header())?; unsafe { - htslib::bcf_add_filter(self.header().inner, self.inner, *flt_id as i32); - } + htslib::bcf_add_filter(self.header().inner, self.inner, *id as i32); + }; + Ok(()) } /// Remove the given filter from the FILTER column. /// /// # Arguments /// - /// - `val` - The corresponding filter ID to remove. - /// - `pass_on_empty` - Set to "PASS" when removing the last value. - pub fn remove_filter(&mut self, flt_id: Id, pass_on_empty: bool) { + /// - `flt_id` - The corresponding filter ID to remove. + /// - `pass_on_empty` - Set to `PASS` when removing the last filter. + /// + /// # Example + /// ```rust + /// # use rust_htslib::bcf::{Format, Header, Writer}; + /// # use tempfile::NamedTempFile; + /// # let tmp = tempfile::NamedTempFile::new().unwrap(); + /// # let path = tmp.path(); + /// let mut header = Header::new(); + /// header.push_record(br#"##FILTER="#); + /// header.push_record(br#"##FILTER="#); + /// # let vcf = Writer::from_path(path, &header, true, Format::Vcf).unwrap(); + /// # let mut record = vcf.empty_record(); + /// let foo = "foo".as_bytes(); + /// let bar = "bar".as_bytes(); + /// record.set_filters(&[foo, bar]).unwrap(); + /// assert!(record.has_filter(foo)); + /// assert!(record.has_filter(bar)); + /// + /// record.remove_filter(foo, true).unwrap(); + /// assert!(!record.has_filter(foo)); + /// assert!(record.has_filter(bar)); + /// // 'baz' is not in the header + /// assert!(record.remove_filter("baz".as_bytes(), true).is_err()); + /// + /// record.remove_filter(bar, true).unwrap(); + /// assert!(!record.has_filter(bar)); + /// assert!(record.has_filter("PASS".as_bytes())); + /// ``` + /// + /// # Errors + /// If the `flt_id` does not exist in the header, an [`Error::BcfUnknownID`] is returned. + /// + pub fn remove_filter( + &mut self, + flt_id: &T, + pass_on_empty: bool, + ) -> Result<()> { + let id = flt_id.id_from_header(self.header())?; unsafe { htslib::bcf_remove_filter( self.header().inner, self.inner, - *flt_id as i32, + *id as i32, pass_on_empty as i32, - ); - } + ) + }; + Ok(()) } /// Get alleles strings. @@ -1404,4 +1546,100 @@ mod tests { assert_eq!(record.sample_count(), 0); assert_eq!(record.pos(), 0) } + + #[test] + fn test_record_has_filter_pass_is_default() { + let tmp = NamedTempFile::new().unwrap(); + let path = tmp.path(); + let header = Header::new(); + let vcf = Writer::from_path(path, &header, true, Format::Vcf).unwrap(); + let record = vcf.empty_record(); + + assert!(record.has_filter("PASS".as_bytes())); + assert!(record.has_filter(".".as_bytes())); + assert!(record.has_filter(&Id(0))); + assert!(!record.has_filter("foo".as_bytes())); + assert!(!record.has_filter(&Id(2))); + } + + #[test] + fn test_record_has_filter_custom() { + let tmp = NamedTempFile::new().unwrap(); + let path = tmp.path(); + let mut header = Header::new(); + header.push_record(br#"##FILTER="#); + let vcf = Writer::from_path(path, &header, true, Format::Vcf).unwrap(); + let mut record = vcf.empty_record(); + record.push_filter("foo".as_bytes()).unwrap(); + + assert!(record.has_filter("foo".as_bytes())); + assert!(!record.has_filter("PASS".as_bytes())) + } + + #[test] + fn test_record_push_filter() { + let tmp = NamedTempFile::new().unwrap(); + let path = tmp.path(); + let mut header = Header::new(); + header.push_record(br#"##FILTER="#); + header.push_record(br#"##FILTER="#); + let vcf = Writer::from_path(path, &header, true, Format::Vcf).unwrap(); + let mut record = vcf.empty_record(); + assert!(record.has_filter("PASS".as_bytes())); + record.push_filter("foo".as_bytes()).unwrap(); + let bar = record.header().name_to_id(b"bar").unwrap(); + record.push_filter(&bar).unwrap(); + assert!(record.has_filter("foo".as_bytes())); + assert!(record.has_filter(&bar)); + assert!(!record.has_filter("PASS".as_bytes())); + assert!(record.push_filter("baz".as_bytes()).is_err()) + } + + #[test] + fn test_record_set_filters() { + let tmp = NamedTempFile::new().unwrap(); + let path = tmp.path(); + let mut header = Header::new(); + header.push_record(br#"##FILTER="#); + header.push_record(br#"##FILTER="#); + let vcf = Writer::from_path(path, &header, true, Format::Vcf).unwrap(); + let mut record = vcf.empty_record(); + assert!(record.has_filter("PASS".as_bytes())); + record + .set_filters(&["foo".as_bytes(), "bar".as_bytes()]) + .unwrap(); + assert!(record.has_filter("foo".as_bytes())); + assert!(record.has_filter("bar".as_bytes())); + assert!(!record.has_filter("PASS".as_bytes())); + let filters: &[&Id] = &[]; + record.set_filters(filters).unwrap(); + assert!(record.has_filter("PASS".as_bytes())); + assert!(!record.has_filter("foo".as_bytes())); + assert!(record + .set_filters(&["foo".as_bytes(), "baz".as_bytes()]) + .is_err()) + } + + #[test] + fn test_record_remove_filter() { + let tmp = NamedTempFile::new().unwrap(); + let path = tmp.path(); + let mut header = Header::new(); + header.push_record(br#"##FILTER="#); + header.push_record(br#"##FILTER="#); + let vcf = Writer::from_path(path, &header, true, Format::Vcf).unwrap(); + let mut record = vcf.empty_record(); + let foo = record.header().name_to_id(b"foo").unwrap(); + let bar = record.header().name_to_id(b"bar").unwrap(); + record.set_filters(&[&foo, &bar]).unwrap(); + assert!(record.has_filter(&foo)); + assert!(record.has_filter(&bar)); + record.remove_filter(&foo, true).unwrap(); + assert!(!record.has_filter(&foo)); + assert!(record.has_filter(&bar)); + assert!(record.remove_filter("baz".as_bytes(), true).is_err()); + record.remove_filter(&bar, true).unwrap(); + assert!(!record.has_filter(&bar)); + assert!(record.has_filter("PASS".as_bytes())); + } }