diff --git a/R/d_ratio_filter_class.R b/R/d_ratio_filter_class.R index e0f92a2..814385e 100644 --- a/R/d_ratio_filter_class.R +++ b/R/d_ratio_filter_class.R @@ -4,11 +4,19 @@ #' M = dratio_filter(threshold=20,qc_label='QC',factor_name='Class') #' M = model_apply(M,D) #' @export dratio_filter -dratio_filter = function(threshold=20, qc_label='QC', factor_name, ...) { +dratio_filter = function( + threshold=20, + qc_label='QC', + factor_name, + method='ratio', + dispersion='sd', + ...) { out=struct::new_struct('dratio_filter', threshold=threshold, qc_label=qc_label, factor_name=factor_name, + method=method, + dispersion=dispersion, ...) return(out) } @@ -21,7 +29,9 @@ dratio_filter = function(threshold=20, qc_label='QC', factor_name, ...) { factor_name='entity', filtered='entity', flags='entity', - d_ratio='data.frame' + d_ratio='data.frame', + method='enum', + dispersion='enum' ), prototype=list(name = 'Dispersion ratio filter', description = paste0('The dispersion ratio (d-ratio) compares the ', @@ -33,7 +43,7 @@ dratio_filter = function(threshold=20, qc_label='QC', factor_name, ...) { 'the feature is removed.'), type = 'filter', predicted = 'filtered', - .params=c('threshold','qc_label','factor_name'), + .params=c('threshold','qc_label','factor_name','method','dispersion'), .outputs=c('filtered','flags','d_ratio'), citations=list( bibentry( @@ -75,8 +85,35 @@ dratio_filter = function(threshold=20, qc_label='QC', factor_name, ...) { description = 'Flag indicating whether the feature was rejected by the filter or not.', type='data.frame', value=data.frame() + ), + method=enum( + name='dratio method', + description = c( + 'ratio' = paste0('Dispersion of the QCs divided by the ', + 'dispersion of the samples. Corresponds to Eq 4 in ', + ' Broadhurst et al (2018).'), + 'euclidean' = paste0('Dispersion of the QCs divided by the ', + 'euclidean length of the total dispersion. Total dispersion ', + 'is estimated from the QC and Sample dispersion by assuming ', + 'that they are orthogonal. Corresponds to Eq 5 in ', + 'Broadhurst et al (2018)')), + allowed=c('ratio','euclidean'), + value='ratio', + type='character', + max_length = 1 + ), + dispersion=enum( + name='Dispersion method', + description = c( + 'sd' = paste0('Dispersion is estimated using the ', + 'standard deviation.'), + 'mad' = paste0('Dispersion is estimated using the median ', + 'absolute deviation.')), + allowed=c('sd','mad'), + value='sd', + type='character', + max_length = 1 ) - ) ) @@ -86,25 +123,42 @@ setMethod(f="model_train", signature=c("dratio_filter","DatasetExperiment"), definition=function(M,D) { - # mad QC samples + # dispersion QC samples QC = filter_smeta( mode='include', levels=M$qc_label, factor_name=M$factor_name) QC = model_apply(QC,D) QC = predicted(QC)$data - QC = apply(QC,2,mad,na.rm=TRUE) + + if (M$dispersion=='mad') { + QC = apply(QC,2,mad,na.rm=TRUE) + } else { + QC = apply(QC,2,sd,na.rm=TRUE) + } - # mad not qc samples + # dispersion (not QC) samples S = filter_smeta( mode='exclude', levels=M$qc_label, factor_name=M$factor_name) S = model_apply(S,D) S = predicted(S)$data - S = apply(S,2,mad,na.rm=TRUE) + + if (M$dispersion=='mad') { + S = apply(S,2,mad,na.rm=TRUE) # constant = 1.4826 default + } else { + S = apply(S,2,sd,na.rm=TRUE) + } - d_ratio=(QC/(QC+S))*100 + # dispersion ratio + if (M$method=='ratio') { + # eq 4 + d_ratio=(QC/S)*100 + } else { + # eq 5 + d_ratio= (QC / sqrt((QC^2) + (S^2))) * 100 + } OUT=d_ratio>M$threshold diff --git a/man/dratio_filter.Rd b/man/dratio_filter.Rd index e9c1355..c8fd652 100644 --- a/man/dratio_filter.Rd +++ b/man/dratio_filter.Rd @@ -4,7 +4,14 @@ \alias{dratio_filter} \title{Dispersion ratio filter} \usage{ -dratio_filter(threshold = 20, qc_label = "QC", factor_name, ...) +dratio_filter( + threshold = 20, + qc_label = "QC", + factor_name, + method = "ratio", + dispersion = "sd", + ... +) } \arguments{ \item{threshold}{(numeric) The threshold above which features are removed. The default is \code{20}.} @@ -13,6 +20,10 @@ dratio_filter(threshold = 20, qc_label = "QC", factor_name, ...) \item{factor_name}{(character) The name of a sample-meta column to use.} +\item{method}{(character) dratio method. Allowed values are limited to the following: \itemize{\item{\code{"ratio"}: Dispersion of the QCs divided by the dispersion of the samples. Corresponds to Eq 4 in Broadhurst et al (2018).}\item{\code{"euclidean"}: Dispersion of the QCs divided by the euclidean length of the total dispersion. Total dispersion is estimated from the QC and Sample dispersion by assuming that they are orthogonal. Corresponds to Eq 5 in Broadhurst et al (2018).}} The default is \code{"ratio"}.} + +\item{dispersion}{(character) Dispersion method. Allowed values are limited to the following: \itemize{\item{\code{"sd"}: Dispersion is estimated using the standard deviation.}\item{\code{"mad"}: Dispersion is estimated using the median absolute deviation.}} The default is \code{"sd"}.} + \item{...}{Additional slots and values passed to \code{struct_class}.} } \value{ diff --git a/tests/testthat/test-dratio-filter.R b/tests/testthat/test-dratio-filter.R index d7b9197..7d9eb7c 100644 --- a/tests/testthat/test-dratio-filter.R +++ b/tests/testthat/test-dratio-filter.R @@ -1,12 +1,119 @@ # filter d-ratio -test_that('d-ratio filter works as expected',{ +test_that('d-ratio filter works as expected using sd ratio',{ set.seed('57475') # DatasetExperiment DE = MTBLS79_DatasetExperiment(filtered=TRUE) M = dratio_filter( threshold=20, qc_label = 'QC', - factor_name='Class' + factor_name='Class', + method = 'ratio', + dispersion = 'sd' + ) + M = model_apply(M,DE) + + # manually compute d-ratio + qc_mad=unlist( + lapply(DE$data[DE$sample_meta$Class=='QC',],sd,na.rm=TRUE) + ) + sa_mad=unlist( + lapply(DE$data[DE$sample_meta$Class!='QC',],sd,na.rm=TRUE) + ) + dr = (qc_mad/(sa_mad))*100 + + # check manual vs fcn + expect_true(all(dr==M$d_ratio$d_ratio)) + + # check values havent changed + expect_equal(dr[[1]],23.05762,tolerance = 5e-6) + expect_equal(dr[[100]],64.49242,tolerance = 5e-6) + + # just number of filtered columns + expect_true(ncol(predicted(M))==725) + expect_true(ncol(DE)-ncol(predicted(M))==854) + expect_true(ncol(DE)-ncol(predicted(M))==sum(dr>20)) +}) + +test_that('d-ratio filter works as expected using mad ratio',{ + set.seed('57475') + # DatasetExperiment + DE = MTBLS79_DatasetExperiment(filtered=TRUE) + M = dratio_filter( + threshold=20, + qc_label = 'QC', + factor_name='Class', + method = 'ratio', + dispersion = 'mad' + ) + M = model_apply(M,DE) + + # manually compute d-ratio + qc_mad=unlist( + lapply(DE$data[DE$sample_meta$Class=='QC',],mad,na.rm=TRUE) + ) + sa_mad=unlist( + lapply(DE$data[DE$sample_meta$Class!='QC',],mad,na.rm=TRUE) + ) + dr = (qc_mad/(sa_mad))*100 + + # check manual vs fcn + expect_true(all(dr==M$d_ratio$d_ratio)) + + # check values havent changed + expect_equal(dr[[1]],14.14758,tolerance = 5e-6) + expect_equal(dr[[100]],48.23871,tolerance = 5e-6) + + # just number of filtered columns + expect_true(ncol(predicted(M))==834) + expect_true(ncol(DE)-ncol(predicted(M))==745) + expect_true(ncol(DE)-ncol(predicted(M))==sum(dr>20)) +}) + +test_that('d-ratio filter works as expected using sd euclidean',{ + set.seed('57475') + # DatasetExperiment + DE = MTBLS79_DatasetExperiment(filtered=TRUE) + M = dratio_filter( + threshold=20, + qc_label = 'QC', + factor_name='Class', + method = 'euclidean', + dispersion = 'sd' + ) + M = model_apply(M,DE) + + # manually compute d-ratio + qc_mad=unlist( + lapply(DE$data[DE$sample_meta$Class=='QC',],sd,na.rm=TRUE) + ) + sa_mad=unlist( + lapply(DE$data[DE$sample_meta$Class!='QC',],sd,na.rm=TRUE) + ) + dr = qc_mad/sqrt((qc_mad^2) + (sa_mad^2))*100 + + # check manual vs fcn + expect_true(all(dr==M$d_ratio$d_ratio)) + + # check values havent changed + expect_equal(dr[[1]],22.46809,tolerance = 5e-6) + expect_equal(dr[[100]],54.19862,tolerance = 5e-6) + + # just number of filtered columns + expect_true(ncol(predicted(M))==747) + expect_true(ncol(DE)-ncol(predicted(M))==832) + expect_true(ncol(DE)-ncol(predicted(M))==sum(dr>20)) +}) + +test_that('d-ratio filter works as expected using mad euclidean',{ + set.seed('57475') + # DatasetExperiment + DE = MTBLS79_DatasetExperiment(filtered=TRUE) + M = dratio_filter( + threshold=20, + qc_label = 'QC', + factor_name='Class', + method = 'euclidean', + dispersion = 'mad' ) M = model_apply(M,DE) @@ -17,17 +124,17 @@ test_that('d-ratio filter works as expected',{ sa_mad=unlist( lapply(DE$data[DE$sample_meta$Class!='QC',],mad,na.rm=TRUE) ) - dr = (qc_mad/(qc_mad+sa_mad))*100 + dr = qc_mad/sqrt((qc_mad^2) + (sa_mad^2))*100 # check manual vs fcn expect_true(all(dr==M$d_ratio$d_ratio)) # check values havent changed - expect_equal(dr[[1]],12.39,tolerance = 1e3) - expect_equal(dr[[100]],32.54,tolerance = 1e3) + expect_equal(dr[[1]],14.00808,tolerance = 5e-6) + expect_equal(dr[[100]],43.44776,tolerance = 5e-6) # just number of filtered columns - expect_true(ncol(predicted(M))==1052) - expect_true(ncol(DE)-ncol(predicted(M))==527) + expect_true(ncol(predicted(M))==850) + expect_true(ncol(DE)-ncol(predicted(M))==729) expect_true(ncol(DE)-ncol(predicted(M))==sum(dr>20)) })