diff --git a/.Rbuildignore b/.Rbuildignore index e9333c4..54840d8 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -6,13 +6,18 @@ CONTRIBUTING.md DIscBIOlogo.png README.md notebook +data/valuesG1msReduced.rda +data/valuesG1msRed.rda data/valuesG1ms.rda data/genes20k.rda data/pan_indrop_matrix_8000cells_18556genes.rda +data/valuesG1msReduced_treated_K.rda +data/valuesG1msReduced_treated_MB.rda apt.txt install.R postBuild runtime.txt .travis.yml .github -.git \ No newline at end of file +.git +Aux \ No newline at end of file diff --git a/.gitignore b/.gitignore index 666540e..50a8bc7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ inst/doc doc -Meta \ No newline at end of file +Meta +Aux \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 769754d..fb80073 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,5 +5,7 @@ cache: packages: true timeout: 3000 r: - - bioc-release - - bioc-devel \ No newline at end of file + - release + - devel +bioc_required: true +use_bioc: true \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 4baa3ac..d957fc6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,96 +1,94 @@ Package: DIscBIO -Date: 2020-05-04 +Date: 2020-07-30 Title: A User-Friendly Pipeline for Biomarker Discovery in Single-Cell Transcriptomics -Version: 0.99.8 +Version: 1.0.0 Authors@R: - c( - person(given = "Salim", - family = "Ghannoum", - role = c("aut", "cph"), - email = "salim.ghannoum@medisin.uio.no"), - person(given = "Alvaro", - family = "Köhn-Luque", - role = c("aut", "ths"), - email = "alvaro.kohn-luque@medisin.uio.no"), - person(given = "Waldir", - family = "Leoncio", - role = c("cre", "aut"), - email = "w.l.netto@medisin.uio.no"), - person(given = "Damiano", - family = "Fantini", - role = c("ctb")) - ) + c( + person( + given = "Salim", + family = "Ghannoum", + role = c("aut", "cph"), + email = "salim.ghannoum@medisin.uio.no" + ), + person( + given = "Alvaro", + family = "Köhn-Luque", + role = c("aut", "ths"), + email = "alvaro.kohn-luque@medisin.uio.no" + ), + person( + given = "Waldir", + family = "Leoncio", + role = c("cre", "aut"), + email = "w.l.netto@medisin.uio.no" + ), + person( + given = "Damiano", + family = "Fantini", + role = c("ctb") + ) + ) Description: An open, multi-algorithmic pipeline for easy, fast and efficient - analysis of cellular sub-populations and the molecular signatures that - characterize them. The pipeline consists of four successive steps: data - pre-processing, cellular clustering with pseudo-temporal ordering, defining - differential expressed genes and biomarker identification. This package - implements extensions of the work published by Ghannoum et. al. - . -biocViews: SingleCell, Transcriptomics, Clustering, DecisionTree + analysis of cellular sub-populations and the molecular signatures that + characterize them. The pipeline consists of four successive steps: data + pre-processing, cellular clustering with pseudo-temporal ordering, defining + differential expressed genes and biomarker identification. This package + implements extensions of the work published by Ghannoum et. al. (2019) + . License: MIT + file LICENSE Encoding: UTF-8 -Imports: methods, TSCAN, boot, httr, mclust, statmod, biomaRt, samr, igraph, - RWeka, partykit, grid, philentropy, NetIndices, png, matrixStats, grDevices, - readr, RColorBrewer, ggplot2, rpart, fpc, cluster, rpart.plot, amap, dplyr, - tsne, AnnotationDbi, org.Hs.eg.db, calibrate, graphics, stats, utils +Imports: methods, TSCAN, boot, httr, mclust, statmod, igraph, + RWeka, philentropy, NetIndices, png, grDevices, + readr, RColorBrewer, ggplot2, rpart, fpc, cluster, rpart.plot, + tsne, AnnotationDbi, org.Hs.eg.db, graphics, stats, utils, impute Depends: - R (>= 4.0), SingleCellExperiment + R (>= 4.0), SingleCellExperiment Suggests: - knitr, - rmarkdown, testthat, - enrichR, - Seurat -VignetteBuilder: knitr + Seurat LazyData: true -RoxygenNote: 7.1.0 +RoxygenNote: 7.1.1 +URL: https://github.com/ocbe-uio/DIscBIO#usage BugReports: https://github.com/ocbe-uio/DIscBIO/issues Collate: 'DIscBIO-classes.R' 'DIscBIO-generic-ClassVectoringDT.R' + 'DIscBIO-generic-ClustDiffGenes.R' 'DIscBIO-generic-Clustexp.R' 'DIscBIO-generic-DEGanalysis.R' 'DIscBIO-generic-DEGanalysis2clust.R' 'DIscBIO-generic-Exprmclust.R' 'DIscBIO-generic-FinalPreprocessing.R' - 'DIscBIO-generic-FindOutliersKM.R' - 'DIscBIO-generic-FindOutliersMB.R' - 'DIscBIO-generic-KMClustDiffGenes.R' - 'DIscBIO-generic-KMclustheatmap.R' - 'DIscBIO-generic-KmeanOrder.R' - 'DIscBIO-generic-MBClustDiffGenes.R' - 'DIscBIO-generic-MBclustheatmap.R' + 'DIscBIO-generic-FindOutliers.R' 'DIscBIO-generic-NoiseFiltering.R' 'DIscBIO-generic-Normalizedata.R' 'DIscBIO-generic-PCAplotSymbols.R' 'DIscBIO-generic-PlotmclustMB.R' + 'DIscBIO-generic-clusteringOrder.R' + 'DIscBIO-generic-clustheatmap.R' 'DIscBIO-generic-comptSNE.R' - 'DIscBIO-generic-comptsneMB.R' 'DIscBIO-generic-plotExptSNE.R' 'DIscBIO-generic-plotGap.R' - 'DIscBIO-generic-plotKmeansLabelstSNE.R' - 'DIscBIO-generic-plotMBLabelstSNE.R' - 'DIscBIO-generic-plotOrderKMtsne.R' - 'DIscBIO-generic-plotOrderMBtsne.R' + 'DIscBIO-generic-plotLabelstSNE.R' + 'DIscBIO-generic-plotOrderTsne.R' 'DIscBIO-generic-plotSilhouette.R' 'DIscBIO-generic-plotSymbolstSNE.R' - 'DIscBIO-generic-plotexptsneMB.R' - 'DIscBIO-generic-plotsilhouetteMB.R' 'DIscBIO-generic-plottSNE.R' - 'DIscBIO-generic-plottsneMB.R' + 'DIscBIO-generic-pseudoTimeOrdering.R' 'J48DT.R' 'J48DTeval.R' 'Jaccard.R' - 'MB_Order.R' 'NetAnalysis.R' 'Networking.R' 'PPI.R' - 'PlotMBexpPCA.R' - 'PlotMBorderPCA.R' + 'PlotMBpca.R' 'RpartDT.R' 'RpartEVAL.R' 'VolcanoPlot.R' 'customConverters.R' 'datasets.R' - 'retrieveBiomart.R' + 'internal-functions.R' + 'prepExampleDataset.R' + 'reformatSiggenes.R' + 'replaceDecimals.R' + 'samr-adapted.R' diff --git a/LICENSE b/LICENSE index 80db17c..b02088b 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,2 @@ -MIT License - -Copyright (c) 2019 SystemsBiologist - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file +YEAR: 2019 +COPYRIGHT HOLDER: Salim Ghannoum \ No newline at end of file diff --git a/NAMESPACE b/NAMESPACE index 838c1d0..2ead175 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,18 +1,15 @@ # Generated by roxygen2: do not edit by hand +export(ClustDiffGenes) export(DISCBIO) export(DISCBIO2SingleCellExperiment) export(J48DT) export(J48DTeval) export(Jaccard) -export(KMClustDiffGenes) -export(MBClustDiffGenes) -export(MB_Order) export(NetAnalysis) export(Networking) export(PPI) -export(PlotMBexpPCA) -export(PlotMBorderPCA) +export(PlotMBpca) export(RpartDT) export(RpartEVAL) export(VolcanoPlot) @@ -21,60 +18,42 @@ export(customConvertFeats) export(plotGap) exportClasses(DISCBIO) exportMethods(ClassVectoringDT) +exportMethods(ClustDiffGenes) exportMethods(Clustexp) exportMethods(DEGanalysis) exportMethods(DEGanalysis2clust) exportMethods(Exprmclust) exportMethods(FinalPreprocessing) -exportMethods(FindOutliersKM) -exportMethods(FindOutliersMB) -exportMethods(KMClustDiffGenes) -exportMethods(KMclustheatmap) +exportMethods(FindOutliers) exportMethods(KmeanOrder) -exportMethods(MBClustDiffGenes) -exportMethods(MBclustheatmap) exportMethods(NoiseFiltering) exportMethods(Normalizedata) exportMethods(PCAplotSymbols) exportMethods(PlotmclustMB) +exportMethods(clustheatmap) exportMethods(comptSNE) -exportMethods(comptsneMB) exportMethods(plotExptSNE) -exportMethods(plotKmeansLabelstSNE) -exportMethods(plotMBLabelstSNE) -exportMethods(plotOrderKMtsne) -exportMethods(plotOrderMBtsne) +exportMethods(plotLabelstSNE) +exportMethods(plotOrderTsne) exportMethods(plotSilhouette) exportMethods(plotSymbolstSNE) -exportMethods(plotexptsneMB) -exportMethods(plotsilhouetteMB) exportMethods(plottSNE) -exportMethods(plottsneMB) +exportMethods(pseudoTimeOrdering) import(org.Hs.eg.db) importClassesFrom(SingleCellExperiment,SingleCellExperiment) importFrom(AnnotationDbi,keys) -importFrom(AnnotationDbi,select) importFrom(NetIndices,GenInd) importFrom(RColorBrewer,brewer.pal) importFrom(RWeka,J48) importFrom(TSCAN,TSCANorder) -importFrom(amap,K) -importFrom(amap,Kmeans) -importFrom(biomaRt,getBM) -importFrom(biomaRt,useDataset) -importFrom(biomaRt,useEnsembl) -importFrom(biomaRt,useMart) importFrom(boot,boot) -importFrom(calibrate,textxy) importFrom(cluster,clusGap) importFrom(cluster,maxSE) importFrom(cluster,silhouette) -importFrom(dplyr,summarize) importFrom(fpc,calinhara) importFrom(fpc,cluster.stats) importFrom(fpc,clusterboot) importFrom(fpc,dudahart2) -importFrom(fpc,kmeansCBI) importFrom(ggplot2,aes) importFrom(ggplot2,aes_string) importFrom(ggplot2,element_blank) @@ -109,7 +88,6 @@ importFrom(graphics,points) importFrom(graphics,rasterImage) importFrom(graphics,text) importFrom(graphics,title) -importFrom(grid,gpar) importFrom(httr,GET) importFrom(httr,content) importFrom(httr,status_code) @@ -127,22 +105,17 @@ importFrom(igraph,graph.adjacency) importFrom(igraph,graph.data.frame) importFrom(igraph,mean_distance) importFrom(igraph,minimum.spanning.tree) -importFrom(matrixStats,rowVars) +importFrom(impute,impute.knn) importFrom(mclust,Mclust) importFrom(mclust,mclustBIC) importFrom(methods,is) importFrom(methods,new) importFrom(methods,validObject) -importFrom(partykit,as.party) importFrom(philentropy,distance) importFrom(png,readPNG) importFrom(readr,read_tsv) importFrom(rpart,rpart) importFrom(rpart.plot,rpart.plot) -importFrom(samr,samr) -importFrom(samr,samr.compute.delta.table) -importFrom(samr,samr.compute.siggenes.table) -importFrom(samr,samr.plot) importFrom(statmod,glmgam.fit) importFrom(stats,aggregate) importFrom(stats,as.dist) @@ -161,6 +134,9 @@ importFrom(stats,pnbinom) importFrom(stats,prcomp) importFrom(stats,predict) importFrom(stats,quantile) +importFrom(stats,rpois) +importFrom(stats,runif) +importFrom(stats,smooth.spline) importFrom(stats,var) importFrom(tsne,tsne) importFrom(utils,capture.output) diff --git a/NEWS.md b/NEWS.md deleted file mode 100644 index 6a3face..0000000 --- a/NEWS.md +++ /dev/null @@ -1,2 +0,0 @@ -Changes in version 0.99.0 (2020-02-17) -+ Submitted to Bioconductor diff --git a/R/DIscBIO-classes.R b/R/DIscBIO-classes.R index c9a50f0..c5ca4a0 100644 --- a/R/DIscBIO-classes.R +++ b/R/DIscBIO-classes.R @@ -50,14 +50,14 @@ #' @exportClass DISCBIO #' #' @importClassesFrom SingleCellExperiment SingleCellExperiment +#' @export #' #' @examples -#' class(valuesG1msReduced) -#' G1_reclassified <- DISCBIO(valuesG1msReduced) +#' class(valuesG1msTest) +#' G1_reclassified <- DISCBIO(valuesG1msTest) #' class(G1_reclassified) #' str(G1_reclassified, max.level=2) -#' identical(G1_reclassified@expdataAll, valuesG1msReduced) -#' @export +#' identical(G1_reclassified@expdataAll, valuesG1msTest) DISCBIO <- setClass( Class = "DISCBIO", slots = c( diff --git a/R/DIscBIO-generic-ClassVectoringDT.R b/R/DIscBIO-generic-ClassVectoringDT.R index fdc1e4f..8984af7 100644 --- a/R/DIscBIO-generic-ClassVectoringDT.R +++ b/R/DIscBIO-generic-ClassVectoringDT.R @@ -12,26 +12,7 @@ #' @param sigDEG A data frame of the differentially expressed genes (DEGs) #' generated by running "DEGanalysis()" or "DEGanalysisM()". #' @param quiet If `TRUE`, suppresses intermediary output -#' @importFrom biomaRt useDataset useMart getBM #' @return A data frame. -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Clustexp(sc, cln=2) # K-means clustering -#' sc <- comptSNE(sc, max_iter=100) -#' cdiff <- DEGanalysis2clust( -#' sc, Clustering="K-means", K=2, fdr=.2, name="Name", First="CL1", -#' Second="CL2", export=FALSE -#' ) -#' DATAforDT <- ClassVectoringDT( -#' sc, Clustering="K-means", K=2, First="CL1", Second="CL2", cdiff[[1]] -#' ) -#' str(DATAforDT) setGeneric( "ClassVectoringDT", function( @@ -92,13 +73,10 @@ setMethod( gene_names2 <- gene_names[idx_genes] DEGsfilteredDataset <- sg1[gene_names2, ] if (!quiet) { - cat( + message( "The DEGs filtered normalized dataset contains:\n", - "Genes:", - length(DEGsfilteredDataset[, 1]), - "\n", - "cells:", - length(DEGsfilteredDataset[1, ]) + "Genes: ", length(DEGsfilteredDataset[, 1]), "\n", + "cells: ", length(DEGsfilteredDataset[1, ]) ) } G_list = sigDEG diff --git a/R/DIscBIO-generic-KMClustDiffGenes.R b/R/DIscBIO-generic-ClustDiffGenes.R similarity index 72% rename from R/DIscBIO-generic-KMClustDiffGenes.R rename to R/DIscBIO-generic-ClustDiffGenes.R index d516fb3..5b2e4cc 100644 --- a/R/DIscBIO-generic-KMClustDiffGenes.R +++ b/R/DIscBIO-generic-ClustDiffGenes.R @@ -1,5 +1,5 @@ #' @title ClustDiffGenes -#' @description description +#' @description Creates a table of cluster differences #' @param object \code{DISCBIO} class object. #' @param K A numeric value of the number of clusters. #' @param pValue A numeric value of the p-value. Default is 0.05. @@ -7,27 +7,49 @@ #' @param export A logical vector that allows writing the final gene list in #' excel file. Default is TRUE. #' @param quiet if `TRUE`, suppresses intermediate text output -#' @importFrom dplyr summarize +#' @param filename_up Name of the exported "up" file (if `export=TRUE`) +#' @param filename_down Name of the exported "down" file (if `export=TRUE`) +#' @param filename_binom Name of the exported binomial file +#' @param filename_sigdeg Name of the exported sigDEG file #' @importFrom stats pbinom median -#' @rdname KMClustDiffGenes +#' @import org.Hs.eg.db +#' @rdname ClustDiffGenes #' @return A list containing two tables. #' @export #' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- Clustexp(sc, cln=3, quiet=TRUE) # K-means clustering -#' KMClustDiffGenes(sc, K=3, fdr=.3, export=FALSE) +#' sc <- DISCBIO(valuesG1msTest) +#' sc <- Clustexp(sc, cln=3, quiet=TRUE) +#' cdiff <- ClustDiffGenes(sc, K=3, fdr=.3, export=FALSE) +#' str(cdiff) +#' cdiff[[2]] + setGeneric( - "KMClustDiffGenes", - function(object, K, pValue = 0.05, fdr = .01, export = TRUE, quiet = FALSE) - standardGeneric("KMClustDiffGenes") + "ClustDiffGenes", + function( + object, K, pValue = 0.05, fdr = .01, export = FALSE, quiet = FALSE, + filename_up = "Up-DEG-cluster", + filename_down = "Down-DEG-cluster", + filename_binom = "binomial-DEGsTable", + filename_sigdeg = "binomial-sigDEG" + ) { + standardGeneric("ClustDiffGenes") + } ) #' @export -#' @rdname KMClustDiffGenes +#' @rdname ClustDiffGenes setMethod( - "KMClustDiffGenes", + "ClustDiffGenes", signature = "DISCBIO", - definition = function(object, K, pValue, fdr, export, quiet) { - # Validation + definition = function( + object, K, pValue, fdr, export, quiet, filename_up, filename_down, + filename_binom, filename_sigdeg + ) + { + # ====================================================================== + # Validating + # ====================================================================== + ran_k <- length(object@kmeans$kpart) > 0 + ran_m <- length(object@MBclusters) > 0 if (!is.numeric(fdr)) { stop("fdr has to be a number between 0 and 1") } else if (fdr < 0 | fdr > 1) { @@ -39,18 +61,21 @@ setMethod( stop("pValue has to be a number between 0 and 1") } if (length(object@kmeans$kpart) == 0) { - stop("run Clustexp before KMClustDiffGenes") + stop("run Clustexp before ClustDiffGenes") } cdiff <- list() x <- object@ndata y <- object@expdata[, names(object@ndata)] - part <- object@kmeans$kpart - binompval <- function(p, N, n) { - pval <- pbinom(n, round(N, 0), p, lower.tail = TRUE) - filter <- !is.na(pval) & pval > 0.5 - pval[filter] <- 1 - pval[filter] - return(pval) + if (ran_k) { + part <- object@kmeans$kpart + } else if (ran_m) { + part <- object@MBclusters$clusterid + } else { + stop("Run Clustexp() before running this function") } + # ====================================================================== + # Operating + # ====================================================================== for (i in 1:max(part)) { if (sum(part == i) == 0) next @@ -148,10 +173,10 @@ setMethod( DEGsE <- c(DEGsE, as.character(rownames(Final))) Up <- subset(Final, Final[, 7] == "Up") - Up <- dplyr::select( - Up, "Regulation", "genes", "pv", "mean.all", "mean.cl", - "fc", "p.adj" + cols_to_keep <- c( + "Regulation", "genes", "pv", "mean.all", "mean.cl", "fc", "p.adj" ) + Up <- Up[, cols_to_keep] Up[, 3] <- rownames(Up) Up[, 6] <- log2(Up[, 6]) Up[, 1] <- Up[, 2] @@ -166,15 +191,15 @@ setMethod( ) if (export) { write.csv( - Up, file = paste0("Up-DEG-cluster", n, ".csv") + Up, file = paste0(filename_up, n, ".csv") ) } Down <- subset(Final, Final[, 7] == "Down") - Down <- dplyr::select( - Down, "Regulation", "genes", "pv", "mean.all", - "mean.cl", "fc", "p.adj" + cols_to_keep <- c( + "Regulation", "genes", "pv", "mean.all", "mean.cl", "fc", "p.adj" ) + Down <- Down[, cols_to_keep] Down[, 3] <- rownames(Down) Down[, 6] <- log2(Down[, 6]) Down[, 1] <- Down[, 2] @@ -190,21 +215,23 @@ setMethod( if (export) { write.csv( Down, - file = paste0("Down-DEG-cluster", n, ".csv") + file = paste0(filename_down, n, ".csv") ) } sigDEG <- cbind(DEGsE, DEGsS) if (export) { - write.csv(sigDEG, file = "binomial-sigDEG.csv") + write.csv( + sigDEG, file = paste0(filename_sigdeg, ".csv") + ) } DEGsTable[n, 1] <- paste0("Cluster ", n) DEGsTable[n, 2] <- "Remaining Clusters" DEGsTable[n, 3] <- length(Up[, 1]) - DEGsTable[n, 4] <- paste0("Up-DEG-cluster", n, ".csv") + DEGsTable[n, 4] <- paste0(filename_up, n, ".csv") DEGsTable[n, 5] <- length(Down[, 1]) - DEGsTable[n, 6] <- paste0("Down-DEG-cluster", n, ".csv") + DEGsTable[n, 6] <- paste0(filename_down, n, ".csv") } } } @@ -214,11 +241,11 @@ setMethod( "Gene number", "File name" ) if (export) { - write.csv(DEGsTable, file = "binomial-DEGsTable.csv") + write.csv(DEGsTable, file = paste0(filename_binom, ".csv")) } return(list(sigDEG, DEGsTable)) } else{ - print(paste0("There are no DEGs with fdr=", fdr)) + print(paste("There are no DEGs with fdr =", fdr)) } } ) \ No newline at end of file diff --git a/R/DIscBIO-generic-Clustexp.R b/R/DIscBIO-generic-Clustexp.R index cd1f8a2..e4967a6 100644 --- a/R/DIscBIO-generic-Clustexp.R +++ b/R/DIscBIO-generic-Clustexp.R @@ -23,23 +23,19 @@ #' statistics. Default is 50 #' @param cln Number of clusters to be used. Default is \code{NULL} and the #' cluster number is inferred by the saturation criterion. -#' @param rseed Integer number. Random seed to enforce reproducible clustering -#' results. Default is 17000. +#' @param rseed Random integer to enforce reproducible clustering results. #' @param quiet if `TRUE`, intermediate output is suppressed #' @importFrom stats as.dist cor kmeans #' @importFrom cluster clusGap maxSE -#' @importFrom fpc clusterboot -#' @importFrom amap Kmeans #' @importFrom graphics pairs -#' @importFrom fpc cluster.stats calinhara dudahart2 #' @importFrom methods is #' @return The DISCBIO-class object input with the cpart slot filled. #' @examples -#' sc <- DISCBIO(valuesG1msReduced) # changes signature of data -#' sc <- Clustexp(sc, cln=3) -setGeneric("Clustexp", function(object, clustnr = 20, bootnr = 50, +#' sc <- DISCBIO(valuesG1msTest) # changes signature of data +#' sc <- Clustexp(sc, cln=2) +setGeneric("Clustexp", function(object, clustnr = 3, bootnr = 50, metric = "pearson", do.gap = TRUE, SE.method = "Tibs2001SEmax", - SE.factor = .25, B.gap = 50, cln = 0, rseed = 17000, quiet = FALSE) + SE.factor = .25, B.gap = 50, cln = 0, rseed = NULL, quiet = FALSE) { standardGeneric("Clustexp") } @@ -51,7 +47,7 @@ setMethod( f = "Clustexp", signature = "DISCBIO", definition = function(object, clustnr, bootnr, metric, do.gap, SE.method, - SE.factor, B.gap, cln, rseed, quiet = FALSE) + SE.factor, B.gap, cln, rseed, quiet) { if (!is.numeric(clustnr)) stop("clustnr has to be a positive integer") @@ -97,8 +93,8 @@ setMethod( stop("cln has to be a non-negative integer") else if (round(cln) != cln | cln < 0) stop("cln has to be a non-negative integer") - if (!is.numeric(rseed)) - stop("rseed has to be numeric") + if (!is.null(rseed) & !is.numeric(rseed)) + stop("rseed has to be numeric or NULL") if (!do.gap & cln == 0) stop("cln has to be a positive integer or do.gap has to be TRUE") @@ -115,256 +111,26 @@ setMethod( cln = cln, rseed = rseed ) - dist.gen <- - function(x, method = "euclidean", ...) - if (method %in% c("spearman", "pearson", "kendall")) - as.dist(1 - cor(t(x), method = method, ...)) - else - dist(x, method = method, ...) - dist.gen.pairs <- - function(x, y, ...) - dist.gen(t(cbind(x, y)), ...) - clustfun <- - function(x, - clustnr = 20, - bootnr = 50, - metric = "pearson", - do.gap = TRUE, - SE.method = "Tibs2001SEmax", - SE.factor = .25, - B.gap = 50, - cln = 0, - rseed = 17000, - quiet = FALSE) { - if (clustnr < 2) - stop("Choose clustnr > 1") - di <- - dist.gen(t(x), method = metric) - if (do.gap | cln > 0) { - gpr <- NULL - if (do.gap) { - set.seed(rseed) - gpr <- - clusGap( - as.matrix(di), - FUNcluster = kmeans, - K.max = clustnr, - B = B.gap, - verbose = !quiet - ) - if (cln == 0) - cln <- - maxSE(gpr$Tab[, 3], - gpr$Tab[, 4], - method = SE.method, - SE.factor) - } - if (cln <= 1) { - clb <- list( - result = list( - partition = rep(1, dim(x)[2]) - ), - bootmean = 1 - ) - names(clb$result$partition) <- names(x) - return(list( - x = x, - clb = clb, - gpr = gpr, - di = di - )) - } - - Kmeansruns <- - function (data, - krange = 2:10, - criterion = "ch", - iter.max = 100, - runs = 100, - scaledata = FALSE, - alpha = 0.001, - critout = FALSE, - plot = FALSE, - method = "euclidean", - ...) { - data <- as.matrix(data) - if (criterion == "asw") - sdata <- - dist(data) - if (scaledata) - data <- - scale(data) - cluster1 <- - 1 %in% krange - crit <- - numeric(max(krange)) - km <- list() - for (k in krange) { - if (k > 1) { - minSS <- Inf - kmopt <- - NULL - for (i in 1:runs) { - options(show.error.messages = FALSE) - repeat { - kmm <- try(Kmeans( - data, - k, - iter.max = iter.max, - method = method, - ... - )) - if (!is(kmm, - "try-error")) - break - } - options(show.error.messages = TRUE) - swss <- - sum(kmm$withinss) - if (swss < minSS) { - kmopt <- kmm - minSS <- - swss - } - if (plot) { - par(ask = TRUE) - pairs(data, - col = kmm$cluster, - main = swss) - } - } - km[[k]] <- - kmopt - crit[k] <- - switch( - criterion, - asw = cluster.stats( - sdata, - km[[k]]$cluster - )$avg.silwidth, - ch = calinhara(data, - km[[k]]$cluster) - ) - if (critout) - cat(k, - " clusters ", - crit[k], - "\n") - } - } - if (cluster1) - cluster1 <- - dudahart2( - data, km[[2]]$cluster, alpha = alpha - )$cluster1 - k.best <- - which.max(crit) - if (cluster1) - k.best <- - 1 - km[[k.best]]$crit <- - crit - km[[k.best]]$bestk <- - k.best - out <- - km[[k.best]] - out - } - - - KmeansCBI <- - function (data, - krange, - k = NULL, - scaling = FALSE, - runs = 1, - criterion = "ch", - method = "euclidean", - ...) { - if (!is.null(k)) - krange <- - k - if (!identical(scaling, FALSE)) - sdata <- - scale(data, - center = TRUE, - scale = scaling) - else - sdata <- - data - c1 <- - Kmeansruns( - sdata, - krange, - runs = runs, - criterion = criterion, - method = method, - ... - ) - partition <- - c1$cluster - cl <- list() - nc <- krange - for (i in 1:nc) - cl[[i]] <- - partition == i - out <- - list( - result = c1, - nc = nc, - clusterlist = cl, - partition = partition, - clustermethod = "kmeans" - ) - out - } - - clb <- - clusterboot( - di, - B = bootnr, - distances = FALSE, - bootmethod = "boot", - clustermethod = KmeansCBI, - krange = cln, - scaling = FALSE, - multipleboot = FALSE, - bscompare = TRUE, - seed = rseed, - count = !quiet - ) - return(list( - x = x, - clb = clb, - gpr = gpr, - di = di - )) - } - } - y <- - clustfun( - object@fdata, - clustnr, - bootnr, - metric, - do.gap, - SE.method, - SE.factor, - B.gap, - cln, - rseed, - quiet = quiet - ) - object@kmeans <- - list( - kpart = y$clb$result$partition, - jaccard = y$clb$bootmean, - gap = y$gpr - ) + y <- clustfun( + object@fdata, + clustnr, + bootnr, + metric, + do.gap, + SE.method, + SE.factor, + B.gap, + cln, + rseed = rseed, + quiet = quiet + ) + object@kmeans <- list( + kpart = y$clb$result$partition, + jaccard = y$clb$bootmean, + gap = y$gpr + ) object@distances <- as.matrix(y$di) - set.seed(111111) # fixed seed to keep the same colors - object@fcol <- - sample(rainbow(max(y$clb$result$partition))) + object@fcol <- rainbow(max(y$clb$result$partition)) object@cpart <- object@kmeans$kpart return(object) } diff --git a/R/DIscBIO-generic-DEGanalysis.R b/R/DIscBIO-generic-DEGanalysis.R index 404c559..c5f366f 100644 --- a/R/DIscBIO-generic-DEGanalysis.R +++ b/R/DIscBIO-generic-DEGanalysis.R @@ -13,37 +13,28 @@ #' excel file. Default is TRUE. #' @param quiet if `TRUE`, suppresses intermediate text output #' @param plot if `TRUE`, plots are generated -#' @importFrom samr samr samr.compute.delta.table samr.plot -#' samr.compute.siggenes.table +#' @param filename_deg Name of the exported DEG table +#' @param filename_sigdeg Name of the exported sigDEG table #' @importFrom graphics title #' @importFrom utils write.csv capture.output #' @param ... additional parameters to be passed to samr() #' @return A list containing two tables. -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- Clustexp(sc, cln=3) # K-means clustering -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- comptSNE(sc, max_iter=100) -#' DEGanalysis( -#' sc, Clustering="K-means", K=3, fdr=0.1, name="Name", export = FALSE -#' ) setGeneric( name = "DEGanalysis", - def = function(object, - Clustering = "K-means", - K, - fdr = 0.05, - name = "Name", - export = TRUE, - quiet = FALSE, - plot = TRUE, - ...) - standardGeneric("DEGanalysis") + def = function( + object, + K, + Clustering = "K-means", + fdr = 0.05, + name = "Name", + export = FALSE, + quiet = FALSE, + plot = TRUE, + filename_deg = "DEGsTable", + filename_sigdeg = "sigDEG", + ...) { + standardGeneric("DEGanalysis") + } ) #' @export @@ -51,16 +42,19 @@ setGeneric( setMethod( f = "DEGanalysis", signature = "DISCBIO", - definition = function(object, - Clustering = "K-means", - K, - fdr = 0.05, - name = "Name", - export = TRUE, - quiet = FALSE, - plot = TRUE, - ...) - { + definition = function( + object, + K, + Clustering = "K-means", + fdr = 0.05, + name = "Name", + export = FALSE, + quiet = FALSE, + plot = TRUE, + filename_deg, + filename_sigdeg, + ... + ) { # Validation if (!(Clustering %in% c("K-means", "MB"))) { stop("Clustering has to be either K-means or MB") @@ -95,7 +89,7 @@ setMethod( colnames(dataset) <- Nam if (!quiet) { - cat("The dataset is ready for differential expression analysis") + message("The dataset is ready for differential expression analysis") } num1 <- paste("CL", num, sep = "") clustName <- paste("Cl", num, sep = "") @@ -195,13 +189,12 @@ setMethod( o <- c(1:K) oo <- o[-length(o)] com <- sum(oo) - if (!quiet) - cat("Number of comparisons: ", com * 2, "\n") + if (!quiet) message("Number of comparisons: ", com * 2, "\n") comNUM <- paste("comp", c(1:com), sep = "") DEGsTable <- data.frame() DEGsE <- c() DEGsS <- c() - for (i in 1:com) { + for (i in seq_len(com)) { FinalDEGsL <- data.frame() FinalDEGsU <- data.frame() FDRl <- c() @@ -219,7 +212,7 @@ setMethod( data <- list(x = x, y = y, geneid = gname) if (quiet) { invisible(capture.output( - samr.obj <- samr( + samr.obj <- sammy( data, resp.type = "Two class unpaired", assay.type = "seq", @@ -229,7 +222,7 @@ setMethod( ) )) } else { - samr.obj <- samr( + samr.obj <- sammy( data, resp.type = "Two class unpaired", assay.type = "seq", @@ -335,11 +328,9 @@ setMethod( FinalDEGsL[is.na(FinalDEGsL[, 2]), c(2, 3)] <- FinalDEGsL[is.na(FinalDEGsL[, 2]), 3] if (!quiet) { - cat( - paste0( - "Low-regulated genes in the ", second[i], - " in ", first[i], " VS ", second[i], "\n" - ) + message( + "Low-regulated genes in the ", second[i], + " in ", first[i], " VS ", second[i], "\n" ) } if (export) { @@ -400,11 +391,9 @@ setMethod( FinalDEGsU[is.na(FinalDEGsU[, 2]), c(2, 3)] <- FinalDEGsU[is.na(FinalDEGsU[, 2]), 3] if (!quiet) { - cat( - paste0( - "Up-regulated genes in the ", second[i], " in ", - first[i], " VS ", second[i], "\n" - ) + message( + "Up-regulated genes in the ", second[i], " in ", + first[i], " VS ", second[i], "\n" ) } if (export) { @@ -445,19 +434,16 @@ setMethod( } if (!quiet & export) { - cat("The results of DEGs are saved in your directory", "\n") + message("The results of DEGs are saved in your directory") } colnames(DEGsTable) <- c( "Comparisons", "Target cluster", "Gene number", "File name", "Gene number", "File name" ) - if (export) - write.csv(DEGsTable, file = "DEGsTable.csv") - if (!quiet) - print(DEGsTable) + if (export) write.csv(DEGsTable, file = paste0(filename_deg, ".csv")) + if (!quiet) print(DEGsTable) sigDEG <- cbind(DEGsE, DEGsS) - if (export) - write.csv(sigDEG, file = "sigDEG.csv") + if (export) write.csv(sigDEG, file = paste0(filename_sigdeg, ".csv")) return(list(sigDEG, DEGsTable)) } ) \ No newline at end of file diff --git a/R/DIscBIO-generic-DEGanalysis2clust.R b/R/DIscBIO-generic-DEGanalysis2clust.R index 317df66..a98a084 100644 --- a/R/DIscBIO-generic-DEGanalysis2clust.R +++ b/R/DIscBIO-generic-DEGanalysis2clust.R @@ -17,32 +17,20 @@ #' excel file. Default is TRUE. #' @param quiet if `TRUE`, suppresses intermediate text output #' @param plot if `TRUE`, plots are generated +#' @param filename_deg Name of the exported DEG table +#' @param filename_sigdeg Name of the exported sigDEG table #' @param ... additional parameters to be passed to samr() -#' @importFrom samr samr samr.compute.delta.table samr.plot -#' samr.compute.siggenes.table #' @importFrom graphics title #' @importFrom utils write.csv capture.output #' @importFrom AnnotationDbi keys #' @return A list containing two tables. -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Clustexp(sc, cln=3) # K-means clustering -#' sc <- comptSNE(sc, max_iter=100) -#' DEGanalysis2clust( -#' sc, Clustering="K-means", K=3, fdr=0.1, name="Name", export = FALSE -#' ) setGeneric( "DEGanalysis2clust", function( - object, Clustering = "K-means", K, fdr = 0.05, name = "Name", - First = "CL1", Second = "CL2", export = TRUE, quiet = FALSE, - plot = TRUE, ... + object, K, Clustering = "K-means", fdr = 0.05, name = "Name", + First = "CL1", Second = "CL2", export = FALSE, quiet = FALSE, + plot = TRUE, filename_deg = "DEGsTable", filename_sigdeg = "sigDEG", + ... ) standardGeneric("DEGanalysis2clust") ) @@ -53,9 +41,8 @@ setMethod( "DEGanalysis2clust", signature = "DISCBIO", definition = function( - object, Clustering = "K-means", K, fdr = 0.05, name = "Name", - First = "CL1", Second = "CL2", export = TRUE, quiet = FALSE, - plot = TRUE, ...) + object, K, Clustering, fdr, name, First, Second, export, quiet, plot, + filename_deg, filename_sigdeg, ...) { if (!(Clustering %in% c("K-means", "MB"))) { stop("Clustering has to be either K-means or MB") @@ -105,7 +92,7 @@ setMethod( data = list(x = x, y = y, geneid = gname) if (quiet) { invisible(capture.output({ - samr.obj <- samr( + samr.obj <- sammy( data, resp.type = "Two class unpaired", assay.type = "seq", @@ -116,7 +103,7 @@ setMethod( delta.table <- samr.compute.delta.table(samr.obj) })) } else { - samr.obj <- samr( + samr.obj <- sammy( data, resp.type = "Two class unpaired", assay.type = "seq", @@ -132,6 +119,7 @@ setMethod( wm <- which.min(delta.table[, 5]) if (delta.table[wm, 5] <= fdr) { w <- which(delta.table[, 5] <= fdr) + if (is.null (w)) stop("No suitable deltas. Try a lower FDR value.") delta <- delta.table[w[1], 1] - 0.001 if (plot) { samr.plot(samr.obj, delta) @@ -140,6 +128,11 @@ setMethod( siggenes.table <- samr.compute.siggenes.table( samr.obj, delta, data, delta.table ) + # ------------------------------------------------------------------ + # Reformat siggenes.table as data.frame + # ------------------------------------------------------------------ + siggenes.table$genes.lo <- reformatSiggenes(siggenes.table$genes.lo) + siggenes.table$genes.up <- reformatSiggenes(siggenes.table$genes.up) FDRl <- as.numeric(siggenes.table$genes.lo[, 8]) / 100 FDRu <- as.numeric(siggenes.table$genes.up[, 8]) / 100 @@ -171,6 +164,7 @@ setMethod( "Up-regulated-", name, First, "in", First, "VS", Second, ".csv" ) + FinalDEGsL <- data.frame() if (length(FDRl) > 0) { genes <- siggenes.table$genes.lo[, 3] if (quiet) { @@ -198,28 +192,25 @@ setMethod( gene_list <- geneList[, 3] idx_genes <- is.element(gene_list, genes) genes2 <- geneList[idx_genes, ] - FinalDEGsL <- merge( - FinalDEGsL, - genes2, - by.x = "genes", - by.y = "ENSEMBL", - all.x = TRUE - ) - FinalDEGsL[, 3] <- FinalDEGsL[, 11] - FinalDEGsL <- FinalDEGsL[, c(-1, -10, -11)] - FinalDEGsL <- FinalDEGsL[order(FinalDEGsL[, 8]), ] - FinalDEGsL[is.na(FinalDEGsL[, 2]), c(2, 3)] <- - FinalDEGsL[is.na(FinalDEGsL[, 2]), 3] - if (export) { - cat( - "The results of DEGs are saved in your directory", - "\n" + if (!is.null(FinalDEGsL)) { + FinalDEGsL <- merge( + FinalDEGsL, + genes2, + by.x = "genes", + by.y = "ENSEMBL", + all.x = TRUE ) - cat( - paste0( - "Low-regulated genes in the ", Second, " in ", - First, " VS ", Second, "\n" - ) + FinalDEGsL[, 3] <- FinalDEGsL[, 11] + FinalDEGsL <- FinalDEGsL[, c(-1, -10, -11)] + FinalDEGsL <- FinalDEGsL[order(FinalDEGsL[, 8]), ] + FinalDEGsL[is.na(FinalDEGsL[, 2]), c(2, 3)] <- + FinalDEGsL[is.na(FinalDEGsL[, 2]), 3] + } + if (export) { + message("The results of DEGs are saved in your directory") + message( + "Low-regulated genes in the ", Second, " in ", + First, " VS ", Second, "\n" ) write.csv( FinalDEGsL, @@ -239,7 +230,7 @@ setMethod( DEGsS <- c(DEGsS, FinalDEGsL[, 2]) DEGsE <- c(DEGsE, as.character(FinalDEGsL[, 3])) } - + FinalDEGsU <- data.frame() if (length(FDRu) > 0) { genes <- siggenes.table$genes.up[, 3] if (quiet) { @@ -265,28 +256,25 @@ setMethod( gene_list <- geneList[, 3] idx_genes <- is.element(gene_list, genes) genes2 <- geneList[idx_genes, ] - FinalDEGsU <- merge( - FinalDEGsU, - genes2, - by.x = "genes", - by.y = "ENSEMBL", - all.x = TRUE - ) - FinalDEGsU[, 3] <- FinalDEGsU[, 11] - FinalDEGsU <- FinalDEGsU[, c(-1, -10, -11)] - FinalDEGsU <- FinalDEGsU[order(FinalDEGsU[, 8]), ] - FinalDEGsU[is.na(FinalDEGsU[, 2]), c(2, 3)] <- - FinalDEGsU[is.na(FinalDEGsU[, 2]), 3] - if (export) { - cat( - "The results of DEGs are saved in your directory", - "\n" + if (!is.null(FinalDEGsU)) { + FinalDEGsU <- merge( + FinalDEGsU, + genes2, + by.x = "genes", + by.y = "ENSEMBL", + all.x = TRUE ) - cat( - paste0( - "Up-regulated genes in the ", Second, " in ", First, - " VS ", Second, "\n" - ) + FinalDEGsU[, 3] <- FinalDEGsU[, 11] + FinalDEGsU <- FinalDEGsU[, c(-1, -10, -11)] + FinalDEGsU <- FinalDEGsU[order(FinalDEGsU[, 8]), ] + FinalDEGsU[is.na(FinalDEGsU[, 2]), c(2, 3)] <- + FinalDEGsU[is.na(FinalDEGsU[, 2]), 3] + } + if (export) { + message("The results of DEGs are saved in your directory") + message( + "Up-regulated genes in the ", Second, " in ", First, + " VS ", Second, "\n" ) write.csv( FinalDEGsU, @@ -332,9 +320,17 @@ setMethod( if (!quiet) print(DEGsTable) sigDEG <- cbind(DEGsE, DEGsS) if (export) { - write.csv(DEGsTable, file = "DEGsTable.csv") - write.csv(sigDEG, file = "sigDEG.csv") + write.csv(DEGsTable, file = paste0(filename_deg, ".csv")) + write.csv(sigDEG, file = paste0(filename_sigdeg, ".csv")) } - return(list(sigDEG, DEGsTable)) + return( + list( + sigDEG = sigDEG, + DEGsTable = DEGsTable, + FinalDEGsL = FinalDEGsL, + FinalDEGsU = FinalDEGsU + + ) + ) } ) \ No newline at end of file diff --git a/R/DIscBIO-generic-Exprmclust.R b/R/DIscBIO-generic-Exprmclust.R index d0df1d1..0530792 100644 --- a/R/DIscBIO-generic-Exprmclust.R +++ b/R/DIscBIO-generic-Exprmclust.R @@ -18,98 +18,88 @@ #' the MBclusters slot filled. If the `object` is a data frame, the function #' returns a named list containing the four objects that together correspond #' to the contents of the MBclusters slot. -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Exprmclust(sc,K = 2) -#' print(sc@MBclusters) - setGeneric( - name = "Exprmclust", - def = function(object, - K = 3, - modelNames = "VVV", - reduce = TRUE, - cluster = NULL, - quiet = FALSE) +#' +setGeneric( +name = "Exprmclust", +def = function( + object, + K = 3, + modelNames = "VVV", + reduce = TRUE, + cluster = NULL, + quiet = FALSE +) { standardGeneric("Exprmclust") - ) +} +) - #' @export - #' @rdname Exprmclust - setMethod( +#' @export +#' @rdname Exprmclust +setMethod( f = "Exprmclust", signature = "DISCBIO", - definition = function(object, - K = 3, - modelNames = "VVV", - reduce = TRUE, - cluster = NULL, - quiet = FALSE) { - set.seed(12345) - obj <- object@fdata - if (reduce) { - sdev <- prcomp(t(obj), scale = T)$sdev[1:20] - x <- 1:20 - optpoint <- which.min(sapply(2:10, function(i) { - x2 <- pmax(0, x - i) - sum(lm(sdev ~ x + x2)$residuals ^ 2) - })) - pcadim = optpoint + 1 - tmpdata <- t(apply(obj, 1, scale)) - colnames(tmpdata) <- colnames(obj) - tmppc <- prcomp(t(tmpdata), scale = T) - pcareduceres <- - t(tmpdata) %*% tmppc$rotation[, 1:pcadim] - } - else { - pcareduceres <- t(obj) - } - if (is.null(cluster)) { - K <- K[K > 1] - res <- Mclust( - data = pcareduceres, - G = K, - modelNames = modelNames, - warn = FALSE, - verbose = !quiet - ) - clusterid <- apply(res$z, 1, which.max) - clunum <- res$G - } else { - clunum <- length(unique(cluster)) - clusterid <- cluster - } - clucenter <- - matrix(0, ncol = ncol(pcareduceres), nrow = clunum) - for (cid in 1:clunum) { - clucenter[cid, ] <- colMeans( - pcareduceres[names(clusterid[clusterid == cid]), , drop = FALSE] - ) - } - dp <- as.matrix(dist(clucenter)) - gp <- - graph.adjacency(dp, mode = "undirected", weighted = TRUE) - dp_mst <- minimum.spanning.tree(gp) - full_List <- - list( - pcareduceres = pcareduceres, - MSTtree = dp_mst, - clusterid = clusterid, - clucenter = clucenter + definition = function(object, K, modelNames, reduce, cluster, quiet) { + obj <- object@fdata + if (reduce) { + sdev <- prcomp(t(obj), scale = T)$sdev[1:20] + x <- 1:20 + optpoint <- which.min( + sapply( + 2:10, + function(i) { + x2 <- pmax(0, x - i) + sum(lm(sdev ~ x + x2)$residuals ^ 2) + } + ) + ) + pcadim <- optpoint + 1 + tmpdata <- t(apply(obj, 1, scale)) + colnames(tmpdata) <- colnames(obj) + tmppc <- prcomp(t(tmpdata), scale = T) + pcareduceres <- t(tmpdata) %*% tmppc$rotation[, 1:pcadim] + } + else { + pcareduceres <- t(obj) + } + if (is.null(cluster)) { + K <- K[K > 1] + res <- Mclust( + data = pcareduceres, + G = K, + modelNames = modelNames, + warn = FALSE, + verbose = !quiet + ) + if (is.null(res)) stop("Unable to cluster. Try a lower value for K.") + clusterid <- apply(res$z, 1, which.max) + clunum <- res$G + } else { + clunum <- length(unique(cluster)) + clusterid <- cluster + } + clucenter <- matrix(0, ncol = ncol(pcareduceres), nrow = clunum) + for (cid in 1:clunum) { + clucenter[cid, ] <- colMeans( + pcareduceres[names(clusterid[clusterid == cid]), , drop = FALSE] + ) + } + dp <- as.matrix(dist(clucenter)) + gp <- graph.adjacency(dp, mode = "undirected", weighted = TRUE) + dp_mst <- minimum.spanning.tree(gp) + full_List <- list( + pcareduceres = pcareduceres, + MSTtree = dp_mst, + clusterid = clusterid, + clucenter = clucenter ) - object@MBclusters <- full_List - return(object) + object@MBclusters <- full_List + return(object) } - ) +) - #' @export - #' @rdname Exprmclust - setMethod( +#' @export +#' @rdname Exprmclust +setMethod( f = "Exprmclust", signature = "data.frame", definition = function(object, @@ -118,7 +108,6 @@ reduce = TRUE, cluster = NULL, quiet = FALSE) { - set.seed(12345) obj <- object if (reduce) { sdev <- prcomp(t(obj), scale = T)$sdev[1:20] @@ -145,6 +134,9 @@ warn = FALSE, verbose = !quiet ) + if (is.null(res)) { + stop("Unable to cluster. Try a lower value for K.") + } clusterid <- apply(res$z, 1, which.max) clunum <- res$G } else { diff --git a/R/DIscBIO-generic-FinalPreprocessing.R b/R/DIscBIO-generic-FinalPreprocessing.R index 996d505..0b6bb18 100644 --- a/R/DIscBIO-generic-FinalPreprocessing.R +++ b/R/DIscBIO-generic-FinalPreprocessing.R @@ -6,15 +6,21 @@ #' @param export A logical vector that allows writing the final gene list in #' excel file. Default is TRUE. #' @param quiet if `TRUE`, intermediary output is suppressed +#' @param fileName File name for exporting (if `export = TRUE`) #' @return The DISCBIO-class object input with the FinalGeneList slot filled. #' @examples -#' sc <- DISCBIO(valuesG1msReduced) +#' sc <- DISCBIO(valuesG1msTest) #' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) #' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) +#' setGeneric( "FinalPreprocessing", function( - object, GeneFlitering = "NoiseF", export = TRUE, quiet = FALSE + object, + GeneFlitering = "NoiseF", + export = FALSE, + quiet = FALSE, + fileName = "filteredDataset" ) standardGeneric("FinalPreprocessing") ) @@ -24,9 +30,7 @@ setGeneric( setMethod( "FinalPreprocessing", signature = "DISCBIO", - definition = function( - object, GeneFlitering, export = TRUE, quiet = FALSE - ) + definition = function(object, GeneFlitering, export, quiet, fileName) { if (GeneFlitering == "NoiseF") { if (length(object@noiseF) < 1) @@ -40,52 +44,30 @@ setMethod( filteredDataset <- object@fdata[gene_names2, ] object@fdata <- filteredDataset object@FinalGeneList <- rownames(filteredDataset) - - if (!quiet) { - cat( - "The gene filtering method= Noise filtering\n\n", - "The Filtered Normalized dataset contains:\n", - "Genes:", - length(filteredDataset[, 1]), - "\n", - "cells:", - length(filteredDataset[1, ]), - "\n\n" - ) - } - if (export) { - cat( - "The Filtered Normalized dataset was saved as: ", - "filteredDataset.Rdata\n" - ) - save(filteredDataset, file = "filteredDataset.Rdata") - } } if (GeneFlitering == "ExpF") { if (nrow(object@fdata) < 1) stop("run Normalizedata before running FinalPreprocessing") filteredDataset <- object@fdata object@FinalGeneList <- rownames(filteredDataset) - + } + if (!quiet) { + message( + "The gene filtering method = Noise filtering\n\n", + "The Filtered Normalized dataset contains:\n", + "Genes: ", length(filteredDataset[, 1]), "\n", + "cells: ", length(filteredDataset[1, ]), "\n\n" + ) + } + if (export) { + fileNameExt <- paste0(fileName, ".Rdata") if (!quiet) { - cat( - "The gene filtering method= Expression filtering\n\n", - "The Filtered Normalized dataset contains:\n", - "Genes:", - length(filteredDataset[, 1]), - "\n", - "cells:", - length(filteredDataset[1,]), - "\n\n" - ) - } - if (export) { - cat( + message( "The Filtered Normalized dataset was saved as: ", - "filteredDataset.Rdata\n" + fileNameExt ) - save(filteredDataset, file = "filteredDataset.Rdata") } + save(filteredDataset, file = fileNameExt) } return(object) } diff --git a/R/DIscBIO-generic-FindOutliersKM.R b/R/DIscBIO-generic-FindOutliers.R similarity index 84% rename from R/DIscBIO-generic-FindOutliersKM.R rename to R/DIscBIO-generic-FindOutliers.R index c9bc7c6..5785602 100644 --- a/R/DIscBIO-generic-FindOutliersKM.R +++ b/R/DIscBIO-generic-FindOutliers.R @@ -1,5 +1,5 @@ -#' @title Inference of outlier cells in K-means clustering -#' @description This functions performs the outlier identification +#' @title Inference of outlier cells +#' @description This functions performs the outlier identification for k-means and model-based clustering #' @param object \code{DISCBIO} class object. #' @param outminc minimal transcript count of a gene in a clusters to be tested #' for being an outlier gene. Default is 5. @@ -10,7 +10,7 @@ #' binomial background model of expression in a cluster. Default is 0.001. #' @param thr probability values for which the number of outliers is computed in #' order to plot the dependence of the number of outliers on the probability -#' threshold. Default is 2**-(1:40). +#' threshold. Default is 2**-(1:40).set #' @param outdistquant Real number between zero and one. Outlier cells are #' merged to outlier clusters if their distance smaller than the #' outdistquant-quantile of the distance distribution of pairs of cells in @@ -19,38 +19,56 @@ #' @param plot if `TRUE`, produces a plot of -log10prob per K #' @param quiet if `TRUE`, intermediary output is suppressed #' @importFrom stats coef pnbinom -#' @importFrom amap K #' @return A named vector of the genes containing outlying cells and the number #' of cells on each. #' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- Clustexp(sc, cln=3) # K-means clustering -#' Outliers <- FindOutliersKM( -#' sc, K=3, outminc=5, outlg=2, probthr=.5*1e-3, thr=2**-(1:40), -#' outdistquant=.75, plot = FALSE -#' ) +#' sc <- DISCBIO(valuesG1msTest) +#' sc <- Clustexp(sc, cln=2) # K-means clustering +#' FindOutliers(sc, K=2) #' setGeneric( - "FindOutliersKM", + "FindOutliers", function( object, K, outminc = 5, outlg = 2, probthr = 1e-3, thr = 2 ** -(1:40), outdistquant = .75, plot = TRUE, quiet = FALSE ) - standardGeneric("FindOutliersKM") + standardGeneric("FindOutliers") ) -#' @rdname FindOutliersKM +#' @rdname FindOutliers #' @export setMethod( - "FindOutliersKM", + "FindOutliers", signature = "DISCBIO", definition = function( - object, K, outminc, outlg, probthr, thr, outdistquant, plot = TRUE, - quiet = FALSE + object, K, outminc, outlg, probthr, thr, outdistquant, plot, quiet ) { - if (length(object@kmeans$kpart) == 0) - stop("run Clustexp before FindOutliersKM") + # ====================================================================== + # Validating + # ====================================================================== + ran_k <- length(object@kmeans) > 0 + ran_m <- length(object@MBclusters) > 0 + if (ran_k) { + clusters <- object@kmeans$kpart + } else if (ran_m) { + object <- Clustexp( + object, + clustnr = 20, + bootnr = 50, + metric = "pearson", + do.gap = T, + SE.method = "Tibs2001SEmax", + SE.factor = .25, + B.gap = 50, + cln = K, + rseed = 17000, + quiet = quiet + ) + clusters <- object@MBclusters$clusterid + } else { + stop("run Clustexp before FindOutliers") + } if (!is.numeric(outminc)) stop("outminc has to be a non-negative integer") else if (round(outminc) != outminc | outminc < 0) @@ -72,14 +90,13 @@ setMethod( else if (outdistquant < 0 | outdistquant > 1) stop("outdistquant has to be a number between 0 and 1") - object@outlierpar <- - list( - outminc = outminc, - outlg = outlg, - probthr = probthr, - thr = thr, - outdistquant = outdistquant - ) + object@outlierpar <- list( + outminc = outminc, + outlg = outlg, + probthr = probthr, + thr = thr, + outdistquant = outdistquant + ) ### calibrate background model m <- log2(apply(object@fdata, 1, mean)) v <- log2(apply(object@fdata, 1, var)) @@ -114,15 +131,15 @@ setMethod( out <- c() stest <- rep(0, length(thr)) cprobs <- c() - for (n in 1:max(object@kmeans$kpart)) { - if (sum(object@kmeans$kpart == n) == 1) { + for (n in 1:max(clusters)) { + if (sum(clusters == n) == 1) { cprobs <- append(cprobs, .5) names(cprobs)[length(cprobs)] <- - names(object@kmeans$kpart)[object@kmeans$kpart == n] + names(clusters)[clusters == n] next } - x <- object@fdata[, object@kmeans$kpart == n] + x <- object@fdata[, clusters == n] x <- x[apply(x, 1, max) > outminc, ] z <- t(apply(x, 1, function(x) { @@ -165,8 +182,8 @@ setMethod( clp2p.cl <- c() cols <- names(object@fdata) di <- as.data.frame(object@distances) - for (i in 1:max(object@kmeans$kpart)) { - tcol <- cols[object@kmeans$kpart == i] + for (i in 1:max(clusters)) { + tcol <- cols[clusters == i] if (sum(!(tcol %in% out)) > 1) clp2p.cl <- append( clp2p.cl, @@ -177,7 +194,7 @@ setMethod( } clp2p.cl <- clp2p.cl[clp2p.cl > 0] - cpart <- object@kmeans$kpart + cpart <- clusters cadd <- list() if (length(out) > 0) { if (length(out) == 1) { @@ -228,10 +245,8 @@ setMethod( object@cpart <- cpart - set.seed(111111) - object@fcol <- sample(rainbow(max(cpart))) - p <- - object@kmeans$kpart[order(object@kmeans$kpart, decreasing = FALSE)] + object@fcol <- rainbow(max(cpart)) + p <- clusters[order(clusters, decreasing = FALSE)] x <- object@out$cprobs[names(p)] fcol <- c("black", "blue", "green", "red", "yellow", "gray") if (plot) { @@ -274,8 +289,8 @@ setMethod( box() } if (!quiet) { - cat( - "The following cells are considered as outlier cells:", + message( + "The following cells are considered outliers: ", which(object@cpart > K), "\n" ) diff --git a/R/DIscBIO-generic-FindOutliersMB.R b/R/DIscBIO-generic-FindOutliersMB.R deleted file mode 100644 index 93ca891..0000000 --- a/R/DIscBIO-generic-FindOutliersMB.R +++ /dev/null @@ -1,313 +0,0 @@ -#' @title Inference of outlier cells in Model-based clustering -#' @description This functions performs the outlier identification -#' @param object \code{DISCBIO} class object. -#' @param outminc minimal transcript count of a gene in a clusters to be tested -#' for being an outlier gene. Default is 5. -#' @param outlg Minimum number of outlier genes required for being an outlier -#' cell. Default is 2. -#' @param probthr outlier probability threshold for a minimum of \code{outlg} -#' genes to be an outlier cell. This probability is computed from a negative -#' binomial background model of expression in a cluster. Default is 0.001. -#' @param thr probability values for which the number of outliers is computed in -#' order to plot the dependence of the number of outliers on the probability -#' threshold. Default is 2**-(1:40). -#' @param outdistquant Real number between zero and one. Outlier cells are -#' merged to outlier clusters if their distance smaller than the -#' outdistquant-quantile of the distance distribution of pairs of cells in -#' the orginal clusters after outlier removal. Default is 0.75. -#' @param K Number of clusters to be used. -#' @param plot if `TRUE`, produces a plot of -log10prob per K -#' @param quiet if `TRUE`, intermediary output is suppressed -#' @importFrom stats coef pnbinom -#' @importFrom amap K -#' @return A named vector of the genes containing outlying cells and the number -#' of cells on each. -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Exprmclust(sc) -#' FindOutliersMB( -#' sc, K=3, outminc=5, outlg=2, probthr=.5*1e-3, thr=2**-(1:40), -#' outdistquant=.75, plot = FALSE, quiet = TRUE -#' ) -#' -setGeneric( - name = "FindOutliersMB", - def = function(object, - K, - outminc = 5, - outlg = 2, - probthr = 1e-3, - thr = 2 ** -(1:40), - outdistquant = .75, - plot = TRUE, - quiet = FALSE) { - standardGeneric("FindOutliersMB") - } -) - -#' @export -#' @rdname FindOutliersMB -setMethod( - f = "FindOutliersMB", - signature = "DISCBIO", - definition = function(object, - K, - outminc, - outlg, - probthr, - thr, - outdistquant, - plot = TRUE, - quiet = FALSE) { - if (length(object@MBclusters$clusterid) == 0) - stop("run exprmclust before FindOutliersMB") - if (!is.numeric(outminc)) - stop("outminc has to be a non-negative integer") - else if (round(outminc) != outminc | - outminc < 0) - stop("outminc has to be a non-negative integer") - if (!is.numeric(outlg)) - stop("outlg has to be a non-negative integer") - else if (round(outlg) != outlg | - outlg < 0) - stop("outlg has to be a non-negative integer") - if (!is.numeric(probthr)) - stop("probthr has to be a number between 0 and 1") - else if (probthr < 0 | - probthr > 1) - stop("probthr has to be a number between 0 and 1") - if (!is.numeric(thr)) - stop("thr hast to be a vector of numbers between 0 and 1") - else if (min(thr) < 0 | - max(thr) > 1) - stop("thr hast to be a vector of numbers between 0 and 1") - if (!is.numeric(outdistquant)) - stop("outdistquant has to be a number between 0 and 1") - else if (outdistquant < 0 | - outdistquant > 1) - stop("outdistquant has to be a number between 0 and 1") - - object <- Clustexp( - object, - clustnr = 20, - bootnr = 50, - metric = "pearson", - do.gap = T, - SE.method = "Tibs2001SEmax", - SE.factor = .25, - B.gap = 50, - cln = K, - rseed = 17000, - quiet = quiet - ) - - object@outlierpar <- - list( - outminc = outminc, - outlg = outlg, - probthr = probthr, - thr = thr, - outdistquant = outdistquant - ) - - ### calibrate background model - m <- log2(apply(object@fdata, 1, mean)) - v <- log2(apply(object@fdata, 1, var)) - f <- m > -Inf & v > -Inf - m <- m[f] - v <- v[f] - mm <- -8 - repeat { - fit <- lm(v ~ m + I(m ^ 2)) - if (coef(fit)[3] >= 0 | mm >= 3) { - break - } - mm <- mm + .5 - f <- m > mm - m <- m[f] - v <- v[f] - } - object@background <- list() - object@background$vfit <- fit - object@background$lvar <- function(x, object) { - 2 ** ( - coef(object@background$vfit)[1] + - log2(x) * coef(object@background$vfit)[2] + - coef(object@background$vfit)[3] * - log2(x) ** 2 - ) - } - object@background$lsize <- function(x, object) { - x ** 2 / (max(x + 1e-6, object@background$lvar(x, object)) - x) - } - - ### identify outliers - out <- c() - stest <- rep(0, length(thr)) - cprobs <- c() - for (n in 1:max(object@MBclusters$clusterid)) { - if (sum(object@MBclusters$clusterid == n) == 1) { - cprobs <- - append(cprobs, .5) - names(cprobs)[length(cprobs)] <- - names(object@MBclusters$clusterid)[object@MBclusters$clusterid == n] - next - } - x <- object@fdata[, object@MBclusters$clusterid == n] - x <- x[apply(x, 1, max) > outminc, ] - z <- - t(apply(x, 1, function(x) { - apply(cbind( - pnbinom( - round(x, 0), - mu = mean(x), - size = object@background$lsize(mean(x), object) - ) , - 1 - pnbinom( - round(x, 0), - mu = mean(x), - size = object@background$lsize(mean(x), object) - ) - ), 1, min) - })) - cp <- - apply(z, 2, function(x) { - y <- - p.adjust(x, method = "BH") - y <- y[order(y, decreasing = FALSE)] - return(y[outlg]) - }) - f <- cp < probthr - cprobs <- append(cprobs, cp) - if (sum(f) > 0) - out <- append(out, names(x)[f]) - for (j in 1:length(thr)) - stest[j] <- stest[j] + sum(cp < thr[j]) - } - object@out <- list( - out = out, - stest = stest, - thr = thr, - cprobs = cprobs - ) - - ### cluster outliers - clp2p.cl <- c() - cols <- names(object@fdata) - di <- as.data.frame(object@distances) - for (i in 1:max(object@MBclusters$clusterid)) { - tcol <- cols[object@MBclusters$clusterid == i] - if (sum(!(tcol %in% out)) > 1) - clp2p.cl <- append( - clp2p.cl, - as.vector(t(di[tcol[!(tcol %in% out)], tcol[!(tcol %in% out)]])) - ) - } - clp2p.cl <- clp2p.cl[clp2p.cl > 0] - - cpart <- object@MBclusters$clusterid - cadd <- list() - if (length(out) > 0) { - if (length(out) == 1) { - cadd <- list(out) - } else{ - n <- out - m <- as.data.frame(di[out, out]) - - for (i in 1:length(out)) { - if (length(n) > 1) { - o <- - order(apply(cbind(m, 1:dim(m)[1]), 1, function(x) - min(x[1:(length(x) - 1)][-x[length(x)]])), decreasing = FALSE) - m <- m[o, o] - n <- n[o] - f <- m[, 1] < quantile(clp2p.cl, outdistquant) | - m[, 1] == min(clp2p.cl) - ind <- 1 - if (sum(f) > 1) - for (j in 2:sum(f)) - comp1 <- m[f, f][j, c(ind, j)] - comp2 <- quantile(clp2p.cl, outdistquant) - if (apply(comp1 > comp2, 1, sum) == 0) - ind <- append(ind, j) - cadd[[i]] <- n[f][ind] - g <- !n %in% n[f][ind] - n <- n[g] - m <- m[g, g] - if (sum(g) == 0) - break - - } else if (length(n) == 1) { - cadd[[i]] <- n - break - } - } - } - - for (i in 1:length(cadd)) { - cpart[cols %in% cadd[[i]]] <- max(cpart) + 1 - } - } - - ### determine final clusters - - object@cpart <- cpart - - set.seed(111111) - object@fcol <- sample(rainbow(max(cpart))) - p <- object@MBclusters$clusterid[order( - object@MBclusters$clusterid, decreasing = FALSE - )] - x <- object@out$cprobs[names(p)] - fcol <- c("black", "blue", "green", "red", "yellow", "gray") - if (plot) { - for (i in 1:max(p)) { - y <- -log10(x + 2.2e-16) - y[p != i] <- 0 - if (i == 1) - b <- - barplot( - y, - ylim = c(0, max(-log10(x + 2.2e-16)) * 1.1), - col = fcol[i], - border = fcol[i], - names.arg = FALSE, - ylab = "-log10prob" - ) - else - barplot( - y, - add = TRUE, - col = fcol[i], - border = fcol[i], - names.arg = FALSE, - axes = FALSE - ) - } - abline(-log10(object@outlierpar$probthr), - 0, - col = "black", - lty = 2) - d <- b[2, 1] - b[1, 1] - y <- 0 - for (i in 1:max(p)) - y <- append(y, b[sum(p <= i), 1] + d / 2) - axis(1, at = (y[1:(length(y) - 1)] + y[-1]) / 2, labels = 1:max(p)) - box() - } - if (!quiet) { - cat("The following cells are considered as outlier cells:", - which(object@cpart > K), - "\n") - print(which(object@cpart > K)) - } - LL = which(object@cpart > K) - return(LL) - } - ) \ No newline at end of file diff --git a/R/DIscBIO-generic-MBClustDiffGenes.R b/R/DIscBIO-generic-MBClustDiffGenes.R deleted file mode 100644 index 2c7f1f0..0000000 --- a/R/DIscBIO-generic-MBClustDiffGenes.R +++ /dev/null @@ -1,230 +0,0 @@ -#' @title ClustDiffGenes -#' @rdname MBClustDiffGenes -#' @param object \code{DISCBIO} class object. -#' @param K A numeric value of the number of clusters. -#' @param pValue A numeric value of the p-value. Default is 0.05. -#' @param fdr A numeric value of the false discovery rate. Default is 0.01. -#' @param export A logical vector that allows writing the final gene list in -#' excel file. Default is TRUE. -#' @param quiet if `TRUE`, suppresses intermediate text output -#' @importFrom stats pbinom median -#' @importFrom AnnotationDbi select -#' @import org.Hs.eg.db -#' @export -#' @return A list containing two tables. -#' @examples -#' \dontrun{ -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Exprmclust(sc, K=3) -#' sc <- comptsneMB(sc, max_iter=100) -#' sc <- Clustexp(sc, cln=3) -#' sc <- MB_Order(sc, export = FALSE) -#' cdiff <- MBClustDiffGenes(sc, K=3, fdr=.1) -#' str(cdiff) -#' } -setGeneric( - "MBClustDiffGenes", - function(object, K, pValue = 0.05, fdr = .01, export = TRUE, quiet = FALSE) - standardGeneric("MBClustDiffGenes") -) -#' @export -#' @rdname MBClustDiffGenes -setMethod( - "MBClustDiffGenes", - signature = "DISCBIO", - definition = function(object, K, pValue, fdr, export, quiet) { - if (!is.numeric(fdr)) { - stop("fdr has to be a number between 0 and 1") - } else if (fdr < 0 | fdr > 1) { - stop("fdr has to be a number between 0 and 1") - } - if (!is.numeric(pValue)) { - stop("pValue has to be a number between 0 and 1") - } else if (pValue < 0 | pValue > 1) { - stop("pValue has to be a number between 0 and 1") - } - - cdiff <- list() - x <- object@ndata - y <- object@expdata[, names(object@ndata)] - part <- object@MBclusters$clusterid - binompval <- function(p, N, n) { - pval <- pbinom(n, round(N, 0), p, lower.tail = TRUE) - filter <- !is.na(pval) & pval > 0.5 - pval[filter] <- 1 - pval[filter] - return(pval) - } - for (i in 1:max(part)) { - if (sum(part == i) == 0) - next - m <- apply(x, 1, mean) - n <- - if (sum(part == i) > 1) - apply(x[, part == i], 1, mean) - else - x[, part == i] - no <- - if (sum(part == i) > 1) - median(apply(y[, part == i], 2, sum)) / - median(apply(x[, part == i], 2, sum)) - else - sum(y[, part == i]) / sum(x[, part == i]) - m <- m * no - n <- n * no - pv <- binompval(m / sum(m), sum(n), n) - d <- - data.frame( - mean.all = m, - mean.cl = n, - fc = n / m, - pv = pv - )[order(pv, decreasing = FALSE), ] - cdiff[[i]] <- d[d$pv < pValue, ] - } - DEGsE <- c() - DEGsS <- c() - DEGsTable <- data.frame() - - for (n in 1:K) { - if (length(cdiff[[n]][, 1]) == 0) { - next - } - - if (length(cdiff[[n]][, 1]) > 0) { - p.adj <- p.adjust(cdiff[[n]][, 4], method = "bonferroni") - out <- cbind(cdiff[[n]], p.adj) - out <- subset(out, out[, 5] < fdr) - if (length(out[, 1]) > 0) { - Regulation <- c() - for (i in 1:length(out[, 1])) { - if (out[i, 1] > out[i, 2]) { - Regulation[i] = "Down" - } else{ - Regulation[i] = "Up" - } - } - out <- cbind(out, Regulation) - if (quiet) { - suppressMessages( - geneList <- - AnnotationDbi::select( - org.Hs.eg.db, - keys = keys(org.Hs.eg.db), - columns = c("SYMBOL", "ENSEMBL") - ) - ) - GL <- c(1, "MTRNR2", "ENSG00000210082") - GL1 <- c(1, "MTRNR1", "ENSG00000211459") - geneList <- rbind(geneList, GL, GL1) - } else { - geneList <- - AnnotationDbi::select( - org.Hs.eg.db, - keys = keys(org.Hs.eg.db), - columns = c("SYMBOL", "ENSEMBL") - ) - GL <- c(1, "MTRNR2", "ENSG00000210082") - GL1 <- c(1, "MTRNR1", "ENSG00000211459") - geneList <- rbind(geneList, GL, GL1) - } - genes <- rownames(out) - gene_list <- geneList[, 3] - idx_genes <- is.element(gene_list, genes) - genes2 <- geneList[idx_genes, ] - Final <- cbind(genes, out) - - Final <- - merge( - Final, - genes2, - by.x = "genes", - by.y = "ENSEMBL", - all.x = TRUE - ) - Final <- Final[!duplicated(Final[, 1]),] - Final[is.na(Final[, 9]), c(1, 9)] <- - Final[is.na(Final[, 9]), 1] - rownames(Final) <- Final[, 1] - Final[, 1] <- Final[, 9] - Final <- Final[, -9] - DEGsS <- c(DEGsS, Final[, 1]) - DEGsE <- - c(DEGsE, as.character(rownames(Final))) - Up <- subset(Final, Final[, 7] == "Up") - Up <- dplyr::select( - Up, "Regulation", "genes", "pv", "mean.all", "mean.cl", - "fc", "p.adj" - ) - Up[, 3] <- rownames(Up) - Up[, 6] <- log2(Up[, 6]) - Up[, 1] <- Up[, 2] - colnames(Up) <- c( - "Genes", "genes", "E.genes", "mean.all", "mean.cl", - "log2.fc", "p.adj" - ) - if (export) { - write.csv( - Up, file = paste0("Up-DEG-cluster", n, ".csv") - ) - } - - Down <- subset(Final, Final[, 7] == "Down") - Down <- dplyr::select( - Down, "Regulation", "genes", "pv", "mean.all", - "mean.cl", "fc", "p.adj" - ) - Down[, 3] <- rownames(Down) - Down[, 6] <- log2(Down[, 6]) - Down[, 1] <- Down[, 2] - colnames(Down) <- c( - "Genes", - "genes", - "E.genes", - "mean.all", - "mean.cl", - "log2.fc", - "p.adj" - ) - if (export) { - write.csv( - Down, - file = paste0("Down-DEG-cluster", n, ".csv") - ) - } - - sigDEG <- cbind(DEGsE, DEGsS) - if (export) { - write.csv(sigDEG, file = "binomial-sigDEG.csv") - } - - DEGsTable[n, 1] <- paste0("Cluster ", n) - DEGsTable[n, 2] <- "Remaining Clusters" - DEGsTable[n, 3] <- length(Up[, 1]) - DEGsTable[n, 4] <- - paste0("Up-DEG-cluster", n, ".csv") - DEGsTable[n, 5] <- length(Down[, 1]) - DEGsTable[n, 6] <- - paste0("Down-DEG-cluster", n, ".csv") - } - } - } - if (length(DEGsTable) > 0) { - colnames(DEGsTable) <- c( - "Target Cluster", "VS", "Gene number", "File name", - "Gene number", "File name" - ) - if (export) { - write.csv(DEGsTable, file = "binomial-DEGsTable.csv") - } - return(list(sigDEG, DEGsTable)) - } else{ - print(paste0("There are no DEGs with fdr=", fdr)) - } - } -) diff --git a/R/DIscBIO-generic-MBclustheatmap.R b/R/DIscBIO-generic-MBclustheatmap.R deleted file mode 100644 index b511403..0000000 --- a/R/DIscBIO-generic-MBclustheatmap.R +++ /dev/null @@ -1,248 +0,0 @@ -#' @title Plotting the Model-based clusters in a heatmap representation of the -#' cell-to-cell distances -#' @description This functions plots a heatmap of the distance matrix grouped -#' by clusters. Individual clusters are highlighted with rainbow colors along -#' the x and y-axes. -#' @param object \code{DISCBIO} class object. -#' @param hmethod Agglomeration method used for determining the cluster order -#' from hierarchical clustering of the cluster medoids. This should be one of -#' "ward.D", "ward.D2", "single", "complete", "average". Default is "single". -#' @param plot if `TRUE`, plots the heatmap; otherwise, just prints cclmo -#' @param quiet if `TRUE`, intermediary output is suppressed -#' @importFrom stats hclust as.dist cor kmeans -#' @importFrom cluster clusGap maxSE -#' @importFrom fpc clusterboot kmeansCBI -#' @return Unless otherwise specified, a heatmap and a vector of the underlying -#' cluster order. -#' @examples -#' \dontrun{ -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Exprmclust(sc,K = 2) -#' sc <- comptsneMB(sc, max_iter=100) -#' sc <- Clustexp(sc, cln=3) -#' sc <- MB_Order(sc, export = FALSE) -#' MBclustheatmap(sc, hmethod="single") -#' } -setGeneric( - "MBclustheatmap", - function( - object, hmethod="single", plot=TRUE, quiet=FALSE - ) - { - standardGeneric("MBclustheatmap") - } -) - -#' @export -#' @rdname MBclustheatmap -setMethod( - "MBclustheatmap", - signature = "DISCBIO", - definition = function(object, hmethod, plot, quiet) { - x <- object@fdata - object@clusterpar$metric <- "pearson" - dist.gen <- - function(x, method = "euclidean", ...) - if (method %in% c("spearman", "pearson", "kendall")) - as.dist(1 - cor(t(x), method = method, ...)) - else - dist(x, method = method, ...) - dist.gen.pairs <- function(x, y, ...) - dist.gen(t(cbind(x, y)), ...) - clustfun <- - function(x, - clustnr = 20, - bootnr = 50, - metric = "pearson", - do.gap = TRUE, - SE.method = "Tibs2001SEmax", - SE.factor = .25, - B.gap = 50, - cln = 0, - rseed = 17000, - quiet = FALSE) { - if (clustnr < 2) - stop("Choose clustnr > 1") - di <- dist.gen(t(x), method = metric) - if (do.gap | cln > 0) { - gpr <- NULL - if (do.gap) { - set.seed(rseed) - gpr <- clusGap( - as.matrix(di), - FUNcluster = kmeans, - K.max = clustnr, - B = B.gap, - verbose = !quiet - ) - if (cln == 0) - cln <- maxSE( - gpr$Tab[, 3], gpr$Tab[, 4], method = SE.method, SE.factor - ) - } - if (cln <= 1) { - clb <- list(result = list(partition = rep(1, dim(x)[2])), - bootmean = 1) - names(clb$result$partition) <- names(x) - return(list( - x = x, - clb = clb, - gpr = gpr, - di = di - )) - } - # FUN <- match.fun(clustermethod) - clb <- clusterboot( - di, - B = bootnr, - distances = FALSE, - bootmethod = "boot", - clustermethod = fpc::kmeansCBI, - krange = cln, - scaling = FALSE, - multipleboot = FALSE, - bscompare = TRUE, - seed = rseed, - count = !quiet - ) - return(list( - x = x, - clb = clb, - gpr = gpr, - di = di - )) - } - } - y <- - clustfun( - object@fdata, - clustnr = 20, - bootnr = 50, - metric = "pearson", - do.gap = TRUE, - SE.method = "Tibs2001SEmax", - SE.factor = .25, - B.gap = 50, - cln = 0, - rseed = 17000, - quiet = quiet - ) - object@distances <- as.matrix(y$di) - part <- object@MBclusters$clusterid - na <- c() - j <- 0 - for (i in 1:max(part)) { - if (sum(part == i) == 0) - next - j <- j + 1 - na <- append(na, i) - d <- x[, part == i] - if (sum(part == i) == 1) - cent <- d - else - cent <- apply(d, 1, mean) - if (j == 1) - tmp <- data.frame(cent) - else - tmp <- cbind(tmp, cent) - } - names(tmp) <- paste("cl", na, sep = ".") - if (max(part) > 1) - cclmo <- - hclust(dist.gen(as.matrix( - dist.gen(t(tmp), method = object@clusterpar$metric) - )), method = hmethod)$order - else - cclmo <- 1 - q <- part - for (i in 1:max(part)) { - q[part == na[cclmo[i]]] <- i - } - part <- q - di <- - as.data.frame(as.matrix(dist.gen(t(object@distances)))) - pto <- part[order(part, decreasing = FALSE)] - ptn <- c() - for (i in 1:max(pto)) { - pt <- - names(pto)[pto == i] - z <- - if (length(pt) == 1) - pt - else - pt[hclust(as.dist(t(di[pt, pt])), method = hmethod)$order] - ptn <- append(ptn, z) - } - col = c("black", "blue", "green", "red", "yellow", "gray") - mi <- min(di, na.rm = TRUE) - ma <- max(di, na.rm = TRUE) - - if (plot) { - layout( - matrix( - data = c(1, 3, 2, 4), - nrow = 2, - ncol = 2 - ), - widths = c(5, 1, 5, 1), - heights = c(5, 1, 1, 1) - ) - ColorRamp <- - colorRampPalette(brewer.pal(n = 7, name = "RdYlBu"))(100) - ColorLevels <- seq(mi, ma, length = length(ColorRamp)) - if (mi == ma) { - ColorLevels <- seq(0.99 * mi, 1.01 * ma, length = length(ColorRamp)) - } - par(mar = c(3, 5, 2.5, 2)) - image(as.matrix(di[ptn, ptn]), col = ColorRamp, axes = FALSE) - abline(0, 1) - box() - - tmp <- c() - for (u in 1:max(part)) { - ol <- (0:(length(part) - 1) / - (length(part) - 1))[ptn %in% names(x)[part == u]] - points( - rep(0, length(ol)), - ol, - col = col[cclmo[u]], - pch = 15, - cex = .75 - ) - points( - ol, - rep(0, length(ol)), - col = col[cclmo[u]], - pch = 15, - cex = .75 - ) - tmp <- append(tmp, mean(ol)) - } - axis(1, at = tmp, labels = cclmo) - axis(2, at = tmp, labels = cclmo) - par(mar = c(3, 2.5, 2.5, 2)) - image( - 1, - ColorLevels, - matrix( - data = ColorLevels, - ncol = length(ColorLevels), - nrow = 1 - ), - col = ColorRamp, - xlab = "", - ylab = "", - las = 2, - xaxt = "n" - ) - layout(1) - } - return(cclmo) - } -) \ No newline at end of file diff --git a/R/DIscBIO-generic-NoiseFiltering.R b/R/DIscBIO-generic-NoiseFiltering.R index 97c602d..2f34189 100644 --- a/R/DIscBIO-generic-NoiseFiltering.R +++ b/R/DIscBIO-generic-NoiseFiltering.R @@ -19,22 +19,24 @@ #' @param export A logical vector that allows writing the final gene list in #' excel file. Default is TRUE. #' @param quiet if `TRUE`, suppresses printed output -#' @importFrom matrixStats rowVars +#' @param filename Name of the exported file (if `export=TRUE`) #' @importFrom stats quantile var fitted.values pchisq p.adjust median aggregate #' @importFrom graphics plot axis abline points lines #' @importFrom statmod glmgam.fit #' @note This function should be used only if the dataset has ERCC. #' @return The DISCBIO-class object input with the noiseF slot filled. #' @examples -#' sc <- DISCBIO(valuesG1msReduced) # changes signature of data +#' sc <- DISCBIO(valuesG1msTest) # changes signature of data #' sd_filtered <- NoiseFiltering(sc, export=FALSE) #' str(sd_filtered) +#' setGeneric( name = "NoiseFiltering", def = function( object, percentile = 0.8, CV = 0.3, geneCol = "yellow", FgeneCol = "black", erccCol = "blue", Val = TRUE, plot = TRUE, - export = TRUE, quiet = FALSE + export = FALSE, quiet = FALSE, + filename = "Noise_filtering_genes_test" ) standardGeneric("NoiseFiltering") ) @@ -46,7 +48,7 @@ setMethod( signature = "DISCBIO", definition = function( object, percentile, CV, geneCol, FgeneCol, erccCol, Val, plot, - export, quiet + export, quiet, filename ) { if (!is.numeric(percentile)) @@ -86,20 +88,20 @@ setMethod( # perform fit, define sample moments per gene meansG1ms <- rowMeans(nCountsG1ms) - varsG1ms <- rowVars(nCountsG1ms) + varsG1ms <- apply(nCountsG1ms, 1, var) cv2G1ms <- varsG1ms / meansG1ms ^ 2 meansERCC <- rowMeans(nCountsERCC) - varsERCC <- rowVars(nCountsERCC) + varsERCC <- apply(nCountsERCC, 1, var) cv2ERCC <- varsERCC / meansERCC ^ 2 minMeanForFit <- unname( quantile(meansERCC[which(cv2ERCC > CV)], percentile) ) if (!quiet) { - cat( + message( "Cut-off value for the ERCCs= ", round(minMeanForFit, digits = 2), - "\n\n" + "\n" ) } @@ -115,7 +117,7 @@ setMethod( ) if (!quiet) { - cat("Coefficients of the fit:", "\n") + message("Coefficients of the fit:") print(fit$coefficients) } @@ -130,9 +132,11 @@ setMethod( total <- var(log(cv2ERCC[useForFit])) if (!quiet) { - cat("Explained variances of log CV^2 values= ", + message( + "Explained variances of log CV^2 values= ", c(round(1 - residual / total, digits = 2)), - "\n\n") + "\n" + ) } ## Pick out genes above noise line @@ -155,17 +159,21 @@ setMethod( genes_test <- sapply(genes_test, paste0, collapse = "") if (!quiet) { - cat("Number of genes that passed the filtering= ", + message( + "Number of genes that passed the filtering = ", length(genes_test), - "\n\n") + "\n" + ) } if (export) { - write.csv(genes_test, file = "Noise_filtering_genes_test.csv") - cat( - "The filtered gene list was saved as:", - "Noise_filtering_genes_test\n" - ) + write.csv(genes_test, file = paste0(filename, ".csv")) + if (!quiet) { + message( + "The filtered gene list was saved as ", + paste0(filename, ".csv") + ) + } } if (plot) { @@ -240,24 +248,6 @@ setMethod( col = erccCol ) # Showing all the valied ERCCs } - add_legend <- function(...) { - opar <- par( - fig = c(0, 1, 0, 1), - oma = c(0, 0, 0, 0), - mar = c(0, 0, 0, 0), - new = TRUE - ) - on.exit(par(opar)) - plot( - 0, - 0, - type = 'n', - bty = 'n', - xaxt = 'n', - yaxt = 'n' - ) - legend(...) - } add_legend( "topleft", legend = c( diff --git a/R/DIscBIO-generic-Normalizedata.R b/R/DIscBIO-generic-Normalizedata.R index 71e67e5..875b86f 100644 --- a/R/DIscBIO-generic-Normalizedata.R +++ b/R/DIscBIO-generic-Normalizedata.R @@ -19,12 +19,12 @@ #' downsampled versions of the transcript count data. Default is 1 which means #' that sampling noise should be comparable across cells. For high numbers of #' dsn the data will become similar to the median normalization. -#' @param rseed Integer number. Random seed to enforce reproducible clustering -#' results. Default is 17000. +#' @param rseed Random integer to enforce reproducible clustering. +#' results #' @include DIscBIO-classes.R #' @return The DISCBIO-class object input with the ndata and fdata slots filled. #' @examples -#' sc <- DISCBIO(valuesG1msReduced) # changes signature of data +#' sc <- DISCBIO(valuesG1msTest) # changes signature of data #' #' # In this case this function is used to normalize the reads #' sc_normal <- Normalizedata( @@ -32,11 +32,12 @@ #' dsn=1, rseed=17000 #' ) #' summary(sc_normal@fdata) +#' setGeneric( "Normalizedata", function( object, mintotal = 1000, minexpr = 0, minnumber = 0, maxexpr = Inf, - downsample = FALSE, dsn = 1, rseed = 17000 + downsample = FALSE, dsn = 1, rseed = NULL ) standardGeneric("Normalizedata") ) @@ -78,35 +79,6 @@ setMethod( cols <- apply(object@expdata, 2, sum, na.rm = TRUE) >= mintotal object@ndata <- object@expdata[, cols] if (downsample) { - downsample <- function(x, n, dsn) { - x <- round(x[, apply(x, 2, sum, na.rm = TRUE) >= n], 0) - nn <- min(apply(x, 2, sum)) - for (j in 1:dsn) { - z <- data.frame(GENEID = rownames(x)) - rownames(z) <- rownames(x) - initv <- rep(0, nrow(z)) - for (i in 1:dim(x)[2]) { - y <- - aggregate(rep(1, nn), list(sample( - rep(rownames(x), x[, i]), nn - )), sum) - na <- names(x)[i] - names(y) <- c("GENEID", na) - rownames(y) <- y$GENEID - z[, na] <- initv - k <- intersect(rownames(z), y$GENEID) - z[k, na] <- y[k, na] - z[is.na(z[, na]), na] <- 0 - } - rownames(z) <- as.vector(z$GENEID) - ds <- if (j == 1) - z[, -1] - else - ds + z[, -1] - } - ds <- ds / dsn + .1 - return(ds) - } set.seed(rseed) object@ndata <- downsample(object@expdata, n = mintotal, dsn = dsn) } else{ diff --git a/R/DIscBIO-generic-PCAplotSymbols.R b/R/DIscBIO-generic-PCAplotSymbols.R index b43a076..0de0caf 100644 --- a/R/DIscBIO-generic-PCAplotSymbols.R +++ b/R/DIscBIO-generic-PCAplotSymbols.R @@ -6,16 +6,7 @@ #' @importFrom grDevices rainbow #' @importFrom graphics legend #' @return Plot of the Principal Components -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Exprmclust(sc, K=2) -#' PCAplotSymbols(sc) +#' setGeneric("PCAplotSymbols", function(object, types = NULL) standardGeneric("PCAplotSymbols")) diff --git a/R/DIscBIO-generic-PlotmclustMB.R b/R/DIscBIO-generic-PlotmclustMB.R index 7a640b9..d9732fd 100644 --- a/R/DIscBIO-generic-PlotmclustMB.R +++ b/R/DIscBIO-generic-PlotmclustMB.R @@ -6,16 +6,6 @@ #' element_line unit element_text element_rect #' @importFrom igraph get.edgelist degree get.shortest.paths #' @return A plot of the PCA. -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Exprmclust(sc, K=2) -#' PlotmclustMB(sc) setGeneric("PlotmclustMB", function(object) standardGeneric("PlotmclustMB")) diff --git a/R/DIscBIO-generic-KmeanOrder.R b/R/DIscBIO-generic-clusteringOrder.R similarity index 70% rename from R/DIscBIO-generic-KmeanOrder.R rename to R/DIscBIO-generic-clusteringOrder.R index 140e2d0..5e8df8b 100644 --- a/R/DIscBIO-generic-KmeanOrder.R +++ b/R/DIscBIO-generic-clusteringOrder.R @@ -5,14 +5,11 @@ #' @param object \code{DISCBIO} class object. #' @param quiet if `TRUE`, suppresses intermediary output #' @param export if `TRUE`, exports order table to csv +#' @param filename Name of the exported file (if `export=TRUE`) #' @importFrom TSCAN TSCANorder #' @return The DISCBIO-class object input with the kordering slot filled. -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- Clustexp(sc, cln=3) # K-means clustering -#' Order <- KmeanOrder(sc, export = FALSE) -#' Order@kordering -setGeneric("KmeanOrder", function(object, quiet = FALSE, export = TRUE) +setGeneric("KmeanOrder", function(object, quiet = FALSE, export = FALSE, + filename = "Cellular_pseudo-time_ordering_based_on_k-meansc-lusters") standardGeneric("KmeanOrder") ) @@ -21,7 +18,7 @@ setGeneric("KmeanOrder", function(object, quiet = FALSE, export = TRUE) setMethod( "KmeanOrder", signature = "DISCBIO", - definition = function(object, quiet = FALSE, export = TRUE) { + definition = function(object, quiet, export, filename) { # Validation if (length(object@kmeans$kpart) == 0) { stop("run Clustexp before KmeanOrder") @@ -35,15 +32,9 @@ setMethod( orderID <- lpsorder order <- c(1:length(lpsorder)) orderTable <- data.frame(order, orderID) - if (export) { - nm <- "Cellular_pseudo-time_ordering_based_on_k-meansc-lusters.csv" - write.csv(orderTable, file = nm) - } - if (!quiet) { - print(orderTable) - } - FinalOrder <- - orderTable[match(sampleNames, orderTable$orderID), ] + if (export) write.csv(orderTable, file = paste0(filename, ".csv")) + if (!quiet) print(orderTable) + FinalOrder <- orderTable[match(sampleNames, orderTable$orderID), ] out_order <- FinalOrder[, 1] names(out_order) <- names(Clusters) object@kordering <- out_order diff --git a/R/DIscBIO-generic-KMclustheatmap.R b/R/DIscBIO-generic-clustheatmap.R similarity index 73% rename from R/DIscBIO-generic-KMclustheatmap.R rename to R/DIscBIO-generic-clustheatmap.R index 3c1fd57..cb26ad0 100644 --- a/R/DIscBIO-generic-KMclustheatmap.R +++ b/R/DIscBIO-generic-clustheatmap.R @@ -1,36 +1,62 @@ -#' @title Plotting the K-means clusters in a heatmap representation of the -#' cell-to-cell distances +#' @title Plotting clusters in a heatmap representation of the cell distances #' @description This functions plots a heatmap of the distance matrix grouped #' by clusters. Individual clusters are highlighted with rainbow colors along #' the x and y-axes. #' @param object \code{DISCBIO} class object. +#' @param clustering_method either "k-means" or "model-based" ("k" and "mb" are also accepted) #' @param hmethod Agglomeration method used for determining the cluster order #' from hierarchical clustering of the cluster medoids. This should be one of #' "ward.D", "ward.D2", "single", "complete", "average". Default is "single". +#' @param quiet if `TRUE`, intermediary output is suppressed +#' @param rseed Random integer to fix random results. #' @param plot if `TRUE`, plots the heatmap; otherwise, just prints cclmo #' @return Unless otherwise specified, a heatmap and a vector of the underlying #' cluster order. #' @importFrom stats hclust as.dist cor -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- Clustexp(sc, cln=3, quiet=TRUE) # K-means clustering -#' sc <- comptSNE(sc, max_iter=100, quiet=TRUE) -#' KMclustheatmap(sc, hmethod="single") setGeneric( - "KMclustheatmap", + "clustheatmap", function( - object, hmethod = "single", plot = TRUE - ) - standardGeneric("KMclustheatmap") + object, + clustering_method = "k-means", + hmethod="single", + rseed=NULL, + quiet=FALSE, + plot=TRUE) + { + standardGeneric("clustheatmap") + } ) #' @export -#' @rdname KMclustheatmap +#' @rdname clustheatmap setMethod( - "KMclustheatmap", + "clustheatmap", signature = "DISCBIO", - definition = function(object, hmethod, plot = TRUE) { + definition = function( + object, clustering_method, hmethod, rseed, quiet, plot + ) + { x <- object@fdata + if (tolower(clustering_method) %in% c("k-means", "k")) { + part <- object@kmeans$kpart + } else if (tolower(clustering_method) %in% c("model-based", "mb")) { + object@clusterpar$metric <- "pearson" + y <- clustfun( + object@fdata, + clustnr = 20, + bootnr = 50, + metric = "pearson", + do.gap = TRUE, + SE.method = "Tibs2001SEmax", + SE.factor = .25, + B.gap = 50, + cln = 0, + rseed = rseed, + quiet = quiet + ) + object@distances <- as.matrix(y$di) + part <- object@MBclusters$clusterid + } part <- object@kmeans$kpart na <- c() j <- 0 @@ -50,16 +76,6 @@ setMethod( tmp <- cbind(tmp, cent) } names(tmp) <- paste("cl", na, sep = ".") - - dist.gen <- - function(x, method = "euclidean", ...) - if (method %in% c("spearman", "pearson", "kendall")) - as.dist(1 - cor(t(x), method = method, ...)) - else - dist(x, method = method, ...) - dist.gen.pairs <- - function(x, y, ...) - dist.gen(t(cbind(x, y)), ...) if (max(part) > 1) cclmo <- hclust(dist.gen(as.matrix( @@ -72,10 +88,7 @@ setMethod( q[part == na[cclmo[i]]] <- i } part <- q - di <- - as.data.frame(as.matrix(dist.gen(t( - object@distances - )))) + di <- as.data.frame(as.matrix(dist.gen(t(object@distances)))) pto <- part[order(part, decreasing = FALSE)] ptn <- c() for (i in 1:max(pto)) { @@ -101,8 +114,9 @@ setMethod( widths = c(5, 1, 5, 1), heights = c(5, 1, 1, 1) ) - ColorRamp <- - colorRampPalette(brewer.pal(n = 7, name = "RdYlBu"))(100) + ColorRamp <- colorRampPalette( + brewer.pal(n = 7, name = "RdYlBu") + )(100) ColorLevels <- seq(mi, ma, length = length(ColorRamp)) if (mi == ma) { ColorLevels <- seq( @@ -110,7 +124,8 @@ setMethod( ) } - par(mar = c(3, 5, 2.5, 2)) + opar <- par(mar = c(3, 5, 2.5, 2)) + on.exit(par(opar)) image(as.matrix(di[ptn, ptn]), col = ColorRamp, axes = FALSE) abline(0, 1) box() @@ -137,7 +152,8 @@ setMethod( } axis(1, at = tmp, labels = cclmo) axis(2, at = tmp, labels = cclmo) - par(mar = c(3, 2.5, 2.5, 2)) + opar <- par(mar = c(3, 2.5, 2.5, 2)) + on.exit(par(opar)) image( 1, ColorLevels, diff --git a/R/DIscBIO-generic-comptSNE.R b/R/DIscBIO-generic-comptSNE.R index dcb9ea3..e6740a3 100644 --- a/R/DIscBIO-generic-comptSNE.R +++ b/R/DIscBIO-generic-comptSNE.R @@ -1,9 +1,9 @@ -#' @title Computing tSNE for K-means clustering +#' @title Computing tSNE #' @description This function is used to compute the t-Distributed Stochastic #' Neighbor Embedding (t-SNE). #' @param object \code{DISCBIO} class object. -#' @param rseed Integer number. Random seed to to yield exactly reproducible -#' maps across different runs. Default is 15555. +#' @param rseed Random integer to to yield reproducible maps across different +#' runs #' @param max_iter maximum number of iterations to perform. #' @param epoch The number of iterations in between update messages. #' @param quiet if `TRUE`, suppresses intermediate output @@ -12,42 +12,60 @@ #' @importFrom stats as.dist cor #' @return The DISCBIO-class object input with the tsne slot filled. #' @examples -#' sc <- DISCBIO(valuesG1msReduced) # changes signature of data -#' sc <- Clustexp(sc, cln=3) # data must be clustered before plottin -#' sc <- comptSNE(sc, max_iter=1000) +#' sc <- DISCBIO(valuesG1msTest) # changes signature of data +#' sc <- Clustexp(sc, cln=2) # data must be clustered before plottin +#' sc <- comptSNE(sc, max_iter=30) #' head(sc@tsne) +#' setGeneric( name = "comptSNE", - def = function(object, rseed = 15555, max_iter = 5000, epoch = 500, - quiet = FALSE, ...) { + def = function( + object, rseed=NULL, max_iter=5000, epoch=500, quiet=FALSE, ... + ) + { standardGeneric("comptSNE") } ) #' @rdname comptSNE #' @export - setMethod( - "comptSNE", +setMethod( + f = "comptSNE", signature = "DISCBIO", - definition = function(object, rseed, max_iter, epoch, quiet, ...) { - if (length(object@kmeans$kpart) == 0) - stop("run Clustexp before comptSNE") - set.seed(rseed) - dist.gen <- - function(x, method = "euclidean") - if (method %in% c("spearman", "pearson", "kendall")) - as.dist(1 - cor(t(x), method = method)) - else - dist(x, method = method) - di <- dist.gen(as.matrix(object@distances)) - if (quiet) { - ts <- suppressMessages( - tsne(di, max_iter = max_iter, epoch = epoch, ...) - ) - } else { - ts <- tsne(di, max_iter = max_iter, epoch = epoch, ...) + definition = function(object, rseed, max_iter, epoch, quiet, ...) + { + # ====================================================================== + # Validating + # ====================================================================== + ran_k <- length(object@kmeans$kpart) > 0 + ran_m <- length(object@MBclusters) > 0 + if (ran_k) { + di <- dist.gen(as.matrix(object@distances)) + } else if (ran_m) { + di <- dist.gen(as.matrix(t(object@fdata))) + } else { + stop("run clustexp before comptSNE") + } + # ====================================================================== + # Computing + # ====================================================================== + set.seed(rseed) + if (quiet) { + ts <- suppressMessages( + tsne(di, max_iter = max_iter, epoch = epoch, ...) + ) + } else { + message("This function may take time") + ts <- tsne(di, max_iter = max_iter, epoch = epoch, ...) + } + # ====================================================================== + # Filling output + # ====================================================================== + if (ran_k) { + object@tsne <- as.data.frame(ts) + } else if (ran_m) { + object@MBtsne <- as.data.frame(ts) + } + return(object) } - object@tsne <- as.data.frame(ts) - return(object) - } - ) \ No newline at end of file +) \ No newline at end of file diff --git a/R/DIscBIO-generic-comptsneMB.R b/R/DIscBIO-generic-comptsneMB.R deleted file mode 100644 index 1b5a93d..0000000 --- a/R/DIscBIO-generic-comptsneMB.R +++ /dev/null @@ -1,61 +0,0 @@ -#' @title Computing tSNE for Model-based clustering -#' @description This function is used to compute the t-Distributed Stochastic -#' Neighbor Embedding (t-SNE). -#' @param object \code{DISCBIO} class object. -#' @param rseed Integer number. Random seed to to yield exactly reproducible -#' maps across different runs. Default is 15555. -#' @param max_iter maximum number of iterations to perform. -#' @param epoch The number of iterations in between update messages. -#' @param quiet if `TRUE`, suppresses intermediate output -#' @param ... other parameters to be passed to `tsne::tsne` -#' @importFrom tsne tsne -#' @importFrom stats as.dist cor -#' @return The DISCBIO-class object input with the MBtsne slot filled. -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Exprmclust(sc) -#' sc <- comptsneMB(sc, rseed=15555, max_iter = 1000) -#' print(sc@MBtsne) -setGeneric( - name = "comptsneMB", - def = function(object, rseed = 15555, max_iter = 5000, epoch = 500, - quiet = FALSE, ...) { - standardGeneric("comptsneMB") - } -) - -#' @rdname comptsneMB -#' @export -setMethod( - f = "comptsneMB", - signature = "DISCBIO", - definition = function(object, rseed, max_iter, epoch, quiet, ...) { - if (length(object@MBclusters) == 0) - stop("run clustexp before comptsneMB") - set.seed(rseed) - dist.gen <- function(x, method = "euclidean") { - if (method %in% c("spearman", "pearson", "kendall")) { - as.dist(1 - cor(t(x), method = method)) - } else { - dist(x, method = method) - } - } - di <- dist.gen(as.matrix(t(object@fdata))) - if (quiet) { - ts <- suppressMessages( - tsne(di, max_iter = max_iter, epoch = epoch, ...) - ) - } else { - message("This function may take time") - ts <- tsne(di, max_iter = max_iter, epoch = epoch, ...) - } - object@MBtsne <- as.data.frame(ts) - return(object) - } -) \ No newline at end of file diff --git a/R/DIscBIO-generic-plotExptSNE.R b/R/DIscBIO-generic-plotExptSNE.R index b73acae..0fed04b 100644 --- a/R/DIscBIO-generic-plotExptSNE.R +++ b/R/DIscBIO-generic-plotExptSNE.R @@ -1,4 +1,4 @@ -#' @title Highlighting gene expression in K-means clustering in the t-SNE map +#' @title Highlighting gene expression in the t-SNE map #' @description The t-SNE map representation can also be used to analyze #' expression of a gene or a group of genes, to investigate cluster specific #' gene expression patterns @@ -9,12 +9,6 @@ #' @param n String of characters representing the title of the plot. Default is #' NULL and the first element of \code{g} is chosen. #' @return t-SNE plot for one particular gene -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- Clustexp(sc, cln=3, quiet=TRUE) # K-means clustering -#' sc <- comptSNE(sc, max_iter=100) -#' g <- 'ENSG00000001460' -#' plotExptSNE(sc, g) setGeneric("plotExptSNE", function(object, g, n = NULL) standardGeneric("plotExptSNE") @@ -26,20 +20,33 @@ setMethod( "plotExptSNE", signature = "DISCBIO", definition = function(object, g, n = NULL) { - if (length(object@tsne) == 0) + # ====================================================================== + # Validation + # ====================================================================== + ran_k <- length(object@tsne) > 0 + ran_m <- length(object@MBtsne) > 0 + if (ran_k) { + x <- object@tsne + } else if (ran_m) { + x <- object@MBtsne + } else { stop("run comptSNE before plotExptSNE") + } if (length(intersect(g, rownames(object@ndata))) < length(unique(g))) stop( "second argument does not correspond to set of rownames slot", "ndata of SCseq object" ) - if (is.null(n)) - n <- g[1] + if (is.null(n)) n <- g[1] + # ====================================================================== + # Plotting + # ====================================================================== l <- apply(object@ndata[g, ] - .1, 2, sum) + .1 mi <- min(l, na.rm = TRUE) ma <- max(l, na.rm = TRUE) - ColorRamp <- - colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100) + ColorRamp <- colorRampPalette( + rev(brewer.pal(n = 7, name = "RdYlBu")) + )(100) ColorLevels <- seq(mi, ma, length = length(ColorRamp)) v <- round((l - mi) / (ma - mi) * 99 + 1, 0) layout( @@ -51,7 +58,8 @@ setMethod( widths = c(5, 1, 5, 1), heights = c(5, 1, 1, 1) ) - par(mar = c(3, 5, 2.5, 2)) + opar <- par(mar = c(3, 5, 2.5, 2)) + on.exit(par(opar)) plot( object@tsne, xlab = "Dim 1", @@ -64,14 +72,15 @@ setMethod( ) for (k in 1:length(v)) { points( - object@tsne[k, 1], - object@tsne[k, 2], + x[k, 1], + x[k, 2], col = ColorRamp[v[k]], pch = 20, cex = 1.5 ) } - par(mar = c(3, 2.5, 2.5, 2)) + opar <- par(mar = c(3, 2.5, 2.5, 2)) + on.exit(par(opar)) image( 1, ColorLevels, diff --git a/R/DIscBIO-generic-plotGap.R b/R/DIscBIO-generic-plotGap.R index 91e2c69..f878e48 100644 --- a/R/DIscBIO-generic-plotGap.R +++ b/R/DIscBIO-generic-plotGap.R @@ -4,10 +4,6 @@ #' @param object \code{DISCBIO} class object. #' @param y_limits 2-length numeric vector with the limits for the gap plot #' @return A plot of the gap statistics -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) # changes signature of data -#' sc <- Clustexp(sc, cln=3) # data must be clustered before plotting -#' plotGap(sc) setGeneric( "plotGap", function(object, y_limits = NULL) standardGeneric("plotGap") ) diff --git a/R/DIscBIO-generic-plotKmeansLabelstSNE.R b/R/DIscBIO-generic-plotKmeansLabelstSNE.R deleted file mode 100644 index be29590..0000000 --- a/R/DIscBIO-generic-plotKmeansLabelstSNE.R +++ /dev/null @@ -1,49 +0,0 @@ -#' @title tSNE map for K-means clustering with labels -#' @description Visualizing the K-means clusters using tSNE maps -#' @param object \code{DISCBIO} class object. -#' @rdname plotKmeansLabelstSNE -#' @importFrom graphics text -#' @return Plot containing the ID of the cells in each cluster -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) # changes signature of data -#' sc <- Clustexp(sc, cln=3) # data must be clustered before plottin -#' sc <- comptSNE(sc, max_iter=100) -#' plotKmeansLabelstSNE(sc) # Plots the ID of the cells in each cluster -setGeneric("plotKmeansLabelstSNE", function(object) - standardGeneric("plotKmeansLabelstSNE")) - -#' @rdname plotKmeansLabelstSNE -#' @export -setMethod( - "plotKmeansLabelstSNE", - signature = "DISCBIO", - definition = function(object) { - if (length(object@tsne) == 0) - stop("run comptsne before plotKmeansLabelstSNE") - Clusters <- object@kmeans$kpart - ClustersFactor <- as.factor(Clusters) - ClustersFactor <- gsub("1", "black", ClustersFactor) - ClustersFactor <- gsub("2", "blue", ClustersFactor) - ClustersFactor <- gsub("3", "green", ClustersFactor) - ClustersFactor <- gsub("4", "red", ClustersFactor) - ClustersFactor <- gsub("5", "yellow", ClustersFactor) - ClustersFactor <- gsub("6", "gray", ClustersFactor) - COL <- ClustersFactor - labels = names(object@ndata) - plot( - object@tsne, - xlab = "Dim 1", - ylab = "Dim 2", - pch = 20, - cex = .5, - col = "lightgrey" - ) - text( - object@tsne[, 1], - object@tsne[, 2], - labels, - cex = .7, - col = COL - ) - } -) \ No newline at end of file diff --git a/R/DIscBIO-generic-plotLabelstSNE.R b/R/DIscBIO-generic-plotLabelstSNE.R new file mode 100644 index 0000000..41c9f8e --- /dev/null +++ b/R/DIscBIO-generic-plotLabelstSNE.R @@ -0,0 +1,62 @@ +#' @title tSNE map with labels +#' @description Visualizing k-means or model-based clusters using tSNE maps +#' @param object \code{DISCBIO} class object. +#' @rdname plotLabelstSNE +#' @importFrom graphics text +#' @return Plot containing the ID of the cells in each cluster +setGeneric("plotLabelstSNE", function(object) + { + standardGeneric("plotLabelstSNE") + } +) + +#' @rdname plotLabelstSNE +#' @export +setMethod( + "plotLabelstSNE", + signature = "DISCBIO", + definition = function(object) + { + # ====================================================================== + # Validating + # ====================================================================== + ran_k <- length(object@tsne) > 0 + ran_m <- length(object@MBtsne) > 0 + if (ran_k) { + Clusters <- object@kmeans$kpart + x <- object@tsne + } else if (ran_m) { + Clusters <- object@MBclusters$clusterid + x <- object@MBtsne + } else { + stop("run comptsne before plotLabelstSNE") + } + # ====================================================================== + # Plotting + # ====================================================================== + ClustersFactor <- as.factor(Clusters) + ClustersFactor <- gsub("1", "black", ClustersFactor) + ClustersFactor <- gsub("2", "blue", ClustersFactor) + ClustersFactor <- gsub("3", "green", ClustersFactor) + ClustersFactor <- gsub("4", "red", ClustersFactor) + ClustersFactor <- gsub("5", "yellow", ClustersFactor) + ClustersFactor <- gsub("6", "gray", ClustersFactor) + COL <- ClustersFactor + labels = names(object@ndata) + plot( + x, + xlab = "Dim 1", + ylab = "Dim 2", + pch = 20, + cex = .5, + col = "lightgrey" + ) + text( + x[, 1], + x[, 2], + labels, + cex = .7, + col = COL + ) + } +) \ No newline at end of file diff --git a/R/DIscBIO-generic-plotMBLabelstSNE.R b/R/DIscBIO-generic-plotMBLabelstSNE.R deleted file mode 100644 index 2cdcbbd..0000000 --- a/R/DIscBIO-generic-plotMBLabelstSNE.R +++ /dev/null @@ -1,55 +0,0 @@ -#' @title tSNE map for Model-based clustering with labels -#' @description Visualizing the Model-based clusters using tSNE maps -#' @param object \code{DISCBIO} class object. -#' @importFrom graphics text -#' @return A plot of the `object@MBtsne` values -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Exprmclust(sc, K=2) -#' sc <- comptsneMB(sc, rseed=15555, quiet=TRUE, max_iter=100) -#' plotMBLabelstSNE(sc) -setGeneric("plotMBLabelstSNE", function(object) - standardGeneric("plotMBLabelstSNE") -) - -#' @rdname plotMBLabelstSNE -#' @export -setMethod( - "plotMBLabelstSNE", - signature = "DISCBIO", - definition = function(object) { - if (length(object@MBtsne) == 0) - stop("run comptsneMB before plotMBLabelstSNE") - Clusters <- object@MBclusters$clusterid - ClustersFactor <- as.factor(Clusters) - ClustersFactor <- gsub("1", "black", ClustersFactor) - ClustersFactor <- gsub("2", "blue", ClustersFactor) - ClustersFactor <- gsub("3", "green", ClustersFactor) - ClustersFactor <- gsub("4", "red", ClustersFactor) - ClustersFactor <- gsub("5", "yellow", ClustersFactor) - ClustersFactor <- gsub("6", "gray", ClustersFactor) - COL <- ClustersFactor - labels = names(object@ndata) - plot( - object@MBtsne, - xlab = "Dim 1", - ylab = "Dim 2", - pch = 20, - cex = .5, - col = "lightgrey" - ) - text( - object@MBtsne[, 1], - object@MBtsne[, 2], - labels, - cex = .7, - col = COL - ) - } -) \ No newline at end of file diff --git a/R/DIscBIO-generic-plotOrderMBtsne.R b/R/DIscBIO-generic-plotOrderMBtsne.R deleted file mode 100644 index ea7d226..0000000 --- a/R/DIscBIO-generic-plotOrderMBtsne.R +++ /dev/null @@ -1,85 +0,0 @@ -#' @title Plotting the pseudo-time ordering based on Model-based clusters in the -#' t-SNE map -#' @description The tSNE representation can also be used to show the pseudo-time -#' ordering. -#' @param object \code{DISCBIO} class object. -#' @return A plot of the pseudo-time ordering. -#' @examples -#' sc<- DISCBIO(valuesG1msReduced) -#' sc<-NoiseFiltering(sc,percentile=0.9, CV=0.2, export=FALSE) -#' sc<-Normalizedata(sc) -#' sc<-FinalPreprocessing(sc,GeneFlitering="NoiseF", export=FALSE) -#' sc <- Exprmclust(sc, K=2, reduce=TRUE, quiet=TRUE) -#' sc<- comptsneMB(sc, rseed=15555, quiet=TRUE, max_iter=100) -#' plotOrderMBtsne(sc) -setGeneric("plotOrderMBtsne", function(object) - standardGeneric("plotOrderMBtsne")) - -#' @export -#' @rdname plotOrderMBtsne -setMethod( - "plotOrderMBtsne", - signature = "DISCBIO", - definition = function(object) { - if (length(object@MBtsne) == 0) - stop("run comptsneMB before plotOrderMBtsne") - total <- rbind(object@ndata, object@MBordering) - rownames(total)[nrow(total)] <- - "Pseudo-time ordering of MBclustering" - g <- rownames(total)[nrow(total)] - n <- g[1] - l <- apply(total[g, ] - .1, 2, sum) + .1 - - mi <- min(l, na.rm = TRUE) - ma <- max(l, na.rm = TRUE) - ColorRamp <- - colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100) - ColorLevels <- seq(mi, ma, length = length(ColorRamp)) - v <- round((l - mi) / (ma - mi) * 99 + 1, 0) - layout( - matrix( - data = c(1, 3, 2, 4), - nrow = 2, - ncol = 2 - ), - widths = c(5, 1, 5, 1), - heights = c(5, 1, 1, 1) - ) - par(mar = c(3, 5, 2.5, 2)) - plot( - object@MBtsne, - xlab = "Dim 1", - ylab = "Dim 2", - main = n, - pch = 20, - cex = 0, - col = "grey", - las = 1 - ) - for (k in 1:length(v)) { - points( - object@MBtsne[k, 1], - object@MBtsne[k, 2], - col = ColorRamp[v[k]], - pch = 20, - cex = 1.5 - ) - } - par(mar = c(3, 2.5, 2.5, 2)) - image( - 1, - ColorLevels, - matrix( - data = ColorLevels, - ncol = length(ColorLevels), - nrow = 1 - ), - col = ColorRamp, - xlab = "", - ylab = "", - las = 1, - xaxt = "n" - ) - layout(1) - } -) \ No newline at end of file diff --git a/R/DIscBIO-generic-plotOrderKMtsne.R b/R/DIscBIO-generic-plotOrderTsne.R similarity index 60% rename from R/DIscBIO-generic-plotOrderKMtsne.R rename to R/DIscBIO-generic-plotOrderTsne.R index f5319ca..1b6c96e 100644 --- a/R/DIscBIO-generic-plotOrderKMtsne.R +++ b/R/DIscBIO-generic-plotOrderTsne.R @@ -1,37 +1,43 @@ -#' @title Plotting the pseudo-time ordering based on K-means clusters in the -#' t-SNE map +#' @title Plotting the pseudo-time ordering in the t-SNE map #' @description The tSNE representation can also be used to show the pseudo-time #' ordering. #' @param object \code{DISCBIO} class object. #' @return A plot of the pseudo-time ordering. -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- Clustexp(sc, cln=3) # K-means clustering -#' sc <- comptSNE(sc, max_iter=100) -#' sc <- KmeanOrder(sc, export = FALSE) -#' plotOrderKMtsne(sc) -setGeneric("plotOrderKMtsne", function(object) - standardGeneric("plotOrderKMtsne")) +setGeneric("plotOrderTsne", function(object) + standardGeneric("plotOrderTsne")) #' @export -#' @rdname plotOrderKMtsne +#' @rdname plotOrderTsne setMethod( - "plotOrderKMtsne", + "plotOrderTsne", signature = "DISCBIO", definition = function(object) { - if (length(object@tsne) == 0) - stop("run comptsne before plotOrderKMtsne") - total <- rbind(object@ndata, object@kordering) - rownames(total)[nrow(total)] <- - "Pseudo-time ordering of K-means clustering" + ran_k <- length(object@tsne) > 0 + ran_m <- length(object@MBtsne) > 0 + if (ran_k) { + total <- rbind(object@ndata, object@kordering) + clustering_method <- "k-means" + x <- object@tsne + } else if (ran_m) { + total <- rbind(object@ndata, object@MBordering) + clustering_method <- "model-based" + x <- object@MBtsne + } else { + stop("run comptsne before plotOrderTsne") + } + + rownames(total)[nrow(total)] <- paste( + "Pseudo-time ordering of", clustering_method, "clustering" + ) g <- rownames(total)[nrow(total)] n <- g[1] l <- apply(total[g, ] - .1, 2, sum) + .1 mi <- min(l, na.rm = TRUE) ma <- max(l, na.rm = TRUE) - ColorRamp <- - colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100) + ColorRamp <- colorRampPalette( + rev(brewer.pal(n = 7, name = "RdYlBu")) + )(100) ColorLevels <- seq(mi, ma, length = length(ColorRamp)) v <- round((l - mi) / (ma - mi) * 99 + 1, 0) layout( @@ -43,9 +49,10 @@ setMethod( widths = c(5, 1, 5, 1), heights = c(5, 1, 1, 1) ) - par(mar = c(3, 5, 2.5, 2)) + opar <- par(mar = c(3, 5, 2.5, 2)) + on.exit(par(opar)) plot( - object@tsne, + x, xlab = "Dim 1", ylab = "Dim 2", main = n, @@ -56,14 +63,15 @@ setMethod( ) for (k in 1:length(v)) { points( - object@tsne[k, 1], - object@tsne[k, 2], + x[k, 1], + x[k, 2], col = ColorRamp[v[k]], pch = 20, cex = 1.5 ) } - par(mar = c(3, 2.5, 2.5, 2)) + opar <- par(mar = c(3, 2.5, 2.5, 2)) + on.exit(par(opar)) image( 1, ColorLevels, diff --git a/R/DIscBIO-generic-plotSilhouette.R b/R/DIscBIO-generic-plotSilhouette.R index 58f5457..3986d5b 100644 --- a/R/DIscBIO-generic-plotSilhouette.R +++ b/R/DIscBIO-generic-plotSilhouette.R @@ -11,11 +11,6 @@ #' @importFrom stats as.dist cor #' @importFrom cluster silhouette #' @return A silhouette plot -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) # changes signature of data -#' sc <- Clustexp(sc, cln=3) # data must be clustered before plottin -#' sc <- comptSNE(sc, max_iter=100) -#' plotSilhouette(sc, K=3) setGeneric( name = "plotSilhouette", def = function(object, K) standardGeneric("plotSilhouette") @@ -27,22 +22,27 @@ setMethod( f = "plotSilhouette", signature = "DISCBIO", definition = function(object, K) { - if (length(object@kmeans$kpart) == 0) { - stop("run clustexp before plotsilhouette") - } - if (length(unique(object@kmeans$kpart)) < 2) { - stop("only a single cluster: no silhouette plot") - } - col <- c("black", "blue", "green", "red", "yellow", "gray") - kpart <- object@kmeans$kpart - dist.gen <- - function(x, method = "euclidean", ...) - if (method %in% c("spearman", "pearson", "kendall")) - as.dist(1 - cor(t(x), method = method, ...)) - else - dist(x, method = method, ...) - distances <- dist.gen(object@distances) - si <- silhouette(kpart, distances) - plot(si, col = col[1:K]) + # ====================================================================== + # Validation + # ====================================================================== + ran_clustexp <- length(object@kmeans$kpart) > 0 + ran_exprmclust <- length(object@MBclusters$clusterid) > 0 + if (ran_clustexp) { + kpart <- object@kmeans$kpart + } else if (ran_exprmclust) { + kpart <- object@MBclusters$clusterid + } else { + stop("run clustexp or exprmclust before plotSilhouette") + } + if (length(unique(kpart)) < 2) { + stop("only a single cluster: no silhouette plot") + } + # ====================================================================== + # Plotting + # ====================================================================== + col <- c("black", "blue", "green", "red", "yellow", "gray") + distances <- dist.gen(object@distances) + si <- silhouette(kpart, distances) + plot(si, col = col[1:K]) } ) \ No newline at end of file diff --git a/R/DIscBIO-generic-plotSymbolstSNE.R b/R/DIscBIO-generic-plotSymbolstSNE.R index 0ab56f7..099b6d3 100644 --- a/R/DIscBIO-generic-plotSymbolstSNE.R +++ b/R/DIscBIO-generic-plotSymbolstSNE.R @@ -6,11 +6,6 @@ #' @param legloc A keyword from the list "bottomright", "bottom", "bottomleft", #' "left", "topleft", "top", "topright", "right" and "center". Default is #' "bottomright" -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) # changes signature of data -#' sc <- Clustexp(sc, cln=3) # data must be clustered before plottin -#' sc <- comptSNE(sc, max_iter=100, quiet=TRUE) -#' plotSymbolstSNE(sc,types=sub("(\\_\\d+)$","", names(sc@ndata))) setGeneric( "plotSymbolstSNE", function(object, types = NULL, legloc = "bottomright") { diff --git a/R/DIscBIO-generic-plotexptsneMB.R b/R/DIscBIO-generic-plotexptsneMB.R deleted file mode 100644 index e3aa0d0..0000000 --- a/R/DIscBIO-generic-plotexptsneMB.R +++ /dev/null @@ -1,101 +0,0 @@ -#' @title Highlighting gene expression in Model-based clustering in the t-SNE -#' map -#' @description The t-SNE map representation can also be used to analyze -#' expression of a gene or a group of genes, to investigate cluster specific -#' gene expression patterns -#' @param object \code{DISCBIO} class object. -#' @param g Individual gene name or vector with a group of gene names -#' corresponding to a subset of valid row names of the \code{ndata} slot of -#' the \code{DISCBIO} object. -#' @param n String of characters representing the title of the plot. Default is -#' NULL and the first element of \code{g} is chosen. -#' @return t-SNE plot for one particular gene -#' @examples -#' \dontrun{ -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Exprmclust(sc, K=3) -#' sc <- comptsneMB(sc, max_iter=100) -#' sc <- Clustexp(sc, cln=3) -#' sc <- MB_Order(sc, export = FALSE) -#' g <- 'ENSG00000001460' -#' plotexptsneMB(sc, g) -#' } -setGeneric("plotexptsneMB", function(object, g, n = NULL) - standardGeneric("plotexptsneMB")) - -#' @export -#' @rdname plotexptsneMB -setMethod( - "plotexptsneMB", - signature = "DISCBIO", - definition = function(object, g, n = NULL) { - if (length(object@MBtsne) == 0) - stop("run comptsneMB before plotexptsneMB") - if (length(intersect(g, rownames(object@ndata))) < length(unique(g))) - stop( - "second argument does not correspond to set of rownames", - "slot ndata of SCseq object" - ) - if (is.null(n)) - n <- g[1] - l <- apply(object@ndata[g, ] - .1, 2, sum) + .1 - - mi <- min(l, na.rm = TRUE) - ma <- max(l, na.rm = TRUE) - ColorRamp <- - colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100) - ColorLevels <- seq(mi, ma, length = length(ColorRamp)) - v <- round((l - mi) / (ma - mi) * 99 + 1, 0) - layout( - matrix( - data = c(1, 3, 2, 4), - nrow = 2, - ncol = 2 - ), - widths = c(5, 1, 5, 1), - heights = c(5, 1, 1, 1) - ) - par(mar = c(3, 5, 2.5, 2)) - plot( - object@MBtsne, - xlab = "Dim 1", - ylab = "Dim 2", - main = n, - pch = 20, - cex = 0, - col = "grey", - las = 1 - ) - for (k in 1:length(v)) { - points( - object@MBtsne[k, 1], - object@MBtsne[k, 2], - col = ColorRamp[v[k]], - pch = 20, - cex = 1.5 - ) - } - par(mar = c(3, 2.5, 2.5, 2)) - image( - 1, - ColorLevels, - matrix( - data = ColorLevels, - ncol = length(ColorLevels), - nrow = 1 - ), - col = ColorRamp, - xlab = "", - ylab = "", - las = 1, - xaxt = "n" - ) - layout(1) - } -) \ No newline at end of file diff --git a/R/DIscBIO-generic-plotsilhouetteMB.R b/R/DIscBIO-generic-plotsilhouetteMB.R deleted file mode 100644 index de3338e..0000000 --- a/R/DIscBIO-generic-plotsilhouetteMB.R +++ /dev/null @@ -1,49 +0,0 @@ -#' @title Silhouette Plot for Model-based clustering -#' @description The silhouette provides a representation of how well each point -#' is represented by its cluster in comparison to the closest neighboring -#' cluster. It computes for each point the difference between the average -#' similarity to all points in the same cluster and to all points in the -#' closest neighboring cluster. This difference it normalize such that it can -#' take values between -1 and 1 with higher values reflecting better -#' representation of a point by its cluster. -#' @param object \code{DISCBIO} class object. -#' @param K A numeric value of the number of clusters -#' @importFrom stats as.dist cor -#' @importFrom cluster silhouette -#' @return A silhouette plot -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, -#' downsample=FALSE, dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Exprmclust(sc, K=2, reduce=TRUE, quiet=TRUE) -#' plotsilhouetteMB(sc, K=2) -setGeneric("plotsilhouetteMB", function(object, K) - standardGeneric("plotsilhouetteMB")) - -#' @export -#' @rdname plotsilhouetteMB -setMethod( - "plotsilhouetteMB", - signature = "DISCBIO", - definition = function(object, K) { - if (length(object@MBclusters$clusterid) == 0) - stop("run exprmclust before plotsilhouetteMB") - if (length(unique(object@MBclusters$clusterid)) < 2) - stop("only a single cluster: no silhouette plot") - col = c("black", "blue", "green", "red", "yellow", "gray") - kpart <- object@MBclusters$clusterid - dist.gen <- - function(x, method = "euclidean", ...) - if (method %in% c("spearman", "pearson", "kendall")) - as.dist(1 - cor(t(x), method = method, ...)) - else - dist(x, method = method, ...) - distances <- dist.gen(t(object@fdata)) - si <- silhouette(kpart, distances) - plot(si, col = col[1:K]) - } -) \ No newline at end of file diff --git a/R/DIscBIO-generic-plottSNE.R b/R/DIscBIO-generic-plottSNE.R index 3196498..60d05ba 100644 --- a/R/DIscBIO-generic-plottSNE.R +++ b/R/DIscBIO-generic-plottSNE.R @@ -1,28 +1,37 @@ -#' @title tSNE map for K-means clustering -#' @description Visualizing the K-means clusters using tSNE maps +#' @title tSNE map +#' @description Visualizing the k-means or model-based clusters using tSNE maps #' @param object \code{DISCBIO} class object. #' @importFrom graphics text #' @return A plot of t-SNEs. -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) # changes signature of data -#' sc <- Clustexp(sc, cln=3) # data must be clustered before plotting -#' sc <- comptSNE(sc, max_iter=100, quiet=TRUE) -#' plottSNE(sc) setGeneric("plottSNE", function(object) standardGeneric("plottSNE")) -#' @rdname plotSilhouette +#' @rdname plottSNE #' @export setMethod( "plottSNE", signature = "DISCBIO", definition = function(object) { - if (length(object@tsne) == 0) - stop("run comptsne before plottsne") + # ====================================================================== + # Validating + # ====================================================================== + ran_k <- length(object@tsne) > 0 + ran_m <- length(object@MBtsne) > 0 + if (ran_k) { + part <- object@kmeans$kpart + x <- object@tsne + } else if (ran_m) { + part <- object@MBclusters$clusterid + x <- object@MBtsne + } else { + stop("run comptsne before plottSNE") + } + # ====================================================================== + # Plotting + # ====================================================================== col <- c("black", "blue", "green", "red", "yellow", "gray") - part <- object@kmeans$kpart plot( - object@tsne, + x, las = 1, xlab = "Dim 1", ylab = "Dim 2", @@ -30,11 +39,11 @@ setMethod( cex = 1.5, col = "lightgrey" ) - for (i in 1:max(part)) { + for (i in seq_len(part)) { if (sum(part == i) > 0) { text( - object@tsne[part == i, 1], - object@tsne[part == i, 2], + x[part == i, 1], + x[part == i, 2], i, col = col[i], cex = .75, diff --git a/R/DIscBIO-generic-plottsneMB.R b/R/DIscBIO-generic-plottsneMB.R deleted file mode 100644 index e70956c..0000000 --- a/R/DIscBIO-generic-plottsneMB.R +++ /dev/null @@ -1,57 +0,0 @@ -#' @title tSNE map for Model-based clustering -#' @description Visualizing the Model-based clusters using tSNE maps -#' @param object \code{DISCBIO} class object. -#' @param K A numeric value of the number of clusters -#' @importFrom graphics text -#' @return A plot of t-SNEs. -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Exprmclust(sc, K=2) -#' sc <- comptsneMB(sc, rseed=15555, quiet=TRUE, max_iter=100) -#' plottsneMB(sc) -setGeneric( - name = "plottsneMB", - def = function(object, K = length(table(object@MBclusters$clusterid))) { - standardGeneric("plottsneMB") - } -) - -#' @export -#' @rdname plottsneMB -setMethod( - f = "plottsneMB", - signature = "DISCBIO", - definition = function(object, K) { - if (length(object@MBtsne) == 0) - stop("run comptsneMB before plottsneMB") - col = c("black", "blue", "green", "red", "yellow", "gray") - part <- object@MBclusters$clusterid - plot( - object@MBtsne, - xlab = "Dim 1", - ylab = "Dim 2", - pch = 20, - cex = 1.5, - col = "lightgrey", - las = 1 - ) - for (i in 1:K) { - if (sum(part == i) > 0) { - text( - object@MBtsne[part == i, 1], - object@MBtsne[part == i, 2], - i, - col = col[i], - cex = .75, - font = 4 - ) - } - } - } -) \ No newline at end of file diff --git a/R/DIscBIO-generic-pseudoTimeOrdering.R b/R/DIscBIO-generic-pseudoTimeOrdering.R new file mode 100644 index 0000000..bca289e --- /dev/null +++ b/R/DIscBIO-generic-pseudoTimeOrdering.R @@ -0,0 +1,61 @@ +#' @title Pseudo-time ordering +#' @description This function takes the exact output of exprmclust function and +#' construct Pseudo-time ordering by mapping all cells onto the path that +#' connects cluster centers. +#' @param object \code{DISCBIO} class object. +#' @param quiet if `TRUE`, suppresses intermediary output +#' @param export if `TRUE`, exports order table to csv +#' @param filename Name of the exported file (if `export=TRUE`) +#' @importFrom TSCAN TSCANorder +#' @return The DISCBIO-class object input with the kordering slot filled. +setGeneric("pseudoTimeOrdering", function( + object, + quiet=FALSE, + export=FALSE, + filename="Cellular_pseudo-time_ordering" + ) + { + standardGeneric("pseudoTimeOrdering") + } +) + +#' @export +#' @rdname pseudoTimeOrdering +setMethod( + "pseudoTimeOrdering", + signature = "DISCBIO", + definition = function(object, quiet, export, filename) { + # ====================================================================== + # Validating + # ====================================================================== + ran_k <- length(object@kmeans$kpart) > 0 + ran_m <- length(object@MBclusters) > 0 + if (ran_k) { + Obj <- object@fdata + Names <- object@cpart + lpsmclust <- Exprmclust(Obj, K = 4, reduce = F, cluster = Names) + lpsorder <- TSCANorder(lpsmclust) + } else if (ran_m) { + Obj <- object@fdata + Names <- names(object@MBclusters$clusterid) + lpsmclust <- object@MBclusters + lpsorder <- TSCANorder(lpsmclust) + } else { + stop("run clustexp before this pseudoTimeOrdering") + } + # ====================================================================== + # Ordering + # ====================================================================== + sampleNames <- colnames(Obj) + orderID <- lpsorder + order <- c(1:length(lpsorder)) + orderTable <- data.frame(order, orderID) + if (export) write.csv(orderTable, file = paste0(filename, ".csv")) + if (!quiet) print(orderTable) + FinalOrder <- orderTable[match(sampleNames, orderTable$orderID), ] + out_order <- FinalOrder[, 1] + names(out_order) <- names(Names) + object@kordering <- out_order + return(object) + } +) \ No newline at end of file diff --git a/R/J48DT.R b/R/J48DT.R index 3618d07..38e516d 100644 --- a/R/J48DT.R +++ b/R/J48DT.R @@ -8,29 +8,8 @@ #' @param plot If `FALSE`, suppresses plot output #' @importFrom RWeka J48 #' @importFrom graphics plot -#' @importFrom partykit as.party -#' @importFrom grid gpar #' @return Information about the J48 model and, by default, a plot of the #' decision tree. -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Clustexp(sc, cln=3) # K-means clustering -#' sc <- comptSNE(sc, max_iter=100) -#' cdiff <- DEGanalysis2clust( -#' sc, Clustering="K-means", K=3, fdr=.2, name="Name", First="CL1", -#' Second="CL2", export=FALSE -#' ) -#' sigDEG <- cdiff[[1]] -#' DATAforDT <- ClassVectoringDT( -#' sc, Clustering="K-means", K=3, First="CL1", Second="CL2", sigDEG, -#' ) -#' J48DT(DATAforDT) J48DT <- function(data, quiet = FALSE, plot = TRUE) { msg <- NULL if (!is.data.frame(data)) { @@ -49,19 +28,7 @@ J48DT <- function(data, quiet = FALSE, plot = TRUE) { exp.df <- as.data.frame(t(data)) classVector <- factor(colnames(data)) j48.model <- J48(classVector ~ ., exp.df) - if (!quiet) - print(j48.model) - if (plot) { - plot( - as.party(j48.model), - gp = gpar( - cex = 0.65, - col = "black", - lty = "solid", - lwd = 1.5, - fontsize = 12 - ) - ) - } + if (!quiet) print(j48.model) + if (plot) plot(j48.model) return(j48.model) } \ No newline at end of file diff --git a/R/J48DTeval.R b/R/J48DTeval.R index 3ccbf70..bdd33c8 100644 --- a/R/J48DTeval.R +++ b/R/J48DTeval.R @@ -12,26 +12,6 @@ #' @param quiet If `TRUE`, suppresses intermediary output #' @importFrom stats predict #' @return Statistics about the J48 model -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Clustexp(sc, cln=3) # K-means clustering -#' sc <- comptSNE(sc, max_iter=100) -#' cdiff <- DEGanalysis2clust( -#' sc, Clustering="K-means", K=3, fdr=.2, name="Name", First="CL1", -#' Second="CL2", export=FALSE -#' ) -#' sigDEG <- cdiff[[1]] -#' DATAforDT <- ClassVectoringDT( -#' sc, Clustering="K-means", K=3, First="CL1", Second="CL2", sigDEG -#' ) -#' J48DTeval(DATAforDT, num.folds=10, First="CL1", Second="CL2") - J48DTeval <- function( data, num.folds = 10, First = "CL1", Second = "CL2", quiet = FALSE ) @@ -46,8 +26,7 @@ J48DTeval <- function( #Start cross validation loop class1 <- levels(class.vec)[1] for (fold in 1:length(segments)) { - if (!quiet) - cat("Fold", fold, "of", length(segments), "\n") + if (!quiet) message("Fold ", fold, " of ", length(segments)) #Define training and test set test.ind <- segments[[fold]] training.set <- exp.df[-test.ind, ] @@ -67,25 +46,7 @@ J48DTeval <- function( } else{ stop("Unknown classification algorithm") } - #Evaluate model on test set - eval.pred <- function(pred.class, true.class, class1, - performance) { - for (index in 1:length(pred.class)) { - pred <- pred.class[index] - true <- true.class[index] - if (pred == true && true == class1) { - performance["TP"] <- performance["TP"] + 1 - } else if (pred != true && true == class1) { - performance["FN"] <- performance["FN"] + 1 - } else if (pred != true && true != class1) { - performance["FP"] <- performance["FP"] + 1 - } else if (pred == true && true != class1) { - performance["TN"] <- performance["TN"] + 1 - } - } - return(performance) - } performance <- eval.pred( pred.class, test.class, class1, performance ) @@ -113,45 +74,17 @@ J48DTeval <- function( ) colnames(j48.confusion.matrix) <- c(First, Second) if (!quiet) print(j48.confusion.matrix) - - SN <- function(con.mat) { - TP <- con.mat[1, 1] - FN <- con.mat[2, 1] - return(TP / (TP + FN)) - } - SP <- function(con.mat) { - TN <- con.mat[2, 2] - FP <- con.mat[1, 2] - return(TN / (TN + FP)) - } - ACC <- function(con.mat) { - TP <- con.mat[1, 1] - FN <- con.mat[2, 1] - TN <- con.mat[2, 2] - FP <- con.mat[1, 2] - return((TP + TN) / (TP + FN + TN + FP)) - } - MCC <- function(con.mat) { - TP <- con.mat[1, 1] - FN <- con.mat[2, 1] - TN <- con.mat[2, 2] - FP <- con.mat[1, 2] - denom <- sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) - denom <- ifelse(denom == 0, NA, denom) - return((TP * TN - FP * FN) / denom) - } j48.sn <- SN(j48.confusion.matrix) j48.sp <- SP(j48.confusion.matrix) j48.acc <- ACC(j48.confusion.matrix) j48.mcc <- MCC(j48.confusion.matrix) if (!quiet) { - cat( + message( "J48 SN: ", j48.sn, "\n", "J48 SP: ", j48.sp, "\n", "J48 ACC: ", j48.acc, "\n", "J48 MCC: ", j48.mcc, "\n", - sep = "" ) } return(j48.performance) diff --git a/R/Jaccard.R b/R/Jaccard.R index b4fd7ea..4b1efb2 100644 --- a/R/Jaccard.R +++ b/R/Jaccard.R @@ -15,10 +15,6 @@ #' @importFrom boot boot #' @importFrom graphics barplot box #' @return A plot of the mean Jaccard similarity coefficient per cluster. -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- Clustexp(sc, cln=3, quiet=TRUE) # K-means clustering -#' Jaccard(sc, Clustering="K-means", K=3) Jaccard <- function( object, Clustering = "K-means", @@ -51,6 +47,7 @@ Jaccard <- function( } else if (Clustering == "MB") { target_col <- object@MBclusters$clusterid } + # TODO: replace with in-house code to eliminate boot package dependency results <- boot( data = object@fdata[, which(target_col == i)], statistic = JS, diff --git a/R/MB_Order.R b/R/MB_Order.R deleted file mode 100644 index 00805e1..0000000 --- a/R/MB_Order.R +++ /dev/null @@ -1,47 +0,0 @@ -#' @title Pseudo-time ordering based on Model-based clusters -#' @description This function takes the exact output of exprmclust function and -#' construct Pseudo-time ordering by mapping all cells onto the path that -#' connects cluster centers. -#' @export -#' @param object \code{DISCBIO} class object. -#' @param quiet if `TRUE`, intermediary output is suppressed -#' @param export if `TRUE`, exports the results as a CSV file -#' @importFrom TSCAN TSCANorder -#' @return The DISCBIO-class object input with the MBordering slot filled. -#' @examples -#' \dontrun{ -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Exprmclust(sc, K=2) -#' sc <- comptsneMB(sc, max_iter=100) -#' sc <- Clustexp(sc, cln=3) -#' sc <- MB_Order(sc, export = FALSE) -#' sc@MBordering -#' } -MB_Order <- function(object, quiet = FALSE, export = TRUE) { - data = object@MBclusters - lpsorderMB <- TSCANorder(data) - Names <- names(object@MBclusters$clusterid) - sampleNames <- colnames(object@fdata) - orderID <- lpsorderMB - order <- c(1:length(lpsorderMB)) - orderTableMB <- data.frame(order, orderID) - if (export) { - nm <- "Cellular_pseudo-time_ordering_based_on_Model-based_clusters.csv" - write.csv(orderTableMB, file = nm) - } - if (!quiet) { - print(orderTableMB) - } - FinalOrder <- - orderTableMB[match(sampleNames, orderTableMB$orderID), ] - MBordering <- FinalOrder[, 1] - names(MBordering) <- names(Names) - object@MBordering <- MBordering - return(object) -} \ No newline at end of file diff --git a/R/NetAnalysis.R b/R/NetAnalysis.R index 0083f4d..896a225 100644 --- a/R/NetAnalysis.R +++ b/R/NetAnalysis.R @@ -11,26 +11,9 @@ #' average.path.length get.adjacency V E mean_distance betweenness #' @importFrom NetIndices GenInd #' @return A network analysis table -#' @examples -#' \dontrun{ -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF") -#' sc <- Clustexp(sc, cln=3) # K-means clustering -#' sc <- comptSNE(sc, max_iter=100) -#' dff <- DEGanalysis2clust(sc, Clustering="K-means", K=3, fdr=0.1, name="Name") -#' DEGs <- dff[[2]][1, 6] -#' data <- read.csv(file=paste0(DEGs),head=TRUE,sep=",") -#' data <- data[,3] -#' FileName <- paste0(DEGs) -#' ppi <- PPI(data, FileName) -#' NetAnalysis(ppi) -#' } -NetAnalysis <- function(data, export = TRUE, FileName = "1") { +NetAnalysis <- function( + data, export = FALSE, FileName = "NetworkAnalysisTable-1" +) { if (length(data[, 1]) < 1) stop("No Protein-Protein Interactions") df <- data[, -c(1, 2)] @@ -46,24 +29,18 @@ NetAnalysis <- function(data, export = TRUE, FileName = "1") { AnalysisTable <- cbind(names, degree.table, betweenness.table) if (export) { - write.csv( - AnalysisTable, - file = paste0("NetworkAnalysisTable-", FileName, ".csv") + write.csv(AnalysisTable, file = paste0(FileName, ".csv") ) } test.graph.adj <- get.adjacency(gg, sparse = FALSE) test.graph.properties <- GenInd(test.graph.adj) - cat("Number of nodes: ", test.graph.properties$N, "\n") - V(gg) - cat("Number of links: ", test.graph.properties$Ltot, "\n") - E(gg) - cat("Link Density: ", test.graph.properties$LD, "\n") - cat("The connectance of the graph: ", - test.graph.properties$C, - "\n") - cat("Mean Distences", mean_distance(gg), "\n") - cat("Average Path Length", average.path.length(gg), "\n", "\n") + message("Number of nodes: ", test.graph.properties$N) + message("Number of links: ", test.graph.properties$Ltot) + message("Link Density: ", test.graph.properties$LD) + message("The connectance of the graph: ", test.graph.properties$C) + message("Mean Distences", mean_distance(gg)) + message("Average Path Length", average.path.length(gg), "\n") AnalysisTable <- AnalysisTable[order(AnalysisTable[, 2], decreasing = TRUE), ] return(AnalysisTable) diff --git a/R/Networking.R b/R/Networking.R index 609617e..818106e 100644 --- a/R/Networking.R +++ b/R/Networking.R @@ -3,7 +3,8 @@ #' @export #' @param data A gene list. #' @param FileName A string vector showing the name to be used to save the -#' resulted network. +#' resulted network. If `NULL`, the network will be saved to a temporary +#' directory #' @param species The taxonomy name/id. Default is "9606" for Homo sapiens. #' @param plot_width Plot width #' @param plot_height Plot height @@ -12,29 +13,12 @@ #' @importFrom png readPNG #' @importFrom graphics plot rasterImage #' @return A plot of the network -#' @examples -#' \dontrun{ -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF") -#' sc <- Clustexp(sc, cln=3) # K-means clustering -#' sc <- comptSNE(sc, max_iter=100) -#' dff <- DEGanalysis2clust(sc, Clustering="K-means", K=3, fdr=0.1, name="Name") -#' DEGs <- dff[[2]][1, 6] -#' data <- read.csv(file=paste0(DEGs),head=TRUE,sep=",") -#' data <- data[,3] -#' FileName <- paste0(DEGs) -#' ppi <- PPI(data, FileName) -#' networking <- NetAnalysis(ppi) -#' FileName <- "Up.DownDEG" -#' Networking(data, FileName) -#' } Networking <- function( - data, FileName, species = "9606", plot_width = 25, plot_height = 15 + data, + FileName = NULL, + species = "9606", + plot_width = 25, + plot_height = 15 ) { if (length(data) > 600) { @@ -62,24 +46,33 @@ Networking <- function( species ) ) - cat( - "Examine response components =", status_code(repos), "\t", - "200 means successful", "\n" + message( + "Examine response components =", + status_code(repos), + "\t", + "(200 means successful)", ) - y = repos$request$url - download.file(y, paste0("network", FileName, ".png"), mode = 'wb') - Network <- - readPNG(paste0("network", FileName, ".png"), native = TRUE) + y <- repos$request$url + if (!is.null(FileName)) { + FileName <- paste0("network", FileName, ".png") + } else { + FileName <- tempfile() + } + download.file(y, FileName, mode = 'wb') + Network <- readPNG(FileName, native=TRUE) set_plot_dimensions <- function(width_choice, height_choice) { - options(repr.plot.width = width_choice, - repr.plot.height = height_choice) + opar <- options( + repr.plot.width = width_choice, + repr.plot.height = height_choice + ) + on.exit(options(opar)) } set_plot_dimensions(plot_width, plot_height) plot(0:1, 0:1, type = "n", ann = FALSE, axes = FALSE) rasterImage(Network, 0, 0, 1, 1) - cat( + message( "\n", "You can see the network with high resolution", "by clicking on the following link:", diff --git a/R/PPI.R b/R/PPI.R index 724eabc..b00c188 100644 --- a/R/PPI.R +++ b/R/PPI.R @@ -4,31 +4,14 @@ #' @export #' @param data A gene list. #' @param FileName A string vector showing the name to be used to save the -#' resulted table. +#' resulted table. If null, no file will be exported #' @param species The taxonomy name/id. Default is "9606" for Homo sapiens. #' @importFrom httr content #' @importFrom readr read_tsv -#' @return A TSV file stored in the user's file system and its corresponding -#' `data.frame` object in R. -#' @examples -#' \dontrun{ -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF") -#' sc <- Clustexp(sc, cln=3) # K-means clustering -#' sc <- comptSNE(sc, max_iter=100) -#' dff <- DEGanalysis2clust(sc, Clustering="K-means", K=3, fdr=0.1, name="Name") -#' DEGs <- dff[[2]][1, 6] -#' data <- read.csv(file=paste0(DEGs),head=TRUE,sep=",") -#' data <- data[,3] -#' FileName <- paste0(DEGs) -#' PPI(data, FileName) -#' } -PPI <- function(data, FileName, species = "9606") { +#' @return Either a TSV file stored in the user's file system and its +#' corresponding `data.frame` object in R or and R object containing that +#' information. +PPI <- function(data, FileName = NULL, species = "9606") { # Save base enpoint as variable string_api_url <- "https://string-db.org/api/" output_format <- "tsv" #"json", "tsv-no-header", "tsv", "xml" @@ -50,16 +33,15 @@ PPI <- function(data, FileName, species = "9606") { species ) ) - cat( - "Examine response components =", - status_code(repos), - "\t", - "200 means successful", - "\n" + message( + "Examine response components = ", status_code(repos), "\t", + "(200 means successful)", "\n" ) # Process API request content repo_content <- content(repos) - results <- read_tsv(repo_content) - write.csv(results, file = paste0("PPI-", FileName, ".csv")) + results <- read_tsv(repo_content) + if (!is.null(FileName)) { + write.csv(results, file = paste0("PPI-", FileName, ".csv")) + } return(results) } \ No newline at end of file diff --git a/R/PlotMBexpPCA.R b/R/PlotMBexpPCA.R deleted file mode 100644 index 30d98f2..0000000 --- a/R/PlotMBexpPCA.R +++ /dev/null @@ -1,88 +0,0 @@ -#' @title Plotting gene expression in Model-based clustering in PCA. -#' @description The PCA representation can also be used to show the gene -#' expression of a particular gene. -#' @param object \code{DISCBIO} class object. -#' @param g Individual gene name or vector with a group of gene names -#' corresponding to a subset of valid row names of the \code{ndata} slot of -#' the \code{DISCBIO} object. -#' @param n String of characters representing the title of the plot. Default is -#' NULL and the first element of \code{g} is chosen. -#' @importFrom RColorBrewer brewer.pal -#' @importFrom grDevices colorRampPalette -#' @importFrom graphics layout par image -#' @return A plot of the PCA. -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata(sc) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Exprmclust(sc, K=2, reduce=TRUE, quiet=TRUE) -#' g <- "ENSG00000010244" # Plotting the expression of MT-RNR2 -#' PlotMBexpPCA(sc, g) -#' @export - -PlotMBexpPCA <- function(object, g, n = NULL) { - if (length(intersect(g, rownames(object@ndata))) < length(unique(g))) - stop( - "second argument does not correspond to set of rownames slot", - "ndata of SCseq object" - ) - if (is.null(n)) - n <- g[1] - data = object@MBclusters - #Expression<-cbind(data$pcareduceres,object@ndata[g,]) - l <- apply(object@ndata[g,] - .1, 2, sum) + .1 - #l <- Expression[,3] - mi <- min(l, na.rm = TRUE) - ma <- max(l, na.rm = TRUE) - ColorRamp <- - colorRampPalette(rev(brewer.pal(n = 11, name = "RdYlBu")))(100) - ColorLevels <- seq(mi, ma, length = length(ColorRamp)) - v <- round((l - mi) / (ma - mi) * 99 + 1, 0) - layout( - matrix( - data = c(1, 3, 2, 4), - nrow = 2, - ncol = 2 - ), - widths = c(5, 1, 5, 1), - heights = c(5, 1, 1, 1) - ) - par(mar = c(5, 5, 2.5, 2)) - plot( - data$pcareduceres[, 1], - data$pcareduceres[, 2], - xlab = "PC1", - ylab = "PC2", - pch = 20, - cex = 0, - col = "grey", - las = 1, - main = n - ) - for (k in 1:length(v)) { - points( - data$pcareduceres[k, 1], - data$pcareduceres[k, 2], - col = ColorRamp[v[k]], - pch = 20, - cex = 2 - ) - } - par(mar = c(3, 2.5, 2.5, 2)) - image( - 1, - ColorLevels, - matrix( - data = ColorLevels, - ncol = length(ColorLevels), - nrow = 1 - ), - col = ColorRamp, - xlab = "", - ylab = "", - las = 2, - xaxt = "n" - ) - layout(1) -} \ No newline at end of file diff --git a/R/PlotMBorderPCA.R b/R/PlotMBorderPCA.R deleted file mode 100644 index 3e7e103..0000000 --- a/R/PlotMBorderPCA.R +++ /dev/null @@ -1,78 +0,0 @@ -#' @title Plotting pseudo-time ordering in Model-based clustering in PCA. -#' @description The PCA representation can also be used to show the pseudo-time -#' ordering. -#' @param object \code{DISCBIO} class object. -#' @importFrom RColorBrewer brewer.pal -#' @importFrom grDevices colorRampPalette -#' @importFrom graphics layout par image -#' @export -#' @return A plot of the PCA. -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Exprmclust(sc, K=3) -#' sc <- Clustexp(sc, cln=3) -#' sc <- MB_Order(sc, export = FALSE) -#' PlotMBorderPCA(sc) - -PlotMBorderPCA <- function(object) { - data = object@MBclusters - MBordertable <- cbind(data$pcareduceres, object@MBordering) - l <- MBordertable[, 3] - mi <- min(l, na.rm = TRUE) - ma <- max(l, na.rm = TRUE) - ColorRamp <- - colorRampPalette(rev(brewer.pal(n = 11, name = "RdYlBu")))(100) - ColorLevels <- seq(mi, ma, length = length(ColorRamp)) - v <- round((l - mi) / (ma - mi) * 99 + 1, 0) - layout( - matrix( - data = c(1, 3, 2, 4), - nrow = 2, - ncol = 2 - ), - widths = c(5, 1, 5, 1), - heights = c(5, 1, 1, 1) - ) - par(mar = c(5, 5, 2.5, 2)) - plot( - MBordertable[, 1], - MBordertable[, 2], - xlab = "PC1", - ylab = "PC2", - pch = 20, - cex = 0, - col = "grey", - las = 1 - ) - for (k in 1:length(v)) { - points( - MBordertable[k, 1], - MBordertable[k, 2], - col = ColorRamp[v[k]], - pch = 20, - cex = 2 - ) - } - par(mar = c(3, 2.5, 2.5, 2)) - image( - 1, - ColorLevels, - matrix( - data = ColorLevels, - ncol = length(ColorLevels), - nrow = 1 - ), - col = ColorRamp, - xlab = "", - ylab = "", - las = 2, - xaxt = "n" - ) - layout(1) -} \ No newline at end of file diff --git a/R/PlotMBpca.R b/R/PlotMBpca.R new file mode 100644 index 0000000..17fb30d --- /dev/null +++ b/R/PlotMBpca.R @@ -0,0 +1,100 @@ +#' @title Plotting pseudo-time ordering or gene expression in Model-based clustering in PCA +#' @description The PCA representation can either be used to show pseudo-time ordering or the gene expression of a particular gene. +#' @param object \code{DISCBIO} class object. +#' @param type either `order` to plot pseudo-time ordering or `exp` to plot gene expression +#' @param g Individual gene name or vector with a group of gene names +#' corresponding to a subset of valid row names of the \code{ndata} slot of +#' the \code{DISCBIO} object. Ignored if `type="order"`. +#' @param n String of characters representing the title of the plot. Default is +#' NULL and the first element of \code{g} is chosen. Ignored if +#' `type="order"`. +#' @importFrom RColorBrewer brewer.pal +#' @importFrom grDevices colorRampPalette +#' @importFrom graphics layout par image +#' @return A plot of the PCA. +#' @export + +PlotMBpca <- function(object, type="order", g=NULL, n=NULL) { + # ========================================================================== + # Validation + # ========================================================================== + data <- object@MBclusters + if (type == "exp") { + if (is.null(g)) { + stop('g must be provided if type="exp"') + } + if (length(intersect(g, rownames(object@ndata))) < length(unique(g))) { + stop( + "second argument does not correspond to set of rownames slot", + "ndata of SCseq object" + ) + } + if (is.null(n)) { + n <- g[1] + } + l <- apply(object@ndata[g,] - .1, 2, sum) + .1 + x <- data$pcareduceres + } else if (type == "order") { + MBordertable <- cbind(data$pcareduceres, object@MBordering) + l <- MBordertable[, 3] + x <- MBordertable + } else { + stop("Invalid type. Valid alternatives as 'order' and 'exp'.") + } + # ========================================================================== + # Plotting + # ========================================================================== + mi <- min(l, na.rm = TRUE) + ma <- max(l, na.rm = TRUE) + ColorRamp <- colorRampPalette(rev(brewer.pal(n = 11, name = "RdYlBu")))(100) + ColorLevels <- seq(mi, ma, length = length(ColorRamp)) + v <- round((l - mi) / (ma - mi) * 99 + 1, 0) + layout( + matrix( + data = c(1, 3, 2, 4), + nrow = 2, + ncol = 2 + ), + widths = c(5, 1, 5, 1), + heights = c(5, 1, 1, 1) + ) + opar <- par(mar = c(5, 5, 2.5, 2)) + on.exit(par(opar)) + plot( + x[, 1], + x[, 2], + xlab = "PC1", + ylab = "PC2", + pch = 20, + cex = 0, + col = "grey", + las = 1, + main = n + ) + for (k in 1:length(v)) { + points( + x[k, 1], + x[k, 2], + col = ColorRamp[v[k]], + pch = 20, + cex = 2 + ) + } + opar <- par(mar = c(3, 2.5, 2.5, 2)) + on.exit(par(opar)) + image( + 1, + ColorLevels, + matrix( + data = ColorLevels, + ncol = length(ColorLevels), + nrow = 1 + ), + col = ColorRamp, + xlab = "", + ylab = "", + las = 2, + xaxt = "n" + ) + layout(1) +} \ No newline at end of file diff --git a/R/RpartDT.R b/R/RpartDT.R index 269c0aa..3a19521 100644 --- a/R/RpartDT.R +++ b/R/RpartDT.R @@ -10,25 +10,6 @@ #' @importFrom rpart.plot rpart.plot #' @return Information about the model and, by default, a plot of the decision #' tree. -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Clustexp(sc, cln=3) # K-means clustering -#' sc <- comptSNE(sc, max_iter=100) -#' cdiff <- DEGanalysis2clust( -#' sc, Clustering="K-means", K=3, fdr=.2, name="Name", First="CL1", -#' Second="CL2", export=FALSE -#' ) -#' sigDEG <- cdiff[[1]] -#' DATAforDT <- ClassVectoringDT( -#' sc, Clustering="K-means", K=3, First="CL1", Second="CL2", sigDEG, -#' ) -#' RpartDT(DATAforDT) RpartDT <- function(data, quiet = FALSE, plot = TRUE) { exp.df <- as.data.frame(t(data)) classVector <- factor(colnames(data)) diff --git a/R/RpartEVAL.R b/R/RpartEVAL.R index 3f10349..e32be89 100644 --- a/R/RpartEVAL.R +++ b/R/RpartEVAL.R @@ -12,26 +12,6 @@ #' @param quiet If `TRUE`, suppresses intermediary output #' @importFrom stats predict #' @return Performance statistics of the model -#' @examples -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -#' sc <- Clustexp(sc, cln=3) # K-means clustering -#' sc <- comptSNE(sc, max_iter=100) -#' cdiff <- DEGanalysis2clust( -#' sc, Clustering="K-means", K=3, fdr=.2, name="Name", First="CL1", -#' Second="CL2", export=FALSE -#' ) -#' sigDEG <- cdiff[[1]] -#' DATAforDT <- ClassVectoringDT( -#' sc, Clustering="K-means", K=3, First="CL1", Second="CL2", sigDEG, -#' ) -#' RpartEVAL(DATAforDT,num.folds=10,First="CL1",Second="CL2") - RpartEVAL <- function(data, num.folds = 10, First = "CL1", Second = "CL2", quiet = FALSE) { exp.imput.df <- as.data.frame(t(data)) @@ -43,7 +23,7 @@ RpartEVAL <- function(data, num.folds = 10, First = "CL1", Second = "CL2", #Start cross validation loop class1 <- levels(class.vec)[1] for (fold in 1:length(segments)) { - if (!quiet) cat("Fold", fold, "of", length(segments), "\n") + if (!quiet) message("Fold ", fold, " of ", length(segments)) #Define training and test set test.ind <- segments[[fold]] training.set <- exp.df[-test.ind, ] @@ -65,24 +45,6 @@ RpartEVAL <- function(data, num.folds = 10, First = "CL1", Second = "CL2", stop("Unknown classification algorithm") } #Evaluate model on test set - - eval.pred <- function(pred.class, true.class, class1, - performance) { - for (index in 1:length(pred.class)) { - pred <- pred.class[index] - true <- true.class[index] - if (pred == true && true == class1) { - performance["TP"] <- performance["TP"] + 1 - } else if (pred != true && true == class1) { - performance["FN"] <- performance["FN"] + 1 - } else if (pred != true && true != class1) { - performance["FP"] <- performance["FP"] + 1 - } else if (pred == true && true != class1) { - performance["TN"] <- performance["TN"] + 1 - } - } - return(performance) - } performance <- eval.pred( pred.class, test.class, class1, performance ) @@ -112,46 +74,17 @@ RpartEVAL <- function(data, num.folds = 10, First = "CL1", Second = "CL2", ) colnames(Rpart.confusion.matrix) <- c(First, Second) if (!quiet) print(Rpart.confusion.matrix) - - SN <- function(con.mat) { - TP <- con.mat[1, 1] - FN <- con.mat[2, 1] - return(TP / (TP + FN)) - } - SP <- function(con.mat) { - TN <- con.mat[2, 2] - FP <- con.mat[1, 2] - return(TN / (TN + FP)) - } - ACC <- function(con.mat) { - TP <- con.mat[1, 1] - FN <- con.mat[2, 1] - TN <- con.mat[2, 2] - FP <- con.mat[1, 2] - return((TP + TN) / (TP + FN + TN + FP)) - } - MCC <- function(con.mat) { - TP <- con.mat[1, 1] - FN <- con.mat[2, 1] - TN <- con.mat[2, 2] - FP <- con.mat[1, 2] - denom <- sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) - denom <- ifelse(denom == 0, NA, denom) - return((TP * TN - FP * FN) / denom) - } - Rpart.sn <- SN(Rpart.confusion.matrix) Rpart.sp <- SP(Rpart.confusion.matrix) Rpart.acc <- ACC(Rpart.confusion.matrix) Rpart.mcc <- MCC(Rpart.confusion.matrix) if (!quiet) { - cat( + message( "Rpart SN: ", Rpart.sn, "\n", "Rpart SP: ", Rpart.sp, "\n", "Rpart ACC: ", Rpart.acc, "\n", "Rpart MCC: ", Rpart.mcc, "\n", - sep = "" ) } return(Rpart.performance) diff --git a/R/VolcanoPlot.R b/R/VolcanoPlot.R index 53201f0..c1c8398 100644 --- a/R/VolcanoPlot.R +++ b/R/VolcanoPlot.R @@ -8,40 +8,24 @@ #' Default is 0.05 #' @param fc A numeric value of the fold change. Default is 0.5. #' @param FS A numeric value of the font size. Default is 0.4. -#' @param name A string vector showing the name to be used to save the resulted -#' tables. -#' @importFrom samr samr samr.compute.delta.table samr.plot -#' samr.compute.siggenes.table +#' @param name A string vector showing the name to be used on the plot title #' @importFrom graphics title #' @importFrom utils write.csv -#' @importFrom calibrate textxy #' @return A volcano plot #' @export -#' @examples -#' \dontrun{ -#' sc <- DISCBIO(valuesG1msReduced) -#' sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -#' sc <- Normalizedata( -#' sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, -#' dsn=1, rseed=17000 -#' ) -#' sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF") -#' sc <- Clustexp(sc, cln=3) # K-means clustering -#' sc <- comptSNE(sc, max_iter=100) -#' dff <- DEGanalysis2clust(sc, Clustering="K-means", K=3, fdr=0.1, name="Name") -#' name <- dff[[2]][1, 6] -#' U <- read.csv(file = paste0(name), head=TRUE, sep=",") -#' VolcanoPlot(U, value=0.05, name=name, adj=FALSE, FS=.4) -#' } -VolcanoPlot <- function(object, value = 0.05, name, fc = 0.5, FS = .4) { +VolcanoPlot <- function(object, value = 0.05, name = NULL, fc = 0.5, FS = .4) { if (length(object[1, ]) > 8) { object <- object[, -1] } NO0 <- object[, 8] - NO0 <- NO0[-which(NO0 == 0)] + NO0 <- NO0[which(NO0 != 0)] w <- which.min(NO0) adjV <- NO0[w] / 100 - object[, 8] <- ifelse(object[, 8] == 0, adjV, object[, 8]) + object[, 8] <- ifelse(object[, 8] == 0 & length(adjV) > 0, adjV, object[, 8]) + if (all(object[, 8] == 0)) { + message("All q-values are 0. Adjusting") + object[, 8] <- object[, 8] + 1e-10 + } with( object, plot( @@ -52,12 +36,12 @@ VolcanoPlot <- function(object, value = 0.05, name, fc = 0.5, FS = .4) { las = 1, xlab = "log2 Fold Change", ylab = "-log10 FDR", - sub = paste0("Volcano plot ", name), + sub = paste("Volcano plot", name), font.sub = 4, col.sub = "black" ) ) - FC <- subset(object, abs(object[, 7]) > fc) # Fold Change + FC <- subset(object, abs(object[, 7]) > fc) # Fold Change sigFC <- subset( object, object[, 8] < value & abs(object[, 7]) > fc ) # Significant genes @@ -76,7 +60,7 @@ VolcanoPlot <- function(object, value = 0.05, name, fc = 0.5, FS = .4) { col = "blue" )) with(sigFC, - textxy( + text( abs(sigFC[, 7]), -log10(sigFC[, 8]), labs = sigFC[, 2], @@ -84,24 +68,6 @@ VolcanoPlot <- function(object, value = 0.05, name, fc = 0.5, FS = .4) { col = "blue" ) ) - add_legend <- function(...) { - opar <- par( - fig = c(0, 1, 0, 1), - oma = c(0, 0, 0, 0), - mar = c(0, 0, 0, 0), - new = TRUE - ) - on.exit(par(opar)) - plot( - 0, - 0, - type = 'n', - bty = 'n', - xaxt = 'n', - yaxt = 'n' - ) - legend(...) - } add_legend( "topleft", legend = c( diff --git a/R/customConverters.R b/R/customConverters.R index 37709cb..ac1e328 100644 --- a/R/customConverters.R +++ b/R/customConverters.R @@ -25,7 +25,6 @@ customConvertFeats <- function(x, verbose = TRUE) { # uncomment myDict <- DIscBIO::HumanMouseGeneIds - # myDict <- HumanMouseGeneIds myDict <- myDict[!is.na(myDict[, "ENSEMBL"]), ] # @@ -80,13 +79,6 @@ customConvertFeats <- function(x, verbose = TRUE) { #' #' @export #' -#' @examples -#' g1_sce <- SingleCellExperiment::SingleCellExperiment( -#' list(counts=as.matrix(valuesG1msReduced)) -#' ) -#' g1_disc <- as.DISCBIO(g1_sce) -#' class(g1_disc) -#' as.DISCBIO <- function(x, ...) { if ("Seurat" %in% class(x)) { # Get Arguments and parse out what we want @@ -155,13 +147,13 @@ as.DISCBIO <- function(x, ...) { #' @return a SingleCellExperiment-class object #' #' @export -#' #' @examples -#' g1_disc <- DISCBIO(valuesG1msReduced) +#' g1_disc <- DISCBIO(valuesG1msTest) #' class(g1_disc) #' g1_sce <- DISCBIO2SingleCellExperiment(g1_disc) #' class(g1_sce) #' +#' DISCBIO2SingleCellExperiment <- function(x) { return(x@SingleCellExperiment) } diff --git a/R/datasets.R b/R/datasets.R index 0424018..00391e4 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -1,12 +1,12 @@ #' @title Single-cells data from a myxoid liposarcoma cell line #' -#' @description A dataset of 30 single cells from a myxoid liposarcoma cell -#' line. Columns refer to samples and rows refer to genes. The last 92 rows +#' @description A sample of single cells from a myxoid liposarcoma cell +#' line. Columns refer to samples and rows refer to genes. The last rows #' refer to external RNA controls consortium (ERCC) spike-ins. This dataset is #' part of a larger dataset containing 94 single cells. The complete dataset #' is fully compatible with this package and an rda file can be obtained at #' https://github.com/ocbe-uio/DIscBIO/blob/dev/data/valuesG1ms.rda -#' @name valuesG1msReduced +#' @name valuesG1msTest #' @docType data NULL @@ -18,10 +18,6 @@ NULL #' @source Data were imported, modified, and formatted from the #' Mus.musculus (ver 1.3.1) and the Homo.sapiens (ver 1.3.1) #' BioConductor libraries. -#' -#' @examples -#' data(HumanMouseGeneIds) -#' print(HumanMouseGeneIds[1:6,]) #' @name HumanMouseGeneIds #' @docType data -NULL +NULL \ No newline at end of file diff --git a/R/internal-functions.R b/R/internal-functions.R new file mode 100644 index 0000000..5534575 --- /dev/null +++ b/R/internal-functions.R @@ -0,0 +1,272 @@ +#' @importFrom fpc clusterboot cluster.stats calinhara dudahart2 +clustfun <- function( + x, + clustnr = 20, + bootnr = 50, + metric = "pearson", + do.gap = TRUE, + SE.method = "Tibs2001SEmax", + SE.factor = .25, + B.gap = 50, + cln = 0, + rseed = rseed, + quiet = FALSE +) { + if (clustnr < 2) stop("Choose clustnr > 1") + di <- dist.gen(t(x), method = metric) + if (do.gap | cln > 0) { + gpr <- NULL + if (do.gap) { + set.seed(rseed) + gpr <- clusGap( + as.matrix(di), + FUNcluster = kmeans, + K.max = clustnr, + B = B.gap, + verbose = !quiet + ) + if (cln == 0) { + cln <- maxSE( + gpr$Tab[, 3], + gpr$Tab[, 4], + method = SE.method, + SE.factor + ) + } + } + if (cln <= 1) { + clb <- list( + result = list(partition = rep(1, dim(x)[2])), + bootmean = 1 + ) + names(clb$result$partition) <- names(x) + return(list(x = x, clb = clb, gpr = gpr, di = di)) + } + clb <- clusterboot( + di, + B = bootnr, + distances = FALSE, + bootmethod = "boot", + clustermethod = KmeansCBI, + krange = cln, + scaling = FALSE, + multipleboot = FALSE, + bscompare = TRUE, + seed = rseed, + count = !quiet + ) + return(list(x = x, clb = clb,gpr = gpr, di = di)) + } +} + +Kmeansruns <- function ( + data, + krange = 2: 10, + criterion = "ch", + iter.max = 100, + runs = 100, + scaledata = FALSE, + alpha = 0.001, + critout = FALSE, + plot = FALSE, + method = "euclidean", + ... +) { + data <- as.matrix(data) + if (criterion == "asw") sdata <- dist(data) + if (scaledata) data <- scale(data) + cluster1 <- 1 %in% krange + crit <- numeric(max(krange)) + km <- list() + for (k in krange) { + if (k > 1) { + minSS <- Inf + kmopt <- NULL + for (i in 1:runs) { + opar <- options(show.error.messages = FALSE) + on.exit(options(opar)) + repeat { + kmm <- try(kmeans(data, k)) + if (!is(kmm, "try-error")) break + } + opar <- options(show.error.messages = TRUE) + on.exit(options(opar)) + swss <- sum(kmm$withinss) + if (swss < minSS) { + kmopt <- kmm + minSS <- swss + } + if (plot) { + opar <- par(ask = TRUE) + on.exit(par(opar)) + pairs(data, col = kmm$cluster, main = swss) + } + } + km[[k]] <- kmopt + crit[k] <- switch( + criterion, + asw = cluster.stats(sdata, km[[k]]$cluster)$avg.silwidth, + ch = calinhara(data, km[[k]]$cluster) + ) + if (critout) { + message(k, " clusters ", crit[k], "\n") + } + } + } + if (cluster1) { + cluster1 <- dudahart2( data, km[[2]]$cluster, alpha = alpha)$cluster1 + } + k.best <- which.max(crit) + if (cluster1) k.best <- 1 + km[[k.best]]$crit <- crit + km[[k.best]]$bestk <- k.best + out <- km[[k.best]] + return(out) +} + +KmeansCBI <- function (data, + krange, + k = NULL, + scaling = FALSE, + runs = 1, + criterion = "ch", + method = "euclidean", + ... +) { + if (!is.null(k)) krange <- k + if (!identical(scaling, FALSE)) { + sdata <- scale(data, center = TRUE, scale = scaling) + } else { + sdata <- data + } + c1 <- Kmeansruns( + sdata, + krange, + runs = runs, + criterion = criterion, + method = method, + ... + ) + partition <- c1$cluster + cl <- list() + nc <- krange + for (i in 1:nc) cl[[i]] <- partition == i + out <- list( + result = c1, + nc = nc, + clusterlist = cl, + partition = partition, + clustermethod = "kmeans" + ) + return(out) +} + +dist.gen <- function(x, method = "euclidean") { + if (method %in% c("spearman", "pearson", "kendall")) { + as.dist(1 - cor(t(x), method = method)) + } else { + dist(x, method = method) + } +} + +binompval <- function(p, N, n) { + pval <- pbinom(n, round(N, 0), p, lower.tail = TRUE) + filter <- !is.na(pval) & pval > 0.5 + pval[filter] <- 1 - pval[filter] + return(pval) +} + +add_legend <- function(...) { + opar <- par( + fig = c(0, 1, 0, 1), + oma = c(0, 0, 0, 0), + mar = c(0, 0, 0, 0), + new = TRUE + ) + on.exit(par(opar)) + plot( + x = 0, + y = 0, + type = 'n', + bty = 'n', + xaxt = 'n', + yaxt = 'n' + ) + legend(...) +} + +downsample <- function(x, n, dsn) { + x <- round(x[, apply(x, 2, sum, na.rm = TRUE) >= n], 0) + nn <- min(apply(x, 2, sum)) + for (j in 1:dsn) { + z <- data.frame(GENEID = rownames(x)) + rownames(z) <- rownames(x) + initv <- rep(0, nrow(z)) + for (i in 1:dim(x)[2]) { + y <- aggregate( + rep(1, nn), list(sample( + rep(rownames(x), x[, i]), nn + )), + sum + ) + na <- names(x)[i] + names(y) <- c("GENEID", na) + rownames(y) <- y$GENEID + z[, na] <- initv + k <- intersect(rownames(z), y$GENEID) + z[k, na] <- y[k, na] + z[is.na(z[, na]), na] <- 0 + } + rownames(z) <- as.vector(z$GENEID) + ds <- if (j == 1) z[, -1] else ds + z[, -1] + } + ds <- ds / dsn + .1 + return(ds) +} + +eval.pred <- function(pred.class, true.class, class1, performance) { + for (index in 1:length(pred.class)) { + pred <- pred.class[index] + true <- true.class[index] + if (pred == true && true == class1) { + performance["TP"] <- performance["TP"] + 1 + } else if (pred != true && true == class1) { + performance["FN"] <- performance["FN"] + 1 + } else if (pred != true && true != class1) { + performance["FP"] <- performance["FP"] + 1 + } else if (pred == true && true != class1) { + performance["TN"] <- performance["TN"] + 1 + } + } + return(performance) +} + +SN <- function(con.mat) { + TP <- con.mat[1, 1] + FN <- con.mat[2, 1] + return(TP / (TP + FN)) +} + +SP <- function(con.mat) { + TN <- con.mat[2, 2] + FP <- con.mat[1, 2] + return(TN / (TN + FP)) +} + +ACC <- function(con.mat) { + TP <- con.mat[1, 1] + FN <- con.mat[2, 1] + TN <- con.mat[2, 2] + FP <- con.mat[1, 2] + return((TP + TN) / (TP + FN + TN + FP)) +} + +MCC <- function(con.mat) { + TP <- con.mat[1, 1] + FN <- con.mat[2, 1] + TN <- con.mat[2, 2] + FP <- con.mat[1, 2] + denom <- sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) + denom <- ifelse(denom == 0, NA, denom) + return((TP * TN - FP * FN) / denom) +} \ No newline at end of file diff --git a/R/prepExampleDataset.R b/R/prepExampleDataset.R new file mode 100644 index 0000000..336a1ff --- /dev/null +++ b/R/prepExampleDataset.R @@ -0,0 +1,50 @@ +#' @title Prepare Example Dataset +#' @description Internal function that prepares a pre-treated dataset for use in +#' several examples +#' @param dataset Dataset used for transformation +#' @param save save results? +#' @details This function serves the purpose of treating datasets such as +#' valuesG1msReduced to reduce examples of other functions by bypassing some +#' analysis steps covered in the vignettes. +#' @return Two rda files, ones for K-means clustering and another for +#' Model-based clustering. +#' @author Waldir Leoncio +prepExampleDataset <- function(dataset, save=TRUE) { + # ========================================================================== + # Initial data treatment + # ========================================================================== + message("Treating dataset") + sc <- DISCBIO(dataset) + sc <- NoiseFiltering( + sc, percentile=0.9, CV=0.2, export=FALSE, plot=FALSE, quiet=TRUE + ) + sc <- Normalizedata(sc) + sc <- FinalPreprocessing(sc, export=FALSE, quiet=TRUE) + # ========================================================================== + # Clustering + # ========================================================================== + message("K-means clustering") + sc_k <- Clustexp(sc, cln=3, quiet=TRUE) + sc_k <- comptSNE(sc_k, quiet=TRUE) + valuesG1msReduced_treated_K <- sc_k + message("Model-based clustering") + sc_mb <- Exprmclust(sc, quiet=TRUE) + sc_mb <- comptSNE(sc_mb, rseed=15555, quiet=TRUE) + valuesG1msReduced_treated_MB <- sc_mb + # ========================================================================== + # Output + # ========================================================================== + message("Saving datasets") + if (save) { + save( + valuesG1msReduced_treated_K, + file = "data/valuesG1msReduced_treated_K.rda" + ) + save( + valuesG1msReduced_treated_MB, + file = "data/valuesG1msReduced_treated_MB.rda" + ) + } else { + message("Not saving dataset because (save == FALSE)") + } +} \ No newline at end of file diff --git a/R/reformatSiggenes.R b/R/reformatSiggenes.R new file mode 100644 index 0000000..49e28fd --- /dev/null +++ b/R/reformatSiggenes.R @@ -0,0 +1,29 @@ +#' @title Reformat Siggenes Table +#' @description Reformats the Siggenes table output from the SAMR package +#' @param table output from `samr::samr.compute.siggenes.table` +#' @seealso replaceDecimals +reformatSiggenes <- function(table) { + if (is.null(table)) return(table) + table <- as.data.frame(table) + # ========================================================================== + # Replacing decimal separators + # ========================================================================== + table[, "Score(d)"] <- replaceDecimals(table[, "Score(d)"]) + table[, "Numerator(r)"] <- replaceDecimals(table[, "Numerator(r)"]) + table[, "Denominator(s+s0)"] <- replaceDecimals(table[, "Denominator(s+s0)"]) + table[, "Fold Change"] <- replaceDecimals(table[, "Fold Change"]) + table[, "q-value(%)"] <- replaceDecimals(table[, "q-value(%)"]) + # ========================================================================== + # Changing vector classes + # ========================================================================== + table[, "Row"] <- as.numeric(table[, "Row"]) + table[, "Score(d)"] <- as.numeric(table[, "Score(d)"]) + table[, "Numerator(r)"] <- as.numeric(table[, "Numerator(r)"]) + table[, "Denominator(s+s0)"] <- as.numeric(table[, "Denominator(s+s0)"]) + table[, "Fold Change"] <- as.numeric(table[, "Fold Change"]) + table[, "q-value(%)"] <- as.numeric(table[, "q-value(%)"]) + # ========================================================================== + # Returning output + # ========================================================================== + return(table) +} \ No newline at end of file diff --git a/R/replaceDecimals.R b/R/replaceDecimals.R new file mode 100644 index 0000000..6da146a --- /dev/null +++ b/R/replaceDecimals.R @@ -0,0 +1,12 @@ +#' @title Replace Decimals +#' @description Replaces decimals separators between comma and periods on a +#' character vector +#' @note This function was especially designed to be used with retormatSiggenes +#' @param x vector of characters +#' @param from decimal separator on input file +#' @param to decimal separator for output file +#' @seealso reformatSiggenes +replaceDecimals <- function(x, from=",", to=".") { + x <- gsub(",", ".", x) + return(x) +} \ No newline at end of file diff --git a/R/retrieveBiomart.R b/R/retrieveBiomart.R deleted file mode 100644 index e43aa5f..0000000 --- a/R/retrieveBiomart.R +++ /dev/null @@ -1,123 +0,0 @@ -#' @title Retrieve data from BioMart -#' @description uses functions from the biomaRt package to retrieve dataframes -#' from the BioMart Database -#' @details Since the BioMart database is not always accessible, this function -#' envelops the requests to the database in a set of tryCatch functions to -#' allow for multiple queries and easier feedback to the end user -#' @param gene_name gene signature -#' @param quiet if `TRUE`, suppresses messages -#' @param max_tries maximum number of times the function will try to reach the -#' database -#' @importFrom biomaRt useDataset useMart useEnsembl getBM -#' @return data.frame resulting from a successful call to getBM. -retrieveBiomart <- function(gene_name, quiet = FALSE, max_tries = 3) { - # Generates a Mart object - if (!quiet) - message("Retrieving mart object. Please wait.") - mart <- tryCatch({ - useDataset("hsapiens_gene_ensembl", useMart("ensembl")) - }, - error = function(err) { - message( - "The Mart object could not be retrieved.", - "Will try again using a different function.", - "Here is the original error message:\n", - err - ) - return(NULL) - }) - if (is.null(mart)) { - mart <- tryCatch({ - useDataset("hsapiens_gene_ensembl", useEnsembl("ensembl")) - }, - error = function(err) { - message(err) - stop( - "The Mart object still could not be retrieved. ", - "The server may be down. Please try again later." - ) - return(NULL) - }) - } - - # Retrieve BioMart dataframe - if (!quiet) { - message( - "Done.\n", - "Accessing the BioMart database.", - "This operation may take a few minutes. Please wait." - ) - } - G_list <- NULL - tries <- 1 - while (is.null(G_list) & tries <= max_tries) { - G_list <- tryCatch({ - if (quiet) { - suppressMessages( - getBM( - filters = "ensembl_gene_id", - attributes = c("ensembl_gene_id", "hgnc_symbol"), - values = gene_name, - mart = mart, - useCache = FALSE - ) - ) - } else { - getBM( - filters = "ensembl_gene_id", - attributes = c("ensembl_gene_id", "hgnc_symbol"), - values = gene_name, - mart = mart, - useCache = FALSE - ) - } - }, - error = function(err) { - if (!quiet) { - message( - "The BioMart database could not be reached. ", - "Retrying (", - tries, - "/", - max_tries, - "). ", - "Here is the original error message:\n", - err - ) - } - tries <- tries + 1 - return(NULL) - }, - warning = function(warn) { - if (!quiet) { - message( - "The BioMart database could not be reached. ", - "Retrying (", - tries, - "/", - max_tries, - "). ", - "Here is the original warning:\n", - warn - ) - } - tries <- tries + 1 - return(NULL) - }) - } - - # Giving up or returning output - if (is.null(G_list) & tries > 3) { - stop( - "The BioMart database could not be reached after", - max_tries, - "tries. ", - "Please try again later." - ) - } else { - if (!quiet) - message("Done.") - } - - return(G_list) - } \ No newline at end of file diff --git a/R/samr-adapted.R b/R/samr-adapted.R new file mode 100644 index 0000000..a03c1aa --- /dev/null +++ b/R/samr-adapted.R @@ -0,0 +1,2119 @@ +# This script contains customized versions of functions found in the samr package. This is necessary because samr seems to have been abandoned, so an upstream collaboration doesn't seem possible at the time of writing. + +# ============================================================================== +# Constants +# ============================================================================== +samr.const.twoclass.unpaired.response <- "Two class unpaired" +samr.const.twoclass.paired.response <- "Two class paired" +samr.const.oneclass.response <- "One class" +samr.const.quantitative.response <- "Quantitative" +samr.const.multiclass.response <- "Multiclass" +samr.const.twoclass.unpaired.timecourse.response <- "Two class unpaired timecourse" +samr.const.twoclass.paired.timecourse.response <- "Two class paired timecourse" +samr.const.oneclass.timecourse.response <- "One class timecourse" +samr.const.survival.response <- "Survival" +samr.const.patterndiscovery.response <- "Pattern discovery" + +# ============================================================================== +# Functions +# ============================================================================== + +#' @title Significance analysis of microarrays +#' @description This function is an adaptation of `samr::samr` +#' @param data Data object with components x- p by n matrix of features, one observation per column (missing values allowed); y- n-vector of outcome measurements; censoring.status- n-vector of censoring censoring.status (1= died or event occurred, 0=survived, or event was censored), needed for a censored survival outcome +#' @param resp.type Problem type: "Quantitative" for a continuous parameter (Available for both array and sequencing data); "Two class unpaired" (for both array and sequencing data); "Survival" for censored survival outcome (for both array and sequencingdata); "Multiclass": more than 2 groups (for both array and sequencing data); "One class" for a single group (only for array data); "Two class paired" for two classes with paired observations (for both array and sequencing data); "Two class unpaired timecourse" (only for array data), "One class time course" (only for array data), "Two class.paired timecourse" (only for array data), or "Pattern discovery" (only for array data) +#' @param assay.type Assay type: "array" for microarray data, "seq" for counts from sequencing +#' @param s0 Exchangeability factor for denominator of test statistic; Default is automatic choice. Only used for array data. +#' @param s0.perc Percentile of standard deviation values to use for s0; default is automatic choice; -1 means s0=0 (different from s0.perc=0, meaning s0=zeroeth percentile of standard deviation values= min of sd values. Only used for array data. +#' @param nperms Number of permutations used to estimate false discovery rates +#' @param center.arrays Should the data for each sample (array) be median centered at the outset? Default =FALSE. Only used for array data. +#' @param testStatistic Test statistic to use in two class unpaired case.Either "standard" (t-statistic) or ,"wilcoxon" (Two-sample wilcoxon or Mann-Whitney test). Only used for array data. +#' @param time.summary.type Summary measure for each time course: "slope", or"signed.area"). Only used for array data. +#' @param regression.method Regression method for quantitative case: "standard",(linear least squares) or "ranks" (linear least squares on ranked data). Only used for array data. +#' @param return.x Should the matrix of feature values be returned? Only useful for time course data, where x contains summaries of the features over time. Otherwise x is the same as the input data data\$x +#' @param knn.neighbors Number of nearest neighbors to use for imputation of missing features values. Only used for array data. +#' @param random.seed Optional initial seed for random number generator (integer) +#' @param nresamp For assay.type="seq", number of resamples used to construct test statistic. Default 20. Only used for sequencing data. +#' @param nresamp.perm For assay.type="seq", number of resamples used to construct test statistic for permutations. Default is equal to nresamp and it must be at most nresamp. Only used for sequencing data. +#' @param xl.mode Used by Excel interface +#' @param xl.time Used by Excel interface +#' @param xl.prevfit Used by Excel interface +#' @importFrom impute impute.knn +sammy <- function (data, resp.type = c("Quantitative", "Two class unpaired", + "Survival", "Multiclass", "One class", "Two class paired", + "Two class unpaired timecourse", "One class timecourse", + "Two class paired timecourse", "Pattern discovery"), assay.type = c("array", + "seq"), s0 = NULL, s0.perc = NULL, nperms = 100, center.arrays = FALSE, + testStatistic = c("standard", "wilcoxon"), time.summary.type = c("slope", + "signed.area"), regression.method = c("standard", "ranks"), + return.x = FALSE, knn.neighbors = 10, random.seed = NULL, + nresamp = 20, nresamp.perm = NULL, xl.mode = c("regular", + "firsttime", "next20", "lasttime"), xl.time = NULL, xl.prevfit = NULL) +{ + this.call = match.call() + resp.type.arg = match.arg(resp.type) + assay.type = match.arg(assay.type) + xl.mode = match.arg(xl.mode) + set.seed(random.seed) + if (is.null(nresamp.perm)) nresamp.perm <- nresamp + nresamp.perm = min(nresamp, nresamp.perm) + if (xl.mode == "regular" | xl.mode == "firsttime") { + x = NULL + xresamp = NULL + ttstar0 = NULL + evo = NULL + ystar = NULL + sdstar.keep = NULL + censoring.status = NULL + sdstar = NULL + pi0 = NULL + stand.contrasts = NULL + stand.contrasts.star = NULL + stand.contrasts.95 = NULL + foldchange = NULL + foldchange.star = NULL + perms = NULL + permsy = NULL + eigengene = NULL + eigengene.number = NULL + testStatistic <- match.arg(testStatistic) + time.summary.type <- match.arg(time.summary.type) + regression.method <- match.arg(regression.method) + x = data$x + y = data$y + argy = y + if (!is.null(data$eigengene.number)) { + eigengene.number = data$eigengene.number + } + if (sum(is.na(x)) > 0) { + x = impute.knn(x, k = knn.neighbors) + if (!is.matrix(x)) { + x = x$data + } + } + are.blocks.specified = FALSE + cond = (resp.type == "One class") | (resp.type == "Two class unpaired timecourse") | + (resp.type == "One class unpaired timecourse") | + (resp.type == "Two class paired timecourse") | (resp.type == + "Pattern discovery") + if (assay.type == "seq" & cond) { + stop(paste("Resp.type=", resp.type, " not allowed when assay.type='seq'")) + } + if (assay.type == "seq" & min(x) < 0) { + stop(paste("Negative values not allowed when assay.type='seq'")) + } + if (assay.type == "seq" & (sum(x%%1 != 0) != 0)) { + stop("Non-integer values not alled when assay.type='seq'") + } + if (assay.type == "seq" & center.arrays) { + stop(paste("Centering not allowed when assay.type='seq'")) + } + if (assay.type == "seq" & regression.method == "ranks") { + stop(paste("regression.method==ranks not allowed when assay.type='seq'")) + } + if (center.arrays) { + x <- scale(x, center = apply(x, 2, median), scale = FALSE) + } + depth = scaling.factors = rep(NA, ncol(x)) + scaling.factors = (prod(depth)^(1/length(depth)))/depth + if (assay.type == "seq") { + message("Estimating sequencing depths...") + depth = samr.estimate.depth(x) + message("Resampling to get new data matrices...") + xresamp = resa(x, depth, nresamp = nresamp) + } + scaling.factors = (prod(depth)^(1/length(depth)))/depth + if (resp.type == samr.const.twoclass.unpaired.response) { + if (substring(y[1], 2, 6) == "Block" | substring(y[1], + 2, 6) == "block") { + junk = parse.block.labels.for.2classes(y) + y = junk$y + blocky = junk$blocky + are.blocks.specified = TRUE + } + } + if (resp.type == samr.const.twoclass.unpaired.response | + resp.type == samr.const.twoclass.paired.response | + resp.type == samr.const.oneclass.response | resp.type == + samr.const.quantitative.response | resp.type == samr.const.multiclass.response) { + y = as.numeric(y) + } + sd.internal = NULL + if (resp.type == samr.const.twoclass.unpaired.timecourse.response | + resp.type == samr.const.twoclass.paired.timecourse.response | + resp.type == samr.const.oneclass.timecourse.response) { + junk = parse.time.labels.and.summarize.data(x, y, + resp.type, time.summary.type) + y = junk$y + x = junk$x + sd.internal = sqrt(rowMeans(junk$sd^2)) + if (min(table(y)) == 1) { + warning( + "Only one timecourse in one or more classes;\n", + "SAM plot and FDRs will be unreliable;", + "only gene scores are informative" + ) + } + } + if (resp.type == samr.const.twoclass.unpaired.timecourse.response) { + resp.type = samr.const.twoclass.unpaired.response + } + if (resp.type == samr.const.twoclass.paired.timecourse.response) { + resp.type = samr.const.twoclass.paired.response + } + if (resp.type == samr.const.oneclass.timecourse.response) { + resp.type = samr.const.oneclass.response + } + stand.contrasts = NULL + stand.contrasts.95 = NULL + if (resp.type == samr.const.survival.response) { + censoring.status = data$censoring.status + } + check.format(y, resp.type = resp.type, censoring.status = censoring.status) + if (resp.type == samr.const.quantitative.response & regression.method == + "ranks") { + y = rank(y) + x = t(apply(x, 1, rank)) + } + n <- nrow(x) + ny <- length(y) + sd <- NULL + numer <- NULL + if (resp.type == samr.const.twoclass.unpaired.response & + testStatistic == "standard" & assay.type == "array") { + init.fit <- ttest.func(x, y, sd = sd.internal) + numer <- init.fit$numer + sd <- init.fit$sd + } + if (resp.type == samr.const.twoclass.unpaired.response & + testStatistic == "wilcoxon" & assay.type == "array") { + init.fit <- wilcoxon.func(x, y) + numer <- init.fit$numer + sd <- init.fit$sd + } + if (resp.type == samr.const.oneclass.response & assay.type == + "array") { + init.fit <- onesample.ttest.func(x, y, sd = sd.internal) + numer <- init.fit$numer + sd <- init.fit$sd + } + if (resp.type == samr.const.twoclass.paired.response & + assay.type == "array") { + init.fit <- paired.ttest.func(x, y, sd = sd.internal) + numer <- init.fit$numer + sd <- init.fit$sd + } + if (resp.type == samr.const.survival.response & assay.type == + "array") { + init.fit <- cox.func(x, y, censoring.status) + numer <- init.fit$numer + sd <- init.fit$sd + } + if (resp.type == samr.const.multiclass.response & assay.type == + "array") { + init.fit <- multiclass.func(x, y) + numer <- init.fit$numer + sd <- init.fit$sd + } + if (resp.type == samr.const.quantitative.response & assay.type == + "array") { + init.fit <- quantitative.func(x, y) + numer <- init.fit$numer + sd <- init.fit$sd + } + if (resp.type == samr.const.patterndiscovery.response & + assay.type == "array") { + init.fit <- patterndiscovery.func(x) + numer <- init.fit$numer + sd <- init.fit$sd + } + if ((resp.type == samr.const.quantitative.response & + (testStatistic == "wilcoxon" | regression.method == + "ranks" & assay.type == "array") | resp.type == + samr.const.patterndiscovery.response) | resp.type == + samr.const.twoclass.unpaired.response & assay.type == + "array" & testStatistic == "wilcoxon" | (nrow(x) < + 500) & is.null(s0) & is.null(s0.perc)) { + s0 = quantile(sd, 0.05) + s0.perc = 0.05 + } + if (is.null(s0) & assay.type == "array") { + if (!is.null(s0.perc)) { + if ((s0.perc != -1 & s0.perc < 0) | s0.perc > + 100) { + stop("Illegal value for s0.perc: must be between 0 and 100, or equal\nto (-1) (meaning that s0 should be set to zero)") + } + if (s0.perc == -1) { + s0 = 0 + } + if (s0.perc >= 0) { + s0 <- quantile(init.fit$sd, s0.perc/100) + } + } + if (is.null(s0.perc)) { + s0 = est.s0(init.fit$tt, init.fit$sd)$s0.hat + s0.perc = 100 * sum(init.fit$sd < s0)/length(init.fit$sd) + } + } + if (assay.type == "seq") { + s0 = 0 + s0.perc = 0 + } + if (resp.type == samr.const.twoclass.unpaired.response & + testStatistic == "standard" & assay.type == "array") { + tt <- ttest.func(x, y, s0 = s0, sd = sd.internal)$tt + } + if (resp.type == samr.const.twoclass.unpaired.response & + testStatistic == "wilcoxon" & assay.type == "array") { + tt <- wilcoxon.func(x, y, s0 = s0)$tt + } + if (resp.type == samr.const.oneclass.response & assay.type == + "array") { + tt <- onesample.ttest.func(x, y, s0 = s0, sd = sd.internal)$tt + } + if (resp.type == samr.const.twoclass.paired.response & + assay.type == "array") { + tt <- paired.ttest.func(x, y, s0 = s0, sd = sd.internal)$tt + } + if (resp.type == samr.const.survival.response & assay.type == + "array") { + tt <- cox.func(x, y, censoring.status, s0 = s0)$tt + } + if (resp.type == samr.const.multiclass.response & assay.type == + "array") { + junk2 <- multiclass.func(x, y, s0 = s0) + tt = junk2$tt + stand.contrasts = junk2$stand.contrasts + } + if (resp.type == samr.const.quantitative.response & assay.type == + "array") { + tt <- quantitative.func(x, y, s0 = s0)$tt + } + if (resp.type == samr.const.patterndiscovery.response & + assay.type == "array") { + junk <- patterndiscovery.func(x, s0 = s0, eigengene.number = eigengene.number) + tt <- junk$tt + eigengene = junk$eigengene + } + if (resp.type == samr.const.twoclass.unpaired.response & + assay.type == "seq") { + junk = wilcoxon.unpaired.seq.func(xresamp, y) + tt = junk$tt + numer = junk$numer + sd = junk$sd + } + if (resp.type == samr.const.twoclass.paired.response & + assay.type == "seq") { + junk <- wilcoxon.paired.seq.func(xresamp, y) + tt = junk$tt + numer = junk$numer + sd = junk$sd + } + if (resp.type == samr.const.quantitative.response & assay.type == + "seq") { + junk <- quantitative.seq.func(xresamp, y) + tt = junk$tt + numer = junk$numer + sd = junk$sd + } + if (resp.type == samr.const.survival.response & assay.type == + "seq") { + junk <- cox.seq.func(xresamp, y, censoring.status) + tt = junk$tt + numer = junk$numer + sd = junk$sd + } + if (resp.type == samr.const.multiclass.response & assay.type == + "seq") { + junk2 <- multiclass.seq.func(xresamp, y) + tt = junk2$tt + numer = junk2$numer + sd = junk2$sd + stand.contrasts = junk2$stand.contrasts + } + if (resp.type == samr.const.quantitative.response | resp.type == + samr.const.multiclass.response | resp.type == samr.const.survival.response) { + junk <- getperms(y, nperms) + perms = junk$perms + all.perms.flag = junk$all.perms.flag + nperms.act = junk$nperms.act + } + if (resp.type == samr.const.twoclass.unpaired.response) { + if (are.blocks.specified) { + junk = compute.block.perms(y, blocky, nperms) + permsy = matrix(junk$permsy, ncol = length(y)) + all.perms.flag = junk$all.perms.flag + nperms.act = junk$nperms.act + } + else { + junk <- getperms(y, nperms) + permsy = matrix(y[junk$perms], ncol = length(y)) + all.perms.flag = junk$all.perms.flag + nperms.act = junk$nperms.act + } + } + if (resp.type == samr.const.oneclass.response) { + if ((length(y) * log(2)) < log(nperms)) { + allii = 0:((2^length(y)) - 1) + nperms.act = 2^length(y) + all.perms.flag = 1 + } + else { + nperms.act = nperms + all.perms.flag = 0 + } + permsy = matrix(NA, nrow = nperms.act, ncol = length(y)) + if (all.perms.flag == 1) { + k = 0 + for (i in allii) { + junk = integer.base.b(i, b = 2) + if (length(junk) < length(y)) { + junk = c(rep(0, length(y) - length(junk)), + junk) + } + k = k + 1 + permsy[k, ] = y * (2 * junk - 1) + } + } + else { + for (i in 1:nperms.act) { + permsy[i, ] = sample(c(-1, 1), size = length(y), + replace = TRUE) + } + } + } + if (resp.type == samr.const.twoclass.paired.response) { + junk = compute.block.perms(y, abs(y), nperms) + permsy = junk$permsy + all.perms.flag = junk$all.perms.flag + nperms.act = junk$nperms.act + } + if (resp.type == samr.const.patterndiscovery.response) { + nperms.act = nperms + perms = NULL + permsy = NULL + all.perms.flag = FALSE + } + sdstar.keep <- NULL + if (assay.type != "seq") { + sdstar.keep <- matrix(0, ncol = nperms.act, nrow = nrow(x)) + } + ttstar <- matrix(0, nrow = nrow(x), ncol = nperms.act) + foldchange.star = NULL + if (resp.type == samr.const.twoclass.unpaired.response | + resp.type == samr.const.twoclass.paired.response) { + foldchange.star <- matrix(0, nrow = nrow(x), ncol = nperms.act) + } + if (resp.type == samr.const.multiclass.response) { + stand.contrasts.star = array(NA, c(nrow(x), length(table(y)), + nperms.act)) + } + } + if (xl.mode == "next20" | xl.mode == "lasttime") { + evo = xl.prevfit$evo + tt = xl.prevfit$tt + numer = xl.prevfit$numer + eigengene = xl.prevfit$eigengene + eigengene.number = xl.prevfit$eigengene.number + sd = xl.prevfit$sd - xl.prevfit$s0 + sd.internal = xl.prevfit$sd.internal + ttstar = xl.prevfit$ttstar + ttstar0 = xl.prevfit$ttstar0 + n = xl.prevfit$n + pi0 = xl.prevfit$pi0 + foldchange = xl.prevfit$foldchange + y = xl.prevfit$y + x = xl.prevfit$x + xresamp = xl.prevfit$xresamp + censoring.status = xl.prevfit$censoring.status + argy = xl.prevfit$argy + testStatistic = xl.prevfit$testStatistic + foldchange.star = xl.prevfit$foldchange.star + s0 = xl.prevfit$s0 + s0.perc = xl.prevfit$s0.perc + resp.type = xl.prevfit$resp.type + resp.type.arg = xl.prevfit$resp.type.arg + assay.type = xl.prevfit$assay.type + sdstar.keep = xl.prevfit$sdstar.keep + resp.type = xl.prevfit$resp.type + stand.contrasts = xl.prevfit$stand.contrasts + stand.contrasts.star = xl.prevfit$stand.contrasts.star + stand.contrasts.95 = xl.prevfit$stand.contrasts.95 + perms = xl.prevfit$perms + permsy = xl.prevfit$permsy + nperms = xl.prevfit$nperms + nperms.act = xl.prevfit$nperms.act + all.perms.flag = xl.prevfit$all.perms.flag + depth = xl.prevfit$depth + scaling.factors = xl.prevfit$scaling.factors + nresamp = xl.prevfit$nresamp + nresamp.perm = xl.prevfit$nresamp.perm + } + if (xl.mode == "regular") { + first = 1 + last = nperms.act + } + if (xl.mode == "firsttime") { + first = 1 + last = 1 + } + if (xl.mode == "next20") { + first = xl.time + last = min(xl.time + 19, nperms.act - 1) + } + if (xl.mode == "lasttime") { + first = nperms.act + last = nperms.act + } + for (b in first:last) { + message(c("perm = ", b)) + if (assay.type == "array") { + xstar <- x + } + if (assay.type == "seq") { + xstar <- xresamp[, , 1:nresamp.perm] + } + if (resp.type == samr.const.oneclass.response) { + ystar = permsy[b, ] + if (testStatistic == "standard") { + ttstar[, b] <- onesample.ttest.func(xstar, ystar, + s0 = s0, sd = sd.internal)$tt + } + } + if (resp.type == samr.const.twoclass.paired.response) { + ystar = permsy[b, ] + if (assay.type == "array") { + ttstar[, b] <- paired.ttest.func(xstar, ystar, + s0 = s0, sd = sd.internal)$tt + foldchange.star[, b] = foldchange.paired(xstar, + ystar, data$logged2) + } + if (assay.type == "seq") { + ttstar[, b] <- wilcoxon.paired.seq.func(xstar, + ystar)$tt + foldchange.star[, b] <- foldchange.seq.twoclass.paired(x, + ystar, depth) + } + } + if (resp.type == samr.const.twoclass.unpaired.response) { + ystar = permsy[b, ] + if (assay.type == "array") { + if (testStatistic == "standard") { + junk <- ttest.func(xstar, ystar, s0 = s0, sd = sd.internal) + } + if (testStatistic == "wilcoxon") { + junk <- wilcoxon.func(xstar, ystar, s0 = s0) + } + ttstar[, b] <- junk$tt + sdstar.keep[, b] <- junk$sd + foldchange.star[, b] = foldchange.twoclass(xstar, + ystar, data$logged2) + } + if (assay.type == "seq") { + ttstar[, b] <- wilcoxon.unpaired.seq.func(xstar, + ystar)$tt + foldchange.star[, b] <- foldchange.seq.twoclass.unpaired(x, + ystar, depth) + } + } + if (resp.type == samr.const.survival.response) { + o <- perms[b, ] + if (assay.type == "array") { + ttstar[, b] <- cox.func(xstar, y[o], censoring.status = censoring.status[o], + s0 = s0)$tt + } + if (assay.type == "seq") { + ttstar[, b] <- cox.seq.func(xstar, y[o], censoring.status = censoring.status[o])$tt + } + } + if (resp.type == samr.const.multiclass.response) { + ystar = y[perms[b, ]] + if (assay.type == "array") { + junk <- multiclass.func(xstar, ystar, s0 = s0) + ttstar[, b] <- junk$tt + sdstar.keep[, b] <- junk$sd + stand.contrasts.star[, , b] = junk$stand.contrasts + } + if (assay.type == "seq") { + junk <- multiclass.seq.func(xstar, ystar) + ttstar[, b] <- junk$tt + stand.contrasts.star[, , b] <- junk$stand.contrasts + } + } + if (resp.type == samr.const.quantitative.response) { + ystar = y[perms[b, ]] + if (assay.type == "array") { + junk <- quantitative.func(xstar, ystar, s0 = s0) + ttstar[, b] <- junk$tt + sdstar.keep[, b] <- junk$sd + } + if (assay.type == "seq") { + junk <- quantitative.seq.func(xstar, ystar) + ttstar[, b] <- junk$tt + } + } + if (resp.type == samr.const.patterndiscovery.response) { + xstar = permute.rows(x) + junk <- patterndiscovery.func(xstar, s0 = s0, eigengene.number = eigengene.number) + ttstar[, b] <- junk$tt + sdstar.keep[, b] <- junk$sd + } + } + if (xl.mode == "regular" | xl.mode == "lasttime") { + ttstar0 <- ttstar + for (j in 1:ncol(ttstar)) { + ttstar[, j] <- -1 * sort(-1 * ttstar[, j]) + } + for (i in 1:nrow(ttstar)) { + ttstar[i, ] <- sort(ttstar[i, ]) + } + evo <- apply(ttstar, 1, mean) + evo <- evo[length(evo):1] + sdstar <- sdstar.keep + pi0 = 1 + if (resp.type != samr.const.multiclass.response) { + qq <- quantile(ttstar, c(0.25, 0.75)) + } + if (resp.type == samr.const.multiclass.response) { + qq <- quantile(ttstar, c(0, 0.5)) + } + pi0 <- sum(tt > qq[1] & tt < qq[2])/(0.5 * length(tt)) + foldchange = NULL + if (resp.type == samr.const.twoclass.unpaired.response & + assay.type == "array") { + foldchange = foldchange.twoclass(x, y, data$logged2) + } + if (resp.type == samr.const.twoclass.paired.response & + assay.type == "array") { + foldchange = foldchange.paired(x, y, data$logged2) + } + if (resp.type == samr.const.oneclass.response & assay.type == + "array") { + } + stand.contrasts.95 = NULL + if (resp.type == samr.const.multiclass.response) { + stand.contrasts.95 = quantile(stand.contrasts.star, + c(0.025, 0.975)) + } + if (resp.type == samr.const.twoclass.unpaired.response & + assay.type == "seq") { + foldchange <- foldchange.seq.twoclass.unpaired(x, + y, depth) + } + if (resp.type == samr.const.twoclass.paired.response & + assay.type == "seq") { + foldchange <- foldchange.seq.twoclass.paired(x, y, + depth) + } + if (return.x == FALSE) { + x = NULL + } + } + return(list(n = n, x = x, xresamp = xresamp, y = y, argy = argy, + censoring.status = censoring.status, testStatistic = testStatistic, + nperms = nperms, nperms.act = nperms.act, tt = tt, numer = numer, + sd = sd + s0, sd.internal = sd.internal, s0 = s0, s0.perc = s0.perc, + evo = evo, perms = perms, permsy = permsy, nresamp = nresamp, + nresamp.perm = nresamp.perm, all.perms.flag = all.perms.flag, + ttstar = ttstar, ttstar0 = ttstar0, eigengene = eigengene, + eigengene.number = eigengene.number, pi0 = pi0, foldchange = foldchange, + foldchange.star = foldchange.star, sdstar.keep = sdstar.keep, + resp.type = resp.type, resp.type.arg = resp.type.arg, + assay.type = assay.type, stand.contrasts = stand.contrasts, + stand.contrasts.star = stand.contrasts.star, stand.contrasts.95 = stand.contrasts.95, + depth = depth, call = this.call)) +} + +#' @title Estimate sequencing depths +#' @param x data matrix. nrow=#gene, ncol=#sample +#' @return depth: estimated sequencing depth. a vector with len sample. +samr.estimate.depth <- function(x) { + iter <- 5 + cmeans <- colSums(x)/sum(x) + for (i in 1:iter) { + n0 <- rowSums(x) %*% t(cmeans) + prop <- rowSums((x - n0)^2/(n0 + 1e-08)) + qs <- quantile(prop, c(0.25, 0.75)) + keep <- (prop >= qs[1]) & (prop <= qs[2]) + cmeans <- colMeans(x[keep, ]) + cmeans <- cmeans/sum(cmeans) + } + depth <- cmeans/mean(cmeans) + return(depth) +} + +#' @title Resampling +#' @param x data matrix. nrow=#gene, ncol=#sample +#' @param d estimated sequencing depth +#' @param nresamp number of resamplings +#' @return xresamp: an rank array with dim #gene*#sample*nresamp +#' @description Corresponds to `samr::resample` +#' @importFrom stats rpois runif +resa <- function(x, d, nresamp = 20) { + ng <- nrow(x) + ns <- ncol(x) + dbar <- exp(mean(log(d))) + xresamp <- array(0, dim = c(ng, ns, nresamp)) + for (k in 1:nresamp) { + for (j in 1:ns) { + xresamp[, j, k] <- rpois(n = ng, lambda = (dbar/d[j]) * + x[, j]) + runif(ng) * 0.1 + } + } + for (k in 1:nresamp) { + xresamp[, , k] <- t(rankcols(t(xresamp[, , k]))) + } + return(xresamp) +} + +#' @title Rank columns +#' @description Ranks the elements within each col of the matrix x and returns +#' these ranks in a matrix +#' @note this function is equivalent to `samr::rankcol`, but uses `apply` to +#' rank the colums instead of a compiled Fortran function which was causing our +#' DEGanalysis functions to freeze in large datasets. +#' @param x x +rankcols <- function(x) { + # ranks the elements within each col of the matrix x + # and returns these ranks in a matrix + n = nrow(x) + p = ncol(x) + mode(n) = "integer" + mode(p) = "integer" + mode(x) = "double" + xr <- apply(x, 2, rank) + return(xr) +} + +#' @title Check format +#' @param y y +#' @param resp.type resp type +#' @param censoring.status censoring status +check.format <- function(y, resp.type, censoring.status = NULL) { + # here i do some format checks for the input data$y + # note that checks for time course data are done in the + # parse function for time course; + # we then check the output from the parser in this function + if (resp.type == samr.const.twoclass.unpaired.response | + resp.type == samr.const.twoclass.unpaired.timecourse.response) { + if (sum(y == 1) + sum(y == 2) != length(y)) { + stop(paste("Error in input response data: response type ", + resp.type, " specified; values must be 1 or 2")) + } + } + if (resp.type == samr.const.twoclass.paired.response | resp.type == + samr.const.twoclass.paired.timecourse.response) { + if (sum(y) != 0) { + stop(paste("Error in input response data: response type ", + resp.type, " specified; values must be -1, 1, -2, 2, etc")) + } + if (sum(table(y[y > 0]) != abs(table(y[y < 0])))) { + stop(paste("Error in input response data: response type ", + resp.type, " specified; values must be -1, 1, -2, 2, etc")) + } + } + if (resp.type == samr.const.oneclass.response | resp.type == + samr.const.oneclass.timecourse.response) { + if (sum(y == 1) != length(y)) { + stop(paste("Error in input response data: response type ", + resp.type, " specified; values must all be 1")) + } + } + if (resp.type == samr.const.multiclass.response) { + tt = table(y) + nc = length(tt) + if (sum(y <= nc & y > 0) < length(y)) { + stop(paste("Error in input response data: response type ", + resp.type, " specified; values must be 1,2, ... number of classes")) + } + for (k in 1:nc) { + if (sum(y == k) < 2) { + stop(paste("Error in input response data: response type ", + resp.type, " specified; there must be >1 sample per class")) + } + } + } + if (resp.type == samr.const.quantitative.response) { + if (!is.numeric(y)) { + stop(paste("Error in input response data: response type", + resp.type, " specified; values must be numeric")) + } + } + if (resp.type == samr.const.survival.response) { + if (is.null(censoring.status)) { + stop(paste("Error in input response data: response type ", + resp.type, " specified; error in censoring indicator")) + } + if (!is.numeric(y) | sum(y < 0) > 0) { + stop(paste("Error in input response data: response type ", + resp.type, " specified; survival times must be numeric and nonnegative")) + if (sum(censoring.status == 0) + sum(censoring.status == + 1) != length(censoring.status)) { + stop(paste("Error in input response data: response type ", + resp.type, " specified; censoring indicators must be 0 (censored) or 1 (failed)")) + } + } + if (sum(censoring.status == 1) < 1) { + stop(paste("Error in input response data: response type ", + resp.type, " specified; there are no uncensored observations")) + } + } + return() +} + +#' @title Twoclass Wilcoxon statistics +#' @param xresamp an rank array with dim #gene*#sample*nresamp +#' @param y outcome vector of values 1 and 2 +#' @return the statistic. +wilcoxon.unpaired.seq.func <- function(xresamp, y) { + tt <- rep(0, dim(xresamp)[1]) + for (i in 1:dim(xresamp)[3]) { + tt <- tt + rowSums(xresamp[, y == 2, i]) - sum(y == 2) * + (length(y) + 1)/2 + } + tt <- tt/dim(xresamp)[3] + return(list(tt = tt, numer = tt, sd = rep(1, length(tt)))) +} +wilcoxon.paired.seq.func <- function(xresamp, y) { + tt <- rep(0, dim(xresamp)[1]) + for (i in 1:dim(xresamp)[3]) { + tt <- tt + rowSums(xresamp[, y > 0, i]) - sum(y > 0) * + (length(y) + 1)/2 + } + tt <- tt/dim(xresamp)[3] + return(list(tt = tt, numer = tt, sd = rep(1, length(tt)))) +} +getperms = function(y, nperms) { + total.perms = factorial(length(y)) + if (total.perms <= nperms) { + perms = permute(1:length(y)) + all.perms.flag = 1 + nperms.act = total.perms + } + if (total.perms > nperms) { + perms = matrix(NA, nrow = nperms, ncol = length(y)) + for (i in 1:nperms) { + perms[i, ] = sample(1:length(y), size = length(y)) + } + all.perms.flag = 0 + nperms.act = nperms + } + return(list(perms = perms, all.perms.flag = all.perms.flag, + nperms.act = nperms.act)) +} +foldchange.twoclass = function(x, y, logged2) { + # if(logged2){x=2^x} + m1 <- rowMeans(x[, y == 1, drop = F]) + m2 <- rowMeans(x[, y == 2, drop = F]) + if (!logged2) { + fc = m2/m1 + } + if (logged2) { + fc = 2^{ + m2 - m1 + } + } + return(fc) +} +#' @title Foldchange of twoclass unpaired sequencing data +#' @param x x +#' @param y y +#' @param depth depth +foldchange.seq.twoclass.unpaired <- function(x, y, depth) +{ + x.norm <- scale(x, center = F, scale = depth) + 1e-08 + fc <- apply(x.norm[, y == 2], 1, median) / + apply(x.norm[, y == + 1], 1, median) + return(fc) +} +foldchange.seq.twoclass.paired <- function(x, y, depth) { + nc <- ncol(x)/2 + o1 <- o2 <- rep(0, nc) + for (j in 1:nc) { + o1[j] <- which(y == -j) + o2[j] <- which(y == j) + } + x.norm <- scale(x, center = F, scale = depth) + 1e-08 + d <- x.norm[, o2, drop = F]/x.norm[, o1, drop = F] + fc <- lapply(d, 1, function(x) median(x, na.rm = T)) + return(fc) +} +permute <- function(elem) { + # generates all perms of the vector elem + if (!missing(elem)) { + if (length(elem) == 2) + return(matrix(c(elem, elem[2], elem[1]), nrow = 2)) + last.matrix <- permute(elem[-1]) + dim.last <- dim(last.matrix) + new.matrix <- matrix(0, nrow = dim.last[1] * (dim.last[2] + + 1), ncol = dim.last[2] + 1) + for (row in 1:(dim.last[1])) { + for (col in 1:(dim.last[2] + 1)) new.matrix[row + + (col - 1) * dim.last[1], ] <- insert.value(last.matrix[row, + ], elem[1], col) + } + return(new.matrix) + } + else message("Usage: permute(elem)\n\twhere elem is a vector") +} +insert.value <- function(vec, newval, pos) { + if (pos == 1) + return(c(newval, vec)) + lvec <- length(vec) + if (pos > lvec) + return(c(vec, newval)) + return(c(vec[1:pos - 1], newval, vec[pos:lvec])) +} +parse.block.labels.for.2classes = function(y) { + #this only works for 2 class case- having form jBlockn, + # where j=1 or 2 + n = length(y) + y.act = rep(NA, n) + blocky = rep(NA, n) + for (i in 1:n) { + blocky[i] = as.numeric(substring(y[i], 7, nchar(y[i]))) + y.act[i] = as.numeric(substring(y[i], 1, 1)) + } + return(list(y.act = y.act, blocky = blocky)) +} +parse.time.labels.and.summarize.data = function(x, + y, resp.type, time.summary.type) { + # parse time labels, and summarize time data for each + # person, via a slope or area + # does some error checking too + n = length(y) + last5char = rep(NA, n) + last3char = rep(NA, n) + for (i in 1:n) { + last3char[i] = substring(y[i], nchar(y[i]) - 2, nchar(y[i])) + last5char[i] = substring(y[i], nchar(y[i]) - 4, nchar(y[i])) + } + if (sum(last3char == "End") != sum(last5char == "Start")) { + stop("Error in format of time course data: a Start or End tag is missing") + } + y.act = rep(NA, n) + timey = rep(NA, n) + person.id = rep(NA, n) + k = 1 + end.flag = FALSE + person.id[1] = 1 + if (substring(y[1], nchar(y[1]) - 4, nchar(y[1])) != "Start") { + stop("Error in format of time course data: first cell should have a Start tag") + } + for (i in 1:n) { + message(i) + j = 1 + while (substring(y[i], j, j) != "T") { + j = j + 1 + } + end.of.y = j - 1 + y.act[i] = as.numeric(substring(y[i], 1, end.of.y)) + timey[i] = substring(y[i], end.of.y + 5, nchar(y[i])) + if (nchar(timey[i]) > 3 & substring(timey[i], nchar(timey[i]) - + 2, nchar(timey[i])) == "End") { + end.flag = TRUE + timey[i] = substring(timey[i], 1, nchar(timey[i]) - + 3) + } + if (nchar(timey[i]) > 3 & substring(timey[i], nchar(timey[i]) - + 4, nchar(timey[i])) == "Start") { + timey[i] = substring(timey[i], 1, nchar(timey[i]) - + 5) + } + if (i < n & !end.flag) { + person.id[i + 1] = k + } + if (i < n & end.flag) { + k = k + 1 + person.id[i + 1] = k + } + end.flag = FALSE + } + timey = as.numeric(timey) + # do a check that the format was correct + tt = table(person.id, y.act) + junk = function(x) { + sum(x != 0) + } + if (sum(apply(tt, 1, junk) != 1) > 0) { + num = (1:nrow(tt))[apply(tt, 1, junk) > 1] + stop(paste("Error in format of time course data, timecourse #", + as.character(num))) + } + npeople = length(unique(person.id)) + newx = matrix(NA, nrow = nrow(x), ncol = npeople) + sd = matrix(NA, nrow = nrow(x), ncol = npeople) + for (j in 1:npeople) { + jj = person.id == j + tim = timey[jj] + xc = t(scale(t(x[, jj, drop = F]), center = TRUE, scale = FALSE)) + if (time.summary.type == "slope") { + junk = quantitative.func(xc, tim - mean(tim)) + newx[, j] = junk$numer + sd[, j] = junk$sd + } + if (time.summary.type == "signed.area") { + junk = timearea.func(x[, jj, drop = F], tim) + newx[, j] = junk$numer + sd[, j] = junk$sd + } + } + y.unique = y.act[!duplicated(person.id)] + return(list(y = y.unique, x = newx, sd = sd)) +} +ttest.func <- function(x, y, s0 = 0, sd = NULL) { + n1 <- sum(y == 1) + n2 <- sum(y == 2) + p <- nrow(x) + m1 <- rowMeans(x[, y == 1, drop = F]) + m2 <- rowMeans(x[, y == 2, drop = F]) + if (is.null(sd)) { + sd <- sqrt(((n2 - 1) * varr(x[, y == 2], meanx = m2) + + (n1 - 1) * varr(x[, y == 1], meanx = m1)) * (1/n1 + + 1/n2)/(n1 + n2 - 2)) + } + numer <- m2 - m1 + dif.obs <- (numer)/(sd + s0) + return(list(tt = dif.obs, numer = numer, sd = sd)) +} + +wilcoxon.func <- function(x, y, s0 = 0) { + n1 <- sum(y == 1) + n2 <- sum(y == 2) + p = nrow(x) + r2 = rowSums(t(apply(x, 1, rank))[, y == 2, drop = F]) + numer = r2 - (n2/2) * (n2 + 1) - (n1 * n2)/2 + sd = sqrt(n1 * n2 * (n1 + n2 + 1)/12) + tt = (numer)/(sd + s0) + return(list(tt = tt, numer = numer, sd = rep(sd, p))) +} + +onesample.ttest.func <- function(x, y, s0 = 0, sd = NULL) { + n <- length(y) + x <- x * matrix(y, nrow = nrow(x), ncol = ncol(x), byrow = TRUE) + m <- rowMeans(x) + if (is.null(sd)) { + sd <- sqrt(varr(x, meanx = m)/n) + } + dif.obs <- m/(sd + s0) + return(list(tt = dif.obs, numer = m, sd = sd)) +} + +patterndiscovery.func = function(x, s0 = 0, eigengene.number = 1) { + a = mysvd(x, n.components = eigengene.number) + v = a$v[, eigengene.number] + # here we try to guess the most interpretable orientation + # for the eigengene + om = abs(a$u[, eigengene.number]) > quantile(abs(a$u[, eigengene.number]), + 0.95) + if (median(a$u[om, eigengene.number]) < 0) { + v = -1 * v + } + aa = quantitative.func(x, v, s0 = s0) + eigengene = cbind(1:nrow(a$v), v) + dimnames(eigengene) = list(NULL, c("sample number", "value")) + return(list(tt = aa$tt, numer = aa$numer, sd = aa$sd, eigengene = eigengene)) +} + +paired.ttest.func <- function(x, y, s0 = 0, sd = NULL) { + nc <- ncol(x)/2 + o <- 1:nc + o1 <- rep(0, ncol(x)/2) + o2 <- o1 + for (j in 1:nc) { + o1[j] <- (1:ncol(x))[y == -o[j]] + } + for (j in 1:nc) { + o2[j] <- (1:ncol(x))[y == o[j]] + } + d <- x[, o2, drop = F] - x[, o1, drop = F] + su <- x[, o2, drop = F] + x[, o1, drop = F] + if (is.matrix(d)) { + m <- rowMeans(d) + } + if (!is.matrix(d)) { + m <- mean(d) + } + if (is.null(sd)) { + if (is.matrix(d)) { + sd <- sqrt(varr(d, meanx = m)/nc) + } + if (!is.matrix(d)) { + sd <- sqrt(var(d)/nc) + } + } + dif.obs <- m/(sd + s0) + return(list(tt = dif.obs, numer = m, sd = sd)) +} + +cox.func <- function(x, y, censoring.status, s0 = 0) { + # find the index matrix + Dn <- sum(censoring.status == 1) + Dset <- c(1:ncol(x))[censoring.status == 1] # the set of observed + ind <- matrix(0, ncol(x), Dn) + # get the matrix + for (i in 1:Dn) { + ind[y > y[Dset[i]] - 1e-08, i] <- 1/sum(y > y[Dset[i]] - + 1e-08) + } + ind.sums <- rowSums(ind) + x.ind <- x %*% ind + # get the derivatives + numer <- x %*% (censoring.status - ind.sums) + sd <- sqrt((x * x) %*% ind.sums - rowSums(x.ind * x.ind)) + tt <- numer/(sd + s0) + return(list(tt = tt, numer = numer, sd = sd)) +} + +multiclass.func <- function(x, y, s0 = 0) { + ##assumes y is coded 1,2... + nn <- table(y) + m <- matrix(0, nrow = nrow(x), ncol = length(nn)) + v <- m + for (j in 1:length(nn)) { + m[, j] <- rowMeans(x[, y == j]) + v[, j] <- (nn[j] - 1) * varr(x[, y == j], meanx = m[, + j]) + } + mbar <- rowMeans(x) + mm <- m - matrix(mbar, nrow = length(mbar), ncol = length(nn)) + fac <- (sum(nn)/prod(nn)) + scor <- sqrt(fac * (apply(matrix(nn, nrow = nrow(m), ncol = ncol(m), + byrow = TRUE) * mm * mm, 1, sum))) + sd <- sqrt(rowSums(v) * (1/sum(nn - 1)) * sum(1/nn)) + tt <- scor/(sd + s0) + mm.stand = t(scale(t(mm), center = FALSE, scale = sd)) + return(list(tt = tt, numer = scor, sd = sd, stand.contrasts = mm.stand)) +} + +est.s0 <- function(tt, sd, s0.perc = seq(0, 1, by = 0.05)) { + ## estimate s0 (exchangeability) factor for denominator. + ## returns the actual estimate s0 (not a percentile) + br = unique(quantile(sd, seq(0, 1, len = 101))) + nbr = length(br) + a <- cut(sd, br, labels = F) + a[is.na(a)] <- 1 + cv.sd <- rep(0, length(s0.perc)) + for (j in 1:length(s0.perc)) { + w <- quantile(sd, s0.perc[j]) + w[j == 1] <- 0 + tt2 <- tt * sd/(sd + w) + tt2[tt2 == Inf] = NA + sds <- rep(0, nbr - 1) + for (i in 1:(nbr - 1)) { + sds[i] <- stats::mad(tt2[a == i], na.rm = TRUE) + } + cv.sd[j] <- sqrt(var(sds))/mean(sds) + } + o = (1:length(s0.perc))[cv.sd == min(cv.sd)] + # we don;t allow taking s0.hat to be 0th percentile when + # min sd is 0 + s0.hat = quantile(sd[sd != 0], s0.perc[o]) + return(list(s0.perc = s0.perc, cv.sd = cv.sd, s0.hat = s0.hat)) +} + +permute.rows <- function(x) { + dd <- dim(x) + n <- dd[1] + p <- dd[2] + mm <- runif(length(x)) + rep(seq(n) * 10, rep(p, n)) + matrix(t(x)[order(mm)], n, p, byrow = TRUE) +} + +foldchange.paired = function(x, y, logged2) { + # if(logged2){x=2^x} + nc <- ncol(x)/2 + o <- 1:nc + o1 <- rep(0, ncol(x)/2) + o2 <- o1 + for (j in 1:nc) { + o1[j] <- (1:ncol(x))[y == -o[j]] + } + for (j in 1:nc) { + o2[j] <- (1:ncol(x))[y == o[j]] + } + if (!logged2) { + d <- x[, o2, drop = F]/x[, o1, drop = F] + } + if (logged2) { + d <- x[, o2, drop = F] - x[, o1, drop = F] + } + if (!logged2) { + fc <- rowMeans(d) + } + if (logged2) { + fc <- 2^rowMeans(d) + } + return(fc) +} +foldchange.seq.twoclass.unpaired <- function(x, y, depth) +{ + x.norm <- scale(x, center = F, scale = depth) + 1e-08 + fc <- apply(x.norm[, y == 2], 1, median) / + apply(x.norm[, y == 1], 1, median) + return(fc) +} +integer.base.b <- function(x, b = 2) { + xi <- as.integer(x) + if (xi == 0) { + return(0) + } + if (any(is.na(xi) | ((x - xi) != 0))) + print(list(ERROR = "x not integer", x = x)) + N <- length(x) + xMax <- max(x) + ndigits <- (floor(logb(xMax, base = 2)) + 1) + Base.b <- array(NA, dim = c(N, ndigits)) + for (i in 1:ndigits) { + #i <- 1 + Base.b[, ndigits - i + 1] <- (x%%b) + x <- (x%/%b) + } + if (N == 1) + Base.b[1, ] + else Base.b +} +varr <- function(x, meanx = NULL) { + n <- ncol(x) + p <- nrow(x) + Y <- matrix(1, nrow = n, ncol = 1) + if (is.null(meanx)) { + meanx <- rowMeans(x) + } + ans <- rep(1, p) + xdif <- x - meanx %*% t(Y) + ans <- (xdif^2) %*% rep(1/(n - 1), n) + ans <- drop(ans) + return(ans) +} +quantitative.func <- function(x, y, s0 = 0) { + # regression of x on y + my = mean(y) + yy <- y - my + temp <- x %*% yy + mx = rowMeans(x) + syy = sum(yy^2) + scor <- temp/syy + b0hat <- mx - scor * my + ym = matrix(y, nrow = nrow(x), ncol = ncol(x), byrow = T) + xhat <- matrix(b0hat, nrow = nrow(x), ncol = ncol(x)) + ym * + matrix(scor, nrow = nrow(x), ncol = ncol(x)) + sigma <- sqrt(rowSums((x - xhat)^2)/(ncol(xhat) - 2)) + sd <- sigma/sqrt(syy) + tt <- scor/(sd + s0) + return(list(tt = tt, numer = scor, sd = sd)) +} +timearea.func <- function(x, y, s0 = 0) { + n <- ncol(x) + xx <- 0.5 * (x[, 2:n] + x[, 1:(n - 1)]) * matrix(diff(y), + nrow = nrow(x), ncol = n - 1, byrow = T) + numer <- rowMeans(xx) + sd <- sqrt(varr(xx, meanx = numer)/n) + tt <- numer/sqrt(sd + s0) + return(list(tt = tt, numer = numer, sd = sd)) +} +cox.seq.func <- function(xresamp, y, censoring.status) { + # get the dimensions + ng <- dim(xresamp)[1] + ns <- dim(xresamp)[2] + # prepare for the calculation + # find the index matrix + Dn <- sum(censoring.status == 1) + Dset <- c(1:ns)[censoring.status == 1] # the set of died + ind <- matrix(0, ns, Dn) + # get the matrix + for (i in 1:Dn) { + ind[y >= y[Dset[i]] - 1e-08, i] <- 1/sum(y >= y[Dset[i]] - + 1e-08) + } + ind.sums <- rowSums(ind) + # calculate the score statistic + tt <- apply(xresamp, 3, function(x, cen.ind, ind.para, ind.sums.para) { + dev1 <- x %*% cen.ind + x.ind <- x %*% ind.para + dev2 <- (x * x) %*% ind.sums.para - rowSums(x.ind * x.ind) + dev1/(sqrt(dev2) + 1e-08) + }, (censoring.status - ind.sums), ind, ind.sums) + tt <- rowMeans(tt) + return(list(tt = tt, numer = tt, sd = rep(1, length(tt)))) +} +compute.block.perms = function(y, blocky, nperms) { + # y are the data (eg class label 1 vs 2; or -1,1, -2,2 for + # paired data) + # blocky are the block labels (abs(y) for paired daatr) + ny = length(y) + nblocks = length(unique(blocky)) + tab = table(blocky) + total.nperms = prod(factorial(tab)) + # block.perms is a list of all possible permutations + block.perms = vector("list", nblocks) + # first enumerate all perms, when possible + if (total.nperms <= nperms) { + all.perms.flag = 1 + nperms.act = total.nperms + for (i in 1:nblocks) { + block.perms[[i]] = permute(y[blocky == i]) + } + kk = 0:(factorial(max(tab))^nblocks - 1) + #the rows of the matrix outerm runs through the 'outer + # product' + # first we assume that all blocks have max(tab) members; + # then we remove rows of outerm that + # are illegal (ie when a block has fewer members) + outerm = matrix(0, nrow = length(kk), ncol = nblocks) + for (i in 1:length(kk)) { + kkkk = integer.base.b(kk[i], b = factorial(max(tab))) + if (length(kkkk) > nblocks) { + kkkk = kkkk[(length(kkkk) - nblocks + 1):length(kkkk)] + } + outerm[i, (nblocks - length(kkkk) + 1):nblocks] = kkkk + } + outerm = outerm + 1 + # now remove rows that are illegal perms + ind = rep(TRUE, nrow(outerm)) + for (j in 1:ncol(outerm)) { + ind = ind & outerm[, j] <= factorial(tab[j]) + } + outerm = outerm[ind, , drop = F] + # finally, construct permutation matrix from outer product + permsy = matrix(NA, nrow = total.nperms, ncol = ny) + for (i in 1:total.nperms) { + junk = NULL + for (j in 1:nblocks) { + junk = c(junk, block.perms[[j]][outerm[i, j], + ]) + } + permsy[i, ] = junk + } + } + # next handle case when there are too many perms to enumerate + if (total.nperms > nperms) { + all.perms.flag = 0 + nperms.act = nperms + permsy = NULL + block.perms = vector("list", nblocks) + for (j in 1:nblocks) { + block.perms[[j]] = sample.perms(y[blocky == j], nperms = nperms) + } + for (j in 1:nblocks) { + permsy = cbind(permsy, block.perms[[j]]) + } + } + return(list(permsy = permsy, all.perms.flag = all.perms.flag, + nperms.act = nperms.act)) +} +sample.perms <- function(elem, nperms) { + # randomly generates nperms of the vector elem + res = permute.rows(matrix(elem, nrow = nperms, ncol = length(elem), + byrow = T)) + return(res) +} +mysvd <- function(x, n.components = NULL) { + # finds PCs of matrix x + p <- nrow(x) + n <- ncol(x) + # center the observations (rows) + feature.means <- rowMeans(x) + x <- t(scale(t(x), center = feature.means, scale = F)) + if (is.null(n.components)) { + n.components = min(n, p) + } + if (p > n) { + a <- eigen(t(x) %*% x) + v <- a$vec[, 1:n.components, drop = FALSE] + d <- sqrt(a$val[1:n.components, drop = FALSE]) + u <- scale(x %*% v, center = FALSE, scale = d) + return(list(u = u, d = d, v = v)) + } + else { + junk <- svd(x, LINPACK = TRUE) + nc = min(ncol(junk$u), n.components) + return(list(u = junk$u[, 1:nc], d = junk$d[1:nc], v = junk$v[, + 1:nc])) + } +} +quantitative.seq.func <- function(xresamp, y) { + tt <- rep(0, dim(xresamp)[1]) + for (i in 1:dim(xresamp)[3]) { + y.ranked <- rank(y, ties.method = "random") - (dim(xresamp)[2] + + 1)/2 + tt <- tt + (xresamp[, , i] - (dim(xresamp)[2] + 1)/2) %*% + y.ranked + } + ns <- dim(xresamp)[2] + tt <- tt/(dim(xresamp)[3] * (ns^3 - ns)/12) + return(list(tt = as.vector(tt), numer = as.vector(tt), sd = rep(1, + length(tt)))) +} +multiclass.seq.func <- function(xresamp, y) +{ + # number of classes and number of samples in each class + K <- max(y) + n.each <- rep(0, K) + for (k in 1 : K) + { + n.each[k] <- sum(y == k) + } + # the statistic + tt <- temp <- rep(0, dim(xresamp)[1]) + stand.contrasts <- matrix(0, dim(xresamp)[1], K) + + for (i in 1 : dim(xresamp)[3]) + { + for (k in 1 : K) + { + temp <- rowSums(xresamp[, y == k, i]) + tt <- tt + temp ^2 / n.each[k] + stand.contrasts[, k] <- stand.contrasts[, k] + temp + } + } + # finalize + nresamp <- dim(xresamp)[3] + ns <- dim(xresamp)[2] + tt <- tt / nresamp * 12 / ns / (ns + 1) - 3 * (ns + 1) + stand.contrasts <- stand.contrasts / nresamp + stand.contrasts <- scale(stand.contrasts, center=n.each * (ns + 1) / 2, + scale=sqrt(n.each * (ns - n.each) * (ns + 1) / 12)) + return(list(tt = tt, numer = tt, sd = rep(1, length(tt)), + stand.contrasts = stand.contrasts)) +} + +# ============================================================================== +# samr.compute.delta.table +# ============================================================================== +## Jun added starts +samr.compute.delta.table <- function(samr.obj, min.foldchange = 0, + dels = NULL, nvals = 50) { + res <- NULL + if (samr.obj$assay.type == "array") { + res <- samr.compute.delta.table.array(samr.obj, min.foldchange, + dels, nvals) + } + else if (samr.obj$assay.type == "seq") { + res <- samr.compute.delta.table.seq(samr.obj, min.foldchange, + dels) + } + return(res) +} +## Jun added ends + +## Jun added the first row below, and commented the row +# after it +samr.compute.delta.table.array <- function(samr.obj, + min.foldchange = 0, dels = NULL, nvals = 50) { + #samr.compute.delta.table <- function(samr.obj, + # min.foldchange=0, dels=NULL, nvals=50) { + # computes delta table, starting with samr object 'a', for + # nvals values of delta + lmax = sqrt(max(abs(sort(samr.obj$tt) - samr.obj$evo))) + if (is.null(dels)) { + dels = (seq(0, lmax, length = nvals)^2) + } + col = matrix(1, nrow = length(samr.obj$evo), ncol = nvals) + ttstar0 <- samr.obj$ttstar0 + tt <- samr.obj$tt + n <- samr.obj$n + evo <- samr.obj$evo + nsim <- ncol(ttstar0) + res1 <- NULL + foldchange.cond.up = matrix(T, nrow = nrow(samr.obj$ttstar), + ncol = ncol(samr.obj$ttstar)) + foldchange.cond.lo = matrix(T, nrow = nrow(samr.obj$ttstar), + ncol = ncol(samr.obj$ttstar)) + if (!is.null(samr.obj$foldchange[1]) & (min.foldchange > + 0)) { + foldchange.cond.up = samr.obj$foldchange.star >= min.foldchange + foldchange.cond.lo = samr.obj$foldchange.star <= 1/min.foldchange + } + cutup = rep(NA, length(dels)) + cutlow = rep(NA, length(dels)) + g2 = rep(NA, length(dels)) + errup = matrix(NA, ncol = length(dels), nrow = ncol(samr.obj$ttstar0)) + errlow = matrix(NA, ncol = length(dels), nrow = ncol(samr.obj$ttstar0)) + cat("", fill = T) + cat("Computing delta table", fill = T) + for (ii in 1:length(dels)) { + cat(ii, fill = TRUE) + ttt <- detec.slab(samr.obj, dels[ii], min.foldchange) + cutup[ii] <- 1e+10 + if (length(ttt$pup > 0)) { + cutup[ii] <- min(samr.obj$tt[ttt$pup]) + } + cutlow[ii] <- -1e+10 + if (length(ttt$plow) > 0) { + cutlow[ii] <- max(samr.obj$tt[ttt$plow]) + } + g2[ii] = sumlengths(ttt) + errup[, ii] = colSums(samr.obj$ttstar0 > cutup[ii] & + foldchange.cond.up) + errlow[, ii] = colSums(samr.obj$ttstar0 < cutlow[ii] & + foldchange.cond.lo) + } + s <- sqrt(apply(errup, 2, var)/nsim + apply(errlow, 2, var)/nsim) + gmed <- apply(errup + errlow, 2, median) + g90 = apply(errup + errlow, 2, quantile, 0.9) + res1 <- cbind(samr.obj$pi0 * gmed, samr.obj$pi0 * g90, g2, + samr.obj$pi0 * gmed/g2, samr.obj$pi0 * g90/g2, cutlow, + cutup) + res1 <- cbind(dels, res1) + # remove rows with #called=0 + #om=res1[,4]==0 + #res1=res1[!om,,drop=F] + # remove duplicate rows with same # of genes called + #omm=!duplicated(res1[,4]) + #res1=res1[omm,,drop=F] + dimnames(res1) <- list(NULL, c("delta", "# med false pos", + "90th perc false pos", "# called", "median FDR", "90th perc FDR", + "cutlo", "cuthi")) + return(res1) +} + +####################################################################### +#\tcompute the delta table for sequencing data +####################################################################### +samr.compute.delta.table.seq <- function(samr.obj, + min.foldchange = 0, dels = NULL) { + res1 <- NULL + flag <- T + ## check whether any gene satisfies the foldchange + # restrictions + if ((samr.obj$resp.type == samr.const.twoclass.unpaired.response | + samr.obj$resp.type == samr.const.twoclass.paired.response) & + (min.foldchange > 0)) { + sat.up <- (samr.obj$foldchange >= min.foldchange) & (samr.obj$evo > + 0) + sat.dn <- (samr.obj$foldchange <= 1/min.foldchange) & + (samr.obj$evo < 0) + if (sum(sat.up) + sum(sat.dn) == 0) { + flag <- F + } + } + if (flag) { + if (is.null(dels)) { + dels <- generate.dels(samr.obj, min.foldchange = min.foldchange) + } + cat("Number of thresholds chosen (all possible thresholds) =", + length(dels), fill = T) + if (length(dels) > 0) { + ## sort delta to make the fast calculation right + dels <- sort(dels) + ## get the upper and lower cutoffs + cat("Getting all the cutoffs for the thresholds...\n") + slabs <- samr.seq.detec.slabs(samr.obj, dels, min.foldchange) + cutup <- slabs$cutup + cutlow <- slabs$cutlow + g2 <- slabs$g2 + ## get the number of errors under the null hypothesis + cat("Getting number of false positives in the permutation...\n") + errnum <- samr.seq.null.err(samr.obj, min.foldchange, + cutup, cutlow) + res1 <- NULL + gmed <- apply(errnum, 2, median) + g90 = apply(errnum, 2, quantile, 0.9) + res1 <- cbind(samr.obj$pi0 * gmed, samr.obj$pi0 * + g90, g2, samr.obj$pi0 * gmed/g2, samr.obj$pi0 * + g90/g2, cutlow, cutup) + res1 <- cbind(dels, res1) + dimnames(res1) <- list(NULL, c("delta", "# med false pos", + "90th perc false pos", "# called", "median FDR", + "90th perc FDR", "cutlo", "cuthi")) + } + } + return(res1) +} + +# ============================================================================== +# samr.plot +# ============================================================================== +samr.plot <- function(samr.obj, del = NULL, min.foldchange = 0) { + ## make observed-expected plot + ## takes foldchange into account too + if (is.null(del)) { + del = sqrt(max(abs(sort(samr.obj$tt) - samr.obj$evo))) + } + LARGE = 1e+10 + b <- detec.slab(samr.obj, del, min.foldchange) + bb <- c(b$pup, b$plow) + b1 = LARGE + b0 = -LARGE + if (!is.null(b$pup)) { + b1 <- min(samr.obj$tt[b$pup]) + } + if (!is.null(b$plow)) { + b0 <- max(samr.obj$tt[b$plow]) + } + c1 <- (1:samr.obj$n)[sort(samr.obj$tt) >= b1] + c0 <- (1:samr.obj$n)[sort(samr.obj$tt) <= b0] + c2 <- c(c0, c1) + foldchange.cond.up = rep(T, length(samr.obj$evo)) + foldchange.cond.lo = rep(T, length(samr.obj$evo)) + if (!is.null(samr.obj$foldchange[1]) & (min.foldchange > + 0)) { + foldchange.cond.up = samr.obj$foldchange >= min.foldchange + foldchange.cond.lo = samr.obj$foldchange <= 1/min.foldchange + } + col = rep(1, length(samr.obj$evo)) + col[b$plow] = 3 + col[b$pup] = 2 + if (!is.null(samr.obj$foldchange[1]) & (min.foldchange > + 0)) { + col[!foldchange.cond.lo & !foldchange.cond.up] = 1 + } + col.ordered = col[order(samr.obj$tt)] + ylims <- range(samr.obj$tt) + xlims <- range(samr.obj$evo) + plot(samr.obj$evo, sort(samr.obj$tt), xlab = "expected score", + ylab = "observed score", ylim = ylims, xlim = xlims, + type = "n") + points(samr.obj$evo, sort(samr.obj$tt), col = col.ordered) + abline(0, 1) + abline(del, 1, lty = 2) + abline(-del, 1, lty = 2) +} + +# ============================================================================== +# samr.compute.siggenes.table +# ============================================================================== +samr.compute.siggenes.table = function(samr.obj, del, + data, delta.table, min.foldchange = 0, all.genes = FALSE, + compute.localfdr = FALSE) +{ + ## computes significant genes table, starting with samr + # object 'a' and 'delta.table' + ## for a **single** value del + ## if all.genes is true, all genes are printed (and value + # of del is ignored) + if (is.null(data$geneid)) + { + data$geneid = paste("g", 1:nrow(data$x), sep = "") + } + if (is.null(data$genenames)) + { + data$genenames = paste("g", 1:nrow(data$x), sep = "") + } + if (!all.genes) + { + sig = detec.slab(samr.obj, del, min.foldchange) + } + if (all.genes) + { + p = length(samr.obj$tt) + pup = (1:p)[samr.obj$tt >= 0] + plo = (1:p)[samr.obj$tt < 0] + sig = list(pup = pup, plo = plo) + } + if (compute.localfdr) + { + aa = localfdr(samr.obj, min.foldchange) + if (length(sig$pup) > 0) + { + fdr.up = predictlocalfdr(aa$smooth.object, samr.obj$tt[sig$pup]) + } + if (length(sig$plo) > 0) + { + fdr.lo = predictlocalfdr(aa$smooth.object, samr.obj$tt[sig$plo]) + } + } + qvalues = NULL + if (length(sig$pup) > 0 | length(sig$plo) > 0) + { + qvalues = qvalue.func(samr.obj, sig, delta.table) + } + res.up = NULL + res.lo = NULL + done = FALSE + + # two class unpaired or paired (foldchange is reported) + if ((samr.obj$resp.type == samr.const.twoclass.unpaired.response | + samr.obj$resp.type == samr.const.twoclass.paired.response)) + { + if (!is.null(sig$pup)) + { + res.up = cbind(sig$pup + 1, data$genenames[sig$pup], + data$geneid[sig$pup], samr.obj$tt[sig$pup], samr.obj$numer[sig$pup], + samr.obj$sd[sig$pup], samr.obj$foldchange[sig$pup], + qvalues$qvalue.up) + if (compute.localfdr) + { + res.up = cbind(res.up, fdr.up) + } + temp.names = list(NULL, c("Row", "Gene ID", "Gene Name", + "Score(d)", "Numerator(r)", "Denominator(s+s0)", + "Fold Change", "q-value(%)")) + if (compute.localfdr) + { + temp.names[[2]] = c(temp.names[[2]], "localfdr(%)") + } + dimnames(res.up) = temp.names + } + if (!is.null(sig$plo)) + { + res.lo = cbind(sig$plo + 1, data$genenames[sig$plo], + data$geneid[sig$plo], samr.obj$tt[sig$plo], samr.obj$numer[sig$plo], + samr.obj$sd[sig$plo], samr.obj$foldchange[sig$plo], + qvalues$qvalue.lo) + if (compute.localfdr) + { + res.lo = cbind(res.lo, fdr.lo) + } + temp.names = list(NULL, c("Row", "Gene ID", "Gene Name", + "Score(d)", "Numerator(r)", "Denominator(s+s0)", + "Fold Change", "q-value(%)")) + if (compute.localfdr) + { + temp.names[[2]] = c(temp.names[[2]], "localfdr(%)") + } + dimnames(res.lo) = temp.names + } + done = TRUE + } + + # multiclass + if (samr.obj$resp.type == samr.const.multiclass.response) + { + if (!is.null(sig$pup)) + { + res.up = cbind(sig$pup + 1, data$genenames[sig$pup], + data$geneid[sig$pup], samr.obj$tt[sig$pup], samr.obj$numer[sig$pup], + samr.obj$sd[sig$pup], samr.obj$stand.contrasts[sig$pup, ], qvalues$qvalue.up) + + if (compute.localfdr) + { + res.up = cbind(res.up, fdr.up) + } + + collabs.contrast = paste("contrast-", as.character(1:ncol(samr.obj$stand.contrasts)), + sep = "") + temp.names = list(NULL, c("Row", "Gene ID", "Gene Name", + "Score(d)", "Numerator(r)", "Denominator(s+s0)", + collabs.contrast, "q-value(%)")) + + if (compute.localfdr) + { + temp.names[[2]] = c(temp.names[[2]], "localfdr(%)") + } + dimnames(res.up) = temp.names + } + res.lo = NULL + done = TRUE + } + + #all other cases + if (!done) + { + if (!is.null(sig$pup)) + { + res.up = cbind(sig$pup + 1, data$genenames[sig$pup], + data$geneid[sig$pup], samr.obj$tt[sig$pup], samr.obj$numer[sig$pup], + samr.obj$sd[sig$pup], samr.obj$foldchange[sig$pup], + qvalues$qvalue.up) + if (compute.localfdr) + { + res.up = cbind(res.up, fdr.up) + } + temp.names = list(NULL, c("Row", "Gene ID", "Gene Name", + "Score(d)", "Numerator(r)", "Denominator(s+s0)", + "q-value(%)")) + if (compute.localfdr) + { + temp.names[[2]] = c(temp.names[[2]], "localfdr(%)") + } + dimnames(res.up) = temp.names + } + if (!is.null(sig$plo)) + { + res.lo = cbind(sig$plo + 1, data$genenames[sig$plo], + data$geneid[sig$plo], samr.obj$tt[sig$plo], samr.obj$numer[sig$plo], + samr.obj$sd[sig$plo], samr.obj$foldchange[sig$plo], + qvalues$qvalue.lo) + if (compute.localfdr) + { + res.lo = cbind(res.lo, fdr.lo) + } + temp.names = list(NULL, c("Row", "Gene ID", "Gene Name", + "Score(d)", "Numerator(r)", "Denominator(s+s0)", + "q-value(%)")) + if (compute.localfdr) + { + temp.names[[2]] = c(temp.names[[2]], "localfdr(%)") + } + dimnames(res.lo) = temp.names + } + done = TRUE + } + if (!is.null(res.up)) + { + o1 = order(-samr.obj$tt[sig$pup]) + res.up = res.up[o1, , drop = F] + } + if (!is.null(res.lo)) + { + o2 = order(samr.obj$tt[sig$plo]) + res.lo = res.lo[o2, , drop = F] + } + color.ind.for.multi = NULL + if (samr.obj$resp.type == samr.const.multiclass.response & !is.null(sig$pup)) + { + color.ind.for.multi = 1 * (samr.obj$stand.contrasts[sig$pup, + ] > samr.obj$stand.contrasts.95[2]) + (-1) * (samr.obj$stand.contrasts[sig$pup, + ] < samr.obj$stand.contrasts.95[1]) + } + ngenes.up = nrow(res.up) + if (is.null(ngenes.up)) + { + ngenes.up = 0 + } + ngenes.lo = nrow(res.lo) + if (is.null(ngenes.lo)) + { + ngenes.lo = 0 + } + return(list(genes.up = res.up, genes.lo = res.lo, color.ind.for.multi = color.ind.for.multi, + ngenes.up = ngenes.up, ngenes.lo = ngenes.lo)) +} +generate.dels <- function(samr.obj, min.foldchange = 0) { + dels <- NULL + ## initialize calculation + tag <- order(samr.obj$tt) + if ((samr.obj$resp.type == samr.const.twoclass.unpaired.response | + samr.obj$resp.type == samr.const.twoclass.paired.response) & + (min.foldchange > 0)) { + res.mat <- data.frame(tt = samr.obj$tt[tag], fc = samr.obj$foldchange[tag], + evo = samr.obj$evo, dif = samr.obj$tt[tag] - samr.obj$evo) + res.up <- res.mat[res.mat$evo > 0, ] + res.lo <- res.mat[res.mat$evo < 0, ] + res.up <- res.up[res.up$fc >= min.foldchange, ] + res.lo <- res.lo[res.lo$fc <= 1/min.foldchange, ] + } + else { + res.mat <- data.frame(tt = samr.obj$tt[tag], evo = samr.obj$evo, + dif = samr.obj$tt[tag] - samr.obj$evo) + res.up <- res.mat[res.mat$evo > 0, ] + res.lo <- res.mat[res.mat$evo < 0, ] + } + ## for the upper part + up.vec <- rep(NA, nrow(res.up)) + if (nrow(res.up) > 0) { + st <- 1e-08 + i.cur <- 1 + for (i in 1:nrow(res.up)) { + if (res.up$dif[i] > st) { + st <- res.up$dif[i] + up.vec[i.cur] <- st + i.cur <- i.cur + 1 + } + } + } + ## for the lower part + lo.vec <- rep(NA, nrow(res.lo)) + if (nrow(res.lo) > 0) { + st <- -1e-08 + i.cur <- 1 + for (i in nrow(res.lo):1) { + if (res.lo$dif[i] < st) { + st <- res.lo$dif[i] + lo.vec[i.cur] <- st + i.cur <- i.cur + 1 + } + } + } + ## combine them + vec <- c(up.vec, -lo.vec) + vec <- vec[!is.na(vec)] + vec <- vec - 1e-08 + dels <- sort(unique(vec)) + return(dels) +} +samr.seq.detec.slabs <- function(samr.obj, dels, min.foldchange) { + ## initialize calculation + tag <- order(samr.obj$tt) + if ((samr.obj$resp.type == samr.const.twoclass.unpaired.response | + samr.obj$resp.type == samr.const.twoclass.paired.response) & + (min.foldchange > 0)) { + res.mat <- data.frame(tt = samr.obj$tt[tag], fc = samr.obj$foldchange[tag], + evo = samr.obj$evo, dif = samr.obj$tt[tag] - samr.obj$evo) + res.up <- res.mat[res.mat$evo > 0, ] + res.lo <- res.mat[res.mat$evo < 0, ] + res.up <- res.up[res.up$fc >= min.foldchange, ] + res.lo <- res.lo[res.lo$fc <= 1/min.foldchange, ] + } + else { + res.mat <- data.frame(tt = samr.obj$tt[tag], evo = samr.obj$evo, + dif = samr.obj$tt[tag] - samr.obj$evo) + res.up <- res.mat[res.mat$evo > 0, ] + res.lo <- res.mat[res.mat$evo < 0, ] + } + ## begin calculating + cutup <- rep(1e+10, length(dels)) + cutlow <- rep(-1e+10, length(dels)) + g2.up <- g2.lo <- rep(0, length(dels)) + if (nrow(res.up) > 0) { + res.up <- data.frame(dif = res.up$dif, tt = res.up$tt, + num = nrow(res.up):1) + ## get the upper part + j <- 1 + ii <- 1 + while (j <= nrow(res.up) & ii <= length(dels)) { + if (res.up$dif[j] > dels[ii]) { + cutup[ii] <- res.up$tt[j] + g2.up[ii] <- res.up$num[j] + ii <- ii + 1 + } + else { + j <- j + 1 + } + } + } + if (nrow(res.lo) > 0) { + res.lo <- data.frame(dif = res.lo$dif, tt = res.lo$tt, + num = 1:nrow(res.lo)) + ## get the lower part + j <- nrow(res.lo) + ii <- 1 + while (j >= 1 & ii <= length(dels)) { + if (res.lo$dif[j] < -dels[ii]) { + cutlow[ii] <- res.lo$tt[j] + g2.lo[ii] <- res.lo$num[j] + ii <- ii + 1 + } + else { + j <- j - 1 + } + } + } + g2 <- g2.up + g2.lo + return(list(cutup = cutup, cutlow = cutlow, g2 = g2)) +} +sumlengths <- function(aa) { + length(aa$pl) + length(aa$pu) +} + +samr.seq.null.err <- function(samr.obj, min.foldchange, + cutup, cutlow) { + errup = matrix(NA, ncol = length(cutup), nrow = ncol(samr.obj$ttstar0)) + errlow = matrix(NA, ncol = length(cutlow), nrow = ncol(samr.obj$ttstar0)) + cutup.rank <- rank(cutup, ties.method = "min") + cutlow.rank <- rank(-cutlow, ties.method = "min") + for (jj in 1:ncol(samr.obj$ttstar0)) { + #cat(jj, fill=TRUE) + keep.up <- keep.dn <- samr.obj$ttstar0[, jj] + if ((samr.obj$resp.type == samr.const.twoclass.unpaired.response | + samr.obj$resp.type == samr.const.twoclass.paired.response) & + (min.foldchange > 0)) { + keep.up <- keep.up[samr.obj$foldchange.star[, jj] >= + min.foldchange] + keep.dn <- keep.dn[samr.obj$foldchange.star[, jj] <= + 1/min.foldchange] + } + errup[jj, ] <- length(keep.up) - (rank(c(cutup, keep.up), + ties.method = "min")[1:length(cutup)] - cutup.rank) + errlow[jj, ] <- length(keep.dn) - (rank(c(-cutlow, -keep.dn), + ties.method = "min")[1:length(cutlow)] - cutlow.rank) + } + errnum <- errup + errlow + return(errnum) +} +detec.slab <- function(samr.obj, del, min.foldchange) { + ## find genes above and below the slab of half-width del + # this calculation is tricky- for consistency, the slab + # condition picks + # all genes that are beyond the first departure from the + # slab + # then the fold change condition is applied (if applicable) + n <- length(samr.obj$tt) + tt <- samr.obj$tt + evo <- samr.obj$evo + numer <- samr.obj$tt * (samr.obj$sd + samr.obj$s0) + tag <- order(tt) + pup <- NULL + foldchange.cond.up = rep(T, length(evo)) + foldchange.cond.lo = rep(T, length(evo)) + if (!is.null(samr.obj$foldchange[1]) & (min.foldchange > + 0)) { + foldchange.cond.up = samr.obj$foldchange >= min.foldchange + foldchange.cond.lo = samr.obj$foldchange <= 1/min.foldchange + } + o1 <- (1:n)[(tt[tag] - evo > del) & evo > 0] + if (length(o1) > 0) { + o1 <- o1[1] + o11 <- o1:n + o111 <- rep(F, n) + o111[tag][o11] <- T + pup <- (1:n)[o111 & foldchange.cond.up] + } + plow <- NULL + o2 <- (1:n)[(evo - tt[tag] > del) & evo < 0] + if (length(o2) > 0) { + o2 <- o2[length(o2)] + o22 <- 1:o2 + o222 <- rep(F, n) + o222[tag][o22] <- T + plow <- (1:n)[o222 & foldchange.cond.lo] + } + return(list(plow = plow, pup = pup)) +} + +#' @importFrom stats smooth.spline +localfdr <- function(samr.obj, min.foldchange, perc = 0.01, + df = 10) { + ## estimates compute.localfdr at score 'd', using SAM + # object 'samr.obj' + ## 'd' can be a vector of d scores + ## returns estimate of symmetric fdr as a percentage + # this version uses a 1% symmetric window, and does not + # estimate fdr in + # windows having fewer than 100 genes + ## to use: first run samr and then pass the resulting fit + # object to + ## localfdr + ## NOTE: at most 20 of the perms are used to estimate the + # fdr (for speed sake) + # I try two window shapes: symmetric and an assymetric one + # currently I use the symmetric window to estimate the + # compute.localfdr + ngenes = length(samr.obj$tt) + mingenes = 50 + # perc is increased, in order to get at least mingenes in a + # window + perc = max(perc, mingenes/length(samr.obj$tt)) + nperms.to.use = min(20, ncol(samr.obj$ttstar)) + nperms = ncol(samr.obj$ttstar) + d = seq(sort(samr.obj$tt)[1], sort(samr.obj$tt)[ngenes], + length = 100) + ndscore <- length(d) + dvector <- rep(NA, ndscore) + ind.foldchange = rep(T, length(samr.obj$tt)) + if (!is.null(samr.obj$foldchange[1]) & min.foldchange > 0) { + ind.foldchange = (samr.obj$foldchange >= min.foldchange) | + (samr.obj$foldchange <= min.foldchange) + } + fdr.temp = function(temp, dlow, dup, pi0, ind.foldchange) { + return(sum(pi0 * (temp >= dlow & temp <= dup & ind.foldchange))) + } + for (i in 1:ndscore) { + pi0 <- samr.obj$pi0 + r <- sum(samr.obj$tt < d[i]) + r22 <- round(max(r - length(samr.obj$tt) * perc/2, 1)) + dlow.sym <- sort(samr.obj$tt)[r22] + # if(d[i]<0) + # { + # r2 <- max(r-length(samr.obj$tt)*perc/2, 1) + # r22= min(r+length(samr.obj$tt)*perc/2, + # length(samr.obj$tt)) + # + # dlow <- sort(samr.obj$tt)[r2] + # dup=sort(samr.obj$tt)[r22] + # } + r22 <- min(r + length(samr.obj$tt) * perc/2, length(samr.obj$tt)) + dup.sym <- sort(samr.obj$tt)[r22] + # if(d[i]>0) + # { + # r2 <- min(r+length(samr.obj$tt)*perc/2, + # length(samr.obj$tt)) + # r22 <- max(r-length(samr.obj$tt)*perc/2, 1) + # dup <- sort(samr.obj$tt)[r2] + # dlow <- sort(samr.obj$tt)[r22] + # + # } + # o <- samr.obj$tt>=dlow & samr.obj$tt<= dup & + # ind.foldchange + oo <- samr.obj$tt >= dlow.sym & samr.obj$tt <= dup.sym & + ind.foldchange + nsim <- ncol(samr.obj$ttstar) + fdr <- rep(NA, nsim) + fdr2 <- fdr + if (!is.null(samr.obj$foldchange[1]) & min.foldchange > + 0) { + temp = as.vector(samr.obj$foldchange.star[, 1:nperms.to.use]) + ind.foldchange = (temp >= min.foldchange) | (temp <= + min.foldchange) + } + temp = samr.obj$ttstar0[, sample(1:nperms, size = nperms.to.use)] + # fdr <-median(apply(temp,2,fdr.temp,dlow, dup, pi0, + # ind.foldchange)) + fdr.sym <- median(apply(temp, 2, fdr.temp, dlow.sym, + dup.sym, pi0, ind.foldchange)) + # fdr <- 100*fdr/sum(o) + fdr.sym <- 100 * fdr.sym/sum(oo) + dlow.sym <- dlow.sym + dup.sym <- dup.sym + dvector[i] <- fdr.sym + } + om = !is.na(dvector) & (dvector != Inf) + aa = smooth.spline(d[om], dvector[om], df = df) + return(list(smooth.object = aa, perc = perc, df = df)) +} + +predictlocalfdr = function(smooth.object, d) { + yhat = predict(smooth.object, d)$y + yhat = pmin(yhat, 100) + yhat = pmax(yhat, 0) + return(yhat) +} + +qvalue.func = function(samr.obj, sig, delta.table) { + # returns q-value as a percentage (out of 100) + LARGE = 1e+10 + qvalue.up = rep(NA, length(sig$pup)) + o1 = sig$pup + cutup = delta.table[, 8] + FDR = delta.table[, 5] + ii = 0 + for (i in o1) { + o = abs(cutup - samr.obj$tt[i]) + o[is.na(o)] = LARGE + oo = (1:length(o))[o == min(o)] + oo = oo[length(oo)] + ii = ii + 1 + qvalue.up[ii] = FDR[oo] + } + qvalue.lo = rep(NA, length(sig$plo)) + o2 = sig$plo + cutlo = delta.table[, 7] + ii = 0 + for (i in o2) { + o = abs(cutlo - samr.obj$tt[i]) + o[is.na(o)] = LARGE + oo = (1:length(o))[o == min(o)] + oo = oo[length(oo)] + ii = ii + 1 + qvalue.lo[ii] = FDR[oo] + } + # any qvalues that are missing, are set to 1 (the highest + # value) + qvalue.lo[is.na(qvalue.lo)] = 1 + qvalue.up[is.na(qvalue.up)] = 1 + # ensure that each qvalue vector is monotone non-increasing + o1 = order(samr.obj$tt[sig$plo]) + qv1 = qvalue.lo[o1] + qv11 = qv1 + if (length(qv1) > 1) { + for (i in 2:length(qv1)) { + if (qv11[i] < qv11[i - 1]) { + qv11[i] = qv11[i - 1] + } + } + qv111 = qv11 + qv111[o1] = qv11 + } + else { + qv111 = qv1 + } + o2 = order(samr.obj$tt[sig$pup]) + qv2 = qvalue.up[o2] + qv22 = qv2 + if (length(qv2) > 1) { + for (i in 2:length(qv2)) { + if (qv22[i] > qv22[i - 1]) { + qv22[i] = qv22[i - 1] + } + } + qv222 = qv22 + qv222[o2] = qv22 + } + else { + qv222 = qv2 + } + return(list(qvalue.lo = 100 * qv111, qvalue.up = 100 * qv222)) +} diff --git a/README.md b/README.md index b631030..97f3471 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ocbe-uio/DIscBIO/dev?filepath=notebook) [![Build Status](https://travis-ci.org/ocbe-uio/DIscBIO.svg?branch=dev)](https://travis-ci.org/ocbe-uio/DIscBIO) +![Current CRAN release](https://www.r-pkg.org/badges/version/DIscBIO) [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ocbe-uio/DIscBIO/dev?filepath=notebook) [![Build Status](https://travis-ci.org/ocbe-uio/DIscBIO.svg?branch=dev)](https://travis-ci.org/ocbe-uio/DIscBIO) # DIscBIO @@ -54,7 +54,7 @@ A step-by-step tutorial of DIscBIO is under construction as a standalone R vigne In order to use the Binder version of DIscBIO, just click on the badge below: -[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ocbe-uio/DIscBIO/dev?filepath=notebook) +[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ocbe-uio/DIscBIO/dev?filepath=notebook) # Development diff --git a/TODO.md b/TODO.md index d23b309..431d80d 100644 --- a/TODO.md +++ b/TODO.md @@ -14,5 +14,5 @@ Ideally, the non-critical tasks below would also be addressed before release: - [x] Reduce data folder size - [ ] Standardize capitalization of function names and arguments - [x] Remove unessential variable assignments from [test.reproduceNotebook.R](tests/testthat/test.reproduceNotebook.R) -- [ ] Shorten examples with CPU or elapsed time > 5s +- [x] Shorten examples with CPU or elapsed time > 5s - [x] Reduce unit test length diff --git a/data/HumanMouseGeneIds.rda b/data/HumanMouseGeneIds.rda index 7d0a6c8..4e4c1e7 100644 Binary files a/data/HumanMouseGeneIds.rda and b/data/HumanMouseGeneIds.rda differ diff --git a/data/valuesG1msRed.rda b/data/valuesG1msRed.rda new file mode 100644 index 0000000..685bbfb Binary files /dev/null and b/data/valuesG1msRed.rda differ diff --git a/data/valuesG1msReduced_treated_K.rda b/data/valuesG1msReduced_treated_K.rda new file mode 100644 index 0000000..0daf66b Binary files /dev/null and b/data/valuesG1msReduced_treated_K.rda differ diff --git a/data/valuesG1msReduced_treated_MB.rda b/data/valuesG1msReduced_treated_MB.rda new file mode 100644 index 0000000..12bf35a Binary files /dev/null and b/data/valuesG1msReduced_treated_MB.rda differ diff --git a/data/valuesG1msTest.rda b/data/valuesG1msTest.rda new file mode 100644 index 0000000..f61a23e Binary files /dev/null and b/data/valuesG1msTest.rda differ diff --git a/install.R b/install.R deleted file mode 100644 index 0fb2dd3..0000000 --- a/install.R +++ /dev/null @@ -1,18 +0,0 @@ -if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") - -BiocManager::install( - c( - "pheatmap", "cluster", "mclust", "flexmix", - "lattice", "fpc", "amap", "RColorBrewer", "locfit", "TSCAN", - "genefilter", "statmod", "ggplot2", "gplots", "DESeq2", - "matrixStats", "robustbase", "philentropy", "igraph", "boot", - "biomaRt", "tidyr", "calibrate", "partykit", "RWeka", "rpart", - "rpart.plot", "imager", "png", "NetIndices", "httr", "jsonlite", - "tidyverse", "samr", "tidyverse", "org.Hs.eg.db", "AnnotationDbi", - "enrichR", "tsne" - ) -) - -# BiocInstaller appears to be needed to install DIscBIO from source -# it shouldn't be needed, however -install.packages("BiocInstaller", repos="http://bioconductor.org/packages/2.13/bioc") diff --git a/man/ClassVectoringDT.Rd b/man/ClassVectoringDT.Rd index c713447..c6f1c38 100644 --- a/man/ClassVectoringDT.Rd +++ b/man/ClassVectoringDT.Rd @@ -51,22 +51,3 @@ A data frame. This function generates a class vector for the input dataset so the decision tree analysis can be implemented afterwards. } -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Clustexp(sc, cln=2) # K-means clustering -sc <- comptSNE(sc, max_iter=100) -cdiff <- DEGanalysis2clust( - sc, Clustering="K-means", K=2, fdr=.2, name="Name", First="CL1", - Second="CL2", export=FALSE -) -DATAforDT <- ClassVectoringDT( - sc, Clustering="K-means", K=2, First="CL1", Second="CL2", cdiff[[1]] -) -str(DATAforDT) -} diff --git a/man/ClustDiffGenes.Rd b/man/ClustDiffGenes.Rd new file mode 100644 index 0000000..ff1bb8a --- /dev/null +++ b/man/ClustDiffGenes.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DIscBIO-generic-ClustDiffGenes.R +\name{ClustDiffGenes} +\alias{ClustDiffGenes} +\alias{ClustDiffGenes,DISCBIO-method} +\title{ClustDiffGenes} +\usage{ +ClustDiffGenes( + object, + K, + pValue = 0.05, + fdr = 0.01, + export = FALSE, + quiet = FALSE, + filename_up = "Up-DEG-cluster", + filename_down = "Down-DEG-cluster", + filename_binom = "binomial-DEGsTable", + filename_sigdeg = "binomial-sigDEG" +) + +\S4method{ClustDiffGenes}{DISCBIO}( + object, + K, + pValue = 0.05, + fdr = 0.01, + export = FALSE, + quiet = FALSE, + filename_up = "Up-DEG-cluster", + filename_down = "Down-DEG-cluster", + filename_binom = "binomial-DEGsTable", + filename_sigdeg = "binomial-sigDEG" +) +} +\arguments{ +\item{object}{\code{DISCBIO} class object.} + +\item{K}{A numeric value of the number of clusters.} + +\item{pValue}{A numeric value of the p-value. Default is 0.05.} + +\item{fdr}{A numeric value of the false discovery rate. Default is 0.01.} + +\item{export}{A logical vector that allows writing the final gene list in +excel file. Default is TRUE.} + +\item{quiet}{if `TRUE`, suppresses intermediate text output} + +\item{filename_up}{Name of the exported "up" file (if `export=TRUE`)} + +\item{filename_down}{Name of the exported "down" file (if `export=TRUE`)} + +\item{filename_binom}{Name of the exported binomial file} + +\item{filename_sigdeg}{Name of the exported sigDEG file} +} +\value{ +A list containing two tables. +} +\description{ +Creates a table of cluster differences +} +\examples{ +sc <- DISCBIO(valuesG1msTest) +sc <- Clustexp(sc, cln=3, quiet=TRUE) +cdiff <- ClustDiffGenes(sc, K=3, fdr=.3, export=FALSE) +str(cdiff) +cdiff[[2]] +} diff --git a/man/Clustexp.Rd b/man/Clustexp.Rd index 8d4ba02..c5874ac 100644 --- a/man/Clustexp.Rd +++ b/man/Clustexp.Rd @@ -8,7 +8,7 @@ \usage{ Clustexp( object, - clustnr = 20, + clustnr = 3, bootnr = 50, metric = "pearson", do.gap = TRUE, @@ -16,13 +16,13 @@ Clustexp( SE.factor = 0.25, B.gap = 50, cln = 0, - rseed = 17000, + rseed = NULL, quiet = FALSE ) \S4method{Clustexp}{DISCBIO}( object, - clustnr = 20, + clustnr = 3, bootnr = 50, metric = "pearson", do.gap = TRUE, @@ -30,7 +30,7 @@ Clustexp( SE.factor = 0.25, B.gap = 50, cln = 0, - rseed = 17000, + rseed = NULL, quiet = FALSE ) } @@ -65,8 +65,7 @@ statistics. Default is 50} \item{cln}{Number of clusters to be used. Default is \code{NULL} and the cluster number is inferred by the saturation criterion.} -\item{rseed}{Integer number. Random seed to enforce reproducible clustering -results. Default is 17000.} +\item{rseed}{Random integer to enforce reproducible clustering results.} \item{quiet}{if `TRUE`, intermediate output is suppressed} } @@ -78,6 +77,6 @@ This functions performs the initial clustering of the RaceID algorithm. } \examples{ -sc <- DISCBIO(valuesG1msReduced) # changes signature of data -sc <- Clustexp(sc, cln=3) +sc <- DISCBIO(valuesG1msTest) # changes signature of data +sc <- Clustexp(sc, cln=2) } diff --git a/man/DEGanalysis.Rd b/man/DEGanalysis.Rd index 4f641eb..caf4744 100644 --- a/man/DEGanalysis.Rd +++ b/man/DEGanalysis.Rd @@ -8,36 +8,40 @@ \usage{ DEGanalysis( object, - Clustering = "K-means", K, + Clustering = "K-means", fdr = 0.05, name = "Name", - export = TRUE, + export = FALSE, quiet = FALSE, plot = TRUE, + filename_deg = "DEGsTable", + filename_sigdeg = "sigDEG", ... ) \S4method{DEGanalysis}{DISCBIO}( object, - Clustering = "K-means", K, + Clustering = "K-means", fdr = 0.05, name = "Name", - export = TRUE, + export = FALSE, quiet = FALSE, plot = TRUE, + filename_deg = "DEGsTable", + filename_sigdeg = "sigDEG", ... ) } \arguments{ \item{object}{\code{DISCBIO} class object.} +\item{K}{A numeric value of the number of clusters.} + \item{Clustering}{Clustering has to be one of the following: ["K-means","MB"]. Default is "K-means"} -\item{K}{A numeric value of the number of clusters.} - \item{fdr}{A numeric value of the false discovery rate. Default is 0.05.} \item{name}{A string vector showing the name to be used to save the resulted @@ -50,6 +54,10 @@ excel file. Default is TRUE.} \item{plot}{if `TRUE`, plots are generated} +\item{filename_deg}{Name of the exported DEG table} + +\item{filename_sigdeg}{Name of the exported sigDEG table} + \item{...}{additional parameters to be passed to samr()} } \value{ @@ -59,17 +67,3 @@ A list containing two tables. This function defines DEGs between all individual clusters generated by either K-means or model based clustering. } -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- Clustexp(sc, cln=3) # K-means clustering -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- comptSNE(sc, max_iter=100) -DEGanalysis( - sc, Clustering="K-means", K=3, fdr=0.1, name="Name", export = FALSE -) -} diff --git a/man/DEGanalysis2clust.Rd b/man/DEGanalysis2clust.Rd index e9c9ce8..7d2314a 100644 --- a/man/DEGanalysis2clust.Rd +++ b/man/DEGanalysis2clust.Rd @@ -8,40 +8,44 @@ \usage{ DEGanalysis2clust( object, - Clustering = "K-means", K, + Clustering = "K-means", fdr = 0.05, name = "Name", First = "CL1", Second = "CL2", - export = TRUE, + export = FALSE, quiet = FALSE, plot = TRUE, + filename_deg = "DEGsTable", + filename_sigdeg = "sigDEG", ... ) \S4method{DEGanalysis2clust}{DISCBIO}( object, - Clustering = "K-means", K, + Clustering = "K-means", fdr = 0.05, name = "Name", First = "CL1", Second = "CL2", - export = TRUE, + export = FALSE, quiet = FALSE, plot = TRUE, + filename_deg = "DEGsTable", + filename_sigdeg = "sigDEG", ... ) } \arguments{ \item{object}{\code{DISCBIO} class object.} +\item{K}{A numeric value of the number of clusters.} + \item{Clustering}{Clustering has to be one of the following: ["K-means","MB"]. Default is "K-means"} -\item{K}{A numeric value of the number of clusters.} - \item{fdr}{A numeric value of the false discovery rate. Default is 0.05.} \item{name}{A string vector showing the name to be used to save the resulted @@ -60,6 +64,10 @@ excel file. Default is TRUE.} \item{plot}{if `TRUE`, plots are generated} +\item{filename_deg}{Name of the exported DEG table} + +\item{filename_sigdeg}{Name of the exported sigDEG table} + \item{...}{additional parameters to be passed to samr()} } \value{ @@ -69,17 +77,3 @@ A list containing two tables. This function defines DEGs between particular clusters generated by either K-means or model based clustering. } -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Clustexp(sc, cln=3) # K-means clustering -sc <- comptSNE(sc, max_iter=100) -DEGanalysis2clust( - sc, Clustering="K-means", K=3, fdr=0.1, name="Name", export = FALSE -) -} diff --git a/man/DISCBIO.Rd b/man/DISCBIO.Rd index 421dd10..e3d863b 100644 --- a/man/DISCBIO.Rd +++ b/man/DISCBIO.Rd @@ -83,9 +83,9 @@ from running the noise filtering or/and the expression filtering.} }} \examples{ -class(valuesG1msReduced) -G1_reclassified <- DISCBIO(valuesG1msReduced) +class(valuesG1msTest) +G1_reclassified <- DISCBIO(valuesG1msTest) class(G1_reclassified) str(G1_reclassified, max.level=2) -identical(G1_reclassified@expdataAll, valuesG1msReduced) +identical(G1_reclassified@expdataAll, valuesG1msTest) } diff --git a/man/DISCBIO2SingleCellExperiment.Rd b/man/DISCBIO2SingleCellExperiment.Rd index fb3f993..16a96a6 100644 --- a/man/DISCBIO2SingleCellExperiment.Rd +++ b/man/DISCBIO2SingleCellExperiment.Rd @@ -17,9 +17,10 @@ Extract the SingleCellExperiment input data from the corresponding input slot in a DISCBIO-class object } \examples{ -g1_disc <- DISCBIO(valuesG1msReduced) +g1_disc <- DISCBIO(valuesG1msTest) class(g1_disc) g1_sce <- DISCBIO2SingleCellExperiment(g1_disc) class(g1_sce) + } diff --git a/man/Exprmclust.Rd b/man/Exprmclust.Rd index 6c696b3..82d42b7 100644 --- a/man/Exprmclust.Rd +++ b/man/Exprmclust.Rd @@ -60,14 +60,3 @@ this function first uses principal component analysis (PCA) to reduce dimensionality of original data. It then performs model-based clustering on the transformed expression values. } -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Exprmclust(sc,K = 2) -print(sc@MBclusters) -} diff --git a/man/FinalPreprocessing.Rd b/man/FinalPreprocessing.Rd index feb0d3a..68af9d1 100644 --- a/man/FinalPreprocessing.Rd +++ b/man/FinalPreprocessing.Rd @@ -8,15 +8,17 @@ FinalPreprocessing( object, GeneFlitering = "NoiseF", - export = TRUE, - quiet = FALSE + export = FALSE, + quiet = FALSE, + fileName = "filteredDataset" ) \S4method{FinalPreprocessing}{DISCBIO}( object, GeneFlitering = "NoiseF", - export = TRUE, - quiet = FALSE + export = FALSE, + quiet = FALSE, + fileName = "filteredDataset" ) } \arguments{ @@ -29,6 +31,8 @@ FinalPreprocessing( excel file. Default is TRUE.} \item{quiet}{if `TRUE`, intermediary output is suppressed} + +\item{fileName}{File name for exporting (if `export = TRUE`)} } \value{ The DISCBIO-class object input with the FinalGeneList slot filled. @@ -37,7 +41,8 @@ The DISCBIO-class object input with the FinalGeneList slot filled. This function generates the final filtered normalized dataset. } \examples{ -sc <- DISCBIO(valuesG1msReduced) +sc <- DISCBIO(valuesG1msTest) sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) + } diff --git a/man/FindOutliersKM.Rd b/man/FindOutliers.Rd similarity index 73% rename from man/FindOutliersKM.Rd rename to man/FindOutliers.Rd index 7e193b9..11a3074 100644 --- a/man/FindOutliersKM.Rd +++ b/man/FindOutliers.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DIscBIO-generic-FindOutliersKM.R -\name{FindOutliersKM} -\alias{FindOutliersKM} -\alias{FindOutliersKM,DISCBIO-method} -\title{Inference of outlier cells in K-means clustering} +% Please edit documentation in R/DIscBIO-generic-FindOutliers.R +\name{FindOutliers} +\alias{FindOutliers} +\alias{FindOutliers,DISCBIO-method} +\title{Inference of outlier cells} \usage{ -FindOutliersKM( +FindOutliers( object, K, outminc = 5, @@ -17,7 +17,7 @@ FindOutliersKM( quiet = FALSE ) -\S4method{FindOutliersKM}{DISCBIO}( +\S4method{FindOutliers}{DISCBIO}( object, K, outminc = 5, @@ -46,7 +46,7 @@ binomial background model of expression in a cluster. Default is 0.001.} \item{thr}{probability values for which the number of outliers is computed in order to plot the dependence of the number of outliers on the probability -threshold. Default is 2**-(1:40).} +threshold. Default is 2**-(1:40).set} \item{outdistquant}{Real number between zero and one. Outlier cells are merged to outlier clusters if their distance smaller than the @@ -62,14 +62,11 @@ A named vector of the genes containing outlying cells and the number of cells on each. } \description{ -This functions performs the outlier identification +This functions performs the outlier identification for k-means and model-based clustering } \examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- Clustexp(sc, cln=3) # K-means clustering -Outliers <- FindOutliersKM( - sc, K=3, outminc=5, outlg=2, probthr=.5*1e-3, thr=2**-(1:40), - outdistquant=.75, plot = FALSE -) +sc <- DISCBIO(valuesG1msTest) +sc <- Clustexp(sc, cln=2) # K-means clustering +FindOutliers(sc, K=2) } diff --git a/man/FindOutliersMB.Rd b/man/FindOutliersMB.Rd deleted file mode 100644 index 3236c6d..0000000 --- a/man/FindOutliersMB.Rd +++ /dev/null @@ -1,81 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DIscBIO-generic-FindOutliersMB.R -\name{FindOutliersMB} -\alias{FindOutliersMB} -\alias{FindOutliersMB,DISCBIO-method} -\title{Inference of outlier cells in Model-based clustering} -\usage{ -FindOutliersMB( - object, - K, - outminc = 5, - outlg = 2, - probthr = 0.001, - thr = 2^-(1:40), - outdistquant = 0.75, - plot = TRUE, - quiet = FALSE -) - -\S4method{FindOutliersMB}{DISCBIO}( - object, - K, - outminc = 5, - outlg = 2, - probthr = 0.001, - thr = 2^-(1:40), - outdistquant = 0.75, - plot = TRUE, - quiet = FALSE -) -} -\arguments{ -\item{object}{\code{DISCBIO} class object.} - -\item{K}{Number of clusters to be used.} - -\item{outminc}{minimal transcript count of a gene in a clusters to be tested -for being an outlier gene. Default is 5.} - -\item{outlg}{Minimum number of outlier genes required for being an outlier -cell. Default is 2.} - -\item{probthr}{outlier probability threshold for a minimum of \code{outlg} -genes to be an outlier cell. This probability is computed from a negative -binomial background model of expression in a cluster. Default is 0.001.} - -\item{thr}{probability values for which the number of outliers is computed in -order to plot the dependence of the number of outliers on the probability -threshold. Default is 2**-(1:40).} - -\item{outdistquant}{Real number between zero and one. Outlier cells are -merged to outlier clusters if their distance smaller than the -outdistquant-quantile of the distance distribution of pairs of cells in -the orginal clusters after outlier removal. Default is 0.75.} - -\item{plot}{if `TRUE`, produces a plot of -log10prob per K} - -\item{quiet}{if `TRUE`, intermediary output is suppressed} -} -\value{ -A named vector of the genes containing outlying cells and the number - of cells on each. -} -\description{ -This functions performs the outlier identification -} -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Exprmclust(sc) -FindOutliersMB( - sc, K=3, outminc=5, outlg=2, probthr=.5*1e-3, thr=2**-(1:40), - outdistquant=.75, plot = FALSE, quiet = TRUE -) - -} diff --git a/man/HumanMouseGeneIds.Rd b/man/HumanMouseGeneIds.Rd index 75d70a2..eb9b231 100644 --- a/man/HumanMouseGeneIds.Rd +++ b/man/HumanMouseGeneIds.Rd @@ -13,7 +13,3 @@ BioConductor libraries. Data.frame including ENTREZID, SYMBOL, and ENSEMBL gene identifiers of human and mouse genes. } -\examples{ -data(HumanMouseGeneIds) -print(HumanMouseGeneIds[1:6,]) -} diff --git a/man/J48DT.Rd b/man/J48DT.Rd index dcef16e..714a7d1 100644 --- a/man/J48DT.Rd +++ b/man/J48DT.Rd @@ -22,23 +22,3 @@ The decision tree analysis is implemented over a training dataset, which consisted of the DEGs obtained by either SAMseq or the binomial differential expression. } -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Clustexp(sc, cln=3) # K-means clustering -sc <- comptSNE(sc, max_iter=100) -cdiff <- DEGanalysis2clust( - sc, Clustering="K-means", K=3, fdr=.2, name="Name", First="CL1", - Second="CL2", export=FALSE -) -sigDEG <- cdiff[[1]] -DATAforDT <- ClassVectoringDT( - sc, Clustering="K-means", K=3, First="CL1", Second="CL2", sigDEG, -) -J48DT(DATAforDT) -} diff --git a/man/J48DTeval.Rd b/man/J48DTeval.Rd index 669f593..33c073d 100644 --- a/man/J48DTeval.Rd +++ b/man/J48DTeval.Rd @@ -27,23 +27,3 @@ Statistics about the J48 model This function evaluates the performance of the generated trees for error estimation by ten-fold cross validation assessment. } -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Clustexp(sc, cln=3) # K-means clustering -sc <- comptSNE(sc, max_iter=100) -cdiff <- DEGanalysis2clust( - sc, Clustering="K-means", K=3, fdr=.2, name="Name", First="CL1", - Second="CL2", export=FALSE -) -sigDEG <- cdiff[[1]] -DATAforDT <- ClassVectoringDT( - sc, Clustering="K-means", K=3, First="CL1", Second="CL2", sigDEG -) -J48DTeval(DATAforDT, num.folds=10, First="CL1", Second="CL2") -} diff --git a/man/Jaccard.Rd b/man/Jaccard.Rd index 56dc870..8c14dcf 100644 --- a/man/Jaccard.Rd +++ b/man/Jaccard.Rd @@ -29,8 +29,3 @@ Robustness of the clusters can be assessed by Jaccard’s across bootstrapping runs. Jaccard’s similarity is the intersect of two clusters divided by the union. } -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- Clustexp(sc, cln=3, quiet=TRUE) # K-means clustering -Jaccard(sc, Clustering="K-means", K=3) -} diff --git a/man/KMClustDiffGenes.Rd b/man/KMClustDiffGenes.Rd deleted file mode 100644 index e33d9f6..0000000 --- a/man/KMClustDiffGenes.Rd +++ /dev/null @@ -1,50 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DIscBIO-generic-KMClustDiffGenes.R -\name{KMClustDiffGenes} -\alias{KMClustDiffGenes} -\alias{KMClustDiffGenes,DISCBIO-method} -\title{ClustDiffGenes} -\usage{ -KMClustDiffGenes( - object, - K, - pValue = 0.05, - fdr = 0.01, - export = TRUE, - quiet = FALSE -) - -\S4method{KMClustDiffGenes}{DISCBIO}( - object, - K, - pValue = 0.05, - fdr = 0.01, - export = TRUE, - quiet = FALSE -) -} -\arguments{ -\item{object}{\code{DISCBIO} class object.} - -\item{K}{A numeric value of the number of clusters.} - -\item{pValue}{A numeric value of the p-value. Default is 0.05.} - -\item{fdr}{A numeric value of the false discovery rate. Default is 0.01.} - -\item{export}{A logical vector that allows writing the final gene list in -excel file. Default is TRUE.} - -\item{quiet}{if `TRUE`, suppresses intermediate text output} -} -\value{ -A list containing two tables. -} -\description{ -description -} -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- Clustexp(sc, cln=3, quiet=TRUE) # K-means clustering -KMClustDiffGenes(sc, K=3, fdr=.3, export=FALSE) -} diff --git a/man/KMclustheatmap.Rd b/man/KMclustheatmap.Rd deleted file mode 100644 index 4c3137c..0000000 --- a/man/KMclustheatmap.Rd +++ /dev/null @@ -1,36 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DIscBIO-generic-KMclustheatmap.R -\name{KMclustheatmap} -\alias{KMclustheatmap} -\alias{KMclustheatmap,DISCBIO-method} -\title{Plotting the K-means clusters in a heatmap representation of the - cell-to-cell distances} -\usage{ -KMclustheatmap(object, hmethod = "single", plot = TRUE) - -\S4method{KMclustheatmap}{DISCBIO}(object, hmethod = "single", plot = TRUE) -} -\arguments{ -\item{object}{\code{DISCBIO} class object.} - -\item{hmethod}{Agglomeration method used for determining the cluster order -from hierarchical clustering of the cluster medoids. This should be one of -"ward.D", "ward.D2", "single", "complete", "average". Default is "single".} - -\item{plot}{if `TRUE`, plots the heatmap; otherwise, just prints cclmo} -} -\value{ -Unless otherwise specified, a heatmap and a vector of the underlying - cluster order. -} -\description{ -This functions plots a heatmap of the distance matrix grouped - by clusters. Individual clusters are highlighted with rainbow colors along - the x and y-axes. -} -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- Clustexp(sc, cln=3, quiet=TRUE) # K-means clustering -sc <- comptSNE(sc, max_iter=100, quiet=TRUE) -KMclustheatmap(sc, hmethod="single") -} diff --git a/man/KmeanOrder.Rd b/man/KmeanOrder.Rd index 2a687e7..165889c 100644 --- a/man/KmeanOrder.Rd +++ b/man/KmeanOrder.Rd @@ -1,13 +1,23 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DIscBIO-generic-KmeanOrder.R +% Please edit documentation in R/DIscBIO-generic-clusteringOrder.R \name{KmeanOrder} \alias{KmeanOrder} \alias{KmeanOrder,DISCBIO-method} \title{Pseudo-time ordering based on k-means clusters} \usage{ -KmeanOrder(object, quiet = FALSE, export = TRUE) +KmeanOrder( + object, + quiet = FALSE, + export = FALSE, + filename = "Cellular_pseudo-time_ordering_based_on_k-meansc-lusters" +) -\S4method{KmeanOrder}{DISCBIO}(object, quiet = FALSE, export = TRUE) +\S4method{KmeanOrder}{DISCBIO}( + object, + quiet = FALSE, + export = FALSE, + filename = "Cellular_pseudo-time_ordering_based_on_k-meansc-lusters" +) } \arguments{ \item{object}{\code{DISCBIO} class object.} @@ -15,6 +25,8 @@ KmeanOrder(object, quiet = FALSE, export = TRUE) \item{quiet}{if `TRUE`, suppresses intermediary output} \item{export}{if `TRUE`, exports order table to csv} + +\item{filename}{Name of the exported file (if `export=TRUE`)} } \value{ The DISCBIO-class object input with the kordering slot filled. @@ -24,9 +36,3 @@ This function takes the exact output of exprmclust function and construct Pseudo-time ordering by mapping all cells onto the path that connects cluster centers. } -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- Clustexp(sc, cln=3) # K-means clustering -Order <- KmeanOrder(sc, export = FALSE) -Order@kordering -} diff --git a/man/MBClustDiffGenes.Rd b/man/MBClustDiffGenes.Rd deleted file mode 100644 index a1e8e08..0000000 --- a/man/MBClustDiffGenes.Rd +++ /dev/null @@ -1,62 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DIscBIO-generic-MBClustDiffGenes.R -\name{MBClustDiffGenes} -\alias{MBClustDiffGenes} -\alias{MBClustDiffGenes,DISCBIO-method} -\title{ClustDiffGenes} -\usage{ -MBClustDiffGenes( - object, - K, - pValue = 0.05, - fdr = 0.01, - export = TRUE, - quiet = FALSE -) - -\S4method{MBClustDiffGenes}{DISCBIO}( - object, - K, - pValue = 0.05, - fdr = 0.01, - export = TRUE, - quiet = FALSE -) -} -\arguments{ -\item{object}{\code{DISCBIO} class object.} - -\item{K}{A numeric value of the number of clusters.} - -\item{pValue}{A numeric value of the p-value. Default is 0.05.} - -\item{fdr}{A numeric value of the false discovery rate. Default is 0.01.} - -\item{export}{A logical vector that allows writing the final gene list in -excel file. Default is TRUE.} - -\item{quiet}{if `TRUE`, suppresses intermediate text output} -} -\value{ -A list containing two tables. -} -\description{ -ClustDiffGenes -} -\examples{ -\dontrun{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Exprmclust(sc, K=3) -sc <- comptsneMB(sc, max_iter=100) -sc <- Clustexp(sc, cln=3) -sc <- MB_Order(sc, export = FALSE) -cdiff <- MBClustDiffGenes(sc, K=3, fdr=.1) -str(cdiff) -} -} diff --git a/man/MB_Order.Rd b/man/MB_Order.Rd deleted file mode 100644 index 525c985..0000000 --- a/man/MB_Order.Rd +++ /dev/null @@ -1,39 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/MB_Order.R -\name{MB_Order} -\alias{MB_Order} -\title{Pseudo-time ordering based on Model-based clusters} -\usage{ -MB_Order(object, quiet = FALSE, export = TRUE) -} -\arguments{ -\item{object}{\code{DISCBIO} class object.} - -\item{quiet}{if `TRUE`, intermediary output is suppressed} - -\item{export}{if `TRUE`, exports the results as a CSV file} -} -\value{ -The DISCBIO-class object input with the MBordering slot filled. -} -\description{ -This function takes the exact output of exprmclust function and - construct Pseudo-time ordering by mapping all cells onto the path that - connects cluster centers. -} -\examples{ -\dontrun{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Exprmclust(sc, K=2) -sc <- comptsneMB(sc, max_iter=100) -sc <- Clustexp(sc, cln=3) -sc <- MB_Order(sc, export = FALSE) -sc@MBordering -} -} diff --git a/man/MBclustheatmap.Rd b/man/MBclustheatmap.Rd deleted file mode 100644 index 16ccd24..0000000 --- a/man/MBclustheatmap.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DIscBIO-generic-MBclustheatmap.R -\name{MBclustheatmap} -\alias{MBclustheatmap} -\alias{MBclustheatmap,DISCBIO-method} -\title{Plotting the Model-based clusters in a heatmap representation of the - cell-to-cell distances} -\usage{ -MBclustheatmap(object, hmethod = "single", plot = TRUE, quiet = FALSE) - -\S4method{MBclustheatmap}{DISCBIO}(object, hmethod = "single", plot = TRUE, quiet = FALSE) -} -\arguments{ -\item{object}{\code{DISCBIO} class object.} - -\item{hmethod}{Agglomeration method used for determining the cluster order -from hierarchical clustering of the cluster medoids. This should be one of -"ward.D", "ward.D2", "single", "complete", "average". Default is "single".} - -\item{plot}{if `TRUE`, plots the heatmap; otherwise, just prints cclmo} - -\item{quiet}{if `TRUE`, intermediary output is suppressed} -} -\value{ -Unless otherwise specified, a heatmap and a vector of the underlying - cluster order. -} -\description{ -This functions plots a heatmap of the distance matrix grouped - by clusters. Individual clusters are highlighted with rainbow colors along - the x and y-axes. -} -\examples{ -\dontrun{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Exprmclust(sc,K = 2) -sc <- comptsneMB(sc, max_iter=100) -sc <- Clustexp(sc, cln=3) -sc <- MB_Order(sc, export = FALSE) -MBclustheatmap(sc, hmethod="single") -} -} diff --git a/man/NetAnalysis.Rd b/man/NetAnalysis.Rd index 3134f07..cfa0006 100644 --- a/man/NetAnalysis.Rd +++ b/man/NetAnalysis.Rd @@ -4,7 +4,7 @@ \alias{NetAnalysis} \title{Networking analysis.} \usage{ -NetAnalysis(data, export = TRUE, FileName = "1") +NetAnalysis(data, export = FALSE, FileName = "NetworkAnalysisTable-1") } \arguments{ \item{data}{Protein-protein interaction data frame resulted from running the @@ -22,23 +22,3 @@ This function checks the connectivity degree and the betweenness centrality, which reflect the communication flow in the defined PPI networks } -\examples{ -\dontrun{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF") -sc <- Clustexp(sc, cln=3) # K-means clustering -sc <- comptSNE(sc, max_iter=100) -dff <- DEGanalysis2clust(sc, Clustering="K-means", K=3, fdr=0.1, name="Name") -DEGs <- dff[[2]][1, 6] -data <- read.csv(file=paste0(DEGs),head=TRUE,sep=",") -data <- data[,3] -FileName <- paste0(DEGs) -ppi <- PPI(data, FileName) -NetAnalysis(ppi) -} -} diff --git a/man/Networking.Rd b/man/Networking.Rd index 2ad8da2..6126ca8 100644 --- a/man/Networking.Rd +++ b/man/Networking.Rd @@ -4,13 +4,20 @@ \alias{Networking} \title{Plotting the network.} \usage{ -Networking(data, FileName, species = "9606", plot_width = 25, plot_height = 15) +Networking( + data, + FileName = NULL, + species = "9606", + plot_width = 25, + plot_height = 15 +) } \arguments{ \item{data}{A gene list.} \item{FileName}{A string vector showing the name to be used to save the -resulted network.} +resulted network. If `NULL`, the network will be saved to a temporary +directory} \item{species}{The taxonomy name/id. Default is "9606" for Homo sapiens.} @@ -24,25 +31,3 @@ A plot of the network \description{ This function uses STRING-api to plot the network. } -\examples{ -\dontrun{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF") -sc <- Clustexp(sc, cln=3) # K-means clustering -sc <- comptSNE(sc, max_iter=100) -dff <- DEGanalysis2clust(sc, Clustering="K-means", K=3, fdr=0.1, name="Name") -DEGs <- dff[[2]][1, 6] -data <- read.csv(file=paste0(DEGs),head=TRUE,sep=",") -data <- data[,3] -FileName <- paste0(DEGs) -ppi <- PPI(data, FileName) -networking <- NetAnalysis(ppi) -FileName <- "Up.DownDEG" -Networking(data, FileName) -} -} diff --git a/man/NoiseFiltering.Rd b/man/NoiseFiltering.Rd index d43d2c9..c22bb6f 100644 --- a/man/NoiseFiltering.Rd +++ b/man/NoiseFiltering.Rd @@ -14,8 +14,9 @@ NoiseFiltering( erccCol = "blue", Val = TRUE, plot = TRUE, - export = TRUE, - quiet = FALSE + export = FALSE, + quiet = FALSE, + filename = "Noise_filtering_genes_test" ) \S4method{NoiseFiltering}{DISCBIO}( @@ -27,8 +28,9 @@ NoiseFiltering( erccCol = "blue", Val = TRUE, plot = TRUE, - export = TRUE, - quiet = FALSE + export = FALSE, + quiet = FALSE, + filename = "Noise_filtering_genes_test" ) } \arguments{ @@ -56,6 +58,8 @@ Default is TRUE.} excel file. Default is TRUE.} \item{quiet}{if `TRUE`, suppresses printed output} + +\item{filename}{Name of the exported file (if `export=TRUE`)} } \value{ The DISCBIO-class object input with the noiseF slot filled. @@ -71,7 +75,8 @@ Given a matrix or data frame of count data, this function This function should be used only if the dataset has ERCC. } \examples{ -sc <- DISCBIO(valuesG1msReduced) # changes signature of data +sc <- DISCBIO(valuesG1msTest) # changes signature of data sd_filtered <- NoiseFiltering(sc, export=FALSE) str(sd_filtered) + } diff --git a/man/Normalizedata.Rd b/man/Normalizedata.Rd index 07c8c85..2e1fbca 100644 --- a/man/Normalizedata.Rd +++ b/man/Normalizedata.Rd @@ -13,7 +13,7 @@ Normalizedata( maxexpr = Inf, downsample = FALSE, dsn = 1, - rseed = 17000 + rseed = NULL ) \S4method{Normalizedata}{DISCBIO}( @@ -24,7 +24,7 @@ Normalizedata( maxexpr = Inf, downsample = FALSE, dsn = 1, - rseed = 17000 + rseed = NULL ) } \arguments{ @@ -53,8 +53,8 @@ downsampled versions of the transcript count data. Default is 1 which means that sampling noise should be comparable across cells. For high numbers of dsn the data will become similar to the median normalization.} -\item{rseed}{Integer number. Random seed to enforce reproducible clustering -results. Default is 17000.} +\item{rseed}{Random integer to enforce reproducible clustering. +results} } \value{ The DISCBIO-class object input with the ndata and fdata slots filled. @@ -64,7 +64,7 @@ This function allows filtering of genes and cells to be used in the downstream analysis. } \examples{ -sc <- DISCBIO(valuesG1msReduced) # changes signature of data +sc <- DISCBIO(valuesG1msTest) # changes signature of data # In this case this function is used to normalize the reads sc_normal <- Normalizedata( @@ -72,4 +72,5 @@ sc_normal <- Normalizedata( dsn=1, rseed=17000 ) summary(sc_normal@fdata) + } diff --git a/man/PCAplotSymbols.Rd b/man/PCAplotSymbols.Rd index d2d9c2e..f83a6e4 100644 --- a/man/PCAplotSymbols.Rd +++ b/man/PCAplotSymbols.Rd @@ -21,14 +21,3 @@ Plot of the Principal Components \description{ Generates a plot of grouped PCA components } -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Exprmclust(sc, K=2) -PCAplotSymbols(sc) -} diff --git a/man/PPI.Rd b/man/PPI.Rd index f73ef94..d52bae7 100644 --- a/man/PPI.Rd +++ b/man/PPI.Rd @@ -4,40 +4,22 @@ \alias{PPI} \title{Defining protein-protein interactions (PPI) over a list of genes,} \usage{ -PPI(data, FileName, species = "9606") +PPI(data, FileName = NULL, species = "9606") } \arguments{ \item{data}{A gene list.} \item{FileName}{A string vector showing the name to be used to save the -resulted table.} +resulted table. If null, no file will be exported} \item{species}{The taxonomy name/id. Default is "9606" for Homo sapiens.} } \value{ -A TSV file stored in the user's file system and its corresponding - `data.frame` object in R. +Either a TSV file stored in the user's file system and its +corresponding `data.frame` object in R or and R object containing that +information. } \description{ This function uses STRING-api. The outcome of STRING analysis will be stored in tab separated values (TSV) files. } -\examples{ -\dontrun{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF") -sc <- Clustexp(sc, cln=3) # K-means clustering -sc <- comptSNE(sc, max_iter=100) -dff <- DEGanalysis2clust(sc, Clustering="K-means", K=3, fdr=0.1, name="Name") -DEGs <- dff[[2]][1, 6] -data <- read.csv(file=paste0(DEGs),head=TRUE,sep=",") -data <- data[,3] -FileName <- paste0(DEGs) -PPI(data, FileName) -} -} diff --git a/man/PlotMBexpPCA.Rd b/man/PlotMBexpPCA.Rd deleted file mode 100644 index 0839fc4..0000000 --- a/man/PlotMBexpPCA.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/PlotMBexpPCA.R -\name{PlotMBexpPCA} -\alias{PlotMBexpPCA} -\title{Plotting gene expression in Model-based clustering in PCA.} -\usage{ -PlotMBexpPCA(object, g, n = NULL) -} -\arguments{ -\item{object}{\code{DISCBIO} class object.} - -\item{g}{Individual gene name or vector with a group of gene names -corresponding to a subset of valid row names of the \code{ndata} slot of -the \code{DISCBIO} object.} - -\item{n}{String of characters representing the title of the plot. Default is -NULL and the first element of \code{g} is chosen.} -} -\value{ -A plot of the PCA. -} -\description{ -The PCA representation can also be used to show the gene - expression of a particular gene. -} -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata(sc) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Exprmclust(sc, K=2, reduce=TRUE, quiet=TRUE) -g <- "ENSG00000010244" # Plotting the expression of MT-RNR2 -PlotMBexpPCA(sc, g) -} diff --git a/man/PlotMBorderPCA.Rd b/man/PlotMBorderPCA.Rd deleted file mode 100644 index 5b1c728..0000000 --- a/man/PlotMBorderPCA.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/PlotMBorderPCA.R -\name{PlotMBorderPCA} -\alias{PlotMBorderPCA} -\title{Plotting pseudo-time ordering in Model-based clustering in PCA.} -\usage{ -PlotMBorderPCA(object) -} -\arguments{ -\item{object}{\code{DISCBIO} class object.} -} -\value{ -A plot of the PCA. -} -\description{ -The PCA representation can also be used to show the pseudo-time - ordering. -} -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Exprmclust(sc, K=3) -sc <- Clustexp(sc, cln=3) -sc <- MB_Order(sc, export = FALSE) -PlotMBorderPCA(sc) -} diff --git a/man/PlotMBpca.Rd b/man/PlotMBpca.Rd new file mode 100644 index 0000000..8057c50 --- /dev/null +++ b/man/PlotMBpca.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlotMBpca.R +\name{PlotMBpca} +\alias{PlotMBpca} +\title{Plotting pseudo-time ordering or gene expression in Model-based clustering in PCA} +\usage{ +PlotMBpca(object, type = "order", g = NULL, n = NULL) +} +\arguments{ +\item{object}{\code{DISCBIO} class object.} + +\item{type}{either `order` to plot pseudo-time ordering or `exp` to plot gene expression} + +\item{g}{Individual gene name or vector with a group of gene names +corresponding to a subset of valid row names of the \code{ndata} slot of +the \code{DISCBIO} object. Ignored if `type="order"`.} + +\item{n}{String of characters representing the title of the plot. Default is +NULL and the first element of \code{g} is chosen. Ignored if +`type="order"`.} +} +\value{ +A plot of the PCA. +} +\description{ +The PCA representation can either be used to show pseudo-time ordering or the gene expression of a particular gene. +} diff --git a/man/PlotmclustMB.Rd b/man/PlotmclustMB.Rd index 6838fd5..8262f83 100644 --- a/man/PlotmclustMB.Rd +++ b/man/PlotmclustMB.Rd @@ -18,14 +18,3 @@ A plot of the PCA. \description{ Plot the model-based clustering results } -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Exprmclust(sc, K=2) -PlotmclustMB(sc) -} diff --git a/man/RpartDT.Rd b/man/RpartDT.Rd index 70d0426..e9eee75 100644 --- a/man/RpartDT.Rd +++ b/man/RpartDT.Rd @@ -22,23 +22,3 @@ The decision tree analysis is implemented over a training dataset, which consisted of the DEGs obtained by either SAMseq or the binomial differential expression. } -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Clustexp(sc, cln=3) # K-means clustering -sc <- comptSNE(sc, max_iter=100) -cdiff <- DEGanalysis2clust( - sc, Clustering="K-means", K=3, fdr=.2, name="Name", First="CL1", - Second="CL2", export=FALSE -) -sigDEG <- cdiff[[1]] -DATAforDT <- ClassVectoringDT( - sc, Clustering="K-means", K=3, First="CL1", Second="CL2", sigDEG, -) -RpartDT(DATAforDT) -} diff --git a/man/RpartEVAL.Rd b/man/RpartEVAL.Rd index a1fc00e..f594ae0 100644 --- a/man/RpartEVAL.Rd +++ b/man/RpartEVAL.Rd @@ -27,23 +27,3 @@ Performance statistics of the model This function evaluates the performance of the generated trees for error estimation by ten-fold cross validation assessment. } -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Clustexp(sc, cln=3) # K-means clustering -sc <- comptSNE(sc, max_iter=100) -cdiff <- DEGanalysis2clust( - sc, Clustering="K-means", K=3, fdr=.2, name="Name", First="CL1", - Second="CL2", export=FALSE -) -sigDEG <- cdiff[[1]] -DATAforDT <- ClassVectoringDT( - sc, Clustering="K-means", K=3, First="CL1", Second="CL2", sigDEG, -) -RpartEVAL(DATAforDT,num.folds=10,First="CL1",Second="CL2") -} diff --git a/man/VolcanoPlot.Rd b/man/VolcanoPlot.Rd index 6377c89..b464f89 100644 --- a/man/VolcanoPlot.Rd +++ b/man/VolcanoPlot.Rd @@ -4,7 +4,7 @@ \alias{VolcanoPlot} \title{Volcano Plot} \usage{ -VolcanoPlot(object, value = 0.05, name, fc = 0.5, FS = 0.4) +VolcanoPlot(object, value = 0.05, name = NULL, fc = 0.5, FS = 0.4) } \arguments{ \item{object}{A data frame showing the differentially expressed genes (DEGs) @@ -13,8 +13,7 @@ in a particular cluster} \item{value}{A numeric value of the false discovery rate. Default is 0.05.. Default is 0.05} -\item{name}{A string vector showing the name to be used to save the resulted -tables.} +\item{name}{A string vector showing the name to be used on the plot title} \item{fc}{A numeric value of the fold change. Default is 0.5.} @@ -28,20 +27,3 @@ Plotting differentially expressed genes (DEGs) in a particular cluster. Volcano plots are used to readily show the DEGs by plotting significance versus fold-change on the y and x axes, respectively. } -\examples{ -\dontrun{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF") -sc <- Clustexp(sc, cln=3) # K-means clustering -sc <- comptSNE(sc, max_iter=100) -dff <- DEGanalysis2clust(sc, Clustering="K-means", K=3, fdr=0.1, name="Name") -name <- dff[[2]][1, 6] -U <- read.csv(file = paste0(name), head=TRUE, sep=",") -VolcanoPlot(U, value=0.05, name=name, adj=FALSE, FS=.4) -} -} diff --git a/man/as.DISCBIO.Rd b/man/as.DISCBIO.Rd index 240704d..11ad971 100644 --- a/man/as.DISCBIO.Rd +++ b/man/as.DISCBIO.Rd @@ -23,11 +23,3 @@ Additional parameters to pass to `list` include, if x is a Seurat object, "assay", which is a string indicating the assay slot used to obtain data from (defaults to 'RNA') } -\examples{ -g1_sce <- SingleCellExperiment::SingleCellExperiment( - list(counts=as.matrix(valuesG1msReduced)) -) -g1_disc <- as.DISCBIO(g1_sce) -class(g1_disc) - -} diff --git a/man/check.format.Rd b/man/check.format.Rd new file mode 100644 index 0000000..452ca0e --- /dev/null +++ b/man/check.format.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/samr-adapted.R +\name{check.format} +\alias{check.format} +\title{Check format} +\usage{ +check.format(y, resp.type, censoring.status = NULL) +} +\arguments{ +\item{y}{y} + +\item{resp.type}{resp type} + +\item{censoring.status}{censoring status} +} +\description{ +Check format +} diff --git a/man/clustheatmap.Rd b/man/clustheatmap.Rd new file mode 100644 index 0000000..9dee417 --- /dev/null +++ b/man/clustheatmap.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DIscBIO-generic-clustheatmap.R +\name{clustheatmap} +\alias{clustheatmap} +\alias{clustheatmap,DISCBIO-method} +\title{Plotting clusters in a heatmap representation of the cell distances} +\usage{ +clustheatmap( + object, + clustering_method = "k-means", + hmethod = "single", + rseed = NULL, + quiet = FALSE, + plot = TRUE +) + +\S4method{clustheatmap}{DISCBIO}( + object, + clustering_method = "k-means", + hmethod = "single", + rseed = NULL, + quiet = FALSE, + plot = TRUE +) +} +\arguments{ +\item{object}{\code{DISCBIO} class object.} + +\item{clustering_method}{either "k-means" or "model-based" ("k" and "mb" are also accepted)} + +\item{hmethod}{Agglomeration method used for determining the cluster order +from hierarchical clustering of the cluster medoids. This should be one of +"ward.D", "ward.D2", "single", "complete", "average". Default is "single".} + +\item{rseed}{Random integer to fix random results.} + +\item{quiet}{if `TRUE`, intermediary output is suppressed} + +\item{plot}{if `TRUE`, plots the heatmap; otherwise, just prints cclmo} +} +\value{ +Unless otherwise specified, a heatmap and a vector of the underlying + cluster order. +} +\description{ +This functions plots a heatmap of the distance matrix grouped + by clusters. Individual clusters are highlighted with rainbow colors along + the x and y-axes. +} diff --git a/man/comptSNE.Rd b/man/comptSNE.Rd index 52308b3..d603f92 100644 --- a/man/comptSNE.Rd +++ b/man/comptSNE.Rd @@ -3,11 +3,11 @@ \name{comptSNE} \alias{comptSNE} \alias{comptSNE,DISCBIO-method} -\title{Computing tSNE for K-means clustering} +\title{Computing tSNE} \usage{ comptSNE( object, - rseed = 15555, + rseed = NULL, max_iter = 5000, epoch = 500, quiet = FALSE, @@ -16,7 +16,7 @@ comptSNE( \S4method{comptSNE}{DISCBIO}( object, - rseed = 15555, + rseed = NULL, max_iter = 5000, epoch = 500, quiet = FALSE, @@ -26,8 +26,8 @@ comptSNE( \arguments{ \item{object}{\code{DISCBIO} class object.} -\item{rseed}{Integer number. Random seed to to yield exactly reproducible -maps across different runs. Default is 15555.} +\item{rseed}{Random integer to to yield reproducible maps across different +runs} \item{max_iter}{maximum number of iterations to perform.} @@ -45,8 +45,9 @@ This function is used to compute the t-Distributed Stochastic Neighbor Embedding (t-SNE). } \examples{ -sc <- DISCBIO(valuesG1msReduced) # changes signature of data -sc <- Clustexp(sc, cln=3) # data must be clustered before plottin -sc <- comptSNE(sc, max_iter=1000) +sc <- DISCBIO(valuesG1msTest) # changes signature of data +sc <- Clustexp(sc, cln=2) # data must be clustered before plottin +sc <- comptSNE(sc, max_iter=30) head(sc@tsne) + } diff --git a/man/comptsneMB.Rd b/man/comptsneMB.Rd deleted file mode 100644 index bcc8ba5..0000000 --- a/man/comptsneMB.Rd +++ /dev/null @@ -1,58 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DIscBIO-generic-comptsneMB.R -\name{comptsneMB} -\alias{comptsneMB} -\alias{comptsneMB,DISCBIO-method} -\title{Computing tSNE for Model-based clustering} -\usage{ -comptsneMB( - object, - rseed = 15555, - max_iter = 5000, - epoch = 500, - quiet = FALSE, - ... -) - -\S4method{comptsneMB}{DISCBIO}( - object, - rseed = 15555, - max_iter = 5000, - epoch = 500, - quiet = FALSE, - ... -) -} -\arguments{ -\item{object}{\code{DISCBIO} class object.} - -\item{rseed}{Integer number. Random seed to to yield exactly reproducible -maps across different runs. Default is 15555.} - -\item{max_iter}{maximum number of iterations to perform.} - -\item{epoch}{The number of iterations in between update messages.} - -\item{quiet}{if `TRUE`, suppresses intermediate output} - -\item{...}{other parameters to be passed to `tsne::tsne`} -} -\value{ -The DISCBIO-class object input with the MBtsne slot filled. -} -\description{ -This function is used to compute the t-Distributed Stochastic - Neighbor Embedding (t-SNE). -} -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Exprmclust(sc) -sc <- comptsneMB(sc, rseed=15555, max_iter = 1000) -print(sc@MBtsne) -} diff --git a/man/foldchange.seq.twoclass.unpaired.Rd b/man/foldchange.seq.twoclass.unpaired.Rd new file mode 100644 index 0000000..bcc8668 --- /dev/null +++ b/man/foldchange.seq.twoclass.unpaired.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/samr-adapted.R +\name{foldchange.seq.twoclass.unpaired} +\alias{foldchange.seq.twoclass.unpaired} +\title{Foldchange of twoclass unpaired sequencing data} +\usage{ +foldchange.seq.twoclass.unpaired(x, y, depth) +} +\arguments{ +\item{x}{x} + +\item{y}{y} + +\item{depth}{depth} +} +\description{ +Foldchange of twoclass unpaired sequencing data +} diff --git a/man/plotExptSNE.Rd b/man/plotExptSNE.Rd index 80dbb8a..29fe5b7 100644 --- a/man/plotExptSNE.Rd +++ b/man/plotExptSNE.Rd @@ -3,7 +3,7 @@ \name{plotExptSNE} \alias{plotExptSNE} \alias{plotExptSNE,DISCBIO-method} -\title{Highlighting gene expression in K-means clustering in the t-SNE map} +\title{Highlighting gene expression in the t-SNE map} \usage{ plotExptSNE(object, g, n = NULL) @@ -27,10 +27,3 @@ The t-SNE map representation can also be used to analyze expression of a gene or a group of genes, to investigate cluster specific gene expression patterns } -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- Clustexp(sc, cln=3, quiet=TRUE) # K-means clustering -sc <- comptSNE(sc, max_iter=100) -g <- 'ENSG00000001460' -plotExptSNE(sc, g) -} diff --git a/man/plotGap.Rd b/man/plotGap.Rd index e71f6c7..330b805 100644 --- a/man/plotGap.Rd +++ b/man/plotGap.Rd @@ -20,8 +20,3 @@ A plot of the gap statistics \description{ Plotting Gap Statistics } -\examples{ -sc <- DISCBIO(valuesG1msReduced) # changes signature of data -sc <- Clustexp(sc, cln=3) # data must be clustered before plotting -plotGap(sc) -} diff --git a/man/plotKmeansLabelstSNE.Rd b/man/plotKmeansLabelstSNE.Rd deleted file mode 100644 index 82d3e3f..0000000 --- a/man/plotKmeansLabelstSNE.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DIscBIO-generic-plotKmeansLabelstSNE.R -\name{plotKmeansLabelstSNE} -\alias{plotKmeansLabelstSNE} -\alias{plotKmeansLabelstSNE,DISCBIO-method} -\title{tSNE map for K-means clustering with labels} -\usage{ -plotKmeansLabelstSNE(object) - -\S4method{plotKmeansLabelstSNE}{DISCBIO}(object) -} -\arguments{ -\item{object}{\code{DISCBIO} class object.} -} -\value{ -Plot containing the ID of the cells in each cluster -} -\description{ -Visualizing the K-means clusters using tSNE maps -} -\examples{ -sc <- DISCBIO(valuesG1msReduced) # changes signature of data -sc <- Clustexp(sc, cln=3) # data must be clustered before plottin -sc <- comptSNE(sc, max_iter=100) -plotKmeansLabelstSNE(sc) # Plots the ID of the cells in each cluster -} diff --git a/man/plotLabelstSNE.Rd b/man/plotLabelstSNE.Rd new file mode 100644 index 0000000..9cdb721 --- /dev/null +++ b/man/plotLabelstSNE.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DIscBIO-generic-plotLabelstSNE.R +\name{plotLabelstSNE} +\alias{plotLabelstSNE} +\alias{plotLabelstSNE,DISCBIO-method} +\title{tSNE map with labels} +\usage{ +plotLabelstSNE(object) + +\S4method{plotLabelstSNE}{DISCBIO}(object) +} +\arguments{ +\item{object}{\code{DISCBIO} class object.} +} +\value{ +Plot containing the ID of the cells in each cluster +} +\description{ +Visualizing k-means or model-based clusters using tSNE maps +} diff --git a/man/plotMBLabelstSNE.Rd b/man/plotMBLabelstSNE.Rd deleted file mode 100644 index 93585f5..0000000 --- a/man/plotMBLabelstSNE.Rd +++ /dev/null @@ -1,32 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DIscBIO-generic-plotMBLabelstSNE.R -\name{plotMBLabelstSNE} -\alias{plotMBLabelstSNE} -\alias{plotMBLabelstSNE,DISCBIO-method} -\title{tSNE map for Model-based clustering with labels} -\usage{ -plotMBLabelstSNE(object) - -\S4method{plotMBLabelstSNE}{DISCBIO}(object) -} -\arguments{ -\item{object}{\code{DISCBIO} class object.} -} -\value{ -A plot of the `object@MBtsne` values -} -\description{ -Visualizing the Model-based clusters using tSNE maps -} -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Exprmclust(sc, K=2) -sc <- comptsneMB(sc, rseed=15555, quiet=TRUE, max_iter=100) -plotMBLabelstSNE(sc) -} diff --git a/man/plotOrderKMtsne.Rd b/man/plotOrderKMtsne.Rd deleted file mode 100644 index 0122164..0000000 --- a/man/plotOrderKMtsne.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DIscBIO-generic-plotOrderKMtsne.R -\name{plotOrderKMtsne} -\alias{plotOrderKMtsne} -\alias{plotOrderKMtsne,DISCBIO-method} -\title{Plotting the pseudo-time ordering based on K-means clusters in the - t-SNE map} -\usage{ -plotOrderKMtsne(object) - -\S4method{plotOrderKMtsne}{DISCBIO}(object) -} -\arguments{ -\item{object}{\code{DISCBIO} class object.} -} -\value{ -A plot of the pseudo-time ordering. -} -\description{ -The tSNE representation can also be used to show the pseudo-time - ordering. -} -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- Clustexp(sc, cln=3) # K-means clustering -sc <- comptSNE(sc, max_iter=100) -sc <- KmeanOrder(sc, export = FALSE) -plotOrderKMtsne(sc) -} diff --git a/man/plotOrderMBtsne.Rd b/man/plotOrderMBtsne.Rd deleted file mode 100644 index 3d83b3f..0000000 --- a/man/plotOrderMBtsne.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DIscBIO-generic-plotOrderMBtsne.R -\name{plotOrderMBtsne} -\alias{plotOrderMBtsne} -\alias{plotOrderMBtsne,DISCBIO-method} -\title{Plotting the pseudo-time ordering based on Model-based clusters in the - t-SNE map} -\usage{ -plotOrderMBtsne(object) - -\S4method{plotOrderMBtsne}{DISCBIO}(object) -} -\arguments{ -\item{object}{\code{DISCBIO} class object.} -} -\value{ -A plot of the pseudo-time ordering. -} -\description{ -The tSNE representation can also be used to show the pseudo-time - ordering. -} -\examples{ -sc<- DISCBIO(valuesG1msReduced) -sc<-NoiseFiltering(sc,percentile=0.9, CV=0.2, export=FALSE) -sc<-Normalizedata(sc) -sc<-FinalPreprocessing(sc,GeneFlitering="NoiseF", export=FALSE) -sc <- Exprmclust(sc, K=2, reduce=TRUE, quiet=TRUE) -sc<- comptsneMB(sc, rseed=15555, quiet=TRUE, max_iter=100) -plotOrderMBtsne(sc) -} diff --git a/man/plotOrderTsne.Rd b/man/plotOrderTsne.Rd new file mode 100644 index 0000000..f53b524 --- /dev/null +++ b/man/plotOrderTsne.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DIscBIO-generic-plotOrderTsne.R +\name{plotOrderTsne} +\alias{plotOrderTsne} +\alias{plotOrderTsne,DISCBIO-method} +\title{Plotting the pseudo-time ordering in the t-SNE map} +\usage{ +plotOrderTsne(object) + +\S4method{plotOrderTsne}{DISCBIO}(object) +} +\arguments{ +\item{object}{\code{DISCBIO} class object.} +} +\value{ +A plot of the pseudo-time ordering. +} +\description{ +The tSNE representation can also be used to show the pseudo-time + ordering. +} diff --git a/man/plotSilhouette.Rd b/man/plotSilhouette.Rd index 0a88fd0..7e99e21 100644 --- a/man/plotSilhouette.Rd +++ b/man/plotSilhouette.Rd @@ -1,17 +1,13 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DIscBIO-generic-plotSilhouette.R, -% R/DIscBIO-generic-plottSNE.R +% Please edit documentation in R/DIscBIO-generic-plotSilhouette.R \name{plotSilhouette} \alias{plotSilhouette} \alias{plotSilhouette,DISCBIO-method} -\alias{plottSNE,DISCBIO-method} \title{Silhouette Plot for K-means clustering} \usage{ plotSilhouette(object, K) \S4method{plotSilhouette}{DISCBIO}(object, K) - -\S4method{plottSNE}{DISCBIO}(object) } \arguments{ \item{object}{\code{DISCBIO} class object.} @@ -30,9 +26,3 @@ The silhouette provides a representation of how well each point take values between -1 and 1 with higher values reflecting better representation of a point by its cluster. } -\examples{ -sc <- DISCBIO(valuesG1msReduced) # changes signature of data -sc <- Clustexp(sc, cln=3) # data must be clustered before plottin -sc <- comptSNE(sc, max_iter=100) -plotSilhouette(sc, K=3) -} diff --git a/man/plotSymbolstSNE.Rd b/man/plotSymbolstSNE.Rd index 9c520b7..0152863 100644 --- a/man/plotSymbolstSNE.Rd +++ b/man/plotSymbolstSNE.Rd @@ -25,9 +25,3 @@ Plot of tsne objet slot, grouped by gene. \description{ Visualizing the K-means clusters using tSNE maps } -\examples{ -sc <- DISCBIO(valuesG1msReduced) # changes signature of data -sc <- Clustexp(sc, cln=3) # data must be clustered before plottin -sc <- comptSNE(sc, max_iter=100, quiet=TRUE) -plotSymbolstSNE(sc,types=sub("(\\\\_\\\\d+)$","", names(sc@ndata))) -} diff --git a/man/plotexptsneMB.Rd b/man/plotexptsneMB.Rd deleted file mode 100644 index 7d03cec..0000000 --- a/man/plotexptsneMB.Rd +++ /dev/null @@ -1,47 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DIscBIO-generic-plotexptsneMB.R -\name{plotexptsneMB} -\alias{plotexptsneMB} -\alias{plotexptsneMB,DISCBIO-method} -\title{Highlighting gene expression in Model-based clustering in the t-SNE - map} -\usage{ -plotexptsneMB(object, g, n = NULL) - -\S4method{plotexptsneMB}{DISCBIO}(object, g, n = NULL) -} -\arguments{ -\item{object}{\code{DISCBIO} class object.} - -\item{g}{Individual gene name or vector with a group of gene names -corresponding to a subset of valid row names of the \code{ndata} slot of -the \code{DISCBIO} object.} - -\item{n}{String of characters representing the title of the plot. Default is -NULL and the first element of \code{g} is chosen.} -} -\value{ -t-SNE plot for one particular gene -} -\description{ -The t-SNE map representation can also be used to analyze - expression of a gene or a group of genes, to investigate cluster specific - gene expression patterns -} -\examples{ -\dontrun{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Exprmclust(sc, K=3) -sc <- comptsneMB(sc, max_iter=100) -sc <- Clustexp(sc, cln=3) -sc <- MB_Order(sc, export = FALSE) -g <- 'ENSG00000001460' -plotexptsneMB(sc, g) -} -} diff --git a/man/plotsilhouetteMB.Rd b/man/plotsilhouetteMB.Rd deleted file mode 100644 index c892fe8..0000000 --- a/man/plotsilhouetteMB.Rd +++ /dev/null @@ -1,39 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DIscBIO-generic-plotsilhouetteMB.R -\name{plotsilhouetteMB} -\alias{plotsilhouetteMB} -\alias{plotsilhouetteMB,DISCBIO-method} -\title{Silhouette Plot for Model-based clustering} -\usage{ -plotsilhouetteMB(object, K) - -\S4method{plotsilhouetteMB}{DISCBIO}(object, K) -} -\arguments{ -\item{object}{\code{DISCBIO} class object.} - -\item{K}{A numeric value of the number of clusters} -} -\value{ -A silhouette plot -} -\description{ -The silhouette provides a representation of how well each point - is represented by its cluster in comparison to the closest neighboring - cluster. It computes for each point the difference between the average - similarity to all points in the same cluster and to all points in the - closest neighboring cluster. This difference it normalize such that it can - take values between -1 and 1 with higher values reflecting better - representation of a point by its cluster. -} -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, - downsample=FALSE, dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Exprmclust(sc, K=2, reduce=TRUE, quiet=TRUE) -plotsilhouetteMB(sc, K=2) -} diff --git a/man/plottSNE.Rd b/man/plottSNE.Rd index 046c9f3..a69f68d 100644 --- a/man/plottSNE.Rd +++ b/man/plottSNE.Rd @@ -2,9 +2,12 @@ % Please edit documentation in R/DIscBIO-generic-plottSNE.R \name{plottSNE} \alias{plottSNE} -\title{tSNE map for K-means clustering} +\alias{plottSNE,DISCBIO-method} +\title{tSNE map} \usage{ plottSNE(object) + +\S4method{plottSNE}{DISCBIO}(object) } \arguments{ \item{object}{\code{DISCBIO} class object.} @@ -13,11 +16,5 @@ plottSNE(object) A plot of t-SNEs. } \description{ -Visualizing the K-means clusters using tSNE maps -} -\examples{ -sc <- DISCBIO(valuesG1msReduced) # changes signature of data -sc <- Clustexp(sc, cln=3) # data must be clustered before plotting -sc <- comptSNE(sc, max_iter=100, quiet=TRUE) -plottSNE(sc) +Visualizing the k-means or model-based clusters using tSNE maps } diff --git a/man/plottsneMB.Rd b/man/plottsneMB.Rd deleted file mode 100644 index 6e6a09b..0000000 --- a/man/plottsneMB.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DIscBIO-generic-plottsneMB.R -\name{plottsneMB} -\alias{plottsneMB} -\alias{plottsneMB,DISCBIO-method} -\title{tSNE map for Model-based clustering} -\usage{ -plottsneMB(object, K = length(table(object@MBclusters$clusterid))) - -\S4method{plottsneMB}{DISCBIO}(object, K = length(table(object@MBclusters$clusterid))) -} -\arguments{ -\item{object}{\code{DISCBIO} class object.} - -\item{K}{A numeric value of the number of clusters} -} -\value{ -A plot of t-SNEs. -} -\description{ -Visualizing the Model-based clusters using tSNE maps -} -\examples{ -sc <- DISCBIO(valuesG1msReduced) -sc <- NoiseFiltering(sc, percentile=0.9, CV=0.2, export=FALSE) -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE) -sc <- Exprmclust(sc, K=2) -sc <- comptsneMB(sc, rseed=15555, quiet=TRUE, max_iter=100) -plottsneMB(sc) -} diff --git a/man/prepExampleDataset.Rd b/man/prepExampleDataset.Rd new file mode 100644 index 0000000..b3345f2 --- /dev/null +++ b/man/prepExampleDataset.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/prepExampleDataset.R +\name{prepExampleDataset} +\alias{prepExampleDataset} +\title{Prepare Example Dataset} +\usage{ +prepExampleDataset(dataset, save = TRUE) +} +\arguments{ +\item{dataset}{Dataset used for transformation} + +\item{save}{save results?} +} +\value{ +Two rda files, ones for K-means clustering and another for +Model-based clustering. +} +\description{ +Internal function that prepares a pre-treated dataset for use in +several examples +} +\details{ +This function serves the purpose of treating datasets such as +valuesG1msReduced to reduce examples of other functions by bypassing some +analysis steps covered in the vignettes. +} +\author{ +Waldir Leoncio +} diff --git a/man/pseudoTimeOrdering.Rd b/man/pseudoTimeOrdering.Rd new file mode 100644 index 0000000..8a57157 --- /dev/null +++ b/man/pseudoTimeOrdering.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DIscBIO-generic-pseudoTimeOrdering.R +\name{pseudoTimeOrdering} +\alias{pseudoTimeOrdering} +\alias{pseudoTimeOrdering,DISCBIO-method} +\title{Pseudo-time ordering} +\usage{ +pseudoTimeOrdering( + object, + quiet = FALSE, + export = FALSE, + filename = "Cellular_pseudo-time_ordering" +) + +\S4method{pseudoTimeOrdering}{DISCBIO}( + object, + quiet = FALSE, + export = FALSE, + filename = "Cellular_pseudo-time_ordering" +) +} +\arguments{ +\item{object}{\code{DISCBIO} class object.} + +\item{quiet}{if `TRUE`, suppresses intermediary output} + +\item{export}{if `TRUE`, exports order table to csv} + +\item{filename}{Name of the exported file (if `export=TRUE`)} +} +\value{ +The DISCBIO-class object input with the kordering slot filled. +} +\description{ +This function takes the exact output of exprmclust function and + construct Pseudo-time ordering by mapping all cells onto the path that + connects cluster centers. +} diff --git a/man/rankcols.Rd b/man/rankcols.Rd new file mode 100644 index 0000000..b3abb74 --- /dev/null +++ b/man/rankcols.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/samr-adapted.R +\name{rankcols} +\alias{rankcols} +\title{Rank columns} +\usage{ +rankcols(x) +} +\arguments{ +\item{x}{x} +} +\description{ +Ranks the elements within each col of the matrix x and returns +these ranks in a matrix +} +\note{ +this function is equivalent to `samr::rankcol`, but uses `apply` to +rank the colums instead of a compiled Fortran function which was causing our +DEGanalysis functions to freeze in large datasets. +} diff --git a/man/reformatSiggenes.Rd b/man/reformatSiggenes.Rd new file mode 100644 index 0000000..279842e --- /dev/null +++ b/man/reformatSiggenes.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/reformatSiggenes.R +\name{reformatSiggenes} +\alias{reformatSiggenes} +\title{Reformat Siggenes Table} +\usage{ +reformatSiggenes(table) +} +\arguments{ +\item{table}{output from `samr::samr.compute.siggenes.table`} +} +\description{ +Reformats the Siggenes table output from the SAMR package +} +\seealso{ +replaceDecimals +} diff --git a/man/replaceDecimals.Rd b/man/replaceDecimals.Rd new file mode 100644 index 0000000..6e914bf --- /dev/null +++ b/man/replaceDecimals.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/replaceDecimals.R +\name{replaceDecimals} +\alias{replaceDecimals} +\title{Replace Decimals} +\usage{ +replaceDecimals(x, from = ",", to = ".") +} +\arguments{ +\item{x}{vector of characters} + +\item{from}{decimal separator on input file} + +\item{to}{decimal separator for output file} +} +\description{ +Replaces decimals separators between comma and periods on a +character vector +} +\note{ +This function was especially designed to be used with retormatSiggenes +} +\seealso{ +reformatSiggenes +} diff --git a/man/resa.Rd b/man/resa.Rd new file mode 100644 index 0000000..1a6dee8 --- /dev/null +++ b/man/resa.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/samr-adapted.R +\name{resa} +\alias{resa} +\title{Resampling} +\usage{ +resa(x, d, nresamp = 20) +} +\arguments{ +\item{x}{data matrix. nrow=#gene, ncol=#sample} + +\item{d}{estimated sequencing depth} + +\item{nresamp}{number of resamplings} +} +\value{ +xresamp: an rank array with dim #gene*#sample*nresamp +} +\description{ +Corresponds to `samr::resample` +} diff --git a/man/retrieveBiomart.Rd b/man/retrieveBiomart.Rd deleted file mode 100644 index 71cd9da..0000000 --- a/man/retrieveBiomart.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/retrieveBiomart.R -\name{retrieveBiomart} -\alias{retrieveBiomart} -\title{Retrieve data from BioMart} -\usage{ -retrieveBiomart(gene_name, quiet = FALSE, max_tries = 3) -} -\arguments{ -\item{gene_name}{gene signature} - -\item{quiet}{if `TRUE`, suppresses messages} - -\item{max_tries}{maximum number of times the function will try to reach the -database} -} -\value{ -data.frame resulting from a successful call to getBM. -} -\description{ -uses functions from the biomaRt package to retrieve dataframes - from the BioMart Database -} -\details{ -Since the BioMart database is not always accessible, this function - envelops the requests to the database in a set of tryCatch functions to - allow for multiple queries and easier feedback to the end user -} diff --git a/man/sammy.Rd b/man/sammy.Rd new file mode 100644 index 0000000..db447ff --- /dev/null +++ b/man/sammy.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/samr-adapted.R +\name{sammy} +\alias{sammy} +\title{Significance analysis of microarrays} +\usage{ +sammy( + data, + resp.type = c("Quantitative", "Two class unpaired", "Survival", "Multiclass", + "One class", "Two class paired", "Two class unpaired timecourse", + "One class timecourse", "Two class paired timecourse", "Pattern discovery"), + assay.type = c("array", "seq"), + s0 = NULL, + s0.perc = NULL, + nperms = 100, + center.arrays = FALSE, + testStatistic = c("standard", "wilcoxon"), + time.summary.type = c("slope", "signed.area"), + regression.method = c("standard", "ranks"), + return.x = FALSE, + knn.neighbors = 10, + random.seed = NULL, + nresamp = 20, + nresamp.perm = NULL, + xl.mode = c("regular", "firsttime", "next20", "lasttime"), + xl.time = NULL, + xl.prevfit = NULL +) +} +\arguments{ +\item{data}{Data object with components x- p by n matrix of features, one observation per column (missing values allowed); y- n-vector of outcome measurements; censoring.status- n-vector of censoring censoring.status (1= died or event occurred, 0=survived, or event was censored), needed for a censored survival outcome} + +\item{resp.type}{Problem type: "Quantitative" for a continuous parameter (Available for both array and sequencing data); "Two class unpaired" (for both array and sequencing data); "Survival" for censored survival outcome (for both array and sequencingdata); "Multiclass": more than 2 groups (for both array and sequencing data); "One class" for a single group (only for array data); "Two class paired" for two classes with paired observations (for both array and sequencing data); "Two class unpaired timecourse" (only for array data), "One class time course" (only for array data), "Two class.paired timecourse" (only for array data), or "Pattern discovery" (only for array data)} + +\item{assay.type}{Assay type: "array" for microarray data, "seq" for counts from sequencing} + +\item{s0}{Exchangeability factor for denominator of test statistic; Default is automatic choice. Only used for array data.} + +\item{s0.perc}{Percentile of standard deviation values to use for s0; default is automatic choice; -1 means s0=0 (different from s0.perc=0, meaning s0=zeroeth percentile of standard deviation values= min of sd values. Only used for array data.} + +\item{nperms}{Number of permutations used to estimate false discovery rates} + +\item{center.arrays}{Should the data for each sample (array) be median centered at the outset? Default =FALSE. Only used for array data.} + +\item{testStatistic}{Test statistic to use in two class unpaired case.Either "standard" (t-statistic) or ,"wilcoxon" (Two-sample wilcoxon or Mann-Whitney test). Only used for array data.} + +\item{time.summary.type}{Summary measure for each time course: "slope", or"signed.area"). Only used for array data.} + +\item{regression.method}{Regression method for quantitative case: "standard",(linear least squares) or "ranks" (linear least squares on ranked data). Only used for array data.} + +\item{return.x}{Should the matrix of feature values be returned? Only useful for time course data, where x contains summaries of the features over time. Otherwise x is the same as the input data data\$x} + +\item{knn.neighbors}{Number of nearest neighbors to use for imputation of missing features values. Only used for array data.} + +\item{random.seed}{Optional initial seed for random number generator (integer)} + +\item{nresamp}{For assay.type="seq", number of resamples used to construct test statistic. Default 20. Only used for sequencing data.} + +\item{nresamp.perm}{For assay.type="seq", number of resamples used to construct test statistic for permutations. Default is equal to nresamp and it must be at most nresamp. Only used for sequencing data.} + +\item{xl.mode}{Used by Excel interface} + +\item{xl.time}{Used by Excel interface} + +\item{xl.prevfit}{Used by Excel interface} +} +\description{ +This function is an adaptation of `samr::samr` +} diff --git a/man/samr.estimate.depth.Rd b/man/samr.estimate.depth.Rd new file mode 100644 index 0000000..75ab46e --- /dev/null +++ b/man/samr.estimate.depth.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/samr-adapted.R +\name{samr.estimate.depth} +\alias{samr.estimate.depth} +\title{Estimate sequencing depths} +\usage{ +samr.estimate.depth(x) +} +\arguments{ +\item{x}{data matrix. nrow=#gene, ncol=#sample} +} +\value{ +depth: estimated sequencing depth. a vector with len sample. +} +\description{ +Estimate sequencing depths +} diff --git a/man/valuesG1msReduced.Rd b/man/valuesG1msTest.Rd similarity index 81% rename from man/valuesG1msReduced.Rd rename to man/valuesG1msTest.Rd index 810c8d4..5249a1d 100644 --- a/man/valuesG1msReduced.Rd +++ b/man/valuesG1msTest.Rd @@ -1,12 +1,12 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/datasets.R \docType{data} -\name{valuesG1msReduced} -\alias{valuesG1msReduced} +\name{valuesG1msTest} +\alias{valuesG1msTest} \title{Single-cells data from a myxoid liposarcoma cell line} \description{ -A dataset of 30 single cells from a myxoid liposarcoma cell - line. Columns refer to samples and rows refer to genes. The last 92 rows +A sample of single cells from a myxoid liposarcoma cell + line. Columns refer to samples and rows refer to genes. The last rows refer to external RNA controls consortium (ERCC) spike-ins. This dataset is part of a larger dataset containing 94 single cells. The complete dataset is fully compatible with this package and an rda file can be obtained at diff --git a/man/wilcoxon.unpaired.seq.func.Rd b/man/wilcoxon.unpaired.seq.func.Rd new file mode 100644 index 0000000..aadb6ee --- /dev/null +++ b/man/wilcoxon.unpaired.seq.func.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/samr-adapted.R +\name{wilcoxon.unpaired.seq.func} +\alias{wilcoxon.unpaired.seq.func} +\title{Twoclass Wilcoxon statistics} +\usage{ +wilcoxon.unpaired.seq.func(xresamp, y) +} +\arguments{ +\item{xresamp}{an rank array with dim #gene*#sample*nresamp} + +\item{y}{outcome vector of values 1 and 2} +} +\value{ +the statistic. +} +\description{ +Twoclass Wilcoxon statistics +} diff --git a/tests/testthat/test-DIscBIO.IMP.R b/tests/testthat/test-DIscBIO.IMP.R new file mode 100644 index 0000000..4e9daa6 --- /dev/null +++ b/tests/testthat/test-DIscBIO.IMP.R @@ -0,0 +1,87 @@ +# ---------------------------------------------------------------------------- # +# Data pre-processing # +# ---------------------------------------------------------------------------- # + +context("Data loading and pre-processing") + +sc <- DISCBIO(valuesG1msTest) # Reduced dataset used for testing + +test_that("Loading datasets generate the expected output", { + expect_equal(dim(valuesG1msTest), c(800, 15)) +}) + +test_that("Data signature changes", { + expect_equal(class(sc)[1], "DISCBIO") + expect_equal(attr(class(sc), "package"), "DIscBIO") +}) + +# This function will be used only if the dataset has ERCC +sc <- NoiseFiltering(sc, plot=FALSE, export=FALSE, quiet=TRUE) + +test_that("Noise filtering is added", { + expect_equal(length(sc@noiseF), 163) +}) + +# In this case this function is used to normalize the reads +sc <- Normalizedata( + sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, + dsn=1, rseed=17000 +) + +test_that("Data is normalized", { + expect_equal(class(sc@fdata), "data.frame") + expect_output(str(sc@fdata), "708 obs. of 15 variables") +}) + +# This function can be used for: 1- filtering and normalizing the dataset that has no ERCC. 2- to normalize and filter genes and cells after the noise filtering. +sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE, quiet=TRUE) + +test_that("Data is normalized", { + expect_equal(dim(sc@fdata), c(163, 15)) +}) + +# ---------------------------------------------------------------------------- # +# K-means clustering # +# ---------------------------------------------------------------------------- # + +context("K-means clustering") + +sc <- Clustexp(sc, clustnr=2, cln=2, bootnr=10, quiet=TRUE, rseed=17000) +sc <- comptSNE(sc, rseed=15555, quiet=TRUE, max_iter=1, epoch=10) + +test_that("tSNE is computed", { + expect_equal(class(sc@tsne), "data.frame") + expect_output(str(sc@tsne), "15 obs. of 2 variables") +}) + +test_that("Cluster plots output is as expexted", { + expect_equivalent( + object = Jaccard(sc, Clustering="K-means", K=2, plot=FALSE, R=5), + expected = c(.417, .413), + tolerance = .01 + ) + expect_equal( + object = clustheatmap(sc, hmethod = "single", plot = FALSE), + expected = c(1, 2) + ) +}) + +# --------------------------------- Outliers --------------------------------- # + +context("Outliers") + +Outliers <- FindOutliers( + sc, K=2, outminc=5, outlg=2, probthr=.5*1e-3, thr=2 ^ (-1:-40), + outdistquant=.75, plot = FALSE, quiet = TRUE +) +Order <- KmeanOrder(sc, quiet = TRUE, export = FALSE) + +test_that("Outliers are the expected", { + expect_equivalent(Outliers, c(3, 10)) + expect_equivalent( + object = Order@kordering, + expected = c( + 5, 6, 13, 2, 3, 1, 4, 7, 12, 9, 15, 8, 10, 11, 14 + ) + ) +}) \ No newline at end of file diff --git a/tests/testthat/test.DIscBIO.IMP.R b/tests/testthat/test.DIscBIO.IMP.R deleted file mode 100644 index 8ade01f..0000000 --- a/tests/testthat/test.DIscBIO.IMP.R +++ /dev/null @@ -1,277 +0,0 @@ -# ---------------------------------------------------------------------------- # -# Data pre-processing # -# ---------------------------------------------------------------------------- # - -context("Data loading and pre-processing") - -sc <- DISCBIO(valuesG1msReduced) # Reduced dataset used for testing - -test_that("Loading datasets generate the expected output", { - expect_equal(dim(valuesG1msReduced), c(1092, 30)) -}) - -test_that("Data signature changes", { - expect_equal(class(sc)[1], "DISCBIO") - expect_equal(attr(class(sc), "package"), "DIscBIO") -}) - -# This function will be used only if the dataset has ERCC -sc <- NoiseFiltering(sc, plot=FALSE, export=FALSE, quiet=TRUE) - -test_that("Noise filtering is added", { - expect_equal(length(sc@noiseF), 341) -}) - -# In this case this function is used to normalize the reads -sc <- Normalizedata( - sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, - dsn=1, rseed=17000 -) - -test_that("Data is normalized", { - expect_equal(class(sc@fdata), "data.frame") - expect_output(str(sc@fdata), "1000 obs. of 30 variables") -}) - -# This function can be used for: 1- filtering and normalizing the dataset that has no ERCC. 2- to normalize and filter genes and cells after the noise filtering. -sc <- FinalPreprocessing(sc, GeneFlitering="NoiseF", export=FALSE, quiet=TRUE) - -test_that("Data is normalized", { - expect_equal(dim(sc@fdata), c(341, 30)) -}) - -# ---------------------------------------------------------------------------- # -# K-means clustering # -# ---------------------------------------------------------------------------- # - -context("K-means clustering") - -sc <- Clustexp(sc, cln=2, quiet=TRUE) # K-means clustering -sc <- comptSNE(sc, rseed=15555, quiet=TRUE, max_iter=10) - -test_that("tSNE is computed", { - expect_equal(class(sc@tsne), "data.frame") - expect_output(str(sc@tsne), "30 obs. of 2 variables") -}) - -test_that("Cluster plots output is as expexted", { - expect_equivalent( - object = Jaccard(sc, Clustering="K-means", K=2, plot=FALSE, R=5), - expected = c(.790, .653), - tolerance = .01 - ) - expect_equal( - object = KMclustheatmap(sc, hmethod = "single", plot = FALSE), - expected = c(1, 2) - ) -}) - -# --------------------------------- Outliers --------------------------------- # - -context("Outliers") - -Outliers <- FindOutliersKM( - sc, K=2, outminc=5, outlg=2, probthr=.5*1e-3, thr=2**-(1:40), - outdistquant=.75, plot = FALSE, quiet = TRUE -) - -# Adjusting outliers -outlg <- round(length(sc@fdata[, 1]) / 200) # The cell will be considered as an outlier if it has a minimum of 0.5% of the number of filtered genes as outlier genes. -Outliers2 <- FindOutliersKM( - sc, K=2, outminc=5, outlg=outlg, probthr=.5*1e-3, thr=2**-(1:40), - outdistquant=.75, plot = FALSE, quiet = TRUE -) -Order <- KmeanOrder(sc, quiet = TRUE, export = FALSE) - -test_that("Outliers are the expected", { - expect_equivalent(Outliers, c(3, 7, 19)) - expect_equivalent(Outliers2, c(3, 7, 19)) - expect_equivalent( - object = Order@kordering, - expected = c( - 23, 20, 6, 21, 27, 26, 24, 28, 10, 19, 15, 25, 16, 8, 14, 13, 22, 4, - 17, 2, 3, 18, 11, 29, 9, 5, 12, 1, 30, 7 - ) - ) -}) - -# --------------------- Differential Expression Analysis --------------------- # - -context("Differential Expression Analysis") - -# Binomial differential expression analysis -cdiff1 <- KMClustDiffGenes(sc, K=1, fdr=.2, export=FALSE, quiet=TRUE) - -# differential expression analysis between all clusters -cdiff2 <- DEGanalysis( - sc, Clustering="K-means", K=2, fdr=.2, name="Name", export=FALSE, - quiet=TRUE, plot=FALSE, nperms=5, nresamp=2 -) - -# differential expression analysis between two particular clusters. -cdiff3 <- DEGanalysis2clust( - sc, Clustering="K-means", K=2, fdr=.15, name="Name", First="CL1", - Second="CL2", export=FALSE, quiet=TRUE, plot=FALSE, nperms=5, nresamp=2 -) - -test_that("DEGs are calculated", { - expect_identical( - object = sapply(cdiff1, function(x) class(x)[1]), - expected = c("matrix", "data.frame") - ) - expect_identical( - object = sapply(cdiff2, function(x) class(x)[1]), - expected = c("matrix", "data.frame") - ) - expect_identical( - object = sapply(cdiff3, function(x) class(x)[1]), - expected = c("matrix", "data.frame") - ) -}) - -# Decision tree -sigDEG <- cdiff3[[1]] - -DATAforDT <- ClassVectoringDT( - sc, Clustering="K-means", K=2, First="CL1", Second="CL2", sigDEG, - quiet = TRUE -) - -j48dt <- J48DT(DATAforDT, quiet = TRUE, plot = FALSE) -j48dt_eval <- J48DTeval( - DATAforDT, num.folds=10, First="CL1", Second="CL2", quiet=TRUE -) -rpartDT <- RpartDT(DATAforDT, quiet = TRUE, plot = FALSE) -rpartEVAL <- RpartEVAL( - DATAforDT, num.folds=10, First="CL1", Second="CL2", quiet = TRUE -) - -test_that("Decision tree elements are defined", { - expect_output(str(DATAforDT), "3 obs. of 30 variables") - expect_s3_class(j48dt, "J48") - expect_s3_class(summary(j48dt), "Weka_classifier_evaluation") - expect_identical(j48dt_eval, c(TP = 14, FN = 4, FP = 3, TN = 9)) - expect_s3_class(rpartDT, "rpart") - expect_identical(rpartEVAL, c(TP = 16, FN = 2, FP = 4, TN = 8)) -}) - -# ---------------------------------------------------------------------------- # -# Model-based clustering # -# ---------------------------------------------------------------------------- # - -context("Model-based clustering") - -# Technically, this should be done before Clustexp, but it's ok in practice to -# apply it after K-means because it uses different slots. -sc <- Exprmclust(sc, K=2, quiet=TRUE) - -test_that("Model-based clustering elements are OK", { - expect_identical( - object = names(sc@MBclusters), - expected = c("pcareduceres", "MSTtree", "clusterid", "clucenter") - ) -}) - -sc <- comptsneMB(sc, rseed=15555, quiet=TRUE, max_iter=100) - -test_that("tSNE clustering works fine", { - expect_equal(dim(sc@MBtsne), c(30, 2)) -}) - -# --------------------------------- Outliers --------------------------------- # - -context("MB outliers") - -sc <- Clustexp(sc, cln=2, quiet=TRUE) - -Outliers <- FindOutliersMB( - sc, K=2, outminc=5, outlg=2, probthr=.5*1e-3, thr=2**-(1:40), - outdistquant=.75, plot = FALSE, quiet = TRUE -) -outlg <- round(length(sc@fdata[, 1]) / 200) # The cell will be considered as an outlier if it has a minimum of 0.5% of the number of filtered genes as outlier genes. -Outliers2 <- FindOutliersMB( - sc, K=2, outminc=5, outlg=outlg, probthr=.5*1e-3, thr=2**-(1:40), - outdistquant=.75, plot = FALSE, quiet = TRUE -) - -test_that("MB clustering and outliers work as expected", { - expect_equivalent( - object = Jaccard(sc, Clustering="MB", K=2, plot = FALSE, R=5), - expected = c(.819, .499), - tolerance = 0.01 - ) - expect_equivalent(Outliers, c(3, 4, 7)) - expect_equal(outlg, 2) - expect_equal(Outliers2, c("G1_12" = 3, "G1_18" = 4, "G1_21" = 7)) -}) - -sc <- MB_Order(sc, quiet = TRUE, export = FALSE) -mb_heat <- MBclustheatmap(sc, hmethod="single", plot = FALSE, quiet = TRUE) - -test_that("More MB things are OK", { - expect_equal( - object = sc@MBordering, - expected = c( - 8, 28, 27, 18, 21, 10, 29, 26, 17, 25, 5, 13, 12, 19, 16, 15, 23, - 20, 30, 14, 7, 9, 24, 22, 3, 2, 4, 6, 1, 11 - ) - ) - expect_equal(mb_heat, c(1, 2)) -}) - -# ----------------------------------- DEGs ----------------------------------- # - -context("MB DEGs") - -# Binomial DE analysis -cdiff1 <- MBClustDiffGenes(sc, K=2, fdr=.2, export=FALSE, quiet=TRUE) - -# DE analysis between all clusters -cdiff2 <- DEGanalysis( - sc, Clustering="MB", K=2, fdr=.2, name="Name", export=FALSE, quiet=TRUE, - plot = FALSE, nperms=5, nresamp=2 -) - -# differential expression analysis between particular clusters. -cdiff3 <- DEGanalysis2clust( - sc, Clustering="MB", K=2, fdr=.15, name="Name", First="CL1", Second="CL2", - export = FALSE, plot = FALSE, quiet = TRUE, nperms=5, nresamp=2 -) - -test_that("DEGs are calculated", { - expect_identical( - object = sapply(cdiff1, function(x) class(x)[1]), - expected = c("matrix", "data.frame") - ) - expect_identical( - object = sapply(cdiff2, function(x) class(x)[1]), - expected = c("matrix", "data.frame") - ) - expect_identical( - object = sapply(cdiff3, function(x) class(x)[1]), - expected = c("matrix", "data.frame") - ) -}) - -# Decision tree -sigDEG <- cdiff3[[1]] -DATAforDT <- ClassVectoringDT( - sc, Clustering="MB", K=2, First="CL1", Second="CL2", sigDEG, quiet = TRUE -) -j48dt <- J48DT(DATAforDT, quiet = TRUE, plot = FALSE) -j48dt_eval <- J48DTeval( - DATAforDT, num.folds=10, First="CL1", Second="CL2", quiet = TRUE -) -rpartDT <- RpartDT(DATAforDT, quiet = TRUE, plot = FALSE) -rpartEVAL <- RpartEVAL( - DATAforDT, num.folds=10, First="CL1", Second="CL2", quiet = TRUE -) - -test_that("Decision tree elements are defined", { - expect_output(str(DATAforDT), "29 obs. of 30 variables") # used to be 31 - expect_s3_class(j48dt, "J48") - expect_s3_class(summary(j48dt), "Weka_classifier_evaluation") - expect_identical(j48dt_eval, c(TP = 21, FN = 2, FP = 4, TN = 3)) - expect_s3_class(rpartDT, "rpart") - expect_identical(rpartEVAL, c(TP = 20, FN = 3, FP = 4, TN = 3)) -}) diff --git a/tests/testthat/test.integration.R b/tests/testthat/test.integration.R deleted file mode 100644 index df48a5c..0000000 --- a/tests/testthat/test.integration.R +++ /dev/null @@ -1,22 +0,0 @@ -# ============================================================================== -# Testing integration with SingleCellExperiment -# ============================================================================== -context("Converting other formats to DISCBIO") -# ------------------------------------------------------------------------------ -# Setting up datasets -# ------------------------------------------------------------------------------ -pmbc_seurat <- Seurat::pbmc_small -pmbc_sce <- Seurat::as.SingleCellExperiment(pmbc_seurat) -g1_sce <- SingleCellExperiment::SingleCellExperiment( - list(counts=as.matrix(valuesG1msReduced)) -) -# ------------------------------------------------------------------------------ -# Performing unit tests -# ------------------------------------------------------------------------------ -test_that("Pure text gets formatted as DISCBIO", { - expect_s4_class(as.DISCBIO(pmbc_seurat), "DISCBIO") -}) -test_that("SCE file gets formatted as DISCBIO", { - expect_s4_class(as.DISCBIO(pmbc_sce), "DISCBIO") - expect_s4_class(as.DISCBIO(g1_sce), "DISCBIO") -}) diff --git a/vignettes/.gitignore b/vignettes/.gitignore deleted file mode 100644 index 097b241..0000000 --- a/vignettes/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*.html -*.R diff --git a/vignettes/Clustering.png b/vignettes/Clustering.png deleted file mode 100644 index dc08512..0000000 Binary files a/vignettes/Clustering.png and /dev/null differ diff --git a/vignettes/DIscBIO-vignette.Rmd b/vignettes/DIscBIO-vignette.Rmd deleted file mode 100644 index e6d346f..0000000 --- a/vignettes/DIscBIO-vignette.Rmd +++ /dev/null @@ -1,712 +0,0 @@ ---- -title: "DIscBIO: a user-friendly pipeline for biomarker discovery in single-cell transcriptomics" -author: "Salim Ghannoum" -date: "`r Sys.Date()`" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{DIscBIO tutorial} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -![DIscBIO](DiscBIO.png){width=90%} - -The pipeline consists of four successive steps: data pre-processing, cellular clustering and pseudo-temporal ordering, determining differential expressed genes and identifying biomarkers. - -## Required Packages - -```{r options, echo=FALSE} -knitr::opts_chunk$set(fig.width=7, fig.height=7) -``` - -```{r} -library(DIscBIO) -library(enrichR, quiet=TRUE) -``` - -## Loading dataset - -The dataset should be formatted in a data frame where columns refer to samples and rows refer to genes. - - -```{r} -FileName<-data("valuesG1msReduced") -DataSet<-get(FileName) -``` - -## 1. Data Pre-processing - -Prior to applying data analysis methods, it is standard to pre-process the raw read counts resulted from the sequencing. The preprocessing approach depends on the existence or absence of ERCC spike-ins. In both cases, it includes normalization of read counts and gene filtering. - -#### Normalization of read counts -To account for RNA composition and sequencing depth among samples (single-cells), the normalization method “median of ratios” is used. This method takes the ratio of the gene instantaneous median to the total counts for all genes in that cell (column median). The gene instantaneous median is the product of multiplying the median of the total counts across all cells (row median) with the read of the target gene in each cell. This normalization method makes it possible to compare the normalized counts for each gene equally between samples. - -#### Gene filtering -The key idea in filtering genes is to appoint the genes that manifest abundant variation across samples. Filtering genes is a critical step due to its dramatic impact on the downstream analysis. In case the raw data includes ERCC spike-ins, genes will be filtered based on variability in comparison to a noise level estimated from the ERCC spike-ins according to an algorithm developed by Brennecke et al (Brennecke et al., 2013). This algorithm utilizes the dependence of technical noise on the average read count and fits a model to the ERCC spike-ins. Further gene filtering can be implemented based on gene expression. -In case the raw data does not include ERCC spike-ins, genes will be only filtered based on minimum expression in certain number of cells. - - -![DataPre-processing](Data-Preprocessing.png){width=90%} - -```{r} -sc<- DISCBIO(DataSet) -``` - -### 1.1. Filtering and normalizing the raw data that includes ERCCs - -Filtering the raw data that includes ERCCs can be done by applying the “NoiseFiltering” function, which includes several parameters: - - object: the outcome of running the DISCBIO() function. - - percentile: A numeric value of the percentile. It is used to validate the ERCC spik-ins. Default is 0.8. - - CV: A numeric value of the coefficient of variation. It is used to validate the ERCC spik-ins. Default is 0.5. - - geneCol: Color of the genes that did not pass the filtration. - - FgeneCol: Color of the genes that passt the filtration. - - erccCol: Color of the ERCC spik-ins. - - Val: A logical vector that allows plotting only the validated ERCC spike-ins. Default is TRUE. If Val=FALSE will - plot all the ERCC spike-ins. - - plot: A logical vector that allows plotting the technical noise. Default is TRUE. - - export: A logical vector that allows writing the final gene list in excel file. Default is TRUE. - - quiet: if `TRUE`, suppresses printed output - -To normalize the raw sequencing reads the function Normalizedata() should be used, this function takes 8 parameters. -- In case the user would like just to normalize the reads without any further gene filtering the parameters minexpr and minnumber should be set to 0. -- In case the user would like just to normalize the reads and run gene filtering based on gene expression the parameters minexpr and minnumber should have values. This function will discard cells with less than mintotal transcripts. Genes that are not expressed at minexpr transcripts in at least minnumber cells are discarded. - -The function Normalizedata() normalizes the count reads using the normalization method “median of ratios” - -To Finalize the preprocessing the function FinalPreprocessing() should be implemented by setting the parameter "GeneFlitering" to NoiseF ( whether the dditional gene filtering step based on gene expression was implemented on not). - - -```{r} -sc<-NoiseFiltering(sc,percentile=0.9, CV=0.2) - -#### Normalizing the reads without any further gene filtering -sc<-Normalizedata(sc, mintotal=1000, minexpr=0, minnumber=0, maxexpr=Inf, downsample=FALSE, dsn=1, rseed=17000) - -#### Additional gene filtering step based on gene expression -#MIinExp<- mean(rowMeans(DataSet,na.rm=TRUE)) -#MIinExp -#MinNumber<- round(length(DataSet[1,])/3) # To be expressed in at least one third of the cells. -#MinNumber -#sc<-Normalizedata(sc, mintotal=1000, minexpr=MIinExp, minnumber=MinNumber, maxexpr=Inf, downsample=FALSE, dsn=1, rseed=17000) - -sc<-FinalPreprocessing(sc,GeneFlitering="NoiseF",export=TRUE) ### The GeneFiltering can be either "NoiseF" or"ExpF" -``` - -### 1.2. Filtering and normalizing the raw data that does not include ERCCs - -To normalize and filter the raw data that does not include ERCCs can be done by applying the function Normalizedata() and giving the parameters minexpr and minnumber some values. This function will discard cells with less than mintotal transcripts. Genes that are not expressed at minexpr transcripts in at least minnumber cells are discarded. Furthermore, it will normalize the count reads using the normalization method “median of ratios”. - -To Finalize the preprocessing the function FinalPreprocessing() should be implemented by setting the parameter "GeneFlitering" to ExpF. - -### ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤ Running this cell will overwrite the previous cell ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤ - - -```{r} -#OnlyExpressionFiltering=TRUE -OnlyExpressionFiltering=FALSE - -if (OnlyExpressionFiltering==TRUE){ - MIinExp <- mean(rowMeans(DataSet,na.rm=TRUE)) - MIinExp - MinNumber<- round(length(DataSet[1,])/3) # To be expressed in at least one third of the cells. - MinNumber - sc<-Normalizedata(sc, mintotal=1000, minexpr=MIinExp, minnumber=MinNumber, maxexpr=Inf, downsample=FALSE, dsn=1, rseed=17000) #### In this case this function is used to filter out genes and cells. - sc<-FinalPreprocessing(sc,GeneFlitering="ExpF",export=TRUE) -} -``` - -## 2. Cellular Clustering and Pseudo Time ordering - -Cellular clustering is performed according to the gene expression profiles to detect cellular sub-population with unique properties. After clustering, pseudo-temporal ordering is generated to indicate the cellular differentiation degree. - -![PSCAN](Clustering.png){width=90%} - -## 2.1. K-means Clustering - -Rare cell type Identification algorithm (RaceID) was used to cluster the pre-processed data using k-means on a similarity distance matrix, which was based on Pearson correlation and the similarity matrix was computed as “1 – Pearson correlation”. The approach of the proposed clustering, i.e., applying k-means on a similarity distance matrix using the Euclidean metric, improves cluster separation. RaceID estimates the number of clusters by finding the minimal clusters' number at the saturation level of gap statistics, which standardizes the within-cluster dispersion. - -The Clustexp() functions has several parameters: -- object: the outcome of running the DISCBIO() function. -- clustnr Maximum number of clusters for the derivation of the cluster number by the saturation of mean within-cluster dispersion. Default is 20. -- bootnr A numeric value of booststrapping runs for \code{clusterboot}. Default is 50. -- metric Is the method to transform the input data to a distance object. -- Metric has to be one of the following: ["spearman","pearson","kendall","euclidean","maximum","manhattan","canberra","binary","minkowski"]. -- do.gap A logical vector that allows generating the number of clusters based on the gap statistics. Default is TRUE. -- SE.method The SE.method determines the first local maximum of the gap statistics. -- The SE.method has to be one of the following:["firstSEmax","Tibs2001SEmax","globalSEmax","firstmax","globalmax"]. Default is "Tibs2001SEmax" -- SE.factor A numeric value of the fraction of the standard deviation by which the local maximum is required to differ from the neighboring points it is compared to. Default is 0.25. -- B.gap Number of bootstrap runs for the calculation of the gap statistics. Default is 50 -- cln Number of clusters to be used. Default is \code{NULL} and the cluster number is inferred by the saturation criterion. -- rseed Integer number. Random seed to enforce reproducible clustering results. Default is 17000. -- quiet if `TRUE`, intermediate output is suppressed - -![DIsccBIO](KM1.png){width=90%} - - -```{r} -sc<- Clustexp(sc,cln=2,quiet=TRUE) #### K-means clustering to get three clusters -plotGap(sc) ### Plotting gap statisticssc<- Clustexp(sc, clustnr=20,bootnr=50,metric="pearson",do.gap=TRUE,SE.method="Tibs2001SEmax",SE.factor=.25,B.gap=50,cln=K,rseed=17000) -``` - -#### 2.1.1. Defining the Cells in the clusters generated by k-means clustering - - -```{r} -sc<- comptSNE(sc,rseed=15555,quiet=TRUE) -cat("\t"," Cell-ID"," Cluster Number","\n") -sc@cpart -``` - -#### 2.1.2- Evaluating the stability and consistency of the clusters - -PSCAN enables the robustness assessment of the detected clusters in terms of stability and consistency using Jaccard’s similarity statistics and silhouette coefficients. Jaccard’s similarity index provides a comparison of members among clusters to evaluate the stability of the clusters with a range from 0% to 100%. The higher the percentage, the more stable the cluster is. Silhouette coefficients estimate how close each sample in one cluster is to samples in the neighboring clusters, reflecting the consistency of each cluster with a range of [-1, 1]. The higher the cluster mean coefficient, the more consistent the cluster is. - - -```{r} -# Silhouette of k-means clusters -par(mar=c(6,2,4,2)) -plotSilhouette(sc,K=2) # K is the number of clusters -``` - -```{r} -Jaccard(sc,Clustering="K-means", K=2, plot=TRUE) # Jaccard of k-means clusters -``` - -#### 2.1.3. Cluster plotting using tSNE maps - -Here you visualize the K-means clusters using t-distributed stochastic neighbor embedding (tSNE), which is a non-linear dimensionality reduction method that places neighbor cells close to each other. - - -```{r} -############ Plotting K-means clusters -plottSNE(sc) -plotKmeansLabelstSNE(sc) # To plot the the ID of the cells in eacj cluster -plotSymbolstSNE(sc,types=sub("(\\_\\d+)$","", names(sc@ndata))) # To plot the the ID of the cells in eacj cluster -``` - -### Defining outlier cells based on K-means Clustering - -Outlier identification is implemented using a background model based on distribution of transcript counts within a cluster. Outlier cells are detected if the probability for that cell c that a minimum number of genes Gmin of observing total counts TGmin is less than a specific threshold Pthr, as given by the red dotted line. Outlier cells in K-means clusters can be detected by running the FindOutliersKM() function. - -In case the user decided to remove outlier cells, the user should set RemovingOutliers to TRUE and then start from the beginning (Data Pre-processing). - - -```{r} -outlg<-round(length(sc@fdata[,1])/200) # The cell will be considered as an outlier if it has a minimum of 0.5% of the number of filtered genes as outlier genes. -Outliers<- FindOutliersKM(sc, K=2, outminc=5,outlg=outlg,probthr=.5*1e-3,thr=2**-(1:40),outdistquant=.75, plot=TRUE, quiet=FALSE) - -RemovingOutliers=FALSE -# RemovingOutliers=TRUE # Removing the defined outlier cells based on K-means Clustering - -if(RemovingOutliers==TRUE){ - names(Outliers)=NULL - Outliers - DataSet=DataSet[-Outliers] - dim(DataSet) - colnames(DataSet) - cat("Outlier cells were removed, now you need to start from the beginning") -} -``` - -#### 2.1.4. Cellular pseudo-time ordering based on k-means clusters - -```{r} -sc<-KmeanOrder(sc,quiet=FALSE, export=TRUE) -plotOrderKMtsne(sc) -``` - -#### 2.1.5 Plotting the K-means clusters in heatmap - -The similarities between single-cells were computed by Euclidean distances of Pearson transcriptome correlation matrix. Based on these similarities, a heatmap portrayal of cell-to-cell distances was plotted using Euclidean as the distance measure and the single linkage as the clustering method, cluster centers were ordered by hierarchic clustering. - - -```{r} -KMclustheatmap(sc,hmethod="single", plot=TRUE) -``` - -#### 2.1.6 Plotting the gene expression of a particular gene in a tSNE map - - -```{r} -g='ENSG00000010244' #### Plotting the expression of MT-RNR2 -plotExptSNE(sc,g) -``` - -## 3. Determining differentially expressed genes (DEGs) For K-means Clustering - -Differentially expressed genes between individual clusters are identified using the significance analysis of sequencing data (SAMseq), which is a new function in significance analysis of microarrays (Li and Tibshirani 2011) in the samr package v2.0 (Tibshirani et all., 2015). SAMseq is a non-parametric statistical function dependent on Wilcoxon rank statistic that equalizes the sizes of the library by a resampling method accounting for the various sequencing depths. The analysis is implemented over the pure raw dataset that has the unnormalized expression read counts after excluding the ERCCs. Furthermore, DEGs in each cluster comparing to all the remaining clusters are determined using binomial differential expression, which is based on binomial counting statistics. - -![DIsccBIO](KM2.png){width=90%} - -## 3.1 Identifying DEGs using SAMseq - -The user can define DEGs between all clusters generated by either K-means or model based clustering by applying the “DEGanalysis” function. Another alternative is to define DEGs between particular clusters generated by K-means clustering by applying the “DEGanalysis2clust” function. The outcome of these two functions is a list of two components: -- The first component is a data frame showing the Ensembl gage name and the symbole of the detected DEGs -- The second component is table showing the comparisons, Target cluster, Number of genes and the File name. This component will be used for the downstream analysis. - -### 3.1.1 Determining DEGs between two particular clusters - -```{r degKM} -####### differential expression analysis between cluster 1 and cluster 3 of the Model-Based clustering using FDR of 0.05 -cdiff<-DEGanalysis2clust(sc,Clustering="K-means",K=2,fdr=0.05,name="Name",First="CL1",Second="CL2",export=TRUE,quiet=TRUE) -``` - -```{r} -#### To show the result table -head(cdiff[[1]]) # The first component -head(cdiff[[2]]) # The second component -``` - -### 3.1.2 Determining DEGs between all clusters - - -## ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤ Running this cell will overwrite the previous one ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤ - - -```{r} -cdiff<-DEGanalysis(sc,Clustering="K-means",K=2,fdr=0.05,name="all_clusters",export=TRUE,quiet=TRUE) ####### differential expression analysis between all clusters -``` - -```{r} -#### To show the result table -head(cdiff[[1]]) # The first component -head(cdiff[[2]]) # The second component -``` - -#### 3.2 Identifying DEGs using binomial differential expression - -The function KMClustDiffGenes identifies differentially regulated genes for each cluster of the K-means clustering in -comparison to the ensemble of all cells. It returns a list with a data.frame element for each cluster that contains the mean expression across all cells not in the cluster (mean.ncl) and in the cluster (mean.cl), the fold-change in the cluster versus all remaining cells (fc), and the p-value for differential expression between all cells in a cluster and all remaining cells. The p-value is computed based on the overlap of negative binomials fitted to the count distributions within the two groups akin to DESeq. - - -```{r} -cdiffBinomial<-KMClustDiffGenes(sc,K=2,fdr=.01,,quiet=TRUE) ########## Binomial differential expression analysis -#### To show the result table -head(cdiffBinomial[[1]]) # The first component -head(cdiffBinomial[[2]]) # The second component -``` - -### Plotting the DEGs - -Volcano plots are used to readily show the DEGs by plotting significance versus fold-change on the y and x axes, respectively. - -```{r} -name <- cdiffBinomial[[2]][2, 4] # Selecting the DEGs table -U <- read.csv(file=paste0(name), head=TRUE, sep=",") -Vplot <- VolcanoPlot(U, value=0.001, name=name, FS=.5, fc=1) -``` - -In case the output of this function is the error "Error in plot.window(...): need finite 'ylim' values", `adj` should be set to TRUE. - -In case the user would like to get the names of the significant DEGs, then FS should be set to a value of 0.3 or higher. - -## 4. Identifying biomarkers (decision trees and networking analysis) - -There are several methods to identify biomarkers, among them are decision trees and hub detection through networking analysis. The outcome of STRING analysis is stored in tab separated values (TSV) files. These TSV files served as an input to check both the connectivity degree and the betweenness centrality, which reflects the communication flow in the defined PPI networks - -![DIsccBIO](KM3.png){width=90%} - -Decision trees are one of the most efficient classification techniques in biomarkers discovery. Here we use it to predict the sub-population of a target cell based on transcriptomic data. Two types of decision trees can be performed: classification and regression trees (CART) and J48. The decision tree analysis is implemented over a training dataset, which consisted of the DEGs obtained by either SAMseq or the binomial differential expression. The performance of the generated trees can be evaluated for error estimation by ten-fold cross validation assessment using the "J48DTeval" and "RpartEVAL" functions. The decision tree analysis requires the dataset to be class vectored by applying the “ClassVectoringDT” function. - -```{r} -#sigDEG<-cdiff[[1]] # DEGs gene list from SANseq -sigDEG<-cdiffBinomial[[1]] # DEGs gene list from Binomial analysis -First="CL1" -Second="CL2" -DATAforDT<-ClassVectoringDT(sc,Clustering="K-means",K=3,First=First,Second=Second,sigDEG) -``` - -### 4.1. J48 Decision Tree - - -```{r} -j48dt<-J48DT(DATAforDT) -summary(j48dt) -``` - -#### 4.1.1. Evaluating the performance of the J48 Decision Tree - - -```{r} -j48dt<-J48DTeval(DATAforDT,num.folds=10,First=First,Second=Second) -``` - -### 4.2. RPART Decision Tree - - -```{r} -rpartDT <- RpartDT(DATAforDT) -``` - -#### 4.2.1. Evaluating the performance of the RPART Decision Tree - - -```{r} -rpartEVAL<-RpartEVAL(DATAforDT,num.folds=10,First=First,Second=Second) -``` - -### 4.3. Networking Analysis - -To define protein-protein interactions (PPI) over a list of genes, STRING-api is used. The outcome of STRING analysis was stored in tab separated values (TSV) files. These TSV files served as an input to check both the connectivity degree and the betweenness centrality, which reflects the communication flow in the defined PPI networks. - -### 4.3.1 All DEGs - - -```{r} -DEGs="All DEGs" -FileName=paste0(DEGs) - -data<-cdiffBinomial[[1]] [,2] # DEGs gene list from Binomial analysis (taking only the firat 100 genes) -#data<-cdiff[[1]] [,2] # From the table of the differential expression analysis between all pairs of clusters - -ppi<-PPI(data,FileName) - -networking<-NetAnalysis(ppi) -networking ##### In case the Examine response components=200 and an error "linkmat[i, ]" appeared, that means there are no PPI. -``` - -```{r} -data=networking[1:25,1] # plotting the network of the top 25 highly connected genes -network<-Networking(data,FileName,plot_width=25, plot_height=10) -``` - -### 4.3.2 Particular set of DEGs - - -```{r} -############ Selecting the DEGs' table ############## -DEGs=cdiffBinomial[[2]][1,4] # Up-regulated genes in cluster 1 (from the Binomial analysis) -#DEGs=cdiff[[2]][1,4] # Up-regulated genes in cluster 1 (from SAMseq) -FileName=paste0(DEGs) - -data<-read.csv(file=paste0(DEGs),head=TRUE,sep=",") -data<-data[,3] - -ppi<-PPI(data,FileName) - -networking<-NetAnalysis(ppi) -networking ##### In case the Examine response components=200 and an error "linkmat[i, ]" appeared, that means there are no PPI. - -network<-Networking(data,FileName) -``` - -### 4.4 Gene Enrichment Analysis - -```{r} -dbs <- listEnrichrDbs() -head(dbs) -#print(dbs) -``` - -```{r} -############ Selecting the DEGs' table ############## -DEGs=cdiffBinomial[[2]][1,4] # Up-regulated genes in cluster 1 (from the Binomial analysis) -#DEGs=cdiff[[2]][1,4] # Up-regulated genes in cluster 1 (from SAMseq) - -data<-read.csv(file=paste0(DEGs),head=TRUE,sep=",") -data<-as.character(data[,3]) - -dbs <- c("KEGG_2013","GO_Molecular_Function_2015", "GO_Biological_Process_2015") -enriched <- enrichr(data, dbs) -KEGG_2013<-enriched[[1]][,c(1,2,3,9)] -GO_Molecular_Function_2015<-enriched[[2]][,c(1,2,3,9)] -GO_Biological_Process_2015<-enriched[[3]][,c(1,2,3,9)] - -GEA<-rbind(KEGG_2013,GO_Molecular_Function_2015,GO_Biological_Process_2015) -GEA -``` - -## 2.2. Model-Based clustering - -![DIsccBIO](MB1.png){width=90%} - -Model-based clustering assumes that the data are generated by a model and attempts to recover the original model from the data to define cellular clusters. The steps reproduced here are comparable to the ones on Section [2.1](#21-k-means-clustering), and the output will be suppressed for brevity. - -#### 2.2.1. Defining the Cells in the clusters generated by model-based clustering - - -```{r, eval=FALSE} -sc <- Exprmclust(sc,K=3,reduce=TRUE,quiet=TRUE) -cat("\t"," Cell-ID"," Cluster Number","\n") -sc@MBclusters$clusterid -``` - -#### 2.2.2. Cluster plotting using PCA and tSNE maps - -To visualize the detected clusters, two common dimensionality reduction tools are implemented: tSNE map and principal component analysis (PCA), which is a linear dimensionality reduction method that preserves the global structure and shows how the measurements themselves are related to each other. - - -```{r, eval=FALSE} -PlotmclustMB(sc) -PCAplotSymbols(sc) -``` - -```{r, eval=FALSE} -# Plotting the model-based clusters in tSNE maps -sc<- comptsneMB(sc,rseed=15555,quiet=TRUE) -plottsneMB(sc,K=3) -plotMBLabelstSNE(sc) -``` - -#### 2.2.3. Evaluating the stability and consistancy of the clusters - - -```{r, eval=FALSE} -# Silhouette of Model-based clusters -par(mar=c(6,2,4,2)) -plotsilhouetteMB(sc,K=3) -``` - -```{r, eval=FALSE} -sc<- Clustexp(sc,cln=3,quiet=TRUE) -Jaccard(sc,Clustering="MB", K=3, plot=TRUE) # Jaccard of Model based clusters -``` - -### Defining outlier cells based on Model-Based Clustering - - -```{r, eval=FALSE} -outlg<-round(length(sc@fdata[,1])/200) # The cell will be considered as an outlier if it has a minimum of 0.5% of the number of filtered genes as outlier genes. -Outliers<- FindOutliersMB(sc, K=3, outminc=5,outlg=outlg,probthr=.5*1e-3,thr=2**-(1:40),outdistquant=.75, plot=TRUE, quiet=TRUE) - -RemovingOutliers=FALSE -# RemovingOutliers=TRUE # Removing the defined outlier cells based on K-means Clustering - -if(RemovingOutliers==TRUE){ - names(Outliers)=NULL - Outliers - DataSet=DataSet[-Outliers] - dim(DataSet) - colnames(DataSet) - cat("Outlier cells were removed, now you need to start from the beginning") -} -``` - -### 2.2.4. Cellular pseudo-time ordering based on Model-based clusters - - -```{r, eval=FALSE} -sc<-MB_Order(sc,quiet=FALSE, export=TRUE) -``` - -#### 2.2.4.1 Plotting the pseudo-time ordering in a PCA plot - - -```{r, eval=FALSE} -PlotMBorderPCA(sc) -``` - - -#### 2.2.4.2 Plotting the pseudo-time ordering in a tSNE map - - -```{r, eval=FALSE} -plotOrderMBtsne(sc) -``` - - -#### 2.2.4.3 Plotting the Model-based clusters in heatmap - - -```{r, eval=FALSE} -MBclustheatmap(sc,hmethod="single", plot=TRUE) -``` - -#### 2.2.4.4 Plotting the gene expression of a particular gene in a tSNE map - - -```{r, eval=FALSE} -g='ENSG00000010244' #### Plotting the expression of MT-RNR2 -plotexptsneMB(sc,g) -``` - -#### 2.2.4.5 Plotting the gene expression of a particular gene in a PCA plot - - -```{r, eval=FALSE} -g <- "ENSG00000010244" #### Plotting the expression of MT-RNR2 -PlotMBexpPCA(sc,g) -``` - -# 3. Determining differentially expressed genes (DEGs) - -Differentially expressed genes between individual clusters are identified using the significance analysis of sequencing data (SAMseq), which is a new function in significance analysis of microarrays (Li and Tibshirani 2011) in the samr package v2.0 (Tibshirani et all., 2015). SAMseq is a non-parametric statistical function dependent on Wilcoxon rank statistic that equalizes the sizes of the library by a resampling method accounting for the various sequencing depths. The analysis is implemented over the pure raw dataset that has the unnormalized expression read counts after excluding the ERCCs. Furthermore, DEGs in each cluster comparing to all the remaining clusters are determined using binomial differential expression, which is based on binomial counting statistics. - -![DIsccBIO](MB2.png){width=90%} - -## 3.1 Identifying DEGs using SAMseq - -The user can define DEGs between all clusters generated by either K-means or model based clustering by applying the “DEGanalysis” function. Another alternative is to define DEGs between particular clusters generated by either K-means or model based clustering by applying the “DEGanalysis2clust” function. The outcome of these two functions is a list of two components: -- The first component is a data frame showing the Ensembl gage name and the symbole of the detected DEGs -- The second component is table showing the comparisons, Target cluster, Number of genes and the File name. This component will be used for the downstream analysis. - -### 3.1.1 Determining DEGs between two particular clusters - - -```{r, eval=FALSE} -####### differential expression analysis between cluster 2 and cluster 3 of the Model-Based clustering using FDR of 0.05 -MBcdiff<-DEGanalysis2clust(sc,Clustering="MB",K=3,fdr=0.1,name="Name",First="CL1",Second="CL2",export=TRUE,quiet=FALSE) -#### To show the result table -head(MBcdiff[[1]]) # The first component -head(MBcdiff[[2]]) # The second component -``` - -### 3.1.2 Determining DEGs between all clusters - -## ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤ Running this cell will overwrite the previous one ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤ - - -```{r, eval=FALSE} -MBcdiff<-DEGanalysis(sc,Clustering="MB",K=3,fdr=0.05,name="all_clusters",export=TRUE,quiet=TRUE) ####### differential expression analysis between all clusters -#### To show the result table -head(MBcdiff[[1]]) # The first component -head(MBcdiff[[2]]) # The second component -``` - -```{r, eval=FALSE} -#### To show the result table -head(MBcdiff[[1]]) # The first component -head(MBcdiff[[2]]) # The second component -``` - -#### 3.2 Identifying DEGs using binomial differential expression - -The function MBClustDiffGenes identifies differentially regulated genes for each cluster of the Model-Based clustering in -comparison to the ensemble of all cells. It returns a list with a data.frame element for each cluster that contains the mean expression across all cells not in the cluster (mean.ncl) and in the cluster (mean.cl), the fold-change in the cluster versus all remaining cells (fc), and the p-value for differential expression between all cells in a cluster and all remaining cells. The p-value is computed based on the overlap of negative binomials fitted to the count distributions within the two groups akin to DESeq. - - -```{r, eval=FALSE} -MBcdiffBinomial<-MBClustDiffGenes(sc,K=3,fdr=.01,export=TRUE, quiet=TRUE) ########## Binomial differential expression analysis -#### To show the result table -head(MBcdiffBinomial[[1]]) # The first component -head(MBcdiffBinomial[[2]]) # The second component -``` - -### Plotting the DEGs - -Volcano plots are used to readily show the DEGs by plotting significance versus fold-change on the y and x axes, respectively. - - -```{r, eval=FALSE} -name<-MBcdiffBinomial[[2]][2,4] ############ Selecting the "Up-DEG-cluster2.csv " from the DEGs' binomial table ############## -#name<-MBcdiff[[2]][4,6] ############ Selecting the "Up-DEG-cluster2.csv " from the DEGs' SAMseq table ############## -U<-read.csv(file=paste0(name),head=TRUE,sep=",") -Vplot<-VolcanoPlot(U,value=0.001,name=name,FS=.5,fc=0.5) -#### In case the output of this function is an error "Error in plot.window(...): need finite 'ylim' values", the adj should be set to TRUE -### In case the user would like to get the names of the significant DEGs then FS should be set to a value of 0.3 or higher. -``` - -## 4. Identifying biomarkers (decision trees and networking analysis) - -There are several methods to identify biomarkers, among them are decision trees and hub detection through networking analysis. The outcome of STRING analysis is stored in tab separated values (TSV) files. These TSV files served as an input to check both the connectivity degree and the betweenness centrality, which reflects the communication flow in the defined PPI networks - -![DIsccBIO](MB3.png){width=90%} - -Decision trees are one of the most efficient classification techniques in biomarkers discovery. Here we use it to predict the sub-population of a target cell based on transcriptomic data. Two types of decision trees can be performed: classification and regression trees (CART) and J48. The decision tree analysis is implemented over a training dataset, which consisted of the DEGs obtained by either SAMseq or the binomial differential expression. The performance of the generated trees can be evaluated for error estimation by ten-fold cross validation assessment using the "J48DTeval" and "RpartEVAL" functions. -The decision tree analysis requires the dataset to be class vectored by applying the “ClassVectoringDT” function. - - -```{r, eval=FALSE} -sigDEG<-cdiff[[1]] # DEGs gene list from SANseq -#sigDEG<-cdiffBinomial[[1]] # DEGs gene list from Binomial analysis -First="CL1" -Second="CL2" -DATAforDT<-ClassVectoringDT(sc,Clustering="MB",K=3,First=First,Second=Second,sigDEG) -``` - -### 4.1. J48 Decision Tree - - -```{r, eval=FALSE} -j48dt<-J48DT(DATAforDT) -summary(j48dt) -``` - -#### 4.1.1. Evaluating the performance of the J48 Decision Tree - - -```{r, eval=FALSE} -j48dt<-J48DTeval(DATAforDT,num.folds=10,First=First,Second=Second) -``` - -### 4.2. RPART Decision Tree - - -```{r, eval=FALSE} -rpartDT<-RpartDT(DATAforDT) -``` - -#### 4.2.1. Evaluating the performance of the RPART Decision Tree - - -```{r, eval=FALSE} -rpartEVAL<-RpartEVAL(DATAforDT,num.folds=10,First=First,Second=Second) -``` - -### 4.3. Networking Analysis - -To define protein-protein interactions (PPI) over a list of genes, STRING-api is used. The outcome of STRING analysis was stored in tab separated values (TSV) files. These TSV files served as an input to check both the connectivity degree and the betweenness centrality, which reflects the communication flow in the defined PPI networks. - -### 4.3.1 All DEGs - - -```{r, eval=FALSE} -DEGs="All DEGs" -FileName=paste0(DEGs) - -#data<-MBcdiffBinomial[[1]] [,2] # DEGs gene list from Binomial analysis -data<-MBcdiff[[1]] [,2] # From the table of the differential expression analysis between all pairs of clusters - -ppi<-PPI(data,FileName) - -networking<-NetAnalysis(ppi) -networking ##### In case the Examine response components=200 and an error "linkmat[i, ]" appeared, that means there are no PPI. - -network<-Networking(data,FileName) -``` - -### 4.3.2 Particular set of DEGs - - -```{r, eval=FALSE} -############ Selecting the DEGs' table ############## -DEGs=cdiffBinomial[[2]][2,4] # UP-regulated genes in cluster 2 (from the Binomial analysis) -#DEGs=MBcdiff[[2]][2,6] # UP-regulated genes in cluster 2 (from SAMseq) -FileName=paste0(DEGs) - -data<-read.csv(file=paste0(DEGs),head=TRUE,sep=",") -data<-data[,3] - -ppi<-PPI(data,FileName) - -networking<-NetAnalysis(ppi) -networking ##### In case the Examine response components=200 and an error "linkmat[i, ]" appeared, that means there are no PPI. - -network<-Networking(data,FileName) - -``` - -### 4.4 Gene Enrichment Analysis - -```{r, eval=FALSE} -dbs <- listEnrichrDbs() -head(dbs) -#print(dbs) -``` - -```{r, eval=FALSE} -############ Selecting the DEGs' table ############## -DEGs=MBcdiffBinomial[[2]][2,4] # Up-regulated genes in cluster 2 (from the Binomial analysis) -#DEGs=MBcdiff[[2]][2,6] # UP-regulated genes in cluster 2 (from SAMseq) -data<-read.csv(file=paste0(DEGs),head=TRUE,sep=",") -data<-as.character(data[,3]) - -dbs <- c("KEGG_2013","GO_Molecular_Function_2015", "GO_Biological_Process_2015") -enriched <- enrichr(data, dbs) -KEGG_2013<-enriched[[1]][,c(1,2,3,9)] -GO_Molecular_Function_2015<-enriched[[2]][,c(1,2,3,9)] -GO_Biological_Process_2015<-enriched[[3]][,c(1,2,3,9)] - -GEA<-rbind(KEGG_2013,GO_Molecular_Function_2015,GO_Biological_Process_2015) -GEA -``` \ No newline at end of file diff --git a/vignettes/Data-Preprocessing.png b/vignettes/Data-Preprocessing.png deleted file mode 100644 index e3471e1..0000000 Binary files a/vignettes/Data-Preprocessing.png and /dev/null differ diff --git a/vignettes/DiscBIO.png b/vignettes/DiscBIO.png deleted file mode 100644 index bd9b0e7..0000000 Binary files a/vignettes/DiscBIO.png and /dev/null differ diff --git a/vignettes/KM1.png b/vignettes/KM1.png deleted file mode 100644 index dbc62fb..0000000 Binary files a/vignettes/KM1.png and /dev/null differ diff --git a/vignettes/KM2.png b/vignettes/KM2.png deleted file mode 100644 index cd3c02c..0000000 Binary files a/vignettes/KM2.png and /dev/null differ diff --git a/vignettes/KM3.png b/vignettes/KM3.png deleted file mode 100644 index 2a0fa89..0000000 Binary files a/vignettes/KM3.png and /dev/null differ diff --git a/vignettes/MB1.png b/vignettes/MB1.png deleted file mode 100644 index 505b2c3..0000000 Binary files a/vignettes/MB1.png and /dev/null differ diff --git a/vignettes/MB2.png b/vignettes/MB2.png deleted file mode 100644 index b448f40..0000000 Binary files a/vignettes/MB2.png and /dev/null differ diff --git a/vignettes/MB3.png b/vignettes/MB3.png deleted file mode 100644 index 2f257c3..0000000 Binary files a/vignettes/MB3.png and /dev/null differ