New version (0.6.0). Removed emcncf2 and added udef option for genome…

… build.
mskcc · Dec 14, 2018 · 9352925 · 9352925
1 parent d002c51
commit 9352925
Show file tree

Hide file tree

Showing 8 changed files with 48 additions and 26 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -2,12 +2,12 @@ Package: facets
 Type: Package
 Title: Cellular Fraction and Copy Numbers from Tumor Sequencing
 Version: 0.6.0
-Date: 2018-02-28
+Date: 2018-12-14
 Author: Venkatraman E. Seshan and Ronglai Shen
 Maintainer: Venkatraman E. Seshan <seshanv@mskcc.org>
 Description: Algorithm to implement Fraction and Allelic Copy number
              Estimate from Tumor/normal Sequencing.
 License: GPL (>= 2)
-Depends: R (>= 2.10), pctGCdata (>= 0.2.0)
+Depends: R (>= 3.0.0), pctGCdata (>= 0.3.0)
 Remotes: veseshan/pctGCdata
 NeedsCompilation: yes
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,6 @@
 useDynLib(facets)
 import(stats,graphics)
 importFrom("utils", "read.csv")
-importFrom("grDevices", "colorRampPalette")
+importFrom("grDevices", "colorRampPalette", "rainbow")
 importFrom("pctGCdata", "getGCpct")
 export("readSnpMatrix", "preProcSample", "procSample", "emcncf", "emcncf2", "plotSample","logRlogORspider")
diff --git a/R/facets-procreads.R b/R/facets-procreads.R
@@ -1,11 +1,8 @@
 # heterozygous and keep flags of the SNPs
-procSnps <- function(rcmat, ndepth=35, het.thresh=0.25, snp.nbhd=250, gbuild="hg19", unmatched=FALSE, ndepthmax=1000) {
+procSnps <- function(rcmat, ndepth=35, het.thresh=0.25, snp.nbhd=250, nX=23, unmatched=FALSE, ndepthmax=1000) {
     # keep only chromsomes 1-22 & X for humans and 1-19, X for mice
-    if (gbuild %in% c("hg19", "hg38", "hg18")) {
-        chromlevels <- c(1:22,"X")
-    } else {
-        chromlevels <- c(1:19,"X")
-    }
+    # for other genomes (gbuild = udef) nX is number of autosomes plus 1
+    chromlevels <- c(1:(nX-1),"X")
     chr.keep <- rcmat$Chromosome %in% chromlevels
     # keep only snps with normal read depth between ndepth and 1000
     depthN.keep <- (rcmat$NOR.DP >= ndepth) & (rcmat$NOR.DP < ndepthmax)
@@ -46,7 +43,7 @@ scanSnp <- function(maploc, het, nbhd) {
 }
 
 # obtain logR and logOR from read counts and GC-correct logR
-counts2logROR <- function(mat, gbuild, unmatched=FALSE, f=0.2) {
+counts2logROR <- function(mat, gbuild, unmatched=FALSE, ugcpct=NULL, f=0.2) {
     out <- mat[mat$keep==1,]
     # gc percentage
     out$gcpct <- rep(NA_real_, nrow(out))
@@ -57,7 +54,11 @@ counts2logROR <- function(mat, gbuild, unmatched=FALSE, f=0.2) {
         ii <- which(out$chrom==i)
         # allow for chromosomes with no SNPs i.e. not targeted
         if (length(ii) > 0) {
-            out$gcpct[ii] <- getGCpct(i, out$maploc[ii], gbuild)
+            if (gbuild == "udef") {
+                out$gcpct[ii] <- getGCpct(i, out$maploc[ii], gbuild, ugcpct)
+            } else {
+                out$gcpct[ii] <- getGCpct(i, out$maploc[ii], gbuild)
+            }
         }
     }
     ##### log-ratio with gc correction and maf log-odds ratio steps

diff --git a/R/facets-wrapper.R b/R/facets-wrapper.R
@@ -22,13 +22,24 @@ readSnpMatrix <- function(filename, skip=0L, err.thresh=Inf, del.thresh=Inf, per
     rcmat
 }
 
-preProcSample <- function(rcmat, ndepth=35, het.thresh=0.25, snp.nbhd=250, cval=25, deltaCN=0, gbuild=c("hg19", "hg38", "hg18", "mm9", "mm10"), hetscale=TRUE, unmatched=FALSE, ndepthmax=1000) {
+preProcSample <- function(rcmat, ndepth=35, het.thresh=0.25, snp.nbhd=250, cval=25, deltaCN=0, gbuild=c("hg19", "hg38", "hg18", "mm9", "mm10", "udef"), ugcpct=NULL, hetscale=TRUE, unmatched=FALSE, ndepthmax=1000) {
     gbuild <- match.arg(gbuild)
     # integer value for chromosome X depends on the genome
     if (gbuild %in% c("hg19", "hg38", "hg18")) nX <- 23
     if (gbuild %in% c("mm9", "mm10")) nX <- 20
-    pmat <- procSnps(rcmat, ndepth, het.thresh, snp.nbhd, gbuild, unmatched, ndepthmax)
-    dmat <- counts2logROR(pmat[pmat$rCountT>0,], gbuild, unmatched)
+    if (gbuild == "udef") {
+        if (missing(ugcpct)) {
+            stop("GC percent data should be supplied if udef option is used")
+        } else {
+            nX <- length(ugcpct)
+        }
+    }
+    pmat <- procSnps(rcmat, ndepth, het.thresh, snp.nbhd, nX, unmatched, ndepthmax)
+    if (gbuild == "udef") {
+        dmat <- counts2logROR(pmat[pmat$rCountT>0,], gbuild, unmatched, ugcpct)
+    } else {
+        dmat <- counts2logROR(pmat[pmat$rCountT>0,], gbuild, unmatched)
+    }
     tmp <- segsnps(dmat, cval, hetscale, deltaCN)
     out <- list(pmat=pmat, gbuild=gbuild, nX=nX)
     c(out, tmp)
@@ -49,8 +60,7 @@ procSample <- function(x, cval=150, min.nhet=15, dipLogR=NULL) {
     chrs <- x$chromlevels
     nchr <- length(chrs)
     # get chromlevels from chrs
-    if (x$gbuild %in% c("hg19", "hg38", "hg18")) chromlevels <- c(1:22,"X")[chrs]
-    if (x$gbuild %in% c("mm9", "mm10")) chromlevels <- c(1:19,"X")[chrs]
+    chromlevels <- c(1:(nX-1), "X")[chrs]
     # get the segment summary for the fit in seg.tree
     nsegs <- 0
     for (i in 1:nchr) {
@@ -162,8 +172,8 @@ plotSample <- function(x, emfit=NULL, clustered=FALSE, plot.type=c("em","naive",
         if (length(ii)>0) out$lcn[ii] <- 5 + log10(out$lcn[ii])
         plot(c(0,length(jseg$cnlr)), c(0,max(out$tcn)), type="n", ylab="copy number (nv)", xaxt="n")
         abline(v=chrbdry, lwd=0.25)
-        segments(segstart, out$tcn, segend, out$tcn, lwd=1.75, col=1)
         segments(segstart, out$lcn, segend, out$lcn, lwd=1.75, col=2)
+        segments(segstart, out$tcn, segend, out$tcn, lwd=1.75, col=1)
         # add the cf
         plot(c(0,length(jseg$cnlr)), 0:1, type="n", ylab="", xaxt="n", yaxt="n")
         mtext("cf-nv", side=2, at=0.5, line=0.3, las=2, cex=0.75)
@@ -178,8 +188,8 @@ plotSample <- function(x, emfit=NULL, clustered=FALSE, plot.type=c("em","naive",
         if (length(ii)>0) out$lcn.em[ii] <- 5 + log10(out$lcn.em[ii])
         plot(c(0,length(jseg$cnlr)), c(0,max(out$tcn.em)), type="n", ylab="copy number (em)", xaxt="n")
         abline(v=chrbdry, lwd=0.25)
-        segments(segstart, out$tcn.em, segend, out$tcn.em, lwd=1.75, col=1)
         segments(segstart, out$lcn.em, segend, out$lcn.em, lwd=1.75, col=2)
+        segments(segstart, out$tcn.em, segend, out$tcn.em, lwd=1.75, col=1)
         # add the cf
         plot(c(0,length(jseg$cnlr)), 0:1, type="n", ylab="", xaxt="n", yaxt="n")
         mtext("cf-em", side=2, at=0.5, line=0.2, las=2, cex=0.75)
@@ -215,7 +225,7 @@ logRlogORspider <- function(cncf, dipLogR=0, nfrac=0.005) {
         }
     }
 
-    plot(c(-0.95, 1.8), c(0, 5), type="n", xlab="Expected(logR - dipLogR)", ylab=" Expected(|logOR|)")
+    plot(c(-0.95, 1.8), c(0, 5.2), type="n", xlab="Expected(logR - dipLogR)", ylab=" Expected(|logOR|)")
     l <- 1; i <-1; j <-0
     linecols <- c("black","cyan3","green3","blue")
     lines(logCNR[,l], logACR[,l], lty=1, col=j+1, lwd=1.25)
@@ -232,5 +242,7 @@ logRlogORspider <- function(cncf, dipLogR=0, nfrac=0.005) {
     nhets <- sum(cncf$nhet)
     ii <- cncf$num.mark > nfrac*nsnps & cncf$nhet > nfrac*nhets
     cex <- 0.3 + 2.7*(cncf$num.mark[ii]/sum(0.1*cncf$num.mark[ii]))
-    points(cncf$cnlr.median[ii] - dipLogR, sqrt(abs(cncf$mafR[ii])), cex=cex, col="magenta4", lwd=1.5)
+    chrcol <- rainbow(24)
+    points(cncf$cnlr.median[ii] - dipLogR, sqrt(abs(cncf$mafR[ii])), cex=cex, pch=10, col=chrcol[cncf$chrom[ii]], lwd=1.5)
+    legend(-1, 5.25, paste("chr", c(1:22, "X"), sep=""), ncol=4, pch=10, col=chrcol[1:23], cex=0.65)
 }
diff --git a/inst/ChangeLog b/inst/ChangeLog
@@ -1,6 +1,9 @@
-02/28/2018: v0.6.0
+12/13/2018: v0.6.0
 
   o Removed emcncf2 and marked as defunct
+  o Changed the order of tcn-lcn lines being drawn to address (0,0) masking
+  o Added udef option for gbuild to enable analyzing other genomes (say
+      dog) with user supplied GC percentage data
 
 02/28/2018: v0.5.14
 

diff --git a/man/facets-internal.Rd b/man/facets-internal.Rd
@@ -22,9 +22,9 @@
 }
 \usage{
 jointsegsummary(jointseg)
-procSnps(rcmat, ndepth=35, het.thresh=0.25, snp.nbhd=250, gbuild="hg19",
+procSnps(rcmat, ndepth=35, het.thresh=0.25, snp.nbhd=250, nX=23,
 unmatched=FALSE, ndepthmax=1000)
-counts2logROR(mat, gbuild, unmatched=FALSE, f=0.2)
+counts2logROR(mat, gbuild, unmatched=FALSE, ugcpct = NULL, f=0.2)
 scanSnp(maploc, het, nbhd)
 fit.cpt.tree(genomdat, edgelim=10, cval=25, hscl=1, delta=0)
 prune.cpt.tree(seg.tree, cval=25)

diff --git a/man/preProcSample.Rd b/man/preProcSample.Rd
@@ -6,8 +6,8 @@
 }
 \usage{
   preProcSample(rcmat, ndepth=35, het.thresh=0.25, snp.nbhd=250, cval=25,
-                deltaCN=0, gbuild=c("hg19", "hg38", "hg18", "mm9", "mm10"),
-                hetscale=TRUE, unmatched=FALSE, ndepthmax=1000)
+       deltaCN=0, gbuild=c("hg19", "hg38", "hg18", "mm9", "mm10", "udef"),
+       ugcpct = NULL, hetscale=TRUE, unmatched=FALSE, ndepthmax=1000)
 }
 \arguments{
   \item{rcmat}{data frame with 6 required columns: \code{Chrom},
@@ -21,7 +21,13 @@
   \item{gbuild}{genome build used for the alignment of the genome.
     Default value is human genome build hg19. Other possibilities are
     hg38 & hg18 for human and mm9 & mm10 for mouse. Chromosomes used for
-    analysis are \code{1-22, X} for humans and \code{1-19} for mouse.}
+    analysis are \code{1-22, X} for humans and \code{1-19} for mouse.
+    Option udef can be used to analyze other genomes.}
+  \item{ugcpct}{If udef is chosen for gbuild then appropriate GC
+    percentage date should be provided through this option. This is a
+    list of numeric vectors that gives the GC percentage windows of
+    width 1000 bases in steps of 100 i.e. 1-1000, 101-1100 etc. for the
+    autosomes and the X chromosome.}
   \item{hetscale}{logical variable to indicate if logOR should get more
     weight in the test statistics for segmentation and clustering. Usually
     only 10\% of snps are hets and hetscale gives the logOR contribution

diff --git a/vignettes/FACETS.pdf b/vignettes/FACETS.pdf