# Analysis run for Cross-platform TSP
First, we load all the libraries, files, and data.
```{r dependencies, eval=TRUE}
set.seed(47209)
library(genefu) # has the genes for PAM50
library(colorspace) # colors for plots
library(ggplot2)
library(plyr)
library(affyPLM)
library(pamr)
library(MetaGx)
library(tables)
source("download_esets.R") #global data load
```
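`download_esets.R` is assumed to populate `eset.all`, a named list of Biobase `ExpressionSet` objects (one per experiment) that the rest of the analysis indexes by accession (e.g. `eset.all$GSE7390`). A minimal sanity check under that assumption:
```{r eset-check, eval=FALSE}
# Sketch: confirm the assumed structure of eset.all before running the analysis
stopifnot(is.list(eset.all),
          all(sapply(eset.all, function(x) is(x, "ExpressionSet"))))
names(eset.all)  # experiment accessions, e.g. "GSE7390", "ISDB10845", ...
```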
First, let us get summary information on our datasets (28 microarray experiments in total).
```{r init, cache=TRUE, warning=FALSE}
n <- sum(unlist(lapply(lapply(eset.all, dim), "[[", 2)))
# 6297
table(unlist(lapply(eset.all, function(x){pData(x)$node})))
n - sum(table(unlist(lapply(eset.all, function(x){pData(x)$node}))))
# 0 1 2
#2857 1871 2
# 1567 NA
table(unlist(lapply(eset.all, function(x){pData(x)$er})))
n - sum(table(unlist(lapply(eset.all, function(x){pData(x)$er}))))
# 0 1
# 1556 3635
# 1106 NA
table(unlist(lapply(eset.all, function(x){pData(x)$grade})))
n - sum(table(unlist(lapply(eset.all, function(x){pData(x)$grade}))))
# 1 2 3
# 525 1642 2226
# 1904 NA
table(unlist(lapply(eset.all, function(x){pData(x)$her2})))
n - sum(table(unlist(lapply(eset.all, function(x){pData(x)$her2}))))
# 0 1
# 1437 496
# 4364 NA
table(unlist(lapply(eset.all, function(x){pData(x)$subtype})))
# Basal Her2 LumB LumA Normal
# 1254 927 2007 1813 296
summary(unlist(lapply(eset.all, function(x){pData(x)$t.rfs})))
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# 3 1164 2280 2637 3915 9218 2991
sd(unlist(lapply(eset.all, function(x){pData(x)$t.rfs})), na.rm=T)
# 1772.906
table(unlist(lapply(eset.all, function(x){pData(x)$pgr})))
# 0 1
# 656 766
# 4875 NA
summary(unlist(lapply(eset.all, function(x){pData(x)$age})))
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# 21.00 47.00 56.40 57.29 67.24 96.29 1547
sd(unlist(lapply(eset.all, function(x){pData(x)$age})), na.rm=T)
# 13.42863
summary(unlist(lapply(eset.all, function(x){pData(x)$size})))
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# -9.000 1.600 2.200 2.521 3.000 18.200 1720
sd(unlist(lapply(eset.all, function(x){pData(x)$size})), na.rm=T)
#1.434876
# Here, we are simplifying the platform labels so that we can look at like technologies together
platforms <- unlist(lapply(eset.all, annotation))
platforms[c(1,11,25)] <- "agilent"
platforms[c(21,28)] <- "illumina"
platforms[c(12,16)] <- "swegene"
platforms[c(3,4,5,17,22)] <- "other"
# Find the union and intersection of probes across all experiments
length(Reduce(union, lapply(eset.all, rownames)))
length(Reduce(intersect, lapply(eset.all, rownames)))
```
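After the relabeling above, the per-platform dataset counts can be tabulated directly; a quick sketch:
```{r platform-table, eval=FALSE}
# Number of datasets per (simplified) platform label
sort(table(platforms), decreasing = TRUE)
```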
Here we examine how predictions from PAM50 with scaling (normalization of the data) change
when the makeup of the sample set changes (sample size and ER status distribution).
```{r bias, cache=TRUE, fig.width=15, fig.height=12, warning=FALSE}
affy <- eset.all$GSE7390 # This is an affy hgu133plus2 dataset
annot <- fData(affy)
names(annot)[1] <- "EntrezGene.ID"
annot$probe <- rownames(annot)
# pam50.robust applies centering/scaling to the data
affy_pam <- intrinsic.cluster.predict(sbt.model=pam50.robust, data=t(exprs(affy)), annot=annot,do.mapping=T)$subtype
# pam50 does not apply centering/scaling
affy_pam_un <- intrinsic.cluster.predict(sbt.model=pam50, data=t(exprs(affy)), annot=annot,do.mapping=T)$subtype
num <- c(2, 10, 20, 40, 80, 100, 120)
prog <- prog_un <- matrix(NA, 100, length(num))
n <- 100
for(i in 1:length(num)){
for(j in 1:n){
idx <- sample(colnames(exprs(affy)), num[i], replace=F)
tmp_pam <- intrinsic.cluster.predict(sbt.model=pam50.robust, data=t(exprs(affy))[idx,], annot=annot,do.mapping=T)$subtype
tmp_pam_un <- intrinsic.cluster.predict(sbt.model=pam50, data=t(exprs(affy))[idx,], annot=annot,do.mapping=T)$subtype
prog[j,i] <- sum(tmp_pam == affy_pam[idx])/length(tmp_pam)
prog_un[j,i] <- sum(tmp_pam_un == affy_pam_un[idx])/length(tmp_pam_un)
}
}
boxplot(prog*100, names=num, outline=F,xlab="# Samples", ylab="% Concordance with Full Predictions", main="Changes in predictions on scaled data as more patients are added")
for(i in 1:length(num)){
points(jitter(rep(i, n), amount=1/(i+3)), prog[,i]*100, pch=21, bg="dodgerblue1", col="black")
}
boxplot(prog_un*100, names=num, outline=F, ylim=c(90, 101), xlab="# Samples", ylab="% Concordance with Full Predictions", main="Changes in predictions on unscaled data as more patients are added")
for(i in 1:length(num)){
points(jitter(rep(i, n), amount=1/(i+3)), prog_un[,i]*100, pch=21, bg="dodgerblue1", col="black")
}
# Split data into er- and er+ groups
er_min_sub <- exprs(affy)[,pData(affy)$samplename[which(pData(affy)$er == 0)]]
er_max_sub <- exprs(affy)[,pData(affy)$samplename[which(pData(affy)$er == 1)]]
er_min_out <- affy_pam[which(pData(affy)$er == 0)]
er_max_out <- affy_pam[which(pData(affy)$er == 1)]
er_min_out_un <- affy_pam_un[which(pData(affy)$er == 0)]
er_max_out_un <- affy_pam_un[which(pData(affy)$er == 1)]
names(er_min_out) <- names(er_min_out_un) <- colnames(er_min_sub)
names(er_max_out) <- names(er_max_out_un) <- colnames(er_max_sub)
n2 <- 100
prog_mean <- prog_sd <- prog_mean_un <- prog_sd_un <- vector("numeric", 41)
# Let's go from all ER- to all ER+
for(i in 0:40){
cur <- cur_un <- vector("numeric", n2)
for(j in 1:n2){
idx_min <- sample(colnames(er_min_sub), i, replace=F)
idx_max <- sample(colnames(er_max_sub), 40-i, replace=F)
dat <- cbind(er_min_sub[,idx_min], er_max_sub[,idx_max])
st <- c(er_min_out[idx_min], er_max_out[idx_max])
st_un <- c(er_min_out_un[idx_min], er_max_out_un[idx_max])
tmp_pam <- intrinsic.cluster.predict(sbt.model=pam50.robust, data=t(dat), annot=annot,do.mapping=T)$subtype
tmp_pam_un <- intrinsic.cluster.predict(sbt.model=pam50, data=t(dat), annot=annot,do.mapping=T)$subtype
cur[j] <- sum(tmp_pam == st)/length(tmp_pam)
cur_un[j] <- sum(tmp_pam_un == st_un)/length(tmp_pam_un)
}
prog_mean[i+1] <- mean(cur)
prog_sd[i+1] <- sd(cur)
prog_mean_un[i+1] <- mean(cur_un)
prog_sd_un[i+1] <- sd(cur_un)
}
# These approximate the % ER- composition at each calculated point
x_seq <- seq(0,1,length=length(prog_mean))*100
plot(x_seq, prog_mean*100, type="l", ylim=c(70, 100), col="dodgerblue1",lwd=2, xlab="% ER negative in Population", ylab="% Concordance with Full Predictions", main="Predictions vary as ER composition of patient set varies")
lines(x_seq, prog_mean*100 - prog_sd*100, lty="dashed", col="red")
lines(x_seq, prog_mean*100 + prog_sd*100, lty="dashed", col="red")
abline(v=32, lty="dashed", col="green")
legend("bottomright", legend=c("+/- 1 SE","ER- % in original population"),col=c("red", "green"), lty=rep("dashed", 2))
plot(x_seq, prog_mean_un*100, type="l", ylim=c(70, 100), col="dodgerblue1",lwd=2, xlab="% ER negative in Population", ylab="% Concordance with Full Predictions", main="Unscaled predictions as ER composition of patient set varies")
lines(x_seq, prog_mean_un*100 - prog_sd_un*100, lty="dashed", col="red")
lines(x_seq, prog_mean_un*100 + prog_sd_un*100, lty="dashed", col="red")
abline(v=32, lty="dashed", col="green")
legend("bottomright", legend=c("+/- 1 SE","ER- % in original population"),col=c("red", "green"), lty=rep("dashed", 2))
```
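The concordance statistic computed inside both loops above is simply the fraction of sampled patients whose subtype call matches the call obtained on the full cohort. A minimal sketch, using the `affy`, `annot`, and `affy_pam` objects defined in the chunk above:
```{r concordance-sketch, eval=FALSE}
# Fraction of subtype calls that agree with the full-cohort calls
concordance <- function(subset_calls, full_calls) mean(subset_calls == full_calls)
# One random draw of 40 patients, predicted with the scaled (pam50.robust) model
idx <- sample(colnames(exprs(affy)), 40)
calls <- intrinsic.cluster.predict(sbt.model = pam50.robust,
                                   data = t(exprs(affy))[idx, ],
                                   annot = annot, do.mapping = TRUE)$subtype
concordance(calls, affy_pam[idx])
```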
Now we will see what happens when we predict tumor grade by building a PAM model. We want to
determine the effect of normalization on the predictions and whether scaled and unscaled PAM
perform similarly in different settings.
```{r grade, cache=TRUE, fig.width=15, fig.height=12, eval=TRUE, warning=FALSE}
# Function to create a submatrix from a new expression matrix given a list of genes;
# genes missing from the matrix are left as NA rows
make_mat <- function(data,genes){
mat <- matrix(NA, length(genes), ncol(data))
rownames(mat) <- genes
colnames(mat) <- colnames(data)
for(i in 1:length(genes)){
if(genes[i] %in% rownames(data)){
mat[i,] <- data[genes[i],]
}
}
mat
}
# Function to calculate sensitivity for grades 1 and 3
get_ss <- function(preds, true){
  # Sensitivity for grades 1 and 3: correct calls among true grade-1 / grade-3
  # samples that received a grade-1 or grade-3 prediction
  c(sum(preds == 1 & true == 1)/sum((preds == 1 | preds == 3) & true == 1),
    sum(preds == 3 & true == 3)/sum((preds == 1 | preds == 3) & true == 3))
}
# Function to run a cross-validated PAM build on a training set and gather predictions on
# two test sets.
pred_cv <- function(train, test1, test2, ncv=10){
PAM50_names <- rownames(pam50$centroids.map)
genes <- unlist(sapply(PAM50_names, function(x){rownames(fData(train))[which(fData(train)$SYMBOL == x)]}))
dat <- exprs(train)[genes, which(pData(train)$grade != 2)]
dat_t1 <- exprs(test1)[,which(pData(test1)$grade != 2)]
dat_t2 <- exprs(test2)[,which(pData(test2)$grade != 2)]
outcome <- pData(train)$grade[which(pData(train)$grade != 2)]
outcome_t1 <- pData(test1)$grade[which(pData(test1)$grade != 2)]
outcome_t2 <- pData(test2)$grade[which(pData(test2)$grade != 2)]
idxs <- split(sample(1:ncol(dat)), rep(1:ncv, length.out = ncol(dat)))
output <- matrix(0, ncv, 12)
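# Column layout of `output` (each get_ss() call returns grade-1 and grade-3 sensitivity):
#   cols 1-2:  scaled,   held-out CV fold    cols 3-4:   unscaled, held-out CV fold
#   cols 5-6:  scaled,   test1               cols 7-8:   unscaled, test1
#   cols 9-10: scaled,   test2               cols 11-12: unscaled, test2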
for(i in 1:ncv){
idx <- idxs[[i]]
ktrain <- dat[,-idx]
ktest <- dat[,idx]
ktrain_outcome <- outcome[-idx]
ktest_outcome <- outcome[idx]
pd <- list(x=ktrain, y=ktrain_outcome, geneids=rownames(ktrain))
pt <- pamr.train(pd)
pt_cv <- pamr.cv(pt, pd)
thresh <- pt_cv$threshold[which.min(pt_cv$error)]
final_genes <- pamr.listgenes(pt, pd, thresh)[,"id"]
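# Repackage the pamr centroids (restricted to the surviving genes) into a
# genefu-style sbt.model, using pam50.robust as a template, so that
# intrinsic.cluster.predict() can be reused for scaled and (below) unscaled prediction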
pam_sbt_scaled <- pam50.robust
cent <- pt$centroids[final_genes,]
cent_map <- fData(train)[rownames(cent),]
cent_map <- data.frame("probe"=cent_map$SYMBOL, "probe.centroids"=cent_map$SYMBOL, "EntrezGene.ID"=cent_map$ENTREZID)
rownames(cent_map) <- cent_map$probe
rownames(cent) <- fData(train)[rownames(cent), "SYMBOL"]
pam_sbt_scaled$centroids <- cent
pam_sbt_scaled$centroids.map <- cent_map
rownames(ktest) <- fData(train)[rownames(ktest),"SYMBOL"]
preds_scaled_test <- intrinsic.cluster.predict(pam_sbt_scaled, t(ktest), pam_sbt_scaled$centroids.map)$subtype
pam_sbt_unscaled <- pam_sbt_scaled
pam_sbt_unscaled$std <- "none"
preds_unscaled_test <- intrinsic.cluster.predict(pam_sbt_unscaled, t(ktest), pam_sbt_scaled$centroids.map)$subtype
mat_t1 <- make_mat(dat_t1, genes)
rownames(mat_t1) <- fData(test1)[rownames(mat_t1), "SYMBOL"]
preds_scaled_t1 <- intrinsic.cluster.predict(pam_sbt_scaled, t(mat_t1), pam_sbt_scaled$centroids.map)$subtype
preds_unscaled_t1 <- intrinsic.cluster.predict(pam_sbt_unscaled, t(mat_t1), pam_sbt_scaled$centroids.map)$subtype
mat_t2 <- make_mat(dat_t2, genes)
rownames(mat_t2) <- fData(test2)[rownames(mat_t2), "SYMBOL"]
preds_scaled_t2 <- intrinsic.cluster.predict(pam_sbt_scaled, t(mat_t2), pam_sbt_scaled$centroids.map)$subtype
preds_unscaled_t2 <- intrinsic.cluster.predict(pam_sbt_unscaled, t(mat_t2), pam_sbt_scaled$centroids.map)$subtype
output[i,] <- c(get_ss(preds_scaled_test, ktest_outcome),
get_ss(preds_unscaled_test, ktest_outcome),
get_ss(preds_scaled_t1, outcome_t1),
get_ss(preds_unscaled_t1, outcome_t1),
get_ss(preds_scaled_t2, outcome_t2),
get_ss(preds_unscaled_t2, outcome_t2))
}
output
}
# Affy: GSE7390, Agilent: ISDB10845, Illumina: ISDB10278
affyout <- pred_cv(eset.all$GSE7390, eset.all$ISDB10845, eset.all$ISDB10278)
agout <- pred_cv(eset.all$ISDB10845, eset.all$GSE7390, eset.all$ISDB10278)
# Reorder columns to Affy-Agilent-Illumina
agout <- agout[,c(5:8,1:4,9:12)]
ilout <- pred_cv(eset.all$ISDB10278, eset.all$GSE7390, eset.all$ISDB10845)
# Reorder columns to Affy-Agilent-Illumina
ilout <- ilout[,c(5:8, 9:12, 1:4)]
affy_mean <- colMeans(affyout)
affy_sd <- apply(affyout, 2, sd)
ag_mean <- colMeans(agout)
ag_sd <- apply(agout, 2, sd)
il_mean <- colMeans(ilout)
il_sd <- apply(ilout, 2, sd)
full <- data.frame("mean" = c(affy_mean, ag_mean, il_mean), "sd" = c(affy_sd, ag_sd, il_sd),
"train" = factor(rep(c("Affy", "Agilent", "Illumina"), each=12)),
"prediction" = factor(rep(rep(c("Affy", "Agilent", "Illumina"), each=4), 3)),
"norm" = factor(rep(rep(c("Scaled", "Unscaled"), each = 2), 9)),
"grade" = factor(rep(c(1,3), 18)))
full$mean <- round(full$mean, 2)
full$sd <- round(full$sd, 2)
# LaTeX table output (uncomment to generate)
#latex(tabular(train*norm ~ prediction*grade*(mean + sd)*Heading()*identity, data=full))
ggplot(full, aes(x = grade, y = mean, fill = norm)) +
geom_bar(stat = "identity", position = position_dodge()) +
facet_grid(prediction ~ train) +
theme(strip.text.x = element_text(size = 15), strip.text.y = element_text(size = 15)) +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd, group = norm),
position = position_dodge(0.9), width=0.2) +
xlab("Tumor Grade") + ylab("Average Accuracy") +
theme(axis.title.x = element_text(size=15, face="bold"), axis.title.y = element_text(size=15, face="bold"),
axis.text.x=element_text(color="black"),axis.text.y=element_text(color="black")) +
guides(fill=guide_legend(title="Normalization"))
```
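For interactive inspection, the same means can be cross-tabulated in the console instead of generating the LaTeX table; a quick sketch using the `full` data frame built above:
```{r accuracy-crosstab, eval=FALSE}
# Each train/prediction/normalization/grade combination appears exactly once in `full`,
# so the xtabs "sums" below are just the mean accuracies themselves
ftable(xtabs(mean ~ train + norm + prediction + grade, data = full))
```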