/
ggplot_basicstats.R
122 lines (91 loc) · 6.69 KB
/
ggplot_basicstats.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
library(knitr)
knitr::opts_chunk$set(echo=TRUE, warning=FALSE, message=FALSE, tidy.opts=list(width.cutoff=80),tidy=TRUE)
height_basic=6
# If you don't have a package installed already
# install.packages("packagename")
# Otherwise, load it into the environment
library(ggplot2)
library(ggpubr)
library(reshape2)
## Load the data
data_df <- read.table("data_sp_scaled.txt", sep="\t", stringsAsFactors=F, header=T)
## and the covariate information
metadata_df <- read.table("metadata.txt", sep="\t", stringsAsFactors=F, header=T)
table(complete.cases(data_df))
table(complete.cases(t(data_df)))
## ---------------- ##
# Over 50% of the genes across 5 samples are missing. We can deal with this by discarding the problematic samples, using the complete.cases() function!
data_df_clean <- data_df[complete.cases(data_df), ]
# We can then do a quick smell-test on the data using the dataframe function summary(). This function summarizes each column of the dataframe. We can summarize each sample (currently each row) by transposing the dataframe.
summary(t(data_df_clean))
## This is still hard to read! We can plot our data with ggplot to visualize it instead, but first we need to modify it such that we have a column with the sample name, another with the gene name, and lastly one with the expression value of the gene in a specific sample.
## For this we use the melt function from reshape2.
data_compact_df = melt(t(data_df_clean))
colnames(data_compact_df) = c("Gene", "Sample", "Expression")
## stat_summary() defaults to the categories defined on the x axis, and summarizes the numeric spread on the y-axis
## We use coord_flip() to switch the categories to the y-axis, to improve readability
ggplot(data_compact_df, aes(x = Sample, y = Expression)) + stat_summary() + theme_bw(base_size=14) + coord_flip()
## stat_summary with 1 Standard Deviation around mean
ggplot(data_compact_df, aes(x = Sample, y = Expression)) + stat_summary(fun.args = list(mult = 1)) + theme_bw(base_size=14) + coord_flip()
## geom_point to visualize the scatterplot
## stat_smooth to fit the blue line with confidence intervals in grey
## geom_abline fits an x=y line by default (intercept=0, slope=1).
ggplot(data_df_clean, aes(x = BRCA1, y = PIK3CA)) + geom_point() + stat_smooth() + geom_abline(colour="red", size=2) + theme_bw(base_size=14)
## We can identify the trends in the outlier sample better with a plot of the quantile ranges. We will use a boxplot for this
ggplot(data_compact_df, aes(x = Sample, y = Expression)) + geom_boxplot() + theme_bw(base_size=14) + coord_flip()
## We can remove this sample from our dataset by excluding any rownames that have the sample ID.
data_df_clean2 = data_df_clean[!(rownames(data_df_clean) %in% c("PDX1328")), ]
## Let us update our melted dataframe too
# First we melt our data
data_compact_df = melt(t(data_df_clean2))
colnames(data_compact_df) = c("Gene", "Sample", "Expression")
# Then we plot it
ggplot(data_compact_df, aes(x = Sample, y = Expression)) + geom_boxplot() + theme_bw(base_size=14) + coord_flip()
table(metadata_df$cl_disease_detail)
## Merge expression and metadata
data_merged = merge(data_compact_df, metadata_df, by.x="Sample", by.y="cl_id")
## Select PDX1258's outlier gene
brca_tspan_df = data_merged[data_merged$Gene == "TSPAN6" & data_merged$cl_disease_detail == "breast carcinoma", ]
## Firstly we update data_df_clean2
## Notice how we select the row with the sample name, and gene with the gene name
data_df_clean2["PDX1258", "TSPAN6"] = mean(brca_tspan_df[brca_tspan_df$Sample != "PDX1258", "Expression"])
## Then we recalculate the melted version of this dataframe
data_compact_df = melt(t(data_df_clean2))
colnames(data_compact_df) = c("Gene", "Sample", "Expression")
## ---- eval=TRUE, echo=TRUE-----------------------------------------------
data(airquality)
## ---- eval=TRUE, echo=FALSE----------------------------------------------
(head(airquality))
ggplot(airquality[airquality$Month %in% c(5,6), ], aes(x=Month, y=Temp)) + geom_boxplot() + stat_compare_means() + theme_bw(base_size=14)
ggplot(airquality[airquality$Month %in% c(5,6), ], aes(x=as.character(Month), y=Temp)) + geom_boxplot() + stat_compare_means() + theme_bw(base_size=14)
ggplot(airquality[airquality$Month %in% c(5,6), ], aes(x=as.character(Month), y=Temp)) + geom_violin() + stat_compare_means() + theme_bw(base_size=14)
my_comparisons <- list( c("5", "6"), c("7", "8"), c("6", "8"), c("7", "9"))
## Plot p-values for specified comparisons
ggplot(airquality, aes(x=as.character(Month), y=Temp)) + geom_violin() + geom_point(alpha=0.5) + stat_compare_means(comparisons = my_comparisons, method="t.test") + theme_bw(base_size=14)
ggplot(airquality, aes(x=as.character(Month), y=Temp)) + geom_violin() + geom_point(alpha=0.5) + stat_compare_means(method="t.test", ref.group = "5") + theme_bw(base_size=14)
# We can also do pairwise tests, but we have to explicitly define the comparisons
my_comparisons <- list( c("5", "6"), c("7", "8"), c("6", "8"), c("7", "9"))
ggplot(airquality, aes(x=as.character(Month), y=Temp)) + geom_violin() + geom_point(alpha=0.5) + stat_compare_means(comparisons = my_comparisons, method="t.test") + labs(x="Month", y="Temperature") + theme_pubclean(base_size=14)
## We can also plot pairwise scatterplots
brca_df = data_merged[data_merged$cl_disease_detail %in% c("normal", "breast medullary carcinoma"), ]
## Reverse the melt step
brca_df_recast = dcast(brca_df[,c("Sample", "Gene", "Expression")], Sample~Gene)
rownames(brca_df_recast) = brca_df_recast$Sample; brca_df_recast$Sample <- NULL
## Remove the outlier gene
brca_df_recast = brca_df_recast[, !(colnames(brca_df_recast) %in% c("TSPAN6"))]
pairs(t(brca_df_recast), panel = function(...) smoothScatter(..., add = TRUE))
## Selecting samples with a particular disease etiology
## Select the samples from the metadata dataframe
samples_brca = metadata_df[metadata_df$cl_disease_detail %in% c("breast adenocarcinoma", "breast ductal carcinoma"), ]
## Filter our dataframe based on this list
brca_cohorts_df = data_df_clean2[rownames(data_df_clean2) %in% samples_brca$cl_id, ]
### Compare this dataframe to what you get with the following command
brca_cohorts_testdf = data_df_clean2[samples_brca$cl_id, ]
# Calculate mean of each gene
avg_genes = sapply(brca_cohorts_df, mean)
# Calculate standard deviation of each gene
sd_genes = sapply(brca_cohorts_df, sd)
# Filter dataframe based on an SD threshold of your choosing
brca_cohorts_filt = brca_cohorts_df[ , names(sd_genes[sd_genes == max(sd_genes)]), drop=FALSE]
brca_cohorts_filt$cl_id = rownames(brca_cohorts_filt); brca_cohorts_filt = merge(brca_cohorts_filt, metadata_df)
ggplot(brca_cohorts_filt, aes(x=cl_disease_detail, y=TFF1)) + geom_boxplot() + theme_bw(base_size=16) + stat_compare_means(method = "t.test")