greenelab
diff --git a/‎README.md
Lines changed: 1 addition & 1 deletion b/‎README.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎config_human.tsv renamed to ‎configs/config_human.tsv
Lines changed: 3 additions & 2 deletions b/‎config_human.tsv renamed to ‎configs/config_human.tsv
Lines changed: 3 additions & 2 deletions
diff --git a/‎configs/config_pseudomonas_1183.tsv
Lines changed: 21 additions & 0 deletions b/‎configs/config_pseudomonas_1183.tsv
Lines changed: 21 additions & 0 deletions
diff --git a/‎configs/config_pseudomonas_33245.tsv
Lines changed: 21 additions & 0 deletions b/‎configs/config_pseudomonas_33245.tsv
Lines changed: 21 additions & 0 deletions
diff --git a/‎configs/config_pseudomonas_7704.tsv
Lines changed: 21 additions & 0 deletions b/‎configs/config_pseudomonas_7704.tsv
Lines changed: 21 additions & 0 deletions
diff --git a/‎config_pseudomonas.tsv renamed to ‎configs/config_pseudomonas_9989.tsv b/‎config_pseudomonas.tsv renamed to ‎configs/config_pseudomonas_9989.tsv
diff --git a/‎config_test.tsv renamed to ‎configs/config_test.tsv b/‎config_test.tsv renamed to ‎configs/config_test.tsv
diff --git a/‎explore_data/viz_template_experiment.ipynb
Lines changed: 1 addition & 1 deletion b/‎explore_data/viz_template_experiment.ipynb
Lines changed: 1 addition & 1 deletion
diff --git a/‎generic_expression_patterns_modules/DE_analysis.R
Lines changed: 61 additions & 32 deletions b/‎generic_expression_patterns_modules/DE_analysis.R
Lines changed: 61 additions & 32 deletions
diff --git a/‎generic_expression_patterns_modules/GSEA_analysis.R
Lines changed: 52 additions & 0 deletions b/‎generic_expression_patterns_modules/GSEA_analysis.R
Lines changed: 52 additions & 0 deletions
@@ -103,6 +103,6 @@ Note: Some of these parameters are required by the imported [ponyo](https://gith
 | epsilon_std | float: Standard deviation of Normal distribution to sample latent space|
 | num_simulated| int: Simulate a compendia with these many experiments, created by shifting the template experiment these many times|
 | project_id | str:  Experiment id to use as a template experiment|
-| col_to_rank | str:  Name of column header from DE association statistic results. This column will be use to rank genes. Select `logFC`, `P.Value`, `adj.P.Val`, `t`|
+| col_to_rank | str:  Name of column header from DE association statistic results. This column will be used to rank genes. Select `logFC`, `P.Value`, `adj.P.Val`, `t` if using Limma. Select `log2FoldChange`, `pvalue`, `padj` if using DESeq.|
 | num_recount2_experiments | int:  Number of recount2 experiments to download. Note this will not be needed when we update the training to use all of recount2|
 | compare_genes | bool:  1 if comparing gene ranks with reference gene ranks. 0 if just identifying generic genes and gene sets but not comparing against a reference.|
@@ -20,6 +20,7 @@ validation_frac	0.25
 project_id	"SRP012656"
 metadata_colname	'run'
 num_simulated	25
-col_to_rank	"logFC"
+col_to_rank	"log2FoldChange"
 num_recount2_experiments	200
-compare_genes	1
+gsea_statistic	'log2FoldChange'
+compare_genes	1
@@ -0,0 +1,21 @@
+local_dir	"/home/alexandra/Documents/Data/Generic_expression_patterns/"
+dataset_name	"pseudomonas_analysis"
+template_data_file	"/home/alexandra/Documents/Data/Generic_expression_patterns/pseudomonas_template_data.tsv"
+compendium_data_file	"/home/alexandra/Documents/Data/Generic_expression_patterns/Pa_compendium_02.22.2014.pcl"
+normalized_compendium_data_file	"/home/alexandra/Documents/Data/Generic_expression_patterns/normalized_pseudomonas_compendium_data.tsv"
+shared_genes_file	"/home/alexandra/Documents/Data/Generic_expression_patterns/shared_genes_pseudomonas.pickle"
+scaler_transform_file	"/home/alexandra/Documents/Data/Generic_expression_patterns/scaler_transform_pseudomonas.pickle"
+NN_architecture	"NN_2500_30"
+learning_rate	0.001
+batch_size	10
+epochs	100
+kappa	0.01
+intermediate_dim	2500
+latent_dim	30
+epsilon_std	1.0
+validation_frac	0.25
+project_id	"E-MEXP-1183"
+metadata_colname	'ml_data_source'
+num_simulated	25
+col_to_rank	"logFC"
+compare_genes	0
@@ -0,0 +1,21 @@
+local_dir	"/home/alexandra/Documents/Data/Generic_expression_patterns/"
+dataset_name	"pseudomonas_analysis"
+template_data_file	"/home/alexandra/Documents/Data/Generic_expression_patterns/pseudomonas_template_data.tsv"
+compendium_data_file	"/home/alexandra/Documents/Data/Generic_expression_patterns/Pa_compendium_02.22.2014.pcl"
+normalized_compendium_data_file	"/home/alexandra/Documents/Data/Generic_expression_patterns/normalized_pseudomonas_compendium_data.tsv"
+shared_genes_file	"/home/alexandra/Documents/Data/Generic_expression_patterns/shared_genes_pseudomonas.pickle"
+scaler_transform_file	"/home/alexandra/Documents/Data/Generic_expression_patterns/scaler_transform_pseudomonas.pickle"
+NN_architecture	"NN_2500_30"
+learning_rate	0.001
+batch_size	10
+epochs	100
+kappa	0.01
+intermediate_dim	2500
+latent_dim	30
+epsilon_std	1.0
+validation_frac	0.25
+project_id	"E-GEOD-33245"
+metadata_colname	'ml_data_source'
+num_simulated	25
+col_to_rank	"logFC"
+compare_genes	0
@@ -0,0 +1,21 @@
+local_dir	"/home/alexandra/Documents/Data/Generic_expression_patterns/"
+dataset_name	"pseudomonas_analysis"
+template_data_file	"/home/alexandra/Documents/Data/Generic_expression_patterns/pseudomonas_template_data.tsv"
+compendium_data_file	"/home/alexandra/Documents/Data/Generic_expression_patterns/Pa_compendium_02.22.2014.pcl"
+normalized_compendium_data_file	"/home/alexandra/Documents/Data/Generic_expression_patterns/normalized_pseudomonas_compendium_data.tsv"
+shared_genes_file	"/home/alexandra/Documents/Data/Generic_expression_patterns/shared_genes_pseudomonas.pickle"
+scaler_transform_file	"/home/alexandra/Documents/Data/Generic_expression_patterns/scaler_transform_pseudomonas.pickle"
+NN_architecture	"NN_2500_30"
+learning_rate	0.001
+batch_size	10
+epochs	100
+kappa	0.01
+intermediate_dim	2500
+latent_dim	30
+epsilon_std	1.0
+validation_frac	0.25
+project_id	"E-GEOD-7704"
+metadata_colname	'ml_data_source'
+num_simulated	25
+col_to_rank	"logFC"
+compare_genes	0
@@ -763,7 +763,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.6"
+   "version": "3.7.8"
   }
  },
  "nbformat": 4,
 
@@ -7,12 +7,12 @@
 
 # library('limma')
 
-get_DE_stats <- function(metadata_file, 
-                         experiment_id, 
-                         expression_file,
-                         data_type,
-                         local_dir,
-                         run) {
+get_DE_stats_limma <- function(metadata_file, 
+                               experiment_id, 
+                               expression_file,
+                               data_type,
+                               local_dir,
+                               run) {
 
   # This function performs DE analysis using expression data in expression_file
   # where samples are grouped based on metadata_file
@@ -27,6 +27,7 @@ get_DE_stats <- function(metadata_file,
   #
   # expression_file: str
   #   File containing gene expression data
+  #   Expression data should be of the form sample x gene
   #
   # data_type: str
   #   Either 'template' or 'simulated' to label saved output file
@@ -38,6 +39,7 @@ get_DE_stats <- function(metadata_file,
   #   Used as identifier for different simulated experiments 
 
   # Read in data
+  # Note the expression data is transposed to gene x sample in order to run Limma
   expression_data <- t(as.matrix(read.csv(expression_file, sep="\t", header=TRUE, row.names=1)))
   metadata <- as.matrix(read.csv(metadata_file, sep="\t", header=TRUE, row.names=1))
 
@@ -84,34 +86,61 @@ get_DE_stats <- function(metadata_file,
 
 }
 
-create_volcano <- function(expression_file,
-                           experiment_id,
-                           pval,
-                           local_dir) {
+get_DE_stats_DESeq <- function(metadata_file, 
+                               experiment_id, 
+                               expression_file,
+                               data_type,
+                               local_dir,
+                               run) {
 
-    # This functioni generates a volcano plot using the output from
-    # the DE analysis script 'get_DE_stats' and output it to local_dir
+  # This function performs DE analysis using DESeq.
+  # Expression data in expression_file are grouped based on metadata_file
+  #
+  # Arguments
+  # ---------
+  # metadata_file: str
+  #   File containing mapping between sample id and group
+  #
+  # experiment_id: str
+  #   Experiment id used to label saved output file
+  #
+  # expression_file: str
+  #   File containing gene expression data
+  #
+  # data_type: str
+  #   Either 'template' or 'simulated' to label saved output file
+  #
+  # local_dir: str
+  #   Directory to save output files to
+  #
+  # run: str
+  #   Used as identifier for different simulated experiments 
+
+  expression_data <- t(as.matrix(read.csv(expression_file, sep="\t", header=TRUE, row.names=1)))
+  metadata <- as.matrix(read.csv(metadata_file, sep="\t", header=TRUE, row.names=1))
+
+  print("Checking sample ordering...")
+  print(all.equal(colnames(expression_data), rownames(metadata)))
+
+  group <- interaction(metadata[,1])
+
+  mm <- model.matrix(~0 + group)
 
-    # Read in expression data
-    res <- read.table(expression_file, header=TRUE)
+  #print(head(expression_data))
 
-    threshold <- 0.05
+  ddset <- DESeqDataSetFromMatrix(expression_data, colData=metadata, design = ~group)
+  
+  deseq_object <- DESeq(ddset)
+
+  deseq_results <- results(deseq_object)
 
-    # Make a basic volcano plot
-    f <- EnhancedVolcano(res,
-                         lab = rownames(res),
-                         x = 'logFC',
-                         y = 'adj.P.Val',
-                         xlim = c(-2,2),
-                         pCutoff = threshold,
-                         FCcutoff = 1,
-                         pointSize = 1.0,
-                         labSize = 3.0,
-                         xlab=bquote(~Log[2]~ 'fold change'),
-                         ylab=bquote(-~Log[10]~ 'FDR adj p-value')
-    )
+  deseq_results_df <-  as.data.frame(deseq_results)
 
-    # Save
-    out_file = paste(local_dir, "volcano_template_data_", experiment_id,".png", sep="")  
-    ggsave(out_file, plot = f, dpi=300)
-                           }
+  # Save summary statistics of DEGs
+  if (data_type == "template") {
+    out_file = paste(local_dir, "DE_stats/DE_stats_template_data_", experiment_id,"_", run, ".txt", sep="")
+  } else if (data_type == "simulated") {
+    out_file = paste(local_dir, "DE_stats/DE_stats_simulated_data_", experiment_id,"_", run, ".txt", sep="")
+  }  
+  write.table(deseq_results_df, file = out_file, row.names = T, sep = "\t", quote = F)
+}
@@ -0,0 +1,52 @@
+## Run this once to setup environment
+## Used R 3.6.3
+#if (!requireNamespace("BiocManager", quietly = TRUE))
+#  install.packages("BiocManager")
+
+#BiocManager::install("clusterProfiler")
+
+#library(clusterProfiler)
+
+find_enriched_pathways <- function(DE_stats_file,
+                                   pathway_DB,
+                                   statistic){
+    # Run pre-ranked gene set enrichment (fgsea) on a table of DE statistics.
+    #
+    # Arguments
+    # ---------
+    # DE_stats_file: str
+    #   Tab-delimited DE results file; gene ids are taken from column 1
+    #
+    # pathway_DB: str
+    #   Path to a GMT file of gene sets, read with gmtPathways
+    #
+    # statistic: str
+    #   Column to rank genes by: 'logFC', 't', 'p-value', 'adj p-value',
+    #   'log2FoldChange', 'pvalue' or 'padj'. Any other value is an error.
+    #
+    # Returns the fgsea enrichment results as a data.frame
+
+    # row.names=NULL leaves the gene ids in column 1 (used as names below)
+    DE_stats_data <- read.table(DE_stats_file, sep="\t", header=TRUE, row.names=NULL)
+
+    # Map the requested statistic to its column position in the DE table
+    col_num <- switch(statistic,
+                      'logFC' = 2,
+                      'log2FoldChange' = 3,
+                      't' = 4,
+                      'p-value' = 5,
+                      'adj p-value' = ,
+                      'pvalue' = 6,
+                      'padj' = 7,
+                      stop("Unsupported statistic: ", statistic))
+
+    # Build a named numeric vector of gene scores, sorted in decreasing order
+    rank_genes <- as.numeric(as.character(DE_stats_data[,col_num]))
+    names(rank_genes) <- as.character(DE_stats_data[,1])
+    rank_genes <- sort(rank_genes, decreasing = TRUE)
+
+    # Load the gene sets from the file passed in as pathway_DB
+    pathway_DB_data <- gmtPathways(pathway_DB)
+    enrich_pathways <- fgsea(pathways=pathway_DB_data, stats=rank_genes, nperm=10000)
+    return(as.data.frame(enrich_pathways))
+}
Original file line number	Diff line number	Diff line change
`@@ -763,7 +763,7 @@`
`763`	`763`	`"name": "python",`
`764`	`764`	`"nbconvert_exporter": "python",`
`765`	`765`	`"pygments_lexer": "ipython3",`
`766`		`- "version": "3.7.6"`
	`766`	`+ "version": "3.7.8"`
`767`	`767`	`}`
`768`	`768`	`},`
`769`	`769`	`"nbformat": 4,`