From 9ea5545fd7f644a5f2a107cf925cd8aaa26a31af Mon Sep 17 00:00:00 2001
From: David DeTomaso <davedeto@gmail.com>
Date: Wed, 16 Dec 2020 23:17:17 -0800
Subject: [PATCH] Update paths for Figure panels

---
 README.md                                     |  69 +++++--
 .../CD4_Correlation/ModuleConsistency_CD4.py  | 185 ++++++++++++++++++
 ...ency.py => ModuleConsistency_Monocytes.py} |   6 +-
 .../EvaluateFeatureSelection/hvg_vs_hs.py     |  24 ---
 4 files changed, 241 insertions(+), 43 deletions(-)
 create mode 100644 Transcriptomics/Figures/CD4_Correlation/ModuleConsistency_CD4.py
 rename Transcriptomics/Figures/CD4_Correlation/{ModuleConsistency.py => ModuleConsistency_Monocytes.py} (96%)

diff --git a/README.md b/README.md
index 470dbc6..468df3b 100644
--- a/README.md
+++ b/README.md
@@ -23,43 +23,84 @@ Outputs can be recreated in each directory by running `snakemake all`
 
 Code to re-create figures can be found in various `Figures` directories and depends on prior execution of Snakemake pipelines.
 
-- Figure 1
+- Figure 1 - Algorithm Diagram
+
+- Figure 2
+    - Panel A: `/Transcriptomics/Figures/EvaluateFeatureSelection/plotRelevance.py`
+    - Panel B: `/Transcriptomics/Figures/EvaluateFeatureSelection/compareLatentSpaces.py`
+    - Panel C: `/Transcriptomics/Figures/EvaluateFeatureSelection/compare_local_expression.py`
+    - Panel D: `/Transcriptomics/Figures/CD4_Correlation/plot.py`
+    - Panel E: `/Transcriptomics/Figures/CD4_Correlation/ModuleConsistency_CD4.py`
+
+- Figure 3
     - Panel A: `/SlideSeq/Figures/MainFigure/moduleHeatmap.py`
     - Panel B: `/SlideSeq/Figures/MainFigure/moduleCellTypes.py`
     - Panel C: `/SlideSeq/Figures/MainFigure/moduleScores.py`
-- Figure 2
+
+- Figure 4
     - Panel A: `/Lineage/Figure/plotCorrelations.py`
     - Panel B: `/Lineage/Figure/plotUMAPs.py`
     - Panel C: `/Lineage/Figure/plotKernels.py`
     - Panel D: `/Lineage/Figure/plotKernelsTx.py`
     - Panel E: `/Lineage/Figure/plotAngioblasts.py`
-- Figure 3
-    - Panel A: `/Transcriptomics/Figures/EvaluateFeatureSelection/plotRelevance.py`
-    - Panel B: `/Transcriptomics/Figures/EvaluateFeatureSelection/compareLatentSpaces.py`
-    - Panel C: `/Transcriptomics/Figures/CD4_Correlation/plot.py`
-    - Panel D: `/Transcriptomics/Figures/CD4_Correlation/plot.py`
+
 - Figure S1
-    - Panel A: `/SlideSeq/Figures/Supp1/plotMeanVar.py`
+    - Panel A: `/Transcriptomics/Figures/Simulation/plotTSNEs.py`
+    - Panel B: `/Transcriptomics/Figures/Simulation/plotAUC_PR.py`
+    - Panel C: `/Transcriptomics/Figures/Simulation/plotModuleAssignment.py`
+
+- Figure S2
+    - Panel A: `/SlideSeq/Figures/Supp1/plotMeanVar.py`, `/SlideSeq/Figures/Supp_Autocorr/compare_local_expression.py`
     - Panel B: `/SlideSeq/Figures/Supp1/plotPR.py`
     - Panel C: `/SlideSeq/Figures/Supp1/plotTimings.py`
     - Panel D: `/SlideSeq/Figures/Supp1/plotIDR.py`
-- Figure S2
+
+- Figure S3
     - Panel A: `/SlideSeq/Figures/Supp2/comparePairwiseZScores.py`
     - Panel B: `/SlideSeq/Figures/Supp2/compareModules.py`
     - Panel C: `/SlideSeq/Figures/Supp2/compareModuleAssignments.py`
-- Figure S3
+
+- Figure S4
     - Panel A: `/SlideSeq/Figures/Supp3/compareModulesSpatialDE.py`
     - Panel B: `/SlideSeq/Figures/Supp3/compareTiming.py`
     - Panel C: `/SlideSeq/Figures/Supp3/compareModulesSpatialDE.py`
-- Figure S4
-    - Panel A: `/Transcriptomics/Figures/Simulation/plotTSNEs.py`
-    - Panel B: `/Transcriptomics/Figures/Simulation/plotAUC_PR.py`
-    - Panel C: `/Transcriptomics/Figures/Simulation/plotModuleAssignment.py`
+
 - Figure S5
     - Panel A: `/SlideSeq/Figures/Supp4/plotPValues.py`
     - Panel B: `/Transcriptomics/Figures/Supp_Pvals/plotPValues.py`
     - Panel C: `/Lineage/Figure/plotPValues.py`
 
+- Figure S6
+    - All Panels: `/Transcriptomics/Figures/EvaluateFeatureSelection/hvg_vs_hs.py`
+
+- Figure S7
+    - Panel A: `/Transcriptomics/Figures/Simulation/downsampling_correlation.py`
+    - Panel B: `/Transcriptomics/Figures/CD4_Correlation/plot_downsampled.py`
+    - Panel C: `/Transcriptomics/Figures/CD4_Correlation/plot_downsampled.py`
+    - Panel D: `/Transcriptomics/Figures/Simulation/downsampling_correlation.py`
+
+- Figure S8
+    - All Panels: `/SlideSeq/Figures/Supp5_HVG_vs_HS/hvg_vs_hs.py`
+
+- Figure S9
+    - Panel A: `/SlideSeq/Figures/Supp6_NegBinom_vs_Bernoulli/plotPR.py`
+    - Panel B: `/SlideSeq/Figures/Supp6_NegBinom_vs_Bernoulli/compareModuleAssignments.py`
+
+- Figure S10
+    - Column 1: `/Transcriptomics/Figures/Simulation/plotAUC_PR_k_sensitivity.py`
+    - Column 2: `/Transcriptomics/Figures/EvaluateFeatureSelection/plotRelevance_k_sensitivity.py`
+    - Column 3: `/SlideSeq/Figures/Supp1/plotPR_k_sensitivity.py`
+
+- Figure S11
+    - Panel A: `/SlideSeq/Figures/Supp7_Bernoulli/plot.py`
+    - Panel B: `/Transcriptomics/Figures/CD4_Correlation/ModuleConsistency_Monocytes.py`
+
+- Figure S12
+    - All Panels: `/Transcriptomics/Figures/EvaluateFeatureSelection/compare_local_expression.py`
+
+- Figure S13
+    - All Panels: `/SlideSeq/Figures/Supp_Autocorr/compare_local_expression.py`
+
 
 ## Software Versions
 
diff --git a/Transcriptomics/Figures/CD4_Correlation/ModuleConsistency_CD4.py b/Transcriptomics/Figures/CD4_Correlation/ModuleConsistency_CD4.py
new file mode 100644
index 0000000..7c869da
--- /dev/null
+++ b/Transcriptomics/Figures/CD4_Correlation/ModuleConsistency_CD4.py
@@ -0,0 +1,185 @@
+"""
+Plots module consistency for the CD4 data between the train/test splits
+See ModuleConsistency_Monocytes.py for a nearly identical script on the Monocyte data
+"""
+import os
+import pandas as pd
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+plt.rcParams['svg.fonttype'] = 'none'
+
+base_dir = '../../CD4_w_protein'
+
+datasets = [
+    {
+        'Name': 'Hotspot',
+        'Train': os.path.join(base_dir, 'train/hotspot_hs/modules.txt'),
+        'Test': os.path.join(base_dir, 'test/hotspot_hs/modules.txt'),
+    },
+    {
+        'Name': 'WGCNA',
+        'Train': os.path.join(base_dir, 'train/wgcna_hs/modules.txt'),
+        'Test': os.path.join(base_dir, 'test/wgcna_hs/modules.txt'),
+    },
+    {
+        'Name': 'ICA5',
+        'Train': os.path.join(base_dir, 'train/ica5/modules.txt'),
+        'Test': os.path.join(base_dir, 'test/ica5/modules.txt'),
+    },
+    # # Don't need so many ICA versions as this one isn't that different from ICA10
+    # {
+    #     'Name': 'ICA8',
+    #     'Train': os.path.join(base_dir, 'train/ica8/modules.txt'),
+    #     'Test': os.path.join(base_dir, 'test/ica8/modules.txt'),
+    # },
+    {
+        'Name': 'ICA10',
+        'Train': os.path.join(base_dir, 'train/ica10/modules.txt'),
+        'Test': os.path.join(base_dir, 'test/ica10/modules.txt'),
+    },
+    {
+        'Name': 'Grnboost',
+        'Train': os.path.join(base_dir, 'train/grnboost/modules.txt'),
+        'Test': os.path.join(base_dir, 'test/grnboost/modules.txt'),
+    },
+]
+
+for data in datasets:
+    train = pd.read_table(data['Train'], index_col=0)
+    test = pd.read_table(data['Test'], index_col=0)
+
+    train = train.Cluster.to_dict()
+    test = test.Cluster.to_dict()
+    data['TrainDict'] = train
+    data['TestDict'] = test
+
+
+def eval_module_consistency(data):
+    consistency = (
+        eval_module_consistency_inner(data['TrainDict'], data['TestDict']) +
+        eval_module_consistency_inner(data['TestDict'], data['TrainDict'])
+    ) / 2
+
+    data['Consistency'] = consistency
+
+
+def eval_module_consistency_inner(dict_a, dict_b):
+
+    all_genes = set(dict_a.keys()) & set(dict_b.keys())
+
+    # For each pair of genes that are in the same module in 'A', how many are in the same module in 'B'?
+
+    denom = 0
+    num = 0
+    for ga in all_genes:
+        for gb in all_genes:
+
+            if ga == gb: continue
+
+            if dict_a[ga] == dict_a[gb] and dict_a[ga] != -1 and dict_a[gb] != -1:  # Same module in A
+                denom += 1
+
+                if dict_b[ga] == dict_b[gb] and dict_b[ga] != -1 and dict_b[gb] != -1:  # Same module in B
+                    num += 1
+
+    num = num/2
+    denom = denom/2
+
+    consistent_rate = num/denom
+
+    return consistent_rate
+
+
+def eval_num_modules(data):
+    num_modules = (
+        pd.Series(data['TrainDict']).unique().size - 1 +
+        pd.Series(data['TestDict']).unique().size - 1
+    ) / 2
+
+    data['NumModules'] = num_modules
+
+
+def eval_num_assigned(data):
+    assigned = (
+        (pd.Series(data['TrainDict']) != -1).sum()/2 +
+        (pd.Series(data['TestDict']) != -1).sum()/2
+    )
+
+    data['NumAssigned'] = assigned
+
+
+for data in tqdm(datasets):
+    train = pd.read_table(data['Train'], index_col=0)
+    test = pd.read_table(data['Test'], index_col=0)
+
+    train = train.Cluster.to_dict()
+    test = test.Cluster.to_dict()
+    data['TrainDict'] = train
+    data['TestDict'] = test
+
+    eval_module_consistency(data)
+    eval_num_modules(data)
+    eval_num_assigned(data)
+
+
+# %% Consolidate into a nice dataframe
+columns = [
+    'Name',
+    'Consistency',
+    'NumModules',
+    'NumAssigned'
+]
+
+results = []
+for data in datasets:
+    results.append(
+        [data[x] for x in columns]
+    )
+
+results = pd.DataFrame(results, columns=columns)
+
+
+# %% Plot it
+order = ['ICA5', 'ICA10', 'Grnboost', 'WGCNA', 'Hotspot']
+colors = sns.color_palette('deep')[:len(order)]
+plot_data = results.set_index('Name').loc[order]
+
+fig, axs = plt.subplots(1, 3, figsize=(12, 4))
+
+plt.sca(axs[0])
+
+plt.bar(
+    plot_data.index, plot_data.Consistency, alpha=0.9, color=colors
+)
+plt.xticks(rotation=45)
+plt.ylabel('Proportion of Gene Pairs Which\nReplicate Across Data Split')
+plt.title('Reproducibility')
+plt.gca().set_axisbelow(True)
+plt.grid(color='#CCCCCC', lw=0.5, axis='y', ls=(0, (5, 5)))
+
+plt.sca(axs[1])
+
+plt.bar(
+    plot_data.index, plot_data.NumModules, alpha=0.9, color=colors
+)
+plt.xticks(rotation=45)
+plt.ylabel('Modules')
+plt.title('# Modules')
+plt.gca().set_axisbelow(True)
+plt.grid(color='#CCCCCC', lw=0.5, axis='y', ls=(0, (5, 5)))
+
+plt.sca(axs[2])
+
+plt.bar(
+    plot_data.index, plot_data.NumAssigned, alpha=0.9, color=colors
+)
+plt.xticks(rotation=45)
+plt.ylabel('Genes')
+plt.title('# Genes Assigned')
+plt.gca().set_axisbelow(True)
+plt.grid(color='#CCCCCC', lw=0.5, axis='y', ls=(0, (5, 5)))
+
+plt.subplots_adjust(bottom=.25, wspace=0.4, left=0.1, right=0.9)
+plt.savefig('CD4_Module_TrainTest.svg')
diff --git a/Transcriptomics/Figures/CD4_Correlation/ModuleConsistency.py b/Transcriptomics/Figures/CD4_Correlation/ModuleConsistency_Monocytes.py
similarity index 96%
rename from Transcriptomics/Figures/CD4_Correlation/ModuleConsistency.py
rename to Transcriptomics/Figures/CD4_Correlation/ModuleConsistency_Monocytes.py
index 2b32969..f6a90fb 100644
--- a/Transcriptomics/Figures/CD4_Correlation/ModuleConsistency.py
+++ b/Transcriptomics/Figures/CD4_Correlation/ModuleConsistency_Monocytes.py
@@ -3,11 +3,9 @@
 from tqdm import tqdm
 import matplotlib.pyplot as plt
 import seaborn as sns
- 
+
 plt.rcParams['svg.fonttype'] = 'none'
 
-# Pick which set of results to plot - CD4 or Monocytes
-# base_dir = '../../CD4_w_protein'
 base_dir = '../../Mono_w_protein'
 
 datasets = [
@@ -180,6 +178,4 @@ def eval_num_assigned(data):
 plt.grid(color='#CCCCCC', lw=0.5, axis='y', ls=(0, (5, 5)))
 
 plt.subplots_adjust(bottom=.25, wspace=0.4, left=0.1, right=0.9)
-# plt.savefig('CD4_Module_TrainTest.svg')
 plt.savefig('Monocyte_Module_TrainTest.svg')
-# plt.show()
diff --git a/Transcriptomics/Figures/EvaluateFeatureSelection/hvg_vs_hs.py b/Transcriptomics/Figures/EvaluateFeatureSelection/hvg_vs_hs.py
index 8484f66..7f87328 100644
--- a/Transcriptomics/Figures/EvaluateFeatureSelection/hvg_vs_hs.py
+++ b/Transcriptomics/Figures/EvaluateFeatureSelection/hvg_vs_hs.py
@@ -14,30 +14,6 @@
 hvg = hvg.loc[hs.index]
 
 
-# %% Plot one vs other
-
-# plot_data = hs[['Symbol', 'Z']].join(
-#     hvg[['gene.dispersion.scaled', 'gene.mean']]
-# )
-# 
-# plt.figure(figsize=(5, 5))
-# 
-# plt.plot(
-#     plot_data['gene.dispersion.scaled'],
-#     plot_data['Z'],
-#     'o', ms=2, rasterized=True
-# )
-# 
-# plt.gca().set_axisbelow(True)
-# plt.xlabel('Scaled Dispersion')
-# plt.ylabel('Autocorrelation Z')
-# plt.grid(color='#BBBBBB', ls=(0, (5, 5)), lw=.5)
-# plt.subplots_adjust(left=0.15, right=1-0.15, bottom=0.15, top=1-0.15)
-# 
-# plt.show()
-# plt.savefig('autocorrelation_vs_hvg.svg', dpi=300)
-
-
 # %%
 
 loom_file = "../../../data/10x_PBMC_w_proteins/cd4/data.loom"