Run betta in batches
Sam Minot committed Jun 19, 2020
1 parent 316ec36 commit 7a3ea19
Showing 5 changed files with 43 additions and 15 deletions.
bin/add_corncob_results.py: 36 changes (29 additions & 7 deletions)
@@ -149,10 +149,32 @@ def write_corncob_by_annot(corncob_wide, gene_annot, col_name_list, fp_out):
     ]

     if len(df) > 0:
-        pd.concat(df).to_csv(
-            fp_out,
-            index=None
-        )
+
+        # Write out the results in batches of ~10,000 lines each
+        ix = 0
+        batch_size = 0
+        batch = []
+
+        for i in df:
+            batch.append(i)
+            batch_size += i.shape[0]
+
+            if batch_size >= 10000:
+
+                pd.concat(batch).to_csv(
+                    fp_out.format(ix),
+                    index=None
+                )
+                ix += 1
+                batch_size = 0
+                batch = []
+
+        # Write out the remainder
+        if len(batch) > 0:
+            pd.concat(batch).to_csv(
+                fp_out.format(ix),
+                index=None
+            )

     else:
         # Write a dummy file to help with data flow
@@ -161,7 +183,7 @@ def write_corncob_by_annot(corncob_wide, gene_annot, col_name_list, fp_out):
"p_value": [1],
"parameter": ["dummy"]
}).to_csv(
fp_out,
fp_out.format(0),
index=None
)

@@ -202,7 +224,7 @@ def write_corncob_by_annot(corncob_wide, gene_annot, col_name_list, fp_out):
             ),
             gene_annot,
             columns_to_use,
-            "corncob.for.betta.csv.gz"
+            "corncob.for.betta.{}.csv.gz"
         )
     else:
         # Write a dummy file to help with data flow
@@ -211,7 +233,7 @@ def write_corncob_by_annot(corncob_wide, gene_annot, col_name_list, fp_out):
"p_value": [1],
"parameter": ["dummy"]
}).to_csv(
"corncob.for.betta.csv.gz",
"corncob.for.betta.{}.csv.gz".format(0),
index=None
)

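For reference, the batching logic added above can be read as a standalone helper. The sketch below is illustrative only (the write_batched_csvs name, the batch_rows default, and the return value are not part of the commit); it assumes an iterable of pandas DataFrames and an output path template containing a {} placeholder, matching the corncob.for.betta.{}.csv.gz pattern introduced here.

import pandas as pd

def write_batched_csvs(frames, fp_template, batch_rows=10000):
    # Accumulate DataFrames until roughly batch_rows rows are buffered,
    # then flush them to a numbered CSV (e.g. corncob.for.betta.0.csv.gz).
    ix = 0
    batch_size = 0
    batch = []

    for frame in frames:
        batch.append(frame)
        batch_size += frame.shape[0]

        if batch_size >= batch_rows:
            pd.concat(batch).to_csv(fp_template.format(ix), index=None)
            ix += 1
            batch_size = 0
            batch = []

    # Flush whatever is left after the loop
    if len(batch) > 0:
        pd.concat(batch).to_csv(fp_template.format(ix), index=None)
        ix += 1

    # Number of files written
    return ix

The numbered files produced this way are what the corncob.for.betta.*.csv.gz glob in modules/general.nf (below) picks up.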
main.nf: 2 changes (1 addition & 1 deletion)
@@ -512,7 +512,7 @@ workflow {
     )

     runBetta(
-        addCorncobResults.out[1]
+        addCorncobResults.out[1].flatten()
     )

     addBetta(
modules/general.nf: 2 changes (1 addition & 1 deletion)
@@ -678,7 +678,7 @@ process addCorncobResults{

     output:
     path "${results_hdf}"
-    path "corncob.for.betta.csv.gz" optional true
+    path "corncob.for.betta.*.csv.gz" optional true

     """
     #!/bin/bash
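The wildcard output above is what connects the write and read sides: every numbered batch file written by add_corncob_results.py matches the corncob.for.betta.*.csv.gz pattern. A quick illustrative check in Python (the listed file names are hypothetical examples, not fixed outputs):

import glob

# After the batched writer has run in the work directory, the optional
# output glob declared by the process should capture every batch file.
batch_files = sorted(glob.glob("corncob.for.betta.*.csv.gz"))
print(batch_files)  # e.g. ['corncob.for.betta.0.csv.gz', 'corncob.for.betta.1.csv.gz']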
modules/statistics.nf: 14 changes (10 additions & 4 deletions)
@@ -668,7 +668,7 @@ process addBetta{

     input:
     path results_hdf
-    path betta_csv
+    path betta_csv_list

     output:
     path "${results_hdf}"
@@ -680,12 +680,18 @@
 import os
 import pandas as pd
 from statsmodels.stats.multitest import multipletests
-betta_csv = "${betta_csv}"
+betta_csv_list = "${betta_csv_list}".split(" ")
-assert os.path.exists(betta_csv)
+for betta_csv in betta_csv_list:
+    if len(betta_csv) > 1:
+        assert os.path.exists(betta_csv)
 # Read in from the flat file
-df = pd.read_csv(betta_csv)
+df = pd.concat([
+    pd.read_csv(betta_csv)
+    for betta_csv in betta_csv_list
+    if len(betta_csv) > 1
+])
 print("Read in {:,} lines from {}".format(
     df.shape[0],
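The consumer side of the batching can also be read in isolation. The sketch below is a hedged reconstruction (the read_betta_batches name is illustrative, not in the repository); it assumes a space-separated list of staged file names, as produced by "${betta_csv_list}".split(" ") in the process above, with empty tokens skipped via the len(betta_csv) > 1 guard.

import os
import pandas as pd

def read_betta_batches(betta_csv_list):
    # Skip empty tokens that can result from splitting the staged
    # file list on spaces, and check that each remaining file exists.
    batch_files = [fp for fp in betta_csv_list if len(fp) > 1]
    for fp in batch_files:
        assert os.path.exists(fp), "Missing input: %s" % fp

    # Concatenate all per-batch tables back into a single DataFrame
    return pd.concat([pd.read_csv(fp) for fp in batch_files])

# Example with the naming scheme used in this commit (illustrative paths):
# df = read_betta_batches("corncob.for.betta.0.csv.gz corncob.for.betta.1.csv.gz".split(" "))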
run_corncob.nf: 4 changes (2 additions & 2 deletions)
@@ -229,12 +229,12 @@ workflow {
     )

     runBetta(
-        addCorncobResults.out[1]
+        addCorncobResults.out[1].flatten()
     )

     addBetta(
         addCorncobResults.out[0],
-        runBetta.out
+        runBetta.out.toSortedList()
     )

     // Repack the HDF
