diff --git a/bin/add_corncob_results.py b/bin/add_corncob_results.py index 2056955..c55df25 100755 --- a/bin/add_corncob_results.py +++ b/bin/add_corncob_results.py @@ -149,10 +149,32 @@ def write_corncob_by_annot(corncob_wide, gene_annot, col_name_list, fp_out): ] if len(df) > 0: - pd.concat(df).to_csv( - fp_out, - index=None - ) + + # Write out the results in batches of ~10,000 lines each + ix = 0 + batch_size = 0 + batch = [] + + for i in df: + batch.append(i) + batch_size += i.shape[0] + + if batch_size >= 10000: + + pd.concat(batch).to_csv( + fp_out.format(ix), + index=None + ) + ix += 1 + batch_size = 0 + batch = [] + + # Write out the remainder + if len(batch) > 0: + pd.concat(batch).to_csv( + fp_out.format(ix), + index=None + ) else: # Write a dummy file to help with data flow @@ -161,7 +183,7 @@ def write_corncob_by_annot(corncob_wide, gene_annot, col_name_list, fp_out): "p_value": [1], "parameter": ["dummy"] }).to_csv( - fp_out, + fp_out.format(0), index=None ) @@ -202,7 +224,7 @@ def write_corncob_by_annot(corncob_wide, gene_annot, col_name_list, fp_out): ), gene_annot, columns_to_use, - "corncob.for.betta.csv.gz" + "corncob.for.betta.{}.csv.gz" ) else: # Write a dummy file to help with data flow @@ -211,7 +233,7 @@ def write_corncob_by_annot(corncob_wide, gene_annot, col_name_list, fp_out): "p_value": [1], "parameter": ["dummy"] }).to_csv( - "corncob.for.betta.csv.gz", + "corncob.for.betta.{}.csv.gz".format(0), index=None ) diff --git a/main.nf b/main.nf index 21c84e0..09bfa08 100755 --- a/main.nf +++ b/main.nf @@ -512,7 +512,7 @@ workflow { ) runBetta( - addCorncobResults.out[1] + addCorncobResults.out[1].flatten() ) addBetta( diff --git a/modules/general.nf b/modules/general.nf index 89a555f..ef3640b 100644 --- a/modules/general.nf +++ b/modules/general.nf @@ -678,7 +678,7 @@ process addCorncobResults{ output: path "${results_hdf}" - path "corncob.for.betta.csv.gz" optional true + path "corncob.for.betta.*.csv.gz" optional true """ #!/bin/bash diff --git a/modules/statistics.nf b/modules/statistics.nf index 56b53bd..6f54b8c 100644 --- a/modules/statistics.nf +++ b/modules/statistics.nf @@ -668,7 +668,7 @@ process addBetta{ input: path results_hdf - path betta_csv + path betta_csv_list output: path "${results_hdf}" @@ -680,12 +680,18 @@ import os import pandas as pd from statsmodels.stats.multitest import multipletests -betta_csv = "${betta_csv}" +betta_csv_list = "${betta_csv_list}".split(" ") -assert os.path.exists(betta_csv) +for betta_csv in betta_csv_list: + if len(betta_csv) > 1: + assert os.path.exists(betta_csv) # Read in from the flat file -df = pd.read_csv(betta_csv) +df = pd.concat([ + pd.read_csv(betta_csv) + for betta_csv in betta_csv_list + if len(betta_csv) > 1 +]) print("Read in {:,} lines from {}".format( df.shape[0], diff --git a/run_corncob.nf b/run_corncob.nf index c7317c3..8c90eb1 100644 --- a/run_corncob.nf +++ b/run_corncob.nf @@ -229,12 +229,12 @@ workflow { ) runBetta( - addCorncobResults.out[1] + addCorncobResults.out[1].flatten() ) addBetta( addCorncobResults.out[0], - runBetta.out + runBetta.out.toSortedList() ) // Repack the HDF