Run betta in batches
Sam Minot committed Jun 19, 2020
1 parent 316ec36 commit 7a3ea19
Showing 5 changed files with 43 additions and 15 deletions.
bin/add_corncob_results.py: 36 changes (29 additions & 7 deletions)
@@ -149,10 +149,32 @@ def write_corncob_by_annot(corncob_wide, gene_annot, col_name_list, fp_out):
     ]

     if len(df) > 0:
-        pd.concat(df).to_csv(
-            fp_out,
-            index=None
-        )
+
+        # Write out the results in batches of ~10,000 lines each
+        ix = 0
+        batch_size = 0
+        batch = []
+
+        for i in df:
+            batch.append(i)
+            batch_size += i.shape[0]
+
+            if batch_size >= 10000:
+
+                pd.concat(batch).to_csv(
+                    fp_out.format(ix),
+                    index=None
+                )
+                ix += 1
+                batch_size = 0
+                batch = []
+
+        # Write out the remainder
+        if len(batch) > 0:
+            pd.concat(batch).to_csv(
+                fp_out.format(ix),
+                index=None
+            )

     else:
         # Write a dummy file to help with data flow
@@ -161,7 +183,7 @@ def write_corncob_by_annot(corncob_wide, gene_annot, col_name_list, fp_out):
"p_value": [1],
"parameter": ["dummy"]
}).to_csv(
fp_out,
fp_out.format(0),
index=None
)

@@ -202,7 +224,7 @@ def write_corncob_by_annot(corncob_wide, gene_annot, col_name_list, fp_out):
             ),
             gene_annot,
             columns_to_use,
-            "corncob.for.betta.csv.gz"
+            "corncob.for.betta.{}.csv.gz"
         )
     else:
         # Write a dummy file to help with data flow
@@ -211,7 +233,7 @@ def write_corncob_by_annot(corncob_wide, gene_annot, col_name_list, fp_out):
"p_value": [1],
"parameter": ["dummy"]
}).to_csv(
"corncob.for.betta.csv.gz",
"corncob.for.betta.{}.csv.gz".format(0),
index=None
)

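For reference, the batching logic added above can be read as a standalone helper. The sketch below is illustrative only (the write_batched_csvs name, the batch_rows default, and the return value are not part of the commit); it assumes an iterable of pandas DataFrames and an output path template containing a {} placeholder, matching the corncob.for.betta.{}.csv.gz pattern introduced here.

import pandas as pd

def write_batched_csvs(frames, fp_template, batch_rows=10000):
    # Accumulate DataFrames until roughly batch_rows rows are buffered,
    # then flush them to a numbered CSV (e.g. corncob.for.betta.0.csv.gz).
    ix = 0
    batch_size = 0
    batch = []

    for frame in frames:
        batch.append(frame)
        batch_size += frame.shape[0]

        if batch_size >= batch_rows:
            pd.concat(batch).to_csv(fp_template.format(ix), index=None)
            ix += 1
            batch_size = 0
            batch = []

    # Flush whatever is left after the loop
    if len(batch) > 0:
        pd.concat(batch).to_csv(fp_template.format(ix), index=None)
        ix += 1

    # Number of files written
    return ix

The numbered files produced this way are what the corncob.for.betta.*.csv.gz glob in modules/general.nf (below) picks up.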
main.nf: 2 changes (1 addition & 1 deletion)
@@ -512,7 +512,7 @@ workflow {
     )

     runBetta(
-        addCorncobResults.out[1]
+        addCorncobResults.out[1].flatten()
     )

     addBetta(
modules/general.nf: 2 changes (1 addition & 1 deletion)
@@ -678,7 +678,7 @@ process addCorncobResults{

     output:
     path "${results_hdf}"
-    path "corncob.for.betta.csv.gz" optional true
+    path "corncob.for.betta.*.csv.gz" optional true

     """
     #!/bin/bash
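The wildcard output above is what connects the write and read sides: every numbered batch file written by add_corncob_results.py matches the corncob.for.betta.*.csv.gz pattern. A quick illustrative check in Python (the listed file names are hypothetical examples, not fixed outputs):

import glob

# After the batched writer has run in the work directory, the optional
# output glob declared by the process should capture every batch file.
batch_files = sorted(glob.glob("corncob.for.betta.*.csv.gz"))
print(batch_files)  # e.g. ['corncob.for.betta.0.csv.gz', 'corncob.for.betta.1.csv.gz']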
modules/statistics.nf: 14 changes (10 additions & 4 deletions)
@@ -668,7 +668,7 @@ process addBetta{

     input:
     path results_hdf
-    path betta_csv
+    path betta_csv_list

     output:
     path "${results_hdf}"
@@ -680,12 +680,18 @@
 import os
 import pandas as pd
 from statsmodels.stats.multitest import multipletests
-betta_csv = "${betta_csv}"
+betta_csv_list = "${betta_csv_list}".split(" ")
-assert os.path.exists(betta_csv)
+for betta_csv in betta_csv_list:
+    if len(betta_csv) > 1:
+        assert os.path.exists(betta_csv)
 # Read in from the flat file
-df = pd.read_csv(betta_csv)
+df = pd.concat([
+    pd.read_csv(betta_csv)
+    for betta_csv in betta_csv_list
+    if len(betta_csv) > 1
+])
 print("Read in {:,} lines from {}".format(
     df.shape[0],
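The consumer side of the batching can also be read in isolation. The sketch below is a hedged reconstruction (the read_betta_batches name is illustrative, not in the repository); it assumes a space-separated list of staged file names, as produced by "${betta_csv_list}".split(" ") in the process above, with empty tokens skipped via the len(betta_csv) > 1 guard.

import os
import pandas as pd

def read_betta_batches(betta_csv_list):
    # Skip empty tokens that can result from splitting the staged
    # file list on spaces, and check that each remaining file exists.
    batch_files = [fp for fp in betta_csv_list if len(fp) > 1]
    for fp in batch_files:
        assert os.path.exists(fp), "Missing input: %s" % fp

    # Concatenate all per-batch tables back into a single DataFrame
    return pd.concat([pd.read_csv(fp) for fp in batch_files])

# Example with the naming scheme used in this commit (illustrative paths):
# df = read_betta_batches("corncob.for.betta.0.csv.gz corncob.for.betta.1.csv.gz".split(" "))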
run_corncob.nf: 4 changes (2 additions & 2 deletions)
@@ -229,12 +229,12 @@ workflow {
     )

     runBetta(
-        addCorncobResults.out[1]
+        addCorncobResults.out[1].flatten()
     )

     addBetta(
         addCorncobResults.out[0],
-        runBetta.out
+        runBetta.out.toSortedList()
     )

     // Repack the HDF
