-
Notifications
You must be signed in to change notification settings - Fork 4
/
run_corncob.nf
245 lines (199 loc) · 6.89 KB
/
run_corncob.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
#!/usr/bin/env nextflow
/*
Geneshot: A pipeline to robustly identify which alleles (n.e.e peptide coding sequences)
are present in a microbial community.
This utility runs the corncob algorithm with a specified formula on an
existing geneshot output file object.
*/
// Using DSL-2
nextflow.preview.dsl=2
// Parameters
params.input_hdf = false
params.input_folder = false
params.output_folder = false
params.output_prefix = false
params.formula = false
params.fdr_method = "fdr_bh"
params.help = false
// Docker containers
container__pandas = "quay.io/fhcrc-microbiome/python-pandas:v1.0.3"
// Function which prints help message text
def helpMessage() {
log.info"""
Run corncob with a specified formula on a geneshot output file.
This utility will create a new geneshot output file in which
the only difference is the corncob output results stored within.
Usage:
nextflow run Golob-Minot/geneshot/run_corncob.nf <ARGUMENTS>
Options:
--input_hdf Geneshot results HDF5 file to be used for analysis
--input_folder Folder containing geneshot output to be used for analysis (must contain the subfolder "abund/")
--output_folder Folder to place output tile
--output_prefix Text used as a prefix for summary HDF5 output files (the .corncob.hdf5 suffix will be attached)
-w Working directory. Defaults to `./work`
For Statistical Analysis:
--formula Optional formula used to estimate associations with CAG relative abundance
--fdr_method FDR method used to calculate q-values for associations (default: 'fdr_bh')
""".stripIndent()
}
// Show help message if the user specifies the --help flag at runtime
if (params.help || params.input_hdf == false || params.input_folder == false || params.output_folder == false || params.output_prefix == false || params.formula == false){
// Invoke the function above which prints the help message
helpMessage()
// Exit out and do not run anything else
exit 0
}
// Import the process used for statistical analysis
include runCorncob from './modules/statistics' params(
formula: params.formula
)
// Import the process used to join the results for multiple formulae
include joinCorncob from './modules/statistics' params(
output_folder: params.output_folder
)
// Import the process used to aggregate readcounts across all samples
// This process is only run if `abund/CAG.readcounts.csv.gz` can not be found yet
// After the process completes, a new `abund/CAG.readcounts.csv.gz` will be added to that input folder
include extractCounts from './modules/statistics' params(
output_folder: params.input_folder
)
// Import the processes needed to run meta-analysis of corncob results by annotation
include runBetta from './modules/statistics'
include addBetta from './modules/statistics' params(
fdr_method: params.fdr_method
)
// Import the process used to add corncob results to the output
include repackHDF from './modules/general' params(
output_folder: params.output_folder
)
include addCorncobResults from './modules/general' params(
fdr_method: params.fdr_method
)
// Process to update the formula listed in the summary table
// Also extract the manifest to reduce the number of times
// the file is opened, as well as the table listing the genes
// which make up each CAG.
process updateFormula{
container "${container__pandas}"
label 'mem_medium'
errorStrategy 'retry'
input:
path results_hdf
output:
path "*corncob.hdf5"
path "manifest.csv"
path "CAG.assignments.csv.gz"
"""
#!/usr/bin/env python3
import pandas as pd
import shutil
# Come up with a new name for the output file
formula_name = "${params.formula}".replace(
" ", "_"
).replace(
"+", "_"
).replace(
",", "_"
).replace(
"*", "_"
).replace(
":", "_"
).replace(
"__", "_"
).replace(
"__", "_"
)
new_file_name = "${params.output_prefix}.%s.corncob.hdf5" % formula_name
# Copy the input file to a new output file object
print("Copying %s to %s" % ("${results_hdf}", new_file_name))
shutil.copyfile("${results_hdf}", new_file_name)
# Open a connection to the store
with pd.HDFStore(new_file_name, "a") as store:
# Read in the existing summary table
df = pd.read_hdf(store, "/summary/experiment")
print("Existing summary:")
print(df)
# Add the formula
df = pd.concat([
df.query("variable != 'formula'"),
pd.DataFrame([dict(variable='formula', value='${params.formula}')])
])
print("")
print("New summary:")
print(df)
# Write back to the store
df.to_hdf(store, "/summary/experiment")
# Extract the manifest
print("Extracting the manifest")
pd.read_hdf(store, "/manifest").to_csv("manifest.csv")
# Extract the table of genes making up each CAG
print("Extracting the gene~CAG table")
pd.read_hdf(
store,
"/annot/gene/all",
columns=["gene", "CAG"]
).to_csv(
"CAG.assignments.csv.gz",
index=None
)
print("Done")
"""
}
workflow {
// Make sure we can find the input file
if(file(params.input_hdf).isEmpty()){
log.info"""Cannot find input file ${params.input_hdf}""".stripIndent()
exit 0
}
// First update the formula listed in the geneshot results file,
// a process which will also extract the manifest CSV
updateFormula(
file(params.input_hdf)
)
// Check to see if `abund/CAG.readcounts.csv.gz` is present in the input folder
// For context, this file is only created by geneshot for runs which originally
// included a `--formula`.
// If this file cannot be found, then we will run a process to create that file
// and also to publish that file back to the input folder.
abund_folder = "${params.input_folder.replaceAll(/\/$/, "")}/abund"
if(file("${abund_folder}/CAG.readcounts.csv.gz").isEmpty()){
extractCounts(
Channel.fromPath(
"${abund_folder}/details/*json.gz"
).toSortedList(),
updateFormula.out[2]
)
input_csv = extractCounts.out
} else {
input_csv = file("${abund_folder}/CAG.readcounts.csv.gz")
}
// Set up a channel with the strings of the formula(s) provided
formula_ch = Channel.of(
params.formula.split(",")
)
// Now run corncob on the extracted manifest, as well as the gene counts table
runCorncob(
input_csv,
updateFormula.out[1],
formula_ch
)
joinCorncob(
runCorncob.out.toSortedList()
)
// Add those results to the output file
addCorncobResults(
updateFormula.out[0],
joinCorncob.out
)
runBetta(
addCorncobResults.out[1].flatten()
)
addBetta(
addCorncobResults.out[0],
runBetta.out.toSortedList()
)
// Repack the HDF
repackHDF(
addBetta.out[0]
)
}