Skip to content

Commit

Permalink
Merge pull request #35 from Golob-Minot/test_alignment_batches
Browse files Browse the repository at this point in the history
Add metaPhlAn2 to HDF5
  • Loading branch information
sminot committed Apr 29, 2020
2 parents 319309b + 6994409 commit b34a086
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 48 deletions.
26 changes: 11 additions & 15 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ include collectBreakaway from './modules/statistics' params(
include metaphlan2_fastq from './modules/composition' params(
output_folder: output_folder
)
include join_metaphlan2 from './modules/composition'
// include join_metaphlan2 from './modules/composition'
include addMetaPhlAn2Results from './modules/general'


Expand Down Expand Up @@ -341,12 +341,6 @@ workflow {
r -> [r[0], r[1], r[2]]
}
)
// // Make a single table with all of the metaPhlAn2 results
// join_metaphlan2(
// metaphlan2_fastq.out.map {
// r -> r[1]
// }.toSortedList()
// )
}

// ###################################
Expand Down Expand Up @@ -443,15 +437,17 @@ workflow {
detailedHDF = addGeneAssembly.out[1]
}

// // If we performed compositional analysis, add the results ot the HDF5
// if (params.composition) {
// addMetaPhlAn2Results(
// resultsHDF,
// join_metaphlan2.out
// )
// If we performed compositional analysis, add the results ot the HDF5
if (params.composition) {
addMetaPhlAn2Results(
resultsHDF,
metaphlan2_fastq.out.map {
r -> r[1]
}.toSortedList()
)

// resultsHDF = addMetaPhlAn2Results.out
// }
resultsHDF = addMetaPhlAn2Results.out
}

// If we performed statistical analysis, add the results to the HDF5
if ( params.formula ) {
Expand Down
46 changes: 25 additions & 21 deletions modules/alignment.nf
Original file line number Diff line number Diff line change
Expand Up @@ -169,27 +169,31 @@ process diamond {
"""
set -e
cat ${R1} ${R2} | \
gunzip -c | \
awk "{if(NR % 4 == 1){print \\"@\\" NR }else{if(NR % 4 == 3){print \\"+\\"}else{print}}}" | \
gzip -c \
> query.fastq.gz
diamond \
blastx \
--query query.fastq.gz \
--out ${sample_name}.aln.gz \
--threads ${task.cpus} \
--db ${refdb} \
--outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen \
--min-score ${params.dmnd_min_score} \
--query-cover ${params.dmnd_min_coverage} \
--id ${params.dmnd_min_identity} \
--top ${params.dmnd_top_pct} \
--block-size ${task.memory.toMega() / (1024 * 6 * task.attempt)} \
--query-gencode ${params.gencode} \
--compress 1 \
--unal 0
for fp in ${R1} ${R2}; do
cat \$fp | \
gunzip -c | \
awk "{if(NR % 4 == 1){print \\"@\$fp\\" NR }else{if(NR % 4 == 3){print \\"+\\"}else{print}}}" | \
gzip -c \
> query.fastq.gz
diamond \
blastx \
--query query.fastq.gz \
--out \$fp.aln.gz \
--threads ${task.cpus} \
--db ${refdb} \
--outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen \
--min-score ${params.dmnd_min_score} \
--query-cover ${params.dmnd_min_coverage} \
--id ${params.dmnd_min_identity} \
--top ${params.dmnd_top_pct} \
--block-size ${task.memory.toMega() / (1024 * 6 * task.attempt)} \
--query-gencode ${params.gencode} \
--compress 1 \
--unal 0
done
cat *aln.gz > ${sample_name}.aln.gz
"""

}
Expand Down
2 changes: 1 addition & 1 deletion modules/composition.nf
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ workflow composition_wf {
process metaphlan2_fastq {
tag "MetaPhlAn2 composition for paired end fastq reads"
container "${container__metaphlan2}"
label = 'mem_veryhigh'
label = 'mem_medium'
errorStrategy 'retry'

// If the user sets --preprocess_output, write out the combined reads to that folder
Expand Down
38 changes: 27 additions & 11 deletions modules/general.nf
Original file line number Diff line number Diff line change
Expand Up @@ -664,7 +664,7 @@ process addMetaPhlAn2Results{

input:
path results_hdf
path metaphlan_csv
path metaphlan_tsv_list

output:
path "${results_hdf}"
Expand All @@ -674,19 +674,35 @@ process addMetaPhlAn2Results{
import pandas as pd
# Read in the metaphlan results
metaphlan_df = pd.read_csv("${metaphlan_csv}")
print(
"Read in metaphlan results for %d taxa" %
metaphlan_df.shape[0]
)
# Open a connection to the HDF5
with pd.HDFStore("${results_hdf}", "a") as store:
# Write metaphlan results to HDF5
metaphlan_df.to_hdf(store, "/composition/metaphlan")
# Iterate over each input file
for fp in "${metaphlan_tsv_list}".split(" "):
# Make sure that the file has the expected suffix
assert fp.endswith(".metaphlan2.tsv"), fp
# Get the specimen name from the file name
specimen_name = fp.replace(".metaphlan2.tsv", "")
# Read in from the flat file
metaphlan_df = pd.read_csv(
fp,
skiprows=1,
sep="\\t"
)
print("Read in %d taxa from %s" % (metaphlan_df.shape[0], specimen_name))
# Write metaphlan results to HDF5
key = "/composition/metaphlan/%s" % specimen_name
print("Writing to %s" % key)
metaphlan_df.to_hdf(store, key)
print("Closing store")
print("Done")
"""

Expand Down

0 comments on commit b34a086

Please sign in to comment.