Merge pull request #35 from Golob-Minot/test_alignment_batches

Add metaPhlAn2 to HDF5
Golob-Minot · Apr 29, 2020 · b34a086 · b34a086
2 parents 319309b + 6994409
commit b34a086
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 48 deletions.
diff --git a/main.nf b/main.nf
@@ -267,7 +267,7 @@ include collectBreakaway from './modules/statistics' params(
 include metaphlan2_fastq from './modules/composition' params(
     output_folder: output_folder
 )
-include join_metaphlan2 from './modules/composition'
+// include join_metaphlan2 from './modules/composition'
 include addMetaPhlAn2Results from './modules/general'
 
 
@@ -341,12 +341,6 @@ workflow {
                 r -> [r[0], r[1], r[2]]
             }
         )
-        // // Make a single table with all of the metaPhlAn2 results
-        // join_metaphlan2(
-        //     metaphlan2_fastq.out.map {
-        //         r -> r[1]
-        //     }.toSortedList()
-        // )
     }
 
     // ###################################
@@ -443,15 +437,17 @@ workflow {
         detailedHDF = addGeneAssembly.out[1]
     }
 
-    // // If we performed compositional analysis, add the results ot the HDF5
-    // if (params.composition) {
-    //     addMetaPhlAn2Results(
-    //         resultsHDF,
-    //         join_metaphlan2.out
-    //     )
+    // If we performed compositional analysis, add the results ot the HDF5
+    if (params.composition) {
+        addMetaPhlAn2Results(
+            resultsHDF,
+            metaphlan2_fastq.out.map {
+                r -> r[1]
+            }.toSortedList()
+        )
 
-    //     resultsHDF = addMetaPhlAn2Results.out
-    // }
+        resultsHDF = addMetaPhlAn2Results.out
+    }
 
     // If we performed statistical analysis, add the results to the HDF5
     if ( params.formula ) {

diff --git a/modules/alignment.nf b/modules/alignment.nf
@@ -169,27 +169,31 @@ process diamond {
     """
     set -e
 
-    cat ${R1} ${R2} | \
-    gunzip -c | \
-    awk "{if(NR % 4 == 1){print \\"@\\" NR }else{if(NR % 4 == 3){print \\"+\\"}else{print}}}" | \
-    gzip -c \
-     > query.fastq.gz
-
-    diamond \
-      blastx \
-      --query query.fastq.gz \
-      --out ${sample_name}.aln.gz \
-      --threads ${task.cpus} \
-      --db ${refdb} \
-      --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen \
-      --min-score ${params.dmnd_min_score} \
-      --query-cover ${params.dmnd_min_coverage} \
-      --id ${params.dmnd_min_identity} \
-      --top ${params.dmnd_top_pct} \
-      --block-size ${task.memory.toMega() / (1024 * 6 * task.attempt)} \
-      --query-gencode ${params.gencode} \
-      --compress 1 \
-      --unal 0
+    for fp in ${R1} ${R2}; do
+        cat \$fp | \
+        gunzip -c | \
+        awk "{if(NR % 4 == 1){print \\"@\$fp\\" NR }else{if(NR % 4 == 3){print \\"+\\"}else{print}}}" | \
+        gzip -c \
+        > query.fastq.gz
+
+        diamond \
+        blastx \
+        --query query.fastq.gz \
+        --out \$fp.aln.gz \
+        --threads ${task.cpus} \
+        --db ${refdb} \
+        --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen \
+        --min-score ${params.dmnd_min_score} \
+        --query-cover ${params.dmnd_min_coverage} \
+        --id ${params.dmnd_min_identity} \
+        --top ${params.dmnd_top_pct} \
+        --block-size ${task.memory.toMega() / (1024 * 6 * task.attempt)} \
+        --query-gencode ${params.gencode} \
+        --compress 1 \
+        --unal 0
+    done
+
+    cat *aln.gz > ${sample_name}.aln.gz
     """
 
 }

diff --git a/modules/composition.nf b/modules/composition.nf
@@ -21,7 +21,7 @@ workflow composition_wf {
     process metaphlan2_fastq {
     tag "MetaPhlAn2 composition for paired end fastq reads"
     container "${container__metaphlan2}"
-    label = 'mem_veryhigh'
+    label = 'mem_medium'
     errorStrategy 'retry'
 
     // If the user sets --preprocess_output, write out the combined reads to that folder

diff --git a/modules/general.nf b/modules/general.nf
@@ -664,7 +664,7 @@ process addMetaPhlAn2Results{
 
     input:
         path results_hdf
-        path metaphlan_csv
+        path metaphlan_tsv_list
 
     output:
         path "${results_hdf}"
@@ -674,19 +674,35 @@ process addMetaPhlAn2Results{
 
 import pandas as pd
 
-# Read in the metaphlan results
-metaphlan_df = pd.read_csv("${metaphlan_csv}")
-
-print(
-    "Read in metaphlan results for %d taxa" % 
-    metaphlan_df.shape[0]
-)
-
 # Open a connection to the HDF5
 with pd.HDFStore("${results_hdf}", "a") as store:
 
-    # Write metaphlan results to HDF5
-    metaphlan_df.to_hdf(store, "/composition/metaphlan")
+    # Iterate over each input file
+    for fp in "${metaphlan_tsv_list}".split(" "):
+
+        # Make sure that the file has the expected suffix
+        assert fp.endswith(".metaphlan2.tsv"), fp
+
+        # Get the specimen name from the file name
+        specimen_name = fp.replace(".metaphlan2.tsv", "")
+
+        # Read in from the flat file
+        metaphlan_df = pd.read_csv(
+            fp,
+            skiprows=1,
+            sep="\\t"
+        )
+
+        print("Read in %d taxa from %s" % (metaphlan_df.shape[0], specimen_name))
+
+        # Write metaphlan results to HDF5
+        key = "/composition/metaphlan/%s" % specimen_name
+        print("Writing to %s" % key)
+        metaphlan_df.to_hdf(store, key)
+
+    print("Closing store")
+
+print("Done")
 
 """