fix bakta support for pangenome runs

bactopia · Apr 16, 2024 · 60b8053 · 60b8053
1 parent cfd4c12
commit 60b8053
Show file tree

Hide file tree

Showing 6 changed files with 108 additions and 25 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -27,6 +27,7 @@ description: A full list of Bactopia releases and a description of the changes.
 
 - missing schema for clean-yer-reads
 - use `--infile-list` with `csvtk concat` to support 10k+ inputs
+- `pangenome` when Bakta GFF (`*.gff3`) files are provided
 
 ### `Enhancements to OSS`
 

diff --git a/bin/kraken-bracken-summary.py b/bin/kraken-bracken-summary.py
@@ -80,14 +80,25 @@ def braken_root_count(bracken_report):
         'bracken_secondary_species_abundance',
         'bracken_unclassified_abundance'
     ]
-    results = [
-        args.prefix,
-        bracken['name'].iloc[0] if bracken['fraction_total_reads'].iloc[0] >= 0.01 else "No primary abundance > 1%",
-        "{0:.5f}".format(bracken['fraction_total_reads'].iloc[0]) if bracken['fraction_total_reads'].iloc[0] >= 0.01 else "",
-        bracken['name'].iloc[1] if bracken['fraction_total_reads'].iloc[1] >= 0.01 else "No secondary abundance > 1%",
-        "{0:.5f}".format(bracken['fraction_total_reads'].iloc[1]) if bracken['fraction_total_reads'].iloc[1] >= 0.01 else "",
-        "{0:.5f}".format(unclassified_count / total_count)
-    ]
+    if len(bracken) > 1:
+        results = [
+            args.prefix,
+            bracken['name'].iloc[0] if bracken['fraction_total_reads'].iloc[0] >= 0.01 else "No primary abundance > 1%",
+            "{0:.5f}".format(bracken['fraction_total_reads'].iloc[0]) if bracken['fraction_total_reads'].iloc[0] >= 0.01 else "",
+            bracken['name'].iloc[1] if bracken['fraction_total_reads'].iloc[1] >= 0.01 else "No secondary abundance > 1%",
+            "{0:.5f}".format(bracken['fraction_total_reads'].iloc[1]) if bracken['fraction_total_reads'].iloc[1] >= 0.01 else "",
+            "{0:.5f}".format(unclassified_count / total_count)
+        ]
+    else:
+        results = [
+            args.prefix,
+            bracken['name'].iloc[0] if bracken['fraction_total_reads'].iloc[0] >= 0.01 else "No primary abundance > 1%",
+            "{0:.5f}".format(bracken['fraction_total_reads'].iloc[0]) if bracken['fraction_total_reads'].iloc[0] >= 0.01 else "",
+            "No secondary abundance > 1%",
+            "",
+            "{0:.5f}".format(unclassified_count / total_count)
+        ]
+
     with open("{0}.bracken.tsv".format(args.prefix), "wt") as fh_out:
         fh_out.write("{}\n".format('\t'.join(cols)))
         fh_out.write("{}\n".format('\t'.join(results)))

diff --git a/modules/nf-core/panaroo/run/main.nf b/modules/nf-core/panaroo/run/main.nf
@@ -33,13 +33,16 @@ process PANAROO_RUN {
     """
     mkdir gff
     cp -L gff-tmp/* gff/
-    find gff/ -name "*.gff.gz" | xargs gunzip
+    find gff/ -name "*.gz" | xargs gunzip
+
+    # Make FOFN of gff (Prokka) and gff3 (Bakta) files
+    find gff/ -name "*.gff" -or -name "*.gff3" > gff-fofn.txt
 
     panaroo \\
         $options.args \\
         -t $task.cpus \\
         -o results \\
-        -i gff/*.gff
+        -i gff-fofn.txt
 
     # Cleanup
     find . -name "*.fas" | xargs -I {} -P $task.cpus -n 1 gzip {}

diff --git a/modules/nf-core/pirate/main.nf b/modules/nf-core/pirate/main.nf
@@ -17,7 +17,7 @@ process PIRATE {
         'quay.io/biocontainers/pirate:1.0.5--hdfd78af_0' }"
 
     input:
-    tuple val(meta), path(gff)
+    tuple val(meta), path(gff, stageAs: 'gff-tmp/*')
 
     output:
     tuple val(meta), path("results/*")                        , emit: results
@@ -30,13 +30,21 @@ process PIRATE {
     script:
     prefix = options.suffix ? "${options.suffix}" : "${meta.id}"
     """
-    find . -name "*.gff.gz" | sed 's/.gz//' | xargs -I {} bash -c 'gzip -cdf {}.gz > {}'
+    mkdir gff
+    cp -L gff-tmp/* gff/
+    find gff/ -name "*.gff.gz" | xargs -r gunzip
+
+    # PIRATE only supports .gff extension, will need to adjust for gff3 (Bakta) files
+    # https://github.com/SionBayliss/PIRATE/blob/master/scripts/run_PIRATE.pl#L153
+    # note for later: "xargs -r" will not run if no files are found
+    find gff/ -name "*.gff3.gz" | xargs -r gunzip
+    for file in gff/*.gff3; do mv "\$file" "\${file%.gff3}.gff"; done
 
     PIRATE \\
         $options.args \\
         --align \\
         --threads $task.cpus \\
-        --input ./ \\
+        --input ./gff/ \\
         --output results/
     PIRATE_to_roary.pl -i results/PIRATE.*.tsv -o results/gene_presence_absence.csv
     find . -name "*.fasta" | xargs -I {} -P $task.cpus -n 1 gzip {}

diff --git a/modules/nf-core/roary/main.nf b/modules/nf-core/roary/main.nf
@@ -32,7 +32,14 @@ process ROARY {
     """
     mkdir gff
     cp -L gff-tmp/* gff/
-    find gff/ -name "*.gff.gz" | xargs gunzip
+    find gff/ -name "*.gff.gz" | xargs -r gunzip
+
+    # Roary only supports .gff extension, will need to adjust for gff3 (Bakta) files
+    # https://github.com/sanger-pathogens/Roary/blob/master/lib/Bio/Roary/PrepareInputFiles.pm#L82
+    # note for later: "xargs -r" will not run if no files are found
+    find gff/ -name "*.gff3.gz" | xargs -r gunzip
+    for file in gff/*.gff3; do mv "\$file" "\${file%.gff3}.gff"; done
+
     roary \\
         $options.args \\
         -p $task.cpus \\

diff --git a/subworkflows/local/snippy/main.nf b/subworkflows/local/snippy/main.nf
@@ -32,8 +32,8 @@ core_opts.args = [
 core_opts.publish_to_base = [".full.aln.gz"]
 core_opts.suffix = "core-snp"
 
-include { SNIPPY_RUN }  from '../../../modules/nf-core/snippy/run/main' addParams( options: snippy_opts )
-include { SNIPPY_CORE }  from '../../../modules/nf-core/snippy/core/main' addParams( options: core_opts )
+include { SNIPPY_RUN as SNIPPY_RUN_MODULE }  from '../../../modules/nf-core/snippy/run/main' addParams( options: snippy_opts )
+include { SNIPPY_CORE as SNIPPY_CORE_MODULE }  from '../../../modules/nf-core/snippy/core/main' addParams( options: core_opts )
 include { GUBBINS } from '../gubbins/main' addParams( options: [suffix: 'core-snp', ignore: [".aln.gz"]] )
 include { IQTREE } from '../iqtree/main' addParams( options: [suffix: 'core-snp', ignore: [".aln.gz"]] )
 include { SNPDISTS } from '../../../modules/nf-core/snpdists/main' addParams( options: [suffix: 'core-snp.distance'] )
@@ -46,24 +46,24 @@ workflow SNIPPY {
     ch_versions = Channel.empty()
 
     // Run Snippy per-sample
-    SNIPPY_RUN(reads, file(params.reference))
-    ch_versions = ch_versions.mix(SNIPPY_RUN.out.versions.first())
+    SNIPPY_RUN_MODULE(reads, file(params.reference))
+    ch_versions = ch_versions.mix(SNIPPY_RUN_MODULE.out.versions.first())
 
     // Identify core SNPs
-    SNIPPY_RUN.out.vcf.collect{meta, vcf -> vcf}.map{ vcf -> [[id:'snippy-core'], vcf]}.set{ ch_merge_vcf }
-    SNIPPY_RUN.out.aligned_fa.collect{meta, aligned_fa -> aligned_fa}.map{ aligned_fa -> [[id:'snippy-core'], aligned_fa]}.set{ ch_merge_aligned_fa }
+    SNIPPY_RUN_MODULE.out.vcf.collect{meta, vcf -> vcf}.map{ vcf -> [[id:'snippy-core'], vcf]}.set{ ch_merge_vcf }
+    SNIPPY_RUN_MODULE.out.aligned_fa.collect{meta, aligned_fa -> aligned_fa}.map{ aligned_fa -> [[id:'snippy-core'], aligned_fa]}.set{ ch_merge_aligned_fa }
     ch_merge_vcf.join( ch_merge_aligned_fa ).set{ ch_snippy_core }
-    SNIPPY_CORE(ch_snippy_core, file(params.reference), MASK)
-    ch_versions = ch_versions.mix(SNIPPY_CORE.out.versions.first())
+    SNIPPY_CORE_MODULE(ch_snippy_core, file(params.reference), MASK)
+    ch_versions = ch_versions.mix(SNIPPY_CORE_MODULE.out.versions.first())
 
     // Per-sample SNP distances
-    SNPDISTS(SNIPPY_CORE.out.clean_full_aln)
+    SNPDISTS(SNIPPY_CORE_MODULE.out.clean_full_aln)
     ch_versions = ch_versions.mix(SNPDISTS.out.versions)
 
     // Identify Recombination
     if (!params.skip_recombination) {
         // Run Gubbins
-        GUBBINS(SNIPPY_CORE.out.clean_full_aln)
+        GUBBINS(SNIPPY_CORE_MODULE.out.clean_full_aln)
         ch_versions = ch_versions.mix(GUBBINS.out.versions)
     }
 
@@ -72,11 +72,64 @@ workflow SNIPPY {
         if (!params.skip_recombination) {
             IQTREE(GUBBINS.out.masked_aln)
         } else {
-            IQTREE(SNIPPY_CORE.out.clean_full_aln)
+            IQTREE(SNIPPY_CORE_MODULE.out.clean_full_aln)
         }
         ch_versions = ch_versions.mix(IQTREE.out.versions)
     }
 
+    emit:
+    versions = ch_versions // channel: [ versions.yml ]
+}
+
+workflow SNIPPY_RUN {
+    take:
+    reads // channel: [ val(meta), [ reads ] ]
+
+    main:
+    ch_versions = Channel.empty()
+
+    // Run Snippy per-sample
+    SNIPPY_RUN_MODULE(reads, file(params.reference))
+    ch_versions = ch_versions.mix(SNIPPY_RUN_MODULE.out.versions.first())
+
+    emit:
+    versions = ch_versions // channel: [ versions.yml ]
+}
+
+workflow SNIPPY_CORE {
+    take:
+    snippy_run // channel: [ val(meta), [ vcf ], [ aligned_fa ] ]
+
+    main:
+    ch_versions = Channel.empty()
+
+    // Identify core SNPs
+    snippy_run.vcf.collect{meta, vcf -> vcf}.map{ vcf -> [[id:'snippy-core'], vcf]}.set{ ch_merge_vcf }
+    snippy_run.aligned_fa.collect{meta, aligned_fa -> aligned_fa}.map{ aligned_fa -> [[id:'snippy-core'], aligned_fa]}.set{ ch_merge_aligned_fa }
+    ch_merge_vcf.join( ch_merge_aligned_fa ).set{ ch_snippy_core }
+    SNIPPY_CORE_MODULE(ch_snippy_core, file(params.reference), MASK)
+    ch_versions = ch_versions.mix(SNIPPY_CORE_MODULE.out.versions.first())
+
+    // Per-sample SNP distances
+    SNPDISTS(SNIPPY_CORE_MODULE.out.clean_full_aln)
+    ch_versions = ch_versions.mix(SNPDISTS.out.versions)
+
+    // Identify Recombination
+    if (!params.skip_recombination) {
+        // Run Gubbins
+        GUBBINS(SNIPPY_CORE_MODULE.out.clean_full_aln)
+        ch_versions = ch_versions.mix(GUBBINS.out.versions)
+    }
+
+    // Create core-snp phylogeny
+    if (!params.skip_phylogeny) {
+        if (!params.skip_recombination) {
+            IQTREE(GUBBINS.out.masked_aln)
+        } else {
+            IQTREE(SNIPPY_CORE_MODULE.out.clean_full_aln)
+        }
+        ch_versions = ch_versions.mix(IQTREE.out.versions)
+    }
 
     emit:
     versions = ch_versions // channel: [ versions.yml ]