Skip to content

Commit

Permalink
fix bakta support for pangenome runs
Browse files Browse the repository at this point in the history
  • Loading branch information
rpetit3 committed Apr 16, 2024
1 parent cfd4c12 commit 60b8053
Show file tree
Hide file tree
Showing 6 changed files with 108 additions and 25 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Expand Up @@ -27,6 +27,7 @@ description: A full list of Bactopia releases and a description of the changes.

- missing schema for clean-yer-reads
- use `--infile-list` with `csvtk concat` to support 10k+ inputs
- `pangenome` when Bakta GFF (`*.gff3`) files are provided

### `Enhancements to OSS`

Expand Down
27 changes: 19 additions & 8 deletions bin/kraken-bracken-summary.py
Expand Up @@ -80,14 +80,25 @@ def braken_root_count(bracken_report):
'bracken_secondary_species_abundance',
'bracken_unclassified_abundance'
]
results = [
args.prefix,
bracken['name'].iloc[0] if bracken['fraction_total_reads'].iloc[0] >= 0.01 else "No primary abundance > 1%",
"{0:.5f}".format(bracken['fraction_total_reads'].iloc[0]) if bracken['fraction_total_reads'].iloc[0] >= 0.01 else "",
bracken['name'].iloc[1] if bracken['fraction_total_reads'].iloc[1] >= 0.01 else "No secondary abundance > 1%",
"{0:.5f}".format(bracken['fraction_total_reads'].iloc[1]) if bracken['fraction_total_reads'].iloc[1] >= 0.01 else "",
"{0:.5f}".format(unclassified_count / total_count)
]
if len(bracken) > 1:
results = [
args.prefix,
bracken['name'].iloc[0] if bracken['fraction_total_reads'].iloc[0] >= 0.01 else "No primary abundance > 1%",
"{0:.5f}".format(bracken['fraction_total_reads'].iloc[0]) if bracken['fraction_total_reads'].iloc[0] >= 0.01 else "",
bracken['name'].iloc[1] if bracken['fraction_total_reads'].iloc[1] >= 0.01 else "No secondary abundance > 1%",
"{0:.5f}".format(bracken['fraction_total_reads'].iloc[1]) if bracken['fraction_total_reads'].iloc[1] >= 0.01 else "",
"{0:.5f}".format(unclassified_count / total_count)
]
else:
results = [
args.prefix,
bracken['name'].iloc[0] if bracken['fraction_total_reads'].iloc[0] >= 0.01 else "No primary abundance > 1%",
"{0:.5f}".format(bracken['fraction_total_reads'].iloc[0]) if bracken['fraction_total_reads'].iloc[0] >= 0.01 else "",
"No secondary abundance > 1%",
"",
"{0:.5f}".format(unclassified_count / total_count)
]

with open("{0}.bracken.tsv".format(args.prefix), "wt") as fh_out:
fh_out.write("{}\n".format('\t'.join(cols)))
fh_out.write("{}\n".format('\t'.join(results)))
Expand Down
7 changes: 5 additions & 2 deletions modules/nf-core/panaroo/run/main.nf
Expand Up @@ -33,13 +33,16 @@ process PANAROO_RUN {
"""
mkdir gff
cp -L gff-tmp/* gff/
find gff/ -name "*.gff.gz" | xargs gunzip
find gff/ -name "*.gz" | xargs gunzip
# Make FOFN of gff (Prokka) and gff3 (Bakta) files
find gff/ -name "*.gff" -or -name "*.gff3" > gff-fofn.txt
panaroo \\
$options.args \\
-t $task.cpus \\
-o results \\
-i gff/*.gff
-i gff-fofn.txt
# Cleanup
find . -name "*.fas" | xargs -I {} -P $task.cpus -n 1 gzip {}
Expand Down
14 changes: 11 additions & 3 deletions modules/nf-core/pirate/main.nf
Expand Up @@ -17,7 +17,7 @@ process PIRATE {
'quay.io/biocontainers/pirate:1.0.5--hdfd78af_0' }"

input:
tuple val(meta), path(gff)
tuple val(meta), path(gff, stageAs: 'gff-tmp/*')

output:
tuple val(meta), path("results/*") , emit: results
Expand All @@ -30,13 +30,21 @@ process PIRATE {
script:
prefix = options.suffix ? "${options.suffix}" : "${meta.id}"
"""
find . -name "*.gff.gz" | sed 's/.gz//' | xargs -I {} bash -c 'gzip -cdf {}.gz > {}'
mkdir gff
cp -L gff-tmp/* gff/
find gff/ -name "*.gff.gz" | xargs -r gunzip
# PIRATE only supports .gff extension, will need to adjust for gff3 (Bakta) files
# https://github.com/SionBayliss/PIRATE/blob/master/scripts/run_PIRATE.pl#L153
# note for later: "xargs -r" will not run if no files are found
find gff/ -name "*.gff3.gz" | xargs -r gunzip
for file in gff/*.gff3; do mv "\$file" "\${file%.gff3}.gff"; done
PIRATE \\
$options.args \\
--align \\
--threads $task.cpus \\
--input ./ \\
--input ./gff/ \\
--output results/
PIRATE_to_roary.pl -i results/PIRATE.*.tsv -o results/gene_presence_absence.csv
find . -name "*.fasta" | xargs -I {} -P $task.cpus -n 1 gzip {}
Expand Down
9 changes: 8 additions & 1 deletion modules/nf-core/roary/main.nf
Expand Up @@ -32,7 +32,14 @@ process ROARY {
"""
mkdir gff
cp -L gff-tmp/* gff/
find gff/ -name "*.gff.gz" | xargs gunzip
find gff/ -name "*.gff.gz" | xargs -r gunzip
# Roary only supports .gff extension, will need to adjust for gff3 (Bakta) files
# https://github.com/sanger-pathogens/Roary/blob/master/lib/Bio/Roary/PrepareInputFiles.pm#L82
# note for later: "xargs -r" will not run if no files are found
find gff/ -name "*.gff3.gz" | xargs -r gunzip
for file in gff/*.gff3; do mv "\$file" "\${file%.gff3}.gff"; done
roary \\
$options.args \\
-p $task.cpus \\
Expand Down
75 changes: 64 additions & 11 deletions subworkflows/local/snippy/main.nf
Expand Up @@ -32,8 +32,8 @@ core_opts.args = [
core_opts.publish_to_base = [".full.aln.gz"]
core_opts.suffix = "core-snp"

include { SNIPPY_RUN } from '../../../modules/nf-core/snippy/run/main' addParams( options: snippy_opts )
include { SNIPPY_CORE } from '../../../modules/nf-core/snippy/core/main' addParams( options: core_opts )
include { SNIPPY_RUN as SNIPPY_RUN_MODULE } from '../../../modules/nf-core/snippy/run/main' addParams( options: snippy_opts )
include { SNIPPY_CORE as SNIPPY_CORE_MODULE } from '../../../modules/nf-core/snippy/core/main' addParams( options: core_opts )
include { GUBBINS } from '../gubbins/main' addParams( options: [suffix: 'core-snp', ignore: [".aln.gz"]] )
include { IQTREE } from '../iqtree/main' addParams( options: [suffix: 'core-snp', ignore: [".aln.gz"]] )
include { SNPDISTS } from '../../../modules/nf-core/snpdists/main' addParams( options: [suffix: 'core-snp.distance'] )
Expand All @@ -46,24 +46,24 @@ workflow SNIPPY {
ch_versions = Channel.empty()

// Run Snippy per-sample
SNIPPY_RUN(reads, file(params.reference))
ch_versions = ch_versions.mix(SNIPPY_RUN.out.versions.first())
SNIPPY_RUN_MODULE(reads, file(params.reference))
ch_versions = ch_versions.mix(SNIPPY_RUN_MODULE.out.versions.first())

// Identify core SNPs
SNIPPY_RUN.out.vcf.collect{meta, vcf -> vcf}.map{ vcf -> [[id:'snippy-core'], vcf]}.set{ ch_merge_vcf }
SNIPPY_RUN.out.aligned_fa.collect{meta, aligned_fa -> aligned_fa}.map{ aligned_fa -> [[id:'snippy-core'], aligned_fa]}.set{ ch_merge_aligned_fa }
SNIPPY_RUN_MODULE.out.vcf.collect{meta, vcf -> vcf}.map{ vcf -> [[id:'snippy-core'], vcf]}.set{ ch_merge_vcf }
SNIPPY_RUN_MODULE.out.aligned_fa.collect{meta, aligned_fa -> aligned_fa}.map{ aligned_fa -> [[id:'snippy-core'], aligned_fa]}.set{ ch_merge_aligned_fa }
ch_merge_vcf.join( ch_merge_aligned_fa ).set{ ch_snippy_core }
SNIPPY_CORE(ch_snippy_core, file(params.reference), MASK)
ch_versions = ch_versions.mix(SNIPPY_CORE.out.versions.first())
SNIPPY_CORE_MODULE(ch_snippy_core, file(params.reference), MASK)
ch_versions = ch_versions.mix(SNIPPY_CORE_MODULE.out.versions.first())

// Per-sample SNP distances
SNPDISTS(SNIPPY_CORE.out.clean_full_aln)
SNPDISTS(SNIPPY_CORE_MODULE.out.clean_full_aln)
ch_versions = ch_versions.mix(SNPDISTS.out.versions)

// Identify Recombination
if (!params.skip_recombination) {
// Run Gubbins
GUBBINS(SNIPPY_CORE.out.clean_full_aln)
GUBBINS(SNIPPY_CORE_MODULE.out.clean_full_aln)
ch_versions = ch_versions.mix(GUBBINS.out.versions)
}

Expand All @@ -72,11 +72,64 @@ workflow SNIPPY {
if (!params.skip_recombination) {
IQTREE(GUBBINS.out.masked_aln)
} else {
IQTREE(SNIPPY_CORE.out.clean_full_aln)
IQTREE(SNIPPY_CORE_MODULE.out.clean_full_aln)
}
ch_versions = ch_versions.mix(IQTREE.out.versions)
}

emit:
versions = ch_versions // channel: [ versions.yml ]
}

workflow SNIPPY_RUN {
take:
reads // channel: [ val(meta), [ reads ] ]

main:
ch_versions = Channel.empty()

// Run Snippy per-sample
SNIPPY_RUN_MODULE(reads, file(params.reference))
ch_versions = ch_versions.mix(SNIPPY_RUN_MODULE.out.versions.first())

emit:
versions = ch_versions // channel: [ versions.yml ]
}

workflow SNIPPY_CORE {
take:
snippy_run // channel: [ val(meta), [ vcf ], [ aligned_fa ] ]

main:
ch_versions = Channel.empty()

// Identify core SNPs
snippy_run.vcf.collect{meta, vcf -> vcf}.map{ vcf -> [[id:'snippy-core'], vcf]}.set{ ch_merge_vcf }
snippy_run.aligned_fa.collect{meta, aligned_fa -> aligned_fa}.map{ aligned_fa -> [[id:'snippy-core'], aligned_fa]}.set{ ch_merge_aligned_fa }
ch_merge_vcf.join( ch_merge_aligned_fa ).set{ ch_snippy_core }
SNIPPY_CORE_MODULE(ch_snippy_core, file(params.reference), MASK)
ch_versions = ch_versions.mix(SNIPPY_CORE_MODULE.out.versions.first())

// Per-sample SNP distances
SNPDISTS(SNIPPY_CORE_MODULE.out.clean_full_aln)
ch_versions = ch_versions.mix(SNPDISTS.out.versions)

// Identify Recombination
if (!params.skip_recombination) {
// Run Gubbins
GUBBINS(SNIPPY_CORE_MODULE.out.clean_full_aln)
ch_versions = ch_versions.mix(GUBBINS.out.versions)
}

// Create core-snp phylogeny
if (!params.skip_phylogeny) {
if (!params.skip_recombination) {
IQTREE(GUBBINS.out.masked_aln)
} else {
IQTREE(SNIPPY_CORE_MODULE.out.clean_full_aln)
}
ch_versions = ch_versions.mix(IQTREE.out.versions)
}

emit:
versions = ch_versions // channel: [ versions.yml ]
Expand Down

0 comments on commit 60b8053

Please sign in to comment.