Merge v3.1.0

ctmrbio · May 7, 2021 · 79211b2 · 79211b2
2 parents 36d0bad + 4e64f42
commit 79211b2
Show file tree

Hide file tree

Showing 33 changed files with 491 additions and 293 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,25 @@ Changes should fall into one of the following categories:
 - `Removed`, for now removed features.
 - `Fixed`, for any bug fixes.
 
+## [3.1.0] - In development
+### Added
+- New profile for use on CTMR Gandalf, `ctmr_gandalf`.
+- Kraken2 added for taxonomic profiling, replaces sendsketch as contamination
+  screen.
+- Docker profile
+
+### Changed
+- Renamed profile for CTMR-NAS to `ctmr_nas` to better conform to incoming
+  profiles.
+- Moved contig renaming script to shovill process from contamination screen
+  process.
+- Now publishes `shovill.log` in the output directory by default.
+- Limited the search scope for MultiQC to minimize risk of process timeouts on
+  HPC systems.
+
+### Removed
+- Sendsketch replaced with Kraken2
+
 ## [3.0.0] - 2021-01-15
 ### Added
 - A script was implemented that renames the headers of fasta-files.
@@ -26,10 +45,12 @@ Changes should fall into one of the following categories:
 	with assembled genomes from prokka (default = only from prokka)
 - Contamination screen is now parsed and used as taxonomic info for prokka.
 - MultiQC now incorporates the trimmed QC reports as well.
-- User now receives a file with compiled assemblystats, generated via BBmap's statswrapper. 
+- User now receives a file with compiled assemblystats, generated via BBmap's
+  statswrapper. 
 
 ### Changed
-- QC and trimming now done by FastP, not BBduk and FastQC, resulting in a much faster runtime.
+- QC and trimming now done by FastP, not BBduk and FastQC, resulting in a much
+  faster runtime.
 - Contamination screening now done by Sendsketch.
 - Contamination now ascertained from assemblies, not trimmed reads.
 - BACTpipe now updated to DSL2-format.

diff --git a/README.md b/README.md
@@ -6,13 +6,13 @@ paired input reads, tries to assess if the sample contains mixed or pure
 isolates, performs *de novo* assembly, and annotates the assembled genome.
 BACTpipe uses Nextflow as a workflow manager. 
 
-![BACTpipe flowchart](./docs/source/img/BACTpipe_3_flowchart.png)
+![BACTpipe flowchart](./docs/source/img/BACTpipe_3-1_flow.png)
 
 ## Documentation
 Complete documentation is available at https://bactpipe.readthedocs.io. 
 
 ## Quick-start
-You need to have [Nextflow](https://www.nextflow.io), more specifically [v21.01.0-edge](https://github.com/nextflow-io/nextflow/releases/download/v21.01.0-edge/nextflow-21.01.0-edge-all) and [conda](https://conda.io/docs/) installed.
+You need to have [Nextflow](https://www.nextflow.io), more specifically [v21.04.0](https://github.com/nextflow-io/nextflow/releases/download/v21.04.0/nextflow-21.04.0-all) and [conda](https://conda.io/docs/) installed.
 When running with the default local profile, all other software will be installed into local environments by conda throughout the process.
 
 ## Run BACTpipe

diff --git a/bactpipe.nf b/bactpipe.nf
@@ -3,20 +3,13 @@
 
 nextflow.enable.dsl = 2
 
-//================================================================================
-// Constants
-//================================================================================
-
-BACTPIPE_VERSION = '3.0'
-
 //================================================================================
 // Log info
 //================================================================================
 
-
 log.info "".center(60, "=")
 log.info "BACTpipe".center(60)
-log.info "Version ${BACTPIPE_VERSION}".center(60)
+log.info "Version $workflow.manifest.version".center(60)
 log.info "Bacterial whole genome analysis pipeline".center(60)
 log.info "https://bactpipe.readthedocs.io".center(60)
 log.info "".center(60, "=")
@@ -27,24 +20,23 @@ params.help = false
 // Include modules and (soft) override module-level parameters
 //================================================================================
 
-
-include { ASSEMBLY_STATS } from "./modules/assembly_stats/assembly_stats.nf"
 include { FASTP } from "./modules/fastp/fastp.nf"
-include { MULTIQC } from "./modules/multiqc/multiqc.nf"
-include { PROKKA } from "./modules/prokka/prokka.nf"
-include { SCREEN_FOR_CONTAMINANTS } from "./modules/screen_for_contaminants/screen_for_contaminants.nf"
+include { CLASSIFY_TAXONOMY } from "./modules/classify_taxonomy/classify_taxonomy.nf"
 include { SHOVILL } from "./modules/shovill/shovill.nf"
+include { ASSEMBLY_STATS } from "./modules/assembly_stats/assembly_stats.nf"
+include { PROKKA } from "./modules/prokka/prokka.nf"
+include { MULTIQC } from "./modules/multiqc/multiqc.nf"
 include { printHelp; printSettings } from "./modules/utils/utils.nf"
 
-
 //================================================================================
 // Pre-flight checks and info
 //================================================================================
 
 if (workflow['profile'] in params.profiles_that_require_project) {
     if (!params.project) {
         log.error "BACTpipe requires that you set the 'project' parameter when running the ${workflow['profile']} profile.\n".center(60) +
-                "Specify --project <project_name> on the command line, or tuple it in a custom configuration file.".center(60)
+                "Specify --project <project_name> on the command line, or add it to a custom configuration file.".center(60) + 
+                "Refer to the official docs for more information."
         exit(1)
     }
 }
@@ -54,51 +46,57 @@ if (params.help) {
     exit(0)
 }
 
-
 printSettings()
 
+if ( ! params.kraken2_db ) {
+	log.warn "No Kraken2 database specified. Use --kraken2_db /path/to/db to use Kraken2 to classify samples and determine gram stain."
+}
+
+if ( ! params.reads ) {
+    log.error "No reads specified. It is required to specify --reads 'path/to/*_{1,2}.fastq.gz' (note the single quotes)"
+    exit(1)
+}
 
 //================================================================================
 // Prepare channels
 //================================================================================
 
-
 fastp_input = Channel.fromFilePairs(params.reads)
 
 fastp_input
         .ifEmpty {
             log.error "Cannot find any reads matching: '${params.reads}'\n\n" +
                     "Did you specify --reads 'path/to/*_{1,2}.fastq.gz'? (note the single quotes)\n" +
-                    "Specify --help for a summary of available commands."
+                    "Specify --help for a summary of available commands. " +
+                    "Refer to the official docs for more information."
             printHelp()
             exit(1)
         }
 
-
 //================================================================================
 // Main workflow
 //================================================================================
 
 
 workflow {
-
     FASTP(fastp_input)
-    SHOVILL(FASTP.out.shovill_input)
-    ASSEMBLY_STATS(SHOVILL.out[0])
-    SCREEN_FOR_CONTAMINANTS(SHOVILL.out[0])
-    PROKKA(SCREEN_FOR_CONTAMINANTS.out[0], SCREEN_FOR_CONTAMINANTS.out[1])
-    MULTIQC(FASTP.out.fastp_reports.collect(),
-            PROKKA.out.collect()
+    CLASSIFY_TAXONOMY(FASTP.out.fastq)
+    SHOVILL(FASTP.out.fastq)
+    ASSEMBLY_STATS(SHOVILL.out.contigs)
+    PROKKA(
+        SHOVILL.out.contigs,
+        CLASSIFY_TAXONOMY.out.classification
+    )
+    MULTIQC(
+        FASTP.out.fastp_reports.collect(),
+        PROKKA.out.collect()
     )
-
-
 }
 
 //================================================================================
 // Workflow onComplete action
 //================================================================================
 
-
 workflow.onComplete {
     log.info "".center(60, "=")
     log.info "BACTpipe workflow completed without errors".center(60)

diff --git a/bin/classify_kreport.py b/bin/classify_kreport.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+from sys import argv, exit, stderr
+from collections import defaultdict
+import argparse
+
+"""Identify top ranked genus in output report from Kraken2.
+
+Script was developed for internal use in the Nextflow pipeline BACTpipe.
+"""
+
+def parse_args():
+    """Parse command line arguments.
+
+    Prints the usage text and exits with status 1 when the script is started
+    without any arguments.
+
+    Returns:
+        argparse.Namespace with ``kreport`` (str), ``min_proportion``
+        (float, default 50.0) and ``gramstains`` (str, or None if not given).
+    """
+    # The descriptive string near the top of this file comes after the import
+    # statements, so Python does not treat it as the module docstring and
+    # __doc__ is None. Fall back to explicit text so --help describes the tool.
+    description = __doc__ or (
+        "Identify top ranked genus in output report from Kraken2. "
+        "Script was developed for internal use in the Nextflow pipeline BACTpipe."
+    )
+    parser = argparse.ArgumentParser(description=description)
+    parser.add_argument("-r", "--kreport",
+        required=True,
+        help="Path to Kraken2 report output file (txt/kreport)")
+    parser.add_argument("-m", "--min-proportion",
+        type=float,
+        default=50.00,
+        help="Minimum proportion on species level [%(default)s].")
+    parser.add_argument("-g", "--gramstains",
+        help="Path to text file containing gram staining classifications in "
+            "two-column tab separated format (Genus<TAB>Stain)")
+
+    # Bare invocation: show the help text instead of argparse's terse error.
+    if len(argv) < 2:
+        parser.print_help()
+        exit(1)
+
+    args = parser.parse_args()
+
+    return args
+
+
+def parse_gramstains(gramstains):
+    """Load the genus -> gram stain lookup table.
+
+    Args:
+        gramstains: Path to a two-column tab separated text file
+            (Genus<TAB>Stain), or None when no file was given.
+
+    Returns:
+        defaultdict mapping genus to gram stain; subscripting a genus that is
+        not in the table yields "Unknown".
+    """
+    gramstain_db = defaultdict(lambda: "Unknown")
+    # Check for a missing path explicitly instead of relying on open(None)
+    # raising TypeError, so unrelated TypeErrors are not swallowed.
+    if gramstains is None:
+        print("WARNING: No gramstain database specified, gramstain set to Unknown", file=stderr)
+        return gramstain_db
+
+    with open(gramstains) as f:
+        for line_no, line in enumerate(f, start=1):
+            fields = line.rstrip().split("\t")
+            if len(fields) != 2:
+                # Blank or malformed rows would otherwise abort the run with an
+                # unpacking ValueError; report non-blank ones and carry on.
+                if line.strip():
+                    print(f"WARNING: Could not parse gramstain line {line_no}, ignoring...", file=stderr)
+                continue
+            genus, gramstain = fields
+            gramstain_db[genus] = gramstain
+    return gramstain_db
+
+
+def parse_kreport(kreport_file):
+    """Yield the species-level rows of a Kraken2 report.
+
+    Rows that do not split into exactly six tab separated columns, or whose
+    first column cannot be converted to float, are reported on stderr and
+    skipped.
+
+    Args:
+        kreport_file: Path to the Kraken2 report file.
+
+    Yields:
+        (clade_fraction, [genus, species]) for every row whose rank column is
+        exactly "S". The name is cut down to its first two whitespace
+        separated words; a single-word name gets "sp." as its second element.
+    """
+    with open(kreport_file) as f:
+        for line_no, line in enumerate(f, start=1):
+            try:
+                (clade_fraction, clade_fragments,
+                    taxon_fragments, rank, taxid, name) = line.strip().split("\t")
+            except ValueError as e:
+                print(f"WARNING: Could not parse line {line_no}, ignoring...", file=stderr)
+                continue
+
+            try:
+                clade_fraction = float(clade_fraction)
+            except ValueError as e:
+                print(f"WARNING: Could not interpret {clade_fraction} on row {line_no} as float, ignoring...", file=stderr)
+                continue
+
+            if rank == "S":
+                clean_name = name.strip().split()[:2]
+                if len(clean_name) == 1:
+                    # classify() unpacks every name as (genus, species); pad
+                    # single-word names so they cannot crash that unpacking.
+                    clean_name.append("sp.")
+                yield clade_fraction, clean_name
+
+
+def classify(detected_species, min_proportion, gramstain_db):
+    """Reduce the detected species to one genus/species/gram stain call.
+
+    Args:
+        detected_species: Iterable of (proportion, (genus, species)) items.
+        min_proportion: Only items with a proportion strictly greater than
+            this value are taken into account.
+        gramstain_db: Mapping from genus to gram stain.
+
+    Returns:
+        Tuple (genus, species, gramstain). Genus is "Unknown" when nothing
+        passes the threshold and "Mixed" when several genera do; species is
+        "unknown" or "spp." in the corresponding cases; gramstain is
+        "Unknown" when the genus is not a key of ``gramstain_db``.
+    """
+    above_threshold = [
+        entry for entry in detected_species if entry[0] > min_proportion
+    ]
+    kept_names = [name for _, name in above_threshold]
+
+    genus_pool = {genus_part for genus_part, _ in kept_names}
+    if len(genus_pool) == 1:
+        output_genus = genus_pool.pop()
+    elif len(genus_pool) > 1:
+        output_genus = "Mixed"
+    else:
+        output_genus = "Unknown"
+
+    species_pool = {species_part for _, species_part in kept_names}
+    output_species = "unknown"
+    if len(species_pool) > 1 and output_genus:
+        output_species = "spp."
+    elif len(species_pool) == 1:
+        output_species = species_pool.pop()
+
+    return output_genus, output_species, gramstain_db.get(output_genus, "Unknown")
+
+
+if __name__ == "__main__":
+    # Script entry point: read the CLI options, load both inputs, then print
+    # genus, species and gram stain as one tab separated line on stdout.
+    cli = parse_args()
+
+    stain_lookup = parse_gramstains(cli.gramstains)
+    species_rows = [row for row in parse_kreport(cli.kreport)]
+    top_genus, top_species, stain = classify(species_rows, cli.min_proportion, stain_lookup)
+
+    print("\t".join(str(field) for field in (top_genus, top_species, stain)))
+
diff --git a/bin/sendsketch_to_prokka.py b/bin/sendsketch_to_prokka.py
diff --git a/conf/ctmr_gandalf.config b/conf/ctmr_gandalf.config
@@ -0,0 +1,60 @@
+// vim: syntax=groovy expandtab
+// BACTpipe Nextflow configuration file for use on CTMR Gandalf
+
+// Defaults for the values interpolated into the Slurm options below.
+params {
+    project = 'bio'
+    partition = 'ctmr'
+}
+
+process {
+    errorStrategy = 'terminate'
+    executor = 'slurm'
+    // Builds the extra scheduler options from params.partition and
+    // params.project. A space is inserted before any user supplied
+    // params.clusterOptions so they cannot fuse with the value given to -A.
+    clusterOptions = {
+        " --partition ${params.partition} -A ${params.project}" + (params.clusterOptions ? " ${params.clusterOptions}" : '')
+    }
+    scratch = false
+    stageInMode = 'copy'
+    stageOutMode = 'copy'
+
+    // Per-process resources and conda environments.
+    withName: FASTP {
+        cpus = 4
+        time = 20.m
+        conda = 'bioconda::fastp'
+    }
+
+    withName: SHOVILL {
+        cpus = 10
+        time = 2.h
+        conda = 'bioconda::shovill bioconda::bwa=0.7.16 python=3'
+    }
+
+    withName: CLASSIFY_TAXONOMY {
+        cpus = 10
+        time = 30.m
+        conda = 'bioconda::kraken2'
+    }
+
+    withName: ASSEMBLY_STATS {
+        cpus = 1
+        time = 20.m
+        conda = 'bioconda::bbmap'
+    }
+
+    withName: PROKKA {
+        cpus = 4
+        time = 2.h
+        conda = 'bioconda::prokka'
+    }
+
+    withName: MULTIQC {
+        cpus = 1
+        time = 10.m
+        conda = 'bioconda::multiqc'
+    }
+}
diff --git a/conf/ctmrnas.config → conf/ctmr_nas.config b/conf/ctmrnas.config → conf/ctmr_nas.config
@@ -20,10 +20,10 @@ process {
         conda = 'bioconda::fastp=0.20.0'
     }
     withName:
-    SCREEN_FOR_CONTAMINANTS {
+    CLASSIFY_TAXONOMY {
         cpus = 8
         time = 15.m
-        conda = 'bioconda::bbmap=38.76'
+        conda = 'bioconda::kraken2'
     }
     withName:
     SHOVILL {