diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e2c9734..9a3b02c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -34,11 +34,13 @@ docker-run: NF_IGNORE_PROCESSES: "medakaVariant,medakaVariantHdf,makePerSampleReports" - if: $MATRIX_NAME == "reference-based" variables: - NF_WORKFLOW_OPTS: "--fastq test_data/fastq --reference_based_assembly --reference test_data/ref/reference.subseq.fa.gz --threads 4 --chunk_size 100000" + NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 8GB \ + --fastq test_data/fastq --reference_based_assembly --reference test_data/ref/reference.subseq.fa.gz --threads 4 --chunk_size 100000" NF_IGNORE_PROCESSES: "deNovo,assemblyStats,runProkka,makePerSampleReports" - if: $MATRIX_NAME == "check-model" variables: - NF_WORKFLOW_OPTS: "--fastq test_data/fastq --reference_based_assembly --reference test_data/ref/reference.subseq.fa.gz --threads 4 --chunk_size 100000 \ + NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 8GB \ + --fastq test_data/fastq --reference_based_assembly --reference test_data/ref/reference.subseq.fa.gz --threads 4 --chunk_size 100000 \ --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v3.5.2" NF_IGNORE_PROCESSES: "deNovo,assemblyStats,runProkka,makePerSampleReports" - if: $MATRIX_NAME == "amr" @@ -48,13 +50,15 @@ docker-run: NF_IGNORE_PROCESSES: "runProkka,medakaVariant,medakaVariantHdf,makePerSampleReports" - if: $MATRIX_NAME == "sample-sheet" variables: - NF_WORKFLOW_OPTS: "--fastq test_data/fastq --reference_based_assembly --reference test_data/ref/reference.subseq.fa.gz --threads 4 --chunk_size 100000 \ + NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 16GB \ + --fastq test_data/fastq --reference_based_assembly --reference test_data/ref/reference.subseq.fa.gz --threads 4 --chunk_size 100000 \ --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v3.5.2 --sample_sheet test_data/sample_sheet.csv" NF_IGNORE_PROCESSES: "deNovo,assemblyStats,runProkka,makePerSampleReports" - if: $MATRIX_NAME == "barcode04" # isolates barcode04 only has 4 reads and 
flye will fail due to low coverage variables: - NF_WORKFLOW_OPTS: "--fastq s3://ont-exd-int-s3-euwst1-epi2me-labs/wf-bacterial-genomes/test_data/isolates_fastq/barcode04 \ + NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 16GB \ + --fastq s3://ont-exd-int-s3-euwst1-epi2me-labs/wf-bacterial-genomes/test_data/isolates_fastq/barcode04 \ --threads 3 --chunk_size 100000" NF_IGNORE_PROCESSES: "alignReads,readStats,coverStats,splitRegions,\ medakaConsensus,medakaNetwork,medakaVariant,medakaVariantHdf,makeReport,runProkka,makePerSampleReports" @@ -65,7 +69,8 @@ docker-run: NF_IGNORE_PROCESSES: "runProkka,medakaVariant,medakaVariantHdf" - if: $MATRIX_NAME == "reference-iso" variables: - NF_WORKFLOW_OPTS: "--fastq test_data/fastq --reference_based_assembly --reference test_data/ref/reference.subseq.fa.gz --threads 4 --chunk_size 100000 --isolates" + NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 8GB \ + --fastq test_data/fastq --reference_based_assembly --reference test_data/ref/reference.subseq.fa.gz --threads 4 --chunk_size 100000 --isolates" NF_IGNORE_PROCESSES: "deNovo,runProkka" # reminder: update AUX_IMAGE_TAG if the aux container package versions are changed release-prokka: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6583953..8134142 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,22 +1,14 @@ repos: - repo: local hooks: - - id: docs_schema - name: docs_schema - entry: parse_docs -p docs -e .md -s intro links -oj nextflow_schema.json - language: python - always_run: true - pass_filenames: false - additional_dependencies: - - epi2melabs - id: docs_readme name: docs_readme - entry: parse_docs -p docs -e .md -s header intro quickstart links -ot README.md + entry: parse_docs -p docs -e .md -s 01_brief_description 02_introduction 03_compute_requirements 04_install_and_run 05_related_protocols 06_inputs 07_outputs 08_pipeline_overview 09_troubleshooting 10_FAQ 11_other -ot README.md -od output_definition.json -ns 
nextflow_schema.json language: python always_run: true pass_filenames: false additional_dependencies: - - epi2melabs + - epi2melabs>=0.0.50 - id: build_models name: build_models entry: datamodel-codegen --strict-nullable --base-class workflow_glue.results_schema_helpers.BaseModel --use-schema-description --disable-timestamp --input results_schema.yml --input-file-type openapi --output bin/workflow_glue/results_schema.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 88a3f44..1460909 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v1.0.0] +### Added +- Cloud support for the workflow within the EPI2ME Application. +### Changed +- Documentation + ## [v0.4.0] ### Added - MacOS ARM64 support diff --git a/README.md b/README.md index 9db022a..ba6c66a 100644 --- a/README.md +++ b/README.md @@ -1,66 +1,276 @@ -# Bacterial genomes Workflow +# Bacterial assembly and annotation workflow -This repository contains a [nextflow](https://www.nextflow.io/) workflow -for analysing bacterial genomes. +Assembly, variant calling, and annotation of bacterial genomes. -# Introduction +## Introduction -If no reference is included assembly will be -completed using [flye](https://github.com/fenderglass/Flye) and polished with -[medaka](https://www.github.com/nanoporetech/medaka). If a reference is provided -alignment will be done with [mini_align](https://github.com/nanoporetech/pomoxis/blob/master/scripts/mini_align) -and variant called using medaka. The workflow has a few optional extras. It can run -[prokka](https://github.com/tseemann/prokka) to annotate the resulting -consensus sequence or [ResFinder](https://bitbucket.org/genomicepidemiology/resfinder/src/master/) to check it against a database of antimicrobial resistance genes. 
+ +This workflow is primarily used to assemble genomes from bacterial reads and provide information on features of interest within those assemblies through annotations. +The workflow can provide additional information about the assembly, such as antimicrobial resistance (AMR) analysis and sequence typing through an optional `--isolates` mode. +In brief, this workflow will perform the following: -## Quickstart ++ De novo (or reference-based) assembly of bacterial genomes ++ Annotation of regions of interest within the assembly ++ Species identification and sequence typing (`--isolates` mode only) ++ Identify genes and SNVs associated with AMR (`--isolates` mode only) -The workflow uses [nextflow](https://www.nextflow.io/) to manage compute and -software resources, as such nextflow will need to be installed before attempting -to run the workflow. -The workflow can currently be run using either -[Docker](https://www.docker.com/products/docker-desktop) (default) or -[Singularity](https://sylabs.io/singularity/) (`-profile singularity`) to provide isolation of + +## Compute requirements + +Recommended requirements: + ++ CPUs = 16 ++ Memory = 32GB + +Minimum requirements: + ++ CPUs = 4 ++ Memory = 8GB + +Approximate run time: 20-40 minutes per sample with ~50x coverage using minimum requirements + +ARM processor support: True + + + + +## Install and run + + + +These are instructions to install and run the workflow on command line. You can also access the workflow via the [EPI2ME application](https://labs.epi2me.io/downloads/). + +The workflow uses [Nextflow](https://www.nextflow.io/) to manage compute and software resources, therefore nextflow will need to be installed before attempting to run the workflow. + +The workflow can currently be run using either [Docker](https://www.docker.com/products/docker-desktop) or +[Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) to provide isolation of the required software. 
Both methods are automated out-of-the-box provided -either Docker or Singularity is installed. +either docker or singularity is installed. This is controlled by the [`-profile`](https://www.nextflow.io/docs/latest/config.html#config-profiles) parameter as exemplified below. It is not required to clone or download the git repository in order to run the workflow. -For more information on running EPI2ME Labs workflows [visit out website](https://labs.epi2me.io/wfindex). +More information on running EPI2ME workflows can be found on our [website](https://labs.epi2me.io/wfindex). -**Workflow options** - -To obtain the workflow, having installed `nextflow`, users can run: +The following command can be used to obtain the workflow. This will pull the repository into the assets folder of nextflow and provide a list of all parameters available for the workflow as well as an example command: ``` -nextflow run epi2me-labs/wf-bacterial-genomes --help +nextflow run epi2me-labs/wf-bacterial-genomes --help +``` +A demo dataset is provided for testing of the workflow. It can be downloaded using: +``` +wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-bacterial-genomes/wf-bacterial-genomes-demo.tar.gz +tar -xzvf wf-bacterial-genomes-demo.tar.gz +``` +The workflow can be run with the demo data using: +``` +nextflow run epi2me-labs/wf-bacterial-genomes \ + --fastq wf-bacterial-genomes-demo/isolates_fastq \ + --isolates \ + --reference_based_assembly \ + --reference wf-bacterial-genomes-demo/ref/ref.fasta.gz \ + --sample_sheet wf-bacterial-genomes-demo/isolates_sample_sheet.csv \ + -profile standard ``` +For further information about running a workflow on the cmd line see https://labs.epi2me.io/wfquickstart/ + + + +## Related protocols + + + +This workflow is designed to take input sequences that have been produced from [Oxford Nanopore Technologies](https://nanoporetech.com/) devices. + +Optimal DNA extraction will be dependent on the gram status of the organism. 
Some useful protocols are provided below: ++ [Gram-positive bacteria](https://community.nanoporetech.com/extraction_method_groups/gram-positive-bacterial-gnda) ++ [Gram-negative bacteria](https://community.nanoporetech.com/extraction_methods/gram-ve-dna) + + +Find more related protocols in the [Nanopore community](https://community.nanoporetech.com/docs/). + + + +## Inputs + +### Input Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| fastq | string | FASTQ files to use in the analysis. | This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | +| reference_based_assembly | boolean | Enable reference guided assembly instead of de-novo assembly. | By default de-novo assembly will be performed with Flye. Enable this to instead perform a reference-based consensus. A reference must be provided. | False | +| reference | string | Reference sequence FASTA file. | The reference sequence is used when performing reference-based assembly. | | +| basecaller_cfg | string | Name of the model that was used to basecall signal data, used to select an appropriate Medaka model. | The basecaller configuration is used to automatically select the appropriate Medaka model. The automatic selection can be overridden with the `medaka_variant_model` and `medaka_consensus_model` parameters. The model list only shows models that are compatible with this workflow. 
| dna_r10.4.1_e8.2_400bps_sup@v4.2.0 | +| analyse_unclassified | boolean | Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory. | If selected and if the input is a multiplex directory the workflow will also process the unclassified directory. | False | + + +### Sample Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| sample_sheet | string | A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. | The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed. A `type` column is required for certain workflows and should have the following values; `test_sample`, `positive_control`, `negative_control`, `no_template_control`. | | +| sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | | + + +### Isolate options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| isolates | boolean | Run the Isolates pipeline on the assembly results if set to True. | Isolates mode adds further analysis options to the workflow such as multi-locus sequence typing and antimicrobial resistance calling, as well as producing single reports for each sample in the run. | False | +| resfinder_version | string | ResFinder version to use. | ResFinder is the tool used to check for antimicrobial resistance genes in isolates of bacteria. | 4.3.2 | +| resfinder_threshold | string | Threshold of required identity to report a match between a gene in the ResFinder database and the assembly. 
Valid interval: 0.00-1.00 | Identity refers to the ratio of base pairs that match between the sequence in your assembly and that of the sequence in the ResFinder database. Increasing the threshold will result in fewer, but more accurate hits against the database. | 0.8 | +| resfinder_coverage | string | Minimum coverage (breadth-of) threshold required to report a match between a gene in the ResFinder database and the assembly. Valid interval: 0.00-1.00 | The amount of an AMR gene that has to be present within the assembly as compared to the reference in the ResFinder database. | 0.6 | +| mlst_version | string | MLST version to use. | | 2.23.0 | + + +### Advanced Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| run_prokka | boolean | Run prokka on consensus sequence | Will provide an output file with a list of annotations for your sequence. Optional because it can take some time. | True | +| prokka_opts | string | Command-line arguments for prokka | [Command line arguments](https://github.com/tseemann/prokka#command-line-options) which can be used to alter prokka output annotation files. | | +| flye_opts | string | Command-line arguments for flye | [Command line arguments](https://github.com/fenderglass/Flye/blob/flye/docs/USAGE.md#-quick-usage) which can be used to alter the de-novo assembly process. | | +| medaka_consensus_model | string | The name of a Medaka consensus model to use. This name will override the model automatically chosen based on the provided basecaller configuration. | The workflow will attempt to map the basecalling model used to a suitable Medaka consensus model. You can override this by providing a model with this option instead. | | +| medaka_variant_model | string | The name of a Medaka variant model to use. This name will override the model automatically chosen based on the provided basecaller configuration. 
| The workflow will attempt to map the basecalling model used to a suitable Medaka variant model. You can override this by providing a model with this option instead. | | + + +### Miscellaneous Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| threads | integer | Number of CPU threads. | Provided to alignment, flye assembly and prokka steps to improve performance. | 3 | +| disable_ping | boolean | Enable to prevent sending a workflow ping. | | False | + + + + + + +## Outputs + +Outputs files may be aggregated including information for all samples or provided per sample. Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}. + +| Title | File path | Description | Per sample or aggregated | +|-------|-----------|-------------|--------------------------| +| Workflow report | ./wf-bacterial-genomes-report.html | Report for all samples | aggregated | +| Draft assembly FASTA file | ./{{ alias }}.medaka.fasta.gz | Consensus file generated from either de-novo assembly or reference variant calling pipeline. | per-sample | +| Variants VCF file | ./{{ alias }}.medaka.vcf.gz | VCF file of variants detected against the provided reference (Reference mode only). | per-sample | +| Variants summary | ./{{ alias }}.variants.stats | TSV file of summary statistics for variants in sample (Reference mode only). | per-sample | +| Annotations files | ./{{ alias }}.prokka.{gbk,gff} | Annotations of regions of interest in assembly in GBK and GFF format. | per-sample | +| Sequence typing results | ./{{ alias }}.mlst.json | Sequence typing results in JSON format (isolates mode only). | per-sample | +| AMR calling results | ./{{ alias }}_resfinder_results | Resfinder results for AMR calling (isolates mode only). 
| per-sample | +| isolates per sample report | /{{ alias }}-isolates-report.html | Per sample report isolates mode | per-sample | + + + + +## Pipeline overview + + +### 1. Concatenates input files and generate per read stats. + +The [fastcat/bamstats](https://github.com/epi2me-labs/fastcat) tool is used to concatenate multifile samples to be processed by the workflow. It will also output per read stats including average read lengths and qualities. + +### 2a. De-novo assembly + +#### i. Assembly + +[Flye](https://github.com/fenderglass/Flye) is used to create a draft assembly from the FASTQ reads. This will run by default with the `--nano-raw` parameter for flye. Additional configuration can be performed using the `--flye_opts` parameter. + +#### ii. Polishing + +The draft assembly from flye is then polished using [Medaka](https://github.com/nanoporetech/medaka). This step will attempt to correct any errors that were introduced during the de-novo assembly process. + +The workflow selects the appropriate [Medaka models](https://github.com/nanoporetech/medaka#models) based on the basecaller configuration that was used to process the signal data. +You can use the parameter `--basecaller_cfg` to provide this information (e.g. `dna_r10.4.1_e8.2_400bps_hac`). +Alternatively, you can choose the [Medaka](https://github.com/nanoporetech/medaka) model directly with `--medaka_consensus_model`. + + +### 2b. Variant calling mode + +#### i. Align reads + +Reads are aligned against the provided reference with [mini_align](https://github.com/nanoporetech/pomoxis/). + +#### ii. Call variants + +After alignment, haploid variants are called with [Medaka](https://github.com/nanoporetech/medaka). + +The workflow selects the appropriate [Medaka models](https://github.com/nanoporetech/medaka#models) based on the basecaller configuration that was used to process the signal data. +You can use the parameter `--basecaller_cfg` to provide this information (e.g. `dna_r10.4.1_e8.2_400bps_hac`). 
+Alternatively, you can choose the [Medaka](https://github.com/nanoporetech/medaka) model directly with `--medaka_variant_model`. + +#### iii. Use the variants to generate a consensus + +The variants passing the depth filter are then incorporated in the reference to create the consensus sequence. Variant stats are also created at this point. + +### 3. Annotations + +Regions of interest within your assembly are identified and annotated using [Prokka](https://github.com/tseemann/prokka). By default, prokka will run with its default databases, but users can refine the annotation using the `--prokka_opts` parameter. **NOTE** The workflow does not currently accept any additional files sent to prokka such as GBK or GFF files. + +### 4. Isolates mode (optional) + +#### i. Multi-locus sequence typing (MLST) + +MLST is a common technique used to help characterise your bacterial isolate, by using allelic variation from internal DNA fragments of 6-7 house keeping genes. Typing schemes for specific species and genera are found on [PubMLST](https://pubmlst.org/) and are pre-loaded into this workflow. [MLST](https://github.com/tseemann/mlst) will try to infer the correct typing scheme to use by scanning the assembly and subsequently identify the allele variant found. + +#### ii. Antimicrobial resistance (AMR) calling + +[ResFinder](https://bitbucket.org/genomicepidemiology/resfinder/src/master/) is used to identify genes/SNVs associated with AMR in your assembly. Assemblies of any species will be searched for the detection of acquired resistance genes, however SNVs conferring resistance are only available to a few well characterised species/genera. These are: +* Campylobacter spp. +* Enterococcus faecalis +* Enterococcus faecium +* Escherichia coli +* Helicobacter pylori +* Klebsiella spp. +* Mycobacterium tuberculosis +* Neisseria gonorrhoeae +* Salmonella spp. 
+* Staphylococcus aureus + +The species/genera of your assembly will be detected from the results of the MLST step and SNV will be selected automatically if applicable. + + + + + + + + +## Troubleshooting + + ++ If the workflow fails please run it with the demo data set to ensure the workflow itself is working. This will help us determine if the issue is related to the environment, input parameters or a bug. ++ See how to interpret some common nextflow exit codes [here](https://labs.epi2me.io/trouble-shooting/). + + + +## FAQ's + + + +### No results for multi-locus sequence typing? +This usually occurs if the assembly is incomplete and does not have sufficient coverage to identify the house keeping genes of the typing scheme. Another, rarer scenario is if the assembly is from an organism with no typing scheme. A list of the available typing schemes can be found [here](https://github.com/tseemann/mlst/tree/master/db/pubmlst). In both scenarios, AMR calling will still be performed but only for acquired resistance genes. + + + +If your question is not answered here, please report any issues or suggestions on the [github issues](https://github.com/epi2me-labs/wf-bacterial-genomes/issues) page or start a discussion on the [community](https://community.nanoporetech.com/). + + -to see the options for the workflow. +## Related blog posts -**Workflow outputs** + -The primary outputs of the workflow include: +## Related blog posts -* a [FASTA](https://en.wikipedia.org/wiki/FASTA) consensus sequence scaffolded from a provided reference sequence, -* a [VCF](https://en.wikipedia.org/wiki/Variant_Call_Format) file containing variants in the sample compared to the reference (if provided), -* an HTML report document detailing QC metrics and the primary findings of the workflow, -* (optionally) an annotation of the consensus sequence using prokka. -* (optionally) a per-sample ResFinder output directory with various results. 
++ [Importing third-party workflows into EPI2ME Labs](https://labs.epi2me.io/nexflow-for-epi2melabs/) +See the [EPI2ME website](https://labs.epi2me.io/) for lots of other resources and blog posts. -## Useful links -* [nextflow](https://www.nextflow.io/) -* [docker](https://www.docker.com/products/docker-desktop) -* [Singularity](https://sylabs.io/singularity/) -* [flye](https://github.com/fenderglass/Flye) -* [mini_align](https://github.com/nanoporetech/pomoxis/blob/master/scripts/mini_align) -* [prokka](https://github.com/tseemann/prokka) -* [ResFinder](https://bitbucket.org/genomicepidemiology/resfinder/src/master/) \ No newline at end of file diff --git a/bin/workflow_glue/check_sample_sheet.py b/bin/workflow_glue/check_sample_sheet.py index d77f1ab..fe4fc37 100755 --- a/bin/workflow_glue/check_sample_sheet.py +++ b/bin/workflow_glue/check_sample_sheet.py @@ -2,6 +2,7 @@ import codecs import csv import os +import re import sys from .util import get_named_logger, wf_parser # noqa: ABS101 @@ -79,6 +80,19 @@ def main(args): sys.stdout.write(f"Parsing error: {e}") sys.exit() + # check barcodes are correct format + for barcode in barcodes: + if not re.match(r'^barcode\d\d+$', barcode): + sys.stdout.write("values in 'barcode' column are incorrect format") + sys.exit() + + # check barcodes are all the same length + first_length = len(barcodes[0]) + for barcode in barcodes[1:]: + if len(barcode) != first_length: + sys.stdout.write("values in 'barcode' column are different lengths") + sys.exit() + # check barcode and alias values are unique if len(barcodes) > len(set(barcodes)): sys.stdout.write("values in 'barcode' column not unique") diff --git a/bin/workflow_glue/results_schema.py b/bin/workflow_glue/results_schema.py new file mode 100755 index 0000000..4d91dbc --- /dev/null +++ b/bin/workflow_glue/results_schema.py @@ -0,0 +1,64 @@ +# generated by datamodel-codegen: +# filename: results_schema.yml + +from __future__ import annotations + +from enum import Enum +from 
typing import Any, Dict, List + +from pydantic import Field + +from workflow_glue.results_schema_helpers import BaseModel + + +class SampleType(Enum): + """ + The type of the sample + """ + + no_template_control = 'no_template_control' + positive_control = 'positive_control' + negative_control = 'negative_control' + test_sample = 'test_sample' + + +class CheckResult(BaseModel): + """ + A result of some check the workflow has performed on a sample, or itself + """ + + check_name: str = Field(..., description='The name of the check') + check_pass: bool = Field(..., description='If true the check has passed') + + +class Sample(BaseModel): + """ + A sample sheet entry and its corresponding checks and related results + """ + + alias: str = Field(..., description='The alias for the sample given by the user') + barcode: str = Field(..., description='The physical barcode assigned to the sample') + sample_type: SampleType = Field(..., description='The type of the sample') + sample_pass: bool = Field( + ..., description='If true the sample has passed workflow checks' + ) + sample_checks: List[CheckResult] = Field( + ..., description='An array of checks performed on the sample' + ) + results: Dict[str, Any] = Field( + ..., description='Further specific workflow results for this sample' + ) + + +class WorkflowResult(BaseModel): + """ + Definition for results that will be returned by this workflow. This structure will be passed through by Gizmo speaking clients as WorkflowInstance.results. 
+ """ + + workflow_pass: bool = Field( + ..., description='True if this workflow instance passes all checks' + ) + workflow_checks: List[CheckResult] = Field( + ..., description='An array of checks performed on the workflow instance' + ) + samples: List[Sample] = Field(..., description='Samples in this workflow instance') diff --git a/bin/workflow_glue/results_schema_helpers.py b/bin/workflow_glue/results_schema_helpers.py new file mode 100755 index 0000000..da15026 --- /dev/null +++ b/bin/workflow_glue/results_schema_helpers.py @@ -0,0 +1,12 @@ +"""Helpers for the auto-generated schema code.""" +from pydantic import BaseModel as PydanticBaseModel + + +class BaseModel(PydanticBaseModel): + """Extend base model.""" + + class Config: + """Config items for the pydantic code.""" + + # make enums json serializable + use_enum_values = True diff --git a/docs/01_brief_description.md b/docs/01_brief_description.md new file mode 100644 index 0000000..9ac1e9d --- /dev/null +++ b/docs/01_brief_description.md @@ -0,0 +1 @@ +Assembly, variant calling, and annotation of bacterial genomes. \ No newline at end of file diff --git a/docs/02_introduction.md b/docs/02_introduction.md new file mode 100644 index 0000000..c80ab87 --- /dev/null +++ b/docs/02_introduction.md @@ -0,0 +1,12 @@ + + +This workflow is primarily used to assemble genomes from bacterial reads and provide information on features of interest within those assemblies through annotations. + +The workflow can provide additional information about the assembly, such as antimicrobial resistance (AMR) analysis and sequence typing through an optional `--isolates` mode. 
+ +In brief, this workflow will perform the following: + ++ De novo (or reference-based) assembly of bacterial genomes ++ Annotation of regions of interest within the assembly ++ Species identification and sequence typing (`--isolates` mode only) ++ Identify genes and SNVs associated with AMR (`--isolates` mode only) \ No newline at end of file diff --git a/docs/03_compute_requirements.md b/docs/03_compute_requirements.md new file mode 100644 index 0000000..dc86d5b --- /dev/null +++ b/docs/03_compute_requirements.md @@ -0,0 +1,13 @@ +Recommended requirements: + ++ CPUs = 16 ++ Memory = 32GB + +Minimum requirements: + ++ CPUs = 4 ++ Memory = 8GB + +Approximate run time: 20-40 minutes per sample with ~50x coverage using minimum requirements + +ARM processor support: True diff --git a/docs/04_install_and_run.md b/docs/04_install_and_run.md new file mode 100644 index 0000000..519a126 --- /dev/null +++ b/docs/04_install_and_run.md @@ -0,0 +1,35 @@ + + +These are instructions to install and run the workflow on command line. You can also access the workflow via the [EPI2ME application](https://labs.epi2me.io/downloads/). + +The workflow uses [Nextflow](https://www.nextflow.io/) to manage compute and software resources, therefore nextflow will need to be installed before attempting to run the workflow. + +The workflow can currently be run using either [Docker](https://www.docker.com/products/docker-desktop) or +[Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) to provide isolation of +the required software. Both methods are automated out-of-the-box provided +either docker or singularity is installed. This is controlled by the [`-profile`](https://www.nextflow.io/docs/latest/config.html#config-profiles) parameter as exemplified below. + +It is not required to clone or download the git repository in order to run the workflow. +More information on running EPI2ME workflows can be found on our [website](https://labs.epi2me.io/wfindex). 
+ +The following command can be used to obtain the workflow. This will pull the repository into the assets folder of nextflow and provide a list of all parameters available for the workflow as well as an example command: + +``` +nextflow run epi2me-labs/wf-bacterial-genomes --help +``` +A demo dataset is provided for testing of the workflow. It can be downloaded using: +``` +wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-bacterial-genomes/wf-bacterial-genomes-demo.tar.gz +tar -xzvf wf-bacterial-genomes-demo.tar.gz +``` +The workflow can be run with the demo data using: +``` +nextflow run epi2me-labs/wf-bacterial-genomes \ + --fastq wf-bacterial-genomes-demo/isolates_fastq \ + --isolates \ + --reference_based_assembly \ + --reference wf-bacterial-genomes-demo/ref/ref.fasta.gz \ + --sample_sheet wf-bacterial-genomes-demo/isolates_sample_sheet.csv \ + -profile standard +``` +For further information about running a workflow on the cmd line see https://labs.epi2me.io/wfquickstart/ \ No newline at end of file diff --git a/docs/05_related_protocols.md b/docs/05_related_protocols.md new file mode 100644 index 0000000..629c76b --- /dev/null +++ b/docs/05_related_protocols.md @@ -0,0 +1,10 @@ + + +This workflow is designed to take input sequences that have been produced from [Oxford Nanopore Technologies](https://nanoporetech.com/) devices. + +Optimal DNA extraction will be dependent on the gram status of the organism. Some useful protocols are provided below: ++ [Gram-positive bacteria](https://community.nanoporetech.com/extraction_method_groups/gram-positive-bacterial-gnda) ++ [Gram-negative bacteria](https://community.nanoporetech.com/extraction_methods/gram-ve-dna) + + +Find more related protocols in the [Nanopore community](https://community.nanoporetech.com/docs/). 
\ No newline at end of file diff --git a/docs/06_inputs.md b/docs/06_inputs.md new file mode 100644 index 0000000..4c4f56a --- /dev/null +++ b/docs/06_inputs.md @@ -0,0 +1,49 @@ +### Input Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| fastq | string | FASTQ files to use in the analysis. | This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | +| reference_based_assembly | boolean | Enable reference guided assembly instead of de-novo assembly. | By default de-novo assembly will be performed with Flye. Enable this to instead perform a reference-based consensus. A reference must be provided. | False | +| reference | string | Reference sequence FASTA file. | The reference sequence is used when performing reference-based assembly. | | +| basecaller_cfg | string | Name of the model that was used to basecall signal data, used to select an appropriate Medaka model. | The basecaller configuration is used to automatically select the appropriate Medaka model. The automatic selection can be overridden with the `medaka_variant_model` and `medaka_consensus_model` parameters. The model list only shows models that are compatible with this workflow. | dna_r10.4.1_e8.2_400bps_sup@v4.2.0 | +| analyse_unclassified | boolean | Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory. 
| If selected and if the input is a multiplex directory the workflow will also process the unclassified directory. | False | + + +### Sample Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| sample_sheet | string | A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. | The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed. A `type` column is required for certain workflows and should have the following values; `test_sample`, `positive_control`, `negative_control`, `no_template_control`. | | +| sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | | + + +### Isolate options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| isolates | boolean | Run the Isolates pipeline on the assembly results if set to True. | Isolates mode adds further analysis options to the workflow such as multi-locus sequence typing and antimicrobial resistance calling, as well as producing single reports for each sample in the run. | False | +| resfinder_version | string | ResFinder version to use. | ResFinder is the tool used to check for antimicrobial resistance genes in isolates of bacteria. | 4.3.2 | +| resfinder_threshold | string | Threshold of required identity to report a match between a gene in the ResFinder database and the assembly. Valid interval: 0.00-1.00 | Identity refers to the ratio of base pairs that match between the sequence in your assembly and that of the sequence in the ResFinder database. Increasing the threshold will results in fewer, but more accurate hits against the database. 
| 0.8 | +| resfinder_coverage | string | Minimum coverage (breadth-of) threshold required to report a match between a gene in the ResFinder database and the assembly. Valid interval: 0.00-1.00 | The amount of an AMR gene that has to be present within the assembly as compared to the reference in the ResFinder database. | 0.6 | +| mlst_version | string | MLST version to use. | | 2.23.0 | + + +### Advanced Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| run_prokka | boolean | Run prokka on consensus sequence | Will provide an output file with a list of annotations for your sequence. Optional because it can take some time. | True | +| prokka_opts | string | Command-line arguments for prokka | [Command line arguments](https://github.com/tseemann/prokka#command-line-options) which can be used to alter prokka output annotation files. | | +| flye_opts | string | Command-line arguments for flye | [Command line arguments](https://github.com/fenderglass/Flye/blob/flye/docs/USAGE.md#-quick-usage) which can be used to alter the de-novo assembly process. | | +| medaka_consensus_model | string | The name of a Medaka consensus model to use. This name will override the model automatically chosen based on the provided basecaller configuration. | The workflow will attempt to map the basecalling model used to a suitable Medaka consensus model. You can override this by providing a model with this option instead. | | +| medaka_variant_model | string | The name of a Medaka variant model to use. This name will override the model automatically chosen based on the provided basecaller configuration. | The workflow will attempt to map the basecalling model used to a suitable Medaka variant model. You can override this by providing a model with this option instead. 
| | + + +### Miscellaneous Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| threads | integer | Number of CPU threads. | Provided to alignment, flye assembly and prokka steps to improve performance. | 3 | +| disable_ping | boolean | Enable to prevent sending a workflow ping. | | False | + + diff --git a/docs/07_outputs.md b/docs/07_outputs.md new file mode 100644 index 0000000..1aae658 --- /dev/null +++ b/docs/07_outputs.md @@ -0,0 +1,12 @@ +Outputs files may be aggregated including information for all samples or provided per sample. Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}. + +| Title | File path | Description | Per sample or aggregated | +|-------|-----------|-------------|--------------------------| +| Workflow report | ./wf-bacterial-genomes-report.html | Report for all samples | aggregated | +| Draft assembly FASTA file | ./{{ alias }}.medaka.fasta.gz | Consensus file generated from either de-novo assembly or reference variant calling pipeline. | per-sample | +| Variants VCF file | ./{{ alias }}.medaka.vcf.gz | VCF file of variants detected against the provided reference (Reference mode only). | per-sample | +| Variants summary | ./{{ alias }}.variants.stats | TSV file of summary statistics for variants in sample (Reference mode only). | per-sample | +| Annotations files | ./{{ alias }}.prokka.{gbk,gff} | Annotations of regions of interest in assembly in GBK and GFF format. | per-sample | +| Sequence typing results | ./{{ alias }}.mlst.json | Sequence typing results in JSON format (isolates mode only). | per-sample | +| AMR calling results | ./{{ alias }}_resfinder_results | Resfinder results for AMR calling (isolates mode only). 
| per-sample | +| isolates per sample report | /{{ alias }}-isolates-report.html | Per sample report isolates mode | per-sample | diff --git a/docs/08_pipeline_overview.md b/docs/08_pipeline_overview.md new file mode 100644 index 0000000..c332505 --- /dev/null +++ b/docs/08_pipeline_overview.md @@ -0,0 +1,67 @@ + +### 1. Concatenate input files and generate per-read stats. + +The [fastcat/bamstats](https://github.com/epi2me-labs/fastcat) tool is used to concatenate multifile samples to be processed by the workflow. It will also output per-read stats including average read lengths and qualities. + +### 2a. De-novo assembly + +#### i. Assembly + +[Flye](https://github.com/fenderglass/Flye) is used to create a draft assembly from the FASTQ reads. This will run by default with the `--nano-raw` parameter for flye. Additional configuration can be performed using the `--flye_opts` parameter. + +#### ii. Polishing + +The draft assembly from flye is then polished using [Medaka](https://github.com/nanoporetech/medaka). This step will attempt to correct any errors that were introduced during the de-novo assembly process. + +The workflow selects the appropriate [Medaka models](https://github.com/nanoporetech/medaka#models) based on the basecaller configuration that was used to process the signal data. +You can use the parameter `--basecaller_cfg` to provide this information (e.g. `dna_r10.4.1_e8.2_400bps_hac`). +Alternatively, you can choose the [Medaka](https://github.com/nanoporetech/medaka) model directly with `--medaka_consensus_model`. + + +### 2b. Variant calling mode + +#### i. Align reads + +Reads are aligned against the provided reference with [mini_align](https://github.com/nanoporetech/pomoxis/). + +#### ii. Call variants + +After alignment, haploid variants are called with [Medaka](https://github.com/nanoporetech/medaka). 
+ +The workflow selects the appropriate [Medaka models](https://github.com/nanoporetech/medaka#models) based on the basecaller configuration that was used to process the signal data. +You can use the parameter `--basecaller_cfg` to provide this information (e.g. `dna_r10.4.1_e8.2_400bps_hac`). +Alternatively, you can choose the [Medaka](https://github.com/nanoporetech/medaka) model directly with `--medaka_variant_model`. + +#### iii. Use the variants to generate a consensus + +The variants passing the depth filter are then incorporated in the reference to create the consensus sequence. Variant stats are also created at this point. + +### 3. Annotations + +Regions of interest within your assembly are identified and annotated using [Prokka](https://github.com/tseemann/prokka). By default, prokka will run with its default databases, but users can refine the annotation using the `--prokka_opts` parameter. **NOTE** The workflow does not currently accept any additional files sent to prokka such as GBK or GFF files. + +### 4. Isolates mode (optional) + +#### i. Multi-locus sequence typing (MLST) + +MLST is a common technique used to help characterise your bacterial isolate, by using allelic variation from internal DNA fragments of 6-7 housekeeping genes. Typing schemes for specific species and genera are found on [PubMLST](https://pubmlst.org/) and are pre-loaded into this workflow. [MLST](https://github.com/tseemann/mlst) will try to infer the correct typing scheme to use by scanning the assembly and subsequently identify the allele variant found. + +#### ii. Antimicrobial resistance (AMR) calling + +[ResFinder](https://bitbucket.org/genomicepidemiology/resfinder/src/master/) is used to identify genes/SNVs associated with AMR in your assembly. Assemblies of any species will be searched for the detection of acquired resistance genes, however SNVs conferring resistance are only available for a few well characterised species/genera. These are: +* Campylobacter spp. 
+* Enterococcus faecalis +* Enterococcus faecium +* Escherichia coli +* Helicobacter pylori +* Klebsiella spp. +* Mycobacterium tuberculosis +* Neisseria gonorrhoeae +* Salmonella spp. +* Staphylococcus aureus + +The species/genera of your assembly will be detected from the results of the MLST step and SNV will be selected automatically if applicable. + + + + diff --git a/docs/09_troubleshooting.md b/docs/09_troubleshooting.md new file mode 100644 index 0000000..e8ee712 --- /dev/null +++ b/docs/09_troubleshooting.md @@ -0,0 +1,3 @@ + ++ If the workflow fails please run it with the demo data set to ensure the workflow itself is working. This will help us determine if the issue is related to the environment, input parameters or a bug. ++ See how to interpret some common nextflow exit codes [here](https://labs.epi2me.io/trouble-shooting/). \ No newline at end of file diff --git a/docs/10_FAQ.md b/docs/10_FAQ.md new file mode 100644 index 0000000..04b9166 --- /dev/null +++ b/docs/10_FAQ.md @@ -0,0 +1,8 @@ + + +### No results for multi-locus sequence typing? +This usually occurs if the assembly is incomplete and does not have sufficient coverage to identify the house keeping genes of the typing scheme. Another, rarer scenario is if the assembly is from an organism with no typing scheme. A list of the available typing schemes can be found [here](https://github.com/tseemann/mlst/tree/master/db/pubmlst). In both scenarios, AMR calling will still be performed but only for acquired resistance genes. + + + +If your question is not answered here, please report any issues or suggestions on the [github issues](https://github.com/epi2me-labs/wf-bacterial-genomes/issues) page or start a discussion on the [community](https://community.nanoporetech.com/). 
\ No newline at end of file diff --git a/docs/11_other.md b/docs/11_other.md new file mode 100644 index 0000000..43d2754 --- /dev/null +++ b/docs/11_other.md @@ -0,0 +1,7 @@ + + +## Related blog posts + ++ [Importing third-party workflows into EPI2ME Labs](https://labs.epi2me.io/nexflow-for-epi2melabs/) + +See the [EPI2ME website](https://labs.epi2me.io/) for lots of other resources and blog posts. \ No newline at end of file diff --git a/docs/header.md b/docs/header.md deleted file mode 100644 index fd3549d..0000000 --- a/docs/header.md +++ /dev/null @@ -1,4 +0,0 @@ -# Bacterial genomes Workflow - -This repository contains a [nextflow](https://www.nextflow.io/) workflow -for analysing bacterial genomes. \ No newline at end of file diff --git a/docs/intro.md b/docs/intro.md deleted file mode 100644 index 2ab7a5c..0000000 --- a/docs/intro.md +++ /dev/null @@ -1,9 +0,0 @@ -# Introduction - -If no reference is included assembly will be -completed using [flye](https://github.com/fenderglass/Flye) and polished with -[medaka](https://www.github.com/nanoporetech/medaka). If a reference is provided -alignment will be done with [mini_align](https://github.com/nanoporetech/pomoxis/blob/master/scripts/mini_align) -and variant called using medaka. The workflow has a few optional extras. It can run -[prokka](https://github.com/tseemann/prokka) to annotate the resulting -consensus sequence or [ResFinder](https://bitbucket.org/genomicepidemiology/resfinder/src/master/) to check it against a database of antimicrobial resistance genes. 
diff --git a/docs/links.md b/docs/links.md deleted file mode 100644 index ac301a2..0000000 --- a/docs/links.md +++ /dev/null @@ -1,9 +0,0 @@ -## Useful links - -* [nextflow](https://www.nextflow.io/) -* [docker](https://www.docker.com/products/docker-desktop) -* [Singularity](https://sylabs.io/singularity/) -* [flye](https://github.com/fenderglass/Flye) -* [mini_align](https://github.com/nanoporetech/pomoxis/blob/master/scripts/mini_align) -* [prokka](https://github.com/tseemann/prokka) -* [ResFinder](https://bitbucket.org/genomicepidemiology/resfinder/src/master/) \ No newline at end of file diff --git a/docs/quickstart.md b/docs/quickstart.md deleted file mode 100644 index 959db19..0000000 --- a/docs/quickstart.md +++ /dev/null @@ -1,34 +0,0 @@ -## Quickstart - -The workflow uses [nextflow](https://www.nextflow.io/) to manage compute and -software resources, as such nextflow will need to be installed before attempting -to run the workflow. - -The workflow can currently be run using either -[Docker](https://www.docker.com/products/docker-desktop) (default) or -[Singularity](https://sylabs.io/singularity/) (`-profile singularity`) to provide isolation of -the required software. Both methods are automated out-of-the-box provided -either Docker or Singularity is installed. - -It is not required to clone or download the git repository in order to run the workflow. -For more information on running EPI2ME Labs workflows [visit out website](https://labs.epi2me.io/wfindex). - -**Workflow options** - -To obtain the workflow, having installed `nextflow`, users can run: - -``` -nextflow run epi2me-labs/wf-bacterial-genomes --help -``` - -to see the options for the workflow. 
- -**Workflow outputs** - -The primary outputs of the workflow include: - -* a [FASTA](https://en.wikipedia.org/wiki/FASTA) consensus sequence scaffolded from a provided reference sequence, -* a [VCF](https://en.wikipedia.org/wiki/Variant_Call_Format) file containing variants in the sample compared to the reference (if provided), -* an HTML report document detailing QC metrics and the primary findings of the workflow, -* (optionally) an annotation of the consensus sequence using prokka. -* (optionally) a per-sample ResFinder output directory with various results. \ No newline at end of file diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy index 3b29be1..81fdc2e 100644 --- a/lib/NfcoreSchema.groovy +++ b/lib/NfcoreSchema.groovy @@ -141,7 +141,7 @@ class NfcoreSchema { for (specifiedParam in params.keySet()) { // nextflow params if (nf_params.contains(specifiedParam)) { - log.error "ERROR: You used a core Nextflow option with two hyphens: '--${specifiedParam}'. Please resubmit with '-${specifiedParam}'" + log.error "You used a core Nextflow option with two hyphens: '--${specifiedParam}'. Please resubmit with '-${specifiedParam}'" has_error = true } // unexpected params @@ -180,7 +180,7 @@ class NfcoreSchema { schema.validate(params_json) } catch (ValidationException e) { println '' - log.error 'ERROR: Validation of pipeline parameters failed!' + log.error 'Validation of pipeline parameters failed!' 
JSONObject exceptionJSON = e.toJSON() HashSet observed_exceptions = [] printExceptions(exceptionJSON, params_json, log, enums, raw_schema, observed_exceptions) diff --git a/lib/common.nf b/lib/common.nf new file mode 100644 index 0000000..d843ab1 --- /dev/null +++ b/lib/common.nf @@ -0,0 +1,15 @@ +import groovy.json.JsonBuilder + +process getParams { + label "wf_common" + cpus 1 + output: + path "params.json" + script: + String paramsJSON = new JsonBuilder(params).toPrettyString() + """ + # Output nextflow params object to JSON + echo '$paramsJSON' > params.json + """ +} + diff --git a/lib/ingress.nf b/lib/ingress.nf index 5839730..b5d8dfa 100644 --- a/lib/ingress.nf +++ b/lib/ingress.nf @@ -240,6 +240,7 @@ process checkBamHeaders { label "ingress" label "wf_common" cpus 1 + memory "2 GB" input: tuple val(meta), path("input_dir/reads*.bam") output: // set the two env variables by `eval`-ing the output of the python script @@ -257,6 +258,7 @@ process mergeBams { label "ingress" label "wf_common" cpus 3 + memory "4 GB" input: tuple val(meta), path("input_bams/reads*.bam") output: tuple val(meta), path("reads.bam") shell: @@ -271,6 +273,7 @@ process catSortBams { label "ingress" label "wf_common" cpus 4 + memory "4 GB" input: tuple val(meta), path("input_bams/reads*.bam") output: tuple val(meta), path("reads.bam") script: @@ -285,6 +288,7 @@ process sortBam { label "ingress" label "wf_common" cpus 3 + memory "4 GB" input: tuple val(meta), path("reads.bam") output: tuple val(meta), path("reads.sorted.bam") script: @@ -298,6 +302,7 @@ process bamstats { label "ingress" label "wf_common" cpus 3 + memory "4 GB" input: tuple val(meta), path("reads.bam") output: @@ -414,6 +419,7 @@ process move_or_compress_fq_file { label "ingress" label "wf_common" cpus 1 + memory "2 GB" input: // don't stage `input` with a literal because we check the file extension tuple val(meta), path(input) @@ -439,6 +445,7 @@ process fastcat { label "ingress" label "wf_common" cpus 3 + memory "2 GB" 
input: tuple val(meta), path("input") val extra_args @@ -737,6 +744,7 @@ process validate_sample_sheet { cpus 1 label "ingress" label "wf_common" + memory "2 GB" input: path "sample_sheet.csv" val required_sample_types diff --git a/main.nf b/main.nf index c050379..f7b9073 100644 --- a/main.nf +++ b/main.nf @@ -12,6 +12,7 @@ FLYE_MIN_COVERAGE_THRESHOLD = 5 process readStats { label "wfbacterialgenomes" cpus 1 + memory "2 GB" input: tuple val(meta), path("align.bam"), path("align.bam.bai") output: @@ -29,6 +30,7 @@ process readStats { process coverStats { label "wfbacterialgenomes" cpus 2 + memory "2 GB" input: tuple val(meta), path("align.bam"), path("align.bam.bai") output: @@ -47,6 +49,7 @@ process coverStats { process deNovo { label "wfbacterialgenomes" cpus params.threads + memory "16 GB" input: tuple val(meta), path("reads.fastq.gz") output: @@ -98,6 +101,7 @@ process deNovo { process alignReads { label "wfbacterialgenomes" cpus params.threads + memory "8 GB" input: tuple val(meta), path("reads.fastq.gz"), path("ref.fasta.gz") output: @@ -113,6 +117,7 @@ process splitRegions { label "medaka" cpus 1 + memory "4 GB" input: tuple val(meta), path("align.bam"), path("align.bam.bai") output: @@ -134,7 +139,6 @@ process splitRegions { """ } - // TODO: in a single GPU environment it would be better just // to use a single process for the whole bam file. 
Need // to read up on conditional channels @@ -144,6 +148,7 @@ process medakaNetwork { label "medaka" cpus 2 + memory "8 GB" input: tuple val(meta), path("align.bam"), path("align.bam.bai"), val(reg), val(medaka_model) output: @@ -165,6 +170,7 @@ process medakaVariantHdf { label "medaka" cpus 2 + memory "8 GB" input: tuple val(meta), path("align.bam"), path("align.bam.bai"), val(reg), val(medaka_model) output: @@ -184,6 +190,7 @@ process medakaVariantHdf { process medakaVariant { label "medaka" cpus 1 + memory "4 GB" input: tuple val(meta), path("consensus_probs*.hdf"), path("align.bam"), path("align.bam.bai"), path("ref.fasta.gz") output: @@ -205,6 +212,7 @@ process medakaVariant { process medakaConsensus { label "medaka" cpus 1 + memory "4 GB" input: tuple val(meta), path("align.bam"), path("align.bam.bai"), path("consensus_probs*.hdf"), path("reference*") output: @@ -221,6 +229,7 @@ process runProkka { // run prokka in a basic way on the consensus sequence label "prokka" cpus params.threads + memory "4 GB" input: tuple val(meta), path("consensus.fasta.gz") output: @@ -238,6 +247,8 @@ process runProkka { process prokkaVersion { label "prokka" + cpus 1 + memory "250 MB" output: path "prokka_version.txt" """ @@ -248,6 +259,8 @@ process prokkaVersion { process medakaVersion { label "medaka" + cpus 1 + memory "250 MB" input: path "input_versions.txt" output: @@ -260,6 +273,8 @@ process medakaVersion { process mlstVersion { label "mlst" + cpus 1 + memory "250 MB" input: path "input_version.txt" output: @@ -275,6 +290,7 @@ process mlstVersion { process getVersions { label "wfbacterialgenomes" cpus 1 + memory "500 MB" input: path "input_versions.txt" output: @@ -293,6 +309,7 @@ process getVersions { process getParams { label "wfbacterialgenomes" cpus 1 + memory "500 MB" output: path "params.json" script: @@ -307,6 +324,7 @@ process getParams { process makeReport { label "wfbacterialgenomes" cpus 1 + memory "1 GB" input: path "versions/*" path "params.json" @@ -346,6 
+364,7 @@ process makeReport { process makePerSampleReports { label "wfbacterialgenomes" cpus 1 + memory "1 GB" input: path "versions.txt" path "params.json" @@ -381,6 +400,8 @@ process makePerSampleReports { process output { // publish inputs to output directory label "wfbacterialgenomes" + cpus 1 + memory "2 GB" publishDir "${params.out_dir}", mode: 'copy', pattern: "*" input: path fname @@ -394,6 +415,8 @@ process output { process lookup_medaka_consensus_model { label "wfbacterialgenomes" + cpus 1 + memory "500 MB" input: path("lookup_table") val basecall_model @@ -409,6 +432,8 @@ process lookup_medaka_consensus_model { process lookup_medaka_variant_model { label "wfbacterialgenomes" + cpus 1 + memory "500 MB" input: path("lookup_table") val basecall_model @@ -426,6 +451,8 @@ process lookup_medaka_variant_model { // into it. process collectFastqIngressResultsInDir { label "wfbacterialgenomes" + cpus 1 + memory "2 GB" input: // both the fastcat seqs as well as stats might be `OPTIONAL_FILE` --> stage in // different sub-directories to avoid name collisions @@ -651,3 +678,4 @@ workflow.onComplete { workflow.onError { Pinguscript.ping_error(nextflow, workflow, params) } + diff --git a/modules/local/isolates.nf b/modules/local/isolates.nf index 95a1c00..f278a1a 100644 --- a/modules/local/isolates.nf +++ b/modules/local/isolates.nf @@ -1,6 +1,7 @@ process mlstSearch { label "mlst" cpus 1 + memory "1 GB" input: tuple val(meta), path("input_genome.fasta.gz") output: @@ -15,19 +16,22 @@ process mlstSearch { process getPointfinderSpecies { label "wfbacterialgenomes" cpus 1 + memory "500 MB" input: tuple val(meta), path("${meta.alias}.mlst.json") output: tuple val(meta), stdout - shell: - ''' - pf_species=$(workflow-glue pointfinder_species --mlst_json '!{meta.alias}.mlst.json') - echo $pf_species - ''' + script: + """ + workflow-glue pointfinder_species --mlst_json ${meta.alias}.mlst.json + + """ } process resfinder { label "amr" + cpus 2 + memory "2 GB" errorStrategy 
'ignore' input: tuple val(meta), path("input_genome.fasta.gz"), val(species) @@ -56,6 +60,8 @@ process resfinder { process processResfinder { // Disinfection not processed yet (CW-2106) label "wfbacterialgenomes" + cpus 2 + memory "500 MB" input: tuple val(meta), path("${meta.alias}_resfinder_results"), val(species) output: diff --git a/nextflow.config b/nextflow.config index 9976073..3650d8e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,7 +15,7 @@ params { sample = null sample_sheet = null disable_ping = false - reference_based_assembly = null + reference_based_assembly = false basecaller_cfg = "dna_r10.4.1_e8.2_400bps_sup@v4.2.0" medaka_variant_model = null medaka_consensus_model = null @@ -57,7 +57,7 @@ manifest { description = 'Assembly, variant calling, and annotation of bacterial genomes.' mainScript = 'main.nf' nextflowVersion = '>=23.04.2' - version = 'v0.4.0' + version = 'v1.0.0' } epi2melabs { @@ -65,12 +65,6 @@ epi2melabs { icon = "faBacterium" } -executor { - $local { - cpus = 4 - memory = "8 GB" - } -} // used by default for "standard" (docker) and singularity profiles, // other profiles may override. @@ -125,7 +119,7 @@ profiles { process { executor = 'awsbatch' queue = "${params.aws_queue}" - memory = '8G' + memory = '32G' withLabel:wf_common { container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}-root" } @@ -138,6 +132,13 @@ profiles { withLabel:medaka { container = "${params.aws_image_prefix}-medaka:${params.wf.container_sha_medaka}-root" } + withLabel:amr { + container = "genomicepidemiology/resfinder:${params.resfinder_version}" + containerOptions = {workflow.profile == "standard" ? 
"--entrypoint=''" : ""} + } + withLabel:mlst { + container = "staphb/mlst:${params.mlst_version}" + } shell = ['/bin/bash', '-euo', 'pipefail'] } } diff --git a/nextflow_schema.json b/nextflow_schema.json index 5eb88f7..94e1819 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -2,6 +2,7 @@ "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json", "title": "epi2me-labs/wf-bacterial-genomes", + "workflow_title" : "Bacterial assembly and annotation workflow", "description": "Assembly, variant calling, and annotation of bacterial genomes.", "demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-bacterial-genomes/wf-bacterial-genomes-demo.tar.gz", "aws_demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-bacterial-genomes/wf-bacterial-genomes-demo/aws.nextflow.config", @@ -23,6 +24,7 @@ }, "reference_based_assembly": { "type": "boolean", + "default": false, "title": "Reference-based assembly", "help_text": "By default de-novo assembly will be performed with Flye. Enable this to instead perform a reference-based consensus. A reference must be provided.", "description": "Enable reference guided assembly instead of de-novo assembly." @@ -38,7 +40,7 @@ "type": "string", "title": "Basecaller model configuration", "description": "Name of the model that was used to basecall signal data, used to select an appropriate Medaka model.", - "help_text": "The basecaller configuration is used to automatically select the appropriate Medaka model. The automatic selection can be overridden with the 'medaka_variant_model' and 'medaka_consensus_model' parameters. The model list only shows models that are compatible with this workflow.", + "help_text": "The basecaller configuration is used to automatically select the appropriate Medaka model. The automatic selection can be overridden with the `medaka_variant_model` and `medaka_consensus_model` parameters. 
The model list only shows models that are compatible with this workflow.", "default": "dna_r10.4.1_e8.2_400bps_sup@v4.2.0", "enum": [ "dna_r10.4.1_e8.2_400bps_hac@v4.2.0", @@ -108,8 +110,8 @@ } ], "dependencies": { - "reference_based_assembly": [ - "reference" + "reference": [ + "reference_based_assembly" ] } }, @@ -158,7 +160,7 @@ "title": "Isolates mode", "default": false, "description": "Run the Isolates pipeline on the assembly results if set to True.", - "help_text": "Will scan the bacterial genome output contigs and return a table of any antimicrobial resistance genes." + "help_text": "Isolates mode adds further analysis options to the workflow such as multi-locus sequence typing and antimicrobial resistance calling, as well as producing single reports for each sample in the run." }, "resfinder_version": { "type": "string", @@ -171,13 +173,15 @@ "type": "string", "title": "Resfinder gene identity threshold", "default": "0.8", - "description": "Threshold of required identity to report a match between a gene in the ResFinder database and the assembly. Valid interval: 0.00-1.00" + "description": "Threshold of required identity to report a match between a gene in the ResFinder database and the assembly. Valid interval: 0.00-1.00", + "help_text": "Identity refers to the ratio of base pairs that match between the sequence in your assembly and that of the sequence in the ResFinder database. Increasing the threshold will results in fewer, but more accurate hits against the database." }, "resfinder_coverage": { "type": "string", "title": "Resfinder gene coverage threshold", "default": "0.6", - "description": "Minimum coverage (breadth-of) threshold required to report a match between a gene in the ResFinder database and the assembly. Valid interval: 0.00-1.00" + "description": "Minimum coverage (breadth-of) threshold required to report a match between a gene in the ResFinder database and the assembly. 
Valid interval: 0.00-1.00", + "help_text": "The amount of an AMR gene that has to be present within the assembly as compared to the reference in the ResFinder database." }, "mlst_version": { "type": "string", @@ -220,13 +224,6 @@ "description": "Command-line arguments for flye", "help_text": "[Command line arguments](https://github.com/fenderglass/Flye/blob/flye/docs/USAGE.md#-quick-usage) which can be used to alter the de-novo assembly process." }, - "threads": { - "type": "integer", - "title": "Threads", - "default": 3, - "description": "Number of CPU threads.", - "help_text": "Provided to alignment, flye assembly and prokka steps to improve performance." - }, "medaka_consensus_model": { "type": "string", "title": "Medaka consensus model", @@ -247,6 +244,13 @@ "description": "Everything else.", "default": "", "properties": { + "threads": { + "type": "integer", + "title": "Threads", + "default": 3, + "description": "Number of CPU threads.", + "help_text": "Provided to alignment, flye assembly and prokka steps to improve performance." + }, "disable_ping": { "type": "boolean", "default": false, @@ -255,7 +259,7 @@ }, "help": { "type": "boolean", - "default": false, + "default": false, "description": "Display help text.", "fa_icon": "fas fa-question-circle", "hidden": true @@ -313,8 +317,16 @@ "hidden": true } }, - "docs": { - "intro": "# Introduction\n\nIf no reference is included assembly will be \ncompleted using [flye](https://github.com/fenderglass/Flye) and polished with \n[medaka](https://www.github.com/nanoporetech/medaka). If a reference is provided\nalignment will be done with [mini_align](https://github.com/nanoporetech/pomoxis/blob/master/scripts/mini_align)\nand variant called using medaka. The workflow has a few optional extras. 
It can run\n[prokka](https://github.com/tseemann/prokka) to annotate the resulting\nconsensus sequence or [ResFinder](https://bitbucket.org/genomicepidemiology/resfinder/src/master/) to check it against a database of antimicrobial resistance genes.\n", - "links": "## Useful links\n\n* [nextflow](https://www.nextflow.io/)\n* [docker](https://www.docker.com/products/docker-desktop)\n* [Singularity](https://sylabs.io/singularity/)\n* [flye](https://github.com/fenderglass/Flye)\n* [mini_align](https://github.com/nanoporetech/pomoxis/blob/master/scripts/mini_align)\n* [prokka](https://github.com/tseemann/prokka)\n* [ResFinder](https://bitbucket.org/genomicepidemiology/resfinder/src/master/)" + "resources": { + "recommended": { + "cpus": 16, + "memory": "32GB" + }, + "minimum": { + "cpus": 4, + "memory": "8GB" + }, + "run_time": "20-40 minutes per sample with ~50x coverage using minimum requirements", + "arm_support": true } } \ No newline at end of file diff --git a/output_definition.json b/output_definition.json new file mode 100644 index 0000000..2ed3937 --- /dev/null +++ b/output_definition.json @@ -0,0 +1,68 @@ +{ + "files": { + "workflow-report": { + "filepath": "./wf-bacterial-genomes-report.html", + "title": "Workflow report", + "description": "Report for all samples", + "mime-type": "text/html", + "optional": false, + "type": "aggregated" + }, + "assembly": { + "filepath": "./{{ alias }}.medaka.fasta.gz", + "title": "Draft assembly FASTA file", + "description": "Consensus file generated from either de-novo assembly or reference variant calling pipeline.", + "mime-type": "application/gzip", + "optional": false, + "type": "per-sample" + }, + "variants": { + "filepath": "./{{ alias }}.medaka.vcf.gz", + "title": "Variants VCF file", + "description": "VCF file of variants detected against the provided reference (Reference mode only).", + "mime-type": "application/gzip", + "optional": true, + "type": "per-sample" + }, + "variants_stats": { + "filepath": "./{{ alias 
}}.variants.stats", + "title": "Variants summary", + "description": "TSV file of summary statistics for variants in sample (Reference mode only).", + "mime-type": "text/tab-separated-values", + "optional": true, + "type": "per-sample" + }, + "Annotations": { + "filepath": "./{{ alias }}.prokka.{gbk,gff}", + "title": "Annotations files", + "description": "Annotations of regions of interest in assembly in GBK and GFF format.", + "mime-type": "application/octet-stream", + "optional": true, + "type": "per-sample" + }, + "mlst": { + "filepath": "./{{ alias }}.mlst.json", + "title": "Sequence typing results", + "description": "Sequence typing results in JSON format (isolates mode only).", + "mime-type": "application/json", + "optional": true, + "type": "per-sample" + }, + "amr": { + "filepath": "./{{ alias }}_resfinder_results", + "title": "AMR calling results", + "description": "Resfinder results for AMR calling (isolates mode only).", + "mime-type": "inode/directory", + "optional": true, + "type": "per-sample" + }, + "isolates-report": { + "filepath": "/{{ alias }}-isolates-report.html", + "title": "Isolates per sample report", + "description": "Per sample report isolates mode", + "mime-type": "text/html", + "optional": true, + "type": "per-sample" + } + } +} \ No newline at end of file