Merge branch 'fix-manglement' into 'dev'

Fix manglement. See merge request epi2melabs/workflows/wf-bacterial-genomes!50
epi2me-labs · Dec 1, 2022 · de15477 · de15477
2 parents 5b48b41 + 00c2ae7
commit de15477
Show file tree

Hide file tree

Showing 6 changed files with 76 additions and 75 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -14,22 +14,18 @@ docker-run:
     parallel:
         matrix:
             - MATRIX_NAME: [
-                "reference-based", "de-novo", "evaluate-assemblies"]
+                "de-novo", "reference-based"]
     rules:
         - if: ($CI_COMMIT_BRANCH == null || $CI_COMMIT_BRANCH == "dev-template")
           when: never
-        - if: $MATRIX_NAME == "reference-based"
-          variables:
-              NF_WORKFLOW_OPTS: "--fastq test_data/fastq --reference_based_assembly --evaluate_assemblies --reference test_data/ref/reference.subseq.fa.gz --threads 4 --chunk_size 100000"
-              NF_IGNORE_PROCESSES: "deNovo,assemblyStats,runProkka"
         - if: $MATRIX_NAME == "de-novo"
           variables:
               NF_WORKFLOW_OPTS: "--fastq test_data/fastq --threads 4 --chunk_size 100000"
               NF_IGNORE_PROCESSES: "medakaVariant"
-        - if: $MATRIX_NAME == "evaluate-assemblies"
+        - if: $MATRIX_NAME == "reference-based"
           variables:
-              NF_WORKFLOW_OPTS: "--fastq test_data/fastq --threads 4 --chunk_size 100000 --evaluate_assemblies"
-              NF_IGNORE_PROCESSES: "medakaVariant"
+              NF_WORKFLOW_OPTS: "--fastq test_data/fastq --reference_based_assembly --reference test_data/ref/reference.subseq.fa.gz --threads 4 --chunk_size 100000"
+              NF_IGNORE_PROCESSES: "deNovo,assemblyStats,runProkka"
 
 
 # reminder: update AUX_IMAGE_TAG if the aux container package versions are changed

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,9 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v0.2.8]
+- Output QUAST stats for both reference-based and de-novo assemblies
+
 ## [v0.2.7]
 ### Changes
 - Replace QUAST with MetaQUAST
@@ -13,7 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `-profile conda` is no longer supported, users should use `-profile standard` (Docker) or `-profile singularity` instead
 - Docs update
 ### Added
-- `nextflow run epi2me-labs/wf-human-variation --version` will now print the workflow version number and exit
+- `nextflow run epi2me-labs/wf-bacterial-genomes --version` will now print the workflow version number and exit
 ### Fixes
 - Prokka only runs in denovo assembly mode
 - Tidy up report code

diff --git a/bin/report.py b/bin/report.py
@@ -6,6 +6,7 @@
 import io
 import os
 import re
+import sys
 
 from aplanat import report
 from aplanat.components import bcfstats
@@ -67,15 +68,13 @@ def bp_to_mb(input_num):
     return round(input_num / 1000000, 2)
 
 
-def run_qc_stats(
+def get_quant_stats(
         sample_names,
         read_stats_dir,
         read_stats_suffix,
-        quast_path,
-        flye_dir,
-        flye_suffix,
+        quast_path
 ):
-    """Run QC stats."""
+    """Get Quast Stats."""
     # read stats
     read_stats = collate_stats(
         read_stats_dir, sample_names, read_stats_suffix)
@@ -98,9 +97,6 @@ def run_qc_stats(
     read_stats_out.columns = [
         'Read count', 'Median read length (bp)', 'Mean read quality',
         'Read data (Mb)']
-
-    # quast stats
-
     quast_raw_data = pd.read_csv(
         os.path.join(quast_path, "transposed_report.tsv"),
         sep='\t',
@@ -109,15 +105,25 @@ def run_qc_stats(
         '.medaka', '', regex=True)
     quast_keep_cols = [12, 13, 14, 15]
     quast_filtered_data = quast_raw_data.iloc[:, quast_keep_cols].copy()
-    # Get flye stats
     quast_filtered_data.iloc[:, [1, 2, 3]] = bp_to_mb(
         quast_filtered_data.iloc[:, [1, 2, 3]])
     quast_filtered_data.columns = [
         '# contigs',
         'Largest contig (Mb)',
         'Total length (Mb)',
         'Reference length (Mb)']
+    quant_stats = pd.merge(
+        read_stats_out, quast_filtered_data, left_index=True,
+        right_index=True)
+    return quant_stats
+
 
+def get_flye_stats(
+        sample_names,
+        flye_dir,
+        flye_suffix
+):
+    """Get Flye stats."""
     # flye stats
     flye_stats = collate_stats(
         flye_dir, sample_names, flye_suffix)
@@ -128,14 +134,7 @@ def run_qc_stats(
     flye_circular = get_circular_stats(flye_stats)
     flye_out = pd.concat([flye_cov_mean, flye_circular['Y']], axis=1)
     flye_out.columns = ['Mean contig coverage', '# circular contigs']
-    # Merge
-    merged = pd.merge(
-        read_stats_out, quast_filtered_data, left_index=True,
-        right_index=True).merge(
-            flye_out, left_index=True, right_index=True)
-    merged.index.name = None
-
-    return merged
+    return flye_out
 
 
 def run_species_stats(species_stats_path, sample_names):
@@ -218,7 +217,7 @@ def gather_sample_files(sample_names, denovo_mode, prokka_mode):
                 pass
             else:
                 final_files[name] = 'None'
-                raise FileNotFoundError(
+                sys.err.write(
                     'Missing {0} required for report for: {1}'.format(
                         name, sample_name))
         sample_files[sample_name] = final_files
@@ -253,45 +252,53 @@ def main():
         "Bacterial Genomes Summary Report",
         ("Results generated through the wf-bacterial-genomes Nextflow "
             "workflow provided by Oxford Nanopore Technologies"))
+    quant_stats = get_quant_stats(
+        sample_names=args.sample_ids,
+        read_stats_dir="stats",
+        read_stats_suffix=".stats",
+        quast_path="quast_stats")
     if not args.denovo:
         report_doc.add_section().markdown(
             "Analysis was completed using an alignment with the provided "
             "reference and medaka was used for variant calling")
+        quant_stats.index.name = None
+        stats_table = quant_stats
     else:
         report_doc.add_section().markdown(
             "As no reference was provided the reads were assembled"
             " and corrected using flye and Medaka")
-        merged_stats_df = run_qc_stats(
-            sample_names=args.sample_ids,
-            read_stats_dir="stats",
-            read_stats_suffix=".stats",
-            quast_path="quast_stats",
-            flye_dir="flye_stats",
+        flye_stats = get_flye_stats(
+            sample_names=args.sample_ids, flye_dir="flye_stats",
             flye_suffix="_flye_stats.tsv")
-        section = report_doc.add_section()
-
-        section.markdown("## Run summary statistics")
-        section.markdown("* * *")
-        section.markdown("#### Read and assembly statistics")
+        merged = pd.merge(
+            quant_stats, flye_stats, left_index=True,
+            right_index=True)
+        merged.index.name = None
+        stats_table = merged
 
-        section.markdown(
-            "This section displays the read and assembly QC"
-            " statistics for all the samples in the run.")
+    section = report_doc.add_section()
+    section.markdown("## Run summary statistics")
+    section.markdown("* * *")
+    section.markdown("#### Read and assembly statistics")
 
-        section.table(merged_stats_df, index=True)
-        section = report_doc.add_section()
-        section.markdown("#### Species ID")
+    section.markdown(
+        "This section displays the read and assembly QC"
+        " statistics for all the samples in the run.")
 
-        section.markdown(
-            "This section displays the Species ID as determined by 16S."
-            " The table shows the percentage match of the 16S sequence"
-            " to the best match of  the SILVA 16S database.")
-
-        species_stats = run_species_stats(
-            species_stats_path="quast_stats/quast_downloaded_references",
-            sample_names=args.sample_ids)
-        section.table(species_stats, index=True)
-        section.markdown('<br/>')
+    section.table(stats_table, index=True)
+    section = report_doc.add_section()
+    section.markdown("#### Species ID")
+
+    section.markdown(
+        "This section displays the Species ID as determined by 16S."
+        " The table shows the percentage match of the 16S sequence"
+        " to the best match of  the SILVA 16S database.")
+
+    species_stats = run_species_stats(
+        species_stats_path="quast_stats/quast_downloaded_references",
+        sample_names=args.sample_ids)
+    section.table(species_stats, index=True)
+    section.markdown('<br/>')
 
     sample_files = gather_sample_files(
         args.sample_ids,

diff --git a/main.nf b/main.nf
@@ -267,7 +267,7 @@ process makeReport {
         path "wf-bacterial-genomes-*.html"
     script:
         report_name = "wf-bacterial-genomes-report.html"
-        denovo = params.reference == null ? "--denovo" : ""
+        denovo = params.reference_based_assembly as Boolean ? "" : "--denovo"
         prokka = params.run_prokka as Boolean ? "--prokka" : ""
         samples = sample_ids.join(" ")
     // NOTE: the script assumes the various subdirectories
@@ -277,7 +277,7 @@ process makeReport {
     --versions versions \
     --params params.json \
     --output $report_name \
-    --sample_ids $samples \
+    --sample_ids $samples 
     """
 }
 
@@ -335,13 +335,8 @@ workflow calling_pipeline {
         consensus = medakaConsensus(hdfs_grouped)
 
         // post polishing, do assembly specific things
-        if (params.evaluate_assemblies){
-             log.info("Evaluating assemblies, set evaluate_assemblies param to False to skip.")
-             assem_stats = assemblyStats(consensus.collect({it -> it[1]}))
-        } else {
-             log.info("Not evaluating assemblies. Enable with --evaluate_assemblies true")
-             assem_stats = Channel.empty()
-        }
+        assem_stats = assemblyStats(consensus.collect({it -> it[1]}))
+
         if (!params.reference_based_assembly){
             flye_info = denovo_assem.map { it -> it[2] }
         }else{

diff --git a/nextflow.config b/nextflow.config
@@ -13,14 +13,14 @@ params {
     chunk_size = 1000000
     run_prokka = true
     prokka_opts = null
-    wfversion = "v0.2.7"
+    wfversion = "v0.2.8"
     aws_image_prefix = null
     aws_queue = null
     sample = null
     sample_sheet = null
     disable_ping = false
     reference_based_assembly = false
-    evaluate_assemblies = true
+    summarise_assemblies = true
 
     monochrome_logs = false
     validate_params = true
@@ -47,7 +47,7 @@ manifest {
     description     = 'Workflow to analyse bacterial genomes'
     mainScript      = 'main.nf'
     nextflowVersion = '>=20.10.0'
-    version         = 'v0.2.7'
+    version         = 'v0.2.8'
 }
 
 epi2melabs {

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -21,24 +21,24 @@
                 },
                 "reference_based_assembly": {
                     "type": "boolean",
-                    "help_text": "By default de novo assembly will be performed with Flye. Enable this option if you have an appropriate reference for reference-based assembly. If enabled you must provide a reference.",
-                    "description": "Enable reference guided assembly instead of de novo assembly."
+                    "help_text": "By default de-novo assembly will be performed with Flye. Enable this to instead perform a reference-based consensus. A reference must be provided.",
+                    "description": "Enable reference guided assembly instead of de-novo assembly."
                 },
                 "reference": {
                     "type": "string",
                     "format": "file-path",
-                    "demo_data": "${projectDir}/test_data/ref",
                     "description": "Reference sequence FASTA file.",
-                    "help_text": "If provided this will be used to perform variant calling with Medaka. If not, the de novo assembly will be performed with Flye and Medaka."
+                    "help_text": "The reference sequence is used when performing reference-based assembly and may be used as a benchmark for de-novo assemblies."
                 },
-                "evaluate_assemblies": {
+                "summarise_assemblies": {
                     "type": "boolean",
-                    "description": "Evaluate the assemblies and compare metagenome assemblies based on alignments to close references",
-                    "help_text": "Will run MetaQUAST on the output assemblies and output quality stats."
+                    "hidden": true,
+                    "description": "Hidden for now because report code needs updating. Produce summary statistics and compare to standard reference databases.",
+                    "help_text": "Runs MetaQUAST with default parameters. Assemblies will be compared to the 16S SILVA database for analysis of sequence content."
                 },
                 "medaka_model": {
                     "type": "string",
-                    "default": "r941_prom_variant_g360",
+                    "default": "r941_min_hac_g507",
                     "description": "Medaka model name",
                     "help_text": "The correct Medaka model to use is determined by the Guppy basecaller version, see [Medaka Models](https://github.com/nanoporetech/medaka#models) for more information."
                 },
@@ -170,7 +170,7 @@
         },
         "wfversion": {
             "type": "string",
-            "default": "v0.2.7",
+            "default": "v0.2.8",
             "hidden": true
         },
         "monochrome_logs": {