Skip to content

Commit

Permalink
Merge branch 'fix-manglement' into 'dev'
Browse files Browse the repository at this point in the history
Fix manglement

See merge request epi2melabs/workflows/wf-bacterial-genomes!50
  • Loading branch information
sarahjeeeze committed Dec 1, 2022
2 parents 5b48b41 + 00c2ae7 commit de15477
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 75 deletions.
12 changes: 4 additions & 8 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,18 @@ docker-run:
parallel:
matrix:
- MATRIX_NAME: [
"reference-based", "de-novo", "evaluate-assemblies"]
"de-novo", "reference-based"]
rules:
- if: ($CI_COMMIT_BRANCH == null || $CI_COMMIT_BRANCH == "dev-template")
when: never
- if: $MATRIX_NAME == "reference-based"
variables:
NF_WORKFLOW_OPTS: "--fastq test_data/fastq --reference_based_assembly --evaluate_assemblies --reference test_data/ref/reference.subseq.fa.gz --threads 4 --chunk_size 100000"
NF_IGNORE_PROCESSES: "deNovo,assemblyStats,runProkka"
- if: $MATRIX_NAME == "de-novo"
variables:
NF_WORKFLOW_OPTS: "--fastq test_data/fastq --threads 4 --chunk_size 100000"
NF_IGNORE_PROCESSES: "medakaVariant"
- if: $MATRIX_NAME == "evaluate-assemblies"
- if: $MATRIX_NAME == "reference-based"
variables:
NF_WORKFLOW_OPTS: "--fastq test_data/fastq --threads 4 --chunk_size 100000 --evaluate_assemblies"
NF_IGNORE_PROCESSES: "medakaVariant"
NF_WORKFLOW_OPTS: "--fastq test_data/fastq --reference_based_assembly --reference test_data/ref/reference.subseq.fa.gz --threads 4 --chunk_size 100000"
NF_IGNORE_PROCESSES: "deNovo,assemblyStats,runProkka"


# reminder: update AUX_IMAGE_TAG if the aux container package versions are changed
Expand Down
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v0.2.8]
- Output QUAST stats for both reference-based and de novo assemblies

## [v0.2.7]
### Changes
- Replace QUAST with MetaQUAST
Expand All @@ -13,7 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `-profile conda` is no longer supported, users should use `-profile standard` (Docker) or `-profile singularity` instead
- Docs update
### Added
- `nextflow run epi2me-labs/wf-human-variation --version` will now print the workflow version number and exit
- `nextflow run epi2me-labs/wf-bacterial-genomes --version` will now print the workflow version number and exit
### Fixes
- Prokka only runs in denovo assembly mode
- Tidy up report code
Expand Down
97 changes: 52 additions & 45 deletions bin/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import io
import os
import re
import sys

from aplanat import report
from aplanat.components import bcfstats
Expand Down Expand Up @@ -67,15 +68,13 @@ def bp_to_mb(input_num):
return round(input_num / 1000000, 2)


def run_qc_stats(
def get_quant_stats(
sample_names,
read_stats_dir,
read_stats_suffix,
quast_path,
flye_dir,
flye_suffix,
quast_path
):
"""Run QC stats."""
"""Get Quast Stats."""
# read stats
read_stats = collate_stats(
read_stats_dir, sample_names, read_stats_suffix)
Expand All @@ -98,9 +97,6 @@ def run_qc_stats(
read_stats_out.columns = [
'Read count', 'Median read length (bp)', 'Mean read quality',
'Read data (Mb)']

# quast stats

quast_raw_data = pd.read_csv(
os.path.join(quast_path, "transposed_report.tsv"),
sep='\t',
Expand All @@ -109,15 +105,25 @@ def run_qc_stats(
'.medaka', '', regex=True)
quast_keep_cols = [12, 13, 14, 15]
quast_filtered_data = quast_raw_data.iloc[:, quast_keep_cols].copy()
# Get flye stats
quast_filtered_data.iloc[:, [1, 2, 3]] = bp_to_mb(
quast_filtered_data.iloc[:, [1, 2, 3]])
quast_filtered_data.columns = [
'# contigs',
'Largest contig (Mb)',
'Total length (Mb)',
'Reference length (Mb)']
quant_stats = pd.merge(
read_stats_out, quast_filtered_data, left_index=True,
right_index=True)
return quant_stats


def get_flye_stats(
sample_names,
flye_dir,
flye_suffix
):
"""Get Flye stats."""
# flye stats
flye_stats = collate_stats(
flye_dir, sample_names, flye_suffix)
Expand All @@ -128,14 +134,7 @@ def run_qc_stats(
flye_circular = get_circular_stats(flye_stats)
flye_out = pd.concat([flye_cov_mean, flye_circular['Y']], axis=1)
flye_out.columns = ['Mean contig coverage', '# circular contigs']
# Merge
merged = pd.merge(
read_stats_out, quast_filtered_data, left_index=True,
right_index=True).merge(
flye_out, left_index=True, right_index=True)
merged.index.name = None

return merged
return flye_out


def run_species_stats(species_stats_path, sample_names):
Expand Down Expand Up @@ -218,7 +217,7 @@ def gather_sample_files(sample_names, denovo_mode, prokka_mode):
pass
else:
final_files[name] = 'None'
raise FileNotFoundError(
sys.err.write(
'Missing {0} required for report for: {1}'.format(
name, sample_name))
sample_files[sample_name] = final_files
Expand Down Expand Up @@ -253,45 +252,53 @@ def main():
"Bacterial Genomes Summary Report",
("Results generated through the wf-bacterial-genomes Nextflow "
"workflow provided by Oxford Nanopore Technologies"))
quant_stats = get_quant_stats(
sample_names=args.sample_ids,
read_stats_dir="stats",
read_stats_suffix=".stats",
quast_path="quast_stats")
if not args.denovo:
report_doc.add_section().markdown(
"Analysis was completed using an alignment with the provided "
"reference and medaka was used for variant calling")
quant_stats.index.name = None
stats_table = quant_stats
else:
report_doc.add_section().markdown(
"As no reference was provided the reads were assembled"
" and corrected using flye and Medaka")
merged_stats_df = run_qc_stats(
sample_names=args.sample_ids,
read_stats_dir="stats",
read_stats_suffix=".stats",
quast_path="quast_stats",
flye_dir="flye_stats",
flye_stats = get_flye_stats(
sample_names=args.sample_ids, flye_dir="flye_stats",
flye_suffix="_flye_stats.tsv")
section = report_doc.add_section()

section.markdown("## Run summary statistics")
section.markdown("* * *")
section.markdown("#### Read and assembly statistics")
merged = pd.merge(
quant_stats, flye_stats, left_index=True,
right_index=True)
merged.index.name = None
stats_table = merged

section.markdown(
"This section displays the read and assembly QC"
" statistics for all the samples in the run.")
section = report_doc.add_section()
section.markdown("## Run summary statistics")
section.markdown("* * *")
section.markdown("#### Read and assembly statistics")

section.table(merged_stats_df, index=True)
section = report_doc.add_section()
section.markdown("#### Species ID")
section.markdown(
"This section displays the read and assembly QC"
" statistics for all the samples in the run.")

section.markdown(
"This section displays the Species ID as determined by 16S."
" The table shows the percentage match of the 16S sequence"
" to the best match of the SILVA 16S database.")

species_stats = run_species_stats(
species_stats_path="quast_stats/quast_downloaded_references",
sample_names=args.sample_ids)
section.table(species_stats, index=True)
section.markdown('<br/>')
section.table(stats_table, index=True)
section = report_doc.add_section()
section.markdown("#### Species ID")

section.markdown(
"This section displays the Species ID as determined by 16S."
" The table shows the percentage match of the 16S sequence"
" to the best match of the SILVA 16S database.")

species_stats = run_species_stats(
species_stats_path="quast_stats/quast_downloaded_references",
sample_names=args.sample_ids)
section.table(species_stats, index=True)
section.markdown('<br/>')

sample_files = gather_sample_files(
args.sample_ids,
Expand Down
13 changes: 4 additions & 9 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ process makeReport {
path "wf-bacterial-genomes-*.html"
script:
report_name = "wf-bacterial-genomes-report.html"
denovo = params.reference == null ? "--denovo" : ""
denovo = params.reference_based_assembly as Boolean ? "" : "--denovo"
prokka = params.run_prokka as Boolean ? "--prokka" : ""
samples = sample_ids.join(" ")
// NOTE: the script assumes the various subdirectories
Expand All @@ -277,7 +277,7 @@ process makeReport {
--versions versions \
--params params.json \
--output $report_name \
--sample_ids $samples \
--sample_ids $samples
"""
}

Expand Down Expand Up @@ -335,13 +335,8 @@ workflow calling_pipeline {
consensus = medakaConsensus(hdfs_grouped)

// post polishing, do assembly specific things
if (params.evaluate_assemblies){
log.info("Evaluating assemblies, set evaluate_assemblies param to False to skip.")
assem_stats = assemblyStats(consensus.collect({it -> it[1]}))
} else {
log.info("Not evaluating assemblies. Enable with --evaluate_assemblies true")
assem_stats = Channel.empty()
}
assem_stats = assemblyStats(consensus.collect({it -> it[1]}))

if (!params.reference_based_assembly){
flye_info = denovo_assem.map { it -> it[2] }
}else{
Expand Down
6 changes: 3 additions & 3 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@ params {
chunk_size = 1000000
run_prokka = true
prokka_opts = null
wfversion = "v0.2.7"
wfversion = "v0.2.8"
aws_image_prefix = null
aws_queue = null
sample = null
sample_sheet = null
disable_ping = false
reference_based_assembly = false
evaluate_assemblies = true
summarise_assemblies = true

monochrome_logs = false
validate_params = true
Expand All @@ -47,7 +47,7 @@ manifest {
description = 'Workflow to analyse bacterial genomes'
mainScript = 'main.nf'
nextflowVersion = '>=20.10.0'
version = 'v0.2.7'
version = 'v0.2.8'
}

epi2melabs {
Expand Down
18 changes: 9 additions & 9 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,24 +21,24 @@
},
"reference_based_assembly": {
"type": "boolean",
"help_text": "By default de novo assembly will be performed with Flye. Enable this option if you have an appropriate reference for reference-based assembly. If enabled you must provide a reference.",
"description": "Enable reference guided assembly instead of de novo assembly."
"help_text": "By default de-novo assembly will be performed with Flye. Enable this to instead perform a reference-based consensus. A reference must be provided.",
"description": "Enable reference guided assembly instead of de-novo assembly."
},
"reference": {
"type": "string",
"format": "file-path",
"demo_data": "${projectDir}/test_data/ref",
"description": "Reference sequence FASTA file.",
"help_text": "If provided this will be used to perform variant calling with Medaka. If not, the de novo assembly will be performed with Flye and Medaka."
"help_text": "The reference sequence is used when performing reference-based assembly and may be used as a benchmark for de-novo assemblies."
},
"evaluate_assemblies": {
"summarise_assemblies": {
"type": "boolean",
"description": "Evaluate the assemblies and compare metagenome assemblies based on alignments to close references",
"help_text": "Will run MetaQUAST on the output assemblies and output quality stats."
"hidden": true,
"description": "Hidden for now because report code needs updating. Produce summary statistics and compare to standard reference databases.",
"help_text": "Runs MetaQUAST with default parameters. Assemblies will be compared to the 16S SILVA database for analysis of sequence content."
},
"medaka_model": {
"type": "string",
"default": "r941_prom_variant_g360",
"default": "r941_min_hac_g507",
"description": "Medaka model name",
"help_text": "The correct Medaka model to use is determined by the Guppy basecaller version, see [Medaka Models](https://github.com/nanoporetech/medaka#models) for more information."
},
Expand Down Expand Up @@ -170,7 +170,7 @@
},
"wfversion": {
"type": "string",
"default": "v0.2.7",
"default": "v0.2.8",
"hidden": true
},
"monochrome_logs": {
Expand Down

0 comments on commit de15477

Please sign in to comment.