Merge branch 'prototyping' into 'dev'

awfullll See merge request epi2melabs/workflow-containers/wf-mpx!1
epi2me-labs · Jun 1, 2022 · f232742 · f232742
2 parents 9fbbd4b + 2bb37ff
commit f232742
Show file tree

Hide file tree

Showing 52 changed files with 34,045 additions and 81 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -8,4 +8,4 @@ variables:
     # The workflow should define `--out_dir`, the CI template sets this.
     # Only common file inputs and option values need to be given here
     # (not things such as -profile)
-    NF_WORKFLOW_OPTS: "--fastq test_data/reads.fastq.gz"
+    NF_WORKFLOW_OPTS: "--fastq test_data/fastq/barcode01"
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,54 +4,6 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [v0.2.0]
-### Added
-- default process label parameter
-- Added `params.wf.example_cmd` list to populate `--help`
-### Changed
-- Update WorkflowMain.groovy to provide better `--help`
-
-## [v0.1.0]
-### Changed
-- `sample_name` to `sample_id` throughout to mathc MinKNOW samplesheet.
-### Added
-- Singularity profile include in base config.
-- Numerous other changes that have been lost to the mists of time.
-
-## [v0.0.7]
-### Added
-- Fastqingress module for common handling of (possibly
-  multiplexed) inputs.
-- Optimized container size through removal of various
-  conda cruft.
-### Changed
-- Use mamba by default for building conda environments.
-- Cut down README to items specific to workflow.
-### Fixed
-- Incorrect specification of conda environment file in Nextflow config.
-
-## [v0.0.6]
-### Changed
-- Explicitely install into base conda env
-
-## [v0.0.5]
-### Added
-- Software versioning report example.
-
-## [v0.0.4]
-### Changed
-- Version bump to test CI.
-
-## [v0.0.3]
-### Changed
-- Moved all CI to templates.
-- Use canned aplanat report components.
-
-## [v0.0.2]
-### Added
-- CI release checks.
-- Create pre-releases in CI from dev branch.
-
 ## [v0.0.1]
 
-First release.
+First release
diff --git a/Dockerfile b/Dockerfile
@@ -7,6 +7,12 @@ RUN \
     . $CONDA_DIR/etc/profile.d/mamba.sh \
     && micromamba activate \
     && micromamba install -n base --file $HOME/environment.yaml \
+    # run medaka to download some models needed for variant calling
+    && medaka tools download_models \
+        --models r103_fast_variant_g507 r103_hac_variant_g507 r103_prom_variant_g3210 r103_sup_variant_g507 \
+                 r941_min_fast_variant_g507 r941_min_hac_variant_g507 r941_min_sup_variant_g507 \
+                 r941_prom_fast_variant_g507 r941_prom_hac_variant_g507 r941_prom_sup_variant_g507 \
+                 r941_prom_variant_g303 r941_prom_variant_g322 r941_prom_variant_g360 \
     && micromamba clean --all --yes \
     && fix-permissions $CONDA_DIR \
     && fix-permissions $HOME \

diff --git a/bin/report.py b/bin/report.py
@@ -2,10 +2,221 @@
 """Create workflow report."""
 
 import argparse
+import math
 
+from aplanat import bars
 from aplanat.components import fastcat
 from aplanat.components import simple as scomponents
 from aplanat.report import WFReport
+from aplanat.util import ont_colors
+from bokeh.models import BasicTickFormatter, ColumnDataSource, LabelSet
+from bokeh.plotting import figure
+import pandas as pd
+import pysam
+import vcf
+
+
+def load_fasta(reference):
+    """Load reference data."""
+    fasta = pysam.Fastafile(reference)
+    return fasta
+
+
+def load_vcf(vcf_file):
+    """Load VCF data."""
+    vcf_reader = vcf.Reader(filename=vcf_file)
+
+    # define our data structure
+    data = dict(
+        chromosome=list(),
+        start=list(),
+        end=list(),
+        quality=list(),
+        reference=list(),
+        alternate=list(),
+        is_indel=list()
+    )
+
+    info_fields = ['DP', 'DPS']
+
+    for info_field in info_fields:
+        data[info_field] = list()
+
+    # for records in VCF file add to our structure
+    for record in vcf_reader:
+        for alt in record.ALT:
+            data['chromosome'].append(record.CHROM)
+            data['start'].append(record.POS)
+            data['end'].append(record.POS+len(alt))
+            data['quality'].append(record.QUAL)
+            ref = record.REF
+            alt = str(alt)
+            if len(ref) > 10:
+                ref = f"{ref[0:10]}...({len(ref)})"
+            data['reference'].append(ref)
+            if len(alt) > 10:
+                alt = f"{alt[0:10]}...({len(alt)})"
+            data['alternate'].append(alt)
+            data['is_indel'].append(record.is_indel)
+
+            for info_field in info_fields:
+                field_data = record.INFO[info_field]
+                print(type(field_data))
+                if isinstance(field_data, list):
+                    field_data = ",".join(str(x) for x in field_data)
+                data[info_field].append(field_data)
+
+    df = pd.DataFrame.from_dict(data)
+    df['width'] = df['end']-df['start']
+
+    return df
+
+
+def make_coverage_section(coverage_file, variants_data, report_doc):
+    """Make the coverage section."""
+    print(variants_data)
+    coverage = pd.read_csv(coverage_file, sep="\t", header=0)
+
+    section = report_doc.add_section()
+    section._add_item("""<h3>Genome coverage</h3>
+
+    <p>The plot below shows coverage of the whole genome. Dots represent
+    single nucleotide polymorphisms called by medaka. Bars represent
+    insertions and deletions called by medaka.</p>""")
+
+    max_coverage = max(coverage['depth'])
+
+    p = figure(
+            x_axis_label='position',
+            y_axis_label='depth',
+            width=1000,
+            height=300)
+
+    # add a line for the depth of coverage
+    p.line(
+            coverage['pos'],
+            coverage['depth'],
+            line_color=ont_colors.BRAND_BLUE)
+
+    # add a point for the snps
+    p.circle(
+        variants_data[~variants_data.is_indel]['start'],
+        y=5,
+        size=5,
+        color=ont_colors.BRAND_GREY)
+
+    # add a bar for the indels
+    p.hbar(
+        left=variants_data[variants_data.is_indel]['start'],
+        y=6,
+        right=variants_data[variants_data.is_indel]['end'],
+        height=max_coverage/10,
+        color=ont_colors.BRAND_LIGHT_BLUE)
+
+    p.xaxis.formatter = BasicTickFormatter(use_scientific=False)
+
+    section.plot(p)
+
+
+def get_context(variant, fasta):
+    """Get the sequence context around the variant of interest."""
+    start = variant[0]
+    seq = fasta.fetch(
+        reference=fasta.references[0], start=start-1, end=start+1)
+
+    if len(variant[1]) > 1:
+        return "NA"
+    base = fasta.fetch(
+        reference=fasta.references[0], start=start, end=start+1)
+
+    return f"""{seq}>{variant[1]}{base}"""
+
+
+def make_variants_context(variants_data, reference, report):
+    """Make variants context section."""
+    section = report.add_section()
+    section._add_item("""<h3>Variant context</h3>
+
+    <p>APOBEC3 host enzyme action has been implicated in the mutational process
+    of MPX. As described in <a href="https://tinyurl.com/2fawchvu">this</a>
+    <a href="https://virological.org">virological.org</a> post by Áine O’Toole
+    & Andrew Rambaut.<p>""")
+
+    variants_data['context'] = [
+        get_context(x, reference) for x in zip(
+            variants_data['start'], variants_data['alternate'])]
+
+    summary = variants_data.context.value_counts().to_frame().set_axis(
+        ['Count'], axis=1, inplace=False)
+
+    summary.index.name = 'Variant'
+    summary.reset_index(inplace=True)
+    summary = summary[summary.Variant != "NA"]
+
+    variants_context_plot = bars.simple_bar(
+        summary['Variant'], summary['Count'])
+    variants_context_plot.xaxis.major_label_orientation = math.pi/4
+    variants_context_plot.xaxis.axis_label = 'Variant'
+    variants_context_plot.yaxis.axis_label = 'Count'
+    section.plot(variants_context_plot)
+
+
+def make_variants_table(variants_data, report):
+    """Make the variants table."""
+    section = report.add_section()
+    section._add_item("""<h3>All variants</h3>
+
+    <p>Variants with respect to the reference sequence as called by medaka
+    .</p>""")
+    section.table(
+            variants_data,
+            index=False,
+            th_color=ont_colors.BRAND_BLUE,
+            paging=True,
+            searchable=False)
+    pass
+
+
+def make_assembly_summary(bed, report):
+    """Make a plot for the assembly."""
+    section = report.add_section()
+    section._add_item("""<h3>Flye Assembly</h3>
+
+    <p>Quick assembly of the reads. Below the contigs are displayed as mapped
+    to the reference chosen for analysis.</p>""")
+
+    contigs = pd.read_csv(bed, sep='\t', header=0).set_axis(
+            ['referece', 'start', 'end', 'name', 'qual', 'strand'],
+            axis=1,
+            inplace=False)
+
+    print(contigs)
+
+    p = figure(
+            x_axis_label='position',
+            y_axis_label='contig',
+            x_range=(-1000, 200000),
+            width=1000,
+            height=300)
+
+    p.hbar(
+        left=contigs.start,
+        right=contigs.end,
+        y=contigs.index,
+        height=1,
+        color=ont_colors.BRAND_BLUE)
+    contigs['text_position'] = (contigs.start-1000)
+    labels = LabelSet(
+        x='text_position',
+        y='index',
+        text='name',
+        text_align='right',
+        text_baseline='middle',
+        source=ColumnDataSource(contigs),
+        text_color=ont_colors.BRAND_BLUE)
+    p.add_layout(labels)
+
+    section.plot(p)
 
 
 def main():
@@ -16,6 +227,18 @@ def main():
     parser.add_argument(
         "--versions", required=True,
         help="directory containing CSVs containing name,version.")
+    parser.add_argument(
+        "--variants", required=True,
+        help="medaka variants VCF file")
+    parser.add_argument(
+        "--reference", required=True,
+        help="reference fasta file")
+    parser.add_argument(
+        "--coverage", required=True,
+        help="depth of coverage file")
+    parser.add_argument(
+        "--assembly_bed", required=True,
+        help="bed file of assembly mapped to reference")
     parser.add_argument(
         "--params", default=None, required=True,
         help="A JSON file containing the workflow parameter key/values")
@@ -28,11 +251,41 @@ def main():
     args = parser.parse_args()
 
     report = WFReport(
-        "Workflow Template Sequencing report", "wf-template",
+        "wf-mpx Sequencing Report", "wf-mpx",
         revision=args.revision, commit=args.commit)
 
+    section = report.add_section()
+    section.markdown(""" ### Preamble
+
+    This workflow is intended to produce a __draft__ consensus Monkeypox virus
+    genome from Oxford Nanopore Technologies Sequencing data.
+
+    The rough outline of this workflow:
+
+    * Map reads to a reference genome
+    * Call variants, using medaka, with respect to this reference
+    * Create a consensus using these called variants
+    * Attempt assembly with flye polsihed with medaka
+
+    Consensus is masked ('N') in regions of <20x coverage, deletions are
+    represented as "-" and insertions are in lower case. The consensus __will
+    require manual review__.""")
+
     report.add_section(
         section=fastcat.full_report(args.summaries))
+
+    variants = load_vcf(args.variants)
+
+    make_coverage_section(args.coverage, variants, report)
+
+    reference = load_fasta(args.reference)
+
+    make_variants_context(variants, reference, report)
+
+    make_variants_table(variants, report)
+
+    make_assembly_summary(args.assembly_bed, report)
+
     report.add_section(
         section=scomponents.version_table(args.versions))
     report.add_section(

diff --git a/data/.DS_Store b/data/.DS_Store