Skip to content

Commit

Permalink
Merge branch 'prototyping' into 'dev'
Browse files Browse the repository at this point in the history
awfullll

See merge request epi2melabs/workflow-containers/wf-mpx!1
  • Loading branch information
mattdmem committed Jun 1, 2022
2 parents 9fbbd4b + 2bb37ff commit f232742
Show file tree
Hide file tree
Showing 52 changed files with 34,045 additions and 81 deletions.
2 changes: 1 addition & 1 deletion .gitlab-ci.yml
Expand Up @@ -8,4 +8,4 @@ variables:
# The workflow should define `--out_dir`, the CI template sets this.
# Only common file inputs and option values need to be given here
# (not things such as -profile)
NF_WORKFLOW_OPTS: "--fastq test_data/reads.fastq.gz"
NF_WORKFLOW_OPTS: "--fastq test_data/fastq/barcode01"
50 changes: 1 addition & 49 deletions CHANGELOG.md
Expand Up @@ -4,54 +4,6 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v0.2.0]
### Added
- default process label parameter
- Added `params.wf.example_cmd` list to populate `--help`
### Changed
- Update WorkflowMain.groovy to provide better `--help`

## [v0.1.0]
### Changed
- `sample_name` to `sample_id` throughout to mathc MinKNOW samplesheet.
### Added
- Singularity profile include in base config.
- Numerous other changes that have been lost to the mists of time.

## [v0.0.7]
### Added
- Fastqingress module for common handling of (possibly
multiplexed) inputs.
- Optimized container size through removal of various
conda cruft.
### Changed
- Use mamba by default for building conda environments.
- Cut down README to items specific to workflow.
### Fixed
- Incorrect specification of conda environment file in Nextflow config.

## [v0.0.6]
### Changed
- Explicitely install into base conda env

## [v0.0.5]
### Added
- Software versioning report example.

## [v0.0.4]
### Changed
- Version bump to test CI.

## [v0.0.3]
### Changed
- Moved all CI to templates.
- Use canned aplanat report components.

## [v0.0.2]
### Added
- CI release checks.
- Create pre-releases in CI from dev branch.

## [v0.0.1]

First release.
First release
6 changes: 6 additions & 0 deletions Dockerfile
Expand Up @@ -7,6 +7,12 @@ RUN \
. $CONDA_DIR/etc/profile.d/mamba.sh \
&& micromamba activate \
&& micromamba install -n base --file $HOME/environment.yaml \
# run medaka to download some models needed for variant calling
&& medaka tools download_models \
--models r103_fast_variant_g507 r103_hac_variant_g507 r103_prom_variant_g3210 r103_sup_variant_g507 \
r941_min_fast_variant_g507 r941_min_hac_variant_g507 r941_min_sup_variant_g507 \
r941_prom_fast_variant_g507 r941_prom_hac_variant_g507 r941_prom_sup_variant_g507 \
r941_prom_variant_g303 r941_prom_variant_g322 r941_prom_variant_g360 \
&& micromamba clean --all --yes \
&& fix-permissions $CONDA_DIR \
&& fix-permissions $HOME \
Expand Down
255 changes: 254 additions & 1 deletion bin/report.py
Expand Up @@ -2,10 +2,221 @@
"""Create workflow report."""

import argparse
import math

from aplanat import bars
from aplanat.components import fastcat
from aplanat.components import simple as scomponents
from aplanat.report import WFReport
from aplanat.util import ont_colors
from bokeh.models import BasicTickFormatter, ColumnDataSource, LabelSet
from bokeh.plotting import figure
import pandas as pd
import pysam
import vcf


def load_fasta(reference):
"""Load reference data."""
fasta = pysam.Fastafile(reference)
return fasta


def load_vcf(vcf_file):
"""Load VCF data."""
vcf_reader = vcf.Reader(filename=vcf_file)

# define our data structure
data = dict(
chromosome=list(),
start=list(),
end=list(),
quality=list(),
reference=list(),
alternate=list(),
is_indel=list()
)

info_fields = ['DP', 'DPS']

for info_field in info_fields:
data[info_field] = list()

# for records in VCF file add to our structure
for record in vcf_reader:
for alt in record.ALT:
data['chromosome'].append(record.CHROM)
data['start'].append(record.POS)
data['end'].append(record.POS+len(alt))
data['quality'].append(record.QUAL)
ref = record.REF
alt = str(alt)
if len(ref) > 10:
ref = f"{ref[0:10]}...({len(ref)})"
data['reference'].append(ref)
if len(alt) > 10:
alt = f"{alt[0:10]}...({len(alt)})"
data['alternate'].append(alt)
data['is_indel'].append(record.is_indel)

for info_field in info_fields:
field_data = record.INFO[info_field]
print(type(field_data))
if isinstance(field_data, list):
field_data = ",".join(str(x) for x in field_data)
data[info_field].append(field_data)

df = pd.DataFrame.from_dict(data)
df['width'] = df['end']-df['start']

return df


def make_coverage_section(coverage_file, variants_data, report_doc):
"""Make the coverage section."""
print(variants_data)
coverage = pd.read_csv(coverage_file, sep="\t", header=0)

section = report_doc.add_section()
section._add_item("""<h3>Genome coverage</h3>
<p>The plot below shows coverage of the whole genome. Dots represent
single nucleotide polymorphisms called by medaka. Bars represent
insertions and deletions called by medaka.</p>""")

max_coverage = max(coverage['depth'])

p = figure(
x_axis_label='position',
y_axis_label='depth',
width=1000,
height=300)

# add a line for the depth of coverage
p.line(
coverage['pos'],
coverage['depth'],
line_color=ont_colors.BRAND_BLUE)

# add a point for the snps
p.circle(
variants_data[~variants_data.is_indel]['start'],
y=5,
size=5,
color=ont_colors.BRAND_GREY)

# add a bar for the indels
p.hbar(
left=variants_data[variants_data.is_indel]['start'],
y=6,
right=variants_data[variants_data.is_indel]['end'],
height=max_coverage/10,
color=ont_colors.BRAND_LIGHT_BLUE)

p.xaxis.formatter = BasicTickFormatter(use_scientific=False)

section.plot(p)


def get_context(variant, fasta):
"""Get the sequence context around the variant of interest."""
start = variant[0]
seq = fasta.fetch(
reference=fasta.references[0], start=start-1, end=start+1)

if len(variant[1]) > 1:
return "NA"
base = fasta.fetch(
reference=fasta.references[0], start=start, end=start+1)

return f"""{seq}>{variant[1]}{base}"""


def make_variants_context(variants_data, reference, report):
"""Make variants context section."""
section = report.add_section()
section._add_item("""<h3>Variant context</h3>
<p>APOBEC3 host enzyme action has been implicated in the mutational process
of MPX. As described in <a href="https://tinyurl.com/2fawchvu">this</a>
<a href="https://virological.org">virological.org</a> post by Áine O’Toole
& Andrew Rambaut.<p>""")

variants_data['context'] = [
get_context(x, reference) for x in zip(
variants_data['start'], variants_data['alternate'])]

summary = variants_data.context.value_counts().to_frame().set_axis(
['Count'], axis=1, inplace=False)

summary.index.name = 'Variant'
summary.reset_index(inplace=True)
summary = summary[summary.Variant != "NA"]

variants_context_plot = bars.simple_bar(
summary['Variant'], summary['Count'])
variants_context_plot.xaxis.major_label_orientation = math.pi/4
variants_context_plot.xaxis.axis_label = 'Variant'
variants_context_plot.yaxis.axis_label = 'Count'
section.plot(variants_context_plot)


def make_variants_table(variants_data, report):
"""Make the variants table."""
section = report.add_section()
section._add_item("""<h3>All variants</h3>
<p>Variants with respect to the reference sequence as called by medaka
.</p>""")
section.table(
variants_data,
index=False,
th_color=ont_colors.BRAND_BLUE,
paging=True,
searchable=False)
pass


def make_assembly_summary(bed, report):
"""Make a plot for the assembly."""
section = report.add_section()
section._add_item("""<h3>Flye Assembly</h3>
<p>Quick assembly of the reads. Below the contigs are displayed as mapped
to the reference chosen for analysis.</p>""")

contigs = pd.read_csv(bed, sep='\t', header=0).set_axis(
['referece', 'start', 'end', 'name', 'qual', 'strand'],
axis=1,
inplace=False)

print(contigs)

p = figure(
x_axis_label='position',
y_axis_label='contig',
x_range=(-1000, 200000),
width=1000,
height=300)

p.hbar(
left=contigs.start,
right=contigs.end,
y=contigs.index,
height=1,
color=ont_colors.BRAND_BLUE)
contigs['text_position'] = (contigs.start-1000)
labels = LabelSet(
x='text_position',
y='index',
text='name',
text_align='right',
text_baseline='middle',
source=ColumnDataSource(contigs),
text_color=ont_colors.BRAND_BLUE)
p.add_layout(labels)

section.plot(p)


def main():
Expand All @@ -16,6 +227,18 @@ def main():
parser.add_argument(
"--versions", required=True,
help="directory containing CSVs containing name,version.")
parser.add_argument(
"--variants", required=True,
help="medaka variants VCF file")
parser.add_argument(
"--reference", required=True,
help="reference fasta file")
parser.add_argument(
"--coverage", required=True,
help="depth of coverage file")
parser.add_argument(
"--assembly_bed", required=True,
help="bed file of assembly mapped to reference")
parser.add_argument(
"--params", default=None, required=True,
help="A JSON file containing the workflow parameter key/values")
Expand All @@ -28,11 +251,41 @@ def main():
args = parser.parse_args()

report = WFReport(
"Workflow Template Sequencing report", "wf-template",
"wf-mpx Sequencing Report", "wf-mpx",
revision=args.revision, commit=args.commit)

section = report.add_section()
section.markdown(""" ### Preamble
This workflow is intended to produce a __draft__ consensus Monkeypox virus
genome from Oxford Nanopore Technologies Sequencing data.
The rough outline of this workflow:
* Map reads to a reference genome
* Call variants, using medaka, with respect to this reference
* Create a consensus using these called variants
* Attempt assembly with flye polsihed with medaka
Consensus is masked ('N') in regions of <20x coverage, deletions are
represented as "-" and insertions are in lower case. The consensus __will
require manual review__.""")

report.add_section(
section=fastcat.full_report(args.summaries))

variants = load_vcf(args.variants)

make_coverage_section(args.coverage, variants, report)

reference = load_fasta(args.reference)

make_variants_context(variants, reference, report)

make_variants_table(variants, report)

make_assembly_summary(args.assembly_bed, report)

report.add_section(
section=scomponents.version_table(args.versions))
report.add_section(
Expand Down
Binary file added data/.DS_Store
Binary file not shown.

0 comments on commit f232742

Please sign in to comment.