Skip to content

Commit

Permalink
Merge branch 'report' into 'dev'
Browse files Browse the repository at this point in the history
Report

See merge request epi2melabs/workflow-containers/wf-hap-snps!8
  • Loading branch information
cjw85 committed Mar 3, 2021
2 parents d38234d + 06e2c3b commit 2177265
Show file tree
Hide file tree
Showing 6 changed files with 353 additions and 11 deletions.
54 changes: 47 additions & 7 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@ conda-run:
- *install-nextflow
- *install-conda
script:
- OUTPUT="snp-calling"
- ./nextflow run workflow.nf \
- OUTPUT=${CI_PROJECT_NAME}
- ./nextflow run main.nf \
-w ${OUTPUT}/workspace
-profile conda
--reads test_data/subset.fa.gz --reference test_data/reference.subseq.fa.gz
--fastq test_data/subset.fa.gz --reference test_data/reference.subseq.fa.gz
--threads 4 --out_dir ${OUTPUT}
only:
- branches


build-image:
Expand All @@ -42,11 +44,12 @@ build-image:
- docker build --no-cache -t "${TAG}" -f Dockerfile . --build-arg BASEIMAGE=${BASEIMAGE}
# run letting nextflow orchestrate the containers
- docker tag "${TAG}" "${DOCKERHUB_NAMESPACE}/${CI_PROJECT_NAME}:latest"
- OUTPUT="snp-calling"
- ./nextflow run workflow.nf \
- OUTPUT=${CI_PROJECT_NAME}
- ./nextflow run main.nf \
-w ${OUTPUT}/workspace
-profile standard
--reads test_data/subset.fa.gz --reference test_data/reference.subseq.fa.gz
--wfversion latest
--fastq test_data/subset.fa.gz --reference test_data/reference.subseq.fa.gz
--threads 4 --out_dir ${OUTPUT}
# push
- if [[ ${CI_COMMIT_BRANCH} == 'dev' ]]; then
Expand All @@ -60,7 +63,7 @@ build-image:
artifacts:
paths:
# Add the output directory for the test
- "snp-calling"
- ${CI_PROJECT_NAME}
expire_in: 1 day


Expand Down Expand Up @@ -94,6 +97,43 @@ release-hub:
script:
- echo ${DOCKERHUB_TOKEN} | docker login --username epi2melabs --password-stdin
- RELTAG="${DOCKERHUB_NAMESPACE}/${CI_PROJECT_NAME}:${CI_COMMIT_TAG}"
- LATEST="${DOCKERHUB_NAMESPACE}/${CI_PROJECT_NAME}:latest"
- echo "Pushing ${RELTAG}"
- docker tag ${SHATAG} ${RELTAG}
- docker push ${RELTAG}
- docker tag ${SHATAG} ${LATEST}
- docker push ${LATEST}

# Send all tags matching vX.Y.Z to github (code and release)
push-github:
stage: release
image: $GH_PUSH_IMAGE
before_script:
- apt-get update -qq && apt-get install -y -qq
git python3-all-dev git-lfs python3-venv
- mkdir ~/.ssh/
- cp $LABS_BOT_GH_KEY ~/.ssh/id_rsa && chmod 600 ~/.ssh/id_rsa
- echo -e "Host github.com\n\tStrictHostKeyChecking no\n\tHostname ssh.github.com\n\tPort 443\n\n" > ~/.ssh/config
- git config --global user.email "epi2melabs@nanoporetech.com"
- git config --global user.name "epi2melabs-bot"
script:
# Push master and tag to github
- git remote add ont ${CI_REPOSITORY_URL} || true
- git remote add github git@github.com:epi2me-labs/${CI_PROJECT_NAME}.git || true
- git fetch ont dev --tags
- git fetch ont master
- git fetch github master || echo "No 'master' branch present on github"
- git checkout --track github/master || git checkout master
- git merge ${CI_COMMIT_TAG}
- git push github master
- git push github ${CI_COMMIT_TAG}
# Make a github release page
- python3 -m venv release_env --prompt "(release) "
- source release_env/bin/activate
- pip install pip --upgrade
- pip install git+https://github.com/epi2me-labs/github_release.git
- github_release ${CI_PROJECT_NAME} ${CI_COMMIT_TAG} CHANGELOG.md ${LABS_BOT_GH_TOKEN}
only:
- /^v[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+$/
except:
- branches
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Changelog
All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v0.0.1]

Initial release

### Added
- Basic running of medaka variant calling and report.
47 changes: 47 additions & 0 deletions bin/parse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import re
import sys

import pandas as pd

def split_blocks(fname):
"""Split lines of a file into blocks based on comment lines."""
comment = list()
data = list()
state = None
with open(fname, 'r') as fh:
for line in fh.readlines():
if line.startswith('#'):
if state != 'comment' and state is not None:
yield comment, data
comment, data = list(), list()
state = 'comment'
comment.append(line.strip('# ').rstrip())
else:
state = 'data'
data.append(line.rstrip())
yield comment, data


def parse_bcftools_stats(fname):
tables = dict()
with open(fname, 'r') as fh:
for comment, data in split_blocks(fname):
fields = [x.rstrip() for x in re.split('\[\d+\]', comment[-1])]
section, fields = fields[0], fields[1:]
rows = list()
for d in data:
items = d.split('\t')
if items[0] != section:
raise ValueError("first data field not equal to section key")
rows.append(items[1:])
tables[section] = pd.DataFrame(rows, columns=fields)
# now some special handling
SN = tables['SN']
SN['key'] = SN['key'].apply(lambda x: x.replace('number of ', '').rstrip(':'))
tables['SN'] = pd.DataFrame(SN.pivot(index='id', columns='key', values='value').to_records())
return tables


if __name__ == "__main__":
tables = parse_bcftools_stats(sys.argv[1])
print(tables['SN'])
239 changes: 239 additions & 0 deletions bin/report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
#!/usr/bin/env python

import argparse
import glob
import numpy as np
import pandas as pd

from bokeh.layouts import gridplot, layout
from bokeh.models import Panel, Tabs
from bokeh.models.formatters import NumeralTickFormatter
import aplanat
from aplanat import annot, bars, gridplot, hist, lines, points, report, spatial

from parse import parse_bcftools_stats


def main():
parser = argparse.ArgumentParser()
parser.add_argument("depth", help="Depth summary file.")
parser.add_argument("summary", help="Read statistics summary file.")
parser.add_argument("align_summary", help="Align statistics summary file.")
parser.add_argument("bcf_stats", help="Output of bcftools stats")
parser.add_argument("output", help="Report output filename")
args = parser.parse_args()

report_doc = report.HTMLReport(
"Haploid variant calling Summary Report",
"Results generated through the wf-hap-snp nextflow workflow provided by Oxford Nanopore Technologies")

report_doc.markdown('''
### Read Quality control
This section displays basic QC metrics indicating read data quality.
''')

np_blue = '#0084A9'
np_dark_grey = '#455560'
np_light_blue = '#90C6E7'

# read length summary
seq_summary = pd.read_csv(args.summary, sep='\t')
total_bases = seq_summary['read_length'].sum()
mean_length = total_bases / len(seq_summary)
median_length = np.median(seq_summary['read_length'])
datas = [seq_summary['read_length']]
length_hist = hist.histogram(
datas, colors=[np_blue], bins=100,
title="Read length distribution.",
x_axis_label='Read Length / bases',
y_axis_label='Number of reads',
xlim=(0, None))
length_hist = annot.subtitle(
length_hist,
"Mean: {:.0f}. Median: {:.0f}".format(
mean_length, median_length))

datas = [seq_summary['acc']]
mean_q, median_q = np.mean(datas[0]), np.median(datas[0])
q_hist = hist.histogram(
datas, colors=[np_blue], bins=100,
title="Read quality (wrt reference sequence)",
x_axis_label="Read Quality",
y_axis_label="Number of reads",
xlim=(85, 100))
q_hist = annot.subtitle(
q_hist,
"Mean: {:.0f}. Median: {:.0f}".format(
mean_q, median_q))

report_doc.plot(gridplot([[length_hist, q_hist]]))

report_doc.markdown('''
### Genome coverage
Plots below indicate depth of coverage of the coloured by amplicon pool.
For adequate variant calling depth should be at least 50X in any region.
Forward reads are shown in light-blue, reverse reads are dark grey.
''')
df = pd.read_csv(args.depth, sep='\t')
plots_orient = list()
plots_cover = list()
depth_lim = 50
for sample in sorted(df['rname'].unique()):
bc = df['rname'] == sample
depth = df[bc].groupby('pos')['depth'].sum()
depth_thresh = 100*(depth >= depth_lim).sum() / len(depth)

# fwd/rev
data = df[bc].groupby('pos').sum().reset_index() # Is this necessary?
xs = [data['pos'], data['pos']]
ys = [data['depth_fwd'], data['depth_rev']]
plot = points.points(
xs, ys, colors=[np_light_blue, np_dark_grey],
title="{}: {:.0f}X, {:.1f}% > {}X".format(
sample, depth.mean(), depth_thresh, depth_lim),
height=300, width=800,
x_axis_label='position', y_axis_label='depth',
ylim=(0,300))
plots_orient.append(plot)

# cumulative coverage
xs = [data['depth'].sort_values(ascending=False)]
ys = [np.linspace(1, 100, len(data))]
plot = lines.line(
xs, ys, colors=[np_light_blue],
title="{}: {:.0f}X, {:.1f}% > {}X".format(
sample, depth.mean(), depth_thresh, depth_lim),
height=300, width=800,
x_axis_label='Coverage / bases',
y_axis_label='%age of reference')
plots_cover.append(plot)


tab1 = Panel(
child=gridplot(plots_orient, ncols=1), title="Coverage traces")
tab2 = Panel(
child=gridplot(plots_cover, ncols=1), title="Proportions covered")
cover_panel = Tabs(tabs=[tab1, tab2])
report_doc.plot(cover_panel)

# Footer section
report_doc.markdown('''
### Variant Calls
This section summarises information from the VCF file found within the workflow output directory.
#### Summary
The following tables and figures are derived from the output of `bcftools stats`.
''')

vcf_tables = parse_bcftools_stats(args.bcf_stats)
report_doc.markdown("""
**Variant counts:**
Barcoded samples are represented independently
""")
df = vcf_tables['SN'].rename(columns={'id':'sample'}) \
.drop(columns='samples').set_index('sample').transpose()
report_doc.table(df, index=True)
report_doc.markdown("**Transitions and tranversions:**")
df = vcf_tables['TSTV'].rename(columns={'id':'sample'}) \
.set_index('sample').transpose()
report_doc.table(df, index=True)
report_doc.markdown("""
**Substitution types**
Base substitutions aggregated across all samples (symmetrised by pairing)
""")

sim_sub = {
'G>A': 'C>T', 'G>C': 'C>G', 'G>T': 'C>A',
'T>A': 'A>T', 'T>C': 'A>G', 'T>G': 'A>C'}
def canon_sub(sub):
b1 = sub[0]
if b1 not in {'A', 'C'}:
return canon_sub(sim_sub[sub])
else:
return b1, sub[2]

df = vcf_tables['ST']
df['canon_sub'] = df['type'].apply(canon_sub)
df['original'] = df['canon_sub'].apply(lambda x: x[0])
df['substitution'] = df['canon_sub'].apply(lambda x: x[1])
df['count'] = df['count'].astype(int)
df = df[['original', 'substitution', 'count']] \
.groupby(['original', 'substitution']) \
.agg(count=pd.NamedAgg(column='count', aggfunc='sum')) \
.reset_index()

from bokeh.models import ColorBar, LinearColorMapper
from bokeh.palettes import Blues9
from bokeh.plotting import figure
colors = Blues9[::-1]
mapper = LinearColorMapper(
palette=colors, low=min(df['count']), high=max(df['count']))
p = figure(
y_range=['C', 'A'], x_range=['A', 'C', 'G', 'T'],
x_axis_location="above",
x_axis_label='alternative base',
y_axis_label='reference base',
tools="save", toolbar_location='below',
output_backend="webgl",
height=225, width=300,
tooltips=[('sub', '@original>@substitution'), ('count', '@count')])
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.rect(
source=df, y="original", x="substitution", width=1, height=1,
fill_color={'field': 'count', 'transform': mapper},
line_color=None)
color_bar = ColorBar(
title='', color_mapper=mapper, label_standoff=10,
location=(0, 0))
#p.add_layout(color_bar, 'right')

report_doc.plot(p)

report_doc.markdown("""
**Indel lengths**
Insertion and deletion lengths aggregated across all samples.
""")
df = vcf_tables['IDD']
df['nlength'] = df['length (deletions negative)'].astype(int)
df['count'] = df['number of sites'].astype(int)
# pad just to pull out axes by a minimum
pad = pd.DataFrame({'nlength':[-10,+10], 'count':[0,0]})
counts = df.groupby('nlength') \
.agg(count=pd.NamedAgg(column='count', aggfunc='sum')) \
.reset_index().append(pad)
plot = hist.histogram(
[counts['nlength']], weights=[counts['count']],
colors = [np_light_blue], binwidth=1,
title='Insertion and deletion variant lengths',
x_axis_label='Length / bases (deletions negative)',
y_axis_label='Count')
#plot.xaxis.formatter = NumeralTickFormatter(format="0,0")
report_doc.plot(plot)




# Footer section
report_doc.markdown('''
### About
**Oxford Nanopore Technologies products are not intended for use for health assessment
or to diagnose, treat, mitigate, cure or prevent any disease or condition.**
This report was produced using the [epi2me-labs/wf-hap-snp](https://github.com/epi2me-labs/wf-hap-snp).
The workflow can be run using `nextflow epi2me-labs/wf-hap-snp --help`
---
''')

# write report
report_doc.write("summary_report.html")

if __name__ == "__main__":
main()

0 comments on commit 2177265

Please sign in to comment.