From f44758ef41e0bdbb6fdc2a70ec288ac04e54acae Mon Sep 17 00:00:00 2001 From: Sarah Griffiths Date: Wed, 9 Aug 2023 18:33:38 +0000 Subject: [PATCH] CW-2260 ezcharts report --- CHANGELOG.md | 9 +- bin/workflow_glue/bokeh_plot.py | 196 +++++++ bin/workflow_glue/report.py | 488 +++++++----------- .../report_utils/report_utils.py | 158 ++++++ bin/workflow_glue/run_plannotate.py | 4 +- main.nf | 9 +- nextflow.config | 6 +- 7 files changed, 565 insertions(+), 305 deletions(-) create mode 100755 bin/workflow_glue/bokeh_plot.py create mode 100755 bin/workflow_glue/report_utils/report_utils.py diff --git a/CHANGELOG.md b/CHANGELOG.md index c676f23..5fe30a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,14 +4,15 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [unreleased] +## [v0.5.0] +### Changed +- The report has been updated and re-ordered to improve usability. +- Default basecaller cfg is now `dna_r10.4.1_e8.2_400bps_sup@v4.2.0`. +- Docker will use an ARM platform image on appropriate devices. ### Fixed - Updated basecaller cfg model options. - Workflow will still output report if there are no assemblies. -### Changed -- Default basecaller cfg is now `dna_r10.4.1_e8.2_400bps_sup@v4.2.0`. - ## [v0.4.0] ### Added - Documentation updated to include workflow steps. diff --git a/bin/workflow_glue/bokeh_plot.py b/bin/workflow_glue/bokeh_plot.py new file mode 100755 index 0000000..3618e13 --- /dev/null +++ b/bin/workflow_glue/bokeh_plot.py @@ -0,0 +1,196 @@ +"""Script from plannotate edited to work with bokeh 3.""" +from math import pi + +from bokeh.models import ColumnDataSource, HoverTool, Range1d, WheelZoomTool +from bokeh.models.annotations import Label +from bokeh.plotting import figure +import numpy as np +import pandas as pd +from plannotate.bokeh_plot import calc_glyphs, calc_level, calc_num_markers +import plannotate.resources as rsc + + +def get_bokeh(df, linear=False): + """Get bokeh from plannotate updated to use Bokeh v3 to match ezcharts.""" + # df = df.fillna("") + x = 0 + y = 0 + baseradius = .18 + tooltips = """ + @Feature — @Type @pi_permatch_int
+ @Description + """ # noqa: E501 + hover = HoverTool(tooltips=tooltips) + plotsize = .35 + plotdimen = 800 + x_range = Range1d(-plotsize, plotsize, bounds=(-.5, .5), min_interval=.1) + y_range = Range1d(-plotsize, plotsize, bounds=(-.5, .5), min_interval=.1) + toolbar = "right" + p = figure( + height=plotdimen, width=plotdimen, title="", + toolbar_location=toolbar, toolbar_sticky=False, + match_aspect=True, + sizing_mode='scale_width', tools=['save', 'pan'], + x_range=x_range, y_range=y_range) + + # x_range=(-plotsize, plotsize), y_range=(-plotsize, plotsize)) + p.toolbar.logo = None + p.add_tools(WheelZoomTool(zoom_on_axis=False)) + p.toolbar.active_scroll = p.select_one(WheelZoomTool) + + # backbone line + p.circle( + x=x, y=y, radius=baseradius, line_color="#000000", + fill_color=None, line_width=2.5) + + df = calc_level(df) + + if linear: + line_length = baseradius / 5 + p.line( + [0, 0], [baseradius - line_length, baseradius + line_length], + line_width=4, level="overlay", line_color="black") + + df['pi_permatch_int'] = df['pi_permatch'].astype('int') + + df['pi_permatch_int'] = df['pi_permatch_int'].astype(str) + "%" + # removes percent from infernal hits + df.loc[df['db'] == "Rfam", 'pi_permatch_int'] = "" + + df['rstart'] = ((df["qstart"]/df["qlen"])*2*pi) + df['rend'] = ((df["qend"]/df["qlen"])*2*pi) + df['rstart'] = np.where( + df['rstart'] < 0, df['rstart'] + (2*pi), df['rstart']) + df['rend'] = np.where(df['rend'] < 0, df['rend'] + (2*pi), df['rend']) + df['rend'] = np.where( + df['rend'] < df['rstart'], df['rend'] + (2*pi), df['rend']) + + df['Type'] = df['Type'].str.replace('rep_origin', 'origin of replication') + + # DDE0BD + # C97064 + # C9E4CA + fullcolordf = pd.read_csv( + rsc.get_resource("data", "colors.csv"), index_col=0) + fragcolordf = fullcolordf.copy() + fragcolordf[['fill_color', 'line_color']] = fragcolordf[ + ['line_color', 'fill_color']] + fragcolordf["fill_color"] = "#ffffff" + + full = df[df["fragment"] == False] # noqa + + full = full.merge( + fullcolordf, + how="left", on=["Type"]) + full['legend'] = full['Type'] + full = full.fillna( + {"color": "grey", + "fill_color": "#808080", + "line_color": "#000000"}) + frag = df[df["fragment"]] + frag = frag.merge(fragcolordf, how="left", on=["Type"]) + frag = frag.fillna( + {"color": "grey", + "fill_color": "#ffffff", + "line_color": "#808080"}) + + df = full.append(frag).reset_index(drop=True) + + # add orientation column + orient = pd.read_csv( + rsc.get_resource("data", "feature_orientation.csv"), + header=None, names=["Type", "has_orientation"]) + orient['Type'] = orient['Type'] + orient['has_orientation'] = orient['has_orientation'].map({"T": True}) + df = df.merge(orient, on="Type", how="left") + df['Type'] = df['Type'].str.replace("_", " ") + df['has_orientation'] = df['has_orientation'].fillna(value=False) + df[[ + 'x', 'y', "Lx1", "Ly1", + "annoLineColor", "lineX", + "lineY", "theta", "text_align"]] = df.apply(calc_glyphs, axis=1) + df['legend'] = df['Type'] + # allowedtypes = ['CDS',"promoter","origin of replication","swissprot"] + allowedtypes = fullcolordf['Type'] + mask = ~df['legend'].isin(allowedtypes) + df.loc[mask, 'legend'] = 'misc feature' + + # plot annotations + source = ColumnDataSource(df) + hover_labels = p.patches( + 'x', 'y', fill_color='fill_color', line_color='line_color', + name="features", line_width=2.5, + source=source, legend_group="legend") + p.multi_line( + xs="lineX", ys="lineY", line_color="annoLineColor", + line_width=3, level="overlay", + line_cap='round', alpha=.5, 
source=source) + + # `text_align` cannot read from `source` -- have to do this workaround + right = ColumnDataSource(df[df['text_align'] == 'right']) + left = ColumnDataSource(df[df['text_align'] == 'left']) + bcenter = ColumnDataSource(df[df['text_align'] == 'b_center']) + tcenter = ColumnDataSource(df[df['text_align'] == 't_center']) + + text_level = 'overlay' + p.text( + x="Lx1", y="Ly1", name="2", x_offset=3, y_offset=8, + text_align="left", + text='Feature', level=text_level, source=right) + p.text( + x="Lx1", y="Ly1", name="2", x_offset=-5, y_offset=8, + text_align="right", text='Feature', level=text_level, source=left) + p.text( + x="Lx1", y="Ly1", name="2", x_offset=0, y_offset=15, + text_align="center", text='Feature', + level=text_level, source=bcenter) + p.text( + x="Lx1", y="Ly1", name="2", x_offset=0, y_offset=0, + text_align="center", + text='Feature', level=text_level, source=tcenter) + + # calculate chunk size(s) for drawing lines + plaslen = df.iloc[0]['qlen'] + ticks = calc_num_markers(plaslen) + ticks_cds = ColumnDataSource(ticks) + p.multi_line( + xs="lineX", ys="lineY", line_color="black", line_width=2, + level="underlay", line_cap='round', + alpha=.5, source=ticks_cds) + + right = ColumnDataSource(ticks[ticks['text_align'] == 'right']) + left = ColumnDataSource(ticks[ticks['text_align'] == 'left']) + bcenter = ColumnDataSource(ticks[ticks['text_align'] == 'b_center']) + tcenter = ColumnDataSource(ticks[ticks['text_align'] == 't_center']) + p.text( + x="Lx1", y="Ly1", name="2", x_offset=3, y_offset=6, + text_align="left", text='bp', alpha=.5, + text_font_size='size', level=text_level, source=right) + p.text( + x="Lx1", y="Ly1", name="2", x_offset=-5, + y_offset=6, text_align="right", + text='bp', alpha=.5, text_font_size='size', + level=text_level, source=left) + p.text( + x="Lx1", y="Ly1", name="2", x_offset=0, y_offset=15, + text_align="center", text='bp', alpha=.5, + text_font_size='size', level=text_level, source=bcenter) + p.text( + x="Lx1", y="Ly1", name="2", x_offset=0, y_offset=-3, + text_align="center", text='bp', alpha=.5, + text_font_size='size', level=text_level, source=tcenter) + p.add_tools(hover) + p.add_layout(Label( + x=0, y=0, name="2", x_offset=0, y_offset=-8, + text_align="center", + text=f"{plaslen} bp", text_color="#7b7b7b", + text_font_size='16px', level=text_level)) + p.hover.renderers = [hover_labels] + p.axis.axis_label = None + p.axis.visible = False + p.grid.grid_line_color = "#EFEFEF" + p.outline_line_color = "#DDDDDD" + p.legend.location = 'bottom_left' + p.legend.border_line_color = "#EFEFEF" + p.legend.visible = True + return p diff --git a/bin/workflow_glue/report.py b/bin/workflow_glue/report.py index 9f87de2..9a9e88d 100755 --- a/bin/workflow_glue/report.py +++ b/bin/workflow_glue/report.py @@ -1,174 +1,111 @@ #!/usr/bin/env python - -"""Report component for displaying information from wf-clone-validation.""" -import argparse +"""Create workflow report.""" +from itertools import count, takewhile import json import os - -from aplanat import bars, report -from aplanat.components import bcfstats -from aplanat.components import fastcat -from aplanat.components import simple as scomponents -import aplanat.graphics -from aplanat.util import Colors -from bokeh.layouts import layout -from bokeh.models import Panel, Tabs +from dominate.tags import p, pre +from dominate.util import raw +from ezcharts.components import bokehchart +from ezcharts.components.fastcat import draw_all_plots +from ezcharts.components.reports import labs +from 
ezcharts.layout.snippets import Stats, Tabs +from ezcharts.layout.snippets.table import DataTable +from ezcharts.plots.__init__ import BokehPlot +from ezcharts.plots.util import read_files +from ezcharts.util import get_named_logger # noqa: ABS101 import pandas as pd -from plannotate.bokeh_plot import get_bokeh -from .util import wf_parser # noqa: ABS101 +from workflow_glue.bokeh_plot import get_bokeh +from workflow_glue.report_utils import report_utils +from .util import wf_parser # noqa: ABS101 -def tidyup_status_file(status_sheet, annotations): - """Tidy up the sample status file.""" - sample_status = pd.read_csv(status_sheet[0], header=None) - unique_samples = sample_status[0].unique() - pass_fail_dic = {} - for sample in unique_samples: - pass_fail_dic[sample] = 'Completed successfully' - filter_pass = sample_status[sample_status[1] != 'Completed successfully'] - failures = dict(zip(filter_pass[0], filter_pass[1])) - completed_annotations = list(annotations) - success = sample_status[sample_status[1] == 'Completed successfully'] - no_annotations = success[~success[0].isin(completed_annotations)] - for sample in list(no_annotations[0]): - failures[sample] = 'Completed but no annotations found in the database' - all_sample_names = unique_samples.tolist() - all_sample_names.sort() - passed_list = unique_samples.tolist() - for k, v in failures.items(): - pass_fail_dic[k] = v - if v != 'Completed but failed to reconcile': - passed_list.remove(k) - passed_list.sort() - status_df = pd.DataFrame( - pass_fail_dic.items(), columns=['Sample', 'pass/failed reason']) - sort_df = status_df['Sample'].astype(str).argsort() - status_df = status_df.iloc[sort_df] - return (status_df, passed_list, all_sample_names, pass_fail_dic) +THEME = 'epi2melabs' -def read_files(summaries, sep='\t'): - """Read a set of files and join to single dataframe.""" - dfs = list() - for fname in sorted(summaries): - dfs.append(pd.read_csv(fname, sep=sep)) - return pd.concat(dfs) +def sliced(seq, n): + """Slice up a string.""" + iterator = takewhile(len, (seq[i: i + n] for i in count(0, n))) + slices = list(iterator) + return slices -def fastcat_report_tab(file_name, tab_name): - """Read fastcat dataframe and create a tab with qual and len plots.""" - df = pd.read_csv(file_name, sep='\t') - depth = len(df.index) - min_length = df["read_length"].min() - max_length = df["read_length"].max() - lengthplot = fastcat.read_length_plot( - df, - min_len=min_length, - max_len=max_length) - qstatplot = fastcat.read_quality_plot(df) - exec_summary = aplanat.graphics.InfoGraphItems() - exec_summary.append( - 'No. 
reads',
-        str(depth),
-        "bars", '')
-    exec_plot = aplanat.graphics.infographic(
-        exec_summary.values(), ncols=1)
-    tab = Panel(child=layout(
-        [[exec_plot], [lengthplot], [qstatplot]],
-        aspect_ratio="auto",
-        sizing_mode='stretch_width'),
-        title=tab_name)
-    return tab
+def format_msa(inserts):
+    """Format the MSA."""
+    collect_sliced = []
+    name_list = []
+    reorg_slices = []
+    for i in list(inserts['msa']):
+        name_list.append(i[0:10])
+        collect_sliced.append(sliced(i[10:], 80))
+    for slice_index in range(len(collect_sliced[0])):
+        for index, value in enumerate(collect_sliced):
+            reorg_slices.append(name_list[index] + value[slice_index])
+        reorg_slices.append('')
+    return reorg_slices
-def create_fastcat_dic(sample_names, raw, hostfilt, downsampled):
-    """Create dictionary using sample names and fastcat files available."""
-    per_sample_dic = {}
-    lists = {'raw': raw, 'hostfilt': hostfilt, 'downsampled': downsampled}
-    for sample in sample_names:
-        new_dic = {}
-        item_search = '/' + sample + '.'
-        for list_name, fc_list in lists.items():
-            # find index of item that contains the sample name as a substring
-            item_index = [i for i, s in enumerate(fc_list) if item_search in s]
-            if item_index:
-                indice = item_index[0]
-                new_dic[list_name] = fc_list[indice]
-            else:
-                pass
-        per_sample_dic[sample] = new_dic
-    return per_sample_dic
+def button_format(reason):
+    """Format button, blue for success, red for failure."""
+    if reason == "Completed successfully":
+        return """<span class="badge bg-primary">""" + reason + """</span>"""
+    else:
+        return """<span class="badge bg-danger">""" + reason + """</span>"""
 def main(args):
-    """Entry point to create a wf-clone-validation report."""
-    report_doc = report.WFReport(
-        "Clone Validation Report",
-        "wf-clone-validation",
-        revision=args.revision,
-        commit=args.commit)
-
-    # summary section
-    section = report_doc.add_section()
-    section.markdown("### Summary")
-    seq_summary = read_files(args.per_barcode_stats)
-    section = report_doc.add_section()
-    barcode_counts = (
-        pd.DataFrame(seq_summary['sample_name'].value_counts())
-        .sort_index()
-        .reset_index()
-        .rename(
-            columns={'index': 'sample', 'sample_name': 'count'})
-    )
-    bc_counts = bars.simple_bar(
-        barcode_counts['sample'].astype(str), barcode_counts['count'],
-        colors=[Colors.cerulean]*len(barcode_counts),
-        title=(
-            'Number of reads per barcode'),
-        plot_width=None
-    )
-    bc_counts.xaxis.major_label_orientation = 3.14/2
-    section.plot(
-        layout(
-            [[bc_counts]],
-            sizing_mode="stretch_width"))
-
-    # We defer this until processing through all the samples in the loop below
-    summary_placeholder = report_doc.add_section(key='stats_table')
+    """Run the entry point."""
+    logger = get_named_logger("Report")
+    report = labs.LabsReport(
+        "Clone validation report", "wf-clone-validation-new",
+        args.params, args.versions)
     plannotate_annotations = json.load(open(args.plannotate_json)).keys()
-    lengths = json.load(open(args.lengths))
-    pass_fail = tidyup_status_file(args.status, plannotate_annotations)
-    # find inserts and put in dir
-    current_directory = os.getcwd()
-    make_directory = os.path.join(current_directory, r'inserts')
-    if not os.path.exists(make_directory):
-        os.makedirs(make_directory)
-    if os.stat(args.inserts_json).st_size != 0:
-        inserts = json.load(open(args.inserts_json))
-        insert_placeholder = report_doc.add_section(key='insert_table')
-        seq_segment = inserts['bed_df']
-        seq_segment = pd.read_json(inserts['bed_df'])
-        insert_placeholder.markdown("""
-### Insert sequences
-This table shows which primers were found in the consensus sequence
-of each sample and where the inserts were found.
-""")
-    section = report_doc.add_section()
-    section.markdown("""
-### Multiple sequence alignment
-This section shows the inserts aligned with each other or a reference
-sequence if provided.
-""")
-    section.markdown("<pre>" + os.linesep.join(inserts['msa']) + "</pre>
") - - # Per sample details - section = report_doc.add_section() - section.markdown(""" -### Assemblies -For each assembly, read length statistics are displayed as well as a -[pLannotate plot](https://github.com/barricklab/pLannotate), -and quality plots. + lengths = json.load(open(args.lengths)) + sample_stats = [] + for item in sample_names: + try: + stats_table = [item] + [int(lengths[item]['reflen'])] + except KeyError: + stats_table = [item] + ['N/A'] + sample_stats.append(stats_table) + stats_df = pd.DataFrame( + sample_stats, columns=["Sample", "Length"]) + status_df = pd.DataFrame( + sample_status_dic.items(), columns=['Sample', 'pass/failed reason']) + sort_df = status_df['Sample'].astype(str).argsort() + status_df = status_df.iloc[sort_df] + merged = pd.merge(status_df, stats_df) + merged.to_csv('sample_status.txt', index=False) + if ('assembly_quality/OPTIONAL_FILE' not in args.assembly_quality): + qc_df = read_files(args.assembly_quality)[ + ['sample_name', 'mean_quality']] + qc_df = qc_df.rename(columns={ + 'sample_name': 'Sample', 'mean_quality': 'Mean Quality'}) + qc_df = qc_df.reset_index(drop=True) + merged = pd.merge(merged, qc_df, how="outer") + merged.fillna('N/A', inplace=True) + merged['pass/failed reason'] = merged.apply( + lambda merged: button_format(merged['pass/failed reason']), + axis=1) + DataTable.from_pandas(merged, use_index=False) + with report.add_section("Plannotate", "Plannotate"): + raw("""The Plasmid annotation plot and feature table are produced using \ +pLannotate""") + p(""" +A pLannotate plot is shown for each assembly. A feature table provides descriptions of the annotated sequence. Unfilled features on the plannotate plots are incomplete features; the sequence @@ -179,170 +116,135 @@ def main(args): small feature fragments may affect plasmid function if they result in cryptic gene expression or are inadvertently combined with other elements during later cloning steps. - -The Plasmid annotation plot and feature table are produced using -[Plannotate](http://plannotate.barricklab.org/). 
""") - passed_samples = pass_fail[1] - host_ref_stats = args.host_filter_stats - downsampled_stats = args.downsampled_stats - initial_stats = args.per_barcode_stats - if ('host_filter_stats/OPTIONAL_FILE' in host_ref_stats): - host_filt = host_ref_stats.remove('host_filter_stats/OPTIONAL_FILE') - if host_filt is None: - host_filt = [] - else: - host_filt = host_ref_stats - if ('host_filter_stats/OPTIONAL_FILE' in downsampled_stats): - summary_stats = downsampled_stats.remove( - 'downsampled_stats/OPTIONAL_FILE') - if summary_stats is None: - summary_stats = [] - else: - summary_stats = downsampled_stats - sample_names = pass_fail[2] - fast_cat_dic = create_fastcat_dic( - sample_names, initial_stats, host_filt, summary_stats) - plannotate = json.load(open(args.plannotate_json)) - sample_stats = [] - - # stats graphs and plannotate where appropriate - for item in sample_names: - section = report_doc.add_section() - section.markdown('### Sample: {}'.format(str(item))) - if item in passed_samples: - section.markdown('*{}*'.format(pass_fail[3][item])) - tup_dic = plannotate[item] - annotations = pd.read_json(tup_dic['annotations']) - plot = get_bokeh(pd.read_json(tup_dic['plot']), False) - fast_cat_tabs = fast_cat_dic[item] - alltabs = [] - for key, value in fast_cat_tabs.items(): - alltabs.append(fastcat_report_tab(value, key)) - cover_panel = Tabs(tabs=alltabs) - plasmidplot = [plot] - stats_table = [item] + [int(lengths[item]['reflen'])] - sample_stats.append(stats_table) - section.plot(layout( - [[cover_panel, plasmidplot]], - sizing_mode='scale_width')) - section.table(annotations.drop( - columns=['Plasmid length']), - index=False, key="table"+str(item)) - else: - section.markdown('*{}*'.format(pass_fail[3][item])) - fast_cat_tabs = fast_cat_dic[item] - alltabs = [] - for key, value in fast_cat_tabs.items(): - alltabs.append(fastcat_report_tab(value, key)) - cover_panel = Tabs(tabs=alltabs) - section.plot(cover_panel) - try: - stats_table = [item] + [int(lengths[item]['reflen'])] - except KeyError: - stats_table = [item] + ['N/A'] - sample_stats.append(stats_table) - - # high level sample status table - summary_placeholder.markdown("### Sample status") - stats_df = pd.DataFrame( - sample_stats, columns=["Sample", "Length"]) - merged_inner = pd.merge(pass_fail[0], stats_df) - summary_placeholder.table(merged_inner, index=False, key='stats_table') - merged_inner.to_csv('sample_status.txt', index=False) - - def insert_len(start, end, length): - if end >= start: - insert_length = end - start - else: - insert_length = (length - start) + end - return insert_length - + # Load and plot per sample plannotate json + plannotate = json.load(open(args.plannotate_json)) + tabs = Tabs() + with tabs.add_dropdown_menu(): + for item in passed_samples: + plan_item = plannotate[item] + annotations = pd.read_json(plan_item['annotations']) + with tabs.add_dropdown_tab(str(item)): + bk_plot = BokehPlot() + bk_plot._fig = get_bokeh(pd.read_json(plan_item['plot'])) + bk_plot._fig.xgrid.grid_line_color = None + bk_plot._fig.ygrid.grid_line_color = None + bokehchart.BokehChart( + bk_plot, 'epi2melabs', + width='100%', height='100%') + DataTable.from_pandas(annotations, use_index=False) + # Per barcode read count plot + report_utils.read_count_barplot(args.per_barcode_stats, report) + with report.add_section("Read stats", "Read stats"): + p(""" +For each assembly, read length statistics and plots of quality \ +before and after downsampling, and after host filtering \ +if a host reference was provided. 
+""") + fastcat_dic = report_utils.create_fastcat_dic( + sample_names, args.per_barcode_stats, + args.host_filter_stats, args.downsampled_stats) + tabs = Tabs() + with tabs.add_dropdown_menu(): + for item in sample_names: + with tabs.add_dropdown_tab(str(item)): + p(sample_status_dic[item]) + internal_tabs = Tabs() + # For each sample, plot fastcat stats + fastcat_tabs = fastcat_dic[item] + for key, value in fastcat_tabs.items(): + with internal_tabs.add_tab(key): + seq_summary = pd.read_csv(value, sep='\t') + depth = len(seq_summary.index) + Stats( + columns=2, + items=[ + (depth, 'Read count'), + ]) + draw_all_plots(seq_summary, THEME) + # Insert info table if os.stat(args.inserts_json).st_size != 0: - test_df = seq_segment.merge(stats_df, how='left') - test_df['Insert length'] = list( - map( - insert_len, - test_df['start'], - test_df['end'], - test_df['Length'])) - test_df = test_df.drop(columns='Length') - insert_placeholder.table(test_df, index=False, key='insert_table') - # Assembly QC section - section = report_doc.add_section() - section.markdown('### Assembly Quality') - if 'assembly_quality/OPTIONAL_FILE' not in args.assembly_quality: - section.markdown( - 'This table gives the mean quality for the whole \ - plasmid assembly. This is provided by Medaka, derived \ - from aligning the reads to the consensus.') - qual_df = read_files( - args.assembly_quality)[['sample_name', 'mean_quality']] - qual_df = qual_df.rename(columns={ - 'sample_name': 'Sample', 'mean_quality': 'Mean Quality'}) - section.table(qual_df, index=False) - if 'qc_inserts/OPTIONAL_FILE' not in args.qc_inserts: - bcf_header = """ -### Insert variant call summaries - -The following tables and figures are derived using `bcftools`, -comparing the consensus insert with the reference -insert if provided. -""" - report_doc.add_section( - section=bcfstats.full_report(args.qc_inserts, header=bcf_header)) - # Versions and params - report_doc.add_section( - section=scomponents.version_table(args.versions)) - report_doc.add_section( - section=scomponents.params_table(args.params)) - report_doc.write(args.report_name) + inserts = json.load(open(args.inserts_json)) + with report.add_section("Insert sequences", "Inserts"): + seq_segment = pd.read_json(inserts['bed_df']) + insert_df = seq_segment.merge(stats_df, how='left') + insert_df['Insert length'] = list( + map( + report_utils.insert_len, + insert_df['start'], + insert_df['end'], + insert_df['Length'])) + insert_df = insert_df.drop(columns='Length') + p(""" +This table shows which primers were found in the consensus sequence +of each sample and where the inserts were found. +""") + DataTable.from_pandas(insert_df, use_index=False) + # MSA section if inserts available + with report.add_section("Multiple Sequence Alignment", "MSA"): + p(""" +This section shows the inserts aligned with each other or a reference +sequence if provided. +""") + formatted_msa = format_msa(inserts) + pre(os.linesep.join(formatted_msa)) + if ('OPTIONAL_FILE' not in os.listdir(args.qc_inserts)): + with report.add_section("Insert variants", "Insert variants"): + p(""" +The following tables and figures are output from bcftools from +finding any variants between +the consensus insert and the provided reference +insert. 
+""") + variants_df = report_utils.variant_counts_table(args.qc_inserts, report) + DataTable.from_pandas(variants_df, use_index=False) + trans_df = report_utils.trans_counts(args.qc_inserts, report) + DataTable.from_pandas(trans_df, use_index=False) + report.write(args.report) + logger.info(f"Report written to {args.report}.") def argparser(): """Argument parser for entrypoint.""" parser = wf_parser("report") - + parser.add_argument("report", help="Report output file") parser.add_argument( - "--downsampled_stats", - nargs='+', - required=True - ) + "--stats", nargs='*', help="Fastcat per-read stats file(s).") + parser.add_argument( + "--versions", required=True, + help="directory containing CSVs containing name,version.") + parser.add_argument( + "--params", default=None, required=True, + help="A JSON file containing the workflow parameter key/values") parser.add_argument( "--revision", default='unknown', - help="revision number") + help="git branch/tag of the executed workflow") parser.add_argument( "--commit", default='unknown', - help="git commit number") - parser.add_argument( - "--status", nargs='+', - help="status") + help="git commit of the executed workflow") parser.add_argument( "--per_barcode_stats", nargs='+', help="fastcat stats file for each sample before filtering") parser.add_argument( - "--host_filter_stats", nargs='+', - help="fastcat stats file after host filtering") - parser.add_argument( - "--params", default=None, - help="A csv containing the parameter key/values") - parser.add_argument( - "--versions", - help="directory contained CSVs containing name,version.") + "--status", nargs='+', + help="status") parser.add_argument( "--plannotate_json", help="Plannotate Json.") parser.add_argument( "--inserts_json", help="inserts Json.") - parser.add_argument( - "--report_name", - help="report name") parser.add_argument( "--lengths", help="report name") parser.add_argument( - "--qc_inserts", nargs='+', + "--downsampled_stats", + nargs='+') + parser.add_argument( + "--host_filter_stats", nargs='+', + help="fastcat stats file after host filtering") + parser.add_argument( + "--qc_inserts", help="insert vcfs") parser.add_argument( "--assembly_quality", nargs='+', @@ -351,5 +253,5 @@ def argparser(): if __name__ == "__main__": - args = argparse().parse_args() - main(args) + parser = argparser() + main(parser.parse_args()) diff --git a/bin/workflow_glue/report_utils/report_utils.py b/bin/workflow_glue/report_utils/report_utils.py new file mode 100755 index 0000000..6fdc5eb --- /dev/null +++ b/bin/workflow_glue/report_utils/report_utils.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python +"""Create tables for the report.""" + +from dominate.tags import p +from ezcharts.components import bcfstats +from ezcharts.components.ezchart import EZChart +from ezcharts.plots.categorical import barplot +from ezcharts.plots.util import read_files +import pandas as pd + +THEME = 'epi2melabs' + + +def tidyup_status_file(status_sheet, annotations): + """Tidy up the sample status file.""" + sample_status = pd.read_csv(status_sheet[0], header=None) + unique_samples = sample_status[0].unique() + sample_status_dic = {} + # Default all to success + for sample in unique_samples: + sample_status_dic[sample] = 'Completed successfully' + filter_pass = sample_status[sample_status[1] != 'Completed successfully'] + # Collect failures + failures = dict(zip(filter_pass[0], filter_pass[1])) + completed_annotations = list(annotations) + # If no failure for a sample status sheet then success + success = 
sample_status[sample_status[1] == 'Completed successfully'] + # Check for corresponding annotations, if none update status + no_annotations = success[~success[0].isin(completed_annotations)] + for sample in list(no_annotations[0]): + failures[sample] = 'Completed but no annotations found in the database' + passed_list = unique_samples.tolist() + # Update sample status dictionary with any failure messages + # Also create a list of passed samples to iterate through later + for k, v in failures.items(): + sample_status_dic[k] = v + if v != 'Completed but failed to reconcile': + passed_list.remove(k) + passed_list.sort() + # Output list of all sample names + all_sample_names = unique_samples.tolist() + all_sample_names.sort() + return (passed_list, all_sample_names, sample_status_dic) + + +def read_count_barplot(per_barcode_stats, report): + """Plot per sample read count bar chart.""" + # Per barcode read count + seq_summary = read_files(per_barcode_stats) + barcode_counts = ( + pd.DataFrame(seq_summary['sample_name'].value_counts()) + .sort_index() + .reset_index() + .rename( + columns={'index': 'Sample', 'sample_name': 'Count'}) + ) + with report.add_section("Read Counts", "Read Counts"): + p( + """Number of reads per sample.""" + ) + order = barcode_counts["Sample"].values.tolist() + plt = barplot( + data=barcode_counts, x='Sample', y='Count', order=order) + plt.xAxis = dict( + name="Sample", + type='category', + axisLabel=dict(interval=0, rotate=45), + axisTick=dict(alignWithLabel=True)) + plt.yAxis = dict(name="Count", type='value') + EZChart( + plt, + theme='epi2melabs') + + +def insert_len(start, end, length): + """Insert length calc.""" + if end >= start: + insert_length = end - start + else: + insert_length = (length - start) + end + return insert_length + + +def create_fastcat_dic(sample_names, raw, host_file, downsampled_file): + """Create dictionary using sample names and fastcat files available.""" + # Collect per sample fastcat stats check no optional files + if ('host_filter_stats/OPTIONAL_FILE' in host_file): + host_filt = host_file.remove('host_filter_stats/OPTIONAL_FILE') + if host_filt is None: + host_filt = [] + else: + host_filt = host_file + if ('downsampled_stats/OPTIONAL_FILE' in downsampled_file): + summary_stats = downsampled_file.remove( + 'downsampled_stats/OPTIONAL_FILE') + if summary_stats is None: + summary_stats = [] + else: + summary_stats = downsampled_file + per_sample_dic = {} + lists = {'Raw': raw, 'Hostfilt': host_filt, 'Downsampled': summary_stats} + for sample in sample_names: + new_dic = {} + item_search = '/' + sample + '.' + for list_name, fc_list in lists.items(): + # find index of item that contains the sample name as a substring + item_index = [i for i, s in enumerate(fc_list) if item_search in s] + if item_index: + indice = item_index[0] + new_dic[list_name] = fc_list[indice] + else: + pass + per_sample_dic[sample] = new_dic + return per_sample_dic + + +def variant_counts_table( + bcf_stats, + header="**Variant counts:**", report=None): + """Create a report section contains variant counts. + + :param bcf_stats: one or more outputs from `bcftools stats`. + :param sample_as_columns: transpose table to put data for each + sample into a column. + :param header: a markdown formatted header. + :param report: an HTMLSection instance. + + :returns: an HTMLSection instance, if `report` was provided the given + instance is modified and returned. + """ + bcf_stats = bcfstats.load_bcfstats(bcf_stats) + p(""" +Variant counts per sample. 
See output bcf
+file for info on individual variants.
+""")
+    df = bcf_stats['SN'].drop(columns='samples')
+    return df
+
+
+def trans_counts(
+        bcf_stats,
+        header="**Transitions and transversions:**", report=None):
+    """Create a report section with transition and transversion counts.
+
+    :param bcf_stats: one or more outputs from `bcftools stats`.
+    :param header: a markdown formatted header.
+    :param report: an HTMLSection instance.
+
+    :returns: a pandas DataFrame of transition and transversion counts
+        per sample.
+    """
+    bcf_stats = bcfstats.load_bcfstats(bcf_stats)
+    p("""
+Transition and transversion counts per sample.
+See output bcf file for info on individual variants.
+""")
+    df = bcf_stats['TSTV']
+    return df
diff --git a/bin/workflow_glue/run_plannotate.py b/bin/workflow_glue/run_plannotate.py
index 2967c4e..6b2ecce 100755
--- a/bin/workflow_glue/run_plannotate.py
+++ b/bin/workflow_glue/run_plannotate.py
@@ -5,12 +5,12 @@
 import json
 import os
-from aplanat import json_item
+from bokeh.embed import json_item
 import numpy as np
 import pandas as pd
 from plannotate.annotate import annotate
-from plannotate.bokeh_plot import get_bokeh
 import pysam
+from workflow_glue.bokeh_plot import get_bokeh
 from .util import wf_parser  # noqa: ABS101
diff --git a/main.nf b/main.nf
index d1a61a2..58673c7 100644
--- a/main.nf
+++ b/main.nf
@@ -302,10 +302,13 @@ process getVersions {
     seqkit version | sed 's/ /,/' >> versions.txt
     trycycler --version | sed 's/ /,/' >> versions.txt
     bedtools --version | sed 's/ /,/' >> versions.txt
-    flye --version | sed 's/ /,/' >> versions.txt
+    flye --version | sed 's/^/flye,/' >> versions.txt
     fastcat --version | sed 's/^/fastcat,/' >> versions.txt
     rasusa --version | sed 's/ /,/' >> versions.txt
     python -c "import spoa; print(spoa.__version__)" | sed 's/^/spoa,/' >> versions.txt
+    python -c "import pandas; print(pandas.__version__)" | sed 's/^/pandas,/' >> versions.txt
+    python -c "import plannotate; print(plannotate.__version__)" | sed 's/^/plannotate,/' >> versions.txt
+    python -c "import bokeh; print(bokeh.__version__)" | sed 's/^/bokeh,/' >> versions.txt
     """
 }
@@ -434,6 +437,7 @@ process report {
     report_name = "wf-clone-validation-report.html"
     """
     workflow-glue report \
+        $report_name \
         --downsampled_stats downsampled_stats/* \
         --revision $workflow.revision \
         --commit $workflow.commitId \
@@ -442,11 +446,10 @@
         --host_filter_stats host_filter_stats/* \
         --params params.json \
         --versions versions \
-        --report_name $report_name \
         --plannotate_json $plannotate_json \
         --lengths $lengths \
         --inserts_json $inserts_json \
-        --qc_inserts qc_inserts/* \
+        --qc_inserts qc_inserts \
        --assembly_quality assembly_quality/*
    """
}
diff --git a/nextflow.config b/nextflow.config
index f9db822..11cd493 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -45,9 +45,9 @@ params {
            "--fastq 'wf-clone-validation-demo/test'",
            "--primers 'wf-clone-validation-demo/primers.tsv'",
        ]
-        container_sha = "sha4b40b56828d0127650b778a795ea4cd6094a5a0c"
+        container_sha = "shab8863b29b5cfa427d7f8e5519bd71b956785c315"
         container_sha_medaka = "sha6a58ce1871560b8ddaada5cc1dfc51e5c03442e8"
-        common_sha = "sha0fa3896acb70eecc0d432c91a1516d596a87741c"
+        common_sha = "sha5fc720674a26bc63a6f31ed186344209175b54b1"
         agent = null
    }
}
@@ -59,7 +59,7 @@ manifest {
     description = 'De-novo reconstruction of synthetic plasmid sequences.'
     mainScript = 'main.nf'
     nextflowVersion = '>=23.04.2'
-    version = 'v0.4.0'
+    version = 'v0.5.0'
 }

executor {
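
A note for reviewers on the Bokeh 3 port in bin/workflow_glue/bokeh_plot.py: the
scroll-zoom behaviour comes from the pair of calls add_tools(WheelZoomTool(
zoom_on_axis=False)) and toolbar.active_scroll used in get_bokeh() above. A
minimal sketch isolating that idiom, assuming only a bokeh 3.x install (the toy
circle figure is illustrative, not part of the patch):

    from bokeh.models import WheelZoomTool
    from bokeh.plotting import figure

    p = figure(height=300, width=300, tools=['save', 'pan'])
    # stand-in for the plasmid backbone circle drawn in get_bokeh()
    p.circle(x=0, y=0, radius=.18, line_color="#000000", fill_color=None)
    p.toolbar.logo = None
    # zoom_on_axis=False zooms both ranges together, preserving the
    # circular map's aspect when the user scrolls
    p.add_tools(WheelZoomTool(zoom_on_axis=False))
    # make the wheel zoom active by default, with no toolbar click needed
    p.toolbar.active_scroll = p.select_one(WheelZoomTool)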
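
The sliced() helper added to report.py underpins format_msa(), which assumes
each entry in inserts['msa'] begins with a 10-character name field and re-wraps
the remaining alignment row into interleaved 80-column blocks for the pre-
formatted MSA section. A small self-contained sketch of the slicing, using a
toy width of 4 in place of 80:

    from itertools import count, takewhile

    def sliced(seq, n):
        """Slice a string into n-character chunks (as in report.py)."""
        return list(takewhile(len, (seq[i: i + n] for i in count(0, n))))

    # takewhile(len, ...) stops at the first empty slice past the end
    assert sliced("ACGTACGTAC", 4) == ["ACGT", "ACGT", "AC"]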
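
The rewritten report.py is composed from a handful of ezcharts primitives used
throughout this patch: labs.LabsReport sections, dropdown Tabs, and
DataTable.from_pandas. A minimal sketch of that skeleton, assuming an ezcharts
install; the file paths, sample names, and DataFrame contents here are
placeholders, not workflow outputs:

    import pandas as pd
    from ezcharts.components.reports import labs
    from ezcharts.layout.snippets import Tabs
    from ezcharts.layout.snippets.table import DataTable

    # LabsReport takes a title, workflow name, params JSON and versions dir
    report = labs.LabsReport(
        "Example report", "wf-clone-validation-new",
        "params.json", "versions")
    with report.add_section("Example section", "Example"):
        tabs = Tabs()
        with tabs.add_dropdown_menu():
            for sample in ("barcode01", "barcode02"):
                # one dropdown entry per sample, as report.py does
                with tabs.add_dropdown_tab(sample):
                    DataTable.from_pandas(
                        pd.DataFrame(
                            {"Sample": [sample], "Length": [4361]}),
                        use_index=False)
    report.write("example_report.html")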
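
report_utils.insert_len() allows for the plasmid being circular: when the
insert's end coordinate is numerically smaller than its start, the insert spans
the origin and its length wraps around. A worked sketch with made-up
coordinates:

    def insert_len(start, end, length):
        """Insert length on a circular plasmid (as in report_utils.py)."""
        if end >= start:
            return end - start
        # wraps the origin: tail of the plasmid plus the leading piece
        return (length - start) + end

    assert insert_len(100, 500, 5000) == 400   # contiguous insert
    assert insert_len(4800, 200, 5000) == 400  # spans origin: 200 + 200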