Merge branch 'ezcharts_cw1453' into 'dev'
Ezcharts cw1453

Closes CW-1453

See merge request epi2melabs/workflows/wf-cas9!33
nrhorner committed Mar 8, 2023
2 parents 0fb29b5 + 8635eae commit bdb8ea2
Showing 8 changed files with 398 additions and 461 deletions.
7 changes: 2 additions & 5 deletions .gitignore
@@ -3,8 +3,5 @@ nextflow
template-workflow
.*.swp
.*.swo
.DS_STORE
output/**
.idea/**
**/__pycache__

*.pyc
*.pyo
10 changes: 8 additions & 2 deletions .pre-commit-config.yaml
@@ -18,9 +18,10 @@ repos:
additional_dependencies:
- epi2melabs
- repo: https://github.com/pycqa/flake8
rev: 3.7.9
rev: 5.0.4
hooks:
- id: flake8
pass_filenames: false
additional_dependencies:
- flake8-rst-docstrings
- flake8-docstrings
@@ -31,4 +32,9 @@ repos:
- flake8-builtins
- flake8-absolute-import
- flake8-print
entry: flake8 bin --import-order-style google --statistics --max-line-length 88
args: [
"bin",
"--import-order-style=google",
"--statistics",
"--max-line-length=88",
]
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v0.1.9]
### Changed
- Report generation with ezcharts
### Fixed
- Memory issue in background calculation with a large number of samples

## [v0.1.8]
### Fixed
- sample_sheet format in schema to expect a file
12 changes: 12 additions & 0 deletions README.md
@@ -2,6 +2,10 @@

wf-cas9 is a [nextflow](https://www.nextflow.io/) workflow
for the multiplexed analysis of Oxford Nanopore Cas9 enrichment sequencing.




## Introduction
The ONT Cas9 sequencing kit allows the enrichment of genomic
regions of interest by amplifying target regions from adapters ligated to Cas9 cleavage sites.
@@ -22,6 +26,10 @@ of read-target overlap with [bedtools](https://github.com/arq5x/bedtools2).







## Quickstart

The workflow uses [nextflow](https://www.nextflow.io/) to manage compute and
@@ -91,6 +99,10 @@ HTML report:
* Plots of stranded coverage at each target.
* Histograms of on and off-target coverage for each sample.
* Off-target hotspot region tables.




## Useful links

* [nextflow](https://www.nextflow.io/)
74 changes: 48 additions & 26 deletions bin/workflow_glue/build_tables.py
@@ -12,12 +12,37 @@ def argparser():
parser.add_argument(
"--target_summary", help="Target summary bed.")
parser.add_argument(
"--aln_summary", help="Alignment summary from pomoxis/stats_from_bam.")
"--aln_summary", help="Alignment summary from bamstats.")
parser.add_argument(
"--on_off", help="bed file of xx .")
"--read_to_target", help="bed file including read_id, target, sample_id")
return parser


def read_target_summary(df):
"""Build table summarising on target/off-target read status."""
df.loc[df.target != 'off_target', 'target'] = 'on_target'

agg = df.groupby(['sample_id', 'target']).agg(
mean_len=('read_length', 'mean'),
num_reads=('read_id', 'count'),
kbases_mapped=('read_length', 'sum'))

agg.kbases_mapped /= 1000
agg = agg.astype('int')
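# Note: the int cast above truncates, so kbases_mapped is reported in
# whole kilobases.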
agg.reset_index(inplace=True)
result = agg.pivot(
index='sample_id',
columns=['target'],
values=['mean_len', 'num_reads', 'kbases_mapped'])

# Create empty on-target columns if there are no on-target reads
if ('num_reads', 'on_target') not in result:
result[('mean_len', 'on_target')] = 0
result[('num_reads', 'on_target')] = 0
result[('kbases_mapped', 'on_target')] = 0
return result
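# Example (illustrative sketch, not part of the commit; sample and target
# names are hypothetical, column names follow the merged frame built in
# main() below):
#
#     >>> reads = pd.DataFrame({
#     ...     'sample_id': ['s1', 's1', 's1'],
#     ...     'target': ['tgt1', 'off_target', 'tgt1'],
#     ...     'read_id': ['r1', 'r2', 'r3'],
#     ...     'read_length': [1200, 800, 1000]})
#     >>> read_target_summary(reads)
#
# yields one row per sample with MultiIndex columns pairing each metric
# with on_target/off_target, e.g. ('mean_len', 'on_target') == 1100 and
# ('kbases_mapped', 'on_target') == 2 (2.2 kb truncated by the int cast).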


def main(args):
"""Entry point."""
header = [
@@ -26,48 +51,43 @@ def main(args):

frames = []

df_ono_ff = pd.read_csv(
args.on_off, sep='\t',
df_read_to_target = pd.read_csv(
args.read_to_target, sep='\t',
names=['chr', 'start', 'end', 'read_id', 'target', 'sample_id'],
index_col=False)

stats_df = pd.read_csv(args.aln_summary, sep='\t', index_col=False)
read_stats_df = pd.read_csv(args.aln_summary, sep='\t', index_col=False)

df_on_off = df_ono_ff.merge(
stats_df[['name', 'read_length', 'acc']],
df_read_to_target = df_read_to_target.merge(
read_stats_df[['name', 'read_length']],
left_on='read_id', right_on='name')

main_df = pd.read_csv(
df_target_summary = pd.read_csv(
args.target_summary, sep='\t', names=header, index_col=False)

for id_, df in main_df.groupby('sample_id'):
for id_, df in df_target_summary.groupby('sample_id'):
df = df.drop(columns=['sample_id'])
if len(df) == 0:
continue
df_on_off = df_on_off.astype({
df_read_to_target = df_read_to_target.astype({
'start': int,
'end': int,
'read_length': int,
'acc': float
'read_length': int
})
read_len = df_on_off.groupby(['target']).mean()[['read_length']]
read_len = df_read_to_target.groupby(['target']).mean()[['read_length']]
read_len.columns = ['mean_read_length']
if len(read_len) > 0:
df = df.merge(read_len, left_on='target', right_index=True)
else:
df['mean_read_length'] = 0

kbases = df_on_off.groupby(['target']).sum()[['read_length']] / 1000
kbases = df_read_to_target.groupby(['target']).sum()[['read_length']] / 1000
kbases.columns = ['kbases']
if len(kbases) > 0:
df = df.merge(kbases, left_on='target', right_index=True)
else:
df['kbases'] = 0

acc = df_on_off.groupby(['target']).mean()[['acc']]
acc.columns = ['mean_acc']
df = df.merge(acc, left_on='target', right_index=True)

df['strand_bias'] = (df.p - df.n) / (df.p + df.n)
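# strand_bias is in [-1, 1]: positive means more plus-strand ('p') reads,
# negative more minus-strand ('n') reads (assuming p/n are per-strand counts).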
df.drop(columns=['p', 'n'], inplace=True)
df.insert(0, 'sample', id_)
@@ -87,13 +107,12 @@ def main(args):
'strand_bias': 2,
'coverage_frac': 2,
'kbases': 2,
'mean_read_length': 1,
'mean_acc': 2})
'mean_read_length': 1})

df_all = df_all[[
'sample', 'chr', 'start', 'end', 'target', 'tsize',
'kbases', 'coverage_frac', 'median_cov', 'nreads',
'mean_read_length', 'mean_acc', 'strand_bias']]
'mean_read_length', 'strand_bias']]
df_all.sort_values(
by=["sample", "chr", "start"], key=natsort_keygen(), inplace=True)
else:
@@ -109,8 +128,6 @@ def main(args):
df['kbases'] * (df['nreads'] / df['nreads'].sum())
sdf['mean_read_length'] =\
df['mean_read_length'] * (df['nreads'] / df['nreads'].sum())
sdf['mean_acc'] =\
df['mean_acc'] * (df['nreads'] / df['nreads'].sum())
sdf['strand_bias'] =\
df['strand_bias'] * (df['nreads'] / df['nreads'].sum())
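# Each sdf column holds value * (nreads / total reads), so the sum below
# yields a read-count-weighted mean across targets for this sample.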
sample_df = sdf.sum()
@@ -119,8 +136,13 @@ def main(args):
sample_df['sample_id'] = sid
dfs.append(sample_df)

sample_summary = pd.concat(dfs, axis=1).T

sample_summary.set_index('sample_id', drop=True, inplace=True)
if dfs:
sample_summary = pd.concat(dfs, axis=1).T
sample_summary.set_index('sample_id', drop=True, inplace=True)
else:
sample_summary = pd.DataFrame()

sample_summary.to_csv('sample_summary.csv')

read_target_summary_table = read_target_summary(df_read_to_target)
read_target_summary_table.to_csv('read_target_summary.tsv', sep='\t')
