Skip to content

Commit

Permalink
Add summary reports to the framer #269
Browse files Browse the repository at this point in the history
  • Loading branch information
rafelafrance committed Apr 10, 2020
1 parent 27fb102 commit f015397
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 7 deletions.
5 changes: 5 additions & 0 deletions atram_framer.py
Expand Up @@ -89,6 +89,11 @@ def parse_command_line():
<taxon-name> if you select this then the assembled gene name
will be <reference-name>.<taxon-name>.""")

parser.add_argument(
'--long-contig', type=float, default=0.7,
help="""A long contig is considered to be this fraction [0-1] of the
longest contig assembled by exonerate. The default is 0.7.""")

args = parser.parse_args()

util.temp_dir_exists(args.temp_dir)
Expand Down
61 changes: 61 additions & 0 deletions lib/core_framer.py
@@ -1,6 +1,8 @@
"""Put exons into the correct reading frames."""

import csv
from collections import defaultdict
from itertools import product
from . import bio
from . import exonerate
from . import db_stitcher as db
Expand Down Expand Up @@ -32,9 +34,12 @@ def frame(args):
args, temp_dir, cxn, taxon_names, iteration)
exonerate.contig_file_write(cxn)
exonerate.run_exonerate(temp_dir, cxn, iteration)

output_contigs(args, cxn)

log.info('Writing output')
output_summary_per_gene(args, cxn, taxon_names)
output_summary_per_taxon(args, cxn, taxon_names)

log.info('Finished')

Expand Down Expand Up @@ -64,3 +69,59 @@ def output_contigs(args, cxn):
seq += contig['seq']
seq += 'N' * (ref_len - len(seq))
util.write_fasta_record(out_file, contig_name, seq)


def output_summary_per_gene(args, cxn, taxon_names):
"""Print per gene summary statistics."""
longest = max(db.select_longest(cxn), 1)
lengths = db.select_seq_lengths(cxn)

counts = {t: {'total': set(), 'long': set()} for t in taxon_names}

for length in lengths:
taxon_name = length['taxon_name']
ref_name = length['ref_name']
counts[taxon_name]['total'].add(ref_name)
fraction = length['len'] / longest
if fraction >= args.long_contig:
counts[taxon_name]['long'].add(ref_name)

out_path = util.prefix_file(
args.output_prefix, 'summary_stats_per_ref_gene.csv')
with open(out_path, 'w') as out_file:
writer = csv.writer(out_file)
writer.writerow(['Taxon',
'Total_Genes',
'Total_Genes_>={:0.2}'.format(args.long_contig)])
for taxon, count in counts.items():
writer.writerow([taxon, len(count['total']), len(count['long'])])


def output_summary_per_taxon(args, cxn, taxon_names):
"""Print per taxon summary statistics."""
longest = max(db.select_longest(cxn), 1)
lengths = db.select_seq_lengths(cxn)
ref_names = [r['ref_name'] for r in db.select_reference_genes(cxn)]

counts = {c: {'total': 0, 'long': 0}
for c in product(taxon_names, ref_names)}

for length in lengths:
taxon_name = length['taxon_name']
ref_name = length['ref_name']
key = (taxon_name, ref_name)
counts[key]['total'] += 1
fraction = length['len'] / longest
if fraction >= args.long_contig:
counts[key]['long'] += 1

out_path = util.prefix_file(
args.output_prefix, 'summary_stats_per_taxon.csv')
with open(out_path, 'w') as out_file:
writer = csv.writer(out_file)
writer.writerow(['Taxon',
'Gene',
'Total_Contigs',
'Total_Contigs_>={:0.2}'.format(args.long_contig)])
for key, count in counts.items():
writer.writerow([key[0], key[1], count['total'], count['long']])
4 changes: 2 additions & 2 deletions lib/core_preprocessor.py
Expand Up @@ -91,7 +91,7 @@ def assign_seqs_to_shards(cxn, shard_count):
cuts = [db_preprocessor.get_shard_cut(cxn, offset) for offset in offsets]

# Make sure the last sequence gets included
cuts[-1] = cuts[-1] + 'z'
cuts[-1] += 'z'

# Now organize the list into pairs of sequence names
pairs = [(cuts[i - 1], cuts[i]) for i in range(1, len(cuts))]
Expand All @@ -115,7 +115,7 @@ def create_all_blast_shards(args, shard_list):
(args, shard_params, idx)))

all_results = [result.get() for result in results]
log.info('Finished making blast all {} DBs'.format(len(all_results)))
log.info('Finished making all {} blast DBs'.format(len(all_results)))


def create_one_blast_shard(args, shard_params, shard_index):
Expand Down
4 changes: 0 additions & 4 deletions lib/core_stitcher.py
Expand Up @@ -265,10 +265,6 @@ def output_stitched_genes(args, cxn, taxon_names, iteration):
taxon_name,
iteration=iteration)

# has_hits = 'hits' if hits else 'no hits'
# log.info('{}.{} has {}'.format(
# ref_name, taxon_name, has_hits))

if not hits:
continue

Expand Down
2 changes: 1 addition & 1 deletion lib/db.py
Expand Up @@ -5,7 +5,7 @@
import os
from os.path import basename, join, exists

ATRAM_VERSION = 'v2.3.0'
ATRAM_VERSION = 'v2.3.1'

# DB_VERSION != ATRAM_VERSION
# We don't force DB changes until required.
Expand Down
15 changes: 15 additions & 0 deletions lib/db_stitcher.py
Expand Up @@ -191,6 +191,21 @@ def select_next(cxn, ref_name, taxon_name, beg=-1, iteration=0):
return result.fetchone()


def select_longest(cxn):
"""Get the longest contig for the framer reports."""
sql = """SELECT MAX(LENGTH(seq)) AS max_len FROM exonerate;"""
result = cxn.execute(sql)
return result.fetchone()['max_len']


def select_seq_lengths(cxn):
"""Get the sequence lengths for the framer reports."""
sql = """
SELECT taxon_name, ref_name, length(seq) AS len
FROM exonerate;"""
return cxn.execute(sql)


def select_overlap(
cxn, ref_name, taxon_name, beg_lo, beg_hi, end, iteration=0):
"""
Expand Down

0 comments on commit f015397

Please sign in to comment.