From f015397fb63467d4b241519f06f02d480c4b17b4 Mon Sep 17 00:00:00 2001
From: rafe
Date: Fri, 10 Apr 2020 15:10:26 -0400
Subject: [PATCH] Add summary reports to the framer #269

---
 atram_framer.py          |  5 ++++
 lib/core_framer.py       | 61 ++++++++++++++++++++++++++++++++++++++++
 lib/core_preprocessor.py |  4 +--
 lib/core_stitcher.py     |  4 ---
 lib/db.py                |  2 +-
 lib/db_stitcher.py       | 15 ++++++++++
 6 files changed, 84 insertions(+), 7 deletions(-)

diff --git a/atram_framer.py b/atram_framer.py
index 69d663c..7be66a3 100755
--- a/atram_framer.py
+++ b/atram_framer.py
@@ -89,6 +89,11 @@ def parse_command_line():
             if you select this then the assembled gene name will be
             ..""")
 
+    parser.add_argument(
+        '--long-contig', type=float, default=0.7,
+        help="""A long contig is considered to be this fraction [0-1] of the
+            longest contig assembled by exonerate. The default is 0.7.""")
+
     args = parser.parse_args()
 
     util.temp_dir_exists(args.temp_dir)
diff --git a/lib/core_framer.py b/lib/core_framer.py
index dc3567c..6856a9c 100644
--- a/lib/core_framer.py
+++ b/lib/core_framer.py
@@ -1,6 +1,8 @@
 """Put exons into the correct reading frames."""
+import csv
 from collections import defaultdict
+from itertools import product
 from . import bio
 from . import exonerate
 from . import db_stitcher as db
@@ -32,9 +34,12 @@ def frame(args):
             args, temp_dir, cxn, taxon_names, iteration)
         exonerate.contig_file_write(cxn)
         exonerate.run_exonerate(temp_dir, cxn, iteration)
+        output_contigs(args, cxn)
 
         log.info('Writing output')
+        output_summary_per_gene(args, cxn, taxon_names)
+        output_summary_per_taxon(args, cxn, taxon_names)
 
     log.info('Finished')
@@ -64,3 +69,59 @@ def output_contigs(args, cxn):
             seq += contig['seq']
             seq += 'N' * (ref_len - len(seq))
             util.write_fasta_record(out_file, contig_name, seq)
+
+
+def output_summary_per_gene(args, cxn, taxon_names):
+    """Print per gene summary statistics."""
+    longest = max(db.select_longest(cxn), 1)
+    lengths = db.select_seq_lengths(cxn)
+
+    counts = {t: {'total': set(), 'long': set()} for t in taxon_names}
+
+    for length in lengths:
+        taxon_name = length['taxon_name']
+        ref_name = length['ref_name']
+        counts[taxon_name]['total'].add(ref_name)
+        fraction = length['len'] / longest
+        if fraction >= args.long_contig:
+            counts[taxon_name]['long'].add(ref_name)
+
+    out_path = util.prefix_file(
+        args.output_prefix, 'summary_stats_per_ref_gene.csv')
+    with open(out_path, 'w') as out_file:
+        writer = csv.writer(out_file)
+        writer.writerow(['Taxon',
+                         'Total_Genes',
+                         'Total_Genes_>={:0.2}'.format(args.long_contig)])
+        for taxon, count in counts.items():
+            writer.writerow([taxon, len(count['total']), len(count['long'])])
+
+
+def output_summary_per_taxon(args, cxn, taxon_names):
+    """Print per taxon summary statistics."""
+    longest = max(db.select_longest(cxn), 1)
+    lengths = db.select_seq_lengths(cxn)
+    ref_names = [r['ref_name'] for r in db.select_reference_genes(cxn)]
+
+    counts = {c: {'total': 0, 'long': 0}
+              for c in product(taxon_names, ref_names)}
+
+    for length in lengths:
+        taxon_name = length['taxon_name']
+        ref_name = length['ref_name']
+        key = (taxon_name, ref_name)
+        counts[key]['total'] += 1
+        fraction = length['len'] / longest
+        if fraction >= args.long_contig:
+            counts[key]['long'] += 1
+
+    out_path = util.prefix_file(
+        args.output_prefix, 'summary_stats_per_taxon.csv')
+    with open(out_path, 'w') as out_file:
+        writer = csv.writer(out_file)
+        writer.writerow(['Taxon',
+                         'Gene',
+                         'Total_Contigs',
+                         'Total_Contigs_>={:0.2}'.format(args.long_contig)])
+        for key, count in counts.items():
+            writer.writerow([key[0], key[1], count['total'], count['long']])
diff --git a/lib/core_preprocessor.py b/lib/core_preprocessor.py
index 13b9900..96ffc2a 100644
--- a/lib/core_preprocessor.py
+++ b/lib/core_preprocessor.py
@@ -91,7 +91,7 @@ def assign_seqs_to_shards(cxn, shard_count):
     cuts = [db_preprocessor.get_shard_cut(cxn, offset) for offset in offsets]
 
     # Make sure the last sequence gets included
-    cuts[-1] = cuts[-1] + 'z'
+    cuts[-1] += 'z'
 
     # Now organize the list into pairs of sequence names
     pairs = [(cuts[i - 1], cuts[i]) for i in range(1, len(cuts))]
@@ -115,7 +115,7 @@ def create_all_blast_shards(args, shard_list):
             (args, shard_params, idx)))
 
     all_results = [result.get() for result in results]
-    log.info('Finished making blast all {} DBs'.format(len(all_results)))
+    log.info('Finished making all {} blast DBs'.format(len(all_results)))
 
 
 def create_one_blast_shard(args, shard_params, shard_index):
diff --git a/lib/core_stitcher.py b/lib/core_stitcher.py
index e944a48..8ba17e2 100644
--- a/lib/core_stitcher.py
+++ b/lib/core_stitcher.py
@@ -265,10 +265,6 @@ def output_stitched_genes(args, cxn, taxon_names, iteration):
                 taxon_name,
                 iteration=iteration)
 
-            # has_hits = 'hits' if hits else 'no hits'
-            # log.info('{}.{} has {}'.format(
-            #     ref_name, taxon_name, has_hits))
-
             if not hits:
                 continue
diff --git a/lib/db.py b/lib/db.py
index 65828a1..7129a7e 100644
--- a/lib/db.py
+++ b/lib/db.py
@@ -5,7 +5,7 @@
 import os
 from os.path import basename, join, exists
 
-ATRAM_VERSION = 'v2.3.0'
+ATRAM_VERSION = 'v2.3.1'
 
 # DB_VERSION != ATRAM_VERSION
 # We don't force DB changes until required.
diff --git a/lib/db_stitcher.py b/lib/db_stitcher.py
index 7ff0dbb..08470fc 100644
--- a/lib/db_stitcher.py
+++ b/lib/db_stitcher.py
@@ -191,6 +191,21 @@ def select_next(cxn, ref_name, taxon_name, beg=-1, iteration=0):
     return result.fetchone()
 
 
+def select_longest(cxn):
+    """Get the longest contig for the framer reports."""
+    sql = """SELECT MAX(LENGTH(seq)) AS max_len FROM exonerate;"""
+    result = cxn.execute(sql)
+    return result.fetchone()['max_len']
+
+
+def select_seq_lengths(cxn):
+    """Get the sequence lengths for the framer reports."""
+    sql = """
+        SELECT taxon_name, ref_name, LENGTH(seq) AS len
+          FROM exonerate;"""
+    return cxn.execute(sql)
+
+
 def select_overlap(
         cxn, ref_name, taxon_name, beg_lo, beg_hi, end, iteration=0):
     """
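
For reference, below is a minimal, self-contained sketch of the counting logic behind the new summary_stats_per_taxon.csv report. It is not part of the patch: the example rows stand in for what db.select_seq_lengths() returns from the exonerate table, and the output path is hard-coded instead of going through util.prefix_file().

# Illustrative sketch only. The rows below are made-up stand-ins for
# db.select_seq_lengths(); the real report reads them from the exonerate
# table and writes next to the other framer output files.
import csv
from itertools import product

LONG_CONTIG = 0.7  # same meaning as the new --long-contig argument

rows = [
    {'taxon_name': 'taxon_A', 'ref_name': 'gene_1', 'len': 900},
    {'taxon_name': 'taxon_A', 'ref_name': 'gene_1', 'len': 300},
    {'taxon_name': 'taxon_B', 'ref_name': 'gene_1', 'len': 1000},
]

longest = max(max(r['len'] for r in rows), 1)  # longest contig overall
taxa = sorted({r['taxon_name'] for r in rows})
genes = sorted({r['ref_name'] for r in rows})

# Count every contig, and the "long" contigs, per (taxon, gene) pair.
counts = {key: {'total': 0, 'long': 0} for key in product(taxa, genes)}
for row in rows:
    key = (row['taxon_name'], row['ref_name'])
    counts[key]['total'] += 1
    if row['len'] / longest >= LONG_CONTIG:
        counts[key]['long'] += 1

with open('summary_stats_per_taxon.csv', 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    writer.writerow(['Taxon', 'Gene', 'Total_Contigs',
                     'Total_Contigs_>={:0.2}'.format(LONG_CONTIG)])
    for (taxon, gene), count in counts.items():
        writer.writerow([taxon, gene, count['total'], count['long']])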