Add summary reports to the framer #269

juliema · Apr 10, 2020 · f015397 · f015397
1 parent 27fb102
commit f015397
Show file tree

Hide file tree

Showing 6 changed files with 84 additions and 7 deletions.
diff --git a/atram_framer.py b/atram_framer.py
@@ -89,6 +89,11 @@ def parse_command_line():
             <taxon-name> if you select this then the assembled gene name
             will be <reference-name>.<taxon-name>.""")
 
+    parser.add_argument(
+        '--long-contig', type=float, default=0.7,
+        help="""A long contig is considered to be this fraction [0-1] of the
+            longest contig assembled by exonerate. The default is 0.7.""")
+
     args = parser.parse_args()
 
     util.temp_dir_exists(args.temp_dir)

diff --git a/lib/core_framer.py b/lib/core_framer.py
@@ -1,6 +1,8 @@
 """Put exons into the correct reading frames."""
 
+import csv
 from collections import defaultdict
+from itertools import product
 from . import bio
 from . import exonerate
 from . import db_stitcher as db
@@ -32,9 +34,12 @@ def frame(args):
                 args, temp_dir, cxn, taxon_names, iteration)
             exonerate.contig_file_write(cxn)
             exonerate.run_exonerate(temp_dir, cxn, iteration)
+
             output_contigs(args, cxn)
 
             log.info('Writing output')
+            output_summary_per_gene(args, cxn, taxon_names)
+            output_summary_per_taxon(args, cxn, taxon_names)
 
         log.info('Finished')
 
@@ -64,3 +69,59 @@ def output_contigs(args, cxn):
                 seq += contig['seq']
                 seq += 'N' * (ref_len - len(seq))
                 util.write_fasta_record(out_file, contig_name, seq)
+
+
+def output_summary_per_gene(args, cxn, taxon_names):
+    """Print per gene summary statistics."""
+    longest = max(db.select_longest(cxn), 1)
+    lengths = db.select_seq_lengths(cxn)
+
+    counts = {t: {'total': set(), 'long': set()} for t in taxon_names}
+
+    for length in lengths:
+        taxon_name = length['taxon_name']
+        ref_name = length['ref_name']
+        counts[taxon_name]['total'].add(ref_name)
+        fraction = length['len'] / longest
+        if fraction >= args.long_contig:
+            counts[taxon_name]['long'].add(ref_name)
+
+    out_path = util.prefix_file(
+        args.output_prefix, 'summary_stats_per_ref_gene.csv')
+    with open(out_path, 'w') as out_file:
+        writer = csv.writer(out_file)
+        writer.writerow(['Taxon',
+                         'Total_Genes',
+                         'Total_Genes_>={:0.2}'.format(args.long_contig)])
+        for taxon, count in counts.items():
+            writer.writerow([taxon, len(count['total']), len(count['long'])])
+
+
+def output_summary_per_taxon(args, cxn, taxon_names):
+    """Print per taxon summary statistics."""
+    longest = max(db.select_longest(cxn), 1)
+    lengths = db.select_seq_lengths(cxn)
+    ref_names = [r['ref_name'] for r in db.select_reference_genes(cxn)]
+
+    counts = {c: {'total': 0, 'long': 0}
+              for c in product(taxon_names, ref_names)}
+
+    for length in lengths:
+        taxon_name = length['taxon_name']
+        ref_name = length['ref_name']
+        key = (taxon_name, ref_name)
+        counts[key]['total'] += 1
+        fraction = length['len'] / longest
+        if fraction >= args.long_contig:
+            counts[key]['long'] += 1
+
+    out_path = util.prefix_file(
+        args.output_prefix, 'summary_stats_per_taxon.csv')
+    with open(out_path, 'w') as out_file:
+        writer = csv.writer(out_file)
+        writer.writerow(['Taxon',
+                         'Gene',
+                         'Total_Contigs',
+                         'Total_Contigs_>={:0.2}'.format(args.long_contig)])
+        for key, count in counts.items():
+            writer.writerow([key[0], key[1], count['total'], count['long']])
diff --git a/lib/core_preprocessor.py b/lib/core_preprocessor.py
@@ -91,7 +91,7 @@ def assign_seqs_to_shards(cxn, shard_count):
     cuts = [db_preprocessor.get_shard_cut(cxn, offset) for offset in offsets]
 
     # Make sure the last sequence gets included
-    cuts[-1] = cuts[-1] + 'z'
+    cuts[-1] += 'z'
 
     # Now organize the list into pairs of sequence names
     pairs = [(cuts[i - 1], cuts[i]) for i in range(1, len(cuts))]
@@ -115,7 +115,7 @@ def create_all_blast_shards(args, shard_list):
                 (args, shard_params, idx)))
 
         all_results = [result.get() for result in results]
-    log.info('Finished making blast all {} DBs'.format(len(all_results)))
+    log.info('Finished making all {} blast DBs'.format(len(all_results)))
 
 
 def create_one_blast_shard(args, shard_params, shard_index):

diff --git a/lib/core_stitcher.py b/lib/core_stitcher.py
@@ -265,10 +265,6 @@ def output_stitched_genes(args, cxn, taxon_names, iteration):
                     taxon_name,
                     iteration=iteration)
 
-                # has_hits = 'hits' if hits else 'no hits'
-                # log.info('{}.{} has {}'.format(
-                #     ref_name, taxon_name, has_hits))
-
                 if not hits:
                     continue
 

diff --git a/lib/db.py b/lib/db.py
@@ -5,7 +5,7 @@
 import os
 from os.path import basename, join, exists
 
-ATRAM_VERSION = 'v2.3.0'
+ATRAM_VERSION = 'v2.3.1'
 
 # DB_VERSION != ATRAM_VERSION
 # We don't force DB changes until required.

diff --git a/lib/db_stitcher.py b/lib/db_stitcher.py
@@ -191,6 +191,21 @@ def select_next(cxn, ref_name, taxon_name, beg=-1, iteration=0):
     return result.fetchone()
 
 
+def select_longest(cxn):
+    """Get the longest contig for the framer reports."""
+    sql = """SELECT MAX(LENGTH(seq)) AS max_len FROM exonerate;"""
+    result = cxn.execute(sql)
+    return result.fetchone()['max_len']
+
+
+def select_seq_lengths(cxn):
+    """Get the sequence lengths for the framer reports."""
+    sql = """
+        SELECT taxon_name, ref_name, length(seq) AS len
+          FROM exonerate;"""
+    return cxn.execute(sql)
+
+
 def select_overlap(
         cxn, ref_name, taxon_name, beg_lo, beg_hi, end, iteration=0):
     """