From 43fb8ae768b174114b227aa0c0639eaa73bcc0b6 Mon Sep 17 00:00:00 2001
From: Kevin Bonham
Date: Sat, 28 May 2016 12:17:32 -0400
Subject: [PATCH] Add status updates (#17)

* Added status for multi-species GenBank files

* added status for blast import

* More progress indicators, bug fix in run.analyze()
---
 run.py                     |  4 ++--
 src/Analysis/output.py     |  2 ++
 src/DataImport/gb_parse.py |  8 ++++++--
 src/FindHGT/run_blast.py   | 14 +++++++++-----
 4 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/run.py b/run.py
index 91e32d8..45e88c7 100644
--- a/run.py
+++ b/run.py
@@ -37,8 +37,8 @@ def blast():
 def analyze(minimum_identity, minimum_length=500, dist_between_hits=5000):
     groups = output.hgt_groups(minimum_identity, minimum_length, dist_between_hits)
     output.output_groups(
-        groups, MONGODB.name, os.path.join(
-            OUTPUT, "{}-{}-{}-groups.csv".format(
+        groups, os.path.join(
+            OUTPUT, MONGODB.name, "{}-{}-{}-groups.csv".format(
                 int((minimum_identity)*100), minimum_length, dist_between_hits
             )
         )
     )
diff --git a/src/Analysis/output.py b/src/Analysis/output.py
index a53e234..cf37258 100644
--- a/src/Analysis/output.py
+++ b/src/Analysis/output.py
@@ -63,6 +63,7 @@ def get_hits_from_species(hits_list):
     """
     for species in db['genes'].distinct('species'):
+        print("---> Getting blast hits for {}".format(species))
         yield [record['_id'] for record in db['genes'].find({'species': species,
                                                              'type': 'CDS',
                                                              '_id': {'$in': hits_list}})]
@@ -181,4 +182,5 @@ def output_groups(groups_list, output_file, min_group_size=2):
             name=db_handle['species']
         )
         df=df.append(series)
+    print("Creating file at {}".format(output_file))
     df.to_csv(output_file, columns=df_index)
diff --git a/src/DataImport/gb_parse.py b/src/DataImport/gb_parse.py
index fcaba90..1fa6945 100644
--- a/src/DataImport/gb_parse.py
+++ b/src/DataImport/gb_parse.py
@@ -40,7 +40,7 @@ def add_contig_data(records, genbank_file):
     contig_counter = 0
     contig_ids = []
-
+    current_species = None
     for contig in records:
         contig_counter += 1
         try:
@@ -49,6 +49,10 @@
             # uses filename (without extension) as species name
             species = os.path.splitext(os.path.basename(genbank_file)[0])
 
+        if not species == current_species:
+            print("Importing {}".format(species))
+            current_species = species
+
         if contig.id in contig_ids:  # in case the id field is present but not unique
             contig.id = "{}_{}".format(contig.id, contig_counter)
         else:
@@ -114,4 +118,4 @@ def add_features(contig, species):
                         'contig': contig.id},
         }
 
-        yield feature_record
\ No newline at end of file
+        yield feature_record
diff --git a/src/FindHGT/run_blast.py b/src/FindHGT/run_blast.py
index bf1550b..ee1a654 100644
--- a/src/FindHGT/run_blast.py
+++ b/src/FindHGT/run_blast.py
@@ -1,5 +1,5 @@
 from tempfile import NamedTemporaryFile
-from subprocess import Popen
+from subprocess import Popen, PIPE
 from DataImport.mongo_import import mongo_import_record
 from bson.objectid import ObjectId
 from Bio.Blast import NCBIXML
@@ -14,13 +14,14 @@ def blast_all(query_fasta, blast_db):
     """
     tmp_results = NamedTemporaryFile()
 
-    Popen(
+    print(Popen(
         ['blastn',
          '-query', query_fasta.name,
          '-db', blast_db,
          '-out', tmp_results.name,
-         '-outfmt', '5']  # xml output
-    ).wait()  # waits for return code before proceeding
+         '-outfmt', '5'],
+        stdout=PIPE  # xml output
+    ).wait())  # waits for return code before proceeding
 
     return tmp_results
@@ -30,9 +31,10 @@ def parse_blast_results_xml(results_file):
     """
     :param results_file: blast results in xml format
     """
-
+    counter = 0
     results_file.seek(0)
     for blast_record in NCBIXML.parse(results_file):
+        counter += 1
         query_id = blast_record.query
 
         for alignment in blast_record.alignments:
@@ -55,6 +57,8 @@
                 },
                 "blast_results"
            )
+            if counter % 500 == 0:
+                print("---> {} blast records imported".format(counter))
 
 
 def check_blast_pair(query, subject):
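
A note on the run.analyze() fix above: output_groups() expects the output path as its second positional argument (its signature, def output_groups(groups_list, output_file, min_group_size=2), is visible in the output.py hunk), so the old call passed MONGODB.name where the file path belonged and the joined path where min_group_size belonged. The corrected call builds a single path with the database name nested inside OUTPUT. Below is a minimal sketch of that path construction; the assigned values are placeholders standing in for the real OUTPUT setting and MONGODB.name.

    import os

    # Placeholder values; in run.py these come from the OUTPUT setting and
    # the MongoDB database handle (MONGODB.name).
    OUTPUT = "out"
    db_name = "hgt_db"
    minimum_identity, minimum_length, dist_between_hits = 0.95, 500, 5000

    # Produces e.g. out/hgt_db/95-500-5000-groups.csv
    output_file = os.path.join(
        OUTPUT, db_name, "{}-{}-{}-groups.csv".format(
            int(minimum_identity * 100), minimum_length, dist_between_hits
        )
    )
    print(output_file)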
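The status lines added to parse_blast_results_xml() use a modulo counter: bump the count once per parsed record and print only every 500th, so a long import shows progress without flooding the terminal. A self-contained sketch of the same pattern follows, written as a generic wrapper; the function name and interval are illustrative, not part of the repository.

    def report_progress(records, interval=500):
        """Yield items unchanged, printing a status line every `interval` items."""
        counter = 0
        for record in records:
            counter += 1
            if counter % interval == 0:
                print("---> {} blast records imported".format(counter))
            yield record

    # Usage: wrap any iterator, e.g.
    # for blast_record in report_progress(NCBIXML.parse(results_file)):
    #     ...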
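Finally, the blast_all() change pipes blastn's stdout and prints the value returned by .wait(), i.e. the child process's exit status (0 on success); the actual results still go to the file named by -out, so little lands on the pipe. A minimal illustration of that pattern, using a stand-in command so it runs without blastn installed:

    import sys
    from subprocess import Popen, PIPE

    # A stand-in command line; in run_blast.py this would be the blastn invocation.
    proc = Popen([sys.executable, "-c", "print('pretend blast output')"],
                 stdout=PIPE)
    return_code = proc.wait()  # blocks until the child exits
    print(return_code)         # 0 indicates success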