Skip to content

Commit

Permalink
Add status updates (#17)
Browse files Browse the repository at this point in the history
* Added status for multi-species GenBank files

* Added status for blast import

* More progress indicators, bug fix in run.analyze()
  • Loading branch information
kescobo committed May 28, 2016
1 parent a672cca commit 43fb8ae
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 9 deletions.
4 changes: 2 additions & 2 deletions run.py
Expand Up @@ -37,8 +37,8 @@ def blast():
def analyze(minimum_identity, minimum_length=500, dist_between_hits=5000):
groups = output.hgt_groups(minimum_identity, minimum_length, dist_between_hits)
output.output_groups(
groups, MONGODB.name, os.path.join(
OUTPUT, "{}-{}-{}-groups.csv".format(
groups, os.path.join(
OUTPUT, MONGODB.name, "{}-{}-{}-groups.csv".format(
int((minimum_identity)*100), minimum_length, dist_between_hits
)
)
Expand Down
2 changes: 2 additions & 0 deletions src/Analysis/output.py
Expand Up @@ -63,6 +63,7 @@ def get_hits_from_species(hits_list):
"""
for species in db['genes'].distinct('species'):
print("---> Getting blast hits for {}".format(species))
yield [record['_id'] for record in db['genes'].find({'species': species, 'type': 'CDS', '_id': {'$in': hits_list}})]


Expand Down Expand Up @@ -181,4 +182,5 @@ def output_groups(groups_list, output_file, min_group_size=2):
name=db_handle['species']
)
df=df.append(series)
print("Creating file at {}".format(output_file))
df.to_csv(output_file, columns=df_index)
8 changes: 6 additions & 2 deletions src/DataImport/gb_parse.py
Expand Up @@ -40,7 +40,7 @@ def add_contig_data(records, genbank_file):
contig_counter = 0
contig_ids = []


current_species = None
for contig in records:
contig_counter += 1
try:
Expand All @@ -49,6 +49,10 @@ def add_contig_data(records, genbank_file):
# uses filename (without extension) as species name
species = os.path.splitext(os.path.basename(genbank_file)[0])

if not species == current_species:
print("Importing {}".format(species))
current_species = species

if contig.id in contig_ids: # in case the id field is present but not unique
contig.id = "{}_{}".format(contig.id, contig_counter)
else:
Expand Down Expand Up @@ -114,4 +118,4 @@ def add_features(contig, species):
'contig': contig.id},
}

yield feature_record
yield feature_record
14 changes: 9 additions & 5 deletions src/FindHGT/run_blast.py
@@ -1,5 +1,5 @@
from tempfile import NamedTemporaryFile
from subprocess import Popen
from subprocess import Popen, PIPE
from DataImport.mongo_import import mongo_import_record
from bson.objectid import ObjectId
from Bio.Blast import NCBIXML
Expand All @@ -14,13 +14,14 @@ def blast_all(query_fasta, blast_db):
"""
tmp_results = NamedTemporaryFile()

Popen(
print(Popen(
['blastn',
'-query', query_fasta.name,
'-db', blast_db,
'-out', tmp_results.name,
'-outfmt', '5'] # xml output
).wait() # waits for return code before proceeding
'-outfmt', '5'],
stdout=PIPE # xml output
).wait()) # waits for return code before proceeding

return tmp_results

Expand All @@ -30,9 +31,10 @@ def parse_blast_results_xml(results_file):
:param results_file: blast results in xml format
"""

counter = 0
results_file.seek(0)
for blast_record in NCBIXML.parse(results_file):
counter += 1
query_id = blast_record.query

for alignment in blast_record.alignments:
Expand All @@ -55,6 +57,8 @@ def parse_blast_results_xml(results_file):
},
"blast_results"
)
if counter % 500 == 0:
print("---> {} blast records imported".format(counter))


def check_blast_pair(query, subject):
Expand Down

0 comments on commit 43fb8ae

Please sign in to comment.