Skip to content

Commit

Permalink
Merge pull request #13 from kescobo/analysis_2
Browse files Browse the repository at this point in the history
continue cleanup
  • Loading branch information
kescobo committed May 26, 2016
2 parents 46cc9b3 + 117fc4c commit be74847
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 26 deletions.
84 changes: 63 additions & 21 deletions src/Analysis/circos.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
import numpy as np
from settings import OUTPUT
from settings import MONGODB as db
from Analysis.output import get_hgt
from Analysis.output import get_hgt, hgt_groups, find_all_hits
from Bio import SeqUtils
import re


def safe_species_name(species_name):
Expand Down Expand Up @@ -38,35 +39,76 @@ def get_karyotypes(length_cutoff=1000):
break


def get_links(minimum_identity, minimum_length=100, maximum_identity=1.0):
def get_links(minimum_identity, minimum_length=100, maximum_identity=1.0, group_no=None, min_dist=3000, annotation=None):
outdir = os.path.join(OUTPUT, 'circos', 'links')
if not os.path.isdir(outdir):
os.makedirs(outdir)

if not maximum_identity == 1.0:
if group_no:
out_file = os.path.join(outdir, 'group{}{}_{}-{}-{}-{}-links.txt'.format(
group_no,
annotation,
int(minimum_identity * 100),
int(maximum_identity * 100),
minimum_length,
min_dist)
)
elif not maximum_identity == 1.0:
out_file = os.path.join(outdir, '{}-{}-{}-links.txt'.format(int(minimum_identity * 100), int(maximum_identity * 100), minimum_length))
else:
out_file = os.path.join(outdir, '{}-{}-links.txt'.format(int(minimum_identity * 100), minimum_length))

with open(out_file, 'w+') as out_handle:
for s1, s2, hit in get_hgt(minimum_identity, minimum_length, maximum_identity):
s1_record = db['genes'].find_one({'_id': s1})
s2_record = db['genes'].find_one({'_id': s2})

s1_name = safe_species_name(s1_record['species'])
s2_name = safe_species_name(s2_record['species'])

out_handle.write('{0}-{1} {2} {3} {4}-{5} {6} {7}\n'.format(
s1_name,
s1_record['location']['contig'],
s1_record['location']['start'],
s1_record['location']['end'],
s2_name,
s2_record['location']['contig'],
s2_record['location']['start'],
s2_record['location']['end'],
)
)
if group_no:
print out_handle.name
groups = hgt_groups(minimum_identity, minimum_length, min_dist)
groups.sort(key=len, reverse=True)
group_index = group_no - 1
print group_index
group_hits = groups[group_index]

for subject in group_hits:
if annotation:
if check_annotation(subject, annotation):
for query in find_all_hits(subject, minimum_identity, minimum_length):
write_link(out_handle, subject, query)
else:
pass
else:
for query in find_all_hits(subject, minimum_identity, minimum_length):
write_link(out_handle, subject, query)
else:
for s1, s2, hit in get_hgt(minimum_identity, minimum_length, maximum_identity):
write_link(out_handle, s1, s2)


def write_link(open_file, gene1, gene2):
g1_record = db['genes'].find_one({'_id': gene1})
g2_record = db['genes'].find_one({'_id': gene2})

g1_name = safe_species_name(g1_record['species'])
g2_name = safe_species_name(g2_record['species'])

open_file.write('{0}-{1} {2} {3} {4}-{5} {6} {7}\n'.format(
g1_name,
g1_record['location']['contig'],
g1_record['location']['start'],
g1_record['location']['end'],
g2_name,
g2_record['location']['contig'],
g2_record['location']['start'],
g2_record['location']['end'],
)
)


def check_annotation(gene_id, phrase):
gene_record = db['genes'].find_one({'_id': gene_id})
if re.search(phrase.lower(), gene_record['annotation'].lower()):
return True
else:
return False



def get_conf_file(links, name='circos', gc=None,):
Expand Down
11 changes: 6 additions & 5 deletions src/Analysis/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def get_hgt(minimum_identity, minimum_length=100, maximum_identity=1.0):
:rtype generator: query id and subject id for each hit
"""
hgt = db['blast_results'].find(
{'perc_identity': {'$gte': minimum_identity, '$lt': maximum_identity},
{'perc_identity': {'$gte': minimum_identity, '$lte': maximum_identity},
'length' : {'$gte': minimum_length}}
)

Expand Down Expand Up @@ -48,9 +48,10 @@ def get_islands(minimum_identity, minimum_length=100, dist_between_hits=3000):
else:
location_1 = entry_1['location']
location_2 = entry_2['location']
if abs(location_1['end'] - location_2['start']) <= dist_between_hits:
entry_recorded = True
islands.append([entry_1['_id'], entry_2['_id']])
if location_1['contig'] == location_2['contig']:
if abs(location_1['end'] - location_2['start']) <= dist_between_hits:
entry_recorded = True
islands.append([entry_1['_id'], entry_2['_id']])

if not entry_recorded:
islands.append([entry_1['_id']])
Expand Down Expand Up @@ -106,7 +107,7 @@ def find_all_hits(some_id, minimum_identity, minimum_length=100):
as_query = {"type": "blast_result", "query": str(some_id)} # maybe should use ObjectId on import, see issue#10
as_subject = {"type": "blast_result", "subject": str(some_id)}

blast_hits = collection.find({"$or": [as_query, as_subject],
blast_hits = collection.find({"$or": [as_query, as_subject],
"perc_identity": {"$gte": minimum_identity},
"length": {"$gte": minimum_length}
}
Expand Down

0 comments on commit be74847

Please sign in to comment.