/
alignment_functions.py
106 lines (101 loc) · 4.33 KB
/
alignment_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import numpy as np
def traceback(a,b,scoregrid,indel_score=-1, match_score=2, mismatch_score=-1):
"""
Traceback for global alignment.
Returns a top-scoring trace. If multiple traces are possible for the
top alignment score, this function will return just one of them, arbitrarily.
"""
X,Y = scoregrid.shape
assert X==len(a)+1
assert Y==len(b)+1
x, y = X-1, Y-1
# Start with an empty trace
trace = []
# Trace back until we reach the top-left corner
while x>0 or y>0:
current_score = scoregrid[x,y]
# Note that we have to compare a[x-1] to b[y-1] again, and compare scores,
# but only along the path, not for every cell of the grid, i.e. O(N) not O(N^2)
if a[x-1]==b[y-1]:
match_mismatch_score = match_score
else:
match_mismatch_score = mismatch_score
# Here we pick just one operation, so we'll only trace one path
if x>0 and y>0 and scoregrid[x-1,y-1]+match_mismatch_score == current_score:
# Do a match or mismatch at this step:
# add a letter to both alignment strings,
# and go from x-1, y-1
trace.append((a[x-1],b[y-1]))
x -= 1
y -= 1
elif x>0 and scoregrid[x-1,y]+indel_score == current_score:
# Do a deletion at this step:
# add a letter to the first alignment string but a '-' to the second string,
# and go from x-1
trace.append((a[x-1],'-'))
x -= 1
elif y>0 and scoregrid[x,y-1]+indel_score == current_score:
# Do an insertion at this step:
# add a letter to the second alignment string but a '-' to the first string,
# and go from y-1
trace.append(('-',b[y-1]))
y -= 1
else:
raise ValueError("No valid trace found")
return trace[::-1]
def traceback_local(a,b,scoregrid,indel_score=-1, match_score=2, mismatch_score=-1):
"""
Traceback for local alignment.
Here we start from a highest-scoring cell in the grid, and stop when we reach a zero.
Returns a top-scoring trace. If multiple traces are possible for the
top alignment score, this function will return just one of them, arbitrarily.
"""
X,Y = scoregrid.shape
assert X==len(a)+1
assert Y==len(b)+1
# Find the highest-scoring square to start
# If there is more than one, we pick one arbitrarily
# (better would be to return all traces)
x,y = np.unravel_index(np.argmax(scoregrid, axis=None), scoregrid.shape)
# Start with an empty trace
trace = []
# Trace back until we reach the top-left corner
while x>0 or y>0:
current_score = scoregrid[x,y]
if current_score==0:
# We've finished our local alignment
break
# Note that we have to compare a[x-1] to b[y-1] again, and compare scores,
# but only along the path, not for every cell of the grid, i.e. O(N) not O(N^2)
if a[x-1]==b[y-1]:
match_mismatch_score = match_score
else:
match_mismatch_score = mismatch_score
# Here we pick just one operation, so we'll only trace one path
if x>0 and y>0 and scoregrid[x-1,y-1]+match_mismatch_score == current_score:
# Do a match or mismatch at this step:
# add a letter to both alignment strings,
# and go from x-1, y-1
trace.append((a[x-1],b[y-1]))
x -= 1
y -= 1
elif x>0 and scoregrid[x-1,y]+indel_score == current_score:
# Do a deletion at this step:
# add a letter to the first alignment string but a '-' to the second string,
# and go from x-1
trace.append((a[x-1],'-'))
x -= 1
elif y>0 and scoregrid[x,y-1]+indel_score == current_score:
# Do an insertion at this step:
# add a letter to the second alignment string but a '-' to the first string,
# and go from y-1
trace.append(('-',b[y-1]))
y -= 1
return trace[::-1]
def get_alignment(trace):
"""
Convert a trace to a pair of strings representing the alignment.
"""
aligned_string_a = ''.join([c1 for (c1,c2) in trace])
aligned_string_b = ''.join([c2 for (c1,c2) in trace])
return (aligned_string_a, aligned_string_b)