-
Notifications
You must be signed in to change notification settings - Fork 0
/
ContactMap.py
253 lines (212 loc) · 12.8 KB
/
ContactMap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
from numpy import sqrt, array, where, zeros
from Bio import PDB, SeqIO
# TODO allow looking at contacts for specific position for intra
# TODO combine contacts across all homomers
from scipy.spatial import KDTree
from Chimeragenesis.AccessiontoAlignment import get_alignment_indexing,get_alignment_indexing_w_dashes, \
create_dictionary_from_alignment,no_gap_sequence_from_alignment
def get_coordinates_dict_per_chain(pdb_file):
STRUCTURE = PDB.PDBParser().get_structure(pdb_file, pdb_file)
# Then selecting the chain
PROTEIN = STRUCTURE[0]
list_of_chain_coordinates = {
chain.id: [tuple(res["CB"].coord) if res.get_resname() != 'GLY' else tuple(res["CA"].coord) for res in chain if
len(chain) > 10] for chain in PROTEIN}
return list_of_chain_coordinates
def create_spatial_index(list_coords):
kdtree = KDTree(list_coords)
return kdtree
def find_neighbors(kdtree, list_of_coords, residue_index, distance_cutoff=8.0, residue_dist_cutoff=6):
neighbors = kdtree.query_ball_point(list_of_coords[residue_index], distance_cutoff)
return sorted(tuple(res for res in neighbors if abs(res - residue_index) >= residue_dist_cutoff))
def assign_index_to_chain(index, contact_range):
for chain, ranges in contact_range.items():
if index in ranges:
corrected_index = contact_range[chain].index(index)
return chain, corrected_index
# TODO be able to add residue id
def get_inter_protein_contacts(pdb_file, chain_id, index_to_position=False):
coord_dict = get_coordinates_dict_per_chain(pdb_file)
chain_indices_dict = {}
range_placeholder = 0
total_coords = []
for chain, coords in coord_dict.items():
chain_indices_dict[chain] = range(range_placeholder, range_placeholder + len(coords))
total_coords += coords
range_placeholder += len(coords)
kd_tree = create_spatial_index(total_coords)
inter_contacts = []
for index, kd_index in enumerate(chain_indices_dict[chain_id]):
neighbors = find_neighbors(kd_tree, total_coords, kd_index, residue_dist_cutoff=0)
chain_assigned_neighbors = []
for neighbor in neighbors:
chain_for_index = assign_index_to_chain(neighbor, chain_indices_dict)
if chain_for_index[0] != chain_id:
chain_assigned_neighbors.append(f'{chain_for_index[0]}:{chain_for_index[1] + bool(index_to_position)}')
inter_contacts.append(chain_assigned_neighbors)
return inter_contacts
def calc_residue_dist(residue_one, residue_two):
"""Returns the C-alpha distance between two residues"""
# Mostly copied from https://warwick.ac.uk/fac/sci/moac/people/students/peter_cock/python/protein_contact_map/
# Using distance formula to calculate residue distance
diff_vector = residue_one["CB"].coord - residue_two["CB"].coord
return sqrt(sum(diff_vector * diff_vector))
def residue_dist_matrix(pdb_file, chain_id, make_it_binary='Yes', distance_cutoff=8):
"""Takes a distance matrix of a protein and converts it to a binary matrix that uses 1 to signify a residue
contact and 0 for no contact or Returns a matrix of C-alpha distances between residues in a protein chain or b"""
if make_it_binary == 'Yes':
# Using list comprehension to create a nested list
# then an array(protein length by protein length) of 1 to signifying a residue contact below distance_cutoff
# and at least 6 residue separation or 0 for no contact
protein_coords = get_coordinates_dict_per_chain(pdb_file)[chain_id]
contact_map = zeros((len(protein_coords), len(protein_coords)))
contact_pairs = get_intra_residue_contact_pairs(pdb_file, chain_id)
for index, pairs in enumerate(contact_pairs):
for pair in pairs:
contact_map[pair, index] = 1
contact_map[index, pair] = 1
return contact_map
else:
STRUCTURE = PDB.PDBParser().get_structure(pdb_file, pdb_file)
# Then selecting the chain
PROTEIN = STRUCTURE[0][chain_id]
# Using list comprehension to create a list then an array of each residue distance
# to the rest of the chain and stacking those tuples to make a matrix
return array([[calc_residue_dist(res_one, res_two) for res_one in PROTEIN] for res_two in PROTEIN])
def get_intra_residue_contact_pairs(pdb_file, chain_id):
"""Takes a pdb structure and return a nested list where each residue index in the list has a list with the python
indexes for the residues it's in contact with.
Residue indexes that have no contacts will have an empty list"""
protein_coords = get_coordinates_dict_per_chain(pdb_file)[chain_id]
kd_tree = create_spatial_index(protein_coords)
list_of_contact_pairs = [find_neighbors(kd_tree, protein_coords, index) for index, coord in
enumerate(protein_coords)]
return list_of_contact_pairs
def correct_alignment_for_residue_position(alignment_file, alignment_label, alignment_index):
sequence_dict = create_dictionary_from_alignment(alignment_file)
sequence_from_alignment = sequence_dict[alignment_label]
sequence_indexing = get_alignment_indexing(sequence_from_alignment)
return sequence_indexing.index(alignment_index)
def correct_residue_index_for_alignment(alignment_file, residue_index, protein_label):
aligned_seq = create_dictionary_from_alignment(alignment_file)[protein_label]
print(get_alignment_indexing(aligned_seq)[residue_index])
def get_residue_at_native_position(alignment_file, alignment_label, alignment_index=None, residue_index=None):
sequence_dict = create_dictionary_from_alignment(alignment_file)
sequence_from_alignment = sequence_dict[alignment_label]
if alignment_index:
return sequence_from_alignment[alignment_index]
elif residue_index:
sequence = get_alignment_indexing(sequence_from_alignment)
return sequence[residue_index]
def get_individual_intra_contacts(pdb_file, chain_id, index):
coords = get_coordinates_dict_per_chain(pdb_file)[chain_id]
kd_tree = create_spatial_index(coords)
contacts = find_neighbors(kd_tree, coords, index)
return contacts
def get_individual_inter_contacts(pdb_file, chain_id, index,index_to_position=False):
coord_dict = get_coordinates_dict_per_chain(pdb_file)
contact_range = {}
range_placeholder = 0
total_coords = []
for chain, coords in coord_dict.items():
contact_range[chain] = range(range_placeholder, range_placeholder + len(coords))
total_coords += coords
range_placeholder += len(coords)
kd_tree = create_spatial_index(total_coords)
inter_contacts = []
neighbors = kd_tree.query_ball_point(coord_dict[chain_id][index], 8)
for neighbor in neighbors:
chain_for_index = assign_index_to_chain(neighbor, contact_range)
if chain_for_index[0] != chain_id:
inter_contacts.append(f'{chain_for_index[0]}:{chain_for_index[1] + bool(index_to_position)}')
return inter_contacts
def correct_contact_position_for_alignment(pdb_file, chain_identifier, alignment_file, alignment_label):
"""This can only be used for the pdbs that have the same sequence as the sequence found in the alignment. Takes a nested list of residue indexes
and translates them into the alignment indexes they are found in the alignment given"""
sequence_from_alignment = create_dictionary_from_alignment(alignment_file)[alignment_label]
contact_pairs = get_intra_residue_contact_pairs(pdb_file, chain_identifier)
sequence_indexing = [ind for ind, x in enumerate(sequence_from_alignment) if x != '-']
index_dictionary = {real_index: alignment_index for real_index, alignment_index in enumerate(sequence_indexing)}
updated_contact_map = [[index_dictionary[real_index] for real_index in contact_pairs[alignment_index]] for
alignment_index, pairs in enumerate(contact_pairs)]
return updated_contact_map
def correct_alignment_for_chimera_index(alignment_file, ref_label, aln_index, comparison_label,sequence_of_interest):
sequence_dictionary = create_dictionary_from_alignment(alignment_file)
reference_aln = sequence_dictionary[ref_label]
comparison_aln = sequence_dictionary[comparison_label]
reference_alignment_indexing = get_alignment_indexing(reference_aln)
no_gap_reference_sequence = no_gap_sequence_from_alignment(reference_aln)
alignment_reference_start = reference_alignment_indexing[no_gap_reference_sequence.find(sequence_of_interest)]
alignment_reference_end = reference_alignment_indexing[no_gap_reference_sequence.find(sequence_of_interest) + len(
sequence_of_interest) - 1] + 1
indexed_ref_aln=get_alignment_indexing_w_dashes(reference_aln)
indexed_com_aln=get_alignment_indexing_w_dashes(comparison_aln)
indexed_ref_aln[alignment_reference_start:alignment_reference_end]=indexed_com_aln[alignment_reference_start:alignment_reference_end]
ref_aln_list=list(reference_aln)
com_aln_list=list(comparison_aln)
ref_aln_list[alignment_reference_start:alignment_reference_end] = com_aln_list[
alignment_reference_start:alignment_reference_end]
chimera_seq=no_gap_sequence_from_alignment(''.join(ref_aln_list))
indexed_ref_aln=[x for x in indexed_ref_aln if x!='-']
chimera_index=indexed_ref_aln.index(aln_index)
chi_AA=chimera_seq[chimera_index]
return chimera_index,chi_AA,chimera_seq
def get_sequence_from_pdb(pdb_file, chain_id):
# Iterate over all records in the PDB file
for record in SeqIO.parse(pdb_file, "pdb-atom"):
# Check if the record corresponds to the desired chain
if record.annotations["chain"] == chain_id:
# Return the sequence
return str(record.seq)
# Extract the sequence from the chain
aln = '/gpfs/gpfs0/scratch/jws6pq/Notebook/PDB/CoronavirusMSA.aln'
native_format = '/gpfs/gpfs0/scratch/jws6pq/Notebook/PDB/3mer{0}.pdb'
chi_format = '/gpfs/gpfs0/scratch/jws6pq/Notebook/PDB/3merSARS2w{0}S1.pdb'
# with open("/gpfs/gpfs0/scratch/jws6pq/Notebook/Overall/List_of_coronaviruses", 'r') as loc:
# loc = [line.split()[-1] for line in loc.readlines()]
# compare_aligned_contact_all_proteins(aln,native_format,chi_format,'B',1267,loc,'/gpfs/gpfs0/scratch/jws6pq/Notebook/Immersion/CarbonB_1267_contacts.tsv','/gpfs/gpfs0/scratch/jws6pq/Notebook/Immersion/Rank_change_1268.tsv')
# def compare_aligned_contacts(alignment_file,reference_label,comparison_label,ref_pdb,com_pdb):
# aligned_reference=ColorCoding.create_dictionary_from_alignment(alignment_file)[reference_label]
# aligned_comparison = ColorCoding.create_dictionary_from_alignment(alignment_file)[comparison_label]
# reference_indexing=get_alignment_indexing(aligned_reference)
# comparison_indexing=get_alignment_indexing(aligned_comparison)
def ContactOverlap(alignment_file, comparison, reference='6VSB_B'):
sequence_dictionary = create_dictionary_from_alignment(alignment_file)
reference_sequence, comparison_sequence = sequence_dictionary[reference], sequence_dictionary[comparison]
# CP is Comparison Protein and RP is Reference Protein
cp_updated_contact_map = correct_contact_position_for_alignment(f'3mer{comparison}.pdb', 'B', comparison_sequence)
print(comparison_sequence)
print(cp_updated_contact_map)
rp_updated_contact_map = correct_contact_position_for_alignment(f'{reference}.pdb', "B", reference_sequence)
rp_contact_map, cp_contact_map = [], []
j = 0
residue_comparison = [blosum[where(blosum_62_matrix()[:, 0] == res_ref)[0][0] - 1][
where(blosum_62_matrix()[0, :] == res_com)[0][0] - 1]
for res_ref, res_com in zip(reference_sequence, comparison_sequence)]
for residue in reference_sequence:
if residue.isalpha():
rp_contact_map.append(rp_updated_contact_map[j])
j += 1
else:
rp_contact_map.append([])
j = 0
for residue in comparison_sequence:
if residue.isalpha():
cp_contact_map.append(cp_updated_contact_map[j])
j += 1
else:
cp_contact_map.append([])
fraction_conserved = []
# print(list(zip(rp_contact_map,cp_contact_map)))
for x, y in zip(rp_contact_map, cp_contact_map):
if x == [] and y == []:
fraction_conserved.append(1); continue
elif x == [] or y == []:
fraction_conserved.append(0); continue
x_set = set();
y_set = set()
[x_set.update(num for num in range(contact - 6, contact + 7)) for contact in x]
[y_set.update(num for num in range(contact - 6, contact + 7)) for contact in y]
fraction_conserved.append((len([contact for contact in y if contact in x_set]) + len(
[contact for contact in x if contact in y_set])) / (len(x) + len(y)))
print(fraction_conserved)