-
Notifications
You must be signed in to change notification settings - Fork 2
/
gromacs_utils.py
314 lines (268 loc) · 13.7 KB
/
gromacs_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
import os
import numpy as np
import time
class colors:
HEADER = '\033[95m'
OKBLUE = '\033[94m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
def warn_file_exists(filename):
"""
Prints a warning that a file already exists and will not be overwritten.
:param filename: the filename as a string.
"""
print colors.WARNING + "WARNING:" + colors.ENDC + "The file " + filename + \
" already exists: use rewrite=True to override"
def strip_filenames(filenames):
"""
This strips off "_cntr" and "_whole" substrings in each filename of a list of filenames. The "_cntr" and "_whole"
substrings are used to name pbc -whole and pbc -cntr output xtcs, so it is good to be able to strip off those
extras. BEWARE this will sort the files!
:param filenames: A list of filenames from which "_cntr" and "_whole" will be removed.
:return: A list of the edited filenames, no extensions.
"""
stripped_filenames = []
for filename in filenames:
if '_cntr' in filename:
stripped_filenames.append(filename.replace('_cntr', '')[0:-4])
elif '_whole' in filename:
stripped_filenames.append(filename.replace('_whole', '')[0:-4])
else:
stripped_filenames.append(filename[0:-4])
return np.unique(stripped_filenames)
def make_ndx(tpr_filename, ndx_filename, selection=None, rewrite=False, AA=True):
"""
Generates an index file using gmx select. The output index file will have the following groups:
mol_1_and_name_CA
mol_2_and_name_CA
(mol_1_and_name_CA)_or_(mol_2_and_name_CA)
mol_1_or_mol_2
Do NOT alter this selection if you are using this with mRMR, except you may flag whether the tpr is a coarse-grained
or all-atom simulation. If all-atom, alpha carbons are selected, and if coarse-grained, backbone atoms are selected.
You may provide your own selection if you are not using this with mRMR.
:param tpr_filename: path to a single tpr
:param ndx_filename: path to output index file
:param rewrite: boolean to indicate whether you want to overwrite an existing index file of the same name
"""
if rewrite or not os.path.exists(ndx_filename):
if selection is None:
if AA:
selection = 'mol 1 and name CA; mol 2 and name CA; ' \
'(mol 1 and name CA) or (mol 2 and name CA); mol 1 or mol 2'
else:
selection = 'mol 1 and name BB; mol 2 and name BB; ' \
'(mol 1 and name BB) or (mol 2 and name BB); mol 1 or mol 2'
os.system('gmx select -s %s -select "%s" -on %s' % (tpr_filename, selection, ndx_filename))
else:
warn_file_exists(ndx_filename)
def read_ndx(ndx_filename):
"""
Reads a Gromacs ndx file into a dictionary. The dictionary keys are the group names. If used with mRMR algorithm,
group names should be the names discussed in make_ndx
:param ndx_filename: path to ndx file
:return: dictionary of the form index_dict[group id] = [atom ids in group id]
"""
index_dict = {}
index_file = open(ndx_filename, 'r')
while 1:
newline = index_file.readline()
if not newline:
break
split_line = newline.split()
if split_line:
if split_line[0] == '[':
current_group = split_line[1]
index_dict[current_group] = []
else:
index_dict[current_group].append([int(i) for i in split_line])
for key in index_dict.keys():
index_dict[key] = np.concatenate(index_dict[key], axis=1)
return index_dict
def make_xvg(tpr_filename, xtc_filename, xvg_filename, AA=True, rewrite=False):
"""
Generates an xvg file of distances using gmx pairdist which can output rectangular distance matrices.
Calculates the distances between all alpha carbons/ backbone atoms of molecule 1 to all alpha carbons/ backbone
atoms of molecule 2. Distances are calculated at each time step.
The xvg will therefore have the form:
t_0 distance(mol1atom1,mol2atom1) distance(mol1atom1, mol2atom2) ... distance(mol1atomN, mol2atomM)
t_1 distance(mol1atom1,mol2atom1) ... distance(mol1atomN, mol2atomM)
...
:param tpr_filename: a single tpr for gmx mindist input
:param xtc_filename: a single xtc for gmx mindist input
:param xvg_filename: a single xvg for gmx mindist output
:param AA: True for all-atom simulations.
:param rewrite: boolean to indicate whether you want to overwrite an existing xvg file of the same name
"""
if rewrite or (not os.path.exists(xvg_filename)):
if AA:
ref = "\"mol 1 and name CA\""
sel = "\"mol 2 and name CA\""
else:
ref = "\"mol 1 and name BB\""
sel = "\"mol 2 and name BB\""
os.system('gmx pairdist -f %s -s %s -o %s -ref %s -sel %s -refgrouping res -selgrouping res' %
(xtc_filename, tpr_filename, xvg_filename, ref, sel))
else:
warn_file_exists(xvg_filename)
def read_xvg(xvg_filenames, bin_size=0, contact=0, one_traj=True, skip_time=True):
"""
Reads a series of xvg files into a single distance matrix. You may keep track of the time and/or trajectory number
with one_traj and skip_time, or read in only the distances. This can perform binning and calculate contacts maps.
For use with mRMR, you must bin initially. (Binning while reading the files prevents memory overflow issues).
:param xvg_filenames: a list of xvg filenames
:param bin_size: if you wish to bin the distances for use with mRMR, provide a bin size in nm. If bin_size is 0,
distances will be kept as floating point.
:param contact: if you wish to get contact maps instead of distances, specify a contact length in nm.
:param one_traj: boolean to indicate whether trajectories are concatenated. Must be True for use with mRMR
algorithm.
:param skip_time: whether you want to read in the time step as well as the distances. This was added for
flexibility, but isn't useful for mRMR (set True).
:return: array of distances. data[i,j] = pair j distance value at frame number i if you are not keeping track of
time or the trajectories. If you are keeping track of the trajectory number, data[i,0] = traj_num. If you are
keeping track of just the time, data[i,0]=time. If you are keeping track of both trajectory number and time,
data[i,0] = traj_num and data[i,1] = time.
"""
data = []
traj_num = 0
if bin_size != 0:
print "Distances will be binned with bin size:", bin_size, "nm"
if contact != 0:
print "Contact maps will be loaded with contact cutoff:", contact, "nm"
for xvg_filename in xvg_filenames:
start_time = time.time()
print "Reading file " + xvg_filename + '...'
inputfile = open(xvg_filename, 'r')
while 1:
newline = inputfile.readline()
if not newline:
break
if newline[0] not in ['@', '#']:
if one_traj:
temp_vec = []
else:
temp_vec = [traj_num] # The first column of the data should be trajectory number
col_num = 0
for element in newline.split():
# BINNING
if bin_size != 0:
if col_num > 0: # Check to see whether the element is a distance measurement.
dist = int(float(element)/bin_size) # Bin the distance
temp_vec.append(dist)
# If the element is not a distance measurement, check to see if you are keeping track of time
elif not skip_time:
dist = float(element)
temp_vec.append(dist)
# CONTACT MAPS
elif contact != 0:
if col_num > 0: # Check to see whether the element is a distance measurement.
# Determine whether this is a contact or no
if float(element) > contact:
dist = 1
else:
dist = 0
temp_vec.append(dist)
# If the element is not a distance measurement, check to see if you are keeping track of time
elif not skip_time:
dist = float(element)
temp_vec.append(dist)
# RAW DATA
else:
if col_num > 0:
dist = float(element)
temp_vec.append(dist)
elif not skip_time:
dist = float(element)
temp_vec.append(dist)
col_num += 1
data.append(temp_vec)
end_time = time.time()
print "Time elapsed to read trajectory", traj_num, ":", end_time-start_time
traj_num += 1
return np.array(data)
def dump_frame(xtc, tpr, ndx, time, pdb):
"""
Uses gmx trjconv -dump to pull the frame at "time" and dump it to a pdb.
This is useful for restarting all-atom simulations from coarse-grained
simulations. For use with the file cluster_dump.py.
:param xtc: A single xtc.
:param tpr: A single tpr.
:param ndx: An index file, not necessarily of the form in make_ndx
:param time: The time (in ps) you want to dump.
:param pdb: The name of the output pdb.
"""
os.system('gmx trjconv -f %s -s %s -n %s -dump %f -o %s' % (xtc, tpr, ndx, time, pdb))
def pbc_center(xtcs, tprs, ndx, rewrite=False):
"""
This function is used for pre-processing trajectories that have pbc issues. It does the following:
1. Make any split atoms whole using gmx trjconv -pbc whole
2. Center on a single protein. Ideally, we would center on both proteins. However, there are problems with gmx
trjconv -center on multiple chains. See
https://mailman-1.sys.kth.se/pipermail/gromacs.org_gmx-users/2010-August/053340.html
To do: option to center on one or the other protein. Right now we just center on the second protein, Opa.
:param xtcs: A list of xtcs for pre-processing
:param tprs: A list of tprs. MUST BE IN CORRECT ORDER
:param ndx: A single ndx filename. The ndx file will be written using make_ndx if it
does not already exist.
:param rewrite: Whether to rewrite the ndx file and the intermediate xtc files.
"""
make_ndx(tpr_filename=tprs[0], ndx_filename=ndx, rewrite=rewrite)
stripped = strip_filenames(xtcs)
orig_xtcs = [strip +'.xtc' for strip in stripped]
n_files = len(orig_xtcs)
for i in range(n_files):
whole_name = stripped[i] + '_whole.xtc'
cntr_name = stripped[i] + '_cntr.xtc'
if os.path.exists(whole_name) and not rewrite:
warn_file_exists(whole_name)
else:
os.system('echo 3 3 | gmx trjconv -f %s -s %s -pbc whole -n %s -o %s' %
(orig_xtcs[i], tprs[i], ndx, whole_name))
if os.path.exists(cntr_name) and not rewrite:
warn_file_exists(cntr_name)
else:
os.system('echo 1 3 | gmx trjconv -f %s -s %s -pbc mol -ur compact -center -n %s -o %s' %
(whole_name, tprs[i], ndx, cntr_name))
def write_pdbs(xtcs, tprs, ndx, pdbs, dt=100000, rewrite=False):
"""
Uses gmx trjconv to write out pdbs at intervals of dt. The pdbs are written with connect records. This is really for
visualization to make sure centered trajectories are not moving through the periodic box. It is not called in mRMR
algorithm, so if you want to check you need to write your own script.
:param xtcs: A list of xtcs.
:param tprs: A list of tprs. MUST BE IN CORRECT ORDER
:param ndx: An index file of the form described in the method make_ndx.
:param pdbs: A list of output pdbs. Must be the same length as your xtc list.
:param dt: Intervals at which you wish to write out pdbs. See GROMACS documentation of trjconv.
:param rewrite: Whether to overwrite the pdbs.
"""
n_files = len(xtcs)
for i in range(n_files):
if os.path.exists(pdbs[i]) and not rewrite:
warn_file_exists(pdbs[i])
else:
os.system('echo 3 | gmx trjconv -f %s -s %s -n %s -conect -dt %i -o %s' %
(xtcs[i], tprs[i], ndx, dt, pdbs[i]))
def pre_process(xtcs, tprs, ndx, xvgs, AA=True, rewrite=False):
"""
This function:
1. Makes a single ndx file using a single tpr.
2. Takes care of pbc problems by clustering and centering with pbc_center.
3. Makes a bunch of xvg files using a list of xtcs and tprs.
:param xtcs: list of xtcs
:param tprs: list of tprs. MUST BE IN CORRECT ORDER.
:param ndx: single index file name. If the index file does not already exist, one will be created using make_ndx
:param xvgs: a list of xvg file names. If any one of the xvgs does not already exist, it
will be created using make_xvg.
:param AA: indicates whether the trajectories are all-atom or coarse-grained.
:param rewrite: boolean to indicate whether you want to overwrite an existing index file of the same name.
"""
make_ndx(tprs[0], ndx, rewrite=rewrite, AA=AA)
pbc_center(xtcs, tprs, ndx)
cntrd_xtcs = [strip + '_cntr.xtc' for strip in strip_filenames(xtcs)]
n_files = len(cntrd_xtcs)
for i in range(n_files):
make_xvg(tpr_filename=tprs[i], xtc_filename=cntrd_xtcs[i], AA=AA,
xvg_filename=xvgs[i], rewrite=rewrite)