-
Notifications
You must be signed in to change notification settings - Fork 0
/
HomomerChimeras.py
409 lines (375 loc) · 23.7 KB
/
HomomerChimeras.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
from json import load
from os import path
from pathlib import Path
from sys import exit
from Chimeragenesis import AccessiontoAlignment
import Analysis
from setup import alphafold_submission_for_chimera_container
import ChimeraGenerator
import argparse
# TODO add autocomplete for changing keys?? readline
# TODO be able to designate multiple seq_of_interest
# TODO test accession route
# TODO test this between Homomer proteins
# TODO be able to swap into the constant
# TODO turn parser into function??
# Command-line interface.  Two groups of flags:
#  1) json maintenance flags (-u/-ch/-nv/-pr/-fr/--overwrite) handled immediately below;
#  2) an 'operations' sub-command whose -fa/-s/-a flags override the json operation toggles.
parser = argparse.ArgumentParser(
    description='Creating chimeric proteins where a region from a constant protein is spliced into a list of '
                'homologus regions between a protein family or vice versa')
parser.add_argument('-u', '--updatejson', type=str, required=False,
                    help='updating json configs. ex: default_json,old_json')
parser.add_argument('-i', '--jsoninput', dest='arg_jsons', required=False, type=str,
                    help='Comma seperated json config inputs. ex: constant_json,variant_json')
parser.add_argument('-ch', '--change', required=False, type=str,
                    help='input json with options you wish to change. ex: json')
parser.add_argument('-nv', '--new_values', required=False, type=str,
                    help='input comma seperated option:new_value pairs, seperate each part of the pair with a colon. '
                         'ex: option:new_value,option:new_value')
parser.add_argument('--overwrite', action='store_true', required=False, default=False,
                    help='This flag will overwrite the input json files for change,findnreplace, and updatejson, '
                         'the default without the flag will create a new json named new+input_file_stem.')
parser.add_argument('-pr', '--prints', required=False, type=str,
                    help='print all changeable options from inputted json or print current value for inputted option '
                         'in inputted json. ex: json or json,alphafold_outputs_dir')
parser.add_argument('-fr', '--findnreplace', required=False, type=str,
                    help='Warning:It is recommended you do not use overwrite flag with this option. Replaces a '
                         'recurring between a find:replace pair input within the json input in change flag. ex: '
                         'string_to_find:replacing_string')
# The operation toggles live on a sub-command so they can be appended after the json inputs,
# e.g.: python file.py -i json1,json2 operations -fa -s
subparser = parser.add_subparsers(dest='cmdline_toggles')
operations = subparser.add_parser('operations')
operations.add_argument('-fa', '--fasta', required=False, action='store_true', default=False,
                        help='turns on fasta operation. Flag must be called at the end of the command and '
                             'preceded by \'operations\' ex: python file.py -i json1,json2 operations -fa . Can be '
                             'combined with -s')
operations.add_argument('-s', '--submission', required=False, action='store_true', default=False,
                        help='turns on submission operation. This flag must be called at the end of the command and '
                             'preceded by \'operations\' ex: python file.py -i json1,json2 operations -s . Can be '
                             'combined with -fa or -a')
operations.add_argument('-a', '--analysis', required=False, action='store_true', default=False,
                        help='turns on analysis operation. This flag must be called at the end of the command and '
                             'preceded by \'operations\' ex: python file.py -i json1,json2 operations -a . Can be '
                             'combined with -s')
args = parser.parse_args()
# All conditionals to check for flags related to manipulation or clarification of json config files: (u,ch,pr,nv,overwrite,fr)
# If any of them are called the program will cease after completion or error
if args.updatejson:
    # -u takes "default_json,old_json": copy missing keys/structure from the first into the second.
    args.updatejson = args.updatejson.split(',')
    ChimeraGenerator.update_json(args.updatejson[0], args.updatejson[1], args.overwrite)
    exit()
if args.prints:
    # -pr takes "json" (print all keys) or "json,option" (print one option's value).
    print_args = args.prints.split(',')
    with open(print_args[0], 'rb') as jfile:
        selected_json = load(jfile)
    if len(print_args) == 1:
        ChimeraGenerator.print_keys(selected_json)
    else:
        ChimeraGenerator.print_keys(selected_json, print_args[1])
    exit()
if args.new_values and args.change:
    # -nv takes "option:value,option:value" pairs applied to the json given by -ch.
    # NOTE(review): values containing ':' will break the split -- confirm acceptable.
    new_key_values = [pair.split(':') for pair in args.new_values.split(',')]
    new_key_values = {key: value for key, value in new_key_values}
    for key, value in new_key_values.items():
        ChimeraGenerator.change_json_value(args.change, key, value, args.overwrite)
    exit()
if args.findnreplace and args.change:
    # -fr takes a single "find:replace" pair applied across the json given by -ch.
    pair = tuple(args.findnreplace.split(':'))
    ChimeraGenerator.change_json_value(args.change, overwrite=args.overwrite, find_n_replace_tuple=pair)
    exit()
# Allows for commandline operation flags to override selections made in the json config
class HomomerNamingArguments:
    """Typed view over the 'naming_arguments' section of the config json.

    Class-level values below act as defaults; __init__ attaches every
    non-nested key of the supplied dict as an instance attribute,
    overriding the defaults where the json provides a value.
    """
    # unique character replaced by fasta identifiers or provided nicknames
    # when building file/data names
    placeholder: str = "*"
    # combination of words and placeholders naming every monomer-chimera
    # related data file (fasta, pdb, plddt)
    monomer_naming_convention: str = "3mer*"
    # slurm naming convention; placeholders are replaced by ascending numbers as needed
    slurm_naming: str
    multimer_naming_convention: str
    chimera_naming_convention: str
    gromacs_slurm_dir: str
    fasta_naming: str
    plddt_naming: str
    pdb_naming: str
    alphafold_outputs_dir: str
    # file extensions for the various outputs
    fasta_extension: str = '.fasta'
    plddt_extension: str = '.plddt'
    pdb_extension: str = '.pdb'
    gmx_setup_extension: str = '.setup'
    gmx_production_extension: str = '.prod'

    # TODO set to general method for all
    def __init__(self, naming_dict):
        """Attach every key/value pair of *naming_dict* as an instance attribute."""
        for option, setting in naming_dict.items():
            setattr(self, option, setting)
class HomomerFastaArguments:
    """Typed view over the 'fasta_arguments' section of the config json.

    __init__ attaches every non-nested key of the supplied dict as an
    instance attribute; class-level values below act as defaults.
    """
    # dictionary of optional operations within the fasta operation, toggled true/false
    fasta_toggles: dict
    # create a file with a line-separated list of all fastas created
    # (fixed: annotation was misspelled 'Make_a_lst_of_created_fasta_files';
    # the toggle key actually read from fasta_toggles later in the script is
    # 'Make_a_list_of_created_fasta_files')
    Make_a_list_of_created_fasta_files: bool
    Create_an_alignment: bool
    # distinguishes whether this config json is for the constant part of the chimera
    # or the variant protein swapped against every protein in the protein list
    constant_or_variant: bool
    # two-column list of accession numbers and their associated protein file stems
    protein_list: str
    # fasta label for the constant protein, or the reference protein the homologs iterate against
    fasta_identifier: str
    # TODO add defaults like this
    # start of the muscle command run to create the alignment
    muscle_command_for_msa: str = 'module load gcc/9.2.0 && module load muscle/3.8.31 && muscle'
    number_of_subunits: int
    # path to the fasta containing either the constant-protein sequence to splice into
    # the variants, or the variant section to replace with the constant protein;
    # each sequence of interest lives in its own fasta file
    sequence_of_interest: str
    # path to the full-length protein of the constant protein or the variants' reference
    full_reference_seq_fasta: str
    # path to the multiple sequence alignment to be created, or a user-generated msa
    msa_file_name: str
    email_for_accession: str
    # name of the new fasta that will contain all variant sequences to be aligned
    msa_fasta: str
    # path to the fasta to be predicted by Alphafold
    fasta_for_alphafold: str
    # filename holding the complete list of fastas created in the fasta operation
    # NOTE(review): later code reads 'fasta_list_file_name' from this object --
    # confirm which spelling the json actually uses.
    fasta_file_list_name: str

    def __init__(self, fasta_dict):
        """Attach every key/value pair of *fasta_dict* as an instance attribute."""
        for key, value in fasta_dict.items():
            setattr(self, key, value)
class HomomerSubmissionArguments:
    """Typed view over the 'alphafold_submission_args' section of the config json.

    __init__ attaches every non-nested key of the supplied dict as an
    instance attribute; class-level values below act as defaults.
    """
    # dictionary of optional operations within the submission operation,
    # generally toggled by placing true within the quotes of their value
    submission_toggles: dict
    create_slurms: bool
    sbatch_slurms: bool
    # selection mode (fixed: annotation was 'bool' but the value is one of three strings):
    #  'stragglers' - check which alphafold predictions are incomplete and only
    #                 re-list/re-submit those;
    #  'custom'     - submit every chimera listed in custom_list_to_run;
    #  'all'        - submit all chimeras indiscriminately.
    stragglers_or_custom_or_all: str = 'stragglers'
    create_file_of_stragglers: bool
    # path to a file (created via create_file_of_stragglers, or user-generated)
    # containing line-separated fastas of proteins to predict
    custom_list_to_run: str
    proteins_per_slurm: int
    # path to the file holding template settings for alphafold slurm jobs
    template_slurm: str
    # path to the shell script that runs alphafold; it must take a list of
    # comma-separated fastas and an output as its arguments
    alphafold_shell_script: str
    slurm_output: str
    slurm_error: str

    def __init__(self, fasta_dict):
        """Attach every key/value pair of *fasta_dict* as an instance attribute."""
        for key, value in fasta_dict.items():
            setattr(self, key, value)
class HomomerAnalysisArguments:
    """Typed view over the 'analysis_arguments' section of the config json.

    __init__ attaches every non-nested key of the supplied dict as an
    instance attribute.
    """
    analysis_toggles: dict
    make_plddts: bool
    make_pdbs: bool
    make_emboss_files: bool
    emboss_command: str
    # path of the file to be created that will contain all analysis produced
    analysis_output_csv: str
    # maps each script-output data type to a list of [included?, column title]
    # controlling what lands in the final csv
    column_names: dict
    # [bool, "Title"] pair: whether the file-stem/nickname column (built from the
    # naming convention) is included, and what that csv column is called
    file_stem: list
    # same [bool, "Title"] shape as file_stem
    overall_native_stability: list
    relative_stability: list
    overall_chimera_stability: list

    def __init__(self, fasta_dict):
        """Attach every key/value pair of *fasta_dict* as an instance attribute."""
        for option, setting in fasta_dict.items():
            setattr(self, option, setting)
class HomomerChimeraContainer:
    """Top-level holder for the parsed config json and the chimeras built from it."""
    # full parsed config json for the run
    argument_dict: dict
    # boolean toggles for the major operations: fasta creation, alphafold
    # submission, analysis of alphafold results, gromacs simulation submission
    operation_toggles: dict
    run_fasta_operation: bool
    alphafold_submission: bool
    run_analysis_operation: bool
    run_gromacs_operation: bool

    def __init__(self, shifted_json_file):
        """Parse *shifted_json_file* and pull out the operation toggles."""
        self.naming_args = None
        self.submission_args = None
        self.fasta_args = None
        self.analysis_args = None
        with open(shifted_json_file, 'rb') as jfile:
            self.argument_dict = load(jfile)
        self.operation_toggles = self.argument_dict['operation_toggles']
        self.chimeras = ()

    def get_dict_args(self, dict_class, attr_name, arg_key):
        """Wrap the sub-dict stored under *arg_key* in *dict_class* and attach it as *attr_name*."""
        setattr(self, attr_name, dict_class(self.argument_dict[arg_key]))

    def add_chimera(self, chimera):
        """Append *chimera* to the tuple of chimeras held by the container."""
        self.chimeras = self.chimeras + (chimera,)
# Build the container from the config json and expose the frequently reused settings.
# NOTE(review): args.arg_jsons is documented as comma-separated ("constant_json,variant_json")
# but is passed to open() unsplit -- confirm single-json usage is intended here.
container = HomomerChimeraContainer(args.arg_jsons)
container.get_dict_args(HomomerFastaArguments, 'fasta_args', 'fasta_arguments')
container.get_dict_args(HomomerNamingArguments, 'naming_args', 'naming_arguments')
fasta_toggles = container.fasta_args.fasta_toggles
# Command-line operation flags ('operations' -fa/-s/-a) override the json toggles:
# if any flag was given, clear every toggle, then re-enable only the flagged
# operations.  (Bug fix: previously the toggles were cleared but never re-set,
# so the flags disabled everything instead of selecting operations.)
if hasattr(args, 'fasta') or hasattr(args, 'submission') or hasattr(args, 'analysis'):
    if args.fasta or args.submission or args.analysis:
        for key in container.operation_toggles:
            container.operation_toggles[key] = False
        if args.fasta:
            container.operation_toggles['run_fasta_operation'] = True
        if args.submission:
            container.operation_toggles['alphafold_submission'] = True
        if args.analysis:
            container.operation_toggles['run_analysis_operation'] = True
# NOTE(review): nickname_placeholder / file_placeholder are not declared on
# HomomerNamingArguments (only 'placeholder' is) -- they must be supplied by the json.
name_placeholder = container.naming_args.nickname_placeholder
file_placeholder = container.naming_args.file_placeholder
msa = container.fasta_args.msa_file_name
subunits = container.fasta_args.number_of_subunits
alphafold_dir = container.naming_args.alphafold_outputs_dir
seq_of_interest = AccessiontoAlignment.extract_seq_from_fasta(container.fasta_args.sequence_of_interest)
ref_sequence = AccessiontoAlignment.extract_seq_from_fasta(container.fasta_args.full_reference_seq_fasta)
# Reuse an existing MSA unless alignment creation was explicitly toggled on (which
# allows an existing MSA to be overwritten when needed).  Chimera objects then get
# their labels and ungapped native sequences straight from the alignment.
if path.exists(container.fasta_args.msa_file_name) and not fasta_toggles['Create_an_alignment']:
    sequence_dictionary = AccessiontoAlignment.create_dictionary_from_alignment(container.fasta_args.msa_file_name)
    for fasta_id, sequence in sequence_dictionary.items():
        chimera = ChimeraGenerator.chimeracls()
        container.add_chimera(chimera)
        chimera.file_stem = fasta_id
        chimera.native_seq = sequence.replace('-', '')
# Otherwise the chimeras are labeled from the two-column protein list
# (accession number, file stem); sequences are fetched later by accession.
else:
    with open(container.fasta_args.protein_list, 'r') as info_list:
        info_list = info_list.readlines()
        for line in info_list:
            chimera = ChimeraGenerator.chimeracls()
            container.add_chimera(chimera)
            # hoisted the duplicate line.split() call
            accession, file_stem = line.split()[0], line.split()[1]
            chimera.file_stem = file_stem
            chimera.accession = accession
list_of_chis = container.chimeras
num_of_chi = len(list_of_chis)
# Attach the future pdb and fasta file names to every chimera in the container,
# then collect the chimera and multimer fastas into one list for later operations.
ChimeraGenerator.assign_file_attrs_to_chimeras(container)
fastas = [chimera.chimera_fasta for chimera in list_of_chis] + [chimera.multimer_fasta for chimera in
                                                                list_of_chis]
# Fasta operation: write monomer/multimer/chimera fastas and (optionally) build the MSA.
if container.operation_toggles['run_fasta_operation']:
    email = container.fasta_args.email_for_accession
    # Create the wild-type fastas when toggled on, or when any are missing.
    # NOTE(review): the 'make_native_fastas' toggle is not declared on
    # HomomerFastaArguments -- confirm the key exists in the config json.
    if fasta_toggles['make_native_fastas'] or any(not path.exists(chimera.monomer_fasta) for chimera in list_of_chis):
        # If a chimera already has native_seq (from an existing msa), write its fastas
        # directly; otherwise fetch the sequence by the attached accession number.
        for chimera in list_of_chis:
            if hasattr(chimera,'native_seq'):
                ChimeraGenerator.fasta_creation(chimera.monomer_fasta,
                                                [tuple((chimera.native_seq, 1, chimera.multimer_stem))])
                ChimeraGenerator.fasta_creation(chimera.multimer_fasta,
                                                [tuple((chimera.native_seq, subunits, chimera.multimer_stem))])
            else:
                AccessiontoAlignment.accession_to_fasta(chimera.monomer_fasta, chimera.accession, email, subunits,
                                                        chimera.multimer_fasta)
    # Build the msa when toggled on, or when none exists yet (now that all fastas exist).
    if fasta_toggles['Create_an_alignment'] or not path.exists(container.fasta_args.msa_file_name):
        AccessiontoAlignment.multiple_sequence_alignment(
            tuple(chimera.monomer_fasta for chimera in list_of_chis) + (container.fasta_args.full_reference_seq_fasta,),
            container.fasta_args.msa_fasta, msa,
            container.fasta_args.muscle_command_for_msa)
    # Splice each variant's homologous region into the reference sequence and write
    # the chimera fasta.  NOTE(review): 'reference_identifier' is not declared on
    # HomomerFastaArguments -- confirm the json key.
    for chimera in list_of_chis:
        homologous_splice = AccessiontoAlignment.alignment_finder(msa, seq_of_interest, chimera.file_stem,
                                                                  container.fasta_args.reference_identifier)[0]
        chimera.chi_seq = ref_sequence.replace(seq_of_interest, homologous_splice)
        ChimeraGenerator.fasta_creation(chimera.chimera_fasta,
                                        [tuple((chimera.chi_seq, subunits, chimera.chimera_stem))])
    # NOTE(review): 'fasta_list_file_name' is not declared on HomomerFastaArguments
    # (the annotation there is 'fasta_file_list_name') -- confirm the json key.
    if fasta_toggles['Make_a_list_of_created_fasta_files']:
        AccessiontoAlignment.create_list_of_fasta_files(fastas,
                                                        container.fasta_args.fasta_list_file_name)
# Submission operation: parse the submission section of the json and hand the
# container plus the full fasta list to the shared submission helper.
if container.operation_toggles['alphafold_submission']:
    container.get_dict_args(HomomerSubmissionArguments, 'submission_args', 'alphafold_submission_args')
    alphafold_submission_for_chimera_container(container, fastas)
# Analysis operation: gather plddt/pdb data for natives, chimeras and the
# reference, compute stability metrics, and write the output csv.
if container.operation_toggles['run_analysis_operation']:
    # Recompute each chimera's spliced sequence (needed when the fasta operation
    # did not run in this invocation).
    for chimera in list_of_chis:
        homologous_splice = AccessiontoAlignment.alignment_finder(msa, seq_of_interest, chimera.file_stem,
                                                                  container.fasta_args.reference_identifier)[0]
        chimera.chi_seq = ref_sequence.replace(seq_of_interest, homologous_splice)
    container.get_dict_args(HomomerAnalysisArguments, 'analysis_args', 'analysis_arguments')
    analysis_toggles = container.analysis_args.analysis_toggles
    ref_file_stem = Path(container.fasta_args.fasta_for_alphafold).stem
    # NOTE(review): '3mer6VSB' is a hard-coded alphafold output folder for the
    # reference prediction -- presumably should derive from the naming config; confirm.
    ref_pdb = path.join(f'{alphafold_dir}3mer6VSB', 'ranked_0.pdb')
    ref_plddt = {ref_sequence: Analysis.get_plddt_dict_from_pdb(ref_pdb)[ref_sequence]}
    if analysis_toggles['make_emboss_files']:
        for chimera in list_of_chis:
            AccessiontoAlignment.alignment_finder(msa, seq_of_interest, chimera.file_stem, container.fasta_args.reference_identifier, container.analysis_args.emboss_command, container.naming_args.emboss_naming.replace(name_placeholder, chimera.file_stem))
    # Extract ranked pdbs from the alphafold output folders when toggled on or missing.
    # NOTE(review): chimera.native_pdb / chimera.chi_pdb look like they are attached
    # by ChimeraGenerator.assign_file_attrs_to_chimeras -- confirm.
    if analysis_toggles["make_pdbs"] or any(not path.exists(chimera.native_pdb) for chimera in list_of_chis) or any(not path.exists(chimera.chi_pdb) for chimera in list_of_chis):
        for chimera in list_of_chis:
            Analysis.generate_alphafold_files(f'{alphafold_dir}{chimera.multimer_stem}',
                                              new_pdb=container.naming_args.pdb_naming.replace(file_placeholder,chimera.multimer_stem))
            Analysis.generate_alphafold_files(f'{alphafold_dir}{chimera.chimera_stem}',
                                              new_pdb=container.naming_args.pdb_naming.replace(file_placeholder,chimera.chimera_stem))
        Analysis.generate_alphafold_files(
            f'{alphafold_dir}{ref_file_stem}',new_pdb=container.naming_args.pdb_naming.replace(file_placeholder, ref_file_stem))
    # Per-chimera confidence metrics keyed by sequence.
    # NOTE(review): native_seq is only set on the msa path above; on the
    # accession-list path this would raise AttributeError -- confirm.
    for chimera in list_of_chis:
        chimera.native_plddt = {
            chimera.native_seq: Analysis.get_plddt_dict_from_pdb(chimera.native_pdb)[chimera.native_seq]}
        chimera.chi_plddt = {chimera.chi_seq: Analysis.get_plddt_dict_from_pdb(chimera.chi_pdb)[chimera.chi_seq]}
        chimera.overall_native_stability = Analysis.overall_confidence(chimera.native_plddt[chimera.native_seq])
        chimera.overall_chimera_stability = Analysis.overall_confidence(chimera.chi_plddt[chimera.chi_seq])
        chimera.sequence_similarity=Analysis.get_sequence_similarity(container.naming_args.emboss_naming.replace(name_placeholder, chimera.file_stem))
    # Optionally dump plddt files for natives, chimeras, and the reference.
    if analysis_toggles['make_plddts']:
        for chimera in list_of_chis:
            Analysis.get_plddt_file_from_pdb(chimera.native_pdb,
                                             container.naming_args.plddt_naming.replace(file_placeholder,chimera.multimer_stem))
            Analysis.get_plddt_file_from_pdb(chimera.chi_pdb,
                                             container.naming_args.plddt_naming.replace(file_placeholder,chimera.chimera_stem))
        Analysis.get_plddt_file_from_pdb(ref_pdb, container.naming_args.plddt_naming.replace(file_placeholder, ref_file_stem))
    # Relative stability of each chimera against its native and the reference.
    for chimera in list_of_chis:
        homologous_splice = AccessiontoAlignment.alignment_finder(msa, seq_of_interest, chimera.file_stem,
                                                                  container.fasta_args.reference_identifier)[0]
        chimera.rel_stability = Analysis.revamped_rs(chimera.native_plddt, chimera.chi_plddt,ref_plddt,
                                                     homologous_splice)
    data_columns = Analysis.determine_columns_from_container(container)
    Analysis.convert_data_dict_to_csv(data_columns, container)
# # TODO have a function to print pdbs and make pdbs for reference
# # TODO make it to be able to simulate native pdb
# # TODO: Create a way for python to check periodically whether the setup slurms are done and then submit the production
# # TODO make certain parts available from commandline
# if operation_toggles['run_gromacs_operation']=='True':
# gromacs_data_dict={}
# gromacs_arguments = argument_dict.gromacs_arguments
# gromacs_toggles = gromacs_arguments.gromacs_toggles
# # TODO if you hand-select, make a file to resubmit with the choices
# if not path.exists(gromacs_arguments.pdbs_to_run) or gromacs_toggles['create new pdb list']:
# with open(analysis_arguments.analysis_output_csv) as data:
# pdbs_to_run=tuple(chimera.chi_pdb for chimera in list_of_chis)
# for row in reader(data):
# print(row)
# for index,pdb in enumerate(pdbs_to_run):
# print(f'{index}. {pdb}')
# pdbs_to_run=tuple(pdbs_to_run[int(index)] for index in input("Submit comma separated list:\n Ex: 0,1,2,7,8\n Enter:").split(','))
# with open(gromacs_arguments.pdbs_to_run,'w') as pdb_list:
# for pdb in pdbs_to_run:
# pdb_list.write(f'{pdb}\n')
# else:
# with open(gromacs_arguments.pdbs_to_run,'r') as pdbs_to_run:
# pdbs_to_run=tuple(pdb.split()[0] for pdb in pdbs_to_run.readlines())
# for pdb in pdbs_to_run:
# system(f'cp {pdb} {container.naming_args["gromacs_slurm_dir"]}')
# gromacs_data_dict.setup_slurms = tuple(
# container.naming_args.gromacs_slurm_dir + Path(pdb).stem + container.naming_args.setup_extension for pdb in
# pdbs_to_run)
# # TODO make folders for gromacs
# if gromacs_toggles['create setup slurms']=='True':
# for index,(pdb,slurm) in enumerate(zip(pdbs_to_run,gromacs_data_dict.setup_slurms)):
# create_setup_slurm(pdb,gromacs_arguments.gmxbin,gromacs_arguments.pdb2gmx,gromacs_arguments.slurm_template,slurm,gromacs_arguments.slurm_output.replace(placeholder,Path(pdb).stem),gromacs_arguments.slurm_error.replace(placeholder,Path(pdb).stem))
# if gromacs_toggles['sbatch setup slurms']:
# for slurm in gromacs_data_dict.setup_slurms:
# system(f"sbatch {slurm}")
# gromacs_data_dict.mdrun_slurms = tuple(
# container.naming_args.gromacs_slurm_dir + Path(pdb).stem + container.naming_args.production_extension for pdb
# in pdbs_to_run)
# if gromacs_toggles['create mdrun slurms']:
# for index, (pdb, slurm) in enumerate(zip(pdbs_to_run, gromacs_data_dict.mdrun_slurms)):
# create_prod_slurm(pdb,gromacs_arguments.gmxbin,gromacs_arguments.slurm_template,slurm,gromacs_arguments.slurm_output.replace(placeholder,Path(pdb).stem),gromacs_arguments.slurm_error.replace(placeholder,Path(pdb).stem))
# if gromacs_toggles['sbatch mdrun slurms']:
# for slurm in gromacs_data_dict.mdrun_slurms:
# system(f"sbatch {slurm}")