/
dsNickFury2.0.6.py
3417 lines (3232 loc) · 190 KB
/
dsNickFury2.0.6.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
'''
Software License
Commercial reservation
This License governs use of the accompanying Software, and your use of the Software constitutes acceptance of this license.
You may use this Software for any non-commercial purpose, subject to the restrictions in this license. Some purposes which can be non-commercial are teaching, academic research, and personal experimentation.
You may not use or distribute this Software or any derivative works in any form for any commercial purpose. Examples of commercial purposes would be running business operations, licensing, leasing, or selling the Software, or distributing the Software for use with commercial products.
You may modify this Software and distribute the modified Software for non-commercial purposes; however, you may not grant rights to the Software or derivative works that are broader than those provided by this License. For example, you may not distribute modifications of the Software under terms that would permit commercial use, or under terms that purport to require the Software or derivative works to be sublicensed to others.
You agree:
1. Not remove any copyright or other notices from the Software.
2. That if you distribute the Software in source or object form, you will include a verbatim copy of this license.
3. That if you distribute derivative works of the Software in source code form you do so only under a license that includes all of the provisions of this License, and if you distribute derivative works of the Software solely in object form you do so only under a license that complies with this License.
4. That if you have modified the Software or created derivative works, and distribute such modifications or derivative works, you will cause the modified files to carry prominent notices so that recipients know that they are not receiving the original Software. Such notices must state: (i) that you have changed the Software; and (ii) the date of any changes.
5. THAT THIS PRODUCT IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS PRODUCT, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. YOU MUST PASS THIS LIMITATION OF LIABILITY ON WHENEVER YOU DISTRIBUTE THE SOFTWARE OR DERIVATIVE WORKS.
6. That if you sue anyone over patents that you think may apply to the Software or anyone's use of the Software, your license to the Software ends automatically.
7. That your rights under the License end automatically if you breach it in any way.
8. UCLA and the Regents of the University of California reserves all rights not expressly granted to you in this license.
9. Nothing in this Agreement grants by implication, estoppel, or otherwise any rights to any intellectual property owned by the Regents of the University of California, except as explicitly set forth in this license.
10. You will hold the Regents of the harmless for all claims, suits, losses, liabilities, damages, costs, fees, and expenses resulting from their respective activities arising from this license.
11. You will not use any name, trade name, trademark, name of any campus, or other designation of the Regents of the University of California in advertising, publicity, or other promotional activity, except as permitted herein.
'''
#This is the path for the python interpreter. You must set it before this will run. If you are unsure what this is, try the command "which python3" or "which pypy3" and try pasting the path returned between the quotes on the next line.
global pythonInterpreterAbsolutePath; global pythonCompilerAbsolutePath
pythonInterpreterAbsolutePath = "/u/local/apps/python/3.4.3/bin/python3"
#pythonInterpreterAbsolutePath = "/u/local/apps/python/3.4.3/bin/python3" #Set the absolute path for your python interpreter here between the quotes. Depending on your system configuration, you may also be able to use a shortcut, such as python3, but that has a greater chance of errors
pythonCompilerAbsolutePath = ""
#USEFUL DEFAULT SETTINGS HERE
global selectionModeTargetLimitPerJob
selectionModeTargetLimitPerJob = 0 #This prevents a user from submitting a job with too many target sites that might overload or degrade performance on the system. Clobber mode can override this. Change this value according to your system's capabilities. Set to 0 or negative value for no limit.
global clusterDefaultSelectionModeParallelJobLimit; global standAloneDefaultSelectionModeParallelJobLimit; global clusterMaxArrayJobs;
clusterDefaultSelectionModeParallelJobLimit = 230 #Limit on how many simultaneous parallel jobs can be going at once on a cluster (set this based on the limit for how many queued items you can have)
standAloneDefaultSelectionModeParallelJobLimit = 1 #Limit on how many simultaneous parallel jobs can be going if run on a single system. Change according to your machine.
clusterMaxArrayJobs = 50 #for an array job, this is the maximum number of crispr sites that can be in a single sequence if there is no selectionModeTargetLimitPerJob set.
global currentVersion
currentVersion = "2.0.6"
global versionName
versionName = "GuideSeq Divided... I never asked for this"
global yearWritten
yearWritten = "2015"
def yesAnswer(question): #asks the question passed in and returns True if the answer is yes, False if the answer is no, and keeps the user in a loop until one of those is given. Also useful for walking students through basic logical python functions
answer = False #initializes the answer variable to false. Not absolutely necessary, since it should be undefined at this point and test to false, but explicit is always better than implicit
while not answer: #enters the loop and stays in it until answer is equal to True
print (question + ' (Y/N)') #Asks the question contained in the argument passed into this subroutine
answer = input('>>') #sets answer equal to some value input by the user
if str(answer) == 'y' or str(answer) == 'Y': #checks if the answer is a valid yes answer
return True #sends back a value of True because of the yes answer
elif str(answer) == 'n' or str(answer) == 'N': #checks to see if the answer is a valid form of no
return False #sends back a value of False because it was not a yes answer
else: #if the answer is not a value indicating a yes or no
print ('Invalid response.')
answer = False #set ansewr to false so the loop will continue until a satisfactory answer is given
def reportUsage(details): #this will report usage back to us. This is required by the people who maintain the Azimuth server. This will not send back any details of your job except if azimuth was used.
import urllib.request
url = "http://pathways.mcdb.ucla.edu/cgi-bin/MW_counter/counter.cgi?counter=" + details
try:
req = urllib.request.urlopen(url)
except:
pass
def printStartUp():
print(" _ _ _ _ _ _ _ _ _ _ _ _ ")
print(" /\\ \\ / /\\ /\\ \\ _ /\\ \\ /\\ \\ /\\_\\ /\\ \\ /\\_\\ /\\ \\ /\\ \\ /\\_\\ /\\ \\ ")
print(" / \\ \\____ / / \\ / \\ \\ /\\_\\ \\ \\ \\ / \\ \\ / / / _ / \\ \\ / / / _ / \\ \\\\ \\ \\ / / // \\ \\ ")
print(" / /\\ \\_____\\ / / /\\ \\__ / /\\ \\ \\_/ / / /\\ \\_\\ / /\\ \\ \\ / / / /\\_\\ / /\\ \\ \\\\ \\ \\__ /\\_\\ / /\\ \\ \\\\ \\ \\_/ / // /\\ \\ \\ ")
print(" / / /\\/___ // / /\\ \\___\\ / / /\\ \\___/ / / /\\/_/ / / /\\ \\ \\ / / /__/ / / / / /\\ \\_\\\\ \\___\\ / / // / /\\ \\_\\\\ \\___/ / \\/_/\\ \\ \\ ")
print(" / / / / / / \\ \\ \\ \\/___// / / \\/____/ / / / / / / \\ \\_\\ / /\\_____/ / / /_/_ \\/_/ \\__ / / / // / /_/ / / \\ \\ \\_/ / / / ")
print(" / / / / / / \\ \\ \\ / / / / / / / / / / / / \\/_/ / /\\_______/ / /____/\\ / / / / / // / /__\\/ / \\ \\ \\ / / / ")
print("/ / / / / /_ \\ \\ \\ / / / / / / / / / / / / / / /\\ \\ \\ / /\\____\\/ / / / / / // / /_____/ \\ \\ \\ / / / _ ")
print("\\ \\ \\__/ / //_/\\__/ / / / / / / / /___/ / /__ / / /________ / / / \\ \\ \\ / / / / / /___/ / // / /\\ \\ \\ \\ \\ \\ / / /_/\\_\\")
print(" \\ \\___\\/ / \\ \\/___/ / / / / / / //\\__\\/_/___\\/ / /_________\\/ / / \\ \\ \\ / / / / / /____\\/ // / / \\ \\ \\ \\ \\_\\/ /_____/ /")
print(" \\/_____/ \\_____\\/ \\/_/ \\/_/ \\/_________/\\/____________/\\/_/ \\_\\_\\\\/_/ \\/_________/ \\/_/ \\_\\/ \\/_/\\________/ ")
print(" _ _ _ ____ ___ ____ _ _ _ _ ____ _ _ ____ ___ ____ ____ ____ ____ ___ ____")
print(" | | | |__| | | |__| \_/ | | | | |__/ | |__| |__/ | __ |___ | [__ ")
print(" |_|_| | | | |___ | | | |__| |__| | \ | | | | \ |__] |___ | ___]")
print(" _ _ _ _ ___ _ _ ___ ____ ___ _ _ ____ _ _ ____ ____ /")
print(" | | | | | |__| |__] | | | |__| |___ \_/ |___ [__ /")
print(" |_|_| | | | | |__] |__| | | | |___ | |___ ___] .")
print("Version " + currentVersion + " (" + versionName + ")")
print("By Michael M. Weinstein, Copyright " + yearWritten)
print("Dan Cohn Laboratory and Collaboratory, University of California, Los Angeles")
def printManual():
import urllib.request
try:
import textwrap
wrap = True
except:
wrap = False
try:
import random
egg = random.randint(1,100)
if egg == 42:
print(urllib.request.urlopen("https://raw.githubusercontent.com/michael-weinstein/dsNickFury2/master/ajd/AJD.txt").read().decode('utf-8')) #this works better if your terminal has a dark background
except:
pass
manualURL = 'https://raw.githubusercontent.com/michael-weinstein/dsNickFury2/master/readme.md'
try:
rawManual = urllib.request.urlopen(manualURL)
text = rawManual.read().decode('utf-8')
print("Manual from " + manualURL)
if not wrap:
print(text)
else:
text = text.split("\n")
for paragraph in text:
print("\n".join(textwrap.wrap(paragraph, width = 120, break_long_words = False)))
except:
print("Unable to download and display manual. Please view it in your browser at:\n\n" + manualURL + "\n")
try:
import random
egg = random.randint(1,100)
if egg == 42:
print(urllib.request.urlopen("https://raw.githubusercontent.com/michael-weinstein/dsNickFury2/master/ajd/AJD.txt").read().decode('utf-8')) #this works better if your terminal has a dark background
except:
pass
quit()
def checkPythonInterpreterAbsolutePath(absPath):
import os
import subprocess
if not absPath:
printManual()
raise RuntimeError("ABORTED: You must set the absolute path for your python interpreter at the beginning of this script.")
if not os.path.isfile(absPath): #if the absolute path is not actually a file, check if we have an alias we can expand upon
import subprocess
try:
absPath = subprocess.check_output(['which',absPath]).decode('utf-8').strip()
except subprocess.CalledProcessError:
raise RuntimeError("ABORTED: Python interpreter not found at " + absPath + " and it does not appear to be a valid alias. Please correct the location.")
return absPath
pythonInterpreterAbsolutePath = checkPythonInterpreterAbsolutePath(pythonInterpreterAbsolutePath)
#===================================Command line Argument Checking===========================================
class Args(object):
def __init__(self):
import os
import argparse #loads the required library for reading the commandline
parser = argparse.ArgumentParser()
parser.add_argument ("--manual", help = "Print out the user manual and sample command lines.", action = 'store_true')
parser.add_argument ("-m", "--mode", help = "Specify how the program is to run (what the desired task is)")
parser.add_argument ("-g", "--genome", help = "Specify the genome for searching or indexing.")
parser.add_argument ("-d", "--inputDirectory", help = "Specify genome index directory for a worker job.")
parser.add_argument ("-f", "--inputfile", help = "Specify a single input genome for indexing")
parser.add_argument ("--tempDir", help = "Temporary directory name for parallel jobs")
parser.add_argument ("--parallelJobs", help = "Max number of parallel jobs at once. (Or per search if running in selection mode)")
parser.add_argument ("-9", "--clobber", help = "Do not ask before overwriting files.", action = 'store_true')
parser.add_argument ("-w", "--workerID", help = "Worker process ID. Users should not be setting this value.")
parser.add_argument ("-s", "--sequence", help = "Sequence of interest. Format: NNNNNNGUIDERNANNNNNN_PAM")
parser.add_argument ("-t", "--mismatchTolerance", help = "Maximim number of mismatches permitted for a positive result.")
parser.add_argument ("--verbose", help = "Run in verbose mode", action = 'store_true')
parser.add_argument ("--mock" , help = "Print exec commands instead of running them.", action = 'store_true')
parser.add_argument ("--ordered", help = "Do not break up chromosomes for parallel analysis.", action = 'store_true')
parser.add_argument ("-c", "--chunkSize", help = "Specify the chunk size for parallel genome annotation.")
parser.add_argument ("--chromosome", help = "Used to specify the chromosome/contig ID. This should be passed by the machine and not the user.")
parser.add_argument ("--start", help = "Used to specify the starting byte of the FASTA for indexing. This should be passed by the machine and not the user.")
parser.add_argument ("--length", help = "Used to specify the bytelength of the chunk to be indexed by the program. This should be passed by the machine and not the user.")
parser.add_argument ("--forceJobIndex", help = "Force the indexing supervisor instance to take a specific job index. Mostly useful for debugging functions.")
parser.add_argument ("--outputDirectory", help = "Directory for outputting search results to a hypervisor. This should generally be passed by the machine and not the user.")
parser.add_argument ("--noCleanup", help = "Leave behind any temporary files for future inspection.", action = 'store_true')
parser.add_argument ("--forceGenome", help = "Force the search supervisor to use this genome directory. Generally this should be passed by the machine and not the user.")
parser.add_argument ("--species", help = "Tell the indexer which species the genome is from.")
parser.add_argument ("--targetSequence", help = "Enter a long sequence here to search for and analyze potential targets.")
parser.add_argument ("--targetFasta", help = "Enter a file name for a FASTA file to search for and analyze potential target sites.")
parser.add_argument ("--targetList", help = "Enter a list of potential sites to analyze for off-target risk.")
parser.add_argument ("--noForcedBases", help = "Prevent forcing bases 1 and/or 3 in the guide RNA to match those submitted for Azimuth analysis")
parser.add_argument ("--skipAzimuth", help = "Do not attempt Azimuth analysis.", action = 'store_true')
parser.add_argument ("--parallelJobLimit", help = "Set a limit on the number of parallel jobs allowed at once in the queue for highly parallelized tasks (this MUST be set below your scheduler's limit for queued jobs for a single user, and should be set 5-10 percent below it).")
parser.add_argument ("--genomeDirectory", help = "Specify an alternate directory to search for suitable indexed genomes.")
parser.add_argument ("--annotationExpansion", help = "Specify how far from the target site to search for an annotated gene/genomic feature (default is 1KB).")
parser.add_argument ("--azimuthSequence", help = "Specify a sequence for Azimuth analysis.")
parser.add_argument ("--outputToFile", help = "In selection mode, dump the output to the filename passed as an argument here.")
parser.add_argument ("--scratchFolder", help = "Specify a directory to use for writing temporary (job) folders.")
parser.add_argument ("--cluster", help = "Specify that this is running on a cluster system.", action = "store_true")
parser.add_argument ("--standAlone", "--standalone", help = "Specify that this is running on a single system or server.", action = "store_true")
parser.add_argument ("--mixed", help = "In selection mode, run each site on a separate node, but parallelize searching within a single node.", action = "store_true")
parser.add_argument ("--directToCompiler", help = "If you already have run the index, but compiling did not complete successfully, try this to recompile.", action = "store_true")
parser.add_argument ("--recreateTree", help = "Recreate the tree pickle for an existing genome index.", action = "store_true")
parser.add_argument ("--bins", help = "Pass a comma-separated list of bins for this compiler instance to work on.")
parser.add_argument ("--treeLevel1", help = "Pass an integer value for the first level of the tree")
parser.add_argument ("--treeLevel2", help = "Pass an integer value for the second level of the tree")
parser.add_argument ("-p", "--preferredPAM", help = "Use this to pass a preferred PAM site for a selection job" )
parser.add_argument ("--arrayJob", help = "This will be used by a selection hypervisor to initiate array jobs", action = "store_true")
parser.add_argument ("-a", "--useArray", help = "Use array jobs on an SGE cluster in mixed mode (recommended if possible, this will turn on mixed mode automatically)", action = "store_true")
parser.add_argument ("--delayIO", help = "Delay search worker IO calls by a random time to avoid IO traffic jams.", action = "store_true")
parser.add_argument ("--totalArrayJobs", help = "Total number of array jobs for this run, used by the array job launcher.")
parser.add_argument ("--spoofTaskID", help = "Pass an integer to force a task ID (rather than pulling one from environment variables). Used for debugging and testing the array job runner.")
parser.add_argument ("--quickExtend", help = "Quick extension mode", action = "store_true")
parser.add_argument ("--guideExtension", help = "Set the length of the extension before the guide to store or use")
parser.add_argument ("--pamExtension", help = "Set the length of the extension after the PAM to store or use")
args = parser.parse_args() #puts the arguments into the args object
if args.arrayJob: #quick trap for array job runners... it will be run during the checkargs phase and return to the main function where it will report itself done and quit
if not args.tempDir:
raise RuntimeError("Unable to run array job without being passed the name of the temporary directory under --tempDir.")
try:
totalArrayJobs = int(args.totalArrayJobs)
except ValueError:
raise RuntimeError("Argument passed as --totalArrayJobs must be an integer. " + str(args.totalArrayJobs) + " was passed instead.")
spoofTaskID = False
if args.spoofTaskID:
try:
spoofTaskID = int(args.spoofTaskID)
except ValueError:
raise RuntimeError("Spoofed task ID must be passed as an integer. We got: " + str(args.spoofTaskID))
arrayJobTrap(args.tempDir, totalArrayJobs, spoofTaskID)
self.mode = "ARRAY RUNNER"
else:
if args.quickExtend: #adding a catch for quick return mode
args.mode = "search"
args.standAlone = True
self.quickExtend = True
else:
self.quickExtend = False
if (args.standAlone and args.cluster) or (args.mixed and args.cluster) or (args.mixed and args.standAlone): #You really can't be both.
raise RuntimeError("Aborted: You must specify --cluster OR --standAlone OR --mixed. You can't be more than one.")
if not (args.standAlone or args.cluster or args.mixed or args.manual) and args.mode in ['search', 'index', 'selection']:
raise RuntimeError("ABORTED: You must specify if this will be running on a stand-alone system or a cluster. (--standAlone or --cluster)")
else:
if args.standAlone:
self.standAlone = True
self.cluster = False
self.mixed = False
elif args.cluster:
self.cluster = True
self.standAlone = False
self.mixed = False
elif args.mixed:
self.cluster = False
self.standAlone = False
self.mixed = True
if args.mode and args.mode.upper() == "INDEX":
raise RuntimeError("Mixed system cannot be specified for an index run.")
if not args.mode and not args.manual: #series of case statements for mode to determine which set of inputs to validate. If no mode was set, it will see if the user is asking for the manual.
raise RuntimeError("ABORTED: No run mode was set on the commandline.")
self.mode = args.mode
if not args.genomeDirectory:
self.genomeListDirectory = "genomes/"
else:
self.genomeListDirectory = args.genomeDirectory
if not self.genomeListDirectory[-1] == "/":
self.genomeListDirectory += "/"
if not os.path.isdir(self.genomeListDirectory) and not self.mode == 'index':
raise RuntimeError("ABORTED: User-specified genome: " + self.genomeListDirectory + " not found.")
if args.mode == 'worker':
self.setWorkerArgs(args)
elif args.mode == 'search':
self.setSearchArgs(args)
elif args.mode == 'index':
self.setIndexArgs(args)
elif args.mode == 'FASTAWorker':
self.setFASTAWorkerArgs(args)
elif args.mode == 'selection':
self.setSelectionArgs(args)
elif args.mode == 'compiler':
self.setCompilerArgs(args)
elif args.manual:
printManual()
quit()
else:
raise RuntimeError('ABORTED: Invalid/no mode set on commandline. Please select a mode or run with --manual set for assistance.')
def setSelectionArgs(self, args): #validating and setting arguments for selection of targets from a user-provided sequence
import os
self.sequence = args.sequence
if not self.sequence:
raise RuntimeError("ABORTED: You must specify a generic sequence to describe your system (see manual, argument --manual) for more information.")
if "_" in args.sequence:
guide, pam = args.sequence.split("_")
try:
guide = int(guide)
except ValueError:
pass
else:
args.sequence = "N"*guide + "_" + pam
self.sequence = args.sequence.upper()
else:
raise RuntimeError("ABORTED: Invalid sequence passed. Please include an underscore between the guide and PAM sequences.")
if not len(self.sequence) > 15 and not args.clobber:
raise RuntimeError("ABORTED: This guide+pam combination appears too short, and will likely cause memory and other errors. Rerun in clobber mode (argument -9) to proceed anyway.")
self.targetSequence = args.targetSequence
self.targetFasta = args.targetFasta
if self.targetFasta and not os.path.isfile(self.targetFasta):
raise RuntimeError("ABORTED: " + self.targetFasta + " is not a valid file.")
self.targetList = args.targetList
if self.targetList and not os.path.isfile(self.targetList):
raise RuntimeError("ABORTED: " + self.targetList + " is not a valid file.")
if not args.genome:
raise RuntimeError("ABORTED: You must set a genome. See manual (run with --manual) for details.")
self.genome = args.genome.upper()
self.verbose = args.verbose
self.mock = args.mock
if args.parallelJobs:
try:
self.parallelJobs = int(args.parallelJobs)
except ValueError:
raise RuntimeError("ABORTED: Parallel jobs argument must be an integer")
else:
self.parallelJobs = 5
if not args.mismatchTolerance:
self.mismatchTolerance = 3
else:
try:
self.mismatchTolerance = int(args.mismatchTolerance)
except ValueError:
raise RuntimeError("ABORTED: Mismatch tolerance must be an integer. Please check your command line options and try again.")
if args.noForcedBases:
if args.noForcedBases == "1":
self.noForcedBases = False
self.noForced1 = True
self.noForced3 = False
if args.noForcedBases == "3":
self.noForcedBases = False
self.noForced1 = False
self.noForced3 = True
else: #if they put in any other argument here, we block forcing either base (this could include 1,3 or all or anything else)
self.noForcedBases = True
self.noForced1 = True
self.noForced3 = True
else: #if the argument was left blank or not passed at all, we allow base forcing
self.noForcedBases = False
self.noForced1 = False
self.noForced3 = False
self.skipAzimuth = args.skipAzimuth
self.noCleanup = args.noCleanup
if not args.parallelJobLimit:
if self.cluster or self.mixed:
self.maxParallelJobs = clusterDefaultSelectionModeParallelJobLimit #set this value at the top of the script for your configuration of choice
if self.standAlone:
self.maxParallelJobs = standAloneDefaultSelectionModeParallelJobLimit
else:
try:
self.maxParallelJobs = int(args.parallelJobLimit)
except ValueError:
raise RuntimeError("ABORTED: Parallel job limit must be an integer.")
if self.maxParallelJobs < self.parallelJobs:
self.maxParallelJobs = self.parallelJobs
self.outputToFile = args.outputToFile
if self.outputToFile and os.path.isfile(self.outputToFile) and not args.clobber:
raise RuntimeError("ABORTED: Output file " + self.outputToFile + " already exists. Run in clobber mode to overwrite.")
self.clobber = args.clobber
self.scratchFolder = args.scratchFolder
if not self.scratchFolder:
self.scratchFolder = "" #making sure this is cast to a string and not a NoneType (although that will probably add to a string with no trouble)
else:
if not self.scratchFolder[-1] == "/":
self.scratchFolder = self.scratchFolder + "/" #make sure that the directory name is passed ending with a slash so we can prepend it directly to our tempDir name
if not os.path.isdir(self.scratchFolder):
try:
os.mkdir(self.scratchFolder)
except OSError:
if not os.path.isdir(self.scratchFolder): #This could happen because of a data race type condition, if one process creates the directory after this one checks for it, but before it creates it. This will catch that problem.
raise RuntimeError("ABORTED: Unable to create scratch folder. Check if directory containing this folder already exists.")
self.preferredPAM = False
self.preferredPAMWeight = False
if args.preferredPAM:
if "," in args.preferredPAM:
if not args.preferredPAM.count(",") == 1:
raise RuntimeError("PreferredPAM argument can be passed with only one comma-separated value for SITE,PREFERENCEVALUE. Passed value was " + args.preferredPAM)
self.preferredPAM, self.preferredPAMWeight = args.preferredPAM.split(",")
self.preferredPAM = self.preferredPAM.upper()
try:
self.preferredPAMWeight = int(self.preferredPAMWeight)
except ValueError:
try:
self.preferredPAMWeight = float(self.preferredPAMWeight)
except:
raise RuntimeError("PreferredPAM argument must be a number. Program was passed " + self.preferredPAMWeight + " in preferredPAM argument " + args.preferredPAM)
else:
self.preferredPAM = args.preferredPAM.upper()
self.preferredPAMWeight = False
systemPAM = self.sequence.split("_")[1]
systemPAMList = NondegenerateBases(systemPAM).permutations()
self.preferredPAMList = NondegenerateBases(self.preferredPAM).permutations()
for preferredSite in self.preferredPAMList:
if not preferredSite.upper() in systemPAMList:
raise RuntimeError("ABORTED: Your preferred PAM site of " + self.preferredPAM + " is not a subset for the system PAMs of " + systemPAM + ".")
self.useArray = False
if args.useArray:
if not self.mixed:
if self.standAlone:
raise RuntimeError("Array mode is invalid in stand alone mode. Please run in mixed mode and try again.")
self.mixed = True
self.cluster = False
print("Array use must be done in mixed mode to avoid overloading the job scheduler.")
if not yesAnswer("Change to mixed mode now?"):
quit("Ok, exiting.")
self.useArray = True
if args.guideExtension:
try:
self.guideExtension = int(args.guideExtension)
except ValueError:
raise RuntimeError("Error: Guide extension value must be an integer.")
else:
self.guideExtension = 4
if args.pamExtension:
try:
self.pamExtension = int(args.pamExtension)
except ValueError:
raise RuntimeError("Error: PAM extension value must be an integer")
else:
self.pamExtension = 3
def setWorkerArgs(self, args): #Validating arguments for a search worker. This should not require too much validation, as users should not be launching worker processes themselves
self.mode = "worker"
self.workerID = args.workerID
self.tempDir = args.tempDir
self.sequence = args.sequence
self.mismatchTolerance = int(args.mismatchTolerance)
self.inputDirectory = args.inputDirectory
self.verbose = args.verbose
self.skipAzimuth = True
self.delayIO = False
if args.delayIO:
self.delayIO = True
if args.guideExtension:
try:
self.guideExtension = int(args.guideExtension)
except ValueError:
raise RuntimeError("Error: Guide extension value must be an integer.")
else:
self.guideExtension = 4
if args.pamExtension:
try:
self.pamExtension = int(args.pamExtension)
except ValueError:
raise RuntimeError("Error: PAM extension value must be an integer")
else:
self.pamExtension = 3
def setSearchArgs(self, args): #Validating arguments for launching a search supervisor. This will require good validations, as users are likely to be launching this on their own.
import os
self.mode = "search"
self.sequence = args.sequence
if not self.sequence:
raise RuntimeError("ABORTED: You must specify a sequence to search for. Remember to place an underscore between the guide and PAM sequences.")
if not "_" in self.sequence:
raise RuntimeError("ABORTED: You must include an underscore '_' in your sequence between the guide RNA portion and the PAM sequence.")
if not len(self.sequence) > 15 and not args.clobber:
raise RuntimeError("ABORTED: This guide+pam combination appears too short, and will likely cause memory and other errors. Rerun in clobber mode (argument -9) to proceed anyway.")
self.sequence = self.sequence.upper()
if not args.mismatchTolerance:
if args.quickExtend:
self.mismatchTolerance = 0
else:
self.mismatchTolerance = 3
else:
try:
self.mismatchTolerance = int(args.mismatchTolerance)
except ValueError:
raise RuntimeError("ABORTED: Mismatch tolerance must be an integer. Please check your command line options and try again.")
self.tempDir = args.tempDir
self.workerID = args.workerID
self.inputDirectory = args.inputDirectory
self.verbose = args.verbose
self.mock = args.mock
if not args.forceGenome:
self.genome = args.genome.upper() #if a genome is forced by a hypervisor function, this will prevent an error from trying to uppercase a NoneType variable.
if args.parallelJobs:
try:
self.parallelJobs = int(args.parallelJobs)
except ValueError:
raise RuntimeError("ABORTED: Parallel jobs argument must be an integer")
elif args.quickExtend:
self.parallelJobs = 1
else:
self.parallelJobs = 12
self.outputDirectory = args.outputDirectory
self.noCleanup = args.noCleanup
self.forceGenome = args.forceGenome
self.skipAzimuth = True
self.annotationExpansion = args.annotationExpansion
if not self.annotationExpansion:
self.annotationExpansion = 1000
else:
try:
self.annotationExpansion = int(self.annotationExpansion)
except ValueError:
raise RuntimeError("ABORTED: Annotation expansion range must be an integer value.")
self.azimuthSequence = args.azimuthSequence
if not self.azimuthSequence or self.azimuthSequence == "False":
self.azimuthSequence = False
self.scratchFolder = args.scratchFolder
if not self.scratchFolder:
self.scratchFolder = "" #making sure this is cast to a string and not a NoneType (although that will probably add to a string with no trouble)
else:
if not self.scratchFolder[-1] == "/":
self.scratchFolder = self.scratchFolder + "/" #make sure that the directory name is passed ending with a slash so we can prepend it directly to our tempDir name
if not os.path.isdir(self.scratchFolder):
try:
os.mkdir(self.scratchFolder)
except OSError:
raise RuntimeError("ABORTED: Unable to create scratch folder. Check if directory containing this folder already exists.")
self.clobber = False
if args.clobber:
self.clobber = True
self.preferredPAM = False
self.preferredPAMWeight = False
if args.preferredPAM:
if "," in args.preferredPAM:
if not args.preferredPAM.count(",") == 1:
raise RuntimeError("PreferredPAM argument can be passed with only one comma-separated value for SITE,PREFERENCEVALUE. Passed value was " + args.preferredPAM)
self.preferredPAM, self.preferredPAMWeight = args.preferredPAM.split(",")
try:
self.preferredPAMWeight = int(self.preferredPAMWeight)
except ValueError:
try:
self.preferredPAMWeight = float(self.preferredPAMWeight)
except:
raise RuntimeError("PreferredPAM argument must be a number. Program was passed " + self.preferredPAMWeight + " in preferredPAM argument " + args.preferredPAM)
else:
self.preferredPAM = args.preferredPAM.upper()
self.preferredPAMWeight = False
systemPAM = self.sequence.split("_")[1]
self.preferredPAMList = NondegenerateBases(self.preferredPAM).permutations()
if not self.forceGenome: #if we have a forceGenome argument, this is a subprocess of a selection job and this has already been checked. It will also fail, since a nondegenerate PAM will be passed.
systemPAMList = NondegenerateBases(systemPAM).permutations()
for preferredSite in self.preferredPAMList:
if not preferredSite in systemPAMList:
raise RuntimeError("ABORTED: Your preferred PAM site of " + self.preferredPAM + " is not a subset for the system PAMs of " + systemPAM + ".")
self.useArray = False
if args.useArray:
self.useArray = True
self.delayIO = False
if args.delayIO:
self.delayIO = True
if args.guideExtension:
try:
self.guideExtension = int(args.guideExtension)
except ValueError:
raise RuntimeError("Error: Guide extension value must be an integer.")
else:
self.guideExtension = 4
if args.pamExtension:
try:
self.pamExtension = int(args.pamExtension)
except ValueError:
raise RuntimeError("Error: PAM extension value must be an integer")
else:
self.pamExtension = 3
def setIndexArgs(self, args): #Validating arguments for launching an indexing supervisor. This will also require good validations as users are likely to be launching this on their own.
import os
self.mode = "index"
if not args.sequence:
raise RuntimeError("ABORTED: No search sequence specified.")
if "_" in args.sequence:
guide, pam = args.sequence.split("_")
try:
guide = int(guide)
except ValueError:
pass
else:
args.sequence = "N"*guide + "_" + pam
self.sequence = args.sequence.upper()
else:
raise RuntimeError("ABORTED: Invalid sequence passed. Please include an underscore between the guide and PAM sequences.")
if not len(self.sequence) > 15 and not args.clobber:
raise RuntimeError("ABORTED: This guide+pam combination appears too short, and will likely cause memory and other errors. Rerun in clobber mode (argument -9) to proceed anyway.")
if not args.inputfile:
raise RuntimeError("ABORTED: No FASTA specified for searching.")
if os.path.isfile(args.inputfile):
self.inputfile = args.inputfile
else:
raise RuntimeError("ABORTED: FASTA file: " + args.inputfile + " not found.")
if not args.genome:
raise RuntimeError("ABORTED: You must specify the name you want to identify this genome by.")
self.genome = args.genome.upper()
self.clobber = args.clobber
self.mock = args.mock
self.tempDir = args.tempDir
self.ordered = args.ordered
self.workerID = args.workerID
self.species = args.species.upper()
if args.chunkSize:
try:
args.chunkSize = int(args.chunkSize)
except ValueError:
raise RuntimeError("ABORTED: Invalid chunk size passed as argument (must be an integer)")
self.chunkSize = args.chunkSize
else:
self.chunkSize = 20000000
if self.chunkSize < 100:
self.chunkSize = 1000000 * self.chunkSize
if args.forceJobIndex:
try:
tester = int(args.forceJobIndex) #Leave this argument as a string until it is used so that a zero value can be passed for the index and will still evaluate to true.
except ValueError:
raise RuntimeError("ABORTED: Forced job index argument must be an integer so that it can be used as an index. If you don't understand why this is, you probably should not be messing with this argument.")
self.forceJobIndex = args.forceJobIndex
else:
self.forceJobIndex = False
self.noCleanup = args.noCleanup
self.skipAzimuth = True
self.verbose = args.verbose
if not args.parallelJobLimit:
if self.cluster:
self.maxParallelJobs = clusterDefaultSelectionModeParallelJobLimit
if self.standAlone:
self.maxParallelJobs = standAloneDefaultSelectionModeParallelJobLimit
else:
try:
self.maxParallelJobs = int(args.parallelJobLimit)
except ValueError:
raise RuntimeError("ABORTED: Parallel job limit must be an integer.")
else:
if self.maxParallelJobs < 1:
raise RuntimeError("ABORTED: Parallel job limit must be greater than 0")
self.scratchFolder = args.scratchFolder
if not self.scratchFolder:
self.scratchFolder = "" #making sure this is cast to a string and not a NoneType (although that will probably add to a string with no trouble)
else:
if not self.scratchFolder[-1] == "/":
self.scratchFolder = self.scratchFolder + "/" #make sure that the directory name is passed ending with a slash so we can prepend it directly to our tempDir name
if not os.path.isdir(self.scratchFolder):
try:
os.mkdir(self.scratchFolder)
except OSError:
raise RuntimeError("ABORTED: Unable to create scratch folder. Check if directory containing this folder already exists.")
self.directToCompiler = False
self.recreateTree = False
if args.recreateTree:
if args.directToCompiler:
raise RuntimeError("You cannot set both the --directToCompiler and --recreateTree arguments. If you need both, --directToCompiler will run both for you.")
else:
self.recreateTree = True
self.directToCompiler = True
else:
if args.directToCompiler:
self.directToCompiler = True
if args.treeLevel1:
try:
args.treeLevel1 = int(args.treeLevel1)
except ValueError:
raise RuntimeError("ABORTED: Invalid tree level 1 size passed as argument (must be an integer). We got " + str(args.treeLevel1))
self.treeLevel1 = args.treeLevel1
else:
self.treeLevel1 = 5
if args.treeLevel2:
try:
args.treeLevel2 = int(args.treeLevel2)
except ValueError:
raise RuntimeError("ABORTED: Invalid tree level 2 size passed as argument (must be an integer). We got " + str(args.treeLevel2))
self.treeLevel2 = args.treeLevel2
else:
self.treeLevel2 = 5
if args.guideExtension:
try:
self.guideExtension = int(args.guideExtension)
except ValueError:
raise RuntimeError("Error: Guide extension value must be an integer.")
else:
self.guideExtension = 4
if args.pamExtension:
try:
self.pamExtension = int(args.pamExtension)
except ValueError:
raise RuntimeError("Error: PAM extension value must be an integer")
else:
self.pamExtension = 3
def setFASTAWorkerArgs(self, args): #Validate arguments for launching a FASTA indexing worker. Users are unlikely to be launching this on their own.
import os
self.mode = "FASTAWorker"
self.chromosome = args.chromosome
self.start = args.start
self.length = args.length
if "_" in args.sequence:
self.sequence = args.sequence
else:
raise RuntimeError("ABORTED: Invalid sequence passed to worker. Please include an underscore between the guide and PAM sequences.")
if os.path.isfile(args.inputfile):
self.inputfile = args.inputfile
else:
raise RuntimeError("ABORTED: FASTA file: " + args.inputfile + " not found.")
self.genome = args.genome.upper()
self.tempDir = args.tempDir
self.workerID = args.workerID
self.chunkSize = args.chunkSize
#if os.path.isdir(args.tempDir):
# self.tempDir = args.tempDir
#else:
# raise RuntimeError("ABORTED: Unable to detect temporary directory: " + args.tempDir)
self.skipAzimuth = True
self.species = args.species
self.verbose = args.verbose
try:
args.treeLevel1 = int(args.treeLevel1)
except ValueError:
raise RuntimeError("ABORTED: Invalid tree level 1 size passed as argument (must be an integer). We got " + str(args.treeLevel1))
self.treeLevel1 = args.treeLevel1
try:
args.treeLevel2 = int(args.treeLevel2)
except ValueError:
raise RuntimeError("ABORTED: Invalid tree level 2 size passed as argument (must be an integer). We got " + str(args.treeLevel2))
self.treeLevel2 = args.treeLevel2
if args.guideExtension:
try:
self.guideExtension = int(args.guideExtension)
except ValueError:
raise RuntimeError("Error: Guide extension value must be an integer.")
else:
self.guideExtension = 4
if args.pamExtension:
try:
self.pamExtension = int(args.pamExtension)
except ValueError:
raise RuntimeError("Error: PAM extension value must be an integer")
else:
self.pamExtension = 3
def setCompilerArgs(self, args):
import os
self.mode = "compiler"
if "_" in args.sequence:
self.sequence = args.sequence
else:
raise RuntimeError("ABORTED: Invalid sequence passed to worker. Please include an underscore between the guide and PAM sequences.")
self.genome = args.genome.upper()
self.tempDir = args.tempDir
self.workerID = args.workerID
self.skipAzimuth = True
self.species = args.species
self.verbose = args.verbose
if not args.bins:
raise RuntimeError("Tried to run a bin compiler without specifying bins to compile. It got bored.")
self.bins = args.bins
self.noCleanup = False
if args.noCleanup:
self.noCleanup = True
#=================================================SGE Array Job trapper==============================================================================
def arrayJobTrap(tempDir, totalArrayJobs, spoofTaskID):
import os
import pickle
import time
import random
if not spoofTaskID:
try:
thisJob = int(os.environ["SGE_TASK_ID"]) #Yes, yes, this is a fertile job and we will thrive. We will rule over all this job and we will call it... thisJob.
except KeyError: #I think we should call it your unhandled exception
raise RuntimeError("Unable to find a valid task ID in OS environment variables.") #Arghhh... curse your sudden but inevitable betrayal!
else:
thisJob = int(spoofTaskID)
random.seed(thisJob)
sleepyTime = random.uniform(0,totalArrayJobs/1.5) #stagger the starts of these jobs to control how much simultaneous IO we run
print("Random hold for " + str(sleepyTime) + " seconds to avoid overloading the IO system.")
time.sleep(sleepyTime)
jobArrayFilePath = tempDir
if not tempDir.endswith(os.sep):
jobArrayFilePath += os.sep
jobArrayFilePath += "jobArray.pkl"
try:
jobArrayFile = open(jobArrayFilePath,"rb")
except FileNotFoundError:
raise RuntimeError("Unable to find job array file at " + jobArrayFile)
except: #Any error besides a missing file could be due to excessive simultaneous access. If so, wait some random amount of time and try again
try:
jobArrayFile = open(jobArrayFilePath,"rb")
except: #Give it one more shot. If it still fails, we probably have a serious problem on our hands want an unhandled exception.
jobArrayFile = open(jobArrayFilePath,"rb")
jobArray = pickle.load(jobArrayFile)
jobArrayFile.close()
try:
bashFileList = jobArray[thisJob - 1] #Subtract 1 because python indexes to 0 while the SGE scheduler indexes to 1
except IndexError:
raise RuntimeError("Tried to run job number " + str(thisJob) + " from the job array at " + jobArrayFilePath + ". It was not found.")
print("Running array job number " + str(thisJob - 1))
# try:
# nodeNumber = os.environ["HOSTNAME"].replace("n","")
# nodeNumber = int(nodeNumber)
# except ValueError:
# nodeNumber = False
# if False and nodeNumber and nodeNumber in range(2208,2222): #This seems to be a bad group of nodes.
# import sys
# sys.exit(99)
# else:
for bashFilePath in bashFileList:
os.system("bash " + bashFilePath)
#=================================================Sequence target analysis/hypervisor objects=================================================================================================================================
class TargetSite(object): #This object holds attributes that describe a potential target site found in the user-defined target sequence. This can be extended as we get better ways to describe potential target sites. Only needs a cut site sequence to be initialized, all else can be set later.
def __init__(self, cutSeq, longSeq = False):
self.longSeq = longSeq
self.cutSeq = cutSeq #this value should come in with the underscore already added between guide and pam
self.matches = {} #This is designed to hold the dictionary of match/mismatch sites that gets passed from the search function
for i in range(0, args.mismatchTolerance + 1):
self.matches[i] = []
self.azimuthScore = -1 #Default value for this is -1 to indicate no score.
self.score = False
self.acceptable = True #This flag gets set to false if the target has more than one perfect match in the genome
self.mismatchRisk = False #This is a function of how well-matched the sites are and if they are in or near an annotated gene
self.tooManyMatches = False #This flags any sites with more than 100 * mismatch tolerance potential off-targets. They're not especially useful for genome editing, generate obscenely huge outputs, and severely degrade performance when annotated.
def calculateSortValue(self):
self.sortValue = (self.mismatchRisk, -self.azimuthScore)
class TargetFinder(object): #This object is analogous to a FASTA indexer, except designed to deal with smaller sequences and can be extended to collect larger windows for analysis in azimuth
def __init__(self, target):
self.target = target #This is the target sequence passed in. Should be supplied by the user either on the command line or in a FASTA file
self.longSeq = False #If we can generate a 30bp site to pass to azimuth, this is where it gets stored. Note that we may have to force it a bit if we try to apply their model to other systems
self.lastGroup = 0
self.done = False
self.cutWindow = len(args.sequence) - 1 #subtract out the underscore
self.start = 0 #start is inclusive
self.end = self.cutWindow
self.guide, self.pam = args.sequence.split("_")
if args.preferredPAM:
self.pam = args.preferredPAM #If the user specified an optimal PAM sequence, we will use that instead. It will still probably be degenerate, but a bit more restricted.
self.pamList = NondegenerateBases(self.pam).permutations()
self.pamLength = len(self.pam)
self.matches = [] #initialize an empty list to hold our match sites (which will be TargetSite class instances)
self.done = False
self.forceAzimuthPam = False #This indicates that we have forced the azimuth model for this system by trimming the fixed bases on the end of the pam
self.forceGuide1 = False #This indicates that we have forced the azimuth model by making the 5' base we submit into the 5' base on the guide
self.forceGuide3 = False #This indicates we have done the same thing with the second base from the 5' end (base 3 if the guide is indexed to 1)
def findMatches(self): #main running function for this object, actually runs the search, gets azimuth scores if needed, and returns the list of matches
while not self.done:
windowSeq = self.target[self.start:self.end]
revComp = str(ReverseComplement(windowSeq))
if windowSeq[-self.pamLength:] in self.pamList:
guide = windowSeq[:-self.pamLength]
pam = windowSeq[-self.pamLength:]
if not args.skipAzimuth:
longSeq = self.getLongSeq(guide, pam,'+') #tries to get an extended sequence for azimuth analysis
self.matches.append(TargetSite(guide + "_" + pam, longSeq))
if revComp[-self.pamLength:] in self.pamList:
guide = revComp[:-self.pamLength]
pam = revComp[-self.pamLength:]
if not args.skipAzimuth:
longSeq = self.getLongSeq(guide, pam,'-')
self.matches.append(TargetSite(guide + "_" + pam, longSeq))
self.advance()
# if not args.skipAzimuth:
# self.useAzimuth = True
# else:
# self.useAzimuth = False
# if self.useAzimuth:
# self.azimuthAPIkey = self.getAzimuthAPIkey()
# if self.azimuthAPIkey:
# self.assignAzimuthScores()
return self.matches
def advance(self): #moves the window ahead one character, then checks to see if it has reached the end
self.start += 1
self.end += 1
self.done = self.end > len(self.target)
def getLongSeq(self, guide, pam, strand): #this method gets an extended sequence for azimuth or other analysis if possible
pamExtensionLength = 3
guideExtensionLength = 24 - len(self.guide)
try: #we need a try/except block for this because it is possible that the extended sequence will run us off the end of the sequence
if strand == '+':
pamEnd = self.end + pamExtensionLength
guideStart = self.start - guideExtensionLength
if pamEnd > len(self.target) or guideStart < 0:
return False
pamExtension = self.target[self.end : pamEnd]
guideExtension = self.target[self.start - guideExtensionLength : self.start]
if strand == '-':
pamStart = self.start - pamExtensionLength
guideEnd = self.end + guideExtensionLength
if pamStart < 0 or guideEnd > len(self.target):
return False
pamExtension = self.target[self.start - pamExtensionLength : self.start]
guideExtension = self.target[self.end : self.end + guideExtensionLength]
pamExtension = str(ReverseComplement(pamExtension))
guideExtension = str(ReverseComplement(guideExtension))
except IndexError: #If we get something near the end of the given sequence where we try to read off the end, we just return False for this value. Later, this will tell us not try submitting it for analysis.
return False
if not len(pam) == 3: #if we have to force the pam, we will warn the user
if not self.forceAzimuthPam:
print("WARNING: Attempting to force conformity of PAM site to the Azimuth model. Predictions based on forced projections may not be as accurate.")
self.forceAzimuthPam = True
pam = pam[:2]
extendedSeq = guideExtension + guide + pam + pamExtension
# if not len(guide) == 20 and not args.noForcedBases:
# extendedSeq = list(extendedSeq) #making it a list so that I can change individual characters by their index
# if not extendedSeq[4] == guide[0] and not args.noForced1:
# extendedSeq[4] = guide[0]
# if not self.forceGuide1:
# print("Forcing guide base 1 into position 1 for azimuth analysis. Predictions based on forced projections may not be as accurate.")
# self.forceGuide1 = True
# if not extendedSeq[6] == guide[2] and not args.noForced3:
# extendedSeq[6] = guide[2]
# if not self.forceGuide3:
# print("Forcing guide base 3 into position 3 for azimuth analysis. Predictions based on forced projections may not be as accurate.")
# self.forceGuide3 = True
# extendedSeq = str(extendedSeq) #return the value back to a string for later submission
return extendedSeq
def getAzimuth(self, sequence, failedPrevious = False): #this handles communication with the azure server to get a score. This can later be replaced if we decide to run a local instance with the source code.
import urllib.request
import json
import time
import sys #for error catching
data = {
"Inputs":{
"input1":{
"ColumnNames":["sequence", "cutsite", "percentpeptide"],
"Values":[[sequence, "-1", "-1"]]
}
},
"GlobalParameters": {}
}
body = str.encode(json.dumps(data))
#url = 'https://ussouthcentral.services.azureml.net/workspaces/ee5485c1d9814b8d8c647a89db12d4df/services/c24d128abfaf4832abf1e7ef45db4b54/execute?api-version=2.0&details=true'
url = 'https://ussouthcentral.services.azureml.net/workspaces/ee5485c1d9814b8d8c647a89db12d4df/services/5c6cbabaef4947b4b7425e934b6f7d6b/execute?api-version=2.0&details=true' #slower, but only one working for now. Use for testing
api_key = self.azimuthAPIkey
headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key)}
req = urllib.request.Request(url, body, headers)
try:
response = urllib.request.urlopen(req)
result = response.read().decode('utf-8')
result = json.loads(result)
return float(result['Results']['output2']['value']['Values'][0][0])
except urllib.error.HTTPError as error:
if error.code == 401:
print("Unable to use Azimuth due to a possible invalid API key. Please check on the status of key: " + self.azimuthAPIkey)
else:
print("The Azimuth request failed with status code: " + str(error.code))
print(error.info())
print(json.loads(error.read().decode('utf-8')))
self.useAzimuth = False
return -1 #Remember that -1 is our placeholder value for a failed attempt or no attempt.
except urllib.error.URLError:
if not failedPrevious:
time.sleep(5) #wait 5 seconds before retry
return self.getAzimuth(sequence, True)
else:
print("Unable to reach/find Azimuth server. Please confirm you are connected to the internet.")
self.useAzimuth = False
return -1
except: #Allowing this for now while dealing with many possible exceptions due to experimental server and software
if not failedPrevious:
time.sleep(5)
return self.getAzimuth(sequence, True) #give it another go, because why not...
else:
error = sys.exc_info()
print("Unexpected error in Azimuth scoring:")
for item in error:
print(item)
return -1
def getAzimuthAPIkey(self): #this gets the API key from a file
import os
if os.path.isfile("azimuth.apikey"):
file = open("azimuth.apikey", 'r')
key = file.read()
file.close()
key = key.strip()
return key
else:
print("Unable to run azimuth. Cannot locate API key. Please save the key the same directory as this program under filename azimuth.apikey.")
return False
def assignAzimuthScores(self): #iterate over all our matches and get an azimuth score if there is a saved extended sequence, otherwise it is left as the default -1 value
for i in range(0,len(self.matches)):
if self.matches[i].longSeq and self.useAzimuth:
self.matches[i].azimuthScore = self.getAzimuth(self.matches[i].longSeq)
class TargetSelection(object): #This is the main running object for the target selection job
def __init__(self):
printStartUp()
reportUsage("SELECTION")
self.targetList = []
self.indexedGenome = self.selectIndexedGenome() #we will pass this to the search supervisor. This will save each supervisor a few seconds (probably not significant) and will cover for the potential loss of degeneracy when we pass the sequence to searcher agents
print("Checking for target sites")
self.getTargetSequence()
if not self.targetList:
self.targetList = TargetFinder(self.target).findMatches()
if not self.targetList:
raise RuntimeError('ABORTED: No suitable target sequences found.')
if selectionModeTargetLimitPerJob > 0 and len(self.targetList) > selectionModeTargetLimitPerJob and not args.clobber:
raise RuntimeError("ABORTED: Too many targets in sequence. Try running a shorter target sequence, a more specific Crispr system, or using clobber mode (argument -9) to override this.")
print("Found " + str(len(self.targetList)) + " potential target sites.")
self.createTempDir()
if not args.useArray: