/
BisPin_index.py
executable file
·217 lines (207 loc) · 11.8 KB
/
BisPin_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#!/usr/bin/python
import optparse, os, sys, multiprocessing, subprocess, datetime
from Utilities import Constants
from Utilities import IndentedHelpFormatterWithNL
from Utilities import BisPin_util
logstr = "\nBisPin_index: "
"""
@author: Jacob Porter
@summary: Builds BFAST indices. The reference genomes must already be converted.
TODO: Add depth, etc. to indices.
TODO: Creating the brg file should probably be separated because it needs to be done before the index file is created.
"""
def fasta2brg(path_to_bfast, directory, filename, CTorGA, orig = False):
"""
Converts a FASTA file into a BFAST binary format, brg.
This function is called after the reference genome is converted.
@param path_to_bfast: a string giving the location of the BFAST executable.
@param directory: the directory where the reference FASTA file is located
@param filename: the filename of the reference FASTA file
@param CTorGA: if True, indicates that the FASTA file is C to T converted
If False, the FASTA file is G to A converted.
This is used to get the correct name of the reference file.
@param orig: when True, the FASTA file is an unconverted reference file.
"""
if not orig:
str1 = Constants.CONV_CT if CTorGA else Constants.CONV_GA
genome_path = os.path.join(directory, filename + ".BisPin." + str1)
else:
genome_path = os.path.join(directory, filename)
BRG_path = os.path.join(genome_path + ".nt" + ".brg")
if os.path.exists(BRG_path):
sys.stderr.write("%sThe BRG file %s already exists, so it will not be created.\n" % (logstr, str(BRG_path)))
sys.stderr.flush()
else:
sys.stderr.write("%sThe BRG file %s does NOT exist, so it will be created.\n" % (logstr, str(BRG_path)))
sys.stderr.flush()
subprocess.call([path_to_bfast, 'fasta2brg', '-f', genome_path, "-A", "0"])
def create_index(fastafile, path_to_bfast, mask, hashWidth, indexNumber, depth, num_threads, tmpDir, CTorGA):
"""
Create a BFAST index
@see: BFAST documentation for more information on the index building paraemters.
@param fastafile: A string giving the location of a FASTA file.
@param path_to_bfast: a string giving the location of the BFAST executable.
@param mask: A BFAST index mask
@param hashWidth: A BFAST index hash width
@param indexNumber: A BFAST index number
@param depth: A BFAST depth
@param num_threads: the number of threads for BFAST to use (This seems to have no effect.)
@param tmpDir: A temporary directory for BFAST to write to.
@param CTorGA: When True, the FASTA file is expected to conform to the naming conventions of a
C to T converted reference genome. When False, it should correspond to a G to A converted file.
"""
str1 = "CT" if CTorGA else "GA"
converted_genome_path = os.path.join(fastafile + ".BisPin." + str1)
converted_genome_path = str(converted_genome_path)
subprocess.call([path_to_bfast, 'index', '-f', converted_genome_path, '-m', mask, '-w', hashWidth, '-i', indexNumber, '-d', depth, '-n', num_threads, '-T', tmpDir])
def create_regular_index(fastafile, path_to_bfast, mask, hashWidth, indexNumber, depth, num_threads, tmpDir):
"""
Creates a BFAST index for a regular unconverted reference genome.
@see: create_index help documentation has information on the parameters.
"""
genome_path = os.path.join(fastafile)
subprocess.call([path_to_bfast, 'index', '-f', genome_path, "-m", mask, "-w", hashWidth, '-i', indexNumber, '-d', depth, '-n', num_threads, '-T', tmpDir])
def create_bwa_converted_index(fastafile, path_to_bwa, block_size, CTorGA):
"""
Experimental code for BWA indices.
@status: Experimental
"""
str1 = "CT" if CTorGA else "GA"
converted_genome_path = os.path.join(fastafile + ".BisPin." + str1)
subprocess.call([path_to_bwa, 'index', '-b', block_size, converted_genome_path])
def create_bwa_regular_index(fastafile, path_to_bwa, block_size):
"""
Experimental code for BWA indices.
@status: Experimental
"""
genome_path = os.path.join(fastafile)
subprocess.call([path_to_bwa, 'index', '-b', block_size, genome_path])
def main():
"""
The entry point into the program.
"""
#Create the command line interface.
usage = "usage: %prog [options] <reference_genome_file> "
version = "%prog " + Constants.version
description = "Create indices for the reference genome. The file naming convention for the converted genome is expected to be in BisPin_covert format."
epilog = Constants.creation_string
p = optparse.OptionParser(usage = usage, version = version, description = description,
epilog = epilog, formatter = IndentedHelpFormatterWithNL.IndentedHelpFormatterWithNL())
p.add_option('--path', '-P', help='The path to the BFAST executable file. If this option is not used, then the PATH variable will be searched for the executable.', default = None)
p.add_option('--tmpDir','-T',help= 'Specifies the directory to store temporary files. [default: the directory where the outputfile is located.]', default = None)
p.add_option('--regular', '-r', help='Create an index for the regular unconverted reference genome. This is for processing hairpin data for the recovery step. [default: %default]', action='store_true', default=False)
#p.add_option('--aligner', '-a', help='The code string for the aligner to use. [default: %default]\nOptions:\n' + Constants.BFAST +'\n' + Constants.BWA, default=Constants.BFAST)
p.add_option('--sequential', '-s', help='Create the indices for the converted reference genome in sequence instead of in parallel. [default: %default]', action='store_true', default=False)
bfast_group = optparse.OptionGroup(p, "BFAST index options.")
bfast_group.add_option('--mask', '-m', help='The mask or spaced seed to use. The mask is a set of zero and ones (must start and end with a one). [default: %default]', default = '11111111111111111111')
bfast_group.add_option('--hashWidth', '-w', help='The length of the hashed string (key size for the hash index). This must be less than or equal to the number of ones in the mask. [default: %default]', default = 14)
bfast_group.add_option('--indexNumber', '-i', help='Specifies this is the ith index you are creating. This is useful when multiple indexes from the same reference are to be created (in the same space). [default: %default]', default='1')
#p.add_option('--num_threads', '-n', help='The number of threads for each BFAST process to use. [default: %default]', default='1')
#p.add_option('--depth' , '-d', help= 'The depth of splitting (d). The index will be split into 4^d parts. [default: %default]', default='0')
p.add_option_group(bfast_group)
# bwa_group = optparse.OptionGroup(p, "BWA index options.")
# p.add_option('--block_size', '-b', help='block size for the bwtsw algorithm [default: %default]', default='10000000')
# p.add_option_group(bwa_group)
#Extract arguments
options, args = p.parse_args()
now = datetime.datetime.now()
if len(args) == 0:
p.print_help()
if len(args) != 1:
p.error("There must be one argument.")
fastafile = args[0]
#path_to_aligner = args[0]
aligner_type = Constants.BFAST #options.aligner.upper()
if aligner_type != Constants.BFAST and aligner_type != Constants.BWA:
p.error("The aligner is not recognized. Please choose the aligner from the given options.")
if options.path == None:
path_to_aligner = BisPin_util.which("bfast")
else:
path_to_aligner = BisPin_util.which(options.path)
if path_to_aligner == None:
p.error("The BFAST executable could not be found. Please check the path.")
path_to_aligner = str(path_to_aligner)
try:
hashWidth = int(options.hashWidth)
except ValueError:
p.error("The hash width argument must be a positive integer.")
regular = options.regular
if not os.path.exists(fastafile):
p.error("The file at %s could not be located." % fastafile)
if not os.path.exists(path_to_aligner):
p.error("The aligner could not be found at %s" % path_to_aligner)
mask = options.mask
hashWidth = str(hashWidth)
indexNumber = str(options.indexNumber)
sequential = options.sequential
tmpDir = options.tmpDir
num_threads = '1'#options.num_threads # Multithreading for constructing indices does not seem to do anyting.
depth = '0'#options.depth #Unsupported right now.
filename = os.path.basename(fastafile)
directory = os.path.dirname(fastafile)
if tmpDir == None:
tmpDir = directory
tmpDir = str(tmpDir)
if not tmpDir.endswith("/"):
tmpDir += "/"
#Create indices
sys.stderr.write("%sCreating indices for %s\n" % (logstr, fastafile))
sys.stderr.flush()
if aligner_type == Constants.BFAST: #Create BFAST indices
sys.stderr.write(logstr + "Converting the FASTA format into the BFAST binary version (BRG)...\n")
sys.stderr.flush()
finish_string = "%sFinished creating %s index with mask %s and hash width %s for %s with\n" + aligner_type
if not regular:
p_CT_bin = multiprocessing.Process(target=fasta2brg, args=(path_to_aligner, directory, filename, True))
p_CT_bin.start()
p_GA_bin = multiprocessing.Process(target=fasta2brg, args=(path_to_aligner, directory, filename, False))
p_GA_bin.start()
p_CT_bin.join()
p_GA_bin.join()
sys.stderr.write(logstr + "Finished creating the BFAST BRG files!\n")
p_CT = multiprocessing.Process(target=create_index, args=(fastafile, path_to_aligner, mask, hashWidth, indexNumber, depth, num_threads, tmpDir, True))
p_CT.start()
if sequential:
p_CT.join()
sys.stderr.write(finish_string % (logstr, "C to T", mask, hashWidth, fastafile))
p_GA = multiprocessing.Process(target=create_index, args=(fastafile, path_to_aligner, mask, hashWidth, indexNumber, depth, num_threads, tmpDir, False))
p_GA.start()
if not sequential:
p_CT.join()
sys.stderr.write(finish_string % (logstr, "C to T", mask, hashWidth, fastafile))
p_GA.join()
sys.stderr.write(finish_string % (logstr, "G to A", mask, hashWidth, fastafile))
else:
p_orig_bin = multiprocessing.Process(target=fasta2brg, args=(path_to_aligner, directory, filename, True, True))
p_orig_bin.start()
p_orig_bin.join()
sys.stderr.write(logstr + "Finished creating the BFAST BRG files!\n")
p_orig = multiprocessing.Process(target=create_regular_index, args=(fastafile, path_to_aligner, mask, hashWidth, indexNumber, depth, num_threads, tmpDir))
p_orig.start()
p_orig.join()
sys.stderr.write(finish_string % (logstr, "original", mask, hashWidth, fastafile))
elif aligner_type == Constants.BWA: #Create BWA indices
finish_string = "%sFinished creating the %s index with " + aligner_type
if not regular:
p_CT = multiprocessing.Process()
p_CT.start()
if sequential:
p_CT.join()
sys.stderr.write(finish_string % (logstr, "C to T"))
p_GA = multiprocessing.Process()
p_GA.start()
if not sequential:
p_CT.join()
sys.stderr.write(finish_string % (logstr, "C to T"))
p_GA.join()
sys.stderr.write(finish_string % (logstr, "G to A"))
else:
p_orig = multiprocessing.Process()
p_orig.start()
p_orig.join()
sys.stderr.write(finish_string % (logstr, "original"))
sys.stderr.flush()
later = datetime.datetime.now()
sys.stderr.write("%sElapsed time -- %s\n" % (logstr, str(later - now)))
if __name__ == '__main__':
main()