/
InfoTrim.py
executable file
·571 lines (547 loc) · 22.5 KB
/
InfoTrim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
#!/usr/bin/python
import datetime
import math
import multiprocessing
import argparse
import os
import sys
from Utilities import SeqIterator
"""
@author: Jacob Porter
@since: 02/16/2016
@summary: An information based read trimmer
Written by Jacob Porter while at Virginia Tech
http://www.jacobporter.com
This program implements a read trimming strategy that maximizes information content.
It has expressions for a minimum threshold amount, length, correctness probability, and entropy.
For each read in the file, the linear time algorithm takes time O(n) where n is the read length.
This algorithm trims the read to a prefix of the original read.
The non-linear time algorithm searches for a substring that maximizes the score and returns that substring.
For each read in the file, the non-linear algorithm has time O(n^2).
This read trimmer is intended for Illumina style short DNA reads encoded as A, C, T, G in a fastq file format
with phred scores. If the format is incorrect, consider converting to fastq format.
Multiprocessing is implemented with a producer / consumer design pattern using synchronized queues.
Will probably not do, but maybe will do:
TODO: add a term or support for trimming out regions with homopolymer runs?
TODO: Is the entropy only option redundant? Probably
TODO: Is balanced redundant? Probably
TODO: Should the scaling of the terms be changed because their contribution to the score will differ.
TODO: Should an exponential or logistic model or no model be used for biasing the non-linear version towards the start of the read.
TODO: Automatic phred encoding detection.
TODO: Automatic fasta/fastq file detection.
TODO: Is the linear algorithm that trims both ends the same as the dynamic algorithm?
"""
def calculate_read_trim_position_linear(read,
m,
alpha,
beta,
gamma,
offset,
balanced=False,
file_type='fastq',
entropyOnly=False):
"""
This implements the linear version of the read trimmer.
Read is a tuple. It has the id string, the read string, and the phred string.
alpha controls the influence of the length
beta controls the influence of the correctness probability
gamma controls the influence of entropy
balanced uses the geometric mean
entropyOnly finds the region of maximum entropy considering the length cutoff
"""
num_terms = 3.0
sequence = read[1]
if file_type != 'fastq':
phred = "H" * len(sequence)
else:
phred = read[2]
total_length = len(read[1])
max_position = 0
max_score = float("-inf")
initial_prob = 1
frequencies = [0, 0, 0, 0, 0]
for i in range(total_length):
base = sequence[i].upper()
prob_correct = 1 - 10**((ord(phred[i]) - offset) / (-10.0))
initial_prob *= prob_correct
if base == 'A':
frequencies[0] += 1
elif base == 'C':
frequencies[1] += 1
elif base == 'T':
frequencies[2] += 1
elif base == 'G':
frequencies[3] += 1
else:
frequencies[4] += 1
entropy = 0
for f in frequencies:
if not (f == 0):
my_freq = f / (i + 1.0)
entropy += my_freq * math.log(my_freq, 2)
entropy = -entropy
#if not entropyOnly:
length_cutoff = 1 / (1 + math.exp(m - (i + 1.0)))
if balanced and not entropyOnly:
my_score = length_cutoff * math.pow(
(i + 1.0) * initial_prob * entropy, 1 / 3.0)
elif not balanced and not entropyOnly:
my_score = length_cutoff * math.pow(
(i + 1.0), alpha / num_terms) * math.pow(
initial_prob, beta / num_terms) * math.pow(
entropy / 2.0, gamma / num_terms)
else:
my_score = length_cutoff * entropy
if my_score >= max_score:
max_score = my_score
max_position = i
return (max_position, max_score)
def calculate_read_trim_position_dynamic(read,
m,
t,
alpha,
beta,
gamma,
offset,
balanced=False,
file_type='fastq',
entropyOnly=False):
"""
This implements the non-linear version of the read trimmer.
It calls the linear version for each starting position.
The position of the substring with the maximum score is returned.
"""
sequence = read[1]
if file_type != 'fastq':
phred = "H" * len(sequence)
else:
phred = read[2]
total_length = len(read[1])
max_begin = 0
max_end = 0
max_score = float("-inf")
for i in range(total_length):
sliced_sequence = sequence[i:total_length]
sliced_phred = phred[i:total_length]
sliced_read = ("", sliced_sequence, sliced_phred)
my_position, my_score = calculate_read_trim_position_linear(
sliced_read,
m,
alpha,
beta,
gamma,
offset,
balanced=balanced,
file_type=file_type,
entropyOnly=entropyOnly)
if t != None and not entropyOnly:
my_score = math.exp(-i / t) * my_score
if my_score > max_score:
max_begin = i
max_end = my_position
max_score = my_score
return (max_begin, max_end + max_begin, max_score)
def calculate_read_trim_position_both_ends_linear(read,
m,
t,
alpha,
beta,
gamma,
offset,
balanced=False,
file_type='fastq',
entropyOnly=False):
"""
This implements the non-linear version of the read trimmer.
It calls the linear version for each starting position.
The position of the substring with the maximum score is returned.
"""
sequence = read[1]
if file_type != 'fastq':
phred = "H" * len(sequence)
else:
phred = read[2]
total_length = len(read[1])
begin = 0
end = 0
score_b = float("-inf")
score_e = float("-inf")
end, score_e = calculate_read_trim_position_linear(("", sequence, phred),
m,
alpha,
beta,
gamma,
offset,
balanced=balanced,
file_type=file_type,
entropyOnly=entropyOnly)
begin, score_b = calculate_read_trim_position_linear(
("", "".join(reversed(sequence)), "".join(reversed(phred))),
m,
alpha,
beta,
gamma,
offset,
balanced=balanced,
file_type=file_type,
entropyOnly=entropyOnly)
begin = len(sequence) - begin
if begin < end:
return (begin, end, (score_b + score_e) / 2)
else:
return (0, end, score_e)
def trim_read(read, alpha, beta, gamma, m, t, offset, balanced, algorithm,
entropyOnly, file_type, verbose):
"""
This function calls the functions that calculate the positions on the read to trim.
It branches depending on whether the non-linear version is used.
All other parameters are passed on.
"""
if algorithm == 'linear_end':
position, score = calculate_read_trim_position_linear(
read,
m,
alpha,
beta,
gamma,
offset,
balanced=balanced,
file_type=file_type,
entropyOnly=entropyOnly)
position += 1
if verbose:
sys.stderr.write("%s position: %s score: %s\n" %
(read[0], str(position), str(score)))
sys.stderr.flush()
if file_type == 'fastq':
trimmed_read = (read[0], read[1][0:position], read[2][0:position])
else:
trimmed_read = (read[0], read[1][0:position])
elif algorithm == 'dynamic':
begin, end, score = calculate_read_trim_position_dynamic(
read,
m,
t,
alpha,
beta,
gamma,
offset,
balanced=balanced,
file_type=file_type,
entropyOnly=entropyOnly)
end += 1
if verbose:
sys.stderr.write("%s begin: %s end: %s score: %s\n" %
(read[0], str(begin), str(end), str(score)))
sys.stderr.flush()
if file_type == 'fastq':
trimmed_read = (read[0], read[1][begin:end], read[2][begin:end])
else:
trimmed_read = (read[0], read[1][begin:end])
else:
begin, end, score = calculate_read_trim_position_both_ends_linear(
read,
m,
t,
alpha,
beta,
gamma,
offset,
balanced=balanced,
file_type=file_type,
entropyOnly=entropyOnly)
end += 1
if verbose:
sys.stderr.write("%s begin: %s end: %s score: %s\n" %
(read[0], str(begin), str(end), str(score)))
sys.stderr.flush()
if file_type == 'fastq':
trimmed_read = (read[0], read[1][begin:end], read[2][begin:end])
else:
trimmed_read = (read[0], read[1][begin:end])
return trimmed_read
def trim_reads_single(reads, trimmed_output, alpha, beta, gamma, m, t, offset,
balanced, algorithm, entropyOnly, file_type, verbose):
"""
This is the main function that loops through the reads.
The linear option is a boolean that selects for either the linear or the non-linear version.
The floating point parameters r, s, m, and t control the tradeoffs in the scoring function.
The balanced option is a boolean that uses the geometric mean of length, correctess probability, and entropy. This option ignores s and r.
The parameter t is only used in the non-linear version. It controls the exponential model for weighting the starting position.
reads is an iterator for the reads to trim and trimmed_output is an iterator for writing the reads.
"""
for read in reads:
trimmed_read = trim_read(read, alpha, beta, gamma, m, t, offset,
balanced, algorithm, entropyOnly, file_type,
verbose)
trimmed_output.write(trimmed_read)
def trim_reads_multi(q_read, q_write, alpha, beta, gamma, m, t, offset,
balanced, algorithm, entropyOnly, file_type, verbose):
"""
A function for trimming reads with multiple processes.
q_read is the queue for reading reads
q_write is the queue for adding trimmed reads to.
The other arguments are passed to the function that does all the work.
A 'poison pill' is put onto the q_write queue so that the writer process knows when to quit.
"""
#print str(os.getpid())
while (True):
read = q_read.get()
if read is None:
q_write.put(None)
return
else:
trimmed_read = trim_read(read, alpha, beta, gamma, m, t, offset,
balanced, algorithm, entropyOnly, file_type, verbose)
q_write.put(trimmed_read)
def add_reads_to_queue_multi(q_read, input_file, num_processes):
"""
This function puts the reads from the input_file into the Queue q_read.
This function is called to add all the reads on a queue for other processes to consume.
A 'poison pill' is added for all num_processes. When other processes get this signal, they will stop.
"""
reads = SeqIterator.SeqIterator(input_file, file_type='fastq')
for read in reads:
q_read.put(read)
for i in range(num_processes):
q_read.put(None)
def trim_reads(input_file,
output_file,
alpha,
beta,
gamma,
m,
t,
offset=33,
process_amount=1,
balanced=False,
algorithm=True,
entropyOnly=False,
file_type='fastq',
verbose=False):
"""
This function is the entry point into the read trimmer. It is the function that could be called
by other programs to do read trimming.
This function creates processes if multiprocessing is used.
"""
if isinstance(output_file, str):
output_file = open(output_file, 'w')
trimmed_output = SeqIterator.SeqWriter(output_file, file_type=file_type)
if process_amount == 1:
try:
reads = SeqIterator.SeqIterator(input_file, file_type=file_type)
except IOError:
sys.stderr.write(
"Something is wrong with the reads file. Please check it.\n")
return
trim_reads_single(reads, trimmed_output, alpha, beta, gamma, m, t,
offset, balanced, algorithm, entropyOnly, file_type,
verbose)
read_count = reads.records_processed()
else:
# Need to add records to a queue for the other processes.
processes = []
q_read = multiprocessing.Queue()
q_write = multiprocessing.Queue()
proc_read = multiprocessing.Process(target=add_reads_to_queue_multi,
args=(q_read, input_file,
process_amount))
proc_read.start()
# Start the processes. Each one pulls a record from the queue and processes it.
for i in range(process_amount):
proc = multiprocessing.Process(target=trim_reads_multi,
args=(q_read, q_write, alpha, beta,
gamma, m, t, offset, balanced,
algorithm, entropyOnly, file_type, verbose))
processes.append(proc)
proc.start()
num_quit = 0
read_count = 0
# The calling process writes the reads to the output file.
while (True):
trimmed_read = q_write.get()
if trimmed_read is None:
num_quit += 1
if num_quit == process_amount:
break
else:
read_count += 1
trimmed_output.write(trimmed_read)
proc_read.join()
for proc in processes:
proc.join()
return read_count
def main():
"""
This is the main file that defines the interface and executes the functions that do all
of the read trimming. Multiple processes are started and joined if the processes parameter
is set to a value larger than one. The s=0.1,r=0.4,m=25 had the best results on a simulation
with BisPin and good results with Bismark, walt, and bwameth.
"""
now = datetime.datetime.now()
usage = "usage: InfoTrim.py [args] <Reads file location> "
version = "0.1.1"
p = argparse.ArgumentParser(prog="InfoTrim", usage=usage, version=version,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
#p.add_argument('--reads', '-f', help='Fastq reads file location.', default=None)
# p.add_argument('--pair1', '-1', help='Fastq reads file for the first mate pair.', default=None)
# p.add_argument('--pair2', '-2', help='Fastq reads file for the second mate pair.', default=None)
p.add_argument('reads_file', type=str, help="The location of the fastq/fasta reads file.")
p.add_argument('--processes',
'-p',
type=int,
help='Number of processes to use.',
default=1)
p.add_argument('--output',
'-o',
type=str,
help='The file name for the output file.',
default=sys.stdout)
p.add_argument(
'--phred',
'-s',
type=float,
help='The tradeoff value for the phred score.',
default=0.1)
p.add_argument(
'--entropy',
'-r',
type=float,
help='The tradeoff value for the entropy feature.',
default=0.4)
p.add_argument('--min_length',
'-m',
type=int,
help='The minimum length target.',
default=25)
p.add_argument(
'--start_weight',
'-t',
type=float,
help=
'The weight for the exponential model for the starting position of the read for the non-linear option. If set to 0, it will not be used.',
default=0)
#p.add_argument('--balanced', '-b', help='Use the geometric mean of length, correctness probability, and entropy.', action="store_true", default=False)
p.add_argument(
'--algorithm',
'-g',
choices=['linear_end', 'linear_both', 'dynamic'],
help="This chooses the trimming algorithm to use. 'linear_end' uses a linear time algorithm and trims the end only (recommended). 'linear_both' uses a linear time algorithm and trims the beginning and the ending. 'dynamic' uses a slow quadratic time algorithm that trims the ends optimally with the InfoTrim score.",
default='linear_end')
#p.add_argument('--entropyOnly', '-e', help='Use the dynamic programming approach.', action="store_true", default=False)
p.add_argument(
'--ascii_offset',
'-a',
type=int,
help=
'The offset value for the ascii encoding of phred scores.',
default=33)
p.add_argument('--fasta',
action='store_true',
help="Use this if the input file is a fasta file.",
default=False)
p.add_argument(
'--verbose',
'-b',
help='Run in verbose. The score for each read is printed to stderr.',
action="store_true",
default=False)
args = p.parse_args()
balanced = False #args.balanced
algorithm = args.algorithm
entropyOnly = False #args.entropyOnly
verbose = args.verbose
input_file = args.reads_file
if not os.path.exists(input_file):
p.error("The reads file does not exist or could not be accessed.")
try:
r = float(args.entropy)
s = float(args.phred)
m = float(args.min_length)
t = float(args.start_weight)
if args.fasta:
s = 0.0
args.phred = 0.0
if t == 0.0 or t == 0:
t = None
if not balanced and not entropyOnly:
if r + s > 1 or r + s < 0:
sys.stderr.write(
"The parameters r=%s (phred) and s=%s (entropy) need to add up to a value in [0,1].\n"
% (str(s), str(r)))
return
#Is this formula correct?
#sr_over_3 = s*r/3.0
#alpha = (1-s)*(1-r) + sr_over_3
#beta = s * (1-r) + sr_over_3
#gamma = (1-s)*r + sr_over_3
alpha = 1 - r - s
beta = s
gamma = r
else:
alpha = None
beta = None
gamma = None
except ValueError:
sys.stderr.write(
"One of the parameters r, s, m, or t was not set to a number.\n")
sys.stderr.flush()
return
try:
offset = int(args.ascii_offset)
except ValueError:
sys.stderr.write("The ascii offset must be an integer.\n")
return
if input_file == None:
p.print_help()
sys.exit(1)
output_file = args.output
output_file_string = "stdout"
if not (output_file == sys.stdout):
try:
output_file_string = output_file
output_file = open(output_file, 'w')
except IOError:
sys.stderr.write(
"Something is wrong with opening the output file. Please check it.\n"
)
return
try:
process_amount = int(args.processes)
except ValueError:
sys.stderr.write("The number of processes should be an integer.\n")
return
sys.stderr.write("Starting InfoTrim %s by Jacob Porter.\n" % version)
sys.stderr.write("Using file '" + str(input_file) + "'. Please wait.\n")
sys.stderr.write("Current time: " + str(now) + " \n")
sys.stderr.write("Writing output to file '" + output_file_string + "'.\n")
sys.stderr.write("Using the following arguments.\n")
sys.stderr.write(str(args) + "\n")
# sys.stderr.write(
# "r=%s s=%s m=%s t=%s processes=%s ascii_offset=%s, balanced=%s linear=%s entropyOnly=%s verbose=%s\n"
# % (str(r), str(s), str(m), str(t), str(process_amount), str(offset),
# str(balanced), str(linear), str(entropyOnly), str(verbose)))
sys.stderr.flush()
read_count = trim_reads(input_file,
output_file,
alpha,
beta,
gamma,
m,
t,
offset=offset,
process_amount=process_amount,
balanced=balanced,
algorithm=args.algorithm,
entropyOnly=entropyOnly,
file_type='fasta' if args.fasta else 'fastq',
verbose=verbose)
sys.stderr.write("Finished trimming the reads. There were " +
str(read_count) + " reads processed.\n")
later = datetime.datetime.now()
#sys.stderr.write("Current time: " + str(now) + " \n")
sys.stderr.write("Elapsed time: %s\n" % (str(later - now)))
sys.stderr.write("Have a good day!\n")
sys.stderr.flush()
if __name__ == '__main__':
main()