-
Notifications
You must be signed in to change notification settings - Fork 0
/
01a_BlastTab_BestHit_Filter.py
254 lines (215 loc) · 7.8 KB
/
01a_BlastTab_BestHit_Filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
#!/usr/bin/env python
'''Best Hit Filter for Tabular Blast Output.
This script filters tabular blast output for best hit based on bitscore,
as well as user defined percent match length, and percent identity.
This tool takes the following input parameters:
* tabular blast input file that includes query and subject lengths
in columns 13 and 14.
* percent_match_length to filter for in base pairs (bps) (ex: 70.0).
* percent_identity to filter for (ex: 70.0).
* minimum_subject_length to filter for in bps (ex: 3000).
This script returns the following files:
* input_file.fltrdBstHts.blst
Contains entries from original file that passed the filters.
* input_file.fltrdBstHtsLst.txt
Contains list of subject sequence names (contigs) for entries
that passed the filter.
* input_file.metatemplate.txt
Contains header and gene names for meta file to use for 01b
This script requires the following packages:
* argparse
* collections.defaultdict
-------------------------------------------
Author :: Roth Conrad
Email :: rotheconrad@gatech.edu
GitHub :: https://github.com/rotheconrad
Date Created :: March 13th, 2020
License :: GNU GPLv3
Copyright 2020 Roth Conrad
All rights reserved
-------------------------------------------
'''
import argparse
from collections import defaultdict
def best_hits(query_subject, bitscore, d, line, dups):
""" Filters the besthit based on bitscore """
v = ''
if query_subject in d:
v = f', DUPLICATE {len(query_subject)}'
dups += 1
old_bitscore = float(d[query_subject].split('\t')[11])
if bitscore > old_bitscore:
v += '- KEPT - BETTER HIT THAN PREVIOUS'
d[query_subject] = [line]
else: v += '- REMOVED - LOWER HIT THAN PREVIOUS'
else:
d[query_subject] = [line]
return d, dups, v
def filter_blast(infile, pml, pid, msl, verbose):
"""Gets and prints the spreadsheet's header columns
Parameters
----------
infile : str - file_name.txt
Tabular blast output file.
pml : float - ex: 70.0
Minimum Percent Match Length to keep
(alignment length / query sequence length).
pid : float - ex: 70.0
Minimum Percent Sequence Identity to keep.
msl : int - ex: 3000
Minimum length of subject sequence (contig) to keep.
Returns
-------
True - writes file.
writes out tabular blast lines passing the filter to
a new file infile.fltrdBstHts.blst
"""
d = {} # initialize dictionary for bitscore besthits
dups = 0 # counter for number of duplicate matches for reads
low_pml = 0 # counter for number of matches failing filters
low_pid = 0 #
low_msl = 0
passes = 0 # counter for number of matches passing filters
total = 0 # counter for total blast entries in file
if verbose:
print('query, subject, pid, aLen, qLen, pml, sLen, decision')
with open(infile, 'r') as f:
for l in f:
total += 1
X = l.rstrip().split('\t')
query = X[0]
subject = X[1]
bitscore = float(X[11]) # bitscore
pIdent = float(X[2]) # percent sequence identity
aLen = int(X[3]) # read alignment length
qLen = int(X[12]) # full length of read
sLen = int(X[13])
pMatch = aLen / qLen # percent match length of read length
verbose_line = (
f'{query}, {subject}, {pIdent}, {aLen}, {qLen}, '
f'{pMatch*100:.2f}, {sLen}'
)
if pMatch < (pml/100):
low_pml += 1
verbose_line += ', FAILED pml'
elif pIdent < pid:
low_pid += 1
verbose_line += ', FAILED pid'
elif sLen < msl:
low_msl += 1
verbose_line += ', FAILED msl'
elif pMatch >= (pml/100) and pIdent >= pid and sLen >= msl:
verbose_line += ', PASSED'
qs = f'{query}_{subject}'
d, dups, v = best_hits(qs, bitscore, d, l, dups)
passes += 1
verbose_line += v
else:
print(
'\n\nSomething is wrong with line:\n'
f'{l}\n'
)
if verbose: print(verbose_line)
ibase = infile.split('.')[0]
query_sequence_set = defaultdict(list)
outfile_blast = ibase + '.fltrdBstHts.blst'
outfile_meta = ibase + '.metatemplate.txt'
with open(outfile_blast, 'w') as ob:
for k,v in d.items():
ob.write(v[0])
X = v[0].split('\t')
subject_sequence_name = X[1]
query_sequence_name = X[0]
query_sequence_set[query_sequence_name].append(subject_sequence_name)
with open(outfile_meta, 'w') as om:
header = 'query name, annotation, color\n'
om.write(header)
for query, contigs in query_sequence_set.items():
om.write(f'{query}, \n')
outfile_list = ibase + f'_{query}.fltrdBstHtsLst.txt'
with open(outfile_list, 'a') as ol:
for contig in contigs:
ol.write(contig + '\n')
print('\nTotal number of entries in blast file:', total)
print('Number of matches lower than pml:', low_pml)
print('Number of matches >= pml but lower than pid:', low_pid)
print('Number of matches >= pml & pid but lower than msl:', low_msl)
print('Number of matchs passing the filters:', passes)
print(
'Number of passing matches with multiple hits to same contig '
'that were removed:', dups
)
print(
'Number of best hit entries passing the filter and written to '
'new file:', len(d)
)
if len(d) == 0:
print(
'\n\nNo viable sequence alignments passing the filters.\n'
'Try changing some parameters or go fish (for more contigs).\n'
'You can use the -v option to see which filter removes the match.'
)
print('\n\n')
return True
def main():
# Configure Argument Parser
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter
)
parser.add_argument(
'-i', '--input_file',
help='Please specify the tabular blast file with added sequence length',
metavar='',
type=str,
required=True
)
parser.add_argument(
'-pml', '--percent_match_length',
help=
'Minimum Percent Match Length (in bps) to keep '
'(alignment length / query sequence length). . . . '
'(Default = 50)',
metavar='',
type=float,
required=False,
default=50
)
parser.add_argument(
'-pid', '--percent_sequence_identity',
help=
'Minimum percent sequence identity to keep. . . . '
'(Default = 70)',
metavar='',
type=float,
required=False,
default=70
)
parser.add_argument(
'-msl', '--minimum_subject_length',
help=
'Minimum length (in bps) of subject (contig) '
'sequence to keep (Default = 1000).',
metavar='',
type=int,
required=False,
default=1000
)
parser.add_argument(
'-v', '--verbose',
help='Print decision for each match to screen.',
required=False,
action='store_true'
)
args=vars(parser.parse_args())
# Do what you came here to do:
print('\n\nRunning Script...')
filter_blast(
args['input_file'],
args['percent_match_length'],
args['percent_sequence_identity'],
args['minimum_subject_length'],
args['verbose']
)
if __name__ == "__main__":
main()