-
Notifications
You must be signed in to change notification settings - Fork 0
/
convertInversion.py
278 lines (219 loc) · 8.75 KB
/
convertInversion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
#!/usr/bin/env python2
#
import sys
import gzip
from io import BufferedReader
from subprocess import check_output
from os import path
from os.path import exists
class VcfRecord:
def __init__(self, inline):
tokens = inline.strip().split('\t')
self.chr = tokens[0]
self.pos = int(tokens[1])
self.vid = tokens[2]
self.ref = tokens[3]
self.alt = tokens[4]
self.qual = tokens[5]
self.filter = tokens[6]
self.info = tokens[7].split(';')
self.others = "\t".join(tokens[8:])
# Create a dictionary for INFO
self.infoDict ={}
for infoItem in self.info:
items = infoItem.split('=')
if len(items) == 1:
self.infoDict[items[0]] = True
elif len(items) > 1:
self.infoDict[items[0]] = items[1]
self.isINV3 = False
self.isINV5 = False
self.mateChr = ""
self.matePos = -1
def checkInversion(self):
def getMateInfo(splitChar):
items = self.alt.split(splitChar)
mateInfo = items[1].split(':')
# Assuming that the last item, contains position information
matePos = mateInfo[-1]
# Other items belong to chromosome
self.mateChr = ":".join(mateInfo[:-1])
self.matePos = int(matePos)
if self.alt.startswith('['):
getMateInfo('[')
if self.mateChr == self.chr:
self.isINV5 = True
elif self.alt.endswith(']'):
getMateInfo(']')
if self.mateChr == self.chr:
self.isINV3 = True
def makeLine(self):
infoStr = ";".join(self.info)
self.line = "\t".join((self.chr,
str(self.pos),
self.vid,
self.ref,
self.alt,
self.qual,
self.filter,
infoStr,
self.others
))+"\n"
def scanVcf(vcfFile):
invMateDict = {}
if vcfFile.endswith('gz'):
gzfp = gzip.open(vcfFile, 'rb')
fpVcf = BufferedReader(gzfp)
else:
fpVcf = open(vcfFile, 'rb')
for line in fpVcf:
if line[0] == '#':
continue
vcfRec = VcfRecord(line)
vcfRec.checkInversion()
if vcfRec.isINV3 or vcfRec.isINV5:
if vcfRec.vid in invMateDict:
# update mate INFO
invMateDict[vcfRec.vid] = vcfRec.infoDict
else:
mateId = vcfRec.infoDict["MATEID"]
invMateDict[mateId] = ""
return invMateDict
def getReference(samtools, refFasta, chrom, start, end):
region = "%s:%d-%d" % (chrom, start, end)
samtoolsOut = check_output([samtools, "faidx", refFasta, region])
refSeq = ""
for seq in samtoolsOut.split('\n'):
if not seq.startswith(">"):
refSeq += seq
return refSeq.upper()
def writeLines(lines):
for line in lines:
sys.stdout.write(line)
def convertInversions(samtools, refFasta, vcfFile, invMateDict):
isHeaderInfoAdded = False
isHeaderAltAdded = False
lineBuffer = []
bufferedChr = ""
bufferedPos = -1
if vcfFile.endswith('gz'):
gzfp = gzip.open(vcfFile, 'rb')
fpVcf = BufferedReader(gzfp)
else:
fpVcf = open(vcfFile, 'rb')
for line in fpVcf:
if line.startswith('#'):
if (not isHeaderInfoAdded) and line.startswith("##FORMAT="):
sys.stdout.write("##INFO=<ID=INV3,Number=0,Type=Flag,Description=\"Inversion breakends open 3' of reported location\">\n")
sys.stdout.write("##INFO=<ID=INV5,Number=0,Type=Flag,Description=\"Inversion breakends open 5' of reported location\">\n")
isHeaderInfoAdded = True
if (not isHeaderAltAdded) and line.startswith("##ALT="):
sys.stdout.write("##ALT=<ID=INV,Description=\"Inversion\">\n")
isHeaderAltAdded = True
sys.stdout.write(line)
continue
vcfRec = VcfRecord(line)
# skip mate record
if vcfRec.vid in invMateDict:
continue
vcfRec.checkInversion()
if vcfRec.isINV3 or vcfRec.isINV5:
if vcfRec.isINV5:
# adjust POS for INV5
vcfRec.pos -= 1
vcfRec.matePos -= 1
vcfRec.ref = getReference(samtools, refFasta,
vcfRec.chr, vcfRec.pos, vcfRec.pos)
# update manta ID
vidSuffix = vcfRec.vid.split("MantaBND")[1]
idx = vidSuffix.rfind(':')
vcfRec.vid = "MantaINV%s" % vidSuffix[:idx]
# symbolic ALT
vcfRec.alt = "<INV>"
# add END
infoEndStr = "END=%d" % vcfRec.matePos
newInfo = [infoEndStr]
for infoItem in vcfRec.info:
if infoItem.startswith("SVTYPE"):
# change SVTYPE
newInfo.append("SVTYPE=INV")
# add SVLEN
infoSvLenStr = "SVLEN=%d" % (vcfRec.matePos-vcfRec.pos)
newInfo.append(infoSvLenStr)
elif infoItem.startswith("CIPOS"):
newInfo.append(infoItem)
# set CIEND
isImprecise = "IMPRECISE" in vcfRec.infoDict
# for imprecise calls, set CIEND to the mate breakpoint's CIPOS
if isImprecise:
mateId = vcfRec.infoDict["MATEID"]
mateInfoDict = invMateDict[mateId]
infoCiEndStr = "CIEND=%s" % (mateInfoDict["CIPOS"])
newInfo.append(infoCiEndStr)
# for precise calls, set CIEND w.r.t HOMLEN
else:
if "HOMLEN" in vcfRec.infoDict:
infoCiEndStr = "CIEND=-%s,0" % vcfRec.infoDict["HOMLEN"]
newInfo.append(infoCiEndStr)
elif infoItem.startswith("HOMSEQ"):
# update HOMSEQ for INV5
if vcfRec.isINV5:
cipos = vcfRec.infoDict["CIPOS"].split(',')
homSeqStart = vcfRec.pos + int(cipos[0]) + 1
homSeqEnd = vcfRec.pos + int(cipos[1])
refSeq = getReference(samtools, refFasta, vcfRec.chr,
homSeqStart, homSeqEnd)
infoHomSeqStr = "HOMSEQ=%s" % refSeq
newInfo.append(infoHomSeqStr)
else:
newInfo.append(infoItem)
# skip BND-specific tags
elif (infoItem.startswith("MATEID") or
infoItem.startswith("BND_DEPTH") or
infoItem.startswith("MATE_BND_DEPTH")):
continue
# update event ID
elif infoItem.startswith("EVENT"):
eidSuffix = vcfRec.infoDict["EVENT"].split("MantaBND")[1]
idx = vidSuffix.rfind(':')
infoEventStr = "EVENT=MantaINV%s" % eidSuffix[:idx]
newInfo.append(infoEventStr)
# apply all other tags
else:
newInfo.append(infoItem)
# add INV3/INV5 tag
if vcfRec.isINV3:
newInfo.append("INV3")
elif vcfRec.isINV5:
newInfo.append("INV5")
vcfRec.info = newInfo
vcfRec.makeLine()
# make sure the vcf is sorted in genomic order
if (not vcfRec.chr == bufferedChr) or (vcfRec.pos > bufferedPos):
if lineBuffer:
writeLines(lineBuffer)
lineBuffer = [vcfRec.line]
bufferedChr = vcfRec.chr
bufferedPos = vcfRec.pos
elif vcfRec.pos < bufferedPos:
lineBuffer.insert(0, vcfRec.line)
else:
lineBuffer.append(vcfRec.line)
if lineBuffer:
writeLines(lineBuffer)
if __name__=='__main__':
usage = "convertInversion.py <samtools path> <reference fasta> <vcf file>\n"
if len(sys.argv) <= 3:
sys.stderr.write(usage)
sys.exit(1)
samtools = sys.argv[1]
refFasta = sys.argv[2]
vcfFile = sys.argv[3]
for inputFile in [samtools, refFasta, vcfFile]:
if not(exists(inputFile)):
errMsg = ('File %s does not exist.'
% inputFile)
sys.stderr.write(errMsg + '\nProgram exits.')
sys.exit(1)
invMateDict = scanVcf(vcfFile)
convertInversions(samtools, refFasta, vcfFile, invMateDict)