/
local-rearrangements
executable file
·408 lines (367 loc) · 14.2 KB
/
local-rearrangements
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
#! /usr/bin/env python
# Copyright 2017 Martin C. Frith
import bisect
import gzip
import itertools
import operator
import optparse
import signal
import sys
def myOpen(fileName): # faster than fileinput
if fileName == "-":
return sys.stdin
if fileName.endswith(".gz"):
return gzip.open(fileName)
return open(fileName)
def log(opts, *things):
if opts.verbose:
sys.stderr.write(" ".join(map(str, things)) + "\n")
def fieldsFromMaf(line):
_, seqName, start, span, strand, seqLen, _ = line.split()
beg = int(start)
return seqName, beg, beg + int(span), strand, int(seqLen)
def mafInput(lines):
mafLines = []
rName = qName = ""
for line in lines:
if line.isspace():
if qName:
yield fields
mafLines = []
rName = qName = ""
elif line[0] != "#":
mafLines.append(line)
if line[0] == "s":
if rName:
qName, qBeg, qEnd, qStrand, qSeqLen = fieldsFromMaf(line)
fields = (qName, qBeg, qEnd, qStrand, qSeqLen,
rName, rBeg, rEnd, rStrand, rSeqLen, mafLines)
else:
rName, rBeg, rEnd, rStrand, rSeqLen = fieldsFromMaf(line)
if qName:
yield fields
def rearrangementEnds(alignments, end):
for i in alignments:
yield max(end, i[6]) # rBeg
end = max(end, i[7]) # rEnd
def rearrangementBegs(alignments, beg):
for i in reversed(alignments):
yield min(beg, i[7]) # rEnd
beg = min(beg, i[6]) # rBeg
def rearrangementRanges(opts, alignments, reaBegs, reaEnds, strand):
x = None
for j, y in enumerate(alignments):
if y[3] == strand and reaEnds[j] < reaBegs[j]:
if x and (i + 1 < j or x[7] > y[6]):
yield i, j
i = j
x = y
def onePassMergedRanges(ranges, reaBegs, reaEnds):
oldBeg, oldEnd = ranges[0]
for beg, end in ranges[1:]:
separation = reaBegs[beg] - reaEnds[oldEnd]
mergedSpan = reaEnds[end] - reaBegs[oldBeg]
if 2 * separation > mergedSpan: # ? empirically, this seems good
yield oldBeg, oldEnd
oldBeg = beg
oldEnd = end
yield oldBeg, oldEnd
def allPassMergedRanges(ranges, reaBegs, reaEnds):
while True:
newRanges = list(onePassMergedRanges(ranges, reaBegs, reaEnds))
if len(newRanges) == len(ranges): return ranges
ranges = newRanges
def rearrangementsFromOneQuery(opts, alignments):
head = alignments[0]
rName = head[5]
if any(i[5] != rName for i in alignments): return
strand = head[3]
tail = alignments[-1]
if tail[3] != strand: return
headBeg = head[6]
tailEnd = tail[7]
reaEnds = list(rearrangementEnds(alignments, headBeg))
if reaEnds[-1] >= tailEnd: return
reaBegs = list(rearrangementBegs(alignments, tailEnd))
if reaBegs[-1] <= headBeg: return
reaBegs.reverse()
ranges = list(rearrangementRanges(opts,
alignments, reaBegs, reaEnds, strand))
if not ranges: return
alignedLength = sum(i[7] - i[6] for i in alignments) # xxx ???
if tailEnd - headBeg > 2 * alignedLength: return # xxx ?
for beg, end in allPassMergedRanges(ranges, reaBegs, reaEnds):
if beg + 1 < end:
event = "C" # complex
else:
event = "T" # tandem duplication
yield rName, reaBegs[beg], reaEnds[end], event, alignments
def alignmentsPerQueryFromLines(lines):
for k, v in itertools.groupby(mafInput(lines), operator.itemgetter(0)):
yield list(v)
def alignmentsPerQuery(args):
if not args:
args = ["-"]
for arg in args:
lines = myOpen(arg)
for i in alignmentsPerQueryFromLines(lines): yield i
def rearrangementsFromFiles(opts, args):
for alignments in alignmentsPerQuery(args):
if alignments[0][3] == "-":
alignments.reverse()
for i in rearrangementsFromOneQuery(opts, alignments):
yield i
def isExtraFirstGapField(fields):
return fields[4].isdigit()
def readGaps(opts):
if not opts.gap: return
log(opts, "reading unsequenced gaps...")
for line in myOpen(opts.gap):
fields = line.split()
if not fields or fields[0][0] == "#": continue
if isExtraFirstGapField(fields): fields = fields[1:]
if fields[4] not in "NU": continue
seqName = fields[0]
end = int(fields[2])
beg = end - int(fields[5]) # zero-based coordinate
yield seqName, beg, end
def tabAlignmentParts(text):
insSize = delSize = 0
for block in text.split(","):
i = block.find(":")
if i < 0:
yield insSize, delSize, int(block)
insSize = delSize = 0
else:
j = i + 1
delSize += int(block[:i])
insSize += int(block[j:])
def mafAlignmentParts(alignedSeq1, alignedSeq2):
insSize = delSize = subSize = 0
for x, y in itertools.izip(alignedSeq1, alignedSeq2):
if x == "-":
if subSize:
yield insSize, delSize, subSize
insSize = delSize = subSize = 0
insSize += 1
elif y == "-":
if subSize:
yield insSize, delSize, subSize
insSize = delSize = subSize = 0
delSize += 1
else:
subSize += 1
if subSize:
yield insSize, delSize, subSize
def alignmentRanges(alignmentParts, seqName, beg, maxGap):
fixedName = seqName.split(".")[-1]
end = beg
for insSize, delSize, subSize in alignmentParts:
if beg == end or insSize > maxGap or delSize > maxGap:
if beg < end: yield fixedName, beg, end
beg = end + delSize
end += delSize + subSize
if beg < end: yield fixedName, beg, end
def readOutgroupAlignments(opts):
if not opts.outgroup: return
log(opts, "reading outgroup alignments...")
maxGap = opts.outgroup_max_gap
rSeq = ""
for line in myOpen(opts.outgroup):
if line[0].isdigit(): # lastTab format
fields = line.split()
seqName = fields[1]
beg = int(fields[2])
parts = tabAlignmentParts(fields[11])
for i in alignmentRanges(parts, seqName, beg, maxGap):
yield i
elif line[0] == "s": # MAF format
fields = line.split()
if not rSeq:
seqName = fields[1]
beg = int(fields[2])
rSeq = fields[6]
else:
qSeq = fields[6]
parts = mafAlignmentParts(rSeq, qSeq)
for i in alignmentRanges(parts, seqName, beg, maxGap):
yield i
rSeq = ""
def rearrangementRangesFromFile(opts, gaps, outgroupRanges, seqNames, lines):
for line in lines:
fields = line.split()
if not fields: break
if len(fields) < 6 + opts.min_queries: continue
if int(fields[5]) < opts.min_complex: continue
chromosomeName = fields[2]
beg = int(fields[3])
end = int(fields[4])
if isOverlap(gaps, chromosomeName, beg, end): continue
if opts.outgroup:
if not isContained(outgroupRanges, chromosomeName, beg, end):
continue
yield chromosomeName, beg, end
seqNames.update(fields[6:])
sys.stdout.write(line)
def rangeSortKey(r):
seqName, beg, end = r[:3]
return seqName, beg, -end
def uncontained(ranges):
oldName = ""
for r in sorted(ranges, key=rangeSortKey):
seqName, beg, end = r[:3]
if seqName == oldName:
if end <= maxEnd:
continue
else:
oldName = seqName
maxEnd = end
yield r
def isOverlap(ranges, seqName, beg, end):
q = seqName, beg
i = bisect.bisect(ranges, q)
if i > 0:
n, b, e = ranges[i - 1]
if n == seqName and e > beg: return True
if i < len(ranges):
n, b, e = ranges[i]
if n == seqName and b < end: return True
return False
def isContained(ranges, seqName, beg, end):
q = seqName, beg
i = bisect.bisect(ranges, q)
if not i: return False
n, b, e = ranges[i - 1]
return n == seqName and e > end
def isPotentialOverlap(genomeRanges, alignment):
qBeg, qEnd, _, qSeqLen, rName, rBeg, rEnd = alignment[1:8]
genomePosition = rName, rBeg
i = bisect.bisect(genomeRanges, genomePosition)
if i > 0:
n, b, e = genomeRanges[i - 1]
if n == rName and e > rBeg - qBeg:
return True
if i < len(genomeRanges):
n, b, e = genomeRanges[i]
if n == rName and b < rEnd - (qSeqLen - qEnd):
return True
return False
def printMafs(alignments):
if alignments[0][3] == "-": alignments = reversed(alignments)
for i in alignments:
mafLines = i[-1]
for i in mafLines:
sys.stdout.write(i)
sys.stdout.write("\n")
sys.stdout.write("\n\n")
def printRearrangement(rName, beg, end, numOfComplex, queryNames):
q = " ".join(sorted(queryNames))
out = "# Rearrangement:", rName, str(beg), str(end), str(numOfComplex), q
sys.stdout.write(" ".join(out) + "\n")
def rearrangementClumps(rearrangements):
oldName = ""
for chromosomeName, beg, end, event, alignments in rearrangements:
if chromosomeName > oldName or beg >= maxEnd:
if oldName: yield oldName, minBeg, maxEnd, clump
clump = []
oldName = chromosomeName
minBeg = beg
maxEnd = end
queryInfo = alignments, event
clump.append(queryInfo)
maxEnd = max(maxEnd, end)
if oldName: yield oldName, minBeg, maxEnd, clump
def alignmentsPerQuerySortKey(alignmentsForOneQuery):
h = alignmentsForOneQuery[0]
return h[5], h[3], h[6], h[7], h[0]
def getQueriesNearRearrangements(opts, args, gaps, outgroupRanges):
f = myOpen(opts.rearrangements)
log(opts, "reading rearrangements...")
seqNames = set()
r = rearrangementRangesFromFile(opts, gaps, outgroupRanges, seqNames, f)
r = list(uncontained(r))
log(opts, len(r), "rearrangement ranges")
sys.stdout.write("\n")
for alignments in alignmentsPerQueryFromLines(f):
if alignments[0][0] in seqNames:
printMafs(alignments)
for alignments in alignmentsPerQuery(args):
if any(isPotentialOverlap(r, a) for a in alignments):
if alignments[0][0] not in seqNames:
printMafs(alignments)
def getLocalRearrangements(opts, args, gaps, outgroupRanges):
log(opts, "reading alignments...")
rearrangements = sorted(rearrangementsFromFiles(opts, args))
log(opts, len(rearrangements), "initial rearrangements")
alignmentsOfRearrangedQueries = []
queryCounts = []
for chromosomeName, beg, end, clump in rearrangementClumps(rearrangements):
alignmentLists = [i[0] for i in clump]
allQueryNames = [i[0][0] for i in alignmentLists]
queryNames = set(allQueryNames)
numOfQueries = len(queryNames)
if numOfQueries < opts.min_queries:
continue
events = [i[1] for i in clump]
namesAndEvents = sorted(zip(allQueryNames, events))
g = itertools.groupby(namesAndEvents, operator.itemgetter(0))
numOfComplex = sum("".join(i[1] for i in v) != "T" for k, v in g)
if numOfComplex < opts.min_complex:
continue
if isOverlap(gaps, chromosomeName, beg, end):
continue
if opts.outgroup:
if not isContained(outgroupRanges, chromosomeName, beg, end):
continue
printRearrangement(chromosomeName, beg, end, numOfComplex, queryNames)
alignmentsOfRearrangedQueries.extend(alignmentLists)
queryCounts.append(numOfComplex)
sys.stdout.write("\n")
alignmentsOfRearrangedQueries.sort(key=alignmentsPerQuerySortKey)
oldName = ""
for i in alignmentsOfRearrangedQueries:
queryName = i[0][0]
if queryName == oldName: continue
printMafs(i)
oldName = queryName
sys.stderr.write("# Rearrangements: %s\n" % len(queryCounts))
for k, v in itertools.groupby(sorted(queryCounts)):
t = k, len(list(v))
line = "# Complex query sequences: %2s Rearrangements: %s\n" % t
sys.stderr.write(line)
def localRearrangements(opts, args):
gaps = list(uncontained(readGaps(opts)))
log(opts, len(gaps), "gap ranges")
outgroupRanges = list(uncontained(readOutgroupAlignments(opts)))
log(opts, len(outgroupRanges), "outgroup ranges")
if opts.rearrangements:
getQueriesNearRearrangements(opts, args, gaps, outgroupRanges)
else:
getLocalRearrangements(opts, args, gaps, outgroupRanges)
if __name__ == "__main__":
signal.signal(signal.SIGPIPE, signal.SIG_DFL) # avoid silly error message
usage = "%prog [options] maf-file(s)"
descr = "Find sequence rearrangements contained in single query sequences."
op = optparse.OptionParser(usage=usage, description=descr)
op.add_option("--gap", metavar="FILE",
help="read unsequenced genome gaps from agp or gap file")
op.add_option("--min-complex", type="int", default=1, metavar="N",
help='only consider rearrangements with >= N '
'"complex" queries (default=%default)')
op.add_option("--min-queries", type="int", default=1, metavar="N",
help="only consider rearrangements with >= N queries "
"(default=%default)")
op.add_option("--outgroup", metavar="FILE",
help="read outgroup alignments in maf or lastTab format")
op.add_option("--outgroup-max-gap", type="int", default=50, metavar="L",
help=
"max gap length in outgroup alignments (default=%default)")
# Maybe 50 is a bit short? This discards one chr3 rearrangement
# that I think is good. Also: chr7 156594331 156601643:
# human-chimp indel >100.
op.add_option("--rearrangements", metavar="FILE",
help="get alignments of queries near these rearrangements")
op.add_option("-v", "--verbose", action="count",
help="show progress messages")
opts, args = op.parse_args()
localRearrangements(opts, args)