/
rna-alignment-stats
executable file
·203 lines (184 loc) · 7.55 KB
/
rna-alignment-stats
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
#! /usr/bin/env python
# Copyright 2019 Martin C. Frith
# SPDX-License-Identifier: MIT
from __future__ import print_function
import gzip
import itertools
import optparse
import signal
import sys
from bisect import bisect
from collections import defaultdict
from operator import itemgetter
def openFile(fileName):
if fileName == "-":
return sys.stdin
if fileName.endswith(".gz"):
return gzip.open(fileName, "rt") # xxx dubious for Python2
return open(fileName)
def ucscIntList(text):
items = text.rstrip(",").split(",")
return [int(i) for i in items]
def genesFromFile(lines):
for line in lines:
fields = line.split()
geneNames = []
if fields[2] not in ("+", "-"):
if len(fields) < 13: # refFlat
geneNames.append(fields[0])
fields.pop(0)
if len(fields) > 11: # genePredExt
geneNames.append(fields[11])
geneNames.append(fields[0])
chroName = fields[1]
strand = fields[2]
exonBegs = ucscIntList(fields[8])
exonEnds = ucscIntList(fields[9])
beg = exonBegs[0]
end = exonEnds[-1]
exons = list(zip(exonBegs, exonEnds))
yield chroName, strand, beg, end, exons, ",".join(geneNames)
def genesPerStrand(lines):
s = sorted(genesFromFile(lines))
for key, group in itertools.groupby(s, itemgetter(0, 1)):
genes = [i[2:] for i in group]
maxSpan = max(end - beg for beg, end, exons, name in genes)
value = genes, maxSpan
yield key, value
def gaplessAlignmentParts(lines):
for line in lines:
fields = line.split()
if fields and fields[0].isdigit():
if fields[8].isdigit():
fields.pop(0)
strand = fields[8]
qrySeqName = fields[9]
qrySeqLen = int(fields[10])
refSeqName = fields[13]
if "linker" in refSeqName:
continue
refSeqLen = int(fields[14])
blockLengths = ucscIntList(fields[18])
qryStarts = ucscIntList(fields[19])
refStarts = ucscIntList(fields[20])
blocks = zip(blockLengths, qryStarts, refStarts)
for blockLen, qryStart, refStart in blocks:
if strand == "-": # get query +strand and reference -strand:
qryStart = qrySeqLen - (qryStart + blockLen)
refStart = refSeqLen - (refStart + blockLen)
yield (qrySeqName, qrySeqLen, qryStart, refStart, blockLen,
refSeqName, refSeqLen, strand)
def qryEnd(gaplessAlnPart):
return gaplessAlnPart[2] + gaplessAlnPart[4]
def refEnd(gaplessAlnPart):
return gaplessAlnPart[3] + gaplessAlnPart[4]
def genomeSegment(gaplessAlnPart):
refStart, blockLen, refSeqName, refSeqLen, strand = gaplessAlnPart[3:]
if strand == "-": # get coordinates in the reference +strand:
refStart = refSeqLen - (refStart + blockLen)
return refSeqName, strand, refStart, refStart + blockLen
def qryUnalignedLengths(gaplessAlnParts):
for x, y in zip(gaplessAlnParts, gaplessAlnParts[1:]):
qryEndX = qryEnd(x)
qryBegY = y[2]
if qryEndX < qryBegY:
yield qryBegY - qryEndX
elif qryEndX > qryBegY:
raise Exception("unexpected alignment overlap in the query")
def refUnalignedLengths(gaplessAlnParts):
for x, y in zip(gaplessAlnParts, gaplessAlnParts[1:]):
refEndX = refEnd(x)
refBegY = y[3]
if x[5:] != y[5:] or refEndX > refBegY:
yield -1
elif refEndX < refBegY:
yield refBegY - refEndX
def overlapLengths(xSegments, ySegments):
# xxx This double-loop risks being very slow. In practice, it
# seems fast enough, so far...
for xBeg, xEnd in xSegments:
for yBeg, yEnd in ySegments:
if yEnd > xBeg:
if yBeg >= xEnd:
break
yield min(xEnd, yEnd) - max(xBeg, yBeg)
def segmentLengthsBef(segments, cutPos):
for beg, end in segments:
if beg < cutPos:
yield min(end, cutPos) - beg
def segmentLengthsAft(segments, cutPos):
for beg, end in segments:
if end > cutPos:
yield end - max(beg, cutPos)
def geneOverlapData(allGenes, gaplessAlnParts):
geneName = "."
geneSize = overlapSize = geneSizeBefAln = geneSizeAftAln = 0
alnSegments = sorted(map(genomeSegment, gaplessAlnParts))
for key, group in itertools.groupby(alnSegments, itemgetter(0, 1)):
if key in allGenes:
strand = key[1]
genes, maxGeneSpan = allGenes[key]
segments = [i[2:] for i in group]
alnBeg = segments[0][0]
alnEnd = segments[-1][1]
lookMeUp = alnEnd, alnEnd
genesIndex = bisect(genes, lookMeUp)
while genesIndex > 0:
genesIndex -= 1
geneBeg, geneEnd, exons, name = genes[genesIndex]
if geneEnd <= alnBeg:
if geneBeg + maxGeneSpan <= alnBeg:
break
continue
s = sum(overlapLengths(segments, exons))
thisGeneSize = sum(end - beg for beg, end in exons)
if (s, -thisGeneSize) > (overlapSize, -geneSize):
geneName = name
geneSize = thisGeneSize
overlapSize = s
bef = sum(segmentLengthsBef(exons, alnBeg))
aft = sum(segmentLengthsAft(exons, alnEnd))
geneSizeBefAln = bef if strand == "+" else aft
geneSizeAftAln = aft if strand == "+" else bef
return geneName, geneSize, overlapSize, geneSizeBefAln, geneSizeAftAln
def mainRefSeqName(gaplessAlnParts):
d = defaultdict(int)
for i in gaplessAlnParts:
blockLen, refSeqName = i[4:6]
d[refSeqName] -= blockLen
mainName, junk = min(d.items(), key=itemgetter(1, 0))
return mainName
def main(opts, args):
alnFiles = args if args else ["-"]
allGenes = {}
if opts.genes:
allGenes.update(genesPerStrand(openFile(opts.genes)))
for fileName in alnFiles:
gaplessAlnParts = gaplessAlignmentParts(openFile(fileName))
for key, group in itertools.groupby(gaplessAlnParts, itemgetter(0, 1)):
qrySeqName, qrySeqLen = key
group = sorted(group)
alignedBases = sum(i[4] for i in group)
unalignedHead = group[0][2]
unalignedTail = qrySeqLen - qryEnd(group[-1])
qryInserts = list(qryUnalignedLengths(group))
qryInsMax = max(qryInserts) if qryInserts else 0
refJumps = list(refUnalignedLengths(group))
isNonlinear = any(i < 0 for i in refJumps)
refInserts = [i for i in refJumps if i > 0]
refInsMax = max(refInserts) if refInserts else 0
chro = mainRefSeqName(group)
out = (qrySeqName, qrySeqLen, alignedBases, unalignedHead,
unalignedTail, qryInsMax, refInsMax, int(isNonlinear), chro)
if opts.genes:
out += geneOverlapData(allGenes, group)
print(*out, sep="\t")
if __name__ == "__main__":
signal.signal(signal.SIGPIPE, signal.SIG_DFL) # avoid silly error message
usage = "%prog [options] alignments.psl"
descr = "Print a table of summary statistics for RNA-to-genome alignments."
op = optparse.OptionParser(usage=usage, description=descr)
op.add_option("-g", "--genes", metavar="FILE",
help="read genes from a genePred file")
opts, args = op.parse_args()
main(opts, args)