EntityRecognizer.py
import itertools
import codecs
import re
import string
import os
from collections import defaultdict, Counter

import six

import kindred
def acronymMatch(words, pos, currentAcronym, atStart, subpos=None):
    if len(currentAcronym) == 0:
        if subpos is not None: # Can't finish acronym mid-word
            return []
        else:
            return [pos+1]

    curWord = words[pos].lower()
    wordSplit = curWord.split('-')
    curLetter = currentAcronym[-1]

    moves = []

    if subpos is None:
        if atStart and curLetter == 's' and curWord[-1] == 's':
            # Possible plural
            moves.append((words, pos, currentAcronym[:-1], False))

        if curLetter == curWord[0]:
            moves.append((words, pos-1, currentAcronym[:-1], False))

        if curWord == '-':
            moves.append((words, pos-1, currentAcronym, False))

    if len(wordSplit) > 1:
        if subpos is None:
            subpos = len(wordSplit)-1

        if len(wordSplit[subpos]) > 0 and curLetter == wordSplit[subpos][0]:
            if subpos == 0:
                moves.append((words, pos-1, currentAcronym[:-1], False))
            else:
                moves.append((words, pos, currentAcronym[:-1], False, subpos-1))

    possibleStarts = []
    for move in moves:
        possibleStarts += acronymMatch(*move)

    return possibleStarts
def acronymDetection(words):
    LRBs = [i for i, x in enumerate(words) if x == u'(']
    RRBs = [i for i, x in enumerate(words) if x == u')']

    acronyms = []
    for i, j in itertools.product(LRBs, RRBs):
        if j - i == 2:
            acronymLoc = i + 1
            possibleAcronym = words[acronymLoc]
            possibleStarts = acronymMatch(words, i-1, possibleAcronym.lower(), True)
            if len(possibleStarts) > 0:
                start = min(possibleStarts)
                end = i
                acronyms.append((start, end, acronymLoc))

    return acronyms
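
# A minimal illustration of acronymDetection (expected output worked out by
# hand, not taken from the library's documentation): for a token sequence where
# an acronym directly follows its expansion in parentheses, it returns
# (start, end, acronymLoc) triples of token indices.
#
#   acronymDetection(['estrogen', 'receptor', '(', 'ER', ')'])
#   # -> [(0, 2, 3)]  i.e. words[0:2] spell out the acronym at words[3]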
def mergeWordsForFusionDetection(words):
    prevWord = ""
    mergedWords = []
    start = 0
    mergeChars = ['-', '/', ':']
    for i, w in enumerate(words):
        if w in mergeChars:
            prevWord += w
        elif len(prevWord) > 0 and prevWord[-1] in mergeChars:
            prevWord += w
        else:
            if prevWord:
                mergedWords.append((start, i-1, prevWord))
            prevWord = w
            start = i

    if prevWord:
        mergedWords.append((start, len(words)-1, prevWord))

    return mergedWords
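
# A hand-checked sketch of the merging (the tokens are hypothetical): tokens
# joined by '-', '/' or ':' are glued back together so that a candidate fusion
# like BCR-ABL1 can be looked up as one unit.
#
#   mergeWordsForFusionDetection(['the', 'bcr', '-', 'abl1', 'fusion'])
#   # -> [(0, 0, 'the'), (1, 3, 'bcr-abl1'), (4, 4, 'fusion')]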
def fusionGeneDetection(words, lookupDict):
    termtypesAndids, terms, locs = [], [], []
    origWords = list(words)
    words = [ w.lower() for w in words ]

    mergedWords = mergeWordsForFusionDetection(words)

    for start, end, word in mergedWords:
        split = re.split("[-/:]", word)
        fusionCount = len(split)
        if fusionCount == 1:
            continue

        allGenes = True
        geneIDs = []
        lookupIDCounter = Counter()
        for s in split:
            key = s
            if key in lookupDict:
                isGene = False
                for entityType, entityID in lookupDict[key]:
                    if entityType == 'gene':
                        for tmpID in entityID.split(';'):
                            lookupIDCounter[tmpID] += 1
                        geneIDs.append(entityID)
                        isGene = True
                        break
                if not isGene:
                    allGenes = False
                    break
            else:
                allGenes = False
                break

        # Check if any lookup IDs are shared among all the "fusion" parts.
        # If so, this may not actually be a fusion, but just multiple names
        # of the same gene, e.g. HER2/neu
        completeLookupIDs = [ lookupID for lookupID, count in lookupIDCounter.items() if count == fusionCount ]
        if len(completeLookupIDs) > 0:
            termtypesAndids.append([('gene', ';'.join(completeLookupIDs))])
            terms.append(tuple(origWords[start:end+1]))
            locs.append((start, end+1))
        elif allGenes:
            # All the parts look like (different) genes, so we mark this as a fusion (or combination)
            geneIDs = [ geneID.replace(';', '&') for geneID in geneIDs ]
            termtypesAndids.append([('gene', 'combo|' + '|'.join(geneIDs))])
            terms.append(tuple(origWords[start:end+1]))
            locs.append((start, end+1))

    return locs, terms, termtypesAndids
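
# Hand-worked example (the lookup entries are made up for illustration): two
# neighbouring gene names joined by '-' with no shared lookup ID are flagged
# as a fusion/combination.
#
#   lookup = {'bcr': {('gene', '613')}, 'abl1': {('gene', '25')}}
#   fusionGeneDetection(['BCR', '-', 'ABL1'], lookup)
#   # -> ([(0, 3)], [('BCR', '-', 'ABL1')], [[('gene', 'combo|613|25')]])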
def getTermIDsAndLocations(sentence, lookupDict):
    termtypesAndids, terms, locs = [], [], []

    # Lowercase all the tokens
    np = [ t.word.lower() for t in sentence.tokens ]

    blank = " " * len(sentence.text)
    tempSentence = sentence.text.lower()
    sentenceStart = sentence.tokens[0].startPos

    # The length of each search string will decrease from the full length
    # of the text down to 1
    for l in reversed(range(1, len(sentence.tokens)+1)):
        # We move the search window through the text
        for i in range(len(np)-l+1):
            # Extract that window of text
            startPos = sentence.tokens[i].startPos - sentenceStart
            endPos = sentence.tokens[i+l-1].endPos - sentenceStart
            s = tempSentence[startPos:endPos]

            # Search for it in the dictionary
            if s in lookupDict:
                # If found, save the ID(s) from the dictionary
                termtypesAndids.append(lookupDict[s])
                terms.append(tuple(np[i:i+l]))
                locs.append((i, i+l))

                # And blank it out
                tempSentence = tempSentence[:startPos] + blank[startPos:endPos] + tempSentence[endPos:]

    # Then return the found term IDs
    return locs, terms, termtypesAndids
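
# Hypothetical illustration (requires a parsed kindred sentence; the lookup
# entry is invented): a one-token dictionary match returns token-index spans.
#
#   corpus = kindred.Corpus('EGFR signaling is often dysregulated.')
#   kindred.Parser().parse(corpus)
#   sentence = corpus.documents[0].sentences[0]
#   getTermIDsAndLocations(sentence, {'egfr': {('gene', '1956')}})
#   # -> ([(0, 1)], [('egfr',)], [{('gene', '1956')}])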
def startsWithButNotAll(s, search):
    return s.startswith(search) and len(s) > len(search)

def cleanupVariant(variant):
    variant = variant.upper().replace('P.', '')
    aminoAcidInfo = [('ALA','A'), ('ARG','R'), ('ASN','N'), ('ASP','D'), ('CYS','C'),
                     ('GLU','E'), ('GLN','Q'), ('GLY','G'), ('HIS','H'), ('ILE','I'),
                     ('LEU','L'), ('LYS','K'), ('MET','M'), ('PHE','F'), ('PRO','P'),
                     ('SER','S'), ('THR','T'), ('TRP','W'), ('TYR','Y'), ('VAL','V')]
    for longA, shortA in aminoAcidInfo:
        variant = variant.replace(longA, shortA)
    return variant
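
# Hand-checked example: cleanupVariant('p.Val600Glu') -> 'V600E'
# (the 'P.' prefix is stripped, then each three-letter amino-acid code is
# replaced by its one-letter code).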
class EntityRecognizer:
    """
    Annotates entities in a Corpus using an exact-dictionary matching scheme with additional heuristics. These heuristics include detecting fusion gene mentions and microRNAs, identifying acronyms to reduce ambiguity, identifying variants, and more. All the options are parameters for the constructor of this class.

    :ivar lookup: Used for the dictionary matching. A dictionary of terms (strings of parsed, lowercased words) to a set of (entityType, externalID) tuples.
    :ivar detectFusionGenes: Whether to try to identify fusion gene terms (e.g. BCR-ABL1). The lookup must contain terms of type 'gene'
    :ivar detectMicroRNA: Whether to identify microRNA terms (added as 'gene' entities)
    :ivar acronymDetectionForAmbiguity: Whether to try to identify acronyms and use them to deal with ambiguity (by removing incorrect matches to acronyms or the longer terms)
    :ivar mergeTerms: Whether to merge neighbouring terms that refer to the same external entity (e.g. HER2/neu as one term instead of two)
    :ivar detectVariants: Whether to identify a variant (e.g. V600E) and create an entity of type 'variant'
    :ivar variantStopwords: Variant-like terms to be ignored (e.g. S100P) if detectVariants is used
    :ivar detectPolymorphisms: Whether to identify a SNP (using a dbSNP ID) and create an entity of type 'variant'
    :ivar removePathways: Whether to remove genes that are actually naming a signalling pathway (e.g. MTOR pathway)
    """
    def __init__(self, lookup, detectFusionGenes=False, detectMicroRNA=False, acronymDetectionForAmbiguity=False, mergeTerms=False, detectVariants=False, variantStopwords=None, detectPolymorphisms=False, removePathways=False):
        """
        Create an EntityRecognizer and provide the lookup table for terms and additional flags for what to identify in text

        :param lookup: A dictionary of terms (strings of parsed, lowercased words) to a set of (entityType, externalID) tuples.
        :param detectFusionGenes: Whether to try to identify fusion gene terms (e.g. BCR-ABL1). The lookup must contain terms of type 'gene'
        :param detectMicroRNA: Whether to identify microRNA terms (added as 'gene' entities)
        :param acronymDetectionForAmbiguity: Whether to try to identify acronyms and use them to deal with ambiguity (by removing incorrect matches to acronyms or the longer terms)
        :param mergeTerms: Whether to merge neighbouring terms that refer to the same external entity (e.g. HER2/neu as one term instead of two)
        :param detectVariants: Whether to identify a variant (e.g. V600E) and create an entity of type 'variant'
        :param variantStopwords: Variant-like terms to be ignored (e.g. S100P) if detectVariants is used
        :param detectPolymorphisms: Whether to identify a SNP (using a dbSNP ID) and create an entity of type 'variant'
        :param removePathways: Whether to remove genes that are actually naming a signalling pathway (e.g. MTOR pathway)
        :type lookup: dict
        :type detectFusionGenes: bool
        :type detectMicroRNA: bool
        :type acronymDetectionForAmbiguity: bool
        :type mergeTerms: bool
        :type detectVariants: bool
        :type variantStopwords: list
        :type detectPolymorphisms: bool
        :type removePathways: bool
        """
        if variantStopwords is None:
            variantStopwords = []

        assert isinstance(lookup, dict)
        for termsmatch, typeAndIDs in lookup.items():
            assert isinstance(termsmatch, six.string_types), "Lookup key must be a string"
            assert isinstance(typeAndIDs, set), "Lookup value must be a set of (entityType,externalID) tuples"
            assert len(typeAndIDs) > 0, "Lookup value must be a non-empty set of (entityType,externalID) tuples"
            for typeAndID in typeAndIDs:
                assert isinstance(typeAndID, tuple), "Lookup value must be a set of (entityType,externalID) tuples"
                assert len(typeAndID) == 2, "Lookup value must be a set of (entityType,externalID) tuples"

        assert isinstance(detectFusionGenes, bool)
        assert isinstance(detectMicroRNA, bool)
        assert isinstance(acronymDetectionForAmbiguity, bool)
        assert isinstance(mergeTerms, bool)
        assert isinstance(detectVariants, bool)
        assert isinstance(detectPolymorphisms, bool)
        assert isinstance(removePathways, bool)

        assert isinstance(variantStopwords, list)
        for variantStopword in variantStopwords:
            assert isinstance(variantStopword, six.string_types), "variantStopwords should be a list of strings"

        self.lookup = lookup
        self.detectFusionGenes = detectFusionGenes
        self.detectMicroRNA = detectMicroRNA
        self.acronymDetectionForAmbiguity = acronymDetectionForAmbiguity
        self.mergeTerms = mergeTerms
        self.detectVariants = detectVariants
        self.variantStopwords = set([ vs.lower() for vs in variantStopwords ])
        self.detectPolymorphisms = detectPolymorphisms
        self.removePathways = removePathways

        self.variantRegex1 = re.compile(r'\b[ACDEFGHIKLMNPQRSTVWY][1-9][0-9]*[ACDEFGHIKLMNPQRSTVWY]\b')
        self.variantRegex2 = re.compile(r'\b(p\.)?((Ala)|(Arg)|(Asn)|(Asp)|(Cys)|(Glu)|(Gln)|(Gly)|(His)|(Ile)|(Leu)|(Lys)|(Met)|(Phe)|(Pro)|(Ser)|(Thr)|(Trp)|(Tyr)|(Val))[1-9][0-9]*((Ala)|(Arg)|(Asn)|(Asp)|(Cys)|(Glu)|(Gln)|(Gly)|(His)|(Ile)|(Leu)|(Lys)|(Met)|(Phe)|(Pro)|(Ser)|(Thr)|(Trp)|(Tyr)|(Val))\b')
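
    # For reference (hand-checked against the patterns above): variantRegex1
    # matches one-letter substitutions such as 'V600E'; variantRegex2 matches
    # the three-letter form, with or without a 'p.' prefix, e.g. 'p.Val600Glu'
    # or 'Val600Glu'.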
    def _processWords(self, sentence):
        locs, terms, termtypesAndids = getTermIDsAndLocations(sentence, self.lookup)

        words = [ t.word for t in sentence.tokens ]

        if self.detectVariants:
            # Index the start and end locations of tokens for lookup
            token_starts = { t.startPos: i for i, t in enumerate(sentence.tokens) }
            token_ends = { t.endPos: i for i, t in enumerate(sentence.tokens) }

            snvMatches = list(self.variantRegex1.finditer(sentence.text)) + list(self.variantRegex2.finditer(sentence.text))
            for match in snvMatches:
                snvText = match.group()
                start, end = match.span()
                if start in token_starts and end in token_ends and snvText.lower() not in self.variantStopwords:
                    cleaned = cleanupVariant(snvText)
                    potentialLocs = (token_starts[start], token_ends[end]+1)
                    if potentialLocs not in locs:
                        termtypesAndids.append([('variant', "substitution|%s" % cleaned)])
                        terms.append((snvText,))
                        locs.append(potentialLocs)

        if self.detectPolymorphisms:
            polymorphismRegex1 = r'^rs[1-9][0-9]*$'
            polyMatches = [ re.match(polymorphismRegex1, w) is not None for w in words ]
            for i, (w, polyMatch) in enumerate(zip(words, polyMatches)):
                if polyMatch:
                    potentialLocs = (i, i+1)
                    if potentialLocs not in locs:
                        termtypesAndids.append([('variant', 'dbsnp|%s' % w)])
                        terms.append((w,))
                        locs.append(potentialLocs)

        if self.detectMicroRNA:
            for i, w in enumerate(words):
                # Require that microRNA names contain a digit
                containsDigits = any( d in w for d in string.digits )
                if not containsDigits:
                    continue

                lw = w.lower()
                if startsWithButNotAll(lw, "mir-") or startsWithButNotAll(lw, "hsa-mir-") or startsWithButNotAll(lw, "microrna-") or (startsWithButNotAll(lw, "mir") and lw[3] in string.digits):
                    potentialLocs = (i, i+1)
                    if potentialLocs not in locs:
                        termtypesAndids.append([('gene', 'mirna|'+w)])
                        terms.append((w,))
                        locs.append((i, i+1))

        toRemove = []
        if self.detectFusionGenes:
            fusionLocs, fusionTerms, fusionTermtypesAndids = fusionGeneDetection(words, self.lookup)

            for floc, fterm, ftermtypesAndid in zip(fusionLocs, fusionTerms, fusionTermtypesAndids):
                if floc not in locs:
                    # Check which existing entities inside this fusion term should be removed
                    fstart, fend = floc
                    for tstart, tend in locs:
                        if fstart <= tstart and tend <= fend:
                            toRemove.append((tstart, tend))

                    locs.append(floc)
                    terms.append(fterm)
                    termtypesAndids.append(ftermtypesAndid)

        filtered = zip(locs, terms, termtypesAndids)
        filtered = [ (l, t, ti) for l, t, ti in filtered if l not in toRemove ]
        filtered = sorted(filtered)

        if self.mergeTerms:
            # We'll attempt to merge terms (i.e. if a gene is referred to using two acronyms together)
            # Example: Hepatocellular carcinoma (HCC), HER2/neu or INK4B P15

            # First we'll go through and expand term spans out to include surrounding brackets
            filteredWithBrackets = []
            for (startA, endA), termsA, termTypesAndIDsA in filtered:
                termInBrackets = startA > 0 and endA < len(words) and words[startA-1] == '(' and words[endA] == ')'
                if termInBrackets:
                    startA -= 1
                    endA += 1
                filteredWithBrackets.append( ((startA, endA), termsA, termTypesAndIDsA) )
            filteredWithBrackets = sorted(filteredWithBrackets)

            # Next we go through and create groups of terms that should be merged
            indexGroups = []
            curGroup, prevStart, prevEnd, prevIDs = None, None, None, None
            for index, ((curStart, curEnd), curTerms, curTermTypesAndIDs) in enumerate(filteredWithBrackets):
                curIDs = set( (termType, termID) for termType, termIDs in curTermTypesAndIDs for termID in termIDs.split(';') )

                shouldMergeWithPrev = False
                if prevStart is not None:
                    termsAreNeighbouring = (curStart == prevEnd or (curStart == (prevEnd+1) and words[prevEnd] in ['/', '-']))
                    if termsAreNeighbouring:
                        idsIntersection = prevIDs.intersection(curIDs)
                        idsShared = (len(idsIntersection) > 0)
                        if idsShared:
                            curIDs = idsIntersection
                            shouldMergeWithPrev = True

                prevStart, prevEnd, prevIDs = curStart, curEnd, curIDs

                if shouldMergeWithPrev:
                    curGroup.append(index)
                else:
                    if curGroup:
                        indexGroups.append(curGroup)
                    curGroup = [index]

            # Remember to add any final group to the list of groups
            if curGroup:
                indexGroups.append(curGroup)

            # And now we do merging where appropriate
            mergedFiltered = []
            for indexGroup in indexGroups:
                if len(indexGroup) == 1:
                    # No merging required
                    index = indexGroup[0]
                    mergedFiltered.append(filtered[index])
                else:
                    # Merging required
                    idsIntersection = None
                    for index in indexGroup:
                        (curStart, curEnd), curTerms, curTermTypesAndIDs = filteredWithBrackets[index]
                        ids = set( (termType, termID) for termType, termIDs in curTermTypesAndIDs for termID in termIDs.split(';') )
                        if idsIntersection is None:
                            idsIntersection = ids
                        else:
                            idsIntersection = idsIntersection.intersection(ids)

                    # Double-check that the IDs of the merged terms do actually overlap
                    assert len(idsIntersection) > 0

                    groupedByType = defaultdict(list)
                    for termType, termID in idsIntersection:
                        groupedByType[termType].append(termID)
                    newTermTypesAndIDs = [ (termType, ";".join(sorted(termIDs))) for termType, termIDs in groupedByType.items() ]

                    newStart = filteredWithBrackets[indexGroup[0]][0][0]
                    newEnd = filteredWithBrackets[indexGroup[-1]][0][1]
                    newTerms = tuple(words[newStart:newEnd])

                    mergedFiltered.append( ((newStart, newEnd), newTerms, newTermTypesAndIDs) )

            filtered = sorted(mergedFiltered)

        if self.acronymDetectionForAmbiguity:
            # And we'll check to see if there are any obvious acronyms
            locsToRemove = set()
            acronyms = acronymDetection(words)
            for (wordsStart, wordsEnd, acronymLoc) in acronyms:
                wordIsTerm = (wordsStart, wordsEnd) in locs
                acronymIsTerm = (acronymLoc, acronymLoc+1) in locs

                if wordIsTerm and acronymIsTerm:
                    # Remove the acronym
                    locsToRemove.add((acronymLoc, acronymLoc+1))
                elif acronymIsTerm:
                    # Remove any terms that contain part of the spelt-out words
                    newLocsToRemove = [ (i, j) for i in range(wordsStart, wordsEnd) for j in range(i, wordsEnd+1) ]
                    locsToRemove.update(newLocsToRemove)

            # Now we have to remove the terms marked for deletion in the previous section
            filtered = [ (locs, terms, termtypesAndids) for locs, terms, termtypesAndids in filtered if locs not in locsToRemove ]
            filtered = sorted(filtered)

        if self.removePathways:
            forbiddenPathwayWords = set(['pathway', 'pathways', 'signaling', 'signalling', 'cascade'])
            filtered2 = []
            for locs, terms, termtypesAndids in filtered:
                nextTokenIndex = locs[1]
                nextTokenIsForbiddenWord = nextTokenIndex < len(words) and words[nextTokenIndex].lower() in forbiddenPathwayWords
                if nextTokenIsForbiddenWord:
                    termtypesAndids = [ (termtype, termid) for termtype, termid in termtypesAndids if not termtype == 'gene' ]
                if len(termtypesAndids) > 0:
                    filtered2.append((locs, terms, termtypesAndids))
            filtered = filtered2

        return filtered
    def annotate(self, corpus):
        """
        Annotate a parsed corpus with the wordlist lookup and other entity types

        :param corpus: Corpus to annotate
        :type corpus: kindred.Corpus
        """
        assert corpus.parsed, "Corpus must already be parsed before entity recognition"

        for doc in corpus.documents:
            entityCount = len(doc.entities)
            for sentence in doc.sentences:
                extractedTermData = self._processWords(sentence)

                for locs, terms, termtypesAndids in extractedTermData:
                    startToken = locs[0]
                    endToken = locs[1]

                    startPos = sentence.tokens[startToken].startPos
                    endPos = sentence.tokens[endToken-1].endPos
                    text = doc.text[startPos:endPos]
                    loc = list(range(startToken, endToken))

                    for entityType, externalID in termtypesAndids:
                        sourceEntityID = "T%d" % (entityCount+1)
                        e = kindred.Entity(entityType, text, [(startPos, endPos)], externalID=externalID, sourceEntityID=sourceEntityID)
                        doc.entities.append(e)
                        sentence.addEntityAnnotation(e, loc)

                        entityCount += 1
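
    # Sketch of inspecting the output after annotate (assumes 'corpus' was
    # parsed and annotated as in the class-level example above; the attribute
    # names follow kindred.Entity):
    #
    #   for doc in corpus.documents:
    #       for entity in doc.entities:
    #           print(entity.entityType, entity.externalID, entity.text)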
    @staticmethod
    def loadWordlists(entityTypesWithFilenames, idColumn=0, termsColumn=1, columnSeparator='\t', termSeparator='|'):
        """
        Load a wordlist from multiple files. By default, each file should be a tab-delimited file where the first column contains the ID and the second column contains all the terms separated by '|'. This can be modified by the parameters.

        As each term is parsed, this can take a long time. It is recommended to run this once, save the output as a Python pickle file, and load that in subsequently.

        :param entityTypesWithFilenames: Dictionary of entityType => filename
        :param idColumn: The column containing the ID for the term (starts from 0)
        :param termsColumn: The column containing the list of terms (starts from 0)
        :param columnSeparator: The column separator for the file (default is a tab)
        :param termSeparator: The separator for the list of terms (default is a '|')
        :type entityTypesWithFilenames: dict
        :type idColumn: int
        :type termsColumn: int
        :type columnSeparator: str
        :type termSeparator: str
        :return: Dictionary of lookup values
        :rtype: dict
        """
        errorMsg = 'entityTypesWithFilenames should be a dictionary with pairs of {entityType: filename} where both are strings and the filename points to a file that exists'
        assert isinstance(entityTypesWithFilenames, dict), errorMsg
        for entityType, filename in entityTypesWithFilenames.items():
            assert isinstance(entityType, six.string_types), errorMsg
            assert isinstance(filename, six.string_types), errorMsg
            assert os.path.isfile(filename), "%s does not exist" % filename

        assert isinstance(idColumn, int)
        assert isinstance(termsColumn, int)
        assert isinstance(columnSeparator, str)
        assert isinstance(termSeparator, str)

        requiredColumns = max(idColumn, termsColumn) + 1

        lookup = defaultdict(set)
        for entityType, filename in entityTypesWithFilenames.items():
            with codecs.open(filename, 'r', 'utf-8') as f:
                tempLookup = defaultdict(set)
                for lineno, line in enumerate(f):
                    split = line.strip().split(columnSeparator)
                    assert len(split) >= requiredColumns, 'Line %d contains only %d columns when %d are required' % (lineno+1, len(split), requiredColumns)
                    termid, terms = split[idColumn], split[termsColumn]
                    for term in terms.split(termSeparator):
                        tempLookup[term.lower().strip()].add(termid)

            for term, idlist in tempLookup.items():
                lookup[term].add( (entityType, ";".join(sorted(list(idlist)))) )

        return dict(lookup)
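
# End-to-end sketch (the file 'genes.tsv' is hypothetical, containing lines
# such as '1956\tEGFR|erbb1'); building the lookup is slow, so the docstring
# above suggests caching it with pickle:
#
#   import pickle
#   lookup = EntityRecognizer.loadWordlists({'gene': 'genes.tsv'})
#   with open('lookup.pickle', 'wb') as f:
#       pickle.dump(lookup, f)
#
#   corpus = kindred.Corpus('EGFR is a commonly mutated gene.')
#   kindred.Parser().parse(corpus)
#   EntityRecognizer(lookup).annotate(corpus)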