From ff1c0d1c7f1addbccce5ef4b3d8a1e06e4870ac0 Mon Sep 17 00:00:00 2001
From: Jake Lever
Date: Fri, 3 Mar 2023 22:45:22 +0000
Subject: [PATCH] Fix for variant extraction bug with multiple tokens (issue #28)

---
 kindred/EntityRecognizer.py | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/kindred/EntityRecognizer.py b/kindred/EntityRecognizer.py
index f04da5d..3645d2a 100644
--- a/kindred/EntityRecognizer.py
+++ b/kindred/EntityRecognizer.py
@@ -253,32 +253,33 @@ def __init__(self,lookup,detectFusionGenes=False,detectMicroRNA=False,acronymDet
 		self.acronymDetectionForAmbiguity = acronymDetectionForAmbiguity
 		self.mergeTerms = mergeTerms
 		self.detectVariants = detectVariants
-		self.variantStopwords = set(variantStopwords)
+		self.variantStopwords = set([vs.lower() for vs in variantStopwords])
 		self.detectPolymorphisms = detectPolymorphisms
 		self.removePathways = removePathways
 
+		self.variantRegex1 = re.compile(r'\b[ACDEFGHIKLMNPQRSTVWY][1-9][0-9]*[ACDEFGHIKLMNPQRSTVWY]\b')
+		self.variantRegex2 = re.compile(r'\b(p\.)?((Ala)|(Arg)|(Asn)|(Asp)|(Cys)|(Glu)|(Gln)|(Gly)|(His)|(Ile)|(Leu)|(Lys)|(Met)|(Phe)|(Pro)|(Ser)|(Thr)|(Trp)|(Tyr)|(Val))[1-9][0-9]*((Ala)|(Arg)|(Asn)|(Asp)|(Cys)|(Glu)|(Gln)|(Gly)|(His)|(Ile)|(Leu)|(Lys)|(Met)|(Phe)|(Pro)|(Ser)|(Thr)|(Trp)|(Tyr)|(Val))\b',re.IGNORECASE)
+
 	def _processWords(self, sentence):
 		locs,terms,termtypesAndids = getTermIDsAndLocations(sentence,self.lookup)
 
 		words = [ t.word for t in sentence.tokens ]
-		
+
 		if self.detectVariants:
-			#snvRegex = r'^[A-Z][0-9]+[A-Z]$'
-			variantRegex1 = r'^[ACDEFGHIKLMNPQRSTVWY][1-9][0-9]*[ACDEFGHIKLMNPQRSTVWY]$'
-			variantRegex2 = r'^(p\.)?((Ala)|(Arg)|(Asn)|(Asp)|(Cys)|(Glu)|(Gln)|(Gly)|(His)|(Ile)|(Leu)|(Lys)|(Met)|(Phe)|(Pro)|(Ser)|(Thr)|(Trp)|(Tyr)|(Val))[1-9][0-9]*((Ala)|(Arg)|(Asn)|(Asp)|(Cys)|(Glu)|(Gln)|(Gly)|(His)|(Ile)|(Leu)|(Lys)|(Met)|(Phe)|(Pro)|(Ser)|(Thr)|(Trp)|(Tyr)|(Val))$'
-
-			filteredWords = [ w for w in words if not w in self.variantStopwords ]
-			snvMatches1 = [ not (re.match(variantRegex1,w) is None) for w in filteredWords ]
-			snvMatches2 = [ not (re.match(variantRegex2,w,re.IGNORECASE) is None) for w in filteredWords ]
-
-			snvMatches = [ (match1 or match2) for match1,match2 in zip(snvMatches1,snvMatches2) ]
-			for i,(w,snvMatch) in enumerate(zip(words,snvMatches)):
-				if snvMatch:
-					cleaned = cleanupVariant(w)
-					potentialLocs = (i,i+1)
+			# Index the start and end locations of tokens for lookup
+			token_starts = { t.startPos:i for i,t in enumerate(sentence.tokens) }
+			token_ends = { t.endPos:i for i,t in enumerate(sentence.tokens) }
+
+			snvMatches = list(self.variantRegex1.finditer(sentence.text)) + list(self.variantRegex2.finditer(sentence.text))
+			for match in snvMatches:
+				snvText = match.group()
+				start,end = match.span()
+				if start in token_starts and end in token_ends and not snvText.lower() in self.variantStopwords:
+					cleaned = cleanupVariant(snvText)
+					potentialLocs = (token_starts[start],token_ends[end]+1)
 					if not potentialLocs in locs:
 						termtypesAndids.append([('variant',"substitution|%s"%cleaned)])
-						terms.append((w,))
+						terms.append((snvText,))
 						locs.append(potentialLocs)
 
 		if self.detectPolymorphisms:
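
Note (illustration, not part of the patch): the fix works by compiling the variant
patterns once, scanning the raw sentence text with finditer, and accepting a match
only when its character span lines up exactly with token boundaries, so substitutions
that the tokenizer splits across several tokens (e.g. "p.Gly12Asp") can still be
captured. A minimal standalone sketch of that strategy follows, using made-up token
offsets rather than kindred's parser output:

import re

variantRegex1 = re.compile(r'\b[ACDEFGHIKLMNPQRSTVWY][1-9][0-9]*[ACDEFGHIKLMNPQRSTVWY]\b')

text = "The V600E mutation was observed."
# Hypothetical (word, startPos, endPos) tuples standing in for parsed tokens
tokens = [("The",0,3), ("V600E",4,9), ("mutation",10,18),
          ("was",19,22), ("observed",23,31), (".",31,32)]

token_starts = { start:i for i,(_,start,_) in enumerate(tokens) }
token_ends = { end:i for i,(_,_,end) in enumerate(tokens) }

for match in variantRegex1.finditer(text):
    start,end = match.span()
    # Accept only matches that begin at a token start and finish at a token end,
    # so a hit spanning several tokens still maps cleanly onto token indices
    if start in token_starts and end in token_ends:
        loc = (token_starts[start], token_ends[end]+1)
        print(match.group(), "-> token span", loc)  # V600E -> token span (1, 2)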