tokenizer.py
# coding: utf-8
#### MEMORY BASED SHALLOW PARSER ######################################################################
# Copyright (c) 2003-2010 University of Antwerp, Belgium and Tilburg University, The Netherlands
# Vincent Van Asch <vincent.vanasch@ua.ac.be>, Tom De Smedt <tom@organisms.be>
# License: GNU General Public License, see LICENSE.txt
#### ENGLISH TOKENIZER ###############################################################################
# Tokenizer divides a sequence of characters into words and sentences using regular expressions.
# Special care is given to punctuation marks. We need to guess which punctuation marks are part
# of a word (e.g. abbreviations) and which mark word and sentence breaks.
#
# Based on:
# - CNTS tokenizer.pl - S. Buchholz, E. Tjong Kim Sang, J. Meyhi, 1998-2005
# - CNTS tokenizer.py - V. Van Asch, 2008
######################################################################################################
# Use Penn Treebank specifications for tokenization?
# - For numbers, only % is split: 100% => 100 %
# - Number ranges are not split: 100-200 => 100-200
# - For contractions: n't and 'm are not substituted: won't => wo n't
PENN_TREEBANK = True
######################################################################################################
# Use corrections for biomedical texts?
# - Extra rule in assert_split(): don't split "3(R),3a(S),6a(R)-bis-tetrahydrofuranylurethane".
BIOMEDICAL = True
######################################################################################################
# Note for developers:
# For performance, always compile regular expressions once, outside of the functions.
import re
PUNCTUATION = [ch for ch in """(){}[]<>!?.:;,`'"@#$%^&*+-|=~/\\_"""]
LETTERS = [ch for ch in "abcdefghijklmnopqrstuvwxyz"]
CONSONANTS = [ch for ch in "bcdfghjklmnpqrstvwxz"] # Need this for abbreviations like Mr. and Assn.
WHITESPACE = [ch for ch in " \t\n\r\f\v"] # Need this to split words.
DASHES = [ch for ch in u"–—"] # Not to be confused with hyphen.
is_uppercase = lambda s: len(s)>0 and s==s.upper() # GOODBYE => True
is_lowercase = lambda s: len(s)>0 and s==s.lower() # goodbye => True
is_capitalized = lambda s: len(s)>0 and is_uppercase(s[0]) \
and is_lowercase(s[1:]) # Goodbye => True
digits = re.compile("^[0-9]+$")
is_int = lambda s: digits.search(s) is not None
# Regex pattern for entities, e.g. "&amp;" and "&#164;".
entity = "&[a-z]+;|&#[0-9]+;"
#### RANGES ##########################################################################################
# Sets that match a range of words: all abbreviations, all numeric strings, all hyperlinks, ...
# The Range class is a dictionary enriched with regular expression patterns.
# You can do handy "word in Range()" or in_any(word, ranges) checks.
class Range(dict):
def __init__(self, items=[]):
dict.__init__(self, [(x,True) for x in items])
self.patterns = []
def __contains__(self, str):
if dict.__contains__(self, str):
return True
for p in self.patterns:
if p.search(str) is not None:
return True
return False
def in_any(word, ranges=[]):
for rng in ranges:
if word in rng: return True
return False
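# Example (illustrative, hand-traced; not in the original source): a Range matches
# its literal members as well as any regex patterns appended to Range.patterns.
# >>> r = Range(["foo"])
# >>> r.patterns.append(re.compile("^ba+r$"))
# >>> "foo" in r, "baaar" in r, "baz" in r
# (True, True, False)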
#--- ABBREVIATIONS -----------------------------------------------------------------------------------
abbreviations = [
"Adm.", "Ala.", "Ariz.", "Ark.", "Aug.", "B.C.", "Bancorp.", "Bhd.", "Brig.", "Bros.", "CO.",
"CORP.", "COS.", "Calif.", "Capt.", "Cie.", "Cmdr.", "Co.", "Col.", "Colo.", "Conn.", "Corp.",
"Cos.", "Cpl.", "D-Mass.", "D.C.", "Dec.", "Del.", "Dept.", "Dr.", "E.g.", "Etc.", "Ex.", "Exch.",
"Feb.", "Fla.", "Fri.", "Ga.", "Gen.", "Gov.", "INC.", "Ill.", "Inc.", "Ind.", "Jan.", "Jansz.",
"Jos.", "Jr.", "Kan.", "Ky.", "L.A.", "La.", "Lt.", "Ltd.", "Maj.", "Mass.", "Md.", "Messrs.",
"Mfg.", "Mich.", "Minn.", "Miss.", "Mo.", "Mon.", "Mr.", "Mrs.", "Ms.", "Mt.", "N.C.", "N.J.",
"N.Y.", "NFATc.", "Neb.", "Nev.", "No.", "Nos.", "Nov.", "O.J.", "Oct.", "Okla.", "Ont.", "Ore.",
"P.T.", "Pa.", "Ph.", "Prof.", "Prop.", "Pty.", "R.I.", "R.J.", "Rep.", "Reps.", "Rev.", "S.C.",
"Sat.", "Sen.", "Sens.", "Sep.", "Sept.", "Sgt.", "Sol.", "Sr.", "St.", "Sun.", "Tenn.", "Tex.",
"Thu.", "Tue.", "U.K.", "U.N.", "U.S.", "Va.", "Vt.", "W.J.", "W.Va.", "Wash.", "Wed.", "Wis.",
"Wyo.", "a.m.", "cit.", "def.", "ed.", "eds.", "e.g.", "etc.", "ft.", "i.e.", "op.", "p.m.", "pp.",
"sc.", "v.", "vs."
]
class Abbreviations(Range):
def __init__(self, known=[]):
""" A dictionary of known abbreviations, extended with patterns of likely abbreviations.
word in Abbreviations() == True if word is a known or likely abbreviation.
            Periods in English are ambiguous,
            marking the end of a sentence, an abbreviation, a decimal point or an ellipsis.
            - Simple rule: "every point is a sentence break" is 93.20% correct for the Brown corpus.
            - Fix decimal points: 93.64% correct.
            - Fix single letter abbreviations (T. De Smedt), alternating letters (e.g. U.S.),
              and capital letters followed by consonants (Mr. Assn.): 97.7% correct.
            - Fix with a dictionary of known abbreviations: up to 99.07% accuracy.
http://bulba.sdsu.edu/~malouf/ling571/13-token-bw.pdf
"""
Range.__init__(self, known)
self.patterns = [
re.compile("^[A-Za-z]\.$"), # single letter, "T. De Smedt"
re.compile("^([A-Za-z]\.)+$"), # alternating letters, "U.S."
#re.compile(".+\.[,;]$"), # followed by punctuation, "dept.,"
re.compile("^[A-Z]["+"|".join(CONSONANTS)+"]+.$") # capital followed by consonants, "Mr."
]
def __contains__(self, word):
return "." in word and Range.__contains__(self, word)
abbreviations = Abbreviations(abbreviations)
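# Example (illustrative): known abbreviations and pattern-based guesses both match;
# a word without a period is never considered an abbreviation.
# >>> "Mr." in abbreviations, "T." in abbreviations, "U.S." in abbreviations
# (True, True, True)
# >>> "Mrs" in abbreviations
# False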
#--- NUMBERS -----------------------------------------------------------------------------------------
units = [
"nm", "mm", "cm", "m", "km", "\"", "'", "ft", # length
"g", "kg", "lb", "lbs", "oz", # mass
"l", "L", "mmol", # volume
"hrs", "hr", "h", "u", "AM", "PM", "am", "pm", "a.m.", "p.m.", # time
"BC", "AD", "B.C.", "A.D.", # epoch
"°C", "°F", # temperature
"KB", "MB", "GB", "TB", "kb", "mb", "gb", "tb", # storage capacity
"Kb", "Mb", "Mbit", "Gb", "Tb", # data transfer rate
"%", # percentage
]
# Currency units can also appear at the start of a number (e.g. US$100).
units_currency = ["$", "£", "€", "EUR", "BEF", "US$", "C$"]
units.extend(units_currency)
class Numeric(Range):
def __init__(self, units=[], punctuation="\.,:/"):
""" A range for numeric strings.
            word in Numeric() == True for 0, 000.0, 00/00/00, 00:00, 0km, 0.0KB ...
The range will match anything that starts with a digit
followed by a chain of digits and .,:/ separators,
as long as the last character is a digit,
optionally followed by a unit of measurement.
"""
Range.__init__(self)
self.units = dict.fromkeys(units, True)
# Anything that starts with a digit, ends with a digit,
# with a mix of digits and punctuation in between, suffixed by a unit.
# Note the 4: this is the maximum unit length.
self.patterns = [re.compile("^(\d*["+punctuation+"])*(\d+)(.{0,4})$")]
# Currency units can also appear at the start of a number (e.g. US$100).
self.patterns += [re.compile("^("+"|".join([re.escape(x) for x in units_currency])+")")]
self._cache = ("", "", False) # (word, unit suffix, is number?)
def __contains__(self, word):
if self._cache[0] == word:
return self._cache[2]
elif self.patterns[1].match(word):
m = self.patterns[1].match(word)
u = m.group(1)
b = True
elif len(word) > 0 and word[0].isalpha():
u, b = "", False
else:
m = self.patterns[0].search(word)
u = m is not None and m.group(3) or ""
b = m is not None and u=="" or u in self.units
self._cache = (word, u, b)
return b
def unit(self, word):
""" Yields the unit prefix/suffix of the last word checked, e.g. 1000km => km
"""
if word != self._cache[0]:
word in self
return self._cache[1]
@property
def cached(self):
return self._cache[0]
numeric = Numeric(units)
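# Example (illustrative): digits with separators and a known unit suffix match;
# a trailing period is not a unit, so "10:45." fails (split_missing_space() relies on this).
# >>> "100km" in numeric, numeric.unit("100km")
# (True, 'km')
# >>> "10:45" in numeric, "10:45." in numeric
# (True, False)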
def split_numeric(word):
""" Returns a list with the word, and its unit if it is numeric: "100kg" => ["100", "kg"]
"""
u = numeric.unit(word)
if not PENN_TREEBANK and u != "" or u == "%": # Penn treebank's choice: only split %, 100% => 100 %
return word.startswith(u) and [u, word[len(u):]] or [word[:-len(u)], u]
return [word]
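# Example (illustrative): with PENN_TREEBANK = True, only % is split off.
# >>> split_numeric("100%")
# ['100', '%']
# >>> split_numeric("100km")
# ['100km']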
#--- UNIFORM RESOURCE INDENTIFIERS -------------------------------------------------------------------
class UniformResourceIdentifiers(Range):
def __init__(self):
""" A range for URLs, links and e-mail addresses.
It will match:
- anything that starts with http://, ftp://, feed://
- domain names: www.google.com, google.com, nodebox.net/Home
- e-mail addresses: Tom.DeSmedt@somewhere.com
            In the last case, for example, we might rule against splitting
            Tom. <=> DeSmedt@somewhere.com as two separate sentences.
"""
Range.__init__(self)
self.patterns = [
re.compile("^(http|https|ftp|feed)://"), # url: http://xxxxxx.xxx
re.compile("^www\."), # domain: www.xxxxxx.xxx
re.compile("\.(com|org|net|us|eu|uk|de|be)(/.*?)*$"), # domain: xxxxxx.com, xxxxxx.net/Home
re.compile("^(.*?)@(.*?)\.(.*?)") # email: xxx@xxx.xxx
]
URI = UniformResourceIdentifiers()
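# Example (illustrative):
# >>> "www.google.com" in URI, "Tom.DeSmedt@somewhere.com" in URI
# (True, True)
# >>> "goodbye." in URI
# False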
#--- ENTITIES ----------------------------------------------------------------------------------------
class Entities(Range):
def __init__(self):
""" A range for HTML (e.g. é) and Unicode (e.g. ¤) entities.
If you can, reuse a cached version of re.compile(entity) instead; it's 2x faster.
"""
Range.__init__(self)
def __contains__(self, word):
if len(word) > 3 and word.startswith("&") and word.endswith(";"):
if word[1:-1].isalpha() \
or word[1] == "#":
return True
return False
entities = Entities()
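# Example (illustrative):
# >>> "&amp;" in entities, "&#164;" in entities
# (True, True)
# >>> "&10;" in entities
# False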
#--- BIOMEDICAL --------------------------------------------------------------------------------------
class Biomedical(Range):
def __init__(self):
""" A range for guessing biomedical words, such as proteins and lipids.
            It will match:
- combinations of numbers, apostrophes, dashes and alphabetical characters:
1,2,3-triphosphatefenol, 1',2'-trifenol, phosphatidylinositol-4,5-bisphosphate.
- specific abbreviations in parenthesis: (IL-6)-deficient,
"bile duct ligation (BDL)- or ethinyl estradiol (EE)-injected rats"
"""
Range.__init__(self)
self.patterns = [
re.compile("\d'?(,\d’?)*-\w+"), # 1',2'-trifenol
re.compile("^\([\w|-]+\)-") # (IL-6)-deficient
]
biomedical = Biomedical()
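# Example (illustrative):
# >>> "(IL-6)-deficient" in biomedical, "1',2'-trifenol" in biomedical
# (True, True)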
##### SPLITTERS ######################################################################################
# Functions that split words, punctuation, sentences.
#--- CONTRACTIONS ------------------------------------------------------------------------------------
# Apostrophes in contractions (he's) and possessives (father's) mark word boundaries.
# These don't necessarily contain an apostrophe so we need to check them one by one.
# The whole word is replaced.
contractions = {
"cannot" : "can not",
"gonna" : "gon na",
"wanna" : "wan na"
}
# Special English contractions containing apostrophe.
# The whole word is replaced.
contractions_apostrophe = {
"i'm" : "i am",
"ain't" : "am not",
"can't" : "can not",
"won't" : "will not",
"shan't" : "shall not",
"let's" : "let us",
"'cause" : "because",
"'em" : "them",
"'til" : "until",
"'tis" : "it is",
"'twas" : "it was",
"'n'" : "and",
"'nother" : "another"
}
# Common English verb contractions.
# The word ending is replaced.
# Two possible modes: Penn Treebank's choice: won't => wo n't, verbose mode: won't => will not
contractions_suffixes = {
"n't" : ("not", "n't"), # Regular don't, isn't, wasn't, doesn't, haven't, ...
"'s" : ("'s", "'s"), # Can't disambiguate: it's been => it has been, it's hot => it is hot.
"'d" : ("'d", "'d"), # Can't disambiguate: I'd better go => I had, he'd want to go => he would.
"'m" : ("am", "'m"),
"'re" : ("are", "'re"),
"'ll" : ("will", "'ll"),
"'ve" : ("have", "'ve")
}
# Matches 60's, 1960's, 9's, which shouldn't be split; not currently in use.
era = re.compile("^[0-9]+'s$")
def case_sensitive(word, replacement):
""" Attempts to return replacement in the same case as the given word.
If word is lowercase, yield lowercase replacement.
If word is uppercase, yield uppercase replacement.
If word starts with a capital, yield lowercase replacement with starting capital.
- "cannot" => "can not"
- "CANNOT" => "CAN NOT"
- "Cannot" => "Can not"
- "caNnOt" => "caN nOt"
- "O'Sullivan's" => "O'Sullivan 's"
"""
if is_lowercase(word):
return replacement.lower()
if is_uppercase(word):
return replacement.upper()
if is_capitalized(word):
return replacement.lower().capitalize()
    if word.lower() == replacement.replace(" ", "", 1).lower():
i = replacement.find(" ")
if i >= 0: # If we simply added a space, derive the original case.
return word[:i]+" "+word[i:]
return replacement
def split_contraction(word):
""" Apostrophes in contractions (he's) and possessives (father's) are word boundaries.
Returns the expanded form of the word as a list of strings.
I've => ["I", "have"] (or ["I", "'ve"] in Penn Treebank mode).
"""
k = word.lower()
if k in contractions:
return case_sensitive(word, contractions[k]).split(" ")
if "'" in k:
if not PENN_TREEBANK and k in contractions_apostrophe:
return case_sensitive(word.lstrip("'"), contractions_apostrophe[k]).split(" ")
for s in contractions_suffixes:
if k.endswith(s):
return case_sensitive(word, word[:-len(s)]+" "+contractions_suffixes[s][PENN_TREEBANK]).split(" ")
return [word]
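# Example (illustrative): with PENN_TREEBANK = True the suffix is kept;
# in verbose mode it would be expanded (won't => will not).
# >>> split_contraction("won't")
# ['wo', "n't"]
# >>> split_contraction("I've")
# ['I', "'ve"]
# >>> split_contraction("cannot")
# ['can', 'not']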
#--- HYPHENATION -------------------------------------------------------------------------------------
# Hyphens at the end of a line mark words that have been split across lines.
# Hyphenated words are those that end in a single "-".
# Unknown cases (e.g. "blah--", "-") are ignored.
hyphen = re.compile("\w\-$")
# Coordinators are never joined to the hyphened word.
coordinators = ["and", "or"]
# Words starting with "and-" or "or-".
# Rare, e.g. rag- \nand-bone man.
coordination_prefix = re.compile("^(" + "|".join([x + "-" for x in coordinators]) + ")")
def split_hyphenation(words=[]):
""" Joins words that have been hyphenated by line-wrapping, i.e. "mar- \nket" => "market\n".
Otherwise, "mar- ket" is ignored; hyphens only occur inside a word or at the end of line.
The input is a list of split words and \n items (see split_words() below).
The output is the restructured list:
1) ["Great-","\n","Britain"] => ["Great-Britain","\n"]
2) ["nineteenth-","\n","and","twentieth-century","writers"] remains unchanged
3) ["EU-", "\n", "mandaat"] => ["EU-mandaat", "\n"]
4) ["12-", "\n", "hour"] => ["12-hour", "\n"]
5) ["mar-","\n","ket"] => ["market","\n"]
"""
p, i, n = [], 0, len(words)
while i < n:
w1 = words[i]
br = i<n-1 and words[i+1] or ""
w2 = i<n-2 and words[i+2] or ""
# Check for a break first, lazy evaluation might save us a regex lookup.
        if br == "\n" and w2 and hyphen.search(w1) is not None:
            if is_capitalized(w2): # (1)
                p.extend([w1+w2, "\n"])
                i += 3
            elif w2 in coordinators \
              or coordination_prefix.search(w2) is not None: # (2) + schiet- \nen-vechtgeval
                # Keep the hyphenated word and the break; w2 is handled in the next iteration.
                p.extend([w1, "\n"])
                i += 2
            elif is_uppercase(w1[-2]): # (3)
                p.extend([w1+w2, "\n"])
                i += 3
            elif w1[0:-1] in numeric: # (4)
                p.extend([w1+w2, "\n"])
                i += 3
            else: # (5)
                p.extend([w1[0:-1]+w2, "\n"])
                i += 3
        else:
            p.append(w1)
            i += 1
return p
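# Example (illustrative): line-wrapped "mar- \nket" is joined back together (5),
# while "Great- \nBritain" keeps its hyphen because "Britain" is capitalized (1).
# >>> split_hyphenation(["mar-", "\n", "ket"])
# ['market', '\n']
# >>> split_hyphenation(["Great-", "\n", "Britain"])
# ['Great-Britain', '\n']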
#--- MISSING SPACE -----------------------------------------------------------------------------------
# Punctuation inside a word can indicate a missing space, e.g. "Hello;and goodbye".
# The patterns below give the general idea of how to split word-internal punctuation.
# Exceptions and corrections can be handled in assert_split().
dashes = "|".join(DASHES)
missing_space = [
re.compile("^(.*?)(\.\.+)(.*)"), # multiple periods: "wait..go" => "wait .. go"
re.compile("^(.*?\))(\.)(.*)"), # period preceded by parenthesis: "wait).go" => "wait) . go"
re.compile("^(.*?\.)(\(|\[)(.*)"), # period followed by parenthesis: "wait.(go" => "wait . (go"
re.compile("^([^\(]*?)(\))(.*)"), # unbalanced parenthesis: "wait)go" => "wait ) go"
re.compile("^(.*?)(\.)([A-Z].*)"), # period followed by capital: "wait.Go" => "wait . Go"
re.compile("^(.*?)(,|;|:|/)(.+)"), # word internal punctuation: "wait;go" => "wait ; go"
re.compile("^(.*?)("+dashes+")(.*)"), # em-dash and en-dash: "wait—go" => "wait — go"
re.compile("^(.*?)("+entity+")(.*)"), # HTML/Unicode entities: "égo" => "é go"
re.compile("^(.*?)(\[\d+\])(.*)"), # Wikipedia references: "wait[1]" => "wait [1]"
]
missing_space_range = re.compile("(\d+)(-)(\d+)") # "100-200" => "100 - 200"
missing_space.append(missing_space_range)
def assert_split(a, b, c, sets=[]):
""" Returns (a,b) if splitting the word into a and b seems correct.
Otherwise, return the corrected form.
"""
if b.startswith("."):
# Splitting "etc.," to ["etc.", ","] instead of ["etc", ".", ","]
if b == ".," or in_any(a+".", sets):
return a+".", b[1:], c
if b == ",":
# Splitting "6,000-year" to ["6", ",", "000-year"] is not allowed.
# Splitting "3(R),3a(S),6a(R)-bis-tetrahydrofuranylurethane" is not allowed in biomedical mode.
if len(a)>0 and (is_int(a[-1]) or (BIOMEDICAL and a[-1] in "),")) and \
len(c)>0 and is_int(c[0]):
return None, None, None
return a, b, c
def split_missing_space(word, ignore=[abbreviations, numeric, URI, entities]):
""" Splits words that are probably two word with missing punctuation in between.
Returns the expanded form of the word as a list of strings, i.e.
nice;also => ["nice", ";", "also"]
The ignore list contains lists of known words that should not be split,
see also abbreviations, numeric, URI, entities...
"""
    # In the case of "It's 10:45.", the 10:45 will be split because "10:45."
    # does not validate as a number, whereas "10:45" does.
# So we check if it is a number with the word's head and tail punctuation removed.
# This head and tail punctuation needs to be split in a later step - see split_punctuation().
if in_any(word.strip("".join(PUNCTUATION)), ignore):
return [word]
for p in missing_space:
if PENN_TREEBANK and p == missing_space_range:
# Penn treebank's choice: don't split 100-200.
continue
m = p.search(word)
if m is not None:
a, b, c = m.group(1), m.group(2), m.group(3)
a, b, c = assert_split(a, b, c, ignore) # etc., => etc.][. not etc][.,
if a == b == c == None : continue
S = [b]
# The split "words" in the chain may not have been examined, recurse them.
if len(a) > 0: S = split_missing_space(a, ignore) + S
if len(c) > 0: S+= split_missing_space(c, ignore)
return [x for x in S if x != ""]
return [word]
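# Example (illustrative):
# >>> split_missing_space("nice;also")
# ['nice', ';', 'also']
# >>> split_missing_space("wait.Go")
# ['wait', '.', 'Go']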
#--- PUNCTUATION -------------------------------------------------------------------------------------
# Deal with punctuation at the head or tail of a word.
punctuation = dict.fromkeys(PUNCTUATION, True) # Dictionary lookup is faster.
# A number of words to keep intact:
punctuation_head = Range()
punctuation_head.patterns = [
re.compile("^\.[0-9]+"), # calibre: .22
re.compile("^'[0-9][0-9](-|$)"), # year: '93 '93-'94
re.compile("^\([A-Za-z]+(-\)|\)-)"), # rare: (oud-)student (her)-introductie
]
punctuation_tail = Range()
punctuation_tail.patterns = [
re.compile(".\([A-Za-z]+\)$"), # rare: koning(in)
]
def split_chars(word, chunk=".-"):
""" Returns a list of characters in the given string.
Characters in the defined chunk will be kept together,
i.e. split_chars("Hello...", chunk="l.") => ["H", "e", "ll", "o", "..."]
"""
p = [""]
for ch in word:
if ch in chunk and p[-1].startswith(ch):
p[-1] += ch
else:
p.append(ch)
return p[1:] or [""]
def split_punctuation(word, ignore=[abbreviations, numeric, URI, entities, biomedical]):
""" Splits punctuation from the head and tail of the word.
Returns the expanded form of the word as a list of strings, i.e.
goodbye. => ["goodbye", "."]
The ignore list contains lists of known words that should not be split,
see also abbreviations, numeric, URI, entities...
Some other rare exceptions are hard-coded not to split
(see punctuation_head and punctuation_tail above).
"""
if len(word) == 0 or word[0].isalpha() and word[-1].isalpha():
# Nothing to do: there is no punctuation at the head or tail of the word.
# This shortcut makes the function call 2x faster.
return [word]
if in_any(word, ignore):
# If the word is in a known range (e.g. a HTML entity or abbreviation),
# no further processing is required. We do split any number unit (e.g. 1000km => 1000 km).
return split_numeric(word)
if word in contractions_suffixes:
return [word]
# Find the head and the tail of the word containing punctuation.
p, i, j = [], 0, len(word)-1
while i<=j and word[i] in punctuation: i+=1
while j>=0 and word[j] in punctuation: j-=1
# Discard the head or tail if the punctuation is valid (e.g. .22 calibre).
if word in punctuation_tail: j=len(word)
if word in punctuation_head \
or word[:j+1] in contractions_apostrophe: i=0
# If the head and the tail are the same,
    # (i.e. the word consists entirely of punctuation), discard the tail.
a, b, c = word[:i], word[i:j+1], j>-1 and word[j+1:] or ""
b, c, X = assert_split(b, c, "", ignore) # "etc.," => "etc. ," not "etc .,"
if b == c == X == None: return [word]
# Split the punctuation.
a = split_chars(a, chunk=".-'\"<>") # Keep dashes, ellipsis etc. together: -- ...
c = split_chars(c, chunk=".-'\"<>")
    # Split units from numbers: We ran a 100km. => We ran a 100 km . (only % in Penn Treebank mode)
b = split_numeric(b)
words = [x for x in a+b+c if x != ""]
words = balance_parenthesis(words)
return words
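# Example (illustrative): head/tail punctuation is split off,
# but assert_split() keeps the period attached to a known abbreviation.
# >>> split_punctuation("goodbye.")
# ['goodbye', '.']
# >>> split_punctuation("etc.,")
# ['etc.', ',']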
def balance_parenthesis(words, ignore="+"):
""" In split_punctuation() we split the trailing ")" from a word.
        Put it back if the word contains "(", to balance the parenthesis, e.g.:
- Normal case: function (x) => function ( x )
- Balance: function(x) => function(x ) => function(x)
- Balance: (value from function(x)) => ( value from function(x) )
"""
for i, word in enumerate(words):
if len(word) > 1 and not word.isalpha():
n = word.count("(")
m = word.count(")")
opened = n-m
for j in range(i+1, len(words)):
# The ignore string defines punctuation between the word and the close parens
# that we can merge back into the word, for example in biomedical use:
# L(4)Fe(DA(H+)) => L(4)Fe(DA(H + ) ) => L(4)Fe(DA(H+))
if words[j] not in ")"+ignore or opened <= 0: break
if words[j] == ")":
opened -= 1
words[i] += "".join(words[i+1:j+1])
for k in range(i+1, j+1):
words[k] = ""
return [word for word in words if word != ""]
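# Example (illustrative): one trailing ")" is merged back into the word
# to balance its opening "(", while the unbalanced one stays split off.
# >>> split_punctuation("function(x))")
# ['function(x)', ')']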
#---- WORDS ------------------------------------------------------------------------------------------
def split_words(string, keep="\n"):
""" Returns a list of words in the given string.
All whitespace characters except those defined as keepers are replaced by a standard space.
Multiple spaces are collapsed to a single space,
so that each word is separated by exactly one space.
Words can contain punctuation marks at the start or end, we need to process these separately.
        We retain \n (newline) in the output because we need it to process hyphenation.
"""
w = "|".join(filter(lambda ch: ch not in keep, WHITESPACE))
for ch in keep:
        # Collapse keepers, e.g. "\n \n" => "\n".
# We will be splitting on spaces in a minute,
# so ensure there is a space around each keeper.
string = re.sub(ch+"["+ch+"|\s]+", ch, string)
string = re.sub("\s{0,1}\n\s{0,1}", " "+ch+" ", string)
string = string.strip()
string = re.compile(w).sub(" ", string)
string = re.sub(" +", " ", string)
return string.split(" ")
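# Example (illustrative): whitespace collapses to single spaces, "\n" is kept as a token.
# >>> split_words("Hello  world.\nGoodbye.")
# ['Hello', 'world.', '\n', 'Goodbye.']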
# Word parts occurring in protein names:
PROTEIN = ["kappa", "in", "ase", "itis", "group", "complex", "inhibitor", "region", "site", "type", "ine"]
PROTEIN = re.compile("|".join(PROTEIN), re.I)
def split_word(word, ignore=[abbreviations, numeric, URI, entities, biomedical], previous=None):
""" Splits contracted words, joined words with a missing space, punctuation.
Returns the expanded form of the word as a list of strings, i.e.
can't... => ["cannot", "..."]
"""
if word.lower() in contractions:
# Check if the word is a contraction (e.g. "cannot") before doing the alpha-word-shortcut.
return [case_sensitive(word, contractions[word.lower()])]
if word.isalpha():
# We're in luck: the word contains only alphabetic characters, no disambiguation needed.
# This shortcut saves about 15-50% time depending on the complexity of the text.
return [word]
if len(word) == 2 and word.endswith(".") and previous is not None and PROTEIN.search(previous):
# The next rule handles abbreviated initials in names, however,
# word is not an initial if it is part of a protein name: "peri-kappa B." => "peri-kappa B ."
return [word[:-1], "."]
if in_any(word, ignore):
# If the word is in a known range (e.g. a HTML entity or abbreviation),
# no further processing is required. We do split any number unit (e.g. 1000km => 1000 km).
return split_numeric(word)
a = [word]
b = []; [b.extend(split_missing_space(word, ignore)) for word in a]; a=b
b = []; [b.extend(split_punctuation(word, ignore)) for word in a]; a=b
b = []; [b.extend(split_contraction(word)) for word in a]; a=b
return a
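# Example (illustrative): with PENN_TREEBANK = True.
# >>> split_word("can't...")
# ['ca', "n't", '...']
# >>> split_word("100%")
# ['100', '%']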
#--- TAGS --------------------------------------------------------------------------------------------
# By default, all tags are removed.
# If they need to be retained someone needs to add checks.
# Beware: <a href="foo" onclick="do();" fontsize=12> is a valid HTML tag...
tags = re.compile("</{0,1}.*?>")
# Before stripping tags, replace the following:
HTML_LIST_ITEM = "HTML___LIST___ITEM "
tags_replace = {
re.compile("<head.*?>.*?</head>") : "", # Remove HTML metadata.
re.compile("<style.*?>.*?</style>") : "", # Remove HTML CSS.
re.compile("<script.*?>.*?</script>") : "", # Remove HMTL Javascript.
re.compile("<li.*?>") : HTML_LIST_ITEM, # Add list markers for split_lists().
re.compile("\.{0,1}</h\d>") : ". ", # A <h1> block always ends sentence,
} # accomplished by adding a perdiod.
def strip_tags(string, replace=tags_replace):
""" Strips all tags from the given string.
"""
for p,r in replace.items():
string = p.sub(r, string)
return tags.sub("", string)
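# Example (illustrative): tags are removed; a closing </h1> becomes a period
# so that the heading ends the sentence.
# >>> strip_tags("<b>hello</b>")
# 'hello'
# >>> strip_tags("<h1>Title</h1>Once upon a time.")
# 'Title. Once upon a time.'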
#--- SENTENCES ---------------------------------------------------------------------------------------
# A separated period always ends a sentence.
# Punctuation like ? and ! ends a sentence when followed by a capitalized word.
stop_always = ["."]
stop_assert = ["?", "!", "..", "...", "....", "etc."]
# Parenthesis and quotes can be part of the sentence even if the period precedes it.
# We will check for open/close balance, e.g.:
# - "Stop!" he shouted => the ! does not end sentence.
# - "Stop!" He shouted => the ! ends sentence and the quote after it is part of that sentence.
parenthesis_open = ["(", "[", "{"]
parenthesis_close = [")", "]", "}"]
quotes = ["\"", "'", "''", "'''", "\"\"\""]
ASSERT = "assert"
SENTENCE_BREAK = "SENTENCE___BREAK"
def add_sentence_breaks(words=[], marker=SENTENCE_BREAK):
""" Returns the list of words with injected SENTENCE_BREAK markers.
["Hello", ".", "Having", "fun", "?"] =>
["Hello", ".", SENTENCE_BREAK, "Having", "fun", "?", SENTENCE_BREAK]
"""
quote_count = {}
p, i, n = [], 0, len(words)
stop = False
for i in range(n):
word = words[i]
        # We count the occurrences of quotes.
# If at any time the count for a quote is uneven, this means we are inside a quotation, e.g.
# "All work and no play." => if the period is followed by a ", the quote is part of this sentence.
if word in quotes:
quote_count[word] = quote_count.get(word,0)+1
# Periods, ellipsis and some other punctuation mark the end of the sentence,
# but we need to continue scanning if quotes or parenthesis follow
# that are still part of this sentence before adding the break.
p.append(word)
if stop != False:
if word in parenthesis_close: # hello!) Goodbye => hello!) ][ Goodbye
pass
elif word in quotes:
if quote_count.get(word,0) % 2 != 0: # Hello. "Goodbye". => Hello. ][ "Goodbye".
p.insert(len(p)-1, marker)
stop = False
else: # "Hello." Goodbye. => "Hello." ][ Goodbye.
pass
elif stop == True \
or stop == ASSERT and (is_capitalized(word) or word in parenthesis_open):
p.insert(len(p)-1, marker)
stop = False
else:
stop = False
# Separated period or ellipsis always ends sentence.
# For other punctuation marks we verify
# if it is followed by a capitalized letter or open parenthesis.
if word in stop_always:
stop = True
elif word in stop_assert:
stop = ASSERT
# Don't forget the sentence break at the end of the list,
# e.g. ["'", "Enough", "of", "this", "!", "'"]
    if stop != False: # True or ASSERT, e.g. a final "." or "!".
        p.append(marker)
return p
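# Example (illustrative), as in the docstring above:
# >>> add_sentence_breaks(["Hello", ".", "Having", "fun", "?"])
# ['Hello', '.', 'SENTENCE___BREAK', 'Having', 'fun', '?', 'SENTENCE___BREAK']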
def ignore_cited_breaks(words=[], marker=SENTENCE_BREAK):
""" Returns the list of words with SENTENCE_BREAK inside quotes removed.
['"', u'Stop', '.', 'SENTENCE___BREAK', 'Stop', '!', '"', 'he', 'shouted', '.', 'SENTENCE___BREAK'] =>
['"', u'Stop', '.', 'Stop', '!', '"', 'he', 'shouted', '.', 'SENTENCE___BREAK']
"""
p, citation = [], []
for i, word in enumerate(words):
# Scan balanced quotes only.
# If it is an open quote, the closing quote is one of the following words.
# If it is a closing quote, it matches the last opened quote.
if word in quotes and (word in words[i+1:] or citation and word == citation[-1]):
if len(citation) == 0 or citation[-1] != word:
citation.append(word) # Open quote.
else:
citation.pop() # Close quote.
if word != marker or len(citation) == 0:
p.append(word)
return p
def split_sentences(words=[], marker=SENTENCE_BREAK):
""" Returns a list of sentences. Each sentence is itself a list of words.
The input list of words is expected to have been treated with add_sentence_breaks().
"""
sentences = [[]]
for i, word in enumerate(words):
if word == marker:
sentences.append([])
else:
sentences[-1].append(word)
return [s for s in sentences if len(s) > 0]
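# Example (illustrative):
# >>> split_sentences(['Hello', '.', 'SENTENCE___BREAK', 'Having', 'fun', '?'])
# [['Hello', '.'], ['Having', 'fun', '?']]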
#--- LISTS -------------------------------------------------------------------------------------------
list_marker = re.compile("^(\d+\.|\d+\)|\*|\-|[a-z]\.|[a-z]\))$") # 1. 1) * - a. a)
def split_lists(words=[], marker=SENTENCE_BREAK):
""" Adds sentence breaks before list item markers.
A list item marker is only taken into account when at the start of a new line.
        Occurrences of HTML_LIST_ITEM (i.e. <li>, see strip_tags()) also get a sentence break.
There is no continuity check, e.g. 2) followed by a) is also regarded as a list.
"""
p = []
for i, word in enumerate(words):
if word == HTML_LIST_ITEM:
p.append(marker)
        elif (i == 0 or words[i-1] == "\n") and list_marker.search(word):
p.append(marker)
p.append(word)
elif i>0 and words[i-1] == "\n":
            # If a linebreak occurred and no list item follows,
# this marks the end of the list. Add a sentence break.
p.append(marker)
p.append(word)
else:
p.append(word)
return p
######################################################################################################
# Punctuation replacements for unicode characters.
# This makes our life easier - in the rest of the module we can simply work with ' and ".
unicode_replacements = {
u"‘’‚‛`´ʹʻʼʾʿˋˊ˴" : "'", # Single quote lookalikes.
u"“”„‟ʺˮ˵˶˝" : "\"", # Double quote lookalikes.
u"…" : "...", # Unicode ellipsis.
u"«" : "<<",
u"»" : ">>",
}
# Word ranges to ignore when splitting.
ignore = [abbreviations, numeric, URI, entities, biomedical]
def split(string, tags=False, citations=False, replace=unicode_replacements, ignore=ignore):
""" Splits the string into a list of sentences.
Punctuation is split from words as individual tokens.
With tags=False, removes SGML-tags (i.e. anything resembling "<...>") first.
With citations=True, cited sentences in quotes are kept together.
The replace dictionary contains (unicode) strings to normalize.
"""
# Make sure we have a unicode string.
if isinstance(string, str):
string = string.decode("utf-8")
    for k, v in replace.items():
        for ch in k:
            string = re.sub(ch, v, string)
if not tags:
string = strip_tags(string)
# Collapse whitespace and split on each space.
# We keep \n as a separate token because we still need it to check lists and hyphenation.
words = split_words(string, keep="\n")
words = split_lists(words)
words = split_hyphenation(words)
words = [word for word in words if word != "\n"]
# Split words with missing spaces, contractions, etc.
# Keep list item markers at the start of the sentence intact - see split_lists().
p = []
for i, word in enumerate(words):
        if (i == 0 or words[i-1] == SENTENCE_BREAK) and list_marker.search(word) is not None:
p.append(word)
else:
p.extend(split_word(word, ignore=ignore, previous=i>0 and words[i-1] or None))
# Add sentence breaks after periods and other punctuation that indicate the end of the sentence.
# Parse sentence breaks and create a list of individual sentence strings.
p = add_sentence_breaks(p)
p = citations and ignore_cited_breaks(p) or p
p = split_sentences(p)
p = [" ".join(sentence) for sentence in p]
return p
######################################################################################################
test1 = u"""
I live in the U.S.
The U.S. Army likes Shock and Awe.
U.N. regulations are not a part of their concern–Isn't it?
Yes! "I'd rather have a walk", Ms. Comble sighed.
'Me too!', Mr. P. Delaware cried.
They ran about 10km.
But then the 6,000-year ice age came...
"""
test2 = u"""
From “The Eskimo Cookbook”:
1. cut the flippers off from the oogruk
2. put the flippers in fresh blubber
3. let them stay there for about two weeks
4. take the loose fur off the flipper
5. cut flipper into small pieces and eat the meat
http://www.recipesource.com/ethnic/americas/eskimo/oogruk-flippers1.html
"""