/
sequential.py
755 lines (621 loc) · 27.2 KB
/
sequential.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
# Natural Language Toolkit: Sequential Backoff Taggers
#
# Copyright (C) 2001-2021 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# Tiago Tresoldi <tresoldi@users.sf.net> (original affix tagger)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Classes for tagging sentences sequentially, left to right. The
abstract base class SequentialBackoffTagger serves as the base
class for all the taggers in this module. Tagging of individual words
is performed by the method ``choose_tag()``, which is defined by
subclasses of SequentialBackoffTagger. If a tagger is unable to
determine a tag for the specified token, then its backoff tagger is
consulted instead. Any SequentialBackoffTagger may serve as a
backoff tagger for any other SequentialBackoffTagger.
"""
import ast
import re
from abc import abstractmethod
from typing import List, Optional, Tuple
from nltk import jsontags
from nltk.classify import NaiveBayesClassifier
from nltk.probability import ConditionalFreqDist
from nltk.tag.api import FeaturesetTaggerI, TaggerI
######################################################################
# Abstract Base Classes
######################################################################
class SequentialBackoffTagger(TaggerI):
    """
    Abstract base class for taggers that work through a sentence one
    token at a time, from left to right.  The decision for a single
    token is made by ``choose_tag()``, which every concrete subclass
    must supply.  When a tagger has no answer for a token (i.e. its
    ``choose_tag()`` returns None), the next tagger in its backoff
    chain is asked instead.

    :ivar _taggers: The ordered chain of taggers to try for each token:
        this tagger first, followed by the chain of its backoff tagger
        (if one was given).
    """

    def __init__(self, backoff=None):
        """
        :param backoff: An optional tagger to fall back on; its own
            ``_taggers`` chain is appended after this tagger.
        """
        chain = [self]
        if backoff is not None:
            # The backoff already carries its own flattened chain, so
            # concatenating keeps the whole fallback order in one list.
            chain = chain + backoff._taggers
        self._taggers = chain

    @property
    def backoff(self):
        """The backoff tagger for this tagger."""
        if len(self._taggers) > 1:
            return self._taggers[1]
        return None

    def tag(self, tokens):
        # docs inherited from TaggerI
        history = []
        for position in range(len(tokens)):
            # ``history`` holds exactly the tags chosen for the tokens
            # before ``position`` at the time of each call.
            history.append(self.tag_one(tokens, position, history))
        return list(zip(tokens, history))

    def tag_one(self, tokens, index, history):
        """
        Pick a tag for the token at *index* by asking each tagger in
        the chain in turn, and return the first answer that is not
        None.  If no tagger in the chain can decide, return None.

        :rtype: str
        :type tokens: list
        :param tokens: The list of words that are being tagged.
        :type index: int
        :param index: The index of the word whose tag should be
            returned.
        :type history: list(str)
        :param history: A list of the tags for all words before *index*.
        """
        for tagger in self._taggers:
            chosen = tagger.choose_tag(tokens, index, history)
            if chosen is not None:
                return chosen
        return None

    @abstractmethod
    def choose_tag(self, tokens, index, history):
        """
        Return this tagger's own choice of tag for the token at
        *index*, or None if it cannot decide.  Implementations must
        not consult the backoff tagger themselves; ``tag_one()`` does
        that.  Subclasses of SequentialBackoffTagger override this
        method.

        :rtype: str
        :type tokens: list
        :param tokens: The list of words that are being tagged.
        :type index: int
        :param index: The index of the word whose tag should be
            returned.
        :type history: list(str)
        :param history: A list of the tags for all words before *index*.
        """
class ContextTagger(SequentialBackoffTagger):
    """
    An abstract base class for sequential backoff taggers that choose
    a tag for a token based on the value of its "context".  Different
    subclasses are used to define different contexts.

    A ContextTagger chooses the tag for a token by calculating the
    token's context, and looking up the corresponding tag in a table.
    This table can be constructed manually; or it can be automatically
    constructed based on a training corpus, using the ``_train()``
    method.

    :ivar _context_to_tag: Dictionary mapping contexts to tags.
    """

    def __init__(self, context_to_tag, backoff=None):
        """
        :param context_to_tag: A dictionary mapping contexts to tags.
            A falsy value (None or an empty mapping) is replaced by a
            fresh empty dict.
        :param backoff: The backoff tagger that should be used for this tagger.
        """
        super().__init__(backoff)
        self._context_to_tag = context_to_tag if context_to_tag else {}

    @abstractmethod
    def context(self, tokens, index, history):
        """
        :return: the context that should be used to look up the tag
            for the specified token; or None if the specified token
            should not be handled by this tagger.
        :rtype: (hashable)
        """

    def choose_tag(self, tokens, index, history):
        """
        Look up the tag stored for this token's context.

        :return: the tag recorded for the context, or None when the
            context has no entry in the table.
        """
        context = self.context(tokens, index, history)
        return self._context_to_tag.get(context)

    def size(self):
        """
        :return: The number of entries in the table used by this
            tagger to map from contexts to tags.
        """
        return len(self._context_to_tag)

    def __repr__(self):
        return f"<{self.__class__.__name__}: size={self.size()}>"

    def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        Fill this ContextTagger's ``_context_to_tag`` table from the
        given training data.  For each context ``c`` seen in the data,
        ``_context_to_tag[c]`` is set to the most frequent tag for that
        context.  Contexts that the backoff tagger(s) already tag
        correctly on every occurrence are left out.

        Entries are added to (or overwrite entries in) the existing
        ``self._context_to_tag`` table; the table is not cleared first.

        Empty sentences in the corpus are skipped.

        :param tagged_corpus: A tagged corpus.  Each item should be
            a list of (word, tag) tuples.
        :param cutoff: A context is only added to the table when its
            most likely tag occurs more than *cutoff* times; contexts
            whose best tag occurs *cutoff* times or fewer are excluded.
        :param verbose: If true, print a one-line summary of the
            resulting table size, backoff rate and pruning rate.
        """
        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()

        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()

        # The backoff tagger does not change during training, so look
        # it up once rather than once per token.
        backoff_tagger = self.backoff

        for sentence in tagged_corpus:
            # zip(*sentence) yields nothing for an empty sentence, which
            # previously made the two-name unpacking raise ValueError.
            columns = list(zip(*sentence))
            if not columns:
                continue
            tokens, tags = columns
            for index, tag in enumerate(tags):
                # Record the event.
                token_count += 1
                history = tags[:index]
                context = self.context(tokens, index, history)
                if context is None:
                    continue
                fd[context][tag] += 1
                # If the backoff got it wrong, this context is useful:
                if backoff_tagger is None or tag != backoff_tagger.tag_one(
                    tokens, index, history
                ):
                    useful_contexts.add(context)

        # Build the context_to_tag table -- for each useful context,
        # figure out what the most likely tag is.  Only keep contexts
        # whose most likely tag was seen more than `cutoff` times.
        for context in useful_contexts:
            best_tag = fd[context].max()
            hits = fd[context][best_tag]
            if hits > cutoff:
                self._context_to_tag[context] = best_tag
                hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._context_to_tag)
            context_count = len(fd.conditions())
            # Either denominator can be zero (no tokens at all, or every
            # context was None); report 0.00% instead of dividing by zero.
            backoff_pct = (
                100 - (hit_count * 100.0) / token_count if token_count else 0.0
            )
            pruning_pct = (
                100 - (size * 100.0) / context_count if context_count else 0.0
            )
            # NOTE: the label below is fixed text and is printed for
            # every ContextTagger subclass, not only unigram taggers.
            print("[Trained Unigram tagger:", end=" ")
            print(
                "size={}, backoff={:.2f}%, pruning={:.2f}%]".format(
                    size, backoff_pct, pruning_pct
                )
            )
######################################################################
# Tagger Classes
######################################################################
@jsontags.register_tag
class DefaultTagger(SequentialBackoffTagger):
    """
    A tagger that gives every token the same, fixed tag.

        >>> from nltk.tag import DefaultTagger
        >>> default_tagger = DefaultTagger('NN')
        >>> list(default_tagger.tag('This is a test'.split()))
        [('This', 'NN'), ('is', 'NN'), ('a', 'NN'), ('test', 'NN')]

    It is mainly useful at the end of a backoff chain, to supply a tag
    when a more powerful tagger cannot decide (e.g. because the word
    was not seen during training).

    :param tag: The tag to assign to each token
    :type tag: str
    """

    json_tag = "nltk.tag.sequential.DefaultTagger"

    def __init__(self, tag):
        # A default tagger always answers, so it never takes a backoff.
        super().__init__(None)
        self._tag = tag

    def encode_json_obj(self):
        """Return the JSON-serialisable state: just the fixed tag."""
        return self._tag

    @classmethod
    def decode_json_obj(cls, obj):
        """Rebuild a tagger from the value produced by ``encode_json_obj``."""
        return cls(obj)

    def choose_tag(self, tokens, index, history):
        # The token and the history play no part in the decision.
        return self._tag

    def __repr__(self):
        return "<DefaultTagger: tag={}>".format(self._tag)
@jsontags.register_tag
class NgramTagger(ContextTagger):
    """
    A tagger that picks a token's tag from its word string together
    with the tags of the preceding n-1 words.  The pair
    (tags[i-n+1:i], words[i]) is looked up in a table and the tag
    stored there is returned.  N-gram taggers are typically trained on
    a tagged corpus.

    Construct a new NgramTagger from the given training data or from
    a ready-made model.  When training, the table maps each context
    (tags[i-n+1:i], words[i]) to the most frequent tag for that
    context, leaving out contexts that the backoff tagger already tags
    perfectly.

    :param n: The order of the n-gram; the context uses at most the
        n-1 preceding tags.
    :param train: A tagged corpus consisting of a list of tagged
        sentences, where each sentence is a list of (word, tag) tuples.
    :param model: A ready-made context-to-tag table to use instead of
        training.
    :param backoff: A backoff tagger, to be used by the new
        tagger if it encounters an unknown context.
    :param cutoff: Passed on to ``_train()``, which uses it as the
        frequency threshold for including a context in the
        context-to-tag table.
    :param verbose: Passed on to ``_train()``.
    """

    json_tag = "nltk.tag.sequential.NgramTagger"

    def __init__(
        self, n, train=None, model=None, backoff=None, cutoff=0, verbose=False
    ):
        self._n = n
        self._check_params(train, model)

        super().__init__(model, backoff)

        if train:
            self._train(train, cutoff, verbose)

    def encode_json_obj(self):
        """
        Return the JSON-serialisable state.  Context keys are stored
        as their ``repr()`` strings.  Classes whose name contains
        "NgramTagger" also store ``n`` as the first element.
        """
        _context_to_tag = {
            repr(context): tag for context, tag in self._context_to_tag.items()
        }
        if "NgramTagger" not in self.__class__.__name__:
            return _context_to_tag, self.backoff
        return self._n, _context_to_tag, self.backoff

    @classmethod
    def decode_json_obj(cls, obj):
        """
        Rebuild a tagger from the value produced by ``encode_json_obj``.
        Accepts both the three-element form (with ``n``) and the
        two-element form.  If the stored table is empty, the backoff
        tagger is returned in place of a new tagger.
        """
        try:
            _n, _context_to_tag, backoff = obj
        except ValueError:
            # Two-element form: no explicit n was stored.
            _context_to_tag, backoff = obj

        if not _context_to_tag:
            return backoff

        # Keys were written with repr(); turn them back into objects.
        _context_to_tag = {
            ast.literal_eval(key): tag for key, tag in _context_to_tag.items()
        }

        if "NgramTagger" in cls.__name__:
            return cls(_n, model=_context_to_tag, backoff=backoff)
        return cls(model=_context_to_tag, backoff=backoff)

    def context(self, tokens, index, history):
        # Use at most the n-1 tags immediately before ``index``; the
        # window is clipped at the start of the sentence.
        window_start = max(0, index - self._n + 1)
        return tuple(history[window_start:index]), tokens[index]
@jsontags.register_tag
class UnigramTagger(NgramTagger):
    """
    Unigram Tagger

    The UnigramTagger finds the most likely tag for each word in a training
    corpus, and then uses that information to assign tags to new tokens.

        >>> from nltk.corpus import brown
        >>> from nltk.tag import UnigramTagger
        >>> test_sent = brown.sents(categories='news')[0]
        >>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
        >>> for tok, tag in unigram_tagger.tag(test_sent):
        ...     print("({}, {}), ".format(tok, tag)) # doctest: +NORMALIZE_WHITESPACE
        (The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL),
        (Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT),
        (investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ),
        (primary, NN), (election, NN), (produced, VBD), (``, ``),
        (no, AT), (evidence, NN), ('', ''), (that, CS), (any, DTI),
        (irregularities, NNS), (took, VBD), (place, NN), (., .),

    :param train: The corpus of training data, a list of tagged sentences
    :type train: list(list(tuple(str, str)))
    :param model: The tagger model
    :type model: dict
    :param backoff: Another tagger which this tagger will consult when it is
        unable to tag a word
    :type backoff: TaggerI
    :param cutoff: The number of instances of training data the tagger must see
        in order not to use the backoff tagger
    :type cutoff: int
    """

    json_tag = "nltk.tag.sequential.UnigramTagger"

    def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
        # A unigram tagger is an n-gram tagger of order 1.
        super().__init__(
            n=1,
            train=train,
            model=model,
            backoff=backoff,
            cutoff=cutoff,
            verbose=verbose,
        )

    def context(self, tokens, index, history):
        # The context is the word alone; no tag history is used.
        word = tokens[index]
        return word
@jsontags.register_tag
class BigramTagger(NgramTagger):
    """
    A tagger that chooses a token's tag from its word string and the
    tag of the word before it.  The pair made of the previous tag and
    the word is looked up in a table, and the tag found there is
    returned.

    :param train: The corpus of training data, a list of tagged sentences
    :type train: list(list(tuple(str, str)))
    :param model: The tagger model
    :type model: dict
    :param backoff: Another tagger which this tagger will consult when it is
        unable to tag a word
    :type backoff: TaggerI
    :param cutoff: The number of instances of training data the tagger must see
        in order not to use the backoff tagger
    :type cutoff: int
    """

    json_tag = "nltk.tag.sequential.BigramTagger"

    def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
        # A bigram tagger is an n-gram tagger of order 2.
        super().__init__(
            n=2,
            train=train,
            model=model,
            backoff=backoff,
            cutoff=cutoff,
            verbose=verbose,
        )
@jsontags.register_tag
class TrigramTagger(NgramTagger):
    """
    A tagger that chooses a token's tag from its word string and the
    tags of the two words before it.  The tuple made of the previous
    two tags and the word is looked up in a table, and the tag found
    there is returned.

    :param train: The corpus of training data, a list of tagged sentences
    :type train: list(list(tuple(str, str)))
    :param model: The tagger model
    :type model: dict
    :param backoff: Another tagger which this tagger will consult when it is
        unable to tag a word
    :type backoff: TaggerI
    :param cutoff: The number of instances of training data the tagger must see
        in order not to use the backoff tagger
    :type cutoff: int
    """

    json_tag = "nltk.tag.sequential.TrigramTagger"

    def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
        # A trigram tagger is an n-gram tagger of order 3.
        super().__init__(
            n=3,
            train=train,
            model=model,
            backoff=backoff,
            cutoff=cutoff,
            verbose=verbose,
        )
@jsontags.register_tag
class AffixTagger(ContextTagger):
    """
    A tagger that picks a token's tag from a leading or trailing
    substring of its word string.  (Note that these substrings are
    not necessarily "true" morphological affixes.)  A fixed-length
    substring of the word is looked up in a table, and the tag stored
    there is returned.  Affix taggers are typically built by training
    them on a tagged corpus.

    Construct a new affix tagger.

    :param affix_length: The length of the affixes that should be
        considered during training and tagging.  Use negative
        numbers for suffixes.
    :param min_stem_length: Any words whose length is less than
        min_stem_length+abs(affix_length) will be assigned a
        tag of None by this tagger.
    """

    json_tag = "nltk.tag.sequential.AffixTagger"

    def __init__(
        self,
        train=None,
        model=None,
        affix_length=-3,
        min_stem_length=2,
        backoff=None,
        cutoff=0,
        verbose=False,
    ):
        self._check_params(train, model)

        super().__init__(model, backoff)

        # Shortest word this tagger will handle: the affix itself plus
        # the required amount of stem.
        self._min_word_length = min_stem_length + abs(affix_length)
        self._affix_length = affix_length

        if train:
            self._train(train, cutoff, verbose)

    def encode_json_obj(self):
        """
        Return the JSON-serialisable state: affix length, minimum word
        length, the context-to-tag table and the backoff tagger.
        """
        state = (
            self._affix_length,
            self._min_word_length,
            self._context_to_tag,
            self.backoff,
        )
        return state

    @classmethod
    def decode_json_obj(cls, obj):
        """Rebuild a tagger from the value produced by ``encode_json_obj``."""
        _affix_length, _min_word_length, _context_to_tag, backoff = obj
        # The stored value is the minimum *word* length; the constructor
        # expects the minimum *stem* length, so subtract the affix size.
        stem_length = _min_word_length - abs(_affix_length)
        return cls(
            affix_length=_affix_length,
            min_stem_length=stem_length,
            model=_context_to_tag,
            backoff=backoff,
        )

    def context(self, tokens, index, history):
        word = tokens[index]
        # Words that are too short are not handled by this tagger.
        if len(word) < self._min_word_length:
            return None
        # Positive length: prefix.  Otherwise: slice from the end.
        if self._affix_length > 0:
            return word[: self._affix_length]
        return word[self._affix_length :]
@jsontags.register_tag
class RegexpTagger(SequentialBackoffTagger):
    r"""
    Regular Expression Tagger

    The RegexpTagger assigns tags to tokens by matching their word
    strings against an ordered series of regular expressions.  The
    following tagger uses word suffixes to make guesses about the
    correct Brown Corpus part of speech tag:

        >>> from nltk.corpus import brown
        >>> from nltk.tag import RegexpTagger
        >>> test_sent = brown.sents(categories='news')[0]
        >>> regexp_tagger = RegexpTagger(
        ...     [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # cardinal numbers
        ...      (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        ...      (r'.*able$', 'JJ'),                # adjectives
        ...      (r'.*ness$', 'NN'),                # nouns formed from adjectives
        ...      (r'.*ly$', 'RB'),                  # adverbs
        ...      (r'.*s$', 'NNS'),                  # plural nouns
        ...      (r'.*ing$', 'VBG'),                # gerunds
        ...      (r'.*ed$', 'VBD'),                 # past tense verbs
        ...      (r'.*', 'NN')                      # nouns (default)
        ... ])
        >>> regexp_tagger
        <Regexp Tagger: size=9>
        >>> regexp_tagger.tag(test_sent) # doctest: +NORMALIZE_WHITESPACE
        [('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'),
        ('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'NN'),
        ("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'),
        ('produced', 'VBD'), ('``', 'NN'), ('no', 'NN'), ('evidence', 'NN'), ("''", 'NN'),
        ('that', 'NN'), ('any', 'NN'), ('irregularities', 'NNS'), ('took', 'NN'),
        ('place', 'NN'), ('.', 'NN')]

    :type regexps: list(tuple(str, str))
    :param regexps: A list of ``(regexp, tag)`` pairs, each of
        which indicates that a word matching ``regexp`` should
        be tagged with ``tag``.  The pairs are tried in order and
        the first match wins.  If none of the regexps match a word,
        the optional backoff tagger is invoked; without a backoff
        the word is assigned the tag None.
    """

    json_tag = "nltk.tag.sequential.RegexpTagger"

    def __init__(
        self, regexps: List[Tuple[str, str]], backoff: Optional[TaggerI] = None
    ):
        super().__init__(backoff)
        compiled = []
        self._regexps = compiled
        for regexp, tag in regexps:
            # Compile up front so a bad pattern is reported at
            # construction time, together with the offending pair.
            try:
                pattern = re.compile(regexp)
            except Exception as e:
                raise Exception(
                    f"Invalid RegexpTagger regexp: {e}\n- regexp: {regexp!r}\n- tag: {tag!r}"
                ) from e
            compiled.append((pattern, tag))

    def encode_json_obj(self):
        """
        Return the JSON-serialisable state: the ``(pattern, tag)``
        pairs as source strings, plus the backoff tagger.
        """
        pairs = [(pattern.pattern, tag) for pattern, tag in self._regexps]
        return pairs, self.backoff

    @classmethod
    def decode_json_obj(cls, obj):
        """Rebuild a tagger from the value produced by ``encode_json_obj``."""
        regexps, backoff = obj
        return cls(regexps, backoff)

    def choose_tag(self, tokens, index, history):
        # First pattern that matches at the start of the word decides;
        # None when nothing matches.
        return next(
            (tag for pattern, tag in self._regexps if pattern.match(tokens[index])),
            None,
        )

    def __repr__(self):
        return "<Regexp Tagger: size={}>".format(len(self._regexps))
class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI):
    """
    A sequential tagger that uses a classifier to choose the tag for
    each token in a sentence.  The featureset input for the classifier
    is generated by a feature detector function::

        feature_detector(tokens, index, history) -> featureset

    Where tokens is the list of unlabeled tokens in the sentence;
    index is the index of the token for which feature detection
    should be performed; and history is list of the tags for all
    tokens before index.

    Construct a new classifier-based sequential tagger.

    :param feature_detector: A function used to generate the
        featureset input for the classifier::
        feature_detector(tokens, index, history) -> featureset
        May be omitted by subclasses that override
        ``feature_detector()`` themselves.

    :param train: A tagged corpus consisting of a list of tagged
        sentences, where each sentence is a list of (word, tag) tuples.
        Empty sentences are skipped.

    :param classifier_builder: A function used to train a new
        classifier based on the data in *train*.  It should take
        one argument, a list of labeled featuresets (i.e.,
        (featureset, label) tuples).

    :param classifier: The classifier that should be used by the
        tagger.  This is only useful if you want to manually
        construct the classifier; normally, you would use *train*
        instead.

    :param backoff: A backoff tagger, used if this tagger is
        unable to determine a tag for a given token.

    :param cutoff_prob: If specified, then this tagger will fall
        back on its backoff tagger if the probability of the most
        likely tag is less than *cutoff_prob*.

    :param verbose: If true, print progress messages while training.

    :raise ValueError: If both or neither of *train* and *classifier*
        are given.
    """

    def __init__(
        self,
        feature_detector=None,
        train=None,
        classifier_builder=NaiveBayesClassifier.train,
        classifier=None,
        backoff=None,
        cutoff_prob=None,
        verbose=False,
    ):
        self._check_params(train, classifier)

        super().__init__(backoff)

        if (train and classifier) or (not train and not classifier):
            raise ValueError(
                "Must specify either training data or " "trained classifier."
            )

        # The feature detector function, used to generate a featureset
        # for each token: feature_detector(tokens, index, history) -> featureset.
        # Left unset when None so that subclasses overriding
        # ``feature_detector()`` need not pass one.
        if feature_detector is not None:
            self._feature_detector = feature_detector

        # Cutoff probability for tagging -- if the probability of the
        # most likely tag is less than this, then use backoff.
        self._cutoff_prob = cutoff_prob

        # The classifier used to choose a tag for each token.
        self._classifier = classifier

        if train:
            self._train(train, classifier_builder, verbose)

    def choose_tag(self, tokens, index, history):
        # Use our feature detector to get the featureset.
        featureset = self.feature_detector(tokens, index, history)

        # Use the classifier to pick a tag.  If a cutoff probability
        # was specified, then check that the tag's probability is
        # at least that cutoff first; otherwise, return None.
        if self._cutoff_prob is None:
            return self._classifier.classify(featureset)

        pdist = self._classifier.prob_classify(featureset)
        tag = pdist.max()
        return tag if pdist.prob(tag) >= self._cutoff_prob else None

    def _train(self, tagged_corpus, classifier_builder, verbose):
        """
        Build a new classifier, based on the given training data
        *tagged_corpus*, and store it as this tagger's classifier.

        :param tagged_corpus: Tagged sentences, each a sequence of
            (word, tag) tuples.  Empty sentences are skipped.
        :param classifier_builder: Called with the list of
            (featureset, tag) training instances; its return value
            becomes the classifier.
        :param verbose: If true, print progress messages.
        """
        classifier_corpus = []
        if verbose:
            print("Constructing training corpus for classifier.")

        for sentence in tagged_corpus:
            # zip(*sentence) yields nothing for an empty sentence, which
            # previously made the two-name unpacking raise ValueError.
            columns = list(zip(*sentence))
            if not columns:
                continue
            untagged_sentence, tags = columns
            # Gold tags of the tokens seen so far in this sentence.
            history = []
            for index, tag in enumerate(tags):
                featureset = self.feature_detector(untagged_sentence, index, history)
                classifier_corpus.append((featureset, tag))
                history.append(tag)

        if verbose:
            print(f"Training classifier ({len(classifier_corpus)} instances)")
        self._classifier = classifier_builder(classifier_corpus)

    def __repr__(self):
        return f"<ClassifierBasedTagger: {self._classifier}>"

    def feature_detector(self, tokens, index, history):
        """
        Apply this tagger's feature detector to the token at *index*
        and return the resulting featureset.  The feature detector is
        a function with the signature::

          feature_detector(tokens, index, history) -> featureset

        See ``classifier()``
        """
        return self._feature_detector(tokens, index, history)

    def classifier(self):
        """
        Return the classifier that this tagger uses to choose a tag
        for each word in a sentence.  The input for this classifier is
        generated using this tagger's feature detector.
        See ``feature_detector()``
        """
        return self._classifier
class ClassifierBasedPOSTagger(ClassifierBasedTagger):
    """
    A classifier based part of speech tagger.
    """

    def feature_detector(self, tokens, index, history):
        """
        Build the featureset for the token at *index*: the word and
        its lowercase form, its last one to three characters, the two
        preceding words and tags (None where the sentence start is
        reached), a few tag/word combinations, and a coarse "shape"
        label.

        :return: a dict mapping feature names to values.
        """
        word = tokens[index]

        # Neighbours that would fall before the sentence start are None.
        has_prev = index != 0
        has_prevprev = has_prev and index != 1
        prevword = tokens[index - 1].lower() if has_prev else None
        prevprevword = tokens[index - 2].lower() if has_prevprev else None
        prevtag = history[index - 1] if has_prev else None
        prevprevtag = history[index - 2] if has_prevprev else None

        # Ordered (pattern, label) table; the first pattern that matches
        # from the start of the word decides the shape.
        # NOTE(review): in the "number" pattern the trailing ``$`` binds
        # only to the second alternative, so any word that merely starts
        # with a digit (e.g. "3rd") is labelled "number".  Kept as is
        # because changing it would alter the features seen by
        # already-trained classifiers.
        shape_table = (
            (r"[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", "number"),
            (r"\W+$", "punct"),
            ("[A-Z][a-z]+$", "upcase"),
            ("[a-z]+$", "downcase"),
            (r"\w+$", "mixedcase"),
        )
        shape = "other"
        for pattern, label in shape_table:
            if re.match(pattern, word):
                shape = label
                break

        lowered = word.lower()
        return {
            "prevtag": prevtag,
            "prevprevtag": prevprevtag,
            "word": word,
            "word.lower": lowered,
            "suffix3": lowered[-3:],
            "suffix2": lowered[-2:],
            "suffix1": lowered[-1:],
            "prevprevword": prevprevword,
            "prevword": prevword,
            "prevtag+word": f"{prevtag}+{lowered}",
            "prevprevtag+word": f"{prevprevtag}+{lowered}",
            "prevword+word": f"{prevword}+{lowered}",
            "shape": shape,
        }