From 2a50a3edc9d35f57ae42a921c621edc160877f4d Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Wed, 8 Dec 2021 15:19:56 +0100 Subject: [PATCH] Resolve ReDoS opportunity by fixing incorrectly specified regex (#2906) --- nltk/parse/malt.py | 2 +- nltk/sem/glue.py | 2 +- nltk/tag/brill.py | 2 +- nltk/tag/brill_trainer.py | 22 +++++++++++----------- nltk/tag/sequential.py | 8 ++++---- nltk/tbl/demo.py | 4 ++-- 6 files changed, 20 insertions(+), 20 deletions(-) diff --git a/nltk/parse/malt.py b/nltk/parse/malt.py index 768f4846bf..9393f113f7 100644 --- a/nltk/parse/malt.py +++ b/nltk/parse/malt.py @@ -32,7 +32,7 @@ def malt_regex_tagger(): (r"\)$", ")"), # round brackets (r"\[$", "["), (r"\]$", "]"), # square brackets - (r"^-?[0-9]+(.[0-9]+)?$", "CD"), # cardinal numbers + (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers (r"(The|the|A|a|An|an)$", "DT"), # articles (r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"), # pronouns (r"(His|his|Her|her|Its|its)$", "PRP$"), # possessive diff --git a/nltk/sem/glue.py b/nltk/sem/glue.py index 5b3c945571..a627aaa3f9 100644 --- a/nltk/sem/glue.py +++ b/nltk/sem/glue.py @@ -703,7 +703,7 @@ def get_pos_tagger(self): regexp_tagger = RegexpTagger( [ - (r"^-?[0-9]+(.[0-9]+)?$", "CD"), # cardinal numbers + (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers (r"(The|the|A|a|An|an)$", "AT"), # articles (r".*able$", "JJ"), # adjectives (r".*ness$", "NN"), # nouns formed from adjectives diff --git a/nltk/tag/brill.py b/nltk/tag/brill.py index b0b7607eac..05a8dd5f43 100644 --- a/nltk/tag/brill.py +++ b/nltk/tag/brill.py @@ -329,7 +329,7 @@ def print_train_stats(): ) print( "TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} " - "final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats) + "final: {finalerrors:5d} {finalacc:.4f}".format(**train_stats) ) head = "#ID | Score (train) | #Rules | Template" print(head, "\n", "-" * len(head), sep="") diff --git a/nltk/tag/brill_trainer.py b/nltk/tag/brill_trainer.py index 3e18a22b17..0f1c5bea8c 100644 --- a/nltk/tag/brill_trainer.py +++ b/nltk/tag/brill_trainer.py @@ -91,7 +91,7 @@ def __init__( # Training def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): - """ + r""" Trains the Brill tagger on the corpus *train_sents*, producing at most *max_rules* transformations, each of which reduces the net number of errors in the corpus by at least @@ -111,7 +111,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): >>> testing_data = [untag(s) for s in gold_data] >>> backoff = RegexpTagger([ - ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers + ... (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers ... (r'(The|the|A|a|An|an)$', 'AT'), # articles ... (r'.*able$', 'JJ'), # adjectives ... (r'.*ness$', 'NN'), # nouns formed from adjectives @@ -125,7 +125,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): >>> baseline = backoff #see NOTE1 >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS - 0.2450142... + 0.2433862... >>> # Set up templates >>> Template._cleartemplates() #clear any templates created in earlier tests @@ -137,7 +137,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): >>> tagger1 = tt.train(training_data, max_rules=10) TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None) Finding initial useful rules... - Found 845 useful rules. + Found 847 useful rules. B | S F r O | Score = Fixed - Broken @@ -150,7 +150,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): 85 85 0 0 | NN->, if Pos:NN@[-1] & Word:,@[0] 69 69 0 0 | NN->. if Pos:NN@[-1] & Word:.@[0] 51 51 0 0 | NN->IN if Pos:NN@[-1] & Word:of@[0] - 47 63 16 161 | NN->IN if Pos:NNS@[-1] + 47 63 16 162 | NN->IN if Pos:NNS@[-1] 33 33 0 0 | NN->TO if Pos:NN@[-1] & Word:to@[0] 26 26 0 0 | IN->. if Pos:NNS@[-1] & Word:.@[0] 24 24 0 0 | IN->, if Pos:NNS@[-1] & Word:,@[0] @@ -162,11 +162,11 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): >>> train_stats = tagger1.train_stats() >>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']] - [1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]] + [1776, 1270, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]] >>> tagger1.print_template_statistics(printunused=False) TEMPLATE STATISTICS (TRAIN) 2 templates, 10 rules) - TRAIN ( 2417 tokens) initial 1775 0.2656 final: 1269 0.4750 + TRAIN ( 2417 tokens) initial 1776 0.2652 final: 1270 0.4746 #ID | Score (train) | #Rules | Template -------------------------------------------- 001 | 305 0.603 | 7 0.700 | Template(Pos([-1]),Word([0])) @@ -175,7 +175,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS - 0.43996... + 0.43833... >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data) @@ -185,13 +185,13 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): True >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']] - [1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]] + [1859, 1380, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]] >>> # A high-accuracy tagger >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99) TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99) Finding initial useful rules... - Found 845 useful rules. + Found 847 useful rules. B | S F r O | Score = Fixed - Broken @@ -212,7 +212,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): 18 18 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0] >>> tagger2.evaluate(gold_data) # doctest: +ELLIPSIS - 0.44159544... + 0.43996743... >>> tagger2.rules()[2:4] (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')])) diff --git a/nltk/tag/sequential.py b/nltk/tag/sequential.py index 3576d2d8a9..0ce5ee683f 100644 --- a/nltk/tag/sequential.py +++ b/nltk/tag/sequential.py @@ -337,7 +337,7 @@ class UnigramTagger(NgramTagger): >>> test_sent = brown.sents(categories='news')[0] >>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500]) >>> for tok, tag in unigram_tagger.tag(test_sent): - ... print("({}, {}), ".format(tok, tag)) + ... print("({}, {}), ".format(tok, tag)) # doctest: +NORMALIZE_WHITESPACE (The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL), (Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT), (investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ), @@ -491,7 +491,7 @@ def context(self, tokens, index, history): @jsontags.register_tag class RegexpTagger(SequentialBackoffTagger): - """ + r""" Regular Expression Tagger The RegexpTagger assigns tags to tokens by comparing their @@ -503,7 +503,7 @@ class RegexpTagger(SequentialBackoffTagger): >>> from nltk.tag import RegexpTagger >>> test_sent = brown.sents(categories='news')[0] >>> regexp_tagger = RegexpTagger( - ... [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers + ... [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers ... (r'(The|the|A|a|An|an)$', 'AT'), # articles ... (r'.*able$', 'JJ'), # adjectives ... (r'.*ness$', 'NN'), # nouns formed from adjectives @@ -515,7 +515,7 @@ class RegexpTagger(SequentialBackoffTagger): ... ]) >>> regexp_tagger - >>> regexp_tagger.tag(test_sent) + >>> regexp_tagger.tag(test_sent) # doctest: +NORMALIZE_WHITESPACE [('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'), ('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'NN'), ("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'), diff --git a/nltk/tbl/demo.py b/nltk/tbl/demo.py index 613351b03a..8bdb8d360b 100644 --- a/nltk/tbl/demo.py +++ b/nltk/tbl/demo.py @@ -393,11 +393,11 @@ def _demo_plot(learning_curve_output, teststats, trainstats=None, take=None): plt.savefig(learning_curve_output) -NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(.[0-9]+)?$", "CD"), (r".*", "NN")]) +NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), (r".*", "NN")]) REGEXP_TAGGER = RegexpTagger( [ - (r"^-?[0-9]+(.[0-9]+)?$", "CD"), # cardinal numbers + (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers (r"(The|the|A|a|An|an)$", "AT"), # articles (r".*able$", "JJ"), # adjectives (r".*ness$", "NN"), # nouns formed from adjectives