Resolve ReDoS opportunity by fixing incorrectly specified regex (#2906)

nltk · Dec 8, 2021 · 2a50a3e
1 parent 8ed8b70
commit 2a50a3e
Show file tree

Hide file tree

Showing 6 changed files with 20 additions and 20 deletions.
diff --git a/nltk/parse/malt.py b/nltk/parse/malt.py
@@ -32,7 +32,7 @@ def malt_regex_tagger():
             (r"\)$", ")"),  # round brackets
             (r"\[$", "["),
             (r"\]$", "]"),  # square brackets
-            (r"^-?[0-9]+(.[0-9]+)?$", "CD"),  # cardinal numbers
+            (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),  # cardinal numbers
             (r"(The|the|A|a|An|an)$", "DT"),  # articles
             (r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"),  # pronouns
             (r"(His|his|Her|her|Its|its)$", "PRP$"),  # possessive

diff --git a/nltk/sem/glue.py b/nltk/sem/glue.py
@@ -703,7 +703,7 @@ def get_pos_tagger(self):
 
         regexp_tagger = RegexpTagger(
             [
-                (r"^-?[0-9]+(.[0-9]+)?$", "CD"),  # cardinal numbers
+                (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),  # cardinal numbers
                 (r"(The|the|A|a|An|an)$", "AT"),  # articles
                 (r".*able$", "JJ"),  # adjectives
                 (r".*ness$", "NN"),  # nouns formed from adjectives

diff --git a/nltk/tag/brill.py b/nltk/tag/brill.py
@@ -329,7 +329,7 @@ def print_train_stats():
             )
             print(
                 "TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
-                "final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats)
+                "final: {finalerrors:5d} {finalacc:.4f}".format(**train_stats)
             )
             head = "#ID | Score (train) |  #Rules     | Template"
             print(head, "\n", "-" * len(head), sep="")

diff --git a/nltk/tag/brill_trainer.py b/nltk/tag/brill_trainer.py
@@ -91,7 +91,7 @@ def __init__(
     # Training
 
     def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
-        """
+        r"""
         Trains the Brill tagger on the corpus *train_sents*,
         producing at most *max_rules* transformations, each of which
         reduces the net number of errors in the corpus by at least
@@ -111,7 +111,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
         >>> testing_data = [untag(s) for s in gold_data]
 
         >>> backoff = RegexpTagger([
-        ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
+        ... (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
         ... (r'(The|the|A|a|An|an)$', 'AT'),   # articles
         ... (r'.*able$', 'JJ'),                # adjectives
         ... (r'.*ness$', 'NN'),                # nouns formed from adjectives
@@ -125,7 +125,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
         >>> baseline = backoff #see NOTE1
 
         >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS
-        0.2450142...
+        0.2433862...
 
         >>> # Set up templates
         >>> Template._cleartemplates() #clear any templates created in earlier tests
@@ -137,7 +137,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
         >>> tagger1 = tt.train(training_data, max_rules=10)
         TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None)
         Finding initial useful rules...
-            Found 845 useful rules.
+            Found 847 useful rules.
         <BLANKLINE>
                    B      |
            S   F   r   O  |        Score = Fixed - Broken
@@ -150,7 +150,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
           85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
           69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
           51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
-          47  63  16 161  | NN->IN if Pos:NNS@[-1]
+          47  63  16 162  | NN->IN if Pos:NNS@[-1]
           33  33   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
           26  26   0   0  | IN->. if Pos:NNS@[-1] & Word:.@[0]
           24  24   0   0  | IN->, if Pos:NNS@[-1] & Word:,@[0]
@@ -162,11 +162,11 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
 
         >>> train_stats = tagger1.train_stats()
         >>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
-        [1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]
+        [1776, 1270, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]
 
         >>> tagger1.print_template_statistics(printunused=False)
         TEMPLATE STATISTICS (TRAIN)  2 templates, 10 rules)
-        TRAIN (   2417 tokens) initial  1775 0.2656 final:  1269 0.4750
+        TRAIN (   2417 tokens) initial  1776 0.2652 final:  1270 0.4746
         #ID | Score (train) |  #Rules     | Template
         --------------------------------------------
         001 |   305   0.603 |   7   0.700 | Template(Pos([-1]),Word([0]))
@@ -175,7 +175,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
         <BLANKLINE>
 
         >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS
-        0.43996...
+        0.43833...
 
         >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)
 
@@ -185,13 +185,13 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
         True
 
         >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
-        [1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]
+        [1859, 1380, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]
 
         >>> # A high-accuracy tagger
         >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)
         TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99)
         Finding initial useful rules...
-            Found 845 useful rules.
+            Found 847 useful rules.
         <BLANKLINE>
                    B      |
            S   F   r   O  |        Score = Fixed - Broken
@@ -212,7 +212,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
           18  18   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]
 
         >>> tagger2.evaluate(gold_data)  # doctest: +ELLIPSIS
-        0.44159544...
+        0.43996743...
         >>> tagger2.rules()[2:4]
         (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')]))
 

diff --git a/nltk/tag/sequential.py b/nltk/tag/sequential.py
@@ -337,7 +337,7 @@ class UnigramTagger(NgramTagger):
         >>> test_sent = brown.sents(categories='news')[0]
         >>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
         >>> for tok, tag in unigram_tagger.tag(test_sent):
-        ...     print("({}, {}), ".format(tok, tag))
+        ...     print("({}, {}), ".format(tok, tag)) # doctest: +NORMALIZE_WHITESPACE
         (The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL),
         (Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT),
         (investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ),
@@ -491,7 +491,7 @@ def context(self, tokens, index, history):
 
 @jsontags.register_tag
 class RegexpTagger(SequentialBackoffTagger):
-    """
+    r"""
     Regular Expression Tagger
 
     The RegexpTagger assigns tags to tokens by comparing their
@@ -503,7 +503,7 @@ class RegexpTagger(SequentialBackoffTagger):
         >>> from nltk.tag import RegexpTagger
         >>> test_sent = brown.sents(categories='news')[0]
         >>> regexp_tagger = RegexpTagger(
-        ...     [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
+        ...     [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
         ...      (r'(The|the|A|a|An|an)$', 'AT'),   # articles
         ...      (r'.*able$', 'JJ'),                # adjectives
         ...      (r'.*ness$', 'NN'),                # nouns formed from adjectives
@@ -515,7 +515,7 @@ class RegexpTagger(SequentialBackoffTagger):
         ... ])
         >>> regexp_tagger
         <Regexp Tagger: size=9>
-        >>> regexp_tagger.tag(test_sent)
+        >>> regexp_tagger.tag(test_sent) # doctest: +NORMALIZE_WHITESPACE
         [('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'),
         ('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'NN'),
         ("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'),

diff --git a/nltk/tbl/demo.py b/nltk/tbl/demo.py
@@ -393,11 +393,11 @@ def _demo_plot(learning_curve_output, teststats, trainstats=None, take=None):
     plt.savefig(learning_curve_output)
 
 
-NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(.[0-9]+)?$", "CD"), (r".*", "NN")])
+NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), (r".*", "NN")])
 
 REGEXP_TAGGER = RegexpTagger(
     [
-        (r"^-?[0-9]+(.[0-9]+)?$", "CD"),  # cardinal numbers
+        (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),  # cardinal numbers
         (r"(The|the|A|a|An|an)$", "AT"),  # articles
         (r".*able$", "JJ"),  # adjectives
         (r".*ness$", "NN"),  # nouns formed from adjectives