From 277711ab1dec729e626b27aab6fa35ea5efbd7e6 Mon Sep 17 00:00:00 2001
From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
Date: Sat, 25 Sep 2021 01:14:32 +0200
Subject: [PATCH] Resolved ReDoS vulnerability in Corpus Reader (#2816)

* Resolved ReDoS vulnerability in the Corpus Reader for the Comparative Sentence Dataset

* Solidified performance tests
---
 nltk/corpus/reader/comparative_sents.py |  2 +-
 nltk/test/corpus.doctest                | 32 +++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)
diff --git a/nltk/corpus/reader/comparative_sents.py b/nltk/corpus/reader/comparative_sents.py
index cbfac4c13c..ed295e4e02 100644
--- a/nltk/corpus/reader/comparative_sents.py
+++ b/nltk/corpus/reader/comparative_sents.py
@@ -45,7 +45,7 @@
 GRAD_COMPARISON = re.compile(r"<cs-[123]>")
 NON_GRAD_COMPARISON = re.compile(r"<cs-4>")
 ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")
-KEYWORD = re.compile(r"\((?!.*\()(.*)\)$")
+KEYWORD = re.compile(r"\(([^\(]*)\)$")
 
 
 class Comparison:
diff --git a/nltk/test/corpus.doctest b/nltk/test/corpus.doctest
index 82b17f8a5a..560e641a69 100644
--- a/nltk/test/corpus.doctest
+++ b/nltk/test/corpus.doctest
@@ -2162,3 +2162,35 @@ access to its tuples() method
     >>> from nltk.corpus import qc
     >>> qc.tuples('test.txt')
     [('NUM:dist', 'How far is it from Denver to Aspen ?'), ('LOC:city', 'What county is Modesto , California in ?'), ...]
+
+Ensure that KEYWORD from `comparative_sents.py` no longer contains a ReDoS vulnerability.
+
+    >>> import re
+    >>> import time
+    >>> from nltk.corpus.reader.comparative_sents import KEYWORD
+    >>> sizes = {
+    ...     "short": 4000,
+    ...     "long": 40000
+    ... }
+    >>> exec_times = {
+    ...     "short": [],
+    ...     "long": [],
+    ... }
+    >>> for size_name, size in sizes.items():
+    ...     for j in range(9):
+    ...         start_t = time.perf_counter()
+    ...         payload = "( " + "(" * size
+    ...         output = KEYWORD.findall(payload)
+    ...         exec_times[size_name].append(time.perf_counter() - start_t)
+    ...     exec_times[size_name] = sorted(exec_times[size_name])[4] # Get the mean
+
+Ideally, the execution time of such a regular expression is linear
+in the length of the input. As such, we would expect exec_times["long"]
+to be roughly 10 times as big as exec_times["short"].
+With the ReDoS in place, it took roughly 80 times as long.
+For now, we accept values below 30 (times as long), due to the potential
+for variance. This ensures that the ReDoS has certainly been reduced,
+if not removed.
+
+    >>> exec_times["long"] / exec_times["short"] < 30
+    True