From 277711ab1dec729e626b27aab6fa35ea5efbd7e6 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Sat, 25 Sep 2021 01:14:32 +0200 Subject: [PATCH] Resolved ReDoS vulnerability in Corpus Reader (#2816) * Resolved ReDoS vulnerability in the Corpus Reader for the Comparative Sentence Dataset * Solidified performance tests --- nltk/corpus/reader/comparative_sents.py | 2 +- nltk/test/corpus.doctest | 32 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/nltk/corpus/reader/comparative_sents.py b/nltk/corpus/reader/comparative_sents.py index cbfac4c13c..ed295e4e02 100644 --- a/nltk/corpus/reader/comparative_sents.py +++ b/nltk/corpus/reader/comparative_sents.py @@ -45,7 +45,7 @@ GRAD_COMPARISON = re.compile(r"") NON_GRAD_COMPARISON = re.compile(r"") ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)") -KEYWORD = re.compile(r"\((?!.*\()(.*)\)$") +KEYWORD = re.compile(r"\(([^\(]*)\)$") class Comparison: diff --git a/nltk/test/corpus.doctest b/nltk/test/corpus.doctest index 82b17f8a5a..560e641a69 100644 --- a/nltk/test/corpus.doctest +++ b/nltk/test/corpus.doctest @@ -2162,3 +2162,35 @@ access to its tuples() method >>> from nltk.corpus import qc >>> qc.tuples('test.txt') [('NUM:dist', 'How far is it from Denver to Aspen ?'), ('LOC:city', 'What county is Modesto , California in ?'), ...] + +Ensure that KEYWORD from `comparative_sents.py` no longer contains a ReDoS vulnerability. + + >>> import re + >>> import time + >>> from nltk.corpus.reader.comparative_sents import KEYWORD + >>> sizes = { + ... "short": 4000, + ... "long": 40000 + ... } + >>> exec_times = { + ... "short": [], + ... "long": [], + ... } + >>> for size_name, size in sizes.items(): + ... for j in range(9): + ... start_t = time.perf_counter() + ... payload = "( " + "(" * size + ... output = KEYWORD.findall(payload) + ... exec_times[size_name].append(time.perf_counter() - start_t) + ... exec_times[size_name] = sorted(exec_times[size_name])[4] # Get the mean + +Ideally, the execution time of such a regular expression is linear +in the length of the input. As such, we would expect exec_times["long"] +to be roughly 10 times as big as exec_times["short"]. +With the ReDoS in place, it took roughly 80 times as long. +For now, we accept values below 30 (times as long), due to the potential +for variance. This ensures that the ReDoS has certainly been reduced, +if not removed. + + >>> exec_times["long"] / exec_times["short"] < 30 + True