Merge pull request #3203 from alexrudnick/fix-toktok-tokenizer
Make sure that we invoke all the intended regex patterns in ToktokTokenizer...
alexrudnick committed Nov 9, 2023
2 parents 5424961 + 6394166 commit d7b428d
Showing 2 changed files with 52 additions and 2 deletions.
49 changes: 49 additions & 0 deletions nltk/test/tokenize.doctest
@@ -395,3 +395,52 @@ Test that `ValueError` exceptions are raised when illegal arguments are used.
 Traceback (most recent call last):
 ...
 ValueError: Smoothing method bar not recognized
+
+
+Regression Tests: ToktokTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+>>> toktok = ToktokTokenizer()
+>>> text = u'Is 9.5 or 525,600 my favorite number?'
+>>> print(toktok.tokenize(text, return_str=True))
+Is 9.5 or 525,600 my favorite number ?
+>>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
+>>> print(toktok.tokenize(text, return_str=True))
+The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
+>>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
+>>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
+>>> assert toktok.tokenize(text, return_str=True) == expected
+>>> toktok.tokenize(text) == [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf']
+True
+
+Taking comments from the code and turning them into actual tests...
+
+# Don't tokenize a period unless it ends the line and it isn't
+# preceded by another period, e.g.
+# "something ..." -> "something ..."
+>>> text = "something ..."
+>>> print(toktok.tokenize(text, return_str=True))
+something ...
+
# "something." -> "something ."
>>> text = "something."
>>> print(toktok.tokenize(text, return_str=True))
something .

+# Don't tokenize a period unless it ends the line, e.g.
+# " ... stuff." -> "... stuff ."
+>>> text = "also more ... stuff."
+>>> print(toktok.tokenize(text, return_str=True))
+also more ... stuff .
+
+Demonstrate that the "FUNKY_PUNCT_1" and "FUNKY_PUNCT_2" patterns do what
+they're supposed to do. For example, FUNKY_PUNCT_1 splits out inverted question
+marks.
+>>> text = "¿Quieres una taza de café?"
+>>> print(toktok.tokenize(text, return_str=True))
+¿ Quieres una taza de café ?
+
+This one would have failed without the FUNKY_PUNCT_2 pattern included.
+>>> text = "«Sí, por favor.»"
+>>> print(toktok.tokenize(text, return_str=True))
+« Sí , por favor . »
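
To try the new regression cases outside the doctest runner, here is a minimal
interactive sketch (assuming an NLTK install that includes this fix; the
example strings are taken from the doctests above):

    # Minimal reproduction of two of the new regression cases.
    from nltk.tokenize.toktok import ToktokTokenizer

    toktok = ToktokTokenizer()

    # FUNKY_PUNCT_1 splits out the inverted question mark.
    print(toktok.tokenize("¿Quieres una taza de café?", return_str=True))
    # ¿ Quieres una taza de café ?

    # Per the doctest above, this case would have failed before
    # FUNKY_PUNCT_2 was added to TOKTOK_REGEXES.
    print(toktok.tokenize("«Sí, por favor.»", return_str=True))
    # « Sí , por favor . »
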
5 changes: 3 additions & 2 deletions nltk/tokenize/toktok.py
@@ -2,7 +2,8 @@
 #
 # Copyright (C) 2001-2015 NLTK Project
 # Author: Jon Dehdari
-# Contributors: Liling Tan, Selcuk Ayguney, ikegami, Martijn Pieters
+# Contributors: Liling Tan, Selcuk Ayguney, ikegami, Martijn Pieters,
+#               Alex Rudnick
 #
 # URL: <https://www.nltk.org>
 # For license information, see LICENSE.TXT
@@ -145,6 +146,7 @@ class ToktokTokenizer(TokenizerI):
     TOKTOK_REGEXES = [
         NON_BREAKING,
         FUNKY_PUNCT_1,
+        FUNKY_PUNCT_2,
         URL_FOE_1,
         URL_FOE_2,
         URL_FOE_3,
@@ -156,7 +158,6 @@ class ToktokTokenizer(TokenizerI):
         CLOSE_PUNCT_RE,
         MULTI_COMMAS,
         COMMA_IN_NUM,
-        FINAL_PERIOD_2,
         PROB_SINGLE_QUOTES,
         STUPID_QUOTES_1,
         STUPID_QUOTES_2,
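
Why the fix is just a list edit: ToktokTokenizer.tokenize applies every
(regex, substitution) pair in TOKTOK_REGEXES to the text in order, so a
pattern missing from the list simply never fires. Below is a simplified
sketch of that loop with toy stand-in patterns (not the actual NLTK source):

    import re

    # Toy stand-ins for the tokenizer's (compiled regex, replacement) pairs;
    # the real ToktokTokenizer defines a long list of these.
    FUNKY_PUNCT_DEMO = (re.compile(r"([¿¡«»])"), r" \1 ")
    ONE_SPACE = (re.compile(r"\s+"), " ")

    DEMO_REGEXES = [FUNKY_PUNCT_DEMO, ONE_SPACE]

    def demo_tokenize(text, return_str=False):
        # Same overall shape as ToktokTokenizer.tokenize: run each
        # substitution in order, then strip and split on whitespace.
        for regexp, substitution in DEMO_REGEXES:
            text = regexp.sub(substitution, text)
        text = text.strip()
        return text if return_str else text.split()

    print(demo_tokenize("«Sí, por favor.»", return_str=True))
    # « Sí, por favor. »  (only the demo patterns ran, so the comma and
    # period stay attached; the real pattern list splits those too)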
