Fix escape sequences #2493

Closed · wants to merge 2 commits
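The warnings this PR addresses come from the interpreter itself: since Python 3.6, an unrecognized escape such as `\d` or `\w` in a non-raw string literal emits a `DeprecationWarning` at compile time (a `SyntaxError` under `-W error`, and a `SyntaxWarning` on newer interpreters). Prefixing the literal with `r` keeps the backslash verbatim and silences the warning without changing the regex. A minimal sketch of the behavior — the `compile()` inputs below are illustrative, not code from this PR:

```python
import warnings

plain = 'pattern = "\\d"'   # source text containing the non-raw literal "\d"
raw = 'pattern = r"\\d"'    # the raw-string form this PR switches to

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    compile(plain, "<demo>", "exec")  # warns: invalid escape sequence \d
    compile(raw, "<demo>", "exec")    # raw literal compiles silently

for w in caught:
    print(w.category.__name__, w.message)
# DeprecationWarning invalid escape sequence \d
```

Because `\d` is not a recognized escape, the old and new literals currently denote the same string; the raw prefix only future-proofs the code for when invalid escapes become hard errors.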
14 changes: 14 additions & 0 deletions converter.py
@@ -0,0 +1,14 @@
+import sys
+import re
+
+lines = []
+
+with open(sys.argv[1]) as log:
+    for line in log.readlines():
+        path = re.search(r"^.*:\d", line)
+        if path:
+            lines.append([path.group(0)])
+        else:
+            lines[-1].append(line[:-1])
+
+print(lines)
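Nothing in the PR documents converter.py, but from the code it reads the log file named by `sys.argv[1]` and groups each `file:line` header with the lines that follow it. A hedged sketch of a run, assuming CPython-style warning output (the sample log is invented for illustration):

```
$ cat escapes.log
nltk/chunk/util.py:371: DeprecationWarning: invalid escape sequence \S
  _LINE_RE = re.compile("(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")
$ python converter.py escapes.log
[['nltk/chunk/util.py:3', '  _LINE_RE = re.compile("(\\S+)\\s+(\\S+)\\s+([IOB])-?(\\S+)?")']]
```

Note that `re.search(r"^.*:\d", line)` stops after the first digit of the line number (`:371` is captured as `:3`), and that `lines[-1].append(...)` assumes the very first log line is a `file:line` header; neither detail matters for a quick one-off script.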
8 changes: 4 additions & 4 deletions nltk/app/chunkparser_app.py
@@ -202,7 +202,7 @@ class RegexpChunkApp(object):
             "\t<regexp><RB>?<VBD></regexp>\n"
             '\t\tMatches <match>"ran/VBD"</match>\n'
             '\t\tMatches <match>"slowly/RB ate/VBD"</match>\n'
-            "\t<regexp><\#><CD> # This is a comment...</regexp>\n"
+            r"\t<regexp><\#><CD> # This is a comment...</regexp>\n"
             '\t\tMatches <match>"#/# 100/CD"</match>\n'
             "</hangindent>",
         ),
@@ -311,7 +311,7 @@ def normalize_grammar(self, grammar):
         grammar = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", grammar)
         # Normalize whitespace
         grammar = re.sub(" +", " ", grammar)
-        grammar = re.sub("\n\s+", "\n", grammar)
+        grammar = re.sub(r"\n\s+", "\n", grammar)
         grammar = grammar.strip()
         # [xx] Hack: automatically backslash $!
         grammar = re.sub(r"([^\\])\$", r"\1\\$", grammar)
@@ -1057,7 +1057,7 @@ def show_help(self, tab):
                 "\t%s\t%s" % item
                 for item in sorted(
                     list(self.tagset.items()),
-                    key=lambda t_w: re.match("\w+", t_w[0])
+                    key=lambda t_w: re.match(r"\w+", t_w[0])
                     and (0, t_w[0])
                     or (1, t_w[0]),
                 )
@@ -1418,7 +1418,7 @@ def load_grammar(self, filename=None):
         with open(filename, "r") as infile:
             grammar = infile.read()
         grammar = re.sub(
-            "^\# Regexp Chunk Parsing Grammar[\s\S]*" "F-score:.*\n", "", grammar
+            r"^\# Regexp Chunk Parsing Grammar[\s\S]*" "F-score:.*\n", "", grammar
         ).lstrip()
         self.grammarbox.insert("1.0", grammar)
         self.update()
2 changes: 1 addition & 1 deletion nltk/ccg/combinator.py
@@ -214,7 +214,7 @@ def backwardBxConstraint(left, right):


 class UndirectedSubstitution(UndirectedBinaryCombinator):
-    """
+    r"""
     Substitution (permutation) combinator.
     Implements rules of the form
     Y/Z (X\Y)/Z -> X/Z (<Sx)
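Docstrings need the same treatment as the patterns: a docstring is an ordinary string literal, so the `X\Y` in the combinator rules above would trigger the invalid-escape warning without the `r` prefix. A small illustration (not code from NLTK):

```python
def rule():
    r"""Implements rules of the form Y/Z (X\Y)/Z -> X/Z (<Sx)."""

print(rule.__doc__)  # the backslash in "X\Y" survives verbatim
```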
2 changes: 1 addition & 1 deletion nltk/chat/iesha.py
@@ -49,7 +49,7 @@
     (
         r"(.*) don\'t you (.*)",
         (
-            "u think I can%2??! really?? kekeke \<_\<",
+            r"u think I can%2??! really?? kekeke \<_\<",
            "what do u mean%2??!",
            "i could if i wanted, don't you think!! kekeke",
        ),
10 changes: 5 additions & 5 deletions nltk/chunk/named_entity.py
@@ -171,11 +171,11 @@ def _parse_to_tagged(sent):


 def shape(word):
-    if re.match("[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word, re.UNICODE):
+    if re.match(r"[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word, re.UNICODE):
         return "number"
-    elif re.match("\W+$", word, re.UNICODE):
+    elif re.match(r"\W+$", word, re.UNICODE):
         return "punct"
-    elif re.match("\w+$", word, re.UNICODE):
+    elif re.match(r"\w+$", word, re.UNICODE):
         if word.istitle():
             return "upcase"
         elif word.islower():
@@ -247,8 +247,8 @@ def load_ace_file(textfile, fmt):
     def subfunc(m):
         return " " * (m.end() - m.start() - 6)

-    text = re.sub("[\s\S]*<TEXT>", subfunc, text)
-    text = re.sub("</TEXT>[\s\S]*", "", text)
+    text = re.sub(r"[\s\S]*<TEXT>", subfunc, text)
+    text = re.sub(r"</TEXT>[\s\S]*", "", text)

     # Simplify quotes
     text = re.sub("``", ' "', text)
26 changes: 13 additions & 13 deletions nltk/chunk/regexp.py
@@ -64,7 +64,7 @@ class ChunkString(object):
     _CHUNK = r"(\{%s+?\})+?" % CHUNK_TAG
     _CHINK = r"(%s+?)+?" % CHUNK_TAG
     _VALID = re.compile(r"^(\{?%s\}?)*?$" % CHUNK_TAG)
-    _BRACKETS = re.compile("[^\{\}]+")
+    _BRACKETS = re.compile(r"[^\{\}]+")
     _BALANCED_BRACKETS = re.compile(r"(\{\})*$")

     def __init__(self, chunk_struct, debug_level=1):
@@ -209,7 +209,7 @@ def xform(self, regexp, repl):
         # The substitution might have generated "empty chunks"
         # (substrings of the form "{}"). Remove them, so they don't
         # interfere with other transformations.
-        s = re.sub("\{\}", "", s)
+        s = re.sub(r"\{\}", "", s)

         # Make sure that the transformation was legal.
         if self._debug > 1:
@@ -420,7 +420,7 @@ def __init__(self, tag_pattern, descr):
             "(?P<chunk>%s)%s"
             % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHINK_PATTERN)
         )
-        RegexpChunkRule.__init__(self, regexp, "{\g<chunk>}", descr)
+        RegexpChunkRule.__init__(self, regexp, r"{\g<chunk>}", descr)

     def __repr__(self):
         """
@@ -465,7 +465,7 @@ def __init__(self, tag_pattern, descr):
             "(?P<chink>%s)%s"
             % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHUNK_PATTERN)
         )
-        RegexpChunkRule.__init__(self, regexp, "}\g<chink>{", descr)
+        RegexpChunkRule.__init__(self, regexp, r"}\g<chink>{", descr)

     def __repr__(self):
         """
@@ -504,8 +504,8 @@ def __init__(self, tag_pattern, descr):
         of this rule.
         """
         self._pattern = tag_pattern
-        regexp = re.compile("\{(?P<chunk>%s)\}" % tag_pattern2re_pattern(tag_pattern))
-        RegexpChunkRule.__init__(self, regexp, "\g<chunk>", descr)
+        regexp = re.compile(r"\{(?P<chunk>%s)\}" % tag_pattern2re_pattern(tag_pattern))
+        RegexpChunkRule.__init__(self, regexp, r"\g<chunk>", descr)

     def __repr__(self):
         """
@@ -569,7 +569,7 @@ def __init__(self, left_tag_pattern, right_tag_pattern, descr):
                 tag_pattern2re_pattern(right_tag_pattern),
             )
         )
-        RegexpChunkRule.__init__(self, regexp, "\g<left>", descr)
+        RegexpChunkRule.__init__(self, regexp, r"\g<left>", descr)

     def __repr__(self):
         """
@@ -702,13 +702,13 @@ def __init__(self, left_tag_pattern, right_tag_pattern, descr):
         self._left_tag_pattern = left_tag_pattern
         self._right_tag_pattern = right_tag_pattern
         regexp = re.compile(
-            "(?P<left>%s)\{(?P<right>%s)"
+            r"(?P<left>%s)\{(?P<right>%s)"
             % (
                 tag_pattern2re_pattern(left_tag_pattern),
                 tag_pattern2re_pattern(right_tag_pattern),
             )
         )
-        RegexpChunkRule.__init__(self, regexp, "{\g<left>\g<right>", descr)
+        RegexpChunkRule.__init__(self, regexp, r"{\g<left>\g<right>", descr)

     def __repr__(self):
         """
@@ -772,13 +772,13 @@ def __init__(self, left_tag_pattern, right_tag_pattern, descr):
         self._left_tag_pattern = left_tag_pattern
         self._right_tag_pattern = right_tag_pattern
         regexp = re.compile(
-            "(?P<left>%s)\}(?P<right>%s)"
+            r"(?P<left>%s)\}(?P<right>%s)"
             % (
                 tag_pattern2re_pattern(left_tag_pattern),
                 tag_pattern2re_pattern(right_tag_pattern),
             )
         )
-        RegexpChunkRule.__init__(self, regexp, "\g<left>\g<right>}", descr)
+        RegexpChunkRule.__init__(self, regexp, r"\g<left>\g<right>}", descr)

     def __repr__(self):
         """
@@ -890,7 +890,7 @@ def __repr__(self):
 # this should probably be made more strict than it is -- e.g., it
 # currently accepts 'foo'.
 CHUNK_TAG_PATTERN = re.compile(
-    r"^((%s|<%s>)*)$" % ("([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+", "[^\{\}<>]+")
+    r"^((%s|<%s>)*)$" % (r"([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+", r"[^\{\}<>]+")
 )


@@ -1130,7 +1130,7 @@ def __str__(self):


 class RegexpParser(ChunkParserI):
-    """
+    r"""
     A grammar based chunk parser. ``chunk.RegexpParser`` uses a set of
     regular expression patterns to specify the behavior of the parser.
     The chunking of the text is encoded using a ``ChunkString``, and
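The replacement-template changes in this file are the same issue in a different spot: `re.sub` group references such as `\g<chunk>` live in the *replacement* string, and `\g` is an invalid escape in a non-raw literal. A minimal illustration of the template mechanism (this is plain `re`, not NLTK's ChunkString machinery):

```python
import re

tags = "<DT><NN><VBD>"
# \g<chunk> in the replacement refers to the named group; the r-prefix
# keeps "\g" from being parsed as a (bogus) string escape.
print(re.sub(r"(?P<chunk><DT><NN>)", r"{\g<chunk>}", tags))
# {<DT><NN>}<VBD>
```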
6 changes: 3 additions & 3 deletions nltk/chunk/util.py
@@ -368,7 +368,7 @@ def tagstr2tree(

 ### CONLL

-_LINE_RE = re.compile("(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")
+_LINE_RE = re.compile(r"(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")


 def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
@@ -514,7 +514,7 @@ def tree2conllstr(t):
     re.DOTALL,
 )

-_IEER_TYPE_RE = re.compile('<b_\w+\s+[^>]*?type="(?P<type>\w+)"')
+_IEER_TYPE_RE = re.compile(r'<b_\w+\s+[^>]*?type="(?P<type>\w+)"')


 def _ieer_read_text(s, root_label):
@@ -523,7 +523,7 @@ def _ieer_read_text(s, root_label):
     # return the empty list in place of a Tree
     if s is None:
         return []
-    for piece_m in re.finditer("<[^>]+>|[^\s<]+", s):
+    for piece_m in re.finditer(r"<[^>]+>|[^\s<]+", s):
         piece = piece_m.group()
         try:
             if piece.startswith("<b_"):
2 changes: 1 addition & 1 deletion nltk/classify/maxent.py
@@ -1287,7 +1287,7 @@ def calculate_deltas(
     nftranspose,
     encoding,
 ):
-    """
+    r"""
     Calculate the update values for the classifier weights for
     this iteration of IIS. These update weights are the value of
     ``delta`` that solves the equation::
2 changes: 1 addition & 1 deletion nltk/classify/rte_classify.py
@@ -58,7 +58,7 @@ def __init__(self, rtepair, stop=True, use_lemmatize=False):
         self.negwords = set(["no", "not", "never", "failed", "rejected", "denied"])
         # Try to tokenize so that abbreviations, monetary amounts, email
         # addresses, URLs are single tokens.
-        tokenizer = RegexpTokenizer("[\w.@:/]+|\w+|\$[\d.]+")
+        tokenizer = RegexpTokenizer(r"[\w.@:/]+|\w+|\$[\d.]+")

         # Get the set of word types for text and hypothesis
         self.text_tokens = tokenizer.tokenize(rtepair.text)
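For the tokenizer change just above, only the literal's prefix changes; the pattern is identical. A quick sketch of what that pattern keeps together (assumes an installed nltk; the sample sentence is invented):

```python
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r"[\w.@:/]+|\w+|\$[\d.]+")
print(tokenizer.tokenize("Mail bob@example.org about the $2.50 fee"))
# ['Mail', 'bob@example.org', 'about', 'the', '$2.50', 'fee']
```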
22 changes: 11 additions & 11 deletions nltk/corpus/__init__.py
@@ -113,19 +113,19 @@
 conll2002 = LazyCorpusLoader(
     "conll2002",
     ConllChunkCorpusReader,
-    ".*\.(test|train).*",
+    r".*\.(test|train).*",
     ("LOC", "PER", "ORG", "MISC"),
     encoding="utf-8",
 )
 conll2007 = LazyCorpusLoader(
     "conll2007",
     DependencyCorpusReader,
-    ".*\.(test|train).*",
+    r".*\.(test|train).*",
     encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")],
 )
-crubadan = LazyCorpusLoader("crubadan", CrubadanCorpusReader, ".*\.txt")
+crubadan = LazyCorpusLoader("crubadan", CrubadanCorpusReader, r".*\.txt")
 dependency_treebank = LazyCorpusLoader(
-    "dependency_treebank", DependencyCorpusReader, ".*\.dp", encoding="ascii"
+    "dependency_treebank", DependencyCorpusReader, r".*\.dp", encoding="ascii"
 )
 floresta = LazyCorpusLoader(
     "floresta",
@@ -308,7 +308,7 @@
 switchboard = LazyCorpusLoader("switchboard", SwitchboardCorpusReader, tagset="wsj")
 timit = LazyCorpusLoader("timit", TimitCorpusReader)
 timit_tagged = LazyCorpusLoader(
-    "timit", TimitTaggedCorpusReader, ".+\.tags", tagset="wsj", encoding="ascii"
+    "timit", TimitTaggedCorpusReader, r".+\.tags", tagset="wsj", encoding="ascii"
 )
 toolbox = LazyCorpusLoader(
     "toolbox", ToolboxCorpusReader, r"(?!.*(README|\.)).*\.(dic|txt)"
@@ -332,7 +332,7 @@
 treebank_raw = LazyCorpusLoader(
     "treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2"
 )
-twitter_samples = LazyCorpusLoader("twitter_samples", TwitterCorpusReader, ".*\.json")
+twitter_samples = LazyCorpusLoader("twitter_samples", TwitterCorpusReader, r".*\.json")
 udhr = LazyCorpusLoader("udhr", UdhrCorpusReader)
 udhr2 = LazyCorpusLoader("udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8")
 universal_treebanks = LazyCorpusLoader(
@@ -361,7 +361,7 @@
     WordNetCorpusReader,
     LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
 )
-wordnet_ic = LazyCorpusLoader("wordnet_ic", WordNetICCorpusReader, ".*\.dat")
+wordnet_ic = LazyCorpusLoader("wordnet_ic", WordNetICCorpusReader, r".*\.dat")
 words = LazyCorpusLoader(
     "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
 )
@@ -371,7 +371,7 @@
     "propbank",
     PropbankCorpusReader,
     "prop.txt",
-    "frames/.*\.xml",
+    r"frames/.*\.xml",
     "verbs.txt",
     lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
     treebank,
@@ -380,7 +380,7 @@
     "nombank.1.0",
     NombankCorpusReader,
     "nombank.1.0",
-    "frames/.*\.xml",
+    r"frames/.*\.xml",
     "nombank.1.0.words",
     lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
     treebank,
@@ -389,7 +389,7 @@
     "propbank",
     PropbankCorpusReader,
     "prop.txt",
-    "frames/.*\.xml",
+    r"frames/.*\.xml",
     "verbs.txt",
     lambda filename: filename.upper(),
     ptb,
@@ -398,7 +398,7 @@
     "nombank.1.0",
     NombankCorpusReader,
     "nombank.1.0",
-    "frames/.*\.xml",
+    r"frames/.*\.xml",
     "nombank.1.0.words",
     lambda filename: filename.upper(),
     ptb,
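All of the corpus-loader arguments touched in this file are fileid regexes, and since `\.` is not a recognized string escape, the raw and non-raw literals denote the same string. The prefix only silences the compile-time warning; matching is unaffected, as a two-line check shows:

```python
import re

assert ".*\.json" == r".*\.json"  # same string (the non-raw form warns at compile time)
assert re.fullmatch(r".*\.json", "tweets.json")  # fileid pattern matches as before
```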
3 changes: 2 additions & 1 deletion nltk/corpus/reader/api.py
@@ -72,7 +72,8 @@ def __init__(self, root, fileids, encoding="utf8", tagset=None):
         """
         # Convert the root to a path pointer, if necessary.
         if isinstance(root, str) and not isinstance(root, PathPointer):
-            m = re.match("(.*\.zip)/?(.*)$|", root)
+            m = re.match(r"(.*\.zip)/?(.*)$|", root)
+
             zipfile, zipentry = m.groups()
             if zipfile:
                 root = ZipFilePathPointer(zipfile, zipentry)
2 changes: 1 addition & 1 deletion nltk/corpus/reader/bnc.py
@@ -12,7 +12,7 @@


 class BNCCorpusReader(XMLCorpusReader):
-    """Corpus reader for the XML version of the British National Corpus.
+    r"""Corpus reader for the XML version of the British National Corpus.

     For access to the complete XML data structure, use the ``xml()``
     method. For access to simple word lists and tagged word lists, use
2 changes: 1 addition & 1 deletion nltk/corpus/reader/bracket_parse.py
@@ -212,7 +212,7 @@ def __init__(self, root, encoding="ISO-8859-1", tagset=None):
         BracketParseCorpusReader.__init__(
             self,
             root,
-            "alpino\.xml",
+            r"alpino\.xml",
             detect_blocks="blankline",
             encoding=encoding,
             tagset=tagset,
2 changes: 1 addition & 1 deletion nltk/corpus/reader/childes.py
@@ -277,7 +277,7 @@ def _get_age(self, fileid, speaker, month):

     def convert_age(self, age_year):
         "Caclculate age in months from a string in CHILDES format"
-        m = re.match("P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
+        m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
         age_month = int(m.group(1)) * 12 + int(m.group(2))
         try:
             if int(m.group(3)) > 15:
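The CHILDES pattern above parses ISO-8601-style age strings. A hedged sketch of the arithmetic in `convert_age`, using an invented age string:

```python
import re

m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", "P2Y6M14D")  # 2 years, 6 months, 14 days
print(int(m.group(1)) * 12 + int(m.group(2)))  # 30 months; group(3) == "14" feeds the > 15 check above
```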
2 changes: 1 addition & 1 deletion nltk/corpus/reader/framenet.py
@@ -2744,7 +2744,7 @@ def _strip_tags(self, data):
         """

         try:
-            """
+            r"""
             # Look for boundary issues in markup. (Sometimes FEs are pluralized in definitions.)
             m = re.search(r'\w[<][^/]|[<][/][^>]+[>](s\w|[a-rt-z0-9])', data)
             if m:
2 changes: 1 addition & 1 deletion nltk/corpus/reader/plaintext.py
@@ -44,7 +44,7 @@ def __init__(
         para_block_reader=read_blankline_block,
         encoding="utf8",
     ):
-        """
+        r"""
         Construct a new plaintext corpus reader for a set of documents
         located at the given root directory. Example usage:

2 changes: 1 addition & 1 deletion nltk/corpus/reader/switchboard.py
@@ -110,7 +110,7 @@ def _words_block_reader(self, stream):
     def _tagged_words_block_reader(self, stream, tagset=None):
         return sum(self._tagged_discourses_block_reader(stream, tagset)[0], [])

-    _UTTERANCE_RE = re.compile("(\w+)\.(\d+)\:\s*(.*)")
+    _UTTERANCE_RE = re.compile(r"(\w+)\.(\d+)\:\s*(.*)")
     _SEP = "/"

     def _parse_utterance(self, utterance, include_tag, tagset=None):