Fix escape sequences #2493

Closed · wants to merge 2 commits
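The warnings this PR addresses come from the interpreter itself: since Python 3.6, an unrecognized escape such as `\d` or `\w` in a non-raw string literal emits a `DeprecationWarning` at compile time (a `SyntaxError` under `-W error`, and a `SyntaxWarning` on newer interpreters). Prefixing the literal with `r` keeps the backslash verbatim and silences the warning without changing the regex. A minimal sketch of the behavior — the `compile()` inputs below are illustrative, not code from this PR:

```python
import warnings

plain = 'pattern = "\\d"'   # source text containing the non-raw literal "\d"
raw = 'pattern = r"\\d"'    # the raw-string form this PR switches to

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    compile(plain, "<demo>", "exec")  # warns: invalid escape sequence \d
    compile(raw, "<demo>", "exec")    # raw literal compiles silently

for w in caught:
    print(w.category.__name__, w.message)
# DeprecationWarning invalid escape sequence \d
```

Because `\d` is not a recognized escape, the old and new literals currently denote the same string; the raw prefix only future-proofs the code for when invalid escapes become hard errors.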
14 changes: 14 additions & 0 deletions converter.py
@@ -0,0 +1,14 @@
+import sys
+import re
+
+lines = []
+
+with open(sys.argv[1]) as log:
+    for line in log.readlines():
+        path = re.search(r"^.*:\d", line)
+        if path:
+            lines.append([path.group(0)])
+        else:
+            lines[-1].append(line[:-1])
+
+print(lines)
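Nothing in the PR documents converter.py, but from the code it reads the log file named by `sys.argv[1]` and groups each `file:line` header with the lines that follow it. A hedged sketch of a run, assuming CPython-style warning output (the sample log is invented for illustration):

```
$ cat escapes.log
nltk/chunk/util.py:371: DeprecationWarning: invalid escape sequence \S
  _LINE_RE = re.compile("(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")
$ python converter.py escapes.log
[['nltk/chunk/util.py:3', '  _LINE_RE = re.compile("(\\S+)\\s+(\\S+)\\s+([IOB])-?(\\S+)?")']]
```

Note that `re.search(r"^.*:\d", line)` stops after the first digit of the line number (`:371` is captured as `:3`), and that `lines[-1].append(...)` assumes the very first log line is a `file:line` header; neither detail matters for a quick one-off script.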
8 changes: 4 additions & 4 deletions nltk/app/chunkparser_app.py
@@ -202,7 +202,7 @@ class RegexpChunkApp(object):
             "\t<regexp><RB>?<VBD></regexp>\n"
             '\t\tMatches <match>"ran/VBD"</match>\n'
             '\t\tMatches <match>"slowly/RB ate/VBD"</match>\n'
-            "\t<regexp><\#><CD> # This is a comment...</regexp>\n"
+            r"\t<regexp><\#><CD> # This is a comment...</regexp>\n"
             '\t\tMatches <match>"#/# 100/CD"</match>\n'
             "</hangindent>",
         ),
@@ -311,7 +311,7 @@ def normalize_grammar(self, grammar):
         grammar = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", grammar)
         # Normalize whitespace
         grammar = re.sub(" +", " ", grammar)
-        grammar = re.sub("\n\s+", "\n", grammar)
+        grammar = re.sub(r"\n\s+", "\n", grammar)
         grammar = grammar.strip()
         # [xx] Hack: automatically backslash $!
         grammar = re.sub(r"([^\\])\$", r"\1\\$", grammar)
@@ -1057,7 +1057,7 @@ def show_help(self, tab):
                 "\t%s\t%s" % item
                 for item in sorted(
                     list(self.tagset.items()),
-                    key=lambda t_w: re.match("\w+", t_w[0])
+                    key=lambda t_w: re.match(r"\w+", t_w[0])
                     and (0, t_w[0])
                     or (1, t_w[0]),
                 )
@@ -1418,7 +1418,7 @@ def load_grammar(self, filename=None):
         with open(filename, "r") as infile:
             grammar = infile.read()
         grammar = re.sub(
-            "^\# Regexp Chunk Parsing Grammar[\s\S]*" "F-score:.*\n", "", grammar
+            r"^\# Regexp Chunk Parsing Grammar[\s\S]*" "F-score:.*\n", "", grammar
         ).lstrip()
         self.grammarbox.insert("1.0", grammar)
         self.update()
2 changes: 1 addition & 1 deletion nltk/ccg/combinator.py
@@ -214,7 +214,7 @@ def backwardBxConstraint(left, right):


 class UndirectedSubstitution(UndirectedBinaryCombinator):
-    """
+    r"""
     Substitution (permutation) combinator.
     Implements rules of the form
     Y/Z (X\Y)/Z -> X/Z (<Sx)
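Docstrings need the same treatment as the patterns: a docstring is an ordinary string literal, so the `X\Y` in the combinator rules above would trigger the invalid-escape warning without the `r` prefix. A small illustration (not code from NLTK):

```python
def rule():
    r"""Implements rules of the form Y/Z (X\Y)/Z -> X/Z (<Sx)."""

print(rule.__doc__)  # the backslash in "X\Y" survives verbatim
```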
2 changes: 1 addition & 1 deletion nltk/chat/iesha.py
@@ -49,7 +49,7 @@
     (
         r"(.*) don\'t you (.*)",
         (
-            "u think I can%2??! really?? kekeke \<_\<",
+            r"u think I can%2??! really?? kekeke \<_\<",
            "what do u mean%2??!",
            "i could if i wanted, don't you think!! kekeke",
        ),
10 changes: 5 additions & 5 deletions nltk/chunk/named_entity.py
@@ -171,11 +171,11 @@ def _parse_to_tagged(sent):


 def shape(word):
-    if re.match("[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word, re.UNICODE):
+    if re.match(r"[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word, re.UNICODE):
         return "number"
-    elif re.match("\W+$", word, re.UNICODE):
+    elif re.match(r"\W+$", word, re.UNICODE):
         return "punct"
-    elif re.match("\w+$", word, re.UNICODE):
+    elif re.match(r"\w+$", word, re.UNICODE):
         if word.istitle():
             return "upcase"
         elif word.islower():
@@ -247,8 +247,8 @@ def load_ace_file(textfile, fmt):
     def subfunc(m):
         return " " * (m.end() - m.start() - 6)

-    text = re.sub("[\s\S]*<TEXT>", subfunc, text)
-    text = re.sub("</TEXT>[\s\S]*", "", text)
+    text = re.sub(r"[\s\S]*<TEXT>", subfunc, text)
+    text = re.sub(r"</TEXT>[\s\S]*", "", text)

     # Simplify quotes
     text = re.sub("``", ' "', text)
26 changes: 13 additions & 13 deletions nltk/chunk/regexp.py
@@ -64,7 +64,7 @@ class ChunkString(object):
     _CHUNK = r"(\{%s+?\})+?" % CHUNK_TAG
     _CHINK = r"(%s+?)+?" % CHUNK_TAG
     _VALID = re.compile(r"^(\{?%s\}?)*?$" % CHUNK_TAG)
-    _BRACKETS = re.compile("[^\{\}]+")
+    _BRACKETS = re.compile(r"[^\{\}]+")
     _BALANCED_BRACKETS = re.compile(r"(\{\})*$")

     def __init__(self, chunk_struct, debug_level=1):
@@ -209,7 +209,7 @@ def xform(self, regexp, repl):
         # The substitution might have generated "empty chunks"
         # (substrings of the form "{}"). Remove them, so they don't
         # interfere with other transformations.
-        s = re.sub("\{\}", "", s)
+        s = re.sub(r"\{\}", "", s)

         # Make sure that the transformation was legal.
         if self._debug > 1:
@@ -420,7 +420,7 @@ def __init__(self, tag_pattern, descr):
             "(?P<chunk>%s)%s"
             % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHINK_PATTERN)
         )
-        RegexpChunkRule.__init__(self, regexp, "{\g<chunk>}", descr)
+        RegexpChunkRule.__init__(self, regexp, r"{\g<chunk>}", descr)

     def __repr__(self):
         """
@@ -465,7 +465,7 @@ def __init__(self, tag_pattern, descr):
             "(?P<chink>%s)%s"
             % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHUNK_PATTERN)
         )
-        RegexpChunkRule.__init__(self, regexp, "}\g<chink>{", descr)
+        RegexpChunkRule.__init__(self, regexp, r"}\g<chink>{", descr)

     def __repr__(self):
         """
@@ -504,8 +504,8 @@ def __init__(self, tag_pattern, descr):
         of this rule.
         """
         self._pattern = tag_pattern
-        regexp = re.compile("\{(?P<chunk>%s)\}" % tag_pattern2re_pattern(tag_pattern))
-        RegexpChunkRule.__init__(self, regexp, "\g<chunk>", descr)
+        regexp = re.compile(r"\{(?P<chunk>%s)\}" % tag_pattern2re_pattern(tag_pattern))
+        RegexpChunkRule.__init__(self, regexp, r"\g<chunk>", descr)

     def __repr__(self):
         """
@@ -569,7 +569,7 @@ def __init__(self, left_tag_pattern, right_tag_pattern, descr):
                 tag_pattern2re_pattern(right_tag_pattern),
             )
         )
-        RegexpChunkRule.__init__(self, regexp, "\g<left>", descr)
+        RegexpChunkRule.__init__(self, regexp, r"\g<left>", descr)

     def __repr__(self):
         """
@@ -702,13 +702,13 @@ def __init__(self, left_tag_pattern, right_tag_pattern, descr):
         self._left_tag_pattern = left_tag_pattern
         self._right_tag_pattern = right_tag_pattern
         regexp = re.compile(
-            "(?P<left>%s)\{(?P<right>%s)"
+            r"(?P<left>%s)\{(?P<right>%s)"
             % (
                 tag_pattern2re_pattern(left_tag_pattern),
                 tag_pattern2re_pattern(right_tag_pattern),
             )
         )
-        RegexpChunkRule.__init__(self, regexp, "{\g<left>\g<right>", descr)
+        RegexpChunkRule.__init__(self, regexp, r"{\g<left>\g<right>", descr)

     def __repr__(self):
         """
@@ -772,13 +772,13 @@ def __init__(self, left_tag_pattern, right_tag_pattern, descr):
         self._left_tag_pattern = left_tag_pattern
         self._right_tag_pattern = right_tag_pattern
         regexp = re.compile(
-            "(?P<left>%s)\}(?P<right>%s)"
+            r"(?P<left>%s)\}(?P<right>%s)"
             % (
                 tag_pattern2re_pattern(left_tag_pattern),
                 tag_pattern2re_pattern(right_tag_pattern),
             )
         )
-        RegexpChunkRule.__init__(self, regexp, "\g<left>\g<right>}", descr)
+        RegexpChunkRule.__init__(self, regexp, r"\g<left>\g<right>}", descr)

     def __repr__(self):
         """
@@ -890,7 +890,7 @@ def __repr__(self):
 # this should probably be made more strict than it is -- e.g., it
 # currently accepts 'foo'.
 CHUNK_TAG_PATTERN = re.compile(
-    r"^((%s|<%s>)*)$" % ("([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+", "[^\{\}<>]+")
+    r"^((%s|<%s>)*)$" % (r"([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+", r"[^\{\}<>]+")
 )


@@ -1130,7 +1130,7 @@ def __str__(self):


 class RegexpParser(ChunkParserI):
-    """
+    r"""
     A grammar based chunk parser. ``chunk.RegexpParser`` uses a set of
     regular expression patterns to specify the behavior of the parser.
     The chunking of the text is encoded using a ``ChunkString``, and
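The replacement-template changes in this file are the same issue in a different spot: `re.sub` group references such as `\g<chunk>` live in the *replacement* string, and `\g` is an invalid escape in a non-raw literal. A minimal illustration of the template mechanism (this is plain `re`, not NLTK's ChunkString machinery):

```python
import re

tags = "<DT><NN><VBD>"
# \g<chunk> in the replacement refers to the named group; the r-prefix
# keeps "\g" from being parsed as a (bogus) string escape.
print(re.sub(r"(?P<chunk><DT><NN>)", r"{\g<chunk>}", tags))
# {<DT><NN>}<VBD>
```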
6 changes: 3 additions & 3 deletions nltk/chunk/util.py
@@ -368,7 +368,7 @@ def tagstr2tree(

 ### CONLL

-_LINE_RE = re.compile("(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")
+_LINE_RE = re.compile(r"(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")


 def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
@@ -514,7 +514,7 @@ def tree2conllstr(t):
     re.DOTALL,
 )

-_IEER_TYPE_RE = re.compile('<b_\w+\s+[^>]*?type="(?P<type>\w+)"')
+_IEER_TYPE_RE = re.compile(r'<b_\w+\s+[^>]*?type="(?P<type>\w+)"')


 def _ieer_read_text(s, root_label):
@@ -523,7 +523,7 @@ def _ieer_read_text(s, root_label):
     # return the empty list in place of a Tree
     if s is None:
         return []
-    for piece_m in re.finditer("<[^>]+>|[^\s<]+", s):
+    for piece_m in re.finditer(r"<[^>]+>|[^\s<]+", s):
         piece = piece_m.group()
         try:
             if piece.startswith("<b_"):
2 changes: 1 addition & 1 deletion nltk/classify/maxent.py
@@ -1287,7 +1287,7 @@ def calculate_deltas(
     nftranspose,
     encoding,
 ):
-    """
+    r"""
     Calculate the update values for the classifier weights for
     this iteration of IIS. These update weights are the value of
     ``delta`` that solves the equation::
2 changes: 1 addition & 1 deletion nltk/classify/rte_classify.py
@@ -58,7 +58,7 @@ def __init__(self, rtepair, stop=True, use_lemmatize=False):
         self.negwords = set(["no", "not", "never", "failed", "rejected", "denied"])
         # Try to tokenize so that abbreviations, monetary amounts, email
         # addresses, URLs are single tokens.
-        tokenizer = RegexpTokenizer("[\w.@:/]+|\w+|\$[\d.]+")
+        tokenizer = RegexpTokenizer(r"[\w.@:/]+|\w+|\$[\d.]+")

         # Get the set of word types for text and hypothesis
         self.text_tokens = tokenizer.tokenize(rtepair.text)
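For the tokenizer change just above, only the literal's prefix changes; the pattern is identical. A quick sketch of what that pattern keeps together (assumes an installed nltk; the sample sentence is invented):

```python
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r"[\w.@:/]+|\w+|\$[\d.]+")
print(tokenizer.tokenize("Mail bob@example.org about the $2.50 fee"))
# ['Mail', 'bob@example.org', 'about', 'the', '$2.50', 'fee']
```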
22 changes: 11 additions & 11 deletions nltk/corpus/__init__.py
@@ -113,19 +113,19 @@
 conll2002 = LazyCorpusLoader(
     "conll2002",
     ConllChunkCorpusReader,
-    ".*\.(test|train).*",
+    r".*\.(test|train).*",
     ("LOC", "PER", "ORG", "MISC"),
     encoding="utf-8",
 )
 conll2007 = LazyCorpusLoader(
     "conll2007",
     DependencyCorpusReader,
-    ".*\.(test|train).*",
+    r".*\.(test|train).*",
     encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")],
 )
-crubadan = LazyCorpusLoader("crubadan", CrubadanCorpusReader, ".*\.txt")
+crubadan = LazyCorpusLoader("crubadan", CrubadanCorpusReader, r".*\.txt")
 dependency_treebank = LazyCorpusLoader(
-    "dependency_treebank", DependencyCorpusReader, ".*\.dp", encoding="ascii"
+    "dependency_treebank", DependencyCorpusReader, r".*\.dp", encoding="ascii"
 )
 floresta = LazyCorpusLoader(
     "floresta",
@@ -308,7 +308,7 @@
 switchboard = LazyCorpusLoader("switchboard", SwitchboardCorpusReader, tagset="wsj")
 timit = LazyCorpusLoader("timit", TimitCorpusReader)
 timit_tagged = LazyCorpusLoader(
-    "timit", TimitTaggedCorpusReader, ".+\.tags", tagset="wsj", encoding="ascii"
+    "timit", TimitTaggedCorpusReader, r".+\.tags", tagset="wsj", encoding="ascii"
 )
 toolbox = LazyCorpusLoader(
     "toolbox", ToolboxCorpusReader, r"(?!.*(README|\.)).*\.(dic|txt)"
@@ -332,7 +332,7 @@
 treebank_raw = LazyCorpusLoader(
     "treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2"
 )
-twitter_samples = LazyCorpusLoader("twitter_samples", TwitterCorpusReader, ".*\.json")
+twitter_samples = LazyCorpusLoader("twitter_samples", TwitterCorpusReader, r".*\.json")
 udhr = LazyCorpusLoader("udhr", UdhrCorpusReader)
 udhr2 = LazyCorpusLoader("udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8")
 universal_treebanks = LazyCorpusLoader(
@@ -361,7 +361,7 @@
     WordNetCorpusReader,
     LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
 )
-wordnet_ic = LazyCorpusLoader("wordnet_ic", WordNetICCorpusReader, ".*\.dat")
+wordnet_ic = LazyCorpusLoader("wordnet_ic", WordNetICCorpusReader, r".*\.dat")
 words = LazyCorpusLoader(
     "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
 )
@@ -371,7 +371,7 @@
     "propbank",
     PropbankCorpusReader,
     "prop.txt",
-    "frames/.*\.xml",
+    r"frames/.*\.xml",
     "verbs.txt",
     lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
     treebank,
@@ -380,7 +380,7 @@
     "nombank.1.0",
     NombankCorpusReader,
     "nombank.1.0",
-    "frames/.*\.xml",
+    r"frames/.*\.xml",
     "nombank.1.0.words",
     lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
     treebank,
@@ -389,7 +389,7 @@
     "propbank",
     PropbankCorpusReader,
     "prop.txt",
-    "frames/.*\.xml",
+    r"frames/.*\.xml",
     "verbs.txt",
     lambda filename: filename.upper(),
     ptb,
@@ -398,7 +398,7 @@
     "nombank.1.0",
     NombankCorpusReader,
     "nombank.1.0",
-    "frames/.*\.xml",
+    r"frames/.*\.xml",
     "nombank.1.0.words",
     lambda filename: filename.upper(),
     ptb,
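All of the corpus-loader arguments touched in this file are fileid regexes, and since `\.` is not a recognized string escape, the raw and non-raw literals denote the same string. The prefix only silences the compile-time warning; matching is unaffected, as a two-line check shows:

```python
import re

assert ".*\.json" == r".*\.json"  # same string (the non-raw form warns at compile time)
assert re.fullmatch(r".*\.json", "tweets.json")  # fileid pattern matches as before
```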
3 changes: 2 additions & 1 deletion nltk/corpus/reader/api.py
@@ -72,7 +72,8 @@ def __init__(self, root, fileids, encoding="utf8", tagset=None):
         """
         # Convert the root to a path pointer, if necessary.
         if isinstance(root, str) and not isinstance(root, PathPointer):
-            m = re.match("(.*\.zip)/?(.*)$|", root)
+            m = re.match(r"(.*\.zip)/?(.*)$|", root)
+
             zipfile, zipentry = m.groups()
             if zipfile:
                 root = ZipFilePathPointer(zipfile, zipentry)
2 changes: 1 addition & 1 deletion nltk/corpus/reader/bnc.py
@@ -12,7 +12,7 @@


 class BNCCorpusReader(XMLCorpusReader):
-    """Corpus reader for the XML version of the British National Corpus.
+    r"""Corpus reader for the XML version of the British National Corpus.

     For access to the complete XML data structure, use the ``xml()``
     method. For access to simple word lists and tagged word lists, use
2 changes: 1 addition & 1 deletion nltk/corpus/reader/bracket_parse.py
@@ -212,7 +212,7 @@ def __init__(self, root, encoding="ISO-8859-1", tagset=None):
         BracketParseCorpusReader.__init__(
             self,
             root,
-            "alpino\.xml",
+            r"alpino\.xml",
             detect_blocks="blankline",
             encoding=encoding,
             tagset=tagset,
2 changes: 1 addition & 1 deletion nltk/corpus/reader/childes.py
@@ -277,7 +277,7 @@ def _get_age(self, fileid, speaker, month):

     def convert_age(self, age_year):
         "Caclculate age in months from a string in CHILDES format"
-        m = re.match("P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
+        m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
         age_month = int(m.group(1)) * 12 + int(m.group(2))
         try:
             if int(m.group(3)) > 15:
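The CHILDES pattern above parses ISO-8601-style age strings. A hedged sketch of the arithmetic in `convert_age`, using an invented age string:

```python
import re

m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", "P2Y6M14D")  # 2 years, 6 months, 14 days
print(int(m.group(1)) * 12 + int(m.group(2)))  # 30 months; group(3) == "14" feeds the > 15 check above
```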
2 changes: 1 addition & 1 deletion nltk/corpus/reader/framenet.py
@@ -2744,7 +2744,7 @@ def _strip_tags(self, data):
         """

         try:
-            """
+            r"""
             # Look for boundary issues in markup. (Sometimes FEs are pluralized in definitions.)
             m = re.search(r'\w[<][^/]|[<][/][^>]+[>](s\w|[a-rt-z0-9])', data)
             if m:
2 changes: 1 addition & 1 deletion nltk/corpus/reader/plaintext.py
@@ -44,7 +44,7 @@ def __init__(
         para_block_reader=read_blankline_block,
         encoding="utf8",
     ):
-        """
+        r"""
         Construct a new plaintext corpus reader for a set of documents
         located at the given root directory. Example usage:

2 changes: 1 addition & 1 deletion nltk/corpus/reader/switchboard.py
@@ -110,7 +110,7 @@ def _words_block_reader(self, stream):
     def _tagged_words_block_reader(self, stream, tagset=None):
         return sum(self._tagged_discourses_block_reader(stream, tagset)[0], [])

-    _UTTERANCE_RE = re.compile("(\w+)\.(\d+)\:\s*(.*)")
+    _UTTERANCE_RE = re.compile(r"(\w+)\.(\d+)\:\s*(.*)")
     _SEP = "/"

     def _parse_utterance(self, utterance, include_tag, tagset=None):