-
Notifications
You must be signed in to change notification settings - Fork 34
/
separated_parenthesis.py
205 lines (158 loc) · 6.63 KB
/
separated_parenthesis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import pyparsing as pypar
import logging
from .Grammars import parenthesis_nester
from . import nlp
class separated_parenthesis(object):
    """
    Separates parenthetical content into new sentences. This is useful when
    creating word embeddings, as associations should only be made within the
    same sentence.

    This parser returns a document that is sentence chunked and appends
    parenthetical content as a new sentence to the sentence following the
    sentence it was found in. Terminal punctuation of a period is added to
    parenthetical sentences if necessary. Parenthetical sentences can be
    pruned by setting min_keep_length.

    Example:
        input = 'Hello (it is a beautiful day) world.'
        output = 'Hello world. it is a beautiful day .'
    """

    # Matching left/right delimiters whose counts must balance within a
    # sentence for the grammar pass to be attempted
    _bracket_pairs = (("(", ")"), ("[", "]"), ("{", "}"))

    # Translation table that deletes every bracket character in one pass
    _strip_brackets = str.maketrans("", "", "()[]{}")

    def __init__(self, min_keep_length=0):
        """
        Initialize the parser.

        Args:
            min_keep_length: if None keep everything, if 0 drop everything
                (default), for any other integer n, keep only if the
                statement is at least n tokens long.
        """
        self.logger = logging.getLogger(__name__)
        self.min_keep_length = min_keep_length
        self.grammar = parenthesis_nester()

    def __call__(self, text):
        """
        Runs the parser.

        Args:
            text: a string document
        Returns:
            text: A string document with parenthetical content processed
        """
        # Known issue - pattern will split on punctuation, even when found in
        # parenthetical content. So, the sentence "A A V (C D. A B) A." would
        # be split into sentences "A A V (C D." and " A B) A."

        # Collapse all runs of whitespace to single spaces
        text = " ".join(text.strip().split())
        parsed = nlp(text, disable=["tagger"])

        doc_out = []
        for parsed_sent in parsed.sents:
            # Get the raw text for the sentence from spaCy
            sent = parsed_sent.text

            # If the count of any left delimiter doesn't match its right
            # counterpart, ignore all parenthesis in this sentence
            FLAG_valid = all(
                sent.count(lp) == sent.count(rp)
                for lp, rp in self._bracket_pairs
            )

            # Only attempt the grammar parse when the delimiters balance;
            # the original result was discarded on unbalanced input anyway
            if FLAG_valid:
                try:
                    tokens = self.grammar.grammar.parseString(sent)
                except (pypar.ParseException, RuntimeError):  # pragma: no cover
                    FLAG_valid = False

            if not FLAG_valid:
                # On fail simply remove all parenthesis
                sent = sent.translate(self._strip_brackets)
                doc_out.append(" ".join(sent.split()))
            else:
                doc_out.extend(self.paren_pop(tokens))

        return "\n".join(doc_out)

    def paren_pop(self, parsed_tokens):
        """
        Convert parsed grammar output into a list of sentence strings.

        Args:
            parsed_tokens: a ParseResult object
        Returns:
            content: a list of string sentences
        """
        # must convert the ParseResult to a list, otherwise adding it to a
        # list causes weird results.
        if isinstance(parsed_tokens, pypar.ParseResults):
            parsed_tokens = parsed_tokens.asList()

        return self.paren_pop_helper(parsed_tokens)

    def paren_pop_helper(self, tokens):
        """
        Recursively flatten nested parenthetical content into sentences.

        Args:
            tokens: a list of string sentences and parenthetical content lists
        Returns:
            new_tokens: a list of string sentences
        """
        # Check if token list is empty
        if not tokens:
            return tokens

        # Check if there is a single sentence in parenthetical content;
        # if so, use the sentence as tokens
        if isinstance(tokens[0], list) and len(tokens) == 1:
            tokens = tokens[0]

        new_tokens = []
        token_words = [x for x in tokens if isinstance(x, str)]

        # If tokens don't include parenthetical content, return as string
        if len(token_words) == len(tokens) and len(token_words):
            if token_words[-1] not in [".", "!", "?"]:
                token_words.append(".")
            return [remove_trailing_space(" ".join(token_words))]

        token_parens = [x for x in tokens if isinstance(x, list)]
        reorged_tokens = []

        # Iterate through all parenthetical content, recursing on them.
        # This allows content in nested parenthesis to be captured.
        for paren_tokens in token_parens:
            for sent in self.paren_pop_helper(paren_tokens):
                # Only keep if the sentence is at least as long as the
                # min_keep_length (None keeps everything)
                n_tokens_sent = len(sent.split())
                if (
                    self.min_keep_length is None
                    or self.min_keep_length <= n_tokens_sent
                ):
                    # Lazy %-args so the message is only built when emitted
                    self.logger.info(
                        "Expanded parenthetical content: %s", sent
                    )
                    reorged_tokens.append(sent)

        # Bundles outer sentence with inner parenthetical content
        if token_words:
            if token_words[-1] not in [".", "!", "?"]:
                token_words.append(".")
            new_tokens.append(" ".join(token_words))

        new_tokens.extend(reorged_tokens)

        # Remove an extra space left before punctuation by the joins
        return [remove_trailing_space(block) for block in new_tokens]
def remove_trailing_space(s, punctuation="!?.,"):
    """ Removes a trailing space in a sentence eg.
    "I saw a foo ." to "I saw a foo."
    """
    # Collapse a space before a terminal mark: "foo ." -> "foo."
    for mark in punctuation:
        if len(s) < 2:
            return s
        if s.endswith(" " + mark):
            s = s[:-2] + mark

    # Collapse a doubled terminal mark: "foo.." -> "foo."
    for mark in punctuation:
        if s.endswith(mark + mark):
            s = s[:-1]

    return s