/
find_quotations.py
60 lines (43 loc) · 1.65 KB
/
find_quotations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# USAGE
# Put this script in the same folder as the plaintext file you want to
# extract quoted speech from, then set `filename` in main() (the
# 'YOURTEXT.txt' value) to the name of that file.
# Run it from the command line as a Python 3 program like so:
# python3 find_quotations.py
# The results will be stored in a results.txt file.
import re
def read_in_text(filename, encoding='utf-8'):
    """Read a whole text file into a single string.

    Args:
        filename: Path of the plaintext file to read.
        encoding: Text encoding used to decode the file. Defaults to
            UTF-8 so the result does not depend on the platform's locale
            encoding (relevant for non-ASCII characters such as the
            guillemets this script searches for).

    Returns:
        The complete contents of the file as a ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(filename, 'r', encoding=encoding) as fin:
        return fin.read()
def count_quotation_marks(text):
    """Count the straight double quotation marks in a text.

    Only the ASCII character U+0022 is counted; typographic (curly)
    quotation marks are not.

    Args:
        text: The string to inspect.

    Returns:
        The number of double quotation mark characters in ``text``.
    """
    # A literal single-character count needs no regex or intermediate list.
    return text.count('"')
def count_single_quotation_marks(text):
    """Count the straight single quotation marks in a text.

    Only the ASCII apostrophe character U+0027 is counted, so apostrophes
    inside words contribute to the total; typographic (curly) single
    quotation marks do not.

    Args:
        text: The string to inspect.

    Returns:
        The number of single quotation mark characters in ``text``.
    """
    # A literal single-character count needs no regex or intermediate list.
    return text.count("'")
def find_quoted_quotes(text):
    """Return every straight-quoted passage found in ``text``.

    The quoting convention is guessed from the text itself: when straight
    double quotation marks are outnumbered by straight single quotation
    marks, single quotes are assumed to mark dialogue; otherwise (ties
    included) double quotes are assumed.

    Args:
        text: The string to search.

    Returns:
        A list of the matched passages, each including its surrounding
        quotation marks, in order of appearance.
    """
    if count_quotation_marks(text) < count_single_quotation_marks(text):
        # The lookarounds reject an opening mark preceded by a word
        # character and a closing mark followed by one, so an apostrophe
        # inside a word (e.g. don't) neither opens nor closes a passage.
        # '.' does not match a newline here, so a single-quoted passage
        # must sit on one line.
        return re.findall(r"(?<!\w)'.+?'(?!\w)", text)
    # [^"] also matches newlines, so double-quoted passages may span lines.
    return re.findall(r'"[^"]+"', text)
def find_non_anglo_quotes(text):
    """Return every passage enclosed in guillemets (« ... ») in ``text``.

    Args:
        text: The string to search.

    Returns:
        A list of the matched passages, each including its surrounding
        guillemets, in order of appearance. A passage must contain at
        least one character and must not cross a line break.
    """
    # Lazy '.+?' stops at the first closing guillemet; a greedy '.+' would
    # merge all quotations on one line into a single match.
    return re.findall(r'«.+?»', text)
def save_results(quotes, filename='results.txt', encoding='utf-8'):
    """Write each quote to a text file, followed by a blank line.

    Any existing file at ``filename`` is overwritten.

    Args:
        quotes: Iterable of strings to write, in order.
        filename: Path of the output file. Defaults to ``results.txt`` in
            the current working directory.
        encoding: Text encoding of the output file. Defaults to UTF-8 so
            that non-ASCII quotes can be written regardless of the
            platform's locale encoding.

    Raises:
        OSError: If the output file cannot be opened for writing.
    """
    with open(filename, 'w', encoding=encoding) as fout:
        # Each quote is terminated by two newlines, leaving one empty line
        # between consecutive entries.
        fout.writelines(quote + '\n\n' for quote in quotes)
def main(filename='YOURTEXT.txt'):
    """Extract quoted passages from a text file and save them.

    Reads ``filename``, collects the straight-quoted passages followed by
    the guillemet-quoted passages, and hands the combined list to
    ``save_results``.

    Args:
        filename: Path to the plaintext file to analyse. Change the
            default here, or pass a path on the command line.
    """
    text = read_in_text(filename)
    quotes = find_quoted_quotes(text)
    quotes.extend(find_non_anglo_quotes(text))
    save_results(quotes)


if __name__ == '__main__':
    import sys

    # An optional first command-line argument overrides the default file
    # name; with no argument the slice is empty and the default is used.
    main(*sys.argv[1:2])