/
HolbrookCorpus.py
91 lines (81 loc) · 2.87 KB
/
HolbrookCorpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
from Datum import Datum
from Sentence import Sentence
class HolbrookCorpus:
    """Parser for the Holbrook spelling-error corpus.

    Each parsed sentence is a Sentence of Datum objects. A Datum holds the
    correct word and, where the corpus marked a spelling error with
    ``<err targ=...> ... </err>`` markup, the misspelled word as well.
    """
    # NOTE: the original version kept a mutable class-level attribute
    # `corpus = []`; it was never used (both __init__ branches assign the
    # instance attribute) and risked shared state, so it has been removed.

    def __init__(self, filename=None):
        """Create a corpus; read sentences from `filename` when given.

        self.corpus is always a list of Sentence objects afterwards.
        """
        if filename:
            self.read_holbrook(filename)
        else:
            self.corpus = []

    def processLine(self, line):
        """Parse one raw corpus line into a Sentence of Datum objects.

        Lowercases, strips punctuation, and expands <err targ=...> ... </err>
        markup into (correct, incorrect) Datum pairs. Returns None when the
        line is empty after cleaning.
        """
        line = line.strip().lower()
        # Delete punctuation that would otherwise stick to tokens. Each
        # removal is an independent single-character deletion, so the order
        # of this loop does not matter.
        for ch in ('"', ',', '.', '!', "'", ':', ';'):
            line = line.replace(ch, '')
        if line == '':
            return None
        processed_tokens = Sentence()
        processed_tokens.append(Datum("<s>"))  # start symbol
        tokens = line.split()
        i = 0
        while i < len(tokens):
            token = tokens[i]
            if token == '<err':
                # tokens[i+1] looks like: targ=<correct-form>
                targ = tokens[i + 1]
                targ_splits = targ.split('=')
                correct_token = targ_splits[1][:-1]  # chop off the trailing '>'
                correct_token_splits = correct_token.split()
                if len(correct_token_splits) > 2:  # targ with multiple words
                    for correct_word in correct_token_splits:
                        processed_tokens.append(Datum(correct_word))
                elif tokens[i + 3] != '</err>':
                    # Misspelling spans several tokens; keep only the correction.
                    processed_tokens.append(Datum(correct_token))
                else:
                    incorrect_token = tokens[i + 2]
                    processed_tokens.append(Datum(correct_token, incorrect_token))
                # Jump past the closing </err> tag, however far away it is.
                i += tokens[i:].index('</err>') + 1
            else:  # regular word
                processed_tokens.append(Datum(token))
                i += 1
        processed_tokens.append(Datum("</s>"))  # end symbol
        return processed_tokens

    def read_holbrook(self, filename):
        """Read in holbrook data, returns a list (sentence) of list(words) of lists(alternatives).
        The first item in each word list is the correct word.

        Blank lines are skipped. The file handle is closed via `with`
        (the original implementation leaked it).
        """
        self.corpus = []
        with open(filename) as f:
            for line in f:
                sentence = self.processLine(line)
                if sentence:
                    self.corpus.append(sentence)

    def generateTestCases(self):
        """Returns a list of sentences with exactly 1 eligible spelling error."""
        testCases = []  # list of Sentences
        for sentence in self.corpus:
            cleanSentence = sentence.cleanSentence()
            for i in range(0, len(sentence)):
                datum_i = sentence.get(i)
                if datum_i.hasError() and datum_i.isValidTest():
                    # Copy the clean sentence, then re-insert the single
                    # erroneous datum at position i.
                    testSentence = Sentence(cleanSentence)
                    testSentence.put(i, datum_i)
                    testCases.append(testSentence)
        return testCases

    def slurpString(self, contents):
        """Reads a clean corpus from string instead of file. Used for submission."""
        lines = contents.split('\n')
        self.corpus = []
        for line in lines:
            sentence = self.processLine(line)
            if sentence:
                self.corpus.append(sentence)

    def __str__(self):
        # One parsed sentence per line.
        str_list = []
        for sentence in self.corpus:
            str_list.append(str(sentence))
        return '\n'.join(str_list)