/
senGen.py
executable file
·131 lines (114 loc) · 4.62 KB
/
senGen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/python
import re
import random
import sys
class SentenceGenerator(object):
    """Markov-chain sentence generator built from a list of words.

    Typical use: obtain a word list (for example with ``wordlist``), feed it
    to ``buildMapping`` and then call ``genSentence`` repeatedly.
    """

    # Tokens that are glued to the previous word without a leading space.
    _PUNCTUATION = frozenset(".,!?;")
    # Word used when no start word is given or when a history is unknown.
    _FALLBACK_WORD = "wall"
    # A word (optionally prefixed with '#') or a single punctuation mark.
    _TOKEN_RE = re.compile(r"#?[\w']+|[.,!?;]")

    def __init__(self):
        # (tuple of words) -> {word: number of times the word followed the
        # tuple}.  Filled by addItemToTempMapping, read by buildMapping.
        self.tempMapping = {}
        # (tuple of words) -> {word: *normalized* frequency of the word
        # following the tuple}.  This is what next() samples from.
        self.mapping = {}
        # Words that can start a sentence: the first word of every word list
        # plus every non-punctuation word seen right after a ".".
        self.starts = []

    def fixCaps(self, word):
        """Normalize the capitalization of ``word``.

        Examples: "FOO" -> "foo", "LaTeX" -> "Latex", "wOOt" -> "woot".
        The pronoun "I" is kept as is; an empty string is returned unchanged.
        """
        if word.isupper() and word != "I":
            return word.lower()
        # Slicing instead of indexing so an empty string does not raise.
        if word[:1].isupper():
            return word.lower().capitalize()
        return word.lower()

    def toHashKey(self, lst):
        """Return ``lst`` as a tuple so it can be used as a dict key."""
        return tuple(lst)

    def wordlist(self, filename):
        """Read ``filename`` and return its words and (some) punctuation.

        Each token is passed through ``fixCaps``.
        """
        # The context manager closes the file even if read() raises.
        with open(filename, 'r') as f:
            text = f.read()
        return [self.fixCaps(w) for w in self._TOKEN_RE.findall(text)]

    def addItemToTempMapping(self, history, word):
        """Count ``word`` as a follower of every suffix of ``history``.

        Given history = ["the", "rain", "in"] and word = "Spain", "Spain" is
        counted under ("the", "rain", "in"), ("rain", "in") and ("in",).
        """
        for start in range(len(history)):
            key = self.toHashKey(history[start:])
            followers = self.tempMapping.setdefault(key, {})
            followers[word] = followers.get(word, 0.0) + 1.0

    def buildMapping(self, wordlist, markovLength):
        """Build the raw counts and the normalized mapping from ``wordlist``.

        ``markovLength`` is the maximum number of preceding words used as
        history; it is expected to be at least 1.  An empty ``wordlist``
        adds nothing.
        """
        if wordlist:
            self.starts.append(wordlist[0])
        # Start at index 0 so the transition from the very first word to its
        # successor is recorded as well.
        for i in range(len(wordlist) - 1):
            # At most markovLength words ending at position i.
            history = wordlist[max(0, i - markovLength + 1): i + 1]
            follow = wordlist[i + 1]
            # If the last element was a period, the next word can start a
            # sentence (unless it is itself punctuation or empty).
            if (history[-1] == "." and follow
                    and follow not in self._PUNCTUATION):
                self.starts.append(follow)
            self.addItemToTempMapping(history, follow)
        # Normalize the counts in tempMapping and store them in mapping.
        for first, followset in self.tempMapping.items():
            total = sum(followset.values())
            self.mapping[first] = {k: v / total for k, v in followset.items()}

    def next(self, prevList):
        """Return a randomly chosen next word given the previous ones.

        The longest suffix of ``prevList`` present in the mapping is used.
        If no suffix is known, the fallback word's followers are used; if
        the fallback word is unknown too, "" is returned.  ``prevList`` is
        not modified.
        """
        # Drawn first, exactly one draw per call.
        index = random.random()
        key = None
        for start in range(len(prevList)):
            candidate = self.toHashKey(prevList[start:])
            if candidate in self.mapping:
                key = candidate
                break
        if key is None:
            key = self.toHashKey([self._FALLBACK_WORD])
            if key not in self.mapping:
                return ""
        cumulative = 0.0
        choice = ""
        for word, probability in self.mapping[key].items():
            # Remember the latest word so that float rounding (cumulative
            # ending slightly below index) still yields a word.
            choice = word
            cumulative += probability
            if cumulative >= index:
                break
        return choice

    def genSentence(self, markovLength, startWord=None):
        """Generate one sentence and return it as a string.

        ``startWord`` defaults to the fallback word.  Words are appended
        until a "." is produced or next() yields no word; punctuation is
        attached without a preceding space.  NOTE: a mapping in which "."
        can never be reached may keep this loop running.
        """
        if startWord is None:
            startWord = self._FALLBACK_WORD
        curr = startWord
        pieces = [curr.capitalize()]
        prevList = [curr]
        # Probe the start word once; if no word can follow it, restart from
        # the fallback word (rendered in upper case).
        if len(self.next([curr])) == 0:
            curr = self._FALLBACK_WORD
            pieces = [self._FALLBACK_WORD.upper()]
            prevList = [curr]
        # Keep adding words until we hit a period (or run out of words).
        while curr not in ("", "."):
            curr = self.next(prevList)
            prevList.append(curr)
            # If the history has gotten too long, trim it.
            if len(prevList) > markovLength:
                prevList.pop(0)
            # Add spaces between words (but not before punctuation).
            if curr and curr not in self._PUNCTUATION:
                pieces.append(" ")
            pieces.append(curr)
        return "".join(pieces)
# These mappings can get fairly large -- they're stored as attributes of the
# SentenceGenerator instance to save copying time.
# (tuple of words) -> {dict: word -> number of times the word appears following the tuple}
# Example entry:
# ('eyes', 'turned') => {'to': 2.0, 'from': 1.0}
# Used briefly while first constructing the normalized mapping
# (tuple of words) -> {dict: word -> *normalized* number of times the word appears following the tuple}
# Example entry:
# ('eyes', 'turned') => {'to': 0.66666666, 'from': 0.33333333}
# Contains the set of words that can start sentences
# We want to be able to compare words independent of their capitalization.
# Word lists are converted to tuples so they can serve as dict keys; in
# practice this doesn't affect processing time too negatively.
# Returns the contents of the file, split into a list of words and
# (some) punctuation.
# Self-explanatory -- adds "word" to the "tempMapping" dict under "history".
# tempMapping (and mapping) both match each word to a list of possible next
# words.
# Given history = ["the", "rain", "in"] and word = "Spain", we add "Spain" to
# the entries for ["the", "rain", "in"], ["rain", "in"], and ["in"].