/
markov.py
96 lines (80 loc) · 3.42 KB
/
markov.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import random
from collections import defaultdict
class MarkovChain(object):
# symbols: a list of symbols in order
# ngram: the n-gram of this Markov chain generator
def __init__(self, symbols, ngram):
self.symbols = symbols
self.ngram = ngram
self.chain = defaultdict(list)
self.num_symbols = len(self.symbols)
if self.num_symbols < self.ngram:
raise AttributeError('Not enough symbols for this n-gram')
self._build_chain()
# Creates tuples of n-gram length from the symbol list
def _generate_adjacent_symbols(self):
for s in range(self.num_symbols - self.ngram + 1):
yield tuple(self.symbols[s:s+self.ngram])
# loop from last symbol to first to ensure every n-1 symbols form a
# prefix with a valid suffix
for s in range(self.num_symbols - self.ngram + 1, self.num_symbols):
yield tuple(self.symbols[s:] + self.symbols[:(s+self.ngram)%self.num_symbols])
# Builds a Markov chain based on adjacent symbols
def _build_chain(self):
for adjacent_symbols in self._generate_adjacent_symbols():
# Use first n-1 symbols as a prefix
prefix = adjacent_symbols[:-1]
# Use nth symbol as the suffix
suffix = adjacent_symbols[-1]
self.chain[prefix].append(suffix)
# Beginning with a random location in the Markov chain, this function
# aggregates a randomly generated list of symbols in the chain
# based on the n-gram
def generate(self, length=15):
seed = random.randint(0, self.num_symbols - self.ngram)
prefix = tuple(self.symbols[seed:seed+self.ngram-1])
suffix = self.symbols[seed+self.ngram-1]
for s in range(length):
yield suffix
prefix = prefix[1:] + (suffix,)
suffix = random.choice(self.chain[prefix])
class TextMarkovChain(MarkovChain):
# Randomly chooses a starting word for the generator, and returns
# its index in the list of symbols
def _choose_index_of_first_word(self):
first_guess = random.randint(0, self.num_symbols - self.ngram - 1)
idx = first_guess + 1
while idx != first_guess:
symbol = self.symbols[idx]
if len(symbol) > 0 and symbol[0].isupper():
return idx
if idx >= self.num_symbols - self.ngram - 1:
idx = 0
else:
idx += 1
# Generate as in MarkovChain, but choose the beginning of a
# sentence as the beginning of the result
def generate(self, length=15):
seed = self._choose_index_of_first_word()
prefix = tuple(self.symbols[seed:seed+self.ngram-1])
# Ensure the prefix is included in the result because the
# starting word is the first word of the prefix
for word in prefix:
yield word
suffix = self.symbols[seed+self.ngram-1]
for s in range(length - self.ngram - 1):
yield suffix
prefix = prefix[1:] + (suffix,)
suffix = random.choice(self.chain[prefix])
if __name__ == '__main__':
import sys
if len(sys.argv) != 2:
print("Usage: python markov.py <filename>")
exit()
filename = sys.argv[1]
symbols = []
with open(filename, 'r') as f:
symbols = f.read().split()
chain = TextMarkovChain(symbols, 4)
result = ' '.join(chain.generate(100))
print(result)