/
tweetgenerator.py
120 lines (111 loc) · 4.54 KB
/
tweetgenerator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# Commented-out, function-based draft of the Markov chain generator (not executed).
#
# from dictogram import Dictogram
# import random
# from collections import deque
# import re
#
# def nth_order_markov(order, data):
#     markov_chain = dict()
#
#     for i in range(0, len(data)-order):
#         window = tuple(data[i: i+order])
#
#         if window in markov_chain:
#             markov_chain[window].update([data[i+order]])
#         else:
#             markov_chain[window] = Dictogram([data[i+order]])
#     return markov_chain
#
# def random_start(model):
#     if 'END' in model:
#         seed_word = 'END'
#         while seed_word == 'END':
#             seed_word = model['END'].return_weighted_random_word()
#         return seed_word
#     return random.choice(list(model.keys()))
#
# def random_sentence(length, markov_model):
#     current_word = random_start(markov_model)
#     sentence = [current_word]
#     for i in range(0, length):
#         current_dictogram = markov_model[current_word]
#         random_weighted_word = current_dictogram.return_weighted_random_word()
#         current_word = random_weighted_word
#         sentence.append(current_word)
#     sentence[0] = sentence[0].capitalize()
#     return ' '.join(sentence) + '.'
#     return sentence
from dictogram import Dictogram
from dictionary_histogram_weighted import generate_random_word
from get_corpus import read_file
import random
class Markov_nth_order(dict):
    """Nth-order Markov model stored as ``{window_tuple: Dictogram}``.

    Every key is a tuple of ``order`` consecutive words taken from the
    corpus.  Every value is a Dictogram counting the words observed
    immediately after that window.  The sentinel ``'END'`` is recorded as the
    follower of the final window of the corpus, so a literal corpus word
    ``'END'`` is indistinguishable from the sentinel.

    Attributes:
        order: number of words in each key tuple.
        types: number of distinct windows stored so far.
        tokens: number of (window, follower) observations recorded so far.
    """

    # Sentinel follower marking "nothing comes after this window".
    END_TOKEN = 'END'

    def __init__(self, word_list=None, order=2):
        """Create the model and, if ``word_list`` is given, train on it.

        Args:
            word_list: optional sequence of words to build the chain from.
            order: number of words per window; must be at least 1.

        Raises:
            ValueError: if ``order`` is smaller than 1 (an empty window can
                neither seed nor advance a sentence).
        """
        if order < 1:
            raise ValueError('order must be a positive integer')
        super(Markov_nth_order, self).__init__()  # Initialise empty dictionary
        self.types = 0
        self.tokens = 0
        self.order = order
        if word_list is not None:
            self.create_dict_of_dict(word_list)

    def create_dict_of_dict(self, word_list):
        """Record every ``order``-word window of ``word_list`` with its follower.

        The follower of a window is the word directly after it; the last
        window of the list is followed by ``'END'``.  A list shorter than
        ``order`` contains no complete window and adds nothing.

        Args:
            word_list: sequence of words (must support ``len`` and slicing).
        """
        order = self.order
        last_start = len(word_list) - order
        for start in range(last_start + 1):
            window = tuple(word_list[start:start + order])
            if start < last_start:
                # The word immediately after the window (index start + order).
                follower = word_list[start + order]
            else:
                # Final window: nothing follows it in the corpus.
                follower = self.END_TOKEN
            self.add_word_to_dict_of_dict(window, follower)

    def add_word_to_dict_of_dict(self, current_type, next_type):
        """Count ``next_type`` as one more follower of ``current_type``.

        Args:
            current_type: key (window tuple) whose histogram is updated.
            next_type: word observed after ``current_type``.

        ``tokens`` grows by one on every call; ``types`` grows by one only
        when ``current_type`` was not yet a key.
        """
        if current_type in self:
            # Known window: bump the follower's frequency in its histogram.
            self[current_type].add_count(next_type)
        else:
            # NOTE(review): the histogram is built from the bare word.  If
            # Dictogram's constructor expects an iterable of words this
            # should be Dictogram([next_type]) -- confirm against dictogram.py.
            self[current_type] = Dictogram(next_type)
            self.types += 1
        self.tokens += 1

    def generate_random_sentence(self, word_list=None, sentence_length=8):
        """Random-walk the chain and return the visited words as a list.

        The walk starts from a uniformly chosen window.  At each step a
        follower is drawn with ``generate_random_word`` from the current
        window's histogram, appended to the sentence, and the window slides
        forward by one word.  Drawing ``'END'`` (or reaching a window with no
        histogram) restarts the walk from a fresh random window whose words
        are appended to the sentence.

        Args:
            word_list: unused; accepted only so existing callers keep working.
            sentence_length: number of words to return.

        Returns:
            A list of exactly ``sentence_length`` words, or an empty list when
            the model is empty or ``sentence_length`` is not positive.
        """
        if sentence_length <= 0 or not self:
            return []

        windows = list(self)  # materialise once; reused for every restart
        state = random.choice(windows)
        sentence = list(state)

        while len(sentence) < sentence_length:
            histogram = self.get(state)
            if histogram is None:
                follower = self.END_TOKEN
            else:
                follower = generate_random_word(histogram)

            if follower == self.END_TOKEN:
                # Dead end: reseed and emit the new window's words so the
                # output stays in step with the lookup state.
                state = random.choice(windows)
                sentence.extend(state)
            else:
                sentence.append(follower)
                state = state[1:] + (follower,)

        # The seed or a reseed may overshoot the requested length.
        return sentence[:sentence_length]