# sk_markov.py
import re
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
# Set length of n-grams
NG = 3
# Reattach sentence-final punctuation (remove the space inserted before . and ?)
def format_sentence(sent):
    return sent.replace(' .', '.').replace(' ?', '?')
# Check if a sentence is finished
def is_finished(sent):
    return sent[-2:] in [' .', ' ?']
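# For illustration (hypothetical strings, not from the corpus): after the
# punctuation-splitting done in vectorize(), a finished sentence looks like
# 'Yes we can .', so is_finished('Yes we can .') is True and
# format_sentence('Yes we can .') returns 'Yes we can.'.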
def generate_tweet():
    with open('text_samples/obama.txt', 'r', encoding='utf-8') as f:
        # Drop the first 57 characters (presumably a header in the sample file)
        text = f.read()[57:].replace('\n', ' ')
    # text = re.sub(r'","|",|"', ',', re.sub(r'(?:(\d+?),)', r'\1', text))  # get rid of the commas in large numbers
    beginning = get_beginning(NG, text)
    # vocab_counts2 = vectorize(text, NG)
    # vocab_counts2 = [(t, c) for (t, c) in vocab_counts2 if c > 2]
    vocab_counts3 = vectorize(text, NG + 1)
    sentence = make_sentence(beginning, vocab_counts3, NG)
    # Resample until the formatted sentence fits in a 140-character tweet
    while len(format_sentence(sentence)) > 139:
        sentence = make_sentence(beginning, vocab_counts3, NG)
    return format_sentence(sentence)
# Find sentence beginnings and sample one, weighted by frequency
def get_beginning(ng, text):
    varpat = r'\s+\w+' * (ng - 1)
    beginnings = Counter(re.findall(r'(?:\n|\.\s+)([A-Z]\w*' + varpat + r')\s', text))
    # Keep only beginnings seen more than once, then sample proportionally to count
    beg, ct = zip(*[(b, c) for b, c in beginnings.items() if c > 1])
    beginning = np.random.choice(a=beg, p=np.array(ct).astype(float) / np.sum(ct))
    return beginning
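# E.g. with ng=3, get_beginning samples a capitalized three-word sentence
# opening such as 'The American people' (a hypothetical example), with
# probability proportional to how often that opening starts a sentence.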
# Vectorize the corpus into (n-gram, count) pairs
def vectorize(text, n):
    # Split sentence-final punctuation off into its own token
    text = re.sub(r'([a-zA-Z]+)([.?])', r'\1 \2', text)
    # Vectorize it. Don't criticize it.
    tf_vec = CountVectorizer(input='content', ngram_range=(n, n),
                             tokenizer=lambda x: x.split(), lowercase=False)
    tf_matrix = tf_vec.fit_transform([text])
    tf_vocab = tf_vec.get_feature_names_out()
    # Pair each n-gram with its count; materialize the list so it can be reused
    vocab_counts = list(zip(tf_vocab, tf_matrix.toarray()[0, :]))
    return vocab_counts
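# Sketch of the output on a toy corpus (not the Obama sample):
# vectorize('The cat sat . The cat ran .', 2) yields pairs like
# [('. The', 1), ('The cat', 2), ('cat ran', 1), ('cat sat', 1), ...],
# i.e. each distinct bigram paired with its count in the corpus.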
# Given an n-gram, sample the next word from its conditional distribution
def get_next_word(gram, vocab_counts3):
    options = [(t3.split()[-1], c3) for t3, c3 in vocab_counts3
               if ' '.join(t3.split()[:-1]) == gram]
    next_words, counts = zip(*options)
    return np.random.choice(a=next_words, p=np.array(counts).astype(float) / np.sum(counts))
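# Illustrative call (toy counts, not real corpus values): given 4-gram counts
# containing ('we can do this', 3) and ('we can do better', 1),
# get_next_word('we can do', vocab_counts3) returns 'this' with probability
# 0.75 and 'better' with probability 0.25.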
# Make a sentence, given a beginning
def make_sentence(beginning, vocab_counts3, ng=2):
    sent = beginning
    while len(sent) < 140:
        # Condition on the last ng words to pick the next one
        cur_gram = ' '.join(sent.split()[-ng:])  # .lower()
        sent += ' ' + get_next_word(cur_gram, vocab_counts3)
        if is_finished(sent) and len(sent) > 80:
            return sent
    return sent
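# Minimal usage sketch, assuming text_samples/obama.txt is present (the path
# is hard-coded above) and numpy/scikit-learn are installed:
if __name__ == '__main__':
    # Each call resamples a fresh tweet-length sentence from the Markov model
    print(generate_tweet())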