/
main.py
188 lines (164 loc) · 5.96 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import pylab
pylab.ion()
from monary import Monary
import numpy
from pandas import DataFrame
from pymongo import Connection
from BeautifulSoup import BeautifulSoup
import nltk
#####
# Database
#####
# Default MongoDB database name used by get_monary() and get_pymongo() below.
DBNAME = 'kaggle-wp'
def get_monary(conn=None, dbname=DBNAME, _cache={}):
    """Return ``[connection, dbname]`` for use with the Monary driver.

    The connection is memoized per ``dbname`` in the ``_cache`` default
    argument, which is intentionally shared across calls.  ``conn`` is only
    stored the first time a given ``dbname`` is requested (a fresh
    ``Monary()`` is created when ``conn`` is falsy); on later calls the
    cached connection is returned and ``conn`` is ignored.
    """
    try:
        connection = _cache[dbname]
    except KeyError:
        connection = _cache[dbname] = conn or Monary()
    return [connection, dbname]
def get_pymongo(dbname=DBNAME, _cache={}, *args, **kwargs):
    """Return the pymongo database object named ``dbname``.

    The database handle is memoized per ``dbname`` in the ``_cache`` default
    argument (shared across calls on purpose).  Extra ``*args`` and
    ``**kwargs`` are forwarded to ``Connection`` and therefore only matter
    the first time a given ``dbname`` is requested.
    """
    if dbname in _cache:
        return _cache[dbname]
    database = Connection(*args, **kwargs)[dbname]
    _cache[dbname] = database
    return database
def get_tu(_cache=[]):
    """Return a deep copy of the ``tu2`` collection as a pandas DataFrame.

    The collection is loaded through Monary only on the first call and kept
    in the ``_cache`` default argument; every call hands back an independent
    deep copy so callers can mutate the result freely.
    """
    if len(_cache) == 0:
        monary_conn, db_name = get_monary()
        column_names = ['blog', 'inTestSet', 'post_id', 'uid', 'like_dt']
        column_types = ['int32', 'int32', 'int32', 'int32', 'date']
        arrays = monary_conn.query(
            db_name, 'tu2', {}, column_names, column_types)
        # Monary returns one array per column; transpose to one row per record.
        table = numpy.matrix(arrays).transpose()
        _cache.append(DataFrame(table, columns=column_names))
    return _cache[0].copy(deep=True)
def get_tp(conn=None):
    """Return the ``tp`` collection using the pymongo driver.

    A Monary-based loader is not implemented; this prints a notice and falls
    back to the pymongo collection object.

    ``conn`` is accepted for interface compatibility but is currently unused.
    """
    print("not implemented, bumt here's the tp collection using pymongo driver")
    # get_pymongo is a function: it must be *called* to obtain the database
    # before the ``tp`` collection can be looked up on it.
    return get_pymongo().tp
    # TODO: load the tp collection through Monary (conn.query) once the
    # column list and types are known.
#####
# Plotting
#####
def subplots(grid=[2,2], figure_index=1, clear_subplot=True):
    """Iteratively draw pylab subplots in a grid.

    Returns an already-started coroutine.  Every ``send(plot_params)`` draws
    into the next cell of the grid (left to right, top to bottom, wrapping
    around after the last cell) and returns the axes that were drawn on.

    ``plot_params`` is either
      * a sequence of positional arguments for ``pylab.plot``, or
      * a dict with optional keys ``'args'`` (positional arguments) and
        ``'kwargs'`` (keyword arguments) for ``pylab.plot``.

    Parameters:
      grid -- ``[rows, cols]`` of the subplot grid.
      figure_index -- figure number selected once, when the coroutine starts.
      clear_subplot -- if true, clear each cell before plotting into it.

    You may need to set pylab.ion() to do this interactively

    Examples:
    >>> import pylab ; pylab.ion()
    >>> p = subplots(grid=[1,2])
    >>> for x in range(5):
    ...     p.send([range(10), range(10), 'r-'])
    ...     p.send({'args':[[1,2,3], [1,3,2]],
    ...             'kwargs': {'marker': 'o', 'color': 'green'}})
    ...     subplt = p.send([range(10), range(10), 'b^'])
    ...     subplt.set_title('hello') ; pylab.draw()
    """
    rows, cols = grid[0], grid[1]
    num_cells = rows * cols

    # initialized coroutine by wrapping in closure
    def _subplots():
        pylab.figure(figure_index)
        cell = 0
        ax = None
        while True:
            plot_params = (yield ax)
            if isinstance(plot_params, dict):
                plot_kwargs = plot_params.get('kwargs', {})
                plot_args = plot_params.get('args', ())
            else:
                plot_args = plot_params
                plot_kwargs = {}
            # matplotlib numbers subplot cells starting at 1, not 0.
            ax = pylab.subplot(rows, cols, cell + 1)
            if clear_subplot:
                ax.clear()
            pylab.plot(*plot_args, **plot_kwargs)
            pylab.draw()
            cell = (cell + 1) % num_cells

    coroutine = _subplots()
    # Advance to the first ``yield`` so callers can ``send()`` right away.
    next(coroutine)
    return coroutine
def plot_labeled_histogram(dist, fig):
    """Plot the values of ``dist`` with its keys as x-axis tick labels.

    ``dist`` is a dict or nltk.FreqDist.  ``fig`` is an object with a
    ``send()`` method accepting ``{'args': ..., 'kwargs': ...}``, such as the
    coroutine returned by subplots().  Each key gets an integer x position
    in iteration order; the tick labels are rotated by 90 degrees.
    """
    labels = list(dist.keys())
    positions = list(range(len(labels)))
    heights = dist.values()
    marker_style = {'linestyle': '', 'marker': '^'}
    fig.send({'args': (positions, heights), 'kwargs': marker_style})
    pylab.xticks(positions, labels, rotation=90)
#####
# Tools
#####
def limited(generator, num_iters=10):
    """Yield at most ``num_iters`` items from ``generator``.

    If ``num_iters`` is None, no limit is applied.

    Exactly as many items are pulled from the underlying iterator as are
    yielded, so calling this repeatedly on the same iterator produces
    consecutive batches without dropping an item in between.
    """
    if num_iters is None:
        num_iters = float('inf')
    iterator = iter(generator)
    yielded = 0
    # Check the limit *before* fetching, so no extra item is consumed.
    while yielded < num_iters:
        try:
            item = next(iterator)
        except StopIteration:
            return
        yield item
        yielded += 1
#####
# Analyzing post content
#####
def get_parsed_content(yield_with_post_id=False, **pymongo_find_kwargs):
    """Yield BeautifulSoup-parsed post content from the ``tp2`` collection.

    Keyword arguments are forwarded to the pymongo ``find`` call; when none
    are given, ``skip=8050, limit=10`` is used.  Each fetched document is
    printed before it is parsed.  With ``yield_with_post_id`` set, yields
    ``(post_id, parsed)`` tuples instead of the parsed content alone.
    """
    find_kwargs = pymongo_find_kwargs or {'skip': 8050, 'limit': 10}
    collection = get_pymongo().tp2
    cursor = collection.find(fields=['post_id', 'content'], **find_kwargs)
    for document in cursor:
        print(document)
        soup = BeautifulSoup(document['content'])
        yield (document['post_id'], soup) if yield_with_post_id else soup
def most_popular_content(df = None, db = None, yield_with_post_id=False, _cache={}):
    """Yields parsed content of posts sorted by most likes.

    Parameters:
      df -- DataFrame with a ``post_id`` column; post ids are ranked by how
            often they occur.  Defaults to get_tu().
      db -- database object exposing a ``tp2`` collection.  Defaults to
            get_pymongo().
      yield_with_post_id -- if true, yield ``(post_id, parsed)`` tuples
            instead of just the parsed content.

    The ranking for the default frame is memoized in ``_cache``; a frame
    passed in explicitly is always ranked on its own.  Post ids with no
    usable ``content`` in ``tp2`` are reported on stdout and skipped.
    """
    # ``not df`` is ambiguous for a DataFrame and raises; test for None.
    if df is None:
        if 'value_counts().index' not in _cache:
            _cache['value_counts().index'] = (
                get_tu().post_id.value_counts().index)
        post_ids = _cache['value_counts().index']
    else:
        # Never reuse the memoized ranking for a caller-supplied frame.
        post_ids = df.post_id.value_counts().index
    if db is None:
        db = get_pymongo()
    for pid in post_ids:
        try:
            content = db.tp2.find_one({'post_id': pid}, fields=['content'])['content']
        except (TypeError, KeyError):
            # TypeError: find_one returned None (no such post);
            # KeyError: the document has no 'content' field.
            print("pid %s exists in tu but not in tp" % pid)
            continue
        parsed = BeautifulSoup(content)
        if yield_with_post_id:
            yield (pid, parsed)
        else:
            yield parsed
def describe_content_html_tags(batch_size=1000, num_batches=4, plot_=True):
    """Find the frequency distribution of html tags in popular posts.

    Posts are taken from most_popular_content() in ``num_batches``
    consecutive batches of up to ``batch_size`` posts each, all drawn from
    one shared generator.  Returns a list with one nltk.FreqDist of tag
    names per batch.  When ``plot_`` is true, each distribution is also
    drawn as a labeled histogram on a 2x1 subplot grid.
    """
    posts = most_popular_content()
    if plot_:
        fig = subplots([2,1])

    def _tag_names_of_next_batch():
        # Lazily walk every tag of every post in the next batch.
        for parsed in limited(posts, batch_size):
            for tag in parsed.findAll():
                yield tag.name

    dists = []
    for _ in range(num_batches):
        batch_dist = nltk.FreqDist(_tag_names_of_next_batch())
        if plot_:
            plot_labeled_histogram(batch_dist, fig)
        dists.append(batch_dist)
    return dists
def _count_youtube_links(num, plot_=True):
    """Count youtube links in each of the first ``num`` most popular posts.

    Returns a numpy array of length ``num`` whose n-th entry is the number
    of ``<a>`` tags in the n-th post whose ``href`` contains 'youtube'
    (entries stay 0 if fewer than ``num`` posts are available).  Every
    matching url is printed as a debugging aid.  When ``plot_`` is true the
    counts are plotted with pylab.
    """
    print('count # youtube links in first XXX most popular posts')
    counts = numpy.zeros(num)
    for idx, parsed in enumerate(limited(most_popular_content(), num)):
        hrefs = [dict(anchor.attrs).get('href', '')
                 for anchor in parsed.findAll('a')]
        youtube_urls = [href for href in hrefs if 'youtube' in href]
        # debug:
        for url in youtube_urls:
            print(url)
        counts[idx] = len(youtube_urls)
    if plot_:
        pylab.plot(counts)  # hopefully this looks like a normal distribution
    return counts
def describe_content(num=1000):
    """Run both content summaries and return their results as a pair.

    Returns ``(dist, num_youtube_links)``: the result of
    describe_content_html_tags(num), where ``num`` is passed as the batch
    size, followed by the result of _count_youtube_links(num).
    """
    return describe_content_html_tags(num), _count_youtube_links(num)
def tokenize_words(parsed):
    """Return an nltk.FreqDist over the word tokens of ``parsed.text``."""
    return nltk.FreqDist(nltk.word_tokenize(parsed.text))
#t = get_parsed_content()
#print 'parsed stuff in t'