/
functions.py
430 lines (381 loc) · 19.2 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
#######################################################################
# File: functions.py
# Description: YOUStatAnalyzer script.
# Author: Mattia Zeni
# Created: Wed Feb 11 16:08:015 CET 2015
# E-mail: mattia.zeni@disi.unitn.it
# University of Trento - Department of Information Engineering and Computer Science
#
# (C) Copyright 2015, Mattia Zeni
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#######################################################################
import urllib2
import urllib
import cookielib
from xml.dom.minidom import parse, parseString
import re
from bs4 import BeautifulSoup
from pymongo import MongoClient
import random
import linecache
import json
import os
import os.path
HOST = 'www.youtube.com'
GDATA_HOST = 'gdata.youtube.com'
TOKEN_KEYWORD = 'account_playback_token'
UA = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/31.0.1650.57 Safari/537.36')
config_file = "config.xml"
yt_video_url = lambda vid: 'http://%s/watch?v=%s' % (HOST, vid)
yt_video_api_url= lambda vid: ('https://%s/feeds/api/videos/%s?v=2' % (GDATA_HOST, vid))
yt_related_video_url= lambda vid: ('https://%s/feeds/api/videos/%s/related?v=2' % (GDATA_HOST, vid))
yt_insights_url = lambda vid: ('https://%s/insight_ajax?action_get_statistics_and_data=1&v=%s' % (HOST, vid))
yt_gplus_url = lambda vid: ('https://plus.google.com/ripples/details?url=https://%s/watch?v=%s' % (HOST, vid))
#***************************************************************************************#
# Function: extractAttributeFromConfigFile
# Parameters: attribute
# How it works: the purpose of this function is to extract from the xml configuration
# file the value corresponding to the passed attribute
# Return: It returns the value of the corresponding configure parameter
#***************************************************************************************#
def extractAttributeFromConfigFile (tag, attribute):
config_parser = parse(config_file)
for configuration in config_parser.getElementsByTagName("config"):
for tags in configuration.getElementsByTagName(tag):
values = tags.getAttribute(attribute)
return values
#***************************************************************************************#
# Function: extractAttributeFromConfigFile
# Parameters: attribute
# How it works: the purpose of this function is to extract from the xml configuration
# file the value corresponding to the passed attribute
# Return: It returns the value of the corresponding configure parameter
#***************************************************************************************#
def extractVideoIDsFromConfigFile ():
config_parser = parse(config_file)
for configuration in config_parser.getElementsByTagName("config"):
for singleVideoSearch in configuration.getElementsByTagName("singlevideosearch"):
return singleVideoSearch.getElementsByTagName("video")
#***************************************************************************************#
# Function: getServerIp
# Parameters: none
# How it works: this 2P extract the MongoDB Server IP Address from the config file
# Return: It returns the IP address of the server running MongoDB
#***************************************************************************************#
def getServerIp ():
return extractAttributeFromConfigFile('dbconfiguration', 'ip')
#***************************************************************************************#
# Function: getServerPort
# Parameters: none
# How it works: this function extract the MongoDB Server port from the config file
# Return: It returns the number of the port of the server (default is 27017)
#***************************************************************************************#
def getServerPort ():
return int(extractAttributeFromConfigFile('dbconfiguration', 'port'))
#***************************************************************************************#
# Function: getServerDB
# Parameters: none
# How it works: this function extract the MongoDB Server database name
# Return: It returns the name of the database that must be used to store the documents
#***************************************************************************************#
def getServerDB ():
return extractAttributeFromConfigFile('dbconfiguration', 'database')
#***************************************************************************************#
# Function: getServerCollection
# Parameters: none
# How it works: this function extract the MongoDB Server collection name
# Return: It returns the name of the collection that must be used to store the documents
#***************************************************************************************#
def getServerCollection ():
return extractAttributeFromConfigFile('dbconfiguration', 'collection')
#***************************************************************************************#
# Function: create_opener
# Parameters: none
# How it works: this function creates a custom Opener with headers information that will
# be later used to fetch the video's webpage content
# Return: It returns the custom urllib2 Opener
#***************************************************************************************#$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
def create_opener(cookie_jar=None):
if not cookie_jar:
cookie_jar = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
opener.addheaders = [
('User-Agent', UA)
]
return opener
#***************************************************************************************#
# Function: fetch_video_html
# Parameters: opener, video_id
# How it works: this function extracts html code of the video's page from which we will
# extract the token that will be later used to get the statistics.
# It opens the videos webpage using open() funciton on the opener created
# by create_opener function.
# Return: It returns the html code of the webpage of the video
#***************************************************************************************#
def fetch_video_html(opener, video_id):
stream = opener.open(yt_video_url(video_id))
html = stream.read()
stream.close()
return html
#***************************************************************************************#
# Function: fetch_video_insights
# Parameters: opener, video_id, token
# How it works: this function extracts the statistics of the video using the custom
# opener, the ID of the video to be analyzed and the session token
# Return: It returns the simil-json containing the statistics
#***************************************************************************************#
def fetch_video_insights(opener, video_id, token):
url = yt_insights_url(video_id)
data = 'session_token=%s' % token
headers = {
'Origin': 'http://%s' % HOST,
'Referer': yt_video_url(video_id)
}
req = urllib2.Request(url, data, headers, HOST)
stream = opener.open(req)
return stream.read()
#***************************************************************************************#
# Function: get_insight_ajax_token
# Parameters: html
# How it works: this function gets the html content of the page containing the token and
# uses a regex to extract it. At the date we wrote this document (Nov-2014)
# the tag containing the token was 'account_playback_token'
# Return: It returns the session token as a string
#***************************************************************************************#
def get_insight_ajax_token(html):
m = re.search('\"'+TOKEN_KEYWORD+'\":\"(.+?)\"', html)
if m:
return m.group(1)
#***************************************************************************************#
# Function: extractVideoStatistics
# Parameters: videoID, data
# How it works: this function gets the html content of the page containing the statistics
# and returned by fetch_video_insights() with the videoID and using regex
# generates the statistics and insert the video object in MongoDB
#***************************************************************************************#
def extractVideoStatistics (videoID, data):
objectListData = [map(float, elem.split(',')) for elem in re.findall(r'["]+[\w]+[\\":]+[ ]+[\[]+([\d\.\d,\s]+|[\d,\s]+|[\d,\s\d]+|[-\d\.\d,\s]+|[-\d,\s]+|[-\d,\s\d]+)+[\]]', data)]
objectListLabels = re.findall(r'["]+[views]+[\\":]+|["]+[shares]+[\\":]+|["]+[subscribers]+[\\":]+|["]+[day]+[\\":]+|["]+[cumulative]+[\\":]+|["]+[daily]+[\\":]+|["]+[watch\-time]+[\\":]+', data)
video = {}
singleVideo = extractVideoData(videoID)
gplusdata = extractGplusStatistics(videoID)
if len(gplusdata) > 0:
singleVideo["gplus"]=[]
singleVideo["gplus"]=gplusdata
ii=0
for count2 in range(0,len(objectListLabels)):
if ('views' in objectListLabels[count2]) or ('shares' in objectListLabels[count2]) or ('subscribers' in objectListLabels[count2]) or ('watch-time' in objectListLabels[count2]):
level1 = filter(lambda x: x.isalpha(), objectListLabels[count2])
singleVideo[level1] = {}
elif ('daily' in objectListLabels[count2]) or ('cumulative' in objectListLabels[count2]):
level2 = filter(lambda x: x.isalpha(), objectListLabels[count2])
singleVideo[level1][level2] = {}
singleVideo[level1][level2]["data"] = objectListData[ii]
ii += 1
elif ('day' in objectListLabels[count2]):
level1 = filter(lambda x: x.isalpha(), objectListLabels[count2])
singleVideo[level1] = {}
singleVideo[level1]["data"] = objectListData[ii]
ii += 1
insertEntry(singleVideo)
return
#***************************************************************************************#
# Function: insertEntry
# Parameters: video
# How it works: this function inserts the dictionary into MongoDB
#***************************************************************************************#
def insertEntry(video):
try:
client = MongoClient(getServerIp(), getServerPort())
db = client[getServerDB()][getServerCollection()]
db.insert(video)
except pymongo.errors.DuplicateKeyError:
print 'Insert Error: Duplicate'
pass
#***************************************************************************************#
# Function: extractVideoData
# Parameters: videoID
# How it works: this function extracts video's meta-data information using youtube APIs
# Return: it returns a dict conteining these information
#***************************************************************************************#
def extractVideoData(videoID):
singleVideo = {}
singleVideo["_id"] = videoID
singleVideo["accessControl"] = {}
singleVideo["relatedVideos"] = []
s = urllib2.urlopen(yt_video_api_url(videoID))
videoParser = parseString(s.read())
for published in videoParser.getElementsByTagName("published"):
for published_sub in published.childNodes:
singleVideo["publishedDate"] = published_sub.data
for title in videoParser.getElementsByTagName("title"):
for title_sub in title.childNodes:
singleVideo["title"] = title_sub.data
for author in videoParser.getElementsByTagName("author"):
for authorName in author.getElementsByTagName("name"):
for authorName_sub in authorName.childNodes:
singleVideo["author"] = authorName_sub.data
for mediaGroup in videoParser.getElementsByTagName("media:group"):
for mediaInfo in mediaGroup.getElementsByTagName("media:content"):
singleVideo["type"] = mediaInfo.getAttribute("type")
singleVideo["duration"] = mediaInfo.getAttribute("duration")
for mediaCategory in mediaGroup.getElementsByTagName("media:category"):
if mediaCategory.hasAttribute("label"):
for category_sub in mediaCategory.childNodes:
singleVideo["category"] = category_sub.nodeValue
for mediaDescription in mediaGroup.getElementsByTagName("media:description"):
for description_sub in mediaDescription.childNodes:
singleVideo["description"] = description_sub.data
for gdComments in videoParser.getElementsByTagName("gd:comments"):
for gdFeedlink in gdComments.getElementsByTagName("gd:feedLink"):
singleVideo["commentsNumber"] = gdFeedlink.getAttribute("countHint")
for accessControl in videoParser.getElementsByTagName("yt:accessControl"):
singleVideo["accessControl"][accessControl.getAttribute("action")] = {}
singleVideo["accessControl"][accessControl.getAttribute("action")]["permission"] = accessControl.getAttribute("permission")
sRelated = urllib2.urlopen(yt_related_video_url(videoID))
relatedVideoParser = parseString(sRelated.read())
for relatedMediaGroup in relatedVideoParser.getElementsByTagName("media:group"):
for relatedVideoID in relatedMediaGroup.getElementsByTagName("yt:videoid"):
for relatedVideoID_sub in relatedVideoID.childNodes:
singleVideo["relatedVideos"].append(relatedVideoID_sub.data)
return singleVideo
#***************************************************************************************#
# Function: launchScraper
# Parameters: videoID
# How it works: function called by the main Program of the script that for each video
# passed as parameter, collects all the needed information
# Details: This function is called for each videoID found, it generates the custom urllib2
# Opener with create_opener(), then gets the html content of the video's webpage
# with fetch_video_html(opener, videoID) in order to extract the session token
# with get_insight_ajax_token(video_page), then if the token has been succesfully
# found, it proceeds downloading the statistics in a simil-json format with
# fetch_video_insights(opener, videoID, token). Then if the statistic's simil-json
# is valid (contains "data" keyword), it proceeds with the parsing and inserting
# phases.
# Return: it returns 1 if the video is provided with statistics, 0 otherwise
#***************************************************************************************#
def launchScraper (videoID):
try:
print 'Video id: '+str(videoID)
opener = create_opener()
video_page = fetch_video_html(opener, videoID)
token = get_insight_ajax_token(video_page)
if not token:
print "No session token"
raise ValueError("Couldn't find session_token in %s" % video_page)
data = fetch_video_insights(opener, videoID, token)
if '"data":' in data:
extractVideoStatistics(videoID, data)
return 1
else:
print "No statistics available for this video"
return 0
except:
return 0
#***************************************************************************************#
# Function: getLineNumber
# Parameters: filename
# How it works: this function returns the number of lines contained in the file from
# which we get the random word to be used as keyword
# Return: it returns the number of the lines in the file
#***************************************************************************************#
def getLineNumber(filename):
fh = open(filename, "r")
lineNum = len(fh.readlines())
fh.close()
return lineNum
#***************************************************************************************#
# Function: getRandomLine
# Parameters: filename, lineNum
# How it works: this function gets the name of the file containing the words to be used
# as keywords and the number of lines of that file and extracts a random
# keyword using random.randrange().
# Return: returns a keyword
#***************************************************************************************#
def getRandomLine(filename, lineNum):
randomNumber = random.randrange(0,lineNum+1)
return linecache.getline(filename,randomNumber).split(" ")[0].replace("_"," ").split(" ")[0]
#***************************************************************************************#
# Function: extractGplusStatistics
# Parameters: videoID
# How it works: knowing the ID of the video to be analyzed, this function extracts the
# Gplus activities related to that video from Ripples. It uses a regex in
# order to extract each activity from the simil-json returned by Ripple.
# Return: returns a list containing all the activities
#***************************************************************************************#
def extractGplusStatistics (videoID):
content = urllib2.urlopen(yt_gplus_url(videoID))
parsedHtmlContent = BeautifulSoup(content.read())
scripts=parsedHtmlContent.findAll('script')
gplusentry=[]
for script in scripts:
if 'OZ_ripplesData' in str(script):
for single in re.findall(r'\["(?P<name>[^"]*)","(?P<number1>[^"]*)","(?P<data>[^"]*)",(?P<number2>\d*),"(?P<website>[^"]*)","(?P<language>[^"]*)"\]', str(script.contents)):
if 'youtube' in single[4]:
gplusdata={}
gplusdata['authorName']=single[0]
gplusdata['authorID']=single[1]
gplusdata['activityID']=single[2]
gplusdata['activityTimestamp']=single[3]
gplusdata['activityType']='share'
gplusdata['activityLanguage']=single[5]
else:
gplusdata={}
gplusdata['authorName']=single[0]
gplusdata['authorID']=single[1]
gplusdata['activityID']=single[2]
gplusdata['activityTimestamp']=single[3]
gplusdata['activityReshared']=single[4]
gplusdata['activityType']='reshare'
gplusdata['activityLanguage']=single[5]
gplusentry.append(gplusdata)
for single in re.findall(r'\["(?P<name>[^"]*)","(?P<number1>[^"]*)","(?P<data>[^"]*)",(?P<number2>\d*),,"(?P<language>[^"]*)"\]', str(script.contents)):
gplusdata={}
gplusdata['authorName']=single[0]
gplusdata['authorID']=single[1]
gplusdata['activityID']=single[2]
gplusdata['activityTimestamp']=single[3]
gplusdata['activityType']='share'
gplusdata['activityLanguage']=single[4]
gplusentry.append(gplusdata)
return gplusentry
#***************************************************************************************#
# Function: downloadWordDataset
# Parameters: none
# How it works: it downloads the dataset if it doesn't exist
# Return: downloads the word database file if doesn't exist
#***************************************************************************************#
def downloadWordsDataset ():
file_name = "index.noun"
if os.path.isfile(file_name) and os.access(file_name, os.R_OK):
print "Words' dataset file exists and is readable"
else:
print "Words' dataset file does not exist"
url = "http://trac.csail.mit.edu/jwi/export/97/edu.princeton.wordnet/dict/2.1.10000/index.noun"
u = urllib2.urlopen(url)
f = open(file_name, 'wb')
meta = u.info()
file_size = int(meta.getheaders("Content-Length")[0])
print "Downloading: %s Bytes: %s" % (file_name, file_size)
file_size_dl = 0
block_sz = 8192
while True:
buffer = u.read(block_sz)
if not buffer:
break
file_size_dl += len(buffer)
f.write(buffer)
status = r"%10d [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
status = status + chr(8)*(len(status)+1)
print status,
f.close()