/
youtube_video_scraper.py
85 lines (62 loc) · 3.2 KB
/
youtube_video_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from urllib.request import urlopen
from urllib.request import build_opener
from bs4 import BeautifulSoup
import re
import random
import pprint
import httplib2
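# Project plan / progress notes:
#   1. build the list of all user names: done
#   2. interpolate each channel name into the "most popular videos" URL
#   3. scrape that page for the titles/URLs of the videos: done
#   4. visit each video link and run the per-video parsing further below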
# Aliases for YouTubers whose channel identifiers are opaque, hash-looking
# strings rather than readable user names.
PONY_SYNDROME = 'UCT-_4GqC-yLY1xtTHhwY0hA'
JAMES_CHARLES = 'UCucot-Zp428OwkyRm2I7v2Q'
NYMA_TANG = 'UCroDJPcFCf6DBmHns6Xeb8g'
ALYSSA_FOREVER = 'UCNEwha2SIAz3NTtv9G0QPsg'
JASMINE_BROWN = 'UCw95JvOs39snnMPkYs-6Sog'

# 25 youtube handles of bloggers that are white/white-passing
# (based on my perception)
WP_YOUTUBER_NAMES = [
    'jeffreystar',
    'jaclynhill',
    'macbby11',
    'nikkietutorials',
    'laura88lee',
    'pixiwoo',
    'kandeejohnson',
    'zoella280390',
    'makeupgeektv',
    'stilaBabe09',
    'shaaanxo',
    'ChloeMorello',
    'Laurenbeautyy',
    'Missglamorazzi',
    'AllThatGlitters21',
    'Juicystar07',
    'MannyMua733',
    'GlamLifeGuru',
    'CutiePieMarzia',
    # NOTE(review): was 'Kathleen Lights'. These names are meant to be
    # interpolated into a URL, where an embedded space is invalid, so the
    # space was removed -- confirm against the actual channel page.
    'KathleenLights',
    'pixi2woo',
    'CarliBel55',
    JAMES_CHARLES,
    'HauteBrilliance',
    'SierraMarieMakeup',
]

# 25 youtube handles of bloggers that are disenfranchised in the beauty
# community/darker skinned (based on my perception)
DK_YOUTUBER_NAMES = [
    'iamkareno',
    'theepatrickstarrr',
    'wwwengie',
    'bubzbeauty',
    'itsalissaweekly',
    'mylifeaseva',
    'Dope2111',
    PONY_SYNDROME,
    'MichellePhan',
    'itsmyRayeRaye',
    'BritPopPrincess',
    'DulceCandy87',
    'AndreasChoice',
    'macbarbie07',
    'ThatsHeart',
    'SmartistaBeauty',
    NYMA_TANG,
    'beautycrush',
    ALYSSA_FOREVER,
    JASMINE_BROWN,
    'Cydbeats',
    'Irishcel507',
    'clothesencounters',
    'TTLYTEALA',
    'makeupbytinayong',
]
# for testing: draw 10 names at random from the combined pool of both lists.
# random.choice samples with replacement, so the same name may appear twice.
# The pool is concatenated once rather than being rebuilt on every draw.
_ALL_YOUTUBER_NAMES = DK_YOUTUBER_NAMES + WP_YOUTUBER_NAMES
SAMPLE_YOUTUBE_NAMES = [random.choice(_ALL_YOUTUBER_NAMES) for _ in range(10)]
# INDIVIDUAL YOUTUBE LINK PARSING
# Download a single watch page and extract its view count from the
# <div class="watch-view-count"> element.
youtube_path = "https://www.youtube.com/watch?v=2FQReUmJRwg"

# Use the response as a context manager so the HTTP connection is closed as
# soon as the document has been parsed, instead of being left open.
with urlopen(youtube_path) as page:
    soup = BeautifulSoup(page, 'html.parser')

# The original called soup.prettify() here and threw the returned string away;
# prettify() does not modify the soup, so that call had no effect and was
# removed. The debug dump of the parsed document is kept unchanged.
print(soup)

# find() returns None when the page has no such element. Handle that case
# explicitly rather than stringifying None, and read digits from the element's
# text only, so numbers inside the tag markup can never leak into the count.
view_count_div = soup.find('div', {"class": "watch-view-count"})
spans = view_count_div.get_text() if view_count_div is not None else ''

# Keep digits only ("9,718,233 views" -> "9718233"); the result is an empty
# string when the view-count element was not found.
views_count = re.sub(r'[^0-9]', '', spans)
print(views_count)
#parses through webpage and cleans data to find view count of video
# un_views_count = soup.find_all('div',{'id':'count'})
# print(un_views_count)
# for view in un_views_count:
# count = view.get("span")  # example element: <div class="watch-view-count">9,718,233 views</div>
# print(count)
# type = str(soup.find('div', 'watch-view-count'))
# print(type)
# views_count = re.sub('[^0-9]','', type)
# views_count = re.sub(',', '', views_count)
# print(views_count)
# #parses through webpage and cleans data to get dislike counts
# un_dislike_count = str(soup.find('button', title="I dislike this", type="button"))
# dislike_count = re.sub('[^0-9,]',' ', un_dislike_count)
# dislike_count = re.sub(',', '', dislike_count).split()
# #parses through webpage and cleans data to get likes counts
# un_likes_amount = str(soup.find('button', title="I like this", type='button'))
# likes_count = re.sub('[^0-9,]',' ', un_likes_amount)
# likes_count = re.sub(',', '', likes_count).split()
# print("dislikes:", int(dislike_count[0]))
# print("likes:", int(likes_count[0]))
# print('views:', views_count)
# print(len(WP_YOUTUBER_NAMES))
# print(len(DK_YOUTUBER_NAMES))