/
scraper.py
123 lines (107 loc) · 3.14 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# PYTHON GENIUS SCRAPER
import os
import warnings

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())
# Genius API access token, read from the GENIUS_API_KEY environment variable.
# It is None when the variable is not set.
GENIUS_API_KEY = os.environ.get('GENIUS_API_KEY')

# Root of the Genius API.  HTTPS is used so the bearer token below is never
# sent over the wire in clear text.
base_url = 'https://api.genius.com'

# Headers sent with every API request.  The Authorization header is only
# added when a token is available: concatenating 'Bearer ' with None would
# raise an unhelpful TypeError at import time, so warn explicitly instead.
headers = {}
if GENIUS_API_KEY:
    headers['Authorization'] = 'Bearer ' + GENIUS_API_KEY
else:
    warnings.warn(
        'GENIUS_API_KEY is not set; Genius API requests will be sent '
        'without an Authorization header'
    )
# Artist names used as search queries, one search per entry, in this order.
artists = [
    'Britney Spears',
    'Queen',
    'OneRepublic',
    'Whitney Houston',
    'Stevie Wonder',
    'Bon Jovi',
    'Avril Lavigne',
    'Carly Rae Jepsen',
    'David Bowie',
    'Amy Winehouse',
    'Christina Aguilera',
    'Gwen Stefani',
    'Coldplay',
    'Frank Sinatra',
    'Celine Dion',
    'Backstreet Boys',
    'Janet Jackson',
    'Jennifer Lopez',
    'Meghan Trainor',
    'Ellie Goulding',
    'Nelly Furtado',
    'Justin Bieber',
    'Katy Perry',
    'Bruno Mars',
    'Beyonce',
    'Lorde',
    'The Weeknd',
    'John Legend',
    'Rihanna',
    'Lady Gaga',
    'Usher',
    'Miley Cyrus',
    'Taylor Swift',
    'Major Lazer',
    'One Direction',
    'Ed Sheeran',
    'Sia',
    'Ariana Grande',
    'Calvin Harris',
    'Mariah Carey',
    'Madonna',
    'Elton John',
    'The Beatles',
    'Michael Jackson',
    'Bee Gees',
    'Prince',
    'Maroon 5',
    'The Black Eyed Peas',
    'P!NK',
    'TLC',
    'R. Kelly',
    'Kelly Clarkson',
    'Justin Timberlake',
    'Alessia Cara',
    'Shawn Mendes',
    'Hailee Steinfeld',
    'Jason Derulo',
    'Adele',
    'Zedd',
    'Train',
    'Selena Gomez',
    'Kygo',
]
def get_lyrics(song_api_path):
    """Fetch one song's lyrics and append them to ``data/input.txt``.

    The song is first looked up through the API (``base_url`` +
    ``song_api_path``) to obtain the path of its public page; that page is
    then downloaded and the text of its ``<div class="lyrics">`` element is
    appended, UTF-8 encoded, to ``data/input.txt``.

    Args:
        song_api_path: API path of the song, as found in the ``api_path``
            field of a search hit.

    Returns:
        None. When the page has no ``<div class="lyrics">`` element a
        message is printed and nothing is written.

    Raises:
        requests.HTTPError: if the API or the song page answers with an
            error status.
    """
    song_url = base_url + song_api_path
    response = requests.get(song_url, headers=headers)
    # Fail with a clear HTTP error instead of a KeyError on the error payload.
    response.raise_for_status()
    path = response.json()['response']['song']['path']

    page_url = 'https://genius.com' + path
    page = requests.get(page_url)
    print('Page %s' % page)
    page.raise_for_status()

    html = BeautifulSoup(page.text, 'html.parser')
    print('HTML %s' % html)

    # Drop <script> elements so their source does not end up in the text.
    for script in html('script'):
        script.extract()

    # NOTE(review): relies on the page exposing a <div class="lyrics">;
    # confirm this still matches the current page markup.
    lyrics_div = html.find('div', {'class': 'lyrics'})
    if lyrics_div is None:
        # find() returns None when nothing matches; skip this song rather
        # than crash the whole run with an AttributeError.
        print('No lyrics found at %s' % page_url)
        return

    lyrics = lyrics_div.get_text()
    print('Lyrics %s' % lyrics)

    # Make sure the output directory exists before appending to the file.
    if not os.path.isdir('data'):
        os.makedirs('data')

    # Binary append mode: the text is encoded explicitly, so the same bytes
    # are written on both Python 2 and Python 3.
    with open('data/input.txt', 'ab') as f:
        f.write(lyrics.encode('utf-8'))
if __name__ == "__main__":
    # For every artist, run one search against the API and scrape the lyrics
    # of each hit that the search returns.
    search_url = base_url + '/search'
    for artist_name in artists:
        # Pass the artist as a query parameter so that requests URL-encodes
        # spaces and punctuation in the name.
        response = requests.get(
            search_url, params={'q': artist_name}, headers=headers
        )
        # The request headers are deliberately not printed: they carry the
        # bearer token and would leak it into stdout and any captured logs.
        print('Base URL: %s' % response.url)
        # Stop with a clear HTTP error rather than a KeyError further down.
        response.raise_for_status()

        for hit in response.json()['response']['hits']:
            api_path = hit['result']['api_path']
            print(api_path)
            get_lyrics(api_path)