-
Notifications
You must be signed in to change notification settings - Fork 6
/
piladelphia_libraries.py
123 lines (116 loc) · 4.3 KB
/
piladelphia_libraries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# -*- coding: utf-8 -*-
import scraperwiki
#import urllib2
import requests
#import json
import lxml.html
import time
import re
# Branch index page; the script fetches it and follows each branch link on it.
starturl = 'http://libwww.freelibrary.org/branches/brnlist.cfm'
# Captures from "PA <digit>" up to (not including) the last " (" in the text.
# Raw strings keep "\d" and "\(" as regex escapes, not string escapes.
regex_zip = re.compile(r"(PA \d.+) \(")
# Captures everything that precedes the last " Philadelphia" in the text.
regex_street = re.compile(r"(.*) Philadelphia")
# Day names in the order the hours table rows are read (row 0 .. row 6).
_WEEKDAYS = ('sunday', 'monday', 'tuesday', 'wednesday', 'thursday',
             'friday', 'saturday')


def _split_hours(hours):
    """Split one day's hours text into an ``(open, close)`` pair.

    Text that is exactly ``'CLOSED'`` yields ``('CLOSED', 'CLOSED')``;
    anything else is split on ``'-'`` and the first two pieces are returned.
    """
    if hours == 'CLOSED':
        return 'CLOSED', 'CLOSED'
    pieces = hours.split('-')
    return pieces[0], pieces[1]


def process_library(link):
    """Scrape one branch page and save its details.

    Fetches ``link`` with ``get``, extracts the branch name, address, phone
    and the seven opening-hours cells, then saves a single record to the
    ``philadelphia_libraries`` table with ``branch`` passed as the key list.

    Args:
        link: URL of the branch page to scrape.

    Raises:
        IndexError: if an expected element is missing from the page, the
            cleaned address matches neither ``regex_zip`` nor
            ``regex_street``, or a non-'CLOSED' hours cell has no '-'.
    """
    html = get(link)
    root = lxml.html.fromstring(html)
    branch = root.xpath('/html/body/table/tr[4]/td[2]/div/b[3]')[0].text

    # Two elements matching id="globaltext" are used: the first for the
    # address text, the second for the contact text.
    text_blocks = root.xpath('//div[@id="globaltext"]')
    address = text_blocks[0]
    phone = text_blocks[1]

    # NOTE(review): lstrip/rstrip remove any run of the listed *characters*,
    # not the literal label, so an address that begins or ends with one of
    # those characters would lose it. Kept unchanged because the live page
    # text cannot be checked from here.
    full_address = re.sub(r"\s+", " ", address.text_content())
    full_address = full_address.lstrip('Street Address')
    full_address = full_address.rstrip('Directions | Map | Bus Schedule ')
    zip_code = regex_zip.findall(full_address)[0]
    street = regex_street.findall(full_address)[0]
    phone = re.sub(r"\s+", " ", phone.text_content()).lstrip('Contact ')

    # One table row per day; the second cell of each row holds the hours.
    openings = root.xpath('//div[@id="redlinks"]/table/tr')
    day_hours = [openings[row][1].text for row in range(len(_WEEKDAYS))]

    record = {}
    record['full_address'] = full_address
    record['zip'] = zip_code
    record['street'] = street
    record['phone'] = phone
    record['branch'] = branch
    for day, hours in zip(_WEEKDAYS, day_hours):
        record[day] = hours
    # Every day gets both an _open and a _close value. The copy-pasted
    # version never set tuesday_close for a closed Tuesday and then failed
    # with NameError when building the record.
    for day, hours in zip(_WEEKDAYS, day_hours):
        day_open, day_close = _split_hours(hours)
        record[day + '_open'] = day_open
        record[day + '_close'] = day_close

    scraperwiki.sqlite.save(['branch'], data=record,
                            table_name='philadelphia_libraries', verbose=2)
    # Single-argument form prints the same text under Python 2 and Python 3.
    print('done with  %s' % branch)
def process_list(html):
    """Scrape every branch page linked from the branch index HTML.

    Args:
        html: Markup of the branch index page.

    Each matching anchor's ``href`` is appended to the branches base URL and
    handed to ``process_library``; a final message is printed at the end.
    """
    branches_base = 'http://libwww.freelibrary.org/branches/'
    document = lxml.html.fromstring(html)
    anchors = document.xpath('//td/div/table/tr/td/div/a')
    for anchor in anchors:
        process_library(branches_base + anchor.attrib['href'])
    print(' All done ')
def get(url, timeout=60):
    """Request ``url`` and return the raw response body.

    The request is sent with ``requests.post`` using a browser-like
    User-agent header, with redirects allowed.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server to respond before requests
            raises instead of blocking; without it a stalled server would
            hang the whole scrape.

    Returns:
        The response's ``content`` attribute (the undecoded body).

    Raises:
        requests.exceptions.Timeout: if the server does not respond within
            ``timeout`` seconds.
    """
    headers = {
        'User-agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
                      'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1)',
    }
    # NOTE(review): pages are fetched with POST even though the helper is
    # named ``get``; kept as-is because the server's behaviour cannot be
    # verified from here.
    response = requests.post(url, headers=headers, allow_redirects=True,
                             timeout=timeout)
    return response.content
# Script entry: fetch the branch index page, then scrape each branch it links to.
html = get(starturl)
process_list(html)