-
Notifications
You must be signed in to change notification settings - Fork 6
/
kettering_planning_applications.py
159 lines (137 loc) · 6.21 KB
/
kettering_planning_applications.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# -*- coding: utf-8 -*-
import scraperwiki
import lxml.html
#import requests
from datetime import date, datetime
from dateutil.relativedelta import relativedelta
from pytz import timezone
from collections import defaultdict
from lxml.html import fromstring,tostring
import re
# http://www.kettering.gov.uk/site/scripts/planning_list.php?fromDay=05&fromMonth=04&fromYear=2012&toDay=12&toMonth=04&toYear=2012&submitDateWeekly=Go
DATE_FORMAT = "%d/%m/%Y"
tz = timezone('Europe/London')
rfc3339_date = "%Y-%m-%dT%H:%M:%SZ"
minus_60_days = (date.today() + relativedelta( days = -60 )).strftime(DATE_FORMAT)
test_date = '01/01/1998'
today_date = date.today().strftime(DATE_FORMAT)
is_date_regex = re.compile("^(3[01]|[12][0-9]|0[1-9])/(1[0-2]|0[1-9])/[0-9]{4}$") # match dd/mm/yyyy, requiring leading zeros (http://answers.oreilly.com/topic/226-how-to-validate-traditional-date-formats-with-regular-expressions/)
replacements = {' ':'_','(':'',')':'','º'.decode('utf-8'):''}
def replace_all(text, dic):
for i, j in dic.iteritems():
text = text.replace(i, j)
return text
def table_to_list(table):
dct = table_to_2d_dict(table)
return list(iter_2d_dict(dct))
def table_to_2d_dict(table):
result = defaultdict(lambda : defaultdict(unicode))
for row_i, row in enumerate(table.xpath('./tr')):
for col_i, col in enumerate(row.xpath('./td|./th')):
colspan = int(col.get('colspan', 1))
rowspan = int(col.get('rowspan', 1))
col_data = col.text_content()
while row_i in result and col_i in result[row_i]:
col_i += 1
for i in range(row_i, row_i + rowspan):
for j in range(col_i, col_i + colspan):
result[i][j] = col_data
return result
def iter_2d_dict(dct):
for i, row in sorted(dct.items()):
cols = []
for j, col in sorted(row.items()):
cols.append(col)
yield cols
def get_dt(date_string, date_format=DATE_FORMAT): # taken from scraperwiki.com/scrapers/utility_library/
try:
dt = datetime.strptime(date_string, date_format)
return dt.date()
except:
return None
def test_dt(date_to_test, ref_date, date_format=DATE_FORMAT):
try:
test_dt = datetime.strptime(date_to_test, date_format)
ref_dt = datetime.strptime(ref_date, date_format)
if test_dt > ref_dt:
return 1
elif test_dt < ref_dt:
return -1
elif test_dt == ref_dt:
return 0
else:
return None
except:
return None
def process(uid, url):
record = {}
record['url'] = url
record['uid'] = uid
record['date_scraped'] = datetime.now(tz).strftime(rfc3339_date)
html = scraperwiki.scrape(url)
doc = lxml.html.fromstring(html)
#Application details
for table_el in doc.xpath('//table[@summary="Planning application Details"]/tbody'):
table = table_to_list(table_el)
for item in table[:-2]:
#if item[0] in allowed_keys:
record[replace_all(item[0].strip().lower(),replacements)] = item[1].strip()
#Application dates
for table_el in doc.xpath('//table[@summary="key application dates"]/tbody'):
table = table_to_list(table_el)
for item in table:
if is_date_regex.search(item[1].strip()):
record[replace_all(item[0].strip().lower(),replacements)] = str(get_dt(str(item[1].strip()),DATE_FORMAT))
else:
record[str(replace_all(item[0].strip().lower(),replacements))] = item[1].strip()
#Rename keys that need renaming
if 'location' in record:
record['address'] = record.pop('location')
if 'proposal' in record:
record['description'] = record.pop('proposal')
if 'decided_on' in record:
record['decision_date'] = record.pop('decided_on')
if 'validation/registration_date' in record:
record['date_validated'] = record.pop('validation/registration_date')
if 'neighbours/consultations_sent_on' in record:
record['neighbour_consultation_start_date'] = record.pop('neighbours/consultations_sent_on')
if 'expiry_date_for_neighbours/consultations' in record:
record['neighbour_consultation_end_date'] = record.pop('expiry_date_for_neighbours/consultations')
if 'advertised_on' in record:
record['last_advertised_date'] = record.pop('advertised_on')
#Start date
if record['date_received']:
record['start_date'] = record['date_received']
else:
record['start_date'] = record['date_validated']
scraperwiki.sqlite.save(unique_keys=['uid'], data=record, verbose=0)
def get(from_date,to_date,url,counter):
if counter == '':
counter = 0
if url == '':
url = 'http://www.kettering.gov.uk/site/scripts/planning_list.php?fromDay=%s&fromMonth=%s&fromYear=%s&toDay=%s&toMonth=%s&toYear=%s&submitDateWeekly=Go' % (from_date.split('/')[0],from_date.split('/')[1],from_date.split('/')[2],to_date.split('/')[0],to_date.split('/')[1],to_date.split('/')[2])
#url = 'http://www.kettering.gov.uk/site/scripts/planning_list.php?fromDay=01&fromMonth=04&fromYear=2004&toDay=08&toMonth=04&toYear=2004&submitDateWeekly=Go'
results = scraperwiki.scrape(url)
root = lxml.html.fromstring(results)
table = root.xpath('//table/tbody/tr')
for tr in table:
uid = tr[0][0].text.strip()
detail_url = tr[0][0].attrib['href']
process(uid, detail_url)
counter = counter+1
if root.xpath('//a[contains(text(),"Next")]'):
get('','',root.xpath('//a[contains(text(),"Next")]')[0].attrib['href'],counter)
else:
print 'Done. Fetched ',counter,' records.'
print 'Fetching last 60 days'
get(minus_60_days,today_date,'','')
last_old = scraperwiki.sqlite.get_var('last_old')
minus_60_old_days = (datetime.strptime(last_old,DATE_FORMAT) + relativedelta( days = -60 )).strftime(DATE_FORMAT)
if test_dt(last_old, test_date) == 1:
print 'Fetching 60 more days of old ones, starting from', last_old, ' and going to ', minus_60_old_days
get(minus_60_old_days,last_old,'','')
scraperwiki.sqlite.save_var('last_old', minus_60_old_days)
else:
print 'Old collection is permanently done!'
print
print '--- Done for today ---'