/
daft.py
109 lines (94 loc) · 3.38 KB
/
daft.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import re
import time
import random
import logging
import hashlib
import warnings
import argparse
from datetime import datetime
from daftlistings import Daft, RentType
from orm import DataController
from letmecrawl import letmecrawl
def main(args):
    """Crawl Daft rental listings forever and store them.

    Each pass shuffles the rent types, then for every rent type opens a
    ``DataController`` on ``args.connection_string`` and, for every proxy
    produced by ``letmecrawl()``, inserts each document returned by
    ``documents`` for that rent type.

    Any ``Exception`` raised while handling a rent type is logged, followed
    by a 60 second sleep, after which the next rent type is tried.

    Args:
        args: object exposing a ``connection_string`` attribute.
    """
    kinds = list(RentType)
    while True:
        # Visit the rent types in a different order on every pass.
        random.shuffle(kinds)
        for kind in kinds:
            try:
                with DataController(con_str=args.connection_string) as store:
                    for proxy in letmecrawl():
                        proxy_url = 'http://{}:{}'.format(proxy.ip, proxy.port)
                        settings = {'proxy': proxy_url, 'timeout': 2}
                        for record in documents(kind, settings):
                            store.insert(record)
            except Exception as error:
                message = 'Unexpected error: {}. Sleeping a while.'.format(error)
                logging.error(message)
                time.sleep(60)
        # NOTE(review): the pause is taken once per full pass over all rent
        # types -- confirm this matches the intended loop nesting.
        logging.info('Resting a bit.')
        time.sleep(60 * 10)
def _flatten(groups):
    """Return the items of a list of lists as one flat list ([] if falsy)."""
    if not groups:
        return []
    return [item for group in groups for item in group]


def to_dict(listing, id):
    """Flatten a listing object into a plain dict.

    Args:
        listing: object exposing the ``get_*`` accessors called below.
        id: value stored under the ``'hash'`` key. The name shadows the
            builtin but is kept so existing callers keep working.

    Returns:
        dict holding the accessor values plus:
        ``price_number`` -- every digit of the price text read as one int,
        or None when the price is missing or has no digits;
        ``price_month`` -- ``price_number`` times 4 when the price text
        contains "week" (case-insensitive), otherwise ``price_number``;
        ``timestamp`` -- ``datetime.now()`` at the time of the call.
    """
    price = listing.get_price()
    price_month = None
    try:
        # Drop everything that is not a digit and parse what remains.
        price_number = int(re.sub(r'\D', '', price))
    except (TypeError, ValueError):
        # Best effort: price is not text or holds no digits at all.
        price_number = None
    else:
        weekly = 'week' in price.lower()
        price_month = price_number * 4 if weekly else price_number

    return {
        'hash': id,
        'price': price,
        'price_number': price_number,
        'price_month': price_month,
        'price_change': listing.get_price_change(),
        'viewings': listing.get_upcoming_viewings(),
        'facilities': _flatten(listing.get_facilities()),
        'features': listing.get_features(),
        'formalised_address': listing.get_formalised_address(),
        'address_line_1': listing.get_address_line_1(),
        'address_line_2': listing.get_address_line_2(),
        'town': listing.get_town(),
        'county': listing.get_county(),
        'listing_image': listing.get_listing_image(),
        'agent': listing.get_agent(),
        'agent_url': listing.get_agent_url(),
        'contact_number': listing.get_contact_number(),
        'daft_link': listing.get_daft_link(),
        'dwelling_type': listing.get_dwelling_type(),
        'posted_since': listing.get_posted_since(),
        'num_bedrooms': listing.get_num_bedrooms(),
        'num_bathrooms': listing.get_num_bathrooms(),
        'area_size': listing.get_area_size(),
        'timestamp': datetime.now(),
    }
def documents(rent_type, con_conf):
    """Yield one dict per listing that has a Daft link.

    Listings come from ``dwellings(rent_type, con_conf)``. Each one with a
    truthy ``get_daft_link()`` is converted with ``to_dict``, using the
    SHA-1 hex digest of its link as the id. Listings without a link are
    skipped.

    Args:
        rent_type: passed through to ``dwellings``.
        con_conf: passed through to ``dwellings``.

    Yields:
        dict as produced by ``to_dict``.
    """
    for dwelling in dwellings(rent_type, con_conf):
        url = dwelling.get_daft_link()
        if not url:
            continue
        # hashlib only accepts bytes on Python 3, so encode text links
        # first; links that are already bytes are hashed unchanged.
        raw = url if isinstance(url, bytes) else url.encode('utf-8')
        yield to_dict(dwelling, hashlib.sha1(raw).hexdigest())
def dwellings(rent_type, con_conf):
    """Yield every listing of ``rent_type`` for 'Dublin City', page by page.

    A ``Daft`` client is built from ``con_conf`` and queried repeatedly.
    After each page the offset is advanced by the number of listings
    received, and iteration stops once a page comes back empty.

    Args:
        rent_type: value handed to ``Daft.set_listing_type``.
        con_conf: value handed to ``Daft`` as ``con_conf``.

    Yields:
        the items returned by ``Daft.get_listings()``, in order.
    """
    position = 0
    client = Daft(con_conf=con_conf)
    client.set_listing_type(rent_type)
    client.set_county('Dublin City')
    client.set_offset(position)
    while True:
        page = client.get_listings()
        for item in page:
            yield item
        position += len(page)
        client.set_offset(position)
        # An empty page means there is nothing left to fetch.
        if not page:
            break
def parse_args(argv=None):
    """Parse the command-line options.

    Args:
        argv: optional list of argument strings. When None (the default),
            argparse reads the arguments from ``sys.argv``.

    Returns:
        argparse.Namespace with a ``connection_string`` attribute, which is
        None when the option is not given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--connection_string',
        type=str,
        help='connection string for the data store',
    )
    return parser.parse_args(argv)
if __name__ == '__main__':
    # Script entry point: silence all warnings, log timestamped messages at
    # INFO level, then hand the parsed options to main().
    warnings.simplefilter('ignore')
    logging.basicConfig(level=logging.INFO, format='%(asctime)s: %(message)s')
    cli_args = parse_args()
    main(cli_args)