/
loopnet_com.py
174 lines (152 loc) · 7.05 KB
/
loopnet_com.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import csv
import requests
import threading
from bs4 import BeautifulSoup
# Request headers passed to every requests.get() call in this module; the
# User-Agent string imitates a desktop Chrome browser on Windows.
HEADERS = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
def format_properly(string):
    """Return *string* as single-line text with all line breaks removed.

    The previous implementation round-tripped the text through
    ``str(bytes)`` and then stripped the ``b'...'`` wrapper with
    ``str.replace``.  That corrupted the value whenever the text contained
    an apostrophe (the repr switches to ``b"..."`` so the wrapper leaked
    into the result), deleted every ``b'`` sequence and every apostrophe
    inside the text, and turned non-ASCII characters into ``\\xNN`` escape
    sequences.  The text is now kept as text and only carriage returns and
    newlines are dropped.

    Args:
        string: The ``str`` to clean up (for example the ``.text`` of a
            parsed HTML element).

    Returns:
        The same text without ``\\r`` / ``\\n`` characters.  The result may
        contain non-ASCII characters, so files it is written to should be
        opened with a Unicode-capable encoding such as UTF-8.
    """
    return string.replace("\r", "").replace("\n", "")
class loopnet_com():
    """Scraper that walks loopnet.com result pages and records each listing.

    Instantiating the class immediately starts scraping: ``__init__`` calls
    :meth:`scrape_page`, which follows ``link``, ``link/2``, ``link/3`` ...
    until a page without ``app-listing-showcase`` elements is returned.
    Every listing on a page is fetched in its own thread and written as one
    CSV row to ``results.csv`` (mode ``'f'``) or ``changes.csv`` (any other
    mode, only for listings whose URL is not yet in ``results.csv``).
    """

    # The csv writers are shared by all per-listing threads of a page, so
    # rows are written under this lock to keep them from interleaving.
    _write_lock = threading.Lock()

    # Position of the listing URL inside a written row (see scrape_result).
    _LINK_COLUMN = 15

    # (label fragment, field name) pairs for the summary table.  They are
    # checked in this order and the first fragment found in a row label wins.
    _FINANCIAL_LABELS = (
        ("asking price", "price"),
        ("revenue", "revenue"),
        ("ff&e", "ffe"),
        ("year", "year"),
        ("cash flow", "cash_flow"),
        ("inventory", "inventory"),
        ("ebitda", "ebitda"),
    )

    def __init__(self, link, mode):
        """Start scraping at ``link``; see :meth:`scrape_page` for ``mode``."""
        self.scrape_page(link, mode)

    @staticmethod
    def _listing_links(cards):
        """Return the absolute listing URL of every card that has a link.

        Cards without an ``<a href=...>`` are skipped instead of raising.
        """
        links = []
        for card in cards:
            anchor = card.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href:
                links.append("https://www.loopnet.com" + href)
        return links

    @classmethod
    def _read_known_links(cls, path):
        """Return the set of listing URLs already stored in the CSV at ``path``.

        A missing file yields an empty set, and rows too short to contain
        the URL column are ignored.
        """
        try:
            with open(path, "r", newline='', encoding="utf-8",
                      errors="replace") as results_file:
                return {
                    row[cls._LINK_COLUMN]
                    for row in csv.reader(results_file)
                    if len(row) > cls._LINK_COLUMN
                }
        except FileNotFoundError:
            return set()

    @staticmethod
    def _split_location(text):
        """Split ``"region, state"`` text into a ``(region, state)`` tuple.

        Text without a comma is treated as a state only and the region is
        reported as ``"n/a"``.  Surrounding whitespace is stripped so the
        state no longer carries the space that follows the comma.
        """
        parts = [part.strip() for part in text.split(',')]
        if len(parts) == 1:
            return "n/a", parts[0]
        return parts[0], parts[1]

    # scrape listings page
    def scrape_page(self, link, mode):
        """Scrape every result page starting at ``link``.

        Args:
            link: URL of the first result page; page ``n`` (n >= 2) is
                requested as ``link + "/" + str(n)``.
            mode: Passed through to :meth:`scrape_result`; ``'f'`` (any
                case) appends every listing to ``results.csv``, anything
                else appends only unknown listings to ``changes.csv``.
        """
        permalink = link
        page_number = 0
        while True:
            page_number += 1
            # get next link
            if page_number != 1:
                link = permalink + "/" + str(page_number)

            # make request and generate code soup
            page = requests.get(link, headers=HEADERS, timeout=30)
            soup = BeautifulSoup(page.content, 'html.parser')

            # break if no results found
            showcase_cards = soup.find_all('app-listing-showcase')
            if not showcase_cards:
                break

            # get all result links (diamond listings first, then showcase)
            results = self._listing_links(soup.find_all('app-listing-diamond'))
            results.extend(self._listing_links(showcase_cards))

            # read current data: URLs that are already in results.csv
            data = self._read_known_links("results.csv")

            # Both files stay open until every worker thread has finished.
            with open("changes.csv", "a", newline='', encoding="utf-8") as changesFile, \
                    open("results.csv", "a", newline='', encoding="utf-8") as resultsFile:
                changesWriter = csv.writer(changesFile)
                csvWriter = csv.writer(resultsFile)

                # open new thread for each result on the page
                threads = [
                    threading.Thread(
                        target=self.scrape_result,
                        args=(result, csvWriter, changesWriter, mode, data),
                    )
                    for result in results
                ]
                for thread in threads:
                    thread.start()

                # wait for all threads to execute
                for thread in threads:
                    thread.join()

    # scrape listing
    def scrape_result(self, result, csvWriter, changesWriter, mode, data):
        """Fetch one listing page and write its details as a CSV row.

        Args:
            result: Absolute URL of the listing.
            csvWriter: ``csv.writer`` for ``results.csv`` (used in mode 'f').
            changesWriter: ``csv.writer`` for ``changes.csv`` (other modes).
            mode: ``'f'`` (any case) writes every listing through
                ``csvWriter``; any other value writes only listings whose
                URL is not in ``data`` through ``changesWriter``.
            data: Container of already known listing URLs; only ``in`` is
                used on it.

        Pages without a title or description block (ads, franchises) are
        skipped.  Any other missing element is recorded as ``"n/a"``
        instead of aborting the thread and losing the row.
        """
        print(result)

        # make request and generate code soup
        page = requests.get(result, headers=HEADERS, timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')

        source = "Loopnet.com"

        # title: <h1><span> inside the "imageContact" container
        full_listing = soup.find(class_="imageContact")
        heading = full_listing.find("h1") if full_listing is not None else None
        title_span = heading.find('span') if heading is not None else None

        # description
        description_tag = soup.find(class_="col-parent col-12 mobile-col-6 tablet-col-6 summary text-light descriptionAd")
        if title_span is None or description_tag is None:
            return  # ad or franchise
        title = format_properly(title_span.text)
        description = format_properly(description_tag.text)

        # get top results: label/value pairs of the summary table(s)
        financials = {field: "n/a" for _, field in self._FINANCIAL_LABELS}
        for body in soup.find_all('tbody'):
            for tr in body.find_all('tr'):
                tds = tr.find_all('td')
                if len(tds) < 2:
                    continue
                label_span = tds[0].find('span')
                value_span = tds[1].find('span')
                if label_span is None or value_span is None:
                    continue
                label = label_span.text.lower()
                for fragment, field in self._FINANCIAL_LABELS:
                    if fragment in label:
                        financials[field] = format_properly(value_span.text)
                        break

        # get location
        state = "n/a"
        region = "n/a"
        location = soup.find(class_="col-12 col-parent locationHeight")
        loc_divs = location.find_all('div') if location is not None else []
        if len(loc_divs) >= 2 and "location" in loc_divs[0].text.lower():
            region, state = self._split_location(format_properly(loc_divs[1].text))

        # get business details
        real_estate = "n/a"
        reason = "n/a"
        employees = "n/a"
        for detail in soup.find_all(class_="col-12 col-parent detailInformationHeight"):
            divs = detail.find_all('div')
            if len(divs) < 2:
                continue
            label = divs[0].text.lower()
            if "real estate" in label:
                real_estate = format_properly(divs[1].text)
            elif "employees" in label:
                employees = format_properly(divs[1].text)
            elif "reason for selling" in label:
                reason = format_properly(divs[1].text)

        # contact: concatenated text of the broker name element's children
        broker = soup.find(class_='broker-profile-name')
        if broker is None:
            contact = "n/a"
        else:
            try:
                contact = "".join(format_properly(child.text) for child in broker)
            except AttributeError:
                # a child node that exposes no ``.text``
                contact = "n/a"

        # phone
        phone_tag = soup.find(class_='profile-phone')
        phone = format_properly(phone_tag.text) if phone_tag is not None else "n/a"

        # The listing URL must stay at index _LINK_COLUMN of the row.
        row = [
            source, state, region, title, description, real_estate, reason,
            employees, financials["year"], financials["price"],
            financials["revenue"], financials["ebitda"],
            financials["cash_flow"], financials["inventory"],
            financials["ffe"], result, contact, phone,
        ]

        # write data to csv file
        if mode.lower() == 'f':
            # mode 'f': changes are not taken into consideration
            writer = csvWriter
        elif result not in data:
            # other modes: only listings not seen before are written
            writer = changesWriter
        else:
            return
        with self._write_lock:
            writer.writerow(row)