import csv
import threading

import requests
from bs4 import BeautifulSoup

# request headers sent with every page fetch
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}

# lock so that rows written by different threads do not interleave in the csv files
WRITE_LOCK = threading.Lock()


# strip newlines and single quotes from scraped text via a round trip through the
# bytes repr; note that non-ASCII characters come back as escaped byte sequences
def format_properly(string):
    string = str(string.encode("utf8"))
    string = string.replace("\\n", "").replace("b'", "").replace("'", "")
    return string
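# For illustration: the newline is removed rather than replaced with a space,
# so adjacent words end up glued together, e.g.
#   format_properly("Business\nfor sale") -> "Businessfor sale"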

class businessesforsale_com():

    def __init__(self, link, mode):
        # collect the category links, then scrape every category
        categories_links = self.get_categories(link)
        self.scrape_category(categories_links, mode)
    # get all category links from the categories page
    def get_categories(self, link):
        # make request and generate code soup
        page = requests.get(link, headers=HEADERS, timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')
        # get the link of each category
        categories_links = []
        for category in soup.find_all(class_='cats-sector'):
            category_link = category.find('a')
            categories_links.append(category_link['href'])
        return categories_links
    # scrape every results page of each category
    def scrape_category(self, categories_links, mode):
        for link in categories_links:
            lindex = 0
            while True:
                lindex += 1
                # build the link of the next results page:
                # page 1 is the bare category link, page N is "<link>-N"
                if lindex == 1:
                    page_link = link
                else:
                    page_link = link + "-" + str(lindex)
                page = requests.get(page_link, headers=HEADERS, timeout=30)
                soup = BeautifulSoup(page.content, 'html.parser')
                # stop paging once a page comes back with no results
                if len(soup.find_all(class_='result')) == 0:
                    break
                # get all result links
                results = []
                for res in soup.find_all(class_='result'):
                    # take only the first link of each result
                    for result_link in res.find_all('a'):
                        results.append(result_link['href'])
                        break
                # read the listing links already stored in results.csv
                # (column 15 of every row holds the listing URL); start with an
                # empty list when the file does not exist yet
                data = []
                try:
                    with open("results.csv", "r", newline='') as resultsFile:
                        reader = csv.reader(resultsFile)
                        for d in reader:
                            if len(d) > 15:
                                data.append(d[15])
                except FileNotFoundError:
                    pass
                # process every result on the page
                with open("changes.csv", "a", newline='') as changesFile:
                    # generate writer object for changes file
                    changesWriter = csv.writer(changesFile)
                    with open("results.csv", "a", newline='') as resultsFile:
                        # generate writer object for results file
                        csvWriter = csv.writer(resultsFile)
                        # open a new thread for each result on the page
                        threads = []
                        for result in results:
                            thread = threading.Thread(target=self.scrape_result, args=(result, csvWriter, changesWriter, mode, data))
                            threads.append(thread)
                            thread.start()
                        # wait for all threads to finish before moving on
                        for thread in threads:
                            thread.join()
    # scrape a single listing page
    def scrape_result(self, result, csvWriter, changesWriter, mode, data):
        print(result)
        # make request and generate code soup
        page = requests.get(result, headers=HEADERS, timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')
        # source
        source = "Businessesforsale.com"
        # state
        try:
            state = soup.find(attrs={"itemprop": "addressRegion"}).text
        except AttributeError:
            state = "n/a"
        # region
        try:
            region = soup.find(attrs={"itemprop": "addressLocality"}).text
        except AttributeError:
            region = "n/a"
        try:
            # title
            title = format_properly(soup.find(attrs={"itemprop": "name"}).text)
            # description
            description = format_properly(soup.find(class_="listing-section-content").text)
        except AttributeError:
            # skip the page (ad / franchise page without a normal listing body)
            return
        # business details, defaulted to "n/a" when a field is missing
        real_estate = "n/a"
        reason = "n/a"
        employees = "n/a"
        year = "n/a"
        inventory = "n/a"
        ffe = "n/a"
        for inf in soup.find_all(class_='listing-details'):
            for detail in inf.find_all('dt'):
                # real estate
                if "real estate" in detail.text.lower():
                    real_estate = format_properly(inf.find('p').text)
                # reason for selling
                elif "reasons for selling" in detail.text.lower():
                    reason = format_properly(inf.find('p').text)
                # number of employees
                elif "employees" in detail.text.lower():
                    employees = inf.find('dd').text
                # years established
                elif "years established" in detail.text.lower():
                    year = inf.find('dd').text
                # inventory
                elif "inventory" in detail.text.lower():
                    inventory = format_properly(inf.find('dd').text)
                # furniture / fixtures value
                elif "furniture / fixtures value" in detail.text.lower():
                    ffe = format_properly(inf.find('dd').text)
        # asking price, defaulted to "n/a" when the listing does not show it
        try:
            price = format_properly(soup.find(class_='price').find('span').text)
        except AttributeError:
            price = "n/a"
        # sales revenue
        try:
            revenue = format_properly(soup.find(id='revenue').find('dd').text)
        except AttributeError:
            revenue = "n/a"
        # EBITDA (not collected, always "n/a")
        ebitda = "n/a"
        # cash flow
        try:
            cash_flow = format_properly(soup.find(id='profit').find('dd').text)
        except AttributeError:
            cash_flow = "n/a"
        # contact (broker name)
        try:
            contact = format_properly(soup.find(class_='broker-details').find('h4').text)
        except AttributeError:
            contact = "n/a"
        # phone (not collected, always "n/a")
        phone = "n/a"
        # collect the row and write it to the csv file; the lock keeps rows
        # written by different threads from interleaving
        listing_data = [source, state, region, title, description, real_estate, reason, employees, year, price, revenue, ebitda, cash_flow, inventory, ffe, result, contact, phone]
        # if mode is 'f' then changes will not be taken into consideration
        if mode.lower() == 'f':
            with WRITE_LOCK:
                csvWriter.writerow(listing_data)
        # otherwise only new listings (changes) are written to changes.csv
        else:
            if result not in data:
                with WRITE_LOCK:
                    changesWriter.writerow(listing_data)
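
# Minimal usage sketch: the scraper runs as a side effect of constructing the
# class. The categories URL below is an assumption (any page whose category
# blocks carry the 'cats-sector' class works), as is the choice of mode flag.
if __name__ == "__main__":
    # mode 'f': write every listing to results.csv
    # any other mode (e.g. 't'): write only new listings to changes.csv
    businessesforsale_com("https://www.businessesforsale.com/search", "f")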