/
scrapMultipleAll.py
260 lines (214 loc) · 11.5 KB
/
scrapMultipleAll.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
#from scrap1 (grab; price, condition, link)
#drpd link(grab; general, transmission, engine, dimension&weight, brake, susp, steering, tyres&wheel)
import requests as req
import pandas as pd
#calling package URL lib
from urllib.request import urlopen as uReq
#parse HTML text
from bs4 import BeautifulSoup as soup
filename = "PeroduaMyvi_WholeSelangor_PrivateSeller.csv"
#"ToyotaCamry.csv"
#"ToyotaAlphard.csv"
#"ToyotaVellfire.csv"
#"ToyotaVios.csv"
#"HondaAccord.csv"
#"HondaCity.csv"
#"HondaCivic.csv"
#"PeroduaAxia.csv"
#"PeroduaMyvi.csv"
#"df_multipleAllToyota.csv"
#substituting avg of mileage
def _spaced(number):
    """Render ``number`` with a space as thousands separator, e.g. ``4 999``."""
    return "{:,}".format(number).replace(",", " ")


def _mileage_midpoints():
    """Build the lookup table from mileage-bucket label to midpoint string.

    The buckets handled are 5 000 wide below 100 000, 10 000 wide from
    100 000 up to 199 999, and 50 000 wide from 200 000 up to 499 999.
    Generating the table (rather than typing every entry) keeps each label
    and its midpoint in sync.
    """
    # (first lower bound, exclusive upper limit, bucket width)
    bands = (
        (0, 100000, 5000),
        (100000, 200000, 10000),
        (200000, 500000, 50000),
    )
    table = {}
    for start, stop, width in bands:
        for low in range(start, stop, width):
            label = "{} - {}".format(_spaced(low), _spaced(low + width - 1))
            table[label] = str(low + width // 2)
    return table


# Built once at import time; keys look like "5 000 - 9 999".
_MILEAGE_MIDPOINTS = _mileage_midpoints()


def subs(gMileage):
    """Replace a mileage-range label with the midpoint of that range.

    Args:
        gMileage: Mileage text, e.g. ``"10 000 - 14 999"``. It must match a
            known bucket label exactly (no surrounding whitespace).

    Returns:
        The bucket midpoint as a string of digits (``"12500"`` for the
        example above). Any text that is not a known bucket label is
        returned unchanged.
    """
    return _MILEAGE_MIDPOINTS.get(gMileage, gMileage)
#substituting (-) data (quantitative variable) with certain value
#def dash (a):
#if a == '-':
#return a.replace("-", "0", 1)
#else:
#return a
def dprice (a):
    """Squash the whitespace out of a price string.

    When ``a`` contains at least one space character, every run of
    whitespace is removed (``"35 000"`` becomes ``"35000"``). A string with
    no space character is handed back untouched.
    """
    # Guard clause: nothing to do unless a literal space is present.
    if ' ' not in a:
        return a
    pieces = a.split()
    return ''.join(pieces)
# --- Scrape configuration ---------------------------------------------------

# Search-result pages to walk; range(1, 4) visits pages 1, 2 and 3.
PAGE_NUMBERS = range(1, 4)

# Seconds to wait for the site before giving up on a request. Without a
# timeout, requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Search currently being scraped: Myvi, Selangor, private sellers.
# Other searches used with this script (swap one in and change `filename`
# to match):
#   https://www.mudah.my/malaysia/cars-for-sale/toyota?o={page}&q=&so=1&th=1            # Toyota
#   https://www.mudah.my/malaysia/cars-for-sale/perodua/myvi?o={page}&q=&so=1&th=1      # PeroduaMyvi
#   https://www.mudah.my/malaysia/cars-for-sale/perodua/axia?o={page}&q=&so=1&th=1      # PeroduaAxia
#   https://www.mudah.my/malaysia/cars-for-sale/honda/civic?o={page}&q=&so=1&th=1       # HondaCivic
#   https://www.mudah.my/malaysia/cars-for-sale/honda/city?o={page}&q=&so=1&th=1        # HondaCity
#   https://www.mudah.my/malaysia/cars-for-sale/honda/accord?o={page}&q=&so=1&th=1      # HondaAccord
#   https://www.mudah.my/malaysia/cars-for-sale/toyota/vios?o={page}&q=&so=1&th=1       # ToyotaVios
#   https://www.mudah.my/malaysia/cars-for-sale/toyota/vellfire?o={page}&q=&so=1&th=1   # ToyotaVellfire
#   https://www.mudah.my/malaysia/cars-for-sale/toyota/alphard?o={page}&q=&so=1&th=1    # ToyotaAlphard
#   https://www.mudah.my/malaysia/cars-for-sale/toyota/camry?o={page}&q=&so=1&th=1      # ToyotaCamry
#   https://www.mudah.my/malaysia/cars-for-sale/proton/saga?o={page}&q=&so=1&th=1       # ProtonSaga
#   https://www.mudah.my/selangor/cars-for-sale/perodua/myvi?o={page}&sa=333&q=&so=1&sp=1&th=1  # Myvi ShahAlam
#   https://www.mudah.my/selangor/cars-for-sale/perodua/myvi?o={page}&q=&so=1&sp=1&th=1         # Myvi Selangor
#   https://www.mudah.my/selangor/cars-for-sale/perodua/myvi?o={page}&q=&so=1&sp=1&f=c&th=1     # Myvi Company
SEARCH_URL_TEMPLATE = 'https://www.mudah.my/selangor/cars-for-sale/perodua/myvi?o={page}&q=&so=1&sp=1&f=p&th=1'

# Value written for a specification cell that the detail page does not have.
MISSING_VALUE = ""

# CSS classes of the value cells inside the specification sections.
_CELL_NARROW = "col-xs-3 mcd_record_adview"
_CELL_WIDE = "col-xs-9 mcd_record_adview"

# (section CSS class, value-cell CSS class, indexes of the cells to keep),
# listed in the order the values appear in COLUMNS after 'NewMil'.
_DETAIL_SECTIONS = (
    # brand, model, seat, variant, type, series, country
    # (cell 1 is skipped: mileage is taken from the listing card instead;
    #  cell 7 is skipped: the year is taken from the listing card instead)
    ("general_section", _CELL_NARROW, (0, 2, 3, 4, 5, 6, 8)),
    ("transmission_section", _CELL_WIDE, (0,)),
    # cc, compression, peak power, peak torque, engine type, fuel
    ("engine_section", _CELL_NARROW, (0, 1, 2, 3, 4, 5)),
    # length, width, height, wheel base, kerb weight, fuel tank
    ("dimension_section", _CELL_NARROW, (0, 1, 2, 3, 4, 5)),
    ("brakes_section", _CELL_NARROW, (0, 1)),        # front, rear
    ("suspensions_section", _CELL_NARROW, (0, 1)),   # front, rear
    ("steering_section", _CELL_WIDE, (0,)),
    # front tyre, rear tyre, front rim, rear rim
    ("tyres_section", _CELL_NARROW, (0, 1, 2, 3)),
)

COLUMNS = [
    'Name', 'Price', 'Manufactured_Yr', 'Condition', 'Link', 'Mileage', 'NewMil',
    'Brand', 'Model', 'Seat', 'Variant', 'Type', 'Series', 'Country',
    'Transmission',
    'EngineCC', 'Eng_Compression', 'Eng_PeakPower', 'Eng_PeakTorque', 'Eng_Type', 'Eng_Fuel',
    'Length', 'Width', 'height', 'WheelBase', 'KerbWeight', 'FuelTank',
    'FrontBrake', 'RearBrake',
    'FrontSusp', 'RearSusp',
    'Steering',
    'FrontTyres', 'RearTyres', 'FrontRim', 'RearRim',
]


def _fetch_soup(url):
    """Download ``url`` and return its HTML parsed with BeautifulSoup."""
    response = req.get(url, timeout=REQUEST_TIMEOUT)
    return soup(response.text, "html.parser")


def _section_values(detail_soup, section_class, cell_class, indexes):
    """Return the stripped text of selected cells from one detail section.

    Args:
        detail_soup: Parsed detail page of a single advert.
        section_class: CSS class of the section ``div`` to look in.
        cell_class: CSS class of the value cells inside that section.
        indexes: Positions of the cells to return, in output order.

    Returns:
        A list with one string per entry of ``indexes``. A cell that is
        absent (or a whole section that is absent) yields ``MISSING_VALUE``,
        so a row never inherits values from a previously scraped advert.
        If the page contains several matching sections, the last one wins.
    """
    values = [MISSING_VALUE] * len(indexes)
    for panel in detail_soup.findAll("div", {"class": section_class}):
        cells = panel.findAll("div", {"class": cell_class})
        values = [
            cells[i].text.strip() if i < len(cells) else MISSING_VALUE
            for i in indexes
        ]
    return values


def _scrape_listing(card):
    """Turn one search-result card into a full row matching ``COLUMNS``.

    Reads name, link, price, condition, year and mileage from the card
    itself, then downloads the advert's detail page for the specification
    sections.

    Raises:
        AttributeError, TypeError, KeyError, IndexError: if the card does not
            have the expected title link, price or three icon labels.
    """
    anchor = card.div.div.a
    name = anchor["title"].strip()
    link = anchor["href"].strip()

    price_text = card.findAll("div", {"class": "ads_price"})[0].text.strip()
    # Drop the currency prefix, then the spaces used as thousands separators.
    price = dprice(price_text.replace("RM", "", 1)).strip()

    # The icon labels come in the order: condition, year, mileage.
    labels = card.findAll("font", {"class": "icon_label"})
    condition = labels[0].text.strip()
    year = labels[1].text.strip()
    mileage = labels[2].text.strip()
    mileage_midpoint = subs(mileage).strip()

    detail_soup = _fetch_soup(link)
    row = [name, price, year, condition, link, mileage, mileage_midpoint]
    for section_class, cell_class, indexes in _DETAIL_SECTIONS:
        row.extend(_section_values(detail_soup, section_class, cell_class, indexes))
    return tuple(row)


# --- Run the scrape ---------------------------------------------------------

pages = [SEARCH_URL_TEMPLATE.format(page=number) for number in PAGE_NUMBERS]

container = []
for page_url in pages:
    listing_soup = _fetch_soup(page_url)
    for card in listing_soup.findAll("div", {"class": "listing_params_container"}):
        container.append(_scrape_listing(card))

df = pd.DataFrame(container, columns=COLUMNS)
df.to_csv(filename, index=False, encoding='utf-8')