# 591RentData.py
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
import csv
import requests
# Separator between label and value inside the <ul class="attr"> items.
_ATTR_SEPARATOR = '\xa0:\xa0\xa0'
# Separator between label and value inside the <ul class="labelList-1"> items.
_DESCRIPTION_SEPARATOR = ':'
# Upper bound for one detail-page request so a stalled connection cannot hang the crawl.
_REQUEST_TIMEOUT_SECONDS = 10
# Placeholder written for any field that could not be found on the page.
_MISSING = 'NULL'


def _labelled_values(list_node, separator):
    """Return a {label: value} dict built from the <li> children of list_node.

    Each item's text is split on ``separator``; the first piece is the label
    and the second piece is the value.  Items without the separator are
    skipped, and a missing list (``None``) yields an empty dict.
    """
    values = {}
    if list_node is None:
        return values
    for item in list_node.find_all('li'):
        pieces = item.text.split(separator)
        if len(pieces) >= 2:
            # A later item with the same label overrides an earlier one.
            values[pieces[0]] = pieces[1]
    return values


def getData(url):
    """Fetch one listing detail page and extract its fields.

    Args:
        url: Listing URL taken from the result page.  A value without an
            ``http://`` / ``https://`` scheme (e.g. a protocol-relative
            ``//host/path`` link) is prefixed with ``https:``.

    Returns:
        A 7-tuple ``(addr, price, size, floor, room_type, form, car)``.
        Any field that is not present on the page is the string ``'NULL'``.
        If the request fails or the response status is not 200, every
        element of the tuple is the integer ``404``.
    """
    link = str(url).strip()
    if link.startswith(('http://', 'https://')):
        request_url = link
    else:
        request_url = 'https:' + link

    try:
        res = requests.get(request_url, timeout=_REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as err:
        # Treat a network failure like an expired link so that one bad
        # listing does not abort the whole crawl.
        print('request failed:', url, err)
        return 404, 404, 404, 404, 404, 404, 404

    if res.status_code != 200:
        print('link expired:', url)
        return 404, 404, 404, 404, 404, 404, 404

    bs = BeautifulSoup(res.text, 'html.parser')

    # Every lookup is guarded: a missing element leaves the field as 'NULL'
    # instead of raising AttributeError on None.
    addr = _MISSING
    addr_node = bs.find('span', {'class': 'addr'})
    if addr_node is not None:
        addr = addr_node.text

    price = _MISSING
    price_node = bs.find('div', {'class': 'price'})
    if price_node is not None:
        # Keep only the first space-separated token of the price text.
        price = price_node.text.strip().split(' ')[0]

    attrs = _labelled_values(bs.find('ul', {'class': 'attr'}), _ATTR_SEPARATOR)
    size = attrs.get('坪數', _MISSING)
    floor = attrs.get('樓層', _MISSING)
    room_type = attrs.get('型態', _MISSING)

    descriptions = _labelled_values(
        bs.find('ul', {'class': 'labelList-1'}), _DESCRIPTION_SEPARATOR)
    form = descriptions.get('格局', _MISSING)
    car = descriptions.get('車 位', _MISSING)

    return addr, price, size, floor, room_type, form, car
# Number of listings assumed per result page when deriving the page count.
_LISTINGS_PER_PAGE = 30
# Fixed wait (seconds) after actions that load or reload the result page.
_PAGE_LOAD_SECONDS = 3


def main(outputfile, region=1, kind=0):
    """Crawl the rent.591.com.tw result pages and write one CSV row per listing.

    Opens the listing search in Chrome, walks through the result pages, calls
    ``getData`` for every listing link found in an ``<h3>`` element and writes
    the returned 7 fields as a row of ``outputfile``.

    Args:
        outputfile: Path of the CSV file to create (overwritten if it exists).
        region: Value sent as the ``region`` query parameter of the start URL.
        kind: Value sent as the ``kind`` query parameter of the start URL.
            The defaults reproduce the originally hard-coded start URL.
    """
    # Local imports keep this function self-contained.
    from urllib.parse import urlencode
    from selenium.webdriver.common.by import By

    # Build the query string instead of hard-coding it.
    start_url = 'https://rent.591.com.tw/?' + urlencode(
        {'kind': kind, 'region': region})

    browser = webdriver.Chrome()
    try:
        browser.get(start_url)
        # Close the region-selection pop-up; otherwise "next page" cannot be clicked.
        browser.find_element(By.ID, 'area-box-close').click()
        time.sleep(_PAGE_LOAD_SECONDS)
        # Send ESC to dismiss the browser prompt; otherwise clicks are blocked.
        browser.find_element(By.CLASS_NAME, 'pageNext').send_keys(Keys.ESCAPE)

        with open(outputfile, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)

            bs = BeautifulSoup(browser.page_source, 'html.parser')
            record_text = bs.find('span', {'class': 'TotalRecord'}).text
            # The record count is the second-to-last space-separated token;
            # drop thousands separators in case the site prints them.
            total_records = int(record_text.split(' ')[-2].replace(',', ''))
            # Integer ceiling division: range() needs an int, and an exact
            # multiple of the page size must not add an extra page.
            totalpages = -(-total_records // _LISTINGS_PER_PAGE)
            print('Total pages: ', totalpages)

            for i in range(totalpages):
                bs = BeautifulSoup(browser.page_source, 'html.parser')

                # Each listing sits in an <h3>; collect its link if it has one.
                room_url_list = []
                for title in bs.find_all('h3'):
                    anchor = title.find('a')
                    href = anchor.get('href') if anchor is not None else None
                    if href:
                        room_url_list.append(href)
                time.sleep(_PAGE_LOAD_SECONDS)

                # ------------- write into csv ------------- #
                for url in room_url_list:
                    writer.writerow(getData(url))
                # ------------------------------------------ #

                # Percentage of result pages completed so far.
                print((i + 1) / totalpages * 100, '%', end='\r')

                # Stop once the last page has been processed instead of
                # scraping it again or clicking a disabled "next" control.
                if i + 1 >= totalpages or bs.find('a', {'class': 'last'}):
                    break

                # Go to the next page and wait for it to load.
                browser.find_element(By.CLASS_NAME, 'pageNext').send_keys(Keys.ESCAPE)
                browser.find_element(By.CLASS_NAME, 'pageNext').click()
                time.sleep(_PAGE_LOAD_SECONDS)
    finally:
        # Always release the Chrome session, even if scraping fails midway.
        browser.quit()
if __name__ == '__main__':
    # Script entry point.
    # Configurable parameter: path of the CSV file that receives the rows.
    output_file_name = 'tpe_rent_output.csv'

    main(output_file_name)
    # Leading newline moves past the in-place progress line printed by main().
    print('\nfinish!')