from datetime import datetime
from time import sleep

import requests
from lxml import etree


class Crawler(object):
    def __init__(self,
                 base_url='https://www.csie.ntu.edu.tw/news/',
                 rel_url='news.php?class=101'):
        self.base_url = base_url
        self.rel_url = rel_url

    def crawl(self, start_date, end_date,
              date_thres=datetime(2012, 1, 1)):
        """Main crawl API

        1. Note that you need to sleep 0.1 seconds after every request.
        2. You are welcome to modify the TA's template.
        """
        # Clamp the end date so it never falls before the archive threshold.
        if end_date < date_thres:
            end_date = date_thres
        contents = list()
        page_num = 0
        while True:
            rets, last_date = self.crawl_page(
                start_date, end_date, page=f'&no={page_num}')
            page_num += 10  # each listing page shows ten rows
            contents += rets
            # Stop when the page has no rows or its oldest row is already
            # older than the requested start date.
            if last_date is None or last_date < start_date:
                break
        return contents

    def crawl_page(self, start_date, end_date, page=''):
        """Parse ten rows of the given page

        Parameters:
            start_date (datetime): the start date (included)
            end_date (datetime): the end date (included)
            page (str): the relative url specifying the page number

        Returns:
            contents (list): a list of (date, title, content) tuples
            last_date (datetime): the smallest date in the page,
                or None if the page has no rows
        """
        res = requests.get(
            self.base_url + self.rel_url + page,
            headers={'Accept-Language':
                     'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6'}
        ).content.decode()
        sleep(0.1)

        # Parse the response with etree to get the dates, titles, and
        # relative urls of the news rows.
        parser = etree.HTML(res)
        root = parser.xpath(
            '//div[1]/div/div[2]/div/div/div[2]/div/table/tbody')[0]
        dates = root.xpath('.//tr/td[1]/text()')
        titles = root.xpath('.//tr/td[2]/a/text()')
        rel_urls = root.xpath('.//tr/td[2]/a/@href')

        contents = list()
        last_date = None
        for date, title, rel_url in zip(dates, titles, rel_urls):
            last_date = datetime.strptime(date, '%Y-%m-%d')
            # Only keep rows that fall inside [start_date, end_date].
            if not (start_date <= last_date <= end_date):
                continue
            # Concatenate the relative url to a full url, crawl its content,
            # and append the (date, title, content) tuple.
            url = self.base_url + rel_url
            content = self.crawl_content(url)
            contents.append((date, title, content))
        return contents, last_date

    def crawl_content(self, url):
        """Crawl the content of the given url

        For example, if the url is
        https://www.csie.ntu.edu.tw/news/news.php?Sn=15216
        then you are to crawl contents of
        ``Title : 我與DeepMind的A.I.研究之路, My A.I. Journey with DeepMind Date : 2019-12-27 2:20pm-3:30pm Location : R103, CSIE Speaker : 黃士傑博士, DeepMind Hosted by : Prof. Shou-De Lin Abstract: 我將與同學們分享,我博士班研究到加入DeepMind所參與的projects (AlphaGo, AlphaStar與AlphaZero),以及從我個人與DeepMind的視角對未來AI發展的展望。 Biography: 黃士傑, Aja Huang 台灣人,國立臺灣師範大學資訊工程研究所博士,現為DeepMind Staff Research Scientist。``
        """
        res = requests.get(url).content.decode()
        sleep(0.1)
        parser = etree.HTML(res)
        # Collect every text node in the announcement body and join them into
        # a single string, as in the example above.
        texts = parser.xpath(
            '/html/body/div[1]/div/div[2]/div/div/div[2]/div/div[2]//text()')
        content = ' '.join(text.strip() for text in texts if text.strip())
        return content
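

# A minimal usage sketch, not part of the original template: it assumes the
# NTU CSIE news page layout still matches the XPath expressions above, and
# the date range below is only a sample value chosen for illustration.
if __name__ == '__main__':
    crawler = Crawler()
    results = crawler.crawl(start_date=datetime(2019, 12, 1),
                            end_date=datetime(2019, 12, 31))
    for date, title, content in results:
        print(date, title)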