forked from zwendzw/TechJob_crawler
/
crawler.rb
59 lines (54 loc) · 1.85 KB
/
crawler.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
require 'nokogiri'
require 'json'
require 'open-uri'
# Base URL that the board paths built further down are appended to.
# The other top-level placeholders that used to sit here were never read:
# they bound the String/Integer classes (or nil) to names that the crawl
# code does not use, and a `def` body cannot see top-level locals anyway.
host = "https://www.ptt.cc"
# next_page = html[2]['href'] # next page
# previous_page = html[1]['href'] # previous page
# Fetches one board index page, downloads every article linked from it and
# prints each article title.
#
# Any StandardError raised while fetching or parsing makes the whole page
# start over: "FQ REREY" is printed, the method sleeps +retry_delay+ seconds
# and tries again. After +max_retries+ retries it reports the failure on
# stderr and returns an empty array instead of retrying forever.
#
# @param url [String] absolute URL of the index page to crawl
# @param max_retries [Integer] retries allowed after the first failed attempt
# @param retry_delay [Numeric] seconds to wait before each retry
# @return [Array<Hash>] one hash per article with the keys :post_id,
#   :link_url, :title and :content; empty when every attempt failed
def parse_content(url, max_retries: 3, retry_delay: 3)
  host = "https://www.ptt.cc"
  attempts = 0
  begin
    # URI.open is the open-uri entry point; Kernel#open no longer accepts
    # URLs on Ruby 3.0 and later.
    doc = Nokogiri::HTML(URI.open(url)) # open and parse the index page
    # Collect the article anchors listed on the page as an array.
    anchors = doc.css('.r-ent').css('.title').css('/a[@href]').to_a
    articles_of_page = anchors.map do |anchor|
      link = anchor['href'] # relative article path taken from the anchor
      doc_article = Nokogiri::HTML(URI.open(host + link))
      {
        post_id: link.split("/")[3],
        link_url: link,
        title: anchor.text,
        content: doc_article.xpath("//div[@class='bbs-screen bbs-content']").text
      }
    end
    # puts "GET____ARTICLES:#{articles_of_page.length}"
    articles_of_page.each { |article| puts "TITLE____:#{article[:title]}" }
    articles_of_page
  rescue StandardError => e
    attempts += 1
    if attempts > max_retries
      warn "parse_content: giving up on #{url} after #{attempts} attempts (#{e.class}: #{e.message})"
      return []
    end
    puts "FQ REREY"
    sleep retry_delay
    retry
  end
end
# Crawl the Tech_Job board: the newest index page first, then every numbered
# index page from the one linked as "previous page" down to index1.html.
first_page = host + "/bbs/Tech_Job/index.html"
# URI.open is the open-uri entry point; Kernel#open no longer accepts URLs
# on Ruby 3.0 and later.
index_doc = Nokogiri::HTML(URI.open(first_page))
# Anchors of the paging button group; html[1] is treated as the
# "previous page" link.
html = index_doc.css("//div[@class='btn-group pull-right']").css('/a[@class]').to_a
previous_href = html[1] && html[1]['href']
# Read the whole run of digits from ".../index<N>.html". A fixed four-character
# slice truncates page numbers with more than four digits. A missing anchor
# or href yields 0, so the numbered loop below simply does not run.
page_number = previous_href.to_s[/index(\d+)\.html/, 1].to_i

puts first_page
parse_content(first_page)

# Count down from N itself: decrementing before building the URL skipped
# index N and finished on index0.html.
# NOTE(review): assumes numbered index pages start at 1 — confirm.
page_number.downto(1) do |number|
  url = "#{host}/bbs/Tech_Job/index#{number}.html"
  puts url
  parse_content(url)
end
# if html[1]['href'] == nil
# url = index_url
# url=host#{next_page}
# parse_content(url)
# end
# end