require 'thread'
require 'robotex'
require 'anemone/tentacle'
require 'anemone/page'
require 'anemone/exceptions'
require 'anemone/page_store'
require 'anemone/storage'
require 'anemone/storage/base'
module Anemone
VERSION = '0.7.2'
#
# Convenience method to start a crawl
#
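# Example (illustrative; the URL and handler are placeholders):
#
#   Anemone.crawl("http://example.com/") do |anemone|
#     anemone.on_every_page { |page| puts page.url }
#   end
#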
def Anemone.crawl(urls, options = {}, &block)
Core.crawl(urls, options, &block)
end
class Core
# PageStore storing all Page objects encountered during the crawl
attr_reader :pages
# Hash of options for the crawl
attr_reader :opts
DEFAULT_OPTS = {
# run 4 Tentacle threads to fetch pages
:threads => 4,
# disable verbose output
:verbose => false,
# don't throw away the page response body after scanning it for links
:discard_page_bodies => false,
# identify self as Anemone/VERSION
:user_agent => "Anemone/#{Anemone::VERSION}",
# no delay between requests
:delay => 0,
# don't obey the robots exclusion protocol
:obey_robots_txt => false,
# by default, don't limit the depth of the crawl
:depth_limit => false,
# number of times HTTP redirects will be followed
:redirect_limit => 5,
# storage engine defaults to Hash in +process_options+ if none specified
:storage => nil,
# Hash of cookie name => value to send with HTTP requests
:cookies => nil,
# accept cookies from the server and send them back?
:accept_cookies => false,
# skip any link with a query string? e.g. http://foo.com/?u=user
:skip_query_strings => false,
# proxy server hostname
:proxy_host => nil,
# proxy server port number
:proxy_port => false,
# HTTP read timeout in seconds
:read_timeout => nil,
# maximum size of the queue of fetched pages awaiting processing
:pages_queue_limit => 1000,
# limit on the number of links queued for crawling (TODO: move the links queue to external storage)
:links_limit => 500000
}
# Create setter methods for all options to be called from the crawl block
DEFAULT_OPTS.keys.each do |key|
define_method "#{key}=" do |value|
@opts[key.to_sym] = value
end
end
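# For instance, any option key from DEFAULT_OPTS can be assigned inside the crawl
# block via these setters, or passed to Anemone.crawl as a Hash (illustrative values):
#
#   Anemone.crawl("http://example.com/", :obey_robots_txt => true) do |anemone|
#     anemone.threads = 2
#     anemone.depth_limit = 3
#   end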
#
# Initialize the crawl with starting *urls* (single URL or Array of URLs)
# and optional *block*
#
def initialize(urls, opts = {})
@urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
@urls.each{ |url| url.path = '/' if url.path.empty? }
@tentacles = []
@on_every_page_blocks = []
@on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
@skip_link_patterns = []
@after_crawl_blocks = []
@opts = opts
@stop_crawl = false
yield self if block_given?
end
#
# Convenience method to start a new crawl
#
def self.crawl(urls, opts = {})
self.new(urls, opts) do |core|
yield core if block_given?
core.run
end
end
#
# Add a block to be executed on the PageStore after the crawl
# is finished
#
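# Example (illustrative; assumes the PageStore responds to #size, as the default
# in-memory Hash storage does):
#
#   anemone.after_crawl do |pages|
#     puts "Crawl finished with #{pages.size} pages"
#   end
#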
def after_crawl(&block)
@after_crawl_blocks << block
self
end
#
# Add one or more Regex patterns for URLs which should not be
# followed
#
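# Example (illustrative patterns; each is matched against the link's path):
#
#   anemone.skip_links_like(/\.pdf$/i, %r{/logout})
#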
def skip_links_like(*patterns)
@skip_link_patterns.concat [patterns].flatten.compact
self
end
#
# Add a block to be executed on every Page as they are encountered
# during the crawl
#
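# Example (illustrative handler):
#
#   anemone.on_every_page do |page|
#     puts "[depth #{page.depth}] #{page.url}"
#   end
#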
def on_every_page(&block)
@on_every_page_blocks << block
self
end
#
# Add a block to be executed on Page objects with a URL matching
# one or more patterns
#
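# Example (illustrative pattern; it is matched against the full URL string):
#
#   anemone.on_pages_like(%r{/articles/}) do |page|
#     puts "article: #{page.url}"
#   end
#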
def on_pages_like(*patterns, &block)
if patterns
patterns.each do |pattern|
@on_pages_like_blocks[pattern] << block
end
end
self
end
#
# Specify a block which will select which links to follow on each page.
# The block should return an Array of URI objects.
#
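# Example (illustrative; keeps only links on the same host as the current page):
#
#   anemone.focus_crawl do |page|
#     page.links.select { |uri| uri.host == page.url.host }
#   end
#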
def focus_crawl(&block)
@focus_crawl_block = block
self
end
#
# Signals the crawler that it should stop the crawl before visiting the
# next page.
#
# This method is expected to be called within a page block, and it signals
# the crawler that it must stop after the current page is completely
# processed. All pages and links still in the queue are discarded.
#
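# Example (illustrative; stops once 100 pages have been processed):
#
#   visited = 0
#   anemone.on_every_page do |page|
#     visited += 1
#     anemone.stop_crawl if visited >= 100
#   end
#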
def stop_crawl
@stop_crawl = true
end
#
# Perform the crawl
#
def run
process_options
@urls.delete_if { |url| !visit_link?(url) }
return if @urls.empty?
link_queue = Queue.new
page_queue = SizedQueue.new(@opts[:pages_queue_limit])
@opts[:threads].times do
@tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
end
@urls.each{ |url| link_queue.enq(url) }
loop do
page = page_queue.deq
@pages.touch_key page.url
puts "#{page.url} Queue: #{link_queue.size} PageQueue #{page_queue.size}" if @opts[:verbose]
do_page_blocks page
page.discard_doc! if @opts[:discard_page_bodies]
if link_queue.size < @opts[:links_limit] and !@stop_crawl
links = links_to_follow page
links.each do |link|
link_queue << [link, page.url.dup, page.depth + 1]
end
@pages.touch_keys links
end
@pages[page.url] = page
if @stop_crawl
link_queue.clear
end
# if we are done with the crawl, tell the threads to end
if link_queue.empty? and page_queue.empty?
until link_queue.num_waiting == @tentacles.size
Thread.pass
break unless page_queue.empty? # page queue could be filled again by waiting threads
end
if page_queue.empty?
@tentacles.size.times { link_queue << :END }
break
end
end
end
@tentacles.each { |thread| thread.join }
do_after_crawl_blocks
self
end
private
def process_options
@opts = DEFAULT_OPTS.merge @opts
@opts[:threads] = 1 if @opts[:delay] > 0
storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
@pages = PageStore.new(storage)
@robots = Robotex.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
freeze_options
end
#
# Freeze the opts Hash so that no options can be modified
# once the crawl begins
#
def freeze_options
@opts.freeze
@opts.each_key { |key| @opts[key].freeze }
@opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
end
#
# Execute the after_crawl blocks
#
def do_after_crawl_blocks
@after_crawl_blocks.each { |block| block.call(@pages) }
end
#
# Execute the on_every_page blocks for *page*
#
def do_page_blocks(page)
@on_every_page_blocks.each do |block|
block.call(page)
end
@on_pages_like_blocks.each do |pattern, blocks|
blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
end
end
#
# Return an Array of links to follow from the given page.
# Based on whether or not the link has already been crawled,
# and the block given to focus_crawl()
#
def links_to_follow(page)
links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
end
#
# Returns +true+ if *link* has not been visited already,
# is not excluded by a skip_link pattern,
# is not skipped for having a query string (when +skip_query_strings+ is set),
# is not excluded by robots.txt,
# and is not deeper than the depth limit.
# Returns +false+ otherwise.
#
def visit_link?(link, from_page = nil)
!@pages.has_page?(link) &&
!skip_link?(link) &&
!skip_query_string?(link) &&
allowed(link) &&
!too_deep?(from_page)
end
#
# Returns +true+ if we are obeying robots.txt and the link
# is granted access in it. Always returns +true+ when we are
# not obeying robots.txt.
#
def allowed(link)
@opts[:obey_robots_txt] ? @robots.allowed?(link) : true
rescue
false
end
#
# Returns +true+ if we are over the page depth limit.
# This only applies when coming from a page and with the +depth_limit+ option set;
# otherwise it always returns +false+.
#
def too_deep?(from_page)
if from_page && @opts[:depth_limit]
from_page.depth >= @opts[:depth_limit]
else
false
end
end
#
# Returns +true+ if *link* should not be visited because
# it has a query string and +skip_query_strings+ is true.
#
def skip_query_string?(link)
@opts[:skip_query_strings] && link.query
end
#
# Returns +true+ if *link* should not be visited because
# its URL matches a skip_link pattern.
#
def skip_link?(link)
@skip_link_patterns.any? { |pattern| link.path =~ pattern }
end
end
end