Performance improvements #70

Open · wants to merge 13 commits into base: next
1 change: 1 addition & 0 deletions README.rdoc
@@ -12,6 +12,7 @@ See http://anemone.rubyforge.org for more information.
* Built-in BFS algorithm for determining page depth
* Allows exclusion of URLs based on regular expressions
* Choose the links to follow on each page with focus_crawl()
* Allows stopping the crawl from within a page block
* HTTPS support
* Records response time for each page
* CLI program can list all pages in a domain, calculate page depths, and more
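The new stop feature in that list pairs naturally with on_every_page, as the specs at the end of this diff exercise. A minimal usage sketch, assuming a reachable start URL (the URL and the page budget are illustrative, not from the patch):

require 'anemone'

pages_seen = 0
Anemone.crawl("http://example.com/") do |anemone|
  anemone.on_every_page do |page|
    pages_seen += 1
    puts page.url
    # Tell the crawler to finish this page, then discard the queues.
    anemone.stop_crawl if pages_seen >= 10
  end
end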
41 changes: 33 additions & 8 deletions lib/anemone/core.rb
@@ -55,7 +55,11 @@ class Core
# proxy server port number
:proxy_port => false,
# HTTP read timeout in seconds
-    :read_timeout => nil
+    :read_timeout => nil,
+    # limit the size of the in-memory queue of crawled pages
+    :pages_queue_limit => 1000,
+    # limit the number of unique links queued per crawl (TODO: move the links queue to external storage)
+    :links_limit => 500000
}

# Create setter methods for all options to be called from the crawl block
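The two new options are passed like any other Anemone option; a sketch with illustrative limits (tune both to the size of the site being crawled):

Anemone.crawl("http://example.com/",
              :pages_queue_limit => 100,  # fetcher threads block once 100 pages await processing
              :links_limit => 10_000) do |anemone|
  anemone.on_every_page { |page| puts page.url }
end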
@@ -79,6 +83,7 @@ def initialize(urls, opts = {})
@skip_link_patterns = []
@after_crawl_blocks = []
@opts = opts
@stop_crawl = false

yield self if block_given?
end
@@ -142,6 +147,18 @@ def focus_crawl(&block)
self
end

#
# Signals the crawler that it should stop the crawl before visiting the
# next page.
#
# This method is expected to be called from within a page block. It signals
# the crawler to stop once the current page has been completely processed;
# all pages and links still queued are discarded.
#
def stop_crawl
@stop_crawl = true
end

#
# Perform the crawl
#
@@ -152,7 +169,7 @@ def run
return if @urls.empty?

link_queue = Queue.new
-    page_queue = Queue.new
+    page_queue = SizedQueue.new(@opts[:pages_queue_limit])

@opts[:threads].times do
@tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
@@ -163,24 +180,32 @@ def run
loop do
page = page_queue.deq
@pages.touch_key page.url
puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
puts "#{page.url} Queue: #{link_queue.size} PageQueue #{page_queue.size}" if @opts[:verbose]
do_page_blocks page
page.discard_doc! if @opts[:discard_page_bodies]

-      links = links_to_follow page
-      links.each do |link|
-        link_queue << [link, page.url.dup, page.depth + 1]
+      if link_queue.size < @opts[:links_limit] and !@stop_crawl
+        links = links_to_follow page
+        links.each do |link|
+          link_queue << [link, page.url.dup, page.depth + 1]
+        end
+        @pages.touch_keys links
       end
-      @pages.touch_keys links

@pages[page.url] = page

if @stop_crawl
link_queue.clear
end

# if we are done with the crawl, tell the threads to end
if link_queue.empty? and page_queue.empty?
until link_queue.num_waiting == @tentacles.size
Thread.pass
break unless page_queue.empty? # the page queue may be refilled by tentacles that are still working
end
if page_queue.empty?
@tentacles.size.times { link_queue << :END }
break
end
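Two details of this hunk are easy to miss. Replacing Queue with SizedQueue is what actually bounds memory: tentacle threads pushing fetched pages now block once :pages_queue_limit pages are waiting, instead of letting the backlog grow without bound. And the added `break unless page_queue.empty?` guards the shutdown check, since a tentacle that was waiting a moment ago can still hand back a page. A self-contained sketch of the SizedQueue backpressure, using only the Ruby standard library:

require 'thread' # Queue and SizedQueue live here on older Rubies

queue = SizedQueue.new(2) # buffer at most 2 items, like :pages_queue_limit

producer = Thread.new do
  5.times do |i|
    queue << i # blocks whenever 2 items are already waiting
    puts "produced #{i}"
  end
  queue << :END
end

until (item = queue.pop) == :END
  sleep 0.1 # a slow consumer: the producer is forced to keep our pace
  puts "consumed #{item}"
end
producer.join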
7 changes: 6 additions & 1 deletion lib/anemone/storage/redis.rb
@@ -5,10 +5,12 @@ module Storage
class Redis

MARSHAL_FIELDS = %w(links visited fetched)
DOZEN_HOURS = 43200 # 12 hours, in seconds

def initialize(opts = {})
@redis = ::Redis.new(opts)
@key_prefix = opts[:key_prefix] || 'anemone'
@expiration = opts[:key_expiration] || DOZEN_HOURS
keys.each { |key| delete(key) }
end

@@ -23,9 +25,12 @@ def []=(key, value)
MARSHAL_FIELDS.each do |field|
hash[field] = Marshal.dump(hash[field])
end
+      key_vals = []
       hash.each do |field, value|
-        @redis.hset(rkey, field, value)
+        key_vals += [field, value]
       end
+      @redis.hmset(rkey, key_vals)
+      @redis.expire(rkey, @expiration)
end

def delete(key)
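The storage change batches the per-field hset calls into a single hmset, so each page costs one Redis round-trip instead of one per field, and the expire call keeps data from abandoned crawls from living forever. A minimal sketch of the same pattern with the redis gem (key and field names are illustrative):

require 'redis'

redis = Redis.new
fields = { "links" => "...", "visited" => "1", "depth" => "2" }

# One round-trip for all fields, versus one hset per field.
redis.hmset("anemone:pages:example", *fields.to_a.flatten)
redis.expire("anemone:pages:example", 43200) # drop the key after 12 hours

p redis.hgetall("anemone:pages:example")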
31 changes: 31 additions & 0 deletions spec/core_spec.rb
@@ -227,6 +227,37 @@ module Anemone
core = Anemone.crawl(@pages[0].url, @opts.merge({:depth_limit => 3}))
core.should have(4).pages
end

it "should stop crawl if requested" do
num_pages = 0
Anemone.crawl(@pages[0].url, @opts.merge({:pages_queue_limit => 1})) do |anemone|
anemone.on_every_page do
num_pages += 1
anemone.stop_crawl if num_pages == 2
end
end
num_pages.should == 2
end

it "should limit number of links per crawl" do
num_pages = 0
Anemone.crawl(@pages[0].url, @opts.merge({:links_limit => 0})) do |anemone|
anemone.on_every_page do
num_pages += 1
end
end
num_pages.should == 1
end

it "should limit pages queue per crawl" do
num_pages = 0
Anemone.crawl(@pages[0].url, @opts.merge({:pages_queue_limit => 1})) do |anemone|
anemone.on_every_page do
num_pages += 1
end
end
num_pages.should == 5
end
end

end