diff --git a/README.rdoc b/README.rdoc
index a115b186..39084301 100644
--- a/README.rdoc
+++ b/README.rdoc
@@ -12,6 +12,7 @@ See http://anemone.rubyforge.org for more information.
 * Built-in BFS algorithm for determining page depth
 * Allows exclusion of URLs based on regular expressions
 * Choose the links to follow on each page with focus_crawl()
+* Allows stopping the crawl with stop_crawl()
 * HTTPS support
 * Records response time for each page
 * CLI program can list all pages in a domain, calculate page depths, and more
diff --git a/lib/anemone/core.rb b/lib/anemone/core.rb
index d1629a49..d1c4bd5f 100644
--- a/lib/anemone/core.rb
+++ b/lib/anemone/core.rb
@@ -55,7 +55,11 @@ class Core
       # proxy server port number
       :proxy_port => false,
       # HTTP read timeout in seconds
-      :read_timeout => nil
+      :read_timeout => nil,
+      # limit the size of the crawled pages queue
+      :pages_queue_limit => 1000,
+      # limit the number of unique links allowed per crawl (TODO: move links queue to external storage)
+      :links_limit => 500000
     }
 
     # Create setter methods for all options to be called from the crawl block
@@ -79,6 +83,7 @@ def initialize(urls, opts = {})
       @skip_link_patterns = []
       @after_crawl_blocks = []
       @opts = opts
+      @stop_crawl = false
 
       yield self if block_given?
     end
@@ -142,6 +147,18 @@ def focus_crawl(&block)
       self
     end
 
+    #
+    # Signals the crawler that it should stop the crawl before visiting the
+    # next page.
+    #
+    # This method is expected to be called from within a page block; the
+    # crawler then stops once the current page has been completely
+    # processed, and any pages and links still in the queues are discarded.
+    #
+    def stop_crawl
+      @stop_crawl = true
+    end
+
     #
     # Perform the crawl
     #
@@ -152,7 +169,7 @@ def run
       return if @urls.empty?
 
       link_queue = Queue.new
-      page_queue = Queue.new
+      page_queue = SizedQueue.new(@opts[:pages_queue_limit])
 
       @opts[:threads].times do
         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
@@ -163,24 +180,32 @@ def run
       loop do
         page = page_queue.deq
         @pages.touch_key page.url
-        puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+        puts "#{page.url} Queue: #{link_queue.size} PageQueue #{page_queue.size}" if @opts[:verbose]
         do_page_blocks page
         page.discard_doc! if @opts[:discard_page_bodies]
 
-        links = links_to_follow page
-        links.each do |link|
-          link_queue << [link, page.url.dup, page.depth + 1]
+
+        if link_queue.size < @opts[:links_limit] and !@stop_crawl
+          links = links_to_follow page
+          links.each do |link|
+            link_queue << [link, page.url.dup, page.depth + 1]
+          end
+          @pages.touch_keys links
         end
-        @pages.touch_keys links
 
         @pages[page.url] = page
 
+        if @stop_crawl
+          link_queue.clear
+        end
+
         # if we are done with the crawl, tell the threads to end
         if link_queue.empty? and page_queue.empty?
           until link_queue.num_waiting == @tentacles.size
             Thread.pass
+            break unless page_queue.empty? # page queue could be filled again by threads that are still busy
           end
-          if page_queue.empty?
+          if page_queue.empty?
             @tentacles.size.times { link_queue << :END }
             break
           end
diff --git a/lib/anemone/storage/redis.rb b/lib/anemone/storage/redis.rb
index f063a5f4..2a430a74 100644
--- a/lib/anemone/storage/redis.rb
+++ b/lib/anemone/storage/redis.rb
@@ -5,10 +5,12 @@ module Storage
     class Redis
 
       MARSHAL_FIELDS = %w(links visited fetched)
+      DOZEN_HOURS = 43200
 
       def initialize(opts = {})
         @redis = ::Redis.new(opts)
         @key_prefix = opts[:key_prefix] || 'anemone'
+        @expiration = opts[:key_expiration] || DOZEN_HOURS
         keys.each { |key| delete(key) }
       end
 
@@ -23,9 +25,12 @@ def []=(key, value)
         MARSHAL_FIELDS.each do |field|
           hash[field] = Marshal.dump(hash[field])
         end
+        key_vals = []
         hash.each do |field, value|
-          @redis.hset(rkey, field, value)
+          key_vals += [field, value]
         end
+        @redis.hmset(rkey, key_vals)
+        @redis.expire(rkey, @expiration)
       end
 
       def delete(key)
diff --git a/spec/core_spec.rb b/spec/core_spec.rb
index 775c79f1..d61d418b 100644
--- a/spec/core_spec.rb
+++ b/spec/core_spec.rb
@@ -227,6 +227,37 @@ module Anemone
         core = Anemone.crawl(@pages[0].url, @opts.merge({:depth_limit => 3}))
         core.should have(4).pages
       end
+
+      it "should stop crawl if requested" do
+        num_pages = 0
+        Anemone.crawl(@pages[0].url, @opts.merge({:pages_queue_limit => 1})) do |anemone|
+          anemone.on_every_page do
+            num_pages += 1
+            anemone.stop_crawl if num_pages == 2
+          end
+        end
+        num_pages.should == 2
+      end
+
+      it "should limit number of links per crawl" do
+        num_pages = 0
+        Anemone.crawl(@pages[0].url, @opts.merge({:links_limit => 0})) do |anemone|
+          anemone.on_every_page do
+            num_pages += 1
+          end
+        end
+        num_pages.should == 1
+      end
+
+      it "should limit pages queue per crawl" do
+        num_pages = 0
+        Anemone.crawl(@pages[0].url, @opts.merge({:pages_queue_limit => 1})) do |anemone|
+          anemone.on_every_page do
+            num_pages += 1
+          end
+        end
+        num_pages.should == 5
+      end
     end
   end
 end
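
For reference, a usage sketch of the features this diff introduces (stop_crawl, :pages_queue_limit, :links_limit, and Redis key expiration). The URL, the limit values, and the 50-page cutoff are illustrative only; the storage line assumes the existing Anemone::Storage.Redis factory and a reachable Redis server.

  require 'anemone'

  pages_seen = 0

  Anemone.crawl("http://example.com/",
                :pages_queue_limit => 100,   # cap on the in-memory page queue (SizedQueue)
                :links_limit => 10_000       # stop queueing new links once the link queue reaches this size
  ) do |anemone|
    # Optional: Redis-backed page store; keys now expire after :key_expiration
    # seconds (DOZEN_HOURS = 43200 by default).
    anemone.storage = Anemone::Storage.Redis(:key_expiration => 3600)

    anemone.on_every_page do |page|
      pages_seen += 1
      # Ask the crawler to stop; the crawl ends once this page is fully
      # processed and anything still queued is discarded.
      anemone.stop_crawl if pages_seen >= 50
    end
  end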