From 2f1b83f80b90d7ad726c8ce9dee40564ade5c5e3 Mon Sep 17 00:00:00 2001
From: efrat-safanov
Date: Thu, 15 Nov 2012 18:41:02 +0200
Subject: [PATCH 1/8] add expiration to Redis storage engine

---
 lib/anemone/storage/redis.rb | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/anemone/storage/redis.rb b/lib/anemone/storage/redis.rb
index f063a5f4..b1627cf5 100644
--- a/lib/anemone/storage/redis.rb
+++ b/lib/anemone/storage/redis.rb
@@ -5,10 +5,12 @@ module Storage
     class Redis
 
       MARSHAL_FIELDS = %w(links visited fetched)
+      DOZEN_HOURS = 43200
 
       def initialize(opts = {})
         @redis = ::Redis.new(opts)
         @key_prefix = opts[:key_prefix] || 'anemone'
+        @expiration = opts[:key_expiration] || DOZEN_HOURS
         keys.each { |key| delete(key) }
       end
 
@@ -26,6 +28,7 @@ def []=(key, value)
         hash.each do |field, value|
           @redis.hset(rkey, field, value)
         end
+        @redis.expire(rkey, @expiration)
       end
 
       def delete(key)

From 607c7a711b7cd682c6ebbe90ed090c33565480b7 Mon Sep 17 00:00:00 2001
From: efrat-safanov
Date: Thu, 15 Nov 2012 18:47:50 +0200
Subject: [PATCH 2/8] adding patch from gnapse/anemone

---
 lib/anemone/core.rb | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/lib/anemone/core.rb b/lib/anemone/core.rb
index d1629a49..1f760b91 100644
--- a/lib/anemone/core.rb
+++ b/lib/anemone/core.rb
@@ -79,6 +79,7 @@ def initialize(urls, opts = {})
       @skip_link_patterns = []
       @after_crawl_blocks = []
       @opts = opts
+      @stop_crawl = false
 
       yield self if block_given?
     end
@@ -142,6 +143,18 @@ def focus_crawl(&block)
       self
     end
 
+    #
+    # Signals the crawler that it should stop the crawl before visiting the
+    # next page.
+    #
+    # This method is expected to be called within a page block, and it signals
+    # the crawler that it must stop after the current page is completely
+    # processed. All pages and links currently in the queue are discarded.
+    #
+    def stop_crawl
+      @stop_crawl = true
+    end
+
     #
     # Perform the crawl
     #
@@ -175,12 +188,17 @@ def run
 
         @pages[page.url] = page
 
+        if @stop_crawl
+          page_queue.clear
+          link_queue.clear
+        end
+
         # if we are done with the crawl, tell the threads to end
         if link_queue.empty? and page_queue.empty?
           until link_queue.num_waiting == @tentacles.size
             Thread.pass
           end
-          if page_queue.empty?
+          if page_queue.empty? || @stop_crawl
             @tentacles.size.times { link_queue << :END }
             break
           end

From 111b64f3ee19fee124dff358ab4451df278e6f91 Mon Sep 17 00:00:00 2001
From: efrat-safanov
Date: Thu, 15 Nov 2012 18:51:12 +0200
Subject: [PATCH 3/8] add test for stop crawl

---
 spec/core_spec.rb | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/spec/core_spec.rb b/spec/core_spec.rb
index 775c79f1..0a82832c 100644
--- a/spec/core_spec.rb
+++ b/spec/core_spec.rb
@@ -227,6 +227,17 @@ module Anemone
       core = Anemone.crawl(@pages[0].url, @opts.merge({:depth_limit => 3}))
       core.should have(4).pages
     end
+
+    it "should stop crawl if requested" do
+      num_pages = 0
+      Anemone.crawl(@pages[0].url) do |anemone|
+        anemone.on_every_page do
+          num_pages += 1
+          anemone.stop_crawl
+        end
+      end
+      num_pages.should == 1
+    end
   end
 
 end

From 0e7f2f54d3418db5b26edd24eb68262eb874f284 Mon Sep 17 00:00:00 2001
From: Efrat Blaier
Date: Thu, 15 Nov 2012 19:00:47 +0200
Subject: [PATCH 4/8] Update README.rdoc

---
 README.rdoc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.rdoc b/README.rdoc
index a115b186..39084301 100644
--- a/README.rdoc
+++ b/README.rdoc
@@ -12,6 +12,7 @@ See http://anemone.rubyforge.org for more information.
 * Built-in BFS algorithm for determining page depth
 * Allows exclusion of URLs based on regular expressions
 * Choose the links to follow on each page with focus_crawl()
+* Allows stopping the crawl
 * HTTPS support
 * Records response time for each page
 * CLI program can list all pages in a domain, calculate page depths, and more

From b97baa0b2e420c7d229c679d8adf99a006ffb273 Mon Sep 17 00:00:00 2001
From: efrat-safanov
Date: Thu, 22 Nov 2012 12:40:43 +0200
Subject: [PATCH 5/8] add memory limits on queues

---
 lib/anemone/core.rb | 17 ++++++++++++-----
 spec/core_spec.rb   | 20 ++++++++++++++++++++
 2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/lib/anemone/core.rb b/lib/anemone/core.rb
index 1f760b91..5d7e58da 100644
--- a/lib/anemone/core.rb
+++ b/lib/anemone/core.rb
@@ -55,7 +55,11 @@ class Core
       # proxy server port number
       :proxy_port => false,
       # HTTP read timeout in seconds
-      :read_timeout => nil
+      :read_timeout => nil,
+      # limit the size of the crawled-pages queue
+      :pages_queue_limit => 1000,
+      # limit the number of unique links allowed per crawl (TODO: move links queue to external storage)
+      :links_limit => 500000
     }
 
     # Create setter methods for all options to be called from the crawl block
@@ -165,7 +169,7 @@ def run
       return if @urls.empty?
 
       link_queue = Queue.new
-      page_queue = Queue.new
+      page_queue = SizedQueue.new(@opts[:pages_queue_limit])
 
       @opts[:threads].times do
         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
@@ -181,10 +185,13 @@ def run
         page.discard_doc! if @opts[:discard_page_bodies]
 
         links = links_to_follow page
-        links.each do |link|
-          link_queue << [link, page.url.dup, page.depth + 1]
+        if link_queue.num_waiting < @opts[:links_limit]
+          links.each do |link|
+            link_queue << [link, page.url.dup, page.depth + 1]
+          end
+          @pages.touch_keys links
         end
-        @pages.touch_keys links
+
 
         @pages[page.url] = page
 
diff --git a/spec/core_spec.rb b/spec/core_spec.rb
index 0a82832c..5e60137b 100644
--- a/spec/core_spec.rb
+++ b/spec/core_spec.rb
@@ -238,6 +238,26 @@ module Anemone
       end
       num_pages.should == 1
     end
+
+    it "should limit number of links per crawl" do
+      num_pages = 0
+      Anemone.crawl(@pages[0].url, @opts.merge({:links_limit => 0})) do |anemone|
+        anemone.on_every_page do
+          num_pages += 1
+        end
+      end
+      num_pages.should == 1
+    end
+
+    it "should limit pages queue per crawl" do
+      num_pages = 0
+      Anemone.crawl(@pages[0].url, @opts.merge({:pages_queue_limit => 1})) do |anemone|
+        anemone.on_every_page do
+          num_pages += 1
+        end
+      end
+      num_pages.should == 5
+    end
   end
 
 end

From 171385835b5b1b6248b735edaeb3882e7bd530f6 Mon Sep 17 00:00:00 2001
From: efrat-safanov
Date: Thu, 22 Nov 2012 17:32:17 +0200
Subject: [PATCH 6/8] improve redis performance

---
 lib/anemone/storage/redis.rb | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/anemone/storage/redis.rb b/lib/anemone/storage/redis.rb
index b1627cf5..2a430a74 100644
--- a/lib/anemone/storage/redis.rb
+++ b/lib/anemone/storage/redis.rb
@@ -25,9 +25,11 @@ def []=(key, value)
         MARSHAL_FIELDS.each do |field|
           hash[field] = Marshal.dump(hash[field])
         end
+        key_vals = []
         hash.each do |field, value|
-          @redis.hset(rkey, field, value)
+          key_vals += [field, value]
         end
+        @redis.hmset(rkey, key_vals)
         @redis.expire(rkey, @expiration)
       end

From 98d6b159328603e73373ba20266e86792a1da8ba Mon Sep 17 00:00:00 2001
From: efrat-safanov
Date: Sun, 25 Nov 2012 11:05:25 +0200
Subject: [PATCH 7/8] changing stop_crawl behaviour to stop new links, but
 finish processing all pages in queue

---
 lib/anemone/core.rb | 11 +++++------
 spec/core_spec.rb   |  6 +++---
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/lib/anemone/core.rb b/lib/anemone/core.rb
index 5d7e58da..d1d88266 100644
--- a/lib/anemone/core.rb
+++ b/lib/anemone/core.rb
@@ -180,23 +180,22 @@ def run
       loop do
         page = page_queue.deq
         @pages.touch_key page.url
-        puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+        puts "#{page.url} Queue: #{link_queue.size} PageQueue: #{page_queue.size}" if @opts[:verbose]
         do_page_blocks page
         page.discard_doc! if @opts[:discard_page_bodies]
 
-        links = links_to_follow page
-        if link_queue.num_waiting < @opts[:links_limit]
+
+        if link_queue.size < @opts[:links_limit] and !@stop_crawl
+          links = links_to_follow page
          links.each do |link|
            link_queue << [link, page.url.dup, page.depth + 1]
          end
          @pages.touch_keys links
        end
-
 
        @pages[page.url] = page
 
        if @stop_crawl
-          page_queue.clear
          link_queue.clear
        end
 
@@ -205,7 +204,7 @@ def run
           until link_queue.num_waiting == @tentacles.size
             Thread.pass
           end
-          if page_queue.empty? || @stop_crawl
+          if page_queue.empty?
             @tentacles.size.times { link_queue << :END }
             break
           end

diff --git a/spec/core_spec.rb b/spec/core_spec.rb
index 5e60137b..d61d418b 100644
--- a/spec/core_spec.rb
+++ b/spec/core_spec.rb
@@ -230,13 +230,13 @@ module Anemone
 
     it "should stop crawl if requested" do
       num_pages = 0
-      Anemone.crawl(@pages[0].url) do |anemone|
+      Anemone.crawl(@pages[0].url, @opts.merge({:pages_queue_limit => 1})) do |anemone|
         anemone.on_every_page do
           num_pages += 1
-          anemone.stop_crawl
+          anemone.stop_crawl if num_pages == 2
         end
       end
-      num_pages.should == 1
+      num_pages.should == 2
     end
 
     it "should limit number of links per crawl" do

From 6c47adaeefa0a7f435e12126991a9a74d90ea2bb Mon Sep 17 00:00:00 2001
From: efrat-safanov
Date: Sun, 25 Nov 2012 19:16:43 +0200
Subject: [PATCH 8/8] deadlock fix - pages queue could be refilled by waiting
 threads

---
 lib/anemone/core.rb | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/anemone/core.rb b/lib/anemone/core.rb
index 5d7e58da..0720235c 100644
--- a/lib/anemone/core.rb
+++ b/lib/anemone/core.rb
@@ -204,6 +204,7 @@ def run
         if link_queue.empty? and page_queue.empty?
           until link_queue.num_waiting == @tentacles.size
             Thread.pass
+            break unless page_queue.empty? # page queue could be refilled by waiting threads
           end
           if page_queue.empty? || @stop_crawl
             @tentacles.size.times { link_queue << :END }
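
Taken together, the series exposes three new options (:key_expiration on the Redis
storage engine, :pages_queue_limit and :links_limit on the crawler) plus the
stop_crawl method. A minimal usage sketch against the patched API follows; the URL,
the expiration value, and the stop condition are illustrative placeholders, not
values taken from the patches:

    require 'anemone'
    require 'anemone/storage'

    Anemone.crawl('http://example.com/',
                  # Redis-backed page store; crawl keys expire after an hour
                  # instead of the DOZEN_HOURS default (patch 1)
                  :storage => Anemone::Storage.Redis(:key_expiration => 3600),
                  # bound the in-memory page queue with a SizedQueue (patch 5)
                  :pages_queue_limit => 500,
                  # stop queueing new links past this count (patch 5)
                  :links_limit => 100_000) do |anemone|
      anemone.on_every_page do |page|
        # after patch 7, stop_crawl stops new links from being queued but
        # lets pages already in the queue finish processing
        anemone.stop_crawl if page.url.to_s.include?('/stop-here')
      end
    end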
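
On patch 6: the original []= issued one HSET per hash field, paying a network
round trip per field, while batching into a single HMSET sends all fields at once.
A rough before/after sketch with the redis-rb client; the key and field values are
illustrative only (the field names come from MARSHAL_FIELDS):

    require 'redis'

    redis = Redis.new
    fields = { 'links' => '...', 'visited' => '...', 'fetched' => '...' }

    # before patch 6: one round trip per field
    fields.each { |field, value| redis.hset('anemone:page', field, value) }

    # after patch 6: one round trip for all fields
    redis.hmset('anemone:page', *fields.to_a.flatten)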