Commit

add #sitemaps method
MothOnMars committed Jun 18, 2018
1 parent bc3b621 commit 21fd4d7
Showing 3 changed files with 59 additions and 20 deletions.
README.rdoc: 1 change (1 addition & 0 deletions)
@@ -8,6 +8,7 @@ Usage:
robotex = Robotex.new "My User Agent"
robotex.allowed?("http://www.example.com/foo")
robotex.delay!("http://www.example.com/foo") # wait until any specified Crawl-Delay has passed
robotex.sitemaps("http://www.example.com/") # return an array of sitemap urls

== Acknowledgements

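To make the new README line concrete, here is a small usage sketch. The robots.txt contents and return value below are illustrative examples (they mirror the URLs used in the specs), not output from a real site:

  require 'robotex'

  # Suppose http://www.example.com/robots.txt contains:
  #   Sitemap: http://www.example.com/sitemap_1.xml
  #   Sitemap: http://www.example.com/sitemap_2.xml
  robotex = Robotex.new "My User Agent"
  robotex.sitemaps("http://www.example.com/")
  # => ["http://www.example.com/sitemap_1.xml", "http://www.example.com/sitemap_2.xml"]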
lib/robotex.rb: 13 changes (12 additions & 1 deletion)
@@ -15,6 +15,8 @@ class Robotex

class ParsedRobots

attr_reader :sitemaps

def initialize(uri, user_agent)
io = Robotex.get_robots_txt(uri, user_agent)

@@ -25,6 +27,7 @@ def initialize(uri, user_agent)
@disallows = {}
@allows = {}
@delays = {}
@sitemaps = []
agent = /.*/
io.each do |line|
next if line =~ /^\s*(#.*|$)/
@@ -43,6 +46,8 @@ def initialize(uri, user_agent)
@disallows[agent] << to_regex(value)
when "crawl-delay"
@delays[agent] = value.to_i
when "sitemap"
@sitemaps << URI.join(uri, value).to_s
end
end
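Sitemap values are resolved with URI.join against the robots.txt URI, so relative sitemap paths are returned as absolute URLs. A quick sketch of that resolution (example values only):

  require 'uri'

  URI.join('http://www.example.com/', '/relative.xml').to_s
  # => "http://www.example.com/relative.xml"

  # Absolute sitemap URLs pass through unchanged.
  URI.join('http://www.example.com/', 'http://www.example.com/sitemap_1.xml').to_s
  # => "http://www.example.com/sitemap_1.xml"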

@@ -148,5 +153,11 @@ def delay!(uri)
sleep delay - (Time.now - @last_accessed) if !!delay
@last_accessed = Time.now
end


#
# Returns an array of the sitemap urls specified in robots.txt
#
def sitemaps(uri)
parse_host(uri).sitemaps
end
end
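The new public #sitemaps method delegates to parse_host, the same helper behind #allowed? and #delay, so one Robotex instance can answer both permission and sitemap queries for a host. A minimal call-pattern sketch (assuming parse_host caches the parsed robots.txt per host; the return value is illustrative):

  robotex = Robotex.new("My Crawler")
  robotex.allowed?("http://www.example.com/foo")  # fetches and parses robots.txt
  robotex.sitemaps("http://www.example.com/")     # reads sitemaps from the parsed data
  # => ["http://www.example.com/sitemap_1.xml", "http://www.example.com/sitemap_2.xml"]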
spec/robotex_spec.rb: 65 changes (46 additions & 19 deletions)
@@ -1,25 +1,30 @@
require 'spec_helper'

describe Robotex do
-  before(:all) do
-    robots = <<-END
-User-Agent: msnbot
-Crawl-Delay: 20
-User-Agent: bender
-Disallow: /my_shiny_metal_ass
-User-Agent: *
-Disallow: /login
-Allow: /
-Disallow: /locked
-Allow: /locked
-END
-    options = {:body => robots, :content_type => 'text/plain', :status => [200, "OK"]}
+  let(:robots) do
+    <<~ROBOTS
+      User-Agent: msnbot
+      Crawl-Delay: 20
+      User-Agent: bender
+      Disallow: /my_shiny_metal_ass
+      User-Agent: *
+      Disallow: /login
+      Allow: /
+      Disallow: /locked
+      Allow: /locked
+    ROBOTS
+  end
+
+  let(:response) do
+    { body: robots, content_type: 'text/plain', status: [200, "OK"] }
+  end
+
+  before do
     FakeWeb.allow_net_connect = false
-    FakeWeb.register_uri(:get, SPEC_DOMAIN + 'robots.txt', options)
+    FakeWeb.register_uri(:get, SPEC_DOMAIN + 'robots.txt', response)
   end

describe '#initialize' do
@@ -73,15 +78,37 @@
robotex = Robotex.new
robotex.delay(SPEC_DOMAIN).should be_nil
end
end

context 'when Crawl-Delay is specified for the user-agent' do
it 'returns the delay as a Fixnum' do
robotex = Robotex.new('msnbot')
robotex.delay(SPEC_DOMAIN).should == 20
end
end
end
end

end
describe '#sitemaps' do
let(:robots) do
<<~ROBOTS
Sitemap: http://www.example.com/sitemap_1.xml
Sitemap: http://www.example.com/sitemap_2.xml
ROBOTS
end

it 'returns an array of sitemaps' do
robotex = Robotex.new
robotex.sitemaps(SPEC_DOMAIN).should == %w[http://www.example.com/sitemap_1.xml
http://www.example.com/sitemap_2.xml]
end

context 'when the sitemap url is relative' do
let(:robots) { 'Sitemap: /relative.xml' }

it 'returns the sitemap' do
robotex = Robotex.new
robotex.sitemaps(SPEC_DOMAIN).should == ['http://www.example.com/relative.xml']
end
end
end
end
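Since the FakeWeb stub is built from let(:robots) via let(:response), a nested context only has to override robots to serve a different robots.txt body. As a hypothetical illustration (not part of this commit), an empty robots.txt could be covered the same way:

  context 'when robots.txt lists no sitemaps' do
    let(:robots) { '' }

    it 'returns an empty array' do
      robotex = Robotex.new
      robotex.sitemaps(SPEC_DOMAIN).should == []
    end
  end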
