
Commit

add #sitemaps method
MothOnMars committed Feb 20, 2018
1 parent bc3b621 commit b5a0aca
Showing 3 changed files with 26 additions and 3 deletions.
1 change: 1 addition & 0 deletions README.rdoc
@@ -8,6 +8,7 @@ Usage:
robotex = Robotex.new "My User Agent"
robotex.allowed?("http://www.example.com/foo")
robotex.delay!("http://www.example.com/foo") # wait until any specified Crawl-Delay has passed
robotex.sitemaps("http://www.example.com/") # return an array of sitemap urls

== Acknowledgements

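For context, a minimal sketch of the new call in use. The domain and the robots.txt contents are illustrative, not taken from this commit:

    require 'robotex'

    robotex = Robotex.new "My User Agent"
    # If http://www.example.com/robots.txt declares two Sitemap lines,
    # this returns both URLs in the order they appear:
    robotex.sitemaps("http://www.example.com/")
    # => ["http://www.example.com/sitemap_1.xml", "http://www.example.com/sitemap_2.xml"]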
15 changes: 14 additions & 1 deletion lib/robotex.rb
@@ -25,6 +25,7 @@ def initialize(uri, user_agent)
@disallows = {}
@allows = {}
@delays = {}
@sitemaps = []
agent = /.*/
io.each do |line|
next if line =~ /^\s*(#.*|$)/
@@ -43,6 +44,8 @@ def initialize(uri, user_agent)
@disallows[agent] << to_regex(value)
when "crawl-delay"
@delays[agent] = value.to_i
when "sitemap"
@sitemaps << value
end
end

@@ -86,6 +89,10 @@ def delay(user_agent)
end
nil
end

def sitemaps(uri)
@sitemaps
end

protected

@@ -148,5 +155,11 @@ def delay!(uri)
sleep delay - (Time.now - @last_accessed) if !!delay
@last_accessed = Time.now
end


#
# Returns an array of the sitemap urls specified in robots.txt
#
def sitemaps(uri)
parse_host(uri).sitemaps(uri)
end
end
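
Taken together, the lib changes work in two steps: ParsedRobots collects every Sitemap directive while scanning robots.txt (the directive is global rather than scoped to a User-agent, so it bypasses the agent bookkeeping), and the public Robotex#sitemaps delegates to the cached per-host data via parse_host. Note that ParsedRobots#sitemaps accepts a uri argument only to mirror the public signature; the value is unused. A simplified, self-contained model of the parsing rule, not the gem's full loop:

    # Reduced model of the directive loop in ParsedRobots#initialize,
    # keeping only the Sitemap case.
    robots_txt = <<-ROBOTS
      User-agent: *
      Disallow: /private
      Sitemap: http://www.example.com/sitemap_1.xml
      Sitemap: http://www.example.com/sitemap_2.xml
    ROBOTS

    sitemaps = []
    robots_txt.each_line do |line|
      next if line =~ /^\s*(#.*|$)/    # skip comments and blank lines
      key, value = line.split(':', 2)  # split on the first colon so URLs keep theirs
      next if value.nil?
      sitemaps << value.strip if key.strip.downcase == 'sitemap'
    end

    sitemaps
    # => ["http://www.example.com/sitemap_1.xml", "http://www.example.com/sitemap_2.xml"]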
13 changes: 11 additions & 2 deletions spec/robotex_spec.rb
@@ -5,6 +5,9 @@
before(:all) do
FakeWeb.allow_net_connect = false
robots = <<-END
Sitemap: http://www.example.com/sitemap_1.xml
Sitemap: http://www.example.com/sitemap_2.xml
User-Agent: msnbot
Crawl-Delay: 20
@@ -73,15 +76,21 @@
robotex = Robotex.new
robotex.delay(SPEC_DOMAIN).should be_nil
end
end

context 'when Crawl-Delay is specified for the user-agent' do
it 'returns the delay as a Fixnum' do
robotex = Robotex.new('msnbot')
robotex.delay(SPEC_DOMAIN).should == 20
end
end
end
end

describe '#sitemaps' do
it 'returns an array of sitemaps' do
sitemaps = ['http://www.example.com/sitemap_1.xml','http://www.example.com/sitemap_2.xml']
robotex = Robotex.new
robotex.sitemaps(SPEC_DOMAIN).should == sitemaps
end
end
end
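
To exercise the new example locally, something like the following should work, assuming the gem's development dependencies (rspec and fakeweb, which the spec uses to stub robots.txt requests) are installed:

    bundle exec rspec spec/robotex_spec.rb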
