Commit 68c155b
follow robots.txt redirects
MothOnMars committed May 4, 2018
1 parent 99b8962 commit 68c155b
Showing 3 changed files with 45 additions and 21 deletions.
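In short: Robotex previously fetched /robots.txt with plain OpenURI, which follows same-scheme redirects but raises "redirection forbidden" when the scheme changes (an http:// robots.txt that 301s to its https:// counterpart), so the fetch was rescued to nil and the crawler fell back to allowing everything. Requiring open_uri_redirections and passing allow_redirections: :all makes the fetch follow the whole redirect chain instead. A minimal sketch of the difference, assuming open_uri_redirections 0.2.x; the example.com URL and user agent are illustrative:

    require 'open-uri'
    require 'open_uri_redirections'

    url = 'http://example.com/robots.txt'

    # Plain OpenURI raises RuntimeError ("redirection forbidden") if this
    # URL redirects across schemes, e.g. to https://example.com/robots.txt.
    #
    # With open_uri_redirections loaded, the :allow_redirections option
    # opts in to following the chain (:safe permits http -> https only,
    # :all also permits https -> http):
    io = URI.parse(url).open('User-Agent' => 'MyCrawler/1.0',
                             allow_redirections: :all)
    puts io.base_uri  # final URI after any redirects
    puts io.read      # body of the robots.txt actually served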
lib/robotex.rb: 4 additions, 3 deletions

@@ -5,6 +5,7 @@
 require 'open-uri'
 require 'uri'
 require 'timeout'
+require 'open_uri_redirections'
 
 class Robotex
 
@@ -17,7 +18,7 @@ class ParsedRobots

   def initialize(uri, user_agent)
     io = Robotex.get_robots_txt(uri, user_agent)
 
     if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
       io = StringIO.new("User-agent: *\nAllow: /\n")
     end
@@ -100,8 +101,8 @@ def to_regex(pattern)
   def self.get_robots_txt(uri, user_agent)
     begin
       Timeout::timeout(Robotex.timeout) do
-        io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
-      end
+        URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent, allow_redirections: :all) rescue nil
+      end
     rescue Timeout::Error
       STDERR.puts "robots.txt request timed out"
     end
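With the new option in place, get_robots_txt hands back the OpenURI handle for the final destination of the redirect chain (or nil if the fetch fails or times out), and callers are unaffected. A hedged sketch of the public API, assuming the optional user-agent argument to Robotex.new; example.com is illustrative:

    require 'robotex'

    robotex = Robotex.new('MyCrawler/1.0')

    # Works even when http://example.com/robots.txt 301s to the https
    # version: the rules served at the redirect target are applied.
    robotex.allowed?('http://example.com/some/path')  # => true or false
    robotex.delay('http://example.com/')              # => Crawl-Delay or nil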
robotex.gemspec: 2 additions, 0 deletions

@@ -10,6 +10,8 @@ spec = Gem::Specification.new do |s|
   s.rdoc_options << '-m' << 'README.rdoc' << '-t' << 'Robotex'
   s.extra_rdoc_files = ["README.rdoc"]
 
+  s.add_runtime_dependency "open_uri_redirections", "~> 0.2.1"
+
   s.add_development_dependency "rake", ">=0.9.2"
   s.add_development_dependency "rdoc", ">=3.12"
   s.add_development_dependency "rspec", ">=2.8.0"
spec/robotex_spec.rb: 39 additions, 18 deletions

@@ -1,25 +1,30 @@
 require 'spec_helper'
 
 describe Robotex do
+  let(:robots) do
+    <<~END
+      User-Agent: msnbot
+      Crawl-Delay: 20
+      User-Agent: bender
+      Disallow: /my_shiny_metal_ass
+      User-Agent: *
+      Disallow: /login
+      Allow: /
+      Disallow: /locked
+      Allow: /locked
+    END
+  end
+
+  let(:response) do
+    { body: robots, content_type: 'text/plain', status: [200, "OK"] }
+  end
+
-  before(:all) do
+  before do
     FakeWeb.allow_net_connect = false
-    robots = <<-END
-User-Agent: msnbot
-Crawl-Delay: 20
-User-Agent: bender
-Disallow: /my_shiny_metal_ass
-User-Agent: *
-Disallow: /login
-Allow: /
-Disallow: /locked
-Allow: /locked
-END
-    options = {:body => robots, :content_type => 'text/plain', :status => [200, "OK"]}
-    FakeWeb.register_uri(:get, SPEC_DOMAIN + 'robots.txt', options)
+    FakeWeb.register_uri(:get, SPEC_DOMAIN + 'robots.txt', response)
   end
 
   describe '#initialize' do
@@ -65,13 +70,29 @@
         robotex.allowed?(SPEC_DOMAIN + 'locked').should be_false
       end
     end
 
+    context 'when the robots.txt url is redirected' do
+      let(:redirection) do
+        { status: [301], location: 'https://example.com/robots.txt' }
+      end
+
+      before do
+        FakeWeb.register_uri(:get, SPEC_DOMAIN + 'robots.txt', redirection)
+        FakeWeb.register_uri(:get, 'https://example.com/robots.txt', response)
+      end
+
+      it 'returns false' do
+        robotex = Robotex.new
+        robotex.allowed?(SPEC_DOMAIN + 'locked').should be_false
+      end
+    end
   end
 
   describe '#delay' do
     context 'when no Crawl-Delay is specified for the user-agent' do
       it 'returns nil' do
         robotex = Robotex.new
-          robotex.delay(SPEC_DOMAIN).should be_nil
+        robotex.delay(SPEC_DOMAIN).should be_nil
       end
 
     context 'when Crawl-Delay is specified for the user-agent' do
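Two details of the spec refactor worth noting: the robots body and canned response move into RSpec let definitions so the new redirect context can reuse response for the target URL, and the heredoc switches from <<- to the squiggly <<~ form (Ruby 2.3+), which strips the common leading indentation from the body, whereas <<- merely allows an indented terminator. A standalone comparison:

    plain = <<-END
      indented
    END
    squiggly = <<~END
      indented
    END

    plain     # => "      indented\n"  (leading whitespace preserved)
    squiggly  # => "indented\n"        (common indentation stripped)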
