Skip to content

Commit

Permalink
Merge branch 'p-master'
Browse files Browse the repository at this point in the history
  • Loading branch information
hartator committed Jun 12, 2017
2 parents 5b0ed26 + af8ef28 commit e0982e9
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 32 deletions.
10 changes: 7 additions & 3 deletions bin/wayback_machine_downloader
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ option_parser = OptionParser.new do |opts|
options[:to_timestamp] = t
end

opts.on("-e", "--exact_url", String, "Download only the url provided and not the full site") do |t|
options[:only_filter] = t
end

opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
options[:only_filter] = t
end
Expand All @@ -42,11 +46,11 @@ option_parser = OptionParser.new do |opts|
options[:threads_count] = t
end

opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page ") do |t|
opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
options[:maximum_pages] = t
end

opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything.") do |t|
opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
options[:list] = true
end

Expand All @@ -58,7 +62,7 @@ end.parse!
if (base_url = ARGV[-1])
options[:base_url] = base_url
wayback_machine_downloader = WaybackMachineDownloader.new options
if wayback_machine_downloader.list
if options[:list]
wayback_machine_downloader.list_files
else
wayback_machine_downloader.download_files
Expand Down
33 changes: 19 additions & 14 deletions lib/wayback_machine_downloader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,19 @@ class WaybackMachineDownloader

VERSION = "2.0.0"

attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count
attr_accessor :base_url, :exact_url, :directory,
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
:all, :maximum_pages, :threads_count

def initialize params
@base_url = params[:base_url]
@exact_url = params[:exact_url]
@directory = params[:directory]
@from_timestamp = params[:from_timestamp].to_i
@to_timestamp = params[:to_timestamp].to_i
@only_filter = params[:only_filter]
@exclude_filter = params[:exclude_filter]
@all = params[:all]
@list = params[:list]
@maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
@threads_count = params[:threads_count].to_i
end
Expand Down Expand Up @@ -78,18 +80,19 @@ def match_exclude_filter file_url
end

def get_all_snapshots_to_consider
# Note: Passing a page index parameter allow us to get more snapshots, but from a less fresh index
# Note: Passing a page index parameter allows us to get more snapshots,
# but from a less fresh index
print "Getting snapshot pages"
snapshot_list_to_consider = ""
snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
print "."
snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil)
print "."
@maximum_pages.times do |page_index|
snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
break if snapshot_list.empty?
snapshot_list_to_consider += snapshot_list
print "."
unless @exact_url
@maximum_pages.times do |page_index|
snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
break if snapshot_list.empty?
snapshot_list_to_consider += snapshot_list
print "."
end
end
puts " found #{snapshot_list_to_consider.lines.count} snapshots to consider."
puts
Expand Down Expand Up @@ -134,8 +137,10 @@ def get_file_list_by_timestamp
end

def list_files
# retrieval produces its own output
files = get_file_list_by_timestamp
puts "["
get_file_list_by_timestamp.each do |file|
files.each do |file|
puts file.to_json + ","
end
puts "]"
Expand Down Expand Up @@ -179,7 +184,7 @@ def download_files

def structure_dir_path dir_path
begin
FileUtils::mkdir_p dir_path unless File.exists? dir_path
FileUtils::mkdir_p dir_path unless File.exist? dir_path
rescue Errno::EEXIST => e
error_to_string = e.to_s
puts "# #{error_to_string}"
Expand Down Expand Up @@ -219,7 +224,7 @@ def download_file file_remote_info
if Gem.win_platform?
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
end
unless File.exists? file_path
unless File.exist? file_path
begin
structure_dir_path dir_path
open(file_path, "wb") do |file|
Expand All @@ -240,7 +245,7 @@ def download_file file_remote_info
rescue StandardError => e
puts "#{file_url} # #{e}"
ensure
if not @all and File.exists?(file_path) and File.size(file_path) == 0
if not @all and File.exist?(file_path) and File.size(file_path) == 0
File.delete(file_path)
puts "#{file_path} was empty and was removed."
end
Expand Down
14 changes: 7 additions & 7 deletions lib/wayback_machine_downloader/archive_api.rb
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
module ArchiveAPI

def get_raw_list_from_api url, page_index
request_url = "http://web.archive.org/cdx/search/xd?url="
request_url += url
request_url += parameters_for_api page_index
def get_raw_list_from_api url, page_index
request_url = "http://web.archive.org/cdx/search/xd?url="
request_url += url
request_url += parameters_for_api page_index

open(request_url).read
end
end

def parameters_for_api page_index
parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
def parameters_for_api page_index
parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
if @all
parameters += ""
else
Expand Down
6 changes: 3 additions & 3 deletions lib/wayback_machine_downloader/tidy_bytes.rb
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def tidy_bytes(force = false)
bytes.each_index do |i|

byte = bytes[i]
is_ascii = byte < 128
_is_ascii = byte < 128
is_cont = byte > 127 && byte < 192
is_lead = byte > 191 && byte < 245
is_unused = byte > 240
Expand All @@ -78,7 +78,7 @@ def tidy_bytes(force = false)
# the leading byte.
begin
(1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
rescue NoMethodError => e
rescue NoMethodError
next
end
conts_expected = 0
Expand All @@ -98,7 +98,7 @@ def tidy_bytes(force = false)
end
begin
bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
rescue ArgumentError => e
rescue ArgumentError
nil
end
end
Expand Down
2 changes: 1 addition & 1 deletion lib/wayback_machine_downloader/to_regex.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def literal?(str)
# @option options [true,false] :lang /foo/[nesu]
def to_regex(options = {})
if args = as_regexp(options)
::Regexp.new *args
::Regexp.new(*args)
end
end

Expand Down
21 changes: 17 additions & 4 deletions test/test_wayback_machine_downloader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
class WaybackMachineDownloaderTest < Minitest::Test

def setup
@wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'http://www.onlyfreegames.net'
@wayback_machine_downloader = WaybackMachineDownloader.new(
base_url: 'http://www.onlyfreegames.net')
$stdout = StringIO.new
end

Expand Down Expand Up @@ -38,6 +39,16 @@ def test_file_list_by_timestamp
assert_equal file_expected, @wayback_machine_downloader.get_file_list_by_timestamp[-2]
end

def test_without_exact_url
@wayback_machine_downloader.exact_url = false
assert @wayback_machine_downloader.get_file_list_curated.size > 1
end

def test_exact_url
@wayback_machine_downloader.exact_url = true
assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size
end

def test_file_list_only_filter_without_matches
@wayback_machine_downloader.only_filter = 'abc123'
assert_equal 0, @wayback_machine_downloader.get_file_list_curated.size
Expand Down Expand Up @@ -85,20 +96,22 @@ def test_to_timestamp_being_respected
assert_nil @wayback_machine_downloader.get_file_list_curated["linux.htm"]
end

def test_file_list_exclude_filter_with_a_regex
def test_all_get_file_list_curated_size
@wayback_machine_downloader.all = true
assert_equal 69, @wayback_machine_downloader.get_file_list_curated.size
end

# Testing encoding conflicts needs a different base_url
def test_nonascii_suburls_download
@wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'https://en.wikipedia.org/wiki/%C3%84'
@wayback_machine_downloader = WaybackMachineDownloader.new(
base_url: 'https://en.wikipedia.org/wiki/%C3%84')
# Once just for the downloading...
@wayback_machine_downloader.download_files
end

def test_nonascii_suburls_already_present
@wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'https://en.wikipedia.org/wiki/%C3%84'
@wayback_machine_downloader = WaybackMachineDownloader.new(
base_url: 'https://en.wikipedia.org/wiki/%C3%84')
# ... twice to test the "is already present" case
@wayback_machine_downloader.download_files
@wayback_machine_downloader.download_files
Expand Down

0 comments on commit e0982e9

Please sign in to comment.