Patch summary (free text ahead of the first "diff --git" header; patch tools skip it).

lib/wayback_machine_downloader.rb
  download_file now transcodes file_remote_info[:file_url] with String#encode
  to the encoding reported by an empty string literal ("".encoding) before the
  URL is used further down the method.
  NOTE(review): String#encode raises if :file_url is nil or holds bytes that
  cannot be converted to the target encoding -- confirm callers never pass such
  values.

test/test_wayback_machine_downloader.rb
  Adds two tests that build a downloader for a base_url containing the
  percent-encoded sequence %C3%84:
    * test_nonascii_suburls_download        calls download_files once.
    * test_nonascii_suburls_already_present calls download_files twice, so the
      second call exercises the "is already present" case named in its comment.
  Neither test contains an assertion; presumably they are meant to fail only
  when download_files raises -- confirm. Both appear to need network access.

NOTE(review): indentation inside the hunks was reconstructed on a 2-space Ruby
convention; verify it against the target files before applying, or apply with
whitespace-tolerant options.

diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index 924d7ac..401ba7a 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -201,7 +201,8 @@ def structure_dir_path dir_path
   end
 
   def download_file file_remote_info
-    file_url = file_remote_info[:file_url]
+    current_encoding = "".encoding
+    file_url = file_remote_info[:file_url].encode(current_encoding)
     file_id = file_remote_info[:file_id]
     file_timestamp = file_remote_info[:timestamp]
     file_path_elements = file_id.split('/')
diff --git a/test/test_wayback_machine_downloader.rb b/test/test_wayback_machine_downloader.rb
index 0e4ceb4..cd5b822 100644
--- a/test/test_wayback_machine_downloader.rb
+++ b/test/test_wayback_machine_downloader.rb
@@ -89,5 +89,19 @@ def test_file_list_exclude_filter_with_a_regex
     @wayback_machine_downloader.all = true
     assert_equal 69, @wayback_machine_downloader.get_file_list_curated.size
   end
+
+  # Testing encoding conflicts needs a different base_url
+  def test_nonascii_suburls_download
+    @wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'https://en.wikipedia.org/wiki/%C3%84'
+    # Once just for the downloading...
+    @wayback_machine_downloader.download_files
+  end
+
+  def test_nonascii_suburls_already_present
+    @wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'https://en.wikipedia.org/wiki/%C3%84'
+    # ... twice to test the "is already present" case
+    @wayback_machine_downloader.download_files
+    @wayback_machine_downloader.download_files
+  end
 
 end