spider fix: use internal download utilities for robots.txt (#590)
* spider fix: use internal download utilities for robots.txt

* separate function and tests
adbar committed May 8, 2024
1 parent efe38bb commit 92bdd6e
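
In short: before this change, urllib.robotparser's RobotFileParser.read() opened its own connection to fetch robots.txt; the spider now downloads the file with trafilatura's own download utilities and hands the text to a separate parse_robots helper. A minimal sketch of the resulting flow, using the public fetch_url function and the parse_robots function added in this commit (can_fetch is the standard RobotFileParser method; the example URL is illustrative):

from trafilatura import fetch_url
from trafilatura.spider import parse_robots

robots_url = "https://example.org/robots.txt"
data = fetch_url(robots_url)  # download through trafilatura instead of urllib's opener
rules = parse_robots(robots_url, data) if data is not None else None
if rules is None or rules.can_fetch("*", "https://example.org/some/page"):
    print("crawling this URL is not forbidden by robots.txt")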
Showing 2 changed files with 34 additions and 12 deletions.
13 changes: 11 additions & 2 deletions tests/spider_tests.py
@@ -13,8 +13,7 @@
 from courlan import UrlStore
 
 from trafilatura import spider
-from trafilatura.settings import DEFAULT_CONFIG
-from trafilatura.utils import LANGID_FLAG
+# from trafilatura.utils import LANGID_FLAG
 
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
@@ -155,10 +154,20 @@ def test_focused_crawler():
     ## assert sorted(todo) == ['https://httpbun.com/links/1/0']
 
 
+def test_robots():
+    "Test robots.txt parsing"
+    robots_url = "https://example.org/robots.txt"
+    assert spider.parse_robots(robots_url, None) is None
+    assert spider.parse_robots(robots_url, 123) is None
+    assert spider.parse_robots(robots_url, b"123") is None
+    assert spider.parse_robots(robots_url, "Allow: *") is not None
+
+
 if __name__ == '__main__':
     test_redirections()
     test_meta_redirections()
     test_process_links()
     test_crawl_logic()
     test_crawl_page()
     test_focused_crawler()
+    test_robots()
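
For context on these assertions: parse_robots (added in spider.py below) hands the downloaded text to the standard library parser, so a plain string such as "Allow: *" produces a parser object, while None, an integer, or raw bytes make the parse step raise and the helper return None. A standalone sketch of the underlying stdlib behaviour (URLs and rules are made up for illustration):

import urllib.robotparser

parser = urllib.robotparser.RobotFileParser()
parser.set_url("https://example.org/robots.txt")
# parse() expects an iterable of text lines, hence data.splitlines() in parse_robots
parser.parse("User-agent: *\nDisallow: /private/".splitlines())
print(parser.can_fetch("*", "https://example.org/private/page"))  # False
print(parser.can_fetch("*", "https://example.org/public/page"))   # True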
33 changes: 23 additions & 10 deletions trafilatura/spider.py
@@ -119,27 +119,40 @@ def process_response(response, base_url, language, rules=None):
     process_links(htmlstring, base_url, language=language, rules=rules)
 
 
+def parse_robots(robots_url, data):
+    "Parse a robots.txt file with the standard library urllib.robotparser."
+    # https://github.com/python/cpython/blob/main/Lib/urllib/robotparser.py
+    rules = urllib.robotparser.RobotFileParser()
+    rules.set_url(robots_url)
+    # exceptions happening here
+    try:
+        rules.parse(data.splitlines())
+    except Exception as exc:
+        LOGGER.error("cannot read robots.txt: %s", exc)
+        return None
+    return rules
+
+
 def init_crawl(homepage, todo, known_links, language=None, rules=None):
-    """Start crawl by initializing variables and potentially examining the starting page."""
+    "Start crawl by initializing variables and potentially examining the starting page."
     # config=DEFAULT_CONFIG
     _, base_url = get_hostinfo(homepage)
     if base_url is None or len(base_url) < 1:
-        raise ValueError(f'cannot crawl homepage: {homepage}')
+        raise ValueError(f"cannot crawl homepage: {homepage}")
+
     # TODO: just known or also visited?
     if known_links is not None:
         URL_STORE.add_urls(urls=known_links, visited=True)
     i = 0
+
     # fetch and parse robots.txt file if necessary
     if rules is None:
-        rules = urllib.robotparser.RobotFileParser()
-        rules.set_url(base_url + '/robots.txt')
-        # exceptions happening here
-        try:
-            rules.read()
-        except Exception as exc:
-            LOGGER.error('cannot read robots.txt: %s', exc)
-            rules = None
+        robots_url = base_url + "/robots.txt"
+        data = fetch_url(robots_url)
+        if data is not None:
+            rules = parse_robots(robots_url, data)
     URL_STORE.store_rules(base_url, rules)
+
     # initialize crawl by visiting homepage if necessary
     if todo is None:
         URL_STORE.add_urls(urls=[homepage], visited=False)
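
Note that the download only happens under the "if rules is None" guard, so a caller that already has a parsed robots.txt can pass it in and skip the request. A hedged sketch, assuming the rules keyword of init_crawl above is also exposed by the public focused_crawler entry point (the robots.txt content here is made up):

import urllib.robotparser

from trafilatura.spider import focused_crawler

# a robots.txt parsed earlier, e.g. loaded from a local cache
rules = urllib.robotparser.RobotFileParser()
rules.set_url("https://www.example.org/robots.txt")
rules.parse(["User-agent: *", "Disallow: /private/"])

todo, known_links = focused_crawler("https://www.example.org", max_seen_urls=10, rules=rules)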
