spider fix: use internal download utilities for robots.txt (#590)
* spider fix: use internal download utilities for robots.txt

* separate function and tests
adbar committed May 8, 2024
1 parent efe38bb commit 92bdd6e
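
In short: before this change, urllib.robotparser's RobotFileParser.read() opened its own connection to fetch robots.txt; the spider now downloads the file with trafilatura's own download utilities and hands the text to a separate parse_robots helper. A minimal sketch of the resulting flow, using the public fetch_url function and the parse_robots function added in this commit (can_fetch is the standard RobotFileParser method; the example URL is illustrative):

from trafilatura import fetch_url
from trafilatura.spider import parse_robots

robots_url = "https://example.org/robots.txt"
data = fetch_url(robots_url)  # download through trafilatura instead of urllib's opener
rules = parse_robots(robots_url, data) if data is not None else None
if rules is None or rules.can_fetch("*", "https://example.org/some/page"):
    print("crawling this URL is not forbidden by robots.txt")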
Showing 2 changed files with 34 additions and 12 deletions.
13 changes: 11 additions & 2 deletions tests/spider_tests.py
@@ -13,8 +13,7 @@
 from courlan import UrlStore
 
 from trafilatura import spider
-from trafilatura.settings import DEFAULT_CONFIG
-from trafilatura.utils import LANGID_FLAG
+# from trafilatura.utils import LANGID_FLAG
 
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
@@ -155,10 +154,20 @@ def test_focused_crawler():
     ## assert sorted(todo) == ['https://httpbun.com/links/1/0']
 
 
+def test_robots():
+    "Test robots.txt parsing"
+    robots_url = "https://example.org/robots.txt"
+    assert spider.parse_robots(robots_url, None) is None
+    assert spider.parse_robots(robots_url, 123) is None
+    assert spider.parse_robots(robots_url, b"123") is None
+    assert spider.parse_robots(robots_url, "Allow: *") is not None
+
+
 if __name__ == '__main__':
     test_redirections()
     test_meta_redirections()
     test_process_links()
     test_crawl_logic()
     test_crawl_page()
     test_focused_crawler()
+    test_robots()
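
For context on these assertions: parse_robots (added in spider.py below) hands the downloaded text to the standard library parser, so a plain string such as "Allow: *" produces a parser object, while None, an integer, or raw bytes make the parse step raise and the helper return None. A standalone sketch of the underlying stdlib behaviour (URLs and rules are made up for illustration):

import urllib.robotparser

parser = urllib.robotparser.RobotFileParser()
parser.set_url("https://example.org/robots.txt")
# parse() expects an iterable of text lines, hence data.splitlines() in parse_robots
parser.parse("User-agent: *\nDisallow: /private/".splitlines())
print(parser.can_fetch("*", "https://example.org/private/page"))  # False
print(parser.can_fetch("*", "https://example.org/public/page"))   # True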
33 changes: 23 additions & 10 deletions trafilatura/spider.py
@@ -119,27 +119,40 @@ def process_response(response, base_url, language, rules=None):
     process_links(htmlstring, base_url, language=language, rules=rules)
 
 
+def parse_robots(robots_url, data):
+    "Parse a robots.txt file with the standard library urllib.robotparser."
+    # https://github.com/python/cpython/blob/main/Lib/urllib/robotparser.py
+    rules = urllib.robotparser.RobotFileParser()
+    rules.set_url(robots_url)
+    # exceptions happening here
+    try:
+        rules.parse(data.splitlines())
+    except Exception as exc:
+        LOGGER.error("cannot read robots.txt: %s", exc)
+        return None
+    return rules
+
+
 def init_crawl(homepage, todo, known_links, language=None, rules=None):
-    """Start crawl by initializing variables and potentially examining the starting page."""
+    "Start crawl by initializing variables and potentially examining the starting page."
     # config=DEFAULT_CONFIG
     _, base_url = get_hostinfo(homepage)
     if base_url is None or len(base_url) < 1:
-        raise ValueError(f'cannot crawl homepage: {homepage}')
+        raise ValueError(f"cannot crawl homepage: {homepage}")
+
     # TODO: just known or also visited?
     if known_links is not None:
         URL_STORE.add_urls(urls=known_links, visited=True)
     i = 0
+
     # fetch and parse robots.txt file if necessary
     if rules is None:
-        rules = urllib.robotparser.RobotFileParser()
-        rules.set_url(base_url + '/robots.txt')
-        # exceptions happening here
-        try:
-            rules.read()
-        except Exception as exc:
-            LOGGER.error('cannot read robots.txt: %s', exc)
-            rules = None
+        robots_url = base_url + "/robots.txt"
+        data = fetch_url(robots_url)
+        if data is not None:
+            rules = parse_robots(robots_url, data)
     URL_STORE.store_rules(base_url, rules)
+
     # initialize crawl by visiting homepage if necessary
     if todo is None:
         URL_STORE.add_urls(urls=[homepage], visited=False)
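
Note that the download only happens under the "if rules is None" guard, so a caller that already has a parsed robots.txt can pass it in and skip the request. A hedged sketch, assuming the rules keyword of init_crawl above is also exposed by the public focused_crawler entry point (the robots.txt content here is made up):

import urllib.robotparser

from trafilatura.spider import focused_crawler

# a robots.txt parsed earlier, e.g. loaded from a local cache
rules = urllib.robotparser.RobotFileParser()
rules.set_url("https://www.example.org/robots.txt")
rules.parse(["User-agent: *", "Disallow: /private/"])

todo, known_links = focused_crawler("https://www.example.org", max_seen_urls=10, rules=rules)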
