Implement a close reason for robots.txt affecting all start requests #6164
scrapy/core/engine.py
@@ -82,6 +82,15 @@

class ExecutionEngine:
    """The execution engine manages all the core :ref:`components
    <topics-components>`, such as the :ref:`scheduler <topics-scheduler>`, the
    downloader, or the :ref:`spider <topics-spiders>`, at run time.

    Some components access the engine through :attr:`Crawler.engine
    <scrapy.crawler.Crawler.engine>` to access or modify other components, or
    use core functionality such as closing the running spider.
    """

    def __init__(self, crawler: "Crawler", spider_closed_callback: Callable) -> None:
        self.crawler: "Crawler" = crawler
        self.settings: Settings = crawler.settings
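For context, here is a minimal sketch of how a component might use ``Crawler.engine`` as described in the docstring above. The extension name and signal wiring are illustrative only, not part of this PR:

    from scrapy import signals

    class StopAfterFirstResponse:
        """Toy extension: closes the running spider after the first response."""

        def __init__(self, crawler):
            self.crawler = crawler

        @classmethod
        def from_crawler(cls, crawler):
            ext = cls(crawler)
            crawler.signals.connect(ext.response_received, signal=signals.response_received)
            return ext

        def response_received(self, response, request, spider):
            # Use core engine functionality (closing the running spider).
            self.crawler.engine.close_spider(spider, reason="stop_after_first_response")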
@@ -181,14 +190,19 @@
                request = next(self.slot.start_requests)
            except StopIteration:
                self.slot.start_requests = None
                self.signals.send_catch_log(signal=signals.start_requests_exhausted)
            except Exception:
                self.slot.start_requests = None
                logger.error(
                    "Error while obtaining start requests",
                    exc_info=True,
                    extra={"spider": self.spider},
                )
                self.signals.send_catch_log(signal=signals.start_requests_exhausted)

Comment on lines +193 to +201: I wonder if we should implement separate methods for these, but it felt a bit out of scope.

            else:
                self.signals.send_catch_log(
                    signal=signals.start_request_returned, request=request
                )
                self.crawl(request)

        if self.spider_is_idle() and self.slot.close_if_idle:
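The two signals sent above appear to be new in this PR. Scrapy signals are plain sentinel objects, so their definitions (presumably added to scrapy/signals.py by this PR, not shown in this excerpt) would look roughly like this sketch:

    # scrapy/signals.py (sketch; assumed shape of the new definitions)
    start_request_returned = object()    # sent by the engine for each start request it schedules
    start_requests_exhausted = object()  # sent once the start request iterator finishes or fails

A component can then subscribe with ``crawler.signals.connect(handler, signal=signals.start_requests_exhausted)``, as the robots.txt middleware does further down.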
@@ -401,7 +415,30 @@
            self.close_spider(self.spider, reason=ex.reason)

    def close_spider(self, spider: Spider, reason: str = "cancelled") -> Deferred:
        """Close (cancel) spider and clear all its outstanding requests"""
        """Stop the crawl with the specified *reason* and clear all its
        outstanding requests.

        *reason* is an arbitrary string. Built-in Scrapy :ref:`components
        <topics-components>` use the following reasons:

        - ``finished``: When the crawl finishes normally.

        - ``shutdown``: When stopping the crawl is requested, usually by the
          user through a system signal.

        - ``cancelled``: When :exc:`~scrapy.exceptions.CloseSpider` is
          raised, e.g. from a spider callback, without a custom *reason*.

        - ``closespider_errorcount``, ``closespider_pagecount``,
          ``closespider_itemcount``, ``closespider_timeout_no_item``: See
          :class:`~scrapy.extensions.closespider.CloseSpider`.

        - ``memusage_exceeded``: See
          :class:`~scrapy.extensions.memusage.MemoryUsage`.

        - ``robotstxt_denied``: See
          :class:`~scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware`.
        """
        if self.slot is None:
            raise RuntimeError("Engine slot not assigned")
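To illustrate the *reason* values listed above, here is a sketch of a spider callback that stops the crawl via CloseSpider. The spider name, URL, and reason string are made up for illustration:

    import scrapy
    from scrapy.exceptions import CloseSpider

    class ExampleSpider(scrapy.Spider):
        name = "example"
        start_urls = ["https://example.com/"]

        def parse(self, response):
            if response.status == 403:
                # Stops the crawl with a custom reason; raising CloseSpider()
                # without an argument uses the default reason "cancelled".
                raise CloseSpider("forbidden_by_server")
            yield {"title": response.css("title::text").get()}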
scrapy/downloadermiddlewares/robotstxt.py
@@ -7,12 +7,12 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any, Dict, Optional, Union
from typing import TYPE_CHECKING, Any, Dict, Optional, Set, Union

from twisted.internet.defer import Deferred, maybeDeferred
from twisted.python.failure import Failure

from scrapy import Spider
from scrapy import Spider, signals
from scrapy.crawler import Crawler
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request, Response
@@ -31,9 +31,46 @@

class RobotsTxtMiddleware:
    """This middleware filters out requests forbidden by the robots.txt
    exclusion standard.

    To make sure Scrapy respects robots.txt, make sure the middleware is
    enabled and the :setting:`ROBOTSTXT_OBEY` setting is enabled.

    The :setting:`ROBOTSTXT_USER_AGENT` setting can be used to specify the
    user agent string to use for matching in the robots.txt_ file. If it
    is ``None``, the User-Agent header you are sending with the request or the
    :setting:`USER_AGENT` setting (in that order) will be used for determining
    the user agent to use in the robots.txt_ file.

    This middleware has to be combined with a robots.txt_ parser.

    Scrapy ships with support for the following robots.txt_ parsers:

    * :ref:`Protego <protego-parser>` (default)
    * :ref:`RobotFileParser <python-robotfileparser>`
    * :ref:`Robotexclusionrulesparser <rerp-parser>`
    * :ref:`Reppy <reppy-parser>` (deprecated)

    You can change the robots.txt_ parser with the :setting:`ROBOTSTXT_PARSER`
    setting. Or you can also :ref:`implement support for a new parser
    <support-for-new-robots-parser>`.

    If all start requests from a spider are ignored due to robots.txt rules,
    the spider close reason becomes ``robotstxt_denied``.
    """
    DOWNLOAD_PRIORITY: int = 1000

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        return cls(crawler)

    def __init__(self, crawler: Crawler):
        self._forbidden_start_request_count = 0
        self._total_start_request_count = 0
        self._pending_start_request_fingerprints: Set[bytes] = set()
        self._exhausted_start_requests = False
        if not crawler.settings.getbool("ROBOTSTXT_OBEY"):
            raise NotConfigured
        self._default_useragent: str = crawler.settings.get("USER_AGENT", "Scrapy")
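As a reminder of how the settings mentioned in the docstring fit together, a small sketch; the values shown are illustrative defaults taken from current Scrapy documentation, not part of this diff:

    # settings.py (sketch)
    ROBOTSTXT_OBEY = True  # required for RobotsTxtMiddleware to be enabled at all
    ROBOTSTXT_PARSER = "scrapy.robotstxt.ProtegoRobotParser"  # default parser
    ROBOTSTXT_USER_AGENT = None  # fall back to the request User-Agent / USER_AGENT setting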
@@ -48,23 +85,53 @@

        # check if parser dependencies are met, this should throw an error otherwise.
        self._parserimpl.from_crawler(self.crawler, b"")
        assert crawler.request_fingerprinter is not None
        self._fingerprinter = crawler.request_fingerprinter

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        return cls(crawler)
        crawler.signals.connect(
            self._start_request_returned, signal=signals.start_request_returned
        )
        crawler.signals.connect(
            self._start_requests_exhausted, signal=signals.start_requests_exhausted
        )

    def _start_request_returned(self, request):
        self._total_start_request_count += 1

Comment: I thought about counting them on the engine and passing the count as a parameter through the signal, but it felt weird.

        fingerprint = self._fingerprinter.fingerprint(request)
        self._pending_start_request_fingerprints.add(fingerprint)

Comment on lines +100 to +101: This is the best I could come up with to avoid changing the request meta. It gets rather verbose in the 2 process_request methods, and weird having to remove and re-add fingerprints in the first one, but it works.

    def _start_requests_exhausted(self):
        self._exhausted_start_requests = True
        self._maybe_close()

    def process_request(self, request: Request, spider: Spider) -> Optional[Deferred]:
        fingerprint = self._fingerprinter.fingerprint(request)
        if fingerprint in self._pending_start_request_fingerprints:
            self._pending_start_request_fingerprints.remove(fingerprint)
            is_start_request = True
        else:
            is_start_request = False

        if request.meta.get("dont_obey_robotstxt"):
            return None
        if request.url.startswith("data:") or request.url.startswith("file:"):
            return None
        d: Deferred = maybeDeferred(self.robot_parser, request, spider)
        if is_start_request:
            self._pending_start_request_fingerprints.add(fingerprint)
        d.addCallback(self.process_request_2, request, spider)
        return d

    def process_request_2(
        self, rp: Optional[RobotParser], request: Request, spider: Spider
    ) -> None:
        fingerprint = self._fingerprinter.fingerprint(request)
        if fingerprint in self._pending_start_request_fingerprints:
            self._pending_start_request_fingerprints.remove(fingerprint)
            is_start_request = True
        else:
            is_start_request = False

        if rp is None:
            return
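For background on the fingerprint bookkeeping above: ``crawler.request_fingerprinter`` is Scrapy's request fingerprinting component, and ``fingerprint()`` returns a stable ``bytes`` digest for a request, so identical requests map to the same key. A rough standalone sketch of the idea, assuming Scrapy 2.6 or later where the module-level helper ``scrapy.utils.request.fingerprint`` is available (this is not the middleware's own code):

    from scrapy.http import Request
    from scrapy.utils.request import fingerprint

    seen = set()
    req = Request("https://example.com/")
    fp = fingerprint(req)  # bytes digest, stable across identical requests
    seen.add(fp)
    assert fingerprint(Request("https://example.com/")) in seen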
@@ -80,6 +147,11 @@
            )
            assert self.crawler.stats
            self.crawler.stats.inc_value("robotstxt/forbidden")

            if is_start_request:
                self._forbidden_start_request_count += 1
                self._maybe_close()

            raise IgnoreRequest("Forbidden by robots.txt")

    def robot_parser(
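Worth noting, since it interacts with the new close reason: requests that set the ``dont_obey_robotstxt`` meta key bypass this middleware entirely (see ``process_request`` above), so they can never be counted as forbidden. A hypothetical start request opting out:

    import scrapy

    class OptOutSpider(scrapy.Spider):
        name = "optout"  # hypothetical spider

        def start_requests(self):
            # This request skips the robots.txt check and therefore cannot
            # contribute to the robotstxt_denied close reason.
            yield scrapy.Request(
                "https://example.com/private",
                meta={"dont_obey_robotstxt": True},
            )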
@@ -148,3 +220,18 @@
        assert isinstance(rp_dfd, Deferred)
        self._parsers[netloc] = None
        rp_dfd.callback(None)

    def _maybe_close(self):
        if (
            not self._exhausted_start_requests
            or self._pending_start_request_fingerprints
        ):
            return
        if self._forbidden_start_request_count < self._total_start_request_count:
            return
        logger.error(
            "Stopping the spider, all start requests failed because they "
            "were rejected based on robots.txt rules. See "
            "https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#topics-dlmw-robots"
        )
        self.crawler.engine.close_spider(self.crawler.spider, "robotstxt_denied")
Comment: I went with these names to be more in line with current signal naming, although I am not 100% sure about them. I used “returned” because that’s the wording used in the iterator docs, and “exhausted” as a word that works both for the iterator stopping and an exception being raised.
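End to end, the observable effect of this PR can be sketched as follows; the spider, domain, and the assumption that its robots.txt disallows the start URL are hypothetical:

    import scrapy
    from scrapy.crawler import CrawlerProcess

    class DeniedSpider(scrapy.Spider):
        name = "denied"
        start_urls = ["https://example.com/"]  # assume robots.txt disallows this path

        def parse(self, response):
            pass

    process = CrawlerProcess(settings={"ROBOTSTXT_OBEY": True})
    process.crawl(DeniedSpider)
    process.start()
    # With this PR applied, if every start request is rejected by robots.txt,
    # the crawl stats would end with 'finish_reason': 'robotstxt_denied'
    # instead of 'finished'.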