Implement a close reason for robots.txt affecting all start requests #6164
scrapy/core/engine.py
@@ -82,6 +82,15 @@

class ExecutionEngine:
    """The execution engine manages all the core :ref:`components
    <topics-components>`, such as the :ref:`scheduler <topics-scheduler>`, the
    downloader, or the :ref:`spider <topics-spiders>`, at run time.

    Some components access the engine through :attr:`Crawler.engine
    <scrapy.crawler.Crawler.engine>` to access or modify other components, or
    use core functionality such as closing the running spider.
    """

    def __init__(self, crawler: "Crawler", spider_closed_callback: Callable) -> None:
        self.crawler: "Crawler" = crawler
        self.settings: Settings = crawler.settings
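For context, here is a minimal sketch of how a component might use ``Crawler.engine`` as described in the docstring above. The extension name and signal wiring are illustrative only, not part of this PR:

    from scrapy import signals

    class StopAfterFirstResponse:
        """Toy extension: closes the running spider after the first response."""

        def __init__(self, crawler):
            self.crawler = crawler

        @classmethod
        def from_crawler(cls, crawler):
            ext = cls(crawler)
            crawler.signals.connect(ext.response_received, signal=signals.response_received)
            return ext

        def response_received(self, response, request, spider):
            # Use core engine functionality (closing the running spider).
            self.crawler.engine.close_spider(spider, reason="stop_after_first_response")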
@@ -181,14 +190,19 @@
                request = next(self.slot.start_requests)
            except StopIteration:
                self.slot.start_requests = None
                self.signals.send_catch_log(signal=signals.start_requests_exhausted)
            except Exception:
                self.slot.start_requests = None
                logger.error(
                    "Error while obtaining start requests",
                    exc_info=True,
                    extra={"spider": self.spider},
                )
                self.signals.send_catch_log(signal=signals.start_requests_exhausted)

Comment on lines +193 to +201: I wonder if we should implement separate methods for these, but it felt a bit out of scope.

            else:
                self.signals.send_catch_log(
                    signal=signals.start_request_returned, request=request
                )
                self.crawl(request)

        if self.spider_is_idle() and self.slot.close_if_idle:
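The two signals sent above appear to be new in this PR. Scrapy signals are plain sentinel objects, so their definitions (presumably added to scrapy/signals.py by this PR, not shown in this excerpt) would look roughly like this sketch:

    # scrapy/signals.py (sketch; assumed shape of the new definitions)
    start_request_returned = object()    # sent by the engine for each start request it schedules
    start_requests_exhausted = object()  # sent once the start request iterator finishes or fails

A component can then subscribe with ``crawler.signals.connect(handler, signal=signals.start_requests_exhausted)``, as the robots.txt middleware does further down.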
@@ -401,7 +415,30 @@
            self.close_spider(self.spider, reason=ex.reason)

    def close_spider(self, spider: Spider, reason: str = "cancelled") -> Deferred:
        """Close (cancel) spider and clear all its outstanding requests"""
        """Stop the crawl with the specified *reason* and clear all its
        outstanding requests.

        *reason* is an arbitrary string. Built-in Scrapy :ref:`components
        <topics-components>` use the following reasons:

        - ``finished``: When the crawl finishes normally.

        - ``shutdown``: When stopping the crawl is requested, usually by the
          user through a system signal.

        - ``cancelled``: When :exc:`~scrapy.exceptions.CloseSpider` is
          raised, e.g. from a spider callback, without a custom *reason*.

        - ``closespider_errorcount``, ``closespider_pagecount``,
          ``closespider_itemcount``, ``closespider_timeout_no_item``: See
          :class:`~scrapy.extensions.closespider.CloseSpider`.

        - ``memusage_exceeded``: See
          :class:`~scrapy.extensions.memusage.MemoryUsage`.

        - ``robotstxt_denied``: See
          :class:`~scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware`.
        """
        if self.slot is None:
            raise RuntimeError("Engine slot not assigned")
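To illustrate the *reason* values listed above, here is a sketch of a spider callback that stops the crawl via CloseSpider. The spider name, URL, and reason string are made up for illustration:

    import scrapy
    from scrapy.exceptions import CloseSpider

    class ExampleSpider(scrapy.Spider):
        name = "example"
        start_urls = ["https://example.com/"]

        def parse(self, response):
            if response.status == 403:
                # Stops the crawl with a custom reason; raising CloseSpider()
                # without an argument uses the default reason "cancelled".
                raise CloseSpider("forbidden_by_server")
            yield {"title": response.css("title::text").get()}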
scrapy/downloadermiddlewares/robotstxt.py
@@ -7,12 +7,12 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any, Dict, Optional, Union
from typing import TYPE_CHECKING, Any, Dict, Optional, Set, Union

from twisted.internet.defer import Deferred, maybeDeferred
from twisted.python.failure import Failure

from scrapy import Spider
from scrapy import Spider, signals
from scrapy.crawler import Crawler
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request, Response
@@ -31,9 +31,46 @@

class RobotsTxtMiddleware:
    """This middleware filters out requests forbidden by the robots.txt
    exclusion standard.

    To make sure Scrapy respects robots.txt, make sure the middleware is
    enabled and the :setting:`ROBOTSTXT_OBEY` setting is enabled.

    The :setting:`ROBOTSTXT_USER_AGENT` setting can be used to specify the
    user agent string to use for matching in the robots.txt_ file. If it
    is ``None``, the User-Agent header you are sending with the request or the
    :setting:`USER_AGENT` setting (in that order) will be used for determining
    the user agent to use in the robots.txt_ file.

    This middleware has to be combined with a robots.txt_ parser.

    Scrapy ships with support for the following robots.txt_ parsers:

    * :ref:`Protego <protego-parser>` (default)
    * :ref:`RobotFileParser <python-robotfileparser>`
    * :ref:`Robotexclusionrulesparser <rerp-parser>`
    * :ref:`Reppy <reppy-parser>` (deprecated)

    You can change the robots.txt_ parser with the :setting:`ROBOTSTXT_PARSER`
    setting. Or you can also :ref:`implement support for a new parser
    <support-for-new-robots-parser>`.

    If all start requests from a spider are ignored due to robots.txt rules,
    the spider close reason becomes ``robotstxt_denied``.
    """
    DOWNLOAD_PRIORITY: int = 1000

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        return cls(crawler)

    def __init__(self, crawler: Crawler):
        self._forbidden_start_request_count = 0
        self._total_start_request_count = 0
        self._pending_start_request_fingerprints: Set[bytes] = set()
        self._exhausted_start_requests = False
        if not crawler.settings.getbool("ROBOTSTXT_OBEY"):
            raise NotConfigured
        self._default_useragent: str = crawler.settings.get("USER_AGENT", "Scrapy")
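As a reminder of how the settings mentioned in the docstring fit together, a small sketch; the values shown are illustrative defaults taken from current Scrapy documentation, not part of this diff:

    # settings.py (sketch)
    ROBOTSTXT_OBEY = True  # required for RobotsTxtMiddleware to be enabled at all
    ROBOTSTXT_PARSER = "scrapy.robotstxt.ProtegoRobotParser"  # default parser
    ROBOTSTXT_USER_AGENT = None  # fall back to the request User-Agent / USER_AGENT setting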
@@ -48,23 +85,53 @@

        # check if parser dependencies are met, this should throw an error otherwise.
        self._parserimpl.from_crawler(self.crawler, b"")
        assert crawler.request_fingerprinter is not None
        self._fingerprinter = crawler.request_fingerprinter

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        return cls(crawler)
        crawler.signals.connect(
            self._start_request_returned, signal=signals.start_request_returned
        )
        crawler.signals.connect(
            self._start_requests_exhausted, signal=signals.start_requests_exhausted
        )

    def _start_request_returned(self, request):
        self._total_start_request_count += 1

Comment: I thought about counting them on the engine and passing the count as a parameter through the signal, but it felt weird.

        fingerprint = self._fingerprinter.fingerprint(request)
        self._pending_start_request_fingerprints.add(fingerprint)

Comment on lines +100 to +101: This is the best I could come up with to avoid changing the request meta. It gets rather verbose in the 2 process_request methods, and weird having to remove and re-add fingerprints in the first one, but it works.

    def _start_requests_exhausted(self):
        self._exhausted_start_requests = True
        self._maybe_close()

    def process_request(self, request: Request, spider: Spider) -> Optional[Deferred]:
        fingerprint = self._fingerprinter.fingerprint(request)
        if fingerprint in self._pending_start_request_fingerprints:
            self._pending_start_request_fingerprints.remove(fingerprint)
            is_start_request = True
        else:
            is_start_request = False

        if request.meta.get("dont_obey_robotstxt"):
            return None
        if request.url.startswith("data:") or request.url.startswith("file:"):
            return None
        d: Deferred = maybeDeferred(self.robot_parser, request, spider)
        if is_start_request:
            self._pending_start_request_fingerprints.add(fingerprint)
        d.addCallback(self.process_request_2, request, spider)
        return d

    def process_request_2(
        self, rp: Optional[RobotParser], request: Request, spider: Spider
    ) -> None:
        fingerprint = self._fingerprinter.fingerprint(request)
        if fingerprint in self._pending_start_request_fingerprints:
            self._pending_start_request_fingerprints.remove(fingerprint)
            is_start_request = True
        else:
            is_start_request = False

        if rp is None:
            return
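For background on the fingerprint bookkeeping above: ``crawler.request_fingerprinter`` is Scrapy's request fingerprinting component, and ``fingerprint()`` returns a stable ``bytes`` digest for a request, so identical requests map to the same key. A rough standalone sketch of the idea, assuming Scrapy 2.6 or later where the module-level helper ``scrapy.utils.request.fingerprint`` is available (this is not the middleware's own code):

    from scrapy.http import Request
    from scrapy.utils.request import fingerprint

    seen = set()
    req = Request("https://example.com/")
    fp = fingerprint(req)  # bytes digest, stable across identical requests
    seen.add(fp)
    assert fingerprint(Request("https://example.com/")) in seen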
@@ -80,6 +147,11 @@
            )
            assert self.crawler.stats
            self.crawler.stats.inc_value("robotstxt/forbidden")

            if is_start_request:
                self._forbidden_start_request_count += 1
                self._maybe_close()

            raise IgnoreRequest("Forbidden by robots.txt")

    def robot_parser(
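Worth noting, since it interacts with the new close reason: requests that set the ``dont_obey_robotstxt`` meta key bypass this middleware entirely (see ``process_request`` above), so they can never be counted as forbidden. A hypothetical start request opting out:

    import scrapy

    class OptOutSpider(scrapy.Spider):
        name = "optout"  # hypothetical spider

        def start_requests(self):
            # This request skips the robots.txt check and therefore cannot
            # contribute to the robotstxt_denied close reason.
            yield scrapy.Request(
                "https://example.com/private",
                meta={"dont_obey_robotstxt": True},
            )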
@@ -148,3 +220,18 @@
        assert isinstance(rp_dfd, Deferred)
        self._parsers[netloc] = None
        rp_dfd.callback(None)

    def _maybe_close(self):
        if (
            not self._exhausted_start_requests
            or self._pending_start_request_fingerprints
        ):
            return
        if self._forbidden_start_request_count < self._total_start_request_count:
            return
        logger.error(
            "Stopping the spider, all start requests failed because they "
            "were rejected based on robots.txt rules. See "
            "https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#topics-dlmw-robots"
        )
        self.crawler.engine.close_spider(self.crawler.spider, "robotstxt_denied")
Comment: I went with these names to be more in line with current signal naming, although I am not 100% sure about them. I used “returned” because that’s the wording used in the iterator docs, and “exhausted” as a word that works both for the iterator stopping and an exception being raised.
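End to end, the observable effect of this PR can be sketched as follows; the spider, domain, and the assumption that its robots.txt disallows the start URL are hypothetical:

    import scrapy
    from scrapy.crawler import CrawlerProcess

    class DeniedSpider(scrapy.Spider):
        name = "denied"
        start_urls = ["https://example.com/"]  # assume robots.txt disallows this path

        def parse(self, response):
            pass

    process = CrawlerProcess(settings={"ROBOTSTXT_OBEY": True})
    process.crawl(DeniedSpider)
    process.start()
    # With this PR applied, if every start request is rejected by robots.txt,
    # the crawl stats would end with 'finish_reason': 'robotstxt_denied'
    # instead of 'finished'.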