Implement a close reason for robots.txt affecting all start requests #6164

Open · wants to merge 3 commits into base: master
Changes from 2 commits
22 changes: 14 additions & 8 deletions docs/topics/api.rst
@@ -81,14 +81,7 @@ how you :ref:`configure the downloader middlewares
For an introduction on extensions and a list of available extensions on
Scrapy see :ref:`topics-extensions`.

.. attribute:: engine

The execution engine, which coordinates the core crawling logic
between the scheduler, downloader and spiders.

Some extension may want to access the Scrapy engine, to inspect or
modify the downloader and scheduler behaviour, although this is an
advanced use and this API is not yet stable.
.. autoattribute:: engine

.. attribute:: spider

@@ -277,3 +270,16 @@ class (which they all inherit from).

Close the given spider. After this is called, no more specific stats
can be accessed or collected.


.. _engine:

ExecutionEngine API
===================

.. module:: scrapy.core.engine
:synopsis: Execution engine

.. autoclass:: ExecutionEngine

.. automethod:: close_spider
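
A rough sketch of how an extension might use the engine API documented above. The extension name, the item threshold, and the custom close reason are made up for illustration; the built-in CloseSpider extension already provides this kind of behaviour:

from scrapy import signals


class StopAfterNItems:
    """Hypothetical extension: close the spider via the engine after N items."""

    def __init__(self, crawler, limit=100):
        self.crawler = crawler
        self.limit = limit
        self.count = 0
        crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def item_scraped(self, item, spider):
        self.count += 1
        if self.count >= self.limit:
            # crawler.engine is the ExecutionEngine instance documented above.
            self.crawler.engine.close_spider(spider, reason="item_limit_reached")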
26 changes: 1 addition & 25 deletions docs/topics/downloader-middleware.rst
@@ -1019,31 +1019,7 @@ RobotsTxtMiddleware
.. module:: scrapy.downloadermiddlewares.robotstxt
:synopsis: robots.txt middleware

.. class:: RobotsTxtMiddleware

This middleware filters out requests forbidden by the robots.txt exclusion
standard.

To make sure Scrapy respects robots.txt make sure the middleware is enabled
and the :setting:`ROBOTSTXT_OBEY` setting is enabled.

The :setting:`ROBOTSTXT_USER_AGENT` setting can be used to specify the
user agent string to use for matching in the robots.txt_ file. If it
is ``None``, the User-Agent header you are sending with the request or the
:setting:`USER_AGENT` setting (in that order) will be used for determining
the user agent to use in the robots.txt_ file.

This middleware has to be combined with a robots.txt_ parser.

Scrapy ships with support for the following robots.txt_ parsers:

* :ref:`Protego <protego-parser>` (default)
* :ref:`RobotFileParser <python-robotfileparser>`
* :ref:`Robotexclusionrulesparser <rerp-parser>`
* :ref:`Reppy <reppy-parser>` (deprecated)

You can change the robots.txt_ parser with the :setting:`ROBOTSTXT_PARSER`
setting. Or you can also :ref:`implement support for a new parser <support-for-new-robots-parser>`.
.. autoclass:: RobotsTxtMiddleware

.. reqmeta:: dont_obey_robotstxt

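
For context, the behaviour this page used to describe (now moved into the middleware docstring below) is driven entirely by settings. A minimal project-settings sketch; the PythonRobotParser path is an assumption based on the parsers listed above:

# settings.py (sketch)
ROBOTSTXT_OBEY = True  # without this the middleware raises NotConfigured

# Optional: match robots.txt rules against a dedicated user agent string
# instead of the User-Agent header / USER_AGENT setting.
ROBOTSTXT_USER_AGENT = "MyCrawler"

# Optional: swap the default Protego parser for the RobotFileParser-based one.
ROBOTSTXT_PARSER = "scrapy.robotstxt.PythonRobotParser"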
34 changes: 33 additions & 1 deletion scrapy/core/engine.py
@@ -82,6 +82,15 @@ def _maybe_fire_closing(self) -> None:


class ExecutionEngine:
"""The execution engine manages all the core :ref:`components
<topics-components>`, such as the :ref:`scheduler <topics-scheduler>`, the
downloader, or the :ref:`spider <topics-spiders>`, at run time.

Some components access the engine through :attr:`Crawler.engine
<scrapy.crawler.Crawler.engine>` to inspect or modify other components, or to
use core functionality such as closing the running spider.
"""

def __init__(self, crawler: "Crawler", spider_closed_callback: Callable) -> None:
self.crawler: "Crawler" = crawler
self.settings: Settings = crawler.settings
@@ -401,7 +410,30 @@ def _spider_idle(self) -> None:
self.close_spider(self.spider, reason=ex.reason)

def close_spider(self, spider: Spider, reason: str = "cancelled") -> Deferred:
"""Close (cancel) spider and clear all its outstanding requests"""
"""Stop the crawl with the specified *reason* and clear all its
outstanding requests.

*reason* is an arbitrary string. Built-in Scrapy :ref:`components
<topics-components>` use the following reasons:

- ``finished``: When the crawl finishes normally.

- ``shutdown``: When stopping the crawl is requested, usually by the
user through a system signal.

- ``cancelled``: When :exc:`~scrapy.exceptions.CloseSpider` is
raised, e.g. from a spider callback, without a custom *reason*.

- ``closespider_errorcount``, ``closespider_pagecount``,
``closespider_itemcount``, ``closespider_timeout_no_item``: See
:class:`~scrapy.extensions.closespider.CloseSpider`.

- ``memusage_exceeded``: See
:class:`~scrapy.extensions.memusage.MemoryUsage`.

- ``robotstxt_denied``: See
:class:`~scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware`.
"""
if self.slot is None:
raise RuntimeError("Engine slot not assigned")

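
To illustrate the "cancelled" reason mentioned in the new docstring, a spider callback can stop the crawl with the default or a custom reason by raising CloseSpider. A sketch with a made-up spider and condition:

from scrapy import Spider
from scrapy.exceptions import CloseSpider


class PaywallAwareSpider(Spider):
    """Hypothetical spider: stops the whole crawl with a custom close reason."""

    name = "paywall_aware"
    start_urls = ["https://example.com/"]  # placeholder

    def parse(self, response):
        if b"paywall" in response.body:
            # The engine receives this string as *reason*; raising
            # CloseSpider() with no argument would yield "cancelled".
            raise CloseSpider("paywall_detected")
        yield {"url": response.url}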
2 changes: 2 additions & 0 deletions scrapy/crawler.py
@@ -85,6 +85,8 @@ def __init__(
self.logformatter: Optional[LogFormatter] = None
self.request_fingerprinter: Optional[RequestFingerprinter] = None
self.spider: Optional[Spider] = None

#: Running instance of :class:`~scrapy.core.engine.ExecutionEngine`.
self.engine: Optional[ExecutionEngine] = None

def _update_root_log_handler(self) -> None:
63 changes: 60 additions & 3 deletions scrapy/downloadermiddlewares/robotstxt.py
@@ -18,6 +18,7 @@
from scrapy.http import Request, Response
from scrapy.http.request import NO_CALLBACK
from scrapy.robotstxt import RobotParser
from scrapy.spidermiddlewares.robotstxt import _start_requests_processed
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import load_object
@@ -31,9 +32,44 @@


class RobotsTxtMiddleware:
"""This middleware filters out requests forbidden by the robots.txt
exclusion standard.

To make sure Scrapy respects robots.txt make sure the middleware is enabled
and the :setting:`ROBOTSTXT_OBEY` setting is enabled.

The :setting:`ROBOTSTXT_USER_AGENT` setting can be used to specify the
user agent string to use for matching in the robots.txt_ file. If it
is ``None``, the User-Agent header you are sending with the request or the
:setting:`USER_AGENT` setting (in that order) will be used for determining
the user agent to use in the robots.txt_ file.

This middleware has to be combined with a robots.txt_ parser.

Scrapy ships with support for the following robots.txt_ parsers:

* :ref:`Protego <protego-parser>` (default)
* :ref:`RobotFileParser <python-robotfileparser>`
* :ref:`Robotexclusionrulesparser <rerp-parser>`
* :ref:`Reppy <reppy-parser>` (deprecated)

You can change the robots.txt_ parser with the :setting:`ROBOTSTXT_PARSER`
setting. Or you can also :ref:`implement support for a new parser
<support-for-new-robots-parser>`.

If all start requests from a spider are ignored due to robots.txt rules,
the spider close reason becomes ``robotstxt_denied``.
"""

DOWNLOAD_PRIORITY: int = 1000

@classmethod
def from_crawler(cls, crawler: Crawler) -> Self:
return cls(crawler)

def __init__(self, crawler: Crawler):
self._forbidden_start_request_count = 0
self._total_start_request_count = 0
if not crawler.settings.getbool("ROBOTSTXT_OBEY"):
raise NotConfigured
self._default_useragent: str = crawler.settings.get("USER_AGENT", "Scrapy")
@@ -49,9 +85,13 @@
# check if parser dependencies are met, this should throw an error otherwise.
self._parserimpl.from_crawler(self.crawler, b"")

@classmethod
def from_crawler(cls, crawler: Crawler) -> Self:
return cls(crawler)
crawler.signals.connect(
    self._start_requests_processed, signal=_start_requests_processed
)

def _start_requests_processed(self, count):
self._total_start_request_count = count
self._maybe_close()

def process_request(self, request: Request, spider: Spider) -> Optional[Deferred]:
if request.meta.get("dont_obey_robotstxt"):
@@ -80,6 +120,11 @@
)
assert self.crawler.stats
self.crawler.stats.inc_value("robotstxt/forbidden")

if request.meta.get("is_start_request", False):
self._forbidden_start_request_count += 1
self._maybe_close()

raise IgnoreRequest("Forbidden by robots.txt")

def robot_parser(
@@ -148,3 +193,15 @@
assert isinstance(rp_dfd, Deferred)
self._parsers[netloc] = None
rp_dfd.callback(None)

def _maybe_close(self):
if not self._total_start_request_count:
return
if self._forbidden_start_request_count < self._total_start_request_count:
return
logger.error(
"Stopping the spider, all start requests failed because they "
"were rejected based on robots.txt rules. See "
"https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#topics-dlmw-robots"
)
self.crawler.engine.close_spider(self.crawler.spider, "robotstxt_denied")

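The new robotstxt_denied reason is delivered like any other close reason, so code that needs to react to it can listen for spider_closed. A sketch of a hypothetical extension doing just that:

from scrapy import signals


class RobotsTxtDenialReporter:
    """Hypothetical extension: log a hint when the crawl ends with robotstxt_denied."""

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_closed(self, spider, reason):
        if reason == "robotstxt_denied":
            spider.logger.warning(
                "All start requests were rejected by robots.txt; "
                "review the target site's robots.txt and the ROBOTSTXT_* settings."
            )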
1 change: 1 addition & 0 deletions scrapy/settings/default_settings.py
@@ -305,6 +305,7 @@
"scrapy.spidermiddlewares.referer.RefererMiddleware": 700,
"scrapy.spidermiddlewares.urllength.UrlLengthMiddleware": 800,
"scrapy.spidermiddlewares.depth.DepthMiddleware": 900,
"scrapy.spidermiddlewares.robotstxt.RobotsTxtSpiderMiddleware": 1000,
# Spider side
}

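
Because the new spider middleware is registered among the default spider middlewares, a project can opt out in the usual way; note that without it the downloader middleware never learns the start-request count, so the robotstxt_denied close reason cannot trigger. A sketch:

# settings.py (sketch)
SPIDER_MIDDLEWARES = {
    "scrapy.spidermiddlewares.robotstxt.RobotsTxtSpiderMiddleware": None,
}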
20 changes: 20 additions & 0 deletions scrapy/spidermiddlewares/robotstxt.py
@@ -0,0 +1,20 @@
_start_requests_processed = object()


class RobotsTxtSpiderMiddleware:
@classmethod
def from_crawler(cls, crawler):
return cls(crawler)

def __init__(self, crawler):
self._send_signal = crawler.signals.send_catch_log

def process_start_requests(self, start_requests, spider):
# Mark start requests and report to the downloader middleware the
# number of them once all have been processed.
count = 0
for request in start_requests:
request.meta["is_start_request"] = True
yield request
count += 1
self._send_signal(_start_requests_processed, count=count)
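
The sentinel-object signal used here is plain Scrapy signal plumbing, so the same pattern works for any pair of components that need a private notification channel. A stripped-down sketch with made-up component names:

# A bare object() works as a signal identifier; only the two components
# that share it need to know about it.
_my_private_signal = object()


class Sender:
    def __init__(self, crawler):
        self._send = crawler.signals.send_catch_log

    def finished(self, total):
        # Keyword arguments are forwarded to every connected receiver.
        self._send(_my_private_signal, count=total)


class Receiver:
    def __init__(self, crawler):
        crawler.signals.connect(self._on_done, signal=_my_private_signal)

    def _on_done(self, count):
        print(f"sender reported {count} items")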
15 changes: 12 additions & 3 deletions tests/mockserver.py
@@ -21,6 +21,7 @@
from twisted.web.static import File
from twisted.web.util import redirectTo

from scrapy.utils.misc import load_object
from scrapy.utils.python import to_bytes, to_unicode


@@ -271,9 +272,16 @@ def render(self, request):


class MockServer:
def __init__(self, resource=None):
self._args = []
if resource:
resource_path = f"{resource.__module__}.{resource.__name__}"
self._args.append("--resource")
self._args.append(resource_path)

def __enter__(self):
self.proc = Popen(
[sys.executable, "-u", "-m", "tests.mockserver", "-t", "http"],
[sys.executable, "-u", "-m", "tests.mockserver", *self._args, "-t", "http"],
stdout=PIPE,
env=get_mockserver_env(),
)
@@ -378,13 +386,14 @@ def ssl_context_factory(
parser.add_argument(
"-t", "--type", type=str, choices=("http", "dns"), default="http"
)
parser.add_argument("--resource", type=str, default="tests.mockserver.Root")
args = parser.parse_args()

factory: ServerFactory

if args.type == "http":
root = Root()
factory = Site(root)
resource = load_object(args.resource)()
factory = Site(resource)
httpPort = reactor.listenTCP(0, factory)
contextFactory = ssl_context_factory()
httpsPort = reactor.listenSSL(0, factory, contextFactory)
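
With the new --resource option, a test can point the mock server at a custom Twisted resource. A usage sketch; the resource class and test are made up, and the class must live at module level so the child process can import it by dotted path:

from twisted.web.resource import Resource

from tests.mockserver import MockServer


class RobotsTxtDenyAll(Resource):
    """Hypothetical root resource serving a robots.txt that disallows everything."""

    isLeaf = True

    def render_GET(self, request):
        return b"User-agent: *\nDisallow: /\n"


def test_all_start_requests_denied():
    with MockServer(resource=RobotsTxtDenyAll) as server:
        ...  # crawl against the mock server and assert reason == "robotstxt_denied"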