Handling CloseSpider exception if it has been raised in start_requests() #6148

Open
wants to merge 4 commits into master
12 changes: 12 additions & 0 deletions scrapy/core/engine.py
@@ -400,6 +400,18 @@ def _spider_idle(self) -> None:
        assert isinstance(ex, CloseSpider)  # typing
        self.close_spider(self.spider, reason=ex.reason)

    def close_spider_before_start(self, spider: Spider, reason: str = "cancelled"):
        if self.slot is not None:
            raise RuntimeError("Engine slot is already assigned. Use self.close_spider")

        self.start()
Member:
This doesn't look correct because start() returns a Deferred, but also is it OK to call it here at all if we are closing anyway?

Contributor Author:
@wRAR
I didn't have it there initially, but the common close_spider method has several checks that expect the engine to have been started. I'm not sure we need to use it, though; I just think the code inside it is useful.

I've tried to avoid it, but none of the approaches I've considered gave the proper result. The only other option I see so far is to simply set the close reason on the spider and leave it at that.

How do you think it could be done better?

Member:
In any case, if you call it, you need to wait until it completes.

        nextcall_none = CallLaterOnce(lambda: None)
        scheduler = create_instance(
            self.scheduler_cls, settings=None, crawler=self.crawler
        )
        self.slot = Slot((), True, nextcall_none, scheduler)
        return self.close_spider(spider, reason)
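
Regarding the review note above that start() returns a Deferred which needs to be waited on: the usual Twisted pattern looks roughly like the sketch below. This is a generic, self-contained illustration with made-up names (start() here merely stands in for any Deferred-returning method), not the concrete fix proposed in this PR.

from twisted.internet import defer, reactor, task


def start():
    # Stand-in for a method that returns a Deferred instead of a plain value.
    return task.deferLater(reactor, 0.1, lambda: "started")


@defer.inlineCallbacks
def caller():
    # Yielding the Deferred suspends caller() until it fires, so the code
    # after the yield only runs once start() has actually completed.
    result = yield start()
    print(result)


if __name__ == "__main__":
    caller().addCallback(lambda _: reactor.stop())
    reactor.run()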

    def close_spider(self, spider: Spider, reason: str = "cancelled") -> Deferred:
        """Close (cancel) spider and clear all its outstanding requests"""
        if self.slot is None:
13 changes: 9 additions & 4 deletions scrapy/crawler.py
@@ -24,7 +24,7 @@
from scrapy import Spider, signals
from scrapy.addons import AddonManager
from scrapy.core.engine import ExecutionEngine
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.exceptions import CloseSpider, ScrapyDeprecationWarning
from scrapy.extension import ExtensionManager
from scrapy.interfaces import ISpiderLoader
from scrapy.logformatter import LogFormatter
@@ -155,9 +155,14 @@ def crawl(self, *args: Any, **kwargs: Any) -> Generator[Deferred, Any, None]:
            self._apply_settings()
            self._update_root_log_handler()
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield maybeDeferred(self.engine.start)
            try:
                start_requests = iter(self.spider.start_requests())
                yield self.engine.open_spider(self.spider, start_requests)
                yield maybeDeferred(self.engine.start)
            except CloseSpider as e:
                yield self.engine.close_spider_before_start(
                    self.spider, reason=e.reason
                )
        except Exception:
            self.crawling = False
            if self.engine is not None:
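For context, a minimal user spider that would hit the new except CloseSpider branch in Crawler.crawl() might look like the following. This is a hypothetical illustration: the spider name and reason string are invented, not taken from the PR.

from scrapy import Spider
from scrapy.exceptions import CloseSpider


class AbortEarlySpider(Spider):
    name = "abort_early"

    def start_requests(self):
        # No requests are yielded; raising here means the exception surfaces
        # from iter(self.spider.start_requests()) inside Crawler.crawl(),
        # which is what the new except CloseSpider branch catches.
        raise CloseSpider(reason="missing_required_setting")
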
26 changes: 25 additions & 1 deletion tests/spiders.py
@@ -3,12 +3,13 @@
"""
import asyncio
import time
from typing import Iterable
from urllib.parse import urlencode

from twisted.internet import defer

from scrapy import signals
from scrapy.exceptions import StopDownload
from scrapy.exceptions import CloseSpider, StopDownload
from scrapy.http import Request
from scrapy.item import Item
from scrapy.linkextractors import LinkExtractor
@@ -276,6 +277,29 @@ def parse(self, response):
        self.raise_exception()


class CloseExceptionSpider(FollowAllSpider):
    _expected_message: str = "Error"

    def __init__(self, *args, **kwargs):
        if "expected_message" in kwargs:
            self._expected_message = kwargs["expected_message"]
        super().__init__(*args, **kwargs)


class CloseExceptionStartSpider(CloseExceptionSpider):
    def start_requests(self) -> Iterable[Request]:
        raise CloseSpider(reason=self._expected_message)


class CloseExceptionParseSpider(CloseExceptionSpider):
    def start_requests(self) -> Iterable[Request]:
        url = self.mockserver.url("/close_spider")
        yield Request(url, callback=self.parse)

    def parse(self, response):
        raise CloseSpider(reason=self._expected_message)


class BrokenStartRequestsSpider(FollowAllSpider):
    fail_before_yield = False
    fail_yielding = False
27 changes: 26 additions & 1 deletion tests/test_closespider.py
@@ -3,7 +3,14 @@

from scrapy.utils.test import get_crawler
from tests.mockserver import MockServer
from tests.spiders import ErrorSpider, FollowAllSpider, ItemSpider, SlowSpider
from tests.spiders import (
    CloseExceptionParseSpider,
    CloseExceptionStartSpider,
    ErrorSpider,
    FollowAllSpider,
    ItemSpider,
    SlowSpider,
)


class TestCloseSpider(TestCase):
@@ -64,3 +71,21 @@ def test_closespider_timeout_no_item(self):
        self.assertEqual(reason, "closespider_timeout_no_item")
        total_seconds = crawler.stats.get_value("elapsed_time_seconds")
        self.assertTrue(total_seconds >= timeout)

    @defer.inlineCallbacks
    def test_closespider_exception_handler(self):
        expected_message_parse = "Raised on parse."
        crawler_parse = get_crawler(CloseExceptionParseSpider)
        yield crawler_parse.crawl(
            mockserver=self.mockserver, expected_message=expected_message_parse
        )
        reason_parse = crawler_parse.spider.meta["close_reason"]
        self.assertEqual(reason_parse, expected_message_parse)

        expected_message_start = "Raised on start."
        crawler_start = get_crawler(CloseExceptionStartSpider)
        yield crawler_start.crawl(
            mockserver=self.mockserver, expected_message=expected_message_start
        )
        reason_start = crawler_start.spider.meta["close_reason"]
        self.assertEqual(reason_start, expected_message_start)
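
The assertions above read crawler.spider.meta["close_reason"]. FollowAllSpider, which the new spiders inherit from, ultimately derives from a small base spider in tests/spiders.py that records the close reason in its closed() callback, roughly as sketched below (paraphrased for context; not part of this diff).

from scrapy import Spider


class MetaSpider(Spider):
    name = "meta"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.meta = {}

    def closed(self, reason):
        # Scrapy calls closed() when the spider finishes; the test
        # assertions read the reason recorded here.
        self.meta["close_reason"] = reason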