From 8ce01b3b76d4634f55067d6cfdf632ec70ba304a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 1 Mar 2022 12:26:05 +0100 Subject: [PATCH] Merge pull request from GHSA-cjvr-mfj7-j4j8 * Do not carry over cookies to a different domain on redirect * Cover the cookie-domain redirect fix in the release notes * Cover 1.8.2 in the release notes * Fix redirect Cookie handling when the cookie middleware is disabled * Update the 1.8.2 release date --- docs/news.rst | 67 ++++++++- scrapy/downloadermiddlewares/redirect.py | 31 ++++- tests/test_downloadermiddleware_cookies.py | 155 +++++++++++++++++++++ 3 files changed, 247 insertions(+), 6 deletions(-) diff --git a/docs/news.rst b/docs/news.rst index 2128f2f0e4d..aef12d9dbf7 100644 --- a/docs/news.rst +++ b/docs/news.rst @@ -5,11 +5,13 @@ Release notes .. _release-2.6.0: -Scrapy 2.6.0 (2022-02-??) +Scrapy 2.6.0 (2022-03-01) ------------------------- Highlights: +* :ref:`Security fixes for cookie handling <2.6-security-fixes>` + * Python 3.10 support * :ref:`asyncio support ` is no longer considered @@ -20,6 +22,37 @@ Highlights: :ref:`item filtering ` and :ref:`post-processing ` +.. _2.6-security-fixes: + +Security bug fixes +~~~~~~~~~~~~~~~~~~ + +- When a :class:`~scrapy.http.Request` object with cookies defined gets a + redirect response causing a new :class:`~scrapy.http.Request` object to be + scheduled, the cookies defined in the original + :class:`~scrapy.http.Request` object are no longer copied into the new + :class:`~scrapy.http.Request` object. + + If you manually set the ``Cookie`` header on a + :class:`~scrapy.http.Request` object and the domain name of the redirect + URL is not an exact match for the domain of the URL of the original + :class:`~scrapy.http.Request` object, your ``Cookie`` header is now dropped + from the new :class:`~scrapy.http.Request` object. + + The old behavior could be exploited by an attacker to gain access to your + cookies. Please, see the `cjvr-mfj7-j4j8 security advisory`_ for more + information. + + .. _cjvr-mfj7-j4j8 security advisory: https://github.com/scrapy/scrapy/security/advisories/GHSA-cjvr-mfj7-j4j8 + + .. note:: It is still possible to enable the sharing of cookies between + different domains with a shared domain suffix (e.g. + ``example.com`` and any subdomain) by defining the shared domain + suffix (e.g. ``example.com``) as the cookie domain when defining + your cookies. See the documentation of the + :class:`~scrapy.http.Request` class for more information. + + Modified requirements ~~~~~~~~~~~~~~~~~~~~~ @@ -1842,6 +1875,38 @@ affect subclasses: (:issue:`3884`) +.. _release-1.8.2: + +Scrapy 1.8.2 (2022-03-01) +------------------------- + +**Security bug fixes:** + +- When a :class:`~scrapy.http.Request` object with cookies defined gets a + redirect response causing a new :class:`~scrapy.http.Request` object to be + scheduled, the cookies defined in the original + :class:`~scrapy.http.Request` object are no longer copied into the new + :class:`~scrapy.http.Request` object. + + If you manually set the ``Cookie`` header on a + :class:`~scrapy.http.Request` object and the domain name of the redirect + URL is not an exact match for the domain of the URL of the original + :class:`~scrapy.http.Request` object, your ``Cookie`` header is now dropped + from the new :class:`~scrapy.http.Request` object. + + The old behavior could be exploited by an attacker to gain access to your + cookies. Please, see the `cjvr-mfj7-j4j8 security advisory`_ for more + information. + + .. _cjvr-mfj7-j4j8 security advisory: https://github.com/scrapy/scrapy/security/advisories/GHSA-cjvr-mfj7-j4j8 + + .. note:: It is still possible to enable the sharing of cookies between + different domains with a shared domain suffix (e.g. + ``example.com`` and any subdomain) by defining the shared domain + suffix (e.g. ``example.com``) as the cookie domain when defining + your cookies. See the documentation of the + :class:`~scrapy.http.Request` class for more information. + .. _release-1.8.1: diff --git a/scrapy/downloadermiddlewares/redirect.py b/scrapy/downloadermiddlewares/redirect.py index 4053fecc511..fcd6c298bd7 100644 --- a/scrapy/downloadermiddlewares/redirect.py +++ b/scrapy/downloadermiddlewares/redirect.py @@ -4,6 +4,7 @@ from w3lib.url import safe_url_string from scrapy.http import HtmlResponse +from scrapy.utils.httpobj import urlparse_cached from scrapy.utils.response import get_meta_refresh from scrapy.exceptions import IgnoreRequest, NotConfigured @@ -11,6 +12,21 @@ logger = logging.getLogger(__name__) +def _build_redirect_request(source_request, *, url, method=None, body=None): + redirect_request = source_request.replace( + url=url, + method=method, + body=body, + cookies=None, + ) + if 'Cookie' in redirect_request.headers: + source_request_netloc = urlparse_cached(source_request).netloc + redirect_request_netloc = urlparse_cached(redirect_request).netloc + if source_request_netloc != redirect_request_netloc: + del redirect_request.headers['Cookie'] + return redirect_request + + class BaseRedirectMiddleware: enabled_setting = 'REDIRECT_ENABLED' @@ -47,10 +63,15 @@ def _redirect(self, redirected, request, spider, reason): raise IgnoreRequest("max redirections reached") def _redirect_request_using_get(self, request, redirect_url): - redirected = request.replace(url=redirect_url, method='GET', body='') - redirected.headers.pop('Content-Type', None) - redirected.headers.pop('Content-Length', None) - return redirected + redirect_request = _build_redirect_request( + request, + url=redirect_url, + method='GET', + body='', + ) + redirect_request.headers.pop('Content-Type', None) + redirect_request.headers.pop('Content-Length', None) + return redirect_request class RedirectMiddleware(BaseRedirectMiddleware): @@ -80,7 +101,7 @@ def process_response(self, request, response, spider): redirected_url = urljoin(request.url, location) if response.status in (301, 307, 308) or request.method == 'HEAD': - redirected = request.replace(url=redirected_url) + redirected = _build_redirect_request(request, url=redirected_url) return self._redirect(redirected, request, spider, response.status) redirected = self._redirect_request_using_get(request, redirected_url) diff --git a/tests/test_downloadermiddleware_cookies.py b/tests/test_downloadermiddleware_cookies.py index 36021bfbfc2..1747f3b94ac 100644 --- a/tests/test_downloadermiddleware_cookies.py +++ b/tests/test_downloadermiddleware_cookies.py @@ -6,8 +6,10 @@ from scrapy.downloadermiddlewares.cookies import CookiesMiddleware from scrapy.downloadermiddlewares.defaultheaders import DefaultHeadersMiddleware +from scrapy.downloadermiddlewares.redirect import RedirectMiddleware from scrapy.exceptions import NotConfigured from scrapy.http import Response, Request +from scrapy.settings import Settings from scrapy.spiders import Spider from scrapy.utils.python import to_bytes from scrapy.utils.test import get_crawler @@ -23,9 +25,11 @@ def split_cookies(cookies): def setUp(self): self.spider = Spider('foo') self.mw = CookiesMiddleware() + self.redirect_middleware = RedirectMiddleware(settings=Settings()) def tearDown(self): del self.mw + del self.redirect_middleware def test_basic(self): req = Request('http://scrapytest.org/') @@ -368,3 +372,154 @@ def test_primitive_type_cookies(self): req4 = Request('http://example.org', cookies={'a': 'b'}) assert self.mw.process_request(req4, self.spider) is None self.assertCookieValEqual(req4.headers['Cookie'], b'a=b') + + def _test_cookie_redirect( + self, + source, + target, + *, + cookies1, + cookies2, + ): + input_cookies = {'a': 'b'} + + if not isinstance(source, dict): + source = {'url': source} + if not isinstance(target, dict): + target = {'url': target} + target.setdefault('status', 301) + + request1 = Request(cookies=input_cookies, **source) + self.mw.process_request(request1, self.spider) + cookies = request1.headers.get('Cookie') + self.assertEqual(cookies, b"a=b" if cookies1 else None) + + response = Response( + headers={ + 'Location': target['url'], + }, + **target, + ) + self.assertEqual( + self.mw.process_response(request1, response, self.spider), + response, + ) + + request2 = self.redirect_middleware.process_response( + request1, + response, + self.spider, + ) + self.assertIsInstance(request2, Request) + + self.mw.process_request(request2, self.spider) + cookies = request2.headers.get('Cookie') + self.assertEqual(cookies, b"a=b" if cookies2 else None) + + def test_cookie_redirect_same_domain(self): + self._test_cookie_redirect( + 'https://toscrape.com', + 'https://toscrape.com', + cookies1=True, + cookies2=True, + ) + + def test_cookie_redirect_same_domain_forcing_get(self): + self._test_cookie_redirect( + 'https://toscrape.com', + {'url': 'https://toscrape.com', 'status': 302}, + cookies1=True, + cookies2=True, + ) + + def test_cookie_redirect_different_domain(self): + self._test_cookie_redirect( + 'https://toscrape.com', + 'https://example.com', + cookies1=True, + cookies2=False, + ) + + def test_cookie_redirect_different_domain_forcing_get(self): + self._test_cookie_redirect( + 'https://toscrape.com', + {'url': 'https://example.com', 'status': 302}, + cookies1=True, + cookies2=False, + ) + + def _test_cookie_header_redirect( + self, + source, + target, + *, + cookies2, + ): + """Test the handling of a user-defined Cookie header when building a + redirect follow-up request. + + We follow RFC 6265 for cookie handling. The Cookie header can only + contain a list of key-value pairs (i.e. no additional cookie + parameters like Domain or Path). Because of that, we follow the same + rules that we would follow for the handling of the Set-Cookie response + header when the Domain is not set: the cookies must be limited to the + target URL domain (not even subdomains can receive those cookies). + + .. note:: This method tests the scenario where the cookie middleware is + disabled. Because of known issue #1992, when the cookies + middleware is enabled we do not need to be concerned about + the Cookie header getting leaked to unintended domains, + because the middleware empties the header from every request. + """ + if not isinstance(source, dict): + source = {'url': source} + if not isinstance(target, dict): + target = {'url': target} + target.setdefault('status', 301) + + request1 = Request(headers={'Cookie': b'a=b'}, **source) + + response = Response( + headers={ + 'Location': target['url'], + }, + **target, + ) + + request2 = self.redirect_middleware.process_response( + request1, + response, + self.spider, + ) + self.assertIsInstance(request2, Request) + + cookies = request2.headers.get('Cookie') + self.assertEqual(cookies, b"a=b" if cookies2 else None) + + def test_cookie_header_redirect_same_domain(self): + self._test_cookie_header_redirect( + 'https://toscrape.com', + 'https://toscrape.com', + cookies2=True, + ) + + def test_cookie_header_redirect_same_domain_forcing_get(self): + self._test_cookie_header_redirect( + 'https://toscrape.com', + {'url': 'https://toscrape.com', 'status': 302}, + cookies2=True, + ) + + def test_cookie_header_redirect_different_domain(self): + self._test_cookie_header_redirect( + 'https://toscrape.com', + 'https://example.com', + cookies2=False, + ) + + def test_cookie_header_redirect_different_domain_forcing_get(self): + self._test_cookie_header_redirect( + 'https://toscrape.com', + {'url': 'https://example.com', 'status': 302}, + cookies2=False, + )