From 8ce01b3b76d4634f55067d6cfdf632ec70ba304a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= <adrian@chaves.io>
Date: Tue, 1 Mar 2022 12:26:05 +0100
Subject: [PATCH] Merge pull request from GHSA-cjvr-mfj7-j4j8

* Do not carry over cookies to a different domain on redirect

* Cover the cookie-domain redirect fix in the release notes

* Cover 1.8.2 in the release notes

* Fix redirect Cookie handling when the cookie middleware is disabled

* Update the 1.8.2 release date
---
 docs/news.rst                              |  67 ++++++++-
 scrapy/downloadermiddlewares/redirect.py   |  31 ++++-
 tests/test_downloadermiddleware_cookies.py | 155 +++++++++++++++++++++
 3 files changed, 247 insertions(+), 6 deletions(-)
diff --git a/docs/news.rst b/docs/news.rst
index 2128f2f0e4d..aef12d9dbf7 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -5,11 +5,13 @@ Release notes
 
 .. _release-2.6.0:
 
-Scrapy 2.6.0 (2022-02-??)
+Scrapy 2.6.0 (2022-03-01)
 -------------------------
 
 Highlights:
 
+*   :ref:`Security fixes for cookie handling <2.6-security-fixes>`
+
 *   Python 3.10 support
 
 *   :ref:`asyncio support <using-asyncio>` is no longer considered
@@ -20,6 +22,37 @@ Highlights:
     :ref:`item filtering <item-filter>` and
     :ref:`post-processing <post-processing>`
 
+.. _2.6-security-fixes:
+
+Security bug fixes
+~~~~~~~~~~~~~~~~~~
+
+-   When a :class:`~scrapy.http.Request` object with cookies defined gets a
+    redirect response causing a new :class:`~scrapy.http.Request` object to be
+    scheduled, the cookies defined in the original
+    :class:`~scrapy.http.Request` object are no longer copied into the new
+    :class:`~scrapy.http.Request` object.
+
+    If you manually set the ``Cookie`` header on a
+    :class:`~scrapy.http.Request` object and the domain name of the redirect
+    URL is not an exact match for the domain of the URL of the original
+    :class:`~scrapy.http.Request` object, your ``Cookie`` header is now dropped
+    from the new :class:`~scrapy.http.Request` object.
+
+    The old behavior could be exploited by an attacker to gain access to your
+    cookies. Please, see the `cjvr-mfj7-j4j8 security advisory`_ for more
+    information.
+
+    .. _cjvr-mfj7-j4j8 security advisory: https://github.com/scrapy/scrapy/security/advisories/GHSA-cjvr-mfj7-j4j8
+
+    .. note:: It is still possible to enable the sharing of cookies between
+              different domains with a shared domain suffix (e.g.
+              ``example.com`` and any subdomain) by defining the shared domain
+              suffix (e.g. ``example.com``) as the cookie domain when defining
+              your cookies. See the documentation of the
+              :class:`~scrapy.http.Request` class for more information.
+
+
 Modified requirements
 ~~~~~~~~~~~~~~~~~~~~~
 
@@ -1842,6 +1875,38 @@ affect subclasses:
 
 (:issue:`3884`)
 
+.. _release-1.8.2:
+
+Scrapy 1.8.2 (2022-03-01)
+-------------------------
+
+**Security bug fixes:**
+
+-   When a :class:`~scrapy.http.Request` object with cookies defined gets a
+    redirect response causing a new :class:`~scrapy.http.Request` object to be
+    scheduled, the cookies defined in the original
+    :class:`~scrapy.http.Request` object are no longer copied into the new
+    :class:`~scrapy.http.Request` object.
+
+    If you manually set the ``Cookie`` header on a
+    :class:`~scrapy.http.Request` object and the domain name of the redirect
+    URL is not an exact match for the domain of the URL of the original
+    :class:`~scrapy.http.Request` object, your ``Cookie`` header is now dropped
+    from the new :class:`~scrapy.http.Request` object.
+
+    The old behavior could be exploited by an attacker to gain access to your
+    cookies. Please, see the `cjvr-mfj7-j4j8 security advisory`_ for more
+    information.
+
+    .. _cjvr-mfj7-j4j8 security advisory: https://github.com/scrapy/scrapy/security/advisories/GHSA-cjvr-mfj7-j4j8
+
+    .. note:: It is still possible to enable the sharing of cookies between
+              different domains with a shared domain suffix (e.g.
+              ``example.com`` and any subdomain) by defining the shared domain
+              suffix (e.g. ``example.com``) as the cookie domain when defining
+              your cookies. See the documentation of the
+              :class:`~scrapy.http.Request` class for more information.
+
 
 .. _release-1.8.1:
 
diff --git a/scrapy/downloadermiddlewares/redirect.py b/scrapy/downloadermiddlewares/redirect.py
index 4053fecc511..fcd6c298bd7 100644
--- a/scrapy/downloadermiddlewares/redirect.py
+++ b/scrapy/downloadermiddlewares/redirect.py
@@ -4,6 +4,7 @@
 from w3lib.url import safe_url_string
 
 from scrapy.http import HtmlResponse
+from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.response import get_meta_refresh
 from scrapy.exceptions import IgnoreRequest, NotConfigured
 
@@ -11,6 +12,21 @@
 logger = logging.getLogger(__name__)
 
 
+def _build_redirect_request(source_request, *, url, method=None, body=None):
+    redirect_request = source_request.replace(
+        url=url,
+        method=method,
+        body=body,
+        cookies=None,
+    )
+    if 'Cookie' in redirect_request.headers:
+        source_request_netloc = urlparse_cached(source_request).netloc
+        redirect_request_netloc = urlparse_cached(redirect_request).netloc
+        if source_request_netloc != redirect_request_netloc:
+            del redirect_request.headers['Cookie']
+    return redirect_request
+
+
 class BaseRedirectMiddleware:
 
     enabled_setting = 'REDIRECT_ENABLED'
@@ -47,10 +63,15 @@ def _redirect(self, redirected, request, spider, reason):
             raise IgnoreRequest("max redirections reached")
 
     def _redirect_request_using_get(self, request, redirect_url):
-        redirected = request.replace(url=redirect_url, method='GET', body='')
-        redirected.headers.pop('Content-Type', None)
-        redirected.headers.pop('Content-Length', None)
-        return redirected
+        redirect_request = _build_redirect_request(
+            request,
+            url=redirect_url,
+            method='GET',
+            body='',
+        )
+        redirect_request.headers.pop('Content-Type', None)
+        redirect_request.headers.pop('Content-Length', None)
+        return redirect_request
 
 
 class RedirectMiddleware(BaseRedirectMiddleware):
@@ -80,7 +101,7 @@ def process_response(self, request, response, spider):
         redirected_url = urljoin(request.url, location)
 
         if response.status in (301, 307, 308) or request.method == 'HEAD':
-            redirected = request.replace(url=redirected_url)
+            redirected = _build_redirect_request(request, url=redirected_url)
             return self._redirect(redirected, request, spider, response.status)
 
         redirected = self._redirect_request_using_get(request, redirected_url)
diff --git a/tests/test_downloadermiddleware_cookies.py b/tests/test_downloadermiddleware_cookies.py
index 36021bfbfc2..1747f3b94ac 100644
--- a/tests/test_downloadermiddleware_cookies.py
+++ b/tests/test_downloadermiddleware_cookies.py
@@ -6,8 +6,10 @@
 
 from scrapy.downloadermiddlewares.cookies import CookiesMiddleware
 from scrapy.downloadermiddlewares.defaultheaders import DefaultHeadersMiddleware
+from scrapy.downloadermiddlewares.redirect import RedirectMiddleware
 from scrapy.exceptions import NotConfigured
 from scrapy.http import Response, Request
+from scrapy.settings import Settings
 from scrapy.spiders import Spider
 from scrapy.utils.python import to_bytes
 from scrapy.utils.test import get_crawler
@@ -23,9 +25,11 @@ def split_cookies(cookies):
     def setUp(self):
         self.spider = Spider('foo')
         self.mw = CookiesMiddleware()
+        self.redirect_middleware = RedirectMiddleware(settings=Settings())
 
     def tearDown(self):
         del self.mw
+        del self.redirect_middleware
 
     def test_basic(self):
         req = Request('http://scrapytest.org/')
@@ -368,3 +372,154 @@ def test_primitive_type_cookies(self):
         req4 = Request('http://example.org', cookies={'a': 'b'})
         assert self.mw.process_request(req4, self.spider) is None
         self.assertCookieValEqual(req4.headers['Cookie'], b'a=b')
+
+    def _test_cookie_redirect(
+        self,
+        source,
+        target,
+        *,
+        cookies1,
+        cookies2,
+    ):
+        input_cookies = {'a': 'b'}
+
+        if not isinstance(source, dict):
+            source = {'url': source}
+        if not isinstance(target, dict):
+            target = {'url': target}
+        target.setdefault('status', 301)
+
+        request1 = Request(cookies=input_cookies, **source)
+        self.mw.process_request(request1, self.spider)
+        cookies = request1.headers.get('Cookie')
+        self.assertEqual(cookies, b"a=b" if cookies1 else None)
+
+        response = Response(
+            headers={
+                'Location': target['url'],
+            },
+            **target,
+        )
+        self.assertEqual(
+            self.mw.process_response(request1, response, self.spider),
+            response,
+        )
+
+        request2 = self.redirect_middleware.process_response(
+            request1,
+            response,
+            self.spider,
+        )
+        self.assertIsInstance(request2, Request)
+
+        self.mw.process_request(request2, self.spider)
+        cookies = request2.headers.get('Cookie')
+        self.assertEqual(cookies, b"a=b" if cookies2 else None)
+
+    def test_cookie_redirect_same_domain(self):
+        self._test_cookie_redirect(
+            'https://toscrape.com',
+            'https://toscrape.com',
+            cookies1=True,
+            cookies2=True,
+        )
+
+    def test_cookie_redirect_same_domain_forcing_get(self):
+        self._test_cookie_redirect(
+            'https://toscrape.com',
+            {'url': 'https://toscrape.com', 'status': 302},
+            cookies1=True,
+            cookies2=True,
+        )
+
+    def test_cookie_redirect_different_domain(self):
+        self._test_cookie_redirect(
+            'https://toscrape.com',
+            'https://example.com',
+            cookies1=True,
+            cookies2=False,
+        )
+
+    def test_cookie_redirect_different_domain_forcing_get(self):
+        self._test_cookie_redirect(
+            'https://toscrape.com',
+            {'url': 'https://example.com', 'status': 302},
+            cookies1=True,
+            cookies2=False,
+        )
+
+    def _test_cookie_header_redirect(
+        self,
+        source,
+        target,
+        *,
+        cookies2,
+    ):
+        """Test the handling of a user-defined Cookie header when building a
+        redirect follow-up request.
+
+        We follow RFC 6265 for cookie handling. The Cookie header can only
+        contain a list of key-value pairs (i.e. no additional cookie
+        parameters like Domain or Path). Because of that, we follow the same
+        rules that we would follow for the handling of the Set-Cookie response
+        header when the Domain is not set: the cookies must be limited to the
+        target URL domain (not even subdomains can receive those cookies).
+
+        .. note:: This method tests the scenario where the cookie middleware is
+                  disabled. Because of known issue #1992, when the cookies
+                  middleware is enabled we do not need to be concerned about
+                  the Cookie header getting leaked to unintended domains,
+                  because the middleware empties the header from every request.
+        """
+        if not isinstance(source, dict):
+            source = {'url': source}
+        if not isinstance(target, dict):
+            target = {'url': target}
+        target.setdefault('status', 301)
+
+        request1 = Request(headers={'Cookie': b'a=b'}, **source)
+
+        response = Response(
+            headers={
+                'Location': target['url'],
+            },
+            **target,
+        )
+
+        request2 = self.redirect_middleware.process_response(
+            request1,
+            response,
+            self.spider,
+        )
+        self.assertIsInstance(request2, Request)
+
+        cookies = request2.headers.get('Cookie')
+        self.assertEqual(cookies, b"a=b" if cookies2 else None)
+
+    def test_cookie_header_redirect_same_domain(self):
+        self._test_cookie_header_redirect(
+            'https://toscrape.com',
+            'https://toscrape.com',
+            cookies2=True,
+        )
+
+    def test_cookie_header_redirect_same_domain_forcing_get(self):
+        self._test_cookie_header_redirect(
+            'https://toscrape.com',
+            {'url': 'https://toscrape.com', 'status': 302},
+            cookies2=True,
+        )
+
+    def test_cookie_header_redirect_different_domain(self):
+        self._test_cookie_header_redirect(
+            'https://toscrape.com',
+            'https://example.com',
+            cookies2=False,
+        )
+
+    def test_cookie_header_redirect_different_domain_forcing_get(self):
+        self._test_cookie_header_redirect(
+            'https://toscrape.com',
+            {'url': 'https://example.com', 'status': 302},
+            cookies2=False,
+        )