httpcompression middleware: warn on unexpected encodings and prevent manually set unsupported encodings #6145

Open · wants to merge 5 commits into master
41 changes: 35 additions & 6 deletions scrapy/downloadermiddlewares/httpcompression.py
@@ -1,12 +1,13 @@
from __future__ import annotations

import io
import logging
import zlib
-from typing import TYPE_CHECKING, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union

from scrapy import Request, Spider
from scrapy.crawler import Crawler
-from scrapy.exceptions import NotConfigured
+from scrapy.exceptions import NotConfigured, NotSupported
from scrapy.http import Response, TextResponse
from scrapy.responsetypes import responsetypes
from scrapy.statscollectors import StatsCollector
@@ -17,6 +18,10 @@
from typing_extensions import Self

ACCEPTED_ENCODINGS: List[bytes] = [b"gzip", b"deflate"]
ENCODINGS_DELIMITER: bytes = b", "

logger = logging.getLogger(__name__)


try:
import brotli
@@ -35,7 +40,7 @@

class HttpCompressionMiddleware:
"""This middleware allows compressed (gzip, deflate) traffic to be
-    sent/received from web sites"""
+    sent/received from websites"""

def __init__(self, stats: Optional[StatsCollector] = None):
self.stats = stats
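
For context: this middleware ships enabled by default among Scrapy's downloader middleware defaults (order 590) and is gated by the COMPRESSION_ENABLED setting, so disabling it is a one-line project setting. A sketch, with the setting name taken from Scrapy's documented defaults:

# settings.py
# from_crawler() raises NotConfigured when this is False, which
# deactivates the middleware for the whole project.
COMPRESSION_ENABLED = False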
@@ -49,7 +54,10 @@ def from_crawler(cls, crawler: Crawler) -> Self:
def process_request(
self, request: Request, spider: Spider
) -> Union[Request, Response, None]:
-        request.headers.setdefault("Accept-Encoding", b", ".join(ACCEPTED_ENCODINGS))
+        self._raise_unsupported_compressors(request)
+        request.headers.setdefault(
+            "Accept-Encoding", ENCODINGS_DELIMITER.join(ACCEPTED_ENCODINGS)
+        )
return None

def process_response(
@@ -85,6 +93,22 @@ def process_response(

return response

@property
    def _raise_unsupported(self) -> Tuple[bytes, ...]:
return (b"br",)

    def _raise_unsupported_compressors(self, request: Request) -> None:
encodings = request.headers.getlist("Accept-Encoding")
if encodings and len(encodings):
Review comment (Member): I think the len check here is not needed.

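For illustration, a tiny runnable sketch of the redundancy being pointed out: an empty list is already falsy, so the truthiness test alone covers both cases.

# bool() of a list is False exactly when its length is 0, so
# `if encodings:` behaves the same as `if encodings and len(encodings):`.
for encodings in ([], [b"gzip, deflate"]):
    assert bool(encodings) == (len(encodings) > 0)
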
        encodings = encodings.pop().split(ENCODINGS_DELIMITER)
Review comment (Member): Not sure why this is doing encodings.pop() when you could use encodings[-1]; also, this should iterate over all of encodings instead of taking a single item.

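A hedged sketch of the iteration being suggested, walking every Accept-Encoding header value instead of a single popped item (ACCEPTED stands in for ACCEPTED_ENCODINGS; this is not the PR's code):

from scrapy import Request

ACCEPTED = [b"gzip", b"deflate"]

request = Request(
    "https://example.com",
    headers={"Accept-Encoding": b"br, gzip"},
)
# Collect unsupported tokens across all header values, not just the last one.
unsupported = [
    token.strip()
    for value in request.headers.getlist("Accept-Encoding")
    for token in value.split(b",")
    if token.strip() not in ACCEPTED
]
print(unsupported)  # [b'br']
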
unsupported = [key for key in encodings if key not in ACCEPTED_ENCODINGS]
for unsupp in unsupported:
if unsupp in self._raise_unsupported:
                raise NotSupported(
                    f"Request is configured with an Accept-Encoding header "
                    f"containing an unsupported encoding: {unsupp.decode()}"
                )
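
A minimal sketch of the behaviour this method adds, assuming brotli is not installed (so b"br" is absent from ACCEPTED_ENCODINGS); with brotli available, the header would instead be accepted:

from scrapy import Request, Spider
from scrapy.downloadermiddlewares.httpcompression import HttpCompressionMiddleware
from scrapy.exceptions import NotSupported

mw = HttpCompressionMiddleware()
request = Request("https://example.com", headers={"Accept-Encoding": "br"})
try:
    mw.process_request(request, Spider(name="demo"))
except NotSupported as exc:
    # Raised because b"br" was set manually but cannot be decoded.
    print(exc)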

def _decode(self, body: bytes, encoding: bytes) -> bytes:
if encoding == b"gzip" or encoding == b"x-gzip":
body = gunzip(body)
@@ -99,8 +123,13 @@ def _decode(self, body: bytes, encoding: bytes) -> bytes:
# http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
# http://www.gzip.org/zlib/zlib_faq.html#faq38
body = zlib.decompress(body, -15)
-        if encoding == b"br" and b"br" in ACCEPTED_ENCODINGS:
-            body = brotli.decompress(body)
+        if encoding == b"br":
+            if b"br" in ACCEPTED_ENCODINGS:
+                body = brotli.decompress(body)
+            else:
+                logger.warning(
+                    "Brotli encoding received. Cannot decompress the body as Brotli is not installed."
+                )
if encoding == b"zstd" and b"zstd" in ACCEPTED_ENCODINGS:
# Using its streaming API since its simple API could handle only cases
# where there is content size data embedded in the frame
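To illustrate the new fallback, a sketch assuming brotli is not importable: _decode() now logs the warning and hands the body back untouched rather than crashing.

import logging

from scrapy.downloadermiddlewares.httpcompression import HttpCompressionMiddleware

logging.basicConfig(level=logging.WARNING)
mw = HttpCompressionMiddleware()
payload = b"placeholder brotli payload"
# The warning is logged; the (still compressed) body is returned unchanged.
assert mw._decode(payload, b"br") == payload
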
34 changes: 33 additions & 1 deletion tests/test_downloadermiddleware_httpcompression.py
@@ -1,6 +1,7 @@
from gzip import GzipFile
from io import BytesIO
from pathlib import Path
from typing import Tuple
from unittest import SkipTest, TestCase

from w3lib.encoding import resolve_encoding
@@ -9,7 +10,7 @@
ACCEPTED_ENCODINGS,
HttpCompressionMiddleware,
)
-from scrapy.exceptions import NotConfigured
+from scrapy.exceptions import NotConfigured, NotSupported
from scrapy.http import HtmlResponse, Request, Response
from scrapy.responsetypes import responsetypes
from scrapy.spiders import Spider
@@ -37,6 +38,12 @@
}


class BroHttpCompressionMiddleware(HttpCompressionMiddleware):
@property
    def _raise_unsupported(self) -> Tuple[bytes, ...]:
return (b"bro",)


class HttpCompressionTest(TestCase):
def setUp(self):
self.crawler = get_crawler(Spider)
@@ -102,6 +109,31 @@ def test_process_request(self):
request.headers.get("Accept-Encoding"), b", ".join(ACCEPTED_ENCODINGS)
)

def test_process_request_checks_encodings(self):
initial_encodings = ACCEPTED_ENCODINGS.copy()

mw = BroHttpCompressionMiddleware.from_crawler(self.crawler)

ACCEPTED_ENCODINGS.append(b"bro")

request = Request(
"http://scrapytest.org",
headers={"Accept-Encoding": b", ".join((b"bro", b"gzip"))},
)
        # No exception expected here: b"bro" has been appended to ACCEPTED_ENCODINGS.
        mw.process_request(request, self.spider)

request = Request(
"http://scrapytest.org",
headers={"Accept-Encoding": b", ".join((b"bro", b"gzip"))},
)
ACCEPTED_ENCODINGS.pop()
mw = BroHttpCompressionMiddleware.from_crawler(self.crawler)
self.assertRaises(NotSupported, mw.process_request, request, self.spider)

"""Checking that valid encondings are back"""
self.assertListEqual(ACCEPTED_ENCODINGS, initial_encodings)
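
As an aside, a hedged alternative to the manual append()/pop() bookkeeping above: unittest.mock.patch.object restores the module-level list automatically, even if an assertion fails midway (a sketch reusing the test's fixtures; not part of this PR):

from unittest import mock

from scrapy.downloadermiddlewares import httpcompression

with mock.patch.object(
    httpcompression,
    "ACCEPTED_ENCODINGS",
    [*httpcompression.ACCEPTED_ENCODINGS, b"bro"],
):
    # Cleanup is automatic when the context manager exits.
    mw.process_request(request, self.spider)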

def test_process_response_gzip(self):
response = self._getresponse("gzip")
request = response.request