Skip to content

Commit

Permalink
Add support for multiple-compressed responses (#6063)
Browse files Browse the repository at this point in the history
  • Loading branch information
vishesh10 committed Feb 22, 2024
1 parent ebd7e19 commit e208f82
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 5 deletions.
34 changes: 29 additions & 5 deletions scrapy/downloadermiddlewares/httpcompression.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import warnings
from itertools import chain
from logging import getLogger
from typing import TYPE_CHECKING, List, Optional, Union

Expand Down Expand Up @@ -102,25 +103,26 @@ def process_response(
if isinstance(response, Response):
content_encoding = response.headers.getlist("Content-Encoding")
if content_encoding:
encoding = content_encoding.pop()
max_size = request.meta.get("download_maxsize", self._max_size)
warn_size = request.meta.get("download_warnsize", self._warn_size)
try:
decoded_body = self._decode(
response.body, encoding.lower(), max_size
decoded_body, content_encoding = self._handle_encoding(
response.body, content_encoding, max_size
)
except _DecompressionMaxSizeExceeded:
raise IgnoreRequest(
f"Ignored response {response} because its body "
f"({len(response.body)} B) exceeded DOWNLOAD_MAXSIZE "
f"({max_size} B) during decompression."
f"({len(response.body)} B compressed) exceeded "
f"DOWNLOAD_MAXSIZE ({max_size} B) during "
f"decompression."
)
if len(response.body) < warn_size <= len(decoded_body):
logger.warning(
f"{response} body size after decompression "
f"({len(decoded_body)} B) is larger than the "
f"download warning size ({warn_size} B)."
)
response.headers["Content-Encoding"] = content_encoding
if self.stats:
self.stats.inc_value(
"httpcompression/response_bytes",
Expand All @@ -144,6 +146,28 @@ def process_response(

return response

def _handle_encoding(self, body, content_encoding, max_size):
    """Decode ``body`` through every encoding that can be undone.

    ``content_encoding`` is the list of ``Content-Encoding`` header values.
    It is partitioned by ``_split_encodings`` into the encodings to decode
    (already in the order they must be applied) and the ones to leave in
    place. ``max_size`` is passed unchanged to every ``_decode`` call.

    Returns a ``(decoded_body, remaining_encodings)`` pair.
    """
    decodable, remaining = self._split_encodings(content_encoding)
    # Each step consumes the output of the previous one, peeling the
    # encoding layers off one at a time.
    for layer in decodable:
        body = self._decode(body, layer, max_size)
    return body, remaining

def _split_encodings(self, content_encoding):
to_keep = [
encoding.strip().lower()
for encoding in chain.from_iterable(
encodings.split(b",") for encodings in content_encoding
)
]
to_decode = []
while to_keep:
encoding = to_keep.pop()
if encoding not in ACCEPTED_ENCODINGS:
to_keep.append(encoding)
return to_decode, to_keep
to_decode.append(encoding)
return to_decode, to_keep

def _decode(self, body: bytes, encoding: bytes, max_size: int) -> bytes:
if encoding == b"gzip" or encoding == b"x-gzip":
return gunzip(body, max_size=max_size)
Expand Down
Binary file not shown.
Binary file not shown.
58 changes: 58 additions & 0 deletions tests/test_downloadermiddleware_httpcompression.py
Expand Up @@ -27,6 +27,8 @@
"x-gzip": ("html-gzip.bin", "gzip"),
"rawdeflate": ("html-rawdeflate.bin", "deflate"),
"zlibdeflate": ("html-zlibdeflate.bin", "deflate"),
"gzip-deflate": ("html-gzip-deflate.bin", "gzip, deflate"),
"gzip-deflate-gzip": ("html-gzip-deflate-gzip.bin", "gzip, deflate, gzip"),
"br": ("html-br.bin", "br"),
# $ zstd raw.html --content-size -o html-zstd-static-content-size.bin
"zstd-static-content-size": ("html-zstd-static-content-size.bin", "zstd"),
Expand Down Expand Up @@ -205,6 +207,62 @@ def test_multipleencodings(self):
assert newresponse is not response
self.assertEqual(newresponse.headers.getlist("Content-Encoding"), [b"uuencode"])

def test_multi_compression_single_header(self):
    """A body behind the "gzip-deflate" fixture is fully decoded.

    The fixture is registered with the header value "gzip, deflate", so no
    ``Content-Encoding`` header may remain afterwards.
    """
    compressed = self._getresponse("gzip-deflate")
    decoded = self.mw.process_response(
        compressed.request, compressed, self.spider
    )
    assert decoded is not compressed
    assert "Content-Encoding" not in decoded.headers
    assert decoded.body.startswith(b"<!DOCTYPE")

def test_multi_compression_single_header_invalid_compression(self):
    """Decoding stops at an unknown encoding inside a single header value.

    With "gzip, foo, deflate" only the trailing "deflate" is removed;
    "gzip" and "foo" must remain in the ``Content-Encoding`` header.
    """
    compressed = self._getresponse("gzip-deflate")
    compressed.headers["Content-Encoding"] = [b"gzip, foo, deflate"]
    processed = self.mw.process_response(
        compressed.request, compressed, self.spider
    )
    assert processed is not compressed
    remaining = processed.headers.getlist("Content-Encoding")
    self.assertEqual(remaining, [b"gzip", b"foo"])

def test_multi_compression_multiple_header(self):
    """Encodings spread over separate header values are all decoded.

    "gzip" and "deflate" are given as two ``Content-Encoding`` values; the
    processed response must carry no ``Content-Encoding`` header.
    """
    compressed = self._getresponse("gzip-deflate")
    compressed.headers["Content-Encoding"] = ["gzip", "deflate"]
    decoded = self.mw.process_response(
        compressed.request, compressed, self.spider
    )
    assert decoded is not compressed
    assert "Content-Encoding" not in decoded.headers
    assert decoded.body.startswith(b"<!DOCTYPE")

def test_multi_compression_multiple_header_invalid_compression(self):
    """Decoding stops at an unknown encoding given as its own header value.

    With the values "gzip", "foo", "deflate" only "deflate" is removed;
    "gzip" and "foo" must remain in the ``Content-Encoding`` header.
    """
    compressed = self._getresponse("gzip-deflate")
    compressed.headers["Content-Encoding"] = ["gzip", "foo", "deflate"]
    processed = self.mw.process_response(
        compressed.request, compressed, self.spider
    )
    assert processed is not compressed
    remaining = processed.headers.getlist("Content-Encoding")
    self.assertEqual(remaining, [b"gzip", b"foo"])

def test_multi_compression_single_and_multiple_header(self):
    """Mixed single and comma-separated header values are all decoded.

    The header values are "gzip" and "deflate, gzip"; the processed
    response must carry no ``Content-Encoding`` header.
    """
    compressed = self._getresponse("gzip-deflate-gzip")
    compressed.headers["Content-Encoding"] = ["gzip", "deflate, gzip"]
    decoded = self.mw.process_response(
        compressed.request, compressed, self.spider
    )
    assert decoded is not compressed
    assert "Content-Encoding" not in decoded.headers
    assert decoded.body.startswith(b"<!DOCTYPE")

def test_multi_compression_single_and_multiple_header_invalid_compression(self):
    """Decoding stops at an unknown encoding in a comma-separated value.

    With the values "gzip" and "foo,deflate" only "deflate" is removed;
    "gzip" and "foo" must remain in the ``Content-Encoding`` header.
    """
    compressed = self._getresponse("gzip-deflate")
    compressed.headers["Content-Encoding"] = ["gzip", "foo,deflate"]
    processed = self.mw.process_response(
        compressed.request, compressed, self.spider
    )
    assert processed is not compressed
    remaining = processed.headers.getlist("Content-Encoding")
    self.assertEqual(remaining, [b"gzip", b"foo"])

def test_process_response_encoding_inside_body(self):
headers = {
"Content-Type": "text/html",
Expand Down

0 comments on commit e208f82

Please sign in to comment.