Add Content-Encoding header in response flag #5943

Open · wants to merge 5 commits into base: master
90 changes: 64 additions & 26 deletions scrapy/downloadermiddlewares/httpcompression.py
@@ -39,12 +39,52 @@ class HttpCompressionMiddleware:

    def __init__(self, stats: Optional[StatsCollector] = None, settings=None):
        self.stats = stats
        if settings:
            self.keep_encoding_header = settings.getbool(
                "COMPRESSION_KEEP_ENCODING_HEADER"
            )
            if not self.keep_encoding_header:
                warnings.warn(
                    "Setting COMPRESSION_KEEP_ENCODING_HEADER=False is deprecated",
                    ScrapyDeprecationWarning,
                )
        else:
            self.keep_encoding_header = False
            warnings.warn(
                "The default value of COMPRESSION_KEEP_ENCODING_HEADER, "
                "False, is deprecated, and will stop working and stop "
                "being its default value in a future version of Scrapy. "
                "Set COMPRESSION_KEEP_ENCODING_HEADER=True in your "
                "settings to remove this warning.",
                ScrapyDeprecationWarning,
                stacklevel=2,
            )

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        if not crawler.settings.getbool("COMPRESSION_ENABLED"):
            raise NotConfigured
        try:
            return cls(stats=crawler.stats, settings=crawler.settings)
        except TypeError:
            warnings.warn(
                "HttpCompressionMiddleware subclasses must either modify "
                "their '__init__' method to support 'stats' and 'settings' "
                "parameters or reimplement the 'from_crawler' method.",
                ScrapyDeprecationWarning,
            )
            result = cls()
            result.stats = crawler.stats
            result.keep_encoding_header = False
            return result

    def process_request(
        self, request: Request, spider: Spider
@@ -57,32 +57,30 @@ def process_response(
    ) -> Union[Request, Response]:
        if request.method == "HEAD":
            return response
        if isinstance(response, Response):
            content_encoding = response.headers.getlist("Content-Encoding")
            if content_encoding:
                encoding = content_encoding.pop()
                decoded_body = self._decode(response.body, encoding.lower())
                if self.stats:
                    self.stats.inc_value(
                        "httpcompression/response_bytes",
                        len(decoded_body),
                        spider=spider,
                    )
                    self.stats.inc_value(
                        "httpcompression/response_count", spider=spider
                    )
                respcls = responsetypes.from_args(
                    headers=response.headers, url=response.url, body=decoded_body
                )
                kwargs = dict(cls=respcls, body=decoded_body)
                if issubclass(respcls, TextResponse):
                    # force recalculating the encoding until we make sure the
                    # responsetypes guessing is reliable
                    kwargs["encoding"] = None
                response = response.replace(**kwargs)
                if not content_encoding:
                    del response.headers["Content-Encoding"]
        if b"decoded" in response.flags:
            return response
        content_encoding = response.headers.getlist("Content-Encoding")
        if not content_encoding:
            return response

        encoding = content_encoding[0]
        decoded_body = self._decode(response.body, encoding.lower())
        if self.stats:
            self.stats.inc_value(
                "httpcompression/response_bytes", len(decoded_body), spider=spider
            )
            self.stats.inc_value("httpcompression/response_count", spider=spider)
        respcls = responsetypes.from_args(
            headers=response.headers, url=response.url, body=decoded_body
        )
        kwargs = dict(cls=respcls, body=decoded_body)
        if issubclass(respcls, TextResponse):
            # force recalculating the encoding until we make sure the
            # responsetypes guessing is reliable
            kwargs["encoding"] = None

        kwargs["flags"] = response.flags + [b"decoded"]
        response = response.replace(**kwargs)
        if not self.keep_encoding_header:
            del response.headers["Content-Encoding"]
        return response

    def _decode(self, body: bytes, encoding: bytes) -> bytes:
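The decoding path above now tags each decompressed response with a b"decoded" flag and, when COMPRESSION_KEEP_ENCODING_HEADER is enabled, leaves the original Content-Encoding header in place. As an illustration only (the spider name, URL, and logging are placeholders, not part of this PR), a callback could observe both:

import scrapy


class FlagsDemoSpider(scrapy.Spider):
    # Hypothetical spider used only to illustrate the behaviour added by this PR.
    name = "flags_demo"
    start_urls = ["https://example.com"]  # placeholder URL
    custom_settings = {
        # Opt in to the proposed future default and silence the deprecation warning.
        "COMPRESSION_KEEP_ENCODING_HEADER": True,
    }

    def parse(self, response):
        # The middleware appends b"decoded" after decompressing, so a response
        # is never decoded twice.
        if b"decoded" in response.flags:
            # With COMPRESSION_KEEP_ENCODING_HEADER=True the original header
            # (e.g. b"gzip") is still present after decompression.
            self.logger.info(
                "Decoded response, Content-Encoding: %r",
                response.headers.get("Content-Encoding"),
            )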
1 change: 1 addition & 0 deletions scrapy/settings/default_settings.py
@@ -39,6 +39,7 @@
COMMANDS_MODULE = ""

COMPRESSION_ENABLED = True
COMPRESSION_KEEP_ENCODING_HEADER = False

Member commented: You should keep the empty line before CONCURRENT_ITEMS

CONCURRENT_ITEMS = 100

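Since the default ships as False (and is immediately deprecated), existing projects have to opt in explicitly to get the new behaviour and silence the warning; a minimal sketch of the corresponding settings.py line, assuming the setting name introduced by this PR:

# settings.py of an existing project: keep the Content-Encoding header of
# decompressed responses and avoid the ScrapyDeprecationWarning.
COMPRESSION_KEEP_ENCODING_HEADER = True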
3 changes: 3 additions & 0 deletions scrapy/templates/project/module/settings.py.tmpl
@@ -87,6 +87,9 @@ ROBOTSTXT_OBEY = True
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Keep original Content-Encoding header
COMPRESSION_KEEP_ENCODING_HEADER = True

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
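The TypeError fallback in from_crawler above also spells out what third-party subclasses need to change. A minimal sketch of a compliant subclass, assuming the stats/settings keyword arguments shown in the diff (the class name is made up for illustration):

from typing import Optional

from scrapy.downloadermiddlewares.httpcompression import HttpCompressionMiddleware
from scrapy.statscollectors import StatsCollector


class ProjectHttpCompressionMiddleware(HttpCompressionMiddleware):
    # Hypothetical subclass: forwarding the new 'settings' argument is enough
    # to avoid the TypeError fallback (and its deprecation warning) in
    # HttpCompressionMiddleware.from_crawler.
    def __init__(self, stats: Optional[StatsCollector] = None, settings=None):
        super().__init__(stats=stats, settings=settings)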