Follow vorboto/feature/httpcompression keep response encoding #6156

Open: wants to merge 29 commits into master

Commits (29)
- 5f068dc  Add HEADERS_KEEP variable to settings and to HttpCompressionMiddlewar…  (cwen13, Sep 14, 2019)
- 6a725ca  fixed type  (cwen13, Sep 15, 2019)
- 2b6fb1b  Added init to HttpCompMiddleware #1988  (cwen13, Sep 16, 2019)
- 96a8dfc  Added init to HttpCompMiddleware #1988 Typeo  (cwen13, Sep 16, 2019)
- 9010d9f  Trying with passing init crawler #1988  (cwen13, Sep 16, 2019)
- 608b342  Adding settings to HttpCompressionMiddleware instantiation in test  (cwen13, Sep 16, 2019)
- 3350d0f  Slowed down and tried to take things slower  (cwen13, Sep 17, 2019)
- cb09ef5  I mispelled init -_-  (VorBoto, Sep 17, 2019)
- f949126  changed the how its intialized in test  (VorBoto, Sep 17, 2019)
- c94f35b  Added a crawler to HttpCompressionTest using the Spider it created  (VorBoto, Sep 17, 2019)
- 490fb3c  Added from scrapy.utils.test import get_crawler  (VorBoto, Sep 17, 2019)
- e22c851  Passed tox but still not right  (VorBoto, Sep 17, 2019)
- e479f88  Trying to add flag for encoding  (VorBoto, Sep 18, 2019)
- 25d4760  Trying to add flag for encoding  (VorBoto, Sep 18, 2019)
- ccbb524  Place encodings in response's flags issue #1988  (VorBoto, Sep 18, 2019)
- 8814265  Did some recomended changes  (VorBoto, Sep 21, 2019)
- 4588368  updated tests to function with no paramters sent to setUp()  (VorBoto, Sep 25, 2019)
- 0a0b40d  Switched dict['k'] to a dict.get('k') so no KeyError  (VorBoto, Sep 25, 2019)
- 30ead53  Actually pull the flags before trying to add decoded  (VorBoto, Sep 26, 2019)
- fd11144  Removed unecessary checks for flag entry of kwargs  (VorBoto, Sep 27, 2019)
- 091a8a4  Added/copied a few test as a way to show some continued functionality  (VorBoto, Oct 5, 2019)
- d2fbd10  Fix an oversight  (VorBoto, Oct 5, 2019)
- a944117  Fix an oversight on casting a str to binary  (VorBoto, Oct 5, 2019)
- 8164e31  Fix an oversight on casting a str to binary again  (VorBoto, Oct 5, 2019)
- 11af9bb  fix some sytanx and formating as well as variable usage  (VorBoto, Oct 17, 2019)
- 64597cd  Ther was a ( on the loose  (VorBoto, Oct 17, 2019)
- 0a03b45  Syntax clean up  (VorBoto, Oct 18, 2019)
- e776fae  Merge branch 'master' of github.com:scrapy/scrapy into feature/httpco…  (heorhiikorzh, Nov 27, 2023)
- cab040b  Changed settings pickup in HttpCompressionMiddleware.__init__  (heorhiikorzh, Nov 27, 2023)
21 changes: 17 additions & 4 deletions scrapy/downloadermiddlewares/httpcompression.py
@@ -9,6 +9,7 @@
from scrapy.exceptions import NotConfigured
from scrapy.http import Response, TextResponse
from scrapy.responsetypes import responsetypes
from scrapy.settings import Settings
from scrapy.statscollectors import StatsCollector
from scrapy.utils.gz import gunzip

@@ -33,18 +34,29 @@
pass


def _get_init_settings_kwargs(settings: Settings):
keep_encoding_header = (
settings.getbool("COMPRESSION_KEEP_ENCODING_HEADERS") if settings else False
)
return dict(keep_encoding_header=keep_encoding_header)


class HttpCompressionMiddleware:
"""This middleware allows compressed (gzip, deflate) traffic to be
sent/received from web sites"""

def __init__(self, stats: Optional[StatsCollector] = None):
def __init__(
self, stats: Optional[StatsCollector] = None, keep_encoding_header: bool = False
):
self.stats = stats
self.keep_encoding_header = keep_encoding_header

@classmethod
def from_crawler(cls, crawler: Crawler) -> Self:
if not crawler.settings.getbool("COMPRESSION_ENABLED"):
raise NotConfigured
return cls(stats=crawler.stats)
settings_kwargs = _get_init_settings_kwargs(crawler.settings)
return cls(stats=crawler.stats, **settings_kwargs)

def process_request(
self, request: Request, spider: Spider
@@ -79,10 +91,11 @@
# force recalculating the encoding until we make sure the
# responsetypes guessing is reliable
kwargs["encoding"] = None
if self.keep_encoding_header:
kwargs["flags"] = response.flags + ["decoded"]

[Codecov / codecov/patch check warning: added line scrapy/downloadermiddlewares/httpcompression.py#L95 was not covered by tests]
response = response.replace(**kwargs)
if not content_encoding:
if not self.keep_encoding_header and not content_encoding:
del response.headers["Content-Encoding"]

return response

def _decode(self, body: bytes, encoding: bytes) -> bytes:
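
For context, the sketch below (not part of the PR) exercises the changed code path end to end: it builds the middleware through from_crawler with the new setting enabled and feeds it a hand-made gzip response. The URL, body, and spider name are placeholders, and the expected output assumes this branch's behaviour of appending a "decoded" flag and keeping the Content-Encoding header.

import gzip

from scrapy.downloadermiddlewares.httpcompression import HttpCompressionMiddleware
from scrapy.http import Request, Response
from scrapy.spiders import Spider
from scrapy.utils.test import get_crawler

# Build a crawler with the new setting turned on (COMPRESSION_ENABLED defaults to True),
# mirroring the helper added to the test suite below.
crawler = get_crawler(Spider, {"COMPRESSION_KEEP_ENCODING_HEADERS": True})
spider = crawler._create_spider("demo")
mw = HttpCompressionMiddleware.from_crawler(crawler)

# A fake gzip-encoded response, similar to what the test fixtures provide.
request = Request("https://example.com/")
response = Response(
    "https://example.com/",
    headers={"Content-Encoding": "gzip"},
    body=gzip.compress(b"<!DOCTYPE html><html><body>hello</body></html>"),
    request=request,
)

decoded = mw.process_response(request, response, spider)
print(decoded.flags)                            # expected on this branch: ['decoded']
print(decoded.headers.get("Content-Encoding"))  # expected on this branch: b'gzip'
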
1 change: 1 addition & 0 deletions scrapy/settings/default_settings.py
@@ -39,6 +39,7 @@
COMMANDS_MODULE = ""

COMPRESSION_ENABLED = True
COMPRESSION_KEEP_ENCODING_HEADERS = False

CONCURRENT_ITEMS = 100

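
From a project's perspective, opting in would presumably amount to setting the new option (project-wide in settings.py or per spider via custom_settings) and then checking the flag in a callback. The spider below is purely illustrative; only COMPRESSION_KEEP_ENCODING_HEADERS comes from this PR.

import scrapy


class ExampleSpider(scrapy.Spider):
    # Hypothetical spider, shown only to illustrate how the new setting would be used.
    name = "example"
    start_urls = ["https://example.com/"]
    custom_settings = {
        # New setting added by this PR; defaults to False.
        "COMPRESSION_KEEP_ENCODING_HEADERS": True,
    }

    def parse(self, response):
        # With the setting enabled, the middleware keeps the original
        # Content-Encoding header and appends a "decoded" response flag.
        if "decoded" in response.flags:
            self.logger.info(
                "Decompressed from %r", response.headers.get("Content-Encoding")
            )
        yield {"url": response.url}
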
39 changes: 39 additions & 0 deletions tests/test_downloadermiddleware_httpcompression.py
@@ -38,6 +38,17 @@


class HttpCompressionTest(TestCase):
@staticmethod
def create_spider_mw(compression_enabled=True, compression_header=False):
settings = {
"COMPRESSION_ENABLED": compression_enabled,
"COMPRESSION_KEEP_ENCODING_HEADERS": compression_header,
}
crawler = get_crawler(Spider, settings)
spider = crawler._create_spider("foo")
mw = HttpCompressionMiddleware.from_crawler(crawler)
return spider, mw

def setUp(self):
self.crawler = get_crawler(Spider)
self.spider = self.crawler._create_spider("scrapytest.org")
@@ -371,3 +382,31 @@ def test_process_response_head_request_no_decode_required(self):
self.assertEqual(response.body, b"")
self.assertStatsEqual("httpcompression/response_count", None)
self.assertStatsEqual("httpcompression/response_bytes", None)

def test_process_response_gzip_keep_headers(self):
test_spider, test_mw = self.create_spider_mw(
compression_enabled=True, compression_header=True
)
response = self._getresponse("gzip")
request = response.request

self.assertEqual(response.headers["Content-Encoding"], b"gzip")
newresponse = test_mw.process_response(request, response, test_spider)
assert newresponse is not response
assert newresponse.body.startswith(b"<!DOCTYPE")
assert "Content-Encoding" in newresponse.headers
assert "decoded" in newresponse.flags

def test_process_response_gzip_binary_octetstream_contenttype_kept(self):
test_spider, test_mw = self.create_spider_mw(
compression_enabled=True, compression_header=True
)
response = self._getresponse("x-gzip")
response.headers["Content-Type"] = "binary/octet-stream"
request = response.request

newresponse = test_mw.process_response(request, response, test_spider)
self.assertIsNot(newresponse, response)
self.assertTrue(newresponse.body.startswith(b"<!DOCTYPE"))
self.assertIn("Content-Encoding", newresponse.headers)
self.assertIn("decoded", newresponse.flags)