From a006df3f2f4994186e620b350477cf3b13e94a97 Mon Sep 17 00:00:00 2001 From: Tres Seaver Date: Tue, 24 Nov 2020 16:03:51 -0500 Subject: [PATCH] fix: fall back to 'charset' of 'content_type' in 'download_as_text' Explicit 'encoding' overrides the fallback. Use the 'charset' param of 'content_type', rather than 'content_encoding', which isn't going to be a Unicode -> bytes encoding. Closes #319. --- google/cloud/storage/blob.py | 18 ++-- tests/unit/test_blob.py | 161 +++++++++++++++++++++++------------ 2 files changed, 120 insertions(+), 59 deletions(-) diff --git a/google/cloud/storage/blob.py b/google/cloud/storage/blob.py index 18006d5ad..aa8ea3630 100644 --- a/google/cloud/storage/blob.py +++ b/google/cloud/storage/blob.py @@ -26,6 +26,7 @@ """ import base64 +import cgi import copy import hashlib from io import BytesIO @@ -1345,7 +1346,7 @@ def download_as_text( start=None, end=None, raw_download=False, - encoding="utf-8", + encoding=None, if_generation_match=None, if_generation_not_match=None, if_metageneration_match=None, @@ -1374,8 +1375,8 @@ def download_as_text( :type encoding: str :param encoding: (Optional) The data of the blob will be decoded by - encoding method. Defaults to UTF-8. Apply only - if the value of ``blob.content_encoding`` is None. + encoding method. Defaults to the ``charset`` param + of attr:`content_type`, or else to "utf-8". :type if_generation_match: long :param if_generation_match: (Optional) Make the operation conditional on whether @@ -1422,11 +1423,16 @@ def download_as_text( timeout=timeout, ) - if self.content_encoding: - return data.decode(self.content_encoding) - else: + if encoding is not None: return data.decode(encoding) + if self.content_type is not None: + _, params = cgi.parse_header(self.content_type) + if "charset" in params: + return data.decode(params["charset"]) + + return data.decode("utf-8") + def _get_content_type(self, content_type, filename=None): """Determine the content type from the current object. diff --git a/tests/unit/test_blob.py b/tests/unit/test_blob.py index bc9918627..a680cd747 100644 --- a/tests/unit/test_blob.py +++ b/tests/unit/test_blob.py @@ -1622,66 +1622,88 @@ def test_download_as_bytes_w_raw(self): def test_download_as_byte_w_custom_timeout(self): self._download_as_bytes_helper(raw_download=False, timeout=9.58) - def _download_as_text_helper(self, raw_download, encoding=None, timeout=None): + def _download_as_text_helper( + self, + raw_download, + client=None, + start=None, + end=None, + if_generation_match=None, + if_generation_not_match=None, + if_metageneration_match=None, + if_metageneration_not_match=None, + timeout=None, + encoding=None, + charset=None, + no_charset=False, + expected_value=u"DEADBEEF", + payload=None, + ): + if payload is None: + if encoding is not None: + payload = expected_value.encode(encoding) + elif charset is not None: + payload = expected_value.encode(charset) + else: + payload = expected_value.encode() + blob_name = "blob-name" - client = mock.Mock(spec=["_http"]) - bucket = _Bucket(client) - media_link = "http://example.com/media/" - properties = {"mediaLink": media_link} - if encoding: - properties["contentEncoding"] = encoding + bucket = _Bucket() + + properties = {} + if charset is not None: + properties["contentType"] = "text/plain; charset={}".format(charset) + elif no_charset: + properties = {"contentType": "text/plain"} + blob = self._make_one(blob_name, bucket=bucket, properties=properties) - blob._do_download = mock.Mock() + blob.download_as_bytes = mock.Mock(return_value=payload) - if timeout is None: - expected_timeout = self._get_default_timeout() - fetched = blob.download_as_text(raw_download=raw_download) - else: - expected_timeout = timeout - fetched = blob.download_as_text(raw_download=raw_download, timeout=timeout) + kwargs = {"raw_download": raw_download} - self.assertEqual(fetched, "") + if client is not None: + kwargs["client"] = client - headers = {"accept-encoding": "gzip"} - blob._do_download.assert_called_once_with( - client._http, - mock.ANY, - media_link, - headers, - None, - None, - raw_download, - timeout=expected_timeout, - checksum="md5", - ) - stream = blob._do_download.mock_calls[0].args[1] - self.assertIsInstance(stream, io.BytesIO) + if start is not None: + kwargs["start"] = start - def test_download_as_text_w_generation_match(self): - GENERATION_NUMBER = 6 - MEDIA_LINK = "http://example.com/media/" + if end is not None: + kwargs["end"] = end - client = mock.Mock(spec=["_http"]) - blob = self._make_one( - "blob-name", bucket=_Bucket(client), properties={"mediaLink": MEDIA_LINK} - ) - blob.download_to_file = mock.Mock() + if encoding is not None: + kwargs["encoding"] = encoding - fetched = blob.download_as_text(if_generation_match=GENERATION_NUMBER) - self.assertEqual(fetched, "") + if if_generation_match is not None: + kwargs["if_generation_match"] = if_generation_match - blob.download_to_file.assert_called_once_with( - mock.ANY, - client=None, - start=None, - end=None, - raw_download=False, - if_generation_match=GENERATION_NUMBER, - if_generation_not_match=None, - if_metageneration_match=None, - if_metageneration_not_match=None, - timeout=self._get_default_timeout(), - checksum="md5", + if if_generation_not_match is not None: + kwargs["if_generation_not_match"] = if_generation_not_match + + if if_metageneration_match is not None: + kwargs["if_metageneration_match"] = if_metageneration_match + + if if_metageneration_not_match is not None: + kwargs["if_metageneration_not_match"] = if_metageneration_not_match + + if timeout is None: + expected_timeout = self._get_default_timeout() + else: + kwargs["timeout"] = expected_timeout = timeout + + fetched = blob.download_as_text(**kwargs) + + self.assertEqual(fetched, expected_value) + + blob.download_as_bytes.assert_called_once_with( + client=client, + start=client, + end=client, + raw_download=raw_download, + timeout=expected_timeout, + if_generation_match=if_generation_match, + if_generation_not_match=if_generation_not_match, + if_metageneration_match=if_metageneration_match, + if_metageneration_not_match=if_metageneration_not_match, ) def test_download_as_text_wo_raw(self): @@ -1693,8 +1715,41 @@ def test_download_as_text_w_raw(self): def test_download_as_text_w_custom_timeout(self): self._download_as_text_helper(raw_download=False, timeout=9.58) - def test_download_as_text_w_encoding(self): - self._download_as_text_helper(raw_download=False, encoding="utf-8") + def test_download_as_text_w_if_generation_match(self): + self._download_as_text_helper(raw_download=False, if_generation_match=6) + + def test_download_as_text_w_if_generation_not_match(self): + self._download_as_text_helper(raw_download=False, if_generation_not_match=6) + + def test_download_as_text_w_if_metageneration_match(self): + self._download_as_text_helper(raw_download=False, if_metageneration_match=6) + + def test_download_as_text_w_if_metageneration_not_match(self): + self._download_as_text_helper(raw_download=False, if_metageneration_not_match=6) + + def test_download_as_text_w_non_ascii_w_explicit_encoding(self): + expected_value = u"\x0AFe" + encoding = "utf-16" + charset = "latin1" + payload = expected_value.encode(encoding) + self._download_as_text_helper( + raw_download=False, + expected_value=expected_value, + payload=payload, + encoding=encoding, + charset=charset, + ) + + def test_download_as_text_w_non_ascii_wo_explicit_encoding_w_charset(self): + expected_value = u"\x0AFe" + charset = "utf-16" + payload = expected_value.encode(charset) + self._download_as_text_helper( + raw_download=False, + expected_value=expected_value, + payload=payload, + charset=charset, + ) @mock.patch("warnings.warn") def test_download_as_string(self, mock_warn):