Skip to content

Commit

Permalink
fix: fall back to 'charset' of 'content_type' in 'download_as_text'
Browse files Browse the repository at this point in the history
Explicit 'encoding' overrides the fallback.

Use the 'charset' param of 'content_type', rather than 'content_encoding',
which isn't going to be a Unicode -> bytes encoding.

Closes #319.
  • Loading branch information
tseaver committed Nov 24, 2020
1 parent 6a0b8a7 commit d4ee030
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 59 deletions.
18 changes: 12 additions & 6 deletions google/cloud/storage/blob.py
Expand Up @@ -26,6 +26,7 @@
"""

import base64
import cgi
import copy
import hashlib
from io import BytesIO
Expand Down Expand Up @@ -1345,7 +1346,7 @@ def download_as_text(
start=None,
end=None,
raw_download=False,
encoding="utf-8",
encoding=None,
if_generation_match=None,
if_generation_not_match=None,
if_metageneration_match=None,
Expand Down Expand Up @@ -1374,8 +1375,8 @@ def download_as_text(
:type encoding: str
:param encoding: (Optional) The data of the blob will be decoded by
encoding method. Defaults to UTF-8. Apply only
if the value of ``blob.content_encoding`` is None.
encoding method. Defaults to the ``charset`` param
of attr:`content_type`, or else to "utf-8".
:type if_generation_match: long
:param if_generation_match: (Optional) Make the operation conditional on whether
Expand Down Expand Up @@ -1422,11 +1423,16 @@ def download_as_text(
timeout=timeout,
)

if self.content_encoding:
return data.decode(self.content_encoding)
else:
if encoding is not None:
return data.decode(encoding)

if self.content_type is not None:
_, params = cgi.parse_header(self.content_type)
if "charset" in params:
return data.decode(params["charset"])

return data.decode("utf-8")

def _get_content_type(self, content_type, filename=None):
"""Determine the content type from the current object.
Expand Down
161 changes: 108 additions & 53 deletions tests/unit/test_blob.py
Expand Up @@ -1622,66 +1622,88 @@ def test_download_as_bytes_w_raw(self):
def test_download_as_byte_w_custom_timeout(self):
self._download_as_bytes_helper(raw_download=False, timeout=9.58)

def _download_as_text_helper(self, raw_download, encoding=None, timeout=None):
def _download_as_text_helper(
self,
raw_download,
client=None,
start=None,
end=None,
if_generation_match=None,
if_generation_not_match=None,
if_metageneration_match=None,
if_metageneration_not_match=None,
timeout=None,
encoding=None,
charset=None,
no_charset=False,
expected_value=u"DEADBEEF",
payload=None,
):
if payload is None:
if encoding is not None:
payload = expected_value.encode(encoding)
elif charset is not None:
payload = expected_value.encode(charset)
else:
payload = expected_value.encode()

blob_name = "blob-name"
client = mock.Mock(spec=["_http"])
bucket = _Bucket(client)
media_link = "http://example.com/media/"
properties = {"mediaLink": media_link}
if encoding:
properties["contentEncoding"] = encoding
bucket = _Bucket()

properties = {}
if charset is not None:
properties["contentType"] = "text/plain; charset={}".format(charset)
elif no_charset:
properties = {"contentType": "text/plain"}

blob = self._make_one(blob_name, bucket=bucket, properties=properties)
blob._do_download = mock.Mock()
blob.download_as_bytes = mock.Mock(return_value=payload)

if timeout is None:
expected_timeout = self._get_default_timeout()
fetched = blob.download_as_text(raw_download=raw_download)
else:
expected_timeout = timeout
fetched = blob.download_as_text(raw_download=raw_download, timeout=timeout)
kwargs = {"raw_download": raw_download}

self.assertEqual(fetched, "")
if client is not None:
kwargs["client"] = client

headers = {"accept-encoding": "gzip"}
blob._do_download.assert_called_once_with(
client._http,
mock.ANY,
media_link,
headers,
None,
None,
raw_download,
timeout=expected_timeout,
checksum="md5",
)
stream = blob._do_download.mock_calls[0].args[1]
self.assertIsInstance(stream, io.BytesIO)
if start is not None:
kwargs["start"] = start

def test_download_as_text_w_generation_match(self):
GENERATION_NUMBER = 6
MEDIA_LINK = "http://example.com/media/"
if end is not None:
kwargs["end"] = end

client = mock.Mock(spec=["_http"])
blob = self._make_one(
"blob-name", bucket=_Bucket(client), properties={"mediaLink": MEDIA_LINK}
)
blob.download_to_file = mock.Mock()
if encoding is not None:
kwargs["encoding"] = encoding

fetched = blob.download_as_text(if_generation_match=GENERATION_NUMBER)
self.assertEqual(fetched, "")
if if_generation_match is not None:
kwargs["if_generation_match"] = if_generation_match

blob.download_to_file.assert_called_once_with(
mock.ANY,
client=None,
start=None,
end=None,
raw_download=False,
if_generation_match=GENERATION_NUMBER,
if_generation_not_match=None,
if_metageneration_match=None,
if_metageneration_not_match=None,
timeout=self._get_default_timeout(),
checksum="md5",
if if_generation_not_match is not None:
kwargs["if_generation_not_match"] = if_generation_not_match

if if_metageneration_match is not None:
kwargs["if_metageneration_match"] = if_metageneration_match

if if_metageneration_not_match is not None:
kwargs["if_metageneration_not_match"] = if_metageneration_not_match

if timeout is None:
expected_timeout = self._get_default_timeout()
else:
kwargs["timeout"] = expected_timeout = timeout

fetched = blob.download_as_text(**kwargs)

self.assertEqual(fetched, expected_value)

blob.download_as_bytes.assert_called_once_with(
client=client,
start=client,
end=client,
raw_download=raw_download,
timeout=expected_timeout,
if_generation_match=if_generation_match,
if_generation_not_match=if_generation_not_match,
if_metageneration_match=if_metageneration_match,
if_metageneration_not_match=if_metageneration_not_match,
)

def test_download_as_text_wo_raw(self):
Expand All @@ -1693,8 +1715,41 @@ def test_download_as_text_w_raw(self):
def test_download_as_text_w_custom_timeout(self):
self._download_as_text_helper(raw_download=False, timeout=9.58)

def test_download_as_text_w_encoding(self):
self._download_as_text_helper(raw_download=False, encoding="utf-8")
def test_download_as_text_w_if_generation_match(self):
self._download_as_text_helper(raw_download=False, if_generation_match=6)

def test_download_as_text_w_if_generation_not_match(self):
self._download_as_text_helper(raw_download=False, if_generation_not_match=6)

def test_download_as_text_w_if_metageneration_match(self):
self._download_as_text_helper(raw_download=False, if_metageneration_match=6)

def test_download_as_text_w_if_metageneration_not_match(self):
self._download_as_text_helper(raw_download=False, if_metageneration_not_match=6)

def test_download_as_text_w_non_ascii_w_explicit_encoding(self):
expected_value = u"\x0AFe"
encoding = "utf-16"
charset = "latin1"
payload = expected_value.encode(encoding)
self._download_as_text_helper(
raw_download=False,
expected_value=expected_value,
payload=payload,
encoding=encoding,
charset=charset,
)

def test_download_as_text_w_non_ascii_wo_explicit_encoding_w_charset(self):
expected_value = u"\x0AFe"
charset = "utf-16"
payload = expected_value.encode(charset)
self._download_as_text_helper(
raw_download=False,
expected_value=expected_value,
payload=payload,
charset=charset,
)

@mock.patch("warnings.warn")
def test_download_as_string(self, mock_warn):
Expand Down

0 comments on commit d4ee030

Please sign in to comment.