Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: issue a warning if buggy pyarrow is detected #787

Merged
merged 1 commit into from Jul 21, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
15 changes: 15 additions & 0 deletions google/cloud/bigquery/client.py
Expand Up @@ -27,13 +27,16 @@
import json
import math
import os
import packaging.version
import tempfile
from typing import Any, BinaryIO, Dict, Iterable, Optional, Sequence, Tuple, Union
import uuid
import warnings

try:
    import pyarrow

    # Parsed once at import time so that later version checks are plain
    # comparisons against already-parsed version objects.
    _PYARROW_VERSION = packaging.version.parse(pyarrow.__version__)
except ImportError:  # pragma: NO COVER
    pyarrow = None
    # Keep the name bound even when pyarrow is missing, so that a membership
    # check against the known-bad versions evaluates to False instead of
    # raising NameError.
    _PYARROW_VERSION = None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potentially leaves _PYARROW_VERSION undefined. Maybe:

Suggested change
pyarrow = None
pyarrow = None
_PYARROW_VERSION = None

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's only used in the code path where pyarrow is definitely available (there is a prior check), so I left it out, although that implicit assumption could change...

On the other hand, we are already making pyarrow a required dependency, so it probably won't be an issue during the transitional period. It's a valid remark nevertheless.


Expand Down Expand Up @@ -118,6 +121,9 @@
# https://github.com/googleapis/python-bigquery/issues/438
_MIN_GET_QUERY_RESULTS_TIMEOUT = 120

# pyarrow releases known to be buggy; for details see
# https://github.com/googleapis/python-bigquery/issues/781#issuecomment-883497414
_PYARROW_BAD_VERSIONS = frozenset({packaging.version.Version("2.0.0")})


class Project(object):
"""Wrapper for resource describing a BigQuery project.
Expand Down Expand Up @@ -2609,6 +2615,15 @@ def load_table_from_dataframe(
try:

if job_config.source_format == job.SourceFormat.PARQUET:
if _PYARROW_VERSION in _PYARROW_BAD_VERSIONS:
msg = (
"Loading dataframe data in PARQUET format with pyarrow "
f"{_PYARROW_VERSION} can result in data corruption. It is "
"therefore *strongly* advised to use a different pyarrow "
"version or a different source format. "
"See: https://github.com/googleapis/python-bigquery/issues/781"
)
warnings.warn(msg, category=RuntimeWarning)

if job_config.schema:
if parquet_compression == "snappy": # adjust the default value
Expand Down
37 changes: 37 additions & 0 deletions tests/unit/test_client.py
Expand Up @@ -27,6 +27,7 @@
import warnings

import mock
import packaging
import requests
import pytest
import pytz
Expand Down Expand Up @@ -7510,6 +7511,42 @@ def test_load_table_from_dataframe_wo_pyarrow_raises_error(self):
parquet_compression="gzip",
)

def test_load_table_from_dataframe_w_bad_pyarrow_issues_warning(self):
    """Loading a dataframe with a known-bad pyarrow emits one RuntimeWarning.

    The module-level ``_PYARROW_VERSION`` is patched to ``2.0.0`` and the
    table lookup / file upload calls are mocked out, so only the warning
    emitted by ``load_table_from_dataframe`` itself is under test.
    """
    pytest.importorskip("pandas", reason="Requires `pandas`")
    pytest.importorskip("pyarrow", reason="Requires `pyarrow`")

    client = self._make_client()
    records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}]
    dataframe = pandas.DataFrame(records)

    pyarrow_version_patch = mock.patch(
        "google.cloud.bigquery.client._PYARROW_VERSION",
        packaging.version.parse("2.0.0"),  # A known bad version of pyarrow.
    )
    get_table_patch = mock.patch(
        "google.cloud.bigquery.client.Client.get_table",
        autospec=True,
        side_effect=google.api_core.exceptions.NotFound("Table not found"),
    )
    load_patch = mock.patch(
        "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
    )

    with load_patch, get_table_patch, pyarrow_version_patch:
        with warnings.catch_warnings(record=True) as warned:
            client.load_table_from_dataframe(
                dataframe, self.TABLE_REF, location=self.LOCATION,
            )

    # Match on the warning *message* only. str() of a recorded warning also
    # contains the originating file name and source line, so an unrelated
    # warning raised from a file under a "pyarrow" directory would otherwise
    # be counted here and break the exact-count assertion below.
    expected_warnings = [
        warning for warning in warned if "pyarrow" in str(warning.message).lower()
    ]
    assert len(expected_warnings) == 1
    assert issubclass(expected_warnings[0].category, RuntimeWarning)
    msg = str(expected_warnings[0].message)
    assert "pyarrow 2.0.0" in msg
    assert "data corruption" in msg

@unittest.skipIf(pandas is None, "Requires `pandas`")
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
def test_load_table_from_dataframe_w_nulls(self):
Expand Down