diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py
index 0a1f71444..9df0f3d0a 100644
--- a/google/cloud/bigquery/_helpers.py
+++ b/google/cloud/bigquery/_helpers.py
@@ -19,7 +19,7 @@
 import decimal
 import math
 import re
-from typing import Union
+from typing import Any, Union
 
 from google.cloud._helpers import UTC
 from google.cloud._helpers import _date_from_iso8601_date
@@ -29,7 +29,10 @@ from google.cloud._helpers import _to_bytes
 
 import packaging.version
 
-from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
+from google.cloud.bigquery.exceptions import (
+    LegacyBigQueryStorageError,
+    LegacyPyarrowError,
+)
 
 _RFC3339_MICROS_NO_ZULU = "%Y-%m-%dT%H:%M:%S.%f"
@@ -42,6 +45,7 @@
     re.VERBOSE,
 )
 
+_MIN_PYARROW_VERSION = packaging.version.Version("3.0.0")
 _MIN_BQ_STORAGE_VERSION = packaging.version.Version("2.0.0")
 _BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION = packaging.version.Version("2.6.0")
@@ -95,12 +99,74 @@ def verify_version(self):
         if self.installed_version < _MIN_BQ_STORAGE_VERSION:
             msg = (
                 "Dependency google-cloud-bigquery-storage is outdated, please upgrade "
-                f"it to version >= 2.0.0 (version found: {self.installed_version})."
+                f"it to version >= {_MIN_BQ_STORAGE_VERSION} (version found: {self.installed_version})."
             )
             raise LegacyBigQueryStorageError(msg)
 
 
+class PyarrowVersions:
+    """Version comparisons for pyarrow package."""
+
+    def __init__(self):
+        self._installed_version = None
+
+    @property
+    def installed_version(self) -> packaging.version.Version:
+        """Return the parsed version of pyarrow."""
+        if self._installed_version is None:
+            import pyarrow
+
+            self._installed_version = packaging.version.parse(
+                # Use 0.0.0, since it is earlier than any released version.
+                # Legacy versions also have the same property, but
+                # creating a LegacyVersion has been deprecated.
+                # https://github.com/pypa/packaging/issues/321
+                getattr(pyarrow, "__version__", "0.0.0")
+            )
+
+        return self._installed_version
+
+    def try_import(self, raise_if_error: bool = False) -> Any:
+        """Verify that a recent enough version of the pyarrow extra is
+        installed.
+
+        Because `pip` can install an outdated version of this extra despite the
+        constraints in `setup.py`, the calling code can use this helper to
+        verify the version compatibility at runtime.
+
+        Returns:
+            The ``pyarrow`` module if it is installed and recent enough,
+            otherwise ``None``.
+
+        Raises:
+            LegacyPyarrowError:
+                If the pyarrow package is missing or outdated and
+                ``raise_if_error`` is ``True``.
+        """
+        try:
+            import pyarrow
+        except ImportError as exc:  # pragma: NO COVER
+            if raise_if_error:
+                raise LegacyPyarrowError(
+                    f"pyarrow package not found. Install pyarrow version >= {_MIN_PYARROW_VERSION}."
+                ) from exc
+            return None
+
+        if self.installed_version < _MIN_PYARROW_VERSION:
+            if raise_if_error:
+                msg = (
+                    "Dependency pyarrow is outdated, please upgrade "
+                    f"it to version >= {_MIN_PYARROW_VERSION} (version found: {self.installed_version})."
+                )
+                raise LegacyPyarrowError(msg)
+            return None
+
+        return pyarrow
+
+
 BQ_STORAGE_VERSIONS = BQStorageVersions()
+PYARROW_VERSIONS = PyarrowVersions()
 
 
 def _not_null(value, field):
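For context on how calling code is expected to consume this helper, here is a minimal usage sketch. It is illustrative only and not part of the diff; it assumes the `PYARROW_VERSIONS` singleton and `try_import` exactly as added above.

    # Illustrative sketch -- not part of this change.
    from google.cloud.bigquery import _helpers

    # Soft dependency: get the module, or None when pyarrow is missing or outdated.
    pyarrow = _helpers.PYARROW_VERSIONS.try_import()
    if pyarrow is not None:
        table = pyarrow.table({"col": [1, 2, 3]})

    # Hard dependency: raise LegacyPyarrowError instead of returning None.
    pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)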
diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py
index 29139ae09..0a22043a3 100644
--- a/google/cloud/bigquery/_pandas_helpers.py
+++ b/google/cloud/bigquery/_pandas_helpers.py
@@ -55,12 +55,6 @@ def _to_wkb(v):
 _to_wkb = _to_wkb()
 
-try:
-    import pyarrow
-    import pyarrow.parquet
-except ImportError:  # pragma: NO COVER
-    pyarrow = None
-
 try:
     from google.cloud.bigquery_storage import ArrowSerializationOptions
 except ImportError:
@@ -73,12 +67,10 @@ def _to_wkb(v):
 
 from google.cloud.bigquery import schema
 
-_LOGGER = logging.getLogger(__name__)
+pyarrow = _helpers.PYARROW_VERSIONS.try_import()
 
-_NO_BQSTORAGE_ERROR = (
-    "The google-cloud-bigquery-storage library is not installed, "
-    "please install google-cloud-bigquery-storage to use bqstorage features."
-)
+
+_LOGGER = logging.getLogger(__name__)
 
 _PROGRESS_INTERVAL = 0.2  # Maximum time between download status checks, in seconds.
@@ -548,8 +540,9 @@ def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SNAPPY"):
             serializing method. Defaults to "SNAPPY".
             https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
     """
-    if pyarrow is None:
-        raise ValueError("pyarrow is required for BigQuery schema conversion.")
+    pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)
+
+    import pyarrow.parquet
 
     bq_schema = schema._to_schema_fields(bq_schema)
     arrow_table = dataframe_to_arrow(dataframe, bq_schema)
diff --git a/google/cloud/bigquery/exceptions.py b/google/cloud/bigquery/exceptions.py
index 6e5c27eb1..fb1188eee 100644
--- a/google/cloud/bigquery/exceptions.py
+++ b/google/cloud/bigquery/exceptions.py
@@ -19,3 +19,7 @@ class BigQueryError(Exception):
 
 class LegacyBigQueryStorageError(BigQueryError):
     """Raised when too old a version of BigQuery Storage extra is detected at runtime."""
+
+
+class LegacyPyarrowError(BigQueryError):
+    """Raised when too old a version of pyarrow package is detected at runtime."""
diff --git a/noxfile.py b/noxfile.py
index 9077924e9..d53b33121 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -94,9 +94,16 @@ def unit(session):
     default(session)
 
 
-@nox.session(python=UNIT_TEST_PYTHON_VERSIONS[-1])
+@nox.session(python=[UNIT_TEST_PYTHON_VERSIONS[0], UNIT_TEST_PYTHON_VERSIONS[-1]])
 def unit_noextras(session):
     """Run the unit test suite."""
+
+    # Install an out-of-date version of the optional pyarrow dependency on the
+    # oldest supported Python only, to exercise the legacy-version checks.
+    # https://github.com/googleapis/python-bigquery/issues/933
+    # There is no pyarrow 1.0.0 package for Python 3.9.
+    if session.python == UNIT_TEST_PYTHON_VERSIONS[0]:
+        session.install("pyarrow==1.0.0")
+
     default(session, install_extras=False)
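Note that `dataframe_to_parquet` now fails with `LegacyPyarrowError` instead of `ValueError` when pyarrow is missing or outdated, so callers that caught `ValueError` need updating. A hedged sketch of the new calling pattern follows; `_pandas_helpers` is a private module, and the DataFrame contents and output path below are made up for illustration.

    import pandas

    from google.cloud.bigquery import _pandas_helpers, exceptions, schema

    bq_schema = (schema.SchemaField("name", "STRING"),)
    dataframe = pandas.DataFrame({"name": ["foo", "bar"]})

    try:
        _pandas_helpers.dataframe_to_parquet(dataframe, bq_schema, "/tmp/out.parquet")
    except exceptions.LegacyPyarrowError as exc:
        # pyarrow is absent or older than _MIN_PYARROW_VERSION (3.0.0).
        print(f"Please install or upgrade pyarrow: {exc}")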
diff --git a/testing/constraints-3.6.txt b/testing/constraints-3.6.txt
index be1a992fa..23d2724f7 100644
--- a/testing/constraints-3.6.txt
+++ b/testing/constraints-3.6.txt
@@ -19,6 +19,6 @@ proto-plus==1.10.0
 protobuf==3.12.0
 pyarrow==3.0.0
 requests==2.18.0
-shapely==1.6.0
+Shapely==1.6.0
 six==1.13.0
 tqdm==4.7.4
diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py
index b5af90c0b..580b41c78 100644
--- a/tests/unit/job/test_query_pandas.py
+++ b/tests/unit/job/test_query_pandas.py
@@ -31,10 +31,6 @@
     import geopandas
 except (ImportError, AttributeError):  # pragma: NO COVER
     geopandas = None
-try:
-    import pyarrow
-except (ImportError, AttributeError):  # pragma: NO COVER
-    pyarrow = None
 try:
     from google.cloud import bigquery_storage
 except (ImportError, AttributeError):  # pragma: NO COVER
@@ -44,11 +40,15 @@
 except (ImportError, AttributeError):  # pragma: NO COVER
     tqdm = None
 
+from google.cloud.bigquery import _helpers
 from .helpers import _make_client
 from .helpers import _make_connection
 from .helpers import _make_job_resource
 
+pyarrow = _helpers.PYARROW_VERSIONS.try_import()
+
+
 @pytest.fixture
 def table_read_options_kwarg():
     # Create a BigQuery Storage table read options object with pyarrow compression
diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py
index f8d00e67d..945b95d1b 100644
--- a/tests/unit/test__helpers.py
+++ b/tests/unit/test__helpers.py
@@ -24,9 +24,20 @@
 except ImportError:  # pragma: NO COVER
     bigquery_storage = None
 
+try:
+    import pyarrow
+except ImportError:  # pragma: NO COVER
+    pyarrow = None
+
 
 @unittest.skipIf(bigquery_storage is None, "Requires `google-cloud-bigquery-storage`")
 class TestBQStorageVersions(unittest.TestCase):
+    def tearDown(self):
+        from google.cloud.bigquery import _helpers
+
+        # Reset the cached version, since it may not match reality.
+        _helpers.BQ_STORAGE_VERSIONS._installed_version = None
+
     def _object_under_test(self):
         from google.cloud.bigquery import _helpers
 
@@ -89,6 +100,63 @@ def test_is_read_session_optional_false(self):
         assert not versions.is_read_session_optional
 
 
+@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
+class TestPyarrowVersions(unittest.TestCase):
+    def tearDown(self):
+        from google.cloud.bigquery import _helpers
+
+        # Reset the cached version, since it may not match reality.
+        _helpers.PYARROW_VERSIONS._installed_version = None
+
+    def _object_under_test(self):
+        from google.cloud.bigquery import _helpers
+
+        return _helpers.PyarrowVersions()
+
+    def _call_try_import(self, **kwargs):
+        from google.cloud.bigquery import _helpers
+
+        _helpers.PYARROW_VERSIONS._installed_version = None
+        return _helpers.PYARROW_VERSIONS.try_import(**kwargs)
+
+    def test_try_import_raises_no_error_w_recent_pyarrow(self):
+        from google.cloud.bigquery.exceptions import LegacyPyarrowError
+
+        with mock.patch("pyarrow.__version__", new="5.0.0"):
+            try:
+                pyarrow = self._call_try_import(raise_if_error=True)
+                self.assertIsNotNone(pyarrow)
+            except LegacyPyarrowError:  # pragma: NO COVER
+                self.fail("Legacy error raised with a non-legacy dependency version.")
+
+    def test_try_import_returns_none_w_legacy_pyarrow(self):
+        with mock.patch("pyarrow.__version__", new="2.0.0"):
+            pyarrow = self._call_try_import()
+            self.assertIsNone(pyarrow)
+
+    def test_try_import_raises_error_w_legacy_pyarrow(self):
+        from google.cloud.bigquery.exceptions import LegacyPyarrowError
+
+        with mock.patch("pyarrow.__version__", new="2.0.0"):
+            with self.assertRaises(LegacyPyarrowError):
+                self._call_try_import(raise_if_error=True)
+
+    def test_installed_version_returns_cached(self):
+        versions = self._object_under_test()
+        versions._installed_version = object()
+        assert versions.installed_version is versions._installed_version
+
+    def test_installed_version_returns_parsed_version(self):
+        versions = self._object_under_test()
+
+        with mock.patch("pyarrow.__version__", new="1.2.3"):
+            version = versions.installed_version
+
+        assert version.major == 1
+        assert version.minor == 2
+        assert version.micro == 3
+
+
 class Test_not_null(unittest.TestCase):
     def _call_fut(self, value, field):
         from google.cloud.bigquery._helpers import _not_null
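The version gate works because `packaging` compares parsed versions numerically, which is also why `installed_version` falls back to parsing "0.0.0" rather than comparing raw strings. A standalone illustration, assuming only the `packaging` dependency already used above:

    import packaging.version

    _MIN_PYARROW_VERSION = packaging.version.Version("3.0.0")

    # Numeric, not lexicographic: "10..." would sort before "3..." as a string.
    assert packaging.version.parse("10.0.1") > _MIN_PYARROW_VERSION
    assert packaging.version.parse("2.0.0") < _MIN_PYARROW_VERSION

    # The "0.0.0" fallback sorts before any released pyarrow version.
    assert packaging.version.parse("0.0.0") < _MIN_PYARROW_VERSION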
diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py
index f0975ef65..80b226a3a 100644
--- a/tests/unit/test__pandas_helpers.py
+++ b/tests/unit/test__pandas_helpers.py
@@ -29,13 +29,6 @@
     import pandas.testing
 except ImportError:  # pragma: NO COVER
     pandas = None
-try:
-    import pyarrow
-    import pyarrow.types
-except ImportError:  # pragma: NO COVER
-    # Mock out pyarrow when missing, because methods from pyarrow.types are
-    # used in test parameterization.
-    pyarrow = mock.Mock()
 try:
     import geopandas
 except ImportError:  # pragma: NO COVER
@@ -44,9 +37,19 @@
 import pytest
 
 from google import api_core
+from google.cloud.bigquery import exceptions
 from google.cloud.bigquery import _helpers
 from google.cloud.bigquery import schema
 
+
+pyarrow = _helpers.PYARROW_VERSIONS.try_import()
+if pyarrow:
+    import pyarrow.types
+else:  # pragma: NO COVER
+    # Mock out pyarrow when missing, because methods from pyarrow.types are
+    # used in test parameterization.
+    pyarrow = mock.Mock()
+
 try:
     from google.cloud import bigquery_storage
 
@@ -1120,15 +1123,19 @@ def test_dataframe_to_arrow_dict_sequence_schema(module_under_test):
 
 @pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
 def test_dataframe_to_parquet_without_pyarrow(module_under_test, monkeypatch):
-    monkeypatch.setattr(module_under_test, "pyarrow", None)
-    with pytest.raises(ValueError) as exc_context:
+    mock_pyarrow_import = mock.Mock()
+    mock_pyarrow_import.side_effect = exceptions.LegacyPyarrowError(
+        "pyarrow not installed"
+    )
+    monkeypatch.setattr(_helpers.PYARROW_VERSIONS, "try_import", mock_pyarrow_import)
+
+    with pytest.raises(exceptions.LegacyPyarrowError):
         module_under_test.dataframe_to_parquet(pandas.DataFrame(), (), None)
-    assert "pyarrow is required" in str(exc_context.value)
 
 
 @pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
 @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`")
-def test_dataframe_to_parquet_w_extra_fields(module_under_test, monkeypatch):
+def test_dataframe_to_parquet_w_extra_fields(module_under_test):
     with pytest.raises(ValueError) as exc_context:
         module_under_test.dataframe_to_parquet(
             pandas.DataFrame(), (schema.SchemaField("not_in_df", "STRING"),), None
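The monkeypatched `try_import` in `test_dataframe_to_parquet_without_pyarrow` relies on `mock.Mock` raising its configured `side_effect` when called. A tiny standalone illustration of that mechanism (the error message is arbitrary):

    from unittest import mock

    failing_import = mock.Mock(side_effect=RuntimeError("pyarrow not installed"))

    try:
        failing_import()  # calling the mock raises the configured side effect
    except RuntimeError as exc:
        assert str(exc) == "pyarrow not installed"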
diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py
index 1ce930ee4..c64620a48 100644
--- a/tests/unit/test_table.py
+++ b/tests/unit/test_table.py
@@ -45,18 +45,18 @@
 except (ImportError, AttributeError):  # pragma: NO COVER
     geopandas = None
 
-try:
-    import pyarrow
-    import pyarrow.types
-except ImportError:  # pragma: NO COVER
-    pyarrow = None
-
 try:
     from tqdm import tqdm
 except (ImportError, AttributeError):  # pragma: NO COVER
     tqdm = None
 
 from google.cloud.bigquery.dataset import DatasetReference
+from google.cloud.bigquery import _helpers
+
+
+pyarrow = _helpers.PYARROW_VERSIONS.try_import()
+if pyarrow:
+    import pyarrow.types
 
 
 def _mock_client():
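Under this pattern, a new pyarrow-dependent test module would obtain the module through the shared helper and guard its tests on the result. A minimal sketch, where the test class and assertion are hypothetical:

    import unittest

    from google.cloud.bigquery import _helpers

    pyarrow = _helpers.PYARROW_VERSIONS.try_import()


    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
    class TestArrowTypes(unittest.TestCase):  # hypothetical example
        def test_int64_type(self):
            import pyarrow.types  # only importable when pyarrow is present

            self.assertTrue(pyarrow.types.is_int64(pyarrow.int64()))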