deps: update dependencies (#1282)
* update dependencies

* deps: pyarrow extras

* clean up comments

* add test pyarrow skips

* replace storage checks

* update tests

* update tests

* Update setup.py

* update system tests

* update verify_pandas_imports

* add pyarrow guards

* add datetime check

* change pyarrow import

* update

* add pyarrow skips

* fix types

* lint

* Update google/cloud/bigquery/client.py

Co-authored-by: Tim Swast <swast@google.com>

* update pyarrow version

* update test

* lint

* update pyarrow req

* update noxfile

* remove bignum check

* remove comments

* add test importorskip

* update test

* update test

* update dependency

* change version

* update imports

Co-authored-by: Anthonios Partheniou <partheniou@google.com>
Co-authored-by: Tim Swast <swast@google.com>
3 people committed Dec 8, 2022
1 parent 589c8bd commit e1aa921
Showing 23 changed files with 1,013 additions and 118 deletions.
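The recurring pattern in this change set is a guarded optional import followed by a minimum-version check before pyarrow is used. For orientation, here is a minimal, self-contained sketch of that pattern; the helper name require_pyarrow and the 3.0.0 floor mirror the diff below but are illustrative, not part of the library's public API.

import packaging.version

try:
    import pyarrow  # optional extra; may be absent or outdated
except ImportError:
    pyarrow = None

_MIN_PYARROW_VERSION = packaging.version.Version("3.0.0")


def require_pyarrow():
    # Return the pyarrow module, or raise with an actionable message when the
    # optional dependency is missing or older than the supported floor.
    if pyarrow is None:
        raise ImportError(
            f"pyarrow not found; install pyarrow >= {_MIN_PYARROW_VERSION}."
        )
    if packaging.version.parse(pyarrow.__version__) < _MIN_PYARROW_VERSION:
        raise ImportError(
            f"pyarrow {pyarrow.__version__} is too old; upgrade to >= {_MIN_PYARROW_VERSION}."
        )
    return pyarrow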
5 changes: 5 additions & 0 deletions docs/snippets.py
@@ -31,6 +31,11 @@
except (ImportError, AttributeError):
    pandas = None

try:
    import pyarrow
except (ImportError, AttributeError):
    pyarrow = None

from google.api_core.exceptions import InternalServerError
from google.api_core.exceptions import ServiceUnavailable
from google.api_core.exceptions import TooManyRequests
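The test-side counterpart of this guard, per the "add test importorskip" and "add pyarrow skips" bullets in the commit message, is the standard pytest skip. A hedged sketch follows; the test body is illustrative and not taken from this repository's test suite.

import pytest

# Skip the whole module when pyarrow is missing or older than the supported floor.
pyarrow = pytest.importorskip("pyarrow", minversion="3.0.0")


def test_table_roundtrip():
    table = pyarrow.Table.from_pydict({"id": [1, 2, 3]})
    assert table.num_rows == 3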
5 changes: 5 additions & 0 deletions google/cloud/bigquery/__init__.py
@@ -42,6 +42,8 @@
from google.cloud.bigquery.enums import KeyResultStatementKind
from google.cloud.bigquery.enums import SqlTypeNames
from google.cloud.bigquery.enums import StandardSqlTypeNames
from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
from google.cloud.bigquery.exceptions import LegacyPyarrowError
from google.cloud.bigquery.external_config import ExternalConfig
from google.cloud.bigquery.external_config import BigtableOptions
from google.cloud.bigquery.external_config import BigtableColumnFamily
@@ -195,6 +197,9 @@
"WriteDisposition",
# EncryptionConfiguration
"EncryptionConfiguration",
# Custom exceptions
"LegacyBigQueryStorageError",
"LegacyPyarrowError",
]


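Because both exception types are now exported from the top-level package, application code can catch them directly. A hedged usage sketch follows; whether a given call surfaces these errors depends on the installed extras and the code path taken, and the dataset and table names are placeholders.

import pandas

from google.cloud import bigquery
from google.cloud.bigquery import LegacyBigQueryStorageError, LegacyPyarrowError

client = bigquery.Client()
df = pandas.DataFrame({"id": [1, 2, 3]})

try:
    # DataFrame loads serialize through pyarrow by default, so an outdated
    # pyarrow install can surface as LegacyPyarrowError on this path.
    client.load_table_from_dataframe(df, "my_dataset.my_table").result()
except LegacyPyarrowError as exc:
    print(f"pyarrow is too old for DataFrame loads: {exc}")
except LegacyBigQueryStorageError as exc:
    print(f"google-cloud-bigquery-storage needs an upgrade: {exc}")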
74 changes: 72 additions & 2 deletions google/cloud/bigquery/_helpers.py
@@ -20,7 +20,7 @@
import math
import re
import os
from typing import Optional, Union
from typing import Any, Optional, Union

from dateutil import relativedelta
from google.cloud._helpers import UTC # type: ignore
@@ -32,6 +32,11 @@

import packaging.version

from google.cloud.bigquery.exceptions import (
LegacyBigQueryStorageError,
LegacyPyarrowError,
)

_RFC3339_MICROS_NO_ZULU = "%Y-%m-%dT%H:%M:%S.%f"
_TIMEONLY_WO_MICROS = "%H:%M:%S"
_TIMEONLY_W_MICROS = "%H:%M:%S.%f"
@@ -50,6 +55,10 @@
r"(?P<time_sign>-?)(?P<hours>\d+):(?P<minutes>\d+):(?P<seconds>\d+)\.?(?P<fraction>\d*)?$"
)

_MIN_BQ_STORAGE_VERSION = packaging.version.Version("2.0.0")

_MIN_PYARROW_VERSION = packaging.version.Version("3.0.0")

_BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION = packaging.version.Version("2.6.0")

BIGQUERY_EMULATOR_HOST = "BIGQUERY_EMULATOR_HOST"
@@ -83,7 +92,7 @@ def installed_version(self) -> packaging.version.Version:
getattr(bigquery_storage, "__version__", "0.0.0")
)

return self._installed_version
return self._installed_version # type: ignore

@property
def is_read_session_optional(self) -> bool:
@@ -93,6 +102,29 @@ def is_read_session_optional(self) -> bool:
"""
return self.installed_version >= _BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION

    def verify_version(self):
        """Verify that a recent enough version of BigQuery Storage extra is
        installed.

        The function assumes that google-cloud-bigquery-storage extra is
        installed, and should thus be used in places where this assumption
        holds.

        Because `pip` can install an outdated version of this extra despite the
        constraints in `setup.py`, the calling code can use this helper to
        verify the version compatibility at runtime.

        Raises:
            LegacyBigQueryStorageError:
                If the google-cloud-bigquery-storage package is outdated.
        """
        if self.installed_version < _MIN_BQ_STORAGE_VERSION:
            msg = (
                "Dependency google-cloud-bigquery-storage is outdated, please upgrade "
                f"it to version >= {_MIN_BQ_STORAGE_VERSION} (version found: {self.installed_version})."
            )
            raise LegacyBigQueryStorageError(msg)


class PyarrowVersions:
"""Version comparisons for pyarrow package."""
@@ -120,6 +152,44 @@ def installed_version(self) -> packaging.version.Version:
def use_compliant_nested_type(self) -> bool:
return self.installed_version.major >= 4

    def try_import(self, raise_if_error: bool = False) -> Any:
        """Verify that a recent enough version of pyarrow extra is
        installed.

        The function assumes that pyarrow extra is installed, and should thus
        be used in places where this assumption holds.

        Because `pip` can install an outdated version of this extra despite the
        constraints in `setup.py`, the calling code can use this helper to
        verify the version compatibility at runtime.

        Returns:
            The ``pyarrow`` module or ``None``.

        Raises:
            LegacyPyarrowError:
                If the pyarrow package is outdated and ``raise_if_error`` is ``True``.
        """
        try:
            import pyarrow
        except ImportError as exc:  # pragma: NO COVER
            if raise_if_error:
                raise LegacyPyarrowError(
                    f"pyarrow package not found. Install pyarrow version >= {_MIN_PYARROW_VERSION}."
                ) from exc
            return None

        if self.installed_version < _MIN_PYARROW_VERSION:
            if raise_if_error:
                msg = (
                    "Dependency pyarrow is outdated, please upgrade "
                    f"it to version >= {_MIN_PYARROW_VERSION} (version found: {self.installed_version})."
                )
                raise LegacyPyarrowError(msg)
            return None

        return pyarrow


BQ_STORAGE_VERSIONS = BQStorageVersions()
PYARROW_VERSIONS = PyarrowVersions()
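A short sketch of how calling code can use the two module-level singletons defined above; the wrapper function names are illustrative and not part of the module.

from google.cloud.bigquery import _helpers
from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError


def columns_to_arrow(columns):
    # Soft dependency: try_import() returns None when pyarrow is missing or
    # below the minimum version, so callers can fall back gracefully.
    pyarrow = _helpers.PYARROW_VERSIONS.try_import()
    if pyarrow is None:
        return None
    return pyarrow.Table.from_pydict(columns)


def bqstorage_is_usable():
    # Hard check: verify_version() raises for outdated installs of the
    # google-cloud-bigquery-storage extra.
    try:
        _helpers.BQ_STORAGE_VERSIONS.verify_version()
    except LegacyBigQueryStorageError:
        return False
    return True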
127 changes: 74 additions & 53 deletions google/cloud/bigquery/_pandas_helpers.py
@@ -22,6 +22,11 @@
import queue
import warnings

from packaging import version

from google.cloud.bigquery import _helpers
from google.cloud.bigquery import schema

try:
import pandas # type: ignore

@@ -43,9 +48,7 @@
db_dtypes_import_exception = exc
date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype


import pyarrow # type: ignore
import pyarrow.parquet # type: ignore
pyarrow = _helpers.PYARROW_VERSIONS.try_import()

try:
# _BaseGeometry is used to detect shapely objects in `bq_to_arrow_array`
@@ -77,10 +80,6 @@ def _to_wkb(v):
# Having BQ Storage available implies that pyarrow >=1.0.0 is available, too.
_ARROW_COMPRESSION_SUPPORT = True

from google.cloud.bigquery import _helpers
from google.cloud.bigquery import schema


_LOGGER = logging.getLogger(__name__)

_PROGRESS_INTERVAL = 0.2 # Maximum time between download status checks, in seconds.
@@ -141,52 +140,65 @@ def pyarrow_timestamp():
return pyarrow.timestamp("us", tz="UTC")


# This dictionary is duplicated in bigquery_storage/test/unit/test_reader.py
# When modifying it be sure to update it there as well.
BQ_TO_ARROW_SCALARS = {
"BIGNUMERIC": pyarrow_bignumeric,
"BOOL": pyarrow.bool_,
"BOOLEAN": pyarrow.bool_,
"BYTES": pyarrow.binary,
"DATE": pyarrow.date32,
"DATETIME": pyarrow_datetime,
"FLOAT": pyarrow.float64,
"FLOAT64": pyarrow.float64,
"GEOGRAPHY": pyarrow.string,
"INT64": pyarrow.int64,
"INTEGER": pyarrow.int64,
"NUMERIC": pyarrow_numeric,
"STRING": pyarrow.string,
"TIME": pyarrow_time,
"TIMESTAMP": pyarrow_timestamp,
}
ARROW_SCALAR_IDS_TO_BQ = {
# https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
pyarrow.bool_().id: "BOOL",
pyarrow.int8().id: "INT64",
pyarrow.int16().id: "INT64",
pyarrow.int32().id: "INT64",
pyarrow.int64().id: "INT64",
pyarrow.uint8().id: "INT64",
pyarrow.uint16().id: "INT64",
pyarrow.uint32().id: "INT64",
pyarrow.uint64().id: "INT64",
pyarrow.float16().id: "FLOAT64",
pyarrow.float32().id: "FLOAT64",
pyarrow.float64().id: "FLOAT64",
pyarrow.time32("ms").id: "TIME",
pyarrow.time64("ns").id: "TIME",
pyarrow.timestamp("ns").id: "TIMESTAMP",
pyarrow.date32().id: "DATE",
pyarrow.date64().id: "DATETIME", # because millisecond resolution
pyarrow.binary().id: "BYTES",
pyarrow.string().id: "STRING", # also alias for pyarrow.utf8()
# The exact scale and precision don't matter, see below.
pyarrow.decimal128(38, scale=9).id: "NUMERIC",
# The exact decimal's scale and precision are not important, as only
# the type ID matters, and it's the same for all decimal256 instances.
pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC",
}
if pyarrow:
    # This dictionary is duplicated in bigquery_storage/test/unit/test_reader.py
    # When modifying it be sure to update it there as well.
    BQ_TO_ARROW_SCALARS = {
        "BOOL": pyarrow.bool_,
        "BOOLEAN": pyarrow.bool_,
        "BYTES": pyarrow.binary,
        "DATE": pyarrow.date32,
        "DATETIME": pyarrow_datetime,
        "FLOAT": pyarrow.float64,
        "FLOAT64": pyarrow.float64,
        "GEOGRAPHY": pyarrow.string,
        "INT64": pyarrow.int64,
        "INTEGER": pyarrow.int64,
        "NUMERIC": pyarrow_numeric,
        "STRING": pyarrow.string,
        "TIME": pyarrow_time,
        "TIMESTAMP": pyarrow_timestamp,
    }
    ARROW_SCALAR_IDS_TO_BQ = {
        # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
        pyarrow.bool_().id: "BOOL",
        pyarrow.int8().id: "INT64",
        pyarrow.int16().id: "INT64",
        pyarrow.int32().id: "INT64",
        pyarrow.int64().id: "INT64",
        pyarrow.uint8().id: "INT64",
        pyarrow.uint16().id: "INT64",
        pyarrow.uint32().id: "INT64",
        pyarrow.uint64().id: "INT64",
        pyarrow.float16().id: "FLOAT64",
        pyarrow.float32().id: "FLOAT64",
        pyarrow.float64().id: "FLOAT64",
        pyarrow.time32("ms").id: "TIME",
        pyarrow.time64("ns").id: "TIME",
        pyarrow.timestamp("ns").id: "TIMESTAMP",
        pyarrow.date32().id: "DATE",
        pyarrow.date64().id: "DATETIME",  # because millisecond resolution
        pyarrow.binary().id: "BYTES",
        pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
        # The exact scale and precision don't matter, see below.
        pyarrow.decimal128(38, scale=9).id: "NUMERIC",
    }

    if version.parse(pyarrow.__version__) >= version.parse("3.0.0"):
        BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric
        # The exact decimal's scale and precision are not important, as only
        # the type ID matters, and it's the same for all decimal256 instances.
        ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC"
        _BIGNUMERIC_SUPPORT = True
    else:
        _BIGNUMERIC_SUPPORT = False  # pragma: NO COVER

else:  # pragma: NO COVER
    BQ_TO_ARROW_SCALARS = {}  # pragma: NO COVER
    ARROW_SCALAR_IDS_TO_BQ = {}  # pragma: NO COVER
    _BIGNUMERIC_SUPPORT = False  # pragma: NO COVER


BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = {
"GEOGRAPHY": {
b"ARROW:extension:name": b"google:sqlType:geography",
@@ -480,6 +492,13 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
    # If schema detection was not successful for all columns, also try with
    # pyarrow, if available.
    if unknown_type_fields:
        if not pyarrow:
            msg = "Could not determine the type of columns: {}".format(
                ", ".join(field.name for field in unknown_type_fields)
            )
            warnings.warn(msg)
            return None  # We cannot detect the schema in full.

        # The augment_schema() helper itself will also issue unknown type
        # warnings if detection still fails for any of the fields.
        bq_schema_out = augment_schema(dataframe, bq_schema_out)
@@ -654,6 +673,8 @@ def dataframe_to_parquet(
This argument is ignored for ``pyarrow`` versions earlier than ``4.0.0``.
"""
pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)

import pyarrow.parquet # type: ignore

kwargs = (
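With the mapping dictionaries left empty when pyarrow is unavailable, lookups degrade to None instead of failing at import time. A hedged sketch of that lookup pattern follows; the helper name is illustrative, not the module's actual function.

from google.cloud.bigquery._pandas_helpers import BQ_TO_ARROW_SCALARS


def bq_scalar_to_arrow_type(bq_type: str):
    # Map a BigQuery scalar type name to a pyarrow DataType, or return None
    # when pyarrow is not installed or the type has no known mapping.
    factory = BQ_TO_ARROW_SCALARS.get(bq_type.upper())
    return factory() if factory is not None else None


# Example: returns pyarrow.int64() when pyarrow is installed, None otherwise.
arrow_type = bq_scalar_to_arrow_type("INTEGER")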
