deps: update dependencies (#1282)
* update dependencies

* deps: pyarrow extras

* clean up comments

* add test pyarrow skips

* replace storage checks

* update tests

* update tests

* Update setup.py

* update system tests

* update verify_pandas_imports

* add pyarrow guards

* add datetime check

* change pyarrow import

* update

* add pyarrow skips

* fix types

* lint

* Update google/cloud/bigquery/client.py

Co-authored-by: Tim Swast <swast@google.com>

* update pyarrow version

* update test

* lint

* update pyarrow req

* update noxfile

* remove bignum check

* remove comments

* add test importorskip

* update test

* update test

* update dependency

* change version

* update imports

Co-authored-by: Anthonios Partheniou <partheniou@google.com>
Co-authored-by: Tim Swast <swast@google.com>
3 people committed Dec 8, 2022
1 parent 589c8bd commit e1aa921
Showing 23 changed files with 1,013 additions and 118 deletions.
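The recurring pattern in this change set is a guarded optional import followed by a minimum-version check before pyarrow is used. For orientation, here is a minimal, self-contained sketch of that pattern; the helper name require_pyarrow and the 3.0.0 floor mirror the diff below but are illustrative, not part of the library's public API.

import packaging.version

try:
    import pyarrow  # optional extra; may be absent or outdated
except ImportError:
    pyarrow = None

_MIN_PYARROW_VERSION = packaging.version.Version("3.0.0")


def require_pyarrow():
    # Return the pyarrow module, or raise with an actionable message when the
    # optional dependency is missing or older than the supported floor.
    if pyarrow is None:
        raise ImportError(
            f"pyarrow not found; install pyarrow >= {_MIN_PYARROW_VERSION}."
        )
    if packaging.version.parse(pyarrow.__version__) < _MIN_PYARROW_VERSION:
        raise ImportError(
            f"pyarrow {pyarrow.__version__} is too old; upgrade to >= {_MIN_PYARROW_VERSION}."
        )
    return pyarrow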
5 changes: 5 additions & 0 deletions docs/snippets.py
@@ -31,6 +31,11 @@
except (ImportError, AttributeError):
    pandas = None

try:
    import pyarrow
except (ImportError, AttributeError):
    pyarrow = None

from google.api_core.exceptions import InternalServerError
from google.api_core.exceptions import ServiceUnavailable
from google.api_core.exceptions import TooManyRequests
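The test-side counterpart of this guard, per the "add test importorskip" and "add pyarrow skips" bullets in the commit message, is the standard pytest skip. A hedged sketch follows; the test body is illustrative and not taken from this repository's test suite.

import pytest

# Skip the whole module when pyarrow is missing or older than the supported floor.
pyarrow = pytest.importorskip("pyarrow", minversion="3.0.0")


def test_table_roundtrip():
    table = pyarrow.Table.from_pydict({"id": [1, 2, 3]})
    assert table.num_rows == 3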
5 changes: 5 additions & 0 deletions google/cloud/bigquery/__init__.py
@@ -42,6 +42,8 @@
from google.cloud.bigquery.enums import KeyResultStatementKind
from google.cloud.bigquery.enums import SqlTypeNames
from google.cloud.bigquery.enums import StandardSqlTypeNames
from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
from google.cloud.bigquery.exceptions import LegacyPyarrowError
from google.cloud.bigquery.external_config import ExternalConfig
from google.cloud.bigquery.external_config import BigtableOptions
from google.cloud.bigquery.external_config import BigtableColumnFamily
@@ -195,6 +197,9 @@
"WriteDisposition",
# EncryptionConfiguration
"EncryptionConfiguration",
# Custom exceptions
"LegacyBigQueryStorageError",
"LegacyPyarrowError",
]


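Because both exception types are now exported from the top-level package, application code can catch them directly. A hedged usage sketch follows; whether a given call surfaces these errors depends on the installed extras and the code path taken, and the dataset and table names are placeholders.

import pandas

from google.cloud import bigquery
from google.cloud.bigquery import LegacyBigQueryStorageError, LegacyPyarrowError

client = bigquery.Client()
df = pandas.DataFrame({"id": [1, 2, 3]})

try:
    # DataFrame loads serialize through pyarrow by default, so an outdated
    # pyarrow install can surface as LegacyPyarrowError on this path.
    client.load_table_from_dataframe(df, "my_dataset.my_table").result()
except LegacyPyarrowError as exc:
    print(f"pyarrow is too old for DataFrame loads: {exc}")
except LegacyBigQueryStorageError as exc:
    print(f"google-cloud-bigquery-storage needs an upgrade: {exc}")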
74 changes: 72 additions & 2 deletions google/cloud/bigquery/_helpers.py
@@ -20,7 +20,7 @@
import math
import re
import os
from typing import Optional, Union
from typing import Any, Optional, Union

from dateutil import relativedelta
from google.cloud._helpers import UTC # type: ignore
@@ -32,6 +32,11 @@

import packaging.version

from google.cloud.bigquery.exceptions import (
LegacyBigQueryStorageError,
LegacyPyarrowError,
)

_RFC3339_MICROS_NO_ZULU = "%Y-%m-%dT%H:%M:%S.%f"
_TIMEONLY_WO_MICROS = "%H:%M:%S"
_TIMEONLY_W_MICROS = "%H:%M:%S.%f"
@@ -50,6 +55,10 @@
r"(?P<time_sign>-?)(?P<hours>\d+):(?P<minutes>\d+):(?P<seconds>\d+)\.?(?P<fraction>\d*)?$"
)

_MIN_BQ_STORAGE_VERSION = packaging.version.Version("2.0.0")

_MIN_PYARROW_VERSION = packaging.version.Version("3.0.0")

_BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION = packaging.version.Version("2.6.0")

BIGQUERY_EMULATOR_HOST = "BIGQUERY_EMULATOR_HOST"
@@ -83,7 +92,7 @@ def installed_version(self) -> packaging.version.Version:
getattr(bigquery_storage, "__version__", "0.0.0")
)

return self._installed_version
return self._installed_version # type: ignore

@property
def is_read_session_optional(self) -> bool:
@@ -93,6 +102,29 @@ def is_read_session_optional(self) -> bool:
"""
return self.installed_version >= _BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION

    def verify_version(self):
        """Verify that a recent enough version of BigQuery Storage extra is
        installed.

        The function assumes that google-cloud-bigquery-storage extra is
        installed, and should thus be used in places where this assumption
        holds.

        Because `pip` can install an outdated version of this extra despite the
        constraints in `setup.py`, the calling code can use this helper to
        verify the version compatibility at runtime.

        Raises:
            LegacyBigQueryStorageError:
                If the google-cloud-bigquery-storage package is outdated.
        """
        if self.installed_version < _MIN_BQ_STORAGE_VERSION:
            msg = (
                "Dependency google-cloud-bigquery-storage is outdated, please upgrade "
                f"it to version >= {_MIN_BQ_STORAGE_VERSION} (version found: {self.installed_version})."
            )
            raise LegacyBigQueryStorageError(msg)


class PyarrowVersions:
"""Version comparisons for pyarrow package."""
@@ -120,6 +152,44 @@ def installed_version(self) -> packaging.version.Version:
def use_compliant_nested_type(self) -> bool:
return self.installed_version.major >= 4

    def try_import(self, raise_if_error: bool = False) -> Any:
        """Verify that a recent enough version of pyarrow extra is
        installed.

        The function assumes that pyarrow extra is installed, and should thus
        be used in places where this assumption holds.

        Because `pip` can install an outdated version of this extra despite the
        constraints in `setup.py`, the calling code can use this helper to
        verify the version compatibility at runtime.

        Returns:
            The ``pyarrow`` module or ``None``.

        Raises:
            LegacyPyarrowError:
                If the pyarrow package is outdated and ``raise_if_error`` is ``True``.
        """
        try:
            import pyarrow
        except ImportError as exc:  # pragma: NO COVER
            if raise_if_error:
                raise LegacyPyarrowError(
                    f"pyarrow package not found. Install pyarrow version >= {_MIN_PYARROW_VERSION}."
                ) from exc
            return None

        if self.installed_version < _MIN_PYARROW_VERSION:
            if raise_if_error:
                msg = (
                    "Dependency pyarrow is outdated, please upgrade "
                    f"it to version >= {_MIN_PYARROW_VERSION} (version found: {self.installed_version})."
                )
                raise LegacyPyarrowError(msg)
            return None

        return pyarrow


BQ_STORAGE_VERSIONS = BQStorageVersions()
PYARROW_VERSIONS = PyarrowVersions()
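A short sketch of how calling code can use the two module-level singletons defined above; the wrapper function names are illustrative and not part of the module.

from google.cloud.bigquery import _helpers
from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError


def columns_to_arrow(columns):
    # Soft dependency: try_import() returns None when pyarrow is missing or
    # below the minimum version, so callers can fall back gracefully.
    pyarrow = _helpers.PYARROW_VERSIONS.try_import()
    if pyarrow is None:
        return None
    return pyarrow.Table.from_pydict(columns)


def bqstorage_is_usable():
    # Hard check: verify_version() raises for outdated installs of the
    # google-cloud-bigquery-storage extra.
    try:
        _helpers.BQ_STORAGE_VERSIONS.verify_version()
    except LegacyBigQueryStorageError:
        return False
    return True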
127 changes: 74 additions & 53 deletions google/cloud/bigquery/_pandas_helpers.py
@@ -22,6 +22,11 @@
import queue
import warnings

from packaging import version

from google.cloud.bigquery import _helpers
from google.cloud.bigquery import schema

try:
import pandas # type: ignore

@@ -43,9 +48,7 @@
db_dtypes_import_exception = exc
date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype


import pyarrow # type: ignore
import pyarrow.parquet # type: ignore
pyarrow = _helpers.PYARROW_VERSIONS.try_import()

try:
# _BaseGeometry is used to detect shapely objects in `bq_to_arrow_array`
@@ -77,10 +80,6 @@ def _to_wkb(v):
# Having BQ Storage available implies that pyarrow >=1.0.0 is available, too.
_ARROW_COMPRESSION_SUPPORT = True

from google.cloud.bigquery import _helpers
from google.cloud.bigquery import schema


_LOGGER = logging.getLogger(__name__)

_PROGRESS_INTERVAL = 0.2 # Maximum time between download status checks, in seconds.
@@ -141,52 +140,65 @@ def pyarrow_timestamp():
return pyarrow.timestamp("us", tz="UTC")


# This dictionary is duplicated in bigquery_storage/test/unit/test_reader.py
# When modifying it be sure to update it there as well.
BQ_TO_ARROW_SCALARS = {
"BIGNUMERIC": pyarrow_bignumeric,
"BOOL": pyarrow.bool_,
"BOOLEAN": pyarrow.bool_,
"BYTES": pyarrow.binary,
"DATE": pyarrow.date32,
"DATETIME": pyarrow_datetime,
"FLOAT": pyarrow.float64,
"FLOAT64": pyarrow.float64,
"GEOGRAPHY": pyarrow.string,
"INT64": pyarrow.int64,
"INTEGER": pyarrow.int64,
"NUMERIC": pyarrow_numeric,
"STRING": pyarrow.string,
"TIME": pyarrow_time,
"TIMESTAMP": pyarrow_timestamp,
}
ARROW_SCALAR_IDS_TO_BQ = {
# https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
pyarrow.bool_().id: "BOOL",
pyarrow.int8().id: "INT64",
pyarrow.int16().id: "INT64",
pyarrow.int32().id: "INT64",
pyarrow.int64().id: "INT64",
pyarrow.uint8().id: "INT64",
pyarrow.uint16().id: "INT64",
pyarrow.uint32().id: "INT64",
pyarrow.uint64().id: "INT64",
pyarrow.float16().id: "FLOAT64",
pyarrow.float32().id: "FLOAT64",
pyarrow.float64().id: "FLOAT64",
pyarrow.time32("ms").id: "TIME",
pyarrow.time64("ns").id: "TIME",
pyarrow.timestamp("ns").id: "TIMESTAMP",
pyarrow.date32().id: "DATE",
pyarrow.date64().id: "DATETIME", # because millisecond resolution
pyarrow.binary().id: "BYTES",
pyarrow.string().id: "STRING", # also alias for pyarrow.utf8()
# The exact scale and precision don't matter, see below.
pyarrow.decimal128(38, scale=9).id: "NUMERIC",
# The exact decimal's scale and precision are not important, as only
# the type ID matters, and it's the same for all decimal256 instances.
pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC",
}
if pyarrow:
    # This dictionary is duplicated in bigquery_storage/test/unit/test_reader.py
    # When modifying it be sure to update it there as well.
    BQ_TO_ARROW_SCALARS = {
        "BOOL": pyarrow.bool_,
        "BOOLEAN": pyarrow.bool_,
        "BYTES": pyarrow.binary,
        "DATE": pyarrow.date32,
        "DATETIME": pyarrow_datetime,
        "FLOAT": pyarrow.float64,
        "FLOAT64": pyarrow.float64,
        "GEOGRAPHY": pyarrow.string,
        "INT64": pyarrow.int64,
        "INTEGER": pyarrow.int64,
        "NUMERIC": pyarrow_numeric,
        "STRING": pyarrow.string,
        "TIME": pyarrow_time,
        "TIMESTAMP": pyarrow_timestamp,
    }
    ARROW_SCALAR_IDS_TO_BQ = {
        # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
        pyarrow.bool_().id: "BOOL",
        pyarrow.int8().id: "INT64",
        pyarrow.int16().id: "INT64",
        pyarrow.int32().id: "INT64",
        pyarrow.int64().id: "INT64",
        pyarrow.uint8().id: "INT64",
        pyarrow.uint16().id: "INT64",
        pyarrow.uint32().id: "INT64",
        pyarrow.uint64().id: "INT64",
        pyarrow.float16().id: "FLOAT64",
        pyarrow.float32().id: "FLOAT64",
        pyarrow.float64().id: "FLOAT64",
        pyarrow.time32("ms").id: "TIME",
        pyarrow.time64("ns").id: "TIME",
        pyarrow.timestamp("ns").id: "TIMESTAMP",
        pyarrow.date32().id: "DATE",
        pyarrow.date64().id: "DATETIME",  # because millisecond resolution
        pyarrow.binary().id: "BYTES",
        pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
        # The exact scale and precision don't matter, see below.
        pyarrow.decimal128(38, scale=9).id: "NUMERIC",
    }

    if version.parse(pyarrow.__version__) >= version.parse("3.0.0"):
        BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric
        # The exact decimal's scale and precision are not important, as only
        # the type ID matters, and it's the same for all decimal256 instances.
        ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC"
        _BIGNUMERIC_SUPPORT = True
    else:
        _BIGNUMERIC_SUPPORT = False  # pragma: NO COVER

else:  # pragma: NO COVER
    BQ_TO_ARROW_SCALARS = {}  # pragma: NO COVER
    ARROW_SCALAR_IDS_TO_BQ = {}  # pragma: NO COVER
    _BIGNUMERIC_SUPPORT = False  # pragma: NO COVER


BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = {
"GEOGRAPHY": {
b"ARROW:extension:name": b"google:sqlType:geography",
@@ -480,6 +492,13 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
    # If schema detection was not successful for all columns, also try with
    # pyarrow, if available.
    if unknown_type_fields:
        if not pyarrow:
            msg = "Could not determine the type of columns: {}".format(
                ", ".join(field.name for field in unknown_type_fields)
            )
            warnings.warn(msg)
            return None  # We cannot detect the schema in full.

        # The augment_schema() helper itself will also issue unknown type
        # warnings if detection still fails for any of the fields.
        bq_schema_out = augment_schema(dataframe, bq_schema_out)
@@ -654,6 +673,8 @@ def dataframe_to_parquet(
This argument is ignored for ``pyarrow`` versions earlier than ``4.0.0``.
"""
pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)

import pyarrow.parquet # type: ignore

kwargs = (
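With the mapping dictionaries left empty when pyarrow is unavailable, lookups degrade to None instead of failing at import time. A hedged sketch of that lookup pattern follows; the helper name is illustrative, not the module's actual function.

from google.cloud.bigquery._pandas_helpers import BQ_TO_ARROW_SCALARS


def bq_scalar_to_arrow_type(bq_type: str):
    # Map a BigQuery scalar type name to a pyarrow DataType, or return None
    # when pyarrow is not installed or the type has no known mapping.
    factory = BQ_TO_ARROW_SCALARS.get(bq_type.upper())
    return factory() if factory is not None else None


# Example: returns pyarrow.int64() when pyarrow is installed, None otherwise.
arrow_type = bq_scalar_to_arrow_type("INTEGER")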
