From cc3394f80934419eb00c2029bb81c92a696e7d88 Mon Sep 17 00:00:00 2001 From: Peter Lamut Date: Tue, 23 Feb 2021 18:33:11 +0100 Subject: [PATCH] feat: add BIGNUMERIC support (#527) * feat: add support of BIGNUMERIC * feat: add BIGNUMERIC support * Add bignumeric_type extra * Add additional BIGNUMERIC tests * Prevent import time error if no BIGNUMERIC support * Add/improve a few comments * Add feature flag for BIGNUMERIC support Co-authored-by: HemangChothani --- google/cloud/bigquery/_pandas_helpers.py | 18 +- google/cloud/bigquery/dbapi/_helpers.py | 12 +- google/cloud/bigquery/dbapi/types.py | 2 +- google/cloud/bigquery/query.py | 8 +- google/cloud/bigquery/schema.py | 1 + setup.py | 2 + tests/system/test_client.py | 162 ++++++++++------ tests/unit/test__pandas_helpers.py | 228 +++++++++++++++-------- tests/unit/test_dbapi__helpers.py | 14 ++ tests/unit/test_query.py | 10 + 10 files changed, 305 insertions(+), 152 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 162c58b4b..7ad416e08 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -20,6 +20,7 @@ import queue import warnings +from packaging import version try: import pandas @@ -80,6 +81,10 @@ def pyarrow_numeric(): return pyarrow.decimal128(38, 9) +def pyarrow_bignumeric(): + return pyarrow.decimal256(76, 38) + + def pyarrow_time(): return pyarrow.time64("us") @@ -128,14 +133,23 @@ def pyarrow_timestamp(): pyarrow.date64().id: "DATETIME", # because millisecond resolution pyarrow.binary().id: "BYTES", pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() + # The exact scale and precision don't matter, see below. pyarrow.decimal128(38, scale=9).id: "NUMERIC", - # The exact decimal's scale and precision are not important, as only - # the type ID matters, and it's the same for all decimal128 instances. 
} + if version.parse(pyarrow.__version__) >= version.parse("3.0.0"): + BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric + # The exact decimal's scale and precision are not important, as only + # the type ID matters, and it's the same for all decimal256 instances. + ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC" + _BIGNUMERIC_SUPPORT = True + else: + _BIGNUMERIC_SUPPORT = False + else: # pragma: NO COVER BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER + _BIGNUMERIC_SUPPORT = False # pragma: NO COVER def bq_to_arrow_struct_data_type(field): diff --git a/google/cloud/bigquery/dbapi/_helpers.py b/google/cloud/bigquery/dbapi/_helpers.py index 95b5869e5..6b36d6e43 100644 --- a/google/cloud/bigquery/dbapi/_helpers.py +++ b/google/cloud/bigquery/dbapi/_helpers.py @@ -19,6 +19,11 @@ import functools import numbers +try: + import pyarrow +except ImportError: # pragma: NO COVER + pyarrow = None + from google.cloud import bigquery from google.cloud.bigquery import table from google.cloud.bigquery.dbapi import exceptions @@ -184,7 +189,12 @@ def bigquery_scalar_type(value): elif isinstance(value, numbers.Real): return "FLOAT64" elif isinstance(value, decimal.Decimal): - return "NUMERIC" + # We check for NUMERIC before BIGNUMERIC in order to support pyarrow < 3.0. 
+ scalar_object = pyarrow.scalar(value) + if isinstance(scalar_object, pyarrow.Decimal128Scalar): + return "NUMERIC" + else: + return "BIGNUMERIC" elif isinstance(value, str): return "STRING" elif isinstance(value, bytes): diff --git a/google/cloud/bigquery/dbapi/types.py b/google/cloud/bigquery/dbapi/types.py index 14917820c..20eca9b00 100644 --- a/google/cloud/bigquery/dbapi/types.py +++ b/google/cloud/bigquery/dbapi/types.py @@ -78,7 +78,7 @@ def __eq__(self, other): STRING = "STRING" BINARY = _DBAPITypeObject("BYTES", "RECORD", "STRUCT") NUMBER = _DBAPITypeObject( - "INTEGER", "INT64", "FLOAT", "FLOAT64", "NUMERIC", "BOOLEAN", "BOOL" + "INTEGER", "INT64", "FLOAT", "FLOAT64", "NUMERIC", "BIGNUMERIC", "BOOLEAN", "BOOL" ) DATETIME = _DBAPITypeObject("TIMESTAMP", "DATE", "TIME", "DATETIME") ROWID = "ROWID" diff --git a/google/cloud/bigquery/query.py b/google/cloud/bigquery/query.py index f2ed6337e..ecec73e99 100644 --- a/google/cloud/bigquery/query.py +++ b/google/cloud/bigquery/query.py @@ -83,7 +83,7 @@ class ScalarQueryParameter(_AbstractQueryParameter): type_ (str): Name of parameter type. One of 'STRING', 'INT64', - 'FLOAT64', 'NUMERIC', 'BOOL', 'TIMESTAMP', 'DATETIME', or + 'FLOAT64', 'NUMERIC', 'BIGNUMERIC', 'BOOL', 'TIMESTAMP', 'DATETIME', or 'DATE'. value (Union[str, int, float, decimal.Decimal, bool, datetime.datetime, datetime.date]): @@ -102,7 +102,7 @@ def positional(cls, type_, value): Args: type_ (str): Name of parameter type. One of 'STRING', 'INT64', - 'FLOAT64', 'NUMERIC', 'BOOL', 'TIMESTAMP', 'DATETIME', or + 'FLOAT64', 'NUMERIC', 'BIGNUMERIC', 'BOOL', 'TIMESTAMP', 'DATETIME', or 'DATE'. value (Union[str, int, float, decimal.Decimal, bool, datetime.datetime, datetime.date]): @@ -186,7 +186,7 @@ class ArrayQueryParameter(_AbstractQueryParameter): array_type (str): Name of type of array elements. One of `'STRING'`, `'INT64'`, - `'FLOAT64'`, `'NUMERIC'`, `'BOOL'`, `'TIMESTAMP'`, or `'DATE'`. 
+ `'FLOAT64'`, `'NUMERIC'`, `'BIGNUMERIC'`, `'BOOL'`, `'TIMESTAMP'`, or `'DATE'`. values (List[appropriate scalar type]): The parameter array values. """ @@ -203,7 +203,7 @@ def positional(cls, array_type, values): Args: array_type (str): Name of type of array elements. One of `'STRING'`, `'INT64'`, - `'FLOAT64'`, `'NUMERIC'`, `'BOOL'`, `'TIMESTAMP'`, or `'DATE'`. + `'FLOAT64'`, `'NUMERIC'`, `'BIGNUMERIC'`, `'BOOL'`, `'TIMESTAMP'`, or `'DATE'`. values (List[appropriate scalar type]): The parameter array values. diff --git a/google/cloud/bigquery/schema.py b/google/cloud/bigquery/schema.py index c76aded02..9be27f3e8 100644 --- a/google/cloud/bigquery/schema.py +++ b/google/cloud/bigquery/schema.py @@ -32,6 +32,7 @@ "FLOAT": types.StandardSqlDataType.TypeKind.FLOAT64, "FLOAT64": types.StandardSqlDataType.TypeKind.FLOAT64, "NUMERIC": types.StandardSqlDataType.TypeKind.NUMERIC, + "BIGNUMERIC": types.StandardSqlDataType.TypeKind.BIGNUMERIC, "BOOLEAN": types.StandardSqlDataType.TypeKind.BOOL, "BOOL": types.StandardSqlDataType.TypeKind.BOOL, "GEOGRAPHY": types.StandardSqlDataType.TypeKind.GEOGRAPHY, diff --git a/setup.py b/setup.py index ea2df4843..31b6a3ff7 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ "proto-plus >= 1.10.0", "google-cloud-core >= 1.4.1, < 2.0dev", "google-resumable-media >= 0.6.0, < 2.0dev", + "packaging >= 14.3", "protobuf >= 3.12.0", ] extras = { @@ -48,6 +49,7 @@ "pyarrow >= 1.0.0, < 4.0dev", ], "pandas": ["pandas>=0.23.0", "pyarrow >= 1.0.0, < 4.0dev",], + "bignumeric_type": ["pyarrow >= 3.0.0, < 4.0dev"], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], "opentelemetry": [ "opentelemetry-api==0.11b0", diff --git a/tests/system/test_client.py b/tests/system/test_client.py index 60c3b3fa8..684a42c30 100644 --- a/tests/system/test_client.py +++ b/tests/system/test_client.py @@ -65,6 +65,7 @@ from google.api_core.iam import Policy from google.cloud import bigquery from google.cloud import bigquery_v2 +from google.cloud.bigquery._pandas_helpers import 
_BIGNUMERIC_SUPPORT from google.cloud.bigquery.dataset import Dataset from google.cloud.bigquery.dataset import DatasetReference from google.cloud.bigquery.table import Table @@ -891,6 +892,9 @@ def test_load_table_from_dataframe_w_nulls(self): bigquery.SchemaField("time_col", "TIME"), bigquery.SchemaField("ts_col", "TIMESTAMP"), ) + if _BIGNUMERIC_SUPPORT: + scalars_schema += (bigquery.SchemaField("bignum_col", "BIGNUMERIC"),) + table_schema = scalars_schema + ( # TODO: Array columns can't be read due to NULLABLE versus REPEATED # mode mismatch. See: @@ -902,21 +906,22 @@ def test_load_table_from_dataframe_w_nulls(self): ) num_rows = 100 nulls = [None] * num_rows - df_data = collections.OrderedDict( - [ - ("bool_col", nulls), - ("bytes_col", nulls), - ("date_col", nulls), - ("dt_col", nulls), - ("float_col", nulls), - ("geo_col", nulls), - ("int_col", nulls), - ("num_col", nulls), - ("str_col", nulls), - ("time_col", nulls), - ("ts_col", nulls), - ] - ) + df_data = [ + ("bool_col", nulls), + ("bytes_col", nulls), + ("date_col", nulls), + ("dt_col", nulls), + ("float_col", nulls), + ("geo_col", nulls), + ("int_col", nulls), + ("num_col", nulls), + ("str_col", nulls), + ("time_col", nulls), + ("ts_col", nulls), + ] + if _BIGNUMERIC_SUPPORT: + df_data.append(("bignum_col", nulls)) + df_data = collections.OrderedDict(df_data) dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) dataset_id = _make_dataset_id("bq_load_test") @@ -1003,6 +1008,9 @@ def test_load_table_from_dataframe_w_explicit_schema(self): bigquery.SchemaField("time_col", "TIME"), bigquery.SchemaField("ts_col", "TIMESTAMP"), ) + if _BIGNUMERIC_SUPPORT: + scalars_schema += (bigquery.SchemaField("bignum_col", "BIGNUMERIC"),) + table_schema = scalars_schema + ( # TODO: Array columns can't be read due to NULLABLE versus REPEATED # mode mismatch. 
See: @@ -1012,57 +1020,65 @@ def test_load_table_from_dataframe_w_explicit_schema(self): # https://jira.apache.org/jira/browse/ARROW-2587 # bigquery.SchemaField("struct_col", "RECORD", fields=scalars_schema), ) - df_data = collections.OrderedDict( - [ - ("bool_col", [True, None, False]), - ("bytes_col", [b"abc", None, b"def"]), - ( - "date_col", - [datetime.date(1, 1, 1), None, datetime.date(9999, 12, 31)], - ), - # ( - # "dt_col", - # [ - # datetime.datetime(1, 1, 1, 0, 0, 0), - # None, - # datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), - # ], - # ), - ("float_col", [float("-inf"), float("nan"), float("inf")]), - ( - "geo_col", - [ - "POINT(30 10)", - None, - "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))", - ], - ), - ("int_col", [-9223372036854775808, None, 9223372036854775807]), - ( - "num_col", - [ - decimal.Decimal("-99999999999999999999999999999.999999999"), - None, - decimal.Decimal("99999999999999999999999999999.999999999"), - ], - ), - ("str_col", [u"abc", None, u"def"]), - ( - "time_col", - [datetime.time(0, 0, 0), None, datetime.time(23, 59, 59, 999999)], - ), + + df_data = [ + ("bool_col", [True, None, False]), + ("bytes_col", [b"abc", None, b"def"]), + ("date_col", [datetime.date(1, 1, 1), None, datetime.date(9999, 12, 31)]), + # ( + # "dt_col", + # [ + # datetime.datetime(1, 1, 1, 0, 0, 0), + # None, + # datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), + # ], + # ), + ("float_col", [float("-inf"), float("nan"), float("inf")]), + ( + "geo_col", + [ + "POINT(30 10)", + None, + "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))", + ], + ), + ("int_col", [-9223372036854775808, None, 9223372036854775807]), + ( + "num_col", + [ + decimal.Decimal("-99999999999999999999999999999.999999999"), + None, + decimal.Decimal("99999999999999999999999999999.999999999"), + ], + ), + ("str_col", [u"abc", None, u"def"]), + ( + "time_col", + [datetime.time(0, 0, 0), None, datetime.time(23, 59, 59, 999999)], + ), + ( + "ts_col", + [ + datetime.datetime(1, 1, 1, 0, 0, 
0, tzinfo=pytz.utc), + None, + datetime.datetime( + 9999, 12, 31, 23, 59, 59, 999999, tzinfo=pytz.utc + ), + ], + ), + ] + if _BIGNUMERIC_SUPPORT: + df_data.append( ( - "ts_col", + "bignum_col", [ - datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=pytz.utc), + decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), None, - datetime.datetime( - 9999, 12, 31, 23, 59, 59, 999999, tzinfo=pytz.utc - ), + decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), ], - ), - ] - ) + ) + ) + df_data = collections.OrderedDict(df_data) dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) dataset_id = _make_dataset_id("bq_load_test") @@ -1172,6 +1188,7 @@ def test_load_table_from_dataframe_w_explicit_schema_source_format_csv(self): bigquery.SchemaField("geo_col", "GEOGRAPHY"), bigquery.SchemaField("int_col", "INTEGER"), bigquery.SchemaField("num_col", "NUMERIC"), + bigquery.SchemaField("bignum_col", "BIGNUMERIC"), bigquery.SchemaField("str_col", "STRING"), bigquery.SchemaField("time_col", "TIME"), bigquery.SchemaField("ts_col", "TIMESTAMP"), @@ -1210,6 +1227,14 @@ def test_load_table_from_dataframe_w_explicit_schema_source_format_csv(self): decimal.Decimal("99999999999999999999999999999.999999999"), ], ), + ( + "bignum_col", + [ + decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), + None, + decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), + ], + ), ("str_col", [u"abc", None, u"def"]), ( "time_col", @@ -2157,6 +2182,10 @@ def test_query_w_query_params(self): pi_numeric_param = ScalarQueryParameter( name="pi_numeric_param", type_="NUMERIC", value=pi_numeric ) + bignum = decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)) + bignum_param = ScalarQueryParameter( + name="bignum_param", type_="BIGNUMERIC", value=bignum + ) truthy = True truthy_param = ScalarQueryParameter(name="truthy", type_="BOOL", value=truthy) beef = b"DEADBEEF" @@ -2302,6 +2331,15 @@ def test_query_w_query_params(self): "query_parameters": [with_friends_param], }, ] + if _BIGNUMERIC_SUPPORT: + 
examples.append( + { + "sql": "SELECT @bignum_param", + "expected": bignum, + "query_parameters": [bignum_param], + } + ) + for example in examples: jconfig = QueryJobConfig() jconfig.query_parameters = example["query_parameters"] diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index ef0c40e1a..abd725820 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -39,6 +39,12 @@ from google import api_core from google.cloud.bigquery import schema +from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT + + +skip_if_no_bignumeric = pytest.mark.skipif( + not _BIGNUMERIC_SUPPORT, reason="BIGNUMERIC support requires pyarrow>=3.0.0", +) @pytest.fixture @@ -70,6 +76,15 @@ def is_numeric(type_): )(type_) +def is_bignumeric(type_): + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric-type + return all_( + pyarrow.types.is_decimal, + lambda type_: type_.precision == 76, + lambda type_: type_.scale == 38, + )(type_) + + def is_timestamp(type_): # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#timestamp-type return all_( @@ -120,6 +135,9 @@ def test_all_(): ("FLOAT", "NULLABLE", pyarrow.types.is_float64), ("FLOAT64", "NULLABLE", pyarrow.types.is_float64), ("NUMERIC", "NULLABLE", is_numeric), + pytest.param( + "BIGNUMERIC", "NULLABLE", is_bignumeric, marks=skip_if_no_bignumeric, + ), ("BOOLEAN", "NULLABLE", pyarrow.types.is_boolean), ("BOOL", "NULLABLE", pyarrow.types.is_boolean), ("TIMESTAMP", "NULLABLE", is_timestamp), @@ -198,6 +216,12 @@ def test_all_(): "REPEATED", all_(pyarrow.types.is_list, lambda type_: is_numeric(type_.value_type)), ), + pytest.param( + "BIGNUMERIC", + "REPEATED", + all_(pyarrow.types.is_list, lambda type_: is_bignumeric(type_.value_type)), + marks=skip_if_no_bignumeric, + ), ( "BOOLEAN", "REPEATED", @@ -270,34 +294,41 @@ def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type): 
schema.SchemaField("field05", "FLOAT"), schema.SchemaField("field06", "FLOAT64"), schema.SchemaField("field07", "NUMERIC"), - schema.SchemaField("field08", "BOOLEAN"), - schema.SchemaField("field09", "BOOL"), - schema.SchemaField("field10", "TIMESTAMP"), - schema.SchemaField("field11", "DATE"), - schema.SchemaField("field12", "TIME"), - schema.SchemaField("field13", "DATETIME"), - schema.SchemaField("field14", "GEOGRAPHY"), + schema.SchemaField("field09", "BOOLEAN"), + schema.SchemaField("field10", "BOOL"), + schema.SchemaField("field11", "TIMESTAMP"), + schema.SchemaField("field12", "DATE"), + schema.SchemaField("field13", "TIME"), + schema.SchemaField("field14", "DATETIME"), + schema.SchemaField("field15", "GEOGRAPHY"), ) + + if _BIGNUMERIC_SUPPORT: + fields += (schema.SchemaField("field08", "BIGNUMERIC"),) + field = schema.SchemaField("ignored_name", bq_type, mode="NULLABLE", fields=fields) actual = module_under_test.bq_to_arrow_data_type(field) - expected = pyarrow.struct( - ( - pyarrow.field("field01", pyarrow.string()), - pyarrow.field("field02", pyarrow.binary()), - pyarrow.field("field03", pyarrow.int64()), - pyarrow.field("field04", pyarrow.int64()), - pyarrow.field("field05", pyarrow.float64()), - pyarrow.field("field06", pyarrow.float64()), - pyarrow.field("field07", module_under_test.pyarrow_numeric()), - pyarrow.field("field08", pyarrow.bool_()), - pyarrow.field("field09", pyarrow.bool_()), - pyarrow.field("field10", module_under_test.pyarrow_timestamp()), - pyarrow.field("field11", pyarrow.date32()), - pyarrow.field("field12", module_under_test.pyarrow_time()), - pyarrow.field("field13", module_under_test.pyarrow_datetime()), - pyarrow.field("field14", pyarrow.string()), - ) + + expected = ( + pyarrow.field("field01", pyarrow.string()), + pyarrow.field("field02", pyarrow.binary()), + pyarrow.field("field03", pyarrow.int64()), + pyarrow.field("field04", pyarrow.int64()), + pyarrow.field("field05", pyarrow.float64()), + pyarrow.field("field06", 
pyarrow.float64()), + pyarrow.field("field07", module_under_test.pyarrow_numeric()), + pyarrow.field("field09", pyarrow.bool_()), + pyarrow.field("field10", pyarrow.bool_()), + pyarrow.field("field11", module_under_test.pyarrow_timestamp()), + pyarrow.field("field12", pyarrow.date32()), + pyarrow.field("field13", module_under_test.pyarrow_time()), + pyarrow.field("field14", module_under_test.pyarrow_datetime()), + pyarrow.field("field15", pyarrow.string()), ) + if _BIGNUMERIC_SUPPORT: + expected += (pyarrow.field("field08", module_under_test.pyarrow_bignumeric()),) + expected = pyarrow.struct(expected) + assert pyarrow.types.is_struct(actual) assert actual.num_fields == len(fields) assert actual.equals(expected) @@ -314,34 +345,41 @@ def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): schema.SchemaField("field05", "FLOAT"), schema.SchemaField("field06", "FLOAT64"), schema.SchemaField("field07", "NUMERIC"), - schema.SchemaField("field08", "BOOLEAN"), - schema.SchemaField("field09", "BOOL"), - schema.SchemaField("field10", "TIMESTAMP"), - schema.SchemaField("field11", "DATE"), - schema.SchemaField("field12", "TIME"), - schema.SchemaField("field13", "DATETIME"), - schema.SchemaField("field14", "GEOGRAPHY"), + schema.SchemaField("field09", "BOOLEAN"), + schema.SchemaField("field10", "BOOL"), + schema.SchemaField("field11", "TIMESTAMP"), + schema.SchemaField("field12", "DATE"), + schema.SchemaField("field13", "TIME"), + schema.SchemaField("field14", "DATETIME"), + schema.SchemaField("field15", "GEOGRAPHY"), ) + + if _BIGNUMERIC_SUPPORT: + fields += (schema.SchemaField("field08", "BIGNUMERIC"),) + field = schema.SchemaField("ignored_name", bq_type, mode="REPEATED", fields=fields) actual = module_under_test.bq_to_arrow_data_type(field) - expected_value_type = pyarrow.struct( - ( - pyarrow.field("field01", pyarrow.string()), - pyarrow.field("field02", pyarrow.binary()), - pyarrow.field("field03", pyarrow.int64()), - pyarrow.field("field04", 
pyarrow.int64()), - pyarrow.field("field05", pyarrow.float64()), - pyarrow.field("field06", pyarrow.float64()), - pyarrow.field("field07", module_under_test.pyarrow_numeric()), - pyarrow.field("field08", pyarrow.bool_()), - pyarrow.field("field09", pyarrow.bool_()), - pyarrow.field("field10", module_under_test.pyarrow_timestamp()), - pyarrow.field("field11", pyarrow.date32()), - pyarrow.field("field12", module_under_test.pyarrow_time()), - pyarrow.field("field13", module_under_test.pyarrow_datetime()), - pyarrow.field("field14", pyarrow.string()), - ) + + expected = ( + pyarrow.field("field01", pyarrow.string()), + pyarrow.field("field02", pyarrow.binary()), + pyarrow.field("field03", pyarrow.int64()), + pyarrow.field("field04", pyarrow.int64()), + pyarrow.field("field05", pyarrow.float64()), + pyarrow.field("field06", pyarrow.float64()), + pyarrow.field("field07", module_under_test.pyarrow_numeric()), + pyarrow.field("field09", pyarrow.bool_()), + pyarrow.field("field10", pyarrow.bool_()), + pyarrow.field("field11", module_under_test.pyarrow_timestamp()), + pyarrow.field("field12", pyarrow.date32()), + pyarrow.field("field13", module_under_test.pyarrow_time()), + pyarrow.field("field14", module_under_test.pyarrow_datetime()), + pyarrow.field("field15", pyarrow.string()), ) + if _BIGNUMERIC_SUPPORT: + expected += (pyarrow.field("field08", module_under_test.pyarrow_bignumeric()),) + expected_value_type = pyarrow.struct(expected) + assert pyarrow.types.is_list(actual) assert pyarrow.types.is_struct(actual.value_type) assert actual.value_type.num_fields == len(fields) @@ -385,6 +423,16 @@ def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test): decimal.Decimal("999.123456789"), ], ), + pytest.param( + "BIGNUMERIC", + [ + decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), + None, + decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), + decimal.Decimal("3.141592653589793238462643383279"), + ], + marks=skip_if_no_bignumeric, + ), ("BOOLEAN", [True, 
None, False, None]), ("BOOL", [False, None, True, None]), ( @@ -841,41 +889,45 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test): schema.SchemaField("field05", "FLOAT", mode="REQUIRED"), schema.SchemaField("field06", "FLOAT64", mode="REQUIRED"), schema.SchemaField("field07", "NUMERIC", mode="REQUIRED"), - schema.SchemaField("field08", "BOOLEAN", mode="REQUIRED"), - schema.SchemaField("field09", "BOOL", mode="REQUIRED"), - schema.SchemaField("field10", "TIMESTAMP", mode="REQUIRED"), - schema.SchemaField("field11", "DATE", mode="REQUIRED"), - schema.SchemaField("field12", "TIME", mode="REQUIRED"), - schema.SchemaField("field13", "DATETIME", mode="REQUIRED"), - schema.SchemaField("field14", "GEOGRAPHY", mode="REQUIRED"), - ) - dataframe = pandas.DataFrame( - { - "field01": ["hello", "world"], - "field02": [b"abd", b"efg"], - "field03": [1, 2], - "field04": [3, 4], - "field05": [1.25, 9.75], - "field06": [-1.75, -3.5], - "field07": [decimal.Decimal("1.2345"), decimal.Decimal("6.7891")], - "field08": [True, False], - "field09": [False, True], - "field10": [ - datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.utc), - datetime.datetime(2012, 12, 21, 9, 7, 42, tzinfo=pytz.utc), - ], - "field11": [datetime.date(9999, 12, 31), datetime.date(1970, 1, 1)], - "field12": [datetime.time(23, 59, 59, 999999), datetime.time(12, 0, 0)], - "field13": [ - datetime.datetime(1970, 1, 1, 0, 0, 0), - datetime.datetime(2012, 12, 21, 9, 7, 42), - ], - "field14": [ - "POINT(30 10)", - "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))", - ], - } + schema.SchemaField("field09", "BOOLEAN", mode="REQUIRED"), + schema.SchemaField("field10", "BOOL", mode="REQUIRED"), + schema.SchemaField("field11", "TIMESTAMP", mode="REQUIRED"), + schema.SchemaField("field12", "DATE", mode="REQUIRED"), + schema.SchemaField("field13", "TIME", mode="REQUIRED"), + schema.SchemaField("field14", "DATETIME", mode="REQUIRED"), + schema.SchemaField("field15", "GEOGRAPHY", mode="REQUIRED"), ) + if 
_BIGNUMERIC_SUPPORT: + bq_schema += (schema.SchemaField("field08", "BIGNUMERIC", mode="REQUIRED"),) + + data = { + "field01": ["hello", "world"], + "field02": [b"abd", b"efg"], + "field03": [1, 2], + "field04": [3, 4], + "field05": [1.25, 9.75], + "field06": [-1.75, -3.5], + "field07": [decimal.Decimal("1.2345"), decimal.Decimal("6.7891")], + "field09": [True, False], + "field10": [False, True], + "field11": [ + datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.utc), + datetime.datetime(2012, 12, 21, 9, 7, 42, tzinfo=pytz.utc), + ], + "field12": [datetime.date(9999, 12, 31), datetime.date(1970, 1, 1)], + "field13": [datetime.time(23, 59, 59, 999999), datetime.time(12, 0, 0)], + "field14": [ + datetime.datetime(1970, 1, 1, 0, 0, 0), + datetime.datetime(2012, 12, 21, 9, 7, 42), + ], + "field15": ["POINT(30 10)", "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"], + } + if _BIGNUMERIC_SUPPORT: + data["field08"] = [ + decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), + decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), + ] + dataframe = pandas.DataFrame(data) arrow_table = module_under_test.dataframe_to_arrow(dataframe, bq_schema) arrow_schema = arrow_table.schema @@ -1089,6 +1141,7 @@ def test_augment_schema_type_detection_succeeds(module_under_test): "bytes_field": b"some bytes", "string_field": u"some characters", "numeric_field": decimal.Decimal("123.456"), + "bignumeric_field": decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), } ] ) @@ -1109,6 +1162,10 @@ def test_augment_schema_type_detection_succeeds(module_under_test): schema.SchemaField("string_field", field_type=None, mode="NULLABLE"), schema.SchemaField("numeric_field", field_type=None, mode="NULLABLE"), ) + if _BIGNUMERIC_SUPPORT: + current_schema += ( + schema.SchemaField("bignumeric_field", field_type=None, mode="NULLABLE"), + ) with warnings.catch_warnings(record=True) as warned: augmented_schema = module_under_test.augment_schema(dataframe, current_schema) @@ -1131,6 +1188,13 @@ def 
test_augment_schema_type_detection_succeeds(module_under_test): schema.SchemaField("string_field", field_type="STRING", mode="NULLABLE"), schema.SchemaField("numeric_field", field_type="NUMERIC", mode="NULLABLE"), ) + if _BIGNUMERIC_SUPPORT: + expected_schema += ( + schema.SchemaField( + "bignumeric_field", field_type="BIGNUMERIC", mode="NULLABLE" + ), + ) + by_name = operator.attrgetter("name") assert sorted(augmented_schema, key=by_name) == sorted(expected_schema, key=by_name) diff --git a/tests/unit/test_dbapi__helpers.py b/tests/unit/test_dbapi__helpers.py index fffa46aa8..c28c014d4 100644 --- a/tests/unit/test_dbapi__helpers.py +++ b/tests/unit/test_dbapi__helpers.py @@ -25,6 +25,7 @@ import google.cloud._helpers from google.cloud.bigquery import table +from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT from google.cloud.bigquery.dbapi import _helpers from google.cloud.bigquery.dbapi import exceptions from tests.unit.helpers import _to_pyarrow @@ -51,6 +52,14 @@ def test_scalar_to_query_parameter(self): "TIMESTAMP", ), ] + if _BIGNUMERIC_SUPPORT: + expected_types.append( + ( + decimal.Decimal("1.1234567890123456789012345678901234567890"), + "BIGNUMERIC", + ) + ) + for value, expected_type in expected_types: msg = "value: {} expected_type: {}".format(value, expected_type) parameter = _helpers.scalar_to_query_parameter(value) @@ -104,6 +113,11 @@ def test_array_to_query_parameter_valid_argument(self): ), ] + if _BIGNUMERIC_SUPPORT: + expected_types.append( + ([decimal.Decimal("{d38}.{d38}".format(d38="9" * 38))], "BIGNUMERIC") + ) + for values, expected_type in expected_types: msg = "value: {} expected_type: {}".format(values, expected_type) parameter = _helpers.array_to_query_parameter(values) diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index cf268daf1..ae2c29d09 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -166,6 +166,16 @@ def test_to_api_repr_w_numeric(self): param = 
klass.positional(type_="NUMERIC", value="123456789.123456789") self.assertEqual(param.to_api_repr(), EXPECTED) + def test_to_api_repr_w_bignumeric(self): + big_num_string = "{d38}.{d38}".format(d38="9" * 38) + EXPECTED = { + "parameterType": {"type": "BIGNUMERIC"}, + "parameterValue": {"value": big_num_string}, + } + klass = self._get_target_class() + param = klass.positional(type_="BIGNUMERIC", value=big_num_string) + self.assertEqual(param.to_api_repr(), EXPECTED) + def test_to_api_repr_w_bool(self): EXPECTED = { "parameterType": {"type": "BOOL"},