Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Support passing struct data to the DB API #718

Merged
merged 26 commits into from Jul 1, 2021
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
586a022
Added support for dbi struct parameters with explicit types
jimfulton Jun 22, 2021
13f9107
Make the dataset_id fixture a session fixture
jimfulton Jun 23, 2021
aea7bae
Make the dataset_id fixture a session fixture
jimfulton Jun 23, 2021
6f26130
system test of the struct machinery
jimfulton Jun 23, 2021
c386448
Verify that we can bind non-parameterized types
jimfulton Jun 23, 2021
71c8614
Parse and remove type parameters from explicit types.
jimfulton Jun 23, 2021
6f5b345
Document passing struct data.
jimfulton Jun 23, 2021
e08f6f6
Merge remote-tracking branch 'origin/master' into riversnake-dbi-stru…
jimfulton Jun 23, 2021
411c336
blacken
jimfulton Jun 23, 2021
ef2b323
using match.groups() throws off pytypes, also fix some type hints.
jimfulton Jun 23, 2021
654b108
🦉 Updates from OwlBot
gcf-owl-bot[bot] Jun 23, 2021
525b8fd
blacken
jimfulton Jun 23, 2021
462f2eb
remove type hints -- maybe they broke docs?
jimfulton Jun 23, 2021
904d2ce
merge upstream
jimfulton Jun 23, 2021
a6393e6
Revert "remove type hints -- maybe they broke docs?"
jimfulton Jun 23, 2021
e63e8b7
pin gcp-sphinx-docfx-yaml==0.2.0 so docfx doesn't fail.
jimfulton Jun 23, 2021
2f2bdcd
Merge remote-tracking branch 'origin/master' into riversnake-dbi-stru…
jimfulton Jun 24, 2021
91b0028
Review comments: examples, and guard against large number of fields
jimfulton Jun 24, 2021
35555aa
🦉 Updates from OwlBot
gcf-owl-bot[bot] Jun 24, 2021
0d81b80
Merge remote-tracking branch 'origin/master' into riversnake-dbi-stru…
jimfulton Jun 24, 2021
d3f959c
Merge branch 'riversnake-dbi-struct-types' of github.com:googleapis/p…
jimfulton Jun 24, 2021
5554301
Factored some repeated code in handling complex parameters
jimfulton Jun 25, 2021
8830113
Improved the error for dict (structish) parameter values without expl…
jimfulton Jun 25, 2021
a72cafb
blacken
jimfulton Jun 25, 2021
cba9697
removed repeated word
jimfulton Jun 25, 2021
12bd941
Update google/cloud/bigquery/dbapi/_helpers.py
jimfulton Jun 29, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 8 additions & 1 deletion docs/dbapi.rst
Expand Up @@ -37,7 +37,14 @@ colon, as in::

insert into people (name, income) values (%(name:string)s, %(income:numeric)s)

For unnamed parameters, use the named syntax with a type, but now
For unnamed parameters, use the named syntax with a type, but no
plamut marked this conversation as resolved.
Show resolved Hide resolved
name, as in::

insert into people (name, income) values (%(:string)s, %(:numeric)s)

Providing type information is the *only* way to pass `struct` data::

cursor.execute(
"insert into points (point) values (%(:struct<x float64, y float64>)s)",
[{"x": 10, "y": 20}],
)
213 changes: 201 additions & 12 deletions google/cloud/bigquery/dbapi/_helpers.py
Expand Up @@ -18,18 +18,34 @@
import decimal
import functools
import numbers
import re
import typing

from google.cloud import bigquery
from google.cloud.bigquery import table, enums
from google.cloud.bigquery import table, enums, query
from google.cloud.bigquery.dbapi import exceptions


_NUMERIC_SERVER_MIN = decimal.Decimal("-9.9999999999999999999999999999999999999E+28")
_NUMERIC_SERVER_MAX = decimal.Decimal("9.9999999999999999999999999999999999999E+28")

type_parameters_re = re.compile(
r"""
\(
\s*[0-9]+\s*
(,
\s*[0-9]+\s*
)*
\)
""",
re.VERBOSE,
)


def _parameter_type(name, value, query_parameter_type=None, value_doc=""):
if query_parameter_type:
# Strip type parameters
query_parameter_type = type_parameters_re.sub("", query_parameter_type)
try:
parameter_type = getattr(
enums.SqlParameterScalarTypes, query_parameter_type.upper()
Expand Down Expand Up @@ -113,6 +129,176 @@ def array_to_query_parameter(value, name=None, query_parameter_type=None):
return bigquery.ArrayQueryParameter(name, array_type, value)


complex_query_parameter_parse = re.compile(
r"""
\s*
(ARRAY|STRUCT|RECORD) # Type
\s*
<([A-Z0-9<> ,()]+)> # Subtype(s)
\s*$
""",
re.IGNORECASE | re.VERBOSE,
).match
parse_struct_field = re.compile(
r"""
(?:(\w+)\s+) # field name
([A-Z0-9<> ,()]+) # Field type
$""",
re.VERBOSE | re.IGNORECASE,
).match


def _split_struct_fields(fields):
# Split a string of struct fields. They're defined by commas, but
# we have to avoid splitting on commas interbal to fields. For
jimfulton marked this conversation as resolved.
Show resolved Hide resolved
# example:
# name string, children array<struct<name string, bdate date>>
#
# only has 2 top-level fields.
fields = fields.split(",")
fields = list(reversed(fields)) # in the off chance that there are very many
while fields:
field = fields.pop()
while fields and field.count("<") != field.count(">"):
field += "," + fields.pop()
yield field


def complex_query_parameter_type(name: typing.Optional[str], type_: str, base: str):
    """Construct a parameter type (`StructQueryParameterType`) for a complex type

    or a non-complex type that's part of a complex type.

    Examples:

        array<struct<x float64, y float64>>

        struct<name string, children array<struct<name string, bdate date>>>

    This is used for computing array types.
    """
    type_ = type_.strip()

    if "<" not in type_:
        # Scalar leaf: drop any size/precision parameters (e.g.
        # "string(10)" -> "string") before looking the name up among the
        # known scalar parameter types.
        bare = type_parameters_re.sub("", type_).strip()
        try:
            scalar = getattr(enums.SqlParameterScalarTypes, bare.upper())
        except AttributeError:
            raise exceptions.ProgrammingError(
                f"Invalid scalar type, {bare}, in {base}"
            )
        return scalar.with_name(name) if name else scalar

    match = complex_query_parameter_parse(type_)
    if not match:
        raise exceptions.ProgrammingError(f"Invalid parameter type, {type_}")
    kind = match.group(1).upper()
    inner = match.group(2).strip()

    if kind == "ARRAY":
        return query.ArrayQueryParameterType(
            complex_query_parameter_type(None, inner, base), name=name
        )

    # STRUCT (or RECORD): recursively build a sub-type for each
    # top-level field.
    field_types = []
    for raw_field in _split_struct_fields(inner):
        raw_field = raw_field.strip()
        field_match = parse_struct_field(raw_field)
        if not field_match:
            raise exceptions.ProgrammingError(
                f"Invalid struct field, {raw_field}, in {base}"
            )
        field_name, field_type = field_match.groups()
        field_types.append(complex_query_parameter_type(field_name, field_type, base))
    return query.StructQueryParameterType(*field_types, name=name)


def complex_query_parameter(
    name: typing.Optional[str], value, type_: str, base: typing.Optional[str] = None
):
    """
    Construct a query parameter for a complex type (array or struct record)

    or for a subtype, which may not be complex

    Examples:

        array<struct<x float64, y float64>>

        struct<name string, children array<struct<name string, bdate date>>>

    Args:
        name: parameter name, or None for unnamed / nested parameters.
        value: the Python value to bind -- a sequence for arrays, a mapping
            for structs, a scalar otherwise.
        type_: the BigQuery type declaration string for this (sub)value.
        base: the outermost type declaration, used in error messages so the
            user sees the full type they wrote; defaults to ``type_`` at the
            top of the recursion.

    Raises:
        exceptions.ProgrammingError: if the type string is invalid or the
            value doesn't match the declared type.
    """
    type_ = type_.strip()
    base = base or type_
    if ">" not in type_:
        # Scalar
        # NOTE(review): scalar detection here tests ">" where
        # complex_query_parameter_type tests "<"; equivalent for
        # well-formed type strings.

        # Strip type parameters (e.g. "string(10)" -> "string")
        type_ = type_parameters_re.sub("", type_).strip()
        try:
            type_ = getattr(enums.SqlParameterScalarTypes, type_.upper())._type
        except AttributeError:
            raise exceptions.ProgrammingError(
                f"The given parameter type, {type_},"
                f" for {name} is not a valid BigQuery scalar type, in {base}."
            )

        return query.ScalarQueryParameter(name, type_, value)

    # Complex: ARRAY<...> or STRUCT/RECORD<...>.
    m = complex_query_parameter_parse(type_)
    if not m:
        raise exceptions.ProgrammingError(f"Invalid parameter type, {type_}")
    tname, sub = m.group(1, 2)
    tname = tname.upper()
    sub = sub.strip()
    if tname == "ARRAY":
        if not array_like(value):
            raise exceptions.ProgrammingError(
                f"Array type with non-array-like value"
                f" with type {type(value).__name__}"
            )
        array_type = complex_query_parameter_type(name, sub, base)
        # Directly nested arrays are rejected.
        if isinstance(array_type, query.ArrayQueryParameterType):
            raise exceptions.ProgrammingError(f"Array can't contain an array in {base}")
        return query.ArrayQueryParameter(
            name,
            array_type,
            # Elements of complex subtypes are converted recursively;
            # scalar elements are passed through unchanged.
            [complex_query_parameter(None, v, sub, base) for v in value]
            if "<" in sub
            else value,
        )
    else:
        fields = []
        if not isinstance(value, collections_abc.Mapping):
            raise exceptions.ProgrammingError(f"Non-mapping value for type {type_}")
        # Track mapping keys so extra (undeclared) keys can be reported.
        value_keys = set(value)
        for field_string in _split_struct_fields(sub):
            field_string = field_string.strip()
            m = parse_struct_field(field_string)
            if not m:
                raise exceptions.ProgrammingError(
                    f"Invalid struct field, {field_string}, in {base or type_}"
                )
            field_name, field_type = m.groups()
            if field_name not in value:
                raise exceptions.ProgrammingError(
                    f"No field value for {field_name} in {type_}"
                )
            value_keys.remove(field_name)
            fields.append(
                complex_query_parameter(field_name, value[field_name], field_type, base)
            )
        if value_keys:
            raise exceptions.ProgrammingError(f"Extra data keys for {type_}")

        return query.StructQueryParameter(name, *fields)


def to_query_parameters_list(parameters, parameter_types):
"""Converts a sequence of parameter values into query parameters.

Expand All @@ -129,7 +315,9 @@ def to_query_parameters_list(parameters, parameter_types):
result = []

for value, type_ in zip(parameters, parameter_types):
if isinstance(value, collections_abc.Mapping):
if type_ is not None and "<" in type_:
param = complex_query_parameter(None, value, type_)
elif isinstance(value, collections_abc.Mapping):
plamut marked this conversation as resolved.
Show resolved Hide resolved
raise NotImplementedError("STRUCT-like parameter values are not supported.")
elif array_like(value):
param = array_to_query_parameter(value, None, type_)
Expand Down Expand Up @@ -157,21 +345,22 @@ def to_query_parameters_dict(parameters, query_parameter_types):
result = []

for name, value in parameters.items():
if isinstance(value, collections_abc.Mapping):
query_parameter_type = query_parameter_types.get(name)
if query_parameter_type is not None and "<" in query_parameter_type:
param = complex_query_parameter(name, value, query_parameter_type)
elif isinstance(value, collections_abc.Mapping):
raise NotImplementedError(
"STRUCT-like parameter values are not supported "
"(parameter {}).".format(name)
)
elif array_like(value):
param = array_to_query_parameter(
value, name=name, query_parameter_type=query_parameter_type
)
else:
query_parameter_type = query_parameter_types.get(name)
if array_like(value):
param = array_to_query_parameter(
value, name=name, query_parameter_type=query_parameter_type
)
else:
param = scalar_to_query_parameter(
value, name=name, query_parameter_type=query_parameter_type,
)
param = scalar_to_query_parameter(
value, name=name, query_parameter_type=query_parameter_type,
)

result.append(param)

Expand Down
28 changes: 27 additions & 1 deletion google/cloud/bigquery/dbapi/cursor.py
Expand Up @@ -483,7 +483,33 @@ def _format_operation(operation, parameters):


def _extract_types(
operation, extra_type_sub=re.compile(r"(%*)%(?:\(([^:)]*)(?::(\w+))?\))?s").sub
operation,
extra_type_sub=re.compile(
r"""
(%*) # Extra %s. We'll deal with these in the replacement code

% # Beginning of replacement, %s, %(...)s

(?:\( # Begin of optional name and/or type
([^:)]*) # name
(?:: # ':' introduces type
( # start of type group
[a-zA-Z0-9<>, ]+ # First part, no parens

(?: # start sets of parens + non-paren text
\([0-9 ,]+\) # comma-separated groups of digits in parens
# (e.g. string(10))
(?=[, >)]) # Must be followed by ,>) or space
[a-zA-Z0-9<>, ]* # Optional non-paren chars
)* # Can be zero or more of parens and following text
) # end of type group
)? # close type clause ":type"
\))? # End of optional name and/or type

s # End of replacement
""",
re.VERBOSE,
plamut marked this conversation as resolved.
Show resolved Hide resolved
).sub,
):
"""Remove type information from parameter placeholders.

Expand Down
7 changes: 6 additions & 1 deletion tests/system/conftest.py
Expand Up @@ -31,9 +31,14 @@ def bqstorage_client(bigquery_client):
return bigquery_storage.BigQueryReadClient(credentials=bigquery_client._credentials)


@pytest.fixture
@pytest.fixture(scope="session")
def dataset_id(bigquery_client):
dataset_id = f"bqsystem_{helpers.temp_suffix()}"
bigquery_client.create_dataset(dataset_id)
yield dataset_id
bigquery_client.delete_dataset(dataset_id, delete_contents=True)


@pytest.fixture
def table_id(dataset_id):
    # A fresh, unique table id per test, qualified by the session dataset.
    return dataset_id + ".table_" + helpers.temp_suffix()
11 changes: 2 additions & 9 deletions tests/system/test_pandas.py
Expand Up @@ -149,17 +149,14 @@ def test_load_table_from_dataframe_w_nullable_int64_datatype(
reason="Only `pandas version >=1.0.0` is supported",
)
def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema(
bigquery_client, dataset_id
bigquery_client, dataset_id, table_id
):
"""Test that a DataFrame containing column with None-type values and int64 datatype
can be uploaded without specifying a schema.

https://github.com/googleapis/python-bigquery/issues/22
"""

table_id = "{}.{}.load_table_from_dataframe_w_nullable_int64_datatype".format(
bigquery_client.project, dataset_id
)
df_data = collections.OrderedDict(
[("x", pandas.Series([1, 2, None, 4], dtype="Int64"))]
)
Expand Down Expand Up @@ -511,7 +508,7 @@ def test_load_table_from_dataframe_w_explicit_schema_source_format_csv(


def test_load_table_from_dataframe_w_explicit_schema_source_format_csv_floats(
bigquery_client, dataset_id
bigquery_client, dataset_id, table_id
):
from google.cloud.bigquery.job import SourceFormat

Expand All @@ -536,10 +533,6 @@ def test_load_table_from_dataframe_w_explicit_schema_source_format_csv_floats(
)
dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys())

table_id = "{}.{}.load_table_from_dataframe_w_explicit_schema_csv".format(
bigquery_client.project, dataset_id
)

job_config = bigquery.LoadJobConfig(
schema=table_schema, source_format=SourceFormat.CSV
)
Expand Down
31 changes: 31 additions & 0 deletions tests/system/test_structs.py
@@ -0,0 +1,31 @@
import datetime

import pytest

from google.cloud.bigquery.dbapi import connect

# A nested struct type (struct containing an array of structs) used by the
# round-trip test below.  The "sized" variant exercises parameterized
# scalar types such as string(22).
person_type = "struct<name string," " children array<struct<name string, bdate date>>>"
person_type_sized = (
    "struct<name string(22)," " children array<struct<name string(22), bdate date>>>"
)


@pytest.mark.parametrize("person_type_decl", [person_type, person_type_sized])
def test_structs(bigquery_client, dataset_id, person_type_decl, table_id):
    # Round-trip a nested struct value through the DB-API: create a table
    # whose column uses the (possibly size-parameterized) struct type,
    # insert via an explicitly typed placeholder, and read the row back.
    connection = connect(bigquery_client)
    cur = connection.cursor()

    cur.execute(f"create table {table_id} (person {person_type_decl})")

    person = {
        "name": "par",
        "children": [
            {"name": "ch1", "bdate": datetime.date(2021, 1, 1)},
            {"name": "ch2", "bdate": datetime.date(2021, 1, 2)},
        ],
    }
    cur.execute(
        f"insert into {table_id} (person) values (%(v:{person_type})s)",
        dict(v=person),
    )

    cur.execute(f"select * from {table_id}")
    [[fetched]] = list(cur)
    assert fetched == person