From 1f6e6d89b8f80841fbbb923991f6723f3bb03ec2 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 17 Mar 2021 16:18:44 -0500 Subject: [PATCH 1/4] WIP: fix: don't set policy tags in load job from dataframe --- google/cloud/bigquery/client.py | 8 +++- google/cloud/bigquery/schema.py | 43 +++++++++-------- tests/unit/test_client.py | 84 +++++++++++++++++++-------------- tests/unit/test_schema.py | 14 +++--- 4 files changed, 83 insertions(+), 66 deletions(-) diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index bdbcb767c..311059705 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -2291,9 +2291,13 @@ def load_table_from_dataframe( name for name, _ in _pandas_helpers.list_columns_and_indexes(dataframe) ) - # schema fields not present in the dataframe are not needed job_config.schema = [ - field for field in table.schema if field.name in columns_and_indexes + # Field description and policy tags are not needed to + # serialize a data frame. + SchemaField(field.name, field.field_type, mode=field.mode) + # schema fields not present in the dataframe are not needed + for field in table.schema + if field.name in columns_and_indexes ] job_config.schema = _pandas_helpers.dataframe_to_bq_schema( diff --git a/google/cloud/bigquery/schema.py b/google/cloud/bigquery/schema.py index 9be27f3e8..680dcc138 100644 --- a/google/cloud/bigquery/schema.py +++ b/google/cloud/bigquery/schema.py @@ -19,6 +19,7 @@ from google.cloud.bigquery_v2 import types +_DEFAULT_VALUE = object() _STRUCT_TYPES = ("RECORD", "STRUCT") # SQL types reference: @@ -73,14 +74,18 @@ def __init__( name, field_type, mode="NULLABLE", - description=None, + description=_DEFAULT_VALUE, fields=(), policy_tags=None, ): - self._name = name - self._field_type = field_type - self._mode = mode - self._description = description + self._properties = { + "name": name, + "type": field_type, + } + if mode is not None: + self._properties["mode"] = mode.upper() + if description is not _DEFAULT_VALUE: + self._properties["description"] = description self._fields = tuple(fields) self._policy_tags = policy_tags @@ -98,7 +103,7 @@ def from_api_repr(cls, api_repr): """ # Handle optional properties with default values mode = api_repr.get("mode", "NULLABLE") - description = api_repr.get("description") + description = api_repr.get("description", _DEFAULT_VALUE) fields = api_repr.get("fields", ()) return cls( @@ -113,7 +118,7 @@ def from_api_repr(cls, api_repr): @property def name(self): """str: The name of the field.""" - return self._name + return self._properties["name"] @property def field_type(self): @@ -122,7 +127,7 @@ def field_type(self): See: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.type """ - return self._field_type + return self._properties["type"] @property def mode(self): @@ -131,17 +136,17 @@ def mode(self): See: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.mode """ - return self._mode + return self._properties.get("mode") @property def is_nullable(self): """bool: whether 'mode' is 'nullable'.""" - return self._mode == "NULLABLE" + return self.mode == "NULLABLE" @property def description(self): """Optional[str]: description for the field.""" - return self._description + return self._properties.get("description") @property def fields(self): @@ -164,13 +169,7 @@ def to_api_repr(self): Returns: Dict: A dictionary representing the SchemaField in a serialized form. """ - # Put together the basic representation. See http://bit.ly/2hOAT5u. - answer = { - "mode": self.mode.upper(), - "name": self.name, - "type": self.field_type.upper(), - "description": self.description, - } + answer = self._properties.copy() # If this is a RECORD type, then sub-fields are also included, # add this to the serialized representation. @@ -193,10 +192,10 @@ def _key(self): Tuple: The contents of this :class:`~google.cloud.bigquery.schema.SchemaField`. """ return ( - self._name, - self._field_type.upper(), - self._mode.upper(), - self._description, + self.name, + self.field_type.upper(), + self.mode.upper(), + self.description, self._fields, self._policy_tags, ) diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 6c3263ea5..64d677802 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -1596,18 +1596,8 @@ def test_create_table_w_schema_and_query(self): { "schema": { "fields": [ - { - "name": "full_name", - "type": "STRING", - "mode": "REQUIRED", - "description": None, - }, - { - "name": "age", - "type": "INTEGER", - "mode": "REQUIRED", - "description": None, - }, + {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, + {"name": "age", "type": "INTEGER", "mode": "REQUIRED"}, ] }, "view": {"query": query}, @@ -1641,18 +1631,8 @@ def test_create_table_w_schema_and_query(self): }, "schema": { "fields": [ - { - "name": "full_name", - "type": "STRING", - "mode": "REQUIRED", - "description": None, - }, - { - "name": "age", - "type": "INTEGER", - "mode": "REQUIRED", - "description": None, - }, + {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, + {"name": "age", "type": "INTEGER", "mode": "REQUIRED"}, ] }, "view": {"query": query, "useLegacySql": False}, @@ -2602,7 +2582,7 @@ def test_update_table(self): "name": "age", "type": "INTEGER", "mode": "REQUIRED", - "description": None, + "description": "New field description", }, ] }, @@ -2613,8 +2593,10 @@ def test_update_table(self): } ) schema = [ - SchemaField("full_name", "STRING", mode="REQUIRED"), - SchemaField("age", "INTEGER", mode="REQUIRED"), + SchemaField("full_name", "STRING", mode="REQUIRED", description=None), + SchemaField( + "age", "INTEGER", mode="REQUIRED", description="New field description" + ), ] creds = _make_credentials() client = self._make_one(project=self.PROJECT, credentials=creds) @@ -2647,7 +2629,7 @@ def test_update_table(self): "name": "age", "type": "INTEGER", "mode": "REQUIRED", - "description": None, + "description": "New field description", }, ] }, @@ -7658,18 +7640,40 @@ def test_load_table_from_file_w_invalid_job_config(self): def test_load_table_from_dataframe(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job - from google.cloud.bigquery.schema import SchemaField + from google.cloud.bigquery.schema import PolicyTagList, SchemaField client = self._make_client() - records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}] + records = [ + {"id": 1, "age": 100, "accounts": [2, 3]}, + {"id": 2, "age": 60, "accounts": [5]}, + {"id": 3, "age": 40, "accounts": []}, + ] dataframe = pandas.DataFrame(records) + get_table_schema = [ + SchemaField( + "id", + "INTEGER", + mode="REQUIRED", + description="integer column", + policy_tags=PolicyTagList(names=("foo", "bar")), + ), + SchemaField( + "age", + "INTEGER", + mode="NULLABLE", + description="age column", + policy_tags=PolicyTagList(names=("baz",)), + ), + SchemaField( + "accounts", "INTEGER", mode="REPEATED", description="array column", + ), + ] + get_table_patch = mock.patch( "google.cloud.bigquery.client.Client.get_table", autospec=True, - return_value=mock.Mock( - schema=[SchemaField("id", "INTEGER"), SchemaField("age", "INTEGER")] - ), + return_value=mock.Mock(schema=get_table_schema), ) load_patch = mock.patch( "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True @@ -7695,8 +7699,18 @@ def test_load_table_from_dataframe(self): sent_file = load_table_from_file.mock_calls[0][1][1] assert sent_file.closed - sent_config = load_table_from_file.mock_calls[0][2]["job_config"] - assert sent_config.source_format == job.SourceFormat.PARQUET + sent_config = load_table_from_file.mock_calls[0][2]["job_config"].to_api_repr()[ + "load" + ] + assert sent_config["sourceFormat"] == job.SourceFormat.PARQUET + for field_index, field in enumerate(sent_config["schema"]["fields"]): + assert field["name"] == get_table_schema[field_index].name + assert field["type"] == get_table_schema[field_index].field_type + assert field["mode"] == get_table_schema[field_index].mode + # Omit unnecessary fields when they come from getting the table + # (not passed in via job_config) + assert "description" not in field + assert "policyTags" not in field @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py index 71bf6b5ae..9c77287ad 100644 --- a/tests/unit/test_schema.py +++ b/tests/unit/test_schema.py @@ -60,8 +60,8 @@ def test_constructor_subfields(self): self.assertEqual(field._mode, "NULLABLE") self.assertIsNone(field._description) self.assertEqual(len(field._fields), 2) - self.assertIs(field._fields[0], sub_field1) - self.assertIs(field._fields[1], sub_field2) + self.assertEqual(field._fields[0], sub_field1) + self.assertEqual(field._fields[1], sub_field2) def test_constructor_with_policy_tags(self): from google.cloud.bigquery.schema import PolicyTagList @@ -168,17 +168,17 @@ def test_from_api_repr_defaults(self): def test_name_property(self): name = "lemon-ness" schema_field = self._make_one(name, "INTEGER") - self.assertIs(schema_field.name, name) + self.assertEqual(schema_field.name, name) def test_field_type_property(self): field_type = "BOOLEAN" schema_field = self._make_one("whether", field_type) - self.assertIs(schema_field.field_type, field_type) + self.assertEqual(schema_field.field_type, field_type) def test_mode_property(self): mode = "REPEATED" schema_field = self._make_one("again", "FLOAT", mode=mode) - self.assertIs(schema_field.mode, mode) + self.assertEqual(schema_field.mode, mode) def test_is_nullable(self): mode = "NULLABLE" @@ -193,14 +193,14 @@ def test_is_not_nullable(self): def test_description_property(self): description = "It holds some data." schema_field = self._make_one("do", "TIMESTAMP", description=description) - self.assertIs(schema_field.description, description) + self.assertEqual(schema_field.description, description) def test_fields_property(self): sub_field1 = self._make_one("one", "STRING") sub_field2 = self._make_one("fish", "INTEGER") fields = (sub_field1, sub_field2) schema_field = self._make_one("boat", "RECORD", fields=fields) - self.assertIs(schema_field.fields, fields) + self.assertEqual(schema_field.fields, fields) def test_to_standard_sql_simple_type(self): sql_type = self._get_standard_sql_data_type_class() From d4b6d324012ee50c158a485752946c9a080e6fed Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 17 Mar 2021 17:13:16 -0500 Subject: [PATCH 2/4] copy fields parameter for struct support --- google/cloud/bigquery/client.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index 311059705..305d60d3b 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -2294,7 +2294,12 @@ def load_table_from_dataframe( job_config.schema = [ # Field description and policy tags are not needed to # serialize a data frame. - SchemaField(field.name, field.field_type, mode=field.mode) + SchemaField( + field.name, + field.field_type, + mode=field.mode, + fields=field.fields, + ) # schema fields not present in the dataframe are not needed for field in table.schema if field.name in columns_and_indexes From 3cdbcc73192e94de1679e054fe15fd42b15df5e0 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 18 Mar 2021 09:33:55 -0500 Subject: [PATCH 3/4] update tests to allow missing description property --- tests/unit/job/test_load_config.py | 12 +--- tests/unit/test_client.py | 17 ++++- tests/unit/test_external_config.py | 9 +-- tests/unit/test_schema.py | 99 +++++++++++------------------- 4 files changed, 53 insertions(+), 84 deletions(-) diff --git a/tests/unit/job/test_load_config.py b/tests/unit/job/test_load_config.py index c18f51bff..63f15ec5a 100644 --- a/tests/unit/job/test_load_config.py +++ b/tests/unit/job/test_load_config.py @@ -434,13 +434,11 @@ def test_schema_setter_fields(self): "name": "full_name", "type": "STRING", "mode": "REQUIRED", - "description": None, } age_repr = { "name": "age", "type": "INTEGER", "mode": "REQUIRED", - "description": None, } self.assertEqual( config._properties["load"]["schema"], {"fields": [full_name_repr, age_repr]} @@ -449,24 +447,18 @@ def test_schema_setter_fields(self): def test_schema_setter_valid_mappings_list(self): config = self._get_target_class()() - schema = [ - {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, - {"name": "age", "type": "INTEGER", "mode": "REQUIRED"}, - ] - config.schema = schema - full_name_repr = { "name": "full_name", "type": "STRING", "mode": "REQUIRED", - "description": None, } age_repr = { "name": "age", "type": "INTEGER", "mode": "REQUIRED", - "description": None, } + schema = [full_name_repr, age_repr] + config.schema = schema self.assertEqual( config._properties["load"]["schema"], {"fields": [full_name_repr, age_repr]} ) diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 64d677802..2595c4c14 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -2755,13 +2755,24 @@ def test_update_table_w_query(self): "name": "age", "type": "INTEGER", "mode": "REQUIRED", - "description": None, + "description": "this is a column", }, + {"name": "country", "type": "STRING", "mode": "NULLABLE"}, ] } schema = [ - SchemaField("full_name", "STRING", mode="REQUIRED"), - SchemaField("age", "INTEGER", mode="REQUIRED"), + SchemaField( + "full_name", + "STRING", + mode="REQUIRED", + # Explicitly unset the description. + description=None, + ), + SchemaField( + "age", "INTEGER", mode="REQUIRED", description="this is a column" + ), + # Omit the description to not make updates to it. + SchemaField("country", "STRING"), ] resource = self._make_table_resource() resource.update( diff --git a/tests/unit/test_external_config.py b/tests/unit/test_external_config.py index 4b6ef5118..4ca2e9012 100644 --- a/tests/unit/test_external_config.py +++ b/tests/unit/test_external_config.py @@ -77,14 +77,7 @@ def test_to_api_repr_base(self): ec.schema = [schema.SchemaField("full_name", "STRING", mode="REQUIRED")] exp_schema = { - "fields": [ - { - "name": "full_name", - "type": "STRING", - "mode": "REQUIRED", - "description": None, - } - ] + "fields": [{"name": "full_name", "type": "STRING", "mode": "REQUIRED"}] } got_resource = ec.to_api_repr() exp_resource = { diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py index 9c77287ad..87baaf379 100644 --- a/tests/unit/test_schema.py +++ b/tests/unit/test_schema.py @@ -35,19 +35,19 @@ def _make_one(self, *args, **kw): def test_constructor_defaults(self): field = self._make_one("test", "STRING") - self.assertEqual(field._name, "test") - self.assertEqual(field._field_type, "STRING") - self.assertEqual(field._mode, "NULLABLE") - self.assertIsNone(field._description) - self.assertEqual(field._fields, ()) + self.assertEqual(field.name, "test") + self.assertEqual(field.field_type, "STRING") + self.assertEqual(field.mode, "NULLABLE") + self.assertIsNone(field.description) + self.assertEqual(field.fields, ()) def test_constructor_explicit(self): field = self._make_one("test", "STRING", mode="REQUIRED", description="Testing") - self.assertEqual(field._name, "test") - self.assertEqual(field._field_type, "STRING") - self.assertEqual(field._mode, "REQUIRED") - self.assertEqual(field._description, "Testing") - self.assertEqual(field._fields, ()) + self.assertEqual(field.name, "test") + self.assertEqual(field.field_type, "STRING") + self.assertEqual(field.mode, "REQUIRED") + self.assertEqual(field.description, "Testing") + self.assertEqual(field.fields, ()) def test_constructor_subfields(self): sub_field1 = self._make_one("area_code", "STRING") @@ -55,13 +55,13 @@ def test_constructor_subfields(self): field = self._make_one( "phone_number", "RECORD", fields=[sub_field1, sub_field2] ) - self.assertEqual(field._name, "phone_number") - self.assertEqual(field._field_type, "RECORD") - self.assertEqual(field._mode, "NULLABLE") - self.assertIsNone(field._description) - self.assertEqual(len(field._fields), 2) - self.assertEqual(field._fields[0], sub_field1) - self.assertEqual(field._fields[1], sub_field2) + self.assertEqual(field.name, "phone_number") + self.assertEqual(field.field_type, "RECORD") + self.assertEqual(field.mode, "NULLABLE") + self.assertIsNone(field.description) + self.assertEqual(len(field.fields), 2) + self.assertEqual(field.fields[0], sub_field1) + self.assertEqual(field.fields[1], sub_field2) def test_constructor_with_policy_tags(self): from google.cloud.bigquery.schema import PolicyTagList @@ -70,12 +70,12 @@ def test_constructor_with_policy_tags(self): field = self._make_one( "test", "STRING", mode="REQUIRED", description="Testing", policy_tags=policy ) - self.assertEqual(field._name, "test") - self.assertEqual(field._field_type, "STRING") - self.assertEqual(field._mode, "REQUIRED") - self.assertEqual(field._description, "Testing") - self.assertEqual(field._fields, ()) - self.assertEqual(field._policy_tags, policy) + self.assertEqual(field.name, "test") + self.assertEqual(field.field_type, "STRING") + self.assertEqual(field.mode, "REQUIRED") + self.assertEqual(field.description, "Testing") + self.assertEqual(field.fields, ()) + self.assertEqual(field.policy_tags, policy) def test_to_api_repr(self): from google.cloud.bigquery.schema import PolicyTagList @@ -92,7 +92,6 @@ def test_to_api_repr(self): "mode": "NULLABLE", "name": "foo", "type": "INTEGER", - "description": None, "policyTags": {"names": ["foo", "bar"]}, }, ) @@ -104,18 +103,10 @@ def test_to_api_repr_with_subfield(self): self.assertEqual( field.to_api_repr(), { - "fields": [ - { - "mode": "NULLABLE", - "name": "bar", - "type": "INTEGER", - "description": None, - } - ], + "fields": [{"mode": "NULLABLE", "name": "bar", "type": "INTEGER"}], "mode": "REQUIRED", "name": "foo", "type": record_type, - "description": None, }, ) @@ -532,17 +523,10 @@ def test_defaults(self): resource = self._call_fut([full_name, age]) self.assertEqual(len(resource), 2) self.assertEqual( - resource[0], - { - "name": "full_name", - "type": "STRING", - "mode": "REQUIRED", - "description": None, - }, + resource[0], {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, ) self.assertEqual( - resource[1], - {"name": "age", "type": "INTEGER", "mode": "REQUIRED", "description": None}, + resource[1], {"name": "age", "type": "INTEGER", "mode": "REQUIRED"} ) def test_w_description(self): @@ -552,7 +536,13 @@ def test_w_description(self): full_name = SchemaField( "full_name", "STRING", mode="REQUIRED", description=DESCRIPTION ) - age = SchemaField("age", "INTEGER", mode="REQUIRED") + age = SchemaField( + "age", + "INTEGER", + mode="REQUIRED", + # Explicitly unset description. + description=None, + ) resource = self._call_fut([full_name, age]) self.assertEqual(len(resource), 2) self.assertEqual( @@ -581,13 +571,7 @@ def test_w_subfields(self): resource = self._call_fut([full_name, phone]) self.assertEqual(len(resource), 2) self.assertEqual( - resource[0], - { - "name": "full_name", - "type": "STRING", - "mode": "REQUIRED", - "description": None, - }, + resource[0], {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, ) self.assertEqual( resource[1], @@ -595,20 +579,9 @@ def test_w_subfields(self): "name": "phone", "type": "RECORD", "mode": "REPEATED", - "description": None, "fields": [ - { - "name": "type", - "type": "STRING", - "mode": "REQUIRED", - "description": None, - }, - { - "name": "number", - "type": "STRING", - "mode": "REQUIRED", - "description": None, - }, + {"name": "type", "type": "STRING", "mode": "REQUIRED"}, + {"name": "number", "type": "STRING", "mode": "REQUIRED"}, ], }, ) From fe1029d6c544d9d3d9cb828fec1355806f982ec2 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 18 Mar 2021 10:15:16 -0500 Subject: [PATCH 4/4] fix load from dataframe test on python 3.6 Also, check that sent schema matches DataFrame order, not table order --- tests/unit/test_client.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 2595c4c14..26ef340de 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -7659,26 +7659,33 @@ def test_load_table_from_dataframe(self): {"id": 2, "age": 60, "accounts": [5]}, {"id": 3, "age": 40, "accounts": []}, ] - dataframe = pandas.DataFrame(records) - - get_table_schema = [ - SchemaField( + # Mixup column order so that we can verify sent schema matches the + # serialized order, not the table column order. + column_order = ["age", "accounts", "id"] + dataframe = pandas.DataFrame(records, columns=column_order) + table_fields = { + "id": SchemaField( "id", "INTEGER", mode="REQUIRED", description="integer column", policy_tags=PolicyTagList(names=("foo", "bar")), ), - SchemaField( + "age": SchemaField( "age", "INTEGER", mode="NULLABLE", description="age column", policy_tags=PolicyTagList(names=("baz",)), ), - SchemaField( + "accounts": SchemaField( "accounts", "INTEGER", mode="REPEATED", description="array column", ), + } + get_table_schema = [ + table_fields["id"], + table_fields["age"], + table_fields["accounts"], ] get_table_patch = mock.patch( @@ -7715,9 +7722,12 @@ def test_load_table_from_dataframe(self): ] assert sent_config["sourceFormat"] == job.SourceFormat.PARQUET for field_index, field in enumerate(sent_config["schema"]["fields"]): - assert field["name"] == get_table_schema[field_index].name - assert field["type"] == get_table_schema[field_index].field_type - assert field["mode"] == get_table_schema[field_index].mode + assert field["name"] == column_order[field_index] + table_field = table_fields[field["name"]] + assert field["name"] == table_field.name + assert field["type"] == table_field.field_type + assert field["mode"] == table_field.mode + assert len(field.get("fields", [])) == len(table_field.fields) # Omit unnecessary fields when they come from getting the table # (not passed in via job_config) assert "description" not in field