From 8fe725429541eed34ddc01cffc8b1ee846c14162 Mon Sep 17 00:00:00 2001 From: Peter Lamut Date: Thu, 30 Jul 2020 22:08:47 +0200 Subject: [PATCH] fix: raise error if inserting rows with unknown fields (#163) Co-authored-by: Tres Seaver --- google/cloud/bigquery/_helpers.py | 31 +++++++++++++++++++++++- tests/unit/test__helpers.py | 40 +++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index d814eec8c..47851d42c 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -19,6 +19,7 @@ import datetime import decimal import re +import six from google.cloud._helpers import UTC from google.cloud._helpers import _date_from_iso8601_date @@ -419,9 +420,23 @@ def _record_field_to_json(fields, row_value): Returns: Mapping[str, Any]: A JSON-serializable dictionary. """ - record = {} isdict = isinstance(row_value, dict) + # If row is passed as a tuple, make the length sanity check to avoid either + # uninformative index errors a few lines below or silently omitting some of + # the values from the result (we cannot know exactly which fields are missing + # or redundant, since we don't have their names). + if not isdict and len(row_value) != len(fields): + msg = "The number of row fields ({}) does not match schema length ({}).".format( + len(row_value), len(fields) + ) + raise ValueError(msg) + + record = {} + + if isdict: + processed_fields = set() + for subindex, subfield in enumerate(fields): subname = subfield.name subvalue = row_value.get(subname) if isdict else row_value[subindex] @@ -430,6 +445,20 @@ def _record_field_to_json(fields, row_value): if subvalue is not None: record[subname] = _field_to_json(subfield, subvalue) + if isdict: + processed_fields.add(subname) + + # Unknown fields should not be silently dropped, include them. Since there + # is no schema information available for them, include them as strings + # to make them JSON-serializable. + if isdict: + not_processed = set(row_value.keys()) - processed_fields + + for field_name in not_processed: + value = row_value[field_name] + if value is not None: + record[field_name] = six.text_type(value) + return record diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py index fa6d27c98..28ebe8144 100644 --- a/tests/unit/test__helpers.py +++ b/tests/unit/test__helpers.py @@ -18,6 +18,7 @@ import unittest import mock +import six class Test_not_null(unittest.TestCase): @@ -847,6 +848,26 @@ def test_w_non_empty_list(self): converted = self._call_fut(fields, original) self.assertEqual(converted, {"one": "42", "two": "two"}) + def test_w_list_missing_fields(self): + fields = [ + _make_field("INT64", name="one", mode="NULLABLE"), + _make_field("STRING", name="two", mode="NULLABLE"), + ] + original = [42] + + with six.assertRaisesRegex(self, ValueError, r".*not match schema length.*"): + self._call_fut(fields, original) + + def test_w_list_too_many_fields(self): + fields = [ + _make_field("INT64", name="one", mode="NULLABLE"), + _make_field("STRING", name="two", mode="NULLABLE"), + ] + original = [42, "two", "three"] + + with six.assertRaisesRegex(self, ValueError, r".*not match schema length.*"): + self._call_fut(fields, original) + def test_w_non_empty_dict(self): fields = [ _make_field("INT64", name="one", mode="NULLABLE"), @@ -890,6 +911,25 @@ def test_w_explicit_none_value(self): # None values should be dropped regardless of the field type self.assertEqual(converted, {"one": "42"}) + def test_w_dict_unknown_fields(self): + fields = [ + _make_field("INT64", name="one", mode="NULLABLE"), + _make_field("STRING", name="two", mode="NULLABLE"), + ] + original = { + "whoami": datetime.date(2020, 7, 20), + "one": 111, + "two": "222", + "void": None, + } + + converted = self._call_fut(fields, original) + + # Unknown fields should be included (if not None), but converted as strings. + self.assertEqual( + converted, {"whoami": "2020-07-20", "one": "111", "two": "222"}, + ) + class Test_field_to_json(unittest.TestCase): def _call_fut(self, field, value):