Skip to content

Commit

Permalink
fix: raise error if inserting rows with unknown fields (#163)
Browse files Browse the repository at this point in the history
Co-authored-by: Tres Seaver <tseaver@palladion.com>
  • Loading branch information
plamut and tseaver committed Jul 30, 2020
1 parent f59fc9a commit 8fe7254
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 1 deletion.
31 changes: 30 additions & 1 deletion google/cloud/bigquery/_helpers.py
Expand Up @@ -19,6 +19,7 @@
import datetime
import decimal
import re
import six

from google.cloud._helpers import UTC
from google.cloud._helpers import _date_from_iso8601_date
Expand Down Expand Up @@ -419,9 +420,23 @@ def _record_field_to_json(fields, row_value):
Returns:
Mapping[str, Any]: A JSON-serializable dictionary.
"""
record = {}
isdict = isinstance(row_value, dict)

# If row is passed as a tuple, make the length sanity check to avoid either
# uninformative index errors a few lines below or silently omitting some of
# the values from the result (we cannot know exactly which fields are missing
# or redundant, since we don't have their names).
if not isdict and len(row_value) != len(fields):
msg = "The number of row fields ({}) does not match schema length ({}).".format(
len(row_value), len(fields)
)
raise ValueError(msg)

record = {}

if isdict:
processed_fields = set()

for subindex, subfield in enumerate(fields):
subname = subfield.name
subvalue = row_value.get(subname) if isdict else row_value[subindex]
Expand All @@ -430,6 +445,20 @@ def _record_field_to_json(fields, row_value):
if subvalue is not None:
record[subname] = _field_to_json(subfield, subvalue)

if isdict:
processed_fields.add(subname)

# Unknown fields should not be silently dropped, include them. Since there
# is no schema information available for them, include them as strings
# to make them JSON-serializable.
if isdict:
not_processed = set(row_value.keys()) - processed_fields

for field_name in not_processed:
value = row_value[field_name]
if value is not None:
record[field_name] = six.text_type(value)

return record


Expand Down
40 changes: 40 additions & 0 deletions tests/unit/test__helpers.py
Expand Up @@ -18,6 +18,7 @@
import unittest

import mock
import six


class Test_not_null(unittest.TestCase):
Expand Down Expand Up @@ -847,6 +848,26 @@ def test_w_non_empty_list(self):
converted = self._call_fut(fields, original)
self.assertEqual(converted, {"one": "42", "two": "two"})

def test_w_list_missing_fields(self):
    """A sequence row with fewer values than schema fields must raise."""
    schema = [
        _make_field("INT64", name="one", mode="NULLABLE"),
        _make_field("STRING", name="two", mode="NULLABLE"),
    ]
    # Only one value supplied for a two-field schema.
    row = [42]

    with six.assertRaisesRegex(self, ValueError, r".*not match schema length.*"):
        self._call_fut(schema, row)

def test_w_list_too_many_fields(self):
    """A sequence row with more values than schema fields must raise."""
    schema = [
        _make_field("INT64", name="one", mode="NULLABLE"),
        _make_field("STRING", name="two", mode="NULLABLE"),
    ]
    # Three values supplied for a two-field schema.
    row = [42, "two", "three"]

    with six.assertRaisesRegex(self, ValueError, r".*not match schema length.*"):
        self._call_fut(schema, row)

def test_w_non_empty_dict(self):
fields = [
_make_field("INT64", name="one", mode="NULLABLE"),
Expand Down Expand Up @@ -890,6 +911,25 @@ def test_w_explicit_none_value(self):
# None values should be dropped regardless of the field type
self.assertEqual(converted, {"one": "42"})

def test_w_dict_unknown_fields(self):
    """Dict keys missing from the schema are kept as strings; None is dropped."""
    schema = [
        _make_field("INT64", name="one", mode="NULLABLE"),
        _make_field("STRING", name="two", mode="NULLABLE"),
    ]
    row = {
        "whoami": datetime.date(2020, 7, 20),
        "one": 111,
        "two": "222",
        "void": None,
    }

    result = self._call_fut(schema, row)

    # Unknown fields should be included (if not None), but converted as strings.
    expected = {"whoami": "2020-07-20", "one": "111", "two": "222"}
    self.assertEqual(result, expected)


class Test_field_to_json(unittest.TestCase):
def _call_fut(self, field, value):
Expand Down

0 comments on commit 8fe7254

Please sign in to comment.