fix: use compliant Parquet by default
judahrand committed Sep 21, 2021
1 parent ee1e25c commit 62fd565
Showing 2 changed files with 54 additions and 15 deletions.
google/cloud/bigquery/_pandas_helpers.py (20 changes: 18 additions & 2 deletions)
@@ -530,7 +530,13 @@ def dataframe_to_arrow(dataframe, bq_schema):
return pyarrow.Table.from_arrays(arrow_arrays, names=arrow_names)


-def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SNAPPY"):
+def dataframe_to_parquet(
+    dataframe,
+    bq_schema,
+    filepath,
+    parquet_compression="SNAPPY",
+    parquet_use_compliant_nested_type=True,
+):
"""Write dataframe as a Parquet file, according to the desired BQ schema.
This function requires the :mod:`pyarrow` package. Arrow is used as an
@@ -551,14 +557,24 @@ def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SN
The compression codec to use by the ``pyarrow.parquet.write_table``
serializing method. Defaults to "SNAPPY".
https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
+        parquet_use_compliant_nested_type (bool):
+            Whether the ``pyarrow.parquet.write_table`` serializing method should write
+            compliant Parquet nested type (lists). Defaults to ``True``.
+            https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types
+            https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
"""
pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)

import pyarrow.parquet

bq_schema = schema._to_schema_fields(bq_schema)
arrow_table = dataframe_to_arrow(dataframe, bq_schema)
-    pyarrow.parquet.write_table(arrow_table, filepath, compression=parquet_compression)
+    pyarrow.parquet.write_table(
+        arrow_table,
+        filepath,
+        compression=parquet_compression,
+        use_compliant_nested_type=parquet_use_compliant_nested_type,
+    )


def _row_iterator_page_to_arrow(page, column_names, arrow_types):
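For context, here is a minimal sketch (not part of the commit) of what the new use_compliant_nested_type flag changes when pyarrow writes a list column. The dataframe, column name, and file paths are illustrative, and the sketch assumes a pyarrow version recent enough to expose the flag; the exact schema text printed will vary by version.

import pandas as pd
import pyarrow
import pyarrow.parquet

# A dataframe with a list column, the nested case the docstring above refers to.
df = pd.DataFrame({"tags": [["a", "b"], ["c"]]})
arrow_table = pyarrow.Table.from_pandas(df)

# Old behaviour: pyarrow's legacy nested list layout.
pyarrow.parquet.write_table(
    arrow_table, "legacy.parquet", use_compliant_nested_type=False
)

# New default: nested lists follow the Parquet LogicalTypes specification
# linked in the docstring above.
pyarrow.parquet.write_table(
    arrow_table, "compliant.parquet", use_compliant_nested_type=True
)

# Inspecting both files shows how the nested list group is laid out differently.
print(pyarrow.parquet.ParquetFile("legacy.parquet").schema)
print(pyarrow.parquet.ParquetFile("compliant.parquet").schema)

The only difference between the two files should be the structure of the nested list group, which is exactly what the compliant setting aligns with the spec.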
google/cloud/bigquery/client.py (49 changes: 36 additions & 13 deletions)
@@ -2456,6 +2456,7 @@ def load_table_from_dataframe(
project: str = None,
job_config: LoadJobConfig = None,
parquet_compression: str = "snappy",
+        parquet_use_compliant_nested_type: bool = True,
timeout: float = DEFAULT_TIMEOUT,
) -> job.LoadJob:
"""Upload the contents of a table from a pandas DataFrame.
@@ -2519,18 +2520,34 @@ def load_table_from_dataframe(
:attr:`~google.cloud.bigquery.job.SourceFormat.PARQUET` are
supported.
parquet_compression (Optional[str]):
-                [Beta] The compression method to use if intermittently
-                serializing ``dataframe`` to a parquet file.
-                The argument is directly passed as the ``compression``
-                argument to the underlying ``pyarrow.parquet.write_table()``
-                method (the default value "snappy" gets converted to uppercase).
-                https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
-                If the job config schema is missing, the argument is directly
-                passed as the ``compression`` argument to the underlying
-                ``DataFrame.to_parquet()`` method.
-                https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet
+                [Beta] The compression method to use if intermittently
+                serializing ``dataframe`` to a parquet file.
+                The argument is directly passed as the ``compression``
+                argument to the underlying ``pyarrow.parquet.write_table()``
+                method (the default value "snappy" gets converted to uppercase).
+                https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
+                If the job config schema is missing, the argument is directly
+                passed as the ``compression`` argument to the underlying
+                ``DataFrame.to_parquet()`` method.
+                https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet
+            parquet_use_compliant_nested_type (bool):
+                Whether the ``pyarrow.parquet.write_table`` serializing method should write
+                compliant Parquet nested type (lists). Defaults to ``True``.
+                The argument is directly passed as the ``use_compliant_nested_type``
+                argument to the underlying ``pyarrow.parquet.write_table()``
+                method.
+                https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
+                If the job config schema is missing, the argument is directly
+                passed as an additional ``kwarg`` argument to the underlying
+                ``DataFrame.to_parquet()`` method.
+                https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet
+                This argument is only present to allow for backwards compatibility with
+                tables created using an old version of this method.
timeout (Optional[float]):
The number of seconds to wait for the underlying HTTP transport
before using ``retry``.
@@ -2647,9 +2664,15 @@ def load_table_from_dataframe(
job_config.schema,
tmppath,
parquet_compression=parquet_compression,
+                parquet_use_compliant_nested_type=parquet_use_compliant_nested_type,
)
else:
-            dataframe.to_parquet(tmppath, compression=parquet_compression)
+            dataframe.to_parquet(
+                tmppath,
+                engine="pyarrow",
+                compression=parquet_compression,
+                use_compliant_nested_type=parquet_use_compliant_nested_type,
+            )

else:

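At the client level, the new keyword simply flows through to whichever Parquet serialization path is taken. Below is a usage sketch (again, not part of the commit); the destination table ID and schema are placeholders.

import pandas as pd
from google.cloud import bigquery

client = bigquery.Client()  # assumes default credentials and project

df = pd.DataFrame({"tags": [["a", "b"], ["c"]]})

# Providing an explicit schema sends the load down the dataframe_to_parquet()
# path shown in _pandas_helpers.py above.
job_config = bigquery.LoadJobConfig(
    schema=[bigquery.SchemaField("tags", "STRING", mode="REPEATED")],
)

# parquet_use_compliant_nested_type now defaults to True; passing False is only
# needed for backwards compatibility with tables created by older versions of
# this method, as the docstring notes.
load_job = client.load_table_from_dataframe(
    df,
    "my-project.my_dataset.my_table",  # placeholder destination
    job_config=job_config,
    parquet_use_compliant_nested_type=True,
)
load_job.result()  # wait for the load job to complete

Keeping parquet_use_compliant_nested_type exposed, rather than hard-coding True, preserves the escape hatch the docstring describes for tables created with the old layout.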
