fix: use compliant Parquet by default
judahrand committed Sep 21, 2021
1 parent ee1e25c commit 62fd565
Showing 2 changed files with 54 additions and 15 deletions.
google/cloud/bigquery/_pandas_helpers.py (20 changes: 18 additions & 2 deletions)
@@ -530,7 +530,13 @@ def dataframe_to_arrow(dataframe, bq_schema):
return pyarrow.Table.from_arrays(arrow_arrays, names=arrow_names)


-def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SNAPPY"):
+def dataframe_to_parquet(
+    dataframe,
+    bq_schema,
+    filepath,
+    parquet_compression="SNAPPY",
+    parquet_use_compliant_nested_type=True,
+):
"""Write dataframe as a Parquet file, according to the desired BQ schema.
This function requires the :mod:`pyarrow` package. Arrow is used as an
@@ -551,14 +557,24 @@ def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SN
The compression codec to use by the ``pyarrow.parquet.write_table``
serializing method. Defaults to "SNAPPY".
https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
+        parquet_use_compliant_nested_type (bool):
+            Whether the ``pyarrow.parquet.write_table`` serializing method should write
+            compliant Parquet nested type (lists). Defaults to ``True``.
+            https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types
+            https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
"""
pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)

import pyarrow.parquet

bq_schema = schema._to_schema_fields(bq_schema)
arrow_table = dataframe_to_arrow(dataframe, bq_schema)
-    pyarrow.parquet.write_table(arrow_table, filepath, compression=parquet_compression)
+    pyarrow.parquet.write_table(
+        arrow_table,
+        filepath,
+        compression=parquet_compression,
+        use_compliant_nested_type=parquet_use_compliant_nested_type,
+    )


def _row_iterator_page_to_arrow(page, column_names, arrow_types):
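For context, here is a minimal sketch (not part of the commit) of what the new use_compliant_nested_type flag changes when pyarrow writes a list column. The dataframe, column name, and file paths are illustrative, and the sketch assumes a pyarrow version recent enough to expose the flag; the exact schema text printed will vary by version.

import pandas as pd
import pyarrow
import pyarrow.parquet

# A dataframe with a list column, the nested case the docstring above refers to.
df = pd.DataFrame({"tags": [["a", "b"], ["c"]]})
arrow_table = pyarrow.Table.from_pandas(df)

# Old behaviour: pyarrow's legacy nested list layout.
pyarrow.parquet.write_table(
    arrow_table, "legacy.parquet", use_compliant_nested_type=False
)

# New default: nested lists follow the Parquet LogicalTypes specification
# linked in the docstring above.
pyarrow.parquet.write_table(
    arrow_table, "compliant.parquet", use_compliant_nested_type=True
)

# Inspecting both files shows how the nested list group is laid out differently.
print(pyarrow.parquet.ParquetFile("legacy.parquet").schema)
print(pyarrow.parquet.ParquetFile("compliant.parquet").schema)

The only difference between the two files should be the structure of the nested list group, which is exactly what the compliant setting aligns with the spec.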
google/cloud/bigquery/client.py (49 changes: 36 additions & 13 deletions)
@@ -2456,6 +2456,7 @@ def load_table_from_dataframe(
project: str = None,
job_config: LoadJobConfig = None,
parquet_compression: str = "snappy",
+        parquet_use_compliant_nested_type: bool = True,
timeout: float = DEFAULT_TIMEOUT,
) -> job.LoadJob:
"""Upload the contents of a table from a pandas DataFrame.
@@ -2519,18 +2520,34 @@ def load_table_from_dataframe(
:attr:`~google.cloud.bigquery.job.SourceFormat.PARQUET` are
supported.
parquet_compression (Optional[str]):
-                [Beta] The compression method to use if intermittently
-                serializing ``dataframe`` to a parquet file.
-                The argument is directly passed as the ``compression``
-                argument to the underlying ``pyarrow.parquet.write_table()``
-                method (the default value "snappy" gets converted to uppercase).
-                https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
-                If the job config schema is missing, the argument is directly
-                passed as the ``compression`` argument to the underlying
-                ``DataFrame.to_parquet()`` method.
-                https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet
+                [Beta] The compression method to use if intermittently
+                serializing ``dataframe`` to a parquet file.
+                The argument is directly passed as the ``compression``
+                argument to the underlying ``pyarrow.parquet.write_table()``
+                method (the default value "snappy" gets converted to uppercase).
+                https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
+                If the job config schema is missing, the argument is directly
+                passed as the ``compression`` argument to the underlying
+                ``DataFrame.to_parquet()`` method.
+                https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet
+            parquet_use_compliant_nested_type (bool):
+                Whether the ``pyarrow.parquet.write_table`` serializing method should write
+                compliant Parquet nested type (lists). Defaults to ``True``.
+                The argument is directly passed as the ``use_compliant_nested_type``
+                argument to the underlying ``pyarrow.parquet.write_table()``
+                method.
+                https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
+                If the job config schema is missing, the argument is directly
+                passed as an additional ``kwarg`` argument to the underlying
+                ``DataFrame.to_parquet()`` method.
+                https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet
+                This argument is only present to allow for backwards compatibility with
+                tables created using an old version of this method.
timeout (Optional[float]):
The number of seconds to wait for the underlying HTTP transport
before using ``retry``.
@@ -2647,9 +2664,15 @@ def load_table_from_dataframe(
job_config.schema,
tmppath,
parquet_compression=parquet_compression,
+                parquet_use_compliant_nested_type=parquet_use_compliant_nested_type,
)
else:
-            dataframe.to_parquet(tmppath, compression=parquet_compression)
+            dataframe.to_parquet(
+                tmppath,
+                engine="pyarrow",
+                compression=parquet_compression,
+                use_compliant_nested_type=parquet_use_compliant_nested_type,
+            )

else:

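At the client level, the new keyword simply flows through to whichever Parquet serialization path is taken. Below is a usage sketch (again, not part of the commit); the destination table ID and schema are placeholders.

import pandas as pd
from google.cloud import bigquery

client = bigquery.Client()  # assumes default credentials and project

df = pd.DataFrame({"tags": [["a", "b"], ["c"]]})

# Providing an explicit schema sends the load down the dataframe_to_parquet()
# path shown in _pandas_helpers.py above.
job_config = bigquery.LoadJobConfig(
    schema=[bigquery.SchemaField("tags", "STRING", mode="REPEATED")],
)

# parquet_use_compliant_nested_type now defaults to True; passing False is only
# needed for backwards compatibility with tables created by older versions of
# this method, as the docstring notes.
load_job = client.load_table_from_dataframe(
    df,
    "my-project.my_dataset.my_table",  # placeholder destination
    job_config=job_config,
    parquet_use_compliant_nested_type=True,
)
load_job.result()  # wait for the load job to complete

Keeping parquet_use_compliant_nested_type exposed, rather than hard-coding True, preserves the escape hatch the docstring describes for tables created with the old layout.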
