Skip to content

Commit

Permalink
feat: to_gbq uses Parquet by default, use api_method="load_csv" for old behavior (#413)
Browse files Browse the repository at this point in the history

* avoid parquet for older pandas

docs: deprecate `chunksize` when used with load jobs

* keep `chunksize` for future use in streaming APIs

deps: explicitly require `pyarrow >= 3.0`

* mention pyarrow as a dependency

* add pyarrow to conda deps

deps: explicitly require `numpy >= 1.16.6`

* update minimum numpy

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
  • Loading branch information
tswast and gcf-owl-bot[bot] committed Nov 2, 2021
1 parent 81fa744 commit 9a65383
Show file tree
Hide file tree
Showing 16 changed files with 286 additions and 151 deletions.
2 changes: 1 addition & 1 deletion CONTRIBUTING.rst
Expand Up @@ -148,7 +148,7 @@ Running System Tests

.. note::

System tests are only configured to run under Python 3.8 and 3.9.
System tests are only configured to run under Python 3.7, 3.8 and 3.9.
For expediency, we do not run them in older versions of Python 3.

This alone will not run the tests. You'll need to change some local
Expand Down
3 changes: 2 additions & 1 deletion ci/requirements-3.7-0.23.2.conda
Expand Up @@ -2,8 +2,9 @@ codecov
coverage
fastavro
flake8
numpy==1.14.5
numpy==1.16.6
google-cloud-bigquery==1.11.1
pyarrow==3.0.0
pydata-google-auth
pytest
pytest-cov
Expand Down
1 change: 1 addition & 0 deletions ci/requirements-3.9-NIGHTLY.conda
@@ -1,6 +1,7 @@
pydata-google-auth
google-cloud-bigquery
google-cloud-bigquery-storage
pyarrow
pytest
pytest-cov
codecov
Expand Down
3 changes: 2 additions & 1 deletion docs/install.rst
Expand Up @@ -29,7 +29,7 @@ Install from Source

.. code-block:: shell
$ pip install git+https://github.com/pydata/pandas-gbq.git
$ pip install git+https://github.com/googleapis/python-bigquery-pandas.git
Dependencies
Expand All @@ -38,6 +38,7 @@ Dependencies
This module requires following additional dependencies:

- `pydata-google-auth <https://github.com/pydata/pydata-google-auth>`__: Helpers for authentication to Google's API
- `pyarrow <https://arrow.apache.org/docs/python/>`__: Format for getting data to/from Google BigQuery
- `google-auth <https://github.com/GoogleCloudPlatform/google-auth-library-python>`__: authentication and authorization for Google's API
- `google-auth-oauthlib <https://github.com/GoogleCloudPlatform/google-auth-library-python-oauthlib>`__: integration with `oauthlib <https://github.com/idan/oauthlib>`__ for end-user authentication
- `google-cloud-bigquery <https://googleapis.dev/python/bigquery/latest/index.html>`__: Google Cloud client library for BigQuery
Expand Down
2 changes: 1 addition & 1 deletion noxfile.py
Expand Up @@ -28,7 +28,7 @@
BLACK_PATHS = ["docs", "pandas_gbq", "tests", "noxfile.py", "setup.py"]

DEFAULT_PYTHON_VERSION = "3.8"
SYSTEM_TEST_PYTHON_VERSIONS = ["3.8", "3.9"]
SYSTEM_TEST_PYTHON_VERSIONS = ["3.7", "3.8", "3.9"]
UNIT_TEST_PYTHON_VERSIONS = ["3.7", "3.8", "3.9"]

CURRENT_DIRECTORY = pathlib.Path(__file__).parent.absolute()
Expand Down
2 changes: 1 addition & 1 deletion owlbot.py
Expand Up @@ -31,7 +31,7 @@
extras = ["tqdm"]
templated_files = common.py_library(
unit_test_python_versions=["3.7", "3.8", "3.9"],
system_test_python_versions=["3.8", "3.9"],
system_test_python_versions=["3.7", "3.8", "3.9"],
cov_level=86,
unit_test_extras=extras,
system_test_extras=extras,
Expand Down
15 changes: 12 additions & 3 deletions pandas_gbq/exceptions.py
Expand Up @@ -3,21 +3,30 @@
# license that can be found in the LICENSE file.


class GenericGBQException(ValueError):
    """Raised when the Google API returns an error that pandas-gbq does not
    map to a more specific exception type.
    """


class AccessDenied(ValueError):
    """
    Raised when invalid credentials are provided, or tokens have expired.
    """

    # NOTE: no `pass` needed — the docstring alone is a valid class body,
    # matching the style of the sibling exception classes in this module.

class ConversionError(GenericGBQException):
    """Raised when the DataFrame cannot be converted into the format
    required to upload it to BigQuery.
    """


class InvalidPrivateKeyFormat(ValueError):
    """
    Raised when provided private key has invalid format.
    """

    # NOTE: no `pass` needed — the docstring alone is a valid class body,
    # matching the style of the sibling exception classes in this module.


class PerformanceWarning(RuntimeWarning):
"""
Expand Down
10 changes: 10 additions & 0 deletions pandas_gbq/features.py
Expand Up @@ -10,6 +10,7 @@
BIGQUERY_BQSTORAGE_VERSION = "1.24.0"
BIGQUERY_FROM_DATAFRAME_CSV_VERSION = "2.6.0"
PANDAS_VERBOSITY_DEPRECATION_VERSION = "0.23.0"
PANDAS_PARQUET_LOSSLESS_TIMESTAMP_VERSION = "1.1.0"


class Features:
Expand Down Expand Up @@ -89,5 +90,14 @@ def pandas_has_deprecated_verbose(self):
)
return self.pandas_installed_version >= pandas_verbosity_deprecation

@property
def pandas_has_parquet_with_lossless_timestamp(self):
    """Return True if the installed pandas supports lossless timestamp
    round-trips to Parquet (pandas >= PANDAS_PARQUET_LOSSLESS_TIMESTAMP_VERSION,
    i.e. 1.1.0).

    Consumers use this to pick a default ``api_method`` for ``to_gbq``:
    Parquet when available losslessly, otherwise CSV.
    """
    # Imported lazily so merely importing this module does not require
    # pkg_resources (setuptools).
    import pkg_resources

    desired_version = pkg_resources.parse_version(
        PANDAS_PARQUET_LOSSLESS_TIMESTAMP_VERSION
    )
    # Compare parsed version objects, not strings, so that e.g.
    # "1.10.0" correctly sorts above "1.1.0".
    return self.pandas_installed_version >= desired_version


FEATURES = Features()
49 changes: 38 additions & 11 deletions pandas_gbq/gbq.py
Expand Up @@ -18,8 +18,11 @@
bigquery = None
google_exceptions = None

from pandas_gbq.exceptions import AccessDenied
from pandas_gbq.exceptions import PerformanceWarning
from pandas_gbq.exceptions import (
AccessDenied,
GenericGBQException,
PerformanceWarning,
)
from pandas_gbq import features
from pandas_gbq.features import FEATURES
import pandas_gbq.schema
Expand Down Expand Up @@ -69,14 +72,6 @@ class DatasetCreationError(ValueError):
pass


class GenericGBQException(ValueError):
"""
Raised when an unrecognized Google API Error occurs.
"""

pass


class InvalidColumnOrder(ValueError):
"""
Raised when the provided column order for output
Expand Down Expand Up @@ -520,7 +515,7 @@ def _download_results(
df = rows_iter.to_dataframe(
dtypes=conversion_dtypes,
progress_bar_type=progress_bar_type,
**to_dataframe_kwargs
**to_dataframe_kwargs,
)
except self.http_error as ex:
self.process_http_error(ex)
Expand All @@ -541,6 +536,7 @@ def load_data(
chunksize=None,
schema=None,
progress_bar=True,
api_method: str = "load_parquet",
):
from pandas_gbq import load

Expand All @@ -554,6 +550,7 @@ def load_data(
chunksize=chunksize,
schema=schema,
location=self.location,
api_method=api_method,
)
if progress_bar and tqdm:
chunks = tqdm.tqdm(chunks)
Expand Down Expand Up @@ -876,6 +873,7 @@ def to_gbq(
location=None,
progress_bar=True,
credentials=None,
api_method: str = "default",
verbose=None,
private_key=None,
):
Expand Down Expand Up @@ -964,6 +962,12 @@ def to_gbq(
:class:`google.oauth2.service_account.Credentials` directly.
.. versionadded:: 0.8.0
api_method : str, optional
API method used to upload DataFrame to BigQuery. One of "load_parquet",
"load_csv". Default "load_parquet" if pandas is version 1.1.0+,
otherwise "load_csv".
.. versionadded:: 0.16.0
verbose : bool, deprecated
Deprecated in Pandas-GBQ 0.4.0. Use the `logging module
to adjust verbosity instead
Expand All @@ -988,6 +992,28 @@ def to_gbq(
stacklevel=1,
)

if api_method == "default":
# Avoid using parquet if pandas doesn't support lossless conversions to
# parquet timestamp. See: https://stackoverflow.com/a/69758676/101923
if FEATURES.pandas_has_parquet_with_lossless_timestamp:
api_method = "load_parquet"
else:
api_method = "load_csv"

if chunksize is not None:
if api_method == "load_parquet":
warnings.warn(
"chunksize is ignored when using api_method='load_parquet'",
DeprecationWarning,
stacklevel=2,
)
elif api_method == "load_csv":
warnings.warn(
"chunksize will be ignored when using api_method='load_csv' in a future version of pandas-gbq",
PendingDeprecationWarning,
stacklevel=2,
)

if if_exists not in ("fail", "replace", "append"):
raise ValueError("'{0}' is not valid for if_exists".format(if_exists))

Expand Down Expand Up @@ -1071,6 +1097,7 @@ def to_gbq(
chunksize=chunksize,
schema=table_schema,
progress_bar=progress_bar,
api_method=api_method,
)


Expand Down

0 comments on commit 9a65383

Please sign in to comment.