
fix: converting to dataframe with out of bounds timestamps #209

Merged (2 commits, Aug 15, 2020)
31 changes: 30 additions & 1 deletion google/cloud/bigquery/table.py
@@ -21,6 +21,7 @@
 import functools
 import logging
 import operator
+import pytz
 import warnings

 import six
@@ -1726,7 +1727,35 @@ def to_dataframe(
             bqstorage_client=bqstorage_client,
             create_bqstorage_client=create_bqstorage_client,
         )
-        df = record_batch.to_pandas(date_as_object=date_as_object)
+
+        # When converting timestamp values to nanosecond precision, the result
+        # can be out of pyarrow bounds. To avoid the error when converting to
+        # Pandas, we set the timestamp_as_object parameter to True, if necessary.
+        #
+        # NOTE: Python 3+ only, as the timestamp_as_object parameter is only
+        # supported in pyarrow>=1.0, but the latter is not compatible with Python 2.
+        if six.PY2:
+            extra_kwargs = {}
+        else:
+            types_to_check = {
+                pyarrow.timestamp("us"),
+                pyarrow.timestamp("us", tz=pytz.UTC),
+            }
+
+            for column in record_batch:
+                if column.type in types_to_check:
+                    try:
+                        column.cast("timestamp[ns]")
+                    except pyarrow.lib.ArrowInvalid:
+                        timestamp_as_object = True
+                        break
+            else:
+                timestamp_as_object = False
+
+            extra_kwargs = {"timestamp_as_object": timestamp_as_object}
+
+        df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs)

         for column in dtypes:
             df[column] = pandas.Series(df[column], dtype=dtypes[column])
         return df
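For context, here is a standalone sketch of the failure mode this hunk guards against, using only public pyarrow APIs (not code from this PR). A year-9999 value fits microsecond precision but overflows the int64 nanoseconds that pandas' datetime64[ns] dtype requires, so the safe cast raises ArrowInvalid; timestamp_as_object=True (pyarrow>=1.0) instead keeps the column as plain datetime.datetime objects:

import datetime

import pyarrow

arr = pyarrow.array(
    [datetime.datetime(9999, 12, 31)], type=pyarrow.timestamp("us")
)
table = pyarrow.table({"some_timestamp": arr})

try:
    # The cast that a default to_pandas() conversion would effectively perform.
    arr.cast(pyarrow.timestamp("ns"))
except pyarrow.lib.ArrowInvalid as exc:
    print("cast failed:", exc)

# With timestamp_as_object=True the column comes back with dtype "object",
# holding datetime.datetime values rather than datetime64[ns].
df = table.to_pandas(timestamp_as_object=True)
print(df["some_timestamp"].dtype)  # object
print(df["some_timestamp"][0])     # 9999-12-31 00:00:00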
4 changes: 3 additions & 1 deletion setup.py
@@ -48,7 +48,9 @@
     "pandas": ["pandas>=0.17.1"],
     # Exclude PyArrow dependency from Windows Python 2.7.
     'pyarrow: platform_system != "Windows" or python_version >= "3.5"': [
-        "pyarrow>=0.17.0"
+        "pyarrow>=1.0.0, <2.0dev; python_version>='3.4'",
+        # Pyarrow >= 0.17.0 is not compatible with Python 2 anymore.
+        "pyarrow < 0.17.0; python_version < '3.0'",
     ],
     "tqdm": ["tqdm >= 4.0.0, <5.0.0dev"],
     "fastparquet": [
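The new pins rely on PEP 508 environment markers, which pip evaluates against the running interpreter at install time. A quick way to see which pin applies, sketched with the third-party packaging library (not part of this PR):

from packaging.requirements import Requirement

# The same requirement strings as in setup.py above.
for spec in (
    "pyarrow>=1.0.0, <2.0dev; python_version>='3.4'",
    "pyarrow < 0.17.0; python_version < '3.0'",
):
    req = Requirement(spec)
    # marker.evaluate() checks the ";" clause against this interpreter.
    print(spec, "->", req.marker.evaluate())

On any modern Python 3 this prints True for the first requirement and False for the second, so pip installs only the pyarrow>=1.0.0, <2.0dev pin.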
63 changes: 63 additions & 0 deletions tests/unit/test_table.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import datetime as dt
 import itertools
 import logging
 import time
@@ -2271,6 +2272,68 @@ def test_to_dataframe(self):
         self.assertEqual(df.name.dtype.name, "object")
         self.assertEqual(df.age.dtype.name, "int64")

+    @pytest.mark.xfail(
+        six.PY2,
+        reason=(
+            "Requires pyarrow>=1.0 to work, but the latter is not compatible "
+            "with Python 2 anymore."
+        ),
+    )
+    @unittest.skipIf(pandas is None, "Requires `pandas`")
+    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
+    def test_to_dataframe_timestamp_out_of_pyarrow_bounds(self):
+        from google.cloud.bigquery.schema import SchemaField
+
+        schema = [SchemaField("some_timestamp", "TIMESTAMP")]
+        rows = [
+            {"f": [{"v": "81953424000.0"}]},  # 4567-01-01 00:00:00 UTC
+            {"f": [{"v": "253402214400.0"}]},  # 9999-12-31 00:00:00 UTC
+        ]
+        path = "/foo"
+        api_request = mock.Mock(return_value={"rows": rows})
+        row_iterator = self._make_one(_mock_client(), api_request, path, schema)
+
+        df = row_iterator.to_dataframe(create_bqstorage_client=False)
+
+        self.assertIsInstance(df, pandas.DataFrame)
+        self.assertEqual(len(df), 2)  # verify the number of rows
+        self.assertEqual(list(df.columns), ["some_timestamp"])
+        self.assertEqual(
+            list(df["some_timestamp"]),
+            [dt.datetime(4567, 1, 1), dt.datetime(9999, 12, 31)],
+        )
+
+    @pytest.mark.xfail(
+        six.PY2,
+        reason=(
+            "Requires pyarrow>=1.0 to work, but the latter is not compatible "
+            "with Python 2 anymore."
+        ),
+    )
+    @unittest.skipIf(pandas is None, "Requires `pandas`")
+    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
+    def test_to_dataframe_datetime_out_of_pyarrow_bounds(self):
+        from google.cloud.bigquery.schema import SchemaField
+
+        schema = [SchemaField("some_datetime", "DATETIME")]
+        rows = [
+            {"f": [{"v": "4567-01-01T00:00:00"}]},
+            {"f": [{"v": "9999-12-31T00:00:00"}]},
+        ]
+        path = "/foo"
+        api_request = mock.Mock(return_value={"rows": rows})
+        row_iterator = self._make_one(_mock_client(), api_request, path, schema)
+
+        df = row_iterator.to_dataframe(create_bqstorage_client=False)
+
+        self.assertIsInstance(df, pandas.DataFrame)
+        self.assertEqual(len(df), 2)  # verify the number of rows
+        self.assertEqual(list(df.columns), ["some_datetime"])
+        self.assertEqual(
+            list(df["some_datetime"]),
+            [dt.datetime(4567, 1, 1), dt.datetime(9999, 12, 31)],
+        )
+
     @unittest.skipIf(pandas is None, "Requires `pandas`")
     def test_to_dataframe_warning_wo_pyarrow(self):
         from google.cloud.bigquery.client import PyarrowMissingWarning
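As a sanity check on the magic numbers in the TIMESTAMP test above, the commented epoch-second values do decode to the expected dates; a minimal snippet, not part of the PR:

import datetime as dt

epoch = dt.datetime(1970, 1, 1)

# Values from the mocked API rows, read as seconds since the Unix epoch.
print(epoch + dt.timedelta(seconds=81953424000))   # 4567-01-01 00:00:00
print(epoch + dt.timedelta(seconds=253402214400))  # 9999-12-31 00:00:00

Both dates lie far beyond the datetime64[ns] range (which ends in April 2262), which is exactly why the tests expect plain datetime.datetime objects in the resulting column rather than pandas timestamps.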