diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py
index 10b4198d3..d9e5f7773 100644
--- a/google/cloud/bigquery/table.py
+++ b/google/cloud/bigquery/table.py
@@ -21,6 +21,7 @@
 import functools
 import logging
 import operator
+import pytz
 import warnings

 import six
@@ -1726,7 +1727,35 @@ def to_dataframe(
             bqstorage_client=bqstorage_client,
             create_bqstorage_client=create_bqstorage_client,
         )
-        df = record_batch.to_pandas(date_as_object=date_as_object)
+
+        # When converting timestamp values to nanosecond precision, the result
+        # can be out of pyarrow bounds. To avoid the error when converting to
+        # Pandas, we set the timestamp_as_object parameter to True if necessary.
+        #
+        # NOTE: Python 3+ only, as the timestamp_as_object parameter is only
+        # supported in pyarrow>=1.0, which is not compatible with Python 2.
+        if six.PY2:
+            extra_kwargs = {}
+        else:
+            types_to_check = {
+                pyarrow.timestamp("us"),
+                pyarrow.timestamp("us", tz=pytz.UTC),
+            }
+
+            for column in record_batch:
+                if column.type in types_to_check:
+                    try:
+                        column.cast("timestamp[ns]")
+                    except pyarrow.lib.ArrowInvalid:
+                        timestamp_as_object = True
+                        break
+            else:
+                timestamp_as_object = False
+
+            extra_kwargs = {"timestamp_as_object": timestamp_as_object}
+
+        df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs)
+
         for column in dtypes:
             df[column] = pandas.Series(df[column], dtype=dtypes[column])
         return df
diff --git a/setup.py b/setup.py
index fe6143557..389517277 100644
--- a/setup.py
+++ b/setup.py
@@ -48,7 +48,9 @@
     "pandas": ["pandas>=0.17.1"],
     # Exclude PyArrow dependency from Windows Python 2.7.
     'pyarrow: platform_system != "Windows" or python_version >= "3.5"': [
-        "pyarrow>=0.17.0"
+        "pyarrow>=1.0.0, <2.0dev; python_version>='3.4'",
+        # pyarrow>=0.17.0 is no longer compatible with Python 2.
+        "pyarrow < 0.17.0; python_version < '3.0'",
     ],
     "tqdm": ["tqdm >= 4.0.0, <5.0.0dev"],
     "fastparquet": [
diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py
index 28575bd43..80223e8e1 100644
--- a/tests/unit/test_table.py
+++ b/tests/unit/test_table.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import datetime as dt
 import itertools
 import logging
 import time
@@ -2271,6 +2272,68 @@
         self.assertEqual(df.name.dtype.name, "object")
         self.assertEqual(df.age.dtype.name, "int64")

+    @pytest.mark.xfail(
+        six.PY2,
+        reason=(
+            "Requires pyarrow>=1.0 to work, but the latter is not compatible "
+            "with Python 2 anymore."
+        ),
+    )
+    @unittest.skipIf(pandas is None, "Requires `pandas`")
+    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
+    def test_to_dataframe_timestamp_out_of_pyarrow_bounds(self):
+        from google.cloud.bigquery.schema import SchemaField
+
+        schema = [SchemaField("some_timestamp", "TIMESTAMP")]
+        rows = [
+            {"f": [{"v": "81953424000.0"}]},  # 4567-01-01 00:00:00 UTC
+            {"f": [{"v": "253402214400.0"}]},  # 9999-12-31 00:00:00 UTC
+        ]
+        path = "/foo"
+        api_request = mock.Mock(return_value={"rows": rows})
+        row_iterator = self._make_one(_mock_client(), api_request, path, schema)
+
+        df = row_iterator.to_dataframe(create_bqstorage_client=False)
+
+        self.assertIsInstance(df, pandas.DataFrame)
+        self.assertEqual(len(df), 2)  # verify the number of rows
+        self.assertEqual(list(df.columns), ["some_timestamp"])
+        self.assertEqual(
+            list(df["some_timestamp"]),
+            [dt.datetime(4567, 1, 1), dt.datetime(9999, 12, 31)],
+        )
+
+    @pytest.mark.xfail(
+        six.PY2,
+        reason=(
+            "Requires pyarrow>=1.0 to work, but the latter is not compatible "
+            "with Python 2 anymore."
+        ),
+    )
+    @unittest.skipIf(pandas is None, "Requires `pandas`")
+    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
+    def test_to_dataframe_datetime_out_of_pyarrow_bounds(self):
+        from google.cloud.bigquery.schema import SchemaField
+
+        schema = [SchemaField("some_datetime", "DATETIME")]
+        rows = [
+            {"f": [{"v": "4567-01-01T00:00:00"}]},
+            {"f": [{"v": "9999-12-31T00:00:00"}]},
+        ]
+        path = "/foo"
+        api_request = mock.Mock(return_value={"rows": rows})
+        row_iterator = self._make_one(_mock_client(), api_request, path, schema)
+
+        df = row_iterator.to_dataframe(create_bqstorage_client=False)
+
+        self.assertIsInstance(df, pandas.DataFrame)
+        self.assertEqual(len(df), 2)  # verify the number of rows
+        self.assertEqual(list(df.columns), ["some_datetime"])
+        self.assertEqual(
+            list(df["some_datetime"]),
+            [dt.datetime(4567, 1, 1), dt.datetime(9999, 12, 31)],
+        )
+
     @unittest.skipIf(pandas is None, "Requires `pandas`")
     def test_to_dataframe_warning_wo_pyarrow(self):
         from google.cloud.bigquery.client import PyarrowMissingWarning
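
For reviewers, here is a minimal standalone sketch (not part of the patch) of the failure mode the `table.py` change guards against, assuming `pyarrow>=1.0` and `pandas` are installed; the column name `ts` is illustrative:

```python
import datetime

import pyarrow

# A microsecond-precision timestamp beyond the range representable as
# datetime64[ns] in pandas (pandas.Timestamp.max falls in the year 2262).
arr = pyarrow.array(
    [datetime.datetime(9999, 12, 31)], type=pyarrow.timestamp("us")
)
batch = pyarrow.RecordBatch.from_arrays([arr], ["ts"])

try:
    # The same probe the patch performs: casting to nanosecond precision
    # raises ArrowInvalid when a value is out of bounds.
    arr.cast("timestamp[ns]")
    df = batch.to_pandas()
except pyarrow.lib.ArrowInvalid:
    # Fall back to plain datetime.datetime objects instead of datetime64[ns].
    df = batch.to_pandas(timestamp_as_object=True)

print(df["ts"].dtype)  # object
print(df["ts"][0])     # 9999-12-31 00:00:00
```

Note that `timestamp_as_object` applies to the whole `to_pandas()` call, so a single out-of-range column switches every timestamp column in the resulting DataFrame to `object` dtype; when all values fit within nanosecond bounds, the probe cast succeeds and the columns keep the native `datetime64[ns]` representation.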