fix: converting to dataframe with out-of-bounds timestamps (#209)
Fixes #168.

This PR fixes an error that occurred when converting query results to Pandas with `pyarrow` if the data contained timestamps falling outside `pyarrow`'s nanosecond precision range.

The fix requires `pyarrow>=1.0.0` and thus only works on Python 3.
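
For context, the failure mode: `pyarrow` stores timestamps as 64-bit integers, so at nanosecond precision only dates up to the year 2262 are representable. A minimal sketch of the overflow (assuming `pyarrow>=1.0`; not part of this PR):

```python
import datetime

import pyarrow

# Microsecond precision can hold this value, nanosecond precision cannot:
# int64 nanoseconds since the Unix epoch run out in the year 2262.
arr = pyarrow.array(
    [datetime.datetime(9999, 12, 31)], type=pyarrow.timestamp("us")
)

try:
    arr.cast(pyarrow.timestamp("ns"))  # the default "safe" cast checks for overflow
except pyarrow.lib.ArrowInvalid as exc:
    print("cast failed:", exc)
```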

### PR checklist
- [x] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery/issues/new/choose) before writing your code!  That way we can discuss the change, evaluate designs, and agree on the general idea
- [x] Ensure the tests and linter pass
- [x] Code coverage does not decrease (if any source code was changed)
- [x] Appropriate docs were updated (if necessary)
plamut committed Aug 15, 2020
1 parent 478597a commit 8209203
Showing 3 changed files with 96 additions and 2 deletions.
31 changes: 30 additions & 1 deletion google/cloud/bigquery/table.py
@@ -21,6 +21,7 @@
 import functools
 import logging
 import operator
+import pytz
 import warnings
 
 import six
@@ -1726,7 +1727,35 @@ def to_dataframe(
             bqstorage_client=bqstorage_client,
             create_bqstorage_client=create_bqstorage_client,
         )
-        df = record_batch.to_pandas(date_as_object=date_as_object)
+
+        # When converting timestamp values to nanosecond precision, the result
+        # can be out of pyarrow bounds. To avoid the error when converting to
+        # Pandas, we set the timestamp_as_object parameter to True, if necessary.
+        #
+        # NOTE: Python 3+ only, as timestamp_as_object parameter is only supported
+        # in pyarrow>=1.0, but the latter is not compatible with Python 2.
+        if six.PY2:
+            extra_kwargs = {}
+        else:
+            types_to_check = {
+                pyarrow.timestamp("us"),
+                pyarrow.timestamp("us", tz=pytz.UTC),
+            }
+
+            for column in record_batch:
+                if column.type in types_to_check:
+                    try:
+                        column.cast("timestamp[ns]")
+                    except pyarrow.lib.ArrowInvalid:
+                        timestamp_as_object = True
+                        break
+            else:
+                timestamp_as_object = False
+
+            extra_kwargs = {"timestamp_as_object": timestamp_as_object}
+
+        df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs)
+
         for column in dtypes:
             df[column] = pandas.Series(df[column], dtype=dtypes[column])
         return df
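For reference, a quick sketch of the `timestamp_as_object` behavior the new code opts into (assuming `pyarrow>=1.0` and `pandas` installed; the batch and column names here are illustrative):

```python
import datetime

import pyarrow

batch = pyarrow.RecordBatch.from_arrays(
    [
        pyarrow.array(
            [datetime.datetime(9999, 12, 31)], type=pyarrow.timestamp("us")
        )
    ],
    names=["ts"],
)

# Without the flag, to_pandas() would attempt a nanosecond conversion and
# fail on this value. With timestamp_as_object=True, the column comes back
# as dtype "object" holding plain datetime.datetime values, so no overflow
# can occur.
df = batch.to_pandas(timestamp_as_object=True)
print(df["ts"].dtype)  # object
print(df["ts"][0])     # 9999-12-31 00:00:00
```
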
4 changes: 3 additions & 1 deletion setup.py
@@ -48,7 +48,9 @@
     "pandas": ["pandas>=0.17.1"],
     # Exclude PyArrow dependency from Windows Python 2.7.
     'pyarrow: platform_system != "Windows" or python_version >= "3.5"': [
-        "pyarrow>=0.17.0"
+        "pyarrow>=1.0.0, <2.0dev; python_version>='3.4'",
+        # Pyarrow >= 0.17.0 is not compatible with Python 2 anymore.
+        "pyarrow < 0.17.0; python_version < '3.0'",
     ],
     "tqdm": ["tqdm >= 4.0.0, <5.0.0dev"],
     "fastparquet": [
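The version split above leans on PEP 508 environment markers (the part after each `;`), which pip evaluates against the target interpreter before installing a requirement. A small illustration using the `packaging` library (not part of this PR):

```python
from packaging.markers import Marker

# pip skips a requirement whose marker evaluates to False for the current
# interpreter, so Python 3 users get pyarrow>=1.0.0 while Python 2 users
# stay on the older, still-compatible releases.
print(Marker("python_version >= '3.4'").evaluate())  # True on modern Python 3
print(Marker("python_version < '3.0'").evaluate())   # False on Python 3
```
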
63 changes: 63 additions & 0 deletions tests/unit/test_table.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import datetime as dt
 import itertools
 import logging
 import time
@@ -2271,6 +2272,68 @@ def test_to_dataframe(self):
         self.assertEqual(df.name.dtype.name, "object")
         self.assertEqual(df.age.dtype.name, "int64")
 
+    @pytest.mark.xfail(
+        six.PY2,
+        reason=(
+            "Requires pyarrow>=1.0 to work, but the latter is not compatible "
+            "with Python 2 anymore."
+        ),
+    )
+    @unittest.skipIf(pandas is None, "Requires `pandas`")
+    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
+    def test_to_dataframe_timestamp_out_of_pyarrow_bounds(self):
+        from google.cloud.bigquery.schema import SchemaField
+
+        schema = [SchemaField("some_timestamp", "TIMESTAMP")]
+        rows = [
+            {"f": [{"v": "81953424000.0"}]},  # 4567-01-01 00:00:00 UTC
+            {"f": [{"v": "253402214400.0"}]},  # 9999-12-31 00:00:00 UTC
+        ]
+        path = "/foo"
+        api_request = mock.Mock(return_value={"rows": rows})
+        row_iterator = self._make_one(_mock_client(), api_request, path, schema)
+
+        df = row_iterator.to_dataframe(create_bqstorage_client=False)
+
+        self.assertIsInstance(df, pandas.DataFrame)
+        self.assertEqual(len(df), 2)  # verify the number of rows
+        self.assertEqual(list(df.columns), ["some_timestamp"])
+        self.assertEqual(
+            list(df["some_timestamp"]),
+            [dt.datetime(4567, 1, 1), dt.datetime(9999, 12, 31)],
+        )
+
+    @pytest.mark.xfail(
+        six.PY2,
+        reason=(
+            "Requires pyarrow>=1.0 to work, but the latter is not compatible "
+            "with Python 2 anymore."
+        ),
+    )
+    @unittest.skipIf(pandas is None, "Requires `pandas`")
+    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
+    def test_to_dataframe_datetime_out_of_pyarrow_bounds(self):
+        from google.cloud.bigquery.schema import SchemaField
+
+        schema = [SchemaField("some_datetime", "DATETIME")]
+        rows = [
+            {"f": [{"v": "4567-01-01T00:00:00"}]},
+            {"f": [{"v": "9999-12-31T00:00:00"}]},
+        ]
+        path = "/foo"
+        api_request = mock.Mock(return_value={"rows": rows})
+        row_iterator = self._make_one(_mock_client(), api_request, path, schema)
+
+        df = row_iterator.to_dataframe(create_bqstorage_client=False)
+
+        self.assertIsInstance(df, pandas.DataFrame)
+        self.assertEqual(len(df), 2)  # verify the number of rows
+        self.assertEqual(list(df.columns), ["some_datetime"])
+        self.assertEqual(
+            list(df["some_datetime"]),
+            [dt.datetime(4567, 1, 1), dt.datetime(9999, 12, 31)],
+        )
+
     @unittest.skipIf(pandas is None, "Requires `pandas`")
     def test_to_dataframe_warning_wo_pyarrow(self):
         from google.cloud.bigquery.client import PyarrowMissingWarning
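As a sanity check on the fixture comments in the new timestamp test, the epoch-second strings do map to the annotated dates; a standard-library-only verification:

```python
import datetime as dt

# BigQuery serializes TIMESTAMP values as seconds since the Unix epoch.
epoch = dt.datetime(1970, 1, 1)
print(epoch + dt.timedelta(seconds=81953424000.0))   # 4567-01-01 00:00:00
print(epoch + dt.timedelta(seconds=253402214400.0))  # 9999-12-31 00:00:00
```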
