
fix: converting to dataframe with out of bounds timestamps #209

Merged (2 commits, Aug 15, 2020)
31 changes: 30 additions & 1 deletion google/cloud/bigquery/table.py
@@ -21,6 +21,7 @@
 import functools
 import logging
 import operator
+import pytz
 import warnings

 import six
@@ -1726,7 +1727,35 @@ def to_dataframe(
             bqstorage_client=bqstorage_client,
             create_bqstorage_client=create_bqstorage_client,
         )
-        df = record_batch.to_pandas(date_as_object=date_as_object)
+
+        # When converting timestamp values to nanosecond precision, the result
+        # can be out of pyarrow bounds. To avoid the error when converting to
+        # Pandas, we set the timestamp_as_object parameter to True, if necessary.
+        #
+        # NOTE: Python 3+ only, as the timestamp_as_object parameter is only
+        # supported in pyarrow>=1.0, but the latter is not compatible with Python 2.
+        if six.PY2:
+            extra_kwargs = {}
+        else:
+            types_to_check = {
+                pyarrow.timestamp("us"),
+                pyarrow.timestamp("us", tz=pytz.UTC),
+            }
+
+            for column in record_batch:
+                if column.type in types_to_check:
+                    try:
+                        column.cast("timestamp[ns]")
+                    except pyarrow.lib.ArrowInvalid:
+                        timestamp_as_object = True
+                        break
+            else:
+                timestamp_as_object = False
+
+            extra_kwargs = {"timestamp_as_object": timestamp_as_object}
+
+        df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs)

         for column in dtypes:
             df[column] = pandas.Series(df[column], dtype=dtypes[column])
         return df
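For context, here is a standalone sketch of the failure mode this hunk guards against, using only public pyarrow APIs (not code from this PR). A year-9999 value fits microsecond precision but overflows the int64 nanoseconds that pandas' datetime64[ns] dtype requires, so the safe cast raises ArrowInvalid; timestamp_as_object=True (pyarrow>=1.0) instead keeps the column as plain datetime.datetime objects:

import datetime

import pyarrow

arr = pyarrow.array(
    [datetime.datetime(9999, 12, 31)], type=pyarrow.timestamp("us")
)
table = pyarrow.table({"some_timestamp": arr})

try:
    # The cast that a default to_pandas() conversion would effectively perform.
    arr.cast(pyarrow.timestamp("ns"))
except pyarrow.lib.ArrowInvalid as exc:
    print("cast failed:", exc)

# With timestamp_as_object=True the column comes back with dtype "object",
# holding datetime.datetime values rather than datetime64[ns].
df = table.to_pandas(timestamp_as_object=True)
print(df["some_timestamp"].dtype)  # object
print(df["some_timestamp"][0])     # 9999-12-31 00:00:00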
4 changes: 3 additions & 1 deletion setup.py
@@ -48,7 +48,9 @@
     "pandas": ["pandas>=0.17.1"],
     # Exclude PyArrow dependency from Windows Python 2.7.
     'pyarrow: platform_system != "Windows" or python_version >= "3.5"': [
-        "pyarrow>=0.17.0"
+        "pyarrow>=1.0.0, <2.0dev; python_version>='3.4'",
+        # Pyarrow >= 0.17.0 is not compatible with Python 2 anymore.
+        "pyarrow < 0.17.0; python_version < '3.0'",
     ],
     "tqdm": ["tqdm >= 4.0.0, <5.0.0dev"],
     "fastparquet": [
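The new pins rely on PEP 508 environment markers, which pip evaluates against the running interpreter at install time. A quick way to see which pin applies, sketched with the third-party packaging library (not part of this PR):

from packaging.requirements import Requirement

# The same requirement strings as in setup.py above.
for spec in (
    "pyarrow>=1.0.0, <2.0dev; python_version>='3.4'",
    "pyarrow < 0.17.0; python_version < '3.0'",
):
    req = Requirement(spec)
    # marker.evaluate() checks the ";" clause against this interpreter.
    print(spec, "->", req.marker.evaluate())

On any modern Python 3 this prints True for the first requirement and False for the second, so pip installs only the pyarrow>=1.0.0, <2.0dev pin.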
63 changes: 63 additions & 0 deletions tests/unit/test_table.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import datetime as dt
 import itertools
 import logging
 import time
@@ -2271,6 +2272,68 @@ def test_to_dataframe(self):
         self.assertEqual(df.name.dtype.name, "object")
         self.assertEqual(df.age.dtype.name, "int64")

+    @pytest.mark.xfail(
+        six.PY2,
+        reason=(
+            "Requires pyarrow>=1.0 to work, but the latter is not compatible "
+            "with Python 2 anymore."
+        ),
+    )
+    @unittest.skipIf(pandas is None, "Requires `pandas`")
+    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
+    def test_to_dataframe_timestamp_out_of_pyarrow_bounds(self):
+        from google.cloud.bigquery.schema import SchemaField
+
+        schema = [SchemaField("some_timestamp", "TIMESTAMP")]
+        rows = [
+            {"f": [{"v": "81953424000.0"}]},  # 4567-01-01 00:00:00 UTC
+            {"f": [{"v": "253402214400.0"}]},  # 9999-12-31 00:00:00 UTC
+        ]
+        path = "/foo"
+        api_request = mock.Mock(return_value={"rows": rows})
+        row_iterator = self._make_one(_mock_client(), api_request, path, schema)
+
+        df = row_iterator.to_dataframe(create_bqstorage_client=False)
+
+        self.assertIsInstance(df, pandas.DataFrame)
+        self.assertEqual(len(df), 2)  # verify the number of rows
+        self.assertEqual(list(df.columns), ["some_timestamp"])
+        self.assertEqual(
+            list(df["some_timestamp"]),
+            [dt.datetime(4567, 1, 1), dt.datetime(9999, 12, 31)],
+        )
+
+    @pytest.mark.xfail(
+        six.PY2,
+        reason=(
+            "Requires pyarrow>=1.0 to work, but the latter is not compatible "
+            "with Python 2 anymore."
+        ),
+    )
+    @unittest.skipIf(pandas is None, "Requires `pandas`")
+    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
+    def test_to_dataframe_datetime_out_of_pyarrow_bounds(self):
+        from google.cloud.bigquery.schema import SchemaField
+
+        schema = [SchemaField("some_datetime", "DATETIME")]
+        rows = [
+            {"f": [{"v": "4567-01-01T00:00:00"}]},
+            {"f": [{"v": "9999-12-31T00:00:00"}]},
+        ]
+        path = "/foo"
+        api_request = mock.Mock(return_value={"rows": rows})
+        row_iterator = self._make_one(_mock_client(), api_request, path, schema)
+
+        df = row_iterator.to_dataframe(create_bqstorage_client=False)
+
+        self.assertIsInstance(df, pandas.DataFrame)
+        self.assertEqual(len(df), 2)  # verify the number of rows
+        self.assertEqual(list(df.columns), ["some_datetime"])
+        self.assertEqual(
+            list(df["some_datetime"]),
+            [dt.datetime(4567, 1, 1), dt.datetime(9999, 12, 31)],
+        )
+
     @unittest.skipIf(pandas is None, "Requires `pandas`")
     def test_to_dataframe_warning_wo_pyarrow(self):
         from google.cloud.bigquery.client import PyarrowMissingWarning
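As a sanity check on the magic numbers in the TIMESTAMP test above, the commented epoch-second values do decode to the expected dates; a minimal snippet, not part of the PR:

import datetime as dt

epoch = dt.datetime(1970, 1, 1)

# Values from the mocked API rows, read as seconds since the Unix epoch.
print(epoch + dt.timedelta(seconds=81953424000))   # 4567-01-01 00:00:00
print(epoch + dt.timedelta(seconds=253402214400))  # 9999-12-31 00:00:00

Both dates lie far beyond the datetime64[ns] range (which ends in April 2262), which is exactly why the tests expect plain datetime.datetime objects in the resulting column rather than pandas timestamps.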