diff --git a/google/cloud/bigquery/job.py b/google/cloud/bigquery/job.py
index b0d2e7517..930dc413d 100644
--- a/google/cloud/bigquery/job.py
+++ b/google/cloud/bigquery/job.py
@@ -3320,6 +3320,7 @@ def to_dataframe(
         dtypes=None,
         progress_bar_type=None,
         create_bqstorage_client=True,
+        date_as_object=True,
     ):
         """Return a pandas DataFrame from a QueryJob
 
@@ -3350,9 +3351,9 @@
                 for details.
 
                 ..versionadded:: 1.11.0
-            create_bqstorage_client (bool):
-                Optional. If ``True`` (default), create a BigQuery Storage API
-                client using the default API settings. The BigQuery Storage API
+            create_bqstorage_client (Optional[bool]):
+                If ``True`` (default), create a BigQuery Storage API client
+                using the default API settings. The BigQuery Storage API
                 is a faster way to fetch rows from BigQuery. See the
                 ``bqstorage_client`` parameter for more information.
 
@@ -3360,6 +3361,12 @@
 
                 ..versionadded:: 1.24.0
 
+            date_as_object (Optional[bool]):
+                If ``True`` (default), cast dates to objects. If ``False``, convert
+                to datetime64[ns] dtype.
+
+                ..versionadded:: 1.26.0
+
         Returns:
             A :class:`~pandas.DataFrame` populated with row data and column
             headers from the query results. The column headers are derived
@@ -3373,6 +3380,7 @@
             dtypes=dtypes,
             progress_bar_type=progress_bar_type,
             create_bqstorage_client=create_bqstorage_client,
+            date_as_object=date_as_object,
         )
 
     def __iter__(self):
diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py
index 5b13cc52a..5f557d28a 100644
--- a/google/cloud/bigquery/table.py
+++ b/google/cloud/bigquery/table.py
@@ -1633,6 +1633,7 @@ def to_dataframe(
         dtypes=None,
         progress_bar_type=None,
         create_bqstorage_client=True,
+        date_as_object=True,
     ):
         """Create a pandas DataFrame by loading all pages of a query.
 
@@ -1673,9 +1674,9 @@
                         progress bar as a graphical dialog box.
 
                 ..versionadded:: 1.11.0
-            create_bqstorage_client (bool):
-                Optional. If ``True`` (default), create a BigQuery Storage API
-                client using the default API settings. The BigQuery Storage API
+            create_bqstorage_client (Optional[bool]):
+                If ``True`` (default), create a BigQuery Storage API client
+                using the default API settings. The BigQuery Storage API
                 is a faster way to fetch rows from BigQuery. See the
                 ``bqstorage_client`` parameter for more information.
 
@@ -1683,6 +1684,12 @@
 
                 ..versionadded:: 1.24.0
 
+            date_as_object (Optional[bool]):
+                If ``True`` (default), cast dates to objects. If ``False``, convert
+                to datetime64[ns] dtype.
+
+                ..versionadded:: 1.26.0
+
         Returns:
             pandas.DataFrame:
                 A :class:`~pandas.DataFrame` populated with row data and column
@@ -1722,7 +1729,7 @@
                 bqstorage_client=bqstorage_client,
                 create_bqstorage_client=create_bqstorage_client,
             )
-            df = record_batch.to_pandas()
+            df = record_batch.to_pandas(date_as_object=date_as_object)
             for column in dtypes:
                 df[column] = pandas.Series(df[column], dtype=dtypes[column])
             return df
@@ -1799,6 +1806,7 @@
         dtypes=None,
         progress_bar_type=None,
         create_bqstorage_client=True,
+        date_as_object=True,
     ):
         """Create an empty dataframe.
 
@@ -1807,6 +1815,7 @@
             dtypes (Any): Ignored. Added for compatibility with RowIterator.
             progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
             create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
+            date_as_object (bool): Ignored. Added for compatibility with RowIterator.
 
         Returns:
             pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
diff --git a/tests/unit/test_job.py b/tests/unit/test_job.py
index 9eec9fda3..733445337 100644
--- a/tests/unit/test_job.py
+++ b/tests/unit/test_job.py
@@ -5504,7 +5504,15 @@ def test_to_dataframe_column_dtypes(self):
             },
         }
         row_data = [
-            ["1.4338368E9", "420", "1.1", "1.77", "Cash", "true", "1999-12-01"],
+            [
+                "1.4338368E9",
+                "420",
+                "1.1",
+                "1.77",
+                "Cash",
+                "true",
+                "1999-12-01",
+            ],
             ["1.3878117E9", "2580", "17.7", "28.5", "Cash", "false", "1953-06-14"],
             ["1.3855653E9", "2280", "4.4", "7.1", "Credit", "true", "1981-11-04"],
         ]
@@ -5533,6 +5541,69 @@
         self.assertEqual(df.complete.dtype.name, "bool")
         self.assertEqual(df.date.dtype.name, "object")
 
+    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
+    @unittest.skipIf(pandas is None, "Requires `pandas`")
+    def test_to_dataframe_column_date_dtypes(self):
+        begun_resource = self._make_resource()
+        query_resource = {
+            "jobComplete": True,
+            "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID},
+            "totalRows": "1",
+            "schema": {"fields": [{"name": "date", "type": "DATE"}]},
+        }
+        row_data = [
+            ["1999-12-01"],
+        ]
+        rows = [{"f": [{"v": field} for field in row]} for row in row_data]
+        query_resource["rows"] = rows
+        done_resource = copy.deepcopy(begun_resource)
+        done_resource["status"] = {"state": "DONE"}
+        connection = _make_connection(
+            begun_resource, query_resource, done_resource, query_resource
+        )
+        client = _make_client(project=self.PROJECT, connection=connection)
+        job = self._make_one(self.JOB_ID, self.QUERY, client)
+        df = job.to_dataframe(date_as_object=False, create_bqstorage_client=False)
+
+        self.assertIsInstance(df, pandas.DataFrame)
+        self.assertEqual(len(df), 1)  # verify the number of rows
+        exp_columns = [field["name"] for field in query_resource["schema"]["fields"]]
+        self.assertEqual(list(df), exp_columns)  # verify the column names
+
+        self.assertEqual(df.date.dtype.name, "datetime64[ns]")
+
+    @unittest.skipIf(pandas is None, "Requires `pandas`")
+    def test_to_dataframe_column_date_dtypes_wo_pyarrow(self):
+        begun_resource = self._make_resource()
+        query_resource = {
+            "jobComplete": True,
+            "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID},
+            "totalRows": "1",
+            "schema": {"fields": [{"name": "date", "type": "DATE"}]},
+        }
+        row_data = [
+            ["1999-12-01"],
+        ]
+        rows = [{"f": [{"v": field} for field in row]} for row in row_data]
+        query_resource["rows"] = rows
+        done_resource = copy.deepcopy(begun_resource)
+        done_resource["status"] = {"state": "DONE"}
+        connection = _make_connection(
+            begun_resource, query_resource, done_resource, query_resource
+        )
+        client = _make_client(project=self.PROJECT, connection=connection)
+        job = self._make_one(self.JOB_ID, self.QUERY, client)
+
+        with mock.patch("google.cloud.bigquery.table.pyarrow", None):
+            df = job.to_dataframe(date_as_object=False, create_bqstorage_client=False)
+
+        self.assertIsInstance(df, pandas.DataFrame)
+        self.assertEqual(len(df), 1)  # verify the number of rows
+        exp_columns = [field["name"] for field in query_resource["schema"]["fields"]]
+        self.assertEqual(list(df), exp_columns)  # verify the column names
+
+        self.assertEqual(df.date.dtype.name, "object")
+
     @unittest.skipIf(pandas is None, "Requires `pandas`")
     @unittest.skipIf(tqdm is None, "Requires `tqdm`")
     @mock.patch("tqdm.tqdm")
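
Reviewer note, not part of the patch: a minimal usage sketch of the new ``date_as_object`` flag. The query literal below is hypothetical; the expected dtypes mirror the assertions in the new tests, and the datetime64[ns] conversion relies on pyarrow (without pyarrow installed, DATE columns stay ``object``, as ``test_to_dataframe_column_date_dtypes_wo_pyarrow`` verifies).

    from google.cloud import bigquery

    client = bigquery.Client()
    query_job = client.query("SELECT DATE '1999-12-01' AS date")  # hypothetical query

    # Default behavior is unchanged: DATE values come back as Python
    # datetime.date objects, i.e. an "object" dtype column.
    df = query_job.to_dataframe()
    print(df["date"].dtype)  # object

    # Opting out casts DATE columns to pandas' native datetime64[ns] dtype.
    df = query_job.to_dataframe(date_as_object=False)
    print(df["date"].dtype)  # datetime64[ns]

Design-wise, the flag is simply forwarded to pyarrow's ``RecordBatch.to_pandas(date_as_object=...)``, so the conversion semantics are pyarrow's; the pandas-only fallback path ignores the flag and keeps dates as objects.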