From 4026997d7a286b63ed2b969c0bd49de59635326d Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 13 Jul 2021 14:11:14 -0500 Subject: [PATCH] docs: pandas DataFrame samples are more standalone (#224) * docs: pandas DataFrame samples are more standalone * fix region tag * fix region tag * remove unused imports * blacken * remove session from call to rows/to_dataframe --- samples/conftest.py | 22 ++++++ samples/quickstart/quickstart_test.py | 8 --- samples/to_dataframe/noxfile.py | 2 +- samples/to_dataframe/read_query_results.py | 47 +++++++++++++ .../to_dataframe/read_query_results_test.py | 21 ++++++ samples/to_dataframe/read_table_bigquery.py | 42 +++++++++++ .../to_dataframe/read_table_bigquery_test.py | 21 ++++++ samples/to_dataframe/read_table_bqstorage.py | 69 +++++++++++++++++++ .../to_dataframe/read_table_bqstorage_test.py | 21 ++++++ samples/to_dataframe/requirements.txt | 8 +-- 10 files changed, 248 insertions(+), 13 deletions(-) create mode 100644 samples/conftest.py create mode 100644 samples/to_dataframe/read_query_results.py create mode 100644 samples/to_dataframe/read_query_results_test.py create mode 100644 samples/to_dataframe/read_table_bigquery.py create mode 100644 samples/to_dataframe/read_table_bigquery_test.py create mode 100644 samples/to_dataframe/read_table_bqstorage.py create mode 100644 samples/to_dataframe/read_table_bqstorage_test.py diff --git a/samples/conftest.py b/samples/conftest.py new file mode 100644 index 00000000..92068ef5 --- /dev/null +++ b/samples/conftest.py @@ -0,0 +1,22 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pytest + + +@pytest.fixture(scope="session") +def project_id(): + return os.environ["GOOGLE_CLOUD_PROJECT"] diff --git a/samples/quickstart/quickstart_test.py b/samples/quickstart/quickstart_test.py index 23f3c350..8e1e0dfd 100644 --- a/samples/quickstart/quickstart_test.py +++ b/samples/quickstart/quickstart_test.py @@ -13,9 +13,6 @@ # limitations under the License. import datetime -import os - -import pytest from . import quickstart @@ -27,11 +24,6 @@ def now_millis(): ) -@pytest.fixture() -def project_id(): - return os.environ["GOOGLE_CLOUD_PROJECT"] - - def test_quickstart_wo_snapshot(capsys, project_id): quickstart.main(project_id) out, _ = capsys.readouterr() diff --git a/samples/to_dataframe/noxfile.py b/samples/to_dataframe/noxfile.py index 160fe728..b3c8658a 100644 --- a/samples/to_dataframe/noxfile.py +++ b/samples/to_dataframe/noxfile.py @@ -226,7 +226,7 @@ def py(session: nox.sessions.Session) -> None: def _get_repo_root() -> Optional[str]: - """ Returns the root folder of the project. """ + """Returns the root folder of the project.""" # Get root of this repository. Assume we don't have directories nested deeper than 10 items. p = Path(os.getcwd()) for i in range(10): diff --git a/samples/to_dataframe/read_query_results.py b/samples/to_dataframe/read_query_results.py new file mode 100644 index 00000000..45bae1ea --- /dev/null +++ b/samples/to_dataframe/read_query_results.py @@ -0,0 +1,47 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def read_query_results(): + # [START bigquerystorage_pandas_tutorial_read_query_results] + from google.cloud import bigquery + + bqclient = bigquery.Client() + + # Download query results. + query_string = """ + SELECT + CONCAT( + 'https://stackoverflow.com/questions/', + CAST(id as STRING)) as url, + view_count + FROM `bigquery-public-data.stackoverflow.posts_questions` + WHERE tags like '%google-bigquery%' + ORDER BY view_count DESC + """ + + dataframe = ( + bqclient.query(query_string) + .result() + .to_dataframe( + # Optionally, explicitly request to use the BigQuery Storage API. As of + # google-cloud-bigquery version 1.26.0 and above, the BigQuery Storage + # API is used by default. + create_bqstorage_client=True, + ) + ) + print(dataframe.head()) + # [END bigquerystorage_pandas_tutorial_read_query_results] + + return dataframe diff --git a/samples/to_dataframe/read_query_results_test.py b/samples/to_dataframe/read_query_results_test.py new file mode 100644 index 00000000..55b55a08 --- /dev/null +++ b/samples/to_dataframe/read_query_results_test.py @@ -0,0 +1,21 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import read_query_results + + +def test_read_query_results(capsys): + read_query_results.read_query_results() + out, _ = capsys.readouterr() + assert "stackoverflow" in out diff --git a/samples/to_dataframe/read_table_bigquery.py b/samples/to_dataframe/read_table_bigquery.py new file mode 100644 index 00000000..82d8879b --- /dev/null +++ b/samples/to_dataframe/read_table_bigquery.py @@ -0,0 +1,42 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def read_table(): + # [START bigquerystorage_pandas_tutorial_read_table] + from google.cloud import bigquery + + bqclient = bigquery.Client() + + # Download a table. + table = bigquery.TableReference.from_string( + "bigquery-public-data.utility_us.country_code_iso" + ) + rows = bqclient.list_rows( + table, + selected_fields=[ + bigquery.SchemaField("country_name", "STRING"), + bigquery.SchemaField("fips_code", "STRING"), + ], + ) + dataframe = rows.to_dataframe( + # Optionally, explicitly request to use the BigQuery Storage API. As of + # google-cloud-bigquery version 1.26.0 and above, the BigQuery Storage + # API is used by default. + create_bqstorage_client=True, + ) + print(dataframe.head()) + # [END bigquerystorage_pandas_tutorial_read_table] + + return dataframe diff --git a/samples/to_dataframe/read_table_bigquery_test.py b/samples/to_dataframe/read_table_bigquery_test.py new file mode 100644 index 00000000..c8301857 --- /dev/null +++ b/samples/to_dataframe/read_table_bigquery_test.py @@ -0,0 +1,21 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import read_table_bigquery + + +def test_read_table(capsys): + read_table_bigquery.read_table() + out, _ = capsys.readouterr() + assert "country_name" in out diff --git a/samples/to_dataframe/read_table_bqstorage.py b/samples/to_dataframe/read_table_bqstorage.py new file mode 100644 index 00000000..0a3ae777 --- /dev/null +++ b/samples/to_dataframe/read_table_bqstorage.py @@ -0,0 +1,69 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def read_table(your_project_id): + original_your_project_id = your_project_id + # [START bigquerystorage_pandas_tutorial_read_session] + your_project_id = "project-for-read-session" + # [END bigquerystorage_pandas_tutorial_read_session] + your_project_id = original_your_project_id + + # [START bigquerystorage_pandas_tutorial_read_session] + from google.cloud import bigquery_storage + from google.cloud.bigquery_storage import types + import pandas + + bqstorageclient = bigquery_storage.BigQueryReadClient() + + project_id = "bigquery-public-data" + dataset_id = "new_york_trees" + table_id = "tree_species" + table = f"projects/{project_id}/datasets/{dataset_id}/tables/{table_id}" + + # Select columns to read with read options. If no read options are + # specified, the whole table is read. + read_options = types.ReadSession.TableReadOptions( + selected_fields=["species_common_name", "fall_color"] + ) + + parent = "projects/{}".format(your_project_id) + + requested_session = types.ReadSession( + table=table, + # Avro is also supported, but the Arrow data format is optimized to + # work well with column-oriented data structures such as pandas + # DataFrames. + data_format=types.DataFormat.ARROW, + read_options=read_options, + ) + read_session = bqstorageclient.create_read_session( + parent=parent, read_session=requested_session, max_stream_count=1, + ) + + # This example reads from only a single stream. Read from multiple streams + # to fetch data faster. Note that the session may not contain any streams + # if there are no rows to read. + stream = read_session.streams[0] + reader = bqstorageclient.read_rows(stream.name) + + # Parse all Arrow blocks and create a dataframe. + frames = [] + for message in reader.rows().pages: + frames.append(message.to_dataframe()) + dataframe = pandas.concat(frames) + print(dataframe.head()) + # [END bigquerystorage_pandas_tutorial_read_session] + + return dataframe diff --git a/samples/to_dataframe/read_table_bqstorage_test.py b/samples/to_dataframe/read_table_bqstorage_test.py new file mode 100644 index 00000000..cc093078 --- /dev/null +++ b/samples/to_dataframe/read_table_bqstorage_test.py @@ -0,0 +1,21 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import read_table_bqstorage + + +def test_read_table(capsys, project_id): + read_table_bqstorage.read_table(your_project_id=project_id) + out, _ = capsys.readouterr() + assert "species_common_name" in out diff --git a/samples/to_dataframe/requirements.txt b/samples/to_dataframe/requirements.txt index 894fd023..ff4e18a8 100644 --- a/samples/to_dataframe/requirements.txt +++ b/samples/to_dataframe/requirements.txt @@ -2,7 +2,7 @@ google-auth==1.32.1 google-cloud-bigquery-storage==2.6.0 google-cloud-bigquery==2.20.0 pyarrow==4.0.1 -ipython==7.10.2; python_version > '3.0' -ipython==5.9.0; python_version < '3.0' -pandas==0.25.3; python_version > '3.0' -pandas==0.24.2; python_version < '3.0' +ipython==7.24.0; python_version > '3.6' +ipython==7.16.1; python_version <= '3.6' +pandas==1.2.5; python_version > '3.6' +pandas==1.1.5; python_version <= '3.6'