Skip to content

Commit

Permalink
docs: pandas DataFrame samples are more standalone (#224)
Browse files Browse the repository at this point in the history
* docs: pandas DataFrame samples are more standalone

* fix region tag

* fix region tag

* remove unused imports

* blacken

* remove session from call to rows/to_dataframe
  • Loading branch information
tswast committed Jul 13, 2021
1 parent 7b086ba commit 4026997
Show file tree
Hide file tree
Showing 10 changed files with 248 additions and 13 deletions.
22 changes: 22 additions & 0 deletions samples/conftest.py
@@ -0,0 +1,22 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import pytest


@pytest.fixture(scope="session")
def project_id():
    """Return the Google Cloud project ID from the GOOGLE_CLOUD_PROJECT env var.

    Session-scoped so the environment is read once for the whole test run.
    Raises KeyError if the variable is not set, failing tests early with a
    clear signal that the sample environment is unconfigured.
    """
    return os.environ["GOOGLE_CLOUD_PROJECT"]
8 changes: 0 additions & 8 deletions samples/quickstart/quickstart_test.py
Expand Up @@ -13,9 +13,6 @@
# limitations under the License.

import datetime
import os

import pytest

from . import quickstart

Expand All @@ -27,11 +24,6 @@ def now_millis():
)


@pytest.fixture()
def project_id():
return os.environ["GOOGLE_CLOUD_PROJECT"]


def test_quickstart_wo_snapshot(capsys, project_id):
quickstart.main(project_id)
out, _ = capsys.readouterr()
Expand Down
2 changes: 1 addition & 1 deletion samples/to_dataframe/noxfile.py
Expand Up @@ -226,7 +226,7 @@ def py(session: nox.sessions.Session) -> None:


def _get_repo_root() -> Optional[str]:
""" Returns the root folder of the project. """
"""Returns the root folder of the project."""
# Get root of this repository. Assume we don't have directories nested deeper than 10 items.
p = Path(os.getcwd())
for i in range(10):
Expand Down
47 changes: 47 additions & 0 deletions samples/to_dataframe/read_query_results.py
@@ -0,0 +1,47 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def read_query_results():
    """Run a query over a public Stack Overflow dataset and download the
    results into a pandas DataFrame.

    Returns:
        pandas.DataFrame: most-viewed questions tagged google-bigquery.
    """
    # [START bigquerystorage_pandas_tutorial_read_query_results]
    from google.cloud import bigquery

    bqclient = bigquery.Client()

    # Download query results.
    query_string = """
    SELECT
    CONCAT(
        'https://stackoverflow.com/questions/',
        CAST(id as STRING)) as url,
    view_count
    FROM `bigquery-public-data.stackoverflow.posts_questions`
    WHERE tags like '%google-bigquery%'
    ORDER BY view_count DESC
    """

    query_job = bqclient.query(query_string)
    results = query_job.result()
    # Optionally, explicitly request to use the BigQuery Storage API. As of
    # google-cloud-bigquery version 1.26.0 and above, the BigQuery Storage
    # API is used by default.
    dataframe = results.to_dataframe(create_bqstorage_client=True)
    print(dataframe.head())
    # [END bigquerystorage_pandas_tutorial_read_query_results]

    return dataframe
21 changes: 21 additions & 0 deletions samples/to_dataframe/read_query_results_test.py
@@ -0,0 +1,21 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from . import read_query_results


def test_read_query_results(capsys):
    """The sample prints query rows; the public dataset guarantees the
    'stackoverflow' URL prefix appears in the head of the DataFrame."""
    read_query_results.read_query_results()
    captured = capsys.readouterr()
    assert "stackoverflow" in captured.out
42 changes: 42 additions & 0 deletions samples/to_dataframe/read_table_bigquery.py
@@ -0,0 +1,42 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def read_table():
    """Download two columns of a public BigQuery table into a pandas DataFrame.

    Returns:
        pandas.DataFrame: country_name and fips_code columns.
    """
    # [START bigquerystorage_pandas_tutorial_read_table]
    from google.cloud import bigquery

    bqclient = bigquery.Client()

    # Download a table.
    table_ref = bigquery.TableReference.from_string(
        "bigquery-public-data.utility_us.country_code_iso"
    )
    # Restrict the download to just the columns the sample needs.
    columns = [
        bigquery.SchemaField("country_name", "STRING"),
        bigquery.SchemaField("fips_code", "STRING"),
    ]
    row_iterator = bqclient.list_rows(table_ref, selected_fields=columns)
    # Optionally, explicitly request to use the BigQuery Storage API. As of
    # google-cloud-bigquery version 1.26.0 and above, the BigQuery Storage
    # API is used by default.
    dataframe = row_iterator.to_dataframe(create_bqstorage_client=True)
    print(dataframe.head())
    # [END bigquerystorage_pandas_tutorial_read_table]

    return dataframe
21 changes: 21 additions & 0 deletions samples/to_dataframe/read_table_bigquery_test.py
@@ -0,0 +1,21 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from . import read_table_bigquery


def test_read_table(capsys):
    """The sample prints the head of the DataFrame, whose column header
    includes 'country_name'."""
    read_table_bigquery.read_table()
    captured = capsys.readouterr()
    assert "country_name" in captured.out
69 changes: 69 additions & 0 deletions samples/to_dataframe/read_table_bqstorage.py
@@ -0,0 +1,69 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def read_table(your_project_id):
    """Stream a public table into a pandas DataFrame with the BigQuery Storage API.

    Args:
        your_project_id: Project that owns (and is billed for) the read session.

    Returns:
        pandas.DataFrame: species_common_name and fall_color columns of the
        public tree_species table.
    """
    # The next four lines exist only for documentation extraction: the region
    # tag captures a realistic-looking placeholder project ID for the published
    # snippet, then the caller's real project ID is restored before use.
    original_your_project_id = your_project_id
    # [START bigquerystorage_pandas_tutorial_read_session]
    your_project_id = "project-for-read-session"
    # [END bigquerystorage_pandas_tutorial_read_session]
    your_project_id = original_your_project_id

    # [START bigquerystorage_pandas_tutorial_read_session]
    from google.cloud import bigquery_storage
    from google.cloud.bigquery_storage import types
    import pandas

    bqstorageclient = bigquery_storage.BigQueryReadClient()

    project_id = "bigquery-public-data"
    dataset_id = "new_york_trees"
    table_id = "tree_species"
    table = f"projects/{project_id}/datasets/{dataset_id}/tables/{table_id}"

    # Select columns to read with read options. If no read options are
    # specified, the whole table is read.
    read_options = types.ReadSession.TableReadOptions(
        selected_fields=["species_common_name", "fall_color"]
    )

    parent = "projects/{}".format(your_project_id)

    requested_session = types.ReadSession(
        table=table,
        # Avro is also supported, but the Arrow data format is optimized to
        # work well with column-oriented data structures such as pandas
        # DataFrames.
        data_format=types.DataFormat.ARROW,
        read_options=read_options,
    )
    read_session = bqstorageclient.create_read_session(
        parent=parent, read_session=requested_session, max_stream_count=1,
    )

    # This example reads from only a single stream. Read from multiple streams
    # to fetch data faster. Note that the session may not contain any streams
    # if there are no rows to read.
    # NOTE(review): indexing [0] raises IndexError if streams is empty —
    # acceptable for this sample because the public table is non-empty.
    stream = read_session.streams[0]
    reader = bqstorageclient.read_rows(stream.name)

    # Parse all Arrow blocks and create a dataframe.
    frames = []
    for message in reader.rows().pages:
        frames.append(message.to_dataframe())
    dataframe = pandas.concat(frames)
    print(dataframe.head())
    # [END bigquerystorage_pandas_tutorial_read_session]

    return dataframe
21 changes: 21 additions & 0 deletions samples/to_dataframe/read_table_bqstorage_test.py
@@ -0,0 +1,21 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from . import read_table_bqstorage


def test_read_table(capsys, project_id):
    """The sample prints the head of the DataFrame, whose column header
    includes 'species_common_name'."""
    read_table_bqstorage.read_table(your_project_id=project_id)
    captured = capsys.readouterr()
    assert "species_common_name" in captured.out
8 changes: 4 additions & 4 deletions samples/to_dataframe/requirements.txt
Expand Up @@ -2,7 +2,7 @@ google-auth==1.32.1
google-cloud-bigquery-storage==2.6.0
google-cloud-bigquery==2.20.0
pyarrow==4.0.1
ipython==7.10.2; python_version > '3.0'
ipython==5.9.0; python_version < '3.0'
pandas==0.25.3; python_version > '3.0'
pandas==0.24.2; python_version < '3.0'
ipython==7.24.0; python_version > '3.6'
ipython==7.16.1; python_version <= '3.6'
pandas==1.2.5; python_version > '3.6'
pandas==1.1.5; python_version <= '3.6'

0 comments on commit 4026997

Please sign in to comment.