Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

docs(samples): add more clustering code snippets #330

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
17 changes: 17 additions & 0 deletions docs/usage/tables.rst
Expand Up @@ -85,6 +85,23 @@ Load table data from a file with the
:start-after: [START bigquery_load_from_file]
:end-before: [END bigquery_load_from_file]

Creating a clustered table from a query result:

.. literalinclude:: ../samples/client_query_destination_table_clustered.py
:language: python
:dedent: 4
:start-after: [START bigquery_query_destination_table_clustered]
:end-before: [END bigquery_query_destination_table_clustered]

Creating a clustered table when you load data with the
:func:`~google.cloud.bigquery.client.Client.load_table_from_file` method:

.. literalinclude:: ../samples/load_table_clustered.py
:language: python
:dedent: 4
:start-after: [START bigquery_load_table_clustered]
:end-before: [END bigquery_load_table_clustered]

Load a CSV file from Cloud Storage with the
:func:`~google.cloud.bigquery.client.Client.load_table_from_uri` method:

Expand Down
41 changes: 41 additions & 0 deletions samples/client_query_destination_table_clustered.py
@@ -0,0 +1,41 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def client_query_destination_table_clustered(table_id):
    """Run a query and write the result to a clustered destination table.

    Args:
        table_id: Full ID ("your-project.your_dataset.your_table_name") of
            the destination table the query result is written to.
    """

    # [START bigquery_query_destination_table_clustered]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the destination table.
    # table_id = "your-project.your_dataset.your_table_name"

    sql = "SELECT * FROM `bigquery-public-data.samples.shakespeare`"
    cluster_fields = ["corpus"]

    # Clustering the destination on "corpus"; the query result is written
    # directly to table_id instead of an anonymous table.
    job_config = bigquery.QueryJobConfig(
        clustering_fields=cluster_fields, destination=table_id
    )

    # Start the query, passing in the extra configuration.
    query_job = client.query(sql, job_config=job_config)  # Make an API request.
    query_job.result()  # Wait for the job to complete.

    table = client.get_table(table_id)  # Make an API request.
    if table.clustering_fields == cluster_fields:
        print(
            "The destination table is written using the cluster_fields configuration."
        )
    # [END bigquery_query_destination_table_clustered]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# [END bigquery_query_destination_table_clustered]
# [END bigquery_query_clustered_table]

50 changes: 50 additions & 0 deletions samples/load_table_clustered.py
@@ -0,0 +1,50 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def load_table_clustered(file_path, table_id):
    """Load a local CSV file into a new clustered table.

    Args:
        file_path: Path to a local CSV file with one header row and
            (full_name, age) columns.
        table_id: Full ID ("your-project.your_dataset.your_table_name") of
            the table to create.

    Returns:
        The loaded ``google.cloud.bigquery.Table``.
    """

    # [START bigquery_load_table_clustered]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    # table_id = "your-project.your_dataset.your_table_name"

    job_config = bigquery.LoadJobConfig(
        schema=[
            bigquery.SchemaField("full_name", "STRING"),
            bigquery.SchemaField("age", "INTEGER"),
        ],
        # Cluster the new table on the "age" column.
        clustering_fields=["age"],
        # The first line of the CSV file is a header row.
        skip_leading_rows=1,
        # The source format defaults to CSV, so the line below is optional.
        source_format=bigquery.SourceFormat.CSV,
    )

    with open(file_path, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_id, job_config=job_config)

    job.result()  # Waits for the job to complete.

    table = client.get_table(table_id)  # Make an API request.
    print(
        "Loaded {} rows and {} columns to {}".format(
            table.num_rows, len(table.schema), table_id
        )
    )
    # [END bigquery_load_table_clustered]
    return table
27 changes: 27 additions & 0 deletions samples/tests/test_client_query_destination_table_clustered.py
@@ -0,0 +1,27 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .. import client_query_destination_table_clustered


def test_client_query_destination_table_clustered(capsys, random_table_id):
    """Run the clustered-destination query sample and check its output."""
    sample = client_query_destination_table_clustered
    sample.client_query_destination_table_clustered(random_table_id)

    captured, _ = capsys.readouterr()
    expected = (
        "The destination table is written using the cluster_fields configuration."
    )
    assert expected in captured
40 changes: 40 additions & 0 deletions samples/tests/test_load_table_clustered.py
@@ -0,0 +1,40 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from google.cloud import bigquery

from .. import load_table_clustered


def test_load_table_clustered(capsys, random_table_id, client):
    """Load people.csv via the sample and verify rows and clustering."""
    tests_dir = os.path.abspath(os.path.dirname(__file__))
    csv_path = os.path.join(tests_dir, "..", "..", "tests", "data", "people.csv")

    table = load_table_clustered.load_table_clustered(csv_path, random_table_id)

    captured, _ = capsys.readouterr()
    assert "Loaded 2 rows and 2 columns" in captured

    fetched = list(client.list_rows(table))  # Make an API request.
    assert len(fetched) == 2
    # Row order is not preserved, so check membership for each expected row.
    field_indexes = {"full_name": 0, "age": 1}
    assert bigquery.Row(("Wylma Phlyntstone", 29), field_indexes) in fetched
    assert bigquery.Row(("Phred Phlyntstone", 32), field_indexes) in fetched
    assert table.clustering_fields == ["age"]