docs(samples): add more clustering code snippets #330

Merged
17 changes: 17 additions & 0 deletions docs/usage/tables.rst
@@ -85,6 +85,23 @@ Load table data from a file with the
:start-after: [START bigquery_load_from_file]
:end-before: [END bigquery_load_from_file]

Create a clustered table from a query result with the
:func:`~google.cloud.bigquery.client.Client.query` method:

.. literalinclude:: ../samples/client_query_destination_table_clustered.py
:language: python
:dedent: 4
:start-after: [START bigquery_query_clustered_table]
:end-before: [END bigquery_query_clustered_table]

Create a clustered table when you load data with the
:func:`~google.cloud.bigquery.client.Client.load_table_from_uri` method:

.. literalinclude:: ../samples/load_table_clustered.py
:language: python
:dedent: 4
:start-after: [START bigquery_load_table_clustered]
:end-before: [END bigquery_load_table_clustered]

Load a CSV file from Cloud Storage with the
:func:`~google.cloud.bigquery.client.Client.load_table_from_uri` method:

4 changes: 3 additions & 1 deletion google/cloud/bigquery/__init__.py
@@ -37,6 +37,7 @@
from google.cloud.bigquery.dataset import Dataset
from google.cloud.bigquery.dataset import DatasetReference
from google.cloud.bigquery import enums
from google.cloud.bigquery.enums import SqlTypeNames
from google.cloud.bigquery.enums import StandardSqlDataTypes
from google.cloud.bigquery.external_config import ExternalConfig
from google.cloud.bigquery.external_config import BigtableOptions
@@ -137,8 +138,9 @@
"Encoding",
"QueryPriority",
"SchemaUpdateOption",
"StandardSqlDataTypes",
"SourceFormat",
"SqlTypeNames",
"StandardSqlDataTypes",
"WriteDisposition",
# EncryptionConfiguration
"EncryptionConfiguration",
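
A quick illustration of why this export matters: with SqlTypeNames re-exported at the package root, schema definitions (as in the new samples below) can use the enum instead of raw type strings. A minimal sketch; the field names here are arbitrary placeholders:

from google.cloud import bigquery

# The enum values behave like the corresponding type strings,
# so SchemaField accepts them directly.
schema = [
    bigquery.SchemaField("origin", bigquery.SqlTypeNames.STRING),
    bigquery.SchemaField("amount", bigquery.SqlTypeNames.NUMERIC),
]
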
43 changes: 43 additions & 0 deletions samples/client_query_destination_table_clustered.py
@@ -0,0 +1,43 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def client_query_destination_table_clustered(table_id):

# [START bigquery_query_clustered_table]
from google.cloud import bigquery

# Construct a BigQuery client object.
client = bigquery.Client()

# TODO(developer): Set table_id to the ID of the destination table.
# table_id = "your-project.your_dataset.your_table_name"

sql = "SELECT * FROM `bigquery-public-data.samples.shakespeare`"
cluster_fields = ["corpus"]

job_config = bigquery.QueryJobConfig(
clustering_fields=cluster_fields, destination=table_id
)

# Start the query, passing in the extra configuration.
query_job = client.query(sql, job_config=job_config) # Make an API request.
query_job.result() # Wait for the job to complete.

table = client.get_table(table_id) # Make an API request.
if table.clustering_fields == cluster_fields:
print(
"The destination table is written using the cluster_fields configuration."
)
# [END bigquery_query_clustered_table]
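
For reference, the snippet can be exercised end to end by passing a fully qualified destination table ID, just as the accompanying test below does (the ID here is a placeholder):

client_query_destination_table_clustered(
    "your-project.your_dataset.clustered_shakespeare"
)
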
55 changes: 55 additions & 0 deletions samples/load_table_clustered.py
@@ -0,0 +1,55 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def load_table_clustered(table_id):

# [START bigquery_load_table_clustered]
from google.cloud import bigquery

# Construct a BigQuery client object.
client = bigquery.Client()

# TODO(developer): Set table_id to the ID of the table to create.
# table_id = "your-project.your_dataset.your_table_name"

job_config = bigquery.LoadJobConfig(
skip_leading_rows=1,
source_format=bigquery.SourceFormat.CSV,
schema=[
bigquery.SchemaField("timestamp", bigquery.SqlTypeNames.TIMESTAMP),
bigquery.SchemaField("origin", bigquery.SqlTypeNames.STRING),
bigquery.SchemaField("destination", bigquery.SqlTypeNames.STRING),
bigquery.SchemaField("amount", bigquery.SqlTypeNames.NUMERIC),
],
time_partitioning=bigquery.TimePartitioning(field="timestamp"),
clustering_fields=["origin", "destination"],
)

job = client.load_table_from_uri(
["gs://cloud-samples-data/bigquery/sample-transactions/transactions.csv"],
table_id,
job_config=job_config,
)

job.result() # Wait for the job to complete.

table = client.get_table(table_id) # Make an API request.
print(
"Loaded {} rows and {} columns to {}".format(
table.num_rows, len(table.schema), table_id
)
)
# [END bigquery_load_table_clustered]
return table
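
Loading is not the only way to end up with a clustered table. For comparison, the same clustering_fields property can also be set when creating an empty table directly. This sketch is not part of the PR, and the table ID is a placeholder:

from google.cloud import bigquery

client = bigquery.Client()

table = bigquery.Table(
    "your-project.your_dataset.your_table_name",
    schema=[
        bigquery.SchemaField("origin", bigquery.SqlTypeNames.STRING),
        bigquery.SchemaField("destination", bigquery.SqlTypeNames.STRING),
    ],
)
table.clustering_fields = ["origin", "destination"]
table = client.create_table(table)  # Make an API request.
print("Created clustered table {}".format(table.full_table_id))
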
27 changes: 27 additions & 0 deletions samples/tests/test_client_query_destination_table_clustered.py
@@ -0,0 +1,27 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .. import client_query_destination_table_clustered


def test_client_query_destination_table_clustered(capsys, random_table_id):

client_query_destination_table_clustered.client_query_destination_table_clustered(
random_table_id
)
out, err = capsys.readouterr()
assert (
"The destination table is written using the cluster_fields configuration."
in out
)
27 changes: 27 additions & 0 deletions samples/tests/test_load_table_clustered.py
@@ -0,0 +1,27 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .. import load_table_clustered


def test_load_table_clustered(capsys, random_table_id, client):

table = load_table_clustered.load_table_clustered(random_table_id)

out, _ = capsys.readouterr()
assert "rows and 4 columns" in out

rows = list(client.list_rows(table)) # Make an API request.
assert len(rows) > 0
assert table.clustering_fields == ["origin", "destination"]
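
Both tests lean on shared pytest fixtures (client and random_table_id) defined in the samples' conftest, which is not part of this diff. A rough sketch of what such fixtures might look like, purely as an assumption for readers of this page (the dataset name is hypothetical, and the real conftest likely also handles cleanup):

import uuid

import pytest
from google.cloud import bigquery


@pytest.fixture
def client():
    return bigquery.Client()


@pytest.fixture
def random_table_id(client):
    # Unique per run so concurrent test sessions do not collide.
    return "{}.samples_tests.table_{}".format(client.project, uuid.uuid4().hex)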