docs(samples): add more clustering code snippets

googleapis · Oct 19, 2020 · 2ca69be · 2ca69be
1 parent 5178b55
commit 2ca69be
Show file tree

Hide file tree

Showing 5 changed files with 175 additions and 0 deletions.
diff --git a/docs/usage/tables.rst b/docs/usage/tables.rst
@@ -85,6 +85,23 @@ Load table data from a file with the
    :start-after: [START bigquery_load_from_file]
    :end-before: [END bigquery_load_from_file]
 
+Creating a clustered table from a query result:
+
+.. literalinclude:: ../samples/client_query_destination_table_clustered.py
+   :language: python
+   :dedent: 4
+   :start-after: [START bigquery_query_destination_table_clustered]
+   :end-before: [END bigquery_query_destination_table_clustered]
+
+Creating a clustered table when you load data with the
+:func:`~google.cloud.bigquery.client.Client.load_table_from_file` method:
+
+.. literalinclude:: ../samples/load_table_clustered.py
+   :language: python
+   :dedent: 4
+   :start-after: [START bigquery_load_table_clustered]
+   :end-before: [END bigquery_load_table_clustered]
+
 Load a CSV file from Cloud Storage with the
 :func:`~google.cloud.bigquery.client.Client.load_table_from_uri` method:
 

diff --git a/samples/client_query_destination_table_clustered.py b/samples/client_query_destination_table_clustered.py
@@ -0,0 +1,41 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def client_query_destination_table_clustered(table_id):
+    """Query the public Shakespeare sample into a clustered table at ``table_id``."""
+    # [START bigquery_query_destination_table_clustered]
+    from google.cloud import bigquery
+
+    # Construct a BigQuery client object.
+    client = bigquery.Client()
+
+    # TODO(developer): Set table_id to the ID of the destination table.
+    # table_id = "your-project.your_dataset.your_table_name"
+
+    cluster_fields = ["corpus"]
+    sql = "SELECT * FROM `bigquery-public-data.samples.shakespeare`"
+    job_config = bigquery.QueryJobConfig(
+        destination=table_id,
+        clustering_fields=cluster_fields,
+    )
+
+    # Start the query, passing in the extra configuration.
+    job = client.query(sql, job_config=job_config)  # Make an API request.
+    job.result()  # Wait for the job to complete.
+
+    destination_table = client.get_table(table_id)  # Make an API request.
+    if destination_table.clustering_fields == cluster_fields:
+        print("The destination table is written using the cluster_fields configuration.")
+    # [END bigquery_query_destination_table_clustered]
diff --git a/samples/load_table_clustered.py b/samples/load_table_clustered.py
@@ -0,0 +1,50 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def load_table_clustered(file_path, table_id):
+    """Load a local CSV file into a table clustered on ``age`` and return it."""
+    # [START bigquery_load_table_clustered]
+    from google.cloud import bigquery
+
+    # Construct a BigQuery client object.
+    client = bigquery.Client()
+
+    # TODO(developer): Set table_id to the ID of the table to create.
+    # table_id = "your-project.your_dataset.your_table_name"
+
+    schema = [
+        bigquery.SchemaField("full_name", "STRING"),
+        bigquery.SchemaField("age", "INTEGER"),
+    ]
+    job_config = bigquery.LoadJobConfig(
+        schema=schema,
+        clustering_fields=["age"],
+        skip_leading_rows=1,
+        # The source format defaults to CSV, so the line below is optional.
+        source_format=bigquery.SourceFormat.CSV,
+    )
+
+    with open(file_path, "rb") as source_file:
+        load_job = client.load_table_from_file(
+            source_file, table_id, job_config=job_config
+        )
+
+    load_job.result()  # Waits for the job to complete.
+
+    table = client.get_table(table_id)  # Make an API request.
+    summary = "Loaded {} rows and {} columns to {}"
+    print(summary.format(table.num_rows, len(table.schema), table_id))
+    # [END bigquery_load_table_clustered]
+    return table
diff --git a/samples/tests/test_client_query_destination_table_clustered.py b/samples/tests/test_client_query_destination_table_clustered.py
@@ -0,0 +1,27 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .. import client_query_destination_table_clustered
+
+
+def test_client_query_destination_table_clustered(capsys, random_table_id):
+    """The sample prints its confirmation message for a fresh destination table."""
+    sample = client_query_destination_table_clustered
+    sample.client_query_destination_table_clustered(random_table_id)
+
+    expected = (
+        "The destination table is written using the cluster_fields configuration."
+    )
+    captured = capsys.readouterr()
+    assert expected in captured.out
diff --git a/samples/tests/test_load_table_clustered.py b/samples/tests/test_load_table_clustered.py
@@ -0,0 +1,40 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from google.cloud import bigquery
+
+from .. import load_table_clustered
+
+
+def test_load_table_clustered(capsys, random_table_id, client):
+    """Load people.csv through the sample, then check its rows and clustering."""
+    tests_dir = os.path.abspath(os.path.dirname(__file__))
+    csv_path = os.path.join(tests_dir, "..", "..", "tests", "data", "people.csv")
+    # Run the sample; it returns the table it loaded the file into.
+    table = load_table_clustered.load_table_clustered(csv_path, random_table_id)
+
+    captured = capsys.readouterr()
+    assert "Loaded 2 rows and 2 columns" in captured.out
+
+    rows = list(client.list_rows(table))  # Make an API request.
+    assert len(rows) == 2
+
+    # Order is not preserved, so check each expected row for membership.
+    field_to_index = {"full_name": 0, "age": 1}
+    for values in (("Wylma Phlyntstone", 29), ("Phred Phlyntstone", 32)):
+        assert bigquery.Row(values, field_to_index) in rows
+
+    assert table.clustering_fields == ["age"]