From db5146980bd1a358413c56f6e090c07277bfac26 Mon Sep 17 00:00:00 2001 From: Veronica Wasson <3992422+VeronicaWasson@users.noreply.github.com> Date: Tue, 5 Oct 2021 14:06:12 -0700 Subject: [PATCH] docs(samples): Add minimal sample to show Write API in pending mode (#322) This sample is a stripped down version of the bigquerystorage_append_rows_raw_proto2 sample, for embedding in the Write API documentation. The docs would then link to the longer sample which shows how to format all of the datatypes including STRUCT types. btw I registered a new region tag for this snippet --- samples/snippets/append_rows_pending.py | 134 +++++++++++++++++++ samples/snippets/append_rows_pending_test.py | 73 ++++++++++ samples/snippets/customer_record.proto | 28 ++++ samples/snippets/customer_record_pb2.py | 99 ++++++++++++++ samples/snippets/customer_record_schema.json | 11 ++ 5 files changed, 345 insertions(+) create mode 100644 samples/snippets/append_rows_pending.py create mode 100644 samples/snippets/append_rows_pending_test.py create mode 100644 samples/snippets/customer_record.proto create mode 100644 samples/snippets/customer_record_pb2.py create mode 100644 samples/snippets/customer_record_schema.json diff --git a/samples/snippets/append_rows_pending.py b/samples/snippets/append_rows_pending.py new file mode 100644 index 00000000..af780ffa --- /dev/null +++ b/samples/snippets/append_rows_pending.py @@ -0,0 +1,134 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# [START bigquerystorage_append_rows_pending] +""" +This code sample demonstrates how to write records in pending mode +using the low-level generated client for Python. +""" + +from google.cloud import bigquery_storage_v1 +from google.cloud.bigquery_storage_v1 import types +from google.cloud.bigquery_storage_v1 import writer +from google.protobuf import descriptor_pb2 + +# If you update the customer_record.proto protocol buffer definition, run: +# +# protoc --python_out=. customer_record.proto +# +# from the samples/snippets directory to generate the customer_record_pb2.py module. +from . import customer_record_pb2 + + +def create_row_data(row_num: int, name: str): + row = customer_record_pb2.CustomerRecord() + row.row_num = row_num + row.customer_name = name + return row.SerializeToString() + + +def append_rows_pending(project_id: str, dataset_id: str, table_id: str): + + """Create a write stream, write some sample data, and commit the stream.""" + write_client = bigquery_storage_v1.BigQueryWriteClient() + parent = write_client.table_path(project_id, dataset_id, table_id) + write_stream = types.WriteStream() + + # When creating the stream, choose the type. Use the PENDING type to wait + # until the stream is committed before it is visible. See: + # https://cloud.google.com/bigquery/docs/reference/storage/rpc/google.cloud.bigquery.storage.v1#google.cloud.bigquery.storage.v1.WriteStream.Type + write_stream.type_ = types.WriteStream.Type.PENDING + write_stream = write_client.create_write_stream( + parent=parent, write_stream=write_stream + ) + stream_name = write_stream.name + + # Create a template with fields needed for the first request. + request_template = types.AppendRowsRequest() + + # The initial request must contain the stream name. + request_template.write_stream = stream_name + + # So that BigQuery knows how to parse the serialized_rows, generate a + # protocol buffer representation of your message descriptor. + proto_schema = types.ProtoSchema() + proto_descriptor = descriptor_pb2.DescriptorProto() + customer_record_pb2.CustomerRecord.DESCRIPTOR.CopyToProto(proto_descriptor) + proto_schema.proto_descriptor = proto_descriptor + proto_data = types.AppendRowsRequest.ProtoData() + proto_data.writer_schema = proto_schema + request_template.proto_rows = proto_data + + # Some stream types support an unbounded number of requests. Construct an + # AppendRowsStream to send an arbitrary number of requests to a stream. + append_rows_stream = writer.AppendRowsStream(write_client, request_template) + + # Create a batch of row data by appending proto2 serialized bytes to the + # serialized_rows repeated field. + proto_rows = types.ProtoRows() + proto_rows.serialized_rows.append(create_row_data(1, "Alice")) + proto_rows.serialized_rows.append(create_row_data(2, "Bob")) + + # Set an offset to allow resuming this stream if the connection breaks. + # Keep track of which requests the server has acknowledged and resume the + # stream at the first non-acknowledged message. If the server has already + # processed a message with that offset, it will return an ALREADY_EXISTS + # error, which can be safely ignored. + # + # The first request must always have an offset of 0. + request = types.AppendRowsRequest() + request.offset = 0 + proto_data = types.AppendRowsRequest.ProtoData() + proto_data.rows = proto_rows + request.proto_rows = proto_data + + response_future_1 = append_rows_stream.send(request) + + # Send another batch. + proto_rows = types.ProtoRows() + proto_rows.serialized_rows.append(create_row_data(3, "Charles")) + + # Since this is the second request, you only need to include the row data. + # The name of the stream and protocol buffers DESCRIPTOR is only needed in + # the first request. + request = types.AppendRowsRequest() + proto_data = types.AppendRowsRequest.ProtoData() + proto_data.rows = proto_rows + request.proto_rows = proto_data + + # Offset must equal the number of rows that were previously sent. + request.offset = 2 + + response_future_2 = append_rows_stream.send(request) + + print(response_future_1.result()) + print(response_future_2.result()) + + # Shutdown background threads and close the streaming connection. + append_rows_stream.close() + + # A PENDING type stream must be "finalized" before being committed. No new + # records can be written to the stream after this method has been called. + write_client.finalize_write_stream(name=write_stream.name) + + # Commit the stream you created earlier. + batch_commit_write_streams_request = types.BatchCommitWriteStreamsRequest() + batch_commit_write_streams_request.parent = parent + batch_commit_write_streams_request.write_streams = [write_stream.name] + write_client.batch_commit_write_streams(batch_commit_write_streams_request) + + print(f"Writes to stream: '{write_stream.name}' have been committed.") + + +# [END bigquerystorage_append_rows_pending] diff --git a/samples/snippets/append_rows_pending_test.py b/samples/snippets/append_rows_pending_test.py new file mode 100644 index 00000000..dc0e690d --- /dev/null +++ b/samples/snippets/append_rows_pending_test.py @@ -0,0 +1,73 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pathlib +import random + +from google.cloud import bigquery +import pytest + +from . import append_rows_pending + + +DIR = pathlib.Path(__file__).parent + + +regions = ["US", "non-US"] + + +@pytest.fixture(params=regions) +def sample_data_table( + request: pytest.FixtureRequest, + bigquery_client: bigquery.Client, + project_id: str, + dataset_id: str, + dataset_id_non_us: str, +) -> str: + dataset = dataset_id + if request.param != "US": + dataset = dataset_id_non_us + schema = bigquery_client.schema_from_json(str(DIR / "customer_record_schema.json")) + table_id = f"append_rows_proto2_{random.randrange(10000)}" + full_table_id = f"{project_id}.{dataset}.{table_id}" + table = bigquery.Table(full_table_id, schema=schema) + table = bigquery_client.create_table(table, exists_ok=True) + yield full_table_id + bigquery_client.delete_table(table, not_found_ok=True) + + +def test_append_rows_pending( + capsys: pytest.CaptureFixture, + bigquery_client: bigquery.Client, + sample_data_table: str, +): + project_id, dataset_id, table_id = sample_data_table.split(".") + append_rows_pending.append_rows_pending( + project_id=project_id, dataset_id=dataset_id, table_id=table_id + ) + out, _ = capsys.readouterr() + assert "have been committed" in out + + rows = bigquery_client.query( + f"SELECT * FROM `{project_id}.{dataset_id}.{table_id}`" + ).result() + row_items = [ + # Convert to sorted tuple of items to more easily search for expected rows. + tuple(sorted(row.items())) + for row in rows + ] + + assert (("customer_name", "Alice"), ("row_num", 1)) in row_items + assert (("customer_name", "Bob"), ("row_num", 2)) in row_items + assert (("customer_name", "Charles"), ("row_num", 3)) in row_items diff --git a/samples/snippets/customer_record.proto b/samples/snippets/customer_record.proto new file mode 100644 index 00000000..06142c3b --- /dev/null +++ b/samples/snippets/customer_record.proto @@ -0,0 +1,28 @@ +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// The BigQuery Storage API expects protocol buffer data to be encoded in the +// proto2 wire format. This allows it to disambiguate missing optional fields +// from default values without the need for wrapper types. +syntax = "proto2"; + +// Define a message type representing the rows in your table. The message +// cannot contain fields which are not present in the table. +message CustomerRecord { + + optional string customer_name = 1; + + // Use the required keyword for client-side validation of required fields. + required int64 row_num = 2; +} diff --git a/samples/snippets/customer_record_pb2.py b/samples/snippets/customer_record_pb2.py new file mode 100644 index 00000000..14201ea9 --- /dev/null +++ b/samples/snippets/customer_record_pb2.py @@ -0,0 +1,99 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: customer_record.proto + +import sys + +_b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode("latin1")) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database + +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +DESCRIPTOR = _descriptor.FileDescriptor( + name="customer_record.proto", + package="", + syntax="proto2", + serialized_options=None, + serialized_pb=_b( + '\n\x15\x63ustomer_record.proto"8\n\x0e\x43ustomerRecord\x12\x15\n\rcustomer_name\x18\x01 \x01(\t\x12\x0f\n\x07row_num\x18\x02 \x02(\x03' + ), +) + + +_CUSTOMERRECORD = _descriptor.Descriptor( + name="CustomerRecord", + full_name="CustomerRecord", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="customer_name", + full_name="CustomerRecord.customer_name", + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="row_num", + full_name="CustomerRecord.row_num", + index=1, + number=2, + type=3, + cpp_type=2, + label=2, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=25, + serialized_end=81, +) + +DESCRIPTOR.message_types_by_name["CustomerRecord"] = _CUSTOMERRECORD +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +CustomerRecord = _reflection.GeneratedProtocolMessageType( + "CustomerRecord", + (_message.Message,), + dict( + DESCRIPTOR=_CUSTOMERRECORD, + __module__="customer_record_pb2" + # @@protoc_insertion_point(class_scope:CustomerRecord) + ), +) +_sym_db.RegisterMessage(CustomerRecord) + + +# @@protoc_insertion_point(module_scope) diff --git a/samples/snippets/customer_record_schema.json b/samples/snippets/customer_record_schema.json new file mode 100644 index 00000000..e04b31a7 --- /dev/null +++ b/samples/snippets/customer_record_schema.json @@ -0,0 +1,11 @@ +[ + { + "name": "customer_name", + "type": "STRING" + }, + { + "name": "row_num", + "type": "INTEGER", + "mode": "REQUIRED" + } +]