This repository has been archived by the owner on Sep 20, 2023. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' into autosynth-synthtool
- Loading branch information
Showing
13 changed files
with
666 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
Empty file.
121 changes: 121 additions & 0 deletions
121
samples/snippets/batch_process_documents_sample_v1beta3.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
# [START documentai_batch_process_document] | ||
import re | ||
|
||
from google.cloud import documentai_v1beta3 as documentai | ||
from google.cloud import storage | ||
|
||
# TODO(developer): Uncomment these variables before running the sample. | ||
# project_id= 'YOUR_PROJECT_ID' | ||
# location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu' | ||
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console | ||
# input_uri = "YOUR_INPUT_URI" | ||
# gcs_output_uri = "YOUR_OUTPUT_BUCKET_URI" | ||
# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" | ||
|
||
|
||
def batch_process_documents(
    project_id,
    location,
    processor_id,
    gcs_input_uri,
    gcs_output_uri,
    gcs_output_uri_prefix,
):
    """Batch-process a document in Cloud Storage with a Document AI processor.

    Kicks off a batch (asynchronous) processing operation, waits for it to
    finish, then reads the JSON results back out of the output bucket and
    prints the extracted form fields and paragraph text.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; 'us' or 'eu'.
        processor_id: ID of an existing Document AI processor.
        gcs_input_uri: gs:// URI of the input document (a PDF here).
        gcs_output_uri: gs:// URI of the bucket that receives the output.
        gcs_output_uri_prefix: Object-name prefix for the written results.
    """
    client = documentai.DocumentProcessorServiceClient()

    destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"

    # 'mime_type' can be 'application/pdf', 'image/tiff',
    # and 'image/gif', or 'application/json'
    input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
        gcs_source=gcs_input_uri, mime_type="application/pdf"
    )

    # Where to write results
    output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
        gcs_destination=destination_uri
    )

    # Location can be 'us' or 'eu'
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    request = documentai.types.document_processor_service.BatchProcessRequest(
        name=name,
        input_configs=[input_config],
        output_config=output_config,
    )

    operation = client.batch_process_documents(request)

    # Wait for the long-running operation to finish before reading output.
    operation.result()

    # Results are written to GCS. Use a regex to split the destination URI
    # into its bucket name and object prefix.
    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    output_bucket = match.group(1)
    prefix = match.group(2)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print("Output files:")

    for i, blob in enumerate(blob_list):
        # Download the contents of this blob as a bytes object.
        blob_as_bytes = blob.download_as_bytes()
        document = documentai.types.Document.from_json(blob_as_bytes)

        print(f"Fetched file {i + 1}")

        # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

        # Read the text recognition output from the processor
        for page in document.pages:
            for form_field in page.form_fields:
                field_name = get_text(form_field.field_name, document)
                field_value = get_text(form_field.field_value, document)
                print("Extracted key value pair:")
                print(f"\t{field_name}, {field_value}")
            # BUG FIX: the original iterated ``document.pages`` again here,
            # which printed each page's full layout text once per page.
            # Iterate the *current* page's paragraphs instead.
            for paragraph in page.paragraphs:
                paragraph_text = get_text(paragraph.layout, document)
                print(f"Paragraph text:\n{paragraph_text}")
|
||
|
||
# Extract shards from the text field
def get_text(doc_element, document):
    """Convert a layout element's text-anchor offsets into a text snippet.

    Document AI identifies form fields by their character offsets into the
    full ``document.text``. A value that spans several lines is stored as
    multiple text segments, which are concatenated here in order.

    Args:
        doc_element: Element exposing ``.text_anchor.text_segments``,
            where each segment has ``start_index`` and ``end_index``.
        document: Document exposing the full ``.text`` string.

    Returns:
        The concatenated text covered by all segments ("" if there are none).
    """
    response = ""
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    for segment in doc_element.text_anchor.text_segments:
        # The original guarded with ``if segment in ...text_segments else 0``,
        # an always-true membership test (segment comes from that very list),
        # costing O(n) per segment for nothing — dropped. An unset proto
        # ``start_index`` already defaults to 0.
        start_index = int(segment.start_index)
        end_index = int(segment.end_index)
        response += document.text[start_index:end_index]
    return response
|
||
|
||
# [END documentai_batch_process_document] |
62 changes: 62 additions & 0 deletions
62
samples/snippets/batch_process_documents_sample_v1beta3_test.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
import os | ||
from uuid import uuid4 | ||
|
||
from google.cloud import storage | ||
from google.cloud.exceptions import NotFound | ||
|
||
import pytest | ||
|
||
from samples.snippets import batch_process_documents_sample_v1beta3 | ||
|
||
# Test configuration: a shared sample processor and a public sample invoice.
location = "us"
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "90484cfdedb024f6"
gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
# str() the UUID so the prefix is a plain string, not a uuid.UUID object
# (callers format it into gs:// paths; a string is the intended type).
gcs_output_uri_prefix = str(uuid4())
# Unique per-run bucket name so parallel test runs cannot collide.
BUCKET_NAME = f"document-ai-python-{uuid4()}"
|
||
|
||
@pytest.fixture(scope="module")
def test_bucket():
    """Create a scratch GCS bucket for this module; tear it down afterwards."""
    client = storage.Client()
    created = client.create_bucket(BUCKET_NAME)
    yield created.name

    # Teardown: empty the bucket before removing it, tolerating the case
    # where it was already cleaned up externally.
    try:
        for blob in list(created.list_blobs()):
            blob.delete()
        created.delete()
    except NotFound:
        print("Bucket already deleted.")
|
||
|
||
def test_batch_process_documents(capsys, test_bucket):
    """End-to-end check: batch-process the sample invoice into a scratch bucket."""
    output_uri = f"gs://{test_bucket}"
    batch_process_documents_sample_v1beta3.batch_process_documents(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        gcs_input_uri=gcs_input_uri,
        gcs_output_uri=output_uri,
        gcs_output_uri_prefix=gcs_output_uri_prefix,
    )
    captured, _ = capsys.readouterr()

    # The sample must have printed extracted key/value pairs, paragraph
    # text, and content recognizably from the invoice.
    for expected in ("Extracted", "Paragraph", "Invoice"):
        assert expected in captured
Oops, something went wrong.