googleapis · leahecole · Oct 21, 2020 · Oct 13, 2020 · Oct 15, 2020 · Oct 15, 2020
diff --git a/samples/batch_process_documents_sample_v1beta3.py b/samples/batch_process_documents_sample_v1beta3.py
@@ -0,0 +1,107 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# [START documentai_batch_process_document]
+import re
+import json
+from google.cloud import documentai_v1beta3 as documentai
+from google.cloud import storage
+
+# TODO(developer): Uncomment these variables before running the sample.
+# project_id= 'YOUR_PROJECT_ID'
+# location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu'
+# processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console
+# gcs_input_uri = "YOUR_INPUT_URI"
+# gcs_output_uri = "YOUR_OUTPUT_BUCKET_URI"
+# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX"
+
+
+def batch_process_documents(
+        project_id,
+        location,
+        processor_id,
+        gcs_input_uri,
+        gcs_output_uri,
+        gcs_output_uri_prefix):
+    """Parse a form"""
+
+    client = documentai.DocumentProcessorServiceClient()
+
+    destination_uri = f'{gcs_output_uri}/{gcs_output_uri_prefix}/'
+
+    # mime_type can be application/pdf, image/tiff,
+    # and image/gif, or application/json
+    input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
+        gcs_source=gcs_input_uri, mime_type='application/pdf')
+
+    # where to write results
+    output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
+        gcs_destination=destination_uri
+    )
+
+    # Location can be 'us' or 'eu'
+    name = f'projects/{project_id}/locations/{location}/processors/{processor_id}'
+    request = documentai.types.document_processor_service.BatchProcessRequest(
+        name=name,
+        input_configs=[input_config],
+        output_config=output_config,)
+
+    operation = client.batch_process_documents(request)
+
+    # Wait for the operation to finish
+    operation.result()
+
+    # Results are written to GCS. Use a regex to find
+    # output files
+    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
+    output_bucket = match.group(1)
+    prefix = match.group(2)
+
+    storage_client = storage.client.Client()
+    bucket = storage_client.get_bucket(output_bucket)
+    blob_list = list(bucket.list_blobs(prefix=prefix))
+    print('Output files:')
+
+    def _get_text(el):
+        """Doc AI identifies form fields by their offsets
+        in document text. This function converts offsets
+        to text snippets.
+        """
+        response = ''
+        # If a text segment spans several lines, it will
+        # be stored in different text segments.
+        for segment in el['textAnchor']['textSegments']:
+            start_index = int(segment['startIndex']) if 'startIndex' in el['textAnchor'] else 0
+            end_index = int(segment['endIndex'])
+            response += document['text'][start_index:end_index]
+        return response
+
+    for i, blob in enumerate(blob_list):
+        json_string = blob.download_as_bytes()
+        document = json.loads(json_string)
+
+        print(f'Fetched file {i + 1}')
+
+        for page in document['pages']:
+            for form_field in page['formFields']:
+                field_name = _get_text(form_field['fieldName'])
+                field_value = _get_text(form_field['fieldValue'])
+                print('Extracted key value pair:')
+                print(f'\t{field_name}, {field_value}')
+            for paragraph in document['pages']:
+                paragraph_text = _get_text(paragraph['layout'])
+                print(f'Paragraph text:\n{paragraph_text}')
+
+# [END documentai_batch_process_document]
diff --git a/samples/process_document_sample_v1beta3.py b/samples/process_document_sample_v1beta3.py
@@ -0,0 +1,78 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from google.cloud import documentai_v1beta3 as documentai
+
+# [START documentai_process_document]
+
+# TODO(developer): Uncomment these variables before running the sample.
+# project_id = 'YOUR_PROJECT_ID'
+# location = 'YOUR_PROJECT_LOCATION'  # Format is 'us' or 'eu'
+# processor_id = 'YOUR_PROCESSOR_ID'  # Create processor in Cloud Console
+# file_path = '/path/to/local/pdf'
+
+
+def process_document_sample(project_id: str, location: str, processor_id: str, file_path: str):
+    client_options = dict(api_endpoint="us-documentai.googleapis.com")
+    client = documentai.DocumentProcessorServiceClient(client_options=client_options)
+
+    # The full resource name of the processor, e.g.:
+    # projects/project-id/locations/location/processor/processor-id
+    # You must create new processors in the Cloud Console first
+    name = f'projects/{project_id}/locations/{location}/processors/{processor_id}'
+
+    with open(file_path, 'rb') as image:
+        image_content = image.read()
+
+    # Read the file into memory
+    document = {
+        'content': image_content,
+        'mime_type': 'application/pdf'
+    }
+
+    # Configure the process request
+    request = {
+        'name': name,
+        'document': document
+    }
+
+    result = client.process_document(request=request)
+
+    def _get_text(el):
+        """Doc AI identifies form fields by their offsets
+        in document text. This function converts offsets
+        to text snippets.
+        """
+        response = ''
+        # If a text segment spans several lines, it will
+        # be stored in different text segments.
+        for segment in el.text_anchor.text_segments:
+            start_index = int(segment.start_index) if segment.start_index in el.text_anchor.text_segments else 0
+            end_index = int(segment.end_index)
+            response += document.text[start_index:end_index]
+        return response
+
+    print('Document processing complete.')
+
+    document = result.document
+
+    page_1 = document.pages[0]
+    paragraphs = page_1.paragraphs
+
+    for paragraph in paragraphs:
+        paragraph_text = _get_text(paragraph.layout)
+        print(f'Paragraph text: {paragraph_text}')
+
+# [END documentai_process_document]
diff --git a/samples/quickstart_sample_v1beta3.py b/samples/quickstart_sample_v1beta3.py
@@ -0,0 +1,79 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from google.cloud import documentai_v1beta3 as documentai
+
+# [START documentai_quickstart]
+
+# TODO(developer): Uncomment these variables before running the sample.
+# project_id = 'YOUR_PROJECT_ID'
+# location = 'YOUR_PROJECT_LOCATION'  # Format is 'us' or 'eu'
+# processor_id = 'YOUR_PROCESSOR_ID'  # Create processor in Cloud Console
+# file_path = '/path/to/local/pdf'
+
+
+def quickstart(project_id: str, location: str, processor_id: str, file_path: str):
+    client_options = {
+        'api_endpoint': 'us-documentai.googleapis.com'
+    }
+    client = documentai.DocumentProcessorServiceClient(client_options=client_options)
+
+    # The full resource name of the processor, e.g.:
+    # projects/project-id/locations/location/processor/processor-id
+    # You must create new processors in the Cloud Console first
+    name = f'projects/{project_id}/locations/{location}/processors/{processor_id}'
+
+    # Read the file into memory
+    with open(file_path, 'rb') as image:
+        image_content = image.read()
+
+    document = {
+        'content': image_content,
+        'mime_type': 'application/pdf'
+    }
+
+    # Configure the process request
+    request = {
+        'name': name,
+        'document': document
+    }
+
+    result = client.process_document(request=request)
+
+    document = result.document
+
+    def _get_text(el):
+        """Doc AI identifies form fields by their offsets
+        in document text. This function converts offsets
+        to text snippets.
+        """
+        response = ''
+        # If a text segment spans several lines, it will
+        # be stored in different text segments.
+        for segment in el.text_anchor.text_segments:
+            start_index = int(segment.start_index) if segment.start_index in el.text_anchor.text_segments else 0
+            end_index = int(segment.end_index)
+            response += document.text[start_index:end_index]
+        return response
+
+    page_1 = document.pages[0]
+    paragraphs = page_1.paragraphs
+
+    for paragraph in paragraphs:
+        print(paragraph)
+        paragraph_text = _get_text(paragraph.layout)
+        print(f'Paragraph text: {paragraph_text}')
+
+# [END documentai_quickstart]
diff --git a/samples/resources/invoice.pdf b/samples/resources/invoice.pdf
diff --git a/tests/unit/gapic/documentai_v1beta3/batch_process_documents_sample_v1beta3_test.py b/tests/unit/gapic/documentai_v1beta3/batch_process_documents_sample_v1beta3_test.py
@@ -0,0 +1,58 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from uuid import uuid4
+import pytest
+import sys
+import os
+from google.cloud import storage
+
+from samples import batch_process_documents_sample_v1beta3
+from google.cloud import documentai_v1beta3 as documentai
+
+project_id = 'python-docs-samples-tests'
+location = 'us'
+processor_id = '90484cfdedb024f6'
+gcs_input_uri = 'gs://cloud-samples-data/documentai/invoice.pdf'
+gcs_output_uri = 'gs://document-ai-python'
+gcs_output_uri_prefix = uuid4()
+
+name = "projects/1012616486416/locations/us/processors/90484cfdedb024f6"
+
+@pytest.fixture(scope="module")
+def setup():
+	storage_client = storage.Client()
+	storage_client.create_bucket(gcs_output_uri)
+
+@pytest.fixture(scope="module")
+def tear_down():
+	storage_client = storage.Client()
+	bucket = storage_client.bucket(gcs_output_uri, prefix=gcs_output_uri_prefix)
+	blobs = storage_client.list_blobs(bucket)
+	# blobs = bucket.list_blobs(prefix=gcs_output_uri_prefix)
+
+	for blob in blobs:
+		blob.delete()
+
+	bucket.delete()
+
+def test_batch_process_documents(capsys):
+	batch_process_documents_sample_v1beta3.batch_process_documents(project_id=project_id, location=location, processor_id=processor_id, gcs_input_uri=gcs_input_uri, gcs_output_uri=gcs_output_uri, gcs_output_uri_prefix=gcs_output_uri_prefix)
+	out, err = capsys.readouterr()
+	sys.stdout.write(out)
+	sys.stderr.write(err)
+
+	assert "Extracted" in out
+	assert "Paragraph" in out
diff --git a/tests/unit/gapic/documentai_v1beta3/process_document_v1beta3_test.py b/tests/unit/gapic/documentai_v1beta3/process_document_v1beta3_test.py
@@ -0,0 +1,73 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from uuid import uuid4
+import pytest
+import sys, os
+from google.cloud import storage
+import google.api_core as api_core
+
+from samples import process_document_sample_v1beta3
+
+from google.cloud.documentai_v1beta3.services.document_processor_service import (
+    DocumentProcessorServiceClient,
+)
+from google.cloud.documentai_v1beta3.services.document_processor_service import (
+    transports,
+)
+from google.cloud.documentai_v1beta3.types import document
+from google.cloud.documentai_v1beta3.types import document_processor_service
+from google.cloud.documentai_v1beta3.types import geometry
+from google.cloud.documentai_v1beta3.types import ProcessRequest
+
+
+location = "us"
+project_id = '1012616486416'
+processor_id = '90484cfdedb024f6'
+bucket_name = 'python_docs_samples_test_%s' %uuid4()
+gcs_input_uri ='gs://cloud-samples-data/documentai/invoice.pdf'
+gcs_output_uri = 'output-bucket'
+gcs_output_uri_prefix = uuid4()
+file_name = 'samples/resources/invoice.pdf'
+file_path = os.path.join(os.getcwd(), file_name)
+
+name = "projects/1012616486416/locations/us/processors/90484cfdedb024f6"
+
+client_options = dict(api_endpoint="us-documentai.googleapis.com")
+client = DocumentProcessorServiceClient(client_options=client_options)
+
+@pytest.fixture(scope="module")
+def setup():
+	storage_client = storage.Client()
+	storage_client.bucket(bucket_name)
+
+@pytest.fixture(scope="module")
+def tear_down():
+	storage_client = storage.Client()
+	bucket = storage_client.bucket(bucket_name)
+	blobs = bucket.list_blobs(prefix=gcs_output_uri_prefix)
+
+	for blob in blobs:
+		blob.delete()
+
+	bucket.delete()
+
+def test_process_documents(capsys):
+	result = process_document_sample_v1beta3.process_document_sample(project_id=project_id, location=location, processor_id=processor_id, file_path=file_path)
+	out, err = capsys.readouterr()
+	sys.stdout.write(out)
+	sys.stderr.write(err)
+
+	assert "Paragraph" in out