diff --git a/samples/__init__.py b/samples/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/samples/snippets/__init__.py b/samples/snippets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/samples/snippets/batch_process_documents_sample_v1beta3.py b/samples/snippets/batch_process_documents_sample_v1beta3.py new file mode 100644 index 00000000..2936b3b3 --- /dev/null +++ b/samples/snippets/batch_process_documents_sample_v1beta3.py @@ -0,0 +1,121 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# [START documentai_batch_process_document] +import re + +from google.cloud import documentai_v1beta3 as documentai +from google.cloud import storage + +# TODO(developer): Uncomment these variables before running the sample. 
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROJECT_LOCATION'  # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID'  # Create processor in Cloud Console
# gcs_input_uri = "YOUR_INPUT_URI"
# gcs_output_uri = "YOUR_OUTPUT_BUCKET_URI"
# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX"


def batch_process_documents(
    project_id,
    location,
    processor_id,
    gcs_input_uri,
    gcs_output_uri,
    gcs_output_uri_prefix,
):
    """Batch-process a document stored in Cloud Storage and print the results.

    Submits a ``BatchProcessRequest`` for ``gcs_input_uri``, waits for the
    long-running operation to finish, then downloads every blob found under
    the output prefix and prints its form fields and paragraph text.

    Args:
        project_id: Project that owns the processor.
        location: Processor location, 'us' or 'eu'.
        processor_id: ID of a processor created in the Cloud Console.
        gcs_input_uri: URI of the input file; sent as 'application/pdf'.
        gcs_output_uri: Base output URI, e.g. 'gs://my-bucket'.
        gcs_output_uri_prefix: Prefix appended to ``gcs_output_uri``.

    Raises:
        ValueError: If the combined output URI is not of the form
            'gs://<bucket>/<prefix>'.
    """
    destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"

    # Results are written to GCS. Use a regex to split the destination into
    # bucket and prefix. Validate it before starting the (slow) operation so
    # that a malformed URI fails early instead of after processing finished.
    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    if match is None:
        raise ValueError(
            f"Output URI must look like gs://<bucket>/<prefix>, got {destination_uri!r}"
        )
    output_bucket, prefix = match.groups()

    client = documentai.DocumentProcessorServiceClient()

    # 'mime_type' can be 'application/pdf', 'image/tiff',
    # and 'image/gif', or 'application/json'
    input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
        gcs_source=gcs_input_uri, mime_type="application/pdf"
    )

    # Where to write results
    output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
        gcs_destination=destination_uri
    )

    # Location can be 'us' or 'eu'
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    request = documentai.types.document_processor_service.BatchProcessRequest(
        name=name,
        input_configs=[input_config],
        output_config=output_config,
    )

    operation = client.batch_process_documents(request)

    # Wait for the operation to finish
    operation.result()

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print("Output files:")

    for i, blob in enumerate(blob_list):
        # Download the contents of this blob as a bytes object.
        blob_as_bytes = blob.download_as_bytes()
        document = documentai.types.Document.from_json(blob_as_bytes)

        print(f"Fetched file {i + 1}")

        # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

        # Read the text recognition output from the processor
        for page in document.pages:
            for form_field in page.form_fields:
                field_name = get_text(form_field.field_name, document)
                field_value = get_text(form_field.field_value, document)
                print("Extracted key value pair:")
                print(f"\t{field_name}, {field_value}")
            # Walk the paragraphs of the current page (not the page list again).
            for paragraph in page.paragraphs:
                paragraph_text = get_text(paragraph.layout, document)
                print(f"Paragraph text:\n{paragraph_text}")


# Extract shards from the text field
def get_text(doc_element, document) -> str:
    """
    Document AI identifies form fields by their offsets
    in document text. This function converts offsets
    to text snippets.

    Args:
        doc_element: Object exposing ``text_anchor.text_segments``; each
            segment carries ``start_index`` and ``end_index``.
        document: Object exposing the full ``text`` the offsets refer to.

    Returns:
        The concatenation of ``document.text[start:end]`` over all segments,
        or an empty string when there are no segments.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments. Each segment is sliced with its
    # own start offset; an unset protobuf integer field reads as 0, so no
    # special case is needed for a segment starting at the beginning.
    return "".join(
        document.text[int(segment.start_index) : int(segment.end_index)]
        for segment in doc_element.text_anchor.text_segments
    )


# [END documentai_batch_process_document]

# diff --git a/samples/snippets/batch_process_documents_sample_v1beta3_test.py b/samples/snippets/batch_process_documents_sample_v1beta3_test.py
# new file mode 100644
# index 00000000..dcb63567
# --- /dev/null
# +++ b/samples/snippets/batch_process_documents_sample_v1beta3_test.py
# @@ -0,0 +1,62 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
from uuid import uuid4

from google.cloud import storage
from google.cloud.exceptions import NotFound

import pytest

from samples.snippets import batch_process_documents_sample_v1beta3

location = "us"
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "90484cfdedb024f6"
gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
gcs_output_uri_prefix = uuid4()
BUCKET_NAME = f"document-ai-python-{uuid4()}"


@pytest.fixture(scope="module")
def test_bucket():
    """Create a uniquely named bucket, yield its name, then remove it."""
    created = storage.Client().create_bucket(BUCKET_NAME)
    yield created.name

    # Teardown: the bucket is emptied blob by blob before it is deleted.
    try:
        for leftover in list(created.list_blobs()):
            leftover.delete()
        created.delete()
    except NotFound:
        print("Bucket already deleted.")


def test_batch_process_documents(capsys, test_bucket):
    """Run the batch sample against the fixture bucket and check its output."""
    sample_kwargs = {
        "project_id": project_id,
        "location": location,
        "processor_id": processor_id,
        "gcs_input_uri": gcs_input_uri,
        "gcs_output_uri": f"gs://{test_bucket}",
        "gcs_output_uri_prefix": gcs_output_uri_prefix,
    }
    batch_process_documents_sample_v1beta3.batch_process_documents(**sample_kwargs)
    captured = capsys.readouterr()

    for expected in ("Extracted", "Paragraph", "Invoice"):
        assert expected in captured.out


# diff --git a/samples/snippets/noxfile.py b/samples/snippets/noxfile.py
# index eb2b6b9e..817cef92 100644
# --- a/samples/snippets/noxfile.py
# +++ b/samples/snippets/noxfile.py
# @@ -37,24 +37,22 @@ TEST_CONFIG = {
#      # You can opt out from the test for specific Python versions.
- 'ignored_versions': ["2.7"], - + "ignored_versions": ["2.7"], # An envvar key for determining the project id to use. Change it # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a # build specific Cloud project. You can also use your own string # to use your own Cloud project. - 'gcloud_project_env': 'GOOGLE_CLOUD_PROJECT', + "gcloud_project_env": "GOOGLE_CLOUD_PROJECT", # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', - # A dictionary you want to inject into your test. Don't put any # secrets here. These values will override predefined values. - 'envs': {}, + "envs": {}, } try: # Ensure we can import noxfile_config in the project's directory. - sys.path.append('.') + sys.path.append(".") from noxfile_config import TEST_CONFIG_OVERRIDE except ImportError as e: print("No user noxfile_config found: detail: {}".format(e)) @@ -69,13 +67,13 @@ def get_pytest_env_vars(): ret = {} # Override the GCLOUD_PROJECT and the alias. - env_key = TEST_CONFIG['gcloud_project_env'] + env_key = TEST_CONFIG["gcloud_project_env"] # This should error out if not set. - ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key] - ret['GCLOUD_PROJECT'] = os.environ[env_key] # deprecated + ret["GOOGLE_CLOUD_PROJECT"] = os.environ[env_key] + ret["GCLOUD_PROJECT"] = os.environ[env_key] # deprecated # Apply user supplied envs. - ret.update(TEST_CONFIG['envs']) + ret.update(TEST_CONFIG["envs"]) return ret @@ -84,7 +82,7 @@ def get_pytest_env_vars(): ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"] # Any default versions that should be ignored. -IGNORED_VERSIONS = TEST_CONFIG['ignored_versions'] +IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) @@ -138,7 +136,7 @@ def lint(session): args = FLAKE8_COMMON_ARGS + [ "--application-import-names", ",".join(local_names), - "." 
+ ".", ] session.run("flake8", *args) @@ -147,6 +145,7 @@ def lint(session): # Black # + @nox.session def blacken(session): session.install("black") @@ -194,9 +193,9 @@ def py(session): if session.python in TESTED_VERSIONS: _session_tests(session) else: - session.skip("SKIPPED: {} tests are disabled for this sample.".format( - session.python - )) + session.skip( + "SKIPPED: {} tests are disabled for this sample.".format(session.python) + ) # diff --git a/samples/snippets/process_document_sample_v1beta3.py b/samples/snippets/process_document_sample_v1beta3.py new file mode 100644 index 00000000..330e8183 --- /dev/null +++ b/samples/snippets/process_document_sample_v1beta3.py @@ -0,0 +1,88 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from google.cloud import documentai_v1beta3 as documentai + +# [START documentai_process_document] + +# TODO(developer): Uncomment these variables before running the sample. 
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROJECT_LOCATION'  # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID'  # Create processor in Cloud Console
# file_path = '/path/to/local/pdf'


def process_document_sample(
    project_id: str, location: str, processor_id: str, file_path: str
):
    """Send a local PDF to a Document AI processor and print its paragraphs.

    Args:
        project_id: Project that owns the processor.
        location: Processor location, 'us' or 'eu'.
        processor_id: ID of a processor created in the Cloud Console.
        file_path: Path of the local file; it is sent as 'application/pdf'.
    """
    # Instantiates a client
    client = documentai.DocumentProcessorServiceClient()

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request
    request = {"name": name, "document": document}

    # Recognizes text entities in the PDF document
    result = client.process_document(request=request)

    document = result.document

    print("Document processing complete.")

    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

    # Read the text recognition output from the processor
    print("The document contains the following paragraphs:")
    for page in document.pages:
        for paragraph in page.paragraphs:
            paragraph_text = get_text(paragraph.layout, document)
            print(f"Paragraph text: {paragraph_text}")


# Extract shards from the text field
def get_text(doc_element, document) -> str:
    """
    Document AI identifies form fields by their offsets
    in document text. This function converts offsets
    to text snippets.

    Args:
        doc_element: Object exposing ``text_anchor.text_segments``; each
            segment carries ``start_index`` and ``end_index``.
        document: Object exposing the full ``text`` the offsets refer to.

    Returns:
        The concatenation of ``document.text[start:end]`` over all segments,
        or an empty string when there are no segments.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments. Each segment is sliced with its
    # own start offset; an unset protobuf integer field reads as 0, so no
    # special case is needed for a segment starting at the beginning.
    return "".join(
        document.text[int(segment.start_index) : int(segment.end_index)]
        for segment in doc_element.text_anchor.text_segments
    )


# [END documentai_process_document]

# diff --git a/samples/snippets/process_document_sample_v1beta3_test.py b/samples/snippets/process_document_sample_v1beta3_test.py
# new file mode 100644
# index 00000000..58b11b22
# --- /dev/null
# +++ b/samples/snippets/process_document_sample_v1beta3_test.py
# @@ -0,0 +1,37 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os

from samples.snippets import process_document_sample_v1beta3


location = "us"
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "90484cfdedb024f6"
file_path = "resources/invoice.pdf"


def test_process_documents(capsys):
    """Run the process-document sample and check what it printed."""
    sample_kwargs = {
        "project_id": project_id,
        "location": location,
        "processor_id": processor_id,
        "file_path": file_path,
    }
    process_document_sample_v1beta3.process_document_sample(**sample_kwargs)
    captured = capsys.readouterr()

    for expected in ("Paragraph", "Invoice"):
        assert expected in captured.out


# diff --git a/samples/snippets/quickstart_sample_v1beta3.py b/samples/snippets/quickstart_sample_v1beta3.py
# new file mode 100644
# index 00000000..c5cd34ae
# --- /dev/null
# +++ b/samples/snippets/quickstart_sample_v1beta3.py
# @@ -0,0 +1,81 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from google.cloud import documentai_v1beta3 as documentai

# [START documentai_quickstart]

# TODO(developer): Uncomment these variables before running the sample.
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROJECT_LOCATION'  # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID'  # Create processor in Cloud Console
# file_path = '/path/to/local/pdf'


def quickstart(project_id: str, location: str, processor_id: str, file_path: str):
    """Send a local PDF to a Document AI processor and print its paragraphs.

    Args:
        project_id: Project that owns the processor.
        location: Processor location, 'us' or 'eu'.
        processor_id: ID of a processor created in the Cloud Console.
        file_path: Path of the local file; it is sent as 'application/pdf'.
    """
    client = documentai.DocumentProcessorServiceClient()

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request
    request = {"name": name, "document": document}

    result = client.process_document(request=request)
    document = result.document

    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

    # Read the text recognition output from the processor
    print("The document contains the following paragraphs:")
    for page in document.pages:
        for paragraph in page.paragraphs:
            # Print the raw paragraph object first, then its extracted text.
            print(paragraph)
            paragraph_text = get_text(paragraph.layout, document)
            print(f"Paragraph text: {paragraph_text}")


def get_text(doc_element, document) -> str:
    """
    Document AI identifies form fields by their offsets
    in document text. This function converts offsets
    to text snippets.

    Args:
        doc_element: Object exposing ``text_anchor.text_segments``; each
            segment carries ``start_index`` and ``end_index``.
        document: Object exposing the full ``text`` the offsets refer to.

    Returns:
        The concatenation of ``document.text[start:end]`` over all segments,
        or an empty string when there are no segments.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments. Each segment is sliced with its
    # own start offset; an unset protobuf integer field reads as 0, so no
    # special case is needed for a segment starting at the beginning.
    return "".join(
        document.text[int(segment.start_index) : int(segment.end_index)]
        for segment in doc_element.text_anchor.text_segments
    )


# [END documentai_quickstart]

# diff --git a/samples/snippets/quickstart_sample_v1beta3_test.py b/samples/snippets/quickstart_sample_v1beta3_test.py
# new file mode 100644
# index 00000000..4badc1f7
# --- /dev/null
# +++ b/samples/snippets/quickstart_sample_v1beta3_test.py
# @@ -0,0 +1,36 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os

from samples.snippets import quickstart_sample_v1beta3

location = "us"
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "90484cfdedb024f6"
file_path = "resources/invoice.pdf"


def test_quickstart(capsys):
    """Run the quickstart sample and check what it printed."""
    quickstart_sample_v1beta3.quickstart(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        file_path=file_path,
    )
    out, _ = capsys.readouterr()

    assert "Paragraph" in out
    assert "Invoice" in out


# diff --git a/samples/snippets/resources/invoice.pdf b/samples/snippets/resources/invoice.pdf
# new file mode 100644
# index 00000000..7722734a
# Binary files /dev/null and b/samples/snippets/resources/invoice.pdf differ