Skip to content
This repository has been archived by the owner on Sep 20, 2023. It is now read-only.

docs(samples): new Doc AI samples for v1beta3 #44

Merged
merged 49 commits into from Oct 21, 2020
Merged
Show file tree
Hide file tree
Changes from 39 commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
b2dc573
batch_process_sample. changing from async to synchronous
aribray Oct 13, 2020
b01d802
add quick start and process_document samples and tests
aribray Oct 15, 2020
cfb964a
add test and sample for batch_process
aribray Oct 15, 2020
8f9246d
add test and sample for batch_process
aribray Oct 15, 2020
ba7681a
resolve merge conflict
aribray Oct 15, 2020
a37f39a
python document ai samples
aribray Oct 15, 2020
99f7f11
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
87254c7
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
bcf97a6
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
26b9450
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
4943437
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
9439937
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
15dd4e4
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
0943fba
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
01058fe
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
d616c54
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
0b18336
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
4d08bf4
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
6ee7994
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 16, 2020
dc24b32
resolve formatting
aribray Oct 16, 2020
82e8ab9
use os.environ
aribray Oct 16, 2020
c373ac3
remove os.path.join
aribray Oct 16, 2020
6389213
move tests
aribray Oct 16, 2020
37cd427
descriptive variable
aribray Oct 16, 2020
aef335e
specific Exception, formatting
aribray Oct 16, 2020
a4d2b4a
parse all pages in process_document
aribray Oct 16, 2020
bbc187e
add more helpful comments
aribray Oct 16, 2020
dd6488f
remove unused imports
aribray Oct 16, 2020
2179581
better exception handling
aribray Oct 16, 2020
3cb2c0a
rename test files
aribray Oct 16, 2020
f424aee
Merge branch 'master' into python-docai
aribray Oct 19, 2020
27b63f1
Merge branch 'master' into python-docai
aribray Oct 19, 2020
043b445
ran linter, removed nested function in batch predict
aribray Oct 19, 2020
dba5ef8
refactor tests
aribray Oct 19, 2020
5416bbc
format imports
aribray Oct 19, 2020
d9e2cca
format imports
aribray Oct 19, 2020
700ab75
format imports
aribray Oct 19, 2020
66dde36
serialize as Document object
aribray Oct 20, 2020
0b839e8
extract get_text helper function
aribray Oct 20, 2020
bda06e8
fix file path
aribray Oct 20, 2020
ad5ff58
delete test bucket
aribray Oct 20, 2020
cd4a1d1
Update samples/snippets/batch_process_documents_sample_v1beta3_test.py
aribray Oct 21, 2020
e9ba609
Update samples/snippets/batch_process_documents_sample_v1beta3_test.py
aribray Oct 21, 2020
a439e32
add more specific assertion in batch_process
aribray Oct 21, 2020
4e3f369
add more specific assertion in process_document and quickstart
aribray Oct 21, 2020
9c7adaf
fix output_uri name
aribray Oct 21, 2020
0849731
Apply suggestions from code review to resolve exception
aribray Oct 21, 2020
61f0c7f
resolve exception
aribray Oct 21, 2020
80d0fb4
lint
aribray Oct 21, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Empty file added samples/__init__.py
Empty file.
Empty file added samples/snippets/__init__.py
Empty file.
121 changes: 121 additions & 0 deletions samples/snippets/batch_process_documents_sample_v1beta3.py
@@ -0,0 +1,121 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# [START documentai_batch_process_document]
import re

from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage

# TODO(developer): Uncomment these variables before running the sample.
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console
# gcs_input_uri = "YOUR_INPUT_URI"
# gcs_output_uri = "YOUR_OUTPUT_BUCKET_URI"
# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX"


def batch_process_documents(
    project_id,
    location,
    processor_id,
    gcs_input_uri,
    gcs_output_uri,
    gcs_output_uri_prefix,
):
    """Batch-process a document in Cloud Storage and print what was extracted.

    Submits a batch request to a Document AI processor, waits for the
    long-running operation to finish, then downloads every output file
    written under the destination prefix and prints its form fields and
    paragraphs.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location, 'us' or 'eu'.
        processor_id: ID of a processor already created in the Cloud Console.
        gcs_input_uri: URI of the input document (sent as 'application/pdf').
        gcs_output_uri: Output bucket URI in the form 'gs://bucket-name'.
        gcs_output_uri_prefix: Folder-like prefix, inside the output bucket,
            under which the results are written.

    Raises:
        ValueError: If the combined destination URI is not of the form
            'gs://bucket/prefix'.
    """
    client = documentai.DocumentProcessorServiceClient()

    destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"

    # Split the destination into bucket and prefix up front, so that a
    # malformed URI fails before the (slow) batch operation is started
    # rather than after it has finished.
    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    if match is None:
        raise ValueError(
            f"Expected a destination of the form gs://bucket/prefix, "
            f"got {destination_uri!r}"
        )
    output_bucket = match.group(1)
    prefix = match.group(2)

    batch_request_type = documentai.types.document_processor_service.BatchProcessRequest

    # 'mime_type' can be 'application/pdf', 'image/tiff',
    # and 'image/gif', or 'application/json'
    input_config = batch_request_type.BatchInputConfig(
        gcs_source=gcs_input_uri, mime_type="application/pdf"
    )

    # Where to write results
    output_config = batch_request_type.BatchOutputConfig(
        gcs_destination=destination_uri
    )

    # Location can be 'us' or 'eu'
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    request = batch_request_type(
        name=name,
        input_configs=[input_config],
        output_config=output_config,
    )

    operation = client.batch_process_documents(request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS; list everything under the output prefix.
    storage_client = storage.client.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print("Output files:")

    for i, blob in enumerate(blob_list, start=1):
        # Download the contents of this blob as a bytes object.
        blob_as_bytes = blob.download_as_bytes()
        document = documentai.types.Document.from_json(blob_as_bytes)

        print(f"Fetched file {i}")

        # For a full list of Document object attributes, please reference
        # this page:
        # https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

        # Read the text recognition output from the processor
        for page in document.pages:
            for form_field in page.form_fields:
                field_name = get_text(form_field.field_name, document)
                field_value = get_text(form_field.field_value, document)
                print("Extracted key value pair:")
                print(f"\t{field_name}, {field_value}")
            # Paragraphs belong to the current page; iterating
            # document.pages here would print page-level text instead.
            for paragraph in page.paragraphs:
                paragraph_text = get_text(paragraph.layout, document)
                print(f"Paragraph text:\n{paragraph_text}")


# Extract shards from the text field
def get_text(doc_element: dict, document: dict):
    """
    Document AI identifies form fields by their offsets
    in document text. This function converts offsets
    to text snippets.

    Args:
        doc_element: Element exposing ``text_anchor.text_segments``; each
            segment carries ``start_index`` and ``end_index`` offsets.
            (Accessed by attribute, so in practice this is a Document AI
            message object rather than a plain dict.)
        document: Object whose ``text`` attribute holds the full document
            text that the offsets refer to.

    Returns:
        The concatenation of ``document.text[start:end]`` for every
        segment, in order; an empty string when there are no segments.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    # Each segment's own start_index is used: looking the key up on the
    # text anchor instead made every slice start at offset 0.
    return "".join(
        document.text[int(segment.start_index):int(segment.end_index)]
        for segment in doc_element.text_anchor.text_segments
    )


# [END documentai_batch_process_document]
61 changes: 61 additions & 0 deletions samples/snippets/batch_process_documents_sample_v1beta3_test.py
@@ -0,0 +1,61 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
from uuid import uuid4

from google.cloud import storage
from google.cloud.exceptions import NotFound

import pytest

from samples.snippets import batch_process_documents_sample_v1beta3

location = "us"
# The project is taken from the environment only; a hard-coded value
# assigned just before this line was dead code (immediately overwritten).
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "90484cfdedb024f6"
gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
gcs_output_uri = "gs://document-ai-python"
# Unique per test run so that output files from different runs never
# collide; kept as a string so it can be used directly as a GCS prefix.
gcs_output_uri_prefix = str(uuid4())
aribray marked this conversation as resolved.
Show resolved Hide resolved

aribray marked this conversation as resolved.
Show resolved Hide resolved

@pytest.fixture(scope="module")
def test_bucket():
    """Yield the output bucket, then delete this run's output files.

    Yields:
        The ``storage`` bucket object named by ``gcs_output_uri``.
    """
    storage_client = storage.Client()

    # storage.Client.bucket() takes a bare bucket name, not a gs:// URI.
    bucket_name = gcs_output_uri
    if bucket_name.startswith("gs://"):
        bucket_name = bucket_name[len("gs://"):]
    bucket = storage_client.bucket(bucket_name.rstrip("/"))

    yield bucket

    try:
        # Only remove what this run wrote (everything under its unique
        # prefix) instead of wiping the whole bucket.
        for blob in bucket.list_blobs(prefix=f"{gcs_output_uri_prefix}/"):
            blob.delete()
    except NotFound:
        print("Bucket already deleted.")


def test_batch_process_documents(capsys, test_bucket):
    """Run the batch sample end to end and check what it printed."""
    sample_arguments = {
        "project_id": project_id,
        "location": location,
        "processor_id": processor_id,
        "gcs_input_uri": gcs_input_uri,
        "gcs_output_uri": gcs_output_uri,
        "gcs_output_uri_prefix": gcs_output_uri_prefix,
    }
    batch_process_documents_sample_v1beta3.batch_process_documents(
        **sample_arguments
    )

    captured = capsys.readouterr()

    # The sample prints one marker for form fields and one for paragraphs.
    for expected_marker in ("Extracted", "Paragraph"):
        assert expected_marker in captured.out
aribray marked this conversation as resolved.
Show resolved Hide resolved
29 changes: 14 additions & 15 deletions samples/snippets/noxfile.py
Expand Up @@ -37,24 +37,22 @@

TEST_CONFIG = {
# You can opt out from the test for specific Python versions.
'ignored_versions': ["2.7"],

"ignored_versions": ["2.7"],
# An envvar key for determining the project id to use. Change it
# to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a
# build specific Cloud project. You can also use your own string
# to use your own Cloud project.
'gcloud_project_env': 'GOOGLE_CLOUD_PROJECT',
"gcloud_project_env": "GOOGLE_CLOUD_PROJECT",
# 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT',

# A dictionary you want to inject into your test. Don't put any
# secrets here. These values will override predefined values.
'envs': {},
"envs": {},
}


try:
# Ensure we can import noxfile_config in the project's directory.
sys.path.append('.')
sys.path.append(".")
from noxfile_config import TEST_CONFIG_OVERRIDE
except ImportError as e:
print("No user noxfile_config found: detail: {}".format(e))
Expand All @@ -69,13 +67,13 @@ def get_pytest_env_vars():
ret = {}

# Override the GCLOUD_PROJECT and the alias.
env_key = TEST_CONFIG['gcloud_project_env']
env_key = TEST_CONFIG["gcloud_project_env"]
# This should error out if not set.
ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key]
ret['GCLOUD_PROJECT'] = os.environ[env_key] # deprecated
ret["GOOGLE_CLOUD_PROJECT"] = os.environ[env_key]
ret["GCLOUD_PROJECT"] = os.environ[env_key] # deprecated

# Apply user supplied envs.
ret.update(TEST_CONFIG['envs'])
ret.update(TEST_CONFIG["envs"])
return ret


Expand All @@ -84,7 +82,7 @@ def get_pytest_env_vars():
ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"]

# Any default versions that should be ignored.
IGNORED_VERSIONS = TEST_CONFIG['ignored_versions']
IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"]

TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS])

Expand Down Expand Up @@ -138,7 +136,7 @@ def lint(session):
args = FLAKE8_COMMON_ARGS + [
"--application-import-names",
",".join(local_names),
"."
".",
]
session.run("flake8", *args)

Expand All @@ -147,6 +145,7 @@ def lint(session):
# Black
#


@nox.session
def blacken(session):
session.install("black")
Expand Down Expand Up @@ -194,9 +193,9 @@ def py(session):
if session.python in TESTED_VERSIONS:
_session_tests(session)
else:
session.skip("SKIPPED: {} tests are disabled for this sample.".format(
session.python
))
session.skip(
"SKIPPED: {} tests are disabled for this sample.".format(session.python)
)


#
Expand Down
88 changes: 88 additions & 0 deletions samples/snippets/process_document_sample_v1beta3.py
@@ -0,0 +1,88 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from google.cloud import documentai_v1beta3 as documentai

# [START documentai_process_document]

# TODO(developer): Uncomment these variables before running the sample.
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROJECT_LOCATION'  # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID'  # Create processor in Cloud Console
# file_path = '/path/to/local/pdf'


def process_document_sample(
    project_id: str, location: str, processor_id: str, file_path: str
):
    """Send a local PDF to a Document AI processor and print its paragraphs.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location, 'us' or 'eu'.
        processor_id: ID of a processor already created in the Cloud Console.
        file_path: Path to the local file; it is sent as 'application/pdf'.
    """
    # Instantiates a client
    docai_client = documentai.DocumentProcessorServiceClient()

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    processor_name = (
        f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    )

    # Read the file into memory
    with open(file_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    # Configure the process request
    process_request = {
        "name": processor_name,
        "document": {"content": pdf_bytes, "mime_type": "application/pdf"},
    }

    # Recognizes text entities in the PDF document
    response = docai_client.process_document(request=process_request)
    processed_document = response.document

    print("Document processing complete.")

    # For a full list of Document object attributes, please reference
    # this page:
    # https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

    # Read the text recognition output from the processor
    print("The document contains the following paragraphs:")
    for page in processed_document.pages:
        for paragraph in page.paragraphs:
            paragraph_text = get_text(paragraph.layout, processed_document)
            print(f"Paragraph text: {paragraph_text}")


# Extract shards from the text field
def get_text(doc_element: dict, document: dict):
    """
    Document AI identifies form fields by their offsets
    in document text. This function converts offsets
    to text snippets.

    Args:
        doc_element: Element exposing ``text_anchor.text_segments``; each
            segment carries ``start_index`` and ``end_index`` offsets.
            (Accessed by attribute, so in practice this is a Document AI
            message object rather than a plain dict.)
        document: Object whose ``text`` attribute holds the full document
            text that the offsets refer to.

    Returns:
        The concatenation of ``document.text[start:end]`` for every
        segment, in order; an empty string when there are no segments.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    # Each segment's own start_index is used: testing whether that integer
    # was a member of the segment list was always false, which made every
    # slice start at offset 0.
    return "".join(
        document.text[int(segment.start_index):int(segment.end_index)]
        for segment in doc_element.text_anchor.text_segments
    )


# [END documentai_process_document]
36 changes: 36 additions & 0 deletions samples/snippets/process_document_sample_v1beta3_test.py
@@ -0,0 +1,36 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os

from samples.snippets import process_document_sample_v1beta3


location = "us"
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "90484cfdedb024f6"
# Anchor the fixture path to this file's directory so the test finds the
# PDF no matter which working directory pytest is launched from.
file_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "resources", "invoice.pdf"
)
aribray marked this conversation as resolved.
Show resolved Hide resolved


def test_process_documents(capsys):
    """Run the single-document sample and check that paragraphs were printed."""
    sample_arguments = {
        "project_id": project_id,
        "location": location,
        "processor_id": processor_id,
        "file_path": file_path,
    }
    process_document_sample_v1beta3.process_document_sample(**sample_arguments)

    captured = capsys.readouterr()

    assert "Paragraph" in captured.out
aribray marked this conversation as resolved.
Show resolved Hide resolved