Skip to content
This repository has been archived by the owner on Sep 20, 2023. It is now read-only.

docs(samples): new Doc AI samples for v1beta3 #44

Merged
merged 49 commits into from Oct 21, 2020
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
b2dc573
batch_process_sample. changing from async to synchronous
aribray Oct 13, 2020
b01d802
add quick start and process_document samples and tests
aribray Oct 15, 2020
cfb964a
add test and sample for batch_process
aribray Oct 15, 2020
8f9246d
add test and sample for batch_process
aribray Oct 15, 2020
ba7681a
resolve merge conflict
aribray Oct 15, 2020
a37f39a
python document ai samples
aribray Oct 15, 2020
99f7f11
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
87254c7
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
bcf97a6
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
26b9450
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
4943437
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
9439937
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
15dd4e4
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
0943fba
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
01058fe
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
d616c54
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
0b18336
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
4d08bf4
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 15, 2020
6ee7994
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray Oct 16, 2020
dc24b32
resolve formatting
aribray Oct 16, 2020
82e8ab9
use os.environ
aribray Oct 16, 2020
c373ac3
remove os.path.join
aribray Oct 16, 2020
6389213
move tests
aribray Oct 16, 2020
37cd427
descriptive variable
aribray Oct 16, 2020
aef335e
specific Exception, formatting
aribray Oct 16, 2020
a4d2b4a
parse all pages in process_document
aribray Oct 16, 2020
bbc187e
add more helpful comments
aribray Oct 16, 2020
dd6488f
remove unused imports
aribray Oct 16, 2020
2179581
better exception handling
aribray Oct 16, 2020
3cb2c0a
rename test files
aribray Oct 16, 2020
f424aee
Merge branch 'master' into python-docai
aribray Oct 19, 2020
27b63f1
Merge branch 'master' into python-docai
aribray Oct 19, 2020
043b445
ran linter, removed nested function in batch predict
aribray Oct 19, 2020
dba5ef8
refactor tests
aribray Oct 19, 2020
5416bbc
format imports
aribray Oct 19, 2020
d9e2cca
format imports
aribray Oct 19, 2020
700ab75
format imports
aribray Oct 19, 2020
66dde36
serialize as Document object
aribray Oct 20, 2020
0b839e8
extract get_text helper function
aribray Oct 20, 2020
bda06e8
fix file path
aribray Oct 20, 2020
ad5ff58
delete test bucket
aribray Oct 20, 2020
cd4a1d1
Update samples/snippets/batch_process_documents_sample_v1beta3_test.py
aribray Oct 21, 2020
e9ba609
Update samples/snippets/batch_process_documents_sample_v1beta3_test.py
aribray Oct 21, 2020
a439e32
add more specific assertion in batch_process
aribray Oct 21, 2020
4e3f369
add more specific assertion in process_document and quickstart
aribray Oct 21, 2020
9c7adaf
fix output_uri name
aribray Oct 21, 2020
0849731
Apply suggestions from code review to resolve exception
aribray Oct 21, 2020
61f0c7f
resolve exception
aribray Oct 21, 2020
80d0fb4
lint
aribray Oct 21, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
107 changes: 107 additions & 0 deletions samples/batch_process_documents_sample_v1beta3.py
@@ -0,0 +1,107 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# [START documentai_batch_process_document]
import re
import json
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage

# TODO(developer): Uncomment these variables before running the sample.
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console
# gcs_input_uri = "YOUR_INPUT_URI"
# gcs_output_uri = "YOUR_OUTPUT_BUCKET_URI"
# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX"


def batch_process_documents(
    project_id,
    location,
    processor_id,
    gcs_input_uri,
    gcs_output_uri,
    gcs_output_uri_prefix,
):
    """Batch-process a PDF in Cloud Storage and print what was extracted.

    Submits a batch request to the processor, blocks until the long-running
    operation finishes, then downloads every object written under the
    output prefix, parses it as JSON and prints its form fields and
    paragraphs.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location, 'us' or 'eu'.
        processor_id: ID of a processor created in the Cloud Console.
        gcs_input_uri: 'gs://' URI of the PDF to process.
        gcs_output_uri: 'gs://<bucket>' URI of the bucket receiving results.
        gcs_output_uri_prefix: Object-name prefix for the results.

    Raises:
        ValueError: If the destination built from ``gcs_output_uri`` and
            ``gcs_output_uri_prefix`` is not of the form
            'gs://<bucket>/<prefix>'.
    """

    def _get_text(el, document_text):
        """Convert a layout element's text-anchor offsets into text.

        Doc AI identifies elements by their offsets in the document text.
        If a text segment spans several lines, it is stored as several
        text segments, so the pieces are joined.
        """
        pieces = []
        for segment in el.get('textAnchor', {}).get('textSegments', []):
            # The offsets live on the segment itself; a missing key is
            # treated as offset 0.
            start_index = int(segment.get('startIndex', 0))
            end_index = int(segment.get('endIndex', 0))
            pieces.append(document_text[start_index:end_index])
        return ''.join(pieces)

    destination_uri = f'{gcs_output_uri}/{gcs_output_uri_prefix}/'

    # Results are written to GCS. Split the destination into bucket and
    # prefix up front so that a malformed URI fails before a batch
    # operation is started.
    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    if match is None:
        raise ValueError(
            f'Expected a gs://<bucket>/<prefix> destination, '
            f'got {destination_uri!r}')
    output_bucket, prefix = match.groups()

    # Use the endpoint that belongs to the requested location
    # ('us' or 'eu').
    client = documentai.DocumentProcessorServiceClient(
        client_options={
            'api_endpoint': f'{location}-documentai.googleapis.com',
        })

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
        gcs_source=gcs_input_uri, mime_type='application/pdf')

    # where to write results
    output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
        gcs_destination=destination_uri)

    name = f'projects/{project_id}/locations/{location}/processors/{processor_id}'
    request = documentai.types.document_processor_service.BatchProcessRequest(
        name=name,
        input_configs=[input_config],
        output_config=output_config,
    )

    operation = client.batch_process_documents(request)

    # Wait for the operation to finish
    operation.result()

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')

    for i, blob in enumerate(blob_list):
        document = json.loads(blob.download_as_bytes())
        document_text = document.get('text', '')

        print(f'Fetched file {i + 1}')

        for page in document.get('pages', []):
            for form_field in page.get('formFields', []):
                field_name = _get_text(form_field['fieldName'], document_text)
                field_value = _get_text(form_field['fieldValue'], document_text)
                print('Extracted key value pair:')
                print(f'\t{field_name}, {field_value}')
            # Paragraphs belong to the page being visited.
            for paragraph in page.get('paragraphs', []):
                paragraph_text = _get_text(paragraph['layout'], document_text)
                print(f'Paragraph text:\n{paragraph_text}')
aribray marked this conversation as resolved.
Show resolved Hide resolved

# [END documentai_batch_process_document]
78 changes: 78 additions & 0 deletions samples/process_document_sample_v1beta3.py
@@ -0,0 +1,78 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from google.cloud import documentai_v1beta3 as documentai

# [START documentai_process_document]

# TODO(developer): Uncomment these variables before running the sample.
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROJECT_LOCATION'  # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID'  # Create processor in Cloud Console
# file_path = '/path/to/local/pdf'


def process_document_sample(project_id: str, location: str, processor_id: str, file_path: str) -> None:
    """Process a local PDF synchronously and print its first-page paragraphs.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location, 'us' or 'eu'.
        processor_id: ID of a processor created in the Cloud Console.
        file_path: Path of the local PDF to send.
    """

    def _get_text(el, document_text):
        """Convert a layout element's text-anchor offsets into text.

        Doc AI identifies elements by their offsets in the document text.
        If a text segment spans several lines, it is stored as several
        text segments, so the pieces are joined.
        """
        return ''.join(
            # start_index is read straight off the segment.
            document_text[int(segment.start_index):int(segment.end_index)]
            for segment in el.text_anchor.text_segments
        )

    # Use the endpoint that belongs to the requested location
    # ('us' or 'eu').
    client_options = dict(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    name = f'projects/{project_id}/locations/{location}/processors/{processor_id}'

    # Read the file into memory
    with open(file_path, 'rb') as image:
        image_content = image.read()

    raw_document = {
        'content': image_content,
        'mime_type': 'application/pdf'
    }

    # Configure the process request
    request = {
        'name': name,
        'document': raw_document
    }

    result = client.process_document(request=request)

    print('Document processing complete.')

    document = result.document

    page_1 = document.pages[0]
    paragraphs = page_1.paragraphs

    for paragraph in paragraphs:
        paragraph_text = _get_text(paragraph.layout, document.text)
        print(f'Paragraph text: {paragraph_text}')

# [END documentai_process_document]
79 changes: 79 additions & 0 deletions samples/quickstart_sample_v1beta3.py
@@ -0,0 +1,79 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from google.cloud import documentai_v1beta3 as documentai

# [START documentai_quickstart]

# TODO(developer): Uncomment these variables before running the sample.
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROJECT_LOCATION'  # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID'  # Create processor in Cloud Console
# file_path = '/path/to/local/pdf'


def quickstart(project_id: str, location: str, processor_id: str, file_path: str) -> None:
    """Send a local PDF to a processor and print its first-page paragraphs.

    Each paragraph is printed twice: once as the raw paragraph object and
    once as the text its layout points at.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location, 'us' or 'eu'.
        processor_id: ID of a processor created in the Cloud Console.
        file_path: Path of the local PDF to send.
    """

    def _get_text(el, document_text):
        """Convert a layout element's text-anchor offsets into text.

        Doc AI identifies elements by their offsets in the document text.
        If a text segment spans several lines, it is stored as several
        text segments, so the pieces are joined.
        """
        return ''.join(
            # start_index is read straight off the segment.
            document_text[int(segment.start_index):int(segment.end_index)]
            for segment in el.text_anchor.text_segments
        )

    # Use the endpoint that belongs to the requested location
    # ('us' or 'eu').
    client_options = {
        'api_endpoint': f'{location}-documentai.googleapis.com'
    }
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    name = f'projects/{project_id}/locations/{location}/processors/{processor_id}'

    # Read the file into memory
    with open(file_path, 'rb') as image:
        image_content = image.read()

    raw_document = {
        'content': image_content,
        'mime_type': 'application/pdf'
    }

    # Configure the process request
    request = {
        'name': name,
        'document': raw_document
    }

    result = client.process_document(request=request)

    document = result.document

    page_1 = document.pages[0]
    paragraphs = page_1.paragraphs

    for paragraph in paragraphs:
        print(paragraph)
        paragraph_text = _get_text(paragraph.layout, document.text)
        print(f'Paragraph text: {paragraph_text}')

# [END documentai_quickstart]
Binary file added samples/resources/invoice.pdf
Binary file not shown.
@@ -0,0 +1,58 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from uuid import uuid4
import pytest
import sys
import os
from google.cloud import storage

from samples import batch_process_documents_sample_v1beta3
from google.cloud import documentai_v1beta3 as documentai

# Processor and Cloud Storage locations used by the batch-processing test.
project_id = 'python-docs-samples-tests'
location = 'us'
processor_id = '90484cfdedb024f6'
gcs_input_uri = 'gs://cloud-samples-data/documentai/invoice.pdf'
gcs_output_uri = 'gs://document-ai-python'
# Fresh prefix per import. Stored as a plain string so it can be used both
# inside a URI and as a Cloud Storage listing prefix.
gcs_output_uri_prefix = str(uuid4())

name = "projects/1012616486416/locations/us/processors/90484cfdedb024f6"

@pytest.fixture(scope="module")
def setup():
    """Create the Cloud Storage bucket that receives the batch output.

    ``gcs_output_uri`` is a ``gs://`` URI, whereas ``create_bucket`` takes a
    bare bucket name, so the scheme is stripped before the call.
    """
    bucket_name = gcs_output_uri
    if bucket_name.startswith('gs://'):
        bucket_name = bucket_name[len('gs://'):]

    storage_client = storage.Client()
    storage_client.create_bucket(bucket_name)

@pytest.fixture(scope="module")
def tear_down():
    """Delete this run's output objects and then the output bucket.

    The clean-up sits after ``yield`` so that it runs once the tests using
    this fixture have finished, not before they start.
    """
    yield

    # ``gcs_output_uri`` is a ``gs://`` URI; the storage client wants the
    # bare bucket name.
    bucket_name = gcs_output_uri
    if bucket_name.startswith('gs://'):
        bucket_name = bucket_name[len('gs://'):]

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    # The prefix filter belongs on the listing call, not on ``bucket()``.
    blobs = storage_client.list_blobs(bucket, prefix=str(gcs_output_uri_prefix))

    for blob in blobs:
        blob.delete()

    bucket.delete()

def test_batch_process_documents(capsys):
    """Run the batch sample and check that it reports fields and paragraphs."""
    batch_process_documents_sample_v1beta3.batch_process_documents(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        gcs_input_uri=gcs_input_uri,
        gcs_output_uri=gcs_output_uri,
        gcs_output_uri_prefix=gcs_output_uri_prefix,
    )

    captured = capsys.readouterr()
    # Re-emit what was captured so it still shows up in the test log.
    sys.stdout.write(captured.out)
    sys.stderr.write(captured.err)

    for expected in ("Extracted", "Paragraph"):
        assert expected in captured.out
@@ -0,0 +1,73 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from uuid import uuid4
import pytest
import sys, os
from google.cloud import storage
import google.api_core as api_core

from samples import process_document_sample_v1beta3

from google.cloud.documentai_v1beta3.services.document_processor_service import (
DocumentProcessorServiceClient,
)
from google.cloud.documentai_v1beta3.services.document_processor_service import (
transports,
)
from google.cloud.documentai_v1beta3.types import document
from google.cloud.documentai_v1beta3.types import document_processor_service
from google.cloud.documentai_v1beta3.types import geometry
from google.cloud.documentai_v1beta3.types import ProcessRequest


# Processor, bucket and file locations used by the process-document test.
location = "us"
project_id = '1012616486416'
processor_id = '90484cfdedb024f6'
# Unique per import so separate runs do not share a bucket name.
bucket_name = f'python_docs_samples_test_{uuid4()}'
gcs_input_uri = 'gs://cloud-samples-data/documentai/invoice.pdf'
gcs_output_uri = 'output-bucket'
# Stored as a plain string so it can be used as a Cloud Storage listing
# prefix.
gcs_output_uri_prefix = str(uuid4())
file_name = 'samples/resources/invoice.pdf'
# Resolved against the current working directory.
file_path = os.path.join(os.getcwd(), file_name)

name = "projects/1012616486416/locations/us/processors/90484cfdedb024f6"

client_options = dict(api_endpoint="us-documentai.googleapis.com")
client = DocumentProcessorServiceClient(client_options=client_options)

@pytest.fixture(scope="module")
def setup():
    """Create the bucket named by ``bucket_name``.

    ``Client.bucket()`` only builds a local handle; ``create_bucket`` is the
    call that actually creates the bucket.
    """
    storage_client = storage.Client()
    storage_client.create_bucket(bucket_name)

@pytest.fixture(scope="module")
def tear_down():
    """Delete this run's objects and then the test bucket.

    The clean-up sits after ``yield`` so that it runs once the tests using
    this fixture have finished, not before they start.
    """
    yield

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    # ``str()`` keeps the prefix a plain string even if the module constant
    # holds a UUID object.
    blobs = bucket.list_blobs(prefix=str(gcs_output_uri_prefix))

    for blob in blobs:
        blob.delete()

    bucket.delete()

def test_process_documents(capsys):
    """Run the process-document sample and check that it prints paragraphs."""
    # The sample reports through stdout only, so its return value is not kept.
    process_document_sample_v1beta3.process_document_sample(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        file_path=file_path,
    )
    out, err = capsys.readouterr()
    # Re-emit what was captured so it still shows up in the test log.
    sys.stdout.write(out)
    sys.stderr.write(err)

    assert "Paragraph" in out