Skip to content
This repository has been archived by the owner on Nov 9, 2023. It is now read-only.

Commit

Permalink
samples: add v1beta2 samples (#42)
Browse files Browse the repository at this point in the history
* init commit & missing tests

* updated samples with new default api, added dependencies to other pom files

* init commit & missing tests

* updated samples with new default api, added dependencies to other pom files

* added other tests

* finished formatting and tests

* init commit & missing tests

* updated samples with new default api, added dependencies to other pom files

* init commit & missing tests

* added other tests

* finished formatting and tests

* init commit & missing tests

* updated samples with new default api, added dependencies to other pom files

* removed TODOs

* corrected region tags, made requested changes

* fixed bom issues

* removed unused imports & fixed pom

* applied latest changes from API

* added correct location format

* changed env var

* removed prefix bucket prefix

* Update samples/snippets/pom.xml

Co-Authored-By: Noah Negrey <nnegrey@users.noreply.github.com>

* Update samples/snippets/src/main/java/com/examples/documentai/ParseWithModel.java

Co-Authored-By: Noah Negrey <nnegrey@users.noreply.github.com>

* Update samples/snippets/pom.xml

Co-Authored-By: Noah Negrey <nnegrey@users.noreply.github.com>

* Update samples/snippets/pom.xml

Co-Authored-By: Noah Negrey <nnegrey@users.noreply.github.com>

* Update samples/snippets/src/main/java/com/examples/documentai/BatchParseForm.java

Co-Authored-By: Noah Negrey <nnegrey@users.noreply.github.com>

* Update samples/snippets/src/main/java/com/examples/documentai/ParseWithModel.java

Co-Authored-By: Noah Negrey <nnegrey@users.noreply.github.com>

* update autoML id

* updated custom model

* reformatted automl param

* added env vars

* refactored tests and updated samples to latest changes

* removed AutoML project test for now

* formatted code

* testing project to have correct id

* changed project ID for testing

* added missing cfg

* added another env var

* added some comments and fixed region tags

* renamed package path, add _beta tags, added some comment on autoMl model

* removed version of bom pom.xml

* changed env var names

* corrected project id in kokoro config

* hard-coded java-docs to test

* added .

* correct autoML test to two project ID

* trigger kokoro test

* last kokoro test trigger

* moved storage dependency and fixed nit

Co-authored-by: Noah Negrey <nnegrey@users.noreply.github.com>
Co-authored-by: Jeff Ching <chingor@google.com>
  • Loading branch information
3 people committed Apr 29, 2020
1 parent 27f921b commit 8a66303
Show file tree
Hide file tree
Showing 19 changed files with 1,455 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .kokoro/nightly/samples.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ env_vars: {

env_vars: {
key: "GOOGLE_CLOUD_PROJECT"
value: "java-docs-samples-testing"
value: "gcloud-devel"
}

env_vars: {
Expand Down
2 changes: 1 addition & 1 deletion .kokoro/presubmit/samples.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ env_vars: {

env_vars: {
key: "GOOGLE_CLOUD_PROJECT"
value: "java-docs-samples-testing"
value: "gcloud-devel"
}

env_vars: {
Expand Down
8 changes: 6 additions & 2 deletions samples/install-without-bom/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,12 @@
<artifactId>google-cloud-document-ai</artifactId>
<version>0.1.1</version>
</dependency>
<!-- [END documentai_install_without_bom] -->

<!-- [END documentai_install_without_bom] -->
<dependency>
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-storage</artifactId>
<version>1.103.1</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
Expand Down
5 changes: 5 additions & 0 deletions samples/snapshot/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@
<version>0.1.1</version>
</dependency>

<dependency>
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-storage</artifactId>
<version>1.103.1</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
Expand Down
5 changes: 4 additions & 1 deletion samples/snippets/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@
<artifactId>google-cloud-document-ai</artifactId>
</dependency>
<!-- [END documentai_install_with_bom] -->

<dependency>
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-storage</artifactId>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
/*
* Copyright 2020 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package documentai.v1beta2;

// [START documentai_batch_parse_form_beta]

import com.google.api.gax.longrunning.OperationFuture;
import com.google.api.gax.paging.Page;
import com.google.cloud.documentai.v1beta2.BatchProcessDocumentsRequest;
import com.google.cloud.documentai.v1beta2.BatchProcessDocumentsResponse;
import com.google.cloud.documentai.v1beta2.Document;
import com.google.cloud.documentai.v1beta2.DocumentUnderstandingServiceClient;
import com.google.cloud.documentai.v1beta2.DocumentUnderstandingServiceSettings;
import com.google.cloud.documentai.v1beta2.FormExtractionParams;
import com.google.cloud.documentai.v1beta2.GcsDestination;
import com.google.cloud.documentai.v1beta2.GcsSource;
import com.google.cloud.documentai.v1beta2.InputConfig;
import com.google.cloud.documentai.v1beta2.KeyValuePairHint;
import com.google.cloud.documentai.v1beta2.OperationMetadata;
import com.google.cloud.documentai.v1beta2.OutputConfig;
import com.google.cloud.documentai.v1beta2.ProcessDocumentRequest;
import com.google.cloud.storage.Blob;
import com.google.cloud.storage.BlobId;
import com.google.cloud.storage.Bucket;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageOptions;
import com.google.protobuf.util.JsonFormat;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

public class BatchParseFormBeta {

public static void batchParseFormGcs()
throws IOException, InterruptedException, ExecutionException, TimeoutException {
// TODO(developer): Replace these variables before running the sample.
String projectId = "your-project-id";
String location = "your-project-location"; // Format is "us" or "eu".
String outputGcsBucketName = "your-gcs-bucket-name";
String outputGcsPrefix = "PREFIX";
String inputGcsUri = "gs://your-gcs-bucket/path/to/input/file.json";
batchParseFormGcs(projectId, location, outputGcsBucketName, outputGcsPrefix, inputGcsUri);
}

public static void batchParseFormGcs(
String projectId,
String location,
String outputGcsBucketName,
String outputGcsPrefix,
String inputGcsUri)
throws IOException, InterruptedException, ExecutionException, TimeoutException {
// Initialize client that will be used to send requests. This client only needs to be created
// once, and can be reused for multiple requests. After completing all of your requests, call
// the "close" method on the client to safely clean up any remaining background resources.
try (DocumentUnderstandingServiceClient client =
DocumentUnderstandingServiceClient.create()) {

// Configure the request for processing the PDF
String parent = String.format("projects/%s/locations/%s", projectId, location);

// Improve form parsing results by providing key-value pair hints.
// For each key hint, key is text that is likely to appear in the
// document as a form field name (i.e. "DOB").
// Value types are optional, but can be one or more of:
// ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
// NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
KeyValuePairHint keyValuePairHint =
KeyValuePairHint.newBuilder().setKey("Phone").addValueTypes("PHONE_NUMBER").build();

KeyValuePairHint keyValuePairHint2 =
KeyValuePairHint.newBuilder()
.setKey("Contact")
.addValueTypes("EMAIL")
.addValueTypes("NAME")
.build();

// Setting enabled=True enables form extraction
FormExtractionParams params =
FormExtractionParams.newBuilder()
.setEnabled(true)
.addKeyValuePairHints(keyValuePairHint)
.addKeyValuePairHints(keyValuePairHint2)
.build();

GcsSource inputUri = GcsSource.newBuilder().setUri(inputGcsUri).build();

// mime_type can be application/pdf, image/tiff,
// and image/gif, or application/json
InputConfig config =
InputConfig.newBuilder().setGcsSource(inputUri)
.setMimeType("application/pdf").build();

GcsDestination gcsDestination = GcsDestination.newBuilder()
.setUri(String.format("gs://%s/%s", outputGcsBucketName, outputGcsPrefix)).build();

OutputConfig outputConfig = OutputConfig.newBuilder()
.setGcsDestination(gcsDestination)
.setPagesPerShard(1)
.build();

ProcessDocumentRequest request =
ProcessDocumentRequest.newBuilder()
.setFormExtractionParams(params)
.setInputConfig(config)
.setOutputConfig(outputConfig)
.build();

BatchProcessDocumentsRequest requests =
BatchProcessDocumentsRequest.newBuilder().addRequests(request).setParent(parent).build();

// Batch process document using a long-running operation.
OperationFuture<BatchProcessDocumentsResponse, OperationMetadata> future =
client.batchProcessDocumentsAsync(requests);

// Wait for operation to complete.
System.out.println("Waiting for operation to complete...");
future.get(300, TimeUnit.SECONDS);

System.out.println("Document processing complete.");

Storage storage = StorageOptions.newBuilder().setProjectId(projectId).build().getService();
Bucket bucket = storage.get(outputGcsBucketName);

// List all of the files in the Storage bucket.
Page<Blob> blobs =
bucket.list(
Storage.BlobListOption.currentDirectory(),
Storage.BlobListOption.prefix(outputGcsPrefix));

int idx = 0;
for (Blob blob : blobs.iterateAll()) {
if (!blob.isDirectory()) {
System.out.printf("Fetched file #%d\n", ++idx);
// Read the results

// Download and store json data in a temp file.
File tempFile = File.createTempFile("file", ".json");
Blob fileInfo = storage.get(BlobId.of(outputGcsBucketName, blob.getName()));
fileInfo.downloadTo(tempFile.toPath());

// Parse json file into Document.
FileReader reader = new FileReader(tempFile);
Document.Builder builder = Document.newBuilder();
JsonFormat.parser().merge(reader, builder);

Document document = builder.build();

// Get all of the document text as one big string.
String text = document.getText();

// Process the output.
Document.Page page1 = document.getPages(0);
for (Document.Page.FormField field : page1.getFormFieldsList()) {
String fieldName = getText(field.getFieldName(), text);
String fieldValue = getText(field.getFieldValue(), text);

System.out.println("Extracted form fields pair:");
System.out.printf("\t(%s, %s))", fieldName, fieldValue);
}

// Clean up temp file.
tempFile.deleteOnExit();
}
}
}
}

private static String getText(Document.Page.Layout layout, String text) {
Document.TextAnchor textAnchor = layout.getTextAnchor();
if (textAnchor.getTextSegmentsList().size() > 0) {
int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex();
int endIdx = (int) textAnchor.getTextSegments(0).getEndIndex();
return text.substring(startIdx, endIdx);
}
return "[NO TEXT]";
}
}
// [END documentai_batch_parse_form_beta]

0 comments on commit 8a66303

Please sign in to comment.