diff --git a/README.md b/README.md index 743efdf3..7dec10c4 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,11 @@ Samples are in the [`samples/`](https://github.com/googleapis/java-document-ai/t | Parse With Model Beta | [source code](https://github.com/googleapis/java-document-ai/blob/main/samples/snippets/src/main/java/documentai/v1beta2/ParseWithModelBeta.java) | [![Open in Cloud Shell][shell_img]](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/googleapis/java-document-ai&page=editor&open_in_editor=samples/snippets/src/main/java/documentai/v1beta2/ParseWithModelBeta.java) | | Quick Start | [source code](https://github.com/googleapis/java-document-ai/blob/main/samples/snippets/src/main/java/documentai/v1beta2/QuickStart.java) | [![Open in Cloud Shell][shell_img]](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/googleapis/java-document-ai&page=editor&open_in_editor=samples/snippets/src/main/java/documentai/v1beta2/QuickStart.java) | | Set End Point Beta | [source code](https://github.com/googleapis/java-document-ai/blob/main/samples/snippets/src/main/java/documentai/v1beta2/SetEndPointBeta.java) | [![Open in Cloud Shell][shell_img]](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/googleapis/java-document-ai&page=editor&open_in_editor=samples/snippets/src/main/java/documentai/v1beta2/SetEndPointBeta.java) | +| Process Form Document | [source code](https://github.com/googleapis/java-document-ai/blob/main/samples/snippets/src/main/java/documentai/v1beta3/ProcessFormDocument.java) | [![Open in Cloud Shell][shell_img]](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/googleapis/java-document-ai&page=editor&open_in_editor=samples/snippets/src/main/java/documentai/v1beta3/ProcessFormDocument.java) | +| Process Ocr Document | [source code](https://github.com/googleapis/java-document-ai/blob/main/samples/snippets/src/main/java/documentai/v1beta3/ProcessOcrDocument.java) | [![Open in Cloud Shell][shell_img]](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/googleapis/java-document-ai&page=editor&open_in_editor=samples/snippets/src/main/java/documentai/v1beta3/ProcessOcrDocument.java) | +| Process Quality Document | [source code](https://github.com/googleapis/java-document-ai/blob/main/samples/snippets/src/main/java/documentai/v1beta3/ProcessQualityDocument.java) | [![Open in Cloud Shell][shell_img]](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/googleapis/java-document-ai&page=editor&open_in_editor=samples/snippets/src/main/java/documentai/v1beta3/ProcessQualityDocument.java) | +| Process Specialized Document | [source code](https://github.com/googleapis/java-document-ai/blob/main/samples/snippets/src/main/java/documentai/v1beta3/ProcessSpecializedDocument.java) | [![Open in Cloud Shell][shell_img]](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/googleapis/java-document-ai&page=editor&open_in_editor=samples/snippets/src/main/java/documentai/v1beta3/ProcessSpecializedDocument.java) | +| Process Splitter Document | [source code](https://github.com/googleapis/java-document-ai/blob/main/samples/snippets/src/main/java/documentai/v1beta3/ProcessSplitterDocument.java) | [![Open in Cloud Shell][shell_img]](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/googleapis/java-document-ai&page=editor&open_in_editor=samples/snippets/src/main/java/documentai/v1beta3/ProcessSplitterDocument.java) | diff --git a/samples/install-without-bom/resources/document_quality_poor.pdf b/samples/install-without-bom/resources/document_quality_poor.pdf new file mode 100644 index 00000000..3a34a925 Binary files /dev/null and b/samples/install-without-bom/resources/document_quality_poor.pdf differ diff --git a/samples/install-without-bom/resources/handwritten_form.pdf b/samples/install-without-bom/resources/handwritten_form.pdf new file mode 100644 index 00000000..2189ffff Binary files /dev/null and b/samples/install-without-bom/resources/handwritten_form.pdf differ diff --git a/samples/install-without-bom/resources/multi_document.pdf b/samples/install-without-bom/resources/multi_document.pdf new file mode 100644 index 00000000..7ea62eb8 Binary files /dev/null and b/samples/install-without-bom/resources/multi_document.pdf differ diff --git a/samples/install-without-bom/resources/us_driver_license.pdf b/samples/install-without-bom/resources/us_driver_license.pdf new file mode 100644 index 00000000..f8f62d90 Binary files /dev/null and b/samples/install-without-bom/resources/us_driver_license.pdf differ diff --git a/samples/snapshot/resources/document_quality_poor.pdf b/samples/snapshot/resources/document_quality_poor.pdf new file mode 100644 index 00000000..3a34a925 Binary files /dev/null and b/samples/snapshot/resources/document_quality_poor.pdf differ diff --git a/samples/snapshot/resources/handwritten_form.pdf b/samples/snapshot/resources/handwritten_form.pdf new file mode 100644 index 00000000..2189ffff Binary files /dev/null and b/samples/snapshot/resources/handwritten_form.pdf differ diff --git a/samples/snapshot/resources/multi_document.pdf b/samples/snapshot/resources/multi_document.pdf new file mode 100644 index 00000000..7ea62eb8 Binary files /dev/null and b/samples/snapshot/resources/multi_document.pdf differ diff --git a/samples/snapshot/resources/us_driver_license.pdf b/samples/snapshot/resources/us_driver_license.pdf new file mode 100644 index 00000000..f8f62d90 Binary files /dev/null and b/samples/snapshot/resources/us_driver_license.pdf differ diff --git a/samples/snippets/resources/document_quality_poor.pdf b/samples/snippets/resources/document_quality_poor.pdf new file mode 100644 index 00000000..3a34a925 Binary files /dev/null and b/samples/snippets/resources/document_quality_poor.pdf differ diff --git a/samples/snippets/resources/handwritten_form.pdf b/samples/snippets/resources/handwritten_form.pdf new file mode 100644 index 00000000..2189ffff Binary files /dev/null and b/samples/snippets/resources/handwritten_form.pdf differ diff --git a/samples/snippets/resources/multi_document.pdf b/samples/snippets/resources/multi_document.pdf new file mode 100644 index 00000000..7ea62eb8 Binary files /dev/null and b/samples/snippets/resources/multi_document.pdf differ diff --git a/samples/snippets/resources/us_driver_license.pdf b/samples/snippets/resources/us_driver_license.pdf new file mode 100644 index 00000000..f8f62d90 Binary files /dev/null and b/samples/snippets/resources/us_driver_license.pdf differ diff --git a/samples/snippets/src/main/java/documentai/v1beta3/ProcessFormDocument.java b/samples/snippets/src/main/java/documentai/v1beta3/ProcessFormDocument.java new file mode 100644 index 00000000..8a50d853 --- /dev/null +++ b/samples/snippets/src/main/java/documentai/v1beta3/ProcessFormDocument.java @@ -0,0 +1,149 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package documentai.v1beta3; + +// [START documentai_process_form_document] + +import com.google.cloud.documentai.v1beta3.Document; +import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient; +import com.google.cloud.documentai.v1beta3.ProcessRequest; +import com.google.cloud.documentai.v1beta3.ProcessResponse; +import com.google.cloud.documentai.v1beta3.RawDocument; +import com.google.protobuf.ByteString; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; + +public class ProcessFormDocument { + public static void processFormDocument() + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // TODO(developer): Replace these variables before running the sample. + String projectId = "your-project-id"; + String location = "your-project-location"; // Format is "us" or "eu". + String processerId = "your-processor-id"; + String filePath = "path/to/input/file.pdf"; + processFormDocument(projectId, location, processerId, filePath); + } + + public static void processFormDocument( + String projectId, String location, String processorId, String filePath) + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // Initialize client that will be used to send requests. This client only needs to be created + // once, and can be reused for multiple requests. After completing all of your requests, call + // the "close" method on the client to safely clean up any remaining background resources. + try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + String name = + String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId); + + // Read the file. + byte[] imageFileData = Files.readAllBytes(Paths.get(filePath)); + + // Convert the image data to a Buffer and base64 encode it. + ByteString content = ByteString.copyFrom(imageFileData); + + RawDocument document = + RawDocument.newBuilder().setContent(content).setMimeType("application/pdf").build(); + + // Configure the process request. + ProcessRequest request = + ProcessRequest.newBuilder().setName(name).setRawDocument(document).build(); + + // Recognizes text entities in the PDF document + ProcessResponse result = client.processDocument(request); + Document documentResponse = result.getDocument(); + + System.out.println("Document processing complete."); + + // Read the text recognition output from the processor + // For a full list of Document object attributes, + // please reference this page: + // https://googleapis.dev/java/google-cloud-document-ai/latest/index.html + + // Get all of the document text as one big string + String text = documentResponse.getText(); + System.out.printf("Full document text: '%s'\n", removeNewlines(text)); + + // Read the text recognition output from the processor + List pages = documentResponse.getPagesList(); + System.out.printf("There are %s page(s) in this document.\n", pages.size()); + + for (Document.Page page : pages) { + System.out.printf("\n\n**** Page %d ****\n", page.getPageNumber()); + + List tables = page.getTablesList(); + System.out.printf("Found %d table(s):\n", tables.size()); + for (Document.Page.Table table : tables) { + printTableInfo(table, text); + } + + List formFields = page.getFormFieldsList(); + System.out.printf("Found %d form fields:\n", formFields.size()); + for (Document.Page.FormField formField : formFields) { + String fieldName = getLayoutText(formField.getFieldName().getTextAnchor(), text); + String fieldValue = getLayoutText(formField.getFieldValue().getTextAnchor(), text); + System.out.printf( + " * '%s': '%s'\n", removeNewlines(fieldName), removeNewlines(fieldValue)); + } + } + } + } + + private static void printTableInfo(Document.Page.Table table, String text) { + Document.Page.Table.TableRow firstBodyRow = table.getBodyRows(0); + int columnCount = firstBodyRow.getCellsCount(); + System.out.printf( + " Table with %d columns and %d rows:\n", columnCount, table.getBodyRowsCount()); + + Document.Page.Table.TableRow headerRow = table.getHeaderRows(0); + StringBuilder headerRowText = new StringBuilder(); + for (Document.Page.Table.TableCell cell : headerRow.getCellsList()) { + String columnName = getLayoutText(cell.getLayout().getTextAnchor(), text); + headerRowText.append(String.format("%s | ", removeNewlines(columnName))); + } + headerRowText.setLength(headerRowText.length() - 3); + System.out.printf(" Collumns: %s\n", headerRowText.toString()); + + StringBuilder firstRowText = new StringBuilder(); + for (Document.Page.Table.TableCell cell : firstBodyRow.getCellsList()) { + String cellText = getLayoutText(cell.getLayout().getTextAnchor(), text); + firstRowText.append(String.format("%s | ", removeNewlines(cellText))); + } + firstRowText.setLength(firstRowText.length() - 3); + System.out.printf(" First row data: %s\n", firstRowText.toString()); + } + + // Extract shards from the text field + private static String getLayoutText(Document.TextAnchor textAnchor, String text) { + if (textAnchor.getTextSegmentsList().size() > 0) { + int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex(); + int endIdx = (int) textAnchor.getTextSegments(0).getEndIndex(); + return text.substring(startIdx, endIdx); + } + return "[NO TEXT]"; + } + + private static String removeNewlines(String s) { + return s.replace("\n", "").replace("\r", ""); + } +} +// [END documentai_process_form_document] diff --git a/samples/snippets/src/main/java/documentai/v1beta3/ProcessOcrDocument.java b/samples/snippets/src/main/java/documentai/v1beta3/ProcessOcrDocument.java new file mode 100644 index 00000000..f483929a --- /dev/null +++ b/samples/snippets/src/main/java/documentai/v1beta3/ProcessOcrDocument.java @@ -0,0 +1,172 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package documentai.v1beta3; + +// [START documentai_process_ocr_document] + +import com.google.cloud.documentai.v1beta3.Document; +import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient; +import com.google.cloud.documentai.v1beta3.ProcessRequest; +import com.google.cloud.documentai.v1beta3.ProcessResponse; +import com.google.cloud.documentai.v1beta3.RawDocument; +import com.google.protobuf.ByteString; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; + +public class ProcessOcrDocument { + public static void processOcrDocument() + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // TODO(developer): Replace these variables before running the sample. + String projectId = "your-project-id"; + String location = "your-project-location"; // Format is "us" or "eu". + String processerId = "your-processor-id"; + String filePath = "path/to/input/file.pdf"; + processOcrDocument(projectId, location, processerId, filePath); + } + + public static void processOcrDocument( + String projectId, String location, String processorId, String filePath) + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // Initialize client that will be used to send requests. This client only needs to be created + // once, and can be reused for multiple requests. After completing all of your requests, call + // the "close" method on the client to safely clean up any remaining background resources. + try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + String name = + String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId); + + // Read the file. + byte[] imageFileData = Files.readAllBytes(Paths.get(filePath)); + + // Convert the image data to a Buffer and base64 encode it. + ByteString content = ByteString.copyFrom(imageFileData); + + RawDocument document = + RawDocument.newBuilder().setContent(content).setMimeType("application/pdf").build(); + + // Configure the process request. + ProcessRequest request = + ProcessRequest.newBuilder().setName(name).setRawDocument(document).build(); + + // Recognizes text entities in the PDF document + ProcessResponse result = client.processDocument(request); + Document documentResponse = result.getDocument(); + + System.out.println("Document processing complete."); + + // Read the text recognition output from the processor + // For a full list of Document object attributes, + // please reference this page: + // https://googleapis.dev/java/google-cloud-document-ai/latest/index.html + + // Get all of the document text as one big string + String text = documentResponse.getText(); + System.out.printf("Full document text: '%s'\n", escapeNewlines(text)); + + // Read the text recognition output from the processor + List pages = documentResponse.getPagesList(); + System.out.printf("There are %s page(s) in this document.\n", pages.size()); + + for (Document.Page page : pages) { + System.out.printf("Page %d:\n", page.getPageNumber()); + printPageDimensions(page.getDimension()); + printDetectedLanguages(page.getDetectedLanguagesList()); + printParagraphs(page.getParagraphsList(), text); + printBlocks(page.getBlocksList(), text); + printLines(page.getLinesList(), text); + printTokens(page.getTokensList(), text); + } + } + } + + private static void printPageDimensions(Document.Page.Dimension dimension) { + String unit = dimension.getUnit(); + System.out.printf(" Width: %.1f %s\n", dimension.getWidth(), unit); + System.out.printf(" Height: %.1f %s\n", dimension.getHeight(), unit); + } + + private static void printDetectedLanguages( + List detectedLangauges) { + System.out.println(" Detected languages:"); + for (Document.Page.DetectedLanguage detectedLanguage : detectedLangauges) { + String languageCode = detectedLanguage.getLanguageCode(); + float confidence = detectedLanguage.getConfidence(); + System.out.printf(" %s (%.2f%%)\n", languageCode, confidence * 100.0); + } + } + + private static void printParagraphs(List paragraphs, String text) { + System.out.printf(" %d paragraphs detected:\n", paragraphs.size()); + Document.Page.Paragraph firstParagraph = paragraphs.get(0); + String firstParagraphText = getLayoutText(firstParagraph.getLayout().getTextAnchor(), text); + System.out.printf(" First paragraph text: %s\n", escapeNewlines(firstParagraphText)); + Document.Page.Paragraph lastParagraph = paragraphs.get(paragraphs.size() - 1); + String lastParagraphText = getLayoutText(lastParagraph.getLayout().getTextAnchor(), text); + System.out.printf(" Last paragraph text: %s\n", escapeNewlines(lastParagraphText)); + } + + private static void printBlocks(List blocks, String text) { + System.out.printf(" %d blocks detected:\n", blocks.size()); + Document.Page.Block firstBlock = blocks.get(0); + String firstBlockText = getLayoutText(firstBlock.getLayout().getTextAnchor(), text); + System.out.printf(" First block text: %s\n", escapeNewlines(firstBlockText)); + Document.Page.Block lastBlock = blocks.get(blocks.size() - 1); + String lastBlockText = getLayoutText(lastBlock.getLayout().getTextAnchor(), text); + System.out.printf(" Last block text: %s\n", escapeNewlines(lastBlockText)); + } + + private static void printLines(List lines, String text) { + System.out.printf(" %d lines detected:\n", lines.size()); + Document.Page.Line firstLine = lines.get(0); + String firstLineText = getLayoutText(firstLine.getLayout().getTextAnchor(), text); + System.out.printf(" First line text: %s\n", escapeNewlines(firstLineText)); + Document.Page.Line lastLine = lines.get(lines.size() - 1); + String lastLineText = getLayoutText(lastLine.getLayout().getTextAnchor(), text); + System.out.printf(" Last line text: %s\n", escapeNewlines(lastLineText)); + } + + private static void printTokens(List tokens, String text) { + System.out.printf(" %d tokens detected:\n", tokens.size()); + Document.Page.Token firstToken = tokens.get(0); + String firstTokenText = getLayoutText(firstToken.getLayout().getTextAnchor(), text); + System.out.printf(" First token text: %s\n", escapeNewlines(firstTokenText)); + Document.Page.Token lastToken = tokens.get(tokens.size() - 1); + String lastTokenText = getLayoutText(lastToken.getLayout().getTextAnchor(), text); + System.out.printf(" Last token text: %s\n", escapeNewlines(lastTokenText)); + } + + // Extract shards from the text field + private static String getLayoutText(Document.TextAnchor textAnchor, String text) { + if (textAnchor.getTextSegmentsList().size() > 0) { + int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex(); + int endIdx = (int) textAnchor.getTextSegments(0).getEndIndex(); + return text.substring(startIdx, endIdx); + } + return "[NO TEXT]"; + } + + private static String escapeNewlines(String s) { + return s.replace("\n", "\\n").replace("\r", "\\r"); + } +} +// [END documentai_process_ocr_document] diff --git a/samples/snippets/src/main/java/documentai/v1beta3/ProcessQualityDocument.java b/samples/snippets/src/main/java/documentai/v1beta3/ProcessQualityDocument.java new file mode 100644 index 00000000..3e80a574 --- /dev/null +++ b/samples/snippets/src/main/java/documentai/v1beta3/ProcessQualityDocument.java @@ -0,0 +1,98 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package documentai.v1beta3; + +// [START documentai_process_quality_document] + +import com.google.cloud.documentai.v1beta3.Document; +import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient; +import com.google.cloud.documentai.v1beta3.ProcessRequest; +import com.google.cloud.documentai.v1beta3.ProcessResponse; +import com.google.cloud.documentai.v1beta3.RawDocument; +import com.google.protobuf.ByteString; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; + +public class ProcessQualityDocument { + public static void processQualityDocument() + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // TODO(developer): Replace these variables before running the sample. + String projectId = "your-project-id"; + String location = "your-project-location"; // Format is "us" or "eu". + String processerId = "your-processor-id"; + String filePath = "path/to/input/file.pdf"; + processQualityDocument(projectId, location, processerId, filePath); + } + + public static void processQualityDocument( + String projectId, String location, String processorId, String filePath) + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // Initialize client that will be used to send requests. This client only needs to be created + // once, and can be reused for multiple requests. After completing all of your requests, call + // the "close" method on the client to safely clean up any remaining background resources. + try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + String name = + String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId); + + // Read the file. + byte[] imageFileData = Files.readAllBytes(Paths.get(filePath)); + + // Convert the image data to a Buffer and base64 encode it. + ByteString content = ByteString.copyFrom(imageFileData); + + RawDocument document = + RawDocument.newBuilder().setContent(content).setMimeType("application/pdf").build(); + + // Configure the process request. + ProcessRequest request = + ProcessRequest.newBuilder().setName(name).setRawDocument(document).build(); + + // Recognizes text entities in the PDF document + ProcessResponse result = client.processDocument(request); + Document documentResponse = result.getDocument(); + + System.out.println("Document processing complete."); + + // Read the quality-specific information from the output from the + // Intelligent Document Quality Processor: + // https://cloud.google.com/document-ai/docs/processors-list#processor_doc-quality-processor + // OCR and other data is also present in the quality processor's response. + // Please see the OCR and other samples for how to parse other data in the + // response. + List entities = documentResponse.getEntitiesList(); + for (Document.Entity entity : entities) { + float entityConfidence = entity.getConfidence(); + long pageNumber = entity.getPageAnchor().getPageRefs(0).getPage() + 1; + System.out.printf( + "Page %d has a quality score of (%.2f%%):\n", pageNumber, entityConfidence * 100.0); + for (Document.Entity property : entity.getPropertiesList()) { + float propertyConfidence = property.getConfidence(); + String propertyType = property.getType(); + System.out.printf(" * %s score of %.2f%%\n", propertyType, propertyConfidence * 100.0); + } + } + } + } +} +// [END documentai_process_quality_document] diff --git a/samples/snippets/src/main/java/documentai/v1beta3/ProcessSpecializedDocument.java b/samples/snippets/src/main/java/documentai/v1beta3/ProcessSpecializedDocument.java new file mode 100644 index 00000000..5cbb1af1 --- /dev/null +++ b/samples/snippets/src/main/java/documentai/v1beta3/ProcessSpecializedDocument.java @@ -0,0 +1,106 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package documentai.v1beta3; + +// [START documentai_process_specialized_document] + +import com.google.cloud.documentai.v1beta3.Document; +import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient; +import com.google.cloud.documentai.v1beta3.ProcessRequest; +import com.google.cloud.documentai.v1beta3.ProcessResponse; +import com.google.cloud.documentai.v1beta3.RawDocument; +import com.google.protobuf.ByteString; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; + +public class ProcessSpecializedDocument { + public static void processSpecializedDocument() + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // TODO(developer): Replace these variables before running the sample. + String projectId = "your-project-id"; + String location = "your-project-location"; // Format is "us" or "eu". + String processerId = "your-processor-id"; + String filePath = "path/to/input/file.pdf"; + processSpecializedDocument(projectId, location, processerId, filePath); + } + + public static void processSpecializedDocument( + String projectId, String location, String processorId, String filePath) + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // Initialize client that will be used to send requests. This client only needs to be created + // once, and can be reused for multiple requests. After completing all of your requests, call + // the "close" method on the client to safely clean up any remaining background resources. + try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + String name = + String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId); + + // Read the file. + byte[] imageFileData = Files.readAllBytes(Paths.get(filePath)); + + // Convert the image data to a Buffer and base64 encode it. + ByteString content = ByteString.copyFrom(imageFileData); + + RawDocument document = + RawDocument.newBuilder().setContent(content).setMimeType("application/pdf").build(); + + // Configure the process request. + ProcessRequest request = + ProcessRequest.newBuilder().setName(name).setRawDocument(document).build(); + + // Recognizes text entities in the PDF document + ProcessResponse result = client.processDocument(request); + Document documentResponse = result.getDocument(); + + System.out.println("Document processing complete."); + + // Read fields specificly from the specalized US drivers license processor: + // https://cloud.google.com/document-ai/docs/processors-list#processor_us-driver-license-parser + // retriving data from other specalized processors follow a similar pattern. + // For a complete list of processors see: + // https://cloud.google.com/document-ai/docs/processors-list + // + // OCR and other data is also present in the quality processor's response. + // Please see the OCR and other samples for how to parse other data in the + // response. + for (Document.Entity entity : documentResponse.getEntitiesList()) { + // Fields detected. For a full list of fields for each processor see + // the processor documentation: + // https://cloud.google.com/document-ai/docs/processors-list + String entityType = entity.getType(); + // some other value formats in addition to text are availible + // e.g. dates: `entity.getNormalizedValue().getDateValue().getYear()` + // check for normilized value with `entity.hasNormalizedValue()` + String entityTextValue = escapeNewlines(entity.getTextAnchor().getContent()); + float entityConfidence = entity.getConfidence(); + System.out.printf( + " * %s: %s (%.2f%% confident)\n", + entityType, entityTextValue, entityConfidence * 100.0); + } + } + } + + private static String escapeNewlines(String s) { + return s.replace("\n", "\\n").replace("\r", "\\r"); + } +} +// [END documentai_process_specialized_document] diff --git a/samples/snippets/src/main/java/documentai/v1beta3/ProcessSplitterDocument.java b/samples/snippets/src/main/java/documentai/v1beta3/ProcessSplitterDocument.java new file mode 100644 index 00000000..e63e2f8e --- /dev/null +++ b/samples/snippets/src/main/java/documentai/v1beta3/ProcessSplitterDocument.java @@ -0,0 +1,112 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package documentai.v1beta3; + +// [START documentai_process_splitter_document] + +import com.google.cloud.documentai.v1beta3.Document; +import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient; +import com.google.cloud.documentai.v1beta3.ProcessRequest; +import com.google.cloud.documentai.v1beta3.ProcessResponse; +import com.google.cloud.documentai.v1beta3.RawDocument; +import com.google.protobuf.ByteString; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; + +public class ProcessSplitterDocument { + public static void processSplitterDocument() + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // TODO(developer): Replace these variables before running the sample. + String projectId = "your-project-id"; + String location = "your-project-location"; // Format is "us" or "eu". + String processerId = "your-processor-id"; + String filePath = "path/to/input/file.pdf"; + processSplitterDocument(projectId, location, processerId, filePath); + } + + public static void processSplitterDocument( + String projectId, String location, String processorId, String filePath) + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // Initialize client that will be used to send requests. This client only needs to be created + // once, and can be reused for multiple requests. After completing all of your requests, call + // the "close" method on the client to safely clean up any remaining background resources. + try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + String name = + String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId); + + // Read the file. + byte[] imageFileData = Files.readAllBytes(Paths.get(filePath)); + + // Convert the image data to a Buffer and base64 encode it. + ByteString content = ByteString.copyFrom(imageFileData); + + RawDocument document = + RawDocument.newBuilder().setContent(content).setMimeType("application/pdf").build(); + + // Configure the process request. + ProcessRequest request = + ProcessRequest.newBuilder().setName(name).setRawDocument(document).build(); + + // Recognizes text entities in the PDF document + ProcessResponse result = client.processDocument(request); + Document documentResponse = result.getDocument(); + + System.out.println("Document processing complete."); + + // Read the splitter output from the document splitter processor: + // https://cloud.google.com/document-ai/docs/processors-list#processor_doc-splitter + // This processor only provides text for the document and information on how + // to split the document on logical boundaries. To identify and extract text, + // form elements, and entities please see other processors like the OCR, form, + // and specalized processors. + List entities = documentResponse.getEntitiesList(); + System.out.printf("Found %d subdocuments:\n", entities.size()); + for (Document.Entity entity : entities) { + float entityConfidence = entity.getConfidence(); + String pagesRangeText = pageRefsToString(entity.getPageAnchor().getPageRefsList()); + String subdocumentType = entity.getType(); + if (subdocumentType.isEmpty()) { + System.out.printf( + "%.2f%% confident that %s a subdocument.\n", entityConfidence * 100, pagesRangeText); + } else { + System.out.printf( + "%.2f%% confident that %s a '%s' subdocument.\n", + entityConfidence * 100, pagesRangeText, subdocumentType); + } + } + } + } + + // Converts page reference(s) to a string describing the page or page range. + private static String pageRefsToString(List pageRefs) { + if (pageRefs.size() == 1) { + return String.format("page %d is", pageRefs.get(0).getPage() + 1); + } else { + long start = pageRefs.get(0).getPage() + 1; + long end = pageRefs.get(1).getPage() + 1; + return String.format("pages %d to %d are", start, end); + } + } +} +// [END documentai_process_splitter_document] diff --git a/samples/snippets/src/test/java/documentai/v1beta3/ProcessFormDocumentTest.java b/samples/snippets/src/test/java/documentai/v1beta3/ProcessFormDocumentTest.java new file mode 100644 index 00000000..7491d744 --- /dev/null +++ b/samples/snippets/src/test/java/documentai/v1beta3/ProcessFormDocumentTest.java @@ -0,0 +1,78 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package documentai.v1beta3; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertNotNull; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class ProcessFormDocumentTest { + private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT"); + private static final String PROCESSOR_ID = "88541adc6eeec481"; + private static final String FILE_PATH = "resources/invoice.pdf"; + + private ByteArrayOutputStream bout; + private PrintStream out; + private PrintStream originalPrintStream; + + private static void requireEnvVar(String varName) { + assertNotNull( + String.format("Environment variable '%s' must be set to perform these tests.", varName), + System.getenv(varName)); + } + + @Before + public void checkRequirements() { + requireEnvVar("GOOGLE_CLOUD_PROJECT"); + requireEnvVar("GOOGLE_APPLICATION_CREDENTIALS"); + } + + @Before + public void setUp() { + bout = new ByteArrayOutputStream(); + out = new PrintStream(bout); + originalPrintStream = System.out; + System.setOut(out); + } + + @Test + public void testProcessFormDocument() + throws InterruptedException, ExecutionException, IOException, TimeoutException { + // parse the GCS invoice as a form. + ProcessFormDocument.processFormDocument(PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH); + String got = bout.toString(); + + assertThat(got).contains("There are 1 page(s) in this document."); + assertThat(got).contains("Table with 4 columns and 6 rows"); + assertThat(got).contains("Found 13 form fields"); + assertThat(got).contains("'BALANCE DUE': '$2140.00'"); + } + + @After + public void tearDown() { + System.out.flush(); + System.setOut(originalPrintStream); + } +} diff --git a/samples/snippets/src/test/java/documentai/v1beta3/ProcessOcrDocumentTest.java b/samples/snippets/src/test/java/documentai/v1beta3/ProcessOcrDocumentTest.java new file mode 100644 index 00000000..0c2da471 --- /dev/null +++ b/samples/snippets/src/test/java/documentai/v1beta3/ProcessOcrDocumentTest.java @@ -0,0 +1,77 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package documentai.v1beta3; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertNotNull; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class ProcessOcrDocumentTest { + private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT"); + private static final String PROCESSOR_ID = "f9018d35bc5edc1e"; + private static final String FILE_PATH = "resources/handwritten_form.pdf"; + + private ByteArrayOutputStream bout; + private PrintStream out; + private PrintStream originalPrintStream; + + private static void requireEnvVar(String varName) { + assertNotNull( + String.format("Environment variable '%s' must be set to perform these tests.", varName), + System.getenv(varName)); + } + + @Before + public void checkRequirements() { + requireEnvVar("GOOGLE_CLOUD_PROJECT"); + requireEnvVar("GOOGLE_APPLICATION_CREDENTIALS"); + } + + @Before + public void setUp() { + bout = new ByteArrayOutputStream(); + out = new PrintStream(bout); + originalPrintStream = System.out; + System.setOut(out); + } + + @Test + public void testProcessOcrDocument() + throws InterruptedException, ExecutionException, IOException, TimeoutException { + // parse the GCS invoice as a form. + ProcessOcrDocument.processOcrDocument(PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH); + String got = bout.toString(); + + assertThat(got).contains("Page 1"); + assertThat(got).contains("en"); + assertThat(got).contains("FakeDoc"); + } + + @After + public void tearDown() { + System.out.flush(); + System.setOut(originalPrintStream); + } +} diff --git a/samples/snippets/src/test/java/documentai/v1beta3/ProcessQualityDocumentTest.java b/samples/snippets/src/test/java/documentai/v1beta3/ProcessQualityDocumentTest.java new file mode 100644 index 00000000..7379dbf0 --- /dev/null +++ b/samples/snippets/src/test/java/documentai/v1beta3/ProcessQualityDocumentTest.java @@ -0,0 +1,77 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package documentai.v1beta3; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertNotNull; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class ProcessQualityDocumentTest { + private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT"); + private static final String PROCESSOR_ID = "f80f55e03d4c20ed"; + private static final String FILE_PATH = "resources/document_quality_poor.pdf"; + + private ByteArrayOutputStream bout; + private PrintStream out; + private PrintStream originalPrintStream; + + private static void requireEnvVar(String varName) { + assertNotNull( + String.format("Environment variable '%s' must be set to perform these tests.", varName), + System.getenv(varName)); + } + + @Before + public void checkRequirements() { + requireEnvVar("GOOGLE_CLOUD_PROJECT"); + requireEnvVar("GOOGLE_APPLICATION_CREDENTIALS"); + } + + @Before + public void setUp() { + bout = new ByteArrayOutputStream(); + out = new PrintStream(bout); + originalPrintStream = System.out; + System.setOut(out); + } + + @Test + public void testProcessQualityDocument() + throws InterruptedException, ExecutionException, IOException, TimeoutException { + // parse the GCS invoice as a form. + ProcessQualityDocument.processQualityDocument(PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH); + String got = bout.toString(); + + assertThat(got).contains("Page 1 has a quality score of"); + assertThat(got).contains("defect_blurry score of 9"); + assertThat(got).contains("defect_noisy"); + } + + @After + public void tearDown() { + System.out.flush(); + System.setOut(originalPrintStream); + } +} diff --git a/samples/snippets/src/test/java/documentai/v1beta3/ProcessSpecializedDocumentTest.java b/samples/snippets/src/test/java/documentai/v1beta3/ProcessSpecializedDocumentTest.java new file mode 100644 index 00000000..5f5b21d0 --- /dev/null +++ b/samples/snippets/src/test/java/documentai/v1beta3/ProcessSpecializedDocumentTest.java @@ -0,0 +1,77 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package documentai.v1beta3; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertNotNull; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class ProcessSpecializedDocumentTest { + private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT"); + private static final String PROCESSOR_ID = "ae8bc99f01b36b5e"; + private static final String FILE_PATH = "resources/us_driver_license.pdf"; + + private ByteArrayOutputStream bout; + private PrintStream out; + private PrintStream originalPrintStream; + + private static void requireEnvVar(String varName) { + assertNotNull( + String.format("Environment variable '%s' must be set to perform these tests.", varName), + System.getenv(varName)); + } + + @Before + public void checkRequirements() { + requireEnvVar("GOOGLE_CLOUD_PROJECT"); + requireEnvVar("GOOGLE_APPLICATION_CREDENTIALS"); + } + + @Before + public void setUp() { + bout = new ByteArrayOutputStream(); + out = new PrintStream(bout); + originalPrintStream = System.out; + System.setOut(out); + } + + @Test + public void testProcessSpecializedDocument() + throws InterruptedException, ExecutionException, IOException, TimeoutException { + // parse the GCS invoice as a form. + ProcessSpecializedDocument.processSpecializedDocument( + PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH); + String got = bout.toString(); + + assertThat(got).contains("Document Id"); + assertThat(got).contains("97551579"); + } + + @After + public void tearDown() { + System.out.flush(); + System.setOut(originalPrintStream); + } +} diff --git a/samples/snippets/src/test/java/documentai/v1beta3/ProcessSplitterDocumentTest.java b/samples/snippets/src/test/java/documentai/v1beta3/ProcessSplitterDocumentTest.java new file mode 100644 index 00000000..8fcf7aaf --- /dev/null +++ b/samples/snippets/src/test/java/documentai/v1beta3/ProcessSplitterDocumentTest.java @@ -0,0 +1,77 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package documentai.v1beta3; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertNotNull; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class ProcessSplitterDocumentTest { + private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT"); + private static final String PROCESSOR_ID = "7cb010d65184a4d"; + private static final String FILE_PATH = "resources/multi_document.pdf"; + + private ByteArrayOutputStream bout; + private PrintStream out; + private PrintStream originalPrintStream; + + private static void requireEnvVar(String varName) { + assertNotNull( + String.format("Environment variable '%s' must be set to perform these tests.", varName), + System.getenv(varName)); + } + + @Before + public void checkRequirements() { + requireEnvVar("GOOGLE_CLOUD_PROJECT"); + requireEnvVar("GOOGLE_APPLICATION_CREDENTIALS"); + } + + @Before + public void setUp() { + bout = new ByteArrayOutputStream(); + out = new PrintStream(bout); + originalPrintStream = System.out; + System.setOut(out); + } + + @Test + public void testProcessSplitterDocument() + throws InterruptedException, ExecutionException, IOException, TimeoutException { + // parse the GCS invoice as a form. + ProcessSplitterDocument.processSplitterDocument(PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH); + String got = bout.toString(); + + assertThat(got).contains("Found 8 subdocuments"); + assertThat(got).contains("confident that pages 1 to 2 are a subdocument"); + assertThat(got).contains("confident that page 10 is a subdocument"); + } + + @After + public void tearDown() { + System.out.flush(); + System.setOut(originalPrintStream); + } +}