/
doit.sh
executable file
·48 lines (42 loc) · 2.42 KB
/
doit.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/bin/bash
# Sample invocation of main.py: processes a PDF form and stores raw_entities
# as is in BQ.
#
# Required environment variables (as the script is written):
#   PROJECT_ID - used for the processor project, the source bucket name, the
#                BigQuery destination project and the output bucket names.
#   DATASET_ID - used for the BigQuery destination dataset.
#
# main.py is referenced by a relative path, so run this script from the
# directory that contains main.py.
#
# Developer: replace the data below (at least PROCESSOR_ID, FILE_NAME).

# Fail early with a clear message instead of calling main.py with empty
# arguments. Drop a check if you hard-code every value derived from it.
: "${PROJECT_ID:?PROJECT_ID must be set and non-empty}"
: "${DATASET_ID:?DATASET_ID must be set and non-empty}"

PROCESSOR_PROJECT_ID="${PROJECT_ID}" # The project id for the processor to be used
PROCESSOR_ID="_processor_id_" # The id of the processor to be used (original note: BSC)
BUCKET_NAME="${PROJECT_ID}" # The Google Cloud Storage bucket name for the source document. Example: 'split-docs'
FILE_NAME="path/to/form.pdf" # The file name for the source document within the bucket. Example: 'my-document-12.pdf'
DESTINATION_PROJECT_ID="${PROJECT_ID}" # The BigQuery project id for the destination
DESTINATION_DATASET_ID="${DATASET_ID}" # The BigQuery dataset id for the destination
DESTINATION_TABLE_ID="doc_reference" # The BigQuery table id for the destination
CONTENT_TYPE="application/pdf" # The MIME type of the document to be processed
PROCESSOR_LOCATION="us" # The location of the processor to be used
PARSING_METHODOLOGY="entities" # entities,form,normalized_values default="entities"
EXTRACTION_OUTPUT_BUCKET="${PROJECT_ID}-docai-output" # Passed as --extraction_output_bucket
ASYNC_OUTPUT_FOLDER_GCS_URI="gs://${PROJECT_ID}-docai-output" # Passed as --async_output_folder_gcs_uri

# Options that take a value, kept as flag/value pairs. Building the command
# line once in an array keeps the logged arguments and the real call in sync,
# and quoting each value protects it from word splitting and globbing.
value_args=(
  --bucket_name "${BUCKET_NAME}"
  --file_name "${FILE_NAME}"
  --content_type "${CONTENT_TYPE}"
  --processor_project_id "${PROCESSOR_PROJECT_ID}"
  --processor_location "${PROCESSOR_LOCATION}"
  --processor_id "${PROCESSOR_ID}"
  --extraction_output_bucket "${EXTRACTION_OUTPUT_BUCKET}"
  --parsing_methodology "${PARSING_METHODOLOGY}"
  --destination_project_id "${DESTINATION_PROJECT_ID}"
  --destination_dataset_id "${DESTINATION_DATASET_ID}"
  --destination_table_id "${DESTINATION_TABLE_ID}"
  --async_output_folder_gcs_uri "${ASYNC_OUTPUT_FOLDER_GCS_URI}"
)

# Options without a value.
flag_args=(
  --include_raw_entities
)

# Log the arguments, one option per line. printf reuses its format string
# until all arguments are consumed, so each flag/value pair gets its own line.
printf '%s %s\n' "${value_args[@]}"
printf '%s\n' "${flag_args[@]}"

# Last command: its exit status becomes the exit status of this script.
python main.py "${value_args[@]}" "${flag_args[@]}"