diff --git a/CHANGELOG.md b/CHANGELOG.md index fbc9ace711..a3cd424026 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ ### Fixes -* **Fix `clean_pdfminer_inner_elements()` to remove only pdfminer (embedded) elements merged with inferred elements** Previously, some embedded elements were removed even if they were not merged with inferred elements. Now, only embedded elements that are already merged with inferred elements are removed. +* **Fix `clean_pdfminer_inner_elements()` to remove only pdfminer (embedded) elements merged with inferred elements**. Previously, some embedded elements were removed even if they were not merged with inferred elements. Now, only embedded elements that are already merged with inferred elements are removed. * **Clarify IAM Role Requirement for GCS Platform Connectors**. The GCS Source Connector requires Storage Object Viewer and GCS Destination Connector requires Storage Object Creator IAM roles. * **Change table extraction defaults** Change table extraction defaults in favor of using `skip_infer_table_types` parameter and reflect these changes in documentation. * **Fix OneDrive dates with inconsistent formatting** Adds logic to conditionally support dates returned by office365 that may vary in date formatting or may be a datetime rather than a string. 
See previous fix for SharePoint diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py index ec228ab946..1c8ec23fa9 100644 --- a/test_unstructured/partition/pdf_image/test_ocr.py +++ b/test_unstructured/partition/pdf_image/test_ocr.py @@ -15,6 +15,7 @@ from unstructured.documents.elements import ElementType from unstructured.partition.pdf_image import ocr from unstructured.partition.pdf_image.ocr import pad_element_bboxes +from unstructured.partition.utils.config import env_config from unstructured.partition.utils.constants import ( Source, ) @@ -267,7 +268,7 @@ def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions): for ocr_element in ocr_elements: if ocr_element.bbox.is_almost_subregion_of( element.bbox, - ocr.SUBREGION_THRESHOLD_FOR_OCR, + env_config.OCR_LAYOUT_SUBREGION_THRESHOLD, ): assert ocr_element not in final_layout diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py index 9b6122427b..84f7b0f071 100644 --- a/unstructured/partition/pdf_image/ocr.py +++ b/unstructured/partition/pdf_image/ocr.py @@ -18,7 +18,6 @@ OCR_AGENT_PADDLE_OLD, OCR_AGENT_TESSERACT, OCR_AGENT_TESSERACT_OLD, - SUBREGION_THRESHOLD_FOR_OCR, OCRMode, ) from unstructured.partition.utils.ocr_models.ocr_interface import ( @@ -349,7 +348,6 @@ def merge_out_layout_with_ocr_layout( out_region.text = aggregate_ocr_text_by_block( ocr_layout, out_region, - SUBREGION_THRESHOLD_FOR_OCR, ) final_layout = ( @@ -364,7 +362,7 @@ def merge_out_layout_with_ocr_layout( def aggregate_ocr_text_by_block( ocr_layout: List["TextRegion"], region: "TextRegion", - subregion_threshold: float, + subregion_threshold: float = env_config.OCR_LAYOUT_SUBREGION_THRESHOLD, ) -> Optional[str]: """Extracts the text aggregated from the regions of the ocr layout that lie within the given block.""" @@ -374,7 +372,7 @@ def aggregate_ocr_text_by_block( for ocr_region in ocr_layout: 
ocr_region_is_subregion_of_given_region = ocr_region.bbox.is_almost_subregion_of( region.bbox, - subregion_threshold=subregion_threshold, + subregion_threshold, ) if ocr_region_is_subregion_of_given_region and ocr_region.text: extracted_texts.append(ocr_region.text) @@ -386,6 +384,7 @@ def aggregate_ocr_text_by_block( def supplement_layout_with_ocr_elements( layout: List["LayoutElement"], ocr_layout: List["TextRegion"], + subregion_threshold: float = env_config.OCR_LAYOUT_SUBREGION_THRESHOLD, ) -> List["LayoutElement"]: """ Supplement the existing layout with additional OCR-derived elements. @@ -410,7 +409,7 @@ def supplement_layout_with_ocr_elements( is a subregion of an existing layout element. - It also relies on `build_layout_elements_from_ocr_regions()` to convert OCR regions to layout elements. - - The `SUBREGION_THRESHOLD_FOR_OCR` constant is used to specify the subregion matching + - The env_config `OCR_LAYOUT_SUBREGION_THRESHOLD` is used to specify the subregion matching threshold. 
""" @@ -423,7 +422,7 @@ def supplement_layout_with_ocr_elements( for el in layout: ocr_region_is_subregion_of_out_el = ocr_region.bbox.is_almost_subregion_of( el.bbox, - SUBREGION_THRESHOLD_FOR_OCR, + subregion_threshold, ) if ocr_region_is_subregion_of_out_el: ocr_regions_to_remove.append(ocr_region) diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index 23e916e0a0..151c90210d 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -94,5 +94,15 @@ def EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD(self) -> int: """ return self._get_int("EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD", 0) + @property + def OCR_LAYOUT_SUBREGION_THRESHOLD(self) -> float: + """Threshold to determine if an OCR region is a sub-region of a given block + when aggregating the text from OCR'd elements that lie within the given block. + + When the intersection area divided by the OCR region's own area is larger than this + threshold, the OCR region is considered a subregion of the other region. + """ + return self._get_float("OCR_LAYOUT_SUBREGION_THRESHOLD", 0.5) + env_config = ENVConfig() diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py index a3f1a44ccc..7258e7ecaf 100644 --- a/unstructured/partition/utils/constants.py +++ b/unstructured/partition/utils/constants.py @@ -36,7 +36,6 @@ class PartitionStrategy: "unstructured.partition.utils.ocr_models.paddle_ocr", ).split(",") -SUBREGION_THRESHOLD_FOR_OCR = 0.5 UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False) # Note(yuming): Default language for paddle OCR