refactor: use env_config instead of SUBREGION_THRESHOLD_FOR_OCR constant (#2697)

The purpose of this PR is to introduce a new env_config for the subregion threshold for OCR.

### Testing

CI should pass.
Unstructured-IO · Mar 28, 2024 · 887e6c9 · 887e6c9
1 parent c8cf8f3
commit 887e6c9
Show file tree

Hide file tree

Showing 5 changed files with 18 additions and 9 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,7 +14,7 @@
 
 ### Fixes
 
-* **Fix `clean_pdfminer_inner_elements()` to remove only pdfminer (embedded) elements merged with inferred elements** Previously, some embedded elements were removed even if they were not merged with inferred elements. Now, only embedded elements that are already merged with inferred elements are removed.
+* **Fix `clean_pdfminer_inner_elements()` to remove only pdfminer (embedded) elements merged with inferred elements**. Previously, some embedded elements were removed even if they were not merged with inferred elements. Now, only embedded elements that are already merged with inferred elements are removed.
 * **Clarify IAM Role Requirement for GCS Platform Connectors**. The GCS Source Connector requires Storage Object Viewer and GCS Destination Connector requires Storage Object Creator IAM roles.
 * **Change table extraction defaults** Change table extraction defaults in favor of using `skip_infer_table_types` parameter and reflect these changes in documentation.
 * **Fix OneDrive dates with inconsistent formatting** Adds logic to conditionally support dates returned by office365 that may vary in date formatting or may be a datetime rather than a string. See previous fix for SharePoint

diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py
@@ -15,6 +15,7 @@
 from unstructured.documents.elements import ElementType
 from unstructured.partition.pdf_image import ocr
 from unstructured.partition.pdf_image.ocr import pad_element_bboxes
+from unstructured.partition.utils.config import env_config
 from unstructured.partition.utils.constants import (
     Source,
 )
@@ -267,7 +268,7 @@ def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions):
         for ocr_element in ocr_elements:
             if ocr_element.bbox.is_almost_subregion_of(
                 element.bbox,
-                ocr.SUBREGION_THRESHOLD_FOR_OCR,
+                env_config.OCR_LAYOUT_SUBREGION_THRESHOLD,
             ):
                 assert ocr_element not in final_layout
 

diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py
@@ -18,7 +18,6 @@
     OCR_AGENT_PADDLE_OLD,
     OCR_AGENT_TESSERACT,
     OCR_AGENT_TESSERACT_OLD,
-    SUBREGION_THRESHOLD_FOR_OCR,
     OCRMode,
 )
 from unstructured.partition.utils.ocr_models.ocr_interface import (
@@ -349,7 +348,6 @@ def merge_out_layout_with_ocr_layout(
         out_region.text = aggregate_ocr_text_by_block(
             ocr_layout,
             out_region,
-            SUBREGION_THRESHOLD_FOR_OCR,
         )
 
     final_layout = (
@@ -364,7 +362,7 @@ def merge_out_layout_with_ocr_layout(
 def aggregate_ocr_text_by_block(
     ocr_layout: List["TextRegion"],
     region: "TextRegion",
-    subregion_threshold: float,
+    subregion_threshold: float = env_config.OCR_LAYOUT_SUBREGION_THRESHOLD,
 ) -> Optional[str]:
     """Extracts the text aggregated from the regions of the ocr layout that lie within the given
     block."""
@@ -374,7 +372,7 @@ def aggregate_ocr_text_by_block(
     for ocr_region in ocr_layout:
         ocr_region_is_subregion_of_given_region = ocr_region.bbox.is_almost_subregion_of(
             region.bbox,
-            subregion_threshold=subregion_threshold,
+            subregion_threshold,
         )
         if ocr_region_is_subregion_of_given_region and ocr_region.text:
             extracted_texts.append(ocr_region.text)
@@ -386,6 +384,7 @@ def aggregate_ocr_text_by_block(
 def supplement_layout_with_ocr_elements(
     layout: List["LayoutElement"],
     ocr_layout: List["TextRegion"],
+    subregion_threshold: float = env_config.OCR_LAYOUT_SUBREGION_THRESHOLD,
 ) -> List["LayoutElement"]:
     """
     Supplement the existing layout with additional OCR-derived elements.
@@ -410,7 +409,7 @@ def supplement_layout_with_ocr_elements(
       is a subregion of an existing layout element.
     - It also relies on `build_layout_elements_from_ocr_regions()` to convert OCR regions to
      layout elements.
-    - The `SUBREGION_THRESHOLD_FOR_OCR` constant is used to specify the subregion matching
+    - The env_config `OCR_LAYOUT_SUBREGION_THRESHOLD` is used to specify the subregion matching
      threshold.
     """
 
@@ -423,7 +422,7 @@ def supplement_layout_with_ocr_elements(
         for el in layout:
             ocr_region_is_subregion_of_out_el = ocr_region.bbox.is_almost_subregion_of(
                 el.bbox,
-                SUBREGION_THRESHOLD_FOR_OCR,
+                subregion_threshold,
             )
             if ocr_region_is_subregion_of_out_el:
                 ocr_regions_to_remove.append(ocr_region)

diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
@@ -94,5 +94,15 @@ def EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD(self) -> int:
         """
         return self._get_int("EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD", 0)
 
+    @property
+    def OCR_LAYOUT_SUBREGION_THRESHOLD(self) -> float:
+        """threshold to determine if an OCR region is a sub-region of a given block
+        when aggregating the text from OCR'd elements that lie within the given block
+
+        When the intersection region area divided by self area is larger than this threshold self is
+        considered a subregion of the other
+        """
+        return self._get_float("OCR_LAYOUT_SUBREGION_THRESHOLD", 0.5)
+
 
 env_config = ENVConfig()
diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py
@@ -36,7 +36,6 @@ class PartitionStrategy:
     "unstructured.partition.utils.ocr_models.paddle_ocr",
 ).split(",")
 
-SUBREGION_THRESHOLD_FOR_OCR = 0.5
 UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)
 
 # Note(yuming): Default language for paddle OCR