Skip to content

Commit

Permalink
refactor: use env_config instead of SUBREGION_THRESHOLD_FOR_OCR con…
Browse files Browse the repository at this point in the history
…stant (#2697)

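The purpose of this PR is to introduce a new `env_config` setting,
`OCR_LAYOUT_SUBREGION_THRESHOLD`, for the subregion threshold used in OCR, replacing the hard-coded `SUBREGION_THRESHOLD_FOR_OCR` constant.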

### Testing
CI should pass.
  • Loading branch information
christinestraub committed Mar 28, 2024
1 parent c8cf8f3 commit 887e6c9
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 9 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Expand Up @@ -14,7 +14,7 @@

### Fixes

* **Fix `clean_pdfminer_inner_elements()` to remove only pdfminer (embedded) elements merged with inferred elements** Previously, some embedded elements were removed even if they were not merged with inferred elements. Now, only embedded elements that are already merged with inferred elements are removed.
* **Fix `clean_pdfminer_inner_elements()` to remove only pdfminer (embedded) elements merged with inferred elements**. Previously, some embedded elements were removed even if they were not merged with inferred elements. Now, only embedded elements that are already merged with inferred elements are removed.
* **Clarify IAM Role Requirement for GCS Platform Connectors**. The GCS Source Connector requires Storage Object Viewer and GCS Destination Connector requires Storage Object Creator IAM roles.
* **Change table extraction defaults**. Change table extraction defaults in favor of using the `skip_infer_table_types` parameter and reflect these changes in the documentation.
* **Fix OneDrive dates with inconsistent formatting**. Adds logic to conditionally support dates returned by office365 that may vary in date formatting or may be a datetime rather than a string. See previous fix for SharePoint.
Expand Down
3 changes: 2 additions & 1 deletion test_unstructured/partition/pdf_image/test_ocr.py
Expand Up @@ -15,6 +15,7 @@
from unstructured.documents.elements import ElementType
from unstructured.partition.pdf_image import ocr
from unstructured.partition.pdf_image.ocr import pad_element_bboxes
from unstructured.partition.utils.config import env_config
from unstructured.partition.utils.constants import (
Source,
)
Expand Down Expand Up @@ -267,7 +268,7 @@ def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions):
for ocr_element in ocr_elements:
if ocr_element.bbox.is_almost_subregion_of(
element.bbox,
ocr.SUBREGION_THRESHOLD_FOR_OCR,
env_config.OCR_LAYOUT_SUBREGION_THRESHOLD,
):
assert ocr_element not in final_layout

Expand Down
11 changes: 5 additions & 6 deletions unstructured/partition/pdf_image/ocr.py
Expand Up @@ -18,7 +18,6 @@
OCR_AGENT_PADDLE_OLD,
OCR_AGENT_TESSERACT,
OCR_AGENT_TESSERACT_OLD,
SUBREGION_THRESHOLD_FOR_OCR,
OCRMode,
)
from unstructured.partition.utils.ocr_models.ocr_interface import (
Expand Down Expand Up @@ -349,7 +348,6 @@ def merge_out_layout_with_ocr_layout(
out_region.text = aggregate_ocr_text_by_block(
ocr_layout,
out_region,
SUBREGION_THRESHOLD_FOR_OCR,
)

final_layout = (
Expand All @@ -364,7 +362,7 @@ def merge_out_layout_with_ocr_layout(
def aggregate_ocr_text_by_block(
ocr_layout: List["TextRegion"],
region: "TextRegion",
subregion_threshold: float,
subregion_threshold: float = env_config.OCR_LAYOUT_SUBREGION_THRESHOLD,
) -> Optional[str]:
"""Extracts the text aggregated from the regions of the ocr layout that lie within the given
block."""
Expand All @@ -374,7 +372,7 @@ def aggregate_ocr_text_by_block(
for ocr_region in ocr_layout:
ocr_region_is_subregion_of_given_region = ocr_region.bbox.is_almost_subregion_of(
region.bbox,
subregion_threshold=subregion_threshold,
subregion_threshold,
)
if ocr_region_is_subregion_of_given_region and ocr_region.text:
extracted_texts.append(ocr_region.text)
Expand All @@ -386,6 +384,7 @@ def aggregate_ocr_text_by_block(
def supplement_layout_with_ocr_elements(
layout: List["LayoutElement"],
ocr_layout: List["TextRegion"],
subregion_threshold: float = env_config.OCR_LAYOUT_SUBREGION_THRESHOLD,
) -> List["LayoutElement"]:
"""
Supplement the existing layout with additional OCR-derived elements.
Expand All @@ -410,7 +409,7 @@ def supplement_layout_with_ocr_elements(
is a subregion of an existing layout element.
- It also relies on `build_layout_elements_from_ocr_regions()` to convert OCR regions to
layout elements.
- The `SUBREGION_THRESHOLD_FOR_OCR` constant is used to specify the subregion matching
- The `subregion_threshold` parameter, which defaults to the env_config
`OCR_LAYOUT_SUBREGION_THRESHOLD`, specifies the subregion matching threshold.
"""

Expand All @@ -423,7 +422,7 @@ def supplement_layout_with_ocr_elements(
for el in layout:
ocr_region_is_subregion_of_out_el = ocr_region.bbox.is_almost_subregion_of(
el.bbox,
SUBREGION_THRESHOLD_FOR_OCR,
subregion_threshold,
)
if ocr_region_is_subregion_of_out_el:
ocr_regions_to_remove.append(ocr_region)
Expand Down
10 changes: 10 additions & 0 deletions unstructured/partition/utils/config.py
Expand Up @@ -94,5 +94,15 @@ def EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD(self) -> int:
"""
return self._get_int("EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD", 0)

@property
def OCR_LAYOUT_SUBREGION_THRESHOLD(self) -> float:
"""Threshold for deciding whether an OCR region is a sub-region of a given block.

Used when aggregating the text from OCR'd elements that lie within the given block: when
the intersection area divided by the OCR region's own area is larger than this threshold,
the OCR region is considered a subregion of the block.

The value is looked up via `_get_float` under the name `OCR_LAYOUT_SUBREGION_THRESHOLD`
with a fallback of 0.5. NOTE(review): `_get_float` is defined outside this view; presumably
it reads an environment variable -- confirm.
"""
return self._get_float("OCR_LAYOUT_SUBREGION_THRESHOLD", 0.5)


env_config = ENVConfig()
1 change: 0 additions & 1 deletion unstructured/partition/utils/constants.py
Expand Up @@ -36,7 +36,6 @@ class PartitionStrategy:
"unstructured.partition.utils.ocr_models.paddle_ocr",
).split(",")

SUBREGION_THRESHOLD_FOR_OCR = 0.5
UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)

# Note(yuming): Default language for paddle OCR
Expand Down

0 comments on commit 887e6c9

Please sign in to comment.