Skip to content

Commit

Permalink
Merge pull request #253 from ArtifexSoftware/dothinking-dev
Browse files Browse the repository at this point in the history
Adjustments for upstream upgrades
  • Loading branch information
jamie-lemon committed Jan 19, 2024
2 parents 4d7dab0 + cbcbefe commit 5e48dbe
Show file tree
Hide file tree
Showing 19 changed files with 332 additions and 366 deletions.
40 changes: 0 additions & 40 deletions .github/workflows/doc.yml

This file was deleted.

14 changes: 8 additions & 6 deletions pdf2docx/common/Collection.py
Expand Up @@ -348,10 +348,12 @@ def split_with_intersection(self, bbox:fitz.Rect, threshold:float=1e-3):
for instance in self._instances:
# A contains B => A & B = B
intersection = instance.bbox & bbox
factor = round(intersection.get_area()/instance.bbox.get_area(), 2)

if factor >= threshold:
intersections.append(instance)
else:
if intersection.is_empty:
no_intersections.append(instance)
return self.__class__(intersections), self.__class__(no_intersections)
else:
factor = round(intersection.get_area()/instance.bbox.get_area(), 2)
if factor >= threshold:
intersections.append(instance)
else:
no_intersections.append(instance)
return self.__class__(intersections), self.__class__(no_intersections)
121 changes: 67 additions & 54 deletions pdf2docx/common/Element.py
@@ -1,10 +1,8 @@
# -*- coding: utf-8 -*-

'''Object with a bounding box, e.g. Block, Line, Span.
Based on ``PyMuPDF``, the coordinates (e.g. bbox of ``page.get_text('rawdict')``) are generally
provided relative to the un-rotated page; while this ``pdf2docx`` library works under real page
coordinate system, i.e. with rotation considered. So, any instances created by this Class are
Based on ``PyMuPDF``, the coordinates (e.g. bbox of ``page.get_text('rawdict')``) are generally
provided relative to the un-rotated page; while this ``pdf2docx`` library works under real page
coordinate system, i.e. with rotation considered. So, any instances created by this Class are
always applied a rotation matrix automatically.
Therefore, the bbox parameter used to create ``Element`` instance MUST be relative to un-rotated
Expand Down Expand Up @@ -36,7 +34,7 @@ def set_rotation_matrix(cls, rotation_matrix):
Args:
rotation_matrix (fitz.Matrix): target matrix
"""
"""
if rotation_matrix and isinstance(rotation_matrix, fitz.Matrix):
cls.ROTATION_MATRIX = rotation_matrix

Expand All @@ -49,20 +47,28 @@ def pure_rotation_matrix(cls):


def __init__(self, raw:dict=None, parent=None):
    '''Initialize Element and convert to the real (rotation considered) page CS.

    Args:
        raw (dict, optional): Raw properties. Only the ``bbox`` key is read here;
            when it is absent (or ``raw`` is ``None``) the bbox stays an empty ``fitz.Rect()``.
        parent (Element, optional): Stored as ``_parent``.
    '''
    self.bbox = fitz.Rect() # type: fitz.Rect
    self._parent = parent # type: Element

    # NOTE: Any coordinates provided in raw are in original page CS
    # (without considering page rotation), so the class-level rotation matrix
    # is applied before the bbox is stored.
    if 'bbox' in (raw or {}):
        rect = fitz.Rect(raw['bbox']) * Element.ROTATION_MATRIX
        self.update_bbox(rect)


def __bool__(self):
    '''Real object when bbox is defined.'''
    # NOTE: fitz.Rect gives inconsistent results across pymupdf versions, e.g.,
    #   a = fitz.Rect(3,3,2,2)
    #                     bool(a)   a.get_area()   a.is_empty
    #   pymupdf 1.23.5    True      1.0            True
    #   pymupdf 1.23.8    True      0.0            True
    # while bool(fitz.Rect())==False.
    # NOTE: do not use `return not self.bbox.is_empty` here
    return bool(self.bbox)


def __repr__(self):
    '''Return the class name followed by the bbox rendered as a tuple.'''
    cls_name = self.__class__.__name__
    coords = tuple(self.bbox)
    return f'{cls_name}({coords})'

Expand Down Expand Up @@ -98,18 +104,19 @@ def get_expand_bbox(self, dt:float):
Returns:
fitz.Rect: Expanded bbox.
.. note::
This method creates a new bbox, rather than changing the bbox of itself.
"""
"""
return self.bbox + (-dt, -dt, dt, dt)


def update_bbox(self, rect):
    '''Update current bbox to specified ``rect``.

    Args:
        rect (fitz.Rect or list): bbox-like ``(x0, y0, x1, y1)``,
            in real page CS (with rotation considered).

    Returns:
        Element: self
    '''
    # every coordinate is rounded to one decimal place before being stored
    self.bbox = fitz.Rect([round(x,1) for x in rect])
    return self
Expand All @@ -123,45 +130,44 @@ def union_bbox(self, e):
Returns:
Element: self
"""
"""
return self.update_bbox(self.bbox | e.bbox)


# --------------------------------------------
# location relationship to other Element instance
# --------------------------------------------
# --------------------------------------------
def contains(self, e:'Element', threshold:float=1.0):
    """Whether given element is contained in this instance, with margin considered.

    Args:
        e (Element): Target element
        threshold (float, optional): Intersection rate.
            Defaults to 1.0. The larger, the stricter.

    Returns:
        bool: ``True`` only when ``e`` has a non-zero area, the intersection ratio
        reaches ``threshold`` and the length check in the main direction passes.
    """
    # NOTE the case bool(e)=True but e.bbox.get_area()=0
    S = e.bbox.get_area()
    if not S: return False

    # it's not practical to set a general threshold to consider the margin, so two steps:
    # - set a coarse but acceptable area threshold,
    # - check the length in main direction strictly

    # A contains B => A & B = B
    intersection = self.bbox & e.bbox
    factor = round(intersection.get_area()/S, 2)
    if factor<threshold: return False

    # check length along the longer side of this instance
    if self.bbox.width >= self.bbox.height:
        return self.bbox.width+constants.MINOR_DIST >= e.bbox.width
    return self.bbox.height+constants.MINOR_DIST >= e.bbox.height


def get_main_bbox(self, e, threshold:float=0.95):
"""If the intersection with ``e`` exceeds the threshold, return the union of these two elements; else return None.
"""If the intersection with ``e`` exceeds the threshold, return the union of
these two elements; else return None.
Args:
e (Element): Target element.
Expand All @@ -172,43 +178,44 @@ def get_main_bbox(self, e, threshold:float=0.95):
"""
bbox_1 = self.bbox
bbox_2 = e.bbox if hasattr(e, 'bbox') else fitz.Rect(e)

# areas
b = bbox_1 & bbox_2
if not b: return None # no intersection

a1, a2, a = bbox_1.get_area(), bbox_2.get_area(), b.get_area()
if b.is_empty: return None # no intersection

# Note: if bbox_1 and bbox_2 intersect along only an edge, b is not empty but b.get_area()=0,
# so use a small value when they intersect but the area is zero
a1, a2, a = bbox_1.get_area(), bbox_2.get_area(), b.get_area()
factor = a/min(a1,a2) if a else 1e-6
return bbox_1 | bbox_2 if factor >= threshold else None


def vertically_align_with(self, e, factor:float=0.0, text_direction:bool=True):
'''Check whether two Element instances have enough intersection in vertical direction, i.e. perpendicular to reading direction.
'''Check whether two Element instances have enough intersection in vertical direction,
i.e. perpendicular to reading direction.
Args:
e (Element): Object to check with
factor (float, optional): Threshold of overlap ratio, the larger it is, the higher probability the two bbox-es are aligned.
text_direction (bool, optional): Consider text direction or not. True by default, from left to right if False.
factor (float, optional): Threshold of overlap ratio, the larger it is, the higher
probability the two bbox-es are aligned.
text_direction (bool, optional): Consider text direction or not. True by default.
Returns:
bool: [description]
Examples::
+--------------+
| |
+--------------+
+--------------+
L1
+-------------------+
| |
+-------------------+
L2
An enough intersection is defined based on the minimum width of two boxes::
L1+L2-L>factor*min(L1,L2)
'''
if not e or not bool(self): return False
Expand All @@ -225,29 +232,31 @@ def vertically_align_with(self, e, factor:float=0.0, text_direction:bool=True):


def horizontally_align_with(self, e, factor:float=0.0, text_direction:bool=True):
'''Check whether two Element instances have enough intersection in horizontal direction, i.e. along the reading direction.
'''Check whether two Element instances have enough intersection in horizontal direction,
i.e. along the reading direction.
Args:
e (Element): Element to check with
factor (float, optional): threshold of overlap ratio, the larger it is, the higher probability the two bbox-es are aligned.
text_direction (bool, optional): consider text direction or not. True by default, from left to right if False.
factor (float, optional): threshold of overlap ratio, the larger it is, the higher
probability the two bbox-es are aligned.
text_direction (bool, optional): consider text direction or not. True by default.
Examples::
+--------------+
| | L1 +--------------------+
+--------------+ | | L2
+--------------------+
An enough intersection is defined based on the minimum width of two boxes::
L1+L2-L>factor*min(L1,L2)
'''
if not e or not bool(self): return False

# text direction
idx = 0 if text_direction and self.is_vertical_text else 1

L1 = self.bbox[idx+2]-self.bbox[idx]
L2 = e.bbox[idx+2]-e.bbox[idx]
L = max(self.bbox[idx+2], e.bbox[idx+2]) - min(self.bbox[idx], e.bbox[idx])
Expand All @@ -257,21 +266,19 @@ def horizontally_align_with(self, e, factor:float=0.0, text_direction:bool=True)


def in_same_row(self, e):
"""Check whether in same row/line with specified Element instance. With text direction considered.
"""Check whether in same row/line with specified Element instance.
With text direction considered.
Taking horizontal text as an example:
* yes: the bottom edge of each box is lower than the centerline of the other one;
* otherwise, not in same row.
Args:
e (Element): Target object.
Returns:
bool: [description]
.. note::
The difference to method ``horizontally_align_with``: they may not in same line, though
The difference to method ``horizontally_align_with``: they may not in same line, though
aligned horizontally.
"""
if not e or self.is_horizontal_text != e.is_horizontal_text:
Expand All @@ -291,9 +298,15 @@ def in_same_row(self, e):
# ------------------------------------------------
def store(self):
    '''Store properties in raw dict.

    Returns:
        dict: A single ``bbox`` entry holding the current bbox as a plain tuple.
    '''
    return { 'bbox': tuple(self.bbox) }



def plot(self, page, stroke:tuple=(0,0,0), width:float=0.5, fill:tuple=None, dashes:str=None):
    '''Plot bbox in PDF page for debug purpose.

    Args:
        page: Object providing ``draw_rect``
            (presumably a ``fitz.Page`` -- confirm against callers).
        stroke (tuple, optional): Passed as ``color``. Defaults to ``(0,0,0)``.
        width (float, optional): Passed as ``width``. Defaults to 0.5.
        fill (tuple, optional): Passed as ``fill``. Defaults to None.
        dashes (str, optional): Passed as ``dashes``. Defaults to None.
    '''
    # overlay and fill opacity are fixed here; everything else comes from the caller
    page.draw_rect(self.bbox,
                   color=stroke,
                   fill=fill,
                   width=width,
                   dashes=dashes,
                   overlay=False,
                   fill_opacity=0.5)

0 comments on commit 5e48dbe

Please sign in to comment.