Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adjustments for upstream upgrades #253

Merged
merged 8 commits into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
40 changes: 0 additions & 40 deletions .github/workflows/doc.yml

This file was deleted.

14 changes: 8 additions & 6 deletions pdf2docx/common/Collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,10 +348,12 @@ def split_with_intersection(self, bbox:fitz.Rect, threshold:float=1e-3):
for instance in self._instances:
# A contains B => A & B = B
intersection = instance.bbox & bbox
factor = round(intersection.get_area()/instance.bbox.get_area(), 2)

if factor >= threshold:
intersections.append(instance)
else:
if intersection.is_empty:
no_intersections.append(instance)
return self.__class__(intersections), self.__class__(no_intersections)
else:
factor = round(intersection.get_area()/instance.bbox.get_area(), 2)
if factor >= threshold:
intersections.append(instance)
else:
no_intersections.append(instance)
return self.__class__(intersections), self.__class__(no_intersections)
115 changes: 65 additions & 50 deletions pdf2docx/common/Element.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-

'''Object with a bounding box, e.g. Block, Line, Span.

Based on ``PyMuPDF``, the coordinates (e.g. bbox of ``page.get_text('rawdict')``) are generally
Expand Down Expand Up @@ -49,20 +47,28 @@ def pure_rotation_matrix(cls):


def __init__(self, raw:dict=None, parent=None):
''' Initialize Element and convert to the real (rotation considered) page coordinate system.'''
''' Initialize Element and convert to the real (rotation considered) page CS.'''
self.bbox = fitz.Rect() # type: fitz.Rect
self._parent = parent # type: Element

# NOTE: Any coordinates provided in raw is in original page CS (without considering page rotation).
# NOTE: Any coordinates provided in raw is in original page CS
# (without considering page rotation).
if 'bbox' in (raw or {}):
rect = fitz.Rect(raw['bbox']) * Element.ROTATION_MATRIX
self.update_bbox(rect)


def __bool__(self):
'''Real object when bbox is defined.'''
# NOTE inconsistent results of fitz.Rect for different version of pymupdf, e.g.,
# a = fitz.Rect(3,3,2,2)
# bool(a) a.get_area() a.is_empty
# pymupdf 1.23.5 True 1.0 True
# pymupdf 1.23.8 True 0.0 True
# bool(fitz.Rect())==False
# NOTE: do not use `return not self.bbox.is_empty` here
return bool(self.bbox)


def __repr__(self): return f'{self.__class__.__name__}({tuple(self.bbox)})'

Expand Down Expand Up @@ -98,18 +104,19 @@ def get_expand_bbox(self, dt:float):

Returns:
fitz.Rect: Expanded bbox.

.. note::
This method creates a new bbox, rather than changing the bbox of itself.
"""
"""
return self.bbox + (-dt, -dt, dt, dt)


def update_bbox(self, rect):
'''Update current bbox to specified ``rect``.

Args:
rect (fitz.Rect or list): bbox-like ``(x0, y0, x1, y1)`` in real page CS (with rotation considered).
rect (fitz.Rect or list): bbox-like ``(x0, y0, x1, y1)``,
in real page CS (with rotation considered).
'''
self.bbox = fitz.Rect([round(x,1) for x in rect])
return self
Expand All @@ -123,45 +130,44 @@ def union_bbox(self, e):

Returns:
Element: self
"""
"""
return self.update_bbox(self.bbox | e.bbox)


# --------------------------------------------
# location relationship to other Element instance
# --------------------------------------------
# --------------------------------------------
def contains(self, e:'Element', threshold:float=1.0):
"""Whether given element is contained in this instance, with margin considered.

Args:
e (Element): Target element
threshold (float, optional): Intersection rate. Defaults to 1.0. The larger, the stricter.
threshold (float, optional): Intersection rate.
Defaults to 1.0. The larger, the stricter.

Returns:
bool: [description]
"""
# NOTE the case bool(e)=True but e.bbox.get_area()=0
S = e.bbox.get_area()
if not S: return False
if not S: return False

# it's not practical to set a general threshold to consider the margin, so two steps:
# - set a coarse but acceptable area threshold,
# - check the length in main direction strictly

# A contains B => A & B = B
intersection = self.bbox & e.bbox
factor = round(intersection.get_area()/e.bbox.get_area(), 2)
factor = round(intersection.get_area()/S, 2)
if factor<threshold: return False

# check length
if self.bbox.width >= self.bbox.height:
return self.bbox.width+constants.MINOR_DIST >= e.bbox.width
else:
return self.bbox.height+constants.MINOR_DIST >= e.bbox.height

return self.bbox.height+constants.MINOR_DIST >= e.bbox.height


def get_main_bbox(self, e, threshold:float=0.95):
"""If the intersection with ``e`` exceeds the threshold, return the union of these two elements; else return None.
"""If the intersection with ``e`` exceeds the threshold, return the union of
these two elements; else return None.

Args:
e (Element): Target element.
Expand All @@ -172,43 +178,45 @@ def get_main_bbox(self, e, threshold:float=0.95):
"""
bbox_1 = self.bbox
bbox_2 = e.bbox if hasattr(e, 'bbox') else fitz.Rect(e)

# areas
b = bbox_1 & bbox_2
if not b: return None # no intersection

a1, a2, a = bbox_1.get_area(), bbox_2.get_area(), b.get_area()
if b.is_empty: return None # no intersection

# Note: if bbox_1 and bbox_2 intersects with only an edge, b is not empty but b.get_area()=0
# so give a small value when they're intersected but the area is zero
a1, a2, a = bbox_1.get_area(), bbox_2.get_area(), b.get_area()
factor = a/min(a1,a2) if a else 1e-6
return bbox_1 | bbox_2 if factor >= threshold else None


def vertically_align_with(self, e, factor:float=0.0, text_direction:bool=True):
'''Check whether two Element instances have enough intersection in vertical direction, i.e. perpendicular to reading direction.

'''Check whether two Element instances have enough intersection in vertical direction,
i.e. perpendicular to reading direction.

Args:
e (Element): Object to check with
factor (float, optional): Threshold of overlap ratio, the larger it is, the higher probability the two bbox-es are aligned.
text_direction (bool, optional): Consider text direction or not. True by default, from left to right if False.
factor (float, optional): Threshold of overlap ratio, the larger it is, the higher
probability the two bbox-es are aligned.
text_direction (bool, optional): Consider text direction or not.
True by default, from left to right if False.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't really understand what this means "True by default, from left to right if False".

So if the text_direction is True then we do consider text direction - okay, but if it is False then we don't consider text directions, however when I read this it seems like False means we consider a left to right text direction as it says "from left to right if False". I'm confused!

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see any contradiction in your reading: don't consider text directions -> ignore the real text directions -> use the default text direction -> the most common case, horizontal, i.e., from left to right.

I would appreciate it if you could help with more precise wording.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I understand it, not considering text direction means using the default text direction, which is from left to right.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose the bit I don't understand, then, is: if True, what is the text direction? Basically:

False = from left to right
True = ?

Also, if it is False, then we do consider the text direction (left to right), don't we? That is why "text_direction (bool, optional): Consider text direction or not." confuses me!

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I understand your confusion. When False, the words "from left to right" do not refer to the text direction, but to the default direction for "horizontal". I should just keep "True by default" and delete the rest.

image

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay - yes please just delete to avoid the confusion.

Returns:
bool: [description]

Examples::

+--------------+
| |
+--------------+
+--------------+
L1
+-------------------+
| |
+-------------------+
L2

An enough intersection is defined based on the minimum width of two boxes::

L1+L2-L>factor*min(L1,L2)
'''
if not e or not bool(self): return False
Expand All @@ -225,29 +233,32 @@ def vertically_align_with(self, e, factor:float=0.0, text_direction:bool=True):


def horizontally_align_with(self, e, factor:float=0.0, text_direction:bool=True):
'''Check whether two Element instances have enough intersection in horizontal direction, i.e. along the reading direction.

'''Check whether two Element instances have enough intersection in horizontal direction,
i.e. along the reading direction.

Args:
e (Element): Element to check with
factor (float, optional): threshold of overlap ratio, the larger it is, the higher probability the two bbox-es are aligned.
text_direction (bool, optional): consider text direction or not. True by default, from left to right if False.
factor (float, optional): threshold of overlap ratio, the larger it is, the higher
probability the two bbox-es are aligned.
text_direction (bool, optional): consider text direction or not.
True by default, from left to right if False.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See previous comment.

Examples::

+--------------+
| | L1 +--------------------+
+--------------+ | | L2
+--------------------+

An enough intersection is defined based on the minimum width of two boxes::

L1+L2-L>factor*min(L1,L2)
'''
if not e or not bool(self): return False

# text direction
idx = 0 if text_direction and self.is_vertical_text else 1

L1 = self.bbox[idx+2]-self.bbox[idx]
L2 = e.bbox[idx+2]-e.bbox[idx]
L = max(self.bbox[idx+2], e.bbox[idx+2]) - min(self.bbox[idx], e.bbox[idx])
Expand All @@ -257,21 +268,19 @@ def horizontally_align_with(self, e, factor:float=0.0, text_direction:bool=True)


def in_same_row(self, e):
"""Check whether in same row/line with specified Element instance. With text direction considered.

"""Check whether in same row/line with specified Element instance.
With text direction considered.

Taking horizontal text as an example:

* yes: the bottom edge of each box is lower than the centerline of the other one;
* otherwise, not in same row.

Args:
e (Element): Target object.

Returns:
bool: [description]

.. note::
The difference to method ``horizontally_align_with``: they may not in same line, though
The difference to method ``horizontally_align_with``: they may not in same line, though
aligned horizontally.
"""
if not e or self.is_horizontal_text != e.is_horizontal_text:
Expand All @@ -291,9 +300,15 @@ def in_same_row(self, e):
# ------------------------------------------------
def store(self):
'''Store properties in raw dict.'''
return { 'bbox': tuple([x for x in self.bbox]) }
return { 'bbox': tuple(x for x in self.bbox) }



def plot(self, page, stroke:tuple=(0,0,0), width:float=0.5, fill:tuple=None, dashes:str=None):
'''Plot bbox in PDF page for debug purpose.'''
page.draw_rect(self.bbox, color=stroke, fill=fill, width=width, dashes=dashes, overlay=False, fill_opacity=0.5)
page.draw_rect(self.bbox,
color=stroke,
fill=fill,
width=width,
dashes=dashes,
overlay=False,
fill_opacity=0.5)