Skip to content

Commit

Permalink
Spell checked /common, /font, /gui, /image, /layout, /page, /table, /…
Browse files Browse the repository at this point in the history
…text, converter.py and main.py along with other files; changes 2 constant's spelling in all occurrences.
  • Loading branch information
beginner-cryptonyx authored and dothinking committed Apr 5, 2023
1 parent 6ca5f19 commit 24ee434
Show file tree
Hide file tree
Showing 21 changed files with 38 additions and 38 deletions.
2 changes: 1 addition & 1 deletion pdf2docx/common/Block.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def parse_horizontal_spacing(self, bbox, *args):
bbox (fitz.rect): boundary box of this block.
"""
# NOTE: in PyMuPDF CS, horizontal text direction is same with positive x-axis,
# while vertical text is on the contrarory, so use f = -1 here
# while vertical text is on the contrary, so use f = -1 here
idx, f = (0, 1.0) if self.is_horizontal_text else (3, -1.0)
self.alignment = TextAlignment.LEFT
self.left_space = (self.bbox[idx] - bbox[idx]) * f
Expand Down
4 changes: 2 additions & 2 deletions pdf2docx/common/Element.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def vertically_align_with(self, e, factor:float=0.0, text_direction:bool=True):
L2 = e.bbox[idx+2]-e.bbox[idx]
L = max(self.bbox[idx+2], e.bbox[idx+2]) - min(self.bbox[idx], e.bbox[idx])

eps = 1e-3 # tolerent
eps = 1e-3 # tolerant
return L1+L2-L+eps >= factor*min(L1,L2)


Expand Down Expand Up @@ -252,7 +252,7 @@ def horizontally_align_with(self, e, factor:float=0.0, text_direction:bool=True)
L2 = e.bbox[idx+2]-e.bbox[idx]
L = max(self.bbox[idx+2], e.bbox[idx+2]) - min(self.bbox[idx], e.bbox[idx])

eps = 1e-3 # tolerent
eps = 1e-3 # tolerant
return L1+L2-L+eps >= factor*min(L1,L2)


Expand Down
4 changes: 2 additions & 2 deletions pdf2docx/common/algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def solve_rects_intersection(V:list, num:int, index_groups:list):
S22 = list(filter( lambda item: item[1][0]>X, right ))
S21 = list(filter( lambda item: item[1][0]<=X0, right ))

# intersection in x-direction is fullfilled, so check y-direction further
# intersection in x-direction is fulfilled, so check y-direction further
_stab(S12, S22, index_groups)
_stab(S21, S11, index_groups)
_stab(S12, S21, index_groups)
Expand Down Expand Up @@ -290,7 +290,7 @@ def inner_contours(img_binary:np.array, bbox:tuple, min_w:float, min_h:float):
'''Inner contours of current region, especially level 2 contours of the default opencv tree hierarchy.
Args:
img_binary (np.array): Binarized image with intresting region (255) and empty region (0).
img_binary (np.array): Binarized image with interesting region (255) and empty region (0).
bbox (tuple): The external bbox.
min_w (float): Ignore contours if the bbox width is less than this value.
min_h (float): Ignore contours if the bbox height is less than this value.
Expand Down
4 changes: 2 additions & 2 deletions pdf2docx/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@
# -------------------------------------
HIDDEN_W_BORDER = 0.0 # do not show border
MIN_LINE_SPACING = 0.7 # minimum line spacing available in MS word
DEFULT_LINE_SPACING = 1.02
DEFAULT_LINE_SPACING = 1.02

# punctuation implying end of a sentense
SENTENSE_END_PUNC = '..。??!!'
SENTENCE_END_PUNC = '..。??!!'

# control characters not supported by lxml
# https://github.com/dothinking/pdf2docx/issues/126#issuecomment-1040034077
Expand Down
4 changes: 2 additions & 2 deletions pdf2docx/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class Converter:
* Read PDF file with ``PyMuPDF`` to get raw layout data page by page, including text,
image, drawing and its properties, e.g. boundary box, font, size, image width, height.
* Analyze layout in document level, e.g. page header, footer and margin.
* Parse page layout to docx structure, e.g. paragraph and its properties like indentaton,
* Parse page layout to docx structure, e.g. paragraph and its properties like indentation,
spacing, text alignment; table and its properties like border, shading, merging.
* Finally, generate docx with ``python-docx``.
'''
Expand Down Expand Up @@ -96,7 +96,7 @@ def default_settings(self):
'lines_left_aligned_threshold' : 1.0, # left aligned if d_x0 of two lines is lower than this value (Pt)
'lines_right_aligned_threshold' : 1.0, # right aligned if d_x1 of two lines is lower than this value (Pt)
'lines_center_aligned_threshold' : 2.0, # center aligned if delta center of two lines is lower than this value
'clip_image_res_ratio' : 4.0, # resolution ratio (to 72dpi) when cliping page image
'clip_image_res_ratio' : 4.0, # resolution ratio (to 72dpi) when clipping page image
'min_svg_gap_dx' : 15.0, # merge adjacent vector graphics if the horizontal gap is less than this value
'min_svg_gap_dy' : 2.0, # merge adjacent vector graphics if the vertical gap is less than this value
'min_svg_w' : 2.0, # ignore vector graphics if the bbox width is less than this value
Expand Down
2 changes: 1 addition & 1 deletion pdf2docx/font/Fonts.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def extract(cls, fitz_doc):
# - cff: Adobe Compact File Format, i.e. Type 1 font
assert ext not in ('n/a', 'cff'), "base font or not supported font"

# try to get more font metrices with fonttool
# try to get more font metrics with fonttool
tt = TTFont(BytesIO(buffer))
name = cls.get_font_family_name(tt)
line_height = cls.get_line_height_factor(tt)
Expand Down
2 changes: 1 addition & 1 deletion pdf2docx/image/Image.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def store(self):
'''Store image with base64 encode.
* Encode image bytes with base64 -> base64 bytes
* Decode base64 bytes -> str -> so can be serialized in json formart
* Decode base64 bytes -> str -> so can be serialized in json format
'''
res = super().store()
res.update({
Expand Down
2 changes: 1 addition & 1 deletion pdf2docx/image/ImagesExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def extract_images(self, clip_image_res_ratio:float=3.0):

# The final view might be formed by several images with alpha channel only, as shown in issue-123.
# It's still inconvenient to extract the original alpha/mask image, as a compromise, extract the
# equivalent image by cliping the union page region for now.
# equivalent image by clipping the union page region for now.
# https://github.com/dothinking/pdf2docx/issues/123

# step 1: collect images: [(bbox, item), ..., ]
Expand Down
2 changes: 1 addition & 1 deletion pdf2docx/layout/Blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,7 +503,7 @@ def close_text_block():

@staticmethod
def _split_text_block_vertically(instances:list, line_break_free_space_ratio:float, new_paragraph_free_space_ratio:float):
'''Split text block into separate paragraph based on punctuation of sentense.
'''Split text block into separate paragraph based on punctuation of sentence.
.. note::
Considered only normal reading direction, from left to right, from top
Expand Down
2 changes: 1 addition & 1 deletion pdf2docx/layout/Layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
table. In addition to ``Section`` and ``Column``, ``TableBlock`` is used to maintain the page layout .
So, detecting and parsing table block is the principle steps.
The prerequite work is done before this step:
The prerequisite work is done before this step:
1. Clean up source blocks and shapes in Page level, e.g. convert source blocks to ``Line`` level,
because the block structure determined by ``PyMuPDF`` might be not reasonable.
Expand Down
2 changes: 1 addition & 1 deletion pdf2docx/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def table(pdf_file, password:str=None, start:int=0, end:int=None, pages:list=Non
@staticmethod
def gui():
'''Simple user interface.'''
# import App containing tkinter internally, in case GUI is not supported by some platdorm,
# import App containing tkinter internally, in case GUI is not supported by some platforms,
# e.g. Amazon Linux 2
from .gui.App import App
app = App(title='PDF_2_Docx Converter', width=500, height=600)
Expand Down
2 changes: 1 addition & 1 deletion pdf2docx/page/Pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class Pages(BaseCollection):
'''A collection of ``Page``.'''

def parse(self, fitz_doc, **settings):
'''Analyse document structure, e.g. page section, header, footer.
'''Analyze document structure, e.g. page section, header, footer.
Args:
fitz_doc (fitz.Document): ``PyMuPDF`` Document instance.
Expand Down
2 changes: 1 addition & 1 deletion pdf2docx/page/RawPage.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def calculate_margin(self, **settings):
Ensure this method is run right after cleaning up the layout, so the page margin is
calculated based on valid layout, and stay constant.
"""
# Exclude hyperlink from shapes because hyperlink might exist out of page unreasonablely,
# Exclude hyperlink from shapes because hyperlink might exist out of page unreasonably,
# while it should always within page since attached to text.
shapes = Shapes([shape for shape in self.shapes if not isinstance(shape, Hyperlink)])

Expand Down
4 changes: 2 additions & 2 deletions pdf2docx/shape/Paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def to_shapes(self):

def to_shapes_and_images(self, min_svg_gap_dx:float=15, min_svg_gap_dy:float=15,
min_w:float=2, min_h:float=2, clip_image_res_ratio:float=3.0):
'''Convert paths to iso-oriented shapes or images. The sementic type of path is either table/text style or
'''Convert paths to iso-oriented shapes or images. The semantic type of path is either table/text style or
vector graphic. This method is to:
* detect svg regions -> exist at least one non-iso-oriented path
* convert svg to bitmap by clipping page
Expand All @@ -91,7 +91,7 @@ def to_shapes_and_images(self, min_svg_gap_dx:float=15, min_svg_gap_dy:float=15,
Returns:
tuple: (list of shape raw dict, list of image raw dict).
'''
# convert all paths to shapes if no non-iso-orientied path exists
# convert all paths to shapes if no non-iso-oriented path exists
iso_shapes = []
if self.is_iso_oriented:
iso_shapes.extend(self.to_shapes())
Expand Down
10 changes: 5 additions & 5 deletions pdf2docx/shape/Shape.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def store(self):


def parse_semantic_type(self, blocks:list):
'''Determin semantic type based on the position to text blocks. Note the results might be
'''Determine semantic type based on the position to text blocks. Note the results might be
a combination of raw types, e.g. the semantic type of a stroke can be either text strike,
underline or table border.
Expand Down Expand Up @@ -217,7 +217,7 @@ def update_bbox(self, rect):

@property
def default_type(self):
'''Default sementic type for a Stroke shape: table border, underline or strike-through.'''
'''Default semantic type for a Stroke shape: table border, underline or strike-through.'''
return RectType.BORDER.value | RectType.UNDERLINE.value | RectType.STRIKE.value

def _semantic_type(self, line):
Expand Down Expand Up @@ -294,11 +294,11 @@ def to_stroke(self, max_border_width:float):

@property
def default_type(self):
'''Default sementic type for a Fill shape: table shading or text highlight.'''
'''Default semantic type for a Fill shape: table shading or text highlight.'''
return RectType.SHADING.value | RectType.HIGHLIGHT.value

def _semantic_type(self, line):
'''Override. Check semantic type based on the position to a text line. Along the main dimesion,
'''Override. Check semantic type based on the position to a text line. Along the main dimension,
text highlight never exceeds text line.
Args:
Expand Down Expand Up @@ -357,7 +357,7 @@ def store(self):

@property
def default_type(self):
'''Default sementic type for a Hyperlink: always hyperlink.'''
'''Default semantic type for a Hyperlink: always hyperlink.'''
return RectType.HYPERLINK.value

def parse_semantic_type(self, blocks:list=None):
Expand Down
6 changes: 3 additions & 3 deletions pdf2docx/table/Border.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

'''Module to determin stream table borders.
'''Module to determine stream table borders.
Though no exact borders exist for stream table, it's better to simplify table structure by
aligning borders as more as possible. Taking vertical borders for example, it can be moved
Expand Down Expand Up @@ -197,7 +197,7 @@ def finalize_by_stroke(self, stroke:Stroke):
* The border-like stroke may be an underline or strike-through.
'''
# NOTE: don't do this: `if self.finalized: continue`,
# because `self.finalized` just determed the main dimension, still need a chance to determin
# because `self.finalized` just determined the main dimension, still need a chance to determine
# the other dimension.

if self.is_horizontal:
Expand Down Expand Up @@ -334,7 +334,7 @@ def _finalize_by_layout(borders:list):
# check intersection status of each intervals
x_status = [] # [(x, status), ...]
for i in range(len(x_points)-1):
x = (x_points[i]+x_points[i+1])/2.0 # cenper point
x = (x_points[i]+x_points[i+1])/2.0 # center point
s = [int(border.is_valid(x)) for border in borders]
x_status.append((x,s))

Expand Down
2 changes: 1 addition & 1 deletion pdf2docx/table/TablesConstructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ def _inner_borders(lines:Lines, outer_borders:tuple):
* Rebuild layout, e.g. text layout with two columns, and
* parsing real borderless table.
It's controdictory that the former needn't to deep into row level, just ``1xN`` table
It's contradictory that the former needn't to deep into row level, just ``1xN`` table
convenient for layout recreation; instead, the latter should, ``MxN`` table for each
cell precisely. So, the principle determining stream tables borders:
Expand Down
2 changes: 1 addition & 1 deletion pdf2docx/text/Char.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class Char(Element):
def __init__(self, raw:dict=None):
if raw is None: raw = {}

# Note to filter control character avoiding error when makeing docx, #126
# Note to filter control character avoiding error when making docx, #126
c = raw.get('c', '')
if c in INVALID_CHARS: c = ''
self.c = c
Expand Down
8 changes: 4 additions & 4 deletions pdf2docx/text/Lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class Lines(ElementCollection):

@property
def unique_parent(self):
'''Whether all contained lines have same parant.'''
'''Whether all contained lines have same parent.'''
if not bool(self): return False

first_line = self._instances[0]
Expand Down Expand Up @@ -65,9 +65,9 @@ def split_vertically_by_text(self, line_break_free_space_ratio:float, new_paragr
# check row by row
res = []
lines = Lines()
punc = tuple(constants.SENTENSE_END_PUNC)
punc = tuple(constants.SENTENCE_END_PUNC)
start_of_para = end_of_para = False # start/end of paragraph
start_of_sen = end_of_sen = False # start/end of sentense
start_of_sen = end_of_sen = False # start/end of sentence
for row in rows:
end_of_sen = row[-1].text.strip().endswith(punc)
w = row[-1].bbox[2]-row[0].bbox[0]
Expand All @@ -76,7 +76,7 @@ def split_vertically_by_text(self, line_break_free_space_ratio:float, new_paragr
if end_of_sen and w/W <= 1.0-line_break_free_space_ratio:
end_of_para = True

# start of sentense and free space at the start -> start of paragraph
# start of sentence and free space at the start -> start of paragraph
elif start_of_sen and (W-w)/H >= new_paragraph_free_space_ratio:
start_of_para = True

Expand Down
4 changes: 2 additions & 2 deletions pdf2docx/text/TextBlock.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def parse_relative_line_spacing(self):
# return default line spacing if any images exists
for line in self.lines:
if list(span for span in line.spans if isinstance(span, ImageSpan)):
self.line_space = constants.DEFULT_LINE_SPACING
self.line_space = constants.DEFAULT_LINE_SPACING
return

# otherwise, calculate average line spacing
Expand All @@ -243,7 +243,7 @@ def parse_relative_line_spacing(self):
line_space = block_height/standard_height

# overlap may exist when multi-rows, so set minimum spacing -> default spacing
if len(rows)>1: line_space = max(line_space, constants.DEFULT_LINE_SPACING)
if len(rows)>1: line_space = max(line_space, constants.DEFAULT_LINE_SPACING)
self.line_space = line_space


Expand Down
6 changes: 3 additions & 3 deletions pdf2docx/text/TextSpan.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ def _parse_text_format(self, rect:Shape, horizontal:bool=True):
"""Parse text style based on the position to a rect shape.
Args:
rect (Shape): Target rect shape reprenting potential text style.
rect (Shape): Target rect shape representing potential text style.
horizontal (bool, optional): Horizontal text direction. Defaults to True.
Returns:
Expand Down Expand Up @@ -347,7 +347,7 @@ def intersects(self, rect):
if not rect.intersects(self.bbox):
return TextSpan()

# furcher check chars in span
# further check chars in span
span = self.copy()
span.chars.clear()
span.update_bbox((0.0,0.0,0.0,0.0))
Expand Down Expand Up @@ -379,7 +379,7 @@ def make_docx(self, paragraph):
# set text style, e.g. font, underline and highlight
self._set_text_format(docx_run)

# set charaters spacing
# set characters spacing
if self.char_spacing:
docx.set_char_spacing(docx_run, self.char_spacing)

Expand Down

0 comments on commit 24ee434

Please sign in to comment.