Spell checked /common, /font, /gui, /image, /layout, /page, /table, /…

…text, converter.py and main.py along with other files; changes 2 constants' spelling in all occurrences.
ArtifexSoftware · Apr 5, 2023 · 24ee434 · 24ee434
1 parent 6ca5f19
commit 24ee434
Show file tree

Hide file tree

Showing 21 changed files with 38 additions and 38 deletions.
diff --git a/pdf2docx/common/Block.py b/pdf2docx/common/Block.py
@@ -111,7 +111,7 @@ def parse_horizontal_spacing(self, bbox, *args):
             bbox (fitz.rect): boundary box of this block.
         """
         # NOTE: in PyMuPDF CS, horizontal text direction is same with positive x-axis,
-        # while vertical text is on the contrarory, so use f = -1 here
+        # while vertical text is on the contrary, so use f = -1 here
         idx, f = (0, 1.0) if self.is_horizontal_text else (3, -1.0)
         self.alignment = TextAlignment.LEFT
         self.left_space = (self.bbox[idx] - bbox[idx]) * f

diff --git a/pdf2docx/common/Element.py b/pdf2docx/common/Element.py
@@ -220,7 +220,7 @@ def vertically_align_with(self, e, factor:float=0.0, text_direction:bool=True):
         L2 = e.bbox[idx+2]-e.bbox[idx]
         L = max(self.bbox[idx+2], e.bbox[idx+2]) - min(self.bbox[idx], e.bbox[idx])
 
-        eps = 1e-3 # tolerent
+        eps = 1e-3 # tolerant
         return L1+L2-L+eps >= factor*min(L1,L2)
 
 
@@ -252,7 +252,7 @@ def horizontally_align_with(self, e, factor:float=0.0, text_direction:bool=True)
         L2 = e.bbox[idx+2]-e.bbox[idx]
         L = max(self.bbox[idx+2], e.bbox[idx+2]) - min(self.bbox[idx], e.bbox[idx])
 
-        eps = 1e-3 # tolerent
+        eps = 1e-3 # tolerant
         return L1+L2-L+eps >= factor*min(L1,L2)
 
 

diff --git a/pdf2docx/common/algorithm.py b/pdf2docx/common/algorithm.py
@@ -124,7 +124,7 @@ def solve_rects_intersection(V:list, num:int, index_groups:list):
     S22 = list(filter( lambda item: item[1][0]>X, right ))
     S21 = list(filter( lambda item: item[1][0]<=X0, right ))
 
-    # intersection in x-direction is fullfilled, so check y-direction further
+    # intersection in x-direction is fulfilled, so check y-direction further
     _stab(S12, S22, index_groups)
     _stab(S21, S11, index_groups)
     _stab(S12, S21, index_groups)
@@ -290,7 +290,7 @@ def inner_contours(img_binary:np.array, bbox:tuple, min_w:float, min_h:float):
     '''Inner contours of current region, especially level 2 contours of the default opencv tree hirerachy.
 
     Args:
-        img_binary (np.array): Binarized image with intresting region (255) and empty region (0).
+        img_binary (np.array): Binarized image with interesting region (255) and empty region (0).
         bbox (tuple): The external bbox.
         min_w (float): Ignore contours if the bbox width is less than this value.
         min_h (float): Ignore contours if the bbox height is less than this value.

diff --git a/pdf2docx/common/constants.py b/pdf2docx/common/constants.py
@@ -25,10 +25,10 @@
 # -------------------------------------
 HIDDEN_W_BORDER = 0.0   # do not show border
 MIN_LINE_SPACING = 0.7  # minimum line spacing available in MS word
-DEFULT_LINE_SPACING = 1.02
+DEFAULT_LINE_SPACING = 1.02
 
 # punctuation implying end of a sentense
-SENTENSE_END_PUNC = '.．。?？!！'
+SENTENCE_END_PUNC = '.．。?？!！'
 
 # control characters not supported by lxml
 # https://github.com/dothinking/pdf2docx/issues/126#issuecomment-1040034077

diff --git a/pdf2docx/converter.py b/pdf2docx/converter.py
@@ -28,7 +28,7 @@ class Converter:
     * Read PDF file with ``PyMuPDF`` to get raw layout data page by page, including text,
       image, drawing and its properties, e.g. boundary box, font, size, image width, height.
     * Analyze layout in document level, e.g. page header, footer and margin.
-    * Parse page layout to docx structure, e.g. paragraph and its properties like indentaton, 
+    * Parse page layout to docx structure, e.g. paragraph and its properties like indentation, 
       spacing, text alignment; table and its properties like border, shading, merging. 
     * Finally, generate docx with ``python-docx``.
     '''
@@ -96,7 +96,7 @@ def default_settings(self):
             'lines_left_aligned_threshold'   : 1.0,    # left aligned if d_x0 of two lines is lower than this value (Pt)
             'lines_right_aligned_threshold'  : 1.0,    # right aligned if d_x1 of two lines is lower than this value (Pt)
             'lines_center_aligned_threshold' : 2.0,    # center aligned if delta center of two lines is lower than this value
-            'clip_image_res_ratio'           : 4.0,    # resolution ratio (to 72dpi) when cliping page image
+            'clip_image_res_ratio'           : 4.0,    # resolution ratio (to 72dpi) when clipping page image
             'min_svg_gap_dx'                 : 15.0,   # merge adjacent vector graphics if the horizontal gap is less than this value
             'min_svg_gap_dy'                 : 2.0,    # merge adjacent vector graphics if the vertical gap is less than this value
             'min_svg_w'                      : 2.0,    # ignore vector graphics if the bbox width is less than this value

diff --git a/pdf2docx/font/Fonts.py b/pdf2docx/font/Fonts.py
@@ -77,7 +77,7 @@ def extract(cls, fitz_doc):
                 # - cff: Adobe Compact File Format, i.e. Type 1 font
                 assert ext not in ('n/a', 'cff'), "base font or not supported font"
 
-                # try to get more font metrices with fonttool
+                # try to get more font metrics with fonttool
                 tt = TTFont(BytesIO(buffer))
                 name = cls.get_font_family_name(tt)
                 line_height = cls.get_line_height_factor(tt)

diff --git a/pdf2docx/image/Image.py b/pdf2docx/image/Image.py
@@ -64,7 +64,7 @@ def store(self):
         '''Store image with base64 encode.
 
         * Encode image bytes with base64 -> base64 bytes
-        * Decode base64 bytes -> str -> so can be serialized in json formart
+        * Decode base64 bytes -> str -> so can be serialized in json format
         '''
         res = super().store()
         res.update({

diff --git a/pdf2docx/image/ImagesExtractor.py b/pdf2docx/image/ImagesExtractor.py
@@ -94,7 +94,7 @@ def extract_images(self, clip_image_res_ratio:float=3.0):
 
         # The final view might be formed by several images with alpha channel only, as shown in issue-123. 
         # It's still inconvenient to extract the original alpha/mask image, as a compromise, extract the 
-        # equivalent image by cliping the union page region for now.
+        # equivalent image by clipping the union page region for now.
         # https://github.com/dothinking/pdf2docx/issues/123
 
         # step 1: collect images: [(bbox, item), ..., ]

diff --git a/pdf2docx/layout/Blocks.py b/pdf2docx/layout/Blocks.py
@@ -503,7 +503,7 @@ def close_text_block():
 
     @staticmethod
     def _split_text_block_vertically(instances:list, line_break_free_space_ratio:float, new_paragraph_free_space_ratio:float):
-        '''Split text block into separate paragraph based on punctuation of sentense.
+        '''Split text block into separate paragraph based on punctuation of sentence.
 
         .. note::
             Considered only normal reading direction, from left to right, from top

diff --git a/pdf2docx/layout/Layout.py b/pdf2docx/layout/Layout.py
@@ -7,7 +7,7 @@
 table. In addition to ``Section`` and ``Column``, ``TableBlock`` is used to maintain the page layout . 
 So, detecting and parsing table block is the principle steps.
 
-The prerequite work is done before this step:
+The prerequisite work is done before this step:
 
 1. Clean up source blocks and shapes in Page level, e.g. convert source blocks to ``Line`` level,
    because the block structure determined by ``PyMuPDF`` might be not reasonable.

diff --git a/pdf2docx/main.py b/pdf2docx/main.py
@@ -97,7 +97,7 @@ def table(pdf_file, password:str=None, start:int=0, end:int=None, pages:list=Non
     @staticmethod
     def gui():
         '''Simple user interface.'''
-        # import App containing tkinter internally, in case GUI is not supported by some platdorm,
+        # import App containing tkinter internally, in case GUI is not supported by some platforms,
         # e.g. Amazon Linux 2
         from .gui.App import App
         app = App(title='PDF_2_Docx Converter', width=500, height=600)

diff --git a/pdf2docx/page/Pages.py b/pdf2docx/page/Pages.py
@@ -13,7 +13,7 @@ class Pages(BaseCollection):
     '''A collection of ``Page``.'''
 
     def parse(self, fitz_doc, **settings):
-        '''Analyse document structure, e.g. page section, header, footer.
+        '''Analyze document structure, e.g. page section, header, footer.
 
         Args:
             fitz_doc (fitz.Document): ``PyMuPDF`` Document instance.

diff --git a/pdf2docx/page/RawPage.py b/pdf2docx/page/RawPage.py
@@ -117,7 +117,7 @@ def calculate_margin(self, **settings):
             Ensure this method is run right after cleaning up the layout, so the page margin is 
             calculated based on valid layout, and stay constant.
         """
-        # Exclude hyperlink from shapes because hyperlink might exist out of page unreasonablely, 
+        # Exclude hyperlink from shapes because hyperlink might exist out of page unreasonably, 
         # while it should always within page since attached to text.
         shapes = Shapes([shape for shape in self.shapes if not isinstance(shape, Hyperlink)])
 

diff --git a/pdf2docx/shape/Paths.py b/pdf2docx/shape/Paths.py
@@ -75,7 +75,7 @@ def to_shapes(self):
 
     def to_shapes_and_images(self, min_svg_gap_dx:float=15, min_svg_gap_dy:float=15, 
                                 min_w:float=2, min_h:float=2, clip_image_res_ratio:float=3.0):
-        '''Convert paths to iso-oriented shapes or images. The sementic type of path is either table/text style or 
+        '''Convert paths to iso-oriented shapes or images. The semantic type of path is either table/text style or 
         vector graphic. This method is to:
         * detect svg regions -> exist at least one non-iso-oriented path
         * convert svg to bitmap by clipping page
@@ -91,7 +91,7 @@ def to_shapes_and_images(self, min_svg_gap_dx:float=15, min_svg_gap_dy:float=15,
         Returns:
             tuple: (list of shape raw dict, list of image raw dict).
         '''
-        # convert all paths to shapes if no non-iso-orientied path exists
+        # convert all paths to shapes if no non-iso-oriented path exists
         iso_shapes = []
         if self.is_iso_oriented:
             iso_shapes.extend(self.to_shapes())

diff --git a/pdf2docx/shape/Shape.py b/pdf2docx/shape/Shape.py
@@ -103,7 +103,7 @@ def store(self):
 
 
     def parse_semantic_type(self, blocks:list):
-        '''Determin semantic type based on the position to text blocks. Note the results might be 
+        '''Determine semantic type based on the position to text blocks. Note the results might be 
         a combination of raw types, e.g. the semantic type of a stroke can be either text strike,
         underline or table border.
 
@@ -217,7 +217,7 @@ def update_bbox(self, rect):
 
     @property
     def default_type(self):
-        '''Default sementic type for a Stroke shape: table border, underline or strike-through.'''
+        '''Default semantic type for a Stroke shape: table border, underline or strike-through.'''
         return RectType.BORDER.value | RectType.UNDERLINE.value | RectType.STRIKE.value
 
     def _semantic_type(self, line):
@@ -294,11 +294,11 @@ def to_stroke(self, max_border_width:float):
 
     @property
     def default_type(self):
-        '''Default sementic type for a Fill shape: table shading or text highlight.'''
+        '''Default semantic type for a Fill shape: table shading or text highlight.'''
         return RectType.SHADING.value | RectType.HIGHLIGHT.value
 
     def _semantic_type(self, line):
-        '''Override. Check semantic type based on the position to a text line. Along the main dimesion,
+        '''Override. Check semantic type based on the position to a text line. Along the main dimension,
         text highlight never exceeds text line.
 
         Args:
@@ -357,7 +357,7 @@ def store(self):
 
     @property
     def default_type(self):
-        '''Default sementic type for a Hyperlink: always hyperlink.'''
+        '''Default semantic type for a Hyperlink: always hyperlink.'''
         return RectType.HYPERLINK.value
 
     def parse_semantic_type(self, blocks:list=None):

diff --git a/pdf2docx/table/Border.py b/pdf2docx/table/Border.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-'''Module to determin stream table borders.
+'''Module to determine stream table borders.
 
 Though no exact borders exist for stream table, it's better to simplify table structure by
 aligning borders as more as possible. Taking vertical borders for example, it can be moved 
@@ -197,7 +197,7 @@ def finalize_by_stroke(self, stroke:Stroke):
             * The border-like stroke may be an underline or strike-through.      
         '''
         # NOTE: don't do this: `if self.finalized: continue`, 
-        # because `self.finalized` just determed the main dimension, still need a chance to determin 
+        # because `self.finalized` just determined the main dimension, still need a chance to determine 
         # the other dimension.         
 
         if self.is_horizontal:
@@ -334,7 +334,7 @@ def _finalize_by_layout(borders:list):
         # check intersection status of each intervals
         x_status = [] # [(x, status), ...]
         for i in range(len(x_points)-1):
-            x = (x_points[i]+x_points[i+1])/2.0 # cenper point
+            x = (x_points[i]+x_points[i+1])/2.0 # center point
             s = [int(border.is_valid(x)) for border in borders]
             x_status.append((x,s))
 

diff --git a/pdf2docx/table/TablesConstructor.py b/pdf2docx/table/TablesConstructor.py
@@ -300,7 +300,7 @@ def _inner_borders(lines:Lines, outer_borders:tuple):
         * Rebuild layout, e.g. text layout with two columns, and
         * parsing real borderless table.
 
-        It's controdictory that the former needn't to deep into row level, just ``1xN`` table 
+        It's contradictory that the former needn't go deep into row level, just ``1xN`` table 
         convenient for layout recreation; instead, the later should, ``MxN`` table for each 
         cell precisely. So, the principle determining stream tables borders:
 

diff --git a/pdf2docx/text/Char.py b/pdf2docx/text/Char.py
@@ -22,7 +22,7 @@ class Char(Element):
     def __init__(self, raw:dict=None):
         if raw is None: raw = {}
 
-        # Note to filter control character avoiding error when makeing docx, #126
+        # Note to filter control character avoiding error when making docx, #126
         c = raw.get('c', '')
         if c in INVALID_CHARS: c = ''
         self.c = c

diff --git a/pdf2docx/text/Lines.py b/pdf2docx/text/Lines.py
@@ -18,7 +18,7 @@ class Lines(ElementCollection):
 
     @property
     def unique_parent(self):
-        '''Whether all contained lines have same parant.'''
+        '''Whether all contained lines have same parent.'''
         if not bool(self): return False
 
         first_line = self._instances[0]
@@ -65,9 +65,9 @@ def split_vertically_by_text(self, line_break_free_space_ratio:float, new_paragr
         # check row by row
         res = []
         lines = Lines()
-        punc = tuple(constants.SENTENSE_END_PUNC)
+        punc = tuple(constants.SENTENCE_END_PUNC)
         start_of_para = end_of_para = False # start/end of paragraph
-        start_of_sen = end_of_sen = False   # start/end of sentense
+        start_of_sen = end_of_sen = False   # start/end of sentence
         for row in rows:
             end_of_sen = row[-1].text.strip().endswith(punc)
             w =  row[-1].bbox[2]-row[0].bbox[0]
@@ -76,7 +76,7 @@ def split_vertically_by_text(self, line_break_free_space_ratio:float, new_paragr
             if end_of_sen and w/W <= 1.0-line_break_free_space_ratio:
                 end_of_para = True
 
-            # start of sentense and free space at the start -> start of paragraph
+            # start of sentence and free space at the start -> start of paragraph
             elif start_of_sen and (W-w)/H >= new_paragraph_free_space_ratio:
                 start_of_para = True
 

diff --git a/pdf2docx/text/TextBlock.py b/pdf2docx/text/TextBlock.py
@@ -227,7 +227,7 @@ def parse_relative_line_spacing(self):
         # return default line spacing if any images exists
         for line in self.lines:
             if list(span for span in line.spans if isinstance(span, ImageSpan)):
-                self.line_space = constants.DEFULT_LINE_SPACING
+                self.line_space = constants.DEFAULT_LINE_SPACING
                 return
 
         # otherwise, calculate average line spacing
@@ -243,7 +243,7 @@ def parse_relative_line_spacing(self):
         line_space = block_height/standard_height
 
         # overlap may exist when multi-rows, so set minimum spacing  -> default spacing
-        if len(rows)>1: line_space = max(line_space, constants.DEFULT_LINE_SPACING)
+        if len(rows)>1: line_space = max(line_space, constants.DEFAULT_LINE_SPACING)
         self.line_space = line_space
 
 

diff --git a/pdf2docx/text/TextSpan.py b/pdf2docx/text/TextSpan.py
@@ -276,7 +276,7 @@ def _parse_text_format(self, rect:Shape, horizontal:bool=True):
         """Parse text style based on the position to a rect shape.
 
         Args:
-            rect (Shape): Target rect shape reprenting potential text style.
+            rect (Shape): Target rect shape representing potential text style.
             horizontal (bool, optional): Horizontal text direction. Defaults to True.
 
         Returns:
@@ -347,7 +347,7 @@ def intersects(self, rect):
         if not rect.intersects(self.bbox):
             return TextSpan()
 
-        # furcher check chars in span
+        # further check chars in span
         span = self.copy()
         span.chars.clear()
         span.update_bbox((0.0,0.0,0.0,0.0))
@@ -379,7 +379,7 @@ def make_docx(self, paragraph):
         # set text style, e.g. font, underline and highlight
         self._set_text_format(docx_run)
 
-        # set charaters spacing
+        # set character spacing
         if self.char_spacing: 
             docx.set_char_spacing(docx_run, self.char_spacing)