diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml deleted file mode 100644 index 0e73bd57..00000000 --- a/.github/workflows/doc.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: pdf2docx-doc - -on: - push: - branches: - - master - -jobs: - publish_doc: - runs-on: ubuntu-latest - steps: - - name: Check out code - uses: actions/checkout@v2 - - - name: Set up Python 3.x - uses: actions/setup-python@v1 - with: - python-version: '3.x' - - - name: Display Python version - run: python -c "import sys; print(sys.version)" - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install sphinx sphinx_rtd_theme autodoc sphinxcontrib.apidoc setuptools - pip install -r requirements.txt - python setup.py develop - - # build package for tags, e.g. 3.2.1 extracted from 'refs/tags/v3.2.1' - - name: Create html doc - run: | - echo ${GITHUB_REF#refs/tags/v} > version.txt - make doc - - - name: Deploy - uses: peaceiris/actions-gh-pages@v3 - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: ./build/html \ No newline at end of file diff --git a/pdf2docx/common/Collection.py b/pdf2docx/common/Collection.py index efc89996..3cf5dd64 100644 --- a/pdf2docx/common/Collection.py +++ b/pdf2docx/common/Collection.py @@ -348,10 +348,12 @@ def split_with_intersection(self, bbox:fitz.Rect, threshold:float=1e-3): for instance in self._instances: # A contains B => A & B = B intersection = instance.bbox & bbox - factor = round(intersection.get_area()/instance.bbox.get_area(), 2) - - if factor >= threshold: - intersections.append(instance) - else: + if intersection.is_empty: no_intersections.append(instance) - return self.__class__(intersections), self.__class__(no_intersections) \ No newline at end of file + else: + factor = round(intersection.get_area()/instance.bbox.get_area(), 2) + if factor >= threshold: + intersections.append(instance) + else: + no_intersections.append(instance) + return self.__class__(intersections), self.__class__(no_intersections) diff --git a/pdf2docx/common/Element.py b/pdf2docx/common/Element.py index e888b1e2..6681c42d 100644 --- a/pdf2docx/common/Element.py +++ b/pdf2docx/common/Element.py @@ -1,10 +1,8 @@ -# -*- coding: utf-8 -*- - '''Object with a bounding box, e.g. Block, Line, Span. -Based on ``PyMuPDF``, the coordinates (e.g. bbox of ``page.get_text('rawdict')``) are generally -provided relative to the un-rotated page; while this ``pdf2docx`` library works under real page -coordinate system, i.e. with rotation considered. So, any instances created by this Class are +Based on ``PyMuPDF``, the coordinates (e.g. bbox of ``page.get_text('rawdict')``) are generally +provided relative to the un-rotated page; while this ``pdf2docx`` library works under real page +coordinate system, i.e. with rotation considered. So, any instances created by this Class are always applied a rotation matrix automatically. Therefore, the bbox parameter used to create ``Element`` instance MUST be relative to un-rotated @@ -36,7 +34,7 @@ def set_rotation_matrix(cls, rotation_matrix): Args: Rotation_matrix (fitz.Matrix): target matrix - """ + """ if rotation_matrix and isinstance(rotation_matrix, fitz.Matrix): cls.ROTATION_MATRIX = rotation_matrix @@ -49,11 +47,12 @@ def pure_rotation_matrix(cls): def __init__(self, raw:dict=None, parent=None): - ''' Initialize Element and convert to the real (rotation considered) page coordinate system.''' + ''' Initialize Element and convert to the real (rotation considered) page CS.''' self.bbox = fitz.Rect() # type: fitz.Rect self._parent = parent # type: Element - # NOTE: Any coordinates provided in raw is in original page CS (without considering page rotation). + # NOTE: Any coordinates provided in raw is in original page CS + # (without considering page rotation). if 'bbox' in (raw or {}): rect = fitz.Rect(raw['bbox']) * Element.ROTATION_MATRIX self.update_bbox(rect) @@ -61,8 +60,15 @@ def __init__(self, raw:dict=None, parent=None): def __bool__(self): '''Real object when bbox is defined.''' + # NOTE inconsistent results of fitz.Rect for different version of pymupdf, e.g., + # a = fitz.Rect(3,3,2,2) + # bool(a) a.get_area() a.is_empty + # pymupdf 1.23.5 True 1.0 True + # pymupdf 1.23.8 True 0.0 True + # bool(fitz.Rect())==False + # NOTE: do not use `return not self.bbox.is_empty` here return bool(self.bbox) - + def __repr__(self): return f'{self.__class__.__name__}({tuple(self.bbox)})' @@ -98,18 +104,19 @@ def get_expand_bbox(self, dt:float): Returns: fitz.Rect: Expanded bbox. - + .. note:: This method creates a new bbox, rather than changing the bbox of itself. - """ + """ return self.bbox + (-dt, -dt, dt, dt) def update_bbox(self, rect): '''Update current bbox to specified ``rect``. - + Args: - rect (fitz.Rect or list): bbox-like ``(x0, y0, x1, y1)`` in real page CS (with rotation considered). + rect (fitz.Rect or list): bbox-like ``(x0, y0, x1, y1)``, + in real page CS (with rotation considered). ''' self.bbox = fitz.Rect([round(x,1) for x in rect]) return self @@ -123,45 +130,44 @@ def union_bbox(self, e): Returns: Element: self - """ + """ return self.update_bbox(self.bbox | e.bbox) # -------------------------------------------- # location relationship to other Element instance - # -------------------------------------------- + # -------------------------------------------- def contains(self, e:'Element', threshold:float=1.0): """Whether given element is contained in this instance, with margin considered. Args: e (Element): Target element - threshold (float, optional): Intersection rate. Defaults to 1.0. The larger, the stricter. + threshold (float, optional): Intersection rate. + Defaults to 1.0. The larger, the stricter. Returns: bool: [description] """ - # NOTE the case bool(e)=True but e.bbox.get_area()=0 S = e.bbox.get_area() - if not S: return False - + if not S: return False + # it's not practical to set a general threshold to consider the margin, so two steps: # - set a coarse but acceptable area threshold, # - check the length in main direction strictly - # A contains B => A & B = B intersection = self.bbox & e.bbox - factor = round(intersection.get_area()/e.bbox.get_area(), 2) + factor = round(intersection.get_area()/S, 2) if factor= self.bbox.height: return self.bbox.width+constants.MINOR_DIST >= e.bbox.width - else: - return self.bbox.height+constants.MINOR_DIST >= e.bbox.height - + return self.bbox.height+constants.MINOR_DIST >= e.bbox.height + def get_main_bbox(self, e, threshold:float=0.95): - """If the intersection with ``e`` exceeds the threshold, return the union of these two elements; else return None. + """If the intersection with ``e`` exceeds the threshold, return the union of + these two elements; else return None. Args: e (Element): Target element. @@ -172,43 +178,44 @@ def get_main_bbox(self, e, threshold:float=0.95): """ bbox_1 = self.bbox bbox_2 = e.bbox if hasattr(e, 'bbox') else fitz.Rect(e) - + # areas b = bbox_1 & bbox_2 - if not b: return None # no intersection - - a1, a2, a = bbox_1.get_area(), bbox_2.get_area(), b.get_area() + if b.is_empty: return None # no intersection # Note: if bbox_1 and bbox_2 intersects with only an edge, b is not empty but b.get_area()=0 # so give a small value when they're intersected but the area is zero + a1, a2, a = bbox_1.get_area(), bbox_2.get_area(), b.get_area() factor = a/min(a1,a2) if a else 1e-6 return bbox_1 | bbox_2 if factor >= threshold else None def vertically_align_with(self, e, factor:float=0.0, text_direction:bool=True): - '''Check whether two Element instances have enough intersection in vertical direction, i.e. perpendicular to reading direction. - + '''Check whether two Element instances have enough intersection in vertical direction, + i.e. perpendicular to reading direction. + Args: e (Element): Object to check with - factor (float, optional): Threshold of overlap ratio, the larger it is, the higher probability the two bbox-es are aligned. - text_direction (bool, optional): Consider text direction or not. True by default, from left to right if False. + factor (float, optional): Threshold of overlap ratio, the larger it is, the higher + probability the two bbox-es are aligned. + text_direction (bool, optional): Consider text direction or not. True by default. Returns: bool: [description] - + Examples:: +--------------+ | | - +--------------+ + +--------------+ L1 +-------------------+ | | +-------------------+ L2 - + An enough intersection is defined based on the minimum width of two boxes:: - + L1+L2-L>factor*min(L1,L2) ''' if not e or not bool(self): return False @@ -225,12 +232,14 @@ def vertically_align_with(self, e, factor:float=0.0, text_direction:bool=True): def horizontally_align_with(self, e, factor:float=0.0, text_direction:bool=True): - '''Check whether two Element instances have enough intersection in horizontal direction, i.e. along the reading direction. - + '''Check whether two Element instances have enough intersection in horizontal direction, + i.e. along the reading direction. + Args: e (Element): Element to check with - factor (float, optional): threshold of overlap ratio, the larger it is, the higher probability the two bbox-es are aligned. - text_direction (bool, optional): consider text direction or not. True by default, from left to right if False. + factor (float, optional): threshold of overlap ratio, the larger it is, the higher + probability the two bbox-es are aligned. + text_direction (bool, optional): consider text direction or not. True by default. Examples:: @@ -238,16 +247,16 @@ def horizontally_align_with(self, e, factor:float=0.0, text_direction:bool=True) | | L1 +--------------------+ +--------------+ | | L2 +--------------------+ - + An enough intersection is defined based on the minimum width of two boxes:: - + L1+L2-L>factor*min(L1,L2) ''' if not e or not bool(self): return False # text direction idx = 0 if text_direction and self.is_vertical_text else 1 - + L1 = self.bbox[idx+2]-self.bbox[idx] L2 = e.bbox[idx+2]-e.bbox[idx] L = max(self.bbox[idx+2], e.bbox[idx+2]) - min(self.bbox[idx], e.bbox[idx]) @@ -257,21 +266,19 @@ def horizontally_align_with(self, e, factor:float=0.0, text_direction:bool=True) def in_same_row(self, e): - """Check whether in same row/line with specified Element instance. With text direction considered. - + """Check whether in same row/line with specified Element instance. + With text direction considered. + Taking horizontal text as an example: - + * yes: the bottom edge of each box is lower than the centerline of the other one; * otherwise, not in same row. Args: e (Element): Target object. - Returns: - bool: [description] - .. note:: - The difference to method ``horizontally_align_with``: they may not in same line, though + The difference to method ``horizontally_align_with``: they may not in same line, though aligned horizontally. """ if not e or self.is_horizontal_text != e.is_horizontal_text: @@ -291,9 +298,15 @@ def in_same_row(self, e): # ------------------------------------------------ def store(self): '''Store properties in raw dict.''' - return { 'bbox': tuple([x for x in self.bbox]) } + return { 'bbox': tuple(x for x in self.bbox) } + - def plot(self, page, stroke:tuple=(0,0,0), width:float=0.5, fill:tuple=None, dashes:str=None): '''Plot bbox in PDF page for debug purpose.''' - page.draw_rect(self.bbox, color=stroke, fill=fill, width=width, dashes=dashes, overlay=False, fill_opacity=0.5) \ No newline at end of file + page.draw_rect(self.bbox, + color=stroke, + fill=fill, + width=width, + dashes=dashes, + overlay=False, + fill_opacity=0.5) diff --git a/pdf2docx/common/share.py b/pdf2docx/common/share.py index 5487495e..e67b6e67 100644 --- a/pdf2docx/common/share.py +++ b/pdf2docx/common/share.py @@ -1,4 +1,5 @@ -# -*- coding: utf-8 -*- +'''Common methods.''' + from enum import Enum import random from collections.abc import Iterable @@ -26,7 +27,7 @@ class RectType(Enum): class TextDirection(Enum): - '''Text direction. + '''Text direction. * LEFT_RIGHT: from left to right within a line, and lines go from top to bottom * BOTTOM_TOP: from bottom to top within a line, and lines go from left to right * MIX : a mixture if LEFT_RIGHT and BOTTOM_TOP @@ -48,7 +49,7 @@ class TextAlignment(Enum): * UNKNOWN: can't decide, e.g. single line only ''' NONE = -1 - UNKNOWN = 0 + UNKNOWN = 0 LEFT = 1 CENTER = 2 RIGHT = 3 @@ -115,12 +116,25 @@ def flatten(items, klass): else: yield item + def lower_round(number:float, ndigits:int=0): '''Round number to lower bound with specified digits, e.g. lower_round(1.26, 1)=1.2''' n = 10.0**ndigits return int(n*number) / n +def decode(s:str): + '''Try to decode a unicode string.''' + b = bytes(ord(c) for c in s) + for encoding in ['utf-8', 'gbk', 'gb2312', 'iso-8859-1']: + try: + res = b.decode(encoding) + break + except: + continue + return res + + # ------------------------- # color methods # ------------------------- @@ -131,14 +145,14 @@ def rgb_component_from_name(name:str=''): pos = getColorList().index(name.upper()) else: pos = random.randint(0, len(getColorList())-1) - + c = getColorInfoList()[pos] return (c[1] / 255.0, c[2] / 255.0, c[3] / 255.0) def rgb_component(srgb:int): '''srgb value to R,G,B components, e.g. 16711680 -> (255, 0, 0). - + Equal to PyMuPDF built-in method:: [int(255*x) for x in fitz.sRGB_to_pdf(x)] @@ -191,7 +205,7 @@ def rgb_value(components:list): # ------------------------- def new_page(doc, width:float, height:float, title:str): '''Insert a new page with given title. - + Args: doc (fitz.Document): pdf document object. width (float): Page width. @@ -204,17 +218,17 @@ def new_page(doc, width:float, height:float, title:str): # plot title at the top-left corner gray = rgb_component_from_name('gray') page.insert_text((5, 16), title, color=gray, fontsize=15) - + return page def debug_plot(title:str, show=True): '''Plot the returned objects of inner function. - + Args: title (str): Page title. show (bool, optional): Don't plot if show==False. Default to True. - + .. note:: Prerequisite of the inner function: - the first argument is a :py:class:`~pdf2docx.page.BasePage` instance. @@ -241,4 +255,3 @@ def inner(*args, **kwargs): return objects return inner return wrapper - diff --git a/pdf2docx/converter.py b/pdf2docx/converter.py index ddfbb638..8da7e8a5 100644 --- a/pdf2docx/converter.py +++ b/pdf2docx/converter.py @@ -12,9 +12,11 @@ from .page.Page import Page from .page.Pages import Pages -# check PyMuPDF>=1.19.x -if list(map(int, fitz.VersionBind.split("."))) < [1, 19, 0]: - raise SystemExit("PyMuPDF>=1.19.0 is required for pdf2docx.") +# check PyMuPDF version +# 1.19.0 <= v <= 1.23.8, or v>=1.23.16 +v = list(map(int, fitz.VersionBind.split("."))) +if v < [1,19,0] or [1,23,8]=1.23.16 is required for pdf2docx.") # logging logging.basicConfig( diff --git a/pdf2docx/font/Fonts.py b/pdf2docx/font/Fonts.py index feb26c9e..e369c81a 100644 --- a/pdf2docx/font/Fonts.py +++ b/pdf2docx/font/Fonts.py @@ -1,30 +1,31 @@ '''Extract fonts properties from PDF. -Font properties like font name, size are covered in :py:class:`~pdf2docx.text.TextSpan`, +Font properties like font name, size are covered in :py:class:`~pdf2docx.text.TextSpan`, but more generic properties are required further: -* Font family name. The font name extracted and set in ``TextSpan`` might not valid when +* Font family name. The font name extracted and set in ``TextSpan`` might not valid when directly used in MS Word, e.g. "ArialMT" should be "Arial". So, we need to get font family name, which should be accepted by MS Word, based on the font file itself. -* Font line height ratio. As line height = font_size * line_height_ratio, it's used to +* Font line height ratio. As line height = font_size * line_height_ratio, it's used to calculate relative line spacing. In general, 1.12 is an approximate value to this ratio, - but it's in fact a font-related value, especially for CJK font. + but it's in fact a font-related value, especially for CJK font. * So, extract font metrics, e.g. ascender and descender, with third party library ``fontTools`` - in first priority. This can obtain an accurate line height ratio, but sometimes the + in first priority. This can obtain an accurate line height ratio, but sometimes the embedded font data might crash. - + * Then, we have to use the default properties, i.e. ascender and descender, extracted by - ``PyMuPDF`` directly, but this value isn't so accurate. + ``PyMuPDF`` directly, but this value isn't so accurate. ''' import os from io import BytesIO -from collections import namedtuple +from collections import namedtuple from fontTools.ttLib import TTFont from ..common.Collection import BaseCollection from ..common.constants import (CJK_CODEPAGE_BITS, CJK_UNICODE_RANGE_BITS, CJK_UNICODE_RANGES) +from ..common.share import decode Font = namedtuple('Font', [ 'descriptor', # font descriptor @@ -42,7 +43,7 @@ def get(self, font_name:str): # 1st priority: check right the name for font in self: if target==font.descriptor: return font - + # 2nd priority: target name is contained in font name for font in self: if target in font.descriptor: return font @@ -50,7 +51,7 @@ def get(self, font_name:str): # 3rd priority: target name contains font name for font in self: if font.descriptor in target: return font - + return None @@ -59,7 +60,7 @@ def extract(cls, fitz_doc): '''Extract fonts from PDF and get properties. * Only embedded fonts (v.s. the base 14 fonts) can be extracted. * The extracted fonts may be invalid due to reason from PDF file itself. - ''' + ''' # get unique font references xrefs = set() for page in fitz_doc: @@ -69,9 +70,9 @@ def extract(cls, fitz_doc): fonts = [] for xref in xrefs: basename, ext, _, buffer = fitz_doc.extract_font(xref) - basename = bytes(ord(c) for c in basename).decode() + basename = decode(basename) name = cls._normalized_font_name(basename) - + try: # supported fonts: open/true type only # - n/a: base 14 fonts @@ -91,7 +92,7 @@ def extract(cls, fitz_doc): line_height=line_height)) return cls(fonts) - + @staticmethod def _normalized_font_name(name): @@ -104,7 +105,7 @@ def _to_descriptor(name:str): '''Remove potential space, dash in font name, and turn to upper case.''' return name.replace(' ', '').replace('-', '').upper() - + @staticmethod def get_font_family_name(tt_font:TTFont): '''Get the font family name from the font's names table. @@ -139,7 +140,7 @@ def get_line_height_factor(tt_font:TTFont): Fon non-CJK fonts:: f = (hhea.Ascent - hhea.Descent + hhea.LineGap) / units_per_em - + For non-CJK fonts (Windows):: f = (OS/2.winAscent + OS/2.winDescent + [External Leading]) / units_per_em @@ -176,9 +177,9 @@ def get_line_height_factor(tt_font:TTFont): os2_win_total_height = os2_win_ascent + os2_win_descent win_external_leading = max(0.0, hhea_linegap-(os2_win_total_height-hhea_total_height)) win_btb_distance = os2_win_total_height + win_external_leading - + btb_distance = win_btb_distance - + else: btb_distance = hhea_btb_distance @@ -187,7 +188,7 @@ def get_line_height_factor(tt_font:TTFont): distance = 1.3*hhea_total_height if cjk else 1.0*btb_distance return distance / units_per_em - + @staticmethod def is_cjk_font(tt_font:TTFont): diff --git a/pdf2docx/image/ImagesExtractor.py b/pdf2docx/image/ImagesExtractor.py index 3b802a7c..c4e8ed7b 100644 --- a/pdf2docx/image/ImagesExtractor.py +++ b/pdf2docx/image/ImagesExtractor.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - '''Extract images from PDF. Both raster images and vector graphics are considered: @@ -32,7 +30,7 @@ def clip_page_to_pixmap(self, bbox:fitz.Rect=None, zoom:float=3.0): Args: bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page. - Note that ``bbox`` depends on un-rotated page CS, while cliping page is based on + Note that ``bbox`` depends on un-rotated page CS, while clipping page is based on the final page. zoom (float, optional): Improve resolution by this rate. Defaults to 3.0. @@ -100,7 +98,6 @@ def extract_images(self, clip_image_res_ratio:float=3.0): # step 1: collect images: [(bbox, item), ..., ] ic = Collection() for item in self._page.get_images(full=True): - # image item: (xref, smask, width, height, bpc, colorspace, ...) item = list(item) item[-1] = 0 @@ -132,19 +129,33 @@ def extract_images(self, clip_image_res_ratio:float=3.0): else: bbox, item = group[0] - # recover image - pix = self._recover_pixmap(doc, item) - # regarding images consist of alpha values only, i.e. colorspace is None, - # the turquoise color shown in the PDF is not part of the image, but part of PDF background. + # Regarding images consist of alpha values only, the turquoise color shown in + # the PDF is not part of the image, but part of PDF background. # So, just to clip page pixmap according to the right bbox # https://github.com/pymupdf/PyMuPDF/issues/677 - alpha_only = not pix.colorspace - if alpha_only: + + # It's not safe to identify images with alpha values only, + # - colorspace is None, for pymupdf <= 1.23.8 + # - colorspace is always Colorspace(CS_RGB), for pymupdf==1.23.9-15 -> issue + # - colorspace is Colorspace(CS_), for pymupdf >= 1.23.16 + + # So, use extracted image info directly. + # image item: (xref, smask, width, height, bpc, colorspace, ...), e.g., + # (19, 0, 331, 369, 1, '', '', 'Im1', 'FlateDecode', 0) + # (20, 24, 1265, 1303, 8, 'DeviceRGB', '', 'Im2', 'FlateDecode', 0) + # (21, 0, 331, 369, 1, '', '', 'Im3', 'CCITTFaxDecode', 0) + # (22, 25, 1265, 1303, 8, 'DeviceGray', '', 'Im4', 'DCTDecode', 0) + # (23, 0, 1731, 1331, 8, 'DeviceGray', '', 'Im5', 'DCTDecode', 0) + if item[5]=='': raw_dict = self.clip_page_to_dict(bbox, clip_image_res_ratio) - # rotate image with opencv if page is rotated + # normal images else: + # recover image, e.g., handle image with mask, or CMYK color space + pix = self._recover_pixmap(doc, item) + + # rotate image with opencv if page is rotated raw_dict = self._to_raw_dict(pix, bbox) if rotation: raw_dict['image'] = self._rotate_image(pix, -rotation) @@ -338,7 +349,7 @@ def _recover_pixmap(doc:fitz.Document, item:list): # we may need to adjust something for CMYK pixmaps here -> # recreate pixmap in RGB color space if necessary # NOTE: pix.colorspace may be None for images with alpha channel values only - if pix.colorspace and not pix.colorspace.name in (fitz.csGRAY.name, fitz.csRGB.name): + if 'CMYK' in item[5].upper(): pix = fitz.Pixmap(fitz.csRGB, pix) return pix diff --git a/pdf2docx/layout/Column.py b/pdf2docx/layout/Column.py index 425c0025..3e8e4b4e 100644 --- a/pdf2docx/layout/Column.py +++ b/pdf2docx/layout/Column.py @@ -1,8 +1,6 @@ -# -*- coding: utf-8 -*- - '''Column of Section. -In most cases, one section per page. But in case multi-columns page, sections are used +In most cases, one section per page. But in case multi-columns page, sections are used to distinguish these different layouts. .. note:: @@ -22,21 +20,13 @@ ''' from ..common.Collection import Collection -from ..common.Element import Element from ..layout.Layout import Layout from ..shape.Shape import Shape from ..text.Line import Line -class Column(Element, Layout): - - def __init__(self, blocks=None, shapes=None): - '''Initialize empty column.''' - # Call the first parent class constructor only if omitting constructor. - # Unified constructor should be used (with *args, **kwargs) if using super().__init__(). - Element.__init__(self) - Layout.__init__(self, blocks, shapes) - +class Column(Layout): + '''Column of Section.''' @property def working_bbox(self): return self.bbox @@ -50,27 +40,10 @@ def add_elements(self, elements:Collection): self.assign_shapes(shapes) - def store(self): - '''Store parsed section layout in dict format.''' - res = Element.store(self) - res.update(Layout.store(self)) - return res - - - def restore(self, raw:dict): - '''Restore Column from raw dict.''' - self.update_bbox(raw.get('bbox', (0,)*4)) - super().restore(raw) - return self - - def make_docx(self, doc): - '''Create Section Column in docx. + '''Create Section Column in docx. Args: doc (Document): ``python-docx`` document object ''' self.blocks.make_docx(doc) - - - diff --git a/pdf2docx/layout/Layout.py b/pdf2docx/layout/Layout.py index a960501a..abdfe461 100644 --- a/pdf2docx/layout/Layout.py +++ b/pdf2docx/layout/Layout.py @@ -1,10 +1,8 @@ -# -*- coding: utf-8 -*- - '''Document layout depends on Blocks and Shapes. **Layout** here refers to the content and position of text, image and table. The target is to convert source blocks and shapes to a *flow layout* that can be re-created as docx elements like paragraph and -table. In addition to ``Section`` and ``Column``, ``TableBlock`` is used to maintain the page layout . +table. In addition to ``Section`` and ``Column``, ``TableBlock`` is used to maintain the page layout . So, detecting and parsing table block is the principle steps. The prerequisite work is done before this step: @@ -12,12 +10,12 @@ 1. Clean up source blocks and shapes in Page level, e.g. convert source blocks to ``Line`` level, because the block structure determined by ``PyMuPDF`` might be not reasonable. #. Parse structure in document level, e.g. page header/footer. -#. Parse Section and Column layout in Page level. +#. Parse Section and Column layout in Page level. The page layout parsing idea: 1. Parse table layout in Column level. - (a) Detect explicit tables first based on shapes. + (a) Detect explicit tables first based on shapes. (#) Then, detect stream tables based on original text blocks and parsed explicit tables. (#) Move table contained blocks (lines or explicit table) to associated cell-layout. #. Parse paragraph in Column level. @@ -27,61 +25,65 @@ #. Repeat above steps for cell-layout in parsed table level. ''' +from abc import (ABC, abstractmethod) from ..text.Line import Line from ..common import constants +from ..common.Element import Element from ..shape.Shapes import Shapes -class Layout: +class Layout(Element, ABC): '''Blocks and shapes structure and formats.''' - def __init__(self, blocks=None, shapes=None): - ''' Initialize layout. - - Args: - blocks (Blocks): Blocks representing text/table contents. - shapes (Shapes): Shapes representing table border, shading and text style like underline, highlight. - parent (Page, Column, Cell): The object that this layout belonging to. - ''' + def __init__(self, bbox=None): + ''' Initialize layout. Note that layout bbox must be set explicitly, + rather than calculated automatically from contained blocks and shapes.''' from .Blocks import Blocks # avoid import conflicts from ..table.TablesConstructor import TablesConstructor - self.blocks = Blocks(instances=blocks, parent=self) - self.shapes = Shapes(instances=shapes, parent=self) - self._table_parser = TablesConstructor(parent=self) # table parser + raw = {'bbox': bbox} if bbox else {} + super().__init__(raw) + # Blocks representing text/table contents. + self.blocks = Blocks(parent=self) - def working_bbox(self, *args, **kwargs): - '''Working bbox of current Layout.''' - raise NotImplementedError + # Shapes representing table border, shading and text style like underline, highlight. + self.shapes = Shapes(parent=self) + + # table builder + self._table_parser = TablesConstructor(parent=self) # table parser - def contains(self, *args, **kwargs): - '''Whether given element is contained in this layout.''' - raise NotImplementedError + @property + @abstractmethod + def working_bbox(self): + '''Working bbox of current Layout.''' def store(self): '''Store parsed layout in dict format.''' - return { + res = super().store() # Element + res.update({ 'blocks': self.blocks.store(), 'shapes': self.shapes.store() - } + }) + return res def restore(self, data:dict): '''Restore Layout from parsed results.''' + self.update_bbox(data.get('bbox', (0,)*4)) self.blocks.restore(data.get('blocks', [])) self.shapes.restore(data.get('shapes', [])) return self def assign_blocks(self, blocks:list): - '''Add blocks (line or table block) to this layout. - + '''Add blocks (line or table block) to this layout. + Args: blocks (list): a list of text line or table block to add. - + .. note:: If a text line is partly contained, it must deep into span -> char. ''' @@ -89,8 +91,8 @@ def assign_blocks(self, blocks:list): def assign_shapes(self, shapes:list): - '''Add shapes to this cell. - + '''Add shapes to this cell. + Args: shapes (list): a list of Shape instance to add. ''' @@ -123,36 +125,36 @@ def _assign_block(self, block): # add block directly if fully contained in cell if self.contains(block, threshold=constants.FACTOR_MAJOR): self.blocks.append(block) - + # deep into line span if any intersection - elif self.bbox & block.bbox and isinstance(block, Line): + elif isinstance(block, Line) and self.bbox.intersects(block.bbox): self.blocks.append(block.intersects(self.bbox)) def _parse_table(self, **settings): - '''Parse table layout: - - * detect explicit tables first based on shapes, + '''Parse table layout: + + * detect explicit tables first based on shapes, * then stream tables based on original text blocks and parsed explicit tables; * move table contained blocks (text block or explicit table) to associated cell layout. - ''' + ''' # parse table structure/format recognized from explicit shapes if settings['parse_lattice_table']: self._table_parser.lattice_tables( settings['connected_border_tolerance'], settings['min_border_clearance'], settings['max_border_width']) - + # parse table structure based on implicit layout of text blocks if settings['parse_stream_table']: self._table_parser.stream_tables( settings['min_border_clearance'], settings['max_border_width'], settings['line_separate_threshold']) - + def _parse_paragraph(self, **settings): - '''Create text block based on lines, and parse text format, e.g. text highlight, + '''Create text block based on lines, and parse text format, e.g. text highlight, paragraph indentation ''' # group lines to text block self.blocks.parse_block( @@ -164,7 +166,7 @@ def _parse_paragraph(self, **settings): self.blocks.parse_text_format( self.shapes.text_style_shapes, settings['delete_end_line_hyphen']) - + # paragraph / line spacing self.blocks.parse_spacing( settings['line_separate_threshold'], @@ -172,4 +174,4 @@ def _parse_paragraph(self, **settings): settings['line_break_free_space_ratio'], settings['lines_left_aligned_threshold'], settings['lines_right_aligned_threshold'], - settings['lines_center_aligned_threshold']) \ No newline at end of file + settings['lines_center_aligned_threshold']) diff --git a/pdf2docx/page/RawPage.py b/pdf2docx/page/RawPage.py index 4e912fac..ea53b38f 100644 --- a/pdf2docx/page/RawPage.py +++ b/pdf2docx/page/RawPage.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - '''A wrapper of pdf page engine (e.g. PyMuPDF, pdfminer) to do the following work: * extract source contents @@ -8,12 +6,13 @@ * parse page structure roughly, i.e. section and column ''' +from abc import (ABC, abstractmethod) from .BasePage import BasePage -from ..layout.Layout import Layout from ..layout.Section import Section from ..layout.Column import Column from ..shape.Shape import Hyperlink from ..shape.Shapes import Shapes +from ..layout.Blocks import Blocks from ..font.Fonts import Fonts from ..text.TextSpan import TextSpan from ..common.share import debug_plot @@ -21,36 +20,38 @@ from ..common.Collection import Collection -class RawPage(BasePage, Layout): +class RawPage(BasePage, ABC): '''A wrapper of page engine.''' def __init__(self, page_engine=None): ''' Initialize page layout. - + Args: page_engine (Object): Source pdf page. ''' BasePage.__init__(self) - Layout.__init__(self) self.page_engine = page_engine - + self.blocks = Blocks(parent=self) + self.shapes = Shapes(parent=self) + + @abstractmethod def extract_raw_dict(self, **settings): '''Extract source data with page engine. Return a dict with the following structure: ``` { "width" : w, - "height": h, + "height": h, "blocks": [{...}, {...}, ...], "shapes" : [{...}, {...}, ...] } ``` ''' - raise NotImplementedError - + + @property def text(self): - '''All extracted text in this page, with images considered as ````. + '''All extracted text in this page, with images considered as ````. Should be run after ``restore()`` data.''' return '\n'.join([block.text for block in self.blocks]) @@ -64,14 +65,15 @@ def raw_text(self): def restore(self, **settings): '''Initialize layout extracted with ``PyMuPDF``.''' raw_dict = self.extract_raw_dict(**settings) - super().restore(raw_dict) + self.blocks.restore(raw_dict.get('blocks', [])) + self.shapes.restore(raw_dict.get('shapes', [])) return self.blocks - + @debug_plot('Cleaned Shapes') def clean_up(self, **settings): - '''Clean up raw blocks and shapes, e.g. - + '''Clean up raw blocks and shapes, e.g. + * remove negative or duplicated instances, * detect semantic type of shapes ''' @@ -79,18 +81,16 @@ def clean_up(self, **settings): self.blocks.clean_up( settings['float_image_ignorable_gap'], settings['line_overlap_threshold']) - - # clean up shapes + # clean up shapes self.shapes.clean_up( settings['max_border_width'], settings['shape_min_dimension']) - return self.shapes - def process_font(self, fonts:Fonts): + def process_font(self, fonts:Fonts): '''Update font properties, e.g. font name, font line height ratio, of ``TextSpan``. - + Args: fonts (Fonts): Fonts parsed by ``fonttools``. ''' @@ -114,10 +114,10 @@ def calculate_margin(self, **settings): """Calculate page margin. .. note:: - Ensure this method is run right after cleaning up the layout, so the page margin is + Ensure this method is run right after cleaning up the layout, so the page margin is calculated based on valid layout, and stay constant. """ - # Exclude hyperlink from shapes because hyperlink might exist out of page unreasonably, + # Exclude hyperlink from shapes because hyperlink might exist out of page unreasonably, # while it should always within page since attached to text. shapes = Shapes([shape for shape in self.shapes if not isinstance(shape, Hyperlink)]) @@ -139,9 +139,9 @@ def calculate_margin(self, **settings): # use normal margin if calculated margin is large enough return ( - min(constants.ITP, round(left, 1)), - min(constants.ITP, round(right, 1)), - min(constants.ITP, round(top, 1)), + min(constants.ITP, round(left, 1)), + min(constants.ITP, round(right, 1)), + min(constants.ITP, round(top, 1)), min(constants.ITP, round(bottom, 1))) @@ -154,15 +154,15 @@ def parse_section(self, **settings): ''' # bbox X0, Y0, X1, _ = self.working_bbox - + # collect all blocks (line level) and shapes elements = Collection() elements.extend(self.blocks) elements.extend(self.shapes) if not elements: return - # to create section with collected lines - lines = Collection() + # to create section with collected lines + lines = Collection() sections = [] def close_section(num_col, elements, y_ref): # append to last section if both single column @@ -173,7 +173,7 @@ def close_section(num_col, elements, y_ref): # otherwise, create new section else: section = self._create_section(num_col, elements, (X0, X1), y_ref) - if section: + if section: sections.append(section) @@ -189,7 +189,7 @@ def close_section(num_col, elements, y_ref): # consider 2-cols only if current_num_col>2: current_num_col = 1 - + # the width of two columns shouldn't have significant difference elif current_num_col==2: u0, v0, u1, v1 = cols[0].bbox @@ -198,25 +198,25 @@ def close_section(num_col, elements, y_ref): c1, c2 = x0-X0, X1-x0 # column width w1, w2 = u1-u0, m1-m0 # line width f = 2.0 - if not 1/f<=c1/c2<=f or w1/c1<0.33 or w2/c2<0.33: + if not 1/f<=c1/c2<=f or w1/c1<0.33 or w2/c2<0.33: current_num_col = 1 # process exceptions if pre_num_col==2 and current_num_col==1: - # though current row has one single column, it might have another virtual + # though current row has one single column, it might have another virtual # and empty column. If so, it should be counted as 2-cols cols = lines.group_by_columns() pos = cols[0].bbox[2] if row.bbox[2]<=pos or row.bbox[0]>pos: current_num_col = 2 - + # pre_num_col!=current_num_col => to close section with collected lines, # before that, further check the height of collected lines else: x0, y0, x1, y1 = lines.bbox if y1-y0' for block in self.blocks]) @@ -43,36 +39,33 @@ def working_bbox(self): def store(self): - if bool(self): - res = super().store() # Element - res.update({ - 'bg_color': self.bg_color, - 'border_color': self.border_color, - 'border_width': self.border_width, - 'merged_cells': self.merged_cells - }) - res.update(Layout.store(self)) - return res - else: - return None + if not bool(self): return None + res = super().store() + res.update({ + 'bg_color': self.bg_color, + 'border_color': self.border_color, + 'border_width': self.border_width, + 'merged_cells': self.merged_cells + }) + return res def plot(self, page): - '''Plot cell and its sub-layout.''' + '''Plot cell and its sub-layout.''' super().plot(page) self.blocks.plot(page) def make_docx(self, table, indexes): '''Set cell style and assign contents. - + Args: table (Table): ``python-docx`` table instance. indexes (tuple): Row and column indexes, ``(i, j)``. - ''' + ''' # set cell style, e.g. border, shading, cell width self._set_style(table, indexes) - + # ignore merged cells if not bool(self): return @@ -83,7 +76,7 @@ def make_docx(self, table, indexes): if n_row*n_col!=1: _cell = table.cell(i+n_row-1, j+n_col-1) docx_cell.merge(_cell) - + # --------------------- # cell width (cell height is set by row height) # --------------------- @@ -92,18 +85,19 @@ def make_docx(self, table, indexes): docx_cell.width = Pt(x1-x0) # insert contents - # NOTE: there exists an empty paragraph already in each cell, which should be deleted first to - # avoid unexpected layout. `docx_cell._element.clear_content()` works here. - # But, docx requires at least one paragraph in each cell, otherwise resulting in a repair error. + # NOTE: there exists an empty paragraph already in each cell, which should be deleted + # first to avoid unexpected layout. `docx_cell._element.clear_content()` works here. + # But, docx requires at least one paragraph in each cell, otherwise resulting in a + # repair error. if self.blocks: docx_cell._element.clear_content() self.blocks.make_docx(docx_cell) def _set_style(self, table, indexes): - '''Set ``python-docx`` cell style, e.g. border, shading, width, row height, + '''Set ``python-docx`` cell style, e.g. border, shading, width, row height, based on cell block parsed from PDF. - + Args: table (Table): ``python-docx`` table object. indexes (tuple): ``(i, j)`` index of current cell in table. @@ -115,7 +109,7 @@ def _set_style(self, table, indexes): # --------------------- # border style # --------------------- - # NOTE: border width is specified in eighths of a point, with a minimum value of + # NOTE: border width is specified in eighths of a point, with a minimum value of # two (1/4 of a point) and a maximum value of 96 (twelve points) keys = ('top', 'end', 'bottom', 'start') kwargs = {} @@ -128,24 +122,24 @@ def _set_style(self, table, indexes): 'sz': 8*w, 'val': 'single', 'color': hex_c.upper() } - # merged cells are assumed to have same borders with the main cell + # merged cells are assumed to have same borders with the main cell for m in range(i, i+n_row): for n in range(j, j+n_col): - docx.set_cell_border(table.cell(m, n), **kwargs) + docx.set_cell_border(table.cell(m, n), **kwargs) # --------------------- # cell bg-color # --------------------- - if self.bg_color!=None: + if self.bg_color is not None: docx.set_cell_shading(docx_cell, self.bg_color) - + # --------------------- # clear cell margin # --------------------- - # NOTE: the start position of a table is based on text in cell, rather than left border of table. - # They're almost aligned if left-margin of cell is zero. + # NOTE: the start position of a table is based on text in cell, rather than + # left border of table. They're almost aligned if left-margin of cell is zero. docx.set_cell_margins(docx_cell, start=0, end=0) # set vertical direction if contained text blocks are in vertical direction if self.blocks.is_vertical_text: - docx.set_vertical_cell_direction(docx_cell) \ No newline at end of file + docx.set_vertical_cell_direction(docx_cell) diff --git a/pdf2docx/table/TableStructure.py b/pdf2docx/table/TableStructure.py index 994fc992..035c659e 100644 --- a/pdf2docx/table/TableStructure.py +++ b/pdf2docx/table/TableStructure.py @@ -483,9 +483,9 @@ def _check_outer_strokes(table_bbox:Element, borders:dict, direction:str, max_bo target = bbox[idx] # add missing border rects - sample_border = Stroke() - bbox[idx] = target - bbox[(idx+2)%4] = target + sample_border = Stroke() + idx1 = (idx+2)%4 + bbox[idx1] = target + 0.1 * (1 if idx1>idx else -1) # add whole border if not exist if abs(target-current)> max_border_width: @@ -514,7 +514,6 @@ def _check_outer_strokes(table_bbox:Element, borders:dict, direction:str, max_bo borders[current].extend(segments) - @staticmethod def _check_merged_cells(ref:float, borders:list, direction:str='row'): '''Check merged cells in a row/column. @@ -548,7 +547,6 @@ def _check_merged_cells(ref:float, borders:list, direction:str='row'): ref0, ref1 = border.y0, border.y1 else: ref0, ref1 = border.x0, border.x1 - # 1) intersection found if ref0 < ref < ref1: res.append(1) diff --git a/pdf2docx/text/Char.py b/pdf2docx/text/Char.py index 0e3e324f..4b66fa86 100644 --- a/pdf2docx/text/Char.py +++ b/pdf2docx/text/Char.py @@ -40,22 +40,19 @@ def contained_in_rect(self, rect:Shape, horizontal:bool=True): Returns: bool: Whether a Char locates in target rect. - + .. note:: It's considered as contained in the target rect if the intersection is larger than half of the char bbox. - """ + """ # char in rect? - if self.bbox in rect.bbox: - return True - - # intersection? - else: - intsec = self.bbox & rect.bbox # width=0 if invalid intersection - if horizontal: - return intsec.width > 0.5*self.bbox.width - else: - return intsec.height > 0.5*self.bbox.height + if self.bbox in rect.bbox: return True + + # intersection? + s = self.bbox & rect.bbox + if s.is_empty: return False + if horizontal: return s.width > 0.5*self.bbox.width + return s.height > 0.5*self.bbox.height def store(self): diff --git a/pdf2docx/text/TextSpan.py b/pdf2docx/text/TextSpan.py index e6f718b8..a00ae4d5 100644 --- a/pdf2docx/text/TextSpan.py +++ b/pdf2docx/text/TextSpan.py @@ -1,8 +1,6 @@ -# -*- coding: utf-8 -*- - '''Text Span object based on PDF raw dict extracted with ``PyMuPDF``. -Data structure for Span refer to +Data structure for Span refer to this `link `_:: { @@ -34,9 +32,9 @@ from docx.oxml.ns import qn from .Char import Char from ..common.Element import Element -from ..common.share import RectType +from ..common.share import (RectType, rgb_value, rgb_component, decode) from ..common import constants -from ..common import share, docx +from ..common import docx from ..shape.Shape import Shape @@ -55,8 +53,7 @@ def __init__(self, raw:dict=None): # font metrics # line_height is the standard single line height used in relative line spacing, # while exact line spacing is used when line_height==-1 by default. - font_name = raw.get('font', '') - self.font = bytes(ord(c) for c in font_name).decode() # in case unicode in font name + self.font = decode(raw.get('font', '')) # in case unicode in font name self.size = raw.get('size', 12.0) self.ascender = raw.get('ascender', 1.0) self.descender = raw.get('descender', 0.0) @@ -70,12 +67,12 @@ def __init__(self, raw:dict=None): # positive to expand space, otherwise condense # just an attribute placeholder: not used yet self.char_spacing = raw.get('char_spacing', 0.0) - + # init text span element super().__init__(raw) # in rare case, the font is unamed, so change font and update bbox accordingly - if 'UNNAMED' in self.font.upper(): + if self.chars and 'UNNAMED' in self.font.upper(): self._change_font_and_update_bbox(constants.DEFAULT_FONT_NAME) @@ -102,14 +99,14 @@ def is_valid_line_height(self): return self.line_height!=-1 def _change_font_and_update_bbox(self, font_name:str): '''Set new font, and update font size, span/char bbox accordingly. - It's generally used for span with unnamed fonts. - See this `issue `_. + It's generally used for span with unnamed fonts. + See this `issue `_. In corner case, where the PDF file containing unnamed and not embedded fonts, the span bbox - extracted from ``PyMuPDF`` is not correct. ``PyMuPDF`` provides feature to replace these - unnamed fonts with specified fonts, then extract correct bbox from the updated PDF. Since we - care less about the original PDF itself but its layout, the idea here is to set a default font - for text spans with unnamed fonts, and estimate the updated bbox with method from + extracted from ``PyMuPDF`` is not correct. ``PyMuPDF`` provides feature to replace these + unnamed fonts with specified fonts, then extract correct bbox from the updated PDF. Since we + care less about the original PDF itself but its layout, the idea here is to set a default + font for text spans with unnamed fonts, and estimate the updated bbox with method from ``fitz.TextWriter``. Args: @@ -153,7 +150,7 @@ def add(self, char:Char): self.chars.append(char) self.union_bbox(char) - + def lstrip(self): '''Remove blanks at the left side, but keep one blank.''' original_text = self.text @@ -164,7 +161,7 @@ def lstrip(self): self.chars = self.chars[num_blanks-1:] self.update_bbox(rect=self.cal_bbox()) return True - + def rstrip(self): '''Remove blanks at the right side, but keep one blank.''' @@ -205,7 +202,7 @@ def split(self, rect:Shape, horizontal:bool=True): Returns: list: Split text spans. - """ + """ # any intersection in this span? # NOTE: didn't consider the case that an underline is out of a span intsec = rect.bbox & self.bbox @@ -213,7 +210,6 @@ def split(self, rect:Shape, horizontal:bool=True): # no, then add this span as it is # Note the case bool(intsec)=True but intsec.get_area()=0 if intsec.is_empty: return [self] - # yes, then split spans: # - add new style to the intersection part @@ -241,7 +237,6 @@ def split(self, rect:Shape, horizontal:bool=True): pos_end = max(pos+length, 0) # max() is used in case: pos=-1, length=0 # split span with the intersection: span-intersection-span - # # left part if exists if pos > 0: if horizontal: @@ -287,9 +282,9 @@ def _parse_text_format(self, rect:Shape, horizontal:bool=True): # Skip table border/shading if rect.equal_to_type(RectType.BORDER) or rect.equal_to_type(RectType.SHADING): return False - + # set hyperlink - elif rect.equal_to_type(RectType.HYPERLINK): + if rect.equal_to_type(RectType.HYPERLINK): self.style.append({ 'type': rect.type, 'color': rect.color, @@ -311,9 +306,9 @@ def _parse_text_format(self, rect:Shape, horizontal:bool=True): # highlight: both the rect height and overlap must be large enough if h_rect >= 0.5*h_span: # In general, highlight color isn't white - if rect.color != share.rgb_value((1,1,1)) and self.get_main_bbox(rect, constants.FACTOR_MAJOR): + if rect.color != rgb_value((1,1,1)) and self.get_main_bbox(rect, constants.FACTOR_MAJOR): rect.type = RectType.HIGHLIGHT - + # near to bottom of span? yes, underline elif d <= 0.25*h_span: rect.type = RectType.UNDERLINE @@ -336,7 +331,7 @@ def _parse_text_format(self, rect:Shape, horizontal:bool=True): def intersects(self, rect): '''Create new TextSpan object with chars contained in given bbox. - + Args: rect (fitz.Rect): Target bbox. ''' @@ -362,12 +357,13 @@ def intersects(self, rect): def make_docx(self, paragraph): - '''Add text span to a docx paragraph, and set text style, e.g. font, color, underline, hyperlink, etc. + '''Add text span to a docx paragraph, and set text style, e.g. + font, color, underline, hyperlink, etc. .. note:: - Hyperlink and its style is parsed separately from pdf. For instance, regarding a general hyperlink with an - underline, the text and uri is parsed as hyperlink itself, while the underline is treated as a normal text - style. + Hyperlink and its style is parsed separately from pdf. For instance, regarding a general + hyperlink with an underline, the text and uri is parsed as hyperlink itself, while the + underline is treated as a normal text style. ''' # Create hyperlink in particular, otherwise add a run directly for style in self.style: @@ -376,12 +372,12 @@ def make_docx(self, paragraph): break else: docx_run = paragraph.add_run(self.text) - + # set text style, e.g. font, underline and highlight self._set_text_format(docx_run) # set charters spacing - if self.char_spacing: + if self.char_spacing: docx.set_char_spacing(docx_run, self.char_spacing) @@ -405,7 +401,7 @@ def _set_text_format(self, docx_run): font_name = self.font docx_run.font.name = font_name docx_run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name) # set font for chinese characters - docx_run.font.color.rgb = RGBColor(*share.rgb_component(self.color)) + docx_run.font.color.rgb = RGBColor(*rgb_component(self.color)) # font size # NOTE: only x.0 and x.5 is accepted in docx, so set character scaling accordingly @@ -417,11 +413,11 @@ def _set_text_format(self, docx_run): scale = self.size / (font_size or self.size or 1) if abs(scale-1.0)>=0.01: docx.set_char_scaling(docx_run, scale) - - # font style parsed from PDF rectangles: + + # font style parsed from PDF rectangles: # e.g. highlight, underline, strike-through-line for style in self.style: - + t = style['type'] # Built-in method is provided to set highlight in python-docx, but supports only limited colors; # so, set character shading instead if out of highlight color scope @@ -435,7 +431,7 @@ def _set_text_format(self, docx_run): docx_run.font.underline = True else: docx.set_char_underline(docx_run, style['color']) - + # same color with text for strike line elif t==RectType.STRIKE.value: - docx_run.font.strike = True \ No newline at end of file + docx_run.font.strike = True diff --git a/requirements.txt b/requirements.txt index f3aaf506..14baa2b7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ -PyMuPDF>=1.19.0 +PyMuPDF python-docx>=0.8.10 fonttools>=4.24.0 numpy>=1.17.2 opencv-python>=4.5 +# opencv-python-headless>=4.5 fire>=0.3.0 \ No newline at end of file diff --git a/test/samples/demo-image-colorspace.pdf b/test/samples/demo-image-colorspace.pdf new file mode 100644 index 00000000..6d021bbc Binary files /dev/null and b/test/samples/demo-image-colorspace.pdf differ diff --git a/test/samples/demo-image-cmyk.pdf b/test/samples/demo-image-floating.pdf similarity index 100% rename from test/samples/demo-image-cmyk.pdf rename to test/samples/demo-image-floating.pdf diff --git a/test/samples/demo-image-transparent.pdf b/test/samples/demo-image-transparent.pdf deleted file mode 100644 index acec3c1b..00000000 Binary files a/test/samples/demo-image-transparent.pdf and /dev/null differ diff --git a/test/test.py b/test/test.py index fe70dc01..2a174d32 100644 --- a/test/test.py +++ b/test/test.py @@ -112,19 +112,19 @@ def convert(self, filename): '''Convert PDF file from sample path to output path.''' source_pdf_file = os.path.join(sample_path, f'{filename}.pdf') docx_file = os.path.join(output_path, f'{filename}.docx') - cv = Converter(source_pdf_file) - cv.convert(docx_file) - cv.close() - + c = Converter(source_pdf_file) + c.convert(docx_file) + c.close() + def convert_by_io_stream(self, filename): '''Convert PDF file from sample path to output path.''' source_pdf_file = os.path.join(sample_path, f'{filename}.pdf') with open(source_pdf_file, 'rb') as f: in_stream = f.read() - cv = Converter(stream=in_stream) + c = Converter(stream=in_stream) out_stream = io.BytesIO() - cv.convert(out_stream) - cv.close() + c.convert(out_stream) + c.close() docx_file = os.path.join(output_path, f'{filename}.docx') with open(docx_file, 'wb') as f: f.write(out_stream.getvalue()) @@ -141,7 +141,7 @@ def test_io_stream(self): # ------------------------------------------ def test_section(self): '''test page layout: section and column.''' - self.convert('demo-section') + self.convert('demo-section') def test_section_spacing(self): '''test page layout: section vertical position.''' @@ -185,13 +185,13 @@ def test_vector_graphic(self): '''test vector graphic.''' self.convert('demo-image-vector-graphic') - def test_image_cmyk(self): - '''test image in CMYK color-space.''' - self.convert('demo-image-cmyk') + def test_image_color_space(self): + '''test image color space.''' + self.convert('demo-image-colorspace') - def test_image_transparent(self): - '''test transparent images.''' - self.convert('demo-image-transparent') + def test_image_floating(self): + '''test floating images.''' + self.convert('demo-image-floating') def test_image_rotation(self): '''test rotating image due to pdf page rotation.''' @@ -210,7 +210,7 @@ def test_table_bottom(self): self.convert('demo-table-bottom') def test_table_format(self): - '''test table format, e.g. + '''test table format, e.g. - border and shading style - vertical cell - merged cell @@ -359,3 +359,4 @@ def test_quality(self): threshold = TestQuality.INDEX_MAP.get(filename, 0.10) print(f'Checking {filename}: {sidx} v.s. {threshold}') assert sidx>=threshold, 'Significant difference might exist since similarity index is lower than threshold.' +