From f30fb2bbbd90aa9aaed1d79b9aaad8979cbde47c Mon Sep 17 00:00:00 2001 From: dothinking Date: Tue, 23 Jan 2024 01:10:35 +0800 Subject: [PATCH] process shape partly out of page; ignore replacement character `\ufffd`; fix empty font name issue; #256 --- pdf2docx/font/Fonts.py | 2 + pdf2docx/shape/Shapes.py | 82 +++++++++++++++++++-------------------- pdf2docx/text/TextSpan.py | 14 ++++--- 3 files changed, 50 insertions(+), 48 deletions(-) diff --git a/pdf2docx/font/Fonts.py b/pdf2docx/font/Fonts.py index e369c81..44163f3 100644 --- a/pdf2docx/font/Fonts.py +++ b/pdf2docx/font/Fonts.py @@ -70,6 +70,8 @@ def extract(cls, fitz_doc): fonts = [] for xref in xrefs: basename, ext, _, buffer = fitz_doc.extract_font(xref) + if not basename: continue + basename = decode(basename) name = cls._normalized_font_name(basename) diff --git a/pdf2docx/shape/Shapes.py b/pdf2docx/shape/Shapes.py index a45f3ac..f84489a 100644 --- a/pdf2docx/shape/Shapes.py +++ b/pdf2docx/shape/Shapes.py @@ -1,7 +1,4 @@ -# -*- coding: utf-8 -*- - -'''A group of ``Shape`` instances. -''' +'''A group of ``Shape`` instances.''' from .Shape import Shape, Stroke, Fill, Hyperlink from ..common.share import RectType @@ -26,20 +23,16 @@ def restore(self, raws:list): shape = Fill(raw) # add to list self.append(shape) - return self - def _update_bbox(self, shape:Shape): + def _update_bbox(self, e:Shape): ''' override. Do nothing.''' - pass @property def strokes(self): - ''' Stroke Shapes, including table border, text underline and strike-through. - Cache it once calculated since it doesn't change generally. - ''' + ''' Stroke Shapes, including table border, text underline and strike-through.''' instances = list(filter( lambda shape: isinstance(shape, Stroke), self._instances)) return Shapes(instances) @@ -47,9 +40,7 @@ def strokes(self): @property def fillings(self): - ''' Fill Shapes, including cell shading and highlight. - Cache it once calculated since it doesn't change generally. - ''' + ''' Fill Shapes, including cell shading and highlight.''' # white bg-color is by default, so ignore those fillings instances = list(filter( lambda shape: isinstance(shape, Fill) and \ @@ -72,21 +63,24 @@ def table_strokes(self): lambda shape: shape.has_potential_type(RectType.BORDER), self._instances)) return ElementCollection(instances) - + @property def table_fillings(self): '''Potential table shadings.''' instances = list(filter( lambda shape: shape.has_potential_type(RectType.SHADING), self._instances)) return ElementCollection(instances) - + + @property def text_style_shapes(self): - '''Potential text style based shapes, e.g. underline, strike-through, highlight and hyperlink.''' - f = lambda shape: shape.has_potential_type(RectType.HIGHLIGHT) or \ - shape.has_potential_type(RectType.UNDERLINE) or \ - shape.has_potential_type(RectType.STRIKE) or \ - shape.has_potential_type(RectType.HYPERLINK) + '''Potential text style based shapes, + e.g. underline, strike-through, highlight and hyperlink.''' + def f(shape): + return shape.has_potential_type(RectType.HIGHLIGHT) or \ + shape.has_potential_type(RectType.UNDERLINE) or \ + shape.has_potential_type(RectType.STRIKE) or \ + shape.has_potential_type(RectType.HYPERLINK) instances = set(filter(f, self._instances)) return ElementCollection(instances) @@ -101,19 +95,24 @@ def clean_up(self, max_border_width:float, shape_min_dimension:float): Args: max_border_width (float): The max border width. - shape_min_dimension (float): Ignore shape if both width and height is lower than this value. + shape_min_dimension (float): Ignore shape if both width and height + is lower than this value. """ if not self._instances: return - # remove small shapes or shapes out of page + # remove small shapes or shapes out of page; and + # update bbox in case part of the shape is out of page page_bbox = self.parent.bbox - f = lambda shape: shape.bbox.intersects(page_bbox) and \ - max(shape.bbox.width, shape.bbox.height)>=shape_min_dimension - cleaned_shapes = list(filter(f, self._instances)) # type: list[Shape] + cleaned_shapes = [] # type: list[Shape] + for s in self: + if max(s.bbox.width, s.bbox.height)= 0.5*h_span: # In general, highlight color isn't white - if rect.color != rgb_value((1,1,1)) and self.get_main_bbox(rect, constants.FACTOR_MAJOR): + if rect.color != rgb_value((1,1,1)) and \ + self.get_main_bbox(rect, constants.FACTOR_MAJOR): rect.type = RectType.HIGHLIGHT # near to bottom of span? yes, underline @@ -400,7 +402,7 @@ def _set_text_format(self, docx_run): # font name font_name = self.font docx_run.font.name = font_name - docx_run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name) # set font for chinese characters + docx_run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name) # for CJK characters docx_run.font.color.rgb = RGBColor(*rgb_component(self.color)) # font size @@ -419,8 +421,8 @@ def _set_text_format(self, docx_run): for style in self.style: t = style['type'] - # Built-in method is provided to set highlight in python-docx, but supports only limited colors; - # so, set character shading instead if out of highlight color scope + # Built-in method is provided to set highlight in python-docx,but supports only + # limited colors; so, set character shading instead if out of highlight color scope. if t==RectType.HIGHLIGHT.value: docx.set_char_shading(docx_run, style['color'])