From bc0fd4ac7c892baf35b0e475f6d8db8b1ac6dfae Mon Sep 17 00:00:00 2001 From: dothinking Date: Sun, 28 Jan 2024 21:07:36 +0800 Subject: [PATCH] fix hiding text issue #260 --- pdf2docx/image/ImagesExtractor.py | 180 ++++++++++++++++++++---------- pdf2docx/shape/Paths.py | 42 ++++--- 2 files changed, 144 insertions(+), 78 deletions(-) diff --git a/pdf2docx/image/ImagesExtractor.py b/pdf2docx/image/ImagesExtractor.py index c4e8ed7..4100fb1 100644 --- a/pdf2docx/image/ImagesExtractor.py +++ b/pdf2docx/image/ImagesExtractor.py @@ -16,61 +16,77 @@ class ImagesExtractor: + '''Extract images from PDF.''' + def __init__(self, page:fitz.Page) -> None: '''Extract images from PDF page. - + Args: page (fitz.Page): pdf page to extract images. ''' self._page = page - - def clip_page_to_pixmap(self, bbox:fitz.Rect=None, zoom:float=3.0): - '''Clip page pixmap (without text) according to ``bbox``. + + def clip_page_to_pixmap(self, + bbox:fitz.Rect=None, + rm_image:bool=False, + zoom:float=3.0): + '''Clip page pixmap according to ``bbox``. Args: bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page. Note that ``bbox`` depends on un-rotated page CS, while clipping page is based on the final page. + rm_image (bool): remove images or not. zoom (float, optional): Improve resolution by this rate. Defaults to 3.0. Returns: fitz.Pixmap: The extracted pixmap. - ''' - # hide text - self._hide_page_text(self._page) - + ''' + # hide text and images + stream_dict = self._hide_page_text_and_images(self._page, rm_text=True, rm_image=rm_image) + if bbox is None: clip_bbox = self._page.rect - + # transform to the final bbox when page is rotated elif self._page.rotation: clip_bbox = bbox * self._page.rotation_matrix - + else: clip_bbox = bbox - - clip_bbox = clip_bbox & self._page.rect - + + clip_bbox = self._page.rect & clip_bbox + # improve resolution # - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution # - https://github.com/pymupdf/PyMuPDF/issues/181 matrix = fitz.Matrix(zoom, zoom) + pix = self._page.get_pixmap(clip=clip_bbox, matrix=matrix) # type: fitz.Pixmap - return self._page.get_pixmap(clip=clip_bbox, matrix=matrix) # type: fitz.Pixmap + # recovery page if hide text + doc = self._page.parent + for xref, stream in stream_dict.items(): doc.update_stream(xref, stream) + + return pix - def clip_page_to_dict(self, bbox:fitz.Rect=None, clip_image_res_ratio:float=3.0): + def clip_page_to_dict(self, + bbox:fitz.Rect=None, + rm_image:bool=False, + clip_image_res_ratio:float=3.0): '''Clip page pixmap (without text) according to ``bbox`` and convert to source image. Args: bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page. - clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. Defaults to 3.0. + rm_image (bool): remove images or not. + clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. + Defaults to 3.0. Returns: list: A list of image raw dict. ''' - pix = self.clip_page_to_pixmap(bbox=bbox, zoom=clip_image_res_ratio) + pix = self.clip_page_to_pixmap(bbox=bbox, rm_image=rm_image, zoom=clip_image_res_ratio) return self._to_raw_dict(pix, bbox) @@ -78,30 +94,33 @@ def extract_images(self, clip_image_res_ratio:float=3.0): '''Extract normal images with ``Page.get_images()``. Args: - clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. Defaults to 3.0. + clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. + Defaults to 3.0. Returns: list: A list of extracted and recovered image raw dict. - + .. note:: - ``Page.get_images()`` contains each image only once, which may less than the real count of images in a page. + ``Page.get_images()`` contains each image only once, which may less than the + real count of images in a page. ''' # pdf document doc = self._page.parent rotation = self._page.rotation - # The final view might be formed by several images with alpha channel only, as shown in issue-123. - # It's still inconvenient to extract the original alpha/mask image, as a compromise, extract the - # equivalent image by clipping the union page region for now. + # The final view might be formed by several images with alpha channel only, + # as shown in issue-123. + # It's still inconvenient to extract the original alpha/mask image, as a compromise, + # extract the equivalent image by clipping the union page region for now. # https://github.com/dothinking/pdf2docx/issues/123 # step 1: collect images: [(bbox, item), ..., ] ic = Collection() for item in self._page.get_images(full=True): item = list(item) - item[-1] = 0 - - # find all occurrences referenced to this image + item[-1] = 0 + + # find all occurrences referenced to this image rects = self._page.get_image_rects(item) unrotated_page_bbox = self._page.cropbox # note the difference to page.rect for bbox in rects: @@ -125,8 +144,8 @@ def extract_images(self, clip_image_res_ratio:float=3.0): if len(group) > 1: clip_bbox = fitz.Rect() for (bbox, item) in group: clip_bbox |= bbox - raw_dict = self.clip_page_to_dict(clip_bbox, clip_image_res_ratio) - + raw_dict = self.clip_page_to_dict(clip_bbox, False, clip_image_res_ratio) + else: bbox, item = group[0] @@ -148,8 +167,8 @@ def extract_images(self, clip_image_res_ratio:float=3.0): # (22, 25, 1265, 1303, 8, 'DeviceGray', '', 'Im4', 'DCTDecode', 0) # (23, 0, 1731, 1331, 8, 'DeviceGray', '', 'Im5', 'DCTDecode', 0) if item[5]=='': - raw_dict = self.clip_page_to_dict(bbox, clip_image_res_ratio) - + raw_dict = self.clip_page_to_dict(bbox, False, clip_image_res_ratio) + # normal images else: # recover image, e.g., handle image with mask, or CMYK color space @@ -157,15 +176,19 @@ def extract_images(self, clip_image_res_ratio:float=3.0): # rotate image with opencv if page is rotated raw_dict = self._to_raw_dict(pix, bbox) - if rotation: + if rotation: raw_dict['image'] = self._rotate_image(pix, -rotation) images.append(raw_dict) - return images - - - def detect_svg_contours(self, min_svg_gap_dx:float, min_svg_gap_dy:float, min_w:float, min_h:float): + return images + + + def detect_svg_contours(self, + min_svg_gap_dx:float, + min_svg_gap_dy:float, + min_w:float, + min_h:float): '''Find contour of potential vector graphics. Args: @@ -180,23 +203,23 @@ def detect_svg_contours(self, min_svg_gap_dx:float, min_svg_gap_dy:float, min_w: import cv2 as cv # clip page and convert to opencv image - pixmap = self.clip_page_to_pixmap(zoom=1.0) + pixmap = self.clip_page_to_pixmap(rm_image=True, zoom=1.0) src = self._pixmap_to_cv_image(pixmap) # gray and binary gray = cv.cvtColor(src, cv.COLOR_BGR2GRAY) _, binary = cv.threshold(gray, 253, 255, cv.THRESH_BINARY_INV) - + # external bbox: split images with recursive xy cut - external_bboxes = recursive_xy_cut(binary, min_dx=min_svg_gap_dx, min_dy=min_svg_gap_dy) - + external_bboxes = recursive_xy_cut(binary, min_dx=min_svg_gap_dx, min_dy=min_svg_gap_dy) + # inner contours - grouped_inner_bboxes = [inner_contours(binary, bbox, min_w, min_h) for bbox in external_bboxes] + grouped_inner_bboxes = [inner_contours(binary, bbox, min_w, min_h) + for bbox in external_bboxes] # combined external and inner contours groups = list(zip(external_bboxes, grouped_inner_bboxes)) - - + # plot detected images for debug debug = False if debug: @@ -247,7 +270,7 @@ def _rotate_image(pixmap:fitz.Pixmap, rotation:int): Args: pixmap (fitz.Pixmap): Image to rotate. rotation (int): Rotation angle. - + Return: image bytes. ''' import cv2 as cv @@ -260,7 +283,7 @@ def _rotate_image(pixmap:fitz.Pixmap, rotation:int): # calculate the center of the image x0, y0 = w//2, h//2 - # default scale value for now -> might be extracted from PDF page property + # default scale value for now -> might be extracted from PDF page property scale = 1.0 # rotation matrix @@ -269,16 +292,16 @@ def _rotate_image(pixmap:fitz.Pixmap, rotation:int): # calculate the final dimension cos = np.abs(matrix[0, 0]) sin = np.abs(matrix[0, 1]) - + # compute the new bounding dimensions of the image W = int((h * sin) + (w * cos)) H = int((h * cos) + (w * sin)) - + # adjust the rotation matrix to take into account translation matrix[0, 2] += (W / 2) - x0 matrix[1, 2] += (H / 2) - y0 - - # perform the rotation holding at the center + + # perform the rotation holding at the center rotated_img = cv.warpAffine(img, matrix, (W, H)) # convert back to bytes @@ -287,30 +310,67 @@ def _rotate_image(pixmap:fitz.Pixmap, rotation:int): @staticmethod - def _hide_page_text(page:fitz.Page): - '''Hide page text before clipping page.''' + def _hide_page_text_and_images(page:fitz.Page, rm_text:bool, rm_image:bool): + '''Hide page text and images.''' # NOTE: text might exist in both content stream and form object stream # - content stream, i.e. direct page content # - form object, i.e. contents referenced by this page xref_list = [xref for (xref, name, invoker, bbox) in page.get_xobjects()] - xref_list.extend(page.get_contents()) + xref_list.extend(page.get_contents()) + # (1) hide text # render Tr: set the text rendering mode # - 3: neither fill nor stroke the text -> invisible # read more: # - https://github.com/pymupdf/PyMuPDF/issues/257 # - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf + def hide_text(stream): + res = stream + found = False + # set 3 Tr to text block + for k in ['BT', 'Tm', 'Td', '2 Tr']: + bk = k.encode() + if bk in stream: + found = True + res = res.replace(bk, f'{k} 3 Tr'.encode()) + return res, found + + # (2) hide image + # https://github.com/pymupdf/PyMuPDF/issues/338 + def hide_images(stream): + res = stream + found = False + # image names, e.g. [[270, 0, 261, 115, 8, 'DeviceRGB', '', 'Im1', 'DCTDecode']] + img_names = [item[7] for item in page.get_images(full=True)] + for k in img_names: + bk = f'/{k} Do'.encode() + if bk in stream: + found = True + res = res.replace(bk, b'') + return res, found + doc = page.parent # type: fitz.Document + source = {} for xref in xref_list: - stream = doc.xref_stream(xref).replace(b'BT', b'BT 3 Tr') \ - .replace(b'Tm', b'Tm 3 Tr') \ - .replace(b'Td', b'Td 3 Tr') - doc.update_stream(xref, stream) - + src = doc.xref_stream(xref) + + # try to hide text + stream, found_text = hide_text(src) if rm_text else (src, False) + + # try to hide images + stream, found_images = hide_images(stream) if rm_image else (stream, False) + + if found_text or found_images: + doc.update_stream(xref, stream) + source[xref] = src # save original stream + + return source + + @staticmethod def _recover_pixmap(doc:fitz.Document, item:list): """Restore pixmap with soft mask considered. - + References: * https://pymupdf.readthedocs.io/en/latest/document.html#Document.getPageImageList @@ -339,14 +399,14 @@ def _recover_pixmap(doc:fitz.Document, item:list): temp = fitz.Pixmap(pix, 0) # make temp pixmap w/o the alpha pix = None # release storage pix = temp - + # check dimension if pix.width==mask.width and pix.height==mask.height: pix = fitz.Pixmap(pix, mask) # now compose final pixmap else: logging.warning('Ignore image due to inconsistent size of color and mask pixmaps: %s', item) - # we may need to adjust something for CMYK pixmaps here -> + # we may need to adjust something for CMYK pixmaps here -> # recreate pixmap in RGB color space if necessary # NOTE: pix.colorspace may be None for images with alpha channel values only if 'CMYK' in item[5].upper(): @@ -365,4 +425,4 @@ def _pixmap_to_cv_image(pixmap:fitz.Pixmap): import cv2 as cv import numpy as np img_byte = pixmap.tobytes() - return cv.imdecode(np.frombuffer(img_byte, np.uint8), cv.IMREAD_COLOR) \ No newline at end of file + return cv.imdecode(np.frombuffer(img_byte, np.uint8), cv.IMREAD_COLOR) diff --git a/pdf2docx/shape/Paths.py b/pdf2docx/shape/Paths.py index c3148ec..df143c4 100644 --- a/pdf2docx/shape/Paths.py +++ b/pdf2docx/shape/Paths.py @@ -1,9 +1,7 @@ -# -*- coding: utf-8 -*- - ''' Objects representing PDF path (stroke and filling) extracted by ``page.get_drawings()``. -This method is new since ``PyMuPDF`` 1.18.0, with both pdf raw path and annotations like Line, +This method is new since ``PyMuPDF`` 1.18.0, with both pdf raw path and annotations like Line, Square and Highlight considered. * https://pymupdf.readthedocs.io/en/latest/page.html#Page.get_drawings @@ -28,9 +26,9 @@ def restore(self, raws:list): # ignore path out of page if not path.bbox.intersects(rect): continue self.append(path) - + return self - + @lazyproperty def bbox(self): bbox = fitz.Rect() @@ -57,7 +55,7 @@ def plot(self, page): canvas = page.new_shape() for path in self._instances: path.plot(canvas) canvas.commit() # commit the drawing shapes to page - + def to_shapes(self): '''Convert contained paths to ISO strokes or rectangular fills. @@ -73,10 +71,14 @@ def to_shapes(self): return shapes - def to_shapes_and_images(self, min_svg_gap_dx:float=15, min_svg_gap_dy:float=15, - min_w:float=2, min_h:float=2, clip_image_res_ratio:float=3.0): - '''Convert paths to iso-oriented shapes or images. The semantic type of path is either table/text style or - vector graphic. This method is to: + def to_shapes_and_images(self, + min_svg_gap_dx:float=15, + min_svg_gap_dy:float=15, + min_w:float=2, + min_h:float=2, + clip_image_res_ratio:float=3.0): + '''Convert paths to iso-oriented shapes or images. The semantic type of path is either + table/text style or vector graphic. This method is to: * detect svg regions -> exist at least one non-iso-oriented path * convert svg to bitmap by clipping page * convert the rest paths to iso-oriented shapes for further table/text style parsing @@ -86,7 +88,8 @@ def to_shapes_and_images(self, min_svg_gap_dx:float=15, min_svg_gap_dy:float=15, min_svg_gap_dy (float): Merge svg if the vertical gap is less than this value. min_w (float): Ignore contours if the bbox width is less than this value. min_h (float): Ignore contours if the bbox height is less than this value. - clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. Defaults to 3.0. + clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. + Defaults to 3.0. Returns: tuple: (list of shape raw dict, list of image raw dict). @@ -104,7 +107,7 @@ def to_shapes_and_images(self, min_svg_gap_dx:float=15, min_svg_gap_dy:float=15, # `bbox` is the external bbox of current region, while `inner_bboxes` are the inner contours # of level-2 hierarchy, i.e. contours under table cell. - # * it a table (or text style) if paths contained in `bbox` but excluded from `inner_bboxes` + # * it a table (or text style) if paths contained in `bbox` but excluded from `inner_bboxes` # are all iso-oriented -> export iso-shapes, clip page image based on `inner_bboxes`; # * otherwise, it's a vector graphic -> clip page image (without any text) based on `bbox` def contained_in_inner_contours(path:Path, contours:list): @@ -115,22 +118,25 @@ def contained_in_inner_contours(path:Path, contours:list): # group every path to one of the detected bbox group_paths = [Paths() for _ in groups] # type: list[Paths] for path in self._instances: - for (bbox, inner_bboxes), paths in zip(groups, group_paths): + for (bbox, inner_bboxes), paths in zip(groups, group_paths): if path.bbox.intersects(bbox): if not contained_in_inner_contours(path, inner_bboxes): paths.append(path) break - + # check each group for (bbox, inner_bboxes), paths in zip(groups, group_paths): # all iso-oriented paths -> it's a table, but might contain svg in cell as well if paths.is_iso_oriented: iso_shapes.extend(paths.to_shapes()) for svg_bbox in inner_bboxes: - images.append(ie.clip_page_to_dict(fitz.Rect(svg_bbox), clip_image_res_ratio)) - + images.append(ie.clip_page_to_dict(bbox=fitz.Rect(svg_bbox), + rm_image=True, + clip_image_res_ratio=clip_image_res_ratio)) + # otherwise, it's a svg else: - images.append(ie.clip_page_to_dict(fitz.Rect(bbox), clip_image_res_ratio)) + images.append(ie.clip_page_to_dict(bbox=fitz.Rect(bbox), + rm_image=True, + clip_image_res_ratio=clip_image_res_ratio)) return iso_shapes, images -