From bc0fd4ac7c892baf35b0e475f6d8db8b1ac6dfae Mon Sep 17 00:00:00 2001
From: dothinking <train8808@gmail.com>
Date: Sun, 28 Jan 2024 21:07:36 +0800
Subject: [PATCH] fix hiding text issue #260

---
 pdf2docx/image/ImagesExtractor.py | 180 ++++++++++++++++++++----------
 pdf2docx/shape/Paths.py           |  42 ++++---
 2 files changed, 144 insertions(+), 78 deletions(-)

diff --git a/pdf2docx/image/ImagesExtractor.py b/pdf2docx/image/ImagesExtractor.py
index c4e8ed7..4100fb1 100644
--- a/pdf2docx/image/ImagesExtractor.py
+++ b/pdf2docx/image/ImagesExtractor.py
@@ -16,61 +16,77 @@
 
 
 class ImagesExtractor:
+    '''Extract images from PDF.'''
+
     def __init__(self, page:fitz.Page) -> None:
         '''Extract images from PDF page.
-        
+
         Args:
             page (fitz.Page): pdf page to extract images.
         '''
         self._page = page
-    
 
-    def clip_page_to_pixmap(self, bbox:fitz.Rect=None, zoom:float=3.0):
-        '''Clip page pixmap (without text) according to ``bbox``.
+
+    def clip_page_to_pixmap(self,
+                            bbox:fitz.Rect=None,
+                            rm_image:bool=False,
+                            zoom:float=3.0):
+        '''Clip page pixmap according to ``bbox``.
 
         Args:
             bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page.
                 Note that ``bbox`` depends on un-rotated page CS, while clipping page is based on
                 the final page.
+            rm_image (bool, optional): Hide images before clipping if True. Defaults to False.
             zoom (float, optional): Improve resolution by this rate. Defaults to 3.0.
 
         Returns:
             fitz.Pixmap: The extracted pixmap.
-        '''        
-        # hide text 
-        self._hide_page_text(self._page)
-        
+        '''
+        # hide text (always) and images (if ``rm_image``), keeping the original streams
+        stream_dict = self._hide_page_text_and_images(self._page, rm_text=True, rm_image=rm_image)
+
         if bbox is None:
             clip_bbox = self._page.rect
-        
+
         # transform to the final bbox when page is rotated
         elif self._page.rotation:
             clip_bbox = bbox * self._page.rotation_matrix
-            
+
         else:
             clip_bbox = bbox
-        
-        clip_bbox = clip_bbox & self._page.rect
-        
+
+        clip_bbox =  self._page.rect & clip_bbox
+
         # improve resolution
         # - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
         # - https://github.com/pymupdf/PyMuPDF/issues/181
         matrix = fitz.Matrix(zoom, zoom)
+        pix = self._page.get_pixmap(clip=clip_bbox, matrix=matrix) # type: fitz.Pixmap
 
-        return self._page.get_pixmap(clip=clip_bbox, matrix=matrix) # type: fitz.Pixmap
+        # restore the original content streams modified above for hiding text/images
+        doc = self._page.parent
+        for xref, stream in stream_dict.items(): doc.update_stream(xref, stream)
+
+        return pix
 
 
-    def clip_page_to_dict(self, bbox:fitz.Rect=None, clip_image_res_ratio:float=3.0):
+    def clip_page_to_dict(self,
+                          bbox:fitz.Rect=None,
+                          rm_image:bool=False,
+                          clip_image_res_ratio:float=3.0):
         '''Clip page pixmap (without text) according to ``bbox`` and convert to source image.
 
         Args:
             bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page.
-            clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. Defaults to 3.0.
+            rm_image (bool, optional): Hide images before clipping if True. Defaults to False.
+            clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap.
+                Defaults to 3.0.
 
         Returns:
             list: A list of image raw dict.
         '''
-        pix = self.clip_page_to_pixmap(bbox=bbox, zoom=clip_image_res_ratio)
+        pix = self.clip_page_to_pixmap(bbox=bbox, rm_image=rm_image, zoom=clip_image_res_ratio)
         return self._to_raw_dict(pix, bbox)
 
 
@@ -78,30 +94,33 @@ def extract_images(self, clip_image_res_ratio:float=3.0):
         '''Extract normal images with ``Page.get_images()``.
 
         Args:
-            clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. Defaults to 3.0.
+            clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap.
+                Defaults to 3.0.
 
         Returns:
             list: A list of extracted and recovered image raw dict.
-        
+
         .. note::
-            ``Page.get_images()`` contains each image only once, which may less than the real count of images in a page.
+            ``Page.get_images()`` contains each image only once, which may be less than the
+            real count of images in a page.
         '''
         # pdf document
         doc = self._page.parent
         rotation = self._page.rotation
 
-        # The final view might be formed by several images with alpha channel only, as shown in issue-123. 
-        # It's still inconvenient to extract the original alpha/mask image, as a compromise, extract the 
-        # equivalent image by clipping the union page region for now.
+        # The final view might be formed by several images with alpha channel only,
+        # as shown in issue-123.
+        # It's still inconvenient to extract the original alpha/mask image, as a compromise,
+        # extract the equivalent image by clipping the union page region for now.
         # https://github.com/dothinking/pdf2docx/issues/123
 
         # step 1: collect images: [(bbox, item), ..., ]
         ic = Collection()
         for item in self._page.get_images(full=True):
             item = list(item)
-            item[-1] = 0            
-            
-            # find all occurrences referenced to this image            
+            item[-1] = 0
+
+            # find all occurrences referenced to this image
             rects = self._page.get_image_rects(item)
             unrotated_page_bbox = self._page.cropbox # note the difference to page.rect
             for bbox in rects:
@@ -125,8 +144,8 @@ def extract_images(self, clip_image_res_ratio:float=3.0):
             if len(group) > 1:
                 clip_bbox = fitz.Rect()
                 for (bbox, item) in group: clip_bbox |= bbox
-                raw_dict = self.clip_page_to_dict(clip_bbox, clip_image_res_ratio)
-            
+                raw_dict = self.clip_page_to_dict(clip_bbox, False, clip_image_res_ratio)
+
             else:
                 bbox, item = group[0]
 
@@ -148,8 +167,8 @@ def extract_images(self, clip_image_res_ratio:float=3.0):
                 # (22, 25, 1265, 1303, 8, 'DeviceGray', '', 'Im4', 'DCTDecode', 0)
                 # (23, 0, 1731, 1331, 8, 'DeviceGray', '', 'Im5', 'DCTDecode', 0)
                 if item[5]=='':
-                    raw_dict = self.clip_page_to_dict(bbox, clip_image_res_ratio)
-                
+                    raw_dict = self.clip_page_to_dict(bbox, False, clip_image_res_ratio)
+
                 # normal images
                 else:
                     # recover image, e.g., handle image with mask, or CMYK color space
@@ -157,15 +176,19 @@ def extract_images(self, clip_image_res_ratio:float=3.0):
 
                     # rotate image with opencv if page is rotated
                     raw_dict = self._to_raw_dict(pix, bbox)
-                    if rotation: 
+                    if rotation:
                         raw_dict['image'] = self._rotate_image(pix, -rotation)
 
             images.append(raw_dict)
 
-        return images    
-        
-    
-    def detect_svg_contours(self, min_svg_gap_dx:float, min_svg_gap_dy:float, min_w:float, min_h:float):
+        return images
+
+
+    def detect_svg_contours(self,
+                            min_svg_gap_dx:float,
+                            min_svg_gap_dy:float,
+                            min_w:float,
+                            min_h:float):
         '''Find contour of potential vector graphics.
 
         Args:
@@ -180,23 +203,23 @@ def detect_svg_contours(self, min_svg_gap_dx:float, min_svg_gap_dy:float, min_w:
         import cv2 as cv
 
         # clip page and convert to opencv image
-        pixmap = self.clip_page_to_pixmap(zoom=1.0)
+        pixmap = self.clip_page_to_pixmap(rm_image=True, zoom=1.0)
         src = self._pixmap_to_cv_image(pixmap)
 
         # gray and binary
         gray = cv.cvtColor(src, cv.COLOR_BGR2GRAY)
         _, binary = cv.threshold(gray, 253, 255, cv.THRESH_BINARY_INV)
-        
+
         # external bbox: split images with recursive xy cut
-        external_bboxes = recursive_xy_cut(binary, min_dx=min_svg_gap_dx, min_dy=min_svg_gap_dy)        
-        
+        external_bboxes = recursive_xy_cut(binary, min_dx=min_svg_gap_dx, min_dy=min_svg_gap_dy)
+
         # inner contours
-        grouped_inner_bboxes = [inner_contours(binary, bbox, min_w, min_h) for bbox in external_bboxes]
+        grouped_inner_bboxes = [inner_contours(binary, bbox, min_w, min_h)
+                                for bbox in external_bboxes]
 
         # combined external and inner contours
         groups = list(zip(external_bboxes, grouped_inner_bboxes))
-            
-        
+
         # plot detected images for debug
         debug = False
         if debug:
@@ -247,7 +270,7 @@ def _rotate_image(pixmap:fitz.Pixmap, rotation:int):
         Args:
             pixmap (fitz.Pixmap): Image to rotate.
             rotation (int): Rotation angle.
-        
+
         Return: image bytes.
         '''
         import cv2 as cv
@@ -260,7 +283,7 @@ def _rotate_image(pixmap:fitz.Pixmap, rotation:int):
         # calculate the center of the image
         x0, y0 = w//2, h//2
 
-        # default scale value for now -> might be extracted from PDF page property    
+        # default scale value for now -> might be extracted from PDF page property
         scale = 1.0
 
         # rotation matrix
@@ -269,16 +292,16 @@ def _rotate_image(pixmap:fitz.Pixmap, rotation:int):
         # calculate the final dimension
         cos = np.abs(matrix[0, 0])
         sin = np.abs(matrix[0, 1])
-    
+
         # compute the new bounding dimensions of the image
         W = int((h * sin) + (w * cos))
         H = int((h * cos) + (w * sin))
-    
+
         # adjust the rotation matrix to take into account translation
         matrix[0, 2] += (W / 2) - x0
         matrix[1, 2] += (H / 2) - y0
-        
-        # perform the rotation holding at the center        
+
+        # perform the rotation holding at the center
         rotated_img = cv.warpAffine(img, matrix, (W, H))
 
         # convert back to bytes
@@ -287,30 +310,67 @@ def _rotate_image(pixmap:fitz.Pixmap, rotation:int):
 
 
     @staticmethod
-    def _hide_page_text(page:fitz.Page):
-        '''Hide page text before clipping page.'''
+    def _hide_page_text_and_images(page:fitz.Page, rm_text:bool, rm_image:bool):
+        '''Hide page text and/or images, and return the original streams of changed xrefs.'''
         # NOTE: text might exist in both content stream and form object stream
         # - content stream, i.e. direct page content
         # - form object, i.e. contents referenced by this page
         xref_list = [xref for (xref, name, invoker, bbox) in page.get_xobjects()]
-        xref_list.extend(page.get_contents())        
+        xref_list.extend(page.get_contents())
 
+        # (1) hide text
         # render Tr: set the text rendering mode
         # - 3: neither fill nor stroke the text -> invisible
         # read more:
         # - https://github.com/pymupdf/PyMuPDF/issues/257
         # - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
+        def hide_text(stream):
+            res = stream
+            found = False
+            # append ``3 Tr`` (invisible text rendering mode) after each matched keyword
+            for k in ['BT', 'Tm', 'Td', '2 Tr']:
+                bk = k.encode()
+                if bk in stream:
+                    found = True
+                    res = res.replace(bk, f'{k} 3 Tr'.encode())
+            return res, found
+
+        # (2) hide image
+        # https://github.com/pymupdf/PyMuPDF/issues/338
+        def hide_images(stream):
+            res = stream
+            found = False
+            # image names, e.g. [[270, 0, 261, 115, 8, 'DeviceRGB', '', 'Im1', 'DCTDecode']]
+            img_names = [item[7] for item in page.get_images(full=True)]
+            for k in img_names:
+                bk = f'/{k} Do'.encode()
+                if bk in stream:
+                    found = True
+                    res = res.replace(bk, b'')
+            return res, found
+
         doc = page.parent # type: fitz.Document
+        source = {}
         for xref in xref_list:
-            stream = doc.xref_stream(xref).replace(b'BT', b'BT 3 Tr') \
-                                             .replace(b'Tm', b'Tm 3 Tr') \
-                                             .replace(b'Td', b'Td 3 Tr')
-            doc.update_stream(xref, stream)
-   
+            src = doc.xref_stream(xref)
+
+            # try to hide text
+            stream, found_text = hide_text(src) if rm_text else (src, False)
+
+            # try to hide images
+            stream, found_images = hide_images(stream) if rm_image else (stream, False)
+
+            if found_text or found_images:
+                doc.update_stream(xref, stream)
+                source[xref] = src # save original stream
+
+        return source
+
+
     @staticmethod
     def _recover_pixmap(doc:fitz.Document, item:list):
         """Restore pixmap with soft mask considered.
-        
+
         References:
 
             * https://pymupdf.readthedocs.io/en/latest/document.html#Document.getPageImageList        
@@ -339,14 +399,14 @@ def _recover_pixmap(doc:fitz.Document, item:list):
                 temp = fitz.Pixmap(pix, 0)  # make temp pixmap w/o the alpha
                 pix = None  # release storage
                 pix = temp
-            
+
             # check dimension
             if pix.width==mask.width and pix.height==mask.height:
                 pix = fitz.Pixmap(pix, mask)  # now compose final pixmap
             else:
                 logging.warning('Ignore image due to inconsistent size of color and mask pixmaps: %s', item)
 
-        # we may need to adjust something for CMYK pixmaps here -> 
+        # we may need to adjust something for CMYK pixmaps here ->
         # recreate pixmap in RGB color space if necessary
         # NOTE: pix.colorspace may be None for images with alpha channel values only
         if 'CMYK' in item[5].upper():
@@ -365,4 +425,4 @@ def _pixmap_to_cv_image(pixmap:fitz.Pixmap):
         import cv2 as cv
         import numpy as np
         img_byte = pixmap.tobytes()
-        return cv.imdecode(np.frombuffer(img_byte, np.uint8), cv.IMREAD_COLOR)
\ No newline at end of file
+        return cv.imdecode(np.frombuffer(img_byte, np.uint8), cv.IMREAD_COLOR)
diff --git a/pdf2docx/shape/Paths.py b/pdf2docx/shape/Paths.py
index c3148ec..df143c4 100644
--- a/pdf2docx/shape/Paths.py
+++ b/pdf2docx/shape/Paths.py
@@ -1,9 +1,7 @@
-# -*- coding: utf-8 -*-
-
 '''
 Objects representing PDF path (stroke and filling) extracted by ``page.get_drawings()``.
 
-This method is new since ``PyMuPDF`` 1.18.0, with both pdf raw path and annotations like Line, 
+This method is new since ``PyMuPDF`` 1.18.0, with both pdf raw path and annotations like Line,
 Square and Highlight considered.
 
 * https://pymupdf.readthedocs.io/en/latest/page.html#Page.get_drawings
@@ -28,9 +26,9 @@ def restore(self, raws:list):
             # ignore path out of page
             if not path.bbox.intersects(rect): continue
             self.append(path)
-        
+
         return self
-    
+
     @lazyproperty
     def bbox(self):
         bbox = fitz.Rect()
@@ -57,7 +55,7 @@ def plot(self, page):
         canvas = page.new_shape()
         for path in self._instances: path.plot(canvas)
         canvas.commit() # commit the drawing shapes to page
-    
+
 
     def to_shapes(self):
         '''Convert contained paths to ISO strokes or rectangular fills.
@@ -73,10 +71,14 @@ def to_shapes(self):
         return shapes
 
 
-    def to_shapes_and_images(self, min_svg_gap_dx:float=15, min_svg_gap_dy:float=15, 
-                                min_w:float=2, min_h:float=2, clip_image_res_ratio:float=3.0):
-        '''Convert paths to iso-oriented shapes or images. The semantic type of path is either table/text style or 
-        vector graphic. This method is to:
+    def to_shapes_and_images(self,
+                             min_svg_gap_dx:float=15,
+                             min_svg_gap_dy:float=15,
+                             min_w:float=2,
+                             min_h:float=2,
+                             clip_image_res_ratio:float=3.0):
+        '''Convert paths to iso-oriented shapes or images. The semantic type of path is either
+        table/text style or vector graphic. This method is to:
         * detect svg regions -> exist at least one non-iso-oriented path
         * convert svg to bitmap by clipping page
         * convert the rest paths to iso-oriented shapes for further table/text style parsing
@@ -86,7 +88,8 @@ def to_shapes_and_images(self, min_svg_gap_dx:float=15, min_svg_gap_dy:float=15,
             min_svg_gap_dy (float): Merge svg if the vertical gap is less than this value.
             min_w (float): Ignore contours if the bbox width is less than this value.
             min_h (float): Ignore contours if the bbox height is less than this value.
-            clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. Defaults to 3.0.
+            clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap.
+                Defaults to 3.0.
 
         Returns:
             tuple: (list of shape raw dict, list of image raw dict).
@@ -104,7 +107,7 @@ def to_shapes_and_images(self, min_svg_gap_dx:float=15, min_svg_gap_dy:float=15,
 
         # `bbox` is the external bbox of current region, while `inner_bboxes` are the inner contours
         # of level-2 hierarchy, i.e. contours under table cell.
-        # * it a table (or text style) if paths contained in `bbox` but excluded from `inner_bboxes` 
+        # * it's a table (or text style) if paths in `bbox` but excluded from `inner_bboxes`
         #   are all iso-oriented -> export iso-shapes, clip page image based on `inner_bboxes`;
         # * otherwise, it's a vector graphic -> clip page image (without any text) based on `bbox`
         def contained_in_inner_contours(path:Path, contours:list):
@@ -115,22 +118,25 @@ def contained_in_inner_contours(path:Path, contours:list):
         # group every path to one of the detected bbox
         group_paths = [Paths() for _ in groups] # type: list[Paths]
         for path in self._instances:
-            for (bbox, inner_bboxes), paths in zip(groups, group_paths):            
+            for (bbox, inner_bboxes), paths in zip(groups, group_paths):
                 if path.bbox.intersects(bbox):
                     if not contained_in_inner_contours(path, inner_bboxes): paths.append(path)
                     break
-        
+
         # check each group
         for (bbox, inner_bboxes), paths in zip(groups, group_paths): 
             # all iso-oriented paths -> it's a table, but might contain svg in cell as well
             if paths.is_iso_oriented:
                 iso_shapes.extend(paths.to_shapes())
                 for svg_bbox in inner_bboxes:
-                    images.append(ie.clip_page_to_dict(fitz.Rect(svg_bbox), clip_image_res_ratio))
-            
+                    images.append(ie.clip_page_to_dict(bbox=fitz.Rect(svg_bbox),
+                                                        rm_image=True,
+                                                        clip_image_res_ratio=clip_image_res_ratio))
+
             # otherwise, it's a svg
             else:
-                images.append(ie.clip_page_to_dict(fitz.Rect(bbox), clip_image_res_ratio))
+                images.append(ie.clip_page_to_dict(bbox=fitz.Rect(bbox),
+                                                   rm_image=True,
+                                                   clip_image_res_ratio=clip_image_res_ratio))
 
         return iso_shapes, images
-