fix hiding text issue #260

ArtifexSoftware · Jan 28, 2024 · bc0fd4a · bc0fd4a
1 parent 7d16370
commit bc0fd4a
Show file tree

Hide file tree

Showing 2 changed files with 144 additions and 78 deletions.
diff --git a/pdf2docx/image/ImagesExtractor.py b/pdf2docx/image/ImagesExtractor.py
@@ -16,92 +16,111 @@
 
 
 class ImagesExtractor:
+    '''Extract images from PDF.'''
+
     def __init__(self, page:fitz.Page) -> None:
         '''Extract images from PDF page.
-        
+
         Args:
             page (fitz.Page): pdf page to extract images.
         '''
         self._page = page
-
 
-    def clip_page_to_pixmap(self, bbox:fitz.Rect=None, zoom:float=3.0):
-        '''Clip page pixmap (without text) according to ``bbox``.
+
+    def clip_page_to_pixmap(self,
+                            bbox:fitz.Rect=None,
+                            rm_image:bool=False,
+                            zoom:float=3.0):
+        '''Clip page pixmap according to ``bbox``.
 
         Args:
             bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page.
                 Note that ``bbox`` depends on un-rotated page CS, while clipping page is based on
                 the final page.
+            rm_image (bool): remove images or not.
             zoom (float, optional): Improve resolution by this rate. Defaults to 3.0.
 
         Returns:
             fitz.Pixmap: The extracted pixmap.
-        '''        
-        # hide text 
-        self._hide_page_text(self._page)
-        
+        '''
+        # hide text and images
+        stream_dict = self._hide_page_text_and_images(self._page, rm_text=True, rm_image=rm_image)
+
         if bbox is None:
             clip_bbox = self._page.rect
-        
+
         # transform to the final bbox when page is rotated
         elif self._page.rotation:
             clip_bbox = bbox * self._page.rotation_matrix
-            
+
         else:
             clip_bbox = bbox
-        
-        clip_bbox = clip_bbox & self._page.rect
-        
+
+        clip_bbox =  self._page.rect & clip_bbox
+
         # improve resolution
         # - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
         # - https://github.com/pymupdf/PyMuPDF/issues/181
         matrix = fitz.Matrix(zoom, zoom)
+        pix = self._page.get_pixmap(clip=clip_bbox, matrix=matrix) # type: fitz.Pixmap
 
-        return self._page.get_pixmap(clip=clip_bbox, matrix=matrix) # type: fitz.Pixmap
+        # recovery page if hide text
+        doc = self._page.parent
+        for xref, stream in stream_dict.items(): doc.update_stream(xref, stream)
+
+        return pix
 
 
-    def clip_page_to_dict(self, bbox:fitz.Rect=None, clip_image_res_ratio:float=3.0):
+    def clip_page_to_dict(self,
+                          bbox:fitz.Rect=None,
+                          rm_image:bool=False,
+                          clip_image_res_ratio:float=3.0):
         '''Clip page pixmap (without text) according to ``bbox`` and convert to source image.
 
         Args:
             bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page.
-            clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. Defaults to 3.0.
+            rm_image (bool): remove images or not.
+            clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap.
+                Defaults to 3.0.
 
         Returns:
             list: A list of image raw dict.
         '''
-        pix = self.clip_page_to_pixmap(bbox=bbox, zoom=clip_image_res_ratio)
+        pix = self.clip_page_to_pixmap(bbox=bbox, rm_image=rm_image, zoom=clip_image_res_ratio)
         return self._to_raw_dict(pix, bbox)
 
 
     def extract_images(self, clip_image_res_ratio:float=3.0):
         '''Extract normal images with ``Page.get_images()``.
 
         Args:
-            clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. Defaults to 3.0.
+            clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap.
+                Defaults to 3.0.
 
         Returns:
             list: A list of extracted and recovered image raw dict.
-        
+
         .. note::
-            ``Page.get_images()`` contains each image only once, which may less than the real count of images in a page.
+            ``Page.get_images()`` contains each image only once, which may less than the
+            real count of images in a page.
         '''
         # pdf document
         doc = self._page.parent
         rotation = self._page.rotation
 
-        # The final view might be formed by several images with alpha channel only, as shown in issue-123. 
-        # It's still inconvenient to extract the original alpha/mask image, as a compromise, extract the 
-        # equivalent image by clipping the union page region for now.
+        # The final view might be formed by several images with alpha channel only,
+        # as shown in issue-123.
+        # It's still inconvenient to extract the original alpha/mask image, as a compromise,
+        # extract the equivalent image by clipping the union page region for now.
         # https://github.com/dothinking/pdf2docx/issues/123
 
         # step 1: collect images: [(bbox, item), ..., ]
         ic = Collection()
         for item in self._page.get_images(full=True):
             item = list(item)
-            item[-1] = 0            
-            
-            # find all occurrences referenced to this image            
+            item[-1] = 0
+
+            # find all occurrences referenced to this image
             rects = self._page.get_image_rects(item)
             unrotated_page_bbox = self._page.cropbox # note the difference to page.rect
             for bbox in rects:
@@ -125,8 +144,8 @@ def extract_images(self, clip_image_res_ratio:float=3.0):
             if len(group) > 1:
                 clip_bbox = fitz.Rect()
                 for (bbox, item) in group: clip_bbox |= bbox
-                raw_dict = self.clip_page_to_dict(clip_bbox, clip_image_res_ratio)
-            
+                raw_dict = self.clip_page_to_dict(clip_bbox, False, clip_image_res_ratio)
+
             else:
                 bbox, item = group[0]
 
@@ -148,24 +167,28 @@ def extract_images(self, clip_image_res_ratio:float=3.0):
                 # (22, 25, 1265, 1303, 8, 'DeviceGray', '', 'Im4', 'DCTDecode', 0)
                 # (23, 0, 1731, 1331, 8, 'DeviceGray', '', 'Im5', 'DCTDecode', 0)
                 if item[5]=='':
-                    raw_dict = self.clip_page_to_dict(bbox, clip_image_res_ratio)
-                
+                    raw_dict = self.clip_page_to_dict(bbox, False, clip_image_res_ratio)
+
                 # normal images
                 else:
                     # recover image, e.g., handle image with mask, or CMYK color space
                     pix = self._recover_pixmap(doc, item)
 
                     # rotate image with opencv if page is rotated
                     raw_dict = self._to_raw_dict(pix, bbox)
-                    if rotation: 
+                    if rotation:
                         raw_dict['image'] = self._rotate_image(pix, -rotation)
 
             images.append(raw_dict)
 
-        return images    
-
-
-    def detect_svg_contours(self, min_svg_gap_dx:float, min_svg_gap_dy:float, min_w:float, min_h:float):
+        return images
+
+
+    def detect_svg_contours(self,
+                            min_svg_gap_dx:float,
+                            min_svg_gap_dy:float,
+                            min_w:float,
+                            min_h:float):
         '''Find contour of potential vector graphics.
 
         Args:
@@ -180,23 +203,23 @@ def detect_svg_contours(self, min_svg_gap_dx:float, min_svg_gap_dy:float, min_w:
         import cv2 as cv
 
         # clip page and convert to opencv image
-        pixmap = self.clip_page_to_pixmap(zoom=1.0)
+        pixmap = self.clip_page_to_pixmap(rm_image=True, zoom=1.0)
         src = self._pixmap_to_cv_image(pixmap)
 
         # gray and binary
         gray = cv.cvtColor(src, cv.COLOR_BGR2GRAY)
         _, binary = cv.threshold(gray, 253, 255, cv.THRESH_BINARY_INV)
-        
+
         # external bbox: split images with recursive xy cut
-        external_bboxes = recursive_xy_cut(binary, min_dx=min_svg_gap_dx, min_dy=min_svg_gap_dy)        
-        
+        external_bboxes = recursive_xy_cut(binary, min_dx=min_svg_gap_dx, min_dy=min_svg_gap_dy)
+
         # inner contours
-        grouped_inner_bboxes = [inner_contours(binary, bbox, min_w, min_h) for bbox in external_bboxes]
+        grouped_inner_bboxes = [inner_contours(binary, bbox, min_w, min_h)
+                                for bbox in external_bboxes]
 
         # combined external and inner contours
         groups = list(zip(external_bboxes, grouped_inner_bboxes))
-
-
+
         # plot detected images for debug
         debug = False
         if debug:
@@ -247,7 +270,7 @@ def _rotate_image(pixmap:fitz.Pixmap, rotation:int):
         Args:
             pixmap (fitz.Pixmap): Image to rotate.
             rotation (int): Rotation angle.
-        
+
         Return: image bytes.
         '''
         import cv2 as cv
@@ -260,7 +283,7 @@ def _rotate_image(pixmap:fitz.Pixmap, rotation:int):
         # calculate the center of the image
         x0, y0 = w//2, h//2
 
-        # default scale value for now -> might be extracted from PDF page property    
+        # default scale value for now -> might be extracted from PDF page property
         scale = 1.0
 
         # rotation matrix
@@ -269,16 +292,16 @@ def _rotate_image(pixmap:fitz.Pixmap, rotation:int):
         # calculate the final dimension
         cos = np.abs(matrix[0, 0])
         sin = np.abs(matrix[0, 1])
-    
+
         # compute the new bounding dimensions of the image
         W = int((h * sin) + (w * cos))
         H = int((h * cos) + (w * sin))
-    
+
         # adjust the rotation matrix to take into account translation
         matrix[0, 2] += (W / 2) - x0
         matrix[1, 2] += (H / 2) - y0
-        
-        # perform the rotation holding at the center        
+
+        # perform the rotation holding at the center
         rotated_img = cv.warpAffine(img, matrix, (W, H))
 
         # convert back to bytes
@@ -287,30 +310,67 @@ def _rotate_image(pixmap:fitz.Pixmap, rotation:int):
 
 
     @staticmethod
-    def _hide_page_text(page:fitz.Page):
-        '''Hide page text before clipping page.'''
+    def _hide_page_text_and_images(page:fitz.Page, rm_text:bool, rm_image:bool):
+        '''Hide page text and images.'''
         # NOTE: text might exist in both content stream and form object stream
         # - content stream, i.e. direct page content
         # - form object, i.e. contents referenced by this page
         xref_list = [xref for (xref, name, invoker, bbox) in page.get_xobjects()]
-        xref_list.extend(page.get_contents())        
+        xref_list.extend(page.get_contents())
 
+        # (1) hide text
         # render Tr: set the text rendering mode
         # - 3: neither fill nor stroke the text -> invisible
         # read more:
         # - https://github.com/pymupdf/PyMuPDF/issues/257
         # - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
+        def hide_text(stream):
+            res = stream
+            found = False
+            # set 3 Tr to text block
+            for k in ['BT', 'Tm', 'Td', '2 Tr']:
+                bk = k.encode()
+                if bk in stream:
+                    found = True
+                    res = res.replace(bk, f'{k} 3 Tr'.encode())
+            return res, found
+
+        # (2) hide image
+        # https://github.com/pymupdf/PyMuPDF/issues/338
+        def hide_images(stream):
+            res = stream
+            found = False
+            # image names, e.g. [[270, 0, 261, 115, 8, 'DeviceRGB', '', 'Im1', 'DCTDecode']]
+            img_names = [item[7] for item in page.get_images(full=True)]
+            for k in img_names:
+                bk = f'/{k} Do'.encode()
+                if bk in stream:
+                    found = True
+                    res = res.replace(bk, b'')
+            return res, found
+
         doc = page.parent # type: fitz.Document
+        source = {}
         for xref in xref_list:
-            stream = doc.xref_stream(xref).replace(b'BT', b'BT 3 Tr') \
-                                             .replace(b'Tm', b'Tm 3 Tr') \
-                                             .replace(b'Td', b'Td 3 Tr')
-            doc.update_stream(xref, stream)
-
+            src = doc.xref_stream(xref)
+
+            # try to hide text
+            stream, found_text = hide_text(src) if rm_text else (src, False)
+
+            # try to hide images
+            stream, found_images = hide_images(stream) if rm_image else (stream, False)
+
+            if found_text or found_images:
+                doc.update_stream(xref, stream)
+                source[xref] = src # save original stream
+
+        return source
+
+
     @staticmethod
     def _recover_pixmap(doc:fitz.Document, item:list):
         """Restore pixmap with soft mask considered.
-        
+
         References:
 
             * https://pymupdf.readthedocs.io/en/latest/document.html#Document.getPageImageList        
@@ -339,14 +399,14 @@ def _recover_pixmap(doc:fitz.Document, item:list):
                 temp = fitz.Pixmap(pix, 0)  # make temp pixmap w/o the alpha
                 pix = None  # release storage
                 pix = temp
-            
+
             # check dimension
             if pix.width==mask.width and pix.height==mask.height:
                 pix = fitz.Pixmap(pix, mask)  # now compose final pixmap
             else:
                 logging.warning('Ignore image due to inconsistent size of color and mask pixmaps: %s', item)
 
-        # we may need to adjust something for CMYK pixmaps here -> 
+        # we may need to adjust something for CMYK pixmaps here ->
         # recreate pixmap in RGB color space if necessary
         # NOTE: pix.colorspace may be None for images with alpha channel values only
         if 'CMYK' in item[5].upper():
@@ -365,4 +425,4 @@ def _pixmap_to_cv_image(pixmap:fitz.Pixmap):
         import cv2 as cv
         import numpy as np
         img_byte = pixmap.tobytes()
-        return cv.imdecode(np.frombuffer(img_byte, np.uint8), cv.IMREAD_COLOR)
+        return cv.imdecode(np.frombuffer(img_byte, np.uint8), cv.IMREAD_COLOR)