Skip to content

Commit

Permalink
fix hiding text issue #260
Browse files Browse the repository at this point in the history
  • Loading branch information
dothinking committed Jan 28, 2024
1 parent 7d16370 commit bc0fd4a
Show file tree
Hide file tree
Showing 2 changed files with 144 additions and 78 deletions.
180 changes: 120 additions & 60 deletions pdf2docx/image/ImagesExtractor.py
Expand Up @@ -16,92 +16,111 @@


class ImagesExtractor:
'''Extract images from PDF.'''

def __init__(self, page:fitz.Page) -> None:
'''Extract images from PDF page.
Args:
page (fitz.Page): pdf page to extract images.
'''
self._page = page


def clip_page_to_pixmap(self, bbox:fitz.Rect=None, zoom:float=3.0):
'''Clip page pixmap (without text) according to ``bbox``.

def clip_page_to_pixmap(self,
bbox:fitz.Rect=None,
rm_image:bool=False,
zoom:float=3.0):
'''Clip page pixmap according to ``bbox``.
Args:
bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page.
Note that ``bbox`` depends on un-rotated page CS, while clipping page is based on
the final page.
rm_image (bool): remove images or not.
zoom (float, optional): Improve resolution by this rate. Defaults to 3.0.
Returns:
fitz.Pixmap: The extracted pixmap.
'''
# hide text
self._hide_page_text(self._page)
'''
# hide text and images
stream_dict = self._hide_page_text_and_images(self._page, rm_text=True, rm_image=rm_image)

if bbox is None:
clip_bbox = self._page.rect

# transform to the final bbox when page is rotated
elif self._page.rotation:
clip_bbox = bbox * self._page.rotation_matrix

else:
clip_bbox = bbox
clip_bbox = clip_bbox & self._page.rect

clip_bbox = self._page.rect & clip_bbox

# improve resolution
# - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
# - https://github.com/pymupdf/PyMuPDF/issues/181
matrix = fitz.Matrix(zoom, zoom)
pix = self._page.get_pixmap(clip=clip_bbox, matrix=matrix) # type: fitz.Pixmap

return self._page.get_pixmap(clip=clip_bbox, matrix=matrix) # type: fitz.Pixmap
# recovery page if hide text
doc = self._page.parent
for xref, stream in stream_dict.items(): doc.update_stream(xref, stream)

return pix


def clip_page_to_dict(self, bbox:fitz.Rect=None, clip_image_res_ratio:float=3.0):
def clip_page_to_dict(self,
bbox:fitz.Rect=None,
rm_image:bool=False,
clip_image_res_ratio:float=3.0):
'''Clip page pixmap (without text) according to ``bbox`` and convert to source image.
Args:
bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page.
clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. Defaults to 3.0.
rm_image (bool): remove images or not.
clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap.
Defaults to 3.0.
Returns:
list: A list of image raw dict.
'''
pix = self.clip_page_to_pixmap(bbox=bbox, zoom=clip_image_res_ratio)
pix = self.clip_page_to_pixmap(bbox=bbox, rm_image=rm_image, zoom=clip_image_res_ratio)
return self._to_raw_dict(pix, bbox)


def extract_images(self, clip_image_res_ratio:float=3.0):
'''Extract normal images with ``Page.get_images()``.
Args:
clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. Defaults to 3.0.
clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap.
Defaults to 3.0.
Returns:
list: A list of extracted and recovered image raw dict.
.. note::
``Page.get_images()`` contains each image only once, which may less than the real count of images in a page.
``Page.get_images()`` contains each image only once, which may less than the
real count of images in a page.
'''
# pdf document
doc = self._page.parent
rotation = self._page.rotation

# The final view might be formed by several images with alpha channel only, as shown in issue-123.
# It's still inconvenient to extract the original alpha/mask image, as a compromise, extract the
# equivalent image by clipping the union page region for now.
# The final view might be formed by several images with alpha channel only,
# as shown in issue-123.
# It's still inconvenient to extract the original alpha/mask image, as a compromise,
# extract the equivalent image by clipping the union page region for now.
# https://github.com/dothinking/pdf2docx/issues/123

# step 1: collect images: [(bbox, item), ..., ]
ic = Collection()
for item in self._page.get_images(full=True):
item = list(item)
item[-1] = 0
# find all occurrences referenced to this image
item[-1] = 0

# find all occurrences referenced to this image
rects = self._page.get_image_rects(item)
unrotated_page_bbox = self._page.cropbox # note the difference to page.rect
for bbox in rects:
Expand All @@ -125,8 +144,8 @@ def extract_images(self, clip_image_res_ratio:float=3.0):
if len(group) > 1:
clip_bbox = fitz.Rect()
for (bbox, item) in group: clip_bbox |= bbox
raw_dict = self.clip_page_to_dict(clip_bbox, clip_image_res_ratio)
raw_dict = self.clip_page_to_dict(clip_bbox, False, clip_image_res_ratio)

else:
bbox, item = group[0]

Expand All @@ -148,24 +167,28 @@ def extract_images(self, clip_image_res_ratio:float=3.0):
# (22, 25, 1265, 1303, 8, 'DeviceGray', '', 'Im4', 'DCTDecode', 0)
# (23, 0, 1731, 1331, 8, 'DeviceGray', '', 'Im5', 'DCTDecode', 0)
if item[5]=='':
raw_dict = self.clip_page_to_dict(bbox, clip_image_res_ratio)
raw_dict = self.clip_page_to_dict(bbox, False, clip_image_res_ratio)

# normal images
else:
# recover image, e.g., handle image with mask, or CMYK color space
pix = self._recover_pixmap(doc, item)

# rotate image with opencv if page is rotated
raw_dict = self._to_raw_dict(pix, bbox)
if rotation:
if rotation:
raw_dict['image'] = self._rotate_image(pix, -rotation)

images.append(raw_dict)

return images


def detect_svg_contours(self, min_svg_gap_dx:float, min_svg_gap_dy:float, min_w:float, min_h:float):
return images


def detect_svg_contours(self,
min_svg_gap_dx:float,
min_svg_gap_dy:float,
min_w:float,
min_h:float):
'''Find contour of potential vector graphics.
Args:
Expand All @@ -180,23 +203,23 @@ def detect_svg_contours(self, min_svg_gap_dx:float, min_svg_gap_dy:float, min_w:
import cv2 as cv

# clip page and convert to opencv image
pixmap = self.clip_page_to_pixmap(zoom=1.0)
pixmap = self.clip_page_to_pixmap(rm_image=True, zoom=1.0)
src = self._pixmap_to_cv_image(pixmap)

# gray and binary
gray = cv.cvtColor(src, cv.COLOR_BGR2GRAY)
_, binary = cv.threshold(gray, 253, 255, cv.THRESH_BINARY_INV)

# external bbox: split images with recursive xy cut
external_bboxes = recursive_xy_cut(binary, min_dx=min_svg_gap_dx, min_dy=min_svg_gap_dy)
external_bboxes = recursive_xy_cut(binary, min_dx=min_svg_gap_dx, min_dy=min_svg_gap_dy)

# inner contours
grouped_inner_bboxes = [inner_contours(binary, bbox, min_w, min_h) for bbox in external_bboxes]
grouped_inner_bboxes = [inner_contours(binary, bbox, min_w, min_h)
for bbox in external_bboxes]

# combined external and inner contours
groups = list(zip(external_bboxes, grouped_inner_bboxes))



# plot detected images for debug
debug = False
if debug:
Expand Down Expand Up @@ -247,7 +270,7 @@ def _rotate_image(pixmap:fitz.Pixmap, rotation:int):
Args:
pixmap (fitz.Pixmap): Image to rotate.
rotation (int): Rotation angle.
Return: image bytes.
'''
import cv2 as cv
Expand All @@ -260,7 +283,7 @@ def _rotate_image(pixmap:fitz.Pixmap, rotation:int):
# calculate the center of the image
x0, y0 = w//2, h//2

# default scale value for now -> might be extracted from PDF page property
# default scale value for now -> might be extracted from PDF page property
scale = 1.0

# rotation matrix
Expand All @@ -269,16 +292,16 @@ def _rotate_image(pixmap:fitz.Pixmap, rotation:int):
# calculate the final dimension
cos = np.abs(matrix[0, 0])
sin = np.abs(matrix[0, 1])

# compute the new bounding dimensions of the image
W = int((h * sin) + (w * cos))
H = int((h * cos) + (w * sin))

# adjust the rotation matrix to take into account translation
matrix[0, 2] += (W / 2) - x0
matrix[1, 2] += (H / 2) - y0
# perform the rotation holding at the center

# perform the rotation holding at the center
rotated_img = cv.warpAffine(img, matrix, (W, H))

# convert back to bytes
Expand All @@ -287,30 +310,67 @@ def _rotate_image(pixmap:fitz.Pixmap, rotation:int):


@staticmethod
def _hide_page_text(page:fitz.Page):
'''Hide page text before clipping page.'''
def _hide_page_text_and_images(page:fitz.Page, rm_text:bool, rm_image:bool):
'''Hide page text and images.'''
# NOTE: text might exist in both content stream and form object stream
# - content stream, i.e. direct page content
# - form object, i.e. contents referenced by this page
xref_list = [xref for (xref, name, invoker, bbox) in page.get_xobjects()]
xref_list.extend(page.get_contents())
xref_list.extend(page.get_contents())

# (1) hide text
# render Tr: set the text rendering mode
# - 3: neither fill nor stroke the text -> invisible
# read more:
# - https://github.com/pymupdf/PyMuPDF/issues/257
# - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
def hide_text(stream):
res = stream
found = False
# set 3 Tr to text block
for k in ['BT', 'Tm', 'Td', '2 Tr']:
bk = k.encode()
if bk in stream:
found = True
res = res.replace(bk, f'{k} 3 Tr'.encode())
return res, found

# (2) hide image
# https://github.com/pymupdf/PyMuPDF/issues/338
def hide_images(stream):
res = stream
found = False
# image names, e.g. [[270, 0, 261, 115, 8, 'DeviceRGB', '', 'Im1', 'DCTDecode']]
img_names = [item[7] for item in page.get_images(full=True)]
for k in img_names:
bk = f'/{k} Do'.encode()
if bk in stream:
found = True
res = res.replace(bk, b'')
return res, found

doc = page.parent # type: fitz.Document
source = {}
for xref in xref_list:
stream = doc.xref_stream(xref).replace(b'BT', b'BT 3 Tr') \
.replace(b'Tm', b'Tm 3 Tr') \
.replace(b'Td', b'Td 3 Tr')
doc.update_stream(xref, stream)

src = doc.xref_stream(xref)

# try to hide text
stream, found_text = hide_text(src) if rm_text else (src, False)

# try to hide images
stream, found_images = hide_images(stream) if rm_image else (stream, False)

if found_text or found_images:
doc.update_stream(xref, stream)
source[xref] = src # save original stream

return source


@staticmethod
def _recover_pixmap(doc:fitz.Document, item:list):
"""Restore pixmap with soft mask considered.
References:
* https://pymupdf.readthedocs.io/en/latest/document.html#Document.getPageImageList
Expand Down Expand Up @@ -339,14 +399,14 @@ def _recover_pixmap(doc:fitz.Document, item:list):
temp = fitz.Pixmap(pix, 0) # make temp pixmap w/o the alpha
pix = None # release storage
pix = temp

# check dimension
if pix.width==mask.width and pix.height==mask.height:
pix = fitz.Pixmap(pix, mask) # now compose final pixmap
else:
logging.warning('Ignore image due to inconsistent size of color and mask pixmaps: %s', item)

# we may need to adjust something for CMYK pixmaps here ->
# we may need to adjust something for CMYK pixmaps here ->
# recreate pixmap in RGB color space if necessary
# NOTE: pix.colorspace may be None for images with alpha channel values only
if 'CMYK' in item[5].upper():
Expand All @@ -365,4 +425,4 @@ def _pixmap_to_cv_image(pixmap:fitz.Pixmap):
import cv2 as cv
import numpy as np
img_byte = pixmap.tobytes()
return cv.imdecode(np.frombuffer(img_byte, np.uint8), cv.IMREAD_COLOR)
return cv.imdecode(np.frombuffer(img_byte, np.uint8), cv.IMREAD_COLOR)

0 comments on commit bc0fd4a

Please sign in to comment.