Skip to content

Commit

Permalink
tweak Converter #58 #67
Browse files Browse the repository at this point in the history
  • Loading branch information
dothinking committed Dec 31, 2020
1 parent fd0683e commit dcfdd56
Show file tree
Hide file tree
Showing 5 changed files with 155 additions and 116 deletions.
205 changes: 118 additions & 87 deletions pdf2docx/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def __init__(self, pdf_file:str):
self._fitz_doc = fitz.Document(pdf_file)

# initialize pages
self.pages = [Page(fitz_page) for fitz_page in self._fitz_doc]
self._pages = [Page(fitz_page) for fitz_page in self._fitz_doc]


def __getitem__(self, index):
Expand All @@ -35,27 +35,99 @@ def __getitem__(self, index):
stop = num
else:
stop = index.stop
pages = [self.pages[i] for i in range(stop)]
pages = [self._pages[i] for i in range(stop)]
return pages[index]
else:
try:
page = self.pages[index]
page = self._pages[index]
except IndexError:
msg = f'Page index {index} out of range'
raise IndexError(msg)
else:
return page


def __len__(self): return len(self.pages)
def __len__(self): return len(self._pages)


@property
def doc_pdf(self): return self._fitz_doc
def fitz_doc(self): return self._fitz_doc


def close(self): self._fitz_doc.close()


def parse(self, page_indexes=None, config:dict=None):
    '''Parse pages in specified page_indexes.
        ---
        Args:
        - page_indexes: iterable of zero-based page indexes to parse; all pages if None
        - config      : dict, configuration parameters passed to the parser of each page

        Returns the converter itself, so that calls can be chained.
        A page raising an error during parsing is reported and skipped.
    '''
    # Only None means "all pages". An explicitly empty selection must parse nothing,
    # otherwise a caller holding an empty page segment would parse the whole document.
    if page_indexes is None:
        indexes = range(len(self._pages))
    else:
        indexes = list(page_indexes) # materialized since the count is shown in the progress

    num_pages = len(indexes)
    for i, idx in enumerate(indexes, start=1):
        print(f'\rParsing Page {idx+1}: {i}/{num_pages}...', end='', flush=True)
        try:
            self._pages[idx].parse(config)
        except Exception as e:
            # best effort: a broken (or out of range) page must not stop the other pages
            print(f'\nIgnore page due to error: {e}', flush=True)

    return self


def make_docx(self, docx_filename=None):
    '''Create docx file with converted pages. Note to run page parsing first.
        ---
        Args:
        - docx_filename: DOCX filename to write to; by default the PDF filename with
          its extension replaced by ".docx"

        Raises an Exception if there is no parsed page. A page failing to be created
        is reported and skipped.
    '''
    # check parsed pages
    parsed_pages = [page for page in self._pages if page.finalized]
    if not parsed_pages:
        raise Exception('No parsed pages. Please parse page first.')

    # docx file to convert to.
    # Replace the extension only: str.replace('.pdf', '.docx') leaves names such as
    # "sample.PDF" unchanged, so the source PDF itself would be removed below.
    if docx_filename:
        filename = docx_filename
    else:
        filename = os.path.splitext(self.filename_pdf)[0] + '.docx'
    if os.path.exists(filename): os.remove(filename)

    # create page by page
    docx_file = Document()
    num_pages = len(parsed_pages)
    print()
    for i, page in enumerate(parsed_pages, start=1):
        print(f'\rCreating Page {page.id+1}: {i}/{num_pages}...', end='')
        try:
            page.make_docx(docx_file)
        except Exception as e:
            # best effort: keep creating the remaining pages
            print(f'Ignore page due to error: {e}', flush=True)

    # save docx
    docx_file.save(filename)


def store(self):
    '''Collect document data in dict format: base name of the PDF file, count of all
        pages, and the stored data of parsed (finalized) pages only.
    '''
    parsed = [page.store() for page in filter(lambda p: p.finalized, self._pages)]
    return dict(
        filename=os.path.basename(self.filename_pdf),
        page_num=len(self._pages), # count of all pages, parsed or not
        pages=parsed,
    )


def restore(self, data:dict):
    '''Restore pages from parsed results.
        ---
        Args:
        - data: dict, parsed results; each item of data['pages'] is restored to the
          page at the index given by its 'id' (expected to be in the format
          created by `store()`)

        Page data with a missing or out of range id is reported and skipped.
    '''
    num_pages = len(self._pages)
    for raw_page in data.get('pages', []):
        idx = raw_page.get('id', -1)
        # A missing id (-1) or any negative value would silently overwrite a page
        # counted from the end of the list; a too large one would raise IndexError.
        if not isinstance(idx, int) or not 0 <= idx < num_pages:
            print(f'Ignore page data due to invalid page id: {idx}', flush=True)
            continue
        self._pages[idx].restore(raw_page)


def serialize(self, filename:str):
    '''Dump the parsed pages, i.e. the dict returned by `store()`, to the specified
        JSON file (UTF-8 encoded, indented by 4 spaces).
    '''
    with open(filename, 'w', encoding='utf-8') as json_file:
        parsed = self.store()
        text = json.dumps(parsed, indent=4)
        json_file.write(text)


def deserialize(self, filename:str):
    '''Load parsed pages from specified JSON file.
        ---
        Args:
        - filename: JSON file to read; its content is passed to `restore()`
    '''
    # read with an explicit encoding (the one `serialize()` writes with) rather than
    # the platform dependent default
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    self.restore(data)


def debug_page(self, i:int, docx_filename:str=None, debug_pdf=None, layout_file=None, config:dict=None):
''' Parse, create and plot single page for debug purpose.
Expand All @@ -80,121 +152,93 @@ def debug_page(self, i:int, docx_filename:str=None, debug_pdf=None, layout_file=
'debug_filename': debug_pdf
})

# parse and make page
self.make_docx(docx_filename, pages=[i], config=config)
# parse and create docx
self.convert(docx_filename, pages=[i], config=config)

# layout information for debugging
pages[0].serialize(layout_file)
self.serialize(layout_file)


def make_docx(self, docx_filename=None, start=0, end=None, pages=None, config:dict=None):
def convert(self, docx_filename=None, start=0, end=None, pages=None, config:dict=None):
''' Convert specified PDF pages to DOCX file.
docx_filename : DOCX filename to write to
start : first page to process, starting from zero if --zero_based_index=True
end : last page to process, starting from zero if --zero_based_index=True
start : first page to process
end : last page to process
pages : range of pages
config : configuration parameters
'''
config = config if config else {}

# DOCX file to convert to
docx_file = Document()
filename = docx_filename if docx_filename else self.filename_pdf.replace('.pdf', '.docx')
if os.path.exists(filename): os.remove(filename)

# PDF pages to convert
zero_based = config.get('zero_based_index', True)
page_indexes = self._page_indexes(start, end, pages, len(self), zero_based)
# pages to convert
page_indexes = self._page_indexes(start, end, pages, len(self))

# convert page by page
t0 = perf_counter()
if config.get('multi_processing', False):
self._make_docx_multi_processing(docx_file, page_indexes, config)
self._parse_and_create_pages_with_multi_processing(docx_filename, page_indexes, config)
else:
self._make_docx(docx_file, page_indexes, config)
print(f'\n{"-"*50}\nTerminated in {perf_counter()-t0}s.')

# save docx
docx_file.save(filename)
self._parse_and_create_pages(docx_filename, page_indexes, config)
print(f'\n{"-"*50}\nTerminated in {perf_counter()-t0}s.')


def extract_tables(self, start=0, end=None, pages=None, config:dict=None):
'''Extract table contents from specified PDF pages.'''
# PDF pages to convert
config = config if config else {}
zero_based = config.get('zero_based_index', True)
page_indexes = self._page_indexes(start, end, pages, len(self), zero_based)
page_indexes = self._page_indexes(start, end, pages, len(self))

# process page by page
tables = []
num_pages = len(page_indexes)
for i in page_indexes:
print(f'\rProcessing Pages: {i+1}/{num_pages}...')
page_tables = self.parse(self.doc_pdf[i], config).extract_tables()
tables.extend(page_tables)
self.parse(page_indexes, config)

# get parsed tables
tables = []
for page in self._pages:
if page.finalized: tables.extend(page.extract_tables())
return tables


def _make_docx(self, docx_file:Document, page_indexes:list, config:dict):
def _parse_and_create_pages(self, docx_filename:str, page_indexes:list, config:dict):
''' Parse and create pages based on page indexes.
---
Args:
- docx_file : docx.Document, docx file write to
- page_indexes: list[int], page indexes to parse
- docx_filename: DOCX filename to write to
- page_indexes : list[int], page indexes to parse
'''
num_pages = len(page_indexes)
for i in page_indexes:
print(f'\rProcessing Pages: {i+1}/{num_pages}...', end='', flush=True)
self.pages[i].parse(config).make_docx(docx_file)
self.parse(page_indexes=page_indexes, config=config).make_docx(docx_filename)


def _make_docx_multi_processing(self, docx_file:Document, page_indexes:list, config:dict):
def _parse_and_create_pages_with_multi_processing(self, docx_filename:str, page_indexes:list, config:dict):
''' Parse and create pages based on page indexes.
---
Args:
- docx_file : docx.Document, docx file write to
- page_indexes: list[int], page indexes to parse
- docx_filename: DOCX filename to write to
- page_indexes : list[int], page indexes to parse
https://pymupdf.readthedocs.io/en/latest/faq.html#multiprocessing
'''
# make vectors of arguments for the processes
cpu = min(config['cpu_count'], cpu_count()) if 'cpu_count' in config else cpu_count()
start, end = min(page_indexes), max(page_indexes)
prefix_layout = 'layout'
vectors = [(i, cpu, start, end, self.filename_pdf, config, f'{prefix_layout}-{i}.json') for i in range(cpu)]
prefix = 'pages' # json file writing parsed pages per process
vectors = [(i, cpu, start, end, self.filename_pdf, config, f'{prefix}-{i}.json') for i in range(cpu)]

# start parsing processes
pool = Pool()
pool.map(self._make_docx_per_cpu, vectors, 1)
pool.map(self._parse_pages_per_cpu, vectors, 1)

# read parsed page data
raw_pages = {}
# restore parsed page data
for i in range(cpu):
filename = f'{prefix_layout}-{i}.json'
filename = f'{prefix}-{i}.json'
if not os.path.exists(filename): continue
with open(filename, 'r') as f:
raw_pages.update(json.load(f))
self.deserialize(filename)
os.remove(filename)

# restore pages and create docx pages
print()
num_pages = len(page_indexes)
pages = []
for page_index in page_indexes:
key = str(page_index)
if key not in raw_pages: continue

print(f'\rCreating Pages: {page_index+1}/{num_pages}...', end='')
layout = Page()
layout.restore(raw_pages[key]).make_docx(docx_file)
pages.append(layout)

return pages
# create docx file
self.make_docx(docx_filename)


@staticmethod
def _make_docx_per_cpu(vector):
def _parse_pages_per_cpu(vector):
''' Parse a page range of a document and serialize the parsed pages to a JSON file.
---
Args:
Expand All @@ -213,36 +257,23 @@ def _make_docx_per_cpu(vector):
cv = Converter(pdf_filename)

# all pages to process
pages_indexes = range(s, e+1)
num_pages = len(pages_indexes)
all_indexes = range(s, e+1)
num_pages = len(all_indexes)

# pages per segment
seg_size = int(num_pages/cpu) + 1
seg_from = idx * seg_size
seg_to = min(seg_from + seg_size, num_pages)
page_indexes = [all_indexes[i] for i in range(seg_from, seg_to)]

res = {}
for i in range(seg_from, seg_to): # work through page segment
page_index = pages_indexes[i]
print(f'\rParsing Pages: {page_index+1}/{num_pages} per CPU {idx}...', end='', flush=True)

# store page parsed results
page = cv.doc_pdf[page_index]
res[page_index] = cv.parse(page, config).store()

# serialize results
with open(json_filename, 'w') as f:
f.write(json.dumps(res))
# parse pages and serialize data for further processing
cv.parse(page_indexes, config)
cv.serialize(json_filename)
cv.close()


@staticmethod
def _page_indexes(start, end, pages, pdf_len, zero_based=True):
# index starts from zero or one
if not zero_based:
start = max(start-1, 0)
if end: end -= 1
if pages: pages = [i-1 for i in pages]

def _page_indexes(start, end, pages, pdf_len):
# parsing arguments
if pages:
indexes = [int(x) for x in pages if 0<=x<pdf_len]
Expand Down
25 changes: 20 additions & 5 deletions pdf2docx/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,23 +39,33 @@ def convert(pdf_file, docx_file=None, start=0, end=None, pages=None, **kwargs):
clip_image_res_ratio : 3.0, resolution ratio (to 72dpi) when clipping page image
curve_path_ratio : 0.2, clip page bitmap if the component of curve paths exceeds this ratio
'''
# index starts from zero or one
if not kwargs.get('zero_based_index', True):
start = max(start-1, 0)
if end: end -= 1
if pages: pages = [i-1 for i in pages]

cv = Converter(pdf_file)
cv.make_docx(docx_file, start, end, pages, kwargs)
cv.convert(docx_file, start, end, pages, kwargs)
cv.close()


@staticmethod
def debug_page(pdf_file, page_index=0, docx_file=None, debug_pdf=None, layout_file='layout.json', **kwargs):
def debug(pdf_file, page_index=0, docx_file=None, debug_pdf=None, layout_file='layout.json', **kwargs):
''' Convert one PDF page and plot layout information for debugging.
Args:
pdf_file (str) : PDF filename to read from
page_index (int): page index to convert
page_index (int): page index to convert (starting from zero if --zero_based_index=True)
docx_file (str): DOCX filename to write to (change extension from "pdf" to "docx" by default)
debug_pdf (str): new pdf file storing layout information (add prefix "debug_" by default)
layout_file (str): new json file storing parsed layout data (layout.json by default)
kwargs (dict) : configuration parameters
'''
# index starts from zero or one
if not kwargs.get('zero_based_index', True):
page_index = max(page_index-1, 0)

cv = Converter(pdf_file)
cv.debug_page(page_index, docx_file, debug_pdf, layout_file, kwargs)
cv.close()
Expand All @@ -67,10 +77,15 @@ def table(pdf_file, start=0, end=None, pages=None, **kwargs):
Args:
pdf_file (str) : PDF filename to read from
start (int) : first page to process, starting from zero
end (int) : last page to process, starting from zero
start (int) : first page to process (starting from zero if --zero_based_index=True)
end (int) : last page to process (starting from zero if --zero_based_index=True)
pages (list) : range of pages
'''
# index starts from zero or one
if not kwargs.get('zero_based_index', True):
start = max(start-1, 0)
if end: end -= 1
if pages: pages = [i-1 for i in pages]

cv = Converter(pdf_file)
tables = cv.extract_tables(start, end, pages, kwargs)
Expand Down

0 comments on commit dcfdd56

Please sign in to comment.