Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixing GUI relevant issues #254

Merged
merged 5 commits into from
Jan 22, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
109 changes: 31 additions & 78 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,36 @@ on:
# However, only step 1 is kept, given the difficulty of obtaining a runner with MS Word installed.
# -----------------------------------------------------------------------------------------------------
jobs:

pdf2docx-docker:

runs-on: ubuntu-latest

container:
image: python:3.8

steps:
- name: Check out code
uses: actions/checkout@v2

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install pytest
python setup.py develop

pdf2docx:
- name: Run unit test
run: |
pytest -v ./test/test.py::TestConversion


pdf2docx-ubuntu:

runs-on: ubuntu-latest

needs: pdf2docx-docker

strategy:
matrix:
python-version: ["3.8", "3.9", "3.10"]
Expand All @@ -43,93 +68,21 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install pytest pytest-cov codecov
pip install pytest pytest-cov
python setup.py develop

- name: Run unit test
run: |
pytest -v ./test/test.py::TestConversion --cov=./pdf2docx --cov-report=xml

- name: Upload coverage reports to Codecov
run: |
codecov
env: # Or as an environment variable
super_secret: ${{ secrets.CODECOV_TOKEN }}
uses: codecov/codecov-action@v3
with: # Or as an environment variable
token: ${{ secrets.CODECOV_TOKEN }}

# upload docx for further job
- name: Archive package
uses: actions/upload-artifact@v2
with:
name: outputs
path: ./test/outputs


# docx2pdf:
# # a specific runner with MS Word installed
# runs-on: self-hosted

# needs: pdf2docx

# steps:
# - name: Checkout code
# uses: actions/checkout@v2

# # download artifacts from depending job
# - name: Download artifacts
# uses: actions/download-artifact@v2
# with:
# name: outputs
# path: test\outputs

# # convert docx to pdf with OfficeToPDF
# - name: Convert to PDF
# run: |
# cd test\outputs
# $files = Get-ChildItem "."
# for ($i=0; $i -lt $files.Count; $i++) {
# $name = $files[$i].name;
# echo "Converting $name to pdf...";
# OfficeToPDF $files[$i]
# }
# del *.docx

# # upload pdf for further job
# - name: Archive package
# uses: actions/upload-artifact@v2
# with:
# name: outputs
# path: test\outputs


# check_quality:

# runs-on: ubuntu-latest

# needs: docx2pdf

# steps:
# - name: Check out code
# uses: actions/checkout@v2

# # download artifacts from depending job
# - name: Download artifacts
# uses: actions/download-artifact@v2
# with:
# name: outputs
# path: ./test/outputs

# - name: Set up Python 3.x
# uses: actions/setup-python@v1
# with:
# python-version: '3.x'

# - name: Install dependencies
# run: |
# python -m pip install --upgrade pip
# pip install -r requirements.txt
# pip install pytest
# python setup.py develop

# - name: Check converting quality
# run: |
# pytest -sv ./test/test.py::TestQuality
path: ./test/outputs
1 change: 0 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
include *.md
include LICENSE*
include requirements.txt
include pdf2docx/gui/*.ico
prune test
include test/*.py
include test/samples/*.pdf
8 changes: 2 additions & 6 deletions pdf2docx/gui/App.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,19 @@
'''Main window for ``pdf2docx`` graphic user interface.'''

import os
import sys
from tkinter import Tk, messagebox
from .MainFrame import MainFrame


class App(Tk):
'''Simple graphic user interface.'''
def __init__(self, title:str='App', width:int=300, height:int=200):
'''Top app window.'''
super().__init__()
self.title(title)
self.geometry(f'{width}x{height}')
self.resizable(0, 0) # not allowed to change size

# icon
icon_path = os.path.join(os.path.dirname(__file__), 'icon.ico')
self.iconbitmap(icon_path)

# layout on the root window
self.__create_widgets()

Expand All @@ -38,4 +34,4 @@ def _on_closing(self):

if __name__ == "__main__":
app = App(title='PDF_2_Docx Converter', width=500, height=600)
app.mainloop()
app.mainloop()
Binary file removed pdf2docx/gui/icon.ico
Binary file not shown.
55 changes: 36 additions & 19 deletions pdf2docx/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

'''Entry for ``pdf2docx`` command line.'''
import logging
from .converter import Converter

Expand All @@ -9,9 +7,15 @@ class PDF2DOCX:
'''Command line interface for ``pdf2docx``.'''

@staticmethod
def convert(pdf_file:str, docx_file:str=None, password:str=None, start:int=0, end:int=None, pages:list=None, **kwargs):
def convert(pdf_file:str,
docx_file:str=None,
password:str=None,
start:int=0,
end:int=None,
pages:list=None,
**kwargs):
'''Convert pdf file to docx file.

Args:
pdf_file (str) : PDF filename to read from.
docx_file (str, optional): docx filename to write to. Defaults to None.
Expand All @@ -20,9 +24,10 @@ def convert(pdf_file:str, docx_file:str=None, password:str=None, start:int=0, en
end (int, optional): Last page to process. Defaults to None.
pages (list, optional): Range of pages, e.g. --pages=1,3,5. Defaults to None.
kwargs (dict) : Configuration parameters.

.. note::
Refer to :py:meth:`~pdf2docx.converter.Converter.convert` for detailed description on above arguments.
Refer to :py:meth:`~pdf2docx.converter.Converter.convert` for detailed description
on above arguments.
'''
# index starts from zero or one
if isinstance(pages, int): pages = [pages] # in case --pages=1
Expand All @@ -38,19 +43,27 @@ def convert(pdf_file:str, docx_file:str=None, password:str=None, start:int=0, en
logging.error(e)
finally:
cv.close()


@staticmethod
def debug(pdf_file:str, password:str=None, page:int=0, docx_file:str=None, debug_pdf:str=None, layout_file:str='layout.json', **kwargs):
def debug(pdf_file:str,
password:str=None,
page:int=0,
docx_file:str=None,
debug_pdf:str=None,
layout_file:str='layout.json',
**kwargs):
'''Convert one PDF page and plot layout information for debugging.

Args:
pdf_file (str) : PDF filename to read from.
password (str): Password for encrypted pdf. Default to None if not encrypted.
page (int, optional): Page index to convert.
docx_file (str, optional): docx filename to write to.
debug_pdf (str, optional): Filename for new pdf storing layout information. Defaults to same name with pdf file.
layout_file (str, optional): Filename for new json file storing parsed layout data. Defaults to ``layout.json``.
debug_pdf (str, optional): Filename for new pdf storing layout information.
Defaults to same name with pdf file.
layout_file (str, optional): Filename for new json file storing parsed layout data.
Defaults to ``layout.json``.
kwargs (dict) : Configuration parameters.
'''
# index starts from zero or one
Expand All @@ -61,13 +74,12 @@ def debug(pdf_file:str, password:str=None, page:int=0, docx_file:str=None, debug
cv = Converter(pdf_file, password)
cv.debug_page(page, docx_file, debug_pdf, layout_file, **kwargs)
cv.close()



@staticmethod
def table(pdf_file, password:str=None, start:int=0, end:int=None, pages:list=None, **kwargs):
'''Extract table content from pdf pages.

Args:
pdf_file (str) : PDF filename to read from.
password (str): Password for encrypted pdf. Default to None if not encrypted.
Expand All @@ -81,7 +93,7 @@ def table(pdf_file, password:str=None, start:int=0, end:int=None, pages:list=Non
start = max(start-1, 0)
if end: end -= 1
if pages: pages = [i-1 for i in pages]

cv = Converter(pdf_file, password)
try:
tables = cv.extract_tables(start, end, pages, **kwargs)
Expand All @@ -99,18 +111,23 @@ def gui():
'''Simple user interface.'''
# import App containing tkinter internally, in case GUI is not supported by some platforms,
# e.g. Amazon Linux 2
from .gui.App import App
app = App(title='PDF_2_Docx Converter', width=500, height=600)
app.mainloop()
try:
from .gui.App import App
except Exception:
logging.error('GUI is not supported in current platform.')
else:
app = App(title='PDF_2_Docx Converter', width=500, height=600)
app.mainloop()


parse = PDF2DOCX.convert


def main():
    '''Command line entry: hand the ``PDF2DOCX`` class over to ``fire``.'''
    # `fire` is imported locally, inside the entry function
    from fire import Fire
    Fire(PDF2DOCX)


if __name__ == '__main__':
main()
main()
5 changes: 2 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
PyMuPDF
PyMuPDF>=1.19.0
python-docx>=0.8.10
fonttools>=4.24.0
numpy>=1.17.2
opencv-python>=4.5
# opencv-python-headless>=4.5
opencv-python-headless>=4.5
fire>=0.3.0
17 changes: 9 additions & 8 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,18 @@
DESCRIPTION = 'Open source Python library converting pdf to docx.'
EXCLUDE_FROM_PACKAGES = ["build", "dist", "test"]

# read version number from version.txt, otherwise alpha version
# Github CI can create version.txt dynamically.

def get_version(fname):
    '''Read version number from a version file, e.g. ``version.txt`` created
    dynamically per Github Action.

    Args:
        fname (str): Path of the version file.

    Returns:
        str: First line of the file with surrounding whitespace stripped, or the
        default alpha version when the file is missing or its first line is blank.
    '''
    default_version = '0.5.6a1'
    if not os.path.exists(fname):
        return default_version

    with open(fname, "r", encoding="utf-8") as f:
        version = f.readline().strip()

    # an empty string is not a usable package version, so fall back to the default
    return version or default_version

# Load README.md for long description
def load_long_description(fname):
'''Load README.md for long description'''
if os.path.exists(fname):
with open(fname, "r", encoding="utf-8") as f:
long_description = f.read()
Expand All @@ -28,9 +27,10 @@ def load_long_description(fname):
return long_description

def load_requirements(fname):
'''Load requirements.'''
try:
# pip >= 10.0
from pip._internal.req import parse_requirements
from pip._internal.req import parse_requirements
except ImportError:
# pip < 10.0
from pip.req import parse_requirements
Expand All @@ -40,9 +40,9 @@ def load_requirements(fname):
requirements = [str(ir.requirement) for ir in reqs]
except AttributeError:
requirements = [str(ir.req) for ir in reqs]

return requirements


setup(
name="pdf2docx",
version=get_version("version.txt"),
Expand All @@ -51,8 +51,9 @@ def load_requirements(fname):
long_description=load_long_description("README.md"),
long_description_content_type="text/markdown",
license="GPL v3",
author = 'Artifex',
author_email = 'support@artifex.com',
author='Artifex',
author_email='support@artifex.com',
url='https://artifex.com/',
packages=find_packages(exclude=EXCLUDE_FROM_PACKAGES),
include_package_data=True,
zip_safe=False,
Expand Down