ArtifexSoftware · dothinking · Jan 22, 2024 · Jan 20, 2024 · Jan 20, 2024 · Jan 20, 2024
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -21,11 +21,36 @@ on:
 # However, keep step 1 only, considering the difficulty to get a specific runner with MS Word installed.
 # -----------------------------------------------------------------------------------------------------
 jobs:
+
+  pdf2docx-docker:
+
+    runs-on: ubuntu-latest
+
+    container:
+      image: python:3.8
+
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v2
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install pytest
+          python setup.py develop
 
-  pdf2docx:
+      - name: Run unit test
+        run: |
+          pytest -v ./test/test.py::TestConversion
+
+
+  pdf2docx-ubuntu:
 
     runs-on: ubuntu-latest
 
+    needs: pdf2docx-docker
+
     strategy:
       matrix:
         python-version: ["3.8", "3.9", "3.10"]
@@ -43,93 +68,21 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install -r requirements.txt
-          pip install pytest pytest-cov codecov
+          pip install pytest pytest-cov
           python setup.py develop
 
       - name: Run unit test
         run: |
           pytest -v ./test/test.py::TestConversion --cov=./pdf2docx --cov-report=xml
 
       - name: Upload coverage reports to Codecov
-        run: |
-          codecov
-        env: # Or as an environment variable
-          super_secret: ${{ secrets.CODECOV_TOKEN }}
+        uses: codecov/codecov-action@v3
+        with:  # upload token is passed to the action as an input
+          token: ${{ secrets.CODECOV_TOKEN }}
 
       # upload docx for further job
       - name: Archive package
         uses: actions/upload-artifact@v2
         with:
             name: outputs
-            path: ./test/outputs
-
-
-  # docx2pdf:
-  #   # a specific runner with MS Word installed
-  #   runs-on: self-hosted
-
-  #   needs: pdf2docx
-
-  #   steps:
-  #     - name: Checkout code
-  #       uses: actions/checkout@v2
-
-  #     # download artifacts from depending job
-  #     - name: Download artifacts
-  #       uses: actions/download-artifact@v2
-  #       with:
-  #         name: outputs
-  #         path: test\outputs
-
-  #     # convert docx to pdf with OfficeToPDF
-  #     - name: Convert to PDF
-  #       run: |
-  #         cd test\outputs
-  #         $files = Get-ChildItem "."
-  #         for ($i=0; $i -lt $files.Count; $i++) { 
-  #           $name = $files[$i].name;
-  #           echo "Converting $name to pdf...";
-  #           OfficeToPDF $files[$i] 
-  #         }
-  #         del *.docx
-
-  #     # upload pdf for further job
-  #     - name: Archive package
-  #       uses: actions/upload-artifact@v2
-  #       with:
-  #           name: outputs
-  #           path: test\outputs
-
-
-  # check_quality:
-
-  #   runs-on: ubuntu-latest
-
-  #   needs: docx2pdf
-
-  #   steps:
-  #     - name: Check out code
-  #       uses: actions/checkout@v2
-
-  #     # download artifacts from depending job
-  #     - name: Download artifacts
-  #       uses: actions/download-artifact@v2
-  #       with:
-  #         name: outputs
-  #         path: ./test/outputs
-
-  #     - name: Set up Python 3.x
-  #       uses: actions/setup-python@v1
-  #       with:
-  #         python-version: '3.x'
-
-  #     - name: Install dependencies
-  #       run: |
-  #         python -m pip install --upgrade pip
-  #         pip install -r requirements.txt
-  #         pip install pytest
-  #         python setup.py develop
-
-  #     - name: Check converting quality
-  #       run: |
-  #         pytest -sv ./test/test.py::TestQuality
+            path: ./test/outputs
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,7 +1,6 @@
 include *.md
 include LICENSE*
 include requirements.txt
-include pdf2docx/gui/*.ico
 prune test
 include test/*.py
 include test/samples/*.pdf
diff --git a/pdf2docx/gui/App.py b/pdf2docx/gui/App.py
@@ -1,23 +1,19 @@
 '''Main window for ``pdf2docx`` graphic user interface.'''
 
-import os
 import sys
 from tkinter import Tk, messagebox
 from .MainFrame import MainFrame
 
 
 class App(Tk):
+    '''Simple graphic user interface.'''
     def __init__(self, title:str='App', width:int=300, height:int=200):
         '''Top app window.'''
         super().__init__()
         self.title(title)
         self.geometry(f'{width}x{height}')
         self.resizable(0, 0) # not allowed to change size
 
-        # icon
-        icon_path = os.path.join(os.path.dirname(__file__), 'icon.ico') 
-        self.iconbitmap(icon_path)
-
         # layout on the root window
         self.__create_widgets()
 
@@ -38,4 +34,4 @@ def _on_closing(self):
 
 if __name__ == "__main__":
     app = App(title='PDF_2_Docx Converter', width=500, height=600)
-    app.mainloop()
+    app.mainloop()
diff --git a/pdf2docx/gui/icon.ico b/pdf2docx/gui/icon.ico
diff --git a/pdf2docx/main.py b/pdf2docx/main.py
@@ -1,6 +1,4 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-
+'''Entry for ``pdf2docx`` command line.'''
 import logging
 from .converter import Converter
 
@@ -9,9 +7,15 @@ class PDF2DOCX:
     '''Command line interface for ``pdf2docx``.'''
 
     @staticmethod
-    def convert(pdf_file:str, docx_file:str=None, password:str=None, start:int=0, end:int=None, pages:list=None, **kwargs):
+    def convert(pdf_file:str,
+                docx_file:str=None,
+                password:str=None,
+                start:int=0,
+                end:int=None,
+                pages:list=None,
+                **kwargs):
         '''Convert pdf file to docx file.
-        
+
         Args:
             pdf_file (str) : PDF filename to read from.
             docx_file (str, optional): docx filename to write to. Defaults to None.
@@ -20,9 +24,10 @@ def convert(pdf_file:str, docx_file:str=None, password:str=None, start:int=0, en
             end (int, optional): Last page to process. Defaults to None.
             pages (list, optional): Range of pages, e.g. --pages=1,3,5. Defaults to None.
             kwargs (dict) : Configuration parameters.
-        
+
         .. note::
-            Refer to :py:meth:`~pdf2docx.converter.Converter.convert` for detailed description on above arguments.
+            Refer to :py:meth:`~pdf2docx.converter.Converter.convert` for detailed description
+            on above arguments.
         '''
         # index starts from zero or one
         if isinstance(pages, int): pages = [pages] # in case --pages=1
@@ -38,19 +43,27 @@ def convert(pdf_file:str, docx_file:str=None, password:str=None, start:int=0, en
             logging.error(e)
         finally:
             cv.close()
-    
+
 
     @staticmethod
-    def debug(pdf_file:str, password:str=None, page:int=0, docx_file:str=None, debug_pdf:str=None, layout_file:str='layout.json', **kwargs):
+    def debug(pdf_file:str,
+              password:str=None,
+              page:int=0,
+              docx_file:str=None,
+              debug_pdf:str=None,
+              layout_file:str='layout.json',
+              **kwargs):
         '''Convert one PDF page and plot layout information for debugging.
-        
+
         Args:
             pdf_file (str) : PDF filename to read from.
             password (str): Password for encrypted pdf. Default to None if not encrypted.
             page (int, optional): Page index to convert.
             docx_file (str, optional): docx filename to write to.
-            debug_pdf (str, optional): Filename for new pdf storing layout information. Defaults to same name with pdf file.
-            layout_file (str, optional): Filename for new json file storing parsed layout data. Defaults to ``layout.json``.
+            debug_pdf (str, optional): Filename for new pdf storing layout information.
+                Defaults to the same name as the pdf file.
+            layout_file (str, optional): Filename for new json file storing parsed layout data.
+                Defaults to ``layout.json``.
             kwargs (dict)  : Configuration parameters.
         '''
         # index starts from zero or one
@@ -61,13 +74,12 @@ def debug(pdf_file:str, password:str=None, page:int=0, docx_file:str=None, debug
         cv = Converter(pdf_file, password)
         cv.debug_page(page, docx_file, debug_pdf, layout_file, **kwargs)
         cv.close()
-
 
 
     @staticmethod
     def table(pdf_file, password:str=None, start:int=0, end:int=None, pages:list=None, **kwargs):
         '''Extract table content from pdf pages.
-        
+
         Args:
             pdf_file (str) : PDF filename to read from.
             password (str): Password for encrypted pdf. Default to None if not encrypted.
@@ -81,7 +93,7 @@ def table(pdf_file, password:str=None, start:int=0, end:int=None, pages:list=Non
             start = max(start-1, 0)
             if end: end -= 1
             if pages: pages = [i-1 for i in pages]
-        
+
         cv = Converter(pdf_file, password)
         try:
             tables = cv.extract_tables(start, end, pages, **kwargs)
@@ -99,18 +111,23 @@ def gui():
         '''Simple user interface.'''
         # import App containing tkinter internally, in case GUI is not supported by some platforms,
         # e.g. Amazon Linux 2
-        from .gui.App import App
-        app = App(title='PDF_2_Docx Converter', width=500, height=600)
-        app.mainloop()
+        try:
+            from .gui.App import App
+        except Exception:
+            logging.error('GUI is not supported in current platform.')
+        else:
+            app = App(title='PDF_2_Docx Converter', width=500, height=600)
+            app.mainloop()
 
 
 parse = PDF2DOCX.convert
 
 
 def main():
+    '''Command line entry.'''
     import fire
     fire.Fire(PDF2DOCX)
 
 
 if __name__ == '__main__':
-    main()
+    main()
diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,6 @@
-PyMuPDF
+PyMuPDF>=1.19.0
 python-docx>=0.8.10
 fonttools>=4.24.0
 numpy>=1.17.2
-opencv-python>=4.5
-# opencv-python-headless>=4.5
+opencv-python-headless>=4.5
 fire>=0.3.0
diff --git a/setup.py b/setup.py
@@ -6,19 +6,18 @@
 DESCRIPTION = 'Open source Python library converting pdf to docx.'
 EXCLUDE_FROM_PACKAGES = ["build", "dist", "test"]
 
-# read version number from version.txt, otherwise alpha version
-# Github CI can create version.txt dynamically.
+
 def get_version(fname):
+    '''Read version from version.txt (created by GitHub Actions); fall back to the alpha version.'''
     if os.path.exists(fname):
         with open(fname, "r", encoding="utf-8") as f:
             version = f.readline().strip()
     else:
         version = '0.5.6a1'
-
     return version
 
-# Load README.md for long description
 def load_long_description(fname):
+    '''Load README.md for long description.'''
     if os.path.exists(fname):
         with open(fname, "r", encoding="utf-8") as f:
             long_description = f.read()
@@ -28,9 +27,10 @@ def load_long_description(fname):
     return long_description
 
 def load_requirements(fname):
+    '''Load requirements.'''
     try:
         # pip >= 10.0
-        from pip._internal.req import parse_requirements        
+        from pip._internal.req import parse_requirements
     except ImportError:
         # pip < 10.0
         from pip.req import parse_requirements
@@ -40,9 +40,9 @@ def load_requirements(fname):
         requirements = [str(ir.requirement) for ir in reqs]
     except AttributeError:
         requirements = [str(ir.req) for ir in reqs]
-
     return requirements
 
+
 setup(
     name="pdf2docx",
     version=get_version("version.txt"),
@@ -51,8 +51,9 @@ def load_requirements(fname):
     long_description=load_long_description("README.md"),
     long_description_content_type="text/markdown",
     license="GPL v3",
-    author = 'Artifex',
-    author_email = 'support@artifex.com',
+    author='Artifex',
+    author_email='support@artifex.com',
+    url='https://artifex.com/',
     packages=find_packages(exclude=EXCLUDE_FROM_PACKAGES),
     include_package_data=True,
     zip_safe=False,