Merge pull request #1 from Hedgehogues/change-versions

Change versions
Hedgehogues · Oct 25, 2019 · 412e483 · 412e483
2 parents 32bcd7c + b2f5462
commit 412e483
Show file tree

Hide file tree

Showing 41 changed files with 1,494 additions and 858 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,6 @@ data/dev/fonts_dataset/
 */__pycache__/
 *.pyc
 *~*
+build/
+*.egg-info/
+dist/
diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
@@ -0,0 +1,11 @@
+FROM ubuntu:18.04 AS BUILD
+
+RUN apt-get -y update
+RUN apt-get install -y gcc python3 python3-pip python3-dev cmake make libsm6 libxext6 libxrender-dev
+RUN apt-get install -y git
+
+RUN git clone https://github.com/opencv/opencv.git
+WORKDIR /opencv/build
+RUN cmake -D CMAKE_BUILD_TYPE=Release -D CMAKE_INSTALL_PREFIX=/usr/local ..
+RUN make -j4
+RUN make install
diff --git a/README.md b/README.md
@@ -61,6 +61,13 @@ To extract the text, you can use the following code:
     apt-get install tesseract-ocr
     apt-get install tesseract-ocr-rus
     apt-get install poppler-utils
+
+# NOTICE
+
+If the build fails, the cause is most likely a dependency version mismatch, which you will need to resolve yourself.
+
+If you have a problem with OpenCV, see the Dockerfile template (`Dockerfile.tmpl`), which shows how to install
+the dependencies for OpenCV, as well as OpenCV itself.
 
 ## Other objects
 
@@ -231,13 +238,28 @@ tables were unified, then there is an erroneous recognition.
     for i in range(4):
         table.append(templator.next_points())
 
-# How to install
+# How to install without pip
 
+    apt-get install tesseract-ocr
+    apt-get install tesseract-ocr-rus
+    apt-get install poppler-utils
     pip install opencv-python==3.4.0.12 scikit-learn==0.19.1 numpy==1.14.0 scikit-image==0.13.1 pytesseract==0.2.0 scipy==1.0.0
     pip install pdf2image==0.1.8 pillow==5.0.0 xlrd==1.1.0
+
+# How to install WITH pip
+
     apt-get install tesseract-ocr
     apt-get install tesseract-ocr-rus
     apt-get install poppler-utils
+    pip install hochiminh
+
+## Замечание
+
+Если сборка завершается с ошибкой, то, вероятно, проблема с версиями зависимостей и Вам стоит их самостоятельно 
+поправить.
+
+Если у Вас возникает проблема с openCV, Вы можете изучить заготовку для докерфайла, где описан способ установки 
+зависимостей для openCV, а также самого openCV.
 
 ## Сопутствующие утилиты
 

diff --git a/data/test/ho_chi_minh/pdf/images/5.pdf/-1.ppm b/data/test/ho_chi_minh/pdf/images/5.pdf/-1.ppm
diff --git a/data/test/ho_chi_minh/pdf/images/5.pdf/-2.ppm b/data/test/ho_chi_minh/pdf/images/5.pdf/-2.ppm
diff --git a/data/test/ho_chi_minh/pdf/images/5.pdf/-3.ppm b/data/test/ho_chi_minh/pdf/images/5.pdf/-3.ppm
diff --git a/data/test/ho_chi_minh/pdf/images/5.pdf/-4.ppm b/data/test/ho_chi_minh/pdf/images/5.pdf/-4.ppm
diff --git a/example/extract.py b/example/extract.py
@@ -0,0 +1,27 @@
+from hochiminh import pdf_parser
+from hochiminh.image_processing import hochiminh
+from hochiminh.image_processing.connected_components import ConnectedComponents
+from hochiminh.image_processing.cross_detector import CrossDetector
+from hochiminh.image_processing.lines_detector import SobelDirector
+from hochiminh.image_processing.ocr import TesseractWrapper
+from hochiminh.io import pdfconverter, reader
+
+path = "../data/test/ho_chi_minh/"
+parser = pdf_parser.PDFParser(
+    table_extractor=hochiminh.HoChiMinh(
+        reader=reader.ImagePDFReader(
+            pdfconverter.PDFConverter(in_path=path + 'pdf/', out_path=path + 'pdf/images/', resolution=130)
+        ),
+        lines_detector=SobelDirector(),
+        connected_components=ConnectedComponents(),
+        cross_detector=CrossDetector(max_steps=20, detected_steps=18, line_width=8),
+        ocr=TesseractWrapper(),
+        binarization=210
+    )
+)
+
+tabels = parser.extract_table()
+for tabel in tabels:
+    print('--------------- Table ---------------')
+    for cell in tabel:
+        print(cell.__dict__)
diff --git a/internal/.gitkeep → hochiminh/.gitkeep b/internal/.gitkeep → hochiminh/.gitkeep
diff --git a/internal/__init__.py → hochiminh/__init__.py b/internal/__init__.py → hochiminh/__init__.py
diff --git a/internal/dev/other_solutions/.gitkeep → hochiminh/dev/__init__.py b/internal/dev/other_solutions/.gitkeep → hochiminh/dev/__init__.py
diff --git a/internal/dev/cell_detector_simple.py → hochiminh/dev/cell_detector_simple.py b/internal/dev/cell_detector_simple.py → hochiminh/dev/cell_detector_simple.py
@@ -2,9 +2,9 @@
 import numpy as np
 import matplotlib.pyplot as plt
 
-from internal.extractor.image_processing.connected_components import ConnectedComponents
-from internal.extractor.image_processing.geometry import Image
-from internal.io.reader import ImageReader
+from hochiminh.image_processing.connected_components import ConnectedComponents
+from hochiminh.image_processing.geometry import Image
+from hochiminh.io.reader import ImageReader
 
 path = 'data/test/ho_chi_minh/pdf/images/6.pdf/page-vertical.png'
 image = Image(image_reader=ImageReader(path), image_writer=None, binarization=210)

diff --git a/internal/dev/compile.sh → hochiminh/dev/compile.sh b/internal/dev/compile.sh → hochiminh/dev/compile.sh
diff --git a/internal/dev/font_to_image.py → hochiminh/dev/font_to_image.py b/internal/dev/font_to_image.py → hochiminh/dev/font_to_image.py
diff --git a/internal/dev/keras_filtering.py → hochiminh/dev/keras_filtering.py b/internal/dev/keras_filtering.py → hochiminh/dev/keras_filtering.py
diff --git a/internal/dev/keras_test.py → hochiminh/dev/keras_test.py b/internal/dev/keras_test.py → hochiminh/dev/keras_test.py
diff --git a/internal/dev/main.c → hochiminh/dev/main.c b/internal/dev/main.c → hochiminh/dev/main.c
diff --git a/internal/dev/main.o → hochiminh/dev/main.o b/internal/dev/main.o → hochiminh/dev/main.o
diff --git a/internal/dev/main.so → hochiminh/dev/main.so b/internal/dev/main.so → hochiminh/dev/main.so
diff --git a/internal/dev/main.so.py → hochiminh/dev/main.so.py b/internal/dev/main.so.py → hochiminh/dev/main.so.py
diff --git a/hochiminh/dev/other_solutions/.gitkeep b/hochiminh/dev/other_solutions/.gitkeep
diff --git a/internal/dev/to_csv.py → hochiminh/dev/to_csv.py b/internal/dev/to_csv.py → hochiminh/dev/to_csv.py
diff --git a/hochiminh/image_processing/__init__.py b/hochiminh/image_processing/__init__.py
diff --git a/.../image_processing/connected_components.py → .../image_processing/connected_components.py b/.../image_processing/connected_components.py → .../image_processing/connected_components.py
@@ -1,7 +1,7 @@
 import cv2
 import numpy as np
 
-from internal.image_processing.geometry import Rectangle, Point
+from hochiminh.image_processing.geometry import Rectangle, Point
 
 
 class ConnectedComponents:

diff --git a/internal/image_processing/cross_detector.py → hochiminh/image_processing/cross_detector.py b/internal/image_processing/cross_detector.py → hochiminh/image_processing/cross_detector.py
@@ -1,6 +1,7 @@
-from internal.image_processing.geometry import Point
 import numpy as np
 
+from hochiminh.image_processing.geometry import Point
+
 
 class CrossDetector:
     def __init__(self, max_steps=12, line_width=2, detected_steps=11):

diff --git a/internal/image_processing/geometry.py → hochiminh/image_processing/geometry.py b/internal/image_processing/geometry.py → hochiminh/image_processing/geometry.py
diff --git a/internal/image_processing/hochiminh.py → hochiminh/image_processing/hochiminh.py b/internal/image_processing/hochiminh.py → hochiminh/image_processing/hochiminh.py
@@ -3,7 +3,7 @@
 
 from sklearn.neighbors import KDTree
 
-from internal.image_processing.geometry import Image, Point, Cell
+from hochiminh.image_processing.geometry import Image, Point, Cell
 
 
 class HoChiMinh:

diff --git a/internal/image_processing/lines_detector.py → hochiminh/image_processing/lines_detector.py b/internal/image_processing/lines_detector.py → hochiminh/image_processing/lines_detector.py
@@ -4,7 +4,7 @@
 import numpy as np
 import scipy.stats as st
 
-from internal.image_processing.geometry import Point
+from hochiminh.image_processing.geometry import Point
 
 
 class HoughTransformerCanny:

diff --git a/internal/image_processing/ocr.py → hochiminh/image_processing/ocr.py b/internal/image_processing/ocr.py → hochiminh/image_processing/ocr.py
diff --git a/hochiminh/io/__init__.py b/hochiminh/io/__init__.py
diff --git a/internal/io/pdfconverter.py → hochiminh/io/pdfconverter.py b/internal/io/pdfconverter.py → hochiminh/io/pdfconverter.py
diff --git a/internal/io/reader.py → hochiminh/io/reader.py b/internal/io/reader.py → hochiminh/io/reader.py
diff --git a/internal/pdf_parser.py → hochiminh/pdf_parser.py b/internal/pdf_parser.py → hochiminh/pdf_parser.py
@@ -1,10 +1,10 @@
-from internal.image_processing.connected_components import ConnectedComponents
-from internal.image_processing.cross_detector import CrossDetector
-from internal.image_processing.hochiminh import HoChiMinh
-from internal.image_processing.lines_detector import SobelDirector
-from internal.image_processing.ocr import TesseractWrapper
-from internal.io.pdfconverter import PDFConverter
-from internal.io.reader import ImagePDFReader
+from hochiminh.image_processing.connected_components import ConnectedComponents
+from hochiminh.image_processing.cross_detector import CrossDetector
+from hochiminh.image_processing.hochiminh import HoChiMinh
+from hochiminh.image_processing.lines_detector import SobelDirector
+from hochiminh.image_processing.ocr import TesseractWrapper
+from hochiminh.io.pdfconverter import PDFConverter
+from hochiminh.io.reader import ImagePDFReader
 
 
 class PDFParserAPI:

diff --git a/internal/dev/other_solutions/OTR b/internal/dev/other_solutions/OTR
diff --git a/internal/dev/other_solutions/ocropy b/internal/dev/other_solutions/ocropy
diff --git a/internal/dev/other_solutions/unet b/internal/dev/other_solutions/unet
diff --git a/requirements b/requirements
@@ -0,0 +1,8 @@
+opencv-python==3.4.2.17
+scikit-learn==0.21.0
+scikit-image==0.16.2
+pytesseract==0.2.0
+scipy==1.3.1
+pdf2image==0.1.9
+xlrd==1.1.0
+pillow==5.2.0
diff --git a/requirements.txt b/requirements.txt
diff --git a/setup.py b/setup.py
@@ -1,21 +1,42 @@
-from setuptools import setup
-
-PACKAGE = "HoChiMinh"
-NAME = "HoChiMinh"
-DESCRIPTION = "Ho Chi Minh is designed to extract textual information from tables presented in PDF, pictures or " \
-              "other format. Хошимин предназначен для извлечения текстовой информации из таблиц, представленных в " \
-              "PDF, картинках или ином формате."
-AUTHOR = "Egor Urvanov"
-AUTHOR_EMAIL = "hedgehogues@bk.ru"
-URL = "https://github.com/Hedgehogues/HoChiMinh"
-req = open('requirements.txt').readlines()
-
-setup(
-    name=NAME,
-    description=DESCRIPTION,
-    author=AUTHOR,
-    author_email=AUTHOR_EMAIL,
-    license="MIT",
-    url=URL,
-    install_requiers=req,
-)
+import setuptools
+try:
+    from pip._internal import main as pipmain
+except:
+    print('your version of pip is deprecated')
+    from pip import main as pipmain
+
+
+class InternalRequirements:
+    pass
+
+
+def parse_requirements(filename):
+    """ load requirements from a pip requirements file """
+    lineiter = (line.strip() for line in open(filename))
+    return [line for line in lineiter if line and not line.startswith("#")]
+
+
+with open("README.md", "r") as fh:
+    long_description = fh.read()
+
+install_reqs = parse_requirements('./requirements')
+print(f'public requirements: {install_reqs}')
+
+kwargs = {
+    'name': "HoChiMinh",
+    'version': "1.0.0",
+    'author': 'Egor Urvanov',
+    'author_email': 'hedgehogues@bk.ru',
+    'description': 'Ho Chi Minh is designed to extract textual information from tables presented in PDF, pictures or other format. Хошимин предназначен для извлечения текстовой информации из таблиц, представленных в PDF, картинках или ином формате.',
+    'long_description': long_description,
+    'long_description_content_type': 'text/markdown',
+    'url': 'https://github.com/Hedgehogues/HoChiMinh',
+    'packages': setuptools.find_packages(),
+    'classifiers': [
+        "Programming Language :: Python :: 3.7",
+        "Operating System :: OS Independent",
+    ],
+    'install_requires': install_reqs,
+}
+
+setuptools.setup(**kwargs)
diff --git a/test/test_extract_tables.py b/test/test_extract_tables.py
@@ -1,13 +1,13 @@
 import unittest
 
-from internal.image_processing.connected_components import ConnectedComponents
-from internal.image_processing.cross_detector import CrossDetector
-from internal.image_processing.hochiminh import HoChiMinh
-from internal.image_processing.lines_detector import SobelDirector
-from internal.image_processing.ocr import TesseractWrapper
-from internal.io.pdfconverter import PDFConverter
-from internal.io.reader import ImagePDFReader
-from internal.pdf_parser import PDFParser
+from hochiminh.image_processing.connected_components import ConnectedComponents
+from hochiminh.image_processing.cross_detector import CrossDetector
+from hochiminh.image_processing.hochiminh import HoChiMinh
+from hochiminh.image_processing.lines_detector import SobelDirector
+from hochiminh.image_processing.ocr import TesseractWrapper
+from hochiminh.io.pdfconverter import PDFConverter
+from hochiminh.io.reader import ImagePDFReader
+from hochiminh.pdf_parser import PDFParser
 
 
 class TestPDFConverter(unittest.TestCase):

diff --git a/test/test_pdfconverter.py b/test/test_pdfconverter.py
@@ -3,7 +3,7 @@
 
 from shutil import rmtree
 
-from internal.io.pdfconverter import PDFConverter
+from hochiminh.io.pdfconverter import PDFConverter
 
 
 class TestPDFConverter(unittest.TestCase):