Skip to content

Commit

Permalink
Merge pull request #1 from Hedgehogues/change-versions
Browse files Browse the repository at this point in the history
Change versions
  • Loading branch information
Hedgehogues committed Oct 25, 2019
2 parents 32bcd7c + b2f5462 commit 412e483
Show file tree
Hide file tree
Showing 41 changed files with 1,494 additions and 858 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Expand Up @@ -6,3 +6,6 @@ data/dev/fonts_dataset/
*/__pycache__/
*.pyc
*~*
build/
*.egg-info/
dist/
11 changes: 11 additions & 0 deletions Dockerfile.tmpl
@@ -0,0 +1,11 @@
# Template image that builds OpenCV from source on Ubuntu 18.04.
FROM ubuntu:18.04 AS BUILD

# Refresh the package index and install in the SAME layer: with a separate
# `RUN apt-get update`, Docker can reuse a stale cached index for a later,
# modified install line and fail to find packages.
RUN apt-get -y update && apt-get install -y \
    gcc python3 python3-pip python3-dev cmake make \
    libsm6 libxext6 libxrender-dev \
    git

# Only the current tree is needed for the build, so skip the full history.
# NOTE(review): this builds the default branch of OpenCV, while the project's
# requirements pin opencv-python==3.4.2.17 - confirm whether a tag should be pinned.
RUN git clone --depth 1 https://github.com/opencv/opencv.git
WORKDIR /opencv/build
RUN cmake -D CMAKE_BUILD_TYPE=Release -D CMAKE_INSTALL_PREFIX=/usr/local ..
RUN make -j4
RUN make install
24 changes: 23 additions & 1 deletion README.md
Expand Up @@ -61,6 +61,13 @@ To extract the text, you can use the following code:
apt-get install tesseract-ocr
apt-get install tesseract-ocr-rus
apt-get install poppler-utils

# NOTICE

If the build fails, the most likely cause is a conflict between dependency versions; adjust the pinned versions to match your environment.

If you have a problem with OpenCV, see the Dockerfile template (`Dockerfile.tmpl`), which shows how to install
the dependencies for OpenCV and how to build OpenCV itself.

## Other objects

Expand Down Expand Up @@ -231,13 +238,28 @@ tables were unified, then there is an erroneous recognition.
for i in range(4):
table.append(templator.next_points())

# How to install
# How to install without pip

apt-get install tesseract-ocr
apt-get install tesseract-ocr-rus
apt-get install poppler-utils
pip install opencv-python==3.4.0.12 scikit-learn==0.19.1 numpy==1.14.0 scikit-image==0.13.1 pytesseract==0.2.0 scipy==1.0.0
pip install pdf2image==0.1.8 pillow==5.0.0 xlrd==1.1.0

# How to install WITH pip

apt-get install tesseract-ocr
apt-get install tesseract-ocr-rus
apt-get install poppler-utils
pip install hochiminh

## Замечание

Если сборка завершается с ошибкой, то, вероятно, проблема с версиями зависимостей и Вам стоит их самостоятельно
поправить.

Если у Вас возникает проблема с openCV, Вы можете изучить заготовку для докерфайла, где описан способ установки
зависимостей для openCV, а также самого openCV.

## Сопутствующие утилиты

Expand Down
291 changes: 153 additions & 138 deletions data/test/ho_chi_minh/pdf/images/5.pdf/-1.ppm

Large diffs are not rendered by default.

676 changes: 425 additions & 251 deletions data/test/ho_chi_minh/pdf/images/5.pdf/-2.ppm

Large diffs are not rendered by default.

888 changes: 585 additions & 303 deletions data/test/ho_chi_minh/pdf/images/5.pdf/-3.ppm

Large diffs are not rendered by default.

302 changes: 193 additions & 109 deletions data/test/ho_chi_minh/pdf/images/5.pdf/-4.ppm

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions example/extract.py
@@ -0,0 +1,27 @@
from hochiminh import pdf_parser
from hochiminh.image_processing import hochiminh
from hochiminh.image_processing.connected_components import ConnectedComponents
from hochiminh.image_processing.cross_detector import CrossDetector
from hochiminh.image_processing.lines_detector import SobelDirector
from hochiminh.image_processing.ocr import TesseractWrapper
from hochiminh.io import pdfconverter, reader

# Root of the sample data set. The path is relative, so it resolves against
# the current working directory.
path = "../data/test/ho_chi_minh/"

# Build the pipeline step by step (same construction order as a single
# nested expression): converter -> reader -> table extractor -> parser.
converter = pdfconverter.PDFConverter(
    in_path=path + 'pdf/',
    out_path=path + 'pdf/images/',
    resolution=130,
)
pdf_reader = reader.ImagePDFReader(converter)
table_extractor = hochiminh.HoChiMinh(
    reader=pdf_reader,
    lines_detector=SobelDirector(),
    connected_components=ConnectedComponents(),
    cross_detector=CrossDetector(max_steps=20, detected_steps=18, line_width=8),
    ocr=TesseractWrapper(),
    binarization=210,
)
parser = pdf_parser.PDFParser(table_extractor=table_extractor)

# Print the attribute dictionary of every cell, one table at a time.
tables = parser.extract_table()
for table in tables:
    print('--------------- Table ---------------')
    for cell in table:
        print(cell.__dict__)
File renamed without changes.
File renamed without changes.
File renamed without changes.
Expand Up @@ -2,9 +2,9 @@
import numpy as np
import matplotlib.pyplot as plt

from internal.extractor.image_processing.connected_components import ConnectedComponents
from internal.extractor.image_processing.geometry import Image
from internal.io.reader import ImageReader
from hochiminh.image_processing.connected_components import ConnectedComponents
from hochiminh.image_processing.geometry import Image
from hochiminh.io.reader import ImageReader

path = 'data/test/ho_chi_minh/pdf/images/6.pdf/page-vertical.png'
image = Image(image_reader=ImageReader(path), image_writer=None, binarization=210)
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file.
File renamed without changes.
Empty file.
@@ -1,7 +1,7 @@
import cv2
import numpy as np

from internal.image_processing.geometry import Rectangle, Point
from hochiminh.image_processing.geometry import Rectangle, Point


class ConnectedComponents:
Expand Down
@@ -1,6 +1,7 @@
from internal.image_processing.geometry import Point
import numpy as np

from hochiminh.image_processing.geometry import Point


class CrossDetector:
def __init__(self, max_steps=12, line_width=2, detected_steps=11):
Expand Down
File renamed without changes.
Expand Up @@ -3,7 +3,7 @@

from sklearn.neighbors import KDTree

from internal.image_processing.geometry import Image, Point, Cell
from hochiminh.image_processing.geometry import Image, Point, Cell


class HoChiMinh:
Expand Down
Expand Up @@ -4,7 +4,7 @@
import numpy as np
import scipy.stats as st

from internal.image_processing.geometry import Point
from hochiminh.image_processing.geometry import Point


class HoughTransformerCanny:
Expand Down
File renamed without changes.
Empty file added hochiminh/io/__init__.py
Empty file.
File renamed without changes.
File renamed without changes.
14 changes: 7 additions & 7 deletions internal/pdf_parser.py → hochiminh/pdf_parser.py
@@ -1,10 +1,10 @@
from internal.image_processing.connected_components import ConnectedComponents
from internal.image_processing.cross_detector import CrossDetector
from internal.image_processing.hochiminh import HoChiMinh
from internal.image_processing.lines_detector import SobelDirector
from internal.image_processing.ocr import TesseractWrapper
from internal.io.pdfconverter import PDFConverter
from internal.io.reader import ImagePDFReader
from hochiminh.image_processing.connected_components import ConnectedComponents
from hochiminh.image_processing.cross_detector import CrossDetector
from hochiminh.image_processing.hochiminh import HoChiMinh
from hochiminh.image_processing.lines_detector import SobelDirector
from hochiminh.image_processing.ocr import TesseractWrapper
from hochiminh.io.pdfconverter import PDFConverter
from hochiminh.io.reader import ImagePDFReader


class PDFParserAPI:
Expand Down
1 change: 0 additions & 1 deletion internal/dev/other_solutions/OTR
Submodule OTR deleted from 8f2654
1 change: 0 additions & 1 deletion internal/dev/other_solutions/ocropy
Submodule ocropy deleted from d3e5cc
1 change: 0 additions & 1 deletion internal/dev/other_solutions/unet
Submodule unet deleted from 4b939a
8 changes: 8 additions & 0 deletions requirements
@@ -0,0 +1,8 @@
opencv-python==3.4.2.17
scikit-learn==0.21.0
scikit-image==0.16.2
pytesseract==0.2.0
scipy==1.3.1
pdf2image==0.1.9
xlrd==1.1.0
pillow==5.2.0
9 changes: 0 additions & 9 deletions requirements.txt

This file was deleted.

63 changes: 42 additions & 21 deletions setup.py
@@ -1,21 +1,42 @@
from setuptools import setup

# Package metadata passed to setuptools.setup() below.
PACKAGE = "HoChiMinh"
NAME = "HoChiMinh"
DESCRIPTION = "Ho Chi Minh is designed to extract textual information from tables presented in PDF, pictures or " \
              "other format. Хошимин предназначен для извлечения текстовой информации из таблиц, представленных в " \
              "PDF, картинках или ином формате."
AUTHOR = "Egor Urvanov"
AUTHOR_EMAIL = "hedgehogues@bk.ru"
URL = "https://github.com/Hedgehogues/HoChiMinh"

# Read the requirement specifiers with a context manager so the file is
# closed, dropping trailing newlines and blank lines.
with open('requirements.txt') as requirements_file:
    req = [line.strip() for line in requirements_file if line.strip()]

setup(
    name=NAME,
    description=DESCRIPTION,
    author=AUTHOR,
    author_email=AUTHOR_EMAIL,
    license="MIT",
    url=URL,
    # The keyword must be spelled `install_requires`; the previous
    # `install_requiers` is not a setup() option, so the dependencies
    # were never declared.
    install_requires=req,
)
import setuptools
# Import pip's entry point from the internal package, falling back to the
# top-level location when that import is unavailable.
try:
    from pip._internal import main as pipmain
except ImportError:
    # Catch only a failed import: a bare `except:` would also swallow
    # KeyboardInterrupt/SystemExit and hide unrelated errors.
    print('your version of pip is deprecated')
    from pip import main as pipmain


class InternalRequirements:
    """Empty placeholder class; it defines no attributes or behavior.

    NOTE(review): nothing in the visible setup script references this
    class - confirm whether it is still needed.
    """


def parse_requirements(filename):
    """Load requirements from a pip requirements file.

    Args:
        filename: Path to a text file holding one requirement per line.

    Returns:
        A list of the whitespace-stripped lines, in file order, excluding
        empty lines and lines that start with "#".
    """
    # The context manager closes the file as soon as it has been read,
    # instead of leaving the handle open until garbage collection.
    with open(filename) as requirements_file:
        stripped_lines = [line.strip() for line in requirements_file]
    return [line for line in stripped_lines if line and not line.startswith("#")]


# README.md contains Cyrillic text, so decode it explicitly as UTF-8 rather
# than with the platform's default encoding, which may not be UTF-8.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# Runtime dependencies come from the `requirements` file; the path is
# relative to the current working directory.
install_reqs = parse_requirements('./requirements')
print(f'public requirements: {install_reqs}')

# Keyword arguments forwarded to setuptools.setup().
kwargs = {
    'name': "HoChiMinh",
    'version': "1.0.0",
    'author': 'Egor Urvanov',
    'author_email': 'hedgehogues@bk.ru',
    'description': 'Ho Chi Minh is designed to extract textual information from tables presented in PDF, pictures or other format. Хошимин предназначен для извлечения текстовой информации из таблиц, представленных в PDF, картинках или ином формате.',
    'long_description': long_description,
    'long_description_content_type': 'text/markdown',
    'url': 'https://github.com/Hedgehogues/HoChiMinh',
    'packages': setuptools.find_packages(),
    'classifiers': [
        "Programming Language :: Python :: 3.7",
        "Operating System :: OS Independent",
    ],
    'install_requires': install_reqs,
}

setuptools.setup(**kwargs)
16 changes: 8 additions & 8 deletions test/test_extract_tables.py
@@ -1,13 +1,13 @@
import unittest

from internal.image_processing.connected_components import ConnectedComponents
from internal.image_processing.cross_detector import CrossDetector
from internal.image_processing.hochiminh import HoChiMinh
from internal.image_processing.lines_detector import SobelDirector
from internal.image_processing.ocr import TesseractWrapper
from internal.io.pdfconverter import PDFConverter
from internal.io.reader import ImagePDFReader
from internal.pdf_parser import PDFParser
from hochiminh.image_processing.connected_components import ConnectedComponents
from hochiminh.image_processing.cross_detector import CrossDetector
from hochiminh.image_processing.hochiminh import HoChiMinh
from hochiminh.image_processing.lines_detector import SobelDirector
from hochiminh.image_processing.ocr import TesseractWrapper
from hochiminh.io.pdfconverter import PDFConverter
from hochiminh.io.reader import ImagePDFReader
from hochiminh.pdf_parser import PDFParser


class TestPDFConverter(unittest.TestCase):
Expand Down
2 changes: 1 addition & 1 deletion test/test_pdfconverter.py
Expand Up @@ -3,7 +3,7 @@

from shutil import rmtree

from internal.io.pdfconverter import PDFConverter
from hochiminh.io.pdfconverter import PDFConverter


class TestPDFConverter(unittest.TestCase):
Expand Down

0 comments on commit 412e483

Please sign in to comment.