Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use mimetype detection as backup when extension not present #121

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements/debian
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ unrtf

# parse image files
tesseract-ocr
libjpeg-dev

# parse pdfs
poppler-utils
Expand Down
1 change: 1 addition & 0 deletions requirements/python
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ argcomplete
chardet
python-pptx>=0.5.1
python-docx
python-magic
pdfminer==20140328
beautifulsoup4
xlrd
Expand Down
6 changes: 5 additions & 1 deletion tests/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,12 @@ def __init__(self, *args, **kwargs):
)

def get_extension_directory(self):
return os.path.join(
data_dir = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
'data'
)
return os.path.join(
data_dir,
self.extension,
)

Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes
File renamed without changes
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes
File renamed without changes
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
26 changes: 13 additions & 13 deletions tests/msg/raw_text.txt → tests/data/msg/raw_text.txt
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
Test for TIF files

This is a test email to experiment with the MS Outlook MSG Extractor
--
Kind regards
Brian Zhou
This is a test email to experiment with the MS Outlook MSG Extractor


--


Kind regards




Brian Zhou

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes
File renamed without changes
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
70 changes: 70 additions & 0 deletions tests/test_detect_filetype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import os
import unittest
import tempfile
import shutil

from textract.parsers.filetype import detect_filetype


def file_paths(data_dir=None):
    """
    Return a sorted list of paths to the test fixture files.

    A file is included only when its extension equals the name of the
    directory that directly contains it, e.g.

        [
            './data/epub/raw_text.epub',
            './data/epub/standardized_text.epub',
            ...
        ]

    :param data_dir: directory tree to walk. Defaults to the ``data``
        directory located next to this module.
    :returns: list of matching file paths, sorted so that the order does
        not depend on the order in which the filesystem lists entries.
    """
    if data_dir is None:
        data_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'data'
        )

    paths = []
    for root, _directories, files in os.walk(data_dir):
        folder = os.path.basename(root)
        for filename in files:
            ext = os.path.splitext(filename)[1].strip('.')
            # Extension-less files can never be a fixture for an
            # extension, whatever the containing directory is called.
            if ext and ext == folder:
                paths.append(os.path.join(root, filename))
    return sorted(paths)



class DetectFileTypeTestCase(unittest.TestCase):
    """
    Make sure detecting file types works correctly.
    """

    def test_detect_filetype_with_extension(self):
        """
        Make sure we are using the extension when it exists
        """
        for path in file_paths():
            _, ext = os.path.splitext(path)
            ext = ext.strip('.')
            detected = detect_filetype(path)
            # Include the path so a failure names the offending fixture.
            self.assertEqual(ext, detected, msg=path)

    def test_detect_filetype_with_mimetype(self):
        """
        Test detecting filetypes by their mime
        """
        for path in file_paths():
            _, orig_ext = os.path.splitext(path)
            orig_ext = orig_ext.strip('.')

            # Create the extension-less temporary file *before* entering
            # the try block so the cleanup below never sees an unbound
            # name, and close the descriptor mkstemp hands back: only the
            # path is needed, and leaving it open leaks one fd per fixture.
            handle, temp_path = tempfile.mkstemp()
            os.close(handle)
            try:
                shutil.copyfile(path, temp_path)
                detected = detect_filetype(temp_path)
                self.assertEqual(orig_ext, detected, msg=path)
            finally:
                # clean up temp files, be a good citizen
                os.remove(temp_path)
3 changes: 3 additions & 0 deletions textract/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import logging
# NOTE(review): basicConfig() attaches a handler to the *root* logger as a
# side effect of importing this package, which also affects the logging
# setup of any application that imports it. The Python logging HOWTO
# advises libraries to leave handler configuration to the application --
# confirm this is intended.
logging.basicConfig()

# Re-export the extraction entry point at package level.
from .parsers import process

# Package version string.
VERSION = "1.4.0"
29 changes: 8 additions & 21 deletions textract/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,8 @@
import importlib

from .. import exceptions
from .filetype import detect_filetype

# Dictionary structure for synonymous file extension types
EXTENSION_SYNONYMS = {
".jpeg": ".jpg",
".htm": ".html",
"": ".txt",
}

# default encoding that is returned by the process method. specify it
# here so the default is used on both the process function and also by
Expand All @@ -30,28 +25,20 @@ def process(filename, encoding=DEFAULT_ENCODING, **kwargs):
if not os.path.exists(filename):
raise exceptions.MissingFileError(filename)

# get the filename extension, which is something like .docx for
# example, and import the module dynamically using importlib. This
# is a relative import so the name of the package is necessary
_, ext = os.path.splitext(filename)
ext = ext.lower()

# check the EXTENSION_SYNONYMS dictionary
ext = EXTENSION_SYNONYMS.get(ext, ext)
ext = detect_filetype(filename)

# to avoid conflicts with packages that are installed globally
# (e.g. python's json module), all extension parser modules have
# the _parser extension
rel_module = ext + '_parser'
module_name = rel_module[1:]
module_name = ext + '_parser'

# if this module name doesn't exist in this directory it isn't
# currently supported
this_dir = os.path.dirname(os.path.abspath(__file__))
if not os.path.exists(os.path.join(this_dir, module_name + '.py')):
## If it can't find the module, it's an unsupported file type
try:
import_path = 'textract.parsers.' + module_name
filetype_module = importlib.import_module(import_path)
except ImportError:
raise exceptions.ExtensionNotSupported(ext)

# do the extraction
filetype_module = importlib.import_module(rel_module, 'textract.parsers')
parser = filetype_module.Parser()
return parser.process(filename, encoding, **kwargs)
116 changes: 116 additions & 0 deletions textract/parsers/filetype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import os
import logging

# Package-wide logger used for file type detection messages.
logger = logging.getLogger('textract')

# NOTE(review): nothing in the code visible here reads this constant --
# confirm whether it is still needed.
DETECT_METHOD = 'extension'

# python-magic is an optional dependency. When it cannot be imported,
# `magic` is bound to None so that code using it can test for that and
# rely on the file extension alone.
try:
    import magic
except ImportError:
    magic = None
    # Logger.warn is a deprecated alias; Logger.warning is the supported
    # spelling.
    logger.warning(
        'Python-magic not detected. '
        'Using file extension to detect file type.'
    )


# Dictionary structure for synonymous file extension types.
# Keys and values are extensions written in lower case without the leading
# dot. The empty-string key makes an empty extension resolve to "txt".
EXTENSION_SYNONYMS = {
    "jpeg": "jpg",
    "htm": "html",
    "": "txt",
}

# Maps a mimetype string to the file type name used for it.
# Sourced from
# http://svn.apache.org/repos/asf/httpd/httpd/trunk/docs/conf/mime.types
MIME_MAPPING = {
    'application/atom+xml': 'txt',
    'application/atomcat+xml': 'txt',
    'application/atomsvc+xml': 'txt',
    'application/ccxml+xml': 'txt',
    'application/davmount+xml': 'txt',
    'application/docbook+xml': 'txt',
    'application/dssc+xml': 'txt',
    'application/ecmascript': 'txt',
    'application/emma+xml': 'txt',
    'application/epub+zip': 'epub',
    'application/gml+xml': 'txt',
    'application/gpx+xml': 'txt',
    'application/inkml+xml': 'txt',
    'application/javascript': 'txt',
    'application/json': 'json',
    'application/jsonml+json': 'json',
    'application/lost+xml': 'txt',
    'application/mads+xml': 'txt',
    'application/marcxml+xml': 'txt',
    'application/mathml+xml': 'txt',
    'application/metalink+xml': 'txt',
    'application/metalink4+xml': 'txt',
    'application/mets+xml': 'txt',
    'application/mods+xml': 'txt',
    'application/msword': 'doc',
    'application/ogg': 'ogg',
    'application/pdf': 'pdf',
    'application/postscript': 'ps',
    'application/rdf+xml': 'rdf',
    'application/reginfo+xml': 'txt',
    'application/rss+xml': 'txt',
    'application/rtf': 'rtf',
    'application/sbml+xml': 'txt',
    'application/vnd.ms-excel': 'xls',
    'application/vnd.ms-word.document.macroenabled.12': 'doc',
    'application/vnd.oasis.opendocument.text': 'odt',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
    'application/vnd.openxmlformats-officedocument.presentationml.slide': 'pptx',
    'application/vnd.openxmlformats-officedocument.presentationml.slideshow': 'pptx',
    'application/vnd.openxmlformats-officedocument.presentationml.template': 'pptx',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.template': 'xlsx',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.template': 'docx',
    'application/xhtml+xml': 'html',
    'application/xml-dtd': 'html',
    'application/xml': 'html',
    'audio/basic': 'audio',
    'audio/midi': 'audio',
    'audio/mp4': 'audio',
    'audio/mpeg': 'audio',
    'audio/ogg': 'ogg',
    'audio/s3m': 'audio',
    'audio/webm': 'audio',
    'audio/x-aac': 'audio',
    'audio/x-flac': 'audio',
    'audio/x-wav': 'wav',
    'image/bmp': 'image',
    'image/cgm': 'image',
    'image/g3fax': 'image',
    'image/gif': 'gif',
    'image/jpeg': 'image',
    'image/png': 'image',
    'image/svg+xml': 'image',
    'image/tiff': 'image',
    'text/calendar': 'txt',
    'text/css': 'txt',
    'text/csv': 'csv',
    'text/html': 'html',
    'text/plain': 'txt',
    'text/richtext': 'rtf',
}


def detect_filetype(filename, default='txt'):
    """
    Detect a file's type, using `default` if not found.

    The extension of `filename` decides the type when there is one.
    Otherwise the file's mimetype is looked up with python-magic and
    translated through MIME_MAPPING; when python-magic is not installed,
    or the mimetype is not in the mapping, `default` is used.

    :param filename: path of the file to inspect.
    :param default: type to fall back to when neither the extension nor
        the mimetype identifies the file.
    :returns: the type name in lower case without a leading dot, after
        EXTENSION_SYNONYMS has been applied (e.g. ``'jpeg'`` -> ``'jpg'``).
    """
    # First attempt to detect by extension. Normalise before testing so
    # that a bare trailing dot ("report.") counts as "no extension" and
    # still gets the mimetype fallback.
    _, ext = os.path.splitext(filename)
    ext = ext.lower().strip('.')

    if not ext:
        if magic is None:
            # The optional import failed; there is nothing to sniff with.
            logger.info(
                'File {0} has no extension and python-magic is '
                'unavailable'.format(filename)
            )
            ext = default
        else:
            mime = magic.from_file(filename, mime=True)
            # NOTE(review): some python-magic releases are reported to
            # return bytes on Python 3 -- decode so the lookup matches.
            if isinstance(mime, bytes):
                mime = mime.decode('ascii', 'replace')
            ext = MIME_MAPPING.get(mime, default)
            logger.info('File {0} had mimetype {1}'.format(filename, mime))
        ext = ext.lower().strip('.')

    # Use the extension synonyms dictionary to consolidate
    ext = EXTENSION_SYNONYMS.get(ext, ext)

    # Log after normalisation so the message names the type actually used.
    logger.info('Using {0} parser'.format(ext))
    return ext