deanmalmgren · akoumjian · Aug 8, 2016 · Aug 8, 2016 · Aug 13, 2016 · Aug 20, 2016
diff --git a/requirements/debian b/requirements/debian
@@ -12,6 +12,7 @@ unrtf
 
 # parse image files
 tesseract-ocr
+libjpeg-dev
 
 # parse pdfs
 poppler-utils

diff --git a/requirements/python b/requirements/python
@@ -4,6 +4,7 @@ argcomplete
 chardet
 python-pptx>=0.5.1
 python-docx
+python-magic
 pdfminer==20140328
 beautifulsoup4
 xlrd

diff --git a/tests/base.py b/tests/base.py
@@ -42,8 +42,12 @@ def __init__(self, *args, **kwargs):
             )
 
     def get_extension_directory(self):
+        """Return the ``data/<extension>`` fixture directory that sits
+        next to this test module.
+        """
         return os.path.join(
             os.path.dirname(os.path.abspath(__file__)),
+            'data',
             self.extension,
         )
 

diff --git a/tests/csv/raw_text.csv → tests/data/csv/raw_text.csv b/tests/csv/raw_text.csv → tests/data/csv/raw_text.csv
diff --git a/tests/csv/raw_text.txt → tests/data/csv/raw_text.txt b/tests/csv/raw_text.txt → tests/data/csv/raw_text.txt
diff --git a/tests/csv/standardized_text.csv → tests/data/csv/standardized_text.csv b/tests/csv/standardized_text.csv → tests/data/csv/standardized_text.csv
diff --git a/tests/doc/raw_text.doc → tests/data/doc/raw_text.doc b/tests/doc/raw_text.doc → tests/data/doc/raw_text.doc
diff --git a/tests/doc/raw_text.txt → tests/data/doc/raw_text.txt b/tests/doc/raw_text.txt → tests/data/doc/raw_text.txt
diff --git a/tests/doc/standardized_text.doc → tests/data/doc/standardized_text.doc b/tests/doc/standardized_text.doc → tests/data/doc/standardized_text.doc
diff --git a/tests/doc/standardized_text_1.odt → tests/data/doc/standardized_text_1.odt b/tests/doc/standardized_text_1.odt → tests/data/doc/standardized_text_1.odt
diff --git a/tests/docx/paragraphs_and_tables.docx → tests/data/docx/paragraphs_and_tables.docx b/tests/docx/paragraphs_and_tables.docx → tests/data/docx/paragraphs_and_tables.docx
diff --git a/tests/docx/paragraphs_and_tables.txt → tests/data/docx/paragraphs_and_tables.txt b/tests/docx/paragraphs_and_tables.txt → tests/data/docx/paragraphs_and_tables.txt
diff --git a/tests/docx/raw_text.docx → tests/data/docx/raw_text.docx b/tests/docx/raw_text.docx → tests/data/docx/raw_text.docx
diff --git a/tests/docx/raw_text.txt → tests/data/docx/raw_text.txt b/tests/docx/raw_text.txt → tests/data/docx/raw_text.txt
diff --git a/tests/docx/standardized_text.docx → tests/data/docx/standardized_text.docx b/tests/docx/standardized_text.docx → tests/data/docx/standardized_text.docx
diff --git a/tests/eml/raw_text.eml → tests/data/eml/raw_text.eml b/tests/eml/raw_text.eml → tests/data/eml/raw_text.eml
diff --git a/tests/eml/raw_text.txt → tests/data/eml/raw_text.txt b/tests/eml/raw_text.txt → tests/data/eml/raw_text.txt
diff --git a/tests/eml/standardized_text.eml → tests/data/eml/standardized_text.eml b/tests/eml/standardized_text.eml → tests/data/eml/standardized_text.eml
diff --git a/tests/epub/raw_text.epub → tests/data/epub/raw_text.epub b/tests/epub/raw_text.epub → tests/data/epub/raw_text.epub
diff --git a/tests/epub/raw_text.txt → tests/data/epub/raw_text.txt b/tests/epub/raw_text.txt → tests/data/epub/raw_text.txt
diff --git a/tests/epub/standardized_text.epub → tests/data/epub/standardized_text.epub b/tests/epub/standardized_text.epub → tests/data/epub/standardized_text.epub
diff --git a/tests/gif/raw_text.gif → tests/data/gif/raw_text.gif b/tests/gif/raw_text.gif → tests/data/gif/raw_text.gif
diff --git a/tests/gif/standardized_text.gif → tests/data/gif/standardized_text.gif b/tests/gif/standardized_text.gif → tests/data/gif/standardized_text.gif
diff --git a/tests/html/raw_text.html → tests/data/html/raw_text.html b/tests/html/raw_text.html → tests/data/html/raw_text.html
diff --git a/tests/html/raw_text.txt → tests/data/html/raw_text.txt b/tests/html/raw_text.txt → tests/data/html/raw_text.txt
diff --git a/tests/html/standardized_text.html → tests/data/html/standardized_text.html b/tests/html/standardized_text.html → tests/data/html/standardized_text.html
diff --git a/tests/html/tables.html → tests/data/html/tables.html b/tests/html/tables.html → tests/data/html/tables.html
diff --git a/tests/html/tables.txt → tests/data/html/tables.txt b/tests/html/tables.txt → tests/data/html/tables.txt
diff --git a/tests/jpg/raw_text.jpg → tests/data/jpg/raw_text.jpg b/tests/jpg/raw_text.jpg → tests/data/jpg/raw_text.jpg
diff --git a/tests/jpg/standardized_text.jpg → tests/data/jpg/standardized_text.jpg b/tests/jpg/standardized_text.jpg → tests/data/jpg/standardized_text.jpg
diff --git a/tests/json/raw_text.json → tests/data/json/raw_text.json b/tests/json/raw_text.json → tests/data/json/raw_text.json
diff --git a/tests/json/raw_text.txt → tests/data/json/raw_text.txt b/tests/json/raw_text.txt → tests/data/json/raw_text.txt
diff --git a/tests/json/standardized_text.json → tests/data/json/standardized_text.json b/tests/json/standardized_text.json → tests/data/json/standardized_text.json
diff --git a/tests/mp3/raw_text.mp3 → tests/data/mp3/raw_text.mp3 b/tests/mp3/raw_text.mp3 → tests/data/mp3/raw_text.mp3
diff --git a/tests/mp3/raw_text.txt → tests/data/mp3/raw_text.txt b/tests/mp3/raw_text.txt → tests/data/mp3/raw_text.txt
diff --git a/tests/mp3/standardized_text.mp3 → tests/data/mp3/standardized_text.mp3 b/tests/mp3/standardized_text.mp3 → tests/data/mp3/standardized_text.mp3
diff --git a/tests/msg/raw_text.msg → tests/data/msg/raw_text.msg b/tests/msg/raw_text.msg → tests/data/msg/raw_text.msg
diff --git a/tests/msg/raw_text.txt → tests/data/msg/raw_text.txt b/tests/msg/raw_text.txt → tests/data/msg/raw_text.txt
@@ -1,15 +1,15 @@
 Test for TIF files
 
-This is a test email to experiment with the MS Outlook MSG Extractor
-
-
--- 
-
-
-Kind regards
-
-
-
-
-Brian Zhou
-
+This is a test email to experiment with the MS Outlook MSG Extractor
+
+
+-- 
+
+
+Kind regards
+
+
+
+
+Brian Zhou
+
diff --git a/tests/msg/standardized_text.msg → tests/data/msg/standardized_text.msg b/tests/msg/standardized_text.msg → tests/data/msg/standardized_text.msg
diff --git a/tests/odt/raw_text.odt → tests/data/odt/raw_text.odt b/tests/odt/raw_text.odt → tests/data/odt/raw_text.odt
diff --git a/tests/odt/raw_text.txt → tests/data/odt/raw_text.txt b/tests/odt/raw_text.txt → tests/data/odt/raw_text.txt
diff --git a/tests/odt/standardized_text.odt → tests/data/odt/standardized_text.odt b/tests/odt/standardized_text.odt → tests/data/odt/standardized_text.odt
diff --git a/tests/ogg/raw_text.ogg → tests/data/ogg/raw_text.ogg b/tests/ogg/raw_text.ogg → tests/data/ogg/raw_text.ogg
diff --git a/tests/ogg/raw_text.txt → tests/data/ogg/raw_text.txt b/tests/ogg/raw_text.txt → tests/data/ogg/raw_text.txt
diff --git a/tests/ogg/standardized_text.ogg → tests/data/ogg/standardized_text.ogg b/tests/ogg/standardized_text.ogg → tests/data/ogg/standardized_text.ogg
diff --git a/tests/pdf/ocr_text.pdf → tests/data/pdf/ocr_text.pdf b/tests/pdf/ocr_text.pdf → tests/data/pdf/ocr_text.pdf
diff --git a/tests/pdf/raw_text-m=pdfminer.txt → tests/data/pdf/raw_text-m=pdfminer.txt b/tests/pdf/raw_text-m=pdfminer.txt → tests/data/pdf/raw_text-m=pdfminer.txt
diff --git a/tests/pdf/raw_text.pdf → tests/data/pdf/raw_text.pdf b/tests/pdf/raw_text.pdf → tests/data/pdf/raw_text.pdf
diff --git a/tests/pdf/raw_text.txt → tests/data/pdf/raw_text.txt b/tests/pdf/raw_text.txt → tests/data/pdf/raw_text.txt
diff --git a/tests/pdf/standardized_text.pdf → tests/data/pdf/standardized_text.pdf b/tests/pdf/standardized_text.pdf → tests/data/pdf/standardized_text.pdf
diff --git a/tests/pdf/two_column.pdf → tests/data/pdf/two_column.pdf b/tests/pdf/two_column.pdf → tests/data/pdf/two_column.pdf
diff --git a/tests/pdf/two_column.txt → tests/data/pdf/two_column.txt b/tests/pdf/two_column.txt → tests/data/pdf/two_column.txt
diff --git a/tests/png/raw_text.png → tests/data/png/raw_text.png b/tests/png/raw_text.png → tests/data/png/raw_text.png
diff --git a/tests/png/standardized_text.png → tests/data/png/standardized_text.png b/tests/png/standardized_text.png → tests/data/png/standardized_text.png
diff --git a/tests/pptx/raw_text.pptx → tests/data/pptx/raw_text.pptx b/tests/pptx/raw_text.pptx → tests/data/pptx/raw_text.pptx
diff --git a/tests/pptx/raw_text.txt → tests/data/pptx/raw_text.txt b/tests/pptx/raw_text.txt → tests/data/pptx/raw_text.txt
diff --git a/tests/pptx/standardized_text.pptx → tests/data/pptx/standardized_text.pptx b/tests/pptx/standardized_text.pptx → tests/data/pptx/standardized_text.pptx
diff --git a/tests/ps/raw_text.ps → tests/data/ps/raw_text.ps b/tests/ps/raw_text.ps → tests/data/ps/raw_text.ps
diff --git a/tests/ps/raw_text.txt → tests/data/ps/raw_text.txt b/tests/ps/raw_text.txt → tests/data/ps/raw_text.txt
diff --git a/tests/ps/standardized_text.ps → tests/data/ps/standardized_text.ps b/tests/ps/standardized_text.ps → tests/data/ps/standardized_text.ps
diff --git a/tests/rtf/raw_text.rtf → tests/data/rtf/raw_text.rtf b/tests/rtf/raw_text.rtf → tests/data/rtf/raw_text.rtf
diff --git a/tests/rtf/raw_text.txt → tests/data/rtf/raw_text.txt b/tests/rtf/raw_text.txt → tests/data/rtf/raw_text.txt
diff --git a/tests/rtf/standardized_text.rtf → tests/data/rtf/standardized_text.rtf b/tests/rtf/standardized_text.rtf → tests/data/rtf/standardized_text.rtf
diff --git a/tests/tiff/raw_text.tiff → tests/data/tiff/raw_text.tiff b/tests/tiff/raw_text.tiff → tests/data/tiff/raw_text.tiff
diff --git a/tests/tiff/standardized_text.tiff → tests/data/tiff/standardized_text.tiff b/tests/tiff/standardized_text.tiff → tests/data/tiff/standardized_text.tiff
diff --git a/tests/txt/raw_text.txt → tests/data/txt/raw_text.txt b/tests/txt/raw_text.txt → tests/data/txt/raw_text.txt
diff --git a/tests/txt/standardized_text.txt → tests/data/txt/standardized_text.txt b/tests/txt/standardized_text.txt → tests/data/txt/standardized_text.txt
diff --git a/tests/wav/raw_text.txt → tests/data/wav/raw_text.txt b/tests/wav/raw_text.txt → tests/data/wav/raw_text.txt
diff --git a/tests/wav/raw_text.wav → tests/data/wav/raw_text.wav b/tests/wav/raw_text.wav → tests/data/wav/raw_text.wav
diff --git a/tests/wav/standardized_text.wav → tests/data/wav/standardized_text.wav b/tests/wav/standardized_text.wav → tests/data/wav/standardized_text.wav
diff --git a/tests/xls/raw_text.txt → tests/data/xls/raw_text.txt b/tests/xls/raw_text.txt → tests/data/xls/raw_text.txt
diff --git a/tests/xls/raw_text.xls → tests/data/xls/raw_text.xls b/tests/xls/raw_text.xls → tests/data/xls/raw_text.xls
diff --git a/tests/xls/standardized_text.xls → tests/data/xls/standardized_text.xls b/tests/xls/standardized_text.xls → tests/data/xls/standardized_text.xls
diff --git a/tests/xlsx/raw_text.txt → tests/data/xlsx/raw_text.txt b/tests/xlsx/raw_text.txt → tests/data/xlsx/raw_text.txt
diff --git a/tests/xlsx/raw_text.xlsx → tests/data/xlsx/raw_text.xlsx b/tests/xlsx/raw_text.xlsx → tests/data/xlsx/raw_text.xlsx
diff --git a/tests/xlsx/standardized_text.xlsx → tests/data/xlsx/standardized_text.xlsx b/tests/xlsx/standardized_text.xlsx → tests/data/xlsx/standardized_text.xlsx
diff --git a/tests/test_detect_filetype.py b/tests/test_detect_filetype.py
@@ -0,0 +1,73 @@
+"""Tests for textract.parsers.filetype.detect_filetype."""
+import os
+import unittest
+import tempfile
+import shutil
+
+from textract.parsers.filetype import detect_filetype
+
+
+def _extension(path):
+    """Return the extension of `path` without the leading dot."""
+    _, ext = os.path.splitext(path)
+    return ext.strip('.')
+
+
+def file_paths():
+    """
+    Returns a sorted list of absolute paths to the test files whose
+    extension matches the name of the directory they live in
+
+    e.g.
+    [
+        '/.../tests/data/epub/raw_text.epub',
+        '/.../tests/data/epub/standardized_text.epub',
+        ...
+    ]
+    """
+    paths = []
+    data_dir = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)),
+        'data'
+    )
+    for root, _, files in os.walk(data_dir):
+        folder = os.path.basename(root)
+        for filename in files:
+            if _extension(filename) == folder:
+                paths.append(os.path.join(root, filename))
+    # os.walk order is platform dependent; sort for reproducible runs
+    return sorted(paths)
+
+
+class DetectFileTypeTestCase(unittest.TestCase):
+    """
+    Make sure detecting file types works correctly.
+    """
+
+    def test_detect_filetype_with_extension(self):
+        """
+        Make sure we are using the extension when it exists
+        """
+        for path in file_paths():
+            # pass the path as the message so a failure names the file
+            self.assertEqual(_extension(path), detect_filetype(path), path)
+
+    def test_detect_filetype_with_mimetype(self):
+        """
+        Test detecting file types by their MIME type
+        """
+        for path in file_paths():
+            # Put in temporary file that has no extension. Create it
+            # before the try block so that the cleanup below never
+            # runs with temp_path undefined
+            handle, temp_path = tempfile.mkstemp()
+            try:
+                # mkstemp returns an open descriptor; close it so it
+                # is not leaked once per test file
+                os.close(handle)
+                shutil.copyfile(path, temp_path)
+                detected = detect_filetype(temp_path)
+                self.assertEqual(_extension(path), detected, path)
+            finally:
+                # clean up temp files, be a good citizen
+                os.remove(temp_path)
diff --git a/textract/__init__.py b/textract/__init__.py
@@ -1,3 +1,10 @@
+import logging
+
+# Libraries should not configure logging for the application that
+# imports them (logging.basicConfig() installs a root handler), so only
+# attach a NullHandler to the 'textract' logger used by the parsers.
+logging.getLogger('textract').addHandler(logging.NullHandler())
+
 from .parsers import process
 
 VERSION = "1.4.0"
diff --git a/textract/parsers/__init__.py b/textract/parsers/__init__.py
@@ -6,13 +6,8 @@
 import importlib
 
 from .. import exceptions
+from .filetype import detect_filetype
 
-# Dictionary structure for synonymous file extension types
-EXTENSION_SYNONYMS = {
-    ".jpeg": ".jpg",
-    ".htm": ".html",
-    "": ".txt",
-}
 
 # default encoding that is returned by the process method. specify it
 # here so the default is used on both the process function and also by
@@ -30,28 +25,26 @@ def process(filename, encoding=DEFAULT_ENCODING, **kwargs):
     if not os.path.exists(filename):
         raise exceptions.MissingFileError(filename)
 
-    # get the filename extension, which is something like .docx for
-    # example, and import the module dynamically using importlib. This
-    # is a relative import so the name of the package is necessary
-    _, ext = os.path.splitext(filename)
-    ext = ext.lower()
-
-    # check the EXTENSION_SYNONYMS dictionary
-    ext = EXTENSION_SYNONYMS.get(ext, ext)
+    # detect the file type (e.g. "docx"); the result is lowercased,
+    # has no leading dot and has extension synonyms resolved
+    ext = detect_filetype(filename)
 
     # to avoid conflicts with packages that are installed globally
     # (e.g. python's json module), all extension parser modules have
     # the _parser extension
-    rel_module = ext + '_parser'
-    module_name = rel_module[1:]
+    module_name = ext + '_parser'
 
     # if this module name doesn't exist in this directory it isn't
-    # currently supported
+    # currently supported. Check for the file rather than catching
+    # ImportError from the import below, which would also hide a
+    # parser whose own third-party dependency is not installed
     this_dir = os.path.dirname(os.path.abspath(__file__))
     if not os.path.exists(os.path.join(this_dir, module_name + '.py')):
         raise exceptions.ExtensionNotSupported(ext)
 
     # do the extraction
-    filetype_module = importlib.import_module(rel_module, 'textract.parsers')
+    filetype_module = importlib.import_module(
+        '.' + module_name, 'textract.parsers'
+    )
     parser = filetype_module.Parser()
     return parser.process(filename, encoding, **kwargs)
diff --git a/textract/parsers/filetype.py b/textract/parsers/filetype.py
@@ -0,0 +1,150 @@
+"""
+Detect which parser should handle a file.
+
+The file extension decides whenever there is one; files without an
+extension are identified by MIME type when python-magic is installed.
+"""
+import os
+import logging
+
+logger = logging.getLogger('textract')
+
+# NOTE(review): not referenced anywhere else in this module
+DETECT_METHOD = 'extension'
+
+# python-magic is optional: without it only extensions are used
+try:
+    import magic
+except ImportError:
+    magic = None
+    logger.warning('Python-magic not detected. Using file extension to detect file type.')
+
+
+# Dictionary structure for synonymous file extension types
+EXTENSION_SYNONYMS = {
+    "jpeg": "jpg",
+    "htm": "html",
+    "": "txt",
+}
+
+# Maps a MIME type to the file type (parser name prefix) that handles it.
+# MIME types sourced from
+# http://svn.apache.org/repos/asf/httpd/httpd/trunk/docs/conf/mime.types
+# NOTE(review): the generic 'audio', 'image' and 'rdf' values only work if
+# matching *_parser modules exist -- confirm, or map them to real parsers
+MIME_MAPPING = {
+    'application/atom+xml': 'txt',
+    'application/atomcat+xml': 'txt',
+    'application/atomsvc+xml': 'txt',
+    'application/ccxml+xml': 'txt',
+    'application/davmount+xml': 'txt',
+    'application/docbook+xml': 'txt',
+    'application/dssc+xml': 'txt',
+    'application/ecmascript': 'txt',
+    'application/emma+xml': 'txt',
+    'application/epub+zip': 'epub',
+    'application/gml+xml': 'txt',
+    'application/gpx+xml': 'txt',
+    'application/inkml+xml': 'txt',
+    'application/javascript': 'txt',
+    'application/json': 'json',
+    'application/jsonml+json': 'json',
+    'application/lost+xml': 'txt',
+    'application/mads+xml': 'txt',
+    'application/marcxml+xml': 'txt',
+    'application/mathml+xml': 'txt',
+    'application/metalink+xml': 'txt',
+    'application/metalink4+xml': 'txt',
+    'application/mets+xml': 'txt',
+    'application/mods+xml': 'txt',
+    'application/msword': 'doc',
+    'application/ogg': 'ogg',
+    'application/pdf': 'pdf',
+    'application/postscript': 'ps',
+    'application/rdf+xml': 'rdf',
+    'application/reginfo+xml': 'txt',
+    'application/rss+xml': 'txt',
+    'application/rtf': 'rtf',
+    'application/sbml+xml': 'txt',
+    'application/vnd.ms-excel': 'xls',
+    'application/vnd.ms-word.document.macroenabled.12': 'doc',
+    'application/vnd.oasis.opendocument.text': 'odt',
+    'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
+    'application/vnd.openxmlformats-officedocument.presentationml.slide': 'pptx',
+    'application/vnd.openxmlformats-officedocument.presentationml.slideshow': 'pptx',
+    'application/vnd.openxmlformats-officedocument.presentationml.template': 'pptx',
+    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
+    'application/vnd.openxmlformats-officedocument.spreadsheetml.template': 'xlsx',
+    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
+    'application/vnd.openxmlformats-officedocument.wordprocessingml.template': 'docx',
+    'application/xhtml+xml': 'html',
+    'application/xml-dtd': 'html',
+    'application/xml': 'html',
+    'audio/basic': 'audio',
+    'audio/midi': 'audio',
+    'audio/mp4': 'audio',
+    'audio/mpeg': 'mp3',
+    'audio/ogg': 'ogg',
+    'audio/s3m': 'audio',
+    'audio/webm': 'audio',
+    'audio/x-aac': 'audio',
+    'audio/x-flac': 'audio',
+    'audio/x-wav': 'wav',
+    'image/bmp': 'image',
+    'image/cgm': 'image',
+    'image/g3fax': 'image',
+    'image/gif': 'gif',
+    'image/jpeg': 'jpg',
+    'image/png': 'png',
+    'image/svg+xml': 'image',
+    'image/tiff': 'tiff',
+    'text/calendar': 'txt',
+    'text/css': 'txt',
+    'text/csv': 'csv',
+    'text/html': 'html',
+    'text/plain': 'txt',
+    'text/richtext': 'rtf',
+}
+
+
+def _detect_by_mimetype(filename, default):
+    """
+    Return the file type mapped to the MIME type of `filename`, or
+    `default` when python-magic is missing or the type is not mapped.
+    """
+    if magic is None:
+        return default
+
+    mime = magic.from_file(filename, mime=True)
+    if isinstance(mime, bytes) and not isinstance(mime, str):
+        # presumably only older python-magic releases on Python 3
+        # return bytes; the mapping keys are text either way
+        mime = mime.decode('utf-8')
+    logger.info('File {0} had mimetype {1}'.format(filename, mime))
+    return MIME_MAPPING.get(mime, default)
+
+
+def detect_filetype(filename, default='txt'):
+    """
+    Detect a file's type, using `default` if not found.
+
+    The extension of `filename` is used whenever it has one. Only files
+    without an extension are looked up by MIME type, which requires
+    python-magic; `default` is returned when python-magic is not
+    installed or the MIME type is not in `MIME_MAPPING`.
+
+    The result is lowercased, has no leading dot and has
+    `EXTENSION_SYNONYMS` applied, e.g. "photo.JPEG" gives "jpg".
+    """
+    # First attempt to detect by extension
+    _, ext = os.path.splitext(filename)
+    if not ext:
+        ext = _detect_by_mimetype(filename, default)
+
+    # Use the extension synonyms dictionary to consolidate
+    ext = ext.lower().strip('.')
+    ext = EXTENSION_SYNONYMS.get(ext, ext)
+
+    # log after consolidating so the message names the parser really used
+    logger.info('Using {0} parser'.format(ext))
+    return ext