Skip to content

Commit

Permalink
chg: [importers] add Dir/File Importer
Browse files Browse the repository at this point in the history
  • Loading branch information
Terrtia committed May 22, 2023
1 parent d55f065 commit af719d1
Show file tree
Hide file tree
Showing 5 changed files with 323 additions and 107 deletions.
2 changes: 1 addition & 1 deletion HOWTO.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ For the moment, there are three different ways to feed AIL with data:

2. You can setup [pystemon](https://github.com/cvandeplas/pystemon) and use the custom feeder provided by AIL (see below).

3. You can feed your own data using the [./bin/import_dir.py](./bin/import_dir.py) script.
3. You can feed your own data using the [./bin/file_dir_importer.py](./bin/file_dir_importer.py) script.

### Feeding AIL with pystemon

Expand Down
97 changes: 97 additions & 0 deletions bin/importer/FileImporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
Importer Class
================
Import Content
"""
import logging.config
import os
import sys


sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from importer.abstract_importer import AbstractImporter
# from modules.abstract_module import AbstractModule
from lib import ail_logger
from lib.ail_queues import AILQueue
from lib import ail_files # TODO RENAME ME

# Module-level side effect: configure logging from AIL's shared logger config
logging.config.dictConfig(ail_logger.get_config(name='modules'))

# TODO Clean queue one object destruct

class FileImporter(AbstractImporter):
    """Import the content of a single file into AIL.

    Text files are gzipped then base64-encoded; files that are already
    gzipped are base64-encoded as-is. Any other mimetype is silently
    skipped. Each imported file is pushed as one message on the
    'FileImporter' queue.
    """

    def __init__(self, feeder='file_import'):
        """
        :param feeder: feeder name, used as the first component of
                       created item IDs
        """
        super().__init__()
        self.logger = logging.getLogger(f'{self.__class__.__name__}')

        self.feeder_name = feeder  # TODO sanityze feeder name

        # Setup the I/O queues
        self.queue = AILQueue('FileImporter', 'manual')

    def _import_content(self, path, content):
        """Build the item ID for *path* and queue the encoded *content*.

        An empty/falsy *content* (encoding failure) is silently dropped.
        """
        item_id = ail_files.create_item_id(self.feeder_name, path)
        if content:
            message = f'dir_import {item_id} {content}'
            self.logger.info(message)
            self.queue.send_message(message)

    def importer(self, path):
        """Import the file at *path* if its mimetype is supported."""
        if not os.path.isfile(path):
            return
        with open(path, 'rb') as f:
            content = f.read()
        mimetype = ail_files.get_mimetype(content)
        if ail_files.is_text(mimetype):
            # plain text -> compress before transport
            self._import_content(path, ail_files.create_gzipped_b64(content))
        elif mimetype == 'application/gzip':
            # already compressed -> just base64-encode
            self._import_content(path, ail_files.create_b64(content))

class DirImporter(AbstractImporter):
    """Recursively import every file found under a directory tree."""

    def __init__(self):
        super().__init__()
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.file_importer = FileImporter()

    def importer(self, dir_path):
        """Walk *dir_path* and import each file found.

        :raises Exception: if *dir_path* is not an existing directory
        """
        if not os.path.isdir(dir_path):
            message = f'Error, {dir_path} is not a directory'
            self.logger.warning(message)
            raise Exception(message)

        for root, _, files in os.walk(dir_path):
            for name in files:
                self.file_importer.importer(os.path.join(root, name))


if __name__ == '__main__':
    import argparse
    # TODO multiple files/dirs ???
    parser = argparse.ArgumentParser(description='Directory or file importer')
    parser.add_argument('-d', '--directory', type=str, help='Root directory to import')
    parser.add_argument('-f', '--file', type=str, help='File to import')
    args = parser.parse_args()

    # Nothing to do: show usage instead of failing silently
    if not args.directory and not args.file:
        parser.print_help()
        sys.exit(0)

    if args.directory:
        dir_importer = DirImporter()
        dir_importer.importer(args.directory)

    if args.file:
        file_importer = FileImporter()
        file_importer.importer(args.file)
195 changes: 195 additions & 0 deletions bin/lib/ail_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import base64
import datetime
import gzip
import logging.config
import magic
import os
import sys

from werkzeug.utils import secure_filename

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib import ail_logger
from lib.ail_core import generate_uuid
# from lib import ConfigLoader
from packages import Date

# Module-level side effect: configure logging from AIL's shared logger config;
# this module logs through the root logger
logging.config.dictConfig(ail_logger.get_config(name='modules'))
logger = logging.getLogger()

# config_loader = ConfigLoader.ConfigLoader()
# r_serv = config_loader.get_db_conn("Kvrocks_Stats") # TODO CHANGE DB
# r_cache = config_loader.get_redis_conn("Redis_Log_submit")
#
# # Text max size
# TEXT_MAX_SIZE = ConfigLoader.ConfigLoader().get_config_int("SubmitPaste", "TEXT_MAX_SIZE")
# # File max size
# FILE_MAX_SIZE = ConfigLoader.ConfigLoader().get_config_int("SubmitPaste", "FILE_MAX_SIZE")
# # Allowed file type
# ALLOWED_EXTENSIONS = ConfigLoader.ConfigLoader().get_config_str("SubmitPaste", "FILE_ALLOWED_EXTENSIONS").split(',')
# config_loader = None
#
# # TODO generate UUID
#
# # TODO Source ????
#
# # TODO RENAME ME
# class Submit:
# def __init__(self, submit_uuid):
# self.uuid = submit_uuid
#
# def exists(self):
# return r_serv.exists(f'submit:{self.uuid}')
#
# def is_item(self):
# return r_serv.hexists(f'submit:{self.uuid}', 'content')
#
# def is_file(self):
# return r_serv.hexists(f'submit:{self.uuid}', 'filename')
#
# def get_filename(self):
# return r_serv.hget(f'submit:{self.uuid}', 'filename')
#
# def get_content(self):
# return r_serv.hget(f'submit:{self.uuid}', 'content')
#
# def get_password(self):
# r_serv.hget(f'submit:{self.uuid}', 'password')
#
# def get_tags(self):
# return r_serv.smembers(f'submit:tags:{self.uuid}')
#
# def get_error(self):
# return r_cache.hget(f'submit:{self.uuid}:', 'error')
#
# def get_stats(self):
# stats = {'ended': r_cache.hget(f'submit:{self.uuid}', 'ended'), # boolean
# 'objs': r_cache.hget(f'submit:{self.uuid}', 'objs'), # objs IDs
# 'nb_files': r_cache.hget(f'submit:{self.uuid}', 'nb_files'),
# 'nb_done': r_cache.hget(f'submit:{self.uuid}', 'nb_done'),
# 'submitted': r_cache.hget(f'submit:{self.uuid}', 'submitted'),
# 'error': self.get_error()}
# return stats
#
#
# def get_meta(self):
# meta = {'uuid': self.uuid}
# return meta
#
# def is_compressed(self):
# pass
#
#
# def abort(self, message):
# self.set_error(message)
# r_cache.hset(f'submit:{self.uuid}', 'ended', 'True')
# self.delete()
#
# def set_error(self, message):
#
# r_serv.hset(f'submit:{self.uuid}', 'error', )
#
# # source ???
# def create(self, content='', filename='', tags=[], password=None):
#
#
#
#
# r_serv.sadd(f'submits:all')
#
#
# def delete(self):
# r_serv.srem(f'submits:all', self.uuid)
# r_cache.delete(f'submit:{self.uuid}')
# r_serv.delete(f'submit:tags:{self.uuid}')
# r_serv.delete(f'submit:{self.uuid}')
#
#
# def create_submit(tags=[]):
# submit_uuid = generate_uuid()
# submit = Submit(submit_uuid)
#
# def api_create_submit():
# pass


#########################################################################################
#########################################################################################
#########################################################################################

# Mime types handled as archives. Commented-out entries appear to be
# aliases or not-yet-supported formats — confirm before enabling.
ARCHIVE_MIME_TYPE = {
    'application/zip',
    # application/bzip2
    'application/x-bzip2',
    'application/java-archive',
    'application/x-tar',
    'application/gzip',
    # application/x-gzip
    'application/x-lzma',
    'application/x-xz',
    # application/x-xz-compressed-tar
    'application/x-lz',
    'application/x-7z-compressed',
    'application/x-rar',
    # application/x-rar-compressed
    'application/x-iso9660-image',
    'application/vnd.ms-cab-compressed',
    # application/x-lzma
    # application/x-compress
    # application/x-lzip
    # application/x-lz4
    # application/zstd
}

def is_archive(mimetype):
    """Return True if *mimetype* is one of the known archive mime types."""
    return mimetype in ARCHIVE_MIME_TYPE

def is_text(mimetype):
    """Return True when the top-level mimetype is 'text' (e.g. 'text/plain')."""
    # partition keeps the original edge case: a bare 'text' counts as text
    return mimetype.partition('/')[0] == 'text'


def get_mimetype(b_content):
    """Detect and return the mimetype string of the raw bytes via libmagic."""
    detected = magic.from_buffer(b_content, mime=True)
    return detected

def create_item_id(feeder_name, path):
    """Build an item ID of the form <feeder_name>/YYYY/MM/DD/<basename>.gz.

    The date is taken from the trailing path components when the path looks
    like .../YYYY/MM/DD/<file>; otherwise today's date is used and the whole
    path becomes the basename. The basename is sanitized, length-bounded,
    and normalized so '.gz' is its only extension.
    """
    names = path.split('/')
    try:
        # Validate .../YYYY/MM/DD/<file> by constructing a real datetime
        date = datetime.datetime(int(names[-4]), int(names[-3]), int(names[-2])).strftime("%Y%m%d")
        basename = names[-1]
    except (IndexError, ValueError):
        # Path carries no usable date -> fall back to today, keep full path as name
        date = Date.get_today_date_str()
        basename = path # TODO check max depth
    date = f'{date[0:4]}/{date[4:6]}/{date[6:8]}'
    basename = secure_filename(basename)
    if len(basename) < 1:
        # secure_filename() stripped everything -> use a random name
        basename = generate_uuid()
    if len(basename) > 215:
        # Bound the length; uuid suffix avoids collisions between truncated names
        basename = basename[-215:] + str(generate_uuid())
    if not basename.endswith('.gz'):
        # Replace inner dots so '.gz' becomes the only extension
        basename = basename.replace('.', '_')
        basename = f'{basename}.gz'
    else:
        # Already ends with .gz: replace every dot except the extension separator
        nb = basename.count('.') - 1
        if nb > 0:
            basename = basename.replace('.', '_', nb)
    item_id = os.path.join(feeder_name, date, basename)
    # TODO check if already exists
    return item_id

def create_b64(b_content):
    """Return the bytes *b_content* as a standard base64-encoded string."""
    return base64.standard_b64encode(b_content).decode()

def create_gzipped_b64(b_content):
    """Gzip *b_content* and return the result base64-encoded ('' on failure)."""
    try:
        return create_b64(gzip.compress(b_content))
    except Exception as e:
        # Best-effort: log the problem and signal failure with an empty string
        logger.warning(e)
        return ''
3 changes: 3 additions & 0 deletions configs/modules.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ publish = Importers
[Importer_Json]
publish = Importers,Tags

[FileImporter]
publish = Importers

[PystemonModuleImporter]
publish = Importers

Expand Down

0 comments on commit af719d1

Please sign in to comment.