From 35502d955f208b6cb26e237866f4c572bf41e029 Mon Sep 17 00:00:00 2001 From: terrtia Date: Fri, 26 Apr 2024 10:31:31 +0200 Subject: [PATCH] fix: [ocr] filter ocr supported languages + fix type of object accepted by the tracker --- bin/lib/ail_core.py | 4 ++-- bin/lib/objects/Ocrs.py | 22 ++++++++++++++++------ bin/modules/OcrExtractor.py | 9 +++++++-- var/www/templates/hunter/tracker_add.html | 4 ++++ 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/bin/lib/ail_core.py b/bin/lib/ail_core.py index 945e8634..9483a3d0 100755 --- a/bin/lib/ail_core.py +++ b/bin/lib/ail_core.py @@ -81,10 +81,10 @@ def get_default_correlation_objects(): return AIL_OBJECTS_CORRELATIONS_DEFAULT def get_obj_queued(): - return ['item', 'image'] + return ['item', 'image', 'message', 'ocr'] def get_objects_tracked(): - return ['decoded', 'item', 'pgp', 'message', 'title'] + return ['decoded', 'item', 'pgp', 'message', 'ocr', 'title'] def get_objects_retro_hunted(): return ['decoded', 'item', 'message'] diff --git a/bin/lib/objects/Ocrs.py b/bin/lib/objects/Ocrs.py index d8da0aa9..32686db3 100755 --- a/bin/lib/objects/Ocrs.py +++ b/bin/lib/objects/Ocrs.py @@ -296,14 +296,24 @@ def extract_text(image_path, languages, threshold=0.2): extracted.append((bbox, text)) return extracted -# TODO OCRS Class -def get_ids(): - return r_object.smembers(f'ocr:all') +def get_ocr_languages(): + return {'af', 'ar', 'as', 'az', 'be', 'bg', 'bh', 'bs', 'cs', 'cy', 'da', 'de', 'en', 'es', 'et', 'fa', 'fr', 'ga', 'hi', 'hr', 'hu', 'id', 'is', 'it', 'ja', 'kn', 'ko', 'ku', 'la', 'lt', 'lv', 'mi', 'mn', 'mr', 'ms', 'mt', 'ne', 'nl', 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sq', 'sr', 'sv', 'sw', 'ta', 'te', 'th', 'tl', 'tr', 'ug', 'uk', 'ur', 'uz', 'vi', 'zh'} -def get_all_ocrs_objects(filters={}): - for obj_id in get_ids(): - yield Ocr(obj_id) + +def sanityze_ocr_languages(languages, ocr_languages=None): + langs = set() + if not ocr_languages: + ocr_languages = get_ocr_languages() + for lang in languages: + if lang in ocr_languages: + if lang == 'zh': + langs.add('ch_sim') + elif lang == 'sr': + langs.add('rs_latin') + else: + langs.add(lang) + return langs class Ocrs(AbstractDaterangeObjects): """ diff --git a/bin/modules/OcrExtractor.py b/bin/modules/OcrExtractor.py index e3285c8e..ad75a4e4 100755 --- a/bin/modules/OcrExtractor.py +++ b/bin/modules/OcrExtractor.py @@ -22,8 +22,9 @@ from lib.objects import Messages from lib.objects import Ocrs + # Default to eng -def get_model_languages(obj, add_en=True): +def get_model_languages(obj, ocr_languages, add_en=True): if add_en: model_languages = {'en'} else: @@ -53,6 +54,8 @@ def get_model_languages(obj, add_en=True): model_languages.add(lang) return model_languages + model_languages = Ocrs.sanityze_ocr_languages(model_languages, ocr_languages=ocr_languages) + return model_languages # TODO thread @@ -72,6 +75,8 @@ def __init__(self): config_loader = ConfigLoader() self.r_cache = config_loader.get_redis_conn("Redis_Cache") + self.ocr_languages = Ocrs.get_ocr_languages() + # Send module state to logs self.logger.info(f'Module {self.module_name} initialized') @@ -95,7 +100,7 @@ def compute(self, message): if not ocr.exists(): path = image.get_filepath() - languages = get_model_languages(image) + languages = get_model_languages(image, self.ocr_languages) print(image.id, languages) texts = Ocrs.extract_text(path, languages) if texts: diff --git a/var/www/templates/hunter/tracker_add.html b/var/www/templates/hunter/tracker_add.html index d2b9e0fc..85270659 100644 --- a/var/www/templates/hunter/tracker_add.html +++ b/var/www/templates/hunter/tracker_add.html @@ -140,6 +140,10 @@
Filter PGP by subtype:
+
+ + +
{#
#} {# #}