Skip to content

Commit

Permalink
chg: [new title object] add new title object + correlation on page title
Browse files Browse the repository at this point in the history
  • Loading branch information
Terrtia committed May 25, 2023
1 parent f7e0a35 commit c008366
Show file tree
Hide file tree
Showing 18 changed files with 1,205 additions and 27 deletions.
9 changes: 8 additions & 1 deletion bin/crawlers/Crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from lib.objects.Domains import Domain
from lib.objects.Items import Item
from lib.objects import Screenshots
from lib.objects import Titles

logging.config.dictConfig(ail_logger.get_config(name='crawlers'))

Expand Down Expand Up @@ -252,6 +253,13 @@ def save_capture_response(self, parent_id, entries):
self.root_item = item_id
parent_id = item_id

item = Item(item_id)

title_content = crawlers.extract_title_from_html(entries['html'])
if title_content:
title = Titles.create_title(title_content)
title.add(item.get_date(), item_id)

# SCREENSHOT
if self.screenshot:
if 'png' in entries and entries['png']:
Expand All @@ -260,7 +268,6 @@ def save_capture_response(self, parent_id, entries):
if not screenshot.is_tags_safe():
unsafe_tag = 'dark-web:topic="pornography-child-exploitation"'
self.domain.add_tag(unsafe_tag)
item = Item(item_id)
item.add_tag(unsafe_tag)
# Remove Placeholder pages # TODO Replace with warning list ???
if screenshot.id not in self.placeholder_screenshots:
Expand Down
2 changes: 1 addition & 1 deletion bin/lib/ail_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
r_serv_db = config_loader.get_db_conn("Kvrocks_DB")
config_loader = None

AIL_OBJECTS = sorted({'cve', 'cryptocurrency', 'decoded', 'domain', 'item', 'pgp', 'screenshot', 'username'})
AIL_OBJECTS = sorted({'cve', 'cryptocurrency', 'decoded', 'domain', 'item', 'pgp', 'screenshot', 'title', 'username'})

def get_ail_uuid():
ail_uuid = r_serv_db.get('ail:uuid')
Expand Down
7 changes: 4 additions & 3 deletions bin/lib/correlations_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,12 @@
"cryptocurrency": ["domain", "item"],
"cve": ["domain", "item"],
"decoded": ["domain", "item"],
"domain": ["cve", "cryptocurrency", "decoded", "item", "pgp", "username", "screenshot"],
"item": ["cve", "cryptocurrency", "decoded", "domain", "pgp", "username", "screenshot"],
"domain": ["cve", "cryptocurrency", "decoded", "item", "pgp", "title", "screenshot", "username"],
"item": ["cve", "cryptocurrency", "decoded", "domain", "pgp", "screenshot", "title", "username"],
"pgp": ["domain", "item"],
"username": ["domain", "item"],
"screenshot": ["domain", "item"],
"title": ["domain", "item"],
"username": ["domain", "item"],
}

def get_obj_correl_types(obj_type):
Expand Down
49 changes: 45 additions & 4 deletions bin/lib/crawlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,47 @@ def extract_favicon_from_html(html, url):

# # # - - # # #

# # # # # # # #
# #
# TITLE #
# #
# # # # # # # #

def extract_title_from_html(html):
    """Return the text of the page's ``<title>`` tag, or ``''`` if absent.

    ``title.string`` is None when the title element contains nested markup
    or is empty; the original ``str(title.string)`` turned that into the
    literal string ``'None'``, which would then be stored as a Title object.
    Guard against it and return ``''`` instead.
    """
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.title
    if title and title.string is not None:
        return str(title.string)
    return ''

def extract_description_from_html(html):
    """Return the content of the ``<meta name="description">`` tag, or ``''``.

    Uses ``Tag.get()`` instead of item access: a malformed meta tag without
    a ``content`` attribute now yields ``''`` instead of raising KeyError.
    """
    soup = BeautifulSoup(html, 'html.parser')
    description = soup.find('meta', attrs={'name': 'description'})
    if description:
        return description.get('content', '')
    return ''

# NOTE(review): duplicate of extract_description_from_html defined just above;
# this second definition shadows the first — consider removing one of them.
def extract_description_from_html(html):
    """Return the content of the ``<meta name="description">`` tag, or ``''``.

    Uses ``Tag.get()`` instead of item access: a malformed meta tag without
    a ``content`` attribute now yields ``''`` instead of raising KeyError.
    """
    soup = BeautifulSoup(html, 'html.parser')
    description = soup.find('meta', attrs={'name': 'description'})
    if description:
        return description.get('content', '')
    return ''

def extract_keywords_from_html(html):
    """Return the content of the ``<meta name="keywords">`` tag, or ``''``.

    Uses ``Tag.get()`` so a keywords meta tag lacking a ``content``
    attribute returns ``''`` instead of raising KeyError.
    """
    soup = BeautifulSoup(html, 'html.parser')
    keywords = soup.find('meta', attrs={'name': 'keywords'})
    if keywords:
        return keywords.get('content', '')
    return ''

def extract_author_from_html(html):
    """Return the content of the ``<meta name="author">`` tag, or ``''``.

    Fixes a copy-paste local name (``keywords`` -> ``author``) and uses
    ``Tag.get()`` so an author meta tag lacking a ``content`` attribute
    returns ``''`` instead of raising KeyError.
    """
    soup = BeautifulSoup(html, 'html.parser')
    author = soup.find('meta', attrs={'name': 'author'})
    if author:
        return author.get('content', '')
    return ''
# # # - - # # #

################################################################################

Expand Down Expand Up @@ -1711,7 +1752,7 @@ def test_ail_crawlers():
load_blacklist()

# if __name__ == '__main__':
# task = CrawlerTask('2dffcae9-8f66-4cfa-8e2c-de1df738a6cd')
# print(task.get_meta())
# _clear_captures()

# item = Item('crawled/2023/03/06/foo.bec50a87b5-0c21-4ed4-9cb2-2d717a7a6507')
# content = item.get_content()
# r = extract_author_from_html(content)
# print(r)
19 changes: 13 additions & 6 deletions bin/lib/module_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import json
import os
import sys
import time

import yara

Expand All @@ -15,6 +14,7 @@
##################################
from lib.objects import ail_objects
from lib.objects.Items import Item
from lib.objects.Titles import Title
from lib import correlations_engine
from lib import regex_helper
from lib.ConfigLoader import ConfigLoader
Expand Down Expand Up @@ -58,18 +58,25 @@ def get_correl_match(extract_type, obj_id, content):
correl = correlations_engine.get_correlation_by_correl_type('item', '', obj_id, extract_type)
to_extract = []
map_subtype = {}
map_value_id = {}
for c in correl:
subtype, value = c.split(':', 1)
map_subtype[value] = subtype
to_extract.append(value)
if extract_type == 'title':
title = Title(value).get_content()
to_extract.append(title)
map_value_id[title] = value
else:
map_subtype[value] = subtype
to_extract.append(value)
map_value_id[value] = value
if to_extract:
objs = regex_helper.regex_finditer(r_key, '|'.join(to_extract), obj_id, content)
for obj in objs:
if map_subtype[obj[2]]:
if map_subtype.get(obj[2]):
subtype = map_subtype[obj[2]]
else:
subtype = ''
extracted.append([obj[0], obj[1], obj[2], f'{extract_type}:{subtype}:{obj[2]}'])
extracted.append([obj[0], obj[1], obj[2], f'{extract_type}:{subtype}:{map_value_id[obj[2]]}'])
return extracted

def _get_yara_match(data):
Expand Down Expand Up @@ -173,7 +180,7 @@ def extract(obj_id, content=None):
if matches:
extracted = extracted + matches

for obj_t in ['cve', 'cryptocurrency', 'username']: # Decoded, PGP->extract bloc
for obj_t in ['cve', 'cryptocurrency', 'title', 'username']: # Decoded, PGP->extract bloc
matches = get_correl_match(obj_t, obj_id, content)
if matches:
extracted = extracted + matches
Expand Down
114 changes: 114 additions & 0 deletions bin/lib/objects/Titles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import os
import sys

from hashlib import sha256
from flask import url_for

from pymisp import MISPObject

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.ConfigLoader import ConfigLoader
from lib.objects.abstract_daterange_object import AbstractDaterangeObject, AbstractDaterangeObjects

config_loader = ConfigLoader()
r_objects = config_loader.get_db_conn("Kvrocks_Objects")
baseurl = config_loader.get_config_str("Notifications", "ail_domain")
config_loader = None


class Title(AbstractDaterangeObject):
    """
    AIL Title Object.

    Represents the <title> text of a crawled page.  The object id is the
    SHA256 hex digest of the title content (see create_title() below).
    """

    def __init__(self, id):
        super(Title, self).__init__('title', id)

    # def get_ail_2_ail_payload(self):
    #     payload = {'raw': self.get_gzip_content(b64=True),
    #                 'compress': 'gzip'}
    #     return payload

    # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
    def delete(self):
        # # TODO: not implemented yet — intentionally a no-op
        pass

    def get_content(self, r_type='str'):
        """Return the stored title text.

        Only ``r_type='str'`` is supported; any other value returns None.
        """
        if r_type == 'str':
            return self._get_field('content')

    def get_link(self, flask_context=False):
        """Return the correlation-page URL for this title.

        Relative (url_for) inside a Flask request context, absolute
        (prefixed with the configured ail_domain) otherwise.
        """
        if flask_context:
            url = url_for('correlation.show_correlation', type=self.type, id=self.id)
        else:
            url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}'
        return url

    # TODO # CHANGE COLOR
    def get_svg_icon(self):
        """Return the icon descriptor used by the correlation graph UI."""
        return {'style': 'fas', 'icon': '\uf1dc', 'color': '#1E88E5', 'radius': 5}

    def get_misp_object(self):
        """Export this title as a MISP 'tsk-web-history' object,
        propagating this object's tags onto the title attribute."""
        obj_attrs = []
        obj = MISPObject('tsk-web-history')
        obj.first_seen = self.get_first_seen()
        obj.last_seen = self.get_last_seen()

        obj_attrs.append(obj.add_attribute('title', value=self.get_content()))
        for obj_attr in obj_attrs:
            for tag in self.get_tags():
                obj_attr.add_tag(tag)
        return obj

    def get_meta(self, options=None):
        """Return this object's metadata dict (id, tags, content, ...).

        :param options: optional set of extra fields forwarded to
            _get_meta(); defaults to an empty set.  (Was a mutable
            default argument ``set()``, shared across calls.)
        """
        if options is None:
            options = set()
        meta = self._get_meta(options=options)
        meta['id'] = self.id
        meta['tags'] = self.get_tags(r_list=True)
        meta['content'] = self.get_content()
        return meta

    def add(self, date, item_id):
        """Record a sighting of this title on *date* in item *item_id*."""
        self._add(date, item_id)

    def create(self, content, _first_seen=None, _last_seen=None):
        """Persist the title *content* and create the object.

        ``_first_seen``/``_last_seen`` are accepted for interface parity
        but currently unused.  # NOTE(review): confirm intended
        """
        self._set_field('content', content)
        self._create()


def create_title(content):
    """Get-or-create the Title object for *content*.

    The object id is the SHA256 hex digest of the UTF-8 encoded title
    text; an already-existing title is returned as-is.
    """
    digest = sha256(content.encode()).hexdigest()
    title_obj = Title(digest)
    if title_obj.exists():
        return title_obj
    title_obj.create(content)
    return title_obj

class Titles(AbstractDaterangeObjects):
    """
    Titles Objects: collection-level helper over all 'title' objects.
    """
    def __init__(self):
        super().__init__('title')

    def get_metas(self, obj_ids, options=None):
        """Return metadata dicts for the given title ids.

        :param options: optional set of extra fields forwarded to
            _get_metas(); defaults to an empty set.  (Was a mutable
            default argument ``set()``, shared across calls.)
        """
        if options is None:
            options = set()
        return self._get_metas(Title, obj_ids, options=options)

    def sanitize_name_to_search(self, name_to_search):
        """Titles are searched verbatim — no sanitization applied."""
        return name_to_search


# if __name__ == '__main__':
# from lib import crawlers
# from lib.objects import Items
# for item in Items.get_all_items_objects(filters={'sources': ['crawled']}):
# title_content = crawlers.extract_title_from_html(item.get_content())
# if title_content:
# print(item.id, title_content)
# title = create_title(title_content)
# title.add(item.get_date(), item.id)

0 comments on commit c008366

Please sign in to comment.