Skip to content

Commit

Permalink
chg: [new title object] add new title object + correlation on page title
Browse files Browse the repository at this point in the history
  • Loading branch information
Terrtia committed May 25, 2023
1 parent f7e0a35 commit c008366
Show file tree
Hide file tree
Showing 18 changed files with 1,205 additions and 27 deletions.
9 changes: 8 additions & 1 deletion bin/crawlers/Crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from lib.objects.Domains import Domain
from lib.objects.Items import Item
from lib.objects import Screenshots
from lib.objects import Titles

logging.config.dictConfig(ail_logger.get_config(name='crawlers'))

Expand Down Expand Up @@ -252,6 +253,13 @@ def save_capture_response(self, parent_id, entries):
self.root_item = item_id
parent_id = item_id

item = Item(item_id)

title_content = crawlers.extract_title_from_html(entries['html'])
if title_content:
title = Titles.create_title(title_content)
title.add(item.get_date(), item_id)

# SCREENSHOT
if self.screenshot:
if 'png' in entries and entries['png']:
Expand All @@ -260,7 +268,6 @@ def save_capture_response(self, parent_id, entries):
if not screenshot.is_tags_safe():
unsafe_tag = 'dark-web:topic="pornography-child-exploitation"'
self.domain.add_tag(unsafe_tag)
item = Item(item_id)
item.add_tag(unsafe_tag)
# Remove Placeholder pages # TODO Replace with warning list ???
if screenshot.id not in self.placeholder_screenshots:
Expand Down
2 changes: 1 addition & 1 deletion bin/lib/ail_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
r_serv_db = config_loader.get_db_conn("Kvrocks_DB")
config_loader = None

AIL_OBJECTS = sorted({'cve', 'cryptocurrency', 'decoded', 'domain', 'item', 'pgp', 'screenshot', 'username'})
AIL_OBJECTS = sorted({'cve', 'cryptocurrency', 'decoded', 'domain', 'item', 'pgp', 'screenshot', 'title', 'username'})

def get_ail_uuid():
ail_uuid = r_serv_db.get('ail:uuid')
Expand Down
7 changes: 4 additions & 3 deletions bin/lib/correlations_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,12 @@
"cryptocurrency": ["domain", "item"],
"cve": ["domain", "item"],
"decoded": ["domain", "item"],
"domain": ["cve", "cryptocurrency", "decoded", "item", "pgp", "username", "screenshot"],
"item": ["cve", "cryptocurrency", "decoded", "domain", "pgp", "username", "screenshot"],
"domain": ["cve", "cryptocurrency", "decoded", "item", "pgp", "title", "screenshot", "username"],
"item": ["cve", "cryptocurrency", "decoded", "domain", "pgp", "screenshot", "title", "username"],
"pgp": ["domain", "item"],
"username": ["domain", "item"],
"screenshot": ["domain", "item"],
"title": ["domain", "item"],
"username": ["domain", "item"],
}

def get_obj_correl_types(obj_type):
Expand Down
49 changes: 45 additions & 4 deletions bin/lib/crawlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,47 @@ def extract_favicon_from_html(html, url):

# # # - - # # #

# # # # # # # #
# #
# TITLE #
# #
# # # # # # # #

def extract_title_from_html(html):
    """Return the text of the page's ``<title>`` tag, or ``''`` if absent.

    ``title.string`` is None when the title element contains nested markup
    or is empty; the original ``str(title.string)`` turned that into the
    literal string ``'None'``, which would then be stored as a Title object.
    Guard against it and return ``''`` instead.
    """
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.title
    if title and title.string is not None:
        return str(title.string)
    return ''

def extract_description_from_html(html):
    """Return the content of the ``<meta name="description">`` tag, or ``''``.

    Uses ``Tag.get()`` instead of item access: a malformed meta tag without
    a ``content`` attribute now yields ``''`` instead of raising KeyError.
    """
    soup = BeautifulSoup(html, 'html.parser')
    description = soup.find('meta', attrs={'name': 'description'})
    if description:
        return description.get('content', '')
    return ''

# NOTE(review): duplicate of extract_description_from_html defined just above;
# this second definition shadows the first — consider removing one of them.
def extract_description_from_html(html):
    """Return the content of the ``<meta name="description">`` tag, or ``''``.

    Uses ``Tag.get()`` instead of item access: a malformed meta tag without
    a ``content`` attribute now yields ``''`` instead of raising KeyError.
    """
    soup = BeautifulSoup(html, 'html.parser')
    description = soup.find('meta', attrs={'name': 'description'})
    if description:
        return description.get('content', '')
    return ''

def extract_keywords_from_html(html):
    """Return the content of the ``<meta name="keywords">`` tag, or ``''``.

    Uses ``Tag.get()`` so a keywords meta tag lacking a ``content``
    attribute returns ``''`` instead of raising KeyError.
    """
    soup = BeautifulSoup(html, 'html.parser')
    keywords = soup.find('meta', attrs={'name': 'keywords'})
    if keywords:
        return keywords.get('content', '')
    return ''

def extract_author_from_html(html):
    """Return the content of the ``<meta name="author">`` tag, or ``''``.

    Fixes a copy-paste local name (``keywords`` -> ``author``) and uses
    ``Tag.get()`` so an author meta tag lacking a ``content`` attribute
    returns ``''`` instead of raising KeyError.
    """
    soup = BeautifulSoup(html, 'html.parser')
    author = soup.find('meta', attrs={'name': 'author'})
    if author:
        return author.get('content', '')
    return ''
# # # - - # # #

################################################################################

Expand Down Expand Up @@ -1711,7 +1752,7 @@ def test_ail_crawlers():
load_blacklist()

# if __name__ == '__main__':
# task = CrawlerTask('2dffcae9-8f66-4cfa-8e2c-de1df738a6cd')
# print(task.get_meta())
# _clear_captures()

# item = Item('crawled/2023/03/06/foo.bec50a87b5-0c21-4ed4-9cb2-2d717a7a6507')
# content = item.get_content()
# r = extract_author_from_html(content)
# print(r)
19 changes: 13 additions & 6 deletions bin/lib/module_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import json
import os
import sys
import time

import yara

Expand All @@ -15,6 +14,7 @@
##################################
from lib.objects import ail_objects
from lib.objects.Items import Item
from lib.objects.Titles import Title
from lib import correlations_engine
from lib import regex_helper
from lib.ConfigLoader import ConfigLoader
Expand Down Expand Up @@ -58,18 +58,25 @@ def get_correl_match(extract_type, obj_id, content):
correl = correlations_engine.get_correlation_by_correl_type('item', '', obj_id, extract_type)
to_extract = []
map_subtype = {}
map_value_id = {}
for c in correl:
subtype, value = c.split(':', 1)
map_subtype[value] = subtype
to_extract.append(value)
if extract_type == 'title':
title = Title(value).get_content()
to_extract.append(title)
map_value_id[title] = value
else:
map_subtype[value] = subtype
to_extract.append(value)
map_value_id[value] = value
if to_extract:
objs = regex_helper.regex_finditer(r_key, '|'.join(to_extract), obj_id, content)
for obj in objs:
if map_subtype[obj[2]]:
if map_subtype.get(obj[2]):
subtype = map_subtype[obj[2]]
else:
subtype = ''
extracted.append([obj[0], obj[1], obj[2], f'{extract_type}:{subtype}:{obj[2]}'])
extracted.append([obj[0], obj[1], obj[2], f'{extract_type}:{subtype}:{map_value_id[obj[2]]}'])
return extracted

def _get_yara_match(data):
Expand Down Expand Up @@ -173,7 +180,7 @@ def extract(obj_id, content=None):
if matches:
extracted = extracted + matches

for obj_t in ['cve', 'cryptocurrency', 'username']: # Decoded, PGP->extract bloc
for obj_t in ['cve', 'cryptocurrency', 'title', 'username']: # Decoded, PGP->extract bloc
matches = get_correl_match(obj_t, obj_id, content)
if matches:
extracted = extracted + matches
Expand Down
114 changes: 114 additions & 0 deletions bin/lib/objects/Titles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import os
import sys

from hashlib import sha256
from flask import url_for

from pymisp import MISPObject

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.ConfigLoader import ConfigLoader
from lib.objects.abstract_daterange_object import AbstractDaterangeObject, AbstractDaterangeObjects

config_loader = ConfigLoader()
r_objects = config_loader.get_db_conn("Kvrocks_Objects")
baseurl = config_loader.get_config_str("Notifications", "ail_domain")
config_loader = None


class Title(AbstractDaterangeObject):
    """
    AIL Title Object.

    Represents the <title> text of a crawled page.  The object id is the
    SHA256 hex digest of the title content (see create_title() below).
    """

    def __init__(self, id):
        super(Title, self).__init__('title', id)

    # def get_ail_2_ail_payload(self):
    #     payload = {'raw': self.get_gzip_content(b64=True),
    #                 'compress': 'gzip'}
    #     return payload

    # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
    def delete(self):
        # # TODO: not implemented yet — intentionally a no-op
        pass

    def get_content(self, r_type='str'):
        """Return the stored title text.

        Only ``r_type='str'`` is supported; any other value returns None.
        """
        if r_type == 'str':
            return self._get_field('content')

    def get_link(self, flask_context=False):
        """Return the correlation-page URL for this title.

        Relative (url_for) inside a Flask request context, absolute
        (prefixed with the configured ail_domain) otherwise.
        """
        if flask_context:
            url = url_for('correlation.show_correlation', type=self.type, id=self.id)
        else:
            url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}'
        return url

    # TODO # CHANGE COLOR
    def get_svg_icon(self):
        """Return the icon descriptor used by the correlation graph UI."""
        return {'style': 'fas', 'icon': '\uf1dc', 'color': '#1E88E5', 'radius': 5}

    def get_misp_object(self):
        """Export this title as a MISP 'tsk-web-history' object,
        propagating this object's tags onto the title attribute."""
        obj_attrs = []
        obj = MISPObject('tsk-web-history')
        obj.first_seen = self.get_first_seen()
        obj.last_seen = self.get_last_seen()

        obj_attrs.append(obj.add_attribute('title', value=self.get_content()))
        for obj_attr in obj_attrs:
            for tag in self.get_tags():
                obj_attr.add_tag(tag)
        return obj

    def get_meta(self, options=None):
        """Return this object's metadata dict (id, tags, content, ...).

        :param options: optional set of extra fields forwarded to
            _get_meta(); defaults to an empty set.  (Was a mutable
            default argument ``set()``, shared across calls.)
        """
        if options is None:
            options = set()
        meta = self._get_meta(options=options)
        meta['id'] = self.id
        meta['tags'] = self.get_tags(r_list=True)
        meta['content'] = self.get_content()
        return meta

    def add(self, date, item_id):
        """Record a sighting of this title on *date* in item *item_id*."""
        self._add(date, item_id)

    def create(self, content, _first_seen=None, _last_seen=None):
        """Persist the title *content* and create the object.

        ``_first_seen``/``_last_seen`` are accepted for interface parity
        but currently unused.  # NOTE(review): confirm intended
        """
        self._set_field('content', content)
        self._create()


def create_title(content):
    """Get-or-create the Title object for *content*.

    The object id is the SHA256 hex digest of the UTF-8 encoded title
    text; an already-existing title is returned as-is.
    """
    digest = sha256(content.encode()).hexdigest()
    title_obj = Title(digest)
    if title_obj.exists():
        return title_obj
    title_obj.create(content)
    return title_obj

class Titles(AbstractDaterangeObjects):
    """
    Titles Objects: collection-level helper over all 'title' objects.
    """
    def __init__(self):
        super().__init__('title')

    def get_metas(self, obj_ids, options=None):
        """Return metadata dicts for the given title ids.

        :param options: optional set of extra fields forwarded to
            _get_metas(); defaults to an empty set.  (Was a mutable
            default argument ``set()``, shared across calls.)
        """
        if options is None:
            options = set()
        return self._get_metas(Title, obj_ids, options=options)

    def sanitize_name_to_search(self, name_to_search):
        """Titles are searched verbatim — no sanitization applied."""
        return name_to_search


# if __name__ == '__main__':
# from lib import crawlers
# from lib.objects import Items
# for item in Items.get_all_items_objects(filters={'sources': ['crawled']}):
# title_content = crawlers.extract_title_from_html(item.get_content())
# if title_content:
# print(item.id, title_content)
# title = create_title(title_content)
# title.add(item.get_date(), item.id)

0 comments on commit c008366

Please sign in to comment.