Skip to content

Commit

Permalink
Merge pull request #449 from CIRCL/tags_v2
Browse files Browse the repository at this point in the history
Tags v2 - Tagging system refactoring
  • Loading branch information
adulau committed Jan 13, 2020
2 parents a4dd224 + 5f8b81f commit 59b2745
Show file tree
Hide file tree
Showing 32 changed files with 1,247 additions and 345 deletions.
11 changes: 8 additions & 3 deletions OVERVIEW.md
Expand Up @@ -198,22 +198,27 @@ Redis and ARDB overview
##### Hset:
| Key | Field | Value |
| ------ | ------ | ------ |
| per_paste_**epoch** | **term** | **nb_seen** |
| | |
| tag_metadata:**tag** | first_seen | **date** |
| tag_metadata:**tag** | last_seen | **date** |

##### Set:
| Key | Value |
| ------ | ------ |
| list_tags | **tag** |
| list_tags:**object_type** | **tag** |
| list_tags:domain | **tag** |
||
| active_taxonomies | **taxonomie** |
| active_galaxies | **galaxie** |
| active_tag_**taxonomie or galaxy** | **tag** |
| synonym_tag_misp-galaxy:**galaxy** | **tag synonym** |
| list_export_tags | **user_tag** |
||
| **tag**:**date** | **paste** |

| **object_type**:**tag** | **object_id** |
||
| DB7 |
| tag:**object_id** | **tag** |

##### old:
| Key | Value |
Expand Down
57 changes: 4 additions & 53 deletions bin/Tags.py
Expand Up @@ -8,29 +8,11 @@
This module creates tags.
"""
import redis

import time
import datetime

from pubsublogger import publisher
from Helper import Process
from packages import Paste
from packages import Item


def get_item_date(item_filename):
    '''
    Build the item date (YYYYMMDD) from the item path.

    The three directory components that precede the file name are
    concatenated, e.g. '.../2020/01/13/item.gz' gives '20200113'.

    :param item_filename: item path, using '/' as separator
    :return: the concatenated year, month and day path components
    :raises IndexError: if the path has fewer than four components
    '''
    l_directory = item_filename.split('/')
    # [-1] is the file name; [-4], [-3], [-2] are year, month, day
    return '{}{}{}'.format(l_directory[-4], l_directory[-3], l_directory[-2])

def set_tag_metadata(tag, date):
    '''
    Update the 'first_seen' and 'last_seen' fields of a tag.

    Fields are stored in the hash 'tag_metadata:<tag>' through the
    module-level ``server`` connection.

    :param tag: tag name
    :param date: item date; compared with the stored 'last_seen' value
                 using ``>`` (presumably a YYYYMMDD string - verify against caller)
    '''
    key = 'tag_metadata:{}'.format(tag)
    # First time we see this tag ## TODO: filter paste from the paste ?
    if not server.hexists(key, 'first_seen'):
        server.hset(key, 'first_seen', date)
    # Check and Set tag last_seen
    last_seen = server.hget(key, 'last_seen')
    if last_seen is None or date > last_seen:
        server.hset(key, 'last_seen', date)
from packages import Tag

if __name__ == '__main__':

Expand All @@ -45,18 +27,6 @@ def set_tag_metadata(tag, date):
# Setup the I/O queues
p = Process(config_section)

server = redis.StrictRedis(
host=p.config.get("ARDB_Tags", "host"),
port=p.config.get("ARDB_Tags", "port"),
db=p.config.get("ARDB_Tags", "db"),
decode_responses=True)

server_metadata = redis.StrictRedis(
host=p.config.get("ARDB_Metadata", "host"),
port=p.config.get("ARDB_Metadata", "port"),
db=p.config.get("ARDB_Metadata", "db"),
decode_responses=True)

# Sent to the logging a description of the module
publisher.info("Tags module started")

Expand All @@ -71,27 +41,8 @@ def set_tag_metadata(tag, date):
continue

else:
tag, path = message.split(';')
# add the tag to the tags word_list
res = server.sadd('list_tags', tag)
if res == 1:
print("new tags added : {}".format(tag))
# add the path to the tag set
#curr_date = datetime.date.today().strftime("%Y%m%d")
item_date = get_item_date(path)
res = server.sadd('{}:{}'.format(tag, item_date), path)
if res == 1:
print("new paste: {}".format(path))
print(" tagged: {}".format(tag))
set_tag_metadata(tag, item_date)
server_metadata.sadd('tag:{}'.format(path), tag)

# Domain Object
if Item.is_crawled(path) and tag!='infoleak:submission="crawler"':
domain = Item.get_item_domain(path)
server_metadata.sadd('tag:{}'.format(domain), tag)
server.sadd('domain:{}:{}'.format(tag, item_date), domain)
print(message)
tag, item_id = message.split(';')

curr_date = datetime.date.today().strftime("%Y%m%d")
server.hincrby('daily_tags:{}'.format(item_date), tag, 1)
Tag.add_tag("item", tag, item_id)
p.populate_set_out(message, 'MISP_The_Hive_feeder')
26 changes: 23 additions & 3 deletions bin/lib/Correlate_object.py
Expand Up @@ -23,6 +23,15 @@
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
config_loader = None

def is_valid_object_type(object_type):
    '''
    Check if an object type is accepted.

    :param object_type: object type name to check
    :return: True if object_type is 'domain', 'item' or 'image', False otherwise
    '''
    return object_type in ('domain', 'item', 'image')

def get_all_objects():
    '''
    Return a list of all object type names.

    A new list is built on every call, so callers may modify the result.
    '''
    return ['domain', 'paste', 'pgp', 'cryptocurrency', 'decoded', 'screenshot']

def get_all_correlation_names():
'''
Return a list of all available correlations
Expand Down Expand Up @@ -178,11 +187,21 @@ def get_item_url(correlation_name, value, correlation_type=None):
elif correlation_name == 'domain':
endpoint = 'crawler_splash.showDomain'
url = url_for(endpoint, domain=value)
elif correlation_name == 'paste':
elif correlation_name == 'item':
endpoint = 'showsavedpastes.showsavedpaste'
url = url_for(endpoint, paste=value)
elif correlation_name == 'paste': ### # TODO: remove me
endpoint = 'showsavedpastes.showsavedpaste'
url = url_for(endpoint, paste=value)
return url

def get_obj_tag_table_keys(object_type):
    '''
    Warning: use only in flask (dynamic templates)

    :param object_type: object type name
    :return: list of table keys for 'domain', None for any other object type
    '''
    if object_type == "domain":
        return ['id', 'first_seen', 'last_check', 'status'] # # TODO: add root screenshot
    # no table keys defined for the other object types
    return None


def create_graph_links(links_set):
graph_links_list = []
Expand Down Expand Up @@ -310,6 +329,7 @@ def get_graph_node_object_correlation(object_type, root_value, mode, correlation


######## API EXPOSED ########


def sanitize_object_type(object_type):
    '''
    Validate an object type.

    :param object_type: object type name to validate
    :return: a (dict, 400) error tuple if is_valid_object_type() rejects
             the object type, None otherwise
    '''
    if not is_valid_object_type(object_type):
        return ({'status': 'error', 'reason': 'Incorrect object_type'}, 400)
    # valid object type: nothing to report
    return None
######## ########
4 changes: 2 additions & 2 deletions bin/lib/Domain.py
Expand Up @@ -292,7 +292,7 @@ def get_domain_items_crawled(domain, domain_type, port, epoch=None, items_link=F
if item_screenshot:
dict_item['screenshot'] = Item.get_item_screenshot(item)
if item_tag:
dict_item['tags'] = Tag.get_item_tags_minimal(item)
dict_item['tags'] = Tag.get_obj_tags_minimal(item)
item_crawled['items'].append(dict_item)
return item_crawled

Expand Down Expand Up @@ -365,7 +365,7 @@ def get_domain_tags(domain):
:param domain: crawled domain
'''
return Tag.get_item_tags(domain)
return Tag.get_obj_tag(domain)

def get_domain_metadata(domain, domain_type, first_seen=True, last_ckeck=True, status=True, ports=True, tags=False):
'''
Expand Down
5 changes: 4 additions & 1 deletion bin/lib/Screenshot.py
Expand Up @@ -43,13 +43,16 @@ def get_screenshot_items_list(sha256_string):
else:
return []

def get_item_screenshot(item_id):
    '''
    Return the 'screenshot' field stored in the 'paste_metadata:<item_id>' hash.

    :param item_id: item id
    :return: the value returned by hget for this field
    '''
    return r_serv_metadata.hget('paste_metadata:{}'.format(item_id), 'screenshot')

def get_item_screenshot_list(item_id):
'''
Return the screenshot of a given item id, as a list (empty if there is none).
:param item_id: item id
'''
screenshot = r_serv_metadata.hget('paste_metadata:{}'.format(item_id), 'screenshot')
screenshot = get_item_screenshot(item_id)
if screenshot:
return [screenshot]
else:
Expand Down
3 changes: 3 additions & 0 deletions bin/packages/Date.py
Expand Up @@ -79,6 +79,9 @@ def substract_day(self, numDay):
comp_day = str(computed_date.day).zfill(2)
return comp_year + comp_month + comp_day

def get_today_date_str():
    '''
    Return today's date (local time) as a 'YYYYMMDD' string.
    '''
    return datetime.date.today().strftime("%Y%m%d")

def date_add_day(date, num_day=1):
new_date = datetime.date(int(date[0:4]), int(date[4:6]), int(date[6:8])) + datetime.timedelta(num_day)
new_date = str(new_date).replace('-', '')
Expand Down
4 changes: 2 additions & 2 deletions bin/packages/Item.py
Expand Up @@ -104,7 +104,7 @@ def get_item(request_dict):
dict_item['date'] = get_item_date(item_id, add_separator=add_separator)
tags = request_dict.get('tags', True)
if tags:
dict_item['tags'] = Tag.get_item_tags(item_id)
dict_item['tags'] = Tag.get_obj_tag(item_id)

size = request_dict.get('size', False)
if size:
Expand Down Expand Up @@ -242,7 +242,7 @@ def get_item_pgp_correlation(item_id):
def get_item_list_desc(list_item_id):
desc_list = []
for item_id in list_item_id:
desc_list.append( {'id': item_id, 'date': get_item_date(item_id), 'tags': Tag.get_item_tags(item_id)} )
desc_list.append( {'id': item_id, 'date': get_item_date(item_id), 'tags': Tag.get_obj_tag(item_id)} )
return desc_list

# # TODO: add an option to check the tag
Expand Down

0 comments on commit 59b2745

Please sign in to comment.