Skip to content

Commit

Permalink
chg: [Phone module] Filter Invalid Phone numbers + UI Show extracted
Browse files Browse the repository at this point in the history
  • Loading branch information
Terrtia committed May 24, 2023
1 parent 7a52aec commit 353b290
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 36 deletions.
4 changes: 2 additions & 2 deletions bin/LAUNCH.sh
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,8 @@ function launching_scripts {
sleep 0.1
screen -S "Script_AIL" -X screen -t "PgpDump" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./PgpDump.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "Phone" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Phone.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "Telegram" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Telegram.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "Tools" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Tools.py; read x"
Expand Down Expand Up @@ -290,8 +292,6 @@ function launching_scripts {
##################################
# DISABLED MODULES #
##################################
# screen -S "Script_AIL" -X screen -t "Phone" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Phone.py; read x"
# sleep 0.1
# screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./SentimentAnalysis.py; read x"
# sleep 0.1
# screen -S "Script_AIL" -X screen -t "Release" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Release.py; read x"
Expand Down
2 changes: 2 additions & 0 deletions bin/lib/module_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from modules.Iban import Iban
from modules.Mail import Mail
from modules.Onion import Onion
from modules.Phone import Phone
from modules.Tools import Tools

config_loader = ConfigLoader()
Expand All @@ -40,6 +41,7 @@
'infoleak:automatic-detection="iban"': Iban(queue=False),
'infoleak:automatic-detection="mail"': Mail(queue=False),
'infoleak:automatic-detection="onion"': Onion(queue=False),
'infoleak:automatic-detection="phone-number"': Phone(queue=False),
# APIkey ???
# Credentials
# Zerobins
Expand Down
43 changes: 38 additions & 5 deletions bin/lib/regex_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,13 @@

import os
import logging.config
import phonenumbers
import re
import sys
import uuid

from multiprocessing import Process as Proc

sys.path.append(os.environ['AIL_BIN'])

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
Expand Down Expand Up @@ -65,7 +64,6 @@ def regex_findall(module_name, redis_key, regex, item_id, item_content, max_time
proc.terminate()
# Statistics.incr_module_timeout_statistic(module_name)
err_mess = f"{module_name}: processing timeout: {item_id}"
print(err_mess)
logger.info(err_mess)
return []
else:
Expand Down Expand Up @@ -99,7 +97,6 @@ def regex_finditer(r_key, regex, item_id, content, max_time=30):
proc.terminate()
# Statistics.incr_module_timeout_statistic(r_key)
err_mess = f"{r_key}: processing timeout: {item_id}"
print(err_mess)
logger.info(err_mess)
return []
else:
Expand Down Expand Up @@ -130,7 +127,6 @@ def regex_search(r_key, regex, item_id, content, max_time=30):
proc.terminate()
# Statistics.incr_module_timeout_statistic(r_key)
err_mess = f"{r_key}: processing timeout: {item_id}"
print(err_mess)
logger.info(err_mess)
return False
else:
Expand All @@ -144,3 +140,40 @@ def regex_search(r_key, regex, item_id, content, max_time=30):
print("Caught KeyboardInterrupt, terminating regex worker")
proc.terminate()
sys.exit(0)

## Phone Regexes ##
def _regex_phone_iter(r_key, country_code, content):
    """
    Worker: push every phone number match found in content to a Redis list.

    Each match is stored as the string 'start:end:raw_string'.
    :param r_key: Redis cache key of the result list
    :param country_code: region code handed to phonenumbers.PhoneNumberMatcher
    :param content: text to scan
    """
    for found in phonenumbers.PhoneNumberMatcher(content, country_code):
        # The raw matched text is kept as the value; a normalised form
        # (phonenumbers.format_number with E164 / INTERNATIONAL) is not used.
        r_serv_cache.rpush(r_key, f'{found.start}:{found.end}:{found.raw_string}')
    # NOTE(review): indentation was lost in the reviewed source; the expiry is
    # assumed to be set once, after all matches were pushed - confirm.
    r_serv_cache.expire(r_key, 360)

def regex_phone_iter(r_key, country_code, item_id, content, max_time=30):
    """
    Extract phone numbers from content in a separate process (force timeout).

    :param r_key: Redis cache key used by the worker to hand back its matches
    :param country_code: region code forwarded to the worker
    :param item_id: object id, only used in the timeout log message
    :param content: text to scan
    :param max_time: maximum number of seconds to wait for the worker
    :return: list of (start, end, value) tuples, empty list on timeout
    """
    proc = Proc(target=_regex_phone_iter, args=(r_key, country_code, content))
    try:
        proc.start()
        proc.join(max_time)
        if proc.is_alive():
            proc.terminate()
            # Give the worker a moment to exit so it cannot push after the cleanup below
            proc.join(1)
            # The killed worker may already have pushed partial matches. Drop them,
            # otherwise the next call reusing this key would return matches that
            # belong to this (timed out) object.
            r_serv_cache.delete(r_key)
            # Statistics.incr_module_timeout_statistic(r_key)
            err_mess = f"{r_key}: processing timeout: {item_id}"
            logger.info(err_mess)
            return []
        else:
            res = r_serv_cache.lrange(r_key, 0, -1)
            r_serv_cache.delete(r_key)
            proc.terminate()
            all_match = []
            for match in res:
                # maxsplit=2: the raw phone string itself may contain ':'
                start, end, value = match.split(':', 2)
                all_match.append((int(start), int(end), value))
            return all_match
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating regex worker")
        proc.terminate()
        sys.exit(0)
70 changes: 45 additions & 25 deletions bin/modules/Phone.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
# Import External packages
##################################
import os
import re
import sys
import phonenumbers

Expand All @@ -34,44 +33,65 @@ class Phone(AbstractModule):

# regex to find phone numbers, may raise many false positives (shalt thou seek optimization, upgrading is required)
# reg_phone = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\d{2,3}){3,4})')
REG_PHONE = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\(?\d{2,4}\)?){3,4})')
# REG_PHONE = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\(?\d{2,4}\)?){3,4})')

def __init__(self, queue=True):
    """
    Set up the Phone module.

    :param queue: forwarded unchanged to the parent class constructor
    """
    super().__init__(queue=queue)

    # Waiting time in seconds between two processed messages
    self.pending_seconds = 1

def extract(self, obj_id, content, tag):
    """
    List the phone numbers found in content, each labelled with the given tag.

    :param obj_id: object id, forwarded to self.regex_phone_iter
    :param content: text to scan
    :param tag: tag name, emitted as 'tag:<tag>' in every entry
    :return: list of [start, end, value, 'tag:<tag>'] lists
    """
    # 'US' is the region code handed to the phone matcher
    return [
        [found[0], found[1], found[2], f'tag:{tag}']
        for found in self.regex_phone_iter('US', obj_id, content)
    ]

def compute(self, message):
    """
    Tag an item when phone numbers are found in its content.

    :param message: value used to build the Item to analyse
    """
    item = Item(message)
    content = item.get_content()

    # TODO use language detection to choose the country code ?
    results = self.regex_phone_iter('US', item.id, content)
    if not results:
        return

    # Each result is a (start, end, value) tuple; log the extracted values at
    # debug level instead of printing them to stdout.
    for phone in results:
        self.logger.debug(phone[2])

    # TAGS
    msg = f'infoleak:automatic-detection="phone-number";{item.get_id()}'
    self.add_message_to_queue(msg, 'Tags')

    # Report how many numbers were found: count the matches, not the fields
    # of the last match tuple.
    self.redis_logger.warning(f'{item.get_id()} contains {len(results)} Phone numbers')

# # List of the regex results in the Item, may be null
# results = self.REG_PHONE.findall(content)
#
# # If the list is greater than 4, we consider the Item may contain a list of phone numbers
# if len(results) > 4:
# self.logger.debug(results)
# self.redis_logger.warning(f'{item.get_id()} contains PID (phone numbers)')
#
# msg = f'infoleak:automatic-detection="phone-number";{item.get_id()}'
# self.add_message_to_queue(msg, 'Tags')
#
# stats = {}
# for phone_number in results:
# try:
# x = phonenumbers.parse(phone_number, None)
# country_code = x.country_code
# if stats.get(country_code) is None:
# stats[country_code] = 1
# else:
# stats[country_code] = stats[country_code] + 1
# except:
# pass
# for country_code in stats:
# if stats[country_code] > 4:
# self.redis_logger.warning(f'{item.get_id()} contains Phone numbers with country code {country_code}')


if __name__ == '__main__':
    # LAUNCH.sh starts this script as a long-running module, so enter the
    # module's processing loop instead of computing one hard-coded item.
    module = Phone()
    module.run()
11 changes: 11 additions & 0 deletions bin/modules/abstract_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,17 @@ def regex_findall(self, regex, obj_id, content, r_set=False):
return regex_helper.regex_findall(self.module_name, self.r_cache_key, regex, obj_id, content,
max_time=self.max_execution_time, r_set=r_set)

def regex_phone_iter(self, country_code, obj_id, content):
    """
    phone number extraction helper (force timeout)
    :param country_code: region code forwarded to regex_helper.regex_phone_iter
    :param obj_id: object id
    :param content: object content
    :return: whatever regex_helper.regex_phone_iter returns for this module's cache key
    """
    matches = regex_helper.regex_phone_iter(self.r_cache_key, country_code, obj_id, content,
                                            max_time=self.max_execution_time)
    return matches

def run(self):
"""
Run Module endless process
Expand Down
7 changes: 3 additions & 4 deletions configs/modules.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,9 @@ publish = Duplicate,Tags
subscribe = Cve
publish = Tags

# Disabled
#[Phone]
#subscribe = Item
#publish = Tags
[Phone]
subscribe = Item
publish = Tags

[Keys]
subscribe = Item
Expand Down

0 comments on commit 353b290

Please sign in to comment.