# Bluecoat_Checker.py
#
# This script will take an input file called domains.txt
# which it will check against a csv called 'TI_Request_DB.csv'
# and, assuming the URL or IP from the input file doesn't
# exist in the csv, it will submit it to Bluecoat and
# return the site review categories for that IP or URL
#
#
# IndexError: list index out of range
# This error normally means there is an extra newline
# at the end of the TI_Request_DB.csv
# This seems to occur when the .csv is edited in a text
# editor. Leave a single newline at the end of the file and
# the error should go away.
#
import csv
import re
import requests
from bs4 import BeautifulSoup
from random import randint
from time import sleep
# Storage lists (module-level; shared by SiteReview and the script body below)
TI_Raw_Data_List = []  # normalized indicators read from the input file
Processed_TI_List = []  # human-readable result lines for the final report
Bluecoat_Tmp_List = []  # candidate indicators not found in the request DB
Seen_Before_List = []  # indicators already present in TI_Request_DB.csv
Output_List = []  # re-defanged report lines ready to print
TI_Input_File = './domains.txt'  # path to the newline-separated input list
# Function to check Bluecoat Sitereview
def SiteReview(URL):
    """Submit URL to Bluecoat Site Review and record its categories.

    Appends a human-readable result line to the module-level
    Processed_TI_List. Sleeps 5-10 seconds first to throttle requests,
    and blocks for manual input if the service answers with a captcha.
    """
    sleep(randint(5, 10))  # randomized delay to avoid hammering the service
    category_check = requests.post("http://sitereview.bluecoat.com/rest/categorization", data={'url': URL})
    if category_check.status_code != 200:
        # BUG FIX: the original formatted the module-global `line` here (and
        # below), which only worked because the caller's loop variable happened
        # to hold the same value; use the URL parameter instead.
        Processed_TI_List.append("{} could not be checked. Status {} was returned.".format(URL, category_check.status_code))
        return
    soup = BeautifulSoup(category_check.text, "lxml")
    if soup.findAll(text=re.compile('captcha')):
        # Captcha page returned: ask the operator to resolve it in a browser,
        # then re-submit. NOTE(review): the second response's status code is
        # not checked, matching the original behaviour.
        print("Captcha detected for {}. Re-enter URL once captcha resolved".format(URL))
        URL2 = raw_input('Re-Enter URL: ')  # Python 2 builtin; use input() on Python 3
        category_check = requests.post("http://sitereview.bluecoat.com/rest/categorization", data={'url': URL2})
        soup = BeautifulSoup(category_check.text, "lxml")
    Processed_TI_List.append("{} is category {}".format(URL, _category_list(soup)))

def _category_list(soup):
    """Join the text of every <a> tag (the category links) with ', '."""
    cat_list = ''
    for category in soup.findAll('a'):
        cat_list += category.get_text()
        cat_list += ', '
    return cat_list
# Input TI text file location
# TI_Input_File = raw_input("Enter the location of the Threat Intelligence input text file: eg ./threatintel.txt\n")
# Perform initial sift of intel data: lowercase each entry and strip
# defanging markers / URL schemes so entries compare consistently
# against the request DB.
with open(TI_Input_File, 'r') as file:  # BUG FIX: 'rw' is not a valid mode string; the file is only read
    for line in file:
        if not line.isspace():
            line = line.lower()
            replacements = {'[.]': '.', 'hxxp://': '', 'hxxps://': '', '(.)': '.', 'http://': '', 'https://': ''}
            # items() instead of Py2-only iteritems() — works on Python 2 and 3
            for src, target in replacements.items():
                line = line.replace(src, target)
            TI_Raw_Data_List.append(line.rstrip())
# Compare the sifted indicators against the local request DB, then submit
# anything new to Bluecoat.
with open('TI_Request_DB.csv', 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    # Build the set of previously requested indicators once (O(1) lookups
    # instead of the original O(rows x lines) nested loop).
    # BUG FIX: skip empty rows — a trailing newline in the CSV used to raise
    # "IndexError: list index out of range" on row[0] (see header comment).
    known = set()
    for row in reader:
        if row:
            known.add(row[0])
for line in TI_Raw_Data_List:
    if line in known:
        Processed_TI_List.append("{} has been requested before".format(line))
        if line not in Seen_Before_List:
            Seen_Before_List.append(line)
    elif line not in Bluecoat_Tmp_List:
        # BUG FIX: with an empty CSV the original nested loop body never ran,
        # so nothing was ever submitted to Bluecoat.
        Bluecoat_Tmp_List.append(line)
# Anything not already in the DB gets checked against Bluecoat.
Bluecoat_List = set(Bluecoat_Tmp_List).difference(Seen_Before_List)
for line in Bluecoat_List:
    SiteReview(line)
# Record the newly requested indicators in the DB so future runs skip them.
with open('TI_Request_DB.csv', 'a') as csvfile:
    writer = csv.writer(csvfile)  # hoisted: one writer for all rows, not one per row
    for line in TI_Raw_Data_List:
        # BUG FIX: the original re-appended every input line on every run,
        # growing the DB with duplicates; only write entries not already seen.
        if line not in Seen_Before_List:
            writer.writerow([line])
# Re-defang the results ('.' -> '[.]') so the report can be shared safely,
# then print everything.
for line in Processed_TI_List:
    Output_List.append(line.replace('.', '[.]').rstrip())
for item in Output_List:
    print(item)