This repository has been archived by the owner on May 12, 2020. It is now read-only.
/
urlcheck.py
169 lines (136 loc) · 5.36 KB
/
urlcheck.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#!/usr/bin/python
import ConfigParser
import requests
import cymysql as MySQLdb
import os
import sys
import re
import socket
import datetime
import logging
class ContextFilter(logging.Filter):
    """Logging filter that stamps every record with the local hostname."""

    # Resolved once at class-definition time and shared by all instances.
    hostname = socket.gethostname()

    def filter(self, record):
        # Annotate the record in place; never drop anything.
        setattr(record, 'hostname', ContextFilter.hostname)
        return True
class urlCheck():
    """Re-check stored URLs and persist their HTTP status in the database.

    Reads DB credentials from defaults.ini / config.ini, selects rows from
    `urllist` whose last check is over a year old (or never done), fetches
    the first URL found in each row's message, and writes the status back
    in batches of 20.
    """

    config = {}    # ConfigParser instance after loadConfig()
    logger = False # logging.Logger after __init__()

    def __init__(self):
        self.loadConfig()
        self.connectToDB()
        self.logger = logging.getLogger('urlcheck')
        # NOTE(review): the root logger is set to ERROR and no handler is
        # ever attached, so the logger.info() calls below are suppressed
        # unless logging is configured elsewhere -- confirm intent.
        logging.getLogger('').setLevel(logging.ERROR)
        self.logger.addFilter(ContextFilter())

    def loadConfig(self):
        """Load defaults.ini plus config.ini from this script's directory."""
        basedir = os.path.dirname(os.path.abspath(sys.argv[0]))
        config = ConfigParser.ConfigParser()
        config.read(["defaults.ini", basedir + '/config.ini'])
        self.config = config

    def connectToDB(self):
        """Open the MySQL connection using the [database] config section."""
        self.dbconnection = MySQLdb.connect(
            user=self.config.get("database", "user"),
            passwd=self.config.get("database", "password"),
            db=self.config.get("database", "database"))

    def grabUrls(self, text):
        """Given a text string, returns all the urls we can find in it."""
        # Accepted URL schemes as a non-capturing alternation.
        schemes = '(?: %s)' % '|'.join(
            """http https telnet gopher file wais ftp""".split())
        ltrs = r'\w'
        gunk = r'/#~:.?+=&%@!\-'
        punc = r'.:?\-'
        # Renamed from `any` to avoid shadowing the builtin.
        url_chars = "%(ltrs)s%(gunk)s%(punc)s" % {'ltrs': ltrs,
                                                  'gunk': gunk,
                                                  'punc': punc}
        url = r"""
            \b                 # start at word boundary
            %(urls)s :         # need resource and a colon
            [%(any)s] +?       # followed by one or more of any valid
                               # character, but be conservative and
                               # take only what you need to....
            (?=                # look-ahead non-consumptive assertion
              [%(punc)s]*      # either 0 or more punctuation
              (?: [^%(any)s]   # followed by a non-url char
                |              # or end of the string
                  $
              )
            )
        """ % {'urls': schemes,
               'any': url_chars,
               'punc': punc}
        url_re = re.compile(url, re.VERBOSE | re.MULTILINE)
        return url_re.findall(text)

    def colour(self, text, level):
        """Wrap *text* in the ANSI escape sequence named by *level*.

        Raises KeyError for an unknown level name.
        """
        colours = {
            'HEADER': '\033[95m',
            'OKBLUE': '\033[94m',
            'OK': '\033[92m',
            'WARNING': '\033[93m',
            'FAIL': '\033[91m',
            'ENDC': '\033[0m',
        }
        return "%s%s%s" % (colours[level], text, colours['ENDC'])

    def checkBatch(self):
        """Re-check every stale link and write the result back to urllist.

        A row is stale when its checked_date is over a year old or zero.
        Updates are committed every 20 rows and once more at the end.
        """
        query = 'SELECT id, message, checked_status, checked_repeat, `time` FROM urllist WHERE datediff(NOW(), `checked_date`) > 365 or `checked_date` = 0 '
        # Parameterized update -- values never interpolated into the SQL.
        update_query = "UPDATE `urllist` SET checked_date = NOW(), checked_status = %s, checked_repeat = %s where id = %s"
        cursor = self.dbconnection.cursor()
        cursor.execute(query)
        links = cursor.fetchall()
        pending = 0  # rows updated since the last commit
        for link in links:
            urls = self.grabUrls(link[1])
            if not urls:
                # No URL in the message: record 400 so the row is skipped
                # for another year instead of being re-scanned every run.
                cursor.execute(update_query, (400, 1, link[0]))
                self.dbconnection.commit()
                continue
            url = urls[0]  # only the first URL in the message is checked
            status_code = self.get_url(url)
            # checked_repeat counts consecutive runs with the same status.
            if status_code == link[2]:
                repeat_code = link[3] + 1
            else:
                repeat_code = 1
            # Colour for the log line: 2xx/429 OK, other 4xx warn, 5xx fail.
            # NOTE(review): `<= 200` also treats 1xx as OK -- confirm intent.
            if status_code <= 200 or status_code == 429:
                pretty_code = self.colour(status_code, "OK")
            elif status_code < 500:
                pretty_code = self.colour(status_code, "WARNING")
            else:
                pretty_code = self.colour(status_code, "FAIL")
            # The `time` column holds seconds since the epoch.
            dt = datetime.datetime.utcfromtimestamp(link[4])
            iso_format = dt.isoformat() + 'Z'
            self.logger.info("[%s] %s %s" % (pretty_code, iso_format, url))
            cursor.execute(update_query, (status_code, repeat_code, link[0]))
            pending = pending + 1
            if pending == 20:
                pending = 0
                self.logger.info("Committed")
                self.dbconnection.commit()
        # Flush the final partial batch.
        self.dbconnection.commit()

    def get_url(self, url):
        """Fetch *url* and return an HTTP-style status code.

        Network failures are mapped onto synthetic status codes so the
        caller can always store a single integer:
        504 timeout, 502 connection/DNS failure, 508 redirect loop,
        418 malformed request/URL, 501 any other error.
        """
        try:
            r = requests.get(url, timeout=10)
            return r.status_code
        except requests.exceptions.Timeout:
            return 504  # Gateway Timeout
        except (requests.exceptions.ConnectionError, socket.gaierror, socket.error):
            return 502  # Bad Gateway
        except requests.exceptions.TooManyRedirects:
            return 508  # Loop Detected
        except (requests.exceptions.RequestException, AttributeError, UnicodeError):
            return 418  # I'm a teapot
        except Exception:
            # Last-resort catch-all (was a bare `except:`, which also
            # swallowed KeyboardInterrupt/SystemExit).
            return 501  # Not Implemented
if __name__ == '__main__':
    # Run one full batch when invoked as a script.
    urlCheck().checkBatch()