/
similarityMetrics.py
executable file
·160 lines (129 loc) · 4.67 KB
/
similarityMetrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#! /usr/bin/python
# -*- coding: utf-8 -*-
# global import
# from bs4 import BeautifulSoup
from BeautifulSoup import BeautifulSoup
import collections
from HTMLParser import HTMLParser
import math
import numpy as np
import sys
import traceback
import urllib2
class TagParser(HTMLParser):
    """HTML parser that counts how often each tag occurs in the fed markup.

    Every start tag is counted under a key made of the tag name followed by
    the names and values of its attributes listed in ``attr_list``, all
    joined with underscores (e.g. ``td_width_3``).  Every end tag is counted
    under the bare tag name.

    Attributes:
        cnt: ``collections.Counter`` holding the counts of this instance.
        attr_list: attribute names that become part of a start-tag key.
    """

    # attribute names that take part in a start tag's signature
    attr_list = [u'color', u'width', u'height']

    def __init__(self, *args, **kwargs):
        # Call the base initializer explicitly so this works whether the
        # base class is old-style or new-style.
        HTMLParser.__init__(self, *args, **kwargs)
        # One counter per parser: a class-level Counter would be shared by
        # every instance and mix the counts of separate documents.
        self.cnt = collections.Counter()

    def _count(self, parts):
        """Increase by one the count of the key built from ``parts``."""
        self.cnt[u'_'.join(parts)] += 1

    def handle_starttag(self, tag, attributes):
        """Count a start tag together with its signature attributes.

        Args:
            tag: name of the tag.
            attributes: sequence of ``(name, value)`` pairs of the tag.
        """
        parts = [tag]
        for name, value in attributes:
            if name in self.attr_list:
                parts.append(name)
                # a valueless attribute is reported with value None, which
                # cannot be joined; represent it as an empty string
                parts.append(u'' if value is None else value)
        self._count(parts)

    def handle_endtag(self, tag):
        """Count an end tag under the bare tag name."""
        self._count([tag])

    def reset_counter(self):
        """Discard all counts collected so far."""
        self.cnt.clear()

    def __str__(self):
        return str(self.cnt)
def get_term_frequency_vectors(content):
    """Return the tag-frequency vector of an HTML document.

    Args:
        content: HTML markup to parse.

    Returns:
        A plain ``dict`` mapping each tag key produced by ``TagParser`` to
        the number of times it occurred in ``content``.
    """
    tp = TagParser()
    # Start from an empty counter: if the counter is shared between parser
    # instances, counts from an earlier document would otherwise leak into
    # this one.
    tp.reset_counter()
    tp.feed(content)
    # copy, so that later parsing cannot alter the returned vector
    return dict(tp.cnt)
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two sparse vectors.

    Args:
        vec1: dict mapping keys to numeric weights.
        vec2: dict mapping keys to numeric weights.

    Returns:
        The dot product over the shared keys divided by the product of the
        two Euclidean norms, or 0.0 when that product is zero.
    """
    shared_keys = set(vec1) & set(vec2)
    dot_product = sum(vec1[key] * vec2[key] for key in shared_keys)
    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    norm_product = norm1 * norm2
    if norm_product:
        return float(dot_product) / norm_product
    return 0.0
def equal_weight(vec1, vec2):
    """Return the equal-weight similarity of two frequency vectors.

    Every key present in both vectors contributes
    ``1 - |a - b| / (a + b)`` (1.0 for identical counts); keys present in
    only one vector contribute nothing.  The contributions are averaged over
    the union of the keys.

    Args:
        vec1: dict mapping keys to numeric counts.
        vec2: dict mapping keys to numeric counts.

    Returns:
        A float; 0.0 when both vectors are empty or share no key.
    """
    key_count = len(set(vec1) | set(vec2))
    if not key_count:
        return 0.0
    score = 0.0  # explicit start value: there may be no shared key at all
    for key in set(vec1) & set(vec2):
        count1, count2 = vec1[key], vec2[key]
        total = count1 + count2
        if count1 == count2:
            # identical counts, including 0 and 0 which cannot be divided
            score += 1.0
        elif total:
            score += 1.0 - abs(float(count1 - count2) / total)
    return score / key_count
def length_ratio(page1, page2):
    """Return the length of the shorter page divided by that of the longer.

    Args:
        page1: first page (any sized object).
        page2: second page (any sized object).

    Returns:
        A float between 0.0 and 1.0.  Two empty pages have equal length and
        yield 1.0 instead of a division by zero.
    """
    small, large = sorted((len(page1), len(page2)))
    if not large:
        return 1.0
    return float(small) / large
def _parent_child_couples(page, tag_index):
    """Return (parent index, child index) pairs for every tag in ``page``.

    Tag names missing from ``tag_index`` are given the next free index as a
    side effect, so both pages end up sharing one numbering.
    """
    couples = []
    for node in BeautifulSoup(page).findAll():
        parent = tag_index.setdefault(str(node.parent.name), len(tag_index))
        child = tag_index.setdefault(str(node.name), len(tag_index))
        couples.append((parent, child))
    return couples


def _couples_matrix(couples, size):
    """Return a size x size matrix counting each (parent, child) couple."""
    matrix = np.zeros((size, size))
    for parent, child in couples:
        matrix[parent, child] += 1
    return matrix


def dom_similarity(page1, page2):
    """Return the similarity of the parent-child tag structure of two pages.

    Each page is turned into a matrix counting how often a tag is the direct
    child of another tag; the result is the normalized correlation (cosine)
    of the two matrices.

    Args:
        page1: markup of the first page, or None.
        page2: markup of the second page, or None.

    Returns:
        1.0 when the pages are equal, 0.0 when exactly one of them is None
        or when a page contains no tags, otherwise the absolute value of the
        correlation.
    """
    # identical pages need no parsing
    if page1 == page2:
        return 1.0
    # a missing page is similar to nothing
    if page1 is None or page2 is None:
        return 0.0

    # name -> matrix index; a dict keeps each lookup constant-time
    tag_index = {'[document]': 0}  # root tag for BeautifulSoup
    couples_lab = _parent_child_couples(page1, tag_index)
    couples_field = _parent_child_couples(page2, tag_index)

    # the matrices can only be sized once every tag name is known
    size = len(tag_index)
    matrix1 = _couples_matrix(couples_lab, size)
    matrix2 = _couples_matrix(couples_field, size)

    norm_product = np.linalg.norm(matrix1) * np.linalg.norm(matrix2)
    if not norm_product:
        # a page without any tag has an all-zero matrix; dividing by its
        # norm would produce NaN
        return 0.0
    return abs(np.vdot(matrix1, matrix2) / norm_product)
def similarity_metrics(page1, page2):
    """Compute every similarity metric of this module for two pages.

    Args:
        page1: markup of the first page.
        page2: markup of the second page.

    Returns:
        A dict with the keys 'cosine similarity', 'equal weight',
        'length ratio' and 'dom similarity'.
    """
    # the first two metrics work on tag-frequency vectors, the others on
    # the raw pages
    first_vector = get_term_frequency_vectors(page1)
    second_vector = get_term_frequency_vectors(page2)

    metrics = {}
    metrics['cosine similarity'] = cosine_similarity(first_vector, second_vector)
    metrics['equal weight'] = equal_weight(first_vector, second_vector)
    metrics['length ratio'] = length_ratio(page1, page2)
    metrics['dom similarity'] = dom_similarity(page1, page2)
    return metrics
def get_file_content(filename):
    """Return the content of a local file, or of a page fetched from a URL.

    Args:
        filename: path of a local file, or a URL starting with ``http://``
            or ``https://``.

    Returns:
        The whole content as returned by ``read()``.

    Raises:
        urllib2.URLError: the URL could not be opened.
        IOError: the local file could not be opened or read.
    """
    # Only a leading scheme marks a URL; a substring test would also match
    # local paths that merely contain "http://".
    if filename.startswith(("http://", "https://")):
        try:
            response = urllib2.urlopen(filename)
            try:
                return response.read()
            finally:
                # release the connection even when reading fails
                response.close()
        except urllib2.URLError:
            print("Error: cannot open %s" % filename)
            raise

    try:
        with open(filename, 'r') as file_p:
            return file_p.read()
    except IOError:
        print("Error: cannot open %s" % filename)
        # bare raise: re-raise the IOError itself with its traceback
        raise
def compare_files(file1, file2):
    """Load two pages and return their similarity metrics.

    Args:
        file1: path or URL of the first page.
        file2: path or URL of the second page.

    Returns:
        The dict produced by ``similarity_metrics``.  If either page cannot
        be loaded, the traceback is printed and the process exits with
        status 1.
    """
    try:
        # loaded in order; a failure on the first page skips the second
        pages = [get_file_content(source) for source in (file1, file2)]
    except Exception:
        traceback.print_exc()
        sys.exit(1)
    return similarity_metrics(*pages)
if __name__ == "__main__":
    # Command-line entry point: compare two pages given as paths or URLs.
    if len(sys.argv) < 3:
        # sys.exit with a string writes it to stderr and exits with status 1,
        # so a wrong invocation is visible to calling scripts
        sys.exit("Usage: python %s <FILENAME1> <FILENAME2>" % sys.argv[0])
    print(compare_files(sys.argv[1], sys.argv[2]))