/
knearest.py
executable file
·85 lines (61 loc) · 2.08 KB
/
knearest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""
COMP3308 - AI
Assignment 1
@author Hugh Purnell <hugh.purnell@gmail.com>
SSID: 310181941
K-nearest neighbor classifier
"""
from heapq import heappush, heappop
import random
import data_preprocessing
def euclid_distance_squared(a, b, attributes):
    """Return the squared Euclidean distance between two samples.

    Only the positions listed in ``attributes`` contribute to the result,
    and the square root is never taken.

    a, b        -- indexable samples; the compared positions must support
                   subtraction and squaring
    attributes  -- iterable of positions to compare

    The result is always a float (0.0 when ``attributes`` is empty).
    """
    total = float(0)
    for position in attributes:
        difference = a[position] - b[position]
        total = total + difference ** 2
    return total
def convert_att_names_to_indexes(attributes):
    """Translate attribute names into their positions in the data header.

    The name "class" is skipped, so the class column is never part of the
    result.

    attributes -- iterable of attribute names

    Returns a list of indexes into data_preprocessing.get_header(), in the
    same order as the names were given.
    Raises ValueError if a name other than "class" is not in the header.
    """
    # Fetch the header once instead of once per attribute.
    # NOTE(review): assumes get_header() returns the same sequence on every
    # call -- confirm against data_preprocessing.
    header = data_preprocessing.get_header()
    # .index() raises ValueError for a missing name and never returns -1, so
    # the former "index > -1" check could never be False and has been removed.
    return [header.index(name) for name in attributes if name != "class"]
def classify(k, sample, training_data, att_names=None):
    """Classify ``sample`` by majority vote of its k nearest training samples.

    k             -- number of neighbours that vote; when it exceeds the
                     number of training samples, every training sample votes
    sample        -- the row to classify; its own class label is read only
                     to report whether the prediction was right
    training_data -- iterable of rows indexed the same way as ``sample``
    att_names     -- attribute names used for the distance; defaults to the
                     full header from data_preprocessing.get_header()

    Returns a tuple (predicted, correct). ``predicted`` is 0 or 1, and
    ``correct`` is True when the label stored in ``sample`` is "class0" or
    "class1" respectively. A tied vote (including no votes at all)
    predicts 1.
    """
    header = data_preprocessing.get_header()
    if att_names is None:
        att_names = header
    attribute_indexes = convert_att_names_to_indexes(att_names)
    # The label position is looked up in the full header rather than in
    # att_names, since att_names may be a caller-supplied subset.
    class_index = header.index("class")

    # Min-heap keyed on squared distance: the closest rows pop first.
    distances = []
    for training_sample in training_data:
        dist = euclid_distance_squared(sample, training_sample,
                                       attribute_indexes)
        heappush(distances, (dist, training_sample))

    class0_count = 0
    class1_count = 0
    # Clamp k so a small training set cannot pop from an empty heap.
    for _ in range(min(k, len(distances))):
        neighbour = heappop(distances)[1]
        label = neighbour[class_index]
        if label == "class0":
            class0_count += 1
        elif label == "class1":
            class1_count += 1

    predicted = 0 if class0_count > class1_count else 1
    expected_label = "class0" if predicted == 0 else "class1"
    return predicted, sample[class_index] == expected_label
def main():
    """Classify one randomly chosen row of pima.csv against all the others.

    Loads the CSV, drops the header row, removes one random row to act as
    the test sample, runs a 10-nearest-neighbour classification on it and
    prints the prediction together with whether it matched the row's label.
    """
    # None makes classify() use every attribute in the header. For the CFS
    # subset use instead:
    # ['plasma_glucose_concentration','bmi','diabetes_pedigree','age','class']
    attr_names = None
    training_data = data_preprocessing.load_csv_data("pima.csv")
    training_data.pop(0)  # pop the header off
    # Take the test sample out of the training data before classifying it.
    test_index = random.randint(0, len(training_data) - 1)
    test_sample = training_data.pop(test_index)
    (c, correctness) = classify(10, test_sample, training_data, attr_names)
    # One pre-formatted string prints the same text under both the Python 2
    # print statement and the Python 3 print function.
    print("Classifier predicted:  %s  Correctness:  %s" % (c, correctness))
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()