/
main.py
142 lines (118 loc) · 5.14 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""
This is an example script of the K-nearest-neighbor
algorithm in the context of machine-learning.
The use case uses the MNIST dataset for digit classification
based on 28x28 pixel images.
Author: Alan A. Diaz-Montiel
email: alan.diaz@connectcentre.ie
"""
import numpy as np
from mlxtend.data import loadlocal_mnist
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter
def euclidean_distance(row1, row2):
    """
    Calculate the Euclidean distance between two vectors.

    Every component of both vectors contributes to the result. The values
    are converted to 64-bit integers before subtracting, so unsigned inputs
    (e.g. uint8 pixel intensities) cannot wrap around on subtraction.

    :param row1: first vector (sequence or numpy array of numbers)
    :param row2: second vector, with the same shape as row1
    :return: euclidean distance between row1 and row2
    :raises ValueError: if row1 and row2 do not have the same shape
    """
    first = np.asarray(row1, dtype=np.int64)
    second = np.asarray(row2, dtype=np.int64)
    # refuse mismatched vectors instead of silently broadcasting/truncating
    if first.shape != second.shape:
        raise ValueError(
            "row1 and row2 must have the same shape, got %s and %s"
            % (first.shape, second.shape))
    # sum of squared component differences over ALL components
    return np.sqrt(np.sum((first - second) ** 2))
def nearest_neighbor(x_trainset, y_trainset, unknown, k=1):
    """
    Classify a sample by majority vote among its k closest training samples.

    Closeness is measured with euclidean_distance.

    :param x_trainset: numpy array of training samples
    :param y_trainset: numpy array of training labels, indexed like x_trainset
    :param unknown: numpy array holding the unknown/unlabeled sample
    :param k: number of nearest neighbors taking part in the vote
    :return: the label occurring most often among the k nearest neighbors
    """
    # pair every training label with the distance of its sample to the unknown
    scored = [
        (y_trainset[idx], euclidean_distance(candidate, unknown))
        for idx, candidate in enumerate(x_trainset)
    ]
    # ascending by distance, then keep only the k closest pairs
    closest = sorted(scored, key=lambda pair: pair[1])[:k]
    # tally the labels of the selected neighbors
    votes = Counter(label for label, _ in closest)
    # the most frequent label is the prediction
    winner, _ = votes.most_common(1)[0]
    return winner
def evaluate_knn(X_train, X_test, y_train, y_test, k=1,
                 trainset_size=100, testset_size=100):
    """
    Create a classification model with customised
    implementation and evaluate the test set.

    Only the first trainset_size training samples are used as neighbors and
    only the first testset_size test samples are classified. The number of
    evaluated tests is capped at the amount of test data actually supplied.

    :param X_train: numpy array with training samples
    :param X_test: numpy array with testing samples
    :param y_train: numpy array with training labels
    :param y_test: numpy array with testing labels
    :param k: number of k neighbors
    :param trainset_size: number of training samples to consider (default 100)
    :param testset_size: number of test samples to evaluate (default 100)
    :return: None; the accuracy of the model is printed
    :raises ValueError: if there is no test sample to evaluate
    """
    # never try to evaluate more tests than there are samples/labels
    n_tests = min(testset_size, len(X_test), len(y_test))
    if n_tests <= 0:
        raise ValueError("evaluate_knn needs at least one test sample")
    # the training subset is the same for every test, so slice it once
    train_samples = X_train[:trainset_size]
    train_labels = y_train[:trainset_size]
    # keep track of correctly classified digits
    correct = 0
    for i in range(n_tests):
        # classify digit based on our KNN implementation
        estimation = nearest_neighbor(train_samples, train_labels, X_test[i], k=k)
        # retrieve the correct value
        truth = y_test[i]
        if truth == estimation:
            correct += 1
    # compute accuracy as a percentage of the tests actually run
    accuracy = correct / n_tests * 100
    print("Our algorithm correctly classified a number %s / %s times" % (str(correct), str(n_tests)))
    print("Accuracy of our model: %s %%" % str(accuracy))
def sklearn_knn(X_train, X_test, y_train, y_test, k=1,
                trainset_size=100, testset_size=100):
    """
    Create a classification model with the KNeighborsClassifier
    from sklearn and evaluate the test set.

    Only the first trainset_size training samples are used to fit the model
    and only the first testset_size test samples are scored.

    :param X_train: numpy array with training samples
    :param X_test: numpy array with testing samples
    :param y_train: numpy array with training labels
    :param y_test: numpy array with testing labels
    :param k: number of k neighbors
    :param trainset_size: number of training samples to fit on (default 100)
    :param testset_size: number of test samples to score (default 100)
    :return: None; the accuracy of the model is printed
    """
    # create an instance of KNeighborsClassifier
    model = KNeighborsClassifier(n_neighbors=k)
    # fit/train/create the model with the training data;
    # ravel() flattens the label slice into the 1-D shape fit() expects
    model.fit(X_train[:trainset_size], y_train[:trainset_size].ravel())
    # evaluate the model and compute its accuracy as a percentage
    accuracy = model.score(X_test[:testset_size], y_test[:testset_size]) * 100
    print("Accuracy of sklearn model %s %%" % str(accuracy))
def main():
    """
    Load the MNIST training files from the local dataset folder, print the
    dataset dimensions, split the data into training and testing subsets,
    and evaluate both the custom and the sklearn KNN implementations (k=1).
    """
    # Alternative (disabled) CSV loading:
    # X = np.array(pd.read_csv('dataset/images.csv', header=None))
    # y = np.array(pd.read_csv('dataset/labels.csv', header=None))
    # Read image descriptors and labels from the IDX files
    X, y = loadlocal_mnist(
        labels_path='dataset/train-labels.idx1-ubyte',
        images_path='dataset/train-images.idx3-ubyte')
    sample_count, feature_count = X.shape[0], X.shape[1]
    print('Dimensions of the dataset (samples): %s x %s' % (sample_count, feature_count))
    print('Dimensions of the dataset (labels): %s' % len(y))

    # Hold out a quarter of the dataset for testing
    splits = train_test_split(X, y, test_size=0.25)
    X_train, X_test, y_train, y_test = splits
    train_rows, train_cols = X_train.shape[0], X_train.shape[1]
    print('Dimensions of X_train: %s x %s' % (train_rows, train_cols))
    print('Dimensions of y_train: %s' % (len(y_train)))
    test_rows, test_cols = X_test.shape[0], X_test.shape[1]
    print('Dimensions of X_test: %s x %s' % (test_rows, test_cols))
    print('Dimensions of y_test: %s' % (len(y_test)))

    # Run the hand-written KNN first, then the sklearn reference model
    evaluate_knn(X_train, X_test, y_train, y_test, k=1)
    sklearn_knn(X_train, X_test, y_train, y_test, k=1)
# Run the example only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()