# Run some setup code for this notebook.
import random
import numpy as np
from cs231n.data_utils import load_CIFAR10
import matplotlib.pyplot as plt
# Set some default matplotlib parameters so any figures we draw are a
# reasonable size and render consistently.
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
# Load the raw CIFAR-10 data.
cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
# As a sanity check, we print out the size of the training and test data.
print 'Training data shape: ', X_train.shape
print 'Training labels shape: ', y_train.shape
print 'Test data shape: ', X_test.shape
print 'Test labels shape: ', y_test.shape
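# (for the standard CIFAR-10 split these should read (50000, 32, 32, 3), (50000,),
#  (10000, 32, 32, 3) and (10000,))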
# Subsample the data for more efficient code execution in this exercise
num_training = 5000
mask = range(num_training)
X_train = X_train[mask]
y_train = y_train[mask]
num_test = 500
mask = range(num_test)
X_test = X_test[mask]
y_test = y_test[mask]
# Reshape the image data into rows
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print X_train.shape, X_test.shape
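# Each CIFAR-10 image is 32x32 pixels with 3 color channels, so every row above
# has 32*32*3 = 3072 entries; the print should show (5000, 3072) (500, 3072).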
from cs231n.classifiers import KNearestNeighbor
# Create a kNN classifier instance.
# Remember that training a kNN classifier is a noop:
# the Classifier simply remembers the data and does no further processing
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
# Open cs231n/classifiers/k_nearest_neighbor.py and implement
# compute_distances_two_loops.
# Test your implementation:
dists = classifier.compute_distances_two_loops(X_test)
#print dists.shape
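
# For reference only: a minimal sketch of what a two-loop L2 distance computation
# could look like. The graded version belongs in
# cs231n/classifiers/k_nearest_neighbor.py; the helper below (l2_dists_two_loops)
# is introduced here purely as an illustration and is not used elsewhere.
def l2_dists_two_loops(X, X_tr):
    num_test_pts = X.shape[0]
    num_train_pts = X_tr.shape[0]
    d = np.zeros((num_test_pts, num_train_pts))
    for i in xrange(num_test_pts):
        for j in xrange(num_train_pts):
            # Euclidean (L2) distance between test point i and training point j
            d[i, j] = np.sqrt(np.sum((X[i] - X_tr[j]) ** 2))
    return d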
## Now implement the function predict_labels and run the code below:
## We use k = 1 (which is Nearest Neighbor).
#y_test_pred = classifier.predict_labels(dists, k=1)
#
## Compute and print the fraction of correctly predicted examples
#num_correct = np.sum(y_test_pred == y_test)
#accuracy = float(num_correct) / num_test
#print 'Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy)
#dists_one = classifier.compute_distances_one_loop(X_test)
#
## To ensure that our vectorized implementation is correct, we make sure that it
## agrees with the naive implementation. There are many ways to decide whether
## two matrices are similar; one of the simplest is the Frobenius norm, used below.
#difference = np.linalg.norm(dists - dists_one, ord='fro')
#print 'Difference was: %f' % (difference, )
#if difference < 0.001:
# print 'Good! The distance matrices are the same'
#else:
# print 'Uh-oh! The distance matrices are different'
dists_two = classifier.compute_distances_no_loops(X_test)
# check that the distance matrix agrees with the one we computed before:
difference = np.linalg.norm(dists - dists_two, ord='fro')
print 'Difference was: %f' % (difference, )
if difference < 0.001:
    print 'Good! The distance matrices are the same'
else:
    print 'Uh-oh! The distance matrices are different'
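
# For reference only: the fully vectorized distance computation is usually built
# on the expansion ||x - y||^2 = ||x||^2 - 2*x.y + ||y||^2, evaluated for all
# test/train pairs at once. The helper below (l2_dists_no_loops) is a sketch of
# that idea and is not used elsewhere; the graded version lives in
# cs231n/classifiers/k_nearest_neighbor.py.
def l2_dists_no_loops(X, X_tr):
    test_sq = np.sum(X ** 2, axis=1).reshape(-1, 1)     # (num_test, 1)
    train_sq = np.sum(X_tr ** 2, axis=1)                # (num_train,)
    cross = X.dot(X_tr.T)                               # (num_test, num_train)
    # clip tiny negative values caused by floating point error before the sqrt
    return np.sqrt(np.maximum(test_sq - 2.0 * cross + train_sq, 0.0))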
def crossValidate(X_fold, y_fold, k, idx):
    # Hold out fold `idx` for validation and train on all remaining folds.
    X_cross = X_fold[idx]
    y_cross = y_fold[idx]
    X_train = np.vstack(X_fold[:idx] + X_fold[idx+1:])
    y_train = np.hstack(y_fold[:idx] + y_fold[idx+1:])
    # print "dim train ", X_train.shape, " dim cross ", X_cross.shape

    classifier = KNearestNeighbor()
    classifier.train(X_train, y_train)
    dists = classifier.compute_distances_no_loops(X_cross)
    y_cross_pred = classifier.predict_labels(dists, k)

    num_correct = np.sum(y_cross_pred == y_cross)
    accuracy = float(num_correct) / len(y_cross)
    return accuracy
def kFoldValidation(X_folds, y_folds, k, folds):
    # Run `folds` rounds of cross-validation, withholding a different fold each time.
    accuracyFold = []
    for i in range(0, folds):
        accuracyFold.append(crossValidate(X_folds, y_folds, k, i))
    return accuracyFold
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
# Split the (reshaped) training data into num_folds equal folds.
X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)
k_to_accuracies = {}
for k in k_choices:
    # print "Evaluating for k = ", k
    k_to_accuracies[k] = kFoldValidation(X_train_folds, y_train_folds, k, num_folds)
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print 'k = %d, accuracy = %f' % (k, accuracy)
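
# A minimal sketch of the usual next step: pick the k with the best mean
# cross-validation accuracy and evaluate once on the (subsampled) test set.
# `mean_accuracies` and `best_k` are names introduced here for illustration;
# the classifier reused below is the one trained earlier on the full 5000
# training examples.
mean_accuracies = {k: np.mean(v) for k, v in k_to_accuracies.items()}
best_k = max(mean_accuracies, key=mean_accuracies.get)
print 'best k from cross-validation: %d' % best_k
dists_test = classifier.compute_distances_no_loops(X_test)
y_test_pred = classifier.predict_labels(dists_test, k=best_k)
num_correct = np.sum(y_test_pred == y_test)
print 'Got %d / %d correct => accuracy: %f' % (num_correct, num_test, float(num_correct) / num_test)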