-
Notifications
You must be signed in to change notification settings - Fork 0
/
part1.py
72 lines (46 loc) · 2.98 KB
/
part1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import pandas as pd
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBasic, KNNBaseline, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, KFold, GridSearchCV
dataset1_path = r'Part_1\\dataset\\'
### part1.1
reader = Reader(line_format = 'user item rating', sep = ',', rating_scale = [0.5, 5], skip_lines = 1)
data = Dataset.load_from_file(dataset1_path + "ratings.csv",
reader)
# splitting dataset into k consecutive folds (without shuffling by default)
# out of the k folds, k-1 sets are used for training while the remaining set is used for testing.
# n_splits (int) – number of folds
# random_state – determines the RNG that will be used for determining the folds
kf = KFold(n_splits = 5, random_state = 0)
# defining the list of reccomendation algorithms
algo_list = ['SVD()', 'SlopeOne()', 'NMF()', 'NormalPredictor()', 'KNNBaseline()', 'KNNBasic()', 'KNNWithMeans()', 'KNNWithZScore()', 'BaselineOnly()', 'CoClustering()', 'SVDpp()']
# evaluating RMSE of algorithm in algo_list on 5 splits
benchmark = []
# iterating over all algorithms
for i in range(len(algo_list)):
# perfoming cross validation
results = cross_validate(eval(algo_list[i]), data, measures = ['RMSE'], cv = kf, verbose = True, n_jobs = -1)
# getting results & appending algorithm name
tmp = pd.DataFrame.from_dict(results).mean(axis = 0)
tmp = tmp.append(pd.Series([algo_list[i].split(' ')[0].split('.')[-1]], index = ['Algorithm']))
benchmark.append(tmp)
# results_pd is a dataframe containing test_rmse, fit_time and test_time for each algorithm
results_pd = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
### part1.2
# SVD grid of parameter for the GridSearch
grid_of_parameters = {'n_factors' : [105, 110], 'n_epochs' : [20, 25, 30], 'lr_all' : [0.005, 0.009], 'reg_all' : [0.04, 0.06]}
gs = GridSearchCV(SVD, # estimator parameter – the algorithm that you want to execute
param_grid = grid_of_parameters, # param_grid – takes the parameter dictionary that we just created as parameter
measures = ['rmse'], # scoring parameter – takes the performance metrics
cv = kf, n_jobs = -1) # when -1 n_jobs it use all processors
gs.fit(data)
# KNNBaseline grid of parameter for the GridSearch
grid_of_parameters = {'bsl_options' : {'method' : ['als'], 'n_epochs' : [20]},
'sim_options' : {'name' : ['cosine', 'pearson', 'pearson_baseline'], 'min_support' : [1,5], 'user_based' : [False, True],
'k': [20, 40, 100]}
}
gs = GridSearchCV(KNNBaseline,
param_grid = grid_of_parameters,
measures = ['rmse'],
cv = kf, n_jobs = -1) # when -1 n_jobs it use all processors
gs.fit(data)