/
netflix_sparse_matrix_prep.py
54 lines (45 loc) · 1.71 KB
/
netflix_sparse_matrix_prep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# --- Imports and dataset locations -------------------------------------
from os.path import join, abspath
from os import listdir
import re

import numpy as np
import pandas as pd
import scipy as sp
# BUGFIX: plain "import scipy" does not (on older scipy versions) import
# the sparse submodule, so sp.sparse.csr_matrix/save_npz below could
# raise AttributeError.  Importing it explicitly guarantees availability.
import scipy.sparse

# Root of the extracted Netflix Prize archive, and the directory holding
# one ratings file per movie.
pardir = join(abspath('.'), 'nf_prize_dataset')
curdir = join(abspath('.'), 'nf_prize_dataset', 'training_set')
# Sorted so the CSR row order follows the movie file numbering
# (mv_0000001.txt, mv_0000002.txt, ...).
fileslist = sorted(listdir(curdir))
def get_data_from_fileslist(fileslist, data_dir=None):
    """Read Netflix-Prize per-movie rating files into raw CSR components.

    Each file in *fileslist* holds the ratings for one movie: a header
    line ("<movie_id>:") followed by "CustomerID,Rating,Date" rows.
    One CSR row is produced per file (movie); columns are customers,
    numbered in first-seen order across all files.  Dates are ignored.

    Parameters
    ----------
    fileslist : list of str
        File names (not full paths) to read, in desired row order.
    data_dir : str, optional
        Directory containing the files.  Defaults to the module-level
        ``curdir`` (the Netflix ``training_set`` directory), which keeps
        the original one-argument call working unchanged.

    Returns
    -------
    tuple of three int32 numpy arrays ``(data, indices, index_ptr)`` —
    the (data, indices, indptr) triple accepted by
    ``scipy.sparse.csr_matrix``.
    """
    if data_dir is None:
        data_dir = curdir  # module-level default dataset location
    cust_index = {}        # CustomerID -> column number, first-seen order
    num_ids = 0            # next unassigned column number
    data = []              # ratings, row-major
    indices = []           # column (customer) index per rating
    index_ptr = [0]        # row start offsets into data/indices
    for fileno, filename in enumerate(fileslist):
        filepath = join(data_dir, filename)
        # BUGFIX: the original used header=1 together with names=, which
        # consumed-and-discarded the first rating row of every file in
        # addition to the "<movie_id>:" line.  skiprows=1 drops only the
        # movie-id line.
        dataframe = pd.read_csv(filepath, skiprows=1, header=None,
                                names=['CustomerID', 'Rating', 'Date'])
        # Iterating the two columns directly is equivalent to (and far
        # faster than) DataFrame.iterrows().
        for cust_id, rating in zip(dataframe['CustomerID'],
                                   dataframe['Rating']):
            # BUGFIX: the original stopped assigning new IDs once the
            # counter hit 480189 - 1 and then silently reused a stale
            # row_num for unseen customers.  Every unseen customer now
            # always gets a fresh column number.
            if cust_id in cust_index:
                row_num = cust_index[cust_id]
            else:
                row_num = cust_index[cust_id] = num_ids
                num_ids += 1
            data.append(rating)
            indices.append(row_num)
        index_ptr.append(len(indices))  # close this movie's CSR row
        print('\t finished reading file #{} : {}'.format(fileno + 1, filepath))
    # int32 suffices: ratings are 1-5 and the ~100M total ratings of the
    # full dataset fit comfortably below 2**31.
    data, indices, index_ptr = (np.array(x, dtype='int32')
                                for x in [data, indices, index_ptr])
    return data, indices, index_ptr
# Assemble the sparse ratings matrix from the CSR triple: one row per
# training-set file (movie), one column per customer in first-seen order.
data, indices, index_ptr = get_data_from_fileslist(fileslist)
# Shape is inferred from the data.  NOTE(review): the commented-out
# shape=(480189, 17770) looks transposed for this layout — rows here are
# the 17770 movies and columns the 480189 customers; confirm before
# re-enabling it.
matrix = sp.sparse.csr_matrix((data, indices, index_ptr))#, shape=(480189, 17770))
# Persist the matrix next to the dataset so later runs can skip parsing.
sp.sparse.save_npz(join(pardir, 'data.npz'), matrix)
# Movie metadata.  The original comma-separated movie_titles.txt path is
# kept for reference but not read here; the pipe-separated copy is used
# instead — presumably pre-converted because some titles contain commas.
# TODO(review): confirm how movie_titles_2.txt was produced.
movie_titles_file = join(pardir, 'movie_titles.txt')
movie_titles_file2 = join(pardir, 'movie_titles_2.txt')
movie_titles = pd.read_csv(movie_titles_file2, sep='|')