/
preprocessing.py
149 lines (100 loc) · 4.57 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# based on https://github.com/dawenl/vae_cf
import os
import sys
import numpy as np
from scipy import sparse
import pandas as pd
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str)
parser.add_argument('--output_dir', type=str)
parser.add_argument('--threshold', type=float)
parser.add_argument('--min_items_per_user', type=int, default=5)
parser.add_argument('--min_users_per_item', type=int, default=0)
parser.add_argument('--heldout_users', type=int)
args = parser.parse_args()
dataset = args.dataset
output_dir = args.output_dir
threshold = args.threshold
min_uc = args.min_items_per_user
min_sc = args.min_users_per_item
n_heldout_users = args.heldout_users
raw_data = pd.read_csv(dataset, header=0)
raw_data = raw_data[raw_data['rating'] > threshold]
raw_data.head()
def get_count(tp, id):
playcount_groupbyid = tp[[id]].groupby(id, as_index=False)
count = playcount_groupbyid.size()
return count
def filter_triplets(tp, min_uc=min_uc, min_sc=min_sc):
if min_sc > 0:
itemcount = get_count(tp, 'movieId')
tp = tp[tp['movieId'].isin(itemcount.index[itemcount >= min_sc])]
if min_uc > 0:
usercount = get_count(tp, 'userId')
tp = tp[tp['userId'].isin(usercount.index[usercount >= min_uc])]
usercount, itemcount = get_count(tp, 'userId'), get_count(tp, 'movieId')
return tp, usercount, itemcount
raw_data, user_activity, item_popularity = filter_triplets(raw_data)
sparsity = 1. * raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0])
print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" %
(raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100))
unique_uid = user_activity.index
np.random.seed(98765)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]
n_users = unique_uid.size
tr_users = unique_uid[:(n_users - n_heldout_users * 2)]
vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users):]
train_plays = raw_data.loc[raw_data['userId'].isin(tr_users)]
unique_sid = pd.unique(train_plays['movieId'])
show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
with open(os.path.join(output_dir, 'unique_sid.txt'), 'w') as f:
for sid in unique_sid:
f.write('%s\n' % sid)
with open(os.path.join(output_dir, 'unique_uid.txt'), 'w') as f:
for uid in unique_uid:
f.write('%s\n' % uid)
def split_train_test_proportion(data, test_prop=0.2):
data_grouped_by_user = data.groupby('userId')
tr_list, te_list = list(), list()
np.random.seed(98765)
for i, (_, group) in enumerate(data_grouped_by_user):
n_items_u = len(group)
if n_items_u >= 5:
idx = np.zeros(n_items_u, dtype='bool')
idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True
tr_list.append(group[np.logical_not(idx)])
te_list.append(group[idx])
else:
tr_list.append(group)
if i % 1000 == 0:
print("%d users sampled" % i)
sys.stdout.flush()
data_tr = pd.concat(tr_list)
data_te = pd.concat(te_list)
return data_tr, data_te
vad_plays = raw_data.loc[raw_data['userId'].isin(vd_users)]
vad_plays = vad_plays.loc[vad_plays['movieId'].isin(unique_sid)]
vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays)
test_plays = raw_data.loc[raw_data['userId'].isin(te_users)]
test_plays = test_plays.loc[test_plays['movieId'].isin(unique_sid)]
test_plays_tr, test_plays_te = split_train_test_proportion(test_plays)
def numerize(tp):
uid = list(map(lambda x: profile2id[x], tp['userId']))
sid = list(map(lambda x: show2id[x], tp['movieId']))
return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])
train_data = numerize(train_plays)
train_data.to_csv(os.path.join(output_dir, 'train.csv'), index=False)
vad_data_tr = numerize(vad_plays_tr)
vad_data_tr.to_csv(os.path.join(output_dir, 'validation_tr.csv'), index=False)
vad_data_te = numerize(vad_plays_te)
vad_data_te.to_csv(os.path.join(output_dir, 'validation_te.csv'), index=False)
test_data_tr = numerize(test_plays_tr)
test_data_tr.to_csv(os.path.join(output_dir, 'test_tr.csv'), index=False)
test_data_te = numerize(test_plays_te)
test_data_te.to_csv(os.path.join(output_dir, 'test_te.csv'), index=False)