# -*- coding: utf-8 -*-
"""music_recommendation_binary.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/github/YIZHE12/music_recom/blob/master/music_recommendation_binary.ipynb
# Music recommendation system using TensorFlow and Keras
## Load all required packages
"""
from __future__ import print_function
import numpy as np
import pandas as pd
import collections
from mpl_toolkits.mplot3d import Axes3D
from IPython import display
from matplotlib import pyplot as plt
import sklearn
import sklearn.manifold
import tensorflow as tf
from sklearn import preprocessing
import time
import seaborn as sns
import re
from gensim.models import word2vec
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import os
from keras.models import Sequential
from keras.layers import Embedding
tf.logging.set_verbosity(tf.logging.ERROR)
# Add some convenience functions to Pandas DataFrame.
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.3f}'.format
def mask(df, key, function):
    """Returns a filtered DataFrame, obtained by applying `function` to `df[key]`."""
    return df[function(df[key])]

def flatten_cols(df):
    df.columns = [' '.join(col).strip() for col in df.columns.values]
    return df
pd.DataFrame.mask = mask
pd.DataFrame.flatten_cols = flatten_cols
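"""A quick illustration of the two helpers patched onto `pd.DataFrame` above,
on hypothetical toy data: `mask` filters rows with a predicate on one column,
and `flatten_cols` collapses the MultiIndex columns produced by
`groupby().agg()` into flat, space-joined names (e.g. 'listen_count count'),
as used further below.
"""
# toy = pd.DataFrame({'user_id': [0, 0, 1], 'listen_count': [1, 5, 3]})
# toy.mask('listen_count', lambda s: s > 2)   # keeps the rows where listen_count > 2
# toy.groupby('user_id').agg({'listen_count': ['count', 'mean']}).flatten_cols()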
# Install the spreadsheet library and import the authentication module.
USER_RATINGS = False
!pip install --upgrade -q gspread
from google.colab import auth
import gspread
from oauth2client.client import GoogleCredentials
"""## Load data
song_df_1 is the table of users' listening records, with columns 'user_id', 'song_id', and 'listen_count'.
song_df_2 is the song metadata file, with columns 'song_id', 'title', 'release', 'artist_name', and 'year'.
"""
triplets_file = 'https://static.turi.com/datasets/millionsong/10000.txt'
songs_metadata_file = 'https://static.turi.com/datasets/millionsong/song_data.csv'
song_df_1 = pd.read_table(triplets_file,header=None)
song_df_1.columns = ['user_id', 'song_id', 'listen_count']
#Read song metadata
song_df_2 = pd.read_csv(songs_metadata_file)
song_df_1.head()
song_df_2.head()
len(song_df_2.artist_name.unique())
len(song_df_2.title.unique())
"""## Data cleaning
The song_id and user_id values are very long strings; we can make processing more efficient by converting them to integers with label encoding.
"""
le_song_id = preprocessing.LabelEncoder()
le_song_id.fit(song_df_2.song_id)
song_df_2.song_id = le_song_id.transform(song_df_2.song_id)
song_df_1.song_id = le_song_id.transform(song_df_1.song_id)
le_user_id = preprocessing.LabelEncoder()
song_df_1.user_id = le_user_id.fit_transform(song_df_1.user_id)
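"""The fitted encoders are kept so the integer ids can be mapped back to the
original strings later if needed; a minimal round-trip sketch (illustrative
only):
"""
# le_song_id.inverse_transform([0])                        # original song_id string for id 0
# song_df_2.song_id.max() + 1 == len(le_song_id.classes_)  # ids run from 0 to n - 1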
"""## Merge two dataset"""
#Merge the two dataframes above to create input dataframe for recommender systems
song_df = pd.merge(song_df_1, \
song_df_2.drop_duplicates(['song_id']), \
on="song_id", how="left")
song_df.head()
song_df.describe(include=[np.int, np.object])
song_df = song_df.fillna(value=0)
"""#### Hiistogram of listen_count In log scale"""
g = sns.distplot(song_df.listen_count, kde=False, rug=False)
g.set_yscale('log')
"""#### number of unique user"""
len(pd.unique(song_df.user_id))
"""#### number of unique song"""
len(pd.unique(song_df.song_id))
"""#### check the most popular songs"""
song_ratings = song_df_2.merge(
song_df
.groupby('song_id', as_index=False)
.agg({'listen_count': ['count', 'mean']})
.flatten_cols(),
on='song_id')
sorted_song = (song_ratings[['title', 'listen_count count', 'listen_count mean']]
.sort_values('listen_count count', ascending=False))
sorted_song.head(10)
"""## Data Normalization
The maximum 'listen_count' is 8277, while the minimum is 0. This large range would cause problems for the model, so the values need to be normalized.
Here, I first group the data by user and then normalize the listen_count values.
"""
zscore = lambda x: (x - x.mean()) / x.std()
# min_max = lambda x: (x - x.min()) / (x.max() - x.min())
norm = song_df.groupby('user_id').transform(zscore)
song_df.listen_count = norm.listen_count
song_df.head()
# Rescale the per-user z-scores to the [0, 1] range.
min_max = lambda x: (x - x.min()) / (x.max() - x.min())
norm = song_df.groupby('user_id').transform(min_max)
song_df.listen_count = norm.listen_count
song_df.head()
# Users with a constant listen_count divide by a zero std / range above and
# produce NaNs, so fill those with 0.
song_df = song_df.fillna(value=0)
"""# Build a collaborative filtering model
## CFModel (Collaborative Filtering Model) helper class
This is a simple class to train a matrix factorization model using stochastic gradient descent.
The class constructor takes
- the user embeddings U (a `tf.Variable`).
- the song embeddings V (a `tf.Variable`).
- a loss to optimize (a `tf.Tensor`).
- an optional list of metrics dictionaries, each mapping a string (the name of the metric) to a tensor. These are evaluated and plotted during training (e.g. training error and test error).
After training, one can access the trained embeddings using the `model.embeddings` dictionary.
Example usage:
```
U_var = ...
V_var = ...
loss = ...
model = CFModel(U_var, V_var, loss)
model.train(iterations=100, learning_rate=1.0)
user_embeddings = model.embeddings['user_id']
song_embeddings = model.embeddings['song_id']
```
"""
# Utility to split the data into training and test sets.
def split_dataframe(df, holdout_fraction=0.1):
    """Splits a DataFrame into training and test sets.
    Args:
      df: a dataframe.
      holdout_fraction: fraction of dataframe rows to use in the test set.
    Returns:
      train: dataframe for training
      test: dataframe for testing
    """
    test = df.sample(frac=holdout_fraction, replace=False)
    train = df[~df.index.isin(test.index)]
    return train, test
def build_rating_sparse_tensor(song_df):
    """
    Args:
      song_df: a pd.DataFrame with `user_id`, `song_id` and `listen_count` columns.
    Returns:
      a tf.SparseTensor representing the ratings matrix.
    """
    indices = song_df[['user_id', 'song_id']].values
    # cast to float32 so the values match the dtype of the embeddings
    values = song_df['listen_count'].values.astype(np.float32)
    # the dense shape is (number of users) x (number of songs); both ids are
    # label encoded above, so they run from 0 to n - 1
    return tf.SparseTensor(
        indices=indices,
        values=values,
        dense_shape=[song_df_1.user_id.nunique(), song_df_2.shape[0]])
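"""A tiny illustration of the SparseTensor layout built above, on hypothetical
data: each row of `indices` is one observed (user_id, song_id) pair and
`values` holds the matching listen count, so the huge ratings matrix is never
densified.
"""
# toy = pd.DataFrame({'user_id': [0, 0, 1],
#                     'song_id': [2, 5, 2],
#                     'listen_count': [0.3, 1.0, 0.7]})
# build_rating_sparse_tensor(toy)
# -> SparseTensor(indices=[[0, 2], [0, 5], [1, 2]], values=[0.3, 1.0, 0.7], ...)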
def sparse_mean_square_error(sparse_ratings, user_embeddings, music_embeddings):
    """
    Args:
      sparse_ratings: A SparseTensor rating matrix, of dense_shape [N, M]
      user_embeddings: A dense Tensor U of shape [N, k] where k is the embedding
        dimension, such that U_i is the embedding of user i.
      music_embeddings: A dense Tensor V of shape [M, k] where k is the embedding
        dimension, such that V_j is the embedding of music j.
    Returns:
      A scalar Tensor representing the MSE between the true ratings and the
      model's predictions.
    """
    predictions = tf.reduce_sum(
        tf.gather(user_embeddings, sparse_ratings.indices[:, 0]) *
        tf.gather(music_embeddings, sparse_ratings.indices[:, 1]),
        axis=1)
    loss = tf.losses.mean_squared_error(sparse_ratings.values, predictions)
    return loss
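"""For reference, the loss above is the mean squared error over the observed
entries $\Omega$ of the ratings matrix,
$$
\text{MSE}(A, UV^\top) = \frac{1}{|\Omega|} \sum_{(i, j) \in \Omega} \big(A_{ij} - \langle U_i, V_j \rangle\big)^2,
$$
where the two `tf.gather` calls pick out $U_i$ and $V_j$ for every observed
pair, and the element-wise product summed over the embedding axis gives
$\langle U_i, V_j \rangle$.
"""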
class CFModel(object):
    """Simple class that represents a collaborative filtering model."""
    def __init__(self, embedding_vars, loss, metrics=None):
        """Initializes a CFModel.
        Args:
          embedding_vars: A dictionary of tf.Variables.
          loss: A float Tensor. The loss to optimize.
          metrics: optional list of dictionaries of Tensors. The metrics in each
            dictionary will be plotted in a separate figure during training.
        """
        self._embedding_vars = embedding_vars
        self._loss = loss
        self._metrics = metrics
        self._embeddings = {k: None for k in embedding_vars}
        self._session = None

    @property
    def embeddings(self):
        """The embeddings dictionary."""
        return self._embeddings

    def train(self, num_iterations=100, learning_rate=1.0, plot_results=True,
              optimizer=tf.train.GradientDescentOptimizer):
        """Trains the model.
        Args:
          num_iterations: number of iterations to run.
          learning_rate: optimizer learning rate.
          plot_results: whether to plot the results at the end of training.
          optimizer: the optimizer to use. Defaults to GradientDescentOptimizer.
        Returns:
          The metrics dictionary evaluated at the last iteration.
        """
        with self._loss.graph.as_default():
            opt = optimizer(learning_rate)
            train_op = opt.minimize(self._loss)
            local_init_op = tf.group(
                tf.variables_initializer(opt.variables()),
                tf.local_variables_initializer())
            if self._session is None:
                self._session = tf.Session()
                with self._session.as_default():
                    self._session.run(tf.global_variables_initializer())
                    self._session.run(tf.tables_initializer())
                    tf.train.start_queue_runners()
        with self._session.as_default():
            local_init_op.run()
            iterations = []
            metrics = self._metrics or ({},)
            # iterate over `metrics` here, not self._metrics (which may be None)
            metrics_vals = [collections.defaultdict(list) for _ in metrics]
            # Train and append results.
            for i in range(num_iterations + 1):
                _, results = self._session.run((train_op, metrics))
                if (i % 10 == 0) or i == num_iterations:
                    print("\r iteration %d: " % i + ", ".join(
                        ["%s=%f" % (k, v) for r in results for k, v in r.items()]),
                        end='')
                    iterations.append(i)
                    for metric_val, result in zip(metrics_vals, results):
                        for k, v in result.items():
                            metric_val[k].append(v)
            for k, v in self._embedding_vars.items():
                self._embeddings[k] = v.eval()
            if plot_results:
                # Plot the metrics.
                num_subplots = len(metrics) + 1
                fig = plt.figure()
                fig.set_size_inches(num_subplots * 10, 8)
                for i, metric_vals in enumerate(metrics_vals):
                    ax = fig.add_subplot(1, num_subplots, i + 1)
                    for k, v in metric_vals.items():
                        ax.plot(iterations, v, label=k)
                    ax.set_xlim([1, num_iterations])
                    ax.legend()
            return results
def build_model(ratings, embedding_dim=3, init_stddev=1.):
    """
    Args:
      ratings: a DataFrame of the ratings
      embedding_dim: the dimension of the embedding vectors.
      init_stddev: float, the standard deviation of the random initial embeddings.
    Returns:
      model: a CFModel.
    """
    # Split the ratings DataFrame into train and test.
    train_ratings, test_ratings = split_dataframe(ratings)
    # SparseTensor representation of the train and test datasets.
    A_train = build_rating_sparse_tensor(train_ratings)
    A_test = build_rating_sparse_tensor(test_ratings)
    # Initialize the embeddings using a normal distribution.
    U = tf.Variable(tf.random_normal(
        [A_train.dense_shape[0], embedding_dim], stddev=init_stddev))
    V = tf.Variable(tf.random_normal(
        [A_train.dense_shape[1], embedding_dim], stddev=init_stddev))
    train_loss = sparse_mean_square_error(A_train, U, V)
    test_loss = sparse_mean_square_error(A_test, U, V)
    metrics = {
        'train_error': train_loss,
        'test_error': test_loss
    }
    embeddings = {
        "user_id": U,
        "song_id": V
    }
    return CFModel(embeddings, train_loss, [metrics])
# Build the CF model and train it.
model = build_model(song_df, embedding_dim=3, init_stddev=0.5)
model.train(num_iterations=5000, learning_rate=25.)
"""## Find similar items"""
DOT = 'dot'
COSINE = 'cosine'
def compute_scores(query_embedding, item_embeddings, measure=DOT):
    """Computes the scores of the candidates given a query.
    Args:
      query_embedding: a vector of shape [k], representing the query embedding.
      item_embeddings: a matrix of shape [N, k], such that row i is the embedding
        of item i.
      measure: a string specifying the similarity measure to be used. Can be
        either DOT or COSINE.
    Returns:
      scores: a vector of shape [N], such that scores[i] is the score of item i.
    """
    u = query_embedding
    V = item_embeddings
    if measure == COSINE:
        V = V / np.linalg.norm(V, axis=1, keepdims=True)
        u = u / np.linalg.norm(u)
    scores = u.dot(V.T)
    return scores
def music_neighbors(model, title_substring, measure=DOT, k=6):
    # Search for song ids whose title contains the given substring.
    ids = music[music['title'].str.contains(title_substring, na=False)].index.values
    titles = music.iloc[ids]['title'].values
    if len(titles) == 0:
        raise ValueError("Found no music with title %s" % title_substring)
    print("Nearest neighbors of: %s." % titles[0])
    if len(titles) > 1:
        print("[Found more than one matching song. Other candidates: {}]".format(
            ", ".join(titles[1:])))
    song_id = ids[0]
    scores = compute_scores(
        model.embeddings["song_id"][song_id], model.embeddings["song_id"],
        measure)
    score_key = measure + ' score'
    df = pd.DataFrame({
        score_key: list(scores),
        'titles': music['title'],
    })
    display.display(df.sort_values([score_key], ascending=False).head(k))
"""#### Find similar songs to Stronger"""
music = song_df_2
music_neighbors(model, "Stronger", COSINE)
music_neighbors(model, "Stronger", DOT)
"""## Regularization In Matrix Factorization
In the previous section, the loss was defined as the mean squared error on the observed part of the rating matrix. This can be problematic, as the model does not learn how to place the embeddings of irrelevant songs. This phenomenon is known as *folding*.
We will add regularization terms that will address this issue. We will use two types of regularization:
- Regularization of the model parameters. This is a common $\ell_2$ regularization term on the embedding matrices, given by $r(U, V) = \frac{1}{N} \sum_i \|U_i\|^2 + \frac{1}{M}\sum_j \|V_j\|^2$.
- A global prior that pushes the prediction of any pair towards zero, called the *gravity* term. This is given by $g(U, V) = \frac{1}{MN} \sum_{i = 1}^N \sum_{j = 1}^M \langle U_i, V_j \rangle^2$.
The total loss is then given by
$$
\frac{1}{|\Omega|}\sum_{(i, j) \in \Omega} (A_{ij} - \langle U_i, V_j\rangle)^2 + \lambda _r r(U, V) + \lambda_g g(U, V)
$$
where $\lambda_r$ and $\lambda_g$ are two regularization coefficients (hyper-parameters).
"""
def gravity(U, V):
    """Creates a gravity loss given two embedding matrices."""
    return 1. / (U.shape[0].value * V.shape[0].value) * tf.reduce_sum(
        tf.matmul(U, U, transpose_a=True) * tf.matmul(V, V, transpose_a=True))
def build_regularized_model(
        ratings, embedding_dim=3, regularization_coeff=.1, gravity_coeff=1.,
        init_stddev=0.1):
    """
    Args:
      ratings: the DataFrame of song ratings.
      embedding_dim: The dimension of the embedding space.
      regularization_coeff: The regularization coefficient lambda.
      gravity_coeff: The gravity regularization coefficient lambda_g.
      init_stddev: float, the standard deviation of the random initial embeddings.
    Returns:
      A CFModel object that uses a regularized loss.
    """
    # Split the ratings DataFrame into train and test.
    train_ratings, test_ratings = split_dataframe(ratings)
    # SparseTensor representation of the train and test datasets.
    A_train = build_rating_sparse_tensor(train_ratings)
    A_test = build_rating_sparse_tensor(test_ratings)
    U = tf.Variable(tf.random_normal(
        [A_train.dense_shape[0], embedding_dim], stddev=init_stddev))
    V = tf.Variable(tf.random_normal(
        [A_train.dense_shape[1], embedding_dim], stddev=init_stddev))
    error_train = sparse_mean_square_error(A_train, U, V)
    error_test = sparse_mean_square_error(A_test, U, V)
    gravity_loss = gravity_coeff * gravity(U, V)
    regularization_loss = regularization_coeff * (
        tf.reduce_sum(U * U) / U.shape[0].value + tf.reduce_sum(V * V) / V.shape[0].value)
    total_loss = error_train + regularization_loss + gravity_loss
    losses = {
        'train_error_observed': error_train,
        'test_error_observed': error_test,
    }
    loss_components = {
        'observed_loss': error_train,
        'regularization_loss': regularization_loss,
        'gravity_loss': gravity_loss,
    }
    embeddings = {"user_id": U, "song_id": V}
    return CFModel(embeddings, total_loss, [losses, loss_components])
reg_model = build_regularized_model(
song_df, regularization_coeff=0.1, gravity_coeff=1.0, embedding_dim=35,
init_stddev=.05)
reg_model.train(num_iterations=2000, learning_rate=20.)
np.shape(reg_model.embeddings['song_id'])
np.shape(reg_model.embeddings['user_id'])
music_neighbors(reg_model, "Stronger", DOT)
music_neighbors(reg_model, "Stronger", COSINE)
check_user_list = song_df[song_df.title == 'Stronger'].user_id.unique()
print(song_df[song_df.user_id == check_user_list[100]].title.unique())
"""## Make your own prediction to build a play list"""
USER_RATINGS = True #@param {type:"boolean"}
users = song_df_1
# @title Run to create a spreadsheet, then use it to enter your ratings.
# Authenticate user.
if USER_RATINGS:
    auth.authenticate_user()
    gc = gspread.authorize(GoogleCredentials.get_application_default())
    # Create the spreadsheet and print a link to it.
    try:
        sh = gc.open('music-test')
    except gspread.SpreadsheetNotFound:
        sh = gc.create('music-test')
    worksheet = sh.sheet1
    titles = music['title'].values[0:1000] # take the first 1000 songs
    # titles = [re.sub(r'/\s+/g', '-', str(item)) for item in titles]
    cell_list = worksheet.range(1, 1, len(titles), 1)
    for cell, title in zip(cell_list, titles):
        cell.value = title
    worksheet.update_cells(cell_list)
    print("Link to the spreadsheet: "
          "https://docs.google.com/spreadsheets/d/{}/edit".format(sh.id))
# Run to load your ratings.
# Load the ratings from the spreadsheet and create a DataFrame.
if USER_RATINGS:
    my_ratings = pd.DataFrame.from_records(worksheet.get_all_values()).reset_index()
    my_ratings = my_ratings[my_ratings[1] != '']
    my_ratings = pd.DataFrame({
        'user_id': "943",
        'title': list(map(str, my_ratings[0])),
        'listen_count': list(map(float, my_ratings[1])),
    })
    # Attach the song metadata (song_id, etc.) to the new ratings.
    my_ratings = my_ratings.merge(song_df_2, on="title", how="left")
    # Remove previous ratings.
    song_df = song_df[song_df.user_id != "943"]
    # Add new ratings.
    song_df = song_df.append(my_ratings, ignore_index=True)
    # Add new user to the users DataFrame.
    if users.shape[0] == 943:
        users = users.append(users.iloc[942], ignore_index=True)
        users["user_id"][943] = "943"
    print("Added your %d ratings; you have great taste!" % len(my_ratings))
song_df[song_df.user_id=="943"].merge(music[['song_id', 'title']])
def user_recommendations(model, measure=DOT, exclude_rated=False, k=20):
    if USER_RATINGS:
        scores = compute_scores(
            model.embeddings["user_id"][943], model.embeddings["song_id"], measure)
        score_key = measure + ' score'
        df = pd.DataFrame({
            score_key: list(scores),
            'song_id': music['song_id'],
            'titles': music['title'],
        })
        if exclude_rated:
            # remove songs that have already been rated
            rated_music = song_df[song_df.user_id == "943"]["song_id"].values
            df = df[df.song_id.apply(lambda song_id: song_id not in rated_music)]
        display.display(df.sort_values([score_key], ascending=False).head(k))
if USER_RATINGS:
    user_recommendations(reg_model, measure=DOT, k=20)
"""## Build a content-based recommendation system
For the music data, we have "song_id", "title", "release", "artist_name", and "year".
We can first use Word2Vec to convert the title and artist name into vectors; then, after some normalization and combining them with the year, we can use kNN to generate the song embedding (V) for an unseen item and feed it back into the collaborative model in order to recommend unseen songs to users.
### Use Word2Vec to convert each song title to a 50-dimensional vector by averaging the vectors of the words in the title
"""
# artist = set(song_df.artist_name.values)
music = song_df_2 # note that some songs are not covered in the first (listening) file
song_name = music.title.values
song_name_clean = [re.sub(r'[^\w]', ' ', str(item)) for item in song_name]
song_name_clean = [re.sub(r" \d+", '', str(item.strip())) for item in song_name_clean]
sentences = list()
for item in song_name_clean:
    sentences.append(item.split())
unique_sentence = np.unique(sentences) # build the model on unique sentences only, to save time
## create word2vec model
# Set values for NN parameters
num_features = 50 # Word vector dimensionality
min_word_count = 50
num_workers = 1 # Number of CPUs
context = 3 # Context window size;
downsampling = 1e-3 # threshold for configuring which
# higher-frequency words are randomly downsampled
# Initialize and train the model
model_wv = word2vec.Word2Vec(unique_sentence, workers=num_workers, \
size=num_features, min_count = min_word_count, \
window = context, sample = downsampling)
# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model_wv.init_sims(replace=True)
# model_wv.most_similar(u'Love') # similar word to love
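"""The averaging described in the section heading, sketched directly for a
single title (illustrative only; assumes `model_wv` as trained above and skips
words outside its vocabulary):
"""
# def title_vector(title):
#     words = [w for w in title.split() if w in model_wv.wv.vocab]
#     return np.mean([model_wv.wv[w] for w in words], axis=0) if words else None
# title_vector('Learn To Fly')   # -> a 50-dimensional vector, or None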
"""Store the word2vec model"""
corpus = sorted(model_wv.wv.vocab.keys())
emb_tuple = tuple([model_wv[v] for v in corpus])
X = np.vstack(emb_tuple)
model_wv.wv.save_word2vec_format('song_tile_embedding.txt', binary = False)
"""Converting the song title one by and one and average the word vector is too time-consuming, we can take advantage of the GPU by building a neural network model and fix the weights as the Word2Vec weights to convert our data to vector"""
X = sentences
EMBEDDING_DIM = num_features
max_length = max([len(s) for s in X])
# maximum number of words in a title
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(X)
X_token = tokenizer_obj.texts_to_sequences(X)
X_pad = pad_sequences(X_token, maxlen = max_length, padding = 'post')
embeddings_index = {}
f = open(os.path.join('', 'song_tile_embedding.txt'), encoding='utf-8')
next(f)  # skip the word2vec_format header line ("<vocab size> <dimension>")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
word_index = tokenizer_obj.word_index
num_words = len(word_index) + 1
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in the embedding index will be left as all-zeros
        embedding_matrix[i] = embedding_vector[-EMBEDDING_DIM:]
model_wv_seq = Sequential()
embedding_layer = Embedding(len(word_index) + 1,
EMBEDDING_DIM,
weights=[embedding_matrix],
input_length=max_length,
trainable=False)
model_wv_seq.add(embedding_layer)
model_wv_seq.compile(optimizer = 'adam', loss = 'mse')
model_wv_seq.save_weights('model_wv_seq.hdf5')
"""However, to compute all the song we have, it is still very time consuming. Here, I play a small trick. For a new song X_new, something that the model hasn't seen before and I would like to recommend it to some users.
Here is the step:
1. Compute the word2vec vector for each word in the title, and average it on each dimension to get a vector of the whole title, here I chose a dimension of 50
1. Randomly pick a few songs in the selected database, also compute the vector representation of the title and then calculate its similarity to the new song. If I manage to find enough songs that passed that threshold, find the users who listened to these old songs and recommend the new song to this user.
Note that here I used the information of the title, but it can be any text, including lyrics. However, to get the vector representation of a song's lyrics, which will be much longer than the title, should use something rather than averaging. One idea is to use a LSTM autoencoder to genreate the features.
"""
# pick an unseen song from our pool
old_songs = set(song_df_1.song_id.values)
all_songs = set(song_df_2.song_id.values)
unseen_song = all_songs - old_songs
# each pop() draws a different unseen song; only the last one is kept
new_title = song_df_2[song_df_2.song_id == unseen_song.pop()].title
new_title = song_df_2[song_df_2.song_id == unseen_song.pop()].title
new_title = song_df_2[song_df_2.song_id == unseen_song.pop()].title
new_title
new_sentences = list()
for item in new_title:
    new_sentences.append(item.split())
X_token = tokenizer_obj.texts_to_sequences(new_sentences)
X_pad_new = pad_sequences(X_token, maxlen = max_length, padding = 'post')
Song_vector_new = model_wv_seq.predict(X_pad_new)
Song_vector_copy = Song_vector_new.copy()
# padded positions map to the all-zero embedding row; mask them with NaN
Song_vector_copy[Song_vector_copy == 0] = np.nan
means_new_song = np.nanmean(Song_vector_copy, axis=1) # average over words; axis 0 indexes the examples
np.nonzero(Song_vector_new)
Song_vector = model_wv_seq.predict(X_pad[0:16, :], batch_size=4) # vectorize the first 16 known titles for comparison
Song_vector_copy = Song_vector.copy()
Song_vector_copy[Song_vector_copy == 0] = np.nan # mask padding as above
means = np.nanmean(Song_vector_copy, axis=1) # average over words; axis 0 indexes the examples
np.shape(means)
means[0]
from sklearn.metrics.pairwise import cosine_similarity
means[2]
scores = cosine_similarity(means[0].reshape(1, -1), means[2].reshape(1, -1))
scores
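"""The notebook stops at a single similarity score. Below is a minimal sketch
of the remaining step described earlier: sample some old songs, keep those
whose title vectors pass a similarity threshold, and collect the users who
listened to them. It assumes the objects defined above (`tokenizer_obj`,
`model_wv_seq`, `max_length`, `old_songs`, `song_df_1`, `song_df_2`); the
sample size and the 0.5 threshold are arbitrary illustrative choices, not
tuned values.
"""
def users_for_new_song(new_title_text, n_candidates=100, threshold=0.5):
    """Finds users of old songs whose titles are similar to a new song's title."""
    def title_vectors(texts):
        # tokenize, pad, and run through the fixed-weight embedding model
        tokens = tokenizer_obj.texts_to_sequences(texts)
        padded = pad_sequences(tokens, maxlen=max_length, padding='post')
        vecs = model_wv_seq.predict(padded)
        # mask the all-zero padding rows, then average over the words;
        # titles with no in-vocabulary words become NaN, so zero them out
        vecs[vecs == 0] = np.nan
        return np.nan_to_num(np.nanmean(vecs, axis=1))
    new_vec = title_vectors([str(new_title_text)])
    # vectorize the titles of a random sample of old songs the same way
    old = song_df_2[song_df_2.song_id.isin(old_songs)].sample(n_candidates)
    old_vecs = title_vectors([str(t) for t in old.title])
    # keep the old songs whose titles pass the similarity threshold ...
    sims = cosine_similarity(new_vec, old_vecs).ravel()
    similar_ids = old.song_id.values[sims > threshold]
    # ... and return the users who listened to them
    return song_df_1[song_df_1.song_id.isin(similar_ids)].user_id.unique()

# e.g. users_for_new_song(new_title.values[0])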