# -*- coding: utf-8 -*-
"""music_recommendation_binary.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/github/YIZHE12/music_recom/blob/master/music_recommendation_binary.ipynb
# Music recommendation system using TensorFlow and Keras
## Load all required packages
"""
from __future__ import print_function
import numpy as np
import pandas as pd
import collections
from mpl_toolkits.mplot3d import Axes3D
from IPython import display
from matplotlib import pyplot as plt
import sklearn
import sklearn.manifold
import tensorflow as tf
from sklearn import preprocessing
import time
import seaborn as sns
import re
from gensim.models import word2vec
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import os
from keras.models import Sequential
from keras.layers import Embedding
tf.logging.set_verbosity(tf.logging.ERROR)
# Add some convenience functions to Pandas DataFrame.
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.3f}'.format
def mask(df, key, function):
    """Returns a filtered DataFrame, obtained by applying `function` to `df[key]`."""
    return df[function(df[key])]

def flatten_cols(df):
    df.columns = [' '.join(col).strip() for col in df.columns.values]
    return df
pd.DataFrame.mask = mask
pd.DataFrame.flatten_cols = flatten_cols
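"""A quick illustration of the two helpers patched onto `pd.DataFrame` above,
on hypothetical toy data: `mask` filters rows with a predicate on one column,
and `flatten_cols` collapses the MultiIndex columns produced by
`groupby().agg()` into flat, space-joined names (e.g. 'listen_count count'),
as used further below.
"""
# toy = pd.DataFrame({'user_id': [0, 0, 1], 'listen_count': [1, 5, 3]})
# toy.mask('listen_count', lambda s: s > 2)   # keeps the rows where listen_count > 2
# toy.groupby('user_id').agg({'listen_count': ['count', 'mean']}).flatten_cols()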
# Install the spreadsheet library and import the authentication module.
USER_RATINGS = False
!pip install --upgrade -q gspread
from google.colab import auth
import gspread
from oauth2client.client import GoogleCredentials
"""## Load data
song_df_1 is the table of users' listening records, with columns 'user_id', 'song_id', and 'listen_count'.
song_df_2 is the song metadata file, with columns 'song_id', 'title', 'release', 'artist_name', and 'year'.
"""
triplets_file = 'https://static.turi.com/datasets/millionsong/10000.txt'
songs_metadata_file = 'https://static.turi.com/datasets/millionsong/song_data.csv'
song_df_1 = pd.read_table(triplets_file,header=None)
song_df_1.columns = ['user_id', 'song_id', 'listen_count']
#Read song metadata
song_df_2 = pd.read_csv(songs_metadata_file)
song_df_1.head()
song_df_2.head()
len(song_df_2.artist_name.unique())
len(song_df_2.title.unique())
"""## Data cleaning
The song_id and user_id values are very long strings; we can make processing more efficient by converting them to integers with label encoding.
"""
le_song_id = preprocessing.LabelEncoder()
le_song_id.fit(song_df_2.song_id)
song_df_2.song_id = le_song_id.transform(song_df_2.song_id)
song_df_1.song_id = le_song_id.transform(song_df_1.song_id)
le_user_id = preprocessing.LabelEncoder()
song_df_1.user_id = le_user_id.fit_transform(song_df_1.user_id)
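"""The fitted encoders are kept so the integer ids can be mapped back to the
original strings later if needed; a minimal round-trip sketch (illustrative
only):
"""
# le_song_id.inverse_transform([0])                        # original song_id string for id 0
# song_df_2.song_id.max() + 1 == len(le_song_id.classes_)  # ids run from 0 to n - 1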
"""## Merge two dataset"""
#Merge the two dataframes above to create input dataframe for recommender systems
song_df = pd.merge(song_df_1, \
song_df_2.drop_duplicates(['song_id']), \
on="song_id", how="left")
song_df.head()
song_df.describe(include=[np.int, np.object])
song_df = song_df.fillna(value=0)
"""#### Hiistogram of listen_count In log scale"""
g = sns.distplot(song_df.listen_count, kde=False, rug=False)
g.set_yscale('log')
"""#### number of unique user"""
len(pd.unique(song_df.user_id))
"""#### number of unique song"""
len(pd.unique(song_df.song_id))
"""#### check the most popular songs"""
song_ratings = song_df_2.merge(
song_df
.groupby('song_id', as_index=False)
.agg({'listen_count': ['count', 'mean']})
.flatten_cols(),
on='song_id')
sorted_song = (song_ratings[['title', 'listen_count count', 'listen_count mean']]
.sort_values('listen_count count', ascending=False))
sorted_song.head(10)
"""## Data Normalization
The maximum 'listen_count' is 8277, while the minimum is 0. This large range would cause problems for the model, so the values need to be normalized.
Here, I first group the data by user and then normalize the listen_count values.
"""
zscore = lambda x: (x - x.mean()) / x.std()
# min_max = lambda x: (x - x.min()) / (x.max() - x.min())
norm = song_df.groupby('user_id').transform(zscore)
song_df.listen_count = norm.listen_count
song_df.head()
# Rescale the per-user z-scores to the [0, 1] range.
min_max = lambda x: (x - x.min()) / (x.max() - x.min())
norm = song_df.groupby('user_id').transform(min_max)
song_df.listen_count = norm.listen_count
song_df.head()
# Users with a constant listen_count divide by a zero std / range above and
# produce NaNs, so fill those with 0.
song_df = song_df.fillna(value=0)
"""# Build a collaborative filtering model
## CFModel (Collaborative Filtering Model) helper class
This is a simple class to train a matrix factorization model using stochastic gradient descent.
The class constructor takes
- the user embeddings U (a `tf.Variable`).
- the song embeddings V (a `tf.Variable`).
- a loss to optimize (a `tf.Tensor`).
- an optional list of metrics dictionaries, each mapping a string (the name of the metric) to a tensor. These are evaluated and plotted during training (e.g. training error and test error).
After training, one can access the trained embeddings using the `model.embeddings` dictionary.
Example usage:
```
U_var = ...
V_var = ...
loss = ...
model = CFModel(U_var, V_var, loss)
model.train(iterations=100, learning_rate=1.0)
user_embeddings = model.embeddings['user_id']
song_embeddings = model.embeddings['song_id']
```
"""
# Utility to split the data into training and test sets.
def split_dataframe(df, holdout_fraction=0.1):
    """Splits a DataFrame into training and test sets.
    Args:
      df: a dataframe.
      holdout_fraction: fraction of dataframe rows to use in the test set.
    Returns:
      train: dataframe for training
      test: dataframe for testing
    """
    test = df.sample(frac=holdout_fraction, replace=False)
    train = df[~df.index.isin(test.index)]
    return train, test
def build_rating_sparse_tensor(song_df):
    """
    Args:
      song_df: a pd.DataFrame with `user_id`, `song_id` and `listen_count` columns.
    Returns:
      a tf.SparseTensor representing the ratings matrix.
    """
    indices = song_df[['user_id', 'song_id']].values
    # cast to float32 so the values match the dtype of the embeddings
    values = song_df['listen_count'].values.astype(np.float32)
    # the dense shape is (number of users) x (number of songs); both ids are
    # label encoded above, so they run from 0 to n - 1
    return tf.SparseTensor(
        indices=indices,
        values=values,
        dense_shape=[song_df_1.user_id.nunique(), song_df_2.shape[0]])
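"""A tiny illustration of the SparseTensor layout built above, on hypothetical
data: each row of `indices` is one observed (user_id, song_id) pair and
`values` holds the matching listen count, so the huge ratings matrix is never
densified.
"""
# toy = pd.DataFrame({'user_id': [0, 0, 1],
#                     'song_id': [2, 5, 2],
#                     'listen_count': [0.3, 1.0, 0.7]})
# build_rating_sparse_tensor(toy)
# -> SparseTensor(indices=[[0, 2], [0, 5], [1, 2]], values=[0.3, 1.0, 0.7], ...)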
def sparse_mean_square_error(sparse_ratings, user_embeddings, music_embeddings):
    """
    Args:
      sparse_ratings: A SparseTensor rating matrix, of dense_shape [N, M]
      user_embeddings: A dense Tensor U of shape [N, k] where k is the embedding
        dimension, such that U_i is the embedding of user i.
      music_embeddings: A dense Tensor V of shape [M, k] where k is the embedding
        dimension, such that V_j is the embedding of music j.
    Returns:
      A scalar Tensor representing the MSE between the true ratings and the
      model's predictions.
    """
    predictions = tf.reduce_sum(
        tf.gather(user_embeddings, sparse_ratings.indices[:, 0]) *
        tf.gather(music_embeddings, sparse_ratings.indices[:, 1]),
        axis=1)
    loss = tf.losses.mean_squared_error(sparse_ratings.values, predictions)
    return loss
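"""For reference, the loss above is the mean squared error over the observed
entries $\Omega$ of the ratings matrix,
$$
\text{MSE}(A, UV^\top) = \frac{1}{|\Omega|} \sum_{(i, j) \in \Omega} \big(A_{ij} - \langle U_i, V_j \rangle\big)^2,
$$
where the two `tf.gather` calls pick out $U_i$ and $V_j$ for every observed
pair, and the element-wise product summed over the embedding axis gives
$\langle U_i, V_j \rangle$.
"""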
class CFModel(object):
    """Simple class that represents a collaborative filtering model."""
    def __init__(self, embedding_vars, loss, metrics=None):
        """Initializes a CFModel.
        Args:
          embedding_vars: A dictionary of tf.Variables.
          loss: A float Tensor. The loss to optimize.
          metrics: optional list of dictionaries of Tensors. The metrics in each
            dictionary will be plotted in a separate figure during training.
        """
        self._embedding_vars = embedding_vars
        self._loss = loss
        self._metrics = metrics
        self._embeddings = {k: None for k in embedding_vars}
        self._session = None

    @property
    def embeddings(self):
        """The embeddings dictionary."""
        return self._embeddings

    def train(self, num_iterations=100, learning_rate=1.0, plot_results=True,
              optimizer=tf.train.GradientDescentOptimizer):
        """Trains the model.
        Args:
          num_iterations: number of iterations to run.
          learning_rate: optimizer learning rate.
          plot_results: whether to plot the results at the end of training.
          optimizer: the optimizer to use. Defaults to GradientDescentOptimizer.
        Returns:
          The metrics dictionary evaluated at the last iteration.
        """
        with self._loss.graph.as_default():
            opt = optimizer(learning_rate)
            train_op = opt.minimize(self._loss)
            local_init_op = tf.group(
                tf.variables_initializer(opt.variables()),
                tf.local_variables_initializer())
            if self._session is None:
                self._session = tf.Session()
                with self._session.as_default():
                    self._session.run(tf.global_variables_initializer())
                    self._session.run(tf.tables_initializer())
                    tf.train.start_queue_runners()
        with self._session.as_default():
            local_init_op.run()
            iterations = []
            metrics = self._metrics or ({},)
            # iterate over `metrics` here, not self._metrics (which may be None)
            metrics_vals = [collections.defaultdict(list) for _ in metrics]
            # Train and append results.
            for i in range(num_iterations + 1):
                _, results = self._session.run((train_op, metrics))
                if (i % 10 == 0) or i == num_iterations:
                    print("\r iteration %d: " % i + ", ".join(
                        ["%s=%f" % (k, v) for r in results for k, v in r.items()]),
                        end='')
                    iterations.append(i)
                    for metric_val, result in zip(metrics_vals, results):
                        for k, v in result.items():
                            metric_val[k].append(v)
            for k, v in self._embedding_vars.items():
                self._embeddings[k] = v.eval()
            if plot_results:
                # Plot the metrics.
                num_subplots = len(metrics) + 1
                fig = plt.figure()
                fig.set_size_inches(num_subplots * 10, 8)
                for i, metric_vals in enumerate(metrics_vals):
                    ax = fig.add_subplot(1, num_subplots, i + 1)
                    for k, v in metric_vals.items():
                        ax.plot(iterations, v, label=k)
                    ax.set_xlim([1, num_iterations])
                    ax.legend()
            return results
def build_model(ratings, embedding_dim=3, init_stddev=1.):
    """
    Args:
      ratings: a DataFrame of the ratings
      embedding_dim: the dimension of the embedding vectors.
      init_stddev: float, the standard deviation of the random initial embeddings.
    Returns:
      model: a CFModel.
    """
    # Split the ratings DataFrame into train and test.
    train_ratings, test_ratings = split_dataframe(ratings)
    # SparseTensor representation of the train and test datasets.
    A_train = build_rating_sparse_tensor(train_ratings)
    A_test = build_rating_sparse_tensor(test_ratings)
    # Initialize the embeddings using a normal distribution.
    U = tf.Variable(tf.random_normal(
        [A_train.dense_shape[0], embedding_dim], stddev=init_stddev))
    V = tf.Variable(tf.random_normal(
        [A_train.dense_shape[1], embedding_dim], stddev=init_stddev))
    train_loss = sparse_mean_square_error(A_train, U, V)
    test_loss = sparse_mean_square_error(A_test, U, V)
    metrics = {
        'train_error': train_loss,
        'test_error': test_loss
    }
    embeddings = {
        "user_id": U,
        "song_id": V
    }
    return CFModel(embeddings, train_loss, [metrics])
# Build the CF model and train it.
model = build_model(song_df, embedding_dim=3, init_stddev=0.5)
model.train(num_iterations=5000, learning_rate=25.)
"""## Find similar items"""
DOT = 'dot'
COSINE = 'cosine'
def compute_scores(query_embedding, item_embeddings, measure=DOT):
    """Computes the scores of the candidates given a query.
    Args:
      query_embedding: a vector of shape [k], representing the query embedding.
      item_embeddings: a matrix of shape [N, k], such that row i is the embedding
        of item i.
      measure: a string specifying the similarity measure to be used. Can be
        either DOT or COSINE.
    Returns:
      scores: a vector of shape [N], such that scores[i] is the score of item i.
    """
    u = query_embedding
    V = item_embeddings
    if measure == COSINE:
        V = V / np.linalg.norm(V, axis=1, keepdims=True)
        u = u / np.linalg.norm(u)
    scores = u.dot(V.T)
    return scores
def music_neighbors(model, title_substring, measure=DOT, k=6):
    # Search for song ids whose title contains the given substring.
    ids = music[music['title'].str.contains(title_substring, na=False)].index.values
    titles = music.iloc[ids]['title'].values
    if len(titles) == 0:
        raise ValueError("Found no music with title %s" % title_substring)
    print("Nearest neighbors of: %s." % titles[0])
    if len(titles) > 1:
        print("[Found more than one matching song. Other candidates: {}]".format(
            ", ".join(titles[1:])))
    song_id = ids[0]
    scores = compute_scores(
        model.embeddings["song_id"][song_id], model.embeddings["song_id"],
        measure)
    score_key = measure + ' score'
    df = pd.DataFrame({
        score_key: list(scores),
        'titles': music['title'],
    })
    display.display(df.sort_values([score_key], ascending=False).head(k))
"""#### Find similar songs to Stronger"""
music = song_df_2
music_neighbors(model, "Stronger", COSINE)
music_neighbors(model, "Stronger", DOT)
"""## Regularization In Matrix Factorization
In the previous section, the loss was defined as the mean squared error on the observed part of the rating matrix. This can be problematic, as the model does not learn how to place the embeddings of irrelevant songs. This phenomenon is known as *folding*.
We will add regularization terms that will address this issue. We will use two types of regularization:
- Regularization of the model parameters. This is a common $\ell_2$ regularization term on the embedding matrices, given by $r(U, V) = \frac{1}{N} \sum_i \|U_i\|^2 + \frac{1}{M}\sum_j \|V_j\|^2$.
- A global prior that pushes the prediction of any pair towards zero, called the *gravity* term. This is given by $g(U, V) = \frac{1}{MN} \sum_{i = 1}^N \sum_{j = 1}^M \langle U_i, V_j \rangle^2$.
The total loss is then given by
$$
\frac{1}{|\Omega|}\sum_{(i, j) \in \Omega} (A_{ij} - \langle U_i, V_j\rangle)^2 + \lambda _r r(U, V) + \lambda_g g(U, V)
$$
where $\lambda_r$ and $\lambda_g$ are two regularization coefficients (hyper-parameters).
"""
def gravity(U, V):
    """Creates a gravity loss given two embedding matrices."""
    return 1. / (U.shape[0].value * V.shape[0].value) * tf.reduce_sum(
        tf.matmul(U, U, transpose_a=True) * tf.matmul(V, V, transpose_a=True))
def build_regularized_model(
        ratings, embedding_dim=3, regularization_coeff=.1, gravity_coeff=1.,
        init_stddev=0.1):
    """
    Args:
      ratings: the DataFrame of song ratings.
      embedding_dim: The dimension of the embedding space.
      regularization_coeff: The regularization coefficient lambda.
      gravity_coeff: The gravity regularization coefficient lambda_g.
      init_stddev: float, the standard deviation of the random initial embeddings.
    Returns:
      A CFModel object that uses a regularized loss.
    """
    # Split the ratings DataFrame into train and test.
    train_ratings, test_ratings = split_dataframe(ratings)
    # SparseTensor representation of the train and test datasets.
    A_train = build_rating_sparse_tensor(train_ratings)
    A_test = build_rating_sparse_tensor(test_ratings)
    U = tf.Variable(tf.random_normal(
        [A_train.dense_shape[0], embedding_dim], stddev=init_stddev))
    V = tf.Variable(tf.random_normal(
        [A_train.dense_shape[1], embedding_dim], stddev=init_stddev))
    error_train = sparse_mean_square_error(A_train, U, V)
    error_test = sparse_mean_square_error(A_test, U, V)
    gravity_loss = gravity_coeff * gravity(U, V)
    regularization_loss = regularization_coeff * (
        tf.reduce_sum(U * U) / U.shape[0].value + tf.reduce_sum(V * V) / V.shape[0].value)
    total_loss = error_train + regularization_loss + gravity_loss
    losses = {
        'train_error_observed': error_train,
        'test_error_observed': error_test,
    }
    loss_components = {
        'observed_loss': error_train,
        'regularization_loss': regularization_loss,
        'gravity_loss': gravity_loss,
    }
    embeddings = {"user_id": U, "song_id": V}
    return CFModel(embeddings, total_loss, [losses, loss_components])
reg_model = build_regularized_model(
song_df, regularization_coeff=0.1, gravity_coeff=1.0, embedding_dim=35,
init_stddev=.05)
reg_model.train(num_iterations=2000, learning_rate=20.)
np.shape(reg_model.embeddings['song_id'])
np.shape(reg_model.embeddings['user_id'])
music_neighbors(reg_model, "Stronger", DOT)
music_neighbors(reg_model, "Stronger", COSINE)
check_user_list = song_df[song_df.title == 'Stronger'].user_id.unique()
print(song_df[song_df.user_id == check_user_list[100]].title.unique())
"""## Make your own prediction to build a play list"""
USER_RATINGS = True #@param {type:"boolean"}
users = song_df_1
# @title Run to create a spreadsheet, then use it to enter your ratings.
# Authenticate user.
if USER_RATINGS:
    auth.authenticate_user()
    gc = gspread.authorize(GoogleCredentials.get_application_default())
    # Create the spreadsheet and print a link to it.
    try:
        sh = gc.open('music-test')
    except gspread.SpreadsheetNotFound:
        sh = gc.create('music-test')
    worksheet = sh.sheet1
    titles = music['title'].values[0:1000] # take the first 1000 songs
    # titles = [re.sub(r'/\s+/g', '-', str(item)) for item in titles]
    cell_list = worksheet.range(1, 1, len(titles), 1)
    for cell, title in zip(cell_list, titles):
        cell.value = title
    worksheet.update_cells(cell_list)
    print("Link to the spreadsheet: "
          "https://docs.google.com/spreadsheets/d/{}/edit".format(sh.id))
# Run to load your ratings.
# Load the ratings from the spreadsheet and create a DataFrame.
if USER_RATINGS:
    my_ratings = pd.DataFrame.from_records(worksheet.get_all_values()).reset_index()
    my_ratings = my_ratings[my_ratings[1] != '']
    my_ratings = pd.DataFrame({
        'user_id': "943",
        'title': list(map(str, my_ratings[0])),
        'listen_count': list(map(float, my_ratings[1])),
    })
    # Attach the song metadata (song_id, etc.) to the new ratings.
    my_ratings = my_ratings.merge(song_df_2, on="title", how="left")
    # Remove previous ratings.
    song_df = song_df[song_df.user_id != "943"]
    # Add new ratings.
    song_df = song_df.append(my_ratings, ignore_index=True)
    # Add new user to the users DataFrame.
    if users.shape[0] == 943:
        users = users.append(users.iloc[942], ignore_index=True)
        users["user_id"][943] = "943"
    print("Added your %d ratings; you have great taste!" % len(my_ratings))
song_df[song_df.user_id=="943"].merge(music[['song_id', 'title']])
def user_recommendations(model, measure=DOT, exclude_rated=False, k=20):
    if USER_RATINGS:
        scores = compute_scores(
            model.embeddings["user_id"][943], model.embeddings["song_id"], measure)
        score_key = measure + ' score'
        df = pd.DataFrame({
            score_key: list(scores),
            'song_id': music['song_id'],
            'titles': music['title'],
        })
        if exclude_rated:
            # remove songs that have already been rated
            rated_music = song_df[song_df.user_id == "943"]["song_id"].values
            df = df[df.song_id.apply(lambda song_id: song_id not in rated_music)]
        display.display(df.sort_values([score_key], ascending=False).head(k))
if USER_RATINGS:
    user_recommendations(reg_model, measure=DOT, k=20)
"""## Build a content-based recommendation system
For the music data, we have "song_id", "title", "release", "artist_name", and "year".
We can first use Word2Vec to convert the title and artist name into vectors; then, after some normalization and combining them with the year, we can use kNN to generate the song embedding (V) for an unseen item and feed it back into the collaborative model in order to recommend unseen songs to users.
### Use Word2Vec to convert each song title to a 50-dimensional vector by averaging the vectors of the words in the title
"""
# artist = set(song_df.artist_name.values)
music = song_df_2 # note that some songs are not covered in the first (listening) file
song_name = music.title.values
song_name_clean = [re.sub(r'[^\w]', ' ', str(item)) for item in song_name]
song_name_clean = [re.sub(r" \d+", '', str(item.strip())) for item in song_name_clean]
sentences = list()
for item in song_name_clean:
    sentences.append(item.split())
unique_sentence = np.unique(sentences) # build the model on unique sentences only, to save time
## create word2vec model
# Set values for NN parameters
num_features = 50 # Word vector dimensionality
min_word_count = 50
num_workers = 1 # Number of CPUs
context = 3 # Context window size;
downsampling = 1e-3 # threshold for configuring which
# higher-frequency words are randomly downsampled
# Initialize and train the model
model_wv = word2vec.Word2Vec(unique_sentence, workers=num_workers, \
size=num_features, min_count = min_word_count, \
window = context, sample = downsampling)
# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model_wv.init_sims(replace=True)
# model_wv.most_similar(u'Love') # similar word to love
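"""The averaging described in the section heading, sketched directly for a
single title (illustrative only; assumes `model_wv` as trained above and skips
words outside its vocabulary):
"""
# def title_vector(title):
#     words = [w for w in title.split() if w in model_wv.wv.vocab]
#     return np.mean([model_wv.wv[w] for w in words], axis=0) if words else None
# title_vector('Learn To Fly')   # -> a 50-dimensional vector, or None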
"""Store the word2vec model"""
corpus = sorted(model_wv.wv.vocab.keys())
emb_tuple = tuple([model_wv[v] for v in corpus])
X = np.vstack(emb_tuple)
model_wv.wv.save_word2vec_format('song_tile_embedding.txt', binary = False)
"""Converting the song title one by and one and average the word vector is too time-consuming, we can take advantage of the GPU by building a neural network model and fix the weights as the Word2Vec weights to convert our data to vector"""
X = sentences
EMBEDDING_DIM = num_features
max_length = max([len(s) for s in X])
# maximum number of words in a title
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(X)
X_token = tokenizer_obj.texts_to_sequences(X)
X_pad = pad_sequences(X_token, maxlen = max_length, padding = 'post')
embeddings_index = {}
f = open(os.path.join('', 'song_tile_embedding.txt'), encoding='utf-8')
next(f)  # skip the word2vec_format header line ("<vocab size> <dimension>")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
word_index = tokenizer_obj.word_index
num_words = len(word_index) + 1
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in the embedding index will be left as all-zeros
        embedding_matrix[i] = embedding_vector[-EMBEDDING_DIM:]
model_wv_seq = Sequential()
embedding_layer = Embedding(len(word_index) + 1,
EMBEDDING_DIM,
weights=[embedding_matrix],
input_length=max_length,
trainable=False)
model_wv_seq.add(embedding_layer)
model_wv_seq.compile(optimizer = 'adam', loss = 'mse')
model_wv_seq.save_weights('model_wv_seq.hdf5')
"""However, to compute all the song we have, it is still very time consuming. Here, I play a small trick. For a new song X_new, something that the model hasn't seen before and I would like to recommend it to some users.
Here is the step:
1. Compute the word2vec vector for each word in the title, and average it on each dimension to get a vector of the whole title, here I chose a dimension of 50
1. Randomly pick a few songs in the selected database, also compute the vector representation of the title and then calculate its similarity to the new song. If I manage to find enough songs that passed that threshold, find the users who listened to these old songs and recommend the new song to this user.
Note that here I used the information of the title, but it can be any text, including lyrics. However, to get the vector representation of a song's lyrics, which will be much longer than the title, should use something rather than averaging. One idea is to use a LSTM autoencoder to genreate the features.
"""
# pick an unseen song from our pool
old_songs = set(song_df_1.song_id.values)
all_songs = set(song_df_2.song_id.values)
unseen_song = all_songs - old_songs
# each pop() draws a different unseen song; only the last one is kept
new_title = song_df_2[song_df_2.song_id == unseen_song.pop()].title
new_title = song_df_2[song_df_2.song_id == unseen_song.pop()].title
new_title = song_df_2[song_df_2.song_id == unseen_song.pop()].title
new_title
new_sentences = list()
for item in new_title:
    new_sentences.append(item.split())
X_token = tokenizer_obj.texts_to_sequences(new_sentences)
X_pad_new = pad_sequences(X_token, maxlen = max_length, padding = 'post')
Song_vector_new = model_wv_seq.predict(X_pad_new)
Song_vector_copy = Song_vector_new.copy()
# padded positions map to the all-zero embedding row; mask them with NaN
Song_vector_copy[Song_vector_copy == 0] = np.nan
means_new_song = np.nanmean(Song_vector_copy, axis=1) # average over words; axis 0 indexes the examples
np.nonzero(Song_vector_new)
Song_vector = model_wv_seq.predict(X_pad[0:16, :], batch_size=4) # vectorize the first 16 known titles for comparison
Song_vector_copy = Song_vector.copy()
Song_vector_copy[Song_vector_copy == 0] = np.nan # mask padding as above
means = np.nanmean(Song_vector_copy, axis=1) # average over words; axis 0 indexes the examples
np.shape(means)
means[0]
from sklearn.metrics.pairwise import cosine_similarity
means[2]
scores = cosine_similarity(means[0].reshape(1, -1), means[2].reshape(1, -1))
scores
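"""The notebook stops at a single similarity score. Below is a minimal sketch
of the remaining step described earlier: sample some old songs, keep those
whose title vectors pass a similarity threshold, and collect the users who
listened to them. It assumes the objects defined above (`tokenizer_obj`,
`model_wv_seq`, `max_length`, `old_songs`, `song_df_1`, `song_df_2`); the
sample size and the 0.5 threshold are arbitrary illustrative choices, not
tuned values.
"""
def users_for_new_song(new_title_text, n_candidates=100, threshold=0.5):
    """Finds users of old songs whose titles are similar to a new song's title."""
    def title_vectors(texts):
        # tokenize, pad, and run through the fixed-weight embedding model
        tokens = tokenizer_obj.texts_to_sequences(texts)
        padded = pad_sequences(tokens, maxlen=max_length, padding='post')
        vecs = model_wv_seq.predict(padded)
        # mask the all-zero padding rows, then average over the words;
        # titles with no in-vocabulary words become NaN, so zero them out
        vecs[vecs == 0] = np.nan
        return np.nan_to_num(np.nanmean(vecs, axis=1))
    new_vec = title_vectors([str(new_title_text)])
    # vectorize the titles of a random sample of old songs the same way
    old = song_df_2[song_df_2.song_id.isin(old_songs)].sample(n_candidates)
    old_vecs = title_vectors([str(t) for t in old.title])
    # keep the old songs whose titles pass the similarity threshold ...
    sims = cosine_similarity(new_vec, old_vecs).ravel()
    similar_ids = old.song_id.values[sims > threshold]
    # ... and return the users who listened to them
    return song_df_1[song_df_1.song_id.isin(similar_ids)].user_id.unique()

# e.g. users_for_new_song(new_title.values[0])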