combine listwise ranking and feature preprocessing #702

Open
hyyan112 opened this issue Nov 7, 2023 · 0 comments
hyyan112 commented Nov 7, 2023

https://www.tensorflow.org/recommenders/examples/listwise_ranking
Following this document, I wrote some code:

import pprint

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs

ratings = tfds.load("movielens/100k-ratings", split="train")
movies = tfds.load("movielens/100k-movies", split="train")

ratings = ratings.map(
    lambda x: {
        "movie_title": x["movie_title"],
        "user_id": x["user_id"],
        "user_rating": x["user_rating"],
    }
)
movies = movies.map(lambda x: x["movie_title"])

unique_movie_titles = np.unique(np.concatenate(list(movies.batch(1000))))
unique_user_ids = np.unique(
    np.concatenate(list(ratings.batch(1_000).map(lambda x: x["user_id"])))
)

tf.random.set_seed(42)

# Split between train and test sets, as before.
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

# We sample 50 lists for each user for the training data. For each list we
# sample 5 movies from the movies the user rated.
train = tfrs.examples.movielens.sample_listwise(
    train, num_list_per_user=50, num_examples_per_list=5, seed=42
)
test = tfrs.examples.movielens.sample_listwise(
    test, num_list_per_user=1, num_examples_per_list=5, seed=42
)

for example in train.take(1):
    pprint.pprint(example)


class RankingModel(tfrs.Model):
    def __init__(self, loss):
        super().__init__()
        embedding_dimension = 32

        self.user_embeddings = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(vocabulary=unique_user_ids),
                tf.keras.layers.Embedding(
                    len(unique_user_ids) + 2, embedding_dimension
                ),
            ]
        )

        self.movie_embeddings = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(vocabulary=unique_movie_titles),
                tf.keras.layers.Embedding(
                    len(unique_movie_titles) + 2, embedding_dimension
                ),
            ]
        )

        max_tokens = 1000
        self.title_text_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.TextVectorization(max_tokens=max_tokens),
                tf.keras.layers.Embedding(max_tokens, embedding_dimension, mask_zero=True),
                # We average the embedding of individual words to get one embedding vector
                # per title.
                tf.keras.layers.GlobalAveragePooling1D(),
            ]
        )

        self.score_model = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(256, activation="relu"),
                tf.keras.layers.Dense(64, activation="relu"),
                tf.keras.layers.Dense(1),
            ]
        )

        self.task = tfrs.tasks.Ranking(
            loss=loss,
            metrics=[
                tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
                tf.keras.metrics.RootMeanSquaredError(),
            ],
        )

    def call(self, features):
        tf.print(features)
        user_embeddings = self.user_embeddings(features["user_id"])

        movie_embeddings = self.movie_embeddings(features["movie_title"])

        text_embeddings = self.title_text_embedding(features["movie_title"])

        movie_model = tf.concat([movie_embeddings, text_embeddings], axis=1)

        list_length = features["movie_title"].shape[1]
        user_embedding_repeated = tf.repeat(
            tf.expand_dims(user_embeddings, 1), [list_length], axis=1
        )

        concatenated_embeddings = tf.concat(
            [user_embedding_repeated, movie_model], 2
        )

        return self.score_model(concatenated_embeddings)

    def compute_loss(self, features, training=False):
        labels = features.pop("user_rating")

        scores = self(features)

        return self.task(
            labels=labels,
            predictions=tf.squeeze(scores, axis=-1),
        )


epochs = 30

cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(8192).cache()

listwise_model = RankingModel(tfr.keras.losses.ListMLELoss())
listwise_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

listwise_model.fit(cached_train, epochs=epochs, verbose=False)

listwise_model_result = listwise_model.evaluate(cached_test, return_dict=True)
print("NDCG of the ListMLE model: {:.4f}".format(listwise_model_result["ndcg_metric"]))

Notice that I added a text feature:

text_embeddings = self.title_text_embedding(features["movie_title"])

but got this error:

in user code:
    
        File "xxxxx/rank_demo.py", line 104, in call  *
            text_embeddings = self.title_text_embedding(features["movie_title"])
        File "xxxxx/traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "xxxxx/text_vectorization.py", line 573, in _preprocess
            raise ValueError(
    
        ValueError: Exception encountered when calling layer 'text_vectorization' (type TextVectorization).
        
        When using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(None, 5) with rank=2
        
        Call arguments received by layer 'text_vectorization' (type TextVectorization):
          • inputs=tf.Tensor(shape=(None, 5), dtype=string)
    
    
    Call arguments received by layer 'ranking_model' (type RankingModel):
      • features={'user_id': 'tf.Tensor(shape=(None,), dtype=string)', 'movie_title': 'tf.Tensor(shape=(None, 5), dtype=string)'}

I think it's because tfrs.examples.movielens.sample_listwise reshaped the dataset so that movie_title has shape=(None, 5), but TextVectorization only accepts rank-1 input (or a last dimension of 1). How should I fix it? I really need some help here.
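Would something like the following work: flatten the (batch, list_size) title tensor to rank 1 before it goes into TextVectorization, then reshape the pooled embeddings back to (batch, list_size, embedding_dim)? This is only an untested sketch (the titles and shapes below are made up, and the vectorizer would still need to be adapted on movies as in the featurization tutorial); I guess the concat in call would then also have to use the last axis.

import tensorflow as tf

max_tokens = 1000

# Same text sub-model as in the model above, kept standalone for this sketch.
title_vectorizer = tf.keras.layers.TextVectorization(max_tokens=max_tokens)
title_text_embedding = tf.keras.Sequential([
    title_vectorizer,
    tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
    tf.keras.layers.GlobalAveragePooling1D(),
])

# The vectorizer still has to be adapted to the movie titles; a toy vocabulary here.
title_vectorizer.adapt(tf.constant([
    "One Flew Over the Cuckoo's Nest (1975)",
    "Strictly Ballroom (1992)",
]))

# A (batch, list_size) string tensor, as produced by sample_listwise.
titles = tf.constant([[
    "One Flew Over the Cuckoo's Nest (1975)",
    "Strictly Ballroom (1992)",
]])  # shape (1, 2)

# Flatten to rank 1 so TextVectorization accepts it, embed, then restore the list axis.
flat_titles = tf.reshape(titles, [-1])                # shape (2,)
flat_embeddings = title_text_embedding(flat_titles)   # shape (2, 32)
text_embeddings = tf.reshape(
    flat_embeddings, [tf.shape(titles)[0], tf.shape(titles)[1], -1]
)                                                      # shape (1, 2, 32)

print(text_embeddings.shape)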

hyyan112 changed the title from "combine listwise ranking with feature preprocessing" to "combine listwise ranking and feature preprocessing" on Nov 7, 2023