Adding best practices and cleaning code #191

Open · wants to merge 8 commits into base: main

Changes from all commits
13 changes: 13 additions & 0 deletions sourcecode/scoring/matrix_factorization/config.json
@@ -0,0 +1,13 @@
{
  "PARAMS": {
    "l2_lambda": 0.03,
    "l2_intercept_multiplier": 5,
    "init_lr": 0.2,
    "noinit_lr": 1.0,
    "convergence": 1e-7,
    "num_factors": 1,
    "use_global_intercept": true,
    "logging": true,
    "flip_factor_identification": true
  }
}
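The new config.json centralizes the hyperparameters that were previously hard-coded as constructor defaults. A minimal sketch of loading these defaults and overriding one of them, assuming the PARAMS layout above (the num_factors override is purely illustrative). Note that use_sigmoid_crossentropy is type-checked in matrix_factorization.py below but is not set here, so it stays unset unless a caller supplies it:

import json

# Load the shipped defaults and override one hyperparameter for an experiment.
with open("sourcecode/scoring/matrix_factorization/config.json") as f:
  params = json.load(f)["PARAMS"]

experiment_params = dict(params, num_factors=2)  # hypothetical override

The resulting dict can then be passed as the config argument of the MatrixFactorization constructor introduced below.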
166 changes: 107 additions & 59 deletions sourcecode/scoring/matrix_factorization/matrix_factorization.py
@@ -1,3 +1,4 @@
import os
import dataclasses
from typing import List, Optional, Tuple

@@ -8,6 +9,26 @@
import pandas as pd
import torch

import json

current_file_path = os.path.dirname(os.path.abspath(__file__))
config_path = os.path.join(current_file_path, "config.json")
with open(config_path) as json_file:
  config = json.load(json_file)

CONFIG_PARAMS = config.get("PARAMS", {})
EXPECTED_TYPES = {
  "l2_lambda": float,
  "l2_intercept_multiplier": int,
  "init_lr": float,
  "noinit_lr": float,
  "convergence": float,
  "num_factors": int,
  "use_global_intercept": bool,
  "use_sigmoid_crossentropy": bool,
  "logging": bool,
  "flip_factor_identification": bool,
}
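A standalone sketch of the coercion rule that __init__ applies below; the coerce helper is hypothetical and exists only to illustrate the behavior of the validation loop:

# Hypothetical helper mirroring the validation loop in __init__ below.
def coerce(param, value):
  expected_type = EXPECTED_TYPES[param]
  if value is not None and not isinstance(value, expected_type):
    value = expected_type(value)  # raises ValueError if not convertible
  return value

assert coerce("l2_lambda", "0.03") == 0.03  # str coerced to float
assert coerce("num_factors", 1.0) == 1      # float coerced to int
# Caveat: bool("false") is True in Python, so boolean parameters should be
# genuine JSON booleans rather than strings.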

@dataclasses.dataclass
class Constants:
@@ -16,52 +37,80 @@ class Constants:


class MatrixFactorization:
  """
  This class implements a Matrix Factorization model, commonly used in recommendation systems
  and collaborative filtering. It decomposes a matrix into the product of two lower-dimensional
  matrices, capturing latent factors in the data.

  Attributes:
    l2_lambda (float): Regularization parameter for L2 regularization.
    l2_intercept_multiplier (int): Multiplier applied to the intercept terms in L2 regularization.
    init_lr (float): Initial learning rate for the optimizer when initial values are provided.
    noinit_lr (float): Learning rate used when no initial values are provided.
    convergence (float): Convergence threshold for the training process.
    num_factors (int): Number of latent factors to model.
    use_global_intercept (bool): Flag to use a global intercept in the model.
    use_sigmoid_crossentropy (bool): Use sigmoid cross-entropy loss if True, else mean squared error loss.
    logging (bool): Enable or disable logging.
    flip_factor_identification (bool): Adjust factor signs for model identification.
    model (BiasedMatrixFactorization, optional): An instance of a biased matrix factorization model.
    feature_cols (List[str]): Feature columns to use in the model.
    label_col (str): Label column in the data.
    pos_weight (Optional[float]): Positive-class weight for the loss function.

  Methods:
    get_final_train_error(): Returns the final training error after model fitting.
    get_new_mf_with_same_args(): Creates a new instance of MatrixFactorization with the same configuration.
    _initialize_note_and_rater_id_maps(ratings): Initializes mappings for note and rater IDs based on the provided ratings DataFrame.
    get_note_and_rater_id_maps(ratings): Extracts and returns mappings for note and rater IDs along with processed rating features and labels.
    _initialize_parameters(): Initializes or resets the model parameters with given initial values or defaults.
    _get_parameters_from_trained_model(): Retrieves parameters from the trained model for analysis or further use.
    _create_mf_model(): Initializes the matrix factorization model and its parameters.
    _compute_and_print_loss(): Computes and logs the loss during training, useful for monitoring model performance.
    _create_train_validate_sets(): Splits the data into training and validation sets for model fitting.
    _fit_model(): Executes the model training process, adjusting parameters to minimize the loss.
    prepare_features_and_labels(): Prepares features and labels from the dataset for model training.
    run_mf(): Main method to run matrix factorization on provided data, returning trained model parameters and performance metrics.
    _flip_factors_for_identification(): Adjusts factor sign for model identifiability and interpretation.
  """

  def __init__(
-    self,
-    l2_lambda=0.03,
-    l2_intercept_multiplier=5,
-    initLearningRate=0.2,
-    noInitLearningRate=1.0,
-    convergence=1e-7,
-    numFactors=1,
-    useGlobalIntercept=True,
-    logging=True,
-    flipFactorsForIdentification=True,
+    self,
+    config=CONFIG_PARAMS,
    model: Optional[BiasedMatrixFactorization] = None,
-    featureCols: List[str] = [c.noteIdKey, c.raterParticipantIdKey],
-    labelCol: str = c.helpfulNumKey,
-    useSigmoidCrossEntropy=False,
-    posWeight=None,
+    feature_cols: List[str] = [c.noteIdKey, c.raterParticipantIdKey],
+    label_col: str = c.helpfulNumKey,
+    pos_weight: Optional[float] = None,
  ) -> None:
    """Configure matrix factorization note ranking."""
-    self._l2_lambda = l2_lambda
-    self._l2_intercept_multiplier = l2_intercept_multiplier
-    self._initLearningRate = initLearningRate
-    self._noInitLearningRate = noInitLearningRate
-    self._convergence = convergence
-    self._numFactors = numFactors
-    self._useGlobalIntercept = useGlobalIntercept
-    self._logging = logging
-    self._flipFactorsForIdentification = flipFactorsForIdentification
-    self._featureCols = featureCols
-    self._labelCol = labelCol
-    self._useSigmoidCrossEntropy = useSigmoidCrossEntropy
-    self._posWeight = posWeight
+    # Validate each config value against EXPECTED_TYPES, coercing where
+    # possible, and store it as a private attribute (e.g. self._l2_lambda).
+    for param, expected_type in EXPECTED_TYPES.items():
+      value = config.get(param, CONFIG_PARAMS.get(param))
+      if value is not None and not isinstance(value, expected_type):
+        try:
+          value = expected_type(value)
+        except ValueError:
+          raise ValueError(
+            f"Parameter {param} is expected to be of type {expected_type.__name__}, "
+            f"but got {type(value).__name__}"
+          )
+      setattr(self, f"_{param}", value)
+
+    self._feature_cols = feature_cols
+    self._label_col = label_col
+    self._pos_weight = pos_weight

-    if self._useSigmoidCrossEntropy:
-      if self._posWeight:
-        if logging:
-          print(f"Using pos weight: {self._posWeight} with BCEWithLogitsLoss")
+    if self._use_sigmoid_crossentropy:
+      if self._pos_weight:
+        if self._logging:
+          print(f"Using pos weight: {self._pos_weight} with BCEWithLogitsLoss")
        self.criterion = torch.nn.BCEWithLogitsLoss(
-          pos_weight=torch.Tensor(np.array(self._posWeight))
+          pos_weight=torch.Tensor(np.array(self._pos_weight))
        )
      else:
-        if logging:
+        if self._logging:
          print("Using BCEWithLogitsLoss")
        self.criterion = torch.nn.BCEWithLogitsLoss()
    else:
-      if self._posWeight:
-        raise ValueError("posWeight is not supported for MSELoss")
+      if self._pos_weight:
+        raise ValueError("pos_weight is not supported for MSELoss")
      self.criterion = torch.nn.MSELoss()

    self.train_errors: List[float] = []
@@ -72,23 +121,22 @@ def __init__(
    self.trainModelData: Optional[ModelData] = None
    self.validateModelData: Optional[ModelData] = None

-  def get_final_train_error(self) -> Optional[float]:
-    return self.train_errors[-1] if self.train_errors else None
+  def get_final_train_error(self) -> Optional[float]: return self.train_errors[-1] if self.train_errors else None

  def get_new_mf_with_same_args(self):
    return MatrixFactorization(
-      l2_lambda=self._l2_lambda,
-      l2_intercept_multiplier=self._l2_intercept_multiplier,
-      initLearningRate=self._initLearningRate,
-      noInitLearningRate=self._noInitLearningRate,
-      convergence=self._convergence,
-      numFactors=self._numFactors,
-      useGlobalIntercept=self._useGlobalIntercept,
-      logging=self._logging,
-      flipFactorsForIdentification=self._flipFactorsForIdentification,
+      # Rebuild the config dict from this instance's stored values so the
+      # clone matches the constructor's config-based signature.
+      config={param: getattr(self, f"_{param}") for param in EXPECTED_TYPES},
      model=None,
-      featureCols=self._featureCols,
-      labelCol=self._labelCol,
+      feature_cols=self._feature_cols,
+      label_col=self._label_col,
    )
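Since the loss is fixed in the constructor above, pos_weight only takes effect when sigmoid cross-entropy is enabled. A minimal sketch of opting in through the config, assuming the caller supplies the use_sigmoid_crossentropy key that config.json leaves unset:

# Hypothetical: enable weighted BCEWithLogitsLoss via a config override.
cfg = dict(CONFIG_PARAMS, use_sigmoid_crossentropy=True)
mf = MatrixFactorization(config=cfg, pos_weight=5.0)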

def _initialize_note_and_rater_id_maps(
@@ -112,7 +160,7 @@ def get_note_and_rater_id_maps(
"""
# We are extracting only the subset of note data from the ratings data frame that is needed to
# run matrix factorization. This avoids accidentally losing data through `dropna`.
-noteData = ratings[self._featureCols + [self._labelCol]]
+noteData = ratings[self._feature_cols + [self._label_col]]
noteData = ratings[self._feature_cols + [self._label_col]]
assert not pd.isna(noteData).values.any(), "noteData must not contain nan values"

raterIdMap = (
@@ -164,10 +212,10 @@ def _initialize_parameters(
np.expand_dims(noteInit[c.internalNoteInterceptKey].astype(np.float32).values, axis=1)
)

-for i in range(1, self._numFactors + 1):
+for i in range(1, self._num_factors + 1):
noteInit[c.note_factor_key(i)].fillna(0.0, inplace=True)
self.mf_model.note_factors.weight.data = torch.tensor(
-noteInit[[c.note_factor_key(i) for i in range(1, self._numFactors + 1)]]
+noteInit[[c.note_factor_key(i) for i in range(1, self._num_factors + 1)]]
.astype(np.float32)
.values
)
@@ -182,10 +230,10 @@ def _initialize_parameters(
np.expand_dims(userInit[c.internalRaterInterceptKey].astype(np.float32).values, axis=1)
)

-for i in range(1, self._numFactors + 1):
+for i in range(1, self._num_factors + 1):
userInit[c.rater_factor_key(i)].fillna(0.0, inplace=True)
self.mf_model.user_factors.weight.data = torch.tensor(
-userInit[[c.rater_factor_key(i) for i in range(1, self._numFactors + 1)]]
+userInit[[c.rater_factor_key(i) for i in range(1, self._num_factors + 1)]]
.astype(np.float32)
.values
)
Expand All @@ -211,15 +259,15 @@ def _get_parameters_from_trained_model(self) -> Tuple[pd.DataFrame, pd.DataFrame
c.internalRaterInterceptKey
] = self.mf_model.user_intercepts.weight.data.cpu().numpy()

-for i in range(self._numFactors):
+for i in range(self._num_factors):
noteParams[c.note_factor_key(i + 1)] = self.mf_model.note_factors.weight.data.cpu().numpy()[
:, i
]
raterParams[c.rater_factor_key(i + 1)] = self.mf_model.user_factors.weight.data.cpu().numpy()[
:, i
]

-if self._flipFactorsForIdentification:
+if self._flip_factor_identification:
noteParams, raterParams = self._flip_factors_for_identification(noteParams, raterParams)

return noteParams, raterParams
@@ -246,10 +294,10 @@ def _create_mf_model(

if (noteInit is not None) and (userInit is not None):
self.optimizer = torch.optim.Adam(
-self.mf_model.parameters(), lr=self._initLearningRate
+self.mf_model.parameters(), lr=self._init_lr
) # smaller learning rate
else:
-self.optimizer = torch.optim.Adam(self.mf_model.parameters(), lr=self._noInitLearningRate)
+self.optimizer = torch.optim.Adam(self.mf_model.parameters(), lr=self._noinit_lr)
if self._logging:
print(self.mf_model.device)
self.mf_model.to(self.mf_model.device)
@@ -260,8 +308,8 @@ def _instantiate_biased_mf_model(self):
self.mf_model = BiasedMatrixFactorization(
n_users,
n_notes,
-use_global_intercept=self._useGlobalIntercept,
-n_factors=self._numFactors,
+use_global_intercept=self._use_global_intercept,
+n_factors=self._num_factors,
logging=self._logging,
)
if self._logging:
@@ -406,7 +454,7 @@ def prepare_features_and_labels(
self.ratingFeaturesAndLabels[c.noteIdKey] == specificNoteId
]

-rating_labels = torch.FloatTensor(ratingFeaturesAndLabels[self._labelCol].values).to(
+rating_labels = torch.FloatTensor(ratingFeaturesAndLabels[self._label_col].values).to(
self.mf_model.device
)
user_indexes = torch.LongTensor(ratingFeaturesAndLabels[Constants.raterIndexKey].values).to(
@@ -457,7 +505,7 @@ def run_mf(
assert self.mf_model.note_factors.weight.data.cpu().numpy().shape[0] == self.noteIdMap.shape[0]

globalIntercept = None
-if self._useGlobalIntercept:
+if self._use_global_intercept:
globalIntercept = self.mf_model.global_intercept
if self._logging:
print("Global Intercept: ", globalIntercept.item())
@@ -482,7 +530,7 @@ def _flip_factors_for_identification(
Returns:
Tuple[pd.DataFrame, pd.DataFrame]: noteParams, raterParams
"""
-for i in range(1, self._numFactors + 1):
+for i in range(1, self._num_factors + 1):
noteFactorName = c.note_factor_key(i)
raterFactorName = c.rater_factor_key(i)
