diff --git a/binarybeech/attributehandler.py b/binarybeech/attributehandler.py
index 332e5cc..65ae4ec 100644
--- a/binarybeech/attributehandler.py
+++ b/binarybeech/attributehandler.py
@@ -77,18 +77,18 @@ def split(self, df):
         ]
         N = len(df.index)
         n = [len(df_.index) for df_ in split_df]
-
+
+        loss_args = [{}, {}]
         if "__weights__" in df:
-            w = [df_["__weights__"].values for df_ in split_df]
-        else:
-            w = [None for df_ in split_df]
+            loss_args = [{"weights":df_["__weights__"].values} for df_ in split_df]
+
         val = [
-            self.metrics.node_value(df_[self.y_name], w[i])
+            self.metrics.node_value(df_[self.y_name], **loss_args[i])
             for i, df_ in enumerate(split_df)
         ]
         loss = n[0] / N * self.metrics.loss(
-            split_df[0][self.y_name], val[0], w[0]
-        ) + n[1] / N * self.metrics.loss(split_df[1][self.y_name], val[1], w[1])
+            split_df[0][self.y_name], val[0], **loss_args[0]
+        ) + n[1] / N * self.metrics.loss(split_df[1][self.y_name], val[1], **loss_args[1])
         if loss < self.loss:
             success = True
             self.loss = loss
@@ -162,17 +162,16 @@ def fun(x):
             if min(n) == 0:
                 return np.Inf
+            loss_args = [{}, {}]
             if "__weights__" in df:
-                w = [df_["__weights__"].values for df_ in split_df]
-            else:
-                w = [None for df_ in split_df]
+                loss_args = [{"weights":df_["__weights__"].values} for df_ in split_df]
             val = [
-                self.metrics.node_value(df_[self.y_name], w[i])
+                self.metrics.node_value(df_[self.y_name], **loss_args[i])
                 for i, df_ in enumerate(split_df)
             ]
             return n[0] / N * self.metrics.loss(
-                split_df[0][self.y_name], val[0], w[0]
-            ) + n[1] / N * self.metrics.loss(split_df[1][self.y_name], val[1], w[1])
+                split_df[0][self.y_name], val[0], **loss_args[0]
+            ) + n[1] / N * self.metrics.loss(split_df[1][self.y_name], val[1], **loss_args[1])
 
         return fun
@@ -214,17 +213,17 @@ def split(self, df):
         N = len(df.index)
         n = [len(df_.index) for df_ in self.split_df]
+        loss_args = [{}, {}]
         if "__weights__" in df:
-            w = [df_["__weights__"].values for df_ in self.split_df]
-        else:
-            w = [None for df_ in self.split_df]
+            loss_args = [{"weights":df_["__weights__"].values} for df_ in self.split_df]
+
         val = [
-            self.metrics.node_value(df_[self.y_name], w[i])
+            self.metrics.node_value(df_[self.y_name], **loss_args[i])
             for i, df_ in enumerate(self.split_df)
         ]
         self.loss = n[0] / N * self.metrics.loss(
-            self.split_df[0][self.y_name], val[0], w[0]
-        ) + n[1] / N * self.metrics.loss(self.split_df[1][self.y_name], val[1], w[1])
+            self.split_df[0][self.y_name], val[0], **loss_args[0]
+        ) + n[1] / N * self.metrics.loss(self.split_df[1][self.y_name], val[1], **loss_args[1])
 
         return success
@@ -295,17 +294,16 @@ def fun(x):
             split_df = [df[df[split_name] < x], df[df[split_name] >= x]]
             n = [len(df_.index) for df_ in split_df]
+            loss_args = [{}, {}]
             if "__weights__" in df:
-                w = [df_["__weights__"].values for df_ in split_df]
-            else:
-                w = [None for df_ in split_df]
+                loss_args = [{"weights":df_["__weights__"].values} for df_ in split_df]
             val = [
-                self.metrics.node_value(df_[self.y_name], w[i])
+                self.metrics.node_value(df_[self.y_name], **loss_args[i])
                 for i, df_ in enumerate(split_df)
            ]
             return n[0] / N * self.metrics.loss(
-                split_df[0][self.y_name], val[0], w[0]
-            ) + n[1] / N * self.metrics.loss(split_df[1][self.y_name], val[1], w[1])
+                split_df[0][self.y_name], val[0], **loss_args[0]
+            ) + n[1] / N * self.metrics.loss(split_df[1][self.y_name], val[1], **loss_args[1])
 
         return fun
diff --git a/binarybeech/binarybeech.py b/binarybeech/binarybeech.py
index 094ce04..ab305be 100644
--- a/binarybeech/binarybeech.py
+++ b/binarybeech/binarybeech.py
@@ -226,12 +226,12 @@ def create_tree(self, leaf_loss_threshold=1e-12):
 
     def _node_or_leaf(self, df):
         y = df[self.y_name]
+        loss_args = {}
         if "__weights__" in df:
-            w = df["__weights__"].values
-        else:
-            w = None
-        y_hat = self.dmgr.metrics.node_value(y, w)
-        loss_parent = self.dmgr.metrics.loss(y, y_hat, w)
+            loss_args["weights"] = df["__weights__"].values
+
+        y_hat = self.dmgr.metrics.node_value(y, **loss_args)
+        loss_parent = self.dmgr.metrics.loss(y, y_hat, **loss_args)
         # p = self._probability(df)
         if (
             loss_parent < self.leaf_loss_threshold
@@ -270,7 +270,8 @@ def _node_or_leaf(self, df):
                 decision_fun=self.dmgr[split_name].decide,
             )
             item.pinfo["N"] = len(df.index)
-            item.pinfo["r"] = self.dmgr.metrics.loss_prune(y, y_hat)
+            loss_args = {}
+            item.pinfo["r"] = self.dmgr.metrics.loss_prune(y, y_hat, **loss_args)
             item.pinfo["R"] = (
                 item.pinfo["N"] / len(self.training_data.df.index) * item.pinfo["r"]
             )
@@ -285,7 +286,8 @@ def _leaf(self, y, y_hat):
         leaf = Node(value=y_hat)
 
         leaf.pinfo["N"] = y.size
-        leaf.pinfo["r"] = self.dmgr.metrics.loss_prune(y, y_hat)
+        loss_args = {}
+        leaf.pinfo["r"] = self.dmgr.metrics.loss_prune(y, y_hat, **loss_args)
         leaf.pinfo["R"] = (
             leaf.pinfo["N"] / len(self.training_data.df.index) * leaf.pinfo["r"]
         )
@@ -542,15 +544,15 @@ def _opt_fun(self, tree):
         for i, x in enumerate(self.df.iloc):
             delta[i] = tree.traverse(x).value
         y = self.df[self.y_name].values
+
+        loss_args = {}
         if "__weights__" in self.df:
-            w = self.df["__weights__"].values
-        else:
-            w = None
+            loss_args["weights"] = self.df["__weights__"].values
 
         def fun(gamma):
             y_ = y_hat + gamma * delta
             p = self.dmgr.metrics.output_transform(y_)
-            return self.dmgr.metrics.loss(y, p, w)
+            return self.dmgr.metrics.loss(y, p, **loss_args)
 
         return fun
diff --git a/binarybeech/math.py b/binarybeech/math.py
index 6dd81f6..7ee7ebc 100644
--- a/binarybeech/math.py
+++ b/binarybeech/math.py
@@ -17,7 +17,7 @@ def unique_weighted(x, w):
     return np.array(u), np.array(c) / np.sum(c)
 
 
-def gini_impurity_fast(x):
+def gini_impurity(x):
     unique, counts = np.unique(x, return_counts=True)
     N = x.size
     p = counts / N
@@ -29,13 +29,7 @@ def gini_impurity_weighted(x, w):
     return 1.0 - np.sum(p**2)
 
 
-def gini_impurity(x, w=None):
-    if w is None:
-        return gini_impurity_fast(x)
-    return gini_impurity_weighted(x, w)
-
-
-def shannon_entropy_fast(x):
+def shannon_entropy(x):
     unique, counts = np.unique(x, return_counts=True)
     N = x.size
     p = counts / N
@@ -47,13 +41,7 @@ def shannon_entropy_weighted(x, w):
     return -np.sum(p * np.log2(p))
 
 
-def shannon_entropy(x, w=None):
-    if w is None:
-        return shannon_entropy_fast(x)
-    return shannon_entropy_weighted(x, w)
-
-
-def misclassification_cost_fast(x):
+def misclassification_cost(x):
     unique, counts = np.unique(x, return_counts=True)
     N = x.size
     p = np.max(counts) / N
@@ -66,12 +54,6 @@ def misclassification_cost_weighted(x, w):
     return 1.0 - p
 
 
-def misclassification_cost(x, w=None):
-    if w is None:
-        return misclassification_cost_fast(x)
-    return misclassification_cost_weighted(x, w)
-
-
 def logistic_loss(y, p):
     p = np.clip(p, 1e-12, 1.0 - 1e-12)
     return -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))
@@ -94,13 +76,7 @@ def r_squared(y, y_hat):
     return 1 - sse / sst
 
 
-def majority_class(x, w=None):
-    if w is None:
-        return majority_class_fast(x)
-    return majority_class_weighted(x, w)
-
-
-def majority_class_fast(x):
+def majority_class(x):
     unique, counts = np.unique(x, return_counts=True)
     ind_max = np.argmax(counts)
     return unique[ind_max]
diff --git a/binarybeech/metrics.py b/binarybeech/metrics.py
index 0c44654..a0f3ac2 100644
--- a/binarybeech/metrics.py
+++ b/binarybeech/metrics.py
@@ -52,15 +52,15 @@ def inverse_transform(arr):
         return arr
 
     @abstractmethod
-    def loss(self, y, y_hat, w):
+    def loss(self, y, y_hat, **kwargs):
        pass
 
     @abstractmethod
-    def loss_prune(self, y, y_hat):
+    def loss_prune(self, y, y_hat, **kwargs):
         pass
 
     @abstractmethod
-    def node_value(self, y, w):
+    def node_value(self, y, **kwargs):
         pass
 
     @abstractmethod
@@ -85,15 +85,17 @@ class RegressionMetrics(Metrics):
     def __init__(self):
         pass
 
-    def loss(self, y, y_hat, w):
+    def loss(self, y, y_hat, **kwargs):
         # Implementation of the loss calculation for regression
+        if "weights" in kwargs.keys():
+            return math.mean_squared_error_weighted(y, y_hat, kwargs["weights"])
         return math.mean_squared_error(y, y_hat)
 
-    def loss_prune(self, y, y_hat):
+    def loss_prune(self, y, y_hat, **kwargs):
         # Implementation of the loss pruning calculation for regression
-        return self.loss(y, y_hat, None)
+        return self.loss(y, y_hat, **kwargs)
 
-    def node_value(self, y, w):
+    def node_value(self, y, **kwargs):
         # Implementation of the node value calculation for regression
         return np.nanmean(y)
@@ -110,20 +112,22 @@ def goodness_of_fit(self, y, y_hat):
 
     def bins(self, df, y_name, attribute):
         y = df[y_name]
+
+        kwargs = {}
         if "__weights__" in df:
-            w = df["__weights__"].values
-        else:
-            w = None
-        y_hat = self.node_value(y, w)
+            kwargs["weights"] = df["__weights__"].values
+
+        y_hat = self.node_value(y, **kwargs)
         bins = [[], []]
         unique = np.unique(df[attribute])
         for u in unique:
             y_u = df[df[attribute] == u][y_name]
+
+            kwargs = {}
             if "__weights__" in df:
-                w = df[df[attribute] == u]["__weights__"].values
-            else:
-                w = None
-            y_hat_u = self.node_value(y_u, w)
+                kwargs["weights"] = df[df[attribute] == u]["__weights__"].values
+
+            y_hat_u = self.node_value(y_u, **kwargs)
             if y_hat_u > y_hat:
                 bins[0].append(u)
             else:
@@ -140,15 +144,15 @@ class LogisticMetrics(Metrics):
     def __init__(self):
         pass
 
-    def loss(self, y, y_hat, w):
+    def loss(self, y, y_hat, **kwargs):
         # Implementation of the loss calculation for logistic
         return math.logistic_loss(y, y_hat)
 
-    def loss_prune(self, y, y_hat):
+    def loss_prune(self, y, y_hat, **kwargs):
         # Implementation of the loss pruning calculation for logistic
         return math.misclassification_cost(y)
 
-    def node_value(self, y, w):
+    def node_value(self, y, **kwargs):
         # Implementation of the node value calculation for logistic
         return math.max_probability(y)
@@ -179,20 +183,22 @@ def inverse_transform(arr):
 
     def bins(self, df, y_name, attribute):
         y = df[y_name]
+
+        kwargs = {}
         if "__weights__" in df:
-            w = df["__weights__"].values
-        else:
-            w = None
-        y_hat = self.node_value(y, w)
+            kwargs["weights"] = df["__weights__"].values
+
+        y_hat = self.node_value(y, **kwargs)
         bins = [[], []]
         unique = np.unique(df[attribute])
         for u in unique:
             y_u = df[df[attribute] == u][y_name]
+
+            kwargs = {}
             if "__weights__" in df:
-                w = df[df[attribute] == u]["__weights__"].values
-            else:
-                w = None
-            y_hat_u = self.node_value(y_u, w)
+                kwargs["weights"] = df[df[attribute] == u]["__weights__"].values
+
+            y_hat_u = self.node_value(y_u, **kwargs)
             if y_hat_u == y_hat:
                 bins[0].append(u)
             else:
@@ -223,17 +229,23 @@ class ClassificationMetrics(Metrics):
     def __init__(self):
         pass
 
-    def loss(self, y, y_hat, w):
+    def loss(self, y, y_hat, **kwargs):
         # Implementation of the loss calculation for classification
-        return math.gini_impurity(y, w)
+        if "weights" in kwargs.keys():
+            return math.gini_impurity_weighted(y, kwargs["weights"])
+        return math.gini_impurity(y)
 
-    def loss_prune(self, y, y_hat):
+    def loss_prune(self, y, y_hat, **kwargs):
         # Implementation of the loss pruning calculation for classification
+        if "weights" in kwargs.keys():
+            return math.misclassification_cost_weighted(y, kwargs["weights"])
         return math.misclassification_cost(y)
 
-    def node_value(self, y, w):
+    def node_value(self, y, **kwargs):
         # Implementation of the node value calculation for classification
-        return math.majority_class(y, w)
+        if "weights" in kwargs.keys():
+            return math.majority_class_weighted(y, kwargs["weights"])
+        return math.majority_class(y)
 
     def validate(self, y, y_hat):
         return self._classification_metrics(y, y_hat)
@@ -259,20 +271,22 @@ def goodness_of_fit(self, y, y_hat):
 
     def bins(self, df, y_name, attribute):
         y = df[y_name]
+
+        kwargs = {}
         if "__weights__" in df:
-            w = df["__weights__"].values
-        else:
-            w = None
-        y_hat = self.node_value(y, w)
+            kwargs["weights"] = df["__weights__"].values
+
+        y_hat = self.node_value(y, **kwargs)
         bins = [[], []]
         unique = np.unique(df[attribute])
         for u in unique:
             y_u = df[df[attribute] == u][y_name]
+
+            kwargs = {}
             if "__weights__" in df:
-                w = df[df[attribute] == u]["__weights__"].values
-            else:
-                w = None
-            y_hat_u = self.node_value(y_u, w)
+                kwargs["weights"] = df[df[attribute] == u]["__weights__"].values
+
+            y_hat_u = self.node_value(y_u, **kwargs)
             if y_hat_u == y_hat:
                 bins[0].append(u)
             else:
@@ -289,9 +303,11 @@ class ClassificationMetricsEntropy(ClassificationMetrics):
     def __init__(self):
         pass
 
-    def loss(self, y, y_hat, w):
+    def loss(self, y, y_hat, **kwargs):
         # Implementation of the loss calculation for classification
-        return math.shannon_entropy(y, w)
+        if "weights" in kwargs.keys():
+            return math.shannon_entropy_weighted(y, kwargs["weights"])
+        return math.shannon_entropy(y)
 
 
 # =============================
@@ -301,13 +317,13 @@ class UnsupervisedMetrics(Metrics):
     def __init__(self):
         pass
 
-    def loss(self, y, y_hat, w):
+    def loss(self, y, y_hat, **kwargs):
         return np.inf
 
-    def loss_prune(self, y, y_hat):
-        return self.loss(y, y_hat, None)
+    def loss_prune(self, y, y_hat, **kwargs):
+        return self.loss(y, y_hat, **kwargs)
 
-    def node_value(self, y, w):
+    def node_value(self, y, **kwargs):
         return f"cluster {str(uuid.uuid4())}"
 
     def validate(self, y, y_hat):