Remove duplicated and commented-out code; general cleanup.

Signed-off-by: Terence Parr <parrt@antlr.org>
Author: parrt · Date: Dec 27, 2022
Commit: b74db63 (1 parent: 4e536b6)

Showing 8 changed files with 51 additions and 156 deletions.
diff --git a/dtreeviz/__init__.py b/dtreeviz/__init__.py
@@ -1,6 +1,7 @@
 from .version import __version__
 
 # NEW API
+# import dtreeviz
 # call m = dtreeviz.model(...) then m.view() etc...
 from dtreeviz.utils import DTreeVizRender
 from dtreeviz.trees import DTreeVizAPI, model

diff --git a/dtreeviz/classifiers.py b/dtreeviz/classifiers.py
@@ -1,16 +1,16 @@
 from typing import Tuple
-import numpy as np
-import pandas as pd
 
-import matplotlib.patches as patches
 import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from PIL import ImageColor
+from colour import Color
 from matplotlib import patches as patches
 from matplotlib.collections import PatchCollection
-from colour import Color
-from PIL import ImageColor
 
-from dtreeviz.colors import adjust_colors, GREY
 from dtreeviz import utils
+from dtreeviz.colors import adjust_colors
+from dtreeviz.utils import add_classifier_legend
 
 
 def decision_boundaries(model, X: np.ndarray, y: np.ndarray,
@@ -536,34 +536,3 @@ def _predict_proba(model, X):
 
     # sklearn etc...
     return model.predict_proba(X)
-
-
-def add_classifier_legend(ax, class_names, class_values, facecolors, target_name,
-                          colors, fontsize=10, fontname='Arial'):
-    # add boxes for legend
-    boxes = []
-    for c in class_values:
-        box = patches.Rectangle((0, 0), 20, 10, linewidth=.4, edgecolor=colors['rect_edge'],
-                                facecolor=facecolors[c], label=class_names[c])
-        boxes.append(box)
-    leg = ax.legend(handles=boxes,
-                    frameon=True,
-                    shadow=False,
-                    fancybox=True,
-                    handletextpad=.35,
-                    borderpad=.8,
-                    bbox_to_anchor=(1.0, 1.0),
-                    edgecolor=colors['legend_edge'])
-
-    leg.set_title(target_name, prop={'size': fontsize,
-                                     'weight': 'bold',
-                                     'family': fontname})
-
-    leg.get_frame().set_linewidth(.5)
-    leg.get_title().set_color(colors['legend_title'])
-    leg.get_title().set_fontsize(fontsize)
-    leg.get_title().set_fontname(fontname)
-    # leg.get_title().set_fontweight('bold')
-    for text in leg.get_texts():
-        text.set_color(colors['text'])
-        text.set_fontsize(fontsize)
diff --git a/dtreeviz/compatibility.py b/dtreeviz/compatibility.py
@@ -62,7 +62,6 @@ def rtreeviz_bivar_heatmap(tree_model,
     Show tesselated 2D feature space for bivariate regression tree. X_train can
     have lots of features but features lists indexes of 2 features to train tree with.
     """
-
     warnings.warn("rtreeviz_bivar_heatmap() function is deprecated starting from version 2.0. \n "
                   "For the same functionality, please use this code instead: \n m = dtreeviz.model(...) \n m.rtree_feature_space(...)",
                   DeprecationWarning, stacklevel=2)
@@ -93,7 +92,6 @@ def rtreeviz_bivar_3D(tree_model,
     Show 3D feature space for bivariate regression tree. X_train should have
     just the 2 variables used for training.
     """
-
     warnings.warn("rtreeviz_bivar_3D() function is deprecated starting from version 2.0. \n "
                   "For the same functionality, please use this code instead: \n m = dtreeviz.model(...) \n m.rtree_feature_space3D(...)",
                   DeprecationWarning, stacklevel=2)
@@ -116,7 +114,6 @@ def ctreeviz_univar(tree_model,
                     show={'title', 'legend', 'splits'},
                     colors=None,
                     ax=None):
-
     warnings.warn("ctreeviz_univar() function is deprecated starting from version 2.0. \n "
                   "For the same functionality, please use this code instead: \n m = dtreeviz.model(...) \n m.ctree_feature_space(...)",
                   DeprecationWarning, stacklevel=2)
@@ -248,7 +245,6 @@ def dtreeviz(tree_model,
     :param scale: Default is 1.0. Scale the width, height of the overall SVG preserving aspect ratio
     :return: A string in graphviz DOT language that describes the decision tree.
     """
-
     warnings.warn("dtreeviz() function is deprecated starting from version 2.0. \n "
                   "For the same functionality, please use this code instead: \n m = dtreeviz.model(...) \n m.view()",
                   DeprecationWarning, stacklevel=2)
@@ -345,7 +341,6 @@ def viz_leaf_samples(tree_model,
     :param figsize: optional (width, height) in inches for the entire plot
     :param ax: optional matplotlib "axes" to draw into
     """
-
     warnings.warn("viz_leaf_samples() function is deprecated starting from version 2.0. \n "
                   "For the same functionality, please use this code instead: \n m = dtreeviz.model(...) \n m.leaf_sizes()",
                   DeprecationWarning, stacklevel=2)
@@ -494,30 +489,6 @@ def ctreeviz_leaf_samples(tree_model,
     model.ctree_leaf_distributions(display_type, plot_ylim, colors, fontsize, fontname, grid, figsize, ax)
 
 
-def _get_leaf_target_input(shadow_tree: ShadowDecTree,
-                           precision: int):
-    x = []
-    y = []
-    means = []
-    means_range = []
-    x_labels = []
-    sigma = .05
-    for i, node in enumerate(shadow_tree.leaves):
-        leaf_index_sample = node.samples()
-        leaf_target = shadow_tree.y_train[leaf_index_sample]
-        leaf_target_mean = np.mean(leaf_target)
-        np.random.seed(0)  # generate the same list of random values for each call
-        X = np.random.normal(i, sigma, size=len(leaf_target))
-
-        x.extend(X)
-        y.extend(leaf_target)
-        means.append([leaf_target_mean, leaf_target_mean])
-        means_range.append([i - (sigma * 3), i + (sigma * 3)])
-        x_labels.append(f"{myround(leaf_target_mean, precision)}")
-
-    return x, y, means, means_range, x_labels
-
-
 def viz_leaf_target(tree_model,
                     X_train: (pd.DataFrame, np.ndarray) = None,
                     y_train: (pd.DataFrame, np.ndarray) = None,
@@ -622,7 +593,6 @@ def describe_node_sample(tree_model,
     :return: pd.DataFrame
         Node training samples' stats
     """
-
     warnings.warn("describe_node_sample() function is deprecated starting from version 2.0. \n "
                   "For the same functionality, please use this code instead: \n m = dtreeviz.model(...) \n m.node_stats()",
                   DeprecationWarning, stacklevel=2)
@@ -675,7 +645,6 @@ def explain_prediction_path(tree_model,
         Required in case of tree ensemble. Specify the tree index to interpret.
 
     """
-
     shadow_tree = ShadowDecTree.get_shadow_tree(tree_model, X_train, y_train, feature_names, None, class_names,
                                                 tree_index)
     model = DTreeVizAPI(shadow_tree)
@@ -691,33 +660,3 @@ def explain_prediction_path(tree_model,
                       "For the same functionality, please use this code instead: \n m = dtreeviz.model(...) \n m.explain_prediction_path()",
                       DeprecationWarning, stacklevel=2)
         return model.explain_prediction_path(x)
-
-
-def model(model,
-          X_train,
-          y_train,
-          tree_index: int = None,
-          feature_names: List[str] = None,
-          target_name: str = None,
-          class_names: (List[str], Mapping[int, str]) = None
-          ):
-    """
-    Given a decision tree-based model from a supported decision-tree library, training data, and
-    information about the data, create a model adaptor that
-    provides a consistent interface for the overall dtreeviz lib to the various supported tree libraries.
-    Call methods such as v.view(), v.explain_prediction_path(), v.rtree_feature_space3D() on returned adaptor v.
-
-    :param model: A tree-based model from a supportive decision tree library, such as sklearn, XGBoost, and TensorFlow.
-    :param X_train: Features used to train model; 2D array-like object of shape (n_samples, n_features).
-    :param y_train: Classifier or regressor target used to train model; 1D array-like object of shape (n_samples, 1).
-    :param tree_index: Index (from 0) of tree if model is an ensemble of trees like a random forest.
-    :param feature_names: Names of features in the same order of X_train.
-    :param target_name: What is the (string) name of the target variable; e.g., for a house price regressor, this might be "price".
-    :param class_names: For classifiers, what are the names associated with the labels?
-    :return: a DTreeVizAPI object that provides the main API for dtreeviz (version 2.0.0+);
-             e.g., call the view() method on the return object to display it in a notebook.
-    """
-    shadow_tree = ShadowDecTree.get_shadow_tree(model, X_train, y_train, feature_names, target_name, class_names,
-                                                tree_index)
-    dtreeviz_model = DTreeVizAPI(shadow_tree)
-    return dtreeviz_model
diff --git a/dtreeviz/interpretation.py b/dtreeviz/interpretation.py
@@ -33,7 +33,6 @@ def explain_prediction_plain_english(shadow_tree: ShadowDecTree,
     :return: str
         Prediction path explanation in plain english.
     """
-
     node_feature_index = shadow_tree.get_features()
     feature_names = shadow_tree.feature_names
     node_threshold = shadow_tree.get_thresholds()
@@ -42,7 +41,6 @@ def explain_prediction_plain_english(shadow_tree: ShadowDecTree,
     # TODO - refactor this logic and find a way to make it simpler
     feature_smaller_values = {}
     feature_bigger_values = {}
-    # feature_categorical_value = {}
     feature_categorical_value = defaultdict(lambda: set())
     feature_categorical_value_not_in = defaultdict(lambda: set())
 
@@ -86,7 +84,6 @@ def explain_prediction_plain_english(shadow_tree: ShadowDecTree,
             prediction_path_output += feature_range + "\n"
 
     for feature_name in set(list(feature_categorical_value.keys()) + list(feature_categorical_value_not_in.keys())):
-        # prediction_path_output += f"{feature_name} in {feature_categorical_value[feature_name]} \n"
         prediction_path_output += f"{feature_name}{' in ' + str(feature_categorical_value[feature_name]) if feature_name in feature_categorical_value else ''}" \
                                   f"{' not in ' + str(feature_categorical_value_not_in[feature_name]) if feature_name in feature_categorical_value_not_in else ''}  \n"
 
@@ -152,5 +149,3 @@ def explain_prediction_sklearn_default(shadow_tree: ShadowDecTree,
     ax.set_xlabel("feature importance", fontsize=fontsize, fontname=fontname, color=colors['axis_label'])
     ax.grid(b=grid)
     return ax
-
-
diff --git a/dtreeviz/models/shadow_decision_tree.py b/dtreeviz/models/shadow_decision_tree.py
@@ -278,8 +278,6 @@ def get_split_node_heights(self, X_train, y_train, nbins) -> Mapping[int, int]:
 
             bins = np.linspace(overall_feature_range[0],
                                overall_feature_range[1], nbins + 1)
-            # bins = np.arange(overall_feature_range[0],
-            #                  overall_feature_range[1] + binwidth, binwidth)
             # print(f"\tlen(bins)={len(bins):2d} bins={bins}")
             X, y = X_feature[node.samples()], y_train[node.samples()]
             X_hist = [X[y == cl] for cl in class_values]
@@ -386,7 +384,6 @@ def get_leaf_sample_counts(self, min_samples=0, max_samples=None):
         :return: tuple
             Contains a numpy array of leaf ids and an array of leaf samples
         """
-
         max_samples = max_samples if max_samples else max([node.nsamples() for node in self.leaves])
         leaf_samples = [(node.id, node.nsamples()) for node in self.leaves if
                         min_samples <= node.nsamples() <= max_samples]
@@ -399,7 +396,6 @@ def get_leaf_criterion(self):
         For classification, supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.
         For regression, supported criteria are “mse”, “friedman_mse”, “mae”.
         """
-
         leaf_criterion = [(node.id, node.criterion()) for node in self.leaves]
         x, y = zip(*leaf_criterion)
         return np.array(x), np.array(y)
@@ -410,7 +406,6 @@ def get_leaf_sample_counts_by_class(self):
         :return: tuple
             Contains a list of leaf ids and a two lists of leaf samples(one for each class)
         """
-
         leaf_samples = [(node.id, node.n_sample_classes()[0], node.n_sample_classes()[1]) for node in self.leaves]
         index, leaf_sample_0, leaf_samples_1 = zip(*leaf_samples)
         return index, leaf_sample_0, leaf_samples_1
@@ -507,24 +502,20 @@ def __init__(self, shadow_tree: ShadowDecTree, id: int, left=None, right=None, l
 
     def split(self) -> (int, float):
         """Returns the split/threshold value used at this node."""
-
         return self.shadow_tree.get_node_split(self.id)
 
     def feature(self) -> int:
         """Returns feature index used at this node"""
-
         return self.shadow_tree.get_node_feature(self.id)
 
     def feature_name(self) -> (str, None):
         """Returns the feature name used at this node"""
-
         if self.shadow_tree.feature_names is not None:
             return self.shadow_tree.feature_names[self.feature()]
         return None
 
     def samples(self) -> List[int]:
         """Returns samples indexes from this node"""
-
         return self.shadow_tree.get_node_samples()[self.id]
 
     def nsamples(self) -> int:
@@ -533,7 +524,6 @@ def nsamples(self) -> int:
         used to compute the predicted value or class . If this is an internal node, it is the number of samples used
         to compute the split point.
         """
-
         return self.shadow_tree.get_node_nsamples(self.id)
 
     # TODO
@@ -544,7 +534,6 @@ def n_sample_classes(self):
 
         Returns the sample count values for each classes.
         """
-
         samples = np.array(self.samples())
         if samples.size == 0:
             return [0, 0]
@@ -565,7 +554,6 @@ def criterion(self):
 
     def split_samples(self) -> Tuple[np.ndarray, np.ndarray]:
         """Returns the list of indexes to the left and the right of the split value."""
-
         return self.shadow_tree.get_split_samples(self.id)
 
     def isleaf(self) -> bool:
@@ -582,14 +570,8 @@ def prediction(self) -> (Number, None):
 
         If the node is an internal node, returns None
         """
-
         if not self.isleaf():
             return None
-        # if self.isclassifier():
-        #     counts = self.shadow_tree.get_prediction_value(self.id)
-        #     return np.argmax(counts)
-        # else:
-        #     return self.shadow_tree.get_prediction_value(self.id)
         return self.shadow_tree.get_prediction(self.id)
 
     def prediction_name(self) -> (str, None):
@@ -599,7 +581,6 @@ def prediction_name(self) -> (str, None):
 
         Return prediction class or value otherwise.
         """
-
         if self.isclassifier():
             if self.shadow_tree.class_names is not None:
                 return self.shadow_tree.class_names[self.prediction()]
@@ -609,7 +590,6 @@ def class_counts(self) -> (List[int], None):
         """
         If this tree model is a classifier, return a list with the count associated with each class.
         """
-
         if self.isclassifier():
             if self.shadow_tree.get_class_weight() is None:
                 # return np.array(np.round(self.shadow_tree.tree_model.tree_.value[self.id][0]), dtype=int)

diff --git a/dtreeviz/models/tensorflow_decision_tree.py b/dtreeviz/models/tensorflow_decision_tree.py
@@ -148,20 +148,18 @@ def get_node_split(self, id) -> (int, float):
     def get_node_feature(self, id) -> int:
         return self.get_features()[id]
 
-    # TODO check if we can pun this method in the super class
+    # TODO check if we can put this method in the super class
     def get_node_nsamples_by_class(self, id):
         all_nodes = self.internal + self.leaves
         if self.is_classifier():
             node_value = [node.n_sample_classes() for node in all_nodes if node.id == id]
             return node_value[0][0], node_value[0][1]
 
-    # TODO implement for regression tree
     def get_prediction(self, id):
         if self.is_classifier():
             return np.argmax(self.tree_nodes[id].value.probability)
         else:
             return self.tree_nodes[id].value.value
-        # raise VisualisationNotYetSupportedError("get_prediction()", "TensorFlow Decision Forests2")
 
     def is_categorical_split(self, id) -> bool:
         node_condition = self.tree_nodes[id].condition
@@ -201,7 +199,6 @@ def _get_nodes_info(self):
         Get individual node info and left/right child node. We are using a dict as a data structure to keep
         the left and right child node info.
         """
-
         tree_nodes = defaultdict(lambda: None)
         children_left = defaultdict(lambda: -1)
         children_right = defaultdict(lambda: -1)