Merge pull request #44 from VesnaT/ice

ICE: Init ICE widget
biolab · Aug 3, 2022 · 334edee · 334edee
2 parents 2a131df + 3547f2b
commit 334edee
Show file tree

Hide file tree

Showing 10 changed files with 1,214 additions and 8 deletions.
diff --git a/doc/index.rst b/doc/index.rst
@@ -12,6 +12,7 @@ Widgets
    widgets/explain-model
    widgets/explain-prediction
    widgets/explain-predictions
+   widgets/ice
 
 Indices and tables
 ==================

diff --git a/doc/widgets/ice.md b/doc/widgets/ice.md
@@ -0,0 +1,34 @@
+ICE
+===
+
+Displays one line per instance that shows how the instance’s prediction changes when a feature changes.
+
+**Inputs**
+
+- Model: model
+- Data: dataset
+
+The **ICE** (Individual Conditional Expectation) widget visualizes the dependence of the prediction on a feature for each instance separately, resulting in one line per instance, compared to one line overall in partial dependence plots.
+
+
+![](images/ICE.png)
+
+1. Select a target class.
+2. Select a feature.
+3. Order features by importance (partial dependence averaged across all the samples).
+4. Apply the color of a discrete feature.
+5. If **Centered** is ticked, the plot lines will start at the origin of the y-axis.
+6. If **Show mean** is ticked, the average across all the samples in the dataset is shown.
+7. If **Send Automatically** is ticked, the output is sent automatically after any change.
+   Alternatively, click **Send**.
+8. Get help, save the plot, make the report, set plot properties, or observe the size of input and output data.
+9. The plot shows a line for each instance in the input dataset.
+
+Example
+-------
+
+In the following example, we use the ICE widget to explain a Random Forest model. In the File widget, we open the Housing dataset. We connect it to the Random Forest widget, which trains the model. The ICE widget accepts the model and the data, which are used to explain the model.
+
+When some lines are selected in the plot, the corresponding instances of the input dataset appear on the output of the ICE widget.
+
+![](images/ICE-example.png)
diff --git a/doc/widgets/images/ICE-example.png b/doc/widgets/images/ICE-example.png
diff --git a/doc/widgets/images/ICE.png b/doc/widgets/images/ICE.png
diff --git a/orangecontrib/explain/inspection.py b/orangecontrib/explain/inspection.py
@@ -1,12 +1,13 @@
 """ Permutation feature importance for models. """
-from typing import Callable
+from typing import Callable, Tuple, Optional, Dict
 
 import numpy as np
 import scipy.sparse as sp
+from sklearn.inspection import partial_dependence
 
-from Orange.base import Model
+from Orange.base import Model, SklModel
 from Orange.classification import Model as ClsModel
-from Orange.data import Table
+from Orange.data import Table, Variable, DiscreteVariable
 from Orange.evaluation import Results
 from Orange.evaluation.scoring import Score, TargetScore, RegressionScore, R2
 from Orange.regression import Model as RegModel
@@ -19,7 +20,7 @@ def permutation_feature_importance(
         score: Score,
         n_repeats: int = 5,
         progress_callback: Callable = None
-):
+) -> np.ndarray:
     """
     Function calculates feature importance of a model for a given data.
 
@@ -174,3 +175,46 @@ def _calculate_permutation_scores(
 
     progress_callback(1)
     return scores
+
+
+def individual_condition_expectation(
+        model: SklModel,
+        data: Table,
+        feature: Variable,
+        grid_resolution: int = 1000,
+        kind: str = "both",
+        progress_callback: Callable = dummy_callback
+) -> Dict[str, np.ndarray]:
+    """
+    Compute the dependence of a model's prediction on a single feature.
+
+    The computation is delegated to scikit-learn's `partial_dependence`,
+    which is run on the estimator stored in `model.skl_model`.
+
+    Parameters
+    ----------
+    model : SklModel
+        Fitted model. Only scikit-learn based models are supported.
+    data : Table
+        Instances used for the computation. The table is transformed to
+        the model's domain first when `_check_model` reports it is needed.
+    feature : Variable
+        Feature whose influence is inspected. It is looked up by name
+        among the attributes of the (possibly transformed) data domain.
+    grid_resolution : int
+        Forwarded to `partial_dependence`.
+    kind : str
+        Forwarded to `partial_dependence` ("average", "individual" or
+        "both"); decides which curves are present in the result.
+    progress_callback : Callable
+        Called with 0 at the start, 0.1 before the computation and 1 at
+        the end.
+
+    Returns
+    -------
+    results : dict
+        "values" holds the grid of feature values the curves were
+        evaluated on. "average" and "individual" hold the curves
+        returned by scikit-learn, each included only when scikit-learn
+        computed it. For a two-valued discrete class the complementary
+        curves (1 - p) are stacked in front, so that the first axis has
+        one entry per class value.
+    """
+    progress_callback(0)
+    _check_data(data)
+    needs_pp = _check_model(model, data)
+    if needs_pp:
+        data = model.data_to_model_domain(data)
+
+    # NOTE(review): asserts are stripped under `python -O`; kept as they
+    # are so that the exception type seen by callers does not change.
+    assert feature.name in [a.name for a in data.domain.attributes]
+    feature_index = data.domain.index(feature.name)
+
+    assert isinstance(model, SklModel), f"Model ({model}) is not supported."
+    progress_callback(0.1)
+
+    dep = partial_dependence(model.skl_model,
+                             data.X,
+                             [feature_index],
+                             grid_resolution=grid_resolution,
+                             kind=kind)
+
+    # scikit-learn 1.3 renamed the "values" entry to "grid_values" and
+    # later releases dropped the old key, so accept whichever is present.
+    grid_key = "grid_values" if "grid_values" in dep else "values"
+
+    is_binary = data.domain.has_discrete_class and \
+        len(data.domain.class_var.values) == 2
+
+    def _per_class(curves: np.ndarray) -> np.ndarray:
+        # For a binary target only one set of curves is returned; prepend
+        # its complement so both class values are represented.
+        if is_binary:
+            return np.vstack([1 - curves, curves])
+        return curves
+
+    # "average" / "individual" exist in `dep` only when requested through
+    # `kind`; indexing them unconditionally fails for kind="individual".
+    results = {}
+    if "average" in dep:
+        results["average"] = _per_class(dep["average"])
+    results["values"] = dep[grid_key][0]
+    if "individual" in dep:
+        results["individual"] = _per_class(dep["individual"])
+
+    progress_callback(1)
+
+    return results
diff --git a/orangecontrib/explain/tests/test_inspection.py b/orangecontrib/explain/tests/test_inspection.py
@@ -3,19 +3,19 @@
 import pkg_resources
 
 import numpy as np
-from sklearn.inspection import permutation_importance
+from sklearn.inspection import permutation_importance, partial_dependence
 
 from Orange.base import Model
 from Orange.classification import NaiveBayesLearner, RandomForestLearner, \
     LogisticRegressionLearner, TreeLearner
-from Orange.data import Table, Domain
+from Orange.data import Table, Domain, DiscreteVariable
 from Orange.data.table import DomainTransformationError
 from Orange.evaluation import CA, MSE, AUC
 from Orange.regression import RandomForestRegressionLearner, \
     TreeLearner as TreeRegressionLearner
 
 from orangecontrib.explain.inspection import permutation_feature_importance, \
-    _wrap_score, _check_model
+    _wrap_score, _check_model, individual_condition_expectation
 
 
 def _permutation_feature_importance_skl(
@@ -284,5 +284,101 @@ def test_sparse_data(self):
         )
 
 
+class TestIndividualConditionalExpectation(unittest.TestCase):
+    """Tests for `individual_condition_expectation`."""
+
+    @classmethod
+    def setUpClass(cls):
+        # Load the data sets once; they are shared by all tests.
+        cls.iris = Table.from_file("iris")
+        cls.heart = Table.from_file("heart_disease")
+        cls.housing = Table.from_file("housing")
+
+    def test_discrete_class(self):
+        """Two-valued class: results have one row per class value."""
+        # Keep the first 100 iris rows and rebuild the class variable with
+        # only the first two class values.
+        data = self.iris[:100]
+        class_var = DiscreteVariable("iris", data.domain.class_var.values[:2])
+        data = data.transform(Domain(data.domain.attributes, class_var))
+        model = RandomForestLearner(n_estimators=10, random_state=0)(data)
+        res = individual_condition_expectation(model, data, data.domain[0])
+        self.assertIsInstance(res, dict)
+        self.assertEqual(res["average"].shape, (2, 28))
+        self.assertEqual(res["individual"].shape, (2, 100, 28))
+        self.assertEqual(res["values"].shape, (28,))
+
+    def test_discrete_class_result_values(self):
+        """
+        Two-valued class: row 1 equals sklearn's output for the model and
+        row 0 equals sklearn's output for a model fitted on flipped labels.
+        """
+        data = self.iris[:100]
+        class_var = DiscreteVariable("iris", data.domain.class_var.values[:2])
+        data = data.transform(Domain(data.domain.attributes, class_var))
+        model1 = RandomForestLearner(n_estimators=10, random_state=0)(data)
+
+        # Swap the labels 0 <-> 1; model2 is fitted after the swap, while
+        # model1 above was fitted before it.
+        data.Y = np.abs(data.Y - 1)
+        model2 = RandomForestLearner(n_estimators=10, random_state=0)(data)
+
+        res = individual_condition_expectation(model1, data, data.domain[0])
+        dep1 = partial_dependence(model1.skl_model, data.X, [0], kind="both")
+        dep2 = partial_dependence(model2.skl_model, data.X, [0], kind="both")
+        np.testing.assert_array_almost_equal(
+            res["average"][:1], dep2["average"])
+        np.testing.assert_array_almost_equal(
+            res["average"][1:], dep1["average"])
+        np.testing.assert_array_almost_equal(
+            res["individual"][:1], dep2["individual"])
+        np.testing.assert_array_almost_equal(
+            res["individual"][1:], dep1["individual"])
+
+    def test_continuous_class(self):
+        """Regression model on housing: results have a single row."""
+        data = self.housing
+        model = RandomForestRegressionLearner(n_estimators=10, random_state=0)(data)
+        res = individual_condition_expectation(model, data, data.domain[0])
+        self.assertIsInstance(res, dict)
+        self.assertEqual(res["average"].shape, (1, 504))
+        self.assertEqual(res["individual"].shape, (1, 506, 504))
+        self.assertEqual(res["values"].shape, (504,))
+
+    def test_multi_class(self):
+        """Full iris data: results have three rows on the first axis."""
+        data = self.iris
+        model = RandomForestLearner(n_estimators=10, random_state=0)(data)
+        res = individual_condition_expectation(model, data, data.domain[0])
+        self.assertIsInstance(res, dict)
+        self.assertEqual(res["average"].shape, (3, 35))
+        self.assertEqual(res["individual"].shape, (3, 150, 35))
+        self.assertEqual(res["values"].shape, (35,))
+
+    def test_mixed_features(self):
+        """Heart disease data: results have two rows on the first axis."""
+        # NOTE(review): per the test name this data set presumably mixes
+        # discrete and continuous features - confirm against the data.
+        data = self.heart
+        model = RandomForestLearner(n_estimators=10, random_state=0)(data)
+        res = individual_condition_expectation(model, data, data.domain[0])
+        self.assertIsInstance(res, dict)
+        self.assertEqual(res["average"].shape, (2, 41))
+        self.assertEqual(res["individual"].shape, (2, 303, 41))
+        self.assertEqual(res["values"].shape, (41,))
+
+    def _test_sklearn(self):
+        """
+        Plot sklearn's own `PartialDependenceDisplay` for the housing data.
+
+        The name does not start with `test`, so unittest does not collect
+        it; it makes no assertions and opens a matplotlib window.
+        """
+        from matplotlib import pyplot as plt
+        # RandomForestClassifier is referenced only by the commented-out
+        # classification variant below.
+        from sklearn.ensemble import RandomForestClassifier, \
+            RandomForestRegressor
+        from sklearn.inspection import PartialDependenceDisplay
+
+        X = self.housing.X
+        y = self.housing.Y
+        model = RandomForestRegressor(random_state=0)
+
+        # X = self.iris.X[:100]
+        # y = self.iris.Y[:100]
+        # y = np.abs(y - 1)
+        # model = RandomForestClassifier(random_state=0)
+        model.fit(X, y)
+        # The last column of X is the inspected feature.
+        display = PartialDependenceDisplay.from_estimator(
+            model,
+            X,
+            [X.shape[1] - 1],
+            target=0,
+            kind="both",
+            centered=True,
+            subsample=1000,
+            # grid_resolution=100,
+            random_state=0,
+        )
+
+        plt.show()
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/orangecontrib/explain/widgets/icons/ICE.svg b/orangecontrib/explain/widgets/icons/ICE.svg