[DRAFT] Add option to treat nans as mcar #65

Status: Open. Wants to merge 5 commits into base branch submodulev3.
16 changes: 16 additions & 0 deletions sklearn/ensemble/_forest.py
@@ -320,6 +320,7 @@ def __init__(
max_samples=None,
max_bins=None,
store_leaf_values=False,
missing_car=False,
):
super().__init__(
estimator=estimator,
@@ -337,6 +338,7 @@ def __init__(
self.max_samples = max_samples
self.max_bins = max_bins
self.store_leaf_values = store_leaf_values
self.missing_car = missing_car

def apply(self, X):
"""
@@ -1085,6 +1087,7 @@ def __init__(
max_samples=None,
max_bins=None,
store_leaf_values=False,
missing_car=False,
):
super().__init__(
estimator=estimator,
@@ -1100,6 +1103,7 @@ def __init__(
max_samples=max_samples,
max_bins=max_bins,
store_leaf_values=store_leaf_values,
missing_car=missing_car,
)

@staticmethod
@@ -1970,6 +1974,9 @@ class RandomForestClassifier(ForestClassifier):

.. versionadded:: 1.4

missing_car : bool, default=False
Whether the missing values are missing completely at random (MCAR).

Attributes
----------
estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier`
@@ -2111,6 +2118,7 @@ def __init__(
max_bins=None,
store_leaf_values=False,
monotonic_cst=None,
missing_car=False,
):
super().__init__(
estimator=DecisionTreeClassifier(),
@@ -2128,6 +2136,7 @@ def __init__(
"ccp_alpha",
"store_leaf_values",
"monotonic_cst",
"missing_car",
),
bootstrap=bootstrap,
oob_score=oob_score,
@@ -2139,6 +2148,7 @@ def __init__(
max_samples=max_samples,
max_bins=max_bins,
store_leaf_values=store_leaf_values,
missing_car=missing_car,
)

self.criterion = criterion
@@ -2742,6 +2752,9 @@ class ExtraTreesClassifier(ForestClassifier):

.. versionadded:: 1.4

missing_car : bool, default=False
Whether the missing values are missing completely at random (MCAR).

Attributes
----------
estimator_ : :class:`~sklearn.tree.ExtraTreeClassifier`
@@ -2872,6 +2885,7 @@ def __init__(
max_bins=None,
store_leaf_values=False,
monotonic_cst=None,
missing_car=False,
):
super().__init__(
estimator=ExtraTreeClassifier(),
@@ -2889,6 +2903,7 @@ def __init__(
"ccp_alpha",
"store_leaf_values",
"monotonic_cst",
"missing_car",
),
bootstrap=bootstrap,
oob_score=oob_score,
@@ -2900,6 +2915,7 @@ def __init__(
max_samples=max_samples,
max_bins=max_bins,
store_leaf_values=store_leaf_values,
missing_car=missing_car,
)

self.criterion = criterion
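For reviewers, here is a minimal usage sketch of the new flag at the forest level. This is a hypothetical example, assuming this branch is installed and that the forest estimators accept NaN inputs (as upstream scikit-learn's do for the supported criteria):

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Hypothetical toy data for illustration only.
rng = np.random.RandomState(0)
X = rng.standard_normal(size=(200, 4))
y = (X[:, 0] > 0).astype(int)

# Knock out ~10% of the entries completely at random (MCAR).
X[rng.uniform(size=X.shape) < 0.1] = np.nan

# With missing_car=True the splitter sends NaNs to a random child of each
# split instead of searching for the side that best improves the criterion.
forest = RandomForestClassifier(missing_car=True, random_state=0)
forest.fit(X, y)
print(forest.score(X, y))
```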
17 changes: 17 additions & 0 deletions sklearn/tree/_classes.py
@@ -129,6 +129,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta):
"ccp_alpha": [Interval(Real, 0.0, None, closed="left")],
"store_leaf_values": ["boolean"],
"monotonic_cst": ["array-like", None],
"missing_car": ["boolean"],
}

@abstractmethod
@@ -149,6 +150,7 @@ def __init__(
ccp_alpha=0.0,
store_leaf_values=False,
monotonic_cst=None,
missing_car=False,
):
self.criterion = criterion
self.splitter = splitter
@@ -164,6 +166,7 @@ def __init__(
self.ccp_alpha = ccp_alpha
self.store_leaf_values = store_leaf_values
self.monotonic_cst = monotonic_cst
self.missing_car = missing_car

def get_depth(self):
"""Return the depth of the decision tree.
@@ -532,6 +535,7 @@ def _build_tree(
min_weight_leaf,
random_state,
monotonic_cst,
self.missing_car,
)

if is_classifier(self):
@@ -614,6 +618,7 @@ def _update_tree(self, X, y, sample_weight):
min_weight_leaf,
random_state,
monotonic_cst,
self.missing_car,
)

# Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
@@ -1152,6 +1157,9 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree):

.. versionadded:: 1.4

missing_car : bool, default=False
Whether the missing values are missing completely at random (MCAR).

Attributes
----------
classes_ : ndarray of shape (n_classes,) or list of ndarray
@@ -1280,6 +1288,7 @@ def __init__(
ccp_alpha=0.0,
store_leaf_values=False,
monotonic_cst=None,
missing_car=False,
):
super().__init__(
criterion=criterion,
@@ -1296,6 +1305,7 @@ def __init__(
monotonic_cst=monotonic_cst,
ccp_alpha=ccp_alpha,
store_leaf_values=store_leaf_values,
missing_car=missing_car,
)

@_fit_context(prefer_skip_nested_validation=True)
@@ -1784,6 +1794,7 @@ def __init__(
ccp_alpha=0.0,
store_leaf_values=False,
monotonic_cst=None,
missing_car=False,
):
super().__init__(
criterion=criterion,
@@ -1799,6 +1810,7 @@ def __init__(
ccp_alpha=ccp_alpha,
store_leaf_values=store_leaf_values,
monotonic_cst=monotonic_cst,
missing_car=missing_car,
)

@_fit_context(prefer_skip_nested_validation=True)
@@ -2054,6 +2066,9 @@ class ExtraTreeClassifier(DecisionTreeClassifier):

.. versionadded:: 1.4

missing_car : bool, default=False
Whether the missing values are missing completely at random (MCAR).

Attributes
----------
classes_ : ndarray of shape (n_classes,) or list of ndarray
@@ -2168,6 +2183,7 @@ def __init__(
ccp_alpha=0.0,
store_leaf_values=False,
monotonic_cst=None,
missing_car=False,
):
super().__init__(
criterion=criterion,
Expand All @@ -2184,6 +2200,7 @@ def __init__(
ccp_alpha=ccp_alpha,
store_leaf_values=store_leaf_values,
monotonic_cst=monotonic_cst,
missing_car=missing_car,
)


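The same flag is exposed on the tree estimators. A minimal sketch (hypothetical data) contrasting the default behavior, which may exploit an informative missingness pattern, with the MCAR assumption:

```python
import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X = rng.standard_normal(size=(500, 3))
y = rng.randint(0, 2, size=500)

# Make the missingness itself carry the label: feature 0 is NaN iff y == 1.
X[y == 1, 0] = np.nan

# The default splitter may route NaNs to whichever child improves the split,
# so it can learn from the missingness pattern itself...
informed = DecisionTreeClassifier(random_state=0).fit(X, y)

# ...whereas missing_car=True routes NaNs to a random child, which is meant
# to discard that signal (the new test below checks this on held-out data).
mcar = DecisionTreeClassifier(missing_car=True, random_state=0).fit(X, y)

print(informed.score(X, y), mcar.score(X, y))
```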
1 change: 1 addition & 0 deletions sklearn/tree/_splitter.pxd
@@ -96,6 +96,7 @@ cdef class Splitter(BaseSplitter):

cdef public Criterion criterion # Impurity criterion
cdef const float64_t[:, ::1] y
cdef bint missing_car

# Monotonicity constraints for each feature.
# The encoding is as follows:
37 changes: 27 additions & 10 deletions sklearn/tree/_splitter.pyx
@@ -148,6 +148,7 @@ cdef class Splitter(BaseSplitter):
float64_t min_weight_leaf,
object random_state,
const int8_t[:] monotonic_cst,
bint missing_car,
*argv
):
"""
@@ -173,8 +174,17 @@
The user inputted random state to be used for pseudo-randomness

monotonic_cst : const int8_t[:]
Monotonicity constraints
Indicates the monotonicity constraint to enforce on each feature.
- 1: monotonic increase
- 0: no constraint
- -1: monotonic decrease

If monotonic_cst is None, no constraints are applied.

missing_car : bool
Indicates whether missing values should be assumed to be missing completely
at random (MCAR). If so, missing values are randomly assigned to the left
or right child of the split.
"""
self.criterion = criterion

@@ -187,14 +197,18 @@
self.random_state = random_state
self.monotonic_cst = monotonic_cst
self.with_monotonic_cst = monotonic_cst is not None
self.missing_car = missing_car

def __reduce__(self):
return (type(self), (self.criterion,
self.max_features,
self.min_samples_leaf,
self.min_weight_leaf,
self.random_state,
self.monotonic_cst.base if self.monotonic_cst is not None else None), self.__getstate__())
return (type(self), (
self.criterion,
self.max_features,
self.min_samples_leaf,
self.min_weight_leaf,
self.random_state,
self.monotonic_cst.base if self.monotonic_cst is not None else None,
self.missing_car,
), self.__getstate__())

cdef int init(
self,
@@ -562,10 +576,13 @@ cdef inline intp_t node_split_best(
# The second search will have all the missing values going to the left node.
# If there are no missing values, then we search only once for the most
# optimal split.
n_searches = 2 if has_missing else 1
n_searches = 2 if has_missing and not splitter.missing_car else 1

for i in range(n_searches):
missing_go_to_left = i == 1
if not splitter.missing_car:
missing_go_to_left = i == 1
else:
missing_go_to_left = rand_int(0, 2, random_state)
criterion.missing_go_to_left = missing_go_to_left
criterion.reset()

@@ -645,7 +662,7 @@

# Evaluate when there are missing values and all missing values go
# to the right node and non-missing values go to the left node.
if has_missing:
if has_missing and not splitter.missing_car:
n_left, n_right = end - start - n_missing, n_missing
p = end - n_missing
missing_go_to_left = 0
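To summarize the control flow introduced above: without the MCAR assumption, the best-split search runs twice when missing values are present (once sending them right, once sending them left); with missing_car=True it runs only once and the side is drawn uniformly at random. A rough Python sketch of that logic, with hypothetical helper names (the real implementation is the Cython in node_split_best):

```python
import random

# Sketch of the logic in node_split_best; not the real API.
def n_split_searches(has_missing: bool, missing_car: bool) -> int:
    # Search twice only when missing values are present AND we are allowed
    # to optimize which child they are sent to.
    return 2 if has_missing and not missing_car else 1

def missing_goes_left(i: int, missing_car: bool, rng: random.Random) -> bool:
    if not missing_car:
        # Deterministic: search 0 sends missing values right, search 1 left.
        return i == 1
    # MCAR: assign missing values to a uniformly random child.
    return bool(rng.randint(0, 1))
```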
35 changes: 34 additions & 1 deletion sklearn/tree/tests/test_tree.py
@@ -2349,7 +2349,9 @@ def test_splitter_serializable(Splitter):
n_outputs, n_classes = 2, np.array([3, 2], dtype=np.intp)

criterion = CRITERIA_CLF["gini"](n_outputs, n_classes)
splitter = Splitter(criterion, max_features, 5, 0.5, rng, monotonic_cst=None)
splitter = Splitter(
criterion, max_features, 5, 0.5, rng, monotonic_cst=None, missing_car=False
)
splitter_serialize = pickle.dumps(splitter)

splitter_back = pickle.loads(splitter_serialize)
@@ -2600,6 +2602,37 @@ def test_missing_value_is_predictive():
assert tree.score(X_test, y_test) >= 0.85


def test_missing_value_is_not_predictive_with_mcar():
"""Check the tree doesnt learns when the missing value is forced to be
unpredictive.
"""
rng = np.random.RandomState(0)
n_samples = 1000

X = rng.standard_normal(size=(n_samples, 10))
y = rng.randint(0, high=2, size=n_samples)

# Create a predictive feature using `y`, with some noise
X_random_mask = rng.choice([False, True], size=n_samples, p=[0.9, 0.1])
y_mask = y.copy().astype(bool)
y_mask[X_random_mask] = ~y_mask[X_random_mask]

X_predictive = rng.standard_normal(size=n_samples)
X_predictive[y_mask] = np.nan

X[:, 5] = X_predictive

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
tree = DecisionTreeClassifier(random_state=rng, missing_car=True).fit(
X_train, y_train
)
non_mcar_tree = DecisionTreeClassifier(random_state=rng, missing_car=False).fit(
X_train, y_train
)

assert non_mcar_tree.score(X_test, y_test) > tree.score(X_test, y_test) + 0.2


@pytest.mark.parametrize(
"make_data, Tree",
[