[DRAFT] Add option to treat nans as mcar #65

Status: Open. Wants to merge 5 commits into base branch submodulev3.
16 changes: 16 additions & 0 deletions sklearn/ensemble/_forest.py
@@ -320,6 +320,7 @@ def __init__(
max_samples=None,
max_bins=None,
store_leaf_values=False,
missing_car=False,
):
super().__init__(
estimator=estimator,
@@ -337,6 +338,7 @@ def __init__(
self.max_samples = max_samples
self.max_bins = max_bins
self.store_leaf_values = store_leaf_values
self.missing_car = missing_car

def apply(self, X):
"""
@@ -1085,6 +1087,7 @@ def __init__(
max_samples=None,
max_bins=None,
store_leaf_values=False,
missing_car=False,
):
super().__init__(
estimator=estimator,
@@ -1100,6 +1103,7 @@ def __init__(
max_samples=max_samples,
max_bins=max_bins,
store_leaf_values=store_leaf_values,
missing_car=missing_car,
)

@staticmethod
@@ -1970,6 +1974,9 @@ class RandomForestClassifier(ForestClassifier):

.. versionadded:: 1.4

missing_car : bool, default=False
Whether the missing values are missing completely at random (MCAR).

Attributes
----------
estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier`
@@ -2111,6 +2118,7 @@ def __init__(
max_bins=None,
store_leaf_values=False,
monotonic_cst=None,
missing_car=False,
):
super().__init__(
estimator=DecisionTreeClassifier(),
@@ -2128,6 +2136,7 @@ def __init__(
"ccp_alpha",
"store_leaf_values",
"monotonic_cst",
"missing_car",
),
bootstrap=bootstrap,
oob_score=oob_score,
@@ -2139,6 +2148,7 @@ def __init__(
max_samples=max_samples,
max_bins=max_bins,
store_leaf_values=store_leaf_values,
missing_car=missing_car,
)

self.criterion = criterion
@@ -2742,6 +2752,9 @@ class ExtraTreesClassifier(ForestClassifier):

.. versionadded:: 1.4

missing_car : bool, default=False
Whether the missing values are missing completely at random (MCAR).

Attributes
----------
estimator_ : :class:`~sklearn.tree.ExtraTreeClassifier`
@@ -2872,6 +2885,7 @@ def __init__(
max_bins=None,
store_leaf_values=False,
monotonic_cst=None,
missing_car=False,
):
super().__init__(
estimator=ExtraTreeClassifier(),
@@ -2889,6 +2903,7 @@ def __init__(
"ccp_alpha",
"store_leaf_values",
"monotonic_cst",
"missing_car",
),
bootstrap=bootstrap,
oob_score=oob_score,
@@ -2900,6 +2915,7 @@ def __init__(
max_samples=max_samples,
max_bins=max_bins,
store_leaf_values=store_leaf_values,
missing_car=missing_car,
)

self.criterion = criterion
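For reviewers, here is a minimal usage sketch of the new flag at the forest level. This is a hypothetical example, assuming this branch is installed and that the forest estimators accept NaN inputs (as upstream scikit-learn's do for the supported criteria):

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Hypothetical toy data for illustration only.
rng = np.random.RandomState(0)
X = rng.standard_normal(size=(200, 4))
y = (X[:, 0] > 0).astype(int)

# Knock out ~10% of the entries completely at random (MCAR).
X[rng.uniform(size=X.shape) < 0.1] = np.nan

# With missing_car=True the splitter sends NaNs to a random child of each
# split instead of searching for the side that best improves the criterion.
forest = RandomForestClassifier(missing_car=True, random_state=0)
forest.fit(X, y)
print(forest.score(X, y))
```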
17 changes: 17 additions & 0 deletions sklearn/tree/_classes.py
@@ -129,6 +129,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta):
"ccp_alpha": [Interval(Real, 0.0, None, closed="left")],
"store_leaf_values": ["boolean"],
"monotonic_cst": ["array-like", None],
"missing_car": ["boolean"],
}

@abstractmethod
@@ -149,6 +150,7 @@ def __init__(
ccp_alpha=0.0,
store_leaf_values=False,
monotonic_cst=None,
missing_car=False,
):
self.criterion = criterion
self.splitter = splitter
@@ -164,6 +166,7 @@ def __init__(
self.ccp_alpha = ccp_alpha
self.store_leaf_values = store_leaf_values
self.monotonic_cst = monotonic_cst
self.missing_car = missing_car

def get_depth(self):
"""Return the depth of the decision tree.
@@ -532,6 +535,7 @@ def _build_tree(
min_weight_leaf,
random_state,
monotonic_cst,
self.missing_car,
)

if is_classifier(self):
@@ -614,6 +618,7 @@ def _update_tree(self, X, y, sample_weight):
min_weight_leaf,
random_state,
monotonic_cst,
self.missing_car,
)

# Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
@@ -1152,6 +1157,9 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree):

.. versionadded:: 1.4

missing_car : bool, default=False
Whether the missing values are missing completely at random (MCAR).

Attributes
----------
classes_ : ndarray of shape (n_classes,) or list of ndarray
@@ -1280,6 +1288,7 @@ def __init__(
ccp_alpha=0.0,
store_leaf_values=False,
monotonic_cst=None,
missing_car=False,
):
super().__init__(
criterion=criterion,
@@ -1296,6 +1305,7 @@ def __init__(
monotonic_cst=monotonic_cst,
ccp_alpha=ccp_alpha,
store_leaf_values=store_leaf_values,
missing_car=missing_car,
)

@_fit_context(prefer_skip_nested_validation=True)
@@ -1784,6 +1794,7 @@ def __init__(
ccp_alpha=0.0,
store_leaf_values=False,
monotonic_cst=None,
missing_car=False,
):
super().__init__(
criterion=criterion,
@@ -1799,6 +1810,7 @@ def __init__(
ccp_alpha=ccp_alpha,
store_leaf_values=store_leaf_values,
monotonic_cst=monotonic_cst,
missing_car=missing_car,
)

@_fit_context(prefer_skip_nested_validation=True)
@@ -2054,6 +2066,9 @@ class ExtraTreeClassifier(DecisionTreeClassifier):

.. versionadded:: 1.4

missing_car : bool, default=False
Whether the missing values are missing completely at random (MCAR).

Attributes
----------
classes_ : ndarray of shape (n_classes,) or list of ndarray
@@ -2168,6 +2183,7 @@ def __init__(
ccp_alpha=0.0,
store_leaf_values=False,
monotonic_cst=None,
missing_car=False,
):
super().__init__(
criterion=criterion,
Expand All @@ -2184,6 +2200,7 @@ def __init__(
ccp_alpha=ccp_alpha,
store_leaf_values=store_leaf_values,
monotonic_cst=monotonic_cst,
missing_car=missing_car,
)


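The same flag is exposed on the tree estimators. A minimal sketch (hypothetical data) contrasting the default behavior, which may exploit an informative missingness pattern, with the MCAR assumption:

```python
import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X = rng.standard_normal(size=(500, 3))
y = rng.randint(0, 2, size=500)

# Make the missingness itself carry the label: feature 0 is NaN iff y == 1.
X[y == 1, 0] = np.nan

# The default splitter may route NaNs to whichever child improves the split,
# so it can learn from the missingness pattern itself...
informed = DecisionTreeClassifier(random_state=0).fit(X, y)

# ...whereas missing_car=True routes NaNs to a random child, which is meant
# to discard that signal (the new test below checks this on held-out data).
mcar = DecisionTreeClassifier(missing_car=True, random_state=0).fit(X, y)

print(informed.score(X, y), mcar.score(X, y))
```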
1 change: 1 addition & 0 deletions sklearn/tree/_splitter.pxd
@@ -96,6 +96,7 @@ cdef class Splitter(BaseSplitter):

cdef public Criterion criterion # Impurity criterion
cdef const float64_t[:, ::1] y
cdef bint missing_car

# Monotonicity constraints for each feature.
# The encoding is as follows:
37 changes: 27 additions & 10 deletions sklearn/tree/_splitter.pyx
@@ -148,6 +148,7 @@ cdef class Splitter(BaseSplitter):
float64_t min_weight_leaf,
object random_state,
const int8_t[:] monotonic_cst,
bint missing_car,
*argv
):
"""
@@ -173,8 +174,17 @@
The user inputted random state to be used for pseudo-randomness

monotonic_cst : const int8_t[:]
Monotonicity constraints
Indicates the monotonicity constraint to enforce on each feature.
- 1: monotonic increase
- 0: no constraint
- -1: monotonic decrease

If monotonic_cst is None, no constraints are applied.

missing_car : bool
Indicates whether missing values should be assumed to be missing completely
at random (MCAR). If so, missing values are randomly assigned to the left
or right child of the split.
"""
self.criterion = criterion

@@ -187,14 +197,18 @@
self.random_state = random_state
self.monotonic_cst = monotonic_cst
self.with_monotonic_cst = monotonic_cst is not None
self.missing_car = missing_car

def __reduce__(self):
return (type(self), (self.criterion,
self.max_features,
self.min_samples_leaf,
self.min_weight_leaf,
self.random_state,
self.monotonic_cst.base if self.monotonic_cst is not None else None), self.__getstate__())
return (type(self), (
self.criterion,
self.max_features,
self.min_samples_leaf,
self.min_weight_leaf,
self.random_state,
self.monotonic_cst.base if self.monotonic_cst is not None else None,
self.missing_car,
), self.__getstate__())

cdef int init(
self,
@@ -562,10 +576,13 @@ cdef inline intp_t node_split_best(
# The second search will have all the missing values going to the left node.
# If there are no missing values, then we search only once for the most
# optimal split.
n_searches = 2 if has_missing else 1
n_searches = 2 if has_missing and not splitter.missing_car else 1

for i in range(n_searches):
missing_go_to_left = i == 1
if not splitter.missing_car:
missing_go_to_left = i == 1
else:
missing_go_to_left = rand_int(0, 2, random_state)
criterion.missing_go_to_left = missing_go_to_left
criterion.reset()

@@ -645,7 +662,7 @@

# Evaluate when there are missing values and all missing values go
# to the right node and non-missing values go to the left node.
if has_missing:
if has_missing and not splitter.missing_car:
n_left, n_right = end - start - n_missing, n_missing
p = end - n_missing
missing_go_to_left = 0
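To summarize the control flow introduced above: without the MCAR assumption, the best-split search runs twice when missing values are present (once sending them right, once sending them left); with missing_car=True it runs only once and the side is drawn uniformly at random. A rough Python sketch of that logic, with hypothetical helper names (the real implementation is the Cython in node_split_best):

```python
import random

# Sketch of the logic in node_split_best; not the real API.
def n_split_searches(has_missing: bool, missing_car: bool) -> int:
    # Search twice only when missing values are present AND we are allowed
    # to optimize which child they are sent to.
    return 2 if has_missing and not missing_car else 1

def missing_goes_left(i: int, missing_car: bool, rng: random.Random) -> bool:
    if not missing_car:
        # Deterministic: search 0 sends missing values right, search 1 left.
        return i == 1
    # MCAR: assign missing values to a uniformly random child.
    return bool(rng.randint(0, 1))
```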
35 changes: 34 additions & 1 deletion sklearn/tree/tests/test_tree.py
@@ -2349,7 +2349,9 @@ def test_splitter_serializable(Splitter):
n_outputs, n_classes = 2, np.array([3, 2], dtype=np.intp)

criterion = CRITERIA_CLF["gini"](n_outputs, n_classes)
splitter = Splitter(criterion, max_features, 5, 0.5, rng, monotonic_cst=None)
splitter = Splitter(
criterion, max_features, 5, 0.5, rng, monotonic_cst=None, missing_car=False
)
splitter_serialize = pickle.dumps(splitter)

splitter_back = pickle.loads(splitter_serialize)
@@ -2600,6 +2602,37 @@ def test_missing_value_is_predictive():
assert tree.score(X_test, y_test) >= 0.85


def test_missing_value_is_not_predictive_with_mcar():
"""Check the tree doesnt learns when the missing value is forced to be
unpredictive.
"""
rng = np.random.RandomState(0)
n_samples = 1000

X = rng.standard_normal(size=(n_samples, 10))
y = rng.randint(0, high=2, size=n_samples)

# Create a predictive feature using `y`, with some noise
X_random_mask = rng.choice([False, True], size=n_samples, p=[0.9, 0.1])
y_mask = y.copy().astype(bool)
y_mask[X_random_mask] = ~y_mask[X_random_mask]

X_predictive = rng.standard_normal(size=n_samples)
X_predictive[y_mask] = np.nan

X[:, 5] = X_predictive

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
tree = DecisionTreeClassifier(random_state=rng, missing_car=True).fit(
X_train, y_train
)
non_mcar_tree = DecisionTreeClassifier(random_state=rng, missing_car=False).fit(
X_train, y_train
)

assert non_mcar_tree.score(X_test, y_test) > tree.score(X_test, y_test) + 0.2


@pytest.mark.parametrize(
"make_data, Tree",
[