neurodata · adam2392 · May 6, 2024 · Apr 26, 2024 · Apr 26, 2024 · Apr 26, 2024
diff --git a/doc/whats_new/v0.8.rst b/doc/whats_new/v0.8.rst
@@ -13,6 +13,12 @@ Version 0.8
 Changelog
 ---------
 
+- |Fix| Previously, missing values in the ``X`` input array for sktree estimators
+    did not raise an error and silently ran, assuming the missing values were
+    encoded as an infinity value. This is now fixed, and the estimators will raise a
+    ValueError if missing values are encountered in the ``X`` input array.
+    By `Adam Li`_ (:pr:`#264`)
+
 Code and Documentation Contributors
 -----------------------------------
 

diff --git a/sktree/tree/_neighbors.py b/sktree/tree/_neighbors.py
@@ -64,3 +64,7 @@ def compute_similarity_matrix(self, X):
             The similarity matrix among the samples.
         """
         return compute_forest_similarity_matrix(self, X)
+
+    def _more_tags(self):
+        # XXX: no scikit-tree estimators support NaN inputs as of now, hence allow_nan=False
+        return {"allow_nan": False}
diff --git a/sktree/tree/tests/test_all_trees.py b/sktree/tree/tests/test_all_trees.py
@@ -3,7 +3,7 @@
 import pytest
 from numpy.testing import assert_almost_equal, assert_array_equal
 from sklearn.base import is_classifier
-from sklearn.datasets import make_blobs
+from sklearn.datasets import load_iris, make_blobs
 from sklearn.tree._tree import TREE_LEAF
 
 from sktree.tree import (
@@ -162,3 +162,24 @@ def test_similarity_matrix(tree):
 
     assert np.allclose(sim_mat, sim_mat.T)
     assert np.all((sim_mat.diagonal() == 1))
+
+
+@pytest.mark.parametrize("tree", ALL_TREES)
+def test_missing_values(tree):
+    """Smoke test to ensure that correct error is raised when missing values are present.
+
+    xref: https://github.com/neurodata/scikit-tree/issues/263
+    """
+    rng = np.random.default_rng(123)
+
+    iris_X, iris_y = load_iris(return_X_y=True, as_frame=True)
+
+    # Mask entries whose standard-normal draw is < 0.25 with NaN (~60% of values, not 25%)
+    iris_X = iris_X.mask(rng.standard_normal(iris_X.shape) < 0.25)
+
+    classifier = tree()
+    with pytest.raises(ValueError, match="Input X contains NaN"):
+        if tree.__name__.startswith("Unsupervised"):
+            classifier.fit(iris_X)
+        else:
+            classifier.fit(iris_X, iris_y)