neurodata · adam2392 · Sep 2, 2023 · Sep 2, 2023 · Sep 7, 2023 · Sep 8, 2023
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -66,6 +66,11 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install -y libopenblas-dev libatlas-base-dev liblapack-dev gfortran libgmp-dev libmpfr-dev libsuitesparse-dev ccache libmpc-dev
+          sudo apt-get install -y gcc g++
+
+      - name: show-gcc
+        run: |
+          gcc --version
 
       - name: Install Python packages
         run: |

diff --git a/.spin/cmds.py b/.spin/cmds.py
@@ -7,6 +7,8 @@
 from spin import util
 from spin.cmds import meson
 
+PROJECT_MODULE = "sktree"
+
 
 def get_git_revision_hash(submodule) -> str:
     return subprocess.check_output(["git", "rev-parse", f"@:{submodule}"]).decode("ascii").strip()

diff --git a/DEVELOPING.md b/DEVELOPING.md
@@ -76,4 +76,16 @@ Verify that installations work as expected on your machine.
 
     twine upload dist/*
 
-4. Update version number on ``meson.build`` and ``_version.py`` to the relevant version.
+4. Update version number on ``meson.build`` and ``_version.py`` to the relevant version.
+
+# Comparing branches using ASV benchmarks
+
+We use asv to compare performance between the current branch and main branch. For
+example, one can run:
+
+    asv continuous --verbose --split --bench ObliqueRandomForest origin/main constantsv2
+
+# Extending Cython API
+
+Due to the current state of scikit-learn's internal Cython code for trees, we have to instead leverage a fork of scikit-learn at https://github.com/neurodata/scikit-learn when
+extending the decision tree model API of scikit-learn. Specifically, we extend the Python and Cython API of the tree submodule in scikit-learn in our submodule, so we can introduce the tree models housed in this package. Thus these extend the functionality of decision-tree based models in a way that is not possible yet in scikit-learn itself. As one example, we introduce an abstract API to allow users to implement their own oblique splits. Our plan in the future is to benchmark these functionalities and introduce them upstream to scikit-learn where applicable and inclusion criterion are met.
diff --git a/README.md b/README.md
@@ -5,6 +5,7 @@
 [![codecov](https://codecov.io/gh/neurodata/scikit-tree/branch/main/graph/badge.svg?token=H1reh7Qwf4)](https://codecov.io/gh/neurodata/scikit-tree)
 [![PyPI Download count](https://img.shields.io/pypi/dm/scikit-tree.svg)](https://pypistats.org/packages/scikit-tree)
 [![Latest PyPI release](https://img.shields.io/pypi/v/scikit-tree.svg)](https://pypi.org/project/scikit-tree/)
+[![Benchmark](https://img.shields.io/badge/Benchmarked%20by-asv-blue)](https://img.shields.io/badge/Benchmarked%20by-asv-blue)
 
 scikit-tree
 ===========
@@ -15,17 +16,14 @@ Tree-models have withstood the test of time, and are consistently used for moder
 
 We welcome contributions for modern tree-based algorithms. We use Cython to achieve fast C/C++ speeds, while abiding by a scikit-learn compatible (tested) API. Moreover, our Cython internals are easily extensible because they follow the internal Cython API of scikit-learn as well.
 
-Due to the current state of scikit-learn's internal Cython code for trees, we have to instead leverage a fork of scikit-learn at https://github.com/neurodata/scikit-learn when
-extending the decision tree model API of scikit-learn. Specifically, we extend the Python and Cython API of the tree submodule in scikit-learn in our submodule, so we can introduce the tree models housed in this package. Thus these extend the functionality of decision-tree based models in a way that is not possible yet in scikit-learn itself. As one example, we introduce an abstract API to allow users to implement their own oblique splits. Our plan in the future is to benchmark these functionalities and introduce them upstream to scikit-learn where applicable and inclusion criterion are met.
-
 Documentation
 =============
 
 See here for the documentation for our dev version: https://docs.neurodata.io/scikit-tree/dev/index.html
 
 Why oblique trees and why trees beyond those in scikit-learn?
 =============================================================
-In 2001, Leo Breiman proposed two types of Random Forests. One was known as ``Forest-RI``, which is the axis-aligned traditional random forest. One was known as ``Forest-RC``, which is the random oblique linear combinations random forest. This leveraged random combinations of features to perform splits. [MORF](1) builds upon ``Forest-RC`` by proposing additional functions to combine features. Other modern tree variants such as Canonical Correlation Forests (CCF), or unsupervised random forests are also important at solving real-world problems using robust decision tree models.
+In 2001, Leo Breiman proposed two types of Random Forests. One was known as ``Forest-RI``, which is the axis-aligned traditional random forest. One was known as ``Forest-RC``, which is the random oblique linear combinations random forest. This leveraged random combinations of features to perform splits. A generalization of ``Forest-RC`` known as [MORF](1) builds upon ``Forest-RC`` by proposing additional functions to combine features. Other modern tree variants such as extended isolation forests, or unsupervised random forests are also important at solving real-world problems using robust decision tree models.
 
 Installation
 ============

diff --git a/build_requirements.txt b/build_requirements.txt
@@ -1,6 +1,6 @@
 meson
 meson-python
-cython>=3.0
+cython<3.0
 ninja
 numpy
 scikit-learn>=1.3

diff --git a/doc/whats_new/v0.2.rst b/doc/whats_new/v0.2.rst
@@ -31,6 +31,7 @@ Changelog
 - |Efficiency| Around 1.5-2x speed improvement for unsupervised forests, by `Adam Li`_  (:pr:`114`)
 - |API| Allow ``sqrt`` and ``log2`` keywords to be used for ``min_samples_split`` parameter in unsupervised forests, by `Adam Li`_  (:pr:`114`)
 - |Feature| Implement extended isolation forest, by `Adam Li`_ (:pr:`101`)
+- |Feature| Track constant columns within the tree to prevent splitting or evaluating columns in oblique splits that are constant, by `Adam Li`_  (:pr:`121`)
 
 Code and Documentation Contributors
 -----------------------------------

diff --git a/pyproject.toml b/pyproject.toml
@@ -77,7 +77,7 @@ requires = [
     "wheel",
     "setuptools<=65.5",
     "packaging",
-    "Cython>=0.29.24",
+    "Cython==0.29.36",
     "scikit-learn>=1.3",
     "lazy_loader>=0.1",
 

diff --git a/sktree/_lib/sklearn_fork b/sktree/_lib/sklearn_fork
diff --git a/sktree/tests/test_supervised_forest.py b/sktree/tests/test_supervised_forest.py
@@ -193,6 +193,7 @@ def _trunk(n, p=10, random_state=None):
 )
 def test_sklearn_compatible_estimator(estimator, check):
     # TODO: remove when we can replicate the CI error...
+    # this seems to be due to a compiler issue since it is not replicable on MacOSx
     if isinstance(
         estimator,
         (
@@ -207,17 +208,17 @@ def test_sklearn_compatible_estimator(estimator, check):
 
 def test_oblique_forest_sparse_parity():
     # Sparse parity dataset
-    n = 1000
+    n = 500
     X, y = _sparse_parity(n, random_state=0)
-    n_test = 0.1
+    n_test = 0.2
     X_train, X_test, y_train, y_test = train_test_split(
         X,
         y,
         test_size=n_test,
         random_state=0,
     )
 
-    rc_clf = ObliqueRandomForestClassifier(max_features=None, random_state=0)
+    rc_clf = ObliqueRandomForestClassifier(random_state=0)
     rc_clf.fit(X_train, y_train)
     y_hat = rc_clf.predict(X_test)
     rc_accuracy = accuracy_score(y_test, y_hat)
@@ -227,9 +228,11 @@ def test_oblique_forest_sparse_parity():
     y_hat = ri_clf.predict(X_test)
     ri_accuracy = accuracy_score(y_test, y_hat)
 
-    assert ri_accuracy < rc_accuracy
-    assert ri_accuracy > 0.45
-    assert rc_accuracy > 0.5
+    assert (
+        ri_accuracy < rc_accuracy
+    ), f"Oblique forest: {rc_accuracy} < Axis-aligned forest: {ri_accuracy}"
+    assert ri_accuracy > 0.45, f"Axis-aligned forest: {ri_accuracy} < 0.45"
+    assert rc_accuracy > 0.5, f"Oblique forest: {rc_accuracy} < 0.5"
 
 
 def test_oblique_forest_orthant():
@@ -258,7 +261,9 @@ def test_oblique_forest_orthant():
     y_hat = ri_clf.predict(X_test)
     ri_accuracy = accuracy_score(y_test, y_hat)
 
-    assert rc_accuracy >= ri_accuracy
+    assert (
+        rc_accuracy >= ri_accuracy
+    ), f"Oblique forest: {rc_accuracy} < Axis-aligned forest: {ri_accuracy}"
     assert ri_accuracy > 0.84
     assert rc_accuracy > 0.85
 

diff --git a/sktree/tree/_oblique_splitter.pxd b/sktree/tree/_oblique_splitter.pxd
@@ -11,6 +11,7 @@
 import numpy as np
 
 cimport numpy as cnp
+from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
 
 from .._lib.sklearn.tree._criterion cimport Criterion
@@ -46,6 +47,13 @@ cdef class BaseObliqueSplitter(Splitter):
     cdef vector[vector[DTYPE_t]] proj_mat_weights       # nonzero weights of sparse proj_mat matrix
     cdef vector[vector[SIZE_t]] proj_mat_indices        # nonzero indices of sparse proj_mat matrix
 
+    # keep a hashmap of every projection vector indices sampled
+    cdef unordered_map[size_t, bint] proj_vec_hash
+
+    cdef unordered_map[SIZE_t, DTYPE_t] min_val_map
+    cdef unordered_map[SIZE_t, DTYPE_t] max_val_map
+    cdef unordered_map[SIZE_t, bint] constant_col_map
+
     # TODO: assumes all oblique splitters only work with dense data
     cdef const DTYPE_t[:, :] X
 
@@ -58,7 +66,8 @@ cdef class BaseObliqueSplitter(Splitter):
     cdef void sample_proj_mat(
         self,
         vector[vector[DTYPE_t]]& proj_mat_weights,
-        vector[vector[SIZE_t]]& proj_mat_indices
+        vector[vector[SIZE_t]]& proj_mat_indices,
+        SIZE_t n_known_constants,
     ) noexcept nogil
 
     # Redefined here since the new logic requires calling sample_proj_mat
@@ -76,7 +85,8 @@ cdef class BaseObliqueSplitter(Splitter):
         const SIZE_t[:] samples,
         DTYPE_t[:] feature_values,
         vector[DTYPE_t]* proj_vec_weights,  # weights of the vector (max_features,)
-        vector[SIZE_t]* proj_vec_indices    # indices of the features (max_features,)
+        vector[SIZE_t]* proj_vec_indices,    # indices of the features (max_features,)
+        SIZE_t* n_known_constants
     ) noexcept nogil
 
     cdef int node_split(
@@ -97,12 +107,14 @@ cdef class ObliqueSplitter(BaseObliqueSplitter):
     cdef public double feature_combinations             # Number of features to combine
     cdef SIZE_t n_non_zeros                             # Number of non-zero features
     cdef SIZE_t[::1] indices_to_sample                  # an array of indices to sample of size mtry X n_features
+    cdef SIZE_t floor_feature_combinations
 
     # All oblique splitters (i.e. non-axis aligned splitters) require a
     # function to sample a projection matrix that is applied to the feature matrix
     # to quickly obtain the sampled projections for candidate splits.
     cdef void sample_proj_mat(
         self,
         vector[vector[DTYPE_t]]& proj_mat_weights,
-        vector[vector[SIZE_t]]& proj_mat_indices
+        vector[vector[SIZE_t]]& proj_mat_indices,
+        SIZE_t n_known_constants,
     ) noexcept nogil