[ENH] A modular approach to constructing forests that have permuted covariates (#211)

* Add `oob_samples_` and `predict_proba_per_tree` methods to each relevant Forest Classifier (and Regressor)
* Basic sketch of a PermutationHonestForestClassifier
* Isort in pre-commit

---------

Signed-off-by: Adam Li <adam2392@gmail.com>
neurodata · Feb 2, 2024 · 18c2f45 · 18c2f45
1 parent aa655d6
commit 18c2f45
Show file tree

Hide file tree

Showing 34 changed files with 1,031 additions and 205 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,10 +1,19 @@
 repos:
   - repo: https://github.com/psf/black
-    rev: 23.11.0
+    rev: 24.1.1
     hooks:
       - id: black
         args: [--quiet]
 
+  - repo: https://github.com/pycqa/isort
+    rev: 5.13.2
+    hooks:
+      - id: isort
+        name: isort (python)
+      - id: isort
+        name: isort (cython)
+        types: [cython]
+
   - repo: https://github.com/MarcoGorelli/cython-lint
     rev: v0.16.0
     hooks:
@@ -13,7 +22,7 @@ repos:
 
   # Ruff sktree
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.6
+    rev: v0.2.0
     hooks:
       - id: ruff
         name: ruff sktree
@@ -22,7 +31,7 @@ repos:
 
   # Ruff tutorials and examples
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.6
+    rev: v0.2.0
     hooks:
       - id: ruff
         name: ruff tutorials and examples
@@ -57,7 +66,7 @@ repos:
 
   # mypy
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.7.1
+    rev: v1.8.0
     hooks:
       - id: mypy
         # Avoid the conflict between mne/__init__.py and mne/__init__.pyi by ignoring the former

diff --git a/benchmarks_nonasv/bench_forestht.py b/benchmarks_nonasv/bench_forestht.py
@@ -4,6 +4,7 @@
 
 .. note:: This script will take a long time to run, since a power curve is generated.
 """
+
 from collections import defaultdict
 
 import matplotlib.pyplot as plt

diff --git a/benchmarks_nonasv/bench_oblique_tree.py b/benchmarks_nonasv/bench_oblique_tree.py
@@ -14,6 +14,7 @@
 training set, classify a sample and plot the time taken as a function
 of the number of dimensions.
 """
+
 import gc
 from datetime import datetime
 

diff --git a/doc/api.rst b/doc/api.rst
@@ -152,7 +152,8 @@ tree models.
    build_coleman_forest
    build_hyppo_oob_forest
    build_hyppo_cv_forest
-
+   PermutationHonestForestClassifier
+
 Datasets
 ------------------------------
 We provide some convenience functions for simulating datasets beyond

diff --git a/doc/sphinxext/sphinx_issues.py b/doc/sphinxext/sphinx_issues.py
@@ -18,6 +18,7 @@
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
+
 import re
 
 from docutils import nodes, utils

diff --git a/doc/whats_new/v0.7.rst b/doc/whats_new/v0.7.rst
@@ -13,7 +13,16 @@ Version 0.7
 Changelog
 ---------
 
--
+- |Feature| Introduce a new light-weight class for fitting honest forests while
+    permuting the covariate index :class:`sktree.stats.PermutationHonestForestClassifier`,
+    by `Adam Li`_ (:pr:`#211`)
+- |Feature| Introduce a new method ``predict_proba_per_tree`` for all
+    Forest classifiers, which predicts the probability per tree and returns
+    the output with shape ``(n_estimators, n_samples, n_classes)``,
+    by `Adam Li`_ (:pr:`#211`)
+- |Feature| Introduce a new fitted attribute ``oob_samples_`` for all
+    Forest models, which keeps track of the samples used,
+    by `Adam Li`_ (:pr:`#211`)
 
 Code and Documentation Contributors
 -----------------------------------

diff --git a/examples/sklearn_vs_sktree/plot_iris_dtc.py b/examples/sklearn_vs_sktree/plot_iris_dtc.py
@@ -16,6 +16,7 @@
 
 We also show the tree structure of a model built on all of the features.
 """
+
 import matplotlib.pyplot as plt
 import numpy as np
 from sklearn.datasets import load_iris

diff --git a/examples/sparse_oblique_trees/plot_oblique_forests_iris.py b/examples/sparse_oblique_trees/plot_oblique_forests_iris.py
@@ -15,6 +15,7 @@
 sepal length only, and on the third row using the petal width and the
 petal length only.
 """
+
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd

diff --git a/examples/splitters/plot_sparse_projection_matrix.py b/examples/splitters/plot_sparse_projection_matrix.py
@@ -15,6 +15,7 @@
 For details on how to use the hyperparameters related to the patches, see
 :class:`sktree.tree.ObliqueDecisionTreeClassifier`.
 """
+
 import matplotlib.pyplot as plt
 
 # import modules

diff --git a/pyproject.toml b/pyproject.toml
@@ -193,46 +193,6 @@ exclude = [
   'benchmarks_nonasv/'
 ]
 
-[tool.poe.tasks]
-_flake8 = 'flake8'
-_bandit = 'bandit -r sktree -c pyproject.toml'
-_black = 'black .'
-_isort = 'isort .'
-_cythonlint = 'cython-lint sktree/ --exclude "sktree/_lib/*"'
-_black_check = 'black --check sktree examples'
-_isort_check = 'isort --check .'
-_pydocstyle = 'pydocstyle ./sktree'
-_codespell = 'codespell sktree/ doc/ examples/ --ignore-words=.codespellignore --skip "**/_build/*,**/_lib/*,doc/_build/*,doc/auto_examples/*,doc/tutorials/*,doc/generated/*"'
-_changelog = 'semversioner changelog > CHANGELOG.md'
-_apply_version = 'semversioner release'
-type_check = 'mypy -p sktree --config-file pyproject.toml'
-unit_test = 'pytest ./sktree --cov=sktree --cov-report=xml --cov-config=pyproject.toml'
-build_docs = 'make -C doc/ clean html'
-build_docs_noplot = 'make -C doc/ clean html-noplot'
-clean = {shell = 'rm -rf builddir; rm -rf clean; rm -rf build; rm -rf _build; rm -rf dist;'}
-
-[[tool.poe.tasks.format]]
-sequence = ['_black', '_isort']
-ignore_fail = 'return_non_zero'
-
-[[tool.poe.tasks.format_check]]
-sequence = ['_black_check', '_isort_check']
-ignore_fail = 'return_non_zero'
-
-[[tool.poe.tasks.lint]]
-sequence = ['_flake8', '_bandit', '_codespell', '_pydocstyle', '_cythonlint']
-ignore_fail = 'return_non_zero'
-
-[[tool.poe.tasks.release]]
-sequence = ['_changelog', '_apply_version']
-
-#
-# a standard verification sequence for use in pull requests
-#
-[[tool.poe.tasks.verify]]
-sequence = ['format', 'lint', 'type_check', 'unit_test']
-ignore_fail = "return_non_zero"
-
 [tool.pydocstyle]
 convention = 'numpy'
 ignore-decorators = '(copy_doc|property|.*setter|.*getter)'

diff --git a/sktree/__init__.py b/sktree/__init__.py
@@ -1,4 +1,5 @@
 """Scikit manifold oblique random forests."""
+
 import logging
 import os
 import sys

diff --git a/sktree/datasets/__init__.py b/sktree/datasets/__init__.py
@@ -1,7 +1,7 @@
 from .hyppo import (
-    make_quadratic_classification,
-    make_trunk_classification,
     approximate_clf_mutual_information,
     approximate_clf_mutual_information_with_monte_carlo,
+    make_quadratic_classification,
+    make_trunk_classification,
 )
 from .multiview import make_gaussian_mixture, make_joint_factor_model
diff --git a/sktree/datasets/hyppo.py b/sktree/datasets/hyppo.py
@@ -1,6 +1,6 @@
 import numpy as np
-from scipy.stats import multivariate_normal, entropy
 from scipy.integrate import nquad
+from scipy.stats import entropy, multivariate_normal
 
 
 def make_quadratic_classification(n_samples: int, n_features: int, noise=False, seed=None):
@@ -49,8 +49,8 @@ def make_quadratic_classification(n_samples: int, n_features: int, noise=False,
 
 def make_trunk_classification(
     n_samples,
-    n_dim=10,
-    n_informative=10,
+    n_dim=4096,
+    n_informative=256,
     m_factor: int = -1,
     rho: int = 0,
     band_type: str = "ma",
@@ -76,10 +76,10 @@ def make_trunk_classification(
         Number of samples to generate.
     n_dim : int, optional
         The dimensionality of the dataset and the number of
-        unique labels, by default 10.
+        unique labels, by default 4096.
     n_informative : int, optional
         The informative dimensions. All others for ``n_dim - n_informative``
-        are uniform noise.
+        are uniform noise. Default is 256.
     m_factor : int, optional
         The multiplicative factor to apply to the mean-vector of the first
         distribution to obtain the mean-vector of the second distribution.
@@ -169,7 +169,7 @@ def make_trunk_classification(
         )
 
     if n_dim > n_informative:
-        X = np.hstack((X, rng.uniform(low=0, high=1, size=(n_samples, n_dim - n_informative))))
+        X = np.hstack((X, rng.uniform(low=0, high=1, size=(X.shape[0], n_dim - n_informative))))
 
     y = np.concatenate((np.zeros(n_samples // 2), np.ones(n_samples // 2)))
 

diff --git a/sktree/datasets/tests/test_hyppo.py b/sktree/datasets/tests/test_hyppo.py
@@ -1,11 +1,12 @@
-import pytest
 import numpy as np
+import pytest
 from numpy.testing import assert_array_equal
+
 from sktree.datasets import (
-    make_quadratic_classification,
-    make_trunk_classification,
     approximate_clf_mutual_information,
     approximate_clf_mutual_information_with_monte_carlo,
+    make_quadratic_classification,
+    make_trunk_classification,
 )
 
 
@@ -18,13 +19,6 @@ def test_make_quadratic_classification_v():
     assert len(x) == len(v)
 
 
-def test_make_trunk_classification_default():
-    # Test with default parameters
-    X, y = make_trunk_classification(n_samples=100)
-    assert X.shape == (100, 10)
-    assert y.shape == (100,)
-
-
 def test_make_trunk_classification_custom_parameters():
     # Test with custom parameters
     X, y = make_trunk_classification(
@@ -43,9 +37,15 @@ def test_make_trunk_classification_custom_parameters():
 def test_make_trunk_classification_autoregressive_cov():
     # Test with an autoregressive ("ar") band type and a non-zero rho
     n_dim = 10
+    n_informative = 10
     rho = 0.5
     _, _, _, cov_list = make_trunk_classification(
-        n_samples=100, n_dim=n_dim, rho=rho, band_type="ar", return_params=True
+        n_samples=100,
+        n_dim=n_dim,
+        n_informative=n_informative,
+        rho=rho,
+        band_type="ar",
+        return_params=True,
     )
     assert_array_equal(cov_list[0], cov_list[1])
     assert cov_list[0].shape == (n_dim, n_dim)
@@ -54,14 +54,19 @@ def test_make_trunk_classification_autoregressive_cov():
 
 def test_make_trunk_classification_mixture():
     # Test with the mix parameter set (mix=0.5)
-    X, y, _, _ = make_trunk_classification(n_samples=100, mix=0.5, return_params=True)
-    assert X.shape == (100, 10)
+    X, y, _, _ = make_trunk_classification(
+        n_samples=100, n_dim=10, n_informative=5, mix=0.5, return_params=True
+    )
+    assert X.shape == (100, 10), X.shape
     assert y.shape == (100,)
 
 
 def test_make_trunk_classification_return_params():
     # Test with return_params=True and uneven number of samples
-    X, y, means, covs = make_trunk_classification(n_samples=75, n_dim=10, return_params=True)
+    n_informative = 5
+    X, y, means, covs = make_trunk_classification(
+        n_samples=75, n_dim=10, n_informative=n_informative, return_params=True
+    )
     assert X.shape == (74, 10), X.shape
     assert y.shape == (74,)
     assert len(means) == 2