Skip to content

Commit

Permalink
[ENH] A modular approach to constructing forests that have permuted c…
Browse files Browse the repository at this point in the history
…ovariates (#211)

* Add `oob_samples_` and `predict_proba_per_tree` methods to each relevant Forest Classifier (and Regressor)
* Basic sketch of a PermutationHonestForestClassifier
* Isort in pre-commit

---------

Signed-off-by: Adam Li <adam2392@gmail.com>
  • Loading branch information
adam2392 committed Feb 2, 2024
1 parent aa655d6 commit 18c2f45
Show file tree
Hide file tree
Showing 34 changed files with 1,031 additions and 205 deletions.
17 changes: 13 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
repos:
- repo: https://github.com/psf/black
rev: 23.11.0
rev: 24.1.1
hooks:
- id: black
args: [--quiet]

- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
- id: isort
name: isort (python)
- id: isort
name: isort (cython)
types: [cython]

- repo: https://github.com/MarcoGorelli/cython-lint
rev: v0.16.0
hooks:
Expand All @@ -13,7 +22,7 @@ repos:

# Ruff sktree
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.6
rev: v0.2.0
hooks:
- id: ruff
name: ruff sktree
Expand All @@ -22,7 +31,7 @@ repos:

# Ruff tutorials and examples
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.6
rev: v0.2.0
hooks:
- id: ruff
name: ruff tutorials and examples
Expand Down Expand Up @@ -57,7 +66,7 @@ repos:

# mypy
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.7.1
rev: v1.8.0
hooks:
- id: mypy
# Avoid the conflict between mne/__init__.py and mne/__init__.pyi by ignoring the former
Expand Down
1 change: 1 addition & 0 deletions benchmarks_nonasv/bench_forestht.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
.. note:: This script will take a long time to run, since a power curve is generated.
"""

from collections import defaultdict

import matplotlib.pyplot as plt
Expand Down
1 change: 1 addition & 0 deletions benchmarks_nonasv/bench_oblique_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
training set, classify a sample and plot the time taken as a function
of the number of dimensions.
"""

import gc
from datetime import datetime

Expand Down
3 changes: 2 additions & 1 deletion doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,8 @@ tree models.
build_coleman_forest
build_hyppo_oob_forest
build_hyppo_cv_forest

PermutationHonestForestClassifier

Datasets
------------------------------
We provide some convenience functions for simulating datasets beyond
Expand Down
1 change: 1 addition & 0 deletions doc/sphinxext/sphinx_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""

import re

from docutils import nodes, utils
Expand Down
11 changes: 10 additions & 1 deletion doc/whats_new/v0.7.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,16 @@ Version 0.7
Changelog
---------

-
- |Feature| Introduce a new lightweight class,
:class:`sktree.stats.PermutationHonestForestClassifier`, for fitting honest
forests while permuting the covariate index,
by `Adam Li`_ (:pr:`211`)
- |Feature| Introduce a new method ``predict_proba_per_tree`` for all
Forest classifiers, which predicts the probability per tree and returns an
output of shape ``(n_estimators, n_samples, n_classes)``,
by `Adam Li`_ (:pr:`211`)
- |Feature| Introduce a new fitted attribute ``oob_samples_`` for all
Forest models, which keeps track of the samples used,
by `Adam Li`_ (:pr:`211`)

Code and Documentation Contributors
-----------------------------------
Expand Down
1 change: 1 addition & 0 deletions examples/sklearn_vs_sktree/plot_iris_dtc.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
We also show the tree structure of a model built on all of the features.
"""

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
Expand Down
1 change: 1 addition & 0 deletions examples/sparse_oblique_trees/plot_oblique_forests_iris.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
sepal length only, and on the third row using the petal width and the
petal length only.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
Expand Down
1 change: 1 addition & 0 deletions examples/splitters/plot_sparse_projection_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
For details on how to use the hyperparameters related to the patches, see
:class:`sktree.tree.ObliqueDecisionTreeClassifier`.
"""

import matplotlib.pyplot as plt

# import modules
Expand Down
40 changes: 0 additions & 40 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -193,46 +193,6 @@ exclude = [
'benchmarks_nonasv/'
]

[tool.poe.tasks]
_flake8 = 'flake8'
_bandit = 'bandit -r sktree -c pyproject.toml'
_black = 'black .'
_isort = 'isort .'
_cythonlint = 'cython-lint sktree/ --exclude "sktree/_lib/*"'
_black_check = 'black --check sktree examples'
_isort_check = 'isort --check .'
_pydocstyle = 'pydocstyle ./sktree'
_codespell = 'codespell sktree/ doc/ examples/ --ignore-words=.codespellignore --skip "**/_build/*,**/_lib/*,doc/_build/*,doc/auto_examples/*,doc/tutorials/*,doc/generated/*"'
_changelog = 'semversioner changelog > CHANGELOG.md'
_apply_version = 'semversioner release'
type_check = 'mypy -p sktree --config-file pyproject.toml'
unit_test = 'pytest ./sktree --cov=sktree --cov-report=xml --cov-config=pyproject.toml'
build_docs = 'make -C doc/ clean html'
build_docs_noplot = 'make -C doc/ clean html-noplot'
clean = {shell = 'rm -rf builddir; rm -rf clean; rm -rf build; rm -rf _build; rm -rf dist;'}

[[tool.poe.tasks.format]]
sequence = ['_black', '_isort']
ignore_fail = 'return_non_zero'

[[tool.poe.tasks.format_check]]
sequence = ['_black_check', '_isort_check']
ignore_fail = 'return_non_zero'

[[tool.poe.tasks.lint]]
sequence = ['_flake8', '_bandit', '_codespell', '_pydocstyle', '_cythonlint']
ignore_fail = 'return_non_zero'

[[tool.poe.tasks.release]]
sequence = ['_changelog', '_apply_version']

#
# a standard verification sequence for use in pull requests
#
[[tool.poe.tasks.verify]]
sequence = ['format', 'lint', 'type_check', 'unit_test']
ignore_fail = "return_non_zero"

[tool.pydocstyle]
convention = 'numpy'
ignore-decorators = '(copy_doc|property|.*setter|.*getter)'
Expand Down
1 change: 1 addition & 0 deletions sktree/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Scikit manifold oblique random forests."""

import logging
import os
import sys
Expand Down
4 changes: 2 additions & 2 deletions sktree/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .hyppo import (
make_quadratic_classification,
make_trunk_classification,
approximate_clf_mutual_information,
approximate_clf_mutual_information_with_monte_carlo,
make_quadratic_classification,
make_trunk_classification,
)
from .multiview import make_gaussian_mixture, make_joint_factor_model
12 changes: 6 additions & 6 deletions sktree/datasets/hyppo.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import numpy as np
from scipy.stats import multivariate_normal, entropy
from scipy.integrate import nquad
from scipy.stats import entropy, multivariate_normal


def make_quadratic_classification(n_samples: int, n_features: int, noise=False, seed=None):
Expand Down Expand Up @@ -49,8 +49,8 @@ def make_quadratic_classification(n_samples: int, n_features: int, noise=False,

def make_trunk_classification(
n_samples,
n_dim=10,
n_informative=10,
n_dim=4096,
n_informative=256,
m_factor: int = -1,
rho: int = 0,
band_type: str = "ma",
Expand All @@ -76,10 +76,10 @@ def make_trunk_classification(
Number of samples to generate.
n_dim : int, optional
The dimensionality of the dataset and the number of
unique labels, by default 10.
unique labels, by default 4096.
n_informative : int, optional
The informative dimensions. All others for ``n_dim - n_informative``
are uniform noise.
are uniform noise. Default is 256.
m_factor : int, optional
The multiplicative factor to apply to the mean-vector of the first
distribution to obtain the mean-vector of the second distribution.
Expand Down Expand Up @@ -169,7 +169,7 @@ def make_trunk_classification(
)

if n_dim > n_informative:
X = np.hstack((X, rng.uniform(low=0, high=1, size=(n_samples, n_dim - n_informative))))
X = np.hstack((X, rng.uniform(low=0, high=1, size=(X.shape[0], n_dim - n_informative))))

y = np.concatenate((np.zeros(n_samples // 2), np.ones(n_samples // 2)))

Expand Down
33 changes: 19 additions & 14 deletions sktree/datasets/tests/test_hyppo.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import pytest
import numpy as np
import pytest
from numpy.testing import assert_array_equal

from sktree.datasets import (
make_quadratic_classification,
make_trunk_classification,
approximate_clf_mutual_information,
approximate_clf_mutual_information_with_monte_carlo,
make_quadratic_classification,
make_trunk_classification,
)


Expand All @@ -18,13 +19,6 @@ def test_make_quadratic_classification_v():
assert len(x) == len(v)


def test_make_trunk_classification_default():
# Test with default parameters
X, y = make_trunk_classification(n_samples=100)
assert X.shape == (100, 10)
assert y.shape == (100,)


def test_make_trunk_classification_custom_parameters():
# Test with custom parameters
X, y = make_trunk_classification(
Expand All @@ -43,9 +37,15 @@ def test_make_trunk_classification_custom_parameters():
def test_make_trunk_classification_autoregressive_cov():
# Test with default parameters
n_dim = 10
n_informative = 10
rho = 0.5
_, _, _, cov_list = make_trunk_classification(
n_samples=100, n_dim=n_dim, rho=rho, band_type="ar", return_params=True
n_samples=100,
n_dim=n_dim,
n_informative=n_informative,
rho=rho,
band_type="ar",
return_params=True,
)
assert_array_equal(cov_list[0], cov_list[1])
assert cov_list[0].shape == (n_dim, n_dim)
Expand All @@ -54,14 +54,19 @@ def test_make_trunk_classification_autoregressive_cov():

def test_make_trunk_classification_mixture():
# Test with default parameters
X, y, _, _ = make_trunk_classification(n_samples=100, mix=0.5, return_params=True)
assert X.shape == (100, 10)
X, y, _, _ = make_trunk_classification(
n_samples=100, n_dim=10, n_informative=5, mix=0.5, return_params=True
)
assert X.shape == (100, 10), X.shape
assert y.shape == (100,)


def test_make_trunk_classification_return_params():
# Test with return_params=True and uneven number of samples
X, y, means, covs = make_trunk_classification(n_samples=75, n_dim=10, return_params=True)
n_informative = 5
X, y, means, covs = make_trunk_classification(
n_samples=75, n_dim=10, n_informative=n_informative, return_params=True
)
assert X.shape == (74, 10), X.shape
assert y.shape == (74,)
assert len(means) == 2
Expand Down

0 comments on commit 18c2f45

Please sign in to comment.