Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH Multiview axis-aligned more like sklearn #266

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
199 changes: 199 additions & 0 deletions benchmarks_nonasv/bench_multiview_forest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
"""
To run this, you'll need to have the following installed:

* scikit-learn
* scikit-tree

Does two benchmarks

First, we fix the number of dimensions, increase the number of
training samples and plot the time taken to fit each classifier as a
function of the number of samples.

In the second benchmark, we fix the number of training samples, increase
the number of dimensions of the training set and plot the time taken to
fit each classifier as a function of the number of dimensions.
"""

import gc
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np

from sktree import HonestForestClassifier
from sktree.tree import HonestTreeClassifier

# to store the results
# Each list receives one elapsed fit time (in seconds) per benchmark call.
# appended to by bench_oblique_tree_classifier
scikit_classifier_results = []
# appended to by bench_scikitlearn_tree_classifier
sklearn_classifier_results = []
# appended to by bench_honest_tree_classifier
honest_classifier_results = []
# appended to by bench_honest_sklearn_classifier
honest_sklearn_results = []

mu_second = 0.0 + 10**6 # number of microseconds in a second
# number of trees requested from the forest estimators built below
n_estimators = 1000
# forwarded as ``n_jobs`` to the forest estimators built below
n_jobs = -1


def bench_scikitlearn_tree_classifier(X, Y):
    """Time the fit of scikit-learn's ``RandomForestClassifier`` on ``(X, Y)``.

    The elapsed wall-clock time (estimator construction plus ``fit``), in
    seconds, is appended to the module-level ``sklearn_classifier_results``
    list. Nothing is returned.

    Parameters
    ----------
    X
        Training data, passed unchanged to ``fit``.
    Y
        Training labels, passed unchanged to ``fit``.
    """
    from sklearn.ensemble import RandomForestClassifier

    # Run a collection before starting the clock so that garbage left over
    # from a previous benchmark is not reclaimed inside the timed region.
    gc.collect()

    # start time
    tstart = datetime.now()
    clf = RandomForestClassifier(n_estimators=n_estimators, max_features=0.3, n_jobs=n_jobs)
    clf.fit(X, Y)
    delta = datetime.now() - tstart
    # stop time

    # total_seconds() includes the ``days`` component of the timedelta, which
    # ``delta.seconds + delta.microseconds / 1e6`` silently drops.
    sklearn_classifier_results.append(delta.total_seconds())


def bench_oblique_tree_classifier(X, Y):
    """Time the fit of scikit-tree's ``MultiViewRandomForestClassifier``.

    The columns of ``X`` are split into two feature sets: the first
    ``X.shape[1] // 2`` columns and the remaining ones. The elapsed
    wall-clock time (estimator construction plus ``fit``), in seconds, is
    appended to the module-level ``scikit_classifier_results`` list.
    Nothing is returned.

    Parameters
    ----------
    X
        Training data; ``X.shape[1]`` is read to place the feature-set
        boundaries, and ``X`` is passed unchanged to ``fit``.
    Y
        Training labels, passed unchanged to ``fit``.
    """
    from sktree import MultiViewRandomForestClassifier

    # Run a collection before starting the clock so that garbage left over
    # from a previous benchmark is not reclaimed inside the timed region.
    gc.collect()

    n_features = X.shape[1]

    # start time
    tstart = datetime.now()
    clf = MultiViewRandomForestClassifier(
        n_estimators=n_estimators,
        feature_set_ends=[n_features // 2, n_features],
        max_features=0.3,
        n_jobs=n_jobs,
    )
    clf.fit(X, Y)
    delta = datetime.now() - tstart
    # stop time

    # total_seconds() includes the ``days`` component of the timedelta, which
    # ``delta.seconds + delta.microseconds / 1e6`` silently drops.
    scikit_classifier_results.append(delta.total_seconds())


def bench_honest_tree_classifier(X, Y):
    """Time the fit of ``HonestForestClassifier`` built on multi-view trees.

    The forest uses ``MultiViewDecisionTreeClassifier`` as its tree
    estimator, with the columns of ``X`` split into two feature sets (the
    first ``X.shape[1] // 2`` columns and the remaining ones). The elapsed
    wall-clock time (estimator construction plus ``fit``), in seconds, is
    appended to the module-level ``honest_classifier_results`` list.
    Nothing is returned.

    Parameters
    ----------
    X
        Training data; ``X.shape[1]`` is read to place the feature-set
        boundaries, and ``X`` is passed unchanged to ``fit``.
    Y
        Training labels, passed unchanged to ``fit``.
    """
    from sktree.tree import MultiViewDecisionTreeClassifier

    # Run a collection before starting the clock so that garbage left over
    # from a previous benchmark is not reclaimed inside the timed region.
    gc.collect()

    n_features = X.shape[1]

    # start time
    tstart = datetime.now()
    clf = HonestForestClassifier(
        # Use the same forest size as the other forest benchmarks in this
        # file; otherwise the timings plotted on the shared axis compare
        # forests with different numbers of trees.
        n_estimators=n_estimators,
        max_features=0.3,
        honest_fraction=0.5,
        n_jobs=n_jobs,
        feature_set_ends=[n_features // 2, n_features],
        tree_estimator=MultiViewDecisionTreeClassifier(),
    )
    clf.fit(X, Y)
    delta = datetime.now() - tstart
    # stop time

    # total_seconds() includes the ``days`` component of the timedelta, which
    # ``delta.seconds + delta.microseconds / 1e6`` silently drops.
    honest_classifier_results.append(delta.total_seconds())


def bench_honest_sklearn_classifier(X, Y):
    """Time the fit of ``HonestForestClassifier`` with its default tree estimator.

    The elapsed wall-clock time (estimator construction plus ``fit``), in
    seconds, is appended to the module-level ``honest_sklearn_results``
    list. Nothing is returned.

    Parameters
    ----------
    X
        Training data, passed unchanged to ``fit``.
    Y
        Training labels, passed unchanged to ``fit``.
    """
    # Run a collection before starting the clock so that garbage left over
    # from a previous benchmark is not reclaimed inside the timed region.
    gc.collect()

    # start time
    tstart = datetime.now()
    clf = HonestForestClassifier(
        # Use the same forest size as the other forest benchmarks in this
        # file; otherwise the timings plotted on the shared axis compare
        # forests with different numbers of trees.
        n_estimators=n_estimators,
        max_features=0.3,
        honest_fraction=0.5,
        n_jobs=n_jobs,
    )
    clf.fit(X, Y)
    delta = datetime.now() - tstart
    # stop time

    # total_seconds() includes the ``days`` component of the timedelta, which
    # ``delta.seconds + delta.microseconds / 1e6`` silently drops.
    honest_sklearn_results.append(delta.total_seconds())


if __name__ == "__main__":
    print("============================================")
    print("Warning: this is going to take a looong time")
    print("============================================")

    # ------------------------------------------------------------------
    # Benchmark 1: fixed dimensionality, growing number of training samples.
    # ------------------------------------------------------------------
    n = 10
    step = 1000
    n_samples = 100
    dim = 100
    n_classes = 2
    # Record the sample count actually used in each iteration so the x-axis
    # matches the data that was timed (n_samples is incremented *before*
    # the data is generated, so the first run uses 100 + step samples).
    sample_counts = []
    for i in range(n):
        print("============================================")
        print("Entering iteration %s of %s" % (i, n))
        print("============================================")
        n_samples += step
        sample_counts.append(n_samples)
        X = np.random.randn(n_samples, dim)
        Y = np.random.randint(0, n_classes, (n_samples,))
        bench_oblique_tree_classifier(X, Y)
        bench_scikitlearn_tree_classifier(X, Y)
        bench_honest_tree_classifier(X, Y)
        bench_honest_sklearn_classifier(X, Y)

    xx = sample_counts
    plt.figure("scikit-tree oblique tree benchmark results")
    plt.subplot(211)
    plt.title("Learning with varying number of samples")
    plt.plot(xx, scikit_classifier_results, "g-", label="classification")
    plt.plot(xx, sklearn_classifier_results, "o--", label="sklearn-classification")
    plt.plot(xx, honest_classifier_results, "r-", label="honest-classification")
    plt.plot(xx, honest_sklearn_results, "b-", label="honest-sklearn-classification")
    plt.legend(loc="upper left")
    plt.xlabel("number of samples")
    plt.ylabel("Time (s)")

    # ------------------------------------------------------------------
    # Benchmark 2: fixed number of samples, growing dimensionality.
    # ------------------------------------------------------------------
    # Rebind the module-level result lists to fresh ones; the benchmark
    # functions look the names up at call time, so they append to these.
    scikit_classifier_results = []
    sklearn_classifier_results = []
    honest_classifier_results = []
    honest_sklearn_results = []
    n = 10
    step = 500
    start_dim = 500
    n_classes = 2
    n_samples = 500

    dim = start_dim
    # As above, record the dimensionality actually benchmarked: dim is
    # incremented before use, so the first run uses start_dim + step columns.
    dims = []
    for i in range(0, n):
        print("============================================")
        print("Entering iteration %s of %s" % (i, n))
        print("============================================")
        dim += step
        dims.append(dim)
        X = np.random.randn(n_samples, dim)
        Y = np.random.randint(0, n_classes, (n_samples,))
        bench_oblique_tree_classifier(X, Y)
        bench_scikitlearn_tree_classifier(X, Y)
        bench_honest_tree_classifier(X, Y)
        bench_honest_sklearn_classifier(X, Y)

    xx = np.asarray(dims)
    plt.subplot(212)
    plt.title("Learning in high dimensional spaces")
    plt.plot(xx, scikit_classifier_results, "g-", label="classification")
    plt.plot(xx, sklearn_classifier_results, "o--", label="sklearn-classification")
    plt.plot(xx, honest_classifier_results, "r-", label="honest-classification")
    plt.plot(xx, honest_sklearn_results, "b-", label="honest-sklearn-classification")
    plt.legend(loc="upper left")
    plt.xlabel("number of dimensions")
    plt.ylabel("Time (s)")
    plt.axis("tight")
    plt.show()
158 changes: 158 additions & 0 deletions benchmarks_nonasv/bench_multiview_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
"""
To run this, you'll need to have the following installed:

* scikit-learn
* scikit-tree

Does two benchmarks

First, we fix the number of dimensions, increase the number of
training samples and plot the time taken to fit each classifier as a
function of the number of samples.

In the second benchmark, we fix the number of training samples, increase
the number of dimensions of the training set and plot the time taken to
fit each classifier as a function of the number of dimensions.
"""

import gc
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np

from sktree.tree import HonestTreeClassifier

# to store the results
# Each list receives one elapsed fit time (in seconds) per benchmark call.
# appended to by bench_oblique_tree_classifier
scikit_classifier_results = []
# appended to by bench_scikitlearn_tree_classifier
sklearn_classifier_results = []
# appended to by bench_honest_tree_classifier
honest_classifier_results = []

mu_second = 0.0 + 10**6 # number of microseconds in a second


def bench_scikitlearn_tree_classifier(X, Y):
    """Time the fit of scikit-learn's ``DecisionTreeClassifier`` on ``(X, Y)``.

    The elapsed wall-clock time (estimator construction plus ``fit``), in
    seconds, is appended to the module-level ``sklearn_classifier_results``
    list. Nothing is returned.

    Parameters
    ----------
    X
        Training data, passed unchanged to ``fit``.
    Y
        Training labels, passed unchanged to ``fit``.
    """
    from sklearn.tree import DecisionTreeClassifier

    # Run a collection before starting the clock so that garbage left over
    # from a previous benchmark is not reclaimed inside the timed region.
    gc.collect()

    # start time
    tstart = datetime.now()
    clf = DecisionTreeClassifier(max_features=0.3)
    clf.fit(X, Y)
    delta = datetime.now() - tstart
    # stop time

    # total_seconds() includes the ``days`` component of the timedelta, which
    # ``delta.seconds + delta.microseconds / 1e6`` silently drops.
    sklearn_classifier_results.append(delta.total_seconds())


def bench_oblique_tree_classifier(X, Y):
    """Time the fit of scikit-tree's ``MultiViewDecisionTreeClassifier``.

    The elapsed wall-clock time (estimator construction plus ``fit``), in
    seconds, is appended to the module-level ``scikit_classifier_results``
    list. Nothing is returned.

    Parameters
    ----------
    X
        Training data, passed unchanged to ``fit``.
    Y
        Training labels, passed unchanged to ``fit``.
    """
    from sktree.tree import MultiViewDecisionTreeClassifier

    # Run a collection before starting the clock so that garbage left over
    # from a previous benchmark is not reclaimed inside the timed region.
    gc.collect()

    # start time
    tstart = datetime.now()
    # NOTE(review): no ``feature_set_ends`` is passed here, so whatever the
    # estimator's default view layout is applies -- confirm that is intended.
    clf = MultiViewDecisionTreeClassifier(max_features=0.3)
    clf.fit(X, Y)
    delta = datetime.now() - tstart
    # stop time

    # total_seconds() includes the ``days`` component of the timedelta, which
    # ``delta.seconds + delta.microseconds / 1e6`` silently drops.
    scikit_classifier_results.append(delta.total_seconds())


def bench_honest_tree_classifier(X, Y):
    """Time the fit of ``HonestTreeClassifier`` wrapping a multi-view tree.

    The honest tree uses ``MultiViewDecisionTreeClassifier`` as its tree
    estimator. The elapsed wall-clock time (estimator construction plus
    ``fit``), in seconds, is appended to the module-level
    ``honest_classifier_results`` list. Nothing is returned.

    Parameters
    ----------
    X
        Training data, passed unchanged to ``fit``.
    Y
        Training labels, passed unchanged to ``fit``.
    """
    from sktree.tree import MultiViewDecisionTreeClassifier

    # Run a collection before starting the clock so that garbage left over
    # from a previous benchmark is not reclaimed inside the timed region.
    gc.collect()

    # start time
    tstart = datetime.now()
    clf = HonestTreeClassifier(
        max_features=0.3,
        honest_fraction=0.5,
        tree_estimator=MultiViewDecisionTreeClassifier(),
    )
    clf.fit(X, Y)
    delta = datetime.now() - tstart
    # stop time

    # total_seconds() includes the ``days`` component of the timedelta, which
    # ``delta.seconds + delta.microseconds / 1e6`` silently drops.
    honest_classifier_results.append(delta.total_seconds())


if __name__ == "__main__":
    print("============================================")
    print("Warning: this is going to take a looong time")
    print("============================================")

    # ------------------------------------------------------------------
    # Benchmark 1: fixed dimensionality, growing number of training samples.
    # ------------------------------------------------------------------
    n = 10
    step = 1000
    n_samples = 100
    dim = 100
    n_classes = 2
    # Record the sample count actually used in each iteration so the x-axis
    # matches the data that was timed (n_samples is incremented *before*
    # the data is generated, so the first run uses 100 + step samples).
    sample_counts = []
    for i in range(n):
        print("============================================")
        print("Entering iteration %s of %s" % (i, n))
        print("============================================")
        n_samples += step
        sample_counts.append(n_samples)
        X = np.random.randn(n_samples, dim)
        Y = np.random.randint(0, n_classes, (n_samples,))
        bench_oblique_tree_classifier(X, Y)
        bench_scikitlearn_tree_classifier(X, Y)
        bench_honest_tree_classifier(X, Y)

    xx = sample_counts
    plt.figure("scikit-tree oblique tree benchmark results")
    plt.subplot(211)
    plt.title("Learning with varying number of samples")
    plt.plot(xx, scikit_classifier_results, "g-", label="classification")
    plt.plot(xx, sklearn_classifier_results, "o--", label="sklearn-classification")
    plt.plot(xx, honest_classifier_results, "r-", label="honest-classification")
    plt.legend(loc="upper left")
    plt.xlabel("number of samples")
    plt.ylabel("Time (s)")

    # ------------------------------------------------------------------
    # Benchmark 2: fixed number of samples, growing dimensionality.
    # ------------------------------------------------------------------
    # Rebind the module-level result lists to fresh ones; the benchmark
    # functions look the names up at call time, so they append to these.
    scikit_classifier_results = []
    sklearn_classifier_results = []
    honest_classifier_results = []
    n = 10
    step = 500
    start_dim = 500
    n_classes = 2
    n_samples = 500

    dim = start_dim
    # As above, record the dimensionality actually benchmarked: dim is
    # incremented before use, so the first run uses start_dim + step columns.
    dims = []
    for i in range(0, n):
        print("============================================")
        print("Entering iteration %s of %s" % (i, n))
        print("============================================")
        dim += step
        dims.append(dim)
        X = np.random.randn(n_samples, dim)
        Y = np.random.randint(0, n_classes, (n_samples,))
        bench_oblique_tree_classifier(X, Y)
        bench_scikitlearn_tree_classifier(X, Y)
        bench_honest_tree_classifier(X, Y)

    xx = np.asarray(dims)
    plt.subplot(212)
    plt.title("Learning in high dimensional spaces")
    plt.plot(xx, scikit_classifier_results, "g-", label="classification")
    plt.plot(xx, sklearn_classifier_results, "o--", label="sklearn-classification")
    plt.plot(xx, honest_classifier_results, "r-", label="honest-classification")
    plt.legend(loc="upper left")
    plt.xlabel("number of dimensions")
    plt.ylabel("Time (s)")
    plt.axis("tight")
    plt.show()