Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement SPORF #374

Merged
merged 19 commits into from
Dec 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/tutorials.rst
Expand Up @@ -13,6 +13,8 @@ The following tutorials highlight what one can do with the ``ProgLearn`` package
tutorials/random_class_exp
tutorials/rotation_cifar
tutorials/spiral_exp
tutorials/sporf_datasets
tutorials/sporf_decision_boundaries
tutorials/uncertaintyforest_running_example
tutorials/uncertaintyforest_posteriorestimates
tutorials/uncertaintyforest_conditionalentropyestimates
Expand Down
90 changes: 90 additions & 0 deletions docs/tutorials/functions/sporf_datasets_functions.py
@@ -0,0 +1,90 @@
import sys
import numpy as np
import pandas as pd
import csv
from numpy import genfromtxt

from proglearn.progressive_learner import ProgressiveLearner
from proglearn.voters import TreeClassificationVoter
from proglearn.transformers import TreeClassificationTransformer
from proglearn.transformers import ObliqueTreeClassificationTransformer
from proglearn.deciders import SimpleArgmaxAverage

from sklearn.model_selection import train_test_split, cross_val_score

def load_simulated_data(file):
    """Load a comma-separated numeric dataset and split it into features and labels.

    Parameters
    ----------
    file : str, path-like, file object or list of str
        Anything ``numpy.genfromtxt`` accepts as its input source.

    Returns
    -------
    X : numpy.ndarray
        Every column of the parsed data except the last one.
    y : numpy.ndarray
        The last column of the parsed data.
    """
    # genfromtxt collapses a one-row input to a 1-D array; promote it back to
    # 2-D so the column slicing below also works for a single sample.
    data = np.atleast_2d(genfromtxt(file, delimiter=","))

    return data[:, :-1], data[:, -1]

def load_data(data_file, task_num):
    """Load one of the supported benchmark datasets and split it into train/test.

    The dataset is selected by substring match on ``data_file``: paths
    containing ``"Hill_Valley"`` are read with ``pandas.read_csv``; paths
    containing ``"acute"`` are read with ``pandas.read_table`` using UTF-16.

    Parameters
    ----------
    data_file : str
        Path to the data file; its text decides which loader is used.
    task_num : int
        Only used for the "acute" data: ``1`` selects column 6 as the label,
        any other value selects column 7.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        A shuffled, stratified split with 10% of the samples held out for
        testing.
    n_classes : int
        Number of distinct values in the full label vector.

    Raises
    ------
    ValueError
        If ``data_file`` names neither supported dataset.
    """
    is_hill_valley = "Hill_Valley" in data_file
    is_acute = "acute" in data_file

    if not (is_hill_valley or is_acute):
        # Fail early with a clear message rather than an UnboundLocalError
        # when the split below finds no X / y to work with.
        raise ValueError(
            "Unsupported data file %r: expected a path containing "
            "'Hill_Valley' or 'acute'." % (data_file,)
        )

    if is_hill_valley:
        df = pd.read_csv(data_file)
        # Last column is the label, everything before it is a feature.
        X = df[df.columns[:-1]].to_numpy()
        y = df[df.columns[-1]].to_numpy()

    if is_acute:
        df = pd.read_table(data_file, encoding='utf-16')
        # Encode the textual yes/no answers as 1/0.
        df[df == "no"] = 0
        df[df == "yes"] = 1

        data = df.to_numpy()

        # The first column (temperature) is written with a decimal comma;
        # convert each entry to a regular float.
        data[:, 0] = np.array(
            [float(str(temp).replace(",", ".")) for temp in data[:, 0]]
        )

        # NOTE(review): only columns 0-4 are used as features while the labels
        # come from columns 6 and 7, so column 5 is never read - confirm that
        # leaving it out is intended.
        X = np.array(data[:, :5], dtype=float)

        # 6 for task 1, 7 for task 2
        label_column = 6 if task_num == 1 else 7
        y = np.array(data[:, label_column], dtype=float)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, shuffle=True, stratify=y
    )

    return X_train, X_test, y_train, y_test, len(np.unique(y))


def test(data_file, reps, n_trees, task_num,
         default_transformer_class, default_transformer_kwargs):
    """Repeatedly train a progressive learner on one dataset and summarise kappa.

    Each repetition reloads the data with ``load_data``, fits a fresh
    ``ProgressiveLearner`` on a single task with ``n_trees`` transformers,
    prints the test accuracy and converts it to a chance-corrected score
    ``(acc - 1 / n_classes) / (1 - 1 / n_classes)``.

    Parameters
    ----------
    data_file : str
        Path forwarded to ``load_data``.
    reps : int
        Number of repetitions.
    n_trees : int
        Passed to ``add_task`` as ``num_transformers``.
    task_num : int
        Forwarded to ``load_data``.
    default_transformer_class, default_transformer_kwargs
        Forwarded unchanged to the ``ProgressiveLearner`` constructor.

    Returns
    -------
    tuple
        Mean of the per-repetition scores times 100, and their standard
        deviation times 100 divided by ``sqrt(reps)``.
    """
    scores = np.zeros(reps)

    for rep in range(reps):
        X_train, X_test, y_train, y_test, n_classes = load_data(data_file, task_num)

        # Voter and decider are fixed; only the transformer is configurable.
        learner_kwargs = {
            "default_transformer_class": default_transformer_class,
            "default_transformer_kwargs": default_transformer_kwargs,
            "default_voter_class": TreeClassificationVoter,
            "default_voter_kwargs": {},
            "default_decider_class": SimpleArgmaxAverage,
            "default_decider_kwargs": {"classes": np.arange(n_classes)},
        }
        learner = ProgressiveLearner(**learner_kwargs)
        learner.add_task(X_train, y_train, num_transformers=n_trees)

        predictions = learner.predict(X_test, task_id=0)

        n_correct = np.sum(y_test == predictions)
        acc = n_correct / len(y_test)
        print("Accuracy after iteration ", rep, ": ", acc)

        # Rescale accuracy so that uniform guessing maps to 0 and perfect to 1.
        chance = 1 / n_classes
        scores[rep] = (acc - chance) / (1 - chance)

    mean_pct = np.mean(scores) * 100
    stderr_pct = (np.std(scores) * 100) / np.sqrt(reps)
    return mean_pct, stderr_pct
96 changes: 96 additions & 0 deletions docs/tutorials/functions/sporf_decision_boundaries_functions.py
@@ -0,0 +1,96 @@
from rerf.rerfClassifier import rerfClassifier

import numpy as np
np.random.seed(42)

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.ensemble import RandomForestClassifier

from proglearn.forest import LifelongClassificationForest
from proglearn.voters import TreeClassificationVoter
from proglearn.transformers import TreeClassificationTransformer
from proglearn.transformers import ObliqueTreeClassificationTransformer
from proglearn.deciders import SimpleArgmaxAverage

def test(NT, h, names, classifiers, datasets):
    """Plot decision surfaces: one row per dataset, one column per classifier.

    The first panel of each row shows the standardised input data; each
    following panel shows one classifier's decision surface with the train
    and test points drawn on top and the test score in the corner. Panels are
    created with ``plt.subplot``.

    Parameters
    ----------
    NT : int
        Passed as ``n_estimators`` when a ``LifelongClassificationForest`` is
        built for a "Proglearn" entry.
    h : float
        Step of the mesh grid on which each classifier is evaluated.
    names : sequence of str
        Panel titles, paired with ``classifiers``. For any name containing
        ``"Proglearn"`` the paired classifier object is ignored and a new
        oblique ``LifelongClassificationForest`` is trained instead.
    classifiers : sequence
        Estimators providing ``fit``, ``score`` and either
        ``decision_function`` or ``predict_proba``.
    datasets : sequence of (X, y)
        Datasets to plot; columns 0 and 1 of ``X`` are used as plot axes.
    """

    def draw_points(axis, cmap):
        # Training points opaque, testing points semi-transparent.
        axis.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap,
                     edgecolors='k')
        axis.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap,
                     alpha=0.6, edgecolors='k')

    def frame(axis):
        # Clamp the view to the mesh extent and hide the ticks.
        axis.set_xlim(xx.min(), xx.max())
        axis.set_ylim(yy.min(), yy.max())
        axis.set_xticks(())
        axis.set_yticks(())

    n_rows = len(datasets)
    n_cols = len(classifiers) + 1
    panel = 1

    for row, (X, y) in enumerate(datasets):
        # Standardise, then hold out 40% of the samples for testing.
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=.4, random_state=42)

        # Mesh covering the data range with a margin of 0.5 on every side.
        x_lo, x_hi = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_lo, y_hi = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_lo, x_hi, h),
                             np.arange(y_lo, y_hi, h))
        mesh_points = np.c_[xx.ravel(), yy.ravel()]

        surface_cmap = plt.cm.RdBu
        point_cmap = ListedColormap(['#FF0000', '#0000FF'])

        # First column: the raw data only.
        ax = plt.subplot(n_rows, n_cols, panel)
        if row == 0:
            ax.set_title("Input data")
        draw_points(ax, point_cmap)
        frame(ax)
        panel += 1

        # Remaining columns: one panel per classifier.
        for name, clf in zip(names, classifiers):
            ax = plt.subplot(n_rows, n_cols, panel)
            is_proglearn = "Proglearn" in name

            if is_proglearn:
                clf = LifelongClassificationForest(
                    oblique=True,
                    default_feature_combinations=1,
                    default_density=0.5)
                clf.add_task(X_train, y_train, n_estimators=NT)
                y_hat = clf.predict(X_test, task_id=0)
                score = np.sum(y_hat == y_test) / len(y_test)
            else:
                clf.fit(X_train, y_train)
                score = clf.score(X_test, y_test)

            # Evaluate the classifier on every mesh point, preferring
            # decision_function when the estimator offers one.
            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(mesh_points)
            elif is_proglearn:
                Z = clf.predict_proba(mesh_points, task_id=0)[:, 1]
            else:
                Z = clf.predict_proba(mesh_points)[:, 1]

            # Draw the surface first so the points end up on top of it.
            ax.contourf(xx, yy, Z.reshape(xx.shape), cmap=surface_cmap, alpha=.8)
            draw_points(ax, point_cmap)
            frame(ax)

            if row == 0:
                ax.set_title(name)
            # Test score in the lower-right corner, without the leading zero.
            score_label = ('%.2f' % score).lstrip('0')
            ax.text(xx.max() - .3, yy.min() + .3, score_label,
                    size=15, horizontalalignment='right')
            panel += 1