Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft of SPORF #454

Open
wants to merge 31 commits into
base: sporf
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
cac6699
bug fix for gini computation
parthgvora Feb 11, 2021
e68176b
cythonized oblique tree seems to run :)
parthgvora Feb 11, 2021
70d1162
requirements and setup
parthgvora Feb 11, 2021
56bc9f8
cython sporf
parthgvora Feb 15, 2021
ea4b6ee
Updated predict to not do a matrix multiplication at each node
parthgvora Feb 25, 2021
b8b7fbd
parameters made equivalent with sporf
parthgvora Mar 5, 2021
50b9e66
params updated
parthgvora Mar 5, 2021
664a751
was missing a split
parthgvora Mar 13, 2021
9847dfa
fixed bug in argsort
parthgvora Mar 21, 2021
0601060
tests for splitter
parthgvora Mar 21, 2021
b624d7a
benchmarking works!
parthgvora Mar 21, 2021
9f17a5d
need to match projection matrix somehow
parthgvora Mar 22, 2021
0cec9b8
projection matrix might be fixed?
parthgvora Mar 25, 2021
100cf4e
catch edge case of projection dimension = 0
parthgvora Mar 25, 2021
f67badb
got rid of a for loop in projmat
parthgvora Mar 25, 2021
5def420
Initial commit of MORF.
adam2392 Mar 29, 2021
f3e79f2
Merge branch 'sporf' of github.com:parthgvora/ProgLearn into sporf
adam2392 Mar 29, 2021
815d719
bug fix: check if valid split!
parthgvora Mar 29, 2021
25b79f5
Merge branch 'sporf' of https://github.com/parthgvora/ProgLearn into …
parthgvora Mar 29, 2021
449f502
left debug mode on
parthgvora Mar 29, 2021
8369bd3
Adding working morf splitter.
adam2392 Mar 31, 2021
1b4171c
Adding testing for morf in jupyter notebook.
adam2392 Mar 31, 2021
96c5406
Delete Pipfile
adam2392 Mar 31, 2021
7af3b00
Splitter unit tests
parthgvora Apr 1, 2021
e51a24f
all unit tests in fpGiniSplitTest.h pass
parthgvora Apr 1, 2021
ced5fcc
Merge branch 'sporf' of https://github.com/parthgvora/ProgLearn into …
parthgvora Apr 1, 2021
c13500b
Pushing working morf forest classifier.
adam2392 Apr 1, 2021
be9e73b
Pushing working morf forest classifier.
adam2392 Apr 1, 2021
cadc724
SPORF + sklearn unit tests, fixed predict_proba
parthgvora Apr 1, 2021
e13051a
fixed predict_log_proba
parthgvora Apr 1, 2021
9f77e7b
Relevant sklearn unit tests added. random seed actually does somethin…
parthgvora Apr 1, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ __pycache__/
# C extensions
*.so

*.c
*.cpp

# Distribution / packaging
.Python
build/
Expand Down
31 changes: 31 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# simple makefile to simplify repetitive build env management tasks under posix

# caution: testing won't work on windows, see README

PYTHON ?= python
CYTHON ?= cython
PYTEST ?= pytest
CTAGS ?= ctags

all: clean inplace test

clean-ctags:
	rm -f tags

clean: clean-ctags
	$(PYTHON) setup.py clean
	rm -rf dist
	# NOTE: the upstream scikit-learn Makefile also ran
	# sklearn/_build_utils/deprecated_modules.py here; that script does
	# not exist in this repository, so the step is dropped.

in: inplace # just a shortcut
inplace:
	$(PYTHON) setup.py build_ext -i

cython:
	# use the configurable interpreter, consistent with the other targets
	$(PYTHON) setup.py build_src

ctags:
	# make tags for symbol based navigation in emacs and vim
	# Install with: sudo apt-get install exuberant-ctags
	# index this project's package (not sklearn, from which this file was adapted)
	$(CTAGS) --python-kinds=-i -R proglearn
30 changes: 17 additions & 13 deletions proglearn/forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,13 @@ class LifelongClassificationForest(ClassificationProgressiveLearner):
oblique : bool, default=False
Specifies if an oblique tree should used for the classifier or not.

feature_combinations : float, default=1.5
feature_combinations : float, default=2
The feature combinations to use for the oblique split.
Equal to the parameter 'L' in the sporf paper.

density : float, default=0.5
Density estimate.
max_features : float, default=1.0
Controls the max number of features to consider for oblique split.
The parameter 'd' in the sporf paper is equal to ceil(max_features * dimensions)

Attributes
----------
Expand All @@ -61,8 +63,8 @@ def __init__(
default_kappa=np.inf,
default_max_depth=30,
oblique=False,
default_feature_combinations=1.5,
default_density=0.5,
default_feature_combinations=2,
default_max_features=1.0,
):
self.default_n_estimators = default_n_estimators
self.default_tree_construction_proportion = default_tree_construction_proportion
Expand All @@ -73,7 +75,7 @@ def __init__(
if oblique:
default_transformer_class = ObliqueTreeClassificationTransformer
self.default_feature_combinations = default_feature_combinations
self.default_density = default_density
self.default_max_features = default_max_features

else:
default_transformer_class = TreeClassificationTransformer
Expand All @@ -97,7 +99,7 @@ def add_task(
kappa="default",
max_depth="default",
feature_combinations="default",
density="default",
max_features="default",
):
"""
adds a task with id task_id, max tree depth max_depth, given input data matrix X
Expand Down Expand Up @@ -133,11 +135,13 @@ def add_task(
The maximum depth of a tree in the Lifelong Classification Forest.
The default is used if 'default' is provided.

feature_combinations : float, default='default'
feature_combinations : float, default='default'
The feature combinations to use for the oblique split.
Equal to the parameter 'L' in the sporf paper.
The default is used if 'default' is provided.

density : float, default='default'
Density estimate.
max_features : float, default='default'
Controls the max number of features to consider for the oblique split.
The parameter 'd' in the sporf paper is equal to ceil(max_features * dimensions).
The default is used if 'default' is provided.

Returns
-------
Expand All @@ -156,14 +160,14 @@ def add_task(
if self.oblique:
if feature_combinations == "default":
feature_combinations = self.default_feature_combinations
if density == "default":
density = self.default_density
if max_features == "default":
max_features = self.default_max_features

transformer_kwargs = {
"kwargs": {
"max_depth": max_depth,
"feature_combinations": feature_combinations,
"density": density,
"max_features": max_features,
}
}

Expand Down
245 changes: 245 additions & 0 deletions proglearn/split.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
#cython: language_level=3
#cython: boundscheck=False
#cython: wraparound=False

cimport cython

import numpy as np

from libcpp.unordered_map cimport unordered_map
from cython.operator import dereference, postincrement

from libcpp.algorithm cimport sort as stdsort

from libcpp.vector cimport vector
from libcpp.pair cimport pair

from cython.parallel import prange

# Computes the gini score for a split
# 0 < t < len(y)

cdef class BaseObliqueSplitter:

cdef void argsort(self, double[:] y, int[:] idx) nogil:
    # Indirect sort: fill `idx` with the permutation that puts `y` in
    # ascending order.  `idx` must have the same length as `y`.
    # Implemented by sorting (value, original-index) pairs with
    # std::sort, so equal values are ordered by their original index.
    cdef int length = y.shape[0]
    cdef int i = 0
    cdef pair[double, int] p
    cdef vector[pair[double, int]] v

    # Pack each value together with its original position.
    for i in range(length):
        p.first = y[i]
        p.second = i
        v.push_back(p)

    # std::pair sorts lexicographically: by value, then by index.
    stdsort(v.begin(), v.end())

    # Unpack the resulting permutation into the output buffer.
    for i in range(length):
        idx[i] = v[i].second

cdef (int, int) argmin(self, double[:, :] A) nogil:
    # Return the (row, col) position of the minimum element of the 2-D
    # array A.  On ties, the first minimum in row-major scan order wins.
    # Assumes A is non-empty (A[0, 0] is read unconditionally).
    cdef int N = A.shape[0]
    cdef int M = A.shape[1]
    cdef int i = 0
    cdef int j = 0
    cdef int min_i = 0
    cdef int min_j = 0
    cdef double minimum = A[0, 0]

    for i in range(N):
        for j in range(M):

            # strict < keeps the earliest occurrence on ties
            if A[i, j] < minimum:
                minimum = A[i, j]
                min_i = i
                min_j = j

    return (min_i, min_j)

cdef int argmax(self, double[:] A) nogil:
    # Return the index of the maximum element of the 1-D array A.
    # On ties, the first maximum wins.  Assumes A is non-empty.
    cdef int N = A.shape[0]
    cdef int i = 0
    cdef int max_i = 0
    cdef double maximum = A[0]

    for i in range(N):
        # strict > keeps the earliest occurrence on ties
        if A[i] > maximum:
            maximum = A[i]
            max_i = i

    return max_i

cdef double impurity(self, double[:] y) nogil:
    # Gini impurity of the label vector y:
    #   1 - sum_k (count_k / n)^2
    # Returns 0 for an empty input (pure by convention).
    # Class labels are doubles and counted via a C++ unordered_map.
    cdef int length = y.shape[0]
    cdef double dlength = y.shape[0]
    cdef double temp = 0
    cdef double gini = 1.0

    cdef unordered_map[double, double] counts
    cdef unordered_map[double, double].iterator it = counts.begin()

    # Empty node: nothing to measure.
    if length == 0:
        return 0

    # Count all unique elements
    # NOTE(review): loop variable `i` is not cdef-declared here (unlike
    # the other methods); this relies on Cython's type inference to keep
    # the loop C-level inside nogil — confirm it compiles as intended.
    for i in range(0, length):
        temp = y[i]
        counts[temp] += 1

    # Accumulate 1 - sum of squared class frequencies.
    it = counts.begin()
    while it != counts.end():
        temp = dereference(it).second
        temp = temp / dlength
        temp = temp * temp
        gini -= temp

        postincrement(it)

    return gini

cdef double score(self, double[:] y, int t) nogil:
    # Weighted Gini score of splitting the (already sorted) label
    # vector y at position t: left child gets y[:t], right child y[t:].
    # Per the file-top comment, callers guarantee 0 < t < len(y).
    cdef double length = y.shape[0]
    cdef double left_gini = 1.0
    cdef double right_gini = 1.0
    cdef double gini = 0

    cdef double[:] left = y[:t]
    cdef double[:] right = y[t:]

    cdef double l_length = left.shape[0]
    cdef double r_length = right.shape[0]

    left_gini = self.impurity(left)
    right_gini = self.impurity(right)

    # Children impurities weighted by their share of the samples.
    gini = (l_length / length) * left_gini + (r_length / length) * right_gini
    return gini

# X = proj_X, y = y_sample
# X = proj_X, y = y_sample
cpdef best_split(self, double[:, :] X, double[:] y, int[:] sample_inds):
    """Find the best single-feature split over the projected data.

    Parameters
    ----------
    X : double[:, :]
        Projected samples, shape (n_samples, proj_dims); each column is
        a candidate split feature.
    y : double[:]
        Labels aligned with the rows of X.
    sample_inds : int[:]
        Original sample indices for the rows of X, so the caller can
        recover which true samples fall left/right of the split.

    Returns
    -------
    tuple
        (feature, threshold, left_impurity, left_idx,
         right_impurity, right_idx, improvement)
    """
    cdef int n_samples = X.shape[0]
    cdef int proj_dims = X.shape[1]
    cdef int i = 0
    cdef int j = 0
    cdef long temp_int = 0;
    cdef double node_impurity = 0;

    cdef int thresh_i = 0
    cdef int feature = 0
    cdef double best_gini = 0
    cdef double threshold = 0
    cdef double improvement = 0
    cdef double left_impurity = 0
    cdef double right_impurity = 0

    # Q[i, j] = weighted Gini of splitting feature j between sorted
    # positions i-1 and i.
    Q = np.zeros((n_samples, proj_dims), dtype=np.float64)
    cdef double[:, :] Q_view = Q

    idx = np.zeros(n_samples, dtype=np.intc)
    cdef int[:] idx_view = idx

    y_sort = np.zeros(n_samples, dtype=np.float64)
    cdef double[:] y_sort_view = y_sort

    feat_sort = np.zeros(n_samples, dtype=np.float64)
    cdef double[:] feat_sort_view = feat_sort

    si_return = np.zeros(n_samples, dtype=np.intc)
    cdef int[:] si_return_view = si_return

    # No split or invalid split --> node impurity
    # Pre-filling Q with the parent impurity means positions that never
    # receive a valid score cannot beat a real split in the argmin below.
    node_impurity = self.impurity(y)
    Q_view[:, :] = node_impurity

    for j in range(0, proj_dims):

        # Sort samples (and their labels) along candidate feature j.
        self.argsort(X[:, j], idx_view)
        for i in range(0, n_samples):
            temp_int = idx_view[i]
            y_sort_view[i] = y[temp_int]
            feat_sort_view[i] = X[temp_int, j]

        # Score every candidate boundary in parallel.
        for i in prange(1, n_samples, nogil=True):

            # Check if the split is valid!
            # Only boundaries where the feature value actually changes
            # can separate samples; ties keep the node_impurity filler.
            if feat_sort_view[i-1] < feat_sort_view[i]:
                Q_view[i, j] = self.score(y_sort_view, i)

    # Identify best split
    (thresh_i, feature) = self.argmin(Q_view)

    best_gini = Q_view[thresh_i, feature]
    # Sort samples by split feature
    self.argsort(X[:, feature], idx_view)
    for i in range(0, n_samples):
        temp_int = idx_view[i]

        # Sort X so we can get threshold
        feat_sort_view[i] = X[temp_int, feature]

        # Sort y so we can get left_y, right_y
        y_sort_view[i] = y[temp_int]

        # Sort true sample inds
        si_return_view[i] = sample_inds[temp_int]

    # Get threshold, split samples into left and right
    if (thresh_i == 0):
        # NOTE(review): thresh_i == 0 means no valid split was found
        # (every sample stays in the right child).  Assigning the node
        # impurity as the "threshold" looks like a placeholder rather
        # than a real feature value (the original feat_sort_view[thresh_i]
        # is commented out) — confirm callers special-case thresh_i == 0.
        threshold = node_impurity #feat_sort_view[thresh_i]
    else:
        # Midpoint between the two feature values straddling the cut.
        threshold = 0.5 * (feat_sort_view[thresh_i] + feat_sort_view[thresh_i - 1])

    left_idx = si_return_view[:thresh_i]
    right_idx = si_return_view[thresh_i:]

    # Evaluate improvement
    improvement = node_impurity - best_gini

    # Evaluate impurities for left and right children
    left_impurity = self.impurity(y_sort_view[:thresh_i])
    right_impurity = self.impurity(y_sort_view[thresh_i:])

    return feature, threshold, left_impurity, left_idx, right_impurity, right_idx, improvement

"""
Python wrappers for cdef functions.
Only to be used for testing purposes.
"""

def test_argsort(self, y):
    """Python-visible wrapper around the cdef ``argsort``; testing only."""
    out = np.zeros(len(y), dtype=np.intc)
    self.argsort(y, out)
    return out

def test_argmin(self, M):
    """Python-visible wrapper around the cdef ``argmin``; testing only."""
    return self.argmin(M)

def test_impurity(self, y):
    """Python-visible wrapper around the cdef ``impurity``; testing only."""
    return self.impurity(y)

def test_score(self, y, t):
    """Python-visible wrapper around the cdef ``score``; testing only."""
    return self.score(y, t)

def test_best_split(self, X, y, idx):
    """Python-visible wrapper around ``best_split``; testing only."""
    return self.best_split(X, y, idx)

def test(self):

# Test score
y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1], dtype=np.float64)
s = [self.score(y, i) for i in range(10)]
print(s)

# Test splitter
# This one worked
X = np.array([[0, 0, 0, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1]], dtype=np.float64)
y = np.array([0, 0, 0, 1, 1, 1, 1], dtype=np.float64)
si = np.array([0, 1, 2, 3, 4, 5, 6], dtype=np.intc)

(f, t, li, lidx, ri, ridx, imp) = self.best_split(X, y, si)
print(f, t)