[ENH] Fix patch sampling for discontiguous case #219

Closed
wants to merge 10 commits
4 changes: 4 additions & 0 deletions doc/whats_new/v0.7.rst
@@ -25,6 +25,9 @@ Changelog
by `Adam Li`_ (:pr:`#211`)
- |Feature| Introduce a new set of simulations based on Marron and Wand 1992.
by `Sambit Panda`_ (:pr:`#203`)
- |Fix| :class:`sktree.tree.PatchObliqueDecisionTreeClassifier` now correctly
handles the case where one or more features are discontiguous.
by `Jaewon Chung`_ (:pr:`#219`).

Code and Documentation Contributors
-----------------------------------
@@ -34,3 +37,4 @@ the project since version inception, including:

* `Adam Li`_
* `Sambit Panda`_
* `Jaewon Chung`_
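
For context, a minimal sketch of the user-facing setup this fix targets: a patch-based tree where one axis of the data has no meaningful ordering and is therefore sampled discontiguously. The estimator parameter names below (data_dims, min_patch_dims, max_patch_dims, dim_contiguous) are assumed from the splitter attributes shown later in this diff, so treat this as illustrative rather than canonical.

# Hypothetical usage sketch, not verified against the estimator docs.
# Parameter names are assumed from the splitter attributes in this PR.
import numpy as np
from sktree.tree import PatchObliqueDecisionTreeClassifier

rng = np.random.default_rng(0)
X = rng.standard_normal((100, 8 * 5))   # 100 samples of an (8, 5) grid, flattened
y = rng.integers(0, 2, size=100)

clf = PatchObliqueDecisionTreeClassifier(
    data_dims=(8, 5),                 # original shape of each sample
    min_patch_dims=(2, 2),
    max_patch_dims=(4, 3),
    dim_contiguous=(True, False),     # second axis sampled discontiguously
    random_state=0,
)
clf.fit(X, y)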
11 changes: 8 additions & 3 deletions sktree/tree/_utils.pxd
@@ -1,6 +1,7 @@
import numpy as np

cimport numpy as cnp
from libcpp.vector cimport vector

cnp.import_array()

@@ -15,8 +16,12 @@ cpdef unravel_index(
intp_t index, cnp.ndarray[intp_t, ndim=1] shape
)

cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape)
cpdef ravel_multi_index(vector[intp_t] coords, const intp_t[:] shape)

cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil
cdef vector[intp_t] unravel_index_cython(intp_t index, const intp_t[:] shape) noexcept nogil

cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil
cdef intp_t ravel_multi_index_cython(vector[intp_t] coords, const intp_t[:] shape) noexcept nogil

cdef vector[vector[intp_t]] cartesian_cython(vector[vector[intp_t]] sequences) noexcept nogil

cpdef cartesian_python(vector[vector[intp_t]]& sequences)
31 changes: 26 additions & 5 deletions sktree/tree/_utils.pyx
@@ -10,6 +10,7 @@ import numpy as np
cimport numpy as cnp

cnp.import_array()
from libcpp.vector cimport vector

from .._lib.sklearn.tree._utils cimport rand_uniform

@@ -53,12 +54,11 @@ cpdef unravel_index(
"""
index = np.intp(index)
shape = np.array(shape)
coords = np.empty(shape.shape[0], dtype=np.intp)
unravel_index_cython(index, shape, coords)
coords = unravel_index_cython(index, shape)
return coords


cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape):
cpdef ravel_multi_index(vector[intp_t] coords, const intp_t[:] shape):
"""Converts a tuple of coordinate arrays into a flat index.

Purely used for testing purposes.
@@ -83,7 +83,7 @@ cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape):
return ravel_multi_index_cython(coords, shape)


cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil:
cdef vector[intp_t] unravel_index_cython(intp_t index, const intp_t[:] shape) noexcept nogil:
"""Converts a flat index into a tuple of coordinate arrays.

Parameters
@@ -102,14 +102,17 @@ cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] co
"""
cdef intp_t ndim = shape.shape[0]
cdef intp_t j, size
cdef vector[intp_t] coords = vector[intp_t](ndim)

for j in range(ndim - 1, -1, -1):
size = shape[j]
coords[j] = index % size
index //= size

return coords


cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil:
cdef intp_t ravel_multi_index_cython(vector[intp_t] coords, const intp_t[:] shape) noexcept nogil:
"""Converts a tuple of coordinate arrays into a flat index.

Parameters
@@ -145,3 +148,21 @@ cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) no
flat_index *= shape[i + 1]

return flat_index


cdef vector[vector[intp_t]] cartesian_cython(vector[vector[intp_t]] sequences) noexcept nogil:
cdef vector[vector[intp_t]] results = vector[vector[intp_t]](1)
cdef vector[vector[intp_t]] next_results
for new_values in sequences:
for result in results:
for value in new_values:
result_copy = result
result_copy.push_back(value)
next_results.push_back(result_copy)
results = next_results
next_results.clear()
return results


cpdef cartesian_python(vector[vector[intp_t]]& sequences):
return cartesian_cython(sequences)
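
The helpers above are intended to mirror well-known NumPy and itertools semantics for a single flat index. A short reference sketch of those semantics, assuming valid in-bounds inputs:

# Reference semantics for the helpers in sktree/tree/_utils.pyx, expressed with
# NumPy and itertools. unravel_index / ravel_multi_index mirror their NumPy
# counterparts for one flat index; cartesian_python mirrors itertools.product.
import numpy as np
from itertools import product

shape = (2, 3, 4)

# flat index -> per-dimension coordinates (what unravel_index_cython computes)
coords = np.unravel_index(17, shape)        # (1, 1, 1)

# coordinates -> flat index (what ravel_multi_index_cython computes)
flat = np.ravel_multi_index(coords, shape)  # 17

# cartesian product of per-dimension index lists (what cartesian_cython computes)
points = list(product([0, 1], [2, 4, 5]))   # [(0, 2), (0, 4), (0, 5), (1, 2), ...]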
147 changes: 57 additions & 90 deletions sktree/tree/manifold/_morf_splitter.pyx
@@ -16,7 +16,7 @@ from libcpp.vector cimport vector

from ..._lib.sklearn.tree._criterion cimport Criterion
from ..._lib.sklearn.tree._utils cimport rand_int
from .._utils cimport ravel_multi_index_cython, unravel_index_cython
from .._utils cimport cartesian_cython, ravel_multi_index_cython, unravel_index_cython


cdef class PatchSplitter(BestObliqueSplitter):
@@ -156,17 +156,12 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):

# create a buffer for storing the patch dimensions sampled per projection matrix
self.patch_dims_buff = np.zeros(data_dims.shape[0], dtype=np.intp)
self.unraveled_patch_point = np.zeros(data_dims.shape[0], dtype=np.intp)

# store the min and max patch dimension constraints
self.min_patch_dims = min_patch_dims
self.max_patch_dims = max_patch_dims
self.dim_contiguous = dim_contiguous

# initialize a buffer to allow for Fisher-Yates
self._index_patch_buffer = np.zeros(np.max(self.max_patch_dims), dtype=np.intp)
self._index_data_buffer = np.zeros(np.max(self.data_dims), dtype=np.intp)

# whether or not to perform discontiguous sampling
if not all(self.dim_contiguous):
self._discontiguous = True
@@ -211,6 +206,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):
# compute top-left seed for the multi-dimensional patch
cdef intp_t top_left_patch_seed
cdef intp_t patch_size = 1
cdef vector[intp_t] unraveled_patch_point = vector[intp_t](self.ndim)

cdef UINT32_t* random_state = &self.rand_r_state

@@ -232,7 +228,6 @@
self.max_patch_dims[idx] + 1,
random_state
)

# sample the top-left index and patch size for this dimension based on boundary effects
if self.boundary is None:
# compute the difference between the image dimensions and the current
@@ -262,10 +257,10 @@
# Convert the top-left-seed value to its appropriate index in the full image.
top_left_patch_seed = max(0, dim - patch_dim + 1)

self.unraveled_patch_point[idx] = top_left_patch_seed
unraveled_patch_point[idx] = top_left_patch_seed

top_left_patch_seed = ravel_multi_index_cython(
self.unraveled_patch_point,
unraveled_patch_point,
self.data_dims
)
return top_left_patch_seed, patch_size
@@ -315,93 +310,65 @@
intp_t top_left_patch_seed,
const intp_t[:] patch_dims,
) noexcept nogil:
cdef UINT32_t* random_state = &self.rand_r_state
# iterates over the size of the patch
cdef intp_t patch_idx
"""
Sample the projection vector for a single patch.

# stores how many patches we have iterated so far
cdef intp_t vectorized_patch_offset
cdef intp_t vectorized_point_offset
cdef intp_t vectorized_point
Parameters
----------
proj_mat_weights : vector[vector[float32_t]]
The weights of the projection matrix.
proj_mat_indices : vector[vector[intp_t]]
The indices of the projection matrix.
proj_i : intp_t
The index of the projection vector (feature) to populate.
patch_size : intp_t
The total size of the patch.
top_left_patch_seed : intp_t
The raveled (flat) index of the top-left seed of the patch.
patch_dims : array-like, shape (n_dims,)
The dimensions of the patch.
"""
# initialize a buffer to allow for Fisher-Yates
cdef vector[intp_t] _index_data_buffer

cdef intp_t dim_idx
cdef UINT32_t* random_state = &self.rand_r_state
cdef intp_t num_rows
cdef int ndim = self.ndim
cdef vector[vector[intp_t]] points = vector[vector[intp_t]](ndim)
cdef intp_t patch_dim
cdef intp_t idx
cdef intp_t i

# weights default to 1
cdef float32_t weight = 1.

# XXX: still unsure if it works yet
# XXX: THIS ONLY WORKS FOR THE FIRST DIMENSION THAT IS DISCONTIGUOUS.
cdef intp_t other_dims_offset
cdef intp_t row_index

cdef intp_t i
cdef intp_t num_rows = self.data_dims[0]
if self._discontiguous:
# fill with values 0, 1, ..., dimension - 1
for i in range(0, self.data_dims[0]):
self._index_data_buffer[i] = i
# then shuffle indices using Fisher-Yates
for i in range(num_rows):
j = rand_int(0, num_rows - i, random_state)
self._index_data_buffer[i], self._index_data_buffer[j] = \
self._index_data_buffer[j], self._index_data_buffer[i]
# now select the first `patch_dims[0]` indices
for i in range(num_rows):
self._index_patch_buffer[i] = self._index_data_buffer[i]

for patch_idx in range(patch_size):
# keep track of which dimensions of the patch we have iterated over
vectorized_patch_offset = 1

# Once the vectorized top-left-seed is unraveled, you can add the patch
# points in the array structure and compute their vectorized (unraveled)
# points, which are added to the projection vector
unravel_index_cython(top_left_patch_seed, self.data_dims, self.unraveled_patch_point)

for dim_idx in range(self.ndim):
# compute the offset from the top-left patch seed based on:
# 1. the current patch index
# 2. the patch dimension indexed by `dim_idx`
# 3. and the vectorized patch dimensions that we have seen so far
# the `vectorized_point_offset` is the offset from the top-left vectorized seed for this dimension
vectorized_point_offset = (patch_idx // (vectorized_patch_offset)) % patch_dims[dim_idx]

# then we compute the actual point in the original data shape
self.unraveled_patch_point[dim_idx] = self.unraveled_patch_point[dim_idx] + vectorized_point_offset
vectorized_patch_offset *= patch_dims[dim_idx]

# if any dimensions are discontiguous, we want to migrate the entire axis a fixed amount
# based on the shuffling
if self._discontiguous is True:
for dim_idx in range(self.ndim):
if self.dim_contiguous[dim_idx] is True:
continue

# determine the "row" we are currently on
# other_dims_offset = 1
# for idx in range(dim_idx + 1, self.ndim):
# other_dims_offset *= self.data_dims[idx]
# row_index = self.unraveled_patch_point[dim_idx] % other_dims_offset
# determine the "row" we are currently on
other_dims_offset = 1
for idx in range(dim_idx + 1, self.ndim):
if not self.dim_contiguous[idx]:
other_dims_offset *= self.data_dims[idx]

row_index = 0
for idx in range(dim_idx + 1, self.ndim):
if not self.dim_contiguous[idx]:
row_index += (
(self.unraveled_patch_point[idx] // other_dims_offset) %
self.patch_dims_buff[idx]
) * other_dims_offset
other_dims_offset //= self.data_dims[idx]

# assign random row index now
self.unraveled_patch_point[dim_idx] = self._index_patch_buffer[row_index]

# ravel the patch point into the original data dimensions
vectorized_point = ravel_multi_index_cython(self.unraveled_patch_point, self.data_dims)
cdef vector[intp_t] unraveled_patch_point = vector[intp_t](self.ndim)
unraveled_patch_point = unravel_index_cython(top_left_patch_seed, self.data_dims)

for dim_idx in range(ndim):
if self.dim_contiguous[dim_idx]:
patch_dim = patch_dims[dim_idx]
idx = patch_dim + unraveled_patch_point[dim_idx]
for i in range(unraveled_patch_point[dim_idx], idx):
points[dim_idx].push_back(i)
else:
num_rows = self.data_dims[dim_idx]
for i in range(0, num_rows):
_index_data_buffer.push_back(i)
# then shuffle indices using Fisher-Yates
for i in range(num_rows):
j = rand_int(0, num_rows - i, random_state)
_index_data_buffer[i], _index_data_buffer[j] = \
_index_data_buffer[j], _index_data_buffer[i]

for i in range(0, patch_dims[dim_idx]): # populate
points[dim_idx].push_back(_index_data_buffer[i])
_index_data_buffer.clear()

# make cartesian product of the points, ravel, then add to proj_mat_indices
cdef vector[vector[intp_t]] products = cartesian_cython(points)
for point in products:
vectorized_point = ravel_multi_index_cython(point, self.data_dims)
proj_mat_indices[proj_i].push_back(vectorized_point)
proj_mat_weights[proj_i].push_back(weight)

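
In plain terms, the rewritten sampling builds per-dimension index lists, taking a consecutive run for contiguous dimensions and a Fisher-Yates-shuffled subset of the whole axis for discontiguous ones, and then ravels their cartesian product into flat feature indices. A rough pure-Python sketch of that flow follows; the helper name and the NumPy round trip are illustrative assumptions, not the actual splitter code.

# Plain-Python sketch of the discontiguous patch sampling shown above.
# Illustrative only; mirrors the Cython logic rather than reproducing it.
import numpy as np
from itertools import product

def sample_patch_indices(top_left_seed, patch_dims, data_dims, dim_contiguous, rng):
    """Return the flat indices of one sampled patch."""
    start = np.unravel_index(top_left_seed, data_dims)
    per_dim = []
    for d, (p, n, contiguous) in enumerate(zip(patch_dims, data_dims, dim_contiguous)):
        if contiguous:
            # consecutive run of length patch_dims[d] starting at the seed
            per_dim.append(range(start[d], start[d] + p))
        else:
            # shuffled subset of the whole axis (Fisher-Yates in the Cython code)
            per_dim.append(rng.permutation(n)[:p])
    # cartesian product of per-dimension indices, raveled to flat indices
    return [np.ravel_multi_index(point, data_dims) for point in product(*per_dim)]

# Example: a (2, 2) patch over (4, 5) data where the second axis is discontiguous.
rng = np.random.default_rng(0)
idx = sample_patch_indices(top_left_seed=6, patch_dims=(2, 2),
                           data_dims=(4, 5), dim_contiguous=(True, False), rng=rng)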
13 changes: 11 additions & 2 deletions sktree/tree/tests/test_utils.py
@@ -7,7 +7,7 @@
from sktree._lib.sklearn.tree._criterion import Gini
from sktree._lib.sklearn.tree._utils import _any_isnan_axis0

from .._utils import ravel_multi_index, unravel_index
from .._utils import cartesian_python, ravel_multi_index, unravel_index
from ..manifold._morf_splitter import BestPatchSplitterTester


@@ -142,7 +142,7 @@ def test_unravel_index():
shape = np.asarray((5,))
expected_output = [(0,), (1,), (2,), (3,), (4,)]
for idx, index in enumerate(indices):
assert unravel_index(index, shape) == expected_output[idx]
assert_equal(unravel_index(index, shape), expected_output[idx])

# Test with 2D array
indices = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8])
@@ -202,3 +202,12 @@ def test_ravel_multi_index():
# assert str(e) == "Invalid index"
# else:
# assert False, "Expected ValueError"


def test_cartesian_prod():
sequences = [[1, 2], [3, 4, 5]]

from_itertools = list(product(*sequences))
from_cython = cartesian_python(sequences)

assert_equal(from_itertools, from_cython)
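
A possible follow-up test, not part of this PR: a round-trip property check that ravel_multi_index(unravel_index(i, shape), shape) == i for every flat index of a small shape.

# Suggested follow-up (not in this PR): round-trip check for the two helpers.
import numpy as np

from sktree.tree._utils import ravel_multi_index, unravel_index


def test_ravel_unravel_roundtrip():
    shape = np.asarray((3, 4, 5), dtype=np.intp)
    for flat in range(int(np.prod(shape))):
        coords = unravel_index(flat, shape)
        assert ravel_multi_index(coords, shape) == flat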