[ENH] Fix patch sampling for discontiguous case #219

Closed
wants to merge 10 commits
4 changes: 4 additions & 0 deletions doc/whats_new/v0.7.rst
@@ -25,6 +25,9 @@ Changelog
by `Adam Li`_ (:pr:`#211`)
- |Feature| Introduce a new set of simulations based on Marron and Wand 1992.
by `Sambit Panda`_ (:pr:`#203`)
- |Fix| :class:`sktree.tree.PatchObliqueDecisionTreeClassifier` now correctly
handles the case where one or more features are discontiguous.
by `Jaewon Chung`_ (:pr:`#219`).

Code and Documentation Contributors
-----------------------------------
@@ -34,3 +37,4 @@ the project since version inception, including:

* `Adam Li`_
* `Sambit Panda`_
* `Jaewon Chung`_
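
For context, a minimal sketch of the user-facing setup this fix targets: a patch-based tree where one axis of the data has no meaningful ordering and is therefore sampled discontiguously. The estimator parameter names below (data_dims, min_patch_dims, max_patch_dims, dim_contiguous) are assumed from the splitter attributes shown later in this diff, so treat this as illustrative rather than canonical.

# Hypothetical usage sketch, not verified against the estimator docs.
# Parameter names are assumed from the splitter attributes in this PR.
import numpy as np
from sktree.tree import PatchObliqueDecisionTreeClassifier

rng = np.random.default_rng(0)
X = rng.standard_normal((100, 8 * 5))   # 100 samples of an (8, 5) grid, flattened
y = rng.integers(0, 2, size=100)

clf = PatchObliqueDecisionTreeClassifier(
    data_dims=(8, 5),                 # original shape of each sample
    min_patch_dims=(2, 2),
    max_patch_dims=(4, 3),
    dim_contiguous=(True, False),     # second axis sampled discontiguously
    random_state=0,
)
clf.fit(X, y)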
11 changes: 8 additions & 3 deletions sktree/tree/_utils.pxd
@@ -1,6 +1,7 @@
import numpy as np

cimport numpy as cnp
from libcpp.vector cimport vector

cnp.import_array()

@@ -15,8 +16,12 @@ cpdef unravel_index(
intp_t index, cnp.ndarray[intp_t, ndim=1] shape
)

cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape)
cpdef ravel_multi_index(vector[intp_t] coords, const intp_t[:] shape)

cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil
cdef vector[intp_t] unravel_index_cython(intp_t index, const intp_t[:] shape) noexcept nogil

cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil
cdef intp_t ravel_multi_index_cython(vector[intp_t] coords, const intp_t[:] shape) noexcept nogil

cdef vector[vector[intp_t]] cartesian_cython(vector[vector[intp_t]] sequences) noexcept nogil

cpdef cartesian_python(vector[vector[intp_t]]& sequences)
31 changes: 26 additions & 5 deletions sktree/tree/_utils.pyx
@@ -10,6 +10,7 @@ import numpy as np
cimport numpy as cnp

cnp.import_array()
from libcpp.vector cimport vector

from .._lib.sklearn.tree._utils cimport rand_uniform

@@ -53,12 +54,11 @@ cpdef unravel_index(
"""
index = np.intp(index)
shape = np.array(shape)
coords = np.empty(shape.shape[0], dtype=np.intp)
unravel_index_cython(index, shape, coords)
coords = unravel_index_cython(index, shape)
return coords


cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape):
cpdef ravel_multi_index(vector[intp_t] coords, const intp_t[:] shape):
"""Converts a tuple of coordinate arrays into a flat index.

Purely used for testing purposes.
@@ -83,7 +83,7 @@ cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape):
return ravel_multi_index_cython(coords, shape)


cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil:
cdef vector[intp_t] unravel_index_cython(intp_t index, const intp_t[:] shape) noexcept nogil:
"""Converts a flat index into a tuple of coordinate arrays.

Parameters
@@ -102,14 +102,17 @@ cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] co
"""
cdef intp_t ndim = shape.shape[0]
cdef intp_t j, size
cdef vector[intp_t] coords = vector[intp_t](ndim)

for j in range(ndim - 1, -1, -1):
size = shape[j]
coords[j] = index % size
index //= size

return coords


cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil:
cdef intp_t ravel_multi_index_cython(vector[intp_t] coords, const intp_t[:] shape) noexcept nogil:
"""Converts a tuple of coordinate arrays into a flat index.

Parameters
@@ -145,3 +148,21 @@ cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) no
flat_index *= shape[i + 1]

return flat_index


cdef vector[vector[intp_t]] cartesian_cython(vector[vector[intp_t]] sequences) noexcept nogil:
cdef vector[vector[intp_t]] results = vector[vector[intp_t]](1)
cdef vector[vector[intp_t]] next_results
for new_values in sequences:
for result in results:
for value in new_values:
result_copy = result
result_copy.push_back(value)
next_results.push_back(result_copy)
results = next_results
next_results.clear()
return results


cpdef cartesian_python(vector[vector[intp_t]]& sequences):
return cartesian_cython(sequences)
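
The helpers above are intended to mirror well-known NumPy and itertools semantics for a single flat index. A short reference sketch of those semantics, assuming valid in-bounds inputs:

# Reference semantics for the helpers in sktree/tree/_utils.pyx, expressed with
# NumPy and itertools. unravel_index / ravel_multi_index mirror their NumPy
# counterparts for one flat index; cartesian_python mirrors itertools.product.
import numpy as np
from itertools import product

shape = (2, 3, 4)

# flat index -> per-dimension coordinates (what unravel_index_cython computes)
coords = np.unravel_index(17, shape)        # (1, 1, 1)

# coordinates -> flat index (what ravel_multi_index_cython computes)
flat = np.ravel_multi_index(coords, shape)  # 17

# cartesian product of per-dimension index lists (what cartesian_cython computes)
points = list(product([0, 1], [2, 4, 5]))   # [(0, 2), (0, 4), (0, 5), (1, 2), ...]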
147 changes: 57 additions & 90 deletions sktree/tree/manifold/_morf_splitter.pyx
@@ -16,7 +16,7 @@ from libcpp.vector cimport vector

from ..._lib.sklearn.tree._criterion cimport Criterion
from ..._lib.sklearn.tree._utils cimport rand_int
from .._utils cimport ravel_multi_index_cython, unravel_index_cython
from .._utils cimport cartesian_cython, ravel_multi_index_cython, unravel_index_cython


cdef class PatchSplitter(BestObliqueSplitter):
@@ -156,17 +156,12 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):

# create a buffer for storing the patch dimensions sampled per projection matrix
self.patch_dims_buff = np.zeros(data_dims.shape[0], dtype=np.intp)
self.unraveled_patch_point = np.zeros(data_dims.shape[0], dtype=np.intp)

# store the min and max patch dimension constraints
self.min_patch_dims = min_patch_dims
self.max_patch_dims = max_patch_dims
self.dim_contiguous = dim_contiguous

# initialize a buffer to allow for Fisher-Yates
self._index_patch_buffer = np.zeros(np.max(self.max_patch_dims), dtype=np.intp)
self._index_data_buffer = np.zeros(np.max(self.data_dims), dtype=np.intp)

# whether or not to perform discontiguous sampling
if not all(self.dim_contiguous):
self._discontiguous = True
@@ -211,6 +206,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter):
# compute top-left seed for the multi-dimensional patch
cdef intp_t top_left_patch_seed
cdef intp_t patch_size = 1
cdef vector[intp_t] unraveled_patch_point = vector[intp_t](self.ndim)

cdef UINT32_t* random_state = &self.rand_r_state

@@ -232,7 +228,6 @@
self.max_patch_dims[idx] + 1,
random_state
)

# sample the top-left index and patch size for this dimension based on boundary effects
if self.boundary is None:
# compute the difference between the image dimensions and the current
@@ -262,10 +257,10 @@
# Convert the top-left-seed value to its appropriate index in the full image.
top_left_patch_seed = max(0, dim - patch_dim + 1)

self.unraveled_patch_point[idx] = top_left_patch_seed
unraveled_patch_point[idx] = top_left_patch_seed

top_left_patch_seed = ravel_multi_index_cython(
self.unraveled_patch_point,
unraveled_patch_point,
self.data_dims
)
return top_left_patch_seed, patch_size
@@ -315,93 +310,65 @@
intp_t top_left_patch_seed,
const intp_t[:] patch_dims,
) noexcept nogil:
cdef UINT32_t* random_state = &self.rand_r_state
# iterates over the size of the patch
cdef intp_t patch_idx
"""
Sample the projection vector for a single patch.

# stores how many patches we have iterated so far
cdef intp_t vectorized_patch_offset
cdef intp_t vectorized_point_offset
cdef intp_t vectorized_point
Parameters
----------
proj_mat_weights : vector[vector[float32_t]]
The weights of the projection matrix.
proj_mat_indices : vector[vector[intp_t]]
The indices of the projection matrix.
proj_i : intp_t
The index of the projection vector (feature) to populate.
patch_size : intp_t
The total size of the patch.
top_left_patch_seed : intp_t
The raveled (flat) index of the top-left seed of the patch.
patch_dims : array-like, shape (n_dims,)
The dimensions of the patch.
"""
# initialize a buffer to allow for Fisher-Yates
cdef vector[intp_t] _index_data_buffer

cdef intp_t dim_idx
cdef UINT32_t* random_state = &self.rand_r_state
cdef intp_t num_rows
cdef int ndim = self.ndim
cdef vector[vector[intp_t]] points = vector[vector[intp_t]](ndim)
cdef intp_t patch_dim
cdef intp_t idx
cdef intp_t i

# weights default to 1
cdef float32_t weight = 1.

# XXX: still unsure if it works yet
# XXX: THIS ONLY WORKS FOR THE FIRST DIMENSION THAT IS DISCONTIGUOUS.
cdef intp_t other_dims_offset
cdef intp_t row_index

cdef intp_t i
cdef intp_t num_rows = self.data_dims[0]
if self._discontiguous:
# fill with values 0, 1, ..., dimension - 1
for i in range(0, self.data_dims[0]):
self._index_data_buffer[i] = i
# then shuffle indices using Fisher-Yates
for i in range(num_rows):
j = rand_int(0, num_rows - i, random_state)
self._index_data_buffer[i], self._index_data_buffer[j] = \
self._index_data_buffer[j], self._index_data_buffer[i]
# now select the first `patch_dims[0]` indices
for i in range(num_rows):
self._index_patch_buffer[i] = self._index_data_buffer[i]

for patch_idx in range(patch_size):
# keep track of which dimensions of the patch we have iterated over
vectorized_patch_offset = 1

# Once the vectorized top-left-seed is unraveled, you can add the patch
# points in the array structure and compute their vectorized (unraveled)
# points, which are added to the projection vector
unravel_index_cython(top_left_patch_seed, self.data_dims, self.unraveled_patch_point)

for dim_idx in range(self.ndim):
# compute the offset from the top-left patch seed based on:
# 1. the current patch index
# 2. the patch dimension indexed by `dim_idx`
# 3. and the vectorized patch dimensions that we have seen so far
# the `vectorized_point_offset` is the offset from the top-left vectorized seed for this dimension
vectorized_point_offset = (patch_idx // (vectorized_patch_offset)) % patch_dims[dim_idx]

# then we compute the actual point in the original data shape
self.unraveled_patch_point[dim_idx] = self.unraveled_patch_point[dim_idx] + vectorized_point_offset
vectorized_patch_offset *= patch_dims[dim_idx]

# if any dimensions are discontiguous, we want to migrate the entire axis a fixed amount
# based on the shuffling
if self._discontiguous is True:
for dim_idx in range(self.ndim):
if self.dim_contiguous[dim_idx] is True:
continue

# determine the "row" we are currently on
# other_dims_offset = 1
# for idx in range(dim_idx + 1, self.ndim):
# other_dims_offset *= self.data_dims[idx]
# row_index = self.unraveled_patch_point[dim_idx] % other_dims_offset
# determine the "row" we are currently on
other_dims_offset = 1
for idx in range(dim_idx + 1, self.ndim):
if not self.dim_contiguous[idx]:
other_dims_offset *= self.data_dims[idx]

row_index = 0
for idx in range(dim_idx + 1, self.ndim):
if not self.dim_contiguous[idx]:
row_index += (
(self.unraveled_patch_point[idx] // other_dims_offset) %
self.patch_dims_buff[idx]
) * other_dims_offset
other_dims_offset //= self.data_dims[idx]

# assign random row index now
self.unraveled_patch_point[dim_idx] = self._index_patch_buffer[row_index]

# ravel the patch point into the original data dimensions
vectorized_point = ravel_multi_index_cython(self.unraveled_patch_point, self.data_dims)
cdef vector[intp_t] unraveled_patch_point = vector[intp_t](self.ndim)
unraveled_patch_point = unravel_index_cython(top_left_patch_seed, self.data_dims)

for dim_idx in range(ndim):
if self.dim_contiguous[dim_idx]:
patch_dim = patch_dims[dim_idx]
idx = patch_dim + unraveled_patch_point[dim_idx]
for i in range(unraveled_patch_point[dim_idx], idx):
points[dim_idx].push_back(i)
else:
num_rows = self.data_dims[dim_idx]
for i in range(0, num_rows):
_index_data_buffer.push_back(i)
# then shuffle indices using Fisher-Yates
for i in range(num_rows):
j = rand_int(0, num_rows - i, random_state)
_index_data_buffer[i], _index_data_buffer[j] = \
_index_data_buffer[j], _index_data_buffer[i]

for i in range(0, patch_dims[dim_idx]): # populate
points[dim_idx].push_back(_index_data_buffer[i])
_index_data_buffer.clear()

# make cartesian product of the points, ravel, then add to proj_mat_indices
cdef vector[vector[intp_t]] products = cartesian_cython(points)
for point in products:
vectorized_point = ravel_multi_index_cython(point, self.data_dims)
proj_mat_indices[proj_i].push_back(vectorized_point)
proj_mat_weights[proj_i].push_back(weight)

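
In plain terms, the rewritten sampling builds per-dimension index lists, taking a consecutive run for contiguous dimensions and a Fisher-Yates-shuffled subset of the whole axis for discontiguous ones, and then ravels their cartesian product into flat feature indices. A rough pure-Python sketch of that flow follows; the helper name and the NumPy round trip are illustrative assumptions, not the actual splitter code.

# Plain-Python sketch of the discontiguous patch sampling shown above.
# Illustrative only; mirrors the Cython logic rather than reproducing it.
import numpy as np
from itertools import product

def sample_patch_indices(top_left_seed, patch_dims, data_dims, dim_contiguous, rng):
    """Return the flat indices of one sampled patch."""
    start = np.unravel_index(top_left_seed, data_dims)
    per_dim = []
    for d, (p, n, contiguous) in enumerate(zip(patch_dims, data_dims, dim_contiguous)):
        if contiguous:
            # consecutive run of length patch_dims[d] starting at the seed
            per_dim.append(range(start[d], start[d] + p))
        else:
            # shuffled subset of the whole axis (Fisher-Yates in the Cython code)
            per_dim.append(rng.permutation(n)[:p])
    # cartesian product of per-dimension indices, raveled to flat indices
    return [np.ravel_multi_index(point, data_dims) for point in product(*per_dim)]

# Example: a (2, 2) patch over (4, 5) data where the second axis is discontiguous.
rng = np.random.default_rng(0)
idx = sample_patch_indices(top_left_seed=6, patch_dims=(2, 2),
                           data_dims=(4, 5), dim_contiguous=(True, False), rng=rng)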
13 changes: 11 additions & 2 deletions sktree/tree/tests/test_utils.py
@@ -7,7 +7,7 @@
from sktree._lib.sklearn.tree._criterion import Gini
from sktree._lib.sklearn.tree._utils import _any_isnan_axis0

from .._utils import ravel_multi_index, unravel_index
from .._utils import cartesian_python, ravel_multi_index, unravel_index
from ..manifold._morf_splitter import BestPatchSplitterTester


@@ -142,7 +142,7 @@ def test_unravel_index():
shape = np.asarray((5,))
expected_output = [(0,), (1,), (2,), (3,), (4,)]
for idx, index in enumerate(indices):
assert unravel_index(index, shape) == expected_output[idx]
assert_equal(unravel_index(index, shape), expected_output[idx])

# Test with 2D array
indices = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8])
@@ -202,3 +202,12 @@ def test_ravel_multi_index():
# assert str(e) == "Invalid index"
# else:
# assert False, "Expected ValueError"


def test_cartesian_prod():
sequences = [[1, 2], [3, 4, 5]]

from_itertools = list(product(*sequences))
from_cython = cartesian_python(sequences)

assert_equal(from_itertools, from_cython)
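
A possible follow-up test, not part of this PR: a round-trip property check that ravel_multi_index(unravel_index(i, shape), shape) == i for every flat index of a small shape.

# Suggested follow-up (not in this PR): round-trip check for the two helpers.
import numpy as np

from sktree.tree._utils import ravel_multi_index, unravel_index


def test_ravel_unravel_roundtrip():
    shape = np.asarray((3, 4, 5), dtype=np.intp)
    for flat in range(int(np.prod(shape))):
        coords = unravel_index(flat, shape)
        assert ravel_multi_index(coords, shape) == flat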