Merge pull request #200 from hyun-seo/master
Adding GroupKFold and testing it
mandjevant committed Oct 20, 2023
2 parents 4b3f6fd + dee6e68 commit 840cfc9
Showing 3 changed files with 68 additions and 7 deletions.
22 changes: 17 additions & 5 deletions hpsklearn/estimator/_cost_fn.py
@@ -9,6 +9,7 @@
     LeaveOneOut, \
     StratifiedKFold, \
     KFold, \
+    GroupKFold, \
     PredefinedSplit
 from sklearn.metrics import accuracy_score, r2_score
 
Expand All @@ -24,6 +25,7 @@ def _cost_fn(argd,
EX_list: typing.Union[list, tuple] = None,
valid_size: float = 0.2,
n_folds: int = None,
kfolds_group: typing.Union[list, np.ndarray] = None,
shuffle: bool = False,
random_state: typing.Union[int, np.random.Generator] = np.random.default_rng(),
use_partial_fit: bool = False,
@@ -55,7 +57,13 @@ def _cost_fn(argd,
     n_folds: int, default is None
         When n_folds is not None, use K-fold cross-validation when
         n_folds > 2. Or, use leave-one-out cross-validation when
-        n_folds = -1.
+        n_folds = -1. For Group K-fold cross-validation, functions as
+        `n_splits`.
+
+    kfolds_group: list or ndarray, default is None
+        When kfolds_group is not None, use Group K-fold cross-validation
+        with the specified groups. The length of kfolds_group must be
+        equal to the number of samples in X.
 
     shuffle: bool, default is False
         Whether to perform sample shuffling before splitting the
@@ -145,10 +153,14 @@ def _cost_fn(argd,
                                           random_state=random_state_sklearn
                                           ).split(X, y)
             else:
-                info(f"Will use K-fold CV with K: {n_folds} and Shuffle: {shuffle}")
-                cv_iter = KFold(n_splits=n_folds,
-                                shuffle=shuffle,
-                                random_state=random_state_sklearn).split(X)
+                if kfolds_group is not None:
+                    info(f"Will use Group K-fold CV with K: {n_folds} and Shuffle: {shuffle}")
+                    cv_iter = GroupKFold(n_splits=n_folds).split(X, y, kfolds_group)
+                else:
+                    info(f"Will use K-fold CV with K: {n_folds} and Shuffle: {shuffle}")
+                    cv_iter = KFold(n_splits=n_folds,
+                                    shuffle=shuffle,
+                                    random_state=random_state_sklearn).split(X)
         else:
             if not shuffle:  # always choose the last samples.
                 info(f"Will use the last {valid_size} portion of samples for validation")
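
As an aside (not part of the commit): the new branch delegates grouping entirely to scikit-learn. GroupKFold guarantees that all samples sharing a group label land in the same fold, so no group is split between training and validation; it also takes no shuffle or random_state arguments in the scikit-learn versions targeted here, which is why the branch constructs it with n_splits alone. A minimal sketch of that behavior, using made-up toy arrays:

    import numpy as np
    from sklearn.model_selection import GroupKFold

    # Illustrative data: 8 samples in 4 groups of 2.
    X = np.arange(16).reshape(8, 2)
    y = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    groups = np.array([0, 0, 1, 1, 2, 2, 3, 3])

    # Mirrors the new call: GroupKFold(n_splits=n_folds).split(X, y, kfolds_group)
    for train_idx, valid_idx in GroupKFold(n_splits=4).split(X, y, groups):
        # Each group falls entirely into either the train or the validation split.
        assert not set(groups[train_idx]) & set(groups[valid_idx])
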
20 changes: 18 additions & 2 deletions hpsklearn/estimator/estimator.py
@@ -215,6 +215,7 @@ def fit_iter(self, X, y,
                  EX_list: typing.Union[list, tuple] = None,
                  valid_size: float = .2,
                  n_folds: int = None,
+                 kfolds_group: typing.Union[list, np.ndarray] = None,
                  cv_shuffle: bool = False,
                  warm_start: bool = False,
                  random_state: np.random.Generator = np.random.default_rng(),
@@ -240,7 +241,13 @@ def fit_iter(self, X, y,
         n_folds: int, default is None
             When n_folds is not None, use K-fold cross-validation when
             n_folds > 2. Or, use leave-one-out cross-validation when
-            n_folds = -1.
+            n_folds = -1. For Group K-fold cross-validation, functions as
+            `n_splits`.
+
+        kfolds_group: list or ndarray, default is None
+            When kfolds_group is not None, use Group K-fold cross-validation
+            with the specified groups. The length of kfolds_group must be
+            equal to the number of samples in X.
 
         cv_shuffle: bool, default is False
             Whether to perform sample shuffling before splitting the
@@ -277,6 +284,7 @@ def fit_iter(self, X, y,
                     EX_list=EX_list,
                     valid_size=valid_size,
                     n_folds=n_folds,
+                    kfolds_group=kfolds_group,
                     shuffle=cv_shuffle,
                     random_state=random_state,
                     use_partial_fit=self.use_partial_fit,
@@ -398,6 +406,7 @@ def fit(self, X, y,
             EX_list: typing.Union[list, tuple] = None,
             valid_size: float = .2,
             n_folds: int = None,
+            kfolds_group: typing.Union[list, np.ndarray] = None,
             cv_shuffle: bool = False,
             warm_start: bool = False,
             random_state: np.random.Generator = np.random.default_rng()
@@ -424,7 +433,13 @@ def fit(self, X, y,
         n_folds: int, default is None
             When n_folds is not None, use K-fold cross-validation when
             n_folds > 2. Or, use leave-one-out cross-validation when
-            n_folds = -1.
+            n_folds = -1. For Group K-fold cross-validation, functions as
+            `n_splits`.
+
+        kfolds_group: list or ndarray, default is None
+            When kfolds_group is not None, use Group K-fold cross-validation
+            with the specified groups. The length of kfolds_group must be
+            equal to the number of samples in X.
 
         cv_shuffle: bool, default is False
             Whether to perform sample shuffling before splitting the
@@ -450,6 +465,7 @@ def fit(self, X, y,
                       EX_list=EX_list,
                       valid_size=valid_size,
                       n_folds=n_folds,
+                      kfolds_group=kfolds_group,
                       cv_shuffle=cv_shuffle,
                       warm_start=warm_start,
                       random_state=random_state)
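
From the user's side, the new kfolds_group argument simply threads from fit() through fit_iter() into _cost_fn(). A hedged usage sketch mirroring the test added below; the data, group labels, and top-level import path are illustrative assumptions, not part of the commit:

    import numpy as np
    from hpsklearn import HyperoptEstimator, sgd_classifier  # assumed top-level exports

    X = np.random.normal(0, 1, size=(2000, 20))
    y = (np.arange(2000) < 1000).astype(float)

    # One group label per sample; len(kfolds_group) must equal len(X).
    groups = np.array([0] * 500 + [1] * 500 + [2] * 500 + [3] * 500)

    est = HyperoptEstimator(classifier=sgd_classifier("sgd"), preprocessing=[])
    # With kfolds_group set, n_folds acts as GroupKFold's n_splits.
    est.fit(X, y, n_folds=4, kfolds_group=groups)
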
33 changes: 33 additions & 0 deletions tests/test_estimator/test_estimator.py
@@ -246,5 +246,38 @@ def test_crossvalidation(self):
         cls.fit(X, y, cv_shuffle=True, n_folds=5)
 
 
+class TestGroupCrossValidation(unittest.TestCase):
+    """
+    Class for testing estimator with group cross validation
+    """
+
+    def setUp(self):
+        """
+        Setup the random seed
+        """
+        np.random.seed(123)
+
+    @RetryOnTrialsException
+    def test_crossvalidation(self):
+        """
+        Demonstrate performing a group k-fold CV using the fit() method.
+        """
+        # Generate some random data
+        X = np.hstack([
+            np.vstack([
+                np.random.normal(0, 1, size=(1000, 10)),
+                np.random.normal(1, 1, size=(1000, 10)),
+            ]),
+            np.random.normal(0, 1, size=(2000, 10)),
+        ])
+        y = np.zeros(2000)
+        y[:1000] = 1
+
+        # Try to fit a model
+        cls = HyperoptEstimator(classifier=sgd_classifier("sgd", loss="log"), preprocessing=[])
+        cls.fit(X, y, cv_shuffle=True, n_folds=5,
+                kfolds_group=np.array([0]*500 + [1]*500 + [2]*500 + [3]*500))  # noqa: E226
+
+
 if __name__ == '__main__':
     unittest.main()
