Merge pull request #200 from hyun-seo/master
Adding GroupKFold and testing it
mandjevant committed Oct 20, 2023
2 parents 4b3f6fd + dee6e68 commit 840cfc9
Showing 3 changed files with 68 additions and 7 deletions.
22 changes: 17 additions & 5 deletions hpsklearn/estimator/_cost_fn.py
@@ -9,6 +9,7 @@
     LeaveOneOut, \
     StratifiedKFold, \
     KFold, \
+    GroupKFold, \
     PredefinedSplit
 from sklearn.metrics import accuracy_score, r2_score
 
Expand All @@ -24,6 +25,7 @@ def _cost_fn(argd,
EX_list: typing.Union[list, tuple] = None,
valid_size: float = 0.2,
n_folds: int = None,
kfolds_group: typing.Union[list, np.ndarray] = None,
shuffle: bool = False,
random_state: typing.Union[int, np.random.Generator] = np.random.default_rng(),
use_partial_fit: bool = False,
@@ -55,7 +57,13 @@ def _cost_fn(argd,
     n_folds: int, default is None
         When n_folds is not None, use K-fold cross-validation when
         n_folds > 2. Or, use leave-one-out cross-validation when
-        n_folds = -1.
+        n_folds = -1. For Group K-fold cross-validation, functions as
+        `n_splits`.
+
+    kfolds_group: list or ndarray, default is None
+        When kfolds_group is not None, use Group K-fold cross-validation
+        with the specified groups. The length of kfolds_group must be
+        equal to the number of samples in X.
 
     shuffle: bool, default is False
         Whether to perform sample shuffling before splitting the
@@ -145,10 +153,14 @@ def _cost_fn(argd,
                                           random_state=random_state_sklearn
                                           ).split(X, y)
             else:
-                info(f"Will use K-fold CV with K: {n_folds} and Shuffle: {shuffle}")
-                cv_iter = KFold(n_splits=n_folds,
-                                shuffle=shuffle,
-                                random_state=random_state_sklearn).split(X)
+                if kfolds_group is not None:
+                    info(f"Will use Group K-fold CV with K: {n_folds} and Shuffle: {shuffle}")
+                    cv_iter = GroupKFold(n_splits=n_folds).split(X, y, kfolds_group)
+                else:
+                    info(f"Will use K-fold CV with K: {n_folds} and Shuffle: {shuffle}")
+                    cv_iter = KFold(n_splits=n_folds,
+                                    shuffle=shuffle,
+                                    random_state=random_state_sklearn).split(X)
         else:
             if not shuffle:  # always choose the last samples.
                 info(f"Will use the last {valid_size} portion of samples for validation")
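
As an aside (not part of the commit): the new branch delegates grouping entirely to scikit-learn. GroupKFold guarantees that all samples sharing a group label land in the same fold, so no group is split between training and validation; it also takes no shuffle or random_state arguments in the scikit-learn versions targeted here, which is why the branch constructs it with n_splits alone. A minimal sketch of that behavior, using made-up toy arrays:

    import numpy as np
    from sklearn.model_selection import GroupKFold

    # Illustrative data: 8 samples in 4 groups of 2.
    X = np.arange(16).reshape(8, 2)
    y = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    groups = np.array([0, 0, 1, 1, 2, 2, 3, 3])

    # Mirrors the new call: GroupKFold(n_splits=n_folds).split(X, y, kfolds_group)
    for train_idx, valid_idx in GroupKFold(n_splits=4).split(X, y, groups):
        # Each group falls entirely into either the train or the validation split.
        assert not set(groups[train_idx]) & set(groups[valid_idx])
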
20 changes: 18 additions & 2 deletions hpsklearn/estimator/estimator.py
@@ -215,6 +215,7 @@ def fit_iter(self, X, y,
                  EX_list: typing.Union[list, tuple] = None,
                  valid_size: float = .2,
                  n_folds: int = None,
+                 kfolds_group: typing.Union[list, np.ndarray] = None,
                  cv_shuffle: bool = False,
                  warm_start: bool = False,
                  random_state: np.random.Generator = np.random.default_rng(),
@@ -240,7 +241,13 @@ def fit_iter(self, X, y,
         n_folds: int, default is None
             When n_folds is not None, use K-fold cross-validation when
             n_folds > 2. Or, use leave-one-out cross-validation when
-            n_folds = -1.
+            n_folds = -1. For Group K-fold cross-validation, functions as
+            `n_splits`.
+
+        kfolds_group: list or ndarray, default is None
+            When kfolds_group is not None, use Group K-fold cross-validation
+            with the specified groups. The length of kfolds_group must be
+            equal to the number of samples in X.
 
         cv_shuffle: bool, default is False
             Whether to perform sample shuffling before splitting the
@@ -277,6 +284,7 @@ def fit_iter(self, X, y,
                     EX_list=EX_list,
                     valid_size=valid_size,
                     n_folds=n_folds,
+                    kfolds_group=kfolds_group,
                     shuffle=cv_shuffle,
                     random_state=random_state,
                     use_partial_fit=self.use_partial_fit,
@@ -398,6 +406,7 @@ def fit(self, X, y,
             EX_list: typing.Union[list, tuple] = None,
             valid_size: float = .2,
             n_folds: int = None,
+            kfolds_group: typing.Union[list, np.ndarray] = None,
             cv_shuffle: bool = False,
             warm_start: bool = False,
             random_state: np.random.Generator = np.random.default_rng()
@@ -424,7 +433,13 @@ def fit(self, X, y,
         n_folds: int, default is None
             When n_folds is not None, use K-fold cross-validation when
             n_folds > 2. Or, use leave-one-out cross-validation when
-            n_folds = -1.
+            n_folds = -1. For Group K-fold cross-validation, functions as
+            `n_splits`.
+
+        kfolds_group: list or ndarray, default is None
+            When kfolds_group is not None, use Group K-fold cross-validation
+            with the specified groups. The length of kfolds_group must be
+            equal to the number of samples in X.
 
         cv_shuffle: bool, default is False
             Whether to perform sample shuffling before splitting the
@@ -450,6 +465,7 @@ def fit(self, X, y,
                       EX_list=EX_list,
                       valid_size=valid_size,
                       n_folds=n_folds,
+                      kfolds_group=kfolds_group,
                       cv_shuffle=cv_shuffle,
                       warm_start=warm_start,
                       random_state=random_state)
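
From the user's side, the new kfolds_group argument simply threads from fit() through fit_iter() into _cost_fn(). A hedged usage sketch mirroring the test added below; the data, group labels, and top-level import path are illustrative assumptions, not part of the commit:

    import numpy as np
    from hpsklearn import HyperoptEstimator, sgd_classifier  # assumed top-level exports

    X = np.random.normal(0, 1, size=(2000, 20))
    y = (np.arange(2000) < 1000).astype(float)

    # One group label per sample; len(kfolds_group) must equal len(X).
    groups = np.array([0] * 500 + [1] * 500 + [2] * 500 + [3] * 500)

    est = HyperoptEstimator(classifier=sgd_classifier("sgd"), preprocessing=[])
    # With kfolds_group set, n_folds acts as GroupKFold's n_splits.
    est.fit(X, y, n_folds=4, kfolds_group=groups)
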
33 changes: 33 additions & 0 deletions tests/test_estimator/test_estimator.py
@@ -246,5 +246,38 @@ def test_crossvalidation(self):
         cls.fit(X, y, cv_shuffle=True, n_folds=5)
 
 
+class TestGroupCrossValidation(unittest.TestCase):
+    """
+    Class for testing estimator with group cross validation
+    """
+
+    def setUp(self):
+        """
+        Setup the random seed
+        """
+        np.random.seed(123)
+
+    @RetryOnTrialsException
+    def test_crossvalidation(self):
+        """
+        Demonstrate performing a group k-fold CV using the fit() method.
+        """
+        # Generate some random data
+        X = np.hstack([
+            np.vstack([
+                np.random.normal(0, 1, size=(1000, 10)),
+                np.random.normal(1, 1, size=(1000, 10)),
+            ]),
+            np.random.normal(0, 1, size=(2000, 10)),
+        ])
+        y = np.zeros(2000)
+        y[:1000] = 1
+
+        # Try to fit a model
+        cls = HyperoptEstimator(classifier=sgd_classifier("sgd", loss="log"), preprocessing=[])
+        cls.fit(X, y, cv_shuffle=True, n_folds=5,
+                kfolds_group=np.array([0]*500 + [1]*500 + [2]*500 + [3]*500))  # noqa: E226
+
+
 if __name__ == '__main__':
     unittest.main()
