Replicating SelectKBest + GridSearchCV results #19674
-
I would like to be able to reproduce import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import itertools
r = 1
X, y = make_classification(n_samples = 50, n_features = 20, weights = [3/5], random_state = r)
np.random.seed(r)
X = np.random.rand(X.shape[0], X.shape[1])
K = [1,3,5]
C = [0.1,1]
cv = StratifiedKFold(n_splits = 10)
# SKLEARN GRID-SEARCH
space = dict()
space['anova__k'] = K
space['svc__C'] = C
clf = Pipeline([('anova', SelectKBest()), ('svc', SVC(probability = True, random_state = r))])
search = GridSearchCV(clf, space, scoring = 'roc_auc', cv = cv, refit = True, n_jobs = -1)
result = search.fit(X, y)
print('GridSearchCV results:')
print(result.cv_results_['mean_test_score'])
# MANUAL GRID-SEARCH
scores = []
for train_indx, test_indx in cv.split(X, y):
X_train, y_train = X[train_indx,:], y[train_indx]
X_test, y_test = X[test_indx,:], y[test_indx]
scores_ = []
for k, c in itertools.product(K, C):
anova = SelectKBest(k = k)
X_train_k = anova.fit_transform(X_train, y_train)
clf = SVC(C = c, probability = True, random_state = r).fit(X_train_k, y_train)
y_pred = clf.predict_proba(anova.transform(X_test))[:, 1]
scores_.append(roc_auc_score(y_test, y_pred))
scores.append(scores_)
print('Manual grid-search CV results:')
print(np.mean(np.array(scores), axis = 0)) For me, this produces the following output:
On the other hand, when commenting the Is there something trivial I'm missing? Over the course of 8 days, I have posted this very same question on:
and I have received no answer whatsoever. Is the question unclear? |
Beta Was this translation helpful? Give feedback.
Replies: 1 comment 4 replies
-
The difference between the 2 snippets is in how Change this: # y_pred = clf.predict_proba(anova.transform(X_test))[:, 1]
y_pred = clf.decision_function(anova.transform(X_test)) and you'll get the same results. For a better code: also remove |
Beta Was this translation helpful? Give feedback.
The difference between the 2 snippets is in how
auc
is computed. I haven't doubled checked but I would bet thatdecision_function
is used inGridSearch
, while you're using calibrated probabilities (probabilities=True
) in the for loop.Change this:
and you'll get the same results.
For a better code: also remove
probabilities=True
everywhere (and also remove passing random_state to the estimators): you don't need the probabilities to compute the AUC, the output of the decision_function is enough.