Today we are going to use all the skills we have learned to tackle a real problem in industry. The problem is churn prediction for a ride-sharing company in San Francisco.
# Load the raw churn data plus the pre-split train/test CSVs; the bare
# expressions display each DataFrame in the notebook.
# NOTE(review): pd (pandas) is not imported in the visible code -- an
# earlier cell presumably does `import pandas as pd`; confirm.
df = pd.read_csv('data/churn.csv')
df
df_train = pd.read_csv('data/churn_train.csv')
df_train
df_test = pd.read_csv('data/churn_test.csv')
df_test
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
avg_dist | avg_rating_by_driver | avg_rating_of_driver | avg_surge | city | last_trip_date | phone | signup_date | surge_pct | trips_in_first_30_days | luxury_car_user | weekday_pct | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3.67 | 5.0 | 4.7 | 1.10 | King's Landing | 2014-06-17 | iPhone | 2014-01-25 | 15.4 | 4 | True | 46.2 |
1 | 8.26 | 5.0 | 5.0 | 1.00 | Astapor | 2014-05-05 | Android | 2014-01-29 | 0.0 | 0 | False | 50.0 |
2 | 0.77 | 5.0 | 4.3 | 1.00 | Astapor | 2014-01-07 | iPhone | 2014-01-06 | 0.0 | 3 | False | 100.0 |
3 | 2.36 | 4.9 | 4.6 | 1.14 | King's Landing | 2014-06-29 | iPhone | 2014-01-10 | 20.0 | 9 | True | 80.0 |
4 | 3.13 | 4.9 | 4.4 | 1.19 | Winterfell | 2014-03-15 | Android | 2014-01-27 | 11.8 | 14 | False | 82.4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 5.63 | 4.2 | 5.0 | 1.00 | King's Landing | 2014-06-05 | iPhone | 2014-01-25 | 0.0 | 0 | False | 100.0 |
49996 | 0.00 | 4.0 | NaN | 1.00 | Astapor | 2014-01-25 | iPhone | 2014-01-24 | 0.0 | 1 | False | 0.0 |
49997 | 3.86 | 5.0 | 5.0 | 1.00 | Winterfell | 2014-05-22 | Android | 2014-01-31 | 0.0 | 0 | True | 100.0 |
49998 | 4.58 | 3.5 | 3.0 | 1.00 | Astapor | 2014-01-15 | iPhone | 2014-01-14 | 0.0 | 2 | False | 100.0 |
49999 | 3.49 | 5.0 | NaN | 1.00 | Astapor | 2014-04-20 | Android | 2014-01-18 | 0.0 | 0 | False | 0.0 |
50000 rows × 12 columns
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
avg_dist | avg_rating_by_driver | avg_rating_of_driver | avg_surge | city | last_trip_date | phone | signup_date | surge_pct | trips_in_first_30_days | luxury_car_user | weekday_pct | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6.94 | 5.0 | 5.0 | 1.00 | Astapor | 2014-05-03 | Android | 2014-01-12 | 0.0 | 0 | False | 100.0 |
1 | 8.06 | 5.0 | 5.0 | 1.00 | Astapor | 2014-01-26 | Android | 2014-01-25 | 0.0 | 2 | True | 0.0 |
2 | 21.50 | 4.0 | NaN | 1.00 | Winterfell | 2014-05-21 | iPhone | 2014-01-02 | 0.0 | 1 | True | 100.0 |
3 | 9.46 | 5.0 | NaN | 2.75 | Winterfell | 2014-01-10 | Android | 2014-01-09 | 100.0 | 1 | False | 100.0 |
4 | 13.77 | 5.0 | NaN | 1.00 | Winterfell | 2014-05-13 | iPhone | 2014-01-31 | 0.0 | 0 | False | 100.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
39995 | 2.06 | 4.8 | 4.3 | 1.08 | Winterfell | 2014-04-02 | Android | 2014-01-26 | 9.5 | 8 | False | 90.5 |
39996 | 2.05 | 5.0 | 5.0 | 1.00 | King's Landing | 2014-05-09 | iPhone | 2014-01-08 | 0.0 | 2 | False | 85.7 |
39997 | 3.04 | 5.0 | 5.0 | 1.00 | Winterfell | 2014-06-24 | Android | 2014-01-04 | 0.0 | 3 | True | 33.3 |
39998 | 3.49 | 4.3 | 3.3 | 1.50 | Astapor | 2014-02-09 | iPhone | 2014-01-08 | 40.0 | 5 | False | 60.0 |
39999 | 4.25 | 4.7 | 5.0 | 1.00 | Astapor | 2014-06-27 | iPhone | 2014-01-18 | 0.0 | 2 | True | 42.9 |
40000 rows × 12 columns
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
avg_dist | avg_rating_by_driver | avg_rating_of_driver | avg_surge | city | last_trip_date | phone | signup_date | surge_pct | trips_in_first_30_days | luxury_car_user | weekday_pct | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.48 | 5.0 | 5.0 | 1.00 | Winterfell | 2014-01-07 | Android | 2014-01-06 | 0.0 | 2 | True | 100.0 |
1 | 10.81 | 5.0 | 5.0 | 1.00 | Winterfell | 2014-04-29 | iPhone | 2014-01-06 | 0.0 | 3 | True | 100.0 |
2 | 12.95 | 5.0 | 5.0 | 1.00 | Astapor | 2014-01-29 | Android | 2014-01-19 | 0.0 | 1 | True | 100.0 |
3 | 3.92 | 5.0 | NaN | 1.00 | Winterfell | 2014-02-16 | iPhone | 2014-01-09 | 0.0 | 0 | False | 0.0 |
4 | 1.46 | 5.0 | 4.5 | 1.00 | Astapor | 2014-01-09 | iPhone | 2014-01-07 | 0.0 | 2 | False | 100.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9995 | 2.11 | 5.0 | 5.0 | 1.00 | King's Landing | 2014-02-01 | iPhone | 2014-01-31 | 0.0 | 1 | False | 100.0 |
9996 | 5.49 | 4.9 | 4.1 | 1.00 | Winterfell | 2014-05-27 | iPhone | 2014-01-30 | 0.0 | 0 | True | 33.3 |
9997 | 2.49 | 4.2 | 4.6 | 1.25 | Winterfell | 2014-05-24 | Android | 2014-01-15 | 18.8 | 9 | True | 90.6 |
9998 | 1.05 | 4.0 | 5.0 | 1.00 | Astapor | 2014-01-23 | Android | 2014-01-22 | 0.0 | 1 | False | 100.0 |
9999 | 6.31 | 5.0 | 5.0 | 1.00 | Astapor | 2014-04-27 | iPhone | 2014-01-25 | 0.0 | 0 | True | 0.0 |
10000 rows × 12 columns
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
avg_dist | avg_rating_by_driver | avg_rating_of_driver | avg_surge | city | last_trip_date | phone | signup_date | surge_pct | trips_in_first_30_days | luxury_car_user | weekday_pct | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3.67 | 5.0 | 4.700000 | 1.10 | King's Landing | 2014-06-17 | False | 2014-01-25 | 15.4 | 4 | True | 46.2 | False |
1 | 8.26 | 5.0 | 5.000000 | 1.00 | Astapor | 2014-05-05 | True | 2014-01-29 | 0.0 | 0 | False | 50.0 | True |
2 | 0.77 | 5.0 | 4.300000 | 1.00 | Astapor | 2014-01-07 | False | 2014-01-06 | 0.0 | 3 | False | 100.0 | True |
3 | 2.36 | 4.9 | 4.600000 | 1.14 | King's Landing | 2014-06-29 | False | 2014-01-10 | 20.0 | 9 | True | 80.0 | False |
4 | 3.13 | 4.9 | 4.400000 | 1.19 | Winterfell | 2014-03-15 | True | 2014-01-27 | 11.8 | 14 | False | 82.4 | True |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 5.63 | 4.2 | 5.000000 | 1.00 | King's Landing | 2014-06-05 | False | 2014-01-25 | 0.0 | 0 | False | 100.0 | False |
49996 | 0.00 | 4.0 | 4.601559 | 1.00 | Astapor | 2014-01-25 | False | 2014-01-24 | 0.0 | 1 | False | 0.0 | True |
49997 | 3.86 | 5.0 | 5.000000 | 1.00 | Winterfell | 2014-05-22 | True | 2014-01-31 | 0.0 | 0 | True | 100.0 | True |
49998 | 4.58 | 3.5 | 3.000000 | 1.00 | Astapor | 2014-01-15 | False | 2014-01-14 | 0.0 | 2 | False | 100.0 | True |
49999 | 3.49 | 5.0 | 4.601559 | 1.00 | Astapor | 2014-04-20 | True | 2014-01-18 | 0.0 | 0 | False | 0.0 | True |
50000 rows × 13 columns
# Load the pre-cleaned churn data (row index stored in column 0), drop any
# remaining rows with missing values, and parse signup_date as a datetime.
df_clean = pd.read_csv('data/Churn_clean.csv', index_col=0)
df_clean = df_clean.dropna()
df_clean['signup_date'] = pd.to_datetime(df_clean['signup_date'])

# Predictor columns: every column except the Churn target and
# last_trip_date (presumably excluded because churn is derived from it --
# TODO confirm against the churn definition).
features = [
    'avg_dist', 'avg_rating_by_driver', 'avg_rating_of_driver', 'avg_surge',
    'city', 'phone', 'signup_date', 'surge_pct',
    'trips_in_first_30_days', 'luxury_car_user', 'weekday_pct',
]
X = df_clean[features]
y = df_clean['Churn']
X
y
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
avg_dist | avg_rating_by_driver | avg_rating_of_driver | avg_surge | city | phone | signup_date | surge_pct | trips_in_first_30_days | luxury_car_user | weekday_pct | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3.67 | 5.0 | 4.700000 | 1.10 | King's Landing | False | 2014-01-25 | 15.4 | 4 | True | 46.2 |
1 | 8.26 | 5.0 | 5.000000 | 1.00 | Astapor | True | 2014-01-29 | 0.0 | 0 | False | 50.0 |
2 | 0.77 | 5.0 | 4.300000 | 1.00 | Astapor | False | 2014-01-06 | 0.0 | 3 | False | 100.0 |
3 | 2.36 | 4.9 | 4.600000 | 1.14 | King's Landing | False | 2014-01-10 | 20.0 | 9 | True | 80.0 |
4 | 3.13 | 4.9 | 4.400000 | 1.19 | Winterfell | True | 2014-01-27 | 11.8 | 14 | False | 82.4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 5.63 | 4.2 | 5.000000 | 1.00 | King's Landing | False | 2014-01-25 | 0.0 | 0 | False | 100.0 |
49996 | 0.00 | 4.0 | 4.601559 | 1.00 | Astapor | False | 2014-01-24 | 0.0 | 1 | False | 0.0 |
49997 | 3.86 | 5.0 | 5.000000 | 1.00 | Winterfell | True | 2014-01-31 | 0.0 | 0 | True | 100.0 |
49998 | 4.58 | 3.5 | 3.000000 | 1.00 | Astapor | False | 2014-01-14 | 0.0 | 2 | False | 100.0 |
49999 | 3.49 | 5.0 | 4.601559 | 1.00 | Astapor | True | 2014-01-18 | 0.0 | 0 | False | 0.0 |
49604 rows × 11 columns
0 False
1 True
2 True
3 False
4 True
...
49995 False
49996 True
49997 True
49998 True
49999 True
Name: Churn, Length: 49604, dtype: bool
# Work on an explicit copy so we stop writing into a slice of df_clean
# (the original line raised SettingWithCopyWarning).
X = X.copy()
# NOTE(review): X.dtypes shows phone as object. If the values are the
# *strings* 'True'/'False', astype(bool) maps every non-empty string --
# including 'False' -- to True, silently destroying the column. Comparing
# the string form against 'True' preserves the intended encoding whether
# the cells hold strings or real booleans; confirm the raw CSV values.
X['phone'] = X['phone'].astype(str) == 'True'
/Users/jacobmullins/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
# One-hot encode city with the remaining city (Winterfell in the displayed
# data) as the implicit baseline. Work on an explicit copy to avoid the
# SettingWithCopyWarning the original chained assignment produced, and use
# vectorized comparisons instead of row-wise apply(lambda ...) -- same
# boolean results, no per-row Python overhead.
X = X.copy()
X['kings_landing'] = X['city'] == "King's Landing"
X['astapor'] = X['city'] == "Astapor"
X = X.drop(columns=['city'])
/Users/jacobmullins/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
# Reduce signup_date to its day-of-month via the vectorized .dt accessor
# (replaces the row-wise apply(lambda x: x.day)). The displayed signups all
# fall in January 2014, so the day alone captures the variation --
# TODO confirm that holds for the full dataset.
X = X.copy()  # avoid chained assignment into a slice of df_clean
X['signup_date'] = X['signup_date'].dt.day
X
X.dtypes
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
avg_dist | avg_rating_by_driver | avg_rating_of_driver | avg_surge | city | phone | signup_date | surge_pct | trips_in_first_30_days | luxury_car_user | weekday_pct | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3.67 | 5.0 | 4.700000 | 1.10 | King's Landing | False | 2014-01-25 | 15.4 | 4 | True | 46.2 |
1 | 8.26 | 5.0 | 5.000000 | 1.00 | Astapor | True | 2014-01-29 | 0.0 | 0 | False | 50.0 |
2 | 0.77 | 5.0 | 4.300000 | 1.00 | Astapor | False | 2014-01-06 | 0.0 | 3 | False | 100.0 |
3 | 2.36 | 4.9 | 4.600000 | 1.14 | King's Landing | False | 2014-01-10 | 20.0 | 9 | True | 80.0 |
4 | 3.13 | 4.9 | 4.400000 | 1.19 | Winterfell | True | 2014-01-27 | 11.8 | 14 | False | 82.4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 5.63 | 4.2 | 5.000000 | 1.00 | King's Landing | False | 2014-01-25 | 0.0 | 0 | False | 100.0 |
49996 | 0.00 | 4.0 | 4.601559 | 1.00 | Astapor | False | 2014-01-24 | 0.0 | 1 | False | 0.0 |
49997 | 3.86 | 5.0 | 5.000000 | 1.00 | Winterfell | True | 2014-01-31 | 0.0 | 0 | True | 100.0 |
49998 | 4.58 | 3.5 | 3.000000 | 1.00 | Astapor | False | 2014-01-14 | 0.0 | 2 | False | 100.0 |
49999 | 3.49 | 5.0 | 4.601559 | 1.00 | Astapor | True | 2014-01-18 | 0.0 | 0 | False | 0.0 |
49604 rows × 11 columns
avg_dist float64
avg_rating_by_driver float64
avg_rating_of_driver float64
avg_surge float64
city object
phone object
signup_date datetime64[ns]
surge_pct float64
trips_in_first_30_days int64
luxury_car_user bool
weekday_pct float64
dtype: object
# Hold out a test set from the engineered features. Pinning random_state
# makes the split -- and every score reported below -- reproducible.
# (Dropped the commented-out CSV-based split loading; dead code.)
# NOTE(review): train_test_split is imported further down with the other
# sklearn imports -- run that cell first when executing top-to-bottom.
X_training, X_test, y_training, y_test = train_test_split(X, y, random_state=42)
import seaborn as sns
def plot_correlation_matrix(X):
    """Draw a diverging heatmap of the pairwise feature correlations.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; correlations are computed with ``X.corr()``.

    Returns
    -------
    matplotlib Axes holding the heatmap (drawing happens as a side effect;
    the original version returned None and discarded the axes).
    """
    corr = X.corr()
    ax = sns.heatmap(
        corr,
        vmin=-1, vmax=1, center=0,
        cmap=sns.diverging_palette(20, 220, n=200),
        square=True,
    )
    # Slant the x labels so long feature names stay readable.
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment='right',
    )
    return ax
def plot_logistic_regression(X, feature, y):
    """TODO: plot a logistic-regression fit of y against one feature. Unimplemented stub; currently returns None."""
    pass
def get_p_values(X, y):
    """TODO: compute per-feature p-values. Unimplemented stub; currently returns None."""
    pass
# Draw the training-set correlation heatmap, title it, and save to disk.
# NOTE(review): plt (matplotlib.pyplot) is not imported in the visible
# code -- an earlier cell presumably imports it; confirm.
plot_correlation_matrix(X_training)
plt.title('Feature Correlation')
plt.tight_layout()
plt.savefig('feature_correlations', dpi=240)
Text(0.5, 1, 'Feature Correlation')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error  # used below as a misclassification rate on boolean labels
from sklearn.model_selection import train_test_split  # NOTE: already used above at the X/y split -- cell-order dependency
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import plot_partial_dependence  # NOTE(review): unused here; deprecated and removed in sklearn >= 1.2 (use PartialDependenceDisplay)
# logistic
def train_logistic_model(X, y):
    """Fit a logistic regression on a random 75/25 split of (X, y).

    Returns
    -------
    float
        Mean squared error of the hard predictions on the held-out quarter.
        With boolean 0/1 labels this equals the misclassification rate
        (1 - accuracy), so lower is better.
    """
    model = LogisticRegression()
    # random_state=1 pins the split so repeated calls score the same fold.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_valid)
    # MSE of 0/1 values is exactly the fraction misclassified.
    score = mean_squared_error(y_valid, y_predict)
    return score
#random forest
# Carve a validation set out of the training portion for hyperparameter tuning.
X_train, X_valid, y_train, y_valid = train_test_split(X_training, y_training)
# Convert the labels to plain numpy arrays -- presumably to avoid sklearn's
# column-vector/shape warnings when y is a pandas object; TODO confirm.
y_train = y_train.values
y_valid = y_valid.values
def train_random_forest(X_train, y_train, X_valid, y_valid, num_trees=100, random_state=None):
    """Fit a random forest and report feature importances and the OOB score.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_valid, y_valid : unused. Kept so existing positional call sites keep
        working; validation scoring is done by the callers via model.score.
    num_trees : int
        Number of trees (n_estimators) in the forest.
    random_state : int or None
        Optional seed for a reproducible forest (new, defaulted parameter;
        None reproduces the original unseeded behavior).

    Returns
    -------
    (fitted RandomForestClassifier, DataFrame of feature importances
     indexed by column name, out-of-bag accuracy)
    """
    rf = RandomForestClassifier(oob_score=True, n_estimators=num_trees,
                                random_state=random_state)
    rf.fit(X_train, y_train)
    feature_importance = pd.DataFrame(rf.feature_importances_, index=X_train.columns)
    oob_score = rf.oob_score_
    return rf, feature_importance, oob_score
# Fit a first forest on the training fold and display its importances and
# out-of-bag score. (Dropped the commented-out test_random_forest draft --
# dead code -- and the bare `score` expression, which is a NameError when
# the notebook is run top-to-bottom: `score` is only assigned inside the
# tuning loop further down.)
rf1, feature_importance, oob_score = train_random_forest(X_train, y_train, X_valid, y_valid)
feature_importance
oob_score
rf1
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
0 | |
---|---|
avg_dist | 0.203373 |
avg_rating_by_driver | 0.108149 |
avg_rating_of_driver | 0.075538 |
avg_surge | 0.063987 |
phone | 0.034142 |
signup_date | 0.136856 |
surge_pct | 0.075776 |
trips_in_first_30_days | 0.074940 |
luxury_car_user | 0.029972 |
weekday_pct | 0.120915 |
kings_landing | 0.055638 |
astapor | 0.020714 |
0.7688089670187888
0.7641746111389864
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=True, random_state=None,
verbose=0, warm_start=False)
#def partial_dependence(model, X, feature_index, classification=True):
# '''
# Parameters
# ----------
# model: fitted model
# anything with .predict()
# X: numpy array
# data the model was trained on.
# feature_index: int
# feature to calculate partial dependence for
# classification: boolean.
# True if the model is a classifier
# (in which case, it must have .predict_proba()
# False if the model is a regressor
# Returns
# -------
# x_values: numpy array
# x values to plot partial dependence over
# pdp: numpy array
# partial dependence values
# example:
# >> x, pdp = partial_dependence(model, X_train, 3, classification=False)
# >> plt.plot(x, pdp)
# '''
# x_values = np.unique(X[:,feature_index])
# pdp = np.zeros(x_values.shape)
# for i, value in enumerate(x_values):
# X_new = replace_column(X, feature_index, value)
# if classification:
# y_pred_prob = model.predict_proba(X_new)[:,1]
# y_pred_prob = np.clip(y_pred_prob, 0.001, 0.999)
# y_pred = np.log(y_pred_prob / (1 - y_pred_prob))
# else:
# y_pred = model.predict(X_new)
# pdp[i] = y_pred.mean()
# return (x_values, pdp)
#
#feature_indices = X_train.columns
#print(feature_indices)
#fig, axes = plt.subplots(4,3, figsize=(16,8), sharey=True)
#for ax, feat_ind in zip(axes.flatten(), feature_indices):
# xx, pdp = partial_dependence(rf1, X_train, feat_ind)
# ax.plot(xx, pdp)
# ax.set_title(feature_names[feat_ind])
#plt.tight_layout()
# Sanity check: predictions and validation labels line up one-to-one.
# The original passed num_trees=num, but `num` is only defined inside the
# tuning loop below -- a NameError when run top-to-bottom. Use an explicit
# size instead.
model = train_random_forest(X_train, y_train, X_valid, y_valid, num_trees=100)[0]
y_predict = model.predict(X_valid)
model.score(X_valid, y_valid)
y_predict.shape
y_valid.shape
(12401,)
(12401,)
# Tune the forest size: fit at several n_estimators values and record the
# mean validation accuracy for each. (Dropped the commented-out reshape
# lines -- dead code.)
num_trees = [50, 100, 250, 500, 1000]
scores = []
for num in num_trees:
    model = train_random_forest(X_train, y_train, X_valid, y_valid, num_trees=num)[0]
    # RandomForestClassifier.score returns mean accuracy on the given set.
    score = model.score(X_valid, y_valid)
    scores.append(score)
print(scores)
[0.7681969680679497, 0.7751854639286099, 0.7769057090635415, 0.7774432856682078, 0.7764756477798086]
# Plot validation accuracy against forest size and save the figure.
sns.lineplot(x=num_trees, y=scores)
plt.xlabel('Number of Trees')
plt.ylabel('Mean Accuracy')
plt.title('Optimizing Number of Trees in Random Forest')
plt.tight_layout()
plt.savefig('MeanAccuracyPlot', dpi=240)
<matplotlib.axes._subplots.AxesSubplot at 0x1a58534290>
Text(0.5, 0, 'Number of Trees')
Text(0, 0.5, 'Mean Accuracy')
Text(0.5, 1.0, 'Optimizing Number of Trees in Random Forest')
# Retrain at the chosen size (500 trees) on the full training portion and
# evaluate once on the held-out test set. X_test/y_test are passed through
# but unused inside train_random_forest; test scoring happens via .score.
final_rf, feature_importance, oob_score = train_random_forest(X_training, y_training, X_test, y_test, num_trees=500)
feature_importance
oob_score
final_rf.score(X_test, y_test)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
0 | |
---|---|
avg_dist | 0.207971 |
avg_rating_by_driver | 0.108332 |
avg_rating_of_driver | 0.072820 |
avg_surge | 0.063629 |
phone | 0.034464 |
signup_date | 0.135938 |
surge_pct | 0.077600 |
trips_in_first_30_days | 0.072872 |
luxury_car_user | 0.031577 |
weekday_pct | 0.119234 |
kings_landing | 0.055713 |
astapor | 0.019848 |
0.7736204069564282
0.7734053705346343
# Scatter the per-feature importances of the final forest and save it.
ax = sns.scatterplot(data=feature_importance,legend=False)
# NOTE(review): the labels come from X_valid.columns and rely on matching
# feature_importance's row order (both derive from X_train's column order);
# this breaks silently if the tick count or ordering ever differs -- verify.
ax.set_xticklabels(
    labels=X_valid.columns,
    rotation=90,
    horizontalalignment='right')
plt.tight_layout()
plt.savefig('FeatureImportance', dpi=240)
[Text(0, 0, 'avg_dist'),
Text(0, 0, 'avg_rating_by_driver'),
Text(0, 0, 'avg_rating_of_driver'),
Text(0, 0, 'avg_surge'),
Text(0, 0, 'phone'),
Text(0, 0, 'signup_date'),
Text(0, 0, 'surge_pct'),
Text(0, 0, 'trips_in_first_30_days'),
Text(0, 0, 'luxury_car_user'),
Text(0, 0, 'weekday_pct'),
Text(0, 0, 'kings_landing'),
Text(0, 0, 'astapor')]
# Final held-out accuracy of the tuned (500-tree) random forest.
final_score = final_rf.score(X_test, y_test)
final_score
0.7734053705346343