FEA add TunedThresholdClassifier meta-estimator to post-tune the cut-off threshold #26120

Merged
merged 228 commits into from May 3, 2024
Changes from 151 commits
Commits (228)
b44dd9d
MAINT refactor scorer using _get_response_values
glemaitre Mar 31, 2023
516f62f
Add __name__ for method of Mock
glemaitre Apr 1, 2023
d2fbee0
remove multiclass issue
glemaitre Apr 1, 2023
29e5e87
make response_method a mandatory arg
glemaitre Apr 3, 2023
b645ade
Update sklearn/metrics/_scorer.py
glemaitre Apr 3, 2023
3397c56
apply jeremie comments
glemaitre Apr 3, 2023
092689a
Merge branch 'main' into is/18589_restart
glemaitre Apr 3, 2023
200ec31
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre Apr 4, 2023
e871558
iter
glemaitre Apr 4, 2023
31aa1c0
Merge remote-tracking branch 'glemaitre/is/18589_restart' into cutoff…
glemaitre Apr 4, 2023
74614e8
FEA add CutOffClassifier to post-tune prediction threshold
glemaitre Apr 7, 2023
27713af
DOC add changelog entry
glemaitre Apr 7, 2023
ed1d9b3
refresh implementation
glemaitre Apr 7, 2023
8410317
add files
glemaitre Apr 7, 2023
c7d1fe4
remove random state for the moment
glemaitre Apr 7, 2023
c9d7a22
TST make sure to pass the common test
glemaitre Apr 15, 2023
9981f3a
TST metaestimator sample_weight
glemaitre Apr 15, 2023
b9c9d5e
API add prediction functions
glemaitre Apr 15, 2023
588f1c4
TST bypass the test for classification
glemaitre Apr 17, 2023
243d173
iter before another bug
glemaitre Apr 17, 2023
883e929
iter
glemaitre Apr 18, 2023
69333ed
TST add test for _fit_and_score
glemaitre Apr 19, 2023
8616da1
iter
glemaitre Apr 19, 2023
99a10b3
integrate refit
glemaitre Apr 19, 2023
0f6dce2
TST more test
glemaitre Apr 19, 2023
d6fb9f7
TST more test with sample_weight
glemaitre Apr 19, 2023
7ff3d0d
BUG fit_params split
glemaitre Apr 19, 2023
6985ae9
TST add test for fit_params
glemaitre Apr 19, 2023
239793a
TST check underlying response method for TNR/TPR
glemaitre Apr 20, 2023
92083ed
FEA add the possibility to provide a dict
glemaitre Apr 20, 2023
55d0844
TST check string and pos_label interation for cost-matrix
glemaitre Apr 20, 2023
7dfc4a6
TST add sample_weight test for cost-matrix
glemaitre Apr 20, 2023
729c9a8
iter
glemaitre Apr 20, 2023
787be21
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre Apr 24, 2023
146b170
change strategy for finding max
glemaitre Apr 24, 2023
03b1f7f
iter
glemaitre Apr 24, 2023
8a09a5f
add some test for precision-recall
glemaitre Apr 24, 2023
d56f57f
TST add invariance zeros weight
glemaitre Apr 24, 2023
cf164c5
DOC fix default n_thresholds
glemaitre Apr 24, 2023
c943f5e
DOC add a small example
glemaitre Apr 25, 2023
bf1462b
iter
glemaitre Apr 25, 2023
fa89431
iter
glemaitre Apr 25, 2023
862519d
bug fixes everywhere
glemaitre Apr 25, 2023
aa520da
iter
glemaitre Apr 25, 2023
5403cf6
Do not allow for single threshold
glemaitre Apr 26, 2023
cd37743
TST add random state checkingclassifier
glemaitre Apr 26, 2023
e7d07af
TST more test for _ContinuousScorer
glemaitre Apr 26, 2023
bc20a47
TST add test for pos_label
glemaitre Apr 26, 2023
bba2f97
TST add pos_label test for TNR/TPR
glemaitre Apr 26, 2023
f925503
some more
glemaitre Apr 26, 2023
d539235
avoid extrapolation
glemaitre Apr 27, 2023
c0acd44
FEA add all thresholds and score computed as attributes
glemaitre Apr 27, 2023
f87baa7
fix docstring
glemaitre Apr 28, 2023
e4dac09
EXA add example of cut-off tuning
glemaitre Apr 28, 2023
4da7cef
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre Apr 28, 2023
bd86595
solving the issue of unknown categories
glemaitre Apr 28, 2023
45e6e5a
fix
glemaitre Apr 28, 2023
402a1a7
EXA add hyperlink in the example
glemaitre Apr 29, 2023
6745afc
DOC add warning regarding overfitting
glemaitre Apr 29, 2023
4d557cc
some more doc
glemaitre Apr 29, 2023
2c6ee7e
some more doc
glemaitre Apr 29, 2023
91c8222
DOC more documentation
glemaitre May 2, 2023
9a96ae1
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre May 2, 2023
3d4ce81
fix import
glemaitre May 2, 2023
d7d8dac
fix import
glemaitre May 2, 2023
aa3e83d
iter
glemaitre May 2, 2023
ab97d63
fix
glemaitre May 2, 2023
acb6af8
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre May 4, 2023
486a2bd
Update sklearn/metrics/_scorer.py
glemaitre May 4, 2023
6d4c4aa
Apply suggestions from code review
glemaitre May 15, 2023
1d12e1f
Fix linter
ogrisel Jun 1, 2023
21e20e0
Merge branch 'main' into cutoff_classifier_again
ogrisel Jun 1, 2023
7952cce
Add routing to LogisticRegressionCV
Jun 7, 2023
66ad513
Add a test with enable_metadata_routing=False and fix an issue in sco…
Jun 7, 2023
7e8b824
Add metaestimator tests and fix passing routed params in score method
Jun 13, 2023
d7e50a6
PR suggestions
Jun 25, 2023
3844706
Merge branch 'main' into logistic_cv_routing
Jun 26, 2023
0866c42
Add changelog entry
Jun 26, 2023
43f971b
Add user and pr information
Jun 26, 2023
db63769
Changelog adjustment
Jun 26, 2023
a9b984f
Remove repr method from ConsumingScorer
Jun 26, 2023
97105a4
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre Jul 3, 2023
52f5921
handle the np.inf case in roc-curve
glemaitre Jul 3, 2023
637c18e
Merge branch 'main' into logistic_cv_routing
adrinjalali Jul 7, 2023
314bc83
Adjust changelog
Jul 7, 2023
9a8ef4e
Add tests for error when passing params when routing not enabled in L…
OmarManzoor Jul 10, 2023
5b723a0
Address PR suggestions partially
Jul 13, 2023
9ce463d
address comment Tim
glemaitre Jul 13, 2023
bba8f55
iter
glemaitre Jul 13, 2023
1c5487d
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre Jul 13, 2023
d302678
MAINT rename modules as per olivier comment
glemaitre Jul 13, 2023
2ade221
add missing module
glemaitre Jul 13, 2023
dca5770
update changelog
glemaitre Jul 13, 2023
8897533
more renaming
glemaitre Jul 13, 2023
75bd7ac
iter
glemaitre Jul 13, 2023
c07a980
Adjust and change the name of params in _check_method_params
OmarManzoor Jul 13, 2023
cc5ba48
Resolve conflict in changelog
OmarManzoor Jul 13, 2023
66c4c7f
iter
glemaitre Jul 13, 2023
378930e
iter
glemaitre Jul 13, 2023
c88ed94
iter
glemaitre Jul 13, 2023
4715e67
iter
glemaitre Jul 13, 2023
b3bb39f
iter
glemaitre Jul 13, 2023
915624a
Merge branch 'main' into logistic_cv_routing
glemaitre Jul 13, 2023
b72a72a
iter
glemaitre Jul 13, 2023
5108e43
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre Jul 13, 2023
95150b0
Merge branch 'pr/OmarManzoor/26525' into cutoff_with_metadata_routing
glemaitre Jul 13, 2023
5b66ab8
Merge remote-tracking branch 'origin/main' into cutoff_with_metadata_…
glemaitre Jul 14, 2023
b4e67fb
Add metadata routing
glemaitre Jul 14, 2023
005126a
Apply suggestions from code review
glemaitre Jul 14, 2023
767a05f
CLN clean up some repeated code related to SLEP006
adrinjalali Jul 14, 2023
080ba5c
iter
glemaitre Jul 14, 2023
05ec85d
iter
glemaitre Jul 14, 2023
63c32bd
ENH add new response_method in make_scorer
glemaitre Jul 15, 2023
1584c5b
add non-regression test
glemaitre Jul 15, 2023
1a5a247
update validation param
glemaitre Jul 15, 2023
4cc61b9
more coverage
glemaitre Jul 15, 2023
8f36235
TST add mulitlabel test
glemaitre Jul 15, 2023
9e6b384
Merge branch 'make_scorer_list_response' into cutoff_classifier_again
glemaitre Jul 15, 2023
5490ce4
simplify scorer
glemaitre Jul 15, 2023
8dad0a4
iter
glemaitre Jul 15, 2023
b918708
remove unecessary part in doc
glemaitre Jul 15, 2023
5e23523
iter
glemaitre Jul 15, 2023
d5578f9
iter
glemaitre Jul 16, 2023
f3f844e
address tim comments
glemaitre Jul 24, 2023
44ad195
Merge remote-tracking branch 'origin/main' into make_scorer_list_resp…
glemaitre Jul 24, 2023
e489eab
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre Jul 24, 2023
1cf5528
Merge branch 'make_scorer_list_response' into cutoff_classifier_again
glemaitre Jul 24, 2023
26dc94e
iter
glemaitre Jul 26, 2023
6a1a6c7
iter
glemaitre Jul 26, 2023
b17b59e
iter
glemaitre Jul 28, 2023
43c1da8
iter
glemaitre Jul 28, 2023
41a6d07
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre Jul 28, 2023
ca06717
iter
glemaitre Jul 28, 2023
ab8b466
iter
glemaitre Jul 30, 2023
235abf5
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre Aug 7, 2023
d9ec528
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre Aug 11, 2023
69f60a6
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre Sep 28, 2023
45a8504
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre Oct 17, 2023
8c4c88d
solve deprecation
glemaitre Oct 17, 2023
b97ebf4
Merge remote-tracking branch 'glemaitre/cutoff_classifier_again' into…
glemaitre Oct 17, 2023
e37f831
update changelog
glemaitre Oct 17, 2023
383937f
whoops
glemaitre Oct 17, 2023
d4ce3fb
Update sklearn/metrics/_scorer.py
glemaitre Oct 17, 2023
b6b3548
fix doc
glemaitre Oct 17, 2023
759d680
remove useless fitted attributes
glemaitre Oct 18, 2023
23e65e6
Merge branch 'main' into cutoff_classifier_again
ogrisel Dec 4, 2023
6904817
bump pandas to 1.1.5
glemaitre Dec 4, 2023
bee1ebe
update lock file
glemaitre Dec 4, 2023
4d86a36
iter
glemaitre Jan 13, 2024
48fd7cd
update doc-min lock file
glemaitre Jan 13, 2024
0854cd4
partial reviews
glemaitre Jan 13, 2024
ac75300
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre Mar 18, 2024
2df616e
Apply suggestions from code review
glemaitre Mar 18, 2024
b14225c
update lock files
glemaitre Mar 18, 2024
98dcefd
Merge remote-tracking branch 'glemaitre/cutoff_classifier_again' into…
glemaitre Mar 18, 2024
7e3d7aa
iter
glemaitre Mar 18, 2024
b958bb0
simplify refit and do not allow cv == 1
glemaitre Mar 19, 2024
e7722f6
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre Mar 19, 2024
98a1db8
check raise for multilabel
glemaitre Mar 19, 2024
c28a3e1
fix test name
glemaitre Mar 19, 2024
076fd29
test another to check if beta is forwarded
glemaitre Mar 19, 2024
e728f1d
iter
glemaitre Mar 20, 2024
c73b205
refit=True and cv is float
glemaitre Mar 20, 2024
a4890df
rename scorer to curve scorer internally
glemaitre Mar 22, 2024
f8a5a79
add a note regarding the abuse of the scorer API
glemaitre Mar 22, 2024
5dfa435
use None instead of highest
glemaitre Mar 22, 2024
d45a71b
use a closer CV API
glemaitre Mar 23, 2024
a32c151
fix example
glemaitre Mar 23, 2024
7592437
simplify model
glemaitre Mar 23, 2024
dc5346b
fix
glemaitre Mar 23, 2024
843ca04
fix docstring
glemaitre Mar 23, 2024
51ed9a8
Apply suggestions from code review
glemaitre Mar 30, 2024
3c89ab3
Apply suggestions from code review
glemaitre Apr 2, 2024
8cd5582
pep8
glemaitre Apr 2, 2024
a48487c
rephrase suggestions
glemaitre Apr 2, 2024
dd18549
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre Apr 4, 2024
27515ca
fix
glemaitre Apr 8, 2024
8a87b26
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre Apr 8, 2024
811dec9
include and discuss more about amount
glemaitre Apr 8, 2024
c83b4e1
iter
glemaitre Apr 8, 2024
d4e232f
Apply suggestions from code review
glemaitre Apr 25, 2024
92f6e05
Update examples/model_selection/plot_tuned_decision_threshold.py
glemaitre Apr 25, 2024
1c5c3f4
iter
glemaitre Apr 25, 2024
94160ba
iter
glemaitre Apr 25, 2024
85c8484
other comment
glemaitre Apr 25, 2024
4f86e9d
addressed comments
glemaitre Apr 25, 2024
d747098
Apply suggestions from code review
glemaitre Apr 27, 2024
6d0f418
rename TunedThresholdClassifier to TunedThresholdClassifierCV
glemaitre Apr 27, 2024
5671dd6
use meaningful values for check the thresholds values depending on po…
glemaitre Apr 27, 2024
f04085d
TST add more info regarding why not exactly 0 and 1
glemaitre Apr 27, 2024
d179b5f
DOC add documentation for base scorer
glemaitre Apr 27, 2024
2c375f8
DOC add more details regarding the curve scorer
glemaitre Apr 27, 2024
66ba8da
directly test curve_scorer instead to look for function anem
glemaitre Apr 27, 2024
a6b19c1
add required arguments
glemaitre Apr 27, 2024
b3b99ff
DOC add docstring for interpolated score
glemaitre Apr 27, 2024
dda0d2c
Update sklearn/model_selection/tests/test_classification_threshold.py
glemaitre Apr 29, 2024
48e7829
Update sklearn/model_selection/tests/test_classification_threshold.py
glemaitre Apr 29, 2024
17839e8
Apply suggestions from code review
glemaitre Apr 29, 2024
3f02bc3
remove duplicated check
glemaitre Apr 29, 2024
553cfce
remove duplicated check
glemaitre Apr 29, 2024
6ae6d27
check cv_results_ API
glemaitre Apr 29, 2024
d010096
clone classifier
glemaitre Apr 29, 2024
8bb8ca6
TST better comments
glemaitre Apr 29, 2024
fd971c7
iter
glemaitre Apr 29, 2024
bf57dac
FEA add a ConstantThresholdClassifier instead of strategy="constant" …
glemaitre Apr 30, 2024
0409932
make FixedThresholdClassifier appear in example
glemaitre Apr 30, 2024
66ea575
iter
glemaitre Apr 30, 2024
1c97dd4
Update doc/modules/classes.rst
glemaitre Apr 30, 2024
c8c1d0c
Update sklearn/model_selection/_classification_threshold.py
glemaitre Apr 30, 2024
8a52bc6
TST and fix default parameter
glemaitre Apr 30, 2024
ef668cf
Merge remote-tracking branch 'origin/main' into cutoff_classifier_again
glemaitre Apr 30, 2024
fdbf68e
TST metadarouting FixedThresholdClassifier
glemaitre Apr 30, 2024
9c0c13d
rename n_thresholds to thresholds
glemaitre Apr 30, 2024
f419371
cover constant predictor error
glemaitre Apr 30, 2024
42eafe5
TST some tests for get_response_values_binary
glemaitre Apr 30, 2024
581133f
use conditional p(y|X) instead of posterior
glemaitre May 2, 2024
0f803d9
be more explicit that strings need to be provided to objective_metric
glemaitre May 2, 2024
eb0defc
factorize plotting into a function
glemaitre May 2, 2024
ffd5669
fix typo in code
glemaitre May 2, 2024
18abafe
use proper scoring rule and robust estimator to scale
glemaitre May 2, 2024
ce9464c
improve narrative
glemaitre May 2, 2024
89d67cf
use grid-search
glemaitre May 2, 2024
db3360b
Apply suggestions from code review
glemaitre May 2, 2024
1789cc0
remove constrainted metrics option
glemaitre May 3, 2024
e7c31b9
partial review
glemaitre May 3, 2024
0fd667c
rename objective_metric to scoring
glemaitre May 3, 2024
07e4387
fix typo
glemaitre May 3, 2024
9bd68e6
remove pos_label and delegate to make_scorer
glemaitre May 3, 2024
22 changes: 11 additions & 11 deletions build_tools/circle/doc_min_dependencies_linux-64_conda.lock
@@ -1,6 +1,6 @@
# Generated by conda-lock.
# platform: linux-64
# input_hash: a58a98732e5815c15757bc1def8ddc0d87f20f11edcf6e7b408594bf948cbb3e
# input_hash: 46b1818af4901a4b14e79dab7a99627a28da9815d13cdb73c40e4590b2bd6259
@EXPLICIT
https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81
https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2023.11.17-hbcca054_0.conda#01ffc8d36f9eba0ce0b3c1955fa780ee
@@ -48,7 +48,7 @@ https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.3.2-hd590300_0.co
https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda#5aa797f8787fe7a17d1b0821485b5adc
https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.13-hd590300_5.conda#f36c115f1ee199da648e0597ec2047ad
https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.9.4-hcb278e6_0.conda#318b08df404f9c9be5712aaa5a6f0bb0
https://conda.anaconda.org/conda-forge/linux-64/mpg123-1.32.3-h59595ed_0.conda#bdadff838d5437aea83607ced8b37f75
https://conda.anaconda.org/conda-forge/linux-64/mpg123-1.32.4-h59595ed_0.conda#3f1017b4141e943d9bc8739237f749e8
https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.4-h59595ed_2.conda#7dbaa197d7ba6032caf7ae7f32c1efa0
https://conda.anaconda.org/conda-forge/linux-64/nspr-4.35-h27087fc_0.conda#da0ec11a6454ae19bff5b02ed881a2b1
https://conda.anaconda.org/conda-forge/linux-64/openssl-3.2.0-hd590300_1.conda#603827b39ea2b835268adb8c821b8570
@@ -105,7 +105,7 @@ https://conda.anaconda.org/conda-forge/linux-64/xcb-util-keysyms-0.4.0-h8ee46fc_
https://conda.anaconda.org/conda-forge/linux-64/xcb-util-renderutil-0.3.9-hd590300_1.conda#e995b155d938b6779da6ace6c6b13816
https://conda.anaconda.org/conda-forge/linux-64/xcb-util-wm-0.4.1-h8ee46fc_1.conda#90108a432fb5c6150ccfee3f03388656
https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.7-h8ee46fc_0.conda#49e482d882669206653b095f5206c05b
https://conda.anaconda.org/conda-forge/noarch/alabaster-0.7.13-pyhd8ed1ab_0.conda#06006184e203b61d3525f90de394471e
https://conda.anaconda.org/conda-forge/noarch/alabaster-0.7.16-pyhd8ed1ab_0.conda#def531a3ac77b7fb8c21d17bb5d0badb
https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py39h3d6467e_1.conda#c48418c8b35f1d59ae9ae1174812b40a
https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.7.0-hd590300_0.conda#fad1d0a651bf929c6c16fbf1f6ccfa7c
https://conda.anaconda.org/conda-forge/noarch/certifi-2023.11.17-pyhd8ed1ab_0.conda#2011bcf45376341dd1d690263fdbc789
@@ -117,7 +117,7 @@ https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_0.conda#5
https://conda.anaconda.org/conda-forge/linux-64/cython-0.29.33-py39h227be39_0.conda#34bab6ef3e8cdf86fe78c46a984d3217
https://conda.anaconda.org/conda-forge/linux-64/dbus-1.13.6-h5008d03_3.tar.bz2#ecfff944ba3960ecb334b9a2663d708d
https://conda.anaconda.org/conda-forge/linux-64/docutils-0.19-py39hf3d152e_1.tar.bz2#adb733ec2ee669f6d010758d054da60f
https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.2.0-pyhd8ed1ab_0.conda#f6c211fee3c98229652b60a9a42ef363
https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.2.0-pyhd8ed1ab_2.conda#8d652ea2ee8eaee02ed8dc820bc794aa
https://conda.anaconda.org/conda-forge/noarch/execnet-2.0.2-pyhd8ed1ab_0.conda#67de0d8241e1060a479e3c37793e26f9
https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.14.2-h14ed4e7_0.conda#0f69b688f52ff6da70bccb7ff7001d1d
https://conda.anaconda.org/conda-forge/noarch/fsspec-2023.12.2-pyhca7485f_0.conda#bf40f2a8835b78b1f91083d306b493d2
@@ -175,7 +175,7 @@ https://conda.anaconda.org/conda-forge/linux-64/cytoolz-0.12.2-py39hd1e30aa_1.co
https://conda.anaconda.org/conda-forge/linux-64/fortran-compiler-1.7.0-heb67821_0.conda#7ef7c0f111dad1c8006504a0f1ccd820
https://conda.anaconda.org/conda-forge/linux-64/glib-2.78.3-hfc55251_0.conda#e08e51acc7d1ae8dbe13255e7b4c64ac
https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-7.0.1-pyha770c72_0.conda#746623a787e06191d80a2133e5daff17
https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.2-pyhd8ed1ab_1.tar.bz2#c8490ed5c70966d232fdd389d0dbed37
https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.3-pyhd8ed1ab_0.conda#e7d8df6509ba635247ff9aea31134262
https://conda.anaconda.org/conda-forge/noarch/joblib-1.3.2-pyhd8ed1ab_0.conda#4da50d410f553db77e62ab62ffaa1abc
https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-20_linux64_openblas.conda#36d486d72ab64ffea932329a1d3729a3
https://conda.anaconda.org/conda-forge/linux-64/libclang-15.0.7-default_hb11cfb5_4.conda#c90f4cbb57839c98fef8f830e4b9972f
@@ -201,7 +201,7 @@ https://conda.anaconda.org/conda-forge/linux-64/pyqt5-sip-12.12.2-py39h3d6467e_5
https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.5.0-pyhd8ed1ab_0.conda#d5f595da2daead898ca958ac62f0307b
https://conda.anaconda.org/conda-forge/noarch/requests-2.31.0-pyhd8ed1ab_0.conda#a30144e4156cdbb236f99ebb49828f8b
https://conda.anaconda.org/conda-forge/linux-64/blas-devel-3.9.0-20_linux64_openblas.conda#9932a1d4e9ecf2d35fb19475446e361e
https://conda.anaconda.org/conda-forge/noarch/dask-core-2023.12.1-pyhd8ed1ab_0.conda#bf6ad72d882bc3f04e6a0fb50fd2cce8
https://conda.anaconda.org/conda-forge/noarch/dask-core-2024.1.0-pyhd8ed1ab_0.conda#cab4cec272dc1e30086f7d32faa4f130
https://conda.anaconda.org/conda-forge/linux-64/gst-plugins-base-1.22.8-h8e1006c_1.conda#3926dab94fe06d88ade0e716d77b8cf8
https://conda.anaconda.org/conda-forge/linux-64/imagecodecs-lite-2019.12.3-py39hd257fcd_5.tar.bz2#32dba66d6abc2b4b5b019c9e54307312
https://conda.anaconda.org/conda-forge/noarch/imageio-2.33.1-pyh8c1a49c_0.conda#1c34d58ac469a34e7e96832861368bce
@@ -226,10 +226,10 @@ https://conda.anaconda.org/conda-forge/noarch/numpydoc-1.2-pyhd8ed1ab_0.tar.bz2#
https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_0.conda#ac832cc43adc79118cf6e23f1f9b8995
https://conda.anaconda.org/conda-forge/noarch/sphinx-gallery-0.15.0-pyhd8ed1ab_0.conda#1a49ca9515ef9a96edff2eea06143dc6
https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.3.0-py_0.tar.bz2#9363002e2a134a287af4e32ff0f26cdc
https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-applehelp-1.0.7-pyhd8ed1ab_0.conda#aebfabcb60c33a89c1f9290cab49bc93
https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-devhelp-1.0.5-pyhd8ed1ab_0.conda#ebf08f5184d8eaa486697bc060031953
https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-htmlhelp-2.0.4-pyhd8ed1ab_0.conda#a9a89000dfd19656ad004b937eeb6828
https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-qthelp-1.0.6-pyhd8ed1ab_0.conda#cf5c9649272c677a964a7313279e3a9b
https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-applehelp-1.0.8-pyhd8ed1ab_0.conda#611a35a27914fac3aa37611a6fe40bb5
https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-devhelp-1.0.6-pyhd8ed1ab_0.conda#d7e4954df0d3aea2eacc7835ad12671d
https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-htmlhelp-2.0.5-pyhd8ed1ab_0.conda#7e1e7437273682ada2ed5e9e9714b140
https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-qthelp-1.0.7-pyhd8ed1ab_0.conda#26acae54b06f178681bfb551760f5dd1
https://conda.anaconda.org/conda-forge/noarch/sphinx-6.0.0-pyhd8ed1ab_2.conda#ac1d3b55da1669ee3a56973054fd7efb
https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-serializinghtml-1.1.9-pyhd8ed1ab_0.conda#0612e497d7860728f2cda421ea2aec09
https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-serializinghtml-1.1.10-pyhd8ed1ab_0.conda#e507335cb4ca9cff4c3d0fa9cdab255e
# pip sphinxext-opengraph @ https://files.pythonhosted.org/packages/50/ac/c105ed3e0a00b14b28c0aa630935af858fd8a32affeff19574b16e2c6ae8/sphinxext_opengraph-0.4.2-py3-none-any.whl#sha256=a51f2604f9a5b6c0d25d3a88e694d5c02e20812dc0e482adf96c8628f9109357
1 change: 1 addition & 0 deletions doc/model_selection.rst
@@ -14,5 +14,6 @@ Model selection and evaluation

modules/cross_validation
modules/grid_search
modules/classification_threshold
modules/model_evaluation
modules/learning_curve
10 changes: 10 additions & 0 deletions doc/modules/classes.rst
@@ -1248,6 +1248,16 @@ Hyper-parameter optimizers
model_selection.RandomizedSearchCV
model_selection.HalvingRandomSearchCV

Model post-fit tuning
---------------------

.. currentmodule:: sklearn

.. autosummary::
:toctree: generated/
:template: class.rst

model_selection.TunedThresholdClassifier

Model validation
----------------
166 changes: 166 additions & 0 deletions doc/modules/classification_threshold.rst
@@ -0,0 +1,166 @@
.. currentmodule:: sklearn.model_selection

.. _tunedthresholdclassifier:

============================================================
Tuning the cut-off decision threshold for class prediction
============================================================

Classifiers are predictive models: they use statistical learning to predict outcomes.
The outcomes of a classifier are of two kinds: a score for each sample with respect to
each class, and a categorical prediction (class label). Scores are obtained from
:term:`predict_proba` or :term:`decision_function`. The former returns posterior
probability estimates for each class, while the latter returns a decision score for
each class. The decision score is a measure of how strongly the sample is predicted to
belong to the positive class (e.g., the distance to the decision boundary). In binary
classification, a decision rule is then defined by thresholding the scores, leading to
a single class label for each sample. Those labels are obtained with :term:`predict`.

For binary classification in scikit-learn, class labels are obtained by associating the
positive class with posterior probability estimates greater than 0.5 (obtained with
:term:`predict_proba`) or decision scores greater than 0 (obtained with
:term:`decision_function`).

Here, we show an example that illustrates the relation between posterior
probability estimates and class labels::

>>> from sklearn.datasets import make_classification
>>> from sklearn.tree import DecisionTreeClassifier
>>> X, y = make_classification(random_state=0)
>>> classifier = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y)
>>> classifier.predict_proba(X[:4])
array([[0.94 , 0.06 ],
[0.94 , 0.06 ],
[0.0416..., 0.9583...],
[0.0416..., 0.9583...]])
>>> classifier.predict(X[:4])
array([0, 0, 1, 1])
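
The class labels above correspond to thresholding the positive-class probability at
0.5, as described earlier. A minimal sketch of this equivalence, reusing the
`classifier` fitted above::

>>> proba_positive_class = classifier.predict_proba(X[:4])[:, 1]
>>> # thresholding the posterior probability estimates at 0.5 recovers `predict`
>>> (proba_positive_class > 0.5).astype(int)
array([0, 0, 1, 1])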

While these approaches are reasonable as default behaviors, they are not ideal for
all cases. Let's illustrate with an example.

Let's consider a scenario where a predictive model is being deployed to assist medical
doctors in detecting tumors. In this setting, doctors are most likely interested in
correctly identifying all patients with cancer so that they can provide them with the
right treatment. In other words, doctors prioritize achieving a high recall rate,
meaning they want to identify all cases of cancer without missing any patients who have
it. This emphasis on recall comes, of course, with the trade-off of potentially more
false-positive predictions, reducing the precision of the model, but that is a risk
doctors are willing to take. Consequently, when it comes to deciding whether to classify
a patient as having cancer or not, it may be more beneficial to classify them as
positive for cancer even when the posterior probability estimate is lower than 0.5.

Post-tuning the decision threshold
==================================

Review comment, member: As I mentioned in the other example, I would introduce the ROC
curve earlier in the explanation.

One solution to address the problem stated in the introduction is to tune the decision
threshold of the classifier once the model has been trained. The
:class:`~sklearn.model_selection.TunedThresholdClassifier` tunes this threshold using an
internal cross-validation. The optimum threshold is chosen to maximize a given metric,
with or without constraints.

Review comment, @jnothman (Dec 26, 2023): Do we need to justify using "internal
cross-validation" somewhere rather than tuning this like any other hyperparameter?
Indeed, ignoring `constraint_value`, is this tool just a workaround for a design flaw
wherein a parameter to `BaseClassifier` is not exposed to `*SearchCV` with warm start?

Reply, author: Today, I would indeed consider it as a design and methodological flaw. I
don't know what level of justification you think we should add on here:

    ... tunes this threshold using an internal cross-validation since the scikit-learn
    :term:`predict` API does not offer this flexibility.

Reply, member: I'm not sure I follow. Isn't the problem that we can't freeze the
estimator? Maybe I need to look at the code first...

Reply, member: OK, I realized there are two sort-of independent long-term API issues
going on here and you were talking about the other one. We need internal
cross-validation because we still don't have an API for path-like algorithms and
efficient CV, and that's what you were talking about. But there's also the "prefit"
part, which I was thinking about, and which is now actually fixed via the `__clone__`
protocol. Basically, if we're not fitting the underlying estimator, then we should use
the `__clone__` protocol to not clone it when the TunedThresholdClassifier is being
cloned, right?

Reply, member:

    we still don't have an API for path-like algorithms and efficient CV

@amueller What do you mean by that?

    is this tool just a workaround for a design flaw wherein a parameter to
    BaseClassifier is not exposed

and

    Today, I would indeed consider it as a design and methodological flaw.

@glemaitre @jnothman Shouldn't we fix this design flaw? Do you have suggestions?

Reply, author:

    Shouldn't we fix this design flaw? Do you have suggestions?

This is something that I would introduce in 2.0 because we will surely break some API.
In this case, we would have the freedom to think about a good API.

Reply, member: I guess what I am missing is a roadmap or a plan/vision.

Review comment, member: Should we note the contrast with what CalibratedClassifierCV is
for?

Reply, member: Yes for sure.

The following image illustrates the tuning of the cut-off point for a gradient boosting
classifier. While the vanilla and tuned classifiers provide the same Receiver Operating
Characteristic (ROC) and Precision-Recall curves, and thus the same
:term:`predict_proba` outputs, the class label predictions differ because of the tuned
decision threshold. The vanilla classifier predicts the class of interest for a
posterior probability greater than 0.5 while the tuned classifier predicts the class of
interest for a very low probability (around 0.02). This cut-off point optimizes a
utility metric defined by the business (in this case an insurance company).

.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cost_sensitive_learning_002.png
:target: ../auto_examples/model_selection/plot_cost_sensitive_learning.html
:align: center

Options to tune the cut-off point
---------------------------------

The cut-off point can be tuned through different strategies controlled by the parameter
`objective_metric`.

One way to tune the threshold is by maximizing a pre-defined scikit-learn metric. These
metrics can be found by calling the function :func:`~sklearn.metrics.get_scorer_names`.

Review comment, member: I would maybe mention that a common tuning is picking the
top-right point on the ROC curve, which is the same as picking the f2 score (I think?)
here. Or maybe mention that that has a nice geometric explanation but doesn't really
consider the application.

In this example, we maximize the balanced accuracy.
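
A minimal sketch of this usage (the estimator and data below are illustrative
assumptions; the class and parameter names follow this revision of the documentation)::

>>> from sklearn.datasets import make_classification
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.model_selection import TunedThresholdClassifier
>>> X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)
>>> # tune the decision threshold with the default 5-fold stratified cross-validation
>>> tuned_model = TunedThresholdClassifier(
...     LogisticRegression(), objective_metric="balanced_accuracy"
... ).fit(X, y)
>>> y_pred = tuned_model.predict(X)  # class labels obtained with the tuned threshold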

.. note::

It is important to notice that these metrics come with default parameters, notably
the label of the class of interest (i.e. `pos_label`). Thus, if this label is not
the right one for your application, you need to define a scorer and pass the right
`pos_label` (and additional parameters) using
:func:`~sklearn.metrics.make_scorer`. Refer to :ref:`scoring` for
information on defining your own scoring function. For instance, we show how to pass
the information to the scorer that the label of interest is `0` when maximizing the
:func:`~sklearn.metrics.f1_score`:

>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.model_selection import (
... TunedThresholdClassifier, train_test_split
... )
>>> from sklearn.metrics import make_scorer, f1_score
>>> X, y = make_classification(
... n_samples=1_000, weights=[0.1, 0.9], random_state=0)
>>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
>>> pos_label = 0
>>> scorer = make_scorer(f1_score, pos_label=pos_label)
>>> base_model = LogisticRegression()
>>> model = TunedThresholdClassifier(base_model, objective_metric=scorer).fit(
... X_train, y_train)
>>> scorer(model, X_test, y_test)
0.79...
>>> # compare it with the internal score found by cross-validation
>>> model.objective_score_
0.86...

A second strategy aims to maximize one metric while imposing constraints on another
metric. There are four pre-defined options: two use the Receiver Operating
Characteristic (ROC) statistics and two use the Precision-Recall statistics.

- `"max_tpr_at_tnr_constraint"`: maximizes the True Positive Rate (TPR) such that the
True Negative Rate (TNR) is the closest to a given value.
- `"max_tnr_at_tpr_constraint"`: maximizes the TNR such that the TPR is the closest to
a given value.
- `"max_precision_at_recall_constraint"`: maximizes the precision such that the recall
is the closest to a given value.
- `"max_recall_at_precision_constraint"`: maximizes the recall such that the precision
is the closest to a given value.

For these options, the `constraint_value` parameter needs to be defined. In addition,
you can use the `pos_label` parameter to indicate the label of the class of interest.
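
For instance, a sketch of maximizing the precision under a recall constraint (the
estimator, data, and constraint value below are illustrative assumptions)::

>>> from sklearn.datasets import make_classification
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.model_selection import TunedThresholdClassifier
>>> X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)
>>> # maximize precision such that recall stays the closest to 0.7
>>> model = TunedThresholdClassifier(
...     LogisticRegression(),
...     objective_metric="max_precision_at_recall_constraint",
...     constraint_value=0.7,
...     pos_label=1,
... ).fit(X, y)
>>> y_pred = model.predict(X)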

Important notes regarding the internal cross-validation
-------------------------------------------------------

By default, :class:`~sklearn.model_selection.TunedThresholdClassifier` uses a 5-fold
stratified cross-validation to tune the cut-off point. The parameter `cv` controls the
cross-validation strategy. It is possible to bypass cross-validation by setting
`cv="prefit"` and providing a fitted classifier. In this case, the cut-off point is
tuned on the data provided to the `fit` method.

However, you should be extremely careful when using this option. You should never use
the same data for training the classifier and tuning the cut-off point, due to the risk
of overfitting. Refer to the following example section for more details (cf.
:ref:`tunedthresholdclassifier_no_cv`). If you have limited resources, consider passing
a float to `cv` to use a single internal train-test split.

The option `cv="prefit"` should only be used when the provided classifier was already
trained, and you just want to find the best cut-off using a new validation set.
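
A minimal sketch of this workflow (the estimator and the data split below are
illustrative assumptions)::

>>> from sklearn.datasets import make_classification
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.model_selection import TunedThresholdClassifier, train_test_split
>>> X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)
>>> X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
>>> already_fitted = LogisticRegression().fit(X_train, y_train)
>>> # with cv="prefit", the threshold is tuned on the data passed to `fit`, here a
>>> # validation set that was not used to train the classifier
>>> model = TunedThresholdClassifier(
...     already_fitted, cv="prefit", objective_metric="balanced_accuracy"
... ).fit(X_val, y_val)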

Manually setting the decision threshold
---------------------------------------

The previous sections discussed strategies to find an optimal decision threshold. It is
also possible to manually set the decision threshold in
:class:`~sklearn.model_selection.TunedThresholdClassifier` by setting the parameter
`strategy` to `"constant"` and providing the desired threshold using the parameter
`constant_threshold`.
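
A minimal sketch (the threshold value and estimator below are illustrative assumptions;
parameter names follow this revision of the documentation)::

>>> from sklearn.datasets import make_classification
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.model_selection import TunedThresholdClassifier
>>> X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)
>>> # predict the positive class only when its estimated probability exceeds 0.75
>>> model = TunedThresholdClassifier(
...     LogisticRegression(), strategy="constant", constant_threshold=0.75
... ).fit(X, y)
>>> y_pred = model.predict(X)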

Examples
--------

- See the example entitled
:ref:`sphx_glr_auto_examples_model_selection_plot_tuned_decision_threshold.py`,
to get insights on the post-tuning of the decision threshold.
- See the example entitled
:ref:`sphx_glr_auto_examples_model_selection_plot_cost_sensitive_learning.py`,
to learn about cost-sensitive learning and decision threshold tuning.
8 changes: 8 additions & 0 deletions doc/whats_new/v1.5.rst
@@ -31,6 +31,14 @@ Changelog
by passing a function in place of a strategy name.
:pr:`28053` by :user:`Mark Elliot <mark-thm>`.

:mod:`sklearn.model_selection`
..............................

- |MajorFeature| :class:`model_selection.TunedThresholdClassifier` post-tunes
the decision threshold of a binary classifier by maximizing a
classification metric through cross-validation.
:pr:`26120` by :user:`Guillaume Lemaitre <glemaitre>`.

Code and Documentation Contributors
-----------------------------------
