
API Implements get_feature_names_out for transformers that support get_feature_names #18444

Merged · 121 commits · Sep 7, 2021

Commits
ab2acbd
work on get_feature_names for pipeline
amueller Nov 20, 2018
3bc674b
fix SimpleImputer get_feature_names
amueller Nov 20, 2018
1c4a78f
use hasattr(transform) to check whether to use final estimator in get…
amueller Nov 20, 2018
7881930
add some docstrings
amueller Nov 20, 2018
de63353
fix docstring
amueller Nov 27, 2018
8835f3b
Merge branch 'master' into pipeline_get_feature_names
amueller Feb 27, 2019
2eba5de
fix merge issues with master
amueller May 30, 2019
449ed23
fix merge issue
amueller May 31, 2019
a1fcf67
Merge branch 'master' into pipeline_get_feature_names
amueller May 21, 2020
b929341
don't do magic slicing in pipeline.get_feature_names
amueller May 21, 2020
2b613e5
fix merge issue
amueller May 21, 2020
ad66b86
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn…
amueller May 24, 2020
5eb7603
trying to merge with input feature pr
amueller Jun 2, 2020
f4f832a
Merge branch 'master' into pipeline_get_feature_names
amueller Jun 2, 2020
3a9054c
remove tests taht don't apply
amueller Jun 2, 2020
9c4420d
Merge branch 'pipeline_get_feature_names' of github.com:amueller/scik…
amueller Jun 2, 2020
76f5b54
fix onetoone mixing feature names
amueller Jun 2, 2020
52f38e1
remove more tests
amueller Jun 2, 2020
cdda1fb
fix test for better expected outputs
amueller Jun 2, 2020
5f4abbc
fix priorities in catch-all get_feature_names
amueller Jun 2, 2020
4305a28
flake8
amueller Jun 2, 2020
c387b5b
remove redundant code
amueller Jun 2, 2020
2fefb67
fix error message
amueller Jun 2, 2020
a6832c3
fix mixin order
amueller Jun 2, 2020
0f45b22
small refactor with helper function
amueller Jun 2, 2020
4717a73
linting for new options
amueller Jun 3, 2020
a658ba7
add feature names to lineardiscriminantanalysis and birch
amueller Jun 3, 2020
e9e45af
add get_feature_names in a couple more places
amueller Jun 3, 2020
5acaced
fix up docs
amueller Jun 3, 2020
0353f69
make example actually work
amueller Jun 3, 2020
17a5016
Merge remote-tracking branch 'upstream/master' into pr/12627
thomasjpfan Sep 22, 2020
bb07886
ENH Converts to get_output_names
thomasjpfan Sep 23, 2020
4e0968c
CLN Move deprecations
thomasjpfan Sep 23, 2020
95046a0
WIP Deprecates dictvect get_feature_names
thomasjpfan Sep 23, 2020
f7aa3fd
WIP Deprecates text get_feature_names
thomasjpfan Sep 23, 2020
f4a9882
WIP Deprecates polyfeature.get_feature_names
thomasjpfan Sep 23, 2020
fa4b318
WIP Deprecates one hot encoder get_feature_names
thomasjpfan Sep 23, 2020
640ad76
ENH Adds get_output_names to all transformers
thomasjpfan Sep 23, 2020
d9d2d95
ENH Add get_output_names everywhere
thomasjpfan Sep 23, 2020
f6075ca
STY Lint fixes
thomasjpfan Sep 23, 2020
922748f
Merge remote-tracking branch 'upstream/master' into get_output_names
thomasjpfan Sep 23, 2020
1af211c
TST Adds test for missing indicator
thomasjpfan Sep 23, 2020
9ab0cf9
REV Revert changes
thomasjpfan Sep 23, 2020
2926492
TST Fixes
thomasjpfan Sep 23, 2020
c1a1778
BUG Fixes missing indicator
thomasjpfan Sep 23, 2020
37101b0
TST Fixes test
thomasjpfan Sep 23, 2020
82d0a60
Merge remote-tracking branch 'upstream/master' into get_output_names
thomasjpfan Sep 28, 2020
8833b5b
Merge remote-tracking branch 'upstream/master' into get_output_names
thomasjpfan Sep 28, 2020
adcc1c1
TST Adds test filtering
thomasjpfan Sep 30, 2020
9a07816
CLN Change to get_feature_names_out
thomasjpfan Sep 30, 2020
0d3bc4e
CLN Reduces the number of diffs
thomasjpfan Sep 30, 2020
b922fa4
Merge remote-tracking branch 'upstream/master' into get_output_names
thomasjpfan Sep 30, 2020
21cbfe6
CLN Reduces the number of diffs
thomasjpfan Sep 30, 2020
86887ae
CLN Less diffs
thomasjpfan Sep 30, 2020
8ecb38f
CLN Refactor into _get_feature_names_out
thomasjpfan Sep 30, 2020
8b3c856
STY Lint fixes
thomasjpfan Sep 30, 2020
5260d7d
Merge remote-tracking branch 'upstream/master' into get_output_names
thomasjpfan Oct 1, 2020
cf1ec1e
CLN Remove example since get_names is not implemented everywhere
thomasjpfan Oct 1, 2020
a63cd14
ENH Adds feature_selection for the example
thomasjpfan Oct 1, 2020
dddb4a8
TST Fixes KBins
thomasjpfan Oct 2, 2020
a87866b
Merge remote-tracking branch 'upstream/master' into get_output_names
thomasjpfan Oct 5, 2020
6f35c0c
DOC Update glossary
thomasjpfan Oct 5, 2020
526db41
Merge remote-tracking branch 'upstream/main' into get_output_names
thomasjpfan Jun 30, 2021
f7c0062
STY Runs black
thomasjpfan Jun 30, 2021
9722c08
CLN Adjust diff
thomasjpfan Jun 30, 2021
ba3aca2
CLN Stricter capturing
thomasjpfan Jun 30, 2021
c78967a
DOC Adds whats new
thomasjpfan Jun 30, 2021
f022a1b
TST Fixes errosr
thomasjpfan Jun 30, 2021
f10da10
CLN Address comments
thomasjpfan Jul 1, 2021
d8bafb3
TST Increases test coverage
thomasjpfan Jul 1, 2021
8751296
DOC More docstrings
thomasjpfan Jul 1, 2021
be3f0b1
Merge remote-tracking branch 'upstream/main' into get_output_names
thomasjpfan Jul 9, 2021
84dc208
TST Fixes error message
thomasjpfan Jul 9, 2021
f41a40e
Merge remote-tracking branch 'upstream/main' into get_output_names
thomasjpfan Aug 17, 2021
149c4e3
CLN Improves test
thomasjpfan Aug 18, 2021
41d0bb1
Merge remote-tracking branch 'upstream/main' into get_output_names
thomasjpfan Aug 24, 2021
628a2b3
TST Fix exception type
thomasjpfan Aug 24, 2021
d178069
Merge main
ogrisel Aug 28, 2021
faae557
Fix remaining occurrence of _feature_names_in
ogrisel Aug 28, 2021
02a25be
cosmit
ogrisel Aug 28, 2021
20ecd70
Attempt to fix numpydoc failure
ogrisel Aug 28, 2021
c6bc0ce
DOC Use ndarray of string
thomasjpfan Aug 28, 2021
b76fd41
DOC Update doc to use string
thomasjpfan Aug 28, 2021
d60c4fa
DOC More docstring fixes
thomasjpfan Aug 28, 2021
4a00562
TST Adds failing test
thomasjpfan Aug 28, 2021
d5b72de
ENH Restrict to str and ndarrays
thomasjpfan Aug 28, 2021
a0b7446
ENH Convert ints to strs in dictvectorizer
thomasjpfan Aug 28, 2021
d8f84b3
ENH Uses feature_names_in_ in get_feature_names_out
thomasjpfan Aug 28, 2021
f1090df
TST Typo
thomasjpfan Aug 28, 2021
ae46466
TST Include transformers that define get_feature_names_out
thomasjpfan Aug 28, 2021
2e2bdd8
BUG Fixes test for all array outputs
thomasjpfan Aug 29, 2021
5575857
ENH Adds prefix_feature_names_out='when_colliding'
thomasjpfan Aug 29, 2021
3d1546b
CLN Cleaner code
thomasjpfan Aug 29, 2021
6e44a52
ENH Validates prefix_feature_names_out
thomasjpfan Aug 29, 2021
b07a3bc
ENH convert to ndarray for vectorizers
thomasjpfan Aug 29, 2021
fffabf0
ENH Less restrictive ndarray dtype
thomasjpfan Aug 29, 2021
ecec556
Merge remote-tracking branch 'upstream/main' into get_output_names
thomasjpfan Aug 31, 2021
5def4ce
ENH Adds prefix_feature_names_out as a bool
thomasjpfan Aug 31, 2021
1fda1c1
DOC Remove use of deprecated api
thomasjpfan Aug 31, 2021
9b8834b
DOC Update example with new api
thomasjpfan Aug 31, 2021
9034a53
ENH More consistent input_features checking
thomasjpfan Aug 31, 2021
9081ebd
WIP Better
thomasjpfan Aug 31, 2021
a4ce567
ENH Add prefix_features_names_out to make_column_transformer
thomasjpfan Aug 31, 2021
12a2052
ENH Use in one example
thomasjpfan Aug 31, 2021
aece402
REV Remove
thomasjpfan Aug 31, 2021
ea31c18
CLN Adjust name
thomasjpfan Aug 31, 2021
13d406b
DOC Adjust docstring
thomasjpfan Aug 31, 2021
d04ecec
CLN Remove unneeded code
thomasjpfan Aug 31, 2021
d3cc5b6
DOC Better docstring
thomasjpfan Aug 31, 2021
76be321
TST Fix
thomasjpfan Aug 31, 2021
caff15b
FIX test_docstring for deprecated get_feature_names
ogrisel Sep 1, 2021
dcd685f
Merge branch 'main' into get_output_names
ogrisel Sep 1, 2021
d930b1b
ENH Error when n_features_in_ is not defined
thomasjpfan Sep 1, 2021
b379cd3
DOC Update docstring
thomasjpfan Sep 1, 2021
83d12ec
CLN Address comments
thomasjpfan Sep 1, 2021
0841049
Merge remote-tracking branch 'upstream/main' into get_output_names
thomasjpfan Sep 2, 2021
8d0b3cf
Update sklearn/pipeline.py
lorentzenchr Sep 6, 2021
c35f7aa
Update doc/glossary.rst
lorentzenchr Sep 6, 2021
ec8b825
Update doc/glossary.rst
lorentzenchr Sep 6, 2021
560c0d0
ENH Adds one-to-one transformers
thomasjpfan Sep 6, 2021
043540b
Add one more test for one-to-one feature transformers with pandas
ogrisel Sep 7, 2021
9 changes: 9 additions & 0 deletions doc/glossary.rst
@@ -868,6 +868,7 @@ Class APIs and Estimator Types
* :term:`fit`
* :term:`transform`
* :term:`get_feature_names`
* :term:`get_output_names`

meta-estimator
meta-estimators
@@ -1236,6 +1237,14 @@ Methods
to the names of input columns from which output column names can
be generated. By default input features are named x0, x1, ....

``get_output_names``
Primarily for :term:`feature extractors`, but also used for other
transformers to provide string names for each column in the output of
the estimator's :term:`transform` method. It outputs a list of
strings and may take a list of strings as input, corresponding
to the names of input columns from which output column names can
be generated. By default input features are named x0, x1, ....

``get_n_splits``
On a :term:`CV splitter` (not an estimator), returns the number of
elements one would get if iterating through the return value of
37 changes: 29 additions & 8 deletions doc/modules/compose.rst
@@ -139,6 +139,27 @@ or by name::
>>> pipe['reduce_dim']
PCA()

To enable model inspection, `Pipeline` has a ``get_output_names()`` method,
just like all transformers. You can use pipeline slicing to get the feature names
going into each step::

>>> from sklearn.datasets import load_iris
>>> from sklearn.feature_selection import SelectKBest
>>> iris = load_iris()
>>> pipe = Pipeline(steps=[
... ('select', SelectKBest(k=2)),
... ('clf', LogisticRegression())])
>>> pipe.fit(iris.data, iris.target)
Pipeline(steps=[('select', SelectKBest(...)), ('clf', LogisticRegression(...))])
>>> pipe[:-1].get_output_names()
array(['x2', 'x3'], dtype='<U2')

You can also provide custom feature names for a more human-readable format using
``get_output_names``::

>>> pipe[:-1].get_output_names(iris.feature_names)
array(['petal length (cm)', 'petal width (cm)'], dtype='<U17')

.. topic:: Examples:

* :ref:`sphx_glr_auto_examples_feature_selection_plot_feature_selection_pipeline.py`
@@ -426,21 +447,21 @@ By default, the remaining rating columns are ignored (``remainder='drop'``)::
>>> from sklearn.feature_extraction.text import CountVectorizer
>>> from sklearn.preprocessing import OneHotEncoder
>>> column_trans = ColumnTransformer(
... [('city_category', OneHotEncoder(dtype='int'),['city']),
... [('categories', OneHotEncoder(dtype='int'),['city']),
Member:

nitpick:

Suggested change
... [('categories', OneHotEncoder(dtype='int'),['city']),
... [('categories', OneHotEncoder(dtype='int'), ['city']),

Also I think I would like "categorical" best instead of "categories". But not strong opinion.

Member:

Or could we have an option in ColumnTransformer to not prefix the output feature names when there are no colliding names?

I think 99% of the time those prefixes would add unnecessary verbosity. The default could be to only use the prefix for colliding feature names with an option

The extra parameter could be feature_name_prefix taking values in {"only_when_colliding", "always"} and "only_when_colliding" would be the default.

WDYT?

Member (author):

I agree having no prefixes would work for most use cases and would be okay with the extra parameter.

We would need to update SLEP007 about prefixes in ColumnTransformer.

Member:

It says

ColumnTransformer by default adds a prefix to the output feature names, indicating the name of the transformer applied to them.

So we could add it without changing the SLEP, but if we want when_colliding as the default, then I guess we have to change the SLEP. I think we discussed this at some point. @adrinjalali @jnothman might remember?

Member:

Honestly I don't mind the way it is right now, it's such a huge improvement. Are you concerned that changing this in the future will be an incompatible change? Feature names are probably tricky to change, but we could do a deprecation cycle for when_colliding

Member:

I would rather do the change now if we all agree it is a usability (readability) improvement. Having to deal with backward compat to change that later would be a mess.

... ('title_bow', CountVectorizer(), 'title')],
... remainder='drop')

>>> column_trans.fit(X)
ColumnTransformer(transformers=[('city_category', OneHotEncoder(dtype='int'),
ColumnTransformer(transformers=[('categories', OneHotEncoder(dtype='int'),
['city']),
('title_bow', CountVectorizer(), 'title')])

>>> column_trans.get_feature_names()
['city_category__x0_London', 'city_category__x0_Paris', 'city_category__x0_Sallisaw',
'title_bow__bow', 'title_bow__feast', 'title_bow__grapes', 'title_bow__his',
'title_bow__how', 'title_bow__last', 'title_bow__learned', 'title_bow__moveable',
'title_bow__of', 'title_bow__the', 'title_bow__trick', 'title_bow__watson',
'title_bow__wrath']
>>> column_trans.get_output_names()
['categories__city_London', 'categories__city_Paris',
'categories__city_Sallisaw', 'title_bow__bow', 'title_bow__feast',
'title_bow__grapes', 'title_bow__his', 'title_bow__how', 'title_bow__last',
'title_bow__learned', 'title_bow__moveable', 'title_bow__of', 'title_bow__the',
'title_bow__trick', 'title_bow__watson', 'title_bow__wrath']
Member:

I changed a line above for a cosmit and as a result it hides an important discussion that was happening here. Sorry for that.

The discussion was about: shall we add a new constructor parameter to skip the verbose categories__ prefix in this doctest when there is no risk of colliding feature names? It could be named:

  • prefix_feature_names="when_colliding" (default)
  • prefix_feature_names="always"

If we do so, we need to amend the following paragraph of SLEP7:

ColumnTransformer by default adds a prefix to the output feature names, indicating the name of the transformer applied to them. If a column is in the output as a part of passthrough, it won't be prefixed since no operation has been applied on it.

I am in favor of making the change now to avoid problems with backward compat if we want to do this change later.

Member:

+1 on adding it now, I'm not entirely sure about the default +0.5 on that, I guess? In particular it means that adding transformers can change the names of existing features. The typical use-case where you wouldn't want prefixes is where you have separate feature types like categorical, continuous and bow, like here?
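The two prefixing policies debated above can be sketched in a few lines. This is a hypothetical illustration of the proposed behavior, not the final scikit-learn API: the function name `combine_feature_names` and the `prefix` values `"always"` / `"when_colliding"` are placeholders taken from the discussion.

```python
def combine_feature_names(per_transformer, prefix="always"):
    """Combine output names from several (transformer_name, feature_names) pairs.

    prefix="always" prefixes every name with "<transformer>__";
    prefix="when_colliding" prefixes only names emitted by more than
    one transformer, as suggested in the review thread.
    """
    if prefix == "always":
        return [f"{t}__{f}" for t, feats in per_transformer for f in feats]
    # Count how many transformers emit each raw feature name.
    counts = {}
    for _, feats in per_transformer:
        for f in feats:
            counts[f] = counts.get(f, 0) + 1
    return [
        f"{t}__{f}" if counts[f] > 1 else f
        for t, feats in per_transformer
        for f in feats
    ]


parts = [("categories", ["city_London", "city_Paris"]),
         ("title_bow", ["bow", "feast"])]
print(combine_feature_names(parts, prefix="always"))
# -> ['categories__city_London', 'categories__city_Paris',
#     'title_bow__bow', 'title_bow__feast']
print(combine_feature_names(parts, prefix="when_colliding"))
# -> ['city_London', 'city_Paris', 'bow', 'feast']
```

With `when_colliding`, adding a second transformer that outputs an already-used name (e.g. two transformers both producing `"age"`) would flip those names to the prefixed form, which is exactly the stability concern raised in the comment above.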


>>> column_trans.transform(X).toarray()
array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
18 changes: 9 additions & 9 deletions doc/modules/feature_extraction.rst
@@ -53,7 +53,7 @@ is a traditional numerical feature::
[ 0., 1., 0., 12.],
[ 0., 0., 1., 18.]])

>>> vec.get_feature_names()
>>> vec.get_output_names()
['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']

:class:`DictVectorizer` accepts multiple string values for one
@@ -69,7 +69,7 @@ and its year of release.
array([[0.000e+00, 1.000e+00, 0.000e+00, 1.000e+00, 2.003e+03],
[1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 2.011e+03],
[0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.974e+03]])
>>> vec.get_feature_names() == ['category=animation', 'category=drama',
>>> vec.get_output_names() == ['category=animation', 'category=drama',
... 'category=family', 'category=thriller',
... 'year']
True
@@ -111,7 +111,7 @@ suitable for feeding into a classifier (maybe after being piped into a
with 6 stored elements in Compressed Sparse ... format>
>>> pos_vectorized.toarray()
array([[1., 1., 1., 1., 1., 1.]])
>>> vec.get_feature_names()
>>> vec.get_output_names()
['pos+1=PP', 'pos-1=NN', 'pos-2=DT', 'word+1=on', 'word-1=cat', 'word-2=the']

As you can imagine, if one extracts such a context around each individual
@@ -340,7 +340,7 @@ Each term found by the analyzer during the fit is assigned a unique
integer index corresponding to a column in the resulting matrix. This
interpretation of the columns can be retrieved as follows::

>>> vectorizer.get_feature_names() == (
>>> vectorizer.get_output_names() == (
... ['and', 'document', 'first', 'is', 'one',
... 'second', 'the', 'third', 'this'])
True
@@ -406,8 +406,8 @@ however, similar words are useful for prediction, such as in classifying
writing style or personality.

There are several known issues in our provided 'english' stop word list. It
does not aim to be a general, 'one-size-fits-all' solution as some tasks
may require a more custom solution. See [NQY18]_ for more details.
does not aim to be a general, 'one-size-fits-all' solution as some tasks
may require a more custom solution. See [NQY18]_ for more details.

Please take care in choosing a stop word list.
Popular stop word lists may include words that are highly informative to
@@ -742,7 +742,7 @@ decide better::

>>> ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2))
>>> counts = ngram_vectorizer.fit_transform(['words', 'wprds'])
>>> ngram_vectorizer.get_feature_names() == (
>>> ngram_vectorizer.get_output_names() == (
... [' w', 'ds', 'or', 'pr', 'rd', 's ', 'wo', 'wp'])
True
>>> counts.toarray().astype(int)
@@ -758,15 +758,15 @@ span across words::
>>> ngram_vectorizer.fit_transform(['jumpy fox'])
<1x4 sparse matrix of type '<... 'numpy.int64'>'
with 4 stored elements in Compressed Sparse ... format>
>>> ngram_vectorizer.get_feature_names() == (
>>> ngram_vectorizer.get_output_names() == (
... [' fox ', ' jump', 'jumpy', 'umpy '])
True

>>> ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(5, 5))
>>> ngram_vectorizer.fit_transform(['jumpy fox'])
<1x5 sparse matrix of type '<... 'numpy.int64'>'
with 5 stored elements in Compressed Sparse ... format>
>>> ngram_vectorizer.get_feature_names() == (
>>> ngram_vectorizer.get_output_names() == (
... ['jumpy', 'mpy f', 'py fo', 'umpy ', 'y fox'])
True

6 changes: 3 additions & 3 deletions examples/applications/plot_topics_extraction_with_nmf_lda.py
@@ -103,7 +103,7 @@ def plot_top_words(model, feature_names, n_top_words, title):
print("done in %0.3fs." % (time() - t0))


tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf_feature_names = tfidf_vectorizer.get_output_names()
plot_top_words(nmf, tfidf_feature_names, n_top_words,
'Topics in NMF model (Frobenius norm)')

@@ -117,7 +117,7 @@ def plot_top_words(model, feature_names, n_top_words, title):
l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf_feature_names = tfidf_vectorizer.get_output_names()
plot_top_words(nmf, tfidf_feature_names, n_top_words,
'Topics in NMF model (generalized Kullback-Leibler divergence)')

@@ -132,5 +132,5 @@ def plot_top_words(model, feature_names, n_top_words, title):
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

tf_feature_names = tf_vectorizer.get_feature_names()
tf_feature_names = tf_vectorizer.get_output_names()
plot_top_words(lda, tf_feature_names, n_top_words, 'Topics in LDA model')
2 changes: 1 addition & 1 deletion examples/bicluster/plot_bicluster_newsgroups.py
@@ -89,7 +89,7 @@ def build_tokenizer(self):
time() - start_time,
v_measure_score(y_kmeans, y_true)))

feature_names = vectorizer.get_feature_names()
feature_names = vectorizer.get_output_names()
document_names = list(newsgroups.target_names[i] for i in newsgroups.target)


44 changes: 44 additions & 0 deletions examples/compose/plot_column_transformer_mixed_types.py
@@ -147,6 +147,50 @@
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))


# %%
# Inspecting the coefficient values of the classifier
###############################################################################
# The coefficients of the final classification step of the pipeline give an
# idea of how each feature impacts the likelihood of survival assuming that the
# usual linear model assumptions hold (uncorrelated features, linear
# separability, homoscedastic errors...) which we do not verify in this
# example.
#
# To get error bars we perform cross-validation and compute the mean and
# standard deviation for each coefficient across CV splits. Because we use a
# standard scaler on the numerical features, the coefficient weights give us
# an idea on how much the log odds of surviving are impacted by a change in
# this dimension contrasted to the mean. Note that the categorical features
# here are overspecified which makes it slightly harder to interpret because of
# the information redundancy.
#
# We can see that the linear model coefficients are in agreement with the
# historical reports: people in higher classes and therefore in the upper decks
# were the first to reach the lifeboats, and often, priority was given to women
# and children.
#
# Note that conditioned on the "pclass_x" one-hot features, the "fare"
# numerical feature does not seem to be significantly predictive. If we drop
# the "pclass" feature, then higher "fare" values would appear significantly
# correlated with a higher likelihood of survival as the "fare" and "pclass"
# features have a strong statistical dependency.

import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedShuffleSplit

cv = StratifiedShuffleSplit(n_splits=20, test_size=0.25, random_state=42)
cv_results = cross_validate(clf, X_train, y_train, cv=cv,
return_estimator=True)
cv_coefs = np.concatenate([cv_pipeline[-1].coef_
for cv_pipeline in cv_results["estimator"]])
fig, ax = plt.subplots()
ax.barh(clf[:-1].get_output_names(),
cv_coefs.mean(axis=0), xerr=cv_coefs.std(axis=0))
plt.tight_layout()
plt.show()

# %%
# The resulting score is not exactly the same as the one from the previous
# pipeline because the dtype-based selector treats the ``pclass`` columns as
9 changes: 6 additions & 3 deletions examples/feature_selection/plot_feature_selection_pipeline.py
@@ -9,6 +9,7 @@
Using a sub-pipeline, the fitted coefficients can be mapped back into
the original feature space.
"""
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
@@ -20,7 +20,7 @@

# import some data to play with
X, y = make_classification(
n_features=20, n_informative=3, n_redundant=0, n_classes=4,
n_features=20, n_informative=3, n_redundant=0, n_classes=2,
n_clusters_per_class=2)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
@@ -36,5 +37,7 @@
y_pred = anova_svm.predict(X_test)
print(classification_report(y_test, y_pred))

coef = anova_svm[:-1].inverse_transform(anova_svm['linearsvc'].coef_)
print(coef)
# access and plot the coefficients of the fitted model
plt.barh((0, 1, 2), anova_svm[-1].coef_.ravel())
plt.yticks((0, 1, 2), anova_svm[:-1].get_output_names())
plt.show()
@@ -208,7 +208,7 @@

feature_names = (model.named_steps['columntransformer']
.named_transformers_['onehotencoder']
.get_feature_names(input_features=categorical_columns))
.get_output_names(input_features=categorical_columns))
feature_names = np.concatenate(
[feature_names, numerical_columns])

2 changes: 1 addition & 1 deletion examples/inspection/plot_permutation_importance.py
@@ -124,7 +124,7 @@
ohe = (rf.named_steps['preprocess']
.named_transformers_['cat']
.named_steps['onehot'])
feature_names = ohe.get_feature_names(input_features=categorical_columns)
feature_names = ohe.get_output_names(input_features=categorical_columns)
feature_names = np.r_[feature_names, numerical_columns]

tree_feature_importances = (
2 changes: 1 addition & 1 deletion examples/text/plot_document_classification_20newsgroups.py
@@ -174,7 +174,7 @@ def size_mb(docs):
if opts.use_hashing:
feature_names = None
else:
feature_names = vectorizer.get_feature_names()
feature_names = vectorizer.get_output_names()

if opts.select_chi2:
print("Extracting %d best features by a chi-squared test" %
2 changes: 1 addition & 1 deletion examples/text/plot_document_clustering.py
@@ -217,7 +217,7 @@ def is_interactive():
else:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
terms = vectorizer.get_output_names()
for i in range(true_k):
print("Cluster %d:" % i, end='')
for ind in order_centroids[i, :10]:
2 changes: 1 addition & 1 deletion examples/text/plot_hashing_vs_dict_vectorizer.py
@@ -89,7 +89,7 @@ def token_freqs(doc):
vectorizer.fit_transform(token_freqs(d) for d in raw_data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
print("Found %d unique terms" % len(vectorizer.get_feature_names()))
print("Found %d unique terms" % len(vectorizer.get_output_names()))
print()

print("FeatureHasher on frequency dicts")
29 changes: 29 additions & 0 deletions sklearn/base.py
@@ -17,6 +17,7 @@
from .utils import _IS_32BIT
from .utils.validation import check_X_y
from .utils.validation import check_array
from .utils._feature_names import _make_feature_names
from .utils._estimator_html_repr import estimator_html_repr
from .utils.validation import _deprecate_positional_args

@@ -747,6 +748,34 @@ def fit_predict(self, X, y=None):
return self.fit(X).predict(X)


class OneToOneMixin:
"""Provides get_feature_names for simple transformers

Assumes there's a 1-to-1 correspondence between input features
and output features.
"""

def get_output_names(self, input_features=None):
"""Get output feature names for transformation.

Returns input_features as this transformation
doesn't add or drop features.

Parameters
----------
input_features : array-like of str or None, default=None
Input features. If None, they are generated as
x0, x1, ..., xn_features.
Member:

Suggested change
x0, x1, ..., xn_features.
`[x0, x1, ..., xn_features]`.

Member:

Either always with or always without those ticks.


Returns
-------
feature_names : array-like of str
Transformed feature names.
"""
return _make_feature_names(self.n_features_in_,
input_features=input_features)
Member (author):

This is an array-like because when input_features is not None then this returns input_features without any processing.
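The `_make_feature_names` helper called above is private and its exact signature is not shown in this diff; the following is a minimal sketch of plausible behavior, inferred from its call sites (the `OneToOneMixin` and agglomerative-clustering hunks) and from the documented `x0, x1, ...` default. All names here are illustrative assumptions.

```python
def make_feature_names(n_features, input_features=None, prefix="x"):
    """Sketch of a feature-name helper.

    If input_features is given, it is returned untouched (hence the
    array-like return type discussed above); otherwise names are
    generated as "<prefix>0" ... "<prefix>{n_features - 1}".
    """
    if input_features is not None:
        return input_features  # passed through without any processing
    return [f"{prefix}{i}" for i in range(n_features)]


print(make_feature_names(3))                     # -> ['x0', 'x1', 'x2']
print(make_feature_names(2, ["age", "fare"]))    # -> ['age', 'fare']
```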



class MetaEstimatorMixin:
_required_parameters = ["estimator"]
"""Mixin class for all meta estimators in scikit-learn."""
17 changes: 17 additions & 0 deletions sklearn/cluster/_agglomerative.py
@@ -20,6 +20,7 @@
from ..neighbors._dist_metrics import METRIC_MAPPING
from ..utils import check_array
from ..utils._fast_dict import IntFloatDict
from ..utils._feature_names import _make_feature_names
from ..utils.fixes import _astype_copy_false
from ..utils.validation import _deprecate_positional_args, check_memory
# mypy error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast'
@@ -945,6 +946,22 @@ def fit_predict(self, X, y=None):
"""
return super().fit_predict(X, y)

def get_output_names(self, input_features=None):
"""Get output feature names.

Parameters
----------
input_features : array-like of str or None, default=None
Not used, present here for API consistency by convention.

Returns
-------
output_feature_names : list of str
Feature names for transformer output.
"""
return _make_feature_names(n_features=self.n_clusters,
prefix=type(self).__name__.lower())


class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
"""Agglomerate features.
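The clustering hunk above passes `prefix=type(self).__name__.lower()` to the helper, so output columns presumably come out as `<classname>0 ... <classname>{n_clusters-1}` (assuming the helper follows the same `x0, x1, ...` pattern as its default). A self-contained stand-in class, hypothetical and independent of scikit-learn, makes the scheme concrete:

```python
class FakeClusterer:
    """Stand-in mimicking the get_output_names method added above."""

    def __init__(self, n_clusters=3):
        self.n_clusters = n_clusters

    def get_output_names(self, input_features=None):
        # input_features is accepted only for API consistency; the
        # output names depend solely on the class name and n_clusters.
        prefix = type(self).__name__.lower()
        return [f"{prefix}{i}" for i in range(self.n_clusters)]


print(FakeClusterer(2).get_output_names())
# -> ['fakeclusterer0', 'fakeclusterer1']
```

Deriving the prefix from `type(self).__name__.lower()` means subclasses automatically get distinct output names without overriding the method.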