[WIP] Feature names with pandas or xarray data structures #16772

Closed
Changes from all commits (46 commits)
37bf69f
TST Check
thomasjpfan Mar 11, 2020
7599089
Merge remote-tracking branch 'upstream/master'
thomasjpfan Mar 12, 2020
53e0260
Merge remote-tracking branch 'upstream/master'
thomasjpfan Mar 12, 2020
60c84f5
Merge remote-tracking branch 'upstream/master'
thomasjpfan Mar 16, 2020
9940a7d
Merge remote-tracking branch 'upstream/master'
thomasjpfan Mar 18, 2020
156ec25
ENH Adds array_out
thomasjpfan Mar 26, 2020
cabb7c1
Merge remote-tracking branch 'upstream/master' into feature_names_in_…
thomasjpfan Mar 26, 2020
6435391
STY Flake8
thomasjpfan Mar 26, 2020
edebf84
REV
thomasjpfan Mar 26, 2020
496cf93
API crazy api changes lol
thomasjpfan Mar 26, 2020
ef30659
WIP More internal API changes
thomasjpfan Mar 26, 2020
2071253
BUG
thomasjpfan Mar 26, 2020
1c6b3d4
More streamline api (i hope)
thomasjpfan Mar 26, 2020
2ef6815
DOC Add comment
thomasjpfan Mar 26, 2020
95069e1
API More API thoughts
thomasjpfan Mar 26, 2020
e42333d
API Fix
thomasjpfan Mar 26, 2020
49a3c34
Merge remote-tracking branch 'upstream/master' into feature_names_in_…
thomasjpfan Apr 15, 2020
c8e8e0b
ENH Copy for ndarray
thomasjpfan Apr 21, 2020
2fd6300
ENH Better happening for sparse in xarray
thomasjpfan Jun 19, 2020
b23a2f3
Merge remote-tracking branch 'upstream/master' into feature_names_in_…
thomasjpfan Jun 19, 2020
7ff1639
Merge remote-tracking branch 'upstream/master' into feature_names_in_…
thomasjpfan Jun 20, 2020
b6fbc51
BUG Fix test
thomasjpfan Jun 20, 2020
7336cfe
Merge remote-tracking branch 'upstream/master' into feature_names_in_…
thomasjpfan Jun 24, 2020
319ce56
CLN Simplifies array wrapping and unwrapping
thomasjpfan Jun 24, 2020
71b13c8
CLN Rename custom class
thomasjpfan Jun 27, 2020
ffdd983
Bug Fix issues from renaming
thomasjpfan Jun 27, 2020
621e6f4
Merge remote-tracking branch 'upstream/master' into feature_names_in_…
thomasjpfan Jun 27, 2020
c106856
ENH Do not crash for array-like
thomasjpfan Jun 28, 2020
7c61307
ENH Everything is a duck
thomasjpfan Jun 29, 2020
f28190b
CLN Make sures the ducks quack
thomasjpfan Jun 29, 2020
c901ce8
Merge remote-tracking branch 'upstream/master' into feature_names_in_…
thomasjpfan Aug 26, 2020
6e57487
WIP: Improves interface for array_out
thomasjpfan Aug 26, 2020
7dc4338
Merge remote-tracking branch 'upstream/master' into feature_names_in_…
thomasjpfan Aug 26, 2020
6a5b42a
STY Linting
thomasjpfan Aug 26, 2020
d97d6e6
WIP Adds more tests
thomasjpfan Aug 30, 2020
f0946f0
WIP Enables array_out for all transformers
thomasjpfan Aug 30, 2020
34e74c3
Merge remote-tracking branch 'upstream/master' into feature_names_in_…
thomasjpfan Aug 30, 2020
cce6f42
ENH Adds get feature names out to imputers
thomasjpfan Aug 31, 2020
d434122
STY Lint fixes
thomasjpfan Aug 31, 2020
3661f5a
STY Lint fixes
thomasjpfan Aug 31, 2020
6d960a5
ENH Slightly better improvements
thomasjpfan Aug 31, 2020
8b269a8
ENH Major refactor to QuantileTransformer
thomasjpfan Aug 31, 2020
0348059
FIX Fixes get feature out names
thomasjpfan Aug 31, 2020
f70e7cd
ENH Adds feature names out for FeatureUnion
thomasjpfan Aug 31, 2020
55f6b4f
MNT Fixes functiontransformer
thomasjpfan Aug 31, 2020
4ee8f44
Merge remote-tracking branch 'upstream/master' into feature_names_in_…
thomasjpfan Sep 1, 2020
2 changes: 1 addition & 1 deletion build_tools/azure/install.sh
@@ -64,7 +64,7 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then
make_conda "python=$PYTHON_VERSION"
python -m pip install -U pip

python -m pip install pandas matplotlib pyamg scikit-image
python -m pip install pandas matplotlib pyamg scikit-image xarray sparse
# do not install dependencies for lightgbm since it requires scikit-learn
# and install a version less than 3.0.0 until the issue #18316 is solved.
python -m pip install "lightgbm<3.0.0" --no-deps
8 changes: 7 additions & 1 deletion sklearn/_config.py
@@ -8,6 +8,7 @@
'working_memory': int(os.environ.get('SKLEARN_WORKING_MEMORY', 1024)),
'print_changed_only': True,
'display': 'text',
'array_out': 'default',
}


@@ -28,7 +29,7 @@ def get_config():


def set_config(assume_finite=None, working_memory=None,
print_changed_only=None, display=None):
print_changed_only=None, display=None, array_out=None):
"""Set global scikit-learn configuration

.. versionadded:: 0.19
@@ -67,6 +68,9 @@ def set_config(assume_finite=None, working_memory=None,

.. versionadded:: 0.23

array_out : {'default', 'pandas', 'xarray'}, optional

[Review comment, Member]: should this be ndarray instead of default?

[Reply, Member Author]: Sometimes the output is sparse. The default means "sparse or ndarray".

Kind of array output for transformers

See Also
--------
config_context: Context manager for global scikit-learn configuration
@@ -80,6 +84,8 @@
_global_config['print_changed_only'] = print_changed_only
if display is not None:
_global_config['display'] = display
if array_out is not None:
_global_config['array_out'] = array_out


@contextmanager
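The `_config.py` hunk above adds an `array_out` entry to scikit-learn's global configuration. A minimal standalone sketch of the same pattern, trimmed down to only the `array_out` setting (an illustration of the mechanism, not the sklearn implementation; the real `set_config` also handles `assume_finite`, `working_memory`, and the other keys shown in the diff):

```python
import os
from contextlib import contextmanager

# Minimal sketch of the global-config pattern the diff extends in
# sklearn/_config.py; 'default' means "ndarray or sparse, unchanged".
_global_config = {
    'working_memory': int(os.environ.get('SKLEARN_WORKING_MEMORY', 1024)),
    'print_changed_only': True,
    'display': 'text',
    'array_out': 'default',
}


def get_config():
    """Return a copy of the current global configuration."""
    return _global_config.copy()


def set_config(array_out=None):
    """Set the global configuration; None leaves a setting unchanged."""
    if array_out is not None:
        if array_out not in ('default', 'pandas', 'xarray'):
            raise ValueError(f"Invalid array_out: {array_out!r}")
        _global_config['array_out'] = array_out


@contextmanager
def config_context(**new_config):
    """Temporarily override the configuration, restoring it on exit."""
    old_config = get_config()
    set_config(**new_config)
    try:
        yield
    finally:
        _global_config.update(old_config)
```

The context-manager form is what lets callers opt into wrapped output only within a scope, with the previous setting restored even if an exception is raised.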
101 changes: 100 additions & 1 deletion sklearn/base.py
@@ -6,6 +6,7 @@
import copy
import warnings
from collections import defaultdict
from functools import partial
import platform
import inspect
import re
@@ -19,6 +20,9 @@
from .utils.validation import check_array
from .utils._estimator_html_repr import estimator_html_repr
from .utils.validation import _deprecate_positional_args
from .utils._array_out import _get_feature_names
from .utils._array_out import _make_array_out


_DEFAULT_TAGS = {
'non_deterministic': False,
@@ -377,6 +381,33 @@ def _check_n_features(self, X, reset):
self.n_features_in_)
)

def _check_feature_names(self, X, reset=True):
"""Set the `feature_names_in_` attribute, or check against it.

Parameters
----------
X : array-like of shape (n_samples, n_features)
The input samples.
reset : bool, default=True
If True, the `feature_names_in_` attribute is set to the feature
names of `X`.
Else, the attribute must already exist and the function checks
that it is equal to the feature names of `X`.
"""
feature_names = _get_feature_names(X)
if reset:
self.feature_names_in_ = feature_names
return

if (not hasattr(self, 'feature_names_in_') or
self.feature_names_in_ is None or
feature_names is None):
return

if np.any(feature_names != self.feature_names_in_):
raise ValueError("The feature names of X do not match the "
"feature_names_in_ attribute")

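`_check_feature_names` records the input's column names at fit time and validates them on later calls. A self-contained sketch of that logic, where `_get_feature_names` is approximated by reading a pandas-style `columns` attribute (an assumption for illustration; the diff imports the actual helper from `sklearn.utils._array_out`):

```python
import numpy as np

# Approximation of the helper the diff imports: return the column names
# when X carries a pandas-style `columns` attribute, else None.
def _get_feature_names(X):
    columns = getattr(X, 'columns', None)
    if columns is not None:
        return np.asarray(columns, dtype=object)
    return None


class FeatureNamesMixin:
    """Sketch of the feature-name bookkeeping added to BaseEstimator."""

    def _check_feature_names(self, X, reset=True):
        feature_names = _get_feature_names(X)
        if reset:
            # fit path: remember the names seen during training
            self.feature_names_in_ = feature_names
            return
        # transform/predict path: only compare when both sides have names
        if (getattr(self, 'feature_names_in_', None) is None
                or feature_names is None):
            return
        if np.any(feature_names != self.feature_names_in_):
            raise ValueError("The feature names of X do not match the "
                             "feature_names_in_ attribute")
```

Plain ndarrays skip the check entirely, since they carry no names; only name-bearing containers such as DataFrames trigger the comparison.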
def _validate_data(self, X, y=None, reset=True,
validate_separately=False, **check_params):
"""Validate input data and set or check the `n_features_in_` attribute.
@@ -407,7 +438,7 @@ def _validate_data(self, X, y=None, reset=True,
out : {ndarray, sparse matrix} or tuple of these
The validated input. A tuple is returned if `y` is not None.
"""

self._check_feature_names(X, reset=reset)
if y is None:
if self._get_tags()['requires_y']:
raise ValueError(
@@ -462,6 +493,74 @@ def _repr_mimebundle_(self, **kwargs):
output["text/html"] = estimator_html_repr(self)
return output

def _make_array_out(self, X_out, X_orig, get_feature_names_out):
"""Construct array container based on global configuration.

Parameters
----------
X_out : {ndarray, sparse matrix} of shape (n_samples, n_features_out)
Output data to be wrapped.

X_orig : array-like of shape (n_samples, n_features)
Original input data. For pandas DataFrames, this is used to get
the index. For xarray DataArrays, this is used to get the dim names
and the coordinates of the first dim.

get_feature_names_out : callable or {'one_to_one', 'class_name'}
Called to get the output feature names. If 'one_to_one', the input
feature names are used as the output feature names. If 'class_name',
the lowercased class name is used as the prefix for the output
feature names.

Returns
-------
array_out : {ndarray, sparse matrix, dataframe, dataarray} of shape \
(n_samples, n_features_out)
Wrapped array with feature names.
"""
array_out = get_config()['array_out']
if array_out == 'default':
return X_out

# TODO This can be removed when all estimators use `_validate_data`
# in transform to check for feature names
self._check_feature_names(X_orig, reset=False)

if callable(get_feature_names_out):
get_feature_names_out_callable = get_feature_names_out
elif get_feature_names_out == 'one_to_one':
def get_feature_names_out_callable(names):
return names
else:
# get_feature_names_out == 'class_name'
class_name = self.__class__.__name__.lower()

def get_feature_names_out_callable():
return np.array([f"{class_name}{i}"
for i in range(X_out.shape[1])])

# The callable can take zero or one argument. With one argument,
# it receives the input feature names.
parameters = (inspect.signature(get_feature_names_out_callable)
.parameters)
if parameters:
if hasattr(self, "feature_names_in_"):
feature_names_in = self.feature_names_in_
else:
# If there is no feature_names_in_ attribute, use the
# feature names from the input.
feature_names_in = _get_feature_names(X_orig)

# If there are still no feature names at this point, generate
# feature names for the input features
if feature_names_in is None:
feature_names_in = np.array(
[f'X{i}' for i in range(self.n_features_in_)])
get_feature_names_out_callable = partial(
get_feature_names_out_callable, feature_names_in)

return _make_array_out(X_out, X_orig, get_feature_names_out_callable)


class ClassifierMixin:
"""Mixin class for all classifiers in scikit-learn."""
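When `array_out='pandas'`, `_make_array_out` ultimately wraps the ndarray output with feature names and carries over the input's index. A hypothetical minimal version of that wrapping for the 'class_name' naming convention (`make_pandas_out` is an illustrative name, not the sklearn helper, which also supports the xarray and sparse paths):

```python
import numpy as np
import pandas as pd

# Sketch of the pandas wrapping step: generate 'class_name'-style
# feature names (lowercased class name plus a column counter) and
# reuse the input's index when it has one.
def make_pandas_out(X_out, X_orig, class_name):
    feature_names = [f"{class_name.lower()}{i}"
                     for i in range(X_out.shape[1])]
    index = getattr(X_orig, 'index', None)
    return pd.DataFrame(X_out, columns=feature_names, index=index)
```

Under this convention, a `Birch` transform producing three subcluster distances would come back as a DataFrame with columns `birch0`, `birch1`, `birch2`.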
2 changes: 2 additions & 0 deletions sklearn/cluster/_agglomerative.py
@@ -1100,8 +1100,10 @@ def fit(self, X, y=None, **params):
# save n_features_in_ attribute here to reset it after, because it will
# be overridden in AgglomerativeClustering since we passed it X.T.
n_features_in_ = self.n_features_in_
feature_names_in_ = self.feature_names_in_
AgglomerativeClustering.fit(self, X.T, **params)
self.n_features_in_ = n_features_in_
self.feature_names_in_ = feature_names_in_
return self

@property
3 changes: 2 additions & 1 deletion sklearn/cluster/_birch.py
@@ -614,7 +614,8 @@ def transform(self, X):
Transformed data.
"""
check_is_fitted(self)
return euclidean_distances(X, self.subcluster_centers_)
out = euclidean_distances(X, self.subcluster_centers_)
return self._make_array_out(out, X, 'class_name')

def _global_clustering(self, X=None):
"""
3 changes: 2 additions & 1 deletion sklearn/cluster/_feature_agglomeration.py
@@ -37,6 +37,7 @@ def transform(self, X):
The pooled values for each feature cluster.
"""
check_is_fitted(self)
X_orig = X

X = check_array(X)
if len(self.labels_) != X.shape[1]:
@@ -52,7 +53,7 @@
nX = [self.pooling_func(X[:, self.labels_ == l], axis=1)
for l in np.unique(self.labels_)]
nX = np.array(nX).T
return nX
return self._make_array_out(nX, X_orig, 'class_name')

def inverse_transform(self, Xred):
"""
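The `transform` being patched here pools input columns by cluster label (`nX = [self.pooling_func(X[:, self.labels_ == l], axis=1) ...]`). A standalone sketch of that pooling step, with `labels` and `pooling_func` standing in for the estimator's `labels_` attribute and `pooling_func` parameter:

```python
import numpy as np

# Each output column pools the input columns that share a cluster label,
# so n_features_out equals the number of distinct labels.
def pool_features(X, labels, pooling_func=np.mean):
    return np.array([pooling_func(X[:, labels == label], axis=1)
                     for label in np.unique(labels)]).T
```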
7 changes: 4 additions & 3 deletions sklearn/cluster/_kmeans.py
@@ -1073,7 +1073,8 @@ def fit_transform(self, X, y=None, sample_weight=None):
# np.array or CSR format already.
# XXX This skips _check_test_data, which may change the dtype;
# we should refactor the input validation.
return self.fit(X, sample_weight=sample_weight)._transform(X)
out = self.fit(X, sample_weight=sample_weight)._transform(X)
return self._make_array_out(out, X, 'class_name')

def transform(self, X):
"""Transform X to a cluster-distance space.
@@ -1093,9 +1094,9 @@ def transform(self, X):
X transformed in the new space.
"""
check_is_fitted(self)

X_orig = X
X = self._check_test_data(X)
return self._transform(X)
return self._make_array_out(self._transform(X), X_orig, 'class_name')

def _transform(self, X):
"""guts of transform method; no input validation"""
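`KMeans.transform` maps `X` into cluster-distance space before the patch wraps the result with `_make_array_out`. A minimal numpy sketch of that distance computation, equivalent in result to `sklearn.metrics.euclidean_distances(X, centers)`:

```python
import numpy as np

# Each output column is the Euclidean distance from a sample to one
# cluster center, so the wrapped output has n_clusters feature names.
def cluster_distances(X, centers):
    diff = X[:, np.newaxis, :] - centers[np.newaxis, :, :]
    return np.sqrt((diff ** 2).sum(axis=-1))
```

With the 'class_name' convention the wrapped columns would be named `kmeans0`, `kmeans1`, and so on, one per cluster.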