[WIP] Feature names with pandas or xarray data structures #16772

Closed
46 commits (changes shown from 27 commits):
37bf69f  TST Check (thomasjpfan, Mar 11, 2020)
7599089  Merge remote-tracking branch 'upstream/master' (thomasjpfan, Mar 12, 2020)
53e0260  Merge remote-tracking branch 'upstream/master' (thomasjpfan, Mar 12, 2020)
60c84f5  Merge remote-tracking branch 'upstream/master' (thomasjpfan, Mar 16, 2020)
9940a7d  Merge remote-tracking branch 'upstream/master' (thomasjpfan, Mar 18, 2020)
156ec25  ENH Adds array_out (thomasjpfan, Mar 26, 2020)
cabb7c1  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Mar 26, 2020)
6435391  STY Flake8 (thomasjpfan, Mar 26, 2020)
edebf84  REV (thomasjpfan, Mar 26, 2020)
496cf93  API crazy api changes lol (thomasjpfan, Mar 26, 2020)
ef30659  WIP More internal API changes (thomasjpfan, Mar 26, 2020)
2071253  BUG (thomasjpfan, Mar 26, 2020)
1c6b3d4  More streamline api (i hope) (thomasjpfan, Mar 26, 2020)
2ef6815  DOC Add comment (thomasjpfan, Mar 26, 2020)
95069e1  API More API thoughts (thomasjpfan, Mar 26, 2020)
e42333d  API Fix (thomasjpfan, Mar 26, 2020)
49a3c34  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Apr 15, 2020)
c8e8e0b  ENH Copy for ndarray (thomasjpfan, Apr 21, 2020)
2fd6300  ENH Better happening for sparse in xarray (thomasjpfan, Jun 19, 2020)
b23a2f3  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Jun 19, 2020)
7ff1639  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Jun 20, 2020)
b6fbc51  BUG Fix test (thomasjpfan, Jun 20, 2020)
7336cfe  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Jun 24, 2020)
319ce56  CLN Simplifies array wrapping and unwrapping (thomasjpfan, Jun 24, 2020)
71b13c8  CLN Rename custom class (thomasjpfan, Jun 27, 2020)
ffdd983  Bug Fix issues from renaming (thomasjpfan, Jun 27, 2020)
621e6f4  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Jun 27, 2020)
c106856  ENH Do not crash for array-like (thomasjpfan, Jun 28, 2020)
7c61307  ENH Everything is a duck (thomasjpfan, Jun 29, 2020)
f28190b  CLN Make sures the ducks quack (thomasjpfan, Jun 29, 2020)
c901ce8  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Aug 26, 2020)
6e57487  WIP: Improves interface for array_out (thomasjpfan, Aug 26, 2020)
7dc4338  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Aug 26, 2020)
6a5b42a  STY Linting (thomasjpfan, Aug 26, 2020)
d97d6e6  WIP Adds more tests (thomasjpfan, Aug 30, 2020)
f0946f0  WIP Enables array_out for all transformers (thomasjpfan, Aug 30, 2020)
34e74c3  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Aug 30, 2020)
cce6f42  ENH Adds get feature names out to imputers (thomasjpfan, Aug 31, 2020)
d434122  STY Lint fixes (thomasjpfan, Aug 31, 2020)
3661f5a  STY Lint fixes (thomasjpfan, Aug 31, 2020)
6d960a5  ENH Slightly better improvements (thomasjpfan, Aug 31, 2020)
8b269a8  ENH Major refactor to QuantileTransformer (thomasjpfan, Aug 31, 2020)
0348059  FIX Fixes get feature out names (thomasjpfan, Aug 31, 2020)
f70e7cd  ENH Adds feature names out for FeatureUnion (thomasjpfan, Aug 31, 2020)
55f6b4f  MNT Fixes functiontransformer (thomasjpfan, Aug 31, 2020)
4ee8f44  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Sep 1, 2020)
8 changes: 7 additions & 1 deletion sklearn/_config.py
@@ -8,6 +8,7 @@
'working_memory': int(os.environ.get('SKLEARN_WORKING_MEMORY', 1024)),
'print_changed_only': True,
'display': 'text',
'array_out': 'default',
}


@@ -28,7 +29,7 @@ def get_config():


def set_config(assume_finite=None, working_memory=None,
-               print_changed_only=None, display=None):
+               print_changed_only=None, display=None, array_out=None):
"""Set global scikit-learn configuration

.. versionadded:: 0.19
@@ -67,6 +68,9 @@ def set_config(assume_finite=None, working_memory=None,

.. versionadded:: 0.23

array_out : {'default', 'pandas', 'xarray'}, optional
        Kind of array output for transformers

[Review comment, Member] should this be ndarray instead of default?

[Reply, Member Author] Sometimes the output is sparse. The default means "sparse or ndarray".

See Also
--------
config_context: Context manager for global scikit-learn configuration
@@ -80,6 +84,8 @@ def set_config(assume_finite=None, working_memory=None,
_global_config['print_changed_only'] = print_changed_only
if display is not None:
_global_config['display'] = display
if array_out is not None:
_global_config['array_out'] = array_out


@contextmanager
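Note: on this branch the new key can be set globally via `set_config` or scoped via `config_context`, which forwards its keyword arguments to `set_config`. A minimal usage sketch, assuming this branch is installed ('array_out' does not exist in released scikit-learn):

```python
import pandas as pd
import sklearn
from sklearn import config_context
from sklearn.preprocessing import StandardScaler

X = pd.DataFrame({"a": [0.0, 1.0, 2.0], "b": [1.0, 2.0, 3.0]})

# Global: transformers now return pandas DataFrames.
# 'default' keeps the usual behaviour: ndarray or sparse matrix.
sklearn.set_config(array_out="pandas")

# Scoped: the previous value is restored when the block exits.
with config_context(array_out="default"):
    X_np = StandardScaler().fit_transform(X)  # plain ndarray here

X_df = StandardScaler().fit_transform(X)  # expected: DataFrame with columns ['a', 'b']
```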
70 changes: 56 additions & 14 deletions sklearn/compose/_column_transformer.py
@@ -25,6 +25,8 @@
from ..utils.metaestimators import _BaseComposition
from ..utils.validation import check_array, check_is_fitted
from ..utils.validation import _deprecate_positional_args
from .._config import get_config
from ..utils._array_transformer import _ManyArrayTransformer


__all__ = [
@@ -462,7 +464,8 @@ def _fit_transform(self, X, y, func, fitted=False):
y=y,
weight=weight,
message_clsname='ColumnTransformer',
-                message=self._log_message(name, idx, len(transformers)))
+                message=self._log_message(name, idx, len(transformers)),
+                config=get_config())
for idx, (name, trans, column, weight) in enumerate(
self._iter(fitted=fitted, replace_strings=True), 1))
except ValueError as e:
@@ -537,20 +540,50 @@ def fit_transform(self, X, y=None):

Xs, transformers = zip(*result)

+        wrapper = _ManyArrayTransformer(Xs)
-        # determine if concatenated output will be sparse or not
-        if any(sparse.issparse(X) for X in Xs):
-            nnz = sum(X.nnz if sparse.issparse(X) else X.size for X in Xs)
-            total = sum(X.shape[0] * X.shape[1] if sparse.issparse(X)
-                        else X.size for X in Xs)
-            density = nnz / total
-            self.sparse_output_ = density < self.sparse_threshold
-        else:
-            self.sparse_output_ = False
+        self._check_sparse_output(Xs)

        self._update_fitted_transformers(transformers)
        self._validate_output(Xs)

-        return self._hstack(list(Xs))
+        return wrapper.transform(self._hstack(list(Xs)))

def _check_sparse_output(self, Xs):
def _get_Xtype(X):
# pandas sparse dataframe
if hasattr(X, "iloc") and hasattr(X, "sparse"):
return 'pd'
# xarray sparse
if hasattr(X, 'data') and hasattr(X.data, 'to_scipy_sparse'):
return 'xr'
if sparse.issparse(X):
return 'sp'
return 'dense'

Xs_types = [(X, _get_Xtype(X)) for X in Xs]

# all dense
if all(X_type == 'dense' for _, X_type in Xs_types):
self.sparse_output_ = False
return

nnz = 0.0
total = 0.0
for X, X_type in Xs_types:
if X_type == 'pd':
nnz += X.sparse.density * X.size
total += X.size
elif X_type == 'sp':
nnz += X.nnz
total += np.prod(X.shape)
elif X_type == 'xr':
nnz += X.data.nnz
total += X.data.size
else:
nnz += X.size
total += X.size
density = nnz / total
self.sparse_output_ = density < self.sparse_threshold

def transform(self, X):
"""Transform X separately by each transformer, concatenate results.
@@ -608,7 +641,8 @@ def transform(self, X):
# All transformers are None
return np.zeros((X.shape[0], 0))

-        return self._hstack(list(Xs))
+        wrapper = _ManyArrayTransformer(Xs)
+        return wrapper.transform(self._hstack(list(Xs)))

def _hstack(self, Xs):
"""Stacks Xs horizontally.
@@ -635,8 +669,16 @@

return sparse.hstack(converted_Xs).tocsr()
else:
-            Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs]
-            return np.hstack(Xs)
+            output = []
+            for X in Xs:
+                # xarray sparse
+                if hasattr(X, 'coords') and hasattr(X.data, "todense"):
+                    output.append(X.data.todense())
+                elif sparse.issparse(X):
+                    output.append(X.toarray())
+                else:
+                    output.append(X)
+            return np.hstack(output)

def _sk_visual_block_(self):
names, transformers, name_details = zip(*self.transformers)
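Note: `_check_sparse_output` generalizes the old inline density test to pandas-sparse and xarray-backed blocks while keeping the threshold semantics. A standalone check of the arithmetic for the plain scipy/NumPy case:

```python
import numpy as np
from scipy import sparse

# One sparse block (5% filled) next to one dense block.
Xs = [sparse.random(100, 20, density=0.05, format="csr"),
      np.ones((100, 4))]

nnz = sum(X.nnz if sparse.issparse(X) else X.size for X in Xs)  # 100 + 400
total = sum(np.prod(X.shape) for X in Xs)                       # 2000 + 400
density = nnz / total                                           # ~0.21

# With the default sparse_threshold=0.3: 0.21 < 0.3, so sparse_output_ = True
# and _hstack returns a sparse matrix.
print(density < 0.3)
```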
7 changes: 6 additions & 1 deletion sklearn/decomposition/_base.py
@@ -14,6 +14,7 @@
from ..base import BaseEstimator, TransformerMixin
from ..utils import check_array
from ..utils.validation import check_is_fitted
from ..utils._array_transformer import _ArrayTransformer
from abc import ABCMeta, abstractmethod


@@ -123,14 +124,18 @@ def transform(self, X):
>>> ipca.transform(X) # doctest: +SKIP
"""
check_is_fitted(self)
wrapper = _ArrayTransformer(X, needs_feature_names_in=False)

X = check_array(X)
if self.mean_ is not None:
X = X - self.mean_
X_transformed = np.dot(X, self.components_.T)
if self.whiten:
X_transformed /= np.sqrt(self.explained_variance_)
-        return X_transformed
+
+        def get_feature_names_out():
+            return [f'pca{i}' for i in range(X_transformed.shape[1])]
+        return wrapper.transform(X_transformed, get_feature_names_out)

def inverse_transform(self, X):
"""Transform data back to its original space.
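Note: the `_ArrayTransformer` / `_ManyArrayTransformer` helpers live in `sklearn/utils/_array_transformer.py`, which this diff does not show. Inferred purely from the call sites in this PR, a hypothetical minimal sketch of the single-array wrapper's contract; every detail here is an assumption, not the branch's actual code:

```python
import inspect

import numpy as np
from scipy import sparse
from sklearn._config import get_config


class _ArrayTransformer:
    """Hypothetical sketch: remember input feature names, wrap the output."""

    def __init__(self, X, needs_feature_names_in=True):
        self.feature_names_in_ = None
        if needs_feature_names_in and hasattr(X, "columns"):  # pandas input
            self.feature_names_in_ = np.asarray(X.columns, dtype=object)

    def transform(self, X_out, get_feature_names_out=None):
        # .get() so this sketch also runs on releases without 'array_out'.
        array_out = get_config().get("array_out", "default")
        if array_out == "default":
            return X_out  # unchanged: ndarray or sparse matrix

        # No callback (e.g. TfidfTransformer) means names pass through.
        names = self.feature_names_in_
        if get_feature_names_out is not None:
            # Call sites use both zero-arg closures (PCA) and one-arg
            # callbacks taking feature_names_in (SelectorMixin, imputers).
            if inspect.signature(get_feature_names_out).parameters:
                names = get_feature_names_out(self.feature_names_in_)
            else:
                names = get_feature_names_out()

        if array_out == "pandas":
            import pandas as pd
            if sparse.issparse(X_out):
                return pd.DataFrame.sparse.from_spmatrix(X_out, columns=names)
            return pd.DataFrame(X_out, columns=names)
        raise ValueError(f"array_out={array_out!r} not handled in this sketch")
```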
7 changes: 6 additions & 1 deletion sklearn/decomposition/_pca.py
@@ -25,6 +25,7 @@
from ..utils.extmath import fast_logdet, randomized_svd, svd_flip
from ..utils.extmath import stable_cumsum
from ..utils.validation import check_is_fitted
from ..utils._array_transformer import _ArrayTransformer
from ..utils.validation import _deprecate_positional_args


@@ -373,6 +374,7 @@ def fit_transform(self, X, y=None):
This method returns a Fortran-ordered array. To convert it to a
C-ordered array, use 'np.ascontiguousarray'.
"""
wrapper = _ArrayTransformer(X, needs_feature_names_in=False)
U, S, Vt = self._fit(X)
U = U[:, :self.n_components_]

@@ -383,7 +385,10 @@
# X_new = X * V = U * S * Vt * V = U * S
U *= S[:self.n_components_]

-        return U
+        def get_feature_names_out():
+            return [f'pca{i}' for i in range(U.shape[1])]
+
+        return wrapper.transform(U, get_feature_names_out)

def _fit(self, X):
"""Dispatch to the right submethod depending on the chosen solver."""
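Note: combined with the config flag, the intended round trip implied by the `get_feature_names_out` closures above would be (hedged, branch-only behaviour):

```python
import numpy as np
from sklearn import config_context
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(20, 5)

with config_context(array_out="pandas"):
    X_new = PCA(n_components=2).fit_transform(X)

# Expected on this branch: a DataFrame whose columns come from the closure
# above, i.e. ['pca0', 'pca1'].
print(list(X_new.columns))
```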
21 changes: 17 additions & 4 deletions sklearn/feature_extraction/text.py
@@ -31,6 +31,7 @@
from ._stop_words import ENGLISH_STOP_WORDS
from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES
from ..utils import _IS_32BIT
from ..utils._array_transformer import _ArrayTransformer
from ..utils.fixes import _astype_copy_false
from ..exceptions import NotFittedError
from ..utils.validation import _deprecate_positional_args
@@ -1185,12 +1186,14 @@ def fit_transform(self, raw_documents, y=None):

self._validate_params()
self._validate_vocabulary()

max_df = self.max_df
min_df = self.min_df
max_features = self.max_features

vocabulary, X = self._count_vocab(raw_documents,
self.fixed_vocabulary_)
wrapper = _ArrayTransformer(X, needs_feature_names_in=False)

if self.binary:
X.data.fill(1)
@@ -1215,7 +1218,10 @@

self.vocabulary_ = vocabulary

-        return X
+        def get_output_feature_names():
+            return self.get_feature_names()
+
+        return wrapper.transform(X, get_output_feature_names)

def transform(self, raw_documents):
"""Transform documents to document-term matrix.
@@ -1238,12 +1244,18 @@
"Iterable over raw text documents expected, "
"string object received.")
self._check_vocabulary()

# use the same matrix-building strategy as fit_transform
_, X = self._count_vocab(raw_documents, fixed_vocab=True)

wrapper = _ArrayTransformer(X, needs_feature_names_in=False)

if self.binary:
X.data.fill(1)
-        return X
+
+        def get_output_feature_names():
+            return self.get_feature_names()
+
+        return wrapper.transform(X, get_output_feature_names)

def inverse_transform(self, X):
"""Return terms per document with nonzero entries in X.
@@ -1462,6 +1474,7 @@ def transform(self, X, copy=True):
-------
vectors : sparse matrix of shape (n_samples, n_features)
"""
wrapper = _ArrayTransformer(X)
X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, copy=copy)
if not sp.issparse(X):
X = sp.csr_matrix(X, dtype=np.float64)
Expand Down Expand Up @@ -1490,7 +1503,7 @@ def transform(self, X, copy=True):
if self.norm:
X = normalize(X, norm=self.norm, copy=False)

-        return X
+        return wrapper.transform(X)

@property
def idf_(self):
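Note: for the vectorizers the output names are simply `get_feature_names()`, i.e. the sorted vocabulary. A sketch of the expected behaviour on this branch; presumably the sparse document-term matrix comes back as a sparse-backed DataFrame, given the sparse handling elsewhere in this PR:

```python
from sklearn import config_context
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat", "the dog sat"]

with config_context(array_out="pandas"):
    X = CountVectorizer().fit_transform(docs)

# Expected on this branch: columns are the vocabulary,
# ['cat', 'dog', 'sat', 'the'].
print(list(X.columns))
```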
11 changes: 10 additions & 1 deletion sklearn/feature_selection/_base.py
@@ -15,6 +15,7 @@
from ..utils import check_array
from ..utils import safe_mask
from ..utils import safe_sqr
from ..utils._array_transformer import _ArrayTransformer


class SelectorMixin(TransformerMixin, metaclass=ABCMeta):
@@ -74,6 +75,7 @@ def transform(self, X):
X_r : array of shape [n_samples, n_selected_features]
The input samples with only the selected features.
"""
wrapper = _ArrayTransformer(X)
tags = self._get_tags()
X = check_array(X, dtype=None, accept_sparse='csr',
force_all_finite=not tags.get('allow_nan', True))
@@ -85,7 +87,14 @@
return np.empty(0).reshape((X.shape[0], 0))
if len(mask) != X.shape[1]:
raise ValueError("X has a different shape than during fitting.")
-        return X[:, safe_mask(X, mask)]
+
+        _safe_mask = safe_mask(X, mask)
+
+        def get_feature_names_out(feature_names_in):
+            return feature_names_in[_safe_mask]
+
+        out = X[:, _safe_mask]
+        return wrapper.transform(out, get_feature_names_out)

def inverse_transform(self, X):
"""
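Note: because `get_feature_names_out` closes over the support mask, selectors can propagate exactly the input names they keep. A hedged sketch of the expected behaviour:

```python
import pandas as pd
from sklearn import config_context
from sklearn.feature_selection import SelectKBest, f_classif

X = pd.DataFrame({"age":    [20, 30, 40, 50],
                  "height": [1.6, 1.7, 1.8, 1.9],
                  "noise":  [0.3, 0.1, 0.4, 0.2]})
y = [0, 0, 1, 1]

with config_context(array_out="pandas"):
    X_sel = SelectKBest(f_classif, k=2).fit_transform(X, y)

# Expected on this branch: the retained columns keep their input names,
# e.g. ['age', 'height'].
print(list(X_sel.columns))
```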
31 changes: 28 additions & 3 deletions sklearn/impute/_base.py
@@ -17,6 +17,7 @@
from ..utils.validation import _deprecate_positional_args
from ..utils._mask import _get_mask
from ..utils import is_scalar_nan
from ..utils._array_transformer import _ArrayTransformer


def _check_inputs_dtype(X, missing_values):
@@ -418,6 +419,7 @@ def transform(self, X):
X : {array-like, sparse matrix}, shape (n_samples, n_features)
The input data to complete.
"""
wrapper = _ArrayTransformer(X)
check_is_fitted(self)

X = self._validate_input(X, in_fit=False)
Expand All @@ -432,6 +434,7 @@ def transform(self, X):
# Delete the invalid columns if strategy is not constant
if self.strategy == "constant":
valid_statistics = statistics
valid_mask = slice(None)
else:
# same as np.isnan but also works for object dtypes
invalid_mask = _get_mask(statistics, np.nan)
@@ -467,7 +470,17 @@
coordinates = np.where(mask.transpose())[::-1]
X[coordinates] = values

-        return super()._concatenate_indicator(X, X_indicator)
+        def get_feature_names_out(feature_names_in):
+            imputed_names = feature_names_in[valid_mask]
+            if self.indicator_ is None:
+                return imputed_names
+
+            indicator_names = self.indicator_._get_feature_names_out(
+                feature_names_in)
+            return np.r_[imputed_names, indicator_names]
+
+        out = super()._concatenate_indicator(X, X_indicator)
+        return wrapper.transform(out, get_feature_names_out)

def inverse_transform(self, X):
"""Convert the data back to the original representation.
@@ -751,6 +764,7 @@ def transform(self, X):
will be boolean.

"""
wrapper = _ArrayTransformer(X)
check_is_fitted(self)
X = self._validate_input(X, in_fit=False)

@@ -770,7 +784,7 @@
if self.features_.size < self._n_features:
imputer_mask = imputer_mask[:, self.features_]

-        return imputer_mask
+        return wrapper.transform(imputer_mask, self._get_feature_names_out)

def fit_transform(self, X, y=None):
"""Generate missing values indicator for X.
@@ -788,12 +802,23 @@
will be boolean.

"""
wrapper = _ArrayTransformer(X)
imputer_mask = self._fit(X, y)

if self.features_.size < self._n_features:
imputer_mask = imputer_mask[:, self.features_]

-        return imputer_mask
+        return wrapper.transform(imputer_mask, self._get_feature_names_out)

def _get_feature_names_out(self, feature_names_in):
if feature_names_in is None:
return None
if self.features_.size < self._n_features:
feature_names_in = feature_names_in[self.features_]

feature_names_in = np.array([f'mask_{name}'
for name in feature_names_in])
return feature_names_in

def _more_tags(self):
return {'allow_nan': True,
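Note: end to end, the imputer changes mean `add_indicator=True` yields the imputed columns followed by `mask_*` indicator columns, per `_get_feature_names_out` above. A hedged sketch:

```python
import numpy as np
import pandas as pd
from sklearn import config_context
from sklearn.impute import SimpleImputer

X = pd.DataFrame({"a": [1.0, np.nan, 3.0],
                  "b": [np.nan, 2.0, 3.0]})

with config_context(array_out="pandas"):
    X_imp = SimpleImputer(add_indicator=True).fit_transform(X)

# Expected on this branch (np.r_[imputed_names, indicator_names]):
# ['a', 'b', 'mask_a', 'mask_b']
print(list(X_imp.columns))
```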