[WIP] Feature names with pandas or xarray data structures #16772

Closed
46 commits (changes shown from 27 commits):
37bf69f  TST Check (thomasjpfan, Mar 11, 2020)
7599089  Merge remote-tracking branch 'upstream/master' (thomasjpfan, Mar 12, 2020)
53e0260  Merge remote-tracking branch 'upstream/master' (thomasjpfan, Mar 12, 2020)
60c84f5  Merge remote-tracking branch 'upstream/master' (thomasjpfan, Mar 16, 2020)
9940a7d  Merge remote-tracking branch 'upstream/master' (thomasjpfan, Mar 18, 2020)
156ec25  ENH Adds array_out (thomasjpfan, Mar 26, 2020)
cabb7c1  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Mar 26, 2020)
6435391  STY Flake8 (thomasjpfan, Mar 26, 2020)
edebf84  REV (thomasjpfan, Mar 26, 2020)
496cf93  API crazy api changes lol (thomasjpfan, Mar 26, 2020)
ef30659  WIP More internal API changes (thomasjpfan, Mar 26, 2020)
2071253  BUG (thomasjpfan, Mar 26, 2020)
1c6b3d4  More streamline api (i hope) (thomasjpfan, Mar 26, 2020)
2ef6815  DOC Add comment (thomasjpfan, Mar 26, 2020)
95069e1  API More API thoughts (thomasjpfan, Mar 26, 2020)
e42333d  API Fix (thomasjpfan, Mar 26, 2020)
49a3c34  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Apr 15, 2020)
c8e8e0b  ENH Copy for ndarray (thomasjpfan, Apr 21, 2020)
2fd6300  ENH Better happening for sparse in xarray (thomasjpfan, Jun 19, 2020)
b23a2f3  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Jun 19, 2020)
7ff1639  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Jun 20, 2020)
b6fbc51  BUG Fix test (thomasjpfan, Jun 20, 2020)
7336cfe  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Jun 24, 2020)
319ce56  CLN Simplifies array wrapping and unwrapping (thomasjpfan, Jun 24, 2020)
71b13c8  CLN Rename custom class (thomasjpfan, Jun 27, 2020)
ffdd983  Bug Fix issues from renaming (thomasjpfan, Jun 27, 2020)
621e6f4  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Jun 27, 2020)
c106856  ENH Do not crash for array-like (thomasjpfan, Jun 28, 2020)
7c61307  ENH Everything is a duck (thomasjpfan, Jun 29, 2020)
f28190b  CLN Make sures the ducks quack (thomasjpfan, Jun 29, 2020)
c901ce8  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Aug 26, 2020)
6e57487  WIP: Improves interface for array_out (thomasjpfan, Aug 26, 2020)
7dc4338  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Aug 26, 2020)
6a5b42a  STY Linting (thomasjpfan, Aug 26, 2020)
d97d6e6  WIP Adds more tests (thomasjpfan, Aug 30, 2020)
f0946f0  WIP Enables array_out for all transformers (thomasjpfan, Aug 30, 2020)
34e74c3  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Aug 30, 2020)
cce6f42  ENH Adds get feature names out to imputers (thomasjpfan, Aug 31, 2020)
d434122  STY Lint fixes (thomasjpfan, Aug 31, 2020)
3661f5a  STY Lint fixes (thomasjpfan, Aug 31, 2020)
6d960a5  ENH Slightly better improvements (thomasjpfan, Aug 31, 2020)
8b269a8  ENH Major refactor to QuantileTransformer (thomasjpfan, Aug 31, 2020)
0348059  FIX Fixes get feature out names (thomasjpfan, Aug 31, 2020)
f70e7cd  ENH Adds feature names out for FeatureUnion (thomasjpfan, Aug 31, 2020)
55f6b4f  MNT Fixes functiontransformer (thomasjpfan, Aug 31, 2020)
4ee8f44  Merge remote-tracking branch 'upstream/master' into feature_names_in_… (thomasjpfan, Sep 1, 2020)
8 changes: 7 additions & 1 deletion sklearn/_config.py
@@ -8,6 +8,7 @@
'working_memory': int(os.environ.get('SKLEARN_WORKING_MEMORY', 1024)),
'print_changed_only': True,
'display': 'text',
'array_out': 'default',
}


@@ -28,7 +29,7 @@ def get_config():


def set_config(assume_finite=None, working_memory=None,
-               print_changed_only=None, display=None):
+               print_changed_only=None, display=None, array_out=None):
"""Set global scikit-learn configuration

.. versionadded:: 0.19
@@ -67,6 +68,9 @@ def set_config(assume_finite=None, working_memory=None,

.. versionadded:: 0.23

array_out : {'default', 'pandas', 'xarray'}, optional
        Kind of array output for transformers

[Review comment, Member] should this be ndarray instead of default?

[Reply, Member Author] Sometimes the output is sparse. The default means "sparse or ndarray".

See Also
--------
config_context: Context manager for global scikit-learn configuration
@@ -80,6 +84,8 @@ def set_config(assume_finite=None, working_memory=None,
_global_config['print_changed_only'] = print_changed_only
if display is not None:
_global_config['display'] = display
if array_out is not None:
_global_config['array_out'] = array_out


@contextmanager
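Note: on this branch the new key can be set globally via `set_config` or scoped via `config_context`, which forwards its keyword arguments to `set_config`. A minimal usage sketch, assuming this branch is installed ('array_out' does not exist in released scikit-learn):

```python
import pandas as pd
import sklearn
from sklearn import config_context
from sklearn.preprocessing import StandardScaler

X = pd.DataFrame({"a": [0.0, 1.0, 2.0], "b": [1.0, 2.0, 3.0]})

# Global: transformers now return pandas DataFrames.
# 'default' keeps the usual behaviour: ndarray or sparse matrix.
sklearn.set_config(array_out="pandas")

# Scoped: the previous value is restored when the block exits.
with config_context(array_out="default"):
    X_np = StandardScaler().fit_transform(X)  # plain ndarray here

X_df = StandardScaler().fit_transform(X)  # expected: DataFrame with columns ['a', 'b']
```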
70 changes: 56 additions & 14 deletions sklearn/compose/_column_transformer.py
@@ -25,6 +25,8 @@
from ..utils.metaestimators import _BaseComposition
from ..utils.validation import check_array, check_is_fitted
from ..utils.validation import _deprecate_positional_args
from .._config import get_config
from ..utils._array_transformer import _ManyArrayTransformer


__all__ = [
@@ -462,7 +464,8 @@ def _fit_transform(self, X, y, func, fitted=False):
y=y,
weight=weight,
message_clsname='ColumnTransformer',
-                message=self._log_message(name, idx, len(transformers)))
+                message=self._log_message(name, idx, len(transformers)),
+                config=get_config())
for idx, (name, trans, column, weight) in enumerate(
self._iter(fitted=fitted, replace_strings=True), 1))
except ValueError as e:
@@ -537,20 +540,50 @@ def fit_transform(self, X, y=None):

Xs, transformers = zip(*result)

+        wrapper = _ManyArrayTransformer(Xs)
-        # determine if concatenated output will be sparse or not
-        if any(sparse.issparse(X) for X in Xs):
-            nnz = sum(X.nnz if sparse.issparse(X) else X.size for X in Xs)
-            total = sum(X.shape[0] * X.shape[1] if sparse.issparse(X)
-                        else X.size for X in Xs)
-            density = nnz / total
-            self.sparse_output_ = density < self.sparse_threshold
-        else:
-            self.sparse_output_ = False
+        self._check_sparse_output(Xs)

        self._update_fitted_transformers(transformers)
        self._validate_output(Xs)

-        return self._hstack(list(Xs))
+        return wrapper.transform(self._hstack(list(Xs)))

def _check_sparse_output(self, Xs):
def _get_Xtype(X):
# pandas sparse dataframe
if hasattr(X, "iloc") and hasattr(X, "sparse"):
return 'pd'
# xarray sparse
if hasattr(X, 'data') and hasattr(X.data, 'to_scipy_sparse'):
return 'xr'
if sparse.issparse(X):
return 'sp'
return 'dense'

Xs_types = [(X, _get_Xtype(X)) for X in Xs]

# all dense
if all(X_type == 'dense' for _, X_type in Xs_types):
self.sparse_output_ = False
return

nnz = 0.0
total = 0.0
for X, X_type in Xs_types:
if X_type == 'pd':
nnz += X.sparse.density * X.size
total += X.size
elif X_type == 'sp':
nnz += X.nnz
total += np.prod(X.shape)
elif X_type == 'xr':
nnz += X.data.nnz
total += X.data.size
else:
nnz += X.size
total += X.size
density = nnz / total
self.sparse_output_ = density < self.sparse_threshold

def transform(self, X):
"""Transform X separately by each transformer, concatenate results.
@@ -608,7 +641,8 @@ def transform(self, X):
# All transformers are None
return np.zeros((X.shape[0], 0))

-        return self._hstack(list(Xs))
+        wrapper = _ManyArrayTransformer(Xs)
+        return wrapper.transform(self._hstack(list(Xs)))

def _hstack(self, Xs):
"""Stacks Xs horizontally.
@@ -635,8 +669,16 @@

return sparse.hstack(converted_Xs).tocsr()
else:
-            Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs]
-            return np.hstack(Xs)
+            output = []
+            for X in Xs:
+                # xarray sparse
+                if hasattr(X, 'coords') and hasattr(X.data, "todense"):
+                    output.append(X.data.todense())
+                elif sparse.issparse(X):
+                    output.append(X.toarray())
+                else:
+                    output.append(X)
+            return np.hstack(output)

def _sk_visual_block_(self):
names, transformers, name_details = zip(*self.transformers)
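Note: `_check_sparse_output` generalizes the old inline density test to pandas-sparse and xarray-backed blocks while keeping the threshold semantics. A standalone check of the arithmetic for the plain scipy/NumPy case:

```python
import numpy as np
from scipy import sparse

# One sparse block (5% filled) next to one dense block.
Xs = [sparse.random(100, 20, density=0.05, format="csr"),
      np.ones((100, 4))]

nnz = sum(X.nnz if sparse.issparse(X) else X.size for X in Xs)  # 100 + 400
total = sum(np.prod(X.shape) for X in Xs)                       # 2000 + 400
density = nnz / total                                           # ~0.21

# With the default sparse_threshold=0.3: 0.21 < 0.3, so sparse_output_ = True
# and _hstack returns a sparse matrix.
print(density < 0.3)
```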
7 changes: 6 additions & 1 deletion sklearn/decomposition/_base.py
@@ -14,6 +14,7 @@
from ..base import BaseEstimator, TransformerMixin
from ..utils import check_array
from ..utils.validation import check_is_fitted
from ..utils._array_transformer import _ArrayTransformer
from abc import ABCMeta, abstractmethod


@@ -123,14 +124,18 @@ def transform(self, X):
>>> ipca.transform(X) # doctest: +SKIP
"""
check_is_fitted(self)
wrapper = _ArrayTransformer(X, needs_feature_names_in=False)

X = check_array(X)
if self.mean_ is not None:
X = X - self.mean_
X_transformed = np.dot(X, self.components_.T)
if self.whiten:
X_transformed /= np.sqrt(self.explained_variance_)
-        return X_transformed
+
+        def get_feature_names_out():
+            return [f'pca{i}' for i in range(X_transformed.shape[1])]
+        return wrapper.transform(X_transformed, get_feature_names_out)

def inverse_transform(self, X):
"""Transform data back to its original space.
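Note: the `_ArrayTransformer` / `_ManyArrayTransformer` helpers live in `sklearn/utils/_array_transformer.py`, which this diff does not show. Inferred purely from the call sites in this PR, a hypothetical minimal sketch of the single-array wrapper's contract; every detail here is an assumption, not the branch's actual code:

```python
import inspect

import numpy as np
from scipy import sparse
from sklearn._config import get_config


class _ArrayTransformer:
    """Hypothetical sketch: remember input feature names, wrap the output."""

    def __init__(self, X, needs_feature_names_in=True):
        self.feature_names_in_ = None
        if needs_feature_names_in and hasattr(X, "columns"):  # pandas input
            self.feature_names_in_ = np.asarray(X.columns, dtype=object)

    def transform(self, X_out, get_feature_names_out=None):
        # .get() so this sketch also runs on releases without 'array_out'.
        array_out = get_config().get("array_out", "default")
        if array_out == "default":
            return X_out  # unchanged: ndarray or sparse matrix

        # No callback (e.g. TfidfTransformer) means names pass through.
        names = self.feature_names_in_
        if get_feature_names_out is not None:
            # Call sites use both zero-arg closures (PCA) and one-arg
            # callbacks taking feature_names_in (SelectorMixin, imputers).
            if inspect.signature(get_feature_names_out).parameters:
                names = get_feature_names_out(self.feature_names_in_)
            else:
                names = get_feature_names_out()

        if array_out == "pandas":
            import pandas as pd
            if sparse.issparse(X_out):
                return pd.DataFrame.sparse.from_spmatrix(X_out, columns=names)
            return pd.DataFrame(X_out, columns=names)
        raise ValueError(f"array_out={array_out!r} not handled in this sketch")
```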
7 changes: 6 additions & 1 deletion sklearn/decomposition/_pca.py
@@ -25,6 +25,7 @@
from ..utils.extmath import fast_logdet, randomized_svd, svd_flip
from ..utils.extmath import stable_cumsum
from ..utils.validation import check_is_fitted
from ..utils._array_transformer import _ArrayTransformer
from ..utils.validation import _deprecate_positional_args


@@ -373,6 +374,7 @@ def fit_transform(self, X, y=None):
This method returns a Fortran-ordered array. To convert it to a
C-ordered array, use 'np.ascontiguousarray'.
"""
wrapper = _ArrayTransformer(X, needs_feature_names_in=False)
U, S, Vt = self._fit(X)
U = U[:, :self.n_components_]

@@ -383,7 +385,10 @@
# X_new = X * V = U * S * Vt * V = U * S
U *= S[:self.n_components_]

-        return U
+        def get_feature_names_out():
+            return [f'pca{i}' for i in range(U.shape[1])]
+
+        return wrapper.transform(U, get_feature_names_out)

def _fit(self, X):
"""Dispatch to the right submethod depending on the chosen solver."""
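Note: combined with the config flag, the intended round trip implied by the `get_feature_names_out` closures above would be (hedged, branch-only behaviour):

```python
import numpy as np
from sklearn import config_context
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(20, 5)

with config_context(array_out="pandas"):
    X_new = PCA(n_components=2).fit_transform(X)

# Expected on this branch: a DataFrame whose columns come from the closure
# above, i.e. ['pca0', 'pca1'].
print(list(X_new.columns))
```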
21 changes: 17 additions & 4 deletions sklearn/feature_extraction/text.py
@@ -31,6 +31,7 @@
from ._stop_words import ENGLISH_STOP_WORDS
from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES
from ..utils import _IS_32BIT
from ..utils._array_transformer import _ArrayTransformer
from ..utils.fixes import _astype_copy_false
from ..exceptions import NotFittedError
from ..utils.validation import _deprecate_positional_args
@@ -1185,12 +1186,14 @@ def fit_transform(self, raw_documents, y=None):

self._validate_params()
self._validate_vocabulary()

max_df = self.max_df
min_df = self.min_df
max_features = self.max_features

vocabulary, X = self._count_vocab(raw_documents,
self.fixed_vocabulary_)
wrapper = _ArrayTransformer(X, needs_feature_names_in=False)

if self.binary:
X.data.fill(1)
@@ -1215,7 +1218,10 @@

self.vocabulary_ = vocabulary

-        return X
+        def get_output_feature_names():
+            return self.get_feature_names()
+
+        return wrapper.transform(X, get_output_feature_names)

def transform(self, raw_documents):
"""Transform documents to document-term matrix.
@@ -1238,12 +1244,18 @@
"Iterable over raw text documents expected, "
"string object received.")
self._check_vocabulary()

# use the same matrix-building strategy as fit_transform
_, X = self._count_vocab(raw_documents, fixed_vocab=True)

wrapper = _ArrayTransformer(X, needs_feature_names_in=False)

if self.binary:
X.data.fill(1)
-        return X
+
+        def get_output_feature_names():
+            return self.get_feature_names()
+
+        return wrapper.transform(X, get_output_feature_names)

def inverse_transform(self, X):
"""Return terms per document with nonzero entries in X.
@@ -1462,6 +1474,7 @@ def transform(self, X, copy=True):
-------
vectors : sparse matrix of shape (n_samples, n_features)
"""
wrapper = _ArrayTransformer(X)
X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, copy=copy)
if not sp.issparse(X):
X = sp.csr_matrix(X, dtype=np.float64)
Expand Down Expand Up @@ -1490,7 +1503,7 @@ def transform(self, X, copy=True):
if self.norm:
X = normalize(X, norm=self.norm, copy=False)

-        return X
+        return wrapper.transform(X)

@property
def idf_(self):
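Note: for the vectorizers the output names are simply `get_feature_names()`, i.e. the sorted vocabulary. A sketch of the expected behaviour on this branch; presumably the sparse document-term matrix comes back as a sparse-backed DataFrame, given the sparse handling elsewhere in this PR:

```python
from sklearn import config_context
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat", "the dog sat"]

with config_context(array_out="pandas"):
    X = CountVectorizer().fit_transform(docs)

# Expected on this branch: columns are the vocabulary,
# ['cat', 'dog', 'sat', 'the'].
print(list(X.columns))
```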
11 changes: 10 additions & 1 deletion sklearn/feature_selection/_base.py
@@ -15,6 +15,7 @@
from ..utils import check_array
from ..utils import safe_mask
from ..utils import safe_sqr
from ..utils._array_transformer import _ArrayTransformer


class SelectorMixin(TransformerMixin, metaclass=ABCMeta):
@@ -74,6 +75,7 @@ def transform(self, X):
X_r : array of shape [n_samples, n_selected_features]
The input samples with only the selected features.
"""
wrapper = _ArrayTransformer(X)
tags = self._get_tags()
X = check_array(X, dtype=None, accept_sparse='csr',
force_all_finite=not tags.get('allow_nan', True))
@@ -85,7 +87,14 @@
return np.empty(0).reshape((X.shape[0], 0))
if len(mask) != X.shape[1]:
raise ValueError("X has a different shape than during fitting.")
-        return X[:, safe_mask(X, mask)]
+
+        _safe_mask = safe_mask(X, mask)
+
+        def get_feature_names_out(feature_names_in):
+            return feature_names_in[_safe_mask]
+
+        out = X[:, _safe_mask]
+        return wrapper.transform(out, get_feature_names_out)

def inverse_transform(self, X):
"""
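Note: because `get_feature_names_out` closes over the support mask, selectors can propagate exactly the input names they keep. A hedged sketch of the expected behaviour:

```python
import pandas as pd
from sklearn import config_context
from sklearn.feature_selection import SelectKBest, f_classif

X = pd.DataFrame({"age":    [20, 30, 40, 50],
                  "height": [1.6, 1.7, 1.8, 1.9],
                  "noise":  [0.3, 0.1, 0.4, 0.2]})
y = [0, 0, 1, 1]

with config_context(array_out="pandas"):
    X_sel = SelectKBest(f_classif, k=2).fit_transform(X, y)

# Expected on this branch: the retained columns keep their input names,
# e.g. ['age', 'height'].
print(list(X_sel.columns))
```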
31 changes: 28 additions & 3 deletions sklearn/impute/_base.py
@@ -17,6 +17,7 @@
from ..utils.validation import _deprecate_positional_args
from ..utils._mask import _get_mask
from ..utils import is_scalar_nan
from ..utils._array_transformer import _ArrayTransformer


def _check_inputs_dtype(X, missing_values):
@@ -418,6 +419,7 @@ def transform(self, X):
X : {array-like, sparse matrix}, shape (n_samples, n_features)
The input data to complete.
"""
wrapper = _ArrayTransformer(X)
check_is_fitted(self)

X = self._validate_input(X, in_fit=False)
Expand All @@ -432,6 +434,7 @@ def transform(self, X):
# Delete the invalid columns if strategy is not constant
if self.strategy == "constant":
valid_statistics = statistics
valid_mask = slice(None)
else:
# same as np.isnan but also works for object dtypes
invalid_mask = _get_mask(statistics, np.nan)
@@ -467,7 +470,17 @@
coordinates = np.where(mask.transpose())[::-1]
X[coordinates] = values

-        return super()._concatenate_indicator(X, X_indicator)
+        def get_feature_names_out(feature_names_in):
+            imputed_names = feature_names_in[valid_mask]
+            if self.indicator_ is None:
+                return imputed_names
+
+            indicator_names = self.indicator_._get_feature_names_out(
+                feature_names_in)
+            return np.r_[imputed_names, indicator_names]
+
+        out = super()._concatenate_indicator(X, X_indicator)
+        return wrapper.transform(out, get_feature_names_out)

def inverse_transform(self, X):
"""Convert the data back to the original representation.
@@ -751,6 +764,7 @@ def transform(self, X):
will be boolean.

"""
wrapper = _ArrayTransformer(X)
check_is_fitted(self)
X = self._validate_input(X, in_fit=False)

@@ -770,7 +784,7 @@
if self.features_.size < self._n_features:
imputer_mask = imputer_mask[:, self.features_]

-        return imputer_mask
+        return wrapper.transform(imputer_mask, self._get_feature_names_out)

def fit_transform(self, X, y=None):
"""Generate missing values indicator for X.
@@ -788,12 +802,23 @@
will be boolean.

"""
wrapper = _ArrayTransformer(X)
imputer_mask = self._fit(X, y)

if self.features_.size < self._n_features:
imputer_mask = imputer_mask[:, self.features_]

-        return imputer_mask
+        return wrapper.transform(imputer_mask, self._get_feature_names_out)

def _get_feature_names_out(self, feature_names_in):
if feature_names_in is None:
return None
if self.features_.size < self._n_features:
feature_names_in = feature_names_in[self.features_]

feature_names_in = np.array([f'mask_{name}'
for name in feature_names_in])
return feature_names_in

def _more_tags(self):
return {'allow_nan': True,
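Note: end to end, the imputer changes mean `add_indicator=True` yields the imputed columns followed by `mask_*` indicator columns, per `_get_feature_names_out` above. A hedged sketch:

```python
import numpy as np
import pandas as pd
from sklearn import config_context
from sklearn.impute import SimpleImputer

X = pd.DataFrame({"a": [1.0, np.nan, 3.0],
                  "b": [np.nan, 2.0, 3.0]})

with config_context(array_out="pandas"):
    X_imp = SimpleImputer(add_indicator=True).fit_transform(X)

# Expected on this branch (np.r_[imputed_names, indicator_names]):
# ['a', 'b', 'mask_a', 'mask_b']
print(list(X_imp.columns))
```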