Adds recommended actions for InvalidTargetDataCheck and updates _make_component_list_from_actions to address this action (#1989)
angela97lin committed Mar 31, 2021
1 parent c335c4e commit 2f46b6a
Showing 18 changed files with 537 additions and 85 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -2,6 +2,7 @@ Release Notes
-------------
**Future Releases**
* Enhancements
* Added recommended actions for ``InvalidTargetDataCheck``, updated ``_make_component_list_from_actions`` to address the new action, and added ``TargetImputer`` component :pr:`1989`
* Updated ``AutoMLSearch._check_for_high_variance`` to not emit ``RuntimeWarning`` :pr:`2024`
* Added exception when pipeline passed to ``explain_predictions`` is a ``Stacked Ensemble`` pipeline :pr:`2033`
* Added sensitivity at low alert rates as an objective :pr:`2001`
3 changes: 3 additions & 0 deletions evalml/data_checks/data_check_action_code.py
@@ -6,3 +6,6 @@ class DataCheckActionCode(Enum):

DROP_COL = "drop_col"
"""Action code for dropping a column."""

IMPUTE_COL = "impute_col"
"""Action code for imputing a column."""
3 changes: 3 additions & 0 deletions evalml/data_checks/data_check_message_code.py
@@ -16,6 +16,9 @@ class DataCheckMessageCode(Enum):
TARGET_IS_NONE = "target_is_none"
"""Message code for when target is None."""

TARGET_IS_EMPTY_OR_FULLY_NULL = "target_is_empty_or_fully_null"
"""Message code for target data that is empty or has all null values."""

TARGET_HAS_NULL = "target_has_null"
"""Message code for target data that has null values."""

27 changes: 22 additions & 5 deletions evalml/data_checks/invalid_targets_data_check.py
@@ -2,12 +2,20 @@

from evalml.data_checks import (
DataCheck,
DataCheckAction,
DataCheckActionCode,
DataCheckError,
DataCheckMessageCode,
DataCheckWarning
)
from evalml.objectives import get_objective
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.problem_types import (
ProblemTypes,
handle_problem_types,
is_binary,
is_multiclass,
is_regression
)
from evalml.utils.woodwork_utils import (
_convert_woodwork_types_wrapper,
infer_feature_types,
@@ -57,7 +65,7 @@ def validate(self, X, y):
"code": "TARGET_HAS_NULL",\
"details": {"num_null_rows": 2, "pct_null_rows": 50}}],\
"warnings": [],\
"actions": []}
"actions": [{'code': 'IMPUTE_COL', 'metadata': {'column': None, 'impute_strategy': 'most_frequent', 'is_target': True}}]}
"""
results = {
"warnings": [],
@@ -82,18 +90,27 @@ def validate(self, X, y):
details={"unsupported_type": y.logical_type.type_string}).to_dict())
y_df = _convert_woodwork_types_wrapper(y.to_series())
null_rows = y_df.isnull()
if null_rows.any():
if null_rows.all():
results["errors"].append(DataCheckError(message="Target is either empty or fully null.",
data_check_name=self.name,
message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL,
details={}).to_dict())
return results
elif null_rows.any():
num_null_rows = null_rows.sum()
pct_null_rows = null_rows.mean() * 100
results["errors"].append(DataCheckError(message="{} row(s) ({}%) of target values are null".format(num_null_rows, pct_null_rows),
data_check_name=self.name,
message_code=DataCheckMessageCode.TARGET_HAS_NULL,
details={"num_null_rows": num_null_rows, "pct_null_rows": pct_null_rows}).to_dict())
impute_strategy = "mean" if is_regression(self.problem_type) else "most_frequent"
results["actions"].append(DataCheckAction(DataCheckActionCode.IMPUTE_COL,
metadata={"column": None, "is_target": True, "impute_strategy": impute_strategy}).to_dict())

value_counts = y_df.value_counts()
unique_values = value_counts.index.tolist()

if self.problem_type == ProblemTypes.BINARY and len(value_counts) != 2:
if is_binary(self.problem_type) and len(value_counts) != 2:
if self.n_unique is None:
details = {"target_values": unique_values}
else:
@@ -109,7 +126,7 @@ def validate(self, X, y):
message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
details={}).to_dict())

if self.problem_type == ProblemTypes.MULTICLASS:
if is_multiclass(self.problem_type):
if value_counts.min() <= 1:
least_populated = value_counts[value_counts <= 1]
details = {"least_populated_class_labels": least_populated.index.tolist()}
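
A short sketch of the new behavior, following the doctest above (the InvalidTargetDataCheck("binary", "Log Loss Binary") constructor call is assumed from that doctest):

```python
import pandas as pd
from evalml.data_checks import InvalidTargetDataCheck

X = pd.DataFrame({"col": [1, 2, 3, 1]})
y = pd.Series([0, 1, None, None])  # 2 of 4 target values are null

check = InvalidTargetDataCheck("binary", "Log Loss Binary")
results = check.validate(X, y)
# Binary is not a regression problem type, so the recommended strategy
# should be "most_frequent" rather than "mean".
print(results["actions"])
# expected: [{'code': 'IMPUTE_COL', 'metadata': {'column': None,
#             'impute_strategy': 'most_frequent', 'is_target': True}}]

# A fully null target should instead produce the new
# TARGET_IS_EMPTY_OR_FULLY_NULL error and return early, with no action.
results = check.validate(X, pd.Series([None] * 4))
print(results["actions"])  # expected: []
```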
1 change: 1 addition & 0 deletions evalml/pipelines/components/__init__.py
@@ -46,6 +46,7 @@
LSA,
PCA,
DFSTransformer,
TargetImputer,
PolynomialDetrender
)
from .ensemble import (
2 changes: 1 addition & 1 deletion evalml/pipelines/components/transformers/__init__.py
@@ -1,7 +1,7 @@
from .transformer import Transformer
from .encoders import OneHotEncoder, TargetEncoder
from .feature_selection import FeatureSelector, RFClassifierSelectFromModel, RFRegressorSelectFromModel
from .imputers import PerColumnImputer, SimpleImputer, Imputer
from .imputers import PerColumnImputer, SimpleImputer, Imputer, TargetImputer
from .scalers import StandardScaler
from .column_selectors import DropColumns, SelectColumns
from .dimensionality_reduction import LinearDiscriminantAnalysis, PCA
1 change: 1 addition & 0 deletions evalml/pipelines/components/transformers/imputers/__init__.py
@@ -1,3 +1,4 @@
from .per_column_imputer import PerColumnImputer
from .simple_imputer import SimpleImputer
from .imputer import Imputer
from .target_imputer import TargetImputer
121 changes: 121 additions & 0 deletions evalml/pipelines/components/transformers/imputers/target_imputer.py
@@ -0,0 +1,121 @@

from functools import wraps

import pandas as pd
from sklearn.impute import SimpleImputer as SkImputer

from evalml.exceptions import ComponentNotYetFittedError
from evalml.pipelines.components import ComponentBaseMeta
from evalml.pipelines.components.transformers import Transformer
from evalml.utils import (
_convert_woodwork_types_wrapper,
_retain_custom_types_and_initalize_woodwork,
infer_feature_types
)


class TargetImputerMeta(ComponentBaseMeta):
"""A version of the ComponentBaseMeta class which handles when input features is None"""

@classmethod
def check_for_fit(cls, method):
"""`check_for_fit` wraps a method that validates if `self._is_fitted` is `True`.
It raises an exception if `False` and calls and returns the wrapped method if `True`.
"""
@wraps(method)
def _check_for_fit(self, X=None, y=None):
klass = type(self).__name__
if not self._is_fitted and self.needs_fitting:
raise ComponentNotYetFittedError(f'This {klass} is not fitted yet. You must fit {klass} before calling {method.__name__}.')
else:
return method(self, X, y)
return _check_for_fit


class TargetImputer(Transformer, metaclass=TargetImputerMeta):
"""Imputes missing target data according to a specified imputation strategy."""
name = 'Target Imputer'
hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]}

def __init__(self, impute_strategy="most_frequent", fill_value=None, random_seed=0, **kwargs):
"""Initalizes an transformer that imputes missing target data according to the specified imputation strategy."
Arguments:
impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for
numerical data, and "most_frequent", "constant" for object data types.
fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data.
Defaults to 0 when imputing numerical data and "missing_value" for strings or object data types.
random_seed (int): Seed for the random number generator. Defaults to 0.
"""
parameters = {"impute_strategy": impute_strategy,
"fill_value": fill_value}
parameters.update(kwargs)
imputer = SkImputer(strategy=impute_strategy,
fill_value=fill_value,
**kwargs)
super().__init__(parameters=parameters,
component_obj=imputer,
random_seed=random_seed)

def fit(self, X, y):
"""Fits imputer to target data. 'None' values are converted to np.nan before imputation and are
treated as the same.
Arguments:
X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. Ignored.
y (ww.DataColumn, pd.Series, optional): The target training data of length [n_samples].
Returns:
self
"""
if y is None:
return self
y = infer_feature_types(y)
y = _convert_woodwork_types_wrapper(y.to_series()).to_frame()

# Convert all bool dtypes to category for fitting
if (y.dtypes == bool).all():
y = y.astype('category')

self._component_obj.fit(y)
return self

def transform(self, X, y):
"""Transforms input target data by imputing missing values. 'None' and np.nan values are treated as the same.
Arguments:
X (ww.DataTable, pd.DataFrame): Features. Ignored.
y (ww.DataColumn, pd.Series): Target data to impute.
Returns:
(ww.DataTable, ww.DataColumn): The original X, transformed y
"""

if X is not None:
X = infer_feature_types(X)
if y is None:
return X, None
y_ww = infer_feature_types(y)
y = _convert_woodwork_types_wrapper(y_ww.to_series())
y_df = y.to_frame()

# Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
if (y_df.dtypes == bool).all():
return X, _retain_custom_types_and_initalize_woodwork(y_ww, y)

transformed = self._component_obj.transform(y_df)
if transformed.shape[1] == 0:
raise RuntimeError("Transformed data is empty")
y_t = pd.Series(transformed[:, 0], index=y.index)
return X, _retain_custom_types_and_initalize_woodwork(y_ww, y_t)

def fit_transform(self, X, y):
"""Fits on and transforms the input target data.
Arguments:
X (ww.DataTable, pd.DataFrame): Features. Ignored.
y (ww.DataColumn, pd.Series): Target data to impute.
Returns:
(ww.DataTable, ww.DataColumn): The original X, transformed y
"""
return self.fit(X, y).transform(X, y)
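
A minimal usage sketch for the new component; X is ignored during fitting, and the code above appears to accept X=None in transform:

```python
import pandas as pd
from evalml.pipelines.components import TargetImputer

y = pd.Series([1.0, 2.0, None, 4.0])
imputer = TargetImputer(impute_strategy="mean")
_, y_t = imputer.fit_transform(None, y)
# The missing entry should be filled with the mean of the observed values,
# (1 + 2 + 4) / 3 ≈ 2.33, and y_t comes back as a ww.DataColumn.
print(y_t.to_series().tolist())  # expected: [1.0, 2.0, 2.333..., 4.0]
```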
5 changes: 5 additions & 0 deletions evalml/pipelines/utils.py
@@ -31,6 +31,7 @@
StackedEnsembleClassifier,
StackedEnsembleRegressor,
StandardScaler,
TargetImputer,
TextFeaturizer
)
from evalml.pipelines.components.utils import all_components, get_estimators
@@ -268,4 +269,8 @@ def _make_component_list_from_actions(actions):
for action in actions:
if action.action_code == DataCheckActionCode.DROP_COL:
components.append(DropColumns(columns=action.metadata["columns"]))
if action.action_code == DataCheckActionCode.IMPUTE_COL:
metadata = action.metadata
if metadata["is_target"]:
components.append(TargetImputer(impute_strategy=metadata["impute_strategy"]))
return components
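
A sketch of the round trip this enables; the action is built by hand here, but in practice it would come from a data check's validate output (note that _make_component_list_from_actions is a private helper):

```python
from evalml.data_checks import DataCheckAction, DataCheckActionCode
from evalml.pipelines.utils import _make_component_list_from_actions

# An IMPUTE_COL action targeting y, as InvalidTargetDataCheck now emits.
actions = [
    DataCheckAction(
        DataCheckActionCode.IMPUTE_COL,
        metadata={"column": None, "is_target": True, "impute_strategy": "most_frequent"},
    )
]
components = _make_component_list_from_actions(actions)
print(components)  # expected: [TargetImputer(impute_strategy="most_frequent")]
```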
38 changes: 30 additions & 8 deletions evalml/tests/component_tests/test_components.py
@@ -53,6 +53,7 @@
StandardScaler,
SVMClassifier,
SVMRegressor,
TargetImputer,
TextFeaturizer,
TimeSeriesBaselineEstimator,
Transformer,
@@ -534,7 +535,11 @@ def test_transformer_transform_output_type(X_y_binary):

component.fit(X, y=y)
transform_output = component.transform(X, y=y)
assert isinstance(transform_output, ww.DataTable)
if isinstance(component, TargetImputer):
assert isinstance(transform_output[0], ww.DataTable)
assert isinstance(transform_output[1], ww.DataColumn)
else:
assert isinstance(transform_output, ww.DataTable)

if isinstance(component, SelectColumns):
assert transform_output.shape == (X.shape[0], 0)
@@ -548,12 +553,20 @@ def test_transformer_transform_output_type(X_y_binary):
# We just want to check that DelayedFeaturesTransformer outputs a DataFrame
# The dataframe shape and index are checked in test_delayed_features_transformer.py
continue
elif isinstance(component, TargetImputer):
assert transform_output[0].shape == X.shape
assert transform_output[1].shape[0] == X.shape[0]
assert len(transform_output[1].shape) == 1
else:
assert transform_output.shape == X.shape
assert (list(transform_output.columns) == list(X_cols_expected))

transform_output = component.fit_transform(X, y=y)
assert isinstance(transform_output, ww.DataTable)
if isinstance(component, TargetImputer):
assert isinstance(transform_output[0], ww.DataTable)
assert isinstance(transform_output[1], ww.DataColumn)
else:
assert isinstance(transform_output, ww.DataTable)

if isinstance(component, SelectColumns):
assert transform_output.shape == (X.shape[0], 0)
@@ -563,6 +576,10 @@ def test_transformer_transform_output_type(X_y_binary):
elif isinstance(component, DFSTransformer):
assert transform_output.shape[0] == X.shape[0]
assert transform_output.shape[1] >= X.shape[1]
elif isinstance(component, TargetImputer):
assert transform_output[0].shape == X.shape
assert transform_output[1].shape[0] == X.shape[0]
assert len(transform_output[1].shape) == 1
else:
assert transform_output.shape == X.shape
assert (list(transform_output.columns) == list(X_cols_expected))
@@ -704,14 +721,14 @@ def test_all_transformers_check_fit(X_y_binary):

component = component_class()
with pytest.raises(ComponentNotYetFittedError, match=f'You must fit {component_class.__name__}'):
component.transform(X)
component.transform(X, y)

component.fit(X, y)
component.transform(X)
component.transform(X, y)

component = component_class()
component.fit_transform(X, y)
component.transform(X)
component.transform(X, y)


def test_all_estimators_check_fit(X_y_binary, ts_data, test_estimator_needs_fitting_false, helper_functions):
@@ -1067,9 +1084,14 @@ def test_transformer_fit_and_transform_respect_custom_indices(use_custom_index,
pd.testing.assert_index_equal(X.index, X_original_index)
pd.testing.assert_index_equal(y.index, y_original_index)

X_t = transformer.transform(X, y).to_dataframe()
pd.testing.assert_index_equal(X_t.index, X_original_index, check_names=check_names)
pd.testing.assert_index_equal(y.index, y_original_index, check_names=check_names)
if transformer_class == TargetImputer:
X_t, y_t = transformer.transform(X, y)
pd.testing.assert_index_equal(X_t.to_dataframe().index, X_original_index, check_names=check_names)
pd.testing.assert_index_equal(y_t.to_series().index, y_original_index, check_names=check_names)
else:
X_t = transformer.transform(X, y).to_dataframe()
pd.testing.assert_index_equal(X_t.index, X_original_index, check_names=check_names)
pd.testing.assert_index_equal(y.index, y_original_index, check_names=check_names)


@pytest.mark.parametrize("estimator_class", _all_estimators())
2 changes: 1 addition & 1 deletion evalml/tests/component_tests/test_simple_imputer.py
@@ -253,7 +253,7 @@ def test_simple_imputer_does_not_reset_index():
imputer = SimpleImputer(impute_strategy="mean")
imputer.fit(X, y=y)
transformed = imputer.transform(X)
pd.testing.assert_frame_equal(pd.DataFrame({'input_val': [1.0, 2, 3, 4, 5, 6, 7, 8, 9]},
pd.testing.assert_frame_equal(pd.DataFrame({'input_val': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
dtype=float,
index=list(range(1, 10))),
transformed.to_dataframe())