Adds recommended actions for InvalidTargetDataCheck and updates _make_component_list_from_actions to address this action #1989

Merged 47 commits from 1881_fill_in_actions_cont into main on Mar 31, 2021

Commits (47)
c9e2e66
init
angela97lin Mar 16, 2021
3d76716
fix tests
angela97lin Mar 17, 2021
70299b5
release notes
angela97lin Mar 17, 2021
8ac18d3
add init code for target imputer
angela97lin Mar 17, 2021
a035a41
Merge branch 'main' into 1881_fill_in_actions_cont
angela97lin Mar 17, 2021
1327230
welp
angela97lin Mar 17, 2021
b53407d
hmm testing
angela97lin Mar 17, 2021
542ec07
Merge branch 'main' into 1881_fill_in_actions_cont
angela97lin Mar 18, 2021
8c08bc8
fix some tests
angela97lin Mar 18, 2021
97e2f48
test renaming
angela97lin Mar 18, 2021
18497fb
Merge branch 'main' into 1881_fill_in_actions_cont
angela97lin Mar 18, 2021
ac28999
some updates, more tests to go
angela97lin Mar 18, 2021
f9c04e8
Merge branch '1881_fill_in_actions_cont' of github.com:alteryx/evalml…
angela97lin Mar 18, 2021
d0ed8ee
fix tests, add impute strategy
angela97lin Mar 18, 2021
15ec313
lint mclint
angela97lin Mar 18, 2021
364fd95
Merge branch 'main' into 1881_fill_in_actions_cont
angela97lin Mar 19, 2021
fbf7ead
fix tests
angela97lin Mar 19, 2021
f681b28
codecov testing
angela97lin Mar 19, 2021
5f8f2b1
linting
angela97lin Mar 19, 2021
e68927f
clean up and fix tests
angela97lin Mar 20, 2021
0944f08
remove unreachable
angela97lin Mar 21, 2021
31145e5
Merge branch 'main' into 1881_fill_in_actions_cont
angela97lin Mar 21, 2021
d34e0c9
cleanup docstrings
angela97lin Mar 21, 2021
ff80ad1
Merge branch 'main' into 1881_fill_in_actions_cont
angela97lin Mar 23, 2021
81ba4b8
address feedback, update 100% null or empty case, address target impu…
angela97lin Mar 23, 2021
ed223d9
a lot of cleanup
angela97lin Mar 23, 2021
8bd8633
Merge branch 'main' into 1881_fill_in_actions_cont
angela97lin Mar 23, 2021
5306bf9
undo simpleimputer subclassing
angela97lin Mar 23, 2021
43a442f
fix up tests
angela97lin Mar 23, 2021
9890722
merge
angela97lin Mar 23, 2021
3b9f13c
oops
angela97lin Mar 24, 2021
cdc320c
Merge branch 'main' into 1881_fill_in_actions_cont
angela97lin Mar 25, 2021
da982d9
move release notes
angela97lin Mar 25, 2021
ff4b679
Merge branch 'main' into 1881_fill_in_actions_cont
angela97lin Mar 25, 2021
c1919e3
merge
angela97lin Mar 29, 2021
cccb717
Merge branch 'main' into 1881_fill_in_actions_cont
angela97lin Mar 29, 2021
6068a8c
fixing from comments and rename from details to metadata
angela97lin Mar 29, 2021
1402507
Merge branch '1881_fill_in_actions_cont' of github.com:alteryx/evalml…
angela97lin Mar 29, 2021
65a301a
fix test and add one for X not None
angela97lin Mar 29, 2021
9c2ff4e
fix tests with indices
angela97lin Mar 29, 2021
aabdbd2
Merge branch 'main' into 1881_fill_in_actions_cont
angela97lin Mar 30, 2021
fb82cee
cleanup
angela97lin Mar 31, 2021
4f093c9
codecov
angela97lin Mar 31, 2021
56388f8
remove from component graph and cleanup
angela97lin Mar 31, 2021
2d72bc6
add another test
angela97lin Mar 31, 2021
5cdb59b
clean up merge:
angela97lin Mar 31, 2021
f0cdcbd
Merge branch 'main' into 1881_fill_in_actions_cont
angela97lin Mar 31, 2021
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -2,6 +2,7 @@ Release Notes
-------------
**Future Releases**
* Enhancements
* Added recommended actions for ``InvalidTargetDataCheck`` and updated ``_make_component_list_from_actions`` to address this action :pr:`1989`
* Fixes
* Changes
* Documentation Changes
3 changes: 3 additions & 0 deletions evalml/data_checks/data_check_action_code.py
@@ -6,3 +6,6 @@ class DataCheckActionCode(Enum):

DROP_COL = "drop_col"
"""Action code for dropping a column."""

IMPUTE_COL = "impute_col"
"""Action code for imputing a column."""
3 changes: 3 additions & 0 deletions evalml/data_checks/data_check_message_code.py
@@ -16,6 +16,9 @@ class DataCheckMessageCode(Enum):
TARGET_IS_NONE = "target_is_none"
"""Message code for when target is None."""

TARGET_IS_EMPTY_OR_FULLY_NULL = "target_is_empty_or_fully_null"
"""Message code for target data that is empty or has all null values."""

TARGET_HAS_NULL = "target_has_null"
"""Message code for target data that has null values."""

27 changes: 22 additions & 5 deletions evalml/data_checks/invalid_targets_data_check.py
@@ -2,12 +2,20 @@

from evalml.data_checks import (
DataCheck,
DataCheckAction,
DataCheckActionCode,
DataCheckError,
DataCheckMessageCode,
DataCheckWarning
)
from evalml.objectives import get_objective
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.problem_types import (
ProblemTypes,
handle_problem_types,
is_binary,
is_multiclass,
is_regression
)
from evalml.utils.woodwork_utils import (
_convert_woodwork_types_wrapper,
infer_feature_types,
@@ -57,7 +65,7 @@ def validate(self, X, y):
"code": "TARGET_HAS_NULL",\
"details": {"num_null_rows": 2, "pct_null_rows": 50}}],\
"warnings": [],\
"actions": []}
"actions": [{'code': 'IMPUTE_COL', 'details': {'column': None, 'impute_strategy': 'most_frequent', 'is_target': True}}]}
Comment from angela97lin (Contributor, Author):
Wanted a way to specify that we want to impute the target without relying on the name of the column.

Comment from a Contributor:
Makes sense!

"""
results = {
"warnings": [],
@@ -82,18 +90,27 @@
details={"unsupported_type": y.logical_type.type_string}).to_dict())
y_df = _convert_woodwork_types_wrapper(y.to_series())
null_rows = y_df.isnull()
if null_rows.any():
if null_rows.all():
results["errors"].append(DataCheckError(message="Target values are either empty or fully null.",
Comment from a Contributor:
Nit: are "empty" and "fully null" different? If they're not I'd just go with "Target values are fully null."

Comment from angela97lin (Contributor, Author):
Yeah, they're different in that "empty" refers to len(y) == 0, and "fully null" is len(y) != 0 but all NaN values 😢 (A short sketch of the distinction follows this file's diff.)

data_check_name=self.name,
message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL,
details={}).to_dict())
return results
elif null_rows.any():
num_null_rows = null_rows.sum()
pct_null_rows = null_rows.mean() * 100
results["errors"].append(DataCheckError(message="{} row(s) ({}%) of target values are null".format(num_null_rows, pct_null_rows),
data_check_name=self.name,
message_code=DataCheckMessageCode.TARGET_HAS_NULL,
details={"num_null_rows": num_null_rows, "pct_null_rows": pct_null_rows}).to_dict())
impute_strategy = "mean" if is_regression(self.problem_type) else "most_frequent"
results["actions"].append(DataCheckAction(DataCheckActionCode.IMPUTE_COL,
details={"column": None, "is_target": True, "impute_strategy": impute_strategy}).to_dict())

value_counts = y_df.value_counts()
unique_values = value_counts.index.tolist()

if self.problem_type == ProblemTypes.BINARY and len(value_counts) != 2:
if is_binary(self.problem_type) and len(value_counts) != 2:
if self.n_unique is None:
details = {"target_values": unique_values}
else:
@@ -109,7 +126,7 @@
message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
details={}).to_dict())

if self.problem_type == ProblemTypes.MULTICLASS:
if is_multiclass(self.problem_type):
if value_counts.min() <= 1:
least_populated = value_counts[value_counts <= 1]
details = {"least_populated_class_labels": least_populated.index.tolist()}
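To illustrate the review thread above, here is a minimal sketch in plain pandas (variable names are hypothetical) of why the single `null_rows.all()` check covers both the "empty" and the "fully null" case that now map to `TARGET_IS_EMPTY_OR_FULLY_NULL`:

```python
import numpy as np
import pandas as pd

empty_target = pd.Series([], dtype="float64")     # "empty": len(y) == 0
fully_null_target = pd.Series([np.nan, np.nan])   # "fully null": len(y) != 0, every value is NaN

# .all() over an empty boolean Series is vacuously True, so both cases satisfy
# null_rows.all() and report TARGET_IS_EMPTY_OR_FULLY_NULL instead of an IMPUTE_COL action.
assert empty_target.isnull().all()
assert fully_null_target.isnull().all()
```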
1 change: 1 addition & 0 deletions evalml/pipelines/components/__init__.py
@@ -46,6 +46,7 @@
LSA,
PCA,
DFSTransformer,
TargetImputer,
PolynomialDetrender
)
from .ensemble import (
2 changes: 1 addition & 1 deletion evalml/pipelines/components/transformers/__init__.py
@@ -1,7 +1,7 @@
from .transformer import Transformer
from .encoders import OneHotEncoder, TargetEncoder
from .feature_selection import FeatureSelector, RFClassifierSelectFromModel, RFRegressorSelectFromModel
from .imputers import PerColumnImputer, SimpleImputer, Imputer
from .imputers import PerColumnImputer, SimpleImputer, Imputer, TargetImputer
from .scalers import StandardScaler
from .column_selectors import DropColumns, SelectColumns
from .dimensionality_reduction import LinearDiscriminantAnalysis, PCA
1 change: 1 addition & 0 deletions evalml/pipelines/components/transformers/imputers/__init__.py
@@ -1,3 +1,4 @@
from .per_column_imputer import PerColumnImputer
from .simple_imputer import SimpleImputer
from .imputer import Imputer
from .target_imputer import TargetImputer
88 changes: 88 additions & 0 deletions evalml/pipelines/components/transformers/imputers/target_imputer.py
@@ -0,0 +1,88 @@

import pandas as pd
from sklearn.impute import SimpleImputer as SkImputer

from evalml.pipelines.components.transformers import Transformer
from evalml.utils import (
_convert_woodwork_types_wrapper,
_retain_custom_types_and_initalize_woodwork,
infer_feature_types
)


class TargetImputer(Transformer):
"""Imputes missing target data according to a specified imputation strategy."""
name = 'Target Imputer'
hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]}

def __init__(self, impute_strategy="most_frequent", fill_value=None, random_seed=0, **kwargs):
"""Initalizes an transformer that imputes missing target data according to the specified imputation strategy."
Arguments:
impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for
numerical data, and "most_frequent", "constant" for object data types.
fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data.
Defaults to 0 when imputing numerical data and "missing_value" for strings or object data types.
random_seed (int): Seed for the random number generator. Defaults to 0.
"""
parameters = {"impute_strategy": impute_strategy,
"fill_value": fill_value}
parameters.update(kwargs)
imputer = SkImputer(strategy=impute_strategy,
fill_value=fill_value,
**kwargs)
super().__init__(parameters=parameters,
component_obj=imputer,
random_seed=random_seed)

def fit(self, X, y):
"""Fits imputer to target data. 'None' values are converted to np.nan before imputation and are
treated as the same.
Arguments:
X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. Ignored.
y (ww.DataColumn, pd.Series, optional): The target training data of length [n_samples]
Returns:
self
"""
if y is None:
raise ValueError("y cannot be None")
y = infer_feature_types(y)
y = _convert_woodwork_types_wrapper(y.to_series()).to_frame()

# Convert all bool dtypes to category for fitting
if (y.dtypes == bool).all():
y = y.astype('category')

self._component_obj.fit(y)
return self

def transform(self, X, y):
"""Transforms input target data by imputing missing values. 'None' and np.nan values are treated as the same.
Arguments:
X (ww.DataTable, pd.DataFrame): Features. Ignored.
y (ww.DataColumn, pd.Series): Target data to impute.
Returns:
ww.DataColumn: Transformed y
"""
y_ww = infer_feature_types(y)
y = _convert_woodwork_types_wrapper(y_ww.to_series())
y_df = y.to_frame()

# Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
if (y_df.dtypes == bool).all():
return X, _retain_custom_types_and_initalize_woodwork(y_ww, y)

transformed = self._component_obj.transform(y_df)
if transformed.shape[1] == 0:
raise RuntimeError("Transformed data is empty")
y_t = pd.Series(transformed[:, 0], index=y.index)
return X, _retain_custom_types_and_initalize_woodwork(y_ww, y_t)

def fit_transform(self, X, y):
"""Fits on y and transforms y
Arguments:
X (ww.DataTable, pd.DataFrame): Features. Ignored.
y (ww.DataColumn, pd.Series): Target data to impute.
Returns:
ww.DataColumn: Transformed y
"""
return self.fit(X, y).transform(X, y)
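A rough usage sketch of the new component, assuming the import path added in evalml/pipelines/components/__init__.py above and the (X, transformed y) tuple return shown in transform; the data values are illustrative only:

```python
import numpy as np
import pandas as pd

from evalml.pipelines.components import TargetImputer

X = pd.DataFrame({"feature": [1, 2, 3, 4]})   # ignored by the target imputer
y = pd.Series([1.0, np.nan, 3.0, np.nan])     # target with missing values

imputer = TargetImputer(impute_strategy="mean")
X_out, y_imputed = imputer.fit_transform(X, y)  # note the (X, y) tuple return

# y_imputed is a woodwork DataColumn; NaNs are filled with mean([1.0, 3.0]) == 2.0
print(y_imputed.to_series().tolist())           # [1.0, 2.0, 3.0, 2.0]
```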
5 changes: 5 additions & 0 deletions evalml/pipelines/utils.py
@@ -31,6 +31,7 @@
StackedEnsembleClassifier,
StackedEnsembleRegressor,
StandardScaler,
TargetImputer,
TextFeaturizer
)
from evalml.pipelines.components.utils import all_components, get_estimators
@@ -268,4 +269,8 @@ def _make_component_list_from_actions(actions):
for action in actions:
if action.action_code == DataCheckActionCode.DROP_COL:
components.append(DropColumns(columns=action.details["columns"]))
if action.action_code == DataCheckActionCode.IMPUTE_COL:
details = action.details
if details["is_target"]:
components.append(TargetImputer(impute_strategy=details["impute_strategy"]))
return components
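A sketch of the intended end-to-end flow, tying this back to the review comment on the InvalidTargetDataCheck docstring: the check recommends an IMPUTE_COL action with "column": None and "is_target": True, and _make_component_list_from_actions turns that recommendation into a TargetImputer. The variable names below are assumptions; the constructor and function signatures come from the diffs above.

```python
from evalml.data_checks import DataCheckAction, DataCheckActionCode
from evalml.pipelines.utils import _make_component_list_from_actions

# Shape of the action emitted by InvalidTargetDataCheck when the target has some null rows;
# "column": None plus "is_target": True marks the target rather than a named feature column.
action = DataCheckAction(
    DataCheckActionCode.IMPUTE_COL,
    details={"column": None, "is_target": True, "impute_strategy": "most_frequent"},
)

components = _make_component_list_from_actions([action])
print(components)  # expected: [TargetImputer(impute_strategy="most_frequent")]
```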
36 changes: 28 additions & 8 deletions evalml/tests/component_tests/test_components.py
@@ -53,6 +53,7 @@
StandardScaler,
SVMClassifier,
SVMRegressor,
TargetImputer,
TextFeaturizer,
TimeSeriesBaselineEstimator,
Transformer,
@@ -534,7 +535,10 @@ def test_transformer_transform_output_type(X_y_binary):

component.fit(X, y=y)
transform_output = component.transform(X, y=y)
assert isinstance(transform_output, ww.DataTable)
if isinstance(component, TargetImputer):
assert isinstance(transform_output[1], ww.DataColumn)
else:
assert isinstance(transform_output, ww.DataTable)

if isinstance(component, SelectColumns):
assert transform_output.shape == (X.shape[0], 0)
@@ -548,12 +552,19 @@
# We just want to check that DelayedFeaturesTransformer outputs a DataFrame
# The dataframe shape and index are checked in test_delayed_features_transformer.py
continue
elif isinstance(component, TargetImputer):
assert transform_output[0].shape == X.shape
assert transform_output[1].shape[0] == X.shape[0]
assert len(transform_output[1].shape) == 1
else:
assert transform_output.shape == X.shape
assert (list(transform_output.columns) == list(X_cols_expected))

transform_output = component.fit_transform(X, y=y)
assert isinstance(transform_output, ww.DataTable)
if isinstance(component, TargetImputer):
assert isinstance(transform_output[1], ww.DataColumn)
else:
assert isinstance(transform_output, ww.DataTable)

if isinstance(component, SelectColumns):
assert transform_output.shape == (X.shape[0], 0)
@@ -563,6 +574,10 @@
elif isinstance(component, DFSTransformer):
assert transform_output.shape[0] == X.shape[0]
assert transform_output.shape[1] >= X.shape[1]
elif isinstance(component, TargetImputer):
assert transform_output[0].shape == X.shape
assert transform_output[1].shape[0] == X.shape[0]
assert len(transform_output[1].shape) == 1
else:
assert transform_output.shape == X.shape
assert (list(transform_output.columns) == list(X_cols_expected))
@@ -704,14 +719,14 @@ def test_all_transformers_check_fit(X_y_binary):

component = component_class()
with pytest.raises(ComponentNotYetFittedError, match=f'You must fit {component_class.__name__}'):
component.transform(X)
component.transform(X, y)

component.fit(X, y)
component.transform(X)
component.transform(X, y)

component = component_class()
component.fit_transform(X, y)
component.transform(X)
component.transform(X, y)


def test_all_estimators_check_fit(X_y_binary, ts_data, test_estimator_needs_fitting_false, helper_functions):
@@ -1067,9 +1082,14 @@ def test_transformer_fit_and_transform_respect_custom_indices(use_custom_index,
pd.testing.assert_index_equal(X.index, X_original_index)
pd.testing.assert_index_equal(y.index, y_original_index)

X_t = transformer.transform(X, y).to_dataframe()
pd.testing.assert_index_equal(X_t.index, X_original_index, check_names=check_names)
pd.testing.assert_index_equal(y.index, y_original_index, check_names=check_names)
if transformer_class == TargetImputer:
X_t, y_t = transformer.transform(X, y)
pd.testing.assert_index_equal(X_t.index, X_original_index, check_names=check_names)
pd.testing.assert_index_equal(y_t.to_series().index, y_original_index, check_names=check_names)
else:
X_t = transformer.transform(X, y).to_dataframe()
pd.testing.assert_index_equal(X_t.index, X_original_index, check_names=check_names)
pd.testing.assert_index_equal(y.index, y_original_index, check_names=check_names)


@pytest.mark.parametrize("estimator_class", _all_estimators())