-
Notifications
You must be signed in to change notification settings - Fork 82
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adds recommended actions for InvalidTargetDataCheck and update _make_component_list_from_actions to address this action #1989
Changes from 33 commits
c9e2e66
3d76716
70299b5
8ac18d3
a035a41
1327230
b53407d
542ec07
8c08bc8
97e2f48
18497fb
ac28999
f9c04e8
d0ed8ee
15ec313
364fd95
fbf7ead
f681b28
5f8f2b1
e68927f
0944f08
31145e5
d34e0c9
ff80ad1
81ba4b8
ed223d9
8bd8633
5306bf9
43a442f
9890722
3b9f13c
cdc320c
da982d9
ff4b679
c1919e3
cccb717
6068a8c
1402507
65a301a
9c2ff4e
aabdbd2
fb82cee
4f093c9
56388f8
2d72bc6
5cdb59b
f0cdcbd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,12 +2,20 @@ | |
|
||
from evalml.data_checks import ( | ||
DataCheck, | ||
DataCheckAction, | ||
DataCheckActionCode, | ||
DataCheckError, | ||
DataCheckMessageCode, | ||
DataCheckWarning | ||
) | ||
from evalml.objectives import get_objective | ||
from evalml.problem_types import ProblemTypes, handle_problem_types | ||
from evalml.problem_types import ( | ||
ProblemTypes, | ||
handle_problem_types, | ||
is_binary, | ||
is_multiclass, | ||
is_regression | ||
) | ||
from evalml.utils.woodwork_utils import ( | ||
_convert_woodwork_types_wrapper, | ||
infer_feature_types, | ||
|
@@ -57,7 +65,7 @@ def validate(self, X, y): | |
"code": "TARGET_HAS_NULL",\ | ||
"details": {"num_null_rows": 2, "pct_null_rows": 50}}],\ | ||
"warnings": [],\ | ||
"actions": []} | ||
"actions": [{'code': 'IMPUTE_COL', 'details': {'column': None, 'impute_strategy': 'most_frequent', 'is_target': True}}]} | ||
""" | ||
results = { | ||
"warnings": [], | ||
|
@@ -82,18 +90,27 @@ def validate(self, X, y): | |
details={"unsupported_type": y.logical_type.type_string}).to_dict()) | ||
y_df = _convert_woodwork_types_wrapper(y.to_series()) | ||
null_rows = y_df.isnull() | ||
if null_rows.any(): | ||
if null_rows.all(): | ||
results["errors"].append(DataCheckError(message="Target values are either empty or fully null.", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit: are "empty" and "fully null" different? If they're not, I'd just go with "Target values are fully null." There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, they're different: "empty" refers to len(y) == 0, while "fully null" means len(y) != 0 but all values are NaN 😢 |
||
data_check_name=self.name, | ||
message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL, | ||
details={}).to_dict()) | ||
return results | ||
angela97lin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
elif null_rows.any(): | ||
num_null_rows = null_rows.sum() | ||
pct_null_rows = null_rows.mean() * 100 | ||
results["errors"].append(DataCheckError(message="{} row(s) ({}%) of target values are null".format(num_null_rows, pct_null_rows), | ||
data_check_name=self.name, | ||
message_code=DataCheckMessageCode.TARGET_HAS_NULL, | ||
details={"num_null_rows": num_null_rows, "pct_null_rows": pct_null_rows}).to_dict()) | ||
impute_strategy = "mean" if is_regression(self.problem_type) else "most_frequent" | ||
results["actions"].append(DataCheckAction(DataCheckActionCode.IMPUTE_COL, | ||
details={"column": None, "is_target": True, "impute_strategy": impute_strategy}).to_dict()) | ||
|
||
value_counts = y_df.value_counts() | ||
unique_values = value_counts.index.tolist() | ||
|
||
if self.problem_type == ProblemTypes.BINARY and len(value_counts) != 2: | ||
if is_binary(self.problem_type) and len(value_counts) != 2: | ||
if self.n_unique is None: | ||
details = {"target_values": unique_values} | ||
else: | ||
|
@@ -109,7 +126,7 @@ def validate(self, X, y): | |
message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE, | ||
details={}).to_dict()) | ||
|
||
if self.problem_type == ProblemTypes.MULTICLASS: | ||
if is_multiclass(self.problem_type): | ||
if value_counts.min() <= 1: | ||
least_populated = value_counts[value_counts <= 1] | ||
details = {"least_populated_class_labels": least_populated.index.tolist()} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -46,6 +46,7 @@ | |
LSA, | ||
PCA, | ||
DFSTransformer, | ||
TargetImputer, | ||
PolynomialDetrender | ||
) | ||
from .ensemble import ( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
from .per_column_imputer import PerColumnImputer | ||
from .simple_imputer import SimpleImputer | ||
from .imputer import Imputer | ||
from .target_imputer import TargetImputer |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
|
||
import pandas as pd | ||
from sklearn.impute import SimpleImputer as SkImputer | ||
|
||
from evalml.pipelines.components.transformers import Transformer | ||
from evalml.utils import ( | ||
_convert_woodwork_types_wrapper, | ||
_retain_custom_types_and_initalize_woodwork, | ||
infer_feature_types | ||
) | ||
|
||
|
||
class TargetImputer(Transformer):
    """Transformer that fills in missing target values using a chosen imputation strategy.

    The work is delegated to scikit-learn's ``SimpleImputer``, applied to the
    target after it has been reshaped into a single-column frame.
    """
    name = 'Target Imputer'
    hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]}

    def __init__(self, impute_strategy="most_frequent", fill_value=None, random_seed=0, **kwargs):
        """Initializes a transformer that imputes missing target data according to the specified imputation strategy.

        Arguments:
            impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for
                numerical data, and "most_frequent", "constant" for object data types.
            fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data.
                Defaults to 0 when imputing numerical data and "missing_value" for strings or object data types.
            random_seed (int): Seed for the random number generator. Defaults to 0.
            **kwargs: Extra keyword arguments; recorded in the component parameters and
                forwarded unchanged to the underlying scikit-learn imputer.
        """
        sk_imputer = SkImputer(strategy=impute_strategy,
                               fill_value=fill_value,
                               **kwargs)
        component_parameters = {"impute_strategy": impute_strategy,
                                "fill_value": fill_value,
                                **kwargs}
        super().__init__(parameters=component_parameters,
                         component_obj=sk_imputer,
                         random_seed=random_seed)

    def fit(self, X, y):
        """Fits the imputer to the target data.

        Arguments:
            X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. Ignored.
            y (ww.DataColumn, pd.Series): The target training data of length [n_samples]. Required.

        Returns:
            self

        Raises:
            ValueError: If y is None.
        """
        if y is None:
            raise ValueError("y cannot be None")

        target_ww = infer_feature_types(y)
        target_frame = _convert_woodwork_types_wrapper(target_ww.to_series()).to_frame()

        # A purely boolean target is fitted as a categorical column instead.
        all_bool = (target_frame.dtypes == bool).all()
        if all_bool:
            target_frame = target_frame.astype('category')

        self._component_obj.fit(target_frame)
        return self

    def transform(self, X, y):
        """Imputes missing values in the target data.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Features. Ignored and passed through untouched.
            y (ww.DataColumn, pd.Series): Target data to impute.

        Returns:
            tuple: ``(X, y_imputed)`` where X is the input features as given and
                y_imputed is the imputed target (presumably a ww.DataColumn, as built by
                ``_retain_custom_types_and_initalize_woodwork`` -- confirm against that helper).

        Raises:
            RuntimeError: If the underlying imputer returns no columns.
        """
        target_ww = infer_feature_types(y)
        target_series = _convert_woodwork_types_wrapper(target_ww.to_series())
        target_frame = target_series.to_frame()

        # bool dtype doesn't support nans and sklearn errors if all cols are bool,
        # so the imputer is only invoked for non-boolean targets.
        if not (target_frame.dtypes == bool).all():
            imputed = self._component_obj.transform(target_frame)
            if imputed.shape[1] == 0:
                raise RuntimeError("Transformed data is empty")
            # Rebuild a Series from the single imputed column, keeping the original index.
            target_series = pd.Series(imputed[:, 0], index=target_series.index)

        return X, _retain_custom_types_and_initalize_woodwork(target_ww, target_series)

    def fit_transform(self, X, y):
        """Fits the imputer on y and then transforms y.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Features. Ignored.
            y (ww.DataColumn, pd.Series): Target data to impute.

        Returns:
            tuple: ``(X, y_imputed)``, exactly as returned by ``transform``.
        """
        fitted = self.fit(X, y)
        return fitted.transform(X, y)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wanted a way to specify that we want to impute the target without relying on the name of the column
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Makes sense!