Adds recommended actions for InvalidTargetDataCheck and updates _make_component_list_from_actions to address this action (#1989)
angela97lin committed Mar 31, 2021
1 parent c335c4e commit 2f46b6a
Showing 18 changed files with 537 additions and 85 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -2,6 +2,7 @@ Release Notes
-------------
**Future Releases**
* Enhancements
* Added recommended actions for ``InvalidTargetDataCheck``, updated ``_make_component_list_from_actions`` to address the new action, and added ``TargetImputer`` component :pr:`1989`
* Updated ``AutoMLSearch._check_for_high_variance`` to not emit ``RuntimeWarning`` :pr:`2024`
* Added exception when pipeline passed to ``explain_predictions`` is a ``Stacked Ensemble`` pipeline :pr:`2033`
* Added sensitivity at low alert rates as an objective :pr:`2001`
3 changes: 3 additions & 0 deletions evalml/data_checks/data_check_action_code.py
@@ -6,3 +6,6 @@ class DataCheckActionCode(Enum):

DROP_COL = "drop_col"
"""Action code for dropping a column."""

IMPUTE_COL = "impute_col"
"""Action code for imputing a column."""
3 changes: 3 additions & 0 deletions evalml/data_checks/data_check_message_code.py
@@ -16,6 +16,9 @@ class DataCheckMessageCode(Enum):
TARGET_IS_NONE = "target_is_none"
"""Message code for when target is None."""

TARGET_IS_EMPTY_OR_FULLY_NULL = "target_is_empty_or_fully_null"
"""Message code for target data that is empty or has all null values."""

TARGET_HAS_NULL = "target_has_null"
"""Message code for target data that has null values."""

27 changes: 22 additions & 5 deletions evalml/data_checks/invalid_targets_data_check.py
@@ -2,12 +2,20 @@

from evalml.data_checks import (
DataCheck,
DataCheckAction,
DataCheckActionCode,
DataCheckError,
DataCheckMessageCode,
DataCheckWarning
)
from evalml.objectives import get_objective
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.problem_types import (
ProblemTypes,
handle_problem_types,
is_binary,
is_multiclass,
is_regression
)
from evalml.utils.woodwork_utils import (
_convert_woodwork_types_wrapper,
infer_feature_types,
@@ -57,7 +65,7 @@ def validate(self, X, y):
"code": "TARGET_HAS_NULL",\
"details": {"num_null_rows": 2, "pct_null_rows": 50}}],\
"warnings": [],\
"actions": []}
"actions": [{'code': 'IMPUTE_COL', 'metadata': {'column': None, 'impute_strategy': 'most_frequent', 'is_target': True}}]}
"""
results = {
"warnings": [],
@@ -82,18 +90,27 @@ def validate(self, X, y):
details={"unsupported_type": y.logical_type.type_string}).to_dict())
y_df = _convert_woodwork_types_wrapper(y.to_series())
null_rows = y_df.isnull()
if null_rows.any():
if null_rows.all():
results["errors"].append(DataCheckError(message="Target is either empty or fully null.",
data_check_name=self.name,
message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL,
details={}).to_dict())
return results
elif null_rows.any():
num_null_rows = null_rows.sum()
pct_null_rows = null_rows.mean() * 100
results["errors"].append(DataCheckError(message="{} row(s) ({}%) of target values are null".format(num_null_rows, pct_null_rows),
data_check_name=self.name,
message_code=DataCheckMessageCode.TARGET_HAS_NULL,
details={"num_null_rows": num_null_rows, "pct_null_rows": pct_null_rows}).to_dict())
impute_strategy = "mean" if is_regression(self.problem_type) else "most_frequent"
results["actions"].append(DataCheckAction(DataCheckActionCode.IMPUTE_COL,
metadata={"column": None, "is_target": True, "impute_strategy": impute_strategy}).to_dict())

value_counts = y_df.value_counts()
unique_values = value_counts.index.tolist()

if self.problem_type == ProblemTypes.BINARY and len(value_counts) != 2:
if is_binary(self.problem_type) and len(value_counts) != 2:
if self.n_unique is None:
details = {"target_values": unique_values}
else:
@@ -109,7 +126,7 @@ def validate(self, X, y):
message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
details={}).to_dict())

if self.problem_type == ProblemTypes.MULTICLASS:
if is_multiclass(self.problem_type):
if value_counts.min() <= 1:
least_populated = value_counts[value_counts <= 1]
details = {"least_populated_class_labels": least_populated.index.tolist()}
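
A short sketch of the new behavior, following the doctest above (the InvalidTargetDataCheck("binary", "Log Loss Binary") constructor call is assumed from that doctest):

```python
import pandas as pd
from evalml.data_checks import InvalidTargetDataCheck

X = pd.DataFrame({"col": [1, 2, 3, 1]})
y = pd.Series([0, 1, None, None])  # 2 of 4 target values are null

check = InvalidTargetDataCheck("binary", "Log Loss Binary")
results = check.validate(X, y)
# Binary is not a regression problem type, so the recommended strategy
# should be "most_frequent" rather than "mean".
print(results["actions"])
# expected: [{'code': 'IMPUTE_COL', 'metadata': {'column': None,
#             'impute_strategy': 'most_frequent', 'is_target': True}}]

# A fully null target should instead produce the new
# TARGET_IS_EMPTY_OR_FULLY_NULL error and return early, with no action.
results = check.validate(X, pd.Series([None] * 4))
print(results["actions"])  # expected: []
```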
1 change: 1 addition & 0 deletions evalml/pipelines/components/__init__.py
@@ -46,6 +46,7 @@
LSA,
PCA,
DFSTransformer,
TargetImputer,
PolynomialDetrender
)
from .ensemble import (
2 changes: 1 addition & 1 deletion evalml/pipelines/components/transformers/__init__.py
@@ -1,7 +1,7 @@
from .transformer import Transformer
from .encoders import OneHotEncoder, TargetEncoder
from .feature_selection import FeatureSelector, RFClassifierSelectFromModel, RFRegressorSelectFromModel
from .imputers import PerColumnImputer, SimpleImputer, Imputer
from .imputers import PerColumnImputer, SimpleImputer, Imputer, TargetImputer
from .scalers import StandardScaler
from .column_selectors import DropColumns, SelectColumns
from .dimensionality_reduction import LinearDiscriminantAnalysis, PCA
1 change: 1 addition & 0 deletions evalml/pipelines/components/transformers/imputers/__init__.py
@@ -1,3 +1,4 @@
from .per_column_imputer import PerColumnImputer
from .simple_imputer import SimpleImputer
from .imputer import Imputer
from .target_imputer import TargetImputer
121 changes: 121 additions & 0 deletions evalml/pipelines/components/transformers/imputers/target_imputer.py
@@ -0,0 +1,121 @@

from functools import wraps

import pandas as pd
from sklearn.impute import SimpleImputer as SkImputer

from evalml.exceptions import ComponentNotYetFittedError
from evalml.pipelines.components import ComponentBaseMeta
from evalml.pipelines.components.transformers import Transformer
from evalml.utils import (
_convert_woodwork_types_wrapper,
_retain_custom_types_and_initalize_woodwork,
infer_feature_types
)


class TargetImputerMeta(ComponentBaseMeta):
"""A version of the ComponentBaseMeta class which handles when input features is None"""

@classmethod
def check_for_fit(cls, method):
"""`check_for_fit` wraps a method that validates if `self._is_fitted` is `True`.
It raises an exception if `False` and calls and returns the wrapped method if `True`.
"""
@wraps(method)
def _check_for_fit(self, X=None, y=None):
klass = type(self).__name__
if not self._is_fitted and self.needs_fitting:
raise ComponentNotYetFittedError(f'This {klass} is not fitted yet. You must fit {klass} before calling {method.__name__}.')
else:
return method(self, X, y)
return _check_for_fit


class TargetImputer(Transformer, metaclass=TargetImputerMeta):
"""Imputes missing target data according to a specified imputation strategy."""
name = 'Target Imputer'
hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]}

def __init__(self, impute_strategy="most_frequent", fill_value=None, random_seed=0, **kwargs):
"""Initalizes an transformer that imputes missing target data according to the specified imputation strategy."
Arguments:
impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for
numerical data, and "most_frequent", "constant" for object data types.
fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data.
Defaults to 0 when imputing numerical data and "missing_value" for strings or object data types.
random_seed (int): Seed for the random number generator. Defaults to 0.
"""
parameters = {"impute_strategy": impute_strategy,
"fill_value": fill_value}
parameters.update(kwargs)
imputer = SkImputer(strategy=impute_strategy,
fill_value=fill_value,
**kwargs)
super().__init__(parameters=parameters,
component_obj=imputer,
random_seed=random_seed)

def fit(self, X, y):
"""Fits imputer to target data. 'None' values are converted to np.nan before imputation and are
treated as the same.
Arguments:
X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. Ignored.
y (ww.DataColumn, pd.Series, optional): The target training data of length [n_samples].
Returns:
self
"""
if y is None:
return self
y = infer_feature_types(y)
y = _convert_woodwork_types_wrapper(y.to_series()).to_frame()

# Convert all bool dtypes to category for fitting
if (y.dtypes == bool).all():
y = y.astype('category')

self._component_obj.fit(y)
return self

def transform(self, X, y):
"""Transforms input target data by imputing missing values. 'None' and np.nan values are treated as the same.
Arguments:
X (ww.DataTable, pd.DataFrame): Features. Ignored.
y (ww.DataColumn, pd.Series): Target data to impute.
Returns:
(ww.DataTable, ww.DataColumn): The original X, transformed y
"""

if X is not None:
X = infer_feature_types(X)
if y is None:
return X, None
y_ww = infer_feature_types(y)
y = _convert_woodwork_types_wrapper(y_ww.to_series())
y_df = y.to_frame()

# Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
if (y_df.dtypes == bool).all():
return X, _retain_custom_types_and_initalize_woodwork(y_ww, y)

transformed = self._component_obj.transform(y_df)
if transformed.shape[1] == 0:
raise RuntimeError("Transformed data is empty")
y_t = pd.Series(transformed[:, 0], index=y.index)
return X, _retain_custom_types_and_initalize_woodwork(y_ww, y_t)

def fit_transform(self, X, y):
"""Fits on and transforms the input target data.
Arguments:
X (ww.DataTable, pd.DataFrame): Features. Ignored.
y (ww.DataColumn, pd.Series): Target data to impute.
Returns:
(ww.DataTable, ww.DataColumn): The original X, transformed y
"""
return self.fit(X, y).transform(X, y)
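
A minimal usage sketch for the new component; X is ignored during fitting, and the code above appears to accept X=None in transform:

```python
import pandas as pd
from evalml.pipelines.components import TargetImputer

y = pd.Series([1.0, 2.0, None, 4.0])
imputer = TargetImputer(impute_strategy="mean")
_, y_t = imputer.fit_transform(None, y)
# The missing entry should be filled with the mean of the observed values,
# (1 + 2 + 4) / 3 ≈ 2.33, and y_t comes back as a ww.DataColumn.
print(y_t.to_series().tolist())  # expected: [1.0, 2.0, 2.333..., 4.0]
```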
5 changes: 5 additions & 0 deletions evalml/pipelines/utils.py
@@ -31,6 +31,7 @@
StackedEnsembleClassifier,
StackedEnsembleRegressor,
StandardScaler,
TargetImputer,
TextFeaturizer
)
from evalml.pipelines.components.utils import all_components, get_estimators
@@ -268,4 +269,8 @@ def _make_component_list_from_actions(actions):
for action in actions:
if action.action_code == DataCheckActionCode.DROP_COL:
components.append(DropColumns(columns=action.metadata["columns"]))
if action.action_code == DataCheckActionCode.IMPUTE_COL:
metadata = action.metadata
if metadata["is_target"]:
components.append(TargetImputer(impute_strategy=metadata["impute_strategy"]))
return components
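
A sketch of the round trip this enables; the action is built by hand here, but in practice it would come from a data check's validate output (note that _make_component_list_from_actions is a private helper):

```python
from evalml.data_checks import DataCheckAction, DataCheckActionCode
from evalml.pipelines.utils import _make_component_list_from_actions

# An IMPUTE_COL action targeting y, as InvalidTargetDataCheck now emits.
actions = [
    DataCheckAction(
        DataCheckActionCode.IMPUTE_COL,
        metadata={"column": None, "is_target": True, "impute_strategy": "most_frequent"},
    )
]
components = _make_component_list_from_actions(actions)
print(components)  # expected: [TargetImputer(impute_strategy="most_frequent")]
```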
38 changes: 30 additions & 8 deletions evalml/tests/component_tests/test_components.py
@@ -53,6 +53,7 @@
StandardScaler,
SVMClassifier,
SVMRegressor,
TargetImputer,
TextFeaturizer,
TimeSeriesBaselineEstimator,
Transformer,
@@ -534,7 +535,11 @@ def test_transformer_transform_output_type(X_y_binary):

component.fit(X, y=y)
transform_output = component.transform(X, y=y)
assert isinstance(transform_output, ww.DataTable)
if isinstance(component, TargetImputer):
assert isinstance(transform_output[0], ww.DataTable)
assert isinstance(transform_output[1], ww.DataColumn)
else:
assert isinstance(transform_output, ww.DataTable)

if isinstance(component, SelectColumns):
assert transform_output.shape == (X.shape[0], 0)
@@ -548,12 +553,20 @@ def test_transformer_transform_output_type(X_y_binary):
# We just want to check that DelayedFeaturesTransformer outputs a DataFrame
# The dataframe shape and index are checked in test_delayed_features_transformer.py
continue
elif isinstance(component, TargetImputer):
assert transform_output[0].shape == X.shape
assert transform_output[1].shape[0] == X.shape[0]
assert len(transform_output[1].shape) == 1
else:
assert transform_output.shape == X.shape
assert (list(transform_output.columns) == list(X_cols_expected))

transform_output = component.fit_transform(X, y=y)
assert isinstance(transform_output, ww.DataTable)
if isinstance(component, TargetImputer):
assert isinstance(transform_output[0], ww.DataTable)
assert isinstance(transform_output[1], ww.DataColumn)
else:
assert isinstance(transform_output, ww.DataTable)

if isinstance(component, SelectColumns):
assert transform_output.shape == (X.shape[0], 0)
@@ -563,6 +576,10 @@ def test_transformer_transform_output_type(X_y_binary):
elif isinstance(component, DFSTransformer):
assert transform_output.shape[0] == X.shape[0]
assert transform_output.shape[1] >= X.shape[1]
elif isinstance(component, TargetImputer):
assert transform_output[0].shape == X.shape
assert transform_output[1].shape[0] == X.shape[0]
assert len(transform_output[1].shape) == 1
else:
assert transform_output.shape == X.shape
assert (list(transform_output.columns) == list(X_cols_expected))
@@ -704,14 +721,14 @@ def test_all_transformers_check_fit(X_y_binary):

component = component_class()
with pytest.raises(ComponentNotYetFittedError, match=f'You must fit {component_class.__name__}'):
component.transform(X)
component.transform(X, y)

component.fit(X, y)
component.transform(X)
component.transform(X, y)

component = component_class()
component.fit_transform(X, y)
component.transform(X)
component.transform(X, y)


def test_all_estimators_check_fit(X_y_binary, ts_data, test_estimator_needs_fitting_false, helper_functions):
@@ -1067,9 +1084,14 @@ def test_transformer_fit_and_transform_respect_custom_indices(use_custom_index,
pd.testing.assert_index_equal(X.index, X_original_index)
pd.testing.assert_index_equal(y.index, y_original_index)

X_t = transformer.transform(X, y).to_dataframe()
pd.testing.assert_index_equal(X_t.index, X_original_index, check_names=check_names)
pd.testing.assert_index_equal(y.index, y_original_index, check_names=check_names)
if transformer_class == TargetImputer:
X_t, y_t = transformer.transform(X, y)
pd.testing.assert_index_equal(X_t.to_dataframe().index, X_original_index, check_names=check_names)
pd.testing.assert_index_equal(y_t.to_series().index, y_original_index, check_names=check_names)
else:
X_t = transformer.transform(X, y).to_dataframe()
pd.testing.assert_index_equal(X_t.index, X_original_index, check_names=check_names)
pd.testing.assert_index_equal(y.index, y_original_index, check_names=check_names)


@pytest.mark.parametrize("estimator_class", _all_estimators())
2 changes: 1 addition & 1 deletion evalml/tests/component_tests/test_simple_imputer.py
@@ -253,7 +253,7 @@ def test_simple_imputer_does_not_reset_index():
imputer = SimpleImputer(impute_strategy="mean")
imputer.fit(X, y=y)
transformed = imputer.transform(X)
pd.testing.assert_frame_equal(pd.DataFrame({'input_val': [1.0, 2, 3, 4, 5, 6, 7, 8, 9]},
pd.testing.assert_frame_equal(pd.DataFrame({'input_val': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
dtype=float,
index=list(range(1, 10))),
transformed.to_dataframe())