"""Module pointing to different implementations of Data class
-DiCE requires only few parameters about the data such as the range of continuous features and the levels of categorical features. Hence, DiCE can be used for a private data whose meta data are only available (such as the feature names and range/levels of different features) by specifying appropriate parameters.
+DiCE requires only a few parameters about the data, such as the range of continuous
+features and the levels of categorical features. Hence, DiCE can be used for private
+data whose metadata alone is available (such as the feature names and
+range/levels of different features) by specifying the appropriate parameters.
"""
+from dice_ml.data_interfaces.base_data_interface import _BaseData
-[docs]class Data:
+
+
+[docs]class Data(_BaseData):
"""Class containing all required information about the data for DiCE."""
def __init__(self, **params):
@@ -176,24 +185,27 @@
Source code for dice_ml.data
:param **params: a dictionary of required parameters.
"""
-
self.decide_implementation_type(params)
[docs] def decide_implementation_type(self, params):
"""Decides if the Data class is for public or private data."""
-
-        self.__class__ = decide(params)
+        self.__class__ = decide(params)
self.__init__(params)
-# To add new implementations of Data, add the class in data_interfaces subpackage and import-and-return the class in an elif loop as shown in the below method.
 [docs]def decide(params):
-    """Decides if the Data class is for public or private data."""
-
-    if 'dataframe' in params: # if params contain a Pandas dataframe, then use PublicData class
+    """Decides if the Data class is for public or private data.
+
+    To add new implementations of Data, add the class in data_interfaces
+    subpackage and import-and-return the class in an elif loop as shown
+    in the below method.
+    """
+    if 'dataframe' in params:
+        # if params contain a Pandas dataframe, then use PublicData class
         from dice_ml.data_interfaces.public_data_interface import PublicData
         return PublicData
-    else: # use PrivateData if only meta data is provided
+    else:
+        # use PrivateData if only meta data is provided
         from dice_ml.data_interfaces.private_data_interface import PrivateData
         return PrivateData
diff --git a/docs/_modules/dice_ml/data_interfaces/private_data_interface.html b/docs/_modules/dice_ml/data_interfaces/private_data_interface.html
index 2e52c5ff..18c77891 100644
--- a/docs/_modules/dice_ml/data_interfaces/private_data_interface.html
+++ b/docs/_modules/dice_ml/data_interfaces/private_data_interface.html
@@ -7,7 +7,7 @@
-dice_ml.data_interfaces.private_data_interface — DiCE 0.5 documentation
+dice_ml.data_interfaces.private_data_interface — DiCE 0.7 documentation
@@ -88,9 +90,11 @@
Notebooks:
- Quick introduction to generating counterfactual explanations using DiCE
-- Defining meta data
-- Loading trained ML model
-- Generate diverse counterfactuals
+- Estimating local and global feature importance scores using DiCE
+- Generating counterfactuals for multi-class classification and regression models
+- Regression
+- Generating counterfactual explanations with any ML model
+- Generating Counterfactual Explanations without access to training data
- Advanced options to customize Counterfactual Explanations
- Generate feasible counterfactual explanations using a VAE
- Adding feasibility constraints
@@ -167,44 +171,49 @@ Source code for dice_ml.data_interfaces.private_data_interface
 import sys
 import pandas as pd
 import numpy as np
-from sklearn.model_selection import train_test_split
 import collections
-from collections import OrderedDict
 import logging
+
+from dice_ml.data_interfaces.base_data_interface import _BaseData
+
 logging.basicConfig(level=logging.NOTSET)
-from sklearn.preprocessing import LabelEncoder
-[docs]class PrivateData:
+
+
+[docs]class PrivateData(_BaseData):
"""A data interface for private data with meta information."""
def __init__(self, params):
"""Init method
-
:param features: Dictionary or OrderedDict with feature names as keys and range in int/float (for continuous features) or categories in string (for categorical features) as values. For python version <=3.6, should provide only an OrderedDict.
+
:param features: Dictionary or OrderedDict with feature names as keys and range in int/float
+
(for continuous features) or categories in string (for categorical features)
+
as values. For python version <=3.6, should provide only an OrderedDict.
:param outcome_name: Outcome feature name.
-
:param type_and_precision (optional): Dictionary with continuous feature names as keys. If the feature is of type int, just string 'int' should be provided, if the feature is of type float, a list of type and precision should be provided. For instance, type_and_precision: {cont_f1: 'int', cont_f2: ['float', 2]} for continuous features cont_f1 and cont_f2 of type int and float (and precision up to 2 decimal places) respectively. Default value is None and all features are treated as int.
-
:param mad (optional): Dictionary with feature names as keys and corresponding Median Absolute Deviations (MAD) as values. Default MAD value is 1 for all features.
+
:param type_and_precision (optional): Dictionary with continuous feature names as keys.
+
If the feature is of type int, just string 'int' should be provided,
+
if the feature is of type float, a list of type and precision should be
+
provided. For instance, type_and_precision: {cont_f1: 'int',
+
cont_f2: ['float', 2]} for continuous features cont_f1 and cont_f2 of
+
type int and float (and precision up to 2 decimal places) respectively.
+
Default value is None and all features are treated as int.
+
:param mad (optional): Dictionary with feature names as keys and corresponding Median Absolute Deviations (MAD)
+
as values.
+
Default MAD value is 1 for all features.
:param data_name (optional): Dataset name
-
"""
-        if sys.version_info > (3,6,0) and type(params['features']) in [dict, collections.OrderedDict]:
+        if sys.version_info > (3, 6, 0) and type(params['features']) in [dict, collections.OrderedDict]:
             features_dict = params['features']
-        elif sys.version_info <= (3,6,0) and type(params['features']) is collections.OrderedDict:
+        elif sys.version_info <= (3, 6, 0) and type(params['features']) is collections.OrderedDict:
             features_dict = params['features']
         else:
             raise ValueError(
-                "should provide dictionary with feature names as keys and range (for continuous features) or categories (for categorical features) as values. For python version <3.6, should provide an OrderedDict")
+                "should provide dictionary with feature names as keys and range "
+                "(for continuous features) or categories (for categorical features) as values. "
+                "For python version <3.6, should provide an OrderedDict")
-
if type(params['outcome_name']) is str:
-
self.outcome_name = params['outcome_name']
-
else:
-
raise ValueError("should provide the name of outcome feature")
-
-
if 'type_and_precision' in params:
-
self.type_and_precision = params['type_and_precision']
-
else:
-
self.type_and_precision = {}
+
self._validate_and_set_outcome_name(params=params)
+
self._validate_and_set_type_and_precision(params=params)
self.continuous_feature_names = []
self.permitted_range = {}
@@ -219,10 +228,7 @@
Source code for dice_ml.data_interfaces.private_data_interface
self
.categorical_feature_names.append(feature)
self.categorical_levels[feature] = features_dict[feature]
-
if 'mad' in params:
-
self.mad = params['mad']
-
else:
-
self.mad = {}
+
self._validate_and_set_mad(params=params)
# self.continuous_feature_names + self.categorical_feature_names
self.feature_names = list(features_dict.keys())
@@ -244,16 +250,28 @@
Source code for dice_ml.data_interfaces.private_data_interface
#
# for column in self.categorical_feature_names:
# self.labelencoder[column] = LabelEncoder()
-
# self.label_encoded_data[column] = self.labelencoder[column].fit_transform(self.categorical_levels[column])
+
# self.label_encoded_data[column] = \
+
# self.labelencoder[column].fit_transform(self.categorical_levels[column])
# self.max_range = -np.inf
# for feature in self.continuous_feature_names:
# self.max_range = max(self.max_range, self.permitted_range[feature][1])
-
if 'data_name' in params:
-
self.data_name = params['data_name']
+
self._validate_and_set_data_name(params=params)
+
+
def _validate_and_set_type_and_precision(self, params):
+
"""Validate and set the type and precision."""
+
if 'type_and_precision' in params:
+
self.type_and_precision = params['type_and_precision']
else:
-
self.data_name = 'mydata'
+
self.type_and_precision = {}
+
+
def _validate_and_set_mad(self, params):
+
"""Validate and set the MAD."""
+
if 'mad' in params:
+
self.mad = params['mad']
+
else:
+
self.mad = {}
[docs] def one_hot_encode_data(self, data):
"""One-hot-encodes the data."""
@@ -342,8 +360,10 @@
Source code for dice_ml.data_interfaces.private_data_interface
# one-hot-encoded data is same as original data if there is no categorical features.
self.ohe_encoded_feature_names = [feat for feat in self.feature_names]
- self.ohe_base_df = self.prepare_df_for_ohe_encoding() # base dataframe for doing one-hot-encoding
-
# ohe_encoded_feature_names and ohe_base_df are created (and stored as data class's parameters) when get_data_params_for_gradient_dice() is called from gradient-based DiCE explainers
+
# base dataframe for doing one-hot-encoding
+
# ohe_encoded_feature_names and ohe_base_df are created (and stored as data class's parameters)
+
# when get_data_params_for_gradient_dice() is called from gradient-based DiCE explainers
+
self.ohe_base_df = self.prepare_df_for_ohe_encoding()
[docs] def get_data_params_for_gradient_dice(self):
"""Gets all data related params for DiCE."""
@@ -364,8 +384,8 @@
Source code for dice_ml.data_interfaces.private_data_interface
# decimal precisions for continuous features
cont_precisions = [self.get_decimal_precisions()[ix] for ix in range(len(self.continuous_feature_names))]
- return minx, maxx, encoded_categorical_feature_indexes, encoded_continuous_feature_indexes, cont_minx, cont_maxx, cont_precisions
-
+
return minx, maxx, encoded_categorical_feature_indexes, encoded_continuous_feature_indexes, \
+
cont_minx, cont_maxx, cont_precisions
[docs] def get_encoded_categorical_feature_indexes(self):
"""Gets the column indexes categorical features after one-hot-encoding."""
@@ -407,10 +427,10 @@
Source code for dice_ml.data_interfaces.private_data_interface
[docs] def from_dummies(self, data, prefix_sep='_'):
"""Gets the original data from dummy encoded data with k levels."""
out = data.copy()
-
for l in self.categorical_feature_names:
+
for feature_name in self.categorical_feature_names:
cols, labs = [[c.replace(
-
x, "") for c in data.columns if l+prefix_sep in c] for x in ["", l+prefix_sep]]
-
out[l] = pd.Categorical(
+
x, "") for c in data.columns if feature_name+prefix_sep in c] for x in ["", feature_name+prefix_sep]]
+
out[feature_name] = pd.Categorical(
np.array(labs)[np.argmax(data[cols].values, axis=1)])
out.drop(cols, axis=1, inplace=True)
return out
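A toy illustration (column names invented) of the k-level dummy decoding used in from_dummies above: the argmax over the dummy columns for one prefix recovers the category label.

    import numpy as np
    import pandas as pd

    dummies = pd.DataFrame({'color_red': [1, 0], 'color_blue': [0, 1]})
    labels = np.array(['red', 'blue'])[np.argmax(dummies.values, axis=1)]
    # labels -> ['red', 'blue'], which from_dummies stores as a Categorical column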
@@ -494,22 +514,26 @@
Source code for dice_ml.data_interfaces.private_data_interface
return test
[docs] def get_ohe_min_max_normalized_data(self, query_instance):
-
"""Transforms query_instance into one-hot-encoded and min-max normalized data. query_instance should be a dict, a dataframe, a list, or a list of dicts"""
+
"""Transforms query_instance into one-hot-encoded and min-max normalized data. query_instance should be a dict,
+
a dataframe, a list, or a list of dicts"""
query_instance = self.prepare_query_instance(query_instance)
temp = self.ohe_base_df.append(query_instance, ignore_index=True, sort=False)
temp = self.one_hot_encode_data(temp)
temp = temp.tail(query_instance.shape[0]).reset_index(drop=True)
-
return self.normalize_data(temp) # returns a pandas dataframe
+
# returns a pandas dataframe
+
return self.normalize_data(temp)
[docs] def get_inverse_ohe_min_max_normalized_data(self, transformed_data):
-
"""Transforms one-hot-encoded and min-max normalized data into raw user-fed data format. transformed_data should be a dataframe or an array"""
+
"""Transforms one-hot-encoded and min-max normalized data into raw user-fed data format. transformed_data
+
should be a dataframe or an array"""
raw_data = self.get_decoded_data(transformed_data, encoding='one-hot')
raw_data = self.de_normalize_data(raw_data)
precisions = self.get_decimal_precisions()
for ix, feature in enumerate(self.continuous_feature_names):
raw_data[feature] = raw_data[feature].astype(float).round(precisions[ix])
raw_data = raw_data[self.feature_names]
-
return raw_data # returns a pandas dataframe
+
# returns a pandas dataframe
+
return raw_data
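A sketch of the round trip these two methods provide, assuming d_private is the metadata-only Data object built earlier; per the comment above, ohe_base_df is prepared when get_data_params_for_gradient_dice() is called, so that call comes first here.

    d_private.get_data_params_for_gradient_dice()
    query = {'age': 30, 'hours_per_week': 42.5, 'education': 'Bachelors'}
    encoded = d_private.get_ohe_min_max_normalized_data(query)            # one-hot + [0,1]-scaled dataframe
    decoded = d_private.get_inverse_ohe_min_max_normalized_data(encoded)  # back to the raw feature format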
diff --git a/docs/_modules/dice_ml/data_interfaces/public_data_interface.html b/docs/_modules/dice_ml/data_interfaces/public_data_interface.html
index 675ec317..ed22dd39 100644
--- a/docs/_modules/dice_ml/data_interfaces/public_data_interface.html
+++ b/docs/_modules/dice_ml/data_interfaces/public_data_interface.html
@@ -7,7 +7,7 @@
- dice_ml.data_interfaces.public_data_interface — DiCE 0.5 documentation
+ dice_ml.data_interfaces.public_data_interface — DiCE 0.7 documentation
@@ -162,17 +166,22 @@
Source code for dice_ml.data_interfaces.public_data_interface
-"""Module containing all required information about the interface between raw (or transformed) public data and DiCE explainers."""
+"""Module containing all required information about the interface between raw (or transformed)
+public data and DiCE explainers."""
import pandas as pd
import numpy as np
-from sklearn.model_selection import train_test_split
import logging
from collections import defaultdict
+from dice_ml.data_interfaces.base_data_interface import _BaseData
+from dice_ml.utils.exception import SystemException, UserConfigValidationException
-[docs]class PublicData:
-    """A data interface for public data. This class is an interface to DiCE explainers and contains methods to transform user-fed raw data into the format a DiCE explainer requires, and vice versa."""
+
+
+[docs]class PublicData(_BaseData):
+    """A data interface for public data. This class is an interface to DiCE explainers
+    and contains methods to transform user-fed raw data into the format a DiCE explainer
+    requires, and vice versa."""
def __init__(self, params):
"""Init method
@@ -180,44 +189,32 @@
Source code for dice_ml.data_interfaces.public_data_interface
:param dataframe: The train dataframe used by explainer method.
:param continuous_features: List of names of continuous features. The remaining features are categorical features.
:param outcome_name: Outcome feature name.
-
:param permitted_range (optional): Dictionary with feature names as keys and permitted range in list as values. Defaults to the range inferred from training data.
+
:param permitted_range (optional): Dictionary with feature names as keys and permitted range in list as values.
+
Defaults to the range inferred from training data.
:param continuous_features_precision (optional): Dictionary with feature names as keys and precisions as values.
:param data_name (optional): Dataset name
-
"""
+
self._validate_and_set_outcome_name(params=params)
+
self._validate_and_set_dataframe(params=params)
+
self._validate_and_set_continuous_features(params=params)
-
if isinstance(params['dataframe'], pd.DataFrame):
-
self.data_df = params['dataframe']
-
else:
-
raise ValueError("should provide a pandas dataframe")
+
self.feature_names = [
+
name for name in self.data_df.columns.tolist() if name != self.outcome_name]
-
if type(params['continuous_features']) is list:
-
self.continuous_feature_names = params['continuous_features']
-
else:
-
raise ValueError(
-
"should provide the name(s) of continuous features in the data")
+
self.number_of_features = len(self.feature_names)
-
if type(params['outcome_name']) is str:
-
self.outcome_name = params['outcome_name']
-
else:
-
raise ValueError("should provide the name of outcome feature")
+
if len(set(self.continuous_feature_names) - set(self.feature_names)) != 0:
+
raise UserConfigValidationException(
+
"continuous_features contains some feature names which are not part of columns in dataframe"
+
)
self.categorical_feature_names = [name for name in self.data_df.columns.tolist(
) if name not in self.continuous_feature_names + [self.outcome_name]]
-
self.feature_names = [
-
name for name in self.data_df.columns.tolist() if name != self.outcome_name]
-
-
self.continuous_feature_indexes = [self.data_df.columns.get_loc(
-
name) for name in self.continuous_feature_names if name in self.data_df]
-
self.categorical_feature_indexes = [self.data_df.columns.get_loc(
name) for name in self.categorical_feature_names if name in self.data_df]
-
if 'continuous_features_precision' in params:
-
self.continuous_features_precision = params['continuous_features_precision']
-
else:
-
self.continuous_features_precision = None
+
self._validate_and_set_continuous_features_precision(params=params)
if len(self.categorical_feature_names) > 0:
for feature in self.categorical_feature_names:
@@ -235,34 +232,119 @@
Source code for dice_ml.data_interfaces.public_data_interface
np
.int32)
# should move the below snippet to gradient based dice interfaces
-
# self.one_hot_encoded_data = self.one_hot_encode_data(self.data_df)
-
# self.ohe_encoded_feature_names = [x for x in self.one_hot_encoded_data.columns.tolist(
-
# ) if x not in np.array([self.outcome_name])]
+
# self.one_hot_encoded_data = self.one_hot_encode_data(self.data_df)
+
# self.ohe_encoded_feature_names = [x for x in self.one_hot_encoded_data.columns.tolist(
+
# ) if x not in np.array([self.outcome_name])]
# should move the below snippet to model agnostic dice interfaces
-
# # Initializing a label encoder to obtain label-encoded values for categorical variables
-
# self.labelencoder = {}
-
#
-
# self.label_encoded_data = self.data_df.copy()
-
#
-
# for column in self.categorical_feature_names:
-
# self.labelencoder[column] = LabelEncoder()
-
# self.label_encoded_data[column] = self.labelencoder[column].fit_transform(self.data_df[column])
+
# # Initializing a label encoder to obtain label-encoded values for categorical variables
+
# self.labelencoder = {}
+
#
+
# self.label_encoded_data = self.data_df.copy()
+
#
+
# for column in self.categorical_feature_names:
+
# self.labelencoder[column] = LabelEncoder()
+
# self.label_encoded_data[column] = self.labelencoder[column].fit_transform(self.data_df[column])
-
input_permitted_range = None
-
if 'permitted_range' in params:
-
input_permitted_range = params['permitted_range']
-
self.permitted_range, feature_ranges_orig = self.get_features_range(input_permitted_range)
+
self._validate_and_set_permitted_range(params=params)
# should move the below snippet to model agnostic dice interfaces
-
# self.max_range = -np.inf
-
# for feature in self.continuous_feature_names:
-
# self.max_range = max(self.max_range, self.permitted_range[feature][1])
+
# self.max_range = -np.inf
+
# for feature in self.continuous_feature_names:
+
# self.max_range = max(self.max_range, self.permitted_range[feature][1])
+
+
self._validate_and_set_data_name(params=params)
+
+
def _validate_and_set_dataframe(self, params):
+
"""Validate and set the dataframe."""
+
if 'dataframe' not in params:
+
raise ValueError("dataframe not found in params")
-
if 'data_name' in params:
-
self.data_name = params['data_name']
+
if isinstance(params['dataframe'], pd.DataFrame):
+
self.data_df = params['dataframe'].copy()
else:
-
self.data_name = 'mydata'
+
raise ValueError("should provide a pandas dataframe")
+
+
if 'outcome_name' in params and params['outcome_name'] not in self.data_df.columns.tolist():
+
raise UserConfigValidationException(
+
"outcome_name {0} not found in {1}".format(
+
params['outcome_name'], ','.join(self.data_df.columns.tolist())
+
)
+
)
+
+
def _validate_and_set_continuous_features(self, params):
+
"""Validate and set the list of continuous features."""
+
if 'continuous_features' not in params:
+
raise ValueError('continuous_features should be provided')
+
+
if type(params['continuous_features']) is list:
+
self.continuous_feature_names = params['continuous_features']
+
else:
+
raise ValueError(
+
"should provide the name(s) of continuous features in the data as a list")
+
+
def _validate_and_set_continuous_features_precision(self, params):
+
"""Validate and set the dictionary of precision for continuous features."""
+
if 'continuous_features_precision' in params:
+
self.continuous_features_precision = params['continuous_features_precision']
+
+
if not hasattr(self, 'feature_names'):
+
raise SystemException('Feature names not correctly set in public data interface')
+
+
for continuous_features_precision_feature_name in self.continuous_features_precision:
+
if continuous_features_precision_feature_name not in self.feature_names:
+
raise UserConfigValidationException(
+
"continuous_features_precision contains some feature names which are not part of columns in dataframe"
+
)
+
else:
+
self.continuous_features_precision = None
+
+
def _validate_and_set_permitted_range(self, params):
+
"""Validate and set the dictionary of permitted ranges for continuous features."""
+
input_permitted_range = None
+
if 'permitted_range' in params:
+
input_permitted_range = params['permitted_range']
+
+
if not hasattr(self, 'feature_names'):
+
raise SystemException('Feature names not correctly set in public data interface')
+
+
for input_permitted_range_feature_name in input_permitted_range:
+
if input_permitted_range_feature_name not in self.feature_names:
+
raise UserConfigValidationException(
+
"permitted_range contains some feature names which are not part of columns in dataframe"
+
)
+
self.permitted_range, _ = self.get_features_range(input_permitted_range)
+
+
[docs] def check_features_to_vary(self, features_to_vary):
+
if features_to_vary is not None and features_to_vary != 'all':
+
not_training_features = set(features_to_vary) - set(self.feature_names)
+
if len(not_training_features) > 0:
+
raise UserConfigValidationException("Got features {0} which are not present in training data".format(
+
not_training_features))
+
+
[docs] def check_permitted_range(self, permitted_range):
+
if permitted_range is not None:
+
permitted_range_features = list(permitted_range)
+
not_training_features = set(permitted_range_features) - set(self.feature_names)
+
if len(not_training_features) > 0:
+
raise UserConfigValidationException("Got features {0} which are not present in training data".format(
+
not_training_features))
+
+
for feature in permitted_range_features:
+
if feature in self.categorical_feature_names:
+
train_categories = self.permitted_range[feature]
+
for test_category in permitted_range[feature]:
+
if test_category not in train_categories:
+
raise UserConfigValidationException(
+
'The category {0} does not occur in the training data for feature {1}.'
+
' Allowed categories are {2}'.format(test_category, feature, train_categories))
+
+
[docs] def check_mad_validity(self, feature_weights):
+
"""checks feature MAD validity and throw warnings.
+
TODO: add comments as to where this is used if this function is necessary, else remove.
+
"""
+
if feature_weights == "inverse_mad":
+
self.get_valid_mads(display_warnings=True, return_mads=False)
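A hypothetical misuse matching the checks above (d_public as built earlier; 'gender' was never a training column), which raises UserConfigValidationException:

    d_public.check_features_to_vary(features_to_vary=['age', 'gender'])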
[docs] def get_features_range(self, permitted_range_input=None):
ranges = {}
@@ -281,9 +363,9 @@
Source code for dice_ml.data_interfaces.public_data_interface
[docs] def get_data_type(self, col):
"""Infers data type of a continuous feature from the training data."""
-
if ((self.data_df[col].dtype == np.int64) or (self.data_df[col].dtype == np.int32)):
+
if (self.data_df[col].dtype == np.int64) or (self.data_df[col].dtype == np.int32):
return 'int'
-
elif ((self.data_df[col].dtype == np.float64) or (self.data_df[col].dtype == np.float32)):
+
elif (self.data_df[col].dtype == np.float64) or (self.data_df[col].dtype == np.float32):
return 'float'
else:
raise ValueError("Unknown data type of feature %s: must be int or float" % col)
@@ -295,17 +377,31 @@
Source code for dice_ml.data_interfaces.public_data_interface
[docs] def normalize_data(self, df):
"""Normalizes continuous features to make them fall in the range [0,1]."""
result = df.copy()
-
for feature_name in self.continuous_feature_names:
-
max_value = self.data_df[feature_name].max()
-
min_value = self.data_df[feature_name].min()
-
result[feature_name] = (
-
df[feature_name] - min_value) / (max_value - min_value)
-
#if encoding == 'label':
-
# for ix in self.categorical_feature_indexes:
-
# feature_name = self.feature_names[ix]
-
# max_value = len(self.train_df[feature_name].unique())-1
-
# min_value = 0
-
# result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
+
if isinstance(df, pd.DataFrame) or isinstance(df, dict):
+
for feature_name in self.continuous_feature_names:
+
max_value = self.data_df[feature_name].max()
+
min_value = self.data_df[feature_name].min()
+
if min_value == max_value:
+
result[feature_name] = 0
+
else:
+
result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
+
else:
+
result = result.astype('float')
+
for feature_index in self.continuous_feature_indexes:
+
feature_name = self.feature_names[feature_index]
+
max_value = self.data_df[feature_name].max()
+
min_value = self.data_df[feature_name].min()
+
if len(df.shape) == 1:
+
if min_value == max_value:
+
value = 0
+
else:
+
value = (df[feature_index] - min_value) / (max_value - min_value)
+
result[feature_index] = value
+
else:
+
if min_value == max_value:
+
result[:, feature_index] = np.zeros(len(df[:, feature_index]))
+
else:
+
result[:, feature_index] = (df[:, feature_index] - min_value) / (max_value - min_value)
return result
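Toy arithmetic for the scaling above (training min/max invented); de_normalize_data inverts it via x = x_norm * (max - min) + min.

    # invented training statistics for one continuous feature
    min_value, max_value = 20.0, 60.0
    x_norm = (45.0 - min_value) / (max_value - min_value)   # 0.625; constant features (min == max) map to 0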
[docs] def de_normalize_data(self, df):
@@ -338,9 +434,9 @@
Source code for dice_ml.data_interfaces.public_data_interface
if
normalized:
minx = (feature_range_input[feature_name]
-
[0] - min_value) / (max_value - min_value)
+
[0] - min_value) / (max_value - min_value)
maxx = (feature_range_input[feature_name]
-
[1] - min_value) / (max_value - min_value)
+
[1] - min_value) / (max_value - min_value)
else:
minx = feature_range_input[feature_name][0]
maxx = feature_range_input[feature_name][1]
@@ -369,7 +465,7 @@
Source code for dice_ml.data_interfaces.public_data_interface
minx[0][idx] = self.permitted_range[feature_name][0]
maxx[0][idx] = self.permitted_range[feature_name][1]
return minx, maxx
-
#if encoding=='one-hot':
+
# if encoding=='one-hot':
# minx = np.array([[0.0] * len(self.ohe_encoded_feature_names)])
# maxx = np.array([[1.0] * len(self.ohe_encoded_feature_names)])
@@ -385,7 +481,7 @@
Source code for dice_ml.data_interfaces.public_data_interface
# else:
# minx[0][idx] = self.permitted_range[feature_name][0]
# maxx[0][idx] = self.permitted_range[feature_name][1]
-
#else:
+
# else:
# minx = np.array([[0.0] * len(self.feature_names)])
# maxx = np.array([[1.0] * len(self.feature_names)])
@@ -441,8 +537,10 @@
Source code for dice_ml.data_interfaces.public_data_interface
# one-hot-encoded data is same as original data if there is no categorical features.
self.ohe_encoded_feature_names = [feat for feat in self.feature_names]
- self.ohe_base_df = self.prepare_df_for_ohe_encoding() # base dataframe for doing one-hot-encoding
-
# ohe_encoded_feature_names and ohe_base_df are created (and stored as data class's parameters) when get_data_params_for_gradient_dice() is called from gradient-based DiCE explainers
+
# base dataframe for doing one-hot-encoding
+
# ohe_encoded_feature_names and ohe_base_df are created (and stored as data class's parameters)
+
# when get_data_params_for_gradient_dice() is called from gradient-based DiCE explainers
+
self.ohe_base_df = self.prepare_df_for_ohe_encoding()
[docs] def get_data_params_for_gradient_dice(self):
"""Gets all data related params for DiCE."""
@@ -463,7 +561,8 @@
Source code for dice_ml.data_interfaces.public_data_interface
# decimal precisions for continuous features
cont_precisions = [self.get_decimal_precisions()[ix] for ix in range(len(self.continuous_feature_names))]
- return minx, maxx, encoded_categorical_feature_indexes, encoded_continuous_feature_indexes, cont_minx, cont_maxx, cont_precisions
+
return minx, maxx, encoded_categorical_feature_indexes, encoded_continuous_feature_indexes, cont_minx, \
+
cont_maxx, cont_precisions
[docs] def get_encoded_categorical_feature_indexes(self):
"""Gets the column indexes categorical features after one-hot-encoding."""
@@ -515,7 +614,8 @@
Source code for dice_ml.data_interfaces.public_data_interface
match_cols
= [c for c in data.columns if
c in cat_col_values] # check for the above matching columns in the encoded data
-
# then, recreate original data by removing the suffixes - based on the GitHub issue comment: https://github.com/pandas-dev/pandas/issues/8745#issuecomment-417861271
+
# then, recreate original data by removing the suffixes - based on the GitHub issue comment:
+
# https://github.com/pandas-dev/pandas/issues/8745#issuecomment-417861271
cols, labs = [[c.replace(
x, "") for c in match_cols] for x in ["", feat + prefix_sep]]
out[feat] = pd.Categorical(
@@ -525,14 +625,15 @@
Source code for dice_ml.data_interfaces.public_data_interface
[docs] def get_decimal_precisions(self, output_type="list"):
""""Gets the precision of continuous features in the data."""
-
# if the precision of a continuous feature is not given, we use the maximum precision of the modes to capture the precision of majority of values in the column.
+
# if the precision of a continuous feature is not given, we use the maximum precision of the modes to capture the
+
# precision of majority of values in the column.
precisions_dict = defaultdict(int)
precisions = [0] * len(self.feature_names)
for ix, col in enumerate(self.continuous_feature_names):
-
if ((self.continuous_features_precision is not None) and (col in self.continuous_features_precision)):
+
if (self.continuous_features_precision is not None) and (col in self.continuous_features_precision):
precisions[ix] = self.continuous_features_precision[col]
precisions_dict[col] = self.continuous_features_precision[col]
-
elif ((self.data_df[col].dtype == np.float32) or (self.data_df[col].dtype == np.float64)):
+
elif self.data_df[col].dtype == np.float32 or self.data_df[col].dtype == np.float64:
modes = self.data_df[col].mode()
maxp = len(str(modes[0]).split('.')[1]) # maxp stores the maximum precision of the modes
for mx in range(len(modes)):
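A toy run of the mode-based heuristic above (values invented): the digits after the decimal point in the modal value give the inferred precision.

    import pandas as pd

    modes = pd.Series([1.25, 1.25, 3.5]).mode()   # single mode: 1.25
    maxp = len(str(modes[0]).split('.')[1])       # 2 decimal places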
@@ -613,7 +714,8 @@
Source code for dice_ml.data_interfaces.public_data_interface
test = test.reset_index(drop=True)
return test
-
# TODO: create a new method, get_LE_min_max_normalized_data() to get label-encoded and normalized data. Keep this method only for converting query_instance to pd.DataFrame
+
# TODO: create a new method, get_LE_min_max_normalized_data() to get label-encoded and normalized data. Keep this
+
# method only for converting query_instance to pd.DataFrame
# if encoding == 'label':
# for column in self.categorical_feature_names:
# test[column] = self.labelencoder[column].transform(test[column])
@@ -628,22 +730,26 @@
Source code for dice_ml.data_interfaces.public_data_interface
# return temp.tail(test.shape[0]).reset_index(drop=True)
[docs] def get_ohe_min_max_normalized_data(self, query_instance):
-
"""Transforms query_instance into one-hot-encoded and min-max normalized data. query_instance should be a dict, a dataframe, a list, or a list of dicts"""
+
"""Transforms query_instance into one-hot-encoded and min-max normalized data. query_instance should be a dict,
+
a dataframe, a list, or a list of dicts"""
query_instance = self.prepare_query_instance(query_instance)
temp = self.ohe_base_df.append(query_instance, ignore_index=True, sort=False)
temp = self.one_hot_encode_data(temp)
temp = temp.tail(query_instance.shape[0]).reset_index(drop=True)
-
return self.normalize_data(temp) # returns a pandas dataframe
+
# returns a pandas dataframe
+
return self.normalize_data(temp)
[docs] def get_inverse_ohe_min_max_normalized_data(self, transformed_data):
-
"""Transforms one-hot-encoded and min-max normalized data into raw user-fed data format. transformed_data should be a dataframe or an array"""
+
"""Transforms one-hot-encoded and min-max normalized data into raw user-fed data format. transformed_data
+
should be a dataframe or an array"""
raw_data = self.get_decoded_data(transformed_data, encoding='one-hot')
raw_data = self.de_normalize_data(raw_data)
precisions = self.get_decimal_precisions()
for ix, feature in enumerate(self.continuous_feature_names):
raw_data[feature] = raw_data[feature].astype(float).round(precisions[ix])
raw_data = raw_data[self.feature_names]
-
return raw_data # returns a pandas dataframe
+
# returns a pandas dataframe
+
return raw_data
diff --git a/docs/_modules/dice_ml/dice.html b/docs/_modules/dice_ml/dice.html
index 69671998..dc839d6b 100644
--- a/docs/_modules/dice_ml/dice.html
+++ b/docs/_modules/dice_ml/dice.html
@@ -7,7 +7,7 @@
-dice_ml.dice — DiCE 0.5 documentation
+dice_ml.dice — DiCE 0.7 documentation
@@ -162,9 +166,17 @@
Source code for dice_ml.dice
-"""Module pointing to different implementations of DiCE based on different frameworks such as Tensorflow or PyTorch or sklearn, and different methods such as RandomSampling, DiCEKD or DiCEGenetic"""
+"""Module pointing to different implementations of DiCE based on different
+ frameworks such as Tensorflow or PyTorch or sklearn, and different methods
+ such as RandomSampling, DiCEKD or DiCEGenetic"""
-[docs]class Dice:
+from dice_ml.constants import BackEndTypes, SamplingStrategy
+from dice_ml.utils.exception import UserConfigValidationException
+from dice_ml.explainer_interfaces.explainer_base import ExplainerBase
+from dice_ml.data_interfaces.private_data_interface import PrivateData
+
+
+[docs]class Dice(ExplainerBase):
"""An interface class to different DiCE implementations."""
def __init__(self, data_interface, model_interface, method="random", **kwargs):
@@ -173,46 +185,71 @@
Source code for dice_ml.dice
:param data_interface: an interface to access data related params.
:param model_interface: an interface to access the output or gradients of a trained ML model.
:param method: Name of the method to use for generating counterfactuals
-
         """
-
         self.decide_implementation_type(data_interface, model_interface, method, **kwargs)

 [docs] def decide_implementation_type(self, data_interface, model_interface, method, **kwargs):
         """Decides DiCE implementation type."""
+        if model_interface.backend == BackEndTypes.Sklearn:
+            if method == SamplingStrategy.KdTree and isinstance(data_interface, PrivateData):
+                raise UserConfigValidationException(
+                    'Private data interface is not supported with sklearn kdtree explainer'
+                    ' since kdtree explainer needs access to entire training data')
+        self.__class__ = decide(model_interface, method)
+        self.__init__(data_interface, model_interface, **kwargs)
+
+    def _generate_counterfactuals(self, query_instance, total_CFs,
+                                  desired_class="opposite", desired_range=None,
+                                  permitted_range=None, features_to_vary="all",
+                                  stopping_threshold=0.5, posthoc_sparsity_param=0.1,
+                                  posthoc_sparsity_algorithm="linear", verbose=False, **kwargs):
+        raise NotImplementedError("This method should be implemented by the concrete classes "
+                                  "that inherit from ExplainerBase")
-        self.__class__ = decide(model_interface, method)
-        self.__init__(data_interface, model_interface, **kwargs)
-
-# To add new implementations of DiCE, add the class in explainer_interfaces subpackage and import-and-return the class in an elif loop as shown in the below method.
 [docs]def decide(model_interface, method):
-    """Decides DiCE implementation type."""
-
-    if model_interface.backend == 'sklearn':
-        if method == "random": # random sampling of CFs
+    """Decides DiCE implementation type.
+
+    To add new implementations of DiCE, add the class in explainer_interfaces
+    subpackage and import-and-return the class in an elif loop as shown in
+    the below method.
+    """
+    if model_interface.backend == BackEndTypes.Sklearn:
+        if method == SamplingStrategy.Random:
+            # random sampling of CFs
             from dice_ml.explainer_interfaces.dice_random import DiceRandom
             return DiceRandom
-        elif method == "genetic":
+        elif method == SamplingStrategy.Genetic:
             from dice_ml.explainer_interfaces.dice_genetic import DiceGenetic
             return DiceGenetic
-        elif method == "kdtree":
+        elif method == SamplingStrategy.KdTree:
             from dice_ml.explainer_interfaces.dice_KD import DiceKD
             return DiceKD
-    elif model_interface.backend == 'TF1': # pretrained Keras Sequential model with Tensorflow 1.x backend
+        else:
+            raise UserConfigValidationException("Unsupported sample strategy {0} provided. "
+                                                "Please choose one of {1}, {2} or {3}".format(
+                                                    method, SamplingStrategy.Random,
+                                                    SamplingStrategy.Genetic,
+                                                    SamplingStrategy.KdTree
+                                                ))
+    elif model_interface.backend == BackEndTypes.Tensorflow1:
+        # pretrained Keras Sequential model with Tensorflow 1.x backend
         from dice_ml.explainer_interfaces.dice_tensorflow1 import DiceTensorFlow1
         return DiceTensorFlow1
-    elif model_interface.backend == 'TF2': # pretrained Keras Sequential model with Tensorflow 2.x backend
+    elif model_interface.backend == BackEndTypes.Tensorflow2:
+        # pretrained Keras Sequential model with Tensorflow 2.x backend
         from dice_ml.explainer_interfaces.dice_tensorflow2 import DiceTensorFlow2
         return DiceTensorFlow2
-    elif model_interface.backend == 'PYT': # PyTorch backend
+    elif model_interface.backend == BackEndTypes.Pytorch:
+        # PyTorch backend
         from dice_ml.explainer_interfaces.dice_pytorch import DicePyTorch
         return DicePyTorch
-    else: # all other backends
+    else:
+        # all other backends
         backend_dice = model_interface.backend['explainer']
         module_name, class_name = backend_dice.split('.')
         module = __import__("dice_ml.explainer_interfaces." + module_name, fromlist=[class_name])
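An illustrative wiring of the backend/method dispatch above (a fitted scikit-learn classifier clf is assumed, d_public is the Data object from the earlier sketch, and dice_ml.Model is used here as the standard DiCE model wrapper):

    import dice_ml

    m = dice_ml.Model(model=clf, backend='sklearn')
    exp = dice_ml.Dice(d_public, m, method='random')   # 'genetic' and 'kdtree' select the other sklearn explainers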
diff --git a/docs/_modules/dice_ml/diverse_counterfactuals.html b/docs/_modules/dice_ml/diverse_counterfactuals.html
index e6dafce6..fd659b46 100644
--- a/docs/_modules/dice_ml/diverse_counterfactuals.html
+++ b/docs/_modules/dice_ml/diverse_counterfactuals.html
@@ -7,7 +7,7 @@
-dice_ml.diverse_counterfactuals — DiCE 0.5 documentation
+dice_ml.diverse_counterfactuals — DiCE 0.7 documentation
@@ -162,12 +166,31 @@
Source code for dice_ml.diverse_counterfactuals
-import numpy as np
-import pandas as pd
+import pandas as pd
import copy
-from IPython.display import display
import json
from dice_ml.utils.serialize import DummyDataInterface
+from dice_ml.constants import _SchemaVersions, ModelTypes
+
+
+class _DiverseCFV1SchemaConstants:
+ DATA_INTERFACE = 'data_interface'
+ MODEL_TYPE = 'model_type'
+ DESIRED_CLASS = 'desired_class'
+ DESIRED_RANGE = 'desired_range'
+ TEST_INSTANCE_DF = 'test_instance_df'
+ FINAL_CFS_DF = 'final_cfs_df'
+
+
+class _DiverseCFV2SchemaConstants:
+ DATA_INTERFACE = 'data_interface'
+ MODEL_TYPE = 'model_type'
+ DESIRED_CLASS = 'desired_class'
+ DESIRED_RANGE = 'desired_range'
+ FEATURE_NAMES_INCLUDING_TARGET = 'feature_names_including_target'
+ FEATURE_NAMES = 'feature_names'
+ TEST_INSTANCE_LIST = 'test_instance_list'
+ FIANL_CFS_LIST = 'final_cfs_list'
[docs]def json_converter(obj):
@@ -178,10 +201,13 @@
Source code for dice_ml.diverse_counterfactuals
<
except AttributeError:
return obj.__dict__
+
[docs]class CounterfactualExamples:
"""A class to store and visualize the resulting counterfactual explanations."""
-
-
def __init__(self, data_interface=None, final_cfs_df=None, test_instance_df=None, final_cfs_df_sparse=None, posthoc_sparsity_param=0, desired_range=None, desired_class="opposite", model_type='classifier'):
+
def __init__(self, data_interface=None, final_cfs_df=None, test_instance_df=None,
+
final_cfs_df_sparse=None, posthoc_sparsity_param=0,
+
desired_range=None, desired_class="opposite",
+
model_type=ModelTypes.Classifier):
self.data_interface = data_interface
self.final_cfs_df = final_cfs_df
@@ -192,48 +218,80 @@
Source code for dice_ml.diverse_counterfactuals
<
self.desired_range = desired_range
self.final_cfs_list = None
-
self.posthoc_sparsity_param = posthoc_sparsity_param # might be useful for future additions
+
self.posthoc_sparsity_param = posthoc_sparsity_param # might be useful for future additions
-
self.test_pred = self.test_instance_df[self.data_interface.outcome_name].iloc[0]
-
if model_type == 'classifier':
+
self.test_pred = self.test_instance_df[self.data_interface.outcome_name].iat[0]
+
if model_type == ModelTypes.Classifier:
if desired_class == "opposite":
self.new_outcome = 1.0 - round(self.test_pred)
else:
self.new_outcome = desired_class
-
elif model_type == 'regressor':
+
elif model_type == ModelTypes.Regressor:
self.new_outcome = desired_range
-
[docs] def visualize_as_dataframe(self, display_sparse_df=True, show_only_changes=False):
-
# original instance
-
print('Query instance (original outcome : %i)' %round(self.test_pred))
-
display(self.test_instance_df) # works only in Jupyter notebook
+
def __eq__(self, other_counterfactual_example):
+
if isinstance(other_counterfactual_example, CounterfactualExamples):
+
return self.desired_class == other_counterfactual_example.desired_class and \
+
self.desired_range == other_counterfactual_example.desired_range and \
+
self.model_type == other_counterfactual_example.model_type and \
+
(self.final_cfs_df is None) == \
+
(other_counterfactual_example.final_cfs_df is None) and \
+
(self.final_cfs_df_sparse is None) == \
+
(other_counterfactual_example.final_cfs_df_sparse is None)
+
return False
+
+
def _dump_output(self, content, show_only_changes=False, is_notebook_console=False):
+
if is_notebook_console:
+
self.display_df(content, show_only_changes=show_only_changes)
+
else:
+
assert isinstance(content, pd.DataFrame), "Expecting a pandas dataframe"
+
self.print_list(content.values.tolist(),
+
show_only_changes=show_only_changes)
+
+
def _visualize_internal(self, display_sparse_df=True, show_only_changes=False,
+
is_notebook_console=False):
if self.final_cfs_df is not None and len(self.final_cfs_df) > 0:
-
if self.posthoc_sparsity_param == None:
+
if self.posthoc_sparsity_param is None:
print('\nCounterfactual set (new outcome: {0})'.format(self.new_outcome))
-
self.display_df(self.final_cfs_df, show_only_changes)
-
-
elif hasattr(self.data_interface, 'data_df') and display_sparse_df==True and self.final_cfs_df_sparse is not None:
+
self._dump_output(content=self.final_cfs_df, show_only_changes=show_only_changes,
+
is_notebook_console=is_notebook_console)
+
elif hasattr(self.data_interface, 'data_df') and \
+
display_sparse_df is True and self.final_cfs_df_sparse is not None:
# CFs
print('\nDiverse Counterfactual set (new outcome: {0})'.format(self.new_outcome))
-
self.display_df(self.final_cfs_df_sparse, show_only_changes)
-
-
-
elif hasattr(self.data_interface, 'data_df') and display_sparse_df==True and self.final_cfs_df_sparse is None:
-
print('\nPlease specify a valid posthoc_sparsity_param to perform sparsity correction.. displaying Diverse Counterfactual set without sparsity correction (new outcome : %i)' %(self.new_outcome))
-
self.display_df(self.final_cfs_df, show_only_changes)
-
-
elif not hasattr(self.data_interface, 'data_df'):# for private data
-
print('\nDiverse Counterfactual set without sparsity correction since only metadata about each feature is available (new outcome: ', self.new_outcome)
-
self.display_df(self.final_cfs_df, show_only_changes)
-
+
self._dump_output(content=self.final_cfs_df_sparse, show_only_changes=show_only_changes,
+
is_notebook_console=is_notebook_console)
+
elif hasattr(self.data_interface, 'data_df') and \
+
display_sparse_df is True and self.final_cfs_df_sparse is None:
+
print('\nPlease specify a valid posthoc_sparsity_param to perform sparsity correction.. ',
+
'displaying Diverse Counterfactual set without sparsity correction (new outcome : %i)' %
+
(self.new_outcome))
+
self._dump_output(content=self.final_cfs_df, show_only_changes=show_only_changes,
+
is_notebook_console=is_notebook_console)
+
elif not hasattr(self.data_interface, 'data_df'): # for private data
+
print('\nDiverse Counterfactual set without sparsity correction since only metadata about each',
+
' feature is available (new outcome: ', self.new_outcome)
+
self._dump_output(content=self.final_cfs_df, show_only_changes=show_only_changes,
+
is_notebook_console=is_notebook_console)
else:
# CFs
print('\nDiverse Counterfactual set without sparsity correction (new outcome: ', self.new_outcome)
-
self.display_df(self.final_cfs_df, show_only_changes)
+
self._dump_output(content=self.final_cfs_df, show_only_changes=show_only_changes,
+
is_notebook_console=is_notebook_console)
else:
-
print('\nNo counterfactuals found!')
+
print('\nNo counterfactuals found!')
+
+
[docs] def visualize_as_dataframe(self, display_sparse_df=True, show_only_changes=False):
+
from IPython.display import display
+
# original instance
+
print('Query instance (original outcome : %i)' % round(self.test_pred))
+
display(self.test_instance_df) # works only in Jupyter notebook
+
self._visualize_internal(display_sparse_df=display_sparse_df,
+
show_only_changes=show_only_changes,
+
is_notebook_console=True)
[docs] def display_df(self, df, show_only_changes):
+
from IPython.display import display
if show_only_changes is False:
display(df) # works only in Jupyter notebook
else:
@@ -245,37 +303,15 @@
Source code for dice_ml.diverse_counterfactuals
<
newdf[ix][jx] = '-'
else:
newdf[ix][jx] = str(newdf[ix][jx])
- display(pd.DataFrame(newdf, columns=df.columns)) # works only in Jupyter notebook
+
display(pd.DataFrame(newdf, columns=df.columns, index=df.index)) # works only in Jupyter notebook
[docs] def visualize_as_list(self, display_sparse_df=True, show_only_changes=False):
# original instance
-
print('Query instance (original outcome : %i)' %round(self.test_pred))
+
print('Query instance (original outcome : %i)' % round(self.test_pred))
print(self.test_instance_df.values.tolist()[0])
-
-
if len(self.final_cfs) > 0:
-
if self.posthoc_sparsity_param == None:
-
print('\nCounterfactual set (new outcome : %i)' %(self.new_outcome))
-
self.print_list(self.final_cfs_df.values.tolist(), show_only_changes)
-
-
elif hasattr(self.data_interface, 'data_df') and display_sparse_df==True and self.final_cfs_df_sparse is not None:
-
# CFs
-
print('\nDiverse Counterfactual set (new outcome : %i)' %(self.new_outcome))
-
self.print_list(self.final_cfs_df_sparse.values.tolist(), show_only_changes)
-
-
elif hasattr(self.data_interface, 'data_df') and display_sparse_df==True and self.final_cfs_df_sparse is None:
-
print('\nPlease specify a valid posthoc_sparsity_param to perform sparsity correction.. displaying Diverse Counterfactual set without sparsity correction (new outcome : %i)' %(self.new_outcome))
-
self.print_list(self.final_cfs_df.values.tolist(), show_only_changes)
-
-
elif not hasattr(self.data_interface, 'data_df'): # for private data
-
print('\nDiverse Counterfactual set without sparsity correction since only metadata about each feature is available (new outcome : %i)' %(self.new_outcome))
-
self.print_list(self.final_cfs_df.values.tolist(), show_only_changes)
-
-
else:
-
# CFs
-
print('\nDiverse Counterfactual set without sparsity correction (new outcome : %i)' %(self.new_outcome))
-
self.print_list(self.final_cfs_df.values.tolist(), show_only_changes)
-
else:
-
print('\n0 counterfactuals found!')
+
self._visualize_internal(display_sparse_df=display_sparse_df,
+
show_only_changes=show_only_changes,
+
is_notebook_console=False)
[docs] def print_list(self, li, show_only_changes):
if show_only_changes is False:
@@ -290,7 +326,7 @@
Source code for dice_ml.diverse_counterfactuals
<
newli[ix][jx] = '-'
print(newli[ix])
-
[docs] def to_json(self):
+
[docs] def to_json(self, serialization_version):
if self.final_cfs_df_sparse is not None:
df = self.final_cfs_df_sparse
else:
@@ -304,13 +340,88 @@
Source code for dice_ml.diverse_counterfactuals
<
else:
dummy_data_interface = DummyDataInterface(
self.data_interface.outcome_name)
- obj = {'data_interface': dummy_data_interface,
- 'model_type': self.model_type,
- 'desired_class': self.desired_class,
- 'desired_range': self.desired_range,
- 'test_instance_df': self.test_instance_df,
- 'final_cfs_df': df}
- return json.dumps(obj, default=json_converter)
+
+
if serialization_version == _SchemaVersions.V1:
+
obj = {
+
_DiverseCFV1SchemaConstants.DATA_INTERFACE: dummy_data_interface,
+
_DiverseCFV1SchemaConstants.MODEL_TYPE: self.model_type,
+
_DiverseCFV1SchemaConstants.DESIRED_CLASS: self.desired_class,
+
_DiverseCFV1SchemaConstants.DESIRED_RANGE: self.desired_range,
+
_DiverseCFV1SchemaConstants.TEST_INSTANCE_DF: self.test_instance_df,
+
_DiverseCFV1SchemaConstants.FINAL_CFS_DF: df
+
}
+
return json.dumps(obj, default=json_converter)
+
elif serialization_version == _SchemaVersions.V2:
+
dummy_data_interface_dict = dummy_data_interface.to_json()
+
feature_names_including_target = self.test_instance_df.columns.tolist()
+
feature_names = self.test_instance_df.columns.tolist().copy()
+
feature_names.remove(dummy_data_interface.outcome_name)
+
test_instance_df_as_list = self.test_instance_df.values.tolist()
+
if df is not None:
+
final_cfs_df_as_as_list = df.values.tolist()
+
else:
+
final_cfs_df_as_as_list = None
+
+
alternate_obj = {
+
_DiverseCFV2SchemaConstants.TEST_INSTANCE_LIST: test_instance_df_as_list,
+
_DiverseCFV2SchemaConstants.FIANL_CFS_LIST: final_cfs_df_as_as_list,
+
_DiverseCFV2SchemaConstants.DATA_INTERFACE: dummy_data_interface_dict,
+
_DiverseCFV2SchemaConstants.FEATURE_NAMES: feature_names,
+
_DiverseCFV2SchemaConstants.FEATURE_NAMES_INCLUDING_TARGET: feature_names_including_target,
+
_DiverseCFV2SchemaConstants.MODEL_TYPE: self.model_type,
+
_DiverseCFV2SchemaConstants.DESIRED_CLASS: self.desired_class,
+
_DiverseCFV2SchemaConstants.DESIRED_RANGE: self.desired_range
+
}
+
return json.dumps(alternate_obj)
+
+
[docs] @staticmethod
+
def from_json(cf_example_json_str):
+
cf_example_dict = json.loads(cf_example_json_str)
+
if cf_example_dict.get(_DiverseCFV1SchemaConstants.TEST_INSTANCE_DF) is not None:
+
test_instance_df = pd.read_json(cf_example_dict[
+
_DiverseCFV1SchemaConstants.TEST_INSTANCE_DF])
+
if cf_example_dict[_DiverseCFV1SchemaConstants.FINAL_CFS_DF] is not None:
+
cfs_df = pd.read_json(cf_example_dict[_DiverseCFV1SchemaConstants.FINAL_CFS_DF])
+
else:
+
cfs_df = None
+
+
# Creating the object for dummy_data_interface
+
dummy_data_interface = DummyDataInterface(**cf_example_dict[_DiverseCFV1SchemaConstants.DATA_INTERFACE])
+
return CounterfactualExamples(data_interface=dummy_data_interface,
+
test_instance_df=test_instance_df,
+
final_cfs_df=cfs_df,
+
final_cfs_df_sparse=cfs_df,
+
posthoc_sparsity_param=None,
+
desired_class=cf_example_dict[_DiverseCFV1SchemaConstants.DESIRED_CLASS],
+
desired_range=cf_example_dict[_DiverseCFV1SchemaConstants.DESIRED_RANGE],
+
model_type=cf_example_dict[_DiverseCFV1SchemaConstants.MODEL_TYPE])
+
else:
+
final_cfs_list = cf_example_dict[_DiverseCFV2SchemaConstants.FIANL_CFS_LIST]
+
test_instance_list = cf_example_dict[_DiverseCFV2SchemaConstants.TEST_INSTANCE_LIST]
+
feature_names_including_target = cf_example_dict[_DiverseCFV2SchemaConstants.FEATURE_NAMES_INCLUDING_TARGET]
+
+
data_interface = cf_example_dict[_DiverseCFV2SchemaConstants.DATA_INTERFACE]
+
desired_class = cf_example_dict[_DiverseCFV2SchemaConstants.DESIRED_CLASS]
+
desired_range = cf_example_dict[_DiverseCFV2SchemaConstants.DESIRED_RANGE]
+
model_type = cf_example_dict[_DiverseCFV2SchemaConstants.MODEL_TYPE]
+
+
test_instance_df = pd.DataFrame(data=test_instance_list,
+
columns=feature_names_including_target)
+
if final_cfs_list is not None:
+
cfs_df = pd.DataFrame(data=final_cfs_list,
+
columns=feature_names_including_target)
+
else:
+
cfs_df = None
+
# Creating the object for dummy_data_interface
+
dummy_data_interface = DummyDataInterface(**data_interface)
+
return CounterfactualExamples(data_interface=dummy_data_interface,
+
test_instance_df=test_instance_df,
+
final_cfs_df=cfs_df,
+
final_cfs_df_sparse=cfs_df,
+
posthoc_sparsity_param=None,
+
desired_class=desired_class,
+
desired_range=desired_range,
+
model_type=model_type)
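A hypothetical round trip for the serialization shown above (cf_example is assumed to be a CounterfactualExamples instance produced by an explainer):

    from dice_ml.constants import _SchemaVersions
    from dice_ml.diverse_counterfactuals import CounterfactualExamples

    json_str = cf_example.to_json(serialization_version=_SchemaVersions.V2)
    restored = CounterfactualExamples.from_json(json_str)   # rebuilds an equivalent CounterfactualExamples object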
diff --git a/docs/_modules/dice_ml/explainer_interfaces/dice_KD.html b/docs/_modules/dice_ml/explainer_interfaces/dice_KD.html
index 8202a101..ce067df3 100644
--- a/docs/_modules/dice_ml/explainer_interfaces/dice_KD.html
+++ b/docs/_modules/dice_ml/explainer_interfaces/dice_KD.html
@@ -7,7 +7,7 @@
- dice_ml.explainer_interfaces.dice_KD — DiCE 0.5 documentation
+ dice_ml.explainer_interfaces.dice_KD — DiCE 0.7 documentation
@@ -171,9 +175,9 @@ Source code for dice_ml.explainer_interfaces.dice_KD
import
timeit
import pandas as pd
import copy
-
import random
from dice_ml import diverse_counterfactuals as exp
+
from dice_ml.constants import ModelTypes
[docs]class DiceKD(ExplainerBase):
@@ -201,7 +205,7 @@
Source code for dice_ml.explainer_interfaces.dice_KD
self
.model.load_model()
# number of output nodes of ML model
-
if self.model.model_type == 'classifier':
+
if self.model.model_type == ModelTypes.Classifier:
self.num_output_nodes = self.model.get_num_output_nodes2(
self.data_interface.data_df[0:1][self.data_interface.feature_names])
@@ -217,17 +221,27 @@ Source code for dice_ml.explainer_interfaces.dice_KD
        :param query_instance: A dictionary of feature names and values. Test point of interest.
        :param total_CFs: Total number of counterfactuals required.
        :param desired_range: For regression problems. Contains the outcome range to generate counterfactuals in.
-        :param desired_class: Desired counterfactual class - can take 0 or 1. Default value is "opposite" to the outcome class of query_instance for binary classification.
+        :param desired_class: Desired counterfactual class - can take 0 or 1. Default value is "opposite" to the
+                              outcome class of query_instance for binary classification.
        :param features_to_vary: Either a string "all" or a list of feature names to vary.
-        :param permitted_range: Dictionary with continuous feature names as keys and permitted min-max range in list as values. Defaults to the range inferred from training data. If None, uses the parameters initialized in data_interface.
+        :param permitted_range: Dictionary with continuous feature names as keys and permitted min-max range in
+                                list as values. Defaults to the range inferred from training data.
+                                If None, uses the parameters initialized in data_interface.
        :param sparsity_weight: Parameter to determine how much importance to give to sparsity
-        :param feature_weights: Either "inverse_mad" or a dictionary with feature names as keys and corresponding weights as values. Default option is "inverse_mad" where the weight for a continuous feature is the inverse of the Median Absolute Deviation (MAD) of the feature's values in the training set; the weight for a categorical feature is equal to 1 by default.
+        :param feature_weights: Either "inverse_mad" or a dictionary with feature names as keys and corresponding
+                                weights as values. Default option is "inverse_mad" where the weight for a continuous
+                                feature is the inverse of the Median Absolute Deviation (MAD) of the feature's
+                                values in the training set; the weight for a categorical feature is equal to 1 by default.
        :param stopping_threshold: Minimum threshold for counterfactuals target class probability.
        :param posthoc_sparsity_param: Parameter for the post-hoc operation on continuous features to enhance sparsity.
-        :param posthoc_sparsity_algorithm: Perform either linear or binary search. Takes "linear" or "binary". Prefer binary search when a feature range is large (for instance, income varying from 10k to 1000k) and only if the features share a monotonic relationship with predicted outcome in the model.
+        :param posthoc_sparsity_algorithm: Perform either linear or binary search. Takes "linear" or "binary".
+                                           Prefer binary search when a feature range is large (for instance, income
+                                           varying from 10k to 1000k) and only if the features share a monotonic
+                                           relationship with the predicted outcome in the model.
        :param verbose: Parameter to determine whether to print 'Diverse Counterfactuals found!'
-        :return: A CounterfactualExamples object to store and visualize the resulting counterfactual explanations (see diverse_counterfactuals.py).
+        :return: A CounterfactualExamples object to store and visualize the resulting counterfactual explanations
+                 (see diverse_counterfactuals.py).
"""
data_df_copy = self.data_interface.data_df.copy()
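For orientation, the signature documented above is what callers reach through the top-level API. A minimal usage sketch, assuming the Adult-income helper dataset bundled with dice_ml and a pre-trained scikit-learn classifier clf (both illustrative, not part of this diff):

import dice_ml
from dice_ml.utils import helpers

# Illustrative setup: helper dataset plus a trained sklearn classifier 'clf'
# (assumed to exist; train it however you like).
dataset = helpers.load_adult_income_dataset()
d = dice_ml.Data(dataframe=dataset,
                 continuous_features=['age', 'hours_per_week'],
                 outcome_name='income')
m = dice_ml.Model(model=clf, backend='sklearn')

# method='kdtree' selects the DiceKD explainer whose source is shown in this diff.
exp_kd = dice_ml.Dice(d, m, method='kdtree')
query = dataset.drop(columns='income')[0:1]
cf = exp_kd.generate_counterfactuals(query, total_CFs=4, desired_class="opposite")
cf.visualize_as_dataframe()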
@@ -241,12 +255,12 @@ Source code for dice_ml.explainer_interfaces.dice_KD
        test_pred = self.predict_fn(query_instance)[0]
        query_instance[self.data_interface.outcome_name] = test_pred
-
-        if desired_range != None:
+        desired_class = self.misc_init(stopping_threshold, desired_class, desired_range, test_pred)
+        if desired_range is not None:
            if desired_range[0] > desired_range[1]:
                raise ValueError("Invalid Range!")
-        if desired_class == "opposite" and self.model.model_type == 'classifier':
+        if desired_class == "opposite" and self.model.model_type == ModelTypes.Classifier:
            if self.num_output_nodes == 2:
                desired_class = 1.0 - test_pred
@@ -257,24 +271,25 @@ Source code for dice_ml.explainer_interfaces.dice_KD
                raise ValueError("Desired class should be within 0 and num_classes-1.")
        # Partitioned dataset and KD Tree for each class (binary) of the dataset
-        self.dataset_with_predictions, self.KD_tree, self.predictions = self.build_KD_tree(data_df_copy, desired_range,
-                                                                                            desired_class, self.predicted_outcome_name)
-
-        query_instance, final_cfs, cfs_preds = self.find_counterfactuals(data_df_copy,
-                                                                          query_instance, query_instance_orig,
-                                                                          desired_range,
-                                                                          desired_class,
-                                                                          total_CFs, features_to_vary,
-                                                                          permitted_range,
-                                                                          sparsity_weight,
-                                                                          stopping_threshold,
-                                                                          posthoc_sparsity_param,
-                                                                          posthoc_sparsity_algorithm, verbose)
+        self.dataset_with_predictions, self.KD_tree, self.predictions = \
+            self.build_KD_tree(data_df_copy, desired_range, desired_class, self.predicted_outcome_name)
+
+        query_instance, cfs_preds = self.find_counterfactuals(data_df_copy,
+                                                               query_instance, query_instance_orig,
+                                                               desired_range,
+                                                               desired_class,
+                                                               total_CFs, features_to_vary,
+                                                               permitted_range,
+                                                               sparsity_weight,
+                                                               stopping_threshold,
+                                                               posthoc_sparsity_param,
+                                                               posthoc_sparsity_algorithm, verbose)
+        self.cfs_preds = cfs_preds
        return exp.CounterfactualExamples(data_interface=self.data_interface,
-                                          final_cfs_df=final_cfs,
+                                          final_cfs_df=self.final_cfs_df,
                                          test_instance_df=query_instance,
-                                          final_cfs_df_sparse=self.final_cfs_sparse,
+                                          final_cfs_df_sparse=self.final_cfs_df_sparse,
                                          posthoc_sparsity_param=posthoc_sparsity_param,
                                          desired_range=desired_range,
                                          desired_class=desired_class,
@@ -282,7 +297,7 @@ Source code for dice_ml.explainer_interfaces.dice_KD
[docs] def predict_fn(self, input_instance):
        """returns predictions"""
-        return self.model.model.predict(input_instance)
+        return self.model.get_output(input_instance, model_score=False)
[docs] def do_sparsity_check(self, cfs, query_instance, sparsity_weight):
cfs = cfs.assign(sparsity=np.nan, distancesparsity=np.nan)
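The predict_fn change above routes predictions through the model wrapper's get_output instead of calling the raw estimator directly, which keeps the explainer agnostic to the model backend. A hypothetical sketch of that wrapping idea (not dice_ml's actual Model class; names are illustrative):

class ModelWrapperSketch:
    """Hypothetical backend-agnostic model wrapper, for illustration only."""

    def __init__(self, estimator):
        self.model = estimator

    def get_output(self, input_instance, model_score=True):
        # With model_score=True return scores/probabilities; with False, hard labels.
        if model_score and hasattr(self.model, "predict_proba"):
            return self.model.predict_proba(input_instance)
        return self.model.predict(input_instance)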
@@ -313,7 +328,7 @@ Source code for dice_ml.explainer_interfaces.dice_KD
        num_queries = min(len(self.dataset_with_predictions), total_CFs * 10)
        cfs = []
-        if self.KD_tree is not None:
+        if self.KD_tree is not None and num_queries > 0:
            KD_tree_output = self.KD_tree.query(KD_query_instance, num_queries)
            distances = KD_tree_output[0][0]
            indices = KD_tree_output[1][0]
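The new num_queries > 0 guard avoids asking the KD tree for zero neighbours. A standalone sketch of the same query pattern with scikit-learn's KDTree (toy data; dice_ml builds its own tree in build_KD_tree):

import numpy as np
from sklearn.neighbors import KDTree

# Toy rows standing in for the subset of the dataset in the desired class.
rows = np.array([[25.0, 40.0], [30.0, 50.0], [45.0, 60.0]])
tree = KDTree(rows)

query = np.array([[28.0, 45.0]])
total_CFs = 4
num_queries = min(len(rows), total_CFs * 10)  # capped at the dataset size
if num_queries > 0:                           # the guard added in this diff
    distances, indices = tree.query(query, k=num_queries)
    print(indices[0], distances[0])           # nearest rows to the query instance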
@@ -323,28 +338,27 @@ Source code for dice_ml.explainer_interfaces.dice_KD
            cfs = self.do_sparsity_check(cfs, query_instance, sparsity_weight)
            cfs = cfs.drop(self.data_interface.outcome_name, axis=1)
-        final_cfs = pd.DataFrame()
+        self.final_cfs = pd.DataFrame()
        final_indices = []
        cfs_preds = []
        total_cfs_found = 0
        # Iterating through the closest points from the KD tree and checking if any of these are valid
-        if self.KD_tree is not None:
-            cfs = cfs.reset_index(drop=True)
+        if self.KD_tree is not None and total_CFs > 0:
            for i in range(len(cfs)):
                if total_cfs_found == total_CFs:
                    break
                valid_cf_found = True
                for feature in self.data_interface.feature_names:
-                    if feature not in features_to_vary and cfs.iloc[i][feature] != query_instance[feature].values[0]:
+                    if feature not in features_to_vary and cfs[feature].iat[i] != query_instance[feature].values[0]:
                        valid_cf_found = False
                        break
                    if feature in self.data_interface.continuous_feature_names:
-                        if not self.feature_range[feature][0] <= cfs.iloc[i][feature] <= self.feature_range[feature][1]:
+                        if not self.feature_range[feature][0] <= cfs[feature].iat[i] <= self.feature_range[feature][1]:
                            valid_cf_found = False
                            break
                    else:
-                        if not cfs.iloc[i][feature] in self.feature_range[feature]:
+                        if not cfs[feature].iat[i] in self.feature_range[feature]:
                            valid_cf_found = False
                            break
@@ -353,14 +367,14 @@ Source code for dice_ml.explainer_interfaces.dice_KD
                    total_cfs_found += 1
                    final_indices.append(i)
        if total_cfs_found > 0:
-            final_cfs = cfs.iloc[final_indices]
-            final_cfs = final_cfs.drop([self.predicted_outcome_name], axis=1)
+            self.final_cfs = cfs.iloc[final_indices]
+            self.final_cfs = self.final_cfs.drop([self.predicted_outcome_name], axis=1)
            # Finding the predicted outcome for each cf
            for i in range(total_cfs_found):
                cfs_preds.append(
                    self.dataset_with_predictions.iloc[final_indices[i]][self.predicted_outcome_name])
-        return final_cfs[:total_CFs], cfs_preds
+        return self.final_cfs[:total_CFs], cfs_preds
[docs] def duplicates(self, cfs, final_indices, i):
final_indices.append(i)
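The loop above accepts a KD-tree neighbour as a counterfactual only if frozen features match the query exactly, continuous features fall inside their permitted range, and categorical features take an allowed level. A self-contained sketch of that check, using plain dicts in place of dataframe rows (names and values are illustrative):

def is_valid_cf(cf_row, query_row, features_to_vary, feature_range, continuous_features):
    """Hypothetical helper mirroring the per-feature checks in the hunk above."""
    for feature, value in cf_row.items():
        # Frozen features must stay identical to the query instance.
        if feature not in features_to_vary and value != query_row[feature]:
            return False
        if feature in continuous_features:
            low, high = feature_range[feature]
            if not low <= value <= high:
                return False
        elif value not in feature_range[feature]:
            # Categorical features must take one of the permitted levels.
            return False
    return True


# Illustrative call: only 'education' may vary, and it moves to a permitted level.
print(is_valid_cf(
    cf_row={'age': 29, 'education': 'Masters'},
    query_row={'age': 29, 'education': 'Bachelors'},
    features_to_vary=['education'],
    feature_range={'age': [18, 90], 'education': ['Bachelors', 'Masters', 'PhD']},
    continuous_features=['age']))   # True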
@@ -373,18 +387,6 @@ Source code for dice_ml.explainer_interfaces.dice_KD
                             verbose):
        """Finds counterfactuals by querying a K-D tree for the nearest data points in the desired class from the dataset."""
-        self.stopping_threshold = stopping_threshold
-        if self.model.model_type == 'classifier':
-            self.target_cf_class = np.array([[desired_class]], dtype=np.float32)
-        elif self.model.model_type == 'regressor':
-            self.target_cf_range = desired_range
-
-        if self.model.model_type == 'classifier':
-            if self.target_cf_class == 0 and self.stopping_threshold > 0.5:
-                self.stopping_threshold = 0.25
-            elif self.target_cf_class == 1 and self.stopping_threshold < 0.5:
-                self.stopping_threshold = 0.75
-
start_time = timeit.default_timer()
# Making the one-hot-encoded version of query instance match the one-hot encoded version of the dataset
@@ -393,32 +395,29 @@ Source code for dice_ml.explainer_interfaces.dice_KD
            if col not in query_instance_df_dummies.columns:
                query_instance_df_dummies[col] = 0
-        final_cfs, cfs_preds = self.vary_valid(query_instance_df_dummies,
-                                               total_CFs,
-                                               features_to_vary,
-                                               permitted_range,
-                                               query_instance_orig,
-                                               sparsity_weight)
+        self.final_cfs, cfs_preds = self.vary_valid(query_instance_df_dummies,
+                                                    total_CFs,
+                                                    features_to_vary,
+                                                    permitted_range,
+                                                    query_instance_orig,
+                                                    sparsity_weight)
-        total_cfs_found = len(final_cfs)
+        total_cfs_found = len(self.final_cfs)
        if total_cfs_found > 0:
            # post-hoc operation on continuous features to enhance sparsity - only for public data
-            if posthoc_sparsity_param != None and posthoc_sparsity_param > 0 and 'data_df' in self.data_interface.__dict__:
-                final_cfs_sparse = copy.deepcopy(final_cfs)
-                self.final_cfs_sparse = self.do_posthoc_sparsity_enhancement(final_cfs_sparse, query_instance,
-                                                                             posthoc_sparsity_param,
-                                                                             posthoc_sparsity_algorithm)
+            if posthoc_sparsity_param is not None and posthoc_sparsity_param > 0 and 'data_df' in self.data_interface.__dict__:
+                self.final_cfs_df_sparse = copy.deepcopy(self.final_cfs)
+                self.final_cfs_df_sparse = self.do_posthoc_sparsity_enhancement(self.final_cfs_df_sparse, query_instance,
+                                                                                posthoc_sparsity_param,
+                                                                                posthoc_sparsity_algorithm)
            else:
-                self.final_cfs_sparse = None
+                self.final_cfs_df_sparse = None
        else:
-            self.final_cfs_sparse = None
+            self.final_cfs_df_sparse = None
+        self.final_cfs_df = self.final_cfs
        if total_cfs_found > 0:
-            # to display the values with the same precision as the original data
-            precisions = self.data_interface.get_decimal_precisions()
-            for ix, feature in enumerate(self.data_interface.continuous_feature_names):
-                final_cfs[feature] = final_cfs[feature].astype(float).round(precisions[ix])
-                self.final_cfs_sparse[feature] = self.final_cfs_sparse[feature].astype(float).round(precisions[ix])
+            self.round_to_precision()
self.elapsed = timeit.default_timer() - start_time
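The inline rounding removed above is now delegated to round_to_precision(); the intent is to display counterfactual values with the same number of decimals as the original data. A small pandas sketch of that behaviour (column names and precisions are made up):

import pandas as pd

# Illustrative counterfactuals and per-feature decimal precisions inferred from
# the training data (e.g. 'age' stored as integers, 'bmi' with one decimal).
final_cfs = pd.DataFrame({'age': [29.6843, 41.2051], 'bmi': [23.4567, 30.1298]})
precisions = {'age': 0, 'bmi': 1}

for feature, prec in precisions.items():
    final_cfs[feature] = final_cfs[feature].astype(float).round(prec)

print(final_cfs)   # age -> 30.0, 41.0 ; bmi -> 23.5, 30.1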
@@ -428,13 +427,14 @@ Source code for dice_ml.explainer_interfaces.dice_KD
        if total_cfs_found < total_CFs:
            self.elapsed = timeit.default_timer() - start_time
            m, s = divmod(self.elapsed, 60)
-            print(
-                'Only %d (required %d) Diverse Counterfactuals found for the given configuration, perhaps change the query instance or the features to vary...' % (
-                    total_cfs_found, total_CFs), '; total time taken: %02d' % m, 'min %02d' % s, 'sec')
+            print('Only %d (required %d) ' % (total_cfs_found, self.total_CFs),
+                  'Diverse Counterfactuals found for the given configuration, perhaps ',
+                  'change the query instance or the features to vary...' '; total time taken: %02d' % m,
+                  'min %02d' % s, 'sec')
        else:
            print('Diverse Counterfactuals found! total time taken: %02d' % m, 'min %02d' % s, 'sec')
-        return query_instance, final_cfs, cfs_preds
+        return query_instance, cfs_preds
diff --git a/docs/_modules/dice_ml/explainer_interfaces/dice_genetic.html b/docs/_modules/dice_ml/explainer_interfaces/dice_genetic.html
index e4e5249f..760ec49c 100644
--- a/docs/_modules/dice_ml/explainer_interfaces/dice_genetic.html
+++ b/docs/_modules/dice_ml/explainer_interfaces/dice_genetic.html
@@ -7,7 +7,7 @@
- dice_ml.explainer_interfaces.dice_genetic — DiCE 0.5 documentation
+ dice_ml.explainer_interfaces.dice_genetic — DiCE 0.7 documentation
@@ -31,6 +31,8 @@
+
+
@@ -88,9 +90,11 @@
Notebooks: