Merge pull request #6 from jotterbach/MDLP
MDLP for discretizer
jotterbach committed Sep 27, 2016
2 parents b974040 + ce97362 commit abce121
Showing 2 changed files with 93 additions and 4 deletions.
79 changes: 75 additions & 4 deletions DSTK/FeatureBinning/TreeBasedFeatureBinning.py
@@ -4,27 +4,91 @@
 import numpy as np
 
 
-def _recurse_tree(tree, lst, node_id=0, depth=0, min_val=np.NINF, max_val=np.PINF):
+def _recurse_tree(tree, lst, mdlp, node_id=0, depth=0, min_val=np.NINF, max_val=np.PINF):
     left_child = tree.children_left[node_id]
     right_child = tree.children_right[node_id]
 
     if left_child == sklearn.tree._tree.TREE_LEAF:
         lst.append(((min_val, max_val), tree.value[node_id].flatten().tolist()))
         return
     else:
-        _recurse_tree(tree, lst, left_child, depth=depth + 1, min_val=min_val, max_val=tree.threshold[node_id])
+        if mdlp and _check_mdlp_stop(tree, node_id):
+            lst.append(((min_val, max_val), tree.value[node_id].flatten().tolist()))
+            return
+        _recurse_tree(tree, lst, mdlp, left_child, depth=depth + 1, min_val=min_val, max_val=tree.threshold[node_id])
 
     if right_child == sklearn.tree._tree.TREE_LEAF:
         lst.append(((min_val, max_val), tree.value[node_id].flatten().tolist()))
         return
     else:
-        _recurse_tree(tree, lst, right_child, depth=depth + 1, min_val=tree.threshold[node_id], max_val=max_val)
+        if mdlp and _check_mdlp_stop(tree, node_id):
+            lst.append(((min_val, max_val), tree.value[node_id].flatten().tolist()))
+            return
+        _recurse_tree(tree, lst, mdlp, right_child, depth=depth + 1, min_val=tree.threshold[node_id], max_val=max_val)
 
 
 def _convert_to_conditional_proba_buckets(sorted_nodes):
     return [(bucket, (vals / np.sum(vals)).tolist()) for bucket, vals in sorted_nodes]
 
 
+def _check_mdlp_stop(tree, node_id):
+    """
+    The MDLP implementation follows the paper of
+    U. S. Fayyad and K. B. Irani, Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning, JPL TRS 1992
+    http://hdl.handle.net/2014/35171
+    """
+
+    num_samples = tree.value[node_id].flatten().sum()
+
+    gain = _calculate_gain(tree, node_id)
+    delta = _calculate_noise_delta(tree, node_id)
+
+    return gain < (delta + np.log2(num_samples - 1)) / num_samples
+
+
+def _calculate_entropy(array):
+    non_zero_array = array / array.sum()
+    return -1 * np.sum(non_zero_array * np.log2(non_zero_array))
+
+
+def _calculate_gain(tree, node_id):
+    S, nS, S1, nS1, S2, nS2 = _get_variables_for_entropy_calculation(tree, node_id)
+
+    return _calculate_entropy(S) \
+        - S1.sum() / S.sum() * _calculate_entropy(S1) \
+        - S2.sum() / S.sum() * _calculate_entropy(S2)
+
+
+def _calculate_noise_delta(tree, node_id):
+    S, nS, S1, nS1, S2, nS2 = _get_variables_for_entropy_calculation(tree, node_id)
+
+    return np.log2(np.power(3, nS) - 2) \
+        - (nS * _calculate_entropy(S)
+           - nS1 * _calculate_entropy(S1)
+           - nS2 * _calculate_entropy(S2))
+
+
+def _get_variables_for_entropy_calculation(tree, node_id):
+    left_child = tree.children_left[node_id]
+    right_child = tree.children_right[node_id]
+
+    full_set_values = tree.value[node_id].flatten()
+    left_set_values = tree.value[left_child].flatten()
+    right_set_values = tree.value[right_child].flatten()
+
+    # remove zeros from value_counts to continue processing
+    full_set_without_zero_counts = full_set_values[np.where(full_set_values > 0)[0]]
+    full_set_tree_classes = full_set_without_zero_counts.size
+
+    left_set_without_zero_counts = left_set_values[np.where(left_set_values > 0)[0]]
+    left_set_tree_classes = left_set_without_zero_counts.size
+
+    right_set_without_zero_counts = right_set_values[np.where(right_set_values > 0)[0]]
+    right_set_tree_classes = right_set_without_zero_counts.size
+
+    return full_set_without_zero_counts, full_set_tree_classes, left_set_without_zero_counts, left_set_tree_classes, right_set_without_zero_counts, right_set_tree_classes
+
+
 class TreeBasedFeatureBinning(object):
 
     def __init__(self, name, **kwargs):
@@ -45,6 +109,13 @@ def __init__(self, name, **kwargs):
         class_weight = kwargs.get('class_weight', None)
         presort = kwargs.get('presort', False)
 
+        self.mdlp = kwargs.get('mdlp', False)
+
+        if self.mdlp:
+            criterion = 'entropy'
+            max_leaf_nodes = None
+            max_depth = None
+
         self.dtc = DecisionTreeClassifier(criterion=criterion,
                                           splitter=splitter,
                                           max_depth=max_depth,
@@ -103,7 +174,7 @@ def fit(self, values, target):
         if (np.min(label_dist) > 0) & (len(label_dist) == 2):
             lst_of_bins = list()
             # recurse edits the list inplace
-            _recurse_tree(self.dtc.tree_, lst_of_bins)
+            _recurse_tree(self.dtc.tree_, lst_of_bins, self.mdlp)
             self._count_buckets = sorted(lst_of_bins, key=lambda x: x[0][0])
             self.cond_proba_buckets = _convert_to_conditional_proba_buckets(self._count_buckets)
         else:
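The stopping rule added in _check_mdlp_stop is Fayyad and Irani's MDLP criterion: a split of a node holding N samples is kept only when the information gain exceeds (log2(N - 1) + delta) / N, with delta = log2(3^k - 2) - [k * Ent(S) - k1 * Ent(S1) - k2 * Ent(S2)], where k, k1 and k2 count the classes actually present in the parent set S and its two children S1, S2. Below is a minimal standalone sketch of that rule, independent of the tree structures above; entropy and mdlp_should_stop are illustrative names, and the class counts in the final two calls are invented, not taken from this commit.

import numpy as np

def entropy(counts):
    # Shannon entropy of a class-count vector; zero counts must be dropped
    # beforehand, as _get_variables_for_entropy_calculation does in the diff.
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def mdlp_should_stop(parent, left, right):
    # Keep only classes that actually occur in each set.
    S, S1, S2 = (np.asarray(c, dtype=float) for c in (parent, left, right))
    S, S1, S2 = S[S > 0], S1[S1 > 0], S2[S2 > 0]
    n = S.sum()
    k, k1, k2 = S.size, S1.size, S2.size
    gain = entropy(S) - S1.sum() / n * entropy(S1) - S2.sum() / n * entropy(S2)
    delta = np.log2(3 ** k - 2) - (k * entropy(S) - k1 * entropy(S1) - k2 * entropy(S2))
    # Reject the split when its gain does not pay for the added description length.
    return gain < (np.log2(n - 1) + delta) / n

print(mdlp_should_stop([50, 50], [45, 5], [5, 45]))    # False: informative split is kept
print(mdlp_should_stop([50, 50], [26, 24], [24, 26]))  # True: near-random split is pruned

Note that the committed helpers evaluate this rule on the tree.value count arrays of an already-grown tree, so the classifier is fit greedily first and _recurse_tree then collapses nodes whose splits fail the criterion.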
18 changes: 18 additions & 0 deletions DSTK/tests/test_feature_binner/test_recursive_feature_binner.py
@@ -24,6 +24,24 @@ def test_recursion():
         (np.NaN, [0.5, 0.5])]
 
 
+def test_recursion_with_mdlp():
+    binner = tfb.TreeBasedFeatureBinning('test', mdlp=True)
+    binner.fit(data[:, 0], target)
+
+    assert binner._count_buckets == [
+        ((np.NINF, 13.094999313354492), [13.0, 252.0]),
+        ((13.094999313354492, 15.045000076293945), [38.0, 94.0]),
+        ((15.045000076293945, 17.880001068115234), [64.0, 11.0]),
+        ((17.880001068115234, np.PINF), [97.0, 0.0])]
+
+    assert binner.cond_proba_buckets == [
+        ((np.NINF, 13.094999313354492), [0.04905660377358491, 0.9509433962264151]),
+        ((13.094999313354492, 15.045000076293945), [0.2878787878787879, 0.7121212121212122]),
+        ((15.045000076293945, 17.880001068115234), [0.8533333333333334, 0.14666666666666667]),
+        ((17.880001068115234, np.PINF), [1.0, 0.0]),
+        (np.NaN, [0.5, 0.5])]
+
+
 def test_fit():
     feats = [0, 1, 2, np.nan, 5, np.nan]
     labels = [0, 1, 0, 1, 1, 0]
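The expected counts in test_recursion_with_mdlp sum to 569 samples with a 212/357 class split, which matches scikit-learn's breast cancer dataset, so the test module presumably binds data and target to it. A usage sketch along those lines; the dataset choice and the import path are assumptions, not shown in this diff:

from sklearn.datasets import load_breast_cancer
# Assumed import path, following the file layout in this diff:
from DSTK.FeatureBinning.TreeBasedFeatureBinning import TreeBasedFeatureBinning

cancer = load_breast_cancer()

# mdlp=True switches the tree to the entropy criterion, removes the depth
# and leaf-count limits, and relies on the MDLP rule to stop splitting.
binner = TreeBasedFeatureBinning('mean_radius', mdlp=True)
binner.fit(cancer.data[:, 0], cancer.target)

# Each bucket maps an interval to P(class | bucket); the trailing NaN bucket
# holds the fallback distribution used for missing values.
for bucket, proba in binner.cond_proba_buckets:
    print(bucket, proba)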
