# ForestForTheTrees.py
#imports
#from __future__ import division
import time
import datetime
import copy
from itertools import product
import operator
from collections import OrderedDict
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import NotFittedError
import altair as alt
alt.renderers.enable("default")
from IPython.display import display
#seed for reproducibility
np.random.seed(15)
class ForestForTheTrees:
ALLOWED_REGRESSION_ERROR_METRICS = ["r_squared", "mae"]
def __init__(
self,
dataset,
model,
target_col,
task = "Regression",
sample_size = None,
num_tiles = 40,
quantiles = False
):
""" Initialize an instance with a dataset, model, target, task, and other parameters
Generates an internal data representation from the dataframe, fits the model if not done already,
initializes default variables, and raises errors if bad arguments are passed.
--Arguments
dataset (pd.DataFrame or string):
If DataFrame, the whole dataset with predictor and target features
If String, the name of one of the sample datasets in get_sample_dataset
model (sklearn.GradientBoostingRegressor or None)
Model to explain. If None, initialize and fit model with default hyperparameters
target_col (String):
Column name found in dataset.columns. Value to be predicted
task (String):
"Regression" is the only task currently supported
sample_size (Integer or None):
For large datasets, select a number of datapoints to randomly sample (without replacement)
                when evaluating explanation components. If None, use the whole dataset.
num_tiles (Integer):
For quantitative features, the number of bins into which the feature range is divided,
and therefore also the number of squares in the heatmap for that feature.
Bounds the max size of a single component heatmap at num_tiles^2.
Higher numbers result in more accurate explanations at the cost of slower running time.
quantiles (Boolean):
If true, divide feature range into num_tiles bins of equal numbers of datapoints, rather than equal size.
--Returns
None
"""
#initialize variables
self.model = model
self.target_col = target_col
self.task = task
self.mean_prediction = None
self.no_predictor_features = []
self.oned_features = []
self.binned_data = None
        self.sample_size = sample_size #may be None here; resolved below once the dataset size is known
self.num_tiles = num_tiles
self.quantiles = quantiles
self.predictions_base = None
self.chart_components = {}
self.explanation_components = {}
self.base_explanation = []
self.evaluation_details = []
self.base_components = []
self.explanation = []
self.cache = {}
#get sample dataset if string passed
if isinstance(dataset, str):
dataset = ForestForTheTrees.get_sample_dataset(dataset)
elif not isinstance(dataset, pd.DataFrame):
raise ValueError(f"Invalid type {type(dataset)} passed for dataset argument. Please provide "\
+ "a DataFrame or a string if using one of the sample datasets.")
#generate internal data representation
features = [x for x in dataset.columns if x != self.target_col]
self.x = dataset.loc[:,features].values
self.y = dataset.loc[:,target_col]
self.feature_names = features
self.feature_locs = {x:i for i,x in enumerate(features)}
self.feature_ranges = {
feature : self._get_quantiles(feature)
for feature in self.feature_names
}
if self.sample_size is None: #if no sample size use whole dataset
self.sample_size = self.x.shape[0]
self.binned_data = self._bin_data()
#if no model passed, instantiate one with default settings
if self.model is None:
if self.task == "Regression":
self.model = GradientBoostingRegressor(
n_estimators = 300,
max_depth = 2,
learning_rate = 1.
)
else:
raise NotImplementedError("Regression is the only task currently implemented.")
        #now handle models which are not yet fitted
        #the ideal scenario is that models are passed already fitted according to an appropriate
        #train/test split, so this is provided primarily as a convenience
        #check_is_fitted raises NotFittedError rather than returning a value,
        #and a fitted GradientBoostingRegressor exposes estimators_
        try:
            check_is_fitted(self.model, "estimators_")
        except NotFittedError:
            self.model.fit(self.x, self.y)
#all models should be fitted at this point
self.pred_y = self.model.predict(self.x)
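    # A minimal usage sketch (illustrative, not part of the API; assumes the bundled
    # bike dataset and the default model described above):
    #
    #   f2t = ForestForTheTrees(dataset = "bike", model = None, target_col = "Ridership")
    #   f2t.extract_components()
    #   print(f2t.evaluate_explanation(error_metric = "r_squared"))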
    @staticmethod
    def get_sample_dataset(dataset_name):
"""Return dataframe of sample dataset"""
if dataset_name != "bike":
raise NotImplementedError("The only dataset currently available is the bike dataset.")
if dataset_name == "bike":
def _datestr_to_timestamp(s):
return time.mktime(datetime.datetime.strptime(s, "%Y-%m-%d").timetuple())
dataLoad = pd.read_csv('data/bike.csv')
dataLoad['dteday'] = dataLoad['dteday'].apply(_datestr_to_timestamp)
dataLoad = pd.get_dummies(dataLoad, prefix=["weathersit"], columns=["weathersit"], drop_first = False)
#de-normalize data to produce human-readable features.
#Original range info from http://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset
dataLoad["hum"] = dataLoad["hum"].apply(lambda x: x*100.)
dataLoad["windspeed"] = dataLoad["windspeed"].apply(lambda x: x*67.)
        #de-normalize and convert Celsius to Fahrenheit
dataLoad["temp"] = dataLoad["temp"].apply(lambda x: (x*47. - 8)*9/5 +32)
dataLoad["atemp"] = dataLoad["atemp"].apply(lambda x: (x*66. - 16)*9/5 + 32)
#rename features to make them interpretable for novice users
feature_names_dict = {
"yr":"First or Second Year",
"season":"Season",
"hr":"Hour of Day",
"workingday":"Work Day",
"weathersit_2":"Misty Weather",
"weathersit_3":"Light Precipitation",
"weathersit_4":"Heavy Precipitation",
"temp":"Temperature (F)",
"atemp":"Feels Like (F)",
"hum":"Humidity",
"windspeed":"Wind Speed"
}
dataLoad = dataLoad.rename({"cnt" : "Ridership"}, axis = 1)
dataLoad = dataLoad.rename(mapper=feature_names_dict,axis=1)
return dataLoad.loc[:, list(feature_names_dict.values()) + ["Ridership"]]
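    # The loader can also be called directly to inspect the prepared frame, e.g.
    # (a sketch; assumes data/bike.csv is present relative to the working directory):
    #
    #   df = ForestForTheTrees.get_sample_dataset("bike")
    #   print(df.columns.tolist())   # renamed features plus "Ridership"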
def get_series_as_array(self, ser):
return np.array(ser).reshape(-1,1)
def return_dataset_as_dataframe(self):
"""Construct dataframe from internal NumPy data representation"""
return pd.DataFrame(data = np.hstack((self.x, self.get_series_as_array(self.y))),
columns = self.feature_names + [self.target_col]
)
def _bin_data(self):
"""For each pair of features in the dataset, generate an array of bin values in an X/Y grid."""
prediction_contributions = {}
sample_data = pd.DataFrame(
self._get_sample(self.x),
columns = self.feature_names
)
for key in self._get_feature_pairs():
tempH = np.digitize(
sample_data.loc[:,key[0]],
self.feature_ranges[key[0]]
)-1.
tempV = np.digitize(
sample_data.loc[:,key[1]],
self.feature_ranges[key[1]]
)-1.
prediction_contributions[key] = (tempV*len(self.feature_ranges[key[0]]) + tempH).astype(int)
return prediction_contributions
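    # The flattened bin index above follows row-major order: a datapoint in
    # horizontal bin h and vertical bin v is stored as v*n_h + h, so np.take on a
    # chart of shape (n_v, n_h) recovers chart[v, h]. A tiny sketch of the same
    # digitize step with hypothetical edges:
    #
    #   edges = np.array([0., 10., 20.])
    #   np.digitize([5., 15.], edges) - 1.   # -> array([0., 1.])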
def _get_coordinate_matrix(self, lst, length, direction):
"""Expand a list out to length to create the indices for a 2d feature array"""
if direction=="h":
return lst*length
else:
return [item for item in lst\
for i in range(length)]
def _get_quantile_matrix(self, feat1, feat2):
"""Get the flattened lists of indices needed to index a 2d array of 2 features"""
h = self._get_coordinate_matrix(
list(self.feature_ranges[feat1]),
len(self.feature_ranges[feat2]),
"h"
)
v = self._get_coordinate_matrix(
list(self.feature_ranges[feat2]),
len(self.feature_ranges[feat1]),
"v"
)
return h,v
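    # Example with hypothetical bin lists: for feat1 bins [1, 2] and feat2 bins
    # [10, 20], h = [1, 2, 1, 2] and v = [10, 10, 20, 20], i.e. the coordinates of
    # a 2x2 grid flattened in row-major order.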
def _get_leaf_value(self, tree, node_position):
"""Access one node in a tree by its position"""
return tree.value[node_position]
def _get_feature_pair_key(self, feat1, feat2):
"""Generate a tuple key from two features with stable sorting. Used to define chart components."""
if self.feature_ranges[feat1].shape[0] == self.feature_ranges[feat2].shape[0]:
#need stable order so keys with same number of quantiles appear in only one order
return tuple(sorted([feat1, feat2]))
elif self.feature_ranges[feat1].shape[0] > self.feature_ranges[feat2].shape[0]:
return tuple([feat1, feat2])
else:
return tuple([feat2, feat1])
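    # Example with hypothetical features: a 24-bin "Hour of Day" paired with a
    # 2-bin "Work Day" yields the key ("Hour of Day", "Work Day"); pairs with equal
    # bin counts fall back to alphabetical order so each pair has exactly one key.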
def _get_quantiles(self, feat):
"""For a given feature, generate the unique categories or quantitative bins"""
loc = self.feature_locs[feat]
if np.unique(self.x[:,loc]).shape[0] < 30 or type(self.x[0,loc]) is str: #is categorical/ordinal?
return np.unique(self.x[:,loc])
else:
if self.quantiles:
return np.around(
np.unique(
np.quantile(
a=self.x[:,loc],
q=np.linspace(0, 1, self.num_tiles)
)
),
1)
else:
return np.around(
np.linspace(
np.min(self.x[:,loc]),
np.max(self.x[:,loc]),
self.num_tiles
)
,1)
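    # A sketch of the two quantitative binning modes on made-up skewed data:
    #
    #   x = np.random.exponential(size = 1000)
    #   equal_width = np.linspace(np.min(x), np.max(x), 40)     # quantiles = False
    #   equal_count = np.quantile(x, np.linspace(0, 1, 40))     # quantiles = True
    #
    # Equal-count bins place more edges where the data is dense, which tends to
    # produce more informative heatmaps for skewed features.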
def _reduce_to_1d(self, arr, threshold, direction):
"""Determine if the loss from reducing an array to its first column or row is below user-defined threshold"""
if direction == "h":
reduced_arr = arr - arr[:,0].reshape(-1,1)
else:
reduced_arr = arr - arr[0,:].reshape(1,-1)
return (np.max(np.abs(reduced_arr))/np.max(np.abs(arr))) <= threshold
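    # Intuition: subtracting the first row (direction "v") or first column ("h")
    # removes any effect that depends on only one of the two features; if the
    # residual is small relative to the chart's largest value, the chart is
    # effectively one-dimensional. E.g. for arr = [[1., 2.], [1., 2.]] the rows are
    # identical, so arr - arr[0,:] is all zeros and the "v" reduction succeeds.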
def _get_sample(self, arr):
"""Get the first self.sample_size datapoints in the dataset as a sample"""
#NOTE: this needs to be improved to use a persistent random sample of datapoint ids instead
#which should be set upon initialization or whenever self.sample_size is changed
return arr[:self.sample_size]
def _get_predictions_base(self):
"""Generate the default (estimators = 0) predictions consisting of the target mean in an array of sample_size."""
return np.full((self.sample_size,1), np.mean(self.y))
def _get_empty_sample(self, size = None):
"""Get an array of zeroes of user-defined size."""
return np.full((self.sample_size if size is None else size,1), 0)
def _get_explanation_accuracy(self, explanation_predictions, error_metric, subset_indices = None):
""" Compare a list of predictions to the actual Y value and generate desired error score
--Arguments
explanation_predictions (np.array):
Array of same length and ordering as self.y
            error_metric (string):
                Name of score to apply. Currently only r_squared and mae are implemented
        --Returns
            score: Float
                Score from the desired error function
"""
        if error_metric == "r_squared":
            func = r2_score
        elif error_metric == "mae":
            func = mean_absolute_error
        else: #fail fast instead of raising a NameError on func below
            raise ValueError(f"Unknown error metric {error_metric}. "\
                + f"Try one of {self.ALLOWED_REGRESSION_ERROR_METRICS}.")
if subset_indices is not None:
return func(self.pred_y[subset_indices], explanation_predictions[subset_indices])
else:
return func(self._get_sample(self.pred_y), explanation_predictions)
def _get_prediction_contributions(self, chart, data_positions):
""" For a given chart/explanation component, get the prediction contribution corresponding to the bin in which a datapoint falls.
--Arguments
chart: np.array:
A chart component or explanation component.
                This has shape (len(_get_quantiles(feat2)), len(_get_quantiles(feat1)))
data_positions: np.array
Array of positions to "take" from chart components, each position corresponding to the binned
feat1+feat2 values for one datapoint. Positions are flattened into a single value.
--Returns
values: np.array
                Array in which the value of each item corresponds to the prediction contribution that chart
made for the datapoint represented at position.
"""
return np.take(chart, data_positions)
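    # E.g. with a hypothetical 2x2 chart, np.take indexes the flattened array:
    #
    #   chart = np.array([[0.1, 0.2], [0.3, 0.4]])
    #   np.take(chart, np.array([0, 3]))   # -> array([0.1, 0.4])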
def _sum_arrays(self, temp_outputs, keyMain, keyAdd, arr_to_add):
""" Roll up an aggregated view of a chart component into a more predictive one.
--Arguments
temp_outputs: dict of dict of arrays:
                Dictionary of feature pair tuples, each with the multiple outputs generated by _extract_components()
keyMain: tuple(feat1, feat2)
The key of the chart to be modified
keyAdd: tuple(feat1, feat2)
The key of the chart being 'rolled up'
arr_to_add: string
The particular form of derived output to roll up, based on which feature axis is being aggregated
and whether the other axis was already rolled up into another chart
--Returns
values: np.array
The updated version of temp_outputs[keyMain]['output']
"""
return temp_outputs[keyMain]["output"]\
+ temp_outputs[keyAdd][arr_to_add].reshape(
temp_outputs[keyMain]["output"].shape[0]
if(keyMain[1]==keyAdd[1] or keyMain[1]==keyAdd[0])
else 1,-1
)
def _drop_alternate_outputs(self,component):
"""Discard output_v, output_h, etc once they have been folded into 'output' during extract_components()"""
return {"output": component["output"]}
def _get_prediction_contributions_df(self, components, explanation):
"""Convert the prediction contributions for a set of components into a dataframe"""
return np.hstack(
tuple(
[
self._get_prediction_contributions(
components[expKey]["output"],
self.binned_data[expKey]
).reshape(-1, 1)\
for expKey in explanation
]
)
)
def get_prediction_contributions_by_key(self, components, explanation):
""" Generate prediction contributions by components of an explanation
for each datapoint in get_sample(self.x)
--Arguments
components: dict of dict of arrays:
Typically, self.chart_components or self.explanation_components
            explanation: list of feature pair tuples, ordered by importance
                Keys of the components whose contributions should be returned
--Returns
values: dict of arrays
Dict of feature pair tuples where each value is the point-wise prediction contributions
for each datapoint in get_sample(self.x)
"""
return {
expKey :
self._get_prediction_contributions(
components[expKey]["output"],
self.binned_data[expKey]
) for expKey in explanation
}
def evaluate_explanation(self, error_metric = "r_squared"):
"""Evaluate an explanation's fidelity to self.model using the provided error metric
--Arguments
error_metric: String
The error metric by which to evaluate the explanation's fidelity
                to the model. Currently "r_squared" and "mae" are accepted.
--Returns
fidelity: Float
                Depends on the definition of the error metric. R-squared is a float where -inf < x <= 1.
"""
if self.task == "Regression":
if error_metric in self.ALLOWED_REGRESSION_ERROR_METRICS:
return self._evaluate_single_explanation(self.chart_components, self.explanation, error_metric)
else:
                raise ValueError(f"Not an allowed error metric for {self.task}. Try one of "\
                    + f"{self.ALLOWED_REGRESSION_ERROR_METRICS}.")
elif self.task == "Classification":
raise NotImplementedError("Classification not yet implemented.")
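    # Usage sketch (assumes extract_components() has populated self.chart_components
    # and an explanation has been stored on self.explanation):
    #
    #   fidelity = f2t.evaluate_explanation(error_metric = "r_squared")
    #   print(f"Explanation fidelity to the model: {fidelity:.3f}")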
def _evaluate_single_explanation(self, components, explanation, error_metric, subset_indices = None):
"""Internal version of evaluate_explanation()
--Arguments
components: dict of dict of arrays
Dictionary of feature pair keys with each value having output, output_v, output_h, etc
Each form of output is an array of prediction contributions indexed by chart_indices
explanation: List
List of feature pair keys specifying an explanation. Typically from self.explanation or self.base_explanation
error_metric: String
The error metric by which to evaluate the explanation's fidelity
                to the model. Currently "r_squared" and "mae" are accepted.
--Returns
fidelity: Float
Depends on the definition of the error metric. R-squared is a float where -inf < x <= 1.
"""
return self._get_explanation_accuracy(
self.predictions_base +\
np.sum(
np.array(
list(
self.get_prediction_contributions_by_key(
components,
explanation
).values()
)
),
axis = 0
).reshape(-1,1),
error_metric,
subset_indices
)
def _get_parallel_coordinate_columns(self, explanation, cumulative):
"""Get the list of column names for a parallel coordinates chart showing prediction contributions
--Arguments
explanation: List
List of feature pair keys specifying an explanation. Typically from self.explanation or self.base_explanation
cumulative: Boolean
Whether the parallel coordinates chart calling this displays cumulative predictions or individual
prediction components. Controls whether "mean y" and "prediction" are added to the output
--Returns
column_names: List
List of stringified column names for Altair to use in the chart generated by visualize_datapoints()
"""
#make these strings because Altair doesn't like a tuple as a key and turns it into a list
return (["mean y"] if cumulative else [])\
+ [x[0] + "," + x[1] for x in explanation]\
+ (["prediction"] if cumulative else [])
def _get_altair_data_type(self, feature_name, abbreviation = True):
"""Return the data type of a field in the format required by Altair field encodings
--Arguments
feature_name: String
Name of a valid feature in this dataset.
abbreviation: Boolean
Whether to return the data type as the full Altair name or the abbreviated form
--Returns
data_type: String
Name or abbreviation of the feature's data type
"""
if feature_name not in self.feature_ranges:
            raise ValueError("Feature is not present in dataset. Check f2t.feature_ranges.")
elif self.feature_ranges[feature_name].shape[0] == self.num_tiles:
return "Q" if abbreviation else "quantitative"
else:
return "O" if abbreviation else "ordinal"
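    # E.g. a quantitative feature binned into num_tiles edges encodes as
    # "Q"/"quantitative" for Altair, while a categorical feature with fewer
    # unique values encodes as "O"/"ordinal".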
def _get_datapoint_contributions(self, components, explanation):
"""Build the data structure for the chart generated by visualize_datapoints().
This is basically the result of _get_prediction_contributions_df() with additional columns
for prediction loss, sorting, etc.
--Arguments
components: dict of dict of arrays
Dictionary of feature pair keys with each value having output, output_v, output_h, etc
Each form of output is an array of prediction contributions indexed by chart_indices
explanation: List
List of feature pair keys specifying an explanation. Typically from self.explanation or self.base_explanation
--Returns
datapoints: pandas.DataFrame
Dataframe with one row per datapoint, and columns for each feature pair key indicating their contribution
to the prediction, with additional columns for sort order, prediction, mean_y, and explanation_loss.
"""
contributions = self._get_prediction_contributions_df(components, explanation)
#raw contributions
arr = np.hstack(
(
self._get_empty_sample().reshape(-1,1),
contributions.reshape(self.sample_size, -1),
self._get_sample(self.pred_y).reshape(-1,1),
np.array([0. for x in range(self.sample_size)]).reshape(-1,1)
)
)
#cumulative version
arr_cum = np.cumsum(
np.hstack(
(
self._get_predictions_base().reshape(-1,1),
contributions.reshape(self.sample_size,-1)
)
),
axis = 1
)
arr_cum = np.hstack(
(
arr_cum,
self._get_sample(self.pred_y).reshape(-1,1),
np.array([1. for x in range(self.sample_size)]).reshape(-1,1)
)
)
arr_df = pd.DataFrame(
arr,
columns = self._get_parallel_coordinate_columns(explanation, True) + ["view"]
)
#combine arrays
arr_cum_df = pd.DataFrame(
arr_cum,
columns = self._get_parallel_coordinate_columns(explanation, True) + ["view"]
)
#generate datapoint id columns
arr_df = arr_df.reset_index(drop = False)
arr_cum_df = arr_cum_df.reset_index(drop = False)
datapoints = pd.concat([arr_df, arr_cum_df])
#couldn't do this earlier, as you can't vstack a mixed type array
datapoints["view"] = datapoints["view"].apply(lambda x:\
'Predictions by Chart'\
if x < 1. else\
'Cumulative Predictions'\
)
#calculate explanation loss
datapoints["explanation_loss"] = np.abs( #otherwise the chart is hard to read with 0 in the middle of the axis
datapoints.loc[:,"prediction"] - datapoints.iloc[:,-3]#last cumulative column
)
datapoints["prediction_index"] = datapoints["prediction"]
datapoints = datapoints.melt(
id_vars = ['index', "prediction_index", "view", "explanation_loss"],
var_name = 'component',
value_name = 'contribution'
)
#rename prediction again
datapoints = datapoints.rename({"prediction_index" : "prediction"}, axis = 1)
#drop fake columns for "Predictions by Chart"
datapoints = datapoints[
(datapoints["view"] == "Cumulative Predictions")
| (~datapoints["component"].isin(["prediction", "mean y", "explanation_loss"]))
]
#build sort column for Altair
datapoints["sort"] = datapoints["component"].apply(lambda x:
self._get_parallel_coordinate_columns(
explanation,
True
).index(x)
)
return datapoints
def _copy_chart_components(self):
"""Generate a deep copy of self.chart_components so they can be safely modified during roll-up"""
return copy.deepcopy(self.chart_components)
def _get_feature_pairs(self):
"""Generate the full list of feature pair keys based on the feature names of the dataset"""
return [
self._get_feature_pair_key(key[0], key[1])
for key in [tuple(t) for t in product(self.feature_names, repeat = 2)]
]
def _rollup_components(self, explanation):
"""ROLLUP is under development and may not function as intended"""
temp_outputs = self._copy_chart_components()
for keyRollup in [k for k in self.chart_components.keys() if k not in explanation]:
hUsed = False
vUsed = False
for keyExisting in explanation:
if (keyRollup[1] == keyExisting[0] or keyRollup[1] == keyExisting[1]) and not hUsed:
hUsed = True
if vUsed:
temp_outputs[keyExisting]["output"] = self._sum_arrays(
temp_outputs,
keyExisting,
keyRollup,
"output_HReduced"
)
break
else:
temp_outputs[keyExisting]["output"] = self._sum_arrays(
temp_outputs,
keyExisting,
keyRollup,
"output_H"
)
elif (keyRollup[0] == keyExisting[0] or keyRollup[0] == keyExisting[1]) and not vUsed:
vUsed = True
if hUsed:
temp_outputs[keyExisting]["output"] = self._sum_arrays(
temp_outputs,
keyExisting,
keyRollup,
"output_VReduced"
)
break
else:
temp_outputs[keyExisting]["output"] = self._sum_arrays(
temp_outputs,
keyExisting,
keyRollup,
"output_V"
)
return temp_outputs
def visualize_estimator(self, estimator_nums, try_collapse = False,
print_function_text = True, auto_display = True):
"""Visualize the chart components for 1 or more individual estimators within a model.
--Arguments
estimator_nums: Integer or List
The list of estimators to visualize. Represents the index of the tree in model.estimators_
try_collapse: Boolean
            If true, collapse the estimators into a single chart where possible, i.e. when they share the same feature pair key.
print_function_text: Boolean
If true, print the text of the decision function specified by the tree on top of the chart.
auto_display: Boolean
If true, display the chart. Otherwise, just return it for manual alteration and display
--Returns
output: Altair.chart
Returns the Altair visualization spec for further modification if necessary.
If auto_display = True, this can be discarded.
"""
estimators = [self.model.estimators_[estimator_nums]] if type(estimator_nums) is int\
else [x for i,x in enumerate(self.model.estimators_) if i in estimator_nums]
chart_components,\
chart_indices,\
_,\
_,\
function_texts = self._extract_components(
try_collapse,
estimators,
print_function_text
)
        if print_function_text:
            if len(function_texts) == 1:
                for text in list(function_texts.values())[0]["function_texts"]:
                    print(text)
            else:
                for texts in [v["function_texts"] for v in function_texts.values()]:
                    for text in texts:
                        print(text)
chart = self._visualize_components(
chart_components.keys(),
chart_components,
chart_indices,
None,
None,
None,
300,
4
)
if auto_display:
display(chart)
return chart
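    # Usage sketch (assumes a fitted model with at least two estimators):
    #
    #   f2t.visualize_estimator(0)                            # a single tree
    #   f2t.visualize_estimator([0, 1], try_collapse = True)  # merge if keys match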
def _get_function_text(self, decision_func_dict):
"""Convert the internal representation of a tree into a natural language if-then statement of the decision function.
--Arguments
decision_func_dict: Dict
The representation of an estimator used in _extract_components(). See that function for more details.
--Returns
text: String
Decision function for the passed estimator as an if-then statement.
"""
        def _get_left_right_text(op, le, gt):
            #leaf values arrive as (1,1) arrays; coerce to scalars before rounding
            le, gt = float(np.squeeze(le)), float(np.squeeze(gt))
            if op == operator.le:
                return " is less than or equal to ", str(round(le,1)), str(round(gt,1))
            else:
                return " is greater than ", str(round(gt,1)), str(round(le,1))
        if "feature_name" in decision_func_dict: #1-deep
            if "prob" in decision_func_dict: #asymmetric tree: only one populated leaf under the root
                comparison_text, _, _ = _get_left_right_text(decision_func_dict["operator"], 0, 0)
                return "If " + decision_func_dict["feature_name"] + comparison_text\
                    + str(round(decision_func_dict["threshold"],1)) + " then "\
                    + str(round(float(np.squeeze(decision_func_dict["prob"])),1)) + " else 0. "
            comparison_text, left, right = _get_left_right_text(
                decision_func_dict["operator"],
                decision_func_dict["prob_le"],
                decision_func_dict["prob_gt"]
            )
text = "If " + decision_func_dict["feature_name"] + comparison_text\
+ str(round(decision_func_dict["threshold"],1)) + " then " + left\
+ " else " + right + ". "
return text
else: #2-deep
comparison_text_1, left_1, right_1 = _get_left_right_text(
decision_func_dict["operator_1"],
0,
0
)
comparison_text_2, left_2, right_2 = _get_left_right_text(
decision_func_dict["operator_2"],
decision_func_dict["prob_le"],
decision_func_dict["prob_gt"]
)
text = "If " + decision_func_dict["feature_name_1"] + comparison_text_1\
+ str(round(decision_func_dict["threshold_1"],1)) + " then proceed. If "\
+ decision_func_dict["feature_name_2"] + comparison_text_2\
+ str(round(decision_func_dict["threshold_2"],1)) + " then " + left_2 + " else " + right_2 + ". "
return text
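    # For a 1-deep tree this yields text of the form (hypothetical numbers):
    #   "If Temperature (F) is less than or equal to 60.0 then 12.3 else 45.6. "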
def extract_components(self, collapse_1d = True, return_text = False):
""" Extract chart components from the underlying tree structures.
--Arguments
collapse_1d: Boolean
If True, perform basic rollup and add 1d charts to the chart component
that has the highest feature importance among components that include the field in question
return_text: Boolean
If True, generate text descriptions of the function described by each tree.
Generally only necessary for instructional purposes, see notebook.ipynb.
--Returns
None
Chart components, chart indices, and debugging data saved to internal state
See _extract_components() for more detail.
"""
self.chart_components,\
self.chart_indices,\
self.no_predictor_features,\
self.oned_features,\
self.estimator_texts = self._extract_components(collapse_1d, self.model.estimators_, return_text)
self.predictions_base = self._get_predictions_base()
#get the full explanation and store it. one reason for this is so that
#charts can also be sorted appropriately
#don't save explanation components as they will be the same as chart_components
self.base_explanation, _, _\
= self._explain(1., None)
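    # Typical call sequence (a sketch; construction mirrors __init__ above):
    #
    #   f2t = ForestForTheTrees("bike", None, "Ridership")
    #   f2t.extract_components(collapse_1d = True)
    #   list(f2t.chart_components.keys())   # feature pair tuples with extracted outputs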
def _extract_components(self, collapse_1d, estimators, return_text):
""" Extract chart components from the underlying tree structures.
Internal version of extract_components()
--Arguments
collapse_1d: Boolean
If True, perform basic rollup and add 1d charts to the chart component
that has the highest feature importance among components that include the field in question
return_text: Boolean
If True, generate text descriptions of the function described by each tree.
Generally only necessary for instructional purposes, see notebook.ipynb.
--Returns
chart_components: dict of dict of arrays
Dictionary of feature pair keys with each value having output, output_v, output_h, etc
Each form of output is an array of prediction contributions indexed by chart_indices
chart_indices: dict of dict of lists
Dictionary of feature pair keys with each value having "h_indices" and "v_indices",
each of which is a list containing the bins for the feature in the horizontal or vertical position
no_predictor_features: list of feature pair keys
List of feature tuples for those pairs that were not included in any trees in the model
            oned_features: list of feature pair keys
                List of feature pair tuples whose charts were reduced to one dimension and rolled up
                If collapse_1d is False, return empty list
            estimator_texts: dict
                If return_text, a dict keyed by feature pair whose "function_texts" lists hold the
                decision function descriptions for the trees in self.model mapped to that pair.
                If False, the lists are empty.
"""
#generate data structure for pairwise charts
feature_pairs = {
key : {
"map":None,
"predicates":[],
"function_texts":[]
}
for key in self._get_feature_pairs()
}
for key, value in feature_pairs.items():
h, v = self._get_quantile_matrix(key[0], key[1])
value["map"] = np.array(
[
{
key[0] : x,
key[1] : y
}
for x,y in zip(h,v)
]
).reshape(len(self.feature_ranges[key[1]]), len(self.feature_ranges[key[0]]))
for modelT in estimators:
curr_model = modelT[0]
feature_ids = {
i : {
"number":x,
"name":self.feature_names[x]
} for i,x in enumerate(list(curr_model.tree_.feature))
if x >= 0
} #-2 means leaf node
#for 1-layer trees
if curr_model.tree_.feature[1] < 0:
feature_pair_key = self._get_feature_pair_key(
feature_ids[0]["name"],
feature_ids[0]["name"]
)
decision_func_dict = {
"feature_name": feature_ids[0]["name"],
"threshold": curr_model.tree_.threshold[0],
"operator": operator.le,
"prob_le": self._get_leaf_value(curr_model.tree_, 1),
"prob_gt": self._get_leaf_value(curr_model.tree_, 2)
}
#build the predictive function used in the decision tree
def dt_predicate(data_case, decision_func_dict=decision_func_dict):
if decision_func_dict["operator"](\
data_case[decision_func_dict["feature_name"]],\
decision_func_dict["threshold"]\
):
return decision_func_dict["prob_le"]
else:
return decision_func_dict["prob_gt"]
else:
for node_position in [1,4]: #positions for left and right nodes at layer 2
if node_position in feature_ids:
feature_pair_key = self._get_feature_pair_key(
feature_ids[0]["name"],
feature_ids[node_position]["name"]
)
#get the decision rules
decision_func_dict = {
"feature_name_1": feature_ids[0]["name"],
"threshold_1": curr_model.tree_.threshold[0],
"operator_1": operator.le if node_position == 1 else operator.gt,
"feature_name_2": feature_ids[node_position]["name"],
"threshold_2": curr_model.tree_.threshold[node_position],
"operator_2": operator.le,
"prob_le": self._get_leaf_value(curr_model.tree_, node_position+1),
"prob_gt": self._get_leaf_value(curr_model.tree_, node_position+2)
}
#build the predictive function used in the decision tree
def dt_predicate(data_case, decision_func_dict=decision_func_dict):
if decision_func_dict["operator_1"](\
data_case[decision_func_dict["feature_name_1"]],\
decision_func_dict["threshold_1"]\
):
if decision_func_dict["operator_2"](\
data_case[decision_func_dict["feature_name_2"]],\
decision_func_dict["threshold_2"]\
):
return decision_func_dict["prob_le"]
else:
return decision_func_dict["prob_gt"]
else:
return 0.
else: #asymmetric tree, this is a leaf node
feature_pair_key = self._get_feature_pair_key(
feature_ids[0]["name"],
feature_ids[0]["name"]
)
decision_func_dict = {
"feature_name": feature_ids[0]["name"],
"threshold": curr_model.tree_.threshold[0],
"operator": operator.le if node_position == 1 else operator.gt,
"prob": curr_model.tree_.value[node_position]
}
#build the predictive function used in the decision tree
def dt_predicate(data_case, decision_func_dict=decision_func_dict):
if decision_func_dict["operator"](\
data_case[decision_func_dict["feature_name"]],\
decision_func_dict["threshold"]\
):
return decision_func_dict["prob"]
else:
return 0.
feature_pairs[feature_pair_key]["predicates"].append(dt_predicate)
if return_text:
feature_pairs[feature_pair_key]["function_texts"].append(
self._get_function_text(
decision_func_dict
)
)
#now calculate output array for each feature pair
for key, value in feature_pairs.items():
arrs = []
for predicate in value["predicates"]:
f = np.vectorize(predicate)
arrs.append(f(value["map"]))
if len(arrs) > 0:
                #details of vote aggregation method for random forest
#https://stats.stackexchange.com/questions/127077/random-forest-probabilistic-prediction-vs-majority-vote
value["output"] = np.sum(np.stack(arrs, axis=-1), axis=-1)*self.model.learning_rate
else:
value["output"] = None
#build chart data
for key, value in feature_pairs.items():
h,v = self._get_quantile_matrix(key[0], key[1])
value["h_indices"] = h
value["v_indices"] = v
no_predictor_features = []
oned_features = []
chart_data = {}
for key, value in feature_pairs.items():
newKey = key
if value["output"] is None:
no_predictor_features.append(key)
value["removed"] = True
else:
if collapse_1d:
if self._reduce_to_1d(value["output"], 0., "v"):
newKey = key[1]
value["output"] = value["output"][0,:]
value["h_indices"] = self.feature_ranges[newKey]
value["v_indices"] = None
value["1d_key"] = newKey
value["removed"] = True
oned_features.append(key)
elif self._reduce_to_1d(value["output"], 0., "h"):
newKey = key[0]
value["output"] = value["output"][:,0]
value["h_indices"] = self.feature_ranges[newKey]
value["v_indices"] = None
value["1d_key"] = newKey
value["removed"] = True
oned_features.append(key)
        #do another loop through the feature pairs to push 1d charts into 2d
if collapse_1d:
for value in list(feature_pairs.values()):
if value["v_indices"] is None:
key = value["1d_key"]
#get list of charts with this feature
matchList = sorted([{"key": kInner, "feature_importance": np.std(vInner["output"])}\
for kInner, vInner in feature_pairs.items()\
if "removed" not in vInner and key in kInner],\
key=lambda x: x["feature_importance"], reverse=True)
if len(matchList) > 0: