% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/FFTrees.R, R/util_abc.R
\docType{package}
\name{FFTrees}
\alias{FFTrees}
\alias{FFTrees-package}
\title{Main function to create and apply fast-and-frugal trees (FFTs)}
\usage{
FFTrees(
  formula = NULL,
  data = NULL,
  data.test = NULL,
  algorithm = "ifan",
  train.p = 1,
  goal = NULL,
  goal.chase = NULL,
  goal.threshold = NULL,
  max.levels = NULL,
  numthresh.method = "o",
  numthresh.n = 10,
  repeat.cues = TRUE,
  stopping.rule = "exemplars",
  stopping.par = 0.1,
  sens.w = 0.5,
  cost.outcomes = NULL,
  cost.cues = NULL,
  main = NULL,
  decision.labels = c("False", "True"),
  my.goal = NULL,
  my.goal.fun = NULL,
  my.tree = NULL,
  object = NULL,
  tree.definitions = NULL,
  do.comp = TRUE,
  do.cart = TRUE,
  do.lr = TRUE,
  do.rf = TRUE,
  do.svm = TRUE,
  quiet = list(ini = TRUE, fin = FALSE, mis = FALSE, set = TRUE),
  comp = NULL,
  force = NULL,
  rank.method = NULL,
  rounding = NULL,
  store.data = NULL,
  verbose = NULL
)
}
\arguments{
\item{formula}{A formula. A \code{\link{formula}} specifying a binary criterion variable (as logical) as a function of 1 or more predictor variables (cues).}
\item{data}{A data frame. A dataset used for training (fitting) FFTs and alternative algorithms.
\code{data} must contain the binary criterion variable specified in \code{formula} and potential predictors (which can be categorical or numeric variables).}
\item{data.test}{A data frame. An optional dataset used for model testing (prediction), with the same structure as \code{data}.}
\item{algorithm}{A character string. The algorithm used to create FFTs. Can be \code{'ifan'} or \code{'dfan'}.}
\item{train.p}{numeric. The proportion of \code{data} to use for training when \code{data.test} is not specified.
For example, \code{train.p = .50} randomly splits \code{data} into a 50\% training set and a 50\% test set.
Default: \code{train.p = 1} (i.e., using \emph{all} data for training).}
\item{goal}{A character string indicating the statistic to maximize when \emph{selecting trees}:
\code{"acc"} = overall accuracy, \code{"bacc"} = balanced accuracy, \code{"wacc"} = weighted accuracy,
\code{"dprime"} = discriminability, \code{"cost"} = costs (based on \code{cost.outcomes} and \code{cost.cues}).}
\item{goal.chase}{A character string indicating the statistic to maximize when \emph{constructing trees}:
\code{"acc"} = overall accuracy, \code{"bacc"} = balanced accuracy, \code{"wacc"} = weighted accuracy,
\code{"dprime"} = discriminability, \code{"cost"} = costs (based on \code{cost.outcomes} and \code{cost.cues}).}
\item{goal.threshold}{A character string indicating the criterion to maximize when \emph{optimizing cue thresholds}:
\code{"acc"} = overall accuracy, \code{"bacc"} = balanced accuracy, \code{"wacc"} = weighted accuracy,
\code{"dprime"} = discriminability, \code{"cost"} = costs (based only on \code{cost.outcomes}, as \code{cost.cues} are constant per cue).
All default goals are set in \code{\link{fftrees_create}}.}
\item{max.levels}{integer. The maximum number of nodes (or levels) considered for an FFT.
As all combinations of possible exit structures are considered, larger values of \code{max.levels} will create larger sets of FFTs.}
\item{numthresh.method}{How should thresholds for numeric cues be determined (as character)?
\code{"o"} will optimize thresholds (for \code{goal.threshold}), while \code{"m"} will use the median.
Default: \code{numthresh.method = "o"}.}
\item{numthresh.n}{The number of numeric thresholds to try (as integer).
Default: \code{numthresh.n = 10}.}
\item{repeat.cues}{May cues occur multiple times within a tree (as logical)?
Default: \code{repeat.cues = TRUE}.}
\item{stopping.rule}{A character string indicating the method to stop growing trees.
Available options are:
\itemize{
\item{\code{"exemplars"}: A tree grows until only a small proportion of unclassified exemplars remain;}
\item{\code{"levels"}: A tree grows until a certain level is reached;}
\item{\code{"statdelta"}: A tree grows until the change in the criterion statistic \code{goal.chase} exceeds some threshold level.
(This setting is currently experimental and includes the first level beyond threshold.
As tree statistics can be non-monotonic, this option may yield inconsistent results.)}
}
All stopping methods use \code{stopping.par} to set a numeric threshold value.
Default: \code{stopping.rule = "exemplars"}.}
\item{stopping.par}{numeric. A numeric parameter indicating the criterion value for the current \code{stopping.rule}.
For \code{stopping.rule = "levels"}, this is the number of desired levels (as an integer);
e.g., \code{stopping.rule = "levels"} with \code{stopping.par = 4} stops tree growth at 4 levels.
For \code{stopping.rule = "exemplars"}, this is the smallest proportion of exemplars allowed in the last level.
For \code{stopping.rule = "statdelta"}, this is the minimum required change (in the \code{goal.chase} value) to include a level.
Default: \code{stopping.par = .10}.}
\item{sens.w}{A numeric value from \code{0} to \code{1} indicating how to weight
sensitivity relative to specificity when optimizing \emph{weighted} accuracy (e.g., \code{goal = 'wacc'}).
Default: \code{sens.w = .50} (i.e., \code{wacc} corresponds to \code{bacc}).}
\item{cost.outcomes}{A list of length 4 specifying the cost value for each of the 4 possible classification outcomes.
The list elements must be named \code{'hi'}, \code{'fa'}, \code{'mi'}, and \code{'cr'}
(for specifying the costs of a hit, false alarm, miss, and correct rejection, respectively) and provide a numeric cost value.
E.g., \code{cost.outcomes = list("hi" = 0, "fa" = 10, "mi" = 20, "cr" = 0)} imposes false alarm and miss costs of \code{10} and \code{20} units, respectively, while correct decisions incur no costs
(see the Examples section for a cost-based sketch).}
\item{cost.cues}{A list containing the cost of each cue (in some common unit).
Each list element must have a name corresponding to a cue (i.e., a variable in \code{data}) and provide a single (positive numeric) cost value;
e.g., \code{cost.cues = list("age" = 1, "chol" = 2)} (assuming cues \code{age} and \code{chol} in \code{data}).
Cues in \code{data} that are not present in \code{cost.cues} are assumed to have no costs (i.e., a cost value of \code{0}).}
\item{main}{string. An optional label for the dataset. Passed on to other functions, like \code{\link{plot.FFTrees}} and \code{\link{print.FFTrees}}.}
\item{decision.labels}{A vector of strings of length 2 for the text labels for negative and positive decision/prediction outcomes
(i.e., left vs. right, noise vs. signal, 0 vs. 1, respectively, as character).
E.g., \code{decision.labels = c("Healthy", "Diseased")}.}
\item{my.goal}{The name of an optimization measure defined by \code{my.goal.fun} (as a character string).
Example: \code{my.goal = "my_acc"} (see \code{my.goal.fun} for corresponding function).
Default: \code{my.goal = NULL}.}
\item{my.goal.fun}{The definition of an outcome measure to optimize, defined as a function
of the frequency counts of the 4 basic classification outcomes \code{hi, fa, mi, cr}
(i.e., an R function with 4 arguments \code{hi, fa, mi, cr}).
Example: \code{my.goal.fun = function(hi, fa, mi, cr){(hi + cr)/(hi + fa + mi + cr)}} (i.e., accuracy).
Default: \code{my.goal.fun = NULL}.}
\item{my.tree}{A verbal description of an FFT, i.e., an "FFT in words" (as character string).
For example, \code{my.tree = "If age > 20, predict TRUE. If sex = {m}, predict FALSE. Otherwise, predict TRUE."}.}
\item{object}{An optional existing \code{FFTrees} object.
When specified, no new FFTs are fitted, but existing trees are applied to \code{data} and \code{data.test}.
When \code{formula}, \code{data} or \code{data.test} are not specified, the current values of \code{object} are used.}
\item{tree.definitions}{An optional \code{data.frame} of hard-coded FFT definitions (in the format of \code{x$trees$definitions} of an \code{FFTrees} object \code{x}).
If specified, no new FFTs are fitted (i.e., \code{algorithm} and functions for evaluating cues and creating FFTs are skipped).
Instead, the tree definitions provided are used to re-evaluate the current \code{FFTrees} object on current data.}
\item{do.comp, do.lr, do.cart, do.svm, do.rf}{Should alternative algorithms be used for comparison (as logical)?
All options are set to \code{TRUE} by default. Available options correspond to:
\itemize{
\item{\code{do.lr}: Logistic regression (LR, using \code{\link{glm}} from \strong{stats} with \code{family = "binomial"});}
\item{\code{do.cart}: Classification and regression trees (CART, using \code{rpart} from \strong{rpart});}
\item{\code{do.svm}: Support vector machines (SVM, using \code{svm} from \strong{e1071});}
\item{\code{do.rf}: Random forests (RF, using \code{randomForest} from \strong{randomForest}).}
}
Specifying \code{do.comp = FALSE} sets all available options to \code{FALSE}.}
\item{quiet}{A list of 4 logical elements: Should detailed progress reports be suppressed?
Setting list elements to \code{FALSE} is helpful when diagnosing errors.
Default: \code{quiet = list(ini = TRUE, fin = FALSE, mis = FALSE, set = TRUE)},
for suppressing reports on initial steps, final steps, missing cases, and parameter settings, respectively.
Providing a single logical value sets all four elements to \code{TRUE} or \code{FALSE};
e.g., \code{quiet = FALSE} enables all progress reports.}
\item{comp, force, rank.method, rounding, store.data, verbose}{Deprecated arguments (unused or replaced, to be retired in future releases).}
}
\value{
An \code{FFTrees} object with the following elements:
\describe{
\item{criterion_name}{The name of the binary criterion variable (as character).}
\item{cue_names}{The names of all potential predictor variables (cues) in the data (as character).}
\item{formula}{The \code{\link{formula}} specified when creating the FFTs.}
\item{trees}{A list of FFTs created, with further details contained in \code{n}, \code{best}, \code{definitions}, \code{inwords}, \code{stats}, \code{level_stats}, and \code{decisions}.}
\item{data}{The original training and test data (if available).}
\item{params}{A list of defined control parameters (e.g., \code{algorithm}, \code{goal}, \code{sens.w}, as well as various threshold, stopping-rule, and cost parameters).}
\item{competition}{Models and classification statistics for competitive classification algorithms:
Logistic regression (\code{lr}), classification and regression trees (\code{cart}), random forests (\code{rf}), and support vector machines (\code{svm}).}
\item{cues}{A list of cue information, with further details contained in \code{thresholds} and \code{stats}.}
}
}
\description{
\code{FFTrees} is the workhorse function of the \strong{FFTrees} package for creating fast-and-frugal trees (FFTs).
FFTs are decision algorithms for solving binary classification tasks, i.e., they predict the values of a binary criterion variable based on one or more predictor variables (cues).
Using \code{FFTrees} on \code{data} usually generates a range of FFTs and corresponding summary statistics (as an \code{FFTrees} object)
that can then be printed, plotted, and examined further.
The criterion and predictor variables are specified in \code{\link{formula}} notation.
Based on the settings of \code{data} and \code{data.test}, FFTs are trained on a (required) training dataset
(given the current \code{goal} settings) and evaluated on (i.e., used to predict) an (optional) test dataset.
If an existing \code{FFTrees} object \code{object} or \code{tree.definitions} are provided as inputs,
no new FFTs are created:
When both arguments are provided, \code{tree.definitions} takes priority over the FFTs in the existing \code{object}.
Specifically,
\itemize{
\item{If \code{tree.definitions} are provided, these are assigned to the FFTs of the resulting object.}
\item{If no \code{tree.definitions} are provided, but an existing \code{FFTrees} object \code{object} is,
the trees from \code{object} are assigned to the FFTs of the resulting object.}
}
}
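\details{
An existing \code{FFTrees} object can be re-applied to data without fitting new trees.
A minimal sketch (assuming the \code{heart.fft} object created in the Examples below):
\preformatted{
# Apply the FFTs of an existing object to (new) data; no new trees are fitted:
refit.fft <- FFTrees(object = heart.fft, data = heartdisease)

# Alternatively, re-evaluate hard-coded tree definitions on data:
redef.fft <- FFTrees(formula = diagnosis ~ .,
                     data = heartdisease,
                     tree.definitions = heart.fft$trees$definitions)
}
}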
\examples{
# 1. Create fast-and-frugal trees (FFTs) for heart disease:
heart.fft <- FFTrees(formula = diagnosis ~ .,
                     data = heart.train,
                     data.test = heart.test,
                     main = "Heart Disease",
                     decision.labels = c("Healthy", "Diseased"))

# 2. Print a summary of the result:
heart.fft  # same as:
# print(heart.fft, data = "train", tree = "best.train")

# 3. Plot an FFT applied to training data:
plot(heart.fft)  # same as:
# plot(heart.fft, what = "all", data = "train", tree = "best.train")

# 4. Apply FFTs to (new) test data:
plot(heart.fft, data = "test")            # predict with Tree 1
plot(heart.fft, data = "test", tree = 2)  # predict with Tree 2

# 5. Predict classes and probabilities for new data:
predict(heart.fft, newdata = heartdisease)
predict(heart.fft, newdata = heartdisease, type = "prob")
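
# 6. Split data into training and test subsets via train.p
# (an illustrative sketch; the 50/50 split is random, so a seed makes it reproducible):
set.seed(100)  # for reproducible sampling
split.fft <- FFTrees(formula = diagnosis ~ .,
                     data = heartdisease,
                     train.p = .50,  # use 50\% of data for training, 50\% for testing
                     main = "Heart Disease (split)")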

# 7. Create a custom tree (from a verbal description) with my.tree:
custom.fft <- FFTrees(formula = diagnosis ~ .,
                      data = heartdisease,
                      my.tree = "If age < 50, predict False.
                                 If sex = 1, predict True.
                                 If chol > 300, predict True, otherwise predict False.",
                      main = "My custom FFT")

# Plot the (pretty bad) custom tree:
plot(custom.fft)
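
# 8. Optimize for outcome costs rather than accuracy
# (an illustrative sketch; the cost values below are assumptions, not canonical settings):
cost.fft <- FFTrees(formula = diagnosis ~ .,
                    data = heartdisease,
                    goal = "cost",
                    cost.outcomes = list("hi" = 0, "fa" = 10, "mi" = 20, "cr" = 0),
                    main = "Heart Disease (costs)")

# 9. Optimize a custom goal defined via my.goal and my.goal.fun
# (the goal name "my_acc" and its definition are illustrative assumptions;
#  this function re-implements overall accuracy from the 4 outcome counts):
myacc.fft <- FFTrees(formula = diagnosis ~ .,
                     data = heartdisease,
                     my.goal = "my_acc",
                     my.goal.fun = function(hi, fa, mi, cr){(hi + cr) / (hi + fa + mi + cr)},
                     main = "Heart Disease (my_acc)")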
}
\seealso{
\code{\link{print.FFTrees}} for printing FFTs;
\code{\link{plot.FFTrees}} for plotting FFTs;
\code{\link{summary.FFTrees}} for summarizing FFTs;
\code{\link{inwords}} for obtaining a verbal description of FFTs;
\code{\link{showcues}} for plotting cue accuracies.
}