% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/FFTrees.R, R/util_abc.R
\docType{package}
\name{FFTrees}
\alias{FFTrees}
\alias{FFTrees-package}
\title{Main function to create and apply fast-and-frugal trees (FFTs)}
\usage{
FFTrees(
  formula = NULL,
  data = NULL,
  data.test = NULL,
  algorithm = "ifan",
  train.p = 1,
  goal = NULL,
  goal.chase = NULL,
  goal.threshold = NULL,
  max.levels = NULL,
  numthresh.method = "o",
  numthresh.n = 10,
  repeat.cues = TRUE,
  stopping.rule = "exemplars",
  stopping.par = 0.1,
  sens.w = 0.5,
  cost.outcomes = NULL,
  cost.cues = NULL,
  main = NULL,
  decision.labels = c("False", "True"),
  my.goal = NULL,
  my.goal.fun = NULL,
  my.tree = NULL,
  object = NULL,
  tree.definitions = NULL,
  do.comp = TRUE,
  do.cart = TRUE,
  do.lr = TRUE,
  do.rf = TRUE,
  do.svm = TRUE,
  quiet = list(ini = TRUE, fin = FALSE, mis = FALSE, set = TRUE),
  comp = NULL,
  force = NULL,
  rank.method = NULL,
  rounding = NULL,
  store.data = NULL,
  verbose = NULL
)
}
\arguments{
\item{formula}{A formula. A \code{\link{formula}} specifying a binary criterion variable (as logical) as a function of 1 or more predictor variables (cues).}
\item{data}{A data frame. A dataset used for training (fitting) FFTs and alternative algorithms.
\code{data} must contain the binary criterion variable specified in \code{formula} and potential predictors (which can be categorical or numeric variables).}
\item{data.test}{A data frame. An optional dataset used for model testing (prediction), with the same structure as \code{data}.}
\item{algorithm}{A character string. The algorithm used to create FFTs. Can be \code{'ifan'} or \code{'dfan'}.}
\item{train.p}{numeric. The proportion of \code{data} to use for training when \code{data.test} is not specified.
For example, \code{train.p = .50} randomly splits \code{data} into a 50\% training set and a 50\% test set.
Default: \code{train.p = 1} (i.e., using \emph{all} data for training).}
\item{goal}{A character string indicating the statistic to maximize when \emph{selecting trees}:
\code{"acc"} = overall accuracy, \code{"bacc"} = balanced accuracy, \code{"wacc"} = weighted accuracy,
\code{"dprime"} = discriminability, \code{"cost"} = costs (based on \code{cost.outcomes} and \code{cost.cues}).}
\item{goal.chase}{A character string indicating the statistic to maximize when \emph{constructing trees}:
\code{"acc"} = overall accuracy, \code{"bacc"} = balanced accuracy, \code{"wacc"} = weighted accuracy,
\code{"dprime"} = discriminability, \code{"cost"} = costs (based on \code{cost.outcomes} and \code{cost.cues}).}
\item{goal.threshold}{A character string indicating the criterion to maximize when \emph{optimizing cue thresholds}:
\code{"acc"} = overall accuracy, \code{"bacc"} = balanced accuracy, \code{"wacc"} = weighted accuracy,
\code{"dprime"} = discriminability, \code{"cost"} = costs (based only on \code{cost.outcomes}, as \code{cost.cues} are constant per cue).
All default goals are set in \code{\link{fftrees_create}}.}
\item{max.levels}{integer. The maximum number of nodes (or levels) considered for an FFT.
As all combinations of possible exit structures are considered, larger values of \code{max.levels} will create larger sets of FFTs.}
\item{numthresh.method}{How should thresholds for numeric cues be determined (as character)?
\code{"o"} will optimize thresholds (for \code{goal.threshold}), while \code{"m"} will use the median.
Default: \code{numthresh.method = "o"}.}
\item{numthresh.n}{The number of numeric thresholds to try (as integer).
Default: \code{numthresh.n = 10}.}
\item{repeat.cues}{May cues occur multiple times within a tree (as logical)?
Default: \code{repeat.cues = TRUE}.}
\item{stopping.rule}{A character string indicating the method to stop growing trees.
Available options are:
\itemize{
\item{\code{"exemplars"}: A tree grows until only a small proportion of unclassified exemplars remain;}
\item{\code{"levels"}: A tree grows until a certain level is reached;}
\item{\code{"statdelta"}: A tree grows until the change in the criterion statistic \code{goal.chase} exceeds some threshold level.
(This setting is currently experimental and includes the first level beyond threshold.
As tree statistics can be non-monotonic, this option may yield inconsistent results.)}
}
All stopping methods use \code{stopping.par} to set a numeric threshold value.
Default: \code{stopping.rule = "exemplars"}.}
\item{stopping.par}{numeric. A numeric parameter indicating the criterion value for the current \code{stopping.rule}.
For \code{stopping.rule = "levels"}, this is the number of desired levels (as an integer);
e.g., \code{stopping.rule = "levels"} with \code{stopping.par = 4} stops tree growth at 4 levels.
For \code{stopping.rule = "exemplars"}, this is the smallest proportion of exemplars allowed in the last level.
For \code{stopping.rule = "statdelta"}, this is the minimum required change (in the \code{goal.chase} value) to include a level.
Default: \code{stopping.par = .10}.}
\item{sens.w}{A numeric value from \code{0} to \code{1} indicating how to weight
sensitivity relative to specificity when optimizing \emph{weighted} accuracy (e.g., \code{goal = 'wacc'}).
Default: \code{sens.w = .50} (i.e., \code{wacc} corresponds to \code{bacc}).}
\item{cost.outcomes}{A list of length 4 specifying the cost value for each of the 4 possible classification outcomes.
The list elements must be named \code{'hi'}, \code{'fa'}, \code{'mi'}, and \code{'cr'}
(for specifying the costs of a hit, false alarm, miss, and correct rejection, respectively) and provide a numeric cost value.
E.g., \code{cost.outcomes = list("hi" = 0, "fa" = 10, "mi" = 20, "cr" = 0)} imposes false alarm and miss costs of \code{10} and \code{20} units, respectively, while correct decisions incur no costs
(see the Examples section for a cost-based sketch).}
\item{cost.cues}{A list containing the cost of each cue (in some common unit).
Each list element must have a name corresponding to a cue (i.e., a variable in \code{data}) and provide a single (positive numeric) cost value;
e.g., \code{cost.cues = list("age" = 1, "chol" = 2)} (assuming cues \code{age} and \code{chol} in \code{data}).
Cues in \code{data} that are not present in \code{cost.cues} are assumed to have no costs (i.e., a cost value of \code{0}).}
\item{main}{string. An optional label for the dataset. Passed on to other functions, like \code{\link{plot.FFTrees}} and \code{\link{print.FFTrees}}.}
\item{decision.labels}{A vector of strings of length 2 for the text labels for negative and positive decision/prediction outcomes
(i.e., left vs. right, noise vs. signal, 0 vs. 1, respectively, as character).
E.g., \code{decision.labels = c("Healthy", "Diseased")}.}
\item{my.goal}{The name of an optimization measure defined by \code{my.goal.fun} (as a character string).
Example: \code{my.goal = "my_acc"} (see \code{my.goal.fun} for corresponding function).
Default: \code{my.goal = NULL}.}
\item{my.goal.fun}{The definition of an outcome measure to optimize, defined as a function
of the frequency counts of the 4 basic classification outcomes \code{hi, fa, mi, cr}
(i.e., an R function with 4 arguments \code{hi, fa, mi, cr}).
Example: \code{my.goal.fun = function(hi, fa, mi, cr){(hi + cr)/(hi + fa + mi + cr)}} (i.e., accuracy).
Default: \code{my.goal.fun = NULL}.}
\item{my.tree}{A verbal description of an FFT, i.e., an "FFT in words" (as character string).
For example, \code{my.tree = "If age > 20, predict TRUE. If sex = {m}, predict FALSE. Otherwise, predict TRUE."}.}
\item{object}{An optional existing \code{FFTrees} object.
When specified, no new FFTs are fitted, but existing trees are applied to \code{data} and \code{data.test}.
When \code{formula}, \code{data} or \code{data.test} are not specified, the current values of \code{object} are used.}
\item{tree.definitions}{An optional \code{data.frame} of hard-coded FFT definitions (in the format of \code{x$trees$definitions} of an \code{FFTrees} object \code{x}).
If specified, no new FFTs are fitted (i.e., \code{algorithm} and functions for evaluating cues and creating FFTs are skipped).
Instead, the tree definitions provided are used to re-evaluate the current \code{FFTrees} object on current data.}
\item{do.comp, do.lr, do.cart, do.svm, do.rf}{Should alternative algorithms be used for comparison (as logical)?
All options are set to \code{TRUE} by default. Available options correspond to:
\itemize{
\item{\code{do.lr}: Logistic regression (LR, using \code{\link{glm}} from \strong{stats} with \code{family = "binomial"});}
\item{\code{do.cart}: Classification and regression trees (CART, using \code{rpart} from \strong{rpart});}
\item{\code{do.svm}: Support vector machines (SVM, using \code{svm} from \strong{e1071});}
\item{\code{do.rf}: Random forests (RF, using \code{randomForest} from \strong{randomForest}).}
}
Specifying \code{do.comp = FALSE} sets all available options to \code{FALSE}.}
\item{quiet}{A list of 4 logical elements: Should detailed progress reports be suppressed?
Setting list elements to \code{FALSE} is helpful when diagnosing errors.
Default: \code{quiet = list(ini = TRUE, fin = FALSE, mis = FALSE, set = TRUE)},
for suppressing reports on initial steps, final steps, missing cases, and parameter settings, respectively.
Providing a single logical value sets all four elements to \code{TRUE} or \code{FALSE};
e.g., \code{quiet = FALSE} enables all progress reports.}
\item{comp, force, rank.method, rounding, store.data, verbose}{Deprecated arguments (unused or replaced, to be retired in future releases).}
}
\value{
An \code{FFTrees} object with the following elements:
\describe{
\item{criterion_name}{The name of the binary criterion variable (as character).}
\item{cue_names}{The names of all potential predictor variables (cues) in the data (as character).}
\item{formula}{The \code{\link{formula}} specified when creating the FFTs.}
\item{trees}{A list of FFTs created, with further details contained in \code{n}, \code{best}, \code{definitions}, \code{inwords}, \code{stats}, \code{level_stats}, and \code{decisions}.}
\item{data}{The original training and test data (if available).}
\item{params}{A list of defined control parameters (e.g., \code{algorithm}, \code{goal}, \code{sens.w}, as well as various threshold, stopping-rule, and cost parameters).}
\item{competition}{Models and classification statistics for competitive classification algorithms:
Logistic regression (\code{lr}), classification and regression trees (\code{cart}), random forests (\code{rf}), and support vector machines (\code{svm}).}
\item{cues}{A list of cue information, with further details contained in \code{thresholds} and \code{stats}.}
}
}
\description{
\code{FFTrees} is the workhorse function of the \strong{FFTrees} package for creating fast-and-frugal trees (FFTs).
FFTs are decision algorithms for solving binary classification tasks, i.e., they predict the values of a binary criterion variable based on one or more predictor variables (cues).
Using \code{FFTrees} on \code{data} usually generates a range of FFTs and corresponding summary statistics (as an \code{FFTrees} object)
that can then be printed, plotted, and examined further.
The criterion and predictor variables are specified in \code{\link{formula}} notation.
Based on the settings of \code{data} and \code{data.test}, FFTs are trained on a (required) training dataset
(given the current \code{goal} settings) and evaluated on (i.e., used to predict) an (optional) test dataset.
If an existing \code{FFTrees} object \code{object} or \code{tree.definitions} are provided as inputs,
no new FFTs are created:
When both arguments are provided, \code{tree.definitions} takes priority over the FFTs in the existing \code{object}.
Specifically,
\itemize{
\item{If \code{tree.definitions} are provided, these are assigned to the FFTs of the resulting object.}
\item{If no \code{tree.definitions} are provided, but an existing \code{FFTrees} object \code{object} is,
the trees from \code{object} are assigned to the FFTs of the resulting object.}
}
}
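\details{
An existing \code{FFTrees} object can be re-applied to data without fitting new trees.
A minimal sketch (assuming the \code{heart.fft} object created in the Examples below):
\preformatted{
# Apply the FFTs of an existing object to (new) data; no new trees are fitted:
refit.fft <- FFTrees(object = heart.fft, data = heartdisease)

# Alternatively, re-evaluate hard-coded tree definitions on data:
redef.fft <- FFTrees(formula = diagnosis ~ .,
                     data = heartdisease,
                     tree.definitions = heart.fft$trees$definitions)
}
}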
\examples{
# 1. Create fast-and-frugal trees (FFTs) for heart disease:
heart.fft <- FFTrees(formula = diagnosis ~ .,
                     data = heart.train,
                     data.test = heart.test,
                     main = "Heart Disease",
                     decision.labels = c("Healthy", "Diseased"))

# 2. Print a summary of the result:
heart.fft  # same as:
# print(heart.fft, data = "train", tree = "best.train")

# 3. Plot an FFT applied to training data:
plot(heart.fft)  # same as:
# plot(heart.fft, what = "all", data = "train", tree = "best.train")

# 4. Apply FFTs to (new) test data:
plot(heart.fft, data = "test")            # predict with Tree 1
plot(heart.fft, data = "test", tree = 2)  # predict with Tree 2

# 5. Predict classes and probabilities for new data:
predict(heart.fft, newdata = heartdisease)
predict(heart.fft, newdata = heartdisease, type = "prob")
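
# 6. Split data into training and test subsets via train.p
# (an illustrative sketch; the 50/50 split is random, so a seed makes it reproducible):
set.seed(100)  # for reproducible sampling
split.fft <- FFTrees(formula = diagnosis ~ .,
                     data = heartdisease,
                     train.p = .50,  # use 50\% of data for training, 50\% for testing
                     main = "Heart Disease (split)")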

# 7. Create a custom tree (from a verbal description) with my.tree:
custom.fft <- FFTrees(formula = diagnosis ~ .,
                      data = heartdisease,
                      my.tree = "If age < 50, predict False.
                                 If sex = 1, predict True.
                                 If chol > 300, predict True, otherwise predict False.",
                      main = "My custom FFT")

# Plot the (pretty bad) custom tree:
plot(custom.fft)
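
# 8. Optimize for outcome costs rather than accuracy
# (an illustrative sketch; the cost values below are assumptions, not canonical settings):
cost.fft <- FFTrees(formula = diagnosis ~ .,
                    data = heartdisease,
                    goal = "cost",
                    cost.outcomes = list("hi" = 0, "fa" = 10, "mi" = 20, "cr" = 0),
                    main = "Heart Disease (costs)")

# 9. Optimize a custom goal defined via my.goal and my.goal.fun
# (the goal name "my_acc" and its definition are illustrative assumptions;
#  this function re-implements overall accuracy from the 4 outcome counts):
myacc.fft <- FFTrees(formula = diagnosis ~ .,
                     data = heartdisease,
                     my.goal = "my_acc",
                     my.goal.fun = function(hi, fa, mi, cr){(hi + cr) / (hi + fa + mi + cr)},
                     main = "Heart Disease (my_acc)")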
}
\seealso{
\code{\link{print.FFTrees}} for printing FFTs;
\code{\link{plot.FFTrees}} for plotting FFTs;
\code{\link{summary.FFTrees}} for summarizing FFTs;
\code{\link{inwords}} for obtaining a verbal description of FFTs;
\code{\link{showcues}} for plotting cue accuracies.
}