Skip to content

Commit

Permalink
Merge pull request #187 from hneth/master
Browse files Browse the repository at this point in the history
Handle numeric NA values
  • Loading branch information
hneth committed Mar 22, 2023
2 parents 2aa27e1 + 2c6246e commit 8d69cb1
Show file tree
Hide file tree
Showing 20 changed files with 241 additions and 81 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: FFTrees
Type: Package
Title: Generate, Visualise, and Evaluate Fast-and-Frugal Decision Trees
Version: 1.9.0.9020
Date: 2023-03-12
Version: 1.9.0.9022
Date: 2023-03-22
Authors@R: c(person("Nathaniel", "Phillips", role = c("aut"), email = "Nathaniel.D.Phillips.is@gmail.com", comment = c(ORCID = "0000-0002-8969-7013")),
person("Hansjoerg", "Neth", role = c("aut", "cre"), email = "h.neth@uni.kn", comment = c(ORCID = "0000-0001-5427-3141")),
person("Jan", "Woike", role = "aut", comment = c(ORCID = "0000-0002-6816-121X")),
Expand Down
6 changes: 3 additions & 3 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

# FFTrees 1.9

## 1.9.0.9018
## 1.9.0.9022

This is the current development version of **FFTrees**, available at <https://github.com/ndphillips/FFTrees>.

Expand Down Expand Up @@ -49,7 +49,7 @@ Changes since last release:
- Added progress bar of **cli** package (removing dependency on **progress**).
- Added `exit_types` as global constant.
- Improved data cleaning (consistent for training and test data).
- Revised documentation and vignettes.
- Revised documentation, vignettes, and tests.


<!-- Development version: -->
Expand Down Expand Up @@ -462,6 +462,6 @@ Thus, the main tree building function is now `FFTrees()` and the new tree object

------

[File `NEWS.md` last updated on 2023-03-10.]
[File `NEWS.md` last updated on 2023-03-22.]

<!-- eof. -->
8 changes: 4 additions & 4 deletions R/fftrees_apply.R
Original file line number Diff line number Diff line change
Expand Up @@ -317,11 +317,11 @@ fftrees_apply <- function(x,
threshold_i <- threshold_v[level_i]

# Current cue values from data (as df):
cue_values <- data[[cue_i]]
cue_values <- as.vector(data[[cue_i]]) # as.vector() turns "matrix" "array" into (numeric) vector
cur_class <- substr(class(cue_values), 1, 1)

cur_class <- substr(class(cue_values), 1, 1)

# print(paste0("class_i = ", class_i, "; cur_class = ", cur_class)) # 4debugging
# print(paste0("class_i = ", class_i)) # 4debugging
# print(paste0("cur_class = ", cur_class))

if (cur_class != class_i){
warning(paste0("Mismatch: class_i = ", class_i, "; cur_class = ", cur_class))
Expand Down
16 changes: 14 additions & 2 deletions R/fftrees_cuerank.R
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,22 @@ fftrees_cuerank <- function(x = NULL,

# Get key information of the current cue:
cue_i_name <- names(cue_df)[cue_i]
cue_i_class <- class(cue_df %>% dplyr::pull(cue_i))
cue_i_v <- unlist(cue_df[, cue_i])
cue_i_class <- class(as.vector(cue_df %>% dplyr::pull(cue_i))) # dplyr/tidyverse
# cue_i_class <- class(as.vector(cue_df[[cue_i_name]])) # base R
cue_i_v <- unlist(cue_df[ , cue_i])
cue_i_cost <- x$params$cost.cues[[cue_i_name]]

# Problem: cue_i_class can be c("matrix", "array")
# print(cue_i_class) # 4debugging: 1. before
#
# if ("matrix" %in% cue_i_class){
#
# cue_i_class <- cue_class_of_matrix(cue_i_v, cue_i_class)
# print(cue_i_class) # 4debugging: 2. after
#
# }
#
# FIXED: Added as.vector() when determining cue_i_class() above.

if (all(is.na(cue_i_v)) == FALSE) { # (A) Some non-missing values:

Expand Down
3 changes: 2 additions & 1 deletion R/util_const.R
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,8 @@ fin_NA_options <- c("noise", "signal", "baseline", "majority") # (global consta

# Provide additional details (as debugging feedback):

debug <- TRUE # FALSE # (global constant)
debug <- FALSE # (global constant)



# The following are now obsolete, as handled by the FFTrees() argument:
Expand Down
149 changes: 128 additions & 21 deletions R/util_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ handle_NA_data <- function(data, criterion_name, mydata, quiet){
testthat::expect_true(is.list(quiet))


# Identify roles & NA data types: ----
# Identify columns/variables with NA values (by roles & data types): ----

# NA values (in data):
nr_NA <- colSums(is.na(data)) # Note: per column in data
Expand Down Expand Up @@ -268,65 +268,85 @@ handle_NA_data <- function(data, criterion_name, mydata, quiet){
# dplyr::mutate_if(is.factor, addNA) %>%
# dplyr::mutate_if(is.character, addNA)

if (any(ix_pred_chr_NA)){ # NA values in character predictors: ----
if (any(ix_pred_chr_NA)){ # 1. NA values in character predictors: ----

# Replace NA values:
data[ix_pred_chr] <- data[ix_pred_chr] %>%
dplyr::mutate_if(is.character, addNA) # add NA as a new factor level

if (!quiet$mis) { # Provide user feedback:
cli::cli_alert_success("Converted {sum(nr_pred_chr_NA)} NA case{?s} in {sum(ix_pred_chr_NA)} character predictor{?s} to <NA>.")
cli::cli_alert_success("Converted {sum(nr_pred_chr_NA)} NA value{?s} in {sum(ix_pred_chr_NA)} character predictor{?s} to <NA>.")
}

}
} # 1. character NA.


if (any(ix_pred_fct_NA)){ # NA values in factor predictors: ----
if (any(ix_pred_fct_NA)){ # 2. NA values in factor predictors: ----

# Replace NA values:
data[ix_pred_fct] <- data[ix_pred_fct] %>%
dplyr::mutate_if(is.factor, addNA) # add NA as a new factor level

if (!quiet$mis) { # Provide user feedback:
cli::cli_alert_success("Converted {sum(nr_pred_fct_NA)} NA case{?s} in {sum(ix_pred_fct_NA)} factor predictor{?s} to <NA>.")
cli::cli_alert_success("Converted {sum(nr_pred_fct_NA)} NA value{?s} in {sum(ix_pred_fct_NA)} factor predictor{?s} to <NA>.")
}

}
} # 2. factor NA.


if (any(ix_pred_log_NA)){ # NA values in logical predictors: ----
if (any(ix_pred_log_NA)){ # 3. NA values in logical predictors: ----

# Replace NA values:
data[ix_pred_log] <- data[ix_pred_log] %>%
dplyr::mutate_if(is.logical, addNA) # add NA as a new factor level

if (!quiet$mis) { # Provide user feedback:
cli::cli_alert_success("Converted {sum(nr_pred_log_NA)} NA case{?s} in {sum(ix_pred_log_NA)} logical predictor{?s} to <NA>.")
cli::cli_alert_success("Converted {sum(nr_pred_log_NA)} NA value{?s} in {sum(ix_pred_log_NA)} logical predictor{?s} to <NA>.")
}

}

} # 3. logical NA.

# +++ here now +++


if (any(ix_pred_num_NA)){ # NA values in numeric predictors: ----
if (any(ix_pred_num_NA)){ # 4. NA values in numeric predictors: ----

# Keep NA values in numeric predictors (but remove in classtable() of 'util_stats.R').
# +++ here now +++ : OR: Allow to replace NA-values in numeric predictors by mean/median?

replace_num_NA <- TRUE # TRUE replaces NA in numeric predictors by their mean / FALSE keeps (but handles them later)

if (replace_num_NA){

# Replace NAs in numeric predictors:
data[ix_pred_num_NA] <- replace_NA_num(df = data[ix_pred_num_NA])

if (!quiet$mis) { # Provide user feedback:
cli::cli_alert_warning("Replaced {sum(nr_pred_num_NA)} NA value{?s} in {sum(ix_pred_num_NA)} numeric predictor{?s} of '{mydata}' data.")
}

} else {

# Do nothing / keep NA values.

if (!quiet$mis) { # Provide user feedback:
cli::cli_alert_warning("Keeping {sum(nr_pred_num_NA)} NA value{?s} in {sum(ix_pred_num_NA)} numeric predictor{?s}.")
}

if (!quiet$mis) { # Provide user feedback:
cli::cli_alert_warning("Keeping {sum(nr_pred_num_NA)} NA case{?s} in {sum(ix_pred_num_NA)} numeric predictor{?s}.")
}

}
} # 4. numeric NA.

if (any(ix_crit_NA)){ # NA values in criterion: ----


if (any(ix_crit_NA)){ # 5. NA values in criterion variable: ----

# ToDo: What to do about NA values in criterion?

if (!quiet$mis) { # Provide user feedback:
cli::cli_alert_warning("Keeping {sum(nr_crit_NA)} NA case{?s} in the criterion {nm_crit_NA}.")
cli::cli_alert_warning("Keeping {sum(nr_crit_NA)} NA value{?s} in the criterion {nm_crit_NA}.")
}

}
} # 5. criterion NA.


# print(data) # 4debugging
Expand All @@ -339,10 +359,97 @@ handle_NA_data <- function(data, criterion_name, mydata, quiet){
} # handle_NA_data().


# replace_NA_vec: ------

# Goal: Replace NA-values in a vector by mean() of existing values.
# df$x_1[is.na(df$x_1)] <- mean(df$x_1, na.rm = TRUE)

replace_NA_vec <- function(v){

# by data type:
if (is.numeric(v)){

v[is.na(v)] <- mean(v, na.rm = TRUE)

} else {

stop("Cannot handle data type of v")

}

return(v)

} # replace_NA_vec().

# # Check:
# v <- c(4, 2, NA, 9, 4)
# replace_NA_vec(v)



# replace_NA_num: ------

# Goal: Replace NA-values in all numeric variables (in df) by mean().

replace_NA_num <- function(df){

# Apply replace_NA_vec() ONLY to numeric columns of df:

ix_num <- sapply(X = df, FUN = is.numeric) # ix of numeric columns

df[ix_num] <- apply(X = df[ix_num], MARGIN = 2, FUN = replace_NA_vec) # replace

# Output:
return(df)

} # replace_NA_num().

# # Check:
# df <- data.frame(a_0 = letters[1:5],
# x_1 = c(4, 2, NA, 9, 4),
# x_2 = c(-2, -1, NA, 2, 1),
# x_3 = c(1, NA, 3, 4, 5))
#
# replace_NA_num(df)
# class(df$x_3)

# Note: See some tidyverse solutions at
# <https://www.codingprof.com/how-to-replace-nas-with-the-mean-in-r-examples/>



# cue_class_of_matrix: ------

# # Handle special case:
# # Numeric cues have been turned into class c("matrix", "array").
# # Goal: If data type is "double" or "integer", then set class to "numeric".
#
# cue_class_of_matrix <- function(cue, cue_class){
#
# if ("matrix" %in% cue_class){
#
# cue_type <- typeof(cue)
#
# if (cue_type %in% c("double", "integer")){
# return("numeric")
# } else {
# return(cue_type)
# }
#
# } else {
#
# return(cue_class) # unchanged
#
# }
#
# } # cue_class_of_matrix().

# Note: Obsolete/Fixed by adding as.vector() when determining cue class.


# ToDo: ------

# - Handle consequences of allowing NAs in numeric predictors.
# - Handle NAs in criterion variable.
# - Handle the consequences of allowing NAs in numeric predictors.
# - Handle NAs in the criterion variable.

# eof.
2 changes: 1 addition & 1 deletion README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,12 @@ url_JDM_doi <- "https://doi.org/10.1017/S1930297500006239"
[![R-CMD-check](https://github.com/ndphillips/FFTrees/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/ndphillips/FFTrees/actions/workflows/R-CMD-check.yaml)
<!-- Devel badges end. -->


<!-- Release badges start: -->
<!-- [![CRAN status](https://www.r-pkg.org/badges/version/FFTrees)](https://CRAN.R-project.org/package=FFTrees) -->
<!-- [![Total downloads](https://cranlogs.r-pkg.org/badges/grand-total/FFTrees?color='00a9e0')](https://www.r-pkg.org/pkg/FFTrees) -->
<!-- Release badges end. -->


<!-- ALL badges start: -->
<!-- [![CRAN status](https://www.r-pkg.org/badges/version/FFTrees)](https://CRAN.R-project.org/package=FFTrees) -->
<!-- [![Build Status](https://travis-ci.org/ndphillips/FFTrees.svg?branch=master)](https://travis-ci.org/ndphillips/FFTrees) -->
Expand Down
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<!-- README.md is generated from README.Rmd. Please only edit the .Rmd file! -->
<!-- Title, version and logo: -->

# FFTrees 1.9.0.9020 <img src = "./inst/FFTrees_Logo.jpg" align = "right" alt = "FFTrees" width = "225" />
# FFTrees 1.9.0.9022 <img src = "./inst/FFTrees_Logo.jpg" align = "right" alt = "FFTrees" width = "225" />

<!-- Devel badges start: -->

Expand Down Expand Up @@ -203,8 +203,8 @@ heart_fft$competition$test
#> 3 cart 153 50 19 23 61 0.685 0.762 0.238 0.725 0.726 0.725
#> 4 rf 153 59 8 14 72 0.808 0.9 0.1 0.881 0.837 0.856
#> 5 svm 153 55 7 18 73 0.753 0.912 0.0875 0.887 0.802 0.837
#> # … with 6 more variables: bacc <dbl>, wacc <dbl>, dprime <dbl>,
#> # cost_dec <dbl>, cost_cue <dbl>, cost <dbl>
#> # 6 more variables: bacc <dbl>, wacc <dbl>, dprime <dbl>, cost_dec <dbl>,
#> # cost_cue <dbl>, cost <dbl>

<!-- FFTs by verbal description: -->

Expand Down Expand Up @@ -333,6 +333,6 @@ Examples include:

------------------------------------------------------------------------

\[File `README.Rmd` last updated on 2023-03-12.\]
\[File `README.Rmd` last updated on 2023-03-22.\]

<!-- eof. -->
17 changes: 0 additions & 17 deletions tests/testthat/test-na_numeric_feature.R

This file was deleted.

15 changes: 0 additions & 15 deletions tests/testthat/test-plotFFTrees_function.R

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
context("Creating basic FFTrees objects")
context("Create basic FFTrees objects")

test_that("Can create FFTrees object", {

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
context("ifan vs dfan")
context("Check tree algorithms: ifan vs dfan")


test_that("Can create FFTrees object with dfan", {

Expand All @@ -7,6 +8,7 @@ test_that("Can create FFTrees object with dfan", {
expect_s3_class(object = object, class = "FFTrees")
})


test_that("Different results with ifan and dfan", {

trees_ifan <- FFTrees(diagnosis ~ .,
Expand Down

0 comments on commit 8d69cb1

Please sign in to comment.