SchlossLab · megancoden · Apr 7, 2023 · Apr 7, 2023 · Apr 7, 2023 · Apr 7, 2023
diff --git a/NEWS.md b/NEWS.md
@@ -8,6 +8,9 @@
     - New column `lower` and `upper` to report the bounds of the empirical 95% confidence interval from the permutation test.
       See `vignette('parallel')` for an example of plotting feature importance with confidence intervals.
 - Minor documentation improvements (#323, @kelly-sovacool).
+- Added option to impute missing data during training rather than preprocessing (#301, @megancoden and @shah-priyal).
+  - Added impute_in_training option to `run_ml()`, which defaults to FALSE.
+  - Added impute_in_preprocessing option to `preprocess()`, which defaults to TRUE.
 
 # mikropml 1.5.0
 

diff --git a/R/impute.R b/R/impute.R
@@ -0,0 +1,49 @@
+impute <- function(transformed_cont, n_missing) {
+  transformed_cont <- sapply_fn(transformed_cont, function(x) {
+    if (class(x) %in% c("integer", "numeric")) {
+      m <- is.na(x)
+      x[m] <- stats::median(x, na.rm = TRUE)
+    }
+    return(x)
+  }) %>% dplyr::as_tibble()
+  message(
+    paste0(
+      n_missing,
+      " missing continuous value(s) were imputed using the median value of the feature."
+    )
+  )
+  return (transformed_cont)
+}
+
+prep_data <- function(dataset, outcome_colname, prefilter_threshold, method, impute_in_preprocessing, to_numeric) {
+  dataset[[outcome_colname]] <- replace_spaces(dataset[[outcome_colname]])
+  dataset <- rm_missing_outcome(dataset, outcome_colname)
+  split_dat <- split_outcome_features(dataset, outcome_colname)
+
+  features <- split_dat$features
+  removed_feats <- character(0)
+  if (to_numeric) {
+    feats <- change_to_num(features) %>%
+      remove_singleton_columns(threshold = prefilter_threshold)
+    removed_feats <- feats$removed_feats
+    features <- feats$dat
+  }
+  pbtick(progbar)
+
+  nv_feats <- process_novar_feats(features, progbar = progbar)
+  pbtick(progbar)
+  split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar)
+  pbtick(progbar)
+  cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing)
+  pbtick(progbar)
+  # combine all processed features
+  processed_feats <- dplyr::bind_cols(
+    cont_feats$transformed_cont,
+    split_feats$cat_feats,
+    nv_feats$novar_feats
+  )
+  pbtick(progbar)
+
+  processed_data <- list(cont_feats = cont_feats, removed_feats = removed_feats, split_dat = split_dat, processed_feats = processed_feats)
+  return(processed_data)
+}
diff --git a/R/preprocess.R b/R/preprocess.R
@@ -1,3 +1,6 @@
+# TODO: set this for a generic path (probably using here::here)
+library(here)
+here("R", "impute.R")
 #' Preprocess data prior to running machine learning
 #'
 #' Function to preprocess your data for input into [run_ml()].
@@ -60,7 +63,7 @@ preprocess_data <- function(dataset, outcome_colname,
                             method = c("center", "scale"),
                             remove_var = "nzv", collapse_corr_feats = TRUE,
                             to_numeric = TRUE, group_neg_corr = TRUE,
-                            prefilter_threshold = 1) {
+                            prefilter_threshold = 1, impute_in_preprocessing = TRUE) {
   progbar <- NULL
   if (isTRUE(check_packages_installed("progressr"))) {
     progbar <- progressr::progressor(steps = 20, message = "preprocessing")
@@ -70,34 +73,15 @@ preprocess_data <- function(dataset, outcome_colname,
   check_outcome_column(dataset, outcome_colname, check_values = FALSE)
   check_remove_var(remove_var)
   pbtick(progbar)
-  dataset[[outcome_colname]] <- replace_spaces(dataset[[outcome_colname]])
-  dataset <- rm_missing_outcome(dataset, outcome_colname)
-  split_dat <- split_outcome_features(dataset, outcome_colname)
-
-  features <- split_dat$features
-  removed_feats <- character(0)
-  if (to_numeric) {
-    feats <- change_to_num(features) %>%
-      remove_singleton_columns(threshold = prefilter_threshold)
-    removed_feats <- feats$removed_feats
-    features <- feats$dat
-  }
-  pbtick(progbar)
-
-  nv_feats <- process_novar_feats(features, progbar = progbar)
-  pbtick(progbar)
-  split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar)
-  pbtick(progbar)
-  cont_feats <- process_cont_feats(split_feats$cont_feats, method)
-  pbtick(progbar)
-
+
+  processed_data <- prep_data(dataset, outcome_colname, prefilter_threshold, method, impute_in_preprocessing, to_numeric)
+  removed_feats <- processed_data$removed_feats
+  processed_feats <- processed_data$processed_feats
+  split_dat <- processed_data$split_dat
+  cont_feats <- processed_data$cont_feats
+
   # combine all processed features
-  processed_feats <- dplyr::bind_cols(
-    cont_feats$transformed_cont,
-    split_feats$cat_feats,
-    nv_feats$novar_feats
-  )
-  pbtick(progbar)
+
 
   # remove features with (near-)zero variance
   feats <- get_caret_processed_df(processed_feats, remove_var)
@@ -364,7 +348,7 @@ process_cat_feats <- function(features, progbar = NULL) {
 #'
 #' @examples
 #' process_cont_feats(mikropml::otu_small[, 2:ncol(otu_small)], c("center", "scale"))
-process_cont_feats <- function(features, method) {
+process_cont_feats <- function(features, method, impute_in_preprocessing) {
   transformed_cont <- NULL
   removed_cont <- NULL
 
@@ -388,19 +372,10 @@ process_cont_feats <- function(features, method) {
       n_missing <- sum(missing)
       if (n_missing > 0) {
         # impute missing data using the median value
-        transformed_cont <- sapply_fn(transformed_cont, function(x) {
-          if (class(x) %in% c("integer", "numeric")) {
-            m <- is.na(x)
-            x[m] <- stats::median(x, na.rm = TRUE)
-          }
-          return(x)
-        }) %>% dplyr::as_tibble()
-        message(
-          paste0(
-            n_missing,
-            " missing continuous value(s) were imputed using the median value of the feature."
-          )
-        )
+        if (impute_in_preprocessing) {
+          source("impute.R")
+          transformed_cont <- impute(transformed_cont, n_missing)
+        }
       }
     }
   }

diff --git a/R/run_ml.R b/R/run_ml.R
@@ -1,3 +1,5 @@
+# TODO: test if calling these functions works
+# TODO: figure out if there's a way to only specify option in one place (for runml and for preprocess)
 #' Run the machine learning pipeline
 #'
 #' This function splits the data set into a train & test set,
@@ -144,6 +146,7 @@ run_ml <-
            group_partitions = NULL,
            corr_thresh = 1,
            seed = NA,
+           impute_in_training = FALSE,
            ...) {
     check_all(
       dataset,
@@ -162,7 +165,7 @@ run_ml <-
     if (!is.na(seed)) {
       set.seed(seed)
     }
-
+    
     # `future.apply` is required for `find_feature_importance()`.
     # check it here to adhere to the fail fast principle.
     if (find_feature_importance) {
@@ -173,20 +176,20 @@ run_ml <-
     if (find_feature_importance) {
       check_cat_feats(dataset %>% dplyr::select(-outcome_colname))
     }
-
+    
     dataset <- dataset %>%
       randomize_feature_order(outcome_colname) %>%
       # convert tibble to dataframe to silence warning from caret::train():
       # "Warning: Setting row names on a tibble is deprecated.."
       as.data.frame()
-
+    
     outcomes_vctr <- dataset %>% dplyr::pull(outcome_colname)
-
+    
     if (length(training_frac) == 1) {
       training_inds <- get_partition_indices(outcomes_vctr,
-        training_frac = training_frac,
-        groups = groups,
-        group_partitions = group_partitions
+                                             training_frac = training_frac,
+                                             groups = groups,
+                                             group_partitions = group_partitions
       )
     } else {
       training_inds <- training_frac
@@ -201,30 +204,43 @@ run_ml <-
     }
     check_training_frac(training_frac)
     check_training_indices(training_inds, dataset)
-
+    
     train_data <- dataset[training_inds, ]
     test_data <- dataset[-training_inds, ]
+    if (impute_in_training == TRUE) {
+
+      train_processed_data <- prep_data(train_data, outcome_colname, prefilter_threshold=1, method=c("center", "scale"), impute_in_preprocessing=TRUE, to_numeric=TRUE)
+      train_processed_feats <- train_processed_data$processed_feats
+      split_dat <- train_processed_data$split_dat
+      train_data <- dplyr::bind_cols(split_dat$outcome, train_processed_feats) %>%
+        dplyr::as_tibble()
+      test_processed_data <- prep_data(test_data, outcome_colname, prefilter_threshold=1, method=c("center", "scale"), impute_in_preprocessing=TRUE, to_numeric=TRUE)
+      test_processed_feats <- test_processed_data$processed_feats
+      split_dat <- test_processed_data$split_dat
+      test_data <- dplyr::bind_cols(split_dat$outcome, test_processed_feats) %>%
+        dplyr::as_tibble()
+    }
     # train_groups & test_groups will be NULL if groups is NULL
     train_groups <- groups[training_inds]
     test_groups <- groups[-training_inds]
-
+    
     if (is.null(hyperparameters)) {
       hyperparameters <- get_hyperparams_list(dataset, method)
     }
     tune_grid <- get_tuning_grid(hyperparameters, method)
-
-
+    
+    
     outcome_type <- get_outcome_type(outcomes_vctr)
     class_probs <- outcome_type != "continuous"
-
+    
     if (is.null(perf_metric_function)) {
       perf_metric_function <- get_perf_metric_fn(outcome_type)
     }
-
+    
     if (is.null(perf_metric_name)) {
       perf_metric_name <- get_perf_metric_name(outcome_type)
     }
-
+    
     if (is.null(cross_val)) {
       cross_val <- define_cv(
         train_data,
@@ -238,8 +254,8 @@ run_ml <-
         group_partitions = group_partitions
       )
     }
-
-
+    
+    
     message("Training the model...")
     trained_model_caret <- train_model(
       train_data = train_data,
@@ -254,7 +270,7 @@ run_ml <-
     if (!is.na(seed)) {
       set.seed(seed)
     }
-
+    
     if (calculate_performance) {
       performance_tbl <- get_performance_tbl(
         trained_model_caret,
@@ -269,7 +285,7 @@ run_ml <-
     } else {
       performance_tbl <- "Skipped calculating performance"
     }
-
+    
     if (find_feature_importance) {
       message("Finding feature importance...")
       feature_importance_tbl <- get_feature_importance(
@@ -287,7 +303,7 @@ run_ml <-
     } else {
       feature_importance_tbl <- "Skipped feature importance"
     }
-
+    
     return(
       list(
         trained_model = trained_model_caret,