Merge pull request #630 from cmu-delphi/modeltools

Merge `modeltools` into `main`
cmu-delphi · Jun 2, 2023 · 4431561 · 4431561
2 parents 54785d8 + 1d9e25e
commit 4431561
Show file tree

Hide file tree

Showing 22 changed files with 1,102 additions and 39 deletions.
diff --git a/R-packages/modeltools/DESCRIPTION b/R-packages/modeltools/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: modeltools
 Type: Package
-Title: Tools for Building COVID Forecasters and Related Models 
+Title: Tools for Building COVID Forecasters and Related Models
 Version: 0.1.0
 Authors@R:
 	c(
@@ -15,15 +15,23 @@ License: MIT
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.1.1
+RoxygenNote: 7.2.3
 Remotes:
-    github::cmu-delphi/covidcast/R-packages/covidcast@main,
-    github::cmu-delphi/covidcast/R-packages/evalcast@main
-Imports: 
+    cmu-delphi/covidcast/R-packages/covidcast@main,
+    cmu-delphi/covidcast/R-packages/evalcast@evalcast,
+    ryantibs/quantgen/quantgen@master
+Imports:
+    assertthat,
     covidcast,
     dplyr,
-    evalcast,
     genlasso,
+    lubridate,
+    MMWRweek,
+    purrr,
+    quantgen,
     slider,
+    tibble,
     tidyselect,
     tidyr
+Suggests:
+    evalcast
diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE
@@ -7,15 +7,20 @@ export(Median)
 export(Min)
 export(Start)
 export(Sum)
+export(add_response_columns)
+export(create_train_and_predict_matrices)
 export(estimate_deriv)
+export(example_forecaster)
 export(pct_change)
 export(quantgen_forecaster)
 export(quiet)
 export(slide_by_geo)
+importFrom(assertthat,assert_that)
 importFrom(dplyr,"%>%")
 importFrom(dplyr,arrange)
 importFrom(dplyr,between)
 importFrom(dplyr,bind_cols)
+importFrom(dplyr,bind_rows)
 importFrom(dplyr,filter)
 importFrom(dplyr,group_by)
 importFrom(dplyr,group_modify)
@@ -27,10 +32,14 @@ importFrom(dplyr,ungroup)
 importFrom(genlasso,coef.genlasso)
 importFrom(genlasso,cv.trendfilter)
 importFrom(genlasso,trendfilter)
+importFrom(lubridate,as_date)
 importFrom(lubridate,days)
+importFrom(purrr,map2)
+importFrom(purrr,map_dbl)
 importFrom(stats,coef)
 importFrom(stats,lsfit)
 importFrom(stats,predict)
 importFrom(stats,smooth.spline)
+importFrom(tibble,tibble)
 importFrom(tidyr,drop_na)
 importFrom(tidyr,pivot_longer)
diff --git a/R-packages/modeltools/R/change.R b/R-packages/modeltools/R/change.R
@@ -19,6 +19,19 @@
 #' @return A data frame given by appending a new column to `x` named according
 #'   to the `col_name` argument, containing the percentage change values. 
 #'
+#' @examples \dontrun{
+#' df <- covidcast::covidcast_signal("fb-survey", "smoothed_cli", 
+#'     start_day = "2021-01-01", 
+#'     end_day = "2021-01-31",
+#'     geo_type = "state")
+#' 
+#' # percentage change between back-to-back weeks (default, as n = 14)
+#' pct_change(df)
+#' 
+#' # percentage change between back-to-back days
+#' pct_change(df, n = 2)
+#' }
+#'
 #' @export
 pct_change = function(x, n = 14, col_name = "pct_change") {
   # Check if n is odd and if so bump it up one

diff --git a/R-packages/modeltools/R/deriv.R b/R-packages/modeltools/R/deriv.R
@@ -74,6 +74,26 @@
 #' 
 #' @return A data frame given by appending a new column to `x` named according
 #'   to the `col_name` argument, containing the estimated derivative values. 
+#'   
+#' @examples \dontrun{
+#' df <- covidcast::covidcast_signal("fb-survey", "smoothed_cli", 
+#'     start_day = "2021-01-01", 
+#'     end_day = "2021-02-28",
+#'     geo_type = "state",
+#'     geo_values = c("ca", "fl"))
+#' 
+#' # estimate derivative using linear regression and n = 10 days
+#' estimate_deriv(df, method = "lin", n = 10)
+#' 
+#' # keep the linear regression fits
+#' estimate_deriv(df, method = "lin", n = 10, keep_obj = TRUE)
+#' 
+#' # estimate derivative using smoothing spline with 8 degrees of freedom
+#' estimate_deriv(df, method = "ss", n = 28, df = 8)
+#' 
+#' # estimate derivative using trend filtering with 8 degrees of freedom
+#' estimate_deriv(df, method = "tf", n = 28, df = 8)
+#' }
 #'
 #' @export
 estimate_deriv = function(x, method = c("lin", "ss", "tf"), n = 14,

diff --git a/R-packages/modeltools/R/example_forecaster.R b/R-packages/modeltools/R/example_forecaster.R
@@ -0,0 +1,56 @@
+#' Simple example forecaster that forecasts confirmed cases.
+#'
+#' @param df_list List of downloaded covidcast signals.
+#' @param forecast_date Date from which the forecast should be made.
+#' @param ahead Vector of epiweeks ahead to predict.
+#' @param training_window_size Number of days prior to `forecast_date` to use for training.
+#' @param lags Vector of lags or list of vectors of lags to apply to `df_list`.  See documentation
+#'     of `dt` argument to `covidcast::aggregate_signals()` for details.
+#' @param quantiles Vector of quantile values at which to predict
+#'
+#' preds <- evalcast::get_predictions(modeltools::example_forecaster,
+#'                                    "example",
+#'                                    tibble(
+#'                                      data_source = "jhu-csse",
+#'                                      signal = "confirmed_incidence_num",
+#'                                      geo_type = "state",
+#'                                      start_date = as.Date("2021-01-01"),
+#'                                    ),
+#'                                    as.Date("2021-03-01"),
+#'                                    "epiweek")
+#'
+#' @importFrom dplyr mutate bind_cols bind_rows
+#' @importFrom tidyr pivot_longer
+#'
+#' @export
+example_forecaster <- function(df_list,
+                               forecast_date,
+                               ahead = 1:4,
+                               training_window_size = 14,
+                               lags = seq(0, -1*training_window_size, -7),
+                               quantiles = modeltools::covidhub_probs) {
+    covariates <- covidcast::aggregate_signals(df_list, dt = lags, format = "wide")
+    covariates_with_response <- add_response_columns(covariates,
+                                                     "jhu-csse_confirmed_incidence_num",
+                                                     forecast_date,
+                                                     "epiweek",
+                                                     ahead)
+
+    results <- list()
+    for (a in ahead) {
+        mats <- create_train_and_predict_matrices(covariates_with_response, a, training_window_size)
+        model <- quantgen::quantile_lasso(mats$train_x, mats$train_y, quantiles, lambda = 0)
+        predictions <- quantgen:::predict.quantile_genlasso(model, mats$predict_x)
+
+        colnames(predictions) <- quantiles
+        predict_df <- bind_cols(geo_value = mats$predict_geo_values,
+                                predictions) %>%
+            pivot_longer(cols = -geo_value,
+                         names_to = "quantile",
+                         values_to = "value") %>%
+            mutate(ahead = a)
+
+        results[[a]] <- predict_df
+    }
+    return(bind_rows(results))
+}
diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R
@@ -0,0 +1,216 @@
+#' Create training and test data matrices and a training response for given aheads.
+#' 
+#' Create training and test data matrices and training response for a set of
+#' given aheads. Works for both single ahead values and a vector of ahead values.
+#' For multiple ahead values, the function has the ability to return separate
+#' data matrices and responses for each ahead, or a single data matrix and
+#' response matrix for all aheads at once.
+#'
+#' @param lagged_df Data frame of lagged data. It should have the following columns:
+#'     \itemize{
+#'     \item `geo_value`: Strings of geographic locations.
+#'     \item `time_value`: Dates of training data.
+#'     \item Covariate columns: Columns with names of the form `value-{days}:{signal}` or
+#'         `value+0:{signal}` whose values correspond to `{signal}` `{days}` before `time_value`.
+#'     \item Response columns: Columns with names of the form `response+{n}:{response}` whose values
+#'         correspond to `{response}` `{n}` incidence period units after `time_value`.
+#'     }
+#'     A data frame in this format can be made using `covidcast::aggregate_signals()` and
+#'     `modeltools::get_response_columns()`.
+#' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) 
+#'     ahead to forecast. Can be a single positive integer or a vector of 
+#'     positive integers. Note that for each `{a}` in `ahead`, the column 
+#'     `response+{a}:{response}` should be present in `lagged_df`.
+#' @param training_window_size Size of the local training window in days to use. For example, if
+#'     `training_window_size = 14`, then to make a 1-day-ahead forecast on December 15, we train on
+#'     data from December 1 to December 14.
+#' @param aheads_separate If `length(ahead) > 1`, should there be separate
+#'     data matrices and responses for each ahead? Default is `TRUE`.
+#'
+#' @return For a single ahead value, named list with entries:
+#'     \itemize{
+#'     \item `train_x`: Matrix of training data whose columns correspond to the
+#'         `value-{days}:{signal}` columns in `lagged_df`.  The training data consists of the
+#'         latest date with an non-null response, plus all data from the `training_window_size`
+#'         days prior to it.
+#'     \item `train_y`: Vector of response data from the `response+{ahead}:{response}` column of
+#'         `lagged_df` corresponding to the rows of `train_x`.
+#'     \item `train_geo_values`: Vector of geo values corresponding to the rows
+#'         of `train_x`.
+#'     \item `train_time_values`: Vector of time values corresponding to the rows
+#'         of `train_x`.
+#'     \item `train_end_date`: Latest `time_value` used in the training period.
+#'     \item `predict_x`: Matrix of prediction data in the same format as `train_x`.  The
+#'         prediction data contains the most recent `training_window_size` days.
+#'     \item `predict_geo_values`: Vector of `geo_values` corresponding to the rows of `predict_x`.
+#'     \item `predict_time_value`: Time value corresponding to `predict_x`.
+#'     }
+#'     For multiple ahead values and `aheads_separate = TRUE`, a list having
+#'     the same length as `ahead`, with each element being a named list as above.
+#'     For multiple ahead values and `ahead_separate = FALSE`, a named list as
+#'     above, except `train_y` is a matrix of responses rather than a vector.
+#'
+#' @examples \dontrun{
+#' create_train_and_predict_matrices(
+#'   tibble(
+#'     geo_value = rep(c("az", "wv"), 5),
+#'     time_value = rep(
+#'       as.Date(c("2021-01-25", "2021-01-26", "2021-01-27", "2021-01-28", "2021-01-29")),
+#'       each = 2),
+#'     `value-2:signal_1` = seq(-3, 6),
+#'     `value-1:signal_1` = seq(-1, 8),
+#'     `value+0:signal_1` = seq(1, 10),
+#'     `response+2:signal_1` = c(seq(5, 10), rep(NA, 4))
+#'   ),
+#'   ahead = 2,
+#'   training_window_size = 1)
+#' }
+#'
+#' @importFrom tibble tibble
+#' @importFrom assertthat assert_that
+#' @importFrom lubridate as_date
+#' @importFrom purrr map_dbl map2
+#'
+#' @export
+create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size,
+                                              aheads_separate = TRUE) {
+    # make sure the response columns are unique
+    for (a in ahead) {
+        responses_at_ahead <- lagged_df %>%
+            select(tidyselect::starts_with(sprintf("response+%i:", a))) %>%
+            ncol()
+        assert_that(responses_at_ahead == 1,
+                    msg = paste("multiple responses at ahead =", a))
+    }
+
+    # prediction matrices are the same for all aheads
+    predict_time_value <- max(lagged_df$time_value)
+    predict_x <- lagged_df %>%
+        filter(time_value == predict_time_value) %>%
+        select(tidyselect::starts_with("value")) %>%
+        as.matrix()
+    predict_geo_values <- lagged_df %>%
+        filter(time_value == predict_time_value) %>%
+        select(geo_value) %>% pull()
+
+    all_out <- list()
+
+    if (aheads_separate) {
+        train_end_dates <- ahead %>%
+            purrr::map_dbl(~ get_train_end_date(lagged_df, .x)) %>%
+            lubridate::as_date()
+
+        all_out <- purrr::map2(ahead, train_end_dates,
+                              ~ create_train_matrices(
+                                  lagged_df, .x, training_window_size, .y))
+    } else {  # ahead_separate = FALSE: We want a single training data matrix and 
+              # a matrix of training responses containing all the aheads
+        # Need to recompute training_window_size 
+        training_window_size <- training_window_size + max(ahead) - min(ahead)
+
+        # Find the last possible date of training data
+        # (corresponds to the smallest ahead)
+        train_end_date <- get_train_end_date(lagged_df, ahead)
+
+        # Training matrices
+        all_out[[1]] <- create_train_matrices(lagged_df, ahead, 
+                                              training_window_size,
+                                              train_end_date)
+    }
+
+    # Add prediction matrices / info
+    for (i in seq_along(all_out)) {
+        all_out[[i]]$predict_x <- predict_x
+        all_out[[i]]$predict_geo_values <- predict_geo_values
+        all_out[[i]]$predict_time_value <- predict_time_value
+    }
+
+    if (length(ahead) == 1 || aheads_separate == FALSE) {
+        return(all_out[[1]])
+    } else {
+        names(all_out) <- paste0("ahead+", ahead)
+        return(all_out)
+    }
+}
+
+#' Get last possible date of training data for given aheads.
+#' 
+#' Returns the last possible date of the training data for a given set of
+#' aheads. If more than one ahead is given, the date return corresponds to the
+#' last possible date corresponding to the smallest ahead value. This is because
+#' the smallest ahead value will have the latest possible date.
+#'
+#' @param lagged_df Data frame of lagged data as in `create_train_and_predict_matrices()`.
+#' @param ahead Number of days ahead to forecast. Can be a single positive 
+#'     integer or a vector of positive integers.
+#' 
+#' @return Single date corresponding to the last possible date of the training
+#' data.
+get_train_end_date <- function(lagged_df, ahead) {
+    response_end_date <- lagged_df %>%
+        select(time_value, 
+               tidyselect::starts_with(sprintf("response+%i:", min(ahead)))) %>%
+        tidyr::drop_na() %>%
+        summarize(max(time_value)) %>%
+        pull()
+    train_end_date <- min(max(lagged_df$time_value), response_end_date)
+    return(train_end_date)
+}
+
+#' Create training data matrix and a training response for given aheads.
+#' 
+#' Create training and data matrix and training response for a set of
+#' given aheads. Works for both single ahead values and a vector of ahead 
+#' values. However, note that this function works different from
+#' `create_train_and_predict_matrices()` for multiple ahead values. If
+#' multiple ahead values are supplied, we return a matrix of responses
+#' containing all aheads at once.
+#'
+#' @param lagged_df Data frame of lagged data as in `create_train_and_predict_matrices()`.
+#' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) 
+#'     ahead to forecast. Can be a single positive integer or a vector of 
+#'     positive integers. Note that for each `{a}` in `ahead`, the column 
+#'     `response+{a}:{response}` should be present in `lagged_df`.
+#' @param n_days Number of days worth of data to pull.
+#' @param train_end_date The last date to be included in the training data.
+#'
+#' @return A named list with entries:
+#'     \itemize{
+#'     \item `train_x`: Matrix of training data whose columns correspond to the
+#'         `value-{days}:{signal}` columns in `lagged_df`.  The training data consists of the
+#'         latest date with an non-null response, plus all data from the `training_window_size`
+#'         days prior to it.
+#'     \item `train_y`: Vector of response data from the `response+{ahead}:{response}` column of
+#'         `lagged_df` corresponding to the rows of `train_x`. If multiple ahead
+#'         values are provided, then this is a matrix instead.
+#'     \item `train_geo_values`: Vector of geo values corresponding to the rows
+#'         of `train_x`.
+#'     \item `train_time_values`: Vector of time values corresponding to the rows
+#'         of `train_x`.
+#'     \item `train_end_date`: Latest `time_value` used in the training period.
+#'     }
+create_train_matrices <- function(lagged_df, ahead, n_days,
+                                   train_end_date) {
+    train_df <- lagged_df %>%
+        filter(between(time_value,
+                       train_end_date - n_days + 1,
+                       train_end_date))
+    out <- list()
+    out$train_x <- train_df %>%
+        select(tidyselect::starts_with("value")) %>%
+        as.matrix()
+
+    train_y <- train_df %>%
+        select(tidyselect::starts_with(paste0("response+", ahead, ":")))
+    if (length(ahead) == 1) {
+        out$train_y <- pull(train_y)
+    } else {
+        out$train_y <- as.matrix(train_y)
+    }
+
+    out$train_geo_values  <- train_df$geo_value
+    out$train_time_values <- train_df$time_value
+    out$train_end_date    <- as.Date(train_end_date)
+
+    return(out)
+}