Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #630 from cmu-delphi/modeltools
Merge `modeltools` into `main`
- Loading branch information
Showing
22 changed files
with
1,102 additions
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
#' Simple example forecaster that forecasts confirmed cases. | ||
#' | ||
#' @param df_list List of downloaded covidcast signals. | ||
#' @param forecast_date Date from which the forecast should be made. | ||
#' @param ahead Vector of epiweeks ahead to predict. | ||
#' @param training_window_size Number of days prior to `forecast_date` to use for training. | ||
#' @param lags Vector of lags or list of vectors of lags to apply to `df_list`. See documentation | ||
#' of `dt` argument to `covidcast::aggregate_signals()` for details. | ||
#' @param quantiles Vector of quantile values at which to predict | ||
#' | ||
#' preds <- evalcast::get_predictions(modeltools::example_forecaster, | ||
#' "example", | ||
#' tibble( | ||
#' data_source = "jhu-csse", | ||
#' signal = "confirmed_incidence_num", | ||
#' geo_type = "state", | ||
#' start_date = as.Date("2021-01-01"), | ||
#' ), | ||
#' as.Date("2021-03-01"), | ||
#' "epiweek") | ||
#' | ||
#' @importFrom dplyr mutate bind_cols bind_rows | ||
#' @importFrom tidyr pivot_longer | ||
#' | ||
#' @export | ||
example_forecaster <- function(df_list, | ||
forecast_date, | ||
ahead = 1:4, | ||
training_window_size = 14, | ||
lags = seq(0, -1*training_window_size, -7), | ||
quantiles = modeltools::covidhub_probs) { | ||
covariates <- covidcast::aggregate_signals(df_list, dt = lags, format = "wide") | ||
covariates_with_response <- add_response_columns(covariates, | ||
"jhu-csse_confirmed_incidence_num", | ||
forecast_date, | ||
"epiweek", | ||
ahead) | ||
|
||
results <- list() | ||
for (a in ahead) { | ||
mats <- create_train_and_predict_matrices(covariates_with_response, a, training_window_size) | ||
model <- quantgen::quantile_lasso(mats$train_x, mats$train_y, quantiles, lambda = 0) | ||
predictions <- quantgen:::predict.quantile_genlasso(model, mats$predict_x) | ||
|
||
colnames(predictions) <- quantiles | ||
predict_df <- bind_cols(geo_value = mats$predict_geo_values, | ||
predictions) %>% | ||
pivot_longer(cols = -geo_value, | ||
names_to = "quantile", | ||
values_to = "value") %>% | ||
mutate(ahead = a) | ||
|
||
results[[a]] <- predict_df | ||
} | ||
return(bind_rows(results)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,216 @@ | ||
#' Create training and test data matrices and a training response for given aheads. | ||
#' | ||
#' Create training and test data matrices and training response for a set of | ||
#' given aheads. Works for both single ahead values and a vector of ahead values. | ||
#' For multiple ahead values, the function has the ability to return separate | ||
#' data matrices and responses for each ahead, or a single data matrix and | ||
#' response matrix for all aheads at once. | ||
#' | ||
#' @param lagged_df Data frame of lagged data. It should have the following columns: | ||
#' \itemize{ | ||
#' \item `geo_value`: Strings of geographic locations. | ||
#' \item `time_value`: Dates of training data. | ||
#' \item Covariate columns: Columns with names of the form `value-{days}:{signal}` or | ||
#' `value+0:{signal}` whose values correspond to `{signal}` `{days}` before `time_value`. | ||
#' \item Response columns: Columns with names of the form `response+{n}:{response}` whose values | ||
#' correspond to `{response}` `{n}` incidence period units after `time_value`. | ||
#' } | ||
#' A data frame in this format can be made using `covidcast::aggregate_signals()` and | ||
#' `modeltools::get_response_columns()`. | ||
#' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) | ||
#' ahead to forecast. Can be a single positive integer or a vector of | ||
#' positive integers. Note that for each `{a}` in `ahead`, the column | ||
#' `response+{a}:{response}` should be present in `lagged_df`. | ||
#' @param training_window_size Size of the local training window in days to use. For example, if | ||
#' `training_window_size = 14`, then to make a 1-day-ahead forecast on December 15, we train on | ||
#' data from December 1 to December 14. | ||
#' @param aheads_separate If `length(ahead) > 1`, should there be separate | ||
#' data matrices and responses for each ahead? Default is `TRUE`. | ||
#' | ||
#' @return For a single ahead value, named list with entries: | ||
#' \itemize{ | ||
#' \item `train_x`: Matrix of training data whose columns correspond to the | ||
#' `value-{days}:{signal}` columns in `lagged_df`. The training data consists of the | ||
#' latest date with an non-null response, plus all data from the `training_window_size` | ||
#' days prior to it. | ||
#' \item `train_y`: Vector of response data from the `response+{ahead}:{response}` column of | ||
#' `lagged_df` corresponding to the rows of `train_x`. | ||
#' \item `train_geo_values`: Vector of geo values corresponding to the rows | ||
#' of `train_x`. | ||
#' \item `train_time_values`: Vector of time values corresponding to the rows | ||
#' of `train_x`. | ||
#' \item `train_end_date`: Latest `time_value` used in the training period. | ||
#' \item `predict_x`: Matrix of prediction data in the same format as `train_x`. The | ||
#' prediction data contains the most recent `training_window_size` days. | ||
#' \item `predict_geo_values`: Vector of `geo_values` corresponding to the rows of `predict_x`. | ||
#' \item `predict_time_value`: Time value corresponding to `predict_x`. | ||
#' } | ||
#' For multiple ahead values and `aheads_separate = TRUE`, a list having | ||
#' the same length as `ahead`, with each element being a named list as above. | ||
#' For multiple ahead values and `ahead_separate = FALSE`, a named list as | ||
#' above, except `train_y` is a matrix of responses rather than a vector. | ||
#' | ||
#' @examples \dontrun{ | ||
#' create_train_and_predict_matrices( | ||
#' tibble( | ||
#' geo_value = rep(c("az", "wv"), 5), | ||
#' time_value = rep( | ||
#' as.Date(c("2021-01-25", "2021-01-26", "2021-01-27", "2021-01-28", "2021-01-29")), | ||
#' each = 2), | ||
#' `value-2:signal_1` = seq(-3, 6), | ||
#' `value-1:signal_1` = seq(-1, 8), | ||
#' `value+0:signal_1` = seq(1, 10), | ||
#' `response+2:signal_1` = c(seq(5, 10), rep(NA, 4)) | ||
#' ), | ||
#' ahead = 2, | ||
#' training_window_size = 1) | ||
#' } | ||
#' | ||
#' @importFrom tibble tibble | ||
#' @importFrom assertthat assert_that | ||
#' @importFrom lubridate as_date | ||
#' @importFrom purrr map_dbl map2 | ||
#' | ||
#' @export | ||
create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size, | ||
aheads_separate = TRUE) { | ||
# make sure the response columns are unique | ||
for (a in ahead) { | ||
responses_at_ahead <- lagged_df %>% | ||
select(tidyselect::starts_with(sprintf("response+%i:", a))) %>% | ||
ncol() | ||
assert_that(responses_at_ahead == 1, | ||
msg = paste("multiple responses at ahead =", a)) | ||
} | ||
|
||
# prediction matrices are the same for all aheads | ||
predict_time_value <- max(lagged_df$time_value) | ||
predict_x <- lagged_df %>% | ||
filter(time_value == predict_time_value) %>% | ||
select(tidyselect::starts_with("value")) %>% | ||
as.matrix() | ||
predict_geo_values <- lagged_df %>% | ||
filter(time_value == predict_time_value) %>% | ||
select(geo_value) %>% pull() | ||
|
||
all_out <- list() | ||
|
||
if (aheads_separate) { | ||
train_end_dates <- ahead %>% | ||
purrr::map_dbl(~ get_train_end_date(lagged_df, .x)) %>% | ||
lubridate::as_date() | ||
|
||
all_out <- purrr::map2(ahead, train_end_dates, | ||
~ create_train_matrices( | ||
lagged_df, .x, training_window_size, .y)) | ||
} else { # ahead_separate = FALSE: We want a single training data matrix and | ||
# a matrix of training responses containing all the aheads | ||
# Need to recompute training_window_size | ||
training_window_size <- training_window_size + max(ahead) - min(ahead) | ||
|
||
# Find the last possible date of training data | ||
# (corresponds to the smallest ahead) | ||
train_end_date <- get_train_end_date(lagged_df, ahead) | ||
|
||
# Training matrices | ||
all_out[[1]] <- create_train_matrices(lagged_df, ahead, | ||
training_window_size, | ||
train_end_date) | ||
} | ||
|
||
# Add prediction matrices / info | ||
for (i in seq_along(all_out)) { | ||
all_out[[i]]$predict_x <- predict_x | ||
all_out[[i]]$predict_geo_values <- predict_geo_values | ||
all_out[[i]]$predict_time_value <- predict_time_value | ||
} | ||
|
||
if (length(ahead) == 1 || aheads_separate == FALSE) { | ||
return(all_out[[1]]) | ||
} else { | ||
names(all_out) <- paste0("ahead+", ahead) | ||
return(all_out) | ||
} | ||
} | ||
|
||
#' Get last possible date of training data for given aheads. | ||
#' | ||
#' Returns the last possible date of the training data for a given set of | ||
#' aheads. If more than one ahead is given, the date return corresponds to the | ||
#' last possible date corresponding to the smallest ahead value. This is because | ||
#' the smallest ahead value will have the latest possible date. | ||
#' | ||
#' @param lagged_df Data frame of lagged data as in `create_train_and_predict_matrices()`. | ||
#' @param ahead Number of days ahead to forecast. Can be a single positive | ||
#' integer or a vector of positive integers. | ||
#' | ||
#' @return Single date corresponding to the last possible date of the training | ||
#' data. | ||
get_train_end_date <- function(lagged_df, ahead) { | ||
response_end_date <- lagged_df %>% | ||
select(time_value, | ||
tidyselect::starts_with(sprintf("response+%i:", min(ahead)))) %>% | ||
tidyr::drop_na() %>% | ||
summarize(max(time_value)) %>% | ||
pull() | ||
train_end_date <- min(max(lagged_df$time_value), response_end_date) | ||
return(train_end_date) | ||
} | ||
|
||
#' Create training data matrix and a training response for given aheads. | ||
#' | ||
#' Create training and data matrix and training response for a set of | ||
#' given aheads. Works for both single ahead values and a vector of ahead | ||
#' values. However, note that this function works different from | ||
#' `create_train_and_predict_matrices()` for multiple ahead values. If | ||
#' multiple ahead values are supplied, we return a matrix of responses | ||
#' containing all aheads at once. | ||
#' | ||
#' @param lagged_df Data frame of lagged data as in `create_train_and_predict_matrices()`. | ||
#' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) | ||
#' ahead to forecast. Can be a single positive integer or a vector of | ||
#' positive integers. Note that for each `{a}` in `ahead`, the column | ||
#' `response+{a}:{response}` should be present in `lagged_df`. | ||
#' @param n_days Number of days worth of data to pull. | ||
#' @param train_end_date The last date to be included in the training data. | ||
#' | ||
#' @return A named list with entries: | ||
#' \itemize{ | ||
#' \item `train_x`: Matrix of training data whose columns correspond to the | ||
#' `value-{days}:{signal}` columns in `lagged_df`. The training data consists of the | ||
#' latest date with an non-null response, plus all data from the `training_window_size` | ||
#' days prior to it. | ||
#' \item `train_y`: Vector of response data from the `response+{ahead}:{response}` column of | ||
#' `lagged_df` corresponding to the rows of `train_x`. If multiple ahead | ||
#' values are provided, then this is a matrix instead. | ||
#' \item `train_geo_values`: Vector of geo values corresponding to the rows | ||
#' of `train_x`. | ||
#' \item `train_time_values`: Vector of time values corresponding to the rows | ||
#' of `train_x`. | ||
#' \item `train_end_date`: Latest `time_value` used in the training period. | ||
#' } | ||
create_train_matrices <- function(lagged_df, ahead, n_days, | ||
train_end_date) { | ||
train_df <- lagged_df %>% | ||
filter(between(time_value, | ||
train_end_date - n_days + 1, | ||
train_end_date)) | ||
out <- list() | ||
out$train_x <- train_df %>% | ||
select(tidyselect::starts_with("value")) %>% | ||
as.matrix() | ||
|
||
train_y <- train_df %>% | ||
select(tidyselect::starts_with(paste0("response+", ahead, ":"))) | ||
if (length(ahead) == 1) { | ||
out$train_y <- pull(train_y) | ||
} else { | ||
out$train_y <- as.matrix(train_y) | ||
} | ||
|
||
out$train_geo_values <- train_df$geo_value | ||
out$train_time_values <- train_df$time_value | ||
out$train_end_date <- as.Date(train_end_date) | ||
|
||
return(out) | ||
} |
Oops, something went wrong.