Skip to content

Commit

Permalink
Merge pull request #630 from cmu-delphi/modeltools
Browse files Browse the repository at this point in the history
Merge `modeltools` into `main`
  • Loading branch information
dshemetov committed Jun 2, 2023
2 parents 54785d8 + 1d9e25e commit 4431561
Show file tree
Hide file tree
Showing 22 changed files with 1,102 additions and 39 deletions.
20 changes: 14 additions & 6 deletions R-packages/modeltools/DESCRIPTION
@@ -1,6 +1,6 @@
Package: modeltools
Type: Package
Title: Tools for Building COVID Forecasters and Related Models
Title: Tools for Building COVID Forecasters and Related Models
Version: 0.1.0
Authors@R:
c(
Expand All @@ -15,15 +15,23 @@ License: MIT
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
RoxygenNote: 7.2.3
Remotes:
github::cmu-delphi/covidcast/R-packages/covidcast@main,
github::cmu-delphi/covidcast/R-packages/evalcast@main
Imports:
cmu-delphi/covidcast/R-packages/covidcast@main,
cmu-delphi/covidcast/R-packages/evalcast@evalcast,
ryantibs/quantgen/quantgen@master
Imports:
assertthat,
covidcast,
dplyr,
evalcast,
genlasso,
lubridate,
MMWRweek,
purrr,
quantgen,
slider,
tibble,
tidyselect,
tidyr
Suggests:
evalcast
9 changes: 9 additions & 0 deletions R-packages/modeltools/NAMESPACE
Expand Up @@ -7,15 +7,20 @@ export(Median)
export(Min)
export(Start)
export(Sum)
export(add_response_columns)
export(create_train_and_predict_matrices)
export(estimate_deriv)
export(example_forecaster)
export(pct_change)
export(quantgen_forecaster)
export(quiet)
export(slide_by_geo)
importFrom(assertthat,assert_that)
importFrom(dplyr,"%>%")
importFrom(dplyr,arrange)
importFrom(dplyr,between)
importFrom(dplyr,bind_cols)
importFrom(dplyr,bind_rows)
importFrom(dplyr,filter)
importFrom(dplyr,group_by)
importFrom(dplyr,group_modify)
Expand All @@ -27,10 +32,14 @@ importFrom(dplyr,ungroup)
importFrom(genlasso,coef.genlasso)
importFrom(genlasso,cv.trendfilter)
importFrom(genlasso,trendfilter)
importFrom(lubridate,as_date)
importFrom(lubridate,days)
importFrom(purrr,map2)
importFrom(purrr,map_dbl)
importFrom(stats,coef)
importFrom(stats,lsfit)
importFrom(stats,predict)
importFrom(stats,smooth.spline)
importFrom(tibble,tibble)
importFrom(tidyr,drop_na)
importFrom(tidyr,pivot_longer)
13 changes: 13 additions & 0 deletions R-packages/modeltools/R/change.R
Expand Up @@ -19,6 +19,19 @@
#' @return A data frame given by appending a new column to `x` named according
#' to the `col_name` argument, containing the percentage change values.
#'
#' @examples \dontrun{
#' df <- covidcast::covidcast_signal("fb-survey", "smoothed_cli",
#' start_day = "2021-01-01",
#' end_day = "2021-01-31",
#' geo_type = "state")
#'
#' # percentage change between back-to-back weeks (default, as n = 14)
#' pct_change(df)
#'
#' # percentage change between back-to-back days
#' pct_change(df, n = 2)
#' }
#'
#' @export
pct_change = function(x, n = 14, col_name = "pct_change") {
# Check if n is odd and if so bump it up one
Expand Down
20 changes: 20 additions & 0 deletions R-packages/modeltools/R/deriv.R
Expand Up @@ -74,6 +74,26 @@
#'
#' @return A data frame given by appending a new column to `x` named according
#' to the `col_name` argument, containing the estimated derivative values.
#'
#' @examples \dontrun{
#' df <- covidcast::covidcast_signal("fb-survey", "smoothed_cli",
#' start_day = "2021-01-01",
#' end_day = "2021-02-28",
#' geo_type = "state",
#' geo_values = c("ca", "fl"))
#'
#' # estimate derivative using linear regression and n = 10 days
#' estimate_deriv(df, method = "lin", n = 10)
#'
#' # keep the linear regression fits
#' estimate_deriv(df, method = "lin", n = 10, keep_obj = TRUE)
#'
#' # estimate derivative using smoothing spline with 8 degrees of freedom
#' estimate_deriv(df, method = "ss", n = 28, df = 8)
#'
#' # estimate derivative using trend filtering with 8 degrees of freedom
#' estimate_deriv(df, method = "tf", n = 28, df = 8)
#' }
#'
#' @export
estimate_deriv = function(x, method = c("lin", "ss", "tf"), n = 14,
Expand Down
56 changes: 56 additions & 0 deletions R-packages/modeltools/R/example_forecaster.R
@@ -0,0 +1,56 @@
#' Simple example forecaster that forecasts confirmed cases.
#'
#' @param df_list List of downloaded covidcast signals.
#' @param forecast_date Date from which the forecast should be made.
#' @param ahead Vector of epiweeks ahead to predict.
#' @param training_window_size Number of days prior to `forecast_date` to use for training.
#' @param lags Vector of lags or list of vectors of lags to apply to `df_list`. See documentation
#' of `dt` argument to `covidcast::aggregate_signals()` for details.
#' @param quantiles Vector of quantile values at which to predict
#'
#' preds <- evalcast::get_predictions(modeltools::example_forecaster,
#' "example",
#' tibble(
#' data_source = "jhu-csse",
#' signal = "confirmed_incidence_num",
#' geo_type = "state",
#' start_date = as.Date("2021-01-01"),
#' ),
#' as.Date("2021-03-01"),
#' "epiweek")
#'
#' @importFrom dplyr mutate bind_cols bind_rows
#' @importFrom tidyr pivot_longer
#'
#' @export
example_forecaster <- function(df_list,
forecast_date,
ahead = 1:4,
training_window_size = 14,
lags = seq(0, -1*training_window_size, -7),
quantiles = modeltools::covidhub_probs) {
covariates <- covidcast::aggregate_signals(df_list, dt = lags, format = "wide")
covariates_with_response <- add_response_columns(covariates,
"jhu-csse_confirmed_incidence_num",
forecast_date,
"epiweek",
ahead)

results <- list()
for (a in ahead) {
mats <- create_train_and_predict_matrices(covariates_with_response, a, training_window_size)
model <- quantgen::quantile_lasso(mats$train_x, mats$train_y, quantiles, lambda = 0)
predictions <- quantgen:::predict.quantile_genlasso(model, mats$predict_x)

colnames(predictions) <- quantiles
predict_df <- bind_cols(geo_value = mats$predict_geo_values,
predictions) %>%
pivot_longer(cols = -geo_value,
names_to = "quantile",
values_to = "value") %>%
mutate(ahead = a)

results[[a]] <- predict_df
}
return(bind_rows(results))
}
216 changes: 216 additions & 0 deletions R-packages/modeltools/R/matrix.R
@@ -0,0 +1,216 @@
#' Create training and test data matrices and a training response for given aheads.
#'
#' Create training and test data matrices and training response for a set of
#' given aheads. Works for both single ahead values and a vector of ahead values.
#' For multiple ahead values, the function has the ability to return separate
#' data matrices and responses for each ahead, or a single data matrix and
#' response matrix for all aheads at once.
#'
#' @param lagged_df Data frame of lagged data. It should have the following columns:
#' \itemize{
#' \item `geo_value`: Strings of geographic locations.
#' \item `time_value`: Dates of training data.
#' \item Covariate columns: Columns with names of the form `value-{days}:{signal}` or
#' `value+0:{signal}` whose values correspond to `{signal}` `{days}` before `time_value`.
#' \item Response columns: Columns with names of the form `response+{n}:{response}` whose values
#' correspond to `{response}` `{n}` incidence period units after `time_value`.
#' }
#' A data frame in this format can be made using `covidcast::aggregate_signals()` and
#' `modeltools::get_response_columns()`.
#' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.)
#' ahead to forecast. Can be a single positive integer or a vector of
#' positive integers. Note that for each `{a}` in `ahead`, the column
#' `response+{a}:{response}` should be present in `lagged_df`.
#' @param training_window_size Size of the local training window in days to use. For example, if
#' `training_window_size = 14`, then to make a 1-day-ahead forecast on December 15, we train on
#' data from December 1 to December 14.
#' @param aheads_separate If `length(ahead) > 1`, should there be separate
#' data matrices and responses for each ahead? Default is `TRUE`.
#'
#' @return For a single ahead value, named list with entries:
#' \itemize{
#' \item `train_x`: Matrix of training data whose columns correspond to the
#' `value-{days}:{signal}` columns in `lagged_df`. The training data consists of the
#' latest date with an non-null response, plus all data from the `training_window_size`
#' days prior to it.
#' \item `train_y`: Vector of response data from the `response+{ahead}:{response}` column of
#' `lagged_df` corresponding to the rows of `train_x`.
#' \item `train_geo_values`: Vector of geo values corresponding to the rows
#' of `train_x`.
#' \item `train_time_values`: Vector of time values corresponding to the rows
#' of `train_x`.
#' \item `train_end_date`: Latest `time_value` used in the training period.
#' \item `predict_x`: Matrix of prediction data in the same format as `train_x`. The
#' prediction data contains the most recent `training_window_size` days.
#' \item `predict_geo_values`: Vector of `geo_values` corresponding to the rows of `predict_x`.
#' \item `predict_time_value`: Time value corresponding to `predict_x`.
#' }
#' For multiple ahead values and `aheads_separate = TRUE`, a list having
#' the same length as `ahead`, with each element being a named list as above.
#' For multiple ahead values and `ahead_separate = FALSE`, a named list as
#' above, except `train_y` is a matrix of responses rather than a vector.
#'
#' @examples \dontrun{
#' create_train_and_predict_matrices(
#' tibble(
#' geo_value = rep(c("az", "wv"), 5),
#' time_value = rep(
#' as.Date(c("2021-01-25", "2021-01-26", "2021-01-27", "2021-01-28", "2021-01-29")),
#' each = 2),
#' `value-2:signal_1` = seq(-3, 6),
#' `value-1:signal_1` = seq(-1, 8),
#' `value+0:signal_1` = seq(1, 10),
#' `response+2:signal_1` = c(seq(5, 10), rep(NA, 4))
#' ),
#' ahead = 2,
#' training_window_size = 1)
#' }
#'
#' @importFrom tibble tibble
#' @importFrom assertthat assert_that
#' @importFrom lubridate as_date
#' @importFrom purrr map_dbl map2
#'
#' @export
create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size,
aheads_separate = TRUE) {
# make sure the response columns are unique
for (a in ahead) {
responses_at_ahead <- lagged_df %>%
select(tidyselect::starts_with(sprintf("response+%i:", a))) %>%
ncol()
assert_that(responses_at_ahead == 1,
msg = paste("multiple responses at ahead =", a))
}

# prediction matrices are the same for all aheads
predict_time_value <- max(lagged_df$time_value)
predict_x <- lagged_df %>%
filter(time_value == predict_time_value) %>%
select(tidyselect::starts_with("value")) %>%
as.matrix()
predict_geo_values <- lagged_df %>%
filter(time_value == predict_time_value) %>%
select(geo_value) %>% pull()

all_out <- list()

if (aheads_separate) {
train_end_dates <- ahead %>%
purrr::map_dbl(~ get_train_end_date(lagged_df, .x)) %>%
lubridate::as_date()

all_out <- purrr::map2(ahead, train_end_dates,
~ create_train_matrices(
lagged_df, .x, training_window_size, .y))
} else { # ahead_separate = FALSE: We want a single training data matrix and
# a matrix of training responses containing all the aheads
# Need to recompute training_window_size
training_window_size <- training_window_size + max(ahead) - min(ahead)

# Find the last possible date of training data
# (corresponds to the smallest ahead)
train_end_date <- get_train_end_date(lagged_df, ahead)

# Training matrices
all_out[[1]] <- create_train_matrices(lagged_df, ahead,
training_window_size,
train_end_date)
}

# Add prediction matrices / info
for (i in seq_along(all_out)) {
all_out[[i]]$predict_x <- predict_x
all_out[[i]]$predict_geo_values <- predict_geo_values
all_out[[i]]$predict_time_value <- predict_time_value
}

if (length(ahead) == 1 || aheads_separate == FALSE) {
return(all_out[[1]])
} else {
names(all_out) <- paste0("ahead+", ahead)
return(all_out)
}
}

#' Get last possible date of training data for given aheads.
#'
#' Returns the last possible date of the training data for a given set of
#' aheads. If more than one ahead is given, the date return corresponds to the
#' last possible date corresponding to the smallest ahead value. This is because
#' the smallest ahead value will have the latest possible date.
#'
#' @param lagged_df Data frame of lagged data as in `create_train_and_predict_matrices()`.
#' @param ahead Number of days ahead to forecast. Can be a single positive
#' integer or a vector of positive integers.
#'
#' @return Single date corresponding to the last possible date of the training
#' data.
get_train_end_date <- function(lagged_df, ahead) {
response_end_date <- lagged_df %>%
select(time_value,
tidyselect::starts_with(sprintf("response+%i:", min(ahead)))) %>%
tidyr::drop_na() %>%
summarize(max(time_value)) %>%
pull()
train_end_date <- min(max(lagged_df$time_value), response_end_date)
return(train_end_date)
}

#' Create training data matrix and a training response for given aheads.
#'
#' Create training and data matrix and training response for a set of
#' given aheads. Works for both single ahead values and a vector of ahead
#' values. However, note that this function works different from
#' `create_train_and_predict_matrices()` for multiple ahead values. If
#' multiple ahead values are supplied, we return a matrix of responses
#' containing all aheads at once.
#'
#' @param lagged_df Data frame of lagged data as in `create_train_and_predict_matrices()`.
#' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.)
#' ahead to forecast. Can be a single positive integer or a vector of
#' positive integers. Note that for each `{a}` in `ahead`, the column
#' `response+{a}:{response}` should be present in `lagged_df`.
#' @param n_days Number of days worth of data to pull.
#' @param train_end_date The last date to be included in the training data.
#'
#' @return A named list with entries:
#' \itemize{
#' \item `train_x`: Matrix of training data whose columns correspond to the
#' `value-{days}:{signal}` columns in `lagged_df`. The training data consists of the
#' latest date with an non-null response, plus all data from the `training_window_size`
#' days prior to it.
#' \item `train_y`: Vector of response data from the `response+{ahead}:{response}` column of
#' `lagged_df` corresponding to the rows of `train_x`. If multiple ahead
#' values are provided, then this is a matrix instead.
#' \item `train_geo_values`: Vector of geo values corresponding to the rows
#' of `train_x`.
#' \item `train_time_values`: Vector of time values corresponding to the rows
#' of `train_x`.
#' \item `train_end_date`: Latest `time_value` used in the training period.
#' }
create_train_matrices <- function(lagged_df, ahead, n_days,
train_end_date) {
train_df <- lagged_df %>%
filter(between(time_value,
train_end_date - n_days + 1,
train_end_date))
out <- list()
out$train_x <- train_df %>%
select(tidyselect::starts_with("value")) %>%
as.matrix()

train_y <- train_df %>%
select(tidyselect::starts_with(paste0("response+", ahead, ":")))
if (length(ahead) == 1) {
out$train_y <- pull(train_y)
} else {
out$train_y <- as.matrix(train_y)
}

out$train_geo_values <- train_df$geo_value
out$train_time_values <- train_df$time_value
out$train_end_date <- as.Date(train_end_date)

return(out)
}

0 comments on commit 4431561

Please sign in to comment.