Skip to content

Commit

Permalink
Merge pull request #652 from cmu-delphi/ndefries/use-fread-instead-of…
Browse files Browse the repository at this point in the history
…-arrow

Use `fread` instead of `arrow` to get data in `get_forecaster_predictions_alt`
  • Loading branch information
dshemetov committed Jul 24, 2023
2 parents d52e709 + 53d7737 commit fa4bc21
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 56 deletions.
3 changes: 1 addition & 2 deletions R-packages/evalcast/DESCRIPTION
@@ -1,7 +1,7 @@
Package: evalcast
Type: Package
Title: Tools For Evaluating COVID Forecasters
Version: 0.3.3
Version: 0.3.4
Authors@R:
c(
person(given = "Daniel",
Expand Down Expand Up @@ -77,7 +77,6 @@ Imports:
covidHubUtils,
forcats,
data.table,
arrow,
fs,
plyr,
bettermc,
Expand Down
6 changes: 2 additions & 4 deletions R-packages/evalcast/NAMESPACE
Expand Up @@ -46,12 +46,11 @@ import(lubridate)
import(purrr)
import(stringr)
import(tibble)
importFrom(arrow,open_dataset)
importFrom(arrow,schema)
importFrom(arrow,string)
importFrom(assertthat,assert_that)
importFrom(covidcast,covidcast_signal)
importFrom(data.table,"%chin%")
importFrom(data.table,fread)
importFrom(dplyr,bind_rows)
importFrom(magrittr,"%>%")
importFrom(purrr,pmap)
importFrom(rlang,":=")
Expand All @@ -65,4 +64,3 @@ importFrom(tidyr,drop_na)
importFrom(tidyr,pivot_longer)
importFrom(tidyr,pivot_wider)
importFrom(tidyr,separate)
importFrom(utils,download.file)
11 changes: 10 additions & 1 deletion R-packages/evalcast/NEWS.md
@@ -1,7 +1,16 @@
# evalcast 0.3.4

- Change `get_forecaster_predictions_alt` to read forecaster input files using
`data.table::fread`. This replaces a several-years-old version of
`arrow::open_dataset`. `data.table::fread` is faster, doesn't require a
separate download step, and has more flexibility in column specification.

# evalcast 0.3.3

- Fix `get_covidhub_forecast_dates`, likely broken by GitHub website format
change, by refactoring it to use GitHub API
change, by refactoring it to use GitHub API. If requesting many dates, this
may require authentication with a
[GitHub API key](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens)

# evalcast 0.3.2

Expand Down
75 changes: 26 additions & 49 deletions R-packages/evalcast/R/get_covidhub_predictions.R
Expand Up @@ -301,8 +301,8 @@ get_forecaster_predictions <- function(covidhub_forecaster_name,
#' @return Predictions card. For more flexible processing of COVID Hub data, try
#' using [zoltr](https://docs.zoltardata.com/zoltr/)
#'
#' @importFrom arrow open_dataset schema string
#' @importFrom utils download.file
#' @importFrom data.table fread %chin%
#' @importFrom dplyr bind_rows
get_forecaster_predictions_alt <- function(covidhub_forecaster_name,
forecast_dates = NULL,
geo_values = "*",
Expand All @@ -318,20 +318,15 @@ get_forecaster_predictions_alt <- function(covidhub_forecaster_name,
if (is.null(forecast_dates))
forecast_dates <- get_covidhub_forecast_dates(covidhub_forecaster_name)
forecast_dates <- as.character(forecast_dates)
# Download files to disk first
# File layout is data/<covidhub_forecaster_name>/<forecast_date>/data.csv
for (forecast_date in forecast_dates) {
output_dir <- file.path("data", covidhub_forecaster_name, forecast_date)
output_file <- file.path("data", covidhub_forecaster_name, forecast_date, "data.csv")
filename <- sprintf("%s/%s/%s-%s.csv",
# Read files directly from GitHub
ds <- lapply(forecast_dates, function(forecast_date) {
target_url <- sprintf("%s/%s/%s-%s.csv",
url,
covidhub_forecaster_name,
forecast_date,
covidhub_forecaster_name)

dir.create(output_dir, recursive = TRUE)

# Download file. Re-attempt up to 8 times (max 2 min wait).
# Read file. Re-attempt up to 8 times (max 2 min wait).
attempt <- 0
n_max_attempt <- 8
base_wait <- 1 # second
Expand All @@ -340,11 +335,17 @@ get_forecaster_predictions_alt <- function(covidhub_forecaster_name,
attempt <- attempt + 1
# Increase time between download attempts in exponential backoff
wait <- base_wait * 2 ^ (attempt - 1)
download_status <- try({
download.file(filename, output_file, mode="w", quiet=TRUE)
# If the read attempt succeeds, returns a dataframe; else a try-error
read_status <- try({
fread(target_url, showProgress = FALSE, data.table = FALSE,
colClasses=list(
character=c("forecast_date", "target", "target_end_date", "type", "location"),
numeric = c("quantile", "value")
)
)
})

if (download_status != 0) {
if (inherits(read_status, "try-error")) {
if (attempt < n_max_attempt) { message("retrying...") }
Sys.sleep(wait)
next
Expand All @@ -353,63 +354,42 @@ get_forecaster_predictions_alt <- function(covidhub_forecaster_name,
}
}

if (attempt == n_max_attempt & download_status != 0) {
warning(filename, " could not be downloaded")
# Delete dir. We expect it to be empty, but double check.
if (length(list.files(output_dir)) == 0) {
unlink(output_dir, recursive = TRUE)
}
} else if (attempt > 1 & download_status == 0) {
if (attempt == n_max_attempt & inherits(read_status, "try-error")) {
warning(target_url, " could not be read")
} else if (attempt > 1 & !inherits(read_status, "try-error")) {
message("succeeded after ", attempt, " attempts")
}
}

sch <- schema(forecast_date=string(),
target=string(),
target_end_date=string(),
location=string(),
type=string(),
quantile=string(),
value=string())

ds <- open_dataset(file.path("data", covidhub_forecaster_name),
format = "csv", schema = sch)
return(read_status)
}) %>%
bind_rows()

# Create all derived columns from target first to join later
# Works with just the distinct values of target for efficiency
target_separated <- ds %>%
select(.data$target) %>%
collect() %>%
distinct() %>%
target_separated <- distinct(ds, target) %>%
process_target(remove = FALSE)

pcards <- ds %>%
collect() %>%
left_join(target_separated, by = "target") %>%
select(-.data$target) %>%
mutate(forecaster = covidhub_forecaster_name,
forecast_date = lubridate::ymd(.data$forecast_date),
target_end_date = lubridate::ymd(.data$target_end_date),
quantile = as.double(.data$quantile),
value = as.double(.data$value)) %>%
target_end_date = lubridate::ymd(.data$target_end_date)) %>%
filter_predictions(forecast_type, incidence_period, signal) %>%
select_pcard_cols()

pcards <- pcards %>%
location_2_geo_value()
if (!identical(geo_values, "*")) {
pcards <- filter(pcards, .data$geo_value %in% geo_values)
pcards <- filter(pcards, .data$geo_value %chin% geo_values)
}
if (!is.null(ahead)) {
pcards <- filter(pcards, .data$ahead %in% !!ahead)
}
pcards <- filter(pcards, .data$signal %in% !!signal)
pcards <- filter(pcards, .data$signal %chin% !!signal)
class(pcards) = c("predictions_cards", class(pcards))

# Cleanup, delete downloaded CSVs from disk
unlink(file.path("data", covidhub_forecaster_name), recursive = TRUE)

pcards
return(pcards)
}

#' Get available forecast dates for a forecaster on the COVID Hub
Expand Down Expand Up @@ -532,9 +512,6 @@ get_covidhub_forecaster_names <- function(
return(forecaster_names)
}




#' Vector of quantiles used for submission to the COVID 19 Forecast Hub
#'
#' See the [Forecast Hub Documentation](https://github.com/reichlab/covid19-forecast-hub/blob/master/data-processed/README.md#quantile)
Expand Down

0 comments on commit fa4bc21

Please sign in to comment.