Merge pull request #652 from cmu-delphi/ndefries/use-fread-instead-of…

…-arrow Use `fread` instead of `arrow` to get data in `get_forecaster_predictions_alt`
cmu-delphi · Jul 24, 2023 · fa4bc21 · fa4bc21
2 parents d52e709 + 53d7737
commit fa4bc21
Show file tree

Hide file tree

Showing 4 changed files with 39 additions and 56 deletions.
diff --git a/R-packages/evalcast/DESCRIPTION b/R-packages/evalcast/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: evalcast
 Type: Package
 Title: Tools For Evaluating COVID Forecasters
-Version: 0.3.3
+Version: 0.3.4
 Authors@R:
   c(
     person(given = "Daniel",
@@ -77,7 +77,6 @@ Imports:
     covidHubUtils,
     forcats,
     data.table,
-    arrow,
     fs,
     plyr,
     bettermc,

diff --git a/R-packages/evalcast/NAMESPACE b/R-packages/evalcast/NAMESPACE
@@ -46,12 +46,11 @@ import(lubridate)
 import(purrr)
 import(stringr)
 import(tibble)
-importFrom(arrow,open_dataset)
-importFrom(arrow,schema)
-importFrom(arrow,string)
 importFrom(assertthat,assert_that)
 importFrom(covidcast,covidcast_signal)
+importFrom(data.table,"%chin%")
 importFrom(data.table,fread)
+importFrom(dplyr,bind_rows)
 importFrom(magrittr,"%>%")
 importFrom(purrr,pmap)
 importFrom(rlang,":=")
@@ -65,4 +64,3 @@ importFrom(tidyr,drop_na)
 importFrom(tidyr,pivot_longer)
 importFrom(tidyr,pivot_wider)
 importFrom(tidyr,separate)
-importFrom(utils,download.file)
diff --git a/R-packages/evalcast/NEWS.md b/R-packages/evalcast/NEWS.md
@@ -1,7 +1,16 @@
+# evalcast 0.3.4
+
+- Change `get_forecaster_predictions_alt` to read forecaster input files using
+  `data.table::fread`. This replaces a several-years-old version of
+  `arrow::open_dataset`. `data.table::fread` is faster, doesn't require a
+  separate download step, and has more flexibility in column specification.
+
 # evalcast 0.3.3
 
 - Fix `get_covidhub_forecast_dates`, likely broken by GitHub website format
-  change, by refactoring it to use GitHub API
+  change, by refactoring it to use GitHub API. If requesting many dates, this
+  may require authentication with a
+  [GitHub API key](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens)
 
 # evalcast 0.3.2
 

diff --git a/R-packages/evalcast/R/get_covidhub_predictions.R b/R-packages/evalcast/R/get_covidhub_predictions.R
@@ -301,8 +301,8 @@ get_forecaster_predictions <- function(covidhub_forecaster_name,
 #' @return Predictions card. For more flexible processing of COVID Hub data, try
 #'   using [zoltr](https://docs.zoltardata.com/zoltr/)
 #'
-#' @importFrom arrow open_dataset schema string
-#' @importFrom utils download.file
+#' @importFrom data.table fread %chin%
+#' @importFrom dplyr bind_rows
 get_forecaster_predictions_alt <- function(covidhub_forecaster_name,
                                            forecast_dates = NULL,
                                            geo_values = "*",
@@ -318,20 +318,15 @@ get_forecaster_predictions_alt <- function(covidhub_forecaster_name,
   if (is.null(forecast_dates))
     forecast_dates <- get_covidhub_forecast_dates(covidhub_forecaster_name)
   forecast_dates <- as.character(forecast_dates)
-  # Download files to disk first
-  # File layout is data/<covidhub_forecaster_name>/<forecast_date>/data.csv
-  for (forecast_date in forecast_dates) {
-    output_dir <- file.path("data", covidhub_forecaster_name, forecast_date)
-    output_file <- file.path("data", covidhub_forecaster_name, forecast_date, "data.csv")
-    filename <- sprintf("%s/%s/%s-%s.csv",
+  # Read files directly from GitHub
+  ds <- lapply(forecast_dates, function(forecast_date) {
+    target_url <- sprintf("%s/%s/%s-%s.csv",
                         url,
                         covidhub_forecaster_name,
                         forecast_date,
                         covidhub_forecaster_name)
 
-    dir.create(output_dir, recursive = TRUE)
-
-    # Download file. Re-attempt up to 8 times (max 2 min wait).
+    # Read file. Re-attempt up to 8 times (max 2 min wait).
     attempt <- 0
     n_max_attempt <- 8
     base_wait <- 1 # second
@@ -340,11 +335,17 @@ get_forecaster_predictions_alt <- function(covidhub_forecaster_name,
       attempt <- attempt + 1
       # Increase time between download attempts in exponential backoff
       wait <- base_wait * 2 ^ (attempt - 1)
-      download_status <- try({
-        download.file(filename, output_file, mode="w", quiet=TRUE)
+      # If the read attempt succeeds, returns a dataframe; else a try-error
+      read_status <- try({
+        fread(target_url, showProgress = FALSE, data.table = FALSE,
+          colClasses=list(
+            character=c("forecast_date", "target", "target_end_date", "type", "location"),
+            numeric = c("quantile", "value")
+          )
+        )
       })
 
-      if (download_status != 0) {
+      if (inherits(read_status, "try-error")) {
         if (attempt < n_max_attempt) { message("retrying...") }
         Sys.sleep(wait)
         next
@@ -353,63 +354,42 @@ get_forecaster_predictions_alt <- function(covidhub_forecaster_name,
       }
     }
 
-    if (attempt == n_max_attempt & download_status != 0) {
-      warning(filename, " could not be downloaded")
-      # Delete dir. We expect it to be empty, but double check.
-      if (length(list.files(output_dir)) == 0) {
-        unlink(output_dir, recursive = TRUE)
-      }
-    } else if (attempt > 1 & download_status == 0) {
+    if (attempt == n_max_attempt & inherits(read_status, "try-error")) {
+      warning(target_url, " could not be read")
+    } else if (attempt > 1 & !inherits(read_status, "try-error")) {
       message("succeeded after ", attempt, " attempts")
     }
-  }
-
-  sch <- schema(forecast_date=string(),
-                target=string(),
-                target_end_date=string(),
-                location=string(),
-                type=string(),
-                quantile=string(),
-                value=string())
 
-  ds <- open_dataset(file.path("data", covidhub_forecaster_name),
-                     format = "csv", schema = sch)
+    return(read_status)
+  }) %>%
+    bind_rows()
 
   # Create all derived columns from target first to join later
   # Works with just the distinct values of target for efficiency
-  target_separated <- ds %>%
-    select(.data$target) %>%
-    collect() %>%
-    distinct() %>%
+  target_separated <- distinct(ds, target) %>%
     process_target(remove = FALSE)
 
   pcards <- ds %>%
-      collect() %>%
       left_join(target_separated, by = "target") %>%
       select(-.data$target) %>%
       mutate(forecaster = covidhub_forecaster_name,
              forecast_date = lubridate::ymd(.data$forecast_date),
-             target_end_date = lubridate::ymd(.data$target_end_date),
-             quantile = as.double(.data$quantile),
-             value = as.double(.data$value)) %>%
+             target_end_date = lubridate::ymd(.data$target_end_date)) %>%
       filter_predictions(forecast_type, incidence_period, signal) %>%
       select_pcard_cols()
 
   pcards <- pcards %>%
     location_2_geo_value()
   if (!identical(geo_values, "*")) {
-    pcards <- filter(pcards, .data$geo_value %in% geo_values)
+    pcards <- filter(pcards, .data$geo_value %chin% geo_values)
   }
   if (!is.null(ahead)) {
     pcards <- filter(pcards, .data$ahead %in% !!ahead)
   }
-  pcards <- filter(pcards, .data$signal %in% !!signal)
+  pcards <- filter(pcards, .data$signal %chin% !!signal)
   class(pcards) = c("predictions_cards", class(pcards))
 
-  # Cleanup, delete downloaded CSVs from disk
-  unlink(file.path("data", covidhub_forecaster_name), recursive = TRUE)
-
-  pcards
+  return(pcards)
 }
 
 #' Get available forecast dates for a forecaster on the COVID Hub
@@ -532,9 +512,6 @@ get_covidhub_forecaster_names <- function(
   return(forecaster_names)
 }
 
-
-
-
 #' Vector of quantiles used for submission to the COVID 19 Forecast Hub
 #'
 #' See the [Forecast Hub Documentation](https://github.com/reichlab/covid19-forecast-hub/blob/master/data-processed/README.md#quantile)