Skip to content

Commit

Permalink
Move from httr to httr2, improved testing
Browse files Browse the repository at this point in the history
Added better error handling with httr2, switched to smaller states for testing, added tests for grab_crosswalk.
  • Loading branch information
dillonma committed Nov 23, 2023
1 parent cbb5a10 commit 9399c71
Show file tree
Hide file tree
Showing 7 changed files with 173 additions and 152 deletions.
7 changes: 4 additions & 3 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Package: lehdr
Type: Package
Title: Grab Longitudinal Employer-Household Dynamics (LEHD) Flat Files
Version: 1.1.2
Version: 1.1.3
Authors@R: c(person(given="Jamaal", family="Green", role = c("cre","aut"), email="jamaal.green@gmail.com"),
person(given="Liming", family="Wang", role =c("aut"), email="lmwang@pdx.edu"),
person(given="Dillon", family="Mahmoudi", role =c("aut"), email="dillonm@umbc.edu"),
Expand All @@ -20,8 +20,9 @@ Imports:
rlang,
stringr,
glue,
httr,
dplyr
httr2,
dplyr,
magrittr
Suggests:
testthat (>= 3.0.0),
knitr,
Expand Down
6 changes: 2 additions & 4 deletions NAMESPACE
Expand Up @@ -3,13 +3,11 @@
export(grab_crosswalk)
export(grab_lodes)
import(dplyr)
import(httr2)
import(magrittr)
importFrom(dplyr,`%>%`)
importFrom(dplyr,bind_rows)
importFrom(glue,glue)
importFrom(httr,GET)
importFrom(httr,HEAD)
importFrom(httr,stop_for_status)
importFrom(httr,write_disk)
importFrom(readr,col_character)
importFrom(readr,cols)
importFrom(readr,read_csv)
Expand Down
65 changes: 43 additions & 22 deletions R/grab_crosswalk.R
Expand Up @@ -8,33 +8,36 @@
#'
#' @description Download LODES geographic crosswalk
#'
#' @import httr2
#' @importFrom glue glue
#' @importFrom httr GET write_disk HEAD stop_for_status
#' @importFrom dplyr bind_rows `%>%`
#' @importFrom readr read_csv cols col_character
#'
#' @examples
#' \donttest{
#' # Download and load current geographic crosswalk for Alaska
#' alaska_xwalk <- grab_crosswalk('AK')
#'
#' # Download and load current geographic crosswalk for New England
#' new_england_xwalk <- grab_crosswalk(c('CT', 'RI', 'MA', 'VT', 'NH', 'ME'))
#' alaska_xwalk <- grab_crosswalk("VT")
#'
#' # Download and load current geographic crosswalk for small states
#' small_states_xwalk <- grab_crosswalk(c("wy", 'ND', 'SD'))
#' }
#'

grab_crosswalk <- function(state,
download_dir = file.path(tools::R_user_dir("lehdr",
which="cache"))){
download_dir = normalizePath(file.path(tools::R_user_dir("lehdr",
which="cache")),
mustWork = FALSE)) {

states <- tolower(state)

urls <- glue::glue("https://lehd.ces.census.gov/data/lodes/LODES8/{states}/{states}_xwalk.csv.gz")

for (url in urls){
httr::stop_for_status(httr::HEAD(url),
paste0("Data for this state was not found on LODES.\nURL: ", url))
}
# Old method from httr - could be converted to httr2, but error handling is in
# the request now (see download_xwalk).
# for (url in urls){
# httr::stop_for_status(httr::HEAD(url),
# paste0("Data for this state was not found on LODES.\nURL: ", url))
# }

vdownload_xwalk(url = urls, download_dir = download_dir) %>%
vread_xwalk() %>%
Expand All @@ -49,29 +52,47 @@ download_xwalk <- function(url, download_dir){
dir.create(download_dir, recursive=TRUE)
fil <- normalizePath(file.path(download_dir, basename(url)), mustWork = FALSE)

# Read from FTP site
if (file.exists(fil)) {
message(glue::glue("Cached version of file found in {fil}\n"))
# Create httr2 request from url
lodes_req <- request(url)

# Perform request and handle connection errors, writing response to disk
rlang::inform(glue::glue("Downloading crosswalk {url} to {fil}"))
withCallingHandlers(
lodes_resp <- lodes_req |>
req_error(is_error = \(lodes_resp) FALSE) |>
req_perform(path = fil),
httr2_failure = function(cnd) {
rlang::abort(c("lehdr: Could not establish connection to LODES FTP server.",
"i" = "Please check internet connection."))
}
)
if(lodes_resp$status_code >= 400) {
rlang::abort(c(glue::glue("lehdr: Server error while downloading: {lodes_req$url}"),
"i" = glue::glue("{lodes_resp$status_code} status returned."),
"i" = "Please consult the most recent LEHD Technical Document to verify state/year combination availability.",
"i" = "https://lehd.ces.census.gov/data/lodes/8/"))
} else if(length(lodes_resp$body) < 1) {
rlang::abort(c(glue::glue("lehdr: Connection error while downloading: {lodes_req$url}"),
"i" = "Empty response."))
} else {
message(glue::glue("Downloading {url} to {fil}"))
res <- httr::GET(url, httr::write_disk(fil))
message(glue::glue("Download complete."))
rlang::inform(glue::glue("Download complete for {fil}"))
}

return(fil)

}

read_xwalk <- function(filepath){
res <- suppressMessages(readr::read_csv(filepath, col_types = readr::cols(.default = 'c')))
download_dir <- dirname(filepath)
filename <- basename(filepath)

# Remove cached files now that they're read in
# Unlink returns 0 for success, 1 for failure
# Unlink returns 0 for success, 1 for failure
if(unlink(filepath)) {
message(glue::glue("Could not clear crosswalk cache."))
rlang::inform(glue::glue("Could not clear {filename} from crosswalk cache."))
} else {
message(glue::glue("Crosswalk cache cleared."))

rlang::inform(glue::glue("{filename} crosswalk cleared from cache."))
# Now check to see if the cache directory is empty, remove it if it is
if(length(list.files(download_dir)) == 0) {
unlink(download_dir, recursive = TRUE)
Expand Down
108 changes: 69 additions & 39 deletions R/lehdr.R
Expand Up @@ -43,31 +43,32 @@
#' @description Download LODES OD, RAC, and WAC tables
#' @return a dataframe (tibble) of block or tract level LODES files
#' @import dplyr
#' @import magrittr
#' @import httr2
#' @importFrom readr read_csv cols col_character
#' @importFrom httr GET stop_for_status HEAD write_disk
#' @importFrom glue glue
#' @importFrom stats na.omit
#' @importFrom stringr str_sub str_extract
#'
#' @examples
#' \donttest{
#' # download and load 2014 block level O-D data for Oregon
#' blk_df_or_od <- grab_lodes(state = 'or', year = 2014, lodes_type = "od", job_type = "JT01",
#' # download and load 2014 block level O-D data for Vermont
#' blk_df_or_od <- grab_lodes(state = 'vt', year = 2014, lodes_type = "od", job_type = "JT01",
#' segment = "SA01", state_part = "main")
#'
#' # download and load 2014 O-D data for Oregon and aggregate
#' # download and load 2014 O-D data for Vermont and aggregate
#' # to the tract level
#' trt_df_or_od <- grab_lodes(state = 'or', year = 2014, lodes_type = "od", job_type = "JT01",
#' trt_df_or_od <- grab_lodes(state = 'vt', year = 2014, lodes_type = "od", job_type = "JT01",
#' segment = "SA01", state_part = "main", agg_geo = "tract")
#'
#' # download and load 2014 RAC data for Oregon and aggregate
#' # download and load 2020 RAC data for Vermont and aggregate
#' # to the tract level
#' trt_df_or_rac <- grab_lodes(state = 'or', year = 2014, lodes_type = "rac", job_type = "JT01",
#' trt_df_or_rac <- grab_lodes(state = 'vt', year = 2014, lodes_type = "rac", job_type = "JT01",
#' segment = "SA01", agg_geo = "tract")
#'
#' # download and load 2014 WAC data for Oregon and aggregate
#' # download and load 2020 WAC data for Vermont and aggregate
#' # to the tract level
#' trt_df_or_wac <- grab_lodes(state = 'or', year = 2014, lodes_type = "wac", job_type = "JT01",
#' trt_df_or_wac <- grab_lodes(state = 'vt', year = 2014, lodes_type = "wac", job_type = "JT01",
#' segment = "SA01", agg_geo = "tract")
#' }
#' @export
Expand Down Expand Up @@ -144,42 +145,72 @@ grab_lodes <- function(state, year,
}
}

# On URL error, the likely culprit is the lack of state/year combination ...
httr::stop_for_status(httr::HEAD(url),
paste("retrieve data for this combination of state and year on LODES.",
"Please see the most recent LEHD Technical Document for a list of available state/year.",
glue::glue("https://lehd.ces.census.gov/data/lodes/{version}/")
)
)

# Set download directory, check for cache
download_dir <- path.expand(download_dir)
if (!dir.exists(download_dir))
# Set download directory and file name
if (!dir.exists(download_dir)) {
dir.create(download_dir, recursive=TRUE)
fil <- file.path(download_dir, basename(url))
}
fil <- normalizePath(file.path(download_dir, basename(url)), mustWork = FALSE)

# Create httr2 request from url
lodes_req <- request(url)

# Check for cache
if(use_cache) { # User set use_cache to TRUE
# If there is a cache, use it
if(file.exists(fil)) {
message(glue::glue("Cached version of file found in: {fil}"))
} else {
message(glue::glue("Downloading {url} to cache folder: {fil}"))
res <- httr::GET(url, httr::write_disk(fil))
message(glue::glue("Download complete."))
if(file.exists(fil)) { # If there is a cached file, use it
rlang::inform(glue::glue("Using cached version of file found in: {fil}"))
} else { # No cached file
# Perform request and handle connection errors, writing response to disk
withCallingHandlers(
lodes_resp <- lodes_req |>
req_error(is_error = \(lodes_resp) FALSE) |>
req_perform(path = fil),
httr2_failure = function(cnd) {
rlang::abort(c("lehdr: Could not establish connection to LODES FTP server.",
"i" = "Please check internet connection."))
}
)
if(lodes_resp$status_code >= 400) {
rlang::abort(c(glue::glue("lehdr: Server error while downloading: {lodes_req$url}"),
"i" = glue::glue("{lodes_resp$status_code} status returned."),
"i" = "Please consult the most recent LEHD Technical Document to verify state/year combination availability.",
"i" = "https://lehd.ces.census.gov/data/lodes/8/"))
} else if(length(lodes_resp$body) < 1) {
rlang::abort(c(glue::glue("lehdr: Connection error while downloading: {lodes_req$url}"),
"i" = "Empty response."))
} else {
rlang::inform(glue::glue("Download complete for {fil}"))
}
}
} else { # User did not allow cache to be used
if(file.exists(fil)) {
if(file.exists(fil)) { # But there is a cached file
# Existing file found, inform user of use_cache
message(glue::glue("Cached version of file found in: {fil}"))
message(glue::glue("Consider setting use_cache=TRUE to use previously downloaded files."))
} else {
# No file found, inform user that we're downloading
rlang::inform(glue::glue("Cached version of file found in: {fil}"))
rlang::inform(glue::glue("Consider setting use_cache=TRUE to use previously downloaded files."))
}
# Download (and overwite if necessary) data from server
message(glue::glue("Overwriting {url} to {fil} now..."))
res <- httr::GET(url, httr::write_disk(fil, overwrite = TRUE))
message(glue::glue("Overwrite complete."))
rlang::inform(glue::glue("Downloading {url} to {fil} now..."))

# Perform request and handle connection errors, writing response to disk
withCallingHandlers(
lodes_resp <- lodes_req |>
req_error(is_error = \(lodes_resp) FALSE) |>
req_perform(path = fil),
httr2_failure = function(cnd) {
rlang::abort(c("lehdr: Could not establish connection to LODES FTP server.",
"i" = "Please check internet connection."))
}
)
if(lodes_resp$status_code >= 400) {
rlang::abort(c(glue::glue("lehdr: Server error while downloading: {lodes_req$url}"),
"i" = glue::glue("{lodes_resp$status_code} status returned."),
"i" = "Please consult the most recent LEHD Technical Document to verify state/year combination availability.",
"i" = "https://lehd.ces.census.gov/data/lodes/8/"))
} else if(length(lodes_resp$body) < 1) {
rlang::abort(c(glue::glue("lehdr: Connection error while downloading: {lodes_req$url}"),
"i" = "Empty response."))
} else {
rlang::inform(glue::glue("Download complete for {fil}"))
}
}

# Read in the data
Expand All @@ -188,14 +219,13 @@ grab_lodes <- function(state, year,
# Remove temp files if the user did not set use_cache = TRUE
if(!use_cache) {
if(unlink(fil)) { # 0 for success, 1 for failure, invisibly.
message(glue::glue("Could not remove temporary file {fil}."))
rlang::inform(glue::glue("Could not clear {fil} from cache."))
} else {
message(glue::glue("Removed {fil}."))
rlang::inform(glue::glue("{fil} cleared from cache."))
# Now check to see if the cache directory is empty, remove it if it is
if(length(list.files(download_dir)) == 0) {
unlink(download_dir, recursive = TRUE)
}

}
}

Expand Down
11 changes: 6 additions & 5 deletions man/grab_crosswalk.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 8 additions & 8 deletions man/grab_lodes.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 9399c71

Please sign in to comment.