Merge pull request #563 from walkerke/pep2023

Add support for 2023 PEP and fix #544
walkerke · Mar 18, 2024 · 1624936 · 1624936
2 parents ba0d73f + 9363bde
commit 1624936
Show file tree

Hide file tree

Showing 49 changed files with 212 additions and 286 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,12 +1,12 @@
 Package: tidycensus
 Type: Package
 Title: Load US Census Boundary and Attribute Data as 'tidyverse' and 'sf'-Ready Data Frames
-Version: 1.6.2
+Version: 1.6.3
 Authors@R: c(
     person(given = "Kyle", family = "Walker", email="kyle@walker-data.com", role=c("aut", "cre")),
     person(given = "Matt", family = "Herman", email = "mfherman@gmail.com", role = "aut"),
     person(given = "Kris", family = "Eberwein", email = "eberwein@knights.ucf.edu", role = "ctb"))
-Date: 2024-03-05
+Date: 2024-03-18
 URL: https://walker-data.com/tidycensus/
 BugReports: https://github.com/walkerke/tidycensus/issues
 Description: An integrated R interface to several United States Census Bureau 

diff --git a/R/estimates.R b/R/estimates.R
@@ -1,5 +1,7 @@
 #' Get data from the US Census Bureau Population Estimates Program
 #'
+#' The \code{get_estimates()} function requests data from the US Census Bureau's Population Estimates Program (PEP) datasets.  The PEP datasets are defined by the US Census Bureau as follows: "The Census Bureau's Population Estimates Program (PEP) produces estimates of the population for the United States, its states, counties, cities, and towns, as well as for the Commonwealth of Puerto Rico and its municipios. Demographic components of population change (births, deaths, and migration) are produced at the national, state, and county levels of geography. Additionally, housing unit estimates are produced for the nation, states, and counties.  PEP annually utilizes current data on births, deaths, and migration to calculate population change since the most recent decennial census and produce a time series of estimates of population, demographic components of change, and housing units. The annual time series of estimates begins with the most recent decennial census data and extends to the vintage year. As each vintage of estimates includes all years since the most recent decennial census, the latest vintage of data available supersedes all previously-produced estimates for those dates."
+#'
 #' \code{get_estimates()} requests data from the Population Estimates API for years 2019 and earlier; however the Population Estimates are no longer supported on the API as of 2020.  For recent years, \code{get_estimates()} reads a flat file from the Census website and parses it.  This means that arguments and output for 2020 and later datasets may differ slightly from datasets acquired for 2019 and earlier.
 #'
 #' As of April 2022, variables available for 2020 and later datasets are as follows: ESTIMATESBASE, POPESTIMATE, NPOPCHG, BIRTHS, DEATHS, NATURALCHG, INTERNATIONALMIG, DOMESTICMIG, NETMIG, RESIDUAL, GQESTIMATESBASE, GQESTIMATES, RBIRTH, RDEATH, RNATURALCHG, RINTERNATIONALMIG, RDOMESTICMIG, and RNETMIG.
@@ -20,11 +22,9 @@
 #'
 #' @param breakdown_labels Whether or not to label breakdown elements returned when
 #'                         \code{product = "characteristics"}. Defaults to FALSE.
-#'
-#'                         Not yet supported for 2020 and later.
-#' @param year The data year (defaults to 2022). It is recommended to use the most recent vintage
-#'             available for a given decennial series (so, year = 2019 for the 2010s, and year = 2022 for the 2020s)
-#'             and use \code{time_series = TRUE} to access time-series estimates.
+#' @param vintage It is recommended to use the most recent vintage
+#'             available for a given decennial series (so, year = 2019 for the 2010s, and year = 2023 for the 2020s).  Will default to 2022 until the full PEP for 2023 is released.
+#' @param year The data year (defaults to the vintage requested). Use \code{time_series = TRUE} to access time-series estimates.
 #' @param state The state for which you are requesting data. State
 #'              names, postal codes, and FIPS codes are accepted.
 #'              Defaults to NULL.
@@ -58,14 +58,15 @@
 #' @param ... other keyword arguments
 #'
 #' @return A tibble, or sf tibble, of population estimates data
+#' @seealso \url{https://www.census.gov/programs-surveys/popest/about.html}
 #' @export
 get_estimates <- function(geography = c("us", "region", "division", "state", "county", "county subdivision",
                                         "place/balance (or part)", "place", "consolidated city", "place (or part)",
                                         "metropolitan statistical area/micropolitan statistical area", "cbsa",
                                         "metropolitan division", "combined statistical area"),
                           product = NULL, variables = NULL,
-                          breakdown = NULL, breakdown_labels = FALSE,
-                          year = 2022, state = NULL, county = NULL,
+                          breakdown = NULL, breakdown_labels = FALSE, vintage = 2022,
+                          year = vintage, state = NULL, county = NULL,
                           time_series = FALSE,
                           output = "tidy", geometry = FALSE, keep_geo_vars = FALSE,
                           shift_geo = FALSE, key = NULL, show_call = FALSE, ...) {
@@ -91,6 +92,10 @@ get_estimates <- function(geography = c("us", "region", "division", "state", "co
           rlang::abort("The only supported geographies at this time for population characteristics 2020 and later are 'state' and 'county'.")
         }
 
+        if (vintage > 2022) {
+          rlang::abort("The Characteristics dataset has not yet been released for vintages beyond 2022.")
+        }
+
         if (geography == "state") {
 
           state_raw <- suppressMessages(readr::read_csv("https://www2.census.gov/programs-surveys/popest/datasets/2020-2022/state/asrh/sc-est2022-alldata6.csv"))
@@ -311,15 +316,26 @@ get_estimates <- function(geography = c("us", "region", "division", "state", "co
           if (product == "population") {
             variables <- population_estimates_variables22
           } else if (product == "components") {
-            variables <- components_estimates_variables22
+            if (year == 2021) {
+              variables <- components_estimates_variables21
+
+            } else {
+              variables <- components_estimates_variables22
+
+            }
           }
         }
 
         # Get the data into a reasonable first format that is consistent for downstream use
         if (geography == "us") {
 
-          raw <- suppressMessages(readr::read_csv("https://www2.census.gov/programs-surveys/popest/datasets/2020-2022/state/totals/NST-EST2022-ALLDATA.csv")) %>%
-            dplyr::filter(SUMLEV == "010")
+          if (vintage == 2021) {
+            raw <- suppressMessages(readr::read_csv(sprintf("https://www2.census.gov/programs-surveys/popest/datasets/2020-%s/state/totals/NST-EST%s-alldata.csv", vintage, vintage))) %>%
+              dplyr::filter(SUMLEV == "010")
+          } else {
+            raw <- suppressMessages(readr::read_csv(sprintf("https://www2.census.gov/programs-surveys/popest/datasets/2020-%s/state/totals/NST-EST%s-ALLDATA.csv", vintage, vintage))) %>%
+              dplyr::filter(SUMLEV == "010")
+          }
 
           raw <- raw[,!(names(raw) %in% c("SUMLEV", "REGION", "DIVISION", "STATE"))]
 
@@ -335,9 +351,15 @@ get_estimates <- function(geography = c("us", "region", "division", "state", "co
 
         } else if (geography == "region") {
 
-          raw <- suppressMessages(readr::read_csv("https://www2.census.gov/programs-surveys/popest/datasets/2020-2022/state/totals/NST-EST2022-ALLDATA.csv")) %>%
-            dplyr::filter(SUMLEV == "020") %>%
-            dplyr::mutate(GEOID = REGION)
+          if (vintage == 2021) {
+            raw <- suppressMessages(readr::read_csv(sprintf("https://www2.census.gov/programs-surveys/popest/datasets/2020-%s/state/totals/NST-EST%s-alldata.csv", vintage, vintage))) %>%
+              dplyr::filter(SUMLEV == "020") %>%
+              dplyr::mutate(GEOID = REGION)
+          } else {
+            raw <- suppressMessages(readr::read_csv(sprintf("https://www2.census.gov/programs-surveys/popest/datasets/2020-%s/state/totals/NST-EST%s-ALLDATA.csv", vintage, vintage))) %>%
+              dplyr::filter(SUMLEV == "020") %>%
+              dplyr::mutate(GEOID = REGION)
+          }
 
           raw <- raw[,!(names(raw) %in% c("SUMLEV", "REGION", "DIVISION", "STATE"))]
 
@@ -350,9 +372,13 @@ get_estimates <- function(geography = c("us", "region", "division", "state", "co
 
         } else if (geography == "division") {
 
-          raw <- suppressMessages(readr::read_csv("https://www2.census.gov/programs-surveys/popest/datasets/2020-2022/state/totals/NST-EST2022-ALLDATA.csv")) %>%
-            dplyr::filter(SUMLEV == "030") %>%
-            dplyr::mutate(GEOID = DIVISION)
+          if (vintage == 2021) {
+            rlang::abort("Divisions are not available in the 2021 vintage dataset.")
+          } else {
+            raw <- suppressMessages(readr::read_csv(sprintf("https://www2.census.gov/programs-surveys/popest/datasets/2020-%s/state/totals/NST-EST%s-ALLDATA.csv", vintage, vintage))) %>%
+              dplyr::filter(SUMLEV == "030") %>%
+              dplyr::mutate(GEOID = DIVISION)
+          }
 
           raw <- raw[,!(names(raw) %in% c("SUMLEV", "REGION", "DIVISION", "STATE"))]
 
@@ -365,9 +391,15 @@ get_estimates <- function(geography = c("us", "region", "division", "state", "co
 
         } else if (geography == "state") {
 
-          raw <- suppressMessages(readr::read_csv("https://www2.census.gov/programs-surveys/popest/datasets/2020-2022/state/totals/NST-EST2022-ALLDATA.csv")) %>%
-            dplyr::filter(SUMLEV == "040") %>%
-            dplyr::mutate(GEOID = STATE)
+          if (vintage == 2021) {
+            raw <- suppressMessages(readr::read_csv(sprintf("https://www2.census.gov/programs-surveys/popest/datasets/2020-%s/state/totals/NST-EST%s-alldata.csv", vintage, vintage))) %>%
+              dplyr::filter(SUMLEV == "040") %>%
+              dplyr::mutate(GEOID = STATE)
+          } else {
+            raw <- suppressMessages(readr::read_csv(sprintf("https://www2.census.gov/programs-surveys/popest/datasets/2020-%s/state/totals/NST-EST%s-ALLDATA.csv", vintage, vintage))) %>%
+              dplyr::filter(SUMLEV == "040") %>%
+              dplyr::mutate(GEOID = STATE)
+          }
 
           raw <- raw[,!(names(raw) %in% c("SUMLEV", "REGION", "DIVISION", "STATE"))]
 
@@ -380,7 +412,7 @@ get_estimates <- function(geography = c("us", "region", "division", "state", "co
 
         } else if (geography == "county") {
 
-          raw <- suppressMessages(readr::read_csv("https://www2.census.gov/programs-surveys/popest/datasets/2020-2022/counties/totals/co-est2022-alldata.csv")) %>%
+          raw <- suppressMessages(readr::read_csv(sprintf("https://www2.census.gov/programs-surveys/popest/datasets/2020-%s/counties/totals/co-est%s-alldata.csv", vintage, vintage))) %>%
             dplyr::filter(SUMLEV == "050") %>%
             dplyr::mutate(GEOID = paste0(STATE, COUNTY),
                           NAME = paste0(CTYNAME, ", ", STNAME))
@@ -397,9 +429,15 @@ get_estimates <- function(geography = c("us", "region", "division", "state", "co
 
         } else if (geography == "cbsa" || geography == "metropolitan statistical area/micropolitan statistical area") {
 
-          raw <- suppressMessages(readr::read_csv("https://www2.census.gov/programs-surveys/popest/datasets/2020-2022/metro/totals/cbsa-est2022.csv")) %>%
-            dplyr::filter(LSAD %in% c("Micropolitan Statistical Area", "Metropolitan Statistical Area")) %>%
-            dplyr::mutate(GEOID = CBSA)
+          if (vintage != 2022) {
+            raw <- suppressMessages(readr::read_csv(sprintf("https://www2.census.gov/programs-surveys/popest/datasets/2020-%s/metro/totals/cbsa-est%s-alldata.csv", vintage, vintage))) %>%
+              dplyr::filter(LSAD %in% c("Micropolitan Statistical Area", "Metropolitan Statistical Area")) %>%
+              dplyr::mutate(GEOID = CBSA)
+          } else {
+            raw <- suppressMessages(readr::read_csv("https://www2.census.gov/programs-surveys/popest/datasets/2020-2022/metro/totals/cbsa-est2022.csv")) %>%
+              dplyr::filter(LSAD %in% c("Micropolitan Statistical Area", "Metropolitan Statistical Area")) %>%
+              dplyr::mutate(GEOID = CBSA)
+          }
 
           raw <- raw[,!(names(raw) %in% c("MDIV", "STCOU", "LSAD", "CBSA"))]
 
@@ -411,9 +449,16 @@ get_estimates <- function(geography = c("us", "region", "division", "state", "co
             dplyr::mutate(variable = stringr::str_remove(variable, "_"))
 
         } else if (geography == "combined statistical area") {
-          raw <- suppressMessages(readr::read_csv("https://www2.census.gov/programs-surveys/popest/datasets/2020-2022/metro/totals/csa-est2022.csv")) %>%
-            dplyr::filter(LSAD == "Combined Statistical Area") %>%
-            dplyr::mutate(GEOID = CSA)
+
+          if (vintage != 2022) {
+            raw <- suppressMessages(readr::read_csv(sprintf("https://www2.census.gov/programs-surveys/popest/datasets/2020-%s/metro/totals/csa-est%s-alldata.csv", vintage, vintage))) %>%
+              dplyr::filter(LSAD == "Combined Statistical Area") %>%
+              dplyr::mutate(GEOID = CSA)
+          } else {
+            raw <- suppressMessages(readr::read_csv("https://www2.census.gov/programs-surveys/popest/datasets/2020-2022/metro/totals/csa-est2022.csv")) %>%
+              dplyr::filter(LSAD == "Combined Statistical Area") %>%
+              dplyr::mutate(GEOID = CSA)
+          }
 
           raw <- raw[,!(names(raw) %in% c("MDIV", "STCOU", "LSAD", "CBSA", "CSA"))]
 
@@ -425,6 +470,11 @@ get_estimates <- function(geography = c("us", "region", "division", "state", "co
             dplyr::mutate(variable = stringr::str_remove(variable, "_"))
 
         } else if (geography == "place") {
+
+          if (vintage > 2022) {
+            rlang::abort("The most recent PEP release for this geography is 2022.")
+          }
+
           raw <- suppressMessages(readr::read_csv("https://www2.census.gov/programs-surveys/popest/datasets/2020-2022/cities/totals/sub-est2022.csv")) %>%
             dplyr::filter(SUMLEV == "162") %>%
             dplyr::mutate(GEOID = paste0(STATE, PLACE),
@@ -554,7 +604,7 @@ get_estimates <- function(geography = c("us", "region", "division", "state", "co
                                    variables = variables,
                                    breakdown = breakdown,
                                    breakdown_labels = breakdown_labels,
-                                   year = year,
+                                   year = vintage,
                                    state = .x,
                                    county = county,
                                    time_series = time_series,
@@ -829,27 +879,14 @@ get_estimates <- function(geography = c("us", "region", "division", "state", "co
 
     } else {
 
-      geom <- try(suppressMessages(use_tigris(geography = geography, year = year,
-                                              state = state, county = county, ...)))
-
-      # if (year > 2021) {
-      #   geom <- try(suppressMessages(use_tigris(geography = geography, year = 2021,
-      #                                           state = state, county = county, ...)))
-      #
-      #   if (geography == "county" && "09" %in% stringr::str_sub(geom$GEOID, 1, 2)) {
-      #     ct_2022 <- clean_connecticut()
-      #
-      #     geom <- geom %>%
-      #       dplyr::filter(!stringr::str_sub(GEOID, 1, 2) == "09") %>%
-      #       dplyr::bind_rows(ct_2022)
-      #   }
-      #
-      # } else {
-      #   geom <- try(suppressMessages(use_tigris(geography = geography, year = year,
-      #                                           state = state, county = county, ...)))
-      # }
-
-
+      # Handle here until 2023 CB files are released
+      if (year == 2023) {
+        geom <- try(suppressMessages(use_tigris(geography = geography, year = 2022,
+                                                state = state, county = county, ...)))
+      } else {
+        geom <- try(suppressMessages(use_tigris(geography = geography, year = year,
+                                                state = state, county = county, ...)))
+      }
 
       if ("try-error" %in% class(geom)) {
         stop("Your geometry data download failed. Please try again later or check the status of the Census Bureau website at https://www2.census.gov/geo/tiger/", call. = FALSE)

diff --git a/R/zzz.r b/R/zzz.r
@@ -33,6 +33,7 @@ population_estimates_variables <- c("POP", "DENSITY")
 components_estimates_variables <- c("BIRTHS", "DEATHS","DOMESTICMIG","INTERNATIONALMIG","NATURALINC","NETMIG","RBIRTH","RDEATH","RDOMESTICMIG","RINTERNATIONALMIG","RNATURALINC","RNETMIG")
 housing_estimates_variables <- "HUEST"
 population_estimates_variables22 <- c("POPESTIMATE", "NPOPCHG")
+components_estimates_variables21 <- c("BIRTHS", "DEATHS", "NATURALINC", "DOMESTICMIG","INTERNATIONALMIG","NETMIG","RBIRTH","RDEATH","RDOMESTICMIG","RINTERNATIONALMIG","RNATURALINC","RNETMIG","RESIDUAL")
 components_estimates_variables22 <- c("BIRTHS", "DEATHS", "NATURALCHG", "DOMESTICMIG","INTERNATIONALMIG","NETMIG","RBIRTH","RDEATH","RDOMESTICMIG","RINTERNATIONALMIG","RNATURALCHG","RNETMIG","RESIDUAL")
 
 

diff --git a/docs/404.html b/docs/404.html
diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html
diff --git a/docs/articles/basic-usage.html b/docs/articles/basic-usage.html
diff --git a/docs/articles/index.html b/docs/articles/index.html
diff --git a/docs/articles/margins-of-error.html b/docs/articles/margins-of-error.html