fix(sample_index): Remove outfile, .data[[""]]

This major overhaul to sample_index deprecates the outfile argument and aligns the code with modern practices, such as the removal of .data[["columnName"]] in the dplyr code in favor of using a utils-global file. Additional testing for error messages and other features of sample_index were added to the tests folder.
ss3sim · Oct 2, 2023 · 6f5a1be · 6f5a1be
1 parent da581de
commit 6f5a1be
Show file tree

Hide file tree

Showing 7 changed files with 321 additions and 249 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: ss3sim
 Title: Fisheries Stock Assessment Simulation Testing with Stock Synthesis
-Version: 1.20.0
+Version: 1.20.1
 Authors@R: c(
     person(c("Kelli", "F."), "Johnson", , "kelli.johnson@noaa.gov", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-5149-451X")),

diff --git a/R/sample_index.r b/R/sample_index.r
@@ -1,10 +1,27 @@
-#' Sample the biomass with observation error
+#' Sample the CPUE/index data with observation error
 #'
-#' Create indices of abundance sampled from the expected available biomass for
-#' specified fleets in specified years.
+#' Create new catch-per-unit-effort (CPUE)/indices of abundance that are
+#' based on the numbers in a data file. Typically the data file will be filled
+#' with expected values rather than observed data but it does not have to be.
+#' Sampling can only occur on fleets, years, and seasons that have current
+#' observations. If rows of information are not sampled from, then they are
+#' removed. So, you can take away rows of data but you cannot add them with
+#' this function.
 #'
 #' @details
-#' Samples are generated using the following equation:
+#' Limitations to the functionality of this function are as follows:
+#' * you can only generate observations from rows of data that are present,
+#'   e.g., you cannot make a new observation for a year that is not present in
+#'   the passed data file;
+#' * no warning will be given if some of the desired year, seas, fleet
+#'   combinations are available but not all, instead just the combinations
+#'   that are available will be returned in the data list object; and
+#' * sampling uses a log-normal distribution when the log-normal distribution
+#'   is specified in `CPUEinfo[["Errtype"]]` and a normal distribution for all
+#'   other error types, see below for details on the log-normal sampling.
+#'
+#' Samples are generated using the following equation when the log-normal
+#' distribution is specified:
 #' \deqn{
 #'   B_y*exp(stats::rnorm(1, 0, sds_obs)-sds_obs^2/2)
 #' },
@@ -24,167 +41,193 @@
 #' or normal distribution only when the variance is low, i.e., \eqn{CV < 0.50}
 #' or log standard deviation of 0.22.
 #'
-#' @template lcomp-agecomp-index
-#' @template dat_list
-#' @template outfile
-#' @param sds_obs,sds_out A list the same length as `fleets` specifying the
-#'   standard deviation of the observation error used for the sampling and the
-#'   value used in the returned output. Thus, `sds_obs` is what is actually
-#'   used to sample and `sds_out` can be used to test what happens when the
-#'   input values to your model are biased. If `sds_out` is missing, then
-#'   `sds_obs` will be used for the output as well as the input. List elements
-#'   should be either single numeric values or numeric vectors the same length
-#'   as the number of years sampled for each given fleet. Single values are
-#'   repeated for all years. See details for more information, particularly for
-#'   the equations.
-#' @template seas
+#' @param dat_list  A Stock Synthesis data list returned from
+#'   [r4ss::SS_readdat()]. Typically, this will be read from a file that
+#'   contains expected values rather than input values but any Stock Synthesis
+#'   data file is fine.
+#' @param fleets An integer vector specifying which fleets to sample from. The
+#'   order of the fleets matters here because you must retain the ordering for
+#'   all of the remaining input arguments. For example, both `fleets = c(1, 2)`
+#'   and `fleets = c(2, 1)` will work but `years` will expect the years you
+#'   want sampled for fleet 2 to be in the second position in the list in the
+#'   former and the first position in the latter case. So, if you change the
+#'   order of your input, you will also have to modify all of the remaining
+#'   arguments. An entry of `fleets = NULL` will lead to no CPUE samples in the
+#'   returned object.
+#' @param years A list the same length as `fleets` specifying the years you
+#'   want samples from. There must be an integer vector in the list for every
+#'   fleet specified in `fleets`. The function assumes that the information for
+#'   the first fleet specified in `fleets` will be the first object in the list
+#'   and so on so order matters here.
+#' @param outfile A deprecated argument.
+#' @param sds_obs,sds_out,seas A list the same length as `fleets` specifying the
+#'   standard deviation of the observation error used for the sampling; the
+#'   standard deviation of the observation error you would like listed in the
+#'   returned output, which might not always equal what was actually used for
+#'   sampling; and the seasons you want to sample from. Each list element
+#'   should contain a single numeric value or a vector, where vectors need to
+#'   match the structure of `years` for the relevant fleet. If single values
+#'   are passed, then, internally, they will be repeated for each year. If you
+#'   want to repeat a single value for every year and fleet combination, then
+#'   just pass it as a list with one entry, e.g., `seas = list(1)` will sample
+#'   from season one for all fleets and years --- this is the default for
+#'   season. The default for `sds_obs` is 0.01 and if `sds_out` is missing,
+#'   then `sds_obs` will be used for the output as well as the input.
 #'
-#' @template sampling-return
+#' @return
+#' A Stock Synthesis data file list object is returned. The object will be a
+#' modified version of `dat_list`.
+#' @family sampling functions
 #'
 #' @export
 #' @author Cole Monnahan, Kotaro Ono
 #' @examples
-#' # Find the example data location:
-#' set.seed(3)
+#' # Add a list from [r4ss::SS_readdat()] to your workspace, this is example
+#' # data that is saved in the ss3sim package.
+#' # Index data are saved in `dat_list[["CPUE"]]`
 #' dat_list <- r4ss::SS_readdat(
 #'   file = file.path(
 #'     system.file("extdata", "example-om", package = "ss3sim"),
 #'     "ss3_expected_values.dat"
 #'   ),
 #'   verbose = FALSE
 #' )
-#' # Look at expected values for the index data
-#' # fleet 2, every other year from 76 to 100
-#' # dat_list[["CPUE"]]
-#' sam_yrs <- seq(76, 100, by = 2)
-#' ex1 <- sample_index(dat_list,
+#'
+#' \dontshow{set.seed(3)}
+#' # Sample from each available year from fleet 2 with an increasing trend in
+#' # the observation error, i.e., the most recent year has the highest
+#' # likelihood to be the furthest from the input data
+#' ex1 <- sample_index(
+#'   dat_list,
 #'   outfile = NULL,
 #'   fleets = 2,
-#'   seas = list(unique(
+#'   seas = list(
 #'     dat_list[["CPUE"]][dat_list[["CPUE"]][, "index"] == 2, "seas"]
-#'   )),
-#'   years = list(sam_yrs),
-#'   sds_obs = list(seq(0.001, 0.1, length.out = length(sam_yrs)))
+#'   ),
+#'   years = list(dat_list[["CPUE"]][["year"]]),
+#'   sds_obs = list(
+#'     seq(0.001, 0.1, length.out = length(dat_list[["CPUE"]][["year"]]))
+#'   )
 #' )
 #' \dontshow{
+#' # Test to see if example 1 gives the values expected for 76 and 78
 #' testthat::expect_equivalent(
 #'   ex1[["CPUE"]][1:2, "obs"],
 #'   c(1472202421, 1554321845)
 #' )
 #' }
+#'
 #' \dontrun{
-#' ex1[["CPUE"]]
-#' # could sample from less years, but not more:
+#' # Sample from less years, note that sampling from more years than what is
+#' # present in the data will not work
 #' ex2 <- sample_index(dat_list,
 #'   outfile = NULL,
 #'   fleets = 2,
 #'   seas = list(unique(
 #'     dat_list[["CPUE"]][dat_list[["CPUE"]][, "index"] == 2, "seas"]
 #'   )),
-#'   years = list(sam_yrs[c(-1, -2)]),
-#'   sds_obs = list(seq(0.001, 0.1, length.out = length(sam_yrs) - 2))
+#'   years = list(dat_list[["CPUE"]][["year"]][-c(1:2)]),
+#'   sds_obs = list(0.001)
 #' )
-#' ex2[["CPUE"]]
-#' # sd can be fixed across years:
-#' ex3 <- sample_index(dat_list,
-#'   outfile = NULL,
+#'
+#' # sd in the returned file can be different than what is used to sample, this
+#' # is helpful when you want to test what would happen if the estimation method
+#' # was improperly specified
+#' ex3 <- sample_index(
+#'   dat_list = dat_list,
 #'   fleets = 2,
 #'   seas = list(unique(
 #'     dat_list[["CPUE"]][dat_list[["CPUE"]][, "index"] == 2, "seas"]
 #'   )),
-#'   years = list(sam_yrs),
-#'   sds_obs = list(0.01)
+#'   years = list(dat_list[["CPUE"]][["year"]]),
+#'   sds_obs = list(0.01),
+#'   sds_out = list(0.20)
 #' )
-#' ex3[["CPUE"]]
-#' # If fleet 1 also had expected values in the index that you wanted to sample:
-#' testthat::expect_error(
-#'   ex4 <- sample_index(dat_list,
-#'     outfile = NULL,
-#'     fleets = c(1, 2),
-#'     years = list(sam_yrs, sam_yrs),
-#'     sds_obs = list(0.01, 0.01)
-#'   )
+#' ex3[["CPUE"]][["se_log"]]
+#'}
+#' # Sample from two fleets after adding fake CPUE data for fleet 1
+#' dat_list2 <- dat_list
+#' dat_list2[["CPUE"]] <- rbind(
+#'   dat_list[["CPUE"]],
+#'   dat_list[["CPUE"]] |>
+#'     dplyr::mutate(index = 1, seas = 1)
 #' )
-#' # sd in the returned file can be different than what is used to sample:
-#' ex5 <- sample_index(dat_list,
-#'   outfile = NULL,
-#'   fleets = 2,
-#'   seas = list(unique(
-#'     dat_list[["CPUE"]][dat_list[["CPUE"]][, "index"] == 2, "seas"]
-#'   )),
-#'   years = list(sam_yrs),
+#' dat_list2[["N_cpue"]] <- NROW(dat_list2[["CPUE"]])
+#' ex4 <- sample_index(
+#'   dat_list = dat_list2,
+#'   fleets = 1:2,
+#'   seas = list(1, 7),
+#'   # Subset two years from each fleet
+#'   years = list(c(76, 78), c(80, 82)),
+#'   # Use the same sd values for both fleets
 #'   sds_obs = list(0.01),
 #'   sds_out = list(0.20)
 #' )
-#' ex5[["CPUE"]]
-#' testthat::expect_true(all(ex5[["CPUE"]][["se_log"]] == 0.2))
-#' }
-#' @family sampling functions
-
 sample_index <- function(dat_list,
-                         outfile = NULL,
+                         outfile = lifecycle::deprecated(),
                          fleets,
                          years,
-                         sds_obs,
+                         sds_obs = list(0.01),
                          sds_out,
                          seas = list(1)) {
   # Set up inputs
-  cpue <- dat_list[["CPUE"]]
-  colnames(cpue) <- gsub("obs", "obsOLD", colnames(cpue))
   Nfleets <- length(fleets)
   if (missing(sds_out)) {
     sds_out <- sds_obs
   }
 
   # Checks
-  # Check that sampling should occur, else exit early
-  if (Nfleets == 0) {
-    dat_list[["CPUE"]] <- dat_list[["CPUE"]][0, ]
-    dat_list[["N_cpue"]] <- 0
-    if (!is.null(outfile)) {
-      r4ss::SS_writedat(
-        datlist = dat_list,
-        outfile = outfile,
-        overwrite = TRUE,
-        verbose = FALSE
-      )
-    }
-    return(invisible(dat_list))
+  if (lifecycle::is_present(outfile)) {
+    lifecycle::deprecate_warn(
+      "1.20.1",
+      "sample_index(outfile)",
+      details = "Please save the returned output using `r4ss::SS_writedat()`"
+    )
   }
   # Check that dat_list was read in using {r4ss}
   if (!inherits(dat_list, "list") || is.null(dat_list[["CPUE"]])) {
     cli::cli_abort(c(
       "{.var dat_list} must be an object returned from r4ss::SS_readdat().",
       i = "{.var dat_list} is a {class(dat_list)}.",
-      i = "{.var dat_list$cpue} contains {NROW(cpue)} rows."
+      i = "{.var dat_list$CPUE} contains {NROW(dat_list$CPUE)} rows."
     ))
   }
+  # Check that sampling should occur, else exit early
+  if (Nfleets == 0) {
+    dat_list[["CPUE"]] <- dat_list[["CPUE"]][0, ]
+    dat_list[["N_cpue"]] <- 0
+    return(invisible(dat_list))
+  }
   # Check that wanted fleets are present in CPUE data
-  if (!all(fleets %in% cpue[["index"]])) {
-    missing_fleets <- fleets[!fleets %in% cpue[["index"]]]
+  if (!all(fleets %in% dat_list[["CPUE"]][["index"]])) {
+    missing_fleets <- fleets[!fleets %in% dat_list[["CPUE"]][["index"]]]
     cli::cli_abort(c(
       "All {.var fleets} must be found in {.var dat_list$cpue$index}",
-      i = "You tried to sample from fleets {fleets}.",
-      x = paste(
-        "{cli::qty(length(missing_fleets))} Fleet{?s} {missing_fleets}",
-        "{cli::qty(length(missing_fleets))} {?is/are} missing."
-      )
+      i = "{cli::qty(length(fleets))} You tried to sample from fleet{?s}
+           {fleets}.",
+      x = "{cli::qty(length(missing_fleets))} Fleet{?s} {missing_fleets}
+           {cli::qty(length(missing_fleets))} {?is/are} missing."
     ))
   }
 
-  # Start of sampling from the indices. Create a new data frame based on input
-  # arguments and use dplyr::mutate() to apply sample_lognormal to each row
-  # based on input sd and observed values
+  # Create a new data frame based on input arguments and use dplyr::mutate() to
+  # apply sample_lognormal to each row based on input sd and observed values
   xxx <- merge(
-    do.call(rbind, mapply(data.frame,
-      SIMPLIFY = FALSE,
-      year = years,
-      seas = standardize_sampling_args(fleets, years, other_input = seas),
-      index = lapply(fleets, c),
-      se_in = standardize_sampling_args(fleets, years, other_input = sds_obs),
-      se_log = standardize_sampling_args(fleets, years, other_input = sds_out)
-    )),
-    cpue[, c("year", "seas", "index", "obsOLD")],
+    x = do.call(
+      what = rbind,
+      args = mapply(
+        FUN = data.frame,
+        SIMPLIFY = FALSE,
+        year = years,
+        seas = standardize_sampling_args(fleets, years, other_input = seas),
+        index = lapply(fleets, c),
+        se_in = standardize_sampling_args(fleets, years, other_input = sds_obs),
+        se_log = standardize_sampling_args(fleets, years, other_input = sds_out)
+      )
+    ),
+    y = dat_list[["CPUE"]] |>
+      dplyr::rename(obsOLD = obs) |>
+      dplyr::select(-se_log),
     sort = FALSE
   )
   if (NROW(xxx) == 0) {
@@ -194,32 +237,23 @@ sample_index <- function(dat_list,
       i = "sample_index() can't sample non-existent years, fleets, seasons."
     ))
   }
-  cpue.new <- xxx %>%
-    dplyr::arrange(.data[["index"]], .data[["year"]], .data[["seas"]]) %>%
-    dplyr::rowwise() %>%
+  dat_list[["CPUE"]] <- xxx |>
+    dplyr::arrange(index, year, seas) |>
+    dplyr::rowwise() |>
     dplyr::mutate(
-      dist = dat_list[["CPUEinfo"]][["Errtype"]][
-        match(.data[["index"]], dat_list[["CPUEinfo"]][["Fleet"]])
+      distribution = dat_list[["CPUEinfo"]][["Errtype"]][
+        match(index, dat_list[["CPUEinfo"]][["Fleet"]])
       ],
       obs = ifelse(
-        test = .data[["dist"]] == 0,
-        yes = sample_lognormal(.data[["obsOLD"]], .data[["se_in"]]),
-        no = stats::rnorm(n = 1, mean = .data[["obsOLD"]], .data[["se_in"]])
+        test = distribution == 0,
+        yes = sample_lognormal(obsOLD, se_in),
+        no = stats::rnorm(n = 1, mean = obsOLD, sd = se_in)
       )
-    ) %>%
-    dplyr::select(.data[["year"]]:.data[["index"]], .data[["obs"]], .data[["se_log"]])
+    ) |>
+    dplyr::select(year:index, obs, se_log) |>
+    as.data.frame()
 
-  # Overwrite .dat file
-  dat_list$CPUE <- as.data.frame(cpue.new)
-  dat_list$N_cpue <- ifelse(Nfleets > 0, nrow(cpue.new), 0)
-  if (!is.null(outfile)) {
-    r4ss::SS_writedat(
-      datlist = dat_list,
-      outfile = outfile,
-      overwrite = TRUE,
-      verbose = FALSE
-    )
-  }
+  dat_list[["N_cpue"]] <- ifelse(Nfleets > 0, nrow(dat_list[["CPUE"]]), 0)
 
   return(invisible(dat_list))
 }