PecanProject · mdietze · May 21, 2024 · Mar 12, 2024 · Mar 12, 2024 · Mar 12, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -45,6 +45,8 @@ see if you need to change any of these:
 2) allow user-defined parallel mode for the qsub submission; 3) allow user-defined email option to report the progress.
 - The analysis function now supports the parallelization of multi-chain MCMC sampling with the fully randomized inits function.
 - Added the new feature of the block-based SDA workflow, which supports the parallel computation.
+- Added new SDA workflow for the 342 North America anchor sites.
+- Added new feature of preparing initial conditions for MODIS LAI, AGB, ISCN SOC, and soil moisture across NA anchor sites.
 
 ### Fixed
 

diff --git a/base/remote/R/qsub_parallel.R b/base/remote/R/qsub_parallel.R
@@ -148,9 +148,6 @@ qsub_parallel <- function(settings, files = NULL, prefix = "sipnet.out", sleep =
         #compare two progresses and set the maximum progress for the progress bar.
         pbi <- L_folder - length(folders)
         utils::setTxtProgressBar(pb, pbi)
-
-        pbi1 <- L_jobid - length(jobids)
-        utils::setTxtProgressBar(pb1, pbi1)
       }
     } else {
       #special case that only detect the job ids on the server.

diff --git a/book_source/03_topical_pages/04_R_workflow.Rmd b/book_source/03_topical_pages/04_R_workflow.Rmd
@@ -22,6 +22,8 @@ Within the PEcAn repository, code pertaining to input conversion is in the MODUL
 
 ## Initial Conditions {#workflow-input-initial}
 
+### CONUS (NEON/FIA/BADM) Initial Conditions.
+
 To convert initial condition data into the PEcAn Standard and then into the model formats we follow three main steps:
 
   1. Downloading vegetation info
@@ -52,6 +54,21 @@ This function will create ensemble member ncdf files by resampling the veg file
 put_veg_module()
 This function will convert the ensemble member ncdf files into model specific format. Currently the supported models are ED2 and SIPNET. 
 
+### North America (NA) Initial Conditions.
+
+To create initial condition files across North America, you will need to strictly follow the script located at `~/pecan/modules/assim.sequential/inst/anchor/IC_prep_anchorSites.Rmd`. Within the script we will be following those main steps:
+  1. Loading `settings.xml` file and specify paths.
+  2. Downloading/extracting estimations of four major carbon/water pools (leaf, wood, soil C, soil water) into by-site and by-ensemble tables.
+  3. Doing unit conversion. For each ensemble of each site, we will be preparing the `poolinfo` object consisting of converted pool estimations.
+  4. We will finally be writing the NC files through the `PEcAn.SIPNET::veg2model.SIPNET` function.
+  5. Within the loop, we will store the NC file paths into the settings object and rewrite the settings into the XML file to the destination.
+
+Within the script we proposed the following new datasets for handling the NA initial condition preparations:
+
+  1. The leaf carbon is initialized with MODIS LAI observations and the SLA for the corresponding PFT.
+  2. The above ground biomass (AGB) is initialized with the 2010 global AGB map (DOI: https://doi.org/10.3334/ORNLDAAC/1763).
+  3. The soil moisture (SM) is initialized with the SM estimations starting from 1978 (DOI: 10.24381/cds.d7782f18).
+  4. The soil organic carbon (SOC) is initialized with ISCN SOC estimations (data already prepared on PEcAn, use `PEcAn.data.land::iscn_soc` to load) based on the level 2 ecoregion map (pre-downloaded using the following link: https://www.epa.gov/eco-research/ecoregions).
 
 ## Meteorological Data {#workflow-met}
 

diff --git a/docker/depends/pecan_package_dependencies.csv b/docker/depends/pecan_package_dependencies.csv
@@ -77,6 +77,7 @@
 "future","*","modules/data.remote","Imports",FALSE
 "geonames","> 0.998","modules/data.atmosphere","Imports",FALSE
 "getPass","*","base/remote","Suggests",FALSE
+"getPass","*","modules/data.land","Suggests",FALSE
 "ggmcmc","*","modules/meta.analysis","Suggests",FALSE
 "ggplot2","*","base/utils","Suggests",FALSE
 "ggplot2","*","base/visualization","Imports",FALSE
@@ -93,6 +94,7 @@
 "glue","*","models/ed","Imports",FALSE
 "glue","*","modules/assim.sequential","Suggests",FALSE
 "glue","*","modules/data.atmosphere","Imports",FALSE
+"glue","*","modules/data.land","Suggests",FALSE
 "glue","*","modules/data.remote","Imports",FALSE
 "graphics","*","base/qaqc","Imports",FALSE
 "graphics","*","modules/allometry","Imports",FALSE
@@ -451,6 +453,7 @@
 "reshape2","*","modules/data.atmosphere","Imports",FALSE
 "reshape2",">= 1.4.2","modules/assim.sequential","Suggests",FALSE
 "reticulate","*","modules/data.atmosphere","Suggests",FALSE
+"reticulate","*","modules/data.land","Suggests",FALSE
 "reticulate","*","modules/data.remote","Imports",FALSE
 "rjags","*","base/utils","Suggests",FALSE
 "rjags","*","modules/assim.batch","Imports",FALSE

diff --git a/models/sipnet/R/write.configs.SIPNET.R b/models/sipnet/R/write.configs.SIPNET.R
@@ -551,8 +551,10 @@ write.config.SIPNET <- function(defaults, trait.values, settings, run.id, inputs
       }
       ## soilWFracInit fraction
       soilWFrac <- try(ncdf4::ncvar_get(IC.nc,"SoilMoistFrac"),silent = TRUE)
-      if (!is.na(soilWFrac) && is.numeric(soilWFrac)) {
-        param[which(param[, 1] == "soilWFracInit"), 2] <- sum(soilWFrac)
+      if (!"try-error" %in% class(soilWFrac)) {
+        if (!is.na(soilWFrac) && is.numeric(soilWFrac)) {
+          param[which(param[, 1] == "soilWFracInit"), 2] <- sum(soilWFrac)/100
+        }
       }
       ## litterWFracInit fraction
       litterWFrac <- soilWFrac

diff --git a/modules/assim.sequential/R/Analysis_sda_block.R b/modules/assim.sequential/R/Analysis_sda_block.R
@@ -203,8 +203,8 @@ build.block.xy <- function(settings, block.list.all, X, obs.mean, obs.cov, t) {
       block.list[[i]]$data$muf <- mu.f[f.start:f.end]
       block.list[[i]]$data$pf <- Pf[f.start:f.end, f.start:f.end]
       #find indexs for Y.
-      y.start <- sum(obs_per_site[1:i])
-      y.end <- y.start + obs_per_site[i] - 1
+      y.start <- sum(obs_per_site[1:i]) - obs_per_site[i] + 1
+      y.end <- sum(obs_per_site[1:i])
       #fill in y and r
       #if there is no observation for this site.
       if (y.end < y.start) {
@@ -256,8 +256,8 @@ build.block.xy <- function(settings, block.list.all, X, obs.mean, obs.cov, t) {
       for (j in seq_along(ids)) {
         f.start <- (ids[j] - 1) * length(var.names) + 1
         f.end <- ids[j] * length(var.names)
-        y.start <- sum(obs_per_site[1:ids[j]])
-        y.end <- y.start + obs_per_site[ids[j]] - 1
+        y.start <- sum(obs_per_site[1:ids[j]]) - obs_per_site[ids[j]] + 1
+        y.end <- sum(obs_per_site[1:ids[j]])
         f.ind <- c(f.ind, f.start:f.end)
         #if the current site has greater or equal than 1 observation.
         if (y.end >= y.start) {

diff --git a/modules/assim.sequential/R/SDA_OBS_Assembler.R b/modules/assim.sequential/R/SDA_OBS_Assembler.R
@@ -5,7 +5,7 @@
 #' @return list of obs.mean and obs.cov
 #' @export
 #' @author Dongchen Zhang
-#' @importFrom magrittr %>%
+#' @importFrom dplyr %>%
 #' @importFrom lubridate %m+%
 #'
 #' @examples

diff --git a/modules/assim.sequential/R/sda.enkf_MultiSite.R b/modules/assim.sequential/R/sda.enkf_MultiSite.R
@@ -507,7 +507,7 @@ sda.enkf.multisite <- function(settings,
       ###-------------------------------------------------------------------###---- 
       #To trigger the analysis function with free run, you need to first specify the control$forceRun as TRUE,
       #Then specify the settings$state.data.assimilation$scalef as 0, and settings$state.data.assimilation$free.run as TRUE.
-      if (!is.null(obs.mean[[t]][[1]]) | as.logical(settings$state.data.assimilation$free.run) & control$forceRun) {
+      if (!is.null(obs.mean[[t]][[1]]) | (as.logical(settings$state.data.assimilation$free.run) & control$forceRun)) {
         # TODO: as currently configured, Analysis runs even if all obs are NA, 
         #  which clearly should be triggering the `else` of this if, but the
         #  `else` has not been invoked in a while an may need updating
@@ -756,6 +756,7 @@ sda.enkf.multisite <- function(settings,
       system2(sendmail, c("-f", paste0("\"", control$send_email$from, "\""), paste0("\"", control$send_email$to, "\""), "<", mailfile))
       unlink(mailfile)
     }
+      gc()
     # useful for debugging to keep .nc files for assimilated years. T = 2, because this loops removes the files that were run when starting the next loop
 #    if (keepNC && t == 1){
 #      unlink(list.files(outdir, "*.nc", recursive = TRUE, full.names = TRUE))

diff --git a/modules/assim.sequential/R/sda_plotting.R b/modules/assim.sequential/R/sda_plotting.R
@@ -830,7 +830,7 @@ post.analysis.multisite.ggplot <- function(settings, t, obs.times, obs.mean, obs
              Name=.data$site.names)
 
     suppressMessages({
-      aoi_boundary_HARV <- sf::st_read(system.file("extdata", "eco-regionl2.json", package = "PEcAnAssimSequential"))
+      aoi_boundary_HARV <- sf::st_read(system.file("extdata", "eco-regionl2.json", package = "PEcAn.data.land"))
     })
 
     #transform site locs into new projection - UTM 2163
@@ -918,20 +918,29 @@ post.analysis.multisite.ggplot <- function(settings, t, obs.times, obs.mean, obs
 ##' @param CI range of confidence interval.
 ##' @param unit list of unit used for y axis label.
 ##' @param style color option.
+##' @param PDF_w width of exported PDF file.
-##' @param PDF_w width of exported PDF file.
+##' @param PDF_w width of exported PDF file, passed on to `base::pdf()`.
-##' @param PDF_w width of exported PDF file.
+##' @param PDF_w width of exported PDF file, passed on to `base::pdf()`.
+##' @param PDF_h height of exported PDF file.
+##' @param t.inds index of period that will be plotted.
 ##' @export
 ##' @author Dongchen Zhang
 SDA_timeseries_plot <- function(ANALYSIS, FORECAST, obs.mean = NULL, obs.cov = NULL, outdir, pft.path = NULL, by = "site", types = c("FORECAST", "ANALYSIS", "OBS"), CI = c(0.025, 0.975), 
                                 unit = list(AbvGrndWood = "Mg/ha", LAI = "m2/m2", SoilMoistFrac = "", TotSoilCarb = "kg/m2"),
                                 style = list(general_color = c("FORECAST" = "blue", "ANALYSIS" = "red", "OBS" = "black"),
                                              fill_color = c("FORECAST" = "yellow", "ANALYSIS" = "green", "OBS" = "grey"),
-                                             title_color = "red")){
+                                             title_color = "red"), 
+                                PDF_w = 20,
+                                PDF_h = 16,
+                                t.inds = NULL){
   #Check package availability.
   if("try-error" %in% class(try(find.package("ggpubr"), silent = T))){
     PEcAn.logger::logger.info("Package ggpubr is not installed! Please install it and rerun the function!")
     return(0)
   }
   #TODO: make page, font, line, point sizes adjustable.
   time_points <- names(FORECAST)
+  if (!is.null(t.inds)) {
+    time_points <- time_points[t.inds]
+  }
   site_ids <- attributes(FORECAST[[1]])$Site
   var_names <- attributes(FORECAST[[1]])$dimnames[[2]]
   #new diag function: fixed the bug when length==1 then it will return 0x0 matrix
@@ -979,8 +988,6 @@ SDA_timeseries_plot <- function(ANALYSIS, FORECAST, obs.mean = NULL, obs.cov = N
   }
   #if we plot by each site.
   if(by == "site") {
-    PDF_w <- 10
-    PDF_h <- 8
     p <- list()
     for (site.id in sort(unique(site_ids))) {
       site_p <- list()
@@ -1001,8 +1008,6 @@ SDA_timeseries_plot <- function(ANALYSIS, FORECAST, obs.mean = NULL, obs.cov = N
     }
     #if we plot by each state variable
   } else if (by == "var") {
-    PDF_w <- 20
-    PDF_h <- 16
     p <- list()
     for (var.name in sort(unique(var_names))) {
       var_p <- list()
@@ -1028,8 +1033,6 @@ SDA_timeseries_plot <- function(ANALYSIS, FORECAST, obs.mean = NULL, obs.cov = N
       PEcAn.logger::logger.info("Please provide the pdf path!")
       return(0)
     } else {
-      PDF_w <- 20
-      PDF_h <- 16
       p <- list()
       for (PFT in sort(unique(pft$pft))) {
         site_id_pft <- pft$site[which(pft$pft == PFT)]

diff --git a/modules/assim.sequential/inst/MultiSite-Exs/SDA/Create_Multi_settings.R b/modules/assim.sequential/inst/MultiSite-Exs/SDA/Create_Multi_settings.R
@@ -9,14 +9,14 @@ start_date <- "2012/01/01"
 end_date <- "2021/12/31"
 
 #setup working space
-outdir <- "/projectnb/dietzelab/dongchen/All_NEON_SDA/NEON42/SDA/"
-SDA_run_dir <- "/projectnb/dietzelab/dongchen/All_NEON_SDA/NEON42/SDA/run/"
-SDA_out_dir <- "/projectnb/dietzelab/dongchen/All_NEON_SDA/NEON42/SDA/out/"
+outdir <- "/projectnb/dietzelab/dongchen/anchorSites/SDA/"
+SDA_run_dir <- "/projectnb/dietzelab/dongchen/anchorSites/SDA/run/"
+SDA_out_dir <- "/projectnb/dietzelab/dongchen/anchorSites/SDA/out/"
 
-ERA5_dir <- "/projectnb/dietzelab/dongchen/All_NEON_SDA/NEON42/ERA5_2012_2021/"
-XML_out_dir <- "/projectnb/dietzelab/dongchen/All_NEON_SDA/NEON42/SDA/pecan.xml"
+ERA5_dir <- "/projectnb/dietzelab/dongchen/anchorSites/ERA5_2012_2021/"
+XML_out_dir <- "/projectnb/dietzelab/dongchen/anchorSites/SDA/pecan.xml"
 
-pft_csv_dir <- "/projectnb/dietzelab/dongchen/All_NEON_SDA/NEON42/site_pft.csv"
+pft_csv_dir <- "/projectnb/dietzelab/dongchen/anchorSites/site_pft.csv"
 
 #Obs_prep part
 #AGB
@@ -44,14 +44,14 @@ SoilC_export_csv <- TRUE
 #Obs Date
 obs_start_date <- "2012-07-15"
 obs_end_date <- "2021-07-15"
-obs_outdir <- "/projectnb/dietzelab/dongchen/All_NEON_SDA/test_OBS"
+obs_outdir <- "/projectnb/dietzelab/dongchen/anchorSites/Obs"
 timestep <- list(unit="year", num=1)
 
 #specify model binary
 model_binary <- "/usr2/postdoc/istfer/SIPNET/trunk//sipnet_if"
 
 #specify host section
-host.flag <- "rabbitmq"
+host.flag <- "local"
 if (host.flag == "remote") {
   #if we submit jobs through tunnel remotely.
   host = structure(list(
@@ -78,11 +78,10 @@ if (host.flag == "remote") {
   host = structure(list(
     name = "localhost",
     rabbitmq = structure(list(
-      prefix = NULL,
       uri = "amqp://guest:guest@pecan-rabbitmq:15672/%2F",
       queue = "SIPNET_r136",
       cp2cmd = "oc rsync @RUNDIR@ $(oc get pod -l app.kubernetes.io/name=pecan-model-sipnet-136 -o name):@RUNDIR@",
-      cpfcmd = "/data/bin/oc rsync @OUTFOLDER@ $(/data/bin/oc get pod -l app=dongchen-sda -o name):@OUTDIR@"
+      cpfcmd = "/data/bin/oc rsync @OUTDIR@ $(/data/bin/oc get pod -l app=dongchen-sda -o name):@OUTDIR@"
     )),
     folder = SDA_out_dir,
     outdir = SDA_out_dir,
@@ -109,11 +108,11 @@ template <- PEcAn.settings::Settings(list(
     FullYearNC = TRUE,
     NC.Overwrite = FALSE,
     NC.Prefix = "sipnet.out",
-    q.type = "SINGLE",
+    q.type = "vector",
     by.site = FALSE,
     Localization.FUN = "Local.support",
     scalef = 1,
-    chains = 5,
+    chains = 1,
     data = structure(list(format_id = 1000000040, input.id = 1000013298)),
     state.variables = structure(list(
       #you could add more state variables here
@@ -291,7 +290,7 @@ template <- PEcAn.settings::Settings(list(
       #                               )),
       # soilinitcond = structure(list(path = "/projectnb/dietzelab/ahelgeso/EFI_Forecast_Challenge/"
       #                               )),
-      pft.site = structure(list(path = "/projectnb/dietzelab/dongchen/All_NEON_SDA/NEON42/site_pft.csv"))
+      pft.site = structure(list(path = pft_csv_dir))
     ))
   ))
 ))
@@ -304,8 +303,8 @@ template <- PEcAn.settings::Settings(list(
 ############################################################################
 ############################################################################
 
-sitegroupId <- 1000000031
-nSite <- 39
+sitegroupId <- 1000000033
+nSite <- 330
 
 multiRunSettings <- PEcAn.settings::createSitegroupMultiSettings(
   template,
@@ -314,6 +313,9 @@ multiRunSettings <- PEcAn.settings::createSitegroupMultiSettings(
 if(file.exists(XML_out_dir)){
   unlink(XML_out_dir)
 }
+
+
+
 PEcAn.settings::write.settings(multiRunSettings, outputfile = "pecan.xml")
 
 #here we re-read the xml file to fix issues of some special character within the Host section.
@@ -346,6 +348,26 @@ for (i in 1:nSite) {
   settings[[i]]$run$site$name <- site_info$sitename[index_site_info]#temp_ID
 }
 
+#remove overlapped sites
+site.locs <- settings$run %>% 
+  purrr::map('site') %>% 
+  purrr::map_dfr(~c(.x[['lon']],.x[['lat']]) %>% as.numeric)%>% 
+  t %>%
+  `colnames<-`(c("lon","lat")) %>% data.frame
+del.ind <- c()
+for (i in 1:nrow(site.locs)) {
+  for (j in i:nrow(site.locs)) {
+    if (i == j) {
+      next
+    }
+    if (site.locs$lon[i] == site.locs$lon[j] &&
+        site.locs$lat[i] == site.locs$lat[j]) {
+      del.ind <- c(del.ind, j)
+    }
+  }
+}
+settings <- settings[-del.ind]
+
 #####
 unlink(paste0(settings$outdir,"/pecan.xml"))
 PEcAn.settings::write.settings(settings, outputfile = "pecan.xml")