
Data exploration

Setup

Load packages

library(raster)
library(tidyverse)
library(tidymodels)
library(sf)
library(here)
library(wesanderson)
library(rnaturalearth)
library(patchwork)
library(blockCV)
library(tictoc)
library(vip)
library(furrr)
library(spdep)
library(ncf)
library(corrr)
library(rstan)
library(tidybayes)
library(bayesplot)
library(BBmisc)
library(rstanarm)
library(glmmfields)
library(projpred)
library(data.table)

source("R/helper_functions.R")

Map helpers

pal <- wes_palette("Zissou1", 100, type = "continuous")

# for assigning cells to continents. Islands are missed at coarser resolutions
world_base <- rnaturalearth::ne_countries(scale = "large", returnclass = "sf") %>%
  select(continent, name_long) %>% 
  st_transform(crs = "+proj=cea +lon_0=0 +lat_ts=30 +x_0=0 +y_0=0 +datum=WGS84 +ellps=WGS84 +units=m +no_defs") 

# for mapping. this will be smaller so plotting is faster
world_base_map <- rnaturalearth::ne_countries(scale = "small", returnclass = "sf") %>%
  select(continent, name_long) %>% 
  st_transform(crs = "+proj=cea +lon_0=0 +lat_ts=30 +x_0=0 +y_0=0 +datum=WGS84 +ellps=WGS84 +units=m +no_defs") 
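
The same equal-area CRS string is assigned repeatedly below. Defining it once as a constant is a small convenience (not in the original code) that keeps the projection definitions in sync:

# Cylindrical Equal Area projection used for all rasters and polygons in this analysis
cea_crs <- "+proj=cea +lon_0=0 +lat_ts=30 +x_0=0 +y_0=0 +datum=WGS84 +ellps=WGS84 +units=m +no_defs"

# e.g. crs(rasters_full_medium) <- cea_crs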

Data wrangling

Read in the data. There are two main data sources: the genetic summary statistics and the environmental data.

sumstats <- read_csv(here("output", "spreadsheets", "cell_medium_3_10_sumstats.csv"))

rast_list_medium <- list.files(here("data", "climate_agg"),
                        pattern = "medium",
                        full.names = TRUE)

rasters_full_medium <- raster::stack(rast_list_medium)
crs(rasters_full_medium) <- "+proj=cea +lon_0=0 +lat_ts=30 +x_0=0 +y_0=0 +datum=WGS84 +ellps=WGS84 +units=m +no_defs"

glimpse(sumstats)
rasters_full_medium

Write raster maps to file for visual inspection.

pdf(file = here("output", "exploratory_plots", "predictor_rasters.pdf"))
for (i in 1:nlayers(rasters_full_medium)) {
  plot(rasters_full_medium[[i]], main = names(rasters_full_medium[[i]]))
  plot(st_geometry(world_base_map), add=TRUE)
}
dev.off()

Extract raster values for each cell that has genetic summary data and join the data frames for a full data set.

explanatory_df <- rasters_full_medium[sumstats$cell] %>% 
  as_tibble() %>% 
  mutate(cell = sumstats$cell) 
  

full_df <- left_join(sumstats, explanatory_df, by = "cell") %>% 
  # arrange for plotting later
  arrange(cell)

glimpse(full_df)
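
A quick sanity check (a sketch, not in the original code): the left join should keep exactly one row per cell in the genetic summaries.

# every cell in sumstats should appear exactly once after the join
stopifnot(nrow(full_df) == nrow(sumstats))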

Convert this to an sf polygon object for mapping and spatial cross-validation. I'm also extracting the continent for each polygon (largest overlap) to assess sampling imbalance.

template_medium <- raster(here("data", "templates", "template_medium.tif"))

template_medium[full_df$cell] <- full_df$avg_pi

full_sf <- rasterToPolygons(template_medium) %>% 
  st_as_sf() %>% 
  bind_cols(full_df) %>% 
  select(-template_medium) %>%
  # Adding continent column to the data frame.
  st_join(world_base["continent"], largest = TRUE)


glimpse(full_sf)

Visualizing temperature to make sure the conversion was successful.

ggplot() +
  geom_sf(data = full_sf, aes(fill = current_medium_bio_1, color = current_medium_bio_1)) +
  scale_fill_gradientn(colors = pal) +
  scale_color_gradientn(colors = pal) + 
  theme_minimal()

Continent mapping

How many cells successfully mapped to a continent, and what is the sample size? Looks like 2 cells did not map correctly.

full_sf %>% 
  count(continent)

Visualize which cells mapped correctly. All of the "Seven seas (open ocean)" values are islands around Africa. Although these may politically be assigned to a different continent (e.g. some islands around Madagascar are European), spatially they're near Africa, so I'm classifying them as African.

ggplot() + 
  geom_sf(data = world_base_map) +
  geom_sf(data = full_sf, aes(fill = continent, color = continent))

Reassigning those cells to Africa and taking another look. Looks fine, except for the NA.

full_sf <- full_sf %>% 
  mutate(continent = ifelse(str_detect(continent, "Seven"), "Africa", continent))

ggplot() + 
  geom_sf(data = world_base_map) +
  geom_sf(data = full_sf, aes(fill = continent, color = continent))

Let’s see where the NA is. Looks like South America!

full_sf %>% 
  mutate(na_cont = if_else(is.na(continent), "Missing", "Present")) %>% 
  ggplot() + 
  geom_sf(aes(fill = na_cont, color = na_cont))

Replacing the NA value with “South America”. North America has the most representation, while South America has the least.

full_complete <- full_sf %>% 
  mutate(continent = if_else(is.na(continent), "South America", continent))

count(full_complete, continent) %>% mutate(perc = n / sum(n)) %>% 
  as_tibble() %>% 
  select(continent, n, perc) %>%
  arrange(desc(perc)) %>% 
  knitr::kable()

Predictor variables

Predictors have between 0 and 11 NAs. I need to see whether these occur in the same cells or not.

full_complete %>% 
  summarize_all(~sum(is.na(.))) %>% 
  pivot_longer(cols = !contains("geometry"), 
               names_to = "variable",
               values_to = "num_NA") %>% 
  filter(num_NA > 0)

It seems like the missing values overlap across cells somewhat, but we're still going to have to throw out 9 cells.

nrow_filt <- full_complete %>% 
  na.omit() %>% 
  nrow()

nrow(full_complete) - nrow_filt

Missing data

Let’s see what the distribution of cells with an NA is. Most are islands, and there are a few from the Arctic.

full_missing <- full_complete[rowSums(is.na(full_complete)) > 0,]

ggplot() +
  geom_sf(data = world_base_map) +
  geom_sf(data = full_missing, aes(fill = continent, color = continent))

What is the sample size per cell? These are information-rich cells, but too many variables are missing to justify the effort of recovering them. I'm going to remove them and call it good.

full_missing %>% select(continent, num_otu, num_ind, num_order)

Filter NAs. Also adding in latitude and longitude columns.

full_filter <- full_complete %>% 
  remove_missing() 

# get centroid coordinate for each cell
coords <- st_centroid(full_filter) %>% 
  st_coordinates()

full_filter <- full_filter %>% 
  mutate(longitude = coords[,1],
         latitude = coords[,2])

glimpse(full_filter)

Map the latitude to make sure I got the correct column.

ggplot() +
  geom_sf(data = full_filter, aes(fill = latitude, color = latitude)) +
  scale_fill_gradientn(colors = pal) + 
  scale_color_gradientn(colors = pal)

Data exploration

Read and filter

Read in genetic summary data for the least restrictive high resolution and medium resolution genetic filtering regimes. I can impose more restrictive filtering protocols from there.

# least restrictive 3 individual filtering regimes
#high_3_df <- read_csv(here("output", "spreadsheets", "cell_high_3_10_sumstats.csv"))
med_3_df <- read_csv(here("output", "spreadsheets", "cell_medium_3_10_sumstats.csv"))
#low_3_df <- read_csv(here("output", "spreadsheets", "cell_low_3_10_sumstats.csv"))

# least restrictive 5 individual filtering regime
#high_5_df <- read_csv(here("output", "spreadsheets", "cell_high_5_10_sumstats.csv"))

Read in predictor variables

# rast_list_high <- list.files(here("data", "climate_agg"),
#                         pattern = "high",
#                         full.names = TRUE)
# 
# rasters_full_high <- raster::stack(rast_list_high)
# crs(rasters_full_high) <- "+proj=cea +lon_0=0 +lat_ts=30 +x_0=0 +y_0=0 +datum=WGS84 +ellps=WGS84 +units=m +no_defs"


rast_list_medium <- list.files(here("data", "climate_agg"),
                        pattern = "medium",
                        full.names = TRUE)

rasters_full_medium <- raster::stack(rast_list_medium)
crs(rasters_full_medium) <- "+proj=cea +lon_0=0 +lat_ts=30 +x_0=0 +y_0=0 +datum=WGS84 +ellps=WGS84 +units=m +no_defs"

# rast_list_low <- list.files(here("data", "climate_agg"),
#                         pattern = "_low",
#                         full.names = TRUE)
# 
# rasters_full_low <- raster::stack(rast_list_low)
# crs(rasters_full_low) <- "+proj=cea +lon_0=0 +lat_ts=30 +x_0=0 +y_0=0 +datum=WGS84 +ellps=WGS84 +units=m +no_defs"

Join the predictors with the genetic summaries.

join_predictors <- function(gen_sumstats, pred_rasts, resolution) {
  exp_df <- pred_rasts[gen_sumstats$cell] %>%
    as_tibble() %>%
    mutate(cell = gen_sumstats$cell)
  
  # the three resolution branches were identical apart from the label,
  # so assign the label directly (.env distinguishes the function argument
  # from any column of the same name)
  full_df <- left_join(gen_sumstats, exp_df, by = "cell") %>%
    mutate(resolution = .env$resolution) %>%
    arrange(cell)
  
  return(full_df)
}

# high_3_full <- join_predictors(high_3_df, rasters_full_high, "high")
# high_5_full <- join_predictors(high_5_df, rasters_full_high, "high")
med_3_full <- join_predictors(med_3_df, rasters_full_medium, "medium")
# low_3_full <- join_predictors(low_3_df, rasters_full_low, "low")


# nrow(high_3_full)
# nrow(high_5_full)
nrow(med_3_full)
# nrow(low_3_full)

Filter each dataset for NAs.

filter_dfs <- function(df, resolution) {
  # resolution is currently unused; kept so all calls share the same interface
  df_filtered <- df %>% 
    remove_missing()
  return(df_filtered)
}

# high_3_filter <- filter_dfs(high_3_full, "high")
# high_5_filter <- filter_dfs(high_5_full, "high")
med_3_filter <- filter_dfs(med_3_full, "medium")
# low_3_filter <- filter_dfs(low_3_full, "low")


# nrow(high_3_filter)
# nrow(high_5_filter)
nrow(med_3_filter)
# nrow(low_3_filter)

PCA

Given that classes of variables are often highly correlated, I'm going to perform principal component analysis (PCA) on sets of predictor variables and create new composite variables to investigate.
Before running the PCA on each data set, I'm going to create different data subsets, imposing stricter limits on the minimum number of OTUs per filtering regime. The PCA is run separately for each variable set: current temperature, current precipitation, global habitat heterogeneity, terrain continuous average, terrain continuous standard deviation, and terrain categorical (percentages).

all_dfs <- tibble(
  # high and low resolution regimes are omitted here; only medium is used downstream
  resolution = rep("medium", 6),
  min_ind = rep(3L, 6),
  min_otu = c(10L, 25L, 50L, 100L, 150L, 200L),
  df = list(
    # high_3_filter,
    # high_3_filter %>% filter(num_otu >= 20),
    # high_3_filter %>% filter(num_otu >= 50),
    # high_3_filter %>% filter(num_otu >= 100),
    # high_3_filter %>% filter(num_otu >= 150),
    # high_3_filter %>% filter(num_otu >= 200),
    # high_5_filter,
    # high_5_filter %>% filter(num_otu >= 20),
    # high_5_filter %>% filter(num_otu >= 50),
    # high_5_filter %>% filter(num_otu >= 100),
    # high_5_filter %>% filter(num_otu >= 150),
    # high_5_filter %>% filter(num_otu >= 200),
    med_3_filter,
    med_3_filter %>% filter(num_otu >= 25),
    med_3_filter %>% filter(num_otu >= 50),
    med_3_filter %>% filter(num_otu >= 100),
    med_3_filter %>% filter(num_otu >= 150),
    med_3_filter %>% filter(num_otu >= 200)
    # low_3_filter,
    # low_3_filter %>% filter(num_otu >= 20),
    # low_3_filter %>% filter(num_otu >= 50),
    # low_3_filter %>% filter(num_otu >= 100),
    # low_3_filter %>% filter(num_otu >= 150),
    # low_3_filter %>% filter(num_otu >= 200)
  )
) %>% 
  mutate(num_cells = map_int(df, nrow))
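
A quick look at how many cells survive each OTU threshold:

all_dfs %>% 
  select(min_otu, num_cells)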

Function to conduct a PCA for each variable set and each data frame, adding the new PC variables back to the original data frames.

get_pc_scores <- function(df) {
  ### vectors of variables to perform pca on
  all_vars <- colnames(df)
  
  # the 15th column is assumed to be a bioclim column, whose name encodes the
  # resolution of the data set
  if (str_detect(all_vars[15], "high")) {
    # temperature bioclims
    temp_vars <- paste0("current_high_bio_", 1:11)
    # precipitation bioclims
    precip_vars <- paste0("current_high_bio_", 12:19)
  } else if (str_detect(all_vars[15], "medium")) {
    temp_vars <- paste0("current_medium_bio_", 1:11)
    # precipitation bioclims
    precip_vars <- paste0("current_medium_bio_", 12:19)
  } else if (str_detect(all_vars[15], "_low_")) {
    temp_vars <- paste0("current_low_bio_", 1:11)
    # precipitation bioclims
    precip_vars <- paste0("current_low_bio_", 12:19)
  }
  
  # global habitat heterogeneity
  ghh_vars <- all_vars[str_starts(all_vars, "ghh_")]
  
  # terrain continuous average
  terr_median_vars <- all_vars[str_ends(all_vars, "_median")]
  
  # terrain continuous standard deviation
  terr_sd_vars <- all_vars[str_ends(all_vars, "_sd")]
  
  # terrain categorical
  terr_cat_vars <- all_vars[str_detect(all_vars, "_geom")]
  
  # combine all into a single named list for looping
  predictor_list <- list(
    temp = temp_vars,
    precip = precip_vars,
    ghh = ghh_vars,
    terr_median = terr_median_vars,
    terr_sd = terr_sd_vars,
    terr_cat = terr_cat_vars
  )
  
  # function to perform the actual PCA
  perf_pca <- function(preds) {
    df_preds <- df %>% 
      select(all_of(preds))
    
    pca <- prcomp(df_preds, center = TRUE, scale. = TRUE)$x %>%
      as_tibble() %>%
      # retain the first two PCs
      select(paste0("PC", 1:2))
    
    return(pca)
  }
  
  # perform the PCA across variable sets (map() preserves the list names)
  pca_list <- map(predictor_list, perf_pca)
  
  # assign prefixes to the PC column names so each variable set has distinct column names
  for (name in names(pca_list)) {
    colnames(pca_list[[name]]) <- paste0(name, "_", colnames(pca_list[[name]]))
  }

  out_df <- bind_cols(df, pca_list)

  return(out_df)
}
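
Only the first two PC scores are retained above. To judge how much variance those two PCs capture, the prcomp() summary can be inspected directly; a sketch for the temperature bioclims of the least restrictive data frame (column names assumed from the medium resolution naming):

temp_vars <- paste0("current_medium_bio_", 1:11)

pca_temp_check <- all_dfs$df[[1]] %>% 
  select(all_of(temp_vars)) %>% 
  prcomp(center = TRUE, scale. = TRUE)

# proportion of variance explained by each PC
summary(pca_temp_check)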

all_dfs_pca <- all_dfs %>% 
  mutate(df = map(df, get_pc_scores))

Normalize all variables so they are centered (mean of 0) and scaled (sd = 1) for linear regression.

normalize_vars <- function(df_in) {
  norm_rec <- recipe(hill_1 ~ ., data = df_in) %>%
    step_normalize(all_numeric(), -cell, -num_otu, -num_ind, -num_order, -contains("_pi"), -contains("hill_"))
  
  df_trans <- norm_rec %>%
    prep(training = df_in) %>%
    juice(all_predictors()) %>%
    mutate(hill_1 = df_in$hill_1)
  
  return(df_trans)
}

all_dfs_norm <- all_dfs_pca %>% 
  mutate(df_norm = map(df, normalize_vars),
         filter_regime = paste(resolution, min_ind, min_otu, sep = "_")) %>% 
  select(-df)
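
As a sanity check (a sketch), every normalized predictor should now have mean ≈ 0 and sd = 1; checking the PCA columns of the first data frame:

all_dfs_norm$df_norm[[1]] %>% 
  select(contains("_PC")) %>% 
  summarize(across(everything(), list(mean = mean, sd = sd)))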

Map

I’m filtering out data sets that I already know I’m not going to use.

all_dfs_norm <- all_dfs_norm %>% 
  filter(resolution == "medium",
         min_otu >= 100)

Convert to sf

# function to convert a data frame to an sf object
to_sf <- function(df) {
  if (df$resolution[1] == "high") {
    template <- raster(here("data", "templates", "template_high.tif"))
  } else if (df$resolution[1] == "medium") {
    template <- raster(here("data", "templates", "template_medium.tif"))
  } else {
    template <- raster(here("data", "templates", "template_low.tif"))
  }
  
  template[df$cell] <- df$num_order
  
  df_sf <- rasterToPolygons(template) %>% 
    st_as_sf() %>% 
    bind_cols(df) %>% 
    st_join(world_base["continent"], largest = TRUE)
  
  return(df_sf)
}


# convert the data frames to sf for plotting
all_dfs_sf <- all_dfs_norm %>% 
  mutate(df_sf = map(df_norm, to_sf))

Correlations

Exploring correlations among variables to decide which to keep and which to throw out. I'm using the medium resolution data set with a minimum of 150 OTUs per cell (filter regime medium_3_150).

Here are the variables I value:

Climate: I'm including all current bioclims in the correlation matrix. I'm prioritizing the extremes (e.g. max temp of the warmest month), then averages (e.g. mean annual temp), then seasonality. Katie says seasonality shouldn't have a huge effect since insects tend to aestivate/hibernate when conditions aren't ideal. However, since insects are ectotherms, extremes likely represent limits to insect tolerances, and averages summarize the overall climate regime of the area.

Habitat: I have two datasets summarizing habitat variability: the dynamic habitat indices (DHI) and global habitat heterogeneity (GHH). For both, I'm prioritizing measures of spatial heterogeneity, followed by averages, followed by seasonality. The GHH measures only capture spatial heterogeneity.

Terrain: I'm limiting terrain to slope median and standard deviation and elevation median and standard deviation. The rest are derived statistics that I couldn't justify using. I'm prioritizing the standard deviations, since variation likely drives genetic diversity more than averages do.

Land cover: no land cover variables. They're highly spatially autocorrelated and only make sense to use in conjunction with each other.

Human: only human modification (gHM), since it's a specific measure of human environmental impact rather than just human density.

Stability: including both temperature and precipitation stability. I doubt they're correlated, but if they are, I'll keep temperature, since precipitation is more difficult to model in past climates. NOTE: I ended up using the climate stability measures of Theodoridis et al. 2020 (see "New climate stability" below).

df_corr <- all_dfs_norm %>% 
  filter(filter_regime == "medium_3_150") %>% 
  pull(df_norm) %>% 
  pluck(1) %>% 
  select(hill_1,
         num_ind,
         num_otu,
         num_order,
         contains("_pi"),
         -contains("land_cover"),
         contains("elevation"),
         contains("slope"),
         contains("current"),
         contains("ghh"),
         contains("dhi"),
         contains("gHM"),
         contains("stability")) %>% 
  corrr::correlate()
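
To scan for the strongest pairwise correlations before plotting, the correlation data frame can be flattened to long format with corrr::stretch() (a sketch):

df_corr %>% 
  corrr::stretch() %>% 
  filter(!is.na(r)) %>% 
  arrange(desc(abs(r))) %>% 
  head(10)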

I’m visualizing each set of variables separately first to make the correlations easier to interpret.

Climate

Keeping:

BIO2 (mean diurnal range): it's uncorrelated with all other variables.

BIO5, BIO6 (max temp of warmest month, min temp of coldest month): they're uncorrelated with each other and aren't strongly correlated with many other variables. BIO5 is strongly correlated with BIO1, so I'm throwing out BIO1 since extremes are higher priority.

BIO7 (temperature annual range): represents extremes across the year.

BIO13, BIO14 (precipitation of wettest month, precipitation of driest month).

BIO15 (precipitation seasonality): uncorrelated with BIO13 and BIO14. Also, precipitation seasonality varies regardless of temperate vs tropical regions, so it may be relevant on a global scale.

# climate correlation matrix
remove_prefix <- function(x, pref = "current_medium_") {
  s <- str_remove_all(x, pref)
  return(s)
}

corr_clim <- df_corr %>% 
  filter(str_detect(term, "current")) %>% 
  select(term, contains("current")) %>% 
  rename_at(vars(contains("current")), remove_prefix) %>% 
  # strip the prefix from the row labels as well (correlate() names this column "term")
  mutate(term = remove_prefix(term))

corrr::rplot(corr_clim)
Habitat

I'm selecting cumulative DHI, var DHI, variance GHH, and standard deviation GHH, as these are uncorrelated with each other. While minimum DHI could be considered an "extreme", it does not reflect physiological limits. Min DHI is correlated with both cumulative DHI and var DHI, which are not correlated with each other, so I'm selecting those two since they likely explain more independent information than min DHI. There were many options for GHH, but variance was correlated with the fewest variables, and standard deviation is uncorrelated with variance and is also a recognizable measure of variation.

corr_hab <- df_corr %>% 
  filter(str_detect(term, "ghh|dhi"),
         !str_detect(term, "PC")) %>% 
  select(term, contains("ghh"), contains("dhi"), -contains("PC")) %>% 
  rename_at(vars(contains("ghh")), ~remove_prefix(.x, "_medium")) %>%
  rename_at(vars(contains("dhi")), ~remove_prefix(.x, "_medium")) %>%
  mutate(term = str_remove_all(term, "_medium"))


corrr::rplot(corr_hab)
Terrain

Retaining elevation median and standard deviation. Both correlate with slope median and sd. Elevation likely matters more than slope.

corr_terr <- df_corr %>% 
  filter(str_detect(term, "elevation|slope"),
         !str_detect(term, "geom")) %>% 
  select(term, contains("elevation"), contains("slope"), -contains("geom")) 

corrr::rplot(corr_terr)
Refined correlation matrix

Now I'm going to look at a correlation matrix of this reduced set of variables. The priority for selection is climate > habitat > human > terrain. Climate variables likely translate to larger scales the best, and terrain variables are likely the most indirect predictors of genetic diversity. Habitat is probably the most directly relevant, but the noise in the habitat variables is likely very high at the coarse scales we're looking at, since they're mostly measured at a fine scale.

Bioclims 6 and 7 are both correlated with bio 13, and bioclim 6 is correlated with DHI var, so I’m removing them.

climate_vars <- c("bio_2", "bio_5", "bio_6", "bio_7", "bio_13", "bio_14", "bio_15")
habitat_vars <- c("_cum", "_var", "_variance", "std_dev")
terrain_vars <- c("elevation_median", "elevation_sd")
human_vars <- c("gHM")

all_vars <- c(climate_vars, 
              habitat_vars, 
              terrain_vars, 
              human_vars) %>% 
  paste(collapse = "|")

all_vars_full <- df_corr$term[str_detect(df_corr$term, all_vars)]

corr_reduced <- df_corr %>% 
  select(term, any_of(all_vars_full), -contains("coef_of_var"), contains("gHM")) %>% 
  filter(str_detect(term, all_vars), !str_detect(term, "coef_of_var"))

Get final correlation matrix. Everything looks good!

corr_final <- corr_reduced %>% 
  filter(!str_detect(term, "bio_6|bio_7")) %>% 
  select(-contains("bio_6"), -contains("bio_7"))

corr_final_vec <- corr_final$term

corrr::rplot(corr_final)
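
The retained variable names are read back in later from keep_nocorr_vars.csv, so they need to be written out; a sketch of that write (the original write step isn't shown in this document):

tibble(variable = corr_final_vec) %>% 
  write_csv(here("output", "spreadsheets", "keep_nocorr_vars.csv"))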
New climate stability

Read data in and wrangle into an appropriate form.

template_medium_rast <- raster(here("data", "templates", "template_medium.tif"))

new_stab_files <- list.files(here("data", "climate_agg"), 
                             pattern = "GlobalExtreme", 
                             full.names = TRUE)

new_stab_rasters <- stack(new_stab_files)

new_stab_sf <- new_stab_rasters %>%  
  projectRaster(template_medium_rast) %>% 
  rasterToPolygons() %>% 
  st_as_sf()

# the medium resolution, minimum 150 OTU sf data frame from all_dfs_sf above
df_150 <- all_dfs_sf %>% 
  filter(min_otu == 150) %>% 
  pull(df_sf) %>% 
  pluck(1)

# join each cell to the overlapping stability polygon with the largest overlap
new_stab_df <- st_join(df_150, 
                       new_stab_sf,
                       largest = TRUE)

new_stab_spatial <- new_stab_df %>% 
  bind_cols(new_stab_df %>% 
              st_centroid() %>% 
              st_coordinates() %>% 
              as_tibble()) %>% 
  as_tibble() %>% 
  # BBmisc::normalize() standardizes (centers and scales) by default
  mutate(temp_trend = normalize(GlobalExtreme_tsTrendExt),
         temp_var = normalize(GlobalExtreme_tsVarExt),
         precip_trend = normalize(GlobalExtreme_prTrendExt),
         precip_var = normalize(GlobalExtreme_prVarExt)) %>% 
  select(-geometry) %>% 
  # scale the meter-based centroid coordinates (1 unit = 1000 km)
  mutate(lon_scaled = X * 0.000001,
         lat_scaled = Y * 0.000001) %>% 
  select(template_medium, 
         cell, 
         hill_1,
         sqrt_pi,
         any_of(corr_final_vec), 
         temp_trend,
         temp_var,
         precip_trend,
         precip_var,
         min_temp,
         X,
         Y,
         lon_scaled,
         lat_scaled)

Write the new data frame and projected new stability layer to file.

st_write(new_stab_sf, here("data", "climate_poly", "new_stability.geojson"))
write_csv(new_stab_spatial, here("output", "spreadsheets", "model_data.csv"))

Extreme cell resampling

Read in data

pw_pi <- read_csv(here("output", "spreadsheets", "med_3_150_pi.csv"))

analysis_data <- read_csv(here("output", "spreadsheets", "model_data.csv"))

pw_pi_filt <- pw_pi %>% 
  filter(cell %in% analysis_data$cell)

pi_ordered <- pw_pi_filt %>% 
  group_by(order) %>% 
  arrange(desc(pi)) %>% 
  mutate(index = row_number()) %>% 
  ungroup()

Resampling functions

hill_resample <- function(pi_in) {
  rs_pi <- rerun(1000, sample(pi_in, 150) %>% hill_calc()) %>% 
    unlist()
  return(rs_pi)
}
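
hill_calc() comes from R/helper_functions.R. For orientation, a minimal sketch of a first-order Hill number, assuming the helper computes the exponential of the Shannon entropy of the per-OTU pi values (the actual helper may differ):

hill_calc_sketch <- function(pi_vals) {
  p <- pi_vals / sum(pi_vals)  # convert pi values to proportions
  p <- p[p > 0]                # guard against log(0)
  exp(-sum(p * log(p)))        # exp of Shannon entropy = first-order Hill number
}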

gdm_resample <- function(pi_in) {
  rs_pi <- rerun(1000, sample(pi_in, 150) %>% mean() %>% sqrt()) %>% 
    unlist()
  return(rs_pi)
}
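
purrr::rerun() is superseded in newer purrr releases; the same resampling can be written with map_dbl() (a sketch with the same assumed behavior):

hill_resample_map <- function(pi_in, reps = 1000, size = 150) {
  map_dbl(seq_len(reps), ~ hill_calc(sample(pi_in, size)))
}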

Resampling

dense_cells <- pi_ordered %>% 
  count(cell) %>% 
  slice_max(order_by = n, n = 10) %>% 
  pull(cell)

set.seed(3988)
gde_res_df <- pi_ordered %>% 
  filter(cell %in% dense_cells) %>% 
  group_by(cell) %>% 
  summarize(gde_res = hill_resample(pi_in = pi))

set.seed(877779)
gdm_res_df <- pi_ordered %>% 
  filter(cell %in% dense_cells) %>% 
  group_by(cell) %>% 
  summarize(gdm_res = gdm_resample(pi_in = pi))

GDE

gr_gde <- gde_res_df %>% 
  mutate(cell = as.factor(cell)) %>% 
  ggplot(aes(x = gde_res, y = cell)) +
  ggridges::geom_density_ridges(rel_min_height = 0.01)

# Extract the data ggplot used to prepare the figure.
#   purrr::pluck is grabbing the "data" list from the list that
#   ggplot_build creates, and then extracting the first element of that list.
ingredients <- ggplot_build(gr_gde) %>% purrr::pluck("data", 1)

# Pick the highest point. Could easily add quantiles or other features here.
density_lines <- ingredients %>%
  group_by(group) %>% filter(density == max(density)) %>% ungroup()

resample_plot_gde <- gr_gde +
  geom_segment(data = density_lines, 
               aes(x = x, y = ymin, xend = x, 
                   yend = ymin+density*scale*iscale)) +
  scale_x_continuous(limits = c(min(analysis_data$hill_1), max(analysis_data$hill_1))) +
  labs(x = "GDE", title = "1000 resamples of the top 10 most densely sample cells")


resample_plot_gde

GDM

gr_gdm <- gdm_res_df %>% 
  mutate(cell = as.factor(cell)) %>% 
  ggplot(aes(x = gdm_res, y = cell)) +
  ggridges::geom_density_ridges(rel_min_height = 0.01)

# Extract the data ggplot used to prepare the figure.
#   purrr::pluck is grabbing the "data" list from the list that
#   ggplot_build creates, and then extracting the first element of that list.
ingredients <- ggplot_build(gr_gdm) %>% purrr::pluck("data", 1)

# Pick the highest point. Could easily add quantiles or other features here.
density_lines <- ingredients %>%
  group_by(group) %>% filter(density == max(density)) %>% ungroup()

resample_plot_gdm <- gr_gdm +
  geom_segment(data = density_lines, 
               aes(x = x, y = ymin, xend = x, 
                   yend = ymin+density*scale*iscale)) +
  scale_x_continuous(limits = c(min(analysis_data$sqrt_pi), max(analysis_data$sqrt_pi))) +
  labs(x = "GDM", title = "1000 resamples of the top 10 most densely sample cells")


resample_plot_gdm

Combo plot

combo_resample <- resample_plot_gdm / resample_plot_gde

combo_resample

Least restrictive data set

Read in and combine data

med_3_df <- read_csv(here("output", "spreadsheets", "cell_medium_3_10_sumstats.csv"))


rast_list_medium <- list.files(here("data", "climate_agg"),
                        pattern = "medium",
                        full.names = TRUE)

new_stab <- list.files(here("data", "climate_agg"), 
                             pattern = "GlobalExtreme", 
                             full.names = TRUE) %>% 
  stack() %>% 
  projectRaster(template_medium_rast)

names(new_stab) <- paste0(names(new_stab), "_medium")

rasters_full_medium <- raster::stack(rast_list_medium, new_stab)
crs(rasters_full_medium) <- "+proj=cea +lon_0=0 +lat_ts=30 +x_0=0 +y_0=0 +datum=WGS84 +ellps=WGS84 +units=m +no_defs"

med_3_filter <- join_predictors(med_3_df, rasters_full_medium, "medium") %>% 
  filter_dfs("medium")

# empty raster with the same geometry as the medium resolution template
template_medium_obs <- raster(template_medium_rast)

template_medium_obs[] <- NA

## fill the observed cells with values from the first predictor layer
template_medium_obs[med_3_filter$cell] <- rasters_full_medium[[1]][med_3_filter$cell]

template_medium_sf <- template_medium_obs %>% 
  rasterToPolygons() %>% 
  st_as_sf() %>% 
  mutate(cell = sort(med_3_filter$cell))
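
rasterToPolygons() returns polygons in increasing cell order, which is why the cell IDs are sorted above. A quick check (a sketch) that the counts line up:

stopifnot(nrow(template_medium_sf) == nrow(med_3_filter))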


corr_final_vec <- read_csv(here("output", "spreadsheets", "keep_nocorr_vars.csv")) %>% 
  pull(1) 

corr_final_vec <- corr_final_vec[c(-13, -14)]

med_3_sf <- left_join(template_medium_sf, med_3_filter, by = "cell") %>% 
  rename(temp_trend = GlobalExtreme_tsTrendExt_medium,
         temp_var = GlobalExtreme_tsVarExt_medium,
         precip_trend = GlobalExtreme_prTrendExt_medium,
         precip_var = GlobalExtreme_prVarExt_medium) %>% 
  bind_cols(template_medium_sf %>% 
              st_centroid() %>% 
              st_coordinates()) %>% 
  mutate(
    lon_scaled = X * 0.000001,
    lat_scaled = Y * 0.000001
  ) %>% 
  select(cell, 
         num_ind,
         num_otu,
         num_order,
         hill_1,
         sqrt_pi,
         any_of(corr_final_vec), 
         temp_trend,
         temp_var,
         precip_trend,
         precip_var,
         X,
         Y,
         lon_scaled,
         lat_scaled)

all_dfs <- tibble(
  resolution = rep("medium", 6),
  min_ind = c(3L, 3L, 3L, 3L, 3L, 3L),
  min_otu = c(10L, 25L, 50L, 100L, 150L, 200L),
  df = list(
    med_3_sf,
    med_3_sf %>% filter(num_otu >= 25),
    med_3_sf %>% filter(num_otu >= 50),
    med_3_sf %>% filter(num_otu >= 100),
    med_3_sf %>% filter(num_otu >= 150),
    med_3_sf %>% filter(num_otu >= 200)
  )
  ) %>% 
  mutate(num_cells = map_int(df, nrow))

Normalize all variables so they are centered (mean of 0) and scaled (sd = 1) for linear regression.

all_dfs_norm <- all_dfs %>% 
  mutate(df_norm = map(df, normalize_vars)) %>% 
  select(-df)

Write to file

write_rds(all_dfs_norm, here("output", "spreadsheets", "model_data.rds"))