/
simulate_mass_data.R
168 lines (130 loc) · 7.54 KB
/
simulate_mass_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
library(glmmTMB)
library(dplyr)
library(purrr)
library(broom.mixed)
library(tidyverse)
library(ggplot2); theme_set(theme_bw())
# if (packageVersion("broom.mixed") < "0.2.9.5") {
# warning("please install the latest version of broom.mixed via remotes::install_github('bbolker/broom.mixed')")
# }
#set.seed() ensures reproducibility when randomizing
set.seed(101)
n_per_group=15; days=seq(0, 60, by=3) #seq(first day, last day, step size)
n_groups <- 2
#set parameters
beta0=30; beta_day=(-1.5/60); beta_daytreat=(-3/60)
#betas. beta0: initial mass in g; beta_day: slope (average loss (g per day) in control group)
#beta_daytreat: average *additional* loss per day (relative to control) in treatment group
#model
form0 <- ~ 1 + day + treatment:day + (day | batID)
form1 <- mass ~ 1 + day + treatment:day + (day | batID)
#mass: response variable; 1: intercept (mass when all predictor variables are set to 0)
#day: the effect of day on mass; treatment:day: interaction (the effect of day on mass varies depending on treatment level)
#(day | batID); random effects (allows for individual variability in the intercept and slope of the relationship between "day" and "mass" among different bats)
corr_trans <- function(rho) rho/(sqrt(1+rho^2))
#correlation between intercepts and slopes
#we have set rho to -0.75 below, but the calculation is needed to change from -1 and 1 to negative infinity and positive infinity (don't quite get this)
sdint=5; sdslope=(1/60); corr=corr_trans(-0.75); sdres=0.5
#thetas, and betad. sdint: standard deviation of intercept (plus/minus 10); sdslope: standard deviation of the slope
#corr: see above; sdred (betad): standard deviation of residual variance (plus/minus 2), needed because we're simulating a gaussain distribution and we don't just want straight lines (e.g. they just took a drink so their mass is bigger)
#sdint, sdslope and betad are on log scale to ensure they are positive
n_bats <- n_per_group*n_groups
## JD: These ...'s aren't the best practice; should figure out more about map
## BMB: get JD to explain this comment, I don't understand it out of context
#create simulation function
sim <- function(n_per_group, days, beta0, beta_day, beta_daytreat
, sdint, sdslope, corr, sdres, ...
){
batID <- as.factor(1:n_bats) #create individual bat IDs ## JD: Make this look less like a number with paste()
treatment <- as.factor(rep(c("control", "exercise"), each=n_per_group)) #create treatment groups, 15 per group
tdat <- data.frame(batID, treatment) #create data frame with batIDs and treatment groups
dat1 <- expand.grid(batID=batID, day=days) #expand so each batID occurs for each day measured
bat_data <- full_join(dat1, tdat, by="batID") #combine to have batID, treatment groups, days
mass_data <- simulate_new(form0, #simulate mass data with gaussian distribution, using set parameters
nsim = 1,
family = "gaussian",
newdata = bat_data,
newparams = list(
beta = c(beta0, beta_day, beta_daytreat)
, theta = c(log(sdint), log(sdslope), corr)
, betad = log(sdres)
)
)
bat_data$mass <- mass_data[[1]] #assigns simulated mass data to a mass column in bat_data
return(bat_data) # allows the modified bat_data object to be used or assigned to a variable outside of the function
}
#simulate once and plot to ensure the simulated output looks reasonable
s1 <- sim(n_per_group=n_per_group, days=days
, beta0=beta0, beta_day=beta_day, beta_daytreat=beta_daytreat
, sdint=sdint, sdslope=sdslope, corr=corr, sdres=sdres)
plot_sim <- function(bat_data) {
ggplot(bat_data, aes(day, mass, colour = treatment)) +
geom_line(aes(group=batID))
}
plot_sim(s1)
saveRDS(s1, "mass_dataset.RDS")
#percent difference
wide_s1 <- pivot_wider(s1,
names_from = day,
values_from = mass)
first_values <- wide_s1[, 3] # Assuming the first variable starts from column 2
last_values <- wide_s1[, ncol(wide_s1)]
percent_difference <- ( 1 - (last_values / first_values) ) *100
wide_s1 <- (wide_s1 %>% left_join(percent_difference, by = "60")
%>% mutate(percent_differences= percent_difference))
print(percent_difference)
#I can't figure out how to do this
#create fit function for model and return fitted model object
fit <- function(bat_data){
return(glmmTMB(form1 #specified model
, data=bat_data
, family = "gaussian"
))
}
## BMB: could speed up by combining simfun and fitfun, using
## nsim = nsim ...
#create function that simulates data, models it, and extracts coefficients and CIs
simCIs <- function(simfun, fitfun, ...){ #takes 2 arguments (one that simulates data, one that fits a model to the simulated data)
dat <- simfun(...) #simulates data, ... allows passing additional arguments to simfun if needed
fit <- fitfun(dat) #fits model
tt <- (tidy(fit, effects = "fixed", conf.int = TRUE, conf.method = "profile") #extracts the model coefficients and their confidence intervals
%>% select(term, est = estimate, lwr = conf.low, upr = conf.high) #selects specific columns: coefficient names (term), estimates (estimate), lower CI bounds (conf.low), and upper CI bounds (conf.high)
%>% mutate(across(where(is.character), factor)) #converts character columns to factors (for downstream analyses, plotting)
)
return(tt) #returns extracted model coefficients and their CIs
}
## test once before running a lot
#Print estimates, lower CI bounds, and upper CI bounds
print(simCIs(simfun=sim, fitfun=fit#Call simCI function, apply sim and fit functions,
, n_per_group=n_per_group, days=days #apply set parameters
, beta0=beta0, beta_day=beta_day, beta_daytreat=beta_daytreat
, sdint=sdint, sdslope=sdslope, corr=corr, sdres=sdres
))
#how many reps of the simulation we want
nReps_1000 <- 1000
## BMB: for slow stuff it is probably better to take apart the pieces
## (1) generate a big list of fitted objects
## (2) then run a separate workflow step to compute the summary
## statistics you're interested in
#apply the simCIs function iteratively to simulate data, fit models, and calculate confidence intervals, save dataframe as cis
system.time(
cis_1000 <- map_dfr(1:nReps_1000, simCIs, .id="sim" #iterates over 1000 reps (set above), applying simCIs for each iteration. adds "sim" column for simulation number
, .progress = interactive() #progress bar
, simfun=sim, fitfun=fit #functions fit to simCIs
, n_per_group = n_per_group, days=days #parameters fit to simCIs
, beta0=beta0, beta_day=beta_day, beta_daytreat=beta_daytreat#these parameters are fit into simfunin simCIs with ... above
, sdint=sdint, sdslope=sdslope, corr=corr, sdres=sdres
)
)
nReps_2500 <- 2500
system.time(
cis_2500 <- map_dfr(1:nReps_2500, simCIs, .id="sim" #iterates over 1000 reps (set above), applying simCIs for each iteration. adds "sim" column for simulation number
, .progress = interactive() #progress bar
, simfun=sim, fitfun=fit #functions fit to simCIs
, n_per_group = n_per_group, days=days #parameters fit to simCIs
, beta0=beta0, beta_day=beta_day, beta_daytreat=beta_daytreat#these parameters are fit into simfunin simCIs with ... above
, sdint=sdint, sdslope=sdslope, corr=corr, sdres=sdres
)
)
saveRDS(cis_1000, "bat_mass_sims_1000.RDS")
saveRDS(cis_2500, "bat_mass_sims_2500.RDS")