/
ag_supp.rmd
790 lines (675 loc) · 35.4 KB
/
ag_supp.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
---
title: "Supplementary material for 'Parental care drives the evolution of reproductive accessory glands in ray-finned fishes'"
date: "`r format(Sys.Date(), '%d %B %Y')`"
bibliography: corHMM.bib
output:
pdf_document:
toc: true
toc_depth: 2
includes:
in_header: "preamble.tex"
bookdown::html_document2:
toc: true
toc_depth: 2
number_sections: true
---
<!-- disable code folding entirely if not building as HTML? -- 'r ifelse(knitr::is_html_output(),"hide", ...)' -->
<!-- **fix me**: reimplement as [target markdown](https://books.ropensci.org/targets/markdown.html#markdown) ? -->
```{r setup, include = FALSE}
source("R/utils.R")
source("R/mcmc.R")
source("R/functions.R")
source("R/pkg_install.R")
load_pkgs(skip = "tikzDevice")
zmargin <- theme(panel.spacing = grid::unit(0, "lines"))
theme_set(theme_bw())
library(targets)
knitr::opts_chunk$set(echo = FALSE, dpi = 200, optipng = knitr::hook_optipng)
options(bitmapType = "cairo")
```
<!-- use PNG for heavy graphics to avoid blowing up file size.
-- see https://bookdown.org/yihui/rmarkdown-cookbook/optipng.html for info on installing OptiPNG-->
```{r load}
tar_load(ag_model_pcsc)
tar_load(ag_model_full)
tar_load(ag_mcmc0)
tar_load(all_ci)
tar_load(all_contr_ci)
tar_load(ag_compdata)
```
# Descriptive statistics
## Data
```{r mosaic, eval=FALSE, include=FALSE}
## cool but useless
(ag_compdata$data
%>% mutate(ag = paste0("ag:",ag),
pc = paste0("pc:",pc),
sc = paste0("sc:",sc))
%>%
ggplot() +
geom_mosaic(aes(x = product(pc, sc), fill = ag)) +
geom_mosaic_text(aes(x = product(pc, sc), fill = ag), as.label = TRUE) +
scale_fill_discrete_qualitative()
)
```
```{r barchart, fig.cap = "Numbers of species with each set of traits (`0` = absent, `1` = present, `?` = unknown; `pc` = male parental care; `sc` = sperm competition (i.e. group spawning or ARTs); `ag` = accessory glands), depending on whether we use the partial (but fully genetically resolved) tree (`fishphylo`) or the imputed trees from @rabosky_inverse_2018 (`treeblock`)."}
tar_load(ag_compdata)
tar_load(ag_compdata_tb)
barplot_trans <- (.
%>% mutate(across(c(pc, sc), factor, levels = c(0:1, "?")))
%>% mutate(across(ag, factor, levels = 1:0)) ## reverse order to get ag=1 on top of bars
)
bar_data <- (purrr::map_dfr(list(fishphylo = ag_compdata$data,
treeblock = ag_compdata_tb$data),
barplot_trans,
.id = "phylo")
|> rename(`spawning mode`="sc")
)
gg1 <- (ggplot(bar_data, aes(x=pc, fill = ag))
+ geom_bar()
+ facet_grid(phylo~`spawning mode`, labeller = label_both)
+ scale_fill_discrete_qualitative(name = "accessory\nglands")
+ zmargin
+ labs(x="male parental care")
)
print(gg1)
```
The phylogenies in the treeblock are generated according to an imputation
scheme described in @rabosky_inverse_2018. We generated 100 stochastic character maps for each of 100 trees; the figure shows the average number of gains and losses per tree (i.e., average gains/losses across the stochastic character maps for each tree).
```{r simcounts, fig.cap = "average number of gains and losses of AGs simulated for each tree in the tree block"}
if (file.exists("simmap.rda")) {
x <- load("simmap.rda")
countsum <- (counts
|> as_tibble()
|> mutate(r = rep(1:100, each = 100))
|> group_by(r)
|> rename(gains = "ag0,ag1", losses = "ag1,ag0")
|> summarise(across(c(gains, losses), mean))
|> pivot_longer(c(gains, losses))
)
ggplot(countsum) + geom_histogram(aes(x=value), binwidth = 0.25, colour = "white") + facet_wrap(~name, scale = "free_x")
} else {
warning("run figures.R?")
}
```
<!-- use ape::getMRCA() to identify families and geom_cladelab() ... ?? -->
```{r densitree, fig.cap = "densitree-style [@bouckaertDensiTree2010] mapping of tree blocks/phylogenetic uncertainty ", cache=TRUE, dev = "png", optipng = "-o7"}
targets::tar_load(treeblock)
mp <- do.call("c", treeblock)
ggtree::ggdensitree(mp, layout = "radial", alpha = 0.02)
```
```{r barchart_test, include = FALSE}
## bar chart that distinguishes group from ART?
tar_load(full_ag_data)
tar_load(treeblock)
ff <- full_ag_data |> get_ag_data(trait_names = c("ag", "care", "pairs.groups", "pairs.arts"), phylo = treeblock[[1]])
```
# Parameter definitions
## Transition matrices
As described in the main text, a continuous-time stochastic process for three binary traits has a total of 24 possible transitions where a single one of the three states changes. The figure below shows the full and reduced transition matrices. In the axis tick labels, `ag0_pc0_sc1` (for example) refers to the state without AGs (`ag0`), without male care (`pc0`), and with group spawning (`sc1`, loosely "sperm competition present"^[Hereafter we refer to pair spawning as "sperm competition absent" (`sc0`) and group spawning as "sperm competition present" (`sc1`), recognizing that this is an oversimplification.]). Parameter 1 quantifies the log-hazard of a switch to pair spawning in this state (i.e., to `ag0_pc0_sc0`).
The assignment of numbers to rates is arbitrary, numbered in sequence in column-wise order. Knowing the assignment is only necessary when doing low-level computations based on the vector of parameters. The take-home information is which sets of transitions are constrained to be identical in the reduced model; these rates have the same number . In the reduced model (righthand plot), transitions that are assumed to have equal rates - because we assume that all transitions in spawning mode, and all transitions in male care, are independent of the other states - have the same ID number. Colours denote the estimated log-hazard for each parameter when the models are fitted by maximum likelihood. Below, we denote parameter 1 (in the reduced model, which occupies four different cells in the transition matrix) as `sc_loss`, because it quantifies the loss rate of sperm competition regardless of the states of the other traits; we refer to parameter 3 (for example) as `loss.ag_pc0_sc1`, the loss rate of AGs in the absence of male care and presence of sperm competition.
```{r image, message=FALSE, fig.width = 12, fig.height = 5, fig.cap = "Transition matrix for full and reduced (independent evolution of spawning mode and male care) models. Colours of grid squares represent the estimated evolutionary rates (log-hazards) for each transition. Note that the rates for 24-parameter model are extremely uncertain; see sensitivity analysis (last section)."}
pc <- gray(0.9)
cowplot::plot_grid(plot(ag_model_full, pnum_col = pc,
main = "full (24-parameter)"),
plot(ag_model_pcsc, pnum_col = pc,
main = "reduced (12-parameter)"),
nrow=1)
```
## Transitions
```{r contrast-diag, echo = FALSE, out.width = '6in', out.height = '6in', fig.cap = "Transitions for the reduced (12-parameter) model. As in Figure 1 in the main text: Sets of connections drawn with the same colours are constrained to identical evolutionary rates (e.g. light gray downward arrows, representing loss of male parental care – all transitions from “pc1” to the corresponding “pc0” state). Top-bottom and left-right transitions, representing loss/gain of parental care and transitions between pair spawning and group spawning, are assumed to be independent of other trait values. Loss/gain of accessory glands (diagonal arrows), which are our primary interest, are all estimated separately (8 different colours for arrows representing loss/gain of accessory glands under the 4 different combinations of the other two traits, parental care and spawning mode)."}
knitr::include_graphics("pix/flowfig2.pdf")
```
The transitions denoted A-D all represent gains of accessory glands, under different combinations of male care/spawning mode. Arrows A and B both represent AG gains in states with pair spawning (`sc0`); C and D both represent states in states with group spawning (`sc1`). Thus rate(pair) = (rate(A) + rate(B))/2 is the average rate of AG gain under pair spawning; rate(group) = (rate(C) + rate(D))/2 is the average rate under group spawning; and rate(group)-rate(pair) is the *contrast*, which we can think of as "the effect of group vs pair spawning on the rate of AG gain". We can similarly compute other combinations such as rate(no male care) (`pc0`) = (rate(B) + rate(D))/2; rate(male care) (`pc1`) = (rate(A) + rate(C))/2; and contrast(male care) = rate(male care) - rate(no male care).
## Contrast matrix
The *contrast matrix* connects the parameters we estimate (e.g. `loss.ag_pc0_sc0`, loss rate of accessory glands when parental care and sperm competition are both absent) to the biological effects we are interested in (e.g. `pc_loss`, the effect of parental care on the loss rate of accessory glands). For example, the `intercept_loss` effect is the (unweighted) average of the four different loss terms; the `pc_loss` effect is the difference between the average loss rate when male care is present (second row, red squares) and the average loss rate when male care is absent (second row, blue squares).
```{r plot-contrast-mat, fig.width = 6, fig.width = 6, fig.cap = "Contrast matrix defining translation from internal rate parameters (columns, corresponding to transition rates in Figure 1.3) to meaningful contrasts (see figures below). Gain and loss terms for group spawning and male care not shown, as these are not translated into contrasts."}
tar_load(contrast_mat)
## order nicely
order_fun <- function(x) {
trait <- (stringr::str_extract(x,"\\.[[:alpha:]]{2}")
|> stringr::str_remove("\\.")
)
tcode <- match(trait, c("pc", "sc", "ag"))
## loss/gain
lossgain <- stringr::str_extract(x, "[[:alpha:]]{4}")
lcode <- match(lossgain, c("loss", "gain"))
rem <- stringr::str_remove(x,"[[:alpha:]]{4}\\.[[:alpha:]]{2}(_|$)")
order(1000*tcode + 100*lcode + order(rem))
}
ccm <- t(contrast_mat[
## re-order; drop cols 1-4, which aren't involved in contrasts
## (just gain/loss of pc, sc)
## keep only gain/loss contrast rows,
## not "netgain" (which we never used and
## which is a little confusing)
order_fun(rownames(contrast_mat)),])[1:8,-(1:4)]
image_plot(ccm,
ylab = "AG contrast", xlab = "Rate parameter")
```
```{r hux, eval = FALSE}
m <- MASS::fractions(ccm)
mm <- matrix(attr(m, "fracs"),
nrow = nrow(m),
dimnames = dimnames(m))
mm[mm == "0"] <- "."
if (exists("latexstr")) rm("latexstr")
texconn <- textConnection("latexstr", "w")
cc <- function(x, ...) cat(x, file = texconn, append = TRUE, ...)
cc("$$\n")
cc("\\left(\n")
cc("\\begin{array}{cccccccc}\n")
for (i in 1:nrow(mm)) {
cc(mm[i,], sep = " & ")
cc("\\\\\n")
}
cc("\\end{array}\n")
cc("\\right)\n")
cc("$$")
cat(knitr::raw_latex(latexstr))
```
Here is an equivalent numeric matrix:
$$
\left[
\begin{array}{rrrrrrrr}
1/4 & 1/4 & 1/4 & 1/4 & . & . & . & . \\
-1/2 & -1/2 & 1/2 & 1/2 & . & . & . & . \\
1/2 & -1/2 & -1/2 & 1/2 & . & . & . & . \\
-1/2 & 1/2 & -1/2 & 1/2 & . & . & . & . \\
. & . & . & . & 1/4 & 1/4 & 1/4 & 1/4 \\
. & . & . & . & -1/2 & 1/2 & -1/2 & 1/2 \\
. & . & . & . & -1/2 & -1/2 & 1/2 & 1/2 \\
. & . & . & . & 1/2 & -1/2 & -1/2 & 1/2 \\
\end{array}
\right]
$$
# Priors
Here we illustrate our general strategy for constructing priors: pick sensible lower and upper "bounds" (not really bounds, but values we would consider extreme) and use a Normal prior on the log-hazard scale (equivalent to a log-Normal prior on the hazard scale) with a mean halfway between the lower and upper bounds and a standard deviation $\sigma = (\log(\textrm{upper})-\log(\textrm{lower}))/(2 \cdot \textrm{range})$
In our model we use a range of 3 (i.e. the lower and upper bounds are at $\pm 3 \sigma$), to denote that the lower and upper bounds we specify are extreme (99.7% ranges). The figure below uses range = 2 (which would correspond to 95.4% ranges) so the tails are more clearly visible.
```{r prior-spec, fig.width = 10, fig.height = 5, fig.cap = "Schematic of prior definition in terms of lower and upper tails of a Gaussian distribution. Left, linear scale; right, log scale"}
fun <- function(logsc = TRUE, range = 2, xp = 1.15) {
DFUN <- if (!logsc) dlnorm else dnorm
trans <- if (!logsc) exp else identity
xlab <- if (!logsc) "Hazard" else "Log-hazard"
ticklabs <- c("lower", "upper")
if (logsc) ticklabs <- sprintf("log(%s)", ticklabs)
rr <- xp*range
minval <- if (!logsc) 0 else trans(-rr)
mode <- if (logsc) 0 else -1
curve(DFUN(x),
from = minval,
n = 201,
to = trans(rr),
axes = FALSE,
yaxs = "i",
ylim = c(0, DFUN(trans(mode))),
xlab = xlab,
ylab = "Prior density"
)
## UGH, mtext() is not respecting las ??
## mtext(side = 1, line = 2, text = xlab)
## mtext(side = 2, line = 1, text = "Prior density")
box()
abline(v = c(trans(-range), trans(range)), lty = 2)
if (logsc) {
xvec <- seq(-rr, -range, length.out = 201)
} else {
xvec <- seq(0, trans(-range), length.out = 201)
}
## fill in tails
dd <- DFUN(xvec)
polygon(x = c(xvec, rev(xvec)),
y = c(rep(0, length(dd)), rev(dd)),
col = "gray")
xvec <- seq(range, rr, length.out = 51)
dd <- DFUN(trans(abs(xvec)))
polygon(x = trans(c(abs(xvec), rev(abs(xvec)))),
y = c(rep(0, length(dd)), rev(dd)),
col = "gray")
yval <- DFUN(1)
arrows(x0 = trans(-0.5), x1 = trans(-range), y0 = yval, y1 = yval)
arrows(x0 = trans(0.5), x1 = trans(range), y0 = yval, y1 = yval)
text(trans(0), yval, bquote("" %+-% .(range)*sigma))
axis(side = 1, at = trans(c(-range, range)),
labels = ticklabs)
}
par(mfrow = c(1,2),
las = 1, bty = "l", cex = 1.2, mgp = c(2, 1, 0))
fun(xp = 2)
fun(logsc=FALSE)
```
```{r utils}
tar_load(ag_compdata)
corhmm_bounds <- log(c(lwr=0.1, upper = 100*ape::Ntip(ag_compdata$phy)))
corhmm_prior <- log(c(lwr=1, upper = 10*ape::Ntip(ag_compdata$phy)))
corhmm_mid <- mean(corhmm_bounds)
## order of parameters
levs <- c(outer(FUN = paste, sep = ".",
c("loss", "gain"),
c("sc", "pc",
paste0("ag_", c(outer(paste0("pc", 0:1), paste0("sc", 0:1), paste, sep = "_"))))
))
contr_levs <- c(outer(FUN = paste, sep = "_",
c("intercept", "sc", "pc", "pcxsc"),
c("loss", "gain")))
fix_all_ci <- function(x, levels = NULL) {
x <- (x
## replace NA values with Inf/-Inf so bars extend full range
%>% replace_na(list(upr = Inf, lwr = -Inf))
## utility var for below-bounds estimates
%>% mutate(lwr_bound = (estimate <= corhmm_bounds[["lwr"]]))
)
if (!is.null(levels)) {
## reorder levels as above
x <- (x
%>% mutate(across(term, factor, levels = levels))
)
}
return(x)
}
```
```{r priorsamp}
prior_contr_ci <- (fix_all_ci(all_contr_ci, levels = contr_levs)
## netgain/loss terms will be NA after fixing with these levels
|> filter(!is.na(term),
method == "priorsamp")
)
get_ci <- function(tt ) {
(prior_contr_ci
|> filter(term == tt)
|> dplyr::select(estimate, lwr, upr)
|> as.list()
|> map_dbl(~round(exp(.), 1))
)
}
gain_ci <- get_ci("intercept_gain")
```
By sampling directly from the prior and computing contrasts,
we can confirm that the priors on the contrasts (ratio of AG gain/loss rates depending on male care and sperm competition) are indeed neutral (centered at 1.0) and reasonable (95% confidence intervals extend from 10x slower to 10x faster). The intercept terms for gain and loss represent the *average* evolutionary rates across all states, and are measured in units of expected numbers of events across the entire tree. For example, the prior expected number of gains of AG across the whole tree is `r gain_ci[1]` (95% CI: `r gain_ci[2]`-`r gain_ci[3]`).
```{r priorsamp-fig, fig.cap = "95% CIs for prior distributions of contrasts"}
(ggplot(prior_contr_ci)
+ aes(x = exp(estimate),
xmin = exp(lwr),
xmax = exp(upr), y = term)
+ geom_pointrange()
+ labs(y = "", x = "hazard or hazard ratio")
+ scale_x_log10()
+ geom_vline(xintercept = 1, lty = 2)
)
```
```{r}
```
# Statistical summaries
This section presents the quantitative results in more detail.
## Bayesian
Here are the estimated posterior median and confidence intervals,
along with $p_\textrm{MCMC}$ (twice the minimum tail probability)
and the "probability of direction" ($p_d$), the probability that the
parameter has the same sign as the median ($p_d$ is an index of
how clearly the sign of the parameter is known: $p_\textrm{MCMC} = 2(1-p_d)$)
[@makowski_indices_2019;@shi_reconnecting_2021].
```{r bayes_sum_tab, results="asis"}
bayes_pval <- function(x, ref=0) {
x <- mean(x<ref)
2 * min(x, 1-x)
}
bayes_pd <- function(x, ref = 0) {
mean(x*sign(median(x))>ref)
}
tar_load(contr_long_ag_mcmc_tb)
ag_contr_gainloss <- (contr_long_ag_mcmc_tb
|> filter(rate != "netgain")
)
(ag_contr_gainloss
%>% group_by(contrast,rate)
%>% summarise(median = median(value),
lwr = quantile(value, 0.025),
upr = quantile(value, 0.975),
pMCMC = bayes_pval(value),
pd = bayes_pd(value),
.groups = "drop")
## %>% filter(phylo == "treeblock")
%>% arrange(rate, contrast)
## %>% dplyr::select(-phylo)
%>% mutate(across(c(median, lwr, upr), exp))
%>% mutate(across(c(pMCMC, pd), ~ ifelse(contrast=="intercept", NA, .)))
%>% knitr::kable(digits = 3)
)
```
Here are the marginal posterior distributions for all of the contrasts,
including fits to both the full tree-block data set and the subset of
the data that includes only species with genetic data ("fishphylo").
(The latter is useful for comparison with the MLE results, which cannot
use the Bayesian method of sampling across treeblock phylogenies.)
The treeblock estimates are slightly more precise (i.e. the confidence intervals are narrower), as expected since
they can make use of a larger data set. Points show the
posterior median, Line ranges represent 95%
credible intervals.
```{r plot-contrasts, fig.height=4, fig.width=10}
tar_load(contr_long_ag_mcmc0)
tar_load(contr_long_ag_mcmc_tb)
ag_contr_gainloss <- purrr::map_dfr(list(fishphylo=contr_long_ag_mcmc0,
treeblock=contr_long_ag_mcmc_tb),
filter, rate != "netgain",
.id = "phylo")
gg_sum <- ggplot(ag_contr_gainloss, aes(x = exp(value), y = rate)) +
facet_wrap(~ contrast) +
geom_violin(aes(fill = phylo), alpha=0.6) +
stat_summary(fun.data = "median_hilow", geom = "pointrange", aes(group=phylo),
## width by trial and error; not sure what determines this?
position = position_dodge(width=0.875),
colour = "gray30") +
geom_vline(xintercept = 1, lty = 2) +
scale_x_log10() +
zmargin +
scale_fill_discrete_qualitative() +
scale_colour_discrete_qualitative() +
labs(x="expected transitions/proportional difference in rates")
print(gg_sum)
```
## Frequentist
### Likelihood ratio tests
For any pair of *nested* models we can do a likelihood ratio test,
comparing the model fits and number of parameters. We fit restricted models
with AG evolution independent of both sperm competition and male care
(`indep`); dependent on male care but not sperm competion (`pc`);
dependent on sperm competition only (`sc`); depending *additively* on both traits (i.e., the effects of the trait status on evolutionary rates are added,
on the log scale - this model has 10 parameters); and with an interaction between traits
(the full 12-parameter model that is our primary focus).
Testing (most) nested pairs gives this diagram (we omit the
comparison between `pcsc` and `indep`):
```{r lrtest}
tar_load("ag_model_pcsc")
tar_load("ag_model_pc")
tar_load("ag_model_sc")
tar_load("ag_model_indep")
tar_load("ag_model_pcsc_add")
mnames <- c("pcsc", "pcsc_add", "pc", "sc", "indep")
anfun <- function(m1, m2, return_val = c("expression", "list")) {
return_val <- match.arg(return_val)
L1 <- logLik(m1)
L2 <- logLik(m2)
df1 <- attr(L1, "df")
df2 <- attr(L2, "df")
ddev <- 2*abs(c(L2-L1))
ddf <- abs(df1-df2) ## ASSUME nothing pathological is happening
pval <- pchisq(ddev, ddf, lower.tail = FALSE)
switch(return_val,
expression = sprintf("atop(Delta*df==%d,atop(Delta*dev==%1.2f,p==%1.1g))",
ddf, ddev, pval),
list = tibble::lst(ddf, ddev, pval))
}
A <- matrix(NA_character_, 5, 5,
dimnames=list(mnames, mnames))
A["indep","pc"] <- anfun(ag_model_pc, ag_model_indep)
A["indep","sc"] <- anfun(ag_model_sc, ag_model_indep)
A["sc","pcsc_add"] <- anfun(ag_model_pcsc_add, ag_model_sc)
A["pc","pcsc_add"] <- anfun(ag_model_pcsc_add, ag_model_pc)
A["indep","pcsc_add"] <- anfun(ag_model_pcsc_add, ag_model_indep)
A["pcsc_add","pcsc"] <- anfun(ag_model_pcsc_add, ag_model_pcsc)
A["pc","pcsc"] <- anfun(ag_model_pcsc, ag_model_pc)
A["sc","pcsc"] <- anfun(ag_model_pcsc, ag_model_sc)
C <- matrix(0, nrow=nrow(A), ncol = ncol(A))
C[!is.na(A)] <- seq(sum(!is.na(A)))
```
<!-- figure produced as SVG, edited to adjust label positions -->
```{r lrtest-plot, fig.cap = "likelihood ratio test comparisons"}
knitr::include_graphics("pix/lrt_comp.pdf")
```
```{r lrtest-plot-svg, results = "hide"}
svg("lrt_comp.svg", width = 18, height = 10)
plotmat(A, curve = 0,
## pcsc_add box in second row needs to be wider/larger
box.size = c(0.05, 0.1, 0.05, 0.05),
box.prop = c(0.5, 0.25, 0.5, 0.5),
arr.lcol = C,
arr.tcol = C,
pos = matrix(c(1,1,2,1)))
dev.off()
```
The p-values quoted in the paper are from comparing the full model
with interactions to the `pc` and `sc` models. These comparisons are
analogous to "type 3" tests in ANOVA [@fox_r_2018], which are the
most commonly used approach in the biological literature. "Type 2"
tests (as implemented in the `car` package for R) instead compare
the *additive* model to the models with only one main effect:
these tests suggest that the effects of both sperm competition
and male care are statistically significant, although the evidence
for the effect of male care is still stronger.
```{r likcomp_tab}
res <- list()
res[["pcsc"]] <- list(desc = "interaction", anfun(ag_model_pcsc_add, ag_model_pcsc, return = "list"))
res[["sc3"]] <- list(desc = "sc (type III)", anfun(ag_model_pc, ag_model_pcsc, return = "list"))
res[["sc2"]] <- list(desc = "sc (type II)", anfun(ag_model_pc, ag_model_pcsc_add, return = "list"))
res[["pc3"]] <- list(desc = "pc (type III)", anfun(ag_model_sc, ag_model_pcsc, return = "list"))
res[["pc2"]] <- list(desc = "pc (type II)", anfun(ag_model_sc, ag_model_pcsc_add, return = "list"))
res <- do.call(rbind, lapply(res, as.data.frame))
knitr::kable(res, digits = 3)
```
(`ddf`: difference in number of parameters between models;
`ddev`: change in deviance ($-2 \log(L)$); `pval`: likelihood ratio
test p-value)
### Parameter estimates/CIs
We can look at the parameter estimates from maximum likelihood fits, although
most of the confidence intervals of the unregularized fits
are undetermined because of the failure
of the Wald estimates. All confidence intervals more extreme than $\exp(\pm 20)$ are
set to 0 or $\infty$.
Profile confidence intervals could give slightly better results, but
these turds may be not worth polishing very much ...
```{r ptab}
tar_load(all_contr_ci)
rclamp <- function(x, maxlog = 20) {
ifelse(abs(x)>maxlog, sign(x)*Inf, x)
}
ptab <- (all_contr_ci
|> dplyr::filter(grepl("pcsc", method),
!grepl("intercept", term),
!grepl("\\.", term))
|> dplyr::select(method, term, estimate, lwr, upr)
|> mutate(across(method, ~gsub("model_", "", .)))
|> mutate(across(c(estimate, lwr, upr),
~ exp(rclamp(.))))
)
knitr::kable(ptab, digits = 3)
```
### Information-theoretic comparisons
Alternatively, we can make comparisons based on information-theoretic criteria
such as AICc, AIC, or BIC. These three criteria differ in the kind and strength of penalty that they use to avoid overly complex models. Researchers choose different criteria depending on their goals - strictly speaking AIC and its variants are for maximizing predictive accuracy while BIC is for choosing among hypotheses. AICc is often recommended for analyses where $n<40$. Using AICc presents a challenge: for this data set (a phylogeny of 607 species with approximately 20 independent origins of accessory glands), it is difficult to quantify the *effective* number of observations (the $n$ in the rule of thumb above, and in the formula for the AICc, is based on a simpler data format where we can assume that every observation is independent). We chose $n=30$ for this comparison, as a guess at the effective sample size (larger $n$ will tend to select larger models as better).
Negative log-likelihood, which measures the unpenalized goodness-of-fit, is included for completeness/comparison.
```{r ictab}
ag_models <- paste0("ag_model_",
mnames)
fake_nobs <- 30 ## ?? something weird about targets evaluation:
dtab <- tibble(
model = mnames,
desc = c("AG dep on PC, SC & interaction",
"AG dep on PC, SC additively",
"AG dep only on PC",
"AG dep only on SC",
"AG evol indep of PC, SC"))
atab <- (mget(ag_models)
|> setNames(mnames)
|> map_dfr(glance, nobs = fake_nobs, .id = "model")
|> mutate(negLL = -1*logLik)
|> dplyr::select(-logLik)
|> mutate(across(c(AIC, BIC, negLL, AICc), ~ . - min(.)))
|> arrange(AICc)
|> full_join(dtab, by = "model")
|> dplyr::select(desc, df, AICc, AIC, BIC, negLL)
|> setNames(c(" ", "df", "ΔAICc", "ΔAIC", "ΔBIC", "ΔnegLL"))
)
knitr::kable(atab, digits=2)
```
As stated in the main text, we can see that AICc gives similar conclusions to the Bayesian and likelihood-ratio-test analyses: the best model has AG evolution depending on male care only, followed closely by the additive model, and fairly closely by sperm competition only (models with $\Delta$ AICc < 2 are usually interpreted as "roughly equivalent"). In contrast, using AIC selects the additive model as best, and considerably better than the single-factor models. BIC, which like AICc is generally more conservative than AIC, also selects the additive model as best, similar ($\Delta$ BIC < 2) to the male-care-only model.
# Sensitivity analyses
We performed a range of different analyses to test the sensitivity of our conclusions to technical choices, and to explore the difference between frequentist (maximum likelihood estimate = MLE), constrained frequentist (maximum *a posteriori* = MAP), and full Bayesian (MCMC) fitting methods.
- `mcmc_tb`: Bayesian MCMC using neutral priors on the transition rates and biologically informed priors on the gain/loss rates; this is the primary model presented in the main text
- `mcmc_0`: as `mcmc_tb`, but using only the completely resolved phylogeny (i.e., only including species for which genetic data is available)
- `mcmc_tb_nogainloss`: as `mcmc_tb`, but using only the priors on the transition rates, omitting the priors on the gain/loss rates
- `full`: a model estimating all 24 transition rates, i.e. allowing for the rates of evolution of male care and sperm competition to depend on each other and on AG
- `model_pcsc`: MLE fit of the 12-parameter model
- `model_pcsc_add`: MLE fit of the additive (10-parameter) model
- `model_pcsc_prior`: as `model_pcsc`, but adding the priors used in the Bayesian model (this is a regularized or MAP estimate)
```{r ci_plot0, warning = FALSE, message = FALSE}
##
tar_load(ag_compdata)
tar_load(all_ci)
all_ci <- fix_all_ci(all_ci, levels = levs)
## extend NA confidence interval to limit of graph
gg_ci0 <- (ggplot(
all_ci,
aes(term, exp(estimate), ymin = exp(lwr), ymax = exp(upr),
colour = method)
)
+ geom_linerange(position = position_dodge(width = 0.5), key_glyph = "path")
+ geom_point(position = position_dodge(width = 0.5))
+ colorspace::scale_colour_discrete_qualitative()
+ coord_flip()
## + theme(legend.position="bottom")
## https://stackoverflow.com/questions/27130610/legend-on-bottom-two-rows-wrapped-in-ggplot2-in-r
## this was important when we had side-by-side graphs
+ guides(colour = guide_legend(reverse = TRUE),
shape = guide_legend(reverse = TRUE))
## nrow = 2, byrow = TRUE,
## reverse = TRUE))
)
## would like to make arrows ("<") at lower bounds larger, but manipulating
## size aesthetic would also mess up bars (I think). Some hack?
gg_ci1 <- (gg_ci0
+ scale_y_log10(limits = exp(corhmm_bounds), oob = scales::squish)
+ aes(shape = factor(lwr_bound))
+ labs(y="rate", x = "")
+ geom_hline(yintercept =exp(corhmm_mid), lty=2)
+ geom_hline(yintercept = exp(corhmm_prior), lty=2, col="gray")
## 60 = "<"
+ scale_shape_manual(values = c(16, 60), guide = "none")
)
gg_ci_nowald <- (gg_ci0 %+% dplyr::filter(all_ci, method != "model_pcsc")
+ scale_y_log10()
)
```
```{r contrast-ci, fig.width=8, fig.height = 6, warning = FALSE, message = FALSE}
method_levs <- c("mcmc_tb", "mcmc_0",
"mcmc_tb_nogainloss",
"full",
"model_pcsc",
"model_pcsc_add",
"model_pcsc_prior")
all_contr_ci <- (fix_all_ci(all_contr_ci, levels = contr_levs)
|> filter(!grepl("net", term),
!grepl("intercept", term),
method != "priorsamp",
!is.na(term))
|> mutate(across(method, factor, levels =rev(method_levs)))
)
## reverse order of levels, *and* guide, so that both are top-to-bottom
gg_ci1C <- (gg_ci0 %+% all_contr_ci
+ scale_y_log10(limits=c(1e-2, 1e3),
breaks = 10^seq(-2, 3),
labels = function(x) sprintf("%g", x),
oob = scales::squish,
expand = expansion(0,0))
+ labs(y="hazard ratio", x = "")
+ geom_hline(yintercept =1, lty=2)
+ aes(shape = method)
+ scale_shape_manual(values = c(scales::shape_pal(solid=TRUE)(6),9,10))
)
print(gg_ci1C)
```
Points at the edge of the graph (`sc_gain`, `pc_loss`, `pcxsc_loss`) indicate that the point estimates fell outside of the plotted range.
Intercepts are excluded (as being less biologically interesting, and using a different scale from the contrasts: hazards rather than hazard ratios), as are the estimates of the rates of gain and loss of the non-focal traits (spawning mode and male care).
We can draw a variety of conclusions from these results.
- `mcmc_tb` vs. `mcmc_0`: as illustrated above, the estimates from the full tree-block data set and the reduced ("fishphylo") data set are similar, although the tree-block estimates are slightly more precise.
- `mcmc_tb` vs. `mcmc_tb_nogainloss`: leaving out the gain/loss priors doesn't make much difference; it slightly weakens the SC estimate and strengthens the PC estimate
- `mcmc_tb` vs. `full`: although the results from the full model are generally of the same sign as from the reduced (12-parameter) model, they are much less certain. None of the signs of the effects are statistically clear ($p_{\textrm{MCMC}} > 0.05$ for all estimates).
- `mcmc_tb` vs. `model_pcsc`: the 12-parameter model is too poorly constrained to draw conclusions from the parameter estimates; this agrees with the likelihood ratio and IC tests above, which indicate that the additive model is preferred to the interaction model.
- `model_pcsc` vs. `model_pcsc_add`: constraining the model slightly by removing the interaction terms helps a little bit - now we get a point estimate and confidence intervals for the `pc_gain` parameter that agrees reasonably well with the full Bayesian fits - but most of the other parameters of interest are still unidentifiable (large confidence intervals/failure of the Wald approximation). The estimates of the rates of gain and loss of the non-focal traits are well identified, but not of interest.
- `mcmc_tb` vs. `model_pcsc_prior`: when we add the prior information to the MLE fit, we get point estimates, and CIs, that are very close to those from the primary (`mcmc_tb`) model.
# Technical details of Bayesian computations
## MCMC run parameters
All versions of the 12-parameter model were run using 8 chains with 84,000 iterations each with a burn-in of 4000 iterations (with a thinning factor of 10, for a total of 64000 samples). The full 24-parameter model was run similarly, but for 144,000 steps. For each model, runs took approximately 12-24 hours on 8 cores on a modern Linux workstation.
```{r setup_bayes, message=FALSE}
source("R/utils.R")
source("R/mcmc.R")
source("R/functions.R")
source("R/monitornew.R")
load_pkgs()
zmargin <- theme(panel.spacing = grid::unit(0, "lines"))
theme_set(theme_bw())
library(targets)
```
For each model type we show the traceplots; the improved $\hat R$ statistics according to @lambertRobust2022; and a pairs plot of the posterior distributions with hexbin plots of the samples in the lower triangle; kernel density estimates of the marginal posterior density for each parameter on the diagonal; and highest posterior density regions in the upper triangle.
## Fish-phylo model
This is the model that uses only the full/known phylogeny. (Probably irrelevant.)
```{r bayes_traceplot, fig.width=8, fig.height=8, dev = "png", optipng = "-o7"}
tar_load(traceplot_0)
print(traceplot_0)
```
```{r improved_rhat}
tar_load(ag_mcmc0)
aa <- do.call(abind, c(ag_mcmc0, list(along=3)))
aa2 <- aperm(aa,c(1,3,2), resize=TRUE)
monitor(aa2)
```
```{r bayes_pairs, fig.width=10, fig.height=10, echo=FALSE, eval=FALSE}
pairs(as.matrix(ag_mcmc0), gap = 0, pch = ".")
```
```{r bayes_pairs2, fig.width=10, fig.height=10, cache=TRUE}
tar_load(mc_pairsplots_0)
knitr::include_graphics('pix/mcmc_pairs_0.png')
```
Contour levels are: 50%, 80% 90%, 95% (largest) highest posterior density regions.
## Treeblock model (12 parameters)
The model that samples over the 'tree block' (sample of phylogeny reconstructions)
```{r traceplot_tb, fig.width=8, fig.height=8, dev = "png", optipng = "-o7"}
tar_load(traceplot_tb)
print(traceplot_tb)
```
```{r improved_rhat_tb}
tar_load(ag_mcmc_tb)
aa <- do.call(abind, c(ag_mcmc_tb, list(along=3)))
aa2 <- aperm(aa,c(1,3,2), resize=TRUE)
monitor(aa2)
```
```{r bayes_pairs_tb, fig.width=10, fig.height=10, cache=TRUE}
tar_load(mc_pairsplots_tb)
knitr::include_graphics('pix/mcmc_pairs_tb.png')
```
## Full model (24 parameters)
First half:
```{r traceplot_full1, fig.width=8, fig.height=8, dev = "png", optipng = "-o7"}
tar_load(traceplot_full1)
print(traceplot_full1)
```
Second half:
```{r traceplot_full2, fig.width=8, fig.height=8, dev = "png", optipng = "-o7"}
tar_load(traceplot_full2)
print(traceplot_full2)
```
```{r improved_rhat_full}
tar_load(ag_mcmc_full)
aa <- do.call(abind, c(ag_mcmc_full, list(along=3)))
aa2 <- aperm(aa,c(1,3,2), resize=TRUE)
monitor(aa2)
```
```{r bayes_pairs_full, fig.width=10, fig.height=10, cache=TRUE}
tar_load(mc_pairsplots_full)
knitr::include_graphics('pix/mcmc_pairs_full.png')
```
## No gain/loss priors
```{r traceplot_tb_nogainloss, fig.width=8, fig.height=8, dev = "png", optipng = "-o7"}
tar_load(traceplot_tb_nogainloss)
print(traceplot_tb_nogainloss)
```
```{r improved_rhat_tb_nogainloss}
tar_load(ag_mcmc_tb_nogainloss)
aa <- do.call(abind, c(ag_mcmc_tb_nogainloss, list(along=3)))
aa2 <- aperm(aa,c(1,3,2), resize=TRUE)
monitor(aa2)
```
```{r bayes_pairs_tb_nogainloss, fig.width=10, fig.height=10, cache=TRUE}
tar_load(mc_pairsplots_tb_nogainloss)
knitr::include_graphics('pix/mcmc_pairs_tb_nogainloss.png')
```
# References