/
tutorial.html
1458 lines (1427 loc) · 96.6 KB
/
tutorial.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<!-- Generated by pkgdown: do not edit by hand --><html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>admixr - Tutorial • admixr</title>
<!-- jquery --><script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js" integrity="sha256-CSXorXvZcTkaix6Yvo6HppcZGetbYMGWSFlBw8HfCJo=" crossorigin="anonymous"></script><!-- Bootstrap --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/css/bootstrap.min.css" integrity="sha256-bZLfwXAP04zRMK2BjiO8iu9pf4FbLqX6zitd+tIvLhE=" crossorigin="anonymous">
<script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/js/bootstrap.min.js" integrity="sha256-nuL8/2cJ5NDSSwnKD8VqreErSWHtnEP9E7AySL+1ev4=" crossorigin="anonymous"></script><!-- bootstrap-toc --><link rel="stylesheet" href="../bootstrap-toc.css">
<script src="../bootstrap-toc.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous">
<!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- pkgdown --><link href="../pkgdown.css" rel="stylesheet">
<script src="../pkgdown.js"></script><meta property="og:title" content="admixr - Tutorial">
<meta property="og:description" content="admixr">
<!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]-->
</head>
<body data-spy="scroll" data-target="#toc">
<div class="container template-article">
<header><div class="navbar navbar-default navbar-fixed-top" role="navigation">
<div class="container">
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<span class="navbar-brand">
<a class="navbar-link" href="../index.html">admixr</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Released version">0.8.0</span>
</span>
</div>
<div id="navbar" class="navbar-collapse collapse">
<ul class="nav navbar-nav">
<li>
<a href="../index.html">
<span class="fas fa fas fa-home fa-lg"></span>
</a>
</li>
<li>
<a href="../reference/index.html">Reference</a>
</li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">
Articles
<span class="caret"></span>
</a>
<ul class="dropdown-menu" role="menu">
<li>
<a href="../articles/tutorial.html">admixr - Tutorial</a>
</li>
</ul>
</li>
<li>
<a href="../news/index.html">Changelog</a>
</li>
</ul>
<ul class="nav navbar-nav navbar-right">
<li>
<a href="https://github.com/bodkan/admixr/">
<span class="fab fa fab fa-github fa-lg"></span>
</a>
</li>
</ul>
</div>
<!--/.nav-collapse -->
</div>
<!--/.container -->
</div>
<!--/.navbar -->
</header><script src="tutorial_files/header-attrs-2.2/header-attrs.js"></script><div class="row">
<div class="col-md-9 contents">
<div class="page-header toc-ignore">
<h1 data-toc-skip>admixr - Tutorial</h1>
<h4 class="author">Martin Petr</h4>
<h4 class="date">2020-06-14</h4>
<small class="dont-index">Source: <a href="https://github.com/bodkan/admixr/blob/master/vignettes/tutorial.Rmd"><code>vignettes/tutorial.Rmd</code></a></small>
<div class="hidden name"><code>tutorial.Rmd</code></div>
</div>
<div id="introduction" class="section level2">
<h2 class="hasAnchor">
<a href="#introduction" class="anchor"></a>Introduction</h2>
<p><a href="https://github.com/DReichLab/AdmixTools/">ADMIXTOOLS</a> is a widely used software package for calculating admixture statistics and testing population admixture hypotheses.</p>
<p>A typical ADMIXTOOLS workflow generally involves a combination of <code>sed</code>/<code>awk</code>/shell scripting and manual editing to create text configuration files. These are then passed as command-line arguments to one of the ADMIXTOOLS commands, and control how to run a particular analysis. The results are then redirected to another file, which has to be parsed by the user to extract values of interest, often using command-line utilities again or (worse) by manual copy-pasting. Finally, the processed results are analysed in R, Excel or another program.</p>
<p>This workflow can be a little cumbersome, especially if one wants to explore many hypotheses involving different combinations of populations. Most importantly, however, it makes it difficult to conduct reproducible research, as it is nearly impossible to construct fully automated “pipelines” that don’t require user intervention.</p>
<p>This R package makes it possible to perform all stages of ADMIXTOOLS analyses entirely from R, completely removing the need for “low level” configuration of individual ADMIXTOOLS programs.</p>
</div>
<div id="installation" class="section level2">
<h2 class="hasAnchor">
<a href="#installation" class="anchor"></a>Installation</h2>
<p><strong>Note that in order to use the <em>admixr</em> package, you need a working installation of ADMIXTOOLS!</strong> You can find installation instructions <a href="https://github.com/DReichLab/AdmixTools/blob/master/README.INSTALL">here</a>. The software runs on Linux and macOS and these are the two systems that <em>admixr</em> is tested on.</p>
<p><strong>Furthermore, you need to make sure that R can find ADMIXTOOLS binaries on the <code>$PATH</code>.</strong> If this is not the case, running <code><a href="https://rdrr.io/pkg/admixr/man">library(admixr)</a></code> will show a warning message with instructions on how to fix this.</p>
<p>To install <em>admixr</em> from GitHub you need to install the package <code>devtools</code> first. To do this, you can simply run (in R):</p>
<div class="sourceCode" id="cb1"><html><body><pre class="r"><span class="fu"><a href="https://rdrr.io/r/utils/install.packages.html">install.packages</a></span>(<span class="st">"devtools"</span>)
<span class="kw pkg">devtools</span><span class="kw ns">::</span><span class="fu"><a href="https://rdrr.io/pkg/devtools/man/remote-reexports.html">install_github</a></span>(<span class="st">"bodkan/admixr"</span>)</pre></body></html></div>
<p>Furthermore, if you want to follow the examples in this vignette, you will need the <a href="https://www.tidyverse.org">tidyverse</a> collection of packages for data manipulation and plotting, which you can install with:</p>
<div class="sourceCode" id="cb2"><html><body><pre class="r"><span class="fu"><a href="https://rdrr.io/r/utils/install.packages.html">install.packages</a></span>(<span class="st">"tidyverse"</span>)</pre></body></html></div>
<p>You definitely don’t need tidyverse for working with <em>admixr</em> but it really makes data manipulation and plotting much easier. I recommend at least giving it a shot.</p>
<p>When everything is ready, you can run the following code to load both packages:</p>
<div class="sourceCode" id="cb3"><html><body><pre class="r"><span class="fu"><a href="https://rdrr.io/r/base/library.html">library</a></span>(<span class="no">admixr</span>)
<span class="fu"><a href="https://rdrr.io/r/base/library.html">library</a></span>(<span class="no">tidyverse</span>)</pre></body></html></div>
</div>
<div id="a-note-about-eigenstrat-format" class="section level2">
<h2 class="hasAnchor">
<a href="#a-note-about-eigenstrat-format" class="anchor"></a>A note about EIGENSTRAT format</h2>
<p>ADMIXTOOLS software uses a peculiar set of genetic file formats, which may seem strange if you are used to working with <a href="http://samtools.github.io/hts-specs/VCFv4.3.pdf">VCF files</a>. However, the basic idea remains the same: we want to store and access SNP data (REF/ALT alleles) of a set of individuals at a defined set of genomic positions.</p>
<p>EIGENSTRAT datasets always contain three kinds of files:</p>
<ul>
<li>
<code>ind</code> file - specifies a unique name, sex (optional - can be simply “U” for “undefined”) and label (such as population assignment) of each sample;</li>
<li>
<code>snp</code> file - specifies the positions of SNPs, REF/ALT alleles etc.;</li>
<li>
<code>geno</code> file - contains SNP data (one row per site, one character per sample) in a dense string-based format:
<ul>
<li>0: individual is homozygous ALT</li>
<li>1: individual is a heterozygote</li>
<li>2: individual is homozygous REF</li>
<li>9: missing data</li>
</ul>
</li>
</ul>
<p>Therefore, a VCF file is essentially a combination of all three files in a single package.</p>
<p>Let’s first download a small testing SNP dataset using a built-in <em>admixr</em> function <code><a href="../reference/download_data.html">download_data()</a></code>. This function downloads the data into a temporary directory (you can specify the destination using its <code>dirname</code> argument, in case you want to place it elsewhere). In addition to this, the function returns a shared path/prefix of the whole dataset.</p>
<div class="sourceCode" id="cb4"><html><body><pre class="r">(prefix &lt;- download_data())
[1] "/var/folders/t7/9gjtb6m92flbnp930618vt3r0000gn/T//RtmpoZnOrt/snps/snps"</pre></body></html></div>
<p>We can verify that there are indeed three files with this prefix:</p>
<div class="sourceCode" id="cb5"><html><body><pre class="r">list.files(path = dirname(prefix), pattern = basename(prefix), full.names = TRUE)
[1] "/var/folders/t7/9gjtb6m92flbnp930618vt3r0000gn/T//RtmpoZnOrt/snps/snps.geno"
[2] "/var/folders/t7/9gjtb6m92flbnp930618vt3r0000gn/T//RtmpoZnOrt/snps/snps.ind"
[3] "/var/folders/t7/9gjtb6m92flbnp930618vt3r0000gn/T//RtmpoZnOrt/snps/snps.snp" </pre></body></html></div>
<p>Let’s look at their contents:</p>
<div id="ind-file" class="section level4">
<h4 class="hasAnchor">
<a href="#ind-file" class="anchor"></a><code>ind</code> file</h4>
<pre><code>Chimp U Chimp
Mbuti U Mbuti
Yoruba U Yoruba
Khomani_San U Khomani_San
Han U Han
Dinka U Dinka
Sardinian U Sardinian
Papuan U Papuan
French U French
Vindija U Vindija
Altai U Altai
Denisova U Denisova</code></pre>
<p>The first column (sample name) and the third column (population label) are generally not the same (sample names often have numerical suffixes to make them unique, etc.), but were kept the same here for simplicity. Importantly, when specifying population/sample names in <em>admixr</em> functions, the information in the third column is what is used. For example, if you have individuals such as “French1”, “French2”, “French3” in the first column of an <code>ind</code> file, all three sharing a “French” population label in the third column, specifying “French” in an <em>admixr</em> function will combine all three samples in a single population, instead of working with each individual separately.</p>
</div>
<div id="snp-file-first-3-lines" class="section level4">
<h4 class="hasAnchor">
<a href="#snp-file-first-3-lines" class="anchor"></a><code>snp</code> file (first 3 lines)</h4>
<pre><code>1_832756 1 0.008328 832756 T G
1_838931 1 0.008389 838931 A C
1_843249 1 0.008432 843249 A T</code></pre>
<p>The columns of this file are, in order:</p>
<ol style="list-style-type: decimal">
<li>SNP string ID</li>
<li>chromosome</li>
<li>genetic distance</li>
<li>position along a chromosome</li>
<li>reference allele</li>
<li>alternative allele</li>
</ol>
</div>
<div id="geno-file-first-3-lines" class="section level4">
<h4 class="hasAnchor">
<a href="#geno-file-first-3-lines" class="anchor"></a><code>geno</code> file (first 3 lines)</h4>
<pre><code>902021012000
922221211222
922222122222</code></pre>
<p>Each row is one genomic site, each column is a genotype in one individual.</p>
</div>
</div>
<div id="philosophy-of-admixr" class="section level2">
<h2 class="hasAnchor">
<a href="#philosophy-of-admixr" class="anchor"></a>Philosophy of <em>admixr</em>
</h2>
<p>The goal of <em>admixr</em> is to make ADMIXTOOLS analyses as trivial to run as possible, without having to worry about par/pop/left/right configuration files (as they are known in the jargon of ADMIXTOOLS) and other low-level details.</p>
<p>The only interface between you and ADMIXTOOLS is the following set of R functions:</p>
<ul>
<li><code><a href="../reference/f4ratio.html">d()</a></code></li>
<li><code><a href="../reference/f4ratio.html">f4()</a></code></li>
<li><code><a href="../reference/f4ratio.html">f4ratio()</a></code></li>
<li><code><a href="../reference/f4ratio.html">f3()</a></code></li>
<li><code><a href="../reference/qpAdm.html">qpAdm()</a></code></li>
<li><code><a href="../reference/qpWave.html">qpWave()</a></code></li>
</ul>
<p>Anything that would normally require <a href="https://gaworkshop.readthedocs.io/en/latest/contents/06_f3/f3.html">dozens of lines of shell scripts</a> can often be accomplished by running a single line of R code.</p>
</div>
<div id="internal-representation-of-eigenstrat-data" class="section level2">
<h2 class="hasAnchor">
<a href="#internal-representation-of-eigenstrat-data" class="anchor"></a>Internal representation of EIGENSTRAT data</h2>
<p>As we saw above, each EIGENSTRAT dataset has three components. The way this data is internally represented in <em>admixr</em> is using a small S3 R object created using the <code>eigenstrat</code> constructor function. This function accepts the path and prefix of a trio of EIGENSTRAT snp/ind/geno files and returns an R object of the class <code>EIGENSTRAT</code>:</p>
<div class="sourceCode" id="cb9"><html><body><pre class="r"><span class="no">snps</span> <span class="kw">&lt;-</span> <span class="fu"><a href="../reference/eigenstrat.html">eigenstrat</a></span>(<span class="no">prefix</span>)</pre></body></html></div>
<div class="sourceCode" id="cb10"><html><body><pre class="r"><span class="no">snps</span>
<span class="co">#> EIGENSTRAT object</span>
<span class="co">#> =================</span>
<span class="co">#> components:</span>
<span class="co">#> ind file: /var/folders/t7/9gjtb6m92flbnp930618vt3r0000gn/T//RtmpoZnOrt/snps/snps.ind</span>
<span class="co">#> snp file: /var/folders/t7/9gjtb6m92flbnp930618vt3r0000gn/T//RtmpoZnOrt/snps/snps.snp</span>
<span class="co">#> geno file: /var/folders/t7/9gjtb6m92flbnp930618vt3r0000gn/T//RtmpoZnOrt/snps/snps.geno</span></pre></body></html></div>
<p>This object encapsulates the paths to all three EIGENSTRAT components and makes it easy to pass the data to different <em>admixr</em> functions.</p>
<p>The following couple of sections describe how to use the <em>admixr</em> package using simple example analyses.</p>
</div>
<div id="d-statistic" class="section level2">
<h2 class="hasAnchor">
<a href="#d-statistic" class="anchor"></a>D statistic</h2>
<p>Let’s say we are interested in the following question: <em>"Which populations today show evidence of Neanderthal admixture?"</em></p>
<p>One way of looking at this is using the following D statistic: <span class="math display">\[D(\textrm{present-day human W}, \textrm{African}, \textrm{Neanderthal}, \textrm{Chimp}).\]</span></p>
<p><span class="math inline">\(D\)</span> statistics are based on comparing the proportions of BABA and ABBA site patterns observed in the data:</p>
<p><span class="math display">\[D = \frac{\textrm{# BABA sites - # ABBA sites}}{\textrm{# BABA sites + # ABBA sites}}.\]</span></p>
<p>Significant departure of <span class="math inline">\(D\)</span> from zero indicates an excess of allele sharing between the first and the third population (positive <span class="math inline">\(D\)</span>), or an excess of allele sharing between the second and the third population (negative <span class="math inline">\(D\)</span>). If we get <span class="math inline">\(D\)</span> that is not significantly different from 0, this suggests that the first and second populations form a clade, and don’t differ in the rate of allele sharing with the third population (this is the null hypothesis that the data is compared against).</p>
<p>Therefore, our <span class="math inline">\(D\)</span> statistic above tests whether some modern humans today admixed with Neanderthals, which would increase their genetic affinity to this archaic group compared to Africans (whose ancestors never met Neanderthals).</p>
<p>Let’s save some population names first to make our code more concise:</p>
<div class="sourceCode" id="cb11"><html><body><pre class="r"><span class="no">pops</span> <span class="kw">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html">c</a></span>(<span class="st">"French"</span>, <span class="st">"Sardinian"</span>, <span class="st">"Han"</span>, <span class="st">"Papuan"</span>, <span class="st">"Khomani_San"</span>, <span class="st">"Mbuti"</span>, <span class="st">"Dinka"</span>)</pre></body></html></div>
<p>Using the <em>admixr</em> package we can then calculate our <span class="math inline">\(D\)</span> statistic simply by running:</p>
<div class="sourceCode" id="cb12"><html><body><pre class="r"><span class="no">result</span> <span class="kw">&lt;-</span> <span class="fu"><a href="../reference/f4ratio.html">d</a></span>(<span class="kw">W</span> <span class="kw">=</span> <span class="no">pops</span>, <span class="kw">X</span> <span class="kw">=</span> <span class="st">"Yoruba"</span>, <span class="kw">Y</span> <span class="kw">=</span> <span class="st">"Vindija"</span>, <span class="kw">Z</span> <span class="kw">=</span> <span class="st">"Chimp"</span>, <span class="kw">data</span> <span class="kw">=</span> <span class="no">snps</span>)</pre></body></html></div>
<p>The result is the following data frame:</p>
<div class="sourceCode" id="cb13"><html><body><pre class="r"><span class="fu"><a href="https://rdrr.io/r/utils/head.html">head</a></span>(<span class="no">result</span>)</pre></body></html></div>
<table class="table">
<thead><tr class="header">
<th align="left">W</th>
<th align="left">X</th>
<th align="left">Y</th>
<th align="left">Z</th>
<th align="right">D</th>
<th align="right">stderr</th>
<th align="right">Zscore</th>
<th align="right">BABA</th>
<th align="right">ABBA</th>
<th align="right">nsnps</th>
</tr></thead>
<tbody>
<tr class="odd">
<td align="left">French</td>
<td align="left">Yoruba</td>
<td align="left">Vindija</td>
<td align="left">Chimp</td>
<td align="right">0.0313</td>
<td align="right">0.006933</td>
<td align="right">4.510</td>
<td align="right">15802</td>
<td align="right">14844</td>
<td align="right">487753</td>
</tr>
<tr class="even">
<td align="left">Sardinian</td>
<td align="left">Yoruba</td>
<td align="left">Vindija</td>
<td align="left">Chimp</td>
<td align="right">0.0287</td>
<td align="right">0.006792</td>
<td align="right">4.222</td>
<td align="right">15729</td>
<td align="right">14852</td>
<td align="right">487646</td>
</tr>
<tr class="odd">
<td align="left">Han</td>
<td align="left">Yoruba</td>
<td align="left">Vindija</td>
<td align="left">Chimp</td>
<td align="right">0.0278</td>
<td align="right">0.006609</td>
<td align="right">4.199</td>
<td align="right">15780</td>
<td align="right">14928</td>
<td align="right">487925</td>
</tr>
<tr class="even">
<td align="left">Papuan</td>
<td align="left">Yoruba</td>
<td align="left">Vindija</td>
<td align="left">Chimp</td>
<td align="right">0.0457</td>
<td align="right">0.006571</td>
<td align="right">6.953</td>
<td align="right">16131</td>
<td align="right">14721</td>
<td align="right">487694</td>
</tr>
<tr class="odd">
<td align="left">Khomani_San</td>
<td align="left">Yoruba</td>
<td align="left">Vindija</td>
<td align="left">Chimp</td>
<td align="right">0.0066</td>
<td align="right">0.006292</td>
<td align="right">1.051</td>
<td align="right">16168</td>
<td align="right">15955</td>
<td align="right">487564</td>
</tr>
<tr class="even">
<td align="left">Mbuti</td>
<td align="left">Yoruba</td>
<td align="left">Vindija</td>
<td align="left">Chimp</td>
<td align="right">-0.0005</td>
<td align="right">0.006345</td>
<td align="right">-0.074</td>
<td align="right">15751</td>
<td align="right">15766</td>
<td align="right">487642</td>
</tr>
</tbody>
</table>
<p>We can see that in addition to the specified population names, the output table contains additional columns:</p>
<ul>
<li>
<code>D</code> - <span class="math inline">\(D\)</span> statistic value</li>
<li>
<code>stderr</code> - standard error of the <span class="math inline">\(D\)</span> statistic calculated using the block jackknife</li>
<li>
<code>Zscore</code> - <span class="math inline">\(Z\)</span>-score value (number of standard errors the <span class="math inline">\(D\)</span> is from 0, i.e. how strongly we reject the null hypothesis of no admixture)</li>
<li>
<code>BABA</code>, <code>ABBA</code> - counts of observed site patterns</li>
<li>
<code>nsnps</code> - number of SNPs used for a given calculation</li>
</ul>
<p>While we could certainly make inferences by looking at the <span class="math inline">\(Z\)</span>-scores, tables in general are not the best representation of this kind of data, especially as the number of samples increases. Instead, we can use the <a href="https://ggplot2.tidyverse.org"><code>ggplot2</code></a> package to plot the results:</p>
<div class="sourceCode" id="cb14"><html><body><pre class="r"><span class="fu">ggplot</span>(<span class="no">result</span>, <span class="fu">aes</span>(<span class="fu">fct_reorder</span>(<span class="no">W</span>, <span class="no">D</span>), <span class="no">D</span>, <span class="kw">color</span> <span class="kw">=</span> <span class="fu"><a href="https://rdrr.io/r/base/MathFun.html">abs</a></span>(<span class="no">Zscore</span>) <span class="kw">></span> <span class="fl">2</span>)) +
<span class="fu">geom_point</span>() +
<span class="fu">geom_hline</span>(<span class="kw">yintercept</span> <span class="kw">=</span> <span class="fl">0</span>, <span class="kw">linetype</span> <span class="kw">=</span> <span class="fl">2</span>) +
<span class="fu">geom_errorbar</span>(<span class="fu">aes</span>(<span class="kw">ymin</span> <span class="kw">=</span> <span class="no">D</span> - <span class="fl">2</span> * <span class="no">stderr</span>, <span class="kw">ymax</span> <span class="kw">=</span> <span class="no">D</span> + <span class="fl">2</span> * <span class="no">stderr</span>))</pre></body></html></div>
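<p><img src="tutorial_files/figure-html/d_plot-1.png" width="672" alt="D statistic for each population W, ordered by D, with error bars of two standard errors and a dashed line at zero; points are colored by whether the absolute Z-score exceeds 2"></p>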
<p>(If you want to know more about data analysis using R, including plotting with ggplot2, I highly recommend <a href="http://r4ds.had.co.nz">this</a> free book.)</p>
<p>We can see that the <span class="math inline">\(D\)</span> values for Africans are not significantly different from 0, meaning that the data is consistent with the null hypothesis of no Neanderthal ancestry in Africans. On the other hand, the test rejects the null hypothesis for all non-Africans today, suggesting that Neanderthals admixed with the ancestors of present-day non-Africans.</p>
</div>
<div id="f4-statistic" class="section level2">
<h2 class="hasAnchor">
<a href="#f4-statistic" class="anchor"></a>f4 statistic</h2>
<p>An alternative way of addressing the previous question is to use the <span class="math inline">\(f_4\)</span> statistic, which is very similar to the <span class="math inline">\(D\)</span> statistic and can be calculated as:</p>
<p><span class="math display">\[ f_4 = \frac{\textrm{# BABA sites - # ABBA sites}}{\textrm{# sites}}\]</span></p>
<p>Again, significant departure of <span class="math inline">\(f_4\)</span> from 0 can be interpreted as evidence of gene flow.</p>
<p>To repeat the previous analysis using the <span class="math inline">\(f_4\)</span> statistic, we can run the function <code><a href="../reference/f4ratio.html">f4()</a></code>:</p>
<div class="sourceCode" id="cb15"><html><body><pre class="r"><span class="no">result</span> <span class="kw">&lt;-</span> <span class="fu"><a href="../reference/f4ratio.html">f4</a></span>(<span class="kw">W</span> <span class="kw">=</span> <span class="no">pops</span>, <span class="kw">X</span> <span class="kw">=</span> <span class="st">"Yoruba"</span>, <span class="kw">Y</span> <span class="kw">=</span> <span class="st">"Vindija"</span>, <span class="kw">Z</span> <span class="kw">=</span> <span class="st">"Chimp"</span>, <span class="kw">data</span> <span class="kw">=</span> <span class="no">snps</span>)</pre></body></html></div>
<div class="sourceCode" id="cb16"><html><body><pre class="r"><span class="fu"><a href="https://rdrr.io/r/utils/head.html">head</a></span>(<span class="no">result</span>)</pre></body></html></div>
<table class="table">
<thead><tr class="header">
<th align="left">W</th>
<th align="left">X</th>
<th align="left">Y</th>
<th align="left">Z</th>
<th align="right">f4</th>
<th align="right">stderr</th>
<th align="right">Zscore</th>
<th align="right">BABA</th>
<th align="right">ABBA</th>
<th align="right">nsnps</th>
</tr></thead>
<tbody>
<tr class="odd">
<td align="left">French</td>
<td align="left">Yoruba</td>
<td align="left">Vindija</td>
<td align="left">Chimp</td>
<td align="right">0.001965</td>
<td align="right">0.000437</td>
<td align="right">4.501</td>
<td align="right">15802</td>
<td align="right">14844</td>
<td align="right">487753</td>
</tr>
<tr class="even">
<td align="left">Sardinian</td>
<td align="left">Yoruba</td>
<td align="left">Vindija</td>
<td align="left">Chimp</td>
<td align="right">0.001798</td>
<td align="right">0.000427</td>
<td align="right">4.209</td>
<td align="right">15729</td>
<td align="right">14852</td>
<td align="right">487646</td>
</tr>
<tr class="odd">
<td align="left">Han</td>
<td align="left">Yoruba</td>
<td align="left">Vindija</td>
<td align="left">Chimp</td>
<td align="right">0.001746</td>
<td align="right">0.000418</td>
<td align="right">4.178</td>
<td align="right">15780</td>
<td align="right">14928</td>
<td align="right">487925</td>
</tr>
<tr class="even">
<td align="left">Papuan</td>
<td align="left">Yoruba</td>
<td align="left">Vindija</td>
<td align="left">Chimp</td>
<td align="right">0.002890</td>
<td align="right">0.000417</td>
<td align="right">6.924</td>
<td align="right">16131</td>
<td align="right">14721</td>
<td align="right">487694</td>
</tr>
<tr class="odd">
<td align="left">Khomani_San</td>
<td align="left">Yoruba</td>
<td align="left">Vindija</td>
<td align="left">Chimp</td>
<td align="right">0.000436</td>
<td align="right">0.000415</td>
<td align="right">1.051</td>
<td align="right">16168</td>
<td align="right">15955</td>
<td align="right">487564</td>
</tr>
<tr class="even">
<td align="left">Mbuti</td>
<td align="left">Yoruba</td>
<td align="left">Vindija</td>
<td align="left">Chimp</td>
<td align="right">-0.000030</td>
<td align="right">0.000410</td>
<td align="right">-0.074</td>
<td align="right">15751</td>
<td align="right">15766</td>
<td align="right">487642</td>
</tr>
</tbody>
</table>
<p>By comparing this result to the <span class="math inline">\(D\)</span> statistic analysis above, we can draw the same conclusions.</p>
<p>You might be wondering why we have both <span class="math inline">\(f_4\)</span> and <span class="math inline">\(D\)</span> if they are so similar. The truth is that <span class="math inline">\(f_4\)</span> is, among other things, directly informative about the amount of shared genetic drift (“branch length”) between pairs of populations, which is a very useful theoretical property. Other than that, it’s often a matter of personal preference and so <em>admixr</em> provides functions for calculating both.</p>
</div>
<div id="f4-ratio-statistic" class="section level2">
<h2 class="hasAnchor">
<a href="#f4-ratio-statistic" class="anchor"></a>f4-ratio statistic</h2>
<p>Now we know that non-Africans today carry <em>some</em> Neanderthal ancestry. But what if we want to know <em>how much</em> Neanderthal ancestry they have? What proportion of their genomes is of Neanderthal origin? To answer questions like this, we can use the <span class="math inline">\(f_4\)</span>-ratio statistic, which can be formulated in the following way (using the notation of <a href="http://www.genetics.org/content/192/3/1065">Patterson et al., 2012</a>, who formally described its properties).</p>
<p><span class="math display">\[f_4\textrm{-ratio} = \frac{f_4(A, O; X, C)}{f_4(A, O; B, C)}.\]</span></p>
<p>Using <code>admixr</code>, we can calculate <span class="math inline">\(f_4\)</span>-ratios using the following code (<code>X</code> being a vector of samples which we want to estimate the Neanderthal ancestry in):</p>
<div class="sourceCode" id="cb17"><html><body><pre class="r"><span class="no">result</span> <span class="kw"><-</span> <span class="fu"><a href="../reference/f4ratio.html">f4ratio</a></span>(<span class="kw">X</span> <span class="kw">=</span> <span class="no">pops</span>, <span class="kw">A</span> <span class="kw">=</span> <span class="st">"Altai"</span>, <span class="kw">B</span> <span class="kw">=</span> <span class="st">"Vindija"</span>, <span class="kw">C</span> <span class="kw">=</span> <span class="st">"Yoruba"</span>, <span class="kw">O</span> <span class="kw">=</span> <span class="st">"Chimp"</span>, <span class="kw">data</span> <span class="kw">=</span> <span class="no">snps</span>)</pre></body></html></div>
<p>The ancestry proportion (a number between 0 and 1) is given in the <code>alpha</code> column:</p>
<div class="sourceCode" id="cb18"><html><body><pre class="r"><span class="fu"><a href="https://rdrr.io/r/utils/head.html">head</a></span>(<span class="no">result</span>)</pre></body></html></div>
<table class="table">
<thead><tr class="header">
<th align="left">A</th>
<th align="left">B</th>
<th align="left">X</th>
<th align="left">C</th>
<th align="left">O</th>
<th align="right">alpha</th>
<th align="right">stderr</th>
<th align="right">Zscore</th>
</tr></thead>
<tbody>
<tr class="odd">
<td align="left">Altai</td>
<td align="left">Vindija</td>
<td align="left">French</td>
<td align="left">Yoruba</td>
<td align="left">Chimp</td>
<td align="right">0.023774</td>
<td align="right">0.006176</td>
<td align="right">3.850</td>
</tr>
<tr class="even">
<td align="left">Altai</td>
<td align="left">Vindija</td>
<td align="left">Sardinian</td>
<td align="left">Yoruba</td>
<td align="left">Chimp</td>
<td align="right">0.024468</td>
<td align="right">0.006071</td>
<td align="right">4.031</td>
</tr>
<tr class="odd">
<td align="left">Altai</td>
<td align="left">Vindija</td>
<td align="left">Han</td>
<td align="left">Yoruba</td>
<td align="left">Chimp</td>
<td align="right">0.022117</td>
<td align="right">0.005892</td>
<td align="right">3.754</td>
</tr>
<tr class="even">
<td align="left">Altai</td>
<td align="left">Vindija</td>
<td align="left">Papuan</td>
<td align="left">Yoruba</td>
<td align="left">Chimp</td>
<td align="right">0.037311</td>
<td align="right">0.005812</td>
<td align="right">6.420</td>
</tr>
<tr class="odd">
<td align="left">Altai</td>
<td align="left">Vindija</td>
<td align="left">Khomani_San</td>
<td align="left">Yoruba</td>
<td align="left">Chimp</td>
<td align="right">0.003909</td>
<td align="right">0.005913</td>
<td align="right">0.661</td>
</tr>
<tr class="even">
<td align="left">Altai</td>
<td align="left">Vindija</td>
<td align="left">Mbuti</td>
<td align="left">Yoruba</td>
<td align="left">Chimp</td>
<td align="right">0.000319</td>
<td align="right">0.005717</td>
<td align="right">0.056</td>
</tr>
</tbody>
</table>
<div class="sourceCode" id="cb19"><html><body><pre class="r"><span class="fu">ggplot</span>(<span class="no">result</span>, <span class="fu">aes</span>(<span class="fu">fct_reorder</span>(<span class="no">X</span>, <span class="no">alpha</span>), <span class="no">alpha</span>, <span class="kw">color</span> <span class="kw">=</span> <span class="fu"><a href="https://rdrr.io/r/base/MathFun.html">abs</a></span>(<span class="no">Zscore</span>) <span class="kw">></span> <span class="fl">2</span>)) +
<span class="fu">geom_point</span>() +
<span class="fu">geom_errorbar</span>(<span class="fu">aes</span>(<span class="kw">ymin</span> <span class="kw">=</span> <span class="no">alpha</span> - <span class="fl">2</span> * <span class="no">stderr</span>, <span class="kw">ymax</span> <span class="kw">=</span> <span class="no">alpha</span> + <span class="fl">2</span> * <span class="no">stderr</span>)) +
<span class="fu">geom_hline</span>(<span class="kw">yintercept</span> <span class="kw">=</span> <span class="fl">0</span>, <span class="kw">linetype</span> <span class="kw">=</span> <span class="fl">2</span>) +
<span class="fu">labs</span>(<span class="kw">y</span> <span class="kw">=</span> <span class="st">"Neandertal ancestry proportion"</span>, <span class="kw">x</span> <span class="kw">=</span> <span class="st">"present-day individual"</span>)</pre></body></html></div>
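<p><img src="tutorial_files/figure-html/f4ratio_plot-1.png" alt="Point plot of the estimated Neandertal ancestry proportion (alpha) for each present-day individual, with error bars of two standard errors and a dashed line at zero" width="672"></p>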
<p>We can make several observations:</p>
<ul>
<li>Again, we don’t see any significant Neanderthal ancestry in present-day Africans (proportion is consistent with 0%), which is what we confirmed using <span class="math inline">\(D\)</span> and <span class="math inline">\(f_4\)</span> above.</li>
<li>Present-day non-Africans carry between 2% and 3% Neanderthal ancestry.</li>
<li>We see a much higher proportion of Neanderthal ancestry in people from Papua New Guinea - close to 4%. This is consistent with earlier studies that suggest additional archaic admixture events in the ancestors of present-day Papuans.</li>
</ul>
</div>
<div id="f3-statistic" class="section level2">
<h2 class="hasAnchor">
<a href="#f3-statistic" class="anchor"></a>f3 statistic</h2>
<p>The <span class="math inline">\(f_3\)</span> statistic, also known as the 3-population statistic, is useful whenever we want to:</p>
<ol style="list-style-type: decimal">
<li>Estimate the branch length (shared genetic drift) between a pair of populations <span class="math inline">\(A\)</span> and <span class="math inline">\(B\)</span> with respect to a common outgroup <span class="math inline">\(C\)</span>. In this case, the higher the <span class="math inline">\(f_3\)</span> value, the longer the shared evolutionary time between <span class="math inline">\(A\)</span> and <span class="math inline">\(B\)</span>.</li>
<li>Test whether population <span class="math inline">\(C\)</span> is a mixture of two populations <span class="math inline">\(A\)</span> and <span class="math inline">\(B\)</span>. Significantly negative values of the <span class="math inline">\(f_3\)</span> statistic are then statistical evidence of this admixture.</li>
</ol>
<p>As an example, imagine we are interested in relative divergence times between pairs of present-day human populations, and want to know in which approximate order they split off from each other. To address this problem, we could use the <span class="math inline">\(f_3\)</span> statistic by fixing the <span class="math inline">\(C\)</span> outgroup as San, and calculating pairwise <span class="math inline">\(f_3\)</span> statistics between all present-day modern humans.</p>
<div class="sourceCode" id="cb20"><html><body><pre class="r"><span class="no">pops</span> <span class="kw"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html">c</a></span>(<span class="st">"French"</span>, <span class="st">"Sardinian"</span>, <span class="st">"Han"</span>, <span class="st">"Papuan"</span>, <span class="st">"Mbuti"</span>, <span class="st">"Dinka"</span>, <span class="st">"Yoruba"</span>)
<span class="no">result</span> <span class="kw"><-</span> <span class="fu"><a href="../reference/f4ratio.html">f3</a></span>(<span class="kw">A</span> <span class="kw">=</span> <span class="no">pops</span>, <span class="kw">B</span> <span class="kw">=</span> <span class="no">pops</span>, <span class="kw">C</span> <span class="kw">=</span> <span class="st">"Khomani_San"</span>, <span class="kw">data</span> <span class="kw">=</span> <span class="no">snps</span>)</pre></body></html></div>
<div class="sourceCode" id="cb21"><html><body><pre class="r"><span class="fu"><a href="https://rdrr.io/r/utils/head.html">head</a></span>(<span class="no">result</span>)</pre></body></html></div>
<table class="table">
<thead><tr class="header">
<th align="left">A</th>
<th align="left">B</th>
<th align="left">C</th>
<th align="right">f3</th>
<th align="right">stderr</th>
<th align="right">Zscore</th>
<th align="right">nsnps</th>
</tr></thead>
<tbody>
<tr class="odd">
<td align="left">French</td>
<td align="left">French</td>
<td align="left">Khomani_San</td>
<td align="right">0.000000</td>
<td align="right">-1.000000</td>
<td align="right">0.000</td>
<td align="right">-1</td>
</tr>
<tr class="even">
<td align="left">French</td>
<td align="left">Sardinian</td>
<td align="left">Khomani_San</td>
<td align="right">0.353447</td>
<td align="right">0.012527</td>
<td align="right">28.215</td>
<td align="right">249760</td>
</tr>
<tr class="odd">
<td align="left">French</td>
<td align="left">Han</td>
<td align="left">Khomani_San</td>
<td align="right">0.316964</td>
<td align="right">0.011914</td>
<td align="right">26.604</td>
<td align="right">253158</td>
</tr>
<tr class="even">
<td align="left">French</td>
<td align="left">Papuan</td>
<td align="left">Khomani_San</td>
<td align="right">0.306962</td>
<td align="right">0.011708</td>
<td align="right">26.218</td>
<td align="right">251648</td>
</tr>
<tr class="odd">
<td align="left">French</td>
<td align="left">Mbuti</td>
<td align="left">Khomani_San</td>
<td align="right">0.119283</td>
<td align="right">0.008448</td>
<td align="right">14.119</td>
<td align="right">271501</td>
</tr>
<tr class="even">
<td align="left">French</td>
<td align="left">Dinka</td>
<td align="left">Khomani_San</td>
<td align="right">0.190141</td>
<td align="right">0.010049</td>
<td align="right">18.922</td>
<td align="right">276964</td>
</tr>
</tbody>
</table>
<div class="sourceCode" id="cb22"><html><body><pre class="r"><span class="co"># sort the population labels according to an increasing f3 value relative to Mbuti</span>
<span class="no">ordered</span> <span class="kw"><-</span> <span class="fu"><a href="https://rdrr.io/r/stats/filter.html">filter</a></span>(<span class="no">result</span>, <span class="no">A</span> <span class="kw">==</span> <span class="st">"Mbuti"</span>, <span class="no">B</span> <span class="kw">!=</span> <span class="st">"Mbuti"</span>) <span class="kw">%>%</span> <span class="fu">arrange</span>(<span class="no">f3</span>) <span class="kw">%>%</span> <span class="no">.</span><span class="kw">[[</span><span class="st">"B"</span>]] <span class="kw">%>%</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html">c</a></span>(<span class="st">"Mbuti"</span>)
<span class="co"># plot heatmap of pairwise f3 values</span>
<span class="no">result</span> <span class="kw">%>%</span>
<span class="fu"><a href="https://rdrr.io/r/stats/filter.html">filter</a></span>(<span class="no">A</span> <span class="kw">!=</span> <span class="no">B</span>) <span class="kw">%>%</span>
<span class="fu">mutate</span>(<span class="kw">A</span> <span class="kw">=</span> <span class="fu"><a href="https://rdrr.io/r/base/factor.html">factor</a></span>(<span class="no">A</span>, <span class="kw">levels</span> <span class="kw">=</span> <span class="no">ordered</span>),
<span class="kw">B</span> <span class="kw">=</span> <span class="fu"><a href="https://rdrr.io/r/base/factor.html">factor</a></span>(<span class="no">B</span>, <span class="kw">levels</span> <span class="kw">=</span> <span class="no">ordered</span>)) <span class="kw">%>%</span>
<span class="fu">ggplot</span>(<span class="fu">aes</span>(<span class="no">A</span>, <span class="no">B</span>)) + <span class="fu">geom_tile</span>(<span class="fu">aes</span>(<span class="kw">fill</span> <span class="kw">=</span> <span class="no">f3</span>))</pre></body></html></div>
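<p><img src="tutorial_files/figure-html/f3_plot-1.png" alt="Heatmap of pairwise f3 statistics between present-day populations, with population labels ordered by increasing f3 value" width="768"></p>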
<p>We can see that when we order the heatmap labels based on values of pairwise <span class="math inline">\(f_3\)</span> statistics, the (already known) order of population splits pops up nicely (i.e. San separated first, followed by Mbuti, etc.).</p>
</div>
<div id="qpwave-and-qpadm" class="section level2">
<h2 class="hasAnchor">
<a href="#qpwave-and-qpadm" class="anchor"></a>qpWave and qpAdm</h2>
<p>Both <em>qpWave</em> and <em>qpAdm</em> can be thought of as more complex and powerful extensions of the basic ideas behind a simple <span class="math inline">\(f_4\)</span> statistic. Building upon the <span class="math inline">\(f_4\)</span> theory and generalizing it, <em>qpWave</em> makes it possible to find the lowest number of “streams of ancestry” between two groups of populations that is consistent with the data. Extending the concept of <span class="math inline">\(f_4\)</span> statistics even further, <em>qpAdm</em> allows us to find the proportions of ancestry from a set of ancestral populations that contributed ancestry to our population of interest.</p>
<p>Unfortunately, both methods represent a rather advanced topic that still lacks proper documentation and beginner-friendly tutorials, and explaining them in detail is beyond the scope of this vignette. If you want to use them, it’s crucial that you read the official documentation describing the basic ideas of both methods (<a href="https://github.com/DReichLab/AdmixTools/blob/master/pdoc.pdf">distributed with ADMIXTOOLS</a>), <em>and</em> that you read the relevant supplementary sections of papers published by David Reich’s group. At the very least, I recommend reading:</p>
<ul>
<li><p>Note S6 of <em>“<a href="https://www.nature.com/articles/nature11258">Reconstructing Native American population history</a>”</em> by Reich et al. This paper first introduced the theoretical background of what later became <em>qpWave</em>.</p></li>
<li><p>Supplementary Information 10 of <em>“<a href="https://www.nature.com/articles/nature14317">Massive migration from the steppe was a source for Indo-European languages in Europe</a>”</em> by Haak et al., which gives a more concise overview of the <em>qpWave</em> method than S6 of Reich et al. 2012, and also introduces the <em>qpAdm</em> methodology for estimating admixture proportions.</p></li>
</ul>
<p>In the remainder of this section, I will assume that you are familiar with both methods, and will only explain how to use <em>admixr</em> for running them from R.</p>
<div id="qpwave" class="section level3">
<h3 class="hasAnchor">
<a href="#qpwave" class="anchor"></a><em>qpWave</em>
</h3>
<p>To run <em>qpWave</em>, you must provide a list of <em>left</em> and <em>right</em> populations (using the terminology of Haak et al. 2015 above). The aim of the method is to get an idea about the number of migration waves from <em>right</em> to <em>left</em> (with no back-migration from <em>left</em> to <em>right</em>!). This is done by estimating the rank of a matrix of all possible <span class="math inline">\(f_4\)</span> statistics</p>
<p><span class="math display">\[f_4(\textrm{left}_1, \textrm{left}_i; \textrm{right}_1, \textrm{right}_j),\]</span></p>
<p>where <span class="math inline">\(\textrm{left}_1\)</span> and <span class="math inline">\(\textrm{right}_1\)</span> are some fixed populations and the <span class="math inline">\(i\)</span> and <span class="math inline">\(j\)</span> indices run over all other possible choices of populations.</p>
<p>As an example, let’s try to find the number of admixture waves from <em>right</em> = {Yoruba, Mbuti, Altai} into <em>left</em> = {French, Sardinian, Han} populations. We can do this using the function <code><a href="../reference/qpWave.html">qpWave()</a></code>, setting its arguments appropriately:</p>
<div class="sourceCode" id="cb23"><html><body><pre class="r"><span class="no">result</span> <span class="kw"><-</span> <span class="fu"><a href="../reference/qpWave.html">qpWave</a></span>(
<span class="kw">left</span> <span class="kw">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html">c</a></span>(<span class="st">"French"</span>, <span class="st">"Sardinian"</span>, <span class="st">"Han"</span>),
<span class="kw">right</span> <span class="kw">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html">c</a></span>(<span class="st">"Altai"</span>, <span class="st">"Yoruba"</span>, <span class="st">"Mbuti"</span>),
<span class="kw">data</span> <span class="kw">=</span> <span class="no">snps</span>
)</pre></body></html></div>
<p>The <code><a href="../reference/qpWave.html">qpWave()</a></code> function returns a data frame which shows the results of a series of matrix rank tests. The <code>rank</code> column is the matrix rank tested, <code>df</code>, <code>chisq</code> and <code>tail</code> give the degrees of freedom, <span class="math inline">\(\chi^2\)</span> value and <span class="math inline">\(p\)</span>-value for the comparison with the saturated model (the <span class="math inline">\(p\)</span>-value then indicates which matrix rank is consistent with the data - see example below), and <code>dfdiff</code>, <code>chisqdiff</code> and <code>taildiff</code> give the same, but always comparing a model to the model with one rank less.</p>
<div class="sourceCode" id="cb24"><html><body><pre class="r"><span class="no">result</span></pre></body></html></div>
<table class="table">
<thead><tr class="header">
<th align="right">rank</th>
<th align="right">df</th>
<th align="right">chisq</th>
<th align="right">tail</th>
<th align="right">dfdiff</th>
<th align="right">chisqdiff</th>
<th align="right">taildiff</th>
</tr></thead>
<tbody>
<tr class="odd">
<td align="right">0</td>
<td align="right">4</td>
<td align="right">1.756</td>
<td align="right">0.7805106</td>
<td align="right">0</td>
<td align="right">0.000</td>
<td align="right">1.0000000</td>
</tr>
<tr class="even">
<td align="right">1</td>
<td align="right">1</td>
<td align="right">0.192</td>
<td align="right">0.6614968</td>
<td align="right">3</td>
<td align="right">1.564</td>
<td align="right">0.6674998</td>
</tr>
<tr class="odd">
<td align="right">2</td>
<td align="right">0</td>
<td align="right">0.000</td>
<td align="right">1.0000000</td>
<td align="right">1</td>
<td align="right">0.192</td>
<td align="right">0.6614968</td>
</tr>
</tbody>
</table>
<p>In this example, we see that matrix rank <span class="math inline">\(r = 0\)</span> cannot be rejected (<code>tail</code> <span class="math inline">\(p\)</span>-value = 0.78). Because Reich et al. 2012 showed that <span class="math inline">\(r + 1 \le n\)</span>, where <span class="math inline">\(n\)</span> is the number of admixture waves, we can interpret this as <em>left</em> populations having at least <span class="math inline">\(n = 1\)</span> stream of ancestry from the set of <em>right</em> populations. In this case, the most likely explanation is Neandertal admixture into non-Africans today.</p>
<p>Now, what happens if we add Papuans to the <em>left</em> group?</p>
<div class="sourceCode" id="cb25"><html><body><pre class="r"><span class="no">result</span> <span class="kw"><-</span> <span class="fu"><a href="../reference/qpWave.html">qpWave</a></span>(
<span class="kw">left</span> <span class="kw">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html">c</a></span>(<span class="st">"Papuan"</span>, <span class="st">"French"</span>, <span class="st">"Sardinian"</span>, <span class="st">"Han"</span>),
<span class="kw">right</span> <span class="kw">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html">c</a></span>(<span class="st">"Altai"</span>, <span class="st">"Yoruba"</span>, <span class="st">"Mbuti"</span>),
<span class="kw">data</span> <span class="kw">=</span> <span class="no">snps</span>
)</pre></body></html></div>
<div class="sourceCode" id="cb26"><html><body><pre class="r"><span class="no">result</span></pre></body></html></div>
<table class="table">
<thead><tr class="header">
<th align="right">rank</th>
<th align="right">df</th>
<th align="right">chisq</th>
<th align="right">tail</th>
<th align="right">dfdiff</th>
<th align="right">chisqdiff</th>
<th align="right">taildiff</th>
</tr></thead>
<tbody>
<tr class="odd">
<td align="right">0</td>
<td align="right">6</td>
<td align="right">29.143</td>
<td align="right">0.0000572</td>
<td align="right">0</td>
<td align="right">0.000</td>
<td align="right">1.0000000</td>
</tr>
<tr class="even">
<td align="right">1</td>
<td align="right">2</td>
<td align="right">0.601</td>
<td align="right">0.7403643</td>
<td align="right">4</td>
<td align="right">28.542</td>
<td align="right">0.0000097</td>
</tr>
<tr class="odd">
<td align="right">2</td>
<td align="right">0</td>
<td align="right">0.000</td>
<td align="right">1.0000000</td>
<td align="right">2</td>
<td align="right">0.601</td>
<td align="right">0.7403643</td>
</tr>
</tbody>
</table>
<p>We can now clearly reject rank <span class="math inline">\(r = 0\)</span>, but we see that the data is consistent with rank <span class="math inline">\(r = 1\)</span>, meaning that there must have been at least <span class="math inline">\(n = 2\)</span> streams of ancestry from <em>right</em> to <em>left</em> populations (<span class="math inline">\(r + 1 \le n\)</span>). Because this happened after we introduced Papuans to the <em>left</em> set, this could indicate a separate pulse of archaic introgression into Papuans, which is not surprising given what we know about significantly more archaic ancestry in Papuans than in any other present-day population.</p>
</div>
<div id="qpadm" class="section level3">
<h3 class="hasAnchor">
<a href="#qpadm" class="anchor"></a><em>qpAdm</em>
</h3>
<p>The <em>qpAdm</em> method can be used to find, for a given target population, the proportions of ancestry coming from a set of <em>source</em> populations. Importantly, since we often lack accurate representatives of the true ancestral populations, we can use a set of reference populations instead, under the crucial assumption that the reference set is phylogenetically closer to the true <em>source</em> populations than to a set of specified <em>outgroups</em>. For example, coming back to our example of estimating the proportions of Neandertal ancestry in people today, we could define:</p>
<ul>
<li>a set of European individuals as the <em>target</em>;</li>
<li>Vindija Neanderthal and an African as two <em>source</em> populations;</li>
<li>
<em>outgroup</em> populations as Chimp, Altai Neanderthal and Denisovan (which are all further from the true ancestral populations than the specified <em>sources</em>).</li>
</ul>
<p>Having defined all three population sets, we can run qpAdm with:</p>
<div class="sourceCode" id="cb27"><html><body><pre class="r"><span class="no">result</span> <span class="kw"><-</span> <span class="fu"><a href="../reference/qpAdm.html">qpAdm</a></span>(
<span class="kw">target</span> <span class="kw">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html">c</a></span>(<span class="st">"Sardinian"</span>, <span class="st">"Han"</span>, <span class="st">"French"</span>),
<span class="kw">sources</span> <span class="kw">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html">c</a></span>(<span class="st">"Vindija"</span>, <span class="st">"Yoruba"</span>),
<span class="kw">outgroups</span> <span class="kw">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html">c</a></span>(<span class="st">"Chimp"</span>, <span class="st">"Denisova"</span>, <span class="st">"Altai"</span>),
<span class="kw">data</span> <span class="kw">=</span> <span class="no">snps</span>
)</pre></body></html></div>
<p>The <code><a href="../reference/qpAdm.html">qpAdm()</a></code> function has an argument <code>details</code> (default TRUE) which makes the function return a list of three elements:</p>
<ul>
<li>
<code>proportions</code> - data frame with admixture proportions - this is what we mostly care about;</li>
<li>
<code>ranks</code> - results of rank tests performed by <em>qpWave</em> - these evaluate how well the assumed target-sources-outgroups population model matches the data;</li>
<li>
<code>subsets</code> - results of the “all subsets” analysis (see the <a href="https://github.com/DReichLab/AdmixTools/blob/master/pdoc.pdf">documentation</a> for more details).</li>
</ul>
<p>If <code>details</code> is set to <code>FALSE</code>, only the <code>proportions</code> component is returned by the <code><a href="../reference/qpAdm.html">qpAdm()</a></code> function.</p>
<p>Let’s start with the <code>ranks</code> element:</p>
<div class="sourceCode" id="cb28"><html><body><pre class="r"><span class="no">result</span>$<span class="no">ranks</span></pre></body></html></div>
<table class="table">
<thead><tr class="header">
<th align="left">target</th>
<th align="right">rank</th>
<th align="right">df</th>
<th align="right">chisq</th>
<th align="right">tail</th>
<th align="right">dfdiff</th>
<th align="right">chisqdiff</th>
<th align="right">taildiff</th>
</tr></thead>
<tbody>
<tr class="odd">
<td align="left">Sardinian</td>
<td align="right">1</td>
<td align="right">1</td>
<td align="right">0.006</td>
<td align="right">0.9362605</td>
<td align="right">3</td>
<td align="right">-0.006</td>
<td align="right">1.0000000</td>
</tr>
<tr class="even">
<td align="left">Sardinian</td>
<td align="right">2</td>
<td align="right">0</td>
<td align="right">0.000</td>
<td align="right">1.0000000</td>
<td align="right">1</td>
<td align="right">0.006</td>
<td align="right">0.9362605</td>
</tr>
<tr class="odd">
<td align="left">Han</td>
<td align="right">1</td>
<td align="right">1</td>
<td align="right">2.144</td>
<td align="right">0.1431157</td>
<td align="right">3</td>
<td align="right">-2.144</td>
<td align="right">1.0000000</td>
</tr>
<tr class="even">
<td align="left">Han</td>
<td align="right">2</td>
<td align="right">0</td>
<td align="right">0.000</td>
<td align="right">1.0000000</td>
<td align="right">1</td>
<td align="right">2.144</td>
<td align="right">0.1431157</td>
</tr>
<tr class="odd">
<td align="left">French</td>
<td align="right">1</td>
<td align="right">1</td>
<td align="right">3.814</td>
<td align="right">0.0508171</td>
<td align="right">3</td>
<td align="right">-3.814</td>
<td align="right">1.0000000</td>
</tr>
<tr class="even">
<td align="left">French</td>
<td align="right">2</td>
<td align="right">0</td>
<td align="right">0.000</td>
<td align="right">1.0000000</td>
<td align="right">1</td>
<td align="right">3.814</td>
<td align="right">0.0508171</td>
</tr>
</tbody>
</table>
<p>The row with rank = 1 represents a <em>qpWave</em> test with all <span class="math inline">\(n\)</span> <em>source</em> populations set as the <em>left</em> set and all <em>outgroups</em> as the <em>right</em> set. This test evaluates whether the ancestral populations are descended from <span class="math inline">\(n\)</span> independent streams of ancestry. In our case, <span class="math inline">\(n = 2\)</span> (Yoruba and Vindija), which means that the data would have to be consistent with rank <span class="math inline">\(r = 1\)</span> to satisfy the inequality <span class="math inline">\(r + 1 \le n\)</span> proved by Reich et al., 2012. We see that this is true for all three target populations (<span class="math inline">\(p\)</span>-value > 0.05 for all targets), and the simple model of Neandertal admixture thus seems to be reasonably consistent with the data.</p>
<p>The rank = 2 row represents a <em>qpWave</em> test after adding a target population to the <em>left</em> group together with the <em>sources</em>. This test makes sure that including the target population does not increase the rank of the <span class="math inline">\(f_4\)</span> matrix, meaning that the target can be really modelled as a mixture of ancestries from the <em>sources</em>. If the <span class="math inline">\(p\)</span>-values turn out to be very low, this indicates that the assumed model does not fit the data and that a part of the ancestry in a <em>target</em> possibly cannot be traced to any of the <em>sources</em>. In our case, however, all rank = 2 test <span class="math inline">\(p\)</span>-values are not significant, and we can be reasonably sure that the <em>target</em> samples can be fully modelled as mixtures of all specified <em>references</em>.</p>
<p>Having made sure that our model is reasonably correct, we can now take a look at <code>proportions</code>, which contains admixture proportion estimates from all specified sources, as well as standard errors for those proportions using a block jackknife:</p>
<div class="sourceCode" id="cb29"><html><body><pre class="r"><span class="no">result</span>$<span class="no">proportions</span></pre></body></html></div>
<table class="table">
<thead><tr class="header">
<th align="left">target</th>
<th align="right">Vindija</th>
<th align="right">Yoruba</th>
<th align="right">stderr_Vindija</th>
<th align="right">stderr_Yoruba</th>
<th align="right">nsnps</th>
</tr></thead>
<tbody>
<tr class="odd">
<td align="left">Sardinian</td>
<td align="right">0.025</td>
<td align="right">0.975</td>
<td align="right">0.006</td>
<td align="right">0.006</td>
<td align="right">499314</td>
</tr>
<tr class="even">
<td align="left">Han</td>
<td align="right">0.021</td>
<td align="right">0.979</td>
<td align="right">0.006</td>
<td align="right">0.006</td>
<td align="right">499654</td>
</tr>
<tr class="odd">
<td align="left">French</td>
<td align="right">0.022</td>
<td align="right">0.978</td>
<td align="right">0.006</td>
<td align="right">0.006</td>
<td align="right">499434</td>
</tr>
</tbody>
</table>
<p>If we compare this result to the <span class="math inline">\(f_4\)</span>-ratio values calculated above, we see that the <em>qpAdm</em> estimates are very close to what we got earlier.</p>
<p>The third element in the list of results shows the outcome of an “all subsets” analysis, which involves testing all subsets of potential source populations. Each 1 in the “pattern” column means that the proportion of ancestry from that particular source population (in the order as specified by the user) was forced to 0.0.</p>
<div class="sourceCode" id="cb30"><html><body><pre class="r"><span class="no">result</span>$<span class="no">subsets</span></pre></body></html></div>
<table class="table">
<thead><tr class="header">
<th align="left">target</th>
<th align="left">pattern</th>
<th align="right">wt</th>
<th align="right">dof</th>
<th align="right">chisq</th>
<th align="right">tail</th>
<th align="right">Vindija</th>
<th align="right">Yoruba</th>
</tr></thead>
<tbody>
<tr class="odd">
<td align="left">Sardinian</td>
<td align="left">00</td>
<td align="right">0</td>
<td align="right">1</td>
<td align="right">0.006</td>
<td align="right">0.9362610</td>
<td align="right">0.025</td>
<td align="right">0.975</td>
</tr>
<tr class="even">
<td align="left">Sardinian</td>
<td align="left">01</td>
<td align="right">1</td>
<td align="right">2</td>
<td align="right">15953.171</td>
<td align="right">0.0000000</td>
<td align="right">1.000</td>
<td align="right">0.000</td>
</tr>
<tr class="odd">
<td align="left">Sardinian</td>
<td align="left">10</td>
<td align="right">1</td>
<td align="right">2</td>
<td align="right">16.564</td>
<td align="right">0.0002530</td>
<td align="right">0.000</td>
<td align="right">1.000</td>
</tr>
<tr class="even">
<td align="left">Han</td>
<td align="left">00</td>
<td align="right">0</td>
<td align="right">1</td>
<td align="right">2.144</td>
<td align="right">0.1431160</td>
<td align="right">0.021</td>
<td align="right">0.979</td>
</tr>
<tr class="odd">
<td align="left">Han</td>
<td align="left">01</td>
<td align="right">1</td>
<td align="right">2</td>
<td align="right">14965.791</td>
<td align="right">0.0000000</td>
<td align="right">1.000</td>
<td align="right">0.000</td>
</tr>
<tr class="even">
<td align="left">Han</td>
<td align="left">10</td>
<td align="right">1</td>
<td align="right">2</td>
<td align="right">14.454</td>
<td align="right">0.0007269</td>
<td align="right">0.000</td>
<td align="right">1.000</td>
</tr>
<tr class="odd">
<td align="left">French</td>
<td align="left">00</td>
<td align="right">0</td>
<td align="right">1</td>
<td align="right">3.814</td>
<td align="right">0.0508171</td>
<td align="right">0.022</td>
<td align="right">0.978</td>
</tr>
<tr class="even">
<td align="left">French</td>
<td align="left">01</td>
<td align="right">1</td>
<td align="right">2</td>
<td align="right">15441.258</td>
<td align="right">0.0000000</td>
<td align="right">1.000</td>
<td align="right">0.000</td>
</tr>
<tr class="odd">
<td align="left">French</td>
<td align="left">10</td>
<td align="right">1</td>
<td align="right">2</td>
<td align="right">16.028</td>
<td align="right">0.0003308</td>
<td align="right">0.000</td>
<td align="right">1.000</td>
</tr>
</tbody>