diff --git a/paper/GENETICS_response.docx b/paper/GENETICS_response.docx index b9715c7..07129b5 100644 Binary files a/paper/GENETICS_response.docx and b/paper/GENETICS_response.docx differ diff --git a/paper/ancient.aux b/paper/ancient.aux index 29bdab4..a4405e6 100644 --- a/paper/ancient.aux +++ b/paper/ancient.aux @@ -28,7 +28,8 @@ \@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Impact of admixture}{9}} \citation{mathieson2015genome} \citation{10002015global} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Application to ancient humans}{10}} +\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Impact of contamination}{10}} +\@writefile{toc}{\contentsline {subsection}{\numberline {3.4}Application to ancient humans}{10}} \citation{skoglund2014genomic} \citation{sawyer2012temporal} \citation{green2010draft} @@ -114,9 +115,11 @@ \newlabel{admixture}{{4}{32}} \@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces Impact of ghost admixture on rejecting continuity. The $x$ axis shows the admixture proportion from the ghost population, and the $y$ axis shows the fraction of simulations in which continuity was rejected. Each line corresponds to a different sampling strategy, as indicated in the legend.}}{33}} \newlabel{ghost}{{5}{33}} -\@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces Parameters of the model inferred from ancient West Eurasian samples. Panel A shows $t_1$ on the x-axis and $t_2$ on the y-axis, with each point corresponding to a population as indicated in the legend. Numbers in the legend correspond to the mean date of all samples in the population. Panels B and C show scatterplots of the mean age of the samples in the population (x-axis) against $t_1$ and $t_2$, respectively. Points are described by the same legend as Panel A.}}{34}} -\newlabel{pops_together}{{6}{34}} -\@writefile{lof}{\contentsline {figure}{\numberline {7}{\ignorespaces Impact of pooling individuals into populations when estimating model parameters from real data. In both panels, the x-axis indicates the parameter estimate when individuals are analyzed separately, while the y-axis indicates the parameter estimate when individuals are grouped into populations. Size of points is proportional to the coverage of each individual. Panel A reports the impact on estimation of $t_1$, while Panel B reports the impact on $t_2$. Note that Panel B has a broken x-axis. Solid lines in each figure indicate $y = x$.}}{35}} -\newlabel{sep_vs_pops}{{7}{35}} -\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Details of populations included in analysis. ``pop'' is population name, ``cov'' is mean coverage of individuals in the population, ``date'' is mean date of individuals in the population, ``$t_1$'' is the maximum likelihood estimate of $t_1$ in the full model, ``$t_2$'' is the maximum likelihood estimate of $t_2$ in the full model, ``LnL'' is the maximum likelihood value in the full model, ``$t_1$ (cont)'' is the maximum likelihood estimate of $t_1$ in the model where $t_2 = 0$, ``LnL'' is the maximum likelihood value in the model where $t_2 = 0$.}}{36}} -\newlabel{params_table}{{1}{36}} +\@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces Impact of contamination on parameter inference. The $x$ axis shows the contamination fraction, and the $y$ axis shows the average parameter estimate from simulations. Each line corresponds to a different sampling strategy, as indicated in the legend. Panel A shows $t_1$, and Panel B shows $t_2$. Dashed lines indicate the true values of $t_1 = 0.02$ and $t_2 = 0.05$}}{34}} +\newlabel{contamination}{{6}{34}} +\@writefile{lof}{\contentsline {figure}{\numberline {7}{\ignorespaces Parameters of the model inferred from ancient West Eurasian samples. Panel A shows $t_1$ on the x-axis and $t_2$ on the y-axis, with each point corresponding to a population as indicated in the legend. Numbers in the legend correspond to the mean date of all samples in the population. Panels B and C show scatterplots of the mean age of the samples in the population (x-axis) against $t_1$ and $t_2$, respectively. Points are described by the same legend as Panel A.}}{35}} +\newlabel{pops_together}{{7}{35}} +\@writefile{lof}{\contentsline {figure}{\numberline {8}{\ignorespaces Impact of pooling individuals into populations when estimating model parameters from real data. In both panels, the x-axis indicates the parameter estimate when individuals are analyzed separately, while the y-axis indicates the parameter estimate when individuals are grouped into populations. Size of points is proportional to the coverage of each individual. Panel A reports the impact on estimation of $t_1$, while Panel B reports the impact on $t_2$. Note that Panel B has a broken x-axis. Solid lines in each figure indicate $y = x$.}}{36}} +\newlabel{sep_vs_pops}{{8}{36}} +\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Details of populations included in analysis. ``pop'' is population name, ``cov'' is mean coverage of individuals in the population, ``date'' is mean date of individuals in the population, ``$t_1$'' is the maximum likelihood estimate of $t_1$ in the full model, ``$t_2$'' is the maximum likelihood estimate of $t_2$ in the full model, ``LnL'' is the maximum likelihood value in the full model, ``$t_1$ (cont)'' is the maximum likelihood estimate of $t_1$ in the model where $t_2 = 0$, ``LnL'' is the maximum likelihood value in the model where $t_2 = 0$.}}{37}} +\newlabel{params_table}{{1}{37}} diff --git a/paper/ancient.fff b/paper/ancient.fff index 93bcad8..61b9b89 100644 --- a/paper/ancient.fff +++ b/paper/ancient.fff @@ -38,6 +38,14 @@ \end{figure} \efloatseparator +\begin{figure}[h] % figure placement: here, top, bottom, or page + \centering + \includegraphics[width=\textwidth]{contamination_t1_t2.pdf} + \caption{Impact of contamination on parameter inference. The $x$ axis shows the contamination fraction, and the $y$ axis shows the average parameter estimate from simulations. Each line corresponds to a different sampling strategy, as indicated in the legend. Panel A shows $t_1$, and Panel B shows $t_2$. Dashed lines indicate the true values of $t_1 = 0.02$ and $t_2 = 0.05$} + \label{contamination} +\end{figure} +\efloatseparator + \begin{figure}[h] % figure placement: here, top, bottom, or page \centering \includegraphics[width=\textwidth]{parameters_and_age.pdf} diff --git a/paper/ancient.log b/paper/ancient.log index 52ea1da..661c6db 100644 --- a/paper/ancient.log +++ b/paper/ancient.log @@ -1,4 +1,4 @@ -This is pdfTeX, Version 3.14159265-2.6-1.40.15 (TeX Live 2014) (preloaded format=pdflatex 2014.5.25) 27 OCT 2017 15:00 +This is pdfTeX, Version 3.14159265-2.6-1.40.15 (TeX Live 2014) (preloaded format=pdflatex 2014.5.25) 30 OCT 2017 11:00 entering extended mode restricted \write18 enabled. file:line:error style messages enabled. @@ -327,13 +327,13 @@ n/10 ^^@ \OML/cmm/m/it/10 x\OT1/cmr/m/n/10 )[]\OML/cmm/m/it/10 ; x\OT1/cmr/m/n/ (ancient.ttt) [11] [12] [13] [14] [15] [16] [17] [18] [19] [20] [21] -Overfull \hbox (62.55733pt too wide) detected at line 346 +Overfull \hbox (62.55733pt too wide) detected at line 358 \OML/cmm/m/it/10 L\OT1/cmr/m/n/10 (\OML/cmm/m/it/10 D\OT1/cmr/m/n/10 ) = [] [] [] [] \OML/cmm/m/it/10 : [] [22] -Overfull \hbox (97.71663pt too wide) in paragraph at lines 364--365 +Overfull \hbox (97.71663pt too wide) in paragraph at lines 376--377 \OT1/cmr/m/n/10 Python im-ple-men-ta-tions of the de-scribed meth-ods are avail -able at \OT1/cmtt/m/n/10 www.github.com/schraiber/continuity/ [] @@ -386,27 +386,36 @@ Package pdftex.def Info: continuity_rejection_ghost.pdf used on input line 35. [33 <./continuity_rejection_ghost.pdf>] - + +File: contamination_t1_t2.pdf Graphic file (type pdf) + + +Package pdftex.def Info: contamination_t1_t2.pdf used on input line 43. +(pdftex.def) Requested size: 430.00462pt x 180.70544pt. + [34 + + <./contamination_t1_t2.pdf>] + File: parameters_and_age.pdf Graphic file (type pdf) -Package pdftex.def Info: parameters_and_age.pdf used on input line 43. +Package pdftex.def Info: parameters_and_age.pdf used on input line 51. (pdftex.def) Requested size: 430.00462pt x 502.56282pt. -LaTeX Warning: Float too large for page by 39.81546pt on input line 46. +LaTeX Warning: Float too large for page by 39.81546pt on input line 54. LaTeX Warning: `h' float specifier changed to `ht'. -[34 <./parameters_and_age.pdf>] - +[35 <./parameters_and_age.pdf>] + File: sep_vs_pops.pdf Graphic file (type pdf) -Package pdftex.def Info: sep_vs_pops.pdf used on input line 51. +Package pdftex.def Info: sep_vs_pops.pdf used on input line 59. (pdftex.def) Requested size: 430.00462pt x 212.35426pt. -[35 +[36 <./sep_vs_pops.pdf>]) (./ancient.ttt @@ -414,18 +423,18 @@ Overfull \hbox (75.64052pt too wide) in paragraph at lines 3--33 [][] [] -[36 +[37 ]) (./ancient.aux) ) Here is how much of TeX's memory you used: - 3806 strings out of 493117 - 53339 string characters out of 6135433 - 128488 words of memory out of 5000000 - 7195 multiletter control sequences out of 15000+600000 + 3812 strings out of 493117 + 53488 string characters out of 6135433 + 128497 words of memory out of 5000000 + 7200 multiletter control sequences out of 15000+600000 15759 words of font info for 60 fonts, out of 8000000 for 9000 1141 hyphenation exceptions out of 8191 - 38i,9n,26p,2357b,308s stack positions out of 5000i,500n,10000p,200000b,80000s + 38i,9n,26p,2363b,308s stack positions out of 5000i,500n,10000p,200000b,80000s -Output written on ancient.pdf (36 pages, 4525045 bytes). +Output written on ancient.pdf (37 pages, 5316029 bytes). PDF statistics: - 524 PDF objects out of 1000 (max. 8388607) - 235 compressed objects within 3 object streams + 575 PDF objects out of 1000 (max. 8388607) + 253 compressed objects within 3 object streams 0 named destinations out of 1000 (max. 500000) - 36 words of extra memory for PDF output out of 10000 (max. 10000000) + 41 words of extra memory for PDF output out of 10000 (max. 10000000) diff --git a/paper/ancient.pdf b/paper/ancient.pdf index bc9a06b..49680f4 100644 Binary files a/paper/ancient.pdf and b/paper/ancient.pdf differ diff --git a/paper/ancient.synctex.gz b/paper/ancient.synctex.gz index 94d31fb..ad99ec6 100644 Binary files a/paper/ancient.synctex.gz and b/paper/ancient.synctex.gz differ diff --git a/paper/ancient.tex b/paper/ancient.tex index f93c4c6..6aa18c1 100644 --- a/paper/ancient.tex +++ b/paper/ancient.tex @@ -154,11 +154,13 @@ \subsection{Impact of admixture} \subsection{Impact of contamination} +We also explored the impact of foreign DNA contamination on inferences made using this approach. Briefly, we modified the simulations to include a chance $c$ of a read being from a modern sample instead of the ancient sample when simulating reads. We again simulated data corresponding to Figure \ref{RMSE}, with a 300 generation old ancient sample from population of size 1000 split from a population of size 10000 400 generations ago. In Figure \ref{contamination}, we see that relatively modest amounts of contamination can result in estimating zero or near-zero drift times. Interestingly, for the same contamination fraction, higher coverage samples are impacted slightly less. Together, this suggests that contamination will result in samples to be falsely inferred to be directly continuous with the modern population. + \begin{figure}[h] % figure placement: here, top, bottom, or page \centering - \includegraphics[width=\textwidth]{continuity_rejection_ghost.pdf} - \caption{Impact of ghost admixture on rejecting continuity. The $x$ axis shows the admixture proportion from the ghost population, and the $y$ axis shows the fraction of simulations in which continuity was rejected. Each line corresponds to a different sampling strategy, as indicated in the legend.} - \label{ghost} + \includegraphics[width=\textwidth]{contamination_t1_t2.pdf} + \caption{Impact of contamination on parameter inference. The $x$ axis shows the contamination fraction, and the $y$ axis shows the average parameter estimate from simulations. Each line corresponds to a different sampling strategy, as indicated in the legend. Panel A shows $t_1$, and Panel B shows $t_2$. Dashed lines indicate the true values of $t_1 = 0.02$ and $t_2 = 0.05$} + \label{contamination} \end{figure} @@ -225,7 +227,7 @@ \section{Discussion} Ancient DNA (aDNA) presents unique opportunities to enhance our understanding of demography and selection in recent history. However, it also comes equipped with several challenges, due to postmortem DNA damage \citep{sawyer2012temporal}. Several strategies have been developed to deal with the low quality of aDNA data, from relatively simple options like sampling a read at random at every site \citep{green2010draft} to more complicated methods making use of genotype likelihoods \citep{racimo2016joint}. Here, we presented a novel maximum likelihood approach for making inferences about how ancient populations are related to modern populations by analyzing read counts from multiple ancient individuals and explicitly modeling relationship between the two populations. We explicitly condition on the allele frequency in a modern population; as we showed in the appendix, this renders our method robust to ascertainment in modern samples. Thus, it can be used with SNP capture data. Moreover, confidence intervals can be calculated using a nonparametric bootstrap. Using this approach, we examined some aspects of sampling strategy for aDNA analysis and we applied our approach to ancient humans. -We found that sequencing many individuals from an ancient population to low coverage (~.5-1x) can be a significantly more cost effective strategy than sequencing fewer individuals to relatively high coverage. For instance, we saw from simulations that far more accurate estimates of the drift time in an ancient population can be obtained by pooling 2 individuals at 0.5x coverage than by sequencing a single individual to 2x coverage (Figure \ref{RMSE}). We saw this replicated in our analysis of the real data: low coverage individuals showed a significant amount of variation and bias in estimating the model parameters that was substantially reduced when individuals were analyzed jointly in a population (Figure \ref{sep_vs_pops}). To explore this further, we showed that sites sequenced to 1x coverage in a single individual retain no information about the drift time in the ancient population. This can be intuitively understood because the drift time in the ancient population is strongly related the amount of heterozygosity in the ancient population: an ancient population with a longer drift time will have lower heterozygosity at sites shared with a modern population. When a site is only sequenced once in a single individual, there is no information about the heterozygosity of that site. We also observed a pronounced upward bias in estimates of the drift time in the ancient population from low coverage samples. We speculate that this is due to the presence of few sites covered more than once being likely to be homozygous, thus deflating the estimate of heterozygosity in the ancient population. Thus, for analysis of SNP data, we recommend that aDNA sampling be conduced to maximize the number of individuals from each ancient population that can be sequenced to $\sim$1x, rather than attempting to sequence fewer individuals to high coverage. This suggestion can be complicated when samples have vastly different levels of endogenous DNA, where it may be cost effective to sequence high quality samples to higher coverage. In that case, we recommend sequencing samples to at least 3-4x coverage; as evidenced by Figures \ref{RMSE} \ref{continuity}, single samples at <4x coverage provide extremely limited information about the drift time in the ancient population and thus little power to reject continuity. +We found that sequencing many individuals from an ancient population to low coverage (~.5-1x) can be a significantly more cost effective strategy than sequencing fewer individuals to relatively high coverage. For instance, we saw from simulations that far more accurate estimates of the drift time in an ancient population can be obtained by pooling 2 individuals at 0.5x coverage than by sequencing a single individual to 2x coverage (Figure \ref{RMSE}). We saw this replicated in our analysis of the real data: low coverage individuals showed a significant amount of variation and bias in estimating the model parameters that was substantially reduced when individuals were analyzed jointly in a population (Figure \ref{sep_vs_pops}). To explore this further, we showed that sites sequenced to 1x coverage in a single individual retain no information about the drift time in the ancient population. This can be intuitively understood because the drift time in the ancient population is strongly related the amount of heterozygosity in the ancient population: an ancient population with a longer drift time will have lower heterozygosity at sites shared with a modern population. When a site is only sequenced once in a single individual, there is no information about the heterozygosity of that site. We also observed a pronounced upward bias in estimates of the drift time in the ancient population from low coverage samples. We speculate that this is due to the presence of few sites covered more than once being likely to be homozygous, thus deflating the estimate of heterozygosity in the ancient population. Thus, for analysis of SNP data, we recommend that aDNA sampling be conduced to maximize the number of individuals from each ancient population that can be sequenced to $\sim$1x, rather than attempting to sequence fewer individuals to high coverage. This suggestion can be complicated when samples have vastly different levels of endogenous DNA, where it may be cost effective to sequence high quality samples to higher coverage. In that case, we recommend sequencing samples to at least 3-4x coverage; as evidenced by Figures \ref{RMSE} and \ref{continuity}, single samples at $<$4x coverage provide extremely limited information about the drift time in the ancient population and thus little power to reject continuity. When we looked at the impact of model misspecification, we saw several important patterns. First, the influence of admixture from the ancient population on inferences of $t_2$ is approximately linear, suggesting that if there are estimates of the amount of admixture between the modern and ancient population, a bias-corrected estimate of $t_2$ could be produced (Figure \ref{admixture}B). The impact on inference of $t_1$ is more complicated: admixture actually \emph{reduces} estimates of $t_1$ (Figure \ref{admixture}A). This is likely because admixture increases the heterozygosity in the modern population, thus causing the amount of drift time to seem reduced. In both cases, the bias is not impacted by details of sampling strategy, although the variance of estimates is highly in a way consistent with Figure \ref{RMSE}. @@ -233,7 +235,7 @@ \section{Discussion} Because many modern populations may have experienced admixture from unsampled ``ghost'' populations, we also performed simulations to test the impact of ghost admixture on the probability of falsely rejecting continuity. We find that single ancient samples do not provide sufficient power to reject continuity even for high levels of ghost admixture, while increasingly powerful sampling schemes, adding more individuals or higher coverage per individual, reject continuity at higher rates. However, in these situations, whether we regard rejection of continuity as a false or true discovery is somewhat subjective: how much admixture from an outside population is required before considering a population to not be directly ancestral? In future work it will be extremely important to estimate the ``maximum contribution'' of the population an ancient sample comes from (c.f \citet{sjodin2014assessing}). -To gain new insights from empirical data, we applied our approach to ancient samples throughout Europe. Notably, we rejected continuity for all populations that we analyzed. This is unsurprising, given that European history is extremely complicated and has been shaped by many periods of admixture \citep{lazaridis2014ancient, haak2015massive, lazaridis2016genomic}. Thus, modern Europeans have experienced many periods of ``ghost'' admixture (relative to any particular ancient sample). Nonetheless, our results show that none of these populations are even particularly close to directly ancestral, as our simulations have shown that rejection of continuity is robust to low levels of ghost admixture. +To gain new insights from empirical data, we applied our approach to ancient samples throughout Europe. Notably, we rejected continuity for all populations that we analyzed. This is unsurprising, given that European history is extremely complicated and has been shaped by many periods of admixture \citep{lazaridis2014ancient, haak2015massive, lazaridis2016genomic}. Thus, modern Europeans have experienced many periods of ``ghost'' admixture (relative to any particular ancient sample). Nonetheless, our results show that none of these populations are even particularly close to directly ancestral, as our simulations have shown that rejection of continuity will not occur with low levels of ghost admixture. Secondly, we observed that the drift time in the ancient population was much larger than the drift time in the modern population. Assuming that the ancient sample were a contemporary sample, the ratio $t_1/t_2$ is an estimator of the ratio $N_e^{(2)}/N_e^{(1)}$; in fact, because the ancient sample existed for fewer generations since the common ancestor of the ancient and modern populations, $t_1/t_2$ acts as an upper bound on $N_e^{(2)}/N_e^{(1)}$. Moreover, this is unlikely to be due to unmodeled error in the ancient samples: error would be expected increase the heterozygosity in the ancient sample, and thus \emph{decrease} our estimates of $t_2$. Another potential complication is the fact that modern Europeans are a mixture of multiple ancestral populations \citep{lazaridis2014ancient, haak2015massive}. As shown through simulation, admixture increases heterozygosity in the modern population and thus decreases estimates of $t_1$. However, even very large amounts of ghost admixture did not result in the order-of-magnitude differences we see in the real data, suggesting that ghost admixture cannot account for all the discrepancy between modern and ancient $N_e$. Thus, we find strong support for the observation that ancient Europeans were often members of small, isolated populations \citep{skoglund2014genomic}. We interpret these these two results together as suggestive that many ancient samples found thus far in Europe were members of small populations that ultimately went locally extinct. Nonetheless, there may be many samples that belonged to larger metapopulations, and further work is necessary to specifically examine those cases.