% (removed: non-LaTeX scrape artifact — a stray line-number listing and file-metadata header that preceded \documentclass and would prevent compilation)
\documentclass[phd,tocprelim]{cornell}
%
% tocprelim option must be included to put the roman numeral pages in the
% table of contents
%
% The cornellheadings option will make headings completely consistent with
% guidelines.
%
% This sample document was originally provided by Blake Jacquot, and
% fixed up by Andrew Myers.
%
%Some possible packages to include
\usepackage{graphicx}
%\usepackage{graphics}
%\usepackage{moreverb}
%\usepackage{subfigure}
%\usepackage{epsfig}
%\usepackage{subfigure}
%\usepackage{hangcaption}
%\usepackage{txfonts}
%\usepackage{palatino}
\usepackage{color}
\definecolor{MyDarkGreen}{rgb}{0,0.5,0}
\definecolor{orange}{rgb}{1, .65, 0}
\definecolor{gray}{rgb}{.75, .75, .75}
\usepackage{amsmath}
\usepackage{amssymb}
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}
%if you're having problems with overfull boxes, you may need to increase
%the tolerance to 9999
\tolerance=9999
\bibliographystyle{plain}
%\bibliographystyle{IEEEbib}
%\renewcommand{\caption}[1]{\singlespacing\hangcaption{#1}\normalspacing}
\renewcommand{\topfraction}{0.85}
\renewcommand{\textfraction}{0.1}
\renewcommand{\floatpagefraction}{0.75}
\title {Parallel Machine Learning Algorithms in Bioinformatics and Global Optimization}
\author {Scott Clark}
\conferraldate {August}{2012}
\degreefield {Ph.D.}
\copyrightholder{Scott Clark}
\copyrightyear{2012}
\begin{document}
\maketitle
\makecopyright
\begin{abstract}
This is a dissertation in three parts, in each we explore the development and analysis of a parallel statistical or machine learning algorithm and its implementation.
First, we examine the Assembly Likelihood Evaluation (ALE) framework. This algorithm defines a rigorous statistical likelihood metric used to validate and score genome and metagenome assemblies. This algorithm can be used to identify specific errors within assemblies and their locations; enable comparison between assemblies allowing for optimization of the assembly process; and using re-sequencing data, detect structural variations.
Second, we develop an algorithm for Expected Parallel Improvement (EPI). This optimization method allows us to optimally sample many points concurrently from an expensive to evaluate and unknown function. Instead of sampling sequentially, which can be inefficient when the available resources allow for simultaneous evaluation, EPI identifies the best set of points to sample next, allowing multiple samplings to be performed in unison.
Finally, we explore Velvetrope: a parallel, bitwise algorithm for finding homologous regions within sequences. This algorithm employs a two-part filter between sequences. It first finds offsets where two sequences share a higher than expected amount of identity. It then filters areas within these offsets with higher than expected identity. The resulting positions along each sequence represent regions of statistically significant similarity.
\end{abstract}
\begin{biosketch}
Scott Clark grew up in Tigard, Oregon and graduated from Central Catholic High School in Portland, Oregon in 2004. He graduated Magna Cum Laude from Oregon State University in 2008, receiving Bachelor of Science degrees in Mathematics, Computational Physics and Physics with minors in Actuarial Sciences and Mathematical Sciences.
In 2008 Scott was awarded a U.S. Department of Energy Computational Science Graduate Fellowship (CSGF) supporting his doctoral work at Cornell University Center for Applied Mathematics (CAM) where he was awarded his Master's degree in Computer Science in 2011. After graduation he plans to move to San Francisco after a celebratory trip around the world and has accepted a software engineering position at Yelp, Inc.
\end{biosketch}
\begin{dedication}
This document is dedicated to my family.
\ \\
Your constant, unconditional love and support \\
at every stage of my education \\
made this possible.
\end{dedication}
\begin{acknowledgements}
I have had the privilege of working with many amazing educators and researchers throughout my academic career. They have all helped sculpt me into the student and person I am today. At Central Catholic High School Steve Workman, Paul Wallulis and Phil Collins taught me the value of hard work and opened my eyes to the world of math and science. My REU advisors, Prof. Daniel Cox of University of California Davis and Prof. Steven Tomsovic of Washington State University gave me my first taste of independent research and helped cultivate my desire to pursue academia. My advisors at Oregon State: Malgorzata Peszynska and Rubin Landau guided me through my undergraduate education and nurtured my budding research pursuits. My practicum supervisors: Dr. Nick Hengartner and Dr. Joel Berendzen of Los Alamos National Laboratory; and Dr. Zhong Wang and Rob Egan of the Joint Genome Institute of Lawrence Berkeley National Laboratory whose joint work and experience was vital to this thesis and my degree. My committee at Cornell: Prof. Steve Strogatz and Prof. Bart Selman for their insights and advice. And, of course, my advisor Prof. Peter Frazier whose unwavering patience, support and encouragement brought this thesis to fruition and has been instrumental to my success at Cornell.
My research was supported by a U.S. Department of Energy Computational Science Graduate Fellowship (CSGF), which is supported by the Office of Science of the U.S. Department of Energy under Contract No. DE-FG02-97ER25308. I was also supported by a Startup and Production Allocation Award from the National Energy Research Scientific Computing Center (NERSC), which is funded through the Office of Science of the U.S. Department of Energy under Contract No. DE-AC02-05CH11231.
\end{acknowledgements}
\contentspage
\tablelistpage
\figurelistpage
\normalspacing \setcounter{page}{1} \pagenumbering{arabic}
\pagestyle{cornell} \addtolength{\parskip}{0.5\baselineskip}
\part{ALE: Assembly Likelihood Evaluation} % (fold)
\label{prt:ALE: Assembly Likelihood Evaluation}
\singlespacing
\noindent
\large
ALE: a Generic Assembly Likelihood Evaluation Framework for \\ Assessing the Accuracy of Genome and Metagenome Assemblies
\noindent
\normalsize
Joint work with Rob Egan\,$^{1,2}$, Peter Frazier\,$^{3}$ and Zhong Wang\,$^{1,2}$ \\
\scriptsize
$^{1}$Department of Energy, Joint Genome Institute, Walnut Creek, CA 94598, USA\\
$^{2}$Genomics Division, Lawrence Berkeley National Laboratory, Berkeley, CA 94720, USA\\
$^{3}$Cornell University School of Operations Research and Information Engineering, Ithaca, New York 14853, USA
\normalsize
\normalspacing
\begin{center}
Abstract
\end{center}
There is a need for general-purpose methods for objectively evaluating the quality of single and metagenome assemblies, and for automatically detecting any errors they may contain. Current methods do not fully meet this need because they require a reference, only consider one of the many aspects of assembly quality, or lack statistical justification; none are designed to evaluate metagenome assemblies.
In this work we present an Assembly Likelihood Evaluation (ALE) framework that overcomes these limitations, systematically evaluating the accuracy of an assembly in a reference-independent manner using rigorous statistical methods. This framework is comprehensive, and integrates read quality, mate pair orientation and insert length (for paired end reads), sequencing coverage, read alignment, and k-mer frequency. ALE pinpoints synthetic and real errors in both single and metagenomic assemblies, including single-base errors, insertions/deletions, genome rearrangements and chimeric assemblies presented in metagenomes. The ALE framework provides a comprehensive, reference-independent and statistically rigorous measure of single genome and metagenome assembly quality, which can be used to identify mis-assemblies or to optimize the assembly process.
\chapter{ALE Introduction} % (fold)
\label{cha:ALE Introduction}
\noindent
Deoxyribonucleic acid, or DNA, encodes the information vital to the development and function of all life. Everything from the early development of a cell to the complex structure of a central nervous system is encoded in four simple nucleobases, or bases: {\bf \color{red} A}denine, {\bf \color{blue} T}hymine, {\bf \color{MyDarkGreen} C}ytosine and {\bf \color{orange} G}uanine. The primary structure (the linear chain of bases) of this DNA within a chromosome contains all the information necessary to build the proteins and structures that form all living organisms. The primary structure of all the chromosomes within an organism makes up that organism's genome. Knowledge about the genome allows for deeper understanding of the organism and inference about specific traits it may possess. Unfortunately, the relatively small scale of the DNA molecule (3.4\AA\ ($10^{-10}$m)) prevents us from simply reading the genome of an organism. However, DNA sequencing technology allows us to indirectly observe pieces of an organism's genome and attempt to reconstruct it. This work provides a framework for evaluating such reconstructions.
DNA sequencing technology works by taking an organism (or many organisms in the metagenomic
case) and extracting the entirety of the DNA into a test tube (on the order of $10^{10}$ base pairs (bps)).
The DNA is then cut randomly into roughly equal pieces (average sizes are 200bps to 5000bps). For paired-end sequencers these pieces
of DNA are then put into a sequencer that can ``read'' the ends of these pieces. After about 100bps are read the chemistry within the sequencer causes the fragment to break, leaving us with information about the nucleotide content of the two ends of the sequence fragment, or ``read.'' This leaves us with some information about the very ends of these pieces of DNA with some
unknown insert length between them (drawn from a known distribution). The end result of this
lab-work is many millions or billions of short, paired reads that can then be used to (attempt to)
reassemble the entire genome of the organism (or organisms). Other sequencing technologies allow for only one end of the fragment to be read, or many short ``strobes'' read in a single direction along the strand, with gaps between read sequence. Reassembling the genome from this information can be thought of as trying to
reassemble the combined pieces of many mixed jigsaw puzzles with missing, overlapping and duplicated pieces of variable
sizes. There are many programs that can create these assemblies (Velvet \cite{Zerbino2008}, Abyss \cite{Simpson2009} and others). However, there
is no good metric for determining the quality of these assemblies; the current techniques either just
use the overall size of the pieces outputted (N50 length: accuracy irrelevant) or try to map the suggested
assembly onto a known reference, which is unknown in almost all cases. Another difficulty these
programs have is ``finishing" assemblies, going from large pieces of contiguous proposed assembly
(contigs) and scaffolding them together and filling in the gaps to create a complete, finished
assembly.
Recent advances in next-generation, high throughput sequencing technologies have dramatically reduced the cost of sequencing (\cite{Metzker2010}). With the development of genome assemblers that take advantage of the large volume of sequence data, reference genomes are being produced at a high and increasing rate using the whole genome shotgun strategy, from small, simple microbial genomes (\cite{Wu2009}) to large, complex plant or mammalian genomes (\cite{Fujimoto2010}; \cite{Li2010}; \cite{Schmutz2010}; \cite{Zimin2008}). Meanwhile, genomes are also being generated directly from complex communities using culture-independent approaches, including single-cell genome sequencing and metagenome sequencing (\cite{Woyke2010}; \cite{Yilmaz2011}; \cite{Hess2011}; \cite{Iverson2012}). The ability to assemble a metagenome is particularly important because resolving the genomes of individual species, or at least the most abundant, from a complex community is crucial to exploring inter-species interactions and understanding the community's structure, dynamics and function. Single species are very difficult to isolate from a metagenomic community, forcing the researcher to attempt to sequence the entire community at once with many individual species at different concentrations and with varying levels of genomic similarity.
Assembly of individual genomes from next-generation sequencing (NGS) datasets poses significant informatics challenges, including short read length, noisy data and large data volume (\cite{Lin2011}; \cite{Pop2009}). Due to these challenges, errors widely exist in single genome assemblies derived from NGS datasets, with different specific errors commonly associated with particular datasets, genomes, and tools (\cite{Haiminen2011}). Beyond those challenges faced in assembling single genomes, there are also several challenges unique to metagenome assembly. First, unlike in single genome assembly where the sequence depth (the number of sequenced reads that map onto a specific position on a proposed assembly) for the target genome is expected to be Poisson distributed about a uniform mean, the sequencing depth of genomes in a metagenome usually varies greatly. Second, most genome assemblers have difficulties resolving repetitive regions within a single genome, and this problem is exacerbated in metagenome assembly because conserved genomic regions and lateral gene transfer have greatly increased the portion of these ``repetitive'' genomic regions. Finally, although there are quite a few single genome assemblers such as Velvet \cite{Zerbino2008}, ABySS \cite{Simpson2009}, soapDenovo \cite{Li2010} and All-Path-LG \cite{Gnerre2010} that are capable of assembling large genomes, there is no metagenome specific assembler yet. Instead, assemblers designed for single genomes are being applied to metagenome data without significant modifications \cite{Qin2010}\cite{Hess2011}. The impact of using an assembler developed to assemble a single genome has not yet been systematically evaluated for metagenome assembly, especially in how well it addresses challenges like variable sequencing depth and closely related species that are unique to metagenomes. 
Quantitative measurement of the quality of a metagenome assembly, as well as the ability to compare the results of different assemblers from the same data set, is as yet impossible. Many current studies either use only the overall size of the assembly (N50 length), which ignores accuracy, or map the resulting assembly onto some known reference genomes obtained independently to estimate the accuracy \cite{Hess2011}, but such references are not available in most cases. In this work we focus on the accuracy of the proposed genome using only the proposed assembly and the reads used to create it. This allows us to pinpoint localized errors instead of getting bogged down trying to quantify the completeness of an entire metagenome.
Several tools have been developed to detect errors in single genome assemblies. If a reference genome for the targeted organism is available, or one is available from a closely related species, erroneous insertions, deletions or large gaps can be detected by comparative analysis of the reference and the genome assembly in question (\cite{Meader2010}; \cite{Salzberg2012}; \cite{Zimin2008}). If a reference is unavailable, the alignment of the raw reads with their assembly provides indirect measures of assembly quality such as coverage depth and mate pair consistency. This information can then be used to detect single-base changes, repeat condensation or expansion, false segmental duplications, and other mis-assemblies (\cite{Choi2008}; \cite{Phillippy2008}; \cite{Narzisi2011}). Despite this progress, researchers still lack a method that integrates indirect measures of read alignment quality in a quantitative, comprehensive and statistically well-founded manner to systematically detect mis-assemblies present in single genome assemblies. Moreover, metrics suitable for evaluating metagenome assembly accuracy, and associated quantitative methods for detecting errors in metagenome assemblies, have yet to be developed.
In this work we aim to provide a comprehensive integrated framework for evaluating the quality of single genome and metagenome assemblies. In related work, \cite{Phillippy2008} also proposed a method for evaluating the quality of a whole single genome, but the method proposed is a pipeline of conceptually separate techniques for evaluating the different aspects of genome quality. \cite{Choi2008} also combined evidence from several conceptually separate measures of genome quality to identify mis-assemblies. In the previous literature, those works that use a single statistical framework tend to focus on only a single aspect of genome quality: \cite{Zimin2008} introduced a metric (CE statistic) for finding gaps in an assembly by comparing to the genome of a related organism; \cite{Meader2010} developed a statistical method for estimating the rate of erroneous insertions and deletions present in an assembly by comparison to an assembly of a closely related species; \cite{Olson2009} uses mate pair information and a reference genome from a similar organism to identify assembly errors and structural variation. All of these previous approaches contrast with the current work, which is an integrated method for validating several aspects of genome and metagenome assembly quality simultaneously based on a single statistical model. Like the current work, \cite{Laserson2011} also used a single statistical model to assign a likelihood score to assemblies, focusing specifically on the metagenomic case, but the statistical model used ignores the role of mate pairs in assessing assembly quality, which are becoming more prevalent with NGS technologies such as Illumina mate pairs and PacBio ``strobe" reads.
In this work, we measure the overall quality of an assembly in a mathematically rigorous and reference-independent manner, using a probabilistic model for the way that reads are generated from a genome. Using Bayesian statistics, we give explicit expressions for the probability that an assembly is correct, and computational methods based on these expressions. These mathematical methods avoid the pitfalls of summary statistics like N50 score, which only capture one dimension of assembly quality. We provide an automated software tool (ALE) based on this expression. The provided tool may be used in three ways. First, it allows examination of the contribution to this probability of correctness from each base in the assembly, which can be used to identify specific errors and their locations. This is particularly useful when finishing an assembly. Second, it provides an overall score for different assemblies of the same genome or metagenome, thereby enabling comparison of these assemblies and optimization of the assembly process. Two assemblies with roughly equal likelihood of correctness can be considered as having roughly equal quality, while if one assembly has a likelihood that is much lower, then we may safely discard it as inferior. When considering a single assembly in isolation, these methods can also be used to give an absolute score that indicates the quality of this assembly, and how this quality compares to the quality of other assemblies. Third, when applying re-sequencing data to a reference genome ALE can detect structural variations.
% chapter ALE Introduction (end)
\chapter{ALE Methods} % (fold)
\label{cha:ALE Methods}
\section{Reads and assembly properties}
The set of reads $R$ generated by a sequencer are referred to as a library. The library defines the properties of reads and their corresponding distributions for length, insert size and orientation as well as the structure of the reads, whether that is mate pairs, strobe reads or single reads.
Every read drawn from the library is composed of one or more parts (mates) of linear DNA sequence; each position corresponds to a nucleotide {\color{red} A}, {\color{blue} T}, {\color{MyDarkGreen} C}, {\color{orange} G} or an ambiguity code (see Table~\ref{ambcodes}) corresponding to the belief that the base is one of a set of possible bases with equal probability. In addition to the nucleotide information, the sequencer also reports a quality score for each base corresponding to the probability that the sequencer reported the correct base at a given position. This quality score (or Q score) is reported in a log-scale and generally ranges from high confidence (95--99\%) for Illumina and Sanger reads (dropping off along the length of the read) to low confidence (85\%) for PacBio reads. A quality score of 25\% would represent no knowledge about the base.
\begin{table}[htp]
\caption{DNA nucleotide and ambiguity codes}
\label{ambcodes}
\begin{center}
\begin{tabular}{c|c|c|c}
Code & Meaning & Complement & Opposite \\
\hline
{\color{red} A} & {\color{red} A} & {\color{blue} T} & B \\
{\color{blue} T} & {\color{blue} T} & {\color{red} A} & V \\
{\color{orange} G} & {\color{orange} G} & {\color{MyDarkGreen} C} & H \\
{\color{MyDarkGreen} C} & {\color{MyDarkGreen} C} & {\color{orange} G} & D \\
K & {\color{orange} G} or {\color{blue} T} & M & M \\
M & {\color{red} A} or {\color{MyDarkGreen} C} & K & K \\
R & {\color{red} A} or {\color{orange} G} & Y & Y \\
Y & {\color{MyDarkGreen} C} or {\color{blue} T} & R & R \\
S & {\color{MyDarkGreen} C} or {\color{orange} G} & S & W \\
W & {\color{red} A} or {\color{blue} T} & W & S \\
B & {\color{MyDarkGreen} C} or {\color{orange} G} or {\color{blue} T} & V & {\color{red} A} \\
V & {\color{red} A} or {\color{MyDarkGreen} C} or {\color{orange} G} & B & {\color{blue} T}/U \\
H & {\color{red} A} or {\color{MyDarkGreen} C} or {\color{blue} T} & D & {\color{orange} G} \\
D & {\color{red} A} or {\color{orange} G} or {\color{blue} T} & H & {\color{MyDarkGreen} C} \\
N & {\color{orange} G} or {\color{red} A} or {\color{blue} T} or {\color{MyDarkGreen} C} & N & -
\end{tabular}
\end{center}
\end{table}
\pagebreak
Each read has the following properties,
\begin{center}
\textbf{Read $r_{k}$ Properties}
\[\text{Read $r_{k}$: }\overbrace{\underbrace{\text{{\color{red} A}{\color{blue} T}{\color{MyDarkGreen} C}{\color{orange} G}{\color{MyDarkGreen} C}{\color{MyDarkGreen} C}{\color{blue} T}{\color{orange} G}{\color{red} A}{\color{blue} T}{\color{blue} T}{\color{MyDarkGreen} C}}}_{\text{length: } L\left(r_{k}^{(1)}\right) }}^{\text{Mate Pair \#1: } r_{k}^{(1)}}
\overbrace{\underbrace{\text{{\color{gray} ATTCGAGTCGA}}}_{\text{length: } I_{k} }}^{\text{Insert Area}}
\overbrace{\underbrace{\text{{\color{blue} T}{\color{MyDarkGreen} C}{\color{red} A}{\color{blue} T}{\color{MyDarkGreen} C}{\color{blue} T}{\color{MyDarkGreen} C}{\color{orange} G}{\color{red} A}{\color{blue} T}{\color{orange} G}{\color{MyDarkGreen} C}{\color{red} A}}}_{\text{length: } L\left(r_{k}^{(2)}\right) }}^{\text{Mate Pair \#2: } r_{k}^{(2)}}\]
\end{center}
Every read $r_{k}$ in the set of all reads $R$ is composed of one or more mate pairs $\{r_{k}^{(1)}, r_{k}^{(2)}, \ldots\}$. Sequencing technologies such as Illumina and SOLiD produce reads with two mates, PacBio ``strobe" reads can contain many read pairs, older technologies such as Sanger and 454 produce single reads. Each read has corresponding lengths $\{L\left(r_{k}^{(1)}\right), L\left(r_{k}^{(2)}\right), \ldots\}$ drawn from some distribution. These mate pairs are separated by a distance $I_{k}$ drawn from another distribution, which we assume is normal.
\pagebreak
\begin{center}
\textbf{Read Orientation $\omega_{k}$} \\
\begin{tabular}{ccccc} \\
& $\xrightarrow{\hspace*{3.4cm}}$ & Inward & $\xleftarrow{\hspace*{3.7cm}}$ & $\Rightarrow \omega(r_{k}) = +1$ \\
Read $r_{k}$: & {\color{red} A}{\color{blue} T}{\color{MyDarkGreen} C}{\color{orange} G}{\color{MyDarkGreen} C}{\color{MyDarkGreen} C}{\color{blue} T}{\color{orange} G}{\color{red} A}{\color{blue} T}{\color{blue} T}{\color{MyDarkGreen} C} & $\hdots\hdots$ & {\color{blue} T}{\color{MyDarkGreen} C}{\color{red} A}{\color{blue} T}{\color{MyDarkGreen} C}{\color{blue} T}{\color{MyDarkGreen} C}{\color{orange} G}{\color{red} A}{\color{blue} T}{\color{orange} G}{\color{MyDarkGreen} C}{\color{red} A} & \\
& $\xleftarrow{\hspace*{3.4cm}}$ & Outward & $\xrightarrow{\hspace*{3.7cm}}$ & $\Rightarrow \omega(r_{k}) = -1$
\end{tabular}
\end{center}
Every read has a unique orientation based on how the enzymes of the sequencer bind to it. Read pairs of only two mates can be sequenced inward, towards the middle, which would imply $\omega(r_{k}) = +1$ in our notation. Alternatively, a read can be sequenced outward, from the middle to the ends, which would imply $\omega(r_{k}) = -1$. Smaller read fragments in Illumina sequencers tend to be sequenced inward, while longer fragments are primarily sequenced outward with some small fraction facing inward with a much smaller insert length. Reads with many mates, such as PacBio strobe reads, are all sequenced in a single direction (left to right), which we denote as $\omega(r_{k}) = 0$, as are single reads, which do not have an orientation.
Each read can map onto an assembly in the following ways;
\begin{center}
\textbf{Read $r_{k}$ \textit{Singly Mapped} to Assembly $S$} \\
\singlespacing
\begin{tabular}{cl}
$S$ & $\cdots\overset{51}{\text{{\color{MyDarkGreen}C}}}\text{{\color{orange} G}{\color{red} A}}\text{{\color{red} A}}\overset{55}{\text{{\color{blue} T}}} \text{{\color{MyDarkGreen} C}{\color{orange} G}{\color{MyDarkGreen} C}{\color{MyDarkGreen} C}}\overset{60}{\text{{\color{blue} T}}}\text{{\color{orange} G}{\color{red} A}{\color{blue} T}{\color{blue} T}}\overset{65}{\text{{\color{MyDarkGreen} C}}}\text{{\color{red} A}{\color{blue} T}{\color{blue} T}{\color{MyDarkGreen} C}}\overset{70}{\text{{\color{orange} G}}}$\\
$r_{k}$ & \text{{\color{white}$\cdots$GCA}}$\underrightarrow{\text{{\color{red} A}{\color{blue} T}{\color{MyDarkGreen} C}{\color{orange} G}{\color{MyDarkGreen} C}{\color{MyDarkGreen} C}{\color{blue} T}{\color{orange} G}{\color{red} A}{\color{blue} T}{\color{blue} T}{\color{MyDarkGreen} C}}}$\text{{\color{white} A}{\color{white} T}{\color{white} T}{\color{white} C}{\color{white} G}}
\end{tabular}
\normalspacing
\textbf{Read $r_{k}$ \textit{Doubly Mapped} to Assembly} \\
\singlespacing
\begin{tabular}{cl} \\
$S$ & \text{$\cdots\overset{51}{\text{{\color{MyDarkGreen}C}}}${\color{orange} G}{\color{red} A}}$\text{{\color{red} A}$\overset{55}{\text{{\color{blue} T}}}${\color{MyDarkGreen} C}{\color{orange} G}{\color{MyDarkGreen} C}{\color{MyDarkGreen} C}$\overset{60}{\text{{\color{blue} T}}}${\color{orange} G}{\color{red} A}{\color{blue} T}{\color{blue} T}$\overset{65}{\text{{\color{MyDarkGreen} C}}}$}$\text{{\color{red} A}{\color{blue} T}{\color{blue} T}{\color{MyDarkGreen} C}$\overset{70}{\text{{\color{orange} G}}}${\color{red} A}{\color{orange} G}{\color{blue} T}{\color{MyDarkGreen} C}$\overset{75}{\text{{\color{orange} G}}}${\color{red} A}}$\text{{\color{blue} T}{\color{MyDarkGreen} C}{\color{red} A}$\overset{80}{\text{{\color{blue} T}}}${\color{MyDarkGreen} C}{\color{blue} T}{\color{MyDarkGreen} C}{\color{orange} G}$\overset{85}{\text{{\color{red} A}}}${\color{blue} T}{\color{orange} G}$\overset{88}{\text{{\color{MyDarkGreen} C}}}$}$\text{N} \\
$r_{k}$ & \text{{\color{white}$\cdots$GCA}}$\underrightarrow{\text{{\color{red} A}{\color{blue} T}{\color{MyDarkGreen} C}{\color{orange} G}{\color{MyDarkGreen} C}{\color{MyDarkGreen} C}{\color{blue} T}{\color{orange} G}{\color{red} A}{\color{blue} T}{\color{blue} T}{\color{MyDarkGreen} C}}}\underset{\text{Implied Insert}}{\underleftrightarrow{\text{{\color{white} A}{\color{white} T}{\color{white} T}{\color{white} C}{\color{white} G}{\color{white} A}{\color{white} G}{\color{white} T}{\color{white} C}{\color{white} G}{\color{white} A}}}}\underleftarrow{\text{{\color{blue} T}{\color{MyDarkGreen} C}{\color{red} A}{\color{blue} T}{\color{MyDarkGreen} C}{\color{blue} T}{\color{MyDarkGreen} C}{\color{orange} G}{\color{red} A}{\color{blue} T}{\color{orange} G}{\color{MyDarkGreen} C}{\color{red} A}}}$
\end{tabular}
\normalspacing
\end{center}
A read $r_{k}$ can be either \textit{singly mapped} or \textit{doubly mapped} (or more for strobe reads) to an assembly $S$ if one or both of its corresponding mate pairs map to some subset of $S$. If the read is doubly mapped we can directly calculate the implied orientation $\omega_{k}$, insert length $I_{k}$ and ordering of the reads. An assembly can be composed of many broken up pieces of contiguous sequence, or contigs, which can lead to some or many reads only being singly mapped, or singly mapped to multiple contigs (called a chimer). By scaffolding these contigs into larger and larger pieces a more complete assembly is formed. Unknown length between contigs can be represented by one or more ambiguity codes such as {\bf N} which represents that the assembly has a base in that position, but there is no knowledge of which base it is (other codes represent other sub-combinations of possible bases, see Table~\ref{ambcodes}).
The \textit{coverage} or \textit{depth} at a specific position in an assembly is the number of reads that map in some form onto that position. Under a random shearing process (in the read generation) we would expect the coverage to be Poisson distributed about a common, uniform mean throughout the assembly \cite{Lander1988}. Certain biases within sequencers towards GC rich areas of a genome and the inclusion of metagenomic data can violate this assumption, which we include in our model (see Section \ref{sec:GCBias}).
\section{The ALE score and the likelihood of an assembly}
The ALE framework is founded upon a statistical model that describes how reads are generated from an assembly. Given a proposed assembly (a set of scaffolds or contigs), $S$, and a set of reads $R$, this probabilistic model gives the likelihood of observing this set of reads if the proposed assembly were correct. We write this likelihood, $P(R|S)$, and its calculation includes information about read quality, agreement between the mapped reads and the proposed assembly, mate pair orientation, insert length (for paired end or strobe reads), and sequencing depth. This statistical model also provides a Bayesian prior probability distribution $P(S)$ describing how likely an assembly $S$ would be, if we were unable to observe any read information. This prior probability is computed using the k-mer distribution of the assembly. A detailed description of the likelihood and prior probability is given in the Methods section \ref{sub:kmersubscore}.
The ALE score is computed from these two values, and it is proportional to the probability that the assembly $S$ is correct. We write this probability as $P(S|R)$. Bayes' rule tells us that this probability is
\begin{equation}
P(S|R) = \frac{P(R|S)P(S)}{Z},
\end{equation}
where $Z$ is a proportionality constant that ensures that $P(S|R)$ is a probability distribution. We see $P(S|R)$ as a statistical measure of the overall quality of an assembly $S$. As is typical in large-scale applications of Bayesian statistics, it is computationally intractable to compute the constant $Z$ exactly. The ALE score is computed by replacing the constant $Z$ with an approximation described in the Methods section \ref{sec:ApproxZ}.
Although the ALE score can be reported as a standalone value, and understood as an approximation to $P(S|R)$, it is most useful for comparing two different assemblies of the same genome, using the same set of reads to evaluate them. Suppose we have two such assemblies, $S_{1}$ and $S_{2}$. Call $A_{1}$ the total ALE score of the first assembly, and $A_{2}$ the total ALE score of the second assembly both generated from the same set of reads $R$. The difference of these scores is then
\begin{equation}
A_{1} - A_{2} = \log\left(\frac{P\left(R|S_{1}\right)P\left(S_{1}\right)}{P\left(R|S_{2}\right)P\left(S_{2}\right)}\right).
\end{equation}
The assembly with the higher ALE score is also the one with the larger probability of being correct. Moreover, we show that the difference between two assemblies' ALE scores describes their relative probabilities of correctness. If one assembly's ALE score is larger than the other's, with a difference of $x$ between their ALE scores, then the assembly with the larger ALE score is more likely to be correct by a multiplicative factor of $e^{x}$. Below, we refer to the ALE score more precisely as the total ALE score, to differentiate it from the sub-scores (described in section \ref{sec:ingredients_of_ALE_score}) used to construct it.
\begin{figure*}[!htpb]%figure1
\centerline{\includegraphics[width=\textwidth]{figures/ALE/Clark_Fig1c.pdf}}
\caption[Components of the ALE score]{The components of the total ALE score. ALE takes a proposed assembly and an alignment of reads as input. Four scores, the k-mer, placement, depth and insert sub-scores are computed using the model described in the Methods section. From the four scores a total ALE score is calculated and reported as a text file (.ale), and the text file can be used for input into the supplied plotter to generate a PDF file for visualization.}\label{fig:01}
\end{figure*}
Figure~\ref{fig:01} shows the pipeline used to compute the total ALE score. Given a set of reads and a proposed assembly, ALE first takes as input the alignments of the reads onto the assembly in the form of a SAM or BAM file \cite{Li2009}, which can be produced by a third-party alignment algorithm such as bowtie \cite{Langmead2009} or bwa \cite{Li2009}. ALE then determines the probabilistic placement of each read and a corresponding placement sub-score for each mapped base, which describes how well the read agrees with the assembly. In the case of paired end reads, ALE also calculates an insert sub-score for all mapped bases of the assembly from the read pair, which describes how well the distance between the mapped reads matches the distribution of lengths that we would expect from the library. ALE also calculates a depth sub-score, which describes the quality of the sequencing depth accounting for the GC bias prevalent in some NGS technologies. The placement, insert and depth scores together determine $P(R|S)$. Independently, with only the assembly and not the reads, ALE calculates the k-mer sub-score and $P(S)$. Each sub-score is calculated for each scaffold or contig within an assembly independently, allowing for variations commonly found in metagenomes. The four sub-scores are then combined to form the total ALE score. The constituent calculations in this pipeline are described in the Methods sections \ref{sec:ingredients_of_ALE_score} and \ref{sec:ApproxZ}.
In addition, these four sub-scores are reported by ALE as a function of position within the assembly, and can be visualized with the included plotting package or imported in table form to another package such as the Integrative Genomics Viewer (IGV) \cite{Nicol2009}, or the UCSC genome browser \cite{Kent2002}. When used in this way, these sub-scores can be used to locate specific errors in an assembly.
% section The ALE Score and the likelihood of an assembly (end)
\section{Probabilistic ingredients of the total ALE score} % (fold)
\label{sec:ingredients_of_ALE_score}
We can combine the two probabilities, $P(R|S)$ and $P(S)$, to provide an expression for the probability of the assembly given the reads, $P(S|R)$. While this combined expression is too computationally expensive to compute exactly because of the normalization factor, ALE provides a summary measure of quality called the total ALE score that is proportional to $P(S|R)$ and can be used to compare assemblies.
Below, we first describe how $P(R|S)$ and $P(S)$ are computed from a set of reads $R$ and a given assembly $S$. We then describe how they are combined to compute the ALE score, and how the ALE score can be used to compare the qualities of different assemblies. We first provide an overview of how $P(R|S)$ and $P(S)$ are defined and computed, beginning with $P(R|S)$ and then discussing $P(S)$.
The statistical model from which $P(R|S)$ is calculated supposes that each paired read is generated at random from the genome or collection of genomes according to the following process. First, the distance between the two paired read ends, and their orientation, is chosen at random from a distribution that is specific to the library used to generate them. Second, the locations of the mate pairs on that genome are chosen at random, potentially with a consistent GC bias. Third and finally, the contents of each of the two paired ends are generated by taking the genome's true base pairs at the chosen locations and then copying these base pairs into the reported read, with a given probability of error, insertion, or deletion for each base pair given by the sequencer's quality score.
The likelihood $P(R|S)$ that results from this process can be factored into three components
\begin{equation}
P(R|S)=P_{\text{placement}}(R|S)P_{\text{insert}}(R|S)P_{\text{depth}}(R|S).
\end{equation}
The first component, $P_{\text{placement}}(R|S)$ describes how well the reads' contents match the assembly at the locations to which they are mapped. The second component, $P_{\text{insert}}(R|S)$, describes how well the distances and orientations between each paired read match the distances and orientations that we would expect from the library. The third component $P_{\text{depth}}(R|S)$ describes how well the depth at each location agrees with the depth that we would expect given the GC content at that location. Contributions to these three quantities, as a function of position in the assembly, are used to produce the placement, insert and depth sub-scores.
Together with the likelihood of the reads $R$ given the assembly $S$, $P(R|S)$, the ALE framework also depends upon a Bayesian prior probability distribution over assemblies, written $P(S)$. $P(S)$ describes how likely we would believe the assembly $S$ to be, if we did not have any read information. In this prior probability distribution, we encode the belief that within a single genome, each k-mer has a unique k-mer frequency. This is the frequency of any set of k base pairs appearing in order in the genome. This defines a $4^{k}$ dimensional vector that is conserved across a genome and can help discover when different genomes have been mistakenly combined in a metagenome setting \cite{Teeling2004} \cite{Woyke2006}. Because $P(S)$ is determined by k-mer frequencies, we use the notation $P_{\text{kmer}}(S)$ rather than the more generic $P(S)$ when referring to this probability distribution. Contributions to $P_{\text{kmer}}(S)$ as a function of position in the genome are referred to as the k-mer sub-score.
\subsection{Placement sub-score} % (fold)
\label{sub:Placement sub-score}
$P_{\text{placement}}(R|S)$ quantifies the likelihood of observing a read $r_{i}$, or set of reads $R$, given an assembly $S$. It includes information about how the read maps onto the assembly, the quality score of each base and orientation.
We assume that every paired read is independent of all other pairs of reads, which allows us to write $P_{\text{placement}}(R|S)$ as
\begin{equation}
P_{\text{placement}}(R|S) = \prod_{r_{i} \in R} P_{\text{placement}}\left(r_{i}|S\right),
\end{equation}
where $P_{\text{placement}}\left(r_{i}|S\right)$ describes how well the contents of a single read $r_{i}$ match the assembly at the locations to which they are mapped, as well as how well the distance and orientation between that read's paired ends match the distance and orientation that we expect from the library. We assume independence of these distributions, allowing us to write this as
\begin{equation}
P_{\text{placement}}\left(r_{i}|S\right) = P_{\text{matches}}\left(r_{i}|S\right)P_{\text{orientation}}\left(r_{i}|S\right).
\end{equation}
$P_{\text{matches}}\left(r_{i}|S\right)$ measures how well the read matches the section of the assembly to which it maps. Making the assumption that each base $j$ of the read is correctly called by the sequencer independently with a probability equal to the base's quality score $Q_{j}$, we can write this as
\begin{equation}
P_{\text{matches}}\left(r_{i}|S\right) = \prod_{{\rm base}_{j} \in r_{i}} P\left({\rm base}_{j}|S\right)
\end{equation}
where
\begin{equation}
P\left({\rm base}_{j}|S\right) = Q_{j}
\end{equation}
when the base $j$ correctly matches the assembly and
\begin{equation}
P\left({\rm base}_{j}|S\right) = (1-Q_{j})/4
\end{equation}
when it does not. This expression follows from our modeling assumption that all 4 possible errors that the sequencer could have reported at that base (three different substitutions and a deletion) are equally likely when the read does not match the sequence. This symmetry requires each of the four possible reported errors (a base not equal to the assembly or a deletion) to have equal probability. An insertion, which does not have a corresponding base in the assembly, is modeled similarly, with the 4 in the denominator representing the uniform likelihood of observing any of the 4 possible bases on the assembly at that position. The product across all bases in a read is then equal to the total probability of observing that particular read at the given location in the assembly.
If the assembly has an unknown base at the location (denoted by an ``N") then we set
\begin{equation}
P\left({\rm base}_{j}|S\right) = 1/4
\end{equation}
modeling the fact that there is no information about the correct base at that location with a uniform distribution over all 4 possible bases. If an ambiguity code is reported by the sequencer then the above expression is modified to account for a distribution over the possible bases encoded by the corresponding code (see Table \ref{ambcodes}).
Each read is only allowed to be ``placed" at a single position in the assembly. If the aligner placed a particular read at more than one position, we choose a single position at random, weighted by $P_{\text{placement}}(r_{j}|S)$ score for each proposed position of the read on the assembly. This allows for repeat regions to be properly represented with the correct number of reads in expectation.
The orientation likelihood, $P_{\text{orientation}}\left(r_{i}|S\right)$, is calculated by first counting the number of times that each orientation occurs in the library using the mapping information. The probability that a particular read from a particular library has a particular orientation is then modeled as that orientation's empirical frequency in the library (this can be overridden with user-specified values for the probabilities). The likelihood $P_{\text{orientation}}\left(r_{i}|S\right)$ is then the empirical frequency of the observed orientation of the read $r_{i}$ in the library from which $r_{i}$ belongs.
After combining these two independent probabilities we are left with the total placement score $P_{\text{placement}}(R|S)$ for a given read. Below, we use this when calculating the probability that an assembly is correct given the reads, as well as the overall total ALE score. We also use it to calculate per-base placement scores at particular positions in the assembly. The placement sub-score at a particular position is given as the geometric mean of $P(r_{j}|S)$ of all $r_{j}$ covering that specific position,
\begin{equation}
\left[\prod_{r_{j}\in R^{\prime}}P(r_{j}|S)\right]^{1/N}
\end{equation}
where the product is over the set $R^{\prime}$ of all reads covering the given position, and $N$ is the number of such reads.
% subsection Placement sub-score (end)
\subsection{Insert sub-score} % (fold)
\label{sub:Insert sub-score}
The insert likelihood, $P_{\text{insert}}\left(r_{i}|S\right)$, is determined by first observing all insert lengths from all mappings of all reads and calculating the population mean, $\mu$, and variance, $\sigma^{2}$, of these lengths (the mean and variance can also be set by the user, if they are known). This step only needs to be done once. Once completed, we calculate the insert likelihood for each read $r_{i}$ by assuming that the corresponding observed insert length $L_{i}$ is distributed normally with this mean and variance, so that
\begin{equation}
P_{\text{insert}}\left(r_{i}|S\right) = {\rm Normal}\left(L_{i};\mu,\sigma^{2}\right).
\end{equation}
In this expression, the insert length $L_{i}$ is computed from the read $r_{i}$ and its mapping to the assembly $S$. Similar to the placement score we can calculate the geometric mean of insert scores at a given position to come up with the insert sub-score. This can be useful for determining areas of potential constriction or expansion within a proposed assembly.
% subsection Insert sub-score (end)
\subsection{Depth sub-score} % (fold)
\label{sub:Depth sub-score}
$P_{\text{depth}}(R|S)$ describes how well the depth at each location agrees with the depth that we would expect given the GC content at that location (which is ideally Poisson-distributed \cite{Lander1988}).
For each read, the GC content is the proportion of bases that are either G or C. Modern sequencers and library preparation techniques can bias GC-rich areas of a genome \cite{Aird2011}. This bias affects the observed depth of reads mapping onto specific areas of an assembly. To correct for this bias we first calculate for each of the following 100 ranges of GC content over the average read length, 0\% to 1\%, 1\% to 2\%, $\ldots$, 99\% to 100\%, the average observed depth for positions in each contig in the assembly with a GC content in this range. Call $\mu_{\text{depth}(X_{i})}$ the observed average depth of all reads with a GC content falling in the same range as the GC content percentage $X_{i}$. We set the minimum expected depth to be 10, discounting regions of exceptionally low average depth.
We model the depths to be Poisson distributed about a mean drawn from a Gamma distribution centered at the expected depth for that position given its GC content. This models the dependence of the expected depth on more than just the GC content at that position, such as the presence of ``hard stops" (regions with no reads mapping to them) and the GC content at nearby positions. It results in an infinite mixture of Poissons that is equivalent to a Negative Binomial distribution. For simplicity and computational convenience, we make an independence assumption when computing this component. This causes the expected coverage at a location to depend only upon the GC content at that position, and not the GC content at nearby positions.
Then, at any given position the depth sub-score is
\begin{equation}
\begin{array}{l}
P_{\text{depth}}\left(d_{j}|S,X_{i}\right) \\
= {\rm Poisson}\left(d_{j};Y_{i}\right), Y_{i} \sim {\rm Gamma}(\max(10,\mu_{\text{depth}(X_{i})}), 1) \\
= {\rm NegBinom}(d_{j}; \max(10, \mu_{\text{depth}(X_{i})}), 1/2)
\end{array}
\end{equation}
where the depth is $d_{j}$ and where the GC content percentage $X_{i}$ is averaged across all reads that map (in the placement step) to that position. See Section \ref{sec:GCBias} for a further analysis.
% subsection Depth sub-score (end)
\subsection{k-mer sub-score} % (fold)
\label{sub:kmersubscore}
$P_{\text{kmer}}(S) \propto P(S)$, the k-mer sub-score, describes the likelihood of the assembly $S$, in the absence of any read information. Within this prior probability distribution, we encode the belief that within a single genome, each k-mer (a sequence of $k$ base pairs, where $k$ is a fixed user-defined number initially set to 4) has a unique k-mer frequency. This is the frequency with which the k-mer appears in the genome. The $4^{k}$ dimensional vector giving this frequency for each k-mer is conserved across a genome and can help determine if two different genomes have been mistakenly combined \cite{Teeling2004} \cite{Woyke2006}. Let $K$ be the set of all possible unique k-mers, so $|K| = 4^{k}$, and for each $i$ in $K$ let $n_{i}$ be the number of times this k-mer appears in a contig in the assembly. Then, the frequency $f_{i}$ of a particular k-mer $i$ within a contig is
\begin{equation}
f_{i} = \frac{n_{i}}{\sum_{j\in K}n_{j}}.
\end{equation}
The k-mer score is the product of this frequency over each k-mer appearing in each contig of the assembly $S$, which can be written as
\begin{equation}
P_{\text{kmer}}(S) = \prod_{i\in K} f_{i}^{n_{i}}.
\end{equation}
This is equivalent to assuming each k-mer in the assembly is drawn independently with identical distributions from a multinomial distribution with probabilities empirically estimated from the assembly.
The k-mer sub-score of a base at any given position in the assembly is the (geometric) average of $P_{\text{kmer}}(S)$ of all k-mers that cover that position. In calculating this average, the very first base in the genome only has one contributing k-mer, the second has two, up to k contributing k-mers after $k-1$ bases.
% subsection k-mer sub-score (end)
% section Probabilistic ingredients of the total ALE score (end)
\section{Approximating Z} % (fold)
\label{sec:ApproxZ}
Bayes' rule tells us that the probability that the assembly $S$ is correct is
\begin{equation}
P(S|R) = \frac{P(R|S)P(S)}{Z}
\end{equation}
where $Z$ is a proportionality constant that ensures that $P(S|R)$ is a probability distribution; $Z$ is found by summing over all possible assemblies $S^{\prime}$,
\begin{equation}
Z = \sum_{S^{\prime}}P(R|S^{\prime})P(S^{\prime}).
\end{equation}
$Z$ cannot be explicitly computed because the space of all possible assemblies is far too large ($4^{L}$ where $L$ is the length of the assembly).
Instead we compute an approximation $\hat{Z}$ to $Z$. This provides an approximation to $P(S|R)$,
\begin{equation}
P(S|R) \approx \frac{P(R|S)P(S)}{\hat{Z}}.
\end{equation}
We can compare two assemblies generated from the same library of reads without calculating $Z$, the denominator in our Bayesian likelihood framework, because it cancels when taking the ratio of the likelihoods of the two assemblies. To determine a total ALE score for a single assembly, however, we must calculate or approximate $Z$. Our goal in approximating $Z$ is to use a quantity that does not depend on the assembly $S$ (or only depends weakly through some empirically estimated quantities included as parameters in the overarching statistical model), and is approximately of the same order of magnitude as the exact value of $Z$. In this section, we refer to our approximate $Z$ as $\hat{Z}$, and define it as a product of the terms,
\begin{equation}
\hat{Z} = \hat{Z}_{\text{placement}}\hat{Z}_{\text{insert}}\hat{Z}_{\text{depth}}\hat{Z}_{\text{kmer}}.
\end{equation}
We will define each term in this product separately.
\subsection{Approximating arbitrary $Z$}
\label{arbZ}
The $Z$ normalization factor in Bayes' Theorem is defined as
\begin{equation}
Z = \sum_{S^{\prime}}P(R|S^{\prime})P(S^{\prime}).
\end{equation}
The problem with computing this value explicitly is
\begin{enumerate}
\item The space $S^{\prime}$ is very large ($4^{L}$ where $L$ is the length of an assembly).
\item The set of assemblies for which $P(R|S^{\prime})$ is above a certain threshold $\epsilon > 0$ (like floating point precision $\epsilon = 10^{-8}$) is very small compared to the entire space. This is because the probability $P(R|S^{\prime})$ falls off very quickly when the reads do not agree with the assembly $S^{\prime}$, but this is still too large of a space to compute $Z$ explicitly.
\end{enumerate}
We make the approximation
\begin{equation}
P(R|S^{\prime}) \approx \mathbb{E}_{}\left[P({\bf R}|S^{\prime})\right] = \sum_{r \in R^{\prime}} P(r|S^{\prime})^{2} = \sum_{r \in R^{\prime}} P(r|S_{0})^{2},
\end{equation}
where $R^{\prime}$ is the set of all possible reads and ${\bf R}$ is a random set of reads drawn from this set and $S_{0}$ is an arbitrary assembly. We drop the dependence on $S^{\prime}$ in the final equality because when summing over all possible reads every combination of read and assembly is evaluated, regardless of the specific assembly compared against. This is to say that for any fixed $S$, every possible permutation of reads (location and number of errors, quality scores, insert lengths, orientations, depths) is evaluated with respect to that assembly. Let $E$ be the set of all possible errors between a read and an assembly and all combinations thereof, (for example, substitution error at position 9 with quality score .97, etc). Then
\begin{equation}
\mathbb{E}_{}\left[P(R|S^{\prime})\right] = \sum_{r \in R^{\prime}} P(r|S^{\prime})^{2} = \sum_{E} \sum_{r_{e} \in R_{e}^{\prime}} P(r_{e}|S^{\prime})^{2} = \sum_{r \in R^{\prime}} P(r|S_{0})^{2},
\end{equation}
where $R_{e}^{\prime}$ is the set of reads containing the errors $e \in E$. $E$ contains all possible errors (including no errors) so the space of reads can be partitioned with respect to the errors when compared to any fixed assembly $S^{\prime}$. The value of $P(r_{e}|S^{\prime})$ is fixed by the model and independent of the assembly. If there is an error at a specific position, the placement score depends on the type of error and the quality score of the read at that position, which is encoded in $e \in E$. So we can set $P(r_{e}|S^{\prime}) = A_{e}$ where $A_{e}$ is a value independent of $S^{\prime}$.
In fact, this is true for any such assembly, making the dependence on $S^{\prime}$ moot. Our equation for $Z$ then becomes
\begin{equation}
Z \approx \sum_{S^{\prime}} \mathbb{E}_{}\left[P(R|S^{\prime})\right] P(S^{\prime}) = \sum_{S^{\prime}} \left[\sum_{r \in R^{\prime}} P(r|S_{0})^{2}\right] P(S^{\prime}) = \sum_{r \in R^{\prime}} P(r|S_{0})^{2} = \hat{Z}
\end{equation}
because $\sum_{S^{\prime}}P(S^{\prime}) = 1$ and the dependence on $S^{\prime}$ is dropped.
\subsection{Approximating $Z_{\text{placement}}$} % (fold)
\label{sub:Aprroximating Z_placement}
$\hat{Z}_{\text{placement}}$ is defined as
\begin{equation}
\hat{Z}_{\text{placement}} = \prod_{r \in R} \hat{Z}_{\text{placement}}(r|S).
\end{equation}
In this expression $R$ is the set of reads actually observed, $r$ is one read in this set of reads, and $\hat{Z}_{\text{placement}}(r|S)$ is defined as
\begin{equation}
\begin{array}{lcl}
\hat{Z}_{\text{placement}}(r|S) & = & \mathbb{E}\left[P_{\text{placement}}\left({\bf r^{\prime}}|S\right)\right] \\
& = & \sum_{r^{\prime} \in R^{\prime}} \left[ P_{\text{placement}}(r^{\prime}|S)\right]^{2} \\
& = & \sum_{\text{matches}} \sum_{\text{orientation}} \left(P_{\text{matches}}(r^{\prime}|S)P_{\text{orientation}}(r^{\prime}|S)\right)^{2}
\end{array}.
\end{equation}
In this expression $R^{\prime}$ is the set of all possible reads of the length given by $r$ and ${\bf r^{\prime}}$ is a random set of reads drawn from that set. We sum over all possible matches and orientations, which is analogous to summing over all possible reads. Although $S$ appears in this expression, its value does not depend on $S$ because of permutation symmetry as described in Section \ref{arbZ}. This symmetry allows us to calculate this expression analytically, without enumerating over $R^{\prime}$. In addition to its lack of dependence on $S$ and its ease of computation, this choice for $P_{\text{placement}}(r)$ is motivated by the belief that this quantity scales roughly like
\begin{equation}
P_{\text{placement}}(r) = \sum_{S^{\prime}}P_{\text{placement}}(r|S^{\prime})P(S^{\prime}),
\end{equation}
which is a quantity identical in form to $Z$, but restricted to the placement probability of a particular read.
% subsection Approximating $Z_{\text{placement}}$ (end)
\subsection{Approximating $Z_{\text{insert}}$} % (fold)
\label{sub:Aprroximating Z_insert}
We define $\hat{Z}_{\text{insert}}(r|S)$ as,
\begin{equation}
\begin{array}{lcl}
\hat{Z}_{\text{insert}}(r|S) & = & \mathbb{E}\left[P_{\text{insert}}\left({\bf r^{\prime}}|S\right)\right] \\
& = & \sum_{r^{\prime} \in R^{\prime}} \left[ P_{\text{insert}}(r^{\prime}|S)\right]^{2} \\
& = & \sum_{\text{insert}} \left(P_{\text{insert}}(r^{\prime}|S)\right)^{2}
\end{array},
\end{equation}
where again, in this expression $R^{\prime}$ is the set of all possible reads of the length given by $r$ and ${\bf r^{\prime}}$ is a random set of reads drawn from that set. Similarly, we sum over all possible insert lengths.
% subsection Approximating $Z_{\text{insert}}$ (end)
\subsection{Approximating $Z_{\text{depth}}$} % (fold)
\label{sub:Aprroximating Z_depth}
We define $\hat{Z}_{\text{depth}}$ as,
\begin{equation}
\hat{Z}_{\text{depth}} = \prod_{{\rm base}_{i} \in S} \hat{Z}_{\text{depth}} \left(X_{i}|S\right),
\end{equation}
where
\begin{equation}
\hat{Z}_{\text{depth}} \left(X_{i}|S\right) = \mathbb{E}\left[P_{\text{depth}} \left({\bf d}|X_{i},S\right)\right] = \sum_{d=0}^{\infty} \left(P_{\text{depth}}\left(d | X_{i}, S\right)\right)^{2},
\end{equation}
where $P_{\text{depth}}$ is defined as before and ${\bf d}$ is a random depth drawn from the set of possible depths at location $i$. Although $S$ appears in this expression, its value does not depend on $S$. We can calculate this expression analytically using a hyper-geometric function; see implementation section \ref{sec:DepthZnormalization}.
% subsection Aprroximating $Z_{\text{depth}}$ (end)
\subsection{Approximating $Z_{\text{kmer}}$} % (fold)
\label{sub:Aprroximating Z_kmer}
We define $\hat{Z}_{\text{kmer}}$ as the expected kmer score,
\begin{equation}
\hat{Z}_{\text{kmer}} = \mathbb{E}\left[P_{\text{kmer}}({\bf S})\right] = \left(\sum_{i\in K} f_{i}^{2}\right)^{N},
\end{equation}
where $f_{i}$, $K$ and $n_{i}$ are defined as before in section \ref{sub:kmersubscore} and ${\bf S}$ is a random assembly drawn from the set of all possible assemblies. This method finds the expected k-mer score for a uniform random k-mer and applies that score $N$ times. Although $\hat{Z}_{\text{kmer}}$ depends on $S$ through the empirically determined $f_{i}$, which may be undesirable, this dependence follows naturally from our statistical model because the $f_{i}$ are considered parameters, which are estimated from data and then treated as known by the model.
% subsection Aprroximating $Z_{\text{kmer}}$ (end)
The preceding calculations allow us to approximate $Z$ and find an ``absolute" likelihood score for a given assembly without comparing it to another assembly with the same library and alignment.
% section Approximating Z (end)
\section{Relationship of the difference of total ALE scores to probability of correctness} % (fold)
\label{sec:Relationship of the difference of total ALE scores to probability of correctness}
Here we derive an expression described in the Results section for the difference of two total ALE scores in terms of the probability that an assembly is correct. Suppose we have two assemblies, $S_{1}$ and $S_{2}$. Call $A_{1}$ the total ALE score of the first assembly, and $A_{2}$ the total ALE score of the second assembly, both generated from the same set of reads $R$. The difference of these scores is then
\begin{equation}
\begin{array}{l}
A_{1} - A_{2} \\
= \log\left(P\left(R|S_{1}\right)\right) + \log\left(P\left(S_{1}\right)\right) - \log\left(\hat{Z}\right) - \log\left(P\left(R|S_{2}\right)\right) - \log\left(P\left(S_{2}\right)\right) + \log\left(\hat{Z}\right) \\
\approx \log\left(P\left(R|S_{1}\right)\right) + \log\left(P\left(S_{1}\right)\right) - \log\left(Z\right) - \log\left(P\left(R|S_{2}\right)\right) - \log\left(P\left(S_{2}\right)\right) + \log\left(Z\right) \\
= \log\left(\frac{P\left(R|S_{1}\right)P\left(S_{1}\right)}{P\left(R|S_{2}\right)P\left(S_{2}\right)}\right)
\end{array}.
\end{equation}
% section Relationship of the difference of total ALE scores to probability of correctness (end)
\section{Correction for GC Bias} % (fold)
\label{sec:GCBias}
Modern sequencers have a GC bias, which is to say that regions of the genome with different concentrations of the bases {\bf G} and {\bf C} will produce different numbers of reads, which causes the coverage of these regions within an assembly to vary. In Figure \ref{fig:GC1} we see the average depth at different GC concentrations within a {\it Spirochaeta smaragdinae} genome with reads generated from an Illumina sequencer. This variation in average depth can cause many false positives related to the ALE depth score. The Poisson distribution drops off steeply, which is to say that depth values away from the global mean would be scored very low, due solely to the inherent depth bias of the sequencer. We need to model this bias to eliminate these false positives.
\begin{figure}[!tpb]%figure1
\centerline{\includegraphics[width=\textwidth]{figures/ALE/avgDepthVsGC.png}}
\caption[Average depth vs. GC content]{The average depth at various GC contents in the {\it Spirochaeta smaragdinae} genome for reads of length $l = 77$.}\label{fig:GC1}
\end{figure}
We define the average GC content of a base relative to a read length $l$ as the average GC content of all reads of length $l$ that could possibly overlap that position.
Assume we look at the following subsequence, focusing on the G at position 4 and assuming that the reads are of length 4.
\singlespacing
\begin{center}
\texttt{...1234567...\\
...ATCGTCA...}
\end{center}
\normalspacing
One sees the following possible reads of length $l=4$ that can overlap with position 4 with their corresponding GC content (Table \ref{GCcontTable}).
\begin{table}[h]
\caption{GC content of reads}
\label{GCcontTable}
\begin{center}
\begin{tabular}{c|c}
read & GC content \\
\hline
\texttt{ATCG} & 50\% \\
\texttt{TCGT} & 50\% \\
\texttt{CGTC} & 75\% \\
\texttt{GTCA} & 50\%
\end{tabular}
\end{center}
\end{table}
This implies that the base at position 4 is marked with an effective average GC content of 56.25\% given reads of length $l=4$.
\begin{figure}[hptb]
\centerline{\includegraphics[width=\textwidth]{figures/ALE/spiroGC.png}}
\caption[GC content of {\it Spirochaeta smaragdinae}]{The GC content can vary greatly within a genome. This figure illustrates the GC content of various positions along the {\it Spirochaeta smaragdinae} genome for read length $l=77$.}\label{fig:GC2}
\end{figure}
\begin{figure}[hptb]
\centerline{\includegraphics[width=\textwidth]{figures/ALE/DepthDistVsGC_564lib.png}}
\caption[Depth distribution of different GC contents]{The depth distributions for different ranges of GC content within the {\it Spirochaeta smaragdinae} genome. We can see that the distributions have a fat tail and have varying means.}\label{fig:GC3}
\end{figure}
We can see that the GC bias of the sequencer violates the assumption that the depth is Poisson distributed with a uniform mean. To correct for this bias we model the depth $d$ at a position $j$ being Poisson distributed with a mean drawn from a Gamma distribution related to the depth at that position's GC content,
\begin{equation}
P_{\text{depth}}\left(d_{j}|S,X_{i}\right) = {\rm Poisson}\left(d_{j};Y_{i}\right)
\end{equation}
with
\begin{equation}
Y_{i} \sim {\rm Gamma}(\max(10,\mu_{\text{depth}(X_{i})}), 1).
\end{equation}
We set the mean of the Gamma distribution to the average depth of all positions with the same GC content $\mu_{\text{depth}(X_{i})}$. The minimum value of the parameter is set to 10 in practice to further discount hard stops (positions with 0 depth) and contigs that have very low, but consistent depth, which represents low evidence.
The above equations result in a Negative Binomial distribution,
\begin{equation}
P_{\text{depth}}\left(d_{j}|S,X_{i}\right) = {\rm NegBinom}(d_{j}; \max(10, \mu_{\text{depth}(X_{i})}), 1/2),
\end{equation}
where the second parameter is analytically equal to $\frac{1}{2}$.
This model has the benefit of incorporating the inherent GC bias of the sequencer and fitting the data well (Figure \ref{fig:GC4}). The ALE depth scores that are generated by using this distribution have a lower variance (see Figure \ref{fig:GC5}). This means that scores that deviate from the mean are not scored as low as when using the Poisson distribution.
\begin{figure}[hptb]
\centerline{\includegraphics[width=0.5\textwidth]{figures/ALE/GC_depth_distributions.pdf}}
\caption[Various distributions fit to GC content]{The depth distribution for all positions with GC content between 40-50\%. We compare it to a Poisson distribution as well as the maximum likelihood estimated Negative Binomial distributions with and without fixed parameter $r$.}\label{fig:GC4}
\end{figure}
\begin{figure}[hptb]
\centerline{\includegraphics[width=0.5\textwidth]{figures/ALE/GC_depth_distribution_of_scores.pdf}}
\caption[Distribution of scores using various distributions]{The distributions of ALE depth scores given to positions with 40-50\% GC content. We see that the Poisson distribution tends to have a fat tail of low scores. This is to say that it is very strict, punishing positions that fall outside of the narrow probability distribution function. The fixed $r$ Negative Binomial function has a large number of medium scores and a low number of very low scores, making it a much more forgiving distribution.}\label{fig:GC5}
\end{figure}
% section Correction for GC Bias (end)
% section Placing reads (end)
\section{Thresholding the total ALE score} % (fold)
\label{sec:Thresholding the total ALE score}
In the plotting program distributed with ALE we use a thresholding algorithm to highlight potential areas of poor assembly quality using the per-base ALE scores. We do this by averaging scores within windows, allowing for the discovery of large errors in the assembly while smoothing out the noise. By the Central Limit Theorem, when we average many independent and identically distributed random variables the result is approximately normally distributed. This allows us to create a ``threshold" for which to delineate ``good" scores from ``bad" and pinpoint problematic regions. This is represented by a solid black line in the figures labeled ``$5\sigma$." This line is calculated by assuming that the individual scores at each position in the assembly are drawn from a mixture of two normal distributions: one for high accuracy and another for low accuracy. We use maximum likelihood to determine the mean and variance of the two underlying distributions. See section \ref{mixturemodel} in the implementation chapter for a more in-depth discussion. The threshold is set as five standard deviations from the mean of the ``high accuracy" distribution. This allows us to readily find areas of inaccuracy that are unlikely to be drawn from an accurate region. Five standard deviations corresponds to 1 false positive in 2 million positions if the joint normal distribution assumptions hold. The number of standard deviations at which the black line is drawn can be set from the command line by the user.
A red bar is drawn on the plot if the likelihood falls below the threshold at a significant fraction of the positions in any contiguous region with a given length (this fraction and length are user defined, and are initially set to 0.01\% and 1000bp respectively); see Figures 3.1, 3.3 and 3.4. The red bars correspond to regions of potential inaccuracy in the assembly that should be examined further. The plotter outputs these regions in a tab-delimited text file for easy input into genome viewing software programs like IGV.
% section Thresholding the total ALE score (end)
% chapter ALE Methods (end)
\chapter{ALE Results} % (fold)
\label{cha:ALE Results}
\section{Performance on major types of miss-assemblies in a genome assembly with synthetic data}
Common assembly errors include single-base substitutions, insertion/deletions, chimeric assemblies derived from translocations or misjoins, and copy number errors derived from repeat condensation/expansions. To test ALE's ability to detect these types of errors in an assembly, we generated synthetic reads from a reference genome and then seeded the reference with each type of error. First, 400,000 paired-end synthetic reads were generated from the first 350kbp at random positions of {\it Escherichia coli} K12 substrain DH10B (\cite{Durfee2008}). Their insert length follows a normal distribution with mean 200bp and standard deviation 7bp. Next, synthetic miss-assemblies were introduced at 6 different locations within this reference. The miss-assemblies introduced were a substitution, insertion, deletion, inversion, translocation and a copy number error, respectively (Figure \ref{ALEfig2}). We treated this mutated genome as the proposed assembly.
\begin{figure*}[!htpb]%figure1
\centerline{\includegraphics[width=\textwidth]{figures/ALE/fig_2_top.pdf}}
\caption[ALE performance with synthetic reads]{The performance of ALE on synthetic errors in {\it E.Coli}. At the genome level, at least one of the four sub-scores drops dramatically in each region containing a synthetic error. A higher resolution view for each type is illustrated in Figure \ref{ALEfig2p1}, \ref{ALEfig2p2}, \ref{ALEfig2p3} and \ref{ALEfig2p4}. ({\bf A}) Substitution, deletion and insertion errors; ({\bf B}) an inversion error; ({\bf C}) a transposition error; and ({\bf D}) a copy number error.}\label{ALEfig2}
\end{figure*}
\begin{figure*}[hptb]%figure1
\centerline{\includegraphics[width=0.6\textwidth]{figures/ALE/fig2_bot_p1.pdf}}
\caption[ALE performance: synthetic single base errors]{Part {\bf A} of Figure \ref{ALEfig2}: A substitution, deletion and insertion error. The placement sub-score drops significantly at positions 69kbp, 70kbp and 71kbp respectively, which are the locations where the single-base substitution, deletion and insertion errors were added.}\label{ALEfig2p1}
\end{figure*}
\begin{figure*}[hptb]%figure1
\centerline{\includegraphics[width=0.6\textwidth]{figures/ALE/fig2_bot_p2.pdf}}
\caption[ALE performance: synthetic inversion]{Part {\bf B} of Figure \ref{ALEfig2}: An inversion error of length 200bp at position 140kbp causes a drop in the placement, insert and depth sub-score as read mates fail to align to the region.}\label{ALEfig2p2}
\end{figure*}
\begin{figure*}[hptb]%figure1
\centerline{\includegraphics[width=0.6\textwidth]{figures/ALE/fig2_bot_p3.pdf}}
\caption[ALE performance: synthetic transposition]{Part {\bf C} of Figure \ref{ALEfig2}: A transposition error of length 200bp at position 210kbp and a copy number error of length 77bp at position 280kbp both cause the placement, insert and depth sub-scores to drop.}\label{ALEfig2p3}
\end{figure*}
\begin{figure*}[hptb]%figure1
\centerline{\includegraphics[width=0.6\textwidth]{figures/ALE/fig2_bot_p4.pdf}}
\caption[ALE performance: synthetic copy number error]{Part {\bf D} of Figure \ref{ALEfig2}: A copy number error of length 77bp (read length) was added at position 280kbp causing the depth and insert scores to drop, as well as the placement at the very center where the two copies meet.}\label{ALEfig2p4}
\end{figure*}
We tested the ALE algorithm by aligning the above synthetic reads to the proposed assembly using bowtie (\cite{Langmead2009}) and ran the results through the ALE software package. ALE automatically thresholds each error (see Methods) and produces plots of the sub-scores near each error using the included plotter (Figure \ref{ALEfig2}). We found that ALE is able to locate each type of error in the proposed assembly. At the genome level, at least one of the four sub-scores drops dramatically in each region containing a synthetic error. With this set of synthetic data, ALE reports no false discoveries. These results suggest that ALE systematically reports all major types of errors with simulated data.
\begin{figure}[!tpb]%figure1
\centerline{\includegraphics[width=\textwidth]{figures/ALE/Clark_Fig3b.pdf}}
\caption[ALE score vs. number of errors]{The total ALE score decreases monotonically as the number of errors increases. Insertion (green) and deletion (red) errors cause the total ALE score to drop at a faster rate (per error) than substitution errors (blue) under the model.}\label{ALEfig3}
\end{figure}
Furthermore, the total ALE score decreased as more errors were added to the assembly. As shown in Figure \ref{ALEfig3}, as the number of substitution, insertion and deletion errors increased, the total ALE score decreased monotonically, the rate of which is determined by the quality scores of the data (see Methods). This suggests that the total ALE score indicates overall assembly accuracy.
\section{Detecting chimeric assemblies in a synthetic metagenome}
One common assembly error in metagenome assemblies is the chimeric assembly consisting of two or more genomes. To test ALE's ability to distinguish this type of metagenome-specific error, we simulated a miss-assembled contig by joining several pieces of two genomes (Brachyspira murdochii DSM 12563 (\cite{Pati2010}) and Conexibacter woesei DSM 14684 (\cite{Pukall2010})) in a random order (Figure \ref{ALEfig4}). Using a known, synthetic reference allows for the unbiased testing of ALE's sensitivity to chimeric metagenomes, since there is not any true metagenome reference available. K-mer sub-scores and plots were generated for this simulated contig as well as the two correct genomes individually for comparison.
\begin{figure*}[!tpb]%figure1
\centerline{\includegraphics[width=\textwidth]{figures/ALE/Clark_Fig4b.png}}
\caption[ALE performance on synthetic metagenome]{The performance of ALE on synthetic metagenome assembly errors. ({\bf A} and {\bf B}) The k-mer sub-score for each genome individually. ({\bf C}) k-mer sub-score for a simulated chimeric metagenome between the two genomes. A diagram above ({\bf C}) illustrates how the two genomes were mixed.}\label{ALEfig4}
\end{figure*}
ALE relies on k-mer sub-score (the default is k=4) to distinguish contigs coming from different microbial species, because tetra-nucleotide frequencies are a reliable species-specific signature (\cite{Teeling2004}; \cite{Woyke2006}). If a genome, or contig, contains two or more distinct regions characterized by different k-mer vectors, then the k-mer sub-score will be lower for the positions characterized by the less prevalent k-mer vector (see Methods). Because the other sub-scores are unaffected by the mixture, the drop in total ALE score is due to the lower k-mer sub-score. This unique capability of ALE allows easy detection of chimeric contig/scaffolds within a metagenome assembly. As shown in Figure \ref{ALEfig4}, the k-mer sub-score is consistent for each of the two genomes individually as expected (Figure \ref{ALEfig4}, A and B). In contrast, the k-mer sub-score is much lower for the mixed genomes, clearly identifying where two genomes are mixed together in the same assembly.
\section{Discovery of errors in real genome assemblies}
\label{ALE_res_spiro}
The above experiments used simulated reads, or assemblies with simulated errors. Real reads and real genome/metagenome assemblies are often very noisy, presenting an additional challenge to ALE. To test ALE using real world assemblies with real reads we chose a finished genome, {\it Spirochaeta smaragdinae} DSM 11293, originally constructed from 454 and Illumina reads (\cite{Mavromatis2010}), and applied ALE to it using one lane of 2x76 paired end Illumina reads. The results are shown in Figure \ref{ALEfig5} and Table \ref{ALEtab1}. At the genome level, ALE found several errors, including a large 560kbp region (3.91mbp--4.48mbp) in the proposed assembly where the depth sub-score dropped below the threshold. We found 3 areas producing errors that are likely due to repeat condensation. For example, further examination of two regions (408kbp-415kbp and 4.241mbp-4.247mbp) by overlaying the Illumina short read data indicates these regions have much higher sequence depth (2X) than neighboring regions, and contain many SNPs (two alleles of roughly equal ratio) (Figure \ref{ALEfig5}, B and C), supporting the hypothesis that there are two copies of these regions in this genome. The boundaries of these regions also have abnormal placement and insert sub-scores, further supporting the hypothesis that there are miss-assemblies at the above locations.
\begin{figure*}[!tpb]%figure1
\centerline{\includegraphics[width=\textwidth]{figures/ALE/fig5_top.pdf}}
\caption[ALE performance on real reads]{({\bf A}) the ALE plot for {\it Spirochaeta smaragdinae} using 44M paired reads. Four ALE sub-scores were plotted across the entire genome assembly: placement (blue), k-mer (green), insert (purple) and depth (red). miss-assemblies identified by ALE default thresholds were highlighted. Two of these miss-assemblies are displayed in the integrated genome viewer (IGV, in Figure \ref{ALEfig5p2}), with the original Illumina data used by ALE and the validation data from PacBio sequencing. SNPs automatically identified by IGV are shown as colored bars in the sequencing coverage plots (the coverage is indicated by the numbers on the left).}\label{ALEfig5}
\end{figure*}
\begin{figure*}[!tpb]%figure1
\centerline{\includegraphics[width=\textwidth]{figures/ALE/fig5_mid.pdf}}
\centerline{\includegraphics[width=\textwidth]{figures/ALE/fig5_bot.pdf}}
\caption[ALE real reads: IGV of miss-assemblies]{Two of these miss-assemblies from Figure \ref{ALEfig5} displayed in the integrated genome viewer (IGV), with the original Illumina data used by ALE and the validation data from PacBio sequencing. SNPs automatically identified by IGV are shown as colored bars in the sequencing coverage plots (the coverage is indicated by the numbers on the left).}\label{ALEfig5p2}
\end{figure*}
\begin{table}[!htp]
\caption{miss-assemblies identified in {\it Spirochaeta smaragdinae}.}
\label{ALEtab1}
\begin{center}
%\processtable{miss-assemblies identified in Spirochaeta smaragdinae.\label{Tab:01}}
\begin{tabular}{c|c|c}
Threshold Violation Type & Starting Position & Ending Position\\\hline
Placement & 411624 & 412375\\
Placement & 4243804 & 4244967\\
Placement & 4317554 & 4317856\\
Insert & 383643 & 688112\\
Depth & 3909940 & 4481198\\
\end{tabular}
\end{center}
\end{table}
To determine whether these errors identified by ALE are true assembly errors or Illumina artifacts, we independently validated the results using PacBio sequencing data. A total of 53 SMRT cells comprising 221mb of mapped reads, or 34-fold coverage, were aligned to the assembly. Manual inspection of the resulting PacBio alignment confirms 5/5 assembly errors (Figure \ref{ALEfig5} B and C \& Table \ref{ALEtab1}), suggesting the errors identified by ALE are true errors in the assembly.
\section{Sensitivity to SNVs in real data}
\label{SNVsReal}
To show that ALE has a high sensitivity to real errors in real data we examine a re-sequencing project. In this project one lane of Illumina 36x2 paired reads was generated from a new strain of {\it Rhodobacter sphaeroides} 2.4.1 (\cite{Choudhary2006}) with an insert length of $\sim$200bp covering the genome with an average coverage depth of 557. This genome has a very high GC content (68\%) and contains 336 hard stops and many more very low depth regions. A hard stop is a region where a bias in the sequencer causes it to report 0 depth (no reads) without any read pairs spanning the region. This makes SNV detection difficult for many SNP detectors (\cite{Wang2011}). The reads were aligned to the reference genome and used to independently compile a reference set of 222 possible SNVs between the strains (176 from Chromosome1, length 3.2Mbp; 46 from Chromosome2, length 0.94Mbp). The placement sub-score was then computed using the same re-aligned reads.
To determine the positions with the least evidence under the model we sorted the placement sub-scores for each chromosome. The 0.0001\% worst scoring positions (219 regions) on Chromosome1 are within a read length of 154 of the 176 variants (88\%), and the top 0.0005\% worst positions (977 regions) contain more than 97\% of the variants. The same experiment for Chromosome2 recovers 87\% (40 of 46 variants from 63 regions) and 96\% (from 309 regions) respectively. For a further discussion refer to section \ref{ROCALE}. This shows that the positions at which the proposed assembly differs from the genome generating the reads are among the positions with the lowest sub-scores. The regions with poor sub-scores that do not correspond to the variant list are other regions unsupported by the read evidence, such as hard stops and regions of very low coverage that stem from the bias of the sequencer. This shows that ALE can locate regions unsupported by the read evidence, including SNVs, and that ALE accurately gauges assembly quality at multiple base resolutions.
\section{ALE's performance with Pacific Biosciences RS data}
The above experiments were all performed with next generation short read data. Currently, the Pacific Biosciences (PacBio) sequencing platform, also referred to as ``third-generation'' sequencing, is becoming increasingly popular due to its long read length (up to several kb) (\cite{Eid2009}). These long reads are expected to greatly reduce the complexity associated with genome assembly validation. In contrast with the second generation sequencing, single-molecule based PacBio RS sequencing has a much higher base error rate ($\sim$15\%), making it an ideal candidate for testing the robustness of ALE against very noisy data. For this purpose we examined the reference genome of {\it Lambda Phage} and corresponding PacBio reads of average depth 548x and a randomly sampled set at 50x. To determine ALE's performance on this dataset, the reference genome was synthetically mutated by adding 12 substitution, insertion and deletion errors at various locations (Table \ref{ALEtab2}). At 548x depth, within the top 12 lowest placement sub-scores, ALE recovered all 12 errors at the mutated positions, while reporting no false positives. At 50x depth, excluding the low coverage edges, the 12 errors were detected in the top 14 lowest placement sub-scores, with 2 false positives. In comparison, the standard Pacific Biosciences variant caller, EviCons, correctly identified only 10 of these errors with low confidence at default settings and the full 548x depth. This shows that ALE is a robust measure of assembly accuracy with noisy sequencing data, and it is a generic framework that can be used with both short and long sequence read technologies.
\begin{table}[!htp]
\label{ALEtab2}
\begin{center}
\begin{tabular}{cccccc}
Operation & Mutation & Position & 548x& 50x & 548x\\
Type & Details & & Evicon & Rank & Rank\\
& & & (PacBio) & (ALE) & (ALE)\\
\hline
Sub & C $\rightarrow$ A & 881 & 1 & 5 & 5\\
Ins & CC $\rightarrow$ CCC & 2161 & - & 14 & 12\\
Del & G $\rightarrow$ - & 3681 & 1 & 9 & 8\\
N/A & - & 15712 & - & 7 & -\\
Del & AACGGGCAGA & 16561 & 1 & 4 & 4\\
Ins & AACGGGCAGA & 17030 & 1 & 3 & 2\\
Sub & A $\rightarrow$ G & 22881 & 1 & 10 & 7\\
Sub & T $\rightarrow$ A & 28561 & 1 & 11 & 10\\
Del & T & 34560 & 1 & 12 & 11\\
Sub & G $\rightarrow$ C & 36560 & 1 & 8 & 9\\
Ins & ACGTACGT & 40721 & 1 & 1 & 1\\
N/A & - & 41318 & - & 13 & -\\
Del & TCATCGCG & 43200 & - & 6 & 6\\
Ins & C & 47600 & 1 & 2 & 3
\end{tabular}
\end{center}
\caption[Real Data ALE vs. PacBio]{Performance of ALE on synthetic assembly errors in {\it Lambda Phage} genome with different PacBio sequencing depths (50x and 548x)}
\end{table}
\section{Discussion}
ALE facilitates the rapid discovery of many types of errors in genome assemblies, including metagenomes. It does this by applying a rigorous statistical model and calculating the likelihood of observing a specific assembly given the reads that were used to generate it. This allows ALE to determine specific regions within a proposed assembly that are poorly supported by the reads. By integrating several aspects of the assembly and the reads, including k-mer composition, sequence depth, insert length, and how well individual bases map, ALE is able to find errors as small as a single substitution error or indel, as well as large copy number errors and chimeric metagenome assemblies.
This framework can serve as a guide to optimize the genome assembly in the following two ways. First, total ALE scores can be used to identify the best assembly from those generated by different assembly protocols. Second, by modifying the regions in which ALE reports low sub-scores, more accurate genomes can be constructed. The space of possible corrections to an input genome is too large to allow the current implementation of ALE to be used as an independent assembler, but it could be used to compare and combine the results from different assemblers and produce an assembly that is most likely to be correct. ALE could also be used to present an alternative method for calculating assembly quality in local assembly algorithms such as Genovo (\cite{Laserson2011}).
When used with a reference genome and re-sequencing data, ALE can discover structural variations. As shown in the cases of {\it Spirochaeta smaragdinae} and {\it Rhodobacter sphaeroides}, ALE readily detects structural variations whose sizes vary from a few bases to several hundred kilobases.
ALE is influenced by the quality of its input: the read data and the alignments of those reads onto the proposed assembly. Data with biased content or alignments, while accepted by ALE, tend to produce noisy sub-scores. The robustness of ALE, however, allows for the recovery of an accurate assembly quality measure as long as the random noise is consistent with the statistical model used by ALE (see Methods).
Future experimental work is needed to determine the profile of assembly errors within a given dataset. This would better characterize the sub-scores of specific assembly errors, and allow the computation of a per-base confidence in the correctness of the assembly at each base from the corresponding sub-scores. One possible approach would be to select a number of regions with good sub-scores, mutate those regions of the assembly to simulate errors, and then reevaluate the sub-scores at these regions. Comparing the sub-scores before and after mutation would provide information about the distribution of sub-scores for accurate and erroneous regions, in that dataset. Additionally, this could inform an auto-correction algorithm based on ALE to fix problematic regions.
In addition, the model could be extended in future work to account for factors like origin of replication bias prevalent in circular genomes, automatic detection of sequencer bias and different potential distributions for insert length and coverage depth. Biases such as hard stops in Illumina could potentially be found by examining unlikely distributions of read orientation at specific locations coupled with low depth. Specific signatures within the different ALE metrics could be used to classify and correct for specific biases, much as ALE currently corrects for GC content (see Methods).
% chapter ALE Results (end)
\chapter{ALE Implementation} % (fold)
\label{cha:ALE Implementation}
\section{Mixture model for score thresholding}
\label{mixturemodel}
In order to distinguish ``good" scores from ``bad" scores in an assembly we make a series of assumptions about the distribution from which the scores are drawn. When we smooth the data we are effectively averaging many random variables corresponding to the scores at a given position, which due to the Central Limit Theorem means that they can be considered to be normally distributed, if they are independent and identically distributed. The independence assumption is made by assuming the length of the assembly is much larger than the length of a read and the fact that reads are considered independent. The scores are all generated from the same model and are therefore identically distributed.
We assume that the scores $\vec{s}$ are drawn from one of two Gaussian distributions, a ``good" distribution $N_{g}$ and a ``bad" distribution $N_{b}$.
\begin{equation}
p\left(s_{i}|\lambda\right) = w_{g} \phi\left(s_{i} | \mu_{g}, \sigma^{2}_{g}\right) + w_{b} \phi\left(s_{i} | \mu_{b}, \sigma^{2}_{b}\right)
\end{equation}
where $w_{g} + w_{b} = 1$ are weights, $\phi$ is the Normal probability density function and $\lambda$ is the collection of hyperparameters $\{w_{g}, w_{b}, \mu_{g}, \mu_{b}, \sigma^{2}_{g}, \sigma^{2}_{b}\}$.
The likelihood of the model is
\begin{equation}
p\left(\vec{s}|\lambda\right) = \prod_{i=1}^{L} p\left(s_{i}|\lambda\right).
\end{equation}
We cannot maximize this likelihood analytically, but we can find local optima using expectation maximization.
\subsection{Expectation maximization}
We iterate using the {\it a posteriori} probability of a single score $s_{i}$ being drawn from one of the two components,
\begin{equation}
p_{g}(s_{i}, \lambda) = \frac{w_{g} \phi\left(s_{i} | \mu_{g}, \sigma^{2}_{g}\right)}{w_{g} \phi\left(s_{i} | \mu_{g}, \sigma^{2}_{g}\right) + w_{b} \phi\left(s_{i} | \mu_{b}, \sigma^{2}_{b}\right)}
\end{equation}
and
\begin{equation}
p_{b}(s_{i}, \lambda) = \frac{w_{b} \phi\left(s_{i} | \mu_{b}, \sigma^{2}_{b}\right)}{w_{g} \phi\left(s_{i} | \mu_{g}, \sigma^{2}_{g}\right) + w_{b} \phi\left(s_{i} | \mu_{b}, \sigma^{2}_{b}\right)}
\end{equation}
at each iteration we update the hyperparameters $\lambda$ using the following formulas,
\begin{equation}
w_{d}^{(t+1)} = \frac{1}{L}\sum_{i=1}^{L} p_{d}(s_{i}, \lambda^{(t)})
\end{equation}
\begin{equation}
\mu_{d}^{(t+1)} = \frac{\sum_{i=1}^{L} p_{d}(s_{i}, \lambda^{(t)}) s_{i}}{\sum_{i=1}^{L} p_{d}(s_{i}, \lambda^{(t)})}
\end{equation}
\begin{equation}
\sigma_{d}^{2^{(t+1)}} = \frac{\sum_{i=1}^{L} p_{d}(s_{i}, \lambda^{(t)}) s_{i}^{2}}{\sum_{i=1}^{L} p_{d}(s_{i}, \lambda^{(t)})} - \left(\mu_{d}^{(t+1)}\right)^{2}
\end{equation}
for each $d \in \{g, b\}$ until the change in the total likelihood is less than some threshold, or after some number of maximum iterations.
To speed up this process only a random 10\% of the data or a random 10,000 points are used to build the model, whichever is smaller. If the model fails to converge, a different random set of data is chosen, up to a maximum number of iterations. This adds a level of robustness that overcomes certain pathological situations where the 2-Gaussian mixture model will fail.
\subsection{Example with ALE depth score}
As an example we will examine the ALE depth score for {\it Spirochaeta smaragdinae} from Section \ref{ALE_res_spiro}. This genome has two distinct regions in the depth score representing ``good" and ``bad" scores. The two Gaussian mixture model fits the data much better than a single Gaussian distribution as seen in Figure \ref{fig:GMM_spiro} and Table \ref{tab:GMM_spiro}.
\begin{figure}[hpt]
\centerline{\includegraphics[width=\textwidth]{figures/ALE/gmm_spiro_depth.png}}
\caption[Example of GMM for ALE depth score]{We see the difference between the observed depth scores (blue dots), the two component Gaussian mixture model (green line) and a single Gaussian distribution (red dashed line). It is clear that the mixture model is a far superior fit to the data.}
\label{fig:GMM_spiro}
\end{figure}
\begin{table}[hpt]
\begin{center}
\begin{tabular}{c|c|c}
& 2-Gaussian mixture & single Gaussian \\
\hline
$\mu$ & -12.56, -1.05 & -2.47 \\
$\sigma^{2}$ & 1.72, 0.32& 3.85 \\
$\log(p)$ & -42627.63 & -12771185.48
\end{tabular}
\end{center}
\caption[Example of GMM for ALE depth score]{The components and log likelihoods of the two models. We can see that the two Gaussian mixture model is 2 orders of magnitude more likely (in log space) than the single Gaussian model.}
\label{tab:GMM_spiro}
\end{table}
\section{False positive rate vs. number of reported errors}
\label{ROCALE}
In section \ref{SNVsReal} the values given were for the top 0.0001\% and top 0.0005\% of lowest ALE scoring positions. These values were arbitrarily chosen (and can be set by the user in the software package). In Figure \ref{fig:ROC_ALE} we see how the number of reported lowest scoring positions affects the sensitivity and accuracy of ALE with respect to the manually curated set of errors.
Of the top 250 positions on chromosome 1 with the lowest ALE placement score, 88\% of
them are within an insert length (200bp) of either a hard stop (0 depth) or a variant. Furthermore,
these 250 positions are within an insert length of 84\% of the variants and we find 58\% at the
exact base ALE reports. This is to say that we are able to independently locate 154 of 176
variants with a false positive rate of 12\% within the first 250 lowest ALE placement scores. For
chromosome 2 the lowest 75 ALE placement score positions return 85\% (41 of 46) of the variants
(60\% exactly) with an 11\% false positive rate. As a whole ALE discovers 188 of the 222 variants
with a false positive rate of 12\%.
In figure \ref{fig:ROC_ALE} we can see how the total number of lowest ALE placement scores observed
changes the number of variants that we correctly reproduce and the total true positive rate. As
we look at more positions we recover more variants, but with a higher rate of false positives.
Depending on the number of errors in the genome, the slope at which the true positive rate
decreases will vary. A binary search for an acceptable false positive rate can be performed to
meet the needs of accuracy in individual projects.
\begin{figure}[hpt]
\centerline{\includegraphics[width=\textwidth]{figures/ALE/fig6_top.png}}
\caption[ROC curve vs. number of reported errors]{We use ALE to score a re-sequencing genome of {\it Rhodobacter sphaeroides} in which a set of errors has
been manually and independently curated. The top X lowest ALE placement scores at positions
where there is at least a single read correspond well to the variants discovered in genome
finishing. As we increase the number of ALE scores examined the total percent of variants found
exactly and within a read (36bp) or insert (200bp) length increases. The top area corresponding to
these low scores tends to contain, or is within a short distance of, a variant or a ``hard stop'' (region
of 0 depth). This is to say that the true positive rate is very high (false positive rate very low). This figure
shows how the various rates change with the number of ALE scores reported for Chromosome 1
of the {\it Rhodobacter sphaeroides} re-sequencing project. We see that after 150 positions most variants are found,
but we still have a very low false positive rate.}
\label{fig:ROC_ALE}
\end{figure}
\begin{figure}[hpt]
\centerline{\includegraphics[width=\textwidth]{figures/ALE/fig6_bot.png}}
\caption[ROC curve for chromosome 2]{Similar to Figure \ref{fig:ROC_ALE}, for Chromosome 2, which is smaller,
after only 75 scores most variants are discovered and the false positive rate is around 10\%. As we
look at more scores we discover more variants, but the false positive rate increases.}
\label{fig:ROC_ALE2}
\end{figure}
\section{Influence of alignment input} % (fold)
\label{sec:Influence of alignment input}
ALE takes as input a proposed assembly and a SAM/BAM \cite{Li2009} file of alignments of reads onto this proposed assembly. This allows ALE to calculate the probability of observing the assembly given the reads. ALE assumes that this mapping will include, if not all possible mappings, at least the ``best" mapping for each read in the library (if such a mapping exists). For assemblies with many repeat regions ($>100$) or libraries with large insert sizes, this can be difficult to obtain due to the bias introduced using default parameters of standard aligners. While an extensive review of alignment packages and their optimization is beyond the scope of this work, a review can be found in \cite{Li2010}. If an assembly has many repeats and the aligner bias causes the reporting of reads only mapping to a fraction of possible regions, then ALE will see the unmapped regions as having 0 depth (no supporting reads) which will result in artificially low depth sub-scores. The robustness of ALE will still allow for comparison between assemblies with similar biases, but should be taken into account if the input to ALE is biased for only certain assemblies. To avoid this bias some mappers must be explicitly forced to search for all possible placements (\texttt{-a} in bowtie).
In summary, ALE determines the likelihood of an assembly given the reads and an accurate, unbiased alignment of those reads onto the assembly, without which the model assumptions are violated. These preconditions are usually met except for certain pathological genomes, and even in these cases can be readily corrected by changing the parameters of the aligner used to make ALE's input.
% section Influence of alignment input (end)
\section{Depth Z normalization} % (fold)
\label{sec:DepthZnormalization}
When calculating $\hat{Z}_{\text{depth}}$ at a specific position analytically,
\begin{equation}
\hat{Z}_{\text{depth}}(r) = \sum_{k=0}^{\infty} \left(\mbox{nbPMF}\left(k,r,\frac{1}{2}\right)\right)^{2} = \frac{1}{4^{r}} \ _{2}\mbox{F}_{1}\left(r,r;1;\frac{1}{4}\right)
\end{equation}
where $r$ is the expected depth, $k$ is the observed depth being summed over, and $_{2}\mbox{F}_{1}$ is a hypergeometric function,
\begin{equation}
_{2}\mbox{F}_{1}\left(r,r;1;\frac{1}{4}\right) = \sum_{n=0}^{\infty} \frac{(r)^{2}_{n}}{4^{n}n!(1)_{n}}
\end{equation}
where
\begin{equation}
(r)_{n} = \frac{\Gamma(r+n)}{\Gamma(r)}
\end{equation}
and nbPMF is the negative binomial probability mass function
\begin{equation}
\mbox{nbPMF}\left(k,r,p\right) = \binom{k+r-1}{k} (1-p)^{r}p^{k}.
\end{equation}
We note that numerically evaluating this function results in precision errors for large $r$ due to the fact that we are multiplying a very small number by a very large number. If we move the fraction into the hypergeometric function and take the exponential of the log we get
\begin{equation}
\hat{Z}_{\text{depth}}(r) = \sum_{n=0}^{\infty} \exp\left(2S(r+n) - 2S(r) - 2S(n+1) - (r+n)\log(4)\right)
\end{equation}
where
\begin{equation}
S(x) = \log\left(\Gamma(x)\right)
\end{equation}
and we can use Stirling's approximation,
% return (input - 0.5)*log(input) - input + lnfactconst2 + 1.0/(12.0*input) - 1.0/(360.0*input3) + 1.0/(1260.0*input5) - 1.0/(1680.0*input7);
\begin{equation}
S(x) = \left(x - \frac{1}{2}\right)\log(x) - x + \frac{1}{2}\log(2\pi) + \frac{1}{12x} - \frac{1}{360x^{3}} + \frac{1}{1260x^{5}} - \frac{1}{1680x^{7}} + O\left(\frac{1}{x^{9}}\right)
\end{equation}
to estimate this value. The sum is calculated in practice from $n=0$ until the resulting contribution is less than machine precision ($10^{-16}$ for doubles) due to the fact that the interior function is monotonically decreasing. This is pre-computed in python for common values of $r$ (0 to 2048) for constant time lookup. Other values are computed in real time as needed.
This allows us to numerically calculate $\hat{Z}_{\text{depth}}$ with high precision.
% section Depth Z normalization (end)
\section{Availability and requirements}
\begin{itemize}
\item \textbf{Project name:} ALE
\item \textbf{Project home page:} www.alescore.org (and www.github.com/sc932/ALE)
\item \textbf{Operating systems:} Linux 32/64-bit, Mac OSX, Windows (Cygwin)
\item \textbf{Programming languages:} Python, C
\item \textbf{Other requirements:} Some python packages, see documentation
\item \textbf{License:} UoI/CNSA Open Source
\end{itemize}
% chapter ALE Implementation (end)
% part ALE: Assembly Likelihood Evaluation (end)
\part{EPI: Expected Parallel Improvement} % (fold)
\label{prt:EPI: Expected Parallel Improvement}
\singlespacing
\noindent
\Large
EPI: Expected Parallel Improvement
\noindent
\normalsize
Joint work with Peter Frazier$^{1}$ \\
\scriptsize
$^{1}$Cornell University School of Operations Research and Information Engineering, Ithaca, New York 14853, USA
\normalsize
\normalspacing
\begin{center}
Abstract
\end{center}
This derivative-free global optimization method allows us to optimally sample many points concurrently from an expensive to evaluate, unknown and possibly non-convex function. Instead of sampling sequentially, which can be inefficient when the available resources allow for simultaneous evaluation, EPI provides the best set of points to sample next, allowing multiple samplings to be performed in unison.
In this work we develop a model for expected parallel improvement based on numerically estimating the expected improvement using multiple samples and use multi-start gradient descent to find the optimal set of points to sample next, while fully taking into account points that are currently being sampled for which the result is not yet known.
\chapter{EPI Introduction} % (fold)
\label{cha:EPI Introduction}
\section{Optimization of Expensive Functions}
Optimization attempts to find the maximum or minimum value of some function or experiment. The goal is to find the input or set of parameters that either maximizes or minimizes a particular value. This can be maximizing gains in a financial model, minimizing costs in operations, finding the best result of a drug trial or any of a number of real world examples. The basic setup is that there is something that is desirable to maximize or minimize and we want to find the parameters that obtain this. The underlying functions may be difficult to sample; whether requiring long amounts of time such as drug trials, or excessive money such as financial models, or both such as exploration of natural resources. This limitation forces a good optimization algorithm to find the best possible solution quickly and efficiently, requiring as few exploratory samplings as possible before converging to an optimal solution.
The quantitative and data-intensive explosion in fields such as bioinformatics and other sciences is producing petabytes of data and increasingly complex models and computer algorithms to analyze it. As the amount of input data and the complexity of these algorithms grow they take more and more time to compute. Even with modern supercomputers that can perform many peta-FLOPS ($10^{15}$ Floating Point Operations Per Second) scientific codes simulating fluid dynamics \cite{Compo2011} and complex chemical reactions \cite{Valiev2010} can take many hours or days to compute, using millions of CPU hours or more. The assembly of a single genome using the software package Velvet \cite{Zerbino2008} can take 24 hours or more on a high memory supercomputer. The sheer amount of time and resources required to run these simulations and computations means that the fine-tuning of the parameters of the models is extremely time and resource intensive.
Statistical methods such as EGO \cite{Jones1998} attempt to solve this problem by estimating the underlying function that is being optimized and computing the next point to sample so that it maximizes the expected improvement over the best result observed so far. When performed sequentially, this method often quickly finds a good point within the space of possible inputs, and given more samples will continue to search for the global optimum. This method is limited by its sequential nature, however, and cannot take advantage of possible parallel computational or sampling resources allowing for samples to be drawn concurrently. There have been some heuristic attempts to address the problem of parallel expected improvement \cite{Ginsbourger2008}, but all suffer from limitations in their sequential design. In this work we develop a model for expected parallel improvement, based on numerically estimating expected improvement using multiple samples and use multi-start gradient descent to find the optimal set of points to sample next, while fully taking into account points that are currently being sampled for which the result is not yet known.
\section{Gaussian Processes}
We begin with a Gaussian process prior on a continuous function $f$. The function $f$ has domain $A \subseteq \mathbb{R}^{d}$. Our overarching goal is to solve the global optimization problem
\begin{equation}
\max_{x \in A} f(x).
\end{equation}
This problem can be constrained or unconstrained depending on the set $A$.
The paper \cite{Jones1998} developed a method for choosing which points to evaluate next based on fitting a global metamodel to the points evaluated thus far, and then maximizing a merit criterion over the whole surface to choose the single point to evaluate next.
Although \cite{Jones1998} describes their technique, EGO, in terms of a kriging metamodel and uses frequentist language, their technique can also be understood in a Bayesian framework. This framework uses a Gaussian process prior on the function $f$, which is a probabilistic model whose estimates of $f$ have the corresponding framework described below.
\section{Gaussian process priors}
Any Gaussian process prior on $f$ is described by a mean function $\mu\colon A \to \mathbb{R}$ and a covariance function $K\colon A \times A \to \mathbb{R}$. The mean function is general, and sometimes reflects some overall trends believed to be in the data, but is more commonly chosen to be 0. The covariance function must satisfy certain conditions:
\begin{equation}K(x,x) \geq 0,\end{equation}
\begin{equation}K(x,y) = K(y,x),\end{equation}
and it must be positive semi-definite, which is to say that for all $\vec{x}_{1}, \ldots, \vec{x}_{k} \in A$, and all finite $k$, if we construct a matrix by setting the value at row $i$ and column $j$ to be $\Sigma_{ij} = K(\vec{x}_{i}, \vec{x}_{j})$ this matrix must be positive semi-definite,
\begin{equation}\vec{v}^{T}\Sigma \vec{v} \geq 0, \ \ \ \forall \vec{v} \in \mathbb{R}^{k}.\end{equation}
Common choices for $K$ include the Gaussian covariance function, $K(x,x^{\prime}) = a \exp(-b \| x - x^{\prime}\|^{2})$ for some parameters $a$ and $b$, and the power exponential covariance function, $K(x, x^{\prime}) = a \exp(-\sum_{i} b_{i} (x_{i} - x_{i}^{\prime})^{p})$ for some parameters $\vec{b} \in \mathbb{R}^{d}, p$ and $a$.
Putting a Gaussian Process (GP) prior on $f$, written
\begin{equation}
f \sim \mbox{GP}(\mu(\cdot), K(\cdot, \cdot))
\end{equation}
means that if we take any fixed set of points $x_{1}, \ldots, x_{n} \in A$ and consider the vector $(f(x_{1}), \ldots, f(x_{n}))$ as an unknown quantity, our prior on it is the multivariate normal,
\begin{equation}
(f(x_{1}), \ldots, f(x_{n})) \sim N\left( \begin{pmatrix} \mu(x_{1}) \\ \vdots \\ \mu(x_{n}) \end{pmatrix} , \Sigma_{n} = \begin{pmatrix} K(x_{1}, x_{1}) & \cdots & K(x_{n}, x_{1}) \\ \vdots & \ddots & \vdots \\ K(x_{1}, x_{n}) & \cdots & K(x_{n},x_{n}) \end{pmatrix} \right).
\end{equation}
GPs are analytically convenient. If we observe the function $f$ at $x_{1}, \ldots, x_{n}$, getting values $y_{1} = f(x_{1}), \ldots, y_{n} = f(x_{n})$, then the posterior of $f$ is also a GP,
\begin{equation}
f|x_{1:n}, y_{1:n} \sim GP(\mu_{n}, \Sigma_{n})
\end{equation}
where $\mu_{n}$ and $\Sigma_{n}$ are defined in the Methods section \ref{comp_of_gp}. We can see the evolution of the GP as more points are sampled in Figure \ref{fig:GPP_evolve}.
\begin{figure}[hpt]