\documentclass{article}
\usepackage{fullpage}
\usepackage{graphicx}
\usepackage{color}
\usepackage[usestackEOL]{stackengine}
\usepackage{fancyhdr}
\usepackage{url}
\usepackage{subfig}
\usepackage{multirow}
\usepackage[table]{xcolor}
\usepackage{wrapfig}
\usepackage{blindtext}
\usepackage{amsmath,bm}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{amsfonts}
\usepackage[round]{natbib}
\usepackage{enumitem,xcolor}
\usepackage[multiple]{footmisc}
\usepackage[
pdftitle={Capstone Report - Udacity Machine Learning Nanodegree},
pdfsubject={Machine Learning, Reinforcement Learning, Deep Learning, Artificial Intelligence, Games},
pdfauthor={David Robles},
pdfpagemode=UseOutlines,
pdfborder= {0 0 1.0},
bookmarks,
bookmarksopen,
colorlinks=true,
citecolor=blue,
linkcolor=blue, %
linkbordercolor=blue, %
urlcolor=blue, %
]{hyperref}
\usepackage{adjustbox}
\usepackage{kantlipsum}
\usepackage{tikz}
\usepackage[labelfont=bf]{caption}
\usepackage[utf8]{inputenc}
% Default fixed font does not support bold face
\DeclareFixedFont{\ttb}{T1}{txtt}{bx}{n}{8} % for bold
\DeclareFixedFont{\ttm}{T1}{txtt}{m}{n}{8} % for normal
%%%%%%%%%%%%%
% EQUATIONS %
%%%%%%%%%%%%%
% ArgMin
\DeclareMathOperator*{\argmin}{\arg\!\min}
% ArgMax
\DeclareMathOperator*{\argmax}{\arg\!\max}
% Custom colors
\usepackage{color}
\definecolor{deepblue}{rgb}{0,0,0.5}
\definecolor{deepred}{rgb}{0.6,0,0}
\definecolor{deepgreen}{rgb}{0,0.5,0}
\definecolor{coolblue}{HTML}{101094}
\usepackage{listings}
\definecolor{codebg}{RGB}{238,238,238}
% Python style for highlighting
\newcommand\pythonstyle{\lstset{
language=Python,
basicstyle=\ttm,
otherkeywords={}, % Add keywords here
keywordstyle=\ttm\color{coolblue},
emph={MyClass}, % Custom highlighting
emphstyle=\ttm\color{deepred}, % Custom highlighting style
stringstyle=\color{deepgreen},
frame=tb, % Any extra options here
framesep=10pt,
framexleftmargin=10pt,
backgroundcolor=\color{codebg},
rulecolor=\color{codebg},
aboveskip=15pt,
belowskip=15pt,
showstringspaces=false %
}}
% Python environment
\lstnewenvironment{python}[1][] {
\pythonstyle
\lstset{#1}
}{}
% \setmonofont[Color={0019D4}]{Courier New}
% Python for external files
\newcommand\pythonexternal[2][]{{
\pythonstyle
\lstinputlisting[#1]{#2}}}
% Python for inline
\newcommand\pythoninline[1]{{\pythonstyle\lstinline!#1!}}
%%%%%%%%%%%%%%
% Github URL %
%%%%%%%%%%%%%%
\newcommand{\GithubURL}[1]{[\href{https://github.com/davidrobles/mlnd-capstone-code/blob/master/#1}{implementation}]}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Connect 4 UCI Data Set URL %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\URLUCI}{\url{https://archive.ics.uci.edu/ml/datasets/Connect-4}}
%%%%%%%%%%
% Colors %
%%%%%%%%%%
\definecolor{even}{RGB}{205,222,231}
\definecolor{odd}{RGB}{240,249,254}
\definecolor{header}{RGB}{128,169,188}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\title{Learning to Play Board Games With Reinforcement Learning}
\author{David A. Robles}
\date{May 4, 2017}
\begin{document}
\maketitle
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Project Overview}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Reinforcement learning is the area of machine learning concerned with the idea of learning by
interacting with an environment~\citep{Sutton1998RL}. It has a long and notable history in games.
The checkers program written by \citet{Samuel1959Checkers} was the first notable application of
temporal difference learning, and also the first significant learning program of any kind. It
employed the principles of temporal difference learning decades before they were formally described
and analyzed. However, it was in another game that reinforcement learning achieved its first major
success, when \textsc{TD-Gammon}~\citep{Tesauro1995TD} reached world-class play in Backgammon by
training a neural network-based evaluation function through self-play.
Deep Learning~\citep{LeCun2015Nature} is another branch of machine learning that allows
computational models that are composed of multiple processing layers to learn representations of
data with multiple levels of abstraction. Deep learning techniques have dramatically improved the
state-of-the-art in areas such as speech recognition~\citep{Hinton2012Speech}, image
recognition~\citep{Krizhevsky2012ImageNet} and natural language processing~\citep{Colbert2012}.
Recently, there have been several breakthroughs in combining reinforcement learning and deep
learning. \cite{Mnih2015AtariNature} used a convolutional neural network trained with a variant of
Q-learning to play Atari 2600 games at human level. Last year, one of the biggest
challenges for artificial intelligence was overcome when Google DeepMind's AlphaGo~\citep{Silver2016GoNature}
defeated one of the world's strongest Go players. AlphaGo used deep neural networks trained by
a combination of supervised learning from human expert games, and reinforcement learning from games
of self-play.
Developing strong game-playing programs for classic two-player games (e.g. Chess, Checkers, Go) is
important in two respects: first, for humans who play the games looking for an intellectual
challenge, and second, for AI researchers who use the games as testbeds for artificial
intelligence. In both cases, writing strong game AI is a hard and tedious task for game programmers
and AI researchers, requiring hours of trial-and-error adjustments and human expertise. Moreover,
when a strong game-playing algorithm is created for a specific game, it is rarely useful for
creating an algorithm to play another game, since the domain knowledge is not transferable. For
this reason, there is enormous value in using machine learning to learn value functions to play
these games without using any domain knowledge.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Problem Statement}
\label{sec:problem-statement}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\URLcf}{https://en.wikipedia.org/wiki/Connect_Four}
In this project we use reinforcement learning and deep learning to learn value functions for the
games of \mbox{Tic-Tac-Toe} and \mbox{Connect 4}. We define the machine learning problem as follows:
\begin{itemize}
\item \textbf{Task:} Playing Tic-Tac-Toe and Connect 4.
\item \textbf{Performance:} Winning percentage when playing against a random player, starting
from new games, and also starting from board positions from the UCI Connect 4 dataset.
\item \textbf{Experience:} Games played against a random opponent, an Alpha-Beta opponent and
against itself.
\item \textbf{Target function:} $Q^\pi : \mathcal{S} \times \mathcal{A} \to \mathbb{R}$, where
$\mathcal{S}$ is the set of \emph{states} (board positions) and $\mathcal{A}$ is the set of
\emph{actions} (moves), and $\mathbb{R}$ represents the value of being in a state $s \in
\mathcal{S}$, applying an action $a \in \mathcal{A}$, and following policy $\pi$ thereafter.
\item \textbf{Target function representations:} Lookup table and deep neural network.
\end{itemize}
More specifically, we seek to build an agent that uses Q-learning via self-play to train a deep
convolutional neural network to approximate the optimal action-value function:
\begin{equation}
Q^*(s,a) = \max\limits_\pi Q^\pi(s,a), \forall s \in \mathcal{S}, a \in \mathcal{A}
\end{equation}
\noindent which is the maximum expected sum of rewards achievable by any policy $\pi$.
%%%%%%%%%%%%%%%%%%%%
\subsection{Metrics}
%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
    \item \textbf{Winning percentage.} Consists of playing a large number of games (e.g. 100,000)
        against other agents (e.g. a random player), using the learned value function as the
        action-value function for taking greedy actions. A minimal sketch of such an evaluation
        loop is given below.
\end{itemize}
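To make this metric concrete, the following is a minimal sketch of such an evaluation loop, written
against the \texttt{Game} interface described in the Implementation section; the players'
\texttt{choose\_move} method is taken from that section, while the win encoding returned by
\texttt{outcomes()} is an assumption for illustration.
\begin{python}
def winning_pct(game_factory, agent, opponent, n_games=100000, agent_idx=0):
    '''Plays n_games between agent and opponent and returns the
    fraction of games won by the agent.'''
    wins = 0
    for _ in range(n_games):
        game = game_factory()
        players = [agent, opponent] if agent_idx == 0 else [opponent, agent]
        while not game.is_over():
            move = players[game.cur_player()].choose_move(game)
            game.make_move(move)
        # assumption: outcomes() marks the winner with 'W'
        if game.outcomes()[agent_idx] == 'W':
            wins += 1
    return wins / float(n_games)
\end{python}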
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Analysis}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Data Exploration and Visualization}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
In this project we use two environments and one dataset. The environments are the games of
Tic-Tac-Toe and Connect 4, and the dataset is the UCI Connect 4 dataset.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Tic-Tac-Toe Environment}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Tic-Tac-Toe \GithubURL{capstone/game/games/tictactoe.py} is a paper-and-pencil game for two players,
X and O, who take turns marking the spaces in a $3 \times 3$ grid. The player who succeeds in
placing three of their marks in a horizontal, vertical, or diagonal row wins the game.
\hyperref[fig:tic-env]{Figure~\ref*{fig:tic-env}} shows three Tic-Tac-Toe game positions: a win for
\textsc{X}, a win for \textsc{O}, and a draw.
Tic-Tac-Toe is an extremely simple game; nonetheless, it is very useful for analyzing simple
concepts and for verifying that the implementations of the learning algorithms behave as expected
before moving on to the Connect 4 environment, which is more complex because of its large state space.
%%%%%%%%%%
% Figure %
%%%%%%%%%%
\begin{figure}[!b]
\centering
\subfloat[Win for \textsc{X}]{
\label{fig:tic-env-win}
\includegraphics[width=0.15\textwidth]{figures/tic_env_win.pdf}
} \hspace{0.2in}
\subfloat[Win for \textsc{O}]{
\label{fig:tic-env-loss}
\includegraphics[width=0.15\textwidth]{figures/tic_env_loss.pdf}
} \hspace{0.2in}
\subfloat[Draw]{
\label{fig:tic-env-draw}
\includegraphics[width=0.15\textwidth]{figures/tic_env_draw.pdf}
}
\caption{Tic-Tac-Toe}
\label{fig:tic-env}
\end{figure}
\pagebreak[4]
% \begin{python}
% from capstone.game import TicTacToe
% game = TicTacToe()
% game.legal_moves() # [1, 2, 3, 4, 5, 6, 7, 8, 9]
% print(game)
% \end{python}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Connect 4 Environment}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Connect 4 \GithubURL{capstone/game/games/connect4.py} is a two-player board game of perfect
information where pieces are dropped into the columns of a vertical $6 \times 7$ grid with the goal
of forming a straight line of 4 connected pieces. There are at most seven actions per state, since
placing a piece in a column is a legal action only if that column has at least one empty location.
In this project we use pieces of two colors: \textsc{Yellow} for the first player, and \textsc{Red}
for the second player. \hyperref[fig:c4-env]{Figure~\ref*{fig:c4-env}} shows three Connect 4 game
positions: a win for \textsc{Yellow}, a win for \textsc{Red} and a draw:
%%%%%%%%%%
% Figure %
%%%%%%%%%%
\begin{figure}[!h]
\centering
\subfloat[Win for \textsc{Yellow}]{
\label{fig:c4-env-wing}
\includegraphics[width=0.18\textwidth]{figures/c4_env_win.pdf}
} \hspace{0.1in}
\subfloat[Win for \textsc{Red}]{
\label{fig:c4-env-loss}
\includegraphics[width=0.18\textwidth]{figures/c4_env_loss.pdf}
} \hspace{0.1in}
\subfloat[Draw]{
\label{fig:c4-env-draw}
\includegraphics[width=0.18\textwidth]{figures/c4_env_draw.pdf}
}
\caption{Connect Four}
\label{fig:c4-env}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{UCI Connect 4 Data Set}
\label{sec:uci-c4}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
As part of the testing phase, we will use the \emph{Connect 4 Data Set} that is available from the
UCI Machine Learning Repository~\citep{Lichman2013}. A partial view of the dataset is presented in
\hyperref[table:uci-dataset]{Table~\ref*{table:uci-dataset}}. The dataset has a total of 67,557
instances, representing all legal 8-ply positions in \mbox{Connect 4} in which neither player has
won yet, and in which the next move is not forced. Each instance is described by 42 features, one for
each space on the $6 \times 7$ board, each taking one of the values $\{\texttt{x},
\texttt{o}, \texttt{b}\}$, where \texttt{x} is the first player, \texttt{o} is the second player,
and \texttt{b} is empty. The outcome class is the game-theoretic value for the first player, and
takes one of the values $\{\texttt{win}, \texttt{loss}, \texttt{draw}\}$. There are 44,473
wins, 16,635 losses and 6,449 draws \GithubURL{experiments/c4_uci_data_expl.py}.
\hyperref[fig:c4-exp]{Figure~\ref*{fig:c4-exp}} shows a visual representation of five randomly
selected instances of the data set. As we can see, all game positions have eight pieces on the
board, consistent with all instances being 8-ply positions \GithubURL{experiments/c4_uci_viz.py}.
%%%%%%%%%%%%%%%%%%%%%%
% Table: UCI Dataset %
%%%%%%%%%%%%%%%%%%%%%%
\begin{table}[b!]
\small
\centering
\renewcommand{\arraystretch}{1.2}
{\rowcolors{3}{even}{odd}
\begin{tabular}{ c | c c c c c c c c c c c c c c c c c | c }
\rowcolor{header}
& \multicolumn{17}{c|}{\textbf{Features}} & \textbf{Target} \\ \rowcolor{header}
\textbf{No.} & \textbf{a1} & \textbf{a2} & \textbf{a3} & \textbf{a4} & \textbf{a5} &
\textbf{a6} & \textbf{b1} & \textbf{b2} & \textbf{...} & \textbf{f5} & \textbf{f6} &
\textbf{g1} & \textbf{g2} & \textbf{g3} & \textbf{g4} & \textbf{g5} & \textbf{g6} & \textbf{outcome} \\
0 & b & b & b & b & b & b & b & b & ... & b & b & b & b & b & b & b & b & win \\
1 & b & b & b & b & b & b & b & b & ... & b & b & b & b & b & b & b & b & win \\
2 & b & b & b & b & b & b & o & b & ... & b & b & b & b & b & b & b & b & win \\
3 & b & b & b & b & b & b & b & b & ... & b & b & b & b & b & b & b & b & win \\
4 & o & b & b & b & b & b & b & b & ... & b & b & b & b & b & b & b & b & win \\
... & .. & .. & .. & .. & .. & .. & .. & .. & ... & .. & .. & .. & .. & .. & .. & .. & .. & ... \\
67552 & x & x & b & b & b & b & o & x & ... & b & b & o & o & x & b & b & b & loss \\
67553 & x & x & b & b & b & b & o & b & ... & b & b & o & x & o & o & x & b & draw \\
67554 & x & x & b & b & b & b & o & o & ... & b & b & o & x & x & o & b & b & loss \\
67555 & x & o & b & b & b & b & o & b & ... & b & b & o & x & o & x & x & b & draw \\
67556 & x & o & o & o & x & b & o & b & ... & b & b & x & b & b & b & b & b & draw \\
\end{tabular}
}
\caption{UCI Connect 4 Dataset. Each row represents a different board position, and each feature
represents a specific cell in the board. The target is the outcome of the game for the
first player, assuming perfect play.}
\label{table:uci-dataset}
\end{table}
\pagebreak[4]
%%%%%%%%%%
% Figure %
%%%%%%%%%%
\begin{figure}[!t]
\centering
\subfloat[Win]{
\label{fig:c4-exp-1}
\includegraphics[width=0.15\textwidth]{figures/c4_exploration_1_win.pdf}
} \hspace{0.1in}
\subfloat[Loss]{
\label{fig:c4-exp-2}
\includegraphics[width=0.15\textwidth]{figures/c4_exploration_2_loss.pdf}
} \hspace{0.1in}
\subfloat[Win]{
\label{fig:c4-exp-3}
\includegraphics[width=0.15\textwidth]{figures/c4_exploration_3_win.pdf}
} \hspace{0.1in}
\subfloat[Loss]{
\label{fig:c4-exp-4}
\includegraphics[width=0.15\textwidth]{figures/c4_exploration_4_loss.pdf}
} \hspace{0.1in}
\subfloat[Draw]{
\label{fig:c4-exp-10}
\includegraphics[width=0.15\textwidth]{figures/c4_exploration_10_draw.pdf}
}
\caption{Five randomly selected instances of the UCI Connect 4 Data Set. The outcome of each board
position is from the point of view of the first player (yellow discs).}
\label{fig:c4-exp}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Algorithms and Techniques}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Markov Decision Process}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
A \emph{Markov decision process} (MDP) \GithubURL{capstone/rl/mdp.py} consists of four elements:
\begin{itemize}
\item $\mathcal{S}$ is the set of \emph{states} (state space).
\item $\mathcal{A}$ is the set of \emph{actions} (action space). The set of actions that are
available in some particular state $s_t \in \mathcal{S}$ is denoted $\mathcal{A}(s_t)$.
    \item $ T : \mathcal{S} \times \mathcal{A} \times \mathcal{S} \to \mathbb{R}$ is the
        \emph{transition function}, which gives the probability of transitioning to state
        $s_{t+1} \in \mathcal{S}$ given that we are in state $s_t \in \mathcal{S}$ and take action
        $a_t \in \mathcal{A}(s_t)$.
\item $ R : \mathcal{S} \times \mathcal{A} \times \mathcal{S} \to \mathbb{R}$ is the
\emph{reward function}, which is the immediate reward received when in state $s_t \in
\mathcal{S}$ action $a_t \in \mathcal{A}$ is taken and the MDP transitions to state $s_{t+1}
\in \mathcal{S}$. However, it is also possible to define it either as $ R : \mathcal{S} \times
\mathcal{A} \to \mathbb{R}$ or $R : \mathcal{S} \to \mathbb{R}$. The first one gives rewards
for performing an action $a_t$ in a particular state $s_t$, and the second gives rewards when
transitioning to state $s_{t+1}$.
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Environment}
%%%%%%%%%%%%%%%%%%%%%%%%%%%
In the reinforcement learning problem an agent does not have access to the dynamics (reward and
transition functions) of the MDP. Instead, it interacts with an \emph{environment}
\GithubURL{capstone/rl/environment.py} by way of three signals: a \emph{state}, which describes the
state of the environment, an \emph{action}, which allows the agent to have some impact on the
environment, and a \emph{reward}, which provides the agent with feedback on its immediate
performance.
%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Policy}
%%%%%%%%%%%%%%%%%%%%%%
In an MDP, the agent acts according to a policy $\pi$ \GithubURL{capstone/rl/policy.py}, which maps
each state $s \in \mathcal{S}$ to an action $a \in \mathcal{A}(s)$. A policy that specifies a unique
action to be performed is called a \emph{deterministic} policy, and is defined as $\pi : \mathcal{S}
\rightarrow \mathcal{A}$.
The interaction between the policy used by the agent and the environment works as follows. First, the
environment starts in an \emph{initial state} $s_0$. Then, the policy $\pi$ selects an action
$a_0 = \pi(s_0)$ from the set of available actions $\mathcal{A}(s_0)$, and the action is executed.
The environment transitions to a new state $s_1$ according to the transition function $T$ with
probability $T(s_0,a_0,s_1)$, and a reward $r_1 = R(s_0, a_0, s_1)$ is received. This process
continues, producing a \emph{trajectory} of experience $s_0, a_0, r_1, s_1, a_1, r_2, s_2, a_2,
\dots$, until the process ends in a \emph{terminal state} $s_T$, after which it is restarted from
the initial state.
We use three types of policies in this project:
\begin{itemize}
\item \textbf{Random.} Selects actions uniformly at random
\GithubURL{capstone/rl/policies/random_policy.py}.
    \item \textbf{Greedy.} Selects the \emph{max action}, i.e. the action with the
        highest estimated value \GithubURL{capstone/rl/policies/greedy.py},
\begin{equation}
\pi_{\textrm{greedy}}(s) = \argmax_{a \in \mathcal{A}(s)} Q(s, a)
\end{equation}
    \item \textbf{$\epsilon$-greedy.} Selects the greedy action with probability
        $1 - \epsilon$, and selects an action uniformly at random with probability $\epsilon$
        \GithubURL{capstone/rl/policies/egreedy.py} (a minimal sketch follows this list),
\begin{equation}
\pi_{\epsilon}(s) = \left\{
\begin{array}{lr}
\pi_{\textrm{rand}}(s) & \text{if } rand() < \epsilon\\
\pi_{\textrm{greedy}}(s) & \text{otherwise}
\end{array}
\right.
\end{equation}
        where $\epsilon \in [0, 1]$ and $rand()$ returns a number drawn uniformly at random
        from $[0, 1]$.
\end{itemize}
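As a concrete illustration of the $\epsilon$-greedy rule above, the following minimal sketch assumes
an action-value function with dictionary-style \texttt{qfunction[state, action]} access, as in the
tabular value function shown later; it is an illustration, not the exact code in \texttt{egreedy.py}.
\begin{python}
import random

def egreedy_action(state, actions, qfunction, epsilon=0.1):
    '''With probability epsilon returns a uniformly random action,
    otherwise returns the action with the highest Q-value.'''
    if random.random() < epsilon:
        return random.choice(actions)
    # greedy action; ties are broken by the first maximizing action
    return max(actions, key=lambda a: qfunction[state, a])
\end{python}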
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Value Functions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Most of the algorithms for solving MDPs (computing optimal policies) do so by learning a \emph{value
function} \GithubURL{capstone/rl/value_function.py}. A value function estimates what is good for an
agent over the long run. It estimates the expected outcome from any given state by summarizing into a
single number the total amount of reward that an agent can expect to accumulate. Value functions
are defined for particular policies.
The \emph{state value function} (or V-function) is the expected return when starting in state $s$
and following policy $\pi$ thereafter~\citep{Sutton1998RL},
%
\begin{equation}
V^\pi(s) = \mathbb{E}_\pi \left[R_t | s_t = s \right]
\end{equation}
The \emph{action value function} (or Q-function) is the expected return after selecting action $a$
in state $s$ and then following policy $\pi$,
%
\begin{equation}
Q^\pi(s,a) = \mathbb{E}_\pi \left[ R_t | s_t = s, a_t = a \right]
\end{equation}
The \emph{optimal value function} is the unique value function that maximises the value of every
state, or state-action pair,
%
\begin{eqnarray}
Q^*(s,a) & = & \max\limits_\pi Q^\pi(s,a), \forall s \in \mathcal{S}, a \in \mathcal{A}
\end{eqnarray}
An \emph{optimal policy} $\pi^*(s,a)$ is a policy that maximises the action value function from
every state in the MDP,
%
\begin{equation}
\pi^*(s,a) = \argmax_\pi Q^\pi(s, a)
\end{equation}
%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Q-learning}
%%%%%%%%%%%%%%%%%%%%%%%%%%
One of the most basic and popular methods to estimate action-value functions is the
\emph{Q-learning} algorithm~\citep{Watkins1989PhD} \GithubURL{capstone/rl/learners/qlearning.py}.
It is a model-free, online, off-policy algorithm, whose main strength is that it is able to compare the
expected utility of the available actions without requiring a model of the environment. Q-learning
works by learning an action-value function that gives the expected utility of taking a given action
in a given state and following a fixed policy thereafter. The update rule uses action-values and a
built-in max-operator over the action-values of the next state in order to update $Q(s_t, a_t)$ as
follows,
\begin{equation}
Q(s_t,a_t) \gets Q(s_t,a_t) + \alpha \left[r_{t+1} + \gamma \max_a Q(s_{t+1},a) - Q(s_t,a_t)\right]
\end{equation}
The agent takes a step in the environment from state $s_t$ to $s_{t+1}$ using action $a_t$, receiving
reward $r_{t+1}$. The update is applied to the value of action $a_t$ in the state $s_t$ from
which this action was executed. This version of Q-learning works well for tasks with a small
state space, since it uses arrays or tables \GithubURL{capstone/rl/value_functions/tabular.py} with
one entry for each state-action pair.
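As a concrete illustration of the update rule, suppose $\alpha = 0.1$, $\gamma = 1$,
$Q(s_t,a_t) = 0.5$, $r_{t+1} = 0$ and $\max_a Q(s_{t+1},a) = 0.8$. The bootstrapped target is
$r_{t+1} + \gamma \max_a Q(s_{t+1},a) = 0.8$, so the update moves the estimate a fraction $\alpha$
of the way towards it: $Q(s_t,a_t) \gets 0.5 + 0.1\,(0.8 - 0.5) = 0.53$.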
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Approximate Q-learning}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
In many cases there are far more states than could possibly be stored as entries in a table, so we
need to use function approximation. Approximate Q-learning
\GithubURL{capstone/rl/learners/qlearning_approx.py} consists of parameterizing an approximate
action-value function, $Q(s,a;\theta_i) \approx Q(s,a)$, in which $\theta_i$ are the parameters
(weights) of the action-value function at iteration $i$. Usually the number of parameters of a
function approximator is much smaller than the number of states, which means that a change in one
parameter can change many Q-values, as opposed to just one as in the tabular case.
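With function approximation, the tabular update is replaced by a gradient step on the parameters. A
standard semi-gradient form of this update (stated here for reference rather than copied from the
implementation) is
\begin{equation}
\theta_{i+1} \gets \theta_i + \alpha \left[ r_{t+1} + \gamma \max_{a'} Q(s_{t+1}, a'; \theta_i)
- Q(s_t, a_t; \theta_i) \right] \nabla_{\theta_i} Q(s_t, a_t; \theta_i)
\end{equation}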
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Experience Replay}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Reinforcement learning is known to be unstable or even to diverge when a nonlinear function
approximator such as a neural network is used to represent the action-value function. One trick to
make it work is to use \emph{experience replay}~\citep{LIn1993ExpReplay}
\GithubURL{capstone/rl/learners/qlearning_approx.py\#L41}, which consists of storing the experiences
$(s_t, a_t, r_t, s_{t+1})$ at each time step $t$ in a data set $D_t=\{e_1,\dots,e_t\}$. During the
training of approximate Q-learning, random minibatches from the replay memory are used instead of the
most recent transition. This breaks the correlation between subsequent training samples, which otherwise
might drive the network into a local minimum. The \mbox{Q-learning} update at iteration $i$ uses the
following loss function~\citep{Mnih2015AtariNature}:
\begin{equation}
L_i(\theta_i) = \mathbb{E}_{(s, a, r, s') \sim U(D)}
\left[\left(r + \gamma \max_{a'} Q(s',a'; \theta^-_i) - Q(s,a;\theta_i)\right)^2\right]
\end{equation}
where $(s, a, r, s') \sim U(D)$ denotes a minibatch of experiences drawn uniformly at random from
the pool of stored experiences, and $\theta^-_i$ are the network parameters used to compute the
target at iteration $i$.
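The following is a minimal sketch of how a replay memory can be maintained and sampled; the function
names are illustrative and do not mirror the exact code in \texttt{qlearning\_approx.py}.
\begin{python}
import random
from collections import deque

replay_memory = deque(maxlen=10000)  # drops the oldest experiences when full

def store(state, action, reward, next_state):
    '''Appends one transition to the replay memory.'''
    replay_memory.append((state, action, reward, next_state))

def sample_minibatch(batch_size=32):
    '''Returns a random minibatch of transitions, or an empty list if
    the memory does not yet hold batch_size transitions.'''
    if len(replay_memory) < batch_size:
        return []
    return random.sample(replay_memory, batch_size)
\end{python}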
%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Self-play}
%%%%%%%%%%%%%%%%%%%%%%%%%
Self-play \GithubURL{capstone/rl/learners/qlearning_approx.py\#L24} is by far the most popular
training method for games. In self-play, a single policy $\pi(s,a)$ is used by both players in a
two-player game, $\pi_1(s,a) = \pi_2(s,a) = \pi(s,a)$. The first reason for its popularity is that
training is quickest when the learner's opponent is roughly equally strong, which by definition holds
for self-play. The second reason is that there is no need to implement or access a
different agent of roughly equal playing strength. However, self-play has several drawbacks, with
the main one being that a single opponent does not provide sufficient
exploration~\citep{Szita2011RLGames}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Convolutional Neural Networks}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Convolutional Neural Networks~\citep{LeCun1989}, or CNNs, are a special type of neural network
designed for data with a known grid-like topology. Like most other neural networks they are trained
with a variant of the backpropagation algorithm. The strength of CNNs is pattern recognition directly
from the pixels of images with minimal preprocessing. We use a convolutional network as a function
approximator for the board of Connect 4 \GithubURL{capstone/rl/value_functions/c4deepnetwork.py\#L42},
since the board can be thought of as a 2-D grid of pixels (discs in this case).
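For illustration only, the following is a minimal sketch of the kind of small convolutional network
that could approximate the Q-function over the $6 \times 7$ Connect 4 board, written with Keras for
concreteness; the filter sizes, layer counts and board encoding are assumptions and do not reproduce
the exact architecture in \texttt{c4deepnetwork.py}.
\begin{python}
from keras.models import Sequential
from keras.layers import Conv2D, Flatten, Dense

# Assumed board encoding: a 6x7x1 tensor with +1 for the first player's
# discs, -1 for the second player's discs, and 0 for empty cells.
model = Sequential([
    Conv2D(32, (4, 4), padding='same', activation='relu', input_shape=(6, 7, 1)),
    Conv2D(64, (2, 2), padding='same', activation='relu'),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(7)  # one linear output per column: the estimated Q-value of each move
])
model.compile(optimizer='adam', loss='mse')
\end{python}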
%%%%%%%%%%%%%%%%%%%%%%
\subsection{Benchmark}
\label{sec:benchmark}
%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
    \item \textbf{Random agent}. This benchmark consists of playing against an agent that makes
        uniformly random moves. It also helps us detect bugs in the code and algorithms:
        if a learned value function does not play significantly better than a random agent, it is
        not learning.
\item \textbf{Connect 4 Data Set}. The board configurations in this dataset will be used as
        starting positions to play games against the random agent. Starting from positions for
        which we know the game-theoretic outcome helps us assess how good the learning agent can get.
\end{itemize}
% \pagebreak[4]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Methodology}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Data Preprocessing}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The only data preprocessing done in this project was for the Connect 4 UCI dataset. As mentioned
before, the dataset was used only for the testing phase. All the learning came from the two
reinforcement learning simulators for Tic-Tac-Toe and Connect 4. We loaded the Connect 4 UCI dataset
into a Pandas DataFrame for simple data analysis using \texttt{load\_dataframe}
\GithubURL{capstone/datasets/ucic4.py\#L6}. Also, a \texttt{series\_to\_game} helper function was
created to convert a DataFrame row into a \mbox{Connect 4} game
\GithubURL{capstone/datasets/ucic4.py\#L24}, and another three helper functions were created to load
games randomly, or by specific game-theoretic outcome \GithubURL{capstone/datasets/ucic4.py\#L49}.
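The following is a minimal usage sketch of these helpers; the exact call signatures are assumptions
based on the descriptions above rather than copied from \texttt{ucic4.py}.
\begin{python}
from capstone.datasets.ucic4 import load_dataframe, series_to_game

df = load_dataframe()               # the UCI Connect 4 dataset as a DataFrame
print(df.shape)                     # expected (67557, 43): 42 features + outcome
game = series_to_game(df.iloc[0])   # rebuild the first 8-ply board position
print(game)                         # print the Connect 4 position
\end{python}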
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Implementation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Game}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The first part of the implementation was to define a \texttt{Game} interface
\GithubURL{capstone/game/game.py} to be used by both games: Tic-Tac-Toe and Connect 4.
\begin{python}
class Game(object):
def copy(self):
'''Returns a copy of the game.'''
pass
def cur_player(self):
'''
Returns the index of the player in turn, starting with 0:
0 (Player 1), 1 (Player 2), etc.
'''
pass
def is_over(self):
'''Returns True if the game is over.'''
return len(self.legal_moves()) == 0
def legal_moves(self):
'''Returns a list of legal moves for the player in turn.'''
pass
def make_move(self, move):
'''Makes one move for the player in turn.'''
pass
def outcomes(self):
'''Returns a list of outcomes for each player at the end of the game.'''
pass
def reset(self):
'''Restarts the game.'''
pass
\end{python}
Once the \texttt{Game} interface was well defined, we implemented Tic-Tac-Toe
\GithubURL{capstone/game/games/tictactoe.py} and \mbox{Connect 4}
\GithubURL{capstone/game/games/connect4.py}.
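A minimal interaction with the \texttt{TicTacToe} implementation through this interface looks as
follows; the import path and the printed values are indicative rather than taken from the test suite.
\begin{python}
from capstone.game import TicTacToe

game = TicTacToe()
moves = game.legal_moves()              # all nine squares are available initially
game.make_move(moves[0])                # player X makes the first available move
game.make_move(game.legal_moves()[0])   # player O replies
print(game.cur_player())                # 0: it is X's turn again
print(game.is_over())                   # False after only two moves
\end{python}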
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Markov Decision Process}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Next, we defined the interface for an MDP \GithubURL{capstone/rl/mdp.py\#L7}:
\begin{python}
class MDP(object):
def states(self):
'''Returns a list of all states. Not generally possible for large MDPs.'''
pass
def start_state(self):
'''Returns the initial state.'''
pass
def actions(self, state):
'''Returns a list of possible actions in the given state.'''
pass
def transitions(self, state, action):
'''
Returns a dict of (next_state: probability) key/values, where 'next_state' is
reachable from 'state' by taking 'action'. The sum of all probabilities should
be 1.0. Not available in reinforcement learning.
'''
pass
def reward(self, state, action, next_state):
'''
Returns the reward of being in 'state', taking 'action', and ending up
in 'next_state'. Not available in reinforcement learning.
'''
pass
def is_terminal(self, state):
        '''Returns True if the given state is terminal.'''
pass
\end{python}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Markov Decision Process for Games}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We also created two helper classes. The first, \texttt{GameMDP}
\GithubURL{capstone/rl/mdp.py\#L66}, converts a \texttt{Game} into an MDP in which the agent makes
moves for both players; it is used in self-play learning.
\begin{python}
class GameMDP(MDP):
def __init__(self, game):
self._game = game
self._states = {}
def actions(self, state):
return [None] if state.is_over() else state.legal_moves()
def is_terminal(self, state):
return state.is_over()
def reward(self, state, action, next_state):
'''Returns the utility from the point of view of the first player.'''
return utility(next_state, 0) if next_state.is_over() else 0
def start_state(self):
return self._game.copy()
def states(self):
if not self._states:
def generate_states(game):
'''Generates all the states for the game'''
if game not in self._states:
self._states[game] = game
for move in game.legal_moves():
new_game = game.copy().make_move(move)
generate_states(new_game)
generate_states(self._game)
return self._states
def transitions(self, state, action):
if state.is_over():
return [(state, 1.0)]
new_game = state.copy().make_move(action)
return [(new_game, 1.0)]
\end{python}
The second, \texttt{FixedGameMDP} \GithubURL{capstone/rl/mdp.py\#L108}, converts a \texttt{Game}
into an MDP by using a fixed opponent for one of the players in the game:
\begin{python}
class FixedGameMDP(GameMDP):
def __init__(self, game, opp_player, opp_idx):
'''
opp_player: the opponent player
opp_idx: the idx of the opponent player in the game
'''
super(FixedGameMDP, self).__init__(game)
self._opp_player = opp_player
self._opp_idx = opp_idx
self._agent_idx = opp_idx ^ 1
def reward(self, game, move, next_game):
return utility(next_game, self._agent_idx) if next_game.is_over() else 0
def start_state(self):
new_game = self._game.copy()
if not new_game.is_over() and new_game.cur_player() == self._opp_idx:
chosen_move = self._opp_player.choose_move(new_game)
new_game.make_move(chosen_move)
return new_game
def transitions(self, game, move):
if game.is_over():
return []
new_game = game.copy().make_move(move)
if not new_game.is_over() and new_game.cur_player() == self._opp_idx:
chosen_move = self._opp_player.choose_move(new_game)
new_game.make_move(chosen_move)
return [(new_game, 1.0)]
\end{python}
%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Environment}
%%%%%%%%%%%%%%%%%%%%%%%%%%%
A reinforcement learning environment \GithubURL{capstone/rl/environment.py} wraps the MDP and serves
as a middleman between the policies and the MDP: when a learning agent interacts with an
environment, it does not have direct access to the transition and reward functions.
\begin{python}
class Environment(object):
def __init__(self, mdp):
self._mdp = mdp
self._cur_state = self._mdp.start_state()
def actions(self, state):
'''Returns the available actions in the given state.'''
return self._mdp.actions(state)
def cur_state(self):
'''Returns the current state.'''
return self._cur_state.copy()
def do_action(self, action):
'''
Performs the given action in the current state.
Returns (reward, next_state).
'''
prev = self.cur_state()
transitions = self._mdp.transitions(self.cur_state(), action)
for next_state, prob in transitions:
self._cur_state = next_state
reward = self._mdp.reward(prev, action, self.cur_state())
return reward, self.cur_state()
def is_terminal(self):
return self._mdp.is_terminal(self.cur_state())
def reset(self):
'''Resets the current state to the start state.'''
self._cur_state = self._mdp.start_state()
\end{python}
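A minimal sketch of how these pieces fit together is shown below; the import paths are assumptions
inferred from the repository layout.
\begin{python}
from capstone.game import TicTacToe
from capstone.rl.mdp import GameMDP
from capstone.rl.environment import Environment

# Wrap the game as an MDP, then wrap the MDP in an environment.
env = Environment(GameMDP(TicTacToe()))

state = env.cur_state()            # a copy of the current game position
actions = env.actions(state)       # legal moves for the player in turn
reward, next_state = env.do_action(actions[0])
print(reward, env.is_terminal())
\end{python}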
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{EpisodicLearnerMixin}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Q-learning and Approximate Q-learning algorithms are episodic learners. Their implementation
includes the mixin \texttt{EpisodicLearnerMixin} to reuse functionality for running episodes and
calling the callbacks that are used for generating plots and evaluations.
\begin{python}
class EpisodicLearnerMixin(object):
'''
Mixin for learning value functions by interacting with an
environment in an episodic setting.
'''
@property
def value_function(self):
if hasattr(self, 'vfunction'):
return self.vfunction
if hasattr(self, 'qfunction'):
return self.qfunction
def train(self, n_episodes, callbacks=None):
'''Trains the model for a fixed number of episodes.'''
callbacks = CallbackList(callbacks)
callbacks.on_train_begin()
for episode in range(n_episodes):
callbacks.on_episode_begin(episode, self.value_function)
self.env.reset()
self.episode()
callbacks.on_episode_end(episode, self.value_function)
callbacks.on_train_end(self.value_function)
\end{python}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Tabular Value Function}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We created a generic tabular value function \GithubURL{capstone/rl/value_functions/tabular.py} that
can be used for both state $V(s)$ and state-action $Q(s, a)$ values. In many algorithms, such as
Q-learning, it is helpful to initialize the values in the lookup table to random values when they do
not yet exist. This is supported with the \texttt{init} parameter in the constructor.
\begin{python}
_MEAN = 0.0
_STD = 0.3
class TabularVF(ValueFunction):
def __init__(self, init=True, random_state=None):
self.init = init
self.random_state = check_random_state(random_state)
self._table = {}
def __setitem__(self, key, value):
self._table[key] = value
def __getitem__(self, key):
if key not in self._table:
if self.init:
self._table[key] = self.random_state.normal(_MEAN, _STD)
else:
self._table[key] = 0
return self._table[key]
\end{python}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Tabular Q-learning}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Tabular Q-learning \GithubURL{capstone/rl/learners/qlearning.py} includes the
\texttt{EpisodicLearnerMixin} to reuse the code for learning in episodes. The constructor accepts a
\texttt{selfplay} parameter; when self-play is enabled, the best Q-value of the next state is
maximized or minimized depending on the player in turn.
\begin{python}
class QLearning(EpisodicLearnerMixin):
def __init__(self, env, policy, qfunction, learning_rate=0.1,
discount_factor=1.0, selfplay=False):
self.env = env
self.policy = policy
self.qfunction = qfunction
self.learning_rate = learning_rate
self.discount_factor = discount_factor
self.selfplay = selfplay
def best_qvalue(self, state):
if self.selfplay:
best_func = max_qvalue if state.cur_player() == 0 else min_qvalue
            return best_func(state, self.env.actions(state), self.qfunction)
        return max_qvalue(state, self.env.actions(state), self.qfunction)
def episode(self):
while not self.env.is_terminal():
            state = self.env.cur_state()
action = self.policy.get_action(state)
reward, next_state = self.env.do_action(action)
best_qvalue = self.best_qvalue(next_state)
target = reward + (self.discount_factor * best_qvalue)
td_error = target - self.qfunction[state, action]
self.qfunction[state, action] += self.learning_rate * td_error
\end{python}
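Putting the pieces together, a tabular Q-learning agent could be trained via self-play on
Tic-Tac-Toe roughly as follows; the import paths, the $\epsilon$-greedy policy class name
(\texttt{EGreedy}) and its constructor arguments are assumptions for illustration.
\begin{python}
from capstone.game import TicTacToe
from capstone.rl.mdp import GameMDP
from capstone.rl.environment import Environment
from capstone.rl.learners.qlearning import QLearning
from capstone.rl.value_functions.tabular import TabularVF
from capstone.rl.policies.egreedy import EGreedy  # hypothetical class name

env = Environment(GameMDP(TicTacToe()))
qfunction = TabularVF(random_state=0)
policy = EGreedy(epsilon=0.1)  # assumed constructor signature

learner = QLearning(env, policy, qfunction, learning_rate=0.1,
                    discount_factor=1.0, selfplay=True)
learner.train(n_episodes=50000)
\end{python}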
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Approximate Q-learning}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The implementation of Approximate Q-learning \GithubURL{capstone/rl/learners/qlearning_approx.py} is
very similar to the one for tabular \mbox{Q-learning}. The main difference is that it supports
experience replay. When experience replay is enabled, it can be configured by specifying the
\texttt{batch\_size} and the \texttt{replay\_memory\_size}.
\begin{python}
class ApproximateQLearning(EpisodicLearnerMixin):
def __init__(self, env, policy, qfunction, discount_factor=1.0, selfplay=False,
experience_replay=True, batch_size=32, replay_memory_size=10000):
self.env = env
self.policy = policy
self.qfunction = qfunction
self.discount_factor = discount_factor
self.selfplay = selfplay
self.experience_replay = experience_replay
self.batch_size = batch_size
self.replay_memory_size = replay_memory_size
if self.experience_replay:
self.replay_memory = deque(maxlen=self.replay_memory_size)
def best_qvalue(self, state):
func = None
if self.selfplay:
func = np.max if state.cur_player() == 0 else np.min
else:
func = np.max
return self.qfunction.best_value(state, self.env.actions(state), func)