-
Notifications
You must be signed in to change notification settings - Fork 5
/
examples_long.tex
95 lines (87 loc) · 4.05 KB
/
examples_long.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
\subsection{Transformer}
\label{sec:transformer}
We define a Transformer used autoregressively as a language model.
The input is a sequence of one-hot vectors, from which we compute word embeddings and positional encodings:
\begin{align*}
I &\in \{0, 1\}^{\seq \times \vocab} & \nsum{\vocab} I &= 1 \\
W &= (E \ndot{\vocab} I)\sqrt{|\layer|} & E &\in \reals^{\vocab \times \layer} \\
P &\in \reals^{\seq \times \layer} \\
P_{\seq(p), \layer(i)} &= \begin{cases}
\sin((p-1) / 10000^{(i-1) / |\layer|}) & \text{$i$ odd} \\
\cos((p-1) / 10000^{(i-2) / |\layer|}) & \text{$i$ even.}
\end{cases}
\end{align*}
Then we use $L$ layers of self-attention and feed-forward neural networks:
\begin{align*}
X^0 &= W+P \\
T^1 &= \text{LayerNorm}^1(\text{SelfAtt}^1(X^0) + X^0)\\
X^1 &= \text{LayerNorm}^{1'}(\text{FFN}^1(T^1) + T^1)\\
&\vdotswithin{=} \\
T^{L} &= \text{LayerNorm}^L(\text{SelfAtt}^L(X^{L-1}) + X^{L-1})\\
X^{L} &= \text{LayerNorm}^{L'}(\text{FFN}^L(T^L) + T^L)\\
O &= \nfun{\vocab}{softmax}(E \ndot{\layer} X^L)
\end{align*}
where $\text{LayerNorm}$, $\text{SelfAtt}$ and $\text{FFN}$ are defined below.
Layer normalization ($l = 1, 1', \ldots, L, L'$):
\begin{align*}
\text{LayerNorm}^l \colon \reals^{\layer} &\rightarrow \reals^{\layer} \\
\text{LayerNorm}^l(X) &= \nfun{\layer}{standardize}(X) \odot \gamma^l + \beta^l \\
\beta^l, \gamma^l &\in \reals^{\layer}.
\end{align*}
We defined attention in \S\ref{sec:attention}; the Transformer uses multi-head self-attention, in which queries, keys, and values are all computed from the same sequence.
\begin{align*}
\text{SelfAtt}^l \colon \reals^{\seq \times \layer} &\rightarrow \reals^{\seq \times \layer} \\
\text{SelfAtt}^l(X) &= Y
\end{align*}
where
\begin{align*}
|\seq| &= |\seq'| \\
|\key| = |\val| &= |\layer|/|\heads| \\
Q &= W^{l,Q} \ndot{\layer} X_{\seq\rightarrow\seq'} & W^{l,Q} &\in \reals^{\heads \times \layer \times \key} \\
K &= W^{l,K} \ndot{\layer} X & W^{l,K} &\in \reals^{\heads \times \layer \times \key} \\
V &= W^{l,V} \ndot{\layer} X & W^{l,V} &\in \reals^{\heads \times \layer \times \val} \\
M & \in \reals^{\seq \times \seq'} \\
M_{\seq(i), \seq'(j)} &= \begin{cases}
0 & i \leq j\\
-\infty & \text{otherwise}
\end{cases} \\
Y &= W^{l,O} \ndot{\heads \\ \val} \text{Attention}(Q, K, V, M)_{\seq'\rightarrow\seq} & W^{l,O} &\in \reals^{\heads \times \val \times \layer}.
\end{align*}
Feed-forward neural networks:
\begin{align*}
\text{FFN}^l \colon \reals^{\layer} &\rightarrow \reals^{\layer} \\
\text{FFN}^l(X) &= X^2
\end{align*}
where
\begin{align*}
X^1 &= \text{relu}(W^{l,1} \ndot{\layer} X + b^{l,1}) & W^{l,1} &\in \reals^{\hidden \times \layer} & b^{l,1} &\in \reals^{\hidden} \\
X^2 &= W^{l,2} \ndot{\hidden} X^1 + b^{l,2} & W^{l,2} &\in \reals^{\layer \times \hidden} & b^{l,2} &\in \reals^{\layer}.
\end{align*}
\subsection{LeNet}
\begin{align*}
X^0 &\in \reals^{\batch \times \chans[c_0] \times \height \times \width} \\
T^1 &= \text{relu}(\text{Conv}^1(X^0)) \\
X^1 &= \text{MaxPool}^1(T^1) \\
T^2 &= \text{relu}(\text{Conv}^2(X^1)) \\
X^2 &= \text{MaxPool}^2(T^2)_{(\height,\width,\chans)\rightarrow\layer} \\
X^3 &= \text{relu}(W^3 \ndot{\layer} X^2 + b^3) & W^3 &\in \reals^{\hidden \times \layer} & b^3 &\in \reals^{\hidden} \\
O &= \nfun{\classes}{softmax} (W^4 \ndot{\hidden} X^3 + b^4) & W^4 &\in \reals^{\classes \times \hidden} & b^4 &\in \reals^{\classes}.
\end{align*}
As an alternative to the flattening operation in the equation for $X^2$, we could have written
\begin{align*}
X^2 &= \text{MaxPool}^2(T^2) \\
X^3 &= \text{relu}(W^3 \ndot{\height \\ \width \\ \chans} X^2 + b^3) & W^3 &\in \reals^{\hidden \times \height \times \width \times \chans}.
\end{align*}
The convolution and pooling operations are defined as follows:
\begin{align*}
\text{Conv}^l(X) &= \text{Conv2d}(X; W^l, b^l)_{\chans'\rightarrow\chans}
\end{align*}
where
\begin{align*}
W^l & \in \reals^{\chans'[c_l] \times \chans[c_{l-1}] \times \kh[kh_l] \times \kw[kw_l]} \\
b^l &\in \reals^{\chans'[c_l]}
\end{align*}
and
\begin{align*}
\text{MaxPool}^l(X) &= \text{MaxPool2d}_{ph^l,pw^l}(X).
\end{align*}