% examples_long.tex

\subsection{Transformer}
\label{sec:transformer}

We define a Transformer used autoregressively as a language model.
The input is a sequence of one-hot vectors, from which we compute word embeddings and positional encodings:
\begin{align*}
  I &\in \{0, 1\}^{\seq \times \vocab} & \nsum{\vocab} I &= 1 \\
  W &= (E \ndot{\vocab} I)\sqrt{|\layer|} & E &\in \reals^{\vocab \times \layer} \\
  P &\in \reals^{\seq \times \layer} \\
  P_{\seq(p), \layer(i)} &= \begin{cases}
    \sin((p-1) / 10000^{(i-1) / |\layer|}) & \text{$i$ odd} \\ 
    \cos((p-1) / 10000^{(i-2) / |\layer|}) & \text{$i$ even.}
  \end{cases}
\end{align*}

Then we use $L$ layers of self-attention and feed-forward neural networks:
\begin{align*} 
X^0 &= W+P \\
T^1 &= \text{LayerNorm}^1(\text{SelfAtt}^1(X^0) + X^0)\\
X^1 &= \text{LayerNorm}^{1'}(\text{FFN}^1(T^1) + T^1)\\
&\vdotswithin{=} \\
T^{L} &= \text{LayerNorm}^L(\text{SelfAtt}^L(X^{L-1}) + X^{L-1})\\
X^{L} &= \text{LayerNorm}^{L'}(\text{FFN}^L(T^L) + T^L)\\
O &= \nfun{\vocab}{softmax}(E \ndot{\layer} X^L)
\end{align*}
where $\text{LayerNorm}$, $\text{SelfAtt}$ and $\text{FFN}$ are defined below.

Layer normalization ($l = 1, 1', \ldots, L, L'$):
\begin{align*}
  \text{LayerNorm}^l \colon \reals^{\layer} &\rightarrow \reals^{\layer} \\
  \text{LayerNorm}^l(X) &= \nfun{\layer}{standardize}(X) \odot \gamma^l + \beta^l \\
  \beta^l, \gamma^l &\in \reals^{\layer}
\end{align*}

We defined attention in \S\ref{sec:attention}; the Transformer uses multi-head self-attention, in which queries, keys, and values are all computed from the same sequence.
\begin{align*}
  \text{SelfAtt}^l \colon \reals^{\seq \times \layer} &\rightarrow \reals^{\seq \times \layer} \\
  \text{SelfAtt}^l(X) &= Y
\end{align*}
where
\begin{align*}
  |\seq| &= |\seq'| \\
  |\key| = |\val| &= |\layer|/|\heads| \\
  Q &= W^{l,Q} \ndot{\layer} X_{\seq\rightarrow\seq'} & W^{l,Q} &\in \reals^{\heads \times \layer \times \key} \\
  K &= W^{l,K} \ndot{\layer} X & W^{l,K} &\in \reals^{\heads \times \layer \times \key} \\
  V &= W^{l,V} \ndot{\layer} X & W^{l,V} &\in \reals^{\heads \times \layer \times \val} \\
  M & \in \reals^{\seq \times \seq'} \\
  M_{\seq(i), \seq'(j)} &= \begin{cases}
    0 & i \leq j\\
    -\infty & \text{otherwise}
  \end{cases} \\
  Y &= W^{l,O} \ndot{\heads \\ \val} \text{Attention}(Q, K, V, M)_{\seq'\rightarrow\seq} & W^{l,O} &\in \reals^{\heads \times \val \times \layer}
\end{align*}

Feed-forward neural networks:
\begin{align*}
  \text{FFN}^l \colon \reals^{\layer} &\rightarrow \reals^{\layer} \\
  \text{FFN}^l(X) &= X^2
\end{align*}
where
\begin{align*}
  X^1 &= \text{relu}(W^{l,1} \ndot{\layer} X + b^{l,1}) & W^{l,1} &\in \reals^{\hidden \times \layer} & b^{l,1} &\in \reals^{\hidden} \\
  X^2 &= W^{l,2} \ndot{\hidden} X^1 + b^{l,2} & W^{l,2} &\in \reals^{\layer \times \hidden} & b^{l,2} &\in \reals^{\layer}.
\end{align*}

\subsection{LeNet}

\begin{align*}
X^0 &\in \reals^{\batch \times \chans[c_0] \times \height \times \width} \\
T^1 &= \text{relu}(\text{Conv}^1(X^0)) \\
X^1 &= \text{MaxPool}^1(T^1) \\
T^2 &= \text{relu}(\text{Conv}^2(X^1)) \\
X^2 &= \text{MaxPool}^2(T^2)_{(\height,\width,\chans)\rightarrow\layer} \\
X^3 &= \text{relu}(W^3 \ndot{\layer} X^2 + b^3) & W^3 &\in \reals^{\hidden \times \layer} & b^3 &\in \reals^{\hidden} \\
O &= \nfun{\classes}{softmax} (W^4 \ndot{\hidden} X^3 + b^4) & W^4 &\in \reals^{\classes \times \hidden} & b^4 &\in \reals^{\classes}
\end{align*}
As an alternative to the flattening operation in the equation for $X^2$, we could have written
\begin{align*}
X^2 &= \text{MaxPool}^2(T^2) \\
X^3 &= \text{relu}(W^3 \ndot{\height \\ \width \\ \chans} X^2 + b^3) & W^3 &\in \reals^{\hidden \times \height \times \width \times \chans}.
\end{align*}

The convolution and pooling operations are defined as follows:
\begin{align*}
\text{Conv}^l(X) &= \text{Conv2d}(X; W^l, b^l)_{\chans'\rightarrow\chans}
\end{align*}
where
\begin{align*}
W^l & \in \reals^{\chans'[c_l] \times \chans[c_{l-1}] \times \kh[kh_l] \times \kw[kw_l]} \\
b^l &\in \reals^{\chans'[c_l]}
\end{align*}
and
\begin{align*}
\text{MaxPool}^l(X) &= \text{MaxPool2d}_{ph^l,pw^l}(X).
\end{align*}