-
Notifications
You must be signed in to change notification settings - Fork 0
/
cs4243-final-cheatsheet.tex
675 lines (645 loc) · 32.9 KB
/
cs4243-final-cheatsheet.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
\documentclass{article}
\usepackage[a4paper, margin=3mm, landscape]{geometry}
\usepackage{multicol}
\usepackage{xcolor}
\usepackage{enumitem}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{listings}
\usepackage{soul}
\usepackage{graphicx}
\pdfinfo{
/Title (CS4243.pdf)
/Creator (TeX)
/Producer (pdfTeX 1.40.0)
/Author (Jason Qiu)
/Subject (CS4243)
/Keywords (CS4243, nus, cheatsheet, pdf)
}
\graphicspath{ {./img/} }
\pagestyle{empty}
\setcounter{secnumdepth}{0}
\setlength{\columnseprule}{0.25pt}
% Redefine section commands to use less space
\makeatletter
\renewcommand{\section}{\@startsection{section}{1}{0mm}%
{-1ex plus -.5ex minus -.2ex}%
{0.5ex plus .2ex}%x
{\normalfont\large\bfseries}}
\renewcommand{\subsection}{\@startsection{subsection}{2}{0mm}%
{-1explus -.5ex minus -.2ex}%
{0.5ex plus .2ex}%
{\normalfont\normalsize\bfseries}}
\renewcommand{\subsubsection}{\@startsection{subsubsection}{3}{0mm}%
{-1ex plus -.5ex minus -.2ex}%
{1ex plus .2ex}%
{\normalfont\small\bfseries}}%
\makeatother
% Adjust spacing for all itemize/enumerate
\setlength{\leftmargini}{0.5cm}
\setlength{\leftmarginii}{0.5cm}
\setlist[itemize,1]{leftmargin=2mm,labelindent=1mm,labelsep=1mm}
\setlist[itemize,2]{leftmargin=2mm,labelindent=1mm,labelsep=1mm,label=$\bullet$}
\setlist[itemize,3]{leftmargin=2mm,labelindent=1mm,labelsep=1mm,label=$\bullet$}
\setlist[enumerate,1]{leftmargin=2mm,labelindent=1mm,labelsep=1mm}
\setlist[enumerate,2]{leftmargin=2mm,labelindent=1mm,labelsep=1mm,label=(\arabic*)}
\setlist[itemize]{nolistsep}
\setlist[enumerate]{nolistsep}
% Adjust column separator width in matrix
\setlength{\arraycolsep}{2.5pt}
% Font
\renewcommand{\familydefault}{\sfdefault}
% Define colors for math formulas
\definecolor{myblue}{cmyk}{1,.72,0,.38}
\everymath\expandafter{\the\everymath \color{myblue}}
% Custom command for keywords
\definecolor{highlight}{RGB}{251,243,218}
\newcommand{\keyword}[2]{\sethlcolor{highlight}\hl{\textbf{#1}} - #2}
\newcommand{\ilkeyword}[1]{\sethlcolor{highlight}\hl{\textbf{#1}}}
% Define colors and style for code
\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codered}{HTML}{CC241D}
\definecolor{backcolor}{rgb}{0.95,0.95,0.95}
\lstdefinestyle{codestyle}{
backgroundcolor = \color{backcolor},
commentstyle = \color{codegray},
keywordstyle = \color{codered},
stringstyle = \color{codegreen},
basicstyle = \ttfamily,
breakatwhitespace = false,
showstringspaces = false,
breaklines = true,
showtabs = false,
tabsize = 2
}
\lstset{style = codestyle}
% -----------------------------------------------------------------------
\begin{document}
\begin{multicols*}{4}
\footnotesize
% Title box
\begin{center}
\fbox{
\parbox{0.8\linewidth}{
\centering \textcolor{black}{
{\Large\textbf{CS4243}} \\
\normalsize{AY23/24 Sem 2}} \\
{\footnotesize \textcolor{gray}{github.com/jasonqiu212}}
}
}
\end{center}
\section{05. Segmentation}
\begin{itemize}
\item Goal: Separate image into coherent regions
\item Idea: \keyword{Clustering}{Group similar data points together}
\item Challenges: What makes 2 points same/different? Choice of features (e.g. Color, Intensity, Position), Which clustering algorithm?
\item \keyword{k-Means Clustering}{Iteratively re-assign points to nearest cluster center}
\begin{enumerate}
\item Randomly initialize the cluster centers $c_1, \ldots, c_K$
\item For each point $p_i$, find the closest $c_j$ to put $p_i$ in
\item Set $c_j$ to be mean of points in cluster $j$
\item Repeat, if $c_j$ have changed up to some threshold
\end{enumerate}
\begin{itemize}
\item Pros: Simple, Converges to local min.
\item Cons: Setting $K$, Sensitive to initial centers (Since k-means converges to local min.), Sensitive to outliers (Can add more clusters), Assumes spherical clusters
\end{itemize}
\item \ilkeyword{Simple Linear Iterative Clustering (SLIC) Superpixels}
\begin{itemize}
\item \keyword{Superpixel}{Group of pixels that share common traits}
\begin{itemize}
\item Application: Inputs to other CV algo. since more compact representation with perceptual meaning
\end{itemize}
\item Num. of pixels: $n_{tp}$; Target num. of superpixels: $n_{sp}$
\item Initial width of each superpixel: $s = \sqrt{n_{tp} / n_{sp}}$
\item Features: $z = [r,g,b,x,y]$
\item Color distance: $d_c = || \langle r_j,g_j,b_j \rangle - \langle r_i,g_i,b_i \rangle ||$
\item Spatial distance: $d_s = || \langle x_j,y_j \rangle - \langle x_i,y_i \rangle ||$
\item Scaling factors: $d_{cm}$ and $d_{sm}=s$ set as max. expected values of $d_c$ and $d_s$ respectively
\item $D = \sqrt{(\frac{d_c}{d_{cm}})^2+(\frac{d_s}{d_{sm}})^2} = \sqrt{d_c^2 + (\frac{d_s}{s})^2 c^2}$
\end{itemize}
\begin{enumerate}
\item Split img. into grid of size $s \times s$. Set cluster centers as lowest gradient position in $3 \times 3$ neighborhood from superpixel center to speed up convergence since initialize on value common to surrounding.
\item For each cluster center, check distance to all pixels within $2s \times 2s$ neighborhood. Assign pixels to closest checked center.
\item Update cluster centers using mean and repeat if not converged (Same as k-Means)
\item Optional: Replace superpixel region by average value to create stained glass effect
\end{enumerate}
\begin{itemize}
\item Modification of k-Means: Not random initialization, Compute pixel's distance only to closest set of cluster centers
\end{itemize}
\item \keyword{Mean-Shift Clustering}{Find local density maxima in feature space}
\begin{itemize}
\item \keyword{Attraction basin}{Region in feature space for which all trajectories of centroids lead to same mode}
\item \keyword{Cluster}{All data points in attraction basin of a mode}
\end{itemize}
\begin{enumerate}
\item For each data point:
\begin{enumerate}
\item Define window around and get centroid
\item Shift window to centroid
\item Repeat until window centroid stops moving
\end{enumerate}
\end{enumerate}
\begin{itemize}
\item Segmentation with Mean Shift: Do mean shift and merge pixels in same attraction basin
\item Choosing window size: Trial and error, Sample points and use avg. dist. to knn. (Num. of neighbors needs to be large enough to ensure increase in density)
\begin{itemize}
\item Larger window size $\rightarrow$ Fewer clusters
\end{itemize}
\item Pros: No assumptions on cluster shape, 1 parameter, Finds variable num. of modes (vs. specified $k$ in k-Means), Robust to outliers
\item Cons: Choosing $h$, Slow, Scales poorly with feature space dimension
\item Optimizations:
\begin{itemize}
\item After each run of mean shift, assign all points within radius $r$ of end point to same cluster
\item Assign points in radius $c < r$ of search path to mode
\end{itemize}
\end{itemize}
\end{itemize}
\section{06. Texture}
\begin{itemize}
\item \keyword{Texture}{Pattern with repeating elements}
\item \keyword{Filter Bank}{Measures variety of structures in local neighborhood and generates multi-dimensional features}
\begin{itemize}
\item Goal: How to represent texture?
\item Idea: Apply filters with small windows to generate statistics that summarize local patterns. Dist. in feature space bet. windows $\rightarrow$ Pixel's texture similarity
\item $d$ filters $\rightarrow$ $d$-dimensional feature vector
\item Choosing window size: Try many sizes and look for one where statistic does not change much
\item Choose filters in different scales and orientations (to solve window size problem)
\item \keyword{Gabor Filter}{Represent filter banks mathematically by combining sinusoids with exp. (Gaussian) envelope}
\end{itemize}
\item \keyword{Texton}{Characterizes texture by replacing each pixel with integer representing \textbf{texture type}}
\begin{enumerate}
\item Apply filter bank to training image
\item Cluster in feature space and store cluster centers (Texton dictionary)
\item For test image, filter image with same filter bank to get feature vector for each pixel. Assign each pixel to nearest cluster. Cluster ID = Texton ID.
\item For a given region, compute \ilkeyword{texton histograms}
\end{enumerate}
\begin{itemize}
\item Classification: Given new img., compare hist. to other trng. samples and assign to label with most similarity
\item Segmentation: Use texton histograms as a feature
\end{itemize}
\item \keyword{Perceived Boundaries}{Segmentation by human}
\begin{itemize}
\item Idea: Texture gradients indicate boundaries well
\end{itemize}
\begin{enumerate}
\item For each pixel, consider a disk that is split into 2 halves with some orientation
\item Measure texture diff. bet. 2 halves via texton hist.
\item Try all orientations. Orientation with high difference suggests boundaries.
\end{enumerate}
\end{itemize}
\section{07. Keypoints}
\begin{itemize}
\item Motivation: How to stitch 2 images (e.g. Panaroma)?
\begin{enumerate}
\item Keypoints: Find locations
\item Descriptors: Rep. surrounding regions with math
\item Do the matching
\end{enumerate}
\item Good keypoints are \textbf{repeatable} and \textbf{distinct}
\item \ilkeyword{Harris Corner Detection}
\begin{itemize}
\item Significance: Corners have big changes in all directions when shifting window
\item Given window $W$ shifted by offset $(u,v)$:
\[
E(U,v)=\sum_{(x,y) \in W} (I(x+u,y+v) - I(x,y))^2
\]
\item Assuming only small shifts (for Taylor Series Exp.):
\[
E(u,v) = Au^2 + 2Buv + Cv^2 = \begin{bmatrix}
u & v \\
\end{bmatrix}
\begin{bmatrix}
A & B \\
B & C \\
\end{bmatrix}
\begin{bmatrix}
u \\
v \\
\end{bmatrix}
\]
\begin{itemize}
\item $A = \sum_{(x,y) \in W} I_x^2$ $\quad$ $B = \sum_{(x,y) \in W} I_x I_y$
\item $C = \sum_{(x,y) \in W} I_y^2$
\item \keyword{2nd Moment Matrix (H)}{Middle matrix}
\end{itemize}
\item $E = k$ visualized as ellipse, where $H$ controls shape
\begin{itemize}
\item Eigenvectors of $H$ $\rightarrow$ Axes orientation
\item Eigenvalues of $H$ $\rightarrow$ Axes length
\end{itemize}
\includegraphics[scale=0.18]{h-ellipse.jpg}
\item \ilkeyword{Eigenvectors} of $A$ are vectors $\mathbf{x}$ that: $A\mathbf{x} = \lambda \mathbf{x}$
\item \ilkeyword{Eigenvalue ($\lambda$)} corresponds to $\mathbf{x}$: $\det(A-\lambda I) = 0$
\item Since $A = H$ is $2 \times 2$: $\lambda_{\pm} = \frac{1}{2} ((h_{11} + h_{22}) \pm \sqrt{2h_{12}h_{21} + (h_{11} - h_{22})^2})$
\item After getting $\lambda$s, find $\mathbf{x}$: $(A - \lambda I) \mathbf{x} = 0$
\item Both $\lambda_{\text{max}}$ and $\lambda_{\text{min}}$ are large $\rightarrow$ Corner
\item 'Cornerness' Score: $R = \min(\lambda_1, \lambda_2)$ (But getting $\lambda$ is slow)
\item \ilkeyword{Harris Operator}: $R = \det (H) - \kappa (\text{trace}(H))^2$
\begin{itemize}
\item $\det (H) = AC - B^2 = \lambda_1 \lambda_2 $
\item $\text{trace} (H) = A + C = \lambda_1 + \lambda_2 $
\item $R > 0$ $\rightarrow$ Corner, $R<0$ $\rightarrow$ Edge, $R \approx 0$ $\rightarrow$ Flat
\end{itemize}
\begin{enumerate}
\item Compute gradient for each point in image
\item Compute $H$ matrix for each image window and get 'correctness' score
\item Find points with window where $R > $ threshold
\item Take points of local maxima
\end{enumerate}
\item \keyword{Non-Max. Suppression}{Iteratively search for max. values, then zero everything in surrounding window}
\begin{itemize}
\item Window size important
\item Adaptive: To prevent uneven distri. of keypoints in areas of higher contrast, pick corners which are both local max. and whose response is greater than all neighboring local max.
\end{itemize}
\item In practice: $H = \sum_{(x,y) \in W} w_{x,y} \begin{bmatrix}
I_x^2 & I_x I_y \\
I_x I_y & I_y^2 \\
\end{bmatrix}$ (e.g. Convolve with Gaussian)
\end{itemize}
\item \ilkeyword{Harris Corner Invariances}
\begin{itemize}
\item Purpose: If img. transf., how repeatable is detection?
\item \keyword{Equivariance}{Image transformed, and detection location undergoes similar transformation}
\item \keyword{Invariance}{Image tranf., but no detection score change}
\item Translation: Equivariant and invariant
\item Rotation: Equivariant and invariant
\item Photometric transformation (Assume $I' = aI + b$): Invariant to $b \neq 0$, but not invariant to $a \neq 1$
\item Scaling: Not equivariant and invariant
\begin{itemize}
\item Scale of window can determine if location is keypoint $\rightarrow$ Need to scale up window by image scale
\item \keyword{Auto. Scale Selection}{When looking for keypoints, try window sizes and find scale that gives local max.}
\end{itemize}
\end{itemize}
\item \keyword{Laplacian of Gaussian}{Alternative keypoint detector which detects 'blobs' and is scale-sensitive}
\begin{itemize}
\item $\nabla^2 g = \frac{\partial^2 g}{\partial x^2} + \frac{\partial^2 g}{\partial y^2}$
\item Practice: Approx. by Difference of Gaussian for speed
\item Idea: Convolution with LoG has highest response when signal has same scale as Gaussian. Built-in scale sensitivity by varying scale $\sigma$.
\item Implementation: Fix window and kernel size; rescale img. with Gaussian blurring and downsampling
\end{itemize}
\end{itemize}
\section{08. Descriptors}
\begin{itemize}
\item Goal: Get feature vector surrounding each keypoint and measure similarity between feature vectors for matching
\item Desc. should be invariant/equivariant and unique. E.g.:
\begin{itemize}
\item Raw intensity: Good for exact template matching, but sensitive to lighting
\item Image gradient: Invariant to raw intensity (i.e. Lighting), but sensitive to transformations
\item Color histogram: Invariant to scale and rotation, but not sensitive to spatial layout
\item Spatial histogram: Compute color histograms over spatial cells. But not invariant to large rotations.
\begin{itemize}
\item Orientation normalization: Normalize orientation of patch based on dominant image gradient
\item Save orien. angle $\theta$ w/ keypoint (e.g. Mean, mode)
\end{itemize}
\item \keyword{GIST Descriptor}{Rough spatial distribution of image gradients that is rotation invariant}
\begin{enumerate}
\item Divide image into $4 \times 4$ grid
\item Apply Gabor filters (All dir. edge; $N$ filters)
\item Compute filter response averages for each cells
\item Size of descriptor: $4 \times 4 \times N$
\end{enumerate}
\end{itemize}
\item \keyword{SIFT}{Keypoint detector and descriptor}
\begin{itemize}
\item Detector: Uses multi-scale LoG to get scale invariance, orientation normalization for rotation invariance, and threshold for removing low-contrast and low-curvature keypoints
\end{itemize}
\item \ilkeyword{SIFT Descriptor}
\begin{enumerate}
\item Take $16 \times 16$-pixel window around keypoint. Partition window into $4 \times 4$ grid.
\item Compute \textbf{gradient orientations and magnitudes} for each pixel. Reweight magnitudes using Gaussian and discard pixels with low magnitude.
\item For each $4 \times 4$-pixel cell, make \textbf{histogram with 8 orientation bins}. Shift histogram binning by dominant orien. (i.e. Subtract by dom. orien.) for rotation invariance. Collapse into $1 \times 128$ vector.
\item Normalize vector to unit length
\end{enumerate}
\begin{itemize}
\item Invariant to scale, rotation, and lighting
\item Partially invariant to viewpoint (Up to $60^\circ$)
\item Quick and efficient
\end{itemize}
\item \keyword{Feature Matching}{Given feature in $I_1$, how to find best match in $I_2$?}
\begin{enumerate}
\item Define distance function that compares desc.
\begin{itemize}
\item Euclidean distance: $||f_1 - f_2||$ (Can give small distances for incorrect matches)
\item Ratio distance bet. best vs. next-best: $\frac{||f_1 - f_2||}{||f_1 - f_2'||}$
\end{itemize}
\item Nested loop: Find vector with min. distance in $I_2$
\begin{itemize}
\item Or ratio of best vs. next-best $<$ threshold
\end{itemize}
\end{enumerate}
\begin{itemize}
\item Evaluation: \ilkeyword{ROC curve} by varying threshold
\begin{itemize}
\item Recall vs. 1 - Specificity
\item Area Under the Curve (AUC): 1 is the best
\item \keyword{Recall}{$\frac{TP}{TP + FN}$} $\quad$ \keyword{Specificity}{$\frac{TN}{TN + FP}$}
\item \keyword{Precision}{$\frac{TP}{TP + FP}$}
\end{itemize}
\end{itemize}
\end{itemize}
\section{09. Homography}
\begin{itemize}
\item Goal: Stitch images from diff. viewpoints via projection
\item When to use: Scene is planar, approx. planar (i.e. Small depth variation), or only camera rotation
\item Problem: Given set of matched keypoints $\{ p_i, p_i' \}$, get transformation $p' = f(p;H)$ where $H$ are parameters
\item Given homography function: Convert to homogeneous coord., multiply by homo. matrix, and convert back
\[
p = \begin{bmatrix}
x \\
y \\
\end{bmatrix} P = \begin{bmatrix}
x \\
y \\
1 \\
\end{bmatrix};
P' = HP;
P' = \begin{bmatrix}
x' \\
y' \\
w' \\
\end{bmatrix} p' = \begin{bmatrix}
\frac{x'}{w'} \\
\frac{y'}{w'} \\
\end{bmatrix}
\]
\item \keyword{Direct Linear Transform}{Find best estimate of $H$}
\begin{enumerate}
\item For each matching, create $2 \times 9$ matrix $A_i$
\begin{itemize}
\item $
A_i = \begin{bmatrix}
-x & -y & -1 & 0 & 0 & 0 & xx' & yx' & x' \\
0 & 0 & 0 & -x & -y & -1 & xy' & yy' & y' \\
\end{bmatrix}
$
\end{itemize}
\item Concatenate into $2n \times 9$ matrix $A$
\item Compute SVD of $A = U \sum V^T$
\item Store vector of smallest singular value $h = v_{\hat{i}}$
\item Reshape to get $H$
\end{enumerate}
\begin{itemize}
\item Assumptions: Projective model with linear transf.
\item Cons: Sensitive to scaling (i.e. $P$: High res.; $P'$: Low res.) $\rightarrow$ Normalize, Outliers $\rightarrow$ Poor est. of $H$
\end{itemize}
\item \keyword{RANSAC}{More robust method for est. homographies}
\begin{itemize}
\item Motivation: DLT easily corrupted by outliers
\end{itemize}
\begin{enumerate}
\item Loop $N$ times
\begin{enumerate}
\item Sample randomly num. of pts. required to fit model
\item Solve for model params. using samples
\item Score by frac. of inliers in threshold $\delta$ of model
\end{enumerate}
\item Fit model to samples with most inliers
\end{enumerate}
\begin{enumerate}
\item RANSAC loop
\begin{enumerate}
\item Sample 4 matches ($H$ has 8 deg. of freedom)
\item Compute $H$ using DLT
\item Inliers: Get $P''$ using $H$ and check distance to $P'$
\item Keep $H$ if largest number of inliers
\end{enumerate}
\item Using best $H$ with most inliers, recompute using all inliers
\end{enumerate}
\begin{itemize}
\item $\delta$: Impacts if inliers are kept (Trial and error)
\item $N = \frac{\log(1-p)}{\log(1- (1-e)^s)}$ where $p$ is prob. that $\geq 1$ set of samples does not contain outliers, $e$ is prob. that point is outlier, and $s$ is num. of samples per iter.
\item Can loop $N$ times or stop early when expected prob. of inliers reached, but both need prob. of outliers
\item Integrate RANSAC with feature matching: Compute matches as before, add RANSAC loop and eliminate some matches that do not fit $H$
\end{itemize}
\item \keyword{Warping}{Moves pixels of an image}
\begin{itemize}
\item Mapping: Transf. from source to destination via $f$
\item Resampling: \ilkeyword{Splat} if map to bet. pixels, avg. if receive $>1$ source
\end{itemize}
\end{itemize}
\section{10. Optical Flow}
\begin{itemize}
\item \keyword{Flow}{Displacement of pixels bet. frames (Vector field)}
\item Assumptions:
\begin{itemize}
\item Color constancy: $I(x,y,t) = C$
\item Small motion: $I(x+u\delta t,y+v \delta t, t+\delta t) = I(x,y,t)$
\[
I_x u + I_y v + I_t = 0
\]
\item $I_t = I(x,y,t+1) - I(x,y,t)$
\end{itemize}
\item Problem: 1 equation, 2 unknowns
\item \keyword{Lucas-Kanade}{Assumes constant flow in small region}
\begin{itemize}
\item Given $m \times n$ patch: $Ax = b; x = (A^T A)^{-1}A^T b$
\[
\begin{bmatrix}
I_x(p_1) & I_y(p_1) \\
\vdots & \vdots \\
I_x(p_{m \times n}) & I_y(p_{m \times n}) \\
\end{bmatrix}
\begin{bmatrix}
u \\
v \\
\end{bmatrix} = - \begin{bmatrix}
I_t(p_1) \\
\vdots \\
I_t(p_{m \times n})
\end{bmatrix}
\]
\[
\begin{bmatrix}
u \\
v \\
\end{bmatrix} = - \begin{bmatrix}
\sum_{p \in P} I_x I_x & \sum_{p \in P} I_x I_y \\
\sum_{p \in P} I_y I_x & \sum_{p \in P} I_y I_y \\
\end{bmatrix}^{-1}
\begin{bmatrix}
\sum_{p \in P} I_x I_t \\
\sum_{p \in P} I_y I_t \\
\end{bmatrix}
\]
\item Requirements: $A^T A$ is invertible $\rightarrow$ $\det (A^T A) = \lambda_1 \lambda_2$ should be big, $A^T A$ is well-conditioned $\rightarrow$ $\frac{\lambda_{\text{max}}}{\lambda_{\text{min}}}$ should be small
\begin{itemize}
\item Produces \textbf{sparse flow}: Only for some features
\item Similar to corner detector: Corners good for flow
\end{itemize}
\item Aperture Problem: Given small window over an edge, hard to tell which direction line is moving
\begin{itemize}
\item Solution: Get windows with diff. gradients (Corner)
\end{itemize}
\item Aliasing: Undersampling of frames $\rightarrow$ Nearest match based on intensity is incorrect
\begin{itemize}
\item Similar example: Image motion is large
\item Solution: Reduce res. to reduce apparent movement
\end{itemize}
\includegraphics[scale=0.09]{coarse-to-fine-flow-estimation.jpg}
\end{itemize}
\item \keyword{Horn-Schunck}{Assumes smooth flow field}
\begin{itemize}
\item Minimization problem: Use gradient descent
\[
\min_{u,v} \sum_{i,j} (E_s(i,j) + \lambda E_d(i,j))
\]
\item $E_s(i,j) = \frac{1}{4} ((u_{ij} - u_{i+1,j})^2 + (u_{ij} - u_{i,j+1})^2 + (v_{ij} - v_{i+1,j})^2 + (v_{ij} - v_{v,j+1})^2)$
\item $E_d(i,j) = (I_x u_{ij} + I_y v_{ij} + I_t)^2$
\end{itemize}
\begin{enumerate}
\item Compute $I_x$, $I_y$, $I_t$ and initialize flow $u=v=\mathbf{0}$
\item Do until converge: $\hat{u}_{kl} = \bar{u}_{kl} - k I_x; \hat{v}_{kl} = \bar{v}_{kl} - k I_y$
\[
k = \frac{I_x \bar{u}_{kl} + I_y \bar{v}_{kl}+I_t}{\lambda^{-1} + I_x^2 + I_y^2} \text{ where } \bar{u}_{kl} \text{ is local avg.}
\]
\end{enumerate}
\begin{itemize}
\item Choice of $\lambda$: When small, maximize smoothness
\item Produces \textbf{dense flow}: Flow for all pixels
\item Good for when image motion is small
\end{itemize}
\item Evaluation: Euclidean distance, Cosine similarity
\end{itemize}
\section{11. Tracking}
\begin{itemize}
\item \keyword{Tracking}{Estimates parameters (location) of dynamic system (Feature points, target objects) over time}
\item Use optical flow for tracking? Flow only reliable for small motions, Occlusions and textureless regions (i.e. No corners) get bad est., LK gets sparse fields
\item How to match template:
\begin{itemize}
\item Template matching with normalized cross-correlation: Brute force, slow
\begin{itemize}
\item $x_{ij} = \frac{1}{||F||||w_{ij}||} \sum_{u=-k}^{k} \sum_{v=-k}^{k} f_{uv} \cdot p_{i+u,j+v}$
\end{itemize}
\item Multi-scale template matching: Start search with coarser res., Faster
\item Local refinement based on some initial guess (e.g. Previous frame): Fastest, but sensitive to guess
\end{itemize}
\item Notation:
\begin{itemize}
\item 2D image transformation: $W(\mathbf{x}; \mathbf{p})$ where $\mathbf{x} = \begin{bmatrix}
x & y \\
\end{bmatrix}$ and $\mathbf{p} = \{ p_1, \ldots, p_N \}$
\item Brightness constancy assump.: $I(x') = I(W(x; p))$
\item Translation: $W(\mathbf{x};\mathbf{p}) = \begin{bmatrix}
x + p_1 \\
y + p_2 \\
\end{bmatrix} = \begin{bmatrix}
1 & 0 & p_1 \\
0 & 1 & p_2 \\
\end{bmatrix} \begin{bmatrix}
x \\
y \\
1 \\
\end{bmatrix}$
\item Affine: $W = \begin{bmatrix}
p_1x + p_2y + p_3 \\
p_4x + p_5y + p_6 \\
\end{bmatrix} = \begin{bmatrix}
p_1 & p_2 & p_3 \\
p_4 & p_5 & p_6 \\
\end{bmatrix} \begin{bmatrix}
x \\
y \\
1 \\
\end{bmatrix}$
\end{itemize}
\item Image alignment problem: Given template $T(\mathbf{x})$ and source img. $I(\mathbf{x})$, solve for transformation parameters $\mathbf{p}$ that maps coordinates of img. to template such that: $\min_{\mathbf{p}} \sum_{\mathbf{x}} (I(W(\mathbf{x};\mathbf{p})) - T(\mathbf{x}))^2$
\item \ilkeyword{Lucas-Kanade Image Alignment}
\begin{itemize}
\item Assumption: Good initial guess of $\mathbf{p}$ and increment $\triangle{\mathbf{p}}$ from guess $\mathbf{p}$ is small $\rightarrow$ Linearize with Taylor Series and solve for $\triangle{\mathbf{p}}$
\end{itemize}
\begin{enumerate}
\item Warp image with initial $\mathbf{p}$: $I(W(\mathbf{x};\mathbf{p}))$
\item Get error image: $T(\mathbf{x}) - I(W(\mathbf{x};\mathbf{p}))$
\item Get gradient: $\nabla I(x')$ where $x'$: Warped img. coord.
\item Get Jacobian: $\frac{\partial \mathbf{W}}{\partial \mathbf{p}} = \frac{\partial (\mathbf{W}_1, \mathbf{W}_2)}{\partial (p_1, \ldots, p_N)}$
\item Get Hessian approx.: $H = \sum_{\mathbf{x}} (\nabla I \frac{\partial \mathbf{W}}{\partial \mathbf{p}})^T (\nabla I \frac{\partial \mathbf{W}}{\partial \mathbf{p}})$
\item $\triangle{\mathbf{p}} = H^{-1} \sum_{\mathbf{x}} (\nabla I \frac{\partial \mathbf{W}}{\partial \mathbf{p}})^T (T(\mathbf{x}) - I(W(\mathbf{x};\mathbf{p})))$
\item $\mathbf{p} = \mathbf{p} + \triangle{\mathbf{p}}$
\end{enumerate}
\item \keyword{KLT Tracker}{Use LK img. alginment to track \textbf{feature points}}
\begin{itemize}
\item Good features to track: Hessian is well-conditioned matrix $\rightarrow$ Corners!
\end{itemize}
\begin{enumerate}
\item Find corners where $\min(\lambda_1, \lambda_2) > \lambda$ for Hessian
\item Loop over corners:
\begin{enumerate}
\item Get displacement to next frame using LK alignment
\item Store disp. of each corner, update corner position
\item Optional: Add corners every $M$ frames via step 1
\end{enumerate}
\item Return long trajectories for each corner point
\end{enumerate}
\begin{itemize}
\item Challenges: Determining features to track, How to track efficiently, Changing appearance of tracked points, \keyword{Drift}{Accumulation of small errors as model updates}, Handle addition/removal of tracked points
\item Feature tracking vs. Optical flow: Duality
\end{itemize}
\item \keyword{Template-based Tracking}{General algorithm to track \textbf{target objects}}
\begin{enumerate}
\item Mark bounding box around target obj. in 1st frame
\item From target object template, get template descriptor: $\mathbf{q} = \{ q_1, \ldots, q_M \}$
\item In next frame, find similar descriptor in neighborhood
\begin{itemize}
\item Candidate descriptor: $\mathbf{p}(\mathbf{y})$ centered at location $\mathbf{y}$
\item Find peak similarity in region: $\max_{\mathbf{y}} \rho (\mathbf{p}(\mathbf{y}),\mathbf{q})$
\end{itemize}
\item Update target and descriptor $\mathbf{q}$ and repeat
\end{enumerate}
\begin{itemize}
\item What features to use? (Naive: Raw intensity; Better: Gradient, hist.)
\item Search for candidates (Naive: Brute force image; Better: Use prev. results)
\item Finding best candidate (Naive: Simple global max; Better: Use prev. locations)
\item Update target's template (Naive: Keep fixed; Better: Update)
\end{itemize}
\item Cross-correlation as Naive Tracker: Crop region of interest as template and apply norm. cross-correlation with template as filter
\begin{itemize}
\item Problem: False local max. when target differs a lot from template $\rightarrow$ Template need to dynamically adapt
\end{itemize}
\item \keyword{MOSSE Filter}{Given target $x$, get unknown filter $g$ to get response $y$ modelled as Gaussian with small $\sigma$}
\begin{itemize}
\item $g = \arg \min_g \frac{1}{N} \sum_{i=1}^{N} (g \otimes x_i - y_i)^2 + \lambda||g||^2$
\item $g = (X^T X + \lambda I)^{-1}X^Ty$
\item $N$: Total number of target samples
\item $X$: $N \times n_g$
\includegraphics[scale=0.09]{mosse-filter.jpg}
\item Incr. update kernel: Discrete Fourier Transform
\begin{itemize}
\item Use multiplication, instead of X-corr. $\rightarrow$ Faster
\item Kernel remains additive across targets $\rightarrow$ Recently detected target gets higher weight to update kernel
\end{itemize}
\item Issues: Init., Occlusions, Exit frame, Clutter, Drift
\item Accuracy: How well does tracker bounding box overlap with ground truth box? $\frac{\text{Area} (B_{gt} \cap B_p)}{\text{Area} (B_{gt} \cup B_p)}$
\item Robustness: How many times does tracker fail?
\end{itemize}
\end{itemize}
\section{12. Deep Learning}
\begin{itemize}
\item \keyword{Neural Network}{Interconnected perceptrons with biases and weights}
\item Images as inputs: 1 fully-connected perceptron contains 1 weighted combo. of all input pixels $\rightarrow$ Ignores spatial info. (Neighboring pixels are more correlated) and slow
\begin{itemize}
\item Insight: Stationarity (Statistics are similar at diff. locations) $\rightarrow$ Use same weights $\rightarrow$ Same as convolution
\end{itemize}
\item \keyword{Convolution}{Apply multiple filters/kernels to image}
\begin{itemize}
\item Single filter $\rightarrow$ 1 2D-Feature map/channel
\item \keyword{Pooling}{Reduce large distortions $\rightarrow$ More invariance}
\begin{itemize}
\item Interleaving conv. with pooling causes later conv. to capture more of the image with same kernel size
\item \keyword{Receptive Field}{Set of img. pixels that intermediate output pixel depends on}
\item Convolutions after pooling increase receptive field
\end{itemize}
\item \keyword{Activation Func.}{Introduces non-linearities}
\begin{itemize}
\item $\text{ReLU}(x) = \max (0,x)$
\end{itemize}
\end{itemize}
\item Principles behind CNN:
\begin{itemize}
\item Local interactions: Spatial locality included and layers will expand from local region to global region
\item Parameter sharing: Stationarity assumption $\rightarrow$ Equivariant representation
\end{itemize}
\end{itemize}
\end{multicols*}
\end{document}