% 11-00-ACA-Fingerprinting.tex

% move all configuration stuff into includes file so we can focus on the content
\input{include}


\subtitle{module 11.0: audio fingerprinting}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}
    % generate title page
	\input{include/titlepage}

    \section[overview]{lecture overview}
        \begin{frame}{introduction}{overview}
            \begin{block}{corresponding textbook section}
                    %\href{http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6331126}{Chapter 9: Audio Fingerprinting} (pp.~163--167)
                    chapter~11
            \end{block}

            \begin{itemize}
                \item   \textbf{lecture content}
                    \begin{itemize}
                        \item   introduction to audio fingerprinting
                        \item   in-depth example for fingerprint extraction and retrieval
                    \end{itemize}
                \bigskip
                \item<2->   \textbf{learning objectives}
                    \begin{itemize}
                        \item   discuss goals and limitations of audio fingerprinting systems as compared to watermarking or cover song detection systems
                        \item   describe the processing steps of the Philips fingerprinting system
                    \end{itemize}
            \end{itemize}
            \inserticon{directions}
        \end{frame}

    \section[intro]{introduction}
       \begin{frame}{audio fingerprinting}{introduction}
            \begin{itemize}
                \item	\textbf{objective}: 
                    \begin{itemize}
                        \item   represent a recording with a compact and unique digest\\ ($\rightarrow$ \textit{fingerprint}, \textit{perceptual hash})
                        
                        \bigskip
                        \item<2->   allow quick matching between previously stored fingerprints and an extracted fingerprint
                    \end{itemize}
                \bigskip
                \item<3->	\textbf{applications}:
                    \begin{itemize}
                        \item	\textit{broadcast monitoring}:\\ automate verification for royalties/infringement claims
                        \item	\textit{value-added services}:\\ offer information and metadata
                    \end{itemize}
            \end{itemize}
        \end{frame}
        
        \begin{frame}{audio fingerprinting}{fingerprinting vs.\ watermarking}
            \begin{itemize}
                \item	\textbf{fingerprinting}:
                    \begin{itemize}
                        \item identifies \textit{recording} (but not musical content)
                    \end{itemize}
                \item	\textbf{watermarking}:
                    \begin{itemize}
                        \item embeds perceptually ``unnoticeable'' data block in the audio
                        \item   identifies \textit{instance} of recording
                    \end{itemize}
            \end{itemize}
            \pause
            \begin{footnotesize}
                \begin{table}
                    \centering
                    \begin{tabular}{lcc} %{\textwidth}{@{\extracolsep{\fill}}ccccccccccccc}
                        \\ \hline
                        \textbf{\emph{Property}}	 & \textbf{\emph{Fingerprinting}}	 & \textbf{\emph{Watermarking}}\\
                         \hline
                        \textbf{Allows Legacy Content Indexing}	 & +	 & --\\
                        \textbf{Allows Embedded (Meta) Data}	 & --	 & +\\
                        \textbf{Leaves Signal Unchanged}	 & +	 & --\\
                        \textbf{Identification of}	 & Recording	 & User or Interaction\\
                    \end{tabular}
                \end{table}
            \end{footnotesize}
        \end{frame}

    \section[requirements]{requirements}
        \begin{frame}{audio fingerprinting}{fingerprint requirements}
            \begin{itemize}%
                \item	\textbf{accuracy \& reliability}:\\ minimize false negatives/positives
                \item<2->	\textbf{robustness \& security}: \\ robust against distortions and attacks
                \item<3->	\textbf{granularity}:\\ quick identification in a real-time context
                \item<4->	\textbf{versatility}:\\ independent of file format, etc.
                \item<5->	\textbf{scalability}:\\ good database performance
                \item<6->	\textbf{complexity}:\\ implementation possible on embedded devices
            \end{itemize}
        \end{frame}
        
    \section[approaches]{approaches}
        \begin{frame}{audio fingerprinting}{general fingerprinting system}
            \begin{figure}
                \centering
                \input{pict/fingerprinting_system}	
            \end{figure}
        \end{frame}
        \begin{frame}{audio fingerprinting}{brainstorm}
            \question{How does it work? MD5?}
        \end{frame}
        
     \section{philips}   
        \begin{frame}{audio fingerprinting}{system example: philips extraction 1/3}
            \begin{footnotesize}
                \begin{columns}[T]
                    \column{.3\textwidth}
                        \scalebox{.75}
                        {
                            \centering
                            \input{pict/fingerprinting_flowphilips}
                        }
                    \column{.7\textwidth}%\vspace{-5mm}
                        \begin{enumerate}
                            \item<1->	\textbf{pre-processing}:\\ downmixing \& downsampling (\unit[5]{kHz})
                            \item<2->	\textbf{STFT}: $\mathcal{K}=2048$, overlap $\frac{31}{32}$
                            \item<3->	\textbf{log frequency bands}:\\ $33$ bands from \unit[300--2000]{Hz}
                            \item<4->	\textbf{freq derivative}: $33$ bands
                            \item<4->	\textbf{time derivative}: $32$ bands
                            \item<5->	\textbf{quantization}: 
                                \begin{tiny}
                                    \begin{equation*}\label{eq:fingerprint}
                                        v_\mathrm{FP}(k,n)	= \begin{cases}
                                                        1 & \text{if } \big(\Delta{E}(k,n) - \Delta{E}(k,n-1)\big) > 0\\
                                                        0 & \text{otherwise}
                                                    \end{cases}\nonumber
                                    \end{equation*}
                                \end{tiny}

                            \item<6->[$\Rightarrow$]	\textbf{\unit[32]{bit}} \textit{subfingerprint}
                        \end{enumerate}
                \end{columns}
            \end{footnotesize}
        \end{frame}

        \begin{frame}{audio fingerprinting}{system example: philips extraction 2/3}
            \begin{itemize}
                \item[] \textbf{fingerprint}
                    \begin{itemize}
                        \item	$256$ subsequent subfingerprints
                        \item<2->[$\Rightarrow$]
                        \item<2->	\textit{length}: \unit[3]{s}
                        \item<2->	\textit{size}: $256\cdot \unit[4]{Byte} = \unit[1]{kByte}$
                    \end{itemize}
                \smallskip
                \item<3->[]   \textbf{example}:
                    \begin{itemize}
                        \item   \unit[5]{min} song
                        \begin{equation*}
                            \unit[1]{kByte} \cdot \frac{5\cdot 60 \unit{s}}{\unit[3]{s}} = \unit[100]{kByte}
                        \end{equation*}
                    \end{itemize}
                    
                    \begin{itemize}
                        \item<4->   database with 1 million songs (avg.\ length \unit[5]{min})
                        \begin{equation*}
                            10^6\cdot 256\cdot\frac{5\cdot 60 \unit{s}}{\unit[3]{s}} = \unit[25.6\cdot 10^9]{subfingerprints}
                        \end{equation*}
                        \item<4->[$\Rightarrow$] \unit[100]{GByte} storage
                    \end{itemize}
            \end{itemize}
        \end{frame}
        
        \begin{frame}{audio fingerprinting}{system example: philips extraction 3/3}
            \begin{columns}
            \column{.4\linewidth}
                \begin{itemize}
                    \item   original: \includeaudio{pop_excerpt}
                    \item   low quality encoding: \includeaudio{pop_excerpt_lq}
                \end{itemize}
            \column{.6\linewidth}
                \vspace{-5mm}
                \figwithmatlab{Fingerprint}
            \end{columns}
        \end{frame}
        
        \begin{frame}{audio fingerprinting}{system example: philips identification 1/3}
            \begin{itemize}
                \item	\textbf{database}
                    \begin{itemize}
                        \item	contains all subfingerprints for all songs
                        \item<2->	previous example database: $25$ billion subfingerprints
                        \bigskip
                    \end{itemize}
                \item<3->	\textbf{problem}
                    \begin{itemize}
                        \item how to identify fingerprint efficiently?
                    \end{itemize}
            \end{itemize}
        \end{frame}
        
        \begin{frame}{audio fingerprinting}{system example: philips identification 2/3}
            \begin{itemize}
                \item \textbf{simple system}:
                    \begin{enumerate}
                        \item	create lookup table with all possible subfingerprints ($2^{32}$) pointing to occurrences
                        \bigskip
                        \item<2->	assume at least one of the extracted $256$ subfingerprints is error-free\\
                        \item<2->[$\Rightarrow$] only entries listed at $256$ positions of the table have to be checked
                        \bigskip
                        \item<3->	compute \textit{Hamming} distance between extracted fingerprint and candidates
                    \end{enumerate}
            \end{itemize}
        \end{frame}
        
        \begin{frame}{audio fingerprinting}{system example: philips identification 3/3}
            \begin{itemize}
                \item \textbf{variant 1}: 
                    \begin{itemize}
                        \item	allow \textit{one} bit error  
                        \item<2->[$\Rightarrow$] workload increase by factor $\approx 33$
                     \end{itemize}
               \bigskip
                \item<3-> \textbf{variant 2}: 
                    \begin{itemize}
                        \item<3->	introduce concept of bit error probability into fingerprint extraction
                            \begin{itemize}
                                \item	small energy difference $\rightarrow$ high error probability
                                \item	large energy difference $\rightarrow$ low error probability
                            \end{itemize}
                        \bigskip
                        \item<4->	rank bits per subfingerprint by error probability and check only for bit errors at likely positions
                    \end{itemize}
            \end{itemize}
        \end{frame}

     \section{shazam}   
        \begin{frame}{audio fingerprinting}{other systems: shazam}
            \vspace{-3mm}
            \begin{figure}
                \centering
                \includegraphics[scale=.23]{graph/fingerprint_shazaam}
            \end{figure}
            \vspace{-5mm}
            plot from \footfullcite{wang_industrial_2003}
            
        \end{frame}
    
    \section{summary}
        \begin{frame}{summary}{lecture content}
            \begin{itemize}
                \item   \textbf{audio fingerprinting}
                    \begin{itemize}
                        \item   represent recording with compact, robust, and unique fingerprint
                        \item   focus on (perceptual) audio representation rather than ``musical'' content
                        \item   allow efficient matching of this fingerprint with database
                    \end{itemize}
                \bigskip
                \item   \textbf{often confused with other tasks}
                    \begin{enumerate}
                        \item   audio watermarking
                        \item   cover song detection
                    \end{enumerate}
            \end{itemize}
            \inserticon{summary}
        \end{frame}
\end{document}