% 03-03-02-ACA-Input-TF-CQT.tex

% move all configuration stuff into includes file so we can focus on the content
\input{include}


\subtitle{module 3.3.2: time-frequency representations~---~constant-Q transform}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}
    % generate title page
	\input{include/titlepage}

    \section[overview]{lecture overview}
        \begin{frame}{introduction}{overview}
            \begin{block}{corresponding textbook section}
                    %\href{http://ieeexplore.ieee.org/xpl/articleDetails.jsp?tp=&arnumber=6331119&}{Chapter 2~---~Fundamentals}: pp.~24--26
                    section~3.3.2
            \end{block}

            \begin{itemize}
                \item   \textbf{lecture content}
                    \begin{itemize}
                        \item   constant-Q transform (CQT)
                    \end{itemize}
                \bigskip
                \item<2->   \textbf{learning objectives}
                    \begin{itemize}
                        \item   discussing advantages and disadvantages of different time-frequency transforms
                        \item   explaining the principles of the CQT and auditory filterbanks
                    \end{itemize}
            \end{itemize}
            \inserticon{directions}
        \end{frame}
        
    \section[intro]{introduction}
        \begin{frame}{non-FT time-frequency transforms}{introduction}
            \begin{itemize}
                \item   Fourier transform continues to be a much-used tool in audio signal processing and MIR
                \bigskip
                \item   but there are disadvantages, e.g.
                    \begin{itemize}
                        \item   frequency axis does not directly map to (perceptual) pitch axis
                        \item   frequency and time resolution inversely related
                        \smallskip
                        \item<2->[$\Rightarrow$] \textbf{alternative transforms} can be used
                    \end{itemize}
            \end{itemize}
        \end{frame}
        
    \section[CQT]{constant-Q transform}
        \begin{frame}{constant-Q transform}{introduction}
            \begin{itemize}
                \item<1-> DFT has a \textit{linear} frequency axis:
                    \begin{itemize}
                        \item	not perceptually meaningful: \textit{logarithmic} is a better match
                        \item	low pitch resolution at low frequencies
                    \end{itemize}
                \bigskip
                \item<2->[$\Rightarrow$] compute DFT-like transform {\color{highlight}{at specific frequencies}}
                    \begin{itemize}
                        \item   space frequencies logarithmically (constant $\mathcal{Q}$)
                        \item   resulting abscissa resolution is pitch-related
                        \item   parameter $c$ adjusts number of bins per octave
                    \end{itemize}
            \end{itemize}
            
            \bigskip
            \visible<2->{\begin{equation*}
                \mathcal{Q} = \frac{f}{\Delta f} = \frac{1}{2^{\nicefrac{1}{c}}-1}
            \end{equation*}
            }
        \end{frame}	

        \begin{frame}{constant-Q transform}{implementation 1/2}
            
            \begin{columns}
            \column{0.55\textwidth}
            \begin{eqnarray*}
                X_\mathrm{CQ}(k,n) &=& \frac{1}{\mathcal{K}(k)}\sum\limits_{i=i_{\mathrm{s}}(n)}^{i_{\mathrm{e}}(n)}{w_k(i-i_{\mathrm{s}})\cdot x(i) \e^{\mathrm{j}2\pi \frac{\mathcal{Q}\cdot(i-i_{\mathrm{s}})}{\mathcal{K}(k)}}}\\
                \mathcal{K}(k) &=& \frac{f_{\mathrm{S}}}{f(k)} \mathcal{Q}
            \end{eqnarray*} 
            
            \column{0.45\textwidth}
            \begin{itemize}
                \item   $f(k)$: frequency of bin index $k$
                \item   $\mathcal{K}(k)$: blocklength for bin index $k$
                \item   $\mathcal{Q}$: measure of pitch res.
                \item   $w_k$: window function
                \item   $i_\mathrm{s},i_\mathrm{e}$: start and stop time indices of block
                \item   $f_\mathrm{S}$: sample rate
            \end{itemize}
            \end{columns}
            \bigskip
            \begin{itemize}
                \item   long window for low frequencies (high freq res, low time res)
                \item   short window for high frequencies (low freq res, high time res)
            \end{itemize}
        \end{frame}	

        \begin{frame}{constant-Q transform}{implementation 2/2}
            \vspace{-5mm}
            \begin{columns}
                \column{.3\linewidth}
                    \begin{center}\textbf{non-overlapping}\end{center}
                    \vspace{-8mm}
                    \begin{figure}
                    \centering
                        \scalebox{1.2}{\input{pict/fundamentals_CQT}}
                    \end{figure}
                \column{.3\linewidth}
                    \begin{center}\textbf{overlapping}\end{center}
                    \vspace{10mm}
                    \vspace{15mm}
                \column{.3\linewidth}
                    \begin{center}\textbf{differences}\end{center}
                    \begin{itemize}
                        \item   outputs at multiple vs.\ one time resolution
                        \item   multiple different FFT lengths vs.\ one FFT length (zero-padded)
                        \item   dependent vs.\ independent definition of block and hop length
                    \end{itemize}
                    \vspace{15mm}
                \end{columns}
        \end{frame}	

        \begin{frame}{constant-Q transform}{CQT vs.\ DFT}
            \textbf{CQT}:
            \bigskip
            \begin{itemize}
                \item<1->[+]	perceptually/musically adapted frequency resolution
                \smallskip
                \item<2->[--]	time resolution depends on frequency
                \smallskip
                \item<3->[--]	not invertible
                \smallskip
                \item<4->[--]	no optimized implementation (compare FFT)
            \end{itemize}
        \end{frame}	

    %\section{Auditory Filterbanks}
        %\begin{frame}{auditory filterbanks}{introduction}
            %FT and related transforms bad models of physiological properties of the human ear:
                %\begin{itemize}
                    %\item   frequency resolution (critical bands)
                    %\item   frequency scale (pitch resolution)
                    %\item   loudness \& masking
                    %\item   event perception \&  time integration
                %\end{itemize}
            %
            %\smallskip
            %$\Rightarrow$ \textbf{auditory filterbanks}
            %
            %\pause
            %\bigskip
            %not as widely used as one might think because
            %
            %\begin{enumerate}
                %\item<3->	computationally inefficient
                %\item<3->	analysis only: no invertibility (mostly)
                %\item<3->	not proven to be superior
            %\end{enumerate}
        %\end{frame}	
%
        %\begin{frame}{auditory filterbanks}{gammatone filterbank}
            %\vspace{-6mm}
            %\begin{equation*}
                %h(i) = \frac{ a \cdot\left(\nicefrac{i}{f_{\mathrm{S}}}\right)^{\mathcal{O}-1}\cdot \cos\left( 2\pi \cdot f_{\mathrm{c}} \frac{ i}{f_{\mathrm{S}}} \right) }{\e^{\nicefrac{2\pi i \Delta f}{f_{\mathrm{S}}}}}
            %\end{equation*}
            %\vspace{-4mm}
            %\figwithmatlab{Gammatone}
        %\end{frame}	

    \section{summary}
        \begin{frame}{summary}{lecture content}
            \begin{itemize}
                \item   \textbf{DFT has disadvantages}
                    \begin{itemize}
                        \item   low pitch resolution at low frequencies
                        \item   frequency axis is linear, i.e., neither logarithmic nor perceptually meaningful
                    \end{itemize}
                \bigskip
                \item      \textbf{CQT}
                    \begin{itemize}
                        \item   similar to Fourier transform but with logarithmically spaced frequency bins
                        \item   not invertible and inefficient
                    \end{itemize}
                %\bigskip
                %\item      \textbf{filterbanks}
                    %\begin{itemize}
                        %\item   good model of human physiology
                        %\item   not invertible and inefficient
                        %\item   not proven to be superior
                    %\end{itemize}
            \end{itemize}
            \inserticon{summary}
        \end{frame}
\end{document}