% 03-05-ACA-Input-Features.tex

% move all configuration stuff into includes file so we can focus on the content
\input{include}

% \listspectralfeature{<name>}{<variants>}
% Shared two-overlay layout used by the per-feature frames of this lecture.
%   #1  feature name suffix: selects the equation file eq/Llf_Spectral#1 and
%       the figure name FeatureSpectral#1
%   #2  material shown below the "common variants" heading on overlay 2
% \includeaudio, \figwithref and \inserticon are not defined in this file
% (presumably they come from the include file loaded above -- confirm there).
% NOTE(review): line ends inside the definition are not masked with a percent
% sign and the blank lines act as paragraph breaks; the manual \vspace values
% were presumably tuned against exactly this token sequence, so keep it
% intact when editing.
\newcommand{\listspectralfeature}[2]{
% feature definition, typeset in footnotesize; the formula is read from eq/
\vspace{-6mm}
\begin{footnotesize}
\begin{equation*}
    \input{eq/Llf_Spectral#1}
\end{equation*}
\end{footnotesize}
% overlay 1 only: audio example, feature plot with a link to its matlab
% source, and the audio icon
\only<1>{
    {\flushright\includeaudio{sax_example}}
    \vspace{-9mm}
    \figwithref{FeatureSpectral#1}{matlab source: \href{https://github.com/alexanderlerch/ACA-Plots/blob/master/matlab/plotFeatures.m}{plotFeatures.m}}
    \inserticon{audio}
}
% overlay 2 only: heading followed by the caller-supplied variants (#2)
\only<2>{
    %\vspace{10mm}
    
    \textbf{common variants}:
    
    {#2}
    }
}                    

\subtitle{module 3.5: instantaneous features}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}
    % generate title page
	\input{include/titlepage}

    \section[overview]{lecture overview}
        \begin{frame}{introduction}{overview}
            \begin{block}{corresponding textbook section}
                    %\href{http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6331120}{Chapter 3~---~Instantaneous Features}: pp.~31--35
                    section~3.5
            \end{block}

            \begin{itemize}
                \item   \textbf{lecture content}
                    \begin{itemize}
                        \item   introduction to the concept of features
                        \item   timbre
                        \item   spectral shape instantaneous features
                   \end{itemize}
                \bigskip
                \item<2->   \textbf{learning objectives}
                    \begin{itemize}
                        \item   describe the process of feature extraction
                        \item   list possible pre-processing options and explain potential use cases
                        \item   describe the general impact of spectral shape on timbre perception
                        \item   summarize features, describe their computation, and discuss their meaning
                    \end{itemize}
            \end{itemize}
            \inserticon{directions}
        \end{frame}

    \section[intro]{introduction}
        \begin{frame}{instantaneous features}{introduction}
            remember the flow chart of a general ACA system:
            \vspace{-3mm}
            \begin{figure}
                \input{pict/introduction_ACASystem_2}
            \end{figure}
            
            \vspace{-2mm}
            \pause
            \textbf{feature}:
            \begin{itemize}
                \item<2->   \textit{terminology}: 
                    \begin{itemize}
                        \item   audio descriptor
                        \item   instantaneous/short-term/\color<3->{highlight}{low-level feature}
                    \end{itemize}
                \item<2->   \textit{characteristics}:
                    \begin{itemize}
                        \item	not necessarily musically, perceptually, or semantically meaningful
                        \item	low-level: usually one value per block
                    \end{itemize}
            \end{itemize}
        \end{frame}
        
        \begin{frame}{instantaneous features}{feature}
            \toremember{}
            \begin{block}{a feature \ldots}
            \begin{itemize}
                \item   is task-specific, i.e.\ holds descriptive power relevant to the task,
                \bigskip
                \item   may be custom-designed, chosen from a set of established features, or learned from data,
                \bigskip
                \item   can be a representation of any data (audio, meta data, other features, \ldots),
                \bigskip
                \item   is not necessarily musically, perceptually, or semantically meaningful or interpretable
                \bigskip
                \item   also: non-redundant, invariant to irrelevancies
            \end{itemize}
            \end{block}
        \end{frame}
        
        \begin{frame}{instantaneous features}{feature example}
            waveform envelope of three different signals 
            
            \figwithmatlab{Waveform}
            
            \vspace{-2mm}
            \begin{columns}
                \column{.16\textwidth}
                \column{.25\textwidth}\centering
                    \hspace{8mm}\includeaudio{excerpt_speech}
                \column{.25\textwidth}\centering
                    \includeaudio{excerpt_stringquartet}
                \column{.25\textwidth}\centering
                    \hspace{-10mm}\includeaudio{excerpt_pop}
                \column{.09\textwidth}
            \end{columns}
            
            \bigskip
            \begin{itemize}
                \item<2->   envelopes of waveforms can have distinct shape
                \item<2->[$\Rightarrow$] a feature describing envelope shape could help to distinguish these signal types
            \end{itemize}
            \inserticon{audio}
        \end{frame}

        \begin{frame}{instantaneous features}{feature extraction}
            \vspace{-5mm}
            \begin{columns}
            \column{.6\linewidth}
            \flushright{\includeaudio{sax_example}}
            \vspace{-5mm}
            \figwithmatlab{FeatureExtraction}
            %\begin{figure}
            %\vspace{-8mm}
                %%\includegraphics[height=10mm,width=.7\columnwidth]{waveform}\\ \vspace{-3mm}
                %\includegraphics[scale=.08]{FeatureExtraction}
                %
            %\end{figure}
            \column{.4\linewidth}
                \begin{itemize}
                    \item   repeat for every block
                    \item   repeat for every feature: \textit{Spectral Centroid}, \textit{RMS}, \textit{MFCCs}, \ldots
                    \bigskip
                    \item[$\Rightarrow$] feature matrix per audio input
                \end{itemize}
            \end{columns}
            \inserticon{audio}
        \end{frame}

    \section{timbre}
        \begin{frame}{timbre}{introduction 1/2}
            \vspace{-3mm}
            \begin{block}{\textbf{definition (American Standards Association)}}
                \ldots that attribute of sensation in terms of which a listener can judge that two sounds having the same loudness and pitch are dissimilar
            \end{block}
            \bigskip
            \question{What is the problem with this definition?}
            Bregman:\footfullcite{bregman_auditory_1994}
                    \begin{enumerate}
                        \item   implies that timbre \textit{only} exists for sounds with pitch!
                        \item   only says that timbre \textit{is not} loudness and pitch
                    \end{enumerate}
            \pause
            
            \begin{itemize}
                \item[$\rightarrow$]   [timbre is] ``\textit{\ldots the psychoacoustician's multidimensional waste-basket category for everything that cannot be labeled pitch or loudness.}''\footfullcite{mcadams_hearing_1979}
            \end{itemize}
        \end{frame}
        
        \begin{frame}{timbre}{introduction 2/2}
        
            timbre is
            \begin{itemize}
                \item   a function of \textbf{temporal envelope}
                    \begin{itemize}
                        \item   attack time characteristics
                        \item   amplitude modulations
                        \item   \ldots
                    \end{itemize}
                \item<2->   a function of \color<4->{highlight}{\textbf{spectral distribution}}
                    \begin{itemize}
                        \item   spectral envelope
                        \item   number of partials
                        \item   energy distribution of partials
                        \item   \ldots
                    \end{itemize}
            \end{itemize}

            \begin{itemize}
                \item<3->[] when dealing with complex mixtures of sound, it is very difficult (maybe impossible?) to extract detailed temporal information for individual tones
                \item<4->[$\Rightarrow$] timbre features typically focus on the \textbf{spectral shape}
            \end{itemize}
        \end{frame}

    \section[spectral features]{spectral shape features}
        \begin{frame}{spectral shape features}{spectral centroid}
            \listspectralfeature{Centroid}{\begin{itemize}  \item power spectrum    \item logarithmic frequency scale
						\[
							v_\mathrm{SC,log}(n) = \frac{\sum\limits_{k = k(f_{\mathrm{min}})}^{\mathcal{K}/2-1}{\log_2\left(\frac{f(k)}{f_{\mathrm{ref}}}\right)\cdot |X(k,n)|^2}}{\sum\limits_{k = k(f_{\mathrm{min}})}^{\mathcal{K}/2-1}{|X(k,n)|^2}} 
						\]
			\end{itemize}}
        \end{frame}
        \begin{frame}{spectral shape features}{spectral spread}
            \listspectralfeature{Spread}{\begin{itemize}    \item same variants as with \textit{Spectral Centroid}, e.g.\ logarithmic:
				\begin{footnotesize}\[
                        v_\mathrm{SS,log}(n) = \sqrt{\frac{\sum\limits_{k = k(f_{\mathrm{min}})}^{\nicefrac{\mathcal{K}}{2}-1}{\left(\log_2\left(\frac{f(k)}{\unit[1000]{Hz}}\right)-v_{\mathrm{SC,log}}(n)\right)^2\cdot |X(k,n)|^2}}{\sum\limits_{k = k(f_{\mathrm{min}})}^{\nicefrac{\mathcal{K}}{2}-1}{|X(k,n)|^2}}} 
                    \]\end{footnotesize}
			\end{itemize}}
        \end{frame}
        \begin{frame}{spectral shape features}{spectral rolloff}
            \listspectralfeature{Rolloff}{\begin{itemize}   \item scaled to frequency  \item power spectrum  \end{itemize}}
        \end{frame}
        \begin{frame}{spectral shape features}{spectral decrease}
            \listspectralfeature{Decrease}{\begin{itemize}  \item restricted frequency range:
					\[
						v_{\mathrm{SD}}(n) = \frac{\sum\limits_{k = k_{\mathrm{l}}}^{k_{\mathrm{u}}}\frac{1}{k}\cdot \big(|X(k,n)|-|X(k_{\mathrm{l}}-1,n)|\big)}{\sum\limits_{k = k_{\mathrm{l}}}^{k_{\mathrm{u}}}{|X(k,n)|}} 
					\]  \end{itemize}}
        \end{frame}
        \begin{frame}{spectral shape features}{spectral flux}
            \listspectralfeature{Flux}{
						\begin{footnotesize}
							\begin{eqnarray*}
								v_{\mathrm{SF}}(n, \beta) &=& \frac{\sqrt[\beta]{\sum\limits_{k = 0}^{\mathcal{K}/2-1}{\left(|X(k,n)|-|X(k,n-1)|\right)^\beta}}}{\nicefrac{\mathcal{K}}{2}}\\
								v_{\mathrm{SF}, \sigma}(n) &=& \sqrt{{\frac{2}{\mathcal{K}}}\sum\limits_{k = 0}^{\mathcal{K}/2-1}{\left(\Delta X(k,n)-\mu_{\Delta X}\right)^2}}\\
								v_\mathrm{SF, log}(n) &=& {{\frac{2}{\mathcal{K}}}\sum\limits_{k = 0}^{\mathcal{K}/2-1}{\log_2\left(\frac{|X(k,n)|}{|X(k,n-1)|}\right)}}
							\end{eqnarray*}
						\end{footnotesize}
                        }
        \end{frame}
        %\begin{frame}{spectral shape features}{spectral slope}
            %\listspectralfeature{Slope}{}
        %\end{frame}
        %\begin{frame}{spectral shape features}{spectral skewness}
            %\listspectralfeature{Skewness}{}
        %\end{frame}
        %\begin{frame}{spectral shape features}{spectral kurtosis}
            %\listspectralfeature{Kurtosis}{}
        %\end{frame}

    \section[MFCCs]{Mel Frequency Cepstral Coefficients}
		\begin{frame}{fundamentals}{cepstrum 1/3}
			\textbf{signal model}: \\
			convolution of \textit{excitation signal} and \textit{transfer function}
			\begin{equation*}\label{eq:speech}
				x(i) = e(i)\ast h(i)
			\end{equation*}
			\pause
			\begin{equation*}
				X(\jom) = E(\jom)\cdot H(\jom) 
			\end{equation*}
			\pause
			\begin{eqnarray*}
				\log\big(X(\jom)\big)	&=& \log\big(E(\jom)\cdot H(\jom)\big)\nonumber\\
											&=& \log\big(E(\jom)\big) \mathbf{+} \log\big(H(\jom)\big) 
			\end{eqnarray*}
		\end{frame}

		\begin{frame}{fundamentals}{cepstrum 2/3}
			\vspace{-6mm}
            \begin{footnotesize}
            \begin{eqnarray*}
			\only<1-3>{	c_x(i)	&=& \mathfrak{F}^{-1}\left\{\log\left(X(\jom)\right)\right\}\nonumber\\
                        \pause
						&=& \mathfrak{F}^{-1}\left\{\log\left(E(\jom)\right) + \log\left(H(\jom)\right)\right\}\nonumber\\
						\pause
                        &=& \mathfrak{F}^{-1}\left\{\log\left(E(\jom)\right)\right\} + \mathfrak{F}^{-1}\left\{\log\left(H(\jom)\right)\right\} \\
                    \pause
             }   \hat{c}_x(i_{\mathrm{s}}(n)\ldots i_{\mathrm{e}}(n)) &=& \sum\limits_{k=0}^{\nicefrac{\mathcal{K}}{2}-1}{\log\left(|X(k,n)|\right)\e^{\mathrm{j}ki\Delta\Omega}} 
			\end{eqnarray*}
			\end{footnotesize}	
			\only<4->{\vspace{-3mm}
            \figwithmatlab{F0Cepstrum}
            }
		\end{frame}
        
        \begin{frame}{fundamentals}{cepstrum 3/3}
            \begin{itemize}
                \item \textbf{summary}:
                    \begin{itemize}
                        \item   cepstrum `replaces' time domain convolution operation with addition
                        \item   result is the \textit{unfiltered} excitation signal \textit{plus} the filter IR (both logarithmic)
                        \item   can be used for, e.g., \textit{spectral envelope extraction} or \textit{pitch detection}
                        \bigskip
                        \item<2> more naming silliness:\\
                         cepstrum, quefrency, liftering, \ldots
                    \end{itemize}
            \end{itemize}
        \end{frame}

		\begin{frame}{spectral shape features}{mel frequency cepstral coefficients 1/4}
            \begin{itemize}
                \item   typical processing steps for the mel frequency cepstral coefficients (MFCCs):
                    \begin{enumerate}
                        \item   compute magnitude spectrum
                        \item   convert linear frequency scale to logarithmic
                        \item   group bins into bands
                        \item   apply logarithm to all bands
                        \item   compute (inverse) cosine transform (DCT)
                    \end{enumerate}
            \end{itemize}
            
            \bigskip
			\begin{equation*}
				v^j_{\mathrm{MFCC}}(n)	= \sum\limits_{k' = 1}^{\mathcal{K}'}{\log\big( |X'(k',n)|\big)\cdot \cos\left( j\cdot\left(k'-\frac{1}{2} \right)\frac{\pi}{\mathcal{K}'} \right)}			
			\end{equation*}
        \end{frame}

		\begin{frame}{spectral shape features}{mel frequency cepstral coefficients 2/4}
			\figwithmatlab{MfccFilterbank}
            
            \begin{itemize}
                \item   constant Q filter spacing for higher frequencies (mel scale)
                \item   FFT values are weighted and summed over bins for each band
            \end{itemize}
		\end{frame}
		\begin{frame}{spectral shape features}{mel frequency cepstral coefficients 3/4}
            \begin{columns}
                \column{.6\linewidth}
                    mel-warped cosine bases for DCT
                 \column{.4\linewidth}
			\vspace{-11mm}
                    \begin{figure}
                        \centering
                        \includegraphics[width=\columnwidth]{MfccMelDct}
                        \label{fig:MfccMelDct}
                    \end{figure}
            \end{columns}
		\end{frame}
		\begin{frame}{spectral shape features}{mel frequency cepstral coefficients 4/4}
			\vspace{-6mm}
            \begin{footnotesize}
			\begin{table}
				\centering
				\begin{tabular*}{\textwidth}{@{\extracolsep{\fill}}lccc}%{l|c|c|c} %{c|p{12mm}p{12mm}p{12mm}p{12mm}p{12mm}p{12mm}p{12mm}}
                    \\ \hline
                    \textbf{\emph{Property}}	 & \textbf{\emph{DM}}	 & \textbf{\emph{HTK}}	 & \textbf{\emph{SAT}}\\ 
                     \hline
                    \textbf{Num.\ filters}	 & 20	 & 24	 & 40\\
                    \textbf{Mel scale}	 & lin/log	 & log	 & lin/log\\
                    \textbf{Freq.\ range}	 & $[100; 4000]$	 & $[100; 4000]$	 & $[200; 6400]$\\
                    \textbf{Normalization}	 & Equal height	 & Equal height	 & Equal area\\
				\end{tabular*}
			\end{table}
			\end{footnotesize}

            \vspace{-5mm}
            \figwithref{FeatureSpectralMfccs}{matlab source: \href{https://github.com/alexanderlerch/ACA-Plots/blob/master/matlab/plotFeatures.m}{plotFeatures.m}}
		\end{frame}

    \section[tonalness]{tonalness features}
        \begin{frame}{tonalness features}{spectral crest factor}
            \listspectralfeature{CrestFactor}{\begin{itemize}   \item normalization \item power spectrum \item measure \textit{per band} instead of  whole spectrum \end{itemize}}
        \end{frame}
        \begin{frame}{tonalness features}{spectral flatness}
            \listspectralfeature{Flatness}{\begin{itemize}   \item power vs.\ magnitude spectrum \item smoothed spectrum (avoid spurious 0-bins) \item measure \textit{per band} instead of  whole spectrum \end{itemize}}
        \end{frame}
        \begin{frame}{tonalness features}{spectral tonal power ratio}
            \listspectralfeature{TonalPowerRatio}{\begin{itemize}   \item definition of tonal/non-tonal components \begin{itemize}\item local maxima\item peak salience\item in periodic (harmonic) pattern \item \ldots\end{itemize}\end{itemize}}
        \end{frame}
        
		\begin{frame}{tonalness features}{maximum of ACF}
            \vspace{-6mm}
			\begin{equation*}
				\input{eq/Llf_MaxAcf}
			\end{equation*}
            {\flushright\includeaudio{sax_example}}
            \vspace{-9mm}
            \inserticon{audio}
            \figwithref{FeatureTimeMaxAcf}{matlab source: \href{https://github.com/alexanderlerch/ACA-Plots/blob/master/matlab/plotFeatures.m}{plotFeatures.m}}
		\end{frame}
		%\begin{frame}{tonalness features}{maximum of ACF 2/2}
            %\question{maximum detection: how to avoid main lobe maxima?}
			%
			%\begin{itemize}
				%\item	minimum lag
				%\item	magnitude threshold
				%\item	search only after first local minimum
			%\end{itemize}
		%\end{frame}
        %Predictivity Ratio
        %SpectralPredictivity

    \section[technical]{technical properties}
		\begin{frame}{technical features}{zero crossing rate}
            \vspace{-6mm}
			\begin{equation*}\label{eq:zc}
				\input{eq/Llf_ZeroCrossingRate} 
			\end{equation*}
            {\flushright\includeaudio{sax_example}}
            \vspace{-9mm}
            \inserticon{audio}
            \figwithref{FeatureTimeZeroCrossingRate}{matlab source: \href{https://github.com/alexanderlerch/ACA-Plots/blob/master/matlab/plotFeatures.m}{plotFeatures.m}}
		\end{frame}
		\begin{frame}{technical features}{ACF coefficients}
            \vspace{-6mm}
			\begin{equation*}
				\input{eq/Llf_AcfC} \quad \text{with}\enspace \eta = 1,2,3,\ldots
			\end{equation*}
            {\flushright\includeaudio{sax_example}}
            \vspace{-9mm}
            \inserticon{audio}
            \figwithref{FeatureTimeAcfCoeff}{matlab source: \href{https://github.com/alexanderlerch/ACA-Plots/blob/master/matlab/plotFeatures.m}{plotFeatures.m}}
		\end{frame}

    \section{summary}
        \begin{frame}{summary}{lecture content}
            \vspace{-3mm}
            \begin{itemize}
                \item   \textbf{feature}
                    \begin{itemize}
                        \item   descriptor with condensed relevant information
                        \item   not necessarily interpretable by humans
                    \end{itemize}
                \bigskip
                \item   \textbf{feature extraction}
                    \begin{itemize}
                        \item   usually extracted per short block of samples
                        \item   many features can be extracted from audio data, resulting in feature matrix
                    \end{itemize}
                \bigskip
                \item   \textbf{timbre}
                    \begin{itemize}
                        \item   mostly dependent on both spectral shape and time domain envelope characteristics
                        \item   multi-dimensional perceptual property not as clearly defined as pitch or loudness
                    \end{itemize}

                \bigskip
                \item   \textbf{instantaneous spectral shape features}
                    \begin{itemize}
                        \item   established set of baseline features
                        \item   often extracted from the magnitude spectrum, describing timbre
                        \item   condensing various properties of the spectral shape into single values
                        \item   there exist multiple variants of ``the same'' feature
                    \end{itemize}
            \end{itemize}
            \inserticon{summary}
        \end{frame}
\end{document}