Skip to content

Commit

Permalink
Merge pull request #16 from marl/zeropadding
Browse files Browse the repository at this point in the history
Add support for padding and set default behavior to True. Fixes #15
  • Loading branch information
justinsalamon committed May 10, 2018
2 parents a8c4155 + 53f6551 commit 2c9993c
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 14 deletions.
33 changes: 32 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,16 +58,47 @@ The resulting `audio_file.f0.csv` contains 3 columns: the first with timestamps
0.08,199.678,0.775208
...

By default CREPE does not apply temporal smoothing to the pitch curve, but Viterbi smoothing is supported via the optional `--viterbi` command line argument. The script can also optionally save the output activation matrix of the model to an npy file (`--save-activation`), where the matrix dimensions are (n_frames, 360) using a hop size of 10 ms (there are 360 pitch bins covering 20 cents each). The script can also output a plot of the activation matrix (`--save-plot`), saved to `audio_file.activation.png` including an optional visual representation of the model's voicing detection (`--plot-voicing`). Here's an example plot of the activation matrix (without the voicing overlay) for an excerpt of male singing voice:
#### Timestamps
Following the convention adopted by popular audio processing libraries such as
[Essentia](http://essentia.upf.edu/) and [Librosa](https://librosa.github.io/librosa/),
from v0.0.5 onwards CREPE will pad the input signal such that the first frame
is zero-centered (the center of the frame corresponds to time 0) and generally
all frames are centered around their corresponding timestamp, i.e. frame
`D[:, t]` is centered at `audio[t * hop_length]`. This behavior can be changed
by specifying the optional `--no-centering` flag, in which case the first frame
will *start* at time zero and generally frame `D[:, t]` will *begin* at
`audio[t * hop_length]`. Sticking to the default behavior (centered frames) is
strongly recommended to avoid misalignment with features and annotations produced
by other common audio processing tools.


#### Temporal smoothing
By default CREPE does not apply temporal smoothing to the pitch curve, but
Viterbi smoothing is supported via the optional `--viterbi` command line argument.


#### Saving the activation matrix
The script can also optionally save the output activation matrix of the model
to an npy file (`--save-activation`), where the matrix dimensions are
(n_frames, 360), using a hop size of 10 ms (there are 360 pitch bins, each
covering 20 cents).

The script can also output a plot of the activation matrix (`--save-plot`),
saved to `audio_file.activation.png`, including an optional visual
representation of the model's voicing detection (`--plot-voicing`). Here's an
example plot of the activation matrix (without the voicing overlay) for an
excerpt of male singing voice:

![salience](https://user-images.githubusercontent.com/266841/38465913-6fa085b0-3aef-11e8-9633-bdd59618ea23.png)

#### Batch processing
For batch processing of files, you can provide a folder path instead of a file path:
```bash
$ python crepe.py audio_folder
```
The script will process all WAV files found inside the folder.

#### Additional usage information
For more information on the usage, please refer to the help message:

```bash
Expand Down
33 changes: 28 additions & 5 deletions crepe/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


def run(filename, output=None, viterbi=False, save_activation=False,
save_plot=False, plot_voicing=False):
save_plot=False, plot_voicing=False, no_centering=False):
"""
Collect the WAV files to process and run the model
Expand All @@ -31,6 +31,12 @@ def run(filename, output=None, viterbi=False, save_activation=False,
Include a visual representation of the voicing activity detection in
the plot of the output activation matrix. False by default, only
relevant if save_plot is True.
no_centering : bool
Don't pad the signal, meaning frames will begin at their timestamp
instead of being centered around their timestamp (which is the
default). CAUTION: setting this option can result in CREPE's output
being misaligned with respect to the output of other audio processing
tools and is generally not recommended.
"""

Expand Down Expand Up @@ -59,8 +65,12 @@ def run(filename, output=None, viterbi=False, save_activation=False,
for i, file in enumerate(files):
print('CREPE: Processing {} ... ({}/{})'.format(file, i+1, len(files)),
file=sys.stderr)
process_file(file, output, viterbi,
save_activation, save_plot, plot_voicing)
process_file(file, output=output,
viterbi=viterbi,
center=(not no_centering),
save_activation=save_activation,
save_plot=save_plot,
plot_voicing=plot_voicing)


def main():
Expand Down Expand Up @@ -108,8 +118,21 @@ def main():
parser.add_argument('--plot-voicing', '-v', action='store_true',
help='Plot the voicing prediction on top of the '
'output activation matrix plot')
parser.add_argument('--no-centering', '-n', action='store_true',
help="Don't pad the signal, meaning frames will begin "
"at their timestamp instead of being centered "
"around their timestamp (which is the default). "
"CAUTION: setting this option can result in "
"CREPE's output being misaligned with respect to "
"the output of other audio processing tools and "
"is generally not recommended.")

args = parser.parse_args()

run(args.filename, args.output, args.viterbi,
args.save_activation, args.save_plot, args.plot_voicing)
run(args.filename,
output=args.output,
viterbi=args.viterbi,
save_activation=args.save_activation,
save_plot=args.save_plot,
plot_voicing=args.plot_voicing,
no_centering=args.no_centering)
34 changes: 27 additions & 7 deletions crepe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def to_viterbi_cents(salience):
range(len(observations))])


def get_activation(audio, sr):
def get_activation(audio, sr, center=True):
"""
Parameters
Expand All @@ -135,6 +135,10 @@ def get_activation(audio, sr):
sr : int
Sample rate of the audio samples. The audio will be resampled if
the sample rate is not 16 kHz, which is expected by the model.
center : boolean
- If `True` (default), the signal `audio` is padded so that frame
`D[:, t]` is centered at `audio[t * hop_length]`.
- If `False`, then `D[:, t]` begins at `audio[t * hop_length]`
Returns
-------
Expand All @@ -153,6 +157,11 @@ def get_activation(audio, sr):
from resampy import resample
audio = resample(audio, sr, model_srate)

# pad so that frames are centered around their timestamps (i.e. first frame
# is zero centered).
if center:
audio = np.pad(audio, 512, mode='constant', constant_values=0)

# make 1024-sample frames of the audio with hop length of 10 milliseconds
hop_length = int(model_srate / 100)
n_frames = 1 + int((len(audio) - 1024) / hop_length)
Expand All @@ -168,7 +177,7 @@ def get_activation(audio, sr):
return model.predict(frames, verbose=1)


def predict(audio, sr, viterbi=False):
def predict(audio, sr, viterbi=False, center=True):
"""
Perform pitch estimation on given audio
Expand All @@ -180,7 +189,11 @@ def predict(audio, sr, viterbi=False):
Sample rate of the audio samples. The audio will be resampled if
the sample rate is not 16 kHz, which is expected by the model.
viterbi : bool
Apply viterbi smoothing to the estimated pitch curve. False by default.
Apply viterbi smoothing to the estimated pitch curve. False by default.
center : boolean
- If `True` (default), the signal `audio` is padded so that frame
`D[:, t]` is centered at `audio[t * hop_length]`.
- If `False`, then `D[:, t]` begins at `audio[t * hop_length]`
Returns
-------
Expand All @@ -195,7 +208,7 @@ def predict(audio, sr, viterbi=False):
activation: np.ndarray [shape=(T, 360)]
The raw activation matrix
"""
activation = get_activation(audio, sr)
activation = get_activation(audio, sr, center=center)
confidence = activation.max(axis=1)

if viterbi:
Expand All @@ -212,7 +225,7 @@ def predict(audio, sr, viterbi=False):
return time, frequency, confidence, activation


def process_file(file, output=None, viterbi=False,
def process_file(file, output=None, viterbi=False, center=True,
save_activation=False, save_plot=False, plot_voicing=False):
"""
Use the input model to perform pitch estimation on the input file.
Expand All @@ -226,6 +239,10 @@ def process_file(file, output=None, viterbi=False,
be saved to the directory containing the input file.
viterbi : bool
Apply viterbi smoothing to the estimated pitch curve. False by default.
center : boolean
- If `True` (default), the signal `audio` is padded so that frame
`D[:, t]` is centered at `audio[t * hop_length]`.
- If `False`, then `D[:, t]` begins at `audio[t * hop_length]`
save_activation : bool
Save the output activation matrix to an .npy file. False by default.
save_plot : bool
Expand All @@ -249,7 +266,9 @@ def process_file(file, output=None, viterbi=False,
print("CREPE: Could not read %s" % file, file=sys.stderr)
raise

time, frequency, confidence, activation = predict(audio, sr, viterbi)
time, frequency, confidence, activation = predict(audio, sr,
viterbi=viterbi,
center=center)

# write prediction as TSV
f0_file = output_path(file, ".f0.csv", output)
Expand All @@ -263,7 +282,8 @@ def process_file(file, output=None, viterbi=False,
if save_activation:
activation_path = output_path(file, ".activation.npy", output)
np.save(activation_path, activation)
print("CREPE: Saved the activation matrix at {}".format(activation_path))
print("CREPE: Saved the activation matrix at {}".format(
activation_path))

# save the salience visualization in a PNG file
if save_plot:
Expand Down
2 changes: 1 addition & 1 deletion crepe/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
version = '0.0.4'
version = '0.0.5'
5 changes: 5 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,15 @@

version = imp.load_source('crepe.version', os.path.join('crepe', 'version.py'))

with open('README.md') as file:
long_description = file.read()

setup(
name='crepe',
version=version.version,
description='CREPE pitch tracker',
long_description=long_description,
long_description_content_type='text/markdown',
url='https://github.com/marl/crepe',
author='Jong Wook Kim',
author_email='jongwook@nyu.edu',
Expand Down

0 comments on commit 2c9993c

Please sign in to comment.