Skip to content

Commit

Permalink
Merge pull request #16 from marl/zeropadding
Browse files Browse the repository at this point in the history
Add support for padding and set default behavior to True. Fixes #15
  • Loading branch information
justinsalamon committed May 10, 2018
2 parents a8c4155 + 53f6551 commit 2c9993c
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 14 deletions.
33 changes: 32 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,16 +58,47 @@ The resulting `audio_file.f0.csv` contains 3 columns: the first with timestamps
0.08,199.678,0.775208
...

By default CREPE does not apply temporal smoothing to the pitch curve, but Viterbi smoothing is supported via the optional `--viterbi` command line argument. The script can also optionally save the output activation matrix of the model to an npy file (`--save-activation`), where the matrix dimensions are (n_frames, 360) using a hop size of 10 ms (there are 360 pitch bins covering 20 cents each). The script can also output a plot of the activation matrix (`--save-plot`), saved to `audio_file.activation.png` including an optional visual representation of the model's voicing detection (`--plot-voicing`). Here's an example plot of the activation matrix (without the voicing overlay) for an excerpt of male singing voice:
#### Timestamps
Following the convention adopted by popular audio processing libraries such as
[Essentia](http://essentia.upf.edu/) and [Librosa](https://librosa.github.io/librosa/),
from v0.0.5 onwards CREPE will pad the input signal such that the first frame
is zero-centered (the center of the frame corresponds to time 0) and generally
all frames are centered around their corresponding timestamp, i.e. frame
`D[:, t]` is centered at `audio[t * hop_length]`. This behavior can be changed
by specifying the optional `--no-centering` flag, in which case the first frame
will *start* at time zero and generally frame `D[:, t]` will *begin* at
`audio[t * hop_length]`. Sticking to the default behavior (centered frames) is
strongly recommended to avoid misalignment with features and annotations produced
by other common audio processing tools.


#### Temporal smoothing
By default CREPE does not apply temporal smoothing to the pitch curve, but
Viterbi smoothing is supported via the optional `--viterbi` command line argument.


#### Saving the activation matrix
The script can also optionally save the output activation matrix of the model
to an npy file (`--save-activation`), where the matrix dimensions are
(n_frames, 360), using a hop size of 10 ms (there are 360 pitch bins, each
covering 20 cents).

The script can also output a plot of the activation matrix (`--save-plot`),
saved to `audio_file.activation.png`, including an optional visual
representation of the model's voicing detection (`--plot-voicing`). Here's an
example plot of the activation matrix (without the voicing overlay) for an
excerpt of male singing voice:

![salience](https://user-images.githubusercontent.com/266841/38465913-6fa085b0-3aef-11e8-9633-bdd59618ea23.png)

#### Batch processing
For batch processing of files, you can provide a folder path instead of a file path:
```bash
$ python crepe.py audio_folder
```
The script will process all WAV files found inside the folder.

#### Additional usage information
For more information on the usage, please refer to the help message:

```bash
Expand Down
33 changes: 28 additions & 5 deletions crepe/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


def run(filename, output=None, viterbi=False, save_activation=False,
save_plot=False, plot_voicing=False):
save_plot=False, plot_voicing=False, no_centering=False):
"""
Collect the WAV files to process and run the model
Expand All @@ -31,6 +31,12 @@ def run(filename, output=None, viterbi=False, save_activation=False,
Include a visual representation of the voicing activity detection in
the plot of the output activation matrix. False by default, only
relevant if save_plot is True.
no_centering : bool
Don't pad the signal, meaning frames will begin at their timestamp
instead of being centered around their timestamp (which is the
default). CAUTION: setting this option can result in CREPE's output
being misaligned with respect to the output of other audio processing
tools and is generally not recommended.
"""

Expand Down Expand Up @@ -59,8 +65,12 @@ def run(filename, output=None, viterbi=False, save_activation=False,
for i, file in enumerate(files):
print('CREPE: Processing {} ... ({}/{})'.format(file, i+1, len(files)),
file=sys.stderr)
process_file(file, output, viterbi,
save_activation, save_plot, plot_voicing)
process_file(file, output=output,
viterbi=viterbi,
center=(not no_centering),
save_activation=save_activation,
save_plot=save_plot,
plot_voicing=plot_voicing)


def main():
Expand Down Expand Up @@ -108,8 +118,21 @@ def main():
parser.add_argument('--plot-voicing', '-v', action='store_true',
help='Plot the voicing prediction on top of the '
'output activation matrix plot')
parser.add_argument('--no-centering', '-n', action='store_true',
help="Don't pad the signal, meaning frames will begin "
"at their timestamp instead of being centered "
"around their timestamp (which is the default). "
"CAUTION: setting this option can result in "
"CREPE's output being misaligned with respect to "
"the output of other audio processing tools and "
"is generally not recommended.")

args = parser.parse_args()

run(args.filename, args.output, args.viterbi,
args.save_activation, args.save_plot, args.plot_voicing)
run(args.filename,
output=args.output,
viterbi=args.viterbi,
save_activation=args.save_activation,
save_plot=args.save_plot,
plot_voicing=args.plot_voicing,
no_centering=args.no_centering)
34 changes: 27 additions & 7 deletions crepe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def to_viterbi_cents(salience):
range(len(observations))])


def get_activation(audio, sr):
def get_activation(audio, sr, center=True):
"""
Parameters
Expand All @@ -135,6 +135,10 @@ def get_activation(audio, sr):
sr : int
Sample rate of the audio samples. The audio will be resampled if
the sample rate is not 16 kHz, which is expected by the model.
center : boolean
- If `True` (default), the signal `audio` is padded so that frame
`D[:, t]` is centered at `audio[t * hop_length]`.
- If `False`, then `D[:, t]` begins at `audio[t * hop_length]`
Returns
-------
Expand All @@ -153,6 +157,11 @@ def get_activation(audio, sr):
from resampy import resample
audio = resample(audio, sr, model_srate)

# pad so that frames are centered around their timestamps (i.e. first frame
# is zero centered).
if center:
audio = np.pad(audio, 512, mode='constant', constant_values=0)

# make 1024-sample frames of the audio with hop length of 10 milliseconds
hop_length = int(model_srate / 100)
n_frames = 1 + int((len(audio) - 1024) / hop_length)
Expand All @@ -168,7 +177,7 @@ def get_activation(audio, sr):
return model.predict(frames, verbose=1)


def predict(audio, sr, viterbi=False):
def predict(audio, sr, viterbi=False, center=True):
"""
Perform pitch estimation on given audio
Expand All @@ -180,7 +189,11 @@ def predict(audio, sr, viterbi=False):
Sample rate of the audio samples. The audio will be resampled if
the sample rate is not 16 kHz, which is expected by the model.
viterbi : bool
Apply viterbi smoothing to the estimated pitch curve. False by default.
Apply viterbi smoothing to the estimated pitch curve. False by default.
center : boolean
- If `True` (default), the signal `audio` is padded so that frame
`D[:, t]` is centered at `audio[t * hop_length]`.
- If `False`, then `D[:, t]` begins at `audio[t * hop_length]`
Returns
-------
Expand All @@ -195,7 +208,7 @@ def predict(audio, sr, viterbi=False):
activation: np.ndarray [shape=(T, 360)]
The raw activation matrix
"""
activation = get_activation(audio, sr)
activation = get_activation(audio, sr, center=center)
confidence = activation.max(axis=1)

if viterbi:
Expand All @@ -212,7 +225,7 @@ def predict(audio, sr, viterbi=False):
return time, frequency, confidence, activation


def process_file(file, output=None, viterbi=False,
def process_file(file, output=None, viterbi=False, center=True,
save_activation=False, save_plot=False, plot_voicing=False):
"""
Use the input model to perform pitch estimation on the input file.
Expand All @@ -226,6 +239,10 @@ def process_file(file, output=None, viterbi=False,
be saved to the directory containing the input file.
viterbi : bool
Apply viterbi smoothing to the estimated pitch curve. False by default.
center : boolean
- If `True` (default), the signal `audio` is padded so that frame
`D[:, t]` is centered at `audio[t * hop_length]`.
- If `False`, then `D[:, t]` begins at `audio[t * hop_length]`
save_activation : bool
Save the output activation matrix to an .npy file. False by default.
save_plot : bool
Expand All @@ -249,7 +266,9 @@ def process_file(file, output=None, viterbi=False,
print("CREPE: Could not read %s" % file, file=sys.stderr)
raise

time, frequency, confidence, activation = predict(audio, sr, viterbi)
time, frequency, confidence, activation = predict(audio, sr,
viterbi=viterbi,
center=center)

# write prediction as TSV
f0_file = output_path(file, ".f0.csv", output)
Expand All @@ -263,7 +282,8 @@ def process_file(file, output=None, viterbi=False,
if save_activation:
activation_path = output_path(file, ".activation.npy", output)
np.save(activation_path, activation)
print("CREPE: Saved the activation matrix at {}".format(activation_path))
print("CREPE: Saved the activation matrix at {}".format(
activation_path))

# save the salience visualization in a PNG file
if save_plot:
Expand Down
2 changes: 1 addition & 1 deletion crepe/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
version = '0.0.4'
version = '0.0.5'
5 changes: 5 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,15 @@

version = imp.load_source('crepe.version', os.path.join('crepe', 'version.py'))

with open('README.md') as file:
long_description = file.read()

setup(
name='crepe',
version=version.version,
description='CREPE pitch tracker',
long_description=long_description,
long_description_content_type='text/markdown',
url='https://github.com/marl/crepe',
author='Jong Wook Kim',
author_email='jongwook@nyu.edu',
Expand Down

0 comments on commit 2c9993c

Please sign in to comment.