/
speaker_verification_mass.py
135 lines (104 loc) · 5.42 KB
/
speaker_verification_mass.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""import io
import sys
import torch
import torch.nn as nn
from torchvision import models
from PIL import Image
import torchvision.transforms as transforms
from encoder import inference as encoder
from encoder import audio
import librosa
from pathlib import Path
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
from encoder.params_model import *
import pandas as pd
from itertools import combinations
from tqdm import tqdm
import pickle
# TODO: Write this program such that you may enter ```python speaker_verification_mass.py verification``` or ```python speaker_verification_mass.py test1``` and it will
# automatically fetch the proper speaker_verification_dataset.py copy of the dataset, pull batches of samples as vectors to encode via a model/step of choice, and then
# compute the cosine similarity, which lies in [-1, 1]. Note that with the cosine similarity in this range, 0 is not a terrible evaluation: it is exactly halfway between a
# match and a non-match. I'm thinking the threshold could be around 0.1, but I'm not sure yet; I need to plot the score distribution first.
# USAGE: python speaker_verification_mass.py /research/iprobe-sandle20/Playground/evector/Data/EVec/encoder/Validation/ 1 0
# COMMENTARY: Assumes the data is in .npy format (second argument = 1) and generates the combos from scratch (third argument = 0)
# USAGE: python speaker_verification_mass.py /research/iprobe-sandle20/Playground/evector/Data/EVec/encoder/Validation/ 0 0
# COMMENTARY: Assumes the data is in .wav format (second argument = 0) and generates the combos from scratch (third argument = 0)
def get_model(model_save_path=None, module_name='model_GST'):
    # Load a saved speaker-encoder checkpoint and return the `encoder` module.
    #
    # Args:
    #   model_save_path: checkpoint file to load (str or Path). When None, the
    #       original hard-coded backup checkpoint is used.
    #   module_name: forwarded unchanged to encoder.load_model.
    #
    # Returns:
    #   The `encoder` inference module itself (not a model object); the loaded
    #   weights live inside that module after load_model() has run.
    if model_save_path is None:
        # NOTE: Pass your own path (or edit this default) to point at your saved
        # model. Be careful of which step/model you are loading.
        model_save_path = '/research/iprobe-sandle20/Playground/evector/encoder/saved_models/first_backups/first_bak_105000.pt'
    encoder.load_model(Path(model_save_path), module_name=module_name)
    return encoder
def get_tensor(file_path, preprocess=True, sampling_rate=8000, duration=None):
    # Read one audio sample from disk and optionally cut it to a fixed length.
    #
    # Args:
    #   file_path: location of the audio sample.
    #   preprocess: when true the file goes through encoder.preprocess_wav;
    #       otherwise it is read with librosa.load at `sampling_rate`.
    #   sampling_rate: rate handed to librosa.load and used to convert
    #       `duration` into a sample count.
    #   duration: length in seconds to keep from the start, or None to keep
    #       everything.
    #
    # Returns:
    #   The (possibly truncated) waveform.
    if preprocess:
        waveform = encoder.preprocess_wav(file_path)
    else:
        waveform, _ = librosa.load(file_path, sr=sampling_rate)

    if duration is None:
        return waveform

    # NOTE(review): on the preprocess path the waveform's rate is whatever
    # encoder.preprocess_wav produces, which may differ from `sampling_rate`
    # -- confirm the trim length is what is intended there.
    n_samples = int(duration * sampling_rate)
    return waveform[:n_samples]
# Load the checkpoint once, as a module-level side effect. get_model() returns
# the `encoder` module it was called on, so this assignment leaves `encoder`
# bound to the same inference module, now with its model loaded.
encoder = get_model()
def get_speaker_embedding(file_path, preprocess=True, sampling_rate=16000, duration=None, normalize=True):
    # Compute the utterance-level embedding for a single audio sample.
    #
    # Args:
    #   file_path, preprocess, sampling_rate, duration: passed straight through
    #       to get_tensor() to obtain the waveform.
    #   normalize: when true the embedding is divided by its L2 norm.
    #
    # Returns:
    #   The embedding produced by encoder.embed_utterance, scaled to unit
    #   length when `normalize` is true.
    waveform = get_tensor(
        file_path,
        preprocess=preprocess,
        sampling_rate=sampling_rate,
        duration=duration,
    )
    print(waveform.shape)

    # With return_partials=True, embed_utterance hands back three values; only
    # the first (the whole-utterance embedding) is used here.
    utterance_embed, _partials, _ = encoder.embed_utterance(waveform, return_partials=True)

    if not normalize:
        return utterance_embed
    return utterance_embed / np.linalg.norm(utterance_embed)
def get_combinations(root_of_data: Path, preprocessed_already=True):
    # Build every unordered pair of samples found under `root_of_data`.
    #
    # Args:
    #   root_of_data: directory searched recursively for samples. Each sample's
    #       parent directory name is used as its identification label.
    #   preprocessed_already: True  -> pair up the pre-computed .npy files
    #                                  (use get_speaker_embedding with
    #                                  preprocess=False on these);
    #                         False -> pair up the raw .wav files.
    #
    # Returns:
    #   A list of ((identificationA, pathA), (identificationB, pathB)) tuples,
    #   one per pair of distinct files.
    #
    # TODO @Morgan remove this when done
    pattern = '**/*.npy' if preprocessed_already else '**/*.wav'
    paths = list(root_of_data.glob(pattern))

    # Tell tqdm the pair count up front so the pairs can be streamed instead of
    # being materialised into a throw-away list just to drive the progress bar.
    n_pairs = len(paths) * (len(paths) - 1) // 2
    return [
        ((sample_a.parent.name, sample_a), (sample_b.parent.name, sample_b))
        for sample_a, sample_b in tqdm(combinations(paths, 2), total=n_pairs)
    ]
def speaker_verification(path1, path2, already_preprocessed):
    # Score how similar two audio samples are via their speaker embeddings.
    #
    # Args:
    #   path1, path2: the two samples to compare.
    #   already_preprocessed: when true, get_speaker_embedding is called with
    #       preprocess=False for both samples (and vice versa).
    #
    # Returns:
    #   The cosine similarity of the two embeddings as a float in [-1, 1].
    #   The score is also printed, as before.
    embed1 = get_speaker_embedding(path1, preprocess=not already_preprocessed)
    embed2 = get_speaker_embedding(path2, preprocess=not already_preprocessed)
    assert embed1.shape == embed2.shape

    # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here.
    score = cosine_similarity(embed1.reshape(1, -1), embed2.reshape(1, -1))
    print('Match Score', score)
    # Return the scalar so callers can collect scores instead of parsing stdout.
    return float(score[0, 0])
# Main program is as follows
# Command-line arguments:
#   sys.argv[1]  root directory of the data
#   sys.argv[2]  1 -> data is .npy (already preprocessed), 0 -> data is .wav
#   sys.argv[3]  0 -> generate the combos from scratch, non-zero -> load the saved ones
combos_path = Path('ExperimentData/experiment_combos.pickle')
# Get all the combos of sample paths and their corresp. IDs
if not int(sys.argv[3]):
    # Honour the data-format flag; without it the .wav mode would still glob .npy.
    combos = get_combinations(Path(sys.argv[1]), preprocessed_already=bool(int(sys.argv[2])))  # returns this format:
    # [((ID_SAMPLEA, PATH_SAMPLEA), (ID_SAMPLEB, PATH_SAMPLEB)) ..... ]
    # Save the combos. This part can be computationally expensive. That's why
    # they are cached on disk and can be reloaded on later runs.
    combos_path.parent.mkdir(parents=True, exist_ok=True)
    with open(combos_path, 'wb') as handle:
        pickle.dump(combos, handle, protocol=pickle.HIGHEST_PROTOCOL)
else:
    # else load precomputed
    with open(combos_path, 'rb') as handle:
        combos = pickle.load(handle)
results = []
for sampleA, sampleB in tqdm(combos):
    print(sampleA, sampleB)
    #match_score = speaker_verification()
    #print(match_score)
    break
    #results.append(('gen' if sampleA[0] == sampleB[0] else 'imp', match_score, emotion))
# Save the results of the experiment
#np.savetxt("ExperimentData/SpeakerID_Experiment_Results.csv",
#           results,
#           delimiter =", ",
#           fmt ='% s')
"""