-
Notifications
You must be signed in to change notification settings - Fork 7
/
Pattern_Generator.py
214 lines (181 loc) · 9.23 KB
/
Pattern_Generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import numpy as np
import yaml, os, pickle, librosa, argparse
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor as PE
from threading import Thread
from random import shuffle
from Audio import Audio_Prep, Mel_Generate
from yin import pitch_calc
# Hyper-parameters are read once, at import time, from the current working directory.
# NOTE(review): yaml.Loader can construct arbitrary Python objects, so 'Hyper_Parameter.yaml'
# must be a trusted file.
with open('Hyper_Parameter.yaml') as f:
    hp_Dict = yaml.load(f, Loader=yaml.Loader)

# Upper-cased audio file extensions that are accepted while scanning a dataset directory.
using_Extension = [extension.upper() for extension in ('.wav', '.m4a', '.flac')]

# Per-dataset 'top_db' value that is handed to Audio_Prep.
# The VC1 and Libri values are from 'https://github.com/CorentinJ/Real-Time-Voice-Cloning'.
top_DB_Dict = {
    'VCTK': 15,
    'VC1': 23,
    'VC2': 23,
    'Libri': 23,
    'CMUA': 60,
    }
def Pitch_Generate(audio):
    """Estimate the pitch track of `audio` and min-max normalise it.

    The track is produced by `pitch_calc` with the settings stored under
    hp_Dict['Sound'] and is then rescaled as
    (pitch - min) / (max - min + 1e-7).

    Args:
        audio: Signal passed to `pitch_calc` as `sig`.

    Returns:
        The rescaled pitch track.
    """
    sound_Hp = hp_Dict['Sound']
    raw_Pitch = pitch_calc(
        sig= audio,
        sr= sound_Hp['Sample_Rate'],
        w_len= sound_Hp['Frame_Length'],
        w_step= sound_Hp['Frame_Shift'],
        confidence_threshold= sound_Hp['Confidence_Threshold'],
        gaussian_smoothing_sigma = sound_Hp['Gaussian_Smoothing_Sigma']
        )

    lowest = np.min(raw_Pitch)
    # The 1e-7 term keeps the denominator non-zero when max == min.
    value_Range = np.max(raw_Pitch) - lowest + 1e-7

    return (raw_Pitch - lowest) / value_Range
def Pattern_Generate(path, top_db= 15):
    """Build the audio, mel and pitch features for a single audio file.

    Args:
        path: Path of the audio file, forwarded to `Audio_Prep`.
        top_db: Value forwarded to `Audio_Prep` as its third argument.

    Returns:
        A tuple `(audio, mel, pitch)`: the output of `Audio_Prep`, the output of
        `Mel_Generate` for that audio, and the output of `Pitch_Generate` for
        that audio.
    """
    sound_Hp = hp_Dict['Sound']
    sample_Rate = sound_Hp['Sample_Rate']

    audio = Audio_Prep(path, sample_Rate, top_db)

    # All mel settings come from the 'Sound' section of the hyper-parameters.
    mel_Kwargs = {
        'audio': audio,
        'sample_rate': sample_Rate,
        'num_frequency': sound_Hp['Spectrogram_Dim'],
        'num_mel': sound_Hp['Mel_Dim'],
        'window_length': sound_Hp['Frame_Length'],
        'hop_length': sound_Hp['Frame_Shift'],
        'mel_fmin': sound_Hp['Mel_F_Min'],
        'mel_fmax': sound_Hp['Mel_F_Max'],
        'max_abs_value': sound_Hp['Max_Abs_Mel'],
        }
    mel = Mel_Generate(**mel_Kwargs)

    return audio, mel, Pitch_Generate(audio)
def Pattern_File_Generate(path, speaker_Index, speaker, dataset, tag= '', eval= False):
    """Generate the pattern of one audio file and store it as a pickle file.

    The file is written to '<pattern path>/<dataset>/<dataset>.<speaker>[.<tag>].<basename>.PICKLE',
    where the pattern path is the eval or train pattern path of the hyper-parameters.
    Nothing is done when that file already exists. When the pattern cannot be generated,
    the error is printed and no file is written.

    Args:
        path: Path of the source audio file.
        speaker_Index: Speaker index stored in the pattern.
        speaker: Speaker name stored in the pattern and used in the file name.
        dataset: Dataset name; selects the output sub directory and the 'top_db' value
            in top_DB_Dict.
        tag: Optional tag inserted into the file name.
        eval: If True, the eval pattern path is used instead of the train pattern path.

    Returns:
        None.
    """
    pattern_Path = hp_Dict['Train']['Eval_Pattern' if eval else 'Train_Pattern']['Path']

    pickle_Path = '{}.{}{}.{}.PICKLE'.format(
        dataset,
        speaker,
        '.{}'.format(tag) if tag != '' else tag,
        os.path.splitext(os.path.basename(path))[0]
        )
    pickle_Path = os.path.join(pattern_Path, dataset, pickle_Path).replace("\\", "/")

    # Already generated patterns are kept as they are.
    if os.path.exists(pickle_Path):
        return

    os.makedirs(os.path.join(pattern_Path, dataset).replace('\\', '/'), exist_ok= True)

    try:
        audio, mel, pitch = Pattern_Generate(path, top_DB_Dict[dataset])
        # An explicit raise is used instead of 'assert', because assert statements are
        # removed when Python runs with -O and the check must never be skipped.
        if mel.shape[0] != pitch.shape[0]:
            raise ValueError('Mel_shape != Pitch_shape {} != {}'.format(mel.shape, pitch.shape))
        new_Pattern_Dict = {
            'Audio': audio,
            'Mel': mel,
            'Pitch': pitch,
            'Speaker_Index': speaker_Index,
            'Speaker': speaker,
            'Dataset': dataset,
            }
    except Exception as e:
        # Best effort: one broken file must not stop the whole generation.
        print('Error: {} in {}'.format(e, path))
        return

    with open(pickle_Path, 'wb') as f:
        pickle.dump(new_Pattern_Dict, f, protocol=4)
def VCTK_Info_Load(vctk_Path, num_Speakers):
    """Collect the audio files and the speaker information of a VCTK directory.

    Every file under '<vctk_Path>/wav48' whose extension is in `using_Extension` is used.
    When '<vctk_Path>/VCTK.NonOutlier.txt' can be read, only the file names listed in it
    (one per line) are kept. The speaker of a file is the upper-cased name of its parent
    directory.

    Speaker order: 'P243' first, then all found speakers except
    P243/P240/P232/P277/P228/P226 in sorted order, then 'P232', 'P277', 'P240'.
    Sorting makes the speaker indices identical on every run; the iteration order of a
    set of strings is not.

    Args:
        vctk_Path: Root directory of the VCTK dataset.
        num_Speakers: Accepted for interface compatibility; it is currently not used.

    Returns:
        A tuple `(file path list, {path: speaker}, {speaker: index})`.
    """
    vctk_Wav_Path = os.path.join(vctk_Path, 'wav48').replace('\\', '/')

    # The non-outlier list is optional; without it, all files are used.
    try:
        with open(os.path.join(vctk_Path, 'VCTK.NonOutlier.txt').replace('\\', '/'), 'r') as f:
            vctk_Non_Outlier_Set = {x.strip() for x in f}
    except (OSError, UnicodeError):
        vctk_Non_Outlier_Set = None

    vctk_File_Path_List = []
    for root, _, files in os.walk(vctk_Wav_Path):
        for file in files:
            if vctk_Non_Outlier_Set is not None and file not in vctk_Non_Outlier_Set:
                continue
            wav_File_Path = os.path.join(root, file).replace('\\', '/')
            if os.path.splitext(wav_File_Path)[1].upper() not in using_Extension:
                continue
            vctk_File_Path_List.append(wav_File_Path)

    vctk_Speaker_Dict = {
        path: path.split('/')[-2].upper()
        for path in vctk_File_Path_List
        }

    # speakers = list(set(vctk_Speaker_Dict.values()))[:num_Speakers]
    excluded_Speakers = {'P243', 'P240', 'P232', 'P277', 'P228', 'P226'}
    speakers = sorted(set(vctk_Speaker_Dict.values()) - excluded_Speakers)
    speakers = ['P243'] + speakers + ['P232', 'P277', 'P240']
    print(speakers)
    print(len(speakers))

    speaker_Set = set(speakers)
    vctk_File_Path_List = [path for path in vctk_File_Path_List if vctk_Speaker_Dict[path] in speaker_Set]
    vctk_Speaker_Dict = {path: speaker for path, speaker in vctk_Speaker_Dict.items() if speaker in speaker_Set}
    speaker_Index_Dict = {speaker: index for index, speaker in enumerate(speakers)}

    print('VCTK info generated: {}'.format(len(vctk_File_Path_List)))
    return vctk_File_Path_List, vctk_Speaker_Dict, speaker_Index_Dict
def Split_Eval(path_List, speaker_Dict, eval_Rate):
    """Split the paths into a train list and an eval list, speaker by speaker.

    The paths of each speaker are shuffled with `random.shuffle`; the first
    int(number of paths * eval_Rate) of them go to the eval list and the rest to the
    train list. Speakers are processed in the order of their first appearance in
    `path_List`, so the result is reproducible when the `random` module is seeded.

    Args:
        path_List: Paths to split. The list itself is not modified.
        speaker_Dict: Mapping of each path to its speaker.
        eval_Rate: Fraction of the paths of each speaker used for evaluation.

    Returns:
        A tuple `(train_Paths, eval_Paths)`.

    Raises:
        KeyError: If a path of `path_List` is not in `speaker_Dict`.
    """
    # A plain dict keeps the insertion order; a set of speaker names would not give a
    # stable order between runs.
    path_List_Dict = {}
    for path in path_List:
        path_List_Dict.setdefault(speaker_Dict[path], []).append(path)

    train_Paths, eval_Paths = [], []
    for paths in path_List_Dict.values():
        shuffle(paths)
        num_Eval = int(len(paths) * eval_Rate)
        eval_Paths.extend(paths[:num_Eval])
        train_Paths.extend(paths[num_Eval:])

    return train_Paths, eval_Paths
def Metadata_Generate(eval= False):
    """Scan the generated pattern files and write the metadata pickle file.

    Every file under the pattern path is loaded with pickle. A loaded dict that has a
    key other than 'Audio', 'Mel', 'Pitch', 'Speaker_Index', 'Speaker' and 'Dataset' is
    skipped silently (a metadata file written by an earlier run is skipped this way).
    A file that cannot be loaded, or that lacks a required field, is reported and
    ignored. All fields of a pattern are read before anything is recorded, so an
    ignored file never leaves partial entries in the metadata.

    Args:
        eval: If True, the eval pattern path is scanned instead of the train pattern path.

    Returns:
        None. The metadata is written into the scanned pattern path.
    """
    pattern_Path = hp_Dict['Train']['Eval_Pattern' if eval else 'Train_Pattern']['Path']

    new_Metadata_Dict = {
        'Spectrogram_Dim': hp_Dict['Sound']['Spectrogram_Dim'],
        'Mel_Dim': hp_Dict['Sound']['Mel_Dim'],
        'Frame_Shift': hp_Dict['Sound']['Frame_Shift'],
        'Frame_Length': hp_Dict['Sound']['Frame_Length'],
        'Sample_Rate': hp_Dict['Sound']['Sample_Rate'],
        'Max_Abs_Mel': hp_Dict['Sound']['Max_Abs_Mel'],
        'File_List': [],
        'Mel_Length_Dict': {},
        'Pitch_Length_Dict': {},
        'Dataset_Dict': {},
        'Speaker_Dict': {},
        'File_List_by_Speaker_Dict': {},
        }

    allowed_Keys = {'Audio', 'Mel', 'Pitch', 'Speaker_Index', 'Speaker', 'Dataset'}
    speaker_Index_Set = set()
    for root, _, files in os.walk(pattern_Path):
        for file in files:
            try:
                # NOTE: pickle.load can run arbitrary code; the pattern path must only
                # contain trusted files.
                with open(os.path.join(root, file).replace("\\", "/"), "rb") as f:
                    pattern_Dict = pickle.load(f)
                if not set(pattern_Dict.keys()) <= allowed_Keys:
                    continue

                # Read everything first; a missing field raises before any entry is added.
                mel_Length = pattern_Dict['Mel'].shape[0]
                pitch_Length = pattern_Dict['Pitch'].shape[0]
                dataset = pattern_Dict['Dataset']
                speaker = pattern_Dict['Speaker']
                speaker_Index = pattern_Dict['Speaker_Index']

                new_Metadata_Dict['File_List_by_Speaker_Dict'].setdefault((dataset, speaker), []).append(file)
                speaker_Index_Set.add((speaker, speaker_Index))
                new_Metadata_Dict['File_List'].append(file)
                new_Metadata_Dict['Mel_Length_Dict'][file] = mel_Length
                new_Metadata_Dict['Pitch_Length_Dict'][file] = pitch_Length
                new_Metadata_Dict['Dataset_Dict'][file] = dataset
                new_Metadata_Dict['Speaker_Dict'][file] = speaker
            except Exception:
                print('File \'{}\' is not correct pattern file. This file is ignored.'.format(file))

    new_Metadata_Dict['Speaker_Index_Dict'] = {speaker: index for speaker, index in speaker_Index_Set}

    # NOTE(review): the file name is always taken from 'Train_Pattern', also when eval is True.
    # Confirm whether 'Eval_Pattern' has its own 'Metadata_File' entry that should be used.
    with open(os.path.join(pattern_Path, hp_Dict['Train']['Train_Pattern']['Metadata_File'].upper()).replace("\\", "/"), 'wb') as f:
        pickle.dump(new_Metadata_Dict, f, protocol=4)

    print('Metadata generate done.')
if __name__ == '__main__':
    # Command line: VCTK root path, number of speakers, eval rate, and worker count.
    argParser = argparse.ArgumentParser()
    argParser.add_argument("-p", "--vctk_path", required=True)
    argParser.add_argument("-s", "--num_speaker", required=True, type= int)
    argParser.add_argument("-e", "--eval_rate", required=True, type= float)
    argParser.add_argument("-mw", "--max_worker", default= 10, type= int)
    args = argParser.parse_args()

    path_List, speaker_Dict, speaker_Index_Dict = VCTK_Info_Load(
        vctk_Path= args.vctk_path,
        num_Speakers= args.num_speaker
        )
    # Every path belongs to the 'VCTK' dataset and has no tag.
    dataset_Dict = dict.fromkeys(path_List, 'VCTK')
    tag_Dict = dict.fromkeys(path_List, '')

    train_Path_List, eval_Path_List = Split_Eval(path_List, speaker_Dict, args.eval_rate)

    with PE(max_workers = args.max_worker) as pe:
        # The train patterns are generated first, then the eval patterns.
        for is_Eval, target_Path_List in ((False, train_Path_List), (True, eval_Path_List)):
            params_List = [
                (
                    path,
                    speaker_Index_Dict[speaker_Dict[path]],
                    speaker_Dict[path],
                    dataset_Dict[path],
                    tag_Dict[path],
                    is_Eval
                    )
                for path in target_Path_List
                ]
            results = pe.map(lambda params: Pattern_File_Generate(*params), params_List)
            # The results are consumed only to drive the progress bar.
            for _ in tqdm(results, total= len(target_Path_List)):
                pass

    Metadata_Generate()
    Metadata_Generate(eval= True)