import sys
from os.path import abspath, dirname
from typing import Optional

import torch

sys.path.append(abspath(dirname(__file__) + '/'))

from model import FastPitch as _FastPitch
from model_AAI import Transformer_AAI as _Transformer_AAI


def parse_model_args(model_name, parser, add_help=False):
    """Delegates to the model-specific argument parser."""
    if model_name == 'FastPitch':
        from arg_parser_fp import parse_fastpitch_args
        return parse_fastpitch_args(parser, add_help)
    elif model_name == 'FastPitch_AAI':
        from arg_parser_fp_AAI import parse_fastpitch_args
        return parse_fastpitch_args(parser, add_help)
    else:
        raise NotImplementedError(model_name)


def batchnorm_to_float(module):
    """Recursively converts all batch-norm submodules to FP32."""
    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
        module.float()
    for child in module.children():
        batchnorm_to_float(child)
    return module
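
# Typical mixed-precision pattern (illustrative sketch; this module never
# calls it itself): run the network in half precision but keep batch-norm
# statistics in FP32 for numerical stability, e.g.
#     model = batchnorm_to_float(model.half())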


def init_bn(module):
    """Recursively re-initializes affine batch-norm weights from U(0, 1)."""
    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
        if module.affine:
            module.weight.data.uniform_()
    for child in module.children():
        init_bn(child)


def get_model(model_name, model_config, device,
              uniform_initialize_bn_weight=False, forward_is_infer=False,
              jitable=False):
    """Instantiates the named model and moves it to `device`.

    With `forward_is_infer=True`, the returned model's forward() delegates
    to infer(), which gives inference-only entry points (and TorchScript
    export) a single-method interface.
    """
    model = None
    if model_name == 'FastPitch':
        if forward_is_infer:
            if jitable:
                # Assumes a TorchScript-compatible FastPitch variant lives
                # next to model.py (upstream FastPitch ships it as
                # model_jit.py); without it, jitable=True cannot work.
                from model_jit import FastPitchJIT as _FastPitchJIT

                class FastPitch__forward_is_infer(_FastPitchJIT):
                    def forward(self, inputs, input_lengths, pace: float = 1.0,
                                dur_tgt: Optional[torch.Tensor] = None,
                                pitch_tgt: Optional[torch.Tensor] = None,
                                speaker: int = 0):
                        return self.infer(inputs, input_lengths, pace=pace,
                                          dur_tgt=dur_tgt, pitch_tgt=pitch_tgt,
                                          speaker=speaker)
            else:
                class FastPitch__forward_is_infer(_FastPitch):
                    def forward(self, inputs, input_lengths, pace: float = 1.0,
                                dur_tgt: Optional[torch.Tensor] = None,
                                pitch_tgt: Optional[torch.Tensor] = None,
                                pitch_transform=None,
                                speaker: Optional[int] = None):
                        return self.infer(inputs, input_lengths, pace=pace,
                                          dur_tgt=dur_tgt, pitch_tgt=pitch_tgt,
                                          pitch_transform=pitch_transform,
                                          speaker=speaker)
            model = FastPitch__forward_is_infer(**model_config)
        else:
            model = _FastPitch(**model_config)
    elif model_name == 'Transformer_AAI':
        if forward_is_infer:
            class Transformer__forward_is_infer(_Transformer_AAI):
                def forward(self, inputs, input_lengths, pace: float = 1.0,
                            dur_tgt: Optional[torch.Tensor] = None,
                            pitch_tgt: Optional[torch.Tensor] = None,
                            pitch_transform=None,
                            speaker: Optional[int] = None):
                    return self.infer(inputs, input_lengths, pace=pace,
                                      dur_tgt=dur_tgt, pitch_tgt=pitch_tgt,
                                      pitch_transform=pitch_transform,
                                      speaker=speaker)
            model = Transformer__forward_is_infer(**model_config)
        else:
            model = _Transformer_AAI(**model_config)
    else:
        raise NotImplementedError(model_name)
    if uniform_initialize_bn_weight:
        init_bn(model)
    return model.to(device)
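
# TorchScript export sketch (hedged; requires the model_jit variant noted in
# get_model above, and a `config` dict from get_model_config below):
#     model = get_model('FastPitch', config, torch.device('cpu'),
#                       forward_is_infer=True, jitable=True)
#     scripted = torch.jit.script(model)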


def get_model_config(model_name, args):
    """Builds the constructor kwargs for the named model from parsed args."""
    if model_name not in ('FastPitch', 'Transformer_AAI'):
        raise NotImplementedError(model_name)
    # Both models share every hyperparameter except the symbol inventory:
    # FastPitch uses 40 symbols with padding index 0, while Transformer_AAI
    # adds a dedicated padding symbol (41 symbols, padding index 40).
    model_config = dict(
        # io
        n_mel_channels=args.n_mel_channels,
        max_seq_len=args.max_seq_len,
        # symbols
        symbols_embedding_dim=args.symbols_embedding_dim,
        # input FFT
        in_fft_n_layers=args.in_fft_n_layers,
        in_fft_n_heads=args.in_fft_n_heads,
        in_fft_d_head=args.in_fft_d_head,
        in_fft_conv1d_kernel_size=args.in_fft_conv1d_kernel_size,
        in_fft_conv1d_filter_size=args.in_fft_conv1d_filter_size,
        in_fft_output_size=args.in_fft_output_size,
        p_in_fft_dropout=args.p_in_fft_dropout,
        p_in_fft_dropatt=args.p_in_fft_dropatt,
        p_in_fft_dropemb=args.p_in_fft_dropemb,
        # output FFT
        out_fft_n_layers=args.out_fft_n_layers,
        out_fft_n_heads=args.out_fft_n_heads,
        out_fft_d_head=args.out_fft_d_head,
        out_fft_conv1d_kernel_size=args.out_fft_conv1d_kernel_size,
        out_fft_conv1d_filter_size=args.out_fft_conv1d_filter_size,
        out_fft_output_size=args.out_fft_output_size,
        p_out_fft_dropout=args.p_out_fft_dropout,
        p_out_fft_dropatt=args.p_out_fft_dropatt,
        p_out_fft_dropemb=args.p_out_fft_dropemb,
        # duration predictor
        dur_predictor_kernel_size=args.dur_predictor_kernel_size,
        dur_predictor_filter_size=args.dur_predictor_filter_size,
        p_dur_predictor_dropout=args.p_dur_predictor_dropout,
        dur_predictor_n_layers=args.dur_predictor_n_layers,
        # pitch predictor
        pitch_predictor_kernel_size=args.pitch_predictor_kernel_size,
        pitch_predictor_filter_size=args.pitch_predictor_filter_size,
        p_pitch_predictor_dropout=args.p_pitch_predictor_dropout,
        pitch_predictor_n_layers=args.pitch_predictor_n_layers,
        # pitch conditioning
        pitch_embedding_kernel_size=args.pitch_embedding_kernel_size,
        # speaker parameters
        n_speakers=args.n_speakers,
        speaker_emb_weight=args.speaker_emb_weight,
    )
    if model_name == 'FastPitch':
        model_config.update(n_symbols=40, padding_idx=0)
    else:  # Transformer_AAI
        model_config.update(n_symbols=41, padding_idx=40)
    return model_config
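

if __name__ == '__main__':
    # Smoke-test sketch wiring the helpers together on CPU. It assumes the
    # FastPitch arg parser returns the extended parser and supplies a default
    # for every field consumed by get_model_config(); pass the corresponding
    # flags on the command line if it does not.
    import argparse

    parser = argparse.ArgumentParser(description='models.py smoke test')
    parser = parse_model_args('FastPitch', parser)
    args, _ = parser.parse_known_args()
    model = get_model('FastPitch', get_model_config('FastPitch', args),
                      device=torch.device('cpu'), forward_is_infer=True)
    print(model)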