{
  "decoder": {
    "type": "istftnet",
    "upsample_kernel_sizes": [20, 12],
    "upsample_rates": [10, 6],
    "gen_istft_hop_size": 5,
    "gen_istft_n_fft": 20,
    "resblock_dilation_sizes": [
      [1, 3, 5],
      [1, 3, 5],
      [1, 3, 5]
    ],
    "resblock_kernel_sizes": [3, 7, 11],
    "upsample_initial_channel": 512
  },
  "dim_in": 64,
  "dropout": 0.2,
  "hidden_dim": 512,
  "max_conv_dim": 512,
  "max_dur": 50,
  "multispeaker": true,
  "n_layer": 3,
  "n_mels": 80,
  "n_token": 178,
  "style_dim": 128
}