{
  "codec": {
    "ssl_adaptor": {
      "in_dim": 1280,
      "embed_dim": 768,
      "out_dim": 768,
      "num_layers": 4,
      "num_heads": 12,
      "ffn_dim": 3072,
      "attn_dropout": 0,
      "dropout": 0
    },
    "acoustic_encoder": {
      "num_mels": 128,
      "sampling_rate": 16000,
      "hop_length": 160,
      "n_fft": 400,
      "fmin": 0,
      "fmax": 8000,
      "embed_dim": 768,
      "num_layers": 12,
      "num_heads": 12,
      "ffn_dim": 3072,
      "attn_dropout": 0,
      "dropout": 0,
      "max_positions": 1500
    },
    "downsample": {
      "embed_dim": 1536,
      "avg_pooler": 4
    },
    "rvq": {
      "input_dim": 1536,
      "rvq_dim": 768,
      "output_dim": 768,
      "num_quantizers": 16,
      "codebook_size": 2048,
      "codebook_dim": 512
    },
    "upsample": {
      "embed_dim": 768,
      "stride": 4
    },
    "semantic_decoder": {
      "in_dim": 768,
      "embed_dim": 768,
      "out_dim": 1280,
      "num_layers": 4,
      "num_heads": 12,
      "ffn_dim": 3072,
      "attn_dropout": 0,
      "dropout": 0
    },
    "acoustic_decoder": {
      "embed_dim": 768,
      "num_layers": 12,
      "num_heads": 12,
      "dropout": 0,
      "hop_length": 240,
      "causal": true
    }
  }
}