Dongchao committed
Commit e6bf8fc · verified · 1 Parent(s): 23ea083

Upload v0.1

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ ep1.checkpoint filter=lfs diff=lfs merge=lfs -text
audiothinker.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5f0ce19603b19dd518bc8ba6bf4a7ef2c99d0ba2a6b5503d8a356177a402b57
+ size 1940043702
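
The large binaries in this commit (.pth, .pt, .checkpoint) are stored as Git LFS pointer files rather than the weights themselves: each pointer records the LFS spec version, a sha256 object id, and the payload size in bytes (so audiothinker.pth resolves to roughly 1.9 GB). A minimal sketch of reading such a pointer, assuming a local checkout where the file still holds the pointer text instead of the smudged binary:

```python
from pathlib import Path

def read_lfs_pointer(path: str) -> dict:
    """Parse a Git LFS pointer file into its key/value fields.

    A pointer is a short text file of 'key value' lines, e.g.
      version https://git-lfs.github.com/spec/v1
      oid sha256:<digest>
      size <bytes>
    """
    fields = {}
    for line in Path(path).read_text().splitlines():
        if not line.strip():
            continue
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

# Hypothetical local path; the 'size' field is the real payload size in bytes.
ptr = read_lfs_pointer("audiothinker.pth")
print(ptr["oid"], int(ptr["size"]) / 1e9, "GB")
```

With git-lfs installed, a normal clone or a `git lfs pull` replaces these pointers with the actual binaries.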
ckpt_00615000.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a76a33002ed073f2a1e658df16bfa57133beaf3ac2ad51a7018549d3050067fc
+ size 1228804780
config.yaml ADDED
@@ -0,0 +1,51 @@
+ seed: 999
+ cudnn_deterministic: true
+ model: diffusion_transformer_1D
+ dropout_p: 0.1
+ token_dropout_p: 0.1
+ num_output_layer: 2
+ drop_path_rate: 0.0
+ statistical_prior_path: null
+ train_data_path: large_audio.scp
+ val_data_path: val_with_duration.scp
+ batch_size: 16
+ max_length: 8000
+ min_length: 100
+ n_worker: 4
+ minibatch_debug: -1
+ segment_duration: 30
+ n_epoch: 5
+ grad_accum: 1
+ fine_decoder: false
+ learning_rate: 0.0001
+ grad_clip: 2.0
+ warmup_steps: 1000
+ data_parallel: fsdp
+ mixed_precision: fp32
+ grad_precision: fp32
+ activation_checkpointing: true
+ weight_decay: 0.05
+ n_layer: 16
+ n_head: 12
+ n_embd: 768
+ dropout: 0.0
+ bias: false
+ block_size: 8192
+ prefix_lm: false
+ num_codebooks: 1
+ num_channels: 32
+ unet_model_name: transformer-2d
+ transformer_diffusion_config: model_config.json
+ sq_config: sq_config.yaml
+ sq_resume: ckpt_00615000.pth
+ whisper_path: openai/whisper-medium
+ reason_lm_path: audiothinking.pth
+ reconstruction_path: ep5.checkpoint
+ llm_path: meta-llama/Llama-3.2-3B
+ prompt_path: prompts/train_prompt.json
+ best_rq_ckpt: music_ssl.pt
+ exp_dir: ./
+ print_freq: 100
+ save_interval: 10000
+ resume: null
+ rank: 0
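
config.yaml is a flat training/inference configuration. Several of its paths point at other files uploaded in this commit (sq_config.yaml, ckpt_00615000.pth, model_config.json, music_ssl.pt), while a couple reference names that differ slightly from the uploaded ones (reason_lm_path: audiothinking.pth vs. audiothinker.pth, reconstruction_path: ep5.checkpoint vs. ep1.checkpoint), so they may need to be pointed at the local copies. A minimal sketch of loading the file and checking which referenced files are present, assuming a plain PyYAML load from a hypothetical local checkout (neither is part of this repo's own code):

```python
import yaml
from pathlib import Path

repo = Path(".")  # hypothetical local checkout of this repository
cfg = yaml.safe_load((repo / "config.yaml").read_text())

# Keys expected to be local file paths (subset chosen for illustration).
path_keys = ["transformer_diffusion_config", "sq_config", "sq_resume",
             "reason_lm_path", "reconstruction_path", "best_rq_ckpt", "prompt_path"]
for key in path_keys:
    p = repo / str(cfg[key])
    print(f"{key:30s} {cfg[key]:28s} {'found' if p.exists() else 'missing'}")

# Hugging Face model ids are resolved by their own libraries, not the filesystem.
print("whisper:", cfg["whisper_path"], "| llm:", cfg["llm_path"])
```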
ep1.checkpoint ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9f1eb7b5614d933c1e95b74b6ea65b9dcb61ac652666525ed7626eb2f0c0f746
+ size 15154454704
model_config.json ADDED
@@ -0,0 +1,27 @@
+ {
+ "_class_name": "Transformer1DModel",
+ "_diffusers_version": "0.22.0.dev0",
+ "activation_fn": "gelu-approximate",
+ "attention_bias": true,
+ "attention_head_dim": 64,
+ "attention_type": "default",
+ "cross_attention_dim": null,
+ "double_self_attention": false,
+ "dropout": 0.0,
+ "in_channels": 1040,
+ "norm_elementwise_affine": false,
+ "norm_eps": 1e-06,
+ "norm_num_groups": 32,
+ "norm_type": "ada_norm_single",
+ "num_attention_heads": 24,
+ "num_embeds_ada_norm": 1000,
+ "num_layers": 32,
+ "num_vector_embeds": null,
+ "only_cross_attention": false,
+ "out_channels": 136,
+ "patch_size": 1,
+ "sample_size": 384,
+ "upcast_attention": false,
+ "use_linear_projection": false
+ }
+
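
model_config.json follows the diffusers-style pattern of serializing constructor arguments alongside the class name (_class_name: Transformer1DModel, a 1-D variant; the stock diffusers class is Transformer2DModel), so the JSON can be read directly to recover the transformer's shape. A small sketch using only the standard library:

```python
import json

with open("model_config.json") as f:
    cfg = json.load(f)

# Model width is heads x per-head dim: 24 * 64 = 1536.
width = cfg["num_attention_heads"] * cfg["attention_head_dim"]
print(f"{cfg['_class_name']}: {cfg['num_layers']} layers, width {width}, "
      f"in_channels {cfg['in_channels']} -> out_channels {cfg['out_channels']}")
# -> Transformer1DModel: 32 layers, width 1536, in_channels 1040 -> out_channels 136
```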
music_ssl.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e250df56b035f74c1f66f15133f4c78f664d70fa0b09aa9a752b7871bb58c02f
+ size 3957949089
sq_config.yaml ADDED
@@ -0,0 +1,176 @@
+ generator:
+ name: ScalarModel
+ config:
+ num_bands: 1
+ sample_rate: 24000
+ causal: true
+ num_samples: 2
+ downsample_factors:
+ - 2
+ - 3
+ - 4
+ - 4
+ - 5
+ downsample_kernel_sizes:
+ - 4
+ - 6
+ - 8
+ - 8
+ - 10
+ upsample_factors:
+ - 5
+ - 4
+ - 4
+ - 3
+ - 2
+ upsample_kernel_sizes:
+ - 10
+ - 8
+ - 8
+ - 6
+ - 4
+ latent_hidden_dim: 136
+ default_kernel_size: 7
+ delay_kernel_size: 5
+ init_channel: 48
+ res_kernel_size: 7
+ d_list:
+ - mfd
+ mfd:
+ name: MultiFrequencyDiscriminator
+ config:
+ hop_lengths:
+ - 32
+ - 64
+ - 128
+ - 256
+ - 512
+ - 1024
+ hidden_channels:
+ - 64
+ - 128
+ - 256
+ - 512
+ - 512
+ - 512
+ domain: double
+ mel_scale: true
+ sample_rate: 24000
+ mpd:
+ name: MultiPeriodDiscriminator
+ config:
+ period_sizes:
+ - 2
+ - 3
+ - 5
+ - 7
+ - 11
+ period_kernel_size: 5
+ msd:
+ name: MultiScaleDiscriminator
+ config:
+ num_scales: 3
+ pool_kernel_size: 4
+ pool_stride: 2
+ optimizer:
+ g:
+ name: AdamW
+ config:
+ lr: 0.0002
+ betas:
+ - 0.8
+ - 0.99
+ eps: 1.0e-06
+ d:
+ name: AdamW
+ config:
+ lr: 0.0002
+ betas:
+ - 0.8
+ - 0.99
+ eps: 1.0e-06
+ lr_scheduler:
+ g:
+ name: ExponentialLR
+ config:
+ gamma: 0.999
+ d:
+ name: ExponentialLR
+ config:
+ gamma: 0.999
+ criterion:
+ g_criterion:
+ name: losses.generator_loss.GeneratorSTFTLoss
+ config:
+ use_mel_loss: false
+ adv_criterion: MSEGLoss
+ mel_loss_weight: 45
+ use_feature_match: true
+ feat_match_loss_weight: 20
+ use_full_stft_loss: true
+ use_sub_stft_loss: true
+ full_stft_loss_weight: 1
+ sub_stft_loss_weight: 1
+ mel_scale_loss:
+ sampling_rate: 24000
+ n_fft: 1024
+ num_mels: 80
+ hop_size: 160
+ win_size: 800
+ fmin: 0
+ full_multi_scale_stft_loss:
+ fft_sizes:
+ - 512
+ - 1024
+ - 2048
+ win_sizes:
+ - 480
+ - 960
+ - 1200
+ hop_sizes:
+ - 120
+ - 240
+ - 300
+ sub_multi_scale_stft_loss:
+ num_bands: 6
+ fft_sizes:
+ - 128
+ - 256
+ - 256
+ win_sizes:
+ - 80
+ - 120
+ - 200
+ hop_sizes:
+ - 20
+ - 40
+ - 50
+ d_criterion:
+ name: losses.discriminator_loss.MSEDiscriminatorLoss
+ config: null
+ commit_loss_weight: 1.0
+ training_file: /home/ydc/code2/ScalartTokenizer16k_m36/data/train_v2.scp
+ validation_file: /home/ydc/code2/ScalartTokenizer16k_m36/data/val.scp
+ seed: 2333
+ cudnn_deterministic: false
+ tensorboard: true
+ checkpoint_interval: 5000
+ summary_interval: 100
+ validation_interval: 5000
+ num_epoches: 50
+ print_freq: 10
+ discriminator_iter_start: 0
+ num_ckpt_keep: 10
+ segment_size: 48000
+ audio_norm_scale: 0.95
+ batch_size: 16
+ num_workers: 4
+ num_plots: 8
+ local_rank: -1
+ basic_model_config: config/scalar24k_64dim.yaml
+ exp_model_config: null
+ log_dir: /data9/ydc/exp/s_codec_24k_136dim_scale9_25hz
+ hop_length: 2000
+ ngpus_per_node: 4
+ sample_rate: 24000
+ model_ckpt_dir: /data9/ydc/exp/s_codec_24k_136dim_scale9_25hz/model_ckpts
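
sq_config.yaml configures the ScalarModel codec (the generator plus multi-frequency, multi-period, and multi-scale discriminators and STFT losses). Its latent_hidden_dim of 136 matches the transformer's out_channels in model_config.json, and the encoder's downsample factors set the codec's hop. A short arithmetic sketch, assuming the usual SoundStream-style convention that the factors compose multiplicatively (the experiment directory name mentions 25 Hz, so the effective token rate of the full pipeline may differ from this raw encoder frame rate):

```python
import math

sample_rate = 24000
downsample_factors = [2, 3, 4, 4, 5]   # from generator.config in sq_config.yaml

stride = math.prod(downsample_factors)   # 480 samples per latent frame
frame_rate = sample_rate / stride        # 50 latent frames per second
print(f"stride={stride} samples, frame_rate={frame_rate:.0f} Hz")

# segment_size: 48000 samples at 24 kHz is a 2 s training crop,
# i.e. 48000 / 480 = 100 latent frames per segment.
print(48000 // stride, "frames per training segment")
```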