Upload v0.1

Browse files

Files changed (8) hide show

.gitattributes +1 -0
audiothinker.pth +3 -0
ckpt_00615000.pth +3 -0
config.yaml +51 -0
ep1.checkpoint +3 -0
model_config.json +27 -0
music_ssl.pt +3 -0
sq_config.yaml +176 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ep1.checkpoint filter=lfs diff=lfs merge=lfs -text

audiothinker.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d5f0ce19603b19dd518bc8ba6bf4a7ef2c99d0ba2a6b5503d8a356177a402b57
+size 1940043702

ckpt_00615000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a76a33002ed073f2a1e658df16bfa57133beaf3ac2ad51a7018549d3050067fc
+size 1228804780

config.yaml ADDED Viewed

	@@ -0,0 +1,51 @@

+seed: 999
+cudnn_deterministic: true
+model: diffusion_transformer_1D
+dropout_p: 0.1
+token_dropout_p: 0.1
+num_output_layer: 2
+drop_path_rate: 0.0
+statistical_prior_path: null
+train_data_path: large_audio.scp
+val_data_path: val_with_duration.scp
+batch_size: 16
+max_length: 8000
+min_length: 100
+n_worker: 4
+minibatch_debug: -1
+segment_duration: 30
+n_epoch: 5
+grad_accum: 1
+fine_decoder: false
+learning_rate: 0.0001
+grad_clip: 2.0
+warmup_steps: 1000
+data_parallel: fsdp
+mixed_precision: fp32
+grad_precision: fp32
+activation_checkpointing: true
+weight_decay: 0.05
+n_layer: 16
+n_head: 12
+n_embd: 768
+dropout: 0.0
+bias: false
+block_size: 8192
+prefix_lm: false
+num_codebooks: 1
+num_channels: 32
+unet_model_name: transformer-2d
+transformer_diffusion_config: model_config.json
+sq_config: sq_config.yaml
+sq_resume: ckpt_00615000.pth
+whisper_path: openai/whisper-medium
+reason_lm_path: audiothinker.pth
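+reconstruction_path: ep5.checkpoint  # NOTE(review): this upload ships ep1.checkpoint, not ep5.checkpoint; confirm which file is intended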
+llm_path: meta-llama/Llama-3.2-3B
+prompt_path: prompts/train_prompt.json
+best_rq_ckpt: music_ssl.pt
+exp_dir: ./
+print_freq: 100
+save_interval: 10000
+resume: null
+rank: 0

ep1.checkpoint ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9f1eb7b5614d933c1e95b74b6ea65b9dcb61ac652666525ed7626eb2f0c0f746
+size 15154454704

model_config.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+    "_class_name": "Transformer1DModel",
+    "_diffusers_version": "0.22.0.dev0",
+    "activation_fn": "gelu-approximate",
+    "attention_bias": true,
+    "attention_head_dim": 64,
+    "attention_type": "default",
+    "cross_attention_dim": null,
+    "double_self_attention": false,
+    "dropout": 0.0,
+    "in_channels": 1040,
+    "norm_elementwise_affine": false,
+    "norm_eps": 1e-06,
+    "norm_num_groups": 32,
+    "norm_type": "ada_norm_single",
+    "num_attention_heads": 24,
+    "num_embeds_ada_norm": 1000,
+    "num_layers": 32,
+    "num_vector_embeds": null,
+    "only_cross_attention": false,
+    "out_channels": 136,
+    "patch_size": 1,
+    "sample_size": 384,
+    "upcast_attention": false,
+    "use_linear_projection": false
+}

music_ssl.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e250df56b035f74c1f66f15133f4c78f664d70fa0b09aa9a752b7871bb58c02f
+size 3957949089

sq_config.yaml ADDED Viewed

	@@ -0,0 +1,176 @@

+generator:
+  name: ScalarModel
+  config:
+    num_bands: 1
+    sample_rate: 24000
+    causal: true
+    num_samples: 2
+    downsample_factors:
+    - 2
+    - 3
+    - 4
+    - 4
+    - 5
+    downsample_kernel_sizes:
+    - 4
+    - 6
+    - 8
+    - 8
+    - 10
+    upsample_factors:
+    - 5
+    - 4
+    - 4
+    - 3
+    - 2
+    upsample_kernel_sizes:
+    - 10
+    - 8
+    - 8
+    - 6
+    - 4
+    latent_hidden_dim: 136
+    default_kernel_size: 7
+    delay_kernel_size: 5
+    init_channel: 48
+    res_kernel_size: 7
+d_list:
+- mfd
+mfd:
+  name: MultiFrequencyDiscriminator
+  config:
+    hop_lengths:
+    - 32
+    - 64
+    - 128
+    - 256
+    - 512
+    - 1024
+    hidden_channels:
+    - 64
+    - 128
+    - 256
+    - 512
+    - 512
+    - 512
+    domain: double
+    mel_scale: true
+    sample_rate: 24000
+mpd:
+  name: MultiPeriodDiscriminator
+  config:
+    period_sizes:
+    - 2
+    - 3
+    - 5
+    - 7
+    - 11
+    period_kernel_size: 5
+msd:
+  name: MultiScaleDiscriminator
+  config:
+    num_scales: 3
+    pool_kernel_size: 4
+    pool_stride: 2
+optimizer:
+  g:
+    name: AdamW
+    config:
+      lr: 0.0002
+      betas:
+      - 0.8
+      - 0.99
+      eps: 1.0e-06
+  d:
+    name: AdamW
+    config:
+      lr: 0.0002
+      betas:
+      - 0.8
+      - 0.99
+      eps: 1.0e-06
+lr_scheduler:
+  g:
+    name: ExponentialLR
+    config:
+      gamma: 0.999
+  d:
+    name: ExponentialLR
+    config:
+      gamma: 0.999
+criterion:
+  g_criterion:
+    name: losses.generator_loss.GeneratorSTFTLoss
+    config:
+      use_mel_loss: false
+      adv_criterion: MSEGLoss
+      mel_loss_weight: 45
+      use_feature_match: true
+      feat_match_loss_weight: 20
+      use_full_stft_loss: true
+      use_sub_stft_loss: true
+      full_stft_loss_weight: 1
+      sub_stft_loss_weight: 1
+      mel_scale_loss:
+        sampling_rate: 24000
+        n_fft: 1024
+        num_mels: 80
+        hop_size: 160
+        win_size: 800
+        fmin: 0
+      full_multi_scale_stft_loss:
+        fft_sizes:
+        - 512
+        - 1024
+        - 2048
+        win_sizes:
+        - 480
+        - 960
+        - 1200
+        hop_sizes:
+        - 120
+        - 240
+        - 300
+      sub_multi_scale_stft_loss:
+        num_bands: 6
+        fft_sizes:
+        - 128
+        - 256
+        - 256
+        win_sizes:
+        - 80
+        - 120
+        - 200
+        hop_sizes:
+        - 20
+        - 40
+        - 50
+  d_criterion:
+    name: losses.discriminator_loss.MSEDiscriminatorLoss
+    config: null
+  commit_loss_weight: 1.0
+training_file: /home/ydc/code2/ScalartTokenizer16k_m36/data/train_v2.scp
+validation_file: /home/ydc/code2/ScalartTokenizer16k_m36/data/val.scp
+seed: 2333
+cudnn_deterministic: false
+tensorboard: true
+checkpoint_interval: 5000
+summary_interval: 100
+validation_interval: 5000
+num_epoches: 50
+print_freq: 10
+discriminator_iter_start: 0
+num_ckpt_keep: 10
+segment_size: 48000
+audio_norm_scale: 0.95
+batch_size: 16
+num_workers: 4
+num_plots: 8
+local_rank: -1
+basic_model_config: config/scalar24k_64dim.yaml
+exp_model_config: null
+log_dir: /data9/ydc/exp/s_codec_24k_136dim_scale9_25hz
+hop_length: 2000
+ngpus_per_node: 4
+sample_rate: 24000
+model_ckpt_dir: /data9/ydc/exp/s_codec_24k_136dim_scale9_25hz/model_ckpts