kunato committed on
Commit
ffe4f8c
·
verified ·
1 Parent(s): 6e96f5a

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ iter_0000001/.metadata filter=lfs diff=lfs merge=lfs -text
37
+ iter_0000001/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
38
+ iter_0000001/__0_1.distcp filter=lfs diff=lfs merge=lfs -text
39
+ iter_0000001/__0_2.distcp filter=lfs diff=lfs merge=lfs -text
40
+ iter_0000001/__0_3.distcp filter=lfs diff=lfs merge=lfs -text
41
+ iter_0000001/__0_4.distcp filter=lfs diff=lfs merge=lfs -text
42
+ iter_0000001/__0_5.distcp filter=lfs diff=lfs merge=lfs -text
43
+ iter_0000001/__0_6.distcp filter=lfs diff=lfs merge=lfs -text
args.json ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
3
+ "model_type": "qwen3_moe",
4
+ "model_revision": null,
5
+ "task_type": "causal_lm",
6
+ "torch_dtype": "bfloat16",
7
+ "attn_impl": null,
8
+ "new_special_tokens": [],
9
+ "num_labels": null,
10
+ "problem_type": null,
11
+ "rope_scaling": null,
12
+ "device_map": null,
13
+ "max_memory": {},
14
+ "max_model_len": null,
15
+ "local_repo_path": null,
16
+ "init_strategy": null,
17
+ "template": "qwen3",
18
+ "system": null,
19
+ "max_length": 2048,
20
+ "truncation_strategy": "delete",
21
+ "max_pixels": null,
22
+ "agent_template": null,
23
+ "norm_bbox": null,
24
+ "use_chat_template": true,
25
+ "padding_free": false,
26
+ "padding_side": "right",
27
+ "loss_scale": "default",
28
+ "sequence_parallel_size": 1,
29
+ "response_prefix": null,
30
+ "template_backend": "swift",
31
+ "dataset": [],
32
+ "val_dataset": [],
33
+ "split_dataset_ratio": 0.0,
34
+ "data_seed": 42,
35
+ "dataset_num_proc": 1,
36
+ "load_from_cache_file": true,
37
+ "dataset_shuffle": true,
38
+ "val_dataset_shuffle": false,
39
+ "streaming": false,
40
+ "interleave_prob": null,
41
+ "stopping_strategy": "first_exhausted",
42
+ "shuffle_buffer_size": 1000,
43
+ "download_mode": "reuse_dataset_if_exists",
44
+ "columns": {},
45
+ "strict": false,
46
+ "remove_unused_columns": true,
47
+ "model_name": null,
48
+ "model_author": null,
49
+ "custom_dataset_info": [],
50
+ "quant_method": null,
51
+ "quant_bits": null,
52
+ "hqq_axis": null,
53
+ "bnb_4bit_compute_dtype": "bfloat16",
54
+ "bnb_4bit_quant_type": "nf4",
55
+ "bnb_4bit_use_double_quant": true,
56
+ "bnb_4bit_quant_storage": null,
57
+ "max_new_tokens": null,
58
+ "temperature": null,
59
+ "top_k": null,
60
+ "top_p": null,
61
+ "repetition_penalty": null,
62
+ "num_beams": 1,
63
+ "stream": false,
64
+ "stop_words": [],
65
+ "logprobs": false,
66
+ "top_logprobs": null,
67
+ "ckpt_dir": null,
68
+ "lora_modules": [],
69
+ "tuner_backend": "peft",
70
+ "train_type": "lora",
71
+ "adapters": [],
72
+ "external_plugins": [],
73
+ "seed": 42,
74
+ "model_kwargs": {},
75
+ "load_args": true,
76
+ "load_data_args": false,
77
+ "packing": false,
78
+ "custom_register_path": [],
79
+ "use_hf": true,
80
+ "hub_token": null,
81
+ "ddp_timeout": 18000000,
82
+ "ddp_backend": null,
83
+ "ignore_args_error": false,
84
+ "use_swift_lora": false,
85
+ "merge_lora": false,
86
+ "safe_serialization": true,
87
+ "max_shard_size": "5GB",
88
+ "output_dir": "/data/workspace/kunato/ms-swift/Qwen3-Coder-30B-A3B-Instruct-mcore",
89
+ "quant_n_samples": 256,
90
+ "quant_batch_size": 1,
91
+ "group_size": 128,
92
+ "to_ollama": false,
93
+ "to_mcore": true,
94
+ "to_hf": false,
95
+ "mcore_model": null,
96
+ "mcore_adapters": [],
97
+ "thread_count": 7,
98
+ "test_convert_precision": false,
99
+ "push_to_hub": false,
100
+ "hub_model_id": null,
101
+ "hub_private_repo": false,
102
+ "commit_message": "update files",
103
+ "to_peft_format": false,
104
+ "exist_ok": false,
105
+ "rank": 0,
106
+ "local_rank": 0,
107
+ "global_world_size": 1,
108
+ "local_world_size": 1,
109
+ "model_suffix": "Qwen3-Coder-30B-A3B-Instruct",
110
+ "model_info": "ModelInfo(model_type='qwen3_moe', model_dir='/data/share/cache/huggingface/hub/models--Qwen--Qwen3-Coder-30B-A3B-Instruct/snapshots/573fa3901e5799703b1e60825b0ec024a4c0f1d3', torch_dtype=torch.bfloat16, max_model_len=262144, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=True, config=None, task_type='causal_lm', num_labels=None)",
111
+ "model_meta": "ModelMeta(model_type='qwen3_moe', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3-30B-A3B-Base', hf_model_id='Qwen/Qwen3-30B-A3B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-30B-A3B', hf_model_id='Qwen/Qwen3-30B-A3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-235B-A22B', hf_model_id='Qwen/Qwen3-235B-A22B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-30B-A3B-FP8', hf_model_id='Qwen/Qwen3-30B-A3B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-235B-A22B-FP8', hf_model_id='Qwen/Qwen3-235B-A22B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='swift/Qwen3-30B-A3B-AWQ', hf_model_id='cognitivecomputations/Qwen3-30B-A3B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='swift/Qwen3-235B-A22B-AWQ', hf_model_id='cognitivecomputations/Qwen3-235B-A22B-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3-30B-A3B-Instruct-2507', hf_model_id='Qwen/Qwen3-30B-A3B-Instruct-2507', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-30B-A3B-Instruct-2507-FP8', hf_model_id='Qwen/Qwen3-30B-A3B-Instruct-2507-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-235B-A22B-Instruct-2507', hf_model_id='Qwen/Qwen3-235B-A22B-Instruct-2507', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-235B-A22B-Instruct-2507-FP8', hf_model_id='Qwen/Qwen3-235B-A22B-Instruct-2507-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='swift/Qwen3-235B-A22B-Instruct-2507-AWQ', hf_model_id=None, model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3-Coder-480B-A35B-Instruct', hf_model_id='Qwen/Qwen3-Coder-480B-A35B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8', hf_model_id='Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='swift/Qwen3-Coder-480B-A35B-Instruct-AWQ', hf_model_id=None, model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding'])], template='qwen3', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f7493c93b50>, model_arch=None, architectures=['Qwen3MoeForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.51'], tags=[])",
112
+ "model_dir": "/data/share/cache/huggingface/hub/models--Qwen--Qwen3-Coder-30B-A3B-Instruct/snapshots/573fa3901e5799703b1e60825b0ec024a4c0f1d3",
113
+ "hub": "<class 'swift.hub.hub.HFHub'>"
114
+ }
iter_0000001/.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d302c916053b2067ae57154c87dc6496346165ce06454943e1262f3bf876734
3
+ size 8922498
iter_0000001/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65702bff8e8f33d2936ae6ef25c838e64388197f1c2767a94a99e384b462efad
3
+ size 8730421564
iter_0000001/__0_1.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d0a18109b4414cea2191cb4761ca955bc6442a9db4f9290b8250df5526def49
3
+ size 8730421564
iter_0000001/__0_2.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:984c3d426c4201da562e225761222adbabd2812897c8a216b2bba364669a2ab1
3
+ size 8730574533
iter_0000001/__0_3.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:385eccd07d14a736e87b5681e6fcff813a45ec556f014f0cf8889fc50d1f013a
3
+ size 8730582482
iter_0000001/__0_4.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86a91ba65d4876a986d46a9a2a97f7dc3c7848621f5c543eb3a4cf185f8e35da
3
+ size 8730284216
iter_0000001/__0_5.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:061f1a1306bf881c92aa3c8985bc2e0a78afe54a4f011781a03930d32447dbab
3
+ size 8730301567
iter_0000001/__0_6.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91ef59c7d78d8e48c12eb936bf2bbe8375e9a3ba2b933c1ee1f876143a411666
3
+ size 8730301307
iter_0000001/common.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cb3d7293fc4aba94d7150d4707ab9a01af46859c10703a23e75949b661760ff
3
+ size 17511
iter_0000001/metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
latest_checkpointed_iteration.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 1
runs/events.out.tfevents.1757523328.fc4f4ac1-03.cloud.together.ai.1920040.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f509aef81de7e1d440ddaec564c09399d508b17c1221caadc670d107f632187
3
+ size 88