mratsim committed · Commit a09519f · verified · 1 Parent(s): aeb3c7c

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
GLM45-NoThink-SillyTavern-Preset.json ADDED
@@ -0,0 +1,163 @@
{
    "instruct": {
        "input_sequence": "<|user|>\n",
        "output_sequence": "<|assistant|>\n<think></think>\n",
        "last_output_sequence": "",
        "system_sequence": "<|system|>\n",
        "stop_sequence": "<|user|>",
        "wrap": false,
        "macro": true,
        "activation_regex": "",
        "first_output_sequence": "",
        "skip_examples": false,
        "output_suffix": "",
        "input_suffix": "/nothink",
        "system_suffix": "",
        "user_alignment_message": "",
        "system_same_as_user": false,
        "last_system_sequence": "",
        "first_input_sequence": "",
        "last_input_sequence": "",
        "names_behavior": "always",
        "sequences_as_stop_strings": true,
        "story_string_prefix": "",
        "story_string_suffix": "",
        "name": "GLM4.5-NoThink"
    },
    "context": {
        "story_string": "[gMASK]<sop><|system|>\n{{#if anchorBefore}}{{anchorBefore}}\n{{/if}}{{#if system}}{{system}}\n{{/if}}{{#if wiBefore}}## World Info:\n{{wiBefore}}\n{{/if}}{{#if description}}## {{char}}'s Description:\n{{description}}\n{{/if}}{{#if personality}}## {{char}}'s Personality:\n{{personality}}\n{{/if}}{{#if persona}}## {{user}}'s Persona:\n{{persona}}\n{{/if}}{{#if scenario}}## Scenario:\n{{scenario}}\n{{/if}}{{#if wiAfter}}## Lore:\n{{wiAfter}}\n{{/if}}{{#if anchorAfter}}{{anchorAfter}}\n{{/if}}{{trim}}\n",
        "example_separator": "",
        "chat_start": "",
        "use_stop_strings": false,
        "names_as_stop_strings": true,
        "story_string_position": 0,
        "story_string_depth": 1,
        "story_string_role": 0,
        "always_force_name2": true,
        "trim_sentences": false,
        "single_line": false,
        "name": "GLM4.5-NoThink"
    },
    "preset": {
        "temp": 0.8,
        "temperature_last": true,
        "top_p": 0.95,
        "top_k": 0,
        "top_a": 0,
        "tfs": 1,
        "epsilon_cutoff": 0,
        "eta_cutoff": 0,
        "typical_p": 1,
        "min_p": 0.05,
        "rep_pen": 1,
        "rep_pen_range": 0,
        "rep_pen_decay": 0,
        "rep_pen_slope": 1,
        "no_repeat_ngram_size": 0,
        "penalty_alpha": 0,
        "num_beams": 1,
        "length_penalty": 1,
        "min_length": 0,
        "encoder_rep_pen": 1,
        "freq_pen": 0,
        "presence_pen": 0,
        "skew": 0,
        "do_sample": true,
        "early_stopping": false,
        "dynatemp": false,
        "min_temp": 1,
        "max_temp": 1.3,
        "dynatemp_exponent": 1,
        "smoothing_factor": 0,
        "smoothing_curve": 1,
        "dry_allowed_length": 4,
        "dry_multiplier": 0,
        "dry_base": 1.75,
        "dry_sequence_breakers": "[\"\\n\", \":\", \"\\\"\", \"*\", \"<|start_header_id|>system<|end_header_id|>\", \"<|start_header_id|>assistant<|end_header_id|>\", \"<|start_header_id|>user<|end_header_id|>\", \"<|eot_id|>\"]",
        "dry_penalty_last_n": 0,
        "add_bos_token": true,
        "ban_eos_token": false,
        "skip_special_tokens": false,
        "mirostat_mode": 0,
        "mirostat_tau": 2,
        "mirostat_eta": 0.1,
        "guidance_scale": 1,
        "negative_prompt": "",
        "grammar_string": "",
        "json_schema": {},
        "banned_tokens": "",
        "sampler_priority": [
            "repetition_penalty",
            "presence_penalty",
            "frequency_penalty",
            "dry",
            "top_k",
            "top_p",
            "typical_p",
            "top_n_sigma",
            "epsilon_cutoff",
            "eta_cutoff",
            "tfs",
            "top_a",
            "min_p",
            "temperature",
            "mirostat",
            "quadratic_sampling",
            "dynamic_temperature",
            "xtc",
            "encoder_repetition_penalty",
            "no_repeat_ngram"
        ],
        "samplers": [
            "penalties",
            "dry",
            "top_n_sigma",
            "top_k",
            "typ_p",
            "tfs_z",
            "typical_p",
            "top_p",
            "min_p",
            "xtc",
            "temperature"
        ],
        "samplers_priorities": [
            "dry",
            "penalties",
            "no_repeat_ngram",
            "temperature",
            "top_nsigma",
            "top_p_top_k",
            "top_a",
            "min_p",
            "tfs",
            "eta_cutoff",
            "epsilon_cutoff",
            "typical_p",
            "quadratic",
            "xtc"
        ],
        "ignore_eos_token": false,
        "spaces_between_special_tokens": true,
        "speculative_ngram": false,
        "sampler_order": [
            6,
            0,
            1,
            3,
            4,
            2,
            5
        ],
        "logit_bias": [],
        "xtc_threshold": 0,
        "xtc_probability": 0,
        "nsigma": 0,
        "min_keep": 0,
        "extensions": {},
        "rep_pen_size": 0,
        "genamt": 700,
        "max_length": 20480,
        "name": "GLM4.5-Iceblink-v2"
    }
}
README.md ADDED
@@ -0,0 +1,198 @@
---
license: mit
base_model:
- zerofata/GLM-4.5-Iceblink-v2-106B-A12B
datasets:
- neuralmagic/calibration
- HuggingFaceH4/ultrachat_200k
- nvidia/OpenCodeInstruct
- CSJianYang/CodeArena
- nvidia/OpenScienceReasoning-2
- MegaScience/MegaScience
- Gryphe/Opus-WritingPrompts
- ServiceNow-AI/M2Lingual
- anthracite-org/stheno-filtered-v1.1
- zerofata/Roleplay-Anime-Characters
- zerofata/Instruct-Anime
- zerofata/Instruct-Anime-CreativeWriting
- sam-paech/gutenberg3-generalfiction-scifi-fantasy-romance-adventure-dpo
- nvidia/OpenMathInstruct-2
- fka/awesome-chatgpt-prompts
- databricks/databricks-dolly-15k
- FreedomIntelligence/SocraticChat
- ruggsea/stanford-encyclopedia-of-philosophy_instruct
- mlfoundations-dev/stackexchange_philosophy
- theoldmandthesea/17k_business_book
- anthracite-org/nopm_claude_writing_fixed
pipeline_tag: text-generation
tags:
- text adventure
- roleplay
- rpg
- creative writing
- conversational
- awq
- vllm
---
# GLM-4.5-Iceblink-v2-106B-A12B (AWQ 4-bit quant)

This repo contains GLM-4.5-Iceblink-v2-106B-A12B quantized with AWQ to mixed 4-bit/16-bit precision, following state-of-the-art Mixture-of-Experts quantization practice, with a careful selection of calibration datasets covering math, science, philosophy, business, fiction, roleplay, creative writing, general knowledge and multilingual data, to plausibly ensure that all 128 routed experts of the model were activated by enough calibration samples.

- Original Model:
  - [zerofata/GLM-4.5-Iceblink-v2-106B-A12B](https://huggingface.co/zerofata/GLM-4.5-Iceblink-v2-106B-A12B)

The model requires ~65.7GiB of VRAM, plus ~23GiB for a KV cache covering 131072 tokens.
This fits perfectly on 4x24GB, 2x48GB or 1x96GB GPUs.

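As a sanity check, the ~23GiB KV-cache figure can be recomputed from the values shipped in this repo's [config.json](config.json) (46 layers, 8 KV heads, head dim 128, BF16 cache); a back-of-the-envelope calculation:

```python
# KV-cache size estimate from config.json values (BF16 = 2 bytes per element)
layers, kv_heads, head_dim, ctx_len = 46, 8, 128, 131072
bytes_per_token = 2 * layers * kv_heads * head_dim * 2  # K and V tensors, BF16
print(f"{bytes_per_token * ctx_len / 2**30:.1f} GiB")   # -> 23.0 GiB
```
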
## 📥 Usage & Running Instructions

The model was tested with vLLM on 1x RTX Pro 6000; below is a script suitable for such a configuration with 131072 context length.

### Recommendations

It is however recommended to use only 65K context to avoid significant degradation (https://fiction.live/stories/Fiction-liveBench-Sept-29-2025/oQdzQvKHw8JyXbN87).

The recommended sampler is "min-p" sampling. It is available through both the older Text completions API and the Chat completions API (and the newer Responses API); however, most LLM frontends only support modifying min-p when using Text completions.
You can also use `--override-generation-config "${SAMPLER_JSONCONFIG}"` to override the server-side sampler defaults (a merge of `generation_config.json` and vLLM defaults).
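
For scripts or frontends that talk to the server directly, min-p can also be set per request. A minimal sketch against vLLM's OpenAI-compatible Text completions endpoint, assuming a server started as in the script below on the default port 8000 (`min_p` is a vLLM extension to the OpenAI schema, and the prompt shown is purely illustrative):

```python
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "GLM-4.5-Iceblink-v2",
        "prompt": "[gMASK]<sop><|user|>\nHello /nothink<|assistant|>\n<think></think>\n",
        "max_tokens": 256,
        "temperature": 0.8,
        "top_p": 0.95,
        "min_p": 0.05,  # vLLM accepts min_p as an extra sampling field
    },
)
print(resp.json()["choices"][0]["text"])
```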

### Running script

```bash
# Model configuration (Mandatory)
MODEL="mratsim/GLM-4.5-Iceblink-v2-106B-A12B-AWQ"
MODELNAME="GLM-4.5-Iceblink-v2"
GPU_UTIL=0.97

# Sampling configuration (Optional, if departing from `generation_config.json`)
SAMPLER_OVERRIDE='{"temperature": 0.8, "min_p": 0.05, "top_p": 0.95}'

# Prevent memory fragmentation
export PYTORCH_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512

# Prevent vLLM from using 100% CPU when idle (very recommended)
export VLLM_SLEEP_WHEN_IDLE=1

# Use the FlashInfer backend (fastest, recommended, "instant" context reprocessing);
# it however requires reducing the context length to 120000 tokens and GPU_UTIL to 0.95
# export VLLM_ATTENTION_BACKEND=FLASHINFER

vllm serve "${MODEL}" \
  --served-model-name "${MODELNAME}" \
  --gpu-memory-utilization ${GPU_UTIL} \
  --override-generation-config "${SAMPLER_OVERRIDE}"
```

> ℹ️ The FlashInfer backend may fail with an error similar to
> `Failed to allocate memory for batch_prefill_tmp_v with size XYZ and alignment 16 in AlignedAllocator`.
>
> A workaround is to run a sed replacement within the vLLM install to increase the workspace buffer:
> ```bash
> sed -i 's/FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 \* 1024 \* 1024/FLASHINFER_WORKSPACE_BUFFER_SIZE = 768 \* 1024 \* 1024/g' vllm/v1/attention/backends/flashinfer.py
> ```
> This will be fixed by PR https://github.com/vllm-project/vllm/pull/25344 or https://github.com/vllm-project/vllm/pull/28269

## 🔬 Quantization method

The llmcompressor library was used with the following recipe:

```yaml
default_stage:
  default_modifiers:
    AWQModifier:
      config_groups:
        group_0:
          targets: ['re:.*mlp\.experts\.[0-9]+\.(down|gate|up)_proj$']
          weights:
            num_bits: 4
            type: int
            symmetric: true
            group_size: 32
            strategy: group
            block_structure: null
            dynamic: false
            actorder: null
            observer: mse
            observer_kwargs: {}
            input_activations: null
            output_activations: null
            format: null
      targets: ['re:.*mlp\.experts\.[0-9]+\.(down|gate|up)_proj$']
      ignore: []
      mappings:
      - smooth_layer: re:.*post_attention_layernorm$
        balance_layers: ['re:.*gate_proj$', 're:.*up_proj$']
      - smooth_layer: re:.*up_proj$
        balance_layers: ['re:.*down_proj$']
      duo_scaling: true
```

and calibrated with over 1600 samples, of up to 8192 tokens of sequence length each, drawn from:
- [neuralmagic/calibration](https://huggingface.co/datasets/neuralmagic/calibration)
- [HuggingFaceH4/ultrachat_200k](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)
- [nvidia/OpenCodeInstruct](https://huggingface.co/datasets/nvidia/OpenCodeInstruct)
- [CSJianYang/CodeArena](https://huggingface.co/datasets/CSJianYang/CodeArena)
- [nvidia/OpenScienceReasoning-2](https://huggingface.co/datasets/nvidia/OpenScienceReasoning-2)
- [MegaScience/MegaScience](https://huggingface.co/datasets/MegaScience/MegaScience)
- [Gryphe/Opus-WritingPrompts](https://huggingface.co/datasets/Gryphe/Opus-WritingPrompts)
- [ServiceNow-AI/M2Lingual](https://huggingface.co/datasets/ServiceNow-AI/M2Lingual)
- [anthracite-org/stheno-filtered-v1.1](https://huggingface.co/datasets/anthracite-org/stheno-filtered-v1.1)
- [zerofata/Roleplay-Anime-Characters](https://huggingface.co/datasets/zerofata/Roleplay-Anime-Characters)
- [zerofata/Instruct-Anime](https://huggingface.co/datasets/zerofata/Instruct-Anime)
- [zerofata/Instruct-Anime-CreativeWriting](https://huggingface.co/datasets/zerofata/Instruct-Anime-CreativeWriting)
- [sam-paech/gutenberg3-generalfiction-scifi-fantasy-romance-adventure-dpo](https://huggingface.co/datasets/sam-paech/gutenberg3-generalfiction-scifi-fantasy-romance-adventure-dpo)
- [nvidia/OpenMathInstruct-2](https://huggingface.co/datasets/nvidia/OpenMathInstruct-2)
- [fka/awesome-chatgpt-prompts](https://huggingface.co/datasets/fka/awesome-chatgpt-prompts)
- [databricks/databricks-dolly-15k](https://huggingface.co/datasets/databricks/databricks-dolly-15k)
- [FreedomIntelligence/SocraticChat](https://huggingface.co/datasets/FreedomIntelligence/SocraticChat)
- [ruggsea/stanford-encyclopedia-of-philosophy_instruct](https://huggingface.co/datasets/ruggsea/stanford-encyclopedia-of-philosophy_instruct)
- [mlfoundations-dev/stackexchange_philosophy](https://huggingface.co/datasets/mlfoundations-dev/stackexchange_philosophy)
- [theoldmandthesea/17k_business_book](https://huggingface.co/datasets/theoldmandthesea/17k_business_book)
- [anthracite-org/nopm_claude_writing_fixed](https://huggingface.co/datasets/anthracite-org/nopm_claude_writing_fixed)
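
For reference, a minimal sketch of what such a run looks like with llmcompressor's `oneshot` entrypoint (a single placeholder dataset is shown; the actual multi-dataset mixing and chat-template preprocessing are elided):

```python
from llmcompressor import oneshot
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "zerofata/GLM-4.5-Iceblink-v2-106B-A12B"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

oneshot(
    model=model,
    dataset="ultrachat_200k",      # placeholder: one of the sources listed above
    recipe="recipe.yaml",          # the AWQ recipe shown earlier
    max_seq_length=8192,
    num_calibration_samples=1600,
)

model.save_pretrained("GLM-4.5-Iceblink-v2-106B-A12B-AWQ", save_compressed=True)
tokenizer.save_pretrained("GLM-4.5-Iceblink-v2-106B-A12B-AWQ")
```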

According to the [AWQ presentation](https://minjiazhang.github.io/courses/fall24-resource/slides/awq.pdf),
only 64 samples are needed. Due to the Mixture-of-Experts topology, however, this implies that all 128 routed experts need to see at least 64 samples, or alternatively that all experts are activated during calibration, which requires [reimplementing the attention block of the model](https://github.com/vllm-project/llm-compressor/tree/0.8.1/examples/quantization_w4a4_fp4#quantizing-moes) in [llmcompressor's modeling DB](https://github.com/vllm-project/llm-compressor/tree/0.8.1/src/llmcompressor/modeling).
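
Under a uniform-routing idealization (8 active experts per token out of 128 routed experts), the chosen calibration budget leaves each expert with far more than 64 samples' worth of tokens:

```python
# Expected calibration tokens routed to each expert,
# assuming roughly uniform routing (an idealization)
samples, seq_len, n_experts, top_k = 1600, 8192, 128, 8
tokens_per_expert = samples * seq_len * top_k / n_experts
print(f"{tokens_per_expert:,.0f}")  # -> 819,200
```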

### Deep-dive

Quantization should focus on Linear layers (also called Dense or Fully-Connected layers, i.e. MatMul+Bias).
In particular, quantizing LayerNorm/RMSNorm layers is strongly discouraged, see [1]:
> LayerNorm in Quantization. Kovaleva et al. (2021); Wei et al. (2022) find that outliers in the
> LayerNorm parameters of BERT (Devlin et al., 2019) cause difficulties in model compression.
> Given the importance of LayerNorm, all the quantization methods we discuss above leave LayerNorm unquantized.

_Note: Expert layers might not be stored as `Linear` layers, meaning they might be skipped if using `llmcompressor` with a `Linear` target._
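
A quick way to check this without downloading the weights is to instantiate the architecture on the meta device and inspect how the expert projections are stored (a sketch using `accelerate`; module names follow this repo's `config.json`, and a transformers version with `glm4_moe` support is assumed):

```python
from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("zerofata/GLM-4.5-Iceblink-v2-106B-A12B")
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

# Print the module class backing an expert projection: if it is not nn.Linear,
# a plain `Linear` target would silently skip the experts
for name, module in model.named_modules():
    if ".mlp.experts.0.down_proj" in name:
        print(name, "->", type(module).__name__)
        break
```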

Some layers have a higher impact on LLM performance than others.
According to [2], spending more bits on attention layers results in large gains compared to spending them on FFN layers.
According to [3], at 2-bit quantization:
- quantizing expert FFN layers does not seriously impact model quality
- quantizing cross-attention has some impact
- quantizing self-attention has a large impact
- quantizing dense FFN has a very significant impact

Hence, to preserve model quality, we choose not to quantize the dense FFN layers (i.e. shared experts) or the self-attention layers.

We notice that:
- the official MXFP4 weights of gpt-oss-120b from OpenAI keep self-attention in BF16:
  - https://huggingface.co/openai/gpt-oss-120b/blob/main/model.safetensors.index.json
- the NVFP4 weights of DeepSeek-R1 quantized by Nvidia also keep self-attention in BF16:
  - https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4/blob/main/model.safetensors.index.json

According to [2], giving more bits to the first `k` blocks has a significantly higher impact on model quality than giving them to the last `k` blocks.
In this case, we keep the first layer unquantized, as `"first_k_dense_replace": 1` in [config.json](config.json).
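
These exclusions can be verified directly from the `quantization_config.ignore` list shipped in this repo's `config.json`:

```python
import json

with open("config.json") as f:
    quant_cfg = json.load(f)["quantization_config"]

ignored = quant_cfg["ignore"]
print(len(ignored), "modules kept in BF16")
# The whole first (dense) block is excluded, matching first_k_dense_replace = 1
print([name for name in ignored if name.startswith("model.layers.0.")])
```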

### References

1. Why Do Some Inputs Break Low-Bit LLM Quantization? (2025)\
   Ting-Yun Chang, Muru Zhang, Jesse Thomason, Robin Jia\
   https://arxiv.org/pdf/2506.12044

2. Examining Post-Training Quantization for Mixture-of-Experts: A Benchmark (2024)\
   Pingzhi Li, Xiaolong Jin, Yu Cheng, Tianlong Chen\
   https://arxiv.org/pdf/2406.08155v1

3. Mixture of Quantized Experts (MoQE): Complementary Effect of Low-bit Quantization and Robustness (2023)\
   Young Jin Kim, Raffy Fahim, Hany Hassan Awadalla\
   https://arxiv.org/pdf/2310.02410
chat_template.jinja ADDED
@@ -0,0 +1,103 @@
[gMASK]<sop>
{%- if tools -%}
<|system|>
# Tools

You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> XML tags:
<tools>
{% for tool in tools %}
{{ tool | tojson(ensure_ascii=False) }}
{% endfor %}
</tools>

For each function call, output the function name and arguments within the following XML format:
<tool_call>{function-name}
<arg_key>{arg-key-1}</arg_key>
<arg_value>{arg-value-1}</arg_value>
<arg_key>{arg-key-2}</arg_key>
<arg_value>{arg-value-2}</arg_value>
...
</tool_call>{%- endif -%}
{%- macro visible_text(content) -%}
{%- if content is string -%}
{{- content }}
{%- elif content is iterable and content is not mapping -%}
{%- for item in content -%}
{%- if item is mapping and item.type == 'text' -%}
{{- item.text }}
{%- elif item is string -%}
{{- item }}
{%- endif -%}
{%- endfor -%}
{%- else -%}
{{- content }}
{%- endif -%}
{%- endmacro -%}
{%- set ns = namespace(last_user_index=-1) %}
{%- for m in messages %}
{%- if m.role == 'user' %}
{% set ns.last_user_index = loop.index0 -%}
{%- endif %}
{%- endfor %}
{% for m in messages %}
{%- if m.role == 'user' -%}<|user|>
{{ visible_text(m.content) }}
{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothink")) else '' -}}
{%- elif m.role == 'assistant' -%}
<|assistant|>
{%- set reasoning_content = '' %}
{%- set content = visible_text(m.content) %}
{%- if m.reasoning_content is string %}
{%- set reasoning_content = m.reasoning_content %}
{%- else %}
{%- if '</think>' in content %}
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
{%- endif %}
{%- endif %}
{%- if loop.index0 > ns.last_user_index and reasoning_content -%}
{{ '\n<think>' + reasoning_content.strip() + '</think>'}}
{%- else -%}
{{ '\n<think></think>' }}
{%- endif -%}
{%- if content.strip() -%}
{{ '\n' + content.strip() }}
{%- endif -%}
{% if m.tool_calls %}
{% for tc in m.tool_calls %}
{%- if tc.function %}
{%- set tc = tc.function %}
{%- endif %}
{{ '\n<tool_call>' + tc.name }}
{% set _args = tc.arguments %}
{% for k, v in _args.items() %}
<arg_key>{{ k }}</arg_key>
<arg_value>{{ v | tojson(ensure_ascii=False) if v is not string else v }}</arg_value>
{% endfor %}
</tool_call>{% endfor %}
{% endif %}
{%- elif m.role == 'tool' -%}
{%- if m.content is string -%}
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|observation|>' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- m.content }}
{{- '\n</tool_response>' }}
{%- else -%}
<|observation|>{% for tr in m.content %}

<tool_response>
{{ tr.output if tr.output is defined else tr }}
</tool_response>{% endfor -%}
{% endif -%}
{%- elif m.role == 'system' -%}
<|system|>
{{ visible_text(m.content) }}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
<|assistant|>{{- '\n<think></think>' if (enable_thinking is defined and not enable_thinking) else '' -}}
{%- endif -%}
config.json ADDED
@@ -0,0 +1,404 @@
{
  "architectures": [
    "Glm4MoeForCausalLM"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "dtype": "bfloat16",
  "eos_token_id": [
    151329,
    151336,
    151338
  ],
  "first_k_dense_replace": 1,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 10944,
  "max_position_embeddings": 131072,
  "model_type": "glm4_moe",
  "moe_intermediate_size": 1408,
  "n_group": 1,
  "n_routed_experts": 128,
  "n_shared_experts": 1,
  "no_split_module_classes": [
    "MergedColumnParallelLinear"
  ],
  "norm_topk_prob": true,
  "num_attention_heads": 96,
  "num_experts_per_tok": 8,
  "num_hidden_layers": 46,
  "num_key_value_heads": 8,
  "num_nextn_predict_layers": 1,
  "pad_token_id": 151329,
  "partial_rotary_factor": 0.5,
  "quantization_config": {
    "config_groups": {
      "group_0": {
        "format": "pack-quantized",
        "input_activations": null,
        "output_activations": null,
        "targets": [
          "Linear",
          "re:.*mlp\\.experts\\.[0-9]+\\.(down|gate|up)_proj$"
        ],
        "weights": {
          "actorder": null,
          "block_structure": null,
          "dynamic": false,
          "group_size": 32,
          "num_bits": 4,
          "observer": "mse",
          "observer_kwargs": {},
          "strategy": "group",
          "symmetric": true,
          "type": "int"
        }
      }
    },
    "format": "pack-quantized",
    "global_compression_ratio": null,
    "ignore": [
      "model.layers.0.self_attn.q_proj",
      "model.layers.0.self_attn.k_proj",
      "model.layers.0.self_attn.v_proj",
      "model.layers.0.self_attn.o_proj",
      "model.layers.0.mlp.gate_proj",
      "model.layers.0.mlp.up_proj",
      "model.layers.0.mlp.down_proj",
      "model.layers.1.self_attn.q_proj",
      "model.layers.1.self_attn.k_proj",
      "model.layers.1.self_attn.v_proj",
      "model.layers.1.self_attn.o_proj",
      "model.layers.1.mlp.shared_experts.gate_proj",
      "model.layers.1.mlp.shared_experts.up_proj",
      "model.layers.1.mlp.shared_experts.down_proj",
      "model.layers.2.self_attn.q_proj",
      "model.layers.2.self_attn.k_proj",
      "model.layers.2.self_attn.v_proj",
      "model.layers.2.self_attn.o_proj",
      "model.layers.2.mlp.shared_experts.gate_proj",
      "model.layers.2.mlp.shared_experts.up_proj",
      "model.layers.2.mlp.shared_experts.down_proj",
      "model.layers.3.self_attn.q_proj",
      "model.layers.3.self_attn.k_proj",
      "model.layers.3.self_attn.v_proj",
      "model.layers.3.self_attn.o_proj",
      "model.layers.3.mlp.shared_experts.gate_proj",
      "model.layers.3.mlp.shared_experts.up_proj",
      "model.layers.3.mlp.shared_experts.down_proj",
      "model.layers.4.self_attn.q_proj",
      "model.layers.4.self_attn.k_proj",
      "model.layers.4.self_attn.v_proj",
      "model.layers.4.self_attn.o_proj",
      "model.layers.4.mlp.shared_experts.gate_proj",
      "model.layers.4.mlp.shared_experts.up_proj",
      "model.layers.4.mlp.shared_experts.down_proj",
      "model.layers.5.self_attn.q_proj",
      "model.layers.5.self_attn.k_proj",
      "model.layers.5.self_attn.v_proj",
      "model.layers.5.self_attn.o_proj",
      "model.layers.5.mlp.shared_experts.gate_proj",
      "model.layers.5.mlp.shared_experts.up_proj",
      "model.layers.5.mlp.shared_experts.down_proj",
      "model.layers.6.self_attn.q_proj",
      "model.layers.6.self_attn.k_proj",
      "model.layers.6.self_attn.v_proj",
      "model.layers.6.self_attn.o_proj",
      "model.layers.6.mlp.shared_experts.gate_proj",
      "model.layers.6.mlp.shared_experts.up_proj",
      "model.layers.6.mlp.shared_experts.down_proj",
      "model.layers.7.self_attn.q_proj",
      "model.layers.7.self_attn.k_proj",
      "model.layers.7.self_attn.v_proj",
      "model.layers.7.self_attn.o_proj",
      "model.layers.7.mlp.shared_experts.gate_proj",
      "model.layers.7.mlp.shared_experts.up_proj",
      "model.layers.7.mlp.shared_experts.down_proj",
      "model.layers.8.self_attn.q_proj",
      "model.layers.8.self_attn.k_proj",
      "model.layers.8.self_attn.v_proj",
      "model.layers.8.self_attn.o_proj",
      "model.layers.8.mlp.shared_experts.gate_proj",
      "model.layers.8.mlp.shared_experts.up_proj",
      "model.layers.8.mlp.shared_experts.down_proj",
      "model.layers.9.self_attn.q_proj",
      "model.layers.9.self_attn.k_proj",
      "model.layers.9.self_attn.v_proj",
      "model.layers.9.self_attn.o_proj",
      "model.layers.9.mlp.shared_experts.gate_proj",
      "model.layers.9.mlp.shared_experts.up_proj",
      "model.layers.9.mlp.shared_experts.down_proj",
      "model.layers.10.self_attn.q_proj",
      "model.layers.10.self_attn.k_proj",
      "model.layers.10.self_attn.v_proj",
      "model.layers.10.self_attn.o_proj",
      "model.layers.10.mlp.shared_experts.gate_proj",
      "model.layers.10.mlp.shared_experts.up_proj",
      "model.layers.10.mlp.shared_experts.down_proj",
      "model.layers.11.self_attn.q_proj",
      "model.layers.11.self_attn.k_proj",
      "model.layers.11.self_attn.v_proj",
      "model.layers.11.self_attn.o_proj",
      "model.layers.11.mlp.shared_experts.gate_proj",
      "model.layers.11.mlp.shared_experts.up_proj",
      "model.layers.11.mlp.shared_experts.down_proj",
      "model.layers.12.self_attn.q_proj",
      "model.layers.12.self_attn.k_proj",
      "model.layers.12.self_attn.v_proj",
      "model.layers.12.self_attn.o_proj",
      "model.layers.12.mlp.shared_experts.gate_proj",
      "model.layers.12.mlp.shared_experts.up_proj",
      "model.layers.12.mlp.shared_experts.down_proj",
      "model.layers.13.self_attn.q_proj",
      "model.layers.13.self_attn.k_proj",
      "model.layers.13.self_attn.v_proj",
      "model.layers.13.self_attn.o_proj",
      "model.layers.13.mlp.shared_experts.gate_proj",
      "model.layers.13.mlp.shared_experts.up_proj",
      "model.layers.13.mlp.shared_experts.down_proj",
      "model.layers.14.self_attn.q_proj",
      "model.layers.14.self_attn.k_proj",
      "model.layers.14.self_attn.v_proj",
      "model.layers.14.self_attn.o_proj",
      "model.layers.14.mlp.shared_experts.gate_proj",
      "model.layers.14.mlp.shared_experts.up_proj",
      "model.layers.14.mlp.shared_experts.down_proj",
      "model.layers.15.self_attn.q_proj",
      "model.layers.15.self_attn.k_proj",
      "model.layers.15.self_attn.v_proj",
      "model.layers.15.self_attn.o_proj",
      "model.layers.15.mlp.shared_experts.gate_proj",
      "model.layers.15.mlp.shared_experts.up_proj",
      "model.layers.15.mlp.shared_experts.down_proj",
      "model.layers.16.self_attn.q_proj",
      "model.layers.16.self_attn.k_proj",
      "model.layers.16.self_attn.v_proj",
      "model.layers.16.self_attn.o_proj",
      "model.layers.16.mlp.shared_experts.gate_proj",
      "model.layers.16.mlp.shared_experts.up_proj",
      "model.layers.16.mlp.shared_experts.down_proj",
      "model.layers.17.self_attn.q_proj",
      "model.layers.17.self_attn.k_proj",
      "model.layers.17.self_attn.v_proj",
      "model.layers.17.self_attn.o_proj",
      "model.layers.17.mlp.shared_experts.gate_proj",
      "model.layers.17.mlp.shared_experts.up_proj",
      "model.layers.17.mlp.shared_experts.down_proj",
      "model.layers.18.self_attn.q_proj",
      "model.layers.18.self_attn.k_proj",
      "model.layers.18.self_attn.v_proj",
      "model.layers.18.self_attn.o_proj",
      "model.layers.18.mlp.shared_experts.gate_proj",
      "model.layers.18.mlp.shared_experts.up_proj",
      "model.layers.18.mlp.shared_experts.down_proj",
      "model.layers.19.self_attn.q_proj",
      "model.layers.19.self_attn.k_proj",
      "model.layers.19.self_attn.v_proj",
      "model.layers.19.self_attn.o_proj",
      "model.layers.19.mlp.shared_experts.gate_proj",
      "model.layers.19.mlp.shared_experts.up_proj",
      "model.layers.19.mlp.shared_experts.down_proj",
      "model.layers.20.self_attn.q_proj",
      "model.layers.20.self_attn.k_proj",
      "model.layers.20.self_attn.v_proj",
      "model.layers.20.self_attn.o_proj",
      "model.layers.20.mlp.shared_experts.gate_proj",
      "model.layers.20.mlp.shared_experts.up_proj",
      "model.layers.20.mlp.shared_experts.down_proj",
      "model.layers.21.self_attn.q_proj",
      "model.layers.21.self_attn.k_proj",
      "model.layers.21.self_attn.v_proj",
      "model.layers.21.self_attn.o_proj",
      "model.layers.21.mlp.shared_experts.gate_proj",
      "model.layers.21.mlp.shared_experts.up_proj",
      "model.layers.21.mlp.shared_experts.down_proj",
      "model.layers.22.self_attn.q_proj",
      "model.layers.22.self_attn.k_proj",
      "model.layers.22.self_attn.v_proj",
      "model.layers.22.self_attn.o_proj",
      "model.layers.22.mlp.shared_experts.gate_proj",
      "model.layers.22.mlp.shared_experts.up_proj",
      "model.layers.22.mlp.shared_experts.down_proj",
      "model.layers.23.self_attn.q_proj",
      "model.layers.23.self_attn.k_proj",
      "model.layers.23.self_attn.v_proj",
      "model.layers.23.self_attn.o_proj",
      "model.layers.23.mlp.shared_experts.gate_proj",
      "model.layers.23.mlp.shared_experts.up_proj",
      "model.layers.23.mlp.shared_experts.down_proj",
      "model.layers.24.self_attn.q_proj",
      "model.layers.24.self_attn.k_proj",
      "model.layers.24.self_attn.v_proj",
      "model.layers.24.self_attn.o_proj",
      "model.layers.24.mlp.shared_experts.gate_proj",
      "model.layers.24.mlp.shared_experts.up_proj",
      "model.layers.24.mlp.shared_experts.down_proj",
      "model.layers.25.self_attn.q_proj",
      "model.layers.25.self_attn.k_proj",
      "model.layers.25.self_attn.v_proj",
      "model.layers.25.self_attn.o_proj",
      "model.layers.25.mlp.shared_experts.gate_proj",
      "model.layers.25.mlp.shared_experts.up_proj",
      "model.layers.25.mlp.shared_experts.down_proj",
      "model.layers.26.self_attn.q_proj",
      "model.layers.26.self_attn.k_proj",
      "model.layers.26.self_attn.v_proj",
      "model.layers.26.self_attn.o_proj",
      "model.layers.26.mlp.shared_experts.gate_proj",
      "model.layers.26.mlp.shared_experts.up_proj",
      "model.layers.26.mlp.shared_experts.down_proj",
      "model.layers.27.self_attn.q_proj",
      "model.layers.27.self_attn.k_proj",
      "model.layers.27.self_attn.v_proj",
      "model.layers.27.self_attn.o_proj",
      "model.layers.27.mlp.shared_experts.gate_proj",
      "model.layers.27.mlp.shared_experts.up_proj",
      "model.layers.27.mlp.shared_experts.down_proj",
      "model.layers.28.self_attn.q_proj",
      "model.layers.28.self_attn.k_proj",
      "model.layers.28.self_attn.v_proj",
      "model.layers.28.self_attn.o_proj",
      "model.layers.28.mlp.shared_experts.gate_proj",
      "model.layers.28.mlp.shared_experts.up_proj",
      "model.layers.28.mlp.shared_experts.down_proj",
      "model.layers.29.self_attn.q_proj",
      "model.layers.29.self_attn.k_proj",
      "model.layers.29.self_attn.v_proj",
      "model.layers.29.self_attn.o_proj",
      "model.layers.29.mlp.shared_experts.gate_proj",
      "model.layers.29.mlp.shared_experts.up_proj",
      "model.layers.29.mlp.shared_experts.down_proj",
      "model.layers.30.self_attn.q_proj",
      "model.layers.30.self_attn.k_proj",
      "model.layers.30.self_attn.v_proj",
      "model.layers.30.self_attn.o_proj",
      "model.layers.30.mlp.shared_experts.gate_proj",
      "model.layers.30.mlp.shared_experts.up_proj",
      "model.layers.30.mlp.shared_experts.down_proj",
      "model.layers.31.self_attn.q_proj",
      "model.layers.31.self_attn.k_proj",
      "model.layers.31.self_attn.v_proj",
      "model.layers.31.self_attn.o_proj",
      "model.layers.31.mlp.shared_experts.gate_proj",
      "model.layers.31.mlp.shared_experts.up_proj",
      "model.layers.31.mlp.shared_experts.down_proj",
      "model.layers.32.self_attn.q_proj",
      "model.layers.32.self_attn.k_proj",
      "model.layers.32.self_attn.v_proj",
      "model.layers.32.self_attn.o_proj",
      "model.layers.32.mlp.shared_experts.gate_proj",
      "model.layers.32.mlp.shared_experts.up_proj",
      "model.layers.32.mlp.shared_experts.down_proj",
      "model.layers.33.self_attn.q_proj",
      "model.layers.33.self_attn.k_proj",
      "model.layers.33.self_attn.v_proj",
      "model.layers.33.self_attn.o_proj",
      "model.layers.33.mlp.shared_experts.gate_proj",
      "model.layers.33.mlp.shared_experts.up_proj",
      "model.layers.33.mlp.shared_experts.down_proj",
      "model.layers.34.self_attn.q_proj",
      "model.layers.34.self_attn.k_proj",
      "model.layers.34.self_attn.v_proj",
      "model.layers.34.self_attn.o_proj",
      "model.layers.34.mlp.shared_experts.gate_proj",
      "model.layers.34.mlp.shared_experts.up_proj",
      "model.layers.34.mlp.shared_experts.down_proj",
      "model.layers.35.self_attn.q_proj",
      "model.layers.35.self_attn.k_proj",
      "model.layers.35.self_attn.v_proj",
      "model.layers.35.self_attn.o_proj",
      "model.layers.35.mlp.shared_experts.gate_proj",
      "model.layers.35.mlp.shared_experts.up_proj",
      "model.layers.35.mlp.shared_experts.down_proj",
      "model.layers.36.self_attn.q_proj",
      "model.layers.36.self_attn.k_proj",
      "model.layers.36.self_attn.v_proj",
      "model.layers.36.self_attn.o_proj",
      "model.layers.36.mlp.shared_experts.gate_proj",
      "model.layers.36.mlp.shared_experts.up_proj",
      "model.layers.36.mlp.shared_experts.down_proj",
      "model.layers.37.self_attn.q_proj",
      "model.layers.37.self_attn.k_proj",
      "model.layers.37.self_attn.v_proj",
      "model.layers.37.self_attn.o_proj",
      "model.layers.37.mlp.shared_experts.gate_proj",
      "model.layers.37.mlp.shared_experts.up_proj",
      "model.layers.37.mlp.shared_experts.down_proj",
      "model.layers.38.self_attn.q_proj",
      "model.layers.38.self_attn.k_proj",
      "model.layers.38.self_attn.v_proj",
      "model.layers.38.self_attn.o_proj",
      "model.layers.38.mlp.shared_experts.gate_proj",
      "model.layers.38.mlp.shared_experts.up_proj",
      "model.layers.38.mlp.shared_experts.down_proj",
      "model.layers.39.self_attn.q_proj",
      "model.layers.39.self_attn.k_proj",
      "model.layers.39.self_attn.v_proj",
      "model.layers.39.self_attn.o_proj",
      "model.layers.39.mlp.shared_experts.gate_proj",
      "model.layers.39.mlp.shared_experts.up_proj",
      "model.layers.39.mlp.shared_experts.down_proj",
      "model.layers.40.self_attn.q_proj",
      "model.layers.40.self_attn.k_proj",
      "model.layers.40.self_attn.v_proj",
      "model.layers.40.self_attn.o_proj",
      "model.layers.40.mlp.shared_experts.gate_proj",
      "model.layers.40.mlp.shared_experts.up_proj",
      "model.layers.40.mlp.shared_experts.down_proj",
      "model.layers.41.self_attn.q_proj",
      "model.layers.41.self_attn.k_proj",
      "model.layers.41.self_attn.v_proj",
      "model.layers.41.self_attn.o_proj",
      "model.layers.41.mlp.shared_experts.gate_proj",
      "model.layers.41.mlp.shared_experts.up_proj",
      "model.layers.41.mlp.shared_experts.down_proj",
      "model.layers.42.self_attn.q_proj",
      "model.layers.42.self_attn.k_proj",
      "model.layers.42.self_attn.v_proj",
      "model.layers.42.self_attn.o_proj",
      "model.layers.42.mlp.shared_experts.gate_proj",
      "model.layers.42.mlp.shared_experts.up_proj",
      "model.layers.42.mlp.shared_experts.down_proj",
      "model.layers.43.self_attn.q_proj",
      "model.layers.43.self_attn.k_proj",
      "model.layers.43.self_attn.v_proj",
      "model.layers.43.self_attn.o_proj",
      "model.layers.43.mlp.shared_experts.gate_proj",
      "model.layers.43.mlp.shared_experts.up_proj",
      "model.layers.43.mlp.shared_experts.down_proj",
      "model.layers.44.self_attn.q_proj",
      "model.layers.44.self_attn.k_proj",
      "model.layers.44.self_attn.v_proj",
      "model.layers.44.self_attn.o_proj",
      "model.layers.44.mlp.shared_experts.gate_proj",
      "model.layers.44.mlp.shared_experts.up_proj",
      "model.layers.44.mlp.shared_experts.down_proj",
      "model.layers.45.self_attn.q_proj",
      "model.layers.45.self_attn.k_proj",
      "model.layers.45.self_attn.v_proj",
      "model.layers.45.self_attn.o_proj",
      "model.layers.45.mlp.shared_experts.gate_proj",
      "model.layers.45.mlp.shared_experts.up_proj",
      "model.layers.45.mlp.shared_experts.down_proj",
      "lm_head"
    ],
    "kv_cache_scheme": null,
    "quant_method": "compressed-tensors",
    "quantization_status": "compressed",
    "sparsity_config": {},
    "transform_config": {},
    "version": "0.12.2"
  },
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "routed_scaling_factor": 1.0,
  "tie_word_embeddings": false,
  "topk_group": 1,
  "transformers_version": "4.56.2",
  "use_cache": true,
  "use_qk_norm": false,
  "vocab_size": 151552
}
generation_config.json ADDED
@@ -0,0 +1,14 @@
{
  "_from_model_config": true,
  "do_sample": true,
  "eos_token_id": [
    151329,
    151336,
    151338
  ],
  "min_p": 0.05,
  "pad_token_id": 151329,
  "temperature": 0.8,
  "top_p": 0.95,
  "transformers_version": "4.56.2"
}
model-00001-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2f31178a14945e05a33b292a870c234f8a6f2442a6913b5c5c1d54cad5e83489
size 4997984280
model-00002-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f427ff1a9a1fec0ab7ad56272409aed8a2ed783ae0c2fa078df188037b9ad178
size 4998615584
model-00003-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2abc175d520311d0b296c96861acbc49c468b87d225f40cc0b337260653db2b9
size 4998615696
model-00004-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:eda15385dc06c247aaacf24a32b0e3276a79577829af208e5c6573b072047bdf
size 4999357328
model-00005-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b120a3af82ddef351483d0f780df27e14da4252bbde74269eaac32760fe6deab
size 4998619544
model-00006-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2d15e33cec2941d52769d4f0aa87350f530b68096a83d7016398d8b56b74a633
size 4998619656
model-00007-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c95397e718aab16335458598107553f671f4d4301cc40913465ff0da49d22ad
size 4999357512
model-00008-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8be0612ce358b2a19850e3371ee575442bcec26a629e30c76659dc15660545ae
size 4998619544
model-00009-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:59d9176cac635c1f196e20312adda1359ccbaa4309aaa9ca192558a32bd803b7
size 4998619656
model-00010-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f5071929b799809ae04e436faaf1d409963a44551da7023267515519458b4240
size 4999357512
model-00011-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a909d45b42ee120da85ca78f816274e2f4def602622a825d196ab2bc45f3b120
size 4998619544
model-00012-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3acf084cc7e8c59cb9caf16a9f9d914770645227eed3784f8499d1766a1be964
size 4998619656
model-00013-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3e82b2b3be9a66e0445dd0b1bf7cd2a72fb185eec705d613243d80294863e551
size 4999357512
model-00014-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c802cefaea29976a29f6db9b5bf460a67639900ff1a488a6f138500fadfe3df5
size 4228942984
model-00015-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3215a617243d684b885879f7ba6eee3f5c346651709bc26c02551ca55bd54297
size 1241514112
model.safetensors.index.json ADDED
The diff for this file is too large to render.
recipe.yaml ADDED
@@ -0,0 +1,28 @@
default_stage:
  default_modifiers:
    AWQModifier:
      config_groups:
        group_0:
          targets: ['re:.*mlp\.experts\.[0-9]+\.(down|gate|up)_proj$']
          weights:
            num_bits: 4
            type: int
            symmetric: true
            group_size: 32
            strategy: group
            block_structure: null
            dynamic: false
            actorder: null
            observer: mse
            observer_kwargs: {}
            input_activations: null
            output_activations: null
            format: null
      targets: ['re:.*mlp\.experts\.[0-9]+\.(down|gate|up)_proj$']
      ignore: []
      mappings:
      - smooth_layer: re:.*post_attention_layernorm$
        balance_layers: ['re:.*gate_proj$', 're:.*up_proj$']
      - smooth_layer: re:.*up_proj$
        balance_layers: ['re:.*down_proj$']
      duo_scaling: true
special_tokens_map.json ADDED
@@ -0,0 +1,40 @@
{
  "additional_special_tokens": [
    "<|endoftext|>",
    "[MASK]",
    "[gMASK]",
    "[sMASK]",
    "<sop>",
    "<eop>",
    "<|system|>",
    "<|user|>",
    "<|assistant|>",
    "<|observation|>",
    "<|begin_of_image|>",
    "<|end_of_image|>",
    "<|begin_of_video|>",
    "<|end_of_video|>",
    "<|begin_of_audio|>",
    "<|end_of_audio|>",
    "<|begin_of_transcription|>",
    "<|end_of_transcription|>",
    "<|code_prefix|>",
    "<|code_middle|>",
    "<|code_suffix|>",
    "/nothink"
  ],
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5f0a7d462ffab1bb3530eb290e68bd4f8eb603e92080d8d66bdf3ccd03bcbcb3
size 19970799
tokenizer_config.json ADDED
@@ -0,0 +1,325 @@
{
  "added_tokens_decoder": {
    "151329": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151330": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151331": {
      "content": "[gMASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151332": {
      "content": "[sMASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151333": {
      "content": "<sop>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151334": {
      "content": "<eop>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151335": {
      "content": "<|system|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151336": {
      "content": "<|user|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151337": {
      "content": "<|assistant|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151338": {
      "content": "<|observation|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151339": {
      "content": "<|begin_of_image|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151340": {
      "content": "<|end_of_image|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151341": {
      "content": "<|begin_of_video|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151342": {
      "content": "<|end_of_video|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151343": {
      "content": "<|begin_of_audio|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151344": {
      "content": "<|end_of_audio|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151345": {
      "content": "<|begin_of_transcription|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151346": {
      "content": "<|end_of_transcription|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151347": {
      "content": "<|code_prefix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151348": {
      "content": "<|code_middle|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151349": {
      "content": "<|code_suffix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151350": {
      "content": "<think>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151351": {
      "content": "</think>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151352": {
      "content": "<tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151353": {
      "content": "</tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151354": {
      "content": "<tool_response>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151355": {
      "content": "</tool_response>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151356": {
      "content": "<arg_key>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151357": {
      "content": "</arg_key>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151358": {
      "content": "<arg_value>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151359": {
      "content": "</arg_value>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151360": {
      "content": "/nothink",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151361": {
      "content": "<|begin_of_box|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151362": {
      "content": "<|end_of_box|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151363": {
      "content": "<|image|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151364": {
      "content": "<|video|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    }
  },
  "additional_special_tokens": [
    "<|endoftext|>",
    "[MASK]",
    "[gMASK]",
    "[sMASK]",
    "<sop>",
    "<eop>",
    "<|system|>",
    "<|user|>",
    "<|assistant|>",
    "<|observation|>",
    "<|begin_of_image|>",
    "<|end_of_image|>",
    "<|begin_of_video|>",
    "<|end_of_video|>",
    "<|begin_of_audio|>",
    "<|end_of_audio|>",
    "<|begin_of_transcription|>",
    "<|end_of_transcription|>",
    "<|code_prefix|>",
    "<|code_middle|>",
    "<|code_suffix|>",
    "/nothink"
  ],
  "clean_up_tokenization_spaces": false,
  "do_lower_case": false,
  "eos_token": "<|endoftext|>",
  "extra_special_tokens": {},
  "model_max_length": 128000,
  "pad_token": "<|endoftext|>",
  "padding_side": "left",
  "remove_space": false,
  "tokenizer_class": "PreTrainedTokenizerFast"
}