---
quantized_by: anikifoss
pipeline_tag: text-generation
base_model: Qwen/Qwen3-Coder-480B-A35B-Instruct
license: apache-2.0
base_model_relation: quantized
tags:
- conversational
---
# Model Card
A high-quality quantization of **Qwen3-Coder-480B-A35B-Instruct**, made without an importance matrix (imatrix).
# Run
## ik_llama.cpp
See [this detailed guide](https://github.com/ikawrakow/ik_llama.cpp/discussions/258) on how to set up ik_llama.cpp and how to make custom quants.
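If you have not built ik_llama.cpp yet, here is a minimal build sketch, assuming CMake and the CUDA toolkit are installed (flag names can vary between versions, so check the guide above):
```bash
git clone https://github.com/ikawrakow/ik_llama.cpp
cd ik_llama.cpp
cmake -B build -DGGML_CUDA=ON   # for a CPU-only build, omit the CUDA flag
cmake --build build --config Release -j
```
In the command below, `--n-gpu-layers 99` offloads all layers to the GPU while `--override-tensor exps=CPU` keeps the large MoE expert tensors in system RAM; `-rtr` (run-time repacking) and `-fmoe` (fused MoE) are ik_llama.cpp-specific optimizations.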
```bash
./build/bin/llama-server \
--alias anikifoss/Qwen3-Coder-480B-A35B-Instruct-HQ4_K \
--model /mnt/data/Models/anikifoss/Qwen3-Coder-480B-A35B-Instruct-HQ4_K/Qwen3-Coder-480B-A35B-Instruct-HQ4_K-00001-of-00007.gguf \
--no-mmap -rtr \
--temp 0.5 --top-k 0 --top-p 1.0 --min-p 0.1 --repeat-penalty 1.0 \
--ctx-size 51000 \
-ctk f16 -ctv f16 \
-fa \
-b 1024 -ub 1024 \
-fmoe \
--n-gpu-layers 99 \
--override-tensor exps=CPU \
--parallel 1 \
--threads 32 \
--threads-batch 64 \
--host 127.0.0.1 \
--port 8090
```
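Once the server is up, a quick smoke test against the OpenAI-compatible endpoint (host, port, and model name taken from the flags above):
```bash
curl http://127.0.0.1:8090/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "anikifoss/Qwen3-Coder-480B-A35B-Instruct-HQ4_K",
        "messages": [{"role": "user", "content": "Write a Fibonacci function in Python."}]
      }'
```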
## llama.cpp
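The same command works with mainline llama.cpp, minus the ik_llama.cpp-specific `-rtr` and `-fmoe` flags: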
```bash
./build/bin/llama-server \
--alias anikifoss/Qwen3-Coder-480B-A35B-Instruct-HQ4_K \
--model /mnt/data/Models/anikifoss/Qwen3-Coder-480B-A35B-Instruct-HQ4_K/Qwen3-Coder-480B-A35B-Instruct-HQ4_K-00001-of-00007.gguf \
--no-mmap \
--temp 0.5 --top-k 0 --top-p 1.0 --min-p 0.1 --repeat-penalty 1.0 \
--ctx-size 51000 \
-ctk f16 -ctv f16 \
-fa \
-b 1024 -ub 1024 \
--n-gpu-layers 99 \
--override-tensor exps=CPU \
--parallel 1 \
--threads 32 \
--threads-batch 64 \
--host 127.0.0.1 \
--port 8090
```
## Quantization Recipe
Quantized with [ik_llama.cpp](https://github.com/ikawrakow/ik_llama.cpp), but the output uses standard GGUF quantization types (bf16, q8_0, q6_K, q4_K, f32), so it should work with any GGUF-compatible inference framework.
```bash
#!/usr/bin/env bash
custom="
# Token embedding and output tensors
output\.weight=bf16
output_norm\.weight=f32
token_embd\.weight=bf16
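# Blocks 0-9: attention tensors at q8_0, expert down-projections at q6_K, expert gate/up at q4_K, norms and router at f32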
blk\.[0-9]\.attn_k\.weight=q8_0
blk\.[0-9]\.attn_k_norm\.weight=f32
blk\.[0-9]\.attn_norm\.weight=f32
blk\.[0-9]\.attn_output\.weight=q8_0
blk\.[0-9]\.attn_q\.weight=q8_0
blk\.[0-9]\.attn_q_norm\.weight=f32
blk\.[0-9]\.attn_v\.weight=q8_0
blk\.[0-9]\.ffn_down_exps\.weight=q6_K
blk\.[0-9]\.ffn_gate_exps\.weight=q4_K
blk\.[0-9]\.ffn_up_exps\.weight=q4_K
blk\.[0-9]\.ffn_gate_inp\.weight=f32
blk\.[0-9]\.ffn_norm\.weight=f32
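# Blocks 10-59: same per-block scheme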
blk\.[1-5][0-9]\.attn_k\.weight=q8_0
blk\.[1-5][0-9]\.attn_k_norm\.weight=f32
blk\.[1-5][0-9]\.attn_norm\.weight=f32
blk\.[1-5][0-9]\.attn_output\.weight=q8_0
blk\.[1-5][0-9]\.attn_q\.weight=q8_0
blk\.[1-5][0-9]\.attn_q_norm\.weight=f32
blk\.[1-5][0-9]\.attn_v\.weight=q8_0
blk\.[1-5][0-9]\.ffn_down_exps\.weight=q6_K
blk\.[1-5][0-9]\.ffn_gate_exps\.weight=q4_K
blk\.[1-5][0-9]\.ffn_up_exps\.weight=q4_K
blk\.[1-5][0-9]\.ffn_gate_inp\.weight=f32
blk\.[1-5][0-9]\.ffn_norm\.weight=f32
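# Blocks 60-61: same per-block scheme (the rules above cover all 62 blocks)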
blk\.6[0-1]\.attn_k\.weight=q8_0
blk\.6[0-1]\.attn_k_norm\.weight=f32
blk\.6[0-1]\.attn_norm\.weight=f32
blk\.6[0-1]\.attn_output\.weight=q8_0
blk\.6[0-1]\.attn_q\.weight=q8_0
blk\.6[0-1]\.attn_q_norm\.weight=f32
blk\.6[0-1]\.attn_v\.weight=q8_0
blk\.6[0-1]\.ffn_down_exps\.weight=q6_K
blk\.6[0-1]\.ffn_gate_exps\.weight=q4_K
blk\.6[0-1]\.ffn_up_exps\.weight=q4_K
blk\.6[0-1]\.ffn_gate_inp\.weight=f32
blk\.6[0-1]\.ffn_norm\.weight=f32
"
custom=$(
echo "$custom" | grep -v '^#' | \
sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
)
echo "Running with: -custom-q $custom"
TARGET_MODEL="Qwen3-Coder-480B-A35B-Instruct-HQ4_K"
mkdir -p ~/Env/models/anikifoss/$TARGET_MODEL
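# Q4_K is the fallback type for any tensor not matched by a rule above; 32 is the thread count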
./build/bin/llama-quantize \
--custom-q "$custom" \
/mnt/data/Models/Qwen/Qwen3-Coder-480B-A35B-Instruct-GGUF/Qwen3-Coder-480B-A35B-Instruct-BF16-00001-of-00021.gguf \
~/Env/models/anikifoss/$TARGET_MODEL/$TARGET_MODEL.gguf \
Q4_K \
32
```
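For reference, the pipeline above collapses the rule block into a single comma-separated `--custom-q` argument, so the effective value looks like this (truncated):
```
output\.weight=bf16,output_norm\.weight=f32,token_embd\.weight=bf16,blk\.[0-9]\.attn_k\.weight=q8_0,...
```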