Commit 91b390b
Parent(s): 2d6f4c3
new variant found

Files changed:
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_f16/llamabench.txt +11 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_f16/perplexity_code.txt +189 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_f16/perplexity_general.txt +189 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_f16/perplexity_math.txt +189 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_f16/ppl_corpus_code.txt +0 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_f16/ppl_corpus_general.txt +0 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_f16/ppl_corpus_math.txt +0 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_q6_k/llamabench.txt +11 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_q6_k/perplexity_code.txt +190 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_q6_k/perplexity_general.txt +190 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_q6_k/perplexity_math.txt +190 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_q6_k/ppl_corpus_code.txt +0 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_q6_k/ppl_corpus_general.txt +0 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_q6_k/ppl_corpus_math.txt +0 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-embd_f16/llamabench.txt +11 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-embd_f16/perplexity_code.txt +190 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-embd_f16/perplexity_general.txt +190 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-embd_f16/perplexity_math.txt +190 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-embd_f16/ppl_corpus_code.txt +0 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-embd_f16/ppl_corpus_general.txt +0 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-embd_f16/ppl_corpus_math.txt +0 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-router_gate_emb_f16/llamabench.txt +11 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-router_gate_emb_f16/perplexity_code.txt +190 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-router_gate_emb_f16/perplexity_general.txt +190 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-router_gate_emb_f16/perplexity_math.txt +190 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-router_gate_emb_f16/ppl_corpus_code.txt +0 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-router_gate_emb_f16/ppl_corpus_general.txt +0 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-router_gate_emb_f16/ppl_corpus_math.txt +0 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-embd_f16/llamabench.txt +11 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-embd_f16/perplexity_code.txt +190 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-embd_f16/perplexity_general.txt +190 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-embd_f16/perplexity_math.txt +190 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-embd_f16/ppl_corpus_code.txt +0 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-embd_f16/ppl_corpus_general.txt +0 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-embd_f16/ppl_corpus_math.txt +0 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16/llamabench.txt +11 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16/perplexity_code.txt +190 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16/perplexity_general.txt +190 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16/perplexity_math.txt +190 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16/ppl_corpus_code.txt +0 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16/ppl_corpus_general.txt +0 -0
- Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16/ppl_corpus_math.txt +0 -0
- README.md +135 -63
- granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16.gguf +3 -0
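Each perplexity_*.txt log below ends with a line of the form `Final estimate: PPL = … +/- …`, which is the exponential of the mean per-token negative log-likelihood over the evaluated chunks. As a rough illustrative sketch of that aggregation (not the repository's tooling; the function name and sample values are hypothetical):

```python
import math

def perplexity(token_logprobs):
    """Illustrative only: fold per-token natural-log probabilities into a
    perplexity figure, PPL = exp(mean negative log-likelihood)."""
    nll = [-lp for lp in token_logprobs]
    return math.exp(sum(nll) / len(nll))

# Hypothetical log-probabilities for three tokens:
print(perplexity([-0.9, -1.2, -0.7]))  # ≈ 2.543
```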
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_f16/llamabench.txt
ADDED
@@ -0,0 +1,11 @@
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 2 CUDA devices:
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
| granitehybrid 350M MXFP4 MoE | 461.84 MiB | 340.33 M | CUDA | 35 | pp8 | 1618.73 ± 49.27 |
| granitehybrid 350M MXFP4 MoE | 461.84 MiB | 340.33 M | CUDA | 35 | tg128 | 286.08 ± 16.41 |

build: 92bb442ad (7040)
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_f16/perplexity_code.txt
ADDED
@@ -0,0 +1,189 @@
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 2 CUDA devices:
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 21119 MiB free
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
llama_model_loader: loaded meta data with 48 key-value pairs and 402 tensors from /mnt/world8/AI/Models/granite-4.0-h-350m-unsloth/GGUF/MXFP4/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = granitehybrid
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = Granite 4.0 H 350m Unsloth
llama_model_loader: - kv 3: general.finetune str = unsloth
llama_model_loader: - kv 4: general.basename str = granite-4.0-h
llama_model_loader: - kv 5: general.size_label str = 350M
llama_model_loader: - kv 6: general.license str = apache-2.0
llama_model_loader: - kv 7: general.base_model.count u32 = 1
llama_model_loader: - kv 8: general.base_model.0.name str = Granite 4.0 H 350m
llama_model_loader: - kv 9: general.base_model.0.organization str = Ibm Granite
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/ibm-granite/gr...
llama_model_loader: - kv 11: general.tags arr[str,3] = ["language", "unsloth", "granite-4.0"]
llama_model_loader: - kv 12: granitehybrid.block_count u32 = 32
llama_model_loader: - kv 13: granitehybrid.context_length u32 = 1048576
llama_model_loader: - kv 14: granitehybrid.embedding_length u32 = 768
llama_model_loader: - kv 15: granitehybrid.feed_forward_length u32 = 2048
llama_model_loader: - kv 16: granitehybrid.attention.head_count u32 = 12
llama_model_loader: - kv 17: granitehybrid.attention.head_count_kv arr[i32,32] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
llama_model_loader: - kv 18: granitehybrid.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 19: granitehybrid.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 20: granitehybrid.expert_count u32 = 0
llama_model_loader: - kv 21: granitehybrid.expert_used_count u32 = 0
llama_model_loader: - kv 22: granitehybrid.vocab_size u32 = 100352
llama_model_loader: - kv 23: granitehybrid.rope.dimension_count u32 = 64
llama_model_loader: - kv 24: granitehybrid.attention.scale f32 = 0.015625
llama_model_loader: - kv 25: granitehybrid.embedding_scale f32 = 12.000000
llama_model_loader: - kv 26: granitehybrid.residual_scale f32 = 0.246000
llama_model_loader: - kv 27: granitehybrid.logit_scale f32 = 3.000000
llama_model_loader: - kv 28: granitehybrid.expert_shared_feed_forward_length u32 = 2048
llama_model_loader: - kv 29: granitehybrid.ssm.conv_kernel u32 = 4
llama_model_loader: - kv 30: granitehybrid.ssm.state_size u32 = 128
llama_model_loader: - kv 31: granitehybrid.ssm.group_count u32 = 1
llama_model_loader: - kv 32: granitehybrid.ssm.inner_size u32 = 1536
llama_model_loader: - kv 33: granitehybrid.ssm.time_step_rank u32 = 48
llama_model_loader: - kv 34: granitehybrid.rope.scaling.finetuned bool = false
llama_model_loader: - kv 35: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 36: tokenizer.ggml.pre str = dbrx
llama_model_loader: - kv 37: tokenizer.ggml.tokens arr[str,100352] = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv 38: tokenizer.ggml.token_type arr[i32,100352] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 39: tokenizer.ggml.merges arr[str,100000] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv 40: tokenizer.ggml.bos_token_id u32 = 100257
llama_model_loader: - kv 41: tokenizer.ggml.eos_token_id u32 = 100257
llama_model_loader: - kv 42: tokenizer.ggml.unknown_token_id u32 = 100269
llama_model_loader: - kv 43: tokenizer.ggml.padding_token_id u32 = 100256
llama_model_loader: - kv 44: tokenizer.ggml.add_bos_token bool = false
llama_model_loader: - kv 45: tokenizer.chat_template str = {%- set tools_system_message_prefix =...
llama_model_loader: - kv 46: general.quantization_version u32 = 2
llama_model_loader: - kv 47: general.file_type u32 = 38
llama_model_loader: - type f32: 233 tensors
llama_model_loader: - type f16: 37 tensors
llama_model_loader: - type q8_0: 132 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type = MXFP4 MoE
print_info: file size = 461.84 MiB (11.38 BPW)
load: printing all EOG tokens:
load: - 100257 ('<|end_of_text|>')
load: - 100261 ('<|fim_pad|>')
load: special tokens cache size = 96
load: token to piece cache size = 0.6152 MB
print_info: arch = granitehybrid
print_info: vocab_only = 0
print_info: n_ctx_train = 1048576
print_info: n_embd = 768
print_info: n_embd_inp = 768
print_info: n_layer = 32
print_info: n_head = 12
print_info: n_head_kv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]
print_info: n_rot = 64
print_info: n_swa = 0
print_info: is_swa_any = 0
print_info: n_embd_head_k = 64
print_info: n_embd_head_v = 64
print_info: n_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]
print_info: n_embd_k_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
print_info: n_embd_v_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
print_info: f_norm_eps = 0.0e+00
print_info: f_norm_rms_eps = 1.0e-05
print_info: f_clamp_kqv = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale = 3.0e+00
print_info: f_attn_scale = 1.6e-02
print_info: n_ff = 2048
print_info: n_expert = 0
print_info: n_expert_used = 0
print_info: n_expert_groups = 0
print_info: n_group_used = 0
print_info: causal attn = 1
print_info: pooling type = 0
print_info: rope type = 0
print_info: rope scaling = linear
print_info: freq_base_train = 10000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn = 1048576
print_info: rope_finetuned = unknown
print_info: ssm_d_conv = 4
print_info: ssm_d_inner = 1536
print_info: ssm_d_state = 128
print_info: ssm_dt_rank = 48
print_info: ssm_n_group = 1
print_info: ssm_dt_b_c_rms = 0
print_info: model type = 350M
print_info: model params = 340.33 M
print_info: general.name = Granite 4.0 H 350m Unsloth
print_info: f_embedding_scale = 12.000000
print_info: f_residual_scale = 0.246000
print_info: f_attention_scale = 0.015625
print_info: n_ff_shexp = 2048
print_info: vocab type = BPE
print_info: n_vocab = 100352
print_info: n_merges = 100000
print_info: BOS token = 100257 '<|end_of_text|>'
print_info: EOS token = 100257 '<|end_of_text|>'
print_info: EOT token = 100257 '<|end_of_text|>'
print_info: UNK token = 100269 '<|unk|>'
print_info: PAD token = 100256 '<|pad|>'
print_info: LF token = 198 'Ċ'
print_info: FIM PRE token = 100258 '<|fim_prefix|>'
print_info: FIM SUF token = 100260 '<|fim_suffix|>'
print_info: FIM MID token = 100259 '<|fim_middle|>'
print_info: FIM PAD token = 100261 '<|fim_pad|>'
print_info: EOG token = 100257 '<|end_of_text|>'
print_info: EOG token = 100261 '<|fim_pad|>'
print_info: max token length = 256
load_tensors: loading model tensors, this can take a while... (mmap = true)
load_tensors: offloading 20 repeating layers to GPU
load_tensors: offloaded 20/33 layers to GPU
load_tensors: CPU_Mapped model buffer size = 265.94 MiB
load_tensors: CUDA0 model buffer size = 97.09 MiB
load_tensors: CUDA1 model buffer size = 98.83 MiB
......................................................................
llama_context: constructing llama_context
llama_context: n_seq_max = 1
llama_context: n_ctx = 2048
llama_context: n_ctx_seq = 2048
llama_context: n_batch = 2048
llama_context: n_ubatch = 512
llama_context: causal_attn = 1
llama_context: flash_attn = auto
llama_context: kv_unified = false
llama_context: freq_base = 10000.0
llama_context: freq_scale = 1
llama_context: n_ctx_seq (2048) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized
llama_context: CPU output buffer size = 0.38 MiB
llama_kv_cache: CPU KV buffer size = 2.00 MiB
llama_kv_cache: CUDA0 KV buffer size = 4.00 MiB
llama_kv_cache: CUDA1 KV buffer size = 2.00 MiB
llama_kv_cache: size = 8.00 MiB ( 2048 cells, 4 layers, 1/1 seqs), K (f16): 4.00 MiB, V (f16): 4.00 MiB
llama_memory_recurrent: CPU RS buffer size = 8.48 MiB
llama_memory_recurrent: CUDA0 RS buffer size = 6.16 MiB
llama_memory_recurrent: CUDA1 RS buffer size = 6.93 MiB
llama_memory_recurrent: size = 21.57 MiB ( 1 cells, 32 layers, 1 seqs), R (f32): 0.57 MiB, S (f32): 21.00 MiB
llama_context: Flash Attention was auto, set to enabled
llama_context: CUDA0 compute buffer size = 354.10 MiB
llama_context: CUDA1 compute buffer size = 22.39 MiB
llama_context: CUDA_Host compute buffer size = 18.34 MiB
llama_context: graph nodes = 1815
llama_context: graph splits = 182 (with bs=512), 41 (with bs=1)
common_init_from_params: added <|end_of_text|> logit bias = -inf
common_init_from_params: added <|fim_pad|> logit bias = -inf
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)

system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
perplexity: tokenizing the input ..
perplexity: tokenization took 122.013 ms
perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
perplexity: 0.52 seconds per pass - ETA 0.37 minutes
[1]4.3826,[2]3.9809,[3]2.5707,[4]2.3680,[5]2.6033,[6]2.8480,[7]2.6996,[8]2.5077,[9]2.3051,[10]2.1368,[11]2.1196,[12]2.1460,[13]2.0577,[14]2.0376,[15]2.0782,[16]2.0125,[17]1.9874,[18]2.0058,[19]1.9662,[20]1.9310,[21]1.8981,[22]1.8832,[23]1.9121,[24]1.8857,[25]1.9047,[26]1.8728,[27]1.8596,[28]1.8513,[29]1.8970,[30]1.9135,[31]1.9123,[32]1.8881,[33]1.9116,[34]1.9038,[35]1.8850,[36]1.9163,[37]1.9227,[38]1.9207,[39]1.9422,[40]1.9397,[41]1.9327,[42]1.9567,[43]1.9653,[44]1.9546,
Final estimate: PPL = 1.9546 +/- 0.01751

llama_perf_context_print: load time = 221.21 ms
llama_perf_context_print: prompt eval time = 15337.30 ms / 90112 tokens ( 0.17 ms per token, 5875.35 tokens per second)
llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
llama_perf_context_print: total time = 16163.30 ms / 90113 tokens
llama_perf_context_print: graphs reused = 0
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24107 = 20511 + ( 461 = 97 + 10 + 354) + 3134 |
llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 23372 + ( 130 = 98 + 8 + 22) + 621 |
llama_memory_breakdown_print: | - Host | 294 = 265 + 10 + 18 |
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_f16/perplexity_general.txt
ADDED
@@ -0,0 +1,189 @@
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 2 CUDA devices:
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 21247 MiB free
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
llama_model_loader: loaded meta data with 48 key-value pairs and 402 tensors from /mnt/world8/AI/Models/granite-4.0-h-350m-unsloth/GGUF/MXFP4/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = granitehybrid
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = Granite 4.0 H 350m Unsloth
llama_model_loader: - kv 3: general.finetune str = unsloth
llama_model_loader: - kv 4: general.basename str = granite-4.0-h
llama_model_loader: - kv 5: general.size_label str = 350M
llama_model_loader: - kv 6: general.license str = apache-2.0
llama_model_loader: - kv 7: general.base_model.count u32 = 1
llama_model_loader: - kv 8: general.base_model.0.name str = Granite 4.0 H 350m
llama_model_loader: - kv 9: general.base_model.0.organization str = Ibm Granite
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/ibm-granite/gr...
llama_model_loader: - kv 11: general.tags arr[str,3] = ["language", "unsloth", "granite-4.0"]
llama_model_loader: - kv 12: granitehybrid.block_count u32 = 32
llama_model_loader: - kv 13: granitehybrid.context_length u32 = 1048576
llama_model_loader: - kv 14: granitehybrid.embedding_length u32 = 768
llama_model_loader: - kv 15: granitehybrid.feed_forward_length u32 = 2048
llama_model_loader: - kv 16: granitehybrid.attention.head_count u32 = 12
llama_model_loader: - kv 17: granitehybrid.attention.head_count_kv arr[i32,32] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
llama_model_loader: - kv 18: granitehybrid.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 19: granitehybrid.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 20: granitehybrid.expert_count u32 = 0
llama_model_loader: - kv 21: granitehybrid.expert_used_count u32 = 0
llama_model_loader: - kv 22: granitehybrid.vocab_size u32 = 100352
llama_model_loader: - kv 23: granitehybrid.rope.dimension_count u32 = 64
llama_model_loader: - kv 24: granitehybrid.attention.scale f32 = 0.015625
llama_model_loader: - kv 25: granitehybrid.embedding_scale f32 = 12.000000
llama_model_loader: - kv 26: granitehybrid.residual_scale f32 = 0.246000
llama_model_loader: - kv 27: granitehybrid.logit_scale f32 = 3.000000
llama_model_loader: - kv 28: granitehybrid.expert_shared_feed_forward_length u32 = 2048
llama_model_loader: - kv 29: granitehybrid.ssm.conv_kernel u32 = 4
llama_model_loader: - kv 30: granitehybrid.ssm.state_size u32 = 128
llama_model_loader: - kv 31: granitehybrid.ssm.group_count u32 = 1
llama_model_loader: - kv 32: granitehybrid.ssm.inner_size u32 = 1536
llama_model_loader: - kv 33: granitehybrid.ssm.time_step_rank u32 = 48
llama_model_loader: - kv 34: granitehybrid.rope.scaling.finetuned bool = false
llama_model_loader: - kv 35: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 36: tokenizer.ggml.pre str = dbrx
llama_model_loader: - kv 37: tokenizer.ggml.tokens arr[str,100352] = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv 38: tokenizer.ggml.token_type arr[i32,100352] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 39: tokenizer.ggml.merges arr[str,100000] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv 40: tokenizer.ggml.bos_token_id u32 = 100257
llama_model_loader: - kv 41: tokenizer.ggml.eos_token_id u32 = 100257
llama_model_loader: - kv 42: tokenizer.ggml.unknown_token_id u32 = 100269
llama_model_loader: - kv 43: tokenizer.ggml.padding_token_id u32 = 100256
llama_model_loader: - kv 44: tokenizer.ggml.add_bos_token bool = false
llama_model_loader: - kv 45: tokenizer.chat_template str = {%- set tools_system_message_prefix =...
llama_model_loader: - kv 46: general.quantization_version u32 = 2
llama_model_loader: - kv 47: general.file_type u32 = 38
llama_model_loader: - type f32: 233 tensors
llama_model_loader: - type f16: 37 tensors
llama_model_loader: - type q8_0: 132 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type = MXFP4 MoE
print_info: file size = 461.84 MiB (11.38 BPW)
load: printing all EOG tokens:
load: - 100257 ('<|end_of_text|>')
load: - 100261 ('<|fim_pad|>')
load: special tokens cache size = 96
load: token to piece cache size = 0.6152 MB
print_info: arch = granitehybrid
print_info: vocab_only = 0
print_info: n_ctx_train = 1048576
print_info: n_embd = 768
print_info: n_embd_inp = 768
print_info: n_layer = 32
print_info: n_head = 12
print_info: n_head_kv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]
print_info: n_rot = 64
print_info: n_swa = 0
print_info: is_swa_any = 0
print_info: n_embd_head_k = 64
print_info: n_embd_head_v = 64
print_info: n_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]
print_info: n_embd_k_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
print_info: n_embd_v_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
print_info: f_norm_eps = 0.0e+00
print_info: f_norm_rms_eps = 1.0e-05
print_info: f_clamp_kqv = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale = 3.0e+00
print_info: f_attn_scale = 1.6e-02
print_info: n_ff = 2048
print_info: n_expert = 0
print_info: n_expert_used = 0
print_info: n_expert_groups = 0
print_info: n_group_used = 0
print_info: causal attn = 1
print_info: pooling type = 0
print_info: rope type = 0
print_info: rope scaling = linear
print_info: freq_base_train = 10000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn = 1048576
print_info: rope_finetuned = unknown
print_info: ssm_d_conv = 4
print_info: ssm_d_inner = 1536
print_info: ssm_d_state = 128
print_info: ssm_dt_rank = 48
print_info: ssm_n_group = 1
print_info: ssm_dt_b_c_rms = 0
print_info: model type = 350M
print_info: model params = 340.33 M
print_info: general.name = Granite 4.0 H 350m Unsloth
print_info: f_embedding_scale = 12.000000
print_info: f_residual_scale = 0.246000
print_info: f_attention_scale = 0.015625
print_info: n_ff_shexp = 2048
print_info: vocab type = BPE
print_info: n_vocab = 100352
print_info: n_merges = 100000
print_info: BOS token = 100257 '<|end_of_text|>'
print_info: EOS token = 100257 '<|end_of_text|>'
print_info: EOT token = 100257 '<|end_of_text|>'
print_info: UNK token = 100269 '<|unk|>'
print_info: PAD token = 100256 '<|pad|>'
print_info: LF token = 198 'Ċ'
print_info: FIM PRE token = 100258 '<|fim_prefix|>'
print_info: FIM SUF token = 100260 '<|fim_suffix|>'
print_info: FIM MID token = 100259 '<|fim_middle|>'
print_info: FIM PAD token = 100261 '<|fim_pad|>'
print_info: EOG token = 100257 '<|end_of_text|>'
print_info: EOG token = 100261 '<|fim_pad|>'
print_info: max token length = 256
load_tensors: loading model tensors, this can take a while... (mmap = true)
load_tensors: offloading 20 repeating layers to GPU
load_tensors: offloaded 20/33 layers to GPU
load_tensors: CPU_Mapped model buffer size = 265.94 MiB
load_tensors: CUDA0 model buffer size = 97.09 MiB
load_tensors: CUDA1 model buffer size = 98.83 MiB
......................................................................
llama_context: constructing llama_context
llama_context: n_seq_max = 1
llama_context: n_ctx = 2048
llama_context: n_ctx_seq = 2048
llama_context: n_batch = 2048
llama_context: n_ubatch = 512
llama_context: causal_attn = 1
llama_context: flash_attn = auto
llama_context: kv_unified = false
llama_context: freq_base = 10000.0
llama_context: freq_scale = 1
llama_context: n_ctx_seq (2048) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized
llama_context: CPU output buffer size = 0.38 MiB
llama_kv_cache: CPU KV buffer size = 2.00 MiB
llama_kv_cache: CUDA0 KV buffer size = 4.00 MiB
llama_kv_cache: CUDA1 KV buffer size = 2.00 MiB
llama_kv_cache: size = 8.00 MiB ( 2048 cells, 4 layers, 1/1 seqs), K (f16): 4.00 MiB, V (f16): 4.00 MiB
llama_memory_recurrent: CPU RS buffer size = 8.48 MiB
llama_memory_recurrent: CUDA0 RS buffer size = 6.16 MiB
llama_memory_recurrent: CUDA1 RS buffer size = 6.93 MiB
llama_memory_recurrent: size = 21.57 MiB ( 1 cells, 32 layers, 1 seqs), R (f32): 0.57 MiB, S (f32): 21.00 MiB
llama_context: Flash Attention was auto, set to enabled
llama_context: CUDA0 compute buffer size = 354.10 MiB
llama_context: CUDA1 compute buffer size = 22.39 MiB
llama_context: CUDA_Host compute buffer size = 18.34 MiB
llama_context: graph nodes = 1815
llama_context: graph splits = 182 (with bs=512), 41 (with bs=1)
common_init_from_params: added <|end_of_text|> logit bias = -inf
common_init_from_params: added <|fim_pad|> logit bias = -inf
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)

system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
perplexity: tokenizing the input ..
perplexity: tokenization took 39.223 ms
perplexity: calculating perplexity over 14 chunks, n_ctx=2048, batch_size=2048, n_seq=1
perplexity: 0.58 seconds per pass - ETA 0.13 minutes
[1]18.6243,[2]21.6400,[3]22.2933,[4]20.2232,[5]20.2199,[6]18.0090,[7]17.6263,[8]17.5788,[9]18.0953,[10]18.0725,[11]17.9160,[12]18.0402,[13]18.1147,[14]18.1532,
Final estimate: PPL = 18.1532 +/- 0.46672

llama_perf_context_print: load time = 258.27 ms
llama_perf_context_print: prompt eval time = 5276.13 ms / 28672 tokens ( 0.18 ms per token, 5434.29 tokens per second)
llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
llama_perf_context_print: total time = 5558.16 ms / 28673 tokens
llama_perf_context_print: graphs reused = 0
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24107 = 20511 + ( 461 = 97 + 10 + 354) + 3134 |
llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 23372 + ( 130 = 98 + 8 + 22) + 621 |
llama_memory_breakdown_print: | - Host | 294 = 265 + 10 + 18 |
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_f16/perplexity_math.txt
ADDED
@@ -0,0 +1,189 @@
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 2 CUDA devices:
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 21119 MiB free
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
llama_model_loader: loaded meta data with 48 key-value pairs and 402 tensors from /mnt/world8/AI/Models/granite-4.0-h-350m-unsloth/GGUF/MXFP4/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = granitehybrid
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = Granite 4.0 H 350m Unsloth
llama_model_loader: - kv 3: general.finetune str = unsloth
llama_model_loader: - kv 4: general.basename str = granite-4.0-h
llama_model_loader: - kv 5: general.size_label str = 350M
llama_model_loader: - kv 6: general.license str = apache-2.0
llama_model_loader: - kv 7: general.base_model.count u32 = 1
llama_model_loader: - kv 8: general.base_model.0.name str = Granite 4.0 H 350m
llama_model_loader: - kv 9: general.base_model.0.organization str = Ibm Granite
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/ibm-granite/gr...
llama_model_loader: - kv 11: general.tags arr[str,3] = ["language", "unsloth", "granite-4.0"]
llama_model_loader: - kv 12: granitehybrid.block_count u32 = 32
llama_model_loader: - kv 13: granitehybrid.context_length u32 = 1048576
llama_model_loader: - kv 14: granitehybrid.embedding_length u32 = 768
llama_model_loader: - kv 15: granitehybrid.feed_forward_length u32 = 2048
llama_model_loader: - kv 16: granitehybrid.attention.head_count u32 = 12
llama_model_loader: - kv 17: granitehybrid.attention.head_count_kv arr[i32,32] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
llama_model_loader: - kv 18: granitehybrid.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 19: granitehybrid.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 20: granitehybrid.expert_count u32 = 0
llama_model_loader: - kv 21: granitehybrid.expert_used_count u32 = 0
llama_model_loader: - kv 22: granitehybrid.vocab_size u32 = 100352
llama_model_loader: - kv 23: granitehybrid.rope.dimension_count u32 = 64
llama_model_loader: - kv 24: granitehybrid.attention.scale f32 = 0.015625
llama_model_loader: - kv 25: granitehybrid.embedding_scale f32 = 12.000000
llama_model_loader: - kv 26: granitehybrid.residual_scale f32 = 0.246000
llama_model_loader: - kv 27: granitehybrid.logit_scale f32 = 3.000000
llama_model_loader: - kv 28: granitehybrid.expert_shared_feed_forward_length u32 = 2048
llama_model_loader: - kv 29: granitehybrid.ssm.conv_kernel u32 = 4
llama_model_loader: - kv 30: granitehybrid.ssm.state_size u32 = 128
llama_model_loader: - kv 31: granitehybrid.ssm.group_count u32 = 1
llama_model_loader: - kv 32: granitehybrid.ssm.inner_size u32 = 1536
llama_model_loader: - kv 33: granitehybrid.ssm.time_step_rank u32 = 48
llama_model_loader: - kv 34: granitehybrid.rope.scaling.finetuned bool = false
llama_model_loader: - kv 35: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 36: tokenizer.ggml.pre str = dbrx
llama_model_loader: - kv 37: tokenizer.ggml.tokens arr[str,100352] = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv 38: tokenizer.ggml.token_type arr[i32,100352] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 39: tokenizer.ggml.merges arr[str,100000] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv 40: tokenizer.ggml.bos_token_id u32 = 100257
llama_model_loader: - kv 41: tokenizer.ggml.eos_token_id u32 = 100257
llama_model_loader: - kv 42: tokenizer.ggml.unknown_token_id u32 = 100269
llama_model_loader: - kv 43: tokenizer.ggml.padding_token_id u32 = 100256
llama_model_loader: - kv 44: tokenizer.ggml.add_bos_token bool = false
llama_model_loader: - kv 45: tokenizer.chat_template str = {%- set tools_system_message_prefix =...
llama_model_loader: - kv 46: general.quantization_version u32 = 2
llama_model_loader: - kv 47: general.file_type u32 = 38
llama_model_loader: - type f32: 233 tensors
llama_model_loader: - type f16: 37 tensors
llama_model_loader: - type q8_0: 132 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type = MXFP4 MoE
print_info: file size = 461.84 MiB (11.38 BPW)
load: printing all EOG tokens:
load: - 100257 ('<|end_of_text|>')
load: - 100261 ('<|fim_pad|>')
load: special tokens cache size = 96
load: token to piece cache size = 0.6152 MB
print_info: arch = granitehybrid
print_info: vocab_only = 0
print_info: n_ctx_train = 1048576
print_info: n_embd = 768
print_info: n_embd_inp = 768
print_info: n_layer = 32
print_info: n_head = 12
print_info: n_head_kv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]
print_info: n_rot = 64
print_info: n_swa = 0
print_info: is_swa_any = 0
print_info: n_embd_head_k = 64
print_info: n_embd_head_v = 64
print_info: n_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]
print_info: n_embd_k_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
print_info: n_embd_v_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
print_info: f_norm_eps = 0.0e+00
print_info: f_norm_rms_eps = 1.0e-05
print_info: f_clamp_kqv = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale = 3.0e+00
print_info: f_attn_scale = 1.6e-02
print_info: n_ff = 2048
print_info: n_expert = 0
print_info: n_expert_used = 0
print_info: n_expert_groups = 0
print_info: n_group_used = 0
print_info: causal attn = 1
print_info: pooling type = 0
print_info: rope type = 0
print_info: rope scaling = linear
print_info: freq_base_train = 10000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn = 1048576
print_info: rope_finetuned = unknown
print_info: ssm_d_conv = 4
print_info: ssm_d_inner = 1536
print_info: ssm_d_state = 128
print_info: ssm_dt_rank = 48
print_info: ssm_n_group = 1
print_info: ssm_dt_b_c_rms = 0
print_info: model type = 350M
print_info: model params = 340.33 M
print_info: general.name = Granite 4.0 H 350m Unsloth
print_info: f_embedding_scale = 12.000000
print_info: f_residual_scale = 0.246000
print_info: f_attention_scale = 0.015625
print_info: n_ff_shexp = 2048
print_info: vocab type = BPE
print_info: n_vocab = 100352
print_info: n_merges = 100000
print_info: BOS token = 100257 '<|end_of_text|>'
print_info: EOS token = 100257 '<|end_of_text|>'
print_info: EOT token = 100257 '<|end_of_text|>'
print_info: UNK token = 100269 '<|unk|>'
print_info: PAD token = 100256 '<|pad|>'
print_info: LF token = 198 'Ċ'
print_info: FIM PRE token = 100258 '<|fim_prefix|>'
print_info: FIM SUF token = 100260 '<|fim_suffix|>'
print_info: FIM MID token = 100259 '<|fim_middle|>'
print_info: FIM PAD token = 100261 '<|fim_pad|>'
print_info: EOG token = 100257 '<|end_of_text|>'
print_info: EOG token = 100261 '<|fim_pad|>'
print_info: max token length = 256
load_tensors: loading model tensors, this can take a while... (mmap = true)
load_tensors: offloading 20 repeating layers to GPU
load_tensors: offloaded 20/33 layers to GPU
load_tensors: CPU_Mapped model buffer size = 265.94 MiB
load_tensors: CUDA0 model buffer size = 97.09 MiB
load_tensors: CUDA1 model buffer size = 98.83 MiB
......................................................................
llama_context: constructing llama_context
llama_context: n_seq_max = 1
llama_context: n_ctx = 2048
llama_context: n_ctx_seq = 2048
llama_context: n_batch = 2048
llama_context: n_ubatch = 512
llama_context: causal_attn = 1
llama_context: flash_attn = auto
llama_context: kv_unified = false
llama_context: freq_base = 10000.0
llama_context: freq_scale = 1
llama_context: n_ctx_seq (2048) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized
llama_context: CPU output buffer size = 0.38 MiB
llama_kv_cache: CPU KV buffer size = 2.00 MiB
llama_kv_cache: CUDA0 KV buffer size = 4.00 MiB
llama_kv_cache: CUDA1 KV buffer size = 2.00 MiB
llama_kv_cache: size = 8.00 MiB ( 2048 cells, 4 layers, 1/1 seqs), K (f16): 4.00 MiB, V (f16): 4.00 MiB
llama_memory_recurrent: CPU RS buffer size = 8.48 MiB
llama_memory_recurrent: CUDA0 RS buffer size = 6.16 MiB
llama_memory_recurrent: CUDA1 RS buffer size = 6.93 MiB
llama_memory_recurrent: size = 21.57 MiB ( 1 cells, 32 layers, 1 seqs), R (f32): 0.57 MiB, S (f32): 21.00 MiB
llama_context: Flash Attention was auto, set to enabled
llama_context: CUDA0 compute buffer size = 354.10 MiB
llama_context: CUDA1 compute buffer size = 22.39 MiB
llama_context: CUDA_Host compute buffer size = 18.34 MiB
llama_context: graph nodes = 1815
llama_context: graph splits = 182 (with bs=512), 41 (with bs=1)
common_init_from_params: added <|end_of_text|> logit bias = -inf
common_init_from_params: added <|fim_pad|> logit bias = -inf
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)

system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
perplexity: tokenizing the input ..
perplexity: tokenization took 34.345 ms
perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
perplexity: 0.52 seconds per pass - ETA 0.12 minutes
[1]8.6838,[2]9.9176,[3]9.4843,[4]9.8298,[5]9.9730,[6]10.0492,[7]10.2116,[8]9.9103,[9]9.9652,[10]9.9791,[11]10.2135,[12]10.2943,[13]10.4160,[14]10.3885,[15]10.2881,
Final estimate: PPL = 10.2881 +/- 0.23163

llama_perf_context_print: load time = 212.69 ms
llama_perf_context_print: prompt eval time = 5367.04 ms / 30720 tokens ( 0.17 ms per token, 5723.82 tokens per second)
llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
llama_perf_context_print: total time = 5658.34 ms / 30721 tokens
llama_perf_context_print: graphs reused = 0
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24107 = 20511 + ( 461 = 97 + 10 + 354) + 3134 |
llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 23372 + ( 130 = 98 + 8 + 22) + 621 |
llama_memory_breakdown_print: | - Host | 294 = 265 + 10 + 18 |
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_f16/ppl_corpus_code.txt
ADDED
The diff for this file is too large to render.
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_f16/ppl_corpus_general.txt
ADDED
The diff for this file is too large to render.
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_f16/ppl_corpus_math.txt
ADDED
The diff for this file is too large to render.
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_q6_k/llamabench.txt
ADDED
@@ -0,0 +1,11 @@
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 2 CUDA devices:
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
| granitehybrid 350M MXFP4 MoE | 318.51 MiB | 340.33 M | CUDA | 35 | pp8 | 1707.74 ± 48.03 |
| granitehybrid 350M MXFP4 MoE | 318.51 MiB | 340.33 M | CUDA | 35 | tg128 | 304.31 ± 9.14 |

build: 92bb442ad (7040)
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_q6_k/perplexity_code.txt
ADDED
@@ -0,0 +1,190 @@
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 2 CUDA devices:
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 21117 MiB free
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
llama_model_loader: loaded meta data with 48 key-value pairs and 402 tensors from /mnt/world8/AI/Models/granite-4.0-h-350m-unsloth/GGUF/MXFP4/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_q6_k.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = granitehybrid
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = Granite 4.0 H 350m Unsloth
llama_model_loader: - kv 3: general.finetune str = unsloth
llama_model_loader: - kv 4: general.basename str = granite-4.0-h
llama_model_loader: - kv 5: general.size_label str = 350M
llama_model_loader: - kv 6: general.license str = apache-2.0
llama_model_loader: - kv 7: general.base_model.count u32 = 1
llama_model_loader: - kv 8: general.base_model.0.name str = Granite 4.0 H 350m
llama_model_loader: - kv 9: general.base_model.0.organization str = Ibm Granite
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/ibm-granite/gr...
llama_model_loader: - kv 11: general.tags arr[str,3] = ["language", "unsloth", "granite-4.0"]
llama_model_loader: - kv 12: granitehybrid.block_count u32 = 32
llama_model_loader: - kv 13: granitehybrid.context_length u32 = 1048576
llama_model_loader: - kv 14: granitehybrid.embedding_length u32 = 768
llama_model_loader: - kv 15: granitehybrid.feed_forward_length u32 = 2048
llama_model_loader: - kv 16: granitehybrid.attention.head_count u32 = 12
llama_model_loader: - kv 17: granitehybrid.attention.head_count_kv arr[i32,32] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
llama_model_loader: - kv 18: granitehybrid.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 19: granitehybrid.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 20: granitehybrid.expert_count u32 = 0
llama_model_loader: - kv 21: granitehybrid.expert_used_count u32 = 0
llama_model_loader: - kv 22: granitehybrid.vocab_size u32 = 100352
llama_model_loader: - kv 23: granitehybrid.rope.dimension_count u32 = 64
llama_model_loader: - kv 24: granitehybrid.attention.scale f32 = 0.015625
llama_model_loader: - kv 25: granitehybrid.embedding_scale f32 = 12.000000
llama_model_loader: - kv 26: granitehybrid.residual_scale f32 = 0.246000
llama_model_loader: - kv 27: granitehybrid.logit_scale f32 = 3.000000
llama_model_loader: - kv 28: granitehybrid.expert_shared_feed_forward_length u32 = 2048
llama_model_loader: - kv 29: granitehybrid.ssm.conv_kernel u32 = 4
llama_model_loader: - kv 30: granitehybrid.ssm.state_size u32 = 128
llama_model_loader: - kv 31: granitehybrid.ssm.group_count u32 = 1
llama_model_loader: - kv 32: granitehybrid.ssm.inner_size u32 = 1536
llama_model_loader: - kv 33: granitehybrid.ssm.time_step_rank u32 = 48
llama_model_loader: - kv 34: granitehybrid.rope.scaling.finetuned bool = false
llama_model_loader: - kv 35: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 36: tokenizer.ggml.pre str = dbrx
llama_model_loader: - kv 37: tokenizer.ggml.tokens arr[str,100352] = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv 38: tokenizer.ggml.token_type arr[i32,100352] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 39: tokenizer.ggml.merges arr[str,100000] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv 40: tokenizer.ggml.bos_token_id u32 = 100257
llama_model_loader: - kv 41: tokenizer.ggml.eos_token_id u32 = 100257
llama_model_loader: - kv 42: tokenizer.ggml.unknown_token_id u32 = 100269
llama_model_loader: - kv 43: tokenizer.ggml.padding_token_id u32 = 100256
llama_model_loader: - kv 44: tokenizer.ggml.add_bos_token bool = false
llama_model_loader: - kv 45: tokenizer.chat_template str = {%- set tools_system_message_prefix =...
llama_model_loader: - kv 46: general.quantization_version u32 = 2
llama_model_loader: - kv 47: general.file_type u32 = 38
llama_model_loader: - type f32: 233 tensors
llama_model_loader: - type f16: 4 tensors
llama_model_loader: - type q8_0: 132 tensors
llama_model_loader: - type q6_K: 33 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type = MXFP4 MoE
print_info: file size = 318.51 MiB (7.85 BPW)
load: printing all EOG tokens:
load: - 100257 ('<|end_of_text|>')
load: - 100261 ('<|fim_pad|>')
load: special tokens cache size = 96
load: token to piece cache size = 0.6152 MB
print_info: arch = granitehybrid
print_info: vocab_only = 0
print_info: n_ctx_train = 1048576
print_info: n_embd = 768
print_info: n_embd_inp = 768
print_info: n_layer = 32
print_info: n_head = 12
print_info: n_head_kv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]
print_info: n_rot = 64
print_info: n_swa = 0
print_info: is_swa_any = 0
print_info: n_embd_head_k = 64
print_info: n_embd_head_v = 64
print_info: n_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]
print_info: n_embd_k_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
print_info: n_embd_v_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
print_info: f_norm_eps = 0.0e+00
print_info: f_norm_rms_eps = 1.0e-05
print_info: f_clamp_kqv = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale = 3.0e+00
print_info: f_attn_scale = 1.6e-02
print_info: n_ff = 2048
print_info: n_expert = 0
print_info: n_expert_used = 0
print_info: n_expert_groups = 0
print_info: n_group_used = 0
print_info: causal attn = 1
print_info: pooling type = 0
print_info: rope type = 0
print_info: rope scaling = linear
print_info: freq_base_train = 10000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn = 1048576
print_info: rope_finetuned = unknown
print_info: ssm_d_conv = 4
print_info: ssm_d_inner = 1536
print_info: ssm_d_state = 128
print_info: ssm_dt_rank = 48
print_info: ssm_n_group = 1
print_info: ssm_dt_b_c_rms = 0
print_info: model type = 350M
print_info: model params = 340.33 M
print_info: general.name = Granite 4.0 H 350m Unsloth
print_info: f_embedding_scale = 12.000000
print_info: f_residual_scale = 0.246000
print_info: f_attention_scale = 0.015625
print_info: n_ff_shexp = 2048
print_info: vocab type = BPE
print_info: n_vocab = 100352
print_info: n_merges = 100000
print_info: BOS token = 100257 '<|end_of_text|>'
|
| 123 |
+
print_info: EOS token = 100257 '<|end_of_text|>'
|
| 124 |
+
print_info: EOT token = 100257 '<|end_of_text|>'
|
| 125 |
+
print_info: UNK token = 100269 '<|unk|>'
|
| 126 |
+
print_info: PAD token = 100256 '<|pad|>'
|
| 127 |
+
print_info: LF token = 198 'Ċ'
|
| 128 |
+
print_info: FIM PRE token = 100258 '<|fim_prefix|>'
|
| 129 |
+
print_info: FIM SUF token = 100260 '<|fim_suffix|>'
|
| 130 |
+
print_info: FIM MID token = 100259 '<|fim_middle|>'
|
| 131 |
+
print_info: FIM PAD token = 100261 '<|fim_pad|>'
|
| 132 |
+
print_info: EOG token = 100257 '<|end_of_text|>'
|
| 133 |
+
print_info: EOG token = 100261 '<|fim_pad|>'
|
| 134 |
+
print_info: max token length = 256
|
| 135 |
+
load_tensors: loading model tensors, this can take a while... (mmap = true)
|
| 136 |
+
load_tensors: offloading 20 repeating layers to GPU
|
| 137 |
+
load_tensors: offloaded 20/33 layers to GPU
|
| 138 |
+
load_tensors: CPU_Mapped model buffer size = 158.00 MiB
|
| 139 |
+
load_tensors: CUDA0 model buffer size = 79.40 MiB
|
| 140 |
+
load_tensors: CUDA1 model buffer size = 81.14 MiB
|
| 141 |
+
...................................................................................
|
| 142 |
+
llama_context: constructing llama_context
|
| 143 |
+
llama_context: n_seq_max = 1
|
| 144 |
+
llama_context: n_ctx = 2048
|
| 145 |
+
llama_context: n_ctx_seq = 2048
|
| 146 |
+
llama_context: n_batch = 2048
|
| 147 |
+
llama_context: n_ubatch = 512
|
| 148 |
+
llama_context: causal_attn = 1
|
| 149 |
+
llama_context: flash_attn = auto
|
| 150 |
+
llama_context: kv_unified = false
|
| 151 |
+
llama_context: freq_base = 10000.0
|
| 152 |
+
llama_context: freq_scale = 1
|
| 153 |
+
llama_context: n_ctx_seq (2048) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized
|
| 154 |
+
llama_context: CPU output buffer size = 0.38 MiB
|
| 155 |
+
llama_kv_cache: CPU KV buffer size = 2.00 MiB
|
| 156 |
+
llama_kv_cache: CUDA0 KV buffer size = 4.00 MiB
|
| 157 |
+
llama_kv_cache: CUDA1 KV buffer size = 2.00 MiB
|
| 158 |
+
llama_kv_cache: size = 8.00 MiB ( 2048 cells, 4 layers, 1/1 seqs), K (f16): 4.00 MiB, V (f16): 4.00 MiB
|
| 159 |
+
llama_memory_recurrent: CPU RS buffer size = 8.48 MiB
|
| 160 |
+
llama_memory_recurrent: CUDA0 RS buffer size = 6.16 MiB
|
| 161 |
+
llama_memory_recurrent: CUDA1 RS buffer size = 6.93 MiB
|
| 162 |
+
llama_memory_recurrent: size = 21.57 MiB ( 1 cells, 32 layers, 1 seqs), R (f32): 0.57 MiB, S (f32): 21.00 MiB
|
| 163 |
+
llama_context: Flash Attention was auto, set to enabled
|
| 164 |
+
llama_context: CUDA0 compute buffer size = 267.39 MiB
|
| 165 |
+
llama_context: CUDA1 compute buffer size = 22.39 MiB
|
| 166 |
+
llama_context: CUDA_Host compute buffer size = 18.34 MiB
|
| 167 |
+
llama_context: graph nodes = 1815
|
| 168 |
+
llama_context: graph splits = 182 (with bs=512), 41 (with bs=1)
|
| 169 |
+
common_init_from_params: added <|end_of_text|> logit bias = -inf
|
| 170 |
+
common_init_from_params: added <|fim_pad|> logit bias = -inf
|
| 171 |
+
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
|
| 172 |
+
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
|
| 173 |
+
|
| 174 |
+
system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
|
| 175 |
+
perplexity: tokenizing the input ..
|
| 176 |
+
perplexity: tokenization took 91.152 ms
|
| 177 |
+
perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
|
| 178 |
+
perplexity: 0.49 seconds per pass - ETA 0.35 minutes
|
| 179 |
+
[1]4.3517,[2]3.9745,[3]2.5678,[4]2.3702,[5]2.6105,[6]2.8548,[7]2.7066,[8]2.5143,[9]2.3099,[10]2.1407,[11]2.1229,[12]2.1494,[13]2.0606,[14]2.0404,[15]2.0818,[16]2.0161,[17]1.9908,[18]2.0095,[19]1.9697,[20]1.9343,[21]1.9013,[22]1.8865,[23]1.9166,[24]1.8900,[25]1.9092,[26]1.8770,[27]1.8636,[28]1.8555,[29]1.9013,[30]1.9180,[31]1.9167,[32]1.8922,[33]1.9157,[34]1.9080,[35]1.8892,[36]1.9207,[37]1.9271,[38]1.9251,[39]1.9467,[40]1.9440,[41]1.9366,[42]1.9605,[43]1.9690,[44]1.9581,
|
| 180 |
+
Final estimate: PPL = 1.9581 +/- 0.01754
|
| 181 |
+
|
| 182 |
+
llama_perf_context_print: load time = 204.47 ms
|
| 183 |
+
llama_perf_context_print: prompt eval time = 14784.78 ms / 90112 tokens ( 0.16 ms per token, 6094.92 tokens per second)
|
| 184 |
+
llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
|
| 185 |
+
llama_perf_context_print: total time = 15658.06 ms / 90113 tokens
|
| 186 |
+
llama_perf_context_print: graphs reused = 0
|
| 187 |
+
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
|
| 188 |
+
llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24107 = 20681 + ( 356 = 79 + 10 + 267) + 3068 |
|
| 189 |
+
llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 23390 + ( 112 = 81 + 8 + 22) + 621 |
|
| 190 |
+
llama_memory_breakdown_print: | - Host | 186 = 157 + 10 + 18 |
|
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_q6_k/perplexity_general.txt
ADDED
|
@@ -0,0 +1,190 @@
| 1 |
+
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
| 2 |
+
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
|
| 3 |
+
ggml_cuda_init: found 2 CUDA devices:
|
| 4 |
+
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 5 |
+
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 6 |
+
build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
|
| 7 |
+
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 21119 MiB free
|
| 8 |
+
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
|
| 9 |
+
llama_model_loader: loaded meta data with 48 key-value pairs and 402 tensors from /mnt/world8/AI/Models/granite-4.0-h-350m-unsloth/GGUF/MXFP4/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_q6_k.gguf (version GGUF V3 (latest))
|
| 10 |
+
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
|
| 11 |
+
llama_model_loader: - kv 0: general.architecture str = granitehybrid
|
| 12 |
+
llama_model_loader: - kv 1: general.type str = model
|
| 13 |
+
llama_model_loader: - kv 2: general.name str = Granite 4.0 H 350m Unsloth
|
| 14 |
+
llama_model_loader: - kv 3: general.finetune str = unsloth
|
| 15 |
+
llama_model_loader: - kv 4: general.basename str = granite-4.0-h
|
| 16 |
+
llama_model_loader: - kv 5: general.size_label str = 350M
|
| 17 |
+
llama_model_loader: - kv 6: general.license str = apache-2.0
|
| 18 |
+
llama_model_loader: - kv 7: general.base_model.count u32 = 1
|
| 19 |
+
llama_model_loader: - kv 8: general.base_model.0.name str = Granite 4.0 H 350m
|
| 20 |
+
llama_model_loader: - kv 9: general.base_model.0.organization str = Ibm Granite
|
| 21 |
+
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/ibm-granite/gr...
|
| 22 |
+
llama_model_loader: - kv 11: general.tags arr[str,3] = ["language", "unsloth", "granite-4.0"]
|
| 23 |
+
llama_model_loader: - kv 12: granitehybrid.block_count u32 = 32
|
| 24 |
+
llama_model_loader: - kv 13: granitehybrid.context_length u32 = 1048576
|
| 25 |
+
llama_model_loader: - kv 14: granitehybrid.embedding_length u32 = 768
|
| 26 |
+
llama_model_loader: - kv 15: granitehybrid.feed_forward_length u32 = 2048
|
| 27 |
+
llama_model_loader: - kv 16: granitehybrid.attention.head_count u32 = 12
|
| 28 |
+
llama_model_loader: - kv 17: granitehybrid.attention.head_count_kv arr[i32,32] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
|
| 29 |
+
llama_model_loader: - kv 18: granitehybrid.rope.freq_base f32 = 10000.000000
|
| 30 |
+
llama_model_loader: - kv 19: granitehybrid.attention.layer_norm_rms_epsilon f32 = 0.000010
|
| 31 |
+
llama_model_loader: - kv 20: granitehybrid.expert_count u32 = 0
|
| 32 |
+
llama_model_loader: - kv 21: granitehybrid.expert_used_count u32 = 0
|
| 33 |
+
llama_model_loader: - kv 22: granitehybrid.vocab_size u32 = 100352
|
| 34 |
+
llama_model_loader: - kv 23: granitehybrid.rope.dimension_count u32 = 64
|
| 35 |
+
llama_model_loader: - kv 24: granitehybrid.attention.scale f32 = 0.015625
|
| 36 |
+
llama_model_loader: - kv 25: granitehybrid.embedding_scale f32 = 12.000000
|
| 37 |
+
llama_model_loader: - kv 26: granitehybrid.residual_scale f32 = 0.246000
|
| 38 |
+
llama_model_loader: - kv 27: granitehybrid.logit_scale f32 = 3.000000
|
| 39 |
+
llama_model_loader: - kv 28: granitehybrid.expert_shared_feed_forward_length u32 = 2048
|
| 40 |
+
llama_model_loader: - kv 29: granitehybrid.ssm.conv_kernel u32 = 4
|
| 41 |
+
llama_model_loader: - kv 30: granitehybrid.ssm.state_size u32 = 128
|
| 42 |
+
llama_model_loader: - kv 31: granitehybrid.ssm.group_count u32 = 1
|
| 43 |
+
llama_model_loader: - kv 32: granitehybrid.ssm.inner_size u32 = 1536
|
| 44 |
+
llama_model_loader: - kv 33: granitehybrid.ssm.time_step_rank u32 = 48
|
| 45 |
+
llama_model_loader: - kv 34: granitehybrid.rope.scaling.finetuned bool = false
|
| 46 |
+
llama_model_loader: - kv 35: tokenizer.ggml.model str = gpt2
|
| 47 |
+
llama_model_loader: - kv 36: tokenizer.ggml.pre str = dbrx
|
| 48 |
+
llama_model_loader: - kv 37: tokenizer.ggml.tokens arr[str,100352] = ["!", "\"", "#", "$", "%", "&", "'", ...
|
| 49 |
+
llama_model_loader: - kv 38: tokenizer.ggml.token_type arr[i32,100352] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
|
| 50 |
+
llama_model_loader: - kv 39: tokenizer.ggml.merges arr[str,100000] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
|
| 51 |
+
llama_model_loader: - kv 40: tokenizer.ggml.bos_token_id u32 = 100257
|
| 52 |
+
llama_model_loader: - kv 41: tokenizer.ggml.eos_token_id u32 = 100257
|
| 53 |
+
llama_model_loader: - kv 42: tokenizer.ggml.unknown_token_id u32 = 100269
|
| 54 |
+
llama_model_loader: - kv 43: tokenizer.ggml.padding_token_id u32 = 100256
|
| 55 |
+
llama_model_loader: - kv 44: tokenizer.ggml.add_bos_token bool = false
|
| 56 |
+
llama_model_loader: - kv 45: tokenizer.chat_template str = {%- set tools_system_message_prefix =...
|
| 57 |
+
llama_model_loader: - kv 46: general.quantization_version u32 = 2
|
| 58 |
+
llama_model_loader: - kv 47: general.file_type u32 = 38
|
| 59 |
+
llama_model_loader: - type f32: 233 tensors
|
| 60 |
+
llama_model_loader: - type f16: 4 tensors
|
| 61 |
+
llama_model_loader: - type q8_0: 132 tensors
|
| 62 |
+
llama_model_loader: - type q6_K: 33 tensors
|
| 63 |
+
print_info: file format = GGUF V3 (latest)
|
| 64 |
+
print_info: file type = MXFP4 MoE
|
| 65 |
+
print_info: file size = 318.51 MiB (7.85 BPW)
|
| 66 |
+
load: printing all EOG tokens:
|
| 67 |
+
load: - 100257 ('<|end_of_text|>')
|
| 68 |
+
load: - 100261 ('<|fim_pad|>')
|
| 69 |
+
load: special tokens cache size = 96
|
| 70 |
+
load: token to piece cache size = 0.6152 MB
|
| 71 |
+
print_info: arch = granitehybrid
|
| 72 |
+
print_info: vocab_only = 0
|
| 73 |
+
print_info: n_ctx_train = 1048576
|
| 74 |
+
print_info: n_embd = 768
|
| 75 |
+
print_info: n_embd_inp = 768
|
| 76 |
+
print_info: n_layer = 32
|
| 77 |
+
print_info: n_head = 12
|
| 78 |
+
print_info: n_head_kv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]
|
| 79 |
+
print_info: n_rot = 64
|
| 80 |
+
print_info: n_swa = 0
|
| 81 |
+
print_info: is_swa_any = 0
|
| 82 |
+
print_info: n_embd_head_k = 64
|
| 83 |
+
print_info: n_embd_head_v = 64
|
| 84 |
+
print_info: n_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]
|
| 85 |
+
print_info: n_embd_k_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 86 |
+
print_info: n_embd_v_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 87 |
+
print_info: f_norm_eps = 0.0e+00
|
| 88 |
+
print_info: f_norm_rms_eps = 1.0e-05
|
| 89 |
+
print_info: f_clamp_kqv = 0.0e+00
|
| 90 |
+
print_info: f_max_alibi_bias = 0.0e+00
|
| 91 |
+
print_info: f_logit_scale = 3.0e+00
|
| 92 |
+
print_info: f_attn_scale = 1.6e-02
|
| 93 |
+
print_info: n_ff = 2048
|
| 94 |
+
print_info: n_expert = 0
|
| 95 |
+
print_info: n_expert_used = 0
|
| 96 |
+
print_info: n_expert_groups = 0
|
| 97 |
+
print_info: n_group_used = 0
|
| 98 |
+
print_info: causal attn = 1
|
| 99 |
+
print_info: pooling type = 0
|
| 100 |
+
print_info: rope type = 0
|
| 101 |
+
print_info: rope scaling = linear
|
| 102 |
+
print_info: freq_base_train = 10000.0
|
| 103 |
+
print_info: freq_scale_train = 1
|
| 104 |
+
print_info: n_ctx_orig_yarn = 1048576
|
| 105 |
+
print_info: rope_finetuned = unknown
|
| 106 |
+
print_info: ssm_d_conv = 4
|
| 107 |
+
print_info: ssm_d_inner = 1536
|
| 108 |
+
print_info: ssm_d_state = 128
|
| 109 |
+
print_info: ssm_dt_rank = 48
|
| 110 |
+
print_info: ssm_n_group = 1
|
| 111 |
+
print_info: ssm_dt_b_c_rms = 0
|
| 112 |
+
print_info: model type = 350M
|
| 113 |
+
print_info: model params = 340.33 M
|
| 114 |
+
print_info: general.name = Granite 4.0 H 350m Unsloth
|
| 115 |
+
print_info: f_embedding_scale = 12.000000
|
| 116 |
+
print_info: f_residual_scale = 0.246000
|
| 117 |
+
print_info: f_attention_scale = 0.015625
|
| 118 |
+
print_info: n_ff_shexp = 2048
|
| 119 |
+
print_info: vocab type = BPE
|
| 120 |
+
print_info: n_vocab = 100352
|
| 121 |
+
print_info: n_merges = 100000
|
| 122 |
+
print_info: BOS token = 100257 '<|end_of_text|>'
|
| 123 |
+
print_info: EOS token = 100257 '<|end_of_text|>'
|
| 124 |
+
print_info: EOT token = 100257 '<|end_of_text|>'
|
| 125 |
+
print_info: UNK token = 100269 '<|unk|>'
|
| 126 |
+
print_info: PAD token = 100256 '<|pad|>'
|
| 127 |
+
print_info: LF token = 198 'Ċ'
|
| 128 |
+
print_info: FIM PRE token = 100258 '<|fim_prefix|>'
|
| 129 |
+
print_info: FIM SUF token = 100260 '<|fim_suffix|>'
|
| 130 |
+
print_info: FIM MID token = 100259 '<|fim_middle|>'
|
| 131 |
+
print_info: FIM PAD token = 100261 '<|fim_pad|>'
|
| 132 |
+
print_info: EOG token = 100257 '<|end_of_text|>'
|
| 133 |
+
print_info: EOG token = 100261 '<|fim_pad|>'
|
| 134 |
+
print_info: max token length = 256
|
| 135 |
+
load_tensors: loading model tensors, this can take a while... (mmap = true)
|
| 136 |
+
load_tensors: offloading 20 repeating layers to GPU
|
| 137 |
+
load_tensors: offloaded 20/33 layers to GPU
|
| 138 |
+
load_tensors: CPU_Mapped model buffer size = 158.00 MiB
|
| 139 |
+
load_tensors: CUDA0 model buffer size = 79.40 MiB
|
| 140 |
+
load_tensors: CUDA1 model buffer size = 81.14 MiB
|
| 141 |
+
...................................................................................
|
| 142 |
+
llama_context: constructing llama_context
|
| 143 |
+
llama_context: n_seq_max = 1
|
| 144 |
+
llama_context: n_ctx = 2048
|
| 145 |
+
llama_context: n_ctx_seq = 2048
|
| 146 |
+
llama_context: n_batch = 2048
|
| 147 |
+
llama_context: n_ubatch = 512
|
| 148 |
+
llama_context: causal_attn = 1
|
| 149 |
+
llama_context: flash_attn = auto
|
| 150 |
+
llama_context: kv_unified = false
|
| 151 |
+
llama_context: freq_base = 10000.0
|
| 152 |
+
llama_context: freq_scale = 1
|
| 153 |
+
llama_context: n_ctx_seq (2048) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized
|
| 154 |
+
llama_context: CPU output buffer size = 0.38 MiB
|
| 155 |
+
llama_kv_cache: CPU KV buffer size = 2.00 MiB
|
| 156 |
+
llama_kv_cache: CUDA0 KV buffer size = 4.00 MiB
|
| 157 |
+
llama_kv_cache: CUDA1 KV buffer size = 2.00 MiB
|
| 158 |
+
llama_kv_cache: size = 8.00 MiB ( 2048 cells, 4 layers, 1/1 seqs), K (f16): 4.00 MiB, V (f16): 4.00 MiB
|
| 159 |
+
llama_memory_recurrent: CPU RS buffer size = 8.48 MiB
|
| 160 |
+
llama_memory_recurrent: CUDA0 RS buffer size = 6.16 MiB
|
| 161 |
+
llama_memory_recurrent: CUDA1 RS buffer size = 6.93 MiB
|
| 162 |
+
llama_memory_recurrent: size = 21.57 MiB ( 1 cells, 32 layers, 1 seqs), R (f32): 0.57 MiB, S (f32): 21.00 MiB
|
| 163 |
+
llama_context: Flash Attention was auto, set to enabled
|
| 164 |
+
llama_context: CUDA0 compute buffer size = 267.39 MiB
|
| 165 |
+
llama_context: CUDA1 compute buffer size = 22.39 MiB
|
| 166 |
+
llama_context: CUDA_Host compute buffer size = 18.34 MiB
|
| 167 |
+
llama_context: graph nodes = 1815
|
| 168 |
+
llama_context: graph splits = 182 (with bs=512), 41 (with bs=1)
|
| 169 |
+
common_init_from_params: added <|end_of_text|> logit bias = -inf
|
| 170 |
+
common_init_from_params: added <|fim_pad|> logit bias = -inf
|
| 171 |
+
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
|
| 172 |
+
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
|
| 173 |
+
|
| 174 |
+
system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
|
| 175 |
+
perplexity: tokenizing the input ..
|
| 176 |
+
perplexity: tokenization took 42.825 ms
|
| 177 |
+
perplexity: calculating perplexity over 14 chunks, n_ctx=2048, batch_size=2048, n_seq=1
|
| 178 |
+
perplexity: 0.54 seconds per pass - ETA 0.12 minutes
|
| 179 |
+
[1]18.6512,[2]21.6381,[3]22.3486,[4]20.2410,[5]20.2498,[6]18.0768,[7]17.7092,[8]17.6486,[9]18.1639,[10]18.1415,[11]17.9674,[12]18.0778,[13]18.1444,[14]18.1862,
|
| 180 |
+
Final estimate: PPL = 18.1862 +/- 0.46855
|
| 181 |
+
|
| 182 |
+
llama_perf_context_print: load time = 206.35 ms
|
| 183 |
+
llama_perf_context_print: prompt eval time = 4768.99 ms / 28672 tokens ( 0.17 ms per token, 6012.17 tokens per second)
|
| 184 |
+
llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
|
| 185 |
+
llama_perf_context_print: total time = 5039.14 ms / 28673 tokens
|
| 186 |
+
llama_perf_context_print: graphs reused = 0
|
| 187 |
+
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
|
| 188 |
+
llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24107 = 20681 + ( 356 = 79 + 10 + 267) + 3068 |
|
| 189 |
+
llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 23390 + ( 112 = 81 + 8 + 22) + 621 |
|
| 190 |
+
llama_memory_breakdown_print: | - Host | 186 = 157 + 10 + 18 |
|
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_q6_k/perplexity_math.txt
ADDED
|
@@ -0,0 +1,190 @@
| 1 |
+
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
| 2 |
+
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
|
| 3 |
+
ggml_cuda_init: found 2 CUDA devices:
|
| 4 |
+
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 5 |
+
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 6 |
+
build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
|
| 7 |
+
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 21117 MiB free
|
| 8 |
+
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
|
| 9 |
+
llama_model_loader: loaded meta data with 48 key-value pairs and 402 tensors from /mnt/world8/AI/Models/granite-4.0-h-350m-unsloth/GGUF/MXFP4/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_q6_k.gguf (version GGUF V3 (latest))
|
| 10 |
+
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
|
| 11 |
+
llama_model_loader: - kv 0: general.architecture str = granitehybrid
|
| 12 |
+
llama_model_loader: - kv 1: general.type str = model
|
| 13 |
+
llama_model_loader: - kv 2: general.name str = Granite 4.0 H 350m Unsloth
|
| 14 |
+
llama_model_loader: - kv 3: general.finetune str = unsloth
|
| 15 |
+
llama_model_loader: - kv 4: general.basename str = granite-4.0-h
|
| 16 |
+
llama_model_loader: - kv 5: general.size_label str = 350M
|
| 17 |
+
llama_model_loader: - kv 6: general.license str = apache-2.0
|
| 18 |
+
llama_model_loader: - kv 7: general.base_model.count u32 = 1
|
| 19 |
+
llama_model_loader: - kv 8: general.base_model.0.name str = Granite 4.0 H 350m
|
| 20 |
+
llama_model_loader: - kv 9: general.base_model.0.organization str = Ibm Granite
|
| 21 |
+
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/ibm-granite/gr...
|
| 22 |
+
llama_model_loader: - kv 11: general.tags arr[str,3] = ["language", "unsloth", "granite-4.0"]
|
| 23 |
+
llama_model_loader: - kv 12: granitehybrid.block_count u32 = 32
|
| 24 |
+
llama_model_loader: - kv 13: granitehybrid.context_length u32 = 1048576
|
| 25 |
+
llama_model_loader: - kv 14: granitehybrid.embedding_length u32 = 768
|
| 26 |
+
llama_model_loader: - kv 15: granitehybrid.feed_forward_length u32 = 2048
|
| 27 |
+
llama_model_loader: - kv 16: granitehybrid.attention.head_count u32 = 12
|
| 28 |
+
llama_model_loader: - kv 17: granitehybrid.attention.head_count_kv arr[i32,32] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
|
| 29 |
+
llama_model_loader: - kv 18: granitehybrid.rope.freq_base f32 = 10000.000000
|
| 30 |
+
llama_model_loader: - kv 19: granitehybrid.attention.layer_norm_rms_epsilon f32 = 0.000010
|
| 31 |
+
llama_model_loader: - kv 20: granitehybrid.expert_count u32 = 0
|
| 32 |
+
llama_model_loader: - kv 21: granitehybrid.expert_used_count u32 = 0
|
| 33 |
+
llama_model_loader: - kv 22: granitehybrid.vocab_size u32 = 100352
|
| 34 |
+
llama_model_loader: - kv 23: granitehybrid.rope.dimension_count u32 = 64
|
| 35 |
+
llama_model_loader: - kv 24: granitehybrid.attention.scale f32 = 0.015625
|
| 36 |
+
llama_model_loader: - kv 25: granitehybrid.embedding_scale f32 = 12.000000
|
| 37 |
+
llama_model_loader: - kv 26: granitehybrid.residual_scale f32 = 0.246000
|
| 38 |
+
llama_model_loader: - kv 27: granitehybrid.logit_scale f32 = 3.000000
|
| 39 |
+
llama_model_loader: - kv 28: granitehybrid.expert_shared_feed_forward_length u32 = 2048
|
| 40 |
+
llama_model_loader: - kv 29: granitehybrid.ssm.conv_kernel u32 = 4
|
| 41 |
+
llama_model_loader: - kv 30: granitehybrid.ssm.state_size u32 = 128
|
| 42 |
+
llama_model_loader: - kv 31: granitehybrid.ssm.group_count u32 = 1
|
| 43 |
+
llama_model_loader: - kv 32: granitehybrid.ssm.inner_size u32 = 1536
|
| 44 |
+
llama_model_loader: - kv 33: granitehybrid.ssm.time_step_rank u32 = 48
|
| 45 |
+
llama_model_loader: - kv 34: granitehybrid.rope.scaling.finetuned bool = false
|
| 46 |
+
llama_model_loader: - kv 35: tokenizer.ggml.model str = gpt2
|
| 47 |
+
llama_model_loader: - kv 36: tokenizer.ggml.pre str = dbrx
|
| 48 |
+
llama_model_loader: - kv 37: tokenizer.ggml.tokens arr[str,100352] = ["!", "\"", "#", "$", "%", "&", "'", ...
|
| 49 |
+
llama_model_loader: - kv 38: tokenizer.ggml.token_type arr[i32,100352] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
|
| 50 |
+
llama_model_loader: - kv 39: tokenizer.ggml.merges arr[str,100000] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
|
| 51 |
+
llama_model_loader: - kv 40: tokenizer.ggml.bos_token_id u32 = 100257
|
| 52 |
+
llama_model_loader: - kv 41: tokenizer.ggml.eos_token_id u32 = 100257
|
| 53 |
+
llama_model_loader: - kv 42: tokenizer.ggml.unknown_token_id u32 = 100269
|
| 54 |
+
llama_model_loader: - kv 43: tokenizer.ggml.padding_token_id u32 = 100256
|
| 55 |
+
llama_model_loader: - kv 44: tokenizer.ggml.add_bos_token bool = false
|
| 56 |
+
llama_model_loader: - kv 45: tokenizer.chat_template str = {%- set tools_system_message_prefix =...
|
| 57 |
+
llama_model_loader: - kv 46: general.quantization_version u32 = 2
|
| 58 |
+
llama_model_loader: - kv 47: general.file_type u32 = 38
|
| 59 |
+
llama_model_loader: - type f32: 233 tensors
|
| 60 |
+
llama_model_loader: - type f16: 4 tensors
|
| 61 |
+
llama_model_loader: - type q8_0: 132 tensors
|
| 62 |
+
llama_model_loader: - type q6_K: 33 tensors
|
| 63 |
+
print_info: file format = GGUF V3 (latest)
|
| 64 |
+
print_info: file type = MXFP4 MoE
|
| 65 |
+
print_info: file size = 318.51 MiB (7.85 BPW)
|
| 66 |
+
load: printing all EOG tokens:
|
| 67 |
+
load: - 100257 ('<|end_of_text|>')
|
| 68 |
+
load: - 100261 ('<|fim_pad|>')
|
| 69 |
+
load: special tokens cache size = 96
|
| 70 |
+
load: token to piece cache size = 0.6152 MB
|
| 71 |
+
print_info: arch = granitehybrid
|
| 72 |
+
print_info: vocab_only = 0
|
| 73 |
+
print_info: n_ctx_train = 1048576
|
| 74 |
+
print_info: n_embd = 768
|
| 75 |
+
print_info: n_embd_inp = 768
|
| 76 |
+
print_info: n_layer = 32
|
| 77 |
+
print_info: n_head = 12
|
| 78 |
+
print_info: n_head_kv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]
|
| 79 |
+
print_info: n_rot = 64
|
| 80 |
+
print_info: n_swa = 0
|
| 81 |
+
print_info: is_swa_any = 0
|
| 82 |
+
print_info: n_embd_head_k = 64
|
| 83 |
+
print_info: n_embd_head_v = 64
|
| 84 |
+
print_info: n_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]
|
| 85 |
+
print_info: n_embd_k_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 86 |
+
print_info: n_embd_v_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 87 |
+
print_info: f_norm_eps = 0.0e+00
|
| 88 |
+
print_info: f_norm_rms_eps = 1.0e-05
|
| 89 |
+
print_info: f_clamp_kqv = 0.0e+00
|
| 90 |
+
print_info: f_max_alibi_bias = 0.0e+00
|
| 91 |
+
print_info: f_logit_scale = 3.0e+00
|
| 92 |
+
print_info: f_attn_scale = 1.6e-02
|
| 93 |
+
print_info: n_ff = 2048
|
| 94 |
+
print_info: n_expert = 0
|
| 95 |
+
print_info: n_expert_used = 0
|
| 96 |
+
print_info: n_expert_groups = 0
|
| 97 |
+
print_info: n_group_used = 0
|
| 98 |
+
print_info: causal attn = 1
|
| 99 |
+
print_info: pooling type = 0
|
| 100 |
+
print_info: rope type = 0
|
| 101 |
+
print_info: rope scaling = linear
|
| 102 |
+
print_info: freq_base_train = 10000.0
|
| 103 |
+
print_info: freq_scale_train = 1
|
| 104 |
+
print_info: n_ctx_orig_yarn = 1048576
|
| 105 |
+
print_info: rope_finetuned = unknown
|
| 106 |
+
print_info: ssm_d_conv = 4
|
| 107 |
+
print_info: ssm_d_inner = 1536
|
| 108 |
+
print_info: ssm_d_state = 128
|
| 109 |
+
print_info: ssm_dt_rank = 48
|
| 110 |
+
print_info: ssm_n_group = 1
|
| 111 |
+
print_info: ssm_dt_b_c_rms = 0
|
| 112 |
+
print_info: model type = 350M
|
| 113 |
+
print_info: model params = 340.33 M
|
| 114 |
+
print_info: general.name = Granite 4.0 H 350m Unsloth
|
| 115 |
+
print_info: f_embedding_scale = 12.000000
|
| 116 |
+
print_info: f_residual_scale = 0.246000
|
| 117 |
+
print_info: f_attention_scale = 0.015625
|
| 118 |
+
print_info: n_ff_shexp = 2048
|
| 119 |
+
print_info: vocab type = BPE
|
| 120 |
+
print_info: n_vocab = 100352
|
| 121 |
+
print_info: n_merges = 100000
|
| 122 |
+
print_info: BOS token = 100257 '<|end_of_text|>'
|
| 123 |
+
print_info: EOS token = 100257 '<|end_of_text|>'
|
| 124 |
+
print_info: EOT token = 100257 '<|end_of_text|>'
|
| 125 |
+
print_info: UNK token = 100269 '<|unk|>'
|
| 126 |
+
print_info: PAD token = 100256 '<|pad|>'
|
| 127 |
+
print_info: LF token = 198 'Ċ'
|
| 128 |
+
print_info: FIM PRE token = 100258 '<|fim_prefix|>'
|
| 129 |
+
print_info: FIM SUF token = 100260 '<|fim_suffix|>'
|
| 130 |
+
print_info: FIM MID token = 100259 '<|fim_middle|>'
|
| 131 |
+
print_info: FIM PAD token = 100261 '<|fim_pad|>'
|
| 132 |
+
print_info: EOG token = 100257 '<|end_of_text|>'
|
| 133 |
+
print_info: EOG token = 100261 '<|fim_pad|>'
|
| 134 |
+
print_info: max token length = 256
|
| 135 |
+
load_tensors: loading model tensors, this can take a while... (mmap = true)
|
| 136 |
+
load_tensors: offloading 20 repeating layers to GPU
|
| 137 |
+
load_tensors: offloaded 20/33 layers to GPU
|
| 138 |
+
load_tensors: CPU_Mapped model buffer size = 158.00 MiB
|
| 139 |
+
load_tensors: CUDA0 model buffer size = 79.40 MiB
|
| 140 |
+
load_tensors: CUDA1 model buffer size = 81.14 MiB
|
| 141 |
+
...................................................................................
|
| 142 |
+
llama_context: constructing llama_context
|
| 143 |
+
llama_context: n_seq_max = 1
|
| 144 |
+
llama_context: n_ctx = 2048
|
| 145 |
+
llama_context: n_ctx_seq = 2048
|
| 146 |
+
llama_context: n_batch = 2048
|
| 147 |
+
llama_context: n_ubatch = 512
|
| 148 |
+
llama_context: causal_attn = 1
|
| 149 |
+
llama_context: flash_attn = auto
|
| 150 |
+
llama_context: kv_unified = false
|
| 151 |
+
llama_context: freq_base = 10000.0
|
| 152 |
+
llama_context: freq_scale = 1
|
| 153 |
+
llama_context: n_ctx_seq (2048) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized
|
| 154 |
+
llama_context: CPU output buffer size = 0.38 MiB
|
| 155 |
+
llama_kv_cache: CPU KV buffer size = 2.00 MiB
|
| 156 |
+
llama_kv_cache: CUDA0 KV buffer size = 4.00 MiB
|
| 157 |
+
llama_kv_cache: CUDA1 KV buffer size = 2.00 MiB
|
| 158 |
+
llama_kv_cache: size = 8.00 MiB ( 2048 cells, 4 layers, 1/1 seqs), K (f16): 4.00 MiB, V (f16): 4.00 MiB
|
| 159 |
+
llama_memory_recurrent: CPU RS buffer size = 8.48 MiB
|
| 160 |
+
llama_memory_recurrent: CUDA0 RS buffer size = 6.16 MiB
|
| 161 |
+
llama_memory_recurrent: CUDA1 RS buffer size = 6.93 MiB
|
| 162 |
+
llama_memory_recurrent: size = 21.57 MiB ( 1 cells, 32 layers, 1 seqs), R (f32): 0.57 MiB, S (f32): 21.00 MiB
|
| 163 |
+
llama_context: Flash Attention was auto, set to enabled
|
| 164 |
+
llama_context: CUDA0 compute buffer size = 267.39 MiB
|
| 165 |
+
llama_context: CUDA1 compute buffer size = 22.39 MiB
|
| 166 |
+
llama_context: CUDA_Host compute buffer size = 18.34 MiB
|
| 167 |
+
llama_context: graph nodes = 1815
|
| 168 |
+
llama_context: graph splits = 182 (with bs=512), 41 (with bs=1)
|
| 169 |
+
common_init_from_params: added <|end_of_text|> logit bias = -inf
|
| 170 |
+
common_init_from_params: added <|fim_pad|> logit bias = -inf
|
| 171 |
+
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
|
| 172 |
+
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
|
| 173 |
+
|
| 174 |
+
system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
|
| 175 |
+
perplexity: tokenizing the input ..
|
| 176 |
+
perplexity: tokenization took 36.171 ms
|
| 177 |
+
perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
|
| 178 |
+
perplexity: 0.54 seconds per pass - ETA 0.13 minutes
|
| 179 |
+
[1]8.7655,[2]9.9228,[3]9.4725,[4]9.7974,[5]9.9635,[6]10.0535,[7]10.2088,[8]9.9054,[9]9.9607,[10]9.9652,[11]10.1962,[12]10.2815,[13]10.4015,[14]10.3779,[15]10.2794,
|
| 180 |
+
Final estimate: PPL = 10.2794 +/- 0.23142
|
| 181 |
+
|
| 182 |
+
llama_perf_context_print: load time = 256.52 ms
|
| 183 |
+
llama_perf_context_print: prompt eval time = 5354.18 ms / 30720 tokens ( 0.17 ms per token, 5737.58 tokens per second)
|
| 184 |
+
llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
|
| 185 |
+
llama_perf_context_print: total time = 5661.01 ms / 30721 tokens
|
| 186 |
+
llama_perf_context_print: graphs reused = 0
|
| 187 |
+
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
|
| 188 |
+
llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24107 = 20681 + ( 356 = 79 + 10 + 267) + 3068 |
|
| 189 |
+
llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 23390 + ( 112 = 81 + 8 + 22) + 621 |
|
| 190 |
+
llama_memory_breakdown_print: | - Host | 186 = 157 + 10 + 18 |
|
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_q6_k/ppl_corpus_code.txt
ADDED
The diff for this file is too large to render. See raw diff
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_q6_k/ppl_corpus_general.txt
ADDED
The diff for this file is too large to render. See raw diff
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_f16-router_gate_emb_q6_k/ppl_corpus_math.txt
ADDED
The diff for this file is too large to render. See raw diff
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-embd_f16/llamabench.txt
ADDED
|
@@ -0,0 +1,11 @@
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 2 CUDA devices:
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
| granitehybrid 350M MXFP4 MoE | 413.54 MiB | 340.33 M | CUDA | 35 | pp8 | 1658.55 ± 48.11 |
| granitehybrid 350M MXFP4 MoE | 413.54 MiB | 340.33 M | CUDA | 35 | tg128 | 288.98 ± 15.13 |

build: 92bb442ad (7040)
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-embd_f16/perplexity_code.txt
ADDED
|
@@ -0,0 +1,190 @@
| 1 |
+
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
| 2 |
+
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
|
| 3 |
+
ggml_cuda_init: found 2 CUDA devices:
|
| 4 |
+
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 5 |
+
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 6 |
+
build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
|
| 7 |
+
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 21084 MiB free
|
| 8 |
+
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
|
| 9 |
+
llama_model_loader: loaded meta data with 48 key-value pairs and 402 tensors from /mnt/world8/AI/Models/granite-4.0-h-350m-unsloth/GGUF/MXFP4/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-embd_f16.gguf (version GGUF V3 (latest))
|
| 10 |
+
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
|
| 11 |
+
llama_model_loader: - kv 0: general.architecture str = granitehybrid
|
| 12 |
+
llama_model_loader: - kv 1: general.type str = model
|
| 13 |
+
llama_model_loader: - kv 2: general.name str = Granite 4.0 H 350m Unsloth
|
| 14 |
+
llama_model_loader: - kv 3: general.finetune str = unsloth
|
| 15 |
+
llama_model_loader: - kv 4: general.basename str = granite-4.0-h
|
| 16 |
+
llama_model_loader: - kv 5: general.size_label str = 350M
|
| 17 |
+
llama_model_loader: - kv 6: general.license str = apache-2.0
|
| 18 |
+
llama_model_loader: - kv 7: general.base_model.count u32 = 1
|
| 19 |
+
llama_model_loader: - kv 8: general.base_model.0.name str = Granite 4.0 H 350m
|
| 20 |
+
llama_model_loader: - kv 9: general.base_model.0.organization str = Ibm Granite
|
| 21 |
+
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/ibm-granite/gr...
|
| 22 |
+
llama_model_loader: - kv 11: general.tags arr[str,3] = ["language", "unsloth", "granite-4.0"]
|
| 23 |
+
llama_model_loader: - kv 12: granitehybrid.block_count u32 = 32
|
| 24 |
+
llama_model_loader: - kv 13: granitehybrid.context_length u32 = 1048576
|
| 25 |
+
llama_model_loader: - kv 14: granitehybrid.embedding_length u32 = 768
|
| 26 |
+
llama_model_loader: - kv 15: granitehybrid.feed_forward_length u32 = 2048
|
| 27 |
+
llama_model_loader: - kv 16: granitehybrid.attention.head_count u32 = 12
|
| 28 |
+
llama_model_loader: - kv 17: granitehybrid.attention.head_count_kv arr[i32,32] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
|
| 29 |
+
llama_model_loader: - kv 18: granitehybrid.rope.freq_base f32 = 10000.000000
|
| 30 |
+
llama_model_loader: - kv 19: granitehybrid.attention.layer_norm_rms_epsilon f32 = 0.000010
|
| 31 |
+
llama_model_loader: - kv 20: granitehybrid.expert_count u32 = 0
|
| 32 |
+
llama_model_loader: - kv 21: granitehybrid.expert_used_count u32 = 0
|
| 33 |
+
llama_model_loader: - kv 22: granitehybrid.vocab_size u32 = 100352
|
| 34 |
+
llama_model_loader: - kv 23: granitehybrid.rope.dimension_count u32 = 64
|
| 35 |
+
llama_model_loader: - kv 24: granitehybrid.attention.scale f32 = 0.015625
|
| 36 |
+
llama_model_loader: - kv 25: granitehybrid.embedding_scale f32 = 12.000000
|
| 37 |
+
llama_model_loader: - kv 26: granitehybrid.residual_scale f32 = 0.246000
|
| 38 |
+
llama_model_loader: - kv 27: granitehybrid.logit_scale f32 = 3.000000
|
| 39 |
+
llama_model_loader: - kv 28: granitehybrid.expert_shared_feed_forward_length u32 = 2048
|
| 40 |
+
llama_model_loader: - kv 29: granitehybrid.ssm.conv_kernel u32 = 4
|
| 41 |
+
llama_model_loader: - kv 30: granitehybrid.ssm.state_size u32 = 128
|
| 42 |
+
llama_model_loader: - kv 31: granitehybrid.ssm.group_count u32 = 1
|
| 43 |
+
llama_model_loader: - kv 32: granitehybrid.ssm.inner_size u32 = 1536
|
| 44 |
+
llama_model_loader: - kv 33: granitehybrid.ssm.time_step_rank u32 = 48
|
| 45 |
+
llama_model_loader: - kv 34: granitehybrid.rope.scaling.finetuned bool = false
|
| 46 |
+
llama_model_loader: - kv 35: tokenizer.ggml.model str = gpt2
|
| 47 |
+
llama_model_loader: - kv 36: tokenizer.ggml.pre str = dbrx
|
| 48 |
+
llama_model_loader: - kv 37: tokenizer.ggml.tokens arr[str,100352] = ["!", "\"", "#", "$", "%", "&", "'", ...
|
| 49 |
+
llama_model_loader: - kv 38: tokenizer.ggml.token_type arr[i32,100352] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
|
| 50 |
+
llama_model_loader: - kv 39: tokenizer.ggml.merges arr[str,100000] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
|
| 51 |
+
llama_model_loader: - kv 40: tokenizer.ggml.bos_token_id u32 = 100257
|
| 52 |
+
llama_model_loader: - kv 41: tokenizer.ggml.eos_token_id u32 = 100257
|
| 53 |
+
llama_model_loader: - kv 42: tokenizer.ggml.unknown_token_id u32 = 100269
|
| 54 |
+
llama_model_loader: - kv 43: tokenizer.ggml.padding_token_id u32 = 100256
|
| 55 |
+
llama_model_loader: - kv 44: tokenizer.ggml.add_bos_token bool = false
|
| 56 |
+
llama_model_loader: - kv 45: tokenizer.chat_template str = {%- set tools_system_message_prefix =...
|
| 57 |
+
llama_model_loader: - kv 46: general.quantization_version u32 = 2
|
| 58 |
+
llama_model_loader: - kv 47: general.file_type u32 = 38
|
| 59 |
+
llama_model_loader: - type f32: 233 tensors
|
| 60 |
+
llama_model_loader: - type f16: 1 tensors
|
| 61 |
+
llama_model_loader: - type q8_0: 164 tensors
|
| 62 |
+
llama_model_loader: - type mxfp4: 4 tensors
|
| 63 |
+
print_info: file format = GGUF V3 (latest)
|
| 64 |
+
print_info: file type = MXFP4 MoE
|
| 65 |
+
print_info: file size = 413.54 MiB (10.19 BPW)
|
| 66 |
+
load: printing all EOG tokens:
|
| 67 |
+
load: - 100257 ('<|end_of_text|>')
|
| 68 |
+
load: - 100261 ('<|fim_pad|>')
|
| 69 |
+
load: special tokens cache size = 96
|
| 70 |
+
load: token to piece cache size = 0.6152 MB
|
| 71 |
+
print_info: arch = granitehybrid
|
| 72 |
+
print_info: vocab_only = 0
|
| 73 |
+
print_info: n_ctx_train = 1048576
|
| 74 |
+
print_info: n_embd = 768
|
| 75 |
+
print_info: n_embd_inp = 768
|
| 76 |
+
print_info: n_layer = 32
|
| 77 |
+
print_info: n_head = 12
|
| 78 |
+
print_info: n_head_kv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]
|
| 79 |
+
print_info: n_rot = 64
|
| 80 |
+
print_info: n_swa = 0
|
| 81 |
+
print_info: is_swa_any = 0
|
| 82 |
+
print_info: n_embd_head_k = 64
|
| 83 |
+
print_info: n_embd_head_v = 64
|
| 84 |
+
print_info: n_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]
|
| 85 |
+
print_info: n_embd_k_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 86 |
+
print_info: n_embd_v_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 87 |
+
print_info: f_norm_eps = 0.0e+00
|
| 88 |
+
print_info: f_norm_rms_eps = 1.0e-05
|
| 89 |
+
print_info: f_clamp_kqv = 0.0e+00
|
| 90 |
+
print_info: f_max_alibi_bias = 0.0e+00
|
| 91 |
+
print_info: f_logit_scale = 3.0e+00
|
| 92 |
+
print_info: f_attn_scale = 1.6e-02
|
| 93 |
+
print_info: n_ff = 2048
|
| 94 |
+
print_info: n_expert = 0
|
| 95 |
+
print_info: n_expert_used = 0
|
| 96 |
+
print_info: n_expert_groups = 0
|
| 97 |
+
print_info: n_group_used = 0
|
| 98 |
+
print_info: causal attn = 1
|
| 99 |
+
print_info: pooling type = 0
|
| 100 |
+
print_info: rope type = 0
|
| 101 |
+
print_info: rope scaling = linear
|
| 102 |
+
print_info: freq_base_train = 10000.0
|
| 103 |
+
print_info: freq_scale_train = 1
|
| 104 |
+
print_info: n_ctx_orig_yarn = 1048576
|
| 105 |
+
print_info: rope_finetuned = unknown
|
| 106 |
+
print_info: ssm_d_conv = 4
|
| 107 |
+
print_info: ssm_d_inner = 1536
|
| 108 |
+
print_info: ssm_d_state = 128
|
| 109 |
+
print_info: ssm_dt_rank = 48
|
| 110 |
+
print_info: ssm_n_group = 1
|
| 111 |
+
print_info: ssm_dt_b_c_rms = 0
|
| 112 |
+
print_info: model type = 350M
|
| 113 |
+
print_info: model params = 340.33 M
|
| 114 |
+
print_info: general.name = Granite 4.0 H 350m Unsloth
|
| 115 |
+
print_info: f_embedding_scale = 12.000000
|
| 116 |
+
print_info: f_residual_scale = 0.246000
|
| 117 |
+
print_info: f_attention_scale = 0.015625
|
| 118 |
+
print_info: n_ff_shexp = 2048
|
| 119 |
+
print_info: vocab type = BPE
|
| 120 |
+
print_info: n_vocab = 100352
|
| 121 |
+
print_info: n_merges = 100000
|
| 122 |
+
print_info: BOS token = 100257 '<|end_of_text|>'
|
| 123 |
+
print_info: EOS token = 100257 '<|end_of_text|>'
|
| 124 |
+
print_info: EOT token = 100257 '<|end_of_text|>'
|
| 125 |
+
print_info: UNK token = 100269 '<|unk|>'
|
| 126 |
+
print_info: PAD token = 100256 '<|pad|>'
|
| 127 |
+
print_info: LF token = 198 'Ċ'
|
| 128 |
+
print_info: FIM PRE token = 100258 '<|fim_prefix|>'
|
| 129 |
+
print_info: FIM SUF token = 100260 '<|fim_suffix|>'
|
| 130 |
+
print_info: FIM MID token = 100259 '<|fim_middle|>'
|
| 131 |
+
print_info: FIM PAD token = 100261 '<|fim_pad|>'
|
| 132 |
+
print_info: EOG token = 100257 '<|end_of_text|>'
|
| 133 |
+
print_info: EOG token = 100261 '<|fim_pad|>'
|
| 134 |
+
print_info: max token length = 256
|
| 135 |
+
load_tensors: loading model tensors, this can take a while... (mmap = true)
|
| 136 |
+
load_tensors: offloading 20 repeating layers to GPU
|
| 137 |
+
load_tensors: offloaded 20/33 layers to GPU
|
| 138 |
+
load_tensors: CPU_Mapped model buffer size = 248.24 MiB
|
| 139 |
+
load_tensors: CUDA0 model buffer size = 81.38 MiB
|
| 140 |
+
load_tensors: CUDA1 model buffer size = 83.95 MiB
|
| 141 |
+
..................................................................
|
| 142 |
+
llama_context: constructing llama_context
|
| 143 |
+
llama_context: n_seq_max = 1
|
| 144 |
+
llama_context: n_ctx = 2048
|
| 145 |
+
llama_context: n_ctx_seq = 2048
|
| 146 |
+
llama_context: n_batch = 2048
|
| 147 |
+
llama_context: n_ubatch = 512
|
| 148 |
+
llama_context: causal_attn = 1
|
| 149 |
+
llama_context: flash_attn = auto
|
| 150 |
+
llama_context: kv_unified = false
|
| 151 |
+
llama_context: freq_base = 10000.0
|
| 152 |
+
llama_context: freq_scale = 1
|
| 153 |
+
llama_context: n_ctx_seq (2048) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized
|
| 154 |
+
llama_context: CPU output buffer size = 0.38 MiB
|
| 155 |
+
llama_kv_cache: CPU KV buffer size = 2.00 MiB
|
| 156 |
+
llama_kv_cache: CUDA0 KV buffer size = 4.00 MiB
|
| 157 |
+
llama_kv_cache: CUDA1 KV buffer size = 2.00 MiB
|
| 158 |
+
llama_kv_cache: size = 8.00 MiB ( 2048 cells, 4 layers, 1/1 seqs), K (f16): 4.00 MiB, V (f16): 4.00 MiB
|
| 159 |
+
llama_memory_recurrent: CPU RS buffer size = 8.48 MiB
|
| 160 |
+
llama_memory_recurrent: CUDA0 RS buffer size = 6.16 MiB
|
| 161 |
+
llama_memory_recurrent: CUDA1 RS buffer size = 6.93 MiB
|
| 162 |
+
llama_memory_recurrent: size = 21.57 MiB ( 1 cells, 32 layers, 1 seqs), R (f32): 0.57 MiB, S (f32): 21.00 MiB
|
| 163 |
+
llama_context: Flash Attention was auto, set to enabled
|
| 164 |
+
llama_context: CUDA0 compute buffer size = 351.61 MiB
|
| 165 |
+
llama_context: CUDA1 compute buffer size = 22.39 MiB
|
| 166 |
+
llama_context: CUDA_Host compute buffer size = 18.34 MiB
|
| 167 |
+
llama_context: graph nodes = 1815
|
| 168 |
+
llama_context: graph splits = 182 (with bs=512), 41 (with bs=1)
|
| 169 |
+
common_init_from_params: added <|end_of_text|> logit bias = -inf
|
| 170 |
+
common_init_from_params: added <|fim_pad|> logit bias = -inf
|
| 171 |
+
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
|
| 172 |
+
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
|
| 173 |
+
|
| 174 |
+
system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
|
| 175 |
+
perplexity: tokenizing the input ..
|
| 176 |
+
perplexity: tokenization took 120.533 ms
|
| 177 |
+
perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
|
| 178 |
+
perplexity: 0.60 seconds per pass - ETA 0.43 minutes
|
| 179 |
+
[1]4.3491,[2]3.9791,[3]2.5701,[4]2.3717,[5]2.6057,[6]2.8516,[7]2.7012,[8]2.5110,[9]2.3072,[10]2.1384,[11]2.1205,[12]2.1469,[13]2.0591,[14]2.0382,[15]2.0788,[16]2.0125,[17]1.9881,[18]2.0059,[19]1.9660,[20]1.9309,[21]1.8984,[22]1.8838,[23]1.9129,[24]1.8869,[25]1.9057,[26]1.8738,[27]1.8612,[28]1.8528,[29]1.8986,[30]1.9151,[31]1.9141,[32]1.8898,[33]1.9134,[34]1.9055,[35]1.8869,[36]1.9188,[37]1.9250,[38]1.9233,[39]1.9446,[40]1.9422,[41]1.9352,[42]1.9591,[43]1.9679,[44]1.9572,
|
| 180 |
+
Final estimate: PPL = 1.9572 +/- 0.01750
|
| 181 |
+
|
| 182 |
+
llama_perf_context_print: load time = 223.86 ms
|
| 183 |
+
llama_perf_context_print: prompt eval time = 15323.24 ms / 90112 tokens ( 0.17 ms per token, 5880.74 tokens per second)
|
| 184 |
+
llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
|
| 185 |
+
llama_perf_context_print: total time = 16151.18 ms / 90113 tokens
|
| 186 |
+
llama_perf_context_print: graphs reused = 0
|
| 187 |
+
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
|
| 188 |
+
llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24107 = 20496 + ( 443 = 81 + 10 + 351) + 3167 |
|
| 189 |
+
llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 23400 + ( 115 = 83 + 8 + 22) + 608 |
|
| 190 |
+
llama_memory_breakdown_print: | - Host | 277 = 248 + 10 + 18 |
|
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-embd_f16/perplexity_general.txt
ADDED
|
@@ -0,0 +1,190 @@
| 1 |
+
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
| 2 |
+
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
|
| 3 |
+
ggml_cuda_init: found 2 CUDA devices:
|
| 4 |
+
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 5 |
+
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 6 |
+
build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
|
| 7 |
+
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 21073 MiB free
|
| 8 |
+
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
|
| 9 |
+
llama_model_loader: loaded meta data with 48 key-value pairs and 402 tensors from /mnt/world8/AI/Models/granite-4.0-h-350m-unsloth/GGUF/MXFP4/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-embd_f16.gguf (version GGUF V3 (latest))
|
| 10 |
+
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
|
| 11 |
+
llama_model_loader: - kv 0: general.architecture str = granitehybrid
|
| 12 |
+
llama_model_loader: - kv 1: general.type str = model
|
| 13 |
+
llama_model_loader: - kv 2: general.name str = Granite 4.0 H 350m Unsloth
|
| 14 |
+
llama_model_loader: - kv 3: general.finetune str = unsloth
|
| 15 |
+
llama_model_loader: - kv 4: general.basename str = granite-4.0-h
|
| 16 |
+
llama_model_loader: - kv 5: general.size_label str = 350M
|
| 17 |
+
llama_model_loader: - kv 6: general.license str = apache-2.0
|
| 18 |
+
llama_model_loader: - kv 7: general.base_model.count u32 = 1
|
| 19 |
+
llama_model_loader: - kv 8: general.base_model.0.name str = Granite 4.0 H 350m
|
| 20 |
+
llama_model_loader: - kv 9: general.base_model.0.organization str = Ibm Granite
|
| 21 |
+
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/ibm-granite/gr...
|
| 22 |
+
llama_model_loader: - kv 11: general.tags arr[str,3] = ["language", "unsloth", "granite-4.0"]
|
| 23 |
+
llama_model_loader: - kv 12: granitehybrid.block_count u32 = 32
|
| 24 |
+
llama_model_loader: - kv 13: granitehybrid.context_length u32 = 1048576
|
| 25 |
+
llama_model_loader: - kv 14: granitehybrid.embedding_length u32 = 768
|
| 26 |
+
llama_model_loader: - kv 15: granitehybrid.feed_forward_length u32 = 2048
|
| 27 |
+
llama_model_loader: - kv 16: granitehybrid.attention.head_count u32 = 12
|
| 28 |
+
llama_model_loader: - kv 17: granitehybrid.attention.head_count_kv arr[i32,32] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
|
| 29 |
+
llama_model_loader: - kv 18: granitehybrid.rope.freq_base f32 = 10000.000000
|
| 30 |
+
llama_model_loader: - kv 19: granitehybrid.attention.layer_norm_rms_epsilon f32 = 0.000010
|
| 31 |
+
llama_model_loader: - kv 20: granitehybrid.expert_count u32 = 0
|
| 32 |
+
llama_model_loader: - kv 21: granitehybrid.expert_used_count u32 = 0
|
| 33 |
+
llama_model_loader: - kv 22: granitehybrid.vocab_size u32 = 100352
|
| 34 |
+
llama_model_loader: - kv 23: granitehybrid.rope.dimension_count u32 = 64
|
| 35 |
+
llama_model_loader: - kv 24: granitehybrid.attention.scale f32 = 0.015625
|
| 36 |
+
llama_model_loader: - kv 25: granitehybrid.embedding_scale f32 = 12.000000
|
| 37 |
+
llama_model_loader: - kv 26: granitehybrid.residual_scale f32 = 0.246000
|
| 38 |
+
llama_model_loader: - kv 27: granitehybrid.logit_scale f32 = 3.000000
|
| 39 |
+
llama_model_loader: - kv 28: granitehybrid.expert_shared_feed_forward_length u32 = 2048
|
| 40 |
+
llama_model_loader: - kv 29: granitehybrid.ssm.conv_kernel u32 = 4
|
| 41 |
+
llama_model_loader: - kv 30: granitehybrid.ssm.state_size u32 = 128
|
| 42 |
+
llama_model_loader: - kv 31: granitehybrid.ssm.group_count u32 = 1
|
| 43 |
+
llama_model_loader: - kv 32: granitehybrid.ssm.inner_size u32 = 1536
|
| 44 |
+
llama_model_loader: - kv 33: granitehybrid.ssm.time_step_rank u32 = 48
|
| 45 |
+
llama_model_loader: - kv 34: granitehybrid.rope.scaling.finetuned bool = false
|
| 46 |
+
llama_model_loader: - kv 35: tokenizer.ggml.model str = gpt2
|
| 47 |
+
llama_model_loader: - kv 36: tokenizer.ggml.pre str = dbrx
|
| 48 |
+
llama_model_loader: - kv 37: tokenizer.ggml.tokens arr[str,100352] = ["!", "\"", "#", "$", "%", "&", "'", ...
|
| 49 |
+
llama_model_loader: - kv 38: tokenizer.ggml.token_type arr[i32,100352] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
|
| 50 |
+
llama_model_loader: - kv 39: tokenizer.ggml.merges arr[str,100000] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
|
| 51 |
+
llama_model_loader: - kv 40: tokenizer.ggml.bos_token_id u32 = 100257
|
| 52 |
+
llama_model_loader: - kv 41: tokenizer.ggml.eos_token_id u32 = 100257
|
| 53 |
+
llama_model_loader: - kv 42: tokenizer.ggml.unknown_token_id u32 = 100269
|
| 54 |
+
llama_model_loader: - kv 43: tokenizer.ggml.padding_token_id u32 = 100256
|
| 55 |
+
llama_model_loader: - kv 44: tokenizer.ggml.add_bos_token bool = false
|
| 56 |
+
llama_model_loader: - kv 45: tokenizer.chat_template str = {%- set tools_system_message_prefix =...
|
| 57 |
+
llama_model_loader: - kv 46: general.quantization_version u32 = 2
|
| 58 |
+
llama_model_loader: - kv 47: general.file_type u32 = 38
|
| 59 |
+
llama_model_loader: - type f32: 233 tensors
|
| 60 |
+
llama_model_loader: - type f16: 1 tensors
|
| 61 |
+
llama_model_loader: - type q8_0: 164 tensors
|
| 62 |
+
llama_model_loader: - type mxfp4: 4 tensors
|
| 63 |
+
print_info: file format = GGUF V3 (latest)
|
| 64 |
+
print_info: file type = MXFP4 MoE
|
| 65 |
+
print_info: file size = 413.54 MiB (10.19 BPW)
|
| 66 |
+
load: printing all EOG tokens:
|
| 67 |
+
load: - 100257 ('<|end_of_text|>')
|
| 68 |
+
load: - 100261 ('<|fim_pad|>')
|
| 69 |
+
load: special tokens cache size = 96
|
| 70 |
+
load: token to piece cache size = 0.6152 MB
|
| 71 |
+
print_info: arch = granitehybrid
|
| 72 |
+
print_info: vocab_only = 0
|
| 73 |
+
print_info: n_ctx_train = 1048576
|
| 74 |
+
print_info: n_embd = 768
|
| 75 |
+
print_info: n_embd_inp = 768
|
| 76 |
+
print_info: n_layer = 32
|
| 77 |
+
print_info: n_head = 12
|
| 78 |
+
print_info: n_head_kv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]
|
| 79 |
+
print_info: n_rot = 64
|
| 80 |
+
print_info: n_swa = 0
|
| 81 |
+
print_info: is_swa_any = 0
|
| 82 |
+
print_info: n_embd_head_k = 64
|
| 83 |
+
print_info: n_embd_head_v = 64
|
| 84 |
+
print_info: n_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]
|
| 85 |
+
print_info: n_embd_k_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 86 |
+
print_info: n_embd_v_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 87 |
+
print_info: f_norm_eps = 0.0e+00
|
| 88 |
+
print_info: f_norm_rms_eps = 1.0e-05
|
| 89 |
+
print_info: f_clamp_kqv = 0.0e+00
|
| 90 |
+
print_info: f_max_alibi_bias = 0.0e+00
|
| 91 |
+
print_info: f_logit_scale = 3.0e+00
|
| 92 |
+
print_info: f_attn_scale = 1.6e-02
|
| 93 |
+
print_info: n_ff = 2048
|
| 94 |
+
print_info: n_expert = 0
|
| 95 |
+
print_info: n_expert_used = 0
|
| 96 |
+
print_info: n_expert_groups = 0
|
| 97 |
+
print_info: n_group_used = 0
|
| 98 |
+
print_info: causal attn = 1
|
| 99 |
+
print_info: pooling type = 0
|
| 100 |
+
print_info: rope type = 0
|
| 101 |
+
print_info: rope scaling = linear
|
| 102 |
+
print_info: freq_base_train = 10000.0
|
| 103 |
+
print_info: freq_scale_train = 1
|
| 104 |
+
print_info: n_ctx_orig_yarn = 1048576
|
| 105 |
+
print_info: rope_finetuned = unknown
|
| 106 |
+
print_info: ssm_d_conv = 4
|
| 107 |
+
print_info: ssm_d_inner = 1536
|
| 108 |
+
print_info: ssm_d_state = 128
|
| 109 |
+
print_info: ssm_dt_rank = 48
|
| 110 |
+
print_info: ssm_n_group = 1
|
| 111 |
+
print_info: ssm_dt_b_c_rms = 0
|
| 112 |
+
print_info: model type = 350M
|
| 113 |
+
print_info: model params = 340.33 M
|
| 114 |
+
print_info: general.name = Granite 4.0 H 350m Unsloth
|
| 115 |
+
print_info: f_embedding_scale = 12.000000
|
| 116 |
+
print_info: f_residual_scale = 0.246000
|
| 117 |
+
print_info: f_attention_scale = 0.015625
|
| 118 |
+
print_info: n_ff_shexp = 2048
|
| 119 |
+
print_info: vocab type = BPE
|
| 120 |
+
print_info: n_vocab = 100352
|
| 121 |
+
print_info: n_merges = 100000
|
| 122 |
+
print_info: BOS token = 100257 '<|end_of_text|>'
|
| 123 |
+
print_info: EOS token = 100257 '<|end_of_text|>'
|
| 124 |
+
print_info: EOT token = 100257 '<|end_of_text|>'
|
| 125 |
+
print_info: UNK token = 100269 '<|unk|>'
|
| 126 |
+
print_info: PAD token = 100256 '<|pad|>'
|
| 127 |
+
print_info: LF token = 198 'Ċ'
|
| 128 |
+
print_info: FIM PRE token = 100258 '<|fim_prefix|>'
|
| 129 |
+
print_info: FIM SUF token = 100260 '<|fim_suffix|>'
|
| 130 |
+
print_info: FIM MID token = 100259 '<|fim_middle|>'
|
| 131 |
+
print_info: FIM PAD token = 100261 '<|fim_pad|>'
|
| 132 |
+
print_info: EOG token = 100257 '<|end_of_text|>'
|
| 133 |
+
print_info: EOG token = 100261 '<|fim_pad|>'
|
| 134 |
+
print_info: max token length = 256
|
| 135 |
+
load_tensors: loading model tensors, this can take a while... (mmap = true)
|
| 136 |
+
load_tensors: offloading 20 repeating layers to GPU
|
| 137 |
+
load_tensors: offloaded 20/33 layers to GPU
|
| 138 |
+
load_tensors: CPU_Mapped model buffer size = 248.24 MiB
|
| 139 |
+
load_tensors: CUDA0 model buffer size = 81.38 MiB
|
| 140 |
+
load_tensors: CUDA1 model buffer size = 83.95 MiB
|
| 141 |
+
..................................................................
|
| 142 |
+
llama_context: constructing llama_context
|
| 143 |
+
llama_context: n_seq_max = 1
|
| 144 |
+
llama_context: n_ctx = 2048
|
| 145 |
+
llama_context: n_ctx_seq = 2048
|
| 146 |
+
llama_context: n_batch = 2048
|
| 147 |
+
llama_context: n_ubatch = 512
|
| 148 |
+
llama_context: causal_attn = 1
|
| 149 |
+
llama_context: flash_attn = auto
|
| 150 |
+
llama_context: kv_unified = false
|
| 151 |
+
llama_context: freq_base = 10000.0
|
| 152 |
+
llama_context: freq_scale = 1
|
| 153 |
+
llama_context: n_ctx_seq (2048) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized
|
| 154 |
+
llama_context: CPU output buffer size = 0.38 MiB
|
| 155 |
+
llama_kv_cache: CPU KV buffer size = 2.00 MiB
|
| 156 |
+
llama_kv_cache: CUDA0 KV buffer size = 4.00 MiB
|
| 157 |
+
llama_kv_cache: CUDA1 KV buffer size = 2.00 MiB
|
| 158 |
+
llama_kv_cache: size = 8.00 MiB ( 2048 cells, 4 layers, 1/1 seqs), K (f16): 4.00 MiB, V (f16): 4.00 MiB
|
| 159 |
+
llama_memory_recurrent: CPU RS buffer size = 8.48 MiB
|
| 160 |
+
llama_memory_recurrent: CUDA0 RS buffer size = 6.16 MiB
|
| 161 |
+
llama_memory_recurrent: CUDA1 RS buffer size = 6.93 MiB
|
| 162 |
+
llama_memory_recurrent: size = 21.57 MiB ( 1 cells, 32 layers, 1 seqs), R (f32): 0.57 MiB, S (f32): 21.00 MiB
|
| 163 |
+
llama_context: Flash Attention was auto, set to enabled
|
| 164 |
+
llama_context: CUDA0 compute buffer size = 351.61 MiB
|
| 165 |
+
llama_context: CUDA1 compute buffer size = 22.39 MiB
|
| 166 |
+
llama_context: CUDA_Host compute buffer size = 18.34 MiB
|
| 167 |
+
llama_context: graph nodes = 1815
|
| 168 |
+
llama_context: graph splits = 182 (with bs=512), 41 (with bs=1)
|
| 169 |
+
common_init_from_params: added <|end_of_text|> logit bias = -inf
|
| 170 |
+
common_init_from_params: added <|fim_pad|> logit bias = -inf
|
| 171 |
+
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
|
| 172 |
+
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
|
| 173 |
+
|
| 174 |
+
system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
|
| 175 |
+
perplexity: tokenizing the input ..
|
| 176 |
+
perplexity: tokenization took 41.224 ms
|
| 177 |
+
perplexity: calculating perplexity over 14 chunks, n_ctx=2048, batch_size=2048, n_seq=1
|
| 178 |
+
perplexity: 0.61 seconds per pass - ETA 0.13 minutes
|
| 179 |
+
[1]18.7962,[2]21.8607,[3]22.5100,[4]20.4092,[5]20.4112,[6]18.1799,[7]17.7936,[8]17.7871,[9]18.2852,[10]18.2411,[11]18.0854,[12]18.1918,[13]18.2681,[14]18.2903,
|
| 180 |
+
Final estimate: PPL = 18.2903 +/- 0.46972
|
| 181 |
+
|
| 182 |
+
llama_perf_context_print: load time = 235.09 ms
|
| 183 |
+
llama_perf_context_print: prompt eval time = 5138.69 ms / 28672 tokens ( 0.18 ms per token, 5579.63 tokens per second)
|
| 184 |
+
llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
|
| 185 |
+
llama_perf_context_print: total time = 5469.61 ms / 28673 tokens
|
| 186 |
+
llama_perf_context_print: graphs reused = 0
|
| 187 |
+
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
|
| 188 |
+
llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24107 = 20496 + ( 443 = 81 + 10 + 351) + 3167 |
|
| 189 |
+
llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 23400 + ( 115 = 83 + 8 + 22) + 608 |
|
| 190 |
+
llama_memory_breakdown_print: | - Host | 277 = 248 + 10 + 18 |
|
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-embd_f16/perplexity_math.txt
ADDED
|
@@ -0,0 +1,190 @@
|
| 1 |
+
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
| 2 |
+
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
|
| 3 |
+
ggml_cuda_init: found 2 CUDA devices:
|
| 4 |
+
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 5 |
+
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 6 |
+
build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
|
| 7 |
+
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 21084 MiB free
|
| 8 |
+
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
|
| 9 |
+
llama_model_loader: loaded meta data with 48 key-value pairs and 402 tensors from /mnt/world8/AI/Models/granite-4.0-h-350m-unsloth/GGUF/MXFP4/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-embd_f16.gguf (version GGUF V3 (latest))
|
| 10 |
+
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
|
| 11 |
+
llama_model_loader: - kv 0: general.architecture str = granitehybrid
|
| 12 |
+
llama_model_loader: - kv 1: general.type str = model
|
| 13 |
+
llama_model_loader: - kv 2: general.name str = Granite 4.0 H 350m Unsloth
|
| 14 |
+
llama_model_loader: - kv 3: general.finetune str = unsloth
|
| 15 |
+
llama_model_loader: - kv 4: general.basename str = granite-4.0-h
|
| 16 |
+
llama_model_loader: - kv 5: general.size_label str = 350M
|
| 17 |
+
llama_model_loader: - kv 6: general.license str = apache-2.0
|
| 18 |
+
llama_model_loader: - kv 7: general.base_model.count u32 = 1
|
| 19 |
+
llama_model_loader: - kv 8: general.base_model.0.name str = Granite 4.0 H 350m
|
| 20 |
+
llama_model_loader: - kv 9: general.base_model.0.organization str = Ibm Granite
|
| 21 |
+
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/ibm-granite/gr...
|
| 22 |
+
llama_model_loader: - kv 11: general.tags arr[str,3] = ["language", "unsloth", "granite-4.0"]
|
| 23 |
+
llama_model_loader: - kv 12: granitehybrid.block_count u32 = 32
|
| 24 |
+
llama_model_loader: - kv 13: granitehybrid.context_length u32 = 1048576
|
| 25 |
+
llama_model_loader: - kv 14: granitehybrid.embedding_length u32 = 768
|
| 26 |
+
llama_model_loader: - kv 15: granitehybrid.feed_forward_length u32 = 2048
|
| 27 |
+
llama_model_loader: - kv 16: granitehybrid.attention.head_count u32 = 12
|
| 28 |
+
llama_model_loader: - kv 17: granitehybrid.attention.head_count_kv arr[i32,32] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
|
| 29 |
+
llama_model_loader: - kv 18: granitehybrid.rope.freq_base f32 = 10000.000000
|
| 30 |
+
llama_model_loader: - kv 19: granitehybrid.attention.layer_norm_rms_epsilon f32 = 0.000010
|
| 31 |
+
llama_model_loader: - kv 20: granitehybrid.expert_count u32 = 0
|
| 32 |
+
llama_model_loader: - kv 21: granitehybrid.expert_used_count u32 = 0
|
| 33 |
+
llama_model_loader: - kv 22: granitehybrid.vocab_size u32 = 100352
|
| 34 |
+
llama_model_loader: - kv 23: granitehybrid.rope.dimension_count u32 = 64
|
| 35 |
+
llama_model_loader: - kv 24: granitehybrid.attention.scale f32 = 0.015625
|
| 36 |
+
llama_model_loader: - kv 25: granitehybrid.embedding_scale f32 = 12.000000
|
| 37 |
+
llama_model_loader: - kv 26: granitehybrid.residual_scale f32 = 0.246000
|
| 38 |
+
llama_model_loader: - kv 27: granitehybrid.logit_scale f32 = 3.000000
|
| 39 |
+
llama_model_loader: - kv 28: granitehybrid.expert_shared_feed_forward_length u32 = 2048
|
| 40 |
+
llama_model_loader: - kv 29: granitehybrid.ssm.conv_kernel u32 = 4
|
| 41 |
+
llama_model_loader: - kv 30: granitehybrid.ssm.state_size u32 = 128
|
| 42 |
+
llama_model_loader: - kv 31: granitehybrid.ssm.group_count u32 = 1
|
| 43 |
+
llama_model_loader: - kv 32: granitehybrid.ssm.inner_size u32 = 1536
|
| 44 |
+
llama_model_loader: - kv 33: granitehybrid.ssm.time_step_rank u32 = 48
|
| 45 |
+
llama_model_loader: - kv 34: granitehybrid.rope.scaling.finetuned bool = false
|
| 46 |
+
llama_model_loader: - kv 35: tokenizer.ggml.model str = gpt2
|
| 47 |
+
llama_model_loader: - kv 36: tokenizer.ggml.pre str = dbrx
|
| 48 |
+
llama_model_loader: - kv 37: tokenizer.ggml.tokens arr[str,100352] = ["!", "\"", "#", "$", "%", "&", "'", ...
|
| 49 |
+
llama_model_loader: - kv 38: tokenizer.ggml.token_type arr[i32,100352] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
|
| 50 |
+
llama_model_loader: - kv 39: tokenizer.ggml.merges arr[str,100000] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
|
| 51 |
+
llama_model_loader: - kv 40: tokenizer.ggml.bos_token_id u32 = 100257
|
| 52 |
+
llama_model_loader: - kv 41: tokenizer.ggml.eos_token_id u32 = 100257
|
| 53 |
+
llama_model_loader: - kv 42: tokenizer.ggml.unknown_token_id u32 = 100269
|
| 54 |
+
llama_model_loader: - kv 43: tokenizer.ggml.padding_token_id u32 = 100256
|
| 55 |
+
llama_model_loader: - kv 44: tokenizer.ggml.add_bos_token bool = false
|
| 56 |
+
llama_model_loader: - kv 45: tokenizer.chat_template str = {%- set tools_system_message_prefix =...
|
| 57 |
+
llama_model_loader: - kv 46: general.quantization_version u32 = 2
|
| 58 |
+
llama_model_loader: - kv 47: general.file_type u32 = 38
|
| 59 |
+
llama_model_loader: - type f32: 233 tensors
|
| 60 |
+
llama_model_loader: - type f16: 1 tensors
|
| 61 |
+
llama_model_loader: - type q8_0: 164 tensors
|
| 62 |
+
llama_model_loader: - type mxfp4: 4 tensors
|
| 63 |
+
print_info: file format = GGUF V3 (latest)
|
| 64 |
+
print_info: file type = MXFP4 MoE
|
| 65 |
+
print_info: file size = 413.54 MiB (10.19 BPW)
|
| 66 |
+
load: printing all EOG tokens:
|
| 67 |
+
load: - 100257 ('<|end_of_text|>')
|
| 68 |
+
load: - 100261 ('<|fim_pad|>')
|
| 69 |
+
load: special tokens cache size = 96
|
| 70 |
+
load: token to piece cache size = 0.6152 MB
|
| 71 |
+
print_info: arch = granitehybrid
|
| 72 |
+
print_info: vocab_only = 0
|
| 73 |
+
print_info: n_ctx_train = 1048576
|
| 74 |
+
print_info: n_embd = 768
|
| 75 |
+
print_info: n_embd_inp = 768
|
| 76 |
+
print_info: n_layer = 32
|
| 77 |
+
print_info: n_head = 12
|
| 78 |
+
print_info: n_head_kv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]
|
| 79 |
+
print_info: n_rot = 64
|
| 80 |
+
print_info: n_swa = 0
|
| 81 |
+
print_info: is_swa_any = 0
|
| 82 |
+
print_info: n_embd_head_k = 64
|
| 83 |
+
print_info: n_embd_head_v = 64
|
| 84 |
+
print_info: n_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]
|
| 85 |
+
print_info: n_embd_k_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 86 |
+
print_info: n_embd_v_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 87 |
+
print_info: f_norm_eps = 0.0e+00
|
| 88 |
+
print_info: f_norm_rms_eps = 1.0e-05
|
| 89 |
+
print_info: f_clamp_kqv = 0.0e+00
|
| 90 |
+
print_info: f_max_alibi_bias = 0.0e+00
|
| 91 |
+
print_info: f_logit_scale = 3.0e+00
|
| 92 |
+
print_info: f_attn_scale = 1.6e-02
|
| 93 |
+
print_info: n_ff = 2048
|
| 94 |
+
print_info: n_expert = 0
|
| 95 |
+
print_info: n_expert_used = 0
|
| 96 |
+
print_info: n_expert_groups = 0
|
| 97 |
+
print_info: n_group_used = 0
|
| 98 |
+
print_info: causal attn = 1
|
| 99 |
+
print_info: pooling type = 0
|
| 100 |
+
print_info: rope type = 0
|
| 101 |
+
print_info: rope scaling = linear
|
| 102 |
+
print_info: freq_base_train = 10000.0
|
| 103 |
+
print_info: freq_scale_train = 1
|
| 104 |
+
print_info: n_ctx_orig_yarn = 1048576
|
| 105 |
+
print_info: rope_finetuned = unknown
|
| 106 |
+
print_info: ssm_d_conv = 4
|
| 107 |
+
print_info: ssm_d_inner = 1536
|
| 108 |
+
print_info: ssm_d_state = 128
|
| 109 |
+
print_info: ssm_dt_rank = 48
|
| 110 |
+
print_info: ssm_n_group = 1
|
| 111 |
+
print_info: ssm_dt_b_c_rms = 0
|
| 112 |
+
print_info: model type = 350M
|
| 113 |
+
print_info: model params = 340.33 M
|
| 114 |
+
print_info: general.name = Granite 4.0 H 350m Unsloth
|
| 115 |
+
print_info: f_embedding_scale = 12.000000
|
| 116 |
+
print_info: f_residual_scale = 0.246000
|
| 117 |
+
print_info: f_attention_scale = 0.015625
|
| 118 |
+
print_info: n_ff_shexp = 2048
|
| 119 |
+
print_info: vocab type = BPE
|
| 120 |
+
print_info: n_vocab = 100352
|
| 121 |
+
print_info: n_merges = 100000
|
| 122 |
+
print_info: BOS token = 100257 '<|end_of_text|>'
|
| 123 |
+
print_info: EOS token = 100257 '<|end_of_text|>'
|
| 124 |
+
print_info: EOT token = 100257 '<|end_of_text|>'
|
| 125 |
+
print_info: UNK token = 100269 '<|unk|>'
|
| 126 |
+
print_info: PAD token = 100256 '<|pad|>'
|
| 127 |
+
print_info: LF token = 198 'Ċ'
|
| 128 |
+
print_info: FIM PRE token = 100258 '<|fim_prefix|>'
|
| 129 |
+
print_info: FIM SUF token = 100260 '<|fim_suffix|>'
|
| 130 |
+
print_info: FIM MID token = 100259 '<|fim_middle|>'
|
| 131 |
+
print_info: FIM PAD token = 100261 '<|fim_pad|>'
|
| 132 |
+
print_info: EOG token = 100257 '<|end_of_text|>'
|
| 133 |
+
print_info: EOG token = 100261 '<|fim_pad|>'
|
| 134 |
+
print_info: max token length = 256
|
| 135 |
+
load_tensors: loading model tensors, this can take a while... (mmap = true)
|
| 136 |
+
load_tensors: offloading 20 repeating layers to GPU
|
| 137 |
+
load_tensors: offloaded 20/33 layers to GPU
|
| 138 |
+
load_tensors: CPU_Mapped model buffer size = 248.24 MiB
|
| 139 |
+
load_tensors: CUDA0 model buffer size = 81.38 MiB
|
| 140 |
+
load_tensors: CUDA1 model buffer size = 83.95 MiB
|
| 141 |
+
..................................................................
|
| 142 |
+
llama_context: constructing llama_context
|
| 143 |
+
llama_context: n_seq_max = 1
|
| 144 |
+
llama_context: n_ctx = 2048
|
| 145 |
+
llama_context: n_ctx_seq = 2048
|
| 146 |
+
llama_context: n_batch = 2048
|
| 147 |
+
llama_context: n_ubatch = 512
|
| 148 |
+
llama_context: causal_attn = 1
|
| 149 |
+
llama_context: flash_attn = auto
|
| 150 |
+
llama_context: kv_unified = false
|
| 151 |
+
llama_context: freq_base = 10000.0
|
| 152 |
+
llama_context: freq_scale = 1
|
| 153 |
+
llama_context: n_ctx_seq (2048) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized
|
| 154 |
+
llama_context: CPU output buffer size = 0.38 MiB
|
| 155 |
+
llama_kv_cache: CPU KV buffer size = 2.00 MiB
|
| 156 |
+
llama_kv_cache: CUDA0 KV buffer size = 4.00 MiB
|
| 157 |
+
llama_kv_cache: CUDA1 KV buffer size = 2.00 MiB
|
| 158 |
+
llama_kv_cache: size = 8.00 MiB ( 2048 cells, 4 layers, 1/1 seqs), K (f16): 4.00 MiB, V (f16): 4.00 MiB
|
| 159 |
+
llama_memory_recurrent: CPU RS buffer size = 8.48 MiB
|
| 160 |
+
llama_memory_recurrent: CUDA0 RS buffer size = 6.16 MiB
|
| 161 |
+
llama_memory_recurrent: CUDA1 RS buffer size = 6.93 MiB
|
| 162 |
+
llama_memory_recurrent: size = 21.57 MiB ( 1 cells, 32 layers, 1 seqs), R (f32): 0.57 MiB, S (f32): 21.00 MiB
|
| 163 |
+
llama_context: Flash Attention was auto, set to enabled
|
| 164 |
+
llama_context: CUDA0 compute buffer size = 351.61 MiB
|
| 165 |
+
llama_context: CUDA1 compute buffer size = 22.39 MiB
|
| 166 |
+
llama_context: CUDA_Host compute buffer size = 18.34 MiB
|
| 167 |
+
llama_context: graph nodes = 1815
|
| 168 |
+
llama_context: graph splits = 182 (with bs=512), 41 (with bs=1)
|
| 169 |
+
common_init_from_params: added <|end_of_text|> logit bias = -inf
|
| 170 |
+
common_init_from_params: added <|fim_pad|> logit bias = -inf
|
| 171 |
+
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
|
| 172 |
+
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
|
| 173 |
+
|
| 174 |
+
system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
|
| 175 |
+
perplexity: tokenizing the input ..
|
| 176 |
+
perplexity: tokenization took 36.886 ms
|
| 177 |
+
perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
|
| 178 |
+
perplexity: 0.53 seconds per pass - ETA 0.12 minutes
|
| 179 |
+
[1]8.7490,[2]10.0105,[3]9.5878,[4]9.9360,[5]10.1086,[6]10.1772,[7]10.3318,[8]10.0191,[9]10.0694,[10]10.0736,[11]10.3061,[12]10.3801,[13]10.5055,[14]10.4769,[15]10.3689,
|
| 180 |
+
Final estimate: PPL = 10.3689 +/- 0.23339
|
| 181 |
+
|
| 182 |
+
llama_perf_context_print: load time = 214.11 ms
|
| 183 |
+
llama_perf_context_print: prompt eval time = 5401.50 ms / 30720 tokens ( 0.18 ms per token, 5687.31 tokens per second)
|
| 184 |
+
llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
|
| 185 |
+
llama_perf_context_print: total time = 5702.33 ms / 30721 tokens
|
| 186 |
+
llama_perf_context_print: graphs reused = 0
|
| 187 |
+
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
|
| 188 |
+
llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24107 = 20496 + ( 443 = 81 + 10 + 351) + 3167 |
|
| 189 |
+
llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 23400 + ( 115 = 83 + 8 + 22) + 608 |
|
| 190 |
+
llama_memory_breakdown_print: | - Host | 277 = 248 + 10 + 18 |
|
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-embd_f16/ppl_corpus_code.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-embd_f16/ppl_corpus_general.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-embd_f16/ppl_corpus_math.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-router_gate_emb_f16/llamabench.txt
ADDED
|
@@ -0,0 +1,11 @@
|
| 1 |
+
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
| 2 |
+
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
|
| 3 |
+
ggml_cuda_init: found 2 CUDA devices:
|
| 4 |
+
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 5 |
+
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 6 |
+
| model | size | params | backend | ngl | test | t/s |
|
| 7 |
+
| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
|
| 8 |
+
| granitehybrid 350M MXFP4 MoE | 318.51 MiB | 340.33 M | CUDA | 35 | pp8 | 1649.31 ± 23.94 |
|
| 9 |
+
| granitehybrid 350M MXFP4 MoE | 318.51 MiB | 340.33 M | CUDA | 35 | tg128 | 300.67 ± 11.67 |
|
| 10 |
+
|
| 11 |
+
build: 92bb442ad (7040)
|
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-router_gate_emb_f16/perplexity_code.txt
ADDED
|
@@ -0,0 +1,190 @@
|
| 1 |
+
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
| 2 |
+
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
|
| 3 |
+
ggml_cuda_init: found 2 CUDA devices:
|
| 4 |
+
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 5 |
+
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 6 |
+
build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
|
| 7 |
+
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 21079 MiB free
|
| 8 |
+
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
|
| 9 |
+
llama_model_loader: loaded meta data with 48 key-value pairs and 402 tensors from /mnt/world8/AI/Models/granite-4.0-h-350m-unsloth/GGUF/MXFP4/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-router_gate_emb_f16.gguf (version GGUF V3 (latest))
|
| 10 |
+
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
|
| 11 |
+
llama_model_loader: - kv 0: general.architecture str = granitehybrid
|
| 12 |
+
llama_model_loader: - kv 1: general.type str = model
|
| 13 |
+
llama_model_loader: - kv 2: general.name str = Granite 4.0 H 350m Unsloth
|
| 14 |
+
llama_model_loader: - kv 3: general.finetune str = unsloth
|
| 15 |
+
llama_model_loader: - kv 4: general.basename str = granite-4.0-h
|
| 16 |
+
llama_model_loader: - kv 5: general.size_label str = 350M
|
| 17 |
+
llama_model_loader: - kv 6: general.license str = apache-2.0
|
| 18 |
+
llama_model_loader: - kv 7: general.base_model.count u32 = 1
|
| 19 |
+
llama_model_loader: - kv 8: general.base_model.0.name str = Granite 4.0 H 350m
|
| 20 |
+
llama_model_loader: - kv 9: general.base_model.0.organization str = Ibm Granite
|
| 21 |
+
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/ibm-granite/gr...
|
| 22 |
+
llama_model_loader: - kv 11: general.tags arr[str,3] = ["language", "unsloth", "granite-4.0"]
|
| 23 |
+
llama_model_loader: - kv 12: granitehybrid.block_count u32 = 32
|
| 24 |
+
llama_model_loader: - kv 13: granitehybrid.context_length u32 = 1048576
|
| 25 |
+
llama_model_loader: - kv 14: granitehybrid.embedding_length u32 = 768
|
| 26 |
+
llama_model_loader: - kv 15: granitehybrid.feed_forward_length u32 = 2048
|
| 27 |
+
llama_model_loader: - kv 16: granitehybrid.attention.head_count u32 = 12
|
| 28 |
+
llama_model_loader: - kv 17: granitehybrid.attention.head_count_kv arr[i32,32] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
|
| 29 |
+
llama_model_loader: - kv 18: granitehybrid.rope.freq_base f32 = 10000.000000
|
| 30 |
+
llama_model_loader: - kv 19: granitehybrid.attention.layer_norm_rms_epsilon f32 = 0.000010
|
| 31 |
+
llama_model_loader: - kv 20: granitehybrid.expert_count u32 = 0
|
| 32 |
+
llama_model_loader: - kv 21: granitehybrid.expert_used_count u32 = 0
|
| 33 |
+
llama_model_loader: - kv 22: granitehybrid.vocab_size u32 = 100352
|
| 34 |
+
llama_model_loader: - kv 23: granitehybrid.rope.dimension_count u32 = 64
|
| 35 |
+
llama_model_loader: - kv 24: granitehybrid.attention.scale f32 = 0.015625
|
| 36 |
+
llama_model_loader: - kv 25: granitehybrid.embedding_scale f32 = 12.000000
|
| 37 |
+
llama_model_loader: - kv 26: granitehybrid.residual_scale f32 = 0.246000
|
| 38 |
+
llama_model_loader: - kv 27: granitehybrid.logit_scale f32 = 3.000000
|
| 39 |
+
llama_model_loader: - kv 28: granitehybrid.expert_shared_feed_forward_length u32 = 2048
|
| 40 |
+
llama_model_loader: - kv 29: granitehybrid.ssm.conv_kernel u32 = 4
|
| 41 |
+
llama_model_loader: - kv 30: granitehybrid.ssm.state_size u32 = 128
|
| 42 |
+
llama_model_loader: - kv 31: granitehybrid.ssm.group_count u32 = 1
|
| 43 |
+
llama_model_loader: - kv 32: granitehybrid.ssm.inner_size u32 = 1536
|
| 44 |
+
llama_model_loader: - kv 33: granitehybrid.ssm.time_step_rank u32 = 48
|
| 45 |
+
llama_model_loader: - kv 34: granitehybrid.rope.scaling.finetuned bool = false
|
| 46 |
+
llama_model_loader: - kv 35: tokenizer.ggml.model str = gpt2
|
| 47 |
+
llama_model_loader: - kv 36: tokenizer.ggml.pre str = dbrx
|
| 48 |
+
llama_model_loader: - kv 37: tokenizer.ggml.tokens arr[str,100352] = ["!", "\"", "#", "$", "%", "&", "'", ...
|
| 49 |
+
llama_model_loader: - kv 38: tokenizer.ggml.token_type arr[i32,100352] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
|
| 50 |
+
llama_model_loader: - kv 39: tokenizer.ggml.merges arr[str,100000] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
|
| 51 |
+
llama_model_loader: - kv 40: tokenizer.ggml.bos_token_id u32 = 100257
|
| 52 |
+
llama_model_loader: - kv 41: tokenizer.ggml.eos_token_id u32 = 100257
|
| 53 |
+
llama_model_loader: - kv 42: tokenizer.ggml.unknown_token_id u32 = 100269
|
| 54 |
+
llama_model_loader: - kv 43: tokenizer.ggml.padding_token_id u32 = 100256
|
| 55 |
+
llama_model_loader: - kv 44: tokenizer.ggml.add_bos_token bool = false
|
| 56 |
+
llama_model_loader: - kv 45: tokenizer.chat_template str = {%- set tools_system_message_prefix =...
|
| 57 |
+
llama_model_loader: - kv 46: general.quantization_version u32 = 2
|
| 58 |
+
llama_model_loader: - kv 47: general.file_type u32 = 38
|
| 59 |
+
llama_model_loader: - type f32: 233 tensors
|
| 60 |
+
llama_model_loader: - type f16: 4 tensors
|
| 61 |
+
llama_model_loader: - type q8_0: 132 tensors
|
| 62 |
+
llama_model_loader: - type q6_K: 33 tensors
|
| 63 |
+
print_info: file format = GGUF V3 (latest)
|
| 64 |
+
print_info: file type = MXFP4 MoE
|
| 65 |
+
print_info: file size = 318.51 MiB (7.85 BPW)
|
| 66 |
+
load: printing all EOG tokens:
|
| 67 |
+
load: - 100257 ('<|end_of_text|>')
|
| 68 |
+
load: - 100261 ('<|fim_pad|>')
|
| 69 |
+
load: special tokens cache size = 96
|
| 70 |
+
load: token to piece cache size = 0.6152 MB
|
| 71 |
+
print_info: arch = granitehybrid
|
| 72 |
+
print_info: vocab_only = 0
|
| 73 |
+
print_info: n_ctx_train = 1048576
|
| 74 |
+
print_info: n_embd = 768
|
| 75 |
+
print_info: n_embd_inp = 768
|
| 76 |
+
print_info: n_layer = 32
|
| 77 |
+
print_info: n_head = 12
|
| 78 |
+
print_info: n_head_kv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]
|
| 79 |
+
print_info: n_rot = 64
|
| 80 |
+
print_info: n_swa = 0
|
| 81 |
+
print_info: is_swa_any = 0
|
| 82 |
+
print_info: n_embd_head_k = 64
|
| 83 |
+
print_info: n_embd_head_v = 64
|
| 84 |
+
print_info: n_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]
|
| 85 |
+
print_info: n_embd_k_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 86 |
+
print_info: n_embd_v_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 87 |
+
print_info: f_norm_eps = 0.0e+00
|
| 88 |
+
print_info: f_norm_rms_eps = 1.0e-05
|
| 89 |
+
print_info: f_clamp_kqv = 0.0e+00
|
| 90 |
+
print_info: f_max_alibi_bias = 0.0e+00
|
| 91 |
+
print_info: f_logit_scale = 3.0e+00
|
| 92 |
+
print_info: f_attn_scale = 1.6e-02
|
| 93 |
+
print_info: n_ff = 2048
|
| 94 |
+
print_info: n_expert = 0
|
| 95 |
+
print_info: n_expert_used = 0
|
| 96 |
+
print_info: n_expert_groups = 0
|
| 97 |
+
print_info: n_group_used = 0
|
| 98 |
+
print_info: causal attn = 1
|
| 99 |
+
print_info: pooling type = 0
|
| 100 |
+
print_info: rope type = 0
|
| 101 |
+
print_info: rope scaling = linear
|
| 102 |
+
print_info: freq_base_train = 10000.0
|
| 103 |
+
print_info: freq_scale_train = 1
|
| 104 |
+
print_info: n_ctx_orig_yarn = 1048576
|
| 105 |
+
print_info: rope_finetuned = unknown
|
| 106 |
+
print_info: ssm_d_conv = 4
|
| 107 |
+
print_info: ssm_d_inner = 1536
|
| 108 |
+
print_info: ssm_d_state = 128
|
| 109 |
+
print_info: ssm_dt_rank = 48
|
| 110 |
+
print_info: ssm_n_group = 1
|
| 111 |
+
print_info: ssm_dt_b_c_rms = 0
|
| 112 |
+
print_info: model type = 350M
|
| 113 |
+
print_info: model params = 340.33 M
|
| 114 |
+
print_info: general.name = Granite 4.0 H 350m Unsloth
|
| 115 |
+
print_info: f_embedding_scale = 12.000000
|
| 116 |
+
print_info: f_residual_scale = 0.246000
|
| 117 |
+
print_info: f_attention_scale = 0.015625
|
| 118 |
+
print_info: n_ff_shexp = 2048
|
| 119 |
+
print_info: vocab type = BPE
|
| 120 |
+
print_info: n_vocab = 100352
|
| 121 |
+
print_info: n_merges = 100000
|
| 122 |
+
print_info: BOS token = 100257 '<|end_of_text|>'
|
| 123 |
+
print_info: EOS token = 100257 '<|end_of_text|>'
|
| 124 |
+
print_info: EOT token = 100257 '<|end_of_text|>'
|
| 125 |
+
print_info: UNK token = 100269 '<|unk|>'
|
| 126 |
+
print_info: PAD token = 100256 '<|pad|>'
|
| 127 |
+
print_info: LF token = 198 'Ċ'
|
| 128 |
+
print_info: FIM PRE token = 100258 '<|fim_prefix|>'
|
| 129 |
+
print_info: FIM SUF token = 100260 '<|fim_suffix|>'
|
| 130 |
+
print_info: FIM MID token = 100259 '<|fim_middle|>'
|
| 131 |
+
print_info: FIM PAD token = 100261 '<|fim_pad|>'
|
| 132 |
+
print_info: EOG token = 100257 '<|end_of_text|>'
|
| 133 |
+
print_info: EOG token = 100261 '<|fim_pad|>'
|
| 134 |
+
print_info: max token length = 256
|
| 135 |
+
load_tensors: loading model tensors, this can take a while... (mmap = true)
|
| 136 |
+
load_tensors: offloading 20 repeating layers to GPU
|
| 137 |
+
load_tensors: offloaded 20/33 layers to GPU
|
| 138 |
+
load_tensors: CPU_Mapped model buffer size = 158.00 MiB
|
| 139 |
+
load_tensors: CUDA0 model buffer size = 79.40 MiB
|
| 140 |
+
load_tensors: CUDA1 model buffer size = 81.14 MiB
|
| 141 |
+
...................................................................................
|
| 142 |
+
llama_context: constructing llama_context
|
| 143 |
+
llama_context: n_seq_max = 1
|
| 144 |
+
llama_context: n_ctx = 2048
|
| 145 |
+
llama_context: n_ctx_seq = 2048
|
| 146 |
+
llama_context: n_batch = 2048
|
| 147 |
+
llama_context: n_ubatch = 512
|
| 148 |
+
llama_context: causal_attn = 1
|
| 149 |
+
llama_context: flash_attn = auto
|
| 150 |
+
llama_context: kv_unified = false
|
| 151 |
+
llama_context: freq_base = 10000.0
|
| 152 |
+
llama_context: freq_scale = 1
|
| 153 |
+
llama_context: n_ctx_seq (2048) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized
|
| 154 |
+
llama_context: CPU output buffer size = 0.38 MiB
|
| 155 |
+
llama_kv_cache: CPU KV buffer size = 2.00 MiB
|
| 156 |
+
llama_kv_cache: CUDA0 KV buffer size = 4.00 MiB
|
| 157 |
+
llama_kv_cache: CUDA1 KV buffer size = 2.00 MiB
|
| 158 |
+
llama_kv_cache: size = 8.00 MiB ( 2048 cells, 4 layers, 1/1 seqs), K (f16): 4.00 MiB, V (f16): 4.00 MiB
|
| 159 |
+
llama_memory_recurrent: CPU RS buffer size = 8.48 MiB
|
| 160 |
+
llama_memory_recurrent: CUDA0 RS buffer size = 6.16 MiB
|
| 161 |
+
llama_memory_recurrent: CUDA1 RS buffer size = 6.93 MiB
|
| 162 |
+
llama_memory_recurrent: size = 21.57 MiB ( 1 cells, 32 layers, 1 seqs), R (f32): 0.57 MiB, S (f32): 21.00 MiB
|
| 163 |
+
llama_context: Flash Attention was auto, set to enabled
|
| 164 |
+
llama_context: CUDA0 compute buffer size = 267.39 MiB
|
| 165 |
+
llama_context: CUDA1 compute buffer size = 22.39 MiB
|
| 166 |
+
llama_context: CUDA_Host compute buffer size = 18.34 MiB
|
| 167 |
+
llama_context: graph nodes = 1815
|
| 168 |
+
llama_context: graph splits = 182 (with bs=512), 41 (with bs=1)
|
| 169 |
+
common_init_from_params: added <|end_of_text|> logit bias = -inf
|
| 170 |
+
common_init_from_params: added <|fim_pad|> logit bias = -inf
|
| 171 |
+
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
|
| 172 |
+
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
|
| 173 |
+
|
| 174 |
+
system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
|
| 175 |
+
perplexity: tokenizing the input ..
|
| 176 |
+
perplexity: tokenization took 90.608 ms
|
| 177 |
+
perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
|
| 178 |
+
perplexity: 0.50 seconds per pass - ETA 0.35 minutes
|
| 179 |
+
[1]4.3517,[2]3.9745,[3]2.5678,[4]2.3702,[5]2.6105,[6]2.8548,[7]2.7066,[8]2.5143,[9]2.3099,[10]2.1407,[11]2.1229,[12]2.1494,[13]2.0606,[14]2.0404,[15]2.0818,[16]2.0161,[17]1.9908,[18]2.0095,[19]1.9697,[20]1.9343,[21]1.9013,[22]1.8865,[23]1.9166,[24]1.8900,[25]1.9092,[26]1.8770,[27]1.8636,[28]1.8555,[29]1.9013,[30]1.9180,[31]1.9167,[32]1.8922,[33]1.9157,[34]1.9080,[35]1.8892,[36]1.9207,[37]1.9271,[38]1.9251,[39]1.9467,[40]1.9440,[41]1.9366,[42]1.9605,[43]1.9690,[44]1.9581,
|
| 180 |
+
Final estimate: PPL = 1.9581 +/- 0.01754
|
| 181 |
+
|
| 182 |
+
llama_perf_context_print: load time = 258.24 ms
|
| 183 |
+
llama_perf_context_print: prompt eval time = 15073.77 ms / 90112 tokens ( 0.17 ms per token, 5978.07 tokens per second)
|
| 184 |
+
llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
|
| 185 |
+
llama_perf_context_print: total time = 15896.19 ms / 90113 tokens
|
| 186 |
+
llama_perf_context_print: graphs reused = 0
|
| 187 |
+
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
|
| 188 |
+
llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24107 = 20643 + ( 356 = 79 + 10 + 267) + 3106 |
|
| 189 |
+
llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 23390 + ( 112 = 81 + 8 + 22) + 621 |
|
| 190 |
+
llama_memory_breakdown_print: | - Host | 186 = 157 + 10 + 18 |
|
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-router_gate_emb_f16/perplexity_general.txt
ADDED
|
@@ -0,0 +1,190 @@
|
| 1 |
+
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
| 2 |
+
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
|
| 3 |
+
ggml_cuda_init: found 2 CUDA devices:
|
| 4 |
+
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 5 |
+
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 6 |
+
build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
|
| 7 |
+
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 21084 MiB free
|
| 8 |
+
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
|
| 9 |
+
llama_model_loader: loaded meta data with 48 key-value pairs and 402 tensors from /mnt/world8/AI/Models/granite-4.0-h-350m-unsloth/GGUF/MXFP4/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-router_gate_emb_f16.gguf (version GGUF V3 (latest))
|
| 10 |
+
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
|
| 11 |
+
llama_model_loader: - kv 0: general.architecture str = granitehybrid
|
| 12 |
+
llama_model_loader: - kv 1: general.type str = model
|
| 13 |
+
llama_model_loader: - kv 2: general.name str = Granite 4.0 H 350m Unsloth
|
| 14 |
+
llama_model_loader: - kv 3: general.finetune str = unsloth
|
| 15 |
+
llama_model_loader: - kv 4: general.basename str = granite-4.0-h
|
| 16 |
+
llama_model_loader: - kv 5: general.size_label str = 350M
|
| 17 |
+
llama_model_loader: - kv 6: general.license str = apache-2.0
|
| 18 |
+
llama_model_loader: - kv 7: general.base_model.count u32 = 1
|
| 19 |
+
llama_model_loader: - kv 8: general.base_model.0.name str = Granite 4.0 H 350m
|
| 20 |
+
llama_model_loader: - kv 9: general.base_model.0.organization str = Ibm Granite
|
| 21 |
+
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/ibm-granite/gr...
|
| 22 |
+
llama_model_loader: - kv 11: general.tags arr[str,3] = ["language", "unsloth", "granite-4.0"]
|
| 23 |
+
llama_model_loader: - kv 12: granitehybrid.block_count u32 = 32
|
| 24 |
+
llama_model_loader: - kv 13: granitehybrid.context_length u32 = 1048576
|
| 25 |
+
llama_model_loader: - kv 14: granitehybrid.embedding_length u32 = 768
|
| 26 |
+
llama_model_loader: - kv 15: granitehybrid.feed_forward_length u32 = 2048
|
| 27 |
+
llama_model_loader: - kv 16: granitehybrid.attention.head_count u32 = 12
|
| 28 |
+
llama_model_loader: - kv 17: granitehybrid.attention.head_count_kv arr[i32,32] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
|
| 29 |
+
llama_model_loader: - kv 18: granitehybrid.rope.freq_base f32 = 10000.000000
|
| 30 |
+
llama_model_loader: - kv 19: granitehybrid.attention.layer_norm_rms_epsilon f32 = 0.000010
|
| 31 |
+
llama_model_loader: - kv 20: granitehybrid.expert_count u32 = 0
|
| 32 |
+
llama_model_loader: - kv 21: granitehybrid.expert_used_count u32 = 0
|
| 33 |
+
llama_model_loader: - kv 22: granitehybrid.vocab_size u32 = 100352
|
| 34 |
+
llama_model_loader: - kv 23: granitehybrid.rope.dimension_count u32 = 64
|
| 35 |
+
llama_model_loader: - kv 24: granitehybrid.attention.scale f32 = 0.015625
|
| 36 |
+
llama_model_loader: - kv 25: granitehybrid.embedding_scale f32 = 12.000000
|
| 37 |
+
llama_model_loader: - kv 26: granitehybrid.residual_scale f32 = 0.246000
|
| 38 |
+
llama_model_loader: - kv 27: granitehybrid.logit_scale f32 = 3.000000
|
| 39 |
+
llama_model_loader: - kv 28: granitehybrid.expert_shared_feed_forward_length u32 = 2048
|
| 40 |
+
llama_model_loader: - kv 29: granitehybrid.ssm.conv_kernel u32 = 4
|
| 41 |
+
llama_model_loader: - kv 30: granitehybrid.ssm.state_size u32 = 128
|
| 42 |
+
llama_model_loader: - kv 31: granitehybrid.ssm.group_count u32 = 1
|
| 43 |
+
llama_model_loader: - kv 32: granitehybrid.ssm.inner_size u32 = 1536
|
| 44 |
+
llama_model_loader: - kv 33: granitehybrid.ssm.time_step_rank u32 = 48
|
| 45 |
+
llama_model_loader: - kv 34: granitehybrid.rope.scaling.finetuned bool = false
|
| 46 |
+
llama_model_loader: - kv 35: tokenizer.ggml.model str = gpt2
|
| 47 |
+
llama_model_loader: - kv 36: tokenizer.ggml.pre str = dbrx
|
| 48 |
+
llama_model_loader: - kv 37: tokenizer.ggml.tokens arr[str,100352] = ["!", "\"", "#", "$", "%", "&", "'", ...
|
| 49 |
+
llama_model_loader: - kv 38: tokenizer.ggml.token_type arr[i32,100352] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
|
| 50 |
+
llama_model_loader: - kv 39: tokenizer.ggml.merges arr[str,100000] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
|
| 51 |
+
llama_model_loader: - kv 40: tokenizer.ggml.bos_token_id u32 = 100257
|
| 52 |
+
llama_model_loader: - kv 41: tokenizer.ggml.eos_token_id u32 = 100257
|
| 53 |
+
llama_model_loader: - kv 42: tokenizer.ggml.unknown_token_id u32 = 100269
|
| 54 |
+
llama_model_loader: - kv 43: tokenizer.ggml.padding_token_id u32 = 100256
|
| 55 |
+
llama_model_loader: - kv 44: tokenizer.ggml.add_bos_token bool = false
|
| 56 |
+
llama_model_loader: - kv 45: tokenizer.chat_template str = {%- set tools_system_message_prefix =...
|
| 57 |
+
llama_model_loader: - kv 46: general.quantization_version u32 = 2
|
| 58 |
+
llama_model_loader: - kv 47: general.file_type u32 = 38
|
| 59 |
+
llama_model_loader: - type f32: 233 tensors
|
| 60 |
+
llama_model_loader: - type f16: 4 tensors
|
| 61 |
+
llama_model_loader: - type q8_0: 132 tensors
|
| 62 |
+
llama_model_loader: - type q6_K: 33 tensors
|
| 63 |
+
print_info: file format = GGUF V3 (latest)
|
| 64 |
+
print_info: file type = MXFP4 MoE
|
| 65 |
+
print_info: file size = 318.51 MiB (7.85 BPW)
|
| 66 |
+
load: printing all EOG tokens:
|
| 67 |
+
load: - 100257 ('<|end_of_text|>')
|
| 68 |
+
load: - 100261 ('<|fim_pad|>')
|
| 69 |
+
load: special tokens cache size = 96
|
| 70 |
+
load: token to piece cache size = 0.6152 MB
|
| 71 |
+
print_info: arch = granitehybrid
|
| 72 |
+
print_info: vocab_only = 0
|
| 73 |
+
print_info: n_ctx_train = 1048576
|
| 74 |
+
print_info: n_embd = 768
|
| 75 |
+
print_info: n_embd_inp = 768
|
| 76 |
+
print_info: n_layer = 32
|
| 77 |
+
print_info: n_head = 12
|
| 78 |
+
print_info: n_head_kv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]
|
| 79 |
+
print_info: n_rot = 64
|
| 80 |
+
print_info: n_swa = 0
|
| 81 |
+
print_info: is_swa_any = 0
|
| 82 |
+
print_info: n_embd_head_k = 64
|
| 83 |
+
print_info: n_embd_head_v = 64
|
| 84 |
+
print_info: n_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]
|
| 85 |
+
print_info: n_embd_k_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 86 |
+
print_info: n_embd_v_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 87 |
+
print_info: f_norm_eps = 0.0e+00
|
| 88 |
+
print_info: f_norm_rms_eps = 1.0e-05
|
| 89 |
+
print_info: f_clamp_kqv = 0.0e+00
|
| 90 |
+
print_info: f_max_alibi_bias = 0.0e+00
|
| 91 |
+
print_info: f_logit_scale = 3.0e+00
|
| 92 |
+
print_info: f_attn_scale = 1.6e-02
|
| 93 |
+
print_info: n_ff = 2048
|
| 94 |
+
print_info: n_expert = 0
|
| 95 |
+
print_info: n_expert_used = 0
|
| 96 |
+
print_info: n_expert_groups = 0
|
| 97 |
+
print_info: n_group_used = 0
|
| 98 |
+
print_info: causal attn = 1
|
| 99 |
+
print_info: pooling type = 0
|
| 100 |
+
print_info: rope type = 0
|
| 101 |
+
print_info: rope scaling = linear
|
| 102 |
+
print_info: freq_base_train = 10000.0
|
| 103 |
+
print_info: freq_scale_train = 1
|
| 104 |
+
print_info: n_ctx_orig_yarn = 1048576
|
| 105 |
+
print_info: rope_finetuned = unknown
|
| 106 |
+
print_info: ssm_d_conv = 4
|
| 107 |
+
print_info: ssm_d_inner = 1536
|
| 108 |
+
print_info: ssm_d_state = 128
|
| 109 |
+
print_info: ssm_dt_rank = 48
|
| 110 |
+
print_info: ssm_n_group = 1
|
| 111 |
+
print_info: ssm_dt_b_c_rms = 0
|
| 112 |
+
print_info: model type = 350M
|
| 113 |
+
print_info: model params = 340.33 M
|
| 114 |
+
print_info: general.name = Granite 4.0 H 350m Unsloth
|
| 115 |
+
print_info: f_embedding_scale = 12.000000
|
| 116 |
+
print_info: f_residual_scale = 0.246000
|
| 117 |
+
print_info: f_attention_scale = 0.015625
|
| 118 |
+
print_info: n_ff_shexp = 2048
|
| 119 |
+
print_info: vocab type = BPE
|
| 120 |
+
print_info: n_vocab = 100352
|
| 121 |
+
print_info: n_merges = 100000
|
| 122 |
+
print_info: BOS token = 100257 '<|end_of_text|>'
|
| 123 |
+
print_info: EOS token = 100257 '<|end_of_text|>'
|
| 124 |
+
print_info: EOT token = 100257 '<|end_of_text|>'
|
| 125 |
+
print_info: UNK token = 100269 '<|unk|>'
|
| 126 |
+
print_info: PAD token = 100256 '<|pad|>'
|
| 127 |
+
print_info: LF token = 198 'Ċ'
|
| 128 |
+
print_info: FIM PRE token = 100258 '<|fim_prefix|>'
|
| 129 |
+
print_info: FIM SUF token = 100260 '<|fim_suffix|>'
|
| 130 |
+
print_info: FIM MID token = 100259 '<|fim_middle|>'
|
| 131 |
+
print_info: FIM PAD token = 100261 '<|fim_pad|>'
|
| 132 |
+
print_info: EOG token = 100257 '<|end_of_text|>'
|
| 133 |
+
print_info: EOG token = 100261 '<|fim_pad|>'
|
| 134 |
+
print_info: max token length = 256
|
| 135 |
+
load_tensors: loading model tensors, this can take a while... (mmap = true)
|
| 136 |
+
load_tensors: offloading 20 repeating layers to GPU
|
| 137 |
+
load_tensors: offloaded 20/33 layers to GPU
|
| 138 |
+
load_tensors: CPU_Mapped model buffer size = 158.00 MiB
|
| 139 |
+
load_tensors: CUDA0 model buffer size = 79.40 MiB
|
| 140 |
+
load_tensors: CUDA1 model buffer size = 81.14 MiB
|
| 141 |
+
...................................................................................
|
| 142 |
+
llama_context: constructing llama_context
|
| 143 |
+
llama_context: n_seq_max = 1
|
| 144 |
+
llama_context: n_ctx = 2048
|
| 145 |
+
llama_context: n_ctx_seq = 2048
|
| 146 |
+
llama_context: n_batch = 2048
|
| 147 |
+
llama_context: n_ubatch = 512
|
| 148 |
+
llama_context: causal_attn = 1
|
| 149 |
+
llama_context: flash_attn = auto
|
| 150 |
+
llama_context: kv_unified = false
|
| 151 |
+
llama_context: freq_base = 10000.0
|
| 152 |
+
llama_context: freq_scale = 1
|
| 153 |
+
llama_context: n_ctx_seq (2048) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized
|
| 154 |
+
llama_context: CPU output buffer size = 0.38 MiB
|
| 155 |
+
llama_kv_cache: CPU KV buffer size = 2.00 MiB
|
| 156 |
+
llama_kv_cache: CUDA0 KV buffer size = 4.00 MiB
|
| 157 |
+
llama_kv_cache: CUDA1 KV buffer size = 2.00 MiB
|
| 158 |
+
llama_kv_cache: size = 8.00 MiB ( 2048 cells, 4 layers, 1/1 seqs), K (f16): 4.00 MiB, V (f16): 4.00 MiB
|
| 159 |
+
llama_memory_recurrent: CPU RS buffer size = 8.48 MiB
|
| 160 |
+
llama_memory_recurrent: CUDA0 RS buffer size = 6.16 MiB
|
| 161 |
+
llama_memory_recurrent: CUDA1 RS buffer size = 6.93 MiB
|
| 162 |
+
llama_memory_recurrent: size = 21.57 MiB ( 1 cells, 32 layers, 1 seqs), R (f32): 0.57 MiB, S (f32): 21.00 MiB
|
| 163 |
+
llama_context: Flash Attention was auto, set to enabled
|
| 164 |
+
llama_context: CUDA0 compute buffer size = 267.39 MiB
|
| 165 |
+
llama_context: CUDA1 compute buffer size = 22.39 MiB
|
| 166 |
+
llama_context: CUDA_Host compute buffer size = 18.34 MiB
|
| 167 |
+
llama_context: graph nodes = 1815
|
| 168 |
+
llama_context: graph splits = 182 (with bs=512), 41 (with bs=1)
|
| 169 |
+
common_init_from_params: added <|end_of_text|> logit bias = -inf
|
| 170 |
+
common_init_from_params: added <|fim_pad|> logit bias = -inf
|
| 171 |
+
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
|
| 172 |
+
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
|
| 173 |
+
|
| 174 |
+
system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
|
| 175 |
+
perplexity: tokenizing the input ..
|
| 176 |
+
perplexity: tokenization took 36.875 ms
|
| 177 |
+
perplexity: calculating perplexity over 14 chunks, n_ctx=2048, batch_size=2048, n_seq=1
|
| 178 |
+
perplexity: 0.85 seconds per pass - ETA 0.18 minutes
|
| 179 |
+
[1]18.6512,[2]21.6381,[3]22.3486,[4]20.2410,[5]20.2498,[6]18.0768,[7]17.7092,[8]17.6486,[9]18.1639,[10]18.1415,[11]17.9674,[12]18.0778,[13]18.1444,[14]18.1862,
|
| 180 |
+
Final estimate: PPL = 18.1862 +/- 0.46855
|
| 181 |
+
|
| 182 |
+
llama_perf_context_print: load time = 282.45 ms
|
| 183 |
+
llama_perf_context_print: prompt eval time = 5159.99 ms / 28672 tokens ( 0.18 ms per token, 5556.60 tokens per second)
|
| 184 |
+
llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
|
| 185 |
+
llama_perf_context_print: total time = 5431.32 ms / 28673 tokens
|
| 186 |
+
llama_perf_context_print: graphs reused = 0
|
| 187 |
+
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
|
| 188 |
+
llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24107 = 20649 + ( 356 = 79 + 10 + 267) + 3100 |
|
| 189 |
+
llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 23390 + ( 112 = 81 + 8 + 22) + 621 |
|
| 190 |
+
llama_memory_breakdown_print: | - Host | 186 = 157 + 10 + 18 |
|
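(Note: every perplexity log added in this commit ends with a "Final estimate: PPL = ... +/- ..." line like the one above. As a minimal illustration only, assuming the Benchmarks/<variant>/perplexity_{code,general,math}.txt layout used in this commit, a small script such as the following could gather those lines into a per-variant comparison; the script and its helper names are hypothetical and not part of the repository.)

```python
import re
from pathlib import Path

# Illustrative only: harvest the "Final estimate" lines from the perplexity logs
# added in this commit and print them side by side for each quantization variant.
# Assumes the layout Benchmarks/<variant>/perplexity_{code,general,math}.txt.
PPL_RE = re.compile(r"Final estimate: PPL = ([0-9.]+) \+/- ([0-9.]+)")

def collect(root: str = "Benchmarks") -> dict:
    results: dict[str, dict[str, tuple[float, float]]] = {}
    for path in Path(root).glob("*/perplexity_*.txt"):
        match = PPL_RE.search(path.read_text(errors="ignore"))
        if match:
            corpus = path.stem.removeprefix("perplexity_")  # "code", "general", "math"
            results.setdefault(path.parent.name, {})[corpus] = (
                float(match.group(1)),
                float(match.group(2)),
            )
    return results

if __name__ == "__main__":
    for variant, by_corpus in sorted(collect().items()):
        cells = "  ".join(
            f"{corpus}: {ppl:.4f} +/- {err:.4f}"
            for corpus, (ppl, err) in sorted(by_corpus.items())
        )
        print(f"{variant}  {cells}")
```

(Run from the repository root; it prints one line per quantization variant with the code/general/math perplexities side by side.)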
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-router_gate_emb_f16/perplexity_math.txt
ADDED
|
@@ -0,0 +1,190 @@
| 1 |
+
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
| 2 |
+
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
|
| 3 |
+
ggml_cuda_init: found 2 CUDA devices:
|
| 4 |
+
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 5 |
+
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 6 |
+
build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
|
| 7 |
+
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 21079 MiB free
|
| 8 |
+
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
|
| 9 |
+
llama_model_loader: loaded meta data with 48 key-value pairs and 402 tensors from /mnt/world8/AI/Models/granite-4.0-h-350m-unsloth/GGUF/MXFP4/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-router_gate_emb_f16.gguf (version GGUF V3 (latest))
|
| 10 |
+
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
|
| 11 |
+
llama_model_loader: - kv 0: general.architecture str = granitehybrid
|
| 12 |
+
llama_model_loader: - kv 1: general.type str = model
|
| 13 |
+
llama_model_loader: - kv 2: general.name str = Granite 4.0 H 350m Unsloth
|
| 14 |
+
llama_model_loader: - kv 3: general.finetune str = unsloth
|
| 15 |
+
llama_model_loader: - kv 4: general.basename str = granite-4.0-h
|
| 16 |
+
llama_model_loader: - kv 5: general.size_label str = 350M
|
| 17 |
+
llama_model_loader: - kv 6: general.license str = apache-2.0
|
| 18 |
+
llama_model_loader: - kv 7: general.base_model.count u32 = 1
|
| 19 |
+
llama_model_loader: - kv 8: general.base_model.0.name str = Granite 4.0 H 350m
|
| 20 |
+
llama_model_loader: - kv 9: general.base_model.0.organization str = Ibm Granite
|
| 21 |
+
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/ibm-granite/gr...
|
| 22 |
+
llama_model_loader: - kv 11: general.tags arr[str,3] = ["language", "unsloth", "granite-4.0"]
|
| 23 |
+
llama_model_loader: - kv 12: granitehybrid.block_count u32 = 32
|
| 24 |
+
llama_model_loader: - kv 13: granitehybrid.context_length u32 = 1048576
|
| 25 |
+
llama_model_loader: - kv 14: granitehybrid.embedding_length u32 = 768
|
| 26 |
+
llama_model_loader: - kv 15: granitehybrid.feed_forward_length u32 = 2048
|
| 27 |
+
llama_model_loader: - kv 16: granitehybrid.attention.head_count u32 = 12
|
| 28 |
+
llama_model_loader: - kv 17: granitehybrid.attention.head_count_kv arr[i32,32] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
|
| 29 |
+
llama_model_loader: - kv 18: granitehybrid.rope.freq_base f32 = 10000.000000
|
| 30 |
+
llama_model_loader: - kv 19: granitehybrid.attention.layer_norm_rms_epsilon f32 = 0.000010
|
| 31 |
+
llama_model_loader: - kv 20: granitehybrid.expert_count u32 = 0
|
| 32 |
+
llama_model_loader: - kv 21: granitehybrid.expert_used_count u32 = 0
|
| 33 |
+
llama_model_loader: - kv 22: granitehybrid.vocab_size u32 = 100352
|
| 34 |
+
llama_model_loader: - kv 23: granitehybrid.rope.dimension_count u32 = 64
|
| 35 |
+
llama_model_loader: - kv 24: granitehybrid.attention.scale f32 = 0.015625
|
| 36 |
+
llama_model_loader: - kv 25: granitehybrid.embedding_scale f32 = 12.000000
|
| 37 |
+
llama_model_loader: - kv 26: granitehybrid.residual_scale f32 = 0.246000
|
| 38 |
+
llama_model_loader: - kv 27: granitehybrid.logit_scale f32 = 3.000000
|
| 39 |
+
llama_model_loader: - kv 28: granitehybrid.expert_shared_feed_forward_length u32 = 2048
|
| 40 |
+
llama_model_loader: - kv 29: granitehybrid.ssm.conv_kernel u32 = 4
|
| 41 |
+
llama_model_loader: - kv 30: granitehybrid.ssm.state_size u32 = 128
|
| 42 |
+
llama_model_loader: - kv 31: granitehybrid.ssm.group_count u32 = 1
|
| 43 |
+
llama_model_loader: - kv 32: granitehybrid.ssm.inner_size u32 = 1536
|
| 44 |
+
llama_model_loader: - kv 33: granitehybrid.ssm.time_step_rank u32 = 48
|
| 45 |
+
llama_model_loader: - kv 34: granitehybrid.rope.scaling.finetuned bool = false
|
| 46 |
+
llama_model_loader: - kv 35: tokenizer.ggml.model str = gpt2
|
| 47 |
+
llama_model_loader: - kv 36: tokenizer.ggml.pre str = dbrx
|
| 48 |
+
llama_model_loader: - kv 37: tokenizer.ggml.tokens arr[str,100352] = ["!", "\"", "#", "$", "%", "&", "'", ...
|
| 49 |
+
llama_model_loader: - kv 38: tokenizer.ggml.token_type arr[i32,100352] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
|
| 50 |
+
llama_model_loader: - kv 39: tokenizer.ggml.merges arr[str,100000] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
|
| 51 |
+
llama_model_loader: - kv 40: tokenizer.ggml.bos_token_id u32 = 100257
|
| 52 |
+
llama_model_loader: - kv 41: tokenizer.ggml.eos_token_id u32 = 100257
|
| 53 |
+
llama_model_loader: - kv 42: tokenizer.ggml.unknown_token_id u32 = 100269
|
| 54 |
+
llama_model_loader: - kv 43: tokenizer.ggml.padding_token_id u32 = 100256
|
| 55 |
+
llama_model_loader: - kv 44: tokenizer.ggml.add_bos_token bool = false
|
| 56 |
+
llama_model_loader: - kv 45: tokenizer.chat_template str = {%- set tools_system_message_prefix =...
|
| 57 |
+
llama_model_loader: - kv 46: general.quantization_version u32 = 2
|
| 58 |
+
llama_model_loader: - kv 47: general.file_type u32 = 38
|
| 59 |
+
llama_model_loader: - type f32: 233 tensors
|
| 60 |
+
llama_model_loader: - type f16: 4 tensors
|
| 61 |
+
llama_model_loader: - type q8_0: 132 tensors
|
| 62 |
+
llama_model_loader: - type q6_K: 33 tensors
|
| 63 |
+
print_info: file format = GGUF V3 (latest)
|
| 64 |
+
print_info: file type = MXFP4 MoE
|
| 65 |
+
print_info: file size = 318.51 MiB (7.85 BPW)
|
| 66 |
+
load: printing all EOG tokens:
|
| 67 |
+
load: - 100257 ('<|end_of_text|>')
|
| 68 |
+
load: - 100261 ('<|fim_pad|>')
|
| 69 |
+
load: special tokens cache size = 96
|
| 70 |
+
load: token to piece cache size = 0.6152 MB
|
| 71 |
+
print_info: arch = granitehybrid
|
| 72 |
+
print_info: vocab_only = 0
|
| 73 |
+
print_info: n_ctx_train = 1048576
|
| 74 |
+
print_info: n_embd = 768
|
| 75 |
+
print_info: n_embd_inp = 768
|
| 76 |
+
print_info: n_layer = 32
|
| 77 |
+
print_info: n_head = 12
|
| 78 |
+
print_info: n_head_kv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]
|
| 79 |
+
print_info: n_rot = 64
|
| 80 |
+
print_info: n_swa = 0
|
| 81 |
+
print_info: is_swa_any = 0
|
| 82 |
+
print_info: n_embd_head_k = 64
|
| 83 |
+
print_info: n_embd_head_v = 64
|
| 84 |
+
print_info: n_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]
|
| 85 |
+
print_info: n_embd_k_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 86 |
+
print_info: n_embd_v_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 87 |
+
print_info: f_norm_eps = 0.0e+00
|
| 88 |
+
print_info: f_norm_rms_eps = 1.0e-05
|
| 89 |
+
print_info: f_clamp_kqv = 0.0e+00
|
| 90 |
+
print_info: f_max_alibi_bias = 0.0e+00
|
| 91 |
+
print_info: f_logit_scale = 3.0e+00
|
| 92 |
+
print_info: f_attn_scale = 1.6e-02
|
| 93 |
+
print_info: n_ff = 2048
|
| 94 |
+
print_info: n_expert = 0
|
| 95 |
+
print_info: n_expert_used = 0
|
| 96 |
+
print_info: n_expert_groups = 0
|
| 97 |
+
print_info: n_group_used = 0
|
| 98 |
+
print_info: causal attn = 1
|
| 99 |
+
print_info: pooling type = 0
|
| 100 |
+
print_info: rope type = 0
|
| 101 |
+
print_info: rope scaling = linear
|
| 102 |
+
print_info: freq_base_train = 10000.0
|
| 103 |
+
print_info: freq_scale_train = 1
|
| 104 |
+
print_info: n_ctx_orig_yarn = 1048576
|
| 105 |
+
print_info: rope_finetuned = unknown
|
| 106 |
+
print_info: ssm_d_conv = 4
|
| 107 |
+
print_info: ssm_d_inner = 1536
|
| 108 |
+
print_info: ssm_d_state = 128
|
| 109 |
+
print_info: ssm_dt_rank = 48
|
| 110 |
+
print_info: ssm_n_group = 1
|
| 111 |
+
print_info: ssm_dt_b_c_rms = 0
|
| 112 |
+
print_info: model type = 350M
|
| 113 |
+
print_info: model params = 340.33 M
|
| 114 |
+
print_info: general.name = Granite 4.0 H 350m Unsloth
|
| 115 |
+
print_info: f_embedding_scale = 12.000000
|
| 116 |
+
print_info: f_residual_scale = 0.246000
|
| 117 |
+
print_info: f_attention_scale = 0.015625
|
| 118 |
+
print_info: n_ff_shexp = 2048
|
| 119 |
+
print_info: vocab type = BPE
|
| 120 |
+
print_info: n_vocab = 100352
|
| 121 |
+
print_info: n_merges = 100000
|
| 122 |
+
print_info: BOS token = 100257 '<|end_of_text|>'
|
| 123 |
+
print_info: EOS token = 100257 '<|end_of_text|>'
|
| 124 |
+
print_info: EOT token = 100257 '<|end_of_text|>'
|
| 125 |
+
print_info: UNK token = 100269 '<|unk|>'
|
| 126 |
+
print_info: PAD token = 100256 '<|pad|>'
|
| 127 |
+
print_info: LF token = 198 'Ċ'
|
| 128 |
+
print_info: FIM PRE token = 100258 '<|fim_prefix|>'
|
| 129 |
+
print_info: FIM SUF token = 100260 '<|fim_suffix|>'
|
| 130 |
+
print_info: FIM MID token = 100259 '<|fim_middle|>'
|
| 131 |
+
print_info: FIM PAD token = 100261 '<|fim_pad|>'
|
| 132 |
+
print_info: EOG token = 100257 '<|end_of_text|>'
|
| 133 |
+
print_info: EOG token = 100261 '<|fim_pad|>'
|
| 134 |
+
print_info: max token length = 256
|
| 135 |
+
load_tensors: loading model tensors, this can take a while... (mmap = true)
|
| 136 |
+
load_tensors: offloading 20 repeating layers to GPU
|
| 137 |
+
load_tensors: offloaded 20/33 layers to GPU
|
| 138 |
+
load_tensors: CPU_Mapped model buffer size = 158.00 MiB
|
| 139 |
+
load_tensors: CUDA0 model buffer size = 79.40 MiB
|
| 140 |
+
load_tensors: CUDA1 model buffer size = 81.14 MiB
|
| 141 |
+
...................................................................................
|
| 142 |
+
llama_context: constructing llama_context
|
| 143 |
+
llama_context: n_seq_max = 1
|
| 144 |
+
llama_context: n_ctx = 2048
|
| 145 |
+
llama_context: n_ctx_seq = 2048
|
| 146 |
+
llama_context: n_batch = 2048
|
| 147 |
+
llama_context: n_ubatch = 512
|
| 148 |
+
llama_context: causal_attn = 1
|
| 149 |
+
llama_context: flash_attn = auto
|
| 150 |
+
llama_context: kv_unified = false
|
| 151 |
+
llama_context: freq_base = 10000.0
|
| 152 |
+
llama_context: freq_scale = 1
|
| 153 |
+
llama_context: n_ctx_seq (2048) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized
|
| 154 |
+
llama_context: CPU output buffer size = 0.38 MiB
|
| 155 |
+
llama_kv_cache: CPU KV buffer size = 2.00 MiB
|
| 156 |
+
llama_kv_cache: CUDA0 KV buffer size = 4.00 MiB
|
| 157 |
+
llama_kv_cache: CUDA1 KV buffer size = 2.00 MiB
|
| 158 |
+
llama_kv_cache: size = 8.00 MiB ( 2048 cells, 4 layers, 1/1 seqs), K (f16): 4.00 MiB, V (f16): 4.00 MiB
|
| 159 |
+
llama_memory_recurrent: CPU RS buffer size = 8.48 MiB
|
| 160 |
+
llama_memory_recurrent: CUDA0 RS buffer size = 6.16 MiB
|
| 161 |
+
llama_memory_recurrent: CUDA1 RS buffer size = 6.93 MiB
|
| 162 |
+
llama_memory_recurrent: size = 21.57 MiB ( 1 cells, 32 layers, 1 seqs), R (f32): 0.57 MiB, S (f32): 21.00 MiB
|
| 163 |
+
llama_context: Flash Attention was auto, set to enabled
|
| 164 |
+
llama_context: CUDA0 compute buffer size = 267.39 MiB
|
| 165 |
+
llama_context: CUDA1 compute buffer size = 22.39 MiB
|
| 166 |
+
llama_context: CUDA_Host compute buffer size = 18.34 MiB
|
| 167 |
+
llama_context: graph nodes = 1815
|
| 168 |
+
llama_context: graph splits = 182 (with bs=512), 41 (with bs=1)
|
| 169 |
+
common_init_from_params: added <|end_of_text|> logit bias = -inf
|
| 170 |
+
common_init_from_params: added <|fim_pad|> logit bias = -inf
|
| 171 |
+
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
|
| 172 |
+
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
|
| 173 |
+
|
| 174 |
+
system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
|
| 175 |
+
perplexity: tokenizing the input ..
|
| 176 |
+
perplexity: tokenization took 35.426 ms
|
| 177 |
+
perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
|
| 178 |
+
perplexity: 0.57 seconds per pass - ETA 0.13 minutes
|
| 179 |
+
[1]8.7655,[2]9.9228,[3]9.4725,[4]9.7974,[5]9.9635,[6]10.0535,[7]10.2088,[8]9.9054,[9]9.9607,[10]9.9652,[11]10.1962,[12]10.2815,[13]10.4015,[14]10.3779,[15]10.2794,
|
| 180 |
+
Final estimate: PPL = 10.2794 +/- 0.23142
|
| 181 |
+
|
| 182 |
+
llama_perf_context_print: load time = 211.56 ms
|
| 183 |
+
llama_perf_context_print: prompt eval time = 5241.07 ms / 30720 tokens ( 0.17 ms per token, 5861.40 tokens per second)
|
| 184 |
+
llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
|
| 185 |
+
llama_perf_context_print: total time = 5525.24 ms / 30721 tokens
|
| 186 |
+
llama_perf_context_print: graphs reused = 0
|
| 187 |
+
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
|
| 188 |
+
llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24107 = 20643 + ( 356 = 79 + 10 + 267) + 3106 |
|
| 189 |
+
llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 23390 + ( 112 = 81 + 8 + 22) + 621 |
|
| 190 |
+
llama_memory_breakdown_print: | - Host | 186 = 157 + 10 + 18 |
|
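(For context: the "Final estimate" values reported in these logs are the usual token-level perplexity over the evaluated chunks, i.e. the exponential of the mean negative log-likelihood, as sketched below; the "+/-" term is the tool's standard-error estimate for that value, whose exact computation is not reproduced here.)

```latex
\mathrm{PPL} = \exp\!\left(-\frac{1}{N}\sum_{i=1}^{N}\log p\!\left(x_i \mid x_{<i}\right)\right)
```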
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-router_gate_emb_f16/ppl_corpus_code.txt
ADDED
|
The diff for this file is too large to render.
|
|
|
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-router_gate_emb_f16/ppl_corpus_general.txt
ADDED
|
The diff for this file is too large to render.
|
|
|
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_mxfp4-router_gate_emb_f16/ppl_corpus_math.txt
ADDED
|
The diff for this file is too large to render.
|
|
|
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-embd_f16/llamabench.txt
ADDED
|
@@ -0,0 +1,11 @@
| 1 |
+
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
| 2 |
+
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
|
| 3 |
+
ggml_cuda_init: found 2 CUDA devices:
|
| 4 |
+
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 5 |
+
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 6 |
+
| model | size | params | backend | ngl | test | t/s |
|
| 7 |
+
| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
|
| 8 |
+
| granitehybrid 350M MXFP4 MoE | 414.19 MiB | 340.33 M | CUDA | 35 | pp8 | 1577.68 ± 58.97 |
|
| 9 |
+
| granitehybrid 350M MXFP4 MoE | 414.19 MiB | 340.33 M | CUDA | 35 | tg128 | 307.95 ± 11.28 |
|
| 10 |
+
|
| 11 |
+
build: 92bb442ad (7040)
|
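(The llamabench.txt files added in this commit contain a markdown-style results table like the one above, with throughput reported as "mean ± std" in the t/s column. As an illustrative sketch only, assuming that exact 7-column layout, the rows could be pulled out as follows; parse_llamabench is a hypothetical helper, not part of the repository.)

```python
from pathlib import Path

def parse_llamabench(path: str) -> list[tuple[str, float, float]]:
    """Return (test, mean_tps, std_tps) rows from a llama-bench style table.

    Assumes the 7-column layout shown above:
    | model | size | params | backend | ngl | test | t/s |
    """
    rows = []
    for line in Path(path).read_text(errors="ignore").splitlines():
        cells = [c.strip() for c in line.strip().strip("|").split("|")]
        if len(cells) != 7 or "±" not in cells[-1]:
            continue  # skip header, separator, build line, and blank lines
        mean, std = (float(x) for x in cells[-1].split("±"))
        rows.append((cells[5], mean, std))
    return rows

# Hypothetical usage against one of the files added in this commit:
# parse_llamabench("Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-embd_f16/llamabench.txt")
# -> [("pp8", 1577.68, 58.97), ("tg128", 307.95, 11.28)]
```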
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-embd_f16/perplexity_code.txt
ADDED
|
@@ -0,0 +1,190 @@
| 1 |
+
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
| 2 |
+
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
|
| 3 |
+
ggml_cuda_init: found 2 CUDA devices:
|
| 4 |
+
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 5 |
+
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 6 |
+
build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
|
| 7 |
+
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 21085 MiB free
|
| 8 |
+
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
|
| 9 |
+
llama_model_loader: loaded meta data with 48 key-value pairs and 402 tensors from /mnt/world8/AI/Models/granite-4.0-h-350m-unsloth/GGUF/MXFP4/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-embd_f16.gguf (version GGUF V3 (latest))
|
| 10 |
+
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
|
| 11 |
+
llama_model_loader: - kv 0: general.architecture str = granitehybrid
|
| 12 |
+
llama_model_loader: - kv 1: general.type str = model
|
| 13 |
+
llama_model_loader: - kv 2: general.name str = Granite 4.0 H 350m Unsloth
|
| 14 |
+
llama_model_loader: - kv 3: general.finetune str = unsloth
|
| 15 |
+
llama_model_loader: - kv 4: general.basename str = granite-4.0-h
|
| 16 |
+
llama_model_loader: - kv 5: general.size_label str = 350M
|
| 17 |
+
llama_model_loader: - kv 6: general.license str = apache-2.0
|
| 18 |
+
llama_model_loader: - kv 7: general.base_model.count u32 = 1
|
| 19 |
+
llama_model_loader: - kv 8: general.base_model.0.name str = Granite 4.0 H 350m
|
| 20 |
+
llama_model_loader: - kv 9: general.base_model.0.organization str = Ibm Granite
|
| 21 |
+
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/ibm-granite/gr...
|
| 22 |
+
llama_model_loader: - kv 11: general.tags arr[str,3] = ["language", "unsloth", "granite-4.0"]
|
| 23 |
+
llama_model_loader: - kv 12: granitehybrid.block_count u32 = 32
|
| 24 |
+
llama_model_loader: - kv 13: granitehybrid.context_length u32 = 1048576
|
| 25 |
+
llama_model_loader: - kv 14: granitehybrid.embedding_length u32 = 768
|
| 26 |
+
llama_model_loader: - kv 15: granitehybrid.feed_forward_length u32 = 2048
|
| 27 |
+
llama_model_loader: - kv 16: granitehybrid.attention.head_count u32 = 12
|
| 28 |
+
llama_model_loader: - kv 17: granitehybrid.attention.head_count_kv arr[i32,32] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
|
| 29 |
+
llama_model_loader: - kv 18: granitehybrid.rope.freq_base f32 = 10000.000000
|
| 30 |
+
llama_model_loader: - kv 19: granitehybrid.attention.layer_norm_rms_epsilon f32 = 0.000010
|
| 31 |
+
llama_model_loader: - kv 20: granitehybrid.expert_count u32 = 0
|
| 32 |
+
llama_model_loader: - kv 21: granitehybrid.expert_used_count u32 = 0
|
| 33 |
+
llama_model_loader: - kv 22: granitehybrid.vocab_size u32 = 100352
|
| 34 |
+
llama_model_loader: - kv 23: granitehybrid.rope.dimension_count u32 = 64
|
| 35 |
+
llama_model_loader: - kv 24: granitehybrid.attention.scale f32 = 0.015625
|
| 36 |
+
llama_model_loader: - kv 25: granitehybrid.embedding_scale f32 = 12.000000
|
| 37 |
+
llama_model_loader: - kv 26: granitehybrid.residual_scale f32 = 0.246000
|
| 38 |
+
llama_model_loader: - kv 27: granitehybrid.logit_scale f32 = 3.000000
|
| 39 |
+
llama_model_loader: - kv 28: granitehybrid.expert_shared_feed_forward_length u32 = 2048
|
| 40 |
+
llama_model_loader: - kv 29: granitehybrid.ssm.conv_kernel u32 = 4
|
| 41 |
+
llama_model_loader: - kv 30: granitehybrid.ssm.state_size u32 = 128
|
| 42 |
+
llama_model_loader: - kv 31: granitehybrid.ssm.group_count u32 = 1
|
| 43 |
+
llama_model_loader: - kv 32: granitehybrid.ssm.inner_size u32 = 1536
|
| 44 |
+
llama_model_loader: - kv 33: granitehybrid.ssm.time_step_rank u32 = 48
|
| 45 |
+
llama_model_loader: - kv 34: granitehybrid.rope.scaling.finetuned bool = false
|
| 46 |
+
llama_model_loader: - kv 35: tokenizer.ggml.model str = gpt2
|
| 47 |
+
llama_model_loader: - kv 36: tokenizer.ggml.pre str = dbrx
|
| 48 |
+
llama_model_loader: - kv 37: tokenizer.ggml.tokens arr[str,100352] = ["!", "\"", "#", "$", "%", "&", "'", ...
|
| 49 |
+
llama_model_loader: - kv 38: tokenizer.ggml.token_type arr[i32,100352] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
|
| 50 |
+
llama_model_loader: - kv 39: tokenizer.ggml.merges arr[str,100000] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
|
| 51 |
+
llama_model_loader: - kv 40: tokenizer.ggml.bos_token_id u32 = 100257
|
| 52 |
+
llama_model_loader: - kv 41: tokenizer.ggml.eos_token_id u32 = 100257
|
| 53 |
+
llama_model_loader: - kv 42: tokenizer.ggml.unknown_token_id u32 = 100269
|
| 54 |
+
llama_model_loader: - kv 43: tokenizer.ggml.padding_token_id u32 = 100256
|
| 55 |
+
llama_model_loader: - kv 44: tokenizer.ggml.add_bos_token bool = false
|
| 56 |
+
llama_model_loader: - kv 45: tokenizer.chat_template str = {%- set tools_system_message_prefix =...
|
| 57 |
+
llama_model_loader: - kv 46: general.quantization_version u32 = 2
|
| 58 |
+
llama_model_loader: - kv 47: general.file_type u32 = 38
|
| 59 |
+
llama_model_loader: - type f32: 233 tensors
|
| 60 |
+
llama_model_loader: - type f16: 1 tensors
|
| 61 |
+
llama_model_loader: - type q8_0: 164 tensors
|
| 62 |
+
llama_model_loader: - type q6_K: 4 tensors
|
| 63 |
+
print_info: file format = GGUF V3 (latest)
|
| 64 |
+
print_info: file type = MXFP4 MoE
|
| 65 |
+
print_info: file size = 414.19 MiB (10.21 BPW)
|
| 66 |
+
load: printing all EOG tokens:
|
| 67 |
+
load: - 100257 ('<|end_of_text|>')
|
| 68 |
+
load: - 100261 ('<|fim_pad|>')
|
| 69 |
+
load: special tokens cache size = 96
|
| 70 |
+
load: token to piece cache size = 0.6152 MB
|
| 71 |
+
print_info: arch = granitehybrid
|
| 72 |
+
print_info: vocab_only = 0
|
| 73 |
+
print_info: n_ctx_train = 1048576
|
| 74 |
+
print_info: n_embd = 768
|
| 75 |
+
print_info: n_embd_inp = 768
|
| 76 |
+
print_info: n_layer = 32
|
| 77 |
+
print_info: n_head = 12
|
| 78 |
+
print_info: n_head_kv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]
|
| 79 |
+
print_info: n_rot = 64
|
| 80 |
+
print_info: n_swa = 0
|
| 81 |
+
print_info: is_swa_any = 0
|
| 82 |
+
print_info: n_embd_head_k = 64
|
| 83 |
+
print_info: n_embd_head_v = 64
|
| 84 |
+
print_info: n_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]
|
| 85 |
+
print_info: n_embd_k_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 86 |
+
print_info: n_embd_v_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 87 |
+
print_info: f_norm_eps = 0.0e+00
|
| 88 |
+
print_info: f_norm_rms_eps = 1.0e-05
|
| 89 |
+
print_info: f_clamp_kqv = 0.0e+00
|
| 90 |
+
print_info: f_max_alibi_bias = 0.0e+00
|
| 91 |
+
print_info: f_logit_scale = 3.0e+00
|
| 92 |
+
print_info: f_attn_scale = 1.6e-02
|
| 93 |
+
print_info: n_ff = 2048
|
| 94 |
+
print_info: n_expert = 0
|
| 95 |
+
print_info: n_expert_used = 0
|
| 96 |
+
print_info: n_expert_groups = 0
|
| 97 |
+
print_info: n_group_used = 0
|
| 98 |
+
print_info: causal attn = 1
|
| 99 |
+
print_info: pooling type = 0
|
| 100 |
+
print_info: rope type = 0
|
| 101 |
+
print_info: rope scaling = linear
|
| 102 |
+
print_info: freq_base_train = 10000.0
|
| 103 |
+
print_info: freq_scale_train = 1
|
| 104 |
+
print_info: n_ctx_orig_yarn = 1048576
|
| 105 |
+
print_info: rope_finetuned = unknown
|
| 106 |
+
print_info: ssm_d_conv = 4
|
| 107 |
+
print_info: ssm_d_inner = 1536
|
| 108 |
+
print_info: ssm_d_state = 128
|
| 109 |
+
print_info: ssm_dt_rank = 48
|
| 110 |
+
print_info: ssm_n_group = 1
|
| 111 |
+
print_info: ssm_dt_b_c_rms = 0
|
| 112 |
+
print_info: model type = 350M
|
| 113 |
+
print_info: model params = 340.33 M
|
| 114 |
+
print_info: general.name = Granite 4.0 H 350m Unsloth
|
| 115 |
+
print_info: f_embedding_scale = 12.000000
|
| 116 |
+
print_info: f_residual_scale = 0.246000
|
| 117 |
+
print_info: f_attention_scale = 0.015625
|
| 118 |
+
print_info: n_ff_shexp = 2048
|
| 119 |
+
print_info: vocab type = BPE
|
| 120 |
+
print_info: n_vocab = 100352
|
| 121 |
+
print_info: n_merges = 100000
|
| 122 |
+
print_info: BOS token = 100257 '<|end_of_text|>'
|
| 123 |
+
print_info: EOS token = 100257 '<|end_of_text|>'
|
| 124 |
+
print_info: EOT token = 100257 '<|end_of_text|>'
|
| 125 |
+
print_info: UNK token = 100269 '<|unk|>'
|
| 126 |
+
print_info: PAD token = 100256 '<|pad|>'
|
| 127 |
+
print_info: LF token = 198 'Ċ'
|
| 128 |
+
print_info: FIM PRE token = 100258 '<|fim_prefix|>'
|
| 129 |
+
print_info: FIM SUF token = 100260 '<|fim_suffix|>'
|
| 130 |
+
print_info: FIM MID token = 100259 '<|fim_middle|>'
|
| 131 |
+
print_info: FIM PAD token = 100261 '<|fim_pad|>'
|
| 132 |
+
print_info: EOG token = 100257 '<|end_of_text|>'
|
| 133 |
+
print_info: EOG token = 100261 '<|fim_pad|>'
|
| 134 |
+
print_info: max token length = 256
|
| 135 |
+
load_tensors: loading model tensors, this can take a while... (mmap = true)
|
| 136 |
+
load_tensors: offloading 20 repeating layers to GPU
|
| 137 |
+
load_tensors: offloaded 20/33 layers to GPU
|
| 138 |
+
load_tensors: CPU_Mapped model buffer size = 248.40 MiB
|
| 139 |
+
load_tensors: CUDA0 model buffer size = 81.71 MiB
|
| 140 |
+
load_tensors: CUDA1 model buffer size = 84.11 MiB
|
| 141 |
+
..................................................................
|
| 142 |
+
llama_context: constructing llama_context
|
| 143 |
+
llama_context: n_seq_max = 1
|
| 144 |
+
llama_context: n_ctx = 2048
|
| 145 |
+
llama_context: n_ctx_seq = 2048
|
| 146 |
+
llama_context: n_batch = 2048
|
| 147 |
+
llama_context: n_ubatch = 512
|
| 148 |
+
llama_context: causal_attn = 1
|
| 149 |
+
llama_context: flash_attn = auto
|
| 150 |
+
llama_context: kv_unified = false
|
| 151 |
+
llama_context: freq_base = 10000.0
|
| 152 |
+
llama_context: freq_scale = 1
|
| 153 |
+
llama_context: n_ctx_seq (2048) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized
|
| 154 |
+
llama_context: CPU output buffer size = 0.38 MiB
|
| 155 |
+
llama_kv_cache: CPU KV buffer size = 2.00 MiB
|
| 156 |
+
llama_kv_cache: CUDA0 KV buffer size = 4.00 MiB
|
| 157 |
+
llama_kv_cache: CUDA1 KV buffer size = 2.00 MiB
|
| 158 |
+
llama_kv_cache: size = 8.00 MiB ( 2048 cells, 4 layers, 1/1 seqs), K (f16): 4.00 MiB, V (f16): 4.00 MiB
|
| 159 |
+
llama_memory_recurrent: CPU RS buffer size = 8.48 MiB
|
| 160 |
+
llama_memory_recurrent: CUDA0 RS buffer size = 6.16 MiB
|
| 161 |
+
llama_memory_recurrent: CUDA1 RS buffer size = 6.93 MiB
|
| 162 |
+
llama_memory_recurrent: size = 21.57 MiB ( 1 cells, 32 layers, 1 seqs), R (f32): 0.57 MiB, S (f32): 21.00 MiB
|
| 163 |
+
llama_context: Flash Attention was auto, set to enabled
|
| 164 |
+
llama_context: CUDA0 compute buffer size = 351.61 MiB
|
| 165 |
+
llama_context: CUDA1 compute buffer size = 22.39 MiB
|
| 166 |
+
llama_context: CUDA_Host compute buffer size = 18.34 MiB
|
| 167 |
+
llama_context: graph nodes = 1815
|
| 168 |
+
llama_context: graph splits = 182 (with bs=512), 41 (with bs=1)
|
| 169 |
+
common_init_from_params: added <|end_of_text|> logit bias = -inf
|
| 170 |
+
common_init_from_params: added <|fim_pad|> logit bias = -inf
|
| 171 |
+
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
|
| 172 |
+
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
|
| 173 |
+
|
| 174 |
+
system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
|
| 175 |
+
perplexity: tokenizing the input ..
|
| 176 |
+
perplexity: tokenization took 103.423 ms
|
| 177 |
+
perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
|
| 178 |
+
perplexity: 0.57 seconds per pass - ETA 0.40 minutes
|
| 179 |
+
[1]4.3638,[2]3.9778,[3]2.5685,[4]2.3700,[5]2.6041,[6]2.8488,[7]2.7004,[8]2.5086,[9]2.3058,[10]2.1374,[11]2.1200,[12]2.1461,[13]2.0575,[14]2.0372,[15]2.0776,[16]2.0120,[17]1.9869,[18]2.0052,[19]1.9658,[20]1.9305,[21]1.8976,[22]1.8829,[23]1.9121,[24]1.8856,[25]1.9045,[26]1.8727,[27]1.8596,[28]1.8511,[29]1.8965,[30]1.9133,[31]1.9122,[32]1.8881,[33]1.9115,[34]1.9036,[35]1.8848,[36]1.9161,[37]1.9225,[38]1.9204,[39]1.9419,[40]1.9393,[41]1.9321,[42]1.9561,[43]1.9647,[44]1.9539,
|
| 180 |
+
Final estimate: PPL = 1.9539 +/- 0.01749
|
| 181 |
+
|
| 182 |
+
llama_perf_context_print: load time = 275.21 ms
|
| 183 |
+
llama_perf_context_print: prompt eval time = 16124.85 ms / 90112 tokens ( 0.18 ms per token, 5588.39 tokens per second)
|
| 184 |
+
llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
|
| 185 |
+
llama_perf_context_print: total time = 17002.61 ms / 90113 tokens
|
| 186 |
+
llama_perf_context_print: graphs reused = 0
|
| 187 |
+
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
|
| 188 |
+
llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24107 = 20499 + ( 443 = 81 + 10 + 351) + 3163 |
|
| 189 |
+
llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 23398 + ( 115 = 84 + 8 + 22) + 610 |
|
| 190 |
+
llama_memory_breakdown_print: | - Host | 277 = 248 + 10 + 18 |
|
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-embd_f16/perplexity_general.txt
ADDED
|
@@ -0,0 +1,190 @@
| 1 |
+
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
| 2 |
+
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
|
| 3 |
+
ggml_cuda_init: found 2 CUDA devices:
|
| 4 |
+
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 5 |
+
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 6 |
+
build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
|
| 7 |
+
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 21083 MiB free
|
| 8 |
+
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
|
| 9 |
+
llama_model_loader: loaded meta data with 48 key-value pairs and 402 tensors from /mnt/world8/AI/Models/granite-4.0-h-350m-unsloth/GGUF/MXFP4/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-embd_f16.gguf (version GGUF V3 (latest))
|
| 10 |
+
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
|
| 11 |
+
llama_model_loader: - kv 0: general.architecture str = granitehybrid
|
| 12 |
+
llama_model_loader: - kv 1: general.type str = model
|
| 13 |
+
llama_model_loader: - kv 2: general.name str = Granite 4.0 H 350m Unsloth
|
| 14 |
+
llama_model_loader: - kv 3: general.finetune str = unsloth
|
| 15 |
+
llama_model_loader: - kv 4: general.basename str = granite-4.0-h
|
| 16 |
+
llama_model_loader: - kv 5: general.size_label str = 350M
|
| 17 |
+
llama_model_loader: - kv 6: general.license str = apache-2.0
|
| 18 |
+
llama_model_loader: - kv 7: general.base_model.count u32 = 1
|
| 19 |
+
llama_model_loader: - kv 8: general.base_model.0.name str = Granite 4.0 H 350m
|
| 20 |
+
llama_model_loader: - kv 9: general.base_model.0.organization str = Ibm Granite
|
| 21 |
+
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/ibm-granite/gr...
|
| 22 |
+
llama_model_loader: - kv 11: general.tags arr[str,3] = ["language", "unsloth", "granite-4.0"]
|
| 23 |
+
llama_model_loader: - kv 12: granitehybrid.block_count u32 = 32
|
| 24 |
+
llama_model_loader: - kv 13: granitehybrid.context_length u32 = 1048576
|
| 25 |
+
llama_model_loader: - kv 14: granitehybrid.embedding_length u32 = 768
|
| 26 |
+
llama_model_loader: - kv 15: granitehybrid.feed_forward_length u32 = 2048
|
| 27 |
+
llama_model_loader: - kv 16: granitehybrid.attention.head_count u32 = 12
|
| 28 |
+
llama_model_loader: - kv 17: granitehybrid.attention.head_count_kv arr[i32,32] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
|
| 29 |
+
llama_model_loader: - kv 18: granitehybrid.rope.freq_base f32 = 10000.000000
|
| 30 |
+
llama_model_loader: - kv 19: granitehybrid.attention.layer_norm_rms_epsilon f32 = 0.000010
|
| 31 |
+
llama_model_loader: - kv 20: granitehybrid.expert_count u32 = 0
|
| 32 |
+
llama_model_loader: - kv 21: granitehybrid.expert_used_count u32 = 0
|
| 33 |
+
llama_model_loader: - kv 22: granitehybrid.vocab_size u32 = 100352
|
| 34 |
+
llama_model_loader: - kv 23: granitehybrid.rope.dimension_count u32 = 64
|
| 35 |
+
llama_model_loader: - kv 24: granitehybrid.attention.scale f32 = 0.015625
|
| 36 |
+
llama_model_loader: - kv 25: granitehybrid.embedding_scale f32 = 12.000000
|
| 37 |
+
llama_model_loader: - kv 26: granitehybrid.residual_scale f32 = 0.246000
|
| 38 |
+
llama_model_loader: - kv 27: granitehybrid.logit_scale f32 = 3.000000
|
| 39 |
+
llama_model_loader: - kv 28: granitehybrid.expert_shared_feed_forward_length u32 = 2048
|
| 40 |
+
llama_model_loader: - kv 29: granitehybrid.ssm.conv_kernel u32 = 4
|
| 41 |
+
llama_model_loader: - kv 30: granitehybrid.ssm.state_size u32 = 128
|
| 42 |
+
llama_model_loader: - kv 31: granitehybrid.ssm.group_count u32 = 1
|
| 43 |
+
llama_model_loader: - kv 32: granitehybrid.ssm.inner_size u32 = 1536
|
| 44 |
+
llama_model_loader: - kv 33: granitehybrid.ssm.time_step_rank u32 = 48
|
| 45 |
+
llama_model_loader: - kv 34: granitehybrid.rope.scaling.finetuned bool = false
|
| 46 |
+
llama_model_loader: - kv 35: tokenizer.ggml.model str = gpt2
|
| 47 |
+
llama_model_loader: - kv 36: tokenizer.ggml.pre str = dbrx
|
| 48 |
+
llama_model_loader: - kv 37: tokenizer.ggml.tokens arr[str,100352] = ["!", "\"", "#", "$", "%", "&", "'", ...
|
| 49 |
+
llama_model_loader: - kv 38: tokenizer.ggml.token_type arr[i32,100352] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
|
| 50 |
+
llama_model_loader: - kv 39: tokenizer.ggml.merges arr[str,100000] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
|
| 51 |
+
llama_model_loader: - kv 40: tokenizer.ggml.bos_token_id u32 = 100257
|
| 52 |
+
llama_model_loader: - kv 41: tokenizer.ggml.eos_token_id u32 = 100257
|
| 53 |
+
llama_model_loader: - kv 42: tokenizer.ggml.unknown_token_id u32 = 100269
|
| 54 |
+
llama_model_loader: - kv 43: tokenizer.ggml.padding_token_id u32 = 100256
|
| 55 |
+
llama_model_loader: - kv 44: tokenizer.ggml.add_bos_token bool = false
|
| 56 |
+
llama_model_loader: - kv 45: tokenizer.chat_template str = {%- set tools_system_message_prefix =...
|
| 57 |
+
llama_model_loader: - kv 46: general.quantization_version u32 = 2
|
| 58 |
+
llama_model_loader: - kv 47: general.file_type u32 = 38
|
| 59 |
+
llama_model_loader: - type f32: 233 tensors
|
| 60 |
+
llama_model_loader: - type f16: 1 tensors
|
| 61 |
+
llama_model_loader: - type q8_0: 164 tensors
|
| 62 |
+
llama_model_loader: - type q6_K: 4 tensors
|
| 63 |
+
print_info: file format = GGUF V3 (latest)
|
| 64 |
+
print_info: file type = MXFP4 MoE
|
| 65 |
+
print_info: file size = 414.19 MiB (10.21 BPW)
|
| 66 |
+
load: printing all EOG tokens:
|
| 67 |
+
load: - 100257 ('<|end_of_text|>')
|
| 68 |
+
load: - 100261 ('<|fim_pad|>')
|
| 69 |
+
load: special tokens cache size = 96
|
| 70 |
+
load: token to piece cache size = 0.6152 MB
|
| 71 |
+
print_info: arch = granitehybrid
|
| 72 |
+
print_info: vocab_only = 0
|
| 73 |
+
print_info: n_ctx_train = 1048576
|
| 74 |
+
print_info: n_embd = 768
|
| 75 |
+
print_info: n_embd_inp = 768
|
| 76 |
+
print_info: n_layer = 32
|
| 77 |
+
print_info: n_head = 12
|
| 78 |
+
print_info: n_head_kv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]
|
| 79 |
+
print_info: n_rot = 64
|
| 80 |
+
print_info: n_swa = 0
|
| 81 |
+
print_info: is_swa_any = 0
|
| 82 |
+
print_info: n_embd_head_k = 64
|
| 83 |
+
print_info: n_embd_head_v = 64
|
| 84 |
+
print_info: n_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]
|
| 85 |
+
print_info: n_embd_k_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 86 |
+
print_info: n_embd_v_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
|
| 87 |
+
print_info: f_norm_eps = 0.0e+00
|
| 88 |
+
print_info: f_norm_rms_eps = 1.0e-05
|
| 89 |
+
print_info: f_clamp_kqv = 0.0e+00
|
| 90 |
+
print_info: f_max_alibi_bias = 0.0e+00
|
| 91 |
+
print_info: f_logit_scale = 3.0e+00
|
| 92 |
+
print_info: f_attn_scale = 1.6e-02
|
| 93 |
+
print_info: n_ff = 2048
|
| 94 |
+
print_info: n_expert = 0
|
| 95 |
+
print_info: n_expert_used = 0
|
| 96 |
+
print_info: n_expert_groups = 0
|
| 97 |
+
print_info: n_group_used = 0
|
| 98 |
+
print_info: causal attn = 1
|
| 99 |
+
print_info: pooling type = 0
|
| 100 |
+
print_info: rope type = 0
|
| 101 |
+
print_info: rope scaling = linear
|
| 102 |
+
print_info: freq_base_train = 10000.0
|
| 103 |
+
print_info: freq_scale_train = 1
|
| 104 |
+
print_info: n_ctx_orig_yarn = 1048576
|
| 105 |
+
print_info: rope_finetuned = unknown
|
| 106 |
+
print_info: ssm_d_conv = 4
|
| 107 |
+
print_info: ssm_d_inner = 1536
|
| 108 |
+
print_info: ssm_d_state = 128
|
| 109 |
+
print_info: ssm_dt_rank = 48
|
| 110 |
+
print_info: ssm_n_group = 1
|
| 111 |
+
print_info: ssm_dt_b_c_rms = 0
|
| 112 |
+
print_info: model type = 350M
|
| 113 |
+
print_info: model params = 340.33 M
|
| 114 |
+
print_info: general.name = Granite 4.0 H 350m Unsloth
|
| 115 |
+
print_info: f_embedding_scale = 12.000000
|
| 116 |
+
print_info: f_residual_scale = 0.246000
|
| 117 |
+
print_info: f_attention_scale = 0.015625
|
| 118 |
+
print_info: n_ff_shexp = 2048
|
| 119 |
+
print_info: vocab type = BPE
|
| 120 |
+
print_info: n_vocab = 100352
|
| 121 |
+
print_info: n_merges = 100000
|
| 122 |
+
print_info: BOS token = 100257 '<|end_of_text|>'
|
| 123 |
+
print_info: EOS token = 100257 '<|end_of_text|>'
|
| 124 |
+
print_info: EOT token = 100257 '<|end_of_text|>'
|
| 125 |
+
print_info: UNK token = 100269 '<|unk|>'
|
| 126 |
+
print_info: PAD token = 100256 '<|pad|>'
|
| 127 |
+
print_info: LF token = 198 'Ċ'
|
| 128 |
+
print_info: FIM PRE token = 100258 '<|fim_prefix|>'
|
| 129 |
+
print_info: FIM SUF token = 100260 '<|fim_suffix|>'
|
| 130 |
+
print_info: FIM MID token = 100259 '<|fim_middle|>'
|
| 131 |
+
print_info: FIM PAD token = 100261 '<|fim_pad|>'
|
| 132 |
+
print_info: EOG token = 100257 '<|end_of_text|>'
|
| 133 |
+
print_info: EOG token = 100261 '<|fim_pad|>'
|
| 134 |
+
print_info: max token length = 256
|
| 135 |
+
load_tensors: loading model tensors, this can take a while... (mmap = true)
|
| 136 |
+
load_tensors: offloading 20 repeating layers to GPU
|
| 137 |
+
load_tensors: offloaded 20/33 layers to GPU
|
| 138 |
+
load_tensors: CPU_Mapped model buffer size = 248.40 MiB
|
| 139 |
+
load_tensors: CUDA0 model buffer size = 81.71 MiB
|
| 140 |
+
load_tensors: CUDA1 model buffer size = 84.11 MiB
|
| 141 |
+
..................................................................
|
| 142 |
+
llama_context: constructing llama_context
|
| 143 |
+
llama_context: n_seq_max = 1
|
| 144 |
+
llama_context: n_ctx = 2048
|
| 145 |
+
llama_context: n_ctx_seq = 2048
|
| 146 |
+
llama_context: n_batch = 2048
|
| 147 |
+
llama_context: n_ubatch = 512
|
| 148 |
+
llama_context: causal_attn = 1
|
| 149 |
+
llama_context: flash_attn = auto
|
| 150 |
+
llama_context: kv_unified = false
|
| 151 |
+
llama_context: freq_base = 10000.0
|
| 152 |
+
llama_context: freq_scale = 1
|
| 153 |
+
llama_context: n_ctx_seq (2048) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized
|
| 154 |
+
llama_context: CPU output buffer size = 0.38 MiB
|
| 155 |
+
llama_kv_cache: CPU KV buffer size = 2.00 MiB
|
| 156 |
+
llama_kv_cache: CUDA0 KV buffer size = 4.00 MiB
|
| 157 |
+
llama_kv_cache: CUDA1 KV buffer size = 2.00 MiB
|
| 158 |
+
llama_kv_cache: size = 8.00 MiB ( 2048 cells, 4 layers, 1/1 seqs), K (f16): 4.00 MiB, V (f16): 4.00 MiB
|
| 159 |
+
llama_memory_recurrent: CPU RS buffer size = 8.48 MiB
|
| 160 |
+
llama_memory_recurrent: CUDA0 RS buffer size = 6.16 MiB
|
| 161 |
+
llama_memory_recurrent: CUDA1 RS buffer size = 6.93 MiB
|
| 162 |
+
llama_memory_recurrent: size = 21.57 MiB ( 1 cells, 32 layers, 1 seqs), R (f32): 0.57 MiB, S (f32): 21.00 MiB
|
| 163 |
+
llama_context: Flash Attention was auto, set to enabled
|
| 164 |
+
llama_context: CUDA0 compute buffer size = 351.61 MiB
|
| 165 |
+
llama_context: CUDA1 compute buffer size = 22.39 MiB
|
| 166 |
+
llama_context: CUDA_Host compute buffer size = 18.34 MiB
|
| 167 |
+
llama_context: graph nodes = 1815
|
| 168 |
+
llama_context: graph splits = 182 (with bs=512), 41 (with bs=1)
|
| 169 |
+
common_init_from_params: added <|end_of_text|> logit bias = -inf
|
| 170 |
+
common_init_from_params: added <|fim_pad|> logit bias = -inf
|
| 171 |
+
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
|
| 172 |
+
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
|
| 173 |
+
|
| 174 |
+
system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
|
| 175 |
+
perplexity: tokenizing the input ..
|
| 176 |
+
perplexity: tokenization took 46.422 ms
|
| 177 |
+
perplexity: calculating perplexity over 14 chunks, n_ctx=2048, batch_size=2048, n_seq=1
|
| 178 |
+
perplexity: 0.57 seconds per pass - ETA 0.12 minutes
|
| 179 |
+
[1]18.5175,[2]21.5651,[3]22.2330,[4]20.1930,[5]20.1937,[6]17.9976,[7]17.6212,[8]17.5896,[9]18.1097,[10]18.0961,[11]17.9342,[12]18.0526,[13]18.1223,[14]18.1555,
|
| 180 |
+
Final estimate: PPL = 18.1555 +/- 0.46639
|
| 181 |
+
|
| 182 |
+
llama_perf_context_print: load time = 285.95 ms
|
| 183 |
+
llama_perf_context_print: prompt eval time = 5794.67 ms / 28672 tokens ( 0.20 ms per token, 4948.00 tokens per second)
|
| 184 |
+
llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
|
| 185 |
+
llama_perf_context_print: total time = 6216.80 ms / 28673 tokens
|
| 186 |
+
llama_perf_context_print: graphs reused = 0
|
| 187 |
+
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
|
| 188 |
+
llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24107 = 20494 + ( 443 = 81 + 10 + 351) + 3168 |
|
| 189 |
+
llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 23398 + ( 115 = 84 + 8 + 22) + 610 |
|
| 190 |
+
llama_memory_breakdown_print: | - Host | 277 = 248 + 10 + 18 |
|
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-embd_f16/perplexity_math.txt
ADDED
|
@@ -0,0 +1,190 @@
| 1 |
+
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
| 2 |
+
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
|
| 3 |
+
ggml_cuda_init: found 2 CUDA devices:
|
| 4 |
+
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 5 |
+
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
|
| 6 |
+
build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
|
| 7 |
+
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 21087 MiB free
|
| 8 |
+
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
|
| 9 |
+
llama_model_loader: loaded meta data with 48 key-value pairs and 402 tensors from /mnt/world8/AI/Models/granite-4.0-h-350m-unsloth/GGUF/MXFP4/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-embd_f16.gguf (version GGUF V3 (latest))
|
| 10 |
+
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
|
| 11 |
+
llama_model_loader: - kv 0: general.architecture str = granitehybrid
|
| 12 |
+
llama_model_loader: - kv 1: general.type str = model
|
| 13 |
+
llama_model_loader: - kv 2: general.name str = Granite 4.0 H 350m Unsloth
|
| 14 |
+
llama_model_loader: - kv 3: general.finetune str = unsloth
|
| 15 |
+
llama_model_loader: - kv 4: general.basename str = granite-4.0-h
|
| 16 |
+
llama_model_loader: - kv 5: general.size_label str = 350M
|
| 17 |
+
llama_model_loader: - kv 6: general.license str = apache-2.0
|
| 18 |
+
llama_model_loader: - kv 7: general.base_model.count u32 = 1
|
| 19 |
+
llama_model_loader: - kv 8: general.base_model.0.name str = Granite 4.0 H 350m
|
| 20 |
+
llama_model_loader: - kv 9: general.base_model.0.organization str = Ibm Granite
|
| 21 |
+
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/ibm-granite/gr...
|
| 22 |
+
llama_model_loader: - kv 11: general.tags arr[str,3] = ["language", "unsloth", "granite-4.0"]
|
| 23 |
+
llama_model_loader: - kv 12: granitehybrid.block_count u32 = 32
|
| 24 |
+
llama_model_loader: - kv 13: granitehybrid.context_length u32 = 1048576
|
| 25 |
+
llama_model_loader: - kv 14: granitehybrid.embedding_length u32 = 768
|
| 26 |
+
llama_model_loader: - kv 15: granitehybrid.feed_forward_length u32 = 2048
|
| 27 |
+
llama_model_loader: - kv 16: granitehybrid.attention.head_count u32 = 12
|
| 28 |
+
llama_model_loader: - kv 17: granitehybrid.attention.head_count_kv arr[i32,32] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
|
| 29 |
+
llama_model_loader: - kv 18: granitehybrid.rope.freq_base f32 = 10000.000000
|
| 30 |
+
llama_model_loader: - kv 19: granitehybrid.attention.layer_norm_rms_epsilon f32 = 0.000010
|
| 31 |
+
llama_model_loader: - kv 20: granitehybrid.expert_count u32 = 0
|
| 32 |
+
llama_model_loader: - kv 21: granitehybrid.expert_used_count u32 = 0
|
| 33 |
+
llama_model_loader: - kv 22: granitehybrid.vocab_size u32 = 100352
|
| 34 |
+
llama_model_loader: - kv 23: granitehybrid.rope.dimension_count u32 = 64
|
| 35 |
+
llama_model_loader: - kv 24: granitehybrid.attention.scale f32 = 0.015625
|
| 36 |
+
llama_model_loader: - kv 25: granitehybrid.embedding_scale f32 = 12.000000
|
| 37 |
+
llama_model_loader: - kv 26: granitehybrid.residual_scale f32 = 0.246000
|
| 38 |
+
llama_model_loader: - kv 27: granitehybrid.logit_scale f32 = 3.000000
|
| 39 |
+
llama_model_loader: - kv 28: granitehybrid.expert_shared_feed_forward_length u32 = 2048
|
| 40 |
+
llama_model_loader: - kv 29: granitehybrid.ssm.conv_kernel u32 = 4
|
| 41 |
+
llama_model_loader: - kv 30: granitehybrid.ssm.state_size u32 = 128
|
| 42 |
+
llama_model_loader: - kv 31: granitehybrid.ssm.group_count u32 = 1
|
| 43 |
+
llama_model_loader: - kv 32: granitehybrid.ssm.inner_size u32 = 1536
|
| 44 |
+
llama_model_loader: - kv 33: granitehybrid.ssm.time_step_rank u32 = 48
|
| 45 |
+
llama_model_loader: - kv 34: granitehybrid.rope.scaling.finetuned bool = false
|
| 46 |
+
llama_model_loader: - kv 35: tokenizer.ggml.model str = gpt2
|
| 47 |
+
llama_model_loader: - kv 36: tokenizer.ggml.pre str = dbrx
|
| 48 |
+
llama_model_loader: - kv 37: tokenizer.ggml.tokens arr[str,100352] = ["!", "\"", "#", "$", "%", "&", "'", ...
|
| 49 |
+
llama_model_loader: - kv 38: tokenizer.ggml.token_type arr[i32,100352] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
|
| 50 |
+
llama_model_loader: - kv 39: tokenizer.ggml.merges arr[str,100000] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
|
| 51 |
+
llama_model_loader: - kv 40: tokenizer.ggml.bos_token_id u32 = 100257
|
| 52 |
+
llama_model_loader: - kv 41: tokenizer.ggml.eos_token_id u32 = 100257
|
| 53 |
+
llama_model_loader: - kv 42: tokenizer.ggml.unknown_token_id u32 = 100269
|
| 54 |
+
llama_model_loader: - kv 43: tokenizer.ggml.padding_token_id u32 = 100256
|
| 55 |
+
llama_model_loader: - kv 44: tokenizer.ggml.add_bos_token bool = false
|
| 56 |
+
llama_model_loader: - kv 45: tokenizer.chat_template str = {%- set tools_system_message_prefix =...
|
| 57 |
+
llama_model_loader: - kv 46: general.quantization_version u32 = 2
|
| 58 |
+
llama_model_loader: - kv 47: general.file_type u32 = 38
|
| 59 |
+
llama_model_loader: - type f32: 233 tensors
|
| 60 |
+
llama_model_loader: - type f16: 1 tensors
|
| 61 |
+
llama_model_loader: - type q8_0: 164 tensors
|
| 62 |
+
llama_model_loader: - type q6_K: 4 tensors
|
| 63 |
+
print_info: file format = GGUF V3 (latest)
|
| 64 |
+
print_info: file type = MXFP4 MoE
|
| 65 |
+
print_info: file size = 414.19 MiB (10.21 BPW)
|
| 66 |
+
load: printing all EOG tokens:
|
| 67 |
+
load: - 100257 ('<|end_of_text|>')
|
| 68 |
+
load: - 100261 ('<|fim_pad|>')
|
| 69 |
+
load: special tokens cache size = 96
|
| 70 |
+
load: token to piece cache size = 0.6152 MB
|
| 71 |
+
print_info: arch = granitehybrid
|
| 72 |
+
print_info: vocab_only = 0
|
| 73 |
+
print_info: n_ctx_train = 1048576
|
| 74 |
+
print_info: n_embd = 768
|
| 75 |
+
print_info: n_embd_inp = 768
|
| 76 |
+
print_info: n_layer = 32
|
| 77 |
+
print_info: n_head = 12
|
| 78 |
+
print_info: n_head_kv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]
|
| 79 |
+
print_info: n_rot = 64
|
| 80 |
+
print_info: n_swa = 0
|
| 81 |
+
print_info: is_swa_any = 0
print_info: n_embd_head_k = 64
print_info: n_embd_head_v = 64
print_info: n_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]
print_info: n_embd_k_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
print_info: n_embd_v_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
print_info: f_norm_eps = 0.0e+00
print_info: f_norm_rms_eps = 1.0e-05
print_info: f_clamp_kqv = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale = 3.0e+00
print_info: f_attn_scale = 1.6e-02
print_info: n_ff = 2048
print_info: n_expert = 0
print_info: n_expert_used = 0
print_info: n_expert_groups = 0
print_info: n_group_used = 0
print_info: causal attn = 1
print_info: pooling type = 0
print_info: rope type = 0
print_info: rope scaling = linear
print_info: freq_base_train = 10000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn = 1048576
print_info: rope_finetuned = unknown
print_info: ssm_d_conv = 4
print_info: ssm_d_inner = 1536
print_info: ssm_d_state = 128
print_info: ssm_dt_rank = 48
print_info: ssm_n_group = 1
print_info: ssm_dt_b_c_rms = 0
print_info: model type = 350M
print_info: model params = 340.33 M
print_info: general.name = Granite 4.0 H 350m Unsloth
print_info: f_embedding_scale = 12.000000
print_info: f_residual_scale = 0.246000
print_info: f_attention_scale = 0.015625
print_info: n_ff_shexp = 2048
print_info: vocab type = BPE
print_info: n_vocab = 100352
print_info: n_merges = 100000
print_info: BOS token = 100257 '<|end_of_text|>'
print_info: EOS token = 100257 '<|end_of_text|>'
print_info: EOT token = 100257 '<|end_of_text|>'
print_info: UNK token = 100269 '<|unk|>'
print_info: PAD token = 100256 '<|pad|>'
print_info: LF token = 198 'Ċ'
print_info: FIM PRE token = 100258 '<|fim_prefix|>'
print_info: FIM SUF token = 100260 '<|fim_suffix|>'
print_info: FIM MID token = 100259 '<|fim_middle|>'
print_info: FIM PAD token = 100261 '<|fim_pad|>'
print_info: EOG token = 100257 '<|end_of_text|>'
print_info: EOG token = 100261 '<|fim_pad|>'
print_info: max token length = 256
load_tensors: loading model tensors, this can take a while... (mmap = true)
load_tensors: offloading 20 repeating layers to GPU
load_tensors: offloaded 20/33 layers to GPU
load_tensors: CPU_Mapped model buffer size = 248.40 MiB
load_tensors: CUDA0 model buffer size = 81.71 MiB
load_tensors: CUDA1 model buffer size = 84.11 MiB
..................................................................
llama_context: constructing llama_context
llama_context: n_seq_max = 1
llama_context: n_ctx = 2048
llama_context: n_ctx_seq = 2048
llama_context: n_batch = 2048
llama_context: n_ubatch = 512
llama_context: causal_attn = 1
llama_context: flash_attn = auto
llama_context: kv_unified = false
llama_context: freq_base = 10000.0
llama_context: freq_scale = 1
llama_context: n_ctx_seq (2048) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized
llama_context: CPU output buffer size = 0.38 MiB
llama_kv_cache: CPU KV buffer size = 2.00 MiB
llama_kv_cache: CUDA0 KV buffer size = 4.00 MiB
llama_kv_cache: CUDA1 KV buffer size = 2.00 MiB
llama_kv_cache: size = 8.00 MiB ( 2048 cells, 4 layers, 1/1 seqs), K (f16): 4.00 MiB, V (f16): 4.00 MiB
llama_memory_recurrent: CPU RS buffer size = 8.48 MiB
llama_memory_recurrent: CUDA0 RS buffer size = 6.16 MiB
llama_memory_recurrent: CUDA1 RS buffer size = 6.93 MiB
llama_memory_recurrent: size = 21.57 MiB ( 1 cells, 32 layers, 1 seqs), R (f32): 0.57 MiB, S (f32): 21.00 MiB
llama_context: Flash Attention was auto, set to enabled
llama_context: CUDA0 compute buffer size = 351.61 MiB
llama_context: CUDA1 compute buffer size = 22.39 MiB
llama_context: CUDA_Host compute buffer size = 18.34 MiB
llama_context: graph nodes = 1815
llama_context: graph splits = 182 (with bs=512), 41 (with bs=1)
common_init_from_params: added <|end_of_text|> logit bias = -inf
common_init_from_params: added <|fim_pad|> logit bias = -inf
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)

system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
perplexity: tokenizing the input ..
perplexity: tokenization took 39.186 ms
perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
perplexity: 0.65 seconds per pass - ETA 0.15 minutes
[1]8.6883,[2]9.9276,[3]9.4995,[4]9.8344,[5]9.9786,[6]10.0566,[7]10.2142,[8]9.9124,[9]9.9724,[10]9.9839,[11]10.2179,[12]10.3011,[13]10.4233,[14]10.3949,[15]10.2956,
Final estimate: PPL = 10.2956 +/- 0.23171

llama_perf_context_print: load time = 222.66 ms
llama_perf_context_print: prompt eval time = 5721.99 ms / 30720 tokens ( 0.19 ms per token, 5368.76 tokens per second)
llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
llama_perf_context_print: total time = 6059.09 ms / 30721 tokens
llama_perf_context_print: graphs reused = 0
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24107 = 20499 + ( 443 = 81 + 10 + 351) + 3163 |
llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 23398 + ( 115 = 84 + 8 + 22) + 610 |
llama_memory_breakdown_print: | - Host | 277 = 248 + 10 + 18 |
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-embd_f16/ppl_corpus_code.txt
ADDED
The diff for this file is too large to render. See raw diff

Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-embd_f16/ppl_corpus_general.txt
ADDED
The diff for this file is too large to render. See raw diff

Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-embd_f16/ppl_corpus_math.txt
ADDED
The diff for this file is too large to render. See raw diff
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16/llamabench.txt
ADDED
@@ -0,0 +1,11 @@
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 2 CUDA devices:
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
| granitehybrid 350M MXFP4 MoE | 459.19 MiB | 340.33 M | CUDA | 35 | pp8 | 1652.10 ± 29.30 |
| granitehybrid 350M MXFP4 MoE | 459.19 MiB | 340.33 M | CUDA | 35 | tg128 | 292.38 ± 9.54 |

build: 92bb442ad (7040)
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16/perplexity_code.txt
ADDED
@@ -0,0 +1,190 @@
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 2 CUDA devices:
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 21085 MiB free
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
llama_model_loader: loaded meta data with 48 key-value pairs and 402 tensors from /mnt/world8/AI/Models/granite-4.0-h-350m-unsloth/GGUF/MXFP4/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = granitehybrid
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = Granite 4.0 H 350m Unsloth
llama_model_loader: - kv 3: general.finetune str = unsloth
llama_model_loader: - kv 4: general.basename str = granite-4.0-h
llama_model_loader: - kv 5: general.size_label str = 350M
llama_model_loader: - kv 6: general.license str = apache-2.0
llama_model_loader: - kv 7: general.base_model.count u32 = 1
llama_model_loader: - kv 8: general.base_model.0.name str = Granite 4.0 H 350m
llama_model_loader: - kv 9: general.base_model.0.organization str = Ibm Granite
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/ibm-granite/gr...
llama_model_loader: - kv 11: general.tags arr[str,3] = ["language", "unsloth", "granite-4.0"]
llama_model_loader: - kv 12: granitehybrid.block_count u32 = 32
llama_model_loader: - kv 13: granitehybrid.context_length u32 = 1048576
llama_model_loader: - kv 14: granitehybrid.embedding_length u32 = 768
llama_model_loader: - kv 15: granitehybrid.feed_forward_length u32 = 2048
llama_model_loader: - kv 16: granitehybrid.attention.head_count u32 = 12
llama_model_loader: - kv 17: granitehybrid.attention.head_count_kv arr[i32,32] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
llama_model_loader: - kv 18: granitehybrid.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 19: granitehybrid.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 20: granitehybrid.expert_count u32 = 0
llama_model_loader: - kv 21: granitehybrid.expert_used_count u32 = 0
llama_model_loader: - kv 22: granitehybrid.vocab_size u32 = 100352
llama_model_loader: - kv 23: granitehybrid.rope.dimension_count u32 = 64
llama_model_loader: - kv 24: granitehybrid.attention.scale f32 = 0.015625
llama_model_loader: - kv 25: granitehybrid.embedding_scale f32 = 12.000000
llama_model_loader: - kv 26: granitehybrid.residual_scale f32 = 0.246000
llama_model_loader: - kv 27: granitehybrid.logit_scale f32 = 3.000000
llama_model_loader: - kv 28: granitehybrid.expert_shared_feed_forward_length u32 = 2048
llama_model_loader: - kv 29: granitehybrid.ssm.conv_kernel u32 = 4
llama_model_loader: - kv 30: granitehybrid.ssm.state_size u32 = 128
llama_model_loader: - kv 31: granitehybrid.ssm.group_count u32 = 1
llama_model_loader: - kv 32: granitehybrid.ssm.inner_size u32 = 1536
llama_model_loader: - kv 33: granitehybrid.ssm.time_step_rank u32 = 48
llama_model_loader: - kv 34: granitehybrid.rope.scaling.finetuned bool = false
llama_model_loader: - kv 35: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 36: tokenizer.ggml.pre str = dbrx
llama_model_loader: - kv 37: tokenizer.ggml.tokens arr[str,100352] = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv 38: tokenizer.ggml.token_type arr[i32,100352] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 39: tokenizer.ggml.merges arr[str,100000] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv 40: tokenizer.ggml.bos_token_id u32 = 100257
llama_model_loader: - kv 41: tokenizer.ggml.eos_token_id u32 = 100257
llama_model_loader: - kv 42: tokenizer.ggml.unknown_token_id u32 = 100269
llama_model_loader: - kv 43: tokenizer.ggml.padding_token_id u32 = 100256
llama_model_loader: - kv 44: tokenizer.ggml.add_bos_token bool = false
llama_model_loader: - kv 45: tokenizer.chat_template str = {%- set tools_system_message_prefix =...
llama_model_loader: - kv 46: general.quantization_version u32 = 2
llama_model_loader: - kv 47: general.file_type u32 = 38
llama_model_loader: - type f32: 233 tensors
llama_model_loader: - type f16: 33 tensors
llama_model_loader: - type q8_0: 132 tensors
llama_model_loader: - type q6_K: 4 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type = MXFP4 MoE
print_info: file size = 459.19 MiB (11.32 BPW)
load: printing all EOG tokens:
load: - 100257 ('<|end_of_text|>')
load: - 100261 ('<|fim_pad|>')
load: special tokens cache size = 96
load: token to piece cache size = 0.6152 MB
print_info: arch = granitehybrid
print_info: vocab_only = 0
print_info: n_ctx_train = 1048576
print_info: n_embd = 768
print_info: n_embd_inp = 768
print_info: n_layer = 32
print_info: n_head = 12
print_info: n_head_kv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]
print_info: n_rot = 64
print_info: n_swa = 0
print_info: is_swa_any = 0
print_info: n_embd_head_k = 64
print_info: n_embd_head_v = 64
print_info: n_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]
print_info: n_embd_k_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
print_info: n_embd_v_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
print_info: f_norm_eps = 0.0e+00
print_info: f_norm_rms_eps = 1.0e-05
print_info: f_clamp_kqv = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale = 3.0e+00
print_info: f_attn_scale = 1.6e-02
print_info: n_ff = 2048
print_info: n_expert = 0
print_info: n_expert_used = 0
print_info: n_expert_groups = 0
print_info: n_group_used = 0
print_info: causal attn = 1
print_info: pooling type = 0
print_info: rope type = 0
print_info: rope scaling = linear
print_info: freq_base_train = 10000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn = 1048576
print_info: rope_finetuned = unknown
print_info: ssm_d_conv = 4
print_info: ssm_d_inner = 1536
print_info: ssm_d_state = 128
print_info: ssm_dt_rank = 48
print_info: ssm_n_group = 1
print_info: ssm_dt_b_c_rms = 0
print_info: model type = 350M
print_info: model params = 340.33 M
print_info: general.name = Granite 4.0 H 350m Unsloth
print_info: f_embedding_scale = 12.000000
print_info: f_residual_scale = 0.246000
print_info: f_attention_scale = 0.015625
print_info: n_ff_shexp = 2048
print_info: vocab type = BPE
print_info: n_vocab = 100352
print_info: n_merges = 100000
print_info: BOS token = 100257 '<|end_of_text|>'
print_info: EOS token = 100257 '<|end_of_text|>'
print_info: EOT token = 100257 '<|end_of_text|>'
print_info: UNK token = 100269 '<|unk|>'
print_info: PAD token = 100256 '<|pad|>'
print_info: LF token = 198 'Ċ'
print_info: FIM PRE token = 100258 '<|fim_prefix|>'
print_info: FIM SUF token = 100260 '<|fim_suffix|>'
print_info: FIM MID token = 100259 '<|fim_middle|>'
print_info: FIM PAD token = 100261 '<|fim_pad|>'
print_info: EOG token = 100257 '<|end_of_text|>'
print_info: EOG token = 100261 '<|fim_pad|>'
print_info: max token length = 256
load_tensors: loading model tensors, this can take a while... (mmap = true)
load_tensors: offloading 20 repeating layers to GPU
load_tensors: offloaded 20/33 layers to GPU
load_tensors: CPU_Mapped model buffer size = 265.27 MiB
load_tensors: CUDA0 model buffer size = 95.76 MiB
load_tensors: CUDA1 model buffer size = 98.17 MiB
.....................................................................
llama_context: constructing llama_context
llama_context: n_seq_max = 1
llama_context: n_ctx = 2048
llama_context: n_ctx_seq = 2048
llama_context: n_batch = 2048
llama_context: n_ubatch = 512
llama_context: causal_attn = 1
llama_context: flash_attn = auto
llama_context: kv_unified = false
llama_context: freq_base = 10000.0
llama_context: freq_scale = 1
llama_context: n_ctx_seq (2048) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized
llama_context: CPU output buffer size = 0.38 MiB
llama_kv_cache: CPU KV buffer size = 2.00 MiB
llama_kv_cache: CUDA0 KV buffer size = 4.00 MiB
llama_kv_cache: CUDA1 KV buffer size = 2.00 MiB
llama_kv_cache: size = 8.00 MiB ( 2048 cells, 4 layers, 1/1 seqs), K (f16): 4.00 MiB, V (f16): 4.00 MiB
llama_memory_recurrent: CPU RS buffer size = 8.48 MiB
llama_memory_recurrent: CUDA0 RS buffer size = 6.16 MiB
llama_memory_recurrent: CUDA1 RS buffer size = 6.93 MiB
llama_memory_recurrent: size = 21.57 MiB ( 1 cells, 32 layers, 1 seqs), R (f32): 0.57 MiB, S (f32): 21.00 MiB
llama_context: Flash Attention was auto, set to enabled
llama_context: CUDA0 compute buffer size = 354.10 MiB
llama_context: CUDA1 compute buffer size = 22.39 MiB
llama_context: CUDA_Host compute buffer size = 18.34 MiB
llama_context: graph nodes = 1815
llama_context: graph splits = 182 (with bs=512), 41 (with bs=1)
common_init_from_params: added <|end_of_text|> logit bias = -inf
common_init_from_params: added <|fim_pad|> logit bias = -inf
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)

system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
perplexity: tokenizing the input ..
perplexity: tokenization took 92.34 ms
perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
perplexity: 0.54 seconds per pass - ETA 0.38 minutes
[1]4.3900,[2]3.9915,[3]2.5750,[4]2.3699,[5]2.6085,[6]2.8527,[7]2.7033,[8]2.5113,[9]2.3080,[10]2.1391,[11]2.1211,[12]2.1472,[13]2.0589,[14]2.0386,[15]2.0789,[16]2.0132,[17]1.9882,[18]2.0064,[19]1.9668,[20]1.9314,[21]1.8986,[22]1.8838,[23]1.9127,[24]1.8862,[25]1.9052,[26]1.8732,[27]1.8601,[28]1.8518,[29]1.8971,[30]1.9137,[31]1.9124,[32]1.8882,[33]1.9115,[34]1.9036,[35]1.8848,[36]1.9161,[37]1.9224,[38]1.9204,[39]1.9420,[40]1.9395,[41]1.9324,[42]1.9565,[43]1.9650,[44]1.9543,
Final estimate: PPL = 1.9543 +/- 0.01750

llama_perf_context_print: load time = 250.31 ms
llama_perf_context_print: prompt eval time = 15511.51 ms / 90112 tokens ( 0.17 ms per token, 5809.36 tokens per second)
llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
llama_perf_context_print: total time = 16356.83 ms / 90113 tokens
llama_perf_context_print: graphs reused = 0
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24107 = 20479 + ( 460 = 95 + 10 + 354) + 3167 |
llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 23372 + ( 129 = 98 + 8 + 22) + 622 |
llama_memory_breakdown_print: | - Host | 294 = 265 + 10 + 18 |
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16/perplexity_general.txt
ADDED
@@ -0,0 +1,190 @@
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 2 CUDA devices:
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 21079 MiB free
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
llama_model_loader: loaded meta data with 48 key-value pairs and 402 tensors from /mnt/world8/AI/Models/granite-4.0-h-350m-unsloth/GGUF/MXFP4/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = granitehybrid
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = Granite 4.0 H 350m Unsloth
llama_model_loader: - kv 3: general.finetune str = unsloth
llama_model_loader: - kv 4: general.basename str = granite-4.0-h
llama_model_loader: - kv 5: general.size_label str = 350M
llama_model_loader: - kv 6: general.license str = apache-2.0
llama_model_loader: - kv 7: general.base_model.count u32 = 1
llama_model_loader: - kv 8: general.base_model.0.name str = Granite 4.0 H 350m
llama_model_loader: - kv 9: general.base_model.0.organization str = Ibm Granite
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/ibm-granite/gr...
llama_model_loader: - kv 11: general.tags arr[str,3] = ["language", "unsloth", "granite-4.0"]
llama_model_loader: - kv 12: granitehybrid.block_count u32 = 32
llama_model_loader: - kv 13: granitehybrid.context_length u32 = 1048576
llama_model_loader: - kv 14: granitehybrid.embedding_length u32 = 768
llama_model_loader: - kv 15: granitehybrid.feed_forward_length u32 = 2048
llama_model_loader: - kv 16: granitehybrid.attention.head_count u32 = 12
llama_model_loader: - kv 17: granitehybrid.attention.head_count_kv arr[i32,32] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
llama_model_loader: - kv 18: granitehybrid.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 19: granitehybrid.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 20: granitehybrid.expert_count u32 = 0
llama_model_loader: - kv 21: granitehybrid.expert_used_count u32 = 0
llama_model_loader: - kv 22: granitehybrid.vocab_size u32 = 100352
llama_model_loader: - kv 23: granitehybrid.rope.dimension_count u32 = 64
llama_model_loader: - kv 24: granitehybrid.attention.scale f32 = 0.015625
llama_model_loader: - kv 25: granitehybrid.embedding_scale f32 = 12.000000
llama_model_loader: - kv 26: granitehybrid.residual_scale f32 = 0.246000
llama_model_loader: - kv 27: granitehybrid.logit_scale f32 = 3.000000
llama_model_loader: - kv 28: granitehybrid.expert_shared_feed_forward_length u32 = 2048
llama_model_loader: - kv 29: granitehybrid.ssm.conv_kernel u32 = 4
llama_model_loader: - kv 30: granitehybrid.ssm.state_size u32 = 128
llama_model_loader: - kv 31: granitehybrid.ssm.group_count u32 = 1
llama_model_loader: - kv 32: granitehybrid.ssm.inner_size u32 = 1536
llama_model_loader: - kv 33: granitehybrid.ssm.time_step_rank u32 = 48
llama_model_loader: - kv 34: granitehybrid.rope.scaling.finetuned bool = false
llama_model_loader: - kv 35: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 36: tokenizer.ggml.pre str = dbrx
llama_model_loader: - kv 37: tokenizer.ggml.tokens arr[str,100352] = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv 38: tokenizer.ggml.token_type arr[i32,100352] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 39: tokenizer.ggml.merges arr[str,100000] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv 40: tokenizer.ggml.bos_token_id u32 = 100257
llama_model_loader: - kv 41: tokenizer.ggml.eos_token_id u32 = 100257
llama_model_loader: - kv 42: tokenizer.ggml.unknown_token_id u32 = 100269
llama_model_loader: - kv 43: tokenizer.ggml.padding_token_id u32 = 100256
llama_model_loader: - kv 44: tokenizer.ggml.add_bos_token bool = false
llama_model_loader: - kv 45: tokenizer.chat_template str = {%- set tools_system_message_prefix =...
llama_model_loader: - kv 46: general.quantization_version u32 = 2
llama_model_loader: - kv 47: general.file_type u32 = 38
llama_model_loader: - type f32: 233 tensors
llama_model_loader: - type f16: 33 tensors
llama_model_loader: - type q8_0: 132 tensors
llama_model_loader: - type q6_K: 4 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type = MXFP4 MoE
print_info: file size = 459.19 MiB (11.32 BPW)
load: printing all EOG tokens:
load: - 100257 ('<|end_of_text|>')
load: - 100261 ('<|fim_pad|>')
load: special tokens cache size = 96
load: token to piece cache size = 0.6152 MB
print_info: arch = granitehybrid
print_info: vocab_only = 0
print_info: n_ctx_train = 1048576
print_info: n_embd = 768
print_info: n_embd_inp = 768
print_info: n_layer = 32
print_info: n_head = 12
print_info: n_head_kv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]
print_info: n_rot = 64
print_info: n_swa = 0
print_info: is_swa_any = 0
print_info: n_embd_head_k = 64
print_info: n_embd_head_v = 64
print_info: n_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]
print_info: n_embd_k_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
print_info: n_embd_v_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
print_info: f_norm_eps = 0.0e+00
print_info: f_norm_rms_eps = 1.0e-05
print_info: f_clamp_kqv = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale = 3.0e+00
print_info: f_attn_scale = 1.6e-02
print_info: n_ff = 2048
print_info: n_expert = 0
print_info: n_expert_used = 0
print_info: n_expert_groups = 0
print_info: n_group_used = 0
print_info: causal attn = 1
print_info: pooling type = 0
print_info: rope type = 0
print_info: rope scaling = linear
print_info: freq_base_train = 10000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn = 1048576
print_info: rope_finetuned = unknown
print_info: ssm_d_conv = 4
print_info: ssm_d_inner = 1536
print_info: ssm_d_state = 128
print_info: ssm_dt_rank = 48
print_info: ssm_n_group = 1
print_info: ssm_dt_b_c_rms = 0
print_info: model type = 350M
print_info: model params = 340.33 M
print_info: general.name = Granite 4.0 H 350m Unsloth
print_info: f_embedding_scale = 12.000000
print_info: f_residual_scale = 0.246000
print_info: f_attention_scale = 0.015625
print_info: n_ff_shexp = 2048
print_info: vocab type = BPE
print_info: n_vocab = 100352
print_info: n_merges = 100000
print_info: BOS token = 100257 '<|end_of_text|>'
print_info: EOS token = 100257 '<|end_of_text|>'
print_info: EOT token = 100257 '<|end_of_text|>'
print_info: UNK token = 100269 '<|unk|>'
print_info: PAD token = 100256 '<|pad|>'
print_info: LF token = 198 'Ċ'
print_info: FIM PRE token = 100258 '<|fim_prefix|>'
print_info: FIM SUF token = 100260 '<|fim_suffix|>'
print_info: FIM MID token = 100259 '<|fim_middle|>'
print_info: FIM PAD token = 100261 '<|fim_pad|>'
print_info: EOG token = 100257 '<|end_of_text|>'
print_info: EOG token = 100261 '<|fim_pad|>'
print_info: max token length = 256
load_tensors: loading model tensors, this can take a while... (mmap = true)
load_tensors: offloading 20 repeating layers to GPU
load_tensors: offloaded 20/33 layers to GPU
load_tensors: CPU_Mapped model buffer size = 265.27 MiB
load_tensors: CUDA0 model buffer size = 95.76 MiB
load_tensors: CUDA1 model buffer size = 98.17 MiB
.....................................................................
llama_context: constructing llama_context
llama_context: n_seq_max = 1
llama_context: n_ctx = 2048
llama_context: n_ctx_seq = 2048
llama_context: n_batch = 2048
llama_context: n_ubatch = 512
llama_context: causal_attn = 1
llama_context: flash_attn = auto
llama_context: kv_unified = false
llama_context: freq_base = 10000.0
llama_context: freq_scale = 1
llama_context: n_ctx_seq (2048) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized
llama_context: CPU output buffer size = 0.38 MiB
llama_kv_cache: CPU KV buffer size = 2.00 MiB
llama_kv_cache: CUDA0 KV buffer size = 4.00 MiB
llama_kv_cache: CUDA1 KV buffer size = 2.00 MiB
llama_kv_cache: size = 8.00 MiB ( 2048 cells, 4 layers, 1/1 seqs), K (f16): 4.00 MiB, V (f16): 4.00 MiB
llama_memory_recurrent: CPU RS buffer size = 8.48 MiB
llama_memory_recurrent: CUDA0 RS buffer size = 6.16 MiB
llama_memory_recurrent: CUDA1 RS buffer size = 6.93 MiB
llama_memory_recurrent: size = 21.57 MiB ( 1 cells, 32 layers, 1 seqs), R (f32): 0.57 MiB, S (f32): 21.00 MiB
llama_context: Flash Attention was auto, set to enabled
llama_context: CUDA0 compute buffer size = 354.10 MiB
llama_context: CUDA1 compute buffer size = 22.39 MiB
llama_context: CUDA_Host compute buffer size = 18.34 MiB
llama_context: graph nodes = 1815
llama_context: graph splits = 182 (with bs=512), 41 (with bs=1)
common_init_from_params: added <|end_of_text|> logit bias = -inf
common_init_from_params: added <|fim_pad|> logit bias = -inf
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)

system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
perplexity: tokenizing the input ..
perplexity: tokenization took 37.624 ms
perplexity: calculating perplexity over 14 chunks, n_ctx=2048, batch_size=2048, n_seq=1
perplexity: 0.53 seconds per pass - ETA 0.12 minutes
[1]18.5143,[2]21.5620,[3]22.2454,[4]20.1986,[5]20.1917,[6]18.0056,[7]17.6202,[8]17.5785,[9]18.1005,[10]18.0823,[11]17.9222,[12]18.0450,[13]18.1147,[14]18.1547,
Final estimate: PPL = 18.1547 +/- 0.46668

llama_perf_context_print: load time = 226.12 ms
llama_perf_context_print: prompt eval time = 5160.53 ms / 28672 tokens ( 0.18 ms per token, 5556.02 tokens per second)
llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
llama_perf_context_print: total time = 5430.22 ms / 28673 tokens
llama_perf_context_print: graphs reused = 0
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24107 = 20473 + ( 460 = 95 + 10 + 354) + 3173 |
llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 23372 + ( 129 = 98 + 8 + 22) + 622 |
llama_memory_breakdown_print: | - Host | 294 = 265 + 10 + 18 |
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16/perplexity_math.txt
ADDED
@@ -0,0 +1,190 @@
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 2 CUDA devices:
Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 21079 MiB free
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
llama_model_loader: loaded meta data with 48 key-value pairs and 402 tensors from /mnt/world8/AI/Models/granite-4.0-h-350m-unsloth/GGUF/MXFP4/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = granitehybrid
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = Granite 4.0 H 350m Unsloth
llama_model_loader: - kv 3: general.finetune str = unsloth
llama_model_loader: - kv 4: general.basename str = granite-4.0-h
llama_model_loader: - kv 5: general.size_label str = 350M
llama_model_loader: - kv 6: general.license str = apache-2.0
llama_model_loader: - kv 7: general.base_model.count u32 = 1
llama_model_loader: - kv 8: general.base_model.0.name str = Granite 4.0 H 350m
llama_model_loader: - kv 9: general.base_model.0.organization str = Ibm Granite
llama_model_loader: - kv 10: general.base_model.0.repo_url str = https://huggingface.co/ibm-granite/gr...
llama_model_loader: - kv 11: general.tags arr[str,3] = ["language", "unsloth", "granite-4.0"]
llama_model_loader: - kv 12: granitehybrid.block_count u32 = 32
llama_model_loader: - kv 13: granitehybrid.context_length u32 = 1048576
llama_model_loader: - kv 14: granitehybrid.embedding_length u32 = 768
llama_model_loader: - kv 15: granitehybrid.feed_forward_length u32 = 2048
llama_model_loader: - kv 16: granitehybrid.attention.head_count u32 = 12
llama_model_loader: - kv 17: granitehybrid.attention.head_count_kv arr[i32,32] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
llama_model_loader: - kv 18: granitehybrid.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 19: granitehybrid.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 20: granitehybrid.expert_count u32 = 0
llama_model_loader: - kv 21: granitehybrid.expert_used_count u32 = 0
llama_model_loader: - kv 22: granitehybrid.vocab_size u32 = 100352
llama_model_loader: - kv 23: granitehybrid.rope.dimension_count u32 = 64
llama_model_loader: - kv 24: granitehybrid.attention.scale f32 = 0.015625
llama_model_loader: - kv 25: granitehybrid.embedding_scale f32 = 12.000000
llama_model_loader: - kv 26: granitehybrid.residual_scale f32 = 0.246000
llama_model_loader: - kv 27: granitehybrid.logit_scale f32 = 3.000000
llama_model_loader: - kv 28: granitehybrid.expert_shared_feed_forward_length u32 = 2048
llama_model_loader: - kv 29: granitehybrid.ssm.conv_kernel u32 = 4
llama_model_loader: - kv 30: granitehybrid.ssm.state_size u32 = 128
llama_model_loader: - kv 31: granitehybrid.ssm.group_count u32 = 1
llama_model_loader: - kv 32: granitehybrid.ssm.inner_size u32 = 1536
llama_model_loader: - kv 33: granitehybrid.ssm.time_step_rank u32 = 48
llama_model_loader: - kv 34: granitehybrid.rope.scaling.finetuned bool = false
llama_model_loader: - kv 35: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 36: tokenizer.ggml.pre str = dbrx
llama_model_loader: - kv 37: tokenizer.ggml.tokens arr[str,100352] = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv 38: tokenizer.ggml.token_type arr[i32,100352] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 39: tokenizer.ggml.merges arr[str,100000] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv 40: tokenizer.ggml.bos_token_id u32 = 100257
llama_model_loader: - kv 41: tokenizer.ggml.eos_token_id u32 = 100257
llama_model_loader: - kv 42: tokenizer.ggml.unknown_token_id u32 = 100269
llama_model_loader: - kv 43: tokenizer.ggml.padding_token_id u32 = 100256
llama_model_loader: - kv 44: tokenizer.ggml.add_bos_token bool = false
llama_model_loader: - kv 45: tokenizer.chat_template str = {%- set tools_system_message_prefix =...
llama_model_loader: - kv 46: general.quantization_version u32 = 2
llama_model_loader: - kv 47: general.file_type u32 = 38
llama_model_loader: - type f32: 233 tensors
llama_model_loader: - type f16: 33 tensors
llama_model_loader: - type q8_0: 132 tensors
llama_model_loader: - type q6_K: 4 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type = MXFP4 MoE
print_info: file size = 459.19 MiB (11.32 BPW)
load: printing all EOG tokens:
load: - 100257 ('<|end_of_text|>')
load: - 100261 ('<|fim_pad|>')
load: special tokens cache size = 96
load: token to piece cache size = 0.6152 MB
print_info: arch = granitehybrid
print_info: vocab_only = 0
print_info: n_ctx_train = 1048576
print_info: n_embd = 768
print_info: n_embd_inp = 768
print_info: n_layer = 32
print_info: n_head = 12
print_info: n_head_kv = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]
print_info: n_rot = 64
print_info: n_swa = 0
print_info: is_swa_any = 0
print_info: n_embd_head_k = 64
print_info: n_embd_head_v = 64
print_info: n_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]
print_info: n_embd_k_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
print_info: n_embd_v_gqa = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 256, 0, 0, 0, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256, 0, 0, 0, 0]
print_info: f_norm_eps = 0.0e+00
print_info: f_norm_rms_eps = 1.0e-05
print_info: f_clamp_kqv = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale = 3.0e+00
print_info: f_attn_scale = 1.6e-02
print_info: n_ff = 2048
print_info: n_expert = 0
print_info: n_expert_used = 0
print_info: n_expert_groups = 0
print_info: n_group_used = 0
print_info: causal attn = 1
print_info: pooling type = 0
print_info: rope type = 0
print_info: rope scaling = linear
print_info: freq_base_train = 10000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn = 1048576
print_info: rope_finetuned = unknown
print_info: ssm_d_conv = 4
print_info: ssm_d_inner = 1536
print_info: ssm_d_state = 128
print_info: ssm_dt_rank = 48
print_info: ssm_n_group = 1
print_info: ssm_dt_b_c_rms = 0
print_info: model type = 350M
print_info: model params = 340.33 M
print_info: general.name = Granite 4.0 H 350m Unsloth
print_info: f_embedding_scale = 12.000000
print_info: f_residual_scale = 0.246000
print_info: f_attention_scale = 0.015625
print_info: n_ff_shexp = 2048
print_info: vocab type = BPE
print_info: n_vocab = 100352
print_info: n_merges = 100000
print_info: BOS token = 100257 '<|end_of_text|>'
print_info: EOS token = 100257 '<|end_of_text|>'
print_info: EOT token = 100257 '<|end_of_text|>'
print_info: UNK token = 100269 '<|unk|>'
print_info: PAD token = 100256 '<|pad|>'
print_info: LF token = 198 'Ċ'
print_info: FIM PRE token = 100258 '<|fim_prefix|>'
print_info: FIM SUF token = 100260 '<|fim_suffix|>'
print_info: FIM MID token = 100259 '<|fim_middle|>'
print_info: FIM PAD token = 100261 '<|fim_pad|>'
print_info: EOG token = 100257 '<|end_of_text|>'
print_info: EOG token = 100261 '<|fim_pad|>'
print_info: max token length = 256
load_tensors: loading model tensors, this can take a while... (mmap = true)
load_tensors: offloading 20 repeating layers to GPU
load_tensors: offloaded 20/33 layers to GPU
load_tensors: CPU_Mapped model buffer size = 265.27 MiB
load_tensors: CUDA0 model buffer size = 95.76 MiB
load_tensors: CUDA1 model buffer size = 98.17 MiB
.....................................................................
llama_context: constructing llama_context
llama_context: n_seq_max = 1
llama_context: n_ctx = 2048
llama_context: n_ctx_seq = 2048
llama_context: n_batch = 2048
llama_context: n_ubatch = 512
llama_context: causal_attn = 1
llama_context: flash_attn = auto
llama_context: kv_unified = false
llama_context: freq_base = 10000.0
llama_context: freq_scale = 1
llama_context: n_ctx_seq (2048) < n_ctx_train (1048576) -- the full capacity of the model will not be utilized
llama_context: CPU output buffer size = 0.38 MiB
llama_kv_cache: CPU KV buffer size = 2.00 MiB
llama_kv_cache: CUDA0 KV buffer size = 4.00 MiB
llama_kv_cache: CUDA1 KV buffer size = 2.00 MiB
llama_kv_cache: size = 8.00 MiB ( 2048 cells, 4 layers, 1/1 seqs), K (f16): 4.00 MiB, V (f16): 4.00 MiB
llama_memory_recurrent: CPU RS buffer size = 8.48 MiB
llama_memory_recurrent: CUDA0 RS buffer size = 6.16 MiB
llama_memory_recurrent: CUDA1 RS buffer size = 6.93 MiB
llama_memory_recurrent: size = 21.57 MiB ( 1 cells, 32 layers, 1 seqs), R (f32): 0.57 MiB, S (f32): 21.00 MiB
llama_context: Flash Attention was auto, set to enabled
llama_context: CUDA0 compute buffer size = 354.10 MiB
llama_context: CUDA1 compute buffer size = 22.39 MiB
llama_context: CUDA_Host compute buffer size = 18.34 MiB
llama_context: graph nodes = 1815
llama_context: graph splits = 182 (with bs=512), 41 (with bs=1)
common_init_from_params: added <|end_of_text|> logit bias = -inf
common_init_from_params: added <|fim_pad|> logit bias = -inf
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)

system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
perplexity: tokenizing the input ..
perplexity: tokenization took 38.92 ms
perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
perplexity: 0.63 seconds per pass - ETA 0.15 minutes
[1]8.6997,[2]9.9092,[3]9.4740,[4]9.8120,[5]9.9534,[6]10.0280,[7]10.1927,[8]9.8904,[9]9.9473,[10]9.9599,[11]10.1948,[12]10.2769,[13]10.3983,[14]10.3724,[15]10.2742,
Final estimate: PPL = 10.2742 +/- 0.23108

llama_perf_context_print: load time = 226.80 ms
llama_perf_context_print: prompt eval time = 5676.92 ms / 30720 tokens ( 0.18 ms per token, 5411.38 tokens per second)
llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
llama_perf_context_print: total time = 5959.27 ms / 30721 tokens
llama_perf_context_print: graphs reused = 0
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24107 = 20596 + ( 460 = 95 + 10 + 354) + 3050 |
llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 23372 + ( 129 = 98 + 8 + 22) + 622 |
llama_memory_breakdown_print: | - Host | 294 = 265 + 10 + 18 |
Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16/ppl_corpus_code.txt
ADDED
The diff for this file is too large to render. See raw diff

Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16/ppl_corpus_general.txt
ADDED
The diff for this file is too large to render. See raw diff

Benchmarks/granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16/ppl_corpus_math.txt
ADDED
The diff for this file is too large to render. See raw diff
README.md
CHANGED
@@ -21,6 +21,14 @@ base_model:

## **Use The Following Models!**

Stats compared against the standard Q8_0 (precision loss still compared to F16)

* **MXFP4_MOE-output_q6_K-router_gate_emb_q6_K**

@@ -42,11 +50,23 @@ Unlike pure MXFP4, which heavily degrades dense models. This hybrid method selec

# The Magic Model

-

-
-
-

#### MXFP4_MOE-output_q6_K-router_gate_emb_q6_K

@@ -119,21 +139,34 @@ All models were tested with a unified automated harness using `llama.cpp` tools.

Comparing to F16.

- | model_name
-
- | MXFP4_MOE-
-
- | MXFP4_MOE-
- | MXFP4_MOE-
- | MXFP4_MOE-
-
-
- | MXFP4_MOE-
- | MXFP4_MOE-
-
-
- | MXFP4_MOE
-

* All percentages compared against the selected family F16 baseline.

@@ -141,22 +174,35 @@ Comparing to F16.

### Table - File Size + TPS + Avg Precision Loss

- | model_name
-
- | F16
- | MXFP4_MOE-
-
- | MXFP4_MOE-
- | MXFP4_MOE-
- | MXFP4_MOE-
-
-
- | MXFP4_MOE-
- | MXFP4_MOE-
-
-
- | MXFP4_MOE
-

* Bench NGL was 35
* Utilized CUDA

@@ -167,20 +213,33 @@ Comparing to F16.

| model_name | gen | gen_er | code | code_er | math | math_er |
| ---------- | ---- | ------- | ----- | -------- | ------ | -------- |
- | F16 |
- | MXFP4_MOE-
-
- | MXFP4_MOE-
- | MXFP4_MOE-
- | MXFP4_MOE-
-
-
- | MXFP4_MOE-
- | MXFP4_MOE-
-
-
- | MXFP4_MOE |
-

* gen = ppl_general
* gen_er = ppl_general_error

@@ -196,20 +255,33 @@ Comparing to F16.

| model_name | loss_general | loss_code | loss_math |
| ---------- | ------------ | ---------- | ---------- |
| F16 | 0 | 0 | 0 |
- | MXFP4_MOE-
-
- | MXFP4_MOE-
- | MXFP4_MOE-
- | MXFP4_MOE-
-
-
- | MXFP4_MOE-
- | MXFP4_MOE-
-
-
- | MXFP4_MOE |
-
* loss_general = precision_loss_general_pct
* loss_code = precision_loss_code_pct
- * loss_math = precision_loss_math_pct

## **Use The Following Models!**

* **MXFP4_MOE-output_q6_k-router_gate_emb_f16** (This is the special version)

29.7% smaller than F16 • 1652.1 TPS • 0.04959% precision loss compared to F16

Why is this version special? Because precision loss on tiny models like this affects the model to the extreme. Achieving ~30% smaller size at this small a precision loss is exactly the trade-off you want when the model's size needs to be minimized. This is the primary variant suggested for this model.

---

Stats compared against the standard Q8_0 (precision loss still compared to F16)

* **MXFP4_MOE-output_q6_K-router_gate_emb_q6_K**

# The Magic Model

#### MXFP4_MOE-output_q6_k-router_gate_emb_f16

> **(29.7% smaller than F16 • 1652.1 TPS • 0.04959% precision loss compared to F16)**

This... this is hot, if I do say so myself.

The following was the conversion script:
```bash
llama-quantize \
  --tensor-type token_embd.weight=F16 \
  --tensor-type output.weight=Q6_K \
  --tensor-type 'router.*'=F16 \
  --tensor-type 'gate.*'=F16 \
  "Path_To_F16_GGUF.gguf" \
  "Path_To_GGUF.gguf" \
  mxfp4_moe
```
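The published numbers for this variant come from the llamabench.txt and perplexity_*.txt logs under Benchmarks/. Below is a minimal sketch of how comparable runs can be made with the stock llama.cpp tools; the model path and corpus filename are illustrative, and the exact flags behind the published logs are an assumption (the logs show pp8/tg128 at ngl 35 for the bench, and n_ctx 2048 with 20 offloaded layers for the perplexity runs).

```bash
MODEL="granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16.gguf"

# Throughput, as in Benchmarks/*/llamabench.txt (pp8 and tg128 at ngl 35)
llama-bench -m "$MODEL" -ngl 35 -p 8 -n 128

# Perplexity over one of the test corpora, as in Benchmarks/*/perplexity_code.txt
llama-perplexity -m "$MODEL" -f ppl_corpus_code.txt -c 2048 -ngl 20
```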

#### MXFP4_MOE-output_q6_K-router_gate_emb_q6_K

Comparing to F16.

| model_name | size_reduction | tps_change |
| ---------- | -------------- | ---------- |
| MXFP4_MOE-output_q6_k-router_gate_emb_f16 | 29.69% | -11.37% |
| MXFP4_MOE-output_f16-router_gate_emb_f16 | 29.69% | -13.16% |
| MXFP4_MOE-output_q6_k-embd_f16 | 35.94% | -15.36% |
| MXFP4_MOE-F16 | 35.94% | -15.21% |
| MXFP4_MOE-output_f16-router_gate_emb_q6_k | 51.56% | -8.38% |
| MXFP4_MOE-output_mxfp4-router_gate_emb_f16 | 51.56% | -11.52% |
| MXFP4_MOE-output_q6_K-router_gate_emb_q6_K | 51.56% | -7.63% |
| MXFP4_MOE-Q6_K | 50% | -6.55% |
| MXFP4_MOE-Q8 | 46.88% | -6.77% |
| Q8_0 | 46.88% | -7.07% |
| Q6_K | 59.38% | -9.24% |
| MXFP4_MOE-output_mxfp4-embd_f16 | 35.94% | -11.02% |
| MXFP4_MOE-output_mxfp4-router_gate_emb_q6_K | 51.56% | -7.64% |
| MXFP4_MOE-output_mxfp4-embd_q6_K | 50% | -7.01% |
| MXFP4_MOE-output_mxfp4-embd_q8 | 46.88% | -7.39% |
| MXFP4_MOE-output_mxfp4-router_gate_emb_q8 | 46.88% | -6.63% |
| MXFP4_MOE-Q5_K | 51.56% | -6.49% |
| MXFP4_MOE-output_mxfp4-embd_q5_K | 51.56% | -6.37% |
| Q5_K_M | 62.5% | -8.98% |
| MXFP4_MOE-output_mxfp4-router_gate_emb_q5_K | 53.12% | -7.48% |
| MXFP4_MOE-output_mxfp4-router_gate_emb_q4_K | 56.25% | -6.61% |
| MXFP4_MOE-Q4_K | 53.12% | -6.47% |
| MXFP4_MOE-output_mxfp4-embd_q4_K | 53.12% | -6.23% |
| Q4_K_M | 67.19% | -8.45% |
| MXFP4_MOE-output_q8-embd_mxfp4 | 53.12% | -6.53% |
| MXFP4_MOE | 73.44% | -1.13% |

* All percentages compared against the selected family F16 baseline.
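The two columns above appear to be plain relative changes against the F16 baseline, using the file sizes and benchmark TPS listed in the next table. A quick check of the output_q6_k-router_gate_emb_f16 row under that assumption (the 0.45/0.64 GB sizes are rounded, so the results are approximate):

```bash
# size_reduction = (1 - size_variant / size_f16) * 100
echo "scale=6; (1 - 0.45/0.64) * 100" | bc              # ~29.69, table: 29.69%

# tps_change = (tps_variant - tps_f16) / tps_f16 * 100
echo "scale=6; (1652.1 - 1863.96)/1863.96 * 100" | bc   # ~-11.37, table: -11.37%
```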
|
|
|
| 174 |
|
| 175 |
### Table - File Size + TPS + Avg Precision Loss

| model_name | file_size_gb | bench_tps | avg_prec_loss_pct |
| ---------- | ------------ | --------- | ----------------- |
| F16 | 0.64 | 1863.96 | 0 |
| MXFP4_MOE-output_q6_k-router_gate_emb_f16 | 0.45 | 1652.1 | 0.0459 |
| MXFP4_MOE-output_f16-router_gate_emb_f16 | 0.45 | 1618.73 | 0.0934 |
| MXFP4_MOE-output_q6_k-embd_f16 | 0.41 | 1577.68 | 0.11 |
| MXFP4_MOE-F16 | 0.41 | 1580.44 | 0.12 |
| MXFP4_MOE-output_f16-router_gate_emb_q6_k | 0.31 | 1707.74 | 0.1855 |
| MXFP4_MOE-output_mxfp4-router_gate_emb_f16 | 0.31 | 1649.31 | 0.1855 |
| MXFP4_MOE-output_q6_K-router_gate_emb_q6_K | 0.31 | 1721.67 | 0.214 |
| MXFP4_MOE-Q6_K | 0.32 | 1741.83 | 0.2545 |
| MXFP4_MOE-Q8 | 0.34 | 1737.7 | 0.3695 |
| Q8_0 | 0.34 | 1732.23 | 0.3695 |
| Q6_K | 0.26 | 1691.78 | 0.6105 |
| MXFP4_MOE-output_mxfp4-embd_f16 | 0.41 | 1658.55 | 0.6519 |
| MXFP4_MOE-output_mxfp4-router_gate_emb_q6_K | 0.31 | 1721.57 | 0.693 |
| MXFP4_MOE-output_mxfp4-embd_q6_K | 0.32 | 1733.28 | 0.8372 |
| MXFP4_MOE-output_mxfp4-embd_q8 | 0.34 | 1726.18 | 0.8454 |
| MXFP4_MOE-output_mxfp4-router_gate_emb_q8 | 0.34 | 1740.43 | 0.8454 |
| MXFP4_MOE-Q5_K | 0.31 | 1742.99 | 2.1423 |
| MXFP4_MOE-output_mxfp4-embd_q5_K | 0.31 | 1745.27 | 2.6333 |
| Q5_K_M | 0.24 | 1696.53 | 2.9645 |
| MXFP4_MOE-output_mxfp4-router_gate_emb_q5_K | 0.3 | 1724.55 | 3.1646 |
| MXFP4_MOE-output_mxfp4-router_gate_emb_q4_K | 0.28 | 1740.67 | 4.3156 |
| MXFP4_MOE-Q4_K | 0.3 | 1743.34 | 4.5808 |
| MXFP4_MOE-output_mxfp4-embd_q4_K | 0.3 | 1747.89 | 4.7838 |
| Q4_K_M | 0.21 | 1706.54 | 12.1189 |
| MXFP4_MOE-output_q8-embd_mxfp4 | 0.3 | 1742.28 | 13.915 |
| MXFP4_MOE | 0.17 | 1842.9 | 8225.0298 |

* Bench NGL was 35 (number of model layers offloaded to the GPU)
* Utilized CUDA

| model_name | gen | gen_er | code | code_er | math | math_er |
| ---------- | ---- | ------- | ----- | -------- | ------ | -------- |
| F16 | 18.1241 | 0.4654 | 1.9547 | 0.0175 | 10.2753 | 0.2312 |
| MXFP4_MOE-output_q6_k-router_gate_emb_f16 | 18.1547 | 0.4667 | 1.9543 | 0.0175 | 10.2742 | 0.2311 |
| MXFP4_MOE-output_f16-router_gate_emb_f16 | 18.1532 | 0.4667 | 1.9546 | 0.0175 | 10.2881 | 0.2316 |
| MXFP4_MOE-output_q6_k-embd_f16 | 18.1555 | 0.4664 | 1.9539 | 0.0175 | 10.2956 | 0.2317 |
| MXFP4_MOE-F16 | 18.1603 | 0.4666 | 1.9546 | 0.0175 | 10.2923 | 0.2317 |
| MXFP4_MOE-output_f16-router_gate_emb_q6_k | 18.1862 | 0.4686 | 1.9581 | 0.0175 | 10.2794 | 0.2314 |
| MXFP4_MOE-output_mxfp4-router_gate_emb_f16 | 18.1862 | 0.4686 | 1.9581 | 0.0175 | 10.2794 | 0.2314 |
| MXFP4_MOE-output_q6_K-router_gate_emb_q6_K | 18.2137 | 0.4694 | 1.9581 | 0.0175 | 10.2726 | 0.2311 |
| MXFP4_MOE-Q6_K | 18.2289 | 0.4697 | 1.9583 | 0.0175 | 10.2754 | 0.2311 |
| MXFP4_MOE-Q8 | 18.2363 | 0.4693 | 1.9558 | 0.0175 | 10.3198 | 0.2325 |
| Q8_0 | 18.2363 | 0.4693 | 1.9558 | 0.0175 | 10.3198 | 0.2325 |
| Q6_K | 18.3753 | 0.4719 | 1.9612 | 0.0175 | 10.2869 | 0.2294 |
| MXFP4_MOE-output_mxfp4-embd_f16 | 18.2903 | 0.4697 | 1.9572 | 0.0175 | 10.3689 | 0.2334 |
| MXFP4_MOE-output_mxfp4-router_gate_emb_q6_K | 18.334 | 0.472 | 1.9603 | 0.0175 | 10.3405 | 0.2326 |
| MXFP4_MOE-output_mxfp4-embd_q6_K | 18.3312 | 0.4717 | 1.9612 | 0.0175 | 10.3818 | 0.2338 |
| MXFP4_MOE-output_mxfp4-embd_q8 | 18.3491 | 0.4717 | 1.958 | 0.0175 | 10.391 | 0.234 |
| MXFP4_MOE-output_mxfp4-router_gate_emb_q8 | 18.3491 | 0.4717 | 1.958 | 0.0175 | 10.391 | 0.234 |
| MXFP4_MOE-Q5_K | 18.8193 | 0.4864 | 1.9665 | 0.0177 | 10.4795 | 0.2366 |
| MXFP4_MOE-output_mxfp4-embd_q5_K | 18.9164 | 0.4885 | 1.9678 | 0.0177 | 10.569 | 0.2391 |
| Q5_K_M | 18.9868 | 0.4897 | 1.9833 | 0.0179 | 10.5497 | 0.2372 |
| MXFP4_MOE-output_mxfp4-router_gate_emb_q5_K | 19.176 | 0.4956 | 1.9713 | 0.0178 | 10.5672 | 0.2381 |
| MXFP4_MOE-output_mxfp4-router_gate_emb_q4_K | 19.0072 | 0.4913 | 1.9966 | 0.0182 | 10.8847 | 0.2476 |
| MXFP4_MOE-Q4_K | 19.1505 | 0.4952 | 1.992 | 0.0181 | 10.9094 | 0.25 |
| MXFP4_MOE-output_mxfp4-embd_q4_K | 19.1528 | 0.4949 | 1.9946 | 0.0181 | 10.957 | 0.2506 |
| Q4_K_M | 21.3531 | 0.5635 | 2.0638 | 0.0194 | 11.6069 | 0.2693 |
| MXFP4_MOE-output_q8-embd_mxfp4 | 22.2013 | 0.5834 | 2.1047 | 0.0199 | 11.4647 | 0.2597 |
| MXFP4_MOE | 1172.2706 | 45.947 | 303.0942 | 7.7666 | 308.3771 | 10.9069 |

* gen = ppl_general
* gen_er = ppl_general_error

| model_name | loss_general | loss_code | loss_math |
| ---------- | ------------ | ---------- | ---------- |
| F16 | 0 | 0 | 0 |
| MXFP4_MOE-output_q6_k-router_gate_emb_f16 | 0.1688 | -0.0205 | -0.0107 |
| MXFP4_MOE-output_f16-router_gate_emb_f16 | 0.1606 | -0.0051 | 0.1246 |
| MXFP4_MOE-output_q6_k-embd_f16 | 0.1732 | -0.0409 | 0.1976 |
| MXFP4_MOE-F16 | 0.1997 | -0.0051 | 0.1654 |
| MXFP4_MOE-output_f16-router_gate_emb_q6_k | 0.3426 | 0.1739 | 0.0399 |
| MXFP4_MOE-output_mxfp4-router_gate_emb_f16 | 0.3426 | 0.1739 | 0.0399 |
| MXFP4_MOE-output_q6_K-router_gate_emb_q6_K | 0.4944 | 0.1739 | -0.0263 |
| MXFP4_MOE-Q6_K | 0.5782 | 0.1842 | 0.001 |
| MXFP4_MOE-Q8 | 0.6191 | 0.0563 | 0.4331 |
| Q8_0 | 0.6191 | 0.0563 | 0.4331 |
| Q6_K | 1.386 | 0.3325 | 0.1129 |
| MXFP4_MOE-output_mxfp4-embd_f16 | 0.917 | 0.1279 | 0.9109 |
| MXFP4_MOE-output_mxfp4-router_gate_emb_q6_K | 1.1581 | 0.2865 | 0.6345 |
| MXFP4_MOE-output_mxfp4-embd_q6_K | 1.1427 | 0.3325 | 1.0365 |
| MXFP4_MOE-output_mxfp4-embd_q8 | 1.2414 | 0.1688 | 1.126 |
| MXFP4_MOE-output_mxfp4-router_gate_emb_q8 | 1.2414 | 0.1688 | 1.126 |
| MXFP4_MOE-Q5_K | 3.8358 | 0.6037 | 1.9873 |
| MXFP4_MOE-output_mxfp4-embd_q5_K | 4.3715 | 0.6702 | 2.8583 |
| Q5_K_M | 4.76 | 1.4631 | 2.6705 |
| MXFP4_MOE-output_mxfp4-router_gate_emb_q5_K | 5.8039 | 0.8492 | 2.8408 |
| MXFP4_MOE-output_mxfp4-router_gate_emb_q4_K | 4.8725 | 2.1436 | 5.9307 |
| MXFP4_MOE-Q4_K | 5.6632 | 1.9082 | 6.1711 |
| MXFP4_MOE-output_mxfp4-embd_q4_K | 5.6759 | 2.0412 | 6.6344 |
| Q4_K_M | 17.8161 | 5.5814 | 12.9592 |
| MXFP4_MOE-output_q8-embd_mxfp4 | 22.496 | 7.6738 | 11.5753 |
| MXFP4_MOE | 6368.021 | 15405.9191 | 2901.1494 |

* loss_general = precision_loss_general_pct
* loss_code = precision_loss_code_pct
* loss_math = precision_loss_math_pct
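
Putting the last two tables together: each per-domain loss appears to be the percentage change in perplexity relative to F16, and the avg_prec_loss column in the size/TPS table is the mean of the three domains. Below is a minimal sketch of that reading; the values are copied from the perplexity table above, and the formula is my interpretation of the numbers rather than a quote from the benchmark scripts.

```python
# Sketch of the precision-loss definition implied by the tables:
# loss_pct = percentage increase in perplexity vs. the F16 baseline.
def loss_pct(f16_ppl: float, quant_ppl: float) -> float:
    return (quant_ppl / f16_ppl - 1) * 100

# MXFP4_MOE-output_q6_k-router_gate_emb_f16 vs F16
gen  = loss_pct(18.1241, 18.1547)  # ~0.1688  (loss_general)
code = loss_pct(1.9547, 1.9543)    # ~-0.0205 (loss_code)
math = loss_pct(10.2753, 10.2742)  # ~-0.0107 (loss_math)

# The mean of the three domains matches the avg_prec_loss column above.
print(round((gen + code + math) / 3, 4))  # ~0.0459
```
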

granite-4.0-h-350m-unsloth-MXFP4_MOE-output_q6_k-router_gate_emb_f16.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6754b9f82a628193e355dd01e176a412266d9aa34597ab5e6248e3f6af9ec11
+size 485063616