QAT instructions + HF model upload

# =========================================================================================
# Fine-tuning script based on https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_%281B_and_3B%29-Conversational.ipynb
# This script and HF checkpoint are only intended to showcase how to do finetuning in a way compatible with ExecuTorch
# Only 10 steps are done, and quality of the finetuned model is not evaluated
# =========================================================================================

from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template, standardize_sharegpt

from datasets import load_dataset
from trl import SFTConfig, SFTTrainer
import torch

batch_size = 2
learning_rate = 2e-5
gradient_accumulation_steps = 4
max_steps = 10
full_finetuning = True
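# "int8-int4" enables torchao quantization-aware training during finetuning:
# weights are fake-quantized to int4 and activations to int8 in the forward pass,
# so the model learns weights that hold up after real post-training quantization.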
qat_scheme = "int8-int4"
output_dir = "/tmp/unsloth_example"


model_id = "unsloth/Qwen3-4B"
chat_template = "qwen3"
max_seq_length = 2048
dtype = torch.bfloat16
load_in_4bit = False

################################################################################
# Define model/tokenizer
################################################################################

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_id,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    full_finetuning=full_finetuning,
    qat_scheme=qat_scheme,
)
tokenizer = get_chat_template(tokenizer, chat_template=chat_template)

print("MODEL AFTER LOADING")
print(model)
print(model.config)
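# _torchao_config is the torchao QAT configuration Unsloth attaches to the model
# (a private attribute; printing it confirms the requested scheme was applied).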
print(model._torchao_config)

################################################################################
# Process dataset
################################################################################

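# FineTome stores ShareGPT-style conversations; standardize_sharegpt converts
# {"from": ..., "value": ...} turns into the {"role": ..., "content": ...} format
# that tokenizer.apply_chat_template expects.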
def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [
        tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
        for convo in convos
    ]
    return {"text": texts}

dataset = load_dataset("mlabonne/FineTome-100k", split="train")
dataset = standardize_sharegpt(dataset)
dataset = dataset.map(formatting_prompts_func, batched=True)

print("DATASET ENTRY")
print(dataset[0])
print("\n\n")

################################################################################
# Define trainer
################################################################################

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=False,
    args=SFTConfig(
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=5,
        num_train_epochs=1,
        max_steps=max_steps,
        learning_rate=learning_rate,
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)
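
# Optional: the reference notebook also masks prompt tokens so loss is computed
# only on assistant responses. A sketch of that call, commented out here; the
# instruction/response markers are assumptions for the Qwen3 chat template and
# should be verified against the rendered "text" field:
#
# from unsloth.chat_templates import train_on_responses_only
# trainer = train_on_responses_only(
#     trainer,
#     instruction_part="<|im_start|>user\n",
#     response_part="<|im_start|>assistant\n",
# )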

################################################################################
# Do fine tuning
################################################################################
print("DOING FINETUNING")
trainer_stats = trainer.train()
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)

################################################################################
# Save model
################################################################################
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
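
# Optional sanity check: generate a short reply with the finetuned model before
# conversion. for_inference is Unsloth's inference-mode helper; the prompt is an
# arbitrary example, and quality is not expected to be good after only 10 steps.
FastLanguageModel.for_inference(model)
messages = [{"role": "user", "content": "What is the capital of France?"}]
inputs = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
outputs = model.generate(inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0]))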

################################################################################
# Convert model to torchao format
################################################################################

from unsloth.models._utils import _convert_torchao_model
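# _convert_torchao_model is an internal Unsloth helper (note the underscore): it
# replaces the QAT fake-quantization wrappers with real torchao quantized weights
# so the checkpoint can be serialized and exported. Being private, the API may
# change between Unsloth versions.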
_convert_torchao_model(model)

print("MODEL AFTER CONVERT", model)
print("CONFIG", model.config)


################################################################################
# Push converted model to hub
################################################################################
from huggingface_hub import get_token, whoami

def _get_username():
    token = get_token()
    username = whoami(token=token)["name"]
    return username

username = _get_username()
model_name = model_id.split("/")[-1]
save_to = f"{username}/{model_name}-{qat_scheme}-unsloth-v2"
model.push_to_hub(save_to, safe_serialization=False)
tokenizer.push_to_hub(save_to)

################################################################################
# Load converted from hub and inspect
################################################################################
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(save_to)
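# If the conversion and round trip through the hub worked, the weights printed
# below should show torchao quantized tensor subclasses, not plain torch.Tensors.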
print("model", model)
print("model.embed_tokens.weight", model.model.embed_tokens.weight)
print("model.layers[0].self_attn.q_proj.weight", model.model.layers[0].self_attn.q_proj.weight)
print("lm_head.weight", model.lm_head.weight)

Exporting to ExecuTorch

We can run the quantized model on a mobile phone using ExecuTorch:

# 1. Install ExecuTorch
pip install executorch pytorch_tokenizers torchtune

# 2. Download QAT'd weights we uploaded to HF
python -m executorch.examples.models.qwen3.convert_weights $(hf download metascroy/Qwen3-4B-int8-int4-unsloth-v2) pytorch_model_converted.bin
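#    (convert_weights rewrites the HF checkpoint into the single-file format that
#    export_llama consumes; `hf download` prints the local snapshot path, which is
#    why it is wrapped in $(...))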

# 3. Download model config from ExecuTorch repo
curl -L -o 4b_config.json https://raw.githubusercontent.com/pytorch/executorch/main/examples/models/qwen3/config/4b_config.json

# 4. Export to ExecuTorch pte file
python -m executorch.examples.models.llama.export_llama \
  --model "qwen3_4b" \
  --checkpoint pytorch_model_converted.bin \
  --params 4b_config.json \
  --output_name model.pte \
  -kv \
  --use_sdpa_with_kv_cache \
  -X \
  --xnnpack-extended-ops \
  --max_context_length 1024 \
  --max_seq_length 512 \
  --dtype fp32 \
  --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}'

# 5. (optional) Upload pte file to HuggingFace
hf upload metascroy/Qwen3-4B-int8-int4-unsloth-v2 model.pte
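
Before copying anything to a phone, it can be worth checking that the exported .pte file loads at all. A minimal sketch using ExecuTorch's Python runtime (executorch.runtime is assumed to be available, as in recent ExecuTorch releases; actually executing "forward" would additionally require the token and cache-position inputs the llama runner normally supplies):

# Load the exported program and list its entry points as a quick smoke test.
from executorch.runtime import Runtime

runtime = Runtime.get()
program = runtime.load_program("model.pte")
print(program.method_names)  # expect a "forward" entry point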

Running in a mobile app

After that, you can run the model in an iOS mobile app using the executorch-examples repo.

First clone the repo and open the Xcode project:

git clone https://github.com/meta-pytorch/executorch-examples.git
open executorch-examples/llm/apple/etLLM.xcodeproj

Once the project is open, connect your iPhone and select it as the run destination in Xcode. To build the app, press the Play button. (This does require you to sign the code.)

To run the model we just finetuned, you need to transfer the model.pte and tokenizer.json files to your phone. To do this, open Finder, select your phone, and drag and drop the files to copy them over. Rename the pte file to qwen3_model.pte before copying it, because the demo app requires the name to begin with qwen3 in order to use the correct prompt template.

In the etLLM app, you can select the model and tokenizer to use by browsing your file system for the "qwen3_model.pte" and "tokenizer.json" files we just copied to the phone.

Uploaded model

  • Developed by: metascroy
  • License: apache-2.0
  • Finetuned from model: unsloth/Qwen3-4B

This Qwen3 model was trained 2x faster with Unsloth and Hugging Face's TRL library.

Model tree for metascroy/Qwen3-4B-int8-int4-unsloth-v2

  • Base model: Qwen/Qwen3-4B-Base
  • Finetuned: Qwen/Qwen3-4B
  • Finetuned: unsloth/Qwen3-4B
  • Quantized: this model