"""Align SmolLM3-3B with Direct Preference Optimization (DPO) on the
Anthropic HH-RLHF preference dataset, using TRL's DPOTrainer."""

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer


def main():
    # Load the Anthropic HH-RLHF preference dataset (chosen/rejected pairs).
    dataset = load_dataset("Anthropic/hh-rlhf", split="train")

    # Train on a 10k-example subset to keep the run short.
    train_dataset = dataset.select(range(10000))

    # Load the base model and tokenizer; reuse EOS as the pad token so
    # examples can be batched.
    model_name = "HuggingFaceTB/SmolLM3-3B"
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    training_args = DPOConfig(
        # DPO-specific: beta controls how far the policy may drift from the
        # frozen reference model (higher beta keeps it closer); the length
        # limits truncate overlong prompts and prompt+completion pairs.
        beta=0.1,
        max_prompt_length=512,
        max_length=1024,
        # Optimization: small LR with warmup and cosine decay; effective
        # batch size is 2 x 8 = 16 per device via gradient accumulation.
        learning_rate=5e-7,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        max_steps=1000,
        warmup_steps=100,
        lr_scheduler_type="cosine",
        # Memory savings: recompute activations and train in bfloat16.
        gradient_checkpointing=True,
        bf16=True,
        # Logging, checkpointing, Hub upload, and experiment tracking.
        logging_steps=50,
        save_steps=250,
        output_dir="./smollm3-dpo-aligned",
        push_to_hub=True,
        hub_model_id="robbiemu/smollm3-dpo-aligned",
        report_to="trackio",
        # Keep the raw dataset columns so DPOTrainer's own preprocessing
        # can see the chosen/rejected fields.
        remove_unused_columns=False,
    )

    # With no ref_model argument, DPOTrainer creates a frozen copy of the
    # model internally to serve as the reference policy.
    trainer = DPOTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        processing_class=tokenizer,
    )

    print("Starting DPO training...")
    trainer.train()
    print("Training completed! Model saved and pushed to Hub.")


if __name__ == "__main__":
    main()
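

# A quick post-training sanity check (a hedged sketch, not part of the
# training run above): load the checkpoint pushed to the Hub and generate
# once. The repo id mirrors hub_model_id; the example prompt format follows
# hh-rlhf's "Human:/Assistant:" convention.
#
#   from transformers import pipeline
#
#   pipe = pipeline("text-generation", model="robbiemu/smollm3-dpo-aligned")
#   out = pipe(
#       "\n\nHuman: How do I write a polite follow-up email?\n\nAssistant:",
#       max_new_tokens=128,
#   )
#   print(out[0]["generated_text"])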