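"""
Fine-tuning script for Qwen/Qwen2.5-0.5B-Instruct on a chat-style dataset.

Conversations are formatted with the Qwen chat template, the loss is masked so that
only the final assistant response is supervised, and training runs through the
Hugging Face Trainer.
"""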
import re
import numpy as np
import torch
from datasets import load_dataset, Dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling,
)
from huggingface_hub import login
##########
# CONFIG #
##########
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
DATASET = "dataset/repo"
OUTPUT_MODEL = "model/repo"
# Training hyperparams
NUM_EPOCHS = 3
PER_DEVICE_BATCH = 4
GRADIENT_ACCUMULATION = 4
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
WARMUP_STEPS = 100
BF16 = True
TORCH_COMPILE = False
#########
# LOGIN #
#########
login("<YOUR_HF_TOKEN>")
##################
# LOAD TOKENIZER #
##################
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.padding_side = "right"
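# Defensive fallback (an assumption for robustness; Qwen2.5 tokenizers normally ship
# with a pad token already set): fall back to EOS so batch padding in the collator works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token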
################
# LOAD DATASET #
################
raw_ds = load_dataset(DATASET, "default", split="train")
raw_ds = raw_ds.shuffle(seed=42)
# Apply Qwen chat template
formatted_texts = [
tokenizer.apply_chat_template(
conv,
tokenize=False,
add_generation_prompt=False
)
for conv in raw_ds["text"]
]
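# Each formatted string now carries the full Qwen chat markup
# (<|im_start|>role ... <|im_end|> blocks) that the collator below relies on.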
# Build simple dataset
ds = Dataset.from_dict({"text": formatted_texts})
########################
# CUSTOM DATA COLLATOR #
########################
class Qwen25DataCollator(DataCollatorForLanguageModeling):
def __init__(self, tokenizer, mlm=False):
super().__init__(tokenizer=tokenizer, mlm=mlm)
# get token ids robustly (some tokenizers might return [] for encode if token missing)
try:
self.im_start_token = tokenizer.encode("<|im_start|>", add_special_tokens=False)[0]
except Exception:
self.im_start_token = None
try:
self.im_end_token = tokenizer.encode("<|im_end|>", add_special_tokens=False)[0]
except Exception:
self.im_end_token = None
# "assistant" token sequence (may be multiple tokens)
try:
self.assistant_text = tokenizer.encode("assistant", add_special_tokens=False)
except Exception:
self.assistant_text = []
# Provide both __call__ and torch_call for compatibility
def __call__(self, features):
return self.torch_call(features)
def torch_call(self, examples):
"""
examples: list of dicts returned by tokenization (each example contains 'input_ids', 'attention_mask', etc.)
We'll leverage the parent to create initial batch and then mask labels for assistant responses only.
"""
batch = super().torch_call(examples) # returns input_ids, attention_mask, labels (for MLM)
input_ids = batch["input_ids"]
labels = batch["labels"]
# If special tokens are not present, return default batch unchanged
if self.im_start_token is None or self.im_end_token is None or len(self.assistant_text) == 0:
return batch
# Iterate examples in batch to mask labels: only assistant response tokens should be supervised
for i, ids in enumerate(input_ids):
# Find positions of <|im_start|> and <|im_end|>
im_start_positions = torch.where(ids == self.im_start_token)[0]
im_end_positions = torch.where(ids == self.im_end_token)[0]
if im_start_positions.numel() == 0 or im_end_positions.numel() == 0:
# no recognized chat markers: leave labels as-is (or continue)
continue
last_assistant_start = None
# Find last im_start that is followed by "assistant"
for start_pos in im_start_positions:
# check if tokens following start_pos match "assistant"
as_len = len(self.assistant_text)
candidate_end = start_pos + 1 + as_len
if candidate_end <= len(ids):
segment = ids[start_pos + 1:start_pos + 1 + as_len]
if torch.equal(segment, torch.tensor(self.assistant_text, device=ids.device)):
last_assistant_start = int(start_pos)
if last_assistant_start is None:
continue
# Find first im_end after last_assistant_start
assistant_end_positions = im_end_positions[im_end_positions > last_assistant_start]
if assistant_end_positions.numel() == 0:
continue
assistant_end = int(assistant_end_positions[0])
# Response text is between (last_assistant_start + 1 + len("assistant")) and assistant_end - 1 (inclusive),
# but because template may include a newline or an extra token, we set response_start carefully.
response_start = last_assistant_start + 1 + len(self.assistant_text)
            # If the chat template inserts a newline token right after "assistant", skip it
            # (conservative: we only skip such a token when it is actually present)
            newline_ids = self.tokenizer.encode("\n", add_special_tokens=False)
            if newline_ids and response_start < len(ids) and ids[response_start] == newline_ids[0]:
                response_start += 1
# Apply masking:
# Set everything before response_start to -100 (ignored), preserve response tokens, set rest to -100
labels[i, :] = -100
if response_start < len(ids):
# labels slice up to assistant_end inclusive
end_idx = min(assistant_end + 1, ids.shape[0])
labels[i, response_start:end_idx] = ids[response_start:end_idx]
# assign modified labels back
batch["labels"] = labels
return batch
collator = Qwen25DataCollator(tokenizer=tokenizer, mlm=False)
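# Optional sanity check (illustrative sketch; assumes the first formatted example
# contains an assistant turn): run a single example through the collator and report
# how many tokens remain supervised after masking.
_probe = tokenizer(ds["text"][0], truncation=True, max_length=2048, padding=False)
_probe_batch = collator([{"input_ids": _probe["input_ids"], "attention_mask": _probe["attention_mask"]}])
_supervised = int((_probe_batch["labels"] != -100).sum())
print(f"Collator sanity check: {_supervised}/{len(_probe['input_ids'])} tokens supervised")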
###############################################
# ANALYZE DATASET LENGTHS TO SET `max_length` #
###############################################
# We analyze the dataset to optimize the choice of `max_length`
print("Analyzing dataset to determine max_length (sample up to 1000)...")
assistant_lengths = []
full_lengths = []
sample_limit = min(1000, len(ds))
for example in ds["text"][:sample_limit]:
full_tokens = tokenizer(example, truncation=False, add_special_tokens=True)
full_lengths.append(len(full_tokens["input_ids"]))
# extract the last assistant response via regex pattern
pattern = r"<\|im_start\|>assistant\n(.*?)<\|im_end\|>"
matches = re.findall(pattern, example, re.DOTALL)
if matches:
last_response = matches[-1]
resp_tokens = tokenizer(last_response, truncation=False, add_special_tokens=False)
assistant_lengths.append(len(resp_tokens["input_ids"]))
# Basic statistics (guard for empty lists)
def safe_stat(arr):
if len(arr) == 0:
return 0.0, 0.0, 0.0, 0.0
return np.mean(arr), np.median(arr), np.percentile(arr, 95), np.percentile(arr, 99)
mean_ass, med_ass, p95_ass, p99_ass = safe_stat(assistant_lengths)
mean_full, _, p95_full, _ = safe_stat(full_lengths)
print(f"Assistant response mean={mean_ass:.1f}, median={med_ass:.1f}, 95%={p95_ass:.1f}, 99%={p99_ass:.1f}")
print(f"Full conversation mean={mean_full:.1f}, 95%={p95_full:.1f}")
# Round up to nearest power of two but don't exceed tokenizer.model_max_length
def next_power_of_2(x):
if x <= 1:
return 1
return 2 ** int(np.ceil(np.log2(x)))
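# Example: a 95th-percentile length of 700 tokens rounds up to 1024 (2**10).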
target_length = int(min(p95_full if p95_full > 0 else tokenizer.model_max_length, tokenizer.model_max_length))
MAX_LENGTH = next_power_of_2(target_length)
if MAX_LENGTH > tokenizer.model_max_length:
MAX_LENGTH = tokenizer.model_max_length
print(f"Using MAX_LENGTH = {MAX_LENGTH}")
####################
# TOKENIZE DATASET #
####################
def tokenize_function(examples):
return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH, padding=False)
tokenized_ds = ds.map(tokenize_function, batched=True, remove_columns=ds.column_names)
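# Padding is deferred to the data collator, which pads each batch dynamically at training time.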
##############
# LOAD MODEL #
##############
# Load model
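# Note: attn_implementation="flash_attention_2" requires the flash-attn package and a
# compatible GPU; drop the argument (or use "sdpa") if flash-attn is not installed.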
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
torch_dtype=torch.bfloat16 if BF16 else None,
device_map="auto",
attn_implementation="flash_attention_2",
use_cache=False,
)
try:
from liger_kernel.transformers import apply_liger_kernel_to_qwen2
try:
        apply_liger_kernel_to_qwen2(model=model)
except TypeError:
apply_liger_kernel_to_qwen2()
print("Liger Kernel applied successfully for Qwen2 optimization")
except Exception:
print("Liger Kernel not available or failed to apply; continuing without it.")
print(f"Model loaded. Parameters: {model.num_parameters() / 1e9:.3f}B")
######################
# TRAINING ARGUMENTS #
######################
training_args = TrainingArguments(
output_dir="./qwen_rephraser_checkpoints",
num_train_epochs=NUM_EPOCHS,
per_device_train_batch_size=PER_DEVICE_BATCH,
gradient_accumulation_steps=GRADIENT_ACCUMULATION,
learning_rate=LEARNING_RATE,
weight_decay=WEIGHT_DECAY,
warmup_steps=WARMUP_STEPS,
lr_scheduler_type="cosine",
logging_steps=10,
save_steps=500,
save_total_limit=2,
bf16=BF16,
optim="adamw_torch_fused",
gradient_checkpointing=True,
report_to="none",
push_to_hub=False, # we'll push manually at the end
hub_model_id=OUTPUT_MODEL,
hub_private_repo=True,
dataloader_num_workers=4,
dataloader_pin_memory=True,
ddp_find_unused_parameters=False,
torch_compile=TORCH_COMPILE,
)
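# On a single device, each optimizer step sees PER_DEVICE_BATCH * GRADIENT_ACCUMULATION examples.
print(f"Effective batch size per optimizer step: {PER_DEVICE_BATCH * GRADIENT_ACCUMULATION}")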
###########
# TRAINER #
###########
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_ds,
data_collator=collator,
)
#########
# TRAIN #
#########
print("Starting training...")
trainer.train()
####################
# SAVE FINAL MODEL #
####################
print("Saving model to ./final_model ...")
model.config.use_cache = True
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")
##################
# PUSHING TO HUB #
##################
try:
print(f"Pushing model and tokenizer to the hub as {OUTPUT_MODEL} (private)...")
model.push_to_hub(OUTPUT_MODEL, private=True)
tokenizer.push_to_hub(OUTPUT_MODEL, private=True)
print("Push completed.")
except Exception as e:
print("Warning: push_to_hub failed:", e)
print("Training complete!")