| """ | |
| PromptWizard Qwen Training — Configurable Dataset & Repo | |
| Fine-tunes Qwen using a user-selected dataset and uploads the trained model | |
| to a user-specified Hugging Face Hub repo asynchronously with detailed logs. | |
| """ | |
import gradio as gr
import spaces
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
from huggingface_hub import HfFolder, upload_folder
import os, asyncio, threading, time
from datetime import datetime
# ==== Async upload wrapper ====
def start_async_upload(local_dir, hf_repo, output_log):
    """Starts the async model upload in a background daemon thread."""
    def runner():
        output_log.append(f"[INFO] 🚀 Async upload thread started for repo: {hf_repo}")
        asyncio.run(async_upload_model(local_dir, hf_repo, output_log))
        output_log.append(f"[INFO] 🛑 Async upload thread finished for repo: {hf_repo}")

    threading.Thread(target=runner, daemon=True).start()
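
# Design note: upload_folder() below is a blocking HTTP call, so it runs inside
# asyncio.run() on a daemon thread to keep the Gradio request handler responsive;
# the event loop is only needed for the `await asyncio.sleep(5)` retry back-off.
# Uploading requires a write token, picked up by HfFolder.get_token() from the
# HF_TOKEN environment variable (e.g. a Space secret) or a cached `huggingface-cli login`.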
async def async_upload_model(local_dir, hf_repo, output_log, max_retries=3):
    """Uploads a local model directory to the HF Hub via the HTTP API, with retries."""
    try:
        token = HfFolder.get_token()
        output_log.append(f"[INFO] ☁️ Preparing to upload to repo: {hf_repo}")
        attempt = 0
        while attempt < max_retries:
            try:
                output_log.append(f"[INFO] 🔄 Attempt {attempt + 1} to upload folder via HTTP API...")
                upload_folder(
                    folder_path=local_dir,
                    repo_id=hf_repo,
                    repo_type="model",
                    token=token,
                    ignore_patterns=["*.lock", "*.tmp"],  # skip lock/temp files
                    create_pr=False,
                )
                output_log.append("[SUCCESS] ✅ Model successfully uploaded to HF Hub!")
                break
            except Exception as e:
                attempt += 1
                output_log.append(f"[ERROR] Upload attempt {attempt} failed: {e}")
                if attempt >= max_retries:
                    output_log.append("[ERROR] ❌ Max retries reached. Upload failed.")
                else:
                    output_log.append("[INFO] Retrying upload in 5 seconds...")
                    await asyncio.sleep(5)
    except Exception as e:
        output_log.append(f"[ERROR] ❌ Unexpected error during upload: {e}")
# ==== GPU check ====
def check_gpu_status():
    return "🚀 Zero GPU Ready - GPU will be allocated when training starts"


# ==== Logging helper ====
def log_message(output_log, msg):
    line = f"[{datetime.now().strftime('%H:%M:%S')}] {msg}"
    print(line)
    output_log.append(line)
# =====================================================
# 🧠 Train model to expand short prompts into long ones
# =====================================================
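# ZeroGPU: the training entry point must be decorated so a GPU is attached
# for the duration of the call (here capped at 300 seconds).
@spaces.GPU(duration=300)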
def train_model(
    base_model, dataset_name, num_epochs, batch_size, learning_rate, hf_repo
):
    output_log = []
    try:
        log_message(output_log, "🚀 Starting FAST test training...")

        # ===== Device =====
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # Prefer bf16 on GPUs that support it, fp16 otherwise; fall back to fp32 on CPU.
        if device == "cuda":
            dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        else:
            dtype = torch.float32
        log_message(output_log, f"🎮 Device: {device}, dtype: {dtype}")
        if device == "cuda":
            log_message(output_log, f"✅ GPU: {torch.cuda.get_device_name(0)}")

        # ===== Load dataset =====
        log_message(output_log, f"\n📚 Loading dataset: {dataset_name}")
        dataset = load_dataset(dataset_name)
        dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
        train_dataset, test_dataset = dataset["train"], dataset["test"]

        # ===== ⚡ FAST mode: use a small subset =====
        train_dataset = train_dataset.select(range(min(1000, len(train_dataset))))
        test_dataset = test_dataset.select(range(min(200, len(test_dataset))))
        log_message(output_log, f"⚡ Using {len(train_dataset)} train / {len(test_dataset)} test samples")
        # ===== Format samples =====
        # Assumes the dataset exposes "short" and "long" text columns (as in the
        # default prompt-expansion setup); missing columns become empty strings.
        def format_example(example):
            short_prompt = (example.get("short") or "").strip()
            long_response = (example.get("long") or "").strip()
            return {
                "text": (
                    f"<|system|>\nYou are an AI that expands short prompts into detailed, descriptive ones.\n"
                    f"<|user|>\nShort: {short_prompt}\n"
                    f"<|assistant|>\n{long_response}"
                )
            }

        train_dataset = train_dataset.map(format_example)
        test_dataset = test_dataset.map(format_example)
        log_message(output_log, f"✅ Formatted {len(train_dataset)} train + {len(test_dataset)} test examples")
        # ===== Load model & tokenizer =====
        log_message(output_log, f"\n🤖 Loading model: {base_model}")
        tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(
            base_model,
            trust_remote_code=True,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
        )
        if device == "cuda":
            model = model.to(device)
        log_message(output_log, "✅ Model and tokenizer loaded successfully")
        log_message(output_log, f"Tokenizer vocab size: {tokenizer.vocab_size}")
        # ===== LoRA configuration =====
        log_message(output_log, "\n⚙️ Configuring LoRA for efficient fine-tuning...")
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            r=8,                # low-rank adapter dimension
            lora_alpha=16,      # scaling factor (alpha / r = 2.0)
            lora_dropout=0.1,
            target_modules=["q_proj", "v_proj"],  # attention query/value projections
            bias="none",
        )
        model = get_peft_model(model, lora_config)
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        log_message(output_log, f"Trainable params after LoRA: {trainable_params:,}")
        # ===== Tokenization + labels =====
        def tokenize_fn(examples):
            tokenized = tokenizer(
                examples["text"],
                padding="max_length",
                truncation=True,
                max_length=256,
            )
            # Use input_ids as labels, masking padded positions with -100
            # so they are ignored by the cross-entropy loss.
            tokenized["labels"] = [
                [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
                for ids, attn in zip(tokenized["input_ids"], tokenized["attention_mask"])
            ]
            return tokenized

        train_dataset = train_dataset.map(tokenize_fn, batched=True)
        test_dataset = test_dataset.map(tokenize_fn, batched=True)
        log_message(output_log, "✅ Tokenization + labels done")
        # ===== Training arguments =====
        output_dir = "./qwen-gita-lora"
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=batch_size,
            gradient_accumulation_steps=2,
            warmup_steps=10,
            logging_steps=5,
            save_strategy="epoch",
            fp16=(dtype == torch.float16),
            bf16=(dtype == torch.bfloat16),
            optim="adamw_torch",
            learning_rate=learning_rate,
            max_steps=500,  # hard cap on optimizer steps for the demo (overrides num_train_epochs)
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            tokenizer=tokenizer,
        )
        # ===== Train =====
        log_message(output_log, "\n🚀 Starting training...")
        trainer.train()

        log_message(output_log, "\n💾 Saving trained model locally...")
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)

        # ===== Async upload =====
        log_message(output_log, f"\n☁️ Initiating async upload to {hf_repo}")
        start_async_upload(output_dir, hf_repo, output_log)
        log_message(output_log, "✅ Training complete & async upload started!")

    except Exception as e:
        log_message(output_log, f"\n❌ Error during training: {e}")

    return "\n".join(output_log)
# ==== Gradio Interface ====
def create_interface():
    with gr.Blocks(title="PromptWizard — Qwen Trainer") as demo:
        gr.Markdown("""
        # 🧘 PromptWizard Qwen Fine-tuning
        Fine-tune Qwen on any dataset and upload to any Hugging Face repo.
        """)
        with gr.Row():
            with gr.Column():
                gr.Textbox(label="GPU Status", value=check_gpu_status(), interactive=False)
                base_model = gr.Textbox(label="Base Model", value="Qwen/Qwen2.5-0.5B")
                dataset_name = gr.Textbox(label="Dataset Name", value="rahul7star/Gita")
                hf_repo = gr.Textbox(label="HF Repo for Upload", value="rahul7star/Qwen0.5-3B-Gita")
                num_epochs = gr.Slider(1, 3, value=1, step=1, label="Epochs")
                batch_size = gr.Slider(1, 4, value=2, step=1, label="Batch Size")
                learning_rate = gr.Number(value=5e-5, label="Learning Rate")
                train_btn = gr.Button("🚀 Start Fine-tuning", variant="primary")
            with gr.Column():
                output = gr.Textbox(
                    label="Training Log",
                    lines=25,
                    max_lines=40,
                    value="Click 'Start Fine-tuning' to train and upload your model.",
                )
        train_btn.click(
            fn=train_model,
            inputs=[base_model, dataset_name, num_epochs, batch_size, learning_rate, hf_repo],
            outputs=output,
        )
    return demo
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860)
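
# Local usage: run `python app.py` and open http://localhost:7860.
# Note: the training log shown in the UI is what was captured up to the moment
# train_model returns; lines appended later by the background upload thread are
# kept in memory but not pushed back to the textbox.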