Spaces:

ibraheem007
/

tailored

Running

App Files Files Community

tailored / export_training_data_from_db.py

ibraheem007

Upload 5 files

c2b1f56 verified 12 days ago

raw

history blame contribute delete

7.03 kB

	from db.connection import SessionLocal
	from db.models import ContentHistory, Feedback
	from sqlalchemy.orm import joinedload
	import os
	import json

	# Thresholds a feedback row must meet before its content is exported as
	# fine-tuning data (see is_high_quality).
	MIN_CLARITY = 4  # lowest accepted feedback.clarity
	MIN_DEPTH = 4  # lowest accepted feedback.depth
	MIN_COMMENT_LENGTH = 25  # lowest accepted length of the stripped comment


	def is_high_quality(feedback, content_entry) -> bool:
	    """Decide whether a feedback row qualifies its Groq content for fine-tuning.

	    The checks run in a fixed order and the first failing one prints its
	    reason and ends the evaluation; an acceptance message is printed when
	    every check passes.

	    Args:
	        feedback: Object exposing ``clarity``, ``depth``, ``complexity`` and
	            ``comments``.
	        content_entry: Object exposing ``generated_model``.

	    Returns:
	        True only when the content was generated by ``"groq"``, clarity and
	        depth are present and at least ``MIN_CLARITY`` / ``MIN_DEPTH``,
	        complexity is exactly ``"Just right"`` and the whitespace-stripped
	        comment has at least ``MIN_COMMENT_LENGTH`` characters. False
	        otherwise.
	    """
	    # Only use Groq content for fine-tuning (the established model)
	    if content_entry.generated_model != "groq":
	        print(f"❌ Skipping - not Groq content: {content_entry.generated_model}")
	        return False

	    # A missing rating counts as failing the threshold: comparing None with
	    # an int raises TypeError, which would abort the caller's whole run.
	    if feedback.clarity is None or feedback.clarity < MIN_CLARITY:
	        print(f"❌ Clarity too low: {feedback.clarity} < {MIN_CLARITY}")
	        return False

	    if feedback.depth is None or feedback.depth < MIN_DEPTH:
	        print(f"❌ Depth too low: {feedback.depth} < {MIN_DEPTH}")
	        return False

	    if feedback.complexity != "Just right":
	        print(f"❌ Complexity not 'Just right': {feedback.complexity}")
	        return False

	    # comments may be None; treat that like an empty comment.
	    comment_text = (feedback.comments or "").strip()
	    if len(comment_text) < MIN_COMMENT_LENGTH:
	        print(f"❌ Comment too short: {len(comment_text)} < {MIN_COMMENT_LENGTH}")
	        return False

	    print(f"✅ High-quality Groq feedback for fine-tuning: clarity={feedback.clarity}, depth={feedback.depth}")
	    return True

	def format_training_example(entry, feedback):
	    """Build one instruction-tuning record, with metadata, from a content entry.

	    Args:
	        entry: Object exposing ``user_type``, ``student_level``, ``prompt``
	            and ``output``; tutor entries also expose ``content_type`` and
	            ``topic``.
	        feedback: Object exposing ``clarity``, ``depth``, ``complexity`` and
	            ``comments``; these are copied into the record's metadata.

	    Returns:
	        A dict with the keys ``instruction``, ``input``, ``output`` and
	        ``metadata``, or None when no usable example can be built: the
	        ``user_type`` is neither ``"student"`` nor ``"tutor"``, the output
	        is missing or blank, or a student entry has a missing or blank
	        prompt.
	    """
	    # An example without a target text is useless for training, and calling
	    # .strip() on None would raise AttributeError.
	    output_text = (entry.output or "").strip()
	    if not output_text:
	        return None

	    # Feedback fields shared by both record shapes.
	    feedback_meta = {
	        "clarity_score": feedback.clarity,
	        "depth_score": feedback.depth,
	        "complexity": feedback.complexity,
	        "comments": feedback.comments,
	    }

	    if entry.user_type == "student":
	        # The student prompt is the content to simplify; without it there is
	        # nothing to build an instruction from.
	        prompt_text = (entry.prompt or "").strip()
	        if not prompt_text:
	            return None
	        return {
	            "instruction": f"Simplify the following content for a {entry.student_level} student: {prompt_text}",
	            "input": f"Student Level: {entry.student_level}",
	            "output": output_text,
	            "metadata": {
	                "user_type": "student",
	                "student_level": entry.student_level,
	                **feedback_meta,
	            },
	        }

	    if entry.user_type == "tutor":
	        return {
	            "instruction": f"Create a {entry.content_type} about '{entry.topic}' for {entry.student_level} students.",
	            "input": f"Learning Objectives: {entry.prompt}",
	            "output": output_text,
	            "metadata": {
	                "user_type": "tutor",
	                "content_type": entry.content_type,
	                "topic": entry.topic,
	                "student_level": entry.student_level,
	                **feedback_meta,
	            },
	        }

	    return None

	def export_training_data_from_db(output_file="data/training/phi3_fine_tuning_data.jsonl"):
	    """Export Groq content with high-quality feedback as JSONL for Phi-3 fine-tuning.

	    Loads every ContentHistory row together with its feedback, keeps the
	    (entry, feedback) pairs accepted by is_high_quality, formats them with
	    format_training_example and writes one JSON object per line holding only
	    the "instruction", "input" and "output" keys. Progress, a summary and a
	    truncated sample are printed along the way.

	    Args:
	        output_file: Destination path of the JSONL file. Its parent
	            directory is created when the path has one.

	    Returns:
	        True when at least one example was written. False when no example
	        qualified, or when an exception occurred while querying or writing
	        (the error and its traceback are printed).
	    """
	    print("🔧 Exporting Groq training data for Phi-3 fine-tuning...")
	    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
	    # raises FileNotFoundError, so only create a directory when there is one.
	    output_dir = os.path.dirname(output_file)
	    if output_dir:
	        os.makedirs(output_dir, exist_ok=True)

	    session = SessionLocal()
	    try:
	        # Get all content entries with their feedback
	        entries = session.query(ContentHistory).options(joinedload(ContentHistory.feedback)).all()
	        total_entries = len(entries)
	        print(f"📊 Found {total_entries} total content entries")

	        high_quality_groq = []
	        total_groq_feedback = 0

	        for position, entry in enumerate(entries, start=1):
	            feedback_list = entry.feedback
	            print(f"🔍 Checking entry {position}/{total_entries}: model={entry.generated_model}, user_type={entry.user_type}, feedback_count={len(feedback_list)}")

	            for feedback in feedback_list:
	                # Count all Groq feedback for statistics
	                if entry.generated_model == "groq":
	                    total_groq_feedback += 1
	                    print(f" 📝 Groq Feedback {total_groq_feedback}: clarity={feedback.clarity}, depth={feedback.depth}")

	                # Only export high-quality Groq feedback (for fine-tuning Phi-3)
	                if not is_high_quality(feedback, entry):
	                    continue
	                example = format_training_example(entry, feedback)
	                if example:
	                    high_quality_groq.append(example)
	                    print(" ✅ Added Groq training example")

	        print("📈 Export Summary:")
	        print(f" - Total entries checked: {total_entries}")
	        print(f" - Total Groq feedback: {total_groq_feedback}")
	        print(f" - High-quality Groq examples: {len(high_quality_groq)}")

	        if not high_quality_groq:
	            print("❌ No high-quality Groq training data found.")
	            print("💡 Make sure you have Groq-generated content with high-quality feedback:")
	            print(" - Generated by Groq model")
	            print(f" - Clarity >= {MIN_CLARITY}")
	            print(f" - Depth >= {MIN_DEPTH}")
	            print(" - Complexity = 'Just right'")
	            print(f" - Comments length >= {MIN_COMMENT_LENGTH} characters")
	            return False

	        # Write to JSONL file; metadata is dropped because only the three
	        # training fields belong in the fine-tuning set.
	        with open(output_file, "w", encoding="utf-8") as f:
	            for item in high_quality_groq:
	                training_item = {
	                    "instruction": item["instruction"],
	                    "input": item["input"],
	                    "output": item["output"],
	                }
	                f.write(json.dumps(training_item, ensure_ascii=False) + "\n")

	        print(f"✅ Successfully exported {len(high_quality_groq)} Groq training examples to {output_file}")

	        # Count by the recorded user type rather than by searching the
	        # instruction text, which also contains free-form user prompts and
	        # could match the wrong category.
	        student_examples = sum(
	            1 for item in high_quality_groq if item["metadata"]["user_type"] == "student"
	        )
	        tutor_examples = sum(
	            1 for item in high_quality_groq if item["metadata"]["user_type"] == "tutor"
	        )
	        print(f"📊 Breakdown: {student_examples} student examples, {tutor_examples} tutor examples")

	        # Show a truncated preview of the first exported example.
	        print("📝 Sample training example:")
	        sample = high_quality_groq[0]
	        print(json.dumps({
	            "instruction": sample["instruction"][:100] + "...",
	            "input": sample["input"],
	            "output": sample["output"][:100] + "..."
	        }, indent=2, ensure_ascii=False))

	        return True

	    except Exception as e:
	        # Top-level boundary of the export: report the failure and signal it
	        # through the return value instead of propagating.
	        print(f"❌ Error exporting training data: {e}")
	        import traceback
	        traceback.print_exc()
	        return False

	    finally:
	        session.close()

	# Script entry point: run the export with the default output path when this
	# file is executed directly rather than imported.
	if __name__ == "__main__":
	    export_training_data_from_db()