Spaces:

ibraheem007
/

tailored

Running

App Files Files Community

tailored / export_training_data_from_db.py

ibraheem007

Upload 5 files

c2b1f56 verified 12 days ago

raw

history blame

7.03 kB

	from db.connection import SessionLocal
	from db.models import ContentHistory, Feedback
	from sqlalchemy.orm import joinedload
	import os
	import json

	# Thresholds a feedback row must meet before the content it rates is
	# exported as fine-tuning data.
	MIN_CLARITY = 4  # minimum accepted clarity rating
	MIN_DEPTH = 4  # minimum accepted depth rating
	MIN_COMMENT_LENGTH = 25  # minimum length of the stripped comment text


	def is_high_quality(feedback, content_entry):
	    """Decide whether a feedback row qualifies its content as fine-tuning data.

	    The content must have been generated by the "groq" model, and the
	    feedback must have clarity >= MIN_CLARITY, depth >= MIN_DEPTH, a
	    complexity of exactly "Just right", and a comment of at least
	    MIN_COMMENT_LENGTH characters after stripping whitespace. The reason
	    for every rejection (and every acceptance) is printed.

	    Args:
	        feedback: Object exposing ``clarity``, ``depth``, ``complexity``
	            and ``comments`` attributes.
	        content_entry: Object exposing a ``generated_model`` attribute.

	    Returns:
	        True when every criterion is met, otherwise False. Missing
	        (``None``) ratings or comments are treated as failing the
	        criterion instead of raising.
	    """
	    # Only Groq output is used as the fine-tuning target.
	    if content_entry.generated_model != "groq":
	        print(f"❌ Skipping - not Groq content: {content_entry.generated_model}")
	        return False

	    # A missing rating cannot satisfy a minimum; comparing None with an int
	    # would raise TypeError and abort the caller's whole export run.
	    clarity = feedback.clarity
	    if clarity is None or clarity < MIN_CLARITY:
	        print(f"❌ Clarity too low: {clarity} < {MIN_CLARITY}")
	        return False

	    depth = feedback.depth
	    if depth is None or depth < MIN_DEPTH:
	        print(f"❌ Depth too low: {depth} < {MIN_DEPTH}")
	        return False

	    if feedback.complexity != "Just right":
	        print(f"❌ Complexity not 'Just right': {feedback.complexity}")
	        return False

	    # Comments may be absent; treat that as an empty comment.
	    comment_text = (feedback.comments or "").strip()
	    if len(comment_text) < MIN_COMMENT_LENGTH:
	        print(f"❌ Comment too short: {len(comment_text)} < {MIN_COMMENT_LENGTH}")
	        return False

	    print(f"✅ High-quality Groq feedback for fine-tuning: clarity={clarity}, depth={depth}")
	    return True

	def format_training_example(entry, feedback):
	    """Build one instruction-tuning record from a content entry and its feedback.

	    Args:
	        entry: Object exposing ``user_type``, ``student_level``, ``prompt``
	            and ``output``; tutor entries additionally need
	            ``content_type`` and ``topic``.
	        feedback: Object exposing ``clarity``, ``depth``, ``complexity``
	            and ``comments``; copied into the record's metadata.

	    Returns:
	        A dict with ``instruction``, ``input``, ``output`` and ``metadata``
	        keys, or None when ``entry.user_type`` is neither "student" nor
	        "tutor", or when the entry has no output text to learn from.
	    """
	    user_type = entry.user_type
	    if user_type not in ("student", "tutor"):
	        return None

	    # Either text field may be missing; coalesce to "" so .strip() cannot
	    # raise and the literal string "None" never reaches the training data.
	    prompt = (entry.prompt or "").strip()
	    output = (entry.output or "").strip()
	    if not output:
	        # An example without a target completion is useless for training.
	        return None

	    metadata = {"user_type": user_type}
	    if user_type == "student":
	        instruction = (
	            f"Simplify the following content for a {entry.student_level} student: {prompt}"
	        )
	        model_input = f"Student Level: {entry.student_level}"
	    else:
	        instruction = (
	            f"Create a {entry.content_type} about '{entry.topic}' "
	            f"for {entry.student_level} students."
	        )
	        model_input = f"Learning Objectives: {prompt}"
	        metadata["content_type"] = entry.content_type
	        metadata["topic"] = entry.topic

	    # Fields shared by both record kinds.
	    metadata.update(
	        {
	            "student_level": entry.student_level,
	            "clarity_score": feedback.clarity,
	            "depth_score": feedback.depth,
	            "complexity": feedback.complexity,
	            "comments": feedback.comments,
	        }
	    )

	    return {
	        "instruction": instruction,
	        "input": model_input,
	        "output": output,
	        "metadata": metadata,
	    }

	def _collect_groq_examples(entries):
	    """Return (training examples, Groq feedback count) for the given entries."""
	    examples = []
	    groq_feedback_count = 0
	    total = len(entries)

	    for position, entry in enumerate(entries, start=1):
	        feedback_list = entry.feedback
	        print(f"🔍 Checking entry {position}/{total}: model={entry.generated_model}, user_type={entry.user_type}, feedback_count={len(feedback_list)}")

	        for feedback in feedback_list:
	            # Count all Groq feedback for statistics, not only the rows kept.
	            if entry.generated_model == "groq":
	                groq_feedback_count += 1
	                print(f" 📝 Groq Feedback {groq_feedback_count}: clarity={feedback.clarity}, depth={feedback.depth}")

	            # Only high-quality Groq feedback becomes a training example.
	            if is_high_quality(feedback, entry):
	                example = format_training_example(entry, feedback)
	                if example:
	                    examples.append(example)
	                    print(" ✅ Added Groq training example")

	    return examples, groq_feedback_count


	def _write_training_jsonl(examples, output_file):
	    """Write one JSON object per line, keeping only the fields used for training."""
	    with open(output_file, "w", encoding="utf-8") as f:
	        for item in examples:
	            # Metadata is only for reporting; it is left out of the file.
	            training_item = {
	                "instruction": item["instruction"],
	                "input": item["input"],
	                "output": item["output"],
	            }
	            f.write(json.dumps(training_item, ensure_ascii=False) + "\n")


	def _print_export_breakdown(examples):
	    """Print per-user-type counts and a truncated preview of the first example."""
	    # Classify by the recorded user type: the instruction text embeds
	    # user-provided content, so substring matching on it can miscount.
	    student_examples = sum(1 for e in examples if e["metadata"]["user_type"] == "student")
	    tutor_examples = sum(1 for e in examples if e["metadata"]["user_type"] == "tutor")
	    print(f"📊 Breakdown: {student_examples} student examples, {tutor_examples} tutor examples")

	    print("📝 Sample training example:")
	    sample = examples[0]
	    print(json.dumps({
	        "instruction": sample["instruction"][:100] + "...",
	        "input": sample["input"],
	        "output": sample["output"][:100] + "..."
	    }, indent=2, ensure_ascii=False))


	def export_training_data_from_db(output_file="data/training/phi3_fine_tuning_data.jsonl"):
	    """Export Groq content with high-quality feedback for Phi-3 fine-tuning.

	    Loads every ContentHistory row together with its feedback, keeps the
	    (entry, feedback) pairs accepted by ``is_high_quality``, formats them
	    with ``format_training_example`` and writes them to ``output_file`` as
	    JSON Lines containing only ``instruction``, ``input`` and ``output``.
	    Progress and a summary are printed along the way.

	    Args:
	        output_file: Destination path of the JSONL file. Its parent
	            directory is created when the path has one.

	    Returns:
	        True when at least one example was written; False when no example
	        qualified or when an exception occurred (the traceback is printed).
	    """
	    print("🔧 Exporting Groq training data for Phi-3 fine-tuning...")

	    # os.makedirs("") raises FileNotFoundError, so only create a directory
	    # when the output path actually names one.
	    output_dir = os.path.dirname(output_file)
	    if output_dir:
	        os.makedirs(output_dir, exist_ok=True)

	    session = SessionLocal()
	    try:
	        # Get all content entries with their feedback
	        entries = session.query(ContentHistory).options(joinedload(ContentHistory.feedback)).all()
	        print(f"📊 Found {len(entries)} total content entries")

	        high_quality_groq, total_groq_feedback = _collect_groq_examples(entries)

	        print("📈 Export Summary:")
	        print(f" - Total entries checked: {len(entries)}")
	        print(f" - Total Groq feedback: {total_groq_feedback}")
	        print(f" - High-quality Groq examples: {len(high_quality_groq)}")

	        if not high_quality_groq:
	            print("❌ No high-quality Groq training data found.")
	            print("💡 Make sure you have Groq-generated content with high-quality feedback:")
	            print(" - Generated by Groq model")
	            print(f" - Clarity >= {MIN_CLARITY}")
	            print(f" - Depth >= {MIN_DEPTH}")
	            print(" - Complexity = 'Just right'")
	            print(f" - Comments length >= {MIN_COMMENT_LENGTH} characters")
	            return False

	        _write_training_jsonl(high_quality_groq, output_file)
	        print(f"✅ Successfully exported {len(high_quality_groq)} Groq training examples to {output_file}")

	        _print_export_breakdown(high_quality_groq)
	        return True

	    except Exception as e:
	        # Top-level boundary of the script: report the failure and signal
	        # it through the return value instead of propagating.
	        print(f"❌ Error exporting training data: {e}")
	        import traceback
	        traceback.print_exc()
	        return False

	    finally:
	        session.close()

	# Script entry point: run the export with its default output path.
	if __name__ == "__main__":
	export_training_data_from_db()