tailored / export_training_data_from_db.py
ibraheem007's picture
Upload 5 files
c2b1f56 verified
from db.connection import SessionLocal
from db.models import ContentHistory, Feedback
from sqlalchemy.orm import joinedload
import os
import json
# Thresholds a feedback row must meet to be exported as fine-tuning data.
MIN_CLARITY = 4  # lowest accepted clarity rating
MIN_DEPTH = 4  # lowest accepted depth rating
MIN_COMMENT_LENGTH = 25  # minimum length of the stripped comment, in characters


def is_high_quality(feedback, content_entry):
    """Check if feedback meets high quality criteria for Groq content (fine-tuning data).

    The checks run in order and stop at the first failure; each failure
    prints the reason to stdout.

    Args:
        feedback: Object exposing ``clarity``, ``depth``, ``complexity`` and
            ``comments``.
        content_entry: Object exposing ``generated_model``.

    Returns:
        True when the content was generated by ``"groq"``, clarity and depth
        reach ``MIN_CLARITY`` / ``MIN_DEPTH``, complexity is ``"Just right"``
        and the stripped comment has at least ``MIN_COMMENT_LENGTH``
        characters; False otherwise. A missing (``None``) clarity or depth
        rating counts as a failure.
    """
    # Only use Groq content for fine-tuning (the established model)
    if content_entry.generated_model != "groq":
        print(f"❌ Skipping - not Groq content: {content_entry.generated_model}")
        return False

    # Quality criteria for fine-tuning data.
    # None is tested first because comparing None with an int raises
    # TypeError, which would abort the caller's whole export run.
    clarity = feedback.clarity
    if clarity is None or clarity < MIN_CLARITY:
        print(f"❌ Clarity too low: {clarity} < {MIN_CLARITY}")
        return False

    depth = feedback.depth
    if depth is None or depth < MIN_DEPTH:
        print(f"❌ Depth too low: {depth} < {MIN_DEPTH}")
        return False

    if feedback.complexity != "Just right":
        print(f"❌ Complexity not 'Just right': {feedback.complexity}")
        return False

    # A missing comment is treated like an empty one.
    comment_text = (feedback.comments or "").strip()
    if len(comment_text) < MIN_COMMENT_LENGTH:
        print(f"❌ Comment too short: {len(comment_text)} < {MIN_COMMENT_LENGTH}")
        return False

    print(f"✅ High-quality Groq feedback for fine-tuning: clarity={clarity}, depth={depth}")
    return True
def format_training_example(entry, feedback):
    """Build one instruction-tuning record from a content entry and its feedback.

    Args:
        entry: Object exposing ``user_type``, ``student_level``, ``prompt``
            and ``output``; tutor entries additionally need ``content_type``
            and ``topic``.
        feedback: Object exposing ``clarity``, ``depth``, ``complexity`` and
            ``comments``; these are copied into the record's metadata.

    Returns:
        A dict with the keys ``instruction``, ``input``, ``output`` and
        ``metadata`` for ``"student"`` and ``"tutor"`` entries, or None for
        any other ``user_type``.
    """
    role = entry.user_type
    # Bail out before reading anything else for unsupported user types.
    if role not in ("student", "tutor"):
        return None

    level = entry.student_level
    # Feedback fields shared by both record flavours; they close the metadata.
    ratings = {
        "clarity_score": feedback.clarity,
        "depth_score": feedback.depth,
        "complexity": feedback.complexity,
        "comments": feedback.comments,
    }

    if role == "student":
        instruction = (
            f"Simplify the following content for a {level} student: {entry.prompt.strip()}"
        )
        context = f"Student Level: {level}"
        metadata = {"user_type": "student", "student_level": level, **ratings}
    else:
        instruction = (
            f"Create a {entry.content_type} about '{entry.topic}' for {level} students."
        )
        context = f"Learning Objectives: {entry.prompt}"
        metadata = {
            "user_type": "tutor",
            "content_type": entry.content_type,
            "topic": entry.topic,
            "student_level": level,
            **ratings,
        }

    return {
        "instruction": instruction,
        "input": context,
        "output": entry.output.strip(),
        "metadata": metadata,
    }
def export_training_data_from_db(output_file="data/training/phi3_fine_tuning_data.jsonl"):
    """Export Groq content with high-quality feedback for Phi-3 fine-tuning.

    Loads every ``ContentHistory`` row together with its feedback, keeps the
    (entry, feedback) pairs accepted by ``is_high_quality`` for which
    ``format_training_example`` returns a record, and writes them to
    ``output_file`` as JSON Lines. Each written line carries only the keys
    ``instruction``, ``input`` and ``output``; the metadata is used for the
    printed statistics but is not written. Progress is printed to stdout.

    Args:
        output_file: Destination path of the JSONL file. A missing parent
            directory is created; a bare filename is written to the current
            working directory.

    Returns:
        True when at least one example was written. False when no feedback
        qualified, or when an exception was raised while querying or writing
        (the traceback is printed in that case).

    Raises:
        OSError: If the output directory cannot be created. Directory creation
            and session creation happen before the guarded section, so their
            errors propagate to the caller.
    """
    print("🔧 Exporting Groq training data for Phi-3 fine-tuning...")

    # os.makedirs("") raises FileNotFoundError, so only create the directory
    # when the path actually has a directory component.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    session = SessionLocal()
    try:
        # Get all content entries with their feedback
        entries = session.query(ContentHistory).options(joinedload(ContentHistory.feedback)).all()
        total_entries = len(entries)
        print(f"📊 Found {total_entries} total content entries")

        high_quality_groq = []
        total_groq_feedback = 0
        for position, entry in enumerate(entries, start=1):
            feedback_list = entry.feedback
            print(f"🔍 Checking entry {position}/{total_entries}: model={entry.generated_model}, user_type={entry.user_type}, feedback_count={len(feedback_list)}")
            for feedback in feedback_list:
                # Count all Groq feedback for statistics
                if entry.generated_model == "groq":
                    total_groq_feedback += 1
                    print(f" 📝 Groq Feedback {total_groq_feedback}: clarity={feedback.clarity}, depth={feedback.depth}")
                # Only export high-quality Groq feedback (for fine-tuning Phi-3)
                if is_high_quality(feedback, entry):
                    example = format_training_example(entry, feedback)
                    if example:
                        high_quality_groq.append(example)
                        print(" ✅ Added Groq training example")

        print("📈 Export Summary:")
        print(f" - Total entries checked: {total_entries}")
        print(f" - Total Groq feedback: {total_groq_feedback}")
        print(f" - High-quality Groq examples: {len(high_quality_groq)}")

        if not high_quality_groq:
            print("❌ No high-quality Groq training data found.")
            print("💡 Make sure you have Groq-generated content with high-quality feedback:")
            print(" - Generated by Groq model")
            print(f" - Clarity >= {MIN_CLARITY}")
            print(f" - Depth >= {MIN_DEPTH}")
            print(" - Complexity = 'Just right'")
            print(f" - Comments length >= {MIN_COMMENT_LENGTH} characters")
            return False

        # Write to JSONL file (without metadata for training)
        with open(output_file, "w", encoding="utf-8") as f:
            for item in high_quality_groq:
                # Remove metadata for actual training
                training_item = {
                    "instruction": item["instruction"],
                    "input": item["input"],
                    "output": item["output"],
                }
                f.write(json.dumps(training_item, ensure_ascii=False) + "\n")
        print(f"✅ Successfully exported {len(high_quality_groq)} Groq training examples to {output_file}")

        # Show detailed breakdown. Classify by the recorded user type rather
        # than by searching the instruction text: a student prompt may contain
        # "Create a" and a tutor topic may contain "Simplify".
        student_examples = sum(
            1 for e in high_quality_groq if e["metadata"]["user_type"] == "student"
        )
        tutor_examples = sum(
            1 for e in high_quality_groq if e["metadata"]["user_type"] == "tutor"
        )
        print(f"📊 Breakdown: {student_examples} student examples, {tutor_examples} tutor examples")

        print("📝 Sample training example:")
        sample = high_quality_groq[0]
        print(json.dumps({
            "instruction": sample["instruction"][:100] + "...",
            "input": sample["input"],
            "output": sample["output"][:100] + "...",
        }, indent=2, ensure_ascii=False))
        return True
    except Exception as e:
        # Top-level boundary of the export: report the failure and signal it
        # through the return value instead of propagating.
        print(f"❌ Error exporting training data: {e}")
        import traceback
        traceback.print_exc()
        return False
    finally:
        # Always release the session, on success and on failure alike.
        session.close()
if __name__ == "__main__":
    # Script entry point: run the export with the default output path.
    export_training_data_from_db()