Spaces:
Running
Running
File size: 7,029 Bytes
c2b1f56 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
from db.connection import SessionLocal
from db.models import ContentHistory, Feedback
from sqlalchemy.orm import joinedload
import os
import json
# Quality thresholds used by is_high_quality() to select fine-tuning data.
# Feedback whose `clarity` score is below this value is rejected.
MIN_CLARITY = 4
# Feedback whose `depth` score is below this value is rejected.
MIN_DEPTH = 4
# Feedback whose stripped `comments` text is shorter than this many
# characters is rejected.
MIN_COMMENT_LENGTH = 25
def is_high_quality(feedback, content_entry):
    """Check whether feedback on Groq content qualifies as fine-tuning data.

    A feedback row qualifies only when every condition holds:

    * ``content_entry.generated_model`` equals ``"groq"``;
    * ``feedback.clarity`` is at least ``MIN_CLARITY``;
    * ``feedback.depth`` is at least ``MIN_DEPTH``;
    * ``feedback.complexity`` equals ``"Just right"``;
    * ``feedback.comments``, stripped of surrounding whitespace, has at
      least ``MIN_COMMENT_LENGTH`` characters (``None`` counts as empty).

    A missing (``None``) clarity or depth score is treated as failing the
    threshold instead of raising ``TypeError`` on the comparison.

    Args:
        feedback: Object exposing ``clarity``, ``depth``, ``complexity`` and
            ``comments`` attributes.
        content_entry: Object exposing a ``generated_model`` attribute.

    Returns:
        ``True`` when all criteria pass, otherwise ``False``. The reason for
        the decision is printed to stdout in both cases.
    """
    # NOTE(review): the leading marker characters in these messages look
    # mis-encoded (possibly emoji originally) - confirm the file's encoding.

    # Only Groq-generated content is used as fine-tuning data.
    if content_entry.generated_model != "groq":
        print(f"β Skipping - not Groq content: {content_entry.generated_model}")
        return False

    clarity = feedback.clarity
    if clarity is None or clarity < MIN_CLARITY:
        print(f"β Clarity too low: {clarity} < {MIN_CLARITY}")
        return False

    depth = feedback.depth
    if depth is None or depth < MIN_DEPTH:
        print(f"β Depth too low: {depth} < {MIN_DEPTH}")
        return False

    if feedback.complexity != "Just right":
        print(f"β Complexity not 'Just right': {feedback.complexity}")
        return False

    comment_text = (feedback.comments or "").strip()
    if len(comment_text) < MIN_COMMENT_LENGTH:
        print(f"β Comment too short: {len(comment_text)} < {MIN_COMMENT_LENGTH}")
        return False

    print(
        "β High-quality Groq feedback for fine-tuning: "
        f"clarity={clarity}, depth={depth}"
    )
    return True
def format_training_example(entry, feedback):
    """Build one instruction-tuning example from a content entry and feedback.

    The example shape depends on ``entry.user_type``:

    * ``"student"``: the instruction asks to simplify the stripped
      ``entry.prompt`` for the entry's student level.
    * ``"tutor"``: the instruction asks to create ``entry.content_type``
      about ``entry.topic``; ``entry.prompt`` is passed as the learning
      objectives.

    Args:
        entry: Object exposing ``user_type``, ``student_level``, ``prompt``
            and ``output``; tutor entries also need ``content_type`` and
            ``topic``.
        feedback: Object exposing ``clarity``, ``depth``, ``complexity`` and
            ``comments``; copied into the example's ``metadata``.

    Returns:
        A dict with ``instruction``, ``input``, ``output`` and ``metadata``
        keys, or ``None`` when the user type is not recognised or the entry
        has no usable output text.
    """
    # A missing output used to crash on `.strip()`, and a blank one gives an
    # example with nothing to learn from; either way there is nothing to emit.
    output_text = (entry.output or "").strip()
    if not output_text:
        return None

    # Treat a missing prompt as empty instead of failing on `.strip()`.
    prompt_text = entry.prompt or ""

    feedback_metadata = {
        "clarity_score": feedback.clarity,
        "depth_score": feedback.depth,
        "complexity": feedback.complexity,
        "comments": feedback.comments,
    }

    if entry.user_type == "student":
        return {
            "instruction": (
                f"Simplify the following content for a {entry.student_level} "
                f"student: {prompt_text.strip()}"
            ),
            "input": f"Student Level: {entry.student_level}",
            "output": output_text,
            "metadata": {
                "user_type": "student",
                "student_level": entry.student_level,
                **feedback_metadata,
            },
        }

    if entry.user_type == "tutor":
        return {
            "instruction": (
                f"Create a {entry.content_type} about '{entry.topic}' "
                f"for {entry.student_level} students."
            ),
            "input": f"Learning Objectives: {prompt_text}",
            "output": output_text,
            "metadata": {
                "user_type": "tutor",
                "content_type": entry.content_type,
                "topic": entry.topic,
                "student_level": entry.student_level,
                **feedback_metadata,
            },
        }

    return None
def export_training_data_from_db(output_file="data/training/phi3_fine_tuning_data.jsonl"):
    """Export Groq content with high-quality feedback as JSONL training data.

    Loads every ``ContentHistory`` row together with its feedback, keeps the
    (entry, feedback) pairs accepted by ``is_high_quality`` and formatted by
    ``format_training_example``, and writes one JSON object per line with
    only the ``instruction``, ``input`` and ``output`` keys. Progress and a
    summary are printed to stdout.

    Args:
        output_file: Destination path of the JSONL file. Its parent
            directory is created when the path has one.

    Returns:
        ``True`` when at least one example was written; ``False`` when no
        example qualified or when an exception occurred during the export
        (the traceback is printed and the exception is not re-raised).
    """
    # NOTE(review): the leading marker characters in these messages look
    # mis-encoded (possibly emoji originally) - confirm the file's encoding.
    print("π§ Exporting Groq training data for Phi-3 fine-tuning...")

    # A bare filename has an empty dirname and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    session = SessionLocal()
    try:
        # Load all content entries with their feedback in one query.
        entries = (
            session.query(ContentHistory)
            .options(joinedload(ContentHistory.feedback))
            .all()
        )
        total_entries = len(entries)
        print(f"π Found {total_entries} total content entries")

        high_quality_groq = []
        total_groq_feedback = 0

        for entry_number, entry in enumerate(entries, start=1):
            # Tolerate an unset relationship instead of failing on len(None).
            feedback_list = entry.feedback or []
            print(
                f"π Checking entry {entry_number}/{total_entries}: "
                f"model={entry.generated_model}, user_type={entry.user_type}, "
                f"feedback_count={len(feedback_list)}"
            )
            is_groq_entry = entry.generated_model == "groq"

            for feedback in feedback_list:
                # Count all Groq feedback for the summary statistics.
                if is_groq_entry:
                    total_groq_feedback += 1
                    print(
                        f" π Groq Feedback {total_groq_feedback}: "
                        f"clarity={feedback.clarity}, depth={feedback.depth}"
                    )

                # Only high-quality Groq feedback becomes a training example.
                if not is_high_quality(feedback, entry):
                    continue
                example = format_training_example(entry, feedback)
                if example:
                    high_quality_groq.append(example)
                    print(" β Added Groq training example")

        print("π Export Summary:")
        print(f" - Total entries checked: {total_entries}")
        print(f" - Total Groq feedback: {total_groq_feedback}")
        print(f" - High-quality Groq examples: {len(high_quality_groq)}")

        if not high_quality_groq:
            print("β No high-quality Groq training data found.")
            print("π‘ Make sure you have Groq-generated content with high-quality feedback:")
            print(" - Generated by Groq model")
            print(f" - Clarity >= {MIN_CLARITY}")
            print(f" - Depth >= {MIN_DEPTH}")
            print(" - Complexity = 'Just right'")
            print(f" - Comments length >= {MIN_COMMENT_LENGTH} characters")
            return False

        # Write the JSONL file; metadata is dropped for actual training.
        with open(output_file, "w", encoding="utf-8") as f:
            for item in high_quality_groq:
                training_item = {
                    "instruction": item["instruction"],
                    "input": item["input"],
                    "output": item["output"],
                }
                f.write(json.dumps(training_item, ensure_ascii=False) + "\n")

        print(
            f"β Successfully exported {len(high_quality_groq)} "
            f"Groq training examples to {output_file}"
        )

        # Count by the recorded user type: matching words in the instruction
        # miscounts a student prompt that happens to contain "Create a".
        student_examples = sum(
            1 for item in high_quality_groq
            if item["metadata"]["user_type"] == "student"
        )
        tutor_examples = sum(
            1 for item in high_quality_groq
            if item["metadata"]["user_type"] == "tutor"
        )
        print(f"π Breakdown: {student_examples} student examples, {tutor_examples} tutor examples")

        # Show a truncated preview of the first exported example.
        print("π Sample training example:")
        sample = high_quality_groq[0]
        print(json.dumps({
            "instruction": sample["instruction"][:100] + "...",
            "input": sample["input"],
            "output": sample["output"][:100] + "...",
        }, indent=2, ensure_ascii=False))
        return True
    except Exception as e:
        # Script-level boundary: report the failure and signal it through
        # the return value instead of propagating.
        print(f"β Error exporting training data: {e}")
        import traceback
        traceback.print_exc()
        return False
    finally:
        session.close()
if __name__ == "__main__":
    # Run the export with the default output path when executed as a script.
    export_training_data_from_db()