# File size: 7,029 Bytes
# c2b1f56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from db.connection import SessionLocal  
from db.models import ContentHistory, Feedback
from sqlalchemy.orm import joinedload
import os
import json

# Acceptance thresholds read by is_high_quality() when filtering feedback.
# Lowest clarity rating a feedback record may have and still be accepted.
MIN_CLARITY = 4
# Lowest depth rating a feedback record may have and still be accepted.
MIN_DEPTH = 4
# Minimum length, in characters, of the whitespace-stripped feedback comment.
MIN_COMMENT_LENGTH = 25

def is_high_quality(feedback, content_entry):
    """Decide whether a feedback record qualifies its content as fine-tuning data.

    The checks run in order and stop at the first failure, printing the reason:

    1. ``content_entry.generated_model`` must equal ``"groq"``.
    2. ``feedback.clarity`` must be present and at least ``MIN_CLARITY``.
    3. ``feedback.depth`` must be present and at least ``MIN_DEPTH``.
    4. ``feedback.complexity`` must equal ``"Just right"``.
    5. ``feedback.comments``, stripped of surrounding whitespace, must be at
       least ``MIN_COMMENT_LENGTH`` characters long (``None`` counts as empty).

    Args:
        feedback: Object exposing ``clarity``, ``depth``, ``complexity`` and
            ``comments`` attributes.
        content_entry: Object exposing a ``generated_model`` attribute.

    Returns:
        ``True`` when every criterion is met, otherwise ``False``.
    """
    # Only use Groq content for fine-tuning (the established model)
    if content_entry.generated_model != "groq":
        print(f"❌ Skipping - not Groq content: {content_entry.generated_model}")
        return False

    # A missing rating cannot be compared with a threshold (`None < 4` raises
    # TypeError), so reject it explicitly instead of crashing the caller.
    clarity = feedback.clarity
    if clarity is None:
        print("❌ Clarity rating missing")
        return False
    if clarity < MIN_CLARITY:
        print(f"❌ Clarity too low: {clarity} < {MIN_CLARITY}")
        return False

    depth = feedback.depth
    if depth is None:
        print("❌ Depth rating missing")
        return False
    if depth < MIN_DEPTH:
        print(f"❌ Depth too low: {depth} < {MIN_DEPTH}")
        return False

    if feedback.complexity != "Just right":
        print(f"❌ Complexity not 'Just right': {feedback.complexity}")
        return False

    # Comments may be None; treat that as an empty comment.
    comment_text = (feedback.comments or "").strip()
    if len(comment_text) < MIN_COMMENT_LENGTH:
        print(f"❌ Comment too short: {len(comment_text)} < {MIN_COMMENT_LENGTH}")
        return False

    print(f"✅ High-quality Groq feedback for fine-tuning: clarity={clarity}, depth={depth}")
    return True

def format_training_example(entry, feedback):
    """Build one instruction-tuning record from a content entry and its feedback.

    Args:
        entry: Object exposing ``user_type``, ``student_level``, ``prompt`` and
            ``output``; tutor entries additionally need ``content_type`` and
            ``topic``.
        feedback: Object exposing ``clarity``, ``depth``, ``complexity`` and
            ``comments``; these are copied into the record's metadata.

    Returns:
        A dict with ``"instruction"``, ``"input"``, ``"output"`` and
        ``"metadata"`` keys, or ``None`` when ``entry.user_type`` is neither
        ``"student"`` nor ``"tutor"`` or when the entry has no output text.
    """
    # Stored text may be None; treat it as empty rather than crashing on
    # .strip(), which would abort a whole export for a single bad row.
    output_text = (entry.output or "").strip()
    if not output_text:
        # A record without a target completion has nothing to train on.
        return None

    prompt_text = entry.prompt or ""

    # Feedback-derived metadata is identical for both user types.
    ratings = {
        "clarity_score": feedback.clarity,
        "depth_score": feedback.depth,
        "complexity": feedback.complexity,
        "comments": feedback.comments,
    }

    if entry.user_type == "student":
        return {
            "instruction": (
                f"Simplify the following content for a {entry.student_level} "
                f"student: {prompt_text.strip()}"
            ),
            "input": f"Student Level: {entry.student_level}",
            "output": output_text,
            "metadata": {
                "user_type": "student",
                "student_level": entry.student_level,
                **ratings,
            },
        }

    if entry.user_type == "tutor":
        return {
            "instruction": (
                f"Create a {entry.content_type} about '{entry.topic}' "
                f"for {entry.student_level} students."
            ),
            # The tutor prompt is passed through unstripped, as before.
            "input": f"Learning Objectives: {prompt_text}",
            "output": output_text,
            "metadata": {
                "user_type": "tutor",
                "content_type": entry.content_type,
                "topic": entry.topic,
                "student_level": entry.student_level,
                **ratings,
            },
        }

    return None

def export_training_data_from_db(output_file="data/training/phi3_fine_tuning_data.jsonl"):
    """Export Groq content with high-quality feedback as JSONL for Phi-3 fine-tuning.

    Loads every ``ContentHistory`` row together with its feedback, keeps the
    (entry, feedback) pairs accepted by ``is_high_quality``, formats them with
    ``format_training_example`` and writes one JSON object per line holding
    only the ``instruction``, ``input`` and ``output`` keys. Progress and a
    summary are printed along the way.

    Args:
        output_file: Destination path of the JSONL file. Its parent directory
            is created when missing; an existing file is overwritten.

    Returns:
        ``True`` when at least one example was written, ``False`` when no
        example qualified or when an exception occurred during the export
        (the traceback is printed in that case).
    """
    print("🔧 Exporting Groq training data for Phi-3 fine-tuning...")

    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when the path names one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    session = SessionLocal()
    try:
        # Get all content entries with their feedback
        entries = session.query(ContentHistory).options(joinedload(ContentHistory.feedback)).all()
        total_entries = len(entries)
        print(f"📊 Found {total_entries} total content entries")

        high_quality_groq = []
        total_groq_feedback = 0

        for position, entry in enumerate(entries, start=1):
            feedback_list = entry.feedback
            print(f"🔍 Checking entry {position}/{total_entries}: model={entry.generated_model}, user_type={entry.user_type}, feedback_count={len(feedback_list)}")

            for feedback in feedback_list:
                # Count all Groq feedback for statistics
                if entry.generated_model == "groq":
                    total_groq_feedback += 1
                    print(f"  📝 Groq Feedback {total_groq_feedback}: clarity={feedback.clarity}, depth={feedback.depth}")

                # Only export high-quality Groq feedback (for fine-tuning Phi-3)
                if not is_high_quality(feedback, entry):
                    continue
                example = format_training_example(entry, feedback)
                if example:
                    high_quality_groq.append(example)
                    print("  ✅ Added Groq training example")

        print("📈 Export Summary:")
        print(f"   - Total entries checked: {total_entries}")
        print(f"   - Total Groq feedback: {total_groq_feedback}")
        print(f"   - High-quality Groq examples: {len(high_quality_groq)}")

        if not high_quality_groq:
            print("❌ No high-quality Groq training data found.")
            print("💡 Make sure you have Groq-generated content with high-quality feedback:")
            print("   - Generated by Groq model")
            print(f"   - Clarity >= {MIN_CLARITY}")
            print(f"   - Depth >= {MIN_DEPTH}")
            print("   - Complexity = 'Just right'")
            print(f"   - Comments length >= {MIN_COMMENT_LENGTH} characters")
            return False

        # Write to JSONL file; metadata is dropped because it is not training input
        with open(output_file, "w", encoding="utf-8") as f:
            for item in high_quality_groq:
                training_item = {
                    "instruction": item["instruction"],
                    "input": item["input"],
                    "output": item["output"],
                }
                f.write(json.dumps(training_item, ensure_ascii=False) + "\n")

        print(f"✅ Successfully exported {len(high_quality_groq)} Groq training examples to {output_file}")

        # Classify by the recorded user type rather than by searching the
        # instruction text, which embeds user-supplied prompts and topics.
        student_examples = sum(
            1 for e in high_quality_groq if e["metadata"]["user_type"] == "student"
        )
        tutor_examples = sum(
            1 for e in high_quality_groq if e["metadata"]["user_type"] == "tutor"
        )
        print(f"📊 Breakdown: {student_examples} student examples, {tutor_examples} tutor examples")

        print("📝 Sample training example:")
        sample = high_quality_groq[0]
        print(json.dumps({
            "instruction": sample["instruction"][:100] + "...",
            "input": sample["input"],
            "output": sample["output"][:100] + "..."
        }, indent=2, ensure_ascii=False))

        return True

    except Exception as e:
        # Top-level boundary: report the failure and signal it through the return value
        print(f"❌ Error exporting training data: {e}")
        import traceback
        traceback.print_exc()
        return False

    finally:
        session.close()

# Script entry point: when run directly, export to the default output path.
# The boolean result of the export is not used here.
if __name__ == "__main__":
    export_training_data_from_db()