import json
import os
import re
from datetime import datetime

from db.helpers import get_research_stats


def save_feedback(prompt, output, clarity, depth, complexity, comments, user_type=None, student_level=None):
    """
    Save user feedback to a JSONL file with additional metadata
    """
    # Create the feedback directory if it doesn't exist
    os.makedirs("data/feedback", exist_ok=True)

    feedback_data = {
        "timestamp": datetime.now().isoformat(),
        "prompt": prompt,
        "output": output,
        "feedback": {
            "clarity": clarity,
            "depth": depth,
            "complexity": complexity,
            "comments": comments
        },
        "metadata": {
            "user_type": user_type,
            "student_level": student_level
        }
    }

    # Append to the JSONL file (one JSON object per line)
    feedback_file = "data/feedback/feedback.jsonl"
    try:
        with open(feedback_file, "a", encoding="utf-8") as f:
            f.write(json.dumps(feedback_data, ensure_ascii=False) + "\n")
        print(f"✅ Feedback saved to {feedback_file}")
        return True
    except Exception as e:
        print(f"❌ Error saving feedback: {e}")
        return False
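
# A minimal usage sketch. The example values are illustrative; a 1-5 rating
# scale is an assumption inferred from the thresholds in
# is_high_quality_feedback below, where 4+ counts as high quality:
#
#   save_feedback(
#       prompt="Explain photosynthesis",
#       output="Photosynthesis converts light energy into ...",
#       clarity=5,
#       depth=4,
#       complexity="Just right",
#       comments="Clear and well paced for my level.",
#       user_type="student",
#       student_level="Undergraduate",
#   )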

def load_feedback_data():
    """Load all feedback data for analysis"""
    feedback_file = "data/feedback/feedback.jsonl"
    if not os.path.exists(feedback_file):
        return []

    feedback_data = []
    try:
        with open(feedback_file, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    feedback_data.append(json.loads(line.strip()))
        return feedback_data
    except Exception as e:
        print(f"❌ Error loading feedback data: {e}")
        return []


def get_feedback_stats():
    """Get basic statistics about collected feedback"""
    feedback_data = load_feedback_data()

    if not feedback_data:
        return {
            "total_feedback": 0,
            "average_clarity": 0,
            "average_depth": 0,
            "complexity_distribution": {},
            "user_type_distribution": {}
        }

    total = len(feedback_data)
    clarity_sum = 0
    depth_sum = 0
    complexity_counts = {}
    user_type_counts = {}

    for entry in feedback_data:
        clarity_sum += entry["feedback"]["clarity"]
        depth_sum += entry["feedback"]["depth"]

        complexity = entry["feedback"]["complexity"]
        complexity_counts[complexity] = complexity_counts.get(complexity, 0) + 1

        user_type = entry["metadata"].get("user_type", "unknown")
        user_type_counts[user_type] = user_type_counts.get(user_type, 0) + 1

    return {
        "total_feedback": total,
        "average_clarity": round(clarity_sum / total, 2) if total > 0 else 0,
        "average_depth": round(depth_sum / total, 2) if total > 0 else 0,
        "complexity_distribution": complexity_counts,
        "user_type_distribution": user_type_counts
    }
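
# Illustrative return shape (hypothetical numbers, for reference only):
#
#   get_feedback_stats()
#   # {"total_feedback": 42, "average_clarity": 4.31, "average_depth": 4.05,
#   #  "complexity_distribution": {"Just right": 30, "Too complex": 12},
#   #  "user_type_distribution": {"student": 35, "teacher": 7}}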

def is_high_quality_feedback(feedback_entry):
    """
    Simple length-based quality filter, applied after removing emojis.
    Only high-rated, "Just right" feedback is kept for training.
    """
    feedback = feedback_entry["feedback"]

    # Quality thresholds
    MIN_CLARITY = 4
    MIN_DEPTH = 4
    MIN_COMMENT_LENGTH = 25  # Substantive comments after emoji removal
    MIN_WORD_COUNT = 4       # Minimum words for substance

    # Check ratings (must be high quality)
    if feedback["clarity"] < MIN_CLARITY or feedback["depth"] < MIN_DEPTH:
        return False

    # Check complexity (we want "Just right" examples to replicate)
    if feedback["complexity"] != "Just right":
        return False

    # Check comments if provided
    comments = feedback.get("comments", "").strip()
    if comments:
        # Remove emojis first, then check length. The ranges cover the main
        # emoji blocks (emoticons, symbols and pictographs, transport, flags,
        # misc symbols, supplemental symbols) plus the variation selector
        # that trails presentation-style emoji like the red heart.
        emoji_pattern = re.compile(
            r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF'
            r'\U0001F1E0-\U0001F1FF\U00002600-\U000027BF\U0001F900-\U0001F9FF'
            r'\U0001F018-\U0001F270\uFE0F]',
            flags=re.UNICODE
        )
        text_without_emojis = emoji_pattern.sub('', comments).strip()

        # Apply the length check on the cleaned text
        if len(text_without_emojis) < MIN_COMMENT_LENGTH:
            return False

        # Check word count for minimal substance
        word_count = len(text_without_emojis.split())
        if word_count < MIN_WORD_COUNT:
            return False

    return True
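
# A quick illustrative check (hypothetical entry, shaped like the records
# that save_feedback writes):
#
#   entry = {"feedback": {"clarity": 5, "depth": 4, "complexity": "Just right",
#                         "comments": "Great walkthrough, each step was explained clearly."}}
#   is_high_quality_feedback(entry)   # True
#
#   entry["feedback"]["complexity"] = "Too complex"
#   is_high_quality_feedback(entry)   # False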

def prepare_training_data():
    """
    Prepare high-quality feedback for model fine-tuning
    Returns structured training examples
    """
    all_feedback = load_feedback_data()
    training_examples = []
    skipped_count = 0

    for feedback in all_feedback:
        if is_high_quality_feedback(feedback):
            # Create a training example from the high-quality feedback
            training_example = {
                "instruction": feedback["prompt"],
                "input": f"Student Level: {feedback['metadata'].get('student_level', 'Unknown')}",
                "output": feedback["output"],
                "metadata": {
                    "user_type": feedback["metadata"].get("user_type"),
                    "clarity_score": feedback["feedback"]["clarity"],
                    "depth_score": feedback["feedback"]["depth"],
                    "comments": feedback["feedback"].get("comments", "")
                }
            }
            training_examples.append(training_example)
        else:
            skipped_count += 1

    print(f"✅ Prepared {len(training_examples)} training examples (skipped {skipped_count} low-quality)")
    return training_examples


def get_training_data_stats():
    """
    Get statistics about prepared training data
    """
    training_data = prepare_training_data()

    if not training_data:
        return {
            "total_training_examples": 0,
            "user_type_breakdown": {},
            "average_scores": {"clarity": 0, "depth": 0}
        }

    user_type_counts = {}
    clarity_sum = 0
    depth_sum = 0

    for example in training_data:
        user_type = example["metadata"].get("user_type", "unknown")
        user_type_counts[user_type] = user_type_counts.get(user_type, 0) + 1
        clarity_sum += example["metadata"]["clarity_score"]
        depth_sum += example["metadata"]["depth_score"]

    return {
        "total_training_examples": len(training_data),
        "user_type_breakdown": user_type_counts,
        "average_scores": {
            "clarity": round(clarity_sum / len(training_data), 2),
            "depth": round(depth_sum / len(training_data), 2)
        }
    }


def export_training_data(output_file="data/training/training_data.jsonl"):
    """
    Export filtered training data to file for fine-tuning
    """
    training_data = prepare_training_data()

    if not training_data:
        print("❌ No high-quality training data available")
        return False

    # Create the output directory if it doesn't exist
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    try:
        with open(output_file, "w", encoding="utf-8") as f:
            for example in training_data:
                # Strip metadata; only instruction/input/output are used for training
                training_example = {
                    "instruction": example["instruction"],
                    "input": example["input"],
                    "output": example["output"]
                }
                f.write(json.dumps(training_example, ensure_ascii=False) + "\n")
        print(f"✅ Exported {len(training_data)} training examples to {output_file}")
        return True
    except Exception as e:
        print(f"❌ Error exporting training data: {e}")
        return False


def get_research_progress():
    """Fetch research progress from PostgreSQL"""
    stats = get_research_stats()
    return {
        "total_feedback": stats["total_feedback"],
        "high_quality_examples": stats["high_quality_feedback"],
        "conversion_rate": stats["conversion_rate"],
        "average_quality": stats["average_scores"],
        "user_breakdown": stats["user_type_breakdown"]
    }
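

if __name__ == "__main__":
    # Minimal end-to-end sketch of the pipeline: summarize the collected
    # feedback, then export whatever passes the quality filter.
    # get_research_progress() is left out here because it requires a live
    # PostgreSQL connection behind db.helpers.get_research_stats.
    stats = get_feedback_stats()
    print(f"Collected feedback entries: {stats['total_feedback']}")
    if stats["total_feedback"] > 0:
        print(f"Training data stats: {get_training_data_stats()}")
        export_training_data()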