from flask import Flask, request, render_template from transformers import pipeline from sklearn.pipeline import Pipeline from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.svm import LinearSVC from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier import polars as pl import joblib from pathlib import Path import logging import os from time import perf_counter from typing import Optional, Tuple logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) app = Flask(__name__) CLASS_ID_TO_SENTIMENT = { "0": "negative", "1": "neutral", "2": "positive" } def categorize_probability(probability: Optional[float]) -> Tuple[str, str, str]: """ Map a probability (0-1) to a qualitative label and associated CSS modifier. Returns (label, css_class, display_value). """ if probability is None: return ("Unknown", "probability-unknown", "N/A") percent = max(0.0, min(probability * 100.0, 100.0)) if percent >= 80: return ("Definitely", "probability-definitely", f"{percent:.0f}%") if percent >= 60: return ("Probably", "probability-probably", f"{percent:.0f}%") return ("Maybe", "probability-maybe", f"{percent:.0f}%") PRESET_TEXTS = [ "flower isn't beautiful", "there is no more love. only pain.", "one isn't a beauty, but two is a wondrous wonder", "hvl is a fake university #uibforever" ] # Use HF Spaces persistent storage if available, otherwise local cache CACHE_DIR = Path(os.getenv("HF_HOME", ".")) / ".model_cache" CACHE_DIR.mkdir(exist_ok=True) logger.info("Loading BERTweet from HuggingFace Hub...") bertweet_pipeline = pipeline("sentiment-analysis", model="kluvin/bertweet-tweet-sentiment") logger.info("BERTweet loaded successfully") # Define model configurations model_configs = { "Decision Tree": Pipeline([ ("tfidf", TfidfVectorizer(max_features=2000, stop_words="english")), ("clf", DecisionTreeClassifier(max_depth=10, random_state=42)) ]), "Random Forest": Pipeline([ ("tfidf", TfidfVectorizer(max_features=500, stop_words="english")), ("clf", RandomForestClassifier(n_estimators=100, random_state=42)) ]), "Logistic Regression": Pipeline([ ("tfidf", TfidfVectorizer(max_features=2000, stop_words="english")), ("clf", LogisticRegression(max_iter=1000, random_state=42)) ]), "Linear SVM": Pipeline([ ("tfidf", TfidfVectorizer(max_features=2000, stop_words="english")), ("clf", LinearSVC(random_state=42)) ]) } sklearn_pipelines = {} cache_file = CACHE_DIR / "ml_models.joblib" if cache_file.exists(): logger.info("Loading cached ML models...") try: sklearn_pipelines = joblib.load(cache_file) logger.info("✓ Cached models loaded successfully!") except Exception as e: logger.error(f"Failed to load cache: {e}") logger.info("Will retrain models...") if not sklearn_pipelines: logger.info("Loading training data and training ML models...") splits = {'train': 'train.jsonl'} df = pl.read_ndjson('hf://datasets/SetFit/tweet_sentiment_extraction/' + splits['train']) X_train = df['text'].to_list() y_train = df['label'].to_list() logger.info("Training models...") for model_name, sklearn_pipeline in model_configs.items(): logger.info(f" Training {model_name}...") sklearn_pipeline.fit(X_train, y_train) sklearn_pipelines[model_name] = sklearn_pipeline logger.info("Saving models to cache...") joblib.dump(sklearn_pipelines, cache_file) logger.info(f"✓ Models cached at {cache_file}") logger.info("All models loaded and ready!") def render_model_result(model_name: str, sentiment_name: str, probability: float | None) -> str: probability_label, probability_css, probability_value = categorize_probability(probability) return f'''
{sentiment_name.capitalize()}
{probability_label} {probability_value}