"""Flask app that classifies text sentiment with BERTweet and four scikit-learn models.

At import time the module loads the BERTweet pipeline, then either loads the
cached scikit-learn pipelines or trains them and writes the cache.
"""

import html
import logging
import os
from pathlib import Path
from time import perf_counter
from typing import Optional, Tuple

import joblib
import polars as pl
from flask import Flask, request, render_template
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from transformers import pipeline

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Maps the class ids emitted by the models (as strings) to sentiment names.
CLASS_ID_TO_SENTIMENT = {
    "0": "negative",
    "1": "neutral",
    "2": "positive",
}


def categorize_probability(probability: Optional[float]) -> Tuple[str, str, str]:
    """
    Map a probability (0-1) to a qualitative label and associated CSS modifier.

    The value is scaled to a percentage and clamped to the 0-100 range before
    it is bucketed: >= 80 is "Definitely", >= 60 is "Probably", anything lower
    is "Maybe". ``None`` yields the "Unknown" bucket.

    Returns (label, css_class, display_value).
    """
    if probability is None:
        return ("Unknown", "probability-unknown", "N/A")

    percent = max(0.0, min(probability * 100.0, 100.0))
    if percent >= 80:
        return ("Definitely", "probability-definitely", f"{percent:.0f}%")
    if percent >= 60:
        return ("Probably", "probability-probably", f"{percent:.0f}%")
    return ("Maybe", "probability-maybe", f"{percent:.0f}%")


PRESET_TEXTS = [
    "flower isn't beautiful",
    "there is no more love. only pain.",
    "one isn't a beauty, but two is a wondrous wonder",
    "hvl is a fake university #uibforever",
]

# Use HF Spaces persistent storage if available, otherwise local cache
CACHE_DIR = Path(os.getenv("HF_HOME", ".")) / ".model_cache"
# parents=True: HF_HOME may point at a directory that has not been created yet.
CACHE_DIR.mkdir(parents=True, exist_ok=True)

logger.info("Loading BERTweet from HuggingFace Hub...")
bertweet_pipeline = pipeline("sentiment-analysis", model="kluvin/bertweet-tweet-sentiment")
logger.info("BERTweet loaded successfully")

# Define model configurations
model_configs = {
    "Decision Tree": Pipeline([
        ("tfidf", TfidfVectorizer(max_features=2000, stop_words="english")),
        ("clf", DecisionTreeClassifier(max_depth=10, random_state=42)),
    ]),
    "Random Forest": Pipeline([
        ("tfidf", TfidfVectorizer(max_features=500, stop_words="english")),
        ("clf", RandomForestClassifier(n_estimators=100, random_state=42)),
    ]),
    "Logistic Regression": Pipeline([
        ("tfidf", TfidfVectorizer(max_features=2000, stop_words="english")),
        ("clf", LogisticRegression(max_iter=1000, random_state=42)),
    ]),
    "Linear SVM": Pipeline([
        ("tfidf", TfidfVectorizer(max_features=2000, stop_words="english")),
        ("clf", LinearSVC(random_state=42)),
    ]),
}

TRAIN_DATA_URL = 'hf://datasets/SetFit/tweet_sentiment_extraction/train.jsonl'


def _load_cached_pipelines(path: Path) -> dict:
    """Return the pipelines stored at *path*, or an empty dict if unavailable."""
    if not path.exists():
        return {}

    logger.info("Loading cached ML models...")
    try:
        # NOTE(security): joblib.load unpickles the file; only safe as long as
        # the cache directory is writable by trusted parties only.
        cached = joblib.load(path)
    except Exception as e:  # best effort: any failure falls back to retraining
        logger.error("Failed to load cache: %s", e)
        logger.info("Will retrain models...")
        return {}

    logger.info("✓ Cached models loaded successfully!")
    return cached


def _train_pipelines(configs: dict) -> dict:
    """Fit every pipeline in *configs* on the training split and return them by name."""
    logger.info("Loading training data and training ML models...")
    df = pl.read_ndjson(TRAIN_DATA_URL)
    X_train = df['text'].to_list()
    y_train = df['label'].to_list()

    logger.info("Training models...")
    trained = {}
    for model_name, sklearn_pipeline in configs.items():
        logger.info("  Training %s...", model_name)
        sklearn_pipeline.fit(X_train, y_train)
        trained[model_name] = sklearn_pipeline
    return trained


cache_file = CACHE_DIR / "ml_models.joblib"
sklearn_pipelines = _load_cached_pipelines(cache_file)

if not sklearn_pipelines:
    sklearn_pipelines = _train_pipelines(model_configs)

    logger.info("Saving models to cache...")
    try:
        joblib.dump(sklearn_pipelines, cache_file)
    except OSError as e:
        # The models are already trained and usable in memory; a failed cache
        # write should not prevent the app from starting.
        logger.error("Failed to write model cache at %s: %s", cache_file, e)
    else:
        logger.info("✓ Models cached at %s", cache_file)

logger.info("All models loaded and ready!")


def render_model_result(model_name: str, sentiment_name: str, probability: Optional[float]) -> str:
    """Render one model's prediction as a text fragment.

    Args:
        model_name: Display name of the model.
        sentiment_name: Predicted sentiment; it is capitalized for display.
        probability: Confidence in the 0-1 range, or ``None`` when unavailable.

    Returns:
        The model name, the capitalized sentiment and the qualitative
        probability label with its display value, separated by blank lines.
    """
    probability_label, probability_css, probability_value = categorize_probability(probability)
    return (
        "\n\n"
        f"{model_name}\n\n"
        f"{sentiment_name.capitalize()}\n\n"
        f"{probability_label} {probability_value}\n\n"
    )


def _estimate_confidence(sklearn_pipeline, inputs: list) -> Optional[float]:
    """Return a confidence score for the pipeline's prediction on *inputs*, if one can be derived."""
    classifier = sklearn_pipeline.named_steps['clf']
    if hasattr(classifier, 'predict_proba'):
        class_probabilities = sklearn_pipeline.predict_proba(inputs)[0]
        return class_probabilities.max()
    if hasattr(classifier, 'decision_function'):
        decision_scores = sklearn_pipeline.decision_function(inputs)[0]
        # NOTE(review): this is derived from the smallest decision score, so it
        # is an uncalibrated heuristic rather than a probability; confirm the
        # intended formula before relying on the displayed value.
        return 1.0 / (1.0 + abs(decision_scores.min()))
    return None


def build_results_markup(text_input: str) -> str:
    """Classify *text_input* with every loaded model and return the combined fragment.

    BERTweet is rendered first, followed by each scikit-learn pipeline in the
    iteration order of ``sklearn_pipelines``.

    Raises:
        KeyError: If a model emits a class id missing from CLASS_ID_TO_SENTIMENT.
    """
    inference_start = perf_counter()
    fragments = []

    pipeline_output = bertweet_pipeline(text_input)[0]
    predicted_class_id = pipeline_output['label']
    probability = pipeline_output['score']
    sentiment_name = CLASS_ID_TO_SENTIMENT[predicted_class_id]
    logger.info("BERTweet prediction: %s -> %s (%.4f)", text_input, sentiment_name, probability)
    fragments.append(render_model_result("BERTweet (Transformer)", sentiment_name, probability))

    inputs = [text_input]
    for model_name, sklearn_pipeline in sklearn_pipelines.items():
        predicted_class = sklearn_pipeline.predict(inputs)[0]
        probability = _estimate_confidence(sklearn_pipeline, inputs)
        # The scikit-learn labels are converted with str() to match the string
        # keys of the lookup table.
        sentiment_name = CLASS_ID_TO_SENTIMENT[str(predicted_class)]
        fragments.append(render_model_result(model_name, sentiment_name, probability))

    elapsed_ms = (perf_counter() - inference_start) * 1000
    logger.debug("Inference completed in %.1f ms", elapsed_ms)

    results_html = "".join(fragments)
    return f'\n{results_html}\n'


@app.route('/')
def home():
    """Render the index page with results precomputed for the first preset text."""
    default_text = PRESET_TEXTS[0]
    initial_results_html = ""
    try:
        logger.info("Precomputing initial classification for default preset...")
        initial_results_html = build_results_markup(default_text)
    except Exception as e:
        # Best effort: the page still renders, just without initial results.
        logger.error("Failed to precompute initial results: %s", e, exc_info=True)

    return render_template(
        'index.html',
        presets=PRESET_TEXTS,
        default_preset=default_text,
        initial_results=initial_results_html
    )


@app.route('/classify', methods=['POST'])
def classify():
    """Classify the submitted ``text`` form field and return the results fragment.

    A missing or blank field yields the "Please enter some text" message. Any
    other failure is logged and reported in the response body.
    """
    try:
        # .get(): a missing field is treated like empty input instead of
        # raising a 400 KeyError that the handler below would stringify.
        text_input = request.form.get('text', '')
        cleaned_text = text_input.strip()

        if not cleaned_text:
            return '\n\nError: Please enter some text\n\n'

        logger.info("Classifying: %s...", cleaned_text[:50])
        return build_results_markup(cleaned_text)
    except Exception as e:
        logger.error("Classification error: %s", e, exc_info=True)
        # Escape the message: exception text can echo request data and the
        # response is served as HTML.
        return f'\n\nError: {html.escape(str(e))}\n\n'


if __name__ == "__main__":
    # app.debug is only switched on by app.run(debug=True) below, so checking
    # it beforehand never enabled debug logging; set the level directly.
    logger.setLevel(logging.DEBUG)
    app.run(debug=True)