Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,6 +6,9 @@ import matplotlib.pyplot as plt
|
|
| 6 |
import seaborn as sns
|
| 7 |
from transformers import pipeline
|
| 8 |
|
|
|
|
|
|
|
|
|
|
| 9 |
# Load sentiment pipeline
|
| 10 |
sentiment_pipeline = pipeline(
|
| 11 |
"text-classification",
|
|
@@ -21,7 +24,7 @@ def clean_text(text):
|
|
| 21 |
return text.lower().strip()
|
| 22 |
|
| 23 |
def predict_sentiment(texts):
|
| 24 |
-
results = sentiment_pipeline(texts, truncation=
|
| 25 |
sentiments = []
|
| 26 |
confidences = []
|
| 27 |
for r in results:
|
|
@@ -41,8 +44,9 @@ def recategorize(labels, mode, pos_threshold, neg_threshold):
|
|
| 41 |
"Negative" if lbl <= neg_threshold else
|
| 42 |
"Neutral" for lbl in labels
|
| 43 |
]
|
|
|
|
|
|
|
| 44 |
|
| 45 |
-
def analyze_sentiment(file, text_column, mode, pos_thresh, neg_thresh):
|
| 46 |
try:
|
| 47 |
df = pd.read_csv(file.name)
|
| 48 |
except Exception as e:
|
|
@@ -51,24 +55,54 @@ def analyze_sentiment(file, text_column, mode, pos_thresh, neg_thresh):
|
|
| 51 |
if text_column not in df.columns:
|
| 52 |
return "Selected column not found.", None, None, None, None, None
|
| 53 |
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
df["sentiment_recategorised"] = recategorize(df["sentiment_1to10"], mode, pos_thresh, neg_thresh)
|
| 59 |
|
| 60 |
# Save results
|
| 61 |
output_file = "bigbird_sentiment_results.csv"
|
| 62 |
df.to_csv(output_file, index=False)
|
| 63 |
|
| 64 |
-
# Plot 1: Original 10-class sentiment distribution
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
| 72 |
|
| 73 |
# Plot 2: Recategorized sentiment distribution
|
| 74 |
plt.figure(figsize=(6, 4))
|
|
@@ -79,19 +113,22 @@ def analyze_sentiment(file, text_column, mode, pos_thresh, neg_thresh):
|
|
| 79 |
plt.savefig(plot2_path)
|
| 80 |
plt.close()
|
| 81 |
|
| 82 |
-
# Plot 3: Confidence score distribution
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
| 91 |
|
| 92 |
# Sample preview
|
| 93 |
preview = df[[text_column, "sentiment_1to10", "confidence", "sentiment_recategorised"]].head(10)
|
| 94 |
-
return f"Sentiment analysis complete.
|
|
|
|
| 95 |
|
| 96 |
def get_text_columns(file):
|
| 97 |
try:
|
|
@@ -106,7 +143,7 @@ def get_text_columns(file):
|
|
| 106 |
with gr.Blocks() as app:
|
| 107 |
gr.Markdown("## ✈️ Sentiment analysis with `pvaluedotone/bigbird-flight`")
|
| 108 |
gr.Markdown("**Citation:** Mat Roni, S. (2025). *Sentiment analysis with Big Bird Flight on Gradio* (version 1.0) [software]. https://huggingface.co/spaces/pvaluedotone/bigbird-flight")
|
| 109 |
-
gr.Markdown("Upload a CSV, choose a text column, select output style (10-class, binary, or ternary), and
|
| 110 |
|
| 111 |
with gr.Row():
|
| 112 |
file_input = gr.File(label="Upload CSV", file_types=[".csv"])
|
|
@@ -121,8 +158,10 @@ with gr.Blocks() as app:
|
|
| 121 |
interactive=True
|
| 122 |
)
|
| 123 |
|
| 124 |
-
pos_thresh_slider = gr.Slider(
|
| 125 |
-
neg_thresh_slider = gr.Slider(1,
|
|
|
|
|
|
|
| 126 |
|
| 127 |
def toggle_thresholds(mode):
|
| 128 |
show_pos = mode != "Original (1–10)"
|
|
@@ -134,19 +173,19 @@ with gr.Blocks() as app:
|
|
| 134 |
|
| 135 |
output_mode.change(toggle_thresholds, inputs=output_mode, outputs=[pos_thresh_slider, neg_thresh_slider])
|
| 136 |
|
| 137 |
-
run_button = gr.Button("
|
| 138 |
|
| 139 |
status = gr.Textbox(label="Status")
|
| 140 |
df_output = gr.Dataframe(label="Sample Output (Top 10)")
|
| 141 |
file_result = gr.File(label="Download Full Results")
|
| 142 |
plot_orig = gr.Image(label="Original Sentiment Distribution")
|
| 143 |
-
plot_recat = gr.Image(label="
|
| 144 |
plot_conf = gr.Image(label="Confidence Score Distribution")
|
| 145 |
|
| 146 |
run_button.click(
|
| 147 |
analyze_sentiment,
|
| 148 |
-
inputs=[file_input, column_dropdown, output_mode, pos_thresh_slider, neg_thresh_slider],
|
| 149 |
outputs=[status, df_output, file_result, plot_orig, plot_recat, plot_conf]
|
| 150 |
)
|
| 151 |
|
| 152 |
-
app.launch(debug=True)
|
|
|
|
| 6 |
import seaborn as sns
|
| 7 |
from transformers import pipeline
|
| 8 |
|
| 9 |
+
cached_df = None
|
| 10 |
+
cached_file_name = None
|
| 11 |
+
|
| 12 |
# Load sentiment pipeline
|
| 13 |
sentiment_pipeline = pipeline(
|
| 14 |
"text-classification",
|
|
|
|
| 24 |
return text.lower().strip()
|
| 25 |
|
| 26 |
def predict_sentiment(texts):
|
| 27 |
+
results = sentiment_pipeline(texts, truncation=False, batch_size=32)
|
| 28 |
sentiments = []
|
| 29 |
confidences = []
|
| 30 |
for r in results:
|
|
|
|
| 44 |
"Negative" if lbl <= neg_threshold else
|
| 45 |
"Neutral" for lbl in labels
|
| 46 |
]
|
| 47 |
+
def analyze_sentiment(file, text_column, mode, pos_thresh, neg_thresh, auto_fix):
|
| 48 |
+
global cached_df, cached_file_name
|
| 49 |
|
|
|
|
| 50 |
try:
|
| 51 |
df = pd.read_csv(file.name)
|
| 52 |
except Exception as e:
|
|
|
|
| 55 |
if text_column not in df.columns:
|
| 56 |
return "Selected column not found.", None, None, None, None, None
|
| 57 |
|
| 58 |
+
# Check if sentiment analysis already done and file is unchanged
|
| 59 |
+
if (
|
| 60 |
+
cached_df is not None and
|
| 61 |
+
cached_file_name == file.name and
|
| 62 |
+
"sentiment_1to10" in cached_df.columns and
|
| 63 |
+
"confidence" in cached_df.columns
|
| 64 |
+
):
|
| 65 |
+
df = cached_df.copy()
|
| 66 |
+
else:
|
| 67 |
+
# Clean and predict
|
| 68 |
+
df["clean_text"] = df[text_column].apply(clean_text)
|
| 69 |
+
predictions, confidences = predict_sentiment(df["clean_text"].tolist())
|
| 70 |
+
df["sentiment_1to10"] = predictions
|
| 71 |
+
df["confidence"] = confidences
|
| 72 |
+
# Cache result
|
| 73 |
+
cached_df = df.copy()
|
| 74 |
+
cached_file_name = file.name
|
| 75 |
+
|
| 76 |
+
# Check thresholds
|
| 77 |
+
if mode == "Ternary (Pos/Neu/Neg)":
|
| 78 |
+
if pos_thresh <= neg_thresh:
|
| 79 |
+
if auto_fix:
|
| 80 |
+
neg_thresh = pos_thresh - 1
|
| 81 |
+
if neg_thresh < 1:
|
| 82 |
+
return "⚠️ Cannot auto-correct: thresholds out of valid range (1–10).", None, None, None, None, None
|
| 83 |
+
else:
|
| 84 |
+
return (
|
| 85 |
+
f"⚠️ Invalid thresholds: Positive min ({pos_thresh}) must be greater than Negative max ({neg_thresh}).",
|
| 86 |
+
None, None, None, None, None
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
# Apply recategorization
|
| 90 |
df["sentiment_recategorised"] = recategorize(df["sentiment_1to10"], mode, pos_thresh, neg_thresh)
|
| 91 |
|
| 92 |
# Save results
|
| 93 |
output_file = "bigbird_sentiment_results.csv"
|
| 94 |
df.to_csv(output_file, index=False)
|
| 95 |
|
| 96 |
+
# Plot 1: Original 10-class sentiment distribution (only if new analysis)
|
| 97 |
+
if "plot1_path" not in globals():
|
| 98 |
+
plt.figure(figsize=(6, 4))
|
| 99 |
+
sns.countplot(x=df["sentiment_1to10"], palette="Blues")
|
| 100 |
+
plt.title("Original 10-Class Sentiment Distribution")
|
| 101 |
+
plt.tight_layout()
|
| 102 |
+
global plot1_path
|
| 103 |
+
plot1_path = "original_dist.png"
|
| 104 |
+
plt.savefig(plot1_path)
|
| 105 |
+
plt.close()
|
| 106 |
|
| 107 |
# Plot 2: Recategorized sentiment distribution
|
| 108 |
plt.figure(figsize=(6, 4))
|
|
|
|
| 113 |
plt.savefig(plot2_path)
|
| 114 |
plt.close()
|
| 115 |
|
| 116 |
+
# Plot 3: Confidence score distribution (only if new analysis)
|
| 117 |
+
if "plot3_path" not in globals():
|
| 118 |
+
plt.figure(figsize=(6, 4))
|
| 119 |
+
sns.histplot(df["confidence"], bins=20, color="skyblue", kde=True)
|
| 120 |
+
plt.title("Confidence Score Distribution")
|
| 121 |
+
plt.xlabel("Confidence")
|
| 122 |
+
plt.tight_layout()
|
| 123 |
+
global plot3_path
|
| 124 |
+
plot3_path = "confidence_dist.png"
|
| 125 |
+
plt.savefig(plot3_path)
|
| 126 |
+
plt.close()
|
| 127 |
|
| 128 |
# Sample preview
|
| 129 |
preview = df[[text_column, "sentiment_1to10", "confidence", "sentiment_recategorised"]].head(10)
|
| 130 |
+
return f"✅ Sentiment analysis complete. Used cache: {cached_file_name == file.name}", preview, output_file, plot1_path, plot2_path, plot3_path
|
| 131 |
+
|
| 132 |
|
| 133 |
def get_text_columns(file):
|
| 134 |
try:
|
|
|
|
| 143 |
with gr.Blocks() as app:
|
| 144 |
gr.Markdown("## ✈️ Sentiment analysis with `pvaluedotone/bigbird-flight`")
|
| 145 |
gr.Markdown("**Citation:** Mat Roni, S. (2025). *Sentiment analysis with Big Bird Flight on Gradio* (version 1.0) [software]. https://huggingface.co/spaces/pvaluedotone/bigbird-flight")
|
| 146 |
+
gr.Markdown("Upload a CSV, choose a text column to analyse, select output style (10-class, binary, or ternary), and analyse.")
|
| 147 |
|
| 148 |
with gr.Row():
|
| 149 |
file_input = gr.File(label="Upload CSV", file_types=[".csv"])
|
|
|
|
| 158 |
interactive=True
|
| 159 |
)
|
| 160 |
|
| 161 |
+
pos_thresh_slider = gr.Slider(3, 10, value=7, step=1, label="Positive min", visible=False)
|
| 162 |
+
neg_thresh_slider = gr.Slider(1, 7, value=4, step=1, label="Negative max", visible=False)
|
| 163 |
+
auto_fix_checkbox = gr.Checkbox(label="Auto-correct thresholds if overlapping?", value=True)
|
| 164 |
+
|
| 165 |
|
| 166 |
def toggle_thresholds(mode):
|
| 167 |
show_pos = mode != "Original (1–10)"
|
|
|
|
| 173 |
|
| 174 |
output_mode.change(toggle_thresholds, inputs=output_mode, outputs=[pos_thresh_slider, neg_thresh_slider])
|
| 175 |
|
| 176 |
+
run_button = gr.Button("Process sentiment")
|
| 177 |
|
| 178 |
status = gr.Textbox(label="Status")
|
| 179 |
df_output = gr.Dataframe(label="Sample Output (Top 10)")
|
| 180 |
file_result = gr.File(label="Download Full Results")
|
| 181 |
plot_orig = gr.Image(label="Original Sentiment Distribution")
|
| 182 |
+
plot_recat = gr.Image(label="Recategorised Sentiment Distribution")
|
| 183 |
plot_conf = gr.Image(label="Confidence Score Distribution")
|
| 184 |
|
| 185 |
run_button.click(
|
| 186 |
analyze_sentiment,
|
| 187 |
+
inputs=[file_input, column_dropdown, output_mode, pos_thresh_slider, neg_thresh_slider, auto_fix_checkbox],
|
| 188 |
outputs=[status, df_output, file_result, plot_orig, plot_recat, plot_conf]
|
| 189 |
)
|
| 190 |
|
| 191 |
+
app.launch(share=True, debug=True)
|