"""Main Gradio app for moderation model testing."""
import os
import sys
import gradio as gr
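# Ensure the app's own directory is on sys.path so the local `utils` and `ui`
# packages below import correctly regardless of the working directory.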
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from datetime import datetime
from utils.dataset import format_categories_and_reasoning, save_to_dataset
from utils.helpers import get_hf_token
from utils.model_interface import extract_model_id, run_test
from ui.sidebar import build_sidebar
from ui.tab_config import build_config_tab
from ui.tab_dataset import build_dataset_tab
from ui.tab_policy import build_policy_tab
from ui.tab_testing import (
    build_testing_tab,
    format_model_info,
    format_reasoning_info,
    format_test_result,
)
# ============================================================================
# Handlers
# ============================================================================
def handle_run_test(
    test_input,
    current_policy,
    model_choice,
    reasoning_effort,
    max_tokens,
    temperature,
    top_p,
    system_prompt_val,
    response_format_val,
    save_mode,
    oauth_token: gr.OAuthToken | None = None,
):
    """Handle test execution."""
    if not test_input or not test_input.strip():
        model_info = format_model_info(model_choice, reasoning_effort)
        return model_info, "*Please enter test content*", "*No content*", "*No response yet*", gr.update(value="", visible=False), gr.update(value="", visible=False)
    if not current_policy or current_policy == "*No policy loaded*":
        model_info = format_model_info(model_choice, reasoning_effort)
        return model_info, "*Please load a policy first*", "*No policy*", "*No response yet*", gr.update(value="", visible=False), gr.update(value="", visible=False)
    # OAuth token is automatically injected by Gradio - we don't pass login_button as input
    hf_token, _ = get_hf_token(oauth_token)
    if hf_token is None:
        model_info = format_model_info(model_choice, reasoning_effort)
        return model_info, "*Please log in to use Inference Providers*", "*Authentication required*", "*No response yet*", gr.update(value="", visible=False), gr.update(value="", visible=False)
    model_id = extract_model_id(model_choice)
    result = run_test(
        model_id=model_id,
        test_input=test_input,
        policy=current_policy,
        hf_token=hf_token,
        reasoning_effort=reasoning_effort,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        system_prompt=system_prompt_val,
        response_format=response_format_val,
    )
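    # format_test_result (from ui.tab_testing) appears to unpack into: label
    # heading, parsed result dict, categories text, optional reasoning trace,
    # and the raw model response.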
    label_text, parsed, cat_text, reasoning, raw_response = format_test_result(result)
    reasoning_visible = bool(reasoning and reasoning.strip())
    model_info = format_model_info(model_choice, reasoning_effort)
    reasoning_info_text, reasoning_info_visible = format_reasoning_info(model_choice, reasoning)
    # Save to dataset if enabled
    if save_mode == "Save to Dataset" and hf_token is not None:
        try:
            categories_and_reasoning_text = format_categories_and_reasoning(parsed)
            policy_violation = parsed.get("label", -1)
            data = {
                "input": test_input,
                "policy_violation": policy_violation,
                "categories_and_reasoning": categories_and_reasoning_text,
                "policy": current_policy,
                "model_selection": model_choice,
                "raw_response": raw_response,
                "reasoning_trace": reasoning or "",
                "reasoning_effort": reasoning_effort or "",
                "max_tokens": int(max_tokens),
                "temperature": float(temperature),
                "top_p": float(top_p),
                "system_prompt": system_prompt_val or "",
                "response_format": response_format_val or "",
                "timestamp": datetime.now().isoformat(),
            }
            save_to_dataset(hf_token, data)
        except Exception as e:
            # Log error but don't break test execution
            print(f"Failed to save to dataset: {e}")
    return (
        model_info,
        label_text,
        cat_text,
        raw_response,
        gr.update(value=reasoning_info_text, visible=reasoning_info_visible),
        gr.update(value=reasoning or "", visible=reasoning_visible),
    )
# ============================================================================
# UI Components
# ============================================================================
with gr.Blocks(title="Moderation Model Testing") as demo:
    gr.Markdown("# Moderation Model Testing Interface")
    gr.Markdown(
        "Test moderation models with custom content policies. Define your policy, select a model, "
        "and evaluate how different models classify content according to your rules. "
        "Supports reasoning models that provide detailed explanations for their decisions."
    )
    # Sidebar (collapsible)
    sidebar_components = build_sidebar()
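    # The sidebar's login button enables Hugging Face OAuth; Gradio injects the
    # resulting token into handlers that declare a `gr.OAuthToken` parameter.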
    login_button = sidebar_components["login_button"]
    # Main content area with tabs
    with gr.Tabs():
        # Build tabs
        testing_components = build_testing_tab()
        test_input = testing_components["test_input"]
        run_test_btn = testing_components["run_test_btn"]
        save_mode = testing_components["save_mode"]
        model_info_display = testing_components["model_info_display"]
        label_display = testing_components["label_display"]
        categories_display = testing_components["categories_display"]
        model_response_display = testing_components["model_response_display"]
        reasoning_info = testing_components["reasoning_info"]
        reasoning_display = testing_components["reasoning_display"]
        policy_components = build_policy_tab(os.path.dirname(__file__))
        current_policy_state = policy_components["current_policy_state"]
        config_components = build_config_tab()
        model_dropdown = config_components["model_dropdown"]
        reasoning_effort = config_components["reasoning_effort"]
        max_tokens = config_components["max_tokens"]
        temperature = config_components["temperature"]
        top_p = config_components["top_p"]
        system_prompt_textbox = config_components["system_prompt_textbox"]
        response_format_textbox = config_components["response_format_textbox"]
        dataset_components = build_dataset_tab()
        example_dropdown = dataset_components["example_dropdown"]
        cached_examples = dataset_components["cached_examples"]
        dropdown_choices_state = dataset_components["dropdown_choices_state"]
    # ============================================================================
    # Event Handlers
    # ============================================================================
    # Cross-tab handler: Run test (needs components from all tabs)
    run_test_btn.click(
        handle_run_test,
        inputs=[
            test_input,
            current_policy_state,
            model_dropdown,
            reasoning_effort,
            max_tokens,
            temperature,
            top_p,
            system_prompt_textbox,
            response_format_textbox,
            save_mode,
        ],
        outputs=[
            model_info_display,
            label_display,
            categories_display,
            model_response_display,
            reasoning_info,
            reasoning_display,
        ],
    )
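    # Keep the model info banner in sync whenever the model or reasoning effort changes.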
    model_dropdown.change(
        format_model_info,
        inputs=[model_dropdown, reasoning_effort],
        outputs=model_info_display,
    )
    reasoning_effort.change(
        format_model_info,
        inputs=[model_dropdown, reasoning_effort],
        outputs=model_info_display,
    )
    # Dataset load handler
    def load_example_from_dataset(selected_label, cached_examples_list, dropdown_choices_list):
        """Load example from dataset and populate all fields."""
        if (not cached_examples_list or not selected_label or
                not dropdown_choices_list or selected_label not in dropdown_choices_list):
            # Return None for every output component (15 total) to skip updates
            return (None,) * 15
        try:
            # Find index by matching label
            idx = dropdown_choices_list.index(selected_label)
            if idx < 0 or idx >= len(cached_examples_list):
                return (None,) * 15
            example = cached_examples_list[idx]
            # Get policy - ensure it's a string (not None)
            policy = example.get("policy", "") or ""
            # Extract saved results
            policy_violation = example.get("policy_violation", -1)
            categories_and_reasoning = example.get("categories_and_reasoning", "")
            raw_response = example.get("raw_response", "")
            reasoning_trace = example.get("reasoning_trace", "")
            model_selection = example.get("model_selection", "")
            reasoning_effort_val = example.get("reasoning_effort", "")
            # Format label text
            if policy_violation == 1:
                label_text = "## ❌ Policy Violation Detected"
            elif policy_violation == 0:
                label_text = "## ✅ No Policy Violation"
            else:
                label_text = "## ⚠️ Unable to determine label"
            # Format model info
            model_info = format_model_info(model_selection, reasoning_effort_val)
            # Format reasoning info
            reasoning_info_text, reasoning_info_visible = format_reasoning_info(model_selection, reasoning_trace)
            reasoning_visible = bool(reasoning_trace and reasoning_trace.strip())
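            # Field order below must match the `outputs` list of example_dropdown.change.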
            return (
                example.get("input", ""),
                policy,  # current_policy_state - UI syncs automatically via change handler
                example.get("model_selection", ""),
                example.get("reasoning_effort", ""),
                example.get("max_tokens", 0),
                example.get("temperature", 0.0),
                example.get("top_p", 0.0),
                example.get("system_prompt", ""),
                example.get("response_format", ""),
                # Results
                model_info,
                label_text,
                categories_and_reasoning,
                raw_response,
                gr.update(value=reasoning_info_text, visible=reasoning_info_visible),
                gr.update(value=reasoning_trace or "", visible=reasoning_visible),
            )
        except (ValueError, IndexError):
            return (None,) * 15
    example_dropdown.change(
        load_example_from_dataset,
        inputs=[example_dropdown, cached_examples, dropdown_choices_state],
        outputs=[
            test_input,
            current_policy_state,  # UI components sync automatically via change handler
            model_dropdown,
            reasoning_effort,
            max_tokens,
            temperature,
            top_p,
            system_prompt_textbox,
            response_format_textbox,
            # Results
            model_info_display,
            label_display,
            categories_display,
            model_response_display,
            reasoning_info,
            reasoning_display,
        ],
    )
if __name__ == "__main__":
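    # ssr_mode=False disables Gradio's server-side rendering, so the UI is
    # rendered client-side once the page loads.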
    demo.launch(ssr_mode=False)