|
|
import os |
|
|
from typing import List |
|
|
|
|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
from huggingface_hub import hf_hub_download, list_repo_files |
|
|
|
|
|
|
|
|
# Hub dataset repository the leaderboard data is read from; can be overridden
# through the RESULTS_REPO environment variable.
RESULTS_REPO = os.getenv("RESULTS_REPO", "comfortably-dumb/dedeucebench-results")

# File name of the top-level leaderboard CSV inside that repository; can be
# overridden through the LEADERBOARD_FILE environment variable.
LEADERBOARD_FILE = os.getenv("LEADERBOARD_FILE", "leaderboard.csv")
|
|
|
|
|
|
|
|
def _derive_provider_model_id(df: pd.DataFrame) -> pd.DataFrame:
    """Ensure ``df`` has ``provider`` and ``model_id`` columns.

    Missing columns are derived from the ``model`` column: a string value of
    the form ``"<provider>:<model_id>"`` is split on the first colon; any
    other value yields an empty provider and is used unchanged as the model
    id.

    Args:
        df: Leaderboard rows. ``None`` and empty frames are returned as-is.

    Returns:
        ``df`` itself when nothing has to be added, otherwise a copy with the
        missing column(s) appended. Columns that already exist are never
        overwritten.
    """
    if df is None or df.empty:
        return df

    missing = [col for col in ("provider", "model_id") if col not in df.columns]
    if not missing:
        return df

    # Without a "model" column there is nothing to split; use one placeholder
    # per row so the derived lists always match the frame length (assigning a
    # shorter list to a column raises ValueError).
    models = df["model"] if "model" in df.columns else [None] * len(df)

    providers, model_ids = [], []
    for name in models:
        if isinstance(name, str) and ":" in name:
            provider, model_id = name.split(":", 1)
        else:
            provider, model_id = "", name
        providers.append(provider)
        model_ids.append(model_id)

    derived = {"provider": providers, "model_id": model_ids}
    out = df.copy()  # never mutate the caller's frame
    for col in missing:
        out[col] = derived[col]
    return out
|
|
|
|
|
|
|
|
def _load_top_leaderboard() -> pd.DataFrame:
    """Load the top-level leaderboard CSV, used as a fallback without subset info.

    Fetches ``LEADERBOARD_FILE`` from the ``RESULTS_REPO`` dataset repository
    and parses it as CSV. If the first parse raises, the file is parsed again
    with ``on_bad_lines="skip"``.

    Returns:
        The parsed rows after being passed through
        ``_derive_provider_model_id``.
    """
    csv_path = hf_hub_download(
        filename=LEADERBOARD_FILE,
        repo_id=RESULTS_REPO,
        repo_type="dataset",
    )
    try:
        frame = pd.read_csv(csv_path)
    except Exception:
        # Second attempt: drop lines the parser cannot handle instead of failing.
        frame = pd.read_csv(csv_path, on_bad_lines="skip")
    return _derive_provider_model_id(frame)
|
|
|
|
|
|
|
|
def _list_run_csvs() -> List[str]:
    """Return the per-run leaderboard CSV paths found in ``RESULTS_REPO``.

    A repository file is selected when its path starts with ``runs/``, ends
    with ``.csv`` and contains ``/leaderboard.``. The order returned by
    ``list_repo_files`` is preserved.
    """
    selected: List[str] = []
    for name in list_repo_files(repo_id=RESULTS_REPO, repo_type="dataset"):
        if not name.startswith("runs/"):
            continue
        if not name.endswith(".csv"):
            continue
        if "/leaderboard." not in name:
            continue
        selected.append(name)
    return selected
|
|
|
|
|
|
|
|
def _subset_from_path(path: str) -> str:
    """Extract the subset name from a run file path.

    The third ``/``-separated component of ``path`` is taken as the run
    folder; the text after its last ``.`` (whitespace-stripped) is the
    subset.

    Returns:
        The subset name, or ``""`` when the path has fewer than three
        components, cannot be split, or the folder contains no ``.``.
    """
    try:
        components = path.split("/", 3)
        run_folder = components[2]
    except Exception:
        return ""

    _, dot, suffix = run_folder.rpartition(".")
    if not dot:
        return ""
    return suffix.strip()
|
|
|
|
|
|
|
|
def load_runs_with_subset() -> pd.DataFrame:
    """Load every per-run leaderboard CSV and tag its rows with the run's subset.

    Each path from ``_list_run_csvs`` is downloaded from ``RESULTS_REPO``,
    parsed, passed through ``_derive_provider_model_id`` and given a
    ``subset`` column computed by ``_subset_from_path``. Empty files are
    ignored. Loading is best-effort: a file that fails to download or parse
    is skipped, and the failure is logged as a warning.

    Returns:
        All rows concatenated with a fresh index and with ``Score100`` /
        ``TokensTotal`` coerced to numeric (unparseable values become NaN).
        When no file yields rows, an empty frame with the columns ``model``,
        ``provider``, ``model_id``, ``Score100``, ``TokensTotal`` and
        ``subset``.
    """
    import logging  # function-scope import keeps this block self-contained

    log = logging.getLogger(__name__)

    frames: List[pd.DataFrame] = []
    for repo_path in _list_run_csvs():
        try:
            local_path = hf_hub_download(
                repo_id=RESULTS_REPO, repo_type="dataset", filename=repo_path
            )
            run_df = pd.read_csv(local_path)
            if run_df is None or run_df.empty:
                continue
            # Copy so that adding "subset" never writes into a frame that the
            # helper returned unchanged.
            run_df = _derive_provider_model_id(run_df).copy()
            run_df["subset"] = _subset_from_path(repo_path)
            frames.append(run_df)
        except Exception:
            # One broken run file must not take the whole leaderboard down,
            # but leave a trace so the skipped file can be investigated.
            log.warning("Skipping run file %s", repo_path, exc_info=True)

    if not frames:
        return pd.DataFrame(
            columns=["model", "provider", "model_id", "Score100", "TokensTotal", "subset"]
        )

    out = pd.concat(frames, ignore_index=True)

    # Coerce to numbers so later sorting is numeric rather than lexicographic.
    for col in ("Score100", "TokensTotal"):
        if col in out.columns:
            out[col] = pd.to_numeric(out[col], errors="coerce")

    return out
|
|
|
|
|
|
|
|
def filter_by_subset(df: pd.DataFrame, subset_label: str) -> pd.DataFrame:
    """Select the rows of one subset and order them by score.

    Args:
        df: Rows carrying a ``subset`` column. ``None`` and empty frames are
            returned unchanged.
        subset_label: Subset to keep. Matching ignores case and surrounding
            whitespace; any label starting with ``lite`` (for example
            ``"Lite (sanity)"``) selects the ``lite`` subset. ``None`` is
            treated as an empty label and matches nothing.

    Returns:
        A new frame restricted to the columns ``model``, ``provider``,
        ``model_id``, ``Score100`` and ``TokensTotal`` (absent ones are added
        filled with ``None``), sorted by ``Score100`` descending with missing
        scores last, and re-indexed from 0. If ``df`` has no ``subset``
        column the result is an empty frame with those columns.
    """
    if df is None or df.empty:
        return df

    keep = ["model", "provider", "model_id", "Score100", "TokensTotal"]

    # Nothing can match without subset information; return an empty result
    # instead of failing with a KeyError.
    if "subset" not in df.columns:
        return pd.DataFrame(columns=keep)

    key = (subset_label or "").strip().lower()
    if key.startswith("lite"):
        key = "lite"

    subsets = df["subset"].astype(str).str.strip().str.lower()
    out = df[subsets == key].copy()

    for col in keep:
        if col not in out.columns:
            out[col] = None

    out = out[keep].sort_values(by=["Score100"], ascending=False, na_position="last")
    return out.reset_index(drop=True)
|
|
|
|
|
|
|
|
# Build the leaderboard UI: intro text, subset selector, refresh button and
# the results table, plus the callbacks that fill the table.
with gr.Blocks(title="DedeuceBench Leaderboard") as demo:
    # The Markdown text is kept flush-left so no line can be mistaken for an
    # indented code block.
    gr.Markdown(
        """
# DedeuceBench Leaderboard

A compact benchmark for agentic system identification under a strict query budget. Each episode is a hidden Mealy machine (finite‑state transducer). Agents must actively probe using a tiny tool API and then submit an exact transition table. We measure success, safety, and efficiency. Mealy machines appear in many interactive systems — protocols, user interfaces, embedded controllers — making them a simple, general substrate for research on sample‑efficient probing and safe exploration.

Source dataset: `{repo}`

- Use the Subset dropdown to switch between Lite (sanity), Easy, and Medium.
- Columns: Model, Score, Provider, Model ID, Tokens Total.
- Score = 100 × Success@Budget (formatted to two decimals).
""".format(repo=RESULTS_REPO)
    )
    with gr.Row():
        subset = gr.Dropdown(
            label="Subset",
            choices=["Lite (sanity)", "Easy", "Medium"],
            value="Lite (sanity)",
        )
        refresh = gr.Button("Refresh")

    def _format_for_display(df: pd.DataFrame) -> pd.DataFrame:
        """Convert raw leaderboard rows into the table shown in the UI.

        Rows are sorted by ``Score100`` descending (missing scores last), the
        score is rendered with two decimals (blank when missing), and the
        columns are renamed and restricted to Model, Score, Provider,
        Model ID, Tokens Total. Display columns without a source column are
        added filled with ``None``.

        Args:
            df: Raw rows; may be ``None`` or empty.

        Returns:
            A new frame with exactly the display columns, indexed from 0.
            ``None`` or empty input yields an empty frame with those columns,
            so the table headers stay the same when a subset has no results.
        """
        display_cols = ["Model", "Score", "Provider", "Model ID", "Tokens Total"]
        if df is None or df.empty:
            return pd.DataFrame(columns=display_cols)

        df = df.copy()

        if "Score100" in df.columns:
            # Sort while the score is still numeric, then turn it into text.
            df["Score100"] = pd.to_numeric(df["Score100"], errors="coerce")
            df = df.sort_values(by=["Score100"], ascending=False, na_position="last")
            df["Score100"] = df["Score100"].map(
                lambda x: f"{x:.2f}" if pd.notnull(x) else ""
            )

        df = df.rename(
            columns={
                "model": "Model",
                "provider": "Provider",
                "model_id": "Model ID",
                "Score100": "Score",
                "TokensTotal": "Tokens Total",
            }
        )

        for col in display_cols:
            if col not in df.columns:
                df[col] = None

        return df[display_cols].reset_index(drop=True)

    def _fallback_top_level() -> pd.DataFrame:
        """Build the display table from the top-level leaderboard CSV.

        Used when no per-run data is available; the top-level file carries no
        subset information, so its rows are shown unfiltered.
        """
        # _load_top_leaderboard() already derives provider/model_id columns.
        base = _load_top_leaderboard()
        raw_cols = ["model", "provider", "model_id", "Score100", "TokensTotal"]
        for col in raw_cols:
            if col not in base.columns:
                base[col] = None
        return _format_for_display(base[raw_cols])

    def do_refresh(sub: str):
        """Reload the results and return the table for the selected subset.

        Args:
            sub: Dropdown label. Labels starting with "lite" or "easy"
                (case-insensitive) select those subsets; anything else,
                including ``None``, selects "medium".

        Returns:
            The formatted table for the subset, or the top-level leaderboard
            when no per-run rows could be loaded at all.
        """
        label = (sub or "").strip().lower()
        if label.startswith("lite"):
            norm = "lite"
        elif label.startswith("easy"):
            norm = "easy"
        else:
            norm = "medium"

        df = load_runs_with_subset()
        if df is None or df.empty:
            return _fallback_top_level()
        return _format_for_display(filter_by_subset(df, norm))

    # Populate the table once while the UI is being built.
    _initial_df = do_refresh("Lite (sanity)")
    table = gr.Dataframe(value=_initial_df, interactive=False)

    # Recompute the table when the subset changes or Refresh is clicked.
    subset.change(fn=do_refresh, inputs=[subset], outputs=[table])
    refresh.click(fn=do_refresh, inputs=[subset], outputs=[table])

    # Recompute again through the Blocks load event.
    demo.load(fn=do_refresh, inputs=[subset], outputs=[table])


if __name__ == "__main__":
    demo.launch()
|
|
|