# Provenance (from the Hugging Face Space page): uploaded by comfortably-dumb,
# commit 7af1601, "Added description".
import os
from typing import List
import gradio as gr
import pandas as pd
from huggingface_hub import hf_hub_download, list_repo_files
# Hugging Face dataset repo holding benchmark results; overridable via env var.
RESULTS_REPO = os.environ.get("RESULTS_REPO", "comfortably-dumb/dedeucebench-results")
# Filename of the top-level aggregate leaderboard CSV inside that repo.
LEADERBOARD_FILE = os.environ.get("LEADERBOARD_FILE", "leaderboard.csv")
def _derive_provider_model_id(df: pd.DataFrame) -> pd.DataFrame:
if df is None or df.empty:
return df
if "provider" in df.columns and "model_id" in df.columns:
return df
prov, mid = [], []
for m in df.get("model", []):
if isinstance(m, str) and ":" in m:
p, i = m.split(":", 1)
prov.append(p)
mid.append(i)
else:
prov.append("")
mid.append(m)
df = df.copy()
df["provider"] = prov
df["model_id"] = mid
return df
def _load_top_leaderboard() -> pd.DataFrame:
    """Fallback: load the top-level leaderboard.csv (no subset info)."""
    local_path = hf_hub_download(
        repo_id=RESULTS_REPO, repo_type="dataset", filename=LEADERBOARD_FILE
    )
    try:
        frame = pd.read_csv(local_path)
    except Exception:
        # Malformed CSV rows: retry, dropping the unparseable lines.
        frame = pd.read_csv(local_path, on_bad_lines="skip")
    return _derive_provider_model_id(frame)
def _list_run_csvs() -> List[str]:
    """Return repo files that look like per-run leaderboard CSVs.

    Expected pattern:
    runs/YYYY-MM-DD/<route>.<subset>/leaderboard.<route>.<subset>.csv
    """
    all_files = list_repo_files(repo_id=RESULTS_REPO, repo_type="dataset")
    matches: List[str] = []
    for name in all_files:
        if name.startswith("runs/") and name.endswith(".csv") and "/leaderboard." in name:
            matches.append(name)
    return matches
def _subset_from_path(path: str) -> str:
# runs/YYYY-MM-DD/<route>.<subset>/leaderboard.<route>.<subset>.csv
try:
folder = path.split("/", 3)[2] # <route>.<subset>
except Exception:
folder = ""
if "." in folder:
return folder.split(".")[-1].strip()
return ""
def load_runs_with_subset() -> pd.DataFrame:
    """Download every per-run CSV and concatenate them into one frame.

    Each row is tagged with a 'subset' column derived from its path. Loading
    is best-effort: files that fail to download or parse, or that are empty,
    are silently skipped. When nothing loads, an empty frame with the
    expected columns is returned. Score/token columns are coerced to numeric.
    """
    frames: List[pd.DataFrame] = []
    for remote_name in _list_run_csvs():
        # Broad catch on purpose: one bad run file must not sink the board.
        try:
            local = hf_hub_download(repo_id=RESULTS_REPO, repo_type="dataset", filename=remote_name)
            frame = pd.read_csv(local)
            if frame is None or frame.empty:
                continue
            frame = _derive_provider_model_id(frame)
            frame = frame.copy()
            frame["subset"] = _subset_from_path(remote_name)
            frames.append(frame)
        except Exception:
            continue
    if not frames:
        return pd.DataFrame(columns=["model", "provider", "model_id", "Score100", "TokensTotal", "subset"])
    combined = pd.concat(frames, ignore_index=True)
    # Coerce numeric so later sorting is by value, not by string.
    for numeric_col in ("Score100", "TokensTotal"):
        if numeric_col in combined.columns:
            combined[numeric_col] = pd.to_numeric(combined[numeric_col], errors="coerce")
    return combined
def filter_by_subset(df: pd.DataFrame, subset_label: str) -> pd.DataFrame:
    """Return rows matching *subset_label*, trimmed to the display columns.

    The label is compared case-insensitively against the 'subset' column;
    any label starting with "lite" (e.g. "Lite (sanity)") maps to "lite".
    Missing display columns are filled with None; the result is sorted by
    Score100 descending with NaNs last.
    """
    if df is None or df.empty:
        return df
    wanted = subset_label.lower()
    if wanted.startswith("lite"):
        wanted = "lite"
    mask = df["subset"].astype(str).str.lower() == wanted
    view = df[mask].copy()
    display_cols = ["model", "provider", "model_id", "Score100", "TokensTotal"]
    for name in display_cols:
        if name not in view.columns:
            view[name] = None
    view = view[display_cols]
    view = view.sort_values(by=["Score100"], ascending=False, na_position="last")
    return view.reset_index(drop=True)
# UI construction. Note: do_refresh() is also called once at build time to
# pre-populate the table before any event fires.
with gr.Blocks(title="DedeuceBench Leaderboard") as demo:
    # Static header: benchmark description and usage notes.
    gr.Markdown(
        """
# DedeuceBench Leaderboard
A compact benchmark for agentic system identification under a strict query budget. Each episode is a hidden Mealy machine (finite‑state transducer). Agents must actively probe using a tiny tool API and then submit an exact transition table. We measure success, safety, and efficiency. Mealy machines appear in many interactive systems — protocols, user interfaces, embedded controllers — making them a simple, general substrate for research on sample‑efficient probing and safe exploration.
Source dataset: `{repo}`
- Use the Subset dropdown to switch between Lite (sanity), Easy, and Medium.
- Columns: Model, Score, Provider, Model ID, Tokens Total.
- Score = 100 × Success@Budget (formatted to two decimals).
""".format(repo=RESULTS_REPO)
    )
    with gr.Row():
        subset = gr.Dropdown(
            label="Subset",
            choices=["Lite (sanity)", "Easy", "Medium"],
            value="Lite (sanity)",
        )
        refresh = gr.Button("Refresh")

    def _format_for_display(df: pd.DataFrame) -> pd.DataFrame:
        """Sort by Score100 descending, format scores to two decimals, and
        rename/reorder columns for the on-screen table."""
        if df is None or df.empty:
            return df
        df = df.copy()
        # Ensure numeric for sorting
        if "Score100" in df.columns:
            df["Score100"] = pd.to_numeric(df["Score100"], errors="coerce")
            df = df.sort_values(by=["Score100"], ascending=False, na_position="last")
            # Pre-compute the formatted strings before renaming the column.
            score_str = df["Score100"].map(lambda x: f"{x:.2f}" if pd.notnull(x) else "")
        else:
            score_str = []
        # Rename columns for display
        rename = {
            "model": "Model",
            "provider": "Provider",
            "model_id": "Model ID",
            "Score100": "Score",
            "TokensTotal": "Tokens Total",
        }
        df = df.rename(columns=rename)
        if "Score" in df.columns:
            df["Score"] = score_str
        # Reorder: Model, Score, Provider, Model ID, Tokens Total
        keep = ["Model", "Score", "Provider", "Model ID", "Tokens Total"]
        for col in keep:
            if col not in df.columns:
                df[col] = None
        df = df[keep]
        df.reset_index(drop=True, inplace=True)
        return df

    def _fallback_top_level() -> pd.DataFrame:
        """Display frame from the aggregate leaderboard.csv, used when no
        per-run CSVs are available (no subset information)."""
        base = _load_top_leaderboard()
        base = _derive_provider_model_id(base)
        keep = ["model", "provider", "model_id", "Score100", "TokensTotal"]
        for col in keep:
            if col not in base.columns:
                base[col] = None
        base = base[keep]
        return _format_for_display(base)

    def do_refresh(sub: str):
        """Event handler: reload data for the chosen subset label and return
        a formatted display frame."""
        # Normalize subset label
        label = (sub or "").strip().lower()
        if label.startswith("lite"):
            norm = "lite"
        elif label.startswith("easy"):
            norm = "easy"
        else:
            norm = "medium"
        df = load_runs_with_subset()
        # Fallback to top-level if runs empty (no subset info)
        if df is None or df.empty:
            return _fallback_top_level()
        filtered = filter_by_subset(df, norm)
        return _format_for_display(filtered)

    # Initial render: load current data for default subset
    _initial_df = do_refresh("Lite (sanity)")
    table = gr.Dataframe(value=_initial_df, interactive=False)
    subset.change(fn=do_refresh, inputs=[subset], outputs=[table])
    refresh.click(fn=do_refresh, inputs=[subset], outputs=[table])
    # Auto-refresh on app load
    demo.load(fn=do_refresh, inputs=[subset], outputs=[table])
# Launch the UI only when this module is executed as a script.
if __name__ == "__main__":
    demo.launch()