File size: 7,087 Bytes
487525a 011bfa0 487525a 011bfa0 487525a 011bfa0 487525a 011bfa0 487525a 011bfa0 487525a 011bfa0 487525a 011bfa0 487525a 7af1601 011bfa0 487525a 011bfa0 7af1601 011bfa0 487525a 7af1601 487525a 011bfa0 7af1601 011bfa0 7af1601 011bfa0 487525a 7af1601 487525a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
import os
from typing import List
import gradio as gr
import pandas as pd
from huggingface_hub import hf_hub_download, list_repo_files
# Hugging Face dataset repo that stores benchmark result CSVs (env-overridable).
RESULTS_REPO = os.environ.get("RESULTS_REPO", "comfortably-dumb/dedeucebench-results")
# Filename of the top-level fallback leaderboard CSV inside RESULTS_REPO.
LEADERBOARD_FILE = os.environ.get("LEADERBOARD_FILE", "leaderboard.csv")
def _derive_provider_model_id(df: pd.DataFrame) -> pd.DataFrame:
if df is None or df.empty:
return df
if "provider" in df.columns and "model_id" in df.columns:
return df
prov, mid = [], []
for m in df.get("model", []):
if isinstance(m, str) and ":" in m:
p, i = m.split(":", 1)
prov.append(p)
mid.append(i)
else:
prov.append("")
mid.append(m)
df = df.copy()
df["provider"] = prov
df["model_id"] = mid
return df
def _load_top_leaderboard() -> pd.DataFrame:
    """Fallback: load the top-level leaderboard.csv (no subset info).

    Downloads ``LEADERBOARD_FILE`` from ``RESULTS_REPO`` and parses it.

    Returns:
        The parsed leaderboard with ``provider``/``model_id`` columns
        derived from ``model`` when absent.
    """
    path = hf_hub_download(
        repo_id=RESULTS_REPO, repo_type="dataset", filename=LEADERBOARD_FILE
    )
    try:
        df = pd.read_csv(path)
    except pd.errors.ParserError:
        # Only malformed rows are recoverable by skipping bad lines; any
        # other failure (empty file, missing path) would just re-raise on
        # retry, so let it propagate immediately instead.
        df = pd.read_csv(path, on_bad_lines="skip")
    return _derive_provider_model_id(df)
def _list_run_csvs() -> List[str]:
    """Return per-run leaderboard CSV paths from the results repo.

    Matches the layout
    ``runs/YYYY-MM-DD/<route>.<subset>/leaderboard.<route>.<subset>.csv``.
    """
    repo_files = list_repo_files(repo_id=RESULTS_REPO, repo_type="dataset")
    matches: List[str] = []
    for path in repo_files:
        if not path.startswith("runs/"):
            continue
        if not path.endswith(".csv"):
            continue
        if "/leaderboard." not in path:
            continue
        matches.append(path)
    return matches
def _subset_from_path(path: str) -> str:
# runs/YYYY-MM-DD/<route>.<subset>/leaderboard.<route>.<subset>.csv
try:
folder = path.split("/", 3)[2] # <route>.<subset>
except Exception:
folder = ""
if "." in folder:
return folder.split(".")[-1].strip()
return ""
def load_runs_with_subset() -> pd.DataFrame:
    """Download every per-run leaderboard CSV and tag rows with a subset.

    Aggregation is best-effort: any run file that fails to download or
    parse is silently skipped. Returns an empty frame with the expected
    columns when nothing could be loaded; otherwise the concatenation of
    all run frames with ``Score100``/``TokensTotal`` coerced to numeric.
    """
    frames: List[pd.DataFrame] = []
    for csv_path in _list_run_csvs():
        try:
            cached = hf_hub_download(
                repo_id=RESULTS_REPO, repo_type="dataset", filename=csv_path
            )
            frame = pd.read_csv(cached)
            if frame is None or frame.empty:
                continue
            frame = _derive_provider_model_id(frame).copy()
            frame["subset"] = _subset_from_path(csv_path)
            frames.append(frame)
        except Exception:
            # Best-effort: one broken run file must not sink the board.
            continue
    if not frames:
        return pd.DataFrame(
            columns=["model", "provider", "model_id", "Score100", "TokensTotal", "subset"]
        )
    combined = pd.concat(frames, ignore_index=True)
    # Coerce score/token columns to numeric; bad cells become NaN.
    for numeric_col in ("Score100", "TokensTotal"):
        if numeric_col in combined.columns:
            combined[numeric_col] = pd.to_numeric(combined[numeric_col], errors="coerce")
    return combined
def filter_by_subset(df: pd.DataFrame, subset_label: str) -> pd.DataFrame:
    """Return rows of *df* for one subset, sorted by score descending.

    Args:
        df: Frame carrying at least a ``subset`` column; returned
            unchanged when None or empty.
        subset_label: Subset name. Matching is case-insensitive, and any
            label starting with "lite" (e.g. "Lite (sanity)") maps to
            "lite". None is treated as the empty label.

    Returns:
        A new frame restricted to the essential columns (missing ones
        filled with None), sorted by ``Score100`` with NaNs last and a
        fresh 0-based index.
    """
    if df is None or df.empty:
        return df
    # Guard against None and stray whitespace before normalizing case.
    key = (subset_label or "").strip().lower()
    if key.startswith("lite"):
        key = "lite"
    out = df[df["subset"].astype(str).str.lower() == key].copy()
    # Select only requested columns, creating any that are absent.
    keep = ["model", "provider", "model_id", "Score100", "TokensTotal"]
    for col in keep:
        if col not in out.columns:
            out[col] = None
    out = out[keep]
    out = out.sort_values(by=["Score100"], ascending=False, na_position="last")
    out.reset_index(drop=True, inplace=True)
    return out
# Gradio UI: a subset dropdown plus refresh button driving a single table,
# backed by the run CSVs in RESULTS_REPO.
with gr.Blocks(title="DedeuceBench Leaderboard") as demo:
    gr.Markdown(
        """
# DedeuceBench Leaderboard
A compact benchmark for agentic system identification under a strict query budget. Each episode is a hidden Mealy machine (finite‑state transducer). Agents must actively probe using a tiny tool API and then submit an exact transition table. We measure success, safety, and efficiency. Mealy machines appear in many interactive systems — protocols, user interfaces, embedded controllers — making them a simple, general substrate for research on sample‑efficient probing and safe exploration.
Source dataset: `{repo}`
- Use the Subset dropdown to switch between Lite (sanity), Easy, and Medium.
- Columns: Model, Score, Provider, Model ID, Tokens Total.
- Score = 100 × Success@Budget (formatted to two decimals).
""".format(repo=RESULTS_REPO)
    )
    with gr.Row():
        subset = gr.Dropdown(
            label="Subset",
            choices=["Lite (sanity)", "Easy", "Medium"],
            value="Lite (sanity)",
        )
        refresh = gr.Button("Refresh")

    def _format_for_display(df: pd.DataFrame) -> pd.DataFrame:
        """Sort by score, render Score as a 2-decimal string, and rename
        columns to their display headers in a fixed order."""
        if df is None or df.empty:
            return df
        df = df.copy()
        # Ensure numeric for sorting
        if "Score100" in df.columns:
            df["Score100"] = pd.to_numeric(df["Score100"], errors="coerce")
            df = df.sort_values(by=["Score100"], ascending=False, na_position="last")
            # Pre-format scores; the assignment below aligns on the sorted
            # index, so row order is preserved.
            score_str = df["Score100"].map(lambda x: f"{x:.2f}" if pd.notnull(x) else "")
        else:
            score_str = []
        # Rename columns for display
        rename = {
            "model": "Model",
            "provider": "Provider",
            "model_id": "Model ID",
            "Score100": "Score",
            "TokensTotal": "Tokens Total",
        }
        df = df.rename(columns=rename)
        if "Score" in df.columns:
            df["Score"] = score_str
        # Reorder: Model, Score, Provider, Model ID, Tokens Total
        keep = ["Model", "Score", "Provider", "Model ID", "Tokens Total"]
        for col in keep:
            if col not in df.columns:
                df[col] = None
        df = df[keep]
        df.reset_index(drop=True, inplace=True)
        return df

    def _fallback_top_level() -> pd.DataFrame:
        """Render the top-level leaderboard.csv when no per-run CSVs exist
        (it carries no subset information, so all subsets show the same)."""
        base = _load_top_leaderboard()
        base = _derive_provider_model_id(base)
        keep = ["model", "provider", "model_id", "Score100", "TokensTotal"]
        for col in keep:
            if col not in base.columns:
                base[col] = None
        base = base[keep]
        return _format_for_display(base)

    def do_refresh(sub: str):
        """Event handler: reload run data and return the display frame for
        the selected subset; unknown labels fall through to "medium"."""
        # Normalize subset label
        label = (sub or "").strip().lower()
        if label.startswith("lite"):
            norm = "lite"
        elif label.startswith("easy"):
            norm = "easy"
        else:
            norm = "medium"
        df = load_runs_with_subset()
        # Fallback to top-level if runs empty (no subset info)
        if df is None or df.empty:
            return _fallback_top_level()
        filtered = filter_by_subset(df, norm)
        return _format_for_display(filtered)

    # Initial render: load current data for default subset
    _initial_df = do_refresh("Lite (sanity)")
    table = gr.Dataframe(value=_initial_df, interactive=False)
    subset.change(fn=do_refresh, inputs=[subset], outputs=[table])
    refresh.click(fn=do_refresh, inputs=[subset], outputs=[table])
    # Auto-refresh on app load
    demo.load(fn=do_refresh, inputs=[subset], outputs=[table])

if __name__ == "__main__":
    demo.launch()
|