import os
from typing import List

import gradio as gr
import pandas as pd
from huggingface_hub import hf_hub_download, list_repo_files


RESULTS_REPO = os.environ.get("RESULTS_REPO", "comfortably-dumb/dedeucebench-results")
LEADERBOARD_FILE = os.environ.get("LEADERBOARD_FILE", "leaderboard.csv")
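# Both values can be overridden via environment variables before launch, e.g.
# (illustrative invocation): RESULTS_REPO=my-org/my-results LEADERBOARD_FILE=board.csv python app.py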


def _derive_provider_model_id(df: pd.DataFrame) -> pd.DataFrame:
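    # Split a combined "provider:model_id" value in the "model" column into
    # separate columns, e.g. "openai:gpt-4o" (illustrative) -> provider="openai",
    # model_id="gpt-4o". Values without a ":" keep an empty provider.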
    if df is None or df.empty:
        return df
    if "provider" in df.columns and "model_id" in df.columns:
        return df
    if "model" not in df.columns:
        return df
    prov, mid = [], []
    for m in df["model"]:
        if isinstance(m, str) and ":" in m:
            p, i = m.split(":", 1)
            prov.append(p)
            mid.append(i)
        else:
            prov.append("")
            mid.append(m)
    df = df.copy()
    df["provider"] = prov
    df["model_id"] = mid
    return df


def _load_top_leaderboard() -> pd.DataFrame:
    """Fallback: load the top-level leaderboard.csv (no subset info)."""
    path = hf_hub_download(
        repo_id=RESULTS_REPO, repo_type="dataset", filename=LEADERBOARD_FILE
    )
    try:
        df = pd.read_csv(path)
    except Exception:
        df = pd.read_csv(path, on_bad_lines="skip")
    return _derive_provider_model_id(df)


def _list_run_csvs() -> List[str]:
    files = list_repo_files(repo_id=RESULTS_REPO, repo_type="dataset")
    # Expect pattern: runs/YYYY-MM-DD/<route>.<subset>/leaderboard.<route>.<subset>.csv
    return [
        f
        for f in files
        if f.startswith("runs/") and f.endswith(".csv") and "/leaderboard." in f
    ]


def _subset_from_path(path: str) -> str:
    # runs/YYYY-MM-DD/<route>.<subset>/leaderboard.<route>.<subset>.csv
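    # e.g. "runs/2025-01-01/default.easy/leaderboard.default.easy.csv"
    # (illustrative names) yields "easy".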
    try:
        folder = path.split("/", 3)[2]  # <route>.<subset>
    except Exception:
        folder = ""
    if "." in folder:
        return folder.split(".")[-1].strip()
    return ""


def load_runs_with_subset() -> pd.DataFrame:
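    # Download every per-run leaderboard CSV, tag its rows with the subset
    # parsed from the folder name, and concatenate them into a single frame.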
    rows: List[pd.DataFrame] = []
    for f in _list_run_csvs():
        try:
            local = hf_hub_download(repo_id=RESULTS_REPO, repo_type="dataset", filename=f)
            df = pd.read_csv(local)
            if df is None or df.empty:
                continue
            df = _derive_provider_model_id(df)
            df = df.copy()
            df["subset"] = _subset_from_path(f)
            rows.append(df)
        except Exception:
            continue
    if not rows:
        return pd.DataFrame(columns=["model", "provider", "model_id", "Score100", "TokensTotal", "subset"])
    out = pd.concat(rows, ignore_index=True)
    # Coerce numeric
    for col in ("Score100", "TokensTotal"):
        if col in out.columns:
            out[col] = pd.to_numeric(out[col], errors="coerce")
    # Column selection and ordering happen downstream in filter_by_subset
    return out


def filter_by_subset(df: pd.DataFrame, subset_label: str) -> pd.DataFrame:
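    # Keep only rows whose subset matches the label (case-insensitive) and
    # return the essential columns sorted by Score100, best first.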
    if df is None or df.empty:
        return df
    key = subset_label.lower()
    if key.startswith("lite"):
        key = "lite"
    out = df[df["subset"].astype(str).str.lower() == key].copy()
    # Select only requested columns
    keep = ["model", "provider", "model_id", "Score100", "TokensTotal"]
    for col in keep:
        if col not in out.columns:
            out[col] = None
    out = out[keep]
    out = out.sort_values(by=["Score100"], ascending=False, na_position="last")
    out.reset_index(drop=True, inplace=True)
    return out


with gr.Blocks(title="DedeuceBench Leaderboard") as demo:
    gr.Markdown(
        """
        # DedeuceBench Leaderboard

        A compact benchmark for agentic system identification under a strict query budget. Each episode is a hidden Mealy machine (finite‑state transducer). Agents must actively probe using a tiny tool API and then submit an exact transition table. We measure success, safety, and efficiency. Mealy machines appear in many interactive systems — protocols, user interfaces, embedded controllers — making them a simple, general substrate for research on sample‑efficient probing and safe exploration.

        Source dataset: `{repo}`

        - Use the Subset dropdown to switch between Lite (sanity), Easy, and Medium.
        - Columns: Model, Score, Provider, Model ID, Tokens Total.
        - Score = 100 × Success@Budget (formatted to two decimals).
        """.format(repo=RESULTS_REPO)
    )
    with gr.Row():
        subset = gr.Dropdown(
            label="Subset",
            choices=["Lite (sanity)", "Easy", "Medium"],
            value="Lite (sanity)",
        )
        refresh = gr.Button("Refresh")

    def _format_for_display(df: pd.DataFrame) -> pd.DataFrame:
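        # Sort by numeric Score100, render scores with two decimals, and rename
        # columns to the display names shown in the table.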
        if df is None or df.empty:
            return df
        df = df.copy()
        # Ensure numeric for sorting
        if "Score100" in df.columns:
            df["Score100"] = pd.to_numeric(df["Score100"], errors="coerce")
            df = df.sort_values(by=["Score100"], ascending=False, na_position="last")
            score_str = df["Score100"].map(lambda x: f"{x:.2f}" if pd.notnull(x) else "")
        else:
            score_str = []
        # Rename columns for display
        rename = {
            "model": "Model",
            "provider": "Provider",
            "model_id": "Model ID",
            "Score100": "Score",
            "TokensTotal": "Tokens Total",
        }
        df = df.rename(columns=rename)
        if "Score" in df.columns:
            df["Score"] = score_str
        # Reorder: Model, Score, Provider, Model ID, Tokens Total
        keep = ["Model", "Score", "Provider", "Model ID", "Tokens Total"]
        for col in keep:
            if col not in df.columns:
                df[col] = None
        df = df[keep]
        df.reset_index(drop=True, inplace=True)
        return df

    def _fallback_top_level() -> pd.DataFrame:
        base = _load_top_leaderboard()
        base = _derive_provider_model_id(base)
        keep = ["model", "provider", "model_id", "Score100", "TokensTotal"]
        for col in keep:
            if col not in base.columns:
                base[col] = None
        base = base[keep]
        return _format_for_display(base)

    def do_refresh(sub: str):
        # Normalize subset label
        label = (sub or "").strip().lower()
        if label.startswith("lite"):
            norm = "lite"
        elif label.startswith("easy"):
            norm = "easy"
        else:
            norm = "medium"
        df = load_runs_with_subset()
        # Fallback to top-level if runs empty (no subset info)
        if df is None or df.empty:
            return _fallback_top_level()
        filtered = filter_by_subset(df, norm)
        return _format_for_display(filtered)

    # Initial render: load current data for default subset
    _initial_df = do_refresh("Lite (sanity)")
    table = gr.Dataframe(value=_initial_df, interactive=False)

    subset.change(fn=do_refresh, inputs=[subset], outputs=[table])
    refresh.click(fn=do_refresh, inputs=[subset], outputs=[table])

    # Auto-refresh on app load
    demo.load(fn=do_refresh, inputs=[subset], outputs=[table])


if __name__ == "__main__":
    demo.launch()