Commit
·
7af1601
1
Parent(s):
011bfa0
Added description
Browse files
app.py
CHANGED
|
@@ -110,45 +110,90 @@ with gr.Blocks(title="DedeuceBench Leaderboard") as demo:
|
|
| 110 |
gr.Markdown(
|
| 111 |
"""
|
| 112 |
# DedeuceBench Leaderboard
|
| 113 |
-
Source: `{repo}`
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
""".format(repo=RESULTS_REPO)
|
| 119 |
)
|
| 120 |
with gr.Row():
|
| 121 |
subset = gr.Dropdown(
|
| 122 |
label="Subset",
|
| 123 |
-
choices=["Lite(sanity)", "
|
| 124 |
-
value="Lite(sanity)",
|
| 125 |
)
|
| 126 |
refresh = gr.Button("Refresh")
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
def do_refresh(sub: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
df = load_runs_with_subset()
|
| 134 |
# Fallback to top-level if runs empty (no subset info)
|
| 135 |
if df is None or df.empty:
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
base = base.sort_values(by=["Score100"], ascending=False, na_position="last")
|
| 144 |
-
base.reset_index(drop=True, inplace=True)
|
| 145 |
-
return base
|
| 146 |
-
return filter_by_subset(df, sub)
|
| 147 |
|
| 148 |
subset.change(fn=do_refresh, inputs=[subset], outputs=[table])
|
| 149 |
refresh.click(fn=do_refresh, inputs=[subset], outputs=[table])
|
| 150 |
|
| 151 |
-
#
|
|
|
|
| 152 |
|
| 153 |
|
| 154 |
if __name__ == "__main__":
|
|
|
|
| 110 |
gr.Markdown(
|
| 111 |
"""
|
| 112 |
# DedeuceBench Leaderboard
|
|
|
|
| 113 |
|
| 114 |
+
A compact benchmark for agentic system identification under a strict query budget. Each episode is a hidden Mealy machine (finite‑state transducer). Agents must actively probe using a tiny tool API and then submit an exact transition table. We measure success, safety, and efficiency. Mealy machines appear in many interactive systems — protocols, user interfaces, embedded controllers — making them a simple, general substrate for research on sample‑efficient probing and safe exploration.
|
| 115 |
+
|
| 116 |
+
Source dataset: `{repo}`
|
| 117 |
+
|
| 118 |
+
- Use the Subset dropdown to switch between Lite (sanity), Easy, and Medium.
|
| 119 |
+
- Columns: Model, Score, Provider, Model ID, Tokens Total.
|
| 120 |
+
- Score = 100 × Success@Budget (formatted to two decimals).
|
| 121 |
""".format(repo=RESULTS_REPO)
|
| 122 |
)
|
| 123 |
with gr.Row():
|
| 124 |
subset = gr.Dropdown(
|
| 125 |
label="Subset",
|
| 126 |
+
choices=["Lite (sanity)", "Easy", "Medium"],
|
| 127 |
+
value="Lite (sanity)",
|
| 128 |
)
|
| 129 |
refresh = gr.Button("Refresh")
|
| 130 |
|
| 131 |
+
def _format_for_display(df: pd.DataFrame) -> pd.DataFrame:
|
| 132 |
+
if df is None or df.empty:
|
| 133 |
+
return df
|
| 134 |
+
df = df.copy()
|
| 135 |
+
# Ensure numeric for sorting
|
| 136 |
+
if "Score100" in df.columns:
|
| 137 |
+
df["Score100"] = pd.to_numeric(df["Score100"], errors="coerce")
|
| 138 |
+
df = df.sort_values(by=["Score100"], ascending=False, na_position="last")
|
| 139 |
+
score_str = df["Score100"].map(lambda x: f"{x:.2f}" if pd.notnull(x) else "")
|
| 140 |
+
else:
|
| 141 |
+
score_str = []
|
| 142 |
+
# Rename columns for display
|
| 143 |
+
rename = {
|
| 144 |
+
"model": "Model",
|
| 145 |
+
"provider": "Provider",
|
| 146 |
+
"model_id": "Model ID",
|
| 147 |
+
"Score100": "Score",
|
| 148 |
+
"TokensTotal": "Tokens Total",
|
| 149 |
+
}
|
| 150 |
+
df = df.rename(columns=rename)
|
| 151 |
+
if "Score" in df.columns:
|
| 152 |
+
df["Score"] = score_str
|
| 153 |
+
# Reorder: Model, Score, Provider, Model ID, Tokens Total
|
| 154 |
+
keep = ["Model", "Score", "Provider", "Model ID", "Tokens Total"]
|
| 155 |
+
for col in keep:
|
| 156 |
+
if col not in df.columns:
|
| 157 |
+
df[col] = None
|
| 158 |
+
df = df[keep]
|
| 159 |
+
df.reset_index(drop=True, inplace=True)
|
| 160 |
+
return df
|
| 161 |
+
|
| 162 |
+
def _fallback_top_level() -> pd.DataFrame:
    """Build the display table from the top-level leaderboard.

    Loads the top-level leaderboard, passes it through
    ``_derive_provider_model_id``, makes sure the five raw columns exist
    (absent ones are added and filled with ``None``), narrows the frame to
    those columns and hands it to ``_format_for_display``.
    """
    raw_columns = ["model", "provider", "model_id", "Score100", "TokensTotal"]
    board = _derive_provider_model_id(_load_top_leaderboard())
    absent = [name for name in raw_columns if name not in board.columns]
    for name in absent:
        board[name] = None
    return _format_for_display(board[raw_columns])
|
| 171 |
|
| 172 |
def do_refresh(sub: str):
    """Return the leaderboard table for the subset chosen in the dropdown.

    ``sub`` is the dropdown label. It is stripped and lower-cased, then
    mapped to a subset key by prefix: labels starting with "lite" map to
    "lite", labels starting with "easy" map to "easy", and every other
    label (including an empty or missing one) maps to "medium".

    When the per-run data is missing or empty, the top-level leaderboard is
    shown instead; otherwise the runs are filtered by the subset key and
    formatted for display.
    """
    choice = (sub or "").strip().lower()
    norm = "medium"
    for prefix in ("lite", "easy"):
        if choice.startswith(prefix):
            norm = prefix
            break

    runs = load_runs_with_subset()
    # No per-run data means no subset info, so use the top-level board.
    if runs is None or runs.empty:
        return _fallback_top_level()
    return _format_for_display(filter_by_subset(runs, norm))
|
| 187 |
+
|
| 188 |
+
# Initial render: load current data for default subset
|
| 189 |
+
_initial_df = do_refresh("Lite (sanity)")
|
| 190 |
+
table = gr.Dataframe(value=_initial_df, interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
subset.change(fn=do_refresh, inputs=[subset], outputs=[table])
|
| 193 |
refresh.click(fn=do_refresh, inputs=[subset], outputs=[table])
|
| 194 |
|
| 195 |
+
# Auto-refresh on app load
|
| 196 |
+
demo.load(fn=do_refresh, inputs=[subset], outputs=[table])
|
| 197 |
|
| 198 |
|
| 199 |
if __name__ == "__main__":
|