comfortably-dumb committed on
Commit
7af1601
·
1 Parent(s): 011bfa0

Added description

Browse files
Files changed (1) hide show
  1. app.py +66 -21
app.py CHANGED
@@ -110,45 +110,90 @@ with gr.Blocks(title="DedeuceBench Leaderboard") as demo:
110
  gr.Markdown(
111
  """
112
  # DedeuceBench Leaderboard
113
- Source: `{repo}`
114
 
115
- - Use the Subset dropdown to switch between Lite(sanity), easy, and medium.
116
- - Columns shown: model, provider, model_id, Score100, TokensTotal.
117
- - Click Refresh to pull the latest runs from the results dataset.
 
 
 
 
118
  """.format(repo=RESULTS_REPO)
119
  )
120
  with gr.Row():
121
  subset = gr.Dropdown(
122
  label="Subset",
123
- choices=["Lite(sanity)", "easy", "medium"],
124
- value="Lite(sanity)",
125
  )
126
  refresh = gr.Button("Refresh")
127
 
128
- # Initial load for table value
129
- _initial_df = filter_by_subset(load_runs_with_subset(), "Lite(sanity)")
130
- table = gr.Dataframe(value=_initial_df, interactive=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  def do_refresh(sub: str):
 
 
 
 
 
 
 
 
133
  df = load_runs_with_subset()
134
  # Fallback to top-level if runs empty (no subset info)
135
  if df is None or df.empty:
136
- base = _load_top_leaderboard()
137
- base = _derive_provider_model_id(base)
138
- keep = ["model", "provider", "model_id", "Score100", "TokensTotal"]
139
- for col in keep:
140
- if col not in base.columns:
141
- base[col] = None
142
- base = base[keep]
143
- base = base.sort_values(by=["Score100"], ascending=False, na_position="last")
144
- base.reset_index(drop=True, inplace=True)
145
- return base
146
- return filter_by_subset(df, sub)
147
 
148
  subset.change(fn=do_refresh, inputs=[subset], outputs=[table])
149
  refresh.click(fn=do_refresh, inputs=[subset], outputs=[table])
150
 
151
- # No explicit initial setter; initial value is provided above
 
152
 
153
 
154
  if __name__ == "__main__":
 
110
  gr.Markdown(
111
  """
112
  # DedeuceBench Leaderboard
 
113
 
114
+ A compact benchmark for agentic system identification under a strict query budget. Each episode is a hidden Mealy machine (finite‑state transducer). Agents must actively probe using a tiny tool API and then submit an exact transition table. We measure success, safety, and efficiency. Mealy machines appear in many interactive systems — protocols, user interfaces, embedded controllers — making them a simple, general substrate for research on sample‑efficient probing and safe exploration.
115
+
116
+ Source dataset: `{repo}`
117
+
118
+ - Use the Subset dropdown to switch between Lite (sanity), Easy, and Medium.
119
+ - Columns: Model, Score, Provider, Model ID, Tokens Total.
120
+ - Score = 100 × Success@Budget (formatted to two decimals).
121
  """.format(repo=RESULTS_REPO)
122
  )
123
  with gr.Row():
124
  subset = gr.Dropdown(
125
  label="Subset",
126
+ choices=["Lite (sanity)", "Easy", "Medium"],
127
+ value="Lite (sanity)",
128
  )
129
  refresh = gr.Button("Refresh")
130
 
131
+ def _format_for_display(df: pd.DataFrame) -> pd.DataFrame:
132
+ if df is None or df.empty:
133
+ return df
134
+ df = df.copy()
135
+ # Ensure numeric for sorting
136
+ if "Score100" in df.columns:
137
+ df["Score100"] = pd.to_numeric(df["Score100"], errors="coerce")
138
+ df = df.sort_values(by=["Score100"], ascending=False, na_position="last")
139
+ score_str = df["Score100"].map(lambda x: f"{x:.2f}" if pd.notnull(x) else "")
140
+ else:
141
+ score_str = []
142
+ # Rename columns for display
143
+ rename = {
144
+ "model": "Model",
145
+ "provider": "Provider",
146
+ "model_id": "Model ID",
147
+ "Score100": "Score",
148
+ "TokensTotal": "Tokens Total",
149
+ }
150
+ df = df.rename(columns=rename)
151
+ if "Score" in df.columns:
152
+ df["Score"] = score_str
153
+ # Reorder: Model, Score, Provider, Model ID, Tokens Total
154
+ keep = ["Model", "Score", "Provider", "Model ID", "Tokens Total"]
155
+ for col in keep:
156
+ if col not in df.columns:
157
+ df[col] = None
158
+ df = df[keep]
159
+ df.reset_index(drop=True, inplace=True)
160
+ return df
161
+
162
+ def _fallback_top_level() -> pd.DataFrame:
163
+ base = _load_top_leaderboard()
164
+ base = _derive_provider_model_id(base)
165
+ keep = ["model", "provider", "model_id", "Score100", "TokensTotal"]
166
+ for col in keep:
167
+ if col not in base.columns:
168
+ base[col] = None
169
+ base = base[keep]
170
+ return _format_for_display(base)
171
 
172
  def do_refresh(sub: str):
173
+ # Normalize subset label
174
+ label = (sub or "").strip().lower()
175
+ if label.startswith("lite"):
176
+ norm = "lite"
177
+ elif label.startswith("easy"):
178
+ norm = "easy"
179
+ else:
180
+ norm = "medium"
181
  df = load_runs_with_subset()
182
  # Fallback to top-level if runs empty (no subset info)
183
  if df is None or df.empty:
184
+ return _fallback_top_level()
185
+ filtered = filter_by_subset(df, norm)
186
+ return _format_for_display(filtered)
187
+
188
+ # Initial render: load current data for default subset
189
+ _initial_df = do_refresh("Lite (sanity)")
190
+ table = gr.Dataframe(value=_initial_df, interactive=False)
 
 
 
 
191
 
192
  subset.change(fn=do_refresh, inputs=[subset], outputs=[table])
193
  refresh.click(fn=do_refresh, inputs=[subset], outputs=[table])
194
 
195
+ # Auto-refresh on app load
196
+ demo.load(fn=do_refresh, inputs=[subset], outputs=[table])
197
 
198
 
199
  if __name__ == "__main__":