Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Improve Evaluate tab
Browse files- yourbench_space/app.py +19 -10
- yourbench_space/utils.py +11 -0
yourbench_space/app.py
CHANGED
|
@@ -12,9 +12,11 @@ from datasets import load_dataset
|
|
| 12 |
from huggingface_hub import whoami, HfApi
|
| 13 |
from yourbench_space import PATH
|
| 14 |
from yourbench_space.utils import (
|
|
|
|
| 15 |
STAGES,
|
| 16 |
SubprocessManagerGroup,
|
| 17 |
save_files,
|
|
|
|
| 18 |
update_dataset,
|
| 19 |
map_stage_names,
|
| 20 |
is_running_locally,
|
|
@@ -234,6 +236,12 @@ def init_session(profile: gr.OAuthProfile | None):
|
|
| 234 |
logger.info(f"Started session for {local_uuid}")
|
| 235 |
return gr.State(local_uuid, delete_callback=lambda uid: MANAGERS.remove(uid))
|
| 236 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
|
| 238 |
with gr.Blocks(theme=gr.themes.Default()) as app:
|
| 239 |
session_state = gr.State()
|
|
@@ -349,14 +357,18 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
|
|
| 349 |
|
| 350 |
with gr.Accordion("Lighteval Preview"):
|
| 351 |
lighteval_df = gr.DataFrame()
|
| 352 |
-
|
| 353 |
stages_table.change(
|
| 354 |
update_dataset,
|
| 355 |
inputs=[stages_table, hf_org_dropdown, hf_dataset_name],
|
| 356 |
outputs=[ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df],
|
| 357 |
)
|
| 358 |
|
| 359 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
# TODO: this timer should only become active once the second tab has been activated for the first time
|
| 361 |
log_timer = gr.Timer(1.0, active=True)
|
| 362 |
log_timer.tick(
|
|
@@ -365,20 +377,16 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
|
|
| 365 |
outputs=[log_output, stages_table],
|
| 366 |
)
|
| 367 |
|
| 368 |
-
# with gr.Tab("Evaluate", id=2):
|
| 369 |
-
# with gr.Row():
|
| 370 |
-
# btn_launch_evals = gr.Button("Launch evaluations")
|
| 371 |
-
# status = gr.Textbox(label="Status")
|
| 372 |
-
# btn_launch_evals.click(run_evaluation_pipeline, [hf_org_dropdown, hf_dataset_name, gr.State("lighteval")], status)
|
| 373 |
-
|
| 374 |
with gr.Tab("Evaluate", id=2):
|
| 375 |
with gr.Column():
|
| 376 |
gr.Markdown("### 🧪 Run YourBench Evaluation")
|
| 377 |
gr.Markdown("Run the full evaluation pipeline on the uploaded dataset. This includes computing metrics, creating the leaderboard, and pushing results.")
|
| 378 |
|
| 379 |
with gr.Row():
|
| 380 |
-
|
| 381 |
-
|
|
|
|
|
|
|
| 382 |
|
| 383 |
with gr.Accordion("Evaluation Log", open=True):
|
| 384 |
eval_status = gr.Textbox(label="", lines=6, interactive=False, show_label=False)
|
|
@@ -390,6 +398,7 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
|
|
| 390 |
)
|
| 391 |
clear_status_btn.click(lambda: "", outputs=eval_status)
|
| 392 |
|
|
|
|
| 393 |
app.load(init_session, outputs=session_state)
|
| 394 |
|
| 395 |
app.launch(allowed_paths=[PATH])
|
|
|
|
| 12 |
from huggingface_hub import whoami, HfApi
|
| 13 |
from yourbench_space import PATH
|
| 14 |
from yourbench_space.utils import (
|
| 15 |
+
STAGE_DISPLAY_MAP,
|
| 16 |
STAGES,
|
| 17 |
SubprocessManagerGroup,
|
| 18 |
save_files,
|
| 19 |
+
on_generation_succsess,
|
| 20 |
update_dataset,
|
| 21 |
map_stage_names,
|
| 22 |
is_running_locally,
|
|
|
|
| 236 |
logger.info(f"Started session for {local_uuid}")
|
| 237 |
return gr.State(local_uuid, delete_callback=lambda uid: MANAGERS.remove(uid))
|
| 238 |
|
| 239 |
+
btn_launch_evals = gr.Button(
|
| 240 |
+
"🚀 Launch Evaluation",
|
| 241 |
+
visible=True,
|
| 242 |
+
interactive=False, # Start non-interactive
|
| 243 |
+
variant="primary"
|
| 244 |
+
)
|
| 245 |
|
| 246 |
with gr.Blocks(theme=gr.themes.Default()) as app:
|
| 247 |
session_state = gr.State()
|
|
|
|
| 357 |
|
| 358 |
with gr.Accordion("Lighteval Preview"):
|
| 359 |
lighteval_df = gr.DataFrame()
|
|
|
|
| 360 |
stages_table.change(
|
| 361 |
update_dataset,
|
| 362 |
inputs=[stages_table, hf_org_dropdown, hf_dataset_name],
|
| 363 |
outputs=[ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df],
|
| 364 |
)
|
| 365 |
|
| 366 |
+
stages_table.change(
|
| 367 |
+
on_generation_succsess,
|
| 368 |
+
inputs=stages_table,
|
| 369 |
+
outputs=[tabs,btn_launch_evals],
|
| 370 |
+
)
|
| 371 |
+
|
| 372 |
# TODO: this timer should only become active once the second tab has been activated for the first time
|
| 373 |
log_timer = gr.Timer(1.0, active=True)
|
| 374 |
log_timer.tick(
|
|
|
|
| 377 |
outputs=[log_output, stages_table],
|
| 378 |
)
|
| 379 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
with gr.Tab("Evaluate", id=2):
|
| 381 |
with gr.Column():
|
| 382 |
gr.Markdown("### 🧪 Run YourBench Evaluation")
|
| 383 |
gr.Markdown("Run the full evaluation pipeline on the uploaded dataset. This includes computing metrics, creating the leaderboard, and pushing results.")
|
| 384 |
|
| 385 |
with gr.Row():
|
| 386 |
+
with gr.Column():
|
| 387 |
+
btn_launch_evals.render()
|
| 388 |
+
with gr.Column():
|
| 389 |
+
clear_status_btn = gr.Button("Clear", variant="secondary")
|
| 390 |
|
| 391 |
with gr.Accordion("Evaluation Log", open=True):
|
| 392 |
eval_status = gr.Textbox(label="", lines=6, interactive=False, show_label=False)
|
|
|
|
| 398 |
)
|
| 399 |
clear_status_btn.click(lambda: "", outputs=eval_status)
|
| 400 |
|
| 401 |
+
|
| 402 |
app.load(init_session, outputs=session_state)
|
| 403 |
|
| 404 |
app.launch(allowed_paths=[PATH])
|
yourbench_space/utils.py
CHANGED
|
@@ -129,6 +129,17 @@ def update_dataset(stages: list, hf_org: str, hf_prefix: str, oauth_token: gr.OA
|
|
| 129 |
|
| 130 |
return (ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df)
|
| 131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
class SubprocessManagerGroup:
|
| 134 |
"""Instanciates one manager per user (should be used as a singleton class)"""
|
|
|
|
| 129 |
|
| 130 |
return (ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df)
|
| 131 |
|
| 132 |
+
def should_enable_eval_tab(stages):
    """Return whether the Lighteval stage is among the given stages.

    Args:
        stages: Collection of stage display names. ``None`` is accepted and
            treated as "no stages", mirroring how ``on_generation_succsess``
            tolerates a missing value.

    Returns:
        ``True`` when ``STAGE_DISPLAY_MAP["lighteval"]`` is contained in
        ``stages``, ``False`` otherwise (including when ``stages`` is ``None``).
    """
    lighteval_stage = STAGE_DISPLAY_MAP["lighteval"]
    logger.info(f"Stages received: {stages}")
    logger.info(f"Lighteval stage name: {lighteval_stage}")
    # Guard explicitly against None: `x in None` raises TypeError, and an
    # identity check avoids relying on the truthiness of the container.
    return stages is not None and lighteval_stage in stages
|
| 136 |
+
|
| 137 |
+
def on_generation_succsess(stages):
    """Build the UI updates to apply after the stage list changes.

    A falsy ``stages`` value (``None`` or empty) is treated as an empty list.

    Args:
        stages: Stage display names; may be ``None``.

    Returns:
        A pair of ``gr.update`` objects. When ``STAGE_DISPLAY_MAP["lighteval"]``
        is present in ``stages``, a success toast is raised, the first update
        carries ``selected=2`` and the second makes its target interactive.
        Otherwise the first update is empty and the second keeps its target
        visible but non-interactive.
    """
    completed_stages = stages if stages else []
    lighteval_label = STAGE_DISPLAY_MAP["lighteval"]

    # Guard clause: generation has not produced the Lighteval stage yet.
    if lighteval_label not in completed_stages:
        return gr.update(), gr.update(interactive=False, visible=True)

    gr.Success("🌟 Your Dataset is ready for evaluation!")
    tab_update = gr.update(selected=2)
    button_update = gr.update(interactive=True, visible=True)
    return tab_update, button_update
|
| 143 |
|
| 144 |
class SubprocessManagerGroup:
|
| 145 |
"""Instanciates one manager per user (should be used as a singleton class)"""
|