Alina Lozovskaya committed · Commit ea047ad · Parent: 2617bee

Update Setup and Run Generatation tabs

Files changed:
- yourbench_space/app.py        +125 -77
- yourbench_space/config.py      +33 -37
- yourbench_space/evaluation.py  +34 -20
- yourbench_space/utils.py       +94 -43
yourbench_space/app.py CHANGED

@@ -1,32 +1,33 @@
-import asyncio
 import os
 import sys
 import time
-import gradio as gr
 import uuid
+import asyncio
+from pathlib import Path
 
-from datasets import load_dataset
-from huggingface_hub import whoami
 from loguru import logger
-from pathlib import Path
 
-
+import gradio as gr
+from datasets import load_dataset
+from huggingface_hub import whoami
 from yourbench_space.utils import (
+    STAGES,
     SubprocessManagerGroup,
     save_files,
     update_dataset,
-    …
-    is_running_locally
+    map_stage_names,
+    is_running_locally,
 )
-from yourbench_space.…
-from yourbench_space.…
+from yourbench_space.config import generate_and_save_config
+from yourbench_space.evaluation import run_evaluations, create_eval_file
+
 
 project_description = """
-# YourBench 🚀
+# YourBench 🚀
 **Dynamic Benchmark Generation for Language Models**
 
 Quickly create zero-shot benchmarks from your documents – keeping models accurate and adaptable
-- 📖 [FAQ](#)
+- 📖 [FAQ](#)
 - 💻 [GitHub](https://github.com/huggingface/yourbench/tree/v0.2-alpha-space)
 """

@@ -35,7 +36,7 @@ logger.add(sys.stderr, level="INFO")
 
 # Global to store all managers per session
 MANAGERS = SubprocessManagerGroup()
-USER_ID_SESSION_MAP: dict[str, str] = …
+USER_ID_SESSION_MAP: dict[str, str] = {}
 
 
 docs_path = Path(__file__).parent / "docs.md"

@@ -45,30 +46,36 @@ citation_content = (
     else "# Citation\n\nDocumentation file not found."
 )
 
+
 def generate_and_return(hf_org, hf_dataset_name, session_state: gr.State):
     manager = MANAGERS.get(session_state)
-    if manager is None:
+    if manager is None:  # should not be possible
         return (
-            …
+            "❌ Config generation failed.",
+            gr.update(visible=False, interactive=False),
+        )
+
     session_uid = session_state.value
     config_path = generate_and_save_config(hf_org, hf_dataset_name, session_uid, manager.config_path)
     for _ in range(5):
         time.sleep(0.5)
         if config_path.exists():
+            gr.Success("Config generated")
             return (
                 "✅ Config saved!",
                 gr.update(value=str(config_path), visible=True, interactive=True),
             )
+
+    gr.Error("Failed to generate config")
     return (
         "❌ Config generation failed.",
         gr.update(visible=False, interactive=False),
     )
 
+
 final_dataset = None
 
+
 def update_process_status(session_state: gr.State):
     """Update process status and include exit details if process has terminated"""
     if session_state is None:

@@ -79,17 +86,22 @@ def update_process_status(session_state: gr.State):
         return gr.update(value=False, label="Not running")
 
     is_running = manager.is_running()
+
     if not is_running:
         exit_code, exit_reason = manager.get_exit_details()
-        status_text = …
+        status_text = (
+            f"Process Status: Stopped - {exit_reason}, exit code - {exit_code}"
+            if exit_reason
+            else "Process Status: Stopped"
+        )
         return gr.update(value=False, label=status_text)
+
     return gr.update(value=True, label="Process Status: Running")
 
+
 def prepare_task(session_uid: str, oauth_token: gr.OAuthToken | None, hf_dataset_name: str, _=None):
     if oauth_token is None and not is_running_locally():
-        gr.Warning(…)
+        gr.Warning("You need to log in to use this Space")
         return
     new_env = os.environ.copy()
 

@@ -122,6 +134,7 @@ def switch_to_run_generation_tab():
 def enable_button(files):
     return gr.update(interactive=bool(files))
 
+
 def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_name):
     # Test dataset existence
     eval_ds_name = f"{org_name}/{eval_name}"

@@ -136,13 +149,29 @@ def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_na
     status = asyncio.run(run_evaluations(eval_ds_name=eval_ds_name, org=org_name))
     # Create space
     from huggingface_hub import HfApi
+
     repo_id = f"{org_name}/leaderboard_yourbench_{eval_ds_name.replace('/', '_')}"
     api = HfApi()
 
     try:
-        api.create_repo(…
-        …
+        api.create_repo(
+            repo_id=repo_id,
+            repo_type="space",
+            space_sdk="gradio",
+            token=oauth_token.token,
+        )
+        api.upload_folder(
+            repo_id=repo_id,
+            repo_type="space",
+            folder_path="src/",
+            token=oauth_token.token,
+        )
+        api.add_space_secret(
+            repo_id=repo_id,
+            key="HF_TOKEN",
+            value=oauth_token.token,
+            token=oauth_token.token,
+        )
         api.add_space_variable(repo_id=repo_id, key="TASK", value=eval_ds_name, token=oauth_token.token)
         api.add_space_variable(repo_id=repo_id, key="ORG_NAME", value=org_name, token=oauth_token.token)
     except Exception as e:

@@ -179,8 +208,6 @@ def init_session(profile: gr.OAuthProfile | None):
 
 
 with gr.Blocks(theme=gr.themes.Default()) as app:
-    # We initialize the session state with the user randomly generated uuid
-    # Using uuid4 makes collision cases extremely unlikely even for concurrent users
     session_state = gr.State()
 
     gr.Markdown(project_description)

@@ -190,12 +217,8 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
     with gr.Row():
         with gr.Accordion("Hugging Face Settings"):
             login_btn = gr.LoginButton()
-            hf_org_dropdown = gr.Dropdown(
-                …
-            )
-            app.load(
-                update_hf_org_dropdown, inputs=None, outputs=hf_org_dropdown
-            )
+            hf_org_dropdown = gr.Dropdown(choices=[], label="Organization", allow_custom_value=True)
+            app.load(update_hf_org_dropdown, inputs=None, outputs=hf_org_dropdown)
 
             hf_dataset_name = gr.Textbox(
                 label="Dataset name",

@@ -213,17 +236,36 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
             file_input.upload(
                 save_files,
                 inputs=[session_state, file_input],
-                outputs=…
+                outputs=output,
             )
+            delete_button = gr.Button("Delete Uploaded Files", visible=False)
 
             preview_button = gr.Button("Generate New Config", interactive=False)
             log_message = gr.Textbox(label="Log Message", visible=True)
-            download_button = gr.File(
-                …
+            download_button = gr.File(label="Download Config", visible=False, interactive=False)
+
+            file_input.change(
+                lambda files: gr.update(visible=bool(files)),
+                inputs=file_input,
+                outputs=delete_button,
             )
 
             file_input.change(enable_button, inputs=file_input, outputs=preview_button)
 
+            def clean_and_confirm(uid):
+                MANAGERS.clean_workdir(uid)
+                return (
+                    "Deleted all uploaded files.",
+                    gr.update(value=None),
+                    gr.update(interactive=False),
+                )
+
+            delete_button.click(
+                clean_and_confirm,
+                inputs=session_state,
+                outputs=[output, file_input, preview_button],
+            )
+
             preview_button.click(
                 generate_and_return,
                 inputs=[hf_org_dropdown, hf_dataset_name, session_state],

@@ -234,66 +276,72 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
                 inputs=None,
                 outputs=tabs,
             )
+
         with gr.Tab("Run Generation", id=1):
-            with gr.…
+            with gr.Column():
+                with gr.Row():
+                    start_button = gr.Button("Start Task")
+                    stop_button = gr.Button("Stop Task")
+                    kill_button = gr.Button("Kill Task")
 
+            start_button.click(prepare_task, inputs=[session_state, login_btn, hf_dataset_name])
             stop_button.click(MANAGERS.stop_process, inputs=session_state)
-
-            kill_button = gr.Button("Kill Task")
             kill_button.click(MANAGERS.kill_process, inputs=session_state)
 
-            with gr.Column():
-                with gr.Accordion("Log Output", open=True):
-                    log_output = gr.Code(language=None, lines=20, interactive=False)
-
-            process_status = gr.Checkbox(label="Process Status", interactive=False)
-            status_timer = gr.Timer(2.0, active=True)
-            status_timer.tick(update_process_status, inputs=session_state, outputs=process_status)
-
-            with gr.Column():
+            process_status = gr.Checkbox(label="Process Status", interactive=False)
+            status_timer = gr.Timer(2.0, active=True)
+            status_timer.tick(update_process_status, inputs=session_state, outputs=process_status)
+
+            with gr.Row():
                 with gr.Accordion("Stages", open=True):
                     stages_table = gr.CheckboxGroup(
-                        choices=STAGES,
+                        choices=map_stage_names(STAGES),
                         value=[],
                         label="Pipeline Stages Completed",
+                        container=False,
                         interactive=False,
                     )
 
-            …
+            with gr.Row():
+                with gr.Column(scale=2):
+                    with gr.Accordion("Ingestion Preview"):
+                        ingestion_df = gr.DataFrame()
+
+                    with gr.Accordion("Summarization Preview"):
+                        summarization_df = gr.DataFrame()
+
+                    with gr.Accordion("Single Shot Preview"):
+                        single_shot_df = gr.DataFrame()
+
+                    with gr.Accordion("Multi Hop Preview"):
+                        multi_hop_df = gr.DataFrame()
+
+                    with gr.Accordion("Lighteval Preview"):
+                        lighteval_df = gr.DataFrame()
+
+            stages_table.change(
+                update_dataset,
+                inputs=[stages_table, hf_org_dropdown, hf_dataset_name],
+                outputs=[ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df],
+            )
+
+            with gr.Accordion("Log Output", open=False):
+                log_output = gr.Code(language=None, lines=20, interactive=False)
 
             # TODO: this timer should only be active when the second tab is passed to active for the first time
             log_timer = gr.Timer(1.0, active=True)
             log_timer.tick(
-                MANAGERS.read_and_get_output, inputs=session_state, outputs=[log_output, stages_table]
+                MANAGERS.read_and_get_output,
+                inputs=session_state,
+                outputs=[log_output, stages_table],
             )
         with gr.Tab("Evaluate", id=2, visible=False):
             with gr.Row():
                 btn_launch_evals = gr.Button("Launch evaluations")
                 status = gr.Textbox(label="Status")
-
             btn_launch_evals.click(run_evaluation_pipeline, [hf_org_dropdown, hf_dataset_name], status)
 
     app.load(init_session, outputs=session_state)
 
-
-app.launch(allowed_paths=["/app"])
+app.launch(allowed_paths=["/home/user/app"])
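The reworked Run Generation tab is driven by two polling gr.Timer components: status_timer re-checks the subprocess every 2 s, and log_timer pulls fresh log output (and the completed-stage list) every 1 s. A minimal, self-contained sketch of that polling pattern, with a hypothetical poll_status() standing in for the per-session MANAGERS lookup:

import gradio as gr

def poll_status() -> str:
    # Stand-in for update_process_status(); the real callback consults the
    # session's SubprocessManager and returns a gr.update() for a checkbox.
    return "Process Status: Running"

with gr.Blocks() as demo:
    status_box = gr.Textbox(label="Status", interactive=False)
    timer = gr.Timer(2.0, active=True)  # fires every 2 seconds
    timer.tick(poll_status, inputs=None, outputs=status_box)  # refresh output on each tick

demo.launch()
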
yourbench_space/config.py CHANGED

@@ -7,13 +7,14 @@ def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
     return {
         "hf_configuration": {
             "token": "$HF_TOKEN",
-            "private": True,
             "hf_organization": hf_org,
+            "private": True,
             "hf_dataset_name": hf_dataset_name,
+            "concat_if_exist": False,
         },
         "model_list": [
             {
-                "model_name": "…
+                "model_name": "Qwen/Qwen2.5-VL-72B-Instruct",
                 "provider": "novita",
                 "max_concurrent_requests": 32,
             },

@@ -21,63 +22,59 @@ def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
                 "model_name": "Qwen/Qwen2.5-72B-Instruct",
                 "provider": "novita",
                 "max_concurrent_requests": 32,
-            }
+            },
         ],
         "model_roles": {
-            "ingestion": ["…
+            "ingestion": ["Qwen/Qwen2.5-VL-72B-Instruct"],
             "summarization": ["Qwen/Qwen2.5-72B-Instruct"],
-            "…
-            "…
-            "…
-            "judge_answers": ["meta-llama/Llama-3.3-70B-Instruct"],
+            "chunking": ["intfloat/multilingual-e5-large-instruct"],
+            "single_shot_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],
+            "multi_hop_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],
         },
         "pipeline": {
             "ingestion": {
-                "source_documents_dir": f"/app/{session_uid}/uploaded_files/",
-                "output_dir": f"/app/{session_uid}/ingested",
+                "source_documents_dir": f"/home/user/app/{session_uid}/uploaded_files/",
+                "output_dir": f"/home/user/app/{session_uid}/ingested",
                 "run": True,
             },
             "upload_ingest_to_hub": {
-                "source_documents_dir": f"/app/{session_uid}/ingested",
+                "source_documents_dir": f"/home/user/app/{session_uid}/ingested",
                 "run": True,
             },
-            "summarization": {"run": True},
+            "summarization": {
+                "run": True,
+            },
             "chunking": {
+                "run": True,
                 "chunking_configuration": {
                     "l_min_tokens": 64,
                     "l_max_tokens": 128,
-                    "tau_threshold": 0.…
+                    "tau_threshold": 0.8,
                     "h_min": 2,
-                    "h_max": …
+                    "h_max": 5,
+                    "num_multihops_factor": 2,
                 },
-                "run": True,
             },
             "single_shot_question_generation": {
-                "diversification_seed": "24 year old adult",
                 "run": True,
+                "additional_instructions": "Generate questions to test a curious adult",
+                "chunk_sampling": {
+                    "mode": "count",
+                    "value": 5,
+                    "random_seed": 123,
+                },
             },
-            "multi_hop_question_generation": {
-                "answer_generation": {
-                    "question_type": "single_shot",
-                    "run": True,
-                    …
-                    {
-                        "name": "gold",
-                        "prompt": "GOLD_QA_USER_PROMPT",
-                        "model_name": "meta-llama/Llama-3.3-70B-Instruct",
-                    },
-                ],
+            "multi_hop_question_generation": {
+                "run": True,
+                "additional_instructions": "Generate questions to test a curious adult",
+                "chunk_sampling": {
+                    "mode": "percentage",
+                    "value": 0.3,
+                    "random_seed": 42,
+                },
             },
-            "…
-                "run": …
-                "comparing_strategies": [["zeroshot", "gold"]],
-                "chunk_column_index": 0,
-                "random_seed": 42,
+            "lighteval": {
+                "run": True,
             },
         },
     }

@@ -97,4 +94,3 @@ def generate_and_save_config(hf_org: str, hf_name: str, session_uid: str, config
     file_path = save_yaml_file(config, config_path)
     logger.success(f"Config saved at: {file_path}")
     return file_path
-
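The two question-generation stages added above each carry a "chunk_sampling" block: "count" mode keeps a fixed number of chunks (5 here), while "percentage" mode keeps a fraction of them (0.3 here), both under a fixed random seed. The sampler itself lives in yourbench and is not part of this diff; a hypothetical helper that mirrors those semantics, purely for illustration:

import random

def sample_chunks(chunks: list, mode: str, value: float, random_seed: int) -> list:
    # Hypothetical mirror of the "chunk_sampling" config block; the actual
    # yourbench behaviour may differ (e.g. in rounding or ordering).
    rng = random.Random(random_seed)
    k = int(value) if mode == "count" else max(1, round(len(chunks) * value))
    return rng.sample(chunks, min(k, len(chunks)))

print(sample_chunks(list(range(20)), mode="percentage", value=0.3, random_seed=42))  # ~6 of 20 chunks
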
yourbench_space/evaluation.py CHANGED

@@ -1,12 +1,17 @@
-import …
+import os
+import asyncio
+
 from yourbench_space.leaderboard_space.env import INIT_MODELS
 
-…
+
+ON_SPACES = os.environ.get("system") == "spaces"
 OUTPUT_DIR = "/data" if ON_SPACES else "."
 
+
 def create_eval_file(eval_ds_name):
     # TODO: replace by Nathan's call
-    content = …
+    content = (
+        """
 from aenum import extend_enum
 
 from lighteval.metrics.metrics import Metrics

@@ -31,10 +36,11 @@ def prompt_function(line, task_name: str = None):
     gold_index=0,
     specific={"question": line["question"]},
 )
-"""
+"""
+        + f"""
 
 hle = LightevalTaskConfig(
-    name="{eval_ds_name.replace(…
+    name="{eval_ds_name.replace("/", "_")}",
     suite=["custom"],
     prompt_function=prompt_function,
     hf_repo="{eval_ds_name}",

@@ -52,38 +58,46 @@ hle = LightevalTaskConfig(
 
 
 TASKS_TABLE = [hle]
-"""
-
+"""
+    )
+
     with open(f"{OUTPUT_DIR}/custom_task.py", "w") as f:
         f.write(content)
 
+
 async def run_process(args: list) -> dict:
     process = await asyncio.create_subprocess_exec(
-        *args,
-        stdout=asyncio.subprocess.PIPE,
-        stderr=asyncio.subprocess.PIPE
+        *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
     )
     await asyncio.wait_for(process.wait(), timeout=180)
     stdout = await process.stdout.read()
     stderr = await process.stderr.read()
-    return {
-        …
-        'stdout': stdout.decode(),
-        'stderr': stderr.decode()
-    }
+    return {"pid": process.pid, "stdout": stdout.decode(), "stderr": stderr.decode()}
+
 
 async def run_evaluations(eval_ds_name: str, org: str) -> list:
     tasks = []
     for model_name, provider in INIT_MODELS:
         args = [
-            "lighteval",
-            "endpoint",
-            …
+            "lighteval",
+            "endpoint",
+            "inference-providers",
+            f"model={model_name},provider={provider}",
+            f"custom|{eval_ds_name.replace('/', '_')}|0|0",
+            "--custom-tasks",
+            f"{OUTPUT_DIR}/custom_task.py",
+            "--max-samples",
+            "10",
+            "--output-dir",
+            f"{OUTPUT_DIR}",
+            "--save-details",
+            "--results-org",
+            org,
+            "--push-to-hub",
        ]
         tasks.append(run_process(args))
     # Will capture the task if failed
     processes = await asyncio.gather(*tasks, return_exceptions=True)
     if all(not isinstance(result, Exception) for result in processes):
         return "✅"
-    return "At least one model failed"
+    return "At least one model failed"
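run_evaluations() now builds one full lighteval command line per (model, provider) pair from INIT_MODELS and runs them all concurrently; with return_exceptions=True, a failed run comes back as an Exception object instead of raising, which is what the final all(...) check inspects. The same fan-out pattern in a self-contained sketch, with placeholder echo commands standing in for the real lighteval invocations:

import asyncio

async def run(args: list) -> dict:
    # Same shape as run_process() above: spawn, wait with a timeout, read pipes.
    proc = await asyncio.create_subprocess_exec(
        *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
    )
    await asyncio.wait_for(proc.wait(), timeout=180)
    stdout = await proc.stdout.read()
    stderr = await proc.stderr.read()
    return {"pid": proc.pid, "stdout": stdout.decode(), "stderr": stderr.decode()}

async def main() -> str:
    cmds = [["echo", "model-a"], ["echo", "model-b"]]  # placeholders for lighteval calls
    results = await asyncio.gather(*(run(c) for c in cmds), return_exceptions=True)
    return "✅" if all(not isinstance(r, Exception) for r in results) else "At least one model failed"

print(asyncio.run(main()))
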
yourbench_space/utils.py CHANGED

@@ -1,15 +1,17 @@
 import io
 import os
 import re
-import pathlib
 import shutil
+import pathlib
 import subprocess
-import …
+from typing import List, Union, Optional
+
 import pandas as pd
-from collections import defaultdict
-from datasets import load_dataset
 from loguru import logger
-…
+
+import gradio as gr
+from datasets import load_dataset
+
 
 STAGES = [
     "ingestion",

@@ -17,12 +19,25 @@ STAGES = [
     "summarization",
     "chunking",
     "single_shot_question_generation",
-    "…
-    #"create_leaderboard"
-    # "judge_answers", # to uncomment when fixed
+    "multi_hop_question_generation",
+    "lighteval",
 ]
 
+STAGE_DISPLAY_MAP = {
+    "ingestion": "Process Input Docs",
+    "upload_ingest_to_hub": "Upload Dataset to Hub",
+    "summarization": "Summarize Documents",
+    "chunking": "Chunk Documents",
+    "single_shot_question_generation": "Generate Single Shot Questions",
+    "multi_hop_question_generation": "Generate Multi Hop Questions",
+    "lighteval": "Generate Lighteval Subset",
+}
+
+
+def map_stage_names(stages: list[str]) -> list[str]:
+    return [STAGE_DISPLAY_MAP.get(stage, stage) for stage in stages]
+
+
 def is_running_locally() -> bool:
     """
     Returns True if Gradio is running locally, False if it's running in a Hugging Face Space.

@@ -33,7 +48,7 @@ def is_running_locally() -> bool:
 def save_files(oauth_token: gr.OAuthToken | None, session_state: gr.State, files: List[pathlib.Path]) -> str:
     """Save uploaded files to the UPLOAD_DIRECTORY/uuid safely"""
     if oauth_token is None and not is_running_locally():
-        gr.Warning(…)
+        gr.Warning("You need to log in to use this Space")
         return
 
     saved_paths = []

@@ -41,7 +56,7 @@ def save_files(oauth_token: gr.OAuthToken | None, session_state: gr.State, files
     for file in [file.name for file in files]:
         try:
             source_path = pathlib.Path(file)
-            upload_directory_uuid = pathlib.Path(f"/app/{session_state.value}/uploaded_files")
+            upload_directory_uuid = pathlib.Path(f"/home/user/app/{session_state.value}/uploaded_files")
             # Ensure the upload directory exists
             upload_directory_uuid.mkdir(parents=True, exist_ok=True)
             destination_path = upload_directory_uuid / source_path.name

@@ -56,11 +71,8 @@ def save_files(oauth_token: gr.OAuthToken | None, session_state: gr.State, files
         except Exception as e:
             print(f"Error moving file {file}: {e}")
 
-    return (
-        …
-        if saved_paths
-        else "No files were saved"
-    )
+    return f"Files saved to: {', '.join(saved_paths)}" if saved_paths else "No files were saved"
+
 
 def update_dataset(stages: list, hf_org: str, hf_prefix: str, oauth_token: gr.OAuthToken):
     """

@@ -68,31 +80,57 @@ def update_dataset(stages: list, hf_org: str, hf_prefix: str, oauth_token: gr.OA
     """
     ingestion_df = pd.DataFrame()
     summarization_df = pd.DataFrame()
-    …
+    single_shot_df = pd.DataFrame()
+    multi_hop_df = pd.DataFrame()
+    lighteval_df = pd.DataFrame()
 
     # Construct dataset name from config
     dataset_name = f"{hf_org}/{hf_prefix}"
 
-    if "…
-    …
+    if STAGE_DISPLAY_MAP["upload_ingest_to_hub"] in stages:
+        ingestion_ds = load_dataset(
+            dataset_name, name="ingested", split="train", streaming=True, token=oauth_token.token
+        ).select_columns("document_text")
+        ingestion_df = pd.DataFrame(ingestion_ds.take(1))
+
+    if STAGE_DISPLAY_MAP["summarization"] in stages:
+        summarization_ds = load_dataset(
+            dataset_name, name="summarized", split="train", streaming=True, token=oauth_token.token
+        ).select_columns(["raw_document_summary", "document_summary", "summarization_model"])
+        summarization_df = pd.DataFrame(summarization_ds.take(5))
+
+    if STAGE_DISPLAY_MAP["single_shot_question_generation"] in stages:
+        single_shot_ds = load_dataset(
+            dataset_name,
+            name="single_shot_questions",
+            split="train",
+            streaming=True,
+            token=oauth_token.token,
+        ).select_columns(["question", "self_answer", "estimated_difficulty"])
+        single_shot_df = pd.DataFrame(single_shot_ds.take(5))
+
+    if STAGE_DISPLAY_MAP["multi_hop_question_generation"] in stages:
+        multi_hop_ds = load_dataset(
+            dataset_name,
+            name="multi_hop_questions",
+            split="train",
+            streaming=True,
+            token=oauth_token.token,
+        ).select_columns(["question", "self_answer", "estimated_difficulty"])
+        multi_hop_df = pd.DataFrame(multi_hop_ds.take(5))
+
+    if STAGE_DISPLAY_MAP["lighteval"] in stages:
+        lighteval_ds = load_dataset(
+            dataset_name, name="lighteval", split="train", streaming=True, token=oauth_token.token
+        ).select_columns(["question", "ground_truth_answer", "question_category", "kind"])
+        lighteval_df = pd.DataFrame(lighteval_ds.take(5))
+
+    return (ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df)
 
 
 class SubprocessManagerGroup:
     """Instanciates one manager per user (should be used as a singleton class)"""
+
     def __init__(self):
         self.managers: dict[str, SubprocessManager] = {}
 

@@ -115,8 +153,15 @@ class SubprocessManagerGroup:
         uid = SubprocessManagerGroup.grab_uuid(uid)
         if manager := self.managers.get(uid):
             manager.stop_process()
+            manager.clean_workdir()
+
         del self.managers[uid]
 
+    def clean_workdir(self, uid: Union[str, gr.State]):
+        uid = SubprocessManagerGroup.grab_uuid(uid)
+        if manager := self.managers.get(uid):
+            manager.clean_workdir()
+
     def start_process(self, uid: Union[str, gr.State], custom_env: dict | None):
         uid = SubprocessManagerGroup.grab_uuid(uid)
         self.managers[uid].start_process(custom_env=custom_env)

@@ -141,13 +186,14 @@ class SubprocessManagerGroup:
             return manager.is_running()
         return False
 
+
 class SubprocessManager:
     def __init__(self, session_uid: str):
         self.session_uid = session_uid
-        self.path = pathlib.Path(f"/app/{session_uid}")
+        self.path = pathlib.Path(f"/home/user/app/{session_uid}")
         self.path.mkdir(parents=True, exist_ok=True)
         self.config_path = pathlib.Path(f"{self.path}/config.yml")
-        self.command = ["uv", "run", "yourbench", …]
+        self.command = ["uv", "run", "yourbench", "run", "--config", str(self.config_path)]
         self.process = None
         self.output_stream = io.StringIO()
         self.exit_code = None

@@ -160,7 +206,7 @@ class SubprocessManager:
 
         self.output_stream = io.StringIO()
         self.exit_code = None
-
+
         try:
             logger.info(f"Starting process with command: {' '.join(self.command)}")
             self.process = subprocess.Popen(

@@ -195,9 +241,12 @@ class SubprocessManager:
             pass
 
         current_output = self.output_stream.getvalue()
-        completed_stages = list(set(re.findall(r"…
+        completed_stages = list(set(re.findall(r"Completed stage: '([^']*)'", current_output)))
+
+        return current_output, map_stage_names(completed_stages)
 
-        …
+    def clean_workdir(self):
+        shutil.rmtree(self.path, ignore_errors=True)
 
     def stop_process(self):
         """Terminate the subprocess."""

@@ -207,7 +256,7 @@ class SubprocessManager:
         logger.info("Sending SIGTERM to the Process")
         try:
             self.process.terminate()
-            self.exit_code = …
+            self.exit_code = self.process.wait(timeout=5)  # Wait up to 5 seconds for process to terminate
             logger.info(f"Process terminated by user with exit code {self.exit_code}")
         except subprocess.TimeoutExpired:
             logger.warning("Process did not terminate within timeout, sending SIGKILL")

@@ -221,7 +270,7 @@ class SubprocessManager:
         logger.info("Sending SIGKILL to the Process")
         try:
             self.process.kill()
-            self.exit_code = self.process.wait(timeout=5)
+            self.exit_code = self.process.wait(timeout=5)  # Wait up to 5 seconds for process to be killed
             logger.info(f"Process killed by user with exit code {self.exit_code}")
         except subprocess.TimeoutExpired:
             logger.error("Process could not be killed within timeout")

@@ -237,11 +286,11 @@ class SubprocessManager:
         """Return exit code and reason if process has terminated"""
         if self.process is None:
             return None, "Process was never started"
+
         if self.is_running():
             return None, "Process is still running"
-
-        if …
+
+        if self.exit_code is not None and self.exit_code != 0:
             return self.exit_code, "Process exited abnormaly"
 
         return self.exit_code, "Process exited normaly"

@@ -250,3 +299,5 @@ class SubprocessManager:
         """Stop the process when object is deleted"""
         if self.process:
             self.process.kill()
+
+        self.clean_workdir()