Spaces: Running on CPU Upgrade
| import os | |
| import subprocess | |
| import asyncio | |
| from pathlib import Path | |
| from yourbench_space.leaderboard_space.env import INIT_MODELS | |
| from loguru import logger | |
| ON_SPACES = os.environ.get("system") == "spaces" | |
| OUTPUT_DIR = "/data" if ON_SPACES else "." # TODO: fix the space folder | |
| def create_eval_file(eval_ds_name: str): | |
| task_name = eval_ds_name.replace("/", "_") | |
| template_path = Path("/home/user/app/yourbench_space/lighteval_task/yourbench_task.py") | |
| subprocess.run(["lighteval", "tasks", "create", str(template_path), task_name, eval_ds_name]) | |
| async def run_process(args: list) -> dict: | |
| process = await asyncio.create_subprocess_exec( | |
| *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE | |
| ) | |
| await asyncio.wait_for(process.wait(), timeout=180) | |
| stdout = await process.stdout.read() | |
| stderr = await process.stderr.read() | |
| return {"pid": process.pid, "stdout": stdout.decode(), "stderr": stderr.decode()} | |
| async def run_evaluations(eval_ds_name: str, org: str) -> list: | |
| task_name = eval_ds_name.replace("/", "_") | |
| tasks = [] | |
| for model_name, provider in INIT_MODELS: | |
| args = [ | |
| "lighteval", | |
| "endpoint", | |
| "inference-providers", | |
| f"model={model_name},provider={provider}", | |
| f"custom|{task_name}|0|0", | |
| "--custom-tasks", | |
| f"custom_{task_name}_task.py", | |
| "--max-samples", | |
| "30", | |
| "--output-dir", | |
| f"{OUTPUT_DIR}", | |
| "--save-details", | |
| "--results-org", | |
| org, | |
| "--push-to-hub", | |
| ] | |
| tasks.append(run_process(args)) | |
| # Will capture the task if failed | |
| processes = await asyncio.gather(*tasks, return_exceptions=True) | |
| for process in processes: | |
| logger.info("Logs for process:") | |
| logger.info(process["stdout"]) | |
| logger.info(process["stderr"]) | |
| if all(not isinstance(result, Exception) for result in processes): | |
| return "✅" | |
| return "At least one model failed" | |