Clémentine committed on
Commit
0b4b222
·
1 Parent(s): 6e44082

run full sets

Browse files
Files changed (4) hide show
  1. app.py +1 -2
  2. globals.py +1 -1
  3. utils/io.py +20 -3
  4. utils/jobs.py +151 -61
app.py CHANGED
@@ -164,8 +164,7 @@ job = run_job(
164
  command=[
165
  "lighteval", "endpoint", "inference-providers",
166
  "model_name=MODEL,provider=PROVIDER",
167
- "extended|ifeval|0,lighteval|gsm_plus|0,lighteval|gpqa:diamond|0",
168
- "--max-samples", "10",
169
  "--push-to-hub", "--save-details",
170
  "--results-org", "YOURORG"
171
  ],
 
164
  command=[
165
  "lighteval", "endpoint", "inference-providers",
166
  "model_name=MODEL,provider=PROVIDER",
167
+ "extended|ifeval|0,lighteval|gpqa:diamond|0",
 
168
  "--push-to-hub", "--save-details",
169
  "--results-org", "YOURORG"
170
  ],
globals.py CHANGED
@@ -15,7 +15,7 @@ NUM_MODELS_RUN: int = 100
15
  NUM_RUNS_PER_JOB: int = 4 # Number of times to run each job for variance reduction
16
  RESULTS_DATASET_NAME: str = "IPTesting/inference-provider-test-results"
17
  LOCAL_CONFIG_FILE: str = "/home/user/app/model_providers.txt"
18
- TASKS: str = "extended|ifeval|0,lighteval|gsm_plus|0,lighteval|gpqa:diamond|0"
19
  NAMESPACE: str = "huggingface"
20
 
21
 
 
15
  NUM_RUNS_PER_JOB: int = 4 # Number of times to run each job for variance reduction
16
  RESULTS_DATASET_NAME: str = "IPTesting/inference-provider-test-results"
17
  LOCAL_CONFIG_FILE: str = "/home/user/app/model_providers.txt"
18
+ TASKS: str = "extended|ifeval|0,lighteval|gpqa:diamond|0"
19
  NAMESPACE: str = "huggingface"
20
 
21
 
utils/io.py CHANGED
@@ -120,7 +120,9 @@ def load_results() -> None:
120
  "job_id": row["job_id"],
121
  "start_time": row.get("start_time"),
122
  "duration": row.get("duration"),
123
- "completed_at": row.get("completed_at")
 
 
124
  }
125
 
126
  print(f"Loaded {len(globals.job_results)} results from dataset")
@@ -155,18 +157,31 @@ def get_summary_stats():
155
  def get_results_table():
156
  """Return job results as a styled pandas DataFrame for Gradio DataFrame."""
157
  if not globals.job_results:
158
- return pd.DataFrame(columns=["Model", "Provider", "Last Run", "Status", "Current Score", "Previous Score", "Duration", "Completed At", "Latest Job Id"])
159
 
160
  table_data = []
161
  for key, info in globals.job_results.items():
 
162
  current_score = info.get("current_score", "N/A")
163
  if current_score is not None and isinstance(current_score, (int, float)):
164
  current_score = f"{current_score:.4f}"
165
 
 
 
 
 
 
 
166
  previous_score = info.get("previous_score", "N/A")
167
  if previous_score is not None and isinstance(previous_score, (int, float)):
168
  previous_score = f"{previous_score:.4f}"
169
 
 
 
 
 
 
 
170
  # Format duration
171
  duration = info.get("duration")
172
  if duration is not None and isinstance(duration, (int, float)):
@@ -196,9 +211,11 @@ def get_results_table():
196
  table_data.append([
197
  model,
198
  provider,
 
199
  info["last_run"],
200
  info["status"],
201
  current_score,
 
202
  previous_score,
203
  duration_str,
204
  completed_at,
@@ -206,7 +223,7 @@ def get_results_table():
206
  relaunch_link
207
  ])
208
 
209
- df = pd.DataFrame(table_data, columns=["Model", "Provider", "Last Run", "Status", "Current Score", "Previous Score", "Duration", "Completed At", "Job Id and Logs", "Actions"])
210
 
211
  # Apply styling to the Status column
212
  styled_df = df.style.map(style_status, subset=['Status'])
 
120
  "job_id": row["job_id"],
121
  "start_time": row.get("start_time"),
122
  "duration": row.get("duration"),
123
+ "completed_at": row.get("completed_at"),
124
+ "runs": row.get("runs", []),
125
+ "score_variance": row.get("score_variance")
126
  }
127
 
128
  print(f"Loaded {len(globals.job_results)} results from dataset")
 
157
  def get_results_table():
158
  """Return job results as a styled pandas DataFrame for Gradio DataFrame."""
159
  if not globals.job_results:
160
+ return pd.DataFrame(columns=["Model", "Provider", "Runs", "Last Run", "Status", "Mean Score", "Variance", "Previous Score", "Duration", "Completed At", "Latest Job Id"])
161
 
162
  table_data = []
163
  for key, info in globals.job_results.items():
164
+ # Format mean score
165
  current_score = info.get("current_score", "N/A")
166
  if current_score is not None and isinstance(current_score, (int, float)):
167
  current_score = f"{current_score:.4f}"
168
 
169
+ # Format variance
170
+ variance = info.get("score_variance", "N/A")
171
+ if variance is not None and isinstance(variance, (int, float)):
172
+ variance = f"{variance:.6f}"
173
+
174
+ # Format previous score
175
  previous_score = info.get("previous_score", "N/A")
176
  if previous_score is not None and isinstance(previous_score, (int, float)):
177
  previous_score = f"{previous_score:.4f}"
178
 
179
+ # Count runs
180
+ runs = info.get("runs", [])
181
+ completed_runs = sum(1 for run in runs if run.get("status") == "COMPLETED")
182
+ total_runs = len(runs)
183
+ runs_str = f"{completed_runs}/{total_runs}" if runs else "0/0"
184
+
185
  # Format duration
186
  duration = info.get("duration")
187
  if duration is not None and isinstance(duration, (int, float)):
 
211
  table_data.append([
212
  model,
213
  provider,
214
+ runs_str,
215
  info["last_run"],
216
  info["status"],
217
  current_score,
218
+ variance,
219
  previous_score,
220
  duration_str,
221
  completed_at,
 
223
  relaunch_link
224
  ])
225
 
226
+ df = pd.DataFrame(table_data, columns=["Model", "Provider", "Runs", "Last Run", "Status", "Mean Score", "Variance", "Previous Score", "Duration", "Completed At", "Job Id and Logs", "Actions"])
227
 
228
  # Apply styling to the Status column
229
  styled_df = df.style.map(style_status, subset=['Status'])
utils/jobs.py CHANGED
@@ -57,22 +57,29 @@ def extract_score_from_job(job_id: str) -> Optional[float]:
57
  return None
58
 
59
 
60
- def run_single_job(model: str, provider: str, tasks: str = globals.TASKS) -> Optional[str]:
61
- """Run a single job for a model-provider combination."""
 
 
 
 
 
 
 
62
 
63
  if not model or not provider:
64
  print("Missing model or provider")
65
  return -1
66
 
67
- # Check if job is already running
68
  key = globals.get_model_provider_key(model, provider)
69
  if key in globals.job_results:
70
  current_status = globals.job_results[key].get("status")
71
  if current_status == "RUNNING":
72
- print( f"Job for {model} on {provider} is already running. Please wait for it to complete.")
73
  return -1
74
 
75
- print(f"Starting job for model={model}, provider={provider}")
76
 
77
  job = run_job(
78
  image="hf.co/spaces/OpenEvals/EvalsOnTheHub",
@@ -82,7 +89,6 @@ def run_single_job(model: str, provider: str, tasks: str = globals.TASKS) -> Opt
82
  tasks,
83
  "--push-to-hub", "--save-details",
84
  "--results-org", "IPTesting",
85
- "--max-samples", "10"
86
  ],
87
  namespace=globals.NAMESPACE,
88
  secrets={"HF_TOKEN": os.getenv("HF_TOKEN")},
@@ -90,35 +96,69 @@ def run_single_job(model: str, provider: str, tasks: str = globals.TASKS) -> Opt
90
  )
91
 
92
  job_id = job.id
93
- key = globals.get_model_provider_key(model, provider)
94
 
95
  with globals.results_lock:
96
- # Move current score to previous score if it exists (relaunching)
97
- previous_score = None
98
- if key in globals.job_results and globals.job_results[key].get("current_score", None) is not None:
99
- previous_score = globals.job_results[key]["current_score"]
100
-
101
- start_time = datetime.now()
102
- globals.job_results[key] = {
103
- "model": model,
104
- "provider": provider,
105
- "last_run": start_time.strftime("%Y-%m-%d %H:%M:%S"),
106
- "status": "RUNNING",
107
- "current_score": None,
108
- "previous_score": previous_score,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  "job_id": job_id,
 
 
110
  "start_time": start_time.isoformat(),
111
  "duration": None,
112
  "completed_at": None
113
- }
114
 
115
  # Don't save immediately - let the periodic save handle it
116
- print(f"Job launched: ID={job_id}, model={model}, provider={provider}")
117
  return job_id
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  # Todo: factorize both following functions
120
  def launch_jobs(tasks: str = globals.TASKS, config_file: str = globals.LOCAL_CONFIG_FILE):
121
- """Launch jobs for all models and providers."""
122
  models_providers = load_models_providers(config_file)
123
 
124
  if not models_providers:
@@ -126,17 +166,19 @@ def launch_jobs(tasks: str = globals.TASKS, config_file: str = globals.LOCAL_CON
126
  return "No valid model-provider combinations found"
127
 
128
  print(f"Found {len(models_providers)} model-provider combinations")
 
129
 
130
  launched_count = 0
131
  for model, provider in models_providers:
132
- job_id = run_single_job(model, provider, tasks)
133
- if job_id != -1:
134
- launched_count += 1
135
 
136
  # Save all results once after launching all jobs
137
  save_results()
138
- print(f"Launched {launched_count}/{len(models_providers)} jobs successfully")
139
- return f"Launched {launched_count} jobs"
 
140
 
141
  def relaunch_failed_jobs():
142
  """Relaunch only failed model-provider combinations from job results."""
@@ -164,50 +206,98 @@ def relaunch_failed_jobs():
164
 
165
 
166
  def update_job_statuses() -> None:
167
- """Check and update the status of active jobs."""
168
  try:
169
  keys = list(globals.job_results.keys())
170
 
171
  for key in keys:
172
  try:
173
- job_id = globals.job_results[key]["job_id"]
174
-
175
- job_info = inspect_job(job_id=job_id, namespace=globals.NAMESPACE)
176
- new_status = job_info.status.stage
177
-
178
  with globals.results_lock:
179
- old_status = globals.job_results[key]["status"]
180
-
181
- if old_status != new_status:
182
- globals.job_results[key]["status"] = new_status
183
- print(f"Job {job_id} status changed: {old_status} -> {new_status}")
184
-
185
- # If job completed, try to extract score and calculate duration
186
- if new_status == "COMPLETED":
187
- completed_time = datetime.now()
188
- globals.job_results[key]["completed_at"] = completed_time.strftime("%Y-%m-%d %H:%M:%S")
189
-
190
- # Calculate duration if we have start_time
191
- start_time_str = globals.job_results[key].get("start_time")
192
- if start_time_str:
193
- start_time = datetime.fromisoformat(start_time_str)
194
- duration_seconds = (completed_time - start_time).total_seconds()
195
- globals.job_results[key]["duration"] = duration_seconds
196
-
197
- score = extract_score_from_job(job_id)
198
- if score is not None:
199
- globals.job_results[key]["current_score"] = score
200
-
201
- if new_status == "COMPLETED" and globals.job_results[key]["current_score"] is None:
202
- score = extract_score_from_job(job_id)
203
- if score is not None:
204
- globals.job_results[key]["current_score"] = score
205
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  except Exception as e:
208
  print(f"Error checking job: {str(e)}")
 
 
209
 
210
  save_results()
211
 
212
  except Exception as e:
213
  print(f"Error in update_job_statuses: {str(e)}")
 
 
 
57
  return None
58
 
59
 
60
+ def run_single_job(model: str, provider: str, tasks: str = globals.TASKS, run_number: int = 1) -> Optional[str]:
61
+ """Run a single job for a model-provider combination.
62
+
63
+ Args:
64
+ model: Model ID
65
+ provider: Provider name
66
+ tasks: Tasks to run
67
+ run_number: Which run this is (1-4 for multiple runs)
68
+ """
69
 
70
  if not model or not provider:
71
  print("Missing model or provider")
72
  return -1
73
 
74
+ # Check if any run is already running for this model-provider
75
  key = globals.get_model_provider_key(model, provider)
76
  if key in globals.job_results:
77
  current_status = globals.job_results[key].get("status")
78
  if current_status == "RUNNING":
79
+ print(f"Job for {model} on {provider} is already running. Please wait for it to complete.")
80
  return -1
81
 
82
+ print(f"Starting job for model={model}, provider={provider}, run {run_number}/{globals.NUM_RUNS_PER_JOB}")
83
 
84
  job = run_job(
85
  image="hf.co/spaces/OpenEvals/EvalsOnTheHub",
 
89
  tasks,
90
  "--push-to-hub", "--save-details",
91
  "--results-org", "IPTesting",
 
92
  ],
93
  namespace=globals.NAMESPACE,
94
  secrets={"HF_TOKEN": os.getenv("HF_TOKEN")},
 
96
  )
97
 
98
  job_id = job.id
99
+ start_time = datetime.now()
100
 
101
  with globals.results_lock:
102
+ # Initialize or update the job result
103
+ if key not in globals.job_results:
104
+ # First run - initialize the structure
105
+ previous_score = None
106
+ globals.job_results[key] = {
107
+ "model": model,
108
+ "provider": provider,
109
+ "last_run": start_time.strftime("%Y-%m-%d %H:%M:%S"),
110
+ "status": "RUNNING",
111
+ "current_score": None,
112
+ "previous_score": None,
113
+ "job_id": job_id,
114
+ "start_time": start_time.isoformat(),
115
+ "duration": None,
116
+ "completed_at": None,
117
+ "runs": []
118
+ }
119
+ else:
120
+ # Subsequent run or relaunch
121
+ previous_score = globals.job_results[key].get("current_score")
122
+ globals.job_results[key]["status"] = "RUNNING"
123
+ globals.job_results[key]["last_run"] = start_time.strftime("%Y-%m-%d %H:%M:%S")
124
+ globals.job_results[key]["start_time"] = start_time.isoformat()
125
+ globals.job_results[key]["previous_score"] = previous_score
126
+
127
+ # Add this run to the runs list
128
+ globals.job_results[key]["runs"].append({
129
+ "run_number": run_number,
130
  "job_id": job_id,
131
+ "status": "RUNNING",
132
+ "score": None,
133
  "start_time": start_time.isoformat(),
134
  "duration": None,
135
  "completed_at": None
136
+ })
137
 
138
  # Don't save immediately - let the periodic save handle it
139
+ print(f"Job launched: ID={job_id}, model={model}, provider={provider}, run {run_number}")
140
  return job_id
141
 
142
+
143
+ def run_multiple_jobs(model: str, provider: str, tasks: str = globals.TASKS, num_runs: int = globals.NUM_RUNS_PER_JOB) -> list:
144
+ """Run multiple jobs for a model-provider combination to reduce variance.
145
+
146
+ Returns:
147
+ List of job IDs launched
148
+ """
149
+ job_ids = []
150
+ for run_number in range(1, num_runs + 1):
151
+ job_id = run_single_job(model, provider, tasks, run_number=run_number)
152
+ if job_id != -1:
153
+ job_ids.append(job_id)
154
+ # Small delay between launches
155
+ time.sleep(2)
156
+
157
+ return job_ids
158
+
159
  # Todo: factorize both following functions
160
  def launch_jobs(tasks: str = globals.TASKS, config_file: str = globals.LOCAL_CONFIG_FILE):
161
+ """Launch jobs for all models and providers with multiple runs per combination."""
162
  models_providers = load_models_providers(config_file)
163
 
164
  if not models_providers:
 
166
  return "No valid model-provider combinations found"
167
 
168
  print(f"Found {len(models_providers)} model-provider combinations")
169
+ print(f"Will launch {globals.NUM_RUNS_PER_JOB} runs per combination")
170
 
171
  launched_count = 0
172
  for model, provider in models_providers:
173
+ job_ids = run_multiple_jobs(model, provider, tasks)
174
+ if job_ids:
175
+ launched_count += len(job_ids)
176
 
177
  # Save all results once after launching all jobs
178
  save_results()
179
+ total_expected = len(models_providers) * globals.NUM_RUNS_PER_JOB
180
+ print(f"Launched {launched_count}/{total_expected} jobs successfully")
181
+ return f"Launched {launched_count}/{total_expected} jobs ({len(models_providers)} model-provider combinations × {globals.NUM_RUNS_PER_JOB} runs each)"
182
 
183
  def relaunch_failed_jobs():
184
  """Relaunch only failed model-provider combinations from job results."""
 
206
 
207
 
208
  def update_job_statuses() -> None:
209
+ """Check and update the status of active jobs and aggregate scores from multiple runs."""
210
  try:
211
  keys = list(globals.job_results.keys())
212
 
213
  for key in keys:
214
  try:
 
 
 
 
 
215
  with globals.results_lock:
216
+ runs = globals.job_results[key].get("runs", [])
217
+
218
+ if not runs:
219
+ # Legacy format - no runs list
220
+ continue
221
+
222
+ # Check status of each run
223
+ all_completed = True
224
+ all_failed = True
225
+ any_running = False
226
+
227
+ for run in runs:
228
+ if run["status"] == "RUNNING":
229
+ # Check if this run's job is still running
230
+ try:
231
+ job_info = inspect_job(job_id=run["job_id"], namespace=globals.NAMESPACE)
232
+ new_status = job_info.status.stage
233
+
234
+ if run["status"] != new_status:
235
+ run["status"] = new_status
236
+ print(f"Run {run['run_number']} job {run['job_id']} status changed: {run['status']} -> {new_status}")
237
+
238
+ if new_status == "COMPLETED":
239
+ completed_time = datetime.now()
240
+ run["completed_at"] = completed_time.strftime("%Y-%m-%d %H:%M:%S")
241
+
242
+ # Calculate duration
243
+ if run.get("start_time"):
244
+ start_time = datetime.fromisoformat(run["start_time"])
245
+ run["duration"] = (completed_time - start_time).total_seconds()
246
+
247
+ # Extract score
248
+ score = extract_score_from_job(run["job_id"])
249
+ if score is not None:
250
+ run["score"] = score
251
+ print(f"Run {run['run_number']}: extracted score {score:.4f}")
252
+
253
+ except Exception as e:
254
+ print(f"Error checking run {run['run_number']}: {e}")
255
+
256
+ # Update aggregate status flags
257
+ if run["status"] == "RUNNING":
258
+ any_running = True
259
+ all_completed = False
260
+ all_failed = False
261
+ elif run["status"] == "COMPLETED":
262
+ all_failed = False
263
+ elif run["status"] in ["ERROR", "FAILED"]:
264
+ all_completed = False
265
+
266
+ # Update overall status
267
+ if any_running:
268
+ globals.job_results[key]["status"] = "RUNNING"
269
+ elif all_completed:
270
+ globals.job_results[key]["status"] = "COMPLETED"
271
+
272
+ # Calculate aggregate statistics from completed runs
273
+ completed_scores = [run["score"] for run in runs if run["status"] == "COMPLETED" and run["score"] is not None]
274
+
275
+ if completed_scores:
276
+ import statistics
277
+ mean_score = statistics.mean(completed_scores)
278
+ variance = statistics.variance(completed_scores) if len(completed_scores) > 1 else 0.0
279
+
280
+ globals.job_results[key]["current_score"] = mean_score
281
+ globals.job_results[key]["score_variance"] = variance
282
+
283
+ print(f"Aggregated {len(completed_scores)} runs: mean={mean_score:.4f}, variance={variance:.6f}")
284
+
285
+ # Update completion time to latest run
286
+ latest_completion = max([run["completed_at"] for run in runs if run.get("completed_at")], default=None)
287
+ if latest_completion:
288
+ globals.job_results[key]["completed_at"] = latest_completion
289
+
290
+ elif all_failed:
291
+ globals.job_results[key]["status"] = "ERROR"
292
 
293
  except Exception as e:
294
  print(f"Error checking job: {str(e)}")
295
+ import traceback
296
+ traceback.print_exc()
297
 
298
  save_results()
299
 
300
  except Exception as e:
301
  print(f"Error in update_job_statuses: {str(e)}")
302
+ import traceback
303
+ traceback.print_exc()