Clémentine committed on
Commit
0b4b222
·
1 Parent(s): 6e44082

run full sets

Browse files
Files changed (4) hide show
  1. app.py +1 -2
  2. globals.py +1 -1
  3. utils/io.py +20 -3
  4. utils/jobs.py +151 -61
app.py CHANGED
@@ -164,8 +164,7 @@ job = run_job(
164
  command=[
165
  "lighteval", "endpoint", "inference-providers",
166
  "model_name=MODEL,provider=PROVIDER",
167
- "extended|ifeval|0,lighteval|gsm_plus|0,lighteval|gpqa:diamond|0",
168
- "--max-samples", "10",
169
  "--push-to-hub", "--save-details",
170
  "--results-org", "YOURORG"
171
  ],
 
164
  command=[
165
  "lighteval", "endpoint", "inference-providers",
166
  "model_name=MODEL,provider=PROVIDER",
167
+ "extended|ifeval|0,lighteval|gpqa:diamond|0",
 
168
  "--push-to-hub", "--save-details",
169
  "--results-org", "YOURORG"
170
  ],
globals.py CHANGED
@@ -15,7 +15,7 @@ NUM_MODELS_RUN: int = 100
15
  NUM_RUNS_PER_JOB: int = 4 # Number of times to run each job for variance reduction
16
  RESULTS_DATASET_NAME: str = "IPTesting/inference-provider-test-results"
17
  LOCAL_CONFIG_FILE: str = "/home/user/app/model_providers.txt"
18
- TASKS: str = "extended|ifeval|0,lighteval|gsm_plus|0,lighteval|gpqa:diamond|0"
19
  NAMESPACE: str = "huggingface"
20
 
21
 
 
15
  NUM_RUNS_PER_JOB: int = 4 # Number of times to run each job for variance reduction
16
  RESULTS_DATASET_NAME: str = "IPTesting/inference-provider-test-results"
17
  LOCAL_CONFIG_FILE: str = "/home/user/app/model_providers.txt"
18
+ TASKS: str = "extended|ifeval|0,lighteval|gpqa:diamond|0"
19
  NAMESPACE: str = "huggingface"
20
 
21
 
utils/io.py CHANGED
@@ -120,7 +120,9 @@ def load_results() -> None:
120
  "job_id": row["job_id"],
121
  "start_time": row.get("start_time"),
122
  "duration": row.get("duration"),
123
- "completed_at": row.get("completed_at")
 
 
124
  }
125
 
126
  print(f"Loaded {len(globals.job_results)} results from dataset")
@@ -155,18 +157,31 @@ def get_summary_stats():
155
  def get_results_table():
156
  """Return job results as a styled pandas DataFrame for Gradio DataFrame."""
157
  if not globals.job_results:
158
- return pd.DataFrame(columns=["Model", "Provider", "Last Run", "Status", "Current Score", "Previous Score", "Duration", "Completed At", "Latest Job Id"])
159
 
160
  table_data = []
161
  for key, info in globals.job_results.items():
 
162
  current_score = info.get("current_score", "N/A")
163
  if current_score is not None and isinstance(current_score, (int, float)):
164
  current_score = f"{current_score:.4f}"
165
 
 
 
 
 
 
 
166
  previous_score = info.get("previous_score", "N/A")
167
  if previous_score is not None and isinstance(previous_score, (int, float)):
168
  previous_score = f"{previous_score:.4f}"
169
 
 
 
 
 
 
 
170
  # Format duration
171
  duration = info.get("duration")
172
  if duration is not None and isinstance(duration, (int, float)):
@@ -196,9 +211,11 @@ def get_results_table():
196
  table_data.append([
197
  model,
198
  provider,
 
199
  info["last_run"],
200
  info["status"],
201
  current_score,
 
202
  previous_score,
203
  duration_str,
204
  completed_at,
@@ -206,7 +223,7 @@ def get_results_table():
206
  relaunch_link
207
  ])
208
 
209
- df = pd.DataFrame(table_data, columns=["Model", "Provider", "Last Run", "Status", "Current Score", "Previous Score", "Duration", "Completed At", "Job Id and Logs", "Actions"])
210
 
211
  # Apply styling to the Status column
212
  styled_df = df.style.map(style_status, subset=['Status'])
 
120
  "job_id": row["job_id"],
121
  "start_time": row.get("start_time"),
122
  "duration": row.get("duration"),
123
+ "completed_at": row.get("completed_at"),
124
+ "runs": row.get("runs", []),
125
+ "score_variance": row.get("score_variance")
126
  }
127
 
128
  print(f"Loaded {len(globals.job_results)} results from dataset")
 
157
  def get_results_table():
158
  """Return job results as a styled pandas DataFrame for Gradio DataFrame."""
159
  if not globals.job_results:
160
+ return pd.DataFrame(columns=["Model", "Provider", "Runs", "Last Run", "Status", "Mean Score", "Variance", "Previous Score", "Duration", "Completed At", "Latest Job Id"])
161
 
162
  table_data = []
163
  for key, info in globals.job_results.items():
164
+ # Format mean score
165
  current_score = info.get("current_score", "N/A")
166
  if current_score is not None and isinstance(current_score, (int, float)):
167
  current_score = f"{current_score:.4f}"
168
 
169
+ # Format variance
170
+ variance = info.get("score_variance", "N/A")
171
+ if variance is not None and isinstance(variance, (int, float)):
172
+ variance = f"{variance:.6f}"
173
+
174
+ # Format previous score
175
  previous_score = info.get("previous_score", "N/A")
176
  if previous_score is not None and isinstance(previous_score, (int, float)):
177
  previous_score = f"{previous_score:.4f}"
178
 
179
+ # Count runs
180
+ runs = info.get("runs", [])
181
+ completed_runs = sum(1 for run in runs if run.get("status") == "COMPLETED")
182
+ total_runs = len(runs)
183
+ runs_str = f"{completed_runs}/{total_runs}" if runs else "0/0"
184
+
185
  # Format duration
186
  duration = info.get("duration")
187
  if duration is not None and isinstance(duration, (int, float)):
 
211
  table_data.append([
212
  model,
213
  provider,
214
+ runs_str,
215
  info["last_run"],
216
  info["status"],
217
  current_score,
218
+ variance,
219
  previous_score,
220
  duration_str,
221
  completed_at,
 
223
  relaunch_link
224
  ])
225
 
226
+ df = pd.DataFrame(table_data, columns=["Model", "Provider", "Runs", "Last Run", "Status", "Mean Score", "Variance", "Previous Score", "Duration", "Completed At", "Job Id and Logs", "Actions"])
227
 
228
  # Apply styling to the Status column
229
  styled_df = df.style.map(style_status, subset=['Status'])
utils/jobs.py CHANGED
@@ -57,22 +57,29 @@ def extract_score_from_job(job_id: str) -> Optional[float]:
57
  return None
58
 
59
 
60
- def run_single_job(model: str, provider: str, tasks: str = globals.TASKS) -> Optional[str]:
61
- """Run a single job for a model-provider combination."""
 
 
 
 
 
 
 
62
 
63
  if not model or not provider:
64
  print("Missing model or provider")
65
  return -1
66
 
67
- # Check if job is already running
68
  key = globals.get_model_provider_key(model, provider)
69
  if key in globals.job_results:
70
  current_status = globals.job_results[key].get("status")
71
  if current_status == "RUNNING":
72
- print( f"Job for {model} on {provider} is already running. Please wait for it to complete.")
73
  return -1
74
 
75
- print(f"Starting job for model={model}, provider={provider}")
76
 
77
  job = run_job(
78
  image="hf.co/spaces/OpenEvals/EvalsOnTheHub",
@@ -82,7 +89,6 @@ def run_single_job(model: str, provider: str, tasks: str = globals.TASKS) -> Opt
82
  tasks,
83
  "--push-to-hub", "--save-details",
84
  "--results-org", "IPTesting",
85
- "--max-samples", "10"
86
  ],
87
  namespace=globals.NAMESPACE,
88
  secrets={"HF_TOKEN": os.getenv("HF_TOKEN")},
@@ -90,35 +96,69 @@ def run_single_job(model: str, provider: str, tasks: str = globals.TASKS) -> Opt
90
  )
91
 
92
  job_id = job.id
93
- key = globals.get_model_provider_key(model, provider)
94
 
95
  with globals.results_lock:
96
- # Move current score to previous score if it exists (relaunching)
97
- previous_score = None
98
- if key in globals.job_results and globals.job_results[key].get("current_score", None) is not None:
99
- previous_score = globals.job_results[key]["current_score"]
100
-
101
- start_time = datetime.now()
102
- globals.job_results[key] = {
103
- "model": model,
104
- "provider": provider,
105
- "last_run": start_time.strftime("%Y-%m-%d %H:%M:%S"),
106
- "status": "RUNNING",
107
- "current_score": None,
108
- "previous_score": previous_score,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  "job_id": job_id,
 
 
110
  "start_time": start_time.isoformat(),
111
  "duration": None,
112
  "completed_at": None
113
- }
114
 
115
  # Don't save immediately - let the periodic save handle it
116
- print(f"Job launched: ID={job_id}, model={model}, provider={provider}")
117
  return job_id
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  # Todo: factorize both following functions
120
  def launch_jobs(tasks: str = globals.TASKS, config_file: str = globals.LOCAL_CONFIG_FILE):
121
- """Launch jobs for all models and providers."""
122
  models_providers = load_models_providers(config_file)
123
 
124
  if not models_providers:
@@ -126,17 +166,19 @@ def launch_jobs(tasks: str = globals.TASKS, config_file: str = globals.LOCAL_CON
126
  return "No valid model-provider combinations found"
127
 
128
  print(f"Found {len(models_providers)} model-provider combinations")
 
129
 
130
  launched_count = 0
131
  for model, provider in models_providers:
132
- job_id = run_single_job(model, provider, tasks)
133
- if job_id != -1:
134
- launched_count += 1
135
 
136
  # Save all results once after launching all jobs
137
  save_results()
138
- print(f"Launched {launched_count}/{len(models_providers)} jobs successfully")
139
- return f"Launched {launched_count} jobs"
 
140
 
141
  def relaunch_failed_jobs():
142
  """Relaunch only failed model-provider combinations from job results."""
@@ -164,50 +206,98 @@ def relaunch_failed_jobs():
164
 
165
 
166
  def update_job_statuses() -> None:
167
- """Check and update the status of active jobs."""
168
  try:
169
  keys = list(globals.job_results.keys())
170
 
171
  for key in keys:
172
  try:
173
- job_id = globals.job_results[key]["job_id"]
174
-
175
- job_info = inspect_job(job_id=job_id, namespace=globals.NAMESPACE)
176
- new_status = job_info.status.stage
177
-
178
  with globals.results_lock:
179
- old_status = globals.job_results[key]["status"]
180
-
181
- if old_status != new_status:
182
- globals.job_results[key]["status"] = new_status
183
- print(f"Job {job_id} status changed: {old_status} -> {new_status}")
184
-
185
- # If job completed, try to extract score and calculate duration
186
- if new_status == "COMPLETED":
187
- completed_time = datetime.now()
188
- globals.job_results[key]["completed_at"] = completed_time.strftime("%Y-%m-%d %H:%M:%S")
189
-
190
- # Calculate duration if we have start_time
191
- start_time_str = globals.job_results[key].get("start_time")
192
- if start_time_str:
193
- start_time = datetime.fromisoformat(start_time_str)
194
- duration_seconds = (completed_time - start_time).total_seconds()
195
- globals.job_results[key]["duration"] = duration_seconds
196
-
197
- score = extract_score_from_job(job_id)
198
- if score is not None:
199
- globals.job_results[key]["current_score"] = score
200
-
201
- if new_status == "COMPLETED" and globals.job_results[key]["current_score"] is None:
202
- score = extract_score_from_job(job_id)
203
- if score is not None:
204
- globals.job_results[key]["current_score"] = score
205
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  except Exception as e:
208
  print(f"Error checking job: {str(e)}")
 
 
209
 
210
  save_results()
211
 
212
  except Exception as e:
213
  print(f"Error in update_job_statuses: {str(e)}")
 
 
 
57
  return None
58
 
59
 
60
+ def run_single_job(model: str, provider: str, tasks: str = globals.TASKS, run_number: int = 1) -> Optional[str]:
61
+ """Run a single job for a model-provider combination.
62
+
63
+ Args:
64
+ model: Model ID
65
+ provider: Provider name
66
+ tasks: Tasks to run
67
+ run_number: Which run this is (1-4 for multiple runs)
68
+ """
69
 
70
  if not model or not provider:
71
  print("Missing model or provider")
72
  return -1
73
 
74
+ # Check if any run is already running for this model-provider
75
  key = globals.get_model_provider_key(model, provider)
76
  if key in globals.job_results:
77
  current_status = globals.job_results[key].get("status")
78
  if current_status == "RUNNING":
79
+ print(f"Job for {model} on {provider} is already running. Please wait for it to complete.")
80
  return -1
81
 
82
+ print(f"Starting job for model={model}, provider={provider}, run {run_number}/{globals.NUM_RUNS_PER_JOB}")
83
 
84
  job = run_job(
85
  image="hf.co/spaces/OpenEvals/EvalsOnTheHub",
 
89
  tasks,
90
  "--push-to-hub", "--save-details",
91
  "--results-org", "IPTesting",
 
92
  ],
93
  namespace=globals.NAMESPACE,
94
  secrets={"HF_TOKEN": os.getenv("HF_TOKEN")},
 
96
  )
97
 
98
  job_id = job.id
99
+ start_time = datetime.now()
100
 
101
  with globals.results_lock:
102
+ # Initialize or update the job result
103
+ if key not in globals.job_results:
104
+ # First run - initialize the structure
105
+ previous_score = None
106
+ globals.job_results[key] = {
107
+ "model": model,
108
+ "provider": provider,
109
+ "last_run": start_time.strftime("%Y-%m-%d %H:%M:%S"),
110
+ "status": "RUNNING",
111
+ "current_score": None,
112
+ "previous_score": None,
113
+ "job_id": job_id,
114
+ "start_time": start_time.isoformat(),
115
+ "duration": None,
116
+ "completed_at": None,
117
+ "runs": []
118
+ }
119
+ else:
120
+ # Subsequent run or relaunch
121
+ previous_score = globals.job_results[key].get("current_score")
122
+ globals.job_results[key]["status"] = "RUNNING"
123
+ globals.job_results[key]["last_run"] = start_time.strftime("%Y-%m-%d %H:%M:%S")
124
+ globals.job_results[key]["start_time"] = start_time.isoformat()
125
+ globals.job_results[key]["previous_score"] = previous_score
126
+
127
+ # Add this run to the runs list
128
+ globals.job_results[key]["runs"].append({
129
+ "run_number": run_number,
130
  "job_id": job_id,
131
+ "status": "RUNNING",
132
+ "score": None,
133
  "start_time": start_time.isoformat(),
134
  "duration": None,
135
  "completed_at": None
136
+ })
137
 
138
  # Don't save immediately - let the periodic save handle it
139
+ print(f"Job launched: ID={job_id}, model={model}, provider={provider}, run {run_number}")
140
  return job_id
141
 
142
+
143
+ def run_multiple_jobs(model: str, provider: str, tasks: str = globals.TASKS, num_runs: int = globals.NUM_RUNS_PER_JOB) -> list:
144
+ """Run multiple jobs for a model-provider combination to reduce variance.
145
+
146
+ Returns:
147
+ List of job IDs launched
148
+ """
149
+ job_ids = []
150
+ for run_number in range(1, num_runs + 1):
151
+ job_id = run_single_job(model, provider, tasks, run_number=run_number)
152
+ if job_id != -1:
153
+ job_ids.append(job_id)
154
+ # Small delay between launches
155
+ time.sleep(2)
156
+
157
+ return job_ids
158
+
159
  # Todo: factorize both following functions
160
  def launch_jobs(tasks: str = globals.TASKS, config_file: str = globals.LOCAL_CONFIG_FILE):
161
+ """Launch jobs for all models and providers with multiple runs per combination."""
162
  models_providers = load_models_providers(config_file)
163
 
164
  if not models_providers:
 
166
  return "No valid model-provider combinations found"
167
 
168
  print(f"Found {len(models_providers)} model-provider combinations")
169
+ print(f"Will launch {globals.NUM_RUNS_PER_JOB} runs per combination")
170
 
171
  launched_count = 0
172
  for model, provider in models_providers:
173
+ job_ids = run_multiple_jobs(model, provider, tasks)
174
+ if job_ids:
175
+ launched_count += len(job_ids)
176
 
177
  # Save all results once after launching all jobs
178
  save_results()
179
+ total_expected = len(models_providers) * globals.NUM_RUNS_PER_JOB
180
+ print(f"Launched {launched_count}/{total_expected} jobs successfully")
181
+ return f"Launched {launched_count}/{total_expected} jobs ({len(models_providers)} model-provider combinations × {globals.NUM_RUNS_PER_JOB} runs each)"
182
 
183
  def relaunch_failed_jobs():
184
  """Relaunch only failed model-provider combinations from job results."""
 
206
 
207
 
208
  def update_job_statuses() -> None:
209
+ """Check and update the status of active jobs and aggregate scores from multiple runs."""
210
  try:
211
  keys = list(globals.job_results.keys())
212
 
213
  for key in keys:
214
  try:
 
 
 
 
 
215
  with globals.results_lock:
216
+ runs = globals.job_results[key].get("runs", [])
217
+
218
+ if not runs:
219
+ # Legacy format - no runs list
220
+ continue
221
+
222
+ # Check status of each run
223
+ all_completed = True
224
+ all_failed = True
225
+ any_running = False
226
+
227
+ for run in runs:
228
+ if run["status"] == "RUNNING":
229
+ # Check if this run's job is still running
230
+ try:
231
+ job_info = inspect_job(job_id=run["job_id"], namespace=globals.NAMESPACE)
232
+ new_status = job_info.status.stage
233
+
234
+ if run["status"] != new_status:
235
+ run["status"] = new_status
236
+ print(f"Run {run['run_number']} job {run['job_id']} status changed: {run['status']} -> {new_status}")
237
+
238
+ if new_status == "COMPLETED":
239
+ completed_time = datetime.now()
240
+ run["completed_at"] = completed_time.strftime("%Y-%m-%d %H:%M:%S")
241
+
242
+ # Calculate duration
243
+ if run.get("start_time"):
244
+ start_time = datetime.fromisoformat(run["start_time"])
245
+ run["duration"] = (completed_time - start_time).total_seconds()
246
+
247
+ # Extract score
248
+ score = extract_score_from_job(run["job_id"])
249
+ if score is not None:
250
+ run["score"] = score
251
+ print(f"Run {run['run_number']}: extracted score {score:.4f}")
252
+
253
+ except Exception as e:
254
+ print(f"Error checking run {run['run_number']}: {e}")
255
+
256
+ # Update aggregate status flags
257
+ if run["status"] == "RUNNING":
258
+ any_running = True
259
+ all_completed = False
260
+ all_failed = False
261
+ elif run["status"] == "COMPLETED":
262
+ all_failed = False
263
+ elif run["status"] in ["ERROR", "FAILED"]:
264
+ all_completed = False
265
+
266
+ # Update overall status
267
+ if any_running:
268
+ globals.job_results[key]["status"] = "RUNNING"
269
+ elif all_completed:
270
+ globals.job_results[key]["status"] = "COMPLETED"
271
+
272
+ # Calculate aggregate statistics from completed runs
273
+ completed_scores = [run["score"] for run in runs if run["status"] == "COMPLETED" and run["score"] is not None]
274
+
275
+ if completed_scores:
276
+ import statistics
277
+ mean_score = statistics.mean(completed_scores)
278
+ variance = statistics.variance(completed_scores) if len(completed_scores) > 1 else 0.0
279
+
280
+ globals.job_results[key]["current_score"] = mean_score
281
+ globals.job_results[key]["score_variance"] = variance
282
+
283
+ print(f"Aggregated {len(completed_scores)} runs: mean={mean_score:.4f}, variance={variance:.6f}")
284
+
285
+ # Update completion time to latest run
286
+ latest_completion = max([run["completed_at"] for run in runs if run.get("completed_at")], default=None)
287
+ if latest_completion:
288
+ globals.job_results[key]["completed_at"] = latest_completion
289
+
290
+ elif all_failed:
291
+ globals.job_results[key]["status"] = "ERROR"
292
 
293
  except Exception as e:
294
  print(f"Error checking job: {str(e)}")
295
+ import traceback
296
+ traceback.print_exc()
297
 
298
  save_results()
299
 
300
  except Exception as e:
301
  print(f"Error in update_job_statuses: {str(e)}")
302
+ import traceback
303
+ traceback.print_exc()