Spaces:
Sleeping
Sleeping
Peiran
committed on
Commit
·
64125ec
1
Parent(s):
bf7288d
Per-user scheduling + global balancing: add Annotator ID, per-user dedup, count-based prioritization, CSV adds annotator_id; update UI bindings accordingly
Browse files- __pycache__/app.cpython-311.pyc +0 -0
- app.py +56 -16
__pycache__/app.cpython-311.pyc
CHANGED
|
Binary files a/__pycache__/app.cpython-311.pyc and b/__pycache__/app.cpython-311.pyc differ
|
|
|
app.py
CHANGED
|
@@ -121,11 +121,36 @@ def _build_image_pairs(rows: List[Dict[str, str]], task_name: str) -> List[Dict[
|
|
| 121 |
return pairs
|
| 122 |
|
| 123 |
|
| 124 |
-
def
|
| 125 |
-
"""
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
"""
|
| 128 |
keys = set()
|
|
|
|
|
|
|
| 129 |
csv_path = _persist_csv_path_for_task(task_name)
|
| 130 |
if not os.path.exists(csv_path):
|
| 131 |
return keys
|
|
@@ -133,6 +158,8 @@ def _read_existing_eval_keys(task_name: str) -> set:
|
|
| 133 |
with open(csv_path, newline="", encoding="utf-8") as f:
|
| 134 |
reader = csv.DictReader(f)
|
| 135 |
for r in reader:
|
|
|
|
|
|
|
| 136 |
tid = str(r.get("test_id", "")).strip()
|
| 137 |
m1 = str(r.get("model1_name", "")).strip()
|
| 138 |
m2 = str(r.get("model2_name", "")).strip()
|
|
@@ -166,23 +193,31 @@ def _schedule_round_robin_by_test_id(pairs: List[Dict[str, str]], seed: Optional
|
|
| 166 |
return ordered
|
| 167 |
|
| 168 |
|
| 169 |
-
def load_task(task_name: str):
|
| 170 |
if not task_name:
|
| 171 |
raise gr.Error("Please select a task first.")
|
| 172 |
|
| 173 |
rows = _load_task_rows(task_name)
|
| 174 |
pairs_all = _build_image_pairs(rows, task_name)
|
| 175 |
-
#
|
| 176 |
-
done_keys = _read_existing_eval_keys(task_name)
|
| 177 |
def key_of(p: Dict[str, str]):
|
| 178 |
return (p["test_id"], frozenset({p["model1_name"], p["model2_name"]}), p["org_img"])
|
| 179 |
-
|
| 180 |
-
|
|
|
|
| 181 |
|
| 182 |
-
# Balanced schedule
|
| 183 |
seed_env = os.environ.get("SCHEDULE_SEED")
|
| 184 |
seed = int(seed_env) if seed_env and seed_env.isdigit() else None
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
# Assign A/B order to counteract position bias: alternate after scheduling
|
| 188 |
for idx, p in enumerate(pairs):
|
|
@@ -192,7 +227,7 @@ def load_task(task_name: str):
|
|
| 192 |
try:
|
| 193 |
print("[VisArena] No pending pairs.")
|
| 194 |
print("[VisArena] total_pairs=", len(pairs_all))
|
| 195 |
-
print("[VisArena]
|
| 196 |
print("[VisArena] persist_csv=", _persist_csv_path_for_task(task_name))
|
| 197 |
except Exception:
|
| 198 |
pass
|
|
@@ -234,6 +269,7 @@ def _append_local_persist_csv(task_name: str, row: Dict[str, object]) -> bool:
|
|
| 234 |
csv_exists = os.path.exists(csv_path)
|
| 235 |
fieldnames = [
|
| 236 |
"eval_date",
|
|
|
|
| 237 |
"test_id",
|
| 238 |
"model1_name",
|
| 239 |
"model2_name",
|
|
@@ -301,8 +337,8 @@ def _upload_eval_record_to_dataset(task_name: str, row: Dict[str, object]) -> Tu
|
|
| 301 |
return False, f"Exception: {type(e).__name__}: {e}"
|
| 302 |
|
| 303 |
|
| 304 |
-
def on_task_change(task_name: str, _state_pairs: List[Dict[str, str]]):
|
| 305 |
-
pairs = load_task(task_name)
|
| 306 |
# Defaults for A and B (8 sliders total)
|
| 307 |
default_scores = [3, 3, 3, 3, 3, 3, 3, 3]
|
| 308 |
if not pairs:
|
|
@@ -366,6 +402,7 @@ def on_pair_navigate(index: int, pairs: List[Dict[str, str]]):
|
|
| 366 |
|
| 367 |
def on_submit(
|
| 368 |
task_name: str,
|
|
|
|
| 369 |
index: int,
|
| 370 |
pairs: List[Dict[str, str]],
|
| 371 |
a_physical_score: int,
|
|
@@ -431,9 +468,10 @@ def on_submit(
|
|
| 431 |
}
|
| 432 |
# Build record
|
| 433 |
row = _build_eval_row(pair, score_map)
|
|
|
|
| 434 |
|
| 435 |
# Idempotency: check if this pair already evaluated; if so, skip writing
|
| 436 |
-
done_keys =
|
| 437 |
eval_key = (pair["test_id"], frozenset({pair["model1_name"], pair["model2_name"]}), pair["org_img"])
|
| 438 |
if eval_key in done_keys:
|
| 439 |
ok_local = False
|
|
@@ -502,6 +540,7 @@ with gr.Blocks(title="VisArena Human Evaluation") as demo:
|
|
| 502 |
interactive=True,
|
| 503 |
value="Scene Composition & Object Insertion",
|
| 504 |
)
|
|
|
|
| 505 |
index_slider = gr.Slider(
|
| 506 |
label="Pair Index",
|
| 507 |
value=0,
|
|
@@ -541,7 +580,7 @@ with gr.Blocks(title="VisArena Human Evaluation") as demo:
|
|
| 541 |
# Event bindings
|
| 542 |
task_selector.change(
|
| 543 |
fn=on_task_change,
|
| 544 |
-
inputs=[task_selector, pair_state],
|
| 545 |
outputs=[
|
| 546 |
pair_state,
|
| 547 |
index_slider,
|
|
@@ -585,6 +624,7 @@ with gr.Blocks(title="VisArena Human Evaluation") as demo:
|
|
| 585 |
fn=on_submit,
|
| 586 |
inputs=[
|
| 587 |
task_selector,
|
|
|
|
| 588 |
index_slider,
|
| 589 |
pair_state,
|
| 590 |
a_physical_input,
|
|
@@ -618,7 +658,7 @@ with gr.Blocks(title="VisArena Human Evaluation") as demo:
|
|
| 618 |
# Auto-load default task on startup
|
| 619 |
demo.load(
|
| 620 |
fn=on_task_change,
|
| 621 |
-
inputs=[task_selector, pair_state],
|
| 622 |
outputs=[
|
| 623 |
pair_state,
|
| 624 |
index_slider,
|
|
|
|
| 121 |
return pairs
|
| 122 |
|
| 123 |
|
| 124 |
+
def _read_eval_counts(task_name: str) -> Dict[Tuple[str, frozenset, str], int]:
    """Count recorded evaluations per image pair across all annotators.

    Reads the CSV located at ``_persist_csv_path_for_task(task_name)`` and
    adds one to the tally for every usable row. Rows are keyed by
    ``(test_id, frozenset({model1_name, model2_name}), org_img)``, so the
    order in which the two models were stored does not matter.

    Args:
        task_name: Task whose persisted CSV should be read.

    Returns:
        Mapping from pair key to the number of rows recorded for it. The
        mapping is empty when the CSV does not exist. Rows in which any of
        the four key fields is missing or blank are skipped. Reading is
        best-effort: if it fails partway, the counts gathered so far are
        returned and the failure is printed.
    """
    counts: Dict[Tuple[str, frozenset, str], int] = {}
    csv_path = _persist_csv_path_for_task(task_name)
    if not os.path.exists(csv_path):
        return counts

    def _field(row: dict, name: str) -> str:
        # csv.DictReader fills the missing cells of a short row with None;
        # str(None) would yield the truthy string "None" and create a bogus
        # key, so treat None as an empty (i.e. missing) field instead.
        value = row.get(name)
        return "" if value is None else str(value).strip()

    try:
        with open(csv_path, newline="", encoding="utf-8") as f:
            for r in csv.DictReader(f):
                tid = _field(r, "test_id")
                m1 = _field(r, "model1_name")
                m2 = _field(r, "model2_name")
                org = _field(r, "org_img")
                if not (tid and m1 and m2 and org):
                    continue
                key = (tid, frozenset({m1, m2}), org)
                counts[key] = counts.get(key, 0) + 1
    except Exception as exc:
        # Stay best-effort (callers still get partial counts), but report
        # the failure rather than hiding it.
        print("[VisArena] failed to read eval counts:", type(exc).__name__, exc)
    return counts
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def _read_user_done_keys(task_name: str, annotator_id: str) -> set:
|
| 148 |
+
"""Keys already evaluated by the given annotator.
|
| 149 |
+
If CSV has no annotator_id column (legacy rows), those rows are ignored for per-user filtering.
|
| 150 |
"""
|
| 151 |
keys = set()
|
| 152 |
+
if not annotator_id:
|
| 153 |
+
return keys
|
| 154 |
csv_path = _persist_csv_path_for_task(task_name)
|
| 155 |
if not os.path.exists(csv_path):
|
| 156 |
return keys
|
|
|
|
| 158 |
with open(csv_path, newline="", encoding="utf-8") as f:
|
| 159 |
reader = csv.DictReader(f)
|
| 160 |
for r in reader:
|
| 161 |
+
if str(r.get("annotator_id", "")).strip() != str(annotator_id).strip():
|
| 162 |
+
continue
|
| 163 |
tid = str(r.get("test_id", "")).strip()
|
| 164 |
m1 = str(r.get("model1_name", "")).strip()
|
| 165 |
m2 = str(r.get("model2_name", "")).strip()
|
|
|
|
| 193 |
return ordered
|
| 194 |
|
| 195 |
|
| 196 |
+
def load_task(task_name: str, annotator_id: str = ""):
|
| 197 |
if not task_name:
|
| 198 |
raise gr.Error("Please select a task first.")
|
| 199 |
|
| 200 |
rows = _load_task_rows(task_name)
|
| 201 |
pairs_all = _build_image_pairs(rows, task_name)
|
| 202 |
+
# Per-user filtering and global balancing
|
|
|
|
| 203 |
def key_of(p: Dict[str, str]):
|
| 204 |
return (p["test_id"], frozenset({p["model1_name"], p["model2_name"]}), p["org_img"])
|
| 205 |
+
user_done_keys = _read_user_done_keys(task_name, annotator_id)
|
| 206 |
+
global_counts = _read_eval_counts(task_name)
|
| 207 |
+
pairs = [p for p in pairs_all if key_of(p) not in user_done_keys]
|
| 208 |
|
| 209 |
+
# Balanced schedule: prioritize low-count pairs, and within same count do round-robin by test_id
|
| 210 |
seed_env = os.environ.get("SCHEDULE_SEED")
|
| 211 |
seed = int(seed_env) if seed_env and seed_env.isdigit() else None
|
| 212 |
+
def count_of(p: Dict[str, str]):
|
| 213 |
+
return global_counts.get(key_of(p), 0)
|
| 214 |
+
buckets: Dict[int, List[Dict[str, str]]] = {}
|
| 215 |
+
for p in sorted(pairs, key=count_of):
|
| 216 |
+
buckets.setdefault(count_of(p), []).append(p)
|
| 217 |
+
ordered: List[Dict[str, str]] = []
|
| 218 |
+
for c in sorted(buckets.keys()):
|
| 219 |
+
ordered.extend(_schedule_round_robin_by_test_id(buckets[c], seed=seed))
|
| 220 |
+
pairs = ordered
|
| 221 |
|
| 222 |
# Assign A/B order to counteract position bias: alternate after scheduling
|
| 223 |
for idx, p in enumerate(pairs):
|
|
|
|
| 227 |
try:
|
| 228 |
print("[VisArena] No pending pairs.")
|
| 229 |
print("[VisArena] total_pairs=", len(pairs_all))
|
| 230 |
+
print("[VisArena] already_done_by_user=", len(user_done_keys))
|
| 231 |
print("[VisArena] persist_csv=", _persist_csv_path_for_task(task_name))
|
| 232 |
except Exception:
|
| 233 |
pass
|
|
|
|
| 269 |
csv_exists = os.path.exists(csv_path)
|
| 270 |
fieldnames = [
|
| 271 |
"eval_date",
|
| 272 |
+
"annotator_id",
|
| 273 |
"test_id",
|
| 274 |
"model1_name",
|
| 275 |
"model2_name",
|
|
|
|
| 337 |
return False, f"Exception: {type(e).__name__}: {e}"
|
| 338 |
|
| 339 |
|
| 340 |
+
def on_task_change(task_name: str, annotator_id: str, _state_pairs: List[Dict[str, str]]):
|
| 341 |
+
pairs = load_task(task_name, annotator_id)
|
| 342 |
# Defaults for A and B (8 sliders total)
|
| 343 |
default_scores = [3, 3, 3, 3, 3, 3, 3, 3]
|
| 344 |
if not pairs:
|
|
|
|
| 402 |
|
| 403 |
def on_submit(
|
| 404 |
task_name: str,
|
| 405 |
+
annotator_id: str,
|
| 406 |
index: int,
|
| 407 |
pairs: List[Dict[str, str]],
|
| 408 |
a_physical_score: int,
|
|
|
|
| 468 |
}
|
| 469 |
# Build record
|
| 470 |
row = _build_eval_row(pair, score_map)
|
| 471 |
+
row["annotator_id"] = annotator_id
|
| 472 |
|
| 473 |
# Idempotency: check if this pair already evaluated; if so, skip writing
|
| 474 |
+
done_keys = _read_user_done_keys(task_name, annotator_id)
|
| 475 |
eval_key = (pair["test_id"], frozenset({pair["model1_name"], pair["model2_name"]}), pair["org_img"])
|
| 476 |
if eval_key in done_keys:
|
| 477 |
ok_local = False
|
|
|
|
| 540 |
interactive=True,
|
| 541 |
value="Scene Composition & Object Insertion",
|
| 542 |
)
|
| 543 |
+
annotator_id_input = gr.Textbox(label="Annotator ID", placeholder="请输入你的唯一标识 (如昵称/学号)")
|
| 544 |
index_slider = gr.Slider(
|
| 545 |
label="Pair Index",
|
| 546 |
value=0,
|
|
|
|
| 580 |
# Event bindings
|
| 581 |
task_selector.change(
|
| 582 |
fn=on_task_change,
|
| 583 |
+
inputs=[task_selector, annotator_id_input, pair_state],
|
| 584 |
outputs=[
|
| 585 |
pair_state,
|
| 586 |
index_slider,
|
|
|
|
| 624 |
fn=on_submit,
|
| 625 |
inputs=[
|
| 626 |
task_selector,
|
| 627 |
+
annotator_id_input,
|
| 628 |
index_slider,
|
| 629 |
pair_state,
|
| 630 |
a_physical_input,
|
|
|
|
| 658 |
# Auto-load default task on startup
|
| 659 |
demo.load(
|
| 660 |
fn=on_task_change,
|
| 661 |
+
inputs=[task_selector, annotator_id_input, pair_state],
|
| 662 |
outputs=[
|
| 663 |
pair_state,
|
| 664 |
index_slider,
|