Upload folder using huggingface_hub
Browse files- .gitattributes +3 -0
- .gitignore +69 -0
- README.md +68 -5
- app.py +44 -0
- assets/Instance Solve Count Distribution.png +3 -0
- assets/Performance Comparison Across Subsets.png +3 -0
- assets/SWE-Bench Verified Discriminative Subsets.png +3 -0
- config/requirements.txt +7 -0
- requirements.txt +7 -0
- src/__init__.py +0 -0
- src/core/__init__.py +0 -0
- src/core/app.py +1355 -0
- src/data/__init__.py +0 -0
- src/data/data_processor_dynamic.py +216 -0
- src/data/leaderboard_data_fetcher.py +274 -0
- src/data/leaderboard_scraper.py +99 -0
- src/utils/__init__.py +0 -0
- src/utils/automated_updater.py +305 -0
- src/utils/export_utils.py +286 -0
- src/utils/update_scheduler.py +146 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
assets/Instance[[:space:]]Solve[[:space:]]Count[[:space:]]Distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
assets/Performance[[:space:]]Comparison[[:space:]]Across[[:space:]]Subsets.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
assets/SWE-Bench[[:space:]]Verified[[:space:]]Discriminative[[:space:]]Subsets.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Large experimental data directory (will be cloned by automated updater)
|
| 2 |
+
leaderboard_swe_bench_experiments/
|
| 3 |
+
|
| 4 |
+
# Cache directory (generated by app)
|
| 5 |
+
leaderboard_cache/
|
| 6 |
+
|
| 7 |
+
# Python
|
| 8 |
+
__pycache__/
|
| 9 |
+
*.py[cod]
|
| 10 |
+
*$py.class
|
| 11 |
+
*.so
|
| 12 |
+
.Python
|
| 13 |
+
build/
|
| 14 |
+
develop-eggs/
|
| 15 |
+
dist/
|
| 16 |
+
downloads/
|
| 17 |
+
eggs/
|
| 18 |
+
.eggs/
|
| 19 |
+
lib/
|
| 20 |
+
lib64/
|
| 21 |
+
parts/
|
| 22 |
+
sdist/
|
| 23 |
+
var/
|
| 24 |
+
wheels/
|
| 25 |
+
*.egg-info/
|
| 26 |
+
.installed.cfg
|
| 27 |
+
*.egg
|
| 28 |
+
MANIFEST
|
| 29 |
+
|
| 30 |
+
# Virtual environments
|
| 31 |
+
.env
|
| 32 |
+
.venv
|
| 33 |
+
env/
|
| 34 |
+
venv/
|
| 35 |
+
ENV/
|
| 36 |
+
env.bak/
|
| 37 |
+
venv.bak/
|
| 38 |
+
|
| 39 |
+
# IDEs
|
| 40 |
+
.vscode/
|
| 41 |
+
.idea/
|
| 42 |
+
*.swp
|
| 43 |
+
*.swo
|
| 44 |
+
*~
|
| 45 |
+
|
| 46 |
+
# OS generated files
|
| 47 |
+
.DS_Store
|
| 48 |
+
.DS_Store?
|
| 49 |
+
._*
|
| 50 |
+
.Spotlight-V100
|
| 51 |
+
.Trashes
|
| 52 |
+
ehthumbs.db
|
| 53 |
+
Thumbs.db
|
| 54 |
+
|
| 55 |
+
# Temporary files
|
| 56 |
+
*.tmp
|
| 57 |
+
*.temp
|
| 58 |
+
temp/
|
| 59 |
+
tmp/
|
| 60 |
+
|
| 61 |
+
# Logs
|
| 62 |
+
*.log
|
| 63 |
+
logs/
|
| 64 |
+
|
| 65 |
+
# Environment variables
|
| 66 |
+
.env.local
|
| 67 |
+
.env.development.local
|
| 68 |
+
.env.test.local
|
| 69 |
+
.env.production.local
|
README.md
CHANGED
|
@@ -1,12 +1,75 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.34.2
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: SWE-Bench Verified Discriminative Subsets Leaderboard
|
| 3 |
+
emoji: 🏆
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: red
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.34.2
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
tags:
|
| 12 |
+
- leaderboard
|
| 13 |
+
- software-engineering
|
| 14 |
+
- swe-bench
|
| 15 |
+
- evaluation
|
| 16 |
+
- benchmark
|
| 17 |
---
|
| 18 |
|
| 19 |
+
# 🏆 SWE-Bench Verified Discriminative Subsets Leaderboard
|
| 20 |
+
|
| 21 |
+
This interactive leaderboard displays SWE-agents performance across SWE-Bench_Verified and four discriminative subsets designed to provide enhanced evaluation sensitivity for state-of-the-art systems.
|
| 22 |
+
|
| 23 |
+
## 🎯 Why Discriminative Subsets?
|
| 24 |
+
|
| 25 |
+
As SWE-agents improve, achieving 70%+ success rates on the full SWE-Bench Verified benchmark, traditional evaluation loses discriminative power. These targeted subsets focus on the most challenging problems to better distinguish between top-tier systems.
|
| 26 |
+
|
| 27 |
+
## 📊 The Four Discriminative Subsets
|
| 28 |
+
|
| 29 |
+
1. **🔥 Frontier Subset** (95 instances): Problems solved by ≤5 agents - maximum evaluative sensitivity
|
| 30 |
+
- Combines unsolved, ultra-rare, and very-rare problems
|
| 31 |
+
- Top agent: 11.6% vs 73.2% on full benchmark (6x better discrimination)
|
| 32 |
+
|
| 33 |
+
2. **⚡ Challenging Subset** (155 instances): Problems solved by ≤20 agents - strong evaluative power
|
| 34 |
+
- Balances discrimination with statistical significance
|
| 35 |
+
- Includes frontier + rare and uncommon problems
|
| 36 |
+
|
| 37 |
+
3. **💪 Hard Subset** (45 instances): All Hard difficulty problems regardless of solve rate
|
| 38 |
+
- Traditional difficulty-based evaluation
|
| 39 |
+
- Focuses on problems originally classified as most difficult
|
| 40 |
+
|
| 41 |
+
4. **📁 MultiFile Subset** (40 instances): Multi-file problems solved by ≤10 agents
|
| 42 |
+
- Targets real-world complexity requiring coordinated edits
|
| 43 |
+
- Even leading agents achieve only 10% success rate
|
| 44 |
+
|
| 45 |
+
## 🔬 Methodology
|
| 46 |
+
|
| 47 |
+
Subsets were created through systematic analysis of solve distribution across 83 evaluated SWE-agents:
|
| 48 |
+
- Problems solved by fewer agents provide better discrimination
|
| 49 |
+
- Analysis covers submissions from October 2023 to May 2025
|
| 50 |
+
- "Solved" means the agent's fix passed the verification test suite
|
| 51 |
+
|
| 52 |
+
## 📈 Key Insights
|
| 53 |
+
|
| 54 |
+
- **Enhanced Resolution**: Frontier subset provides 6x better discrimination between top systems
|
| 55 |
+
- **Multi-file Complexity**: Represents genuine software engineering challenges
|
| 56 |
+
- **Statistical Significance**: Challenging subset offers robust evaluation with strong discrimination
|
| 57 |
+
- **Real Progress**: Performance on these subsets indicates genuine capability advances
|
| 58 |
+
|
| 59 |
+
## 🔗 Resources
|
| 60 |
+
|
| 61 |
+
- **Blog Post**: [From 73% to 11%: Revealing True SWE-Agent Capabilities](https://jatinganhotra.dev/blog/swe-agents/2025/06/05/swe-bench-verified-discriminative-subsets.html)
|
| 62 |
+
- **Dataset**: [SWE-bench_Verified-discriminative](https://huggingface.co/datasets/jatinganhotra/SWE-bench_Verified-discriminative)
|
| 63 |
+
- **Original SWE-Bench**: [SWE-bench.com](https://www.swebench.com/)
|
| 64 |
+
|
| 65 |
+
## 🚀 Usage
|
| 66 |
+
|
| 67 |
+
```python
|
| 68 |
+
from datasets import load_dataset
|
| 69 |
+
|
| 70 |
+
# Load specific discriminative subset
|
| 71 |
+
frontier = load_dataset("jatinganhotra/SWE-bench_Verified-discriminative", split="frontier")
|
| 72 |
+
challenging = load_dataset("jatinganhotra/SWE-bench_Verified-discriminative", split="challenging")
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
Created by [Jatin Ganhotra](https://jatinganhotra.dev) | Last Updated: June 19 2025
|
app.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Main entry point for the SWE-Bench Verified Discriminative Subsets Leaderboard
|
| 4 |
+
HuggingFace Spaces Entry Point
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
# Import the main application from the organized structure
|
| 8 |
+
from src.core.app import create_leaderboard, start_automated_updates
|
| 9 |
+
|
| 10 |
+
if __name__ == "__main__":
|
| 11 |
+
import tempfile
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
# Ensure experiments directory exists before starting the app
|
| 15 |
+
experiments_dir = Path("leaderboard_swe_bench_experiments")
|
| 16 |
+
if not experiments_dir.exists():
|
| 17 |
+
print("🔄 Experiments directory not found, cloning repository...")
|
| 18 |
+
try:
|
| 19 |
+
from src.utils.automated_updater import AutomatedUpdater
|
| 20 |
+
updater = AutomatedUpdater()
|
| 21 |
+
if updater.clone_or_update_experiments_repo():
|
| 22 |
+
print("✅ Successfully cloned experiments repository")
|
| 23 |
+
else:
|
| 24 |
+
print("❌ Failed to clone experiments repository")
|
| 25 |
+
except Exception as e:
|
| 26 |
+
print(f"❌ Error setting up experiments repository: {e}")
|
| 27 |
+
else:
|
| 28 |
+
print("✅ Experiments directory found")
|
| 29 |
+
|
| 30 |
+
# Start automated updates
|
| 31 |
+
try:
|
| 32 |
+
start_automated_updates()
|
| 33 |
+
print("✅ Automated updates started")
|
| 34 |
+
except Exception as e:
|
| 35 |
+
print(f"⚠️ Could not start automated updates: {e}")
|
| 36 |
+
|
| 37 |
+
# Create and launch the leaderboard
|
| 38 |
+
demo = create_leaderboard()
|
| 39 |
+
demo.launch(
|
| 40 |
+
server_name="0.0.0.0",
|
| 41 |
+
server_port=7860,
|
| 42 |
+
share=False,
|
| 43 |
+
allowed_paths=[tempfile.gettempdir()]
|
| 44 |
+
)
|
assets/Instance Solve Count Distribution.png
ADDED
|
Git LFS Details
|
assets/Performance Comparison Across Subsets.png
ADDED
|
Git LFS Details
|
assets/SWE-Bench Verified Discriminative Subsets.png
ADDED
|
Git LFS Details
|
config/requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==4.44.0
|
| 2 |
+
pandas>=2.1.0
|
| 3 |
+
numpy>=1.24.0
|
| 4 |
+
datasets>=2.14.0
|
| 5 |
+
requests>=2.28.0
|
| 6 |
+
beautifulsoup4>=4.12.0
|
| 7 |
+
schedule>=1.2.0
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=5.34.2
|
| 2 |
+
pandas>=2.1.0
|
| 3 |
+
numpy>=1.24.0
|
| 4 |
+
datasets>=2.14.0
|
| 5 |
+
requests>=2.28.0
|
| 6 |
+
schedule>=1.2.0
|
| 7 |
+
PyYAML>=6.0
|
src/__init__.py
ADDED
|
File without changes
|
src/core/__init__.py
ADDED
|
File without changes
|
src/core/app.py
ADDED
|
@@ -0,0 +1,1355 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from src.data.data_processor_dynamic import (
|
| 4 |
+
load_actual_performance_data,
|
| 5 |
+
get_subset_descriptions,
|
| 6 |
+
get_top_10_data,
|
| 7 |
+
get_2025_data,
|
| 8 |
+
get_2024_data,
|
| 9 |
+
get_pre_2024_data,
|
| 10 |
+
create_html_table_for_subset
|
| 11 |
+
)
|
| 12 |
+
from src.utils.export_utils import (
|
| 13 |
+
export_full_leaderboard_csv,
|
| 14 |
+
export_full_leaderboard_json,
|
| 15 |
+
export_subset_csv,
|
| 16 |
+
export_subset_json
|
| 17 |
+
)
|
| 18 |
+
from src.utils.automated_updater import get_update_status, get_last_update_info, should_show_update_notification
|
| 19 |
+
from src.utils.update_scheduler import start_automated_updates, force_update, get_scheduler_status
|
| 20 |
+
|
| 21 |
+
def get_update_status_message() -> str:
|
| 22 |
+
"""Get update status message for display in UI"""
|
| 23 |
+
try:
|
| 24 |
+
last_update = get_last_update_info()
|
| 25 |
+
update_status = get_update_status()
|
| 26 |
+
|
| 27 |
+
# Format last update time
|
| 28 |
+
last_update_time = last_update.get("last_successful_update")
|
| 29 |
+
if last_update_time:
|
| 30 |
+
from datetime import datetime
|
| 31 |
+
try:
|
| 32 |
+
dt = datetime.fromisoformat(last_update_time)
|
| 33 |
+
formatted_time = dt.strftime("%B %d, %Y at %H:%M UTC")
|
| 34 |
+
except:
|
| 35 |
+
formatted_time = last_update_time
|
| 36 |
+
else:
|
| 37 |
+
formatted_time = "Never"
|
| 38 |
+
|
| 39 |
+
# Check for new submissions
|
| 40 |
+
new_count = update_status.get("new_submissions_count", 0)
|
| 41 |
+
|
| 42 |
+
base_message = f"**Last Updated**: {formatted_time}"
|
| 43 |
+
|
| 44 |
+
if new_count > 0:
|
| 45 |
+
base_message += f"\n\n✅ **{new_count} new submissions** detected and automatically processed."
|
| 46 |
+
|
| 47 |
+
return base_message
|
| 48 |
+
|
| 49 |
+
except Exception as e:
|
| 50 |
+
return f"**Last Updated**: Status unavailable (error: {e})"
|
| 51 |
+
|
| 52 |
+
def force_update_action():
|
| 53 |
+
"""Force an immediate update and return status with detailed reporting"""
|
| 54 |
+
try:
|
| 55 |
+
from src.utils.automated_updater import AutomatedUpdater
|
| 56 |
+
|
| 57 |
+
result = force_update()
|
| 58 |
+
|
| 59 |
+
# Get detailed information for the update
|
| 60 |
+
updater = AutomatedUpdater()
|
| 61 |
+
|
| 62 |
+
# Count directories in experiments repo
|
| 63 |
+
experiments_dir = updater.experiments_dir / "evaluation" / "verified"
|
| 64 |
+
directory_count = 0
|
| 65 |
+
metadata_count = 0
|
| 66 |
+
missing_metadata = []
|
| 67 |
+
|
| 68 |
+
if experiments_dir.exists():
|
| 69 |
+
for d in experiments_dir.iterdir():
|
| 70 |
+
if d.is_dir() and (d / "results" / "results.json").exists():
|
| 71 |
+
directory_count += 1
|
| 72 |
+
metadata_file = d / "metadata.yaml"
|
| 73 |
+
if metadata_file.exists():
|
| 74 |
+
metadata_count += 1
|
| 75 |
+
else:
|
| 76 |
+
missing_metadata.append(d.name)
|
| 77 |
+
|
| 78 |
+
# Build detailed message
|
| 79 |
+
if result["success"]:
|
| 80 |
+
main_message = f"✅ Update completed successfully. {result['message']}"
|
| 81 |
+
else:
|
| 82 |
+
main_message = f"❌ Update failed: {result['message']}"
|
| 83 |
+
|
| 84 |
+
# Build simplified details for metadata approach
|
| 85 |
+
details = f"""
|
| 86 |
+
**Data Source Status:**
|
| 87 |
+
- **Experiment Directories**: {directory_count} submissions with valid results.json
|
| 88 |
+
- **Metadata Files**: {metadata_count} directories with metadata.yaml containing official names
|
| 89 |
+
- **Leaderboard Display**: Showing {metadata_count} agents with official names from metadata.yaml
|
| 90 |
+
|
| 91 |
+
**Agent Name Source**: Official names are read directly from each submission's metadata.yaml file (info.name field), providing the most accurate and up-to-date agent information."""
|
| 92 |
+
|
| 93 |
+
if missing_metadata:
|
| 94 |
+
details += f"""
|
| 95 |
+
|
| 96 |
+
**Directories Missing metadata.yaml** ({len(missing_metadata)} total):
|
| 97 |
+
{chr(10).join([f'- {system}' for system in missing_metadata[:5]])}
|
| 98 |
+
{'- ... and ' + str(len(missing_metadata) - 5) + ' more' if len(missing_metadata) > 5 else ''}"""
|
| 99 |
+
|
| 100 |
+
return main_message, details.strip()
|
| 101 |
+
|
| 102 |
+
except Exception as e:
|
| 103 |
+
return f"❌ Update error: {e}", ""
|
| 104 |
+
|
| 105 |
+
def create_multi_table_html():
|
| 106 |
+
"""Create multiple HTML tables for different time periods"""
|
| 107 |
+
|
| 108 |
+
# Load the raw data
|
| 109 |
+
performance_data = load_actual_performance_data()
|
| 110 |
+
|
| 111 |
+
# Get data for different time periods
|
| 112 |
+
top_10_data = get_top_10_data(performance_data)
|
| 113 |
+
since_2025_data = get_2025_data(performance_data)
|
| 114 |
+
data_2024 = get_2024_data(performance_data)
|
| 115 |
+
pre_2024_data = get_pre_2024_data(performance_data)
|
| 116 |
+
|
| 117 |
+
# Create the complete HTML with multiple tables
|
| 118 |
+
html = """
|
| 119 |
+
<div class="leaderboard-container">
|
| 120 |
+
<!-- Top 10 Table (Always Visible) -->
|
| 121 |
+
<div class="table-section">
|
| 122 |
+
<h3>🏆 Top 10 Performers (All Time)</h3>
|
| 123 |
+
{}
|
| 124 |
+
</div>
|
| 125 |
+
|
| 126 |
+
<!-- Since 2025 Table (Always Visible) -->
|
| 127 |
+
<div class="table-section">
|
| 128 |
+
<h3>🚀 Since January 1, 2025 ({} submissions)</h3>
|
| 129 |
+
{}
|
| 130 |
+
</div>
|
| 131 |
+
|
| 132 |
+
<!-- 2024 Table (Collapsible) -->
|
| 133 |
+
<div class="table-section">
|
| 134 |
+
<details>
|
| 135 |
+
<summary><h3>📅 2024 Submissions ({} submissions)</h3></summary>
|
| 136 |
+
{}
|
| 137 |
+
</details>
|
| 138 |
+
</div>
|
| 139 |
+
|
| 140 |
+
<!-- Pre-2024 Table (Collapsible) -->
|
| 141 |
+
<div class="table-section">
|
| 142 |
+
<details>
|
| 143 |
+
<summary><h3>📜 Pre-2024 Submissions ({} submissions)</h3></summary>
|
| 144 |
+
{}
|
| 145 |
+
</details>
|
| 146 |
+
</div>
|
| 147 |
+
|
| 148 |
+
<!-- Full Table (Collapsible) -->
|
| 149 |
+
<div class="table-section">
|
| 150 |
+
<details>
|
| 151 |
+
<summary><h3>📊 Complete Leaderboard (All {} submissions)</h3></summary>
|
| 152 |
+
{}
|
| 153 |
+
</details>
|
| 154 |
+
</div>
|
| 155 |
+
</div>
|
| 156 |
+
""".format(
|
| 157 |
+
create_html_table_for_subset(top_10_data, "top10"),
|
| 158 |
+
len(since_2025_data),
|
| 159 |
+
create_html_table_for_subset(since_2025_data, "since2025"),
|
| 160 |
+
len(data_2024),
|
| 161 |
+
create_html_table_for_subset(data_2024, "data2024"),
|
| 162 |
+
len(pre_2024_data),
|
| 163 |
+
create_html_table_for_subset(pre_2024_data, "pre2024"),
|
| 164 |
+
len(performance_data),
|
| 165 |
+
create_html_table_for_subset(performance_data, "complete")
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
return html
|
| 169 |
+
|
| 170 |
+
# Export functions for Gradio - Fixed versions
|
| 171 |
+
def download_full_csv():
|
| 172 |
+
"""Export full leaderboard data as CSV"""
|
| 173 |
+
try:
|
| 174 |
+
performance_data = load_actual_performance_data()
|
| 175 |
+
return export_full_leaderboard_csv(performance_data)
|
| 176 |
+
except Exception as e:
|
| 177 |
+
print(f"Error in download_full_csv: {e}")
|
| 178 |
+
# Return a dummy file on error
|
| 179 |
+
import tempfile, os
|
| 180 |
+
temp_path = os.path.join(tempfile.gettempdir(), "error.txt")
|
| 181 |
+
with open(temp_path, 'w') as f:
|
| 182 |
+
f.write(f"Error: {e}")
|
| 183 |
+
return temp_path
|
| 184 |
+
|
| 185 |
+
def download_full_json():
|
| 186 |
+
"""Export full leaderboard data as JSON"""
|
| 187 |
+
try:
|
| 188 |
+
performance_data = load_actual_performance_data()
|
| 189 |
+
return export_full_leaderboard_json(performance_data)
|
| 190 |
+
except Exception as e:
|
| 191 |
+
print(f"Error in download_full_json: {e}")
|
| 192 |
+
import tempfile, os
|
| 193 |
+
temp_path = os.path.join(tempfile.gettempdir(), "error.txt")
|
| 194 |
+
with open(temp_path, 'w') as f:
|
| 195 |
+
f.write(f"Error: {e}")
|
| 196 |
+
return temp_path
|
| 197 |
+
|
| 198 |
+
def download_top10_csv():
|
| 199 |
+
"""Export top 10 agents as CSV"""
|
| 200 |
+
try:
|
| 201 |
+
performance_data = load_actual_performance_data()
|
| 202 |
+
top_10_data = get_top_10_data(performance_data)
|
| 203 |
+
return export_full_leaderboard_csv(top_10_data)
|
| 204 |
+
except Exception as e:
|
| 205 |
+
print(f"Error in download_top10_csv: {e}")
|
| 206 |
+
import tempfile, os
|
| 207 |
+
temp_path = os.path.join(tempfile.gettempdir(), "error.txt")
|
| 208 |
+
with open(temp_path, 'w') as f:
|
| 209 |
+
f.write(f"Error: {e}")
|
| 210 |
+
return temp_path
|
| 211 |
+
|
| 212 |
+
def download_2025_csv():
|
| 213 |
+
"""Export 2025 submissions as CSV"""
|
| 214 |
+
try:
|
| 215 |
+
performance_data = load_actual_performance_data()
|
| 216 |
+
data_2025 = get_2025_data(performance_data)
|
| 217 |
+
return export_full_leaderboard_csv(data_2025)
|
| 218 |
+
except Exception as e:
|
| 219 |
+
print(f"Error in download_2025_csv: {e}")
|
| 220 |
+
import tempfile, os
|
| 221 |
+
temp_path = os.path.join(tempfile.gettempdir(), "error.txt")
|
| 222 |
+
with open(temp_path, 'w') as f:
|
| 223 |
+
f.write(f"Error: {e}")
|
| 224 |
+
return temp_path
|
| 225 |
+
|
| 226 |
+
def download_frontier_csv():
|
| 227 |
+
"""Export frontier subset results as CSV"""
|
| 228 |
+
try:
|
| 229 |
+
performance_data = load_actual_performance_data()
|
| 230 |
+
return export_subset_csv(performance_data, "frontier")
|
| 231 |
+
except Exception as e:
|
| 232 |
+
print(f"Error in download_frontier_csv: {e}")
|
| 233 |
+
import tempfile, os
|
| 234 |
+
temp_path = os.path.join(tempfile.gettempdir(), "error.txt")
|
| 235 |
+
with open(temp_path, 'w') as f:
|
| 236 |
+
f.write(f"Error: {e}")
|
| 237 |
+
return temp_path
|
| 238 |
+
|
| 239 |
+
def download_challenging_csv():
|
| 240 |
+
"""Export challenging subset results as CSV"""
|
| 241 |
+
try:
|
| 242 |
+
performance_data = load_actual_performance_data()
|
| 243 |
+
return export_subset_csv(performance_data, "challenging")
|
| 244 |
+
except Exception as e:
|
| 245 |
+
print(f"Error in download_challenging_csv: {e}")
|
| 246 |
+
import tempfile, os
|
| 247 |
+
temp_path = os.path.join(tempfile.gettempdir(), "error.txt")
|
| 248 |
+
with open(temp_path, 'w') as f:
|
| 249 |
+
f.write(f"Error: {e}")
|
| 250 |
+
return temp_path
|
| 251 |
+
|
| 252 |
+
def download_hard_csv():
|
| 253 |
+
"""Export hard subset results as CSV"""
|
| 254 |
+
try:
|
| 255 |
+
performance_data = load_actual_performance_data()
|
| 256 |
+
return export_subset_csv(performance_data, "hard")
|
| 257 |
+
except Exception as e:
|
| 258 |
+
print(f"Error in download_hard_csv: {e}")
|
| 259 |
+
import tempfile, os
|
| 260 |
+
temp_path = os.path.join(tempfile.gettempdir(), "error.txt")
|
| 261 |
+
with open(temp_path, 'w') as f:
|
| 262 |
+
f.write(f"Error: {e}")
|
| 263 |
+
return temp_path
|
| 264 |
+
|
| 265 |
+
def download_multifile_csv():
|
| 266 |
+
"""Export multifile subset results as CSV"""
|
| 267 |
+
try:
|
| 268 |
+
performance_data = load_actual_performance_data()
|
| 269 |
+
return export_subset_csv(performance_data, "multifile")
|
| 270 |
+
except Exception as e:
|
| 271 |
+
print(f"Error in download_multifile_csv: {e}")
|
| 272 |
+
import tempfile, os
|
| 273 |
+
temp_path = os.path.join(tempfile.gettempdir(), "error.txt")
|
| 274 |
+
with open(temp_path, 'w') as f:
|
| 275 |
+
f.write(f"Error: {e}")
|
| 276 |
+
return temp_path
|
| 277 |
+
|
| 278 |
+
def create_leaderboard():
|
| 279 |
+
"""Create the main leaderboard interface with filtering"""
|
| 280 |
+
|
| 281 |
+
# Ensure experiments directory exists before creating the leaderboard
|
| 282 |
+
from pathlib import Path
|
| 283 |
+
experiments_dir = Path("leaderboard_swe_bench_experiments")
|
| 284 |
+
if not experiments_dir.exists():
|
| 285 |
+
print("🔄 Experiments directory not found, cloning repository...")
|
| 286 |
+
try:
|
| 287 |
+
from src.utils.automated_updater import AutomatedUpdater
|
| 288 |
+
updater = AutomatedUpdater()
|
| 289 |
+
if updater.clone_or_update_experiments_repo():
|
| 290 |
+
print("✅ Successfully cloned experiments repository")
|
| 291 |
+
else:
|
| 292 |
+
print("❌ Failed to clone experiments repository")
|
| 293 |
+
except Exception as e:
|
| 294 |
+
print(f"❌ Error setting up experiments repository: {e}")
|
| 295 |
+
|
| 296 |
+
subset_descriptions = get_subset_descriptions()
|
| 297 |
+
|
| 298 |
+
# Create the Gradio interface
|
| 299 |
+
with gr.Blocks(
|
| 300 |
+
title="SWE-Bench Discriminative Subsets Leaderboard",
|
| 301 |
+
theme=gr.themes.Soft(),
|
| 302 |
+
css="""
|
| 303 |
+
.gradio-container {
|
| 304 |
+
max-width: 1600px !important;
|
| 305 |
+
padding: 0 10px !important;
|
| 306 |
+
font-family: 'Inter', 'Segoe UI', 'Roboto', -apple-system, BlinkMacSystemFont, sans-serif;
|
| 307 |
+
}
|
| 308 |
+
|
| 309 |
+
/* Custom HTML table styling - no Bootstrap */
|
| 310 |
+
.custom-table-container {
|
| 311 |
+
max-width: 900px;
|
| 312 |
+
margin: 0 auto;
|
| 313 |
+
overflow-x: auto;
|
| 314 |
+
}
|
| 315 |
+
|
| 316 |
+
.custom-leaderboard-table {
|
| 317 |
+
width: 100%;
|
| 318 |
+
border-collapse: collapse;
|
| 319 |
+
font-family: 'Inter', sans-serif;
|
| 320 |
+
font-size: 14px;
|
| 321 |
+
margin: 0;
|
| 322 |
+
padding: 0;
|
| 323 |
+
}
|
| 324 |
+
|
| 325 |
+
.custom-leaderboard-table th {
|
| 326 |
+
font-size: 14px;
|
| 327 |
+
padding: 10px 4px;
|
| 328 |
+
line-height: 1.3;
|
| 329 |
+
color: white;
|
| 330 |
+
font-weight: 600;
|
| 331 |
+
text-align: center;
|
| 332 |
+
cursor: pointer;
|
| 333 |
+
user-select: none;
|
| 334 |
+
border: 1px solid #ddd;
|
| 335 |
+
}
|
| 336 |
+
|
| 337 |
+
.custom-leaderboard-table td {
|
| 338 |
+
font-size: 14px;
|
| 339 |
+
padding: 8px 4px;
|
| 340 |
+
line-height: 1.3;
|
| 341 |
+
text-align: center;
|
| 342 |
+
border: 1px solid #ddd;
|
| 343 |
+
}
|
| 344 |
+
|
| 345 |
+
/* Column-specific cell backgrounds (lighter versions of header colors) */
|
| 346 |
+
.custom-leaderboard-table td.rank-col,
|
| 347 |
+
.custom-leaderboard-table td.agent-col,
|
| 348 |
+
.custom-leaderboard-table td.date-col {
|
| 349 |
+
background-color: rgba(109, 103, 107, 0.1);
|
| 350 |
+
}
|
| 351 |
+
.custom-leaderboard-table td.full-col {
|
| 352 |
+
background-color: rgba(52, 152, 219, 0.1);
|
| 353 |
+
}
|
| 354 |
+
.custom-leaderboard-table td.frontier-col {
|
| 355 |
+
background-color: rgba(231, 76, 60, 0.1);
|
| 356 |
+
}
|
| 357 |
+
.custom-leaderboard-table td.challenging-col {
|
| 358 |
+
background-color: rgba(243, 156, 18, 0.1);
|
| 359 |
+
}
|
| 360 |
+
.custom-leaderboard-table td.hard-col {
|
| 361 |
+
background-color: rgba(155, 89, 182, 0.1);
|
| 362 |
+
}
|
| 363 |
+
.custom-leaderboard-table td.multifile-col {
|
| 364 |
+
background-color: rgba(26, 188, 156, 0.1);
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
/* Column-specific widths and styling */
|
| 368 |
+
.rank-col { width: 60px; }
|
| 369 |
+
.agent-col { width: 220px; text-align: left !important; padding-left: 8px !important; }
|
| 370 |
+
.date-col { width: 100px; }
|
| 371 |
+
.full-col { width: 100px; }
|
| 372 |
+
.frontier-col { width: 80px; }
|
| 373 |
+
.challenging-col { width: 100px; }
|
| 374 |
+
.hard-col { width: 80px; }
|
| 375 |
+
.multifile-col { width: 90px; }
|
| 376 |
+
|
| 377 |
+
/* Color scheme for headers */
|
| 378 |
+
.custom-leaderboard-table th.rank-col,
|
| 379 |
+
.custom-leaderboard-table th.agent-col,
|
| 380 |
+
.custom-leaderboard-table th.date-col {
|
| 381 |
+
background: linear-gradient(135deg, #6d676b, #6d676b);
|
| 382 |
+
}
|
| 383 |
+
.custom-leaderboard-table th.full-col {
|
| 384 |
+
background: linear-gradient(135deg, #3498db, #2980b9);
|
| 385 |
+
}
|
| 386 |
+
.custom-leaderboard-table th.frontier-col {
|
| 387 |
+
background: linear-gradient(135deg, #e74c3c, #c0392b);
|
| 388 |
+
}
|
| 389 |
+
.custom-leaderboard-table th.challenging-col {
|
| 390 |
+
background: linear-gradient(135deg, #f39c12, #e67e22);
|
| 391 |
+
}
|
| 392 |
+
.custom-leaderboard-table th.hard-col {
|
| 393 |
+
background: linear-gradient(135deg, #9b59b6, #8e44ad);
|
| 394 |
+
}
|
| 395 |
+
.custom-leaderboard-table th.multifile-col {
|
| 396 |
+
background: linear-gradient(135deg, #1abc9c, #16a085);
|
| 397 |
+
}
|
| 398 |
+
|
| 399 |
+
/* Table section styling */
|
| 400 |
+
.table-section {
|
| 401 |
+
margin: 20px 0;
|
| 402 |
+
}
|
| 403 |
+
|
| 404 |
+
.table-section h3 {
|
| 405 |
+
margin-bottom: 10px;
|
| 406 |
+
margin-top: 5px;
|
| 407 |
+
color: var(--body-text-color, #333);
|
| 408 |
+
}
|
| 409 |
+
|
| 410 |
+
/* Collapsible sections */
|
| 411 |
+
details {
|
| 412 |
+
margin: 20px 0;
|
| 413 |
+
}
|
| 414 |
+
|
| 415 |
+
details summary {
|
| 416 |
+
cursor: pointer;
|
| 417 |
+
padding: 10px;
|
| 418 |
+
background-color: rgba(248, 249, 250, 0.8);
|
| 419 |
+
border-radius: 8px;
|
| 420 |
+
border: 1px solid #ddd;
|
| 421 |
+
font-weight: bold;
|
| 422 |
+
}
|
| 423 |
+
|
| 424 |
+
details summary:hover {
|
| 425 |
+
background-color: rgba(240, 240, 240, 0.9);
|
| 426 |
+
}
|
| 427 |
+
|
| 428 |
+
details[open] summary {
|
| 429 |
+
margin-bottom: 15px;
|
| 430 |
+
}
|
| 431 |
+
|
| 432 |
+
/* Dark theme for collapsible sections */
|
| 433 |
+
.dark details summary {
|
| 434 |
+
background-color: rgba(64, 68, 75, 0.8);
|
| 435 |
+
border-color: #495057;
|
| 436 |
+
color: var(--body-text-color, #fff);
|
| 437 |
+
}
|
| 438 |
+
|
| 439 |
+
.dark details summary:hover {
|
| 440 |
+
background-color: rgba(80, 85, 95, 0.9);
|
| 441 |
+
}
|
| 442 |
+
|
| 443 |
+
/* Tweet-style blockquotes */
|
| 444 |
+
blockquote {
|
| 445 |
+
border-left: 4px solid #1da1f2;
|
| 446 |
+
background: linear-gradient(135deg, rgba(29, 161, 242, 0.1), rgba(29, 161, 242, 0.05));
|
| 447 |
+
margin: 20px 0;
|
| 448 |
+
padding: 20px;
|
| 449 |
+
font-style: normal;
|
| 450 |
+
border-radius: 8px;
|
| 451 |
+
box-shadow: none;
|
| 452 |
+
box-sizing: border-box;
|
| 453 |
+
width: 100%;
|
| 454 |
+
max-width: 100%;
|
| 455 |
+
}
|
| 456 |
+
|
| 457 |
+
.dark blockquote {
|
| 458 |
+
border-left-color: #4dabf7;
|
| 459 |
+
background: linear-gradient(135deg, rgba(77, 171, 247, 0.15), rgba(77, 171, 247, 0.08));
|
| 460 |
+
}
|
| 461 |
+
|
| 462 |
+
/* Row styling */
|
| 463 |
+
.custom-leaderboard-table tbody tr:nth-child(even) {
|
| 464 |
+
background-color: #fafafa;
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
.custom-leaderboard-table tbody tr:hover {
|
| 468 |
+
background-color: #f0f0f0;
|
| 469 |
+
}
|
| 470 |
+
|
| 471 |
+
/* Dark theme compatibility */
|
| 472 |
+
.dark .custom-leaderboard-table th {
|
| 473 |
+
border-color: #495057;
|
| 474 |
+
}
|
| 475 |
+
|
| 476 |
+
.dark .custom-leaderboard-table td {
|
| 477 |
+
border-color: #495057;
|
| 478 |
+
background-color: #2c3e50;
|
| 479 |
+
color: #ecf0f1;
|
| 480 |
+
}
|
| 481 |
+
|
| 482 |
+
.dark .custom-leaderboard-table tbody tr:nth-child(even) {
|
| 483 |
+
background-color: #34495e;
|
| 484 |
+
}
|
| 485 |
+
|
| 486 |
+
.dark .custom-leaderboard-table tbody tr:hover {
|
| 487 |
+
background-color: #3c4f66;
|
| 488 |
+
}
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
/* Light theme styles */
|
| 492 |
+
.subset-description {
|
| 493 |
+
background-color: rgba(248, 249, 250, 0.8);
|
| 494 |
+
color: var(--body-text-color, #333);
|
| 495 |
+
padding: 20px;
|
| 496 |
+
border-radius: 8px;
|
| 497 |
+
margin: 15px 0;
|
| 498 |
+
border-left: 4px solid #007acc;
|
| 499 |
+
box-shadow: none;
|
| 500 |
+
box-sizing: border-box;
|
| 501 |
+
width: 100%;
|
| 502 |
+
max-width: 100%;
|
| 503 |
+
}
|
| 504 |
+
|
| 505 |
+
/* Individual subset styling - Extra compact version */
|
| 506 |
+
.frontier-subset {
|
| 507 |
+
background: linear-gradient(135deg, rgba(231, 76, 60, 0.08), rgba(231, 76, 60, 0.04));
|
| 508 |
+
border-left: 4px solid #e74c3c;
|
| 509 |
+
border-radius: 6px;
|
| 510 |
+
padding: 8px 12px;
|
| 511 |
+
margin: 5px 0;
|
| 512 |
+
box-shadow: 0 1px 3px rgba(231, 76, 60, 0.1);
|
| 513 |
+
font-size: 12px;
|
| 514 |
+
line-height: 1.3;
|
| 515 |
+
}
|
| 516 |
+
|
| 517 |
+
.challenging-subset {
|
| 518 |
+
background: linear-gradient(135deg, rgba(243, 156, 18, 0.08), rgba(243, 156, 18, 0.04));
|
| 519 |
+
border-left: 4px solid #f39c12;
|
| 520 |
+
border-radius: 6px;
|
| 521 |
+
padding: 8px 12px;
|
| 522 |
+
margin: 5px 0;
|
| 523 |
+
box-shadow: 0 1px 3px rgba(243, 156, 18, 0.1);
|
| 524 |
+
font-size: 12px;
|
| 525 |
+
line-height: 1.3;
|
| 526 |
+
}
|
| 527 |
+
|
| 528 |
+
.hard-subset {
|
| 529 |
+
background: linear-gradient(135deg, rgba(155, 89, 182, 0.08), rgba(155, 89, 182, 0.04));
|
| 530 |
+
border-left: 4px solid #9b59b6;
|
| 531 |
+
border-radius: 6px;
|
| 532 |
+
padding: 8px 12px;
|
| 533 |
+
margin: 5px 0;
|
| 534 |
+
box-shadow: 0 1px 3px rgba(155, 89, 182, 0.1);
|
| 535 |
+
font-size: 12px;
|
| 536 |
+
line-height: 1.3;
|
| 537 |
+
}
|
| 538 |
+
|
| 539 |
+
.multifile-subset {
|
| 540 |
+
background: linear-gradient(135deg, rgba(26, 188, 156, 0.08), rgba(26, 188, 156, 0.04));
|
| 541 |
+
border-left: 4px solid #1abc9c;
|
| 542 |
+
border-radius: 6px;
|
| 543 |
+
padding: 8px 12px;
|
| 544 |
+
margin: 5px 0;
|
| 545 |
+
box-shadow: 0 1px 3px rgba(26, 188, 156, 0.1);
|
| 546 |
+
font-size: 12px;
|
| 547 |
+
line-height: 1.3;
|
| 548 |
+
}
|
| 549 |
+
|
| 550 |
+
.subset-title {
|
| 551 |
+
font-size: 18px;
|
| 552 |
+
font-weight: 700;
|
| 553 |
+
margin-bottom: 8px;
|
| 554 |
+
display: flex;
|
| 555 |
+
align-items: center;
|
| 556 |
+
gap: 8px;
|
| 557 |
+
}
|
| 558 |
+
|
| 559 |
+
.subset-subtitle {
|
| 560 |
+
font-size: 14px;
|
| 561 |
+
font-weight: 600;
|
| 562 |
+
margin-bottom: 12px;
|
| 563 |
+
opacity: 0.8;
|
| 564 |
+
}
|
| 565 |
+
|
| 566 |
+
.subset-details {
|
| 567 |
+
font-size: 14px;
|
| 568 |
+
line-height: 1.6;
|
| 569 |
+
margin-top: 12px;
|
| 570 |
+
}
|
| 571 |
+
.methodology-section {
|
| 572 |
+
background-color: rgba(232, 244, 248, 0.8);
|
| 573 |
+
color: var(--body-text-color, #333);
|
| 574 |
+
padding: 20px;
|
| 575 |
+
border-radius: 8px;
|
| 576 |
+
margin: 20px 0;
|
| 577 |
+
border-left: 4px solid #28a745;
|
| 578 |
+
box-shadow: none;
|
| 579 |
+
box-sizing: border-box;
|
| 580 |
+
width: 100%;
|
| 581 |
+
max-width: 100%;
|
| 582 |
+
}
|
| 583 |
+
.insights-section {
|
| 584 |
+
background-color: rgba(255, 243, 205, 0.8);
|
| 585 |
+
color: var(--body-text-color, #333);
|
| 586 |
+
padding: 20px;
|
| 587 |
+
border-radius: 8px;
|
| 588 |
+
margin: 20px 0;
|
| 589 |
+
border-left: 4px solid #ffc107;
|
| 590 |
+
box-shadow: none;
|
| 591 |
+
box-sizing: border-box;
|
| 592 |
+
width: 100%;
|
| 593 |
+
max-width: 100%;
|
| 594 |
+
}
|
| 595 |
+
|
| 596 |
+
/* Dark theme compatibility */
|
| 597 |
+
.dark .subset-description {
|
| 598 |
+
background-color: rgba(64, 68, 75, 0.8);
|
| 599 |
+
color: var(--body-text-color, #fff);
|
| 600 |
+
border-left: 5px solid #4dabf7;
|
| 601 |
+
}
|
| 602 |
+
|
| 603 |
+
.dark .frontier-subset {
|
| 604 |
+
background: linear-gradient(135deg, rgba(231, 76, 60, 0.15), rgba(231, 76, 60, 0.08));
|
| 605 |
+
border-left: 4px solid #ff6b6b;
|
| 606 |
+
color: var(--body-text-color, #fff);
|
| 607 |
+
}
|
| 608 |
+
|
| 609 |
+
.dark .challenging-subset {
|
| 610 |
+
background: linear-gradient(135deg, rgba(243, 156, 18, 0.15), rgba(243, 156, 18, 0.08));
|
| 611 |
+
border-left: 4px solid #ffd43b;
|
| 612 |
+
color: var(--body-text-color, #fff);
|
| 613 |
+
}
|
| 614 |
+
|
| 615 |
+
.dark .hard-subset {
|
| 616 |
+
background: linear-gradient(135deg, rgba(155, 89, 182, 0.15), rgba(155, 89, 182, 0.08));
|
| 617 |
+
border-left: 4px solid #b197fc;
|
| 618 |
+
color: var(--body-text-color, #fff);
|
| 619 |
+
}
|
| 620 |
+
|
| 621 |
+
.dark .multifile-subset {
|
| 622 |
+
background: linear-gradient(135deg, rgba(26, 188, 156, 0.15), rgba(26, 188, 156, 0.08));
|
| 623 |
+
border-left: 4px solid #51cf66;
|
| 624 |
+
color: var(--body-text-color, #fff);
|
| 625 |
+
}
|
| 626 |
+
.dark .methodology-section {
|
| 627 |
+
background-color: rgba(37, 56, 64, 0.8);
|
| 628 |
+
color: var(--body-text-color, #fff);
|
| 629 |
+
border-left: 5px solid #51cf66;
|
| 630 |
+
}
|
| 631 |
+
.dark .insights-section {
|
| 632 |
+
background-color: rgba(77, 63, 33, 0.8);
|
| 633 |
+
color: var(--body-text-color, #fff);
|
| 634 |
+
border-left: 5px solid #ffd43b;
|
| 635 |
+
}
|
| 636 |
+
|
| 637 |
+
.header {
|
| 638 |
+
text-align: center;
|
| 639 |
+
padding: 8px 0;
|
| 640 |
+
font-family: 'Inter', sans-serif;
|
| 641 |
+
}
|
| 642 |
+
|
| 643 |
+
/* Update Status Styling */
|
| 644 |
+
.update-status {
|
| 645 |
+
background-color: rgba(240, 248, 255, 0.8);
|
| 646 |
+
border-left: 4px solid #007acc;
|
| 647 |
+
padding: 15px;
|
| 648 |
+
border-radius: 8px;
|
| 649 |
+
margin: 10px 0;
|
| 650 |
+
font-size: 14px;
|
| 651 |
+
}
|
| 652 |
+
|
| 653 |
+
.dark .update-status {
|
| 654 |
+
background-color: rgba(37, 56, 64, 0.8);
|
| 655 |
+
border-left: 4px solid #4dabf7;
|
| 656 |
+
color: var(--body-text-color, #fff);
|
| 657 |
+
}
|
| 658 |
+
|
| 659 |
+
.update-message {
|
| 660 |
+
padding: 15px;
|
| 661 |
+
border-radius: 8px;
|
| 662 |
+
margin: 15px 0;
|
| 663 |
+
font-size: 14px;
|
| 664 |
+
border-left: 4px solid #28a745;
|
| 665 |
+
background-color: rgba(40, 167, 69, 0.1);
|
| 666 |
+
}
|
| 667 |
+
|
| 668 |
+
.update-details {
|
| 669 |
+
padding: 15px;
|
| 670 |
+
border-radius: 8px;
|
| 671 |
+
margin: 15px 0;
|
| 672 |
+
font-size: 13px;
|
| 673 |
+
border-left: 4px solid #6c757d;
|
| 674 |
+
background-color: rgba(108, 117, 125, 0.1);
|
| 675 |
+
font-family: 'Monaco', 'Consolas', monospace;
|
| 676 |
+
}
|
| 677 |
+
|
| 678 |
+
.dark .update-message {
|
| 679 |
+
background-color: rgba(40, 167, 69, 0.15);
|
| 680 |
+
color: var(--body-text-color, #fff);
|
| 681 |
+
}
|
| 682 |
+
|
| 683 |
+
.dark .update-details {
|
| 684 |
+
background-color: rgba(108, 117, 125, 0.15);
|
| 685 |
+
color: var(--body-text-color, #fff);
|
| 686 |
+
}
|
| 687 |
+
|
| 688 |
+
/* Enhanced update button styling */
|
| 689 |
+
.update-button {
|
| 690 |
+
background: linear-gradient(135deg, #007bff, #0056b3) !important;
|
| 691 |
+
border: none !important;
|
| 692 |
+
padding: 12px 24px !important;
|
| 693 |
+
font-weight: 600 !important;
|
| 694 |
+
font-size: 16px !important;
|
| 695 |
+
border-radius: 8px !important;
|
| 696 |
+
box-shadow: 0 4px 6px rgba(0, 123, 255, 0.25) !important;
|
| 697 |
+
transition: all 0.3s ease !important;
|
| 698 |
+
}
|
| 699 |
+
|
| 700 |
+
.update-button:hover {
|
| 701 |
+
background: linear-gradient(135deg, #0056b3, #004085) !important;
|
| 702 |
+
transform: translateY(-2px) !important;
|
| 703 |
+
box-shadow: 0 6px 12px rgba(0, 123, 255, 0.35) !important;
|
| 704 |
+
}
|
| 705 |
+
|
| 706 |
+
.update-button:active {
|
| 707 |
+
transform: translateY(0) !important;
|
| 708 |
+
}
|
| 709 |
+
.footer {
|
| 710 |
+
text-align: center;
|
| 711 |
+
color: var(--body-text-color-subdued, #666);
|
| 712 |
+
font-size: 14px;
|
| 713 |
+
margin-top: 20px;
|
| 714 |
+
font-family: 'Inter', sans-serif;
|
| 715 |
+
}
|
| 716 |
+
.tab-content {
|
| 717 |
+
padding: 0px 0;
|
| 718 |
+
width: 100%;
|
| 719 |
+
max-width: 100%;
|
| 720 |
+
box-sizing: border-box;
|
| 721 |
+
}
|
| 722 |
+
.image-container {
|
| 723 |
+
text-align: center;
|
| 724 |
+
margin: 20px 0;
|
| 725 |
+
padding: 15px;
|
| 726 |
+
border-radius: 8px;
|
| 727 |
+
background-color: rgba(255, 255, 255, 0.5);
|
| 728 |
+
}
|
| 729 |
+
.dark .image-container {
|
| 730 |
+
background-color: rgba(40, 44, 52, 0.5);
|
| 731 |
+
}
|
| 732 |
+
|
| 733 |
+
/* Responsive layout for mobile and tablet */
|
| 734 |
+
@media (max-width: 768px) {
|
| 735 |
+
.gradio-container {
|
| 736 |
+
padding: 0 10px !important;
|
| 737 |
+
}
|
| 738 |
+
|
| 739 |
+
.subset-description,
|
| 740 |
+
.methodology-section,
|
| 741 |
+
.insights-section,
|
| 742 |
+
blockquote {
|
| 743 |
+
margin: 15px 0;
|
| 744 |
+
padding: 15px;
|
| 745 |
+
}
|
| 746 |
+
|
| 747 |
+
.tab-content {
|
| 748 |
+
padding: 15px 0;
|
| 749 |
+
}
|
| 750 |
+
}
|
| 751 |
+
|
| 752 |
+
@media (max-width: 480px) {
|
| 753 |
+
.gradio-container {
|
| 754 |
+
padding: 0 5px !important;
|
| 755 |
+
}
|
| 756 |
+
|
| 757 |
+
.subset-description,
|
| 758 |
+
.methodology-section,
|
| 759 |
+
.insights-section,
|
| 760 |
+
blockquote {
|
| 761 |
+
margin: 10px 0;
|
| 762 |
+
padding: 12px;
|
| 763 |
+
}
|
| 764 |
+
}
|
| 765 |
+
</style>
|
| 766 |
+
""",
|
| 767 |
+
js="""
|
| 768 |
+
function() {
|
| 769 |
+
// Define sortTable function in global scope
|
| 770 |
+
window.sortTable = function(n, tableId) {
|
| 771 |
+
var table = document.getElementById(tableId);
|
| 772 |
+
if (!table) {
|
| 773 |
+
console.log('Table not found:', tableId);
|
| 774 |
+
return;
|
| 775 |
+
}
|
| 776 |
+
|
| 777 |
+
var tbody = table.querySelector('tbody');
|
| 778 |
+
if (!tbody) {
|
| 779 |
+
console.log('Tbody not found in table:', tableId);
|
| 780 |
+
return;
|
| 781 |
+
}
|
| 782 |
+
|
| 783 |
+
var rows = Array.from(tbody.rows);
|
| 784 |
+
var ascending = table.getAttribute('data-sort-col') !== n.toString() || table.getAttribute('data-sort-dir') === 'desc';
|
| 785 |
+
|
| 786 |
+
rows.sort(function(a, b) {
|
| 787 |
+
var aVal = a.cells[n].textContent.trim();
|
| 788 |
+
var bVal = b.cells[n].textContent.trim();
|
| 789 |
+
|
| 790 |
+
// Extract numeric value from percentage strings (e.g., "73.2%" -> 73.2)
|
| 791 |
+
var aNum = parseFloat(aVal.replace('%', '').split('(')[0].trim());
|
| 792 |
+
var bNum = parseFloat(bVal.replace('%', '').split('(')[0].trim());
|
| 793 |
+
|
| 794 |
+
// If both are numbers, compare numerically
|
| 795 |
+
if (!isNaN(aNum) && !isNaN(bNum)) {
|
| 796 |
+
if (aNum < bNum) return ascending ? -1 : 1;
|
| 797 |
+
if (aNum > bNum) return ascending ? 1 : -1;
|
| 798 |
+
return 0;
|
| 799 |
+
}
|
| 800 |
+
|
| 801 |
+
// Otherwise compare as strings
|
| 802 |
+
if (aVal < bVal) return ascending ? -1 : 1;
|
| 803 |
+
if (aVal > bVal) return ascending ? 1 : -1;
|
| 804 |
+
return 0;
|
| 805 |
+
});
|
| 806 |
+
|
| 807 |
+
// Re-append sorted rows
|
| 808 |
+
rows.forEach(function(row) {
|
| 809 |
+
tbody.appendChild(row);
|
| 810 |
+
});
|
| 811 |
+
|
| 812 |
+
// Update sort indicators
|
| 813 |
+
table.setAttribute('data-sort-col', n);
|
| 814 |
+
table.setAttribute('data-sort-dir', ascending ? 'asc' : 'desc');
|
| 815 |
+
|
| 816 |
+
// Visual feedback
|
| 817 |
+
var headers = table.querySelectorAll('th');
|
| 818 |
+
headers.forEach(function(header, index) {
|
| 819 |
+
header.style.opacity = index === n ? '1.0' : '0.7';
|
| 820 |
+
});
|
| 821 |
+
};
|
| 822 |
+
}
|
| 823 |
+
"""
|
| 824 |
+
) as demo:
|
| 825 |
+
|
| 826 |
+
# Header - More compact
|
| 827 |
+
gr.Markdown(
|
| 828 |
+
"""
|
| 829 |
+
# 🏆 SWE-Bench Verified Discriminative Subsets Leaderboard
|
| 830 |
+
|
| 831 |
+
**Top SWE-agents claim high success rates, but drop dramatically on truly challenging problems**
|
| 832 |
+
|
| 833 |
+
📖 **Full Analysis**: [From 73% to 11%: Revealing True SWE-Agent Capabilities](https://jatinganhotra.dev/blog/swe-agents/2025/06/05/swe-bench-verified-discriminative-subsets.html) | 📊 **Dataset**: [SWE-bench_Verified-discriminative](https://huggingface.co/datasets/jatinganhotra/SWE-bench_Verified-discriminative)
|
| 834 |
+
""",
|
| 835 |
+
elem_classes=["header"]
|
| 836 |
+
)
|
| 837 |
+
|
| 838 |
+
|
| 839 |
+
# Create tabs
|
| 840 |
+
with gr.Tabs():
|
| 841 |
+
# Tab 1: Performance Rankings
|
| 842 |
+
with gr.TabItem("📈 Performance Rankings"):
|
| 843 |
+
with gr.Column(elem_classes=["tab-content"]):
|
| 844 |
+
gr.Markdown(
|
| 845 |
+
"""
|
| 846 |
+
🚨 SWE-Bench Verified is saturated. When every agent solves 160+ of the same "easy" problems, how do we know which is actually better? Our discriminative subsets target the competitive frontier where top systems actually differ and their performance drops dramatically. Explore the methodology in the **"📊 Discriminative Subsets Explained"** tab.
|
| 847 |
+
|
| 848 |
+
**NOTE**: _These aren't just harder problems—they reveal what SWE agents actually can't do today._
|
| 849 |
+
|
| 850 |
+
**Subset Definitions**:
|
| 851 |
+
- 🔥 **Frontier** (95 instances): Problems solved by ≤5 agents (maximum discrimination)
|
| 852 |
+
- ⚡ **Challenging** (155 instances): Problems solved by ≤20 agents (balanced evaluation)
|
| 853 |
+
- 💪 **Hard** (45 instances): All Hard difficulty problems (traditional difficulty)
|
| 854 |
+
- 📁 **MultiFile** (40 instances): Multi-file problems solved by ≤10 agents (real-world complexity)
|
| 855 |
+
|
| 856 |
+
How to Read: The results are ranked by full benchmark performance. Click on the column headers to sort by subset performance.
|
| 857 |
+
"""
|
| 858 |
+
)
|
| 859 |
+
|
| 860 |
+
# Multi-table HTML with time-based filtering
|
| 861 |
+
table_html = create_multi_table_html()
|
| 862 |
+
gr.HTML(table_html)
|
| 863 |
+
|
| 864 |
+
# Tab 2: Discriminative Subsets Explained
|
| 865 |
+
with gr.TabItem("📊 Discriminative Subsets Explained"):
|
| 866 |
+
with gr.Column(elem_classes=["tab-content"]):
|
| 867 |
+
gr.Markdown(
|
| 868 |
+
"""
|
| 869 |
+
## The Emperor's New Benchmarks
|
| 870 |
+
|
| 871 |
+
Like the tale of the Emperor's new clothes, sometimes we need fresh eyes on familiar benchmarks.
|
| 872 |
+
SWE-agents parade around with impressive scores, but here's the reality:
|
| 873 |
+
|
| 874 |
+
- **Everyone's solving the same problems**: 69% of SWE-Bench Verified is solved by 21+ agents
|
| 875 |
+
- **Making most evaluations meaningless**: When everyone gets the same questions right, you can't tell who's actually better
|
| 876 |
+
- **The naked truth**: Many problems remain completely unsolved by ALL agents
|
| 877 |
+
|
| 878 |
+
**What everyone sees**: "Our agent achieves impressive results on SWE-Bench!"
|
| 879 |
+
**The reality**: It's the same set of 350 problems everyone solves.
|
| 880 |
+
|
| 881 |
+
📖 [Full breakdown of the saturation problem](https://jatinganhotra.dev/blog/swe-agents/2025/06/05/swe-bench-verified-discriminative-subsets.html)
|
| 882 |
+
"""
|
| 883 |
+
)
|
| 884 |
+
|
| 885 |
+
gr.Markdown("> 💡 **Key Insight**: Instead of arbitrary 'hard' problems, analyze HOW MANY agents solve each instance across 500 problems.")
|
| 886 |
+
|
| 887 |
+
gr.Markdown(
|
| 888 |
+
"""
|
| 889 |
+
## A Data-Driven Solution: Discriminative Subsets
|
| 890 |
+
|
| 891 |
+
**Instead of guessing which problems are "hard", I analyzed how many SWE-agents solve each instance.**
|
| 892 |
+
|
| 893 |
+
Rather than arbitrarily choosing "hard" problems, I developed a systematic methodology by analyzing
|
| 894 |
+
how many SWE-agents solve each instance across all 500 problems in SWE-Bench Verified.
|
| 895 |
+
|
| 896 |
+
### Instance Solve Distribution Analysis
|
| 897 |
+
|
| 898 |
+
Each of the 500 instances was checked against evaluation results from all SWE-agents
|
| 899 |
+
(submitted to the leaderboard) to record solve counts. "Solved" means the agent's fix
|
| 900 |
+
passed the verification test suite - the same standard used in the original SWE-bench evaluation.
|
| 901 |
+
|
| 902 |
+
📊 [Explore the discriminative dataset](https://huggingface.co/datasets/jatinganhotra/SWE-bench_Verified-discriminative)
|
| 903 |
+
"""
|
| 904 |
+
)
|
| 905 |
+
|
| 906 |
+
# Image 1: Instance Solve Count Distribution
|
| 907 |
+
with gr.Column(elem_classes=["image-container"]):
|
| 908 |
+
gr.Image(
|
| 909 |
+
"assets/Instance Solve Count Distribution.png",
|
| 910 |
+
label="Instance Solve Count Distribution",
|
| 911 |
+
show_label=True,
|
| 912 |
+
show_download_button=False,
|
| 913 |
+
height=400
|
| 914 |
+
)
|
| 915 |
+
gr.Markdown("> 📊 **Key Finding**: 69% of problems are solved by 21+ agents, creating limited evaluative sensitivity.")
|
| 916 |
+
|
| 917 |
+
gr.Markdown(
|
| 918 |
+
"""
|
| 919 |
+
I categorized all instances based on how many of the 83 evaluated SWE-agents successfully solve them:
|
| 920 |
+
|
| 921 |
+
| Bucket | Solve Count | Instances | Percentage | Easy | Medium | Hard | Single | Multi |
|
| 922 |
+
|--------|-------------|-----------|------------|------|--------|------|--------|-------|
|
| 923 |
+
| **Unsolved** | 0 agents | 52 | 10.4% | 5 | 26 | 21 | 27 | 25 |
|
| 924 |
+
| **Ultra Rare** | 1-2 agents | 26 | 5.2% | 6 | 16 | 4 | 17 | 9 |
|
| 925 |
+
| **Very Rare** | 3-5 agents | 17 | 3.4% | 3 | 10 | 4 | 14 | 3 |
|
| 926 |
+
| **Rare** | 6-10 agents | 22 | 4.4% | 1 | 19 | 2 | 19 | 3 |
|
| 927 |
+
| **Uncommon** | 11-20 agents | 38 | 7.6% | 13 | 22 | 3 | 28 | 10 |
|
| 928 |
+
| **Common** | 21-40 agents | 96 | 19.2% | 27 | 62 | 7 | 82 | 14 |
|
| 929 |
+
| **Very Common** | 41-60 agents | 93 | 18.6% | 38 | 52 | 3 | 88 | 5 |
|
| 930 |
+
| **Solved** | 61+ agents | 156 | 31.2% | 101 | 53 | 2 | 154 | 2 |
|
| 931 |
+
|
| 932 |
+
> 🎯 **The Solution**: Four discriminative subsets targeting the competitive frontier where agents actually differ.
|
| 933 |
+
"""
|
| 934 |
+
)
|
| 935 |
+
|
| 936 |
+
# Four Discriminative Subsets section
|
| 937 |
+
gr.Markdown(
|
| 938 |
+
"""
|
| 939 |
+
### Four Discriminative Subsets: Targeting the Competitive Frontier
|
| 940 |
+
|
| 941 |
+
**The Solution: 4 targeted subsets that reveal true agent capabilities.**
|
| 942 |
+
|
| 943 |
+
Rather than continuing to measure incremental improvements on largely-solved Easy problems,
|
| 944 |
+
I designed targeted subsets to focus on:
|
| 945 |
+
|
| 946 |
+
1. **Completely unsolved problems** (52 instances) - true frontier challenges that NO agent can solve
|
| 947 |
+
2. **Sparsely solved problems** - instances resolved by only a handful of agents
|
| 948 |
+
3. **Problems with high solution variance** - where top SWE-agents show meaningful differences
|
| 949 |
+
4. **Real-world complexity** - multi-file coordination that reflects actual software engineering
|
| 950 |
+
|
| 951 |
+
📖 [Deep dive into each subset's methodology](https://jatinganhotra.dev/blog/swe-agents/2025/06/05/swe-bench-verified-discriminative-subsets.html)
|
| 952 |
+
"""
|
| 953 |
+
)
|
| 954 |
+
|
| 955 |
+
# Image 2: Performance Comparison
|
| 956 |
+
with gr.Column(elem_classes=["image-container"]):
|
| 957 |
+
gr.Image(
|
| 958 |
+
"assets/Performance Comparison Across Subsets.png",
|
| 959 |
+
label="Performance Comparison Across Subsets",
|
| 960 |
+
show_label=True,
|
| 961 |
+
show_download_button=False,
|
| 962 |
+
height=400
|
| 963 |
+
)
|
| 964 |
+
gr.Markdown("> 🔥 **Breakthrough Result**: Top agents drop dramatically on Frontier subset - significantly better discrimination!")
|
| 965 |
+
|
| 966 |
+
# Four Discriminative Subsets - Compact Grid Layout
|
| 967 |
+
gr.Markdown("### 🎯 The Four Discriminative Subsets")
|
| 968 |
+
|
| 969 |
+
with gr.Row():
|
| 970 |
+
with gr.Column(scale=1):
|
| 971 |
+
gr.Markdown(
|
| 972 |
+
"""
|
| 973 |
+
**🔥 Frontier Subset** (95 instances)
|
| 974 |
+
Problems solved by ≤5 agents - maximum discrimination
|
| 975 |
+
*Combines unsolved, ultra-rare, and very-rare problems. Provides 6x better discrimination than full benchmark.*
|
| 976 |
+
""",
|
| 977 |
+
elem_classes=["frontier-subset"]
|
| 978 |
+
)
|
| 979 |
+
|
| 980 |
+
with gr.Column(scale=1):
|
| 981 |
+
gr.Markdown(
|
| 982 |
+
"""
|
| 983 |
+
**⚡ Challenging Subset** (155 instances)
|
| 984 |
+
Problems solved by ≤20 agents - strong evaluative power
|
| 985 |
+
*Expands Frontier to include rare and uncommon problems. Balances discrimination with statistical significance.*
|
| 986 |
+
""",
|
| 987 |
+
elem_classes=["challenging-subset"]
|
| 988 |
+
)
|
| 989 |
+
|
| 990 |
+
with gr.Row():
|
| 991 |
+
with gr.Column(scale=1):
|
| 992 |
+
gr.Markdown(
|
| 993 |
+
"""
|
| 994 |
+
**💪 Hard Subset** (45 instances)
|
| 995 |
+
All Hard difficulty problems regardless of solve rate
|
| 996 |
+
*Traditional difficulty-based subset. Includes all problems classified as "Hard" in the original SWE-Bench difficulty analysis.*
|
| 997 |
+
""",
|
| 998 |
+
elem_classes=["hard-subset"]
|
| 999 |
+
)
|
| 1000 |
+
|
| 1001 |
+
with gr.Column(scale=1):
|
| 1002 |
+
gr.Markdown(
|
| 1003 |
+
"""
|
| 1004 |
+
**📁 MultiFile Subset** (40 instances)
|
| 1005 |
+
Multi-file problems solved by ≤10 agents
|
| 1006 |
+
*Targets problems requiring coordinated edits across multiple source files - representing real-world software engineering complexity.*
|
| 1007 |
+
""",
|
| 1008 |
+
elem_classes=["multifile-subset"]
|
| 1009 |
+
)
|
| 1010 |
+
|
| 1011 |
+
# Subset summary table
|
| 1012 |
+
gr.Markdown(
|
| 1013 |
+
"""
|
| 1014 |
+
### Subset Summary
|
| 1015 |
+
|
| 1016 |
+
| Subset | Description | Total | Easy | Medium | Hard | Single | Multi | Solve Range | Top Agent % |
|
| 1017 |
+
|--------|-------------|-------|------|--------|------|--------|-------|-------------|-------------|
|
| 1018 |
+
| **Frontier** | Solved by ≤5 agents | 95 | 14 | 52 | 29 | 58 | 37 | 0–5 | 11.6% |
|
| 1019 |
+
| **Challenging** | Solved by ≤20 agents | 155 | 28 | 93 | 34 | 105 | 50 | 0–20 | 31.6% |
|
| 1020 |
+
| **Hard** | All Hard problems | 45 | 0 | 0 | 45 | 20 | 25 | 0–61 | 42.2% |
|
| 1021 |
+
| **MultiFile** | Multi-file + ≤10 solves | 40 | 3 | 17 | 20 | 0 | 40 | 0–7 | 10.0% |
|
| 1022 |
+
"""
|
| 1023 |
+
)
|
| 1024 |
+
|
| 1025 |
+
# Tab 3: Key Insights
|
| 1026 |
+
with gr.TabItem("🔍 Key Insights"):
|
| 1027 |
+
with gr.Column(elem_classes=["tab-content"]):
|
| 1028 |
+
gr.Markdown(
|
| 1029 |
+
"""
|
| 1030 |
+
## The Results Are Eye-Opening
|
| 1031 |
+
|
| 1032 |
+
**Top agent on full benchmark: Strong performance ✅**
|
| 1033 |
+
**Same agent on Frontier subset: Dramatic drop 😬**
|
| 1034 |
+
|
| 1035 |
+
The discriminative subsets expose critical insights about current SWE-agent capabilities that traditional benchmarks completely miss.
|
| 1036 |
+
|
| 1037 |
+
📖 [See the complete performance breakdown](https://jatinganhotra.dev/blog/swe-agents/2025/06/05/swe-bench-verified-discriminative-subsets.html)
|
| 1038 |
+
"""
|
| 1039 |
+
)
|
| 1040 |
+
|
| 1041 |
+
gr.Markdown("> 📈 **Breakthrough Result**: 6x better discrimination power. Finally, a way to meaningfully compare state-of-the-art SWE agents.")
|
| 1042 |
+
|
| 1043 |
+
gr.Markdown(
|
| 1044 |
+
"""
|
| 1045 |
+
### 🎯 Enhanced Resolution Power: From Saturated to Sensitive
|
| 1046 |
+
- **6x Better Discrimination**: Frontier subset provides much clearer distinction between top systems
|
| 1047 |
+
- **Dramatic Performance Drops**: Top agents show significant performance decreases on challenging problems
|
| 1048 |
+
- **52 Completely Unsolved Problems**: Not edge cases—problems that separate toys from tools
|
| 1049 |
+
- **Statistical Significance**: Challenging subset balances discrimination with robust evaluation
|
| 1050 |
+
"""
|
| 1051 |
+
)
|
| 1052 |
+
|
| 1053 |
+
gr.Markdown("> 📁 **Multi-file Challenge**: Multi-file coordination remains the hardest challenge - even top agents struggle at 10% success.")
|
| 1054 |
+
|
| 1055 |
+
gr.Markdown(
|
| 1056 |
+
"""
|
| 1057 |
+
### 🔧 Multi-file Problems = Everyday Dev Work, But Agents Flop Here
|
| 1058 |
+
- **Real-world Challenge**: Even leading agents achieve only 10% on multi-file problems
|
| 1059 |
+
- **Coordination Required**: Problems need edits across multiple source files simultaneously
|
| 1060 |
+
- **Industry Relevance**: Reflects actual software engineering complexity
|
| 1061 |
+
- **The Gap**: We need agents that can coordinate real code changes, not just single-file tweaks
|
| 1062 |
+
|
| 1063 |
+
### 🧠 "Hard" Doesn't Mean Hard for AI
|
| 1064 |
+
|
| 1065 |
+
**Key insight**: Traditional difficulty labels miss what makes problems truly challenging for AI agents:
|
| 1066 |
+
|
| 1067 |
+
- **2/45 "Hard" problems** are solved by 61+ agents (too easy for evaluation)
|
| 1068 |
+
- **5/194 "Easy" problems** are solved by ZERO agents (actually impossible)
|
| 1069 |
+
- **52 Medium problems** in Frontier subset vs 29 Hard problems
|
| 1070 |
+
|
| 1071 |
+
We need better ways to measure difficulty—and these subsets provide exactly that.
|
| 1072 |
+
|
| 1073 |
+
### 📊 Subset Comparison: The New Reality
|
| 1074 |
+
|
| 1075 |
+
| Subset | Instances | Top Performance | Focus | Insight |
|
| 1076 |
+
|--------|-----------|----------------|-------|---------|
|
| 1077 |
+
| **Full Benchmark** | 500 | 73.2% | Complete evaluation | Saturated for top agents |
|
| 1078 |
+
| **Frontier** | 95 | 11.6% | Maximum sensitivity | True cutting-edge capability |
|
| 1079 |
+
| **Challenging** | 155 | 31.6% | Balanced evaluation | Practical discrimination |
|
| 1080 |
+
| **Hard** | 45 | 42.2% | Traditional difficulty | Many are actually easy |
|
| 1081 |
+
| **MultiFile** | 40 | 10.0% | Real-world complexity | Critical capability gap |
|
| 1082 |
+
"""
|
| 1083 |
+
)
|
| 1084 |
+
|
| 1085 |
+
gr.Markdown("> 🚀 **Looking Ahead**: We need evaluation frameworks that grow with AI capabilities, not static benchmarks.")
|
| 1086 |
+
|
| 1087 |
+
gr.Markdown(
|
| 1088 |
+
"""
|
| 1089 |
+
### 🧠 Research Implications: A New Methodology for Benchmark Evolution
|
| 1090 |
+
|
| 1091 |
+
**New methodology for benchmark evolution as AI saturates existing evaluations:**
|
| 1092 |
+
|
| 1093 |
+
1. **Analyze solve distribution** across all systems (not just top performers)
|
| 1094 |
+
2. **Identify low-solve instances** where systems actually differ
|
| 1095 |
+
3. **Create discriminative subsets** targeting competitive frontiers
|
| 1096 |
+
4. **Maintain evaluative sensitivity** as capabilities advance
|
| 1097 |
+
|
| 1098 |
+
**Applied to SWE-Bench**: Performance drops dramatically on discriminative subsets, exposing real capability gaps.
|
| 1099 |
+
|
| 1100 |
+
- **Progress Tracking**: Improvements on discriminative subsets indicate genuine capability advances
|
| 1101 |
+
- **Research Focus**: Identifies specific bottlenecks for targeted development
|
| 1102 |
+
- **Benchmark Evolution**: Framework can adapt as agent capabilities improve
|
| 1103 |
+
- **Actionable Insight**: Focus research on sparsely-solved problems, not incrementally improving on saturated ones
|
| 1104 |
+
"""
|
| 1105 |
+
)
|
| 1106 |
+
|
| 1107 |
+
gr.Markdown("> 🎯 **Challenge for SWE-agent developers**: Can your agent solve ANY of the 52 completely unsolved problems? These aren't edge cases—they're what separate toys from tools.")
|
| 1108 |
+
|
| 1109 |
+
gr.Markdown(
|
| 1110 |
+
"""
|
| 1111 |
+
## 🚀 Ready to Benchmark YOUR Agent Properly?
|
| 1112 |
+
|
| 1113 |
+
**Stop optimizing for saturated benchmarks. Start measuring real progress.**
|
| 1114 |
+
|
| 1115 |
+
- **📊 Dataset Available Now**: [SWE-bench_Verified-discriminative](https://huggingface.co/datasets/jatinganhotra/SWE-bench_Verified-discriminative)
|
| 1116 |
+
- **📖 Full Analysis**: [From 73% to 11%: Revealing True SWE-Agent Capabilities](https://jatinganhotra.dev/blog/swe-agents/2025/06/05/swe-bench-verified-discriminative-subsets.html)
|
| 1117 |
+
- **🌐 Original SWE-Bench**: [SWE-bench.com](https://www.swebench.com/)
|
| 1118 |
+
|
| 1119 |
+
**The 52 unsolved problems are still out there. Who's going to be first?** 👀
|
| 1120 |
+
"""
|
| 1121 |
+
)
|
| 1122 |
+
|
| 1123 |
+
# Tab 4: Auto-Update (moved to last position)
|
| 1124 |
+
with gr.TabItem("🔄 Auto-Update"):
|
| 1125 |
+
with gr.Column(elem_classes=["tab-content"]):
|
| 1126 |
+
gr.Markdown(
|
| 1127 |
+
"""
|
| 1128 |
+
# 🔄 Automated Update System
|
| 1129 |
+
|
| 1130 |
+
This tab manages the synchronization with the SWE-Bench experiments repository.
|
| 1131 |
+
The system automatically checks for new submissions daily and can be manually triggered below.
|
| 1132 |
+
|
| 1133 |
+
## How It Works
|
| 1134 |
+
|
| 1135 |
+
1. **Daily Automatic Updates**: The system checks for new submissions every day at 6 AM UTC
|
| 1136 |
+
2. **Git Repository Sync**: Downloads latest data from the SWE-bench experiments repository
|
| 1137 |
+
3. **New Submission Detection**: Scans for new directories with valid results.json files and reads agent details directly from each submission's metadata.yaml file
|
| 1138 |
+
4. **Automatic Integration**: New submissions are immediately available with official names
|
| 1139 |
+
|
| 1140 |
+
## Update Status
|
| 1141 |
+
"""
|
| 1142 |
+
)
|
| 1143 |
+
|
| 1144 |
+
# Update Status Display
|
| 1145 |
+
update_status_display = gr.Markdown(
|
| 1146 |
+
get_update_status_message(),
|
| 1147 |
+
elem_classes=["update-status"]
|
| 1148 |
+
)
|
| 1149 |
+
|
| 1150 |
+
# Update Controls
|
| 1151 |
+
with gr.Row():
|
| 1152 |
+
with gr.Column(scale=2):
|
| 1153 |
+
gr.Markdown("### Manual Update")
|
| 1154 |
+
gr.Markdown("Click the button below to force an immediate update check and synchronization.")
|
| 1155 |
+
with gr.Column(scale=1):
|
| 1156 |
+
force_update_btn = gr.Button(
|
| 1157 |
+
"🚀 Sync with SWE-Bench Now",
|
| 1158 |
+
size="lg",
|
| 1159 |
+
variant="primary",
|
| 1160 |
+
elem_classes=["update-button"]
|
| 1161 |
+
)
|
| 1162 |
+
|
| 1163 |
+
# Update Results Area
|
| 1164 |
+
update_message = gr.Markdown(
|
| 1165 |
+
"",
|
| 1166 |
+
visible=False,
|
| 1167 |
+
elem_classes=["update-message"]
|
| 1168 |
+
)
|
| 1169 |
+
|
| 1170 |
+
# Update Details (for discrepancy reporting)
|
| 1171 |
+
update_details = gr.Markdown(
|
| 1172 |
+
"",
|
| 1173 |
+
visible=False,
|
| 1174 |
+
elem_classes=["update-details"]
|
| 1175 |
+
)
|
| 1176 |
+
|
| 1177 |
+
gr.Markdown(
|
| 1178 |
+
"""
|
| 1179 |
+
## Technical Details
|
| 1180 |
+
|
| 1181 |
+
**Data Sources**:
|
| 1182 |
+
- **Experiments Repository**: [github.com/SWE-bench/experiments](https://github.com/SWE-bench/experiments)
|
| 1183 |
+
- **Agent Metadata**: Each submission's `metadata.yaml` file contains official agent names
|
| 1184 |
+
- **Performance Data**: Results from `results/results.json` in each submission directory
|
| 1185 |
+
|
| 1186 |
+
**Automatic Processing**: all submissions with valid
|
| 1187 |
+
`metadata.yaml` and `results/results.json` files are automatically integrated with their official names.
|
| 1188 |
+
"""
|
| 1189 |
+
)
|
| 1190 |
+
|
| 1191 |
+
# Tab 5: Downloads
|
| 1192 |
+
with gr.TabItem("📥 Downloads"):
|
| 1193 |
+
with gr.Column(elem_classes=["tab-content"]):
|
| 1194 |
+
gr.Markdown(
|
| 1195 |
+
"""
|
| 1196 |
+
# 📥 Download Leaderboard Data
|
| 1197 |
+
|
| 1198 |
+
Export the complete SWE-Bench discriminative subsets leaderboard data in various formats.
|
| 1199 |
+
All downloads include comprehensive agent metadata and performance metrics across all subsets.
|
| 1200 |
+
|
| 1201 |
+
**📌 Important**: Double-click the download buttons below to initiate the download. Single clicks may not trigger the download in some browsers.
|
| 1202 |
+
|
| 1203 |
+
**File Formats**:
|
| 1204 |
+
- **CSV**: Spreadsheet-compatible format for analysis and visualization
|
| 1205 |
+
- **JSON**: Structured data with metadata for programmatic use
|
| 1206 |
+
|
| 1207 |
+
**Data Included**: Agent names, submission dates, performance across all 5 benchmarks,
|
| 1208 |
+
solve counts, and percentage scores.
|
| 1209 |
+
"""
|
| 1210 |
+
)
|
| 1211 |
+
|
| 1212 |
+
# Complete Leaderboard Section
|
| 1213 |
+
gr.Markdown("## 📊 Complete Leaderboard")
|
| 1214 |
+
gr.Markdown("Download the full dataset with all 89+ agents and performance across all subsets.")
|
| 1215 |
+
|
| 1216 |
+
with gr.Row():
|
| 1217 |
+
full_csv_btn = gr.DownloadButton(
|
| 1218 |
+
"📄 Download Complete CSV",
|
| 1219 |
+
size="lg",
|
| 1220 |
+
variant="primary"
|
| 1221 |
+
)
|
| 1222 |
+
full_json_btn = gr.DownloadButton(
|
| 1223 |
+
"📋 Download Complete JSON",
|
| 1224 |
+
size="lg",
|
| 1225 |
+
variant="secondary"
|
| 1226 |
+
)
|
| 1227 |
+
|
| 1228 |
+
# Filtered Exports Section
|
| 1229 |
+
gr.Markdown("## 🎯 Filtered Exports")
|
| 1230 |
+
gr.Markdown("Download specific agent groups or time periods.")
|
| 1231 |
+
|
| 1232 |
+
with gr.Row():
|
| 1233 |
+
with gr.Column():
|
| 1234 |
+
gr.Markdown("### 🏆 Top Performers")
|
| 1235 |
+
top10_csv_btn = gr.DownloadButton(
|
| 1236 |
+
"Top 10 Agents CSV",
|
| 1237 |
+
size="sm",
|
| 1238 |
+
variant="secondary"
|
| 1239 |
+
)
|
| 1240 |
+
|
| 1241 |
+
with gr.Column():
|
| 1242 |
+
gr.Markdown("### 🚀 Recent (2025)")
|
| 1243 |
+
recent_csv_btn = gr.DownloadButton(
|
| 1244 |
+
"2025 Submissions CSV",
|
| 1245 |
+
size="sm",
|
| 1246 |
+
variant="secondary"
|
| 1247 |
+
)
|
| 1248 |
+
|
| 1249 |
+
# Individual Subsets Section
|
| 1250 |
+
gr.Markdown("## 📊 Individual Subset Results")
|
| 1251 |
+
gr.Markdown("Download performance data for specific discriminative subsets only.")
|
| 1252 |
+
|
| 1253 |
+
with gr.Row():
|
| 1254 |
+
frontier_csv_btn = gr.DownloadButton(
|
| 1255 |
+
"🔥 Frontier Subset CSV",
|
| 1256 |
+
size="sm",
|
| 1257 |
+
variant="primary"
|
| 1258 |
+
)
|
| 1259 |
+
challenging_csv_btn = gr.DownloadButton(
|
| 1260 |
+
"⚡ Challenging Subset CSV",
|
| 1261 |
+
size="sm",
|
| 1262 |
+
variant="primary"
|
| 1263 |
+
)
|
| 1264 |
+
|
| 1265 |
+
with gr.Row():
|
| 1266 |
+
hard_csv_btn = gr.DownloadButton(
|
| 1267 |
+
"💪 Hard Subset CSV",
|
| 1268 |
+
size="sm",
|
| 1269 |
+
variant="primary"
|
| 1270 |
+
)
|
| 1271 |
+
multifile_csv_btn = gr.DownloadButton(
|
| 1272 |
+
"📁 MultiFile Subset CSV",
|
| 1273 |
+
size="sm",
|
| 1274 |
+
variant="primary"
|
| 1275 |
+
)
|
| 1276 |
+
|
| 1277 |
+
# Usage Examples
|
| 1278 |
+
gr.Markdown(
|
| 1279 |
+
"""
|
| 1280 |
+
## 💡 Usage Examples
|
| 1281 |
+
|
| 1282 |
+
**Research & Analysis**:
|
| 1283 |
+
- Load CSV files into Excel, Google Sheets, or Pandas for statistical analysis
|
| 1284 |
+
- Compare agent performance trends over time
|
| 1285 |
+
- Identify patterns in discriminative subset performance
|
| 1286 |
+
|
| 1287 |
+
**Integration**:
|
| 1288 |
+
- Use JSON exports for programmatic access to leaderboard data
|
| 1289 |
+
- Integrate with evaluation pipelines and research tools
|
| 1290 |
+
- Build custom visualizations and comparisons
|
| 1291 |
+
|
| 1292 |
+
**Citation**: If you use this data in research, please cite:
|
| 1293 |
+
> Ganhotra, J. (2025). "From 73% to 11%: Revealing True SWE-Agent Capabilities".
|
| 1294 |
+
> SWE-Bench Verified Discriminative Subsets Analysis.
|
| 1295 |
+
"""
|
| 1296 |
+
)
|
| 1297 |
+
|
| 1298 |
+
# Connect download buttons to functions with proper DownloadButton behavior
|
| 1299 |
+
full_csv_btn.click(fn=download_full_csv, outputs=full_csv_btn)
|
| 1300 |
+
full_json_btn.click(fn=download_full_json, outputs=full_json_btn)
|
| 1301 |
+
top10_csv_btn.click(fn=download_top10_csv, outputs=top10_csv_btn)
|
| 1302 |
+
recent_csv_btn.click(fn=download_2025_csv, outputs=recent_csv_btn)
|
| 1303 |
+
frontier_csv_btn.click(fn=download_frontier_csv, outputs=frontier_csv_btn)
|
| 1304 |
+
challenging_csv_btn.click(fn=download_challenging_csv, outputs=challenging_csv_btn)
|
| 1305 |
+
hard_csv_btn.click(fn=download_hard_csv, outputs=hard_csv_btn)
|
| 1306 |
+
multifile_csv_btn.click(fn=download_multifile_csv, outputs=multifile_csv_btn)
|
| 1307 |
+
|
| 1308 |
+
# Connect force update button
|
| 1309 |
+
def handle_force_update():
|
| 1310 |
+
message, details = force_update_action()
|
| 1311 |
+
new_status = get_update_status_message()
|
| 1312 |
+
return (
|
| 1313 |
+
new_status,
|
| 1314 |
+
message,
|
| 1315 |
+
gr.update(visible=True),
|
| 1316 |
+
details,
|
| 1317 |
+
gr.update(visible=True if details else False)
|
| 1318 |
+
)
|
| 1319 |
+
|
| 1320 |
+
force_update_btn.click(
|
| 1321 |
+
fn=handle_force_update,
|
| 1322 |
+
outputs=[update_status_display, update_message, update_message, update_details, update_details]
|
| 1323 |
+
)
|
| 1324 |
+
|
| 1325 |
+
# Footer
|
| 1326 |
+
gr.Markdown(
|
| 1327 |
+
"""
|
| 1328 |
+
---
|
| 1329 |
+
**Data Source**: [From 73% to 11%: Revealing True SWE-Agent Capabilities](https://jatinganhotra.dev/blog/swe-agents/2025/06/05/swe-bench-verified-discriminative-subsets.html) • Updated daily to match the SWE-Bench leaderboard
|
| 1330 |
+
**Last Updated**: June 19, 2025 at 12:00 UTC
|
| 1331 |
+
**Created by**: [Jatin Ganhotra](https://jatinganhotra.dev)
|
| 1332 |
+
""",
|
| 1333 |
+
elem_classes=["footer"]
|
| 1334 |
+
)
|
| 1335 |
+
|
| 1336 |
+
return demo
|
| 1337 |
+
|
| 1338 |
+
# Create and launch the app
|
| 1339 |
+
if __name__ == "__main__":
|
| 1340 |
+
import tempfile
|
| 1341 |
+
|
| 1342 |
+
# Start automated updates
|
| 1343 |
+
try:
|
| 1344 |
+
start_automated_updates()
|
| 1345 |
+
print("✅ Automated updates started")
|
| 1346 |
+
except Exception as e:
|
| 1347 |
+
print(f"⚠️ Could not start automated updates: {e}")
|
| 1348 |
+
|
| 1349 |
+
demo = create_leaderboard()
|
| 1350 |
+
demo.launch(
|
| 1351 |
+
server_name="0.0.0.0",
|
| 1352 |
+
server_port=7860,
|
| 1353 |
+
share=False,
|
| 1354 |
+
allowed_paths=[tempfile.gettempdir()]
|
| 1355 |
+
)
|
src/data/__init__.py
ADDED
|
File without changes
|
src/data/data_processor_dynamic.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import json
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from src.data.leaderboard_data_fetcher import get_leaderboard_data
|
| 5 |
+
|
| 6 |
+
def load_actual_performance_data():
|
| 7 |
+
"""
|
| 8 |
+
Load actual performance data from dynamic leaderboard fetcher
|
| 9 |
+
"""
|
| 10 |
+
# Get dynamic data from leaderboard fetcher
|
| 11 |
+
models_data, models_mapping = get_leaderboard_data()
|
| 12 |
+
|
| 13 |
+
# Convert to the format expected by the app
|
| 14 |
+
performance_data = []
|
| 15 |
+
|
| 16 |
+
for model in models_data:
|
| 17 |
+
performance_row = {
|
| 18 |
+
'agent_name': model['agent_name'],
|
| 19 |
+
'submission_id': model['submission_id'],
|
| 20 |
+
'submission_date': model.get('submission_date', 'not yet published on leaderboard'),
|
| 21 |
+
'rank': model['rank'],
|
| 22 |
+
'full_solved': model['full_solved'],
|
| 23 |
+
'full_total': model['full_total'],
|
| 24 |
+
'full_percentage': model['full_percentage'],
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
# Add subset performance
|
| 28 |
+
for subset in ['frontier', 'challenging', 'hard', 'multifile']:
|
| 29 |
+
performance_row[f'{subset}_solved'] = model.get(f'{subset}_solved', 0)
|
| 30 |
+
performance_row[f'{subset}_total'] = model.get(f'{subset}_total', 0)
|
| 31 |
+
performance_row[f'{subset}_percentage'] = model.get(f'{subset}_percentage', 0.0)
|
| 32 |
+
|
| 33 |
+
performance_data.append(performance_row)
|
| 34 |
+
|
| 35 |
+
return performance_data
|
| 36 |
+
|
| 37 |
+
def filter_by_date_range(performance_data, start_date=None, end_date=None):
|
| 38 |
+
"""Filter performance data by date range"""
|
| 39 |
+
from datetime import datetime
|
| 40 |
+
|
| 41 |
+
filtered_data = []
|
| 42 |
+
|
| 43 |
+
for row in performance_data:
|
| 44 |
+
submission_date_str = row.get('submission_date', 'not yet published on leaderboard')
|
| 45 |
+
|
| 46 |
+
if submission_date_str == 'not yet published on leaderboard':
|
| 47 |
+
continue
|
| 48 |
+
|
| 49 |
+
try:
|
| 50 |
+
submission_date = datetime.strptime(submission_date_str, '%Y-%m-%d')
|
| 51 |
+
|
| 52 |
+
# Check date range
|
| 53 |
+
if start_date and submission_date < start_date:
|
| 54 |
+
continue
|
| 55 |
+
if end_date and submission_date > end_date:
|
| 56 |
+
continue
|
| 57 |
+
|
| 58 |
+
filtered_data.append(row)
|
| 59 |
+
|
| 60 |
+
except ValueError:
|
| 61 |
+
# Skip entries with invalid dates
|
| 62 |
+
continue
|
| 63 |
+
|
| 64 |
+
return filtered_data
|
| 65 |
+
|
| 66 |
+
def get_top_10_data(performance_data):
|
| 67 |
+
"""Get top 10 performers regardless of date"""
|
| 68 |
+
return sorted(performance_data, key=lambda x: x['full_percentage'], reverse=True)[:10]
|
| 69 |
+
|
| 70 |
+
def get_2025_data(performance_data):
|
| 71 |
+
"""Get submissions from Jan 1, 2025 onwards"""
|
| 72 |
+
from datetime import datetime
|
| 73 |
+
start_date = datetime(2025, 1, 1)
|
| 74 |
+
return filter_by_date_range(performance_data, start_date=start_date)
|
| 75 |
+
|
| 76 |
+
def get_2024_data(performance_data):
|
| 77 |
+
"""Get submissions from 2024 (Jan 1 - Dec 31)"""
|
| 78 |
+
from datetime import datetime
|
| 79 |
+
start_date = datetime(2024, 1, 1)
|
| 80 |
+
end_date = datetime(2024, 12, 31)
|
| 81 |
+
return filter_by_date_range(performance_data, start_date=start_date, end_date=end_date)
|
| 82 |
+
|
| 83 |
+
def get_pre_2024_data(performance_data):
|
| 84 |
+
"""Get submissions before 2024"""
|
| 85 |
+
from datetime import datetime
|
| 86 |
+
end_date = datetime(2023, 12, 31)
|
| 87 |
+
return filter_by_date_range(performance_data, end_date=end_date)
|
| 88 |
+
|
| 89 |
+
def format_for_gradio_table(performance_data):
|
| 90 |
+
"""
|
| 91 |
+
Format the performance data for display in Gradio table
|
| 92 |
+
"""
|
| 93 |
+
table_data = []
|
| 94 |
+
|
| 95 |
+
for row in performance_data:
|
| 96 |
+
formatted_row = [
|
| 97 |
+
row['rank'],
|
| 98 |
+
row['agent_name'],
|
| 99 |
+
row.get('submission_date', 'not yet published on leaderboard'),
|
| 100 |
+
f"{row['full_percentage']}% ({row['full_solved']}/500)",
|
| 101 |
+
f"{row['frontier_percentage']}% ({row['frontier_solved']}/{row['frontier_total']})",
|
| 102 |
+
f"{row['challenging_percentage']}% ({row['challenging_solved']}/{row['challenging_total']})",
|
| 103 |
+
f"{row['hard_percentage']}% ({row['hard_solved']}/{row['hard_total']})",
|
| 104 |
+
f"{row['multifile_percentage']}% ({row['multifile_solved']}/{row['multifile_total']})"
|
| 105 |
+
]
|
| 106 |
+
table_data.append(formatted_row)
|
| 107 |
+
|
| 108 |
+
return table_data
|
| 109 |
+
|
| 110 |
+
def create_html_table_for_subset(performance_data, table_id, show_rank_column=True):
|
| 111 |
+
"""Create HTML table for a specific subset of data"""
|
| 112 |
+
|
| 113 |
+
if not performance_data:
|
| 114 |
+
return "<p>No data available for this time period.</p>"
|
| 115 |
+
|
| 116 |
+
# Re-rank the data for this subset
|
| 117 |
+
sorted_data = sorted(performance_data, key=lambda x: x['full_percentage'], reverse=True)
|
| 118 |
+
|
| 119 |
+
# Create table HTML
|
| 120 |
+
html = f"""
|
| 121 |
+
<div class="custom-table-container">
|
| 122 |
+
<table class="custom-leaderboard-table" id="{table_id}">
|
| 123 |
+
<thead>
|
| 124 |
+
<tr>
|
| 125 |
+
"""
|
| 126 |
+
|
| 127 |
+
if show_rank_column:
|
| 128 |
+
html += '<th onclick="sortTable(0, \'{}\')\" class="rank-col">Rank</th>'.format(table_id)
|
| 129 |
+
col_offset = 1
|
| 130 |
+
else:
|
| 131 |
+
col_offset = 0
|
| 132 |
+
|
| 133 |
+
html += f"""
|
| 134 |
+
<th onclick="sortTable({col_offset}, '{table_id}')" class="agent-col">SWE-Agent</th>
|
| 135 |
+
<th onclick="sortTable({col_offset + 1}, '{table_id}')" class="date-col">Date</th>
|
| 136 |
+
<th onclick="sortTable({col_offset + 2}, '{table_id}')" class="full-col">Full<br>Benchmark</th>
|
| 137 |
+
<th onclick="sortTable({col_offset + 3}, '{table_id}')" class="frontier-col">Frontier</th>
|
| 138 |
+
<th onclick="sortTable({col_offset + 4}, '{table_id}')" class="challenging-col">Challenging</th>
|
| 139 |
+
<th onclick="sortTable({col_offset + 5}, '{table_id}')" class="hard-col">Hard</th>
|
| 140 |
+
<th onclick="sortTable({col_offset + 6}, '{table_id}')" class="multifile-col">MultiFile</th>
|
| 141 |
+
</tr>
|
| 142 |
+
</thead>
|
| 143 |
+
<tbody>
|
| 144 |
+
"""
|
| 145 |
+
|
| 146 |
+
# Add data rows
|
| 147 |
+
for i, row in enumerate(sorted_data):
|
| 148 |
+
html += '<tr>'
|
| 149 |
+
|
| 150 |
+
if show_rank_column:
|
| 151 |
+
html += f'<td class="rank-col">{i + 1}</td>'
|
| 152 |
+
|
| 153 |
+
html += f"""
|
| 154 |
+
<td class="agent-col">{row['agent_name']}</td>
|
| 155 |
+
<td class="date-col">{row.get('submission_date', 'not yet published on leaderboard')}</td>
|
| 156 |
+
<td class="full-col">{row['full_percentage']}</td>
|
| 157 |
+
<td class="frontier-col">{row['frontier_percentage']}</td>
|
| 158 |
+
<td class="challenging-col">{row['challenging_percentage']}</td>
|
| 159 |
+
<td class="hard-col">{row['hard_percentage']}</td>
|
| 160 |
+
<td class="multifile-col">{row['multifile_percentage']}</td>
|
| 161 |
+
</tr>
|
| 162 |
+
"""
|
| 163 |
+
|
| 164 |
+
html += """
|
| 165 |
+
</tbody>
|
| 166 |
+
</table>
|
| 167 |
+
</div>
|
| 168 |
+
"""
|
| 169 |
+
|
| 170 |
+
return html
|
| 171 |
+
|
| 172 |
+
def get_subset_descriptions():
|
| 173 |
+
"""
|
| 174 |
+
Return descriptions for each subset with detailed methodology
|
| 175 |
+
"""
|
| 176 |
+
return {
|
| 177 |
+
'full': {
|
| 178 |
+
'name': 'SWE-Bench Verified',
|
| 179 |
+
'description': 'Complete benchmark with all 500 instances from SWE-Bench Verified',
|
| 180 |
+
'instances': 500,
|
| 181 |
+
'details': 'The gold standard software engineering benchmark covering real-world GitHub issues'
|
| 182 |
+
},
|
| 183 |
+
'frontier': {
|
| 184 |
+
'name': 'Frontier Subset',
|
| 185 |
+
'description': 'Problems solved by ≤5 agents - maximum evaluative sensitivity',
|
| 186 |
+
'instances': 95,
|
| 187 |
+
'details': 'Combines unsolved (0 agents), ultra-rare (1-2 agents), and very-rare (3-5 agents) problems. Provides 6x better discrimination than full benchmark.'
|
| 188 |
+
},
|
| 189 |
+
'challenging': {
|
| 190 |
+
'name': 'Challenging Subset',
|
| 191 |
+
'description': 'Problems solved by ≤20 agents - strong evaluative power',
|
| 192 |
+
'instances': 155,
|
| 193 |
+
'details': 'Expands Frontier to include rare (6-10 agents) and uncommon (11-20 agents) problems. Balances discrimination with statistical significance.'
|
| 194 |
+
},
|
| 195 |
+
'hard': {
|
| 196 |
+
'name': 'Hard Subset',
|
| 197 |
+
'description': 'All Hard difficulty problems regardless of solve rate',
|
| 198 |
+
'instances': 45,
|
| 199 |
+
'details': 'Traditional difficulty-based subset. Includes all problems classified as "Hard" in the original SWE-Bench difficulty analysis.'
|
| 200 |
+
},
|
| 201 |
+
'multifile': {
|
| 202 |
+
'name': 'MultiFile Subset',
|
| 203 |
+
'description': 'Multi-file problems solved by ≤10 agents',
|
| 204 |
+
'instances': 40,
|
| 205 |
+
'details': 'Targets problems requiring coordinated edits across multiple source files - representing real-world software engineering complexity.'
|
| 206 |
+
}
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
if __name__ == "__main__":
|
| 210 |
+
# Test the dynamic data loading
|
| 211 |
+
data = load_actual_performance_data()
|
| 212 |
+
print(f"Loaded dynamic data for {len(data)} agents")
|
| 213 |
+
print("Top 5 agents:")
|
| 214 |
+
for i in range(min(5, len(data))):
|
| 215 |
+
agent = data[i]
|
| 216 |
+
print(f"{i+1}. {agent['agent_name']}: {agent['full_percentage']}% (full), {agent['frontier_percentage']}% (frontier)")
|
src/data/leaderboard_data_fetcher.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import time
|
| 4 |
+
import logging
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Dict, List, Optional, Tuple
|
| 7 |
+
import requests
|
| 8 |
+
from datasets import load_dataset
|
| 9 |
+
import pandas as pd
|
| 10 |
+
|
| 11 |
+
# Configure logging
|
| 12 |
+
logging.basicConfig(level=logging.INFO)
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
class LeaderboardDataFetcher:
|
| 16 |
+
"""Fetches and processes SWE-bench leaderboard data dynamically"""
|
| 17 |
+
|
| 18 |
+
def __init__(self, cache_dir: str = "leaderboard_cache"):
|
| 19 |
+
self.cache_dir = Path(cache_dir)
|
| 20 |
+
self.cache_dir.mkdir(exist_ok=True)
|
| 21 |
+
self.experiments_dir = Path("leaderboard_swe_bench_experiments/evaluation/verified")
|
| 22 |
+
|
| 23 |
+
# Cache file paths
|
| 24 |
+
self.models_cache = self.cache_dir / "verified_models.json"
|
| 25 |
+
self.mapping_cache = self.cache_dir / "verified_models_mapping.json"
|
| 26 |
+
self.subsets_cache = self.cache_dir / "discriminative_subsets.json"
|
| 27 |
+
|
| 28 |
+
def fetch_with_retry(self, func, max_retries: int = 5, base_delay: float = 1.0):
|
| 29 |
+
"""Execute function with exponential backoff retry logic"""
|
| 30 |
+
for attempt in range(max_retries):
|
| 31 |
+
try:
|
| 32 |
+
return func()
|
| 33 |
+
except Exception as e:
|
| 34 |
+
if attempt == max_retries - 1:
|
| 35 |
+
logger.error(f"Failed after {max_retries} attempts: {e}")
|
| 36 |
+
raise
|
| 37 |
+
|
| 38 |
+
delay = base_delay * (2 ** attempt)
|
| 39 |
+
logger.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {delay:.1f}s...")
|
| 40 |
+
time.sleep(delay)
|
| 41 |
+
|
| 42 |
+
def fetch_discriminative_subsets(self) -> Dict:
|
| 43 |
+
"""Fetch discriminative subsets from Huggingface with retry logic"""
|
| 44 |
+
def _fetch():
|
| 45 |
+
logger.info("Fetching discriminative subsets from Huggingface...")
|
| 46 |
+
|
| 47 |
+
# Load each split separately since they're organized by subset
|
| 48 |
+
subsets = {}
|
| 49 |
+
subset_names = ['frontier', 'challenging', 'hard', 'multifile']
|
| 50 |
+
|
| 51 |
+
for subset_name in subset_names:
|
| 52 |
+
dataset = load_dataset("jatinganhotra/SWE-bench_Verified-discriminative", split=subset_name)
|
| 53 |
+
subsets[subset_name] = [item['instance_id'] for item in dataset]
|
| 54 |
+
|
| 55 |
+
logger.info(f"Loaded subsets: frontier={len(subsets['frontier'])}, "
|
| 56 |
+
f"challenging={len(subsets['challenging'])}, "
|
| 57 |
+
f"hard={len(subsets['hard'])}, "
|
| 58 |
+
f"multifile={len(subsets['multifile'])}")
|
| 59 |
+
|
| 60 |
+
return subsets
|
| 61 |
+
|
| 62 |
+
# Try to load from cache first
|
| 63 |
+
if self.subsets_cache.exists():
|
| 64 |
+
logger.info("Loading discriminative subsets from cache...")
|
| 65 |
+
with open(self.subsets_cache, 'r') as f:
|
| 66 |
+
return json.load(f)
|
| 67 |
+
|
| 68 |
+
# Fetch with retry logic
|
| 69 |
+
subsets = self.fetch_with_retry(_fetch)
|
| 70 |
+
|
| 71 |
+
# Cache the results
|
| 72 |
+
with open(self.subsets_cache, 'w') as f:
|
| 73 |
+
json.dump(subsets, f, indent=2)
|
| 74 |
+
|
| 75 |
+
return subsets
|
| 76 |
+
|
| 77 |
+
def scan_experiments_directory(self) -> List[Tuple[str, str, List[str]]]:
|
| 78 |
+
"""Scan experiments directory for submission data"""
|
| 79 |
+
submissions = []
|
| 80 |
+
|
| 81 |
+
if not self.experiments_dir.exists():
|
| 82 |
+
logger.error(f"Experiments directory not found: {self.experiments_dir}")
|
| 83 |
+
return submissions
|
| 84 |
+
|
| 85 |
+
for submission_dir in self.experiments_dir.iterdir():
|
| 86 |
+
if not submission_dir.is_dir():
|
| 87 |
+
continue
|
| 88 |
+
|
| 89 |
+
results_file = submission_dir / "results" / "results.json"
|
| 90 |
+
if not results_file.exists():
|
| 91 |
+
logger.warning(f"No results.json found for {submission_dir.name}")
|
| 92 |
+
continue
|
| 93 |
+
|
| 94 |
+
try:
|
| 95 |
+
with open(results_file, 'r') as f:
|
| 96 |
+
results = json.load(f)
|
| 97 |
+
|
| 98 |
+
resolved_instances = results.get('resolved', [])
|
| 99 |
+
submissions.append((submission_dir.name, submission_dir.name, resolved_instances))
|
| 100 |
+
|
| 101 |
+
except Exception as e:
|
| 102 |
+
logger.warning(f"Error processing {submission_dir.name}: {e}")
|
| 103 |
+
continue
|
| 104 |
+
|
| 105 |
+
logger.info(f"Found {len(submissions)} valid submissions")
|
| 106 |
+
return submissions
|
| 107 |
+
|
| 108 |
+
def calculate_subset_performance(self, resolved_instances: List[str], subsets: Dict) -> Dict:
|
| 109 |
+
"""Calculate performance across different subsets"""
|
| 110 |
+
performance = {}
|
| 111 |
+
|
| 112 |
+
for subset_name, subset_instances in subsets.items():
|
| 113 |
+
resolved_in_subset = [inst for inst in resolved_instances if inst in subset_instances]
|
| 114 |
+
total_in_subset = len(subset_instances)
|
| 115 |
+
solved_count = len(resolved_in_subset)
|
| 116 |
+
percentage = (solved_count / total_in_subset * 100) if total_in_subset > 0 else 0.0
|
| 117 |
+
|
| 118 |
+
performance[subset_name] = {
|
| 119 |
+
'solved': solved_count,
|
| 120 |
+
'total': total_in_subset,
|
| 121 |
+
'percentage': round(percentage, 1)
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
# Calculate full benchmark performance
|
| 125 |
+
total_resolved = len(resolved_instances)
|
| 126 |
+
full_percentage = (total_resolved / 500) * 100
|
| 127 |
+
performance['full'] = {
|
| 128 |
+
'solved': total_resolved,
|
| 129 |
+
'total': 500,
|
| 130 |
+
'percentage': round(full_percentage, 1)
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
return performance
|
| 134 |
+
|
| 135 |
+
def generate_verified_models_data(self) -> Tuple[List[Dict], Dict]:
    """Generate VERIFIED_MODELS and VERIFIED_MODELS_MAPPING data.

    Returns a tuple (verified_models, verified_models_mapping):
      - verified_models: list of per-submission entries, sorted by full-benchmark
        percentage descending, with 1-based 'rank' assigned after sorting.
      - verified_models_mapping: dict keyed by submission_id, holding the agent
        name, submission date, and the raw resolved-instance list.

    Results are written to the JSON cache files; when both cache files already
    exist they are loaded and returned without regeneration.
    """

    # Check cache first — both files must be present to skip regeneration.
    if self.models_cache.exists() and self.mapping_cache.exists():
        logger.info("Loading models data from cache...")
        with open(self.models_cache, 'r') as f:
            models = json.load(f)
        with open(self.mapping_cache, 'r') as f:
            mapping = json.load(f)
        return models, mapping

    logger.info("Generating fresh models data...")

    # Get official leaderboard mapping (directory name -> official name/date).
    # Imported locally to avoid a module-level import cycle with src.data.
    from src.data.leaderboard_scraper import get_official_leaderboard_mapping
    official_mapping = get_official_leaderboard_mapping(self.experiments_dir)

    # Fetch discriminative subsets (frontier/challenging/hard/multifile).
    subsets = self.fetch_discriminative_subsets()

    # Scan experiments directory for (submission_id, agent_name, resolved) tuples.
    submissions = self.scan_experiments_directory()

    verified_models = []
    verified_models_mapping = {}

    for submission_id, agent_name, resolved_instances in submissions:
        # Get official agent name and date from mapping; fall back to the
        # directory-derived name and a placeholder date when unpublished.
        official_data = official_mapping.get(submission_id, {})
        official_agent_name = official_data.get('official_agent_name', agent_name)
        official_date = official_data.get('official_date', 'not yet published on leaderboard')

        # Calculate performance across subsets (includes the 'full' entry).
        performance = self.calculate_subset_performance(resolved_instances, subsets)

        # Create model entry
        model_entry = {
            'agent_name': official_agent_name,
            'submission_id': submission_id,
            'submission_date': official_date,
            'rank': 0,  # Will be calculated later, after sorting
            'full_solved': performance['full']['solved'],
            'full_total': performance['full']['total'],
            'full_percentage': performance['full']['percentage'],
        }

        # Add subset performance as flat <subset>_<metric> columns.
        for subset_name in ['frontier', 'challenging', 'hard', 'multifile']:
            if subset_name in performance:
                model_entry.update({
                    f'{subset_name}_solved': performance[subset_name]['solved'],
                    f'{subset_name}_total': performance[subset_name]['total'],
                    f'{subset_name}_percentage': performance[subset_name]['percentage']
                })

        verified_models.append(model_entry)

        # Create mapping entry using submission_id as key for uniqueness.
        verified_models_mapping[submission_id] = {
            'agent_name': official_agent_name,
            'submission_date': official_date,
            'resolved_instances': resolved_instances
        }

    # Sort by full benchmark performance and assign 1-based ranks.
    verified_models.sort(key=lambda x: x['full_percentage'], reverse=True)
    for i, model in enumerate(verified_models):
        model['rank'] = i + 1

    # Cache the results for subsequent calls.
    with open(self.models_cache, 'w') as f:
        json.dump(verified_models, f, indent=2)
    with open(self.mapping_cache, 'w') as f:
        json.dump(verified_models_mapping, f, indent=2)

    logger.info(f"Generated data for {len(verified_models)} models")
    return verified_models, verified_models_mapping
def get_cached_data(self) -> Optional[Tuple[List[Dict], Dict]]:
    """Return (models, mapping) loaded from the JSON caches, or None.

    None is returned when either cache file is missing or unreadable.
    """
    if not (self.models_cache.exists() and self.mapping_cache.exists()):
        return None
    try:
        with open(self.models_cache, 'r') as f:
            models = json.load(f)
        with open(self.mapping_cache, 'r') as f:
            mapping = json.load(f)
        return models, mapping
    except Exception as e:
        # Corrupt or unreadable cache: fall back to regeneration by the caller.
        logger.warning(f"Error loading cached data: {e}")
    return None
def refresh_cache(self):
    """Force refresh of cached data.

    Deletes every cache file (models, mapping, subsets, and the directory
    mapping), then regenerates and returns the fresh data.
    """
    logger.info("Refreshing cache...")

    # Remove cache files, including the directory-name mapping cache.
    directory_mapping_cache = self.cache_dir / "directory_mapping.json"
    stale_files = (
        self.models_cache,
        self.mapping_cache,
        self.subsets_cache,
        directory_mapping_cache,
    )
    for stale in stale_files:
        if stale.exists():
            stale.unlink()

    # Regenerate everything from scratch.
    return self.generate_verified_models_data()
# Module-level singleton so repeated calls share one fetcher (and its caches).
_fetcher = None

def get_leaderboard_data() -> Tuple[List[Dict], Dict]:
    """Return leaderboard data, preferring the on-disk cache over regeneration."""
    global _fetcher
    if _fetcher is None:
        _fetcher = LeaderboardDataFetcher()

    # Cached data wins; only fall back to a full regeneration when absent.
    cached = _fetcher.get_cached_data()
    return cached if cached is not None else _fetcher.generate_verified_models_data()
def refresh_leaderboard_data() -> Tuple[List[Dict], Dict]:
    """Drop all caches and rebuild the leaderboard data from scratch."""
    global _fetcher
    if _fetcher is None:
        _fetcher = LeaderboardDataFetcher()
    return _fetcher.refresh_cache()
if __name__ == "__main__":
    # Manual smoke test: build the data and show the current leaders.
    fetcher = LeaderboardDataFetcher()
    models, mapping = fetcher.generate_verified_models_data()

    print(f"Generated data for {len(models)} models")
    print("Top 5 models:")
    for idx, entry in enumerate(models[:5]):
        print(f"{idx+1}. {entry['agent_name']}: {entry['full_percentage']}% "
              f"(frontier: {entry.get('frontier_percentage', 0)}%)")
src/data/leaderboard_scraper.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import re
|
| 3 |
+
import logging
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Dict, Optional
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
import yaml
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
class SWEBenchMetadataReader:
    """Reads metadata.yaml files from experiment directories.

    Also owns a small on-disk cache (directory_mapping.json) mapping
    experiment directory names to official agent data.
    """

    def __init__(self, cache_dir: str = "leaderboard_cache"):
        # Ensure the cache directory exists up front.
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)

    def _extract_date_from_dirname(self, dirname: str) -> str:
        """Extract a YYYY-MM-DD date from a directory name of the form YYYYMMDD_*."""
        match = re.match(r'(\d{8})_', dirname)
        if not match:
            return 'not yet published on leaderboard'
        raw = match.group(1)
        try:
            return datetime.strptime(raw, '%Y%m%d').strftime('%Y-%m-%d')
        except ValueError:
            # Eight digits that are not a real calendar date: keep the raw string.
            return raw

    def get_cached_mapping(self) -> Optional[Dict]:
        """Return the cached directory mapping, or None when absent/unreadable."""
        mapping_file = self.cache_dir / "directory_mapping.json"
        if not mapping_file.exists():
            return None
        try:
            with open(mapping_file, 'r') as f:
                return json.load(f)
        except Exception as e:
            logger.warning(f"Error loading cached mapping: {e}")
            return None

    def save_mapping(self, mapping: Dict):
        """Persist the directory mapping to the cache as pretty-printed JSON."""
        mapping_file = self.cache_dir / "directory_mapping.json"
        with open(mapping_file, 'w') as f:
            json.dump(mapping, f, indent=2)
        logger.info(f"Saved directory mapping to {mapping_file}")
def get_official_leaderboard_mapping(experiments_dir: Path) -> Dict:
    """Get mapping between directory names and official agent data from metadata.yaml files.

    For each experiment directory, the official agent name is read from the
    directory's metadata.yaml ('info.name'); the directory name itself is the
    fallback. The computed mapping is persisted via SWEBenchMetadataReader and
    reused from cache on subsequent calls.
    """
    metadata_reader = SWEBenchMetadataReader()

    # Check if we have a cached mapping — reuse it to avoid re-reading YAML.
    cached_mapping = metadata_reader.get_cached_mapping()
    if cached_mapping:
        logger.info("Using cached directory mapping")
        return cached_mapping

    logger.info("Generating mapping from metadata.yaml files...")
    mapping = {}

    # Create mapping for all experiment directories using metadata.yaml.
    if experiments_dir.exists():
        for exp_dir in experiments_dir.iterdir():
            if exp_dir.is_dir():
                # Date comes from the YYYYMMDD_ prefix of the directory name.
                dir_date = metadata_reader._extract_date_from_dirname(exp_dir.name)
                metadata_file = exp_dir / "metadata.yaml"

                # Try to read the official name from metadata.yaml.
                official_name = exp_dir.name  # Default fallback
                if metadata_file.exists():
                    try:
                        with open(metadata_file, 'r', encoding='utf-8') as f:
                            metadata = yaml.safe_load(f)

                        # Extract official name from the info.name field.
                        if metadata and 'info' in metadata and 'name' in metadata['info']:
                            official_name = metadata['info']['name']
                            logger.debug(f"Found official name for {exp_dir.name}: {official_name}")
                        else:
                            logger.warning(f"No info.name found in {metadata_file}")
                    except Exception as e:
                        # Malformed YAML or I/O error: keep the fallback name.
                        logger.warning(f"Error reading {metadata_file}: {e}")
                else:
                    logger.warning(f"No metadata.yaml found for {exp_dir.name}")

                mapping[exp_dir.name] = {
                    'official_agent_name': official_name,
                    'official_date': dir_date,
                    'directory_date': dir_date,
                    'directory_name': exp_dir.name,
                    'score': '',
                    'matched': official_name != exp_dir.name  # True if we found a different name in metadata
                }

    logger.info(f"Generated mapping for {len(mapping)} directories from metadata.yaml files")
    metadata_reader.save_mapping(mapping)

    return mapping
src/utils/__init__.py
ADDED
|
File without changes
|
src/utils/automated_updater.py
ADDED
|
@@ -0,0 +1,305 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Automated updater for SWE-Bench leaderboard data.
|
| 4 |
+
Handles daily updates, new submission detection, and semi-automated name mapping.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import json
|
| 9 |
+
import subprocess
|
| 10 |
+
import tempfile
|
| 11 |
+
import shutil
|
| 12 |
+
import logging
|
| 13 |
+
import time
|
| 14 |
+
from datetime import datetime, timedelta
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from typing import Dict, List, Optional, Tuple, Set
|
| 17 |
+
|
| 18 |
+
from src.data.leaderboard_scraper import SWEBenchMetadataReader
|
| 19 |
+
from src.data.leaderboard_data_fetcher import LeaderboardDataFetcher
|
| 20 |
+
|
| 21 |
+
# Configure logging
|
| 22 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
class AutomatedUpdater:
    """Automated updater for SWE-Bench leaderboard data.

    Pulls the SWE-bench experiments git repository, detects newly added
    verified submissions, refreshes the leaderboard cache, and records
    update status files for the UI.
    """

    def __init__(self, cache_dir: str = "leaderboard_cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)

        # Status files (all JSON, kept inside the cache directory).
        self.last_update_file = self.cache_dir / "last_update.json"
        self.update_status_file = self.cache_dir / "update_status.json"
        self.new_submissions_file = self.cache_dir / "new_submissions.json"

        # Git repo details
        self.experiments_repo_url = "https://github.com/SWE-bench/experiments.git"
        self.experiments_dir = Path("leaderboard_swe_bench_experiments")

        # Components
        self.metadata_reader = SWEBenchMetadataReader(cache_dir)
        self.fetcher = LeaderboardDataFetcher(cache_dir)

    def get_last_update_info(self) -> Dict:
        """Get information about the last update; defaults when missing/unreadable."""
        if self.last_update_file.exists():
            try:
                with open(self.last_update_file, 'r') as f:
                    return json.load(f)
            except Exception as e:
                logger.warning(f"Error loading last update info: {e}")

        return {
            "last_update_time": None,
            "last_submission_count": 0,
            "last_successful_update": None,
            "update_status": "never_updated"
        }

    def save_last_update_info(self, info: Dict):
        """Save information about the last update."""
        with open(self.last_update_file, 'w') as f:
            json.dump(info, f, indent=2)

    def get_update_status(self) -> Dict:
        """Get current update status for UI display; defaults when unavailable."""
        if self.update_status_file.exists():
            try:
                with open(self.update_status_file, 'r') as f:
                    return json.load(f)
            except Exception as e:
                logger.warning(f"Error loading update status: {e}")

        return {
            "status": "unknown",
            "message": "Update status not available",
            "last_checked": None,
            "new_submissions_count": 0
        }

    def save_update_status(self, status: Dict):
        """Save update status for UI display, stamping it with the current time."""
        status["last_checked"] = datetime.now().isoformat()
        with open(self.update_status_file, 'w') as f:
            json.dump(status, f, indent=2)

    def clone_or_update_experiments_repo(self) -> bool:
        """Clone or update the experiments repository.

        Returns True on success; failures (non-zero exit, timeout, other
        exceptions) are logged and reported as False. Timeouts: 5 minutes
        for a pull, 10 minutes for a fresh clone.
        """
        try:
            if self.experiments_dir.exists():
                logger.info("Updating existing experiments repository...")
                result = subprocess.run(
                    ["git", "pull"],
                    cwd=self.experiments_dir,
                    capture_output=True,
                    text=True,
                    timeout=300
                )
                if result.returncode != 0:
                    logger.error(f"Git pull failed: {result.stderr}")
                    return False
            else:
                logger.info("Cloning experiments repository...")
                result = subprocess.run(
                    ["git", "clone", self.experiments_repo_url, str(self.experiments_dir)],
                    capture_output=True,
                    text=True,
                    timeout=600
                )
                if result.returncode != 0:
                    logger.error(f"Git clone failed: {result.stderr}")
                    return False

            logger.info("Successfully updated experiments repository")
            return True

        except subprocess.TimeoutExpired:
            logger.error("Git operation timed out")
            return False
        except Exception as e:
            logger.error(f"Error updating experiments repository: {e}")
            return False

    def detect_new_submissions(self) -> Tuple[List[str], int]:
        """Detect new submissions by comparing current directory list with cached list.

        Returns (new_submission_names, total_current_submission_count).
        Only directories containing results/results.json count as submissions.
        The current list is persisted so the next run diffs against it.
        """
        verified_dir = self.experiments_dir / "evaluation" / "verified"

        if not verified_dir.exists():
            logger.error(f"Verified directory not found: {verified_dir}")
            return [], 0

        # Get current submission directories (must contain a results file).
        current_submissions = set()
        for submission_dir in verified_dir.iterdir():
            if submission_dir.is_dir():
                results_file = submission_dir / "results" / "results.json"
                if results_file.exists():
                    current_submissions.add(submission_dir.name)

        # Load previous submission list; an unreadable file means "none known".
        previous_submissions = set()
        if self.new_submissions_file.exists():
            try:
                with open(self.new_submissions_file, 'r') as f:
                    data = json.load(f)
                    previous_submissions = set(data.get("submissions", []))
            except Exception as e:
                logger.warning(f"Error loading previous submissions: {e}")

        # Find new submissions (set difference: current minus previously seen).
        new_submissions = list(current_submissions - previous_submissions)

        # Save current submissions for next time.
        submission_data = {
            "submissions": list(current_submissions),
            "last_check": datetime.now().isoformat(),
            "new_since_last_check": new_submissions
        }

        with open(self.new_submissions_file, 'w') as f:
            json.dump(submission_data, f, indent=2)

        logger.info(f"Detected {len(new_submissions)} new submissions out of {len(current_submissions)} total")
        return new_submissions, len(current_submissions)


    def process_new_submissions(self, new_submissions: List[str]) -> Dict:
        """Process new submissions and create mapping entries.

        This is bookkeeping only: every submission is auto-accepted because
        official names are later read from each directory's metadata.yaml.
        """
        if not new_submissions:
            return {"processed": 0, "auto_matched": 0}

        logger.info(f"Processing {len(new_submissions)} new submissions...")

        # All submissions are automatically processed since we read names from metadata.yaml
        auto_matched = []

        for submission_id in new_submissions:
            logger.info(f"Processing new submission: {submission_id}")

            auto_matched.append({
                "submission_id": submission_id,
                "matched_name": "Will be read from metadata.yaml",
                "confidence": 1.0
            })

        return {
            "processed": len(new_submissions),
            "auto_matched": len(auto_matched),
            "auto_matched_details": auto_matched
        }

    def run_daily_update(self) -> Dict:
        """Run the daily update process.

        Pipeline: update git repo -> detect new submissions -> process them ->
        refresh the leaderboard cache. The result dict (success flag, message,
        counts, error) is persisted for UI display, and last-update bookkeeping
        is written regardless of success.
        """
        logger.info("Starting daily update process...")

        update_result = {
            "success": False,
            "timestamp": datetime.now().isoformat(),
            "message": "",
            "new_submissions_count": 0,
            "error": None
        }

        try:
            # Step 1: Update git repository
            logger.info("Step 1: Updating experiments repository...")
            if not self.clone_or_update_experiments_repo():
                raise Exception("Failed to update experiments repository")

            # Step 2: Detect new submissions
            logger.info("Step 2: Detecting new submissions...")
            new_submissions, total_submissions = self.detect_new_submissions()
            update_result["new_submissions_count"] = len(new_submissions)

            if new_submissions:
                logger.info(f"Found {len(new_submissions)} new submissions: {new_submissions}")

                # Step 3: Process new submissions
                logger.info("Step 3: Processing new submissions...")
                processing_result = self.process_new_submissions(new_submissions)

                update_result["message"] = f"Found {len(new_submissions)} new submissions. All automatically processed with official names from metadata."

            else:
                update_result["message"] = f"No new submissions found. Total: {total_submissions} submissions."

            # Step 4: Refresh leaderboard data cache
            logger.info("Step 4: Refreshing leaderboard data cache...")
            try:
                self.fetcher.refresh_cache()
                logger.info("Successfully refreshed leaderboard data cache")
            except Exception as e:
                logger.warning(f"Error refreshing cache: {e}")
                # Continue anyway - this is not critical for detecting new submissions

            update_result["success"] = True

        except Exception as e:
            logger.error(f"Daily update failed: {e}")
            update_result["error"] = str(e)
            update_result["message"] = f"Update failed: {e}"

        # Save update status (always, even on failure, so the UI can report it).
        self.save_update_status(update_result)

        # Update last update info
        last_update_info = self.get_last_update_info()
        last_update_info.update({
            "last_update_time": update_result["timestamp"],
            "update_status": "success" if update_result["success"] else "failed",
            "last_error": update_result.get("error")
        })

        if update_result["success"]:
            last_update_info["last_successful_update"] = update_result["timestamp"]

        self.save_last_update_info(last_update_info)

        logger.info(f"Daily update completed. Success: {update_result['success']}")
        return update_result

    def should_run_update(self) -> bool:
        """Check if an update should be run based on timing (24-hour cadence)."""
        last_update_info = self.get_last_update_info()

        if not last_update_info.get("last_update_time"):
            return True  # Never updated

        try:
            last_update = datetime.fromisoformat(last_update_info["last_update_time"])
            time_since_update = datetime.now() - last_update

            # Run update if more than 24 hours since last update
            return time_since_update > timedelta(hours=24)

        except ValueError:
            return True  # Invalid timestamp, run update
def run_automated_update() -> Dict:
    """Run one full daily-update cycle via a fresh AutomatedUpdater."""
    return AutomatedUpdater().run_daily_update()
def get_update_status() -> Dict:
    """Return the persisted update status for UI consumption."""
    return AutomatedUpdater().get_update_status()
|
| 292 |
+
"""Get last update information for UI"""
|
| 293 |
+
updater = AutomatedUpdater()
|
| 294 |
+
return updater.get_last_update_info()
|
| 295 |
+
|
| 296 |
+
def should_show_update_notification() -> bool:
    """True when the last update run detected at least one new submission."""
    latest_status = AutomatedUpdater().get_update_status()
    return latest_status.get("new_submissions_count", 0) > 0
if __name__ == "__main__":
    # Manual invocation: run one update cycle and dump the result as JSON.
    print(json.dumps(run_automated_update(), indent=2))
src/utils/export_utils.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Export utilities for downloading leaderboard data in various formats
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import json
|
| 7 |
+
import csv
|
| 8 |
+
import io
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from typing import List, Dict, Any, Tuple
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def format_export_data(performance_data: List[Dict[str, Any]], include_metadata: bool = True) -> List[Dict[str, Any]]:
    """
    Format performance data for export with clean column names.

    Produces, per input row: identity columns (rank, agent_name, submission_id,
    submission_date) followed by solved/total/percentage triplets for the full
    benchmark and each subset, in a fixed column order.
    """
    # NOTE(review): include_metadata is accepted but currently unused here.
    identity_keys = ('rank', 'agent_name', 'submission_id')
    benchmark_prefixes = ('full', 'frontier', 'challenging', 'hard', 'multifile')
    metrics = ('solved', 'total', 'percentage')

    formatted = []
    for src in performance_data:
        out = {key: src[key] for key in identity_keys}
        out['submission_date'] = src.get('submission_date', 'not yet published on leaderboard')

        # Flat <prefix>_<metric> columns, full benchmark first.
        for prefix in benchmark_prefixes:
            for metric in metrics:
                column = f'{prefix}_{metric}'
                out[column] = src[column]

        formatted.append(out)

    return formatted
def create_csv_export(performance_data: List[Dict[str, Any]], filename_prefix: str = "swe_bench_leaderboard") -> Tuple[str, str]:
    """
    Create CSV export from performance data.

    Returns: (csv_content, filename) where the filename carries a
    YYYYMMDD_HHMMSS timestamp suffix. Empty input yields empty content.
    """
    rows = format_export_data(performance_data)

    buffer = io.StringIO()
    if rows:
        writer = csv.DictWriter(buffer, fieldnames=rows[0].keys())
        writer.writeheader()
        writer.writerows(rows)
    csv_content = buffer.getvalue()
    buffer.close()

    # Timestamped filename so repeated exports never collide.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return csv_content, f"{filename_prefix}_{stamp}.csv"
def create_json_export(performance_data: List[Dict[str, Any]], filename_prefix: str = "swe_bench_leaderboard") -> Tuple[str, str]:
    """
    Create JSON export from performance data.

    Returns: (json_content, filename). The payload wraps the leaderboard rows
    together with a metadata header describing the export and each subset.
    """
    rows = format_export_data(performance_data)

    # Describe the export so downstream consumers can interpret the columns.
    export_metadata = {
        "export_timestamp": datetime.now().isoformat(),
        "total_agents": len(rows),
        "data_description": "SWE-Bench Verified Discriminative Subsets Leaderboard Data",
        "subsets": {
            "full": {"name": "SWE-Bench Verified", "total_instances": 500},
            "frontier": {"name": "Frontier Subset", "total_instances": 95, "description": "Problems solved by ≤5 agents"},
            "challenging": {"name": "Challenging Subset", "total_instances": 155, "description": "Problems solved by ≤20 agents"},
            "hard": {"name": "Hard Subset", "total_instances": 45, "description": "All Hard difficulty problems"},
            "multifile": {"name": "MultiFile Subset", "total_instances": 40, "description": "Multi-file problems solved by ≤10 agents"}
        }
    }

    payload = {
        "metadata": export_metadata,
        "leaderboard_data": rows
    }
    json_content = json.dumps(payload, indent=2, ensure_ascii=False)

    # Timestamped filename so repeated exports never collide.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return json_content, f"{filename_prefix}_{stamp}.json"
def create_subset_export(performance_data: List[Dict[str, Any]], subset_name: str, format_type: str = "csv") -> Tuple[str, str]:
    """
    Create an export for a single subset containing only that subset's columns.

    Args:
        performance_data: Raw leaderboard rows.
        subset_name: One of "full", "frontier", "challenging", "hard", "multifile".
        format_type: "csv" or "json" (case-insensitive).

    Returns:
        (content, filename) — filename carries a timestamp suffix.

    Raises:
        ValueError: If subset_name is unknown or format_type is unsupported.
    """
    # Data-driven configuration replaces the previous five near-identical
    # if/elif arms: every subset follows the same column-naming pattern.
    subset_prefixes = {
        "full": "swe_bench_full_benchmark",
        "frontier": "swe_bench_frontier_subset",
        "challenging": "swe_bench_challenging_subset",
        "hard": "swe_bench_hard_subset",
        "multifile": "swe_bench_multifile_subset",
    }
    if subset_name not in subset_prefixes:
        raise ValueError(f"Unknown subset: {subset_name}")
    filename_prefix = subset_prefixes[subset_name]
    subset_columns = [
        'rank', 'agent_name', 'submission_date',
        f'{subset_name}_solved', f'{subset_name}_total', f'{subset_name}_percentage',
    ]

    # Keep only the columns relevant to this subset.
    export_data = format_export_data(performance_data)
    filtered_data = [{col: row[col] for col in subset_columns if col in row}
                     for row in export_data]

    fmt = format_type.lower()
    if fmt == "csv":
        output = io.StringIO()
        if filtered_data:
            writer = csv.DictWriter(output, fieldnames=filtered_data[0].keys())
            writer.writeheader()
            writer.writerows(filtered_data)
        # Assigned even when filtered_data is empty (empty string export).
        content = output.getvalue()
        output.close()
        extension = "csv"
    elif fmt == "json":
        full_export = {
            "metadata": {
                "export_timestamp": datetime.now().isoformat(),
                "subset": subset_name,
                "total_agents": len(filtered_data),
                "description": f"SWE-Bench {subset_name.title()} Subset Results",
            },
            "results": filtered_data,
        }
        content = json.dumps(full_export, indent=2, ensure_ascii=False)
        extension = "json"
    else:
        raise ValueError(f"Unsupported format: {format_type}")

    # Single timestamp/filename construction shared by both formats.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{filename_prefix}_{timestamp}.{extension}"
    return content, filename
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def create_comparison_export(performance_data: List[Dict[str, Any]], agents_to_compare: List[str]) -> Tuple[str, str]:
    """
    Build a JSON export comparing the selected agents across every subset.

    Returns: (json_content, filename)
    """
    def _as_comparison_row(entry: Dict[str, Any]) -> Dict[str, Any]:
        # Flatten one leaderboard entry into the comparison schema, including
        # how far the discriminative subsets fall below the full benchmark.
        full_pct = entry['full_percentage']
        return {
            'agent_name': entry['agent_name'],
            'submission_date': entry.get('submission_date', 'not yet published on leaderboard'),
            'full_percentage': full_pct,
            'frontier_percentage': entry['frontier_percentage'],
            'challenging_percentage': entry['challenging_percentage'],
            'hard_percentage': entry['hard_percentage'],
            'multifile_percentage': entry['multifile_percentage'],
            'frontier_vs_full_drop': round(full_pct - entry['frontier_percentage'], 1),
            'challenging_vs_full_drop': round(full_pct - entry['challenging_percentage'], 1),
        }

    # Keep only the requested agents, ranked by full-benchmark performance.
    comparison_rows = sorted(
        (_as_comparison_row(row) for row in performance_data
         if row['agent_name'] in agents_to_compare),
        key=lambda r: r['full_percentage'],
        reverse=True,
    )

    export_payload = {
        "metadata": {
            "export_timestamp": datetime.now().isoformat(),
            "comparison_type": "agent_comparison",
            "agents_compared": agents_to_compare,
            "total_agents": len(comparison_rows),
        },
        "comparison_data": comparison_rows,
    }

    json_content = json.dumps(export_payload, indent=2, ensure_ascii=False)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return json_content, f"swe_bench_agent_comparison_{stamp}.json"
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
# Gradio-compatible export functions
|
| 234 |
+
import tempfile
|
| 235 |
+
import os
|
| 236 |
+
|
| 237 |
+
def export_full_leaderboard_csv(performance_data: List[Dict[str, Any]]) -> str:
    """Export the full leaderboard as CSV and return a downloadable file path."""
    content, filename = create_csv_export(performance_data, "swe_bench_full_leaderboard")

    # Gradio serves downloads from a real file path, so persist to the temp dir.
    out_path = os.path.join(tempfile.gettempdir(), filename)
    with open(out_path, 'w', encoding='utf-8') as fh:
        fh.write(content)
    return out_path
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def export_full_leaderboard_json(performance_data: List[Dict[str, Any]]) -> str:
    """Export the full leaderboard as JSON and return a downloadable file path."""
    content, filename = create_json_export(performance_data, "swe_bench_full_leaderboard")

    # Gradio serves downloads from a real file path, so persist to the temp dir.
    out_path = os.path.join(tempfile.gettempdir(), filename)
    with open(out_path, 'w', encoding='utf-8') as fh:
        fh.write(content)
    return out_path
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
def export_subset_csv(performance_data: List[Dict[str, Any]], subset_name: str) -> str:
    """Export one subset as CSV and return a downloadable file path."""
    content, filename = create_subset_export(performance_data, subset_name, "csv")

    # Gradio serves downloads from a real file path, so persist to the temp dir.
    out_path = os.path.join(tempfile.gettempdir(), filename)
    with open(out_path, 'w', encoding='utf-8') as fh:
        fh.write(content)
    return out_path
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
def export_subset_json(performance_data: List[Dict[str, Any]], subset_name: str) -> str:
    """Export one subset as JSON and return a downloadable file path."""
    content, filename = create_subset_export(performance_data, subset_name, "json")

    # Gradio serves downloads from a real file path, so persist to the temp dir.
    out_path = os.path.join(tempfile.gettempdir(), filename)
    with open(out_path, 'w', encoding='utf-8') as fh:
        fh.write(content)
    return out_path
|
src/utils/update_scheduler.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Scheduler for automated SWE-Bench leaderboard updates.
|
| 4 |
+
Can be run as a standalone script or integrated into the Gradio app.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import time
|
| 8 |
+
import schedule
|
| 9 |
+
import threading
|
| 10 |
+
import logging
|
| 11 |
+
from datetime import datetime, timedelta
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
import json
|
| 14 |
+
|
| 15 |
+
from src.utils.automated_updater import AutomatedUpdater
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
class UpdateScheduler:
    """Runs automated leaderboard updates on a daemon background thread.

    Timing is delegated to the third-party `schedule` library: a daily job
    fires at 06:00, and a one-shot catch-up job is registered at startup when
    the last update is stale (per ``AutomatedUpdater.should_run_update``).
    """

    def __init__(self):
        self.updater = AutomatedUpdater()
        self.running = False
        self.scheduler_thread = None
        # Event used to wake/stop the loop promptly instead of sleeping in
        # fixed 60s blocks; previously stop_scheduler()'s join(timeout=5)
        # routinely timed out because the loop was mid-sleep.
        self._stop_event = threading.Event()

    def run_scheduled_update(self):
        """Run one update cycle, logging (never raising) on failure."""
        try:
            logger.info("Running scheduled update...")
            result = self.updater.run_daily_update()

            if result["success"]:
                logger.info(f"Scheduled update completed successfully. {result['message']}")
            else:
                logger.error(f"Scheduled update failed: {result['message']}")

        except Exception as e:
            logger.error(f"Error in scheduled update: {e}")

    def start_scheduler(self):
        """Register jobs and start the background scheduler thread (no-op if running)."""
        if self.running:
            logger.warning("Scheduler is already running")
            return

        # Schedule daily updates at 6 AM UTC
        schedule.every().day.at("06:00").do(self.run_scheduled_update)

        # Catch up within the first minute if it's been more than 24 hours.
        if self.updater.should_run_update():
            logger.info("Running immediate update (more than 24 hours since last update)")
            schedule.every().minute.do(self._run_immediate_update).tag('immediate')

        self.running = True
        self._stop_event.clear()
        self.scheduler_thread = threading.Thread(target=self._scheduler_loop, daemon=True)
        self.scheduler_thread.start()

        logger.info("Update scheduler started")

    def _run_immediate_update(self):
        """One-shot catch-up job: run once, then unschedule itself."""
        self.run_scheduled_update()
        # CancelJob removes this job; clearing the tag is belt-and-braces in
        # case the 'immediate' job was ever registered more than once.
        schedule.clear('immediate')
        return schedule.CancelJob

    def stop_scheduler(self):
        """Stop the scheduler thread and clear all scheduled jobs."""
        if not self.running:
            return

        self.running = False
        # Wake the loop immediately so it exits without waiting out the sleep.
        self._stop_event.set()
        schedule.clear()

        if self.scheduler_thread and self.scheduler_thread.is_alive():
            self.scheduler_thread.join(timeout=5)

        logger.info("Update scheduler stopped")

    def _scheduler_loop(self):
        """Main scheduler loop: poll pending jobs roughly once a minute."""
        while self.running and not self._stop_event.is_set():
            try:
                schedule.run_pending()
            except Exception as e:
                logger.error(f"Error in scheduler loop: {e}")
            # Returns early as soon as stop_scheduler() sets the event.
            self._stop_event.wait(60)

    def force_update(self):
        """Run an update immediately on the calling thread and return its result."""
        logger.info("Forcing immediate update...")
        return self.updater.run_daily_update()

    def get_status(self):
        """Return a status dict: running flag, next run time, and job count."""
        return {
            "running": self.running,
            "next_run": str(schedule.next_run()) if schedule.jobs else None,
            "job_count": len(schedule.jobs)
        }
|
| 102 |
+
|
| 103 |
+
# Module-level singleton scheduler instance (created lazily).
_scheduler = None

def get_scheduler() -> UpdateScheduler:
    """Return the process-wide UpdateScheduler, creating it on first use."""
    global _scheduler
    if _scheduler is None:
        _scheduler = UpdateScheduler()
    return _scheduler
|
| 112 |
+
|
| 113 |
+
def start_automated_updates():
    """Begin background automated updates via the global scheduler."""
    get_scheduler().start_scheduler()
|
| 117 |
+
|
| 118 |
+
def stop_automated_updates():
    """Halt background automated updates via the global scheduler."""
    get_scheduler().stop_scheduler()
|
| 122 |
+
|
| 123 |
+
def force_update():
    """Run an update immediately and return its result dict."""
    return get_scheduler().force_update()
|
| 127 |
+
|
| 128 |
+
def get_scheduler_status():
    """Return the current status dict of the global scheduler."""
    return get_scheduler().get_status()
|
| 132 |
+
|
| 133 |
+
if __name__ == "__main__":
    # Standalone mode: configure logging and keep the process alive while
    # the scheduler's daemon thread does the work in the background.
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    runner = UpdateScheduler()
    runner.start_scheduler()

    try:
        while True:
            time.sleep(60)
    except KeyboardInterrupt:
        logger.info("Received interrupt signal")
        runner.stop_scheduler()
|