Spaces:
Running
Running
Konstantin Chernyshev
committed on
Commit
·
c933ce0
1
Parent(s):
7c8ff05
fix: add charts
Browse files
- app.py +18 -1
- data/mu_math_eval_results.json +10 -10
- src/populate.py +40 -5
app.py
CHANGED
|
@@ -186,9 +186,25 @@ with demo:
|
|
| 186 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 187 |
with gr.TabItem("🏆 U-MATH", elem_id="u-math-benchmark-tab-table", id=0):
|
| 188 |
leaderboard_umath = init_leaderboard(LEADERBOARD_U_MATH_DF, U_MATH_COLUMNS_DICT)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
|
| 190 |
with gr.TabItem("🏅 μ-MATH (Meta-Benchmark)", elem_id="mu-math-benchmark-tab-table", id=1):
|
| 191 |
leaderboard_mumath = init_leaderboard(LEADERBOARD_MU_MATH_DF, MU_MATH_COLUMNS_DICT)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
|
| 193 |
with gr.TabItem("📝 About", elem_id="about-tab-table", id=2):
|
| 194 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
@@ -205,4 +221,5 @@ with demo:
|
|
| 205 |
scheduler = BackgroundScheduler()
|
| 206 |
scheduler.add_job(restart_space, "interval", seconds=60 * 60)
|
| 207 |
scheduler.start()
|
| 208 |
-
demo.queue(default_concurrency_limit=40).launch(ssr_mode=False)
|
|
|
|
|
|
| 186 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 187 |
with gr.TabItem("🏆 U-MATH", elem_id="u-math-benchmark-tab-table", id=0):
|
| 188 |
leaderboard_umath = init_leaderboard(LEADERBOARD_U_MATH_DF, U_MATH_COLUMNS_DICT)
|
| 189 |
+
gr.ScatterPlot(
|
| 190 |
+
value=LEADERBOARD_U_MATH_DF,
|
| 191 |
+
title="U-MATH: Text vs Visual Accuracy",
|
| 192 |
+
x=U_MATH_COLUMNS_DICT["u_math_text_acc"].pretty_name,
|
| 193 |
+
y=U_MATH_COLUMNS_DICT["u_math_visual_acc"].pretty_name,
|
| 194 |
+
color=U_MATH_COLUMNS_DICT["model_family"].pretty_name,
|
| 195 |
+
tooltip=[U_MATH_COLUMNS_DICT["full_model_name"].pretty_name, U_MATH_COLUMNS_DICT["u_math_acc"].pretty_name],
|
| 196 |
+
)
|
| 197 |
|
| 198 |
with gr.TabItem("🏅 μ-MATH (Meta-Benchmark)", elem_id="mu-math-benchmark-tab-table", id=1):
|
| 199 |
leaderboard_mumath = init_leaderboard(LEADERBOARD_MU_MATH_DF, MU_MATH_COLUMNS_DICT)
|
| 200 |
+
gr.ScatterPlot(
|
| 201 |
+
value=LEADERBOARD_MU_MATH_DF,
|
| 202 |
+
title="μ-MATH: True Positive Rate (Recall) vs True Negative Rate (Specificity)",
|
| 203 |
+
x=MU_MATH_COLUMNS_DICT["mu_math_tpr"].pretty_name,
|
| 204 |
+
y=MU_MATH_COLUMNS_DICT["mu_math_tnr"].pretty_name,
|
| 205 |
+
color=MU_MATH_COLUMNS_DICT["model_family"].pretty_name,
|
| 206 |
+
tooltip=[MU_MATH_COLUMNS_DICT["full_model_name"].pretty_name, MU_MATH_COLUMNS_DICT["mu_math_f1"].pretty_name],
|
| 207 |
+
)
|
| 208 |
|
| 209 |
with gr.TabItem("📝 About", elem_id="about-tab-table", id=2):
|
| 210 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
|
|
| 221 |
scheduler = BackgroundScheduler()
|
| 222 |
scheduler.add_job(restart_space, "interval", seconds=60 * 60)
|
| 223 |
scheduler.start()
|
| 224 |
+
# demo.queue(default_concurrency_limit=40).launch(ssr_mode=False)
|
| 225 |
+
demo.queue(default_concurrency_limit=40).launch()
|
data/mu_math_eval_results.json
CHANGED
|
@@ -2,19 +2,19 @@
|
|
| 2 |
{
|
| 3 |
"model_name": "mistralai/Ministral-8B-Instruct-2410",
|
| 4 |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
|
| 5 |
-
"mu_math": [0.664, 0.33, 0.651, 0.68, 0.701, 0.628],
|
| 6 |
-
"GPT-4o": [0.664, 0.332, 0.621, 0.71, 0.696, 0.637],
|
| 7 |
-
"Gemini-1.5-Pro": [0.672, 0.279, 0.709, 0.585, 0.798, 0.466],
|
| 8 |
-
"Llama-3.1-70B-Instruct": [0.675, 0.317, 0.619, 0.707, 0.541, 0.769],
|
| 9 |
-
"Qwen2.5-72B-Instruct": [0.646, 0.295, 0.626, 0.672, 0.719, 0.574]
|
| 10 |
},
|
| 11 |
{
|
| 12 |
"model_name": "meta-llama/Llama-3.3-70B-Instruct",
|
| 13 |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
|
| 14 |
-
"mu_math": [0.741, 0.496, 0.666, 0.827, 0.816, 0.682],
|
| 15 |
-
"GPT-4o": [0.731, 0.475, 0.636, 0.832, 0.802, 0.681],
|
| 16 |
-
"Gemini-1.5-Pro": [0.705, 0.394, 0.693, 0.732, 0.856, 0.508],
|
| 17 |
-
"Llama-3.1-70B-Instruct": [0.823, 0.605, 0.67, 0.908, 0.802, 0.832],
|
| 18 |
-
"Qwen2.5-72B-Instruct": [0.705, 0.421, 0.658, 0.767, 0.791, 0.627]
|
| 19 |
}
|
| 20 |
]
|
|
|
|
| 2 |
{
|
| 3 |
"model_name": "mistralai/Ministral-8B-Instruct-2410",
|
| 4 |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
|
| 5 |
+
"mu_math": [0.664, 0.33, 0.651, 0.68, 0.701, 0.628, 0.574],
|
| 6 |
+
"GPT-4o": [0.664, 0.332, 0.621, 0.71, 0.696, 0.637, 0.574],
|
| 7 |
+
"Gemini-1.5-Pro": [0.672, 0.279, 0.709, 0.585, 0.798, 0.466, 0.574],
|
| 8 |
+
"Llama-3.1-70B-Instruct": [0.675, 0.317, 0.619, 0.707, 0.541, 0.769, 0.574],
|
| 9 |
+
"Qwen2.5-72B-Instruct": [0.646, 0.295, 0.626, 0.672, 0.719, 0.574, 0.574]
|
| 10 |
},
|
| 11 |
{
|
| 12 |
"model_name": "meta-llama/Llama-3.3-70B-Instruct",
|
| 13 |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
|
| 14 |
+
"mu_math": [0.741, 0.496, 0.666, 0.827, 0.816, 0.682, 0.574],
|
| 15 |
+
"GPT-4o": [0.731, 0.475, 0.636, 0.832, 0.802, 0.681, 0.574],
|
| 16 |
+
"Gemini-1.5-Pro": [0.705, 0.394, 0.693, 0.732, 0.856, 0.508, 0.574],
|
| 17 |
+
"Llama-3.1-70B-Instruct": [0.823, 0.605, 0.67, 0.908, 0.802, 0.832, 0.574],
|
| 18 |
+
"Qwen2.5-72B-Instruct": [0.705, 0.421, 0.658, 0.767, 0.791, 0.627, 0.574]
|
| 19 |
}
|
| 20 |
]
|
src/populate.py
CHANGED
|
@@ -7,6 +7,9 @@ from huggingface_hub import model_info
|
|
| 7 |
from transformers import AutoConfig
|
| 8 |
|
| 9 |
|
|
|
|
|
|
|
|
|
|
| 10 |
def is_model_on_hub(
|
| 11 |
model_name: str, revision: str, token: str = None, trust_remote_code=False
|
| 12 |
) -> tuple[bool, str | None, str | None]:
|
|
@@ -48,6 +51,22 @@ def model_type_to_symbol(model_type: str) -> str:
|
|
| 48 |
|
| 49 |
def get_hf_data_by_model_name(model_name: str) -> dict:
|
| 50 |
"""Get model data from Hugging Face API by model name"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
still_on_hub, _, model_config = is_model_on_hub(model_name, "main", trust_remote_code=True)
|
| 52 |
if not still_on_hub and '/' in model_name:
|
| 53 |
print(f"Model {model_name} is not on the hub, try unsloth/...")
|
|
@@ -69,6 +88,8 @@ def get_hf_data_by_model_name(model_name: str) -> dict:
|
|
| 69 |
print("SafeTensors not found in", model_name, e)
|
| 70 |
if 'Pixtral-12B' in model_name:
|
| 71 |
num_params = 12
|
|
|
|
|
|
|
| 72 |
pass
|
| 73 |
print("num_params", model_name, num_params)
|
| 74 |
|
|
@@ -93,6 +114,7 @@ def get_hf_data_by_model_name(model_name: str) -> dict:
|
|
| 93 |
"model_size": num_params if num_params else None,
|
| 94 |
"model_url": model_url,
|
| 95 |
"model_license": model_license,
|
|
|
|
| 96 |
}
|
| 97 |
|
| 98 |
|
|
@@ -109,11 +131,14 @@ class Field:
|
|
| 109 |
MODEL_COLUMNS_DICT = {
|
| 110 |
"model_type_symbol": Field("T", "str", never_hidden=True),
|
| 111 |
"model_size_symbol": Field("S", "str", never_hidden=True),
|
|
|
|
| 112 |
"model_name": Field("Model Name", "markdown", never_hidden=True),
|
| 113 |
"model_type": Field("Type", "str", displayed_by_default=False),
|
| 114 |
"model_size": Field("#Params (B)", "number", displayed_by_default=False),
|
|
|
|
| 115 |
"model_architecture": Field("Architecture", "str", displayed_by_default=False),
|
| 116 |
"model_license": Field("License", "markdown", displayed_by_default=False),
|
|
|
|
| 117 |
}
|
| 118 |
|
| 119 |
U_MATH_COLUMNS_DICT = {
|
|
@@ -233,8 +258,11 @@ def get_u_math_leaderboard_df() -> pd.DataFrame:
|
|
| 233 |
df["model_license"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_license"])
|
| 234 |
df["model_type"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_type"])
|
| 235 |
df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
|
|
|
|
| 236 |
df["model_size"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_size"])
|
|
|
|
| 237 |
df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
|
|
|
|
| 238 |
df["model_name"] = df["model_name"].apply(
|
| 239 |
lambda x: f"[{x}]({url})" if (url := model_to_meta_dict[x]["model_url"]) else x
|
| 240 |
)
|
|
@@ -253,12 +281,16 @@ def get_mu_math_leaderboard_df() -> pd.DataFrame:
|
|
| 253 |
|
| 254 |
# Calculate columns with prefixes f1, tpr, tnr, ppv, npv
|
| 255 |
for col in ["mu_math", "GPT-4o", "Gemini-1.5-Pro", "Llama-3.1-70B-Instruct", "Qwen2.5-72B-Instruct"]:
|
| 256 |
-
df[col + "
|
| 257 |
-
df[col + "
|
| 258 |
-
df[col + "
|
| 259 |
-
df[col + "
|
| 260 |
-
df[col + "
|
|
|
|
|
|
|
| 261 |
del df[col]
|
|
|
|
|
|
|
| 262 |
|
| 263 |
# # flatten list [x, y, z] in columns as ["_f1", "_precision", "_recall"] suffixes for columns
|
| 264 |
# for col in ["mu_math", "GPT-4o", "Gemini-1.5-Pro", "Llama-3.1-70B-Instruct", "Qwen2.5-72B-Instruct"]:
|
|
@@ -281,8 +313,11 @@ def get_mu_math_leaderboard_df() -> pd.DataFrame:
|
|
| 281 |
df["model_license"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_license"])
|
| 282 |
df["model_type"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_type"])
|
| 283 |
df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
|
|
|
|
| 284 |
df["model_size"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_size"])
|
|
|
|
| 285 |
df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
|
|
|
|
| 286 |
df["model_name"] = df["model_name"].apply(
|
| 287 |
lambda x: f"[{x}]({url})" if (url := model_to_meta_dict[x]["model_url"]) else x
|
| 288 |
)
|
|
|
|
| 7 |
from transformers import AutoConfig
|
| 8 |
|
| 9 |
|
| 10 |
+
UNKNOWN_MODEL_SHOW_SIZE = 150
|
| 11 |
+
|
| 12 |
+
|
| 13 |
def is_model_on_hub(
|
| 14 |
model_name: str, revision: str, token: str = None, trust_remote_code=False
|
| 15 |
) -> tuple[bool, str | None, str | None]:
|
|
|
|
| 51 |
|
| 52 |
def get_hf_data_by_model_name(model_name: str) -> dict:
|
| 53 |
"""Get model data from Hugging Face API by model name"""
|
| 54 |
+
model_family = "Unknown"
|
| 55 |
+
if 'mistral' in model_name.lower() or 'numina' in model_name.lower():
|
| 56 |
+
model_family = "Mistral"
|
| 57 |
+
elif 'meta-llama' in model_name.lower():
|
| 58 |
+
model_family = "LLaMA"
|
| 59 |
+
elif 'claude' in model_name.lower():
|
| 60 |
+
model_family = "Claude"
|
| 61 |
+
elif 'qwen' in model_name.lower() or 'athene' in model_name.lower() or 'qwq' in model_name.lower() or 'qvq' in model_name.lower():
|
| 62 |
+
model_family = "Qwen"
|
| 63 |
+
elif 'gpt' in model_name.lower() or 'o1' in model_name.lower():
|
| 64 |
+
model_family = "GPT"
|
| 65 |
+
elif 'gemini' in model_name.lower():
|
| 66 |
+
model_family = "Gemini"
|
| 67 |
+
elif 'deepseek' in model_name.lower():
|
| 68 |
+
model_family = "DeepSeek"
|
| 69 |
+
|
| 70 |
still_on_hub, _, model_config = is_model_on_hub(model_name, "main", trust_remote_code=True)
|
| 71 |
if not still_on_hub and '/' in model_name:
|
| 72 |
print(f"Model {model_name} is not on the hub, try unsloth/...")
|
|
|
|
| 88 |
print("SafeTensors not found in", model_name, e)
|
| 89 |
if 'Pixtral-12B' in model_name:
|
| 90 |
num_params = 12
|
| 91 |
+
elif 'Pixtral-Large-Instruct-2411' in model_name:
|
| 92 |
+
num_params = 123.3
|
| 93 |
pass
|
| 94 |
print("num_params", model_name, num_params)
|
| 95 |
|
|
|
|
| 114 |
"model_size": num_params if num_params else None,
|
| 115 |
"model_url": model_url,
|
| 116 |
"model_license": model_license,
|
| 117 |
+
"model_family": model_family,
|
| 118 |
}
|
| 119 |
|
| 120 |
|
|
|
|
| 131 |
MODEL_COLUMNS_DICT = {
|
| 132 |
"model_type_symbol": Field("T", "str", never_hidden=True),
|
| 133 |
"model_size_symbol": Field("S", "str", never_hidden=True),
|
| 134 |
+
"full_model_name": Field("Full Model Name", "markdown", fully_hidden=True),
|
| 135 |
"model_name": Field("Model Name", "markdown", never_hidden=True),
|
| 136 |
"model_type": Field("Type", "str", displayed_by_default=False),
|
| 137 |
"model_size": Field("#Params (B)", "number", displayed_by_default=False),
|
| 138 |
+
"model_size_including_unknown": Field("#Params inc. Proprietary (B)", "number", fully_hidden=True),
|
| 139 |
"model_architecture": Field("Architecture", "str", displayed_by_default=False),
|
| 140 |
"model_license": Field("License", "markdown", displayed_by_default=False),
|
| 141 |
+
"model_family": Field("Family", "str", displayed_by_default=False),
|
| 142 |
}
|
| 143 |
|
| 144 |
U_MATH_COLUMNS_DICT = {
|
|
|
|
| 258 |
df["model_license"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_license"])
|
| 259 |
df["model_type"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_type"])
|
| 260 |
df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
|
| 261 |
+
df["model_family"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_family"])
|
| 262 |
df["model_size"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_size"])
|
| 263 |
+
df["model_size_including_unknown"] = df["model_size"].apply(lambda x: x if x and pd.notna(x) else UNKNOWN_MODEL_SHOW_SIZE).astype(float)
|
| 264 |
df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
|
| 265 |
+
df["full_model_name"] = df["model_name"]
|
| 266 |
df["model_name"] = df["model_name"].apply(
|
| 267 |
lambda x: f"[{x}]({url})" if (url := model_to_meta_dict[x]["model_url"]) else x
|
| 268 |
)
|
|
|
|
| 281 |
|
| 282 |
# Calculate columns with prefixes f1, tpr, tnr, ppv, npv
|
| 283 |
for col in ["mu_math", "GPT-4o", "Gemini-1.5-Pro", "Llama-3.1-70B-Instruct", "Qwen2.5-72B-Instruct"]:
|
| 284 |
+
df[col + "_acc"] = df[col].apply(lambda x: x[0])
|
| 285 |
+
df[col + "_f1"] = df[col].apply(lambda x: x[1])
|
| 286 |
+
df[col + "_mcc"] = df[col].apply(lambda x: x[2])
|
| 287 |
+
df[col + "_tpr"] = df[col].apply(lambda x: x[3])
|
| 288 |
+
df[col + "_tnr"] = df[col].apply(lambda x: x[4])
|
| 289 |
+
df[col + "_ppv"] = df[col].apply(lambda x: x[5])
|
| 290 |
+
df[col + "_npv"] = df[col].apply(lambda x: x[6])
|
| 291 |
del df[col]
|
| 292 |
+
del df[col + "_acc"]
|
| 293 |
+
del df[col + "_mcc"]
|
| 294 |
|
| 295 |
# # flatten list [x, y, z] in columns as ["_f1", "_precision", "_recall"] suffixes for columns
|
| 296 |
# for col in ["mu_math", "GPT-4o", "Gemini-1.5-Pro", "Llama-3.1-70B-Instruct", "Qwen2.5-72B-Instruct"]:
|
|
|
|
| 313 |
df["model_license"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_license"])
|
| 314 |
df["model_type"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_type"])
|
| 315 |
df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
|
| 316 |
+
df["model_family"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_family"])
|
| 317 |
df["model_size"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_size"])
|
| 318 |
+
df["model_size_including_unknown"] = df["model_size"].apply(lambda x: x if x and pd.notna(x) else UNKNOWN_MODEL_SHOW_SIZE).astype(float)
|
| 319 |
df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
|
| 320 |
+
df["full_model_name"] = df["model_name"]
|
| 321 |
df["model_name"] = df["model_name"].apply(
|
| 322 |
lambda x: f"[{x}]({url})" if (url := model_to_meta_dict[x]["model_url"]) else x
|
| 323 |
)
|