Spaces:
Running
Running
chore: fix model repo names
Browse files- data/u_math_eval_results.json +13 -13
data/u_math_eval_results.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"model_name": "meta-llama/Llama-3.1-8B",
|
| 4 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 5 |
"u_math": [
|
| 6 |
29.545454545454547,
|
|
@@ -39,7 +39,7 @@
|
|
| 39 |
]
|
| 40 |
},
|
| 41 |
{
|
| 42 |
-
"model_name": "Qwen/Qwen2.5-7B",
|
| 43 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 44 |
"u_math": [
|
| 45 |
43.27272727272727,
|
|
@@ -78,7 +78,7 @@
|
|
| 78 |
]
|
| 79 |
},
|
| 80 |
{
|
| 81 |
-
"model_name": "Qwen/Qwen2.5-72B",
|
| 82 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 83 |
"u_math": [
|
| 84 |
51.18181818181819,
|
|
@@ -117,7 +117,7 @@
|
|
| 117 |
]
|
| 118 |
},
|
| 119 |
{
|
| 120 |
-
"model_name": "Qwen/Qwen2.5-Math-7B",
|
| 121 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 122 |
"u_math": [
|
| 123 |
45.45454545454545,
|
|
@@ -156,7 +156,7 @@
|
|
| 156 |
]
|
| 157 |
},
|
| 158 |
{
|
| 159 |
-
"model_name": "Qwen/Qwen2.5-Math-72B",
|
| 160 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 161 |
"u_math": [
|
| 162 |
59.45454545454546,
|
|
@@ -546,7 +546,7 @@
|
|
| 546 |
]
|
| 547 |
},
|
| 548 |
{
|
| 549 |
-
"model_name": "meta-llama/Llama-3.2-11B-Vision",
|
| 550 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 551 |
"u_math": [
|
| 552 |
20.363636363636363,
|
|
@@ -663,7 +663,7 @@
|
|
| 663 |
]
|
| 664 |
},
|
| 665 |
{
|
| 666 |
-
"model_name": "Qwen/Qwen2.5-32B",
|
| 667 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 668 |
"u_math": [
|
| 669 |
52.36363636363637,
|
|
@@ -936,7 +936,7 @@
|
|
| 936 |
]
|
| 937 |
},
|
| 938 |
{
|
| 939 |
-
"model_name": "meta-llama/Llama-3.1-70B",
|
| 940 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 941 |
"u_math": [
|
| 942 |
34.27272727272727,
|
|
@@ -975,7 +975,7 @@
|
|
| 975 |
]
|
| 976 |
},
|
| 977 |
{
|
| 978 |
-
"model_name": "nvidia/Llama-3.1-Nemotron-70B",
|
| 979 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 980 |
"u_math": [
|
| 981 |
42.54545454545455,
|
|
@@ -1014,7 +1014,7 @@
|
|
| 1014 |
]
|
| 1015 |
},
|
| 1016 |
{
|
| 1017 |
-
"model_name": "meta-llama/Llama-3.3-70B",
|
| 1018 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 1019 |
"u_math": [
|
| 1020 |
44.72727272727273,
|
|
@@ -1053,7 +1053,7 @@
|
|
| 1053 |
]
|
| 1054 |
},
|
| 1055 |
{
|
| 1056 |
-
"model_name": "meta-llama/Llama-3.2-90B-Vision",
|
| 1057 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 1058 |
"u_math": [
|
| 1059 |
37.18181818181818,
|
|
@@ -1092,7 +1092,7 @@
|
|
| 1092 |
]
|
| 1093 |
},
|
| 1094 |
{
|
| 1095 |
-
"model_name": "Qwen/Qwen2-VL-7B",
|
| 1096 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 1097 |
"u_math": [
|
| 1098 |
26.272727272727277,
|
|
@@ -1131,7 +1131,7 @@
|
|
| 1131 |
]
|
| 1132 |
},
|
| 1133 |
{
|
| 1134 |
-
"model_name": "Qwen/Qwen2-VL-72B",
|
| 1135 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 1136 |
"u_math": [
|
| 1137 |
41.81818181818181,
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"model_name": "meta-llama/Llama-3.1-8B-Instruct",
|
| 4 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 5 |
"u_math": [
|
| 6 |
29.545454545454547,
|
|
|
|
| 39 |
]
|
| 40 |
},
|
| 41 |
{
|
| 42 |
+
"model_name": "Qwen/Qwen2.5-7B-Instruct",
|
| 43 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 44 |
"u_math": [
|
| 45 |
43.27272727272727,
|
|
|
|
| 78 |
]
|
| 79 |
},
|
| 80 |
{
|
| 81 |
+
"model_name": "Qwen/Qwen2.5-72B-Instruct",
|
| 82 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 83 |
"u_math": [
|
| 84 |
51.18181818181819,
|
|
|
|
| 117 |
]
|
| 118 |
},
|
| 119 |
{
|
| 120 |
+
"model_name": "Qwen/Qwen2.5-Math-7B-Instruct",
|
| 121 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 122 |
"u_math": [
|
| 123 |
45.45454545454545,
|
|
|
|
| 156 |
]
|
| 157 |
},
|
| 158 |
{
|
| 159 |
+
"model_name": "Qwen/Qwen2.5-Math-72B-Instruct",
|
| 160 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 161 |
"u_math": [
|
| 162 |
59.45454545454546,
|
|
|
|
| 546 |
]
|
| 547 |
},
|
| 548 |
{
|
| 549 |
+
"model_name": "meta-llama/Llama-3.2-11B-Vision-Instruct",
|
| 550 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 551 |
"u_math": [
|
| 552 |
20.363636363636363,
|
|
|
|
| 663 |
]
|
| 664 |
},
|
| 665 |
{
|
| 666 |
+
"model_name": "Qwen/Qwen2.5-32B-Instruct",
|
| 667 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 668 |
"u_math": [
|
| 669 |
52.36363636363637,
|
|
|
|
| 936 |
]
|
| 937 |
},
|
| 938 |
{
|
| 939 |
+
"model_name": "meta-llama/Llama-3.1-70B-Instruct",
|
| 940 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 941 |
"u_math": [
|
| 942 |
34.27272727272727,
|
|
|
|
| 975 |
]
|
| 976 |
},
|
| 977 |
{
|
| 978 |
+
"model_name": "nvidia/Llama-3.1-Nemotron-70B-Instruct",
|
| 979 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 980 |
"u_math": [
|
| 981 |
42.54545454545455,
|
|
|
|
| 1014 |
]
|
| 1015 |
},
|
| 1016 |
{
|
| 1017 |
+
"model_name": "meta-llama/Llama-3.3-70B-Instruct",
|
| 1018 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 1019 |
"u_math": [
|
| 1020 |
44.72727272727273,
|
|
|
|
| 1053 |
]
|
| 1054 |
},
|
| 1055 |
{
|
| 1056 |
+
"model_name": "meta-llama/Llama-3.2-90B-Vision-Instruct",
|
| 1057 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 1058 |
"u_math": [
|
| 1059 |
37.18181818181818,
|
|
|
|
| 1092 |
]
|
| 1093 |
},
|
| 1094 |
{
|
| 1095 |
+
"model_name": "Qwen/Qwen2-VL-7B-Instruct",
|
| 1096 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 1097 |
"u_math": [
|
| 1098 |
26.272727272727277,
|
|
|
|
| 1131 |
]
|
| 1132 |
},
|
| 1133 |
{
|
| 1134 |
+
"model_name": "Qwen/Qwen2-VL-72B-Instruct",
|
| 1135 |
"judge_model_name": "gpt-4o-2024-08-06",
|
| 1136 |
"u_math": [
|
| 1137 |
41.81818181818181,
|