Commit
·
7555fc7
1
Parent(s):
49498de
moved data into persistent dataset
Browse files- data/arena-hard-v0.1/question.jsonl +0 -0
- data/arena_hard_battles.jsonl +0 -0
- data/bootstrapping_results.jsonl +0 -100
- data/leaderboard.json +0 -329
- data/leaderboard_logs/README.md +0 -3
- src/envs.py +2 -1
- src/gen/arena_hard_leaderboard_20240514.json +0 -329
- src/gen/arena_hard_leaderboard_20240515.json +0 -329
- src/gen/show_result.py +2 -2
- src/leaderboard/build_leaderboard.py +5 -12
data/arena-hard-v0.1/question.jsonl
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/arena_hard_battles.jsonl
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/bootstrapping_results.jsonl
DELETED
|
@@ -1,100 +0,0 @@
|
|
| 1 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.5644665503,"gigachat_lite":726.6208252619}
|
| 2 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.0709454157,"gigachat_lite":738.5741612323}
|
| 3 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":865.0434024226,"gigachat_lite":734.1011761886}
|
| 4 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":860.399655762,"gigachat_lite":729.5571514643}
|
| 5 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.1731508697,"gigachat_lite":728.758372467}
|
| 6 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.5326400531,"gigachat_lite":733.7900136425}
|
| 7 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":866.7819454641,"gigachat_lite":719.043685497}
|
| 8 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":858.5219875589,"gigachat_lite":714.8370789545}
|
| 9 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.4603125434,"gigachat_lite":725.8752720444}
|
| 10 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.8350548067,"gigachat_lite":715.266084892}
|
| 11 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":862.7609222876,"gigachat_lite":727.2017077065}
|
| 12 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":854.2414273092,"gigachat_lite":739.3798608124}
|
| 13 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":862.374147169,"gigachat_lite":719.6304899658}
|
| 14 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":863.1792770928,"gigachat_lite":734.0546251412}
|
| 15 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":865.2996605704,"gigachat_lite":718.4924449088}
|
| 16 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":864.8988771163,"gigachat_lite":721.0729415472}
|
| 17 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":867.0356240274,"gigachat_lite":738.5699274129}
|
| 18 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":871.6157440982,"gigachat_lite":723.7105361329}
|
| 19 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.9225322393,"gigachat_lite":728.2971721354}
|
| 20 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":864.7557130348,"gigachat_lite":737.8461934603}
|
| 21 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":853.284444198,"gigachat_lite":748.9971545908}
|
| 22 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":851.7087385877,"gigachat_lite":713.1462726999}
|
| 23 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":871.482425846,"gigachat_lite":720.2960317186}
|
| 24 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":866.6122634027,"gigachat_lite":727.2517234335}
|
| 25 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":852.7157509126,"gigachat_lite":694.2654473149}
|
| 26 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.7938560994,"gigachat_lite":735.6639839406}
|
| 27 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":874.1682886992,"gigachat_lite":730.5016731736}
|
| 28 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.4589887037,"gigachat_lite":734.4551919945}
|
| 29 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":850.0205093168,"gigachat_lite":728.8931636911}
|
| 30 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":875.7282859976,"gigachat_lite":717.6726330463}
|
| 31 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":865.3647024942,"gigachat_lite":733.3721052861}
|
| 32 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":856.1797064852,"gigachat_lite":725.7981758416}
|
| 33 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":867.6238850835,"gigachat_lite":731.0409312559}
|
| 34 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.7097671655,"gigachat_lite":715.3647090465}
|
| 35 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":874.4978660071,"gigachat_lite":737.7875979517}
|
| 36 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.5650653089,"gigachat_lite":729.3512200797}
|
| 37 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":890.8852955482,"gigachat_lite":715.9010959711}
|
| 38 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.6426165155,"gigachat_lite":722.2116159282}
|
| 39 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.3456423505,"gigachat_lite":724.6752254921}
|
| 40 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.4854945486,"gigachat_lite":718.5749125859}
|
| 41 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":880.1901418236,"gigachat_lite":723.0132896162}
|
| 42 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":849.6103242372,"gigachat_lite":732.3587564613}
|
| 43 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":871.0458800663,"gigachat_lite":740.6268654101}
|
| 44 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":877.4244267245,"gigachat_lite":724.6297632896}
|
| 45 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":875.3479511716,"gigachat_lite":743.701641735}
|
| 46 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.1269918194,"gigachat_lite":723.5736702859}
|
| 47 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.8015195801,"gigachat_lite":731.9752231934}
|
| 48 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":868.2750694028,"gigachat_lite":722.3929635211}
|
| 49 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":868.0957706924,"gigachat_lite":721.9705147906}
|
| 50 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":870.6012679715,"gigachat_lite":738.9123529498}
|
| 51 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":862.269673472,"gigachat_lite":733.7609432817}
|
| 52 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":864.2488571071,"gigachat_lite":724.1850017217}
|
| 53 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":874.1624601722,"gigachat_lite":727.8550112565}
|
| 54 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":863.1194231025,"gigachat_lite":731.3315308989}
|
| 55 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.1192986285,"gigachat_lite":722.5721295254}
|
| 56 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":862.0030926827,"gigachat_lite":729.8940208849}
|
| 57 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.5474187298,"gigachat_lite":735.9873637973}
|
| 58 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":880.5566205251,"gigachat_lite":730.6501947523}
|
| 59 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.7223684538,"gigachat_lite":702.8268457509}
|
| 60 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":874.9512628918,"gigachat_lite":732.6491227137}
|
| 61 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":858.7260910186,"gigachat_lite":736.225411771}
|
| 62 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":871.4133525673,"gigachat_lite":745.6156113918}
|
| 63 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":866.2715335516,"gigachat_lite":721.0912474577}
|
| 64 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.3256361213,"gigachat_lite":736.2254117629}
|
| 65 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":866.9022358038,"gigachat_lite":732.9674153867}
|
| 66 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":867.5601382523,"gigachat_lite":723.0966793643}
|
| 67 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":864.5272121008,"gigachat_lite":718.0704518208}
|
| 68 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":866.7782194777,"gigachat_lite":722.2852812675}
|
| 69 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":865.4086246736,"gigachat_lite":745.1185090985}
|
| 70 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":870.0314924292,"gigachat_lite":736.9690722951}
|
| 71 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.3587976891,"gigachat_lite":742.6306627437}
|
| 72 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":851.5511568095,"gigachat_lite":733.1555506911}
|
| 73 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":863.2094645624,"gigachat_lite":721.7491525609}
|
| 74 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.0624318318,"gigachat_lite":723.0795022704}
|
| 75 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":848.5397354473,"gigachat_lite":717.9478748234}
|
| 76 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.9432204946,"gigachat_lite":726.703609728}
|
| 77 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.2370229881,"gigachat_lite":725.3073844986}
|
| 78 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":878.2964116149,"gigachat_lite":722.2116156669}
|
| 79 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.9909782749,"gigachat_lite":720.1865370325}
|
| 80 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":871.9069179589,"gigachat_lite":731.5240457448}
|
| 81 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":860.2445059252,"gigachat_lite":737.0781670626}
|
| 82 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":850.4012745111,"gigachat_lite":708.356058121}
|
| 83 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":866.7922558028,"gigachat_lite":730.3511179714}
|
| 84 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":862.2175409513,"gigachat_lite":727.5035049316}
|
| 85 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":856.8494155845,"gigachat_lite":706.4191731996}
|
| 86 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":856.4641060792,"gigachat_lite":734.2333848904}
|
| 87 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":878.905415424,"gigachat_lite":736.5196621633}
|
| 88 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":851.8853822745,"gigachat_lite":724.9647865416}
|
| 89 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.2360763272,"gigachat_lite":718.7060814362}
|
| 90 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":869.1579952553,"gigachat_lite":722.5615781913}
|
| 91 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.2369472583,"gigachat_lite":731.6666527735}
|
| 92 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.2009612357,"gigachat_lite":722.1914533305}
|
| 93 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":876.2027799847,"gigachat_lite":719.1795542579}
|
| 94 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":849.6362696273,"gigachat_lite":730.3223324585}
|
| 95 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":865.1318475963,"gigachat_lite":724.1322488355}
|
| 96 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.8791178271,"gigachat_lite":734.6332090556}
|
| 97 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":873.3916447336,"gigachat_lite":716.1292305518}
|
| 98 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":867.1797828548,"gigachat_lite":726.7846008592}
|
| 99 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":865.1613697328,"gigachat_lite":717.027778133}
|
| 100 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":875.1689869302,"gigachat_lite":728.6562483681}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/leaderboard.json
DELETED
|
@@ -1,329 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"results":[
|
| 4 |
-
1000.0,
|
| 5 |
-
1000.0,
|
| 6 |
-
1000.0,
|
| 7 |
-
1000.0,
|
| 8 |
-
1000.0,
|
| 9 |
-
1000.0,
|
| 10 |
-
1000.0,
|
| 11 |
-
1000.0,
|
| 12 |
-
1000.0,
|
| 13 |
-
1000.0,
|
| 14 |
-
1000.0,
|
| 15 |
-
1000.0,
|
| 16 |
-
1000.0,
|
| 17 |
-
1000.0,
|
| 18 |
-
1000.0,
|
| 19 |
-
1000.0,
|
| 20 |
-
1000.0,
|
| 21 |
-
1000.0,
|
| 22 |
-
1000.0,
|
| 23 |
-
1000.0,
|
| 24 |
-
1000.0,
|
| 25 |
-
1000.0,
|
| 26 |
-
1000.0,
|
| 27 |
-
1000.0,
|
| 28 |
-
1000.0,
|
| 29 |
-
1000.0,
|
| 30 |
-
1000.0,
|
| 31 |
-
1000.0,
|
| 32 |
-
1000.0,
|
| 33 |
-
1000.0,
|
| 34 |
-
1000.0,
|
| 35 |
-
1000.0,
|
| 36 |
-
1000.0,
|
| 37 |
-
1000.0,
|
| 38 |
-
1000.0,
|
| 39 |
-
1000.0,
|
| 40 |
-
1000.0,
|
| 41 |
-
1000.0,
|
| 42 |
-
1000.0,
|
| 43 |
-
1000.0,
|
| 44 |
-
1000.0,
|
| 45 |
-
1000.0,
|
| 46 |
-
1000.0,
|
| 47 |
-
1000.0,
|
| 48 |
-
1000.0,
|
| 49 |
-
1000.0,
|
| 50 |
-
1000.0,
|
| 51 |
-
1000.0,
|
| 52 |
-
1000.0,
|
| 53 |
-
1000.0,
|
| 54 |
-
1000.0,
|
| 55 |
-
1000.0,
|
| 56 |
-
1000.0,
|
| 57 |
-
1000.0,
|
| 58 |
-
1000.0,
|
| 59 |
-
1000.0,
|
| 60 |
-
1000.0,
|
| 61 |
-
1000.0,
|
| 62 |
-
1000.0,
|
| 63 |
-
1000.0,
|
| 64 |
-
1000.0,
|
| 65 |
-
1000.0,
|
| 66 |
-
1000.0,
|
| 67 |
-
1000.0,
|
| 68 |
-
1000.0,
|
| 69 |
-
1000.0,
|
| 70 |
-
1000.0,
|
| 71 |
-
1000.0,
|
| 72 |
-
1000.0,
|
| 73 |
-
1000.0,
|
| 74 |
-
1000.0,
|
| 75 |
-
1000.0,
|
| 76 |
-
1000.0,
|
| 77 |
-
1000.0,
|
| 78 |
-
1000.0,
|
| 79 |
-
1000.0,
|
| 80 |
-
1000.0,
|
| 81 |
-
1000.0,
|
| 82 |
-
1000.0,
|
| 83 |
-
1000.0,
|
| 84 |
-
1000.0,
|
| 85 |
-
1000.0,
|
| 86 |
-
1000.0,
|
| 87 |
-
1000.0,
|
| 88 |
-
1000.0,
|
| 89 |
-
1000.0,
|
| 90 |
-
1000.0,
|
| 91 |
-
1000.0,
|
| 92 |
-
1000.0,
|
| 93 |
-
1000.0,
|
| 94 |
-
1000.0,
|
| 95 |
-
1000.0,
|
| 96 |
-
1000.0,
|
| 97 |
-
1000.0,
|
| 98 |
-
1000.0,
|
| 99 |
-
1000.0,
|
| 100 |
-
1000.0,
|
| 101 |
-
1000.0,
|
| 102 |
-
1000.0,
|
| 103 |
-
1000.0
|
| 104 |
-
],
|
| 105 |
-
"model":"gpt-3.5-turbo-0125",
|
| 106 |
-
"score":50.0,
|
| 107 |
-
"lower":50.0,
|
| 108 |
-
"upper":50.0,
|
| 109 |
-
"avg_tokens":0.0
|
| 110 |
-
},
|
| 111 |
-
{
|
| 112 |
-
"results":[
|
| 113 |
-
855.5644665503,
|
| 114 |
-
859.0709454157,
|
| 115 |
-
865.0434024226,
|
| 116 |
-
860.399655762,
|
| 117 |
-
855.1731508697,
|
| 118 |
-
855.5326400531,
|
| 119 |
-
866.7819454641,
|
| 120 |
-
858.5219875589,
|
| 121 |
-
861.4603125434,
|
| 122 |
-
859.8350548067,
|
| 123 |
-
862.7609222876,
|
| 124 |
-
854.2414273092,
|
| 125 |
-
862.374147169,
|
| 126 |
-
863.1792770928,
|
| 127 |
-
865.2996605704,
|
| 128 |
-
864.8988771163,
|
| 129 |
-
867.0356240274,
|
| 130 |
-
871.6157440982,
|
| 131 |
-
861.9225322393,
|
| 132 |
-
864.7557130348,
|
| 133 |
-
853.284444198,
|
| 134 |
-
851.7087385877,
|
| 135 |
-
871.482425846,
|
| 136 |
-
866.6122634027,
|
| 137 |
-
852.7157509126,
|
| 138 |
-
859.7938560994,
|
| 139 |
-
874.1682886992,
|
| 140 |
-
855.4589887037,
|
| 141 |
-
850.0205093168,
|
| 142 |
-
875.7282859976,
|
| 143 |
-
865.3647024942,
|
| 144 |
-
856.1797064852,
|
| 145 |
-
867.6238850835,
|
| 146 |
-
857.7097671655,
|
| 147 |
-
874.4978660071,
|
| 148 |
-
857.5650653089,
|
| 149 |
-
890.8852955482,
|
| 150 |
-
855.6426165155,
|
| 151 |
-
859.3456423505,
|
| 152 |
-
857.4854945486,
|
| 153 |
-
880.1901418236,
|
| 154 |
-
849.6103242372,
|
| 155 |
-
871.0458800663,
|
| 156 |
-
877.4244267245,
|
| 157 |
-
875.3479511716,
|
| 158 |
-
859.1269918194,
|
| 159 |
-
857.8015195801,
|
| 160 |
-
868.2750694028,
|
| 161 |
-
868.0957706924,
|
| 162 |
-
870.6012679715,
|
| 163 |
-
862.269673472,
|
| 164 |
-
864.2488571071,
|
| 165 |
-
874.1624601722,
|
| 166 |
-
863.1194231025,
|
| 167 |
-
857.1192986285,
|
| 168 |
-
862.0030926827,
|
| 169 |
-
861.5474187298,
|
| 170 |
-
880.5566205251,
|
| 171 |
-
861.7223684538,
|
| 172 |
-
874.9512628918,
|
| 173 |
-
858.7260910186,
|
| 174 |
-
871.4133525673,
|
| 175 |
-
866.2715335516,
|
| 176 |
-
861.3256361213,
|
| 177 |
-
866.9022358038,
|
| 178 |
-
867.5601382523,
|
| 179 |
-
864.5272121008,
|
| 180 |
-
866.7782194777,
|
| 181 |
-
865.4086246736,
|
| 182 |
-
870.0314924292,
|
| 183 |
-
855.3587976891,
|
| 184 |
-
851.5511568095,
|
| 185 |
-
863.2094645624,
|
| 186 |
-
861.0624318318,
|
| 187 |
-
848.5397354473,
|
| 188 |
-
857.9432204946,
|
| 189 |
-
861.2370229881,
|
| 190 |
-
878.2964116149,
|
| 191 |
-
857.9909782749,
|
| 192 |
-
871.9069179589,
|
| 193 |
-
860.2445059252,
|
| 194 |
-
850.4012745111,
|
| 195 |
-
866.7922558028,
|
| 196 |
-
862.2175409513,
|
| 197 |
-
856.8494155845,
|
| 198 |
-
856.4641060792,
|
| 199 |
-
878.905415424,
|
| 200 |
-
851.8853822745,
|
| 201 |
-
859.2360763272,
|
| 202 |
-
869.1579952553,
|
| 203 |
-
855.2369472583,
|
| 204 |
-
859.2009612357,
|
| 205 |
-
876.2027799847,
|
| 206 |
-
849.6362696273,
|
| 207 |
-
865.1318475963,
|
| 208 |
-
855.8791178271,
|
| 209 |
-
873.3916447336,
|
| 210 |
-
867.1797828548,
|
| 211 |
-
865.1613697328,
|
| 212 |
-
875.1689869302
|
| 213 |
-
],
|
| 214 |
-
"model":"gigachat_pro",
|
| 215 |
-
"score":31.37,
|
| 216 |
-
"lower":29.64,
|
| 217 |
-
"upper":33.33,
|
| 218 |
-
"avg_tokens":0.0
|
| 219 |
-
},
|
| 220 |
-
{
|
| 221 |
-
"results":[
|
| 222 |
-
726.6208252619,
|
| 223 |
-
738.5741612323,
|
| 224 |
-
734.1011761886,
|
| 225 |
-
729.5571514643,
|
| 226 |
-
728.758372467,
|
| 227 |
-
733.7900136425,
|
| 228 |
-
719.043685497,
|
| 229 |
-
714.8370789545,
|
| 230 |
-
725.8752720444,
|
| 231 |
-
715.266084892,
|
| 232 |
-
727.2017077065,
|
| 233 |
-
739.3798608124,
|
| 234 |
-
719.6304899658,
|
| 235 |
-
734.0546251412,
|
| 236 |
-
718.4924449088,
|
| 237 |
-
721.0729415472,
|
| 238 |
-
738.5699274129,
|
| 239 |
-
723.7105361329,
|
| 240 |
-
728.2971721354,
|
| 241 |
-
737.8461934603,
|
| 242 |
-
748.9971545908,
|
| 243 |
-
713.1462726999,
|
| 244 |
-
720.2960317186,
|
| 245 |
-
727.2517234335,
|
| 246 |
-
694.2654473149,
|
| 247 |
-
735.6639839406,
|
| 248 |
-
730.5016731736,
|
| 249 |
-
734.4551919945,
|
| 250 |
-
728.8931636911,
|
| 251 |
-
717.6726330463,
|
| 252 |
-
733.3721052861,
|
| 253 |
-
725.7981758416,
|
| 254 |
-
731.0409312559,
|
| 255 |
-
715.3647090465,
|
| 256 |
-
737.7875979517,
|
| 257 |
-
729.3512200797,
|
| 258 |
-
715.9010959711,
|
| 259 |
-
722.2116159282,
|
| 260 |
-
724.6752254921,
|
| 261 |
-
718.5749125859,
|
| 262 |
-
723.0132896162,
|
| 263 |
-
732.3587564613,
|
| 264 |
-
740.6268654101,
|
| 265 |
-
724.6297632896,
|
| 266 |
-
743.701641735,
|
| 267 |
-
723.5736702859,
|
| 268 |
-
731.9752231934,
|
| 269 |
-
722.3929635211,
|
| 270 |
-
721.9705147906,
|
| 271 |
-
738.9123529498,
|
| 272 |
-
733.7609432817,
|
| 273 |
-
724.1850017217,
|
| 274 |
-
727.8550112565,
|
| 275 |
-
731.3315308989,
|
| 276 |
-
722.5721295254,
|
| 277 |
-
729.8940208849,
|
| 278 |
-
735.9873637973,
|
| 279 |
-
730.6501947523,
|
| 280 |
-
702.8268457509,
|
| 281 |
-
732.6491227137,
|
| 282 |
-
736.225411771,
|
| 283 |
-
745.6156113918,
|
| 284 |
-
721.0912474577,
|
| 285 |
-
736.2254117629,
|
| 286 |
-
732.9674153867,
|
| 287 |
-
723.0966793643,
|
| 288 |
-
718.0704518208,
|
| 289 |
-
722.2852812675,
|
| 290 |
-
745.1185090985,
|
| 291 |
-
736.9690722951,
|
| 292 |
-
742.6306627437,
|
| 293 |
-
733.1555506911,
|
| 294 |
-
721.7491525609,
|
| 295 |
-
723.0795022704,
|
| 296 |
-
717.9478748234,
|
| 297 |
-
726.703609728,
|
| 298 |
-
725.3073844986,
|
| 299 |
-
722.2116156669,
|
| 300 |
-
720.1865370325,
|
| 301 |
-
731.5240457448,
|
| 302 |
-
737.0781670626,
|
| 303 |
-
708.356058121,
|
| 304 |
-
730.3511179714,
|
| 305 |
-
727.5035049316,
|
| 306 |
-
706.4191731996,
|
| 307 |
-
734.2333848904,
|
| 308 |
-
736.5196621633,
|
| 309 |
-
724.9647865416,
|
| 310 |
-
718.7060814362,
|
| 311 |
-
722.5615781913,
|
| 312 |
-
731.6666527735,
|
| 313 |
-
722.1914533305,
|
| 314 |
-
719.1795542579,
|
| 315 |
-
730.3223324585,
|
| 316 |
-
724.1322488355,
|
| 317 |
-
734.6332090556,
|
| 318 |
-
716.1292305518,
|
| 319 |
-
726.7846008592,
|
| 320 |
-
717.027778133,
|
| 321 |
-
728.6562483681
|
| 322 |
-
],
|
| 323 |
-
"model":"gigachat_lite",
|
| 324 |
-
"score":17.2,
|
| 325 |
-
"lower":15.65,
|
| 326 |
-
"upper":18.68,
|
| 327 |
-
"avg_tokens":276.0
|
| 328 |
-
}
|
| 329 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/leaderboard_logs/README.md
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
# Логи генерации leaderboard
|
| 2 |
-
Сюда из space отправляются после генерации
|
| 3 |
-
Сохраняется только последний за день
|
|
|
|
|
|
|
|
|
|
|
|
src/envs.py
CHANGED
|
@@ -27,7 +27,8 @@ if not os.access(HF_HOME, os.W_OK):
|
|
| 27 |
else:
|
| 28 |
print("Write access confirmed for HF_HOME")
|
| 29 |
|
| 30 |
-
|
|
|
|
| 31 |
|
| 32 |
RESET_JUDGEMENT_ENV = "RESET_JUDGEMENT"
|
| 33 |
|
|
|
|
| 27 |
else:
|
| 28 |
print("Write access confirmed for HF_HOME")
|
| 29 |
|
| 30 |
+
DATA_PATH = os.path.join(HF_HOME, "data")
|
| 31 |
+
DATA_ARENA_PATH = os.path.join(DATA_PATH, "arena-hard-v0.1")
|
| 32 |
|
| 33 |
RESET_JUDGEMENT_ENV = "RESET_JUDGEMENT"
|
| 34 |
|
src/gen/arena_hard_leaderboard_20240514.json
DELETED
|
@@ -1,329 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"results":[
|
| 4 |
-
1000.0,
|
| 5 |
-
1000.0,
|
| 6 |
-
1000.0,
|
| 7 |
-
1000.0,
|
| 8 |
-
1000.0,
|
| 9 |
-
1000.0,
|
| 10 |
-
1000.0,
|
| 11 |
-
1000.0,
|
| 12 |
-
1000.0,
|
| 13 |
-
1000.0,
|
| 14 |
-
1000.0,
|
| 15 |
-
1000.0,
|
| 16 |
-
1000.0,
|
| 17 |
-
1000.0,
|
| 18 |
-
1000.0,
|
| 19 |
-
1000.0,
|
| 20 |
-
1000.0,
|
| 21 |
-
1000.0,
|
| 22 |
-
1000.0,
|
| 23 |
-
1000.0,
|
| 24 |
-
1000.0,
|
| 25 |
-
1000.0,
|
| 26 |
-
1000.0,
|
| 27 |
-
1000.0,
|
| 28 |
-
1000.0,
|
| 29 |
-
1000.0,
|
| 30 |
-
1000.0,
|
| 31 |
-
1000.0,
|
| 32 |
-
1000.0,
|
| 33 |
-
1000.0,
|
| 34 |
-
1000.0,
|
| 35 |
-
1000.0,
|
| 36 |
-
1000.0,
|
| 37 |
-
1000.0,
|
| 38 |
-
1000.0,
|
| 39 |
-
1000.0,
|
| 40 |
-
1000.0,
|
| 41 |
-
1000.0,
|
| 42 |
-
1000.0,
|
| 43 |
-
1000.0,
|
| 44 |
-
1000.0,
|
| 45 |
-
1000.0,
|
| 46 |
-
1000.0,
|
| 47 |
-
1000.0,
|
| 48 |
-
1000.0,
|
| 49 |
-
1000.0,
|
| 50 |
-
1000.0,
|
| 51 |
-
1000.0,
|
| 52 |
-
1000.0,
|
| 53 |
-
1000.0,
|
| 54 |
-
1000.0,
|
| 55 |
-
1000.0,
|
| 56 |
-
1000.0,
|
| 57 |
-
1000.0,
|
| 58 |
-
1000.0,
|
| 59 |
-
1000.0,
|
| 60 |
-
1000.0,
|
| 61 |
-
1000.0,
|
| 62 |
-
1000.0,
|
| 63 |
-
1000.0,
|
| 64 |
-
1000.0,
|
| 65 |
-
1000.0,
|
| 66 |
-
1000.0,
|
| 67 |
-
1000.0,
|
| 68 |
-
1000.0,
|
| 69 |
-
1000.0,
|
| 70 |
-
1000.0,
|
| 71 |
-
1000.0,
|
| 72 |
-
1000.0,
|
| 73 |
-
1000.0,
|
| 74 |
-
1000.0,
|
| 75 |
-
1000.0,
|
| 76 |
-
1000.0,
|
| 77 |
-
1000.0,
|
| 78 |
-
1000.0,
|
| 79 |
-
1000.0,
|
| 80 |
-
1000.0,
|
| 81 |
-
1000.0,
|
| 82 |
-
1000.0,
|
| 83 |
-
1000.0,
|
| 84 |
-
1000.0,
|
| 85 |
-
1000.0,
|
| 86 |
-
1000.0,
|
| 87 |
-
1000.0,
|
| 88 |
-
1000.0,
|
| 89 |
-
1000.0,
|
| 90 |
-
1000.0,
|
| 91 |
-
1000.0,
|
| 92 |
-
1000.0,
|
| 93 |
-
1000.0,
|
| 94 |
-
1000.0,
|
| 95 |
-
1000.0,
|
| 96 |
-
1000.0,
|
| 97 |
-
1000.0,
|
| 98 |
-
1000.0,
|
| 99 |
-
1000.0,
|
| 100 |
-
1000.0,
|
| 101 |
-
1000.0,
|
| 102 |
-
1000.0,
|
| 103 |
-
1000.0
|
| 104 |
-
],
|
| 105 |
-
"model":"gpt-3.5-turbo-0125",
|
| 106 |
-
"score":50.0,
|
| 107 |
-
"lower":50.0,
|
| 108 |
-
"upper":50.0,
|
| 109 |
-
"avg_tokens":0.0
|
| 110 |
-
},
|
| 111 |
-
{
|
| 112 |
-
"results":[
|
| 113 |
-
855.5644665503,
|
| 114 |
-
859.0709454157,
|
| 115 |
-
865.0434024226,
|
| 116 |
-
860.399655762,
|
| 117 |
-
855.1731508697,
|
| 118 |
-
855.5326400531,
|
| 119 |
-
866.7819454641,
|
| 120 |
-
858.5219875589,
|
| 121 |
-
861.4603125434,
|
| 122 |
-
859.8350548067,
|
| 123 |
-
862.7609222876,
|
| 124 |
-
854.2414273092,
|
| 125 |
-
862.374147169,
|
| 126 |
-
863.1792770928,
|
| 127 |
-
865.2996605704,
|
| 128 |
-
864.8988771163,
|
| 129 |
-
867.0356240274,
|
| 130 |
-
871.6157440982,
|
| 131 |
-
861.9225322393,
|
| 132 |
-
864.7557130348,
|
| 133 |
-
853.284444198,
|
| 134 |
-
851.7087385877,
|
| 135 |
-
871.482425846,
|
| 136 |
-
866.6122634027,
|
| 137 |
-
852.7157509126,
|
| 138 |
-
859.7938560994,
|
| 139 |
-
874.1682886992,
|
| 140 |
-
855.4589887037,
|
| 141 |
-
850.0205093168,
|
| 142 |
-
875.7282859976,
|
| 143 |
-
865.3647024942,
|
| 144 |
-
856.1797064852,
|
| 145 |
-
867.6238850835,
|
| 146 |
-
857.7097671655,
|
| 147 |
-
874.4978660071,
|
| 148 |
-
857.5650653089,
|
| 149 |
-
890.8852955482,
|
| 150 |
-
855.6426165155,
|
| 151 |
-
859.3456423505,
|
| 152 |
-
857.4854945486,
|
| 153 |
-
880.1901418236,
|
| 154 |
-
849.6103242372,
|
| 155 |
-
871.0458800663,
|
| 156 |
-
877.4244267245,
|
| 157 |
-
875.3479511716,
|
| 158 |
-
859.1269918194,
|
| 159 |
-
857.8015195801,
|
| 160 |
-
868.2750694028,
|
| 161 |
-
868.0957706924,
|
| 162 |
-
870.6012679715,
|
| 163 |
-
862.269673472,
|
| 164 |
-
864.2488571071,
|
| 165 |
-
874.1624601722,
|
| 166 |
-
863.1194231025,
|
| 167 |
-
857.1192986285,
|
| 168 |
-
862.0030926827,
|
| 169 |
-
861.5474187298,
|
| 170 |
-
880.5566205251,
|
| 171 |
-
861.7223684538,
|
| 172 |
-
874.9512628918,
|
| 173 |
-
858.7260910186,
|
| 174 |
-
871.4133525673,
|
| 175 |
-
866.2715335516,
|
| 176 |
-
861.3256361213,
|
| 177 |
-
866.9022358038,
|
| 178 |
-
867.5601382523,
|
| 179 |
-
864.5272121008,
|
| 180 |
-
866.7782194777,
|
| 181 |
-
865.4086246736,
|
| 182 |
-
870.0314924292,
|
| 183 |
-
855.3587976891,
|
| 184 |
-
851.5511568095,
|
| 185 |
-
863.2094645624,
|
| 186 |
-
861.0624318318,
|
| 187 |
-
848.5397354473,
|
| 188 |
-
857.9432204946,
|
| 189 |
-
861.2370229881,
|
| 190 |
-
878.2964116149,
|
| 191 |
-
857.9909782749,
|
| 192 |
-
871.9069179589,
|
| 193 |
-
860.2445059252,
|
| 194 |
-
850.4012745111,
|
| 195 |
-
866.7922558028,
|
| 196 |
-
862.2175409513,
|
| 197 |
-
856.8494155845,
|
| 198 |
-
856.4641060792,
|
| 199 |
-
878.905415424,
|
| 200 |
-
851.8853822745,
|
| 201 |
-
859.2360763272,
|
| 202 |
-
869.1579952553,
|
| 203 |
-
855.2369472583,
|
| 204 |
-
859.2009612357,
|
| 205 |
-
876.2027799847,
|
| 206 |
-
849.6362696273,
|
| 207 |
-
865.1318475963,
|
| 208 |
-
855.8791178271,
|
| 209 |
-
873.3916447336,
|
| 210 |
-
867.1797828548,
|
| 211 |
-
865.1613697328,
|
| 212 |
-
875.1689869302
|
| 213 |
-
],
|
| 214 |
-
"model":"gigachat_pro",
|
| 215 |
-
"score":31.37,
|
| 216 |
-
"lower":29.64,
|
| 217 |
-
"upper":33.33,
|
| 218 |
-
"avg_tokens":0.0
|
| 219 |
-
},
|
| 220 |
-
{
|
| 221 |
-
"results":[
|
| 222 |
-
726.6208252619,
|
| 223 |
-
738.5741612323,
|
| 224 |
-
734.1011761886,
|
| 225 |
-
729.5571514643,
|
| 226 |
-
728.758372467,
|
| 227 |
-
733.7900136425,
|
| 228 |
-
719.043685497,
|
| 229 |
-
714.8370789545,
|
| 230 |
-
725.8752720444,
|
| 231 |
-
715.266084892,
|
| 232 |
-
727.2017077065,
|
| 233 |
-
739.3798608124,
|
| 234 |
-
719.6304899658,
|
| 235 |
-
734.0546251412,
|
| 236 |
-
718.4924449088,
|
| 237 |
-
721.0729415472,
|
| 238 |
-
738.5699274129,
|
| 239 |
-
723.7105361329,
|
| 240 |
-
728.2971721354,
|
| 241 |
-
737.8461934603,
|
| 242 |
-
748.9971545908,
|
| 243 |
-
713.1462726999,
|
| 244 |
-
720.2960317186,
|
| 245 |
-
727.2517234335,
|
| 246 |
-
694.2654473149,
|
| 247 |
-
735.6639839406,
|
| 248 |
-
730.5016731736,
|
| 249 |
-
734.4551919945,
|
| 250 |
-
728.8931636911,
|
| 251 |
-
717.6726330463,
|
| 252 |
-
733.3721052861,
|
| 253 |
-
725.7981758416,
|
| 254 |
-
731.0409312559,
|
| 255 |
-
715.3647090465,
|
| 256 |
-
737.7875979517,
|
| 257 |
-
729.3512200797,
|
| 258 |
-
715.9010959711,
|
| 259 |
-
722.2116159282,
|
| 260 |
-
724.6752254921,
|
| 261 |
-
718.5749125859,
|
| 262 |
-
723.0132896162,
|
| 263 |
-
732.3587564613,
|
| 264 |
-
740.6268654101,
|
| 265 |
-
724.6297632896,
|
| 266 |
-
743.701641735,
|
| 267 |
-
723.5736702859,
|
| 268 |
-
731.9752231934,
|
| 269 |
-
722.3929635211,
|
| 270 |
-
721.9705147906,
|
| 271 |
-
738.9123529498,
|
| 272 |
-
733.7609432817,
|
| 273 |
-
724.1850017217,
|
| 274 |
-
727.8550112565,
|
| 275 |
-
731.3315308989,
|
| 276 |
-
722.5721295254,
|
| 277 |
-
729.8940208849,
|
| 278 |
-
735.9873637973,
|
| 279 |
-
730.6501947523,
|
| 280 |
-
702.8268457509,
|
| 281 |
-
732.6491227137,
|
| 282 |
-
736.225411771,
|
| 283 |
-
745.6156113918,
|
| 284 |
-
721.0912474577,
|
| 285 |
-
736.2254117629,
|
| 286 |
-
732.9674153867,
|
| 287 |
-
723.0966793643,
|
| 288 |
-
718.0704518208,
|
| 289 |
-
722.2852812675,
|
| 290 |
-
745.1185090985,
|
| 291 |
-
736.9690722951,
|
| 292 |
-
742.6306627437,
|
| 293 |
-
733.1555506911,
|
| 294 |
-
721.7491525609,
|
| 295 |
-
723.0795022704,
|
| 296 |
-
717.9478748234,
|
| 297 |
-
726.703609728,
|
| 298 |
-
725.3073844986,
|
| 299 |
-
722.2116156669,
|
| 300 |
-
720.1865370325,
|
| 301 |
-
731.5240457448,
|
| 302 |
-
737.0781670626,
|
| 303 |
-
708.356058121,
|
| 304 |
-
730.3511179714,
|
| 305 |
-
727.5035049316,
|
| 306 |
-
706.4191731996,
|
| 307 |
-
734.2333848904,
|
| 308 |
-
736.5196621633,
|
| 309 |
-
724.9647865416,
|
| 310 |
-
718.7060814362,
|
| 311 |
-
722.5615781913,
|
| 312 |
-
731.6666527735,
|
| 313 |
-
722.1914533305,
|
| 314 |
-
719.1795542579,
|
| 315 |
-
730.3223324585,
|
| 316 |
-
724.1322488355,
|
| 317 |
-
734.6332090556,
|
| 318 |
-
716.1292305518,
|
| 319 |
-
726.7846008592,
|
| 320 |
-
717.027778133,
|
| 321 |
-
728.6562483681
|
| 322 |
-
],
|
| 323 |
-
"model":"gigachat_lite",
|
| 324 |
-
"score":17.2,
|
| 325 |
-
"lower":15.65,
|
| 326 |
-
"upper":18.68,
|
| 327 |
-
"avg_tokens":276.0
|
| 328 |
-
}
|
| 329 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/gen/arena_hard_leaderboard_20240515.json
DELETED
|
@@ -1,329 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"results":[
|
| 4 |
-
1000.0,
|
| 5 |
-
1000.0,
|
| 6 |
-
1000.0,
|
| 7 |
-
1000.0,
|
| 8 |
-
1000.0,
|
| 9 |
-
1000.0,
|
| 10 |
-
1000.0,
|
| 11 |
-
1000.0,
|
| 12 |
-
1000.0,
|
| 13 |
-
1000.0,
|
| 14 |
-
1000.0,
|
| 15 |
-
1000.0,
|
| 16 |
-
1000.0,
|
| 17 |
-
1000.0,
|
| 18 |
-
1000.0,
|
| 19 |
-
1000.0,
|
| 20 |
-
1000.0,
|
| 21 |
-
1000.0,
|
| 22 |
-
1000.0,
|
| 23 |
-
1000.0,
|
| 24 |
-
1000.0,
|
| 25 |
-
1000.0,
|
| 26 |
-
1000.0,
|
| 27 |
-
1000.0,
|
| 28 |
-
1000.0,
|
| 29 |
-
1000.0,
|
| 30 |
-
1000.0,
|
| 31 |
-
1000.0,
|
| 32 |
-
1000.0,
|
| 33 |
-
1000.0,
|
| 34 |
-
1000.0,
|
| 35 |
-
1000.0,
|
| 36 |
-
1000.0,
|
| 37 |
-
1000.0,
|
| 38 |
-
1000.0,
|
| 39 |
-
1000.0,
|
| 40 |
-
1000.0,
|
| 41 |
-
1000.0,
|
| 42 |
-
1000.0,
|
| 43 |
-
1000.0,
|
| 44 |
-
1000.0,
|
| 45 |
-
1000.0,
|
| 46 |
-
1000.0,
|
| 47 |
-
1000.0,
|
| 48 |
-
1000.0,
|
| 49 |
-
1000.0,
|
| 50 |
-
1000.0,
|
| 51 |
-
1000.0,
|
| 52 |
-
1000.0,
|
| 53 |
-
1000.0,
|
| 54 |
-
1000.0,
|
| 55 |
-
1000.0,
|
| 56 |
-
1000.0,
|
| 57 |
-
1000.0,
|
| 58 |
-
1000.0,
|
| 59 |
-
1000.0,
|
| 60 |
-
1000.0,
|
| 61 |
-
1000.0,
|
| 62 |
-
1000.0,
|
| 63 |
-
1000.0,
|
| 64 |
-
1000.0,
|
| 65 |
-
1000.0,
|
| 66 |
-
1000.0,
|
| 67 |
-
1000.0,
|
| 68 |
-
1000.0,
|
| 69 |
-
1000.0,
|
| 70 |
-
1000.0,
|
| 71 |
-
1000.0,
|
| 72 |
-
1000.0,
|
| 73 |
-
1000.0,
|
| 74 |
-
1000.0,
|
| 75 |
-
1000.0,
|
| 76 |
-
1000.0,
|
| 77 |
-
1000.0,
|
| 78 |
-
1000.0,
|
| 79 |
-
1000.0,
|
| 80 |
-
1000.0,
|
| 81 |
-
1000.0,
|
| 82 |
-
1000.0,
|
| 83 |
-
1000.0,
|
| 84 |
-
1000.0,
|
| 85 |
-
1000.0,
|
| 86 |
-
1000.0,
|
| 87 |
-
1000.0,
|
| 88 |
-
1000.0,
|
| 89 |
-
1000.0,
|
| 90 |
-
1000.0,
|
| 91 |
-
1000.0,
|
| 92 |
-
1000.0,
|
| 93 |
-
1000.0,
|
| 94 |
-
1000.0,
|
| 95 |
-
1000.0,
|
| 96 |
-
1000.0,
|
| 97 |
-
1000.0,
|
| 98 |
-
1000.0,
|
| 99 |
-
1000.0,
|
| 100 |
-
1000.0,
|
| 101 |
-
1000.0,
|
| 102 |
-
1000.0,
|
| 103 |
-
1000.0
|
| 104 |
-
],
|
| 105 |
-
"model":"gpt-3.5-turbo-0125",
|
| 106 |
-
"score":50.0,
|
| 107 |
-
"lower":50.0,
|
| 108 |
-
"upper":50.0,
|
| 109 |
-
"avg_tokens":0.0
|
| 110 |
-
},
|
| 111 |
-
{
|
| 112 |
-
"results":[
|
| 113 |
-
855.5644665503,
|
| 114 |
-
859.0709454157,
|
| 115 |
-
865.0434024226,
|
| 116 |
-
860.399655762,
|
| 117 |
-
855.1731508697,
|
| 118 |
-
855.5326400531,
|
| 119 |
-
866.7819454641,
|
| 120 |
-
858.5219875589,
|
| 121 |
-
861.4603125434,
|
| 122 |
-
859.8350548067,
|
| 123 |
-
862.7609222876,
|
| 124 |
-
854.2414273092,
|
| 125 |
-
862.374147169,
|
| 126 |
-
863.1792770928,
|
| 127 |
-
865.2996605704,
|
| 128 |
-
864.8988771163,
|
| 129 |
-
867.0356240274,
|
| 130 |
-
871.6157440982,
|
| 131 |
-
861.9225322393,
|
| 132 |
-
864.7557130348,
|
| 133 |
-
853.284444198,
|
| 134 |
-
851.7087385877,
|
| 135 |
-
871.482425846,
|
| 136 |
-
866.6122634027,
|
| 137 |
-
852.7157509126,
|
| 138 |
-
859.7938560994,
|
| 139 |
-
874.1682886992,
|
| 140 |
-
855.4589887037,
|
| 141 |
-
850.0205093168,
|
| 142 |
-
875.7282859976,
|
| 143 |
-
865.3647024942,
|
| 144 |
-
856.1797064852,
|
| 145 |
-
867.6238850835,
|
| 146 |
-
857.7097671655,
|
| 147 |
-
874.4978660071,
|
| 148 |
-
857.5650653089,
|
| 149 |
-
890.8852955482,
|
| 150 |
-
855.6426165155,
|
| 151 |
-
859.3456423505,
|
| 152 |
-
857.4854945486,
|
| 153 |
-
880.1901418236,
|
| 154 |
-
849.6103242372,
|
| 155 |
-
871.0458800663,
|
| 156 |
-
877.4244267245,
|
| 157 |
-
875.3479511716,
|
| 158 |
-
859.1269918194,
|
| 159 |
-
857.8015195801,
|
| 160 |
-
868.2750694028,
|
| 161 |
-
868.0957706924,
|
| 162 |
-
870.6012679715,
|
| 163 |
-
862.269673472,
|
| 164 |
-
864.2488571071,
|
| 165 |
-
874.1624601722,
|
| 166 |
-
863.1194231025,
|
| 167 |
-
857.1192986285,
|
| 168 |
-
862.0030926827,
|
| 169 |
-
861.5474187298,
|
| 170 |
-
880.5566205251,
|
| 171 |
-
861.7223684538,
|
| 172 |
-
874.9512628918,
|
| 173 |
-
858.7260910186,
|
| 174 |
-
871.4133525673,
|
| 175 |
-
866.2715335516,
|
| 176 |
-
861.3256361213,
|
| 177 |
-
866.9022358038,
|
| 178 |
-
867.5601382523,
|
| 179 |
-
864.5272121008,
|
| 180 |
-
866.7782194777,
|
| 181 |
-
865.4086246736,
|
| 182 |
-
870.0314924292,
|
| 183 |
-
855.3587976891,
|
| 184 |
-
851.5511568095,
|
| 185 |
-
863.2094645624,
|
| 186 |
-
861.0624318318,
|
| 187 |
-
848.5397354473,
|
| 188 |
-
857.9432204946,
|
| 189 |
-
861.2370229881,
|
| 190 |
-
878.2964116149,
|
| 191 |
-
857.9909782749,
|
| 192 |
-
871.9069179589,
|
| 193 |
-
860.2445059252,
|
| 194 |
-
850.4012745111,
|
| 195 |
-
866.7922558028,
|
| 196 |
-
862.2175409513,
|
| 197 |
-
856.8494155845,
|
| 198 |
-
856.4641060792,
|
| 199 |
-
878.905415424,
|
| 200 |
-
851.8853822745,
|
| 201 |
-
859.2360763272,
|
| 202 |
-
869.1579952553,
|
| 203 |
-
855.2369472583,
|
| 204 |
-
859.2009612357,
|
| 205 |
-
876.2027799847,
|
| 206 |
-
849.6362696273,
|
| 207 |
-
865.1318475963,
|
| 208 |
-
855.8791178271,
|
| 209 |
-
873.3916447336,
|
| 210 |
-
867.1797828548,
|
| 211 |
-
865.1613697328,
|
| 212 |
-
875.1689869302
|
| 213 |
-
],
|
| 214 |
-
"model":"gigachat_pro",
|
| 215 |
-
"score":31.37,
|
| 216 |
-
"lower":29.64,
|
| 217 |
-
"upper":33.33,
|
| 218 |
-
"avg_tokens":0.0
|
| 219 |
-
},
|
| 220 |
-
{
|
| 221 |
-
"results":[
|
| 222 |
-
726.6208252619,
|
| 223 |
-
738.5741612323,
|
| 224 |
-
734.1011761886,
|
| 225 |
-
729.5571514643,
|
| 226 |
-
728.758372467,
|
| 227 |
-
733.7900136425,
|
| 228 |
-
719.043685497,
|
| 229 |
-
714.8370789545,
|
| 230 |
-
725.8752720444,
|
| 231 |
-
715.266084892,
|
| 232 |
-
727.2017077065,
|
| 233 |
-
739.3798608124,
|
| 234 |
-
719.6304899658,
|
| 235 |
-
734.0546251412,
|
| 236 |
-
718.4924449088,
|
| 237 |
-
721.0729415472,
|
| 238 |
-
738.5699274129,
|
| 239 |
-
723.7105361329,
|
| 240 |
-
728.2971721354,
|
| 241 |
-
737.8461934603,
|
| 242 |
-
748.9971545908,
|
| 243 |
-
713.1462726999,
|
| 244 |
-
720.2960317186,
|
| 245 |
-
727.2517234335,
|
| 246 |
-
694.2654473149,
|
| 247 |
-
735.6639839406,
|
| 248 |
-
730.5016731736,
|
| 249 |
-
734.4551919945,
|
| 250 |
-
728.8931636911,
|
| 251 |
-
717.6726330463,
|
| 252 |
-
733.3721052861,
|
| 253 |
-
725.7981758416,
|
| 254 |
-
731.0409312559,
|
| 255 |
-
715.3647090465,
|
| 256 |
-
737.7875979517,
|
| 257 |
-
729.3512200797,
|
| 258 |
-
715.9010959711,
|
| 259 |
-
722.2116159282,
|
| 260 |
-
724.6752254921,
|
| 261 |
-
718.5749125859,
|
| 262 |
-
723.0132896162,
|
| 263 |
-
732.3587564613,
|
| 264 |
-
740.6268654101,
|
| 265 |
-
724.6297632896,
|
| 266 |
-
743.701641735,
|
| 267 |
-
723.5736702859,
|
| 268 |
-
731.9752231934,
|
| 269 |
-
722.3929635211,
|
| 270 |
-
721.9705147906,
|
| 271 |
-
738.9123529498,
|
| 272 |
-
733.7609432817,
|
| 273 |
-
724.1850017217,
|
| 274 |
-
727.8550112565,
|
| 275 |
-
731.3315308989,
|
| 276 |
-
722.5721295254,
|
| 277 |
-
729.8940208849,
|
| 278 |
-
735.9873637973,
|
| 279 |
-
730.6501947523,
|
| 280 |
-
702.8268457509,
|
| 281 |
-
732.6491227137,
|
| 282 |
-
736.225411771,
|
| 283 |
-
745.6156113918,
|
| 284 |
-
721.0912474577,
|
| 285 |
-
736.2254117629,
|
| 286 |
-
732.9674153867,
|
| 287 |
-
723.0966793643,
|
| 288 |
-
718.0704518208,
|
| 289 |
-
722.2852812675,
|
| 290 |
-
745.1185090985,
|
| 291 |
-
736.9690722951,
|
| 292 |
-
742.6306627437,
|
| 293 |
-
733.1555506911,
|
| 294 |
-
721.7491525609,
|
| 295 |
-
723.0795022704,
|
| 296 |
-
717.9478748234,
|
| 297 |
-
726.703609728,
|
| 298 |
-
725.3073844986,
|
| 299 |
-
722.2116156669,
|
| 300 |
-
720.1865370325,
|
| 301 |
-
731.5240457448,
|
| 302 |
-
737.0781670626,
|
| 303 |
-
708.356058121,
|
| 304 |
-
730.3511179714,
|
| 305 |
-
727.5035049316,
|
| 306 |
-
706.4191731996,
|
| 307 |
-
734.2333848904,
|
| 308 |
-
736.5196621633,
|
| 309 |
-
724.9647865416,
|
| 310 |
-
718.7060814362,
|
| 311 |
-
722.5615781913,
|
| 312 |
-
731.6666527735,
|
| 313 |
-
722.1914533305,
|
| 314 |
-
719.1795542579,
|
| 315 |
-
730.3223324585,
|
| 316 |
-
724.1322488355,
|
| 317 |
-
734.6332090556,
|
| 318 |
-
716.1292305518,
|
| 319 |
-
726.7846008592,
|
| 320 |
-
717.027778133,
|
| 321 |
-
728.6562483681
|
| 322 |
-
],
|
| 323 |
-
"model":"gigachat_lite",
|
| 324 |
-
"score":17.2,
|
| 325 |
-
"lower":15.65,
|
| 326 |
-
"upper":18.68,
|
| 327 |
-
"avg_tokens":276.0
|
| 328 |
-
}
|
| 329 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/gen/show_result.py
CHANGED
|
@@ -263,13 +263,13 @@ if __name__ == "__main__":
|
|
| 263 |
huggingface_hub.HfApi().upload_file(
|
| 264 |
path_or_fileobj=json_file_name,
|
| 265 |
path_in_repo="data/leaderboard.json",
|
| 266 |
-
repo_id="Vikhrmodels/leaderboard",
|
| 267 |
repo_type="space",
|
| 268 |
)
|
| 269 |
|
| 270 |
huggingface_hub.HfApi().upload_file(
|
| 271 |
path_or_fileobj=json_file_name,
|
| 272 |
path_in_repo=f"data/leaderboard_logs/{json_file_name}",
|
| 273 |
-
repo_id="Vikhrmodels/leaderboard",
|
| 274 |
repo_type="dataset",
|
| 275 |
)
|
|
|
|
| 263 |
huggingface_hub.HfApi().upload_file(
|
| 264 |
path_or_fileobj=json_file_name,
|
| 265 |
path_in_repo="data/leaderboard.json",
|
| 266 |
+
repo_id="Vikhrmodels/arena-leaderboard-metainfo",
|
| 267 |
repo_type="space",
|
| 268 |
)
|
| 269 |
|
| 270 |
huggingface_hub.HfApi().upload_file(
|
| 271 |
path_or_fileobj=json_file_name,
|
| 272 |
path_in_repo=f"data/leaderboard_logs/{json_file_name}",
|
| 273 |
+
repo_id="Vikhrmodels/arena-leaderboard-metainfo",
|
| 274 |
repo_type="dataset",
|
| 275 |
)
|
src/leaderboard/build_leaderboard.py
CHANGED
|
@@ -1,13 +1,12 @@
|
|
| 1 |
import json
|
| 2 |
import logging
|
| 3 |
import os
|
| 4 |
-
import subprocess
|
| 5 |
import time
|
| 6 |
|
| 7 |
import pandas as pd
|
| 8 |
from huggingface_hub import snapshot_download
|
| 9 |
|
| 10 |
-
from src.envs import DATA_ARENA_PATH,
|
| 11 |
|
| 12 |
# Configure logging
|
| 13 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
|
@@ -55,21 +54,15 @@ def download_openbench():
|
|
| 55 |
"""Downloads pre generated data"""
|
| 56 |
os.makedirs(DATA_ARENA_PATH, exist_ok=True)
|
| 57 |
|
|
|
|
|
|
|
|
|
|
| 58 |
# download answers of different models that we trust
|
| 59 |
download_dataset("Vikhrmodels/openbench-eval", DATA_ARENA_PATH)
|
| 60 |
|
| 61 |
-
print("\nInternal models in openbench-eval:")
|
| 62 |
-
subprocess.run(["ls", f"{DATA_ARENA_PATH}/model_answers/internal/"], check=False)
|
| 63 |
-
|
| 64 |
-
print("\nExternal models in openbench-eval:")
|
| 65 |
-
subprocess.run(["ls", f"{DATA_ARENA_PATH}/model_answers/external/"], check=False)
|
| 66 |
-
|
| 67 |
-
print("\nJudgement in openbench-eval")
|
| 68 |
-
subprocess.run(["ls", f"{DATA_ARENA_PATH}/model_judgement/gpt-4-1106-preview"], check=False)
|
| 69 |
-
|
| 70 |
|
| 71 |
def build_leadearboard_df():
|
| 72 |
# Retrieve the leaderboard DataFrame
|
| 73 |
-
with open(f"{
|
| 74 |
leaderboard_df = pd.DataFrame.from_records(json.load(eval_file))
|
| 75 |
return leaderboard_df.copy()
|
|
|
|
| 1 |
import json
|
| 2 |
import logging
|
| 3 |
import os
|
|
|
|
| 4 |
import time
|
| 5 |
|
| 6 |
import pandas as pd
|
| 7 |
from huggingface_hub import snapshot_download
|
| 8 |
|
| 9 |
+
from src.envs import DATA_ARENA_PATH, DATA_PATH
|
| 10 |
|
| 11 |
# Configure logging
|
| 12 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
|
|
|
| 54 |
"""Downloads pre generated data"""
|
| 55 |
os.makedirs(DATA_ARENA_PATH, exist_ok=True)
|
| 56 |
|
| 57 |
+
# download prev autogenerated leaderboard files
|
| 58 |
+
download_dataset("Vikhrmodels/arena-leaderboard-metainfo", DATA_PATH)
|
| 59 |
+
|
| 60 |
# download answers of different models that we trust
|
| 61 |
download_dataset("Vikhrmodels/openbench-eval", DATA_ARENA_PATH)
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
def build_leadearboard_df():
|
| 65 |
# Retrieve the leaderboard DataFrame
|
| 66 |
+
with open(f"{DATA_PATH}/leaderboard.json", "r", encoding="utf-8") as eval_file:
|
| 67 |
leaderboard_df = pd.DataFrame.from_records(json.load(eval_file))
|
| 68 |
return leaderboard_df.copy()
|