{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9986739912862285,
  "eval_steps": 500,
  "global_step": 659,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00757719265012313,
      "grad_norm": 14.689146511528078,
      "learning_rate": 3.0303030303030305e-07,
      "loss": 1.5538,
      "num_tokens": 671416.0,
      "step": 5
    },
    {
      "epoch": 0.01515438530024626,
      "grad_norm": 13.378740723930147,
      "learning_rate": 6.818181818181818e-07,
      "loss": 1.5284,
      "num_tokens": 1343365.0,
      "step": 10
    },
    {
      "epoch": 0.022731577950369387,
      "grad_norm": 9.028347650887854,
      "learning_rate": 1.0606060606060608e-06,
      "loss": 1.392,
      "num_tokens": 2025633.0,
      "step": 15
    },
    {
      "epoch": 0.03030877060049252,
      "grad_norm": 4.978629002681296,
      "learning_rate": 1.4393939393939396e-06,
      "loss": 1.1089,
      "num_tokens": 2706981.0,
      "step": 20
    },
    {
      "epoch": 0.03788596325061565,
      "grad_norm": 4.62206734817716,
      "learning_rate": 1.8181818181818183e-06,
      "loss": 0.8855,
      "num_tokens": 3388628.0,
      "step": 25
    },
    {
      "epoch": 0.04546315590073877,
      "grad_norm": 2.3801704266966124,
      "learning_rate": 2.196969696969697e-06,
      "loss": 0.7432,
      "num_tokens": 4065933.0,
      "step": 30
    },
    {
      "epoch": 0.053040348550861906,
      "grad_norm": 1.7405039480086826,
      "learning_rate": 2.575757575757576e-06,
      "loss": 0.655,
      "num_tokens": 4724691.0,
      "step": 35
    },
    {
      "epoch": 0.06061754120098504,
      "grad_norm": 1.2646933885721225,
      "learning_rate": 2.954545454545455e-06,
      "loss": 0.5984,
      "num_tokens": 5396624.0,
      "step": 40
    },
    {
      "epoch": 0.06819473385110816,
      "grad_norm": 1.2609258302188018,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.5493,
      "num_tokens": 6069436.0,
      "step": 45
    },
    {
      "epoch": 0.0757719265012313,
      "grad_norm": 1.211902532539145,
      "learning_rate": 3.7121212121212124e-06,
      "loss": 0.5207,
      "num_tokens": 6724875.0,
      "step": 50
    },
    {
      "epoch": 0.08334911915135443,
      "grad_norm": 1.1208560528504965,
      "learning_rate": 4.0909090909090915e-06,
      "loss": 0.496,
      "num_tokens": 7410263.0,
      "step": 55
    },
    {
      "epoch": 0.09092631180147755,
      "grad_norm": 1.1455847147400369,
      "learning_rate": 4.46969696969697e-06,
      "loss": 0.476,
      "num_tokens": 8090206.0,
      "step": 60
    },
    {
      "epoch": 0.09850350445160068,
      "grad_norm": 1.2402202213157907,
      "learning_rate": 4.848484848484849e-06,
      "loss": 0.4704,
      "num_tokens": 8752216.0,
      "step": 65
    },
    {
      "epoch": 0.10608069710172381,
      "grad_norm": 1.1730469284206266,
      "learning_rate": 4.999715831294673e-06,
      "loss": 0.4437,
      "num_tokens": 9424314.0,
      "step": 70
    },
    {
      "epoch": 0.11365788975184694,
      "grad_norm": 1.2572345533896205,
      "learning_rate": 4.9979795046972526e-06,
      "loss": 0.4393,
      "num_tokens": 10095498.0,
      "step": 75
    },
    {
      "epoch": 0.12123508240197008,
      "grad_norm": 1.202320205976868,
      "learning_rate": 4.994665939778305e-06,
      "loss": 0.4344,
      "num_tokens": 10757698.0,
      "step": 80
    },
    {
      "epoch": 0.1288122750520932,
      "grad_norm": 1.0110597588294479,
      "learning_rate": 4.989777461417376e-06,
      "loss": 0.4287,
      "num_tokens": 11442115.0,
      "step": 85
    },
    {
      "epoch": 0.13638946770221633,
      "grad_norm": 1.0547593677796412,
      "learning_rate": 4.983317499492444e-06,
      "loss": 0.4173,
      "num_tokens": 12117843.0,
      "step": 90
    },
    {
      "epoch": 0.14396666035233946,
      "grad_norm": 1.1606411334713018,
      "learning_rate": 4.975290586473441e-06,
      "loss": 0.4158,
      "num_tokens": 12801735.0,
      "step": 95
    },
    {
      "epoch": 0.1515438530024626,
      "grad_norm": 1.083571026614714,
      "learning_rate": 4.965702354242146e-06,
      "loss": 0.4094,
      "num_tokens": 13485502.0,
      "step": 100
    },
    {
      "epoch": 0.15912104565258572,
      "grad_norm": 1.064545176887418,
      "learning_rate": 4.954559530140725e-06,
      "loss": 0.4142,
      "num_tokens": 14158674.0,
      "step": 105
    },
    {
      "epoch": 0.16669823830270886,
      "grad_norm": 1.0252361899177853,
      "learning_rate": 4.941869932251659e-06,
      "loss": 0.3958,
      "num_tokens": 14845188.0,
      "step": 110
    },
    {
      "epoch": 0.174275430952832,
      "grad_norm": 1.1316583942434564,
      "learning_rate": 4.927642463912383e-06,
      "loss": 0.4053,
      "num_tokens": 15530682.0,
      "step": 115
    },
    {
      "epoch": 0.1818526236029551,
      "grad_norm": 1.0710593340675518,
      "learning_rate": 4.9118871074684815e-06,
      "loss": 0.3967,
      "num_tokens": 16198500.0,
      "step": 120
    },
    {
      "epoch": 0.18942981625307823,
      "grad_norm": 1.030020134722499,
      "learning_rate": 4.894614917269827e-06,
      "loss": 0.3854,
      "num_tokens": 16874884.0,
      "step": 125
    },
    {
      "epoch": 0.19700700890320136,
      "grad_norm": 1.1408468287734495,
      "learning_rate": 4.875838011914574e-06,
      "loss": 0.3897,
      "num_tokens": 17563009.0,
      "step": 130
    },
    {
      "epoch": 0.2045842015533245,
      "grad_norm": 1.078979471366339,
      "learning_rate": 4.8555695657464505e-06,
      "loss": 0.3893,
      "num_tokens": 18253977.0,
      "step": 135
    },
    {
      "epoch": 0.21216139420344762,
      "grad_norm": 0.9577387651632541,
      "learning_rate": 4.833823799611309e-06,
      "loss": 0.3894,
      "num_tokens": 18924480.0,
      "step": 140
    },
    {
      "epoch": 0.21973858685357076,
      "grad_norm": 0.9979699563632606,
      "learning_rate": 4.810615970879425e-06,
      "loss": 0.3823,
      "num_tokens": 19607692.0,
      "step": 145
    },
    {
      "epoch": 0.2273157795036939,
      "grad_norm": 1.083731391070982,
      "learning_rate": 4.7859623627405525e-06,
      "loss": 0.3799,
      "num_tokens": 20295504.0,
      "step": 150
    },
    {
      "epoch": 0.23489297215381702,
      "grad_norm": 0.9588323354107069,
      "learning_rate": 4.759880272779228e-06,
      "loss": 0.3814,
      "num_tokens": 20980967.0,
      "step": 155
    },
    {
      "epoch": 0.24247016480394015,
      "grad_norm": 0.9397995296634589,
      "learning_rate": 4.732388000838359e-06,
      "loss": 0.3794,
      "num_tokens": 21663800.0,
      "step": 160
    },
    {
      "epoch": 0.25004735745406326,
      "grad_norm": 1.149316897261889,
      "learning_rate": 4.703504836179595e-06,
      "loss": 0.3764,
      "num_tokens": 22370457.0,
      "step": 165
    },
    {
      "epoch": 0.2576245501041864,
      "grad_norm": 1.0979780566286403,
      "learning_rate": 4.673251043949505e-06,
      "loss": 0.3754,
      "num_tokens": 23040401.0,
      "step": 170
    },
    {
      "epoch": 0.2652017427543095,
      "grad_norm": 0.8834333298633666,
      "learning_rate": 4.6416478509610464e-06,
      "loss": 0.3781,
      "num_tokens": 23707041.0,
      "step": 175
    },
    {
      "epoch": 0.27277893540443265,
      "grad_norm": 0.936045616821062,
      "learning_rate": 4.608717430800303e-06,
      "loss": 0.3759,
      "num_tokens": 24377111.0,
      "step": 180
    },
    {
      "epoch": 0.2803561280545558,
      "grad_norm": 0.9887231447085479,
      "learning_rate": 4.57448288826895e-06,
      "loss": 0.3673,
      "num_tokens": 25055343.0,
      "step": 185
    },
    {
      "epoch": 0.2879333207046789,
      "grad_norm": 0.8248139338597134,
      "learning_rate": 4.538968243173343e-06,
      "loss": 0.3668,
      "num_tokens": 25728981.0,
      "step": 190
    },
    {
      "epoch": 0.29551051335480205,
      "grad_norm": 0.9150002884928868,
      "learning_rate": 4.502198413471634e-06,
      "loss": 0.3691,
      "num_tokens": 26409520.0,
      "step": 195
    },
    {
      "epoch": 0.3030877060049252,
      "grad_norm": 0.9053014180770657,
      "learning_rate": 4.464199197790705e-06,
      "loss": 0.3706,
      "num_tokens": 27099755.0,
      "step": 200
    },
    {
      "epoch": 0.3106648986550483,
      "grad_norm": 0.9549841540494592,
      "learning_rate": 4.424997257325213e-06,
      "loss": 0.3692,
      "num_tokens": 27764650.0,
      "step": 205
    },
    {
      "epoch": 0.31824209130517145,
      "grad_norm": 0.9696672595730065,
      "learning_rate": 4.384620097131433e-06,
      "loss": 0.3601,
      "num_tokens": 28453535.0,
      "step": 210
    },
    {
      "epoch": 0.3258192839552946,
      "grad_norm": 1.0028667181687292,
      "learning_rate": 4.343096046829025e-06,
      "loss": 0.3562,
      "num_tokens": 29137151.0,
      "step": 215
    },
    {
      "epoch": 0.3333964766054177,
      "grad_norm": 0.9658631567314796,
      "learning_rate": 4.30045424072427e-06,
      "loss": 0.3616,
      "num_tokens": 29817369.0,
      "step": 220
    },
    {
      "epoch": 0.34097366925554085,
      "grad_norm": 0.9408101125306896,
      "learning_rate": 4.256724597368713e-06,
      "loss": 0.3599,
      "num_tokens": 30501199.0,
      "step": 225
    },
    {
      "epoch": 0.348550861905664,
      "grad_norm": 0.9703143560737951,
      "learning_rate": 4.211937798567569e-06,
      "loss": 0.3612,
      "num_tokens": 31180324.0,
      "step": 230
    },
    {
      "epoch": 0.35612805455578705,
      "grad_norm": 0.9050766730813399,
      "learning_rate": 4.166125267852601e-06,
      "loss": 0.3642,
      "num_tokens": 31855036.0,
      "step": 235
    },
    {
      "epoch": 0.3637052472059102,
      "grad_norm": 0.9428957026430407,
      "learning_rate": 4.11931914843459e-06,
      "loss": 0.3631,
      "num_tokens": 32509354.0,
      "step": 240
    },
    {
      "epoch": 0.3712824398560333,
      "grad_norm": 0.916465479137018,
      "learning_rate": 4.071552280650856e-06,
      "loss": 0.3588,
      "num_tokens": 33178262.0,
      "step": 245
    },
    {
      "epoch": 0.37885963250615645,
      "grad_norm": 0.9687892223580263,
      "learning_rate": 4.022858178923667e-06,
      "loss": 0.3492,
      "num_tokens": 33857989.0,
      "step": 250
    },
    {
      "epoch": 0.3864368251562796,
      "grad_norm": 1.043264768303208,
      "learning_rate": 3.973271008245684e-06,
      "loss": 0.3485,
      "num_tokens": 34550372.0,
      "step": 255
    },
    {
      "epoch": 0.3940140178064027,
      "grad_norm": 1.0626177134911259,
      "learning_rate": 3.922825560208949e-06,
      "loss": 0.3592,
      "num_tokens": 35214613.0,
      "step": 260
    },
    {
      "epoch": 0.40159121045652585,
      "grad_norm": 0.91604627717572,
      "learning_rate": 3.871557228594243e-06,
      "loss": 0.3475,
      "num_tokens": 35904210.0,
      "step": 265
    },
    {
      "epoch": 0.409168403106649,
      "grad_norm": 0.9175491926941394,
      "learning_rate": 3.81950198453793e-06,
      "loss": 0.3519,
      "num_tokens": 36570673.0,
      "step": 270
    },
    {
      "epoch": 0.4167455957567721,
      "grad_norm": 0.8296317012779955,
      "learning_rate": 3.766696351293709e-06,
      "loss": 0.3529,
      "num_tokens": 37259417.0,
      "step": 275
    },
    {
      "epoch": 0.42432278840689525,
      "grad_norm": 0.8784297804071752,
      "learning_rate": 3.713177378606993e-06,
      "loss": 0.3436,
      "num_tokens": 37948015.0,
      "step": 280
    },
    {
      "epoch": 0.4318999810570184,
      "grad_norm": 0.8620896694006768,
      "learning_rate": 3.65898261671989e-06,
      "loss": 0.3532,
      "num_tokens": 38609821.0,
      "step": 285
    },
    {
      "epoch": 0.4394771737071415,
      "grad_norm": 0.8892945434162819,
      "learning_rate": 3.6041500900250126e-06,
      "loss": 0.3458,
      "num_tokens": 39293466.0,
      "step": 290
    },
    {
      "epoch": 0.44705436635726464,
      "grad_norm": 0.8624068465904439,
      "learning_rate": 3.5487182703866235e-06,
      "loss": 0.3487,
      "num_tokens": 39971718.0,
      "step": 295
    },
    {
      "epoch": 0.4546315590073878,
      "grad_norm": 0.8085150075292896,
      "learning_rate": 3.4927260501478164e-06,
      "loss": 0.3478,
      "num_tokens": 40640725.0,
      "step": 300
    },
    {
      "epoch": 0.4622087516575109,
      "grad_norm": 0.8404898292867258,
      "learning_rate": 3.4362127148426834e-06,
      "loss": 0.3453,
      "num_tokens": 41318493.0,
      "step": 305
    },
    {
      "epoch": 0.46978594430763404,
      "grad_norm": 0.9925900662007742,
      "learning_rate": 3.3792179156326045e-06,
      "loss": 0.3484,
      "num_tokens": 41995362.0,
      "step": 310
    },
    {
      "epoch": 0.4773631369577572,
      "grad_norm": 0.8623383910484006,
      "learning_rate": 3.3217816414860083e-06,
      "loss": 0.3482,
      "num_tokens": 42678115.0,
      "step": 315
    },
    {
      "epoch": 0.4849403296078803,
      "grad_norm": 0.8975797769749146,
      "learning_rate": 3.2639441911211178e-06,
      "loss": 0.3467,
      "num_tokens": 43378878.0,
      "step": 320
    },
    {
      "epoch": 0.49251752225800344,
      "grad_norm": 0.8194623317436803,
      "learning_rate": 3.205746144731366e-06,
      "loss": 0.3396,
      "num_tokens": 44074633.0,
      "step": 325
    },
    {
      "epoch": 0.5000947149081265,
      "grad_norm": 0.8263193992649668,
      "learning_rate": 3.1472283355133254e-06,
      "loss": 0.349,
      "num_tokens": 44758521.0,
      "step": 330
    },
    {
      "epoch": 0.5076719075582496,
      "grad_norm": 0.8209994838731967,
      "learning_rate": 3.0884318210171173e-06,
      "loss": 0.3435,
      "num_tokens": 45441476.0,
      "step": 335
    },
    {
      "epoch": 0.5152491002083728,
      "grad_norm": 0.9194054012383176,
      "learning_rate": 3.0293978543394207e-06,
      "loss": 0.3404,
      "num_tokens": 46126861.0,
      "step": 340
    },
    {
      "epoch": 0.5228262928584959,
      "grad_norm": 0.8692812034927447,
      "learning_rate": 2.9701678551792685e-06,
      "loss": 0.3425,
      "num_tokens": 46817925.0,
      "step": 345
    },
    {
      "epoch": 0.530403485508619,
      "grad_norm": 0.8067817246185729,
      "learning_rate": 2.9107833807769566e-06,
      "loss": 0.3377,
      "num_tokens": 47497471.0,
      "step": 350
    },
    {
      "epoch": 0.5379806781587422,
      "grad_norm": 0.8339751108884305,
      "learning_rate": 2.851286096756453e-06,
      "loss": 0.3438,
      "num_tokens": 48172701.0,
      "step": 355
    },
    {
      "epoch": 0.5455578708088653,
      "grad_norm": 0.7717434957072956,
      "learning_rate": 2.7917177478917605e-06,
      "loss": 0.3311,
      "num_tokens": 48860699.0,
      "step": 360
    },
    {
      "epoch": 0.5531350634589884,
      "grad_norm": 0.859770404135416,
      "learning_rate": 2.7321201288177424e-06,
      "loss": 0.3439,
      "num_tokens": 49543737.0,
      "step": 365
    },
    {
      "epoch": 0.5607122561091116,
      "grad_norm": 0.8100249593766239,
      "learning_rate": 2.6725350547059682e-06,
      "loss": 0.34,
      "num_tokens": 50221154.0,
      "step": 370
    },
    {
      "epoch": 0.5682894487592347,
      "grad_norm": 0.803391445344769,
      "learning_rate": 2.6130043319261513e-06,
      "loss": 0.3392,
      "num_tokens": 50898508.0,
      "step": 375
    },
    {
      "epoch": 0.5758666414093578,
      "grad_norm": 0.8040693336090444,
      "learning_rate": 2.5535697287137585e-06,
      "loss": 0.3345,
      "num_tokens": 51592671.0,
      "step": 380
    },
    {
      "epoch": 0.583443834059481,
      "grad_norm": 0.8504004369104251,
      "learning_rate": 2.4942729458643772e-06,
      "loss": 0.3379,
      "num_tokens": 52270525.0,
      "step": 385
    },
    {
      "epoch": 0.5910210267096041,
      "grad_norm": 0.8460679730282097,
      "learning_rate": 2.4351555874754023e-06,
      "loss": 0.3344,
      "num_tokens": 52945561.0,
      "step": 390
    },
    {
      "epoch": 0.5985982193597272,
      "grad_norm": 0.8331966726775251,
      "learning_rate": 2.376259131755565e-06,
      "loss": 0.3354,
      "num_tokens": 53633168.0,
      "step": 395
    },
    {
      "epoch": 0.6061754120098504,
      "grad_norm": 0.9208957482910517,
      "learning_rate": 2.3176249019227887e-06,
      "loss": 0.3396,
      "num_tokens": 54297321.0,
      "step": 400
    },
    {
      "epoch": 0.6137526046599735,
      "grad_norm": 0.7991681474000126,
      "learning_rate": 2.259294037210797e-06,
      "loss": 0.3321,
      "num_tokens": 54984763.0,
      "step": 405
    },
    {
      "epoch": 0.6213297973100966,
      "grad_norm": 0.8178882775094793,
      "learning_rate": 2.2013074640047984e-06,
      "loss": 0.342,
      "num_tokens": 55645272.0,
      "step": 410
    },
    {
      "epoch": 0.6289069899602198,
      "grad_norm": 0.8068516933987062,
      "learning_rate": 2.143705867126518e-06,
      "loss": 0.3365,
      "num_tokens": 56325652.0,
      "step": 415
    },
    {
      "epoch": 0.6364841826103429,
      "grad_norm": 0.8118975393672736,
      "learning_rate": 2.0865296612887215e-06,
      "loss": 0.3337,
      "num_tokens": 57016551.0,
      "step": 420
    },
    {
      "epoch": 0.644061375260466,
      "grad_norm": 0.8720525791746454,
      "learning_rate": 2.0298189627392366e-06,
      "loss": 0.33,
      "num_tokens": 57715996.0,
      "step": 425
    },
    {
      "epoch": 0.6516385679105892,
      "grad_norm": 0.8189240422464783,
      "learning_rate": 1.973613561114404e-06,
      "loss": 0.3319,
      "num_tokens": 58411871.0,
      "step": 430
    },
    {
      "epoch": 0.6592157605607123,
      "grad_norm": 0.8010090923788271,
      "learning_rate": 1.917952891521678e-06,
      "loss": 0.3261,
      "num_tokens": 59102622.0,
      "step": 435
    },
    {
      "epoch": 0.6667929532108354,
      "grad_norm": 0.8137824874604472,
      "learning_rate": 1.8628760068709694e-06,
      "loss": 0.3408,
      "num_tokens": 59768546.0,
      "step": 440
    },
    {
      "epoch": 0.6743701458609586,
      "grad_norm": 0.7822337587794234,
      "learning_rate": 1.8084215504741603e-06,
      "loss": 0.3289,
      "num_tokens": 60465312.0,
      "step": 445
    },
    {
      "epoch": 0.6819473385110817,
      "grad_norm": 0.7788309919289208,
      "learning_rate": 1.7546277289319907e-06,
      "loss": 0.3258,
      "num_tokens": 61147815.0,
      "step": 450
    },
    {
      "epoch": 0.6895245311612048,
      "grad_norm": 0.8195670772554525,
      "learning_rate": 1.701532285327358e-06,
      "loss": 0.3313,
      "num_tokens": 61832191.0,
      "step": 455
    },
    {
      "epoch": 0.697101723811328,
      "grad_norm": 0.8214679319970593,
      "learning_rate": 1.6491724727438301e-06,
      "loss": 0.3329,
      "num_tokens": 62507231.0,
      "step": 460
    },
    {
      "epoch": 0.704678916461451,
      "grad_norm": 0.7812180367757561,
      "learning_rate": 1.5975850281279626e-06,
      "loss": 0.3295,
      "num_tokens": 63182873.0,
      "step": 465
    },
    {
      "epoch": 0.7122561091115741,
      "grad_norm": 0.7674580303784487,
      "learning_rate": 1.5468061465137335e-06,
      "loss": 0.3312,
      "num_tokens": 63860394.0,
      "step": 470
    },
    {
      "epoch": 0.7198333017616972,
      "grad_norm": 0.8336261408594463,
      "learning_rate": 1.4968714556272124e-06,
      "loss": 0.3278,
      "num_tokens": 64541974.0,
      "step": 475
    },
    {
      "epoch": 0.7274104944118204,
      "grad_norm": 0.7612534657890504,
      "learning_rate": 1.4478159908892646e-06,
      "loss": 0.3337,
      "num_tokens": 65226223.0,
      "step": 480
    },
    {
      "epoch": 0.7349876870619435,
      "grad_norm": 0.7790959703943752,
      "learning_rate": 1.399674170833825e-06,
      "loss": 0.3271,
      "num_tokens": 65908856.0,
      "step": 485
    },
    {
      "epoch": 0.7425648797120666,
      "grad_norm": 0.7934687328628046,
      "learning_rate": 1.3524797729589945e-06,
      "loss": 0.3279,
      "num_tokens": 66592993.0,
      "step": 490
    },
    {
      "epoch": 0.7501420723621898,
      "grad_norm": 0.8457662795541676,
      "learning_rate": 1.3062659100279198e-06,
      "loss": 0.327,
      "num_tokens": 67273074.0,
      "step": 495
    },
    {
      "epoch": 0.7577192650123129,
      "grad_norm": 0.8315491509901118,
      "learning_rate": 1.2610650068360442e-06,
      "loss": 0.329,
      "step": 500
    },
    {
      "epoch": 0.7577192650123129,
      "eval_loss": 0.33532413840293884,
      "eval_num_tokens": 67964833.0,
      "eval_runtime": 239.906,
      "eval_samples_per_second": 9.266,
      "eval_steps_per_second": 1.159,
      "step": 500
    },
    {
      "epoch": 0.765296457662436,
      "grad_norm": 0.7220222867753697,
      "learning_rate": 1.2169087774610656e-06,
      "loss": 0.3251,
      "num_tokens": 68655791.0,
      "step": 505
    },
    {
      "epoch": 0.7728736503125592,
      "grad_norm": 0.8492025800299583,
      "learning_rate": 1.17382820301156e-06,
      "loss": 0.3294,
      "num_tokens": 69331093.0,
      "step": 510
    },
    {
      "epoch": 0.7804508429626823,
      "grad_norm": 0.8108134141335065,
      "learning_rate": 1.131853509889854e-06,
      "loss": 0.3278,
      "num_tokens": 70011469.0,
      "step": 515
    },
    {
      "epoch": 0.7880280356128054,
      "grad_norm": 0.8041767673010345,
      "learning_rate": 1.0910141485844363e-06,
      "loss": 0.323,
      "num_tokens": 70689883.0,
      "step": 520
    },
    {
      "epoch": 0.7956052282629286,
      "grad_norm": 0.7948666169290667,
      "learning_rate": 1.0513387730067626e-06,
      "loss": 0.3295,
      "num_tokens": 71353521.0,
      "step": 525
    },
    {
      "epoch": 0.8031824209130517,
      "grad_norm": 0.7987662784926006,
      "learning_rate": 1.012855220386953e-06,
      "loss": 0.3289,
      "num_tokens": 72025801.0,
      "step": 530
    },
    {
      "epoch": 0.8107596135631748,
      "grad_norm": 0.8124248754570028,
      "learning_rate": 9.755904917425054e-07,
      "loss": 0.3361,
      "num_tokens": 72677000.0,
      "step": 535
    },
    {
      "epoch": 0.818336806213298,
      "grad_norm": 0.8034403126732426,
      "learning_rate": 9.395707329337092e-07,
      "loss": 0.3258,
      "num_tokens": 73354814.0,
      "step": 540
    },
    {
      "epoch": 0.8259139988634211,
      "grad_norm": 0.729259511532679,
      "learning_rate": 9.048212163190542e-07,
      "loss": 0.3229,
      "num_tokens": 74043197.0,
      "step": 545
    },
    {
      "epoch": 0.8334911915135442,
      "grad_norm": 0.760321674214779,
      "learning_rate": 8.713663230235226e-07,
      "loss": 0.3217,
      "num_tokens": 74718226.0,
      "step": 550
    },
    {
      "epoch": 0.8410683841636674,
      "grad_norm": 0.7260807686457017,
      "learning_rate": 8.392295258321817e-07,
      "loss": 0.3267,
      "num_tokens": 75405876.0,
      "step": 555
    },
    {
      "epoch": 0.8486455768137905,
      "grad_norm": 0.7326560121754767,
      "learning_rate": 8.084333727210933e-07,
      "loss": 0.3194,
      "num_tokens": 76104574.0,
      "step": 560
    },
    {
      "epoch": 0.8562227694639136,
      "grad_norm": 0.7295825238005851,
      "learning_rate": 7.789994710370951e-07,
      "loss": 0.325,
      "num_tokens": 76778482.0,
      "step": 565
    },
    {
      "epoch": 0.8637999621140368,
      "grad_norm": 0.7365331005684703,
      "learning_rate": 7.509484723375499e-07,
      "loss": 0.326,
      "num_tokens": 77448090.0,
      "step": 570
    },
    {
      "epoch": 0.8713771547641599,
      "grad_norm": 0.7518330331674917,
      "learning_rate": 7.243000579006945e-07,
      "loss": 0.3192,
      "num_tokens": 78127977.0,
      "step": 575
    },
    {
      "epoch": 0.878954347414283,
      "grad_norm": 0.7696059216951111,
      "learning_rate": 6.990729249167704e-07,
      "loss": 0.3203,
      "num_tokens": 78809677.0,
      "step": 580
    },
    {
      "epoch": 0.8865315400644062,
      "grad_norm": 0.7540802540794419,
      "learning_rate": 6.752847733696091e-07,
      "loss": 0.3214,
      "num_tokens": 79494335.0,
      "step": 585
    },
    {
      "epoch": 0.8941087327145293,
      "grad_norm": 0.760684540360902,
      "learning_rate": 6.529522936178805e-07,
      "loss": 0.3207,
      "num_tokens": 80175110.0,
      "step": 590
    },
    {
      "epoch": 0.9016859253646524,
      "grad_norm": 0.6866096784785415,
      "learning_rate": 6.320911546847259e-07,
      "loss": 0.3244,
      "num_tokens": 80858228.0,
      "step": 595
    },
    {
      "epoch": 0.9092631180147756,
      "grad_norm": 0.749755946733873,
      "learning_rate": 6.127159932639797e-07,
      "loss": 0.3246,
      "num_tokens": 81528605.0,
      "step": 600
    },
    {
      "epoch": 0.9168403106648987,
      "grad_norm": 0.7323158098643515,
      "learning_rate": 5.948404034507013e-07,
      "loss": 0.3229,
      "num_tokens": 82199743.0,
      "step": 605
    },
    {
      "epoch": 0.9244175033150218,
      "grad_norm": 0.7716620129264333,
      "learning_rate": 5.784769272032198e-07,
      "loss": 0.3273,
      "num_tokens": 82865102.0,
      "step": 610
    },
    {
      "epoch": 0.931994695965145,
      "grad_norm": 0.717119707148909,
      "learning_rate": 5.636370455433854e-07,
      "loss": 0.3193,
      "num_tokens": 83539618.0,
      "step": 615
    },
    {
      "epoch": 0.9395718886152681,
      "grad_norm": 0.7816104210558041,
      "learning_rate": 5.503311705011973e-07,
      "loss": 0.3186,
      "num_tokens": 84229670.0,
      "step": 620
    },
    {
      "epoch": 0.9471490812653912,
      "grad_norm": 0.7268049825907208,
      "learning_rate": 5.385686378094653e-07,
      "loss": 0.3241,
      "num_tokens": 84908504.0,
      "step": 625
    },
    {
      "epoch": 0.9547262739155143,
      "grad_norm": 0.6840492043651534,
      "learning_rate": 5.283577003536274e-07,
      "loss": 0.3284,
      "num_tokens": 85583606.0,
      "step": 630
    },
    {
      "epoch": 0.9623034665656375,
      "grad_norm": 0.7229941798275764,
      "learning_rate": 5.197055223813207e-07,
      "loss": 0.3203,
      "num_tokens": 86259953.0,
      "step": 635
    },
    {
      "epoch": 0.9698806592157606,
      "grad_norm": 0.7215194242874737,
      "learning_rate": 5.126181744757663e-07,
      "loss": 0.3211,
      "num_tokens": 86940929.0,
      "step": 640
    },
    {
      "epoch": 0.9774578518658837,
      "grad_norm": 0.7395561763461616,
      "learning_rate": 5.071006292964973e-07,
      "loss": 0.3198,
      "num_tokens": 87613159.0,
      "step": 645
    },
    {
      "epoch": 0.9850350445160069,
      "grad_norm": 0.7167293407789844,
      "learning_rate": 5.031567580904175e-07,
      "loss": 0.3223,
      "num_tokens": 88289922.0,
      "step": 650
    },
    {
      "epoch": 0.9926122371661299,
      "grad_norm": 0.7107427399897064,
      "learning_rate": 5.007893279756384e-07,
      "loss": 0.3202,
      "num_tokens": 88973021.0,
      "step": 655
    },
    {
      "epoch": 0.9986739912862285,
      "num_tokens": 89507364.0,
      "step": 659,
      "total_flos": 4.0084130952600617e+18,
      "train_loss": 0.3981744751040237,
      "train_runtime": 12737.2069,
      "train_samples_per_second": 3.315,
      "train_steps_per_second": 0.052
    }
  ],
  "logging_steps": 5,
  "max_steps": 659,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.0084130952600617e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}