{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9986739912862285,
"eval_steps": 500,
"global_step": 659,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00757719265012313,
"grad_norm": 14.689146511528078,
"learning_rate": 3.0303030303030305e-07,
"loss": 1.5538,
"num_tokens": 671416.0,
"step": 5
},
{
"epoch": 0.01515438530024626,
"grad_norm": 13.378740723930147,
"learning_rate": 6.818181818181818e-07,
"loss": 1.5284,
"num_tokens": 1343365.0,
"step": 10
},
{
"epoch": 0.022731577950369387,
"grad_norm": 9.028347650887854,
"learning_rate": 1.0606060606060608e-06,
"loss": 1.392,
"num_tokens": 2025633.0,
"step": 15
},
{
"epoch": 0.03030877060049252,
"grad_norm": 4.978629002681296,
"learning_rate": 1.4393939393939396e-06,
"loss": 1.1089,
"num_tokens": 2706981.0,
"step": 20
},
{
"epoch": 0.03788596325061565,
"grad_norm": 4.62206734817716,
"learning_rate": 1.8181818181818183e-06,
"loss": 0.8855,
"num_tokens": 3388628.0,
"step": 25
},
{
"epoch": 0.04546315590073877,
"grad_norm": 2.3801704266966124,
"learning_rate": 2.196969696969697e-06,
"loss": 0.7432,
"num_tokens": 4065933.0,
"step": 30
},
{
"epoch": 0.053040348550861906,
"grad_norm": 1.7405039480086826,
"learning_rate": 2.575757575757576e-06,
"loss": 0.655,
"num_tokens": 4724691.0,
"step": 35
},
{
"epoch": 0.06061754120098504,
"grad_norm": 1.2646933885721225,
"learning_rate": 2.954545454545455e-06,
"loss": 0.5984,
"num_tokens": 5396624.0,
"step": 40
},
{
"epoch": 0.06819473385110816,
"grad_norm": 1.2609258302188018,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.5493,
"num_tokens": 6069436.0,
"step": 45
},
{
"epoch": 0.0757719265012313,
"grad_norm": 1.211902532539145,
"learning_rate": 3.7121212121212124e-06,
"loss": 0.5207,
"num_tokens": 6724875.0,
"step": 50
},
{
"epoch": 0.08334911915135443,
"grad_norm": 1.1208560528504965,
"learning_rate": 4.0909090909090915e-06,
"loss": 0.496,
"num_tokens": 7410263.0,
"step": 55
},
{
"epoch": 0.09092631180147755,
"grad_norm": 1.1455847147400369,
"learning_rate": 4.46969696969697e-06,
"loss": 0.476,
"num_tokens": 8090206.0,
"step": 60
},
{
"epoch": 0.09850350445160068,
"grad_norm": 1.2402202213157907,
"learning_rate": 4.848484848484849e-06,
"loss": 0.4704,
"num_tokens": 8752216.0,
"step": 65
},
{
"epoch": 0.10608069710172381,
"grad_norm": 1.1730469284206266,
"learning_rate": 4.999715831294673e-06,
"loss": 0.4437,
"num_tokens": 9424314.0,
"step": 70
},
{
"epoch": 0.11365788975184694,
"grad_norm": 1.2572345533896205,
"learning_rate": 4.9979795046972526e-06,
"loss": 0.4393,
"num_tokens": 10095498.0,
"step": 75
},
{
"epoch": 0.12123508240197008,
"grad_norm": 1.202320205976868,
"learning_rate": 4.994665939778305e-06,
"loss": 0.4344,
"num_tokens": 10757698.0,
"step": 80
},
{
"epoch": 0.1288122750520932,
"grad_norm": 1.0110597588294479,
"learning_rate": 4.989777461417376e-06,
"loss": 0.4287,
"num_tokens": 11442115.0,
"step": 85
},
{
"epoch": 0.13638946770221633,
"grad_norm": 1.0547593677796412,
"learning_rate": 4.983317499492444e-06,
"loss": 0.4173,
"num_tokens": 12117843.0,
"step": 90
},
{
"epoch": 0.14396666035233946,
"grad_norm": 1.1606411334713018,
"learning_rate": 4.975290586473441e-06,
"loss": 0.4158,
"num_tokens": 12801735.0,
"step": 95
},
{
"epoch": 0.1515438530024626,
"grad_norm": 1.083571026614714,
"learning_rate": 4.965702354242146e-06,
"loss": 0.4094,
"num_tokens": 13485502.0,
"step": 100
},
{
"epoch": 0.15912104565258572,
"grad_norm": 1.064545176887418,
"learning_rate": 4.954559530140725e-06,
"loss": 0.4142,
"num_tokens": 14158674.0,
"step": 105
},
{
"epoch": 0.16669823830270886,
"grad_norm": 1.0252361899177853,
"learning_rate": 4.941869932251659e-06,
"loss": 0.3958,
"num_tokens": 14845188.0,
"step": 110
},
{
"epoch": 0.174275430952832,
"grad_norm": 1.1316583942434564,
"learning_rate": 4.927642463912383e-06,
"loss": 0.4053,
"num_tokens": 15530682.0,
"step": 115
},
{
"epoch": 0.1818526236029551,
"grad_norm": 1.0710593340675518,
"learning_rate": 4.9118871074684815e-06,
"loss": 0.3967,
"num_tokens": 16198500.0,
"step": 120
},
{
"epoch": 0.18942981625307823,
"grad_norm": 1.030020134722499,
"learning_rate": 4.894614917269827e-06,
"loss": 0.3854,
"num_tokens": 16874884.0,
"step": 125
},
{
"epoch": 0.19700700890320136,
"grad_norm": 1.1408468287734495,
"learning_rate": 4.875838011914574e-06,
"loss": 0.3897,
"num_tokens": 17563009.0,
"step": 130
},
{
"epoch": 0.2045842015533245,
"grad_norm": 1.078979471366339,
"learning_rate": 4.8555695657464505e-06,
"loss": 0.3893,
"num_tokens": 18253977.0,
"step": 135
},
{
"epoch": 0.21216139420344762,
"grad_norm": 0.9577387651632541,
"learning_rate": 4.833823799611309e-06,
"loss": 0.3894,
"num_tokens": 18924480.0,
"step": 140
},
{
"epoch": 0.21973858685357076,
"grad_norm": 0.9979699563632606,
"learning_rate": 4.810615970879425e-06,
"loss": 0.3823,
"num_tokens": 19607692.0,
"step": 145
},
{
"epoch": 0.2273157795036939,
"grad_norm": 1.083731391070982,
"learning_rate": 4.7859623627405525e-06,
"loss": 0.3799,
"num_tokens": 20295504.0,
"step": 150
},
{
"epoch": 0.23489297215381702,
"grad_norm": 0.9588323354107069,
"learning_rate": 4.759880272779228e-06,
"loss": 0.3814,
"num_tokens": 20980967.0,
"step": 155
},
{
"epoch": 0.24247016480394015,
"grad_norm": 0.9397995296634589,
"learning_rate": 4.732388000838359e-06,
"loss": 0.3794,
"num_tokens": 21663800.0,
"step": 160
},
{
"epoch": 0.25004735745406326,
"grad_norm": 1.149316897261889,
"learning_rate": 4.703504836179595e-06,
"loss": 0.3764,
"num_tokens": 22370457.0,
"step": 165
},
{
"epoch": 0.2576245501041864,
"grad_norm": 1.0979780566286403,
"learning_rate": 4.673251043949505e-06,
"loss": 0.3754,
"num_tokens": 23040401.0,
"step": 170
},
{
"epoch": 0.2652017427543095,
"grad_norm": 0.8834333298633666,
"learning_rate": 4.6416478509610464e-06,
"loss": 0.3781,
"num_tokens": 23707041.0,
"step": 175
},
{
"epoch": 0.27277893540443265,
"grad_norm": 0.936045616821062,
"learning_rate": 4.608717430800303e-06,
"loss": 0.3759,
"num_tokens": 24377111.0,
"step": 180
},
{
"epoch": 0.2803561280545558,
"grad_norm": 0.9887231447085479,
"learning_rate": 4.57448288826895e-06,
"loss": 0.3673,
"num_tokens": 25055343.0,
"step": 185
},
{
"epoch": 0.2879333207046789,
"grad_norm": 0.8248139338597134,
"learning_rate": 4.538968243173343e-06,
"loss": 0.3668,
"num_tokens": 25728981.0,
"step": 190
},
{
"epoch": 0.29551051335480205,
"grad_norm": 0.9150002884928868,
"learning_rate": 4.502198413471634e-06,
"loss": 0.3691,
"num_tokens": 26409520.0,
"step": 195
},
{
"epoch": 0.3030877060049252,
"grad_norm": 0.9053014180770657,
"learning_rate": 4.464199197790705e-06,
"loss": 0.3706,
"num_tokens": 27099755.0,
"step": 200
},
{
"epoch": 0.3106648986550483,
"grad_norm": 0.9549841540494592,
"learning_rate": 4.424997257325213e-06,
"loss": 0.3692,
"num_tokens": 27764650.0,
"step": 205
},
{
"epoch": 0.31824209130517145,
"grad_norm": 0.9696672595730065,
"learning_rate": 4.384620097131433e-06,
"loss": 0.3601,
"num_tokens": 28453535.0,
"step": 210
},
{
"epoch": 0.3258192839552946,
"grad_norm": 1.0028667181687292,
"learning_rate": 4.343096046829025e-06,
"loss": 0.3562,
"num_tokens": 29137151.0,
"step": 215
},
{
"epoch": 0.3333964766054177,
"grad_norm": 0.9658631567314796,
"learning_rate": 4.30045424072427e-06,
"loss": 0.3616,
"num_tokens": 29817369.0,
"step": 220
},
{
"epoch": 0.34097366925554085,
"grad_norm": 0.9408101125306896,
"learning_rate": 4.256724597368713e-06,
"loss": 0.3599,
"num_tokens": 30501199.0,
"step": 225
},
{
"epoch": 0.348550861905664,
"grad_norm": 0.9703143560737951,
"learning_rate": 4.211937798567569e-06,
"loss": 0.3612,
"num_tokens": 31180324.0,
"step": 230
},
{
"epoch": 0.35612805455578705,
"grad_norm": 0.9050766730813399,
"learning_rate": 4.166125267852601e-06,
"loss": 0.3642,
"num_tokens": 31855036.0,
"step": 235
},
{
"epoch": 0.3637052472059102,
"grad_norm": 0.9428957026430407,
"learning_rate": 4.11931914843459e-06,
"loss": 0.3631,
"num_tokens": 32509354.0,
"step": 240
},
{
"epoch": 0.3712824398560333,
"grad_norm": 0.916465479137018,
"learning_rate": 4.071552280650856e-06,
"loss": 0.3588,
"num_tokens": 33178262.0,
"step": 245
},
{
"epoch": 0.37885963250615645,
"grad_norm": 0.9687892223580263,
"learning_rate": 4.022858178923667e-06,
"loss": 0.3492,
"num_tokens": 33857989.0,
"step": 250
},
{
"epoch": 0.3864368251562796,
"grad_norm": 1.043264768303208,
"learning_rate": 3.973271008245684e-06,
"loss": 0.3485,
"num_tokens": 34550372.0,
"step": 255
},
{
"epoch": 0.3940140178064027,
"grad_norm": 1.0626177134911259,
"learning_rate": 3.922825560208949e-06,
"loss": 0.3592,
"num_tokens": 35214613.0,
"step": 260
},
{
"epoch": 0.40159121045652585,
"grad_norm": 0.91604627717572,
"learning_rate": 3.871557228594243e-06,
"loss": 0.3475,
"num_tokens": 35904210.0,
"step": 265
},
{
"epoch": 0.409168403106649,
"grad_norm": 0.9175491926941394,
"learning_rate": 3.81950198453793e-06,
"loss": 0.3519,
"num_tokens": 36570673.0,
"step": 270
},
{
"epoch": 0.4167455957567721,
"grad_norm": 0.8296317012779955,
"learning_rate": 3.766696351293709e-06,
"loss": 0.3529,
"num_tokens": 37259417.0,
"step": 275
},
{
"epoch": 0.42432278840689525,
"grad_norm": 0.8784297804071752,
"learning_rate": 3.713177378606993e-06,
"loss": 0.3436,
"num_tokens": 37948015.0,
"step": 280
},
{
"epoch": 0.4318999810570184,
"grad_norm": 0.8620896694006768,
"learning_rate": 3.65898261671989e-06,
"loss": 0.3532,
"num_tokens": 38609821.0,
"step": 285
},
{
"epoch": 0.4394771737071415,
"grad_norm": 0.8892945434162819,
"learning_rate": 3.6041500900250126e-06,
"loss": 0.3458,
"num_tokens": 39293466.0,
"step": 290
},
{
"epoch": 0.44705436635726464,
"grad_norm": 0.8624068465904439,
"learning_rate": 3.5487182703866235e-06,
"loss": 0.3487,
"num_tokens": 39971718.0,
"step": 295
},
{
"epoch": 0.4546315590073878,
"grad_norm": 0.8085150075292896,
"learning_rate": 3.4927260501478164e-06,
"loss": 0.3478,
"num_tokens": 40640725.0,
"step": 300
},
{
"epoch": 0.4622087516575109,
"grad_norm": 0.8404898292867258,
"learning_rate": 3.4362127148426834e-06,
"loss": 0.3453,
"num_tokens": 41318493.0,
"step": 305
},
{
"epoch": 0.46978594430763404,
"grad_norm": 0.9925900662007742,
"learning_rate": 3.3792179156326045e-06,
"loss": 0.3484,
"num_tokens": 41995362.0,
"step": 310
},
{
"epoch": 0.4773631369577572,
"grad_norm": 0.8623383910484006,
"learning_rate": 3.3217816414860083e-06,
"loss": 0.3482,
"num_tokens": 42678115.0,
"step": 315
},
{
"epoch": 0.4849403296078803,
"grad_norm": 0.8975797769749146,
"learning_rate": 3.2639441911211178e-06,
"loss": 0.3467,
"num_tokens": 43378878.0,
"step": 320
},
{
"epoch": 0.49251752225800344,
"grad_norm": 0.8194623317436803,
"learning_rate": 3.205746144731366e-06,
"loss": 0.3396,
"num_tokens": 44074633.0,
"step": 325
},
{
"epoch": 0.5000947149081265,
"grad_norm": 0.8263193992649668,
"learning_rate": 3.1472283355133254e-06,
"loss": 0.349,
"num_tokens": 44758521.0,
"step": 330
},
{
"epoch": 0.5076719075582496,
"grad_norm": 0.8209994838731967,
"learning_rate": 3.0884318210171173e-06,
"loss": 0.3435,
"num_tokens": 45441476.0,
"step": 335
},
{
"epoch": 0.5152491002083728,
"grad_norm": 0.9194054012383176,
"learning_rate": 3.0293978543394207e-06,
"loss": 0.3404,
"num_tokens": 46126861.0,
"step": 340
},
{
"epoch": 0.5228262928584959,
"grad_norm": 0.8692812034927447,
"learning_rate": 2.9701678551792685e-06,
"loss": 0.3425,
"num_tokens": 46817925.0,
"step": 345
},
{
"epoch": 0.530403485508619,
"grad_norm": 0.8067817246185729,
"learning_rate": 2.9107833807769566e-06,
"loss": 0.3377,
"num_tokens": 47497471.0,
"step": 350
},
{
"epoch": 0.5379806781587422,
"grad_norm": 0.8339751108884305,
"learning_rate": 2.851286096756453e-06,
"loss": 0.3438,
"num_tokens": 48172701.0,
"step": 355
},
{
"epoch": 0.5455578708088653,
"grad_norm": 0.7717434957072956,
"learning_rate": 2.7917177478917605e-06,
"loss": 0.3311,
"num_tokens": 48860699.0,
"step": 360
},
{
"epoch": 0.5531350634589884,
"grad_norm": 0.859770404135416,
"learning_rate": 2.7321201288177424e-06,
"loss": 0.3439,
"num_tokens": 49543737.0,
"step": 365
},
{
"epoch": 0.5607122561091116,
"grad_norm": 0.8100249593766239,
"learning_rate": 2.6725350547059682e-06,
"loss": 0.34,
"num_tokens": 50221154.0,
"step": 370
},
{
"epoch": 0.5682894487592347,
"grad_norm": 0.803391445344769,
"learning_rate": 2.6130043319261513e-06,
"loss": 0.3392,
"num_tokens": 50898508.0,
"step": 375
},
{
"epoch": 0.5758666414093578,
"grad_norm": 0.8040693336090444,
"learning_rate": 2.5535697287137585e-06,
"loss": 0.3345,
"num_tokens": 51592671.0,
"step": 380
},
{
"epoch": 0.583443834059481,
"grad_norm": 0.8504004369104251,
"learning_rate": 2.4942729458643772e-06,
"loss": 0.3379,
"num_tokens": 52270525.0,
"step": 385
},
{
"epoch": 0.5910210267096041,
"grad_norm": 0.8460679730282097,
"learning_rate": 2.4351555874754023e-06,
"loss": 0.3344,
"num_tokens": 52945561.0,
"step": 390
},
{
"epoch": 0.5985982193597272,
"grad_norm": 0.8331966726775251,
"learning_rate": 2.376259131755565e-06,
"loss": 0.3354,
"num_tokens": 53633168.0,
"step": 395
},
{
"epoch": 0.6061754120098504,
"grad_norm": 0.9208957482910517,
"learning_rate": 2.3176249019227887e-06,
"loss": 0.3396,
"num_tokens": 54297321.0,
"step": 400
},
{
"epoch": 0.6137526046599735,
"grad_norm": 0.7991681474000126,
"learning_rate": 2.259294037210797e-06,
"loss": 0.3321,
"num_tokens": 54984763.0,
"step": 405
},
{
"epoch": 0.6213297973100966,
"grad_norm": 0.8178882775094793,
"learning_rate": 2.2013074640047984e-06,
"loss": 0.342,
"num_tokens": 55645272.0,
"step": 410
},
{
"epoch": 0.6289069899602198,
"grad_norm": 0.8068516933987062,
"learning_rate": 2.143705867126518e-06,
"loss": 0.3365,
"num_tokens": 56325652.0,
"step": 415
},
{
"epoch": 0.6364841826103429,
"grad_norm": 0.8118975393672736,
"learning_rate": 2.0865296612887215e-06,
"loss": 0.3337,
"num_tokens": 57016551.0,
"step": 420
},
{
"epoch": 0.644061375260466,
"grad_norm": 0.8720525791746454,
"learning_rate": 2.0298189627392366e-06,
"loss": 0.33,
"num_tokens": 57715996.0,
"step": 425
},
{
"epoch": 0.6516385679105892,
"grad_norm": 0.8189240422464783,
"learning_rate": 1.973613561114404e-06,
"loss": 0.3319,
"num_tokens": 58411871.0,
"step": 430
},
{
"epoch": 0.6592157605607123,
"grad_norm": 0.8010090923788271,
"learning_rate": 1.917952891521678e-06,
"loss": 0.3261,
"num_tokens": 59102622.0,
"step": 435
},
{
"epoch": 0.6667929532108354,
"grad_norm": 0.8137824874604472,
"learning_rate": 1.8628760068709694e-06,
"loss": 0.3408,
"num_tokens": 59768546.0,
"step": 440
},
{
"epoch": 0.6743701458609586,
"grad_norm": 0.7822337587794234,
"learning_rate": 1.8084215504741603e-06,
"loss": 0.3289,
"num_tokens": 60465312.0,
"step": 445
},
{
"epoch": 0.6819473385110817,
"grad_norm": 0.7788309919289208,
"learning_rate": 1.7546277289319907e-06,
"loss": 0.3258,
"num_tokens": 61147815.0,
"step": 450
},
{
"epoch": 0.6895245311612048,
"grad_norm": 0.8195670772554525,
"learning_rate": 1.701532285327358e-06,
"loss": 0.3313,
"num_tokens": 61832191.0,
"step": 455
},
{
"epoch": 0.697101723811328,
"grad_norm": 0.8214679319970593,
"learning_rate": 1.6491724727438301e-06,
"loss": 0.3329,
"num_tokens": 62507231.0,
"step": 460
},
{
"epoch": 0.704678916461451,
"grad_norm": 0.7812180367757561,
"learning_rate": 1.5975850281279626e-06,
"loss": 0.3295,
"num_tokens": 63182873.0,
"step": 465
},
{
"epoch": 0.7122561091115741,
"grad_norm": 0.7674580303784487,
"learning_rate": 1.5468061465137335e-06,
"loss": 0.3312,
"num_tokens": 63860394.0,
"step": 470
},
{
"epoch": 0.7198333017616972,
"grad_norm": 0.8336261408594463,
"learning_rate": 1.4968714556272124e-06,
"loss": 0.3278,
"num_tokens": 64541974.0,
"step": 475
},
{
"epoch": 0.7274104944118204,
"grad_norm": 0.7612534657890504,
"learning_rate": 1.4478159908892646e-06,
"loss": 0.3337,
"num_tokens": 65226223.0,
"step": 480
},
{
"epoch": 0.7349876870619435,
"grad_norm": 0.7790959703943752,
"learning_rate": 1.399674170833825e-06,
"loss": 0.3271,
"num_tokens": 65908856.0,
"step": 485
},
{
"epoch": 0.7425648797120666,
"grad_norm": 0.7934687328628046,
"learning_rate": 1.3524797729589945e-06,
"loss": 0.3279,
"num_tokens": 66592993.0,
"step": 490
},
{
"epoch": 0.7501420723621898,
"grad_norm": 0.8457662795541676,
"learning_rate": 1.3062659100279198e-06,
"loss": 0.327,
"num_tokens": 67273074.0,
"step": 495
},
{
"epoch": 0.7577192650123129,
"grad_norm": 0.8315491509901118,
"learning_rate": 1.2610650068360442e-06,
"loss": 0.329,
"step": 500
},
{
"epoch": 0.7577192650123129,
"eval_loss": 0.33532413840293884,
"eval_num_tokens": 67964833.0,
"eval_runtime": 239.906,
"eval_samples_per_second": 9.266,
"eval_steps_per_second": 1.159,
"step": 500
},
{
"epoch": 0.765296457662436,
"grad_norm": 0.7220222867753697,
"learning_rate": 1.2169087774610656e-06,
"loss": 0.3251,
"num_tokens": 68655791.0,
"step": 505
},
{
"epoch": 0.7728736503125592,
"grad_norm": 0.8492025800299583,
"learning_rate": 1.17382820301156e-06,
"loss": 0.3294,
"num_tokens": 69331093.0,
"step": 510
},
{
"epoch": 0.7804508429626823,
"grad_norm": 0.8108134141335065,
"learning_rate": 1.131853509889854e-06,
"loss": 0.3278,
"num_tokens": 70011469.0,
"step": 515
},
{
"epoch": 0.7880280356128054,
"grad_norm": 0.8041767673010345,
"learning_rate": 1.0910141485844363e-06,
"loss": 0.323,
"num_tokens": 70689883.0,
"step": 520
},
{
"epoch": 0.7956052282629286,
"grad_norm": 0.7948666169290667,
"learning_rate": 1.0513387730067626e-06,
"loss": 0.3295,
"num_tokens": 71353521.0,
"step": 525
},
{
"epoch": 0.8031824209130517,
"grad_norm": 0.7987662784926006,
"learning_rate": 1.012855220386953e-06,
"loss": 0.3289,
"num_tokens": 72025801.0,
"step": 530
},
{
"epoch": 0.8107596135631748,
"grad_norm": 0.8124248754570028,
"learning_rate": 9.755904917425054e-07,
"loss": 0.3361,
"num_tokens": 72677000.0,
"step": 535
},
{
"epoch": 0.818336806213298,
"grad_norm": 0.8034403126732426,
"learning_rate": 9.395707329337092e-07,
"loss": 0.3258,
"num_tokens": 73354814.0,
"step": 540
},
{
"epoch": 0.8259139988634211,
"grad_norm": 0.729259511532679,
"learning_rate": 9.048212163190542e-07,
"loss": 0.3229,
"num_tokens": 74043197.0,
"step": 545
},
{
"epoch": 0.8334911915135442,
"grad_norm": 0.760321674214779,
"learning_rate": 8.713663230235226e-07,
"loss": 0.3217,
"num_tokens": 74718226.0,
"step": 550
},
{
"epoch": 0.8410683841636674,
"grad_norm": 0.7260807686457017,
"learning_rate": 8.392295258321817e-07,
"loss": 0.3267,
"num_tokens": 75405876.0,
"step": 555
},
{
"epoch": 0.8486455768137905,
"grad_norm": 0.7326560121754767,
"learning_rate": 8.084333727210933e-07,
"loss": 0.3194,
"num_tokens": 76104574.0,
"step": 560
},
{
"epoch": 0.8562227694639136,
"grad_norm": 0.7295825238005851,
"learning_rate": 7.789994710370951e-07,
"loss": 0.325,
"num_tokens": 76778482.0,
"step": 565
},
{
"epoch": 0.8637999621140368,
"grad_norm": 0.7365331005684703,
"learning_rate": 7.509484723375499e-07,
"loss": 0.326,
"num_tokens": 77448090.0,
"step": 570
},
{
"epoch": 0.8713771547641599,
"grad_norm": 0.7518330331674917,
"learning_rate": 7.243000579006945e-07,
"loss": 0.3192,
"num_tokens": 78127977.0,
"step": 575
},
{
"epoch": 0.878954347414283,
"grad_norm": 0.7696059216951111,
"learning_rate": 6.990729249167704e-07,
"loss": 0.3203,
"num_tokens": 78809677.0,
"step": 580
},
{
"epoch": 0.8865315400644062,
"grad_norm": 0.7540802540794419,
"learning_rate": 6.752847733696091e-07,
"loss": 0.3214,
"num_tokens": 79494335.0,
"step": 585
},
{
"epoch": 0.8941087327145293,
"grad_norm": 0.760684540360902,
"learning_rate": 6.529522936178805e-07,
"loss": 0.3207,
"num_tokens": 80175110.0,
"step": 590
},
{
"epoch": 0.9016859253646524,
"grad_norm": 0.6866096784785415,
"learning_rate": 6.320911546847259e-07,
"loss": 0.3244,
"num_tokens": 80858228.0,
"step": 595
},
{
"epoch": 0.9092631180147756,
"grad_norm": 0.749755946733873,
"learning_rate": 6.127159932639797e-07,
"loss": 0.3246,
"num_tokens": 81528605.0,
"step": 600
},
{
"epoch": 0.9168403106648987,
"grad_norm": 0.7323158098643515,
"learning_rate": 5.948404034507013e-07,
"loss": 0.3229,
"num_tokens": 82199743.0,
"step": 605
},
{
"epoch": 0.9244175033150218,
"grad_norm": 0.7716620129264333,
"learning_rate": 5.784769272032198e-07,
"loss": 0.3273,
"num_tokens": 82865102.0,
"step": 610
},
{
"epoch": 0.931994695965145,
"grad_norm": 0.717119707148909,
"learning_rate": 5.636370455433854e-07,
"loss": 0.3193,
"num_tokens": 83539618.0,
"step": 615
},
{
"epoch": 0.9395718886152681,
"grad_norm": 0.7816104210558041,
"learning_rate": 5.503311705011973e-07,
"loss": 0.3186,
"num_tokens": 84229670.0,
"step": 620
},
{
"epoch": 0.9471490812653912,
"grad_norm": 0.7268049825907208,
"learning_rate": 5.385686378094653e-07,
"loss": 0.3241,
"num_tokens": 84908504.0,
"step": 625
},
{
"epoch": 0.9547262739155143,
"grad_norm": 0.6840492043651534,
"learning_rate": 5.283577003536274e-07,
"loss": 0.3284,
"num_tokens": 85583606.0,
"step": 630
},
{
"epoch": 0.9623034665656375,
"grad_norm": 0.7229941798275764,
"learning_rate": 5.197055223813207e-07,
"loss": 0.3203,
"num_tokens": 86259953.0,
"step": 635
},
{
"epoch": 0.9698806592157606,
"grad_norm": 0.7215194242874737,
"learning_rate": 5.126181744757663e-07,
"loss": 0.3211,
"num_tokens": 86940929.0,
"step": 640
},
{
"epoch": 0.9774578518658837,
"grad_norm": 0.7395561763461616,
"learning_rate": 5.071006292964973e-07,
"loss": 0.3198,
"num_tokens": 87613159.0,
"step": 645
},
{
"epoch": 0.9850350445160069,
"grad_norm": 0.7167293407789844,
"learning_rate": 5.031567580904175e-07,
"loss": 0.3223,
"num_tokens": 88289922.0,
"step": 650
},
{
"epoch": 0.9926122371661299,
"grad_norm": 0.7107427399897064,
"learning_rate": 5.007893279756384e-07,
"loss": 0.3202,
"num_tokens": 88973021.0,
"step": 655
},
{
"epoch": 0.9986739912862285,
"num_tokens": 89507364.0,
"step": 659,
"total_flos": 4.0084130952600617e+18,
"train_loss": 0.3981744751040237,
"train_runtime": 12737.2069,
"train_samples_per_second": 3.315,
"train_steps_per_second": 0.052
}
],
"logging_steps": 5,
"max_steps": 659,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.0084130952600617e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}