{
"best_global_step": 3400,
"best_metric": 0.7816377282142639,
"best_model_checkpoint": "./lfm_kokoro_complete/checkpoint-3400",
"epoch": 2.936096718480138,
"eval_steps": 100,
"global_step": 3400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008635578583765112,
"grad_norm": 5.131196975708008,
"learning_rate": 0.0,
"loss": 2.8308,
"step": 1
},
{
"epoch": 0.008635578583765112,
"grad_norm": 5.2136335372924805,
"learning_rate": 5.172413793103448e-06,
"loss": 2.6503,
"step": 10
},
{
"epoch": 0.017271157167530225,
"grad_norm": 2.5024237632751465,
"learning_rate": 1.091954022988506e-05,
"loss": 2.6409,
"step": 20
},
{
"epoch": 0.025906735751295335,
"grad_norm": 1.3332571983337402,
"learning_rate": 1.6666666666666667e-05,
"loss": 2.505,
"step": 30
},
{
"epoch": 0.03454231433506045,
"grad_norm": 1.121747374534607,
"learning_rate": 2.2413793103448276e-05,
"loss": 2.3877,
"step": 40
},
{
"epoch": 0.04317789291882556,
"grad_norm": 0.5361054539680481,
"learning_rate": 2.8160919540229884e-05,
"loss": 2.2456,
"step": 50
},
{
"epoch": 0.05181347150259067,
"grad_norm": 0.4509966969490051,
"learning_rate": 3.390804597701149e-05,
"loss": 2.18,
"step": 60
},
{
"epoch": 0.06044905008635579,
"grad_norm": 0.3262108266353607,
"learning_rate": 3.965517241379311e-05,
"loss": 2.1533,
"step": 70
},
{
"epoch": 0.0690846286701209,
"grad_norm": 0.31236183643341064,
"learning_rate": 4.5402298850574716e-05,
"loss": 2.1012,
"step": 80
},
{
"epoch": 0.07772020725388601,
"grad_norm": 0.2791730463504791,
"learning_rate": 5.1149425287356324e-05,
"loss": 2.0615,
"step": 90
},
{
"epoch": 0.08635578583765112,
"grad_norm": 0.29012593626976013,
"learning_rate": 5.689655172413794e-05,
"loss": 2.0758,
"step": 100
},
{
"epoch": 0.08635578583765112,
"eval_loss": 2.024885654449463,
"eval_runtime": 74.288,
"eval_samples_per_second": 31.163,
"eval_steps_per_second": 3.904,
"step": 100
},
{
"epoch": 0.09499136442141623,
"grad_norm": 0.27982184290885925,
"learning_rate": 6.264367816091954e-05,
"loss": 1.9746,
"step": 110
},
{
"epoch": 0.10362694300518134,
"grad_norm": 0.3128826320171356,
"learning_rate": 6.839080459770116e-05,
"loss": 2.0059,
"step": 120
},
{
"epoch": 0.11226252158894647,
"grad_norm": 0.29881423711776733,
"learning_rate": 7.413793103448277e-05,
"loss": 2.0007,
"step": 130
},
{
"epoch": 0.12089810017271158,
"grad_norm": 0.3187066316604614,
"learning_rate": 7.988505747126437e-05,
"loss": 1.9892,
"step": 140
},
{
"epoch": 0.12953367875647667,
"grad_norm": 0.2999899983406067,
"learning_rate": 8.563218390804599e-05,
"loss": 1.9454,
"step": 150
},
{
"epoch": 0.1381692573402418,
"grad_norm": 0.32296231389045715,
"learning_rate": 9.137931034482759e-05,
"loss": 1.9215,
"step": 160
},
{
"epoch": 0.14680483592400692,
"grad_norm": 0.3282780051231384,
"learning_rate": 9.71264367816092e-05,
"loss": 1.9189,
"step": 170
},
{
"epoch": 0.15544041450777202,
"grad_norm": 0.3565793037414551,
"learning_rate": 0.0001028735632183908,
"loss": 1.9356,
"step": 180
},
{
"epoch": 0.16407599309153714,
"grad_norm": 0.35819345712661743,
"learning_rate": 0.00010862068965517242,
"loss": 1.877,
"step": 190
},
{
"epoch": 0.17271157167530224,
"grad_norm": 0.38044115900993347,
"learning_rate": 0.00011436781609195404,
"loss": 1.9072,
"step": 200
},
{
"epoch": 0.17271157167530224,
"eval_loss": 1.8915574550628662,
"eval_runtime": 74.2599,
"eval_samples_per_second": 31.174,
"eval_steps_per_second": 3.905,
"step": 200
},
{
"epoch": 0.18134715025906736,
"grad_norm": 0.38403257727622986,
"learning_rate": 0.00012011494252873562,
"loss": 1.9223,
"step": 210
},
{
"epoch": 0.18998272884283246,
"grad_norm": 0.40475621819496155,
"learning_rate": 0.00012586206896551724,
"loss": 1.8787,
"step": 220
},
{
"epoch": 0.19861830742659758,
"grad_norm": 0.37040725350379944,
"learning_rate": 0.00013160919540229887,
"loss": 1.8916,
"step": 230
},
{
"epoch": 0.20725388601036268,
"grad_norm": 0.41001173853874207,
"learning_rate": 0.00013735632183908047,
"loss": 1.8767,
"step": 240
},
{
"epoch": 0.2158894645941278,
"grad_norm": 0.42713987827301025,
"learning_rate": 0.0001431034482758621,
"loss": 1.8756,
"step": 250
},
{
"epoch": 0.22452504317789293,
"grad_norm": 0.42754629254341125,
"learning_rate": 0.00014885057471264367,
"loss": 1.8208,
"step": 260
},
{
"epoch": 0.23316062176165803,
"grad_norm": 0.45471611618995667,
"learning_rate": 0.0001545977011494253,
"loss": 1.8424,
"step": 270
},
{
"epoch": 0.24179620034542315,
"grad_norm": 0.419595330953598,
"learning_rate": 0.0001603448275862069,
"loss": 1.7868,
"step": 280
},
{
"epoch": 0.2504317789291883,
"grad_norm": 0.40803587436676025,
"learning_rate": 0.0001660919540229885,
"loss": 1.8174,
"step": 290
},
{
"epoch": 0.25906735751295334,
"grad_norm": 0.4197799265384674,
"learning_rate": 0.00017183908045977013,
"loss": 1.8143,
"step": 300
},
{
"epoch": 0.25906735751295334,
"eval_loss": 1.8168917894363403,
"eval_runtime": 74.2622,
"eval_samples_per_second": 31.173,
"eval_steps_per_second": 3.905,
"step": 300
},
{
"epoch": 0.26770293609671847,
"grad_norm": 0.494597464799881,
"learning_rate": 0.00017758620689655173,
"loss": 1.8581,
"step": 310
},
{
"epoch": 0.2763385146804836,
"grad_norm": 0.41333019733428955,
"learning_rate": 0.00018333333333333334,
"loss": 1.7674,
"step": 320
},
{
"epoch": 0.2849740932642487,
"grad_norm": 0.38664960861206055,
"learning_rate": 0.00018908045977011494,
"loss": 1.8403,
"step": 330
},
{
"epoch": 0.29360967184801384,
"grad_norm": 0.5136725902557373,
"learning_rate": 0.00019482758620689657,
"loss": 1.8034,
"step": 340
},
{
"epoch": 0.3022452504317789,
"grad_norm": 0.4233579635620117,
"learning_rate": 0.00019999994949995492,
"loss": 1.8085,
"step": 350
},
{
"epoch": 0.31088082901554404,
"grad_norm": 0.47240543365478516,
"learning_rate": 0.0001999938895562612,
"loss": 1.7553,
"step": 360
},
{
"epoch": 0.31951640759930916,
"grad_norm": 0.42770665884017944,
"learning_rate": 0.00019997773030485974,
"loss": 1.81,
"step": 370
},
{
"epoch": 0.3281519861830743,
"grad_norm": 0.41995662450790405,
"learning_rate": 0.00019995147337782283,
"loss": 1.7934,
"step": 380
},
{
"epoch": 0.33678756476683935,
"grad_norm": 0.4032181203365326,
"learning_rate": 0.00019991512142708033,
"loss": 1.7521,
"step": 390
},
{
"epoch": 0.3454231433506045,
"grad_norm": 0.4150594174861908,
"learning_rate": 0.00019986867812415198,
"loss": 1.7813,
"step": 400
},
{
"epoch": 0.3454231433506045,
"eval_loss": 1.749611735343933,
"eval_runtime": 74.4513,
"eval_samples_per_second": 31.094,
"eval_steps_per_second": 3.895,
"step": 400
},
{
"epoch": 0.3540587219343696,
"grad_norm": 0.47208553552627563,
"learning_rate": 0.00019981214815977647,
"loss": 1.7951,
"step": 410
},
{
"epoch": 0.3626943005181347,
"grad_norm": 0.38943392038345337,
"learning_rate": 0.00019974553724343773,
"loss": 1.7224,
"step": 420
},
{
"epoch": 0.37132987910189985,
"grad_norm": 0.39383020997047424,
"learning_rate": 0.00019966885210278822,
"loss": 1.7327,
"step": 430
},
{
"epoch": 0.3799654576856649,
"grad_norm": 0.4155268669128418,
"learning_rate": 0.00019958210048296956,
"loss": 1.7447,
"step": 440
},
{
"epoch": 0.38860103626943004,
"grad_norm": 0.393373042345047,
"learning_rate": 0.00019948529114583013,
"loss": 1.7679,
"step": 450
},
{
"epoch": 0.39723661485319517,
"grad_norm": 0.4065350592136383,
"learning_rate": 0.0001993784338690403,
"loss": 1.7041,
"step": 460
},
{
"epoch": 0.4058721934369603,
"grad_norm": 0.42317506670951843,
"learning_rate": 0.0001992615394451047,
"loss": 1.662,
"step": 470
},
{
"epoch": 0.41450777202072536,
"grad_norm": 0.460857629776001,
"learning_rate": 0.00019913461968027227,
"loss": 1.7228,
"step": 480
},
{
"epoch": 0.4231433506044905,
"grad_norm": 0.47063159942626953,
"learning_rate": 0.00019899768739334393,
"loss": 1.759,
"step": 490
},
{
"epoch": 0.4317789291882556,
"grad_norm": 0.44678857922554016,
"learning_rate": 0.00019885075641437776,
"loss": 1.6796,
"step": 500
},
{
"epoch": 0.4317789291882556,
"eval_loss": 1.684213638305664,
"eval_runtime": 75.8643,
"eval_samples_per_second": 30.515,
"eval_steps_per_second": 3.823,
"step": 500
},
{
"epoch": 0.44041450777202074,
"grad_norm": 0.44266021251678467,
"learning_rate": 0.00019869384158329223,
"loss": 1.7078,
"step": 510
},
{
"epoch": 0.44905008635578586,
"grad_norm": 0.5028413534164429,
"learning_rate": 0.00019852695874836735,
"loss": 1.6464,
"step": 520
},
{
"epoch": 0.45768566493955093,
"grad_norm": 0.40895670652389526,
"learning_rate": 0.00019835012476464406,
"loss": 1.6559,
"step": 530
},
{
"epoch": 0.46632124352331605,
"grad_norm": 0.5382914543151855,
"learning_rate": 0.00019816335749222187,
"loss": 1.6413,
"step": 540
},
{
"epoch": 0.4749568221070812,
"grad_norm": 0.5184707045555115,
"learning_rate": 0.00019796667579445492,
"loss": 1.6706,
"step": 550
},
{
"epoch": 0.4835924006908463,
"grad_norm": 0.4206818640232086,
"learning_rate": 0.00019776009953604692,
"loss": 1.6429,
"step": 560
},
{
"epoch": 0.49222797927461137,
"grad_norm": 0.5006670355796814,
"learning_rate": 0.0001975436495810447,
"loss": 1.6556,
"step": 570
},
{
"epoch": 0.5008635578583766,
"grad_norm": 0.4992609918117523,
"learning_rate": 0.0001973173477907311,
"loss": 1.6296,
"step": 580
},
{
"epoch": 0.5094991364421416,
"grad_norm": 0.488678902387619,
"learning_rate": 0.0001970812170214169,
"loss": 1.6366,
"step": 590
},
{
"epoch": 0.5181347150259067,
"grad_norm": 0.5439748167991638,
"learning_rate": 0.00019683528112213235,
"loss": 1.6546,
"step": 600
},
{
"epoch": 0.5181347150259067,
"eval_loss": 1.6274060010910034,
"eval_runtime": 74.2854,
"eval_samples_per_second": 31.164,
"eval_steps_per_second": 3.904,
"step": 600
},
{
"epoch": 0.5267702936096719,
"grad_norm": 0.49797549843788147,
"learning_rate": 0.00019657956493221844,
"loss": 1.6206,
"step": 610
},
{
"epoch": 0.5354058721934369,
"grad_norm": 0.5434479117393494,
"learning_rate": 0.00019631409427881832,
"loss": 1.6198,
"step": 620
},
{
"epoch": 0.5440414507772021,
"grad_norm": 0.48510608077049255,
"learning_rate": 0.00019603889597426838,
"loss": 1.5839,
"step": 630
},
{
"epoch": 0.5526770293609672,
"grad_norm": 0.4494125545024872,
"learning_rate": 0.00019575399781339065,
"loss": 1.6295,
"step": 640
},
{
"epoch": 0.5613126079447323,
"grad_norm": 0.45310178399086,
"learning_rate": 0.00019545942857068527,
"loss": 1.6039,
"step": 650
},
{
"epoch": 0.5699481865284974,
"grad_norm": 0.5253885984420776,
"learning_rate": 0.00019515521799742444,
"loss": 1.5997,
"step": 660
},
{
"epoch": 0.5785837651122625,
"grad_norm": 0.48614782094955444,
"learning_rate": 0.00019484139681864745,
"loss": 1.5761,
"step": 670
},
{
"epoch": 0.5872193436960277,
"grad_norm": 0.502662718296051,
"learning_rate": 0.00019451799673005757,
"loss": 1.5793,
"step": 680
},
{
"epoch": 0.5958549222797928,
"grad_norm": 0.5655169486999512,
"learning_rate": 0.00019418505039482068,
"loss": 1.5643,
"step": 690
},
{
"epoch": 0.6044905008635578,
"grad_norm": 0.507977306842804,
"learning_rate": 0.00019384259144026653,
"loss": 1.5549,
"step": 700
},
{
"epoch": 0.6044905008635578,
"eval_loss": 1.5656999349594116,
"eval_runtime": 74.2824,
"eval_samples_per_second": 31.165,
"eval_steps_per_second": 3.904,
"step": 700
},
{
"epoch": 0.613126079447323,
"grad_norm": 0.5120140910148621,
"learning_rate": 0.00019349065445449214,
"loss": 1.5388,
"step": 710
},
{
"epoch": 0.6217616580310881,
"grad_norm": 0.5686282515525818,
"learning_rate": 0.00019312927498286867,
"loss": 1.5975,
"step": 720
},
{
"epoch": 0.6303972366148531,
"grad_norm": 0.5706737637519836,
"learning_rate": 0.00019275848952445115,
"loss": 1.5062,
"step": 730
},
{
"epoch": 0.6390328151986183,
"grad_norm": 0.4991269111633301,
"learning_rate": 0.0001923783355282923,
"loss": 1.5513,
"step": 740
},
{
"epoch": 0.6476683937823834,
"grad_norm": 0.6073980927467346,
"learning_rate": 0.00019198885138966009,
"loss": 1.5004,
"step": 750
},
{
"epoch": 0.6563039723661486,
"grad_norm": 0.5857861042022705,
"learning_rate": 0.00019159007644615981,
"loss": 1.5607,
"step": 760
},
{
"epoch": 0.6649395509499136,
"grad_norm": 0.5783904194831848,
"learning_rate": 0.00019118205097376113,
"loss": 1.5616,
"step": 770
},
{
"epoch": 0.6735751295336787,
"grad_norm": 0.5480038523674011,
"learning_rate": 0.00019076481618273018,
"loss": 1.5609,
"step": 780
},
{
"epoch": 0.6822107081174439,
"grad_norm": 0.6719979047775269,
"learning_rate": 0.00019033841421346734,
"loss": 1.5448,
"step": 790
},
{
"epoch": 0.690846286701209,
"grad_norm": 0.6396545171737671,
"learning_rate": 0.00018990288813225105,
"loss": 1.4898,
"step": 800
},
{
"epoch": 0.690846286701209,
"eval_loss": 1.5024266242980957,
"eval_runtime": 74.2858,
"eval_samples_per_second": 31.163,
"eval_steps_per_second": 3.904,
"step": 800
},
{
"epoch": 0.6994818652849741,
"grad_norm": 0.6165493130683899,
"learning_rate": 0.0001894582819268883,
"loss": 1.4581,
"step": 810
},
{
"epoch": 0.7081174438687392,
"grad_norm": 0.5979147553443909,
"learning_rate": 0.00018900464050227169,
"loss": 1.5436,
"step": 820
},
{
"epoch": 0.7167530224525043,
"grad_norm": 0.6082155108451843,
"learning_rate": 0.0001885420096758443,
"loss": 1.5205,
"step": 830
},
{
"epoch": 0.7253886010362695,
"grad_norm": 0.6365352272987366,
"learning_rate": 0.0001880704361729719,
"loss": 1.5159,
"step": 840
},
{
"epoch": 0.7340241796200345,
"grad_norm": 0.6347801685333252,
"learning_rate": 0.000187589967622224,
"loss": 1.4908,
"step": 850
},
{
"epoch": 0.7426597582037997,
"grad_norm": 0.5811082720756531,
"learning_rate": 0.00018710065255056314,
"loss": 1.4738,
"step": 860
},
{
"epoch": 0.7512953367875648,
"grad_norm": 0.6715326905250549,
"learning_rate": 0.00018660254037844388,
"loss": 1.4448,
"step": 870
},
{
"epoch": 0.7599309153713298,
"grad_norm": 0.661300778388977,
"learning_rate": 0.00018609568141482132,
"loss": 1.4712,
"step": 880
},
{
"epoch": 0.768566493955095,
"grad_norm": 0.5695850253105164,
"learning_rate": 0.00018558012685206997,
"loss": 1.4348,
"step": 890
},
{
"epoch": 0.7772020725388601,
"grad_norm": 0.610674262046814,
"learning_rate": 0.00018505592876081318,
"loss": 1.504,
"step": 900
},
{
"epoch": 0.7772020725388601,
"eval_loss": 1.4339938163757324,
"eval_runtime": 74.5243,
"eval_samples_per_second": 31.064,
"eval_steps_per_second": 3.891,
"step": 900
},
{
"epoch": 0.7858376511226253,
"grad_norm": 0.6574162244796753,
"learning_rate": 0.00018452314008466432,
"loss": 1.4541,
"step": 910
},
{
"epoch": 0.7944732297063903,
"grad_norm": 0.622951090335846,
"learning_rate": 0.00018398181463487933,
"loss": 1.4335,
"step": 920
},
{
"epoch": 0.8031088082901554,
"grad_norm": 0.7158159017562866,
"learning_rate": 0.0001834320070849219,
"loss": 1.3933,
"step": 930
},
{
"epoch": 0.8117443868739206,
"grad_norm": 0.6937190294265747,
"learning_rate": 0.0001828737729649414,
"loss": 1.4129,
"step": 940
},
{
"epoch": 0.8203799654576857,
"grad_norm": 0.6910032629966736,
"learning_rate": 0.00018230716865616452,
"loss": 1.4415,
"step": 950
},
{
"epoch": 0.8290155440414507,
"grad_norm": 0.7496052980422974,
"learning_rate": 0.00018173225138520065,
"loss": 1.3115,
"step": 960
},
{
"epoch": 0.8376511226252159,
"grad_norm": 0.8548805117607117,
"learning_rate": 0.00018114907921826215,
"loss": 1.3782,
"step": 970
},
{
"epoch": 0.846286701208981,
"grad_norm": 0.7024548053741455,
"learning_rate": 0.0001805577110552997,
"loss": 1.3649,
"step": 980
},
{
"epoch": 0.8549222797927462,
"grad_norm": 0.6912006139755249,
"learning_rate": 0.0001799582066240534,
"loss": 1.3884,
"step": 990
},
{
"epoch": 0.8635578583765112,
"grad_norm": 0.7504778504371643,
"learning_rate": 0.0001793506264740203,
"loss": 1.4177,
"step": 1000
},
{
"epoch": 0.8635578583765112,
"eval_loss": 1.371172308921814,
"eval_runtime": 74.2999,
"eval_samples_per_second": 31.157,
"eval_steps_per_second": 3.903,
"step": 1000
},
{
"epoch": 0.8721934369602763,
"grad_norm": 0.7364081740379333,
"learning_rate": 0.00017873503197033902,
"loss": 1.3732,
"step": 1010
},
{
"epoch": 0.8808290155440415,
"grad_norm": 0.7568293809890747,
"learning_rate": 0.00017811148528759183,
"loss": 1.3572,
"step": 1020
},
{
"epoch": 0.8894645941278065,
"grad_norm": 0.8201608657836914,
"learning_rate": 0.00017748004940352518,
"loss": 1.3735,
"step": 1030
},
{
"epoch": 0.8981001727115717,
"grad_norm": 0.7080292701721191,
"learning_rate": 0.00017684078809268887,
"loss": 1.3454,
"step": 1040
},
{
"epoch": 0.9067357512953368,
"grad_norm": 0.870185911655426,
"learning_rate": 0.00017619376591999493,
"loss": 1.3371,
"step": 1050
},
{
"epoch": 0.9153713298791019,
"grad_norm": 0.767082691192627,
"learning_rate": 0.00017553904823419667,
"loss": 1.3524,
"step": 1060
},
{
"epoch": 0.924006908462867,
"grad_norm": 0.6791857481002808,
"learning_rate": 0.00017487670116128832,
"loss": 1.3515,
"step": 1070
},
{
"epoch": 0.9326424870466321,
"grad_norm": 0.897812008857727,
"learning_rate": 0.0001742067915978266,
"loss": 1.3075,
"step": 1080
},
{
"epoch": 0.9412780656303973,
"grad_norm": 0.8470781445503235,
"learning_rate": 0.00017352938720417398,
"loss": 1.2876,
"step": 1090
},
{
"epoch": 0.9499136442141624,
"grad_norm": 0.7665865421295166,
"learning_rate": 0.0001728445563976652,
"loss": 1.4049,
"step": 1100
},
{
"epoch": 0.9499136442141624,
"eval_loss": 1.3132154941558838,
"eval_runtime": 74.4946,
"eval_samples_per_second": 31.076,
"eval_steps_per_second": 3.893,
"step": 1100
},
{
"epoch": 0.9585492227979274,
"grad_norm": 0.709002673625946,
"learning_rate": 0.0001721523683456972,
"loss": 1.3671,
"step": 1110
},
{
"epoch": 0.9671848013816926,
"grad_norm": 0.7099783420562744,
"learning_rate": 0.00017145289295874302,
"loss": 1.3471,
"step": 1120
},
{
"epoch": 0.9758203799654577,
"grad_norm": 0.6939783096313477,
"learning_rate": 0.00017074620088329122,
"loss": 1.3012,
"step": 1130
},
{
"epoch": 0.9844559585492227,
"grad_norm": 0.8194535374641418,
"learning_rate": 0.00017003236349471035,
"loss": 1.2853,
"step": 1140
},
{
"epoch": 0.9930915371329879,
"grad_norm": 0.7694395184516907,
"learning_rate": 0.00016931145289004023,
"loss": 1.3093,
"step": 1150
},
{
"epoch": 1.001727115716753,
"grad_norm": 0.7333533763885498,
"learning_rate": 0.0001685835418807103,
"loss": 1.3436,
"step": 1160
},
{
"epoch": 1.0103626943005182,
"grad_norm": 0.7282711863517761,
"learning_rate": 0.00016784870398518545,
"loss": 1.3019,
"step": 1170
},
{
"epoch": 1.0189982728842832,
"grad_norm": 0.8324429392814636,
"learning_rate": 0.00016710701342154106,
"loss": 1.2171,
"step": 1180
},
{
"epoch": 1.0276338514680483,
"grad_norm": 0.7838461995124817,
"learning_rate": 0.00016635854509996668,
"loss": 1.2805,
"step": 1190
},
{
"epoch": 1.0362694300518134,
"grad_norm": 0.9009427428245544,
"learning_rate": 0.00016560337461520036,
"loss": 1.2174,
"step": 1200
},
{
"epoch": 1.0362694300518134,
"eval_loss": 1.247739315032959,
"eval_runtime": 74.3416,
"eval_samples_per_second": 31.14,
"eval_steps_per_second": 3.901,
"step": 1200
},
{
"epoch": 1.0449050086355787,
"grad_norm": 0.817688524723053,
"learning_rate": 0.00016484157823889363,
"loss": 1.3382,
"step": 1210
},
{
"epoch": 1.0535405872193437,
"grad_norm": 0.9377408623695374,
"learning_rate": 0.00016407323291190803,
"loss": 1.187,
"step": 1220
},
{
"epoch": 1.0621761658031088,
"grad_norm": 0.7849322557449341,
"learning_rate": 0.00016329841623654434,
"loss": 1.2647,
"step": 1230
},
{
"epoch": 1.0708117443868739,
"grad_norm": 0.8397180438041687,
"learning_rate": 0.00016251720646870443,
"loss": 1.2102,
"step": 1240
},
{
"epoch": 1.079447322970639,
"grad_norm": 0.9595755934715271,
"learning_rate": 0.00016172968250998792,
"loss": 1.1938,
"step": 1250
},
{
"epoch": 1.0880829015544042,
"grad_norm": 0.7337958216667175,
"learning_rate": 0.00016093592389972286,
"loss": 1.2553,
"step": 1260
},
{
"epoch": 1.0967184801381693,
"grad_norm": 0.7563393115997314,
"learning_rate": 0.0001601360108069324,
"loss": 1.2577,
"step": 1270
},
{
"epoch": 1.1053540587219344,
"grad_norm": 0.8453429937362671,
"learning_rate": 0.0001593300240222379,
"loss": 1.2466,
"step": 1280
},
{
"epoch": 1.1139896373056994,
"grad_norm": 0.8459578156471252,
"learning_rate": 0.00015851804494969893,
"loss": 1.2145,
"step": 1290
},
{
"epoch": 1.1226252158894645,
"grad_norm": 0.9956552982330322,
"learning_rate": 0.00015770015559859172,
"loss": 1.1838,
"step": 1300
},
{
"epoch": 1.1226252158894645,
"eval_loss": 1.1956804990768433,
"eval_runtime": 74.2605,
"eval_samples_per_second": 31.174,
"eval_steps_per_second": 3.905,
"step": 1300
},
{
"epoch": 1.1312607944732298,
"grad_norm": 1.0404267311096191,
"learning_rate": 0.00015687643857512616,
"loss": 1.2361,
"step": 1310
},
{
"epoch": 1.1398963730569949,
"grad_norm": 1.0246553421020508,
"learning_rate": 0.00015604697707410255,
"loss": 1.1873,
"step": 1320
},
{
"epoch": 1.14853195164076,
"grad_norm": 0.8831927180290222,
"learning_rate": 0.0001552118548705094,
"loss": 1.1783,
"step": 1330
},
{
"epoch": 1.157167530224525,
"grad_norm": 0.9147486686706543,
"learning_rate": 0.0001543711563110616,
"loss": 1.1853,
"step": 1340
},
{
"epoch": 1.16580310880829,
"grad_norm": 0.9496821165084839,
"learning_rate": 0.000153524966305682,
"loss": 1.1501,
"step": 1350
},
{
"epoch": 1.1744386873920551,
"grad_norm": 0.9167485237121582,
"learning_rate": 0.00015267337031892527,
"loss": 1.2301,
"step": 1360
},
{
"epoch": 1.1830742659758204,
"grad_norm": 0.861179769039154,
"learning_rate": 0.0001518164543613462,
"loss": 1.1827,
"step": 1370
},
{
"epoch": 1.1917098445595855,
"grad_norm": 1.0012174844741821,
"learning_rate": 0.00015095430498081257,
"loss": 1.1598,
"step": 1380
},
{
"epoch": 1.2003454231433506,
"grad_norm": 0.9084812998771667,
"learning_rate": 0.000150087009253764,
"loss": 1.1446,
"step": 1390
},
{
"epoch": 1.2089810017271156,
"grad_norm": 0.9342795610427856,
"learning_rate": 0.0001492146547764172,
"loss": 1.1408,
"step": 1400
},
{
"epoch": 1.2089810017271156,
"eval_loss": 1.1498360633850098,
"eval_runtime": 74.2845,
"eval_samples_per_second": 31.164,
"eval_steps_per_second": 3.904,
"step": 1400
},
{
"epoch": 1.2176165803108807,
"grad_norm": 1.0500714778900146,
"learning_rate": 0.00014833732965591887,
"loss": 1.1475,
"step": 1410
},
{
"epoch": 1.226252158894646,
"grad_norm": 1.031998872756958,
"learning_rate": 0.00014745512250144695,
"loss": 1.121,
"step": 1420
},
{
"epoch": 1.234887737478411,
"grad_norm": 1.0070405006408691,
"learning_rate": 0.00014656812241526117,
"loss": 1.1167,
"step": 1430
},
{
"epoch": 1.2435233160621761,
"grad_norm": 1.0366291999816895,
"learning_rate": 0.0001456764189837037,
"loss": 1.1365,
"step": 1440
},
{
"epoch": 1.2521588946459412,
"grad_norm": 0.9328962564468384,
"learning_rate": 0.000144780102268151,
"loss": 1.1804,
"step": 1450
},
{
"epoch": 1.2607944732297063,
"grad_norm": 0.875531017780304,
"learning_rate": 0.000143879262795918,
"loss": 1.1061,
"step": 1460
},
{
"epoch": 1.2694300518134716,
"grad_norm": 1.023848533630371,
"learning_rate": 0.00014297399155111432,
"loss": 1.0955,
"step": 1470
},
{
"epoch": 1.2780656303972366,
"grad_norm": 0.9239136576652527,
"learning_rate": 0.00014206437996545554,
"loss": 1.1792,
"step": 1480
},
{
"epoch": 1.2867012089810017,
"grad_norm": 0.9566736221313477,
"learning_rate": 0.0001411505199090283,
"loss": 1.1599,
"step": 1490
},
{
"epoch": 1.2953367875647668,
"grad_norm": 0.8936079740524292,
"learning_rate": 0.00014023250368101157,
"loss": 1.0861,
"step": 1500
},
{
"epoch": 1.2953367875647668,
"eval_loss": 1.0975605249404907,
"eval_runtime": 74.2879,
"eval_samples_per_second": 31.163,
"eval_steps_per_second": 3.904,
"step": 1500
},
{
"epoch": 1.3039723661485318,
"grad_norm": 0.7882747650146484,
"learning_rate": 0.00013931042400035462,
"loss": 1.0991,
"step": 1510
},
{
"epoch": 1.3126079447322971,
"grad_norm": 1.0932565927505493,
"learning_rate": 0.00013838437399641226,
"loss": 1.1312,
"step": 1520
},
{
"epoch": 1.3212435233160622,
"grad_norm": 0.849192202091217,
"learning_rate": 0.00013745444719953908,
"loss": 1.1094,
"step": 1530
},
{
"epoch": 1.3298791018998273,
"grad_norm": 1.168214201927185,
"learning_rate": 0.0001365207375316428,
"loss": 1.0642,
"step": 1540
},
{
"epoch": 1.3385146804835923,
"grad_norm": 0.8027725219726562,
"learning_rate": 0.00013558333929669826,
"loss": 1.0682,
"step": 1550
},
{
"epoch": 1.3471502590673574,
"grad_norm": 0.9536592960357666,
"learning_rate": 0.0001346423471712228,
"loss": 1.1241,
"step": 1560
},
{
"epoch": 1.3557858376511227,
"grad_norm": 1.0561705827713013,
"learning_rate": 0.00013369785619471398,
"loss": 1.1582,
"step": 1570
},
{
"epoch": 1.3644214162348878,
"grad_norm": 1.3560823202133179,
"learning_rate": 0.0001327499617600508,
"loss": 1.0265,
"step": 1580
},
{
"epoch": 1.3730569948186528,
"grad_norm": 1.0698766708374023,
"learning_rate": 0.00013179875960385885,
"loss": 1.0433,
"step": 1590
},
{
"epoch": 1.381692573402418,
"grad_norm": 1.011797308921814,
"learning_rate": 0.00013084434579684114,
"loss": 1.0428,
"step": 1600
},
{
"epoch": 1.381692573402418,
"eval_loss": 1.050079345703125,
"eval_runtime": 74.2925,
"eval_samples_per_second": 31.161,
"eval_steps_per_second": 3.903,
"step": 1600
},
{
"epoch": 1.390328151986183,
"grad_norm": 1.052328109741211,
"learning_rate": 0.00012988681673407502,
"loss": 1.0955,
"step": 1610
},
{
"epoch": 1.3989637305699483,
"grad_norm": 0.975304126739502,
"learning_rate": 0.0001289262691252763,
"loss": 1.0776,
"step": 1620
},
{
"epoch": 1.4075993091537133,
"grad_norm": 0.9506198763847351,
"learning_rate": 0.00012796279998503174,
"loss": 1.0708,
"step": 1630
},
{
"epoch": 1.4162348877374784,
"grad_norm": 0.9697166085243225,
"learning_rate": 0.0001269965066230005,
"loss": 1.0098,
"step": 1640
},
{
"epoch": 1.4248704663212435,
"grad_norm": 1.0433659553527832,
"learning_rate": 0.00012602748663408613,
"loss": 1.0346,
"step": 1650
},
{
"epoch": 1.4335060449050085,
"grad_norm": 1.0530465841293335,
"learning_rate": 0.00012505583788857924,
"loss": 1.1224,
"step": 1660
},
{
"epoch": 1.4421416234887738,
"grad_norm": 0.9902591705322266,
"learning_rate": 0.0001240816585222731,
"loss": 1.1215,
"step": 1670
},
{
"epoch": 1.450777202072539,
"grad_norm": 0.9624248147010803,
"learning_rate": 0.00012310504692655166,
"loss": 1.028,
"step": 1680
},
{
"epoch": 1.459412780656304,
"grad_norm": 1.3916789293289185,
"learning_rate": 0.0001221261017384522,
"loss": 1.0322,
"step": 1690
},
{
"epoch": 1.468048359240069,
"grad_norm": 1.3031835556030273,
"learning_rate": 0.00012114492183070323,
"loss": 0.9959,
"step": 1700
},
{
"epoch": 1.468048359240069,
"eval_loss": 1.0126487016677856,
"eval_runtime": 74.2735,
"eval_samples_per_second": 31.169,
"eval_steps_per_second": 3.904,
"step": 1700
},
{
"epoch": 1.4766839378238341,
"grad_norm": 0.9306958913803101,
"learning_rate": 0.00012016160630173807,
"loss": 1.0158,
"step": 1710
},
{
"epoch": 1.4853195164075994,
"grad_norm": 1.1065701246261597,
"learning_rate": 0.00011917625446568626,
"loss": 1.0134,
"step": 1720
},
{
"epoch": 1.4939550949913645,
"grad_norm": 0.9095447659492493,
"learning_rate": 0.00011818896584234287,
"loss": 1.0405,
"step": 1730
},
{
"epoch": 1.5025906735751295,
"grad_norm": 1.1259651184082031,
"learning_rate": 0.00011719984014711693,
"loss": 1.0454,
"step": 1740
},
{
"epoch": 1.5112262521588946,
"grad_norm": 1.0899256467819214,
"learning_rate": 0.00011620897728096047,
"loss": 1.0925,
"step": 1750
},
{
"epoch": 1.5198618307426597,
"grad_norm": 1.173726201057434,
"learning_rate": 0.00011521647732027843,
"loss": 1.0111,
"step": 1760
},
{
"epoch": 1.528497409326425,
"grad_norm": 0.9733538031578064,
"learning_rate": 0.00011422244050682097,
"loss": 1.0432,
"step": 1770
},
{
"epoch": 1.5371329879101898,
"grad_norm": 1.2745634317398071,
"learning_rate": 0.00011322696723755935,
"loss": 1.035,
"step": 1780
},
{
"epoch": 1.545768566493955,
"grad_norm": 0.9993700385093689,
"learning_rate": 0.00011223015805454573,
"loss": 1.0128,
"step": 1790
},
{
"epoch": 1.5544041450777202,
"grad_norm": 1.0131609439849854,
"learning_rate": 0.00011123211363475863,
"loss": 1.0223,
"step": 1800
},
{
"epoch": 1.5544041450777202,
"eval_loss": 0.9773589372634888,
"eval_runtime": 74.3225,
"eval_samples_per_second": 31.148,
"eval_steps_per_second": 3.902,
"step": 1800
},
{
"epoch": 1.5630397236614852,
"grad_norm": 0.9971020221710205,
"learning_rate": 0.00011023293477993446,
"loss": 1.0477,
"step": 1810
},
{
"epoch": 1.5716753022452505,
"grad_norm": 0.9657288193702698,
"learning_rate": 0.00010923272240638676,
"loss": 1.0412,
"step": 1820
},
{
"epoch": 1.5803108808290154,
"grad_norm": 1.0480608940124512,
"learning_rate": 0.00010823157753481367,
"loss": 1.0009,
"step": 1830
},
{
"epoch": 1.5889464594127807,
"grad_norm": 1.0142576694488525,
"learning_rate": 0.00010722960128009491,
"loss": 1.0039,
"step": 1840
},
{
"epoch": 1.5975820379965457,
"grad_norm": 1.158823847770691,
"learning_rate": 0.00010622689484107935,
"loss": 1.033,
"step": 1850
},
{
"epoch": 1.6062176165803108,
"grad_norm": 1.025612473487854,
"learning_rate": 0.00010522355949036386,
"loss": 0.9911,
"step": 1860
},
{
"epoch": 1.614853195164076,
"grad_norm": 1.2156877517700195,
"learning_rate": 0.00010421969656406495,
"loss": 0.9672,
"step": 1870
},
{
"epoch": 1.623488773747841,
"grad_norm": 1.1628742218017578,
"learning_rate": 0.00010321540745158382,
"loss": 0.9499,
"step": 1880
},
{
"epoch": 1.6321243523316062,
"grad_norm": 0.9938153624534607,
"learning_rate": 0.00010221079358536619,
"loss": 0.972,
"step": 1890
},
{
"epoch": 1.6407599309153713,
"grad_norm": 1.0305359363555908,
"learning_rate": 0.00010120595643065769,
"loss": 0.9733,
"step": 1900
},
{
"epoch": 1.6407599309153713,
"eval_loss": 0.9481803774833679,
"eval_runtime": 74.2826,
"eval_samples_per_second": 31.165,
"eval_steps_per_second": 3.904,
"step": 1900
},
{
"epoch": 1.6493955094991364,
"grad_norm": 1.048543930053711,
"learning_rate": 0.00010020099747525586,
"loss": 0.9864,
"step": 1910
},
{
"epoch": 1.6580310880829017,
"grad_norm": 0.9962440729141235,
"learning_rate": 9.919601821926009e-05,
"loss": 0.9375,
"step": 1920
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.939709484577179,
"learning_rate": 9.819112016482001e-05,
"loss": 1.0237,
"step": 1930
},
{
"epoch": 1.6753022452504318,
"grad_norm": 0.9809345602989197,
"learning_rate": 9.718640480588409e-05,
"loss": 0.9283,
"step": 1940
},
{
"epoch": 1.6839378238341969,
"grad_norm": 1.0643101930618286,
"learning_rate": 9.618197361794854e-05,
"loss": 0.9252,
"step": 1950
},
{
"epoch": 1.692573402417962,
"grad_norm": 1.0364254713058472,
"learning_rate": 9.517792804780867e-05,
"loss": 0.9815,
"step": 1960
},
{
"epoch": 1.7012089810017272,
"grad_norm": 1.0348941087722778,
"learning_rate": 9.417436950331256e-05,
"loss": 0.9443,
"step": 1970
},
{
"epoch": 1.709844559585492,
"grad_norm": 0.9953681230545044,
"learning_rate": 9.31713993431191e-05,
"loss": 0.8284,
"step": 1980
},
{
"epoch": 1.7184801381692574,
"grad_norm": 1.092307209968567,
"learning_rate": 9.216911886646085e-05,
"loss": 0.9878,
"step": 1990
},
{
"epoch": 1.7271157167530224,
"grad_norm": 1.0429028272628784,
"learning_rate": 9.116762930291282e-05,
"loss": 0.9367,
"step": 2000
},
{
"epoch": 1.7271157167530224,
"eval_loss": 0.9206886887550354,
"eval_runtime": 76.0782,
"eval_samples_per_second": 30.429,
"eval_steps_per_second": 3.812,
"step": 2000
},
{
"epoch": 1.7357512953367875,
"grad_norm": 1.346091389656067,
"learning_rate": 9.016703180216834e-05,
"loss": 0.9867,
"step": 2010
},
{
"epoch": 1.7443868739205528,
"grad_norm": 1.3426605463027954,
"learning_rate": 8.916742742382316e-05,
"loss": 0.9452,
"step": 2020
},
{
"epoch": 1.7530224525043177,
"grad_norm": 1.1281945705413818,
"learning_rate": 8.816891712716834e-05,
"loss": 0.9285,
"step": 2030
},
{
"epoch": 1.761658031088083,
"grad_norm": 1.0097074508666992,
"learning_rate": 8.717160176099358e-05,
"loss": 0.9402,
"step": 2040
},
{
"epoch": 1.770293609671848,
"grad_norm": 0.9986193180084229,
"learning_rate": 8.617558205340144e-05,
"loss": 0.9679,
"step": 2050
},
{
"epoch": 1.778929188255613,
"grad_norm": 0.9868291020393372,
"learning_rate": 8.518095860163395e-05,
"loss": 0.9703,
"step": 2060
},
{
"epoch": 1.7875647668393784,
"grad_norm": 1.131056785583496,
"learning_rate": 8.418783186191236e-05,
"loss": 0.9633,
"step": 2070
},
{
"epoch": 1.7962003454231432,
"grad_norm": 1.214119791984558,
"learning_rate": 8.31963021392911e-05,
"loss": 1.0013,
"step": 2080
},
{
"epoch": 1.8048359240069085,
"grad_norm": 1.0295566320419312,
"learning_rate": 8.220646957752716e-05,
"loss": 0.924,
"step": 2090
},
{
"epoch": 1.8134715025906736,
"grad_norm": 1.0361146926879883,
"learning_rate": 8.121843414896547e-05,
"loss": 0.9298,
"step": 2100
},
{
"epoch": 1.8134715025906736,
"eval_loss": 0.8968186974525452,
"eval_runtime": 74.3145,
"eval_samples_per_second": 31.151,
"eval_steps_per_second": 3.902,
"step": 2100
},
{
"epoch": 1.8221070811744386,
"grad_norm": 1.1014063358306885,
"learning_rate": 8.023229564444188e-05,
"loss": 0.868,
"step": 2110
},
{
"epoch": 1.830742659758204,
"grad_norm": 1.1780112981796265,
"learning_rate": 7.924815366320434e-05,
"loss": 0.904,
"step": 2120
},
{
"epoch": 1.8393782383419688,
"grad_norm": 1.1275811195373535,
"learning_rate": 7.826610760285343e-05,
"loss": 0.8838,
"step": 2130
},
{
"epoch": 1.848013816925734,
"grad_norm": 1.2763252258300781,
"learning_rate": 7.728625664930336e-05,
"loss": 0.8688,
"step": 2140
},
{
"epoch": 1.8566493955094991,
"grad_norm": 1.1182912588119507,
"learning_rate": 7.630869976676413e-05,
"loss": 0.9025,
"step": 2150
},
{
"epoch": 1.8652849740932642,
"grad_norm": 1.165228247642517,
"learning_rate": 7.533353568774634e-05,
"loss": 0.9962,
"step": 2160
},
{
"epoch": 1.8739205526770295,
"grad_norm": 1.0472362041473389,
"learning_rate": 7.436086290308919e-05,
"loss": 0.9682,
"step": 2170
},
{
"epoch": 1.8825561312607944,
"grad_norm": 1.0311408042907715,
"learning_rate": 7.339077965201305e-05,
"loss": 0.9335,
"step": 2180
},
{
"epoch": 1.8911917098445596,
"grad_norm": 1.0582579374313354,
"learning_rate": 7.242338391219734e-05,
"loss": 0.985,
"step": 2190
},
{
"epoch": 1.8998272884283247,
"grad_norm": 0.923521876335144,
"learning_rate": 7.145877338988487e-05,
"loss": 0.9738,
"step": 2200
},
{
"epoch": 1.8998272884283247,
"eval_loss": 0.872840166091919,
"eval_runtime": 74.2854,
"eval_samples_per_second": 31.164,
"eval_steps_per_second": 3.904,
"step": 2200
},
{
"epoch": 1.9084628670120898,
"grad_norm": 1.0986098051071167,
"learning_rate": 7.049704551001358e-05,
"loss": 0.9313,
"step": 2210
},
{
"epoch": 1.917098445595855,
"grad_norm": 1.050269365310669,
"learning_rate": 6.953829740637662e-05,
"loss": 0.8601,
"step": 2220
},
{
"epoch": 1.92573402417962,
"grad_norm": 1.2400585412979126,
"learning_rate": 6.858262591181206e-05,
"loss": 0.8907,
"step": 2230
},
{
"epoch": 1.9343696027633852,
"grad_norm": 1.0596503019332886,
"learning_rate": 6.763012754842277e-05,
"loss": 0.9053,
"step": 2240
},
{
"epoch": 1.9430051813471503,
"grad_norm": 0.9167270064353943,
"learning_rate": 6.668089851782769e-05,
"loss": 0.9776,
"step": 2250
},
{
"epoch": 1.9516407599309153,
"grad_norm": 1.2059139013290405,
"learning_rate": 6.573503469144566e-05,
"loss": 0.8575,
"step": 2260
},
{
"epoch": 1.9602763385146806,
"grad_norm": 1.0621919631958008,
"learning_rate": 6.479263160081242e-05,
"loss": 0.9437,
"step": 2270
},
{
"epoch": 1.9689119170984455,
"grad_norm": 1.1505554914474487,
"learning_rate": 6.385378442793188e-05,
"loss": 0.8951,
"step": 2280
},
{
"epoch": 1.9775474956822108,
"grad_norm": 1.052738904953003,
"learning_rate": 6.29185879956632e-05,
"loss": 0.8694,
"step": 2290
},
{
"epoch": 1.9861830742659758,
"grad_norm": 0.9112501740455627,
"learning_rate": 6.198713675814318e-05,
"loss": 0.8679,
"step": 2300
},
{
"epoch": 1.9861830742659758,
"eval_loss": 0.8568958640098572,
"eval_runtime": 74.2813,
"eval_samples_per_second": 31.165,
"eval_steps_per_second": 3.904,
"step": 2300
},
{
"epoch": 1.994818652849741,
"grad_norm": 0.9624518156051636,
"learning_rate": 6.105952479124696e-05,
"loss": 0.9374,
"step": 2310
},
{
"epoch": 2.003454231433506,
"grad_norm": 1.1013827323913574,
"learning_rate": 6.0135845783086145e-05,
"loss": 0.8569,
"step": 2320
},
{
"epoch": 2.012089810017271,
"grad_norm": 1.0752055644989014,
"learning_rate": 5.921619302454645e-05,
"loss": 0.9713,
"step": 2330
},
{
"epoch": 2.0207253886010363,
"grad_norm": 1.123271107673645,
"learning_rate": 5.830065939986553e-05,
"loss": 0.8359,
"step": 2340
},
{
"epoch": 2.029360967184801,
"grad_norm": 1.0255523920059204,
"learning_rate": 5.73893373772515e-05,
"loss": 0.8339,
"step": 2350
},
{
"epoch": 2.0379965457685665,
"grad_norm": 1.169399619102478,
"learning_rate": 5.6482318999543807e-05,
"loss": 0.8717,
"step": 2360
},
{
"epoch": 2.0466321243523318,
"grad_norm": 1.0524979829788208,
"learning_rate": 5.5579695874917115e-05,
"loss": 0.8328,
"step": 2370
},
{
"epoch": 2.0552677029360966,
"grad_norm": 0.9218592047691345,
"learning_rate": 5.468155916762869e-05,
"loss": 0.8556,
"step": 2380
},
{
"epoch": 2.063903281519862,
"grad_norm": 1.2179642915725708,
"learning_rate": 5.3787999588811136e-05,
"loss": 0.8256,
"step": 2390
},
{
"epoch": 2.0725388601036268,
"grad_norm": 1.2335243225097656,
"learning_rate": 5.28991073873105e-05,
"loss": 0.891,
"step": 2400
},
{
"epoch": 2.0725388601036268,
"eval_loss": 0.8398398756980896,
"eval_runtime": 74.3373,
"eval_samples_per_second": 31.142,
"eval_steps_per_second": 3.901,
"step": 2400
},
{
"epoch": 2.081174438687392,
"grad_norm": 1.2605432271957397,
"learning_rate": 5.201497234057111e-05,
"loss": 0.7942,
"step": 2410
},
{
"epoch": 2.0898100172711573,
"grad_norm": 1.0830875635147095,
"learning_rate": 5.1135683745568455e-05,
"loss": 0.8772,
"step": 2420
},
{
"epoch": 2.098445595854922,
"grad_norm": 0.9472030997276306,
"learning_rate": 5.02613304097898e-05,
"loss": 0.926,
"step": 2430
},
{
"epoch": 2.1070811744386875,
"grad_norm": 1.091093897819519,
"learning_rate": 4.939200064226509e-05,
"loss": 0.8607,
"step": 2440
},
{
"epoch": 2.1157167530224523,
"grad_norm": 1.186557412147522,
"learning_rate": 4.8527782244647656e-05,
"loss": 0.9168,
"step": 2450
},
{
"epoch": 2.1243523316062176,
"grad_norm": 1.1364177465438843,
"learning_rate": 4.766876250234621e-05,
"loss": 0.8785,
"step": 2460
},
{
"epoch": 2.132987910189983,
"grad_norm": 1.1587848663330078,
"learning_rate": 4.681502817570929e-05,
"loss": 0.8479,
"step": 2470
},
{
"epoch": 2.1416234887737478,
"grad_norm": 1.1975603103637695,
"learning_rate": 4.59666654912623e-05,
"loss": 0.9014,
"step": 2480
},
{
"epoch": 2.150259067357513,
"grad_norm": 1.0785084962844849,
"learning_rate": 4.512376013299895e-05,
"loss": 0.8464,
"step": 2490
},
{
"epoch": 2.158894645941278,
"grad_norm": 1.0377299785614014,
"learning_rate": 4.428639723372706e-05,
"loss": 0.8461,
"step": 2500
},
{
"epoch": 2.158894645941278,
"eval_loss": 0.8259330987930298,
"eval_runtime": 74.3192,
"eval_samples_per_second": 31.149,
"eval_steps_per_second": 3.902,
"step": 2500
},
{
"epoch": 2.167530224525043,
"grad_norm": 1.2178107500076294,
"learning_rate": 4.345466136647018e-05,
"loss": 0.7985,
"step": 2510
},
{
"epoch": 2.1761658031088085,
"grad_norm": 1.0462040901184082,
"learning_rate": 4.2628636535926005e-05,
"loss": 0.8091,
"step": 2520
},
{
"epoch": 2.1848013816925733,
"grad_norm": 1.0872950553894043,
"learning_rate": 4.180840616998164e-05,
"loss": 0.8729,
"step": 2530
},
{
"epoch": 2.1934369602763386,
"grad_norm": 1.2299045324325562,
"learning_rate": 4.099405311128774e-05,
"loss": 0.8864,
"step": 2540
},
{
"epoch": 2.2020725388601035,
"grad_norm": 1.0725489854812622,
"learning_rate": 4.018565960889137e-05,
"loss": 0.9033,
"step": 2550
},
{
"epoch": 2.2107081174438687,
"grad_norm": 1.1338095664978027,
"learning_rate": 3.9383307309928744e-05,
"loss": 0.8792,
"step": 2560
},
{
"epoch": 2.219343696027634,
"grad_norm": 1.0339998006820679,
"learning_rate": 3.858707725137921e-05,
"loss": 0.8888,
"step": 2570
},
{
"epoch": 2.227979274611399,
"grad_norm": 1.1130526065826416,
"learning_rate": 3.7797049851880325e-05,
"loss": 0.7557,
"step": 2580
},
{
"epoch": 2.236614853195164,
"grad_norm": 1.013401746749878,
"learning_rate": 3.701330490360583e-05,
"loss": 0.8868,
"step": 2590
},
{
"epoch": 2.245250431778929,
"grad_norm": 1.0278291702270508,
"learning_rate": 3.623592156420661e-05,
"loss": 0.8474,
"step": 2600
},
{
"epoch": 2.245250431778929,
"eval_loss": 0.8156528472900391,
"eval_runtime": 74.3107,
"eval_samples_per_second": 31.153,
"eval_steps_per_second": 3.903,
"step": 2600
},
{
"epoch": 2.2538860103626943,
"grad_norm": 1.040278673171997,
"learning_rate": 3.546497834881572e-05,
"loss": 0.8268,
"step": 2610
},
{
"epoch": 2.2625215889464596,
"grad_norm": 1.1358767747879028,
"learning_rate": 3.4700553122118714e-05,
"loss": 0.8267,
"step": 2620
},
{
"epoch": 2.2711571675302245,
"grad_norm": 1.1165881156921387,
"learning_rate": 3.394272309048895e-05,
"loss": 0.9085,
"step": 2630
},
{
"epoch": 2.2797927461139897,
"grad_norm": 0.9508546590805054,
"learning_rate": 3.319156479419032e-05,
"loss": 0.8471,
"step": 2640
},
{
"epoch": 2.2884283246977546,
"grad_norm": 1.192872166633606,
"learning_rate": 3.244715409964625e-05,
"loss": 0.8641,
"step": 2650
},
{
"epoch": 2.29706390328152,
"grad_norm": 1.092782735824585,
"learning_rate": 3.170956619177749e-05,
"loss": 0.8154,
"step": 2660
},
{
"epoch": 2.305699481865285,
"grad_norm": 1.425848126411438,
"learning_rate": 3.097887556640855e-05,
"loss": 0.8828,
"step": 2670
},
{
"epoch": 2.31433506044905,
"grad_norm": 1.099098801612854,
"learning_rate": 3.025515602274346e-05,
"loss": 0.8424,
"step": 2680
},
{
"epoch": 2.3229706390328153,
"grad_norm": 1.2581557035446167,
"learning_rate": 2.9538480655912415e-05,
"loss": 0.8606,
"step": 2690
},
{
"epoch": 2.33160621761658,
"grad_norm": 1.152100682258606,
"learning_rate": 2.8828921849588898e-05,
"loss": 0.8429,
"step": 2700
},
{
"epoch": 2.33160621761658,
"eval_loss": 0.8052871227264404,
"eval_runtime": 74.353,
"eval_samples_per_second": 31.135,
"eval_steps_per_second": 3.9,
"step": 2700
},
{
"epoch": 2.3402417962003454,
"grad_norm": 1.0980738401412964,
"learning_rate": 2.8126551268679134e-05,
"loss": 0.8846,
"step": 2710
},
{
"epoch": 2.3488773747841103,
"grad_norm": 0.9057421684265137,
"learning_rate": 2.7431439852084072e-05,
"loss": 0.8655,
"step": 2720
},
{
"epoch": 2.3575129533678756,
"grad_norm": 1.0247701406478882,
"learning_rate": 2.6743657805534396e-05,
"loss": 0.7428,
"step": 2730
},
{
"epoch": 2.366148531951641,
"grad_norm": 1.1374061107635498,
"learning_rate": 2.6063274594500086e-05,
"loss": 0.8294,
"step": 2740
},
{
"epoch": 2.3747841105354057,
"grad_norm": 1.0613446235656738,
"learning_rate": 2.5390358937174165e-05,
"loss": 0.8164,
"step": 2750
},
{
"epoch": 2.383419689119171,
"grad_norm": 1.053713083267212,
"learning_rate": 2.472497879753235e-05,
"loss": 0.8204,
"step": 2760
},
{
"epoch": 2.3920552677029363,
"grad_norm": 1.1319301128387451,
"learning_rate": 2.4067201378468807e-05,
"loss": 0.7942,
"step": 2770
},
{
"epoch": 2.400690846286701,
"grad_norm": 1.3451365232467651,
"learning_rate": 2.3417093115008525e-05,
"loss": 0.8676,
"step": 2780
},
{
"epoch": 2.4093264248704664,
"grad_norm": 0.9686126112937927,
"learning_rate": 2.277471966759771e-05,
"loss": 0.8581,
"step": 2790
},
{
"epoch": 2.4179620034542313,
"grad_norm": 1.1396652460098267,
"learning_rate": 2.2140145915471778e-05,
"loss": 0.9025,
"step": 2800
},
{
"epoch": 2.4179620034542313,
"eval_loss": 0.7989487051963806,
"eval_runtime": 74.4139,
"eval_samples_per_second": 31.11,
"eval_steps_per_second": 3.897,
"step": 2800
},
{
"epoch": 2.4265975820379966,
"grad_norm": 1.4245641231536865,
"learning_rate": 2.1513435950102924e-05,
"loss": 0.793,
"step": 2810
},
{
"epoch": 2.4352331606217614,
"grad_norm": 1.1148205995559692,
"learning_rate": 2.0894653068726688e-05,
"loss": 0.8414,
"step": 2820
},
{
"epoch": 2.4438687392055267,
"grad_norm": 1.168946385383606,
"learning_rate": 2.0283859767949078e-05,
"loss": 0.8287,
"step": 2830
},
{
"epoch": 2.452504317789292,
"grad_norm": 1.1215981245040894,
"learning_rate": 1.9681117737434606e-05,
"loss": 0.8029,
"step": 2840
},
{
"epoch": 2.461139896373057,
"grad_norm": 1.2339212894439697,
"learning_rate": 1.9086487853675382e-05,
"loss": 0.8861,
"step": 2850
},
{
"epoch": 2.469775474956822,
"grad_norm": 1.1001675128936768,
"learning_rate": 1.8500030173842885e-05,
"loss": 0.865,
"step": 2860
},
{
"epoch": 2.4784110535405874,
"grad_norm": 1.1429920196533203,
"learning_rate": 1.7921803929722082e-05,
"loss": 0.8753,
"step": 2870
},
{
"epoch": 2.4870466321243523,
"grad_norm": 1.0880659818649292,
"learning_rate": 1.7351867521729072e-05,
"loss": 0.8774,
"step": 2880
},
{
"epoch": 2.4956822107081176,
"grad_norm": 0.9570063948631287,
"learning_rate": 1.6790278513012925e-05,
"loss": 0.8067,
"step": 2890
},
{
"epoch": 2.5043177892918824,
"grad_norm": 1.0723425149917603,
"learning_rate": 1.6237093623641443e-05,
"loss": 0.801,
"step": 2900
},
{
"epoch": 2.5043177892918824,
"eval_loss": 0.7922360301017761,
"eval_runtime": 74.2817,
"eval_samples_per_second": 31.165,
"eval_steps_per_second": 3.904,
"step": 2900
},
{
"epoch": 2.5129533678756477,
"grad_norm": 1.1429632902145386,
"learning_rate": 1.569236872487283e-05,
"loss": 0.7751,
"step": 2910
},
{
"epoch": 2.5215889464594126,
"grad_norm": 1.1071122884750366,
"learning_rate": 1.5156158833512523e-05,
"loss": 0.7598,
"step": 2920
},
{
"epoch": 2.530224525043178,
"grad_norm": 1.057666301727295,
"learning_rate": 1.462851810635658e-05,
"loss": 0.8577,
"step": 2930
},
{
"epoch": 2.538860103626943,
"grad_norm": 1.0120848417282104,
"learning_rate": 1.410949983472205e-05,
"loss": 0.8649,
"step": 2940
},
{
"epoch": 2.547495682210708,
"grad_norm": 1.1220532655715942,
"learning_rate": 1.3599156439064309e-05,
"loss": 0.8426,
"step": 2950
},
{
"epoch": 2.5561312607944733,
"grad_norm": 1.1136960983276367,
"learning_rate": 1.3097539463682874e-05,
"loss": 0.8952,
"step": 2960
},
{
"epoch": 2.5647668393782386,
"grad_norm": 1.004520297050476,
"learning_rate": 1.26046995715153e-05,
"loss": 0.8849,
"step": 2970
},
{
"epoch": 2.5734024179620034,
"grad_norm": 1.01373291015625,
"learning_rate": 1.2120686539020376e-05,
"loss": 0.8147,
"step": 2980
},
{
"epoch": 2.5820379965457687,
"grad_norm": 1.0711711645126343,
"learning_rate": 1.1645549251150711e-05,
"loss": 0.7414,
"step": 2990
},
{
"epoch": 2.5906735751295336,
"grad_norm": 1.0850844383239746,
"learning_rate": 1.1179335696415306e-05,
"loss": 0.8152,
"step": 3000
},
{
"epoch": 2.5906735751295336,
"eval_loss": 0.7877171635627747,
"eval_runtime": 74.2634,
"eval_samples_per_second": 31.173,
"eval_steps_per_second": 3.905,
"step": 3000
},
{
"epoch": 2.599309153713299,
"grad_norm": 1.0988068580627441,
"learning_rate": 1.0722092962032927e-05,
"loss": 0.7355,
"step": 3010
},
{
"epoch": 2.6079447322970637,
"grad_norm": 1.098163366317749,
"learning_rate": 1.0273867229176094e-05,
"loss": 0.7886,
"step": 3020
},
{
"epoch": 2.616580310880829,
"grad_norm": 1.2323061227798462,
"learning_rate": 9.834703768307063e-06,
"loss": 0.7491,
"step": 3030
},
{
"epoch": 2.6252158894645943,
"grad_norm": 0.9928609728813171,
"learning_rate": 9.404646934605399e-06,
"loss": 0.921,
"step": 3040
},
{
"epoch": 2.633851468048359,
"grad_norm": 1.285148024559021,
"learning_rate": 8.983740163488107e-06,
"loss": 0.8066,
"step": 3050
},
{
"epoch": 2.6424870466321244,
"grad_norm": 1.1035611629486084,
"learning_rate": 8.572025966222841e-06,
"loss": 0.8209,
"step": 3060
},
{
"epoch": 2.6511226252158897,
"grad_norm": 0.8766908049583435,
"learning_rate": 8.169545925634115e-06,
"loss": 0.8807,
"step": 3070
},
{
"epoch": 2.6597582037996546,
"grad_norm": 1.2940890789031982,
"learning_rate": 7.776340691903604e-06,
"loss": 0.875,
"step": 3080
},
{
"epoch": 2.66839378238342,
"grad_norm": 1.0446336269378662,
"learning_rate": 7.392449978464478e-06,
"loss": 0.789,
"step": 3090
},
{
"epoch": 2.6770293609671847,
"grad_norm": 1.1468182802200317,
"learning_rate": 7.0179125579902915e-06,
"loss": 0.8416,
"step": 3100
},
{
"epoch": 2.6770293609671847,
"eval_loss": 0.7846628427505493,
"eval_runtime": 74.271,
"eval_samples_per_second": 31.17,
"eval_steps_per_second": 3.905,
"step": 3100
},
{
"epoch": 2.68566493955095,
"grad_norm": 0.9985455870628357,
"learning_rate": 6.652766258479126e-06,
"loss": 0.7779,
"step": 3110
},
{
"epoch": 2.694300518134715,
"grad_norm": 1.0844076871871948,
"learning_rate": 6.2970479594328e-06,
"loss": 0.7998,
"step": 3120
},
{
"epoch": 2.70293609671848,
"grad_norm": 1.0988919734954834,
"learning_rate": 5.950793588132253e-06,
"loss": 0.7566,
"step": 3130
},
{
"epoch": 2.7115716753022454,
"grad_norm": 1.1833341121673584,
"learning_rate": 5.614038116008824e-06,
"loss": 0.7846,
"step": 3140
},
{
"epoch": 2.7202072538860103,
"grad_norm": 1.0258663892745972,
"learning_rate": 5.286815555112101e-06,
"loss": 0.7837,
"step": 3150
},
{
"epoch": 2.7288428324697755,
"grad_norm": 1.0312175750732422,
"learning_rate": 4.969158954674902e-06,
"loss": 0.8348,
"step": 3160
},
{
"epoch": 2.737478411053541,
"grad_norm": 1.152564287185669,
"learning_rate": 4.6611003977751425e-06,
"loss": 0.8359,
"step": 3170
},
{
"epoch": 2.7461139896373057,
"grad_norm": 1.0136488676071167,
"learning_rate": 4.362670998095597e-06,
"loss": 0.8209,
"step": 3180
},
{
"epoch": 2.754749568221071,
"grad_norm": 0.9231327176094055,
"learning_rate": 4.073900896781402e-06,
"loss": 0.8148,
"step": 3190
},
{
"epoch": 2.763385146804836,
"grad_norm": 0.9632856845855713,
"learning_rate": 3.7948192593957877e-06,
"loss": 0.7827,
"step": 3200
},
{
"epoch": 2.763385146804836,
"eval_loss": 0.7829655408859253,
"eval_runtime": 74.3256,
"eval_samples_per_second": 31.147,
"eval_steps_per_second": 3.902,
"step": 3200
},
{
"epoch": 2.772020725388601,
"grad_norm": 0.9402443170547485,
"learning_rate": 3.525454272974427e-06,
"loss": 0.8453,
"step": 3210
},
{
"epoch": 2.780656303972366,
"grad_norm": 1.1812800168991089,
"learning_rate": 3.265833143178543e-06,
"loss": 0.8018,
"step": 3220
},
{
"epoch": 2.7892918825561313,
"grad_norm": 0.9228396415710449,
"learning_rate": 3.0159820915471426e-06,
"loss": 0.7994,
"step": 3230
},
{
"epoch": 2.7979274611398965,
"grad_norm": 0.9304484724998474,
"learning_rate": 2.7759263528487345e-06,
"loss": 0.744,
"step": 3240
},
{
"epoch": 2.8065630397236614,
"grad_norm": 1.064064621925354,
"learning_rate": 2.5456901725325224e-06,
"loss": 0.8863,
"step": 3250
},
{
"epoch": 2.8151986183074267,
"grad_norm": 1.0431768894195557,
"learning_rate": 2.3252968042797083e-06,
"loss": 0.8184,
"step": 3260
},
{
"epoch": 2.823834196891192,
"grad_norm": 1.1347994804382324,
"learning_rate": 2.114768507654885e-06,
"loss": 0.8135,
"step": 3270
},
{
"epoch": 2.832469775474957,
"grad_norm": 1.2335035800933838,
"learning_rate": 1.9141265458578196e-06,
"loss": 0.8298,
"step": 3280
},
{
"epoch": 2.8411053540587217,
"grad_norm": 0.9481128454208374,
"learning_rate": 1.7233911835758843e-06,
"loss": 0.8401,
"step": 3290
},
{
"epoch": 2.849740932642487,
"grad_norm": 1.182274580001831,
"learning_rate": 1.5425816849373386e-06,
"loss": 0.7997,
"step": 3300
},
{
"epoch": 2.849740932642487,
"eval_loss": 0.7820163369178772,
"eval_runtime": 74.5484,
"eval_samples_per_second": 31.054,
"eval_steps_per_second": 3.89,
"step": 3300
},
{
"epoch": 2.8583765112262522,
"grad_norm": 1.0886763334274292,
"learning_rate": 1.3717163115656962e-06,
"loss": 0.78,
"step": 3310
},
{
"epoch": 2.867012089810017,
"grad_norm": 0.9863650798797607,
"learning_rate": 1.2108123207352662e-06,
"loss": 0.8321,
"step": 3320
},
{
"epoch": 2.8756476683937824,
"grad_norm": 1.0511281490325928,
"learning_rate": 1.0598859636282156e-06,
"loss": 0.8616,
"step": 3330
},
{
"epoch": 2.8842832469775477,
"grad_norm": 1.1429880857467651,
"learning_rate": 9.189524836932029e-07,
"loss": 0.8199,
"step": 3340
},
{
"epoch": 2.8929188255613125,
"grad_norm": 1.071950912475586,
"learning_rate": 7.88026115105811e-07,
"loss": 0.8046,
"step": 3350
},
{
"epoch": 2.901554404145078,
"grad_norm": 1.19888174533844,
"learning_rate": 6.671200813308742e-07,
"loss": 0.8907,
"step": 3360
},
{
"epoch": 2.910189982728843,
"grad_norm": 0.9800041317939758,
"learning_rate": 5.562465937869577e-07,
"loss": 0.8213,
"step": 3370
},
{
"epoch": 2.918825561312608,
"grad_norm": 1.047254204750061,
"learning_rate": 4.5541685061299964e-07,
"loss": 0.8474,
"step": 3380
},
{
"epoch": 2.927461139896373,
"grad_norm": 1.0879237651824951,
"learning_rate": 3.646410355372831e-07,
"loss": 0.7963,
"step": 3390
},
{
"epoch": 2.936096718480138,
"grad_norm": 1.0805079936981201,
"learning_rate": 2.8392831684891374e-07,
"loss": 0.8146,
"step": 3400
},
{
"epoch": 2.936096718480138,
"eval_loss": 0.7816377282142639,
"eval_runtime": 74.3018,
"eval_samples_per_second": 31.157,
"eval_steps_per_second": 3.903,
"step": 3400
}
],
"logging_steps": 10,
"max_steps": 3474,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.6308483890777948e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}