| { | |
| "best_global_step": 3400, | |
| "best_metric": 0.7816377282142639, | |
| "best_model_checkpoint": "./lfm_kokoro_complete/checkpoint-3400", | |
| "epoch": 2.936096718480138, | |
| "eval_steps": 100, | |
| "global_step": 3400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0008635578583765112, | |
| "grad_norm": 5.131196975708008, | |
| "learning_rate": 0.0, | |
| "loss": 2.8308, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.008635578583765112, | |
| "grad_norm": 5.2136335372924805, | |
| "learning_rate": 5.172413793103448e-06, | |
| "loss": 2.6503, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.017271157167530225, | |
| "grad_norm": 2.5024237632751465, | |
| "learning_rate": 1.091954022988506e-05, | |
| "loss": 2.6409, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.025906735751295335, | |
| "grad_norm": 1.3332571983337402, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 2.505, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.03454231433506045, | |
| "grad_norm": 1.121747374534607, | |
| "learning_rate": 2.2413793103448276e-05, | |
| "loss": 2.3877, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.04317789291882556, | |
| "grad_norm": 0.5361054539680481, | |
| "learning_rate": 2.8160919540229884e-05, | |
| "loss": 2.2456, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.05181347150259067, | |
| "grad_norm": 0.4509966969490051, | |
| "learning_rate": 3.390804597701149e-05, | |
| "loss": 2.18, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.06044905008635579, | |
| "grad_norm": 0.3262108266353607, | |
| "learning_rate": 3.965517241379311e-05, | |
| "loss": 2.1533, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0690846286701209, | |
| "grad_norm": 0.31236183643341064, | |
| "learning_rate": 4.5402298850574716e-05, | |
| "loss": 2.1012, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.07772020725388601, | |
| "grad_norm": 0.2791730463504791, | |
| "learning_rate": 5.1149425287356324e-05, | |
| "loss": 2.0615, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.08635578583765112, | |
| "grad_norm": 0.29012593626976013, | |
| "learning_rate": 5.689655172413794e-05, | |
| "loss": 2.0758, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.08635578583765112, | |
| "eval_loss": 2.024885654449463, | |
| "eval_runtime": 74.288, | |
| "eval_samples_per_second": 31.163, | |
| "eval_steps_per_second": 3.904, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.09499136442141623, | |
| "grad_norm": 0.27982184290885925, | |
| "learning_rate": 6.264367816091954e-05, | |
| "loss": 1.9746, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.10362694300518134, | |
| "grad_norm": 0.3128826320171356, | |
| "learning_rate": 6.839080459770116e-05, | |
| "loss": 2.0059, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.11226252158894647, | |
| "grad_norm": 0.29881423711776733, | |
| "learning_rate": 7.413793103448277e-05, | |
| "loss": 2.0007, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.12089810017271158, | |
| "grad_norm": 0.3187066316604614, | |
| "learning_rate": 7.988505747126437e-05, | |
| "loss": 1.9892, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.12953367875647667, | |
| "grad_norm": 0.2999899983406067, | |
| "learning_rate": 8.563218390804599e-05, | |
| "loss": 1.9454, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.1381692573402418, | |
| "grad_norm": 0.32296231389045715, | |
| "learning_rate": 9.137931034482759e-05, | |
| "loss": 1.9215, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.14680483592400692, | |
| "grad_norm": 0.3282780051231384, | |
| "learning_rate": 9.71264367816092e-05, | |
| "loss": 1.9189, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.15544041450777202, | |
| "grad_norm": 0.3565793037414551, | |
| "learning_rate": 0.0001028735632183908, | |
| "loss": 1.9356, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.16407599309153714, | |
| "grad_norm": 0.35819345712661743, | |
| "learning_rate": 0.00010862068965517242, | |
| "loss": 1.877, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.17271157167530224, | |
| "grad_norm": 0.38044115900993347, | |
| "learning_rate": 0.00011436781609195404, | |
| "loss": 1.9072, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.17271157167530224, | |
| "eval_loss": 1.8915574550628662, | |
| "eval_runtime": 74.2599, | |
| "eval_samples_per_second": 31.174, | |
| "eval_steps_per_second": 3.905, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.18134715025906736, | |
| "grad_norm": 0.38403257727622986, | |
| "learning_rate": 0.00012011494252873562, | |
| "loss": 1.9223, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.18998272884283246, | |
| "grad_norm": 0.40475621819496155, | |
| "learning_rate": 0.00012586206896551724, | |
| "loss": 1.8787, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.19861830742659758, | |
| "grad_norm": 0.37040725350379944, | |
| "learning_rate": 0.00013160919540229887, | |
| "loss": 1.8916, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.20725388601036268, | |
| "grad_norm": 0.41001173853874207, | |
| "learning_rate": 0.00013735632183908047, | |
| "loss": 1.8767, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.2158894645941278, | |
| "grad_norm": 0.42713987827301025, | |
| "learning_rate": 0.0001431034482758621, | |
| "loss": 1.8756, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.22452504317789293, | |
| "grad_norm": 0.42754629254341125, | |
| "learning_rate": 0.00014885057471264367, | |
| "loss": 1.8208, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.23316062176165803, | |
| "grad_norm": 0.45471611618995667, | |
| "learning_rate": 0.0001545977011494253, | |
| "loss": 1.8424, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.24179620034542315, | |
| "grad_norm": 0.419595330953598, | |
| "learning_rate": 0.0001603448275862069, | |
| "loss": 1.7868, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.2504317789291883, | |
| "grad_norm": 0.40803587436676025, | |
| "learning_rate": 0.0001660919540229885, | |
| "loss": 1.8174, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.25906735751295334, | |
| "grad_norm": 0.4197799265384674, | |
| "learning_rate": 0.00017183908045977013, | |
| "loss": 1.8143, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.25906735751295334, | |
| "eval_loss": 1.8168917894363403, | |
| "eval_runtime": 74.2622, | |
| "eval_samples_per_second": 31.173, | |
| "eval_steps_per_second": 3.905, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.26770293609671847, | |
| "grad_norm": 0.494597464799881, | |
| "learning_rate": 0.00017758620689655173, | |
| "loss": 1.8581, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.2763385146804836, | |
| "grad_norm": 0.41333019733428955, | |
| "learning_rate": 0.00018333333333333334, | |
| "loss": 1.7674, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.2849740932642487, | |
| "grad_norm": 0.38664960861206055, | |
| "learning_rate": 0.00018908045977011494, | |
| "loss": 1.8403, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.29360967184801384, | |
| "grad_norm": 0.5136725902557373, | |
| "learning_rate": 0.00019482758620689657, | |
| "loss": 1.8034, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.3022452504317789, | |
| "grad_norm": 0.4233579635620117, | |
| "learning_rate": 0.00019999994949995492, | |
| "loss": 1.8085, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.31088082901554404, | |
| "grad_norm": 0.47240543365478516, | |
| "learning_rate": 0.0001999938895562612, | |
| "loss": 1.7553, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.31951640759930916, | |
| "grad_norm": 0.42770665884017944, | |
| "learning_rate": 0.00019997773030485974, | |
| "loss": 1.81, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.3281519861830743, | |
| "grad_norm": 0.41995662450790405, | |
| "learning_rate": 0.00019995147337782283, | |
| "loss": 1.7934, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.33678756476683935, | |
| "grad_norm": 0.4032181203365326, | |
| "learning_rate": 0.00019991512142708033, | |
| "loss": 1.7521, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.3454231433506045, | |
| "grad_norm": 0.4150594174861908, | |
| "learning_rate": 0.00019986867812415198, | |
| "loss": 1.7813, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.3454231433506045, | |
| "eval_loss": 1.749611735343933, | |
| "eval_runtime": 74.4513, | |
| "eval_samples_per_second": 31.094, | |
| "eval_steps_per_second": 3.895, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.3540587219343696, | |
| "grad_norm": 0.47208553552627563, | |
| "learning_rate": 0.00019981214815977647, | |
| "loss": 1.7951, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.3626943005181347, | |
| "grad_norm": 0.38943392038345337, | |
| "learning_rate": 0.00019974553724343773, | |
| "loss": 1.7224, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.37132987910189985, | |
| "grad_norm": 0.39383020997047424, | |
| "learning_rate": 0.00019966885210278822, | |
| "loss": 1.7327, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.3799654576856649, | |
| "grad_norm": 0.4155268669128418, | |
| "learning_rate": 0.00019958210048296956, | |
| "loss": 1.7447, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.38860103626943004, | |
| "grad_norm": 0.393373042345047, | |
| "learning_rate": 0.00019948529114583013, | |
| "loss": 1.7679, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.39723661485319517, | |
| "grad_norm": 0.4065350592136383, | |
| "learning_rate": 0.0001993784338690403, | |
| "loss": 1.7041, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.4058721934369603, | |
| "grad_norm": 0.42317506670951843, | |
| "learning_rate": 0.0001992615394451047, | |
| "loss": 1.662, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.41450777202072536, | |
| "grad_norm": 0.460857629776001, | |
| "learning_rate": 0.00019913461968027227, | |
| "loss": 1.7228, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.4231433506044905, | |
| "grad_norm": 0.47063159942626953, | |
| "learning_rate": 0.00019899768739334393, | |
| "loss": 1.759, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.4317789291882556, | |
| "grad_norm": 0.44678857922554016, | |
| "learning_rate": 0.00019885075641437776, | |
| "loss": 1.6796, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.4317789291882556, | |
| "eval_loss": 1.684213638305664, | |
| "eval_runtime": 75.8643, | |
| "eval_samples_per_second": 30.515, | |
| "eval_steps_per_second": 3.823, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.44041450777202074, | |
| "grad_norm": 0.44266021251678467, | |
| "learning_rate": 0.00019869384158329223, | |
| "loss": 1.7078, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.44905008635578586, | |
| "grad_norm": 0.5028413534164429, | |
| "learning_rate": 0.00019852695874836735, | |
| "loss": 1.6464, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.45768566493955093, | |
| "grad_norm": 0.40895670652389526, | |
| "learning_rate": 0.00019835012476464406, | |
| "loss": 1.6559, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.46632124352331605, | |
| "grad_norm": 0.5382914543151855, | |
| "learning_rate": 0.00019816335749222187, | |
| "loss": 1.6413, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.4749568221070812, | |
| "grad_norm": 0.5184707045555115, | |
| "learning_rate": 0.00019796667579445492, | |
| "loss": 1.6706, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.4835924006908463, | |
| "grad_norm": 0.4206818640232086, | |
| "learning_rate": 0.00019776009953604692, | |
| "loss": 1.6429, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.49222797927461137, | |
| "grad_norm": 0.5006670355796814, | |
| "learning_rate": 0.0001975436495810447, | |
| "loss": 1.6556, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.5008635578583766, | |
| "grad_norm": 0.4992609918117523, | |
| "learning_rate": 0.0001973173477907311, | |
| "loss": 1.6296, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.5094991364421416, | |
| "grad_norm": 0.488678902387619, | |
| "learning_rate": 0.0001970812170214169, | |
| "loss": 1.6366, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.5181347150259067, | |
| "grad_norm": 0.5439748167991638, | |
| "learning_rate": 0.00019683528112213235, | |
| "loss": 1.6546, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.5181347150259067, | |
| "eval_loss": 1.6274060010910034, | |
| "eval_runtime": 74.2854, | |
| "eval_samples_per_second": 31.164, | |
| "eval_steps_per_second": 3.904, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.5267702936096719, | |
| "grad_norm": 0.49797549843788147, | |
| "learning_rate": 0.00019657956493221844, | |
| "loss": 1.6206, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.5354058721934369, | |
| "grad_norm": 0.5434479117393494, | |
| "learning_rate": 0.00019631409427881832, | |
| "loss": 1.6198, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.5440414507772021, | |
| "grad_norm": 0.48510608077049255, | |
| "learning_rate": 0.00019603889597426838, | |
| "loss": 1.5839, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.5526770293609672, | |
| "grad_norm": 0.4494125545024872, | |
| "learning_rate": 0.00019575399781339065, | |
| "loss": 1.6295, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.5613126079447323, | |
| "grad_norm": 0.45310178399086, | |
| "learning_rate": 0.00019545942857068527, | |
| "loss": 1.6039, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.5699481865284974, | |
| "grad_norm": 0.5253885984420776, | |
| "learning_rate": 0.00019515521799742444, | |
| "loss": 1.5997, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.5785837651122625, | |
| "grad_norm": 0.48614782094955444, | |
| "learning_rate": 0.00019484139681864745, | |
| "loss": 1.5761, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.5872193436960277, | |
| "grad_norm": 0.502662718296051, | |
| "learning_rate": 0.00019451799673005757, | |
| "loss": 1.5793, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.5958549222797928, | |
| "grad_norm": 0.5655169486999512, | |
| "learning_rate": 0.00019418505039482068, | |
| "loss": 1.5643, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.6044905008635578, | |
| "grad_norm": 0.507977306842804, | |
| "learning_rate": 0.00019384259144026653, | |
| "loss": 1.5549, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.6044905008635578, | |
| "eval_loss": 1.5656999349594116, | |
| "eval_runtime": 74.2824, | |
| "eval_samples_per_second": 31.165, | |
| "eval_steps_per_second": 3.904, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.613126079447323, | |
| "grad_norm": 0.5120140910148621, | |
| "learning_rate": 0.00019349065445449214, | |
| "loss": 1.5388, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.6217616580310881, | |
| "grad_norm": 0.5686282515525818, | |
| "learning_rate": 0.00019312927498286867, | |
| "loss": 1.5975, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.6303972366148531, | |
| "grad_norm": 0.5706737637519836, | |
| "learning_rate": 0.00019275848952445115, | |
| "loss": 1.5062, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.6390328151986183, | |
| "grad_norm": 0.4991269111633301, | |
| "learning_rate": 0.0001923783355282923, | |
| "loss": 1.5513, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.6476683937823834, | |
| "grad_norm": 0.6073980927467346, | |
| "learning_rate": 0.00019198885138966009, | |
| "loss": 1.5004, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.6563039723661486, | |
| "grad_norm": 0.5857861042022705, | |
| "learning_rate": 0.00019159007644615981, | |
| "loss": 1.5607, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.6649395509499136, | |
| "grad_norm": 0.5783904194831848, | |
| "learning_rate": 0.00019118205097376113, | |
| "loss": 1.5616, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.6735751295336787, | |
| "grad_norm": 0.5480038523674011, | |
| "learning_rate": 0.00019076481618273018, | |
| "loss": 1.5609, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.6822107081174439, | |
| "grad_norm": 0.6719979047775269, | |
| "learning_rate": 0.00019033841421346734, | |
| "loss": 1.5448, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.690846286701209, | |
| "grad_norm": 0.6396545171737671, | |
| "learning_rate": 0.00018990288813225105, | |
| "loss": 1.4898, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.690846286701209, | |
| "eval_loss": 1.5024266242980957, | |
| "eval_runtime": 74.2858, | |
| "eval_samples_per_second": 31.163, | |
| "eval_steps_per_second": 3.904, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.6994818652849741, | |
| "grad_norm": 0.6165493130683899, | |
| "learning_rate": 0.0001894582819268883, | |
| "loss": 1.4581, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.7081174438687392, | |
| "grad_norm": 0.5979147553443909, | |
| "learning_rate": 0.00018900464050227169, | |
| "loss": 1.5436, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.7167530224525043, | |
| "grad_norm": 0.6082155108451843, | |
| "learning_rate": 0.0001885420096758443, | |
| "loss": 1.5205, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.7253886010362695, | |
| "grad_norm": 0.6365352272987366, | |
| "learning_rate": 0.0001880704361729719, | |
| "loss": 1.5159, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.7340241796200345, | |
| "grad_norm": 0.6347801685333252, | |
| "learning_rate": 0.000187589967622224, | |
| "loss": 1.4908, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.7426597582037997, | |
| "grad_norm": 0.5811082720756531, | |
| "learning_rate": 0.00018710065255056314, | |
| "loss": 1.4738, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.7512953367875648, | |
| "grad_norm": 0.6715326905250549, | |
| "learning_rate": 0.00018660254037844388, | |
| "loss": 1.4448, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.7599309153713298, | |
| "grad_norm": 0.661300778388977, | |
| "learning_rate": 0.00018609568141482132, | |
| "loss": 1.4712, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.768566493955095, | |
| "grad_norm": 0.5695850253105164, | |
| "learning_rate": 0.00018558012685206997, | |
| "loss": 1.4348, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.7772020725388601, | |
| "grad_norm": 0.610674262046814, | |
| "learning_rate": 0.00018505592876081318, | |
| "loss": 1.504, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.7772020725388601, | |
| "eval_loss": 1.4339938163757324, | |
| "eval_runtime": 74.5243, | |
| "eval_samples_per_second": 31.064, | |
| "eval_steps_per_second": 3.891, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.7858376511226253, | |
| "grad_norm": 0.6574162244796753, | |
| "learning_rate": 0.00018452314008466432, | |
| "loss": 1.4541, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.7944732297063903, | |
| "grad_norm": 0.622951090335846, | |
| "learning_rate": 0.00018398181463487933, | |
| "loss": 1.4335, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.8031088082901554, | |
| "grad_norm": 0.7158159017562866, | |
| "learning_rate": 0.0001834320070849219, | |
| "loss": 1.3933, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.8117443868739206, | |
| "grad_norm": 0.6937190294265747, | |
| "learning_rate": 0.0001828737729649414, | |
| "loss": 1.4129, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.8203799654576857, | |
| "grad_norm": 0.6910032629966736, | |
| "learning_rate": 0.00018230716865616452, | |
| "loss": 1.4415, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.8290155440414507, | |
| "grad_norm": 0.7496052980422974, | |
| "learning_rate": 0.00018173225138520065, | |
| "loss": 1.3115, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.8376511226252159, | |
| "grad_norm": 0.8548805117607117, | |
| "learning_rate": 0.00018114907921826215, | |
| "loss": 1.3782, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.846286701208981, | |
| "grad_norm": 0.7024548053741455, | |
| "learning_rate": 0.0001805577110552997, | |
| "loss": 1.3649, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.8549222797927462, | |
| "grad_norm": 0.6912006139755249, | |
| "learning_rate": 0.0001799582066240534, | |
| "loss": 1.3884, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.8635578583765112, | |
| "grad_norm": 0.7504778504371643, | |
| "learning_rate": 0.0001793506264740203, | |
| "loss": 1.4177, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.8635578583765112, | |
| "eval_loss": 1.371172308921814, | |
| "eval_runtime": 74.2999, | |
| "eval_samples_per_second": 31.157, | |
| "eval_steps_per_second": 3.903, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.8721934369602763, | |
| "grad_norm": 0.7364081740379333, | |
| "learning_rate": 0.00017873503197033902, | |
| "loss": 1.3732, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.8808290155440415, | |
| "grad_norm": 0.7568293809890747, | |
| "learning_rate": 0.00017811148528759183, | |
| "loss": 1.3572, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.8894645941278065, | |
| "grad_norm": 0.8201608657836914, | |
| "learning_rate": 0.00017748004940352518, | |
| "loss": 1.3735, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.8981001727115717, | |
| "grad_norm": 0.7080292701721191, | |
| "learning_rate": 0.00017684078809268887, | |
| "loss": 1.3454, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.9067357512953368, | |
| "grad_norm": 0.870185911655426, | |
| "learning_rate": 0.00017619376591999493, | |
| "loss": 1.3371, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.9153713298791019, | |
| "grad_norm": 0.767082691192627, | |
| "learning_rate": 0.00017553904823419667, | |
| "loss": 1.3524, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.924006908462867, | |
| "grad_norm": 0.6791857481002808, | |
| "learning_rate": 0.00017487670116128832, | |
| "loss": 1.3515, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.9326424870466321, | |
| "grad_norm": 0.897812008857727, | |
| "learning_rate": 0.0001742067915978266, | |
| "loss": 1.3075, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.9412780656303973, | |
| "grad_norm": 0.8470781445503235, | |
| "learning_rate": 0.00017352938720417398, | |
| "loss": 1.2876, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.9499136442141624, | |
| "grad_norm": 0.7665865421295166, | |
| "learning_rate": 0.0001728445563976652, | |
| "loss": 1.4049, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.9499136442141624, | |
| "eval_loss": 1.3132154941558838, | |
| "eval_runtime": 74.4946, | |
| "eval_samples_per_second": 31.076, | |
| "eval_steps_per_second": 3.893, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.9585492227979274, | |
| "grad_norm": 0.709002673625946, | |
| "learning_rate": 0.0001721523683456972, | |
| "loss": 1.3671, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.9671848013816926, | |
| "grad_norm": 0.7099783420562744, | |
| "learning_rate": 0.00017145289295874302, | |
| "loss": 1.3471, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.9758203799654577, | |
| "grad_norm": 0.6939783096313477, | |
| "learning_rate": 0.00017074620088329122, | |
| "loss": 1.3012, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.9844559585492227, | |
| "grad_norm": 0.8194535374641418, | |
| "learning_rate": 0.00017003236349471035, | |
| "loss": 1.2853, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.9930915371329879, | |
| "grad_norm": 0.7694395184516907, | |
| "learning_rate": 0.00016931145289004023, | |
| "loss": 1.3093, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.001727115716753, | |
| "grad_norm": 0.7333533763885498, | |
| "learning_rate": 0.0001685835418807103, | |
| "loss": 1.3436, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.0103626943005182, | |
| "grad_norm": 0.7282711863517761, | |
| "learning_rate": 0.00016784870398518545, | |
| "loss": 1.3019, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.0189982728842832, | |
| "grad_norm": 0.8324429392814636, | |
| "learning_rate": 0.00016710701342154106, | |
| "loss": 1.2171, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.0276338514680483, | |
| "grad_norm": 0.7838461995124817, | |
| "learning_rate": 0.00016635854509996668, | |
| "loss": 1.2805, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.0362694300518134, | |
| "grad_norm": 0.9009427428245544, | |
| "learning_rate": 0.00016560337461520036, | |
| "loss": 1.2174, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.0362694300518134, | |
| "eval_loss": 1.247739315032959, | |
| "eval_runtime": 74.3416, | |
| "eval_samples_per_second": 31.14, | |
| "eval_steps_per_second": 3.901, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.0449050086355787, | |
| "grad_norm": 0.817688524723053, | |
| "learning_rate": 0.00016484157823889363, | |
| "loss": 1.3382, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.0535405872193437, | |
| "grad_norm": 0.9377408623695374, | |
| "learning_rate": 0.00016407323291190803, | |
| "loss": 1.187, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.0621761658031088, | |
| "grad_norm": 0.7849322557449341, | |
| "learning_rate": 0.00016329841623654434, | |
| "loss": 1.2647, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.0708117443868739, | |
| "grad_norm": 0.8397180438041687, | |
| "learning_rate": 0.00016251720646870443, | |
| "loss": 1.2102, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.079447322970639, | |
| "grad_norm": 0.9595755934715271, | |
| "learning_rate": 0.00016172968250998792, | |
| "loss": 1.1938, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.0880829015544042, | |
| "grad_norm": 0.7337958216667175, | |
| "learning_rate": 0.00016093592389972286, | |
| "loss": 1.2553, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.0967184801381693, | |
| "grad_norm": 0.7563393115997314, | |
| "learning_rate": 0.0001601360108069324, | |
| "loss": 1.2577, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.1053540587219344, | |
| "grad_norm": 0.8453429937362671, | |
| "learning_rate": 0.0001593300240222379, | |
| "loss": 1.2466, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.1139896373056994, | |
| "grad_norm": 0.8459578156471252, | |
| "learning_rate": 0.00015851804494969893, | |
| "loss": 1.2145, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.1226252158894645, | |
| "grad_norm": 0.9956552982330322, | |
| "learning_rate": 0.00015770015559859172, | |
| "loss": 1.1838, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.1226252158894645, | |
| "eval_loss": 1.1956804990768433, | |
| "eval_runtime": 74.2605, | |
| "eval_samples_per_second": 31.174, | |
| "eval_steps_per_second": 3.905, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.1312607944732298, | |
| "grad_norm": 1.0404267311096191, | |
| "learning_rate": 0.00015687643857512616, | |
| "loss": 1.2361, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.1398963730569949, | |
| "grad_norm": 1.0246553421020508, | |
| "learning_rate": 0.00015604697707410255, | |
| "loss": 1.1873, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.14853195164076, | |
| "grad_norm": 0.8831927180290222, | |
| "learning_rate": 0.0001552118548705094, | |
| "loss": 1.1783, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.157167530224525, | |
| "grad_norm": 0.9147486686706543, | |
| "learning_rate": 0.0001543711563110616, | |
| "loss": 1.1853, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.16580310880829, | |
| "grad_norm": 0.9496821165084839, | |
| "learning_rate": 0.000153524966305682, | |
| "loss": 1.1501, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.1744386873920551, | |
| "grad_norm": 0.9167485237121582, | |
| "learning_rate": 0.00015267337031892527, | |
| "loss": 1.2301, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.1830742659758204, | |
| "grad_norm": 0.861179769039154, | |
| "learning_rate": 0.0001518164543613462, | |
| "loss": 1.1827, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.1917098445595855, | |
| "grad_norm": 1.0012174844741821, | |
| "learning_rate": 0.00015095430498081257, | |
| "loss": 1.1598, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.2003454231433506, | |
| "grad_norm": 0.9084812998771667, | |
| "learning_rate": 0.000150087009253764, | |
| "loss": 1.1446, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.2089810017271156, | |
| "grad_norm": 0.9342795610427856, | |
| "learning_rate": 0.0001492146547764172, | |
| "loss": 1.1408, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.2089810017271156, | |
| "eval_loss": 1.1498360633850098, | |
| "eval_runtime": 74.2845, | |
| "eval_samples_per_second": 31.164, | |
| "eval_steps_per_second": 3.904, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.2176165803108807, | |
| "grad_norm": 1.0500714778900146, | |
| "learning_rate": 0.00014833732965591887, | |
| "loss": 1.1475, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.226252158894646, | |
| "grad_norm": 1.031998872756958, | |
| "learning_rate": 0.00014745512250144695, | |
| "loss": 1.121, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.234887737478411, | |
| "grad_norm": 1.0070405006408691, | |
| "learning_rate": 0.00014656812241526117, | |
| "loss": 1.1167, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.2435233160621761, | |
| "grad_norm": 1.0366291999816895, | |
| "learning_rate": 0.0001456764189837037, | |
| "loss": 1.1365, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.2521588946459412, | |
| "grad_norm": 0.9328962564468384, | |
| "learning_rate": 0.000144780102268151, | |
| "loss": 1.1804, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.2607944732297063, | |
| "grad_norm": 0.875531017780304, | |
| "learning_rate": 0.000143879262795918, | |
| "loss": 1.1061, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.2694300518134716, | |
| "grad_norm": 1.023848533630371, | |
| "learning_rate": 0.00014297399155111432, | |
| "loss": 1.0955, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.2780656303972366, | |
| "grad_norm": 0.9239136576652527, | |
| "learning_rate": 0.00014206437996545554, | |
| "loss": 1.1792, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.2867012089810017, | |
| "grad_norm": 0.9566736221313477, | |
| "learning_rate": 0.0001411505199090283, | |
| "loss": 1.1599, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.2953367875647668, | |
| "grad_norm": 0.8936079740524292, | |
| "learning_rate": 0.00014023250368101157, | |
| "loss": 1.0861, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.2953367875647668, | |
| "eval_loss": 1.0975605249404907, | |
| "eval_runtime": 74.2879, | |
| "eval_samples_per_second": 31.163, | |
| "eval_steps_per_second": 3.904, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.3039723661485318, | |
| "grad_norm": 0.7882747650146484, | |
| "learning_rate": 0.00013931042400035462, | |
| "loss": 1.0991, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.3126079447322971, | |
| "grad_norm": 1.0932565927505493, | |
| "learning_rate": 0.00013838437399641226, | |
| "loss": 1.1312, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.3212435233160622, | |
| "grad_norm": 0.849192202091217, | |
| "learning_rate": 0.00013745444719953908, | |
| "loss": 1.1094, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.3298791018998273, | |
| "grad_norm": 1.168214201927185, | |
| "learning_rate": 0.0001365207375316428, | |
| "loss": 1.0642, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.3385146804835923, | |
| "grad_norm": 0.8027725219726562, | |
| "learning_rate": 0.00013558333929669826, | |
| "loss": 1.0682, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.3471502590673574, | |
| "grad_norm": 0.9536592960357666, | |
| "learning_rate": 0.0001346423471712228, | |
| "loss": 1.1241, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.3557858376511227, | |
| "grad_norm": 1.0561705827713013, | |
| "learning_rate": 0.00013369785619471398, | |
| "loss": 1.1582, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.3644214162348878, | |
| "grad_norm": 1.3560823202133179, | |
| "learning_rate": 0.0001327499617600508, | |
| "loss": 1.0265, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.3730569948186528, | |
| "grad_norm": 1.0698766708374023, | |
| "learning_rate": 0.00013179875960385885, | |
| "loss": 1.0433, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.381692573402418, | |
| "grad_norm": 1.011797308921814, | |
| "learning_rate": 0.00013084434579684114, | |
| "loss": 1.0428, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.381692573402418, | |
| "eval_loss": 1.050079345703125, | |
| "eval_runtime": 74.2925, | |
| "eval_samples_per_second": 31.161, | |
| "eval_steps_per_second": 3.903, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.390328151986183, | |
| "grad_norm": 1.052328109741211, | |
| "learning_rate": 0.00012988681673407502, | |
| "loss": 1.0955, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.3989637305699483, | |
| "grad_norm": 0.975304126739502, | |
| "learning_rate": 0.0001289262691252763, | |
| "loss": 1.0776, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.4075993091537133, | |
| "grad_norm": 0.9506198763847351, | |
| "learning_rate": 0.00012796279998503174, | |
| "loss": 1.0708, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.4162348877374784, | |
| "grad_norm": 0.9697166085243225, | |
| "learning_rate": 0.0001269965066230005, | |
| "loss": 1.0098, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.4248704663212435, | |
| "grad_norm": 1.0433659553527832, | |
| "learning_rate": 0.00012602748663408613, | |
| "loss": 1.0346, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.4335060449050085, | |
| "grad_norm": 1.0530465841293335, | |
| "learning_rate": 0.00012505583788857924, | |
| "loss": 1.1224, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.4421416234887738, | |
| "grad_norm": 0.9902591705322266, | |
| "learning_rate": 0.0001240816585222731, | |
| "loss": 1.1215, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.450777202072539, | |
| "grad_norm": 0.9624248147010803, | |
| "learning_rate": 0.00012310504692655166, | |
| "loss": 1.028, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.459412780656304, | |
| "grad_norm": 1.3916789293289185, | |
| "learning_rate": 0.0001221261017384522, | |
| "loss": 1.0322, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.468048359240069, | |
| "grad_norm": 1.3031835556030273, | |
| "learning_rate": 0.00012114492183070323, | |
| "loss": 0.9959, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.468048359240069, | |
| "eval_loss": 1.0126487016677856, | |
| "eval_runtime": 74.2735, | |
| "eval_samples_per_second": 31.169, | |
| "eval_steps_per_second": 3.904, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.4766839378238341, | |
| "grad_norm": 0.9306958913803101, | |
| "learning_rate": 0.00012016160630173807, | |
| "loss": 1.0158, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.4853195164075994, | |
| "grad_norm": 1.1065701246261597, | |
| "learning_rate": 0.00011917625446568626, | |
| "loss": 1.0134, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.4939550949913645, | |
| "grad_norm": 0.9095447659492493, | |
| "learning_rate": 0.00011818896584234287, | |
| "loss": 1.0405, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.5025906735751295, | |
| "grad_norm": 1.1259651184082031, | |
| "learning_rate": 0.00011719984014711693, | |
| "loss": 1.0454, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.5112262521588946, | |
| "grad_norm": 1.0899256467819214, | |
| "learning_rate": 0.00011620897728096047, | |
| "loss": 1.0925, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.5198618307426597, | |
| "grad_norm": 1.173726201057434, | |
| "learning_rate": 0.00011521647732027843, | |
| "loss": 1.0111, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.528497409326425, | |
| "grad_norm": 0.9733538031578064, | |
| "learning_rate": 0.00011422244050682097, | |
| "loss": 1.0432, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.5371329879101898, | |
| "grad_norm": 1.2745634317398071, | |
| "learning_rate": 0.00011322696723755935, | |
| "loss": 1.035, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.545768566493955, | |
| "grad_norm": 0.9993700385093689, | |
| "learning_rate": 0.00011223015805454573, | |
| "loss": 1.0128, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.5544041450777202, | |
| "grad_norm": 1.0131609439849854, | |
| "learning_rate": 0.00011123211363475863, | |
| "loss": 1.0223, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.5544041450777202, | |
| "eval_loss": 0.9773589372634888, | |
| "eval_runtime": 74.3225, | |
| "eval_samples_per_second": 31.148, | |
| "eval_steps_per_second": 3.902, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.5630397236614852, | |
| "grad_norm": 0.9971020221710205, | |
| "learning_rate": 0.00011023293477993446, | |
| "loss": 1.0477, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.5716753022452505, | |
| "grad_norm": 0.9657288193702698, | |
| "learning_rate": 0.00010923272240638676, | |
| "loss": 1.0412, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.5803108808290154, | |
| "grad_norm": 1.0480608940124512, | |
| "learning_rate": 0.00010823157753481367, | |
| "loss": 1.0009, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.5889464594127807, | |
| "grad_norm": 1.0142576694488525, | |
| "learning_rate": 0.00010722960128009491, | |
| "loss": 1.0039, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.5975820379965457, | |
| "grad_norm": 1.158823847770691, | |
| "learning_rate": 0.00010622689484107935, | |
| "loss": 1.033, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.6062176165803108, | |
| "grad_norm": 1.025612473487854, | |
| "learning_rate": 0.00010522355949036386, | |
| "loss": 0.9911, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.614853195164076, | |
| "grad_norm": 1.2156877517700195, | |
| "learning_rate": 0.00010421969656406495, | |
| "loss": 0.9672, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.623488773747841, | |
| "grad_norm": 1.1628742218017578, | |
| "learning_rate": 0.00010321540745158382, | |
| "loss": 0.9499, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.6321243523316062, | |
| "grad_norm": 0.9938153624534607, | |
| "learning_rate": 0.00010221079358536619, | |
| "loss": 0.972, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.6407599309153713, | |
| "grad_norm": 1.0305359363555908, | |
| "learning_rate": 0.00010120595643065769, | |
| "loss": 0.9733, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.6407599309153713, | |
| "eval_loss": 0.9481803774833679, | |
| "eval_runtime": 74.2826, | |
| "eval_samples_per_second": 31.165, | |
| "eval_steps_per_second": 3.904, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.6493955094991364, | |
| "grad_norm": 1.048543930053711, | |
| "learning_rate": 0.00010020099747525586, | |
| "loss": 0.9864, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.6580310880829017, | |
| "grad_norm": 0.9962440729141235, | |
| "learning_rate": 9.919601821926009e-05, | |
| "loss": 0.9375, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 0.939709484577179, | |
| "learning_rate": 9.819112016482001e-05, | |
| "loss": 1.0237, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.6753022452504318, | |
| "grad_norm": 0.9809345602989197, | |
| "learning_rate": 9.718640480588409e-05, | |
| "loss": 0.9283, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.6839378238341969, | |
| "grad_norm": 1.0643101930618286, | |
| "learning_rate": 9.618197361794854e-05, | |
| "loss": 0.9252, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.692573402417962, | |
| "grad_norm": 1.0364254713058472, | |
| "learning_rate": 9.517792804780867e-05, | |
| "loss": 0.9815, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.7012089810017272, | |
| "grad_norm": 1.0348941087722778, | |
| "learning_rate": 9.417436950331256e-05, | |
| "loss": 0.9443, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.709844559585492, | |
| "grad_norm": 0.9953681230545044, | |
| "learning_rate": 9.31713993431191e-05, | |
| "loss": 0.8284, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.7184801381692574, | |
| "grad_norm": 1.092307209968567, | |
| "learning_rate": 9.216911886646085e-05, | |
| "loss": 0.9878, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.7271157167530224, | |
| "grad_norm": 1.0429028272628784, | |
| "learning_rate": 9.116762930291282e-05, | |
| "loss": 0.9367, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.7271157167530224, | |
| "eval_loss": 0.9206886887550354, | |
| "eval_runtime": 76.0782, | |
| "eval_samples_per_second": 30.429, | |
| "eval_steps_per_second": 3.812, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.7357512953367875, | |
| "grad_norm": 1.346091389656067, | |
| "learning_rate": 9.016703180216834e-05, | |
| "loss": 0.9867, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.7443868739205528, | |
| "grad_norm": 1.3426605463027954, | |
| "learning_rate": 8.916742742382316e-05, | |
| "loss": 0.9452, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.7530224525043177, | |
| "grad_norm": 1.1281945705413818, | |
| "learning_rate": 8.816891712716834e-05, | |
| "loss": 0.9285, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.761658031088083, | |
| "grad_norm": 1.0097074508666992, | |
| "learning_rate": 8.717160176099358e-05, | |
| "loss": 0.9402, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.770293609671848, | |
| "grad_norm": 0.9986193180084229, | |
| "learning_rate": 8.617558205340144e-05, | |
| "loss": 0.9679, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.778929188255613, | |
| "grad_norm": 0.9868291020393372, | |
| "learning_rate": 8.518095860163395e-05, | |
| "loss": 0.9703, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.7875647668393784, | |
| "grad_norm": 1.131056785583496, | |
| "learning_rate": 8.418783186191236e-05, | |
| "loss": 0.9633, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.7962003454231432, | |
| "grad_norm": 1.214119791984558, | |
| "learning_rate": 8.31963021392911e-05, | |
| "loss": 1.0013, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.8048359240069085, | |
| "grad_norm": 1.0295566320419312, | |
| "learning_rate": 8.220646957752716e-05, | |
| "loss": 0.924, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.8134715025906736, | |
| "grad_norm": 1.0361146926879883, | |
| "learning_rate": 8.121843414896547e-05, | |
| "loss": 0.9298, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.8134715025906736, | |
| "eval_loss": 0.8968186974525452, | |
| "eval_runtime": 74.3145, | |
| "eval_samples_per_second": 31.151, | |
| "eval_steps_per_second": 3.902, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.8221070811744386, | |
| "grad_norm": 1.1014063358306885, | |
| "learning_rate": 8.023229564444188e-05, | |
| "loss": 0.868, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.830742659758204, | |
| "grad_norm": 1.1780112981796265, | |
| "learning_rate": 7.924815366320434e-05, | |
| "loss": 0.904, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.8393782383419688, | |
| "grad_norm": 1.1275811195373535, | |
| "learning_rate": 7.826610760285343e-05, | |
| "loss": 0.8838, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.848013816925734, | |
| "grad_norm": 1.2763252258300781, | |
| "learning_rate": 7.728625664930336e-05, | |
| "loss": 0.8688, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.8566493955094991, | |
| "grad_norm": 1.1182912588119507, | |
| "learning_rate": 7.630869976676413e-05, | |
| "loss": 0.9025, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.8652849740932642, | |
| "grad_norm": 1.165228247642517, | |
| "learning_rate": 7.533353568774634e-05, | |
| "loss": 0.9962, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.8739205526770295, | |
| "grad_norm": 1.0472362041473389, | |
| "learning_rate": 7.436086290308919e-05, | |
| "loss": 0.9682, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.8825561312607944, | |
| "grad_norm": 1.0311408042907715, | |
| "learning_rate": 7.339077965201305e-05, | |
| "loss": 0.9335, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.8911917098445596, | |
| "grad_norm": 1.0582579374313354, | |
| "learning_rate": 7.242338391219734e-05, | |
| "loss": 0.985, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.8998272884283247, | |
| "grad_norm": 0.923521876335144, | |
| "learning_rate": 7.145877338988487e-05, | |
| "loss": 0.9738, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.8998272884283247, | |
| "eval_loss": 0.872840166091919, | |
| "eval_runtime": 74.2854, | |
| "eval_samples_per_second": 31.164, | |
| "eval_steps_per_second": 3.904, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.9084628670120898, | |
| "grad_norm": 1.0986098051071167, | |
| "learning_rate": 7.049704551001358e-05, | |
| "loss": 0.9313, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.917098445595855, | |
| "grad_norm": 1.050269365310669, | |
| "learning_rate": 6.953829740637662e-05, | |
| "loss": 0.8601, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.92573402417962, | |
| "grad_norm": 1.2400585412979126, | |
| "learning_rate": 6.858262591181206e-05, | |
| "loss": 0.8907, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.9343696027633852, | |
| "grad_norm": 1.0596503019332886, | |
| "learning_rate": 6.763012754842277e-05, | |
| "loss": 0.9053, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.9430051813471503, | |
| "grad_norm": 0.9167270064353943, | |
| "learning_rate": 6.668089851782769e-05, | |
| "loss": 0.9776, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.9516407599309153, | |
| "grad_norm": 1.2059139013290405, | |
| "learning_rate": 6.573503469144566e-05, | |
| "loss": 0.8575, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.9602763385146806, | |
| "grad_norm": 1.0621919631958008, | |
| "learning_rate": 6.479263160081242e-05, | |
| "loss": 0.9437, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 1.9689119170984455, | |
| "grad_norm": 1.1505554914474487, | |
| "learning_rate": 6.385378442793188e-05, | |
| "loss": 0.8951, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.9775474956822108, | |
| "grad_norm": 1.052738904953003, | |
| "learning_rate": 6.29185879956632e-05, | |
| "loss": 0.8694, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 1.9861830742659758, | |
| "grad_norm": 0.9112501740455627, | |
| "learning_rate": 6.198713675814318e-05, | |
| "loss": 0.8679, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.9861830742659758, | |
| "eval_loss": 0.8568958640098572, | |
| "eval_runtime": 74.2813, | |
| "eval_samples_per_second": 31.165, | |
| "eval_steps_per_second": 3.904, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.994818652849741, | |
| "grad_norm": 0.9624518156051636, | |
| "learning_rate": 6.105952479124696e-05, | |
| "loss": 0.9374, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.003454231433506, | |
| "grad_norm": 1.1013827323913574, | |
| "learning_rate": 6.0135845783086145e-05, | |
| "loss": 0.8569, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.012089810017271, | |
| "grad_norm": 1.0752055644989014, | |
| "learning_rate": 5.921619302454645e-05, | |
| "loss": 0.9713, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.0207253886010363, | |
| "grad_norm": 1.123271107673645, | |
| "learning_rate": 5.830065939986553e-05, | |
| "loss": 0.8359, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.029360967184801, | |
| "grad_norm": 1.0255523920059204, | |
| "learning_rate": 5.73893373772515e-05, | |
| "loss": 0.8339, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.0379965457685665, | |
| "grad_norm": 1.169399619102478, | |
| "learning_rate": 5.6482318999543807e-05, | |
| "loss": 0.8717, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.0466321243523318, | |
| "grad_norm": 1.0524979829788208, | |
| "learning_rate": 5.5579695874917115e-05, | |
| "loss": 0.8328, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.0552677029360966, | |
| "grad_norm": 0.9218592047691345, | |
| "learning_rate": 5.468155916762869e-05, | |
| "loss": 0.8556, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.063903281519862, | |
| "grad_norm": 1.2179642915725708, | |
| "learning_rate": 5.3787999588811136e-05, | |
| "loss": 0.8256, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.0725388601036268, | |
| "grad_norm": 1.2335243225097656, | |
| "learning_rate": 5.28991073873105e-05, | |
| "loss": 0.891, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.0725388601036268, | |
| "eval_loss": 0.8398398756980896, | |
| "eval_runtime": 74.3373, | |
| "eval_samples_per_second": 31.142, | |
| "eval_steps_per_second": 3.901, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.081174438687392, | |
| "grad_norm": 1.2605432271957397, | |
| "learning_rate": 5.201497234057111e-05, | |
| "loss": 0.7942, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.0898100172711573, | |
| "grad_norm": 1.0830875635147095, | |
| "learning_rate": 5.1135683745568455e-05, | |
| "loss": 0.8772, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.098445595854922, | |
| "grad_norm": 0.9472030997276306, | |
| "learning_rate": 5.02613304097898e-05, | |
| "loss": 0.926, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.1070811744386875, | |
| "grad_norm": 1.091093897819519, | |
| "learning_rate": 4.939200064226509e-05, | |
| "loss": 0.8607, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.1157167530224523, | |
| "grad_norm": 1.186557412147522, | |
| "learning_rate": 4.8527782244647656e-05, | |
| "loss": 0.9168, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.1243523316062176, | |
| "grad_norm": 1.1364177465438843, | |
| "learning_rate": 4.766876250234621e-05, | |
| "loss": 0.8785, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.132987910189983, | |
| "grad_norm": 1.1587848663330078, | |
| "learning_rate": 4.681502817570929e-05, | |
| "loss": 0.8479, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.1416234887737478, | |
| "grad_norm": 1.1975603103637695, | |
| "learning_rate": 4.59666654912623e-05, | |
| "loss": 0.9014, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.150259067357513, | |
| "grad_norm": 1.0785084962844849, | |
| "learning_rate": 4.512376013299895e-05, | |
| "loss": 0.8464, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.158894645941278, | |
| "grad_norm": 1.0377299785614014, | |
| "learning_rate": 4.428639723372706e-05, | |
| "loss": 0.8461, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.158894645941278, | |
| "eval_loss": 0.8259330987930298, | |
| "eval_runtime": 74.3192, | |
| "eval_samples_per_second": 31.149, | |
| "eval_steps_per_second": 3.902, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.167530224525043, | |
| "grad_norm": 1.2178107500076294, | |
| "learning_rate": 4.345466136647018e-05, | |
| "loss": 0.7985, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 2.1761658031088085, | |
| "grad_norm": 1.0462040901184082, | |
| "learning_rate": 4.2628636535926005e-05, | |
| "loss": 0.8091, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.1848013816925733, | |
| "grad_norm": 1.0872950553894043, | |
| "learning_rate": 4.180840616998164e-05, | |
| "loss": 0.8729, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 2.1934369602763386, | |
| "grad_norm": 1.2299045324325562, | |
| "learning_rate": 4.099405311128774e-05, | |
| "loss": 0.8864, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.2020725388601035, | |
| "grad_norm": 1.0725489854812622, | |
| "learning_rate": 4.018565960889137e-05, | |
| "loss": 0.9033, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.2107081174438687, | |
| "grad_norm": 1.1338095664978027, | |
| "learning_rate": 3.9383307309928744e-05, | |
| "loss": 0.8792, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.219343696027634, | |
| "grad_norm": 1.0339998006820679, | |
| "learning_rate": 3.858707725137921e-05, | |
| "loss": 0.8888, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 2.227979274611399, | |
| "grad_norm": 1.1130526065826416, | |
| "learning_rate": 3.7797049851880325e-05, | |
| "loss": 0.7557, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.236614853195164, | |
| "grad_norm": 1.013401746749878, | |
| "learning_rate": 3.701330490360583e-05, | |
| "loss": 0.8868, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 2.245250431778929, | |
| "grad_norm": 1.0278291702270508, | |
| "learning_rate": 3.623592156420661e-05, | |
| "loss": 0.8474, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.245250431778929, | |
| "eval_loss": 0.8156528472900391, | |
| "eval_runtime": 74.3107, | |
| "eval_samples_per_second": 31.153, | |
| "eval_steps_per_second": 3.903, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.2538860103626943, | |
| "grad_norm": 1.040278673171997, | |
| "learning_rate": 3.546497834881572e-05, | |
| "loss": 0.8268, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.2625215889464596, | |
| "grad_norm": 1.1358767747879028, | |
| "learning_rate": 3.4700553122118714e-05, | |
| "loss": 0.8267, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.2711571675302245, | |
| "grad_norm": 1.1165881156921387, | |
| "learning_rate": 3.394272309048895e-05, | |
| "loss": 0.9085, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 2.2797927461139897, | |
| "grad_norm": 0.9508546590805054, | |
| "learning_rate": 3.319156479419032e-05, | |
| "loss": 0.8471, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.2884283246977546, | |
| "grad_norm": 1.192872166633606, | |
| "learning_rate": 3.244715409964625e-05, | |
| "loss": 0.8641, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.29706390328152, | |
| "grad_norm": 1.092782735824585, | |
| "learning_rate": 3.170956619177749e-05, | |
| "loss": 0.8154, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.305699481865285, | |
| "grad_norm": 1.425848126411438, | |
| "learning_rate": 3.097887556640855e-05, | |
| "loss": 0.8828, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 2.31433506044905, | |
| "grad_norm": 1.099098801612854, | |
| "learning_rate": 3.025515602274346e-05, | |
| "loss": 0.8424, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 2.3229706390328153, | |
| "grad_norm": 1.2581557035446167, | |
| "learning_rate": 2.9538480655912415e-05, | |
| "loss": 0.8606, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 2.33160621761658, | |
| "grad_norm": 1.152100682258606, | |
| "learning_rate": 2.8828921849588898e-05, | |
| "loss": 0.8429, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.33160621761658, | |
| "eval_loss": 0.8052871227264404, | |
| "eval_runtime": 74.353, | |
| "eval_samples_per_second": 31.135, | |
| "eval_steps_per_second": 3.9, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.3402417962003454, | |
| "grad_norm": 1.0980738401412964, | |
| "learning_rate": 2.8126551268679134e-05, | |
| "loss": 0.8846, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 2.3488773747841103, | |
| "grad_norm": 0.9057421684265137, | |
| "learning_rate": 2.7431439852084072e-05, | |
| "loss": 0.8655, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 2.3575129533678756, | |
| "grad_norm": 1.0247701406478882, | |
| "learning_rate": 2.6743657805534396e-05, | |
| "loss": 0.7428, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 2.366148531951641, | |
| "grad_norm": 1.1374061107635498, | |
| "learning_rate": 2.6063274594500086e-05, | |
| "loss": 0.8294, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 2.3747841105354057, | |
| "grad_norm": 1.0613446235656738, | |
| "learning_rate": 2.5390358937174165e-05, | |
| "loss": 0.8164, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.383419689119171, | |
| "grad_norm": 1.053713083267212, | |
| "learning_rate": 2.472497879753235e-05, | |
| "loss": 0.8204, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 2.3920552677029363, | |
| "grad_norm": 1.1319301128387451, | |
| "learning_rate": 2.4067201378468807e-05, | |
| "loss": 0.7942, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 2.400690846286701, | |
| "grad_norm": 1.3451365232467651, | |
| "learning_rate": 2.3417093115008525e-05, | |
| "loss": 0.8676, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 2.4093264248704664, | |
| "grad_norm": 0.9686126112937927, | |
| "learning_rate": 2.277471966759771e-05, | |
| "loss": 0.8581, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.4179620034542313, | |
| "grad_norm": 1.1396652460098267, | |
| "learning_rate": 2.2140145915471778e-05, | |
| "loss": 0.9025, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.4179620034542313, | |
| "eval_loss": 0.7989487051963806, | |
| "eval_runtime": 74.4139, | |
| "eval_samples_per_second": 31.11, | |
| "eval_steps_per_second": 3.897, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.4265975820379966, | |
| "grad_norm": 1.4245641231536865, | |
| "learning_rate": 2.1513435950102924e-05, | |
| "loss": 0.793, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 2.4352331606217614, | |
| "grad_norm": 1.1148205995559692, | |
| "learning_rate": 2.0894653068726688e-05, | |
| "loss": 0.8414, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 2.4438687392055267, | |
| "grad_norm": 1.168946385383606, | |
| "learning_rate": 2.0283859767949078e-05, | |
| "loss": 0.8287, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 2.452504317789292, | |
| "grad_norm": 1.1215981245040894, | |
| "learning_rate": 1.9681117737434606e-05, | |
| "loss": 0.8029, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 2.461139896373057, | |
| "grad_norm": 1.2339212894439697, | |
| "learning_rate": 1.9086487853675382e-05, | |
| "loss": 0.8861, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.469775474956822, | |
| "grad_norm": 1.1001675128936768, | |
| "learning_rate": 1.8500030173842885e-05, | |
| "loss": 0.865, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 2.4784110535405874, | |
| "grad_norm": 1.1429920196533203, | |
| "learning_rate": 1.7921803929722082e-05, | |
| "loss": 0.8753, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 2.4870466321243523, | |
| "grad_norm": 1.0880659818649292, | |
| "learning_rate": 1.7351867521729072e-05, | |
| "loss": 0.8774, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 2.4956822107081176, | |
| "grad_norm": 0.9570063948631287, | |
| "learning_rate": 1.6790278513012925e-05, | |
| "loss": 0.8067, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 2.5043177892918824, | |
| "grad_norm": 1.0723425149917603, | |
| "learning_rate": 1.6237093623641443e-05, | |
| "loss": 0.801, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.5043177892918824, | |
| "eval_loss": 0.7922360301017761, | |
| "eval_runtime": 74.2817, | |
| "eval_samples_per_second": 31.165, | |
| "eval_steps_per_second": 3.904, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.5129533678756477, | |
| "grad_norm": 1.1429632902145386, | |
| "learning_rate": 1.569236872487283e-05, | |
| "loss": 0.7751, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 2.5215889464594126, | |
| "grad_norm": 1.1071122884750366, | |
| "learning_rate": 1.5156158833512523e-05, | |
| "loss": 0.7598, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 2.530224525043178, | |
| "grad_norm": 1.057666301727295, | |
| "learning_rate": 1.462851810635658e-05, | |
| "loss": 0.8577, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 2.538860103626943, | |
| "grad_norm": 1.0120848417282104, | |
| "learning_rate": 1.410949983472205e-05, | |
| "loss": 0.8649, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 2.547495682210708, | |
| "grad_norm": 1.1220532655715942, | |
| "learning_rate": 1.3599156439064309e-05, | |
| "loss": 0.8426, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.5561312607944733, | |
| "grad_norm": 1.1136960983276367, | |
| "learning_rate": 1.3097539463682874e-05, | |
| "loss": 0.8952, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 2.5647668393782386, | |
| "grad_norm": 1.004520297050476, | |
| "learning_rate": 1.26046995715153e-05, | |
| "loss": 0.8849, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 2.5734024179620034, | |
| "grad_norm": 1.01373291015625, | |
| "learning_rate": 1.2120686539020376e-05, | |
| "loss": 0.8147, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 2.5820379965457687, | |
| "grad_norm": 1.0711711645126343, | |
| "learning_rate": 1.1645549251150711e-05, | |
| "loss": 0.7414, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 2.5906735751295336, | |
| "grad_norm": 1.0850844383239746, | |
| "learning_rate": 1.1179335696415306e-05, | |
| "loss": 0.8152, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.5906735751295336, | |
| "eval_loss": 0.7877171635627747, | |
| "eval_runtime": 74.2634, | |
| "eval_samples_per_second": 31.173, | |
| "eval_steps_per_second": 3.905, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.599309153713299, | |
| "grad_norm": 1.0988068580627441, | |
| "learning_rate": 1.0722092962032927e-05, | |
| "loss": 0.7355, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 2.6079447322970637, | |
| "grad_norm": 1.098163366317749, | |
| "learning_rate": 1.0273867229176094e-05, | |
| "loss": 0.7886, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 2.616580310880829, | |
| "grad_norm": 1.2323061227798462, | |
| "learning_rate": 9.834703768307063e-06, | |
| "loss": 0.7491, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 2.6252158894645943, | |
| "grad_norm": 0.9928609728813171, | |
| "learning_rate": 9.404646934605399e-06, | |
| "loss": 0.921, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 2.633851468048359, | |
| "grad_norm": 1.285148024559021, | |
| "learning_rate": 8.983740163488107e-06, | |
| "loss": 0.8066, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 2.6424870466321244, | |
| "grad_norm": 1.1035611629486084, | |
| "learning_rate": 8.572025966222841e-06, | |
| "loss": 0.8209, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 2.6511226252158897, | |
| "grad_norm": 0.8766908049583435, | |
| "learning_rate": 8.169545925634115e-06, | |
| "loss": 0.8807, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 2.6597582037996546, | |
| "grad_norm": 1.2940890789031982, | |
| "learning_rate": 7.776340691903604e-06, | |
| "loss": 0.875, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 2.66839378238342, | |
| "grad_norm": 1.0446336269378662, | |
| "learning_rate": 7.392449978464478e-06, | |
| "loss": 0.789, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 2.6770293609671847, | |
| "grad_norm": 1.1468182802200317, | |
| "learning_rate": 7.0179125579902915e-06, | |
| "loss": 0.8416, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.6770293609671847, | |
| "eval_loss": 0.7846628427505493, | |
| "eval_runtime": 74.271, | |
| "eval_samples_per_second": 31.17, | |
| "eval_steps_per_second": 3.905, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.68566493955095, | |
| "grad_norm": 0.9985455870628357, | |
| "learning_rate": 6.652766258479126e-06, | |
| "loss": 0.7779, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 2.694300518134715, | |
| "grad_norm": 1.0844076871871948, | |
| "learning_rate": 6.2970479594328e-06, | |
| "loss": 0.7998, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 2.70293609671848, | |
| "grad_norm": 1.0988919734954834, | |
| "learning_rate": 5.950793588132253e-06, | |
| "loss": 0.7566, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 2.7115716753022454, | |
| "grad_norm": 1.1833341121673584, | |
| "learning_rate": 5.614038116008824e-06, | |
| "loss": 0.7846, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 2.7202072538860103, | |
| "grad_norm": 1.0258663892745972, | |
| "learning_rate": 5.286815555112101e-06, | |
| "loss": 0.7837, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 2.7288428324697755, | |
| "grad_norm": 1.0312175750732422, | |
| "learning_rate": 4.969158954674902e-06, | |
| "loss": 0.8348, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 2.737478411053541, | |
| "grad_norm": 1.152564287185669, | |
| "learning_rate": 4.6611003977751425e-06, | |
| "loss": 0.8359, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 2.7461139896373057, | |
| "grad_norm": 1.0136488676071167, | |
| "learning_rate": 4.362670998095597e-06, | |
| "loss": 0.8209, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 2.754749568221071, | |
| "grad_norm": 0.9231327176094055, | |
| "learning_rate": 4.073900896781402e-06, | |
| "loss": 0.8148, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 2.763385146804836, | |
| "grad_norm": 0.9632856845855713, | |
| "learning_rate": 3.7948192593957877e-06, | |
| "loss": 0.7827, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.763385146804836, | |
| "eval_loss": 0.7829655408859253, | |
| "eval_runtime": 74.3256, | |
| "eval_samples_per_second": 31.147, | |
| "eval_steps_per_second": 3.902, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.772020725388601, | |
| "grad_norm": 0.9402443170547485, | |
| "learning_rate": 3.525454272974427e-06, | |
| "loss": 0.8453, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 2.780656303972366, | |
| "grad_norm": 1.1812800168991089, | |
| "learning_rate": 3.265833143178543e-06, | |
| "loss": 0.8018, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 2.7892918825561313, | |
| "grad_norm": 0.9228396415710449, | |
| "learning_rate": 3.0159820915471426e-06, | |
| "loss": 0.7994, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 2.7979274611398965, | |
| "grad_norm": 0.9304484724998474, | |
| "learning_rate": 2.7759263528487345e-06, | |
| "loss": 0.744, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 2.8065630397236614, | |
| "grad_norm": 1.064064621925354, | |
| "learning_rate": 2.5456901725325224e-06, | |
| "loss": 0.8863, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 2.8151986183074267, | |
| "grad_norm": 1.0431768894195557, | |
| "learning_rate": 2.3252968042797083e-06, | |
| "loss": 0.8184, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 2.823834196891192, | |
| "grad_norm": 1.1347994804382324, | |
| "learning_rate": 2.114768507654885e-06, | |
| "loss": 0.8135, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 2.832469775474957, | |
| "grad_norm": 1.2335035800933838, | |
| "learning_rate": 1.9141265458578196e-06, | |
| "loss": 0.8298, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 2.8411053540587217, | |
| "grad_norm": 0.9481128454208374, | |
| "learning_rate": 1.7233911835758843e-06, | |
| "loss": 0.8401, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 2.849740932642487, | |
| "grad_norm": 1.182274580001831, | |
| "learning_rate": 1.5425816849373386e-06, | |
| "loss": 0.7997, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.849740932642487, | |
| "eval_loss": 0.7820163369178772, | |
| "eval_runtime": 74.5484, | |
| "eval_samples_per_second": 31.054, | |
| "eval_steps_per_second": 3.89, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.8583765112262522, | |
| "grad_norm": 1.0886763334274292, | |
| "learning_rate": 1.3717163115656962e-06, | |
| "loss": 0.78, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 2.867012089810017, | |
| "grad_norm": 0.9863650798797607, | |
| "learning_rate": 1.2108123207352662e-06, | |
| "loss": 0.8321, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 2.8756476683937824, | |
| "grad_norm": 1.0511281490325928, | |
| "learning_rate": 1.0598859636282156e-06, | |
| "loss": 0.8616, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 2.8842832469775477, | |
| "grad_norm": 1.1429880857467651, | |
| "learning_rate": 9.189524836932029e-07, | |
| "loss": 0.8199, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 2.8929188255613125, | |
| "grad_norm": 1.071950912475586, | |
| "learning_rate": 7.88026115105811e-07, | |
| "loss": 0.8046, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 2.901554404145078, | |
| "grad_norm": 1.19888174533844, | |
| "learning_rate": 6.671200813308742e-07, | |
| "loss": 0.8907, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 2.910189982728843, | |
| "grad_norm": 0.9800041317939758, | |
| "learning_rate": 5.562465937869577e-07, | |
| "loss": 0.8213, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 2.918825561312608, | |
| "grad_norm": 1.047254204750061, | |
| "learning_rate": 4.5541685061299964e-07, | |
| "loss": 0.8474, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 2.927461139896373, | |
| "grad_norm": 1.0879237651824951, | |
| "learning_rate": 3.646410355372831e-07, | |
| "loss": 0.7963, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 2.936096718480138, | |
| "grad_norm": 1.0805079936981201, | |
| "learning_rate": 2.8392831684891374e-07, | |
| "loss": 0.8146, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 2.936096718480138, | |
| "eval_loss": 0.7816377282142639, | |
| "eval_runtime": 74.3018, | |
| "eval_samples_per_second": 31.157, | |
| "eval_steps_per_second": 3.903, | |
| "step": 3400 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3474, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.6308483890777948e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |