diff --git "a/checkpoint-22050/trainer_state.json" "b/checkpoint-22050/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-22050/trainer_state.json" @@ -0,0 +1,4881 @@ +{ + "best_global_step": 22000, + "best_metric": 0.40355798602104187, + "best_model_checkpoint": "Qwen-3-0.6B-it-Medical-LoRA/checkpoint-22000", + "epoch": 49.998867497168746, + "eval_steps": 100, + "global_step": 22050, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.11325028312570781, + "grad_norm": 0.3080866038799286, + "learning_rate": 0.00019075425790754258, + "loss": 1.8711, + "step": 50 + }, + { + "epoch": 0.22650056625141562, + "grad_norm": 0.28974413871765137, + "learning_rate": 0.0001664233576642336, + "loss": 1.338, + "step": 100 + }, + { + "epoch": 0.22650056625141562, + "eval_loss": 1.3033037185668945, + "eval_runtime": 217.1489, + "eval_samples_per_second": 7.225, + "eval_steps_per_second": 0.907, + "step": 100 + }, + { + "epoch": 0.33975084937712347, + "grad_norm": 0.3371483385562897, + "learning_rate": 0.0001420924574209246, + "loss": 1.2923, + "step": 150 + }, + { + "epoch": 0.45300113250283125, + "grad_norm": 0.35730767250061035, + "learning_rate": 0.00011776155717761557, + "loss": 1.2703, + "step": 200 + }, + { + "epoch": 0.45300113250283125, + "eval_loss": 1.2490053176879883, + "eval_runtime": 217.4793, + "eval_samples_per_second": 7.214, + "eval_steps_per_second": 0.906, + "step": 200 + }, + { + "epoch": 0.5662514156285391, + "grad_norm": 0.35110780596733093, + "learning_rate": 9.343065693430657e-05, + "loss": 1.2397, + "step": 250 + }, + { + "epoch": 0.6795016987542469, + "grad_norm": 0.35077276825904846, + "learning_rate": 6.909975669099758e-05, + "loss": 1.232, + "step": 300 + }, + { + "epoch": 0.6795016987542469, + "eval_loss": 1.2211977243423462, + "eval_runtime": 217.5364, + "eval_samples_per_second": 7.213, + "eval_steps_per_second": 0.906, + "step": 300 + }, + { + "epoch": 0.7927519818799547, + "grad_norm": 0.3939191699028015, + "learning_rate": 4.476885644768857e-05, + "loss": 1.2241, + "step": 350 + }, + { + "epoch": 0.9060022650056625, + "grad_norm": 0.366871178150177, + "learning_rate": 2.0437956204379563e-05, + "loss": 1.2078, + "step": 400 + }, + { + "epoch": 0.9060022650056625, + "eval_loss": 1.2062289714813232, + "eval_runtime": 217.4838, + "eval_samples_per_second": 7.214, + "eval_steps_per_second": 0.906, + "step": 400 + }, + { + "epoch": 1.0203850509626273, + "grad_norm": 0.3808969259262085, + "learning_rate": 0.00010164319248826291, + "loss": 1.1819, + "step": 450 + }, + { + "epoch": 1.1336353340883352, + "grad_norm": 0.43216949701309204, + "learning_rate": 8.990610328638498e-05, + "loss": 1.19, + "step": 500 + }, + { + "epoch": 1.1336353340883352, + "eval_loss": 1.1994948387145996, + "eval_runtime": 217.3626, + "eval_samples_per_second": 7.218, + "eval_steps_per_second": 0.906, + "step": 500 + }, + { + "epoch": 1.246885617214043, + "grad_norm": 0.4280295968055725, + "learning_rate": 7.816901408450704e-05, + "loss": 1.1971, + "step": 550 + }, + { + "epoch": 1.3601359003397508, + "grad_norm": 0.4056779146194458, + "learning_rate": 6.643192488262912e-05, + "loss": 1.1771, + "step": 600 + }, + { + "epoch": 1.3601359003397508, + "eval_loss": 1.1834282875061035, + "eval_runtime": 217.4514, + "eval_samples_per_second": 7.215, + "eval_steps_per_second": 0.906, + "step": 600 + }, + { + "epoch": 1.4733861834654587, + "grad_norm": 0.4397243857383728, + "learning_rate": 5.469483568075118e-05, + "loss": 1.1544, + "step": 650 + }, + { + "epoch": 1.5866364665911665, + "grad_norm": 0.4214654862880707, + "learning_rate": 4.295774647887324e-05, + "loss": 1.1789, + "step": 700 + }, + { + "epoch": 1.5866364665911665, + "eval_loss": 1.1712530851364136, + "eval_runtime": 217.6023, + "eval_samples_per_second": 7.21, + "eval_steps_per_second": 0.905, + "step": 700 + }, + { + "epoch": 1.6998867497168741, + "grad_norm": 0.43076109886169434, + "learning_rate": 3.1220657276995305e-05, + "loss": 1.1522, + "step": 750 + }, + { + "epoch": 1.8131370328425822, + "grad_norm": 0.4253358244895935, + "learning_rate": 1.9483568075117372e-05, + "loss": 1.1508, + "step": 800 + }, + { + "epoch": 1.8131370328425822, + "eval_loss": 1.1622345447540283, + "eval_runtime": 217.3951, + "eval_samples_per_second": 7.217, + "eval_steps_per_second": 0.906, + "step": 800 + }, + { + "epoch": 1.9263873159682898, + "grad_norm": 0.4359077215194702, + "learning_rate": 7.746478873239436e-06, + "loss": 1.1422, + "step": 850 + }, + { + "epoch": 2.0407701019252547, + "grad_norm": 0.4511992633342743, + "learning_rate": 0.00014314687602224403, + "loss": 1.149, + "step": 900 + }, + { + "epoch": 2.0407701019252547, + "eval_loss": 1.167581558227539, + "eval_runtime": 218.6888, + "eval_samples_per_second": 7.175, + "eval_steps_per_second": 0.901, + "step": 900 + }, + { + "epoch": 2.1540203850509627, + "grad_norm": 0.47519659996032715, + "learning_rate": 0.00013987569512594048, + "loss": 1.1498, + "step": 950 + }, + { + "epoch": 2.2672706681766703, + "grad_norm": 0.4559363126754761, + "learning_rate": 0.00013660451422963692, + "loss": 1.1469, + "step": 1000 + }, + { + "epoch": 2.2672706681766703, + "eval_loss": 1.1536333560943604, + "eval_runtime": 218.921, + "eval_samples_per_second": 7.167, + "eval_steps_per_second": 0.9, + "step": 1000 + }, + { + "epoch": 2.3805209513023784, + "grad_norm": 0.49805569648742676, + "learning_rate": 0.00013333333333333334, + "loss": 1.123, + "step": 1050 + }, + { + "epoch": 2.493771234428086, + "grad_norm": 0.4767671823501587, + "learning_rate": 0.00013006215243702978, + "loss": 1.1119, + "step": 1100 + }, + { + "epoch": 2.493771234428086, + "eval_loss": 1.1321617364883423, + "eval_runtime": 218.8468, + "eval_samples_per_second": 7.169, + "eval_steps_per_second": 0.9, + "step": 1100 + }, + { + "epoch": 2.607021517553794, + "grad_norm": 0.4313490688800812, + "learning_rate": 0.0001267909715407262, + "loss": 1.1096, + "step": 1150 + }, + { + "epoch": 2.7202718006795017, + "grad_norm": 0.46401792764663696, + "learning_rate": 0.00012351979064442265, + "loss": 1.0929, + "step": 1200 + }, + { + "epoch": 2.7202718006795017, + "eval_loss": 1.1138092279434204, + "eval_runtime": 219.2403, + "eval_samples_per_second": 7.157, + "eval_steps_per_second": 0.899, + "step": 1200 + }, + { + "epoch": 2.8335220838052093, + "grad_norm": 0.46533071994781494, + "learning_rate": 0.00012024860974811907, + "loss": 1.0894, + "step": 1250 + }, + { + "epoch": 2.9467723669309174, + "grad_norm": 0.42782357335090637, + "learning_rate": 0.00011697742885181551, + "loss": 1.072, + "step": 1300 + }, + { + "epoch": 2.9467723669309174, + "eval_loss": 1.0986168384552002, + "eval_runtime": 219.0662, + "eval_samples_per_second": 7.162, + "eval_steps_per_second": 0.899, + "step": 1300 + }, + { + "epoch": 3.061155152887882, + "grad_norm": 0.46029889583587646, + "learning_rate": 0.00011370624795551194, + "loss": 1.0936, + "step": 1350 + }, + { + "epoch": 3.17440543601359, + "grad_norm": 0.5067735314369202, + "learning_rate": 0.00011043506705920839, + "loss": 1.0304, + "step": 1400 + }, + { + "epoch": 3.17440543601359, + "eval_loss": 1.0839864015579224, + "eval_runtime": 219.2396, + "eval_samples_per_second": 7.157, + "eval_steps_per_second": 0.899, + "step": 1400 + }, + { + "epoch": 3.287655719139298, + "grad_norm": 0.46760454773902893, + "learning_rate": 0.0001071638861629048, + "loss": 1.0361, + "step": 1450 + }, + { + "epoch": 3.4009060022650055, + "grad_norm": 0.5199077129364014, + "learning_rate": 0.00010389270526660124, + "loss": 1.0304, + "step": 1500 + }, + { + "epoch": 3.4009060022650055, + "eval_loss": 1.070657730102539, + "eval_runtime": 219.1222, + "eval_samples_per_second": 7.16, + "eval_steps_per_second": 0.899, + "step": 1500 + }, + { + "epoch": 3.5141562853907136, + "grad_norm": 0.496124267578125, + "learning_rate": 0.00010062152437029768, + "loss": 1.0202, + "step": 1550 + }, + { + "epoch": 3.627406568516421, + "grad_norm": 0.5154497623443604, + "learning_rate": 9.735034347399413e-05, + "loss": 1.007, + "step": 1600 + }, + { + "epoch": 3.627406568516421, + "eval_loss": 1.0554137229919434, + "eval_runtime": 219.2648, + "eval_samples_per_second": 7.156, + "eval_steps_per_second": 0.898, + "step": 1600 + }, + { + "epoch": 3.7406568516421292, + "grad_norm": 0.4881006181240082, + "learning_rate": 9.407916257769055e-05, + "loss": 0.9934, + "step": 1650 + }, + { + "epoch": 3.853907134767837, + "grad_norm": 0.5507743954658508, + "learning_rate": 9.080798168138699e-05, + "loss": 0.9894, + "step": 1700 + }, + { + "epoch": 3.853907134767837, + "eval_loss": 1.0418345928192139, + "eval_runtime": 219.154, + "eval_samples_per_second": 7.159, + "eval_steps_per_second": 0.899, + "step": 1700 + }, + { + "epoch": 3.967157417893545, + "grad_norm": 0.5333808064460754, + "learning_rate": 8.753680078508342e-05, + "loss": 0.9795, + "step": 1750 + }, + { + "epoch": 4.081540203850509, + "grad_norm": 0.551164448261261, + "learning_rate": 8.426561988877985e-05, + "loss": 0.974, + "step": 1800 + }, + { + "epoch": 4.081540203850509, + "eval_loss": 1.0327671766281128, + "eval_runtime": 219.1392, + "eval_samples_per_second": 7.16, + "eval_steps_per_second": 0.899, + "step": 1800 + }, + { + "epoch": 4.194790486976218, + "grad_norm": 0.5678717494010925, + "learning_rate": 8.099443899247629e-05, + "loss": 0.9204, + "step": 1850 + }, + { + "epoch": 4.308040770101925, + "grad_norm": 0.5472707152366638, + "learning_rate": 7.772325809617273e-05, + "loss": 0.9341, + "step": 1900 + }, + { + "epoch": 4.308040770101925, + "eval_loss": 1.0188047885894775, + "eval_runtime": 219.1338, + "eval_samples_per_second": 7.16, + "eval_steps_per_second": 0.899, + "step": 1900 + }, + { + "epoch": 4.421291053227633, + "grad_norm": 0.5799363255500793, + "learning_rate": 7.445207719986915e-05, + "loss": 0.9313, + "step": 1950 + }, + { + "epoch": 4.534541336353341, + "grad_norm": 0.6046631336212158, + "learning_rate": 7.11808963035656e-05, + "loss": 0.9325, + "step": 2000 + }, + { + "epoch": 4.534541336353341, + "eval_loss": 1.004631519317627, + "eval_runtime": 219.1114, + "eval_samples_per_second": 7.161, + "eval_steps_per_second": 0.899, + "step": 2000 + }, + { + "epoch": 4.647791619479049, + "grad_norm": 0.5897740721702576, + "learning_rate": 6.790971540726203e-05, + "loss": 0.9213, + "step": 2050 + }, + { + "epoch": 4.761041902604757, + "grad_norm": 0.583991289138794, + "learning_rate": 6.463853451095846e-05, + "loss": 0.9039, + "step": 2100 + }, + { + "epoch": 4.761041902604757, + "eval_loss": 0.9938989877700806, + "eval_runtime": 219.0293, + "eval_samples_per_second": 7.163, + "eval_steps_per_second": 0.899, + "step": 2100 + }, + { + "epoch": 4.874292185730464, + "grad_norm": 0.6264305710792542, + "learning_rate": 6.13673536146549e-05, + "loss": 0.9028, + "step": 2150 + }, + { + "epoch": 4.987542468856172, + "grad_norm": 0.6474761962890625, + "learning_rate": 5.809617271835133e-05, + "loss": 0.9053, + "step": 2200 + }, + { + "epoch": 4.987542468856172, + "eval_loss": 0.9845430254936218, + "eval_runtime": 219.2502, + "eval_samples_per_second": 7.156, + "eval_steps_per_second": 0.899, + "step": 2200 + }, + { + "epoch": 5.101925254813137, + "grad_norm": 0.6595875024795532, + "learning_rate": 5.4824991822047765e-05, + "loss": 0.882, + "step": 2250 + }, + { + "epoch": 5.215175537938845, + "grad_norm": 0.6405232548713684, + "learning_rate": 5.15538109257442e-05, + "loss": 0.8471, + "step": 2300 + }, + { + "epoch": 5.215175537938845, + "eval_loss": 0.9782047867774963, + "eval_runtime": 219.0751, + "eval_samples_per_second": 7.162, + "eval_steps_per_second": 0.899, + "step": 2300 + }, + { + "epoch": 5.3284258210645525, + "grad_norm": 0.6547350287437439, + "learning_rate": 4.828263002944063e-05, + "loss": 0.8602, + "step": 2350 + }, + { + "epoch": 5.44167610419026, + "grad_norm": 0.7046269178390503, + "learning_rate": 4.501144913313706e-05, + "loss": 0.8404, + "step": 2400 + }, + { + "epoch": 5.44167610419026, + "eval_loss": 0.9688066244125366, + "eval_runtime": 219.0622, + "eval_samples_per_second": 7.162, + "eval_steps_per_second": 0.899, + "step": 2400 + }, + { + "epoch": 5.554926387315969, + "grad_norm": 0.6331756114959717, + "learning_rate": 4.17402682368335e-05, + "loss": 0.8286, + "step": 2450 + }, + { + "epoch": 5.668176670441676, + "grad_norm": 0.7212900519371033, + "learning_rate": 3.846908734052994e-05, + "loss": 0.8382, + "step": 2500 + }, + { + "epoch": 5.668176670441676, + "eval_loss": 0.9589976668357849, + "eval_runtime": 219.1257, + "eval_samples_per_second": 7.16, + "eval_steps_per_second": 0.899, + "step": 2500 + }, + { + "epoch": 5.781426953567384, + "grad_norm": 0.6771254539489746, + "learning_rate": 3.519790644422637e-05, + "loss": 0.8359, + "step": 2550 + }, + { + "epoch": 5.8946772366930915, + "grad_norm": 0.7171376943588257, + "learning_rate": 3.19267255479228e-05, + "loss": 0.832, + "step": 2600 + }, + { + "epoch": 5.8946772366930915, + "eval_loss": 0.9501948952674866, + "eval_runtime": 219.0768, + "eval_samples_per_second": 7.162, + "eval_steps_per_second": 0.899, + "step": 2600 + }, + { + "epoch": 6.009060022650057, + "grad_norm": 0.6734739542007446, + "learning_rate": 2.865554465161924e-05, + "loss": 0.8437, + "step": 2650 + }, + { + "epoch": 6.122310305775764, + "grad_norm": 0.697407603263855, + "learning_rate": 2.538436375531567e-05, + "loss": 0.7937, + "step": 2700 + }, + { + "epoch": 6.122310305775764, + "eval_loss": 0.9480313658714294, + "eval_runtime": 218.949, + "eval_samples_per_second": 7.166, + "eval_steps_per_second": 0.9, + "step": 2700 + }, + { + "epoch": 6.235560588901472, + "grad_norm": 0.7092292904853821, + "learning_rate": 2.2113182859012105e-05, + "loss": 0.7804, + "step": 2750 + }, + { + "epoch": 6.34881087202718, + "grad_norm": 0.7284964919090271, + "learning_rate": 1.884200196270854e-05, + "loss": 0.7861, + "step": 2800 + }, + { + "epoch": 6.34881087202718, + "eval_loss": 0.942541241645813, + "eval_runtime": 219.1707, + "eval_samples_per_second": 7.159, + "eval_steps_per_second": 0.899, + "step": 2800 + }, + { + "epoch": 6.462061155152888, + "grad_norm": 0.7725135087966919, + "learning_rate": 1.557082106640497e-05, + "loss": 0.776, + "step": 2850 + }, + { + "epoch": 6.575311438278596, + "grad_norm": 0.7266800403594971, + "learning_rate": 1.2299640170101408e-05, + "loss": 0.7812, + "step": 2900 + }, + { + "epoch": 6.575311438278596, + "eval_loss": 0.939509928226471, + "eval_runtime": 219.1206, + "eval_samples_per_second": 7.16, + "eval_steps_per_second": 0.899, + "step": 2900 + }, + { + "epoch": 6.688561721404303, + "grad_norm": 0.7308298349380493, + "learning_rate": 9.028459273797842e-06, + "loss": 0.7827, + "step": 2950 + }, + { + "epoch": 6.801812004530011, + "grad_norm": 0.7362912893295288, + "learning_rate": 5.757278377494276e-06, + "loss": 0.7917, + "step": 3000 + }, + { + "epoch": 6.801812004530011, + "eval_loss": 0.9356247782707214, + "eval_runtime": 219.052, + "eval_samples_per_second": 7.163, + "eval_steps_per_second": 0.899, + "step": 3000 + }, + { + "epoch": 6.9150622876557195, + "grad_norm": 0.7543765902519226, + "learning_rate": 2.4860974811907098e-06, + "loss": 0.7738, + "step": 3050 + }, + { + "epoch": 7.029445073612684, + "grad_norm": 0.7134389877319336, + "learning_rate": 5.986301369863014e-05, + "loss": 0.7481, + "step": 3100 + }, + { + "epoch": 7.029445073612684, + "eval_loss": 0.9381225109100342, + "eval_runtime": 218.2269, + "eval_samples_per_second": 7.19, + "eval_steps_per_second": 0.903, + "step": 3100 + }, + { + "epoch": 7.1426953567383915, + "grad_norm": 0.8131405711174011, + "learning_rate": 5.757990867579909e-05, + "loss": 0.7725, + "step": 3150 + }, + { + "epoch": 7.2559456398641, + "grad_norm": 0.8759368062019348, + "learning_rate": 5.529680365296805e-05, + "loss": 0.7752, + "step": 3200 + }, + { + "epoch": 7.2559456398641, + "eval_loss": 0.9386877417564392, + "eval_runtime": 218.4203, + "eval_samples_per_second": 7.183, + "eval_steps_per_second": 0.902, + "step": 3200 + }, + { + "epoch": 7.369195922989808, + "grad_norm": 0.8374108076095581, + "learning_rate": 5.3013698630136986e-05, + "loss": 0.7765, + "step": 3250 + }, + { + "epoch": 7.482446206115515, + "grad_norm": 0.8505973815917969, + "learning_rate": 5.0730593607305946e-05, + "loss": 0.7791, + "step": 3300 + }, + { + "epoch": 7.482446206115515, + "eval_loss": 0.9266760349273682, + "eval_runtime": 218.4773, + "eval_samples_per_second": 7.182, + "eval_steps_per_second": 0.902, + "step": 3300 + }, + { + "epoch": 7.595696489241223, + "grad_norm": 0.8420349359512329, + "learning_rate": 4.8447488584474886e-05, + "loss": 0.7721, + "step": 3350 + }, + { + "epoch": 7.7089467723669305, + "grad_norm": 0.892084002494812, + "learning_rate": 4.616438356164384e-05, + "loss": 0.7626, + "step": 3400 + }, + { + "epoch": 7.7089467723669305, + "eval_loss": 0.9153051376342773, + "eval_runtime": 218.391, + "eval_samples_per_second": 7.184, + "eval_steps_per_second": 0.902, + "step": 3400 + }, + { + "epoch": 7.822197055492639, + "grad_norm": 1.0072320699691772, + "learning_rate": 4.3881278538812785e-05, + "loss": 0.7578, + "step": 3450 + }, + { + "epoch": 7.935447338618347, + "grad_norm": 0.841740608215332, + "learning_rate": 4.159817351598174e-05, + "loss": 0.755, + "step": 3500 + }, + { + "epoch": 7.935447338618347, + "eval_loss": 0.9030627012252808, + "eval_runtime": 218.5627, + "eval_samples_per_second": 7.179, + "eval_steps_per_second": 0.901, + "step": 3500 + }, + { + "epoch": 8.049830124575312, + "grad_norm": 0.9417058825492859, + "learning_rate": 3.9315068493150684e-05, + "loss": 0.7419, + "step": 3550 + }, + { + "epoch": 8.163080407701019, + "grad_norm": 0.8208181858062744, + "learning_rate": 3.703196347031964e-05, + "loss": 0.7079, + "step": 3600 + }, + { + "epoch": 8.163080407701019, + "eval_loss": 0.9004995226860046, + "eval_runtime": 218.599, + "eval_samples_per_second": 7.178, + "eval_steps_per_second": 0.901, + "step": 3600 + }, + { + "epoch": 8.276330690826727, + "grad_norm": 0.8969956040382385, + "learning_rate": 3.4748858447488584e-05, + "loss": 0.7184, + "step": 3650 + }, + { + "epoch": 8.389580973952436, + "grad_norm": 0.9903959631919861, + "learning_rate": 3.246575342465754e-05, + "loss": 0.6977, + "step": 3700 + }, + { + "epoch": 8.389580973952436, + "eval_loss": 0.895404577255249, + "eval_runtime": 218.551, + "eval_samples_per_second": 7.179, + "eval_steps_per_second": 0.901, + "step": 3700 + }, + { + "epoch": 8.502831257078142, + "grad_norm": 0.8987964391708374, + "learning_rate": 3.0182648401826487e-05, + "loss": 0.6981, + "step": 3750 + }, + { + "epoch": 8.61608154020385, + "grad_norm": 0.9351384043693542, + "learning_rate": 2.7899543378995436e-05, + "loss": 0.6985, + "step": 3800 + }, + { + "epoch": 8.61608154020385, + "eval_loss": 0.8867019414901733, + "eval_runtime": 218.5395, + "eval_samples_per_second": 7.179, + "eval_steps_per_second": 0.901, + "step": 3800 + }, + { + "epoch": 8.729331823329558, + "grad_norm": 0.9520925283432007, + "learning_rate": 2.5616438356164386e-05, + "loss": 0.7041, + "step": 3850 + }, + { + "epoch": 8.842582106455266, + "grad_norm": 0.9150193333625793, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.6946, + "step": 3900 + }, + { + "epoch": 8.842582106455266, + "eval_loss": 0.8767301440238953, + "eval_runtime": 218.4183, + "eval_samples_per_second": 7.183, + "eval_steps_per_second": 0.902, + "step": 3900 + }, + { + "epoch": 8.955832389580975, + "grad_norm": 0.9718352556228638, + "learning_rate": 2.1050228310502286e-05, + "loss": 0.6837, + "step": 3950 + }, + { + "epoch": 9.070215175537939, + "grad_norm": 0.9025393724441528, + "learning_rate": 1.8767123287671235e-05, + "loss": 0.6821, + "step": 4000 + }, + { + "epoch": 9.070215175537939, + "eval_loss": 0.8735217452049255, + "eval_runtime": 218.5455, + "eval_samples_per_second": 7.179, + "eval_steps_per_second": 0.901, + "step": 4000 + }, + { + "epoch": 9.183465458663647, + "grad_norm": 0.9804911017417908, + "learning_rate": 1.6484018264840185e-05, + "loss": 0.6533, + "step": 4050 + }, + { + "epoch": 9.296715741789354, + "grad_norm": 0.8889093399047852, + "learning_rate": 1.4200913242009135e-05, + "loss": 0.6549, + "step": 4100 + }, + { + "epoch": 9.296715741789354, + "eval_loss": 0.8693042993545532, + "eval_runtime": 218.4928, + "eval_samples_per_second": 7.181, + "eval_steps_per_second": 0.902, + "step": 4100 + }, + { + "epoch": 9.409966024915063, + "grad_norm": 0.9306142926216125, + "learning_rate": 1.1917808219178083e-05, + "loss": 0.643, + "step": 4150 + }, + { + "epoch": 9.52321630804077, + "grad_norm": 1.0180792808532715, + "learning_rate": 9.634703196347032e-06, + "loss": 0.6498, + "step": 4200 + }, + { + "epoch": 9.52321630804077, + "eval_loss": 0.8649076223373413, + "eval_runtime": 218.6148, + "eval_samples_per_second": 7.177, + "eval_steps_per_second": 0.901, + "step": 4200 + }, + { + "epoch": 9.636466591166478, + "grad_norm": 1.038870930671692, + "learning_rate": 7.351598173515982e-06, + "loss": 0.6633, + "step": 4250 + }, + { + "epoch": 9.749716874292186, + "grad_norm": 0.9064520001411438, + "learning_rate": 5.068493150684932e-06, + "loss": 0.6503, + "step": 4300 + }, + { + "epoch": 9.749716874292186, + "eval_loss": 0.8624854683876038, + "eval_runtime": 218.607, + "eval_samples_per_second": 7.177, + "eval_steps_per_second": 0.901, + "step": 4300 + }, + { + "epoch": 9.862967157417893, + "grad_norm": 0.9673233032226562, + "learning_rate": 2.7853881278538815e-06, + "loss": 0.6439, + "step": 4350 + }, + { + "epoch": 9.976217440543602, + "grad_norm": 0.9512138366699219, + "learning_rate": 5.022831050228311e-07, + "loss": 0.6427, + "step": 4400 + }, + { + "epoch": 9.976217440543602, + "eval_loss": 0.8610817790031433, + "eval_runtime": 218.4692, + "eval_samples_per_second": 7.182, + "eval_steps_per_second": 0.902, + "step": 4400 + }, + { + "epoch": 10.090600226500566, + "grad_norm": 0.9980069994926453, + "learning_rate": 4.5028932140978435e-05, + "loss": 0.6435, + "step": 4450 + }, + { + "epoch": 10.203850509626275, + "grad_norm": 1.1042736768722534, + "learning_rate": 4.327546905137647e-05, + "loss": 0.6473, + "step": 4500 + }, + { + "epoch": 10.203850509626275, + "eval_loss": 0.8664056658744812, + "eval_runtime": 218.1832, + "eval_samples_per_second": 7.191, + "eval_steps_per_second": 0.903, + "step": 4500 + }, + { + "epoch": 10.317100792751981, + "grad_norm": 1.093027114868164, + "learning_rate": 4.1522005961774504e-05, + "loss": 0.6428, + "step": 4550 + }, + { + "epoch": 10.43035107587769, + "grad_norm": 1.1941519975662231, + "learning_rate": 3.9768542872172545e-05, + "loss": 0.6453, + "step": 4600 + }, + { + "epoch": 10.43035107587769, + "eval_loss": 0.8576545715332031, + "eval_runtime": 218.2133, + "eval_samples_per_second": 7.19, + "eval_steps_per_second": 0.903, + "step": 4600 + }, + { + "epoch": 10.543601359003397, + "grad_norm": 1.1875131130218506, + "learning_rate": 3.801507978257058e-05, + "loss": 0.6444, + "step": 4650 + }, + { + "epoch": 10.656851642129105, + "grad_norm": 1.057826042175293, + "learning_rate": 3.6261616692968614e-05, + "loss": 0.6374, + "step": 4700 + }, + { + "epoch": 10.656851642129105, + "eval_loss": 0.8471001982688904, + "eval_runtime": 218.248, + "eval_samples_per_second": 7.189, + "eval_steps_per_second": 0.903, + "step": 4700 + }, + { + "epoch": 10.770101925254814, + "grad_norm": 1.1481099128723145, + "learning_rate": 3.450815360336665e-05, + "loss": 0.6367, + "step": 4750 + }, + { + "epoch": 10.88335220838052, + "grad_norm": 1.043562412261963, + "learning_rate": 3.275469051376468e-05, + "loss": 0.6382, + "step": 4800 + }, + { + "epoch": 10.88335220838052, + "eval_loss": 0.8414534330368042, + "eval_runtime": 218.3266, + "eval_samples_per_second": 7.186, + "eval_steps_per_second": 0.902, + "step": 4800 + }, + { + "epoch": 10.996602491506229, + "grad_norm": 1.1026701927185059, + "learning_rate": 3.1001227424162724e-05, + "loss": 0.6363, + "step": 4850 + }, + { + "epoch": 11.110985277463193, + "grad_norm": 1.2548056840896606, + "learning_rate": 2.9247764334560758e-05, + "loss": 0.6197, + "step": 4900 + }, + { + "epoch": 11.110985277463193, + "eval_loss": 0.8344885110855103, + "eval_runtime": 218.346, + "eval_samples_per_second": 7.186, + "eval_steps_per_second": 0.902, + "step": 4900 + }, + { + "epoch": 11.224235560588902, + "grad_norm": 1.2327723503112793, + "learning_rate": 2.7494301244958792e-05, + "loss": 0.5955, + "step": 4950 + }, + { + "epoch": 11.337485843714608, + "grad_norm": 1.272136926651001, + "learning_rate": 2.5740838155356834e-05, + "loss": 0.5888, + "step": 5000 + }, + { + "epoch": 11.337485843714608, + "eval_loss": 0.8296782374382019, + "eval_runtime": 218.3829, + "eval_samples_per_second": 7.185, + "eval_steps_per_second": 0.902, + "step": 5000 + }, + { + "epoch": 11.450736126840317, + "grad_norm": 1.3154362440109253, + "learning_rate": 2.3987375065754868e-05, + "loss": 0.5821, + "step": 5050 + }, + { + "epoch": 11.563986409966025, + "grad_norm": 1.2641000747680664, + "learning_rate": 2.2233911976152902e-05, + "loss": 0.5786, + "step": 5100 + }, + { + "epoch": 11.563986409966025, + "eval_loss": 0.8227117657661438, + "eval_runtime": 218.3919, + "eval_samples_per_second": 7.184, + "eval_steps_per_second": 0.902, + "step": 5100 + }, + { + "epoch": 11.677236693091732, + "grad_norm": 1.308750033378601, + "learning_rate": 2.048044888655094e-05, + "loss": 0.5876, + "step": 5150 + }, + { + "epoch": 11.79048697621744, + "grad_norm": 1.2791666984558105, + "learning_rate": 1.8726985796948974e-05, + "loss": 0.5886, + "step": 5200 + }, + { + "epoch": 11.79048697621744, + "eval_loss": 0.8168981075286865, + "eval_runtime": 218.4082, + "eval_samples_per_second": 7.184, + "eval_steps_per_second": 0.902, + "step": 5200 + }, + { + "epoch": 11.90373725934315, + "grad_norm": 1.1309980154037476, + "learning_rate": 1.6973522707347012e-05, + "loss": 0.5816, + "step": 5250 + }, + { + "epoch": 12.018120045300114, + "grad_norm": 1.2232533693313599, + "learning_rate": 1.5220059617745046e-05, + "loss": 0.5993, + "step": 5300 + }, + { + "epoch": 12.018120045300114, + "eval_loss": 0.8122690916061401, + "eval_runtime": 218.3894, + "eval_samples_per_second": 7.184, + "eval_steps_per_second": 0.902, + "step": 5300 + }, + { + "epoch": 12.13137032842582, + "grad_norm": 1.1197330951690674, + "learning_rate": 1.3466596528143083e-05, + "loss": 0.552, + "step": 5350 + }, + { + "epoch": 12.244620611551529, + "grad_norm": 1.038383960723877, + "learning_rate": 1.171313343854112e-05, + "loss": 0.5461, + "step": 5400 + }, + { + "epoch": 12.244620611551529, + "eval_loss": 0.810990571975708, + "eval_runtime": 218.3359, + "eval_samples_per_second": 7.186, + "eval_steps_per_second": 0.902, + "step": 5400 + }, + { + "epoch": 12.357870894677237, + "grad_norm": 1.2155468463897705, + "learning_rate": 9.959670348939155e-06, + "loss": 0.5487, + "step": 5450 + }, + { + "epoch": 12.471121177802944, + "grad_norm": 1.0609550476074219, + "learning_rate": 8.20620725933719e-06, + "loss": 0.5524, + "step": 5500 + }, + { + "epoch": 12.471121177802944, + "eval_loss": 0.8065800070762634, + "eval_runtime": 218.3379, + "eval_samples_per_second": 7.186, + "eval_steps_per_second": 0.902, + "step": 5500 + }, + { + "epoch": 12.584371460928653, + "grad_norm": 1.1328603029251099, + "learning_rate": 6.452744169735227e-06, + "loss": 0.5437, + "step": 5550 + }, + { + "epoch": 12.69762174405436, + "grad_norm": 1.0544012784957886, + "learning_rate": 4.699281080133264e-06, + "loss": 0.5521, + "step": 5600 + }, + { + "epoch": 12.69762174405436, + "eval_loss": 0.8028028607368469, + "eval_runtime": 218.3014, + "eval_samples_per_second": 7.187, + "eval_steps_per_second": 0.902, + "step": 5600 + }, + { + "epoch": 12.810872027180068, + "grad_norm": 1.1334656476974487, + "learning_rate": 2.9458179905312994e-06, + "loss": 0.5537, + "step": 5650 + }, + { + "epoch": 12.924122310305776, + "grad_norm": 1.133388638496399, + "learning_rate": 1.1923549009293354e-06, + "loss": 0.5502, + "step": 5700 + }, + { + "epoch": 12.924122310305776, + "eval_loss": 0.801500141620636, + "eval_runtime": 218.1671, + "eval_samples_per_second": 7.192, + "eval_steps_per_second": 0.903, + "step": 5700 + }, + { + "epoch": 13.03850509626274, + "grad_norm": 1.093996524810791, + "learning_rate": 2.6302201974183753e-05, + "loss": 0.5348, + "step": 5750 + }, + { + "epoch": 13.15175537938845, + "grad_norm": 1.1750720739364624, + "learning_rate": 2.478359908883827e-05, + "loss": 0.5407, + "step": 5800 + }, + { + "epoch": 13.15175537938845, + "eval_loss": 0.8037387728691101, + "eval_runtime": 219.5954, + "eval_samples_per_second": 7.145, + "eval_steps_per_second": 0.897, + "step": 5800 + }, + { + "epoch": 13.265005662514156, + "grad_norm": 1.3370305299758911, + "learning_rate": 2.3264996203492785e-05, + "loss": 0.5427, + "step": 5850 + }, + { + "epoch": 13.378255945639864, + "grad_norm": 1.361132025718689, + "learning_rate": 2.1746393318147306e-05, + "loss": 0.5587, + "step": 5900 + }, + { + "epoch": 13.378255945639864, + "eval_loss": 0.7990919351577759, + "eval_runtime": 219.6972, + "eval_samples_per_second": 7.142, + "eval_steps_per_second": 0.897, + "step": 5900 + }, + { + "epoch": 13.491506228765571, + "grad_norm": 1.2097536325454712, + "learning_rate": 2.0227790432801824e-05, + "loss": 0.5378, + "step": 5950 + }, + { + "epoch": 13.60475651189128, + "grad_norm": 2.1065151691436768, + "learning_rate": 1.8709187547456342e-05, + "loss": 0.538, + "step": 6000 + }, + { + "epoch": 13.60475651189128, + "eval_loss": 0.7955650091171265, + "eval_runtime": 219.6995, + "eval_samples_per_second": 7.142, + "eval_steps_per_second": 0.897, + "step": 6000 + }, + { + "epoch": 13.718006795016988, + "grad_norm": 1.3054521083831787, + "learning_rate": 1.719058466211086e-05, + "loss": 0.5319, + "step": 6050 + }, + { + "epoch": 13.831257078142695, + "grad_norm": 1.2116392850875854, + "learning_rate": 1.5671981776765377e-05, + "loss": 0.5382, + "step": 6100 + }, + { + "epoch": 13.831257078142695, + "eval_loss": 0.7880010008811951, + "eval_runtime": 219.8172, + "eval_samples_per_second": 7.138, + "eval_steps_per_second": 0.896, + "step": 6100 + }, + { + "epoch": 13.944507361268403, + "grad_norm": 1.437024474143982, + "learning_rate": 1.4153378891419893e-05, + "loss": 0.5361, + "step": 6150 + }, + { + "epoch": 14.058890147225368, + "grad_norm": 1.1516680717468262, + "learning_rate": 1.2665148063781321e-05, + "loss": 0.5453, + "step": 6200 + }, + { + "epoch": 14.058890147225368, + "eval_loss": 0.783509373664856, + "eval_runtime": 219.7823, + "eval_samples_per_second": 7.139, + "eval_steps_per_second": 0.896, + "step": 6200 + }, + { + "epoch": 14.172140430351076, + "grad_norm": 1.182915210723877, + "learning_rate": 1.114654517843584e-05, + "loss": 0.5085, + "step": 6250 + }, + { + "epoch": 14.285390713476783, + "grad_norm": 1.225037693977356, + "learning_rate": 9.627942293090357e-06, + "loss": 0.5112, + "step": 6300 + }, + { + "epoch": 14.285390713476783, + "eval_loss": 0.7822558283805847, + "eval_runtime": 219.842, + "eval_samples_per_second": 7.137, + "eval_steps_per_second": 0.896, + "step": 6300 + }, + { + "epoch": 14.398640996602492, + "grad_norm": 1.1970784664154053, + "learning_rate": 8.109339407744875e-06, + "loss": 0.5079, + "step": 6350 + }, + { + "epoch": 14.5118912797282, + "grad_norm": 1.1259725093841553, + "learning_rate": 6.590736522399393e-06, + "loss": 0.5129, + "step": 6400 + }, + { + "epoch": 14.5118912797282, + "eval_loss": 0.7796412110328674, + "eval_runtime": 219.8188, + "eval_samples_per_second": 7.138, + "eval_steps_per_second": 0.896, + "step": 6400 + }, + { + "epoch": 14.625141562853907, + "grad_norm": 1.236473798751831, + "learning_rate": 5.072133637053911e-06, + "loss": 0.5055, + "step": 6450 + }, + { + "epoch": 14.738391845979615, + "grad_norm": 1.1682021617889404, + "learning_rate": 3.553530751708428e-06, + "loss": 0.5074, + "step": 6500 + }, + { + "epoch": 14.738391845979615, + "eval_loss": 0.7759413719177246, + "eval_runtime": 219.6677, + "eval_samples_per_second": 7.143, + "eval_steps_per_second": 0.897, + "step": 6500 + }, + { + "epoch": 14.851642129105322, + "grad_norm": 1.190508484840393, + "learning_rate": 2.0349278663629463e-06, + "loss": 0.5103, + "step": 6550 + }, + { + "epoch": 14.96489241223103, + "grad_norm": 1.18021559715271, + "learning_rate": 5.163249810174639e-07, + "loss": 0.5011, + "step": 6600 + }, + { + "epoch": 14.96489241223103, + "eval_loss": 0.7753218412399292, + "eval_runtime": 219.7467, + "eval_samples_per_second": 7.14, + "eval_steps_per_second": 0.896, + "step": 6600 + }, + { + "epoch": 15.079275198187995, + "grad_norm": 1.3528636693954468, + "learning_rate": 3.262518968133536e-05, + "loss": 0.5176, + "step": 6650 + }, + { + "epoch": 15.192525481313703, + "grad_norm": 1.3476513624191284, + "learning_rate": 3.1360647445624685e-05, + "loss": 0.5032, + "step": 6700 + }, + { + "epoch": 15.192525481313703, + "eval_loss": 0.7792025804519653, + "eval_runtime": 219.73, + "eval_samples_per_second": 7.141, + "eval_steps_per_second": 0.897, + "step": 6700 + }, + { + "epoch": 15.305775764439412, + "grad_norm": 1.3222737312316895, + "learning_rate": 3.009610520991401e-05, + "loss": 0.5141, + "step": 6750 + }, + { + "epoch": 15.419026047565119, + "grad_norm": 1.3413212299346924, + "learning_rate": 2.883156297420334e-05, + "loss": 0.5071, + "step": 6800 + }, + { + "epoch": 15.419026047565119, + "eval_loss": 0.7755314707756042, + "eval_runtime": 219.6961, + "eval_samples_per_second": 7.142, + "eval_steps_per_second": 0.897, + "step": 6800 + }, + { + "epoch": 15.532276330690827, + "grad_norm": 1.4169390201568604, + "learning_rate": 2.7567020738492665e-05, + "loss": 0.5066, + "step": 6850 + }, + { + "epoch": 15.645526613816534, + "grad_norm": 1.499665379524231, + "learning_rate": 2.6302478502781997e-05, + "loss": 0.5098, + "step": 6900 + }, + { + "epoch": 15.645526613816534, + "eval_loss": 0.7675374150276184, + "eval_runtime": 219.7136, + "eval_samples_per_second": 7.141, + "eval_steps_per_second": 0.897, + "step": 6900 + }, + { + "epoch": 15.758776896942242, + "grad_norm": 1.408177137374878, + "learning_rate": 2.5037936267071323e-05, + "loss": 0.5074, + "step": 6950 + }, + { + "epoch": 15.87202718006795, + "grad_norm": 1.5971038341522217, + "learning_rate": 2.3773394031360648e-05, + "loss": 0.4994, + "step": 7000 + }, + { + "epoch": 15.87202718006795, + "eval_loss": 0.7616310715675354, + "eval_runtime": 219.6635, + "eval_samples_per_second": 7.143, + "eval_steps_per_second": 0.897, + "step": 7000 + }, + { + "epoch": 15.985277463193658, + "grad_norm": 1.4312022924423218, + "learning_rate": 2.2508851795649977e-05, + "loss": 0.5081, + "step": 7050 + }, + { + "epoch": 16.099660249150624, + "grad_norm": 1.4189964532852173, + "learning_rate": 2.1244309559939302e-05, + "loss": 0.4831, + "step": 7100 + }, + { + "epoch": 16.099660249150624, + "eval_loss": 0.758693277835846, + "eval_runtime": 219.6153, + "eval_samples_per_second": 7.144, + "eval_steps_per_second": 0.897, + "step": 7100 + }, + { + "epoch": 16.212910532276332, + "grad_norm": 1.429587960243225, + "learning_rate": 1.9979767324228628e-05, + "loss": 0.4677, + "step": 7150 + }, + { + "epoch": 16.326160815402037, + "grad_norm": 1.5730829238891602, + "learning_rate": 1.8715225088517957e-05, + "loss": 0.4744, + "step": 7200 + }, + { + "epoch": 16.326160815402037, + "eval_loss": 0.7522332668304443, + "eval_runtime": 219.6797, + "eval_samples_per_second": 7.142, + "eval_steps_per_second": 0.897, + "step": 7200 + }, + { + "epoch": 16.439411098527746, + "grad_norm": 1.3818005323410034, + "learning_rate": 1.7450682852807286e-05, + "loss": 0.4821, + "step": 7250 + }, + { + "epoch": 16.552661381653454, + "grad_norm": 1.3803259134292603, + "learning_rate": 1.618614061709661e-05, + "loss": 0.4839, + "step": 7300 + }, + { + "epoch": 16.552661381653454, + "eval_loss": 0.7453923225402832, + "eval_runtime": 219.5972, + "eval_samples_per_second": 7.145, + "eval_steps_per_second": 0.897, + "step": 7300 + }, + { + "epoch": 16.665911664779163, + "grad_norm": 1.4981536865234375, + "learning_rate": 1.492159838138594e-05, + "loss": 0.468, + "step": 7350 + }, + { + "epoch": 16.77916194790487, + "grad_norm": 1.3549158573150635, + "learning_rate": 1.3657056145675265e-05, + "loss": 0.461, + "step": 7400 + }, + { + "epoch": 16.77916194790487, + "eval_loss": 0.7414634823799133, + "eval_runtime": 219.6011, + "eval_samples_per_second": 7.145, + "eval_steps_per_second": 0.897, + "step": 7400 + }, + { + "epoch": 16.892412231030576, + "grad_norm": 1.4302562475204468, + "learning_rate": 1.2392513909964594e-05, + "loss": 0.477, + "step": 7450 + }, + { + "epoch": 17.006795016987542, + "grad_norm": 1.2383838891983032, + "learning_rate": 1.112797167425392e-05, + "loss": 0.4667, + "step": 7500 + }, + { + "epoch": 17.006795016987542, + "eval_loss": 0.7361006140708923, + "eval_runtime": 219.5894, + "eval_samples_per_second": 7.145, + "eval_steps_per_second": 0.897, + "step": 7500 + }, + { + "epoch": 17.12004530011325, + "grad_norm": 1.2482600212097168, + "learning_rate": 9.863429438543249e-06, + "loss": 0.4463, + "step": 7550 + }, + { + "epoch": 17.23329558323896, + "grad_norm": 1.264907956123352, + "learning_rate": 8.598887202832576e-06, + "loss": 0.4479, + "step": 7600 + }, + { + "epoch": 17.23329558323896, + "eval_loss": 0.7333863973617554, + "eval_runtime": 219.7057, + "eval_samples_per_second": 7.141, + "eval_steps_per_second": 0.897, + "step": 7600 + }, + { + "epoch": 17.346545866364664, + "grad_norm": 1.2122907638549805, + "learning_rate": 7.334344967121902e-06, + "loss": 0.4535, + "step": 7650 + }, + { + "epoch": 17.459796149490373, + "grad_norm": 1.176712989807129, + "learning_rate": 6.06980273141123e-06, + "loss": 0.4404, + "step": 7700 + }, + { + "epoch": 17.459796149490373, + "eval_loss": 0.7308885455131531, + "eval_runtime": 219.6198, + "eval_samples_per_second": 7.144, + "eval_steps_per_second": 0.897, + "step": 7700 + }, + { + "epoch": 17.57304643261608, + "grad_norm": 1.264377474784851, + "learning_rate": 4.805260495700556e-06, + "loss": 0.4425, + "step": 7750 + }, + { + "epoch": 17.68629671574179, + "grad_norm": 1.3030773401260376, + "learning_rate": 3.5407182599898835e-06, + "loss": 0.4393, + "step": 7800 + }, + { + "epoch": 17.68629671574179, + "eval_loss": 0.7286545634269714, + "eval_runtime": 219.6474, + "eval_samples_per_second": 7.143, + "eval_steps_per_second": 0.897, + "step": 7800 + }, + { + "epoch": 17.7995469988675, + "grad_norm": 1.362890601158142, + "learning_rate": 2.276176024279211e-06, + "loss": 0.4425, + "step": 7850 + }, + { + "epoch": 17.912797281993203, + "grad_norm": 1.2483875751495361, + "learning_rate": 1.0116337885685382e-06, + "loss": 0.4434, + "step": 7900 + }, + { + "epoch": 17.912797281993203, + "eval_loss": 0.7274926900863647, + "eval_runtime": 219.7356, + "eval_samples_per_second": 7.14, + "eval_steps_per_second": 0.897, + "step": 7900 + }, + { + "epoch": 18.02718006795017, + "grad_norm": 1.2370803356170654, + "learning_rate": 1.9840728100113766e-05, + "loss": 0.4474, + "step": 7950 + }, + { + "epoch": 18.140430351075878, + "grad_norm": 1.454135537147522, + "learning_rate": 1.8703071672354948e-05, + "loss": 0.4294, + "step": 8000 + }, + { + "epoch": 18.140430351075878, + "eval_loss": 0.7317793965339661, + "eval_runtime": 216.9455, + "eval_samples_per_second": 7.232, + "eval_steps_per_second": 0.908, + "step": 8000 + }, + { + "epoch": 18.253680634201586, + "grad_norm": 1.4219353199005127, + "learning_rate": 1.7565415244596133e-05, + "loss": 0.4362, + "step": 8050 + }, + { + "epoch": 18.366930917327295, + "grad_norm": 1.4157588481903076, + "learning_rate": 1.6427758816837314e-05, + "loss": 0.4369, + "step": 8100 + }, + { + "epoch": 18.366930917327295, + "eval_loss": 0.7285795211791992, + "eval_runtime": 217.2426, + "eval_samples_per_second": 7.222, + "eval_steps_per_second": 0.907, + "step": 8100 + }, + { + "epoch": 18.480181200453, + "grad_norm": 1.5712941884994507, + "learning_rate": 1.52901023890785e-05, + "loss": 0.4393, + "step": 8150 + }, + { + "epoch": 18.59343148357871, + "grad_norm": 1.3149316310882568, + "learning_rate": 1.4152445961319682e-05, + "loss": 0.4377, + "step": 8200 + }, + { + "epoch": 18.59343148357871, + "eval_loss": 0.7221394181251526, + "eval_runtime": 217.271, + "eval_samples_per_second": 7.221, + "eval_steps_per_second": 0.907, + "step": 8200 + }, + { + "epoch": 18.706681766704417, + "grad_norm": 1.4053345918655396, + "learning_rate": 1.3014789533560864e-05, + "loss": 0.4395, + "step": 8250 + }, + { + "epoch": 18.819932049830125, + "grad_norm": 1.4755219221115112, + "learning_rate": 1.1877133105802047e-05, + "loss": 0.4464, + "step": 8300 + }, + { + "epoch": 18.819932049830125, + "eval_loss": 0.7166544795036316, + "eval_runtime": 217.3739, + "eval_samples_per_second": 7.218, + "eval_steps_per_second": 0.906, + "step": 8300 + }, + { + "epoch": 18.933182332955834, + "grad_norm": 1.3762329816818237, + "learning_rate": 1.073947667804323e-05, + "loss": 0.4482, + "step": 8350 + }, + { + "epoch": 19.047565118912797, + "grad_norm": 1.1810795068740845, + "learning_rate": 9.62457337883959e-06, + "loss": 0.4264, + "step": 8400 + }, + { + "epoch": 19.047565118912797, + "eval_loss": 0.7138365507125854, + "eval_runtime": 217.4069, + "eval_samples_per_second": 7.217, + "eval_steps_per_second": 0.906, + "step": 8400 + }, + { + "epoch": 19.160815402038505, + "grad_norm": 1.51250422000885, + "learning_rate": 8.486916951080774e-06, + "loss": 0.4197, + "step": 8450 + }, + { + "epoch": 19.274065685164214, + "grad_norm": 1.3608779907226562, + "learning_rate": 7.349260523321957e-06, + "loss": 0.4149, + "step": 8500 + }, + { + "epoch": 19.274065685164214, + "eval_loss": 0.7112516760826111, + "eval_runtime": 217.3073, + "eval_samples_per_second": 7.22, + "eval_steps_per_second": 0.907, + "step": 8500 + }, + { + "epoch": 19.387315968289922, + "grad_norm": 1.3459504842758179, + "learning_rate": 6.21160409556314e-06, + "loss": 0.4151, + "step": 8550 + }, + { + "epoch": 19.500566251415627, + "grad_norm": 1.270430326461792, + "learning_rate": 5.073947667804323e-06, + "loss": 0.4107, + "step": 8600 + }, + { + "epoch": 19.500566251415627, + "eval_loss": 0.7087224721908569, + "eval_runtime": 217.4347, + "eval_samples_per_second": 7.216, + "eval_steps_per_second": 0.906, + "step": 8600 + }, + { + "epoch": 19.613816534541336, + "grad_norm": 1.147330641746521, + "learning_rate": 3.936291240045506e-06, + "loss": 0.4204, + "step": 8650 + }, + { + "epoch": 19.727066817667044, + "grad_norm": 1.3679783344268799, + "learning_rate": 2.8213879408418657e-06, + "loss": 0.4241, + "step": 8700 + }, + { + "epoch": 19.727066817667044, + "eval_loss": 0.705489456653595, + "eval_runtime": 217.5656, + "eval_samples_per_second": 7.212, + "eval_steps_per_second": 0.905, + "step": 8700 + }, + { + "epoch": 19.840317100792753, + "grad_norm": 1.2595313787460327, + "learning_rate": 1.6837315130830492e-06, + "loss": 0.4157, + "step": 8750 + }, + { + "epoch": 19.95356738391846, + "grad_norm": 1.3279147148132324, + "learning_rate": 5.460750853242321e-07, + "loss": 0.4127, + "step": 8800 + }, + { + "epoch": 19.95356738391846, + "eval_loss": 0.7047748565673828, + "eval_runtime": 217.3024, + "eval_samples_per_second": 7.22, + "eval_steps_per_second": 0.907, + "step": 8800 + }, + { + "epoch": 20.067950169875424, + "grad_norm": 1.4412195682525635, + "learning_rate": 2.5650153268070802e-05, + "loss": 0.4066, + "step": 8850 + }, + { + "epoch": 20.181200453001132, + "grad_norm": 1.591495156288147, + "learning_rate": 2.466132700484525e-05, + "loss": 0.4107, + "step": 8900 + }, + { + "epoch": 20.181200453001132, + "eval_loss": 0.7136498093605042, + "eval_runtime": 217.2853, + "eval_samples_per_second": 7.221, + "eval_steps_per_second": 0.907, + "step": 8900 + }, + { + "epoch": 20.29445073612684, + "grad_norm": 1.5843544006347656, + "learning_rate": 2.3672500741619698e-05, + "loss": 0.4249, + "step": 8950 + }, + { + "epoch": 20.40770101925255, + "grad_norm": 1.7842884063720703, + "learning_rate": 2.268367447839415e-05, + "loss": 0.4292, + "step": 9000 + }, + { + "epoch": 20.40770101925255, + "eval_loss": 0.7056994438171387, + "eval_runtime": 217.247, + "eval_samples_per_second": 7.222, + "eval_steps_per_second": 0.907, + "step": 9000 + }, + { + "epoch": 20.520951302378258, + "grad_norm": 1.8154791593551636, + "learning_rate": 2.1694848215168594e-05, + "loss": 0.4171, + "step": 9050 + }, + { + "epoch": 20.634201585503963, + "grad_norm": 1.810947060585022, + "learning_rate": 2.0706021951943045e-05, + "loss": 0.4254, + "step": 9100 + }, + { + "epoch": 20.634201585503963, + "eval_loss": 0.7009139060974121, + "eval_runtime": 217.3713, + "eval_samples_per_second": 7.218, + "eval_steps_per_second": 0.906, + "step": 9100 + }, + { + "epoch": 20.74745186862967, + "grad_norm": 1.6273292303085327, + "learning_rate": 1.9717195688717493e-05, + "loss": 0.4182, + "step": 9150 + }, + { + "epoch": 20.86070215175538, + "grad_norm": 1.865356206893921, + "learning_rate": 1.8728369425491945e-05, + "loss": 0.4143, + "step": 9200 + }, + { + "epoch": 20.86070215175538, + "eval_loss": 0.6963376402854919, + "eval_runtime": 217.269, + "eval_samples_per_second": 7.221, + "eval_steps_per_second": 0.907, + "step": 9200 + }, + { + "epoch": 20.973952434881088, + "grad_norm": 2.093496799468994, + "learning_rate": 1.773954316226639e-05, + "loss": 0.4116, + "step": 9250 + }, + { + "epoch": 21.08833522083805, + "grad_norm": 1.6501940488815308, + "learning_rate": 1.6750716899040837e-05, + "loss": 0.4071, + "step": 9300 + }, + { + "epoch": 21.08833522083805, + "eval_loss": 0.6935945153236389, + "eval_runtime": 217.1749, + "eval_samples_per_second": 7.225, + "eval_steps_per_second": 0.907, + "step": 9300 + }, + { + "epoch": 21.20158550396376, + "grad_norm": 1.4257782697677612, + "learning_rate": 1.576189063581529e-05, + "loss": 0.3964, + "step": 9350 + }, + { + "epoch": 21.314835787089468, + "grad_norm": 1.7246989011764526, + "learning_rate": 1.4773064372589737e-05, + "loss": 0.3856, + "step": 9400 + }, + { + "epoch": 21.314835787089468, + "eval_loss": 0.6908048391342163, + "eval_runtime": 217.1554, + "eval_samples_per_second": 7.225, + "eval_steps_per_second": 0.907, + "step": 9400 + }, + { + "epoch": 21.428086070215176, + "grad_norm": 1.5051772594451904, + "learning_rate": 1.3784238109364186e-05, + "loss": 0.3903, + "step": 9450 + }, + { + "epoch": 21.541336353340885, + "grad_norm": 1.4209738969802856, + "learning_rate": 1.2795411846138633e-05, + "loss": 0.3992, + "step": 9500 + }, + { + "epoch": 21.541336353340885, + "eval_loss": 0.6845880746841431, + "eval_runtime": 217.0726, + "eval_samples_per_second": 7.228, + "eval_steps_per_second": 0.908, + "step": 9500 + }, + { + "epoch": 21.65458663646659, + "grad_norm": 1.4793322086334229, + "learning_rate": 1.1806585582913082e-05, + "loss": 0.392, + "step": 9550 + }, + { + "epoch": 21.7678369195923, + "grad_norm": 1.5042359828948975, + "learning_rate": 1.0817759319687532e-05, + "loss": 0.3833, + "step": 9600 + }, + { + "epoch": 21.7678369195923, + "eval_loss": 0.6798712611198425, + "eval_runtime": 217.2033, + "eval_samples_per_second": 7.224, + "eval_steps_per_second": 0.907, + "step": 9600 + }, + { + "epoch": 21.881087202718007, + "grad_norm": 1.4992612600326538, + "learning_rate": 9.82893305646198e-06, + "loss": 0.3912, + "step": 9650 + }, + { + "epoch": 21.994337485843715, + "grad_norm": 1.4592713117599487, + "learning_rate": 8.840106793236428e-06, + "loss": 0.3931, + "step": 9700 + }, + { + "epoch": 21.994337485843715, + "eval_loss": 0.6735964417457581, + "eval_runtime": 217.2254, + "eval_samples_per_second": 7.223, + "eval_steps_per_second": 0.907, + "step": 9700 + }, + { + "epoch": 22.108720271800678, + "grad_norm": 1.3605159521102905, + "learning_rate": 7.851280530010878e-06, + "loss": 0.378, + "step": 9750 + }, + { + "epoch": 22.221970554926386, + "grad_norm": 1.4335530996322632, + "learning_rate": 6.862454266785326e-06, + "loss": 0.379, + "step": 9800 + }, + { + "epoch": 22.221970554926386, + "eval_loss": 0.6728909015655518, + "eval_runtime": 217.1793, + "eval_samples_per_second": 7.224, + "eval_steps_per_second": 0.907, + "step": 9800 + }, + { + "epoch": 22.335220838052095, + "grad_norm": 1.2988905906677246, + "learning_rate": 5.873628003559775e-06, + "loss": 0.371, + "step": 9850 + }, + { + "epoch": 22.448471121177803, + "grad_norm": 1.407586693763733, + "learning_rate": 4.884801740334224e-06, + "loss": 0.3719, + "step": 9900 + }, + { + "epoch": 22.448471121177803, + "eval_loss": 0.670095682144165, + "eval_runtime": 217.2529, + "eval_samples_per_second": 7.222, + "eval_steps_per_second": 0.907, + "step": 9900 + }, + { + "epoch": 22.561721404303512, + "grad_norm": 1.5192447900772095, + "learning_rate": 1.9627103228740338e-05, + "loss": 0.3763, + "step": 9950 + }, + { + "epoch": 22.674971687429217, + "grad_norm": 1.6283540725708008, + "learning_rate": 1.8717598908594817e-05, + "loss": 0.3834, + "step": 10000 + }, + { + "epoch": 22.674971687429217, + "eval_loss": 0.6722336411476135, + "eval_runtime": 219.0452, + "eval_samples_per_second": 7.163, + "eval_steps_per_second": 0.899, + "step": 10000 + }, + { + "epoch": 22.788221970554925, + "grad_norm": 1.8841089010238647, + "learning_rate": 1.7808094588449296e-05, + "loss": 0.3766, + "step": 10050 + }, + { + "epoch": 22.901472253680634, + "grad_norm": 1.6647872924804688, + "learning_rate": 1.6898590268303775e-05, + "loss": 0.379, + "step": 10100 + }, + { + "epoch": 22.901472253680634, + "eval_loss": 0.6667923331260681, + "eval_runtime": 219.0896, + "eval_samples_per_second": 7.161, + "eval_steps_per_second": 0.899, + "step": 10100 + }, + { + "epoch": 23.0158550396376, + "grad_norm": 1.56221604347229, + "learning_rate": 1.5989085948158254e-05, + "loss": 0.3724, + "step": 10150 + }, + { + "epoch": 23.12910532276331, + "grad_norm": 1.741861343383789, + "learning_rate": 1.5079581628012735e-05, + "loss": 0.3648, + "step": 10200 + }, + { + "epoch": 23.12910532276331, + "eval_loss": 0.6666680574417114, + "eval_runtime": 219.1233, + "eval_samples_per_second": 7.16, + "eval_steps_per_second": 0.899, + "step": 10200 + }, + { + "epoch": 23.242355605889014, + "grad_norm": 1.4197698831558228, + "learning_rate": 1.4170077307867214e-05, + "loss": 0.3622, + "step": 10250 + }, + { + "epoch": 23.355605889014722, + "grad_norm": 1.5689094066619873, + "learning_rate": 1.3260572987721692e-05, + "loss": 0.3633, + "step": 10300 + }, + { + "epoch": 23.355605889014722, + "eval_loss": 0.6614246368408203, + "eval_runtime": 219.1161, + "eval_samples_per_second": 7.161, + "eval_steps_per_second": 0.899, + "step": 10300 + }, + { + "epoch": 23.46885617214043, + "grad_norm": 1.73819899559021, + "learning_rate": 1.2351068667576171e-05, + "loss": 0.3665, + "step": 10350 + }, + { + "epoch": 23.58210645526614, + "grad_norm": 1.470841884613037, + "learning_rate": 1.1441564347430652e-05, + "loss": 0.3594, + "step": 10400 + }, + { + "epoch": 23.58210645526614, + "eval_loss": 0.6564630270004272, + "eval_runtime": 219.1222, + "eval_samples_per_second": 7.16, + "eval_steps_per_second": 0.899, + "step": 10400 + }, + { + "epoch": 23.695356738391848, + "grad_norm": 1.4712560176849365, + "learning_rate": 1.0532060027285131e-05, + "loss": 0.3567, + "step": 10450 + }, + { + "epoch": 23.808607021517552, + "grad_norm": 1.3822436332702637, + "learning_rate": 9.622555707139608e-06, + "loss": 0.3655, + "step": 10500 + }, + { + "epoch": 23.808607021517552, + "eval_loss": 0.6519103050231934, + "eval_runtime": 219.1979, + "eval_samples_per_second": 7.158, + "eval_steps_per_second": 0.899, + "step": 10500 + }, + { + "epoch": 23.92185730464326, + "grad_norm": 1.49004065990448, + "learning_rate": 8.713051386994087e-06, + "loss": 0.356, + "step": 10550 + }, + { + "epoch": 24.036240090600227, + "grad_norm": 1.3333971500396729, + "learning_rate": 7.803547066848568e-06, + "loss": 0.3609, + "step": 10600 + }, + { + "epoch": 24.036240090600227, + "eval_loss": 0.6471272706985474, + "eval_runtime": 219.2457, + "eval_samples_per_second": 7.156, + "eval_steps_per_second": 0.899, + "step": 10600 + }, + { + "epoch": 24.149490373725936, + "grad_norm": 1.3648090362548828, + "learning_rate": 6.894042746703047e-06, + "loss": 0.3445, + "step": 10650 + }, + { + "epoch": 24.26274065685164, + "grad_norm": 1.2211579084396362, + "learning_rate": 5.984538426557527e-06, + "loss": 0.3438, + "step": 10700 + }, + { + "epoch": 24.26274065685164, + "eval_loss": 0.6461014151573181, + "eval_runtime": 219.1852, + "eval_samples_per_second": 7.158, + "eval_steps_per_second": 0.899, + "step": 10700 + }, + { + "epoch": 24.37599093997735, + "grad_norm": 1.332571029663086, + "learning_rate": 5.075034106412006e-06, + "loss": 0.3378, + "step": 10750 + }, + { + "epoch": 24.489241223103058, + "grad_norm": 1.263708233833313, + "learning_rate": 4.1655297862664855e-06, + "loss": 0.3457, + "step": 10800 + }, + { + "epoch": 24.489241223103058, + "eval_loss": 0.6429575681686401, + "eval_runtime": 219.1956, + "eval_samples_per_second": 7.158, + "eval_steps_per_second": 0.899, + "step": 10800 + }, + { + "epoch": 24.602491506228766, + "grad_norm": 1.2414239645004272, + "learning_rate": 3.256025466120964e-06, + "loss": 0.3478, + "step": 10850 + }, + { + "epoch": 24.715741789354475, + "grad_norm": 1.183813214302063, + "learning_rate": 2.3465211459754434e-06, + "loss": 0.3413, + "step": 10900 + }, + { + "epoch": 24.715741789354475, + "eval_loss": 0.6409078240394592, + "eval_runtime": 219.161, + "eval_samples_per_second": 7.159, + "eval_steps_per_second": 0.899, + "step": 10900 + }, + { + "epoch": 24.82899207248018, + "grad_norm": 1.3728307485580444, + "learning_rate": 1.4370168258299228e-06, + "loss": 0.3453, + "step": 10950 + }, + { + "epoch": 24.942242355605888, + "grad_norm": 1.182039499282837, + "learning_rate": 5.275125056844021e-07, + "loss": 0.3439, + "step": 11000 + }, + { + "epoch": 24.942242355605888, + "eval_loss": 0.6399772763252258, + "eval_runtime": 219.2001, + "eval_samples_per_second": 7.158, + "eval_steps_per_second": 0.899, + "step": 11000 + }, + { + "epoch": 25.056625141562854, + "grad_norm": 1.50559401512146, + "learning_rate": 1.4498610760293005e-05, + "loss": 0.3502, + "step": 11050 + }, + { + "epoch": 25.169875424688563, + "grad_norm": 1.501145839691162, + "learning_rate": 1.3656647301507115e-05, + "loss": 0.3373, + "step": 11100 + }, + { + "epoch": 25.169875424688563, + "eval_loss": 0.6438981294631958, + "eval_runtime": 219.2088, + "eval_samples_per_second": 7.158, + "eval_steps_per_second": 0.899, + "step": 11100 + }, + { + "epoch": 25.28312570781427, + "grad_norm": 2.4662117958068848, + "learning_rate": 1.2814683842721226e-05, + "loss": 0.3419, + "step": 11150 + }, + { + "epoch": 25.396375990939976, + "grad_norm": 1.5162239074707031, + "learning_rate": 1.1972720383935337e-05, + "loss": 0.3452, + "step": 11200 + }, + { + "epoch": 25.396375990939976, + "eval_loss": 0.6388878226280212, + "eval_runtime": 219.2066, + "eval_samples_per_second": 7.158, + "eval_steps_per_second": 0.899, + "step": 11200 + }, + { + "epoch": 25.509626274065685, + "grad_norm": 1.315088152885437, + "learning_rate": 1.113075692514945e-05, + "loss": 0.3487, + "step": 11250 + }, + { + "epoch": 25.622876557191393, + "grad_norm": 1.4352425336837769, + "learning_rate": 1.028879346636356e-05, + "loss": 0.3386, + "step": 11300 + }, + { + "epoch": 25.622876557191393, + "eval_loss": 0.6349427700042725, + "eval_runtime": 219.2727, + "eval_samples_per_second": 7.155, + "eval_steps_per_second": 0.898, + "step": 11300 + }, + { + "epoch": 25.7361268403171, + "grad_norm": 1.433242678642273, + "learning_rate": 9.446830007577671e-06, + "loss": 0.3365, + "step": 11350 + }, + { + "epoch": 25.84937712344281, + "grad_norm": 1.343719720840454, + "learning_rate": 8.604866548791782e-06, + "loss": 0.3409, + "step": 11400 + }, + { + "epoch": 25.84937712344281, + "eval_loss": 0.631538987159729, + "eval_runtime": 219.1534, + "eval_samples_per_second": 7.159, + "eval_steps_per_second": 0.899, + "step": 11400 + }, + { + "epoch": 25.962627406568515, + "grad_norm": 1.496169090270996, + "learning_rate": 7.762903090005893e-06, + "loss": 0.3316, + "step": 11450 + }, + { + "epoch": 26.07701019252548, + "grad_norm": 1.5395649671554565, + "learning_rate": 6.920939631220005e-06, + "loss": 0.3429, + "step": 11500 + }, + { + "epoch": 26.07701019252548, + "eval_loss": 0.6306207180023193, + "eval_runtime": 219.1057, + "eval_samples_per_second": 7.161, + "eval_steps_per_second": 0.899, + "step": 11500 + }, + { + "epoch": 26.19026047565119, + "grad_norm": 1.298531413078308, + "learning_rate": 6.078976172434116e-06, + "loss": 0.3274, + "step": 11550 + }, + { + "epoch": 26.3035107587769, + "grad_norm": 1.3206506967544556, + "learning_rate": 5.237012713648228e-06, + "loss": 0.3281, + "step": 11600 + }, + { + "epoch": 26.3035107587769, + "eval_loss": 0.6274815797805786, + "eval_runtime": 219.1686, + "eval_samples_per_second": 7.159, + "eval_steps_per_second": 0.899, + "step": 11600 + }, + { + "epoch": 26.416761041902603, + "grad_norm": 1.3031998872756958, + "learning_rate": 4.395049254862339e-06, + "loss": 0.3186, + "step": 11650 + }, + { + "epoch": 26.530011325028312, + "grad_norm": 1.232765555381775, + "learning_rate": 3.5530857960764503e-06, + "loss": 0.324, + "step": 11700 + }, + { + "epoch": 26.530011325028312, + "eval_loss": 0.6246664524078369, + "eval_runtime": 219.2008, + "eval_samples_per_second": 7.158, + "eval_steps_per_second": 0.899, + "step": 11700 + }, + { + "epoch": 26.64326160815402, + "grad_norm": 1.3108420372009277, + "learning_rate": 2.7111223372905617e-06, + "loss": 0.3265, + "step": 11750 + }, + { + "epoch": 26.75651189127973, + "grad_norm": 1.2938895225524902, + "learning_rate": 1.8691588785046728e-06, + "loss": 0.325, + "step": 11800 + }, + { + "epoch": 26.75651189127973, + "eval_loss": 0.6232734322547913, + "eval_runtime": 219.1916, + "eval_samples_per_second": 7.158, + "eval_steps_per_second": 0.899, + "step": 11800 + }, + { + "epoch": 26.869762174405437, + "grad_norm": 1.4028679132461548, + "learning_rate": 1.0271954197187842e-06, + "loss": 0.3177, + "step": 11850 + }, + { + "epoch": 26.983012457531142, + "grad_norm": 1.1903717517852783, + "learning_rate": 1.8523196093289553e-07, + "loss": 0.3282, + "step": 11900 + }, + { + "epoch": 26.983012457531142, + "eval_loss": 0.6224809885025024, + "eval_runtime": 219.1746, + "eval_samples_per_second": 7.159, + "eval_steps_per_second": 0.899, + "step": 11900 + }, + { + "epoch": 27.09739524348811, + "grad_norm": 1.5629881620407104, + "learning_rate": 1.9454545454545457e-05, + "loss": 0.322, + "step": 11950 + }, + { + "epoch": 27.210645526613817, + "grad_norm": 1.8830535411834717, + "learning_rate": 1.86969696969697e-05, + "loss": 0.3272, + "step": 12000 + }, + { + "epoch": 27.210645526613817, + "eval_loss": 0.6294634342193604, + "eval_runtime": 216.3539, + "eval_samples_per_second": 7.252, + "eval_steps_per_second": 0.911, + "step": 12000 + }, + { + "epoch": 27.323895809739525, + "grad_norm": 2.302112340927124, + "learning_rate": 1.793939393939394e-05, + "loss": 0.3343, + "step": 12050 + }, + { + "epoch": 27.43714609286523, + "grad_norm": 1.6443369388580322, + "learning_rate": 1.718181818181818e-05, + "loss": 0.3331, + "step": 12100 + }, + { + "epoch": 27.43714609286523, + "eval_loss": 0.6251102685928345, + "eval_runtime": 216.4341, + "eval_samples_per_second": 7.249, + "eval_steps_per_second": 0.91, + "step": 12100 + }, + { + "epoch": 27.55039637599094, + "grad_norm": 1.6903585195541382, + "learning_rate": 1.6424242424242424e-05, + "loss": 0.3338, + "step": 12150 + }, + { + "epoch": 27.663646659116647, + "grad_norm": 1.6333993673324585, + "learning_rate": 1.5666666666666667e-05, + "loss": 0.3293, + "step": 12200 + }, + { + "epoch": 27.663646659116647, + "eval_loss": 0.6229289174079895, + "eval_runtime": 216.5229, + "eval_samples_per_second": 7.246, + "eval_steps_per_second": 0.91, + "step": 12200 + }, + { + "epoch": 27.776896942242356, + "grad_norm": 1.7001616954803467, + "learning_rate": 1.4909090909090908e-05, + "loss": 0.3245, + "step": 12250 + }, + { + "epoch": 27.890147225368064, + "grad_norm": 1.919396162033081, + "learning_rate": 1.4151515151515152e-05, + "loss": 0.3284, + "step": 12300 + }, + { + "epoch": 27.890147225368064, + "eval_loss": 0.6198203563690186, + "eval_runtime": 216.5445, + "eval_samples_per_second": 7.246, + "eval_steps_per_second": 0.91, + "step": 12300 + }, + { + "epoch": 28.004530011325027, + "grad_norm": 2.137244462966919, + "learning_rate": 1.3393939393939395e-05, + "loss": 0.3363, + "step": 12350 + }, + { + "epoch": 28.117780294450736, + "grad_norm": 2.0852112770080566, + "learning_rate": 1.2636363636363638e-05, + "loss": 0.31, + "step": 12400 + }, + { + "epoch": 28.117780294450736, + "eval_loss": 0.6158381104469299, + "eval_runtime": 216.498, + "eval_samples_per_second": 7.247, + "eval_steps_per_second": 0.91, + "step": 12400 + }, + { + "epoch": 28.231030577576444, + "grad_norm": 1.7770031690597534, + "learning_rate": 1.187878787878788e-05, + "loss": 0.3094, + "step": 12450 + }, + { + "epoch": 28.344280860702153, + "grad_norm": 2.2683119773864746, + "learning_rate": 1.1136363636363637e-05, + "loss": 0.3106, + "step": 12500 + }, + { + "epoch": 28.344280860702153, + "eval_loss": 0.6127829551696777, + "eval_runtime": 216.6692, + "eval_samples_per_second": 7.241, + "eval_steps_per_second": 0.909, + "step": 12500 + }, + { + "epoch": 28.45753114382786, + "grad_norm": 2.2971391677856445, + "learning_rate": 1.037878787878788e-05, + "loss": 0.3123, + "step": 12550 + }, + { + "epoch": 28.570781426953566, + "grad_norm": 1.5072888135910034, + "learning_rate": 9.62121212121212e-06, + "loss": 0.3126, + "step": 12600 + }, + { + "epoch": 28.570781426953566, + "eval_loss": 0.6085474491119385, + "eval_runtime": 216.6624, + "eval_samples_per_second": 7.242, + "eval_steps_per_second": 0.909, + "step": 12600 + }, + { + "epoch": 28.684031710079275, + "grad_norm": 1.9965884685516357, + "learning_rate": 8.863636363636365e-06, + "loss": 0.3159, + "step": 12650 + }, + { + "epoch": 28.797281993204983, + "grad_norm": 1.9271585941314697, + "learning_rate": 8.106060606060606e-06, + "loss": 0.317, + "step": 12700 + }, + { + "epoch": 28.797281993204983, + "eval_loss": 0.6035783886909485, + "eval_runtime": 216.6906, + "eval_samples_per_second": 7.241, + "eval_steps_per_second": 0.909, + "step": 12700 + }, + { + "epoch": 28.91053227633069, + "grad_norm": 1.6005176305770874, + "learning_rate": 7.3484848484848486e-06, + "loss": 0.3063, + "step": 12750 + }, + { + "epoch": 29.024915062287654, + "grad_norm": 1.3837414979934692, + "learning_rate": 6.59090909090909e-06, + "loss": 0.3149, + "step": 12800 + }, + { + "epoch": 29.024915062287654, + "eval_loss": 0.6015561819076538, + "eval_runtime": 216.6327, + "eval_samples_per_second": 7.243, + "eval_steps_per_second": 0.909, + "step": 12800 + }, + { + "epoch": 29.138165345413363, + "grad_norm": 1.3116227388381958, + "learning_rate": 5.833333333333334e-06, + "loss": 0.2962, + "step": 12850 + }, + { + "epoch": 29.25141562853907, + "grad_norm": 1.3354703187942505, + "learning_rate": 5.075757575757576e-06, + "loss": 0.2966, + "step": 12900 + }, + { + "epoch": 29.25141562853907, + "eval_loss": 0.5984891653060913, + "eval_runtime": 216.6939, + "eval_samples_per_second": 7.241, + "eval_steps_per_second": 0.909, + "step": 12900 + }, + { + "epoch": 29.36466591166478, + "grad_norm": 1.1777273416519165, + "learning_rate": 4.3181818181818185e-06, + "loss": 0.2989, + "step": 12950 + }, + { + "epoch": 29.477916194790488, + "grad_norm": 1.9163764715194702, + "learning_rate": 3.5606060606060608e-06, + "loss": 0.3063, + "step": 13000 + }, + { + "epoch": 29.477916194790488, + "eval_loss": 0.5958673357963562, + "eval_runtime": 216.5489, + "eval_samples_per_second": 7.245, + "eval_steps_per_second": 0.91, + "step": 13000 + }, + { + "epoch": 29.591166477916193, + "grad_norm": 1.3537064790725708, + "learning_rate": 2.803030303030303e-06, + "loss": 0.2951, + "step": 13050 + }, + { + "epoch": 29.7044167610419, + "grad_norm": 1.3078798055648804, + "learning_rate": 2.0454545454545457e-06, + "loss": 0.2963, + "step": 13100 + }, + { + "epoch": 29.7044167610419, + "eval_loss": 0.5946142077445984, + "eval_runtime": 216.5944, + "eval_samples_per_second": 7.244, + "eval_steps_per_second": 0.91, + "step": 13100 + }, + { + "epoch": 29.81766704416761, + "grad_norm": 1.289014458656311, + "learning_rate": 1.287878787878788e-06, + "loss": 0.2959, + "step": 13150 + }, + { + "epoch": 29.93091732729332, + "grad_norm": 1.3634095191955566, + "learning_rate": 5.303030303030304e-07, + "loss": 0.2942, + "step": 13200 + }, + { + "epoch": 29.93091732729332, + "eval_loss": 0.5935017466545105, + "eval_runtime": 216.5971, + "eval_samples_per_second": 7.244, + "eval_steps_per_second": 0.91, + "step": 13200 + }, + { + "epoch": 30.045300113250285, + "grad_norm": 1.3688397407531738, + "learning_rate": 1.8012807271224955e-05, + "loss": 0.2984, + "step": 13250 + }, + { + "epoch": 30.15855039637599, + "grad_norm": 1.7011109590530396, + "learning_rate": 1.7324244302141433e-05, + "loss": 0.2943, + "step": 13300 + }, + { + "epoch": 30.15855039637599, + "eval_loss": 0.6008950471878052, + "eval_runtime": 216.6155, + "eval_samples_per_second": 7.243, + "eval_steps_per_second": 0.909, + "step": 13300 + }, + { + "epoch": 30.2718006795017, + "grad_norm": 1.7913622856140137, + "learning_rate": 1.663568133305791e-05, + "loss": 0.2931, + "step": 13350 + }, + { + "epoch": 30.385050962627407, + "grad_norm": 1.8850469589233398, + "learning_rate": 1.5947118363974385e-05, + "loss": 0.305, + "step": 13400 + }, + { + "epoch": 30.385050962627407, + "eval_loss": 0.5989060997962952, + "eval_runtime": 216.6435, + "eval_samples_per_second": 7.242, + "eval_steps_per_second": 0.909, + "step": 13400 + }, + { + "epoch": 30.498301245753115, + "grad_norm": 1.997729778289795, + "learning_rate": 1.5258555394890863e-05, + "loss": 0.3083, + "step": 13450 + }, + { + "epoch": 30.611551528878824, + "grad_norm": 1.8760637044906616, + "learning_rate": 1.4569992425807341e-05, + "loss": 0.3006, + "step": 13500 + }, + { + "epoch": 30.611551528878824, + "eval_loss": 0.596034824848175, + "eval_runtime": 216.6232, + "eval_samples_per_second": 7.243, + "eval_steps_per_second": 0.909, + "step": 13500 + }, + { + "epoch": 30.72480181200453, + "grad_norm": 1.8395705223083496, + "learning_rate": 1.388142945672382e-05, + "loss": 0.3057, + "step": 13550 + }, + { + "epoch": 30.838052095130237, + "grad_norm": 1.8442955017089844, + "learning_rate": 1.3192866487640296e-05, + "loss": 0.3038, + "step": 13600 + }, + { + "epoch": 30.838052095130237, + "eval_loss": 0.5910864472389221, + "eval_runtime": 216.6693, + "eval_samples_per_second": 7.241, + "eval_steps_per_second": 0.909, + "step": 13600 + }, + { + "epoch": 30.951302378255946, + "grad_norm": 1.9047316312789917, + "learning_rate": 1.2504303518556774e-05, + "loss": 0.2949, + "step": 13650 + }, + { + "epoch": 31.065685164212912, + "grad_norm": 1.8259665966033936, + "learning_rate": 1.181574054947325e-05, + "loss": 0.2984, + "step": 13700 + }, + { + "epoch": 31.065685164212912, + "eval_loss": 0.5886039733886719, + "eval_runtime": 216.6071, + "eval_samples_per_second": 7.244, + "eval_steps_per_second": 0.909, + "step": 13700 + }, + { + "epoch": 31.178935447338617, + "grad_norm": 1.9464973211288452, + "learning_rate": 1.1127177580389728e-05, + "loss": 0.2915, + "step": 13750 + }, + { + "epoch": 31.292185730464325, + "grad_norm": 1.4512701034545898, + "learning_rate": 1.0438614611306204e-05, + "loss": 0.2865, + "step": 13800 + }, + { + "epoch": 31.292185730464325, + "eval_loss": 0.5855095982551575, + "eval_runtime": 216.5764, + "eval_samples_per_second": 7.245, + "eval_steps_per_second": 0.91, + "step": 13800 + }, + { + "epoch": 31.405436013590034, + "grad_norm": 1.6476430892944336, + "learning_rate": 9.750051642222682e-06, + "loss": 0.2794, + "step": 13850 + }, + { + "epoch": 31.518686296715742, + "grad_norm": 2.3963589668273926, + "learning_rate": 9.06148867313916e-06, + "loss": 0.2958, + "step": 13900 + }, + { + "epoch": 31.518686296715742, + "eval_loss": 0.5817484259605408, + "eval_runtime": 216.6161, + "eval_samples_per_second": 7.243, + "eval_steps_per_second": 0.909, + "step": 13900 + }, + { + "epoch": 31.63193657984145, + "grad_norm": 1.6295278072357178, + "learning_rate": 8.372925704055636e-06, + "loss": 0.2842, + "step": 13950 + }, + { + "epoch": 31.745186862967156, + "grad_norm": 1.7011767625808716, + "learning_rate": 7.684362734972115e-06, + "loss": 0.2853, + "step": 14000 + }, + { + "epoch": 31.745186862967156, + "eval_loss": 0.5777027010917664, + "eval_runtime": 216.6548, + "eval_samples_per_second": 7.242, + "eval_steps_per_second": 0.909, + "step": 14000 + }, + { + "epoch": 31.858437146092864, + "grad_norm": 1.2951115369796753, + "learning_rate": 6.995799765888592e-06, + "loss": 0.2822, + "step": 14050 + }, + { + "epoch": 31.971687429218573, + "grad_norm": 1.6724634170532227, + "learning_rate": 6.307236796805067e-06, + "loss": 0.282, + "step": 14100 + }, + { + "epoch": 31.971687429218573, + "eval_loss": 0.5746533274650574, + "eval_runtime": 216.6678, + "eval_samples_per_second": 7.242, + "eval_steps_per_second": 0.909, + "step": 14100 + }, + { + "epoch": 32.08607021517554, + "grad_norm": 1.1698694229125977, + "learning_rate": 5.618673827721545e-06, + "loss": 0.2858, + "step": 14150 + }, + { + "epoch": 32.19932049830125, + "grad_norm": 1.4823814630508423, + "learning_rate": 4.930110858638023e-06, + "loss": 0.2724, + "step": 14200 + }, + { + "epoch": 32.19932049830125, + "eval_loss": 0.5733225345611572, + "eval_runtime": 216.6993, + "eval_samples_per_second": 7.24, + "eval_steps_per_second": 0.909, + "step": 14200 + }, + { + "epoch": 32.312570781426956, + "grad_norm": 1.2654746770858765, + "learning_rate": 4.2415478895545e-06, + "loss": 0.2668, + "step": 14250 + }, + { + "epoch": 32.425821064552665, + "grad_norm": 1.390316367149353, + "learning_rate": 3.5529849204709775e-06, + "loss": 0.2732, + "step": 14300 + }, + { + "epoch": 32.425821064552665, + "eval_loss": 0.5705001354217529, + "eval_runtime": 216.6456, + "eval_samples_per_second": 7.242, + "eval_steps_per_second": 0.909, + "step": 14300 + }, + { + "epoch": 32.539071347678366, + "grad_norm": 1.0841820240020752, + "learning_rate": 2.864421951387454e-06, + "loss": 0.2757, + "step": 14350 + }, + { + "epoch": 32.652321630804074, + "grad_norm": 1.1355277299880981, + "learning_rate": 2.175858982303932e-06, + "loss": 0.2726, + "step": 14400 + }, + { + "epoch": 32.652321630804074, + "eval_loss": 0.5688679814338684, + "eval_runtime": 216.6214, + "eval_samples_per_second": 7.243, + "eval_steps_per_second": 0.909, + "step": 14400 + }, + { + "epoch": 32.76557191392978, + "grad_norm": 1.3108878135681152, + "learning_rate": 1.4872960132204092e-06, + "loss": 0.273, + "step": 14450 + }, + { + "epoch": 32.87882219705549, + "grad_norm": 1.175482153892517, + "learning_rate": 7.987330441368863e-07, + "loss": 0.2695, + "step": 14500 + }, + { + "epoch": 32.87882219705549, + "eval_loss": 0.567724347114563, + "eval_runtime": 216.7209, + "eval_samples_per_second": 7.24, + "eval_steps_per_second": 0.909, + "step": 14500 + }, + { + "epoch": 32.9920724801812, + "grad_norm": 1.3629848957061768, + "learning_rate": 1.1017007505336364e-07, + "loss": 0.2808, + "step": 14550 + }, + { + "epoch": 33.10645526613816, + "grad_norm": 3.8983519077301025, + "learning_rate": 5.299015897047691e-05, + "loss": 0.2925, + "step": 14600 + }, + { + "epoch": 33.10645526613816, + "eval_loss": 0.6180706024169922, + "eval_runtime": 217.7372, + "eval_samples_per_second": 7.206, + "eval_steps_per_second": 0.905, + "step": 14600 + }, + { + "epoch": 33.21970554926387, + "grad_norm": 3.372912645339966, + "learning_rate": 5.249558415341913e-05, + "loss": 0.3275, + "step": 14650 + }, + { + "epoch": 33.33295583238958, + "grad_norm": 3.959416389465332, + "learning_rate": 5.199091597274792e-05, + "loss": 0.341, + "step": 14700 + }, + { + "epoch": 33.33295583238958, + "eval_loss": 0.6353843808174133, + "eval_runtime": 217.8515, + "eval_samples_per_second": 7.202, + "eval_steps_per_second": 0.904, + "step": 14700 + }, + { + "epoch": 33.44620611551529, + "grad_norm": 3.4942378997802734, + "learning_rate": 5.1486247792076715e-05, + "loss": 0.3485, + "step": 14750 + }, + { + "epoch": 33.559456398641, + "grad_norm": 3.3839058876037598, + "learning_rate": 5.098157961140551e-05, + "loss": 0.3442, + "step": 14800 + }, + { + "epoch": 33.559456398641, + "eval_loss": 0.6342476606369019, + "eval_runtime": 217.9772, + "eval_samples_per_second": 7.198, + "eval_steps_per_second": 0.904, + "step": 14800 + }, + { + "epoch": 33.672706681766705, + "grad_norm": 3.631831407546997, + "learning_rate": 5.04769114307343e-05, + "loss": 0.3505, + "step": 14850 + }, + { + "epoch": 33.785956964892414, + "grad_norm": 3.05415678024292, + "learning_rate": 4.9972243250063086e-05, + "loss": 0.342, + "step": 14900 + }, + { + "epoch": 33.785956964892414, + "eval_loss": 0.6282561421394348, + "eval_runtime": 218.0099, + "eval_samples_per_second": 7.197, + "eval_steps_per_second": 0.904, + "step": 14900 + }, + { + "epoch": 33.89920724801812, + "grad_norm": 3.213174343109131, + "learning_rate": 4.946757506939187e-05, + "loss": 0.3526, + "step": 14950 + }, + { + "epoch": 34.013590033975085, + "grad_norm": 2.7019898891448975, + "learning_rate": 4.8962906888720665e-05, + "loss": 0.3596, + "step": 15000 + }, + { + "epoch": 34.013590033975085, + "eval_loss": 0.6229637265205383, + "eval_runtime": 217.9233, + "eval_samples_per_second": 7.2, + "eval_steps_per_second": 0.904, + "step": 15000 + }, + { + "epoch": 34.12684031710079, + "grad_norm": 3.5813961029052734, + "learning_rate": 4.845823870804946e-05, + "loss": 0.3202, + "step": 15050 + }, + { + "epoch": 34.2400906002265, + "grad_norm": 2.996546983718872, + "learning_rate": 4.795357052737825e-05, + "loss": 0.3208, + "step": 15100 + }, + { + "epoch": 34.2400906002265, + "eval_loss": 0.6200416684150696, + "eval_runtime": 217.9712, + "eval_samples_per_second": 7.198, + "eval_steps_per_second": 0.904, + "step": 15100 + }, + { + "epoch": 34.35334088335221, + "grad_norm": 3.248931407928467, + "learning_rate": 4.744890234670704e-05, + "loss": 0.3191, + "step": 15150 + }, + { + "epoch": 34.46659116647792, + "grad_norm": 2.503894805908203, + "learning_rate": 4.6944234166035835e-05, + "loss": 0.3206, + "step": 15200 + }, + { + "epoch": 34.46659116647792, + "eval_loss": 0.6110924482345581, + "eval_runtime": 217.9474, + "eval_samples_per_second": 7.199, + "eval_steps_per_second": 0.904, + "step": 15200 + }, + { + "epoch": 34.57984144960363, + "grad_norm": 3.3107473850250244, + "learning_rate": 4.643956598536463e-05, + "loss": 0.3198, + "step": 15250 + }, + { + "epoch": 34.69309173272933, + "grad_norm": 2.6435258388519287, + "learning_rate": 4.5934897804693414e-05, + "loss": 0.3261, + "step": 15300 + }, + { + "epoch": 34.69309173272933, + "eval_loss": 0.603391706943512, + "eval_runtime": 217.9424, + "eval_samples_per_second": 7.199, + "eval_steps_per_second": 0.904, + "step": 15300 + }, + { + "epoch": 34.80634201585504, + "grad_norm": 3.1980810165405273, + "learning_rate": 4.5430229624022207e-05, + "loss": 0.3216, + "step": 15350 + }, + { + "epoch": 34.919592298980746, + "grad_norm": 2.4994754791259766, + "learning_rate": 4.4925561443351e-05, + "loss": 0.3168, + "step": 15400 + }, + { + "epoch": 34.919592298980746, + "eval_loss": 0.5940945148468018, + "eval_runtime": 218.056, + "eval_samples_per_second": 7.195, + "eval_steps_per_second": 0.903, + "step": 15400 + }, + { + "epoch": 35.033975084937715, + "grad_norm": 2.75138521194458, + "learning_rate": 4.442089326267979e-05, + "loss": 0.3191, + "step": 15450 + }, + { + "epoch": 35.14722536806342, + "grad_norm": 3.1039974689483643, + "learning_rate": 4.3916225082008585e-05, + "loss": 0.296, + "step": 15500 + }, + { + "epoch": 35.14722536806342, + "eval_loss": 0.5926975607872009, + "eval_runtime": 218.0089, + "eval_samples_per_second": 7.197, + "eval_steps_per_second": 0.904, + "step": 15500 + }, + { + "epoch": 35.260475651189125, + "grad_norm": 2.9686388969421387, + "learning_rate": 4.341155690133738e-05, + "loss": 0.2921, + "step": 15550 + }, + { + "epoch": 35.373725934314834, + "grad_norm": 2.5670547485351562, + "learning_rate": 4.290688872066616e-05, + "loss": 0.2909, + "step": 15600 + }, + { + "epoch": 35.373725934314834, + "eval_loss": 0.5892407894134521, + "eval_runtime": 217.8483, + "eval_samples_per_second": 7.202, + "eval_steps_per_second": 0.904, + "step": 15600 + }, + { + "epoch": 35.48697621744054, + "grad_norm": 2.28952956199646, + "learning_rate": 4.2402220539994956e-05, + "loss": 0.2947, + "step": 15650 + }, + { + "epoch": 35.60022650056625, + "grad_norm": 2.401625394821167, + "learning_rate": 4.189755235932374e-05, + "loss": 0.2915, + "step": 15700 + }, + { + "epoch": 35.60022650056625, + "eval_loss": 0.5815189480781555, + "eval_runtime": 217.8623, + "eval_samples_per_second": 7.202, + "eval_steps_per_second": 0.904, + "step": 15700 + }, + { + "epoch": 35.71347678369196, + "grad_norm": 2.7113890647888184, + "learning_rate": 4.1392884178652534e-05, + "loss": 0.2908, + "step": 15750 + }, + { + "epoch": 35.82672706681767, + "grad_norm": 2.949303388595581, + "learning_rate": 4.088821599798133e-05, + "loss": 0.2942, + "step": 15800 + }, + { + "epoch": 35.82672706681767, + "eval_loss": 0.5712306499481201, + "eval_runtime": 217.792, + "eval_samples_per_second": 7.204, + "eval_steps_per_second": 0.905, + "step": 15800 + }, + { + "epoch": 35.939977349943376, + "grad_norm": 2.3547251224517822, + "learning_rate": 4.038354781731012e-05, + "loss": 0.2854, + "step": 15850 + }, + { + "epoch": 36.05436013590034, + "grad_norm": 2.6130595207214355, + "learning_rate": 3.987887963663891e-05, + "loss": 0.2877, + "step": 15900 + }, + { + "epoch": 36.05436013590034, + "eval_loss": 0.5668493509292603, + "eval_runtime": 217.7584, + "eval_samples_per_second": 7.205, + "eval_steps_per_second": 0.905, + "step": 15900 + }, + { + "epoch": 36.16761041902605, + "grad_norm": 2.4720046520233154, + "learning_rate": 3.9374211455967705e-05, + "loss": 0.272, + "step": 15950 + }, + { + "epoch": 36.280860702151756, + "grad_norm": 3.291337490081787, + "learning_rate": 3.886954327529649e-05, + "loss": 0.2756, + "step": 16000 + }, + { + "epoch": 36.280860702151756, + "eval_loss": 0.5569508075714111, + "eval_runtime": 217.7243, + "eval_samples_per_second": 7.206, + "eval_steps_per_second": 0.905, + "step": 16000 + }, + { + "epoch": 36.394110985277464, + "grad_norm": 2.275122880935669, + "learning_rate": 3.8364875094625284e-05, + "loss": 0.2699, + "step": 16050 + }, + { + "epoch": 36.50736126840317, + "grad_norm": 2.351252317428589, + "learning_rate": 3.7860206913954076e-05, + "loss": 0.263, + "step": 16100 + }, + { + "epoch": 36.50736126840317, + "eval_loss": 0.552777886390686, + "eval_runtime": 217.6644, + "eval_samples_per_second": 7.208, + "eval_steps_per_second": 0.905, + "step": 16100 + }, + { + "epoch": 36.62061155152888, + "grad_norm": 2.0470945835113525, + "learning_rate": 3.735553873328287e-05, + "loss": 0.2605, + "step": 16150 + }, + { + "epoch": 36.73386183465459, + "grad_norm": 2.258258819580078, + "learning_rate": 3.685087055261166e-05, + "loss": 0.2621, + "step": 16200 + }, + { + "epoch": 36.73386183465459, + "eval_loss": 0.548316478729248, + "eval_runtime": 217.7617, + "eval_samples_per_second": 7.205, + "eval_steps_per_second": 0.905, + "step": 16200 + }, + { + "epoch": 36.84711211778029, + "grad_norm": 2.473788261413574, + "learning_rate": 3.6346202371940454e-05, + "loss": 0.2606, + "step": 16250 + }, + { + "epoch": 36.960362400906, + "grad_norm": 2.4730281829833984, + "learning_rate": 3.584153419126925e-05, + "loss": 0.2674, + "step": 16300 + }, + { + "epoch": 36.960362400906, + "eval_loss": 0.5399536490440369, + "eval_runtime": 217.745, + "eval_samples_per_second": 7.206, + "eval_steps_per_second": 0.905, + "step": 16300 + }, + { + "epoch": 37.07474518686297, + "grad_norm": 2.3119349479675293, + "learning_rate": 3.533686601059803e-05, + "loss": 0.258, + "step": 16350 + }, + { + "epoch": 37.18799546998868, + "grad_norm": 2.451964855194092, + "learning_rate": 3.4832197829926826e-05, + "loss": 0.2452, + "step": 16400 + }, + { + "epoch": 37.18799546998868, + "eval_loss": 0.5389652252197266, + "eval_runtime": 217.7139, + "eval_samples_per_second": 7.207, + "eval_steps_per_second": 0.905, + "step": 16400 + }, + { + "epoch": 37.30124575311438, + "grad_norm": 2.2861897945404053, + "learning_rate": 3.432752964925562e-05, + "loss": 0.2483, + "step": 16450 + }, + { + "epoch": 37.41449603624009, + "grad_norm": 1.7861238718032837, + "learning_rate": 3.3822861468584404e-05, + "loss": 0.2493, + "step": 16500 + }, + { + "epoch": 37.41449603624009, + "eval_loss": 0.5293774604797363, + "eval_runtime": 217.8898, + "eval_samples_per_second": 7.201, + "eval_steps_per_second": 0.904, + "step": 16500 + }, + { + "epoch": 37.5277463193658, + "grad_norm": 2.2910056114196777, + "learning_rate": 3.33181932879132e-05, + "loss": 0.2449, + "step": 16550 + }, + { + "epoch": 37.640996602491505, + "grad_norm": 2.102193832397461, + "learning_rate": 3.281352510724199e-05, + "loss": 0.2398, + "step": 16600 + }, + { + "epoch": 37.640996602491505, + "eval_loss": 0.5246281027793884, + "eval_runtime": 217.7811, + "eval_samples_per_second": 7.204, + "eval_steps_per_second": 0.905, + "step": 16600 + }, + { + "epoch": 37.75424688561721, + "grad_norm": 2.1423254013061523, + "learning_rate": 3.230885692657078e-05, + "loss": 0.2438, + "step": 16650 + }, + { + "epoch": 37.86749716874292, + "grad_norm": 2.031027317047119, + "learning_rate": 3.180418874589957e-05, + "loss": 0.2427, + "step": 16700 + }, + { + "epoch": 37.86749716874292, + "eval_loss": 0.5190041661262512, + "eval_runtime": 217.6886, + "eval_samples_per_second": 7.208, + "eval_steps_per_second": 0.905, + "step": 16700 + }, + { + "epoch": 37.98074745186863, + "grad_norm": 1.8530203104019165, + "learning_rate": 3.129952056522836e-05, + "loss": 0.2446, + "step": 16750 + }, + { + "epoch": 38.09513023782559, + "grad_norm": 1.9591715335845947, + "learning_rate": 3.0794852384557153e-05, + "loss": 0.2288, + "step": 16800 + }, + { + "epoch": 38.09513023782559, + "eval_loss": 0.5154264569282532, + "eval_runtime": 217.6837, + "eval_samples_per_second": 7.208, + "eval_steps_per_second": 0.905, + "step": 16800 + }, + { + "epoch": 38.2083805209513, + "grad_norm": 1.752700686454773, + "learning_rate": 3.0290184203885946e-05, + "loss": 0.2249, + "step": 16850 + }, + { + "epoch": 38.32163080407701, + "grad_norm": 1.7865016460418701, + "learning_rate": 2.978551602321474e-05, + "loss": 0.2254, + "step": 16900 + }, + { + "epoch": 38.32163080407701, + "eval_loss": 0.510138750076294, + "eval_runtime": 217.7044, + "eval_samples_per_second": 7.207, + "eval_steps_per_second": 0.905, + "step": 16900 + }, + { + "epoch": 38.43488108720272, + "grad_norm": 1.851835012435913, + "learning_rate": 2.9280847842543528e-05, + "loss": 0.2255, + "step": 16950 + }, + { + "epoch": 38.54813137032843, + "grad_norm": 1.7320882081985474, + "learning_rate": 2.877617966187232e-05, + "loss": 0.227, + "step": 17000 + }, + { + "epoch": 38.54813137032843, + "eval_loss": 0.5055522322654724, + "eval_runtime": 217.6805, + "eval_samples_per_second": 7.208, + "eval_steps_per_second": 0.905, + "step": 17000 + }, + { + "epoch": 38.661381653454136, + "grad_norm": 2.6240079402923584, + "learning_rate": 2.8271511481201113e-05, + "loss": 0.2227, + "step": 17050 + }, + { + "epoch": 38.774631936579844, + "grad_norm": 1.8069425821304321, + "learning_rate": 2.7766843300529906e-05, + "loss": 0.223, + "step": 17100 + }, + { + "epoch": 38.774631936579844, + "eval_loss": 0.49882474541664124, + "eval_runtime": 217.6912, + "eval_samples_per_second": 7.207, + "eval_steps_per_second": 0.905, + "step": 17100 + }, + { + "epoch": 38.88788221970555, + "grad_norm": 1.8260191679000854, + "learning_rate": 2.7262175119858695e-05, + "loss": 0.2239, + "step": 17150 + }, + { + "epoch": 39.002265005662515, + "grad_norm": 5.091439723968506, + "learning_rate": 2.6757506939187488e-05, + "loss": 0.2295, + "step": 17200 + }, + { + "epoch": 39.002265005662515, + "eval_loss": 0.49227145314216614, + "eval_runtime": 217.7384, + "eval_samples_per_second": 7.206, + "eval_steps_per_second": 0.905, + "step": 17200 + }, + { + "epoch": 39.115515288788224, + "grad_norm": 2.467454433441162, + "learning_rate": 2.625283875851628e-05, + "loss": 0.2057, + "step": 17250 + }, + { + "epoch": 39.22876557191393, + "grad_norm": 1.6467406749725342, + "learning_rate": 2.5748170577845067e-05, + "loss": 0.2022, + "step": 17300 + }, + { + "epoch": 39.22876557191393, + "eval_loss": 0.49300825595855713, + "eval_runtime": 217.7288, + "eval_samples_per_second": 7.206, + "eval_steps_per_second": 0.905, + "step": 17300 + }, + { + "epoch": 39.34201585503964, + "grad_norm": 1.446031093597412, + "learning_rate": 2.5243502397173856e-05, + "loss": 0.2055, + "step": 17350 + }, + { + "epoch": 39.45526613816534, + "grad_norm": 1.6686514616012573, + "learning_rate": 2.4738834216502652e-05, + "loss": 0.2147, + "step": 17400 + }, + { + "epoch": 39.45526613816534, + "eval_loss": 0.485858678817749, + "eval_runtime": 217.207, + "eval_samples_per_second": 7.224, + "eval_steps_per_second": 0.907, + "step": 17400 + }, + { + "epoch": 39.56851642129105, + "grad_norm": 1.513580322265625, + "learning_rate": 2.4234166035831445e-05, + "loss": 0.2046, + "step": 17450 + }, + { + "epoch": 39.68176670441676, + "grad_norm": 1.5527840852737427, + "learning_rate": 2.372949785516023e-05, + "loss": 0.2039, + "step": 17500 + }, + { + "epoch": 39.68176670441676, + "eval_loss": 0.48166778683662415, + "eval_runtime": 217.3046, + "eval_samples_per_second": 7.22, + "eval_steps_per_second": 0.907, + "step": 17500 + }, + { + "epoch": 39.79501698754247, + "grad_norm": 1.5010417699813843, + "learning_rate": 2.3224829674489023e-05, + "loss": 0.21, + "step": 17550 + }, + { + "epoch": 39.908267270668176, + "grad_norm": 2.1489455699920654, + "learning_rate": 2.2720161493817816e-05, + "loss": 0.2042, + "step": 17600 + }, + { + "epoch": 39.908267270668176, + "eval_loss": 0.4749002754688263, + "eval_runtime": 217.3286, + "eval_samples_per_second": 7.219, + "eval_steps_per_second": 0.906, + "step": 17600 + }, + { + "epoch": 40.02265005662514, + "grad_norm": 1.933009147644043, + "learning_rate": 2.221549331314661e-05, + "loss": 0.2157, + "step": 17650 + }, + { + "epoch": 40.13590033975085, + "grad_norm": 1.5398054122924805, + "learning_rate": 2.1710825132475398e-05, + "loss": 0.1894, + "step": 17700 + }, + { + "epoch": 40.13590033975085, + "eval_loss": 0.4718286097049713, + "eval_runtime": 217.1637, + "eval_samples_per_second": 7.225, + "eval_steps_per_second": 0.907, + "step": 17700 + }, + { + "epoch": 40.249150622876556, + "grad_norm": 1.7476941347122192, + "learning_rate": 2.120615695180419e-05, + "loss": 0.1924, + "step": 17750 + }, + { + "epoch": 40.362400906002264, + "grad_norm": 1.5386378765106201, + "learning_rate": 2.0701488771132983e-05, + "loss": 0.1918, + "step": 17800 + }, + { + "epoch": 40.362400906002264, + "eval_loss": 0.46827250719070435, + "eval_runtime": 217.1473, + "eval_samples_per_second": 7.226, + "eval_steps_per_second": 0.907, + "step": 17800 + }, + { + "epoch": 40.47565118912797, + "grad_norm": 1.6006604433059692, + "learning_rate": 2.0196820590461773e-05, + "loss": 0.188, + "step": 17850 + }, + { + "epoch": 40.58890147225368, + "grad_norm": 1.5906981229782104, + "learning_rate": 1.9692152409790562e-05, + "loss": 0.1922, + "step": 17900 + }, + { + "epoch": 40.58890147225368, + "eval_loss": 0.4618977904319763, + "eval_runtime": 217.1294, + "eval_samples_per_second": 7.226, + "eval_steps_per_second": 0.907, + "step": 17900 + }, + { + "epoch": 40.70215175537939, + "grad_norm": 1.451889991760254, + "learning_rate": 1.9187484229119355e-05, + "loss": 0.1961, + "step": 17950 + }, + { + "epoch": 40.8154020385051, + "grad_norm": 1.2037873268127441, + "learning_rate": 1.8682816048448147e-05, + "loss": 0.1951, + "step": 18000 + }, + { + "epoch": 40.8154020385051, + "eval_loss": 0.45853978395462036, + "eval_runtime": 217.1867, + "eval_samples_per_second": 7.224, + "eval_steps_per_second": 0.907, + "step": 18000 + }, + { + "epoch": 40.92865232163081, + "grad_norm": 1.124363899230957, + "learning_rate": 1.8178147867776936e-05, + "loss": 0.1907, + "step": 18050 + }, + { + "epoch": 41.04303510758777, + "grad_norm": 1.1726500988006592, + "learning_rate": 1.767347968710573e-05, + "loss": 0.1893, + "step": 18100 + }, + { + "epoch": 41.04303510758777, + "eval_loss": 0.4549981355667114, + "eval_runtime": 217.2331, + "eval_samples_per_second": 7.223, + "eval_steps_per_second": 0.907, + "step": 18100 + }, + { + "epoch": 41.15628539071348, + "grad_norm": 1.6041500568389893, + "learning_rate": 1.7168811506434522e-05, + "loss": 0.1769, + "step": 18150 + }, + { + "epoch": 41.26953567383919, + "grad_norm": 1.9704344272613525, + "learning_rate": 1.666414332576331e-05, + "loss": 0.1798, + "step": 18200 + }, + { + "epoch": 41.26953567383919, + "eval_loss": 0.45383498072624207, + "eval_runtime": 217.2468, + "eval_samples_per_second": 7.222, + "eval_steps_per_second": 0.907, + "step": 18200 + }, + { + "epoch": 41.382785956964895, + "grad_norm": 1.1522181034088135, + "learning_rate": 1.6159475145092104e-05, + "loss": 0.1858, + "step": 18250 + }, + { + "epoch": 41.4960362400906, + "grad_norm": 1.6338062286376953, + "learning_rate": 1.5654806964420893e-05, + "loss": 0.1776, + "step": 18300 + }, + { + "epoch": 41.4960362400906, + "eval_loss": 0.448618620634079, + "eval_runtime": 217.2186, + "eval_samples_per_second": 7.223, + "eval_steps_per_second": 0.907, + "step": 18300 + }, + { + "epoch": 41.609286523216305, + "grad_norm": 1.1537904739379883, + "learning_rate": 1.5150138783749684e-05, + "loss": 0.1759, + "step": 18350 + }, + { + "epoch": 41.72253680634201, + "grad_norm": 1.285271406173706, + "learning_rate": 1.4645470603078477e-05, + "loss": 0.1794, + "step": 18400 + }, + { + "epoch": 41.72253680634201, + "eval_loss": 0.4447907507419586, + "eval_runtime": 217.1396, + "eval_samples_per_second": 7.226, + "eval_steps_per_second": 0.907, + "step": 18400 + }, + { + "epoch": 41.83578708946772, + "grad_norm": 1.125063419342041, + "learning_rate": 1.4140802422407268e-05, + "loss": 0.1756, + "step": 18450 + }, + { + "epoch": 41.94903737259343, + "grad_norm": 1.1060149669647217, + "learning_rate": 1.3636134241736059e-05, + "loss": 0.1787, + "step": 18500 + }, + { + "epoch": 41.94903737259343, + "eval_loss": 0.4420225918292999, + "eval_runtime": 217.4988, + "eval_samples_per_second": 7.214, + "eval_steps_per_second": 0.906, + "step": 18500 + }, + { + "epoch": 42.06342015855039, + "grad_norm": 1.0146502256393433, + "learning_rate": 1.3131466061064851e-05, + "loss": 0.1791, + "step": 18550 + }, + { + "epoch": 42.1766704416761, + "grad_norm": 1.1884300708770752, + "learning_rate": 1.2626797880393642e-05, + "loss": 0.1658, + "step": 18600 + }, + { + "epoch": 42.1766704416761, + "eval_loss": 0.4396124482154846, + "eval_runtime": 217.7883, + "eval_samples_per_second": 7.204, + "eval_steps_per_second": 0.905, + "step": 18600 + }, + { + "epoch": 42.28992072480181, + "grad_norm": 1.1497679948806763, + "learning_rate": 1.2122129699722433e-05, + "loss": 0.1696, + "step": 18650 + }, + { + "epoch": 42.40317100792752, + "grad_norm": 1.32937490940094, + "learning_rate": 1.1617461519051224e-05, + "loss": 0.1643, + "step": 18700 + }, + { + "epoch": 42.40317100792752, + "eval_loss": 0.43940281867980957, + "eval_runtime": 218.6239, + "eval_samples_per_second": 7.177, + "eval_steps_per_second": 0.901, + "step": 18700 + }, + { + "epoch": 42.51642129105323, + "grad_norm": 1.5960180759429932, + "learning_rate": 1.1112793338380017e-05, + "loss": 0.1699, + "step": 18750 + }, + { + "epoch": 42.629671574178936, + "grad_norm": 1.0415377616882324, + "learning_rate": 1.0608125157708806e-05, + "loss": 0.1654, + "step": 18800 + }, + { + "epoch": 42.629671574178936, + "eval_loss": 0.43373051285743713, + "eval_runtime": 218.5653, + "eval_samples_per_second": 7.179, + "eval_steps_per_second": 0.901, + "step": 18800 + }, + { + "epoch": 42.742921857304644, + "grad_norm": 1.5094951391220093, + "learning_rate": 1.0103456977037597e-05, + "loss": 0.1669, + "step": 18850 + }, + { + "epoch": 42.85617214043035, + "grad_norm": 0.9974751472473145, + "learning_rate": 9.59878879636639e-06, + "loss": 0.1681, + "step": 18900 + }, + { + "epoch": 42.85617214043035, + "eval_loss": 0.4303882420063019, + "eval_runtime": 218.6322, + "eval_samples_per_second": 7.176, + "eval_steps_per_second": 0.901, + "step": 18900 + }, + { + "epoch": 42.96942242355606, + "grad_norm": 0.9117754697799683, + "learning_rate": 9.094120615695181e-06, + "loss": 0.1706, + "step": 18950 + }, + { + "epoch": 43.083805209513024, + "grad_norm": 1.0373188257217407, + "learning_rate": 8.589452435023972e-06, + "loss": 0.1643, + "step": 19000 + }, + { + "epoch": 43.083805209513024, + "eval_loss": 0.42856693267822266, + "eval_runtime": 218.5993, + "eval_samples_per_second": 7.178, + "eval_steps_per_second": 0.901, + "step": 19000 + }, + { + "epoch": 43.19705549263873, + "grad_norm": 0.9998382329940796, + "learning_rate": 8.084784254352763e-06, + "loss": 0.1617, + "step": 19050 + }, + { + "epoch": 43.31030577576444, + "grad_norm": 0.9849778413772583, + "learning_rate": 7.580116073681555e-06, + "loss": 0.1603, + "step": 19100 + }, + { + "epoch": 43.31030577576444, + "eval_loss": 0.4269334077835083, + "eval_runtime": 218.6737, + "eval_samples_per_second": 7.175, + "eval_steps_per_second": 0.901, + "step": 19100 + }, + { + "epoch": 43.42355605889015, + "grad_norm": 1.2009530067443848, + "learning_rate": 7.0754478930103465e-06, + "loss": 0.157, + "step": 19150 + }, + { + "epoch": 43.53680634201586, + "grad_norm": 0.8868136405944824, + "learning_rate": 6.570779712339137e-06, + "loss": 0.1582, + "step": 19200 + }, + { + "epoch": 43.53680634201586, + "eval_loss": 0.42409417033195496, + "eval_runtime": 218.6076, + "eval_samples_per_second": 7.177, + "eval_steps_per_second": 0.901, + "step": 19200 + }, + { + "epoch": 43.650056625141566, + "grad_norm": 0.8435959815979004, + "learning_rate": 6.0661115316679285e-06, + "loss": 0.158, + "step": 19250 + }, + { + "epoch": 43.76330690826727, + "grad_norm": 1.1476356983184814, + "learning_rate": 5.56144335099672e-06, + "loss": 0.1608, + "step": 19300 + }, + { + "epoch": 43.76330690826727, + "eval_loss": 0.422664076089859, + "eval_runtime": 218.5875, + "eval_samples_per_second": 7.178, + "eval_steps_per_second": 0.901, + "step": 19300 + }, + { + "epoch": 43.876557191392976, + "grad_norm": 0.765332043170929, + "learning_rate": 5.056775170325511e-06, + "loss": 0.1606, + "step": 19350 + }, + { + "epoch": 43.989807474518685, + "grad_norm": 0.9879748821258545, + "learning_rate": 4.552106989654302e-06, + "loss": 0.1573, + "step": 19400 + }, + { + "epoch": 43.989807474518685, + "eval_loss": 0.4201904535293579, + "eval_runtime": 218.5744, + "eval_samples_per_second": 7.178, + "eval_steps_per_second": 0.901, + "step": 19400 + }, + { + "epoch": 44.104190260475654, + "grad_norm": 0.6540424227714539, + "learning_rate": 4.047438808983093e-06, + "loss": 0.1572, + "step": 19450 + }, + { + "epoch": 44.217440543601356, + "grad_norm": 0.9124572277069092, + "learning_rate": 3.542770628311885e-06, + "loss": 0.1498, + "step": 19500 + }, + { + "epoch": 44.217440543601356, + "eval_loss": 0.4200960695743561, + "eval_runtime": 218.5932, + "eval_samples_per_second": 7.178, + "eval_steps_per_second": 0.901, + "step": 19500 + }, + { + "epoch": 44.330690826727064, + "grad_norm": 0.8609676957130432, + "learning_rate": 3.0381024476406765e-06, + "loss": 0.1509, + "step": 19550 + }, + { + "epoch": 44.44394110985277, + "grad_norm": 0.7417690753936768, + "learning_rate": 2.533434266969468e-06, + "loss": 0.1492, + "step": 19600 + }, + { + "epoch": 44.44394110985277, + "eval_loss": 0.41948238015174866, + "eval_runtime": 218.5587, + "eval_samples_per_second": 7.179, + "eval_steps_per_second": 0.901, + "step": 19600 + }, + { + "epoch": 44.55719139297848, + "grad_norm": 0.8361729979515076, + "learning_rate": 2.0287660862982593e-06, + "loss": 0.1541, + "step": 19650 + }, + { + "epoch": 44.67044167610419, + "grad_norm": 0.911729097366333, + "learning_rate": 1.5240979056270503e-06, + "loss": 0.1559, + "step": 19700 + }, + { + "epoch": 44.67044167610419, + "eval_loss": 0.41870439052581787, + "eval_runtime": 218.5239, + "eval_samples_per_second": 7.18, + "eval_steps_per_second": 0.902, + "step": 19700 + }, + { + "epoch": 44.7836919592299, + "grad_norm": 0.7706825733184814, + "learning_rate": 1.0194297249558415e-06, + "loss": 0.1554, + "step": 19750 + }, + { + "epoch": 44.89694224235561, + "grad_norm": 0.9403465986251831, + "learning_rate": 5.147615442846329e-07, + "loss": 0.1549, + "step": 19800 + }, + { + "epoch": 44.89694224235561, + "eval_loss": 0.4180174469947815, + "eval_runtime": 218.6086, + "eval_samples_per_second": 7.177, + "eval_steps_per_second": 0.901, + "step": 19800 + }, + { + "epoch": 45.01132502831257, + "grad_norm": 0.9403154253959656, + "learning_rate": 8.532637580325652e-06, + "loss": 0.1533, + "step": 19850 + }, + { + "epoch": 45.12457531143828, + "grad_norm": 0.8529797196388245, + "learning_rate": 8.049475769435185e-06, + "loss": 0.1507, + "step": 19900 + }, + { + "epoch": 45.12457531143828, + "eval_loss": 0.41985705494880676, + "eval_runtime": 218.5997, + "eval_samples_per_second": 7.178, + "eval_steps_per_second": 0.901, + "step": 19900 + }, + { + "epoch": 45.237825594563986, + "grad_norm": 0.877526581287384, + "learning_rate": 7.5663139585447175e-06, + "loss": 0.1498, + "step": 19950 + }, + { + "epoch": 45.351075877689695, + "grad_norm": 0.9668393731117249, + "learning_rate": 7.0831521476542495e-06, + "loss": 0.152, + "step": 20000 + }, + { + "epoch": 45.351075877689695, + "eval_loss": 0.41871750354766846, + "eval_runtime": 218.6395, + "eval_samples_per_second": 7.176, + "eval_steps_per_second": 0.901, + "step": 20000 + }, + { + "epoch": 45.4643261608154, + "grad_norm": 1.0251694917678833, + "learning_rate": 6.599990336763782e-06, + "loss": 0.1529, + "step": 20050 + }, + { + "epoch": 45.57757644394111, + "grad_norm": 1.4579505920410156, + "learning_rate": 6.116828525873315e-06, + "loss": 0.1571, + "step": 20100 + }, + { + "epoch": 45.57757644394111, + "eval_loss": 0.4161282181739807, + "eval_runtime": 218.584, + "eval_samples_per_second": 7.178, + "eval_steps_per_second": 0.901, + "step": 20100 + }, + { + "epoch": 45.69082672706682, + "grad_norm": 0.7462686896324158, + "learning_rate": 5.633666714982848e-06, + "loss": 0.1611, + "step": 20150 + }, + { + "epoch": 45.80407701019253, + "grad_norm": 0.9031079411506653, + "learning_rate": 5.150504904092381e-06, + "loss": 0.153, + "step": 20200 + }, + { + "epoch": 45.80407701019253, + "eval_loss": 0.41474393010139465, + "eval_runtime": 218.6388, + "eval_samples_per_second": 7.176, + "eval_steps_per_second": 0.901, + "step": 20200 + }, + { + "epoch": 45.91732729331823, + "grad_norm": 0.8560954332351685, + "learning_rate": 4.667343093201913e-06, + "loss": 0.1531, + "step": 20250 + }, + { + "epoch": 46.0317100792752, + "grad_norm": 1.1464442014694214, + "learning_rate": 4.184181282311446e-06, + "loss": 0.1535, + "step": 20300 + }, + { + "epoch": 46.0317100792752, + "eval_loss": 0.414587140083313, + "eval_runtime": 218.5994, + "eval_samples_per_second": 7.178, + "eval_steps_per_second": 0.901, + "step": 20300 + }, + { + "epoch": 46.14496036240091, + "grad_norm": 0.8384661674499512, + "learning_rate": 3.7010194714209794e-06, + "loss": 0.1488, + "step": 20350 + }, + { + "epoch": 46.25821064552662, + "grad_norm": 0.8300140500068665, + "learning_rate": 3.217857660530512e-06, + "loss": 0.1507, + "step": 20400 + }, + { + "epoch": 46.25821064552662, + "eval_loss": 0.413276344537735, + "eval_runtime": 218.6607, + "eval_samples_per_second": 7.176, + "eval_steps_per_second": 0.901, + "step": 20400 + }, + { + "epoch": 46.37146092865232, + "grad_norm": 0.7903048396110535, + "learning_rate": 2.7346958496400447e-06, + "loss": 0.148, + "step": 20450 + }, + { + "epoch": 46.48471121177803, + "grad_norm": 0.888008713722229, + "learning_rate": 2.2515340387495775e-06, + "loss": 0.1447, + "step": 20500 + }, + { + "epoch": 46.48471121177803, + "eval_loss": 0.4132575988769531, + "eval_runtime": 218.6308, + "eval_samples_per_second": 7.176, + "eval_steps_per_second": 0.901, + "step": 20500 + }, + { + "epoch": 46.597961494903736, + "grad_norm": 0.975723147392273, + "learning_rate": 1.7683722278591102e-06, + "loss": 0.1448, + "step": 20550 + }, + { + "epoch": 46.711211778029444, + "grad_norm": 0.7616918087005615, + "learning_rate": 1.2852104169686428e-06, + "loss": 0.1489, + "step": 20600 + }, + { + "epoch": 46.711211778029444, + "eval_loss": 0.4121854305267334, + "eval_runtime": 218.5727, + "eval_samples_per_second": 7.178, + "eval_steps_per_second": 0.901, + "step": 20600 + }, + { + "epoch": 46.82446206115515, + "grad_norm": 0.8662727475166321, + "learning_rate": 8.117118422959849e-07, + "loss": 0.1483, + "step": 20650 + }, + { + "epoch": 46.93771234428086, + "grad_norm": 0.7502096891403198, + "learning_rate": 3.2855003140551773e-07, + "loss": 0.1504, + "step": 20700 + }, + { + "epoch": 46.93771234428086, + "eval_loss": 0.41195544600486755, + "eval_runtime": 218.7035, + "eval_samples_per_second": 7.174, + "eval_steps_per_second": 0.901, + "step": 20700 + }, + { + "epoch": 47.052095130237824, + "grad_norm": 0.9510757923126221, + "learning_rate": 1.1871026339691191e-05, + "loss": 0.1467, + "step": 20750 + }, + { + "epoch": 47.16534541336353, + "grad_norm": 1.0743557214736938, + "learning_rate": 1.1416893732970029e-05, + "loss": 0.1497, + "step": 20800 + }, + { + "epoch": 47.16534541336353, + "eval_loss": 0.4156714379787445, + "eval_runtime": 217.4784, + "eval_samples_per_second": 7.215, + "eval_steps_per_second": 0.906, + "step": 20800 + }, + { + "epoch": 47.27859569648924, + "grad_norm": 1.567784070968628, + "learning_rate": 1.0962761126248864e-05, + "loss": 0.1513, + "step": 20850 + }, + { + "epoch": 47.39184597961495, + "grad_norm": 1.3992472887039185, + "learning_rate": 1.0508628519527702e-05, + "loss": 0.1533, + "step": 20900 + }, + { + "epoch": 47.39184597961495, + "eval_loss": 0.4152044653892517, + "eval_runtime": 217.5597, + "eval_samples_per_second": 7.212, + "eval_steps_per_second": 0.905, + "step": 20900 + }, + { + "epoch": 47.50509626274066, + "grad_norm": 1.5980275869369507, + "learning_rate": 1.005449591280654e-05, + "loss": 0.1523, + "step": 20950 + }, + { + "epoch": 47.618346545866366, + "grad_norm": 1.2810208797454834, + "learning_rate": 9.600363306085377e-06, + "loss": 0.1502, + "step": 21000 + }, + { + "epoch": 47.618346545866366, + "eval_loss": 0.4143332839012146, + "eval_runtime": 218.7933, + "eval_samples_per_second": 7.171, + "eval_steps_per_second": 0.9, + "step": 21000 + }, + { + "epoch": 47.731596828992075, + "grad_norm": 1.4590628147125244, + "learning_rate": 9.146230699364216e-06, + "loss": 0.1512, + "step": 21050 + }, + { + "epoch": 47.84484711211778, + "grad_norm": 1.3043591976165771, + "learning_rate": 8.692098092643053e-06, + "loss": 0.1561, + "step": 21100 + }, + { + "epoch": 47.84484711211778, + "eval_loss": 0.41214144229888916, + "eval_runtime": 218.7674, + "eval_samples_per_second": 7.172, + "eval_steps_per_second": 0.9, + "step": 21100 + }, + { + "epoch": 47.958097395243485, + "grad_norm": 0.8709500432014465, + "learning_rate": 8.247048138056313e-06, + "loss": 0.1478, + "step": 21150 + }, + { + "epoch": 48.072480181200454, + "grad_norm": 1.005632758140564, + "learning_rate": 7.79291553133515e-06, + "loss": 0.1534, + "step": 21200 + }, + { + "epoch": 48.072480181200454, + "eval_loss": 0.4120267927646637, + "eval_runtime": 218.9152, + "eval_samples_per_second": 7.167, + "eval_steps_per_second": 0.9, + "step": 21200 + }, + { + "epoch": 48.18573046432616, + "grad_norm": 1.2001721858978271, + "learning_rate": 7.347865576748411e-06, + "loss": 0.1431, + "step": 21250 + }, + { + "epoch": 48.29898074745187, + "grad_norm": 1.2004830837249756, + "learning_rate": 6.893732970027249e-06, + "loss": 0.1457, + "step": 21300 + }, + { + "epoch": 48.29898074745187, + "eval_loss": 0.4105300009250641, + "eval_runtime": 218.8468, + "eval_samples_per_second": 7.169, + "eval_steps_per_second": 0.9, + "step": 21300 + }, + { + "epoch": 48.41223103057758, + "grad_norm": 1.0889978408813477, + "learning_rate": 6.439600363306085e-06, + "loss": 0.1462, + "step": 21350 + }, + { + "epoch": 48.52548131370328, + "grad_norm": 0.9354040026664734, + "learning_rate": 5.985467756584924e-06, + "loss": 0.1464, + "step": 21400 + }, + { + "epoch": 48.52548131370328, + "eval_loss": 0.40966492891311646, + "eval_runtime": 218.8783, + "eval_samples_per_second": 7.168, + "eval_steps_per_second": 0.9, + "step": 21400 + }, + { + "epoch": 48.63873159682899, + "grad_norm": 0.8427848815917969, + "learning_rate": 5.53133514986376e-06, + "loss": 0.146, + "step": 21450 + }, + { + "epoch": 48.7519818799547, + "grad_norm": 0.9390880465507507, + "learning_rate": 5.077202543142598e-06, + "loss": 0.1462, + "step": 21500 + }, + { + "epoch": 48.7519818799547, + "eval_loss": 0.40723294019699097, + "eval_runtime": 218.8819, + "eval_samples_per_second": 7.168, + "eval_steps_per_second": 0.9, + "step": 21500 + }, + { + "epoch": 48.86523216308041, + "grad_norm": 1.0009453296661377, + "learning_rate": 4.623069936421435e-06, + "loss": 0.1442, + "step": 21550 + }, + { + "epoch": 48.978482446206115, + "grad_norm": 1.11566960811615, + "learning_rate": 4.168937329700273e-06, + "loss": 0.1469, + "step": 21600 + }, + { + "epoch": 48.978482446206115, + "eval_loss": 0.405407190322876, + "eval_runtime": 218.8588, + "eval_samples_per_second": 7.169, + "eval_steps_per_second": 0.9, + "step": 21600 + }, + { + "epoch": 49.09286523216308, + "grad_norm": 0.8854078054428101, + "learning_rate": 3.71480472297911e-06, + "loss": 0.1435, + "step": 21650 + }, + { + "epoch": 49.206115515288786, + "grad_norm": 0.8558112978935242, + "learning_rate": 3.260672116257948e-06, + "loss": 0.1378, + "step": 21700 + }, + { + "epoch": 49.206115515288786, + "eval_loss": 0.4061279296875, + "eval_runtime": 218.8569, + "eval_samples_per_second": 7.169, + "eval_steps_per_second": 0.9, + "step": 21700 + }, + { + "epoch": 49.319365798414495, + "grad_norm": 0.7999886870384216, + "learning_rate": 2.806539509536785e-06, + "loss": 0.1417, + "step": 21750 + }, + { + "epoch": 49.4326160815402, + "grad_norm": 0.948358952999115, + "learning_rate": 2.3524069028156224e-06, + "loss": 0.1415, + "step": 21800 + }, + { + "epoch": 49.4326160815402, + "eval_loss": 0.40446802973747253, + "eval_runtime": 218.7745, + "eval_samples_per_second": 7.172, + "eval_steps_per_second": 0.9, + "step": 21800 + }, + { + "epoch": 49.54586636466591, + "grad_norm": 0.7728579640388489, + "learning_rate": 1.8982742960944597e-06, + "loss": 0.1396, + "step": 21850 + }, + { + "epoch": 49.65911664779162, + "grad_norm": 0.7241719365119934, + "learning_rate": 1.4441416893732972e-06, + "loss": 0.1398, + "step": 21900 + }, + { + "epoch": 49.65911664779162, + "eval_loss": 0.4039037525653839, + "eval_runtime": 218.8617, + "eval_samples_per_second": 7.169, + "eval_steps_per_second": 0.9, + "step": 21900 + }, + { + "epoch": 49.77236693091733, + "grad_norm": 0.7789280414581299, + "learning_rate": 9.900090826521344e-07, + "loss": 0.1427, + "step": 21950 + }, + { + "epoch": 49.88561721404304, + "grad_norm": 0.8703135848045349, + "learning_rate": 5.358764759309719e-07, + "loss": 0.139, + "step": 22000 + }, + { + "epoch": 49.88561721404304, + "eval_loss": 0.40355798602104187, + "eval_runtime": 218.7677, + "eval_samples_per_second": 7.172, + "eval_steps_per_second": 0.9, + "step": 22000 + }, + { + "epoch": 49.998867497168746, + "grad_norm": 0.8729577660560608, + "learning_rate": 8.174386920980928e-08, + "loss": 0.1422, + "step": 22050 + } + ], + "logging_steps": 50, + "max_steps": 22050, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.11661035307008e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}