{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 344,
  "global_step": 5510,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0014519056261343012,
      "grad_norm": 6.9375,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 0.6156,
      "num_input_tokens_seen": 499226,
      "step": 8
    },
    {
      "epoch": 0.0029038112522686023,
      "grad_norm": 1.203125,
      "learning_rate": 2.6666666666666667e-05,
      "loss": 0.0994,
      "num_input_tokens_seen": 1014244,
      "step": 16
    },
    {
      "epoch": 0.004355716878402904,
      "grad_norm": 2.125,
      "learning_rate": 4e-05,
      "loss": 0.0849,
      "num_input_tokens_seen": 1528464,
      "step": 24
    },
    {
      "epoch": 0.005807622504537205,
      "grad_norm": 1.4453125,
      "learning_rate": 3.999979012178918e-05,
      "loss": 0.12,
      "num_input_tokens_seen": 2041011,
      "step": 32
    },
    {
      "epoch": 0.007259528130671506,
      "grad_norm": 2.609375,
      "learning_rate": 3.9999160491561583e-05,
      "loss": 0.1437,
      "num_input_tokens_seen": 2530185,
      "step": 40
    },
    {
      "epoch": 0.008711433756805808,
      "grad_norm": 1.4140625,
      "learning_rate": 3.9998111122531796e-05,
      "loss": 0.0898,
      "num_input_tokens_seen": 3017273,
      "step": 48
    },
    {
      "epoch": 0.010163339382940109,
      "grad_norm": 1.921875,
      "learning_rate": 3.999664203672378e-05,
      "loss": 0.1247,
      "num_input_tokens_seen": 3507672,
      "step": 56
    },
    {
      "epoch": 0.01161524500907441,
      "grad_norm": 1.0859375,
      "learning_rate": 3.999475326497044e-05,
      "loss": 0.0819,
      "num_input_tokens_seen": 4018539,
      "step": 64
    },
    {
      "epoch": 0.013067150635208712,
      "grad_norm": 1.6171875,
      "learning_rate": 3.999244484691299e-05,
      "loss": 0.1078,
      "num_input_tokens_seen": 4525857,
      "step": 72
    },
    {
      "epoch": 0.014519056261343012,
      "grad_norm": 1.3671875,
      "learning_rate": 3.998971683100009e-05,
      "loss": 0.099,
      "num_input_tokens_seen": 5023032,
      "step": 80
    },
    {
      "epoch": 0.015970961887477313,
      "grad_norm": 1.625,
      "learning_rate": 3.9986569274486843e-05,
      "loss": 0.0855,
      "num_input_tokens_seen": 5524113,
      "step": 88
    },
    {
      "epoch": 0.017422867513611617,
      "grad_norm": 1.734375,
      "learning_rate": 3.9983002243433615e-05,
      "loss": 0.1026,
      "num_input_tokens_seen": 5999882,
      "step": 96
    },
    {
      "epoch": 0.018874773139745917,
      "grad_norm": 3.5625,
      "learning_rate": 3.9979015812704605e-05,
      "loss": 0.0843,
      "num_input_tokens_seen": 6471878,
      "step": 104
    },
    {
      "epoch": 0.020326678765880218,
      "grad_norm": 1.0625,
      "learning_rate": 3.997461006596631e-05,
      "loss": 0.0841,
      "num_input_tokens_seen": 6944973,
      "step": 112
    },
    {
      "epoch": 0.021778584392014518,
      "grad_norm": 1.0625,
      "learning_rate": 3.9969785095685765e-05,
      "loss": 0.0982,
      "num_input_tokens_seen": 7460215,
      "step": 120
    },
    {
      "epoch": 0.02323049001814882,
      "grad_norm": 1.0859375,
      "learning_rate": 3.996454100312857e-05,
      "loss": 0.0971,
      "num_input_tokens_seen": 7942417,
      "step": 128
    },
    {
      "epoch": 0.024682395644283123,
      "grad_norm": 82.0,
      "learning_rate": 3.9958877898356806e-05,
      "loss": 0.2563,
      "num_input_tokens_seen": 8454243,
      "step": 136
    },
    {
      "epoch": 0.026134301270417423,
      "grad_norm": 2.21875,
      "learning_rate": 3.99527959002267e-05,
      "loss": 0.1566,
      "num_input_tokens_seen": 8973734,
      "step": 144
    },
    {
      "epoch": 0.027586206896551724,
      "grad_norm": 2.40625,
      "learning_rate": 3.994629513638614e-05,
      "loss": 0.1109,
      "num_input_tokens_seen": 9497439,
      "step": 152
    },
    {
      "epoch": 0.029038112522686024,
      "grad_norm": 3.65625,
      "learning_rate": 3.993937574327201e-05,
      "loss": 0.1353,
      "num_input_tokens_seen": 9988636,
      "step": 160
    },
    {
      "epoch": 0.030490018148820328,
      "grad_norm": 1.578125,
      "learning_rate": 3.993203786610727e-05,
      "loss": 0.1002,
      "num_input_tokens_seen": 10460548,
      "step": 168
    },
    {
      "epoch": 0.031941923774954625,
      "grad_norm": 1.1015625,
      "learning_rate": 3.992428165889799e-05,
      "loss": 0.0952,
      "num_input_tokens_seen": 10983644,
      "step": 176
    },
    {
      "epoch": 0.033393829401088926,
      "grad_norm": 2.515625,
      "learning_rate": 3.991610728443006e-05,
      "loss": 0.1082,
      "num_input_tokens_seen": 11485663,
      "step": 184
    },
    {
      "epoch": 0.03484573502722323,
      "grad_norm": 1.53125,
      "learning_rate": 3.9907514914265776e-05,
      "loss": 0.0907,
      "num_input_tokens_seen": 11981340,
      "step": 192
    },
    {
      "epoch": 0.036297640653357534,
      "grad_norm": 12.0625,
      "learning_rate": 3.989850472874027e-05,
      "loss": 0.0704,
      "num_input_tokens_seen": 12482463,
      "step": 200
    },
    {
      "epoch": 0.037749546279491834,
      "grad_norm": 1.078125,
      "learning_rate": 3.988907691695771e-05,
      "loss": 0.0847,
      "num_input_tokens_seen": 12968571,
      "step": 208
    },
    {
      "epoch": 0.039201451905626135,
      "grad_norm": 1.2578125,
      "learning_rate": 3.987923167678732e-05,
      "loss": 0.0968,
      "num_input_tokens_seen": 13451536,
      "step": 216
    },
    {
      "epoch": 0.040653357531760435,
      "grad_norm": 2.484375,
      "learning_rate": 3.986896921485924e-05,
      "loss": 0.1026,
      "num_input_tokens_seen": 13949131,
      "step": 224
    },
    {
      "epoch": 0.042105263157894736,
      "grad_norm": 2.453125,
      "learning_rate": 3.9858289746560183e-05,
      "loss": 0.1126,
      "num_input_tokens_seen": 14447251,
      "step": 232
    },
    {
      "epoch": 0.043557168784029036,
      "grad_norm": 1.2265625,
      "learning_rate": 3.984719349602892e-05,
      "loss": 0.0934,
      "num_input_tokens_seen": 14937783,
      "step": 240
    },
    {
      "epoch": 0.04500907441016334,
      "grad_norm": 1.75,
      "learning_rate": 3.983568069615157e-05,
      "loss": 0.0936,
      "num_input_tokens_seen": 15429323,
      "step": 248
    },
    {
      "epoch": 0.04646098003629764,
      "grad_norm": 1.2109375,
      "learning_rate": 3.982375158855672e-05,
      "loss": 0.0749,
      "num_input_tokens_seen": 15920688,
      "step": 256
    },
    {
      "epoch": 0.047912885662431945,
      "grad_norm": 1.2578125,
      "learning_rate": 3.981140642361034e-05,
      "loss": 0.0868,
      "num_input_tokens_seen": 16393398,
      "step": 264
    },
    {
      "epoch": 0.049364791288566245,
      "grad_norm": 1.171875,
      "learning_rate": 3.9798645460410544e-05,
      "loss": 0.0997,
      "num_input_tokens_seen": 16894283,
      "step": 272
    },
    {
      "epoch": 0.050816696914700546,
      "grad_norm": 0.99609375,
      "learning_rate": 3.9785468966782155e-05,
      "loss": 0.0849,
      "num_input_tokens_seen": 17371830,
      "step": 280
    },
    {
      "epoch": 0.052268602540834846,
      "grad_norm": 1.15625,
      "learning_rate": 3.9771877219271055e-05,
      "loss": 0.0925,
      "num_input_tokens_seen": 17893827,
      "step": 288
    },
    {
      "epoch": 0.05372050816696915,
      "grad_norm": 0.8125,
      "learning_rate": 3.975787050313841e-05,
      "loss": 0.0822,
      "num_input_tokens_seen": 18380621,
      "step": 296
    },
    {
      "epoch": 0.05517241379310345,
      "grad_norm": 1.6484375,
      "learning_rate": 3.9743449112354676e-05,
      "loss": 0.1172,
      "num_input_tokens_seen": 18905348,
      "step": 304
    },
    {
      "epoch": 0.05662431941923775,
      "grad_norm": 1.2734375,
      "learning_rate": 3.9728613349593415e-05,
      "loss": 0.1075,
      "num_input_tokens_seen": 19399905,
      "step": 312
    },
    {
      "epoch": 0.05807622504537205,
      "grad_norm": 18.25,
      "learning_rate": 3.971336352622496e-05,
      "loss": 0.1882,
      "num_input_tokens_seen": 19921923,
      "step": 320
    },
    {
      "epoch": 0.05952813067150635,
      "grad_norm": 1.8359375,
      "learning_rate": 3.969769996230989e-05,
      "loss": 0.1074,
      "num_input_tokens_seen": 20436822,
      "step": 328
    },
    {
      "epoch": 0.060980036297640657,
      "grad_norm": 1.3828125,
      "learning_rate": 3.968162298659227e-05,
      "loss": 0.1112,
      "num_input_tokens_seen": 20943888,
      "step": 336
    },
    {
      "epoch": 0.06243194192377496,
      "grad_norm": 1.3125,
      "learning_rate": 3.9665132936492794e-05,
      "loss": 0.1519,
      "num_input_tokens_seen": 21418243,
      "step": 344
    },
    {
      "epoch": 0.06243194192377496,
      "eval_loss": 0.11010845005512238,
      "eval_runtime": 2622.9951,
      "eval_samples_per_second": 1.188,
      "eval_steps_per_second": 0.149,
      "num_input_tokens_seen": 21418243,
      "step": 344
    },
    {
      "epoch": 0.06388384754990925,
      "grad_norm": 3.640625,
      "learning_rate": 3.9648230158101674e-05,
      "loss": 0.123,
      "num_input_tokens_seen": 21924518,
      "step": 352
    },
    {
      "epoch": 0.06533575317604355,
      "grad_norm": 1.5625,
      "learning_rate": 3.9630915006171416e-05,
      "loss": 0.1086,
      "num_input_tokens_seen": 22403227,
      "step": 360
    },
    {
      "epoch": 0.06678765880217785,
      "grad_norm": 3.09375,
      "learning_rate": 3.961318784410932e-05,
      "loss": 0.1068,
      "num_input_tokens_seen": 22901361,
      "step": 368
    },
    {
      "epoch": 0.06823956442831217,
      "grad_norm": 0.9375,
      "learning_rate": 3.95950490439699e-05,
      "loss": 0.0931,
      "num_input_tokens_seen": 23408098,
      "step": 376
    },
    {
      "epoch": 0.06969147005444647,
      "grad_norm": 0.9296875,
      "learning_rate": 3.9576498986447026e-05,
      "loss": 0.0817,
      "num_input_tokens_seen": 23890867,
      "step": 384
    },
    {
      "epoch": 0.07114337568058077,
      "grad_norm": 1.2109375,
      "learning_rate": 3.9557538060866005e-05,
      "loss": 0.0917,
      "num_input_tokens_seen": 24393313,
      "step": 392
    },
    {
      "epoch": 0.07259528130671507,
      "grad_norm": 1.0078125,
      "learning_rate": 3.9538166665175354e-05,
      "loss": 0.0865,
      "num_input_tokens_seen": 24894282,
      "step": 400
    },
    {
      "epoch": 0.07404718693284937,
      "grad_norm": 1.640625,
      "learning_rate": 3.9518385205938446e-05,
      "loss": 0.1222,
      "num_input_tokens_seen": 25397169,
      "step": 408
    },
    {
      "epoch": 0.07549909255898367,
      "grad_norm": 1.5859375,
      "learning_rate": 3.949819409832502e-05,
      "loss": 0.0899,
      "num_input_tokens_seen": 25894407,
      "step": 416
    },
    {
      "epoch": 0.07695099818511797,
      "grad_norm": 1.1640625,
      "learning_rate": 3.947759376610242e-05,
      "loss": 0.0716,
      "num_input_tokens_seen": 26375741,
      "step": 424
    },
    {
      "epoch": 0.07840290381125227,
      "grad_norm": 2.15625,
      "learning_rate": 3.945658464162674e-05,
      "loss": 0.1094,
      "num_input_tokens_seen": 26881148,
      "step": 432
    },
    {
      "epoch": 0.07985480943738657,
      "grad_norm": 1.265625,
      "learning_rate": 3.9435167165833724e-05,
      "loss": 0.1517,
      "num_input_tokens_seen": 27373108,
      "step": 440
    },
    {
      "epoch": 0.08130671506352087,
      "grad_norm": 7.84375,
      "learning_rate": 3.9413341788229524e-05,
      "loss": 0.0959,
      "num_input_tokens_seen": 27852888,
      "step": 448
    },
    {
      "epoch": 0.08275862068965517,
      "grad_norm": 2.828125,
      "learning_rate": 3.939110896688126e-05,
      "loss": 0.0824,
      "num_input_tokens_seen": 28338065,
      "step": 456
    },
    {
      "epoch": 0.08421052631578947,
      "grad_norm": 5.5625,
      "learning_rate": 3.93684691684074e-05,
      "loss": 0.1234,
      "num_input_tokens_seen": 28842856,
      "step": 464
    },
    {
      "epoch": 0.08566243194192377,
      "grad_norm": 1.8515625,
      "learning_rate": 3.9345422867967995e-05,
      "loss": 0.1118,
      "num_input_tokens_seen": 29349096,
      "step": 472
    },
    {
      "epoch": 0.08711433756805807,
      "grad_norm": 1.421875,
      "learning_rate": 3.9321970549254664e-05,
      "loss": 0.1055,
      "num_input_tokens_seen": 29826034,
      "step": 480
    },
    {
      "epoch": 0.08856624319419237,
      "grad_norm": 18.75,
      "learning_rate": 3.929811270448049e-05,
      "loss": 0.1166,
      "num_input_tokens_seen": 30321718,
      "step": 488
    },
    {
      "epoch": 0.09001814882032667,
      "grad_norm": 3.46875,
      "learning_rate": 3.927384983436964e-05,
      "loss": 0.1134,
      "num_input_tokens_seen": 30812607,
      "step": 496
    },
    {
      "epoch": 0.09147005444646097,
      "grad_norm": 1.0390625,
      "learning_rate": 3.924918244814689e-05,
      "loss": 0.0805,
      "num_input_tokens_seen": 31304931,
      "step": 504
    },
    {
      "epoch": 0.09292196007259527,
      "grad_norm": 1.1015625,
      "learning_rate": 3.922411106352694e-05,
      "loss": 0.0849,
      "num_input_tokens_seen": 31792831,
      "step": 512
    },
    {
      "epoch": 0.09437386569872959,
      "grad_norm": 1.375,
      "learning_rate": 3.9198636206703516e-05,
      "loss": 0.0919,
      "num_input_tokens_seen": 32286282,
      "step": 520
    },
    {
      "epoch": 0.09582577132486389,
      "grad_norm": 1.40625,
      "learning_rate": 3.9172758412338346e-05,
      "loss": 0.0896,
      "num_input_tokens_seen": 32770941,
      "step": 528
    },
    {
      "epoch": 0.09727767695099819,
      "grad_norm": 4.8125,
      "learning_rate": 3.9146478223549974e-05,
      "loss": 0.0925,
      "num_input_tokens_seen": 33253136,
      "step": 536
    },
    {
      "epoch": 0.09872958257713249,
      "grad_norm": 1.1796875,
      "learning_rate": 3.9119796191902274e-05,
      "loss": 0.0656,
      "num_input_tokens_seen": 33760146,
      "step": 544
    },
    {
      "epoch": 0.10018148820326679,
      "grad_norm": 3.640625,
      "learning_rate": 3.9092712877392965e-05,
      "loss": 0.1162,
      "num_input_tokens_seen": 34251987,
      "step": 552
    },
    {
      "epoch": 0.10163339382940109,
      "grad_norm": 2.03125,
      "learning_rate": 3.906522884844181e-05,
      "loss": 0.1153,
      "num_input_tokens_seen": 34730598,
      "step": 560
    },
    {
      "epoch": 0.10308529945553539,
      "grad_norm": 1.390625,
      "learning_rate": 3.903734468187868e-05,
      "loss": 0.0731,
      "num_input_tokens_seen": 35215481,
      "step": 568
    },
    {
      "epoch": 0.10453720508166969,
      "grad_norm": 2.515625,
      "learning_rate": 3.900906096293148e-05,
      "loss": 0.0992,
      "num_input_tokens_seen": 35691971,
      "step": 576
    },
    {
      "epoch": 0.105989110707804,
      "grad_norm": 0.765625,
      "learning_rate": 3.8980378285213846e-05,
      "loss": 0.1058,
      "num_input_tokens_seen": 36191442,
      "step": 584
    },
    {
      "epoch": 0.1074410163339383,
      "grad_norm": 1.0078125,
      "learning_rate": 3.895129725071268e-05,
      "loss": 0.0841,
      "num_input_tokens_seen": 36677760,
      "step": 592
    },
    {
      "epoch": 0.1088929219600726,
      "grad_norm": 1.1015625,
      "learning_rate": 3.892181846977553e-05,
      "loss": 0.096,
      "num_input_tokens_seen": 37169594,
      "step": 600
    },
    {
      "epoch": 0.1103448275862069,
      "grad_norm": 1.0078125,
      "learning_rate": 3.8891942561097787e-05,
      "loss": 0.0865,
      "num_input_tokens_seen": 37658243,
      "step": 608
    },
    {
      "epoch": 0.1117967332123412,
      "grad_norm": 3.40625,
      "learning_rate": 3.8861670151709664e-05,
      "loss": 0.0926,
      "num_input_tokens_seen": 38172841,
      "step": 616
    },
    {
      "epoch": 0.1132486388384755,
      "grad_norm": 1.9296875,
      "learning_rate": 3.883100187696308e-05,
      "loss": 0.0844,
      "num_input_tokens_seen": 38680418,
      "step": 624
    },
    {
      "epoch": 0.1147005444646098,
      "grad_norm": 0.921875,
      "learning_rate": 3.87999383805183e-05,
      "loss": 0.0889,
      "num_input_tokens_seen": 39168241,
      "step": 632
    },
    {
      "epoch": 0.1161524500907441,
      "grad_norm": 0.9375,
      "learning_rate": 3.876848031433042e-05,
      "loss": 0.0931,
      "num_input_tokens_seen": 39636702,
      "step": 640
    },
    {
      "epoch": 0.1176043557168784,
      "grad_norm": 1.03125,
      "learning_rate": 3.8736628338635716e-05,
      "loss": 0.0638,
      "num_input_tokens_seen": 40118232,
      "step": 648
    },
    {
      "epoch": 0.1190562613430127,
      "grad_norm": 1.4140625,
      "learning_rate": 3.870438312193774e-05,
      "loss": 0.0775,
      "num_input_tokens_seen": 40614511,
      "step": 656
    },
    {
      "epoch": 0.120508166969147,
      "grad_norm": 1.2734375,
      "learning_rate": 3.8671745340993354e-05,
      "loss": 0.0902,
      "num_input_tokens_seen": 41136221,
      "step": 664
    },
    {
      "epoch": 0.12196007259528131,
      "grad_norm": 2.140625,
      "learning_rate": 3.863871568079845e-05,
      "loss": 0.1083,
      "num_input_tokens_seen": 41626515,
      "step": 672
    },
    {
      "epoch": 0.12341197822141561,
      "grad_norm": 1.265625,
      "learning_rate": 3.860529483457362e-05,
      "loss": 0.0914,
      "num_input_tokens_seen": 42128107,
      "step": 680
    },
    {
      "epoch": 0.12486388384754991,
      "grad_norm": 1.921875,
      "learning_rate": 3.8571483503749625e-05,
      "loss": 0.1172,
      "num_input_tokens_seen": 42626752,
      "step": 688
    },
    {
      "epoch": 0.12486388384754991,
      "eval_loss": 0.08887020498514175,
      "eval_runtime": 2566.1938,
      "eval_samples_per_second": 1.215,
      "eval_steps_per_second": 0.152,
      "num_input_tokens_seen": 42626752,
      "step": 688
    },
    {
      "epoch": 0.12631578947368421,
      "grad_norm": 1.1875,
      "learning_rate": 3.8537282397952604e-05,
      "loss": 0.0873,
      "num_input_tokens_seen": 43128274,
      "step": 696
    },
    {
      "epoch": 0.1277676950998185,
      "grad_norm": 0.92578125,
      "learning_rate": 3.8502692234989265e-05,
      "loss": 0.0807,
      "num_input_tokens_seen": 43630580,
      "step": 704
    },
    {
      "epoch": 0.12921960072595282,
      "grad_norm": 0.59375,
      "learning_rate": 3.846771374083175e-05,
      "loss": 0.0792,
      "num_input_tokens_seen": 44143904,
      "step": 712
    },
    {
      "epoch": 0.1306715063520871,
      "grad_norm": 1.015625,
      "learning_rate": 3.843234764960244e-05,
      "loss": 0.0808,
      "num_input_tokens_seen": 44635682,
      "step": 720
    },
    {
      "epoch": 0.13212341197822142,
      "grad_norm": 0.84375,
      "learning_rate": 3.839659470355853e-05,
      "loss": 0.0902,
      "num_input_tokens_seen": 45110870,
      "step": 728
    },
    {
      "epoch": 0.1335753176043557,
      "grad_norm": 0.96875,
      "learning_rate": 3.8360455653076446e-05,
      "loss": 0.0872,
      "num_input_tokens_seen": 45620246,
      "step": 736
    },
    {
      "epoch": 0.13502722323049002,
      "grad_norm": 0.79296875,
      "learning_rate": 3.832393125663613e-05,
      "loss": 0.1095,
      "num_input_tokens_seen": 46106634,
      "step": 744
    },
    {
      "epoch": 0.13647912885662433,
      "grad_norm": 1.1875,
      "learning_rate": 3.8287022280805064e-05,
      "loss": 0.1008,
      "num_input_tokens_seen": 46599497,
      "step": 752
    },
    {
      "epoch": 0.13793103448275862,
      "grad_norm": 0.80859375,
      "learning_rate": 3.824972950022224e-05,
      "loss": 0.0761,
      "num_input_tokens_seen": 47098121,
      "step": 760
    },
    {
      "epoch": 0.13938294010889293,
      "grad_norm": 0.75390625,
      "learning_rate": 3.8212053697581855e-05,
      "loss": 0.0864,
      "num_input_tokens_seen": 47599433,
      "step": 768
    },
    {
      "epoch": 0.14083484573502722,
      "grad_norm": 0.77734375,
      "learning_rate": 3.817399566361692e-05,
      "loss": 0.0756,
      "num_input_tokens_seen": 48099996,
      "step": 776
    },
    {
      "epoch": 0.14228675136116153,
      "grad_norm": 0.8203125,
      "learning_rate": 3.8135556197082647e-05,
      "loss": 0.0991,
      "num_input_tokens_seen": 48591151,
      "step": 784
    },
    {
      "epoch": 0.14373865698729582,
      "grad_norm": 1.1875,
      "learning_rate": 3.809673610473967e-05,
      "loss": 0.0859,
      "num_input_tokens_seen": 49119581,
      "step": 792
    },
    {
      "epoch": 0.14519056261343014,
      "grad_norm": 0.99609375,
      "learning_rate": 3.805753620133715e-05,
      "loss": 0.0938,
      "num_input_tokens_seen": 49589057,
      "step": 800
    },
    {
      "epoch": 0.14664246823956442,
      "grad_norm": 1.8828125,
      "learning_rate": 3.801795730959565e-05,
      "loss": 0.0657,
      "num_input_tokens_seen": 50091363,
      "step": 808
    },
    {
      "epoch": 0.14809437386569874,
      "grad_norm": 1.5,
      "learning_rate": 3.7978000260189854e-05,
      "loss": 0.1124,
      "num_input_tokens_seen": 50595440,
      "step": 816
    },
    {
      "epoch": 0.14954627949183302,
      "grad_norm": 1.046875,
      "learning_rate": 3.793766589173117e-05,
      "loss": 0.0969,
      "num_input_tokens_seen": 51097536,
      "step": 824
    },
    {
      "epoch": 0.15099818511796734,
      "grad_norm": 1.2421875,
      "learning_rate": 3.789695505075013e-05,
      "loss": 0.0815,
      "num_input_tokens_seen": 51592933,
      "step": 832
    },
    {
      "epoch": 0.15245009074410162,
      "grad_norm": 0.640625,
      "learning_rate": 3.785586859167855e-05,
      "loss": 0.0806,
      "num_input_tokens_seen": 52089163,
      "step": 840
    },
    {
      "epoch": 0.15390199637023594,
      "grad_norm": 0.87109375,
      "learning_rate": 3.78144073768317e-05,
      "loss": 0.0628,
      "num_input_tokens_seen": 52591035,
      "step": 848
    },
    {
      "epoch": 0.15535390199637023,
      "grad_norm": 0.890625,
      "learning_rate": 3.7772572276390125e-05,
      "loss": 0.1,
      "num_input_tokens_seen": 53108139,
      "step": 856
    },
    {
      "epoch": 0.15680580762250454,
      "grad_norm": 1.3046875,
      "learning_rate": 3.7730364168381444e-05,
      "loss": 0.1083,
      "num_input_tokens_seen": 53612734,
      "step": 864
    },
    {
      "epoch": 0.15825771324863883,
      "grad_norm": 1.28125,
      "learning_rate": 3.768778393866186e-05,
      "loss": 0.0782,
      "num_input_tokens_seen": 54104981,
      "step": 872
    },
    {
      "epoch": 0.15970961887477314,
      "grad_norm": 1.1484375,
      "learning_rate": 3.764483248089763e-05,
      "loss": 0.1166,
      "num_input_tokens_seen": 54591628,
      "step": 880
    },
    {
      "epoch": 0.16116152450090745,
      "grad_norm": 0.89453125,
      "learning_rate": 3.760151069654626e-05,
      "loss": 0.0958,
      "num_input_tokens_seen": 55092240,
      "step": 888
    },
    {
      "epoch": 0.16261343012704174,
      "grad_norm": 1.0546875,
      "learning_rate": 3.75578194948376e-05,
      "loss": 0.0904,
      "num_input_tokens_seen": 55596058,
      "step": 896
    },
    {
      "epoch": 0.16406533575317606,
      "grad_norm": 0.76953125,
      "learning_rate": 3.751375979275479e-05,
      "loss": 0.0816,
      "num_input_tokens_seen": 56065485,
      "step": 904
    },
    {
      "epoch": 0.16551724137931034,
      "grad_norm": 1.0078125,
      "learning_rate": 3.746933251501497e-05,
      "loss": 0.0729,
      "num_input_tokens_seen": 56559741,
      "step": 912
    },
    {
      "epoch": 0.16696914700544466,
      "grad_norm": 0.875,
      "learning_rate": 3.7424538594049886e-05,
      "loss": 0.0626,
      "num_input_tokens_seen": 57042468,
      "step": 920
    },
    {
      "epoch": 0.16842105263157894,
      "grad_norm": 1.2109375,
      "learning_rate": 3.737937896998634e-05,
      "loss": 0.0872,
      "num_input_tokens_seen": 57530081,
      "step": 928
    },
    {
      "epoch": 0.16987295825771326,
      "grad_norm": 0.9296875,
      "learning_rate": 3.733385459062645e-05,
      "loss": 0.0863,
      "num_input_tokens_seen": 58052036,
      "step": 936
    },
    {
      "epoch": 0.17132486388384754,
      "grad_norm": 0.80078125,
      "learning_rate": 3.728796641142775e-05,
      "loss": 0.0747,
      "num_input_tokens_seen": 58558654,
      "step": 944
    },
    {
      "epoch": 0.17277676950998186,
      "grad_norm": 1.46875,
      "learning_rate": 3.724171539548311e-05,
      "loss": 0.0946,
      "num_input_tokens_seen": 59069780,
      "step": 952
    },
    {
      "epoch": 0.17422867513611615,
      "grad_norm": 0.94921875,
      "learning_rate": 3.71951025135006e-05,
      "loss": 0.0707,
      "num_input_tokens_seen": 59546270,
      "step": 960
    },
    {
      "epoch": 0.17568058076225046,
      "grad_norm": 1.1875,
      "learning_rate": 3.714812874378305e-05,
      "loss": 0.0796,
      "num_input_tokens_seen": 60050879,
      "step": 968
    },
    {
      "epoch": 0.17713248638838475,
      "grad_norm": 0.71484375,
      "learning_rate": 3.710079507220751e-05,
      "loss": 0.0908,
      "num_input_tokens_seen": 60542881,
      "step": 976
    },
    {
      "epoch": 0.17858439201451906,
      "grad_norm": 0.64453125,
      "learning_rate": 3.705310249220463e-05,
      "loss": 0.0799,
      "num_input_tokens_seen": 61009270,
      "step": 984
    },
    {
      "epoch": 0.18003629764065335,
      "grad_norm": 2.265625,
      "learning_rate": 3.700505200473774e-05,
      "loss": 0.0937,
      "num_input_tokens_seen": 61499242,
      "step": 992
    },
    {
      "epoch": 0.18148820326678766,
      "grad_norm": 2.78125,
      "learning_rate": 3.695664461828187e-05,
      "loss": 0.0913,
      "num_input_tokens_seen": 61987954,
      "step": 1000
    },
    {
      "epoch": 0.18294010889292195,
      "grad_norm": 0.78125,
      "learning_rate": 3.69078813488026e-05,
      "loss": 0.0546,
      "num_input_tokens_seen": 62482644,
      "step": 1008
    },
    {
      "epoch": 0.18439201451905626,
      "grad_norm": 1.609375,
      "learning_rate": 3.68587632197347e-05,
      "loss": 0.0788,
      "num_input_tokens_seen": 62950426,
      "step": 1016
    },
    {
      "epoch": 0.18584392014519055,
      "grad_norm": 0.90234375,
      "learning_rate": 3.6809291261960655e-05,
      "loss": 0.0865,
      "num_input_tokens_seen": 63454867,
      "step": 1024
    },
    {
      "epoch": 0.18729582577132486,
      "grad_norm": 1.390625,
      "learning_rate": 3.675946651378909e-05,
      "loss": 0.0832,
      "num_input_tokens_seen": 63980224,
      "step": 1032
    },
    {
      "epoch": 0.18729582577132486,
      "eval_loss": 0.07875645905733109,
      "eval_runtime": 2702.6122,
      "eval_samples_per_second": 1.153,
      "eval_steps_per_second": 0.144,
      "num_input_tokens_seen": 63980224,
      "step": 1032
    },
    {
      "epoch": 0.18874773139745918,
      "grad_norm": 0.9296875,
      "learning_rate": 3.67092900209329e-05,
      "loss": 0.0831,
      "num_input_tokens_seen": 64445080,
      "step": 1040
    },
    {
      "epoch": 0.19019963702359347,
      "grad_norm": 1.25,
      "learning_rate": 3.665876283648732e-05,
      "loss": 0.0697,
      "num_input_tokens_seen": 64941877,
      "step": 1048
    },
    {
      "epoch": 0.19165154264972778,
      "grad_norm": 0.62890625,
      "learning_rate": 3.660788602090788e-05,
      "loss": 0.0845,
      "num_input_tokens_seen": 65451057,
      "step": 1056
    },
    {
      "epoch": 0.19310344827586207,
      "grad_norm": 1.2265625,
      "learning_rate": 3.655666064198807e-05,
      "loss": 0.0822,
      "num_input_tokens_seen": 65944830,
      "step": 1064
    },
    {
      "epoch": 0.19455535390199638,
      "grad_norm": 1.125,
      "learning_rate": 3.6505087774836977e-05,
      "loss": 0.0974,
      "num_input_tokens_seen": 66458462,
      "step": 1072
    },
    {
      "epoch": 0.19600725952813067,
      "grad_norm": 0.7578125,
      "learning_rate": 3.645316850185672e-05,
      "loss": 0.0907,
      "num_input_tokens_seen": 66955532,
      "step": 1080
    },
    {
      "epoch": 0.19745916515426498,
      "grad_norm": 1.390625,
      "learning_rate": 3.6400903912719696e-05,
      "loss": 0.0791,
      "num_input_tokens_seen": 67453162,
      "step": 1088
    },
    {
      "epoch": 0.19891107078039927,
      "grad_norm": 0.859375,
      "learning_rate": 3.6348295104345764e-05,
      "loss": 0.0593,
      "num_input_tokens_seen": 67939256,
      "step": 1096
    },
    {
      "epoch": 0.20036297640653358,
      "grad_norm": 1.0546875,
      "learning_rate": 3.629534318087918e-05,
      "loss": 0.1024,
      "num_input_tokens_seen": 68457767,
      "step": 1104
    },
    {
      "epoch": 0.20181488203266787,
      "grad_norm": 1.0234375,
      "learning_rate": 3.624204925366543e-05,
      "loss": 0.0621,
      "num_input_tokens_seen": 68964063,
      "step": 1112
    },
    {
      "epoch": 0.20326678765880218,
      "grad_norm": 0.72265625,
      "learning_rate": 3.618841444122794e-05,
      "loss": 0.0685,
      "num_input_tokens_seen": 69443542,
      "step": 1120
    },
    {
      "epoch": 0.20471869328493647,
      "grad_norm": 0.83203125,
      "learning_rate": 3.613443986924455e-05,
      "loss": 0.0866,
      "num_input_tokens_seen": 69941074,
      "step": 1128
    },
    {
      "epoch": 0.20617059891107078,
      "grad_norm": 0.890625,
      "learning_rate": 3.60801266705239e-05,
      "loss": 0.0873,
      "num_input_tokens_seen": 70410725,
      "step": 1136
    },
    {
      "epoch": 0.20762250453720507,
      "grad_norm": 0.68359375,
      "learning_rate": 3.6025475984981716e-05,
      "loss": 0.0767,
      "num_input_tokens_seen": 70885703,
      "step": 1144
    },
    {
      "epoch": 0.20907441016333939,
      "grad_norm": 0.8515625,
      "learning_rate": 3.59704889596168e-05,
      "loss": 0.08,
      "num_input_tokens_seen": 71379385,
      "step": 1152
    },
    {
      "epoch": 0.21052631578947367,
      "grad_norm": 3.34375,
      "learning_rate": 3.5915166748486984e-05,
      "loss": 0.0974,
      "num_input_tokens_seen": 71863351,
      "step": 1160
    },
    {
      "epoch": 0.211978221415608,
      "grad_norm": 0.82421875,
      "learning_rate": 3.585951051268496e-05,
      "loss": 0.0799,
      "num_input_tokens_seen": 72351447,
      "step": 1168
    },
    {
      "epoch": 0.21343012704174227,
      "grad_norm": 0.76953125,
      "learning_rate": 3.5803521420313836e-05,
      "loss": 0.0598,
      "num_input_tokens_seen": 72853284,
      "step": 1176
    },
    {
      "epoch": 0.2148820326678766,
      "grad_norm": 0.92578125,
      "learning_rate": 3.574720064646267e-05,
      "loss": 0.1021,
      "num_input_tokens_seen": 73354953,
      "step": 1184
    },
    {
      "epoch": 0.2163339382940109,
      "grad_norm": 0.87890625,
      "learning_rate": 3.5690549373181785e-05,
      "loss": 0.0749,
      "num_input_tokens_seen": 73851645,
      "step": 1192
    },
    {
      "epoch": 0.2177858439201452,
      "grad_norm": 1.1015625,
      "learning_rate": 3.563356878945797e-05,
      "loss": 0.0677,
      "num_input_tokens_seen": 74351802,
      "step": 1200
    },
    {
      "epoch": 0.2192377495462795,
      "grad_norm": 0.84765625,
      "learning_rate": 3.557626009118951e-05,
      "loss": 0.0632,
      "num_input_tokens_seen": 74849173,
      "step": 1208
    },
    {
      "epoch": 0.2206896551724138,
      "grad_norm": 0.67578125,
      "learning_rate": 3.551862448116113e-05,
      "loss": 0.1037,
      "num_input_tokens_seen": 75333244,
      "step": 1216
    },
    {
      "epoch": 0.2221415607985481,
      "grad_norm": 1.0234375,
      "learning_rate": 3.546066316901869e-05,
      "loss": 0.0675,
      "num_input_tokens_seen": 75799822,
      "step": 1224
    },
    {
      "epoch": 0.2235934664246824,
      "grad_norm": 0.89453125,
      "learning_rate": 3.540237737124384e-05,
      "loss": 0.0684,
      "num_input_tokens_seen": 76300896,
      "step": 1232
    },
    {
      "epoch": 0.2250453720508167,
      "grad_norm": 1.1328125,
      "learning_rate": 3.534376831112848e-05,
      "loss": 0.0757,
      "num_input_tokens_seen": 76787655,
      "step": 1240
    },
    {
      "epoch": 0.226497277676951,
      "grad_norm": 0.8203125,
      "learning_rate": 3.528483721874907e-05,
      "loss": 0.0651,
      "num_input_tokens_seen": 77298718,
      "step": 1248
    },
    {
      "epoch": 0.2279491833030853,
      "grad_norm": 1.4921875,
      "learning_rate": 3.522558533094084e-05,
      "loss": 0.0863,
      "num_input_tokens_seen": 77797727,
      "step": 1256
    },
    {
      "epoch": 0.2294010889292196,
      "grad_norm": 0.5859375,
      "learning_rate": 3.51660138912718e-05,
      "loss": 0.0885,
      "num_input_tokens_seen": 78292669,
      "step": 1264
    },
    {
      "epoch": 0.2308529945553539,
      "grad_norm": 0.58203125,
      "learning_rate": 3.510612415001668e-05,
      "loss": 0.0892,
      "num_input_tokens_seen": 78800617,
      "step": 1272
    },
    {
      "epoch": 0.2323049001814882,
      "grad_norm": 0.88671875,
      "learning_rate": 3.5045917364130644e-05,
      "loss": 0.0527,
      "num_input_tokens_seen": 79317483,
      "step": 1280
    },
    {
      "epoch": 0.2337568058076225,
      "grad_norm": 0.61328125,
      "learning_rate": 3.4985394797222954e-05,
      "loss": 0.0587,
      "num_input_tokens_seen": 79807917,
      "step": 1288
    },
    {
      "epoch": 0.2352087114337568,
      "grad_norm": 1.3515625,
      "learning_rate": 3.49245577195304e-05,
      "loss": 0.0546,
      "num_input_tokens_seen": 80289419,
      "step": 1296
    },
    {
      "epoch": 0.2366606170598911,
      "grad_norm": 2.1875,
      "learning_rate": 3.4863407407890696e-05,
      "loss": 0.0982,
      "num_input_tokens_seen": 80784249,
      "step": 1304
    },
    {
      "epoch": 0.2381125226860254,
      "grad_norm": 2.59375,
      "learning_rate": 3.480194514571564e-05,
      "loss": 0.0965,
      "num_input_tokens_seen": 81278666,
      "step": 1312
    },
    {
      "epoch": 0.2395644283121597,
      "grad_norm": 1.2109375,
      "learning_rate": 3.474017222296419e-05,
      "loss": 0.0984,
      "num_input_tokens_seen": 81786558,
      "step": 1320
    },
    {
      "epoch": 0.241016333938294,
      "grad_norm": 0.6328125,
      "learning_rate": 3.4678089936115395e-05,
      "loss": 0.1122,
      "num_input_tokens_seen": 82281843,
      "step": 1328
    },
    {
      "epoch": 0.2424682395644283,
      "grad_norm": 2.765625,
      "learning_rate": 3.461569958814119e-05,
      "loss": 0.0745,
      "num_input_tokens_seen": 82776869,
      "step": 1336
    },
    {
      "epoch": 0.24392014519056263,
      "grad_norm": 0.984375,
      "learning_rate": 3.455300248847903e-05,
      "loss": 0.1094,
      "num_input_tokens_seen": 83275171,
      "step": 1344
    },
    {
      "epoch": 0.2453720508166969,
      "grad_norm": 1.03125,
      "learning_rate": 3.448999995300443e-05,
      "loss": 0.0663,
      "num_input_tokens_seen": 83755833,
      "step": 1352
    },
    {
      "epoch": 0.24682395644283123,
      "grad_norm": 1.5078125,
      "learning_rate": 3.4426693304003324e-05,
      "loss": 0.0879,
      "num_input_tokens_seen": 84237888,
      "step": 1360
    },
    {
      "epoch": 0.2482758620689655,
      "grad_norm": 1.0859375,
      "learning_rate": 3.4363083870144346e-05,
      "loss": 0.0661,
      "num_input_tokens_seen": 84739837,
      "step": 1368
    },
    {
      "epoch": 0.24972776769509983,
      "grad_norm": 1.3046875,
      "learning_rate": 3.4299172986450906e-05,
      "loss": 0.0764,
      "num_input_tokens_seen": 85221444,
      "step": 1376
    },
    {
      "epoch": 0.24972776769509983,
      "eval_loss": 0.08076217025518417,
      "eval_runtime": 2579.1691,
      "eval_samples_per_second": 1.209,
      "eval_steps_per_second": 0.151,
      "num_input_tokens_seen": 85221444,
      "step": 1376
    },
    {
      "epoch": 0.25117967332123414,
      "grad_norm": 1.0078125,
      "learning_rate": 3.4234961994273206e-05,
      "loss": 0.0714,
      "num_input_tokens_seen": 85711647,
      "step": 1384
    },
    {
      "epoch": 0.25263157894736843,
      "grad_norm": 0.62109375,
      "learning_rate": 3.417045224126004e-05,
      "loss": 0.0774,
      "num_input_tokens_seen": 86223550,
      "step": 1392
    },
    {
      "epoch": 0.2540834845735027,
      "grad_norm": 1.265625,
      "learning_rate": 3.410564508133058e-05,
      "loss": 0.0872,
      "num_input_tokens_seen": 86721404,
      "step": 1400
    },
    {
      "epoch": 0.255535390199637,
      "grad_norm": 1.3046875,
      "learning_rate": 3.40405418746459e-05,
      "loss": 0.0729,
      "num_input_tokens_seen": 87180793,
      "step": 1408
    },
    {
      "epoch": 0.25698729582577134,
      "grad_norm": 0.8984375,
      "learning_rate": 3.397514398758046e-05,
      "loss": 0.0732,
      "num_input_tokens_seen": 87680677,
      "step": 1416
    },
    {
      "epoch": 0.25843920145190563,
      "grad_norm": 0.5703125,
      "learning_rate": 3.39094527926934e-05,
      "loss": 0.0765,
      "num_input_tokens_seen": 88187512,
      "step": 1424
    },
    {
      "epoch": 0.2598911070780399,
      "grad_norm": 1.0546875,
      "learning_rate": 3.384346966869976e-05,
      "loss": 0.0684,
      "num_input_tokens_seen": 88692751,
      "step": 1432
    },
    {
      "epoch": 0.2613430127041742,
      "grad_norm": 2.34375,
      "learning_rate": 3.377719600044156e-05,
      "loss": 0.0878,
      "num_input_tokens_seen": 89183444,
      "step": 1440
    },
    {
      "epoch": 0.26279491833030855,
      "grad_norm": 0.5234375,
      "learning_rate": 3.371063317885868e-05,
      "loss": 0.0738,
      "num_input_tokens_seen": 89681459,
      "step": 1448
    },
    {
      "epoch": 0.26424682395644283,
      "grad_norm": 0.8046875,
      "learning_rate": 3.364378260095972e-05,
      "loss": 0.075,
      "num_input_tokens_seen": 90168008,
      "step": 1456
    },
    {
      "epoch": 0.2656987295825771,
      "grad_norm": 0.984375,
      "learning_rate": 3.3576645669792634e-05,
      "loss": 0.0606,
      "num_input_tokens_seen": 90654438,
      "step": 1464
    },
    {
      "epoch": 0.2671506352087114,
      "grad_norm": 1.1796875,
      "learning_rate": 3.350922379441534e-05,
      "loss": 0.0853,
      "num_input_tokens_seen": 91167951,
      "step": 1472
    },
    {
      "epoch": 0.26860254083484575,
      "grad_norm": 0.8828125,
      "learning_rate": 3.3441518389866075e-05,
      "loss": 0.0518,
      "num_input_tokens_seen": 91650643,
      "step": 1480
    },
    {
      "epoch": 0.27005444646098004,
      "grad_norm": 0.80859375,
      "learning_rate": 3.3373530877133764e-05,
      "loss": 0.0749,
      "num_input_tokens_seen": 92155336,
      "step": 1488
    },
    {
      "epoch": 0.2715063520871143,
      "grad_norm": 0.75390625,
      "learning_rate": 3.330526268312817e-05,
      "loss": 0.0583,
      "num_input_tokens_seen": 92628298,
      "step": 1496
    },
    {
      "epoch": 0.27295825771324866,
      "grad_norm": 0.8203125,
      "learning_rate": 3.323671524064992e-05,
      "loss": 0.0885,
      "num_input_tokens_seen": 93154901,
      "step": 1504
    },
    {
      "epoch": 0.27441016333938295,
      "grad_norm": 0.77734375,
      "learning_rate": 3.316788998836048e-05,
      "loss": 0.0583,
      "num_input_tokens_seen": 93650095,
      "step": 1512
    },
    {
      "epoch": 0.27586206896551724,
      "grad_norm": 4.5625,
      "learning_rate": 3.309878837075193e-05,
      "loss": 0.0764,
      "num_input_tokens_seen": 94136210,
      "step": 1520
    },
    {
      "epoch": 0.2773139745916515,
      "grad_norm": 0.80078125,
      "learning_rate": 3.3029411838116654e-05,
      "loss": 0.0638,
      "num_input_tokens_seen": 94624523,
      "step": 1528
    },
    {
      "epoch": 0.27876588021778587,
      "grad_norm": 1.078125,
      "learning_rate": 3.295976184651691e-05,
      "loss": 0.0685,
      "num_input_tokens_seen": 95110498,
      "step": 1536
    },
    {
      "epoch": 0.28021778584392015,
      "grad_norm": 0.76171875,
      "learning_rate": 3.288983985775426e-05,
      "loss": 0.0853,
      "num_input_tokens_seen": 95620511,
      "step": 1544
    },
    {
      "epoch": 0.28166969147005444,
      "grad_norm": 0.73046875,
      "learning_rate": 3.281964733933889e-05,
      "loss": 0.0779,
      "num_input_tokens_seen": 96130692,
      "step": 1552
    },
    {
      "epoch": 0.2831215970961887,
      "grad_norm": 0.80078125,
      "learning_rate": 3.274918576445882e-05,
      "loss": 0.0713,
      "num_input_tokens_seen": 96638367,
      "step": 1560
    },
    {
      "epoch": 0.28457350272232307,
      "grad_norm": 0.80859375,
      "learning_rate": 3.267845661194898e-05,
      "loss": 0.0653,
      "num_input_tokens_seen": 97154890,
      "step": 1568
    },
    {
      "epoch": 0.28602540834845736,
      "grad_norm": 0.87890625,
      "learning_rate": 3.260746136626016e-05,
      "loss": 0.0522,
      "num_input_tokens_seen": 97650182,
      "step": 1576
    },
    {
      "epoch": 0.28747731397459164,
      "grad_norm": 0.734375,
      "learning_rate": 3.253620151742788e-05,
      "loss": 0.0868,
      "num_input_tokens_seen": 98121695,
      "step": 1584
    },
    {
      "epoch": 0.28892921960072593,
      "grad_norm": 0.484375,
      "learning_rate": 3.24646785610411e-05,
      "loss": 0.0844,
      "num_input_tokens_seen": 98595616,
      "step": 1592
    },
    {
      "epoch": 0.29038112522686027,
      "grad_norm": 0.984375,
      "learning_rate": 3.239289399821083e-05,
      "loss": 0.0668,
      "num_input_tokens_seen": 99105755,
      "step": 1600
    },
    {
      "epoch": 0.29183303085299456,
      "grad_norm": 0.9765625,
      "learning_rate": 3.2320849335538636e-05,
      "loss": 0.0699,
      "num_input_tokens_seen": 99595258,
      "step": 1608
    },
    {
      "epoch": 0.29328493647912884,
      "grad_norm": 1.6328125,
      "learning_rate": 3.2248546085084995e-05,
      "loss": 0.0903,
      "num_input_tokens_seen": 100106643,
      "step": 1616
    },
    {
      "epoch": 0.29473684210526313,
      "grad_norm": 1.40625,
      "learning_rate": 3.21759857643376e-05,
      "loss": 0.0826,
      "num_input_tokens_seen": 100593045,
      "step": 1624
    },
    {
      "epoch": 0.2961887477313975,
      "grad_norm": 0.81640625,
      "learning_rate": 3.2103169896179476e-05,
      "loss": 0.084,
      "num_input_tokens_seen": 101094273,
      "step": 1632
    },
    {
      "epoch": 0.29764065335753176,
      "grad_norm": 1.046875,
      "learning_rate": 3.203010000885704e-05,
      "loss": 0.0742,
      "num_input_tokens_seen": 101593296,
      "step": 1640
    },
    {
      "epoch": 0.29909255898366605,
      "grad_norm": 0.75390625,
      "learning_rate": 3.1956777635948016e-05,
      "loss": 0.064,
      "num_input_tokens_seen": 102074203,
      "step": 1648
    },
    {
      "epoch": 0.3005444646098004,
      "grad_norm": 0.5703125,
      "learning_rate": 3.188320431632924e-05,
      "loss": 0.0569,
      "num_input_tokens_seen": 102576481,
      "step": 1656
    },
    {
      "epoch": 0.3019963702359347,
      "grad_norm": 0.61328125,
      "learning_rate": 3.180938159414439e-05,
      "loss": 0.0932,
      "num_input_tokens_seen": 103070807,
      "step": 1664
    },
    {
      "epoch": 0.30344827586206896,
      "grad_norm": 1.03125,
      "learning_rate": 3.173531101877155e-05,
      "loss": 0.0621,
      "num_input_tokens_seen": 103568290,
      "step": 1672
    },
    {
      "epoch": 0.30490018148820325,
      "grad_norm": 0.7734375,
      "learning_rate": 3.166099414479069e-05,
      "loss": 0.0579,
      "num_input_tokens_seen": 104059494,
      "step": 1680
    },
    {
      "epoch": 0.3063520871143376,
      "grad_norm": 1.1640625,
      "learning_rate": 3.158643253195108e-05,
      "loss": 0.0695,
      "num_input_tokens_seen": 104556886,
      "step": 1688
    },
    {
      "epoch": 0.3078039927404719,
      "grad_norm": 0.90625,
      "learning_rate": 3.15116277451385e-05,
      "loss": 0.0723,
      "num_input_tokens_seen": 105058562,
      "step": 1696
    },
    {
      "epoch": 0.30925589836660616,
      "grad_norm": 0.8203125,
      "learning_rate": 3.143658135434244e-05,
      "loss": 0.0652,
      "num_input_tokens_seen": 105536081,
      "step": 1704
    },
    {
      "epoch": 0.31070780399274045,
      "grad_norm": 0.80859375,
      "learning_rate": 3.136129493462312e-05,
      "loss": 0.0748,
      "num_input_tokens_seen": 106037792,
      "step": 1712
    },
    {
      "epoch": 0.3121597096188748,
      "grad_norm": 0.8203125,
      "learning_rate": 3.1285770066078445e-05,
      "loss": 0.072,
      "num_input_tokens_seen": 106546503,
      "step": 1720
    },
    {
      "epoch": 0.3121597096188748,
      "eval_loss": 0.06825637072324753,
      "eval_runtime": 2711.2246,
      "eval_samples_per_second": 1.15,
      "eval_steps_per_second": 0.144,
      "num_input_tokens_seen": 106546503,
      "step": 1720
    },
    {
      "epoch": 0.3136116152450091,
      "grad_norm": 1.3984375,
      "learning_rate": 3.121000833381084e-05,
      "loss": 0.0737,
      "num_input_tokens_seen": 107037952,
      "step": 1728
    },
    {
      "epoch": 0.31506352087114337,
      "grad_norm": 0.828125,
      "learning_rate": 3.113401132789399e-05,
      "loss": 0.0712,
      "num_input_tokens_seen": 107540349,
      "step": 1736
    },
    {
      "epoch": 0.31651542649727765,
      "grad_norm": 0.8515625,
      "learning_rate": 3.1057780643339465e-05,
      "loss": 0.0685,
      "num_input_tokens_seen": 108034983,
      "step": 1744
    },
    {
      "epoch": 0.317967332123412,
      "grad_norm": 0.80859375,
      "learning_rate": 3.098131788006322e-05,
      "loss": 0.0718,
      "num_input_tokens_seen": 108503192,
      "step": 1752
    },
    {
      "epoch": 0.3194192377495463,
      "grad_norm": 0.4921875,
      "learning_rate": 3.0904624642852065e-05,
      "loss": 0.076,
      "num_input_tokens_seen": 109019554,
      "step": 1760
    },
    {
      "epoch": 0.32087114337568057,
      "grad_norm": 1.265625,
      "learning_rate": 3.082770254132993e-05,
      "loss": 0.0549,
      "num_input_tokens_seen": 109504850,
      "step": 1768
    },
    {
      "epoch": 0.3223230490018149,
      "grad_norm": 0.66796875,
      "learning_rate": 3.075055318992412e-05,
      "loss": 0.068,
      "num_input_tokens_seen": 110008850,
      "step": 1776
    },
    {
      "epoch": 0.3237749546279492,
      "grad_norm": 0.78125,
      "learning_rate": 3.067317820783143e-05,
      "loss": 0.0676,
      "num_input_tokens_seen": 110528376,
      "step": 1784
    },
    {
      "epoch": 0.3252268602540835,
      "grad_norm": 0.62890625,
      "learning_rate": 3.0595579218984124e-05,
      "loss": 0.0862,
      "num_input_tokens_seen": 111026349,
      "step": 1792
    },
    {
      "epoch": 0.32667876588021777,
      "grad_norm": 0.71484375,
      "learning_rate": 3.05177578520159e-05,
      "loss": 0.0561,
      "num_input_tokens_seen": 111515922,
      "step": 1800
    },
    {
      "epoch": 0.3281306715063521,
      "grad_norm": 0.76171875,
      "learning_rate": 3.04397157402277e-05,
      "loss": 0.0599,
      "num_input_tokens_seen": 112007455,
      "step": 1808
    },
    {
      "epoch": 0.3295825771324864,
      "grad_norm": 0.60546875,
      "learning_rate": 3.0361454521553383e-05,
      "loss": 0.0856,
      "num_input_tokens_seen": 112491694,
      "step": 1816
    },
    {
      "epoch": 0.3310344827586207,
      "grad_norm": 0.69140625,
      "learning_rate": 3.028297583852541e-05,
      "loss": 0.055,
      "num_input_tokens_seen": 112968009,
      "step": 1824
    },
    {
      "epoch": 0.33248638838475497,
      "grad_norm": 1.2265625,
      "learning_rate": 3.020428133824035e-05,
      "loss": 0.0495,
      "num_input_tokens_seen": 113462356,
      "step": 1832
    },
    {
      "epoch": 0.3339382940108893,
      "grad_norm": 0.9140625,
      "learning_rate": 3.0125372672324285e-05,
      "loss": 0.0765,
      "num_input_tokens_seen": 113976443,
      "step": 1840
    },
    {
      "epoch": 0.3353901996370236,
      "grad_norm": 0.60546875,
      "learning_rate": 3.0046251496898177e-05,
      "loss": 0.0521,
      "num_input_tokens_seen": 114445408,
      "step": 1848
    },
    {
      "epoch": 0.3368421052631579,
      "grad_norm": 1.0,
      "learning_rate": 2.9966919472543098e-05,
      "loss": 0.0659,
      "num_input_tokens_seen": 114933077,
      "step": 1856
    },
    {
      "epoch": 0.3382940108892922,
      "grad_norm": 0.8203125,
      "learning_rate": 2.9887378264265387e-05,
      "loss": 0.0853,
      "num_input_tokens_seen": 115416098,
      "step": 1864
    },
    {
      "epoch": 0.3397459165154265,
      "grad_norm": 0.640625,
      "learning_rate": 2.9807629541461693e-05,
      "loss": 0.0611,
      "num_input_tokens_seen": 115937997,
      "step": 1872
    },
    {
      "epoch": 0.3411978221415608,
      "grad_norm": 0.76953125,
      "learning_rate": 2.972767497788393e-05,
      "loss": 0.048,
      "num_input_tokens_seen": 116441850,
      "step": 1880
    },
    {
      "epoch": 0.3426497277676951,
      "grad_norm": 1.046875,
      "learning_rate": 2.9647516251604192e-05,
      "loss": 0.0777,
      "num_input_tokens_seen": 116937086,
      "step": 1888
    },
    {
      "epoch": 0.3441016333938294,
      "grad_norm": 0.81640625,
      "learning_rate": 2.9567155044979466e-05,
      "loss": 0.0598,
      "num_input_tokens_seen": 117443956,
      "step": 1896
    },
    {
      "epoch": 0.3455535390199637,
      "grad_norm": 1.40625,
      "learning_rate": 2.9486593044616394e-05,
      "loss": 0.0686,
      "num_input_tokens_seen": 117937379,
      "step": 1904
    },
    {
      "epoch": 0.347005444646098,
      "grad_norm": 0.72265625,
      "learning_rate": 2.9405831941335816e-05,
      "loss": 0.053,
      "num_input_tokens_seen": 118423431,
      "step": 1912
    },
    {
      "epoch": 0.3484573502722323,
      "grad_norm": 0.5625,
      "learning_rate": 2.932487343013732e-05,
      "loss": 0.0485,
      "num_input_tokens_seen": 118938547,
      "step": 1920
    },
    {
      "epoch": 0.34990925589836663,
      "grad_norm": 0.7265625,
      "learning_rate": 2.9243719210163654e-05,
      "loss": 0.076,
      "num_input_tokens_seen": 119414827,
      "step": 1928
    },
    {
      "epoch": 0.3513611615245009,
      "grad_norm": 0.62890625,
      "learning_rate": 2.916237098466507e-05,
      "loss": 0.037,
      "num_input_tokens_seen": 119906010,
      "step": 1936
    },
    {
      "epoch": 0.3528130671506352,
      "grad_norm": 0.66015625,
      "learning_rate": 2.9080830460963563e-05,
      "loss": 0.0561,
      "num_input_tokens_seen": 120390508,
      "step": 1944
    },
    {
      "epoch": 0.3542649727767695,
      "grad_norm": 0.87890625,
      "learning_rate": 2.8999099350417065e-05,
      "loss": 0.0846,
      "num_input_tokens_seen": 120863309,
      "step": 1952
    },
    {
      "epoch": 0.35571687840290384,
      "grad_norm": 0.73046875,
      "learning_rate": 2.8917179368383493e-05,
      "loss": 0.0403,
      "num_input_tokens_seen": 121339176,
      "step": 1960
    },
    {
      "epoch": 0.3571687840290381,
      "grad_norm": 0.453125,
      "learning_rate": 2.883507223418478e-05,
      "loss": 0.0645,
      "num_input_tokens_seen": 121867501,
      "step": 1968
    },
    {
      "epoch": 0.3586206896551724,
      "grad_norm": 1.21875,
      "learning_rate": 2.875277967107076e-05,
      "loss": 0.0911,
      "num_input_tokens_seen": 122375421,
      "step": 1976
    },
    {
      "epoch": 0.3600725952813067,
      "grad_norm": 0.90234375,
      "learning_rate": 2.867030340618303e-05,
      "loss": 0.0454,
      "num_input_tokens_seen": 122856601,
      "step": 1984
    },
    {
      "epoch": 0.36152450090744104,
      "grad_norm": 0.546875,
      "learning_rate": 2.858764517051868e-05,
      "loss": 0.0615,
      "num_input_tokens_seen": 123347371,
      "step": 1992
    },
    {
      "epoch": 0.3629764065335753,
      "grad_norm": 0.369140625,
      "learning_rate": 2.850480669889397e-05,
      "loss": 0.0536,
      "num_input_tokens_seen": 123846779,
      "step": 2000
    },
    {
      "epoch": 0.3644283121597096,
      "grad_norm": 1.875,
      "learning_rate": 2.8421789729907928e-05,
      "loss": 0.0499,
      "num_input_tokens_seen": 124332390,
      "step": 2008
    },
    {
      "epoch": 0.3658802177858439,
      "grad_norm": 0.53125,
      "learning_rate": 2.833859600590583e-05,
      "loss": 0.076,
      "num_input_tokens_seen": 124806640,
      "step": 2016
    },
    {
      "epoch": 0.36733212341197824,
      "grad_norm": 0.98828125,
      "learning_rate": 2.825522727294268e-05,
      "loss": 0.0347,
      "num_input_tokens_seen": 125289556,
      "step": 2024
    },
    {
      "epoch": 0.3687840290381125,
      "grad_norm": 0.765625,
      "learning_rate": 2.817168528074654e-05,
      "loss": 0.0854,
      "num_input_tokens_seen": 125783042,
      "step": 2032
    },
    {
      "epoch": 0.3702359346642468,
      "grad_norm": 0.7109375,
      "learning_rate": 2.8087971782681774e-05,
      "loss": 0.0731,
      "num_input_tokens_seen": 126277662,
      "step": 2040
    },
    {
      "epoch": 0.3716878402903811,
      "grad_norm": 0.7265625,
      "learning_rate": 2.8004088535712315e-05,
      "loss": 0.0833,
      "num_input_tokens_seen": 126770182,
      "step": 2048
    },
    {
      "epoch": 0.37313974591651544,
      "grad_norm": 0.84375,
      "learning_rate": 2.7920037300364746e-05,
      "loss": 0.0752,
      "num_input_tokens_seen": 127265873,
      "step": 2056
    },
    {
      "epoch": 0.37459165154264973,
      "grad_norm": 1.046875,
      "learning_rate": 2.783581984069134e-05,
      "loss": 0.0652,
      "num_input_tokens_seen": 127767598,
      "step": 2064
    },
    {
      "epoch": 0.37459165154264973,
      "eval_loss": 0.06295192986726761,
      "eval_runtime": 2754.9055,
      "eval_samples_per_second": 1.131,
      "eval_steps_per_second": 0.142,
      "num_input_tokens_seen": 127767598,
      "step": 2064
    },
    {
      "epoch": 0.376043557168784,
      "grad_norm": 1.9609375,
      "learning_rate": 2.7751437924233093e-05,
      "loss": 0.06,
      "num_input_tokens_seen": 128256289,
      "step": 2072
    },
    {
      "epoch": 0.37749546279491836,
      "grad_norm": 1.421875,
      "learning_rate": 2.7666893321982548e-05,
      "loss": 0.0714,
      "num_input_tokens_seen": 128789423,
      "step": 2080
    },
    {
      "epoch": 0.37894736842105264,
      "grad_norm": 0.7265625,
      "learning_rate": 2.758218780834671e-05,
      "loss": 0.0608,
      "num_input_tokens_seen": 129283910,
      "step": 2088
    },
    {
      "epoch": 0.38039927404718693,
      "grad_norm": 0.87109375,
      "learning_rate": 2.7497323161109734e-05,
      "loss": 0.0567,
      "num_input_tokens_seen": 129762227,
      "step": 2096
    },
    {
      "epoch": 0.3818511796733212,
      "grad_norm": 0.71484375,
      "learning_rate": 2.741230116139565e-05,
      "loss": 0.0822,
      "num_input_tokens_seen": 130260949,
      "step": 2104
    },
    {
      "epoch": 0.38330308529945556,
      "grad_norm": 1.328125,
      "learning_rate": 2.7327123593630984e-05,
      "loss": 0.0744,
      "num_input_tokens_seen": 130738461,
      "step": 2112
    },
    {
      "epoch": 0.38475499092558985,
      "grad_norm": 0.70703125,
      "learning_rate": 2.7241792245507284e-05,
      "loss": 0.0428,
      "num_input_tokens_seen": 131250070,
      "step": 2120
    },
    {
      "epoch": 0.38620689655172413,
      "grad_norm": 1.0234375,
      "learning_rate": 2.715630890794362e-05,
      "loss": 0.0764,
      "num_input_tokens_seen": 131731607,
      "step": 2128
    },
    {
      "epoch": 0.3876588021778584,
      "grad_norm": 0.92578125,
      "learning_rate": 2.7070675375048984e-05,
      "loss": 0.0464,
      "num_input_tokens_seen": 132241144,
      "step": 2136
    },
    {
      "epoch": 0.38911070780399276,
      "grad_norm": 0.83984375,
      "learning_rate": 2.698489344408464e-05,
      "loss": 0.0598,
      "num_input_tokens_seen": 132728134,
      "step": 2144
    },
    {
      "epoch": 0.39056261343012705,
      "grad_norm": 1.1953125,
      "learning_rate": 2.689896491542642e-05,
      "loss": 0.0897,
      "num_input_tokens_seen": 133209860,
      "step": 2152
    },
    {
      "epoch": 0.39201451905626133,
      "grad_norm": 1.1015625,
      "learning_rate": 2.681289159252689e-05,
      "loss": 0.0525,
      "num_input_tokens_seen": 133711627,
      "step": 2160
    },
    {
      "epoch": 0.3934664246823956,
      "grad_norm": 0.65625,
      "learning_rate": 2.6726675281877567e-05,
      "loss": 0.0602,
      "num_input_tokens_seen": 134198176,
      "step": 2168
    },
    {
      "epoch": 0.39491833030852996,
      "grad_norm": 0.69921875,
      "learning_rate": 2.6640317792970947e-05,
      "loss": 0.0562,
      "num_input_tokens_seen": 134689114,
      "step": 2176
    },
    {
      "epoch": 0.39637023593466425,
      "grad_norm": 0.72265625,
      "learning_rate": 2.6553820938262557e-05,
      "loss": 0.0341,
      "num_input_tokens_seen": 135179499,
      "step": 2184
    },
    {
      "epoch": 0.39782214156079854,
      "grad_norm": 1.0234375,
      "learning_rate": 2.6467186533132906e-05,
      "loss": 0.0783,
      "num_input_tokens_seen": 135700208,
      "step": 2192
    },
    {
      "epoch": 0.3992740471869328,
      "grad_norm": 0.58984375,
      "learning_rate": 2.638041639584939e-05,
      "loss": 0.0604,
      "num_input_tokens_seen": 136212202,
      "step": 2200
    },
    {
      "epoch": 0.40072595281306717,
      "grad_norm": 0.55859375,
      "learning_rate": 2.6293512347528122e-05,
      "loss": 0.0591,
      "num_input_tokens_seen": 136698380,
      "step": 2208
    },
    {
      "epoch": 0.40217785843920145,
      "grad_norm": 0.66796875,
      "learning_rate": 2.6206476212095734e-05,
      "loss": 0.0743,
      "num_input_tokens_seen": 137191271,
      "step": 2216
    },
    {
      "epoch": 0.40362976406533574,
      "grad_norm": 0.5859375,
      "learning_rate": 2.6119309816251042e-05,
      "loss": 0.0437,
      "num_input_tokens_seen": 137660173,
      "step": 2224
    },
    {
      "epoch": 0.4050816696914701,
      "grad_norm": 0.8671875,
      "learning_rate": 2.6032014989426784e-05,
      "loss": 0.0597,
      "num_input_tokens_seen": 138165909,
      "step": 2232
| }, | |
| { | |
| "epoch": 0.40653357531760437, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 2.594459356375116e-05, | |
| "loss": 0.0504, | |
| "num_input_tokens_seen": 138631528, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.40798548094373865, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 2.585704737400941e-05, | |
| "loss": 0.0611, | |
| "num_input_tokens_seen": 139130348, | |
| "step": 2248 | |
| }, | |
| { | |
| "epoch": 0.40943738656987294, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 2.57693782576053e-05, | |
| "loss": 0.0461, | |
| "num_input_tokens_seen": 139617268, | |
| "step": 2256 | |
| }, | |
| { | |
| "epoch": 0.4108892921960073, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 2.568158805452256e-05, | |
| "loss": 0.062, | |
| "num_input_tokens_seen": 140121646, | |
| "step": 2264 | |
| }, | |
| { | |
| "epoch": 0.41234119782214157, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 2.559367860728627e-05, | |
| "loss": 0.0506, | |
| "num_input_tokens_seen": 140625443, | |
| "step": 2272 | |
| }, | |
| { | |
| "epoch": 0.41379310344827586, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 2.5505651760924182e-05, | |
| "loss": 0.0757, | |
| "num_input_tokens_seen": 141135512, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.41524500907441014, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 2.5417509362927986e-05, | |
| "loss": 0.078, | |
| "num_input_tokens_seen": 141614186, | |
| "step": 2288 | |
| }, | |
| { | |
| "epoch": 0.4166969147005445, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 2.5329253263214573e-05, | |
| "loss": 0.0549, | |
| "num_input_tokens_seen": 142126285, | |
| "step": 2296 | |
| }, | |
| { | |
| "epoch": 0.41814882032667877, | |
| "grad_norm": 0.49609375, | |
| "learning_rate": 2.5240885314087162e-05, | |
| "loss": 0.0592, | |
| "num_input_tokens_seen": 142609607, | |
| "step": 2304 | |
| }, | |
| { | |
| "epoch": 0.41960072595281306, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 2.5152407370196467e-05, | |
| "loss": 0.0477, | |
| "num_input_tokens_seen": 143090080, | |
| "step": 2312 | |
| }, | |
| { | |
| "epoch": 0.42105263157894735, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 2.5063821288501746e-05, | |
| "loss": 0.0576, | |
| "num_input_tokens_seen": 143576776, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.4225045372050817, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 2.4975128928231823e-05, | |
| "loss": 0.0671, | |
| "num_input_tokens_seen": 144070311, | |
| "step": 2328 | |
| }, | |
| { | |
| "epoch": 0.423956442831216, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 2.4886332150846092e-05, | |
| "loss": 0.0637, | |
| "num_input_tokens_seen": 144581612, | |
| "step": 2336 | |
| }, | |
| { | |
| "epoch": 0.42540834845735026, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 2.4797432819995427e-05, | |
| "loss": 0.0496, | |
| "num_input_tokens_seen": 145085129, | |
| "step": 2344 | |
| }, | |
| { | |
| "epoch": 0.42686025408348455, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 2.4708432801483086e-05, | |
| "loss": 0.0662, | |
| "num_input_tokens_seen": 145568633, | |
| "step": 2352 | |
| }, | |
| { | |
| "epoch": 0.4283121597096189, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 2.4619333963225525e-05, | |
| "loss": 0.059, | |
| "num_input_tokens_seen": 146076350, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.4297640653357532, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 2.4530138175213222e-05, | |
| "loss": 0.1076, | |
| "num_input_tokens_seen": 146577893, | |
| "step": 2368 | |
| }, | |
| { | |
| "epoch": 0.43121597096188746, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 2.4440847309471422e-05, | |
| "loss": 0.0794, | |
| "num_input_tokens_seen": 147074725, | |
| "step": 2376 | |
| }, | |
| { | |
| "epoch": 0.4326678765880218, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 2.435146324002083e-05, | |
| "loss": 0.0537, | |
| "num_input_tokens_seen": 147559139, | |
| "step": 2384 | |
| }, | |
| { | |
| "epoch": 0.4341197822141561, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 2.426198784283831e-05, | |
| "loss": 0.0429, | |
| "num_input_tokens_seen": 148055859, | |
| "step": 2392 | |
| }, | |
| { | |
| "epoch": 0.4355716878402904, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 2.4172422995817496e-05, | |
| "loss": 0.0583, | |
| "num_input_tokens_seen": 148559803, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.43702359346642466, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 2.408277057872936e-05, | |
| "loss": 0.0693, | |
| "num_input_tokens_seen": 149047633, | |
| "step": 2408 | |
| }, | |
| { | |
| "epoch": 0.43702359346642466, | |
| "eval_loss": 0.05809076130390167, | |
| "eval_runtime": 2813.328, | |
| "eval_samples_per_second": 1.108, | |
| "eval_steps_per_second": 0.139, | |
| "num_input_tokens_seen": 149047633, | |
| "step": 2408 | |
| }, | |
| { | |
| "epoch": 0.438475499092559, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 2.3993032473182796e-05, | |
| "loss": 0.0627, | |
| "num_input_tokens_seen": 149553600, | |
| "step": 2416 | |
| }, | |
| { | |
| "epoch": 0.4399274047186933, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 2.390321056258511e-05, | |
| "loss": 0.0518, | |
| "num_input_tokens_seen": 150031007, | |
| "step": 2424 | |
| }, | |
| { | |
| "epoch": 0.4413793103448276, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 2.3813306732102483e-05, | |
| "loss": 0.0564, | |
| "num_input_tokens_seen": 150506503, | |
| "step": 2432 | |
| }, | |
| { | |
| "epoch": 0.44283121597096187, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 2.3723322868620436e-05, | |
| "loss": 0.0728, | |
| "num_input_tokens_seen": 151018070, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.4442831215970962, | |
| "grad_norm": 0.453125, | |
| "learning_rate": 2.3633260860704188e-05, | |
| "loss": 0.0428, | |
| "num_input_tokens_seen": 151507916, | |
| "step": 2448 | |
| }, | |
| { | |
| "epoch": 0.4457350272232305, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 2.3543122598559053e-05, | |
| "loss": 0.0458, | |
| "num_input_tokens_seen": 151999967, | |
| "step": 2456 | |
| }, | |
| { | |
| "epoch": 0.4471869328493648, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 2.345290997399074e-05, | |
| "loss": 0.051, | |
| "num_input_tokens_seen": 152499025, | |
| "step": 2464 | |
| }, | |
| { | |
| "epoch": 0.44863883847549907, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 2.3362624880365677e-05, | |
| "loss": 0.0713, | |
| "num_input_tokens_seen": 152984867, | |
| "step": 2472 | |
| }, | |
| { | |
| "epoch": 0.4500907441016334, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 2.3272269212571262e-05, | |
| "loss": 0.0627, | |
| "num_input_tokens_seen": 153473082, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.4515426497277677, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 2.3181844866976076e-05, | |
| "loss": 0.048, | |
| "num_input_tokens_seen": 153951602, | |
| "step": 2488 | |
| }, | |
| { | |
| "epoch": 0.452994555353902, | |
| "grad_norm": 0.46875, | |
| "learning_rate": 2.3091353741390116e-05, | |
| "loss": 0.0476, | |
| "num_input_tokens_seen": 154432971, | |
| "step": 2496 | |
| }, | |
| { | |
| "epoch": 0.45444646098003627, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 2.3000797735024922e-05, | |
| "loss": 0.049, | |
| "num_input_tokens_seen": 154912331, | |
| "step": 2504 | |
| }, | |
| { | |
| "epoch": 0.4558983666061706, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 2.2910178748453765e-05, | |
| "loss": 0.0544, | |
| "num_input_tokens_seen": 155385055, | |
| "step": 2512 | |
| }, | |
| { | |
| "epoch": 0.4573502722323049, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 2.2819498683571718e-05, | |
| "loss": 0.0494, | |
| "num_input_tokens_seen": 155892191, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.4588021778584392, | |
| "grad_norm": 0.625, | |
| "learning_rate": 2.272875944355575e-05, | |
| "loss": 0.066, | |
| "num_input_tokens_seen": 156405102, | |
| "step": 2528 | |
| }, | |
| { | |
| "epoch": 0.46025408348457353, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 2.2637962932824803e-05, | |
| "loss": 0.0605, | |
| "num_input_tokens_seen": 156909466, | |
| "step": 2536 | |
| }, | |
| { | |
| "epoch": 0.4617059891107078, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 2.2547111056999808e-05, | |
| "loss": 0.0394, | |
| "num_input_tokens_seen": 157391122, | |
| "step": 2544 | |
| }, | |
| { | |
| "epoch": 0.4631578947368421, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 2.245620572286366e-05, | |
| "loss": 0.0525, | |
| "num_input_tokens_seen": 157880121, | |
| "step": 2552 | |
| }, | |
| { | |
| "epoch": 0.4646098003629764, | |
| "grad_norm": 0.494140625, | |
| "learning_rate": 2.2365248838321273e-05, | |
| "loss": 0.0491, | |
| "num_input_tokens_seen": 158360167, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.46606170598911073, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 2.2274242312359445e-05, | |
| "loss": 0.0528, | |
| "num_input_tokens_seen": 158867422, | |
| "step": 2568 | |
| }, | |
| { | |
| "epoch": 0.467513611615245, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 2.2183188055006867e-05, | |
| "loss": 0.0679, | |
| "num_input_tokens_seen": 159364296, | |
| "step": 2576 | |
| }, | |
| { | |
| "epoch": 0.4689655172413793, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 2.2092087977294e-05, | |
| "loss": 0.0744, | |
| "num_input_tokens_seen": 159890619, | |
| "step": 2584 | |
| }, | |
| { | |
| "epoch": 0.4704174228675136, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 2.2000943991212977e-05, | |
| "loss": 0.0419, | |
| "num_input_tokens_seen": 160398651, | |
| "step": 2592 | |
| }, | |
| { | |
| "epoch": 0.47186932849364793, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 2.190975800967747e-05, | |
| "loss": 0.0616, | |
| "num_input_tokens_seen": 160922909, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.4733212341197822, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 2.1818531946482543e-05, | |
| "loss": 0.0442, | |
| "num_input_tokens_seen": 161419902, | |
| "step": 2608 | |
| }, | |
| { | |
| "epoch": 0.4747731397459165, | |
| "grad_norm": 0.625, | |
| "learning_rate": 2.172726771626449e-05, | |
| "loss": 0.0469, | |
| "num_input_tokens_seen": 161929180, | |
| "step": 2616 | |
| }, | |
| { | |
| "epoch": 0.4762250453720508, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 2.163596723446065e-05, | |
| "loss": 0.0573, | |
| "num_input_tokens_seen": 162437709, | |
| "step": 2624 | |
| }, | |
| { | |
| "epoch": 0.47767695099818513, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 2.1544632417269194e-05, | |
| "loss": 0.052, | |
| "num_input_tokens_seen": 162950151, | |
| "step": 2632 | |
| }, | |
| { | |
| "epoch": 0.4791288566243194, | |
| "grad_norm": 0.90234375, | |
| "learning_rate": 2.145326518160893e-05, | |
| "loss": 0.0576, | |
| "num_input_tokens_seen": 163429462, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.4805807622504537, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 2.136186744507904e-05, | |
| "loss": 0.0577, | |
| "num_input_tokens_seen": 163939160, | |
| "step": 2648 | |
| }, | |
| { | |
| "epoch": 0.482032667876588, | |
| "grad_norm": 0.4921875, | |
| "learning_rate": 2.1270441125918882e-05, | |
| "loss": 0.051, | |
| "num_input_tokens_seen": 164446079, | |
| "step": 2656 | |
| }, | |
| { | |
| "epoch": 0.48348457350272234, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 2.1178988142967678e-05, | |
| "loss": 0.0489, | |
| "num_input_tokens_seen": 164936233, | |
| "step": 2664 | |
| }, | |
| { | |
| "epoch": 0.4849364791288566, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 2.108751041562427e-05, | |
| "loss": 0.0622, | |
| "num_input_tokens_seen": 165409965, | |
| "step": 2672 | |
| }, | |
| { | |
| "epoch": 0.4863883847549909, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 2.0996009863806834e-05, | |
| "loss": 0.0578, | |
| "num_input_tokens_seen": 165901841, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.48784029038112525, | |
| "grad_norm": 0.88671875, | |
| "learning_rate": 2.0904488407912575e-05, | |
| "loss": 0.0389, | |
| "num_input_tokens_seen": 166384603, | |
| "step": 2688 | |
| }, | |
| { | |
| "epoch": 0.48929219600725954, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 2.0812947968777437e-05, | |
| "loss": 0.0432, | |
| "num_input_tokens_seen": 166889709, | |
| "step": 2696 | |
| }, | |
| { | |
| "epoch": 0.4907441016333938, | |
| "grad_norm": 0.9296875, | |
| "learning_rate": 2.0721390467635788e-05, | |
| "loss": 0.0453, | |
| "num_input_tokens_seen": 167372121, | |
| "step": 2704 | |
| }, | |
| { | |
| "epoch": 0.4921960072595281, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 2.0629817826080073e-05, | |
| "loss": 0.0447, | |
| "num_input_tokens_seen": 167871991, | |
| "step": 2712 | |
| }, | |
| { | |
| "epoch": 0.49364791288566245, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 2.053823196602051e-05, | |
| "loss": 0.0543, | |
| "num_input_tokens_seen": 168369985, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.49509981851179674, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 2.044663480964474e-05, | |
| "loss": 0.0416, | |
| "num_input_tokens_seen": 168846412, | |
| "step": 2728 | |
| }, | |
| { | |
| "epoch": 0.496551724137931, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 2.0355028279377498e-05, | |
| "loss": 0.0467, | |
| "num_input_tokens_seen": 169335334, | |
| "step": 2736 | |
| }, | |
| { | |
| "epoch": 0.4980036297640653, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 2.026341429784025e-05, | |
| "loss": 0.0724, | |
| "num_input_tokens_seen": 169830612, | |
| "step": 2744 | |
| }, | |
| { | |
| "epoch": 0.49945553539019966, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 2.0171794787810842e-05, | |
| "loss": 0.0723, | |
| "num_input_tokens_seen": 170349739, | |
| "step": 2752 | |
| }, | |
| { | |
| "epoch": 0.49945553539019966, | |
| "eval_loss": 0.054387591779232025, | |
| "eval_runtime": 2838.6975, | |
| "eval_samples_per_second": 1.098, | |
| "eval_steps_per_second": 0.137, | |
| "num_input_tokens_seen": 170349739, | |
| "step": 2752 | |
| }, | |
| { | |
| "epoch": 0.5009074410163339, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 2.008017167218317e-05, | |
| "loss": 0.0365, | |
| "num_input_tokens_seen": 170843316, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.5023593466424683, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 1.9988546873926788e-05, | |
| "loss": 0.0456, | |
| "num_input_tokens_seen": 171324496, | |
| "step": 2768 | |
| }, | |
| { | |
| "epoch": 0.5038112522686026, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 1.9896922316046562e-05, | |
| "loss": 0.0416, | |
| "num_input_tokens_seen": 171829665, | |
| "step": 2776 | |
| }, | |
| { | |
| "epoch": 0.5052631578947369, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 1.980529992154233e-05, | |
| "loss": 0.0395, | |
| "num_input_tokens_seen": 172325874, | |
| "step": 2784 | |
| }, | |
| { | |
| "epoch": 0.5067150635208711, | |
| "grad_norm": 0.490234375, | |
| "learning_rate": 1.9713681613368506e-05, | |
| "loss": 0.0536, | |
| "num_input_tokens_seen": 172832464, | |
| "step": 2792 | |
| }, | |
| { | |
| "epoch": 0.5081669691470054, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 1.9622069314393753e-05, | |
| "loss": 0.0505, | |
| "num_input_tokens_seen": 173320567, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.5096188747731397, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 1.9530464947360615e-05, | |
| "loss": 0.0528, | |
| "num_input_tokens_seen": 173816293, | |
| "step": 2808 | |
| }, | |
| { | |
| "epoch": 0.511070780399274, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 1.943887043484515e-05, | |
| "loss": 0.0766, | |
| "num_input_tokens_seen": 174302982, | |
| "step": 2816 | |
| }, | |
| { | |
| "epoch": 0.5125226860254084, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 1.9347287699216602e-05, | |
| "loss": 0.0574, | |
| "num_input_tokens_seen": 174807598, | |
| "step": 2824 | |
| }, | |
| { | |
| "epoch": 0.5139745916515427, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 1.9255718662597044e-05, | |
| "loss": 0.0667, | |
| "num_input_tokens_seen": 175302323, | |
| "step": 2832 | |
| }, | |
| { | |
| "epoch": 0.515426497277677, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 1.9164165246821026e-05, | |
| "loss": 0.0434, | |
| "num_input_tokens_seen": 175782712, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.5168784029038113, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 1.9072629373395268e-05, | |
| "loss": 0.0573, | |
| "num_input_tokens_seen": 176252965, | |
| "step": 2848 | |
| }, | |
| { | |
| "epoch": 0.5183303085299455, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 1.8981112963458293e-05, | |
| "loss": 0.0541, | |
| "num_input_tokens_seen": 176746353, | |
| "step": 2856 | |
| }, | |
| { | |
| "epoch": 0.5197822141560798, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 1.8889617937740146e-05, | |
| "loss": 0.0457, | |
| "num_input_tokens_seen": 177252614, | |
| "step": 2864 | |
| }, | |
| { | |
| "epoch": 0.5212341197822141, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 1.879814621652206e-05, | |
| "loss": 0.0588, | |
| "num_input_tokens_seen": 177752505, | |
| "step": 2872 | |
| }, | |
| { | |
| "epoch": 0.5226860254083484, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 1.8706699719596138e-05, | |
| "loss": 0.0717, | |
| "num_input_tokens_seen": 178248588, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.5241379310344828, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 1.8615280366225113e-05, | |
| "loss": 0.0634, | |
| "num_input_tokens_seen": 178746624, | |
| "step": 2888 | |
| }, | |
| { | |
| "epoch": 0.5255898366606171, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 1.852389007510201e-05, | |
| "loss": 0.0573, | |
| "num_input_tokens_seen": 179239200, | |
| "step": 2896 | |
| }, | |
| { | |
| "epoch": 0.5270417422867514, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 1.8432530764309916e-05, | |
| "loss": 0.0574, | |
| "num_input_tokens_seen": 179731398, | |
| "step": 2904 | |
| }, | |
| { | |
| "epoch": 0.5284936479128857, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.8341204351281684e-05, | |
| "loss": 0.0786, | |
| "num_input_tokens_seen": 180216141, | |
| "step": 2912 | |
| }, | |
| { | |
| "epoch": 0.52994555353902, | |
| "grad_norm": 0.4765625, | |
| "learning_rate": 1.8249912752759748e-05, | |
| "loss": 0.0481, | |
| "num_input_tokens_seen": 180719896, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.5313974591651542, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 1.8158657884755832e-05, | |
| "loss": 0.0595, | |
| "num_input_tokens_seen": 181215874, | |
| "step": 2928 | |
| }, | |
| { | |
| "epoch": 0.5328493647912885, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 1.8067441662510782e-05, | |
| "loss": 0.0495, | |
| "num_input_tokens_seen": 181715660, | |
| "step": 2936 | |
| }, | |
| { | |
| "epoch": 0.5343012704174228, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 1.797626600045435e-05, | |
| "loss": 0.0507, | |
| "num_input_tokens_seen": 182189644, | |
| "step": 2944 | |
| }, | |
| { | |
| "epoch": 0.5357531760435572, | |
| "grad_norm": 0.88671875, | |
| "learning_rate": 1.7885132812165022e-05, | |
| "loss": 0.0457, | |
| "num_input_tokens_seen": 182692258, | |
| "step": 2952 | |
| }, | |
| { | |
| "epoch": 0.5372050816696915, | |
| "grad_norm": 0.48828125, | |
| "learning_rate": 1.7794044010329844e-05, | |
| "loss": 0.0454, | |
| "num_input_tokens_seen": 183173683, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.5386569872958258, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.7703001506704297e-05, | |
| "loss": 0.0612, | |
| "num_input_tokens_seen": 183670207, | |
| "step": 2968 | |
| }, | |
| { | |
| "epoch": 0.5401088929219601, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 1.761200721207215e-05, | |
| "loss": 0.0559, | |
| "num_input_tokens_seen": 184191448, | |
| "step": 2976 | |
| }, | |
| { | |
| "epoch": 0.5415607985480944, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 1.7521063036205383e-05, | |
| "loss": 0.032, | |
| "num_input_tokens_seen": 184672691, | |
| "step": 2984 | |
| }, | |
| { | |
| "epoch": 0.5430127041742286, | |
| "grad_norm": 0.625, | |
| "learning_rate": 1.7430170887824088e-05, | |
| "loss": 0.0597, | |
| "num_input_tokens_seen": 185179876, | |
| "step": 2992 | |
| }, | |
| { | |
| "epoch": 0.5444646098003629, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 1.7339332674556408e-05, | |
| "loss": 0.0566, | |
| "num_input_tokens_seen": 185659670, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.5459165154264973, | |
| "grad_norm": 0.279296875, | |
| "learning_rate": 1.724855030289852e-05, | |
| "loss": 0.028, | |
| "num_input_tokens_seen": 186148613, | |
| "step": 3008 | |
| }, | |
| { | |
| "epoch": 0.5473684210526316, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 1.715782567817459e-05, | |
| "loss": 0.0567, | |
| "num_input_tokens_seen": 186651171, | |
| "step": 3016 | |
| }, | |
| { | |
| "epoch": 0.5488203266787659, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 1.7067160704496817e-05, | |
| "loss": 0.0584, | |
| "num_input_tokens_seen": 187155654, | |
| "step": 3024 | |
| }, | |
| { | |
| "epoch": 0.5502722323049002, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 1.6976557284725434e-05, | |
| "loss": 0.0554, | |
| "num_input_tokens_seen": 187631290, | |
| "step": 3032 | |
| }, | |
| { | |
| "epoch": 0.5517241379310345, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 1.6886017320428817e-05, | |
| "loss": 0.0654, | |
| "num_input_tokens_seen": 188114682, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.5531760435571688, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 1.6795542711843535e-05, | |
| "loss": 0.0489, | |
| "num_input_tokens_seen": 188586657, | |
| "step": 3048 | |
| }, | |
| { | |
| "epoch": 0.554627949183303, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 1.670513535783448e-05, | |
| "loss": 0.0432, | |
| "num_input_tokens_seen": 189073577, | |
| "step": 3056 | |
| }, | |
| { | |
| "epoch": 0.5560798548094373, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 1.661479715585503e-05, | |
| "loss": 0.0559, | |
| "num_input_tokens_seen": 189536844, | |
| "step": 3064 | |
| }, | |
| { | |
| "epoch": 0.5575317604355717, | |
| "grad_norm": 0.90234375, | |
| "learning_rate": 1.6524530001907196e-05, | |
| "loss": 0.0552, | |
| "num_input_tokens_seen": 190005564, | |
| "step": 3072 | |
| }, | |
| { | |
| "epoch": 0.558983666061706, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 1.643433579050186e-05, | |
| "loss": 0.0479, | |
| "num_input_tokens_seen": 190494115, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.5604355716878403, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 1.6344216414618998e-05, | |
| "loss": 0.0558, | |
| "num_input_tokens_seen": 190997100, | |
| "step": 3088 | |
| }, | |
| { | |
| "epoch": 0.5618874773139746, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 1.625417376566794e-05, | |
| "loss": 0.0854, | |
| "num_input_tokens_seen": 191513399, | |
| "step": 3096 | |
| }, | |
| { | |
| "epoch": 0.5618874773139746, | |
| "eval_loss": 0.0525849312543869, | |
| "eval_runtime": 2614.8433, | |
| "eval_samples_per_second": 1.192, | |
| "eval_steps_per_second": 0.149, | |
| "num_input_tokens_seen": 191513399, | |
| "step": 3096 | |
| }, | |
| { | |
| "epoch": 0.5633393829401089, | |
| "grad_norm": 0.435546875, | |
| "learning_rate": 1.616420973344769e-05, | |
| "loss": 0.0467, | |
| "num_input_tokens_seen": 191995923, | |
| "step": 3104 | |
| }, | |
| { | |
| "epoch": 0.5647912885662432, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 1.607432620610727e-05, | |
| "loss": 0.0564, | |
| "num_input_tokens_seen": 192465595, | |
| "step": 3112 | |
| }, | |
| { | |
| "epoch": 0.5662431941923775, | |
| "grad_norm": 0.88671875, | |
| "learning_rate": 1.5984525070106065e-05, | |
| "loss": 0.0507, | |
| "num_input_tokens_seen": 192958871, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.5676950998185119, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 1.5894808210174252e-05, | |
| "loss": 0.0574, | |
| "num_input_tokens_seen": 193430762, | |
| "step": 3128 | |
| }, | |
| { | |
| "epoch": 0.5691470054446461, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 1.5805177509273226e-05, | |
| "loss": 0.0545, | |
| "num_input_tokens_seen": 193908960, | |
| "step": 3136 | |
| }, | |
| { | |
| "epoch": 0.5705989110707804, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 1.571563484855611e-05, | |
| "loss": 0.0532, | |
| "num_input_tokens_seen": 194435990, | |
| "step": 3144 | |
| }, | |
| { | |
| "epoch": 0.5720508166969147, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 1.5626182107328253e-05, | |
| "loss": 0.0402, | |
| "num_input_tokens_seen": 194945870, | |
| "step": 3152 | |
| }, | |
| { | |
| "epoch": 0.573502722323049, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 1.5536821163007768e-05, | |
| "loss": 0.0728, | |
| "num_input_tokens_seen": 195449492, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.5749546279491833, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 1.5447553891086178e-05, | |
| "loss": 0.0457, | |
| "num_input_tokens_seen": 195943237, | |
| "step": 3168 | |
| }, | |
| { | |
| "epoch": 0.5764065335753176, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 1.5358382165089008e-05, | |
| "loss": 0.0612, | |
| "num_input_tokens_seen": 196442834, | |
| "step": 3176 | |
| }, | |
| { | |
| "epoch": 0.5778584392014519, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 1.5269307856536486e-05, | |
| "loss": 0.0533, | |
| "num_input_tokens_seen": 196964754, | |
| "step": 3184 | |
| }, | |
| { | |
| "epoch": 0.5793103448275863, | |
| "grad_norm": 0.625, | |
| "learning_rate": 1.5180332834904276e-05, | |
| "loss": 0.0331, | |
| "num_input_tokens_seen": 197500093, | |
| "step": 3192 | |
| }, | |
| { | |
| "epoch": 0.5807622504537205, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 1.5091458967584199e-05, | |
| "loss": 0.0689, | |
| "num_input_tokens_seen": 197994930, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.5822141560798548, | |
| "grad_norm": 4.5, | |
| "learning_rate": 1.5002688119845086e-05, | |
| "loss": 0.0541, | |
| "num_input_tokens_seen": 198501247, | |
| "step": 3208 | |
| }, | |
| { | |
| "epoch": 0.5836660617059891, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 1.4914022154793613e-05, | |
| "loss": 0.0435, | |
| "num_input_tokens_seen": 199000501, | |
| "step": 3216 | |
| }, | |
| { | |
| "epoch": 0.5851179673321234, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 1.482546293333518e-05, | |
| "loss": 0.0557, | |
| "num_input_tokens_seen": 199479084, | |
| "step": 3224 | |
| }, | |
| { | |
| "epoch": 0.5865698729582577, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 1.473701231413489e-05, | |
| "loss": 0.0382, | |
| "num_input_tokens_seen": 200003062, | |
| "step": 3232 | |
| }, | |
| { | |
| "epoch": 0.588021778584392, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 1.464867215357851e-05, | |
| "loss": 0.0529, | |
| "num_input_tokens_seen": 200510961, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.5894736842105263, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 1.4560444305733521e-05, | |
| "loss": 0.0628, | |
| "num_input_tokens_seen": 201013169, | |
| "step": 3248 | |
| }, | |
| { | |
| "epoch": 0.5909255898366607, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 1.447233062231022e-05, | |
| "loss": 0.0322, | |
| "num_input_tokens_seen": 201480209, | |
| "step": 3256 | |
| }, | |
| { | |
| "epoch": 0.592377495462795, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 1.4384332952622815e-05, | |
| "loss": 0.0567, | |
| "num_input_tokens_seen": 201973667, | |
| "step": 3264 | |
| }, | |
| { | |
| "epoch": 0.5938294010889292, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.4296453143550664e-05, | |
| "loss": 0.0463, | |
| "num_input_tokens_seen": 202453986, | |
| "step": 3272 | |
| }, | |
| { | |
| "epoch": 0.5952813067150635, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 1.4208693039499468e-05, | |
| "loss": 0.0425, | |
| "num_input_tokens_seen": 202952414, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.5967332123411978, | |
| "grad_norm": 1.125, | |
| "learning_rate": 1.4121054482362592e-05, | |
| "loss": 0.048, | |
| "num_input_tokens_seen": 203470869, | |
| "step": 3288 | |
| }, | |
| { | |
| "epoch": 0.5981851179673321, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 1.4033539311482403e-05, | |
| "loss": 0.0449, | |
| "num_input_tokens_seen": 203946575, | |
| "step": 3296 | |
| }, | |
| { | |
| "epoch": 0.5996370235934664, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.3946149363611631e-05, | |
| "loss": 0.0579, | |
| "num_input_tokens_seen": 204443918, | |
| "step": 3304 | |
| }, | |
| { | |
| "epoch": 0.6010889292196008, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 1.3858886472874881e-05, | |
| "loss": 0.1074, | |
| "num_input_tokens_seen": 204950872, | |
| "step": 3312 | |
| }, | |
| { | |
| "epoch": 0.6025408348457351, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 1.3771752470730078e-05, | |
| "loss": 0.0591, | |
| "num_input_tokens_seen": 205454235, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.6039927404718693, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 1.3684749185930088e-05, | |
| "loss": 0.055, | |
| "num_input_tokens_seen": 205939041, | |
| "step": 3328 | |
| }, | |
| { | |
| "epoch": 0.6054446460980036, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 1.3597878444484272e-05, | |
| "loss": 0.0483, | |
| "num_input_tokens_seen": 206431197, | |
| "step": 3336 | |
| }, | |
| { | |
| "epoch": 0.6068965517241379, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 1.351114206962021e-05, | |
| "loss": 0.0568, | |
| "num_input_tokens_seen": 206925320, | |
| "step": 3344 | |
| }, | |
| { | |
| "epoch": 0.6083484573502722, | |
| "grad_norm": 0.490234375, | |
| "learning_rate": 1.3424541881745425e-05, | |
| "loss": 0.0553, | |
| "num_input_tokens_seen": 207406668, | |
| "step": 3352 | |
| }, | |
| { | |
| "epoch": 0.6098003629764065, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 1.333807969840916e-05, | |
| "loss": 0.0517, | |
| "num_input_tokens_seen": 207877782, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.6112522686025408, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 1.3251757334264253e-05, | |
| "loss": 0.04, | |
| "num_input_tokens_seen": 208344318, | |
| "step": 3368 | |
| }, | |
| { | |
| "epoch": 0.6127041742286752, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 1.316557660102903e-05, | |
| "loss": 0.0488, | |
| "num_input_tokens_seen": 208814858, | |
| "step": 3376 | |
| }, | |
| { | |
| "epoch": 0.6141560798548095, | |
| "grad_norm": 0.5, | |
| "learning_rate": 1.3079539307449311e-05, | |
| "loss": 0.044, | |
| "num_input_tokens_seen": 209297102, | |
| "step": 3384 | |
| }, | |
| { | |
| "epoch": 0.6156079854809438, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 1.2993647259260418e-05, | |
| "loss": 0.0469, | |
| "num_input_tokens_seen": 209774677, | |
| "step": 3392 | |
| }, | |
| { | |
| "epoch": 0.617059891107078, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 1.2907902259149287e-05, | |
| "loss": 0.0694, | |
| "num_input_tokens_seen": 210275870, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.6185117967332123, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 1.2822306106716645e-05, | |
| "loss": 0.0595, | |
| "num_input_tokens_seen": 210797636, | |
| "step": 3408 | |
| }, | |
| { | |
| "epoch": 0.6199637023593466, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 1.2736860598439215e-05, | |
| "loss": 0.0665, | |
| "num_input_tokens_seen": 211287706, | |
| "step": 3416 | |
| }, | |
| { | |
| "epoch": 0.6214156079854809, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 1.2651567527632045e-05, | |
| "loss": 0.0698, | |
| "num_input_tokens_seen": 211773156, | |
| "step": 3424 | |
| }, | |
| { | |
| "epoch": 0.6228675136116153, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 1.2566428684410843e-05, | |
| "loss": 0.0348, | |
| "num_input_tokens_seen": 212277142, | |
| "step": 3432 | |
| }, | |
| { | |
| "epoch": 0.6243194192377496, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 1.2481445855654415e-05, | |
| "loss": 0.0474, | |
| "num_input_tokens_seen": 212767513, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.6243194192377496, | |
| "eval_loss": 0.05037084221839905, | |
| "eval_runtime": 2739.6179, | |
| "eval_samples_per_second": 1.138, | |
| "eval_steps_per_second": 0.142, | |
| "num_input_tokens_seen": 212767513, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.6257713248638839, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 1.2396620824967169e-05, | |
| "loss": 0.1043, | |
| "num_input_tokens_seen": 213273298, | |
| "step": 3448 | |
| }, | |
| { | |
| "epoch": 0.6272232304900182, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 1.2311955372641674e-05, | |
| "loss": 0.0779, | |
| "num_input_tokens_seen": 213743600, | |
| "step": 3456 | |
| }, | |
| { | |
| "epoch": 0.6286751361161524, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 1.222745127562129e-05, | |
| "loss": 0.0474, | |
| "num_input_tokens_seen": 214249105, | |
| "step": 3464 | |
| }, | |
| { | |
| "epoch": 0.6301270417422867, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 1.2143110307462892e-05, | |
| "loss": 0.0914, | |
| "num_input_tokens_seen": 214743732, | |
| "step": 3472 | |
| }, | |
| { | |
| "epoch": 0.631578947368421, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.2058934238299625e-05, | |
| "loss": 0.0333, | |
| "num_input_tokens_seen": 215240214, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.6330308529945553, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 1.1974924834803765e-05, | |
| "loss": 0.0477, | |
| "num_input_tokens_seen": 215752215, | |
| "step": 3488 | |
| }, | |
| { | |
| "epoch": 0.6344827586206897, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 1.1891083860149653e-05, | |
| "loss": 0.0456, | |
| "num_input_tokens_seen": 216218681, | |
| "step": 3496 | |
| }, | |
| { | |
| "epoch": 0.635934664246824, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 1.1807413073976655e-05, | |
| "loss": 0.0537, | |
| "num_input_tokens_seen": 216717186, | |
| "step": 3504 | |
| }, | |
| { | |
| "epoch": 0.6373865698729583, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 1.1723914232352265e-05, | |
| "loss": 0.0543, | |
| "num_input_tokens_seen": 217224763, | |
| "step": 3512 | |
| }, | |
| { | |
| "epoch": 0.6388384754990926, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 1.1640589087735222e-05, | |
| "loss": 0.053, | |
| "num_input_tokens_seen": 217712978, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.6402903811252268, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 1.1557439388938772e-05, | |
| "loss": 0.0464, | |
| "num_input_tokens_seen": 218177197, | |
| "step": 3528 | |
| }, | |
| { | |
| "epoch": 0.6417422867513611, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 1.1474466881093904e-05, | |
| "loss": 0.0679, | |
| "num_input_tokens_seen": 218664950, | |
| "step": 3536 | |
| }, | |
| { | |
| "epoch": 0.6431941923774954, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 1.139167330561277e-05, | |
| "loss": 0.0551, | |
| "num_input_tokens_seen": 219190307, | |
| "step": 3544 | |
| }, | |
| { | |
| "epoch": 0.6446460980036298, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 1.130906040015211e-05, | |
| "loss": 0.045, | |
| "num_input_tokens_seen": 219656276, | |
| "step": 3552 | |
| }, | |
| { | |
| "epoch": 0.6460980036297641, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 1.1226629898576818e-05, | |
| "loss": 0.0516, | |
| "num_input_tokens_seen": 220153311, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.6475499092558984, | |
| "grad_norm": 1.125, | |
| "learning_rate": 1.1144383530923505e-05, | |
| "loss": 0.04, | |
| "num_input_tokens_seen": 220641855, | |
| "step": 3568 | |
| }, | |
| { | |
| "epoch": 0.6490018148820327, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 1.1062323023364217e-05, | |
| "loss": 0.0566, | |
| "num_input_tokens_seen": 221165742, | |
| "step": 3576 | |
| }, | |
| { | |
| "epoch": 0.650453720508167, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 1.0980450098170211e-05, | |
| "loss": 0.0598, | |
| "num_input_tokens_seen": 221645634, | |
| "step": 3584 | |
| }, | |
| { | |
| "epoch": 0.6519056261343013, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 1.0898766473675795e-05, | |
| "loss": 0.0582, | |
| "num_input_tokens_seen": 222128368, | |
| "step": 3592 | |
| }, | |
| { | |
| "epoch": 0.6533575317604355, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 1.081727386424225e-05, | |
| "loss": 0.0637, | |
| "num_input_tokens_seen": 222630366, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.6548094373865698, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 1.0735973980221898e-05, | |
| "loss": 0.0319, | |
| "num_input_tokens_seen": 223132889, | |
| "step": 3608 | |
| }, | |
| { | |
| "epoch": 0.6562613430127042, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 1.0654868527922157e-05, | |
| "loss": 0.0605, | |
| "num_input_tokens_seen": 223620866, | |
| "step": 3616 | |
| }, | |
| { | |
| "epoch": 0.6577132486388385, | |
| "grad_norm": 0.9296875, | |
| "learning_rate": 1.0573959209569736e-05, | |
| "loss": 0.0563, | |
| "num_input_tokens_seen": 224112161, | |
| "step": 3624 | |
| }, | |
| { | |
| "epoch": 0.6591651542649728, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 1.0493247723274949e-05, | |
| "loss": 0.0637, | |
| "num_input_tokens_seen": 224615692, | |
| "step": 3632 | |
| }, | |
| { | |
| "epoch": 0.6606170598911071, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 1.0412735762996022e-05, | |
| "loss": 0.0525, | |
| "num_input_tokens_seen": 225123661, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.6620689655172414, | |
| "grad_norm": 0.423828125, | |
| "learning_rate": 1.0332425018503573e-05, | |
| "loss": 0.0448, | |
| "num_input_tokens_seen": 225606843, | |
| "step": 3648 | |
| }, | |
| { | |
| "epoch": 0.6635208711433757, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 1.025231717534513e-05, | |
| "loss": 0.0511, | |
| "num_input_tokens_seen": 226083858, | |
| "step": 3656 | |
| }, | |
| { | |
| "epoch": 0.6649727767695099, | |
| "grad_norm": 0.625, | |
| "learning_rate": 1.0172413914809791e-05, | |
| "loss": 0.0297, | |
| "num_input_tokens_seen": 226586157, | |
| "step": 3664 | |
| }, | |
| { | |
| "epoch": 0.6664246823956442, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 1.0092716913892878e-05, | |
| "loss": 0.0542, | |
| "num_input_tokens_seen": 227090262, | |
| "step": 3672 | |
| }, | |
| { | |
| "epoch": 0.6678765880217786, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 1.0013227845260785e-05, | |
| "loss": 0.0496, | |
| "num_input_tokens_seen": 227568348, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.6693284936479129, | |
| "grad_norm": 0.431640625, | |
| "learning_rate": 9.933948377215873e-06, | |
| "loss": 0.0474, | |
| "num_input_tokens_seen": 228069156, | |
| "step": 3688 | |
| }, | |
| { | |
| "epoch": 0.6707803992740472, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 9.85488017366143e-06, | |
| "loss": 0.0276, | |
| "num_input_tokens_seen": 228546696, | |
| "step": 3696 | |
| }, | |
| { | |
| "epoch": 0.6722323049001815, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 9.776024894066755e-06, | |
| "loss": 0.0413, | |
| "num_input_tokens_seen": 229039860, | |
| "step": 3704 | |
| }, | |
| { | |
| "epoch": 0.6736842105263158, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 9.697384193432365e-06, | |
| "loss": 0.0398, | |
| "num_input_tokens_seen": 229524911, | |
| "step": 3712 | |
| }, | |
| { | |
| "epoch": 0.6751361161524501, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 9.618959722255204e-06, | |
| "loss": 0.0448, | |
| "num_input_tokens_seen": 230032334, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.6765880217785843, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 9.540753126494035e-06, | |
| "loss": 0.0746, | |
| "num_input_tokens_seen": 230518610, | |
| "step": 3728 | |
| }, | |
| { | |
| "epoch": 0.6780399274047187, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 9.462766047534915e-06, | |
| "loss": 0.0463, | |
| "num_input_tokens_seen": 231010962, | |
| "step": 3736 | |
| }, | |
| { | |
| "epoch": 0.679491833030853, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 9.385000122156695e-06, | |
| "loss": 0.0675, | |
| "num_input_tokens_seen": 231515592, | |
| "step": 3744 | |
| }, | |
| { | |
| "epoch": 0.6809437386569873, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 9.3074569824967e-06, | |
| "loss": 0.0627, | |
| "num_input_tokens_seen": 232031254, | |
| "step": 3752 | |
| }, | |
| { | |
| "epoch": 0.6823956442831216, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 9.230138256016461e-06, | |
| "loss": 0.0601, | |
| "num_input_tokens_seen": 232525195, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.6838475499092559, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 9.153045565467605e-06, | |
| "loss": 0.0587, | |
| "num_input_tokens_seen": 232999291, | |
| "step": 3768 | |
| }, | |
| { | |
| "epoch": 0.6852994555353902, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 9.076180528857709e-06, | |
| "loss": 0.0536, | |
| "num_input_tokens_seen": 233490579, | |
| "step": 3776 | |
| }, | |
| { | |
| "epoch": 0.6867513611615245, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 8.999544759416413e-06, | |
| "loss": 0.0346, | |
| "num_input_tokens_seen": 234000641, | |
| "step": 3784 | |
| }, | |
| { | |
| "epoch": 0.6867513611615245, | |
| "eval_loss": 0.04955988749861717, | |
| "eval_runtime": 2842.036, | |
| "eval_samples_per_second": 1.097, | |
| "eval_steps_per_second": 0.137, | |
| "num_input_tokens_seen": 234000641, | |
| "step": 3784 | |
| }, | |
| { | |
| "epoch": 0.6882032667876588, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 8.923139865561525e-06, | |
| "loss": 0.0568, | |
| "num_input_tokens_seen": 234523989, | |
| "step": 3792 | |
| }, | |
| { | |
| "epoch": 0.6896551724137931, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 8.846967450865302e-06, | |
| "loss": 0.0471, | |
| "num_input_tokens_seen": 234995824, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.6911070780399274, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 8.77102911402075e-06, | |
| "loss": 0.0396, | |
| "num_input_tokens_seen": 235480070, | |
| "step": 3808 | |
| }, | |
| { | |
| "epoch": 0.6925589836660617, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 8.695326448808089e-06, | |
| "loss": 0.0427, | |
| "num_input_tokens_seen": 235969468, | |
| "step": 3816 | |
| }, | |
| { | |
| "epoch": 0.694010889292196, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 8.61986104406132e-06, | |
| "loss": 0.0468, | |
| "num_input_tokens_seen": 236457438, | |
| "step": 3824 | |
| }, | |
| { | |
| "epoch": 0.6954627949183303, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 8.544634483634855e-06, | |
| "loss": 0.07, | |
| "num_input_tokens_seen": 236964483, | |
| "step": 3832 | |
| }, | |
| { | |
| "epoch": 0.6969147005444646, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 8.469648346370275e-06, | |
| "loss": 0.0681, | |
| "num_input_tokens_seen": 237478465, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.6983666061705989, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 8.39490420606323e-06, | |
| "loss": 0.0486, | |
| "num_input_tokens_seen": 237972518, | |
| "step": 3848 | |
| }, | |
| { | |
| "epoch": 0.6998185117967333, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 8.320403631430352e-06, | |
| "loss": 0.0398, | |
| "num_input_tokens_seen": 238453985, | |
| "step": 3856 | |
| }, | |
| { | |
| "epoch": 0.7012704174228676, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 8.246148186076367e-06, | |
| "loss": 0.0565, | |
| "num_input_tokens_seen": 238956557, | |
| "step": 3864 | |
| }, | |
| { | |
| "epoch": 0.7027223230490018, | |
| "grad_norm": 1.125, | |
| "learning_rate": 8.172139428461292e-06, | |
| "loss": 0.0699, | |
| "num_input_tokens_seen": 239428560, | |
| "step": 3872 | |
| }, | |
| { | |
| "epoch": 0.7041742286751361, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 8.098378911867682e-06, | |
| "loss": 0.0595, | |
| "num_input_tokens_seen": 239904462, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.7056261343012704, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 8.02486818436806e-06, | |
| "loss": 0.0696, | |
| "num_input_tokens_seen": 240404479, | |
| "step": 3888 | |
| }, | |
| { | |
| "epoch": 0.7070780399274047, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 7.95160878879242e-06, | |
| "loss": 0.0534, | |
| "num_input_tokens_seen": 240926945, | |
| "step": 3896 | |
| }, | |
| { | |
| "epoch": 0.708529945553539, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 7.87860226269586e-06, | |
| "loss": 0.0596, | |
| "num_input_tokens_seen": 241440836, | |
| "step": 3904 | |
| }, | |
| { | |
| "epoch": 0.7099818511796733, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 7.805850138326282e-06, | |
| "loss": 0.035, | |
| "num_input_tokens_seen": 241942169, | |
| "step": 3912 | |
| }, | |
| { | |
| "epoch": 0.7114337568058077, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 7.733353942592246e-06, | |
| "loss": 0.0501, | |
| "num_input_tokens_seen": 242419037, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.712885662431942, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 7.661115197030954e-06, | |
| "loss": 0.0576, | |
| "num_input_tokens_seen": 242917759, | |
| "step": 3928 | |
| }, | |
| { | |
| "epoch": 0.7143375680580762, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 7.589135417776266e-06, | |
| "loss": 0.0394, | |
| "num_input_tokens_seen": 243411063, | |
| "step": 3936 | |
| }, | |
| { | |
| "epoch": 0.7157894736842105, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 7.517416115526901e-06, | |
| "loss": 0.0485, | |
| "num_input_tokens_seen": 243885516, | |
| "step": 3944 | |
| }, | |
| { | |
| "epoch": 0.7172413793103448, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 7.445958795514761e-06, | |
| "loss": 0.0642, | |
| "num_input_tokens_seen": 244397104, | |
| "step": 3952 | |
| }, | |
| { | |
| "epoch": 0.7186932849364791, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 7.374764957473281e-06, | |
| "loss": 0.0486, | |
| "num_input_tokens_seen": 244892690, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.7201451905626134, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 7.303836095605994e-06, | |
| "loss": 0.0532, | |
| "num_input_tokens_seen": 245418852, | |
| "step": 3968 | |
| }, | |
| { | |
| "epoch": 0.7215970961887477, | |
| "grad_norm": 0.455078125, | |
| "learning_rate": 7.233173698555174e-06, | |
| "loss": 0.0389, | |
| "num_input_tokens_seen": 245925757, | |
| "step": 3976 | |
| }, | |
| { | |
| "epoch": 0.7230490018148821, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 7.16277924937056e-06, | |
| "loss": 0.0514, | |
| "num_input_tokens_seen": 246421511, | |
| "step": 3984 | |
| }, | |
| { | |
| "epoch": 0.7245009074410164, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 7.092654225478257e-06, | |
| "loss": 0.041, | |
| "num_input_tokens_seen": 246952363, | |
| "step": 3992 | |
| }, | |
| { | |
| "epoch": 0.7259528130671506, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 7.022800098649716e-06, | |
| "loss": 0.0446, | |
| "num_input_tokens_seen": 247450049, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.7274047186932849, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 6.953218334970861e-06, | |
| "loss": 0.0379, | |
| "num_input_tokens_seen": 247943269, | |
| "step": 4008 | |
| }, | |
| { | |
| "epoch": 0.7288566243194192, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 6.8839103948113e-06, | |
| "loss": 0.0394, | |
| "num_input_tokens_seen": 248447780, | |
| "step": 4016 | |
| }, | |
| { | |
| "epoch": 0.7303085299455535, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 6.814877732793663e-06, | |
| "loss": 0.0401, | |
| "num_input_tokens_seen": 248921260, | |
| "step": 4024 | |
| }, | |
| { | |
| "epoch": 0.7317604355716878, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 6.7461217977631325e-06, | |
| "loss": 0.0447, | |
| "num_input_tokens_seen": 249435130, | |
| "step": 4032 | |
| }, | |
| { | |
| "epoch": 0.7332123411978222, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 6.67764403275696e-06, | |
| "loss": 0.0457, | |
| "num_input_tokens_seen": 249913307, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.7346642468239565, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 6.609445874974218e-06, | |
| "loss": 0.066, | |
| "num_input_tokens_seen": 250435878, | |
| "step": 4048 | |
| }, | |
| { | |
| "epoch": 0.7361161524500908, | |
| "grad_norm": 0.40625, | |
| "learning_rate": 6.5415287557456585e-06, | |
| "loss": 0.0509, | |
| "num_input_tokens_seen": 250946234, | |
| "step": 4056 | |
| }, | |
| { | |
| "epoch": 0.737568058076225, | |
| "grad_norm": 0.48046875, | |
| "learning_rate": 6.473894100503615e-06, | |
| "loss": 0.0553, | |
| "num_input_tokens_seen": 251435205, | |
| "step": 4064 | |
| }, | |
| { | |
| "epoch": 0.7390199637023593, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 6.4065433287521306e-06, | |
| "loss": 0.0445, | |
| "num_input_tokens_seen": 251949775, | |
| "step": 4072 | |
| }, | |
| { | |
| "epoch": 0.7404718693284936, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 6.33947785403716e-06, | |
| "loss": 0.0626, | |
| "num_input_tokens_seen": 252447111, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.7419237749546279, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 6.272699083916885e-06, | |
| "loss": 0.0685, | |
| "num_input_tokens_seen": 252958790, | |
| "step": 4088 | |
| }, | |
| { | |
| "epoch": 0.7433756805807622, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 6.20620841993218e-06, | |
| "loss": 0.0705, | |
| "num_input_tokens_seen": 253436330, | |
| "step": 4096 | |
| }, | |
| { | |
| "epoch": 0.7448275862068966, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 6.1400072575772056e-06, | |
| "loss": 0.0599, | |
| "num_input_tokens_seen": 253927128, | |
| "step": 4104 | |
| }, | |
| { | |
| "epoch": 0.7462794918330309, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 6.0740969862701195e-06, | |
| "loss": 0.0407, | |
| "num_input_tokens_seen": 254426830, | |
| "step": 4112 | |
| }, | |
| { | |
| "epoch": 0.7477313974591652, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 6.008478989323898e-06, | |
| "loss": 0.0566, | |
| "num_input_tokens_seen": 254922990, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.7491833030852995, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 5.943154643917315e-06, | |
| "loss": 0.0498, | |
| "num_input_tokens_seen": 255423630, | |
| "step": 4128 | |
| }, | |
| { | |
| "epoch": 0.7491833030852995, | |
| "eval_loss": 0.049039360135793686, | |
| "eval_runtime": 2629.7216, | |
| "eval_samples_per_second": 1.185, | |
| "eval_steps_per_second": 0.148, | |
| "num_input_tokens_seen": 255423630, | |
| "step": 4128 | |
| }, | |
| { | |
| "epoch": 0.7506352087114337, | |
| "grad_norm": 0.4921875, | |
| "learning_rate": 5.87812532106606e-06, | |
| "loss": 0.0614, | |
| "num_input_tokens_seen": 255929632, | |
| "step": 4136 | |
| }, | |
| { | |
| "epoch": 0.752087114337568, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 5.813392385593915e-06, | |
| "loss": 0.0651, | |
| "num_input_tokens_seen": 256430965, | |
| "step": 4144 | |
| }, | |
| { | |
| "epoch": 0.7535390199637023, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 5.7489571961041415e-06, | |
| "loss": 0.0618, | |
| "num_input_tokens_seen": 256934909, | |
| "step": 4152 | |
| }, | |
| { | |
| "epoch": 0.7549909255898367, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 5.684821104950984e-06, | |
| "loss": 0.0604, | |
| "num_input_tokens_seen": 257421654, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.756442831215971, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 5.620985458211241e-06, | |
| "loss": 0.0516, | |
| "num_input_tokens_seen": 257913684, | |
| "step": 4168 | |
| }, | |
| { | |
| "epoch": 0.7578947368421053, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 5.55745159565604e-06, | |
| "loss": 0.0418, | |
| "num_input_tokens_seen": 258400849, | |
| "step": 4176 | |
| }, | |
| { | |
| "epoch": 0.7593466424682396, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 5.494220850722729e-06, | |
| "loss": 0.062, | |
| "num_input_tokens_seen": 258878333, | |
| "step": 4184 | |
| }, | |
| { | |
| "epoch": 0.7607985480943739, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 5.431294550486869e-06, | |
| "loss": 0.0615, | |
| "num_input_tokens_seen": 259369068, | |
| "step": 4192 | |
| }, | |
| { | |
| "epoch": 0.7622504537205081, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 5.3686740156343805e-06, | |
| "loss": 0.0584, | |
| "num_input_tokens_seen": 259870513, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.7637023593466424, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 5.306360560433854e-06, | |
| "loss": 0.0419, | |
| "num_input_tokens_seen": 260370376, | |
| "step": 4208 | |
| }, | |
| { | |
| "epoch": 0.7651542649727767, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 5.244355492708941e-06, | |
| "loss": 0.0582, | |
| "num_input_tokens_seen": 260881761, | |
| "step": 4216 | |
| }, | |
| { | |
| "epoch": 0.7666061705989111, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 5.182660113810907e-06, | |
| "loss": 0.0468, | |
| "num_input_tokens_seen": 261402673, | |
| "step": 4224 | |
| }, | |
| { | |
| "epoch": 0.7680580762250454, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 5.121275718591321e-06, | |
| "loss": 0.0686, | |
| "num_input_tokens_seen": 261898525, | |
| "step": 4232 | |
| }, | |
| { | |
| "epoch": 0.7695099818511797, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 5.0602035953748865e-06, | |
| "loss": 0.0624, | |
| "num_input_tokens_seen": 262392396, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.770961887477314, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 4.999445025932408e-06, | |
| "loss": 0.0429, | |
| "num_input_tokens_seen": 262882816, | |
| "step": 4248 | |
| }, | |
| { | |
| "epoch": 0.7724137931034483, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 4.939001285453864e-06, | |
| "loss": 0.0372, | |
| "num_input_tokens_seen": 263383267, | |
| "step": 4256 | |
| }, | |
| { | |
| "epoch": 0.7738656987295826, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 4.8788736425216595e-06, | |
| "loss": 0.0343, | |
| "num_input_tokens_seen": 263858756, | |
| "step": 4264 | |
| }, | |
| { | |
| "epoch": 0.7753176043557168, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 4.81906335908402e-06, | |
| "loss": 0.048, | |
| "num_input_tokens_seen": 264345998, | |
| "step": 4272 | |
| }, | |
| { | |
| "epoch": 0.7767695099818511, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 4.759571690428464e-06, | |
| "loss": 0.0595, | |
| "num_input_tokens_seen": 264834486, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.7782214156079855, | |
| "grad_norm": 0.482421875, | |
| "learning_rate": 4.700399885155487e-06, | |
| "loss": 0.0456, | |
| "num_input_tokens_seen": 265331269, | |
| "step": 4288 | |
| }, | |
| { | |
| "epoch": 0.7796733212341198, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 4.641549185152359e-06, | |
| "loss": 0.0374, | |
| "num_input_tokens_seen": 265836347, | |
| "step": 4296 | |
| }, | |
| { | |
| "epoch": 0.7811252268602541, | |
| "grad_norm": 0.828125, | |
| "learning_rate": 4.583020825567039e-06, | |
| "loss": 0.0359, | |
| "num_input_tokens_seen": 266324737, | |
| "step": 4304 | |
| }, | |
| { | |
| "epoch": 0.7825771324863884, | |
| "grad_norm": 0.451171875, | |
| "learning_rate": 4.524816034782263e-06, | |
| "loss": 0.0575, | |
| "num_input_tokens_seen": 266808164, | |
| "step": 4312 | |
| }, | |
| { | |
| "epoch": 0.7840290381125227, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 4.46693603438977e-06, | |
| "loss": 0.0502, | |
| "num_input_tokens_seen": 267324813, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.785480943738657, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 4.409382039164653e-06, | |
| "loss": 0.063, | |
| "num_input_tokens_seen": 267822646, | |
| "step": 4328 | |
| }, | |
| { | |
| "epoch": 0.7869328493647912, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 4.352155257039865e-06, | |
| "loss": 0.0736, | |
| "num_input_tokens_seen": 268320339, | |
| "step": 4336 | |
| }, | |
| { | |
| "epoch": 0.7883847549909256, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 4.295256889080865e-06, | |
| "loss": 0.0568, | |
| "num_input_tokens_seen": 268805229, | |
| "step": 4344 | |
| }, | |
| { | |
| "epoch": 0.7898366606170599, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 4.238688129460431e-06, | |
| "loss": 0.0398, | |
| "num_input_tokens_seen": 269290686, | |
| "step": 4352 | |
| }, | |
| { | |
| "epoch": 0.7912885662431942, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 4.18245016543356e-06, | |
| "loss": 0.0468, | |
| "num_input_tokens_seen": 269771817, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.7927404718693285, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 4.126544177312577e-06, | |
| "loss": 0.0497, | |
| "num_input_tokens_seen": 270261530, | |
| "step": 4368 | |
| }, | |
| { | |
| "epoch": 0.7941923774954628, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 4.0709713384423685e-06, | |
| "loss": 0.0356, | |
| "num_input_tokens_seen": 270769688, | |
| "step": 4376 | |
| }, | |
| { | |
| "epoch": 0.7956442831215971, | |
| "grad_norm": 2.0, | |
| "learning_rate": 4.015732815175728e-06, | |
| "loss": 0.0573, | |
| "num_input_tokens_seen": 271284923, | |
| "step": 4384 | |
| }, | |
| { | |
| "epoch": 0.7970961887477314, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 3.960829766848893e-06, | |
| "loss": 0.056, | |
| "num_input_tokens_seen": 271756884, | |
| "step": 4392 | |
| }, | |
| { | |
| "epoch": 0.7985480943738656, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 3.906263345757231e-06, | |
| "loss": 0.0309, | |
| "num_input_tokens_seen": 272248473, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.828125, | |
| "learning_rate": 3.852034697131015e-06, | |
| "loss": 0.0447, | |
| "num_input_tokens_seen": 272755455, | |
| "step": 4408 | |
| }, | |
| { | |
| "epoch": 0.8014519056261343, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 3.7981449591114207e-06, | |
| "loss": 0.0459, | |
| "num_input_tokens_seen": 273244979, | |
| "step": 4416 | |
| }, | |
| { | |
| "epoch": 0.8029038112522686, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 3.7445952627266336e-06, | |
| "loss": 0.0642, | |
| "num_input_tokens_seen": 273749266, | |
| "step": 4424 | |
| }, | |
| { | |
| "epoch": 0.8043557168784029, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 3.6913867318680984e-06, | |
| "loss": 0.0455, | |
| "num_input_tokens_seen": 274271081, | |
| "step": 4432 | |
| }, | |
| { | |
| "epoch": 0.8058076225045372, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 3.6385204832669385e-06, | |
| "loss": 0.0414, | |
| "num_input_tokens_seen": 274770517, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.8072595281306715, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 3.585997626470519e-06, | |
| "loss": 0.0426, | |
| "num_input_tokens_seen": 275248505, | |
| "step": 4448 | |
| }, | |
| { | |
| "epoch": 0.8087114337568058, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 3.533819263819167e-06, | |
| "loss": 0.0498, | |
| "num_input_tokens_seen": 275748095, | |
| "step": 4456 | |
| }, | |
| { | |
| "epoch": 0.8101633393829402, | |
| "grad_norm": 0.46484375, | |
| "learning_rate": 3.4819864904230195e-06, | |
| "loss": 0.0508, | |
| "num_input_tokens_seen": 276242421, | |
| "step": 4464 | |
| }, | |
| { | |
| "epoch": 0.8116152450090744, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 3.4305003941390468e-06, | |
| "loss": 0.0605, | |
| "num_input_tokens_seen": 276731693, | |
| "step": 4472 | |
| }, | |
| { | |
| "epoch": 0.8116152450090744, | |
| "eval_loss": 0.04871319234371185, | |
| "eval_runtime": 2768.9798, | |
| "eval_samples_per_second": 1.126, | |
| "eval_steps_per_second": 0.141, | |
| "num_input_tokens_seen": 276731693, | |
| "step": 4472 | |
| }, | |
| { | |
| "epoch": 0.8130671506352087, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 3.3793620555482322e-06, | |
| "loss": 0.053, | |
| "num_input_tokens_seen": 277218277, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.814519056261343, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 3.3285725479328757e-06, | |
| "loss": 0.0582, | |
| "num_input_tokens_seen": 277705169, | |
| "step": 4488 | |
| }, | |
| { | |
| "epoch": 0.8159709618874773, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 3.2781329372540683e-06, | |
| "loss": 0.0618, | |
| "num_input_tokens_seen": 278213285, | |
| "step": 4496 | |
| }, | |
| { | |
| "epoch": 0.8174228675136116, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 3.2280442821293455e-06, | |
| "loss": 0.0556, | |
| "num_input_tokens_seen": 278697097, | |
| "step": 4504 | |
| }, | |
| { | |
| "epoch": 0.8188747731397459, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 3.178307633810436e-06, | |
| "loss": 0.0526, | |
| "num_input_tokens_seen": 279193929, | |
| "step": 4512 | |
| }, | |
| { | |
| "epoch": 0.8203266787658802, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 3.128924036161207e-06, | |
| "loss": 0.0411, | |
| "num_input_tokens_seen": 279698041, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.8217785843920146, | |
| "grad_norm": 0.453125, | |
| "learning_rate": 3.079894525635783e-06, | |
| "loss": 0.0505, | |
| "num_input_tokens_seen": 280182805, | |
| "step": 4528 | |
| }, | |
| { | |
| "epoch": 0.8232304900181489, | |
| "grad_norm": 0.431640625, | |
| "learning_rate": 3.0312201312567536e-06, | |
| "loss": 0.04, | |
| "num_input_tokens_seen": 280651028, | |
| "step": 4536 | |
| }, | |
| { | |
| "epoch": 0.8246823956442831, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 2.982901874593598e-06, | |
| "loss": 0.0696, | |
| "num_input_tokens_seen": 281162798, | |
| "step": 4544 | |
| }, | |
| { | |
| "epoch": 0.8261343012704174, | |
| "grad_norm": 0.458984375, | |
| "learning_rate": 2.934940769741239e-06, | |
| "loss": 0.0356, | |
| "num_input_tokens_seen": 281658265, | |
| "step": 4552 | |
| }, | |
| { | |
| "epoch": 0.8275862068965517, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 2.8873378232987726e-06, | |
| "loss": 0.0503, | |
| "num_input_tokens_seen": 282170245, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.829038112522686, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 2.840094034348315e-06, | |
| "loss": 0.0471, | |
| "num_input_tokens_seen": 282655198, | |
| "step": 4568 | |
| }, | |
| { | |
| "epoch": 0.8304900181488203, | |
| "grad_norm": 0.46875, | |
| "learning_rate": 2.793210394434056e-06, | |
| "loss": 0.0615, | |
| "num_input_tokens_seen": 283132416, | |
| "step": 4576 | |
| }, | |
| { | |
| "epoch": 0.8319419237749546, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 2.746687887541448e-06, | |
| "loss": 0.0537, | |
| "num_input_tokens_seen": 283628667, | |
| "step": 4584 | |
| }, | |
| { | |
| "epoch": 0.833393829401089, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 2.700527490076539e-06, | |
| "loss": 0.0375, | |
| "num_input_tokens_seen": 284146751, | |
| "step": 4592 | |
| }, | |
| { | |
| "epoch": 0.8348457350272233, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 2.6547301708454877e-06, | |
| "loss": 0.041, | |
| "num_input_tokens_seen": 284643128, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.8362976406533575, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 2.609296891034241e-06, | |
| "loss": 0.0473, | |
| "num_input_tokens_seen": 285145371, | |
| "step": 4608 | |
| }, | |
| { | |
| "epoch": 0.8377495462794918, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 2.5642286041883458e-06, | |
| "loss": 0.0472, | |
| "num_input_tokens_seen": 285639963, | |
| "step": 4616 | |
| }, | |
| { | |
| "epoch": 0.8392014519056261, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 2.519526256192939e-06, | |
| "loss": 0.0493, | |
| "num_input_tokens_seen": 286128983, | |
| "step": 4624 | |
| }, | |
| { | |
| "epoch": 0.8406533575317604, | |
| "grad_norm": 0.45703125, | |
| "learning_rate": 2.47519078525291e-06, | |
| "loss": 0.0726, | |
| "num_input_tokens_seen": 286625920, | |
| "step": 4632 | |
| }, | |
| { | |
| "epoch": 0.8421052631578947, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 2.431223121873183e-06, | |
| "loss": 0.0465, | |
| "num_input_tokens_seen": 287119525, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.8435571687840291, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 2.3876241888392173e-06, | |
| "loss": 0.0553, | |
| "num_input_tokens_seen": 287610722, | |
| "step": 4648 | |
| }, | |
| { | |
| "epoch": 0.8450090744101634, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 2.3443949011976107e-06, | |
| "loss": 0.0428, | |
| "num_input_tokens_seen": 288097243, | |
| "step": 4656 | |
| }, | |
| { | |
| "epoch": 0.8464609800362977, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 2.301536166236926e-06, | |
| "loss": 0.048, | |
| "num_input_tokens_seen": 288598177, | |
| "step": 4664 | |
| }, | |
| { | |
| "epoch": 0.847912885662432, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 2.259048883468622e-06, | |
| "loss": 0.0436, | |
| "num_input_tokens_seen": 289095940, | |
| "step": 4672 | |
| }, | |
| { | |
| "epoch": 0.8493647912885662, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 2.216933944608184e-06, | |
| "loss": 0.0525, | |
| "num_input_tokens_seen": 289579822, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.8508166969147005, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 2.1751922335564134e-06, | |
| "loss": 0.0752, | |
| "num_input_tokens_seen": 290090500, | |
| "step": 4688 | |
| }, | |
| { | |
| "epoch": 0.8522686025408348, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 2.13382462638088e-06, | |
| "loss": 0.0348, | |
| "num_input_tokens_seen": 290583181, | |
| "step": 4696 | |
| }, | |
| { | |
| "epoch": 0.8537205081669691, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 2.0928319912975193e-06, | |
| "loss": 0.063, | |
| "num_input_tokens_seen": 291086649, | |
| "step": 4704 | |
| }, | |
| { | |
| "epoch": 0.8551724137931035, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 2.0522151886524153e-06, | |
| "loss": 0.0492, | |
| "num_input_tokens_seen": 291577384, | |
| "step": 4712 | |
| }, | |
| { | |
| "epoch": 0.8566243194192378, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 2.0119750709037646e-06, | |
| "loss": 0.0428, | |
| "num_input_tokens_seen": 292058725, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.8580762250453721, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 1.972112482603954e-06, | |
| "loss": 0.074, | |
| "num_input_tokens_seen": 292542677, | |
| "step": 4728 | |
| }, | |
| { | |
| "epoch": 0.8595281306715064, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 1.9326282603818526e-06, | |
| "loss": 0.0493, | |
| "num_input_tokens_seen": 293025201, | |
| "step": 4736 | |
| }, | |
| { | |
| "epoch": 0.8609800362976406, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 1.8935232329252585e-06, | |
| "loss": 0.0431, | |
| "num_input_tokens_seen": 293508845, | |
| "step": 4744 | |
| }, | |
| { | |
| "epoch": 0.8624319419237749, | |
| "grad_norm": 0.88671875, | |
| "learning_rate": 1.854798220963485e-06, | |
| "loss": 0.0356, | |
| "num_input_tokens_seen": 293995884, | |
| "step": 4752 | |
| }, | |
| { | |
| "epoch": 0.8638838475499092, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 1.816454037250155e-06, | |
| "loss": 0.0548, | |
| "num_input_tokens_seen": 294512519, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.8653357531760436, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 1.778491486546141e-06, | |
| "loss": 0.0409, | |
| "num_input_tokens_seen": 295012760, | |
| "step": 4768 | |
| }, | |
| { | |
| "epoch": 0.8667876588021779, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 1.7409113656026643e-06, | |
| "loss": 0.0336, | |
| "num_input_tokens_seen": 295509942, | |
| "step": 4776 | |
| }, | |
| { | |
| "epoch": 0.8682395644283122, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 1.7037144631445745e-06, | |
| "loss": 0.0413, | |
| "num_input_tokens_seen": 296013081, | |
| "step": 4784 | |
| }, | |
| { | |
| "epoch": 0.8696914700544465, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 1.666901559853804e-06, | |
| "loss": 0.0387, | |
| "num_input_tokens_seen": 296492427, | |
| "step": 4792 | |
| }, | |
| { | |
| "epoch": 0.8711433756805808, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 1.63047342835299e-06, | |
| "loss": 0.0468, | |
| "num_input_tokens_seen": 297011120, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.872595281306715, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 1.594430833189231e-06, | |
| "loss": 0.0518, | |
| "num_input_tokens_seen": 297502338, | |
| "step": 4808 | |
| }, | |
| { | |
| "epoch": 0.8740471869328493, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 1.5587745308180656e-06, | |
| "loss": 0.055, | |
| "num_input_tokens_seen": 298011343, | |
| "step": 4816 | |
| }, | |
| { | |
| "epoch": 0.8740471869328493, | |
| "eval_loss": 0.04861417040228844, | |
| "eval_runtime": 2715.815, | |
| "eval_samples_per_second": 1.148, | |
| "eval_steps_per_second": 0.144, | |
| "num_input_tokens_seen": 298011343, | |
| "step": 4816 | |
| }, | |
| { | |
| "epoch": 0.8754990925589836, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 1.523505269587595e-06, | |
| "loss": 0.0366, | |
| "num_input_tokens_seen": 298524933, | |
| "step": 4824 | |
| }, | |
| { | |
| "epoch": 0.876950998185118, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 1.4886237897227584e-06, | |
| "loss": 0.0466, | |
| "num_input_tokens_seen": 299031985, | |
| "step": 4832 | |
| }, | |
| { | |
| "epoch": 0.8784029038112523, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.4541308233098117e-06, | |
| "loss": 0.0472, | |
| "num_input_tokens_seen": 299512381, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.8798548094373866, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 1.420027094280969e-06, | |
| "loss": 0.0585, | |
| "num_input_tokens_seen": 300023962, | |
| "step": 4848 | |
| }, | |
| { | |
| "epoch": 0.8813067150635209, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 1.3863133183991905e-06, | |
| "loss": 0.0455, | |
| "num_input_tokens_seen": 300499402, | |
| "step": 4856 | |
| }, | |
| { | |
| "epoch": 0.8827586206896552, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 1.3529902032431698e-06, | |
| "loss": 0.0572, | |
| "num_input_tokens_seen": 301015365, | |
| "step": 4864 | |
| }, | |
| { | |
| "epoch": 0.8842105263157894, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 1.3200584481924915e-06, | |
| "loss": 0.054, | |
| "num_input_tokens_seen": 301509565, | |
| "step": 4872 | |
| }, | |
| { | |
| "epoch": 0.8856624319419237, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 1.2875187444129366e-06, | |
| "loss": 0.0505, | |
| "num_input_tokens_seen": 302023484, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.8871143375680581, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 1.2553717748419846e-06, | |
| "loss": 0.0426, | |
| "num_input_tokens_seen": 302520603, | |
| "step": 4888 | |
| }, | |
| { | |
| "epoch": 0.8885662431941924, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 1.2236182141744757e-06, | |
| "loss": 0.0495, | |
| "num_input_tokens_seen": 303012766, | |
| "step": 4896 | |
| }, | |
| { | |
| "epoch": 0.8900181488203267, | |
| "grad_norm": 0.400390625, | |
| "learning_rate": 1.192258728848472e-06, | |
| "loss": 0.0561, | |
| "num_input_tokens_seen": 303502416, | |
| "step": 4904 | |
| }, | |
| { | |
| "epoch": 0.891470054446461, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 1.1612939770312325e-06, | |
| "loss": 0.0365, | |
| "num_input_tokens_seen": 304003546, | |
| "step": 4912 | |
| }, | |
| { | |
| "epoch": 0.8929219600725953, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 1.130724608605427e-06, | |
| "loss": 0.05, | |
| "num_input_tokens_seen": 304494827, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.8943738656987296, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 1.1005512651554983e-06, | |
| "loss": 0.0365, | |
| "num_input_tokens_seen": 304962434, | |
| "step": 4928 | |
| }, | |
| { | |
| "epoch": 0.8958257713248639, | |
| "grad_norm": 0.3984375, | |
| "learning_rate": 1.0707745799541748e-06, | |
| "loss": 0.0505, | |
| "num_input_tokens_seen": 305453792, | |
| "step": 4936 | |
| }, | |
| { | |
| "epoch": 0.8972776769509981, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 1.041395177949196e-06, | |
| "loss": 0.0371, | |
| "num_input_tokens_seen": 305940285, | |
| "step": 4944 | |
| }, | |
| { | |
| "epoch": 0.8987295825771325, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 1.0124136757502012e-06, | |
| "loss": 0.0523, | |
| "num_input_tokens_seen": 306438405, | |
| "step": 4952 | |
| }, | |
| { | |
| "epoch": 0.9001814882032668, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 9.838306816157695e-07, | |
| "loss": 0.0405, | |
| "num_input_tokens_seen": 306937715, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.9016333938294011, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 9.556467954406634e-07, | |
| "loss": 0.0742, | |
| "num_input_tokens_seen": 307458431, | |
| "step": 4968 | |
| }, | |
| { | |
| "epoch": 0.9030852994555354, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 9.278626087432529e-07, | |
| "loss": 0.049, | |
| "num_input_tokens_seen": 307956789, | |
| "step": 4976 | |
| }, | |
| { | |
| "epoch": 0.9045372050816697, | |
| "grad_norm": 0.48828125, | |
| "learning_rate": 9.004787046530694e-07, | |
| "loss": 0.0432, | |
| "num_input_tokens_seen": 308463995, | |
| "step": 4984 | |
| }, | |
| { | |
| "epoch": 0.905989110707804, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 8.734956578985976e-07, | |
| "loss": 0.057, | |
| "num_input_tokens_seen": 308971509, | |
| "step": 4992 | |
| }, | |
| { | |
| "epoch": 0.9074410163339383, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 8.469140347951898e-07, | |
| "loss": 0.0461, | |
| "num_input_tokens_seen": 309453074, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.9088929219600725, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 8.207343932332023e-07, | |
| "loss": 0.042, | |
| "num_input_tokens_seen": 309930257, | |
| "step": 5008 | |
| }, | |
| { | |
| "epoch": 0.9103448275862069, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 7.949572826662622e-07, | |
| "loss": 0.077, | |
| "num_input_tokens_seen": 310432591, | |
| "step": 5016 | |
| }, | |
| { | |
| "epoch": 0.9117967332123412, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 7.695832440997563e-07, | |
| "loss": 0.0504, | |
| "num_input_tokens_seen": 310899484, | |
| "step": 5024 | |
| }, | |
| { | |
| "epoch": 0.9132486388384755, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 7.44612810079468e-07, | |
| "loss": 0.0577, | |
| "num_input_tokens_seen": 311385620, | |
| "step": 5032 | |
| }, | |
| { | |
| "epoch": 0.9147005444646098, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 7.200465046803984e-07, | |
| "loss": 0.065, | |
| "num_input_tokens_seen": 311886953, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 0.9161524500907441, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 6.958848434957643e-07, | |
| "loss": 0.0473, | |
| "num_input_tokens_seen": 312387145, | |
| "step": 5048 | |
| }, | |
| { | |
| "epoch": 0.9176043557168784, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 6.721283336261964e-07, | |
| "loss": 0.0464, | |
| "num_input_tokens_seen": 312865084, | |
| "step": 5056 | |
| }, | |
| { | |
| "epoch": 0.9190562613430127, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 6.487774736690688e-07, | |
| "loss": 0.0462, | |
| "num_input_tokens_seen": 313342169, | |
| "step": 5064 | |
| }, | |
| { | |
| "epoch": 0.9205081669691471, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 6.258327537080488e-07, | |
| "loss": 0.0407, | |
| "num_input_tokens_seen": 313820850, | |
| "step": 5072 | |
| }, | |
| { | |
| "epoch": 0.9219600725952813, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 6.032946553028196e-07, | |
| "loss": 0.048, | |
| "num_input_tokens_seen": 314294169, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 0.9234119782214156, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 5.811636514789598e-07, | |
| "loss": 0.0393, | |
| "num_input_tokens_seen": 314789090, | |
| "step": 5088 | |
| }, | |
| { | |
| "epoch": 0.9248638838475499, | |
| "grad_norm": 0.5, | |
| "learning_rate": 5.594402067180116e-07, | |
| "loss": 0.0466, | |
| "num_input_tokens_seen": 315317576, | |
| "step": 5096 | |
| }, | |
| { | |
| "epoch": 0.9263157894736842, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 5.381247769477504e-07, | |
| "loss": 0.0336, | |
| "num_input_tokens_seen": 315804951, | |
| "step": 5104 | |
| }, | |
| { | |
| "epoch": 0.9277676950998185, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 5.172178095326019e-07, | |
| "loss": 0.0515, | |
| "num_input_tokens_seen": 316286642, | |
| "step": 5112 | |
| }, | |
| { | |
| "epoch": 0.9292196007259528, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 4.967197432642579e-07, | |
| "loss": 0.079, | |
| "num_input_tokens_seen": 316792651, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.9306715063520871, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 4.7663100835246614e-07, | |
| "loss": 0.0423, | |
| "num_input_tokens_seen": 317277912, | |
| "step": 5128 | |
| }, | |
| { | |
| "epoch": 0.9321234119782215, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 4.569520264159977e-07, | |
| "loss": 0.0307, | |
| "num_input_tokens_seen": 317761276, | |
| "step": 5136 | |
| }, | |
| { | |
| "epoch": 0.9335753176043557, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 4.3768321047380936e-07, | |
| "loss": 0.0443, | |
| "num_input_tokens_seen": 318275629, | |
| "step": 5144 | |
| }, | |
| { | |
| "epoch": 0.93502722323049, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 4.188249649363596e-07, | |
| "loss": 0.037, | |
| "num_input_tokens_seen": 318764138, | |
| "step": 5152 | |
| }, | |
| { | |
| "epoch": 0.9364791288566243, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 4.0037768559712864e-07, | |
| "loss": 0.0398, | |
| "num_input_tokens_seen": 319237492, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.9364791288566243, | |
| "eval_loss": 0.04859951138496399, | |
| "eval_runtime": 2495.2416, | |
| "eval_samples_per_second": 1.249, | |
| "eval_steps_per_second": 0.156, | |
| "num_input_tokens_seen": 319237492, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.9379310344827586, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 3.8234175962432284e-07, | |
| "loss": 0.0643, | |
| "num_input_tokens_seen": 319726771, | |
| "step": 5168 | |
| }, | |
| { | |
| "epoch": 0.9393829401088929, | |
| "grad_norm": 0.46484375, | |
| "learning_rate": 3.647175655527235e-07, | |
| "loss": 0.0545, | |
| "num_input_tokens_seen": 320207370, | |
| "step": 5176 | |
| }, | |
| { | |
| "epoch": 0.9408348457350272, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 3.4750547327576434e-07, | |
| "loss": 0.0645, | |
| "num_input_tokens_seen": 320689649, | |
| "step": 5184 | |
| }, | |
| { | |
| "epoch": 0.9422867513611616, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 3.3070584403775754e-07, | |
| "loss": 0.0368, | |
| "num_input_tokens_seen": 321189372, | |
| "step": 5192 | |
| }, | |
| { | |
| "epoch": 0.9437386569872959, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 3.143190304263177e-07, | |
| "loss": 0.0461, | |
| "num_input_tokens_seen": 321681717, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.9451905626134302, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 2.9834537636495466e-07, | |
| "loss": 0.0348, | |
| "num_input_tokens_seen": 322172599, | |
| "step": 5208 | |
| }, | |
| { | |
| "epoch": 0.9466424682395644, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 2.8278521710586315e-07, | |
| "loss": 0.0484, | |
| "num_input_tokens_seen": 322668094, | |
| "step": 5216 | |
| }, | |
| { | |
| "epoch": 0.9480943738656987, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 2.6763887922288236e-07, | |
| "loss": 0.0589, | |
| "num_input_tokens_seen": 323137080, | |
| "step": 5224 | |
| }, | |
| { | |
| "epoch": 0.949546279491833, | |
| "grad_norm": 0.451171875, | |
| "learning_rate": 2.5290668060464095e-07, | |
| "loss": 0.0323, | |
| "num_input_tokens_seen": 323645462, | |
| "step": 5232 | |
| }, | |
| { | |
| "epoch": 0.9509981851179673, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 2.385889304478872e-07, | |
| "loss": 0.05, | |
| "num_input_tokens_seen": 324137149, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 0.9524500907441016, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 2.2468592925100062e-07, | |
| "loss": 0.0392, | |
| "num_input_tokens_seen": 324621626, | |
| "step": 5248 | |
| }, | |
| { | |
| "epoch": 0.953901996370236, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 2.1119796880768374e-07, | |
| "loss": 0.0468, | |
| "num_input_tokens_seen": 325115784, | |
| "step": 5256 | |
| }, | |
| { | |
| "epoch": 0.9553539019963703, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 1.9812533220083362e-07, | |
| "loss": 0.0679, | |
| "num_input_tokens_seen": 325614737, | |
| "step": 5264 | |
| }, | |
| { | |
| "epoch": 0.9568058076225046, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 1.8546829379661125e-07, | |
| "loss": 0.07, | |
| "num_input_tokens_seen": 326095021, | |
| "step": 5272 | |
| }, | |
| { | |
| "epoch": 0.9582577132486388, | |
| "grad_norm": 0.423828125, | |
| "learning_rate": 1.7322711923867475e-07, | |
| "loss": 0.0609, | |
| "num_input_tokens_seen": 326613882, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.9597096188747731, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 1.6140206544260407e-07, | |
| "loss": 0.0323, | |
| "num_input_tokens_seen": 327087152, | |
| "step": 5288 | |
| }, | |
| { | |
| "epoch": 0.9611615245009074, | |
| "grad_norm": 0.44921875, | |
| "learning_rate": 1.4999338059051184e-07, | |
| "loss": 0.0431, | |
| "num_input_tokens_seen": 327601813, | |
| "step": 5296 | |
| }, | |
| { | |
| "epoch": 0.9626134301270417, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 1.3900130412583646e-07, | |
| "loss": 0.0378, | |
| "num_input_tokens_seen": 328093647, | |
| "step": 5304 | |
| }, | |
| { | |
| "epoch": 0.964065335753176, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 1.2842606674831058e-07, | |
| "loss": 0.0777, | |
| "num_input_tokens_seen": 328588015, | |
| "step": 5312 | |
| }, | |
| { | |
| "epoch": 0.9655172413793104, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 1.1826789040912723e-07, | |
| "loss": 0.0603, | |
| "num_input_tokens_seen": 329080878, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 0.9669691470054447, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 1.0852698830627007e-07, | |
| "loss": 0.0433, | |
| "num_input_tokens_seen": 329543543, | |
| "step": 5328 | |
| }, | |
| { | |
| "epoch": 0.968421052631579, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 9.920356488005045e-08, | |
| "loss": 0.0625, | |
| "num_input_tokens_seen": 330031499, | |
| "step": 5336 | |
| }, | |
| { | |
| "epoch": 0.9698729582577132, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 9.029781580881081e-08, | |
| "loss": 0.0408, | |
| "num_input_tokens_seen": 330508472, | |
| "step": 5344 | |
| }, | |
| { | |
| "epoch": 0.9713248638838475, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 8.180992800482124e-08, | |
| "loss": 0.0362, | |
| "num_input_tokens_seen": 330999368, | |
| "step": 5352 | |
| }, | |
| { | |
| "epoch": 0.9727767695099818, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 7.374007961035157e-08, | |
| "loss": 0.0372, | |
| "num_input_tokens_seen": 331494527, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 0.9742286751361161, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 6.608843999393655e-08, | |
| "loss": 0.0544, | |
| "num_input_tokens_seen": 331992801, | |
| "step": 5368 | |
| }, | |
| { | |
| "epoch": 0.9756805807622505, | |
| "grad_norm": 0.486328125, | |
| "learning_rate": 5.885516974681871e-08, | |
| "loss": 0.0434, | |
| "num_input_tokens_seen": 332484019, | |
| "step": 5376 | |
| }, | |
| { | |
| "epoch": 0.9771324863883848, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 5.2040420679577706e-08, | |
| "loss": 0.0463, | |
| "num_input_tokens_seen": 332971275, | |
| "step": 5384 | |
| }, | |
| { | |
| "epoch": 0.9785843920145191, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 4.564433581895067e-08, | |
| "loss": 0.0291, | |
| "num_input_tokens_seen": 333465979, | |
| "step": 5392 | |
| }, | |
| { | |
| "epoch": 0.9800362976406534, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 3.966704940482347e-08, | |
| "loss": 0.0428, | |
| "num_input_tokens_seen": 333965786, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.9814882032667877, | |
| "grad_norm": 0.478515625, | |
| "learning_rate": 3.4108686887408537e-08, | |
| "loss": 0.0382, | |
| "num_input_tokens_seen": 334462422, | |
| "step": 5408 | |
| }, | |
| { | |
| "epoch": 0.9829401088929219, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 2.8969364924629205e-08, | |
| "loss": 0.0335, | |
| "num_input_tokens_seen": 334957763, | |
| "step": 5416 | |
| }, | |
| { | |
| "epoch": 0.9843920145190562, | |
| "grad_norm": 0.49609375, | |
| "learning_rate": 2.424919137965276e-08, | |
| "loss": 0.0386, | |
| "num_input_tokens_seen": 335453503, | |
| "step": 5424 | |
| }, | |
| { | |
| "epoch": 0.9858439201451905, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 1.9948265318638915e-08, | |
| "loss": 0.0471, | |
| "num_input_tokens_seen": 335956152, | |
| "step": 5432 | |
| }, | |
| { | |
| "epoch": 0.9872958257713249, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 1.606667700865261e-08, | |
| "loss": 0.0428, | |
| "num_input_tokens_seen": 336428666, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.9887477313974592, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 1.2604507915774389e-08, | |
| "loss": 0.0409, | |
| "num_input_tokens_seen": 336955164, | |
| "step": 5448 | |
| }, | |
| { | |
| "epoch": 0.9901996370235935, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 9.561830703390673e-09, | |
| "loss": 0.0468, | |
| "num_input_tokens_seen": 337481648, | |
| "step": 5456 | |
| }, | |
| { | |
| "epoch": 0.9916515426497278, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 6.938709230666085e-09, | |
| "loss": 0.0517, | |
| "num_input_tokens_seen": 337980342, | |
| "step": 5464 | |
| }, | |
| { | |
| "epoch": 0.993103448275862, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 4.7351985512067435e-09, | |
| "loss": 0.0586, | |
| "num_input_tokens_seen": 338476887, | |
| "step": 5472 | |
| }, | |
| { | |
| "epoch": 0.9945553539019963, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 2.9513449118967475e-09, | |
| "loss": 0.0758, | |
| "num_input_tokens_seen": 338954735, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 0.9960072595281306, | |
| "grad_norm": 0.486328125, | |
| "learning_rate": 1.5871857519411671e-09, | |
| "loss": 0.0453, | |
| "num_input_tokens_seen": 339472532, | |
| "step": 5488 | |
| }, | |
| { | |
| "epoch": 0.997459165154265, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 6.427497020644602e-10, | |
| "loss": 0.0365, | |
| "num_input_tokens_seen": 339948028, | |
| "step": 5496 | |
| }, | |
| { | |
| "epoch": 0.9989110707803993, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 1.1805658392427533e-10, | |
| "loss": 0.0511, | |
| "num_input_tokens_seen": 340437678, | |
| "step": 5504 | |
| }, | |
| { | |
| "epoch": 0.9989110707803993, | |
| "eval_loss": 0.04862402379512787, | |
| "eval_runtime": 2527.5451, | |
| "eval_samples_per_second": 1.233, | |
| "eval_steps_per_second": 0.154, | |
| "num_input_tokens_seen": 340437678, | |
| "step": 5504 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "num_input_tokens_seen": 340779614, | |
| "step": 5510, | |
| "total_flos": 1.7763887818171482e+19, | |
| "train_loss": 0.06540190598601221, | |
| "train_runtime": 392745.8674, | |
| "train_samples_per_second": 0.786, | |
| "train_steps_per_second": 0.014, | |
| "train_tokens_per_second": 108.825 | |
| } | |
| ], | |
| "logging_steps": 8, | |
| "max_steps": 5510, | |
| "num_input_tokens_seen": 340779614, | |
| "num_train_epochs": 1, | |
| "save_steps": 688, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.7763887818171482e+19, | |
| "train_batch_size": 7, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
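
The JSON above is the tail of a Hugging Face `transformers` Trainer state file (`trainer_state.json`): `log_history` holds one record per logging interval (`logging_steps: 8` here), interleaved with `eval_*` records on a fixed step cadence and closed by an end-of-training aggregate summary. As a minimal sketch of how one might consume such a file — assuming it is saved verbatim as `trainer_state.json`; the filename and the stdlib-only approach are illustrative choices, not part of the original artifact:

```python
import json

# Hypothetical path: assumes the JSON above was saved verbatim to this file.
with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]
# Per-interval train records carry a "loss" key; eval records carry
# "eval_loss"; the last record is the end-of-training aggregate summary.
train_logs = [e for e in history if "loss" in e]
eval_logs = [e for e in history if "eval_loss" in e]
summary = history[-1]

print(f"steps: {state['global_step']} / {state['max_steps']}")
print(f"mean train loss: {summary['train_loss']:.5f}")

# Best checkpoint by eval loss across the periodic evaluations.
best = min(eval_logs, key=lambda e: e["eval_loss"])
print(f"best eval_loss: {best['eval_loss']:.5f} at step {best['step']}")

# Endpoints of the logged learning-rate schedule, for sanity-checking decay.
print(f"lr first/last: {train_logs[0]['learning_rate']:.3e} -> "
      f"{train_logs[-1]['learning_rate']:.3e}")
```

On this file the sketch would report a best `eval_loss` of about 0.04860 at step 5160, marginally below the final evaluation at step 5504 (0.04862), i.e. the eval loss had essentially plateaued over the last quarter of the run.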