{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.7818654775176515,
  "eval_steps": 200,
  "global_step": 600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.014864362690449646,
      "grad_norm": 2.1816246446736707,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 1.5437,
      "num_input_tokens_seen": 30040,
      "step": 5,
      "train_runtime": 185.9609,
      "train_tokens_per_second": 161.539
    },
    {
      "epoch": 0.029728725380899292,
      "grad_norm": 1.297173579671946,
      "learning_rate": 3.75e-06,
      "loss": 1.3114,
      "num_input_tokens_seen": 65616,
      "step": 10,
      "train_runtime": 371.0314,
      "train_tokens_per_second": 176.848
    },
    {
      "epoch": 0.044593088071348944,
      "grad_norm": 1.3154044523767012,
      "learning_rate": 5.833333333333334e-06,
      "loss": 1.5722,
      "num_input_tokens_seen": 92696,
      "step": 15,
      "train_runtime": 537.3955,
      "train_tokens_per_second": 172.491
    },
    {
      "epoch": 0.059457450761798585,
      "grad_norm": 1.901062718640284,
      "learning_rate": 7.916666666666667e-06,
      "loss": 1.4077,
      "num_input_tokens_seen": 123944,
      "step": 20,
      "train_runtime": 720.0196,
      "train_tokens_per_second": 172.14
    },
    {
      "epoch": 0.07432181345224824,
      "grad_norm": 0.865431080037828,
      "learning_rate": 1e-05,
      "loss": 1.2642,
      "num_input_tokens_seen": 154432,
      "step": 25,
      "train_runtime": 892.3433,
      "train_tokens_per_second": 173.063
    },
    {
      "epoch": 0.08918617614269789,
      "grad_norm": 0.7781390780834406,
      "learning_rate": 1.2083333333333333e-05,
      "loss": 1.1572,
      "num_input_tokens_seen": 181960,
      "step": 30,
      "train_runtime": 1057.0334,
      "train_tokens_per_second": 172.142
    },
    {
      "epoch": 0.10405053883314754,
      "grad_norm": 0.665108652040459,
      "learning_rate": 1.4166666666666668e-05,
      "loss": 0.9421,
      "num_input_tokens_seen": 212640,
      "step": 35,
      "train_runtime": 1231.9846,
      "train_tokens_per_second": 172.6
    },
    {
      "epoch": 0.11891490152359717,
      "grad_norm": 0.2825351078495791,
      "learning_rate": 1.6250000000000002e-05,
      "loss": 0.8661,
      "num_input_tokens_seen": 245808,
      "step": 40,
      "train_runtime": 1416.5893,
      "train_tokens_per_second": 173.521
    },
    {
      "epoch": 0.13377926421404682,
      "grad_norm": 0.36334526479148993,
      "learning_rate": 1.8333333333333333e-05,
      "loss": 0.9087,
      "num_input_tokens_seen": 272224,
      "step": 45,
      "train_runtime": 1582.5403,
      "train_tokens_per_second": 172.017
    },
    {
      "epoch": 0.14864362690449648,
      "grad_norm": 0.44642145260416205,
      "learning_rate": 2.0416666666666667e-05,
      "loss": 0.8855,
      "num_input_tokens_seen": 305560,
      "step": 50,
      "train_runtime": 1768.3526,
      "train_tokens_per_second": 172.794
    },
    {
      "epoch": 0.1635079895949461,
      "grad_norm": 0.41738282561467666,
      "learning_rate": 2.25e-05,
      "loss": 0.8737,
      "num_input_tokens_seen": 332024,
      "step": 55,
      "train_runtime": 1927.929,
      "train_tokens_per_second": 172.218
    },
    {
      "epoch": 0.17837235228539577,
      "grad_norm": 0.39634079236288305,
      "learning_rate": 2.4583333333333332e-05,
      "loss": 0.8043,
      "num_input_tokens_seen": 360856,
      "step": 60,
      "train_runtime": 2097.4995,
      "train_tokens_per_second": 172.041
    },
    {
      "epoch": 0.1932367149758454,
      "grad_norm": 0.35491219061131213,
      "learning_rate": 2.6666666666666667e-05,
      "loss": 0.7943,
      "num_input_tokens_seen": 398608,
      "step": 65,
      "train_runtime": 2304.6738,
      "train_tokens_per_second": 172.956
    },
    {
      "epoch": 0.20810107766629507,
      "grad_norm": 0.35737550120310063,
      "learning_rate": 2.8749999999999997e-05,
      "loss": 0.8492,
      "num_input_tokens_seen": 425552,
      "step": 70,
      "train_runtime": 2460.8004,
      "train_tokens_per_second": 172.932
    },
    {
      "epoch": 0.2229654403567447,
      "grad_norm": 0.48693276619814596,
      "learning_rate": 3.0833333333333335e-05,
      "loss": 0.6925,
      "num_input_tokens_seen": 457392,
      "step": 75,
      "train_runtime": 2638.2785,
      "train_tokens_per_second": 173.368
    },
    {
      "epoch": 0.23782980304719434,
      "grad_norm": 0.5078391701662902,
      "learning_rate": 3.291666666666667e-05,
      "loss": 0.8085,
      "num_input_tokens_seen": 483824,
      "step": 80,
      "train_runtime": 2806.7407,
      "train_tokens_per_second": 172.379
    },
    {
      "epoch": 0.25269416573764397,
      "grad_norm": 0.3024390891532131,
      "learning_rate": 3.5e-05,
      "loss": 0.7564,
      "num_input_tokens_seen": 512864,
      "step": 85,
      "train_runtime": 2979.1533,
      "train_tokens_per_second": 172.151
    },
    {
      "epoch": 0.26755852842809363,
      "grad_norm": 0.37806969672351337,
      "learning_rate": 3.708333333333334e-05,
      "loss": 0.8039,
      "num_input_tokens_seen": 538720,
      "step": 90,
      "train_runtime": 3137.4212,
      "train_tokens_per_second": 171.708
    },
    {
      "epoch": 0.2824228911185433,
      "grad_norm": 0.3125837610721519,
      "learning_rate": 3.9166666666666665e-05,
      "loss": 0.7889,
      "num_input_tokens_seen": 569688,
      "step": 95,
      "train_runtime": 3317.1304,
      "train_tokens_per_second": 171.741
    },
    {
      "epoch": 0.29728725380899296,
      "grad_norm": 0.2612296186361751,
      "learning_rate": 4.125e-05,
      "loss": 0.8101,
      "num_input_tokens_seen": 603480,
      "step": 100,
      "train_runtime": 3508.249,
      "train_tokens_per_second": 172.017
    },
    {
      "epoch": 0.31215161649944256,
      "grad_norm": 0.4664200778768721,
      "learning_rate": 4.3333333333333334e-05,
      "loss": 0.7587,
      "num_input_tokens_seen": 632928,
      "step": 105,
      "train_runtime": 3681.3714,
      "train_tokens_per_second": 171.927
    },
    {
      "epoch": 0.3270159791898922,
      "grad_norm": 0.4122820082467493,
      "learning_rate": 4.541666666666667e-05,
      "loss": 0.7925,
      "num_input_tokens_seen": 659840,
      "step": 110,
      "train_runtime": 3850.7584,
      "train_tokens_per_second": 171.353
    },
    {
      "epoch": 0.3418803418803419,
      "grad_norm": 0.29086455774181913,
      "learning_rate": 4.75e-05,
      "loss": 0.8142,
      "num_input_tokens_seen": 690432,
      "step": 115,
      "train_runtime": 4029.493,
      "train_tokens_per_second": 171.345
    },
    {
      "epoch": 0.35674470457079155,
      "grad_norm": 0.49406027919264844,
      "learning_rate": 4.958333333333334e-05,
      "loss": 0.7088,
      "num_input_tokens_seen": 726264,
      "step": 120,
      "train_runtime": 4222.1079,
      "train_tokens_per_second": 172.015
    },
    {
      "epoch": 0.37160906726124115,
      "grad_norm": 0.29533667352828036,
      "learning_rate": 4.9999194067399066e-05,
      "loss": 0.6237,
      "num_input_tokens_seen": 763328,
      "step": 125,
      "train_runtime": 4420.0509,
      "train_tokens_per_second": 172.697
    },
    {
      "epoch": 0.3864734299516908,
      "grad_norm": 0.554389641306832,
      "learning_rate": 4.999592005526383e-05,
      "loss": 0.6609,
      "num_input_tokens_seen": 792888,
      "step": 130,
      "train_runtime": 4590.9234,
      "train_tokens_per_second": 172.708
    },
    {
      "epoch": 0.4013377926421405,
      "grad_norm": 0.3913197863670873,
      "learning_rate": 4.999012792238118e-05,
      "loss": 0.7873,
      "num_input_tokens_seen": 830784,
      "step": 135,
      "train_runtime": 4780.7961,
      "train_tokens_per_second": 173.775
    },
    {
      "epoch": 0.41620215533259014,
      "grad_norm": 0.7292785043191194,
      "learning_rate": 4.998181825225791e-05,
      "loss": 0.7047,
      "num_input_tokens_seen": 869304,
      "step": 140,
      "train_runtime": 4979.1373,
      "train_tokens_per_second": 174.589
    },
    {
      "epoch": 0.43106651802303975,
      "grad_norm": 0.3288366913256035,
      "learning_rate": 4.997099188202077e-05,
      "loss": 0.6608,
      "num_input_tokens_seen": 903752,
      "step": 145,
      "train_runtime": 5170.5438,
      "train_tokens_per_second": 174.789
    },
    {
      "epoch": 0.4459308807134894,
      "grad_norm": 0.32850031143064623,
      "learning_rate": 4.995764990233205e-05,
      "loss": 0.7203,
      "num_input_tokens_seen": 935672,
      "step": 150,
      "train_runtime": 5350.7854,
      "train_tokens_per_second": 174.866
    },
    {
      "epoch": 0.46079524340393907,
      "grad_norm": 0.4093406468089712,
      "learning_rate": 4.994179365727973e-05,
      "loss": 0.7793,
      "num_input_tokens_seen": 967928,
      "step": 155,
      "train_runtime": 5526.3425,
      "train_tokens_per_second": 175.148
    },
    {
      "epoch": 0.4756596060943887,
      "grad_norm": 0.26538290553046884,
      "learning_rate": 4.992342474424209e-05,
      "loss": 0.6941,
      "num_input_tokens_seen": 1006856,
      "step": 160,
      "train_runtime": 5732.4064,
      "train_tokens_per_second": 175.643
    },
    {
      "epoch": 0.49052396878483834,
      "grad_norm": 0.3293209375345381,
      "learning_rate": 4.990254501372677e-05,
      "loss": 0.7154,
      "num_input_tokens_seen": 1040320,
      "step": 165,
      "train_runtime": 5919.4184,
      "train_tokens_per_second": 175.747
    },
    {
      "epoch": 0.5053883314752879,
      "grad_norm": 0.37872717401082634,
      "learning_rate": 4.987915656918435e-05,
      "loss": 0.7354,
      "num_input_tokens_seen": 1071792,
      "step": 170,
      "train_runtime": 6093.1775,
      "train_tokens_per_second": 175.9
    },
    {
      "epoch": 0.5202526941657376,
      "grad_norm": 0.34111517332292757,
      "learning_rate": 4.985326176679645e-05,
      "loss": 0.7598,
      "num_input_tokens_seen": 1103216,
      "step": 175,
      "train_runtime": 6268.9044,
      "train_tokens_per_second": 175.982
    },
    {
      "epoch": 0.5351170568561873,
      "grad_norm": 0.44888012700873,
      "learning_rate": 4.9824863215238373e-05,
      "loss": 0.7502,
      "num_input_tokens_seen": 1131576,
      "step": 180,
      "train_runtime": 6435.3922,
      "train_tokens_per_second": 175.836
    },
    {
      "epoch": 0.5499814195466369,
      "grad_norm": 0.4802566147521983,
      "learning_rate": 4.979396377541628e-05,
      "loss": 0.6844,
      "num_input_tokens_seen": 1160040,
      "step": 185,
      "train_runtime": 6605.4368,
      "train_tokens_per_second": 175.619
    },
    {
      "epoch": 0.5648457822370866,
      "grad_norm": 0.4821989259826241,
      "learning_rate": 4.976056656017901e-05,
      "loss": 0.8291,
      "num_input_tokens_seen": 1190376,
      "step": 190,
      "train_runtime": 6782.0935,
      "train_tokens_per_second": 175.517
    },
    {
      "epoch": 0.5797101449275363,
      "grad_norm": 0.386641731129637,
      "learning_rate": 4.972467493400445e-05,
      "loss": 0.7762,
      "num_input_tokens_seen": 1217904,
      "step": 195,
      "train_runtime": 6948.2037,
      "train_tokens_per_second": 175.283
    },
    {
      "epoch": 0.5945745076179859,
      "grad_norm": 0.4418388818648707,
      "learning_rate": 4.968629251266064e-05,
      "loss": 0.6688,
      "num_input_tokens_seen": 1256312,
      "step": 200,
      "train_runtime": 7158.3945,
      "train_tokens_per_second": 175.502
    },
    {
      "epoch": 0.5945745076179859,
      "eval_loss": 0.8953876495361328,
      "eval_runtime": 353.0879,
      "eval_samples_per_second": 1.325,
      "eval_steps_per_second": 0.663,
      "num_input_tokens_seen": 1256312,
      "step": 200
    },
    {
      "epoch": 0.6094388703084356,
      "grad_norm": 0.5521465457074078,
      "learning_rate": 4.964542316284147e-05,
      "loss": 0.7398,
      "num_input_tokens_seen": 1290880,
      "step": 205,
      "train_runtime": 7728.2908,
      "train_tokens_per_second": 167.033
    },
    {
      "epoch": 0.6243032329988851,
      "grad_norm": 0.4773991938686689,
      "learning_rate": 4.960207100177716e-05,
      "loss": 0.7169,
      "num_input_tokens_seen": 1320288,
      "step": 210,
      "train_runtime": 7894.6486,
      "train_tokens_per_second": 167.238
    },
    {
      "epoch": 0.6391675956893348,
      "grad_norm": 0.51804635767658,
      "learning_rate": 4.955624039681952e-05,
      "loss": 0.7618,
      "num_input_tokens_seen": 1346816,
      "step": 215,
      "train_runtime": 8060.6928,
      "train_tokens_per_second": 167.084
    },
    {
      "epoch": 0.6540319583797845,
      "grad_norm": 0.5900793472953364,
      "learning_rate": 4.950793596500192e-05,
      "loss": 0.7563,
      "num_input_tokens_seen": 1382472,
      "step": 220,
      "train_runtime": 8252.4478,
      "train_tokens_per_second": 167.523
    },
    {
      "epoch": 0.6688963210702341,
      "grad_norm": 0.39058107426349015,
      "learning_rate": 4.94571625725742e-05,
      "loss": 0.6626,
      "num_input_tokens_seen": 1420832,
      "step": 225,
      "train_runtime": 8450.5909,
      "train_tokens_per_second": 168.134
    },
    {
      "epoch": 0.6837606837606838,
      "grad_norm": 0.5663525201259818,
      "learning_rate": 4.940392533451244e-05,
      "loss": 0.7741,
      "num_input_tokens_seen": 1451496,
      "step": 230,
      "train_runtime": 8624.0331,
      "train_tokens_per_second": 168.308
    },
    {
      "epoch": 0.6986250464511334,
      "grad_norm": 0.3852996317746423,
      "learning_rate": 4.9348229614003615e-05,
      "loss": 0.7749,
      "num_input_tokens_seen": 1483136,
      "step": 235,
      "train_runtime": 8805.4306,
      "train_tokens_per_second": 168.434
    },
    {
      "epoch": 0.7134894091415831,
      "grad_norm": 0.4546419228429511,
      "learning_rate": 4.9290081021905416e-05,
      "loss": 0.7186,
      "num_input_tokens_seen": 1522176,
      "step": 240,
      "train_runtime": 9011.7223,
      "train_tokens_per_second": 168.911
    },
    {
      "epoch": 0.7283537718320326,
      "grad_norm": 0.27319864084916956,
      "learning_rate": 4.9229485416180876e-05,
      "loss": 0.659,
      "num_input_tokens_seen": 1562400,
      "step": 245,
      "train_runtime": 9221.6215,
      "train_tokens_per_second": 169.428
    },
    {
      "epoch": 0.7432181345224823,
      "grad_norm": 0.5971581275751557,
      "learning_rate": 4.916644890130831e-05,
      "loss": 0.747,
      "num_input_tokens_seen": 1587136,
      "step": 250,
      "train_runtime": 9378.073,
      "train_tokens_per_second": 169.239
    },
    {
      "epoch": 0.758082497212932,
      "grad_norm": 0.3725130213516166,
      "learning_rate": 4.9100977827666345e-05,
      "loss": 0.698,
      "num_input_tokens_seen": 1614920,
      "step": 255,
      "train_runtime": 9541.7299,
      "train_tokens_per_second": 169.248
    },
    {
      "epoch": 0.7729468599033816,
      "grad_norm": 0.4907840552388026,
      "learning_rate": 4.903307879089411e-05,
      "loss": 0.6477,
      "num_input_tokens_seen": 1645688,
      "step": 260,
      "train_runtime": 9716.0007,
      "train_tokens_per_second": 169.379
    },
    {
      "epoch": 0.7878112225938313,
      "grad_norm": 0.5241784073891005,
      "learning_rate": 4.896275863122685e-05,
      "loss": 0.7262,
      "num_input_tokens_seen": 1681888,
      "step": 265,
      "train_runtime": 9905.9233,
      "train_tokens_per_second": 169.786
    },
    {
      "epoch": 0.802675585284281,
      "grad_norm": 0.4747498444810908,
      "learning_rate": 4.8890024432806806e-05,
      "loss": 0.7729,
      "num_input_tokens_seen": 1709480,
      "step": 270,
      "train_runtime": 10078.2583,
      "train_tokens_per_second": 169.621
    },
    {
      "epoch": 0.8175399479747306,
      "grad_norm": 0.40268303548713125,
      "learning_rate": 4.8814883522969545e-05,
      "loss": 0.653,
      "num_input_tokens_seen": 1744560,
      "step": 275,
      "train_runtime": 10262.7966,
      "train_tokens_per_second": 169.989
    },
    {
      "epoch": 0.8324043106651803,
      "grad_norm": 0.5361180434189863,
      "learning_rate": 4.8737343471505806e-05,
      "loss": 0.75,
      "num_input_tokens_seen": 1773576,
      "step": 280,
      "train_runtime": 10433.1959,
      "train_tokens_per_second": 169.994
    },
    {
      "epoch": 0.8472686733556298,
      "grad_norm": 0.4150141248761779,
      "learning_rate": 4.865741208989888e-05,
      "loss": 0.6366,
      "num_input_tokens_seen": 1807248,
      "step": 285,
      "train_runtime": 10621.0739,
      "train_tokens_per_second": 170.157
    },
    {
      "epoch": 0.8621330360460795,
      "grad_norm": 0.557076443271248,
      "learning_rate": 4.857509743053774e-05,
      "loss": 0.7992,
      "num_input_tokens_seen": 1836512,
      "step": 290,
      "train_runtime": 10790.2552,
      "train_tokens_per_second": 170.201
    },
    {
      "epoch": 0.8769973987365292,
      "grad_norm": 0.622394894811983,
      "learning_rate": 4.8490407785905756e-05,
      "loss": 0.7259,
      "num_input_tokens_seen": 1864768,
      "step": 295,
      "train_runtime": 10958.7213,
      "train_tokens_per_second": 170.163
    },
    {
      "epoch": 0.8918617614269788,
      "grad_norm": 0.32132112389269674,
      "learning_rate": 4.840335168774532e-05,
      "loss": 0.6569,
      "num_input_tokens_seen": 1897416,
      "step": 300,
      "train_runtime": 11132.0894,
      "train_tokens_per_second": 170.446
    },
    {
      "epoch": 0.9067261241174285,
      "grad_norm": 0.5699931721537229,
      "learning_rate": 4.8313937906198415e-05,
      "loss": 0.7109,
      "num_input_tokens_seen": 1925464,
      "step": 305,
      "train_runtime": 11302.0412,
      "train_tokens_per_second": 170.364
    },
    {
      "epoch": 0.9215904868078781,
      "grad_norm": 0.49216652434145286,
      "learning_rate": 4.822217544892298e-05,
      "loss": 0.6374,
      "num_input_tokens_seen": 1956064,
      "step": 310,
      "train_runtime": 11477.605,
      "train_tokens_per_second": 170.424
    },
    {
      "epoch": 0.9364548494983278,
      "grad_norm": 0.289252551788521,
      "learning_rate": 4.812807356018556e-05,
      "loss": 0.7321,
      "num_input_tokens_seen": 1986504,
      "step": 315,
      "train_runtime": 11660.7035,
      "train_tokens_per_second": 170.359
    },
    {
      "epoch": 0.9513192121887774,
      "grad_norm": 0.4471738070997589,
      "learning_rate": 4.803164171993001e-05,
      "loss": 0.6904,
      "num_input_tokens_seen": 2012976,
      "step": 320,
      "train_runtime": 11819.4269,
      "train_tokens_per_second": 170.311
    },
    {
      "epoch": 0.966183574879227,
      "grad_norm": 0.4988862477404023,
      "learning_rate": 4.793288964282244e-05,
      "loss": 0.8029,
      "num_input_tokens_seen": 2041512,
      "step": 325,
      "train_runtime": 11993.4775,
      "train_tokens_per_second": 170.219
    },
    {
      "epoch": 0.9810479375696767,
      "grad_norm": 0.4455586736978044,
      "learning_rate": 4.783182727727258e-05,
      "loss": 0.7208,
      "num_input_tokens_seen": 2065864,
      "step": 330,
      "train_runtime": 12150.1754,
      "train_tokens_per_second": 170.028
    },
    {
      "epoch": 0.9959123002601263,
      "grad_norm": 0.45313304199376386,
      "learning_rate": 4.772846480443154e-05,
      "loss": 0.6892,
      "num_input_tokens_seen": 2099192,
      "step": 335,
      "train_runtime": 12332.4171,
      "train_tokens_per_second": 170.217
    },
    {
      "epoch": 1.0089186176142697,
      "grad_norm": 0.34932689166112835,
      "learning_rate": 4.762281263716619e-05,
      "loss": 0.7558,
      "num_input_tokens_seen": 2125816,
      "step": 340,
      "train_runtime": 12484.2733,
      "train_tokens_per_second": 170.28
    },
    {
      "epoch": 1.0237829803047194,
      "grad_norm": 0.4092397826706536,
      "learning_rate": 4.751488141901009e-05,
      "loss": 0.5846,
      "num_input_tokens_seen": 2160160,
      "step": 345,
      "train_runtime": 12674.5489,
      "train_tokens_per_second": 170.433
    },
    {
      "epoch": 1.038647342995169,
      "grad_norm": 0.31168294599405477,
      "learning_rate": 4.740468202309132e-05,
      "loss": 0.5396,
      "num_input_tokens_seen": 2202880,
      "step": 350,
      "train_runtime": 12878.5359,
      "train_tokens_per_second": 171.05
    },
    {
      "epoch": 1.0535117056856187,
      "grad_norm": 0.4562243418713546,
      "learning_rate": 4.729222555103703e-05,
      "loss": 0.6491,
      "num_input_tokens_seen": 2232216,
      "step": 355,
      "train_runtime": 13048.5779,
      "train_tokens_per_second": 171.07
    },
    {
      "epoch": 1.0683760683760684,
      "grad_norm": 0.37853300240110366,
      "learning_rate": 4.717752333185511e-05,
      "loss": 0.5624,
      "num_input_tokens_seen": 2255520,
      "step": 360,
      "train_runtime": 13200.2201,
      "train_tokens_per_second": 170.87
    },
    {
      "epoch": 1.083240431066518,
      "grad_norm": 0.505365165589364,
      "learning_rate": 4.706058692079288e-05,
      "loss": 0.5882,
      "num_input_tokens_seen": 2295400,
      "step": 365,
      "train_runtime": 13416.4241,
      "train_tokens_per_second": 171.089
    },
    {
      "epoch": 1.0981047937569677,
      "grad_norm": 0.7984507974057846,
      "learning_rate": 4.6941428098172956e-05,
      "loss": 0.6382,
      "num_input_tokens_seen": 2322496,
      "step": 370,
      "train_runtime": 13576.7515,
      "train_tokens_per_second": 171.064
    },
    {
      "epoch": 1.1129691564474173,
      "grad_norm": 0.5722349779883125,
      "learning_rate": 4.682005886820656e-05,
      "loss": 0.6791,
      "num_input_tokens_seen": 2353904,
      "step": 375,
      "train_runtime": 13756.8344,
      "train_tokens_per_second": 171.108
    },
    {
      "epoch": 1.127833519137867,
      "grad_norm": 0.826155965272619,
      "learning_rate": 4.669649145778412e-05,
      "loss": 0.6277,
      "num_input_tokens_seen": 2389640,
      "step": 380,
      "train_runtime": 13955.4804,
      "train_tokens_per_second": 171.233
    },
    {
      "epoch": 1.1426978818283167,
      "grad_norm": 0.48208601176965793,
      "learning_rate": 4.657073831524358e-05,
      "loss": 0.5038,
      "num_input_tokens_seen": 2427872,
      "step": 385,
      "train_runtime": 14154.7296,
      "train_tokens_per_second": 171.524
    },
    {
      "epoch": 1.1575622445187663,
      "grad_norm": 0.862951567640207,
      "learning_rate": 4.644281210911631e-05,
      "loss": 0.6581,
      "num_input_tokens_seen": 2453984,
      "step": 390,
      "train_runtime": 14320.9438,
      "train_tokens_per_second": 171.356
    },
    {
      "epoch": 1.172426607209216,
      "grad_norm": 0.6446091778736193,
      "learning_rate": 4.631272572685086e-05,
      "loss": 0.5966,
      "num_input_tokens_seen": 2482696,
      "step": 395,
      "train_runtime": 14484.582,
      "train_tokens_per_second": 171.403
    },
    {
      "epoch": 1.1872909698996654,
      "grad_norm": 0.5940985307805492,
      "learning_rate": 4.618049227351467e-05,
      "loss": 0.6238,
      "num_input_tokens_seen": 2515024,
      "step": 400,
      "train_runtime": 14670.3277,
      "train_tokens_per_second": 171.436
    },
    {
      "epoch": 1.1872909698996654,
      "eval_loss": 0.8729309439659119,
      "eval_runtime": 355.7054,
      "eval_samples_per_second": 1.316,
      "eval_steps_per_second": 0.658,
      "num_input_tokens_seen": 2515024,
      "step": 400
    },
    {
      "epoch": 1.2021553325901153,
      "grad_norm": 0.42070342961969176,
      "learning_rate": 4.6046125070473854e-05,
      "loss": 0.6013,
      "num_input_tokens_seen": 2544048,
      "step": 405,
      "train_runtime": 15222.6559,
      "train_tokens_per_second": 167.122
    },
    {
      "epoch": 1.2170196952805648,
      "grad_norm": 0.5065404783086159,
      "learning_rate": 4.5909637654051194e-05,
      "loss": 0.6597,
      "num_input_tokens_seen": 2577080,
      "step": 410,
      "train_runtime": 15412.9664,
      "train_tokens_per_second": 167.202
    },
    {
      "epoch": 1.2318840579710144,
      "grad_norm": 0.7043622561243575,
      "learning_rate": 4.577104377416243e-05,
      "loss": 0.6459,
      "num_input_tokens_seen": 2605240,
      "step": 415,
      "train_runtime": 15585.0317,
      "train_tokens_per_second": 167.163
    },
    {
      "epoch": 1.246748420661464,
      "grad_norm": 0.5815684563195597,
      "learning_rate": 4.5630357392931136e-05,
      "loss": 0.5973,
      "num_input_tokens_seen": 2634016,
      "step": 420,
      "train_runtime": 15757.2057,
      "train_tokens_per_second": 167.163
    },
    {
      "epoch": 1.2616127833519137,
      "grad_norm": 0.5571563336020446,
      "learning_rate": 4.548759268328211e-05,
      "loss": 0.6341,
      "num_input_tokens_seen": 2663784,
      "step": 425,
      "train_runtime": 15930.8715,
      "train_tokens_per_second": 167.209
    },
    {
      "epoch": 1.2764771460423634,
      "grad_norm": 0.6460743117704272,
      "learning_rate": 4.534276402751361e-05,
      "loss": 0.651,
      "num_input_tokens_seen": 2697024,
      "step": 430,
      "train_runtime": 16122.3767,
      "train_tokens_per_second": 167.285
    },
    {
      "epoch": 1.291341508732813,
      "grad_norm": 0.6482175581649149,
      "learning_rate": 4.5195886015848454e-05,
      "loss": 0.6475,
      "num_input_tokens_seen": 2719400,
      "step": 435,
      "train_runtime": 16266.13,
      "train_tokens_per_second": 167.182
    },
    {
      "epoch": 1.3062058714232627,
      "grad_norm": 0.6094464333253732,
      "learning_rate": 4.5046973444964165e-05,
      "loss": 0.6373,
      "num_input_tokens_seen": 2751744,
      "step": 440,
      "train_runtime": 16434.9886,
      "train_tokens_per_second": 167.432
    },
    {
      "epoch": 1.3210702341137124,
      "grad_norm": 0.8126749512005293,
      "learning_rate": 4.4896041316502335e-05,
      "loss": 0.6434,
      "num_input_tokens_seen": 2791080,
      "step": 445,
      "train_runtime": 16641.4496,
      "train_tokens_per_second": 167.719
    },
    {
      "epoch": 1.335934596804162,
      "grad_norm": 0.5026965778291799,
      "learning_rate": 4.474310483555739e-05,
      "loss": 0.5877,
      "num_input_tokens_seen": 2823104,
      "step": 450,
      "train_runtime": 16826.3481,
      "train_tokens_per_second": 167.779
    },
    {
      "epoch": 1.3507989594946117,
      "grad_norm": 0.824948158879483,
      "learning_rate": 4.4588179409144734e-05,
      "loss": 0.6944,
      "num_input_tokens_seen": 2850480,
      "step": 455,
      "train_runtime": 16993.1957,
      "train_tokens_per_second": 167.742
    },
    {
      "epoch": 1.3656633221850614,
      "grad_norm": 0.6935578787290407,
      "learning_rate": 4.4431280644648676e-05,
      "loss": 0.6677,
      "num_input_tokens_seen": 2877920,
      "step": 460,
      "train_runtime": 17162.7566,
      "train_tokens_per_second": 167.684
    },
    {
      "epoch": 1.380527684875511,
      "grad_norm": 0.775256808439057,
      "learning_rate": 4.427242434825013e-05,
      "loss": 0.6185,
      "num_input_tokens_seen": 2910104,
      "step": 465,
      "train_runtime": 17349.4341,
      "train_tokens_per_second": 167.735
    },
    {
      "epoch": 1.3953920475659607,
      "grad_norm": 0.5290136444612117,
      "learning_rate": 4.4111626523334235e-05,
      "loss": 0.4792,
      "num_input_tokens_seen": 2943600,
      "step": 470,
      "train_runtime": 17535.2111,
      "train_tokens_per_second": 167.868
    },
    {
      "epoch": 1.4102564102564101,
      "grad_norm": 0.431851575535354,
      "learning_rate": 4.394890336887819e-05,
      "loss": 0.5479,
      "num_input_tokens_seen": 2983616,
      "step": 475,
      "train_runtime": 17735.6146,
      "train_tokens_per_second": 168.227
    },
    {
      "epoch": 1.42512077294686,
      "grad_norm": 0.707234404224925,
      "learning_rate": 4.378427127781935e-05,
      "loss": 0.6563,
      "num_input_tokens_seen": 3010896,
      "step": 480,
      "train_runtime": 17897.1321,
      "train_tokens_per_second": 168.233
    },
    {
      "epoch": 1.4399851356373095,
      "grad_norm": 0.5496990766753803,
      "learning_rate": 4.361774683540375e-05,
      "loss": 0.5839,
      "num_input_tokens_seen": 3040128,
      "step": 485,
      "train_runtime": 18074.4912,
      "train_tokens_per_second": 168.2
    },
    {
      "epoch": 1.4548494983277591,
      "grad_norm": 0.40637808524319596,
      "learning_rate": 4.34493468175153e-05,
      "loss": 0.5499,
      "num_input_tokens_seen": 3070192,
      "step": 490,
      "train_runtime": 18243.0065,
      "train_tokens_per_second": 168.294
    },
    {
      "epoch": 1.4697138610182088,
      "grad_norm": 0.5939003521421573,
      "learning_rate": 4.327908818898581e-05,
      "loss": 0.6602,
      "num_input_tokens_seen": 3100544,
      "step": 495,
      "train_runtime": 18419.7661,
      "train_tokens_per_second": 168.327
    },
    {
      "epoch": 1.4845782237086584,
      "grad_norm": 0.5699315589577931,
      "learning_rate": 4.3106988101885825e-05,
      "loss": 0.6172,
      "num_input_tokens_seen": 3134864,
      "step": 500,
      "train_runtime": 18605.4997,
      "train_tokens_per_second": 168.491
    },
    {
      "epoch": 1.499442586399108,
      "grad_norm": 0.5176180044018102,
      "learning_rate": 4.293306389379682e-05,
      "loss": 0.6932,
      "num_input_tokens_seen": 3164496,
      "step": 505,
      "train_runtime": 18782.3882,
      "train_tokens_per_second": 168.482
    },
    {
      "epoch": 1.5143069490895578,
      "grad_norm": 0.8125909060302873,
      "learning_rate": 4.275733308606452e-05,
      "loss": 0.6075,
      "num_input_tokens_seen": 3201592,
      "step": 510,
      "train_runtime": 18974.7401,
      "train_tokens_per_second": 168.729
    },
    {
      "epoch": 1.5291713117800074,
      "grad_norm": 0.4238803109744304,
      "learning_rate": 4.2579813382033764e-05,
      "loss": 0.6465,
      "num_input_tokens_seen": 3242824,
      "step": 515,
      "train_runtime": 19182.5699,
      "train_tokens_per_second": 169.051
    },
    {
      "epoch": 1.544035674470457,
      "grad_norm": 0.5568097005839904,
      "learning_rate": 4.240052266526512e-05,
      "loss": 0.6159,
      "num_input_tokens_seen": 3273600,
      "step": 520,
      "train_runtime": 19367.6233,
      "train_tokens_per_second": 169.024
    },
    {
      "epoch": 1.5589000371609067,
      "grad_norm": 0.5943741262587303,
      "learning_rate": 4.22194789977332e-05,
      "loss": 0.6439,
      "num_input_tokens_seen": 3308472,
      "step": 525,
      "train_runtime": 19562.3706,
      "train_tokens_per_second": 169.124
    },
    {
      "epoch": 1.5737643998513564,
      "grad_norm": 0.8079971411544303,
      "learning_rate": 4.203670061800712e-05,
      "loss": 0.6874,
      "num_input_tokens_seen": 3332568,
      "step": 530,
      "train_runtime": 19717.9765,
      "train_tokens_per_second": 169.012
    },
    {
      "epoch": 1.588628762541806,
      "grad_norm": 0.551762199364396,
      "learning_rate": 4.1852205939413104e-05,
      "loss": 0.6013,
      "num_input_tokens_seen": 3366712,
      "step": 535,
      "train_runtime": 19903.6186,
      "train_tokens_per_second": 169.151
    },
    {
      "epoch": 1.6034931252322555,
      "grad_norm": 0.5068985496527462,
      "learning_rate": 4.1666013548179496e-05,
      "loss": 0.6359,
      "num_input_tokens_seen": 3391608,
      "step": 540,
      "train_runtime": 20057.39,
      "train_tokens_per_second": 169.095
    },
    {
      "epoch": 1.6183574879227054,
      "grad_norm": 0.6855669469937492,
      "learning_rate": 4.147814220156437e-05,
      "loss": 0.6127,
      "num_input_tokens_seen": 3421400,
      "step": 545,
      "train_runtime": 20228.9847,
      "train_tokens_per_second": 169.134
    },
    {
      "epoch": 1.6332218506131548,
      "grad_norm": 0.4879299108867579,
      "learning_rate": 4.128861082596592e-05,
      "loss": 0.5612,
      "num_input_tokens_seen": 3455584,
      "step": 550,
      "train_runtime": 20414.7402,
      "train_tokens_per_second": 169.269
    },
    {
      "epoch": 1.6480862133036047,
      "grad_norm": 0.8129316983564234,
      "learning_rate": 4.109743851501573e-05,
      "loss": 0.6068,
      "num_input_tokens_seen": 3480392,
      "step": 555,
      "train_runtime": 20570.2335,
      "train_tokens_per_second": 169.196
    },
    {
      "epoch": 1.6629505759940542,
      "grad_norm": 0.5732379522903723,
      "learning_rate": 4.090464452765535e-05,
      "loss": 0.6331,
      "num_input_tokens_seen": 3515664,
      "step": 560,
      "train_runtime": 20754.2066,
      "train_tokens_per_second": 169.395
    },
    {
      "epoch": 1.677814938684504,
      "grad_norm": 0.8129257440384582,
      "learning_rate": 4.0710248286195994e-05,
      "loss": 0.6944,
      "num_input_tokens_seen": 3538880,
      "step": 565,
      "train_runtime": 20901.2916,
      "train_tokens_per_second": 169.314
    },
    {
      "epoch": 1.6926793013749535,
      "grad_norm": 0.5743351437424525,
      "learning_rate": 4.051426937436207e-05,
      "loss": 0.6007,
      "num_input_tokens_seen": 3576168,
      "step": 570,
      "train_runtime": 21091.5246,
      "train_tokens_per_second": 169.555
    },
    {
      "epoch": 1.7075436640654031,
      "grad_norm": 1.7481870694933204,
      "learning_rate": 4.0316727535318175e-05,
      "loss": 0.6374,
      "num_input_tokens_seen": 3606944,
      "step": 575,
      "train_runtime": 21266.3906,
      "train_tokens_per_second": 169.608
    },
    {
      "epoch": 1.7224080267558528,
      "grad_norm": 0.5813131410466363,
      "learning_rate": 4.0117642669680164e-05,
      "loss": 0.5499,
      "num_input_tokens_seen": 3637888,
      "step": 580,
      "train_runtime": 21440.8922,
      "train_tokens_per_second": 169.671
    },
    {
      "epoch": 1.7372723894463025,
      "grad_norm": 0.8484268316277586,
      "learning_rate": 3.991703483351039e-05,
      "loss": 0.6668,
      "num_input_tokens_seen": 3661216,
      "step": 585,
      "train_runtime": 21594.0452,
      "train_tokens_per_second": 169.547
    },
    {
      "epoch": 1.7521367521367521,
      "grad_norm": 0.34272148077746284,
      "learning_rate": 3.9714924236297155e-05,
      "loss": 0.6033,
      "num_input_tokens_seen": 3700064,
      "step": 590,
      "train_runtime": 21798.0768,
      "train_tokens_per_second": 169.743
    },
    {
      "epoch": 1.7670011148272018,
      "grad_norm": 0.588668446023156,
      "learning_rate": 3.9511331238918837e-05,
      "loss": 0.5136,
      "num_input_tokens_seen": 3730992,
      "step": 595,
      "train_runtime": 21979.8348,
      "train_tokens_per_second": 169.746
    },
    {
      "epoch": 1.7818654775176515,
      "grad_norm": 0.5461111827190066,
      "learning_rate": 3.9306276351592685e-05,
      "loss": 0.5867,
      "num_input_tokens_seen": 3766176,
      "step": 600,
      "train_runtime": 22165.8728,
      "train_tokens_per_second": 169.909
    },
    {
      "epoch": 1.7818654775176515,
      "eval_loss": 0.8493290543556213,
      "eval_runtime": 354.1156,
      "eval_samples_per_second": 1.322,
      "eval_steps_per_second": 0.661,
      "num_input_tokens_seen": 3766176,
      "step": 600
    }
  ],
  "logging_steps": 5,
  "max_steps": 1685,
  "num_input_tokens_seen": 3766176,
  "num_train_epochs": 5,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 166671066071040.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}