diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.037392884964944, + "eval_steps": 1000, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 775.203125, + "epoch": 0.002077382498052454, + "grad_norm": 0.16490910947322845, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.0, + "reward": 0.2167968787252903, + "reward_std": 0.11324757407419384, + "rewards/argmax_reward_func": 0.0625, + "rewards/format_reward_func": 0.154296875, + "step": 1 + }, + { + "completion_length": 820.609375, + "epoch": 0.004154764996104908, + "grad_norm": 0.15733271837234497, + "kl": 0.0, + "learning_rate": 2e-05, + "loss": 0.0, + "reward": 0.1472656298428774, + "reward_std": 0.020439805870410055, + "rewards/argmax_reward_func": 0.0, + "rewards/format_reward_func": 0.14726562798023224, + "step": 2 + }, + { + "completion_length": 901.25, + "epoch": 0.006232147494157362, + "grad_norm": 0.14691142737865448, + "kl": 0.0010660013067536056, + "learning_rate": 4e-05, + "loss": 0.0, + "reward": 0.20703125, + "reward_std": 0.11269514623563737, + "rewards/argmax_reward_func": 0.0625, + "rewards/format_reward_func": 0.14453125, + "step": 3 + }, + { + "completion_length": 873.015625, + "epoch": 0.008309529992209816, + "grad_norm": 0.14998185634613037, + "kl": 0.0019050340633839369, + "learning_rate": 6e-05, + "loss": 0.0, + "reward": 0.2011718824505806, + "reward_std": 0.09004563023336232, + "rewards/argmax_reward_func": 0.046875, + "rewards/format_reward_func": 0.1542968787252903, + "step": 4 + }, + { + "completion_length": 870.546875, + "epoch": 0.01038691249026227, + "grad_norm": 0.1567591279745102, + "kl": 0.005349995743017644, + "learning_rate": 8e-05, + "loss": 0.0, + "reward": 0.2285156361758709, + "reward_std": 0.1110378596931696, + "rewards/argmax_reward_func": 0.0625, + "rewards/format_reward_func": 0.1660156287252903, + "step": 5 + }, + { + "completion_length": 849.125, + "epoch": 0.012464294988314724, + "grad_norm": 0.10938515514135361, + "kl": 0.01296996301971376, + "learning_rate": 0.0001, + "loss": 0.0, + "reward": 0.24414063058793545, + "reward_std": 0.0999893163680099, + "rewards/argmax_reward_func": 0.0625, + "rewards/format_reward_func": 0.1816406324505806, + "step": 6 + }, + { + "completion_length": 901.015625, + "epoch": 0.014541677486367177, + "grad_norm": 0.12581659853458405, + "kl": 0.02171943092253059, + "learning_rate": 9.999973058889791e-05, + "loss": 0.0, + "reward": 0.2585937548428774, + "reward_std": 0.12816310487687588, + "rewards/argmax_reward_func": 0.078125, + "rewards/format_reward_func": 0.18046875298023224, + "step": 7 + }, + { + "completion_length": 916.671875, + "epoch": 0.01661905998441963, + "grad_norm": 0.12178487330675125, + "kl": 0.04081101668998599, + "learning_rate": 9.999892235849491e-05, + "loss": 0.0, + "reward": 0.3437500111758709, + "reward_std": 0.1900349531788379, + "rewards/argmax_reward_func": 0.15625, + "rewards/format_reward_func": 0.1875, + "step": 8 + }, + { + "completion_length": 803.90625, + "epoch": 0.018696442482472084, + "grad_norm": 0.12499672174453735, + "kl": 0.06826442573219538, + "learning_rate": 9.999757531750085e-05, + "loss": 0.0, + "reward": 0.45625001564621925, + "reward_std": 0.25411650398746133, + "rewards/argmax_reward_func": 0.265625, + "rewards/format_reward_func": 0.1906250026077032, + "step": 9 + }, + { + "completion_length": 953.875, + "epoch": 0.02077382498052454, + "grad_norm": 0.11061865091323853, + "kl": 0.06516677932813764, + "learning_rate": 9.999568948043205e-05, + "loss": 0.0, + "reward": 0.3804687615483999, + "reward_std": 0.23091456340625882, + "rewards/argmax_reward_func": 0.1875, + "rewards/format_reward_func": 0.19296875037252903, + "step": 10 + }, + { + "completion_length": 824.546875, + "epoch": 0.022851207478576992, + "grad_norm": 0.10025237500667572, + "kl": 0.10202133795246482, + "learning_rate": 9.999326486761114e-05, + "loss": 0.0001, + "reward": 0.4562500212341547, + "reward_std": 0.203293202444911, + "rewards/argmax_reward_func": 0.265625, + "rewards/format_reward_func": 0.1906250026077032, + "step": 11 + }, + { + "completion_length": 925.234375, + "epoch": 0.02492858997662945, + "grad_norm": 0.12423845380544662, + "kl": 0.14641187246888876, + "learning_rate": 9.99903015051668e-05, + "loss": 0.0001, + "reward": 0.6261719018220901, + "reward_std": 0.22925727342953905, + "rewards/argmax_reward_func": 0.4375, + "rewards/format_reward_func": 0.1886718738824129, + "step": 12 + }, + { + "completion_length": 810.546875, + "epoch": 0.0270059724746819, + "grad_norm": 0.1263190507888794, + "kl": 0.23557536769658327, + "learning_rate": 9.998679942503358e-05, + "loss": 0.0001, + "reward": 0.5953125320374966, + "reward_std": 0.2717941626906395, + "rewards/argmax_reward_func": 0.40625, + "rewards/format_reward_func": 0.18906250409781933, + "step": 13 + }, + { + "completion_length": 738.953125, + "epoch": 0.029083354972734354, + "grad_norm": 0.09476204961538315, + "kl": 0.29587008990347385, + "learning_rate": 9.998275866495138e-05, + "loss": 0.0001, + "reward": 0.7289062887430191, + "reward_std": 0.18009126000106335, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.19765625335276127, + "step": 14 + }, + { + "completion_length": 699.109375, + "epoch": 0.03116073747078681, + "grad_norm": 0.15413929522037506, + "kl": 0.2644388508051634, + "learning_rate": 9.997817926846529e-05, + "loss": 0.0001, + "reward": 0.6968750357627869, + "reward_std": 0.4021669775247574, + "rewards/argmax_reward_func": 0.5, + "rewards/format_reward_func": 0.19687500223517418, + "step": 15 + }, + { + "completion_length": 716.4375, + "epoch": 0.03323811996883926, + "grad_norm": 0.13675570487976074, + "kl": 0.41714945435523987, + "learning_rate": 9.99730612849249e-05, + "loss": 0.0002, + "reward": 0.6320312805473804, + "reward_std": 0.27289901627227664, + "rewards/argmax_reward_func": 0.4375, + "rewards/format_reward_func": 0.19453125074505806, + "step": 16 + }, + { + "completion_length": 688.609375, + "epoch": 0.03531550246689172, + "grad_norm": 0.14246560633182526, + "kl": 0.35029047913849354, + "learning_rate": 9.996740476948385e-05, + "loss": 0.0002, + "reward": 0.6304687857627869, + "reward_std": 0.31930290907621384, + "rewards/argmax_reward_func": 0.4375, + "rewards/format_reward_func": 0.19296875223517418, + "step": 17 + }, + { + "completion_length": 630.828125, + "epoch": 0.03739288496494417, + "grad_norm": 2.1836376190185547, + "kl": 10.275608837604523, + "learning_rate": 9.996120978309931e-05, + "loss": 0.0051, + "reward": 0.5742187947034836, + "reward_std": 0.39885240606963634, + "rewards/argmax_reward_func": 0.375, + "rewards/format_reward_func": 0.1992187537252903, + "step": 18 + }, + { + "completion_length": 647.890625, + "epoch": 0.039470267462996624, + "grad_norm": 0.1236676499247551, + "kl": 0.366399560123682, + "learning_rate": 9.995447639253115e-05, + "loss": 0.0002, + "reward": 0.7765625417232513, + "reward_std": 0.2894718423485756, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.19843750074505806, + "step": 19 + }, + { + "completion_length": 567.578125, + "epoch": 0.04154764996104908, + "grad_norm": 0.12248539924621582, + "kl": 0.28800770081579685, + "learning_rate": 9.994720467034142e-05, + "loss": 0.0001, + "reward": 0.807812537997961, + "reward_std": 0.24527766928076744, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.1984375026077032, + "step": 20 + }, + { + "completion_length": 538.796875, + "epoch": 0.04362503245910153, + "grad_norm": 0.14135704934597015, + "kl": 0.5838185884058475, + "learning_rate": 9.993939469489342e-05, + "loss": 0.0003, + "reward": 0.6835937947034836, + "reward_std": 0.24417280592024326, + "rewards/argmax_reward_func": 0.484375, + "rewards/format_reward_func": 0.19921875186264515, + "step": 21 + }, + { + "completion_length": 534.40625, + "epoch": 0.045702414957153985, + "grad_norm": 0.16315752267837524, + "kl": 0.37346063926815987, + "learning_rate": 9.993104655035088e-05, + "loss": 0.0002, + "reward": 0.6835937909781933, + "reward_std": 0.37675532698631287, + "rewards/argmax_reward_func": 0.484375, + "rewards/format_reward_func": 0.1992187537252903, + "step": 22 + }, + { + "completion_length": 583.4375, + "epoch": 0.04777979745520644, + "grad_norm": 0.13479600846767426, + "kl": 0.5061899088323116, + "learning_rate": 9.992216032667716e-05, + "loss": 0.0003, + "reward": 0.5878906548023224, + "reward_std": 0.2911291141062975, + "rewards/argmax_reward_func": 0.390625, + "rewards/format_reward_func": 0.19726562686264515, + "step": 23 + }, + { + "completion_length": 532.203125, + "epoch": 0.0498571799532589, + "grad_norm": 0.12497097253799438, + "kl": 0.5098075568675995, + "learning_rate": 9.991273611963412e-05, + "loss": 0.0003, + "reward": 0.8250000476837158, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 24 + }, + { + "completion_length": 503.4375, + "epoch": 0.051934562451311346, + "grad_norm": 0.153394415974617, + "kl": 0.35232703387737274, + "learning_rate": 9.990277403078122e-05, + "loss": 0.0002, + "reward": 0.7156250439584255, + "reward_std": 0.3314562924206257, + "rewards/argmax_reward_func": 0.515625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 25 + }, + { + "completion_length": 491.25, + "epoch": 0.0540119449493638, + "grad_norm": 0.15910868346691132, + "kl": 0.41421468555927277, + "learning_rate": 9.989227416747434e-05, + "loss": 0.0002, + "reward": 0.7625000476837158, + "reward_std": 0.35355337895452976, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 26 + }, + { + "completion_length": 485.390625, + "epoch": 0.05608932744741626, + "grad_norm": 34.968101501464844, + "kl": 504.1205723620951, + "learning_rate": 9.988123664286469e-05, + "loss": 0.2521, + "reward": 0.6375000439584255, + "reward_std": 0.39774755388498306, + "rewards/argmax_reward_func": 0.4375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 27 + }, + { + "completion_length": 543.859375, + "epoch": 0.05816670994546871, + "grad_norm": 0.1266699880361557, + "kl": 0.3936588950455189, + "learning_rate": 9.98696615758975e-05, + "loss": 0.0002, + "reward": 0.7296875417232513, + "reward_std": 0.2673747483640909, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.1984375026077032, + "step": 28 + }, + { + "completion_length": 520.84375, + "epoch": 0.06024409244352116, + "grad_norm": 0.1384185552597046, + "kl": 0.41820336878299713, + "learning_rate": 9.985754909131085e-05, + "loss": 0.0002, + "reward": 0.6523437947034836, + "reward_std": 0.2883669789880514, + "rewards/argmax_reward_func": 0.453125, + "rewards/format_reward_func": 0.19921875186264515, + "step": 29 + }, + { + "completion_length": 566.515625, + "epoch": 0.06232147494157362, + "grad_norm": 0.12411382049322128, + "kl": 0.37708618491888046, + "learning_rate": 9.984489931963428e-05, + "loss": 0.0002, + "reward": 0.7304687909781933, + "reward_std": 0.26626989617943764, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.1992187537252903, + "step": 30 + }, + { + "completion_length": 557.71875, + "epoch": 0.06439885743962608, + "grad_norm": 0.1417299211025238, + "kl": 0.5239567384123802, + "learning_rate": 9.98317123971873e-05, + "loss": 0.0003, + "reward": 0.5867187865078449, + "reward_std": 0.33698057383298874, + "rewards/argmax_reward_func": 0.390625, + "rewards/format_reward_func": 0.19609375298023224, + "step": 31 + }, + { + "completion_length": 571.46875, + "epoch": 0.06647623993767852, + "grad_norm": 0.12581190466880798, + "kl": 0.3905966766178608, + "learning_rate": 9.981798846607808e-05, + "loss": 0.0002, + "reward": 0.8238281756639481, + "reward_std": 0.31101649068295956, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.19882812909781933, + "step": 32 + }, + { + "completion_length": 617.171875, + "epoch": 0.06855362243573097, + "grad_norm": 0.119756318628788, + "kl": 0.37264879420399666, + "learning_rate": 9.980372767420177e-05, + "loss": 0.0002, + "reward": 0.6210937835276127, + "reward_std": 0.2883669789880514, + "rewards/argmax_reward_func": 0.421875, + "rewards/format_reward_func": 0.1992187537252903, + "step": 33 + }, + { + "completion_length": 604.265625, + "epoch": 0.07063100493378344, + "grad_norm": 0.12877410650253296, + "kl": 0.4410099685192108, + "learning_rate": 9.978893017523903e-05, + "loss": 0.0002, + "reward": 0.6687500476837158, + "reward_std": 0.3535533808171749, + "rewards/argmax_reward_func": 0.46875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 34 + }, + { + "completion_length": 664.875, + "epoch": 0.07270838743183589, + "grad_norm": 0.10366171598434448, + "kl": 0.5219907499849796, + "learning_rate": 9.977359612865423e-05, + "loss": 0.0003, + "reward": 0.726562537252903, + "reward_std": 0.26958445459604263, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.1953125037252903, + "step": 35 + }, + { + "completion_length": 658.515625, + "epoch": 0.07478576992988833, + "grad_norm": 0.1170380637049675, + "kl": 0.46299856156110764, + "learning_rate": 9.97577256996939e-05, + "loss": 0.0002, + "reward": 0.8226562887430191, + "reward_std": 0.3126737759448588, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.19765625335276127, + "step": 36 + }, + { + "completion_length": 621.890625, + "epoch": 0.0768631524279408, + "grad_norm": 0.09539435803890228, + "kl": 0.39572376012802124, + "learning_rate": 9.974131905938483e-05, + "loss": 0.0002, + "reward": 0.851562537252903, + "reward_std": 0.18119611439760774, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.1953125037252903, + "step": 37 + }, + { + "completion_length": 616.671875, + "epoch": 0.07894053492599325, + "grad_norm": 0.1066877692937851, + "kl": 0.3970871977508068, + "learning_rate": 9.972437638453227e-05, + "loss": 0.0002, + "reward": 0.5734375342726707, + "reward_std": 0.2673747483640909, + "rewards/argmax_reward_func": 0.375, + "rewards/format_reward_func": 0.1984375026077032, + "step": 38 + }, + { + "completion_length": 620.515625, + "epoch": 0.0810179174240457, + "grad_norm": 0.09872303903102875, + "kl": 0.4494887478649616, + "learning_rate": 9.970689785771798e-05, + "loss": 0.0002, + "reward": 0.8539062887430191, + "reward_std": 0.22428543493151665, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.19765625521540642, + "step": 39 + }, + { + "completion_length": 646.25, + "epoch": 0.08309529992209816, + "grad_norm": 0.10916193574666977, + "kl": 0.3997967578470707, + "learning_rate": 9.968888366729835e-05, + "loss": 0.0002, + "reward": 0.7867187820374966, + "reward_std": 0.31156892515718937, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.19296875223517418, + "step": 40 + }, + { + "completion_length": 636.578125, + "epoch": 0.08517268242015061, + "grad_norm": 0.109690822660923, + "kl": 0.4680747017264366, + "learning_rate": 9.967033400740227e-05, + "loss": 0.0002, + "reward": 0.7125000357627869, + "reward_std": 0.2916815411299467, + "rewards/argmax_reward_func": 0.515625, + "rewards/format_reward_func": 0.19687500409781933, + "step": 41 + }, + { + "completion_length": 633.8125, + "epoch": 0.08725006491820306, + "grad_norm": 0.11978733539581299, + "kl": 0.42677244916558266, + "learning_rate": 9.965124907792915e-05, + "loss": 0.0002, + "reward": 0.6804687902331352, + "reward_std": 0.3369805682450533, + "rewards/argmax_reward_func": 0.484375, + "rewards/format_reward_func": 0.19609375298023224, + "step": 42 + }, + { + "completion_length": 641.328125, + "epoch": 0.08932744741625552, + "grad_norm": 0.10323718935251236, + "kl": 0.4870793893933296, + "learning_rate": 9.963162908454664e-05, + "loss": 0.0002, + "reward": 0.6820312812924385, + "reward_std": 0.2905766926705837, + "rewards/argmax_reward_func": 0.484375, + "rewards/format_reward_func": 0.19765625335276127, + "step": 43 + }, + { + "completion_length": 672.90625, + "epoch": 0.09140482991430797, + "grad_norm": 0.10177203267812729, + "kl": 0.4337821826338768, + "learning_rate": 9.96114742386885e-05, + "loss": 0.0002, + "reward": 0.7445312850177288, + "reward_std": 0.3347708657383919, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.19765625335276127, + "step": 44 + }, + { + "completion_length": 639.203125, + "epoch": 0.09348221241236043, + "grad_norm": 0.07864588499069214, + "kl": 0.6409419141709805, + "learning_rate": 9.95907847575523e-05, + "loss": 0.0003, + "reward": 0.6367187835276127, + "reward_std": 0.17788154818117619, + "rewards/argmax_reward_func": 0.4375, + "rewards/format_reward_func": 0.1992187537252903, + "step": 45 + }, + { + "completion_length": 662.859375, + "epoch": 0.09555959491041288, + "grad_norm": 0.1133044883608818, + "kl": 0.49651604518294334, + "learning_rate": 9.95695608640971e-05, + "loss": 0.0002, + "reward": 0.6664062887430191, + "reward_std": 0.31267377361655235, + "rewards/argmax_reward_func": 0.46875, + "rewards/format_reward_func": 0.19765624962747097, + "step": 46 + }, + { + "completion_length": 615.1875, + "epoch": 0.09763697740846533, + "grad_norm": 0.09465198963880539, + "kl": 0.42672090977430344, + "learning_rate": 9.954780278704097e-05, + "loss": 0.0002, + "reward": 0.8250000476837158, + "reward_std": 0.2651650384068489, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 47 + }, + { + "completion_length": 624.234375, + "epoch": 0.0997143599065178, + "grad_norm": 0.12003368884325027, + "kl": 0.45439790561795235, + "learning_rate": 9.952551076085864e-05, + "loss": 0.0002, + "reward": 0.6531250402331352, + "reward_std": 0.375650467351079, + "rewards/argmax_reward_func": 0.453125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 48 + }, + { + "completion_length": 657.6875, + "epoch": 0.10179174240457024, + "grad_norm": 0.10603732615709305, + "kl": 0.4523283280432224, + "learning_rate": 9.950268502577884e-05, + "loss": 0.0002, + "reward": 0.823437537997961, + "reward_std": 0.3115689232945442, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.19843750074505806, + "step": 49 + }, + { + "completion_length": 611.640625, + "epoch": 0.10386912490262269, + "grad_norm": 0.11261381953954697, + "kl": 0.5920008532702923, + "learning_rate": 9.947932582778188e-05, + "loss": 0.0003, + "reward": 0.7765625417232513, + "reward_std": 0.33366600796580315, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.1984375026077032, + "step": 50 + }, + { + "completion_length": 662.359375, + "epoch": 0.10594650740067516, + "grad_norm": 0.11194069683551788, + "kl": 0.40367136895656586, + "learning_rate": 9.94554334185968e-05, + "loss": 0.0002, + "reward": 0.6375000365078449, + "reward_std": 0.35355338267982006, + "rewards/argmax_reward_func": 0.4375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 51 + }, + { + "completion_length": 686.265625, + "epoch": 0.1080238898987276, + "grad_norm": 0.10016939043998718, + "kl": 0.429857462644577, + "learning_rate": 9.943100805569887e-05, + "loss": 0.0002, + "reward": 0.7468750476837158, + "reward_std": 0.33145629055798054, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 52 + }, + { + "completion_length": 613.890625, + "epoch": 0.11010127239678005, + "grad_norm": 0.11434896290302277, + "kl": 0.41631242260336876, + "learning_rate": 9.94060500023066e-05, + "loss": 0.0002, + "reward": 0.620312537997961, + "reward_std": 0.333666006103158, + "rewards/argmax_reward_func": 0.421875, + "rewards/format_reward_func": 0.1984375026077032, + "step": 53 + }, + { + "completion_length": 671.75, + "epoch": 0.11217865489483252, + "grad_norm": 0.0824907198548317, + "kl": 0.4247642531991005, + "learning_rate": 9.938055952737907e-05, + "loss": 0.0002, + "reward": 0.7156250476837158, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.515625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 54 + }, + { + "completion_length": 711.078125, + "epoch": 0.11425603739288497, + "grad_norm": 0.09910566359758377, + "kl": 0.46975456923246384, + "learning_rate": 9.935453690561297e-05, + "loss": 0.0002, + "reward": 0.7906250469386578, + "reward_std": 0.3137786239385605, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.19687500223517418, + "step": 55 + }, + { + "completion_length": 659.5625, + "epoch": 0.11633341989093741, + "grad_norm": 0.0940733402967453, + "kl": 0.42034388333559036, + "learning_rate": 9.932798241743961e-05, + "loss": 0.0002, + "reward": 0.8093750439584255, + "reward_std": 0.2872621212154627, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 56 + }, + { + "completion_length": 694.796875, + "epoch": 0.11841080238898988, + "grad_norm": 0.20298048853874207, + "kl": 1.01119814068079, + "learning_rate": 9.930089634902197e-05, + "loss": 0.0005, + "reward": 0.714062537997961, + "reward_std": 0.28947182931005955, + "rewards/argmax_reward_func": 0.515625, + "rewards/format_reward_func": 0.19843750447034836, + "step": 57 + }, + { + "completion_length": 679.421875, + "epoch": 0.12048818488704233, + "grad_norm": 0.10724397003650665, + "kl": 0.45981432124972343, + "learning_rate": 9.927327899225151e-05, + "loss": 0.0002, + "reward": 0.7156250476837158, + "reward_std": 0.375650467351079, + "rewards/argmax_reward_func": 0.515625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 58 + }, + { + "completion_length": 666.390625, + "epoch": 0.12256556738509478, + "grad_norm": 0.09369952231645584, + "kl": 0.5710588954389095, + "learning_rate": 9.924513064474519e-05, + "loss": 0.0003, + "reward": 0.8085937909781933, + "reward_std": 0.24417280405759811, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.19921875186264515, + "step": 59 + }, + { + "completion_length": 667.3125, + "epoch": 0.12464294988314724, + "grad_norm": 0.10410826653242111, + "kl": 0.6697803623974323, + "learning_rate": 9.921645160984206e-05, + "loss": 0.0003, + "reward": 0.7625000476837158, + "reward_std": 0.35355337895452976, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 60 + }, + { + "completion_length": 694.453125, + "epoch": 0.1267203323811997, + "grad_norm": 0.10938889533281326, + "kl": 0.42841707170009613, + "learning_rate": 9.918724219660013e-05, + "loss": 0.0002, + "reward": 0.7781250476837158, + "reward_std": 0.3756504636257887, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 61 + }, + { + "completion_length": 767.796875, + "epoch": 0.12879771487925215, + "grad_norm": 0.08572812378406525, + "kl": 0.42277197539806366, + "learning_rate": 9.915750271979305e-05, + "loss": 0.0002, + "reward": 0.6843750402331352, + "reward_std": 0.28726212307810783, + "rewards/argmax_reward_func": 0.484375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 62 + }, + { + "completion_length": 746.703125, + "epoch": 0.1308750973773046, + "grad_norm": 0.08672405034303665, + "kl": 0.4743144288659096, + "learning_rate": 9.91272334999066e-05, + "loss": 0.0002, + "reward": 0.7468750439584255, + "reward_std": 0.2872621212154627, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 63 + }, + { + "completion_length": 785.140625, + "epoch": 0.13295247987535705, + "grad_norm": 0.07892299443483353, + "kl": 0.5303685143589973, + "learning_rate": 9.909643486313533e-05, + "loss": 0.0003, + "reward": 0.7312500402331352, + "reward_std": 0.26516503654420376, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 64 + }, + { + "completion_length": 816.015625, + "epoch": 0.1350298623734095, + "grad_norm": 0.071454256772995, + "kl": 0.39743437245488167, + "learning_rate": 9.906510714137905e-05, + "loss": 0.0002, + "reward": 0.6218750402331352, + "reward_std": 0.24306794628500938, + "rewards/argmax_reward_func": 0.421875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 65 + }, + { + "completion_length": 836.4375, + "epoch": 0.13710724487146195, + "grad_norm": 0.08313830941915512, + "kl": 0.3903077654540539, + "learning_rate": 9.903325067223919e-05, + "loss": 0.0002, + "reward": 0.6367187909781933, + "reward_std": 0.31046406738460064, + "rewards/argmax_reward_func": 0.4375, + "rewards/format_reward_func": 0.1992187537252903, + "step": 66 + }, + { + "completion_length": 787.484375, + "epoch": 0.13918462736951442, + "grad_norm": 0.08504212647676468, + "kl": 0.5619952343404293, + "learning_rate": 9.90008657990152e-05, + "loss": 0.0003, + "reward": 0.7464844211935997, + "reward_std": 0.28781455010175705, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.19960937649011612, + "step": 67 + }, + { + "completion_length": 807.921875, + "epoch": 0.14126200986756687, + "grad_norm": 0.08398205786943436, + "kl": 0.47908810153603554, + "learning_rate": 9.896795287070086e-05, + "loss": 0.0002, + "reward": 0.7468750476837158, + "reward_std": 0.331456296145916, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 68 + }, + { + "completion_length": 837.234375, + "epoch": 0.14333939236561932, + "grad_norm": 0.054244451224803925, + "kl": 0.39820099994540215, + "learning_rate": 9.893451224198052e-05, + "loss": 0.0002, + "reward": 0.8406250476837158, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 69 + }, + { + "completion_length": 910.0625, + "epoch": 0.14541677486367177, + "grad_norm": 0.08078251034021378, + "kl": 0.4756108485162258, + "learning_rate": 9.890054427322521e-05, + "loss": 0.0002, + "reward": 0.7781250439584255, + "reward_std": 0.331456296145916, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 70 + }, + { + "completion_length": 867.84375, + "epoch": 0.14749415736172422, + "grad_norm": 0.08043571561574936, + "kl": 0.3970469869673252, + "learning_rate": 9.886604933048888e-05, + "loss": 0.0002, + "reward": 0.6679687947034836, + "reward_std": 0.3104640601668507, + "rewards/argmax_reward_func": 0.46875, + "rewards/format_reward_func": 0.1992187537252903, + "step": 71 + }, + { + "completion_length": 882.265625, + "epoch": 0.14957153985977667, + "grad_norm": 0.09208390861749649, + "kl": 0.40190327540040016, + "learning_rate": 9.883102778550434e-05, + "loss": 0.0002, + "reward": 0.8562500476837158, + "reward_std": 0.3977475520223379, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 72 + }, + { + "completion_length": 889.78125, + "epoch": 0.15164892235782915, + "grad_norm": 0.09202940762042999, + "kl": 0.38338571041822433, + "learning_rate": 9.879548001567931e-05, + "loss": 0.0002, + "reward": 0.7000000476837158, + "reward_std": 0.4419417232275009, + "rewards/argmax_reward_func": 0.5, + "rewards/format_reward_func": 0.20000000298023224, + "step": 73 + }, + { + "completion_length": 942.609375, + "epoch": 0.1537263048558816, + "grad_norm": 0.06312800943851471, + "kl": 0.4080694951117039, + "learning_rate": 9.875940640409234e-05, + "loss": 0.0002, + "reward": 0.5750000402331352, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 74 + }, + { + "completion_length": 948.859375, + "epoch": 0.15580368735393405, + "grad_norm": 0.0712570995092392, + "kl": 0.4405221752822399, + "learning_rate": 9.872280733948867e-05, + "loss": 0.0002, + "reward": 0.8085937947034836, + "reward_std": 0.2883669827133417, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.19921875186264515, + "step": 75 + }, + { + "completion_length": 1053.21875, + "epoch": 0.1578810698519865, + "grad_norm": 0.05858299508690834, + "kl": 0.4397047348320484, + "learning_rate": 9.868568321627611e-05, + "loss": 0.0002, + "reward": 0.7000000383704901, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.5, + "rewards/format_reward_func": 0.20000000298023224, + "step": 76 + }, + { + "completion_length": 1019.96875, + "epoch": 0.15995845235003894, + "grad_norm": 0.07670939713716507, + "kl": 0.40835118666291237, + "learning_rate": 9.86480344345207e-05, + "loss": 0.0002, + "reward": 0.7781250439584255, + "reward_std": 0.33145629800856113, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 77 + }, + { + "completion_length": 1075.15625, + "epoch": 0.1620358348480914, + "grad_norm": 0.06651510298252106, + "kl": 0.42486657947301865, + "learning_rate": 9.860986139994239e-05, + "loss": 0.0002, + "reward": 0.8406250476837158, + "reward_std": 0.28726211935281754, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 78 + }, + { + "completion_length": 1096.828125, + "epoch": 0.16411321734614387, + "grad_norm": 0.06264790147542953, + "kl": 0.3813174143433571, + "learning_rate": 9.857116452391079e-05, + "loss": 0.0002, + "reward": 0.8875000476837158, + "reward_std": 0.2209708634763956, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 79 + }, + { + "completion_length": 1159.390625, + "epoch": 0.16619059984419632, + "grad_norm": 0.06721258908510208, + "kl": 0.41810835897922516, + "learning_rate": 9.85319442234406e-05, + "loss": 0.0002, + "reward": 0.7617187947034836, + "reward_std": 0.3104640692472458, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.1992187537252903, + "step": 80 + }, + { + "completion_length": 1207.40625, + "epoch": 0.16826798234224877, + "grad_norm": 0.07961631566286087, + "kl": 0.353565227240324, + "learning_rate": 9.84922009211872e-05, + "loss": 0.0002, + "reward": 0.8250000476837158, + "reward_std": 0.4419417269527912, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 81 + }, + { + "completion_length": 1267.5625, + "epoch": 0.17034536484030122, + "grad_norm": 0.06159353628754616, + "kl": 0.3608316369354725, + "learning_rate": 9.845193504544209e-05, + "loss": 0.0002, + "reward": 0.6218750365078449, + "reward_std": 0.24306795187294483, + "rewards/argmax_reward_func": 0.421875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 82 + }, + { + "completion_length": 1271.1875, + "epoch": 0.17242274733835367, + "grad_norm": 0.0616268515586853, + "kl": 0.3721548244357109, + "learning_rate": 9.841114703012817e-05, + "loss": 0.0002, + "reward": 0.7613281682133675, + "reward_std": 0.26682231575250626, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.19882812723517418, + "step": 83 + }, + { + "completion_length": 1197.84375, + "epoch": 0.17450012983640611, + "grad_norm": 0.06743966042995453, + "kl": 0.46105678752064705, + "learning_rate": 9.836983731479525e-05, + "loss": 0.0002, + "reward": 0.7625000476837158, + "reward_std": 0.30935920774936676, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 84 + }, + { + "completion_length": 1239.859375, + "epoch": 0.1765775123344586, + "grad_norm": 0.07362944632768631, + "kl": 0.35114892572164536, + "learning_rate": 9.832800634461518e-05, + "loss": 0.0002, + "reward": 0.6828125417232513, + "reward_std": 0.3336660098284483, + "rewards/argmax_reward_func": 0.484375, + "rewards/format_reward_func": 0.19843750074505806, + "step": 85 + }, + { + "completion_length": 1253.21875, + "epoch": 0.17865489483251104, + "grad_norm": 0.060973405838012695, + "kl": 0.3400215059518814, + "learning_rate": 9.828565457037703e-05, + "loss": 0.0002, + "reward": 0.7613281719386578, + "reward_std": 0.2668223213404417, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.19882812723517418, + "step": 86 + }, + { + "completion_length": 1251.828125, + "epoch": 0.1807322773305635, + "grad_norm": 0.06071100011467934, + "kl": 0.3388819098472595, + "learning_rate": 9.824278244848235e-05, + "loss": 0.0002, + "reward": 0.6843750402331352, + "reward_std": 0.28726212307810783, + "rewards/argmax_reward_func": 0.484375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 87 + }, + { + "completion_length": 1177.9375, + "epoch": 0.18280965982861594, + "grad_norm": 0.07785635441541672, + "kl": 0.39376673474907875, + "learning_rate": 9.819939044094016e-05, + "loss": 0.0002, + "reward": 0.6687500476837158, + "reward_std": 0.3977475520223379, + "rewards/argmax_reward_func": 0.46875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 88 + }, + { + "completion_length": 1312.515625, + "epoch": 0.1848870423266684, + "grad_norm": 0.06982032209634781, + "kl": 0.3353493846952915, + "learning_rate": 9.815547901536201e-05, + "loss": 0.0002, + "reward": 0.8718750476837158, + "reward_std": 0.33145629428327084, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 89 + }, + { + "completion_length": 1360.75, + "epoch": 0.18696442482472087, + "grad_norm": 0.06107737869024277, + "kl": 0.45528167858719826, + "learning_rate": 9.811104864495691e-05, + "loss": 0.0002, + "reward": 0.9031250476837158, + "reward_std": 0.24306794814765453, + "rewards/argmax_reward_func": 0.703125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 90 + }, + { + "completion_length": 1353.296875, + "epoch": 0.18904180732277331, + "grad_norm": 0.06465540081262589, + "kl": 0.353522464632988, + "learning_rate": 9.806609980852628e-05, + "loss": 0.0002, + "reward": 0.8046875409781933, + "reward_std": 0.2938912510871887, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.19531250186264515, + "step": 91 + }, + { + "completion_length": 1441.984375, + "epoch": 0.19111918982082576, + "grad_norm": 0.0610247403383255, + "kl": 0.36326174437999725, + "learning_rate": 9.802063299045873e-05, + "loss": 0.0002, + "reward": 0.7468750402331352, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 92 + }, + { + "completion_length": 1441.265625, + "epoch": 0.1931965723188782, + "grad_norm": 0.05115514621138573, + "kl": 0.411540150642395, + "learning_rate": 9.797464868072488e-05, + "loss": 0.0002, + "reward": 0.6812500357627869, + "reward_std": 0.2032931987196207, + "rewards/argmax_reward_func": 0.484375, + "rewards/format_reward_func": 0.19687500223517418, + "step": 93 + }, + { + "completion_length": 1397.03125, + "epoch": 0.19527395481693066, + "grad_norm": 0.053147751837968826, + "kl": 0.4489905573427677, + "learning_rate": 9.792814737487207e-05, + "loss": 0.0002, + "reward": 0.7937500439584255, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 94 + }, + { + "completion_length": 1407.515625, + "epoch": 0.1973513373149831, + "grad_norm": 0.0552426278591156, + "kl": 0.3701773174107075, + "learning_rate": 9.788112957401903e-05, + "loss": 0.0002, + "reward": 0.8250000476837158, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 95 + }, + { + "completion_length": 1505.546875, + "epoch": 0.1994287198130356, + "grad_norm": 0.05075477808713913, + "kl": 0.39650479704141617, + "learning_rate": 9.783359578485047e-05, + "loss": 0.0002, + "reward": 0.8855469226837158, + "reward_std": 0.17953883111476898, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.19804687798023224, + "step": 96 + }, + { + "completion_length": 1542.90625, + "epoch": 0.20150610231108804, + "grad_norm": 0.053789589554071426, + "kl": 0.35163769498467445, + "learning_rate": 9.778554651961159e-05, + "loss": 0.0002, + "reward": 0.8562500476837158, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 97 + }, + { + "completion_length": 1533.46875, + "epoch": 0.20358348480914049, + "grad_norm": 0.05969106778502464, + "kl": 0.40055200457572937, + "learning_rate": 9.773698229610263e-05, + "loss": 0.0002, + "reward": 0.8664062917232513, + "reward_std": 0.29499610885977745, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.1945312526077032, + "step": 98 + }, + { + "completion_length": 1658.3125, + "epoch": 0.20566086730719293, + "grad_norm": 0.05904076248407364, + "kl": 0.3737713471055031, + "learning_rate": 9.768790363767322e-05, + "loss": 0.0002, + "reward": 0.7132812924683094, + "reward_std": 0.2905766908079386, + "rewards/argmax_reward_func": 0.515625, + "rewards/format_reward_func": 0.19765625149011612, + "step": 99 + }, + { + "completion_length": 1522.9375, + "epoch": 0.20773824980524538, + "grad_norm": 0.04626452177762985, + "kl": 0.3718419596552849, + "learning_rate": 9.763831107321678e-05, + "loss": 0.0002, + "reward": 0.6843750439584255, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.484375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 100 + }, + { + "completion_length": 1437.34375, + "epoch": 0.20981563230329783, + "grad_norm": 0.0583551786839962, + "kl": 0.3823527656495571, + "learning_rate": 9.75882051371648e-05, + "loss": 0.0002, + "reward": 0.7929687947034836, + "reward_std": 0.26626989617943764, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.19921875186264515, + "step": 101 + }, + { + "completion_length": 1665.859375, + "epoch": 0.2118930148013503, + "grad_norm": 0.071258544921875, + "kl": 0.3524062894284725, + "learning_rate": 9.753758636948111e-05, + "loss": 0.0002, + "reward": 0.7121094167232513, + "reward_std": 0.3806223217397928, + "rewards/argmax_reward_func": 0.515625, + "rewards/format_reward_func": 0.19648437574505806, + "step": 102 + }, + { + "completion_length": 1570.484375, + "epoch": 0.21397039729940276, + "grad_norm": 0.06221286952495575, + "kl": 0.4012618362903595, + "learning_rate": 9.748645531565604e-05, + "loss": 0.0002, + "reward": 0.8691406697034836, + "reward_std": 0.2911291141062975, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.19726562686264515, + "step": 103 + }, + { + "completion_length": 1470.140625, + "epoch": 0.2160477797974552, + "grad_norm": 0.05706779286265373, + "kl": 0.37375468015670776, + "learning_rate": 9.743481252670049e-05, + "loss": 0.0002, + "reward": 0.7136719189584255, + "reward_std": 0.24583008396439254, + "rewards/argmax_reward_func": 0.515625, + "rewards/format_reward_func": 0.19804687798023224, + "step": 104 + }, + { + "completion_length": 1487.796875, + "epoch": 0.21812516229550766, + "grad_norm": 0.04427757114171982, + "kl": 0.40517764165997505, + "learning_rate": 9.738265855914013e-05, + "loss": 0.0002, + "reward": 0.7468750476837158, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 105 + }, + { + "completion_length": 1542.703125, + "epoch": 0.2202025447935601, + "grad_norm": 0.060884129256010056, + "kl": 0.41977495700120926, + "learning_rate": 9.732999397500926e-05, + "loss": 0.0002, + "reward": 0.6503906659781933, + "reward_std": 0.24693494127131999, + "rewards/argmax_reward_func": 0.453125, + "rewards/format_reward_func": 0.19726562686264515, + "step": 106 + }, + { + "completion_length": 1549.03125, + "epoch": 0.22227992729161256, + "grad_norm": 0.04595618322491646, + "kl": 0.47253532335162163, + "learning_rate": 9.727681934184481e-05, + "loss": 0.0002, + "reward": 0.9000000506639481, + "reward_std": 0.1590990237891674, + "rewards/argmax_reward_func": 0.703125, + "rewards/format_reward_func": 0.19687500223517418, + "step": 107 + }, + { + "completion_length": 1636.546875, + "epoch": 0.22435730978966503, + "grad_norm": 0.03207004442811012, + "kl": 0.37253231182694435, + "learning_rate": 9.722313523268028e-05, + "loss": 0.0002, + "reward": 0.8875000439584255, + "reward_std": 0.0883883461356163, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 108 + }, + { + "completion_length": 1696.84375, + "epoch": 0.22643469228771748, + "grad_norm": 0.08920740336179733, + "kl": 0.7168225161731243, + "learning_rate": 9.716894222603942e-05, + "loss": 0.0004, + "reward": 0.8093750476837158, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 109 + }, + { + "completion_length": 1443.65625, + "epoch": 0.22851207478576993, + "grad_norm": 0.06247260421514511, + "kl": 0.3850158527493477, + "learning_rate": 9.711424090593019e-05, + "loss": 0.0002, + "reward": 0.7617187947034836, + "reward_std": 0.2662698905915022, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.1992187537252903, + "step": 110 + }, + { + "completion_length": 1509.8125, + "epoch": 0.23058945728382238, + "grad_norm": 0.06556280702352524, + "kl": 0.3532305136322975, + "learning_rate": 9.705903186183828e-05, + "loss": 0.0002, + "reward": 0.7281250506639481, + "reward_std": 0.3137786276638508, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.19687500223517418, + "step": 111 + }, + { + "completion_length": 1467.875, + "epoch": 0.23266683978187483, + "grad_norm": 0.06337332725524902, + "kl": 0.3515300862491131, + "learning_rate": 9.700331568872086e-05, + "loss": 0.0002, + "reward": 0.8054687976837158, + "reward_std": 0.2905766889452934, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.19609375298023224, + "step": 112 + }, + { + "completion_length": 1409.40625, + "epoch": 0.23474422227992728, + "grad_norm": 0.06349179893732071, + "kl": 0.35185598209500313, + "learning_rate": 9.694709298700016e-05, + "loss": 0.0002, + "reward": 0.7750000469386578, + "reward_std": 0.24748736945912242, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.19687500223517418, + "step": 113 + }, + { + "completion_length": 1412.53125, + "epoch": 0.23682160477797976, + "grad_norm": 0.064293272793293, + "kl": 0.506983544677496, + "learning_rate": 9.689036436255699e-05, + "loss": 0.0003, + "reward": 0.724609412252903, + "reward_std": 0.22483785497024655, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.19335937686264515, + "step": 114 + }, + { + "completion_length": 1372.859375, + "epoch": 0.2388989872760322, + "grad_norm": 0.06638536602258682, + "kl": 0.35727328434586525, + "learning_rate": 9.683313042672418e-05, + "loss": 0.0002, + "reward": 0.7781250439584255, + "reward_std": 0.287262124940753, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 115 + }, + { + "completion_length": 1373.09375, + "epoch": 0.24097636977408465, + "grad_norm": 0.06149492412805557, + "kl": 0.3754408285021782, + "learning_rate": 9.677539179628005e-05, + "loss": 0.0002, + "reward": 0.8406250476837158, + "reward_std": 0.24306795187294483, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 116 + }, + { + "completion_length": 1313.734375, + "epoch": 0.2430537522721371, + "grad_norm": 0.062166426330804825, + "kl": 0.40280015021562576, + "learning_rate": 9.671714909344174e-05, + "loss": 0.0002, + "reward": 0.8531250506639481, + "reward_std": 0.2695844564586878, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.19687500223517418, + "step": 117 + }, + { + "completion_length": 1132.78125, + "epoch": 0.24513113477018955, + "grad_norm": 0.06206024810671806, + "kl": 0.4296950623393059, + "learning_rate": 9.665840294585845e-05, + "loss": 0.0002, + "reward": 0.7625000439584255, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 118 + }, + { + "completion_length": 1270.625, + "epoch": 0.24720851726824203, + "grad_norm": 0.05055106431245804, + "kl": 0.3436691351234913, + "learning_rate": 9.659915398660477e-05, + "loss": 0.0002, + "reward": 0.7742187902331352, + "reward_std": 0.16020388156175613, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.19609375298023224, + "step": 119 + }, + { + "completion_length": 1221.015625, + "epoch": 0.24928589976629448, + "grad_norm": 0.06833141297101974, + "kl": 0.3341045156121254, + "learning_rate": 9.65394028541738e-05, + "loss": 0.0002, + "reward": 0.7937500439584255, + "reward_std": 0.2651650384068489, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 120 + }, + { + "completion_length": 1273.125, + "epoch": 0.2513632822643469, + "grad_norm": 0.06194274127483368, + "kl": 0.3503304682672024, + "learning_rate": 9.647915019247029e-05, + "loss": 0.0002, + "reward": 0.6687500439584255, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.46875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 121 + }, + { + "completion_length": 1169.4375, + "epoch": 0.2534406647623994, + "grad_norm": 0.05682160705327988, + "kl": 0.4410577192902565, + "learning_rate": 9.641839665080363e-05, + "loss": 0.0002, + "reward": 0.7781250476837158, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 122 + }, + { + "completion_length": 1201.921875, + "epoch": 0.25551804726045185, + "grad_norm": 0.061100929975509644, + "kl": 0.3569498844444752, + "learning_rate": 9.635714288388102e-05, + "loss": 0.0002, + "reward": 0.9031250476837158, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.703125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 123 + }, + { + "completion_length": 1296.234375, + "epoch": 0.2575954297585043, + "grad_norm": 0.0515943244099617, + "kl": 0.34240079671144485, + "learning_rate": 9.629538955180021e-05, + "loss": 0.0002, + "reward": 0.6835937909781933, + "reward_std": 0.15578446350991726, + "rewards/argmax_reward_func": 0.484375, + "rewards/format_reward_func": 0.19921875186264515, + "step": 124 + }, + { + "completion_length": 1237.078125, + "epoch": 0.25967281225655675, + "grad_norm": 0.07088616490364075, + "kl": 0.34578079730272293, + "learning_rate": 9.623313732004258e-05, + "loss": 0.0002, + "reward": 0.6687500402331352, + "reward_std": 0.26516503654420376, + "rewards/argmax_reward_func": 0.46875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 125 + }, + { + "completion_length": 1213.71875, + "epoch": 0.2617501947546092, + "grad_norm": 0.05374123901128769, + "kl": 0.3377624601125717, + "learning_rate": 9.617038685946578e-05, + "loss": 0.0002, + "reward": 0.7468750402331352, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 126 + }, + { + "completion_length": 1325.421875, + "epoch": 0.26382757725266165, + "grad_norm": 0.05408313870429993, + "kl": 0.3581954091787338, + "learning_rate": 9.610713884629666e-05, + "loss": 0.0002, + "reward": 0.8562500476837158, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 127 + }, + { + "completion_length": 1305.421875, + "epoch": 0.2659049597507141, + "grad_norm": 0.05651029199361801, + "kl": 0.3299425356090069, + "learning_rate": 9.60433939621239e-05, + "loss": 0.0002, + "reward": 0.6500000394880772, + "reward_std": 0.15909902285784483, + "rewards/argmax_reward_func": 0.453125, + "rewards/format_reward_func": 0.19687500223517418, + "step": 128 + }, + { + "completion_length": 1394.34375, + "epoch": 0.26798234224876655, + "grad_norm": 0.05847406014800072, + "kl": 0.3248457871377468, + "learning_rate": 9.597915289389066e-05, + "loss": 0.0002, + "reward": 0.8847656697034836, + "reward_std": 0.22483785450458527, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.19726562686264515, + "step": 129 + }, + { + "completion_length": 1361.125, + "epoch": 0.270059724746819, + "grad_norm": 0.03918185085058212, + "kl": 0.29694442078471184, + "learning_rate": 9.591441633388724e-05, + "loss": 0.0001, + "reward": 0.8687500506639481, + "reward_std": 0.11490485025569797, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.19687500223517418, + "step": 130 + }, + { + "completion_length": 1294.34375, + "epoch": 0.27213710724487145, + "grad_norm": 0.06627894192934036, + "kl": 0.317622110247612, + "learning_rate": 9.584918497974354e-05, + "loss": 0.0002, + "reward": 0.8031250387430191, + "reward_std": 0.2519067842513323, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.19375000149011612, + "step": 131 + }, + { + "completion_length": 1264.578125, + "epoch": 0.2742144897429239, + "grad_norm": 0.05716657266020775, + "kl": 0.33274614438414574, + "learning_rate": 9.578345953442162e-05, + "loss": 0.0002, + "reward": 0.7093750424683094, + "reward_std": 0.24306795187294483, + "rewards/argmax_reward_func": 0.515625, + "rewards/format_reward_func": 0.19375000335276127, + "step": 132 + }, + { + "completion_length": 1101.5625, + "epoch": 0.27629187224097634, + "grad_norm": 0.06597350537776947, + "kl": 0.3318898268043995, + "learning_rate": 9.571724070620806e-05, + "loss": 0.0002, + "reward": 0.8562500476837158, + "reward_std": 0.2209708634763956, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 133 + }, + { + "completion_length": 1340.328125, + "epoch": 0.27836925473902885, + "grad_norm": 0.06743122637271881, + "kl": 0.2939135618507862, + "learning_rate": 9.565052920870636e-05, + "loss": 0.0001, + "reward": 0.6312500461935997, + "reward_std": 0.27400387404486537, + "rewards/argmax_reward_func": 0.4375, + "rewards/format_reward_func": 0.19375000149011612, + "step": 134 + }, + { + "completion_length": 1385.546875, + "epoch": 0.2804466372370813, + "grad_norm": 0.05118987336754799, + "kl": 0.27961407601833344, + "learning_rate": 9.558332576082925e-05, + "loss": 0.0001, + "reward": 0.8664062991738319, + "reward_std": 0.20660776272416115, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.19453125074505806, + "step": 135 + }, + { + "completion_length": 1284.90625, + "epoch": 0.28252401973513375, + "grad_norm": 0.060963716357946396, + "kl": 0.310220867395401, + "learning_rate": 9.551563108679091e-05, + "loss": 0.0002, + "reward": 0.8875000476837158, + "reward_std": 0.2209708634763956, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 136 + }, + { + "completion_length": 1260.078125, + "epoch": 0.2846014022331862, + "grad_norm": 0.0460037924349308, + "kl": 0.39314381033182144, + "learning_rate": 9.544744591609922e-05, + "loss": 0.0002, + "reward": 0.7781250402331352, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 137 + }, + { + "completion_length": 1031.03125, + "epoch": 0.28667878473123865, + "grad_norm": 0.06592284142971039, + "kl": 0.4330439232289791, + "learning_rate": 9.537877098354786e-05, + "loss": 0.0002, + "reward": 0.9343750476837158, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.734375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 138 + }, + { + "completion_length": 1101.796875, + "epoch": 0.2887561672292911, + "grad_norm": 0.0644359141588211, + "kl": 0.2887462917715311, + "learning_rate": 9.53096070292084e-05, + "loss": 0.0001, + "reward": 0.8218750432133675, + "reward_std": 0.22539028152823448, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.19687500223517418, + "step": 139 + }, + { + "completion_length": 1065.90625, + "epoch": 0.29083354972734354, + "grad_norm": 0.065298892557621, + "kl": 0.30470659770071507, + "learning_rate": 9.523995479842232e-05, + "loss": 0.0002, + "reward": 0.6218750365078449, + "reward_std": 0.24306795001029968, + "rewards/argmax_reward_func": 0.421875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 140 + }, + { + "completion_length": 978.953125, + "epoch": 0.292910932225396, + "grad_norm": 0.05792571231722832, + "kl": 0.4863986298441887, + "learning_rate": 9.516981504179299e-05, + "loss": 0.0002, + "reward": 0.8718750476837158, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 141 + }, + { + "completion_length": 1087.9375, + "epoch": 0.29498831472344844, + "grad_norm": 0.06688184291124344, + "kl": 0.29886077158153057, + "learning_rate": 9.509918851517758e-05, + "loss": 0.0001, + "reward": 0.8562500476837158, + "reward_std": 0.2651650384068489, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 142 + }, + { + "completion_length": 1019.140625, + "epoch": 0.2970656972215009, + "grad_norm": 0.06881757080554962, + "kl": 0.3445068225264549, + "learning_rate": 9.502807597967893e-05, + "loss": 0.0002, + "reward": 0.8718750476837158, + "reward_std": 0.24306795001029968, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 143 + }, + { + "completion_length": 1186.5625, + "epoch": 0.29914307971955334, + "grad_norm": 0.05837235972285271, + "kl": 0.32853276655077934, + "learning_rate": 9.495647820163725e-05, + "loss": 0.0002, + "reward": 0.8855469226837158, + "reward_std": 0.17953882738947868, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.19804687798023224, + "step": 144 + }, + { + "completion_length": 931.265625, + "epoch": 0.30122046221760584, + "grad_norm": 0.046699460595846176, + "kl": 0.33741075173020363, + "learning_rate": 9.488439595262204e-05, + "loss": 0.0002, + "reward": 0.8718750476837158, + "reward_std": 0.11048543266952038, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 145 + }, + { + "completion_length": 944.25, + "epoch": 0.3032978447156583, + "grad_norm": 0.06217503920197487, + "kl": 0.3317374251782894, + "learning_rate": 9.48118300094236e-05, + "loss": 0.0002, + "reward": 0.7312500476837158, + "reward_std": 0.17677669040858746, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 146 + }, + { + "completion_length": 1001.078125, + "epoch": 0.30537522721371074, + "grad_norm": 0.06545262783765793, + "kl": 0.3109145648777485, + "learning_rate": 9.473878115404477e-05, + "loss": 0.0002, + "reward": 0.9031250476837158, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.703125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 147 + }, + { + "completion_length": 910.796875, + "epoch": 0.3074526097117632, + "grad_norm": 0.05757139250636101, + "kl": 0.3004848547279835, + "learning_rate": 9.466525017369243e-05, + "loss": 0.0002, + "reward": 0.9187500476837158, + "reward_std": 0.17677669040858746, + "rewards/argmax_reward_func": 0.71875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 148 + }, + { + "completion_length": 1078.265625, + "epoch": 0.30952999220981564, + "grad_norm": 0.07616781443357468, + "kl": 0.28462448343634605, + "learning_rate": 9.459123786076912e-05, + "loss": 0.0001, + "reward": 0.8093750476837158, + "reward_std": 0.33145629428327084, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 149 + }, + { + "completion_length": 963.71875, + "epoch": 0.3116073747078681, + "grad_norm": 0.06607849150896072, + "kl": 0.2945715934038162, + "learning_rate": 9.451674501286436e-05, + "loss": 0.0001, + "reward": 0.7468750402331352, + "reward_std": 0.24306795001029968, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 150 + }, + { + "completion_length": 925.484375, + "epoch": 0.31368475720592054, + "grad_norm": 0.08415860682725906, + "kl": 0.326167568564415, + "learning_rate": 9.444177243274618e-05, + "loss": 0.0002, + "reward": 0.7000000439584255, + "reward_std": 0.35355337895452976, + "rewards/argmax_reward_func": 0.5, + "rewards/format_reward_func": 0.20000000298023224, + "step": 151 + }, + { + "completion_length": 814.296875, + "epoch": 0.315762139703973, + "grad_norm": 0.049171049147844315, + "kl": 0.312137458473444, + "learning_rate": 9.436632092835239e-05, + "loss": 0.0002, + "reward": 1.0281250476837158, + "reward_std": 0.11048543266952038, + "rewards/argmax_reward_func": 0.828125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 152 + }, + { + "completion_length": 783.484375, + "epoch": 0.31783952220202544, + "grad_norm": 0.06367822736501694, + "kl": 0.33039499446749687, + "learning_rate": 9.42903913127819e-05, + "loss": 0.0002, + "reward": 0.7281250394880772, + "reward_std": 0.18119611032307148, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.19687500223517418, + "step": 153 + }, + { + "completion_length": 861.609375, + "epoch": 0.3199169047000779, + "grad_norm": 0.06421905755996704, + "kl": 0.3065376691520214, + "learning_rate": 9.421398440428597e-05, + "loss": 0.0002, + "reward": 0.7625000439584255, + "reward_std": 0.2209708634763956, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 154 + }, + { + "completion_length": 902.390625, + "epoch": 0.32199428719813034, + "grad_norm": 0.07363509386777878, + "kl": 0.33688198402523994, + "learning_rate": 9.413710102625938e-05, + "loss": 0.0002, + "reward": 0.7468750439584255, + "reward_std": 0.287262124940753, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 155 + }, + { + "completion_length": 922.0625, + "epoch": 0.3240716696961828, + "grad_norm": 0.06810685992240906, + "kl": 0.34481339529156685, + "learning_rate": 9.405974200723155e-05, + "loss": 0.0002, + "reward": 0.7937500476837158, + "reward_std": 0.26516503654420376, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 156 + }, + { + "completion_length": 884.640625, + "epoch": 0.3261490521942353, + "grad_norm": 0.06919455528259277, + "kl": 0.3362896367907524, + "learning_rate": 9.398190818085763e-05, + "loss": 0.0002, + "reward": 0.8398437947034836, + "reward_std": 0.2441728077828884, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.19921875186264515, + "step": 157 + }, + { + "completion_length": 831.109375, + "epoch": 0.32822643469228774, + "grad_norm": 0.08263985067605972, + "kl": 0.882828488945961, + "learning_rate": 9.390360038590951e-05, + "loss": 0.0004, + "reward": 0.8531250506639481, + "reward_std": 0.22539028339087963, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.19687500223517418, + "step": 158 + }, + { + "completion_length": 879.40625, + "epoch": 0.3303038171903402, + "grad_norm": 0.0637197494506836, + "kl": 0.31912703067064285, + "learning_rate": 9.382481946626674e-05, + "loss": 0.0002, + "reward": 0.7625000439584255, + "reward_std": 0.2209708634763956, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 159 + }, + { + "completion_length": 696.3125, + "epoch": 0.33238119968839264, + "grad_norm": 0.08041277527809143, + "kl": 0.3748646304011345, + "learning_rate": 9.374556627090749e-05, + "loss": 0.0002, + "reward": 0.8562500476837158, + "reward_std": 0.2651650384068489, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 160 + }, + { + "completion_length": 891.265625, + "epoch": 0.3344585821864451, + "grad_norm": 0.06974095106124878, + "kl": 0.3626530338078737, + "learning_rate": 9.366584165389941e-05, + "loss": 0.0002, + "reward": 0.8406250439584255, + "reward_std": 0.24306794814765453, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 161 + }, + { + "completion_length": 813.03125, + "epoch": 0.33653596468449753, + "grad_norm": 0.092588409781456, + "kl": 0.3759094402194023, + "learning_rate": 9.358564647439037e-05, + "loss": 0.0002, + "reward": 0.7593750506639481, + "reward_std": 0.35797279700636864, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.19687500223517418, + "step": 162 + }, + { + "completion_length": 838.296875, + "epoch": 0.33861334718255, + "grad_norm": 0.06730964034795761, + "kl": 0.3410007916390896, + "learning_rate": 9.350498159659924e-05, + "loss": 0.0002, + "reward": 0.8406250439584255, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 163 + }, + { + "completion_length": 775.375, + "epoch": 0.34069072968060243, + "grad_norm": 0.06589485704898834, + "kl": 0.3439077027142048, + "learning_rate": 9.342384788980656e-05, + "loss": 0.0002, + "reward": 0.7312500420957804, + "reward_std": 0.2209708634763956, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 164 + }, + { + "completion_length": 781.0, + "epoch": 0.3427681121786549, + "grad_norm": 0.07955412566661835, + "kl": 0.359022606164217, + "learning_rate": 9.33422462283452e-05, + "loss": 0.0002, + "reward": 0.7468750402331352, + "reward_std": 0.287262124940753, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 165 + }, + { + "completion_length": 895.96875, + "epoch": 0.34484549467670733, + "grad_norm": 0.06293340772390366, + "kl": 0.4317344203591347, + "learning_rate": 9.326017749159087e-05, + "loss": 0.0002, + "reward": 0.7625000439584255, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 166 + }, + { + "completion_length": 884.25, + "epoch": 0.3469228771747598, + "grad_norm": 0.07700355350971222, + "kl": 0.5744567923247814, + "learning_rate": 9.317764256395275e-05, + "loss": 0.0003, + "reward": 0.6031250357627869, + "reward_std": 0.26958445459604263, + "rewards/argmax_reward_func": 0.40625, + "rewards/format_reward_func": 0.19687500409781933, + "step": 167 + }, + { + "completion_length": 889.921875, + "epoch": 0.34900025967281223, + "grad_norm": 0.062210842967033386, + "kl": 0.32679086178541183, + "learning_rate": 9.309464233486387e-05, + "loss": 0.0002, + "reward": 0.7468750402331352, + "reward_std": 0.19887377507984638, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 168 + }, + { + "completion_length": 788.84375, + "epoch": 0.35107764217086473, + "grad_norm": 0.0710466280579567, + "kl": 0.35585347935557365, + "learning_rate": 9.301117769877153e-05, + "loss": 0.0002, + "reward": 0.8187500350177288, + "reward_std": 0.22207572124898434, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.19375000149011612, + "step": 169 + }, + { + "completion_length": 849.484375, + "epoch": 0.3531550246689172, + "grad_norm": 0.0648435726761818, + "kl": 0.32205165177583694, + "learning_rate": 9.292724955512774e-05, + "loss": 0.0002, + "reward": 0.8406250476837158, + "reward_std": 0.24306795187294483, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 170 + }, + { + "completion_length": 856.984375, + "epoch": 0.35523240716696963, + "grad_norm": 0.06077580899000168, + "kl": 0.34612051025032997, + "learning_rate": 9.284285880837946e-05, + "loss": 0.0002, + "reward": 0.7156250365078449, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.515625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 171 + }, + { + "completion_length": 776.5, + "epoch": 0.3573097896650221, + "grad_norm": 0.07481009513139725, + "kl": 0.3612271770834923, + "learning_rate": 9.275800636795884e-05, + "loss": 0.0002, + "reward": 0.7773437909781933, + "reward_std": 0.28836698085069656, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.1992187537252903, + "step": 172 + }, + { + "completion_length": 789.046875, + "epoch": 0.35938717216307453, + "grad_norm": 0.07435107976198196, + "kl": 0.3340052030980587, + "learning_rate": 9.267269314827345e-05, + "loss": 0.0002, + "reward": 0.8398437947034836, + "reward_std": 0.28836698085069656, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.19921875186264515, + "step": 173 + }, + { + "completion_length": 768.390625, + "epoch": 0.361464554661127, + "grad_norm": 0.08201409131288528, + "kl": 0.3197612836956978, + "learning_rate": 9.258692006869643e-05, + "loss": 0.0002, + "reward": 0.621093787252903, + "reward_std": 0.3325611485633999, + "rewards/argmax_reward_func": 0.421875, + "rewards/format_reward_func": 0.1992187537252903, + "step": 174 + }, + { + "completion_length": 740.515625, + "epoch": 0.36354193715917943, + "grad_norm": 0.08215157687664032, + "kl": 0.3205004744231701, + "learning_rate": 9.250068805355658e-05, + "loss": 0.0002, + "reward": 0.7781250476837158, + "reward_std": 0.2872621212154627, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 175 + }, + { + "completion_length": 833.796875, + "epoch": 0.3656193196572319, + "grad_norm": 0.06702969968318939, + "kl": 0.31005076318979263, + "learning_rate": 9.24139980321284e-05, + "loss": 0.0002, + "reward": 0.7468750439584255, + "reward_std": 0.28726212307810783, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 176 + }, + { + "completion_length": 790.546875, + "epoch": 0.3676967021552843, + "grad_norm": 0.08229520171880722, + "kl": 0.32232359051704407, + "learning_rate": 9.232685093862204e-05, + "loss": 0.0002, + "reward": 0.7468750439584255, + "reward_std": 0.33145629428327084, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 177 + }, + { + "completion_length": 814.953125, + "epoch": 0.3697740846533368, + "grad_norm": 0.0782497227191925, + "kl": 0.3153250627219677, + "learning_rate": 9.22392477121733e-05, + "loss": 0.0002, + "reward": 0.7750000506639481, + "reward_std": 0.3358757123351097, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.19687500223517418, + "step": 178 + }, + { + "completion_length": 972.390625, + "epoch": 0.3718514671513892, + "grad_norm": 0.07078168541193008, + "kl": 0.3324251137673855, + "learning_rate": 9.215118929683344e-05, + "loss": 0.0002, + "reward": 0.7750000469386578, + "reward_std": 0.29168154671788216, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.19687500223517418, + "step": 179 + }, + { + "completion_length": 757.90625, + "epoch": 0.37392884964944173, + "grad_norm": 0.09468799084424973, + "kl": 0.33874499425292015, + "learning_rate": 9.206267664155907e-05, + "loss": 0.0002, + "reward": 0.7781250476837158, + "reward_std": 0.4640388172119856, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 180 + }, + { + "completion_length": 844.265625, + "epoch": 0.3760062321474942, + "grad_norm": 0.08337994664907455, + "kl": 0.31716278567910194, + "learning_rate": 9.197371070020184e-05, + "loss": 0.0002, + "reward": 0.7906250506639481, + "reward_std": 0.3579728025943041, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.19687500223517418, + "step": 181 + }, + { + "completion_length": 864.234375, + "epoch": 0.37808361464554663, + "grad_norm": 0.0694384053349495, + "kl": 0.32097451388835907, + "learning_rate": 9.188429243149824e-05, + "loss": 0.0002, + "reward": 0.6472656652331352, + "reward_std": 0.24362037517130375, + "rewards/argmax_reward_func": 0.453125, + "rewards/format_reward_func": 0.19414062798023224, + "step": 182 + }, + { + "completion_length": 749.046875, + "epoch": 0.3801609971435991, + "grad_norm": 0.062498513609170914, + "kl": 0.3336629420518875, + "learning_rate": 9.179442279905928e-05, + "loss": 0.0002, + "reward": 0.9031250476837158, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.703125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 183 + }, + { + "completion_length": 923.640625, + "epoch": 0.3822383796416515, + "grad_norm": 0.07083828747272491, + "kl": 0.32143479958176613, + "learning_rate": 9.170410277135999e-05, + "loss": 0.0002, + "reward": 0.75625004991889, + "reward_std": 0.31819804944097996, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.19375000149011612, + "step": 184 + }, + { + "completion_length": 884.015625, + "epoch": 0.384315762139704, + "grad_norm": 0.0693785548210144, + "kl": 0.4987417571246624, + "learning_rate": 9.161333332172912e-05, + "loss": 0.0002, + "reward": 0.7062500454485416, + "reward_std": 0.24748736806213856, + "rewards/argmax_reward_func": 0.515625, + "rewards/format_reward_func": 0.1906250026077032, + "step": 185 + }, + { + "completion_length": 795.234375, + "epoch": 0.3863931446377564, + "grad_norm": 0.07442086935043335, + "kl": 0.33293722197413445, + "learning_rate": 9.152211542833857e-05, + "loss": 0.0002, + "reward": 0.7625000439584255, + "reward_std": 0.3535533845424652, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 186 + }, + { + "completion_length": 929.25, + "epoch": 0.3884705271358089, + "grad_norm": 0.05656367912888527, + "kl": 0.31210994347929955, + "learning_rate": 9.143045007419284e-05, + "loss": 0.0002, + "reward": 0.7625000439584255, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 187 + }, + { + "completion_length": 902.828125, + "epoch": 0.3905479096338613, + "grad_norm": 0.06982313841581345, + "kl": 0.3056885749101639, + "learning_rate": 9.133833824711853e-05, + "loss": 0.0002, + "reward": 0.7156250439584255, + "reward_std": 0.28726212307810783, + "rewards/argmax_reward_func": 0.515625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 188 + }, + { + "completion_length": 995.71875, + "epoch": 0.3926252921319138, + "grad_norm": 0.07361900061368942, + "kl": 0.3039589188992977, + "learning_rate": 9.124578093975358e-05, + "loss": 0.0002, + "reward": 0.7300781719386578, + "reward_std": 0.35521066188812256, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.19882812909781933, + "step": 189 + }, + { + "completion_length": 964.359375, + "epoch": 0.3947026746299662, + "grad_norm": 0.07159875333309174, + "kl": 0.3399963229894638, + "learning_rate": 9.115277914953662e-05, + "loss": 0.0002, + "reward": 0.7781250476837158, + "reward_std": 0.33145629428327084, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 190 + }, + { + "completion_length": 1034.03125, + "epoch": 0.39678005712801867, + "grad_norm": 0.07782501727342606, + "kl": 0.3184865601360798, + "learning_rate": 9.105933387869628e-05, + "loss": 0.0002, + "reward": 0.6910156607627869, + "reward_std": 0.4104533866047859, + "rewards/argmax_reward_func": 0.5, + "rewards/format_reward_func": 0.19101562723517418, + "step": 191 + }, + { + "completion_length": 1139.8125, + "epoch": 0.3988574396260712, + "grad_norm": 0.05474551394581795, + "kl": 0.29825419560074806, + "learning_rate": 9.096544613424025e-05, + "loss": 0.0001, + "reward": 0.8804688006639481, + "reward_std": 0.27510872669517994, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.19296875223517418, + "step": 192 + }, + { + "completion_length": 1003.0, + "epoch": 0.4009348221241236, + "grad_norm": 0.0725637748837471, + "kl": 0.3301442116498947, + "learning_rate": 9.087111692794459e-05, + "loss": 0.0002, + "reward": 0.7304687947034836, + "reward_std": 0.31046406365931034, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.1992187537252903, + "step": 193 + }, + { + "completion_length": 1139.734375, + "epoch": 0.4030122046221761, + "grad_norm": 0.057006120681762695, + "kl": 0.31723184883594513, + "learning_rate": 9.077634727634272e-05, + "loss": 0.0002, + "reward": 0.8449219167232513, + "reward_std": 0.23146697832271457, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.1886718776077032, + "step": 194 + }, + { + "completion_length": 1048.71875, + "epoch": 0.4050895871202285, + "grad_norm": 0.07205278426408768, + "kl": 0.33233997970819473, + "learning_rate": 9.068113820071447e-05, + "loss": 0.0002, + "reward": 0.7875000387430191, + "reward_std": 0.3181980513036251, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.19375000335276127, + "step": 195 + }, + { + "completion_length": 955.6875, + "epoch": 0.40716696961828097, + "grad_norm": 0.057774197310209274, + "kl": 0.3174768090248108, + "learning_rate": 9.058549072707513e-05, + "loss": 0.0002, + "reward": 0.8347656726837158, + "reward_std": 0.1994262058287859, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.19414062798023224, + "step": 196 + }, + { + "completion_length": 1300.125, + "epoch": 0.4092443521163334, + "grad_norm": 0.05007508769631386, + "kl": 0.30298993550240993, + "learning_rate": 9.048940588616435e-05, + "loss": 0.0002, + "reward": 0.7843750491738319, + "reward_std": 0.22539028525352478, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.1906250026077032, + "step": 197 + }, + { + "completion_length": 1237.953125, + "epoch": 0.41132173461438587, + "grad_norm": 0.060646846890449524, + "kl": 0.3001830168068409, + "learning_rate": 9.039288471343504e-05, + "loss": 0.0002, + "reward": 0.8812500461935997, + "reward_std": 0.27400387451052666, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.19375000335276127, + "step": 198 + }, + { + "completion_length": 1186.453125, + "epoch": 0.4133991171124383, + "grad_norm": 0.05342816561460495, + "kl": 0.30144498124718666, + "learning_rate": 9.029592824904225e-05, + "loss": 0.0002, + "reward": 0.8074219226837158, + "reward_std": 0.24583008885383606, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.19804687798023224, + "step": 199 + }, + { + "completion_length": 1274.375, + "epoch": 0.41547649961049077, + "grad_norm": 0.05866052210330963, + "kl": 0.3122952822595835, + "learning_rate": 9.019853753783185e-05, + "loss": 0.0002, + "reward": 0.652343787252903, + "reward_std": 0.2264951393008232, + "rewards/argmax_reward_func": 0.46875, + "rewards/format_reward_func": 0.1835937537252903, + "step": 200 + }, + { + "completion_length": 1241.5, + "epoch": 0.4175538821085432, + "grad_norm": 0.05399727076292038, + "kl": 0.33987458795309067, + "learning_rate": 9.010071362932944e-05, + "loss": 0.0002, + "reward": 0.8687500432133675, + "reward_std": 0.2032931987196207, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.19687500223517418, + "step": 201 + }, + { + "completion_length": 1336.140625, + "epoch": 0.41963126460659567, + "grad_norm": 0.06403433531522751, + "kl": 0.28705168329179287, + "learning_rate": 9.000245757772885e-05, + "loss": 0.0001, + "reward": 0.8281250521540642, + "reward_std": 0.29610096476972103, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.18750000558793545, + "step": 202 + }, + { + "completion_length": 1035.203125, + "epoch": 0.42170864710464817, + "grad_norm": 0.0628470629453659, + "kl": 0.30243775993585587, + "learning_rate": 8.990377044188098e-05, + "loss": 0.0002, + "reward": 0.85000004991889, + "reward_std": 0.22980970283970237, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.19375000149011612, + "step": 203 + }, + { + "completion_length": 1175.90625, + "epoch": 0.4237860296027006, + "grad_norm": 0.05064735934138298, + "kl": 0.3158372975885868, + "learning_rate": 8.980465328528219e-05, + "loss": 0.0002, + "reward": 0.7906250394880772, + "reward_std": 0.18119611404836178, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.19687500223517418, + "step": 204 + }, + { + "completion_length": 1109.28125, + "epoch": 0.42586341210075307, + "grad_norm": 0.060826126486063004, + "kl": 0.2915416620671749, + "learning_rate": 8.9705107176063e-05, + "loss": 0.0001, + "reward": 0.9437500461935997, + "reward_std": 0.22980970703065395, + "rewards/argmax_reward_func": 0.75, + "rewards/format_reward_func": 0.19375000149011612, + "step": 205 + }, + { + "completion_length": 979.453125, + "epoch": 0.4279407945988055, + "grad_norm": 0.062372464686632156, + "kl": 0.3590022251009941, + "learning_rate": 8.960513318697647e-05, + "loss": 0.0002, + "reward": 0.8406250476837158, + "reward_std": 0.24306795001029968, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 206 + }, + { + "completion_length": 1014.40625, + "epoch": 0.43001817709685797, + "grad_norm": 0.0675223246216774, + "kl": 0.3203696608543396, + "learning_rate": 8.950473239538673e-05, + "loss": 0.0002, + "reward": 0.8835937976837158, + "reward_std": 0.27068931609392166, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.1960937511175871, + "step": 207 + }, + { + "completion_length": 1077.65625, + "epoch": 0.4320955595949104, + "grad_norm": 0.07710019499063492, + "kl": 0.28732946887612343, + "learning_rate": 8.940390588325727e-05, + "loss": 0.0001, + "reward": 0.8781250491738319, + "reward_std": 0.4110058154910803, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.19062500447034836, + "step": 208 + }, + { + "completion_length": 985.4375, + "epoch": 0.43417294209296287, + "grad_norm": 0.04350803792476654, + "kl": 0.3246513232588768, + "learning_rate": 8.930265473713938e-05, + "loss": 0.0002, + "reward": 0.8531250469386578, + "reward_std": 0.13700193725526333, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.19687500409781933, + "step": 209 + }, + { + "completion_length": 984.140625, + "epoch": 0.4362503245910153, + "grad_norm": 0.06984654814004898, + "kl": 0.32982902973890305, + "learning_rate": 8.920098004816036e-05, + "loss": 0.0002, + "reward": 0.8710937947034836, + "reward_std": 0.24417280592024326, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.19921875186264515, + "step": 210 + }, + { + "completion_length": 870.03125, + "epoch": 0.43832770708906776, + "grad_norm": 0.06809406727552414, + "kl": 0.29028210788965225, + "learning_rate": 8.909888291201182e-05, + "loss": 0.0001, + "reward": 0.8718750439584255, + "reward_std": 0.24306795001029968, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 211 + }, + { + "completion_length": 906.40625, + "epoch": 0.4404050895871202, + "grad_norm": 0.08602919429540634, + "kl": 0.2871505431830883, + "learning_rate": 8.899636442893783e-05, + "loss": 0.0001, + "reward": 0.8062500469386578, + "reward_std": 0.3800698835402727, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.19687500223517418, + "step": 212 + }, + { + "completion_length": 869.421875, + "epoch": 0.44248247208517266, + "grad_norm": 0.05844723433256149, + "kl": 0.26848769187927246, + "learning_rate": 8.88934257037231e-05, + "loss": 0.0001, + "reward": 0.8710937947034836, + "reward_std": 0.19997863844037056, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.1992187537252903, + "step": 213 + }, + { + "completion_length": 864.75, + "epoch": 0.4445598545832251, + "grad_norm": 0.07575644552707672, + "kl": 0.4048551693558693, + "learning_rate": 8.879006784568104e-05, + "loss": 0.0002, + "reward": 0.7613281682133675, + "reward_std": 0.26682231947779655, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.19882812723517418, + "step": 214 + }, + { + "completion_length": 877.96875, + "epoch": 0.4466372370812776, + "grad_norm": 0.07090688496828079, + "kl": 0.2992668803781271, + "learning_rate": 8.868629196864182e-05, + "loss": 0.0001, + "reward": 0.8035156689584255, + "reward_std": 0.24362037889659405, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.19414062798023224, + "step": 215 + }, + { + "completion_length": 779.8125, + "epoch": 0.44871461957933007, + "grad_norm": 0.069987952709198, + "kl": 0.2883603498339653, + "learning_rate": 8.858209919094039e-05, + "loss": 0.0001, + "reward": 0.7156250402331352, + "reward_std": 0.24306795187294483, + "rewards/argmax_reward_func": 0.515625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 216 + }, + { + "completion_length": 748.59375, + "epoch": 0.4507920020773825, + "grad_norm": 0.07478881627321243, + "kl": 0.2929369006305933, + "learning_rate": 8.847749063540439e-05, + "loss": 0.0001, + "reward": 0.8066406697034836, + "reward_std": 0.24693494103848934, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.19726562686264515, + "step": 217 + }, + { + "completion_length": 749.8125, + "epoch": 0.45286938457543496, + "grad_norm": 0.08688110113143921, + "kl": 0.3713537007570267, + "learning_rate": 8.837246742934207e-05, + "loss": 0.0002, + "reward": 0.7765625454485416, + "reward_std": 0.33366601169109344, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.1984375026077032, + "step": 218 + }, + { + "completion_length": 697.546875, + "epoch": 0.4549467670734874, + "grad_norm": 0.08973264694213867, + "kl": 0.36194442212581635, + "learning_rate": 8.826703070453015e-05, + "loss": 0.0002, + "reward": 0.8511719219386578, + "reward_std": 0.316540764644742, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.19492187909781933, + "step": 219 + }, + { + "completion_length": 736.953125, + "epoch": 0.45702414957153986, + "grad_norm": 0.06507878005504608, + "kl": 0.32681479677557945, + "learning_rate": 8.816118159720156e-05, + "loss": 0.0002, + "reward": 0.8093750439584255, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 220 + }, + { + "completion_length": 686.046875, + "epoch": 0.4591015320695923, + "grad_norm": 0.08443711698055267, + "kl": 0.27842542715370655, + "learning_rate": 8.805492124803331e-05, + "loss": 0.0001, + "reward": 0.7750000506639481, + "reward_std": 0.2474873699247837, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.19687500223517418, + "step": 221 + }, + { + "completion_length": 636.28125, + "epoch": 0.46117891456764476, + "grad_norm": 0.08033400774002075, + "kl": 0.2751711644232273, + "learning_rate": 8.794825080213414e-05, + "loss": 0.0001, + "reward": 0.7781250476837158, + "reward_std": 0.24306795187294483, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 222 + }, + { + "completion_length": 652.40625, + "epoch": 0.4632562970656972, + "grad_norm": 0.07973612844944, + "kl": 0.28703486546874046, + "learning_rate": 8.78411714090321e-05, + "loss": 0.0001, + "reward": 0.7937500439584255, + "reward_std": 0.2651650384068489, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 223 + }, + { + "completion_length": 656.5, + "epoch": 0.46533367956374966, + "grad_norm": 0.0935521349310875, + "kl": 0.28887104988098145, + "learning_rate": 8.77336842226623e-05, + "loss": 0.0001, + "reward": 0.7937500439584255, + "reward_std": 0.3535533845424652, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 224 + }, + { + "completion_length": 601.90625, + "epoch": 0.4674110620618021, + "grad_norm": 0.08775703608989716, + "kl": 0.2865128982812166, + "learning_rate": 8.76257904013544e-05, + "loss": 0.0001, + "reward": 0.7468750439584255, + "reward_std": 0.24306795001029968, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 225 + }, + { + "completion_length": 649.09375, + "epoch": 0.46948844455985456, + "grad_norm": 0.07471180707216263, + "kl": 0.3112582378089428, + "learning_rate": 8.751749110782012e-05, + "loss": 0.0002, + "reward": 0.8875000476837158, + "reward_std": 0.17677669040858746, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 226 + }, + { + "completion_length": 651.375, + "epoch": 0.47156582705790706, + "grad_norm": 0.08434654772281647, + "kl": 0.33592014387249947, + "learning_rate": 8.740878750914076e-05, + "loss": 0.0002, + "reward": 0.8390625491738319, + "reward_std": 0.24527766555547714, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.1984375026077032, + "step": 227 + }, + { + "completion_length": 592.15625, + "epoch": 0.4736432095559595, + "grad_norm": 0.10138159990310669, + "kl": 0.3475854229182005, + "learning_rate": 8.729968077675454e-05, + "loss": 0.0002, + "reward": 0.8093750439584255, + "reward_std": 0.33145629428327084, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 228 + }, + { + "completion_length": 618.953125, + "epoch": 0.47572059205401196, + "grad_norm": 0.08923006802797318, + "kl": 0.32317574694752693, + "learning_rate": 8.71901720864441e-05, + "loss": 0.0002, + "reward": 0.8085937947034836, + "reward_std": 0.28836698085069656, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.1992187537252903, + "step": 229 + }, + { + "completion_length": 606.46875, + "epoch": 0.4777979745520644, + "grad_norm": 0.07547228038311005, + "kl": 0.4202072508633137, + "learning_rate": 8.70802626183237e-05, + "loss": 0.0002, + "reward": 0.7757812924683094, + "reward_std": 0.20218834839761257, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.19765625149011612, + "step": 230 + }, + { + "completion_length": 567.375, + "epoch": 0.47987535705011686, + "grad_norm": 0.07534275949001312, + "kl": 0.5509752966463566, + "learning_rate": 8.696995355682656e-05, + "loss": 0.0003, + "reward": 0.8250000439584255, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 231 + }, + { + "completion_length": 615.265625, + "epoch": 0.4819527395481693, + "grad_norm": 0.08038201183080673, + "kl": 0.3771616071462631, + "learning_rate": 8.685924609069214e-05, + "loss": 0.0002, + "reward": 0.8695312887430191, + "reward_std": 0.20218834280967712, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.19765625149011612, + "step": 232 + }, + { + "completion_length": 602.453125, + "epoch": 0.48403012204622176, + "grad_norm": 0.07698789983987808, + "kl": 0.6121297106146812, + "learning_rate": 8.674814141295324e-05, + "loss": 0.0003, + "reward": 0.8718750439584255, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 233 + }, + { + "completion_length": 592.75, + "epoch": 0.4861075045442742, + "grad_norm": 0.09831973165273666, + "kl": 0.31690799072384834, + "learning_rate": 8.663664072092323e-05, + "loss": 0.0002, + "reward": 0.8246094211935997, + "reward_std": 0.3099116366356611, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.19960937835276127, + "step": 234 + }, + { + "completion_length": 558.90625, + "epoch": 0.48818488704232665, + "grad_norm": 0.09684620797634125, + "kl": 0.3237866424024105, + "learning_rate": 8.652474521618306e-05, + "loss": 0.0002, + "reward": 0.7937500439584255, + "reward_std": 0.3093592096120119, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 235 + }, + { + "completion_length": 626.078125, + "epoch": 0.4902622695403791, + "grad_norm": 0.06933271139860153, + "kl": 0.3663709722459316, + "learning_rate": 8.641245610456838e-05, + "loss": 0.0002, + "reward": 0.9812500476837158, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.78125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 236 + }, + { + "completion_length": 579.40625, + "epoch": 0.49233965203843155, + "grad_norm": 0.08088324964046478, + "kl": 0.3435916490852833, + "learning_rate": 8.629977459615655e-05, + "loss": 0.0002, + "reward": 0.8718750476837158, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 237 + }, + { + "completion_length": 607.125, + "epoch": 0.49441703453648406, + "grad_norm": 0.07071245461702347, + "kl": 0.2806865181773901, + "learning_rate": 8.618670190525352e-05, + "loss": 0.0001, + "reward": 0.8250000439584255, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 238 + }, + { + "completion_length": 558.390625, + "epoch": 0.4964944170345365, + "grad_norm": 0.08282584697008133, + "kl": 0.40584639832377434, + "learning_rate": 8.607323925038082e-05, + "loss": 0.0002, + "reward": 0.7156250439584255, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.515625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 239 + }, + { + "completion_length": 667.71875, + "epoch": 0.49857179953258896, + "grad_norm": 0.08190900087356567, + "kl": 0.35680179484188557, + "learning_rate": 8.595938785426241e-05, + "loss": 0.0002, + "reward": 0.9343750476837158, + "reward_std": 0.24306794814765453, + "rewards/argmax_reward_func": 0.734375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 240 + }, + { + "completion_length": 670.8125, + "epoch": 0.5006491820306413, + "grad_norm": 0.08591850101947784, + "kl": 0.3619570918381214, + "learning_rate": 8.584514894381151e-05, + "loss": 0.0002, + "reward": 0.8250000476837158, + "reward_std": 0.26516503654420376, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 241 + }, + { + "completion_length": 676.25, + "epoch": 0.5027265645286938, + "grad_norm": 0.08506251126527786, + "kl": 0.3224334083497524, + "learning_rate": 8.573052375011733e-05, + "loss": 0.0002, + "reward": 0.8867187947034836, + "reward_std": 0.2662698905915022, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.1992187537252903, + "step": 242 + }, + { + "completion_length": 673.1875, + "epoch": 0.5048039470267462, + "grad_norm": 0.06150234118103981, + "kl": 0.3478453829884529, + "learning_rate": 8.561551350843186e-05, + "loss": 0.0002, + "reward": 0.9656250476837158, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.765625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 243 + }, + { + "completion_length": 692.546875, + "epoch": 0.5068813295247988, + "grad_norm": 0.06618204712867737, + "kl": 0.29330621659755707, + "learning_rate": 8.550011945815655e-05, + "loss": 0.0001, + "reward": 0.8562500476837158, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 244 + }, + { + "completion_length": 687.03125, + "epoch": 0.5089587120228513, + "grad_norm": 0.07623764872550964, + "kl": 0.3498356007039547, + "learning_rate": 8.538434284282892e-05, + "loss": 0.0002, + "reward": 0.7937500476837158, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 245 + }, + { + "completion_length": 660.703125, + "epoch": 0.5110360945209037, + "grad_norm": 0.04135030135512352, + "kl": 0.32844917103648186, + "learning_rate": 8.526818491010922e-05, + "loss": 0.0002, + "reward": 0.9187500476837158, + "reward_std": 0.0883883461356163, + "rewards/argmax_reward_func": 0.71875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 246 + }, + { + "completion_length": 716.609375, + "epoch": 0.5131134770189562, + "grad_norm": 0.0865129679441452, + "kl": 0.3059841375797987, + "learning_rate": 8.515164691176687e-05, + "loss": 0.0002, + "reward": 0.7312500439584255, + "reward_std": 0.3093592096120119, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 247 + }, + { + "completion_length": 757.4375, + "epoch": 0.5151908595170086, + "grad_norm": 0.07157998532056808, + "kl": 0.2929275669157505, + "learning_rate": 8.503473010366713e-05, + "loss": 0.0001, + "reward": 0.8867187909781933, + "reward_std": 0.22207572311162949, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.19921875186264515, + "step": 248 + }, + { + "completion_length": 662.859375, + "epoch": 0.517268242015061, + "grad_norm": 0.06820650398731232, + "kl": 0.31902188807725906, + "learning_rate": 8.491743574575743e-05, + "loss": 0.0002, + "reward": 0.7781250476837158, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 249 + }, + { + "completion_length": 721.625, + "epoch": 0.5193456245131135, + "grad_norm": 0.0756940096616745, + "kl": 0.31403973512351513, + "learning_rate": 8.479976510205387e-05, + "loss": 0.0002, + "reward": 0.9187500476837158, + "reward_std": 0.22097086161375046, + "rewards/argmax_reward_func": 0.71875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 250 + }, + { + "completion_length": 748.046875, + "epoch": 0.521423007011166, + "grad_norm": 0.07090619206428528, + "kl": 0.2569838650524616, + "learning_rate": 8.468171944062755e-05, + "loss": 0.0001, + "reward": 0.7929687947034836, + "reward_std": 0.22207572311162949, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.19921875186264515, + "step": 251 + }, + { + "completion_length": 693.609375, + "epoch": 0.5235003895092184, + "grad_norm": 0.06538081914186478, + "kl": 0.2928556613624096, + "learning_rate": 8.456330003359093e-05, + "loss": 0.0001, + "reward": 0.8250000476837158, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 252 + }, + { + "completion_length": 728.953125, + "epoch": 0.5255777720072708, + "grad_norm": 0.09474781900644302, + "kl": 0.27900537475943565, + "learning_rate": 8.444450815708415e-05, + "loss": 0.0001, + "reward": 0.8250000476837158, + "reward_std": 0.3977475557476282, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 253 + }, + { + "completion_length": 737.625, + "epoch": 0.5276551545053233, + "grad_norm": 0.06914320588111877, + "kl": 0.26191011257469654, + "learning_rate": 8.432534509126122e-05, + "loss": 0.0001, + "reward": 0.7468750439584255, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 254 + }, + { + "completion_length": 743.609375, + "epoch": 0.5297325370033757, + "grad_norm": 0.05855982005596161, + "kl": 0.26190576888620853, + "learning_rate": 8.420581212027624e-05, + "loss": 0.0001, + "reward": 0.8875000476837158, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 255 + }, + { + "completion_length": 768.265625, + "epoch": 0.5318099195014282, + "grad_norm": 0.058118585497140884, + "kl": 0.2930552177131176, + "learning_rate": 8.408591053226964e-05, + "loss": 0.0001, + "reward": 0.9492187947034836, + "reward_std": 0.13368737325072289, + "rewards/argmax_reward_func": 0.75, + "rewards/format_reward_func": 0.1992187537252903, + "step": 256 + }, + { + "completion_length": 763.0, + "epoch": 0.5338873019994806, + "grad_norm": 0.07115372270345688, + "kl": 0.35762836039066315, + "learning_rate": 8.396564161935411e-05, + "loss": 0.0002, + "reward": 0.8710937947034836, + "reward_std": 0.1999786328524351, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.1992187537252903, + "step": 257 + }, + { + "completion_length": 865.8125, + "epoch": 0.5359646844975331, + "grad_norm": 0.06384899467229843, + "kl": 0.2847513500601053, + "learning_rate": 8.38450066776009e-05, + "loss": 0.0001, + "reward": 0.8375000506639481, + "reward_std": 0.2032931987196207, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.19687500223517418, + "step": 258 + }, + { + "completion_length": 691.734375, + "epoch": 0.5380420669955855, + "grad_norm": 0.08122014999389648, + "kl": 0.2869179602712393, + "learning_rate": 8.37240070070257e-05, + "loss": 0.0001, + "reward": 0.7937500439584255, + "reward_std": 0.30935921147465706, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 259 + }, + { + "completion_length": 759.203125, + "epoch": 0.540119449493638, + "grad_norm": 0.06432370841503143, + "kl": 0.3190025221556425, + "learning_rate": 8.360264391157471e-05, + "loss": 0.0002, + "reward": 0.9500000476837158, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.75, + "rewards/format_reward_func": 0.20000000298023224, + "step": 260 + }, + { + "completion_length": 872.890625, + "epoch": 0.5421968319916904, + "grad_norm": 0.08087541162967682, + "kl": 0.2903926521539688, + "learning_rate": 8.348091869911054e-05, + "loss": 0.0001, + "reward": 0.7554687969386578, + "reward_std": 0.27510873042047024, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.19296875409781933, + "step": 261 + }, + { + "completion_length": 868.875, + "epoch": 0.5442742144897429, + "grad_norm": 0.06983164697885513, + "kl": 0.25976957008242607, + "learning_rate": 8.335883268139813e-05, + "loss": 0.0001, + "reward": 0.8062500506639481, + "reward_std": 0.247487373650074, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.19687500223517418, + "step": 262 + }, + { + "completion_length": 781.53125, + "epoch": 0.5463515969877953, + "grad_norm": 0.07666690647602081, + "kl": 0.286643173545599, + "learning_rate": 8.323638717409061e-05, + "loss": 0.0001, + "reward": 0.8406250476837158, + "reward_std": 0.24306795001029968, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 263 + }, + { + "completion_length": 768.796875, + "epoch": 0.5484289794858478, + "grad_norm": 0.06480922549962997, + "kl": 0.30170151591300964, + "learning_rate": 8.311358349671517e-05, + "loss": 0.0002, + "reward": 0.7625000476837158, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 264 + }, + { + "completion_length": 874.328125, + "epoch": 0.5505063619839002, + "grad_norm": 0.06416033208370209, + "kl": 0.28673115372657776, + "learning_rate": 8.299042297265876e-05, + "loss": 0.0001, + "reward": 0.8843750506639481, + "reward_std": 0.22539028525352478, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.19687500223517418, + "step": 265 + }, + { + "completion_length": 790.890625, + "epoch": 0.5525837444819527, + "grad_norm": 0.06319725513458252, + "kl": 0.3224434554576874, + "learning_rate": 8.286690692915386e-05, + "loss": 0.0002, + "reward": 0.8562500476837158, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 266 + }, + { + "completion_length": 837.4375, + "epoch": 0.5546611269800052, + "grad_norm": 0.07317644357681274, + "kl": 0.3563056066632271, + "learning_rate": 8.274303669726426e-05, + "loss": 0.0002, + "reward": 0.7906250432133675, + "reward_std": 0.22539028525352478, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.19687500223517418, + "step": 267 + }, + { + "completion_length": 718.125, + "epoch": 0.5567385094780577, + "grad_norm": 0.06230226531624794, + "kl": 0.30831460282206535, + "learning_rate": 8.261881361187054e-05, + "loss": 0.0002, + "reward": 0.9500000476837158, + "reward_std": 0.17677669040858746, + "rewards/argmax_reward_func": 0.75, + "rewards/format_reward_func": 0.20000000298023224, + "step": 268 + }, + { + "completion_length": 879.25, + "epoch": 0.5588158919761101, + "grad_norm": 0.07465776056051254, + "kl": 0.47362302988767624, + "learning_rate": 8.249423901165584e-05, + "loss": 0.0002, + "reward": 0.8535156697034836, + "reward_std": 0.22483785264194012, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.19726562686264515, + "step": 269 + }, + { + "completion_length": 669.40625, + "epoch": 0.5608932744741626, + "grad_norm": 0.07374807447195053, + "kl": 0.329727228730917, + "learning_rate": 8.236931423909138e-05, + "loss": 0.0002, + "reward": 0.7773437947034836, + "reward_std": 0.19997863098978996, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.1992187537252903, + "step": 270 + }, + { + "completion_length": 732.3125, + "epoch": 0.562970656972215, + "grad_norm": 0.0676698312163353, + "kl": 0.3591331150382757, + "learning_rate": 8.2244040640422e-05, + "loss": 0.0002, + "reward": 0.9187500476837158, + "reward_std": 0.17677669040858746, + "rewards/argmax_reward_func": 0.71875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 271 + }, + { + "completion_length": 840.921875, + "epoch": 0.5650480394702675, + "grad_norm": 0.0682259052991867, + "kl": 0.35003719478845596, + "learning_rate": 8.21184195656516e-05, + "loss": 0.0002, + "reward": 0.8562500476837158, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 272 + }, + { + "completion_length": 722.375, + "epoch": 0.5671254219683199, + "grad_norm": 0.05751950666308403, + "kl": 0.37356993556022644, + "learning_rate": 8.199245236852871e-05, + "loss": 0.0002, + "reward": 0.6843750476837158, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.484375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 273 + }, + { + "completion_length": 679.171875, + "epoch": 0.5692028044663724, + "grad_norm": 0.09351193159818649, + "kl": 0.3799058124423027, + "learning_rate": 8.186614040653176e-05, + "loss": 0.0002, + "reward": 0.9343750476837158, + "reward_std": 0.33145629428327084, + "rewards/argmax_reward_func": 0.734375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 274 + }, + { + "completion_length": 767.84375, + "epoch": 0.5712801869644248, + "grad_norm": 0.06785906106233597, + "kl": 0.3164171427488327, + "learning_rate": 8.173948504085454e-05, + "loss": 0.0002, + "reward": 0.8242187947034836, + "reward_std": 0.17788154631853104, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.19921875186264515, + "step": 275 + }, + { + "completion_length": 700.25, + "epoch": 0.5733575694624773, + "grad_norm": 0.06914710998535156, + "kl": 0.3422697074711323, + "learning_rate": 8.161248763639153e-05, + "loss": 0.0002, + "reward": 0.8406250476837158, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 276 + }, + { + "completion_length": 662.9375, + "epoch": 0.5754349519605297, + "grad_norm": 0.05800582095980644, + "kl": 0.37523847445845604, + "learning_rate": 8.148514956172315e-05, + "loss": 0.0002, + "reward": 0.8562500476837158, + "reward_std": 0.1325825173407793, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 277 + }, + { + "completion_length": 834.5625, + "epoch": 0.5775123344585822, + "grad_norm": 0.06169675290584564, + "kl": 0.31875982135534286, + "learning_rate": 8.135747218910104e-05, + "loss": 0.0002, + "reward": 0.8367187976837158, + "reward_std": 0.20439805276691914, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.1960937511175871, + "step": 278 + }, + { + "completion_length": 766.546875, + "epoch": 0.5795897169566346, + "grad_norm": 0.08040869235992432, + "kl": 1.0613461509346962, + "learning_rate": 8.122945689443328e-05, + "loss": 0.0005, + "reward": 0.8703125417232513, + "reward_std": 0.1568893175572157, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.19843750074505806, + "step": 279 + }, + { + "completion_length": 737.5625, + "epoch": 0.5816670994546871, + "grad_norm": 0.0702415257692337, + "kl": 0.34963829442858696, + "learning_rate": 8.11011050572695e-05, + "loss": 0.0002, + "reward": 0.8222656659781933, + "reward_std": 0.22483785450458527, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.19726562686264515, + "step": 280 + }, + { + "completion_length": 753.703125, + "epoch": 0.5837444819527395, + "grad_norm": 0.07661338895559311, + "kl": 0.38233664259314537, + "learning_rate": 8.097241806078615e-05, + "loss": 0.0002, + "reward": 0.7929687909781933, + "reward_std": 0.26626989245414734, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.19921875186264515, + "step": 281 + }, + { + "completion_length": 880.609375, + "epoch": 0.585821864450792, + "grad_norm": 0.07432933151721954, + "kl": 0.42199838161468506, + "learning_rate": 8.084339729177142e-05, + "loss": 0.0002, + "reward": 0.8500000461935997, + "reward_std": 0.27400387451052666, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.19375000335276127, + "step": 282 + }, + { + "completion_length": 778.453125, + "epoch": 0.5878992469488444, + "grad_norm": 0.07835783809423447, + "kl": 0.36370869539678097, + "learning_rate": 8.071404414061041e-05, + "loss": 0.0002, + "reward": 0.8207031637430191, + "reward_std": 0.2712417396251112, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.19570313021540642, + "step": 283 + }, + { + "completion_length": 806.515625, + "epoch": 0.5899766294468969, + "grad_norm": 0.048540204763412476, + "kl": 0.3912508450448513, + "learning_rate": 8.058436000127014e-05, + "loss": 0.0002, + "reward": 0.8679687865078449, + "reward_std": 0.1602038759738207, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.1960937511175871, + "step": 284 + }, + { + "completion_length": 859.765625, + "epoch": 0.5920540119449493, + "grad_norm": 0.06207654997706413, + "kl": 0.31765272468328476, + "learning_rate": 8.045434627128446e-05, + "loss": 0.0002, + "reward": 0.9312500506639481, + "reward_std": 0.2032931987196207, + "rewards/argmax_reward_func": 0.734375, + "rewards/format_reward_func": 0.19687500223517418, + "step": 285 + }, + { + "completion_length": 710.453125, + "epoch": 0.5941313944430018, + "grad_norm": 0.08810100704431534, + "kl": 0.40968091040849686, + "learning_rate": 8.032400435173907e-05, + "loss": 0.0002, + "reward": 0.8542969226837158, + "reward_std": 0.31212134286761284, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.19804687798023224, + "step": 286 + }, + { + "completion_length": 700.296875, + "epoch": 0.5962087769410542, + "grad_norm": 0.07407598942518234, + "kl": 0.3017115257680416, + "learning_rate": 8.019333564725639e-05, + "loss": 0.0002, + "reward": 0.9476562887430191, + "reward_std": 0.18009125301614404, + "rewards/argmax_reward_func": 0.75, + "rewards/format_reward_func": 0.19765625521540642, + "step": 287 + }, + { + "completion_length": 628.984375, + "epoch": 0.5982861594391067, + "grad_norm": 0.05131203308701515, + "kl": 0.3888060562312603, + "learning_rate": 8.006234156598042e-05, + "loss": 0.0002, + "reward": 0.7625000439584255, + "reward_std": 0.0883883461356163, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 288 + }, + { + "completion_length": 648.28125, + "epoch": 0.6003635419371591, + "grad_norm": 0.07319964468479156, + "kl": 0.3936074487864971, + "learning_rate": 7.99310235195615e-05, + "loss": 0.0002, + "reward": 0.9031250476837158, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.703125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 289 + }, + { + "completion_length": 788.953125, + "epoch": 0.6024409244352117, + "grad_norm": 0.07722538709640503, + "kl": 0.35653146356344223, + "learning_rate": 7.979938292314129e-05, + "loss": 0.0002, + "reward": 0.8386719189584255, + "reward_std": 0.24583008512854576, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.19804687798023224, + "step": 290 + }, + { + "completion_length": 679.46875, + "epoch": 0.6045183069332641, + "grad_norm": 0.03349410742521286, + "kl": 0.35145866870880127, + "learning_rate": 7.966742119533723e-05, + "loss": 0.0002, + "reward": 0.9187500476837158, + "reward_std": 0.04419417306780815, + "rewards/argmax_reward_func": 0.71875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 291 + }, + { + "completion_length": 761.25, + "epoch": 0.6065956894313166, + "grad_norm": 0.06922980397939682, + "kl": 0.33772632107138634, + "learning_rate": 7.953513975822755e-05, + "loss": 0.0002, + "reward": 0.8242187947034836, + "reward_std": 0.2220757193863392, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.19921875186264515, + "step": 292 + }, + { + "completion_length": 618.25, + "epoch": 0.608673071929369, + "grad_norm": 0.07786116003990173, + "kl": 0.5136113204061985, + "learning_rate": 7.940254003733578e-05, + "loss": 0.0003, + "reward": 0.7781250476837158, + "reward_std": 0.24306795187294483, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 293 + }, + { + "completion_length": 704.921875, + "epoch": 0.6107504544274215, + "grad_norm": 0.0848776176571846, + "kl": 0.4174853079020977, + "learning_rate": 7.926962346161535e-05, + "loss": 0.0002, + "reward": 0.699218787252903, + "reward_std": 0.22207571775652468, + "rewards/argmax_reward_func": 0.5, + "rewards/format_reward_func": 0.1992187537252903, + "step": 294 + }, + { + "completion_length": 657.734375, + "epoch": 0.6128278369254739, + "grad_norm": 0.0675949826836586, + "kl": 0.4570797383785248, + "learning_rate": 7.913639146343435e-05, + "loss": 0.0002, + "reward": 0.7937500439584255, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 295 + }, + { + "completion_length": 689.328125, + "epoch": 0.6149052194235264, + "grad_norm": 0.07435144484043121, + "kl": 0.3593181371688843, + "learning_rate": 7.900284547855991e-05, + "loss": 0.0002, + "reward": 0.8691406697034836, + "reward_std": 0.2469349391758442, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.19726562686264515, + "step": 296 + }, + { + "completion_length": 783.453125, + "epoch": 0.6169826019215788, + "grad_norm": 0.07517191022634506, + "kl": 0.7363171242177486, + "learning_rate": 7.886898694614291e-05, + "loss": 0.0004, + "reward": 0.8375000469386578, + "reward_std": 0.20329319685697556, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.19687500223517418, + "step": 297 + }, + { + "completion_length": 665.171875, + "epoch": 0.6190599844196313, + "grad_norm": 0.07602944225072861, + "kl": 0.4283002242445946, + "learning_rate": 7.873481730870232e-05, + "loss": 0.0002, + "reward": 0.7781250439584255, + "reward_std": 0.24306795001029968, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 298 + }, + { + "completion_length": 741.75, + "epoch": 0.6211373669176837, + "grad_norm": 0.07438351958990097, + "kl": 0.2955322675406933, + "learning_rate": 7.860033801210976e-05, + "loss": 0.0001, + "reward": 0.8250000439584255, + "reward_std": 0.2209708634763956, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 299 + }, + { + "completion_length": 719.9375, + "epoch": 0.6232147494157362, + "grad_norm": 0.08875050395727158, + "kl": 0.34281647577881813, + "learning_rate": 7.84655505055738e-05, + "loss": 0.0002, + "reward": 0.7125000432133675, + "reward_std": 0.38006988912820816, + "rewards/argmax_reward_func": 0.515625, + "rewards/format_reward_func": 0.19687500409781933, + "step": 300 + }, + { + "completion_length": 755.453125, + "epoch": 0.6252921319137886, + "grad_norm": 0.07758081704378128, + "kl": 0.29608317092061043, + "learning_rate": 7.833045624162452e-05, + "loss": 0.0001, + "reward": 0.7781250476837158, + "reward_std": 0.24306795001029968, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 301 + }, + { + "completion_length": 721.40625, + "epoch": 0.6273695144118411, + "grad_norm": 0.07114533334970474, + "kl": 0.5064779743552208, + "learning_rate": 7.819505667609767e-05, + "loss": 0.0003, + "reward": 0.7468750439584255, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 302 + }, + { + "completion_length": 804.59375, + "epoch": 0.6294468969098935, + "grad_norm": 0.06897041946649551, + "kl": 0.3391858469694853, + "learning_rate": 7.805935326811912e-05, + "loss": 0.0002, + "reward": 0.9500000476837158, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.75, + "rewards/format_reward_func": 0.20000000298023224, + "step": 303 + }, + { + "completion_length": 723.96875, + "epoch": 0.631524279407946, + "grad_norm": 0.07760775089263916, + "kl": 0.3714125622063875, + "learning_rate": 7.792334748008905e-05, + "loss": 0.0002, + "reward": 0.8875000476837158, + "reward_std": 0.26516503654420376, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 304 + }, + { + "completion_length": 694.328125, + "epoch": 0.6336016619059984, + "grad_norm": 0.08604968339204788, + "kl": 0.3233291208744049, + "learning_rate": 7.77870407776662e-05, + "loss": 0.0002, + "reward": 0.7000000476837158, + "reward_std": 0.3093592058867216, + "rewards/argmax_reward_func": 0.5, + "rewards/format_reward_func": 0.20000000298023224, + "step": 305 + }, + { + "completion_length": 875.234375, + "epoch": 0.6356790444040509, + "grad_norm": 0.07271739840507507, + "kl": 0.2942599691450596, + "learning_rate": 7.765043462975217e-05, + "loss": 0.0001, + "reward": 0.7464844100177288, + "reward_std": 0.1817485373467207, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.18398437649011612, + "step": 306 + }, + { + "completion_length": 674.65625, + "epoch": 0.6377564269021033, + "grad_norm": 0.06499814242124557, + "kl": 0.6551753357052803, + "learning_rate": 7.751353050847545e-05, + "loss": 0.0003, + "reward": 0.6683594062924385, + "reward_std": 0.13313494622707367, + "rewards/argmax_reward_func": 0.46875, + "rewards/format_reward_func": 0.19960937649011612, + "step": 307 + }, + { + "completion_length": 794.671875, + "epoch": 0.6398338094001558, + "grad_norm": 0.07812398672103882, + "kl": 0.29106237180531025, + "learning_rate": 7.737632988917564e-05, + "loss": 0.0001, + "reward": 0.8218750506639481, + "reward_std": 0.3137786276638508, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.19687500223517418, + "step": 308 + }, + { + "completion_length": 726.0, + "epoch": 0.6419111918982082, + "grad_norm": 0.08285919576883316, + "kl": 0.3495354764163494, + "learning_rate": 7.723883425038758e-05, + "loss": 0.0002, + "reward": 0.7625000439584255, + "reward_std": 0.26516503654420376, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 309 + }, + { + "completion_length": 862.09375, + "epoch": 0.6439885743962607, + "grad_norm": 0.06892167776823044, + "kl": 0.318182036280632, + "learning_rate": 7.710104507382531e-05, + "loss": 0.0002, + "reward": 0.7753906697034836, + "reward_std": 0.24693494103848934, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.19726562686264515, + "step": 310 + }, + { + "completion_length": 705.40625, + "epoch": 0.6460659568943131, + "grad_norm": 0.053015708923339844, + "kl": 0.30275189504027367, + "learning_rate": 7.696296384436619e-05, + "loss": 0.0002, + "reward": 0.7781250402331352, + "reward_std": 0.11048543266952038, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 311 + }, + { + "completion_length": 689.0, + "epoch": 0.6481433393923656, + "grad_norm": 0.08785798400640488, + "kl": 0.3134246002882719, + "learning_rate": 7.682459205003483e-05, + "loss": 0.0002, + "reward": 0.8093750439584255, + "reward_std": 0.2872621212154627, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 312 + }, + { + "completion_length": 875.421875, + "epoch": 0.650220721890418, + "grad_norm": 0.07502438127994537, + "kl": 0.33200008049607277, + "learning_rate": 7.668593118198719e-05, + "loss": 0.0002, + "reward": 0.8218750506639481, + "reward_std": 0.26958445087075233, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.19687500409781933, + "step": 313 + }, + { + "completion_length": 708.828125, + "epoch": 0.6522981043884706, + "grad_norm": 0.08758591115474701, + "kl": 0.3114005923271179, + "learning_rate": 7.654698273449435e-05, + "loss": 0.0002, + "reward": 0.9179687947034836, + "reward_std": 0.31046406738460064, + "rewards/argmax_reward_func": 0.71875, + "rewards/format_reward_func": 0.1992187537252903, + "step": 314 + }, + { + "completion_length": 843.875, + "epoch": 0.654375486886523, + "grad_norm": 0.06280484795570374, + "kl": 0.2624143324792385, + "learning_rate": 7.640774820492647e-05, + "loss": 0.0001, + "reward": 0.7937500439584255, + "reward_std": 0.2209708634763956, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 315 + }, + { + "completion_length": 676.296875, + "epoch": 0.6564528693845755, + "grad_norm": 0.06573140621185303, + "kl": 0.29568540304899216, + "learning_rate": 7.626822909373667e-05, + "loss": 0.0001, + "reward": 0.7781250402331352, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 316 + }, + { + "completion_length": 708.625, + "epoch": 0.6585302518826279, + "grad_norm": 0.07694177329540253, + "kl": 0.2915249727666378, + "learning_rate": 7.612842690444486e-05, + "loss": 0.0001, + "reward": 0.9648437947034836, + "reward_std": 0.2441728077828884, + "rewards/argmax_reward_func": 0.765625, + "rewards/format_reward_func": 0.1992187537252903, + "step": 317 + }, + { + "completion_length": 656.375, + "epoch": 0.6606076343806804, + "grad_norm": 0.09348881989717484, + "kl": 0.3169392794370651, + "learning_rate": 7.598834314362151e-05, + "loss": 0.0002, + "reward": 0.8093750476837158, + "reward_std": 0.3314562924206257, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 318 + }, + { + "completion_length": 640.75, + "epoch": 0.6626850168787328, + "grad_norm": 0.07280497252941132, + "kl": 0.2968177553266287, + "learning_rate": 7.584797932087145e-05, + "loss": 0.0001, + "reward": 0.8710937947034836, + "reward_std": 0.19997863844037056, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.19921875186264515, + "step": 319 + }, + { + "completion_length": 694.359375, + "epoch": 0.6647623993767853, + "grad_norm": 0.09892084449529648, + "kl": 0.5367627218365669, + "learning_rate": 7.570733694881755e-05, + "loss": 0.0003, + "reward": 0.9031250439584255, + "reward_std": 0.28726212307810783, + "rewards/argmax_reward_func": 0.703125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 320 + }, + { + "completion_length": 653.953125, + "epoch": 0.6668397818748377, + "grad_norm": 0.07763518393039703, + "kl": 0.31165359169244766, + "learning_rate": 7.556641754308447e-05, + "loss": 0.0002, + "reward": 0.8406250439584255, + "reward_std": 0.24306794814765453, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 321 + }, + { + "completion_length": 738.703125, + "epoch": 0.6689171643728902, + "grad_norm": 0.0881708562374115, + "kl": 0.3136756382882595, + "learning_rate": 7.542522262228231e-05, + "loss": 0.0002, + "reward": 0.8085937947034836, + "reward_std": 0.33256115205585957, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.1992187537252903, + "step": 322 + }, + { + "completion_length": 757.421875, + "epoch": 0.6709945468709426, + "grad_norm": 0.0727957934141159, + "kl": 0.28189600445330143, + "learning_rate": 7.528375370799024e-05, + "loss": 0.0001, + "reward": 0.8093750476837158, + "reward_std": 0.24306795001029968, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 323 + }, + { + "completion_length": 660.125, + "epoch": 0.6730719293689951, + "grad_norm": 0.068515844643116, + "kl": 0.29710386879742146, + "learning_rate": 7.514201232474011e-05, + "loss": 0.0001, + "reward": 0.8562500439584255, + "reward_std": 0.2209708634763956, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 324 + }, + { + "completion_length": 704.8125, + "epoch": 0.6751493118670475, + "grad_norm": 0.07097381353378296, + "kl": 0.31055452302098274, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0002, + "reward": 0.8406250439584255, + "reward_std": 0.24306795187294483, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 325 + }, + { + "completion_length": 680.921875, + "epoch": 0.6772266943651, + "grad_norm": 0.06986773759126663, + "kl": 0.3125472627580166, + "learning_rate": 7.48577182641578e-05, + "loss": 0.0002, + "reward": 0.8250000476837158, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 326 + }, + { + "completion_length": 744.046875, + "epoch": 0.6793040768631524, + "grad_norm": 0.06576069444417953, + "kl": 0.3361051678657532, + "learning_rate": 7.471516865050467e-05, + "loss": 0.0002, + "reward": 0.9343750476837158, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.734375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 327 + }, + { + "completion_length": 654.0, + "epoch": 0.6813814593612049, + "grad_norm": 0.07205154001712799, + "kl": 0.30227479338645935, + "learning_rate": 7.457235269521856e-05, + "loss": 0.0002, + "reward": 0.7617187909781933, + "reward_std": 0.17788154468871653, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.1992187537252903, + "step": 328 + }, + { + "completion_length": 647.859375, + "epoch": 0.6834588418592573, + "grad_norm": 0.0794130265712738, + "kl": 0.4595659039914608, + "learning_rate": 7.44292719373476e-05, + "loss": 0.0002, + "reward": 0.9343750476837158, + "reward_std": 0.24306794628500938, + "rewards/argmax_reward_func": 0.734375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 329 + }, + { + "completion_length": 735.078125, + "epoch": 0.6855362243573098, + "grad_norm": 0.0897228941321373, + "kl": 0.3426021710038185, + "learning_rate": 7.428592791879361e-05, + "loss": 0.0002, + "reward": 0.7781250439584255, + "reward_std": 0.33145629800856113, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 330 + }, + { + "completion_length": 853.703125, + "epoch": 0.6876136068553622, + "grad_norm": 0.07609646022319794, + "kl": 0.26740806736052036, + "learning_rate": 7.414232218429537e-05, + "loss": 0.0001, + "reward": 0.9156250506639481, + "reward_std": 0.26958445832133293, + "rewards/argmax_reward_func": 0.71875, + "rewards/format_reward_func": 0.19687500223517418, + "step": 331 + }, + { + "completion_length": 616.890625, + "epoch": 0.6896909893534147, + "grad_norm": 0.09115231037139893, + "kl": 0.334526427090168, + "learning_rate": 7.399845628141206e-05, + "loss": 0.0002, + "reward": 0.8718750439584255, + "reward_std": 0.2872621212154627, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 332 + }, + { + "completion_length": 622.765625, + "epoch": 0.6917683718514671, + "grad_norm": 0.08646494895219803, + "kl": 0.3055717647075653, + "learning_rate": 7.385433176050653e-05, + "loss": 0.0002, + "reward": 0.8710937909781933, + "reward_std": 0.2883669827133417, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.19921875186264515, + "step": 333 + }, + { + "completion_length": 686.921875, + "epoch": 0.6938457543495196, + "grad_norm": 0.0787225142121315, + "kl": 0.3159499131143093, + "learning_rate": 7.370995017472863e-05, + "loss": 0.0002, + "reward": 0.8531250506639481, + "reward_std": 0.26958445459604263, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.19687500223517418, + "step": 334 + }, + { + "completion_length": 634.703125, + "epoch": 0.695923136847572, + "grad_norm": 0.09521856158971786, + "kl": 0.3115619271993637, + "learning_rate": 7.356531307999843e-05, + "loss": 0.0002, + "reward": 0.7468750476837158, + "reward_std": 0.375650467351079, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 335 + }, + { + "completion_length": 753.703125, + "epoch": 0.6980005193456245, + "grad_norm": 0.09729248285293579, + "kl": 0.3037104904651642, + "learning_rate": 7.342042203498951e-05, + "loss": 0.0002, + "reward": 0.8250000476837158, + "reward_std": 0.3977475520223379, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 336 + }, + { + "completion_length": 674.0, + "epoch": 0.700077901843677, + "grad_norm": 0.09549879282712936, + "kl": 0.3098057843744755, + "learning_rate": 7.32752786011121e-05, + "loss": 0.0002, + "reward": 0.7468750439584255, + "reward_std": 0.37565046921372414, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 337 + }, + { + "completion_length": 673.1875, + "epoch": 0.7021552843417295, + "grad_norm": 0.08708694577217102, + "kl": 0.29914069548249245, + "learning_rate": 7.312988434249632e-05, + "loss": 0.0001, + "reward": 0.9031250476837158, + "reward_std": 0.33145629800856113, + "rewards/argmax_reward_func": 0.703125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 338 + }, + { + "completion_length": 699.65625, + "epoch": 0.7042326668397819, + "grad_norm": 0.09121581166982651, + "kl": 0.31991639360785484, + "learning_rate": 7.298424082597526e-05, + "loss": 0.0002, + "reward": 0.7625000439584255, + "reward_std": 0.3093592096120119, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 339 + }, + { + "completion_length": 667.546875, + "epoch": 0.7063100493378344, + "grad_norm": 0.08295177668333054, + "kl": 0.3119734339416027, + "learning_rate": 7.283834962106811e-05, + "loss": 0.0002, + "reward": 0.6656250394880772, + "reward_std": 0.31377863325178623, + "rewards/argmax_reward_func": 0.46875, + "rewards/format_reward_func": 0.19687500409781933, + "step": 340 + }, + { + "completion_length": 734.140625, + "epoch": 0.7083874318358868, + "grad_norm": 0.07721901684999466, + "kl": 0.2920740433037281, + "learning_rate": 7.269221229996331e-05, + "loss": 0.0001, + "reward": 0.8875000476837158, + "reward_std": 0.30935920774936676, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 341 + }, + { + "completion_length": 723.78125, + "epoch": 0.7104648143339393, + "grad_norm": 0.07446262985467911, + "kl": 0.31096627190709114, + "learning_rate": 7.254583043750151e-05, + "loss": 0.0002, + "reward": 0.8093750476837158, + "reward_std": 0.24306795001029968, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 342 + }, + { + "completion_length": 598.140625, + "epoch": 0.7125421968319917, + "grad_norm": 0.07494507730007172, + "kl": 0.3281702548265457, + "learning_rate": 7.239920561115867e-05, + "loss": 0.0002, + "reward": 0.7000000476837158, + "reward_std": 0.2209708634763956, + "rewards/argmax_reward_func": 0.5, + "rewards/format_reward_func": 0.20000000298023224, + "step": 343 + }, + { + "completion_length": 669.484375, + "epoch": 0.7146195793300442, + "grad_norm": 0.06954500079154968, + "kl": 0.29709911718964577, + "learning_rate": 7.225233940102906e-05, + "loss": 0.0001, + "reward": 0.9343750476837158, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.734375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 344 + }, + { + "completion_length": 738.015625, + "epoch": 0.7166969618280966, + "grad_norm": 0.08409620076417923, + "kl": 0.3333327900618315, + "learning_rate": 7.210523338980813e-05, + "loss": 0.0002, + "reward": 0.8398437947034836, + "reward_std": 0.2883669827133417, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.1992187537252903, + "step": 345 + }, + { + "completion_length": 596.5, + "epoch": 0.7187743443261491, + "grad_norm": 0.07967247068881989, + "kl": 0.3089658170938492, + "learning_rate": 7.195788916277565e-05, + "loss": 0.0002, + "reward": 0.7929687947034836, + "reward_std": 0.22207571775652468, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.19921875186264515, + "step": 346 + }, + { + "completion_length": 717.390625, + "epoch": 0.7208517268242015, + "grad_norm": 0.0864432230591774, + "kl": 0.3028757870197296, + "learning_rate": 7.181030830777837e-05, + "loss": 0.0002, + "reward": 0.8843750506639481, + "reward_std": 0.2695844564586878, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.19687500223517418, + "step": 347 + }, + { + "completion_length": 716.515625, + "epoch": 0.722929109322254, + "grad_norm": 0.07595375925302505, + "kl": 0.30636318400502205, + "learning_rate": 7.166249241521318e-05, + "loss": 0.0002, + "reward": 0.7898437976837158, + "reward_std": 0.22649514116346836, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.19609375298023224, + "step": 348 + }, + { + "completion_length": 584.3125, + "epoch": 0.7250064918203064, + "grad_norm": 0.10229937732219696, + "kl": 0.32858528569340706, + "learning_rate": 7.151444307800975e-05, + "loss": 0.0002, + "reward": 0.7468750439584255, + "reward_std": 0.3756504710763693, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 349 + }, + { + "completion_length": 590.1875, + "epoch": 0.7270838743183589, + "grad_norm": 0.07948501408100128, + "kl": 0.3115417957305908, + "learning_rate": 7.13661618916135e-05, + "loss": 0.0002, + "reward": 0.7937500439584255, + "reward_std": 0.2651650346815586, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 350 + }, + { + "completion_length": 618.46875, + "epoch": 0.7291612568164113, + "grad_norm": 0.06686828285455704, + "kl": 0.32769910246133804, + "learning_rate": 7.121765045396834e-05, + "loss": 0.0002, + "reward": 0.8867187947034836, + "reward_std": 0.17788154655136168, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.1992187537252903, + "step": 351 + }, + { + "completion_length": 600.5625, + "epoch": 0.7312386393144638, + "grad_norm": 0.07947742938995361, + "kl": 0.3473210446536541, + "learning_rate": 7.106891036549945e-05, + "loss": 0.0002, + "reward": 0.7937500439584255, + "reward_std": 0.2651650384068489, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 352 + }, + { + "completion_length": 551.78125, + "epoch": 0.7333160218125162, + "grad_norm": 0.04208023473620415, + "kl": 0.35688477009534836, + "learning_rate": 7.091994322909611e-05, + "loss": 0.0002, + "reward": 0.9968750476837158, + "reward_std": 0.06629125960171223, + "rewards/argmax_reward_func": 0.796875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 353 + }, + { + "completion_length": 576.609375, + "epoch": 0.7353934043105687, + "grad_norm": 0.06657633185386658, + "kl": 0.32345687225461006, + "learning_rate": 7.077075065009433e-05, + "loss": 0.0002, + "reward": 0.7156250476837158, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.515625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 354 + }, + { + "completion_length": 552.921875, + "epoch": 0.7374707868086211, + "grad_norm": 0.0544576533138752, + "kl": 0.3251136727631092, + "learning_rate": 7.062133423625959e-05, + "loss": 0.0002, + "reward": 0.8406250439584255, + "reward_std": 0.11048543266952038, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 355 + }, + { + "completion_length": 606.703125, + "epoch": 0.7395481693066736, + "grad_norm": 0.07147221267223358, + "kl": 0.3404123783111572, + "learning_rate": 7.04716955977695e-05, + "loss": 0.0002, + "reward": 0.8406250476837158, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 356 + }, + { + "completion_length": 605.28125, + "epoch": 0.741625551804726, + "grad_norm": 0.05363324284553528, + "kl": 0.3204925172030926, + "learning_rate": 7.03218363471965e-05, + "loss": 0.0002, + "reward": 0.9500000439584255, + "reward_std": 0.13258251920342445, + "rewards/argmax_reward_func": 0.75, + "rewards/format_reward_func": 0.20000000298023224, + "step": 357 + }, + { + "completion_length": 563.65625, + "epoch": 0.7437029343027785, + "grad_norm": 0.041013430804014206, + "kl": 0.3419278897345066, + "learning_rate": 7.017175809949044e-05, + "loss": 0.0002, + "reward": 0.8562500439584255, + "reward_std": 0.0883883461356163, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 358 + }, + { + "completion_length": 557.15625, + "epoch": 0.7457803168008309, + "grad_norm": 0.06874032318592072, + "kl": 0.3553139455616474, + "learning_rate": 7.002146247196113e-05, + "loss": 0.0002, + "reward": 0.776562537997961, + "reward_std": 0.1568893138319254, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.1984375026077032, + "step": 359 + }, + { + "completion_length": 567.15625, + "epoch": 0.7478576992988835, + "grad_norm": 0.0703793615102768, + "kl": 0.33949872851371765, + "learning_rate": 6.987095108426101e-05, + "loss": 0.0002, + "reward": 0.7312500402331352, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 360 + }, + { + "completion_length": 552.984375, + "epoch": 0.7499350817969359, + "grad_norm": 0.06514879316091537, + "kl": 0.3468449302017689, + "learning_rate": 6.972022555836764e-05, + "loss": 0.0002, + "reward": 0.8250000476837158, + "reward_std": 0.17677669040858746, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 361 + }, + { + "completion_length": 540.453125, + "epoch": 0.7520124642949884, + "grad_norm": 0.08254203200340271, + "kl": 0.36483363062143326, + "learning_rate": 6.956928751856623e-05, + "loss": 0.0002, + "reward": 0.8093750439584255, + "reward_std": 0.24306795001029968, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 362 + }, + { + "completion_length": 553.078125, + "epoch": 0.7540898467930408, + "grad_norm": 0.08209247887134552, + "kl": 0.37783167138695717, + "learning_rate": 6.94181385914321e-05, + "loss": 0.0002, + "reward": 0.667187537997961, + "reward_std": 0.17898640409111977, + "rewards/argmax_reward_func": 0.46875, + "rewards/format_reward_func": 0.1984375026077032, + "step": 363 + }, + { + "completion_length": 610.234375, + "epoch": 0.7561672292910933, + "grad_norm": 0.062447499483823776, + "kl": 0.34449223801493645, + "learning_rate": 6.926678040581323e-05, + "loss": 0.0002, + "reward": 0.7468750439584255, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 364 + }, + { + "completion_length": 527.78125, + "epoch": 0.7582446117891457, + "grad_norm": 0.07009898126125336, + "kl": 0.36786164715886116, + "learning_rate": 6.911521459281265e-05, + "loss": 0.0002, + "reward": 0.8250000476837158, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 365 + }, + { + "completion_length": 587.296875, + "epoch": 0.7603219942871982, + "grad_norm": 0.06519950181245804, + "kl": 0.35170425847172737, + "learning_rate": 6.896344278577083e-05, + "loss": 0.0002, + "reward": 0.9031250476837158, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.703125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 366 + }, + { + "completion_length": 583.96875, + "epoch": 0.7623993767852506, + "grad_norm": 0.051437534391880035, + "kl": 0.34650370851159096, + "learning_rate": 6.881146662024822e-05, + "loss": 0.0002, + "reward": 1.0593750476837158, + "reward_std": 0.11048543080687523, + "rewards/argmax_reward_func": 0.859375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 367 + }, + { + "completion_length": 538.484375, + "epoch": 0.764476759283303, + "grad_norm": 0.06923159956932068, + "kl": 0.38169170916080475, + "learning_rate": 6.865928773400743e-05, + "loss": 0.0002, + "reward": 0.8250000439584255, + "reward_std": 0.17677669040858746, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 368 + }, + { + "completion_length": 580.03125, + "epoch": 0.7665541417813555, + "grad_norm": 0.0665920302271843, + "kl": 0.3636031821370125, + "learning_rate": 6.850690776699573e-05, + "loss": 0.0002, + "reward": 0.7289062924683094, + "reward_std": 0.13589708344079554, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.19765625149011612, + "step": 369 + }, + { + "completion_length": 562.015625, + "epoch": 0.768631524279408, + "grad_norm": 0.06946459412574768, + "kl": 0.4947234131395817, + "learning_rate": 6.835432836132731e-05, + "loss": 0.0002, + "reward": 0.8093750476837158, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 370 + }, + { + "completion_length": 575.0625, + "epoch": 0.7707089067774604, + "grad_norm": 0.0689174011349678, + "kl": 0.3747940734028816, + "learning_rate": 6.820155116126561e-05, + "loss": 0.0002, + "reward": 0.8406250476837158, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 371 + }, + { + "completion_length": 571.109375, + "epoch": 0.7727862892755128, + "grad_norm": 0.08710569888353348, + "kl": 0.39623570069670677, + "learning_rate": 6.804857781320558e-05, + "loss": 0.0002, + "reward": 0.7464844174683094, + "reward_std": 0.28670969791710377, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.19960937835276127, + "step": 372 + }, + { + "completion_length": 607.59375, + "epoch": 0.7748636717735653, + "grad_norm": 0.0731528028845787, + "kl": 0.3582250289618969, + "learning_rate": 6.789540996565593e-05, + "loss": 0.0002, + "reward": 0.8718750476837158, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 373 + }, + { + "completion_length": 579.28125, + "epoch": 0.7769410542716177, + "grad_norm": 0.0625411793589592, + "kl": 0.3635551296174526, + "learning_rate": 6.774204926922145e-05, + "loss": 0.0002, + "reward": 0.8875000439584255, + "reward_std": 0.17677669040858746, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 374 + }, + { + "completion_length": 599.3125, + "epoch": 0.7790184367696702, + "grad_norm": 0.08092815428972244, + "kl": 0.4265919253230095, + "learning_rate": 6.758849737658509e-05, + "loss": 0.0002, + "reward": 0.8718750476837158, + "reward_std": 0.24306795001029968, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 375 + }, + { + "completion_length": 547.796875, + "epoch": 0.7810958192677226, + "grad_norm": 0.07175435870885849, + "kl": 0.3616880625486374, + "learning_rate": 6.743475594249021e-05, + "loss": 0.0002, + "reward": 0.8843750432133675, + "reward_std": 0.18119611218571663, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.19687500223517418, + "step": 376 + }, + { + "completion_length": 591.609375, + "epoch": 0.7831732017657751, + "grad_norm": 0.07784335315227509, + "kl": 0.42067378014326096, + "learning_rate": 6.728082662372282e-05, + "loss": 0.0002, + "reward": 0.8406250476837158, + "reward_std": 0.24306795001029968, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 377 + }, + { + "completion_length": 531.859375, + "epoch": 0.7852505842638275, + "grad_norm": 0.07652134448289871, + "kl": 0.3934118077158928, + "learning_rate": 6.712671107909359e-05, + "loss": 0.0002, + "reward": 0.8875000439584255, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 378 + }, + { + "completion_length": 542.640625, + "epoch": 0.78732796676188, + "grad_norm": 0.054136764258146286, + "kl": 0.4239979311823845, + "learning_rate": 6.697241096942006e-05, + "loss": 0.0002, + "reward": 0.8562500439584255, + "reward_std": 0.0883883461356163, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 379 + }, + { + "completion_length": 548.53125, + "epoch": 0.7894053492599324, + "grad_norm": 0.07836976647377014, + "kl": 0.4376937076449394, + "learning_rate": 6.681792795750875e-05, + "loss": 0.0002, + "reward": 0.7308594211935997, + "reward_std": 0.17732911929488182, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.19960937649011612, + "step": 380 + }, + { + "completion_length": 545.140625, + "epoch": 0.7914827317579849, + "grad_norm": 0.06161171570420265, + "kl": 0.526831716299057, + "learning_rate": 6.666326370813723e-05, + "loss": 0.0003, + "reward": 0.8562500476837158, + "reward_std": 0.13258251920342445, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 381 + }, + { + "completion_length": 539.4375, + "epoch": 0.7935601142560373, + "grad_norm": 0.050724372267723083, + "kl": 0.4309442602097988, + "learning_rate": 6.650841988803606e-05, + "loss": 0.0002, + "reward": 0.8875000476837158, + "reward_std": 0.0883883461356163, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 382 + }, + { + "completion_length": 515.109375, + "epoch": 0.7956374967540899, + "grad_norm": 0.08242635428905487, + "kl": 0.4333142638206482, + "learning_rate": 6.635339816587109e-05, + "loss": 0.0002, + "reward": 0.8718750476837158, + "reward_std": 0.24306795187294483, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 383 + }, + { + "completion_length": 582.0, + "epoch": 0.7977148792521424, + "grad_norm": 0.07576624304056168, + "kl": 0.40080199763178825, + "learning_rate": 6.619820021222518e-05, + "loss": 0.0002, + "reward": 0.8406250439584255, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 384 + }, + { + "completion_length": 563.59375, + "epoch": 0.7997922617501948, + "grad_norm": 0.08377435803413391, + "kl": 0.43326959386467934, + "learning_rate": 6.604282769958044e-05, + "loss": 0.0002, + "reward": 0.8089844211935997, + "reward_std": 0.2436203770339489, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.19960937649011612, + "step": 385 + }, + { + "completion_length": 548.859375, + "epoch": 0.8018696442482472, + "grad_norm": 0.09505198895931244, + "kl": 0.625109825283289, + "learning_rate": 6.588728230230004e-05, + "loss": 0.0003, + "reward": 0.7933594211935997, + "reward_std": 0.3088067825883627, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.19960937649011612, + "step": 386 + }, + { + "completion_length": 534.046875, + "epoch": 0.8039470267462997, + "grad_norm": 0.09738834947347641, + "kl": 0.5551509782671928, + "learning_rate": 6.573156569661025e-05, + "loss": 0.0003, + "reward": 0.8703125491738319, + "reward_std": 0.289471834897995, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.1984375026077032, + "step": 387 + }, + { + "completion_length": 557.40625, + "epoch": 0.8060244092443521, + "grad_norm": 0.0654783695936203, + "kl": 0.42286501079797745, + "learning_rate": 6.557567956058239e-05, + "loss": 0.0002, + "reward": 0.8093750476837158, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 388 + }, + { + "completion_length": 509.921875, + "epoch": 0.8081017917424046, + "grad_norm": 0.07516364008188248, + "kl": 0.5226034559309483, + "learning_rate": 6.541962557411469e-05, + "loss": 0.0003, + "reward": 0.8250000476837158, + "reward_std": 0.17677669040858746, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 389 + }, + { + "completion_length": 540.953125, + "epoch": 0.810179174240457, + "grad_norm": 0.08237718045711517, + "kl": 0.49187011271715164, + "learning_rate": 6.526340541891418e-05, + "loss": 0.0002, + "reward": 0.7937500476837158, + "reward_std": 0.2209708634763956, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 390 + }, + { + "completion_length": 538.03125, + "epoch": 0.8122565567385095, + "grad_norm": 0.08174508810043335, + "kl": 0.45481956005096436, + "learning_rate": 6.510702077847863e-05, + "loss": 0.0002, + "reward": 0.7625000439584255, + "reward_std": 0.26516503654420376, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 391 + }, + { + "completion_length": 552.703125, + "epoch": 0.8143339392365619, + "grad_norm": 0.09640171378850937, + "kl": 0.45114999637007713, + "learning_rate": 6.495047333807842e-05, + "loss": 0.0002, + "reward": 0.7621094211935997, + "reward_std": 0.309911634773016, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.19960937835276127, + "step": 392 + }, + { + "completion_length": 555.75, + "epoch": 0.8164113217346144, + "grad_norm": 0.07019418478012085, + "kl": 0.44265756756067276, + "learning_rate": 6.479376478473823e-05, + "loss": 0.0002, + "reward": 0.9500000476837158, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.75, + "rewards/format_reward_func": 0.20000000298023224, + "step": 393 + }, + { + "completion_length": 588.09375, + "epoch": 0.8184887042326668, + "grad_norm": 0.05258520692586899, + "kl": 0.4588502533733845, + "learning_rate": 6.463689680721904e-05, + "loss": 0.0002, + "reward": 0.8718750439584255, + "reward_std": 0.11048543266952038, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 394 + }, + { + "completion_length": 527.578125, + "epoch": 0.8205660867307193, + "grad_norm": 0.08728921413421631, + "kl": 0.44911035895347595, + "learning_rate": 6.447987109599986e-05, + "loss": 0.0002, + "reward": 0.7937500476837158, + "reward_std": 0.26516503654420376, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 395 + }, + { + "completion_length": 635.40625, + "epoch": 0.8226434692287717, + "grad_norm": 0.06634779274463654, + "kl": 0.38350560516119003, + "learning_rate": 6.432268934325946e-05, + "loss": 0.0002, + "reward": 0.8875000476837158, + "reward_std": 0.17677669040858746, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 396 + }, + { + "completion_length": 532.09375, + "epoch": 0.8247208517268242, + "grad_norm": 0.09231170266866684, + "kl": 0.46880777925252914, + "learning_rate": 6.416535324285824e-05, + "loss": 0.0002, + "reward": 0.6843750402331352, + "reward_std": 0.2872621212154627, + "rewards/argmax_reward_func": 0.484375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 397 + }, + { + "completion_length": 588.875, + "epoch": 0.8267982342248766, + "grad_norm": 0.07496833801269531, + "kl": 0.850627463310957, + "learning_rate": 6.400786449031986e-05, + "loss": 0.0004, + "reward": 0.8875000476837158, + "reward_std": 0.13258251920342445, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 398 + }, + { + "completion_length": 511.953125, + "epoch": 0.8288756167229291, + "grad_norm": 0.06271515041589737, + "kl": 0.4049219489097595, + "learning_rate": 6.385022478281306e-05, + "loss": 0.0002, + "reward": 0.8875000476837158, + "reward_std": 0.13258251920342445, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 399 + }, + { + "completion_length": 555.046875, + "epoch": 0.8309529992209815, + "grad_norm": 0.07291208207607269, + "kl": 0.4224717430770397, + "learning_rate": 6.369243581913336e-05, + "loss": 0.0002, + "reward": 0.7781250439584255, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 400 + }, + { + "completion_length": 573.515625, + "epoch": 0.833030381719034, + "grad_norm": 0.06133547052741051, + "kl": 0.42930199950933456, + "learning_rate": 6.353449929968465e-05, + "loss": 0.0002, + "reward": 0.7781250439584255, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 401 + }, + { + "completion_length": 522.8125, + "epoch": 0.8351077642170864, + "grad_norm": 0.06863158941268921, + "kl": 0.4221891984343529, + "learning_rate": 6.337641692646106e-05, + "loss": 0.0002, + "reward": 0.9019531756639481, + "reward_std": 0.15633688867092133, + "rewards/argmax_reward_func": 0.703125, + "rewards/format_reward_func": 0.19882812723517418, + "step": 402 + }, + { + "completion_length": 544.0, + "epoch": 0.8371851467151389, + "grad_norm": 0.07222079485654831, + "kl": 0.45428359508514404, + "learning_rate": 6.321819040302839e-05, + "loss": 0.0002, + "reward": 0.8093750476837158, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 403 + }, + { + "completion_length": 527.671875, + "epoch": 0.8392625292131913, + "grad_norm": 0.08342251926660538, + "kl": 0.4082505330443382, + "learning_rate": 6.305982143450597e-05, + "loss": 0.0002, + "reward": 0.8402344286441803, + "reward_std": 0.24362037930404767, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.19960937649011612, + "step": 404 + }, + { + "completion_length": 544.171875, + "epoch": 0.8413399117112438, + "grad_norm": 0.064121775329113, + "kl": 0.43093303963541985, + "learning_rate": 6.290131172754811e-05, + "loss": 0.0002, + "reward": 0.9949219226837158, + "reward_std": 0.15744174271821976, + "rewards/argmax_reward_func": 0.796875, + "rewards/format_reward_func": 0.19804687798023224, + "step": 405 + }, + { + "completion_length": 517.578125, + "epoch": 0.8434172942092963, + "grad_norm": 0.08640465885400772, + "kl": 0.44819287210702896, + "learning_rate": 6.274266299032582e-05, + "loss": 0.0002, + "reward": 0.8250000476837158, + "reward_std": 0.26516503654420376, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 406 + }, + { + "completion_length": 536.734375, + "epoch": 0.8454946767073488, + "grad_norm": 0.09182075411081314, + "kl": 0.39375099167227745, + "learning_rate": 6.25838769325083e-05, + "loss": 0.0002, + "reward": 0.7625000476837158, + "reward_std": 0.3093592096120119, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 407 + }, + { + "completion_length": 537.265625, + "epoch": 0.8475720592054012, + "grad_norm": 0.05770527943968773, + "kl": 0.48194558918476105, + "learning_rate": 6.24249552652447e-05, + "loss": 0.0002, + "reward": 0.7292969226837158, + "reward_std": 0.13534465618431568, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.19804687798023224, + "step": 408 + }, + { + "completion_length": 536.84375, + "epoch": 0.8496494417034537, + "grad_norm": 0.07029449939727783, + "kl": 0.4273468554019928, + "learning_rate": 6.226589970114543e-05, + "loss": 0.0002, + "reward": 0.8406250439584255, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 409 + }, + { + "completion_length": 539.140625, + "epoch": 0.8517268242015061, + "grad_norm": 0.0789664089679718, + "kl": 0.4228878915309906, + "learning_rate": 6.210671195426387e-05, + "loss": 0.0002, + "reward": 0.8093750476837158, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 410 + }, + { + "completion_length": 582.265625, + "epoch": 0.8538042066995586, + "grad_norm": 0.05472075939178467, + "kl": 0.39347052946686745, + "learning_rate": 6.194739374007792e-05, + "loss": 0.0002, + "reward": 0.8562500439584255, + "reward_std": 0.13258251920342445, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 411 + }, + { + "completion_length": 511.125, + "epoch": 0.855881589197611, + "grad_norm": 0.08022020757198334, + "kl": 0.44926824048161507, + "learning_rate": 6.178794677547137e-05, + "loss": 0.0002, + "reward": 0.7312500439584255, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 412 + }, + { + "completion_length": 502.40625, + "epoch": 0.8579589716956635, + "grad_norm": 0.08022835850715637, + "kl": 0.4819503165781498, + "learning_rate": 6.162837277871553e-05, + "loss": 0.0002, + "reward": 0.8718750476837158, + "reward_std": 0.24306794814765453, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 413 + }, + { + "completion_length": 537.1875, + "epoch": 0.8600363541937159, + "grad_norm": 0.06297382712364197, + "kl": 0.49380555003881454, + "learning_rate": 6.146867346945066e-05, + "loss": 0.0002, + "reward": 0.8875000476837158, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 414 + }, + { + "completion_length": 521.859375, + "epoch": 0.8621137366917684, + "grad_norm": 0.07238580286502838, + "kl": 0.4831845983862877, + "learning_rate": 6.130885056866742e-05, + "loss": 0.0002, + "reward": 0.8718750476837158, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 415 + }, + { + "completion_length": 538.84375, + "epoch": 0.8641911191898208, + "grad_norm": 0.070571668446064, + "kl": 0.5007887817919254, + "learning_rate": 6.114890579868837e-05, + "loss": 0.0003, + "reward": 0.8250000439584255, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 416 + }, + { + "completion_length": 573.25, + "epoch": 0.8662685016878733, + "grad_norm": 0.0768335610628128, + "kl": 0.4633421525359154, + "learning_rate": 6.098884088314938e-05, + "loss": 0.0002, + "reward": 0.8875000439584255, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 417 + }, + { + "completion_length": 627.6875, + "epoch": 0.8683458841859257, + "grad_norm": 0.07244177162647247, + "kl": 0.45967796072363853, + "learning_rate": 6.082865754698109e-05, + "loss": 0.0002, + "reward": 0.9187500476837158, + "reward_std": 0.2209708634763956, + "rewards/argmax_reward_func": 0.71875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 418 + }, + { + "completion_length": 558.171875, + "epoch": 0.8704232666839782, + "grad_norm": 0.07271222770214081, + "kl": 0.46447786316275597, + "learning_rate": 6.066835751639022e-05, + "loss": 0.0002, + "reward": 0.7925781682133675, + "reward_std": 0.22152329608798027, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.19882812909781933, + "step": 419 + }, + { + "completion_length": 539.90625, + "epoch": 0.8725006491820306, + "grad_norm": 0.05372535437345505, + "kl": 0.47306570410728455, + "learning_rate": 6.050794251884112e-05, + "loss": 0.0002, + "reward": 0.9031250476837158, + "reward_std": 0.11048543266952038, + "rewards/argmax_reward_func": 0.703125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 420 + }, + { + "completion_length": 579.59375, + "epoch": 0.8745780316800831, + "grad_norm": 0.06956978142261505, + "kl": 0.4903941936790943, + "learning_rate": 6.0347414283037004e-05, + "loss": 0.0002, + "reward": 0.7937500476837158, + "reward_std": 0.17677669040858746, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 421 + }, + { + "completion_length": 543.5625, + "epoch": 0.8766554141781355, + "grad_norm": 0.05839576572179794, + "kl": 0.46378039941191673, + "learning_rate": 6.018677453890149e-05, + "loss": 0.0002, + "reward": 0.8093750439584255, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 422 + }, + { + "completion_length": 571.265625, + "epoch": 0.878732796676188, + "grad_norm": 0.08274129778146744, + "kl": 0.4939221628010273, + "learning_rate": 6.002602501755974e-05, + "loss": 0.0002, + "reward": 0.9656250476837158, + "reward_std": 0.24306795001029968, + "rewards/argmax_reward_func": 0.765625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 423 + }, + { + "completion_length": 562.875, + "epoch": 0.8808101791742404, + "grad_norm": 0.052250444889068604, + "kl": 0.4885864891111851, + "learning_rate": 5.9865167451320005e-05, + "loss": 0.0002, + "reward": 0.9031250439584255, + "reward_std": 0.11048543266952038, + "rewards/argmax_reward_func": 0.703125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 424 + }, + { + "completion_length": 564.171875, + "epoch": 0.8828875616722929, + "grad_norm": 0.07596340775489807, + "kl": 0.4957350380718708, + "learning_rate": 5.970420357365486e-05, + "loss": 0.0002, + "reward": 0.6843750439584255, + "reward_std": 0.24306794814765453, + "rewards/argmax_reward_func": 0.484375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 425 + }, + { + "completion_length": 563.375, + "epoch": 0.8849649441703453, + "grad_norm": 0.0591680072247982, + "kl": 0.47624582052230835, + "learning_rate": 5.9543135119182514e-05, + "loss": 0.0002, + "reward": 0.7757812812924385, + "reward_std": 0.11269513890147209, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.19765625335276127, + "step": 426 + }, + { + "completion_length": 607.78125, + "epoch": 0.8870423266683978, + "grad_norm": 0.0683642029762268, + "kl": 0.5224468521773815, + "learning_rate": 5.938196382364818e-05, + "loss": 0.0003, + "reward": 0.8718750439584255, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 427 + }, + { + "completion_length": 584.8125, + "epoch": 0.8891197091664502, + "grad_norm": 0.05711337924003601, + "kl": 0.4908281937241554, + "learning_rate": 5.9220691423905305e-05, + "loss": 0.0002, + "reward": 0.9187500476837158, + "reward_std": 0.1325825173407793, + "rewards/argmax_reward_func": 0.71875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 428 + }, + { + "completion_length": 626.34375, + "epoch": 0.8911970916645027, + "grad_norm": 0.060382284224033356, + "kl": 0.586872935295105, + "learning_rate": 5.9059319657896884e-05, + "loss": 0.0003, + "reward": 0.9343750476837158, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.734375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 429 + }, + { + "completion_length": 597.703125, + "epoch": 0.8932744741625552, + "grad_norm": 0.07129113376140594, + "kl": 0.6612692400813103, + "learning_rate": 5.889785026463672e-05, + "loss": 0.0003, + "reward": 0.8554687947034836, + "reward_std": 0.17788155190646648, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.19921875186264515, + "step": 430 + }, + { + "completion_length": 568.046875, + "epoch": 0.8953518566606077, + "grad_norm": 0.07546839118003845, + "kl": 0.5523902028799057, + "learning_rate": 5.873628498419073e-05, + "loss": 0.0003, + "reward": 0.8250000476837158, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 431 + }, + { + "completion_length": 573.609375, + "epoch": 0.8974292391586601, + "grad_norm": 0.06730344146490097, + "kl": 0.5382697433233261, + "learning_rate": 5.8574625557658095e-05, + "loss": 0.0003, + "reward": 0.7312500402331352, + "reward_std": 0.17677669040858746, + "rewards/argmax_reward_func": 0.53125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 432 + }, + { + "completion_length": 576.84375, + "epoch": 0.8995066216567126, + "grad_norm": 0.04371188208460808, + "kl": 0.49328725039958954, + "learning_rate": 5.8412873727152595e-05, + "loss": 0.0002, + "reward": 0.8875000439584255, + "reward_std": 0.0883883461356163, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 433 + }, + { + "completion_length": 634.609375, + "epoch": 0.901584004154765, + "grad_norm": 0.05455589294433594, + "kl": 0.4651510939002037, + "learning_rate": 5.825103123578379e-05, + "loss": 0.0002, + "reward": 0.7468750439584255, + "reward_std": 0.11048543266952038, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 434 + }, + { + "completion_length": 607.640625, + "epoch": 0.9036613866528175, + "grad_norm": 0.06638182699680328, + "kl": 0.4994208887219429, + "learning_rate": 5.808909982763825e-05, + "loss": 0.0002, + "reward": 0.8875000476837158, + "reward_std": 0.17677669040858746, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 435 + }, + { + "completion_length": 577.171875, + "epoch": 0.9057387691508699, + "grad_norm": 0.05038674548268318, + "kl": 0.4841819517314434, + "learning_rate": 5.792708124776072e-05, + "loss": 0.0002, + "reward": 0.9031250476837158, + "reward_std": 0.11048543266952038, + "rewards/argmax_reward_func": 0.703125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 436 + }, + { + "completion_length": 618.765625, + "epoch": 0.9078161516489224, + "grad_norm": 0.06103122606873512, + "kl": 0.45625371113419533, + "learning_rate": 5.776497724213536e-05, + "loss": 0.0002, + "reward": 0.8406250476837158, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 437 + }, + { + "completion_length": 577.34375, + "epoch": 0.9098935341469748, + "grad_norm": 0.06659764796495438, + "kl": 0.5014519467949867, + "learning_rate": 5.760278955766695e-05, + "loss": 0.0003, + "reward": 0.7937500439584255, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.59375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 438 + }, + { + "completion_length": 588.828125, + "epoch": 0.9119709166450273, + "grad_norm": 0.06549356877803802, + "kl": 0.4932373948395252, + "learning_rate": 5.744051994216201e-05, + "loss": 0.0002, + "reward": 0.7781250476837158, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 439 + }, + { + "completion_length": 587.828125, + "epoch": 0.9140482991430797, + "grad_norm": 0.08965161442756653, + "kl": 0.4944054037332535, + "learning_rate": 5.727817014430992e-05, + "loss": 0.0002, + "reward": 0.7777344174683094, + "reward_std": 0.28781455382704735, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.19960937649011612, + "step": 440 + }, + { + "completion_length": 580.125, + "epoch": 0.9161256816411322, + "grad_norm": 0.07723158597946167, + "kl": 0.4910140074789524, + "learning_rate": 5.7115741913664264e-05, + "loss": 0.0002, + "reward": 0.8406250476837158, + "reward_std": 0.24306795187294483, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 441 + }, + { + "completion_length": 598.890625, + "epoch": 0.9182030641391846, + "grad_norm": 0.068883016705513, + "kl": 0.481427326798439, + "learning_rate": 5.695323700062375e-05, + "loss": 0.0002, + "reward": 0.7000000439584255, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.5, + "rewards/format_reward_func": 0.20000000298023224, + "step": 442 + }, + { + "completion_length": 600.140625, + "epoch": 0.9202804466372371, + "grad_norm": 0.07069353759288788, + "kl": 0.5068237520754337, + "learning_rate": 5.6790657156413504e-05, + "loss": 0.0003, + "reward": 0.714843787252903, + "reward_std": 0.1999786365777254, + "rewards/argmax_reward_func": 0.515625, + "rewards/format_reward_func": 0.19921875186264515, + "step": 443 + }, + { + "completion_length": 587.625, + "epoch": 0.9223578291352895, + "grad_norm": 1.0064716339111328, + "kl": 9.217760100960732, + "learning_rate": 5.66280041330661e-05, + "loss": 0.0046, + "reward": 0.7781250439584255, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 444 + }, + { + "completion_length": 590.78125, + "epoch": 0.924435211633342, + "grad_norm": 0.2128666639328003, + "kl": 3.3808604292571545, + "learning_rate": 5.646527968340278e-05, + "loss": 0.0017, + "reward": 0.7625000476837158, + "reward_std": 0.26516503654420376, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 445 + }, + { + "completion_length": 633.46875, + "epoch": 0.9265125941313944, + "grad_norm": 0.05456709861755371, + "kl": 0.49203020706772804, + "learning_rate": 5.6302485561014475e-05, + "loss": 0.0002, + "reward": 0.9500000439584255, + "reward_std": 0.13258251920342445, + "rewards/argmax_reward_func": 0.75, + "rewards/format_reward_func": 0.20000000298023224, + "step": 446 + }, + { + "completion_length": 584.015625, + "epoch": 0.9285899766294469, + "grad_norm": 0.06649811565876007, + "kl": 0.47326431795954704, + "learning_rate": 5.613962352024292e-05, + "loss": 0.0002, + "reward": 0.7625000439584255, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 447 + }, + { + "completion_length": 657.1875, + "epoch": 0.9306673591274993, + "grad_norm": 0.08579502999782562, + "kl": 0.4674902521073818, + "learning_rate": 5.597669531616181e-05, + "loss": 0.0002, + "reward": 0.8093750439584255, + "reward_std": 0.331456296145916, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 448 + }, + { + "completion_length": 664.71875, + "epoch": 0.9327447416255518, + "grad_norm": 0.056638821959495544, + "kl": 0.470287274569273, + "learning_rate": 5.5813702704557814e-05, + "loss": 0.0002, + "reward": 0.9187500476837158, + "reward_std": 0.13258251920342445, + "rewards/argmax_reward_func": 0.71875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 449 + }, + { + "completion_length": 618.296875, + "epoch": 0.9348221241236042, + "grad_norm": 0.040137626230716705, + "kl": 0.46139009296894073, + "learning_rate": 5.5650647441911706e-05, + "loss": 0.0002, + "reward": 0.9656250476837158, + "reward_std": 0.06629125960171223, + "rewards/argmax_reward_func": 0.765625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 450 + }, + { + "completion_length": 632.96875, + "epoch": 0.9368995066216567, + "grad_norm": 0.06284568458795547, + "kl": 0.517881490290165, + "learning_rate": 5.548753128537939e-05, + "loss": 0.0003, + "reward": 0.8718750476837158, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 451 + }, + { + "completion_length": 677.46875, + "epoch": 0.9389768891197091, + "grad_norm": 0.07211080193519592, + "kl": 0.46745334565639496, + "learning_rate": 5.532435599277303e-05, + "loss": 0.0002, + "reward": 0.7781250439584255, + "reward_std": 0.24306794814765453, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 452 + }, + { + "completion_length": 697.96875, + "epoch": 0.9410542716177617, + "grad_norm": 0.06623782962560654, + "kl": 0.4283139891922474, + "learning_rate": 5.516112332254203e-05, + "loss": 0.0002, + "reward": 0.8250000476837158, + "reward_std": 0.2209708634763956, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 453 + }, + { + "completion_length": 619.375, + "epoch": 0.9431316541158141, + "grad_norm": 0.08626007288694382, + "kl": 0.5277771130204201, + "learning_rate": 5.499783503375412e-05, + "loss": 0.0003, + "reward": 0.8250000476837158, + "reward_std": 0.30935920774936676, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 454 + }, + { + "completion_length": 674.0, + "epoch": 0.9452090366138666, + "grad_norm": 0.06324354559183121, + "kl": 0.4568277336657047, + "learning_rate": 5.4834492886076446e-05, + "loss": 0.0002, + "reward": 0.8714844174683094, + "reward_std": 0.19942620425717905, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.19960937835276127, + "step": 455 + }, + { + "completion_length": 701.0625, + "epoch": 0.947286419111919, + "grad_norm": 0.07262270897626877, + "kl": 0.4528024010360241, + "learning_rate": 5.4671098639756504e-05, + "loss": 0.0002, + "reward": 0.8250000476837158, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 456 + }, + { + "completion_length": 693.90625, + "epoch": 0.9493638016099715, + "grad_norm": 0.06315562129020691, + "kl": 0.43744752556085587, + "learning_rate": 5.4507654055603275e-05, + "loss": 0.0002, + "reward": 0.8093750476837158, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 457 + }, + { + "completion_length": 664.984375, + "epoch": 0.9514411841080239, + "grad_norm": 0.06270638853311539, + "kl": 0.4934372082352638, + "learning_rate": 5.4344160894968145e-05, + "loss": 0.0002, + "reward": 0.8093750476837158, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 458 + }, + { + "completion_length": 679.78125, + "epoch": 0.9535185666060764, + "grad_norm": 0.06320095807313919, + "kl": 0.4636671505868435, + "learning_rate": 5.418062091972604e-05, + "loss": 0.0002, + "reward": 0.9019531756639481, + "reward_std": 0.15633688890375197, + "rewards/argmax_reward_func": 0.703125, + "rewards/format_reward_func": 0.19882812537252903, + "step": 459 + }, + { + "completion_length": 768.765625, + "epoch": 0.9555959491041288, + "grad_norm": 0.08169972896575928, + "kl": 0.4431908018887043, + "learning_rate": 5.4017035892256365e-05, + "loss": 0.0002, + "reward": 0.7773437909781933, + "reward_std": 0.33256115578114986, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.19921875186264515, + "step": 460 + }, + { + "completion_length": 649.203125, + "epoch": 0.9576733316021813, + "grad_norm": 0.06060326099395752, + "kl": 0.4794473238289356, + "learning_rate": 5.385340757542402e-05, + "loss": 0.0002, + "reward": 0.6375000365078449, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.4375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 461 + }, + { + "completion_length": 752.859375, + "epoch": 0.9597507141002337, + "grad_norm": 0.07235154509544373, + "kl": 0.43572117015719414, + "learning_rate": 5.36897377325604e-05, + "loss": 0.0002, + "reward": 0.9328125417232513, + "reward_std": 0.2452776599675417, + "rewards/argmax_reward_func": 0.734375, + "rewards/format_reward_func": 0.1984375026077032, + "step": 462 + }, + { + "completion_length": 680.78125, + "epoch": 0.9618280965982862, + "grad_norm": 0.07361527532339096, + "kl": 0.45767712593078613, + "learning_rate": 5.352602812744441e-05, + "loss": 0.0002, + "reward": 0.9187500476837158, + "reward_std": 0.26516503654420376, + "rewards/argmax_reward_func": 0.71875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 463 + }, + { + "completion_length": 702.609375, + "epoch": 0.9639054790963386, + "grad_norm": 0.0740986093878746, + "kl": 0.493575606495142, + "learning_rate": 5.336228052428348e-05, + "loss": 0.0002, + "reward": 0.9031250476837158, + "reward_std": 0.24306795001029968, + "rewards/argmax_reward_func": 0.703125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 464 + }, + { + "completion_length": 733.359375, + "epoch": 0.9659828615943911, + "grad_norm": 0.07798778265714645, + "kl": 0.45794639363884926, + "learning_rate": 5.319849668769449e-05, + "loss": 0.0002, + "reward": 0.6675781644880772, + "reward_std": 0.2668223176151514, + "rewards/argmax_reward_func": 0.46875, + "rewards/format_reward_func": 0.19882812723517418, + "step": 465 + }, + { + "completion_length": 673.03125, + "epoch": 0.9680602440924435, + "grad_norm": 0.06236180663108826, + "kl": 0.4699827618896961, + "learning_rate": 5.303467838268478e-05, + "loss": 0.0002, + "reward": 0.8718750439584255, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 466 + }, + { + "completion_length": 695.03125, + "epoch": 0.970137626590496, + "grad_norm": 0.06047491356730461, + "kl": 0.42734822258353233, + "learning_rate": 5.287082737463317e-05, + "loss": 0.0002, + "reward": 0.8875000439584255, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 467 + }, + { + "completion_length": 700.515625, + "epoch": 0.9722150090885484, + "grad_norm": 0.05264519900083542, + "kl": 0.514576718211174, + "learning_rate": 5.270694542927088e-05, + "loss": 0.0003, + "reward": 0.9500000476837158, + "reward_std": 0.13258251920342445, + "rewards/argmax_reward_func": 0.75, + "rewards/format_reward_func": 0.20000000298023224, + "step": 468 + }, + { + "completion_length": 685.015625, + "epoch": 0.9742923915866009, + "grad_norm": 0.07000822573900223, + "kl": 0.47352610528469086, + "learning_rate": 5.254303431266254e-05, + "loss": 0.0002, + "reward": 0.8382812961935997, + "reward_std": 0.24638251960277557, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.19765625149011612, + "step": 469 + }, + { + "completion_length": 778.0, + "epoch": 0.9763697740846533, + "grad_norm": 0.0691061019897461, + "kl": 0.44779016450047493, + "learning_rate": 5.2379095791187124e-05, + "loss": 0.0002, + "reward": 0.8238281644880772, + "reward_std": 0.2226281464099884, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.19882812723517418, + "step": 470 + }, + { + "completion_length": 721.953125, + "epoch": 0.9784471565827058, + "grad_norm": 0.056446801871061325, + "kl": 0.4972013346850872, + "learning_rate": 5.2215131631518945e-05, + "loss": 0.0002, + "reward": 0.8406250476837158, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 471 + }, + { + "completion_length": 734.296875, + "epoch": 0.9805245390807582, + "grad_norm": 0.04742085933685303, + "kl": 0.4290156289935112, + "learning_rate": 5.20511436006086e-05, + "loss": 0.0002, + "reward": 0.9187500439584255, + "reward_std": 0.13258251920342445, + "rewards/argmax_reward_func": 0.71875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 472 + }, + { + "completion_length": 699.078125, + "epoch": 0.9826019215788107, + "grad_norm": 0.06520857661962509, + "kl": 0.44061052426695824, + "learning_rate": 5.188713346566393e-05, + "loss": 0.0002, + "reward": 0.9187500476837158, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.71875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 473 + }, + { + "completion_length": 816.515625, + "epoch": 0.9846793040768631, + "grad_norm": 0.06763774901628494, + "kl": 0.462362315505743, + "learning_rate": 5.172310299413099e-05, + "loss": 0.0002, + "reward": 0.8875000476837158, + "reward_std": 0.2651650384068489, + "rewards/argmax_reward_func": 0.6875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 474 + }, + { + "completion_length": 703.703125, + "epoch": 0.9867566865749156, + "grad_norm": 0.06497927010059357, + "kl": 0.4288316182792187, + "learning_rate": 5.1559053953674975e-05, + "loss": 0.0002, + "reward": 0.8562500476837158, + "reward_std": 0.1767766922712326, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 475 + }, + { + "completion_length": 734.265625, + "epoch": 0.9888340690729681, + "grad_norm": 0.061615679413080215, + "kl": 0.4229474924504757, + "learning_rate": 5.139498811216122e-05, + "loss": 0.0002, + "reward": 0.7156250383704901, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.515625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 476 + }, + { + "completion_length": 745.546875, + "epoch": 0.9909114515710206, + "grad_norm": 0.061035335063934326, + "kl": 0.44998469576239586, + "learning_rate": 5.123090723763606e-05, + "loss": 0.0002, + "reward": 0.7781250476837158, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.578125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 477 + }, + { + "completion_length": 726.9375, + "epoch": 0.992988834069073, + "grad_norm": 0.05501917377114296, + "kl": 0.4711364693939686, + "learning_rate": 5.106681309830791e-05, + "loss": 0.0002, + "reward": 0.9312500469386578, + "reward_std": 0.1590990237891674, + "rewards/argmax_reward_func": 0.734375, + "rewards/format_reward_func": 0.19687500223517418, + "step": 478 + }, + { + "completion_length": 731.265625, + "epoch": 0.9950662165671255, + "grad_norm": 0.06437938660383224, + "kl": 0.488413542509079, + "learning_rate": 5.090270746252802e-05, + "loss": 0.0002, + "reward": 0.8406250476837158, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 479 + }, + { + "completion_length": 877.296875, + "epoch": 0.9971435990651779, + "grad_norm": 0.05059191957116127, + "kl": 0.3898215554654598, + "learning_rate": 5.073859209877168e-05, + "loss": 0.0002, + "reward": 0.9179687947034836, + "reward_std": 0.22207572311162949, + "rewards/argmax_reward_func": 0.71875, + "rewards/format_reward_func": 0.19921875186264515, + "step": 480 + }, + { + "completion_length": 737.34375, + "epoch": 0.9992209815632304, + "grad_norm": 0.05600970238447189, + "kl": 0.4228545166552067, + "learning_rate": 5.057446877561884e-05, + "loss": 0.0002, + "reward": 0.9179687909781933, + "reward_std": 0.17788155004382133, + "rewards/argmax_reward_func": 0.71875, + "rewards/format_reward_func": 0.1992187537252903, + "step": 481 + }, + { + "completion_length": 867.2916666666666, + "epoch": 1.0, + "grad_norm": 0.0311344675719738, + "kl": 0.4165251553058624, + "learning_rate": 5.0410339261735384e-05, + "loss": 0.0001, + "reward": 0.9500000476837158, + "reward_std": 0.23570225636164346, + "rewards/argmax_reward_func": 0.75, + "rewards/format_reward_func": 0.20000000298023224, + "step": 482 + }, + { + "completion_length": 721.59375, + "epoch": 1.0020773824980524, + "grad_norm": 0.08058605343103409, + "kl": 0.4754480682313442, + "learning_rate": 5.0246205325853826e-05, + "loss": 0.0002, + "reward": 0.8250000476837158, + "reward_std": 0.30935921147465706, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 483 + }, + { + "completion_length": 749.40625, + "epoch": 1.004154764996105, + "grad_norm": 0.06808894872665405, + "kl": 0.4143032245337963, + "learning_rate": 5.008206873675433e-05, + "loss": 0.0002, + "reward": 0.9343750476837158, + "reward_std": 0.24306795001029968, + "rewards/argmax_reward_func": 0.734375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 484 + }, + { + "completion_length": 739.546875, + "epoch": 1.0062321474941573, + "grad_norm": 0.04413120448589325, + "kl": 0.4007079564034939, + "learning_rate": 4.991793126324568e-05, + "loss": 0.0002, + "reward": 0.9656250476837158, + "reward_std": 0.11048543266952038, + "rewards/argmax_reward_func": 0.765625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 485 + }, + { + "completion_length": 764.390625, + "epoch": 1.0083095299922098, + "grad_norm": 0.05544662848114967, + "kl": 0.40518468618392944, + "learning_rate": 4.9753794674146206e-05, + "loss": 0.0002, + "reward": 1.0125000476837158, + "reward_std": 0.17677669040858746, + "rewards/argmax_reward_func": 0.8125, + "rewards/format_reward_func": 0.20000000298023224, + "step": 486 + }, + { + "completion_length": 770.71875, + "epoch": 1.0103869124902622, + "grad_norm": 0.0641309842467308, + "kl": 0.41656066104769707, + "learning_rate": 4.9589660738264614e-05, + "loss": 0.0002, + "reward": 0.8718750476837158, + "reward_std": 0.19887377880513668, + "rewards/argmax_reward_func": 0.671875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 487 + }, + { + "completion_length": 774.984375, + "epoch": 1.0124642949883147, + "grad_norm": 0.06294507533311844, + "kl": 0.41369784995913506, + "learning_rate": 4.9425531224381163e-05, + "loss": 0.0002, + "reward": 0.7625000402331352, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 488 + }, + { + "completion_length": 725.171875, + "epoch": 1.0145416774863671, + "grad_norm": 0.058232299983501434, + "kl": 0.4683380052447319, + "learning_rate": 4.926140790122835e-05, + "loss": 0.0002, + "reward": 0.8406250476837158, + "reward_std": 0.19887377694249153, + "rewards/argmax_reward_func": 0.640625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 489 + }, + { + "completion_length": 782.046875, + "epoch": 1.0166190599844196, + "grad_norm": 0.053580548614263535, + "kl": 0.42908982560038567, + "learning_rate": 4.909729253747197e-05, + "loss": 0.0002, + "reward": 0.8093750476837158, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 490 + }, + { + "completion_length": 812.921875, + "epoch": 1.018696442482472, + "grad_norm": 0.07418368011713028, + "kl": 0.4282660707831383, + "learning_rate": 4.893318690169211e-05, + "loss": 0.0002, + "reward": 0.7625000476837158, + "reward_std": 0.30935920774936676, + "rewards/argmax_reward_func": 0.5625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 491 + }, + { + "completion_length": 772.4375, + "epoch": 1.0207738249805245, + "grad_norm": 0.053191013634204865, + "kl": 0.42502470314502716, + "learning_rate": 4.876909276236395e-05, + "loss": 0.0002, + "reward": 0.8093750439584255, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 492 + }, + { + "completion_length": 752.390625, + "epoch": 1.022851207478577, + "grad_norm": 0.05855753272771835, + "kl": 0.42473678290843964, + "learning_rate": 4.8605011887838797e-05, + "loss": 0.0002, + "reward": 0.8554687947034836, + "reward_std": 0.22207572497427464, + "rewards/argmax_reward_func": 0.65625, + "rewards/format_reward_func": 0.19921875186264515, + "step": 493 + }, + { + "completion_length": 757.84375, + "epoch": 1.0249285899766294, + "grad_norm": 0.06828629225492477, + "kl": 0.409926887601614, + "learning_rate": 4.844094604632502e-05, + "loss": 0.0002, + "reward": 0.9968750476837158, + "reward_std": 0.24306795001029968, + "rewards/argmax_reward_func": 0.796875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 494 + }, + { + "completion_length": 781.3125, + "epoch": 1.0270059724746818, + "grad_norm": 0.07438631355762482, + "kl": 0.41192958503961563, + "learning_rate": 4.827689700586902e-05, + "loss": 0.0002, + "reward": 0.8093750439584255, + "reward_std": 0.331456296145916, + "rewards/argmax_reward_func": 0.609375, + "rewards/format_reward_func": 0.20000000298023224, + "step": 495 + }, + { + "completion_length": 793.234375, + "epoch": 1.0290833549727343, + "grad_norm": 0.05079561844468117, + "kl": 0.4043182320892811, + "learning_rate": 4.811286653433609e-05, + "loss": 0.0002, + "reward": 0.7468750476837158, + "reward_std": 0.15467960387468338, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 496 + }, + { + "completion_length": 781.796875, + "epoch": 1.0311607374707867, + "grad_norm": 0.05202874913811684, + "kl": 0.413474939763546, + "learning_rate": 4.794885639939142e-05, + "loss": 0.0002, + "reward": 0.7468750439584255, + "reward_std": 0.15467960573732853, + "rewards/argmax_reward_func": 0.546875, + "rewards/format_reward_func": 0.20000000298023224, + "step": 497 + }, + { + "completion_length": 810.25, + "epoch": 1.0332381199688392, + "grad_norm": 0.05652881786227226, + "kl": 0.40745414793491364, + "learning_rate": 4.7784868368481067e-05, + "loss": 0.0002, + "reward": 0.9335937947034836, + "reward_std": 0.1999786328524351, + "rewards/argmax_reward_func": 0.734375, + "rewards/format_reward_func": 0.1992187537252903, + "step": 498 + }, + { + "completion_length": 799.765625, + "epoch": 1.0353155024668916, + "grad_norm": 0.0548785924911499, + "kl": 0.39410270750522614, + "learning_rate": 4.762090420881289e-05, + "loss": 0.0002, + "reward": 0.9953125491738319, + "reward_std": 0.1568893175572157, + "rewards/argmax_reward_func": 0.796875, + "rewards/format_reward_func": 0.1984375026077032, + "step": 499 + }, + { + "completion_length": 777.75, + "epoch": 1.037392884964944, + "grad_norm": 0.06177806481719017, + "kl": 0.6858577094972134, + "learning_rate": 4.745696568733748e-05, + "loss": 0.0003, + "reward": 0.8250000439584255, + "reward_std": 0.22097086533904076, + "rewards/argmax_reward_func": 0.625, + "rewards/format_reward_func": 0.20000000298023224, + "step": 500 + } + ], + "logging_steps": 1, + "max_steps": 962, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}