{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.467661691542289,
  "eval_steps": 500,
  "global_step": 450,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "completion_length": 497.89441871643066,
      "epoch": 0.01990049751243781,
      "grad_norm": 0.10737393795486937,
      "kl": 0.0,
      "learning_rate": 7.142857142857142e-08,
      "loss": -0.0,
      "reward": 0.31375559605658054,
      "reward_std": 0.4638704024255276,
      "rewards/equation_reward_func": 0.055524556897580624,
      "rewards/format_reward_func": 0.2582310391589999,
      "step": 2
    },
    {
      "completion_length": 500.6192989349365,
      "epoch": 0.03980099502487562,
      "grad_norm": 0.1599096035861993,
      "kl": 0.0005021095275878906,
      "learning_rate": 1.4285714285714285e-07,
      "loss": 0.0,
      "reward": 0.31654577143490314,
      "reward_std": 0.4677821956574917,
      "rewards/equation_reward_func": 0.052315850742161274,
      "rewards/format_reward_func": 0.2642299197614193,
      "step": 4
    },
    {
      "completion_length": 498.7277030944824,
      "epoch": 0.05970149253731343,
      "grad_norm": 0.12470905475686579,
      "kl": 0.0005538463592529297,
      "learning_rate": 2.1428571428571426e-07,
      "loss": 0.0,
      "reward": 0.31794086284935474,
      "reward_std": 0.4660887122154236,
      "rewards/equation_reward_func": 0.04980468982830644,
      "rewards/format_reward_func": 0.2681361688300967,
      "step": 6
    },
    {
      "completion_length": 499.9832820892334,
      "epoch": 0.07960199004975124,
      "grad_norm": 0.09022097705508712,
      "kl": 0.0006265640258789062,
      "learning_rate": 2.857142857142857e-07,
      "loss": 0.0,
      "reward": 0.32268416695296764,
      "reward_std": 0.46451471373438835,
      "rewards/equation_reward_func": 0.05259486869908869,
      "rewards/format_reward_func": 0.270089297555387,
      "step": 8
    },
    {
      "completion_length": 496.9704475402832,
      "epoch": 0.09950248756218906,
      "grad_norm": 0.2052586878420191,
      "kl": 0.0007350444793701172,
      "learning_rate": 3.5714285714285716e-07,
      "loss": 0.0,
      "reward": 0.354213185608387,
      "reward_std": 0.4816671200096607,
      "rewards/equation_reward_func": 0.05315290507860482,
      "rewards/format_reward_func": 0.3010602779686451,
      "step": 10
    },
    {
      "completion_length": 485.7511405944824,
      "epoch": 0.11940298507462686,
      "grad_norm": 0.21186116379159584,
      "kl": 0.0010306835174560547,
      "learning_rate": 4.285714285714285e-07,
      "loss": 0.0,
      "reward": 0.40011162497103214,
      "reward_std": 0.5103424116969109,
      "rewards/equation_reward_func": 0.06138393096625805,
      "rewards/format_reward_func": 0.33872769214212894,
      "step": 12
    },
    {
      "completion_length": 473.2119331359863,
      "epoch": 0.13930348258706468,
      "grad_norm": 0.09785836206724959,
      "kl": 0.0022177696228027344,
      "learning_rate": 5e-07,
      "loss": 0.0,
      "reward": 0.5080915465950966,
      "reward_std": 0.5347487553954124,
      "rewards/equation_reward_func": 0.051060270983725786,
      "rewards/format_reward_func": 0.45703127048909664,
      "step": 14
    },
    {
      "completion_length": 463.4574737548828,
      "epoch": 0.15920398009950248,
      "grad_norm": 0.1249696964395279,
      "kl": 0.0034818649291992188,
      "learning_rate": 4.999740409224932e-07,
      "loss": 0.0,
      "reward": 0.5510602928698063,
      "reward_std": 0.5517751909792423,
      "rewards/equation_reward_func": 0.05970982392318547,
      "rewards/format_reward_func": 0.49135046638548374,
      "step": 16
    },
    {
      "completion_length": 461.6699447631836,
      "epoch": 0.1791044776119403,
      "grad_norm": 0.07719879335571161,
      "kl": 0.02056884765625,
      "learning_rate": 4.998961690809627e-07,
      "loss": 0.0,
      "reward": 0.8258928880095482,
      "reward_std": 0.48028773814439774,
      "rewards/equation_reward_func": 0.07463728077709675,
      "rewards/format_reward_func": 0.7512556128203869,
      "step": 18
    },
    {
      "completion_length": 459.39315605163574,
      "epoch": 0.19900497512437812,
      "grad_norm": 0.05432953005138694,
      "kl": 0.01270294189453125,
      "learning_rate": 4.997664006472578e-07,
      "loss": 0.0,
      "reward": 0.8645368739962578,
      "reward_std": 0.462785379961133,
      "rewards/equation_reward_func": 0.08398437779396772,
      "rewards/format_reward_func": 0.7805524915456772,
      "step": 20
    },
    {
      "completion_length": 465.21360969543457,
      "epoch": 0.21890547263681592,
      "grad_norm": 0.04023038097471816,
      "kl": 0.018253326416015625,
      "learning_rate": 4.995847625707292e-07,
      "loss": 0.0,
      "reward": 0.9042969159781933,
      "reward_std": 0.44192507304251194,
      "rewards/equation_reward_func": 0.08663504850119352,
      "rewards/format_reward_func": 0.8176618590950966,
      "step": 22
    },
    {
      "completion_length": 458.09446716308594,
      "epoch": 0.23880597014925373,
      "grad_norm": 0.3953751452129234,
      "kl": 0.22201919555664062,
      "learning_rate": 4.993512925726318e-07,
      "loss": 0.0002,
      "reward": 0.940011203289032,
      "reward_std": 0.4100842922925949,
      "rewards/equation_reward_func": 0.09012277191504836,
      "rewards/format_reward_func": 0.8498884290456772,
      "step": 24
    },
    {
      "completion_length": 454.33079528808594,
      "epoch": 0.25870646766169153,
      "grad_norm": 0.03286998841319716,
      "kl": 0.013294219970703125,
      "learning_rate": 4.990660391382923e-07,
      "loss": 0.0,
      "reward": 1.002092681825161,
      "reward_std": 0.33628420904278755,
      "rewards/equation_reward_func": 0.09430804010480642,
      "rewards/format_reward_func": 0.9077846370637417,
      "step": 26
    },
    {
      "completion_length": 445.8059616088867,
      "epoch": 0.27860696517412936,
      "grad_norm": 0.039643071301690654,
      "kl": 0.01692962646484375,
      "learning_rate": 4.987290615070384e-07,
      "loss": 0.0,
      "reward": 1.0570592060685158,
      "reward_std": 0.3109265537932515,
      "rewards/equation_reward_func": 0.1146763451397419,
      "rewards/format_reward_func": 0.9423828534781933,
      "step": 28
    },
    {
      "completion_length": 438.74750900268555,
      "epoch": 0.29850746268656714,
      "grad_norm": 0.029650001026949938,
      "kl": 0.0180511474609375,
      "learning_rate": 4.983404296598978e-07,
      "loss": 0.0,
      "reward": 1.0705915540456772,
      "reward_std": 0.28584692627191544,
      "rewards/equation_reward_func": 0.11621094262227416,
      "rewards/format_reward_func": 0.9543806239962578,
      "step": 30
    },
    {
      "completion_length": 433.07144927978516,
      "epoch": 0.31840796019900497,
      "grad_norm": 0.023126884095602856,
      "kl": 0.027069091796875,
      "learning_rate": 4.979002243050646e-07,
      "loss": 0.0,
      "reward": 1.074776828289032,
      "reward_std": 0.2771673630923033,
      "rewards/equation_reward_func": 0.11523438105359674,
      "rewards/format_reward_func": 0.959542453289032,
      "step": 32
    },
    {
      "completion_length": 434.3010845184326,
      "epoch": 0.3383084577114428,
      "grad_norm": 0.023783878023720312,
      "kl": 0.02630615234375,
      "learning_rate": 4.974085368611381e-07,
      "loss": 0.0,
      "reward": 1.0905413404107094,
      "reward_std": 0.2704134099185467,
      "rewards/equation_reward_func": 0.12513951491564512,
      "rewards/format_reward_func": 0.965401828289032,
      "step": 34
    },
    {
      "completion_length": 424.73927307128906,
      "epoch": 0.3582089552238806,
      "grad_norm": 0.022826217100098863,
      "kl": 0.02862548828125,
      "learning_rate": 4.968654694381379e-07,
      "loss": 0.0,
      "reward": 1.1011440306901932,
      "reward_std": 0.27443134970963,
      "rewards/equation_reward_func": 0.13364955829456449,
      "rewards/format_reward_func": 0.9674944616854191,
      "step": 36
    },
    {
      "completion_length": 421.71764945983887,
      "epoch": 0.3781094527363184,
      "grad_norm": 0.02576898038639457,
      "kl": 0.031646728515625,
      "learning_rate": 4.962711348162987e-07,
      "loss": 0.0,
      "reward": 1.1131417900323868,
      "reward_std": 0.2660633362829685,
      "rewards/equation_reward_func": 0.13950893329456449,
      "rewards/format_reward_func": 0.9736328534781933,
      "step": 38
    },
    {
      "completion_length": 424.4537010192871,
      "epoch": 0.39800995024875624,
      "grad_norm": 0.022612440149115728,
      "kl": 0.026824951171875,
      "learning_rate": 4.956256564226487e-07,
      "loss": 0.0,
      "reward": 1.1064453646540642,
      "reward_std": 0.27615197841078043,
      "rewards/equation_reward_func": 0.13560268422588706,
      "rewards/format_reward_func": 0.9708426780998707,
      "step": 40
    },
    {
      "completion_length": 425.5305690765381,
      "epoch": 0.417910447761194,
      "grad_norm": 0.023153553071042596,
      "kl": 0.02577972412109375,
      "learning_rate": 4.949291683053768e-07,
      "loss": 0.0,
      "reward": 1.115373931825161,
      "reward_std": 0.2722494639456272,
      "rewards/equation_reward_func": 0.14383371267467737,
      "rewards/format_reward_func": 0.9715402275323868,
      "step": 42
    },
    {
      "completion_length": 422.5471706390381,
      "epoch": 0.43781094527363185,
      "grad_norm": 0.027299150469364885,
      "kl": 0.028350830078125,
      "learning_rate": 4.941818151059955e-07,
      "loss": 0.0,
      "reward": 1.125837117433548,
      "reward_std": 0.2778179133310914,
      "rewards/equation_reward_func": 0.15150670520961285,
      "rewards/format_reward_func": 0.9743304029107094,
      "step": 44
    },
    {
      "completion_length": 422.8593921661377,
      "epoch": 0.4577114427860697,
      "grad_norm": 0.023537286216153662,
      "kl": 0.03047943115234375,
      "learning_rate": 4.933837520293017e-07,
      "loss": 0.0,
      "reward": 1.1294643208384514,
      "reward_std": 0.2802300015464425,
      "rewards/equation_reward_func": 0.15583148133009672,
      "rewards/format_reward_func": 0.9736328534781933,
      "step": 46
    },
    {
      "completion_length": 428.58651542663574,
      "epoch": 0.47761194029850745,
      "grad_norm": 0.025036984912196695,
      "kl": 0.04383087158203125,
      "learning_rate": 4.925351448111454e-07,
      "loss": 0.0,
      "reward": 1.1287667974829674,
      "reward_std": 0.2709479182958603,
      "rewards/equation_reward_func": 0.15401786426082253,
      "rewards/format_reward_func": 0.9747489280998707,
      "step": 48
    },
    {
      "completion_length": 419.9082202911377,
      "epoch": 0.4975124378109453,
      "grad_norm": 0.020185437487083808,
      "kl": 0.0331573486328125,
      "learning_rate": 4.91636169684011e-07,
      "loss": 0.0,
      "reward": 1.160993367433548,
      "reward_std": 0.29635069239884615,
      "rewards/equation_reward_func": 0.18931362591683865,
      "rewards/format_reward_func": 0.9716797284781933,
      "step": 50
    },
    {
      "completion_length": 425.99974250793457,
      "epoch": 0.5174129353233831,
      "grad_norm": 0.020624606663129857,
      "kl": 0.03849029541015625,
      "learning_rate": 4.906870133404186e-07,
      "loss": 0.0,
      "reward": 1.1577846482396126,
      "reward_std": 0.283615630120039,
      "rewards/equation_reward_func": 0.18191965157166123,
      "rewards/format_reward_func": 0.9758649952709675,
      "step": 52
    },
    {
      "completion_length": 424.38018226623535,
      "epoch": 0.5373134328358209,
      "grad_norm": 0.023250898076743586,
      "kl": 0.036407470703125,
      "learning_rate": 4.896878728941531e-07,
      "loss": 0.0,
      "reward": 1.1752232536673546,
      "reward_std": 0.29128449968993664,
      "rewards/equation_reward_func": 0.19824219681322575,
      "rewards/format_reward_func": 0.9769810698926449,
      "step": 54
    },
    {
      "completion_length": 433.0477294921875,
      "epoch": 0.5572139303482587,
      "grad_norm": 0.02065475126352545,
      "kl": 0.040374755859375,
      "learning_rate": 4.886389558393284e-07,
      "loss": 0.0,
      "reward": 1.189313679933548,
      "reward_std": 0.2798562031239271,
      "rewards/equation_reward_func": 0.21149554569274187,
      "rewards/format_reward_func": 0.9778181277215481,
      "step": 56
    },
    {
      "completion_length": 428.04257011413574,
      "epoch": 0.5771144278606966,
      "grad_norm": 0.02202684429694144,
      "kl": 0.04254150390625,
      "learning_rate": 4.875404800072976e-07,
      "loss": 0.0,
      "reward": 1.2169364243745804,
      "reward_std": 0.30818541161715984,
      "rewards/equation_reward_func": 0.23883929383009672,
      "rewards/format_reward_func": 0.9780971370637417,
      "step": 58
    },
    {
      "completion_length": 440.5400581359863,
      "epoch": 0.5970149253731343,
      "grad_norm": 0.022136857567870986,
      "kl": 0.0439300537109375,
      "learning_rate": 4.86392673521415e-07,
      "loss": 0.0,
      "reward": 1.197823703289032,
      "reward_std": 0.2901919763535261,
      "rewards/equation_reward_func": 0.22070313524454832,
      "rewards/format_reward_func": 0.9771205745637417,
      "step": 60
    },
    {
      "completion_length": 446.76606369018555,
      "epoch": 0.6169154228855721,
      "grad_norm": 0.020045833524807325,
      "kl": 0.0489501953125,
      "learning_rate": 4.851957747496606e-07,
      "loss": 0.0,
      "reward": 1.2128906697034836,
      "reward_std": 0.30074305925518274,
      "rewards/equation_reward_func": 0.23995536845177412,
      "rewards/format_reward_func": 0.9729353003203869,
      "step": 62
    },
    {
      "completion_length": 452.036434173584,
      "epoch": 0.6368159203980099,
      "grad_norm": 0.0239338686052094,
      "kl": 0.0522613525390625,
      "learning_rate": 4.839500322551386e-07,
      "loss": 0.0001,
      "reward": 1.2333984822034836,
      "reward_std": 0.30399987660348415,
      "rewards/equation_reward_func": 0.258928582072258,
      "rewards/format_reward_func": 0.9744699038565159,
      "step": 64
    },
    {
      "completion_length": 448.9843940734863,
      "epoch": 0.6567164179104478,
      "grad_norm": 0.022610330073719636,
      "kl": 0.057861328125,
      "learning_rate": 4.826557047444563e-07,
      "loss": 0.0001,
      "reward": 1.2505580931901932,
      "reward_std": 0.31434865668416023,
      "rewards/equation_reward_func": 0.276925235055387,
      "rewards/format_reward_func": 0.973632849752903,
      "step": 66
    },
    {
      "completion_length": 454.09237480163574,
      "epoch": 0.6766169154228856,
      "grad_norm": 0.023403129215536396,
      "kl": 0.05816650390625,
      "learning_rate": 4.813130610139993e-07,
      "loss": 0.0001,
      "reward": 1.255998931825161,
      "reward_std": 0.31524653546512127,
      "rewards/equation_reward_func": 0.28669085912406445,
      "rewards/format_reward_func": 0.9693080857396126,
      "step": 68
    },
    {
      "completion_length": 466.7805767059326,
      "epoch": 0.6965174129353234,
      "grad_norm": 0.02003913094689346,
      "kl": 0.0601806640625,
      "learning_rate": 4.799223798941089e-07,
      "loss": 0.0001,
      "reward": 1.24874447286129,
      "reward_std": 0.3061249665915966,
      "rewards/equation_reward_func": 0.2795759029686451,
      "rewards/format_reward_func": 0.9691685661673546,
      "step": 70
    },
    {
      "completion_length": 467.3309383392334,
      "epoch": 0.7164179104477612,
      "grad_norm": 0.01978949781210455,
      "kl": 0.061767578125,
      "learning_rate": 4.78483950191177e-07,
      "loss": 0.0001,
      "reward": 1.258928619325161,
      "reward_std": 0.3061672504991293,
      "rewards/equation_reward_func": 0.28724889643490314,
      "rewards/format_reward_func": 0.9716797359287739,
      "step": 72
    },
    {
      "completion_length": 471.1625499725342,
      "epoch": 0.736318407960199,
      "grad_norm": 0.020300590196667745,
      "kl": 0.0648193359375,
      "learning_rate": 4.769980706276687e-07,
      "loss": 0.0001,
      "reward": 1.2569754868745804,
      "reward_std": 0.299039950594306,
      "rewards/equation_reward_func": 0.2861328236758709,
      "rewards/format_reward_func": 0.9708426706492901,
      "step": 74
    },
    {
      "completion_length": 478.2706756591797,
      "epoch": 0.7562189054726368,
      "grad_norm": 0.021712386701862433,
      "kl": 0.06854248046875,
      "learning_rate": 4.7546504978008595e-07,
      "loss": 0.0001,
      "reward": 1.2862723618745804,
      "reward_std": 0.32256563380360603,
      "rewards/equation_reward_func": 0.3172433190047741,
      "rewards/format_reward_func": 0.9690290540456772,
      "step": 76
    },
    {
      "completion_length": 474.00350761413574,
      "epoch": 0.7761194029850746,
      "grad_norm": 0.024547045380638694,
      "kl": 0.0723876953125,
      "learning_rate": 4.738852060148848e-07,
      "loss": 0.0001,
      "reward": 1.2960380166769028,
      "reward_std": 0.3062659613788128,
      "rewards/equation_reward_func": 0.3250558152794838,
      "rewards/format_reward_func": 0.9709821864962578,
      "step": 78
    },
    {
      "completion_length": 478.568660736084,
      "epoch": 0.7960199004975125,
      "grad_norm": 0.0288169498485096,
      "kl": 0.0826416015625,
      "learning_rate": 4.722588674223593e-07,
      "loss": 0.0001,
      "reward": 1.3084543123841286,
      "reward_std": 0.31972748413681984,
      "rewards/equation_reward_func": 0.3383091650903225,
      "rewards/format_reward_func": 0.9701451361179352,
      "step": 80
    },
    {
      "completion_length": 479.23690605163574,
      "epoch": 0.8159203980099502,
      "grad_norm": 0.023334759508638104,
      "kl": 0.076751708984375,
      "learning_rate": 4.70586371748506e-07,
      "loss": 0.0001,
      "reward": 1.3094308599829674,
      "reward_std": 0.3208370003849268,
      "rewards/equation_reward_func": 0.34207590855658054,
      "rewards/format_reward_func": 0.9673549495637417,
      "step": 82
    },
    {
      "completion_length": 472.42859268188477,
      "epoch": 0.835820895522388,
      "grad_norm": 0.022189875549398243,
      "kl": 0.083038330078125,
      "learning_rate": 4.6886806632488363e-07,
      "loss": 0.0001,
      "reward": 1.3257534205913544,
      "reward_std": 0.3084813691675663,
      "rewards/equation_reward_func": 0.354213185608387,
      "rewards/format_reward_func": 0.9715402200818062,
      "step": 84
    },
    {
      "completion_length": 482.4203643798828,
      "epoch": 0.8557213930348259,
      "grad_norm": 0.025550851376685307,
      "kl": 0.08477783203125,
      "learning_rate": 4.6710430799648143e-07,
      "loss": 0.0001,
      "reward": 1.3151507154107094,
      "reward_std": 0.3119509872049093,
      "rewards/equation_reward_func": 0.3477957770228386,
      "rewards/format_reward_func": 0.9673549570143223,
      "step": 86
    },
    {
      "completion_length": 486.5195541381836,
      "epoch": 0.8756218905472637,
      "grad_norm": 0.02223938159712647,
      "kl": 0.087982177734375,
      "learning_rate": 4.652954630476127e-07,
      "loss": 0.0001,
      "reward": 1.3271484971046448,
      "reward_std": 0.2954318383708596,
      "rewards/equation_reward_func": 0.3623046986758709,
      "rewards/format_reward_func": 0.9648437798023224,
      "step": 88
    },
    {
      "completion_length": 484.2335624694824,
      "epoch": 0.8955223880597015,
      "grad_norm": 0.027365354466186077,
      "kl": 0.09051513671875,
      "learning_rate": 4.6344190712584713e-07,
      "loss": 0.0001,
      "reward": 1.3436105474829674,
      "reward_std": 0.30479095969349146,
      "rewards/equation_reward_func": 0.3743024729192257,
      "rewards/format_reward_func": 0.9693080745637417,
      "step": 90
    },
    {
      "completion_length": 487.52652740478516,
      "epoch": 0.9154228855721394,
      "grad_norm": 0.024069403720113854,
      "kl": 0.09832763671875,
      "learning_rate": 4.615440251639995e-07,
      "loss": 0.0001,
      "reward": 1.3451451659202576,
      "reward_std": 0.3047034330666065,
      "rewards/equation_reward_func": 0.37541854195296764,
      "rewards/format_reward_func": 0.9697266034781933,
      "step": 92
    },
    {
      "completion_length": 477.2613220214844,
      "epoch": 0.9353233830845771,
      "grad_norm": 0.02846015039604419,
      "kl": 0.101318359375,
      "learning_rate": 4.596022113001894e-07,
      "loss": 0.0001,
      "reward": 1.3302176892757416,
      "reward_std": 0.29510026797652245,
      "rewards/equation_reward_func": 0.3597935438156128,
      "rewards/format_reward_func": 0.9704241491854191,
      "step": 94
    },
    {
      "completion_length": 463.9218978881836,
      "epoch": 0.9552238805970149,
      "grad_norm": 0.02336160587101436,
      "kl": 0.11102294921875,
      "learning_rate": 4.576168687959895e-07,
      "loss": 0.0001,
      "reward": 1.3597935810685158,
      "reward_std": 0.27583552338182926,
      "rewards/equation_reward_func": 0.38295202516019344,
      "rewards/format_reward_func": 0.9768415540456772,
      "step": 96
    },
    {
      "completion_length": 475.0615482330322,
      "epoch": 0.9751243781094527,
      "grad_norm": 0.023628466997817635,
      "kl": 0.1075439453125,
      "learning_rate": 4.555884099526793e-07,
      "loss": 0.0001,
      "reward": 1.339146263897419,
      "reward_std": 0.2757530389353633,
      "rewards/equation_reward_func": 0.3607701025903225,
      "rewards/format_reward_func": 0.9783761575818062,
      "step": 98
    },
    {
      "completion_length": 453.8726501464844,
      "epoch": 0.9950248756218906,
      "grad_norm": 0.02314348112300545,
      "kl": 0.116424560546875,
      "learning_rate": 4.5351725602562174e-07,
      "loss": 0.0001,
      "reward": 1.3716518506407738,
      "reward_std": 0.27756993286311626,
      "rewards/equation_reward_func": 0.39006697945296764,
      "rewards/format_reward_func": 0.9815848655998707,
      "step": 100
    },
    {
      "completion_length": 445.43651580810547,
      "epoch": 1.0199004975124377,
      "grad_norm": 0.031204448170122,
      "kl": 0.122705078125,
      "learning_rate": 4.514038371367791e-07,
      "loss": 0.0002,
      "reward": 1.3746652364730836,
      "reward_std": 0.2709580324590206,
      "rewards/equation_reward_func": 0.3933035880327225,
      "rewards/format_reward_func": 0.9813616544008255,
      "step": 102
    },
    {
      "completion_length": 438.7907543182373,
      "epoch": 1.0398009950248757,
      "grad_norm": 0.05302272032875539,
      "kl": 0.17510986328125,
      "learning_rate": 4.4924859218538936e-07,
      "loss": 0.0002,
      "reward": 1.3924386873841286,
      "reward_std": 0.2553457273170352,
      "rewards/equation_reward_func": 0.40945872850716114,
      "rewards/format_reward_func": 0.9829799495637417,
      "step": 104
    },
    {
      "completion_length": 436.17859268188477,
      "epoch": 1.0597014925373134,
      "grad_norm": 0.02585336057360534,
      "kl": 0.136199951171875,
      "learning_rate": 4.470519687568185e-07,
      "loss": 0.0001,
      "reward": 1.363699845969677,
      "reward_std": 0.2516592526808381,
      "rewards/equation_reward_func": 0.37960381247103214,
      "rewards/format_reward_func": 0.9840960279107094,
      "step": 106
    },
    {
      "completion_length": 425.2934055328369,
      "epoch": 1.0796019900497513,
      "grad_norm": 0.02611052947250305,
      "kl": 0.149169921875,
      "learning_rate": 4.4481442302960923e-07,
      "loss": 0.0001,
      "reward": 1.3782087713479996,
      "reward_std": 0.23252319544553757,
      "rewards/equation_reward_func": 0.39453126676380634,
      "rewards/format_reward_func": 0.9836774952709675,
      "step": 108
    },
    {
      "completion_length": 420.09809494018555,
      "epoch": 1.099502487562189,
      "grad_norm": 0.029872543237961284,
      "kl": 0.15460205078125,
      "learning_rate": 4.4253641968074505e-07,
      "loss": 0.0002,
      "reward": 1.3736049681901932,
      "reward_std": 0.24892226420342922,
      "rewards/equation_reward_func": 0.3900669813156128,
      "rewards/format_reward_func": 0.9835379905998707,
      "step": 110
    },
    {
      "completion_length": 409.33427810668945,
      "epoch": 1.1194029850746268,
      "grad_norm": 0.031814359795596575,
      "kl": 0.337646484375,
      "learning_rate": 4.402184317891501e-07,
      "loss": 0.0003,
      "reward": 1.3981585577130318,
      "reward_std": 0.2557879099622369,
      "rewards/equation_reward_func": 0.4168526940047741,
      "rewards/format_reward_func": 0.9813058413565159,
      "step": 112
    },
    {
      "completion_length": 410.7158374786377,
      "epoch": 1.1393034825870647,
      "grad_norm": 0.03295876265632556,
      "kl": 0.21923828125,
      "learning_rate": 4.37860940737443e-07,
      "loss": 0.0002,
      "reward": 1.37374447286129,
      "reward_std": 0.2627491420134902,
      "rewards/equation_reward_func": 0.4023437686264515,
      "rewards/format_reward_func": 0.9714007116854191,
      "step": 114
    },
    {
      "completion_length": 409.2297878265381,
      "epoch": 1.1592039800995024,
      "grad_norm": 0.038667349208111046,
      "kl": 0.19073486328125,
      "learning_rate": 4.354644361119671e-07,
      "loss": 0.0002,
      "reward": 1.3642578944563866,
      "reward_std": 0.27275012247264385,
      "rewards/equation_reward_func": 0.398158498108387,
      "rewards/format_reward_func": 0.9660993739962578,
      "step": 116
    },
    {
      "completion_length": 403.59544372558594,
      "epoch": 1.1791044776119404,
      "grad_norm": 0.036040137536976935,
      "kl": 0.208984375,
      "learning_rate": 4.3302941560111716e-07,
      "loss": 0.0002,
      "reward": 1.3783482760190964,
      "reward_std": 0.29416331835091114,
      "rewards/equation_reward_func": 0.4210379682481289,
      "rewards/format_reward_func": 0.9573103114962578,
      "step": 118
    },
    {
      "completion_length": 414.47644233703613,
      "epoch": 1.199004975124378,
      "grad_norm": 0.04236714791677691,
      "kl": 0.21710205078125,
      "learning_rate": 4.3055638489198236e-07,
      "loss": 0.0002,
      "reward": 1.331194244325161,
      "reward_std": 0.30849068984389305,
      "rewards/equation_reward_func": 0.38560269586741924,
      "rewards/format_reward_func": 0.9455915614962578,
      "step": 120
    },
    {
      "completion_length": 405.2057914733887,
      "epoch": 1.2189054726368158,
      "grad_norm": 0.09728181088720217,
      "kl": 0.23699951171875,
      "learning_rate": 4.280458575653296e-07,
      "loss": 0.0002,
      "reward": 1.3745815306901932,
      "reward_std": 0.32267612777650356,
      "rewards/equation_reward_func": 0.43498885817825794,
      "rewards/format_reward_func": 0.9395926706492901,
      "step": 122
    },
    {
      "completion_length": 410.2011909484863,
      "epoch": 1.2388059701492538,
      "grad_norm": 0.08144761353636777,
      "kl": 0.26934814453125,
      "learning_rate": 4.2549835498894665e-07,
      "loss": 0.0003,
      "reward": 1.3253348842263222,
      "reward_std": 0.31379703618586063,
      "rewards/equation_reward_func": 0.3900669850409031,
      "rewards/format_reward_func": 0.9352679066359997,
      "step": 124
    },
    {
      "completion_length": 401.1234836578369,
      "epoch": 1.2587064676616915,
      "grad_norm": 11.330193029655701,
      "kl": 3.78851318359375,
      "learning_rate": 4.229144062093679e-07,
      "loss": 0.0038,
      "reward": 1.3459822088479996,
      "reward_std": 0.3209280576556921,
      "rewards/equation_reward_func": 0.40638952888548374,
      "rewards/format_reward_func": 0.9395926743745804,
      "step": 126
    },
    {
      "completion_length": 402.68640518188477,
      "epoch": 1.2786069651741294,
      "grad_norm": 0.043310106761832265,
      "kl": 0.29443359375,
      "learning_rate": 4.2029454784200675e-07,
      "loss": 0.0003,
      "reward": 1.3317522779107094,
      "reward_std": 0.30866547860205173,
      "rewards/equation_reward_func": 0.39578684605658054,
      "rewards/format_reward_func": 0.9359654411673546,
      "step": 128
    },
    {
      "completion_length": 397.28475761413574,
      "epoch": 1.2985074626865671,
      "grad_norm": 0.03785892052881466,
      "kl": 0.303955078125,
      "learning_rate": 4.1763932395971433e-07,
      "loss": 0.0003,
      "reward": 1.3419364467263222,
      "reward_std": 0.3180979546159506,
      "rewards/equation_reward_func": 0.4065290354192257,
      "rewards/format_reward_func": 0.9354074075818062,
      "step": 130
    },
    {
      "completion_length": 398.46011543273926,
      "epoch": 1.3184079601990049,
      "grad_norm": 0.09587175758858843,
      "kl": 0.5091552734375,
      "learning_rate": 4.1494928597979117e-07,
      "loss": 0.0005,
      "reward": 1.3143136650323868,
      "reward_std": 0.3102891594171524,
      "rewards/equation_reward_func": 0.38309153728187084,
      "rewards/format_reward_func": 0.9312221445143223,
      "step": 132
    },
    {
      "completion_length": 389.87487602233887,
      "epoch": 1.3383084577114428,
      "grad_norm": 0.05091987798629075,
      "kl": 0.343017578125,
      "learning_rate": 4.122249925494726e-07,
      "loss": 0.0003,
      "reward": 1.3254743814468384,
      "reward_std": 0.31261836364865303,
      "rewards/equation_reward_func": 0.39188059978187084,
      "rewards/format_reward_func": 0.9335937984287739,
      "step": 134
    },
    {
      "completion_length": 375.31112480163574,
      "epoch": 1.3582089552238805,
      "grad_norm": 0.04504808671482071,
      "kl": 0.337646484375,
      "learning_rate": 4.094670094299131e-07,
      "loss": 0.0003,
      "reward": 1.3480748385190964,
      "reward_std": 0.291673069819808,
      "rewards/equation_reward_func": 0.4047154225409031,
      "rewards/format_reward_func": 0.943359412252903,
      "step": 136
    },
    {
      "completion_length": 372.3832492828369,
      "epoch": 1.3781094527363185,
      "grad_norm": 0.045870065189935895,
      "kl": 0.320068359375,
      "learning_rate": 4.066759093786931e-07,
      "loss": 0.0003,
      "reward": 1.336216576397419,
      "reward_std": 0.2885549124330282,
      "rewards/equation_reward_func": 0.3929966688156128,
      "rewards/format_reward_func": 0.9432199075818062,
      "step": 138
    },
    {
      "completion_length": 365.1819381713867,
      "epoch": 1.3980099502487562,
      "grad_norm": 0.03591795932753835,
      "kl": 0.35400390625,
      "learning_rate": 4.038522720308732e-07,
      "loss": 0.0004,
      "reward": 1.3397042825818062,
      "reward_std": 0.27093746792525053,
      "rewards/equation_reward_func": 0.38630024157464504,
      "rewards/format_reward_func": 0.9534040652215481,
      "step": 140
    },
    {
      "completion_length": 351.4241237640381,
      "epoch": 1.417910447761194,
      "grad_norm": 0.14706137636733024,
      "kl": 0.5086669921875,
      "learning_rate": 4.009966837786194e-07,
      "loss": 0.0005,
      "reward": 1.3590960577130318,
      "reward_std": 0.2756015956401825,
      "rewards/equation_reward_func": 0.4013672061264515,
      "rewards/format_reward_func": 0.9577288404107094,
      "step": 142
    },
    {
      "completion_length": 337.3819923400879,
      "epoch": 1.4378109452736318,
      "grad_norm": 0.05428672814003729,
      "kl": 0.4088134765625,
      "learning_rate": 3.981097376494259e-07,
      "loss": 0.0004,
      "reward": 1.377092681825161,
      "reward_std": 0.2494307504966855,
      "rewards/equation_reward_func": 0.41294644586741924,
      "rewards/format_reward_func": 0.9641462452709675,
      "step": 144
    },
    {
      "completion_length": 328.61650466918945,
      "epoch": 1.4577114427860698,
      "grad_norm": 0.03880844213298933,
      "kl": 0.3502197265625,
      "learning_rate": 3.951920331829592e-07,
      "loss": 0.0004,
      "reward": 1.3715123534202576,
      "reward_std": 0.24660830944776535,
      "rewards/equation_reward_func": 0.402622789144516,
      "rewards/format_reward_func": 0.9688895456492901,
      "step": 146
    },
    {
      "completion_length": 313.04326248168945,
      "epoch": 1.4776119402985075,
      "grad_norm": 0.03817341156348879,
      "kl": 0.3505859375,
      "learning_rate": 3.922441763065506e-07,
      "loss": 0.0004,
      "reward": 1.3846261724829674,
      "reward_std": 0.23643828835338354,
      "rewards/equation_reward_func": 0.41252792440354824,
      "rewards/format_reward_func": 0.9720982536673546,
      "step": 148
    },
    {
      "completion_length": 305.9158878326416,
      "epoch": 1.4975124378109452,
      "grad_norm": 2.541937238236308,
      "kl": 2.521484375,
      "learning_rate": 3.8926677920936093e-07,
      "loss": 0.0025,
      "reward": 1.372907429933548,
      "reward_std": 0.23928395099937916,
      "rewards/equation_reward_func": 0.3985770270228386,
      "rewards/format_reward_func": 0.9743303954601288,
      "step": 150
    },
    {
      "completion_length": 298.2822437286377,
      "epoch": 1.517412935323383,
      "grad_norm": 0.03821653296868408,
      "kl": 0.3787841796875,
      "learning_rate": 3.862604602152464e-07,
      "loss": 0.0004,
      "reward": 1.373465470969677,
      "reward_std": 0.22954470664262772,
      "rewards/equation_reward_func": 0.39899555407464504,
      "rewards/format_reward_func": 0.9744699038565159,
      "step": 152
    },
| { | |
| "completion_length": 296.4987564086914, | |
| "epoch": 1.537313432835821, | |
| "grad_norm": 0.033615495245343754, | |
| "kl": 0.358154296875, | |
| "learning_rate": 3.8322584365434934e-07, | |
| "loss": 0.0004, | |
| "reward": 1.3853237107396126, | |
| "reward_std": 0.22156918980181217, | |
| "rewards/equation_reward_func": 0.4132254682481289, | |
| "rewards/format_reward_func": 0.9720982536673546, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 296.8680362701416, | |
| "epoch": 1.5572139303482588, | |
| "grad_norm": 0.0349053904041395, | |
| "kl": 0.3375244140625, | |
| "learning_rate": 3.8016355973344173e-07, | |
| "loss": 0.0003, | |
| "reward": 1.3773717433214188, | |
| "reward_std": 0.2522313380613923, | |
| "rewards/equation_reward_func": 0.41517858766019344, | |
| "rewards/format_reward_func": 0.9621931202709675, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 293.3911952972412, | |
| "epoch": 1.5771144278606966, | |
| "grad_norm": 0.0398019110181203, | |
| "kl": 0.3760986328125, | |
| "learning_rate": 3.7707424440504863e-07, | |
| "loss": 0.0004, | |
| "reward": 1.3775112181901932, | |
| "reward_std": 0.24829979613423347, | |
| "rewards/equation_reward_func": 0.4176897518336773, | |
| "rewards/format_reward_func": 0.9598214812576771, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 291.2261600494385, | |
| "epoch": 1.5970149253731343, | |
| "grad_norm": 0.06222482016183543, | |
| "kl": 0.404052734375, | |
| "learning_rate": 3.739585392353787e-07, | |
| "loss": 0.0004, | |
| "reward": 1.376395158469677, | |
| "reward_std": 0.25939000863581896, | |
| "rewards/equation_reward_func": 0.4197823852300644, | |
| "rewards/format_reward_func": 0.956612765789032, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 288.533353805542, | |
| "epoch": 1.616915422885572, | |
| "grad_norm": 0.03859495858824568, | |
| "kl": 0.3939208984375, | |
| "learning_rate": 3.7081709127108767e-07, | |
| "loss": 0.0004, | |
| "reward": 1.3839286416769028, | |
| "reward_std": 0.26788724306970835, | |
| "rewards/equation_reward_func": 0.43359377048909664, | |
| "rewards/format_reward_func": 0.9503348730504513, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 290.49262046813965, | |
| "epoch": 1.63681592039801, | |
| "grad_norm": 0.042842388091469105, | |
| "kl": 0.3592529296875, | |
| "learning_rate": 3.6765055290490513e-07, | |
| "loss": 0.0004, | |
| "reward": 1.3540737256407738, | |
| "reward_std": 0.25894952565431595, | |
| "rewards/equation_reward_func": 0.4047154225409031, | |
| "rewards/format_reward_func": 0.9493583105504513, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 295.7349452972412, | |
| "epoch": 1.6567164179104479, | |
| "grad_norm": 0.18951159402922121, | |
| "kl": 0.412109375, | |
| "learning_rate": 3.644595817401501e-07, | |
| "loss": 0.0004, | |
| "reward": 1.3404018357396126, | |
| "reward_std": 0.24847182631492615, | |
| "rewards/equation_reward_func": 0.3911830522119999, | |
| "rewards/format_reward_func": 0.9492187984287739, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 290.04396057128906, | |
| "epoch": 1.6766169154228856, | |
| "grad_norm": 0.08853983568517768, | |
| "kl": 0.3330078125, | |
| "learning_rate": 3.6124484045416483e-07, | |
| "loss": 0.0003, | |
| "reward": 1.3716518580913544, | |
| "reward_std": 0.23543819040060043, | |
| "rewards/equation_reward_func": 0.42034042067825794, | |
| "rewards/format_reward_func": 0.9513114243745804, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 283.24401092529297, | |
| "epoch": 1.6965174129353233, | |
| "grad_norm": 0.034701452611048136, | |
| "kl": 0.3319091796875, | |
| "learning_rate": 3.580069966606949e-07, | |
| "loss": 0.0003, | |
| "reward": 1.3984375819563866, | |
| "reward_std": 0.2523732325062156, | |
| "rewards/equation_reward_func": 0.44601006619632244, | |
| "rewards/format_reward_func": 0.9524275064468384, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 282.3585510253906, | |
| "epoch": 1.716417910447761, | |
| "grad_norm": 0.13731908266516477, | |
| "kl": 0.3800048828125, | |
| "learning_rate": 3.547467227712444e-07, | |
| "loss": 0.0004, | |
| "reward": 1.3766741827130318, | |
| "reward_std": 0.22029940225183964, | |
| "rewards/equation_reward_func": 0.419642873108387, | |
| "rewards/format_reward_func": 0.9570312909781933, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 282.2151336669922, | |
| "epoch": 1.736318407960199, | |
| "grad_norm": 598.5910266346846, | |
| "kl": 138.2977294921875, | |
| "learning_rate": 3.5146469585543386e-07, | |
| "loss": 0.1377, | |
| "reward": 1.3756976053118706, | |
| "reward_std": 0.22587174363434315, | |
| "rewards/equation_reward_func": 0.4218750149011612, | |
| "rewards/format_reward_func": 0.9538225904107094, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 281.84822940826416, | |
| "epoch": 1.756218905472637, | |
| "grad_norm": 0.03702511837906726, | |
| "kl": 0.306396484375, | |
| "learning_rate": 3.481615975003922e-07, | |
| "loss": 0.0003, | |
| "reward": 1.3777902573347092, | |
| "reward_std": 0.22141603752970695, | |
| "rewards/equation_reward_func": 0.42717635817825794, | |
| "rewards/format_reward_func": 0.9506138823926449, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 274.6456604003906, | |
| "epoch": 1.7761194029850746, | |
| "grad_norm": 0.041298757850907544, | |
| "kl": 0.3270263671875, | |
| "learning_rate": 3.448381136692089e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4093192666769028, | |
| "reward_std": 0.22357938904315233, | |
| "rewards/equation_reward_func": 0.45982145331799984, | |
| "rewards/format_reward_func": 0.9494978114962578, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 272.24220180511475, | |
| "epoch": 1.7960199004975124, | |
| "grad_norm": 0.030094809329937633, | |
| "kl": 0.2969970703125, | |
| "learning_rate": 3.4149493455847897e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4101563170552254, | |
| "reward_std": 0.21365638496354222, | |
| "rewards/equation_reward_func": 0.4566127471625805, | |
| "rewards/format_reward_func": 0.9535435698926449, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 269.4391813278198, | |
| "epoch": 1.81592039800995, | |
| "grad_norm": 0.04116717679127556, | |
| "kl": 0.3162841796875, | |
| "learning_rate": 3.3813275445496766e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4030413553118706, | |
| "reward_std": 0.19462497252970934, | |
| "rewards/equation_reward_func": 0.44698662869632244, | |
| "rewards/format_reward_func": 0.9560547284781933, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 263.6639394760132, | |
| "epoch": 1.835820895522388, | |
| "grad_norm": 0.02803607002004903, | |
| "kl": 0.2899169921875, | |
| "learning_rate": 3.347522715914262e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4142020717263222, | |
| "reward_std": 0.18253574147820473, | |
| "rewards/equation_reward_func": 0.45159041695296764, | |
| "rewards/format_reward_func": 0.9626116529107094, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 261.7546148300171, | |
| "epoch": 1.855721393034826, | |
| "grad_norm": 0.022829184769749253, | |
| "kl": 1.263671875, | |
| "learning_rate": 3.313541880015877e-07, | |
| "loss": 0.0013, | |
| "reward": 1.4058315306901932, | |
| "reward_std": 0.16710444958880544, | |
| "rewards/equation_reward_func": 0.4370814971625805, | |
| "rewards/format_reward_func": 0.9687500447034836, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 251.26549530029297, | |
| "epoch": 1.8756218905472637, | |
| "grad_norm": 0.02442138813005668, | |
| "kl": 0.29052734375, | |
| "learning_rate": 3.279392093743747e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4494978412985802, | |
| "reward_std": 0.1628460343927145, | |
| "rewards/equation_reward_func": 0.48060828261077404, | |
| "rewards/format_reward_func": 0.9688895530998707, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 247.94644165039062, | |
| "epoch": 1.8955223880597014, | |
| "grad_norm": 0.02237700102044354, | |
| "kl": 0.2747802734375, | |
| "learning_rate": 3.245080449073459e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4411273077130318, | |
| "reward_std": 0.15618120227009058, | |
| "rewards/equation_reward_func": 0.46456475369632244, | |
| "rewards/format_reward_func": 0.9765625409781933, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 245.5500955581665, | |
| "epoch": 1.9154228855721394, | |
| "grad_norm": 0.03720429475202997, | |
| "kl": 0.3076171875, | |
| "learning_rate": 3.210614071594162e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4341518506407738, | |
| "reward_std": 0.15263939835131168, | |
| "rewards/equation_reward_func": 0.4598214514553547, | |
| "rewards/format_reward_func": 0.9743304029107094, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 241.20550727844238, | |
| "epoch": 1.935323383084577, | |
| "grad_norm": 0.023581004318971022, | |
| "kl": 0.2822265625, | |
| "learning_rate": 3.1760001190287695e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4430804178118706, | |
| "reward_std": 0.16672389581799507, | |
| "rewards/equation_reward_func": 0.4670759178698063, | |
| "rewards/format_reward_func": 0.9760045036673546, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 244.85854816436768, | |
| "epoch": 1.955223880597015, | |
| "grad_norm": 0.024100194647961135, | |
| "kl": 0.2650146484375, | |
| "learning_rate": 3.141245779747502e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4108538627624512, | |
| "reward_std": 0.16279484936967492, | |
| "rewards/equation_reward_func": 0.43750002048909664, | |
| "rewards/format_reward_func": 0.9733538292348385, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 242.98033905029297, | |
| "epoch": 1.9751243781094527, | |
| "grad_norm": 0.21271607094657838, | |
| "kl": 0.69915771484375, | |
| "learning_rate": 3.106358271275056e-07, | |
| "loss": 0.0007, | |
| "reward": 1.4214565232396126, | |
| "reward_std": 0.15484657231718302, | |
| "rewards/equation_reward_func": 0.448521226644516, | |
| "rewards/format_reward_func": 0.9729353152215481, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 237.08036708831787, | |
| "epoch": 1.9950248756218905, | |
| "grad_norm": 0.03129852975464754, | |
| "kl": 0.3726806640625, | |
| "learning_rate": 3.0713448387917227e-07, | |
| "loss": 0.0004, | |
| "reward": 1.4450335428118706, | |
| "reward_std": 0.1625676555559039, | |
| "rewards/equation_reward_func": 0.46958707459270954, | |
| "rewards/format_reward_func": 0.9754464738070965, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 232.5022430419922, | |
| "epoch": 2.009950248756219, | |
| "grad_norm": 0.027677161674615137, | |
| "kl": 0.28857421875, | |
| "learning_rate": 3.0362127536287636e-07, | |
| "loss": 0.0002, | |
| "reward": 1.4830729762713115, | |
| "reward_std": 0.16542020750542483, | |
| "rewards/equation_reward_func": 0.5070684750874838, | |
| "rewards/format_reward_func": 0.9760045061508814, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 233.4097490310669, | |
| "epoch": 2.029850746268657, | |
| "grad_norm": 0.02661806350024166, | |
| "kl": 0.282958984375, | |
| "learning_rate": 3.0009693117583523e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4497768729925156, | |
| "reward_std": 0.16554927779361606, | |
| "rewards/equation_reward_func": 0.4751674272119999, | |
| "rewards/format_reward_func": 0.9746094234287739, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 237.789213180542, | |
| "epoch": 2.0497512437810945, | |
| "grad_norm": 0.023261075188683814, | |
| "kl": 0.2811279296875, | |
| "learning_rate": 2.965621832278401e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4232701510190964, | |
| "reward_std": 0.16319336043670774, | |
| "rewards/equation_reward_func": 0.45368305779993534, | |
| "rewards/format_reward_func": 0.9695871025323868, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 234.0941801071167, | |
| "epoch": 2.0696517412935322, | |
| "grad_norm": 0.023054853003283082, | |
| "kl": 0.2955322265625, | |
| "learning_rate": 2.9301776558925875e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4295480474829674, | |
| "reward_std": 0.16508971201255918, | |
| "rewards/equation_reward_func": 0.4607980102300644, | |
| "rewards/format_reward_func": 0.9687500447034836, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 242.64566040039062, | |
| "epoch": 2.08955223880597, | |
| "grad_norm": 0.027215305849199487, | |
| "kl": 0.28125, | |
| "learning_rate": 2.894644143385885e-07, | |
| "loss": 0.0003, | |
| "reward": 1.3913226127624512, | |
| "reward_std": 0.19783471059054136, | |
| "rewards/equation_reward_func": 0.43233819119632244, | |
| "rewards/format_reward_func": 0.9589844234287739, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 229.32353115081787, | |
| "epoch": 2.109452736318408, | |
| "grad_norm": 0.029714770863235616, | |
| "kl": 0.332763671875, | |
| "learning_rate": 2.859028674095937e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4439174607396126, | |
| "reward_std": 0.16730164922773838, | |
| "rewards/equation_reward_func": 0.47516743279993534, | |
| "rewards/format_reward_func": 0.9687500409781933, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 230.78223705291748, | |
| "epoch": 2.129353233830846, | |
| "grad_norm": 0.02758417379826148, | |
| "kl": 0.29638671875, | |
| "learning_rate": 2.823338644380566e-07, | |
| "loss": 0.0003, | |
| "reward": 1.434012345969677, | |
| "reward_std": 0.17043949477374554, | |
| "rewards/equation_reward_func": 0.4651227928698063, | |
| "rewards/format_reward_func": 0.9688895493745804, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 227.33301734924316, | |
| "epoch": 2.1492537313432836, | |
| "grad_norm": 0.02145275488584583, | |
| "kl": 0.2947998046875, | |
| "learning_rate": 2.7875814660817504e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4348493963479996, | |
| "reward_std": 0.15906028542667627, | |
| "rewards/equation_reward_func": 0.46205359138548374, | |
| "rewards/format_reward_func": 0.9727958030998707, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 226.86663818359375, | |
| "epoch": 2.1691542288557213, | |
| "grad_norm": 0.02075669716021165, | |
| "kl": 0.3297119140625, | |
| "learning_rate": 2.751764564986396e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4342913702130318, | |
| "reward_std": 0.16284326277673244, | |
| "rewards/equation_reward_func": 0.46386720798909664, | |
| "rewards/format_reward_func": 0.9704241529107094, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 228.29018878936768, | |
| "epoch": 2.189054726368159, | |
| "grad_norm": 0.024362889886761944, | |
| "kl": 0.318115234375, | |
| "learning_rate": 2.715895379284194e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4013672322034836, | |
| "reward_std": 0.14991394616663456, | |
| "rewards/equation_reward_func": 0.4255022555589676, | |
| "rewards/format_reward_func": 0.9758649952709675, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 222.1423101425171, | |
| "epoch": 2.208955223880597, | |
| "grad_norm": 0.021295359623068617, | |
| "kl": 0.2947998046875, | |
| "learning_rate": 2.6799813580229174e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4208985045552254, | |
| "reward_std": 0.14988858997821808, | |
| "rewards/equation_reward_func": 0.4457310475409031, | |
| "rewards/format_reward_func": 0.975167453289032, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 219.60059642791748, | |
| "epoch": 2.228855721393035, | |
| "grad_norm": 0.02454006684467977, | |
| "kl": 0.302001953125, | |
| "learning_rate": 2.6440299595614606e-07, | |
| "loss": 0.0003, | |
| "reward": 1.445312574505806, | |
| "reward_std": 0.15494418097659945, | |
| "rewards/equation_reward_func": 0.4681919813156128, | |
| "rewards/format_reward_func": 0.9771205671131611, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 220.79395484924316, | |
| "epoch": 2.2487562189054726, | |
| "grad_norm": 0.023780972383857492, | |
| "kl": 0.315673828125, | |
| "learning_rate": 2.6080486500209347e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4340123310685158, | |
| "reward_std": 0.14583389554172754, | |
| "rewards/equation_reward_func": 0.4557756893336773, | |
| "rewards/format_reward_func": 0.9782366454601288, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 216.97461986541748, | |
| "epoch": 2.2686567164179103, | |
| "grad_norm": 0.02408251073949643, | |
| "kl": 0.295654296875, | |
| "learning_rate": 2.572044901734166e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4261998385190964, | |
| "reward_std": 0.14129964634776115, | |
| "rewards/equation_reward_func": 0.4465680979192257, | |
| "rewards/format_reward_func": 0.9796317405998707, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 222.06753253936768, | |
| "epoch": 2.288557213930348, | |
| "grad_norm": 0.02085106325268825, | |
| "kl": 0.294921875, | |
| "learning_rate": 2.536026191693893e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4008092135190964, | |
| "reward_std": 0.13534251041710377, | |
| "rewards/equation_reward_func": 0.42173551209270954, | |
| "rewards/format_reward_func": 0.9790737070143223, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 216.5224723815918, | |
| "epoch": 2.308457711442786, | |
| "grad_norm": 0.02328394313813855, | |
| "kl": 0.3106689453125, | |
| "learning_rate": 2.5e-07, | |
| "loss": 0.0003, | |
| "reward": 1.433035783469677, | |
| "reward_std": 0.14889662712812424, | |
| "rewards/equation_reward_func": 0.4554966762661934, | |
| "rewards/format_reward_func": 0.9775391034781933, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 221.65863132476807, | |
| "epoch": 2.328358208955224, | |
| "grad_norm": 0.018660636322713853, | |
| "kl": 0.307373046875, | |
| "learning_rate": 2.4639738083061073e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4042969346046448, | |
| "reward_std": 0.1360318623483181, | |
| "rewards/equation_reward_func": 0.423549123108387, | |
| "rewards/format_reward_func": 0.9807478152215481, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 214.48675537109375, | |
| "epoch": 2.3482587064676617, | |
| "grad_norm": 0.021713695385161994, | |
| "kl": 0.2969970703125, | |
| "learning_rate": 2.4279550982658345e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4568918198347092, | |
| "reward_std": 0.12715721363201737, | |
| "rewards/equation_reward_func": 0.4750279225409031, | |
| "rewards/format_reward_func": 0.9818638823926449, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 215.53753566741943, | |
| "epoch": 2.3681592039800994, | |
| "grad_norm": 0.023953440696633212, | |
| "kl": 0.287841796875, | |
| "learning_rate": 2.3919513499790646e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4486607909202576, | |
| "reward_std": 0.14019544329494238, | |
| "rewards/equation_reward_func": 0.46693640388548374, | |
| "rewards/format_reward_func": 0.9817243777215481, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 213.56808948516846, | |
| "epoch": 2.388059701492537, | |
| "grad_norm": 0.019891387969057997, | |
| "kl": 0.2940673828125, | |
| "learning_rate": 2.3559700404385394e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4373605623841286, | |
| "reward_std": 0.13764300849288702, | |
| "rewards/equation_reward_func": 0.456333726644516, | |
| "rewards/format_reward_func": 0.981026828289032, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 219.03098011016846, | |
| "epoch": 2.4079601990049753, | |
| "grad_norm": 0.018225840764500994, | |
| "kl": 0.301513671875, | |
| "learning_rate": 2.3200186419770823e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4228516295552254, | |
| "reward_std": 0.13813555100932717, | |
| "rewards/equation_reward_func": 0.444614976644516, | |
| "rewards/format_reward_func": 0.9782366491854191, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 209.34473514556885, | |
| "epoch": 2.427860696517413, | |
| "grad_norm": 0.024367276227975687, | |
| "kl": 0.311767578125, | |
| "learning_rate": 2.284104620715807e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4732143729925156, | |
| "reward_std": 0.14336633821949363, | |
| "rewards/equation_reward_func": 0.4926060512661934, | |
| "rewards/format_reward_func": 0.9806082993745804, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 219.85129356384277, | |
| "epoch": 2.4477611940298507, | |
| "grad_norm": 0.038111490848984936, | |
| "kl": 0.3367919921875, | |
| "learning_rate": 2.2482354350136043e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4001116678118706, | |
| "reward_std": 0.1531707807444036, | |
| "rewards/equation_reward_func": 0.4235491268336773, | |
| "rewards/format_reward_func": 0.9765625409781933, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 214.16170120239258, | |
| "epoch": 2.4676616915422884, | |
| "grad_norm": 0.025346719627314943, | |
| "kl": 0.3148193359375, | |
| "learning_rate": 2.2124185339182496e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4430804327130318, | |
| "reward_std": 0.15375623805448413, | |
| "rewards/equation_reward_func": 0.46554131619632244, | |
| "rewards/format_reward_func": 0.9775391034781933, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 214.68039321899414, | |
| "epoch": 2.487562189054726, | |
| "grad_norm": 0.0233724860224165, | |
| "kl": 0.8677978515625, | |
| "learning_rate": 2.1766613556194344e-07, | |
| "loss": 0.0009, | |
| "reward": 1.4355469420552254, | |
| "reward_std": 0.1511770309880376, | |
| "rewards/equation_reward_func": 0.4602399803698063, | |
| "rewards/format_reward_func": 0.9753069654107094, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 219.32129764556885, | |
| "epoch": 2.5074626865671643, | |
| "grad_norm": 0.023037103314753518, | |
| "kl": 0.314453125, | |
| "learning_rate": 2.1409713259040628e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4236886724829674, | |
| "reward_std": 0.13908220268785954, | |
| "rewards/equation_reward_func": 0.4494977854192257, | |
| "rewards/format_reward_func": 0.9741908833384514, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 216.81683444976807, | |
| "epoch": 2.527363184079602, | |
| "grad_norm": 0.02837351620993175, | |
| "kl": 0.3062744140625, | |
| "learning_rate": 2.105355856614115e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4143415838479996, | |
| "reward_std": 0.13123074593022466, | |
| "rewards/equation_reward_func": 0.4380580559372902, | |
| "rewards/format_reward_func": 0.9762835316359997, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 215.65974712371826, | |
| "epoch": 2.5472636815920398, | |
| "grad_norm": 0.02558897042063982, | |
| "kl": 0.3006591796875, | |
| "learning_rate": 2.069822344107413e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4338728412985802, | |
| "reward_std": 0.15900959819555283, | |
| "rewards/equation_reward_func": 0.4620535969734192, | |
| "rewards/format_reward_func": 0.9718192331492901, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 212.77707386016846, | |
| "epoch": 2.5671641791044775, | |
| "grad_norm": 0.024897773706899457, | |
| "kl": 0.3153076171875, | |
| "learning_rate": 2.034378167721599e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4379185885190964, | |
| "reward_std": 0.13706076564267278, | |
| "rewards/equation_reward_func": 0.4582868479192257, | |
| "rewards/format_reward_func": 0.9796317368745804, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 212.57813358306885, | |
| "epoch": 2.587064676616915, | |
| "grad_norm": 0.02082510904169789, | |
| "kl": 0.3096923828125, | |
| "learning_rate": 1.9990306882416485e-07, | |
| "loss": 0.0003, | |
| "reward": 1.436523512005806, | |
| "reward_std": 0.14973098738119006, | |
| "rewards/equation_reward_func": 0.4598214514553547, | |
| "rewards/format_reward_func": 0.9767020530998707, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 214.98591995239258, | |
| "epoch": 2.6069651741293534, | |
| "grad_norm": 0.026284363430018526, | |
| "kl": 0.30908203125, | |
| "learning_rate": 1.9637872463712362e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4447545409202576, | |
| "reward_std": 0.15666912542656064, | |
| "rewards/equation_reward_func": 0.4736328311264515, | |
| "rewards/format_reward_func": 0.9711216986179352, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 214.83343505859375, | |
| "epoch": 2.626865671641791, | |
| "grad_norm": 0.027482705612603058, | |
| "kl": 0.3055419921875, | |
| "learning_rate": 1.9286551612082773e-07, | |
| "loss": 0.0003, | |
| "reward": 1.425223283469677, | |
| "reward_std": 0.14923690911382437, | |
| "rewards/equation_reward_func": 0.450474351644516, | |
| "rewards/format_reward_func": 0.9747489206492901, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 215.618314743042, | |
| "epoch": 2.646766169154229, | |
| "grad_norm": 0.020891536883991365, | |
| "kl": 0.304931640625, | |
| "learning_rate": 1.8936417287249446e-07, | |
| "loss": 0.0003, | |
| "reward": 1.428711012005806, | |
| "reward_std": 0.1552719115279615, | |
| "rewards/equation_reward_func": 0.4545201100409031, | |
| "rewards/format_reward_func": 0.9741908945143223, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 213.43457889556885, | |
| "epoch": 2.6666666666666665, | |
| "grad_norm": 0.024835156963306366, | |
| "kl": 0.302001953125, | |
| "learning_rate": 1.8587542202524985e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4349889159202576, | |
| "reward_std": 0.1448821472004056, | |
| "rewards/equation_reward_func": 0.4573102928698063, | |
| "rewards/format_reward_func": 0.977678619325161, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 215.88770580291748, | |
| "epoch": 2.6865671641791042, | |
| "grad_norm": 0.03406652027066123, | |
| "kl": 0.37060546875, | |
| "learning_rate": 1.82399988097123e-07, | |
| "loss": 0.0004, | |
| "reward": 1.435965470969677, | |
| "reward_std": 0.16567487781867385, | |
| "rewards/equation_reward_func": 0.46554132364690304, | |
| "rewards/format_reward_func": 0.9704241491854191, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 218.51075267791748, | |
| "epoch": 2.7064676616915424, | |
| "grad_norm": 0.02189495968275397, | |
| "kl": 0.30078125, | |
| "learning_rate": 1.7893859284058378e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4058315604925156, | |
| "reward_std": 0.16543139517307281, | |
| "rewards/equation_reward_func": 0.43568640761077404, | |
| "rewards/format_reward_func": 0.9701451323926449, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 216.45620346069336, | |
| "epoch": 2.72636815920398, | |
| "grad_norm": 0.03134224538044876, | |
| "kl": 0.3323974609375, | |
| "learning_rate": 1.7549195509265407e-07, | |
| "loss": 0.0003, | |
| "reward": 1.43038509786129, | |
| "reward_std": 0.1662453394383192, | |
| "rewards/equation_reward_func": 0.45982144959270954, | |
| "rewards/format_reward_func": 0.9705636538565159, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 216.5844144821167, | |
| "epoch": 2.746268656716418, | |
| "grad_norm": 0.020880032290923414, | |
| "kl": 0.297119140625, | |
| "learning_rate": 1.7206079062562536e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4130859971046448, | |
| "reward_std": 0.15265868697315454, | |
| "rewards/equation_reward_func": 0.4409877434372902, | |
| "rewards/format_reward_func": 0.9720982573926449, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 219.52247047424316, | |
| "epoch": 2.7661691542288556, | |
| "grad_norm": 0.02562179332680793, | |
| "kl": 0.315673828125, | |
| "learning_rate": 1.6864581199841226e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4268973767757416, | |
| "reward_std": 0.173020682297647, | |
| "rewards/equation_reward_func": 0.4589843973517418, | |
| "rewards/format_reward_func": 0.9679129905998707, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 213.5055913925171, | |
| "epoch": 2.7860696517412933, | |
| "grad_norm": 0.027397671041646216, | |
| "kl": 0.3511962890625, | |
| "learning_rate": 1.6524772840857388e-07, | |
| "loss": 0.0004, | |
| "reward": 1.4381976276636124, | |
| "reward_std": 0.1577630965039134, | |
| "rewards/equation_reward_func": 0.4667968973517418, | |
| "rewards/format_reward_func": 0.9714007154107094, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 213.3683156967163, | |
| "epoch": 2.8059701492537314, | |
| "grad_norm": 0.023034788220825733, | |
| "kl": 0.3841552734375, | |
| "learning_rate": 1.6186724554503237e-07, | |
| "loss": 0.0004, | |
| "reward": 1.4397322088479996, | |
| "reward_std": 0.15021836338564754, | |
| "rewards/equation_reward_func": 0.46944756619632244, | |
| "rewards/format_reward_func": 0.970284640789032, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 212.66574478149414, | |
| "epoch": 2.825870646766169, | |
| "grad_norm": 0.10252794447546824, | |
| "kl": 1.5787353515625, | |
| "learning_rate": 1.5850506544152103e-07, | |
| "loss": 0.0016, | |
| "reward": 1.4352679252624512, | |
| "reward_std": 0.15777465933933854, | |
| "rewards/equation_reward_func": 0.462611623108387, | |
| "rewards/format_reward_func": 0.9726562947034836, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 215.74861431121826, | |
| "epoch": 2.845771144278607, | |
| "grad_norm": 0.0331122698510056, | |
| "kl": 0.3597412109375, | |
| "learning_rate": 1.5516188633079107e-07, | |
| "loss": 0.0004, | |
| "reward": 1.4305246248841286, | |
| "reward_std": 0.1583145372569561, | |
| "rewards/equation_reward_func": 0.45703127048909664, | |
| "rewards/format_reward_func": 0.9734933450818062, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 215.67969703674316, | |
| "epoch": 2.8656716417910446, | |
| "grad_norm": 0.02477198146500147, | |
| "kl": 0.3238525390625, | |
| "learning_rate": 1.5183840249960784e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4245257377624512, | |
| "reward_std": 0.16665043262764812, | |
| "rewards/equation_reward_func": 0.4508928805589676, | |
| "rewards/format_reward_func": 0.9736328534781933, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 218.7661943435669, | |
| "epoch": 2.8855721393034823, | |
| "grad_norm": 0.023329206966385622, | |
| "kl": 0.314697265625, | |
| "learning_rate": 1.4853530414456612e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4027623310685158, | |
| "reward_std": 0.16430522268638015, | |
| "rewards/equation_reward_func": 0.4331752434372902, | |
| "rewards/format_reward_func": 0.9695870988070965, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 218.44029998779297, | |
| "epoch": 2.9054726368159205, | |
| "grad_norm": 0.0245063774390441, | |
| "kl": 0.31298828125, | |
| "learning_rate": 1.4525327722875568e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4175502881407738, | |
| "reward_std": 0.17077632527798414, | |
| "rewards/equation_reward_func": 0.44963729567825794, | |
| "rewards/format_reward_func": 0.9679129831492901, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 214.96513175964355, | |
| "epoch": 2.925373134328358, | |
| "grad_norm": 0.02034377928215922, | |
| "kl": 0.330322265625, | |
| "learning_rate": 1.4199300333930515e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4123884439468384, | |
| "reward_std": 0.16281927097588778, | |
| "rewards/equation_reward_func": 0.44112725742161274, | |
| "rewards/format_reward_func": 0.971261203289032, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 213.34794330596924, | |
| "epoch": 2.945273631840796, | |
| "grad_norm": 0.02782572344275969, | |
| "kl": 0.327880859375, | |
| "learning_rate": 1.3875515954583523e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4298270866274834, | |
| "reward_std": 0.16148702334612608, | |
| "rewards/equation_reward_func": 0.45493863336741924, | |
| "rewards/format_reward_func": 0.9748884364962578, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 215.48131465911865, | |
| "epoch": 2.965174129353234, | |
| "grad_norm": 0.024323572341590376, | |
| "kl": 0.302734375, | |
| "learning_rate": 1.3554041825985e-07, | |
| "loss": 0.0003, | |
| "reward": 1.442382887005806, | |
| "reward_std": 0.16833833791315556, | |
| "rewards/equation_reward_func": 0.47377234511077404, | |
| "rewards/format_reward_func": 0.9686105288565159, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 212.95969200134277, | |
| "epoch": 2.9850746268656714, | |
| "grad_norm": 0.0273572585315816, | |
| "kl": 0.30029296875, | |
| "learning_rate": 1.323494470950949e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4288505017757416, | |
| "reward_std": 0.16034137457609177, | |
| "rewards/equation_reward_func": 0.4522879645228386, | |
| "rewards/format_reward_func": 0.9765625409781933, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 211.95704142252603, | |
| "epoch": 3.0, | |
| "grad_norm": 0.017192487768592887, | |
| "kl": 0.3084309895833333, | |
| "learning_rate": 1.2918290872891236e-07, | |
| "loss": 0.0002, | |
| "reward": 1.4287575085957844, | |
| "reward_std": 0.16905243881046772, | |
| "rewards/equation_reward_func": 0.4549851392706235, | |
| "rewards/format_reward_func": 0.9737723668416342, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 217.9089117050171, | |
| "epoch": 3.0199004975124377, | |
| "grad_norm": 0.02862549430280998, | |
| "kl": 0.6329345703125, | |
| "learning_rate": 1.260414607646213e-07, | |
| "loss": 0.0006, | |
| "reward": 1.4066686034202576, | |
| "reward_std": 0.1626745001412928, | |
| "rewards/equation_reward_func": 0.4342913143336773, | |
| "rewards/format_reward_func": 0.9723772667348385, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 215.75042915344238, | |
| "epoch": 3.0398009950248754, | |
| "grad_norm": 0.021323828874330892, | |
| "kl": 0.30419921875, | |
| "learning_rate": 1.2292575559495143e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4181083217263222, | |
| "reward_std": 0.16414937423542142, | |
| "rewards/equation_reward_func": 0.44614957459270954, | |
| "rewards/format_reward_func": 0.9719587489962578, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 214.40388679504395, | |
| "epoch": 3.0597014925373136, | |
| "grad_norm": 0.025008836515952163, | |
| "kl": 0.3052978515625, | |
| "learning_rate": 1.1983644026655835e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4190848916769028, | |
| "reward_std": 0.16639473848044872, | |
| "rewards/equation_reward_func": 0.4468471184372902, | |
| "rewards/format_reward_func": 0.972237765789032, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 213.76089000701904, | |
| "epoch": 3.0796019900497513, | |
| "grad_norm": 0.02381599230611037, | |
| "kl": 0.3135986328125, | |
| "learning_rate": 1.1677415634565066e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4115513935685158, | |
| "reward_std": 0.16106756404042244, | |
| "rewards/equation_reward_func": 0.43875559978187084, | |
| "rewards/format_reward_func": 0.9727957993745804, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 218.70118045806885, | |
| "epoch": 3.099502487562189, | |
| "grad_norm": 0.029377287943715084, | |
| "kl": 0.341552734375, | |
| "learning_rate": 1.1373953978475353e-07, | |
| "loss": 0.0003, | |
| "reward": 1.3869978338479996, | |
| "reward_std": 0.17045084619894624, | |
| "rewards/equation_reward_func": 0.41643417067825794, | |
| "rewards/format_reward_func": 0.9705636575818062, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 214.29200172424316, | |
| "epoch": 3.1194029850746268, | |
| "grad_norm": 0.023607641624877624, | |
| "kl": 0.307861328125, | |
| "learning_rate": 1.1073322079063913e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4089007377624512, | |
| "reward_std": 0.15849500941112638, | |
| "rewards/equation_reward_func": 0.436802476644516, | |
| "rewards/format_reward_func": 0.9720982536673546, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 212.78962993621826, | |
| "epoch": 3.1393034825870645, | |
| "grad_norm": 0.021914197029448714, | |
| "kl": 0.308349609375, | |
| "learning_rate": 1.0775582369344946e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4239677116274834, | |
| "reward_std": 0.16578855132684112, | |
| "rewards/equation_reward_func": 0.45256698690354824, | |
| "rewards/format_reward_func": 0.9714007154107094, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 212.96471405029297, | |
| "epoch": 3.1592039800995027, | |
| "grad_norm": 0.023604792789655316, | |
| "kl": 0.3155517578125, | |
| "learning_rate": 1.0480796681704077e-07, | |
| "loss": 0.0003, | |
| "reward": 1.426060326397419, | |
| "reward_std": 0.16330989450216293, | |
| "rewards/equation_reward_func": 0.45312502048909664, | |
| "rewards/format_reward_func": 0.9729353040456772, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 214.71987438201904, | |
| "epoch": 3.1791044776119404, | |
| "grad_norm": 0.022973798133877794, | |
| "kl": 0.3466796875, | |
| "learning_rate": 1.018902623505741e-07, | |
| "loss": 0.0003, | |
| "reward": 1.4179688170552254, | |
| "reward_std": 0.16605077125132084, | |
| "rewards/equation_reward_func": 0.44782368279993534, | |
| "rewards/format_reward_func": 0.9701451286673546, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 209.85436058044434, | |
| "epoch": 3.199004975124378, | |
| "grad_norm": 0.02360413155397944, | |
| "kl": 0.3021240234375, | |
| "learning_rate": 9.900331622138063e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4429409205913544, | |
| "reward_std": 0.17034625075757504, | |
| "rewards/equation_reward_func": 0.4707031436264515, | |
| "rewards/format_reward_func": 0.9722377583384514, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 209.76968097686768, | |
| "epoch": 3.218905472636816, | |
| "grad_norm": 0.02508581662859248, | |
| "kl": 0.3162841796875, | |
| "learning_rate": 9.614772796912681e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4415458291769028, | |
| "reward_std": 0.16658680979162455, | |
| "rewards/equation_reward_func": 0.4718192256987095, | |
| "rewards/format_reward_func": 0.969726599752903, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 210.76744651794434, | |
| "epoch": 3.2388059701492535, | |
| "grad_norm": 0.02019798707225102, | |
| "kl": 0.3167724609375, | |
| "learning_rate": 9.332409062130686e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4317802116274834, | |
| "reward_std": 0.1603726176545024, | |
| "rewards/equation_reward_func": 0.4620535895228386, | |
| "rewards/format_reward_func": 0.9697266146540642, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 213.83831787109375, | |
| "epoch": 3.2587064676616917, | |
| "grad_norm": 0.028338174756536712, | |
| "kl": 0.34521484375, | |
| "learning_rate": 9.053299057008699e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4115514159202576, | |
| "reward_std": 0.1632266715168953, | |
| "rewards/equation_reward_func": 0.4440569430589676, | |
| "rewards/format_reward_func": 0.9674944616854191, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 210.93960094451904, | |
| "epoch": 3.2786069651741294, | |
| "grad_norm": 0.02129428174006055, | |
| "kl": 0.337646484375, | |
| "learning_rate": 8.777500745052743e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4172712713479996, | |
| "reward_std": 0.1612872895784676, | |
| "rewards/equation_reward_func": 0.44614957459270954, | |
| "rewards/format_reward_func": 0.9711216948926449, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 206.76423931121826, | |
| "epoch": 3.298507462686567, | |
| "grad_norm": 0.020973224442213136, | |
| "kl": 0.3349609375, | |
| "learning_rate": 8.505071402020892e-08, | |
| "loss": 0.0003, | |
| "reward": 1.438895158469677, | |
| "reward_std": 0.1493242043070495, | |
| "rewards/equation_reward_func": 0.46819198690354824, | |
| "rewards/format_reward_func": 0.9707031697034836, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 209.10282707214355, | |
| "epoch": 3.318407960199005, | |
| "grad_norm": 0.024242183613499557, | |
| "kl": 0.3062744140625, | |
| "learning_rate": 8.236067604028562e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4136440306901932, | |
| "reward_std": 0.1428091856651008, | |
| "rewards/equation_reward_func": 0.4411272518336773, | |
| "rewards/format_reward_func": 0.9725167863070965, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 209.48996448516846, | |
| "epoch": 3.3383084577114426, | |
| "grad_norm": 0.025886496661760183, | |
| "kl": 0.3170166015625, | |
| "learning_rate": 7.970545215799327e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4109933599829674, | |
| "reward_std": 0.15012150490656495, | |
| "rewards/equation_reward_func": 0.43498886190354824, | |
| "rewards/format_reward_func": 0.9760045148432255, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 210.97322273254395, | |
| "epoch": 3.3582089552238807, | |
| "grad_norm": 0.035166113391626445, | |
| "kl": 0.3455810546875, | |
| "learning_rate": 7.708559379063204e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4250837713479996, | |
| "reward_std": 0.15619614347815514, | |
| "rewards/equation_reward_func": 0.4535435438156128, | |
| "rewards/format_reward_func": 0.9715402275323868, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 210.39258575439453, | |
| "epoch": 3.3781094527363185, | |
| "grad_norm": 0.024233079455878367, | |
| "kl": 0.318603515625, | |
| "learning_rate": 7.45016450110534e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4052734896540642, | |
| "reward_std": 0.13724439358338714, | |
| "rewards/equation_reward_func": 0.42926899529993534, | |
| "rewards/format_reward_func": 0.9760045148432255, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 207.5920867919922, | |
| "epoch": 3.398009950248756, | |
| "grad_norm": 0.023190526466859332, | |
| "kl": 0.3145751953125, | |
| "learning_rate": 7.195414243467029e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4185268580913544, | |
| "reward_std": 0.14769554091617465, | |
| "rewards/equation_reward_func": 0.44196430779993534, | |
| "rewards/format_reward_func": 0.9765625409781933, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 208.33301734924316, | |
| "epoch": 3.417910447761194, | |
| "grad_norm": 0.020556956452052143, | |
| "kl": 0.3096923828125, | |
| "learning_rate": 6.944361510801763e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4354074373841286, | |
| "reward_std": 0.14919495349749923, | |
| "rewards/equation_reward_func": 0.4605189897119999, | |
| "rewards/format_reward_func": 0.9748884327709675, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 210.11733627319336, | |
| "epoch": 3.4378109452736316, | |
| "grad_norm": 0.02548976703976393, | |
| "kl": 0.303466796875, | |
| "learning_rate": 6.697058439888283e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4231306537985802, | |
| "reward_std": 0.1531607019715011, | |
| "rewards/equation_reward_func": 0.4482422098517418, | |
| "rewards/format_reward_func": 0.9748884364962578, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 206.89621353149414, | |
| "epoch": 3.45771144278607, | |
| "grad_norm": 0.021912532031084852, | |
| "kl": 0.3536376953125, | |
| "learning_rate": 6.453556388803288e-08, | |
| "loss": 0.0004, | |
| "reward": 1.4104353487491608, | |
| "reward_std": 0.1417902335524559, | |
| "rewards/equation_reward_func": 0.4347098395228386, | |
| "rewards/format_reward_func": 0.975725494325161, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 208.74094104766846, | |
| "epoch": 3.4776119402985075, | |
| "grad_norm": 0.021460533422209858, | |
| "kl": 0.3043212890625, | |
| "learning_rate": 6.213905926255697e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4176898002624512, | |
| "reward_std": 0.15168333146721125, | |
| "rewards/equation_reward_func": 0.443219892680645, | |
| "rewards/format_reward_func": 0.9744699038565159, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 211.02735328674316, | |
| "epoch": 3.4975124378109452, | |
| "grad_norm": 0.02394287843574342, | |
| "kl": 0.302001953125, | |
| "learning_rate": 5.978156821084987e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4225726202130318, | |
| "reward_std": 0.15021274331957102, | |
| "rewards/equation_reward_func": 0.4468471221625805, | |
| "rewards/format_reward_func": 0.9757254905998707, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 211.63505458831787, | |
| "epoch": 3.517412935323383, | |
| "grad_norm": 0.024298287233372077, | |
| "kl": 0.2999267578125, | |
| "learning_rate": 5.7463580319254853e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4143415838479996, | |
| "reward_std": 0.15382455056533217, | |
| "rewards/equation_reward_func": 0.43945314548909664, | |
| "rewards/format_reward_func": 0.9748884327709675, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 211.22475719451904, | |
| "epoch": 3.5373134328358207, | |
| "grad_norm": 0.022388645065280263, | |
| "kl": 0.31103515625, | |
| "learning_rate": 5.518557697039081e-08, | |
| "loss": 0.0003, | |
| "reward": 1.412527970969677, | |
| "reward_std": 0.15113574685528874, | |
| "rewards/equation_reward_func": 0.44070872850716114, | |
| "rewards/format_reward_func": 0.9718192368745804, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 214.04339790344238, | |
| "epoch": 3.557213930348259, | |
| "grad_norm": 0.0209017212620795, | |
| "kl": 0.320068359375, | |
| "learning_rate": 5.294803124318145e-08, | |
| "loss": 0.0003, | |
| "reward": 1.3909040838479996, | |
| "reward_std": 0.14382937783375382, | |
| "rewards/equation_reward_func": 0.4151785895228386, | |
| "rewards/format_reward_func": 0.9757254868745804, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 208.87654304504395, | |
| "epoch": 3.5771144278606966, | |
| "grad_norm": 0.023243470823934638, | |
| "kl": 0.30078125, | |
| "learning_rate": 5.07514078146106e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4390346631407738, | |
| "reward_std": 0.14443567395210266, | |
| "rewards/equation_reward_func": 0.46275113709270954, | |
| "rewards/format_reward_func": 0.9762835167348385, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 211.29144477844238, | |
| "epoch": 3.5970149253731343, | |
| "grad_norm": 0.019400841496720493, | |
| "kl": 0.2943115234375, | |
| "learning_rate": 4.859616286322094e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4024833366274834, | |
| "reward_std": 0.13700629398226738, | |
| "rewards/equation_reward_func": 0.4236886352300644, | |
| "rewards/format_reward_func": 0.9787946864962578, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 207.67523288726807, | |
| "epoch": 3.616915422885572, | |
| "grad_norm": 0.0267143223948108, | |
| "kl": 0.302734375, | |
| "learning_rate": 4.648274397437829e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4414063021540642, | |
| "reward_std": 0.1584903709590435, | |
| "rewards/equation_reward_func": 0.4638672098517418, | |
| "rewards/format_reward_func": 0.9775391109287739, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 208.99275493621826, | |
| "epoch": 3.6368159203980097, | |
| "grad_norm": 0.02236073210562904, | |
| "kl": 0.3035888671875, | |
| "learning_rate": 4.4411590047320617e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4196429327130318, | |
| "reward_std": 0.1565262214280665, | |
| "rewards/equation_reward_func": 0.44168529100716114, | |
| "rewards/format_reward_func": 0.9779576323926449, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 206.0256805419922, | |
| "epoch": 3.656716417910448, | |
| "grad_norm": 0.021682830704685987, | |
| "kl": 0.3154296875, | |
| "learning_rate": 4.2383131204010494e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4379185885190964, | |
| "reward_std": 0.14357166644185781, | |
| "rewards/equation_reward_func": 0.45954243279993534, | |
| "rewards/format_reward_func": 0.9783761575818062, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 208.63812351226807, | |
| "epoch": 3.6766169154228856, | |
| "grad_norm": 0.023364716402540755, | |
| "kl": 0.33544921875, | |
| "learning_rate": 4.039778869981064e-08, | |
| "loss": 0.0003, | |
| "reward": 1.425083763897419, | |
| "reward_std": 0.1421098834834993, | |
| "rewards/equation_reward_func": 0.44796319119632244, | |
| "rewards/format_reward_func": 0.9771205820143223, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 209.36608028411865, | |
| "epoch": 3.6965174129353233, | |
| "grad_norm": 0.023786600302629118, | |
| "kl": 0.2933349609375, | |
| "learning_rate": 3.845597483600049e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4204799756407738, | |
| "reward_std": 0.13766240561380982, | |
| "rewards/equation_reward_func": 0.44349891133606434, | |
| "rewards/format_reward_func": 0.9769810698926449, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 209.27972507476807, | |
| "epoch": 3.716417910447761, | |
| "grad_norm": 0.021757632159690704, | |
| "kl": 0.3203125, | |
| "learning_rate": 3.655809287415284e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4183873385190964, | |
| "reward_std": 0.1481694160029292, | |
| "rewards/equation_reward_func": 0.44559154473245144, | |
| "rewards/format_reward_func": 0.9727957993745804, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 207.45955181121826, | |
| "epoch": 3.7363184079601988, | |
| "grad_norm": 0.05088374036220301, | |
| "kl": 0.3743896484375, | |
| "learning_rate": 3.4704536952387285e-08, | |
| "loss": 0.0004, | |
| "reward": 1.4144810885190964, | |
| "reward_std": 0.14615898905321956, | |
| "rewards/equation_reward_func": 0.43624444119632244, | |
| "rewards/format_reward_func": 0.9782366417348385, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 207.17174530029297, | |
| "epoch": 3.756218905472637, | |
| "grad_norm": 0.037351085461750734, | |
| "kl": 0.3173828125, | |
| "learning_rate": 3.2895692003518575e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4362444803118706, | |
| "reward_std": 0.15589907299727201, | |
| "rewards/equation_reward_func": 0.4595424309372902, | |
| "rewards/format_reward_func": 0.9767020530998707, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 206.81167221069336, | |
| "epoch": 3.7761194029850746, | |
| "grad_norm": 0.02300045958202165, | |
| "kl": 0.306640625, | |
| "learning_rate": 3.113193367511635e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4362444803118706, | |
| "reward_std": 0.14778407430276275, | |
| "rewards/equation_reward_func": 0.4605189934372902, | |
| "rewards/format_reward_func": 0.975725494325161, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 205.91574573516846, | |
| "epoch": 3.7960199004975124, | |
| "grad_norm": 0.02131900977981381, | |
| "kl": 0.3189697265625, | |
| "learning_rate": 2.9413628251493934e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4458706006407738, | |
| "reward_std": 0.13890358246862888, | |
| "rewards/equation_reward_func": 0.4698660857975483, | |
| "rewards/format_reward_func": 0.9760045073926449, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 205.91086196899414, | |
| "epoch": 3.81592039800995, | |
| "grad_norm": 0.023242413775472502, | |
| "kl": 0.32373046875, | |
| "learning_rate": 2.774113257764066e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4434989541769028, | |
| "reward_std": 0.15019797440618277, | |
| "rewards/equation_reward_func": 0.46568082086741924, | |
| "rewards/format_reward_func": 0.9778181202709675, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 205.25921821594238, | |
| "epoch": 3.835820895522388, | |
| "grad_norm": 0.023329191763327563, | |
| "kl": 0.3209228515625, | |
| "learning_rate": 2.611479398511518e-08, | |
| "loss": 0.0003, | |
| "reward": 1.435825951397419, | |
| "reward_std": 0.15067564183846116, | |
| "rewards/equation_reward_func": 0.46135604567825794, | |
| "rewards/format_reward_func": 0.9744699075818062, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 211.78683948516846, | |
| "epoch": 3.855721393034826, | |
| "grad_norm": 0.02259400263789087, | |
| "kl": 0.31689453125, | |
| "learning_rate": 2.4534950219914057e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4026228412985802, | |
| "reward_std": 0.16164674330502748, | |
| "rewards/equation_reward_func": 0.4285714477300644, | |
| "rewards/format_reward_func": 0.9740513861179352, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 209.08399200439453, | |
| "epoch": 3.8756218905472637, | |
| "grad_norm": 0.02478747899164941, | |
| "kl": 0.3443603515625, | |
| "learning_rate": 2.300192937233128e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4481027498841286, | |
| "reward_std": 0.16136478912085295, | |
| "rewards/equation_reward_func": 0.47293529473245144, | |
| "rewards/format_reward_func": 0.9751674495637417, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 208.59822463989258, | |
| "epoch": 3.8955223880597014, | |
| "grad_norm": 0.021101252528902322, | |
| "kl": 0.3262939453125, | |
| "learning_rate": 2.1516049808822935e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4168527349829674, | |
| "reward_std": 0.14211608842015266, | |
| "rewards/equation_reward_func": 0.43750002048909664, | |
| "rewards/format_reward_func": 0.9793527238070965, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 207.56948566436768, | |
| "epoch": 3.9154228855721396, | |
| "grad_norm": 0.020157322294634124, | |
| "kl": 0.3306884765625, | |
| "learning_rate": 2.007762010589098e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4323382377624512, | |
| "reward_std": 0.15005581732839346, | |
| "rewards/equation_reward_func": 0.4594029225409031, | |
| "rewards/format_reward_func": 0.9729353077709675, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 204.75851917266846, | |
| "epoch": 3.935323383084577, | |
| "grad_norm": 0.02606741259329065, | |
| "kl": 0.319580078125, | |
| "learning_rate": 1.8686938986000627e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4275949224829674, | |
| "reward_std": 0.14978661900386214, | |
| "rewards/equation_reward_func": 0.4508928805589676, | |
| "rewards/format_reward_func": 0.9767020530998707, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 215.83901405334473, | |
| "epoch": 3.955223880597015, | |
| "grad_norm": 0.022436135999161815, | |
| "kl": 0.3084716796875, | |
| "learning_rate": 1.734429525554365e-08, | |
| "loss": 0.0003, | |
| "reward": 1.380301408469677, | |
| "reward_std": 0.15571930957958102, | |
| "rewards/equation_reward_func": 0.41057479567825794, | |
| "rewards/format_reward_func": 0.9697266109287739, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 203.08301734924316, | |
| "epoch": 3.9751243781094527, | |
| "grad_norm": 0.025891475256508395, | |
| "kl": 0.939208984375, | |
| "learning_rate": 1.604996774486145e-08, | |
| "loss": 0.0009, | |
| "reward": 1.4464286342263222, | |
| "reward_std": 0.14958901097998023, | |
| "rewards/equation_reward_func": 0.4700055941939354, | |
| "rewards/format_reward_func": 0.9764230325818062, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 209.05107021331787, | |
| "epoch": 3.9950248756218905, | |
| "grad_norm": 0.022997479014902993, | |
| "kl": 0.303466796875, | |
| "learning_rate": 1.4804225250339281e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4299665838479996, | |
| "reward_std": 0.15680306032299995, | |
| "rewards/equation_reward_func": 0.45619421638548374, | |
| "rewards/format_reward_func": 0.973772369325161, | |
| "step": 402 | |
| }, | |
| { | |
| "completion_length": 211.78218841552734, | |
| "epoch": 4.009950248756219, | |
| "grad_norm": 0.020830488980119865, | |
| "kl": 0.7779947916666666, | |
| "learning_rate": 1.360732647858498e-08, | |
| "loss": 0.0006, | |
| "reward": 1.3911830882231395, | |
| "reward_std": 0.1600110853711764, | |
| "rewards/equation_reward_func": 0.4188988283276558, | |
| "rewards/format_reward_func": 0.9722842623790106, | |
| "step": 404 | |
| }, | |
| { | |
| "completion_length": 209.77470207214355, | |
| "epoch": 4.029850746268656, | |
| "grad_norm": 0.023571808495632358, | |
| "kl": 0.3199462890625, | |
| "learning_rate": 1.2459519992702311e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4161552041769028, | |
| "reward_std": 0.15395409474149346, | |
| "rewards/equation_reward_func": 0.4450335018336773, | |
| "rewards/format_reward_func": 0.9711217060685158, | |
| "step": 406 | |
| }, | |
| { | |
| "completion_length": 208.13561153411865, | |
| "epoch": 4.0497512437810945, | |
| "grad_norm": 0.037985650681309185, | |
| "kl": 0.3270263671875, | |
| "learning_rate": 1.1361044160671629e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4171317592263222, | |
| "reward_std": 0.1468229554593563, | |
| "rewards/equation_reward_func": 0.44266184605658054, | |
| "rewards/format_reward_func": 0.9744699075818062, | |
| "step": 408 | |
| }, | |
| { | |
| "completion_length": 207.70773792266846, | |
| "epoch": 4.069651741293533, | |
| "grad_norm": 0.021241530165510853, | |
| "kl": 0.321533203125, | |
| "learning_rate": 1.0312127105846947e-08, | |
| "loss": 0.0003, | |
| "reward": 1.4218750670552254, | |
| "reward_std": 0.14907476026564837, | |
| "rewards/equation_reward_func": 0.4467076100409031, | |
| "rewards/format_reward_func": 0.975167453289032, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 207.40081882476807, | |
| "epoch": 4.08955223880597, | |
| "grad_norm": 0.025650392805104616, | |
| "kl": 0.323486328125, | |
| "learning_rate": 9.312986659581301e-09, | |
| "loss": 0.0003, | |
| "reward": 1.4208985045552254, | |
| "reward_std": 0.1482405737042427, | |
| "rewards/equation_reward_func": 0.44475448317825794, | |
| "rewards/format_reward_func": 0.9761440120637417, | |
| "step": 412 | |
| }, | |
| { | |
| "completion_length": 204.74107837677002, | |
| "epoch": 4.109452736318408, | |
| "grad_norm": 0.02312838038656702, | |
| "kl": 0.3240966796875, | |
| "learning_rate": 8.363830315988945e-09, | |
| "loss": 0.0003, | |
| "reward": 1.4256418123841286, | |
| "reward_std": 0.13928239746019244, | |
| "rewards/equation_reward_func": 0.4486607275903225, | |
| "rewards/format_reward_func": 0.9769810624420643, | |
| "step": 414 | |
| }, | |
| { | |
| "completion_length": 210.85812854766846, | |
| "epoch": 4.129353233830845, | |
| "grad_norm": 0.02306714311619018, | |
| "kl": 0.3194580078125, | |
| "learning_rate": 7.46485518885462e-09, | |
| "loss": 0.0003, | |
| "reward": 1.4207590073347092, | |
| "reward_std": 0.16189886908978224, | |
| "rewards/equation_reward_func": 0.44824220798909664, | |
| "rewards/format_reward_func": 0.9725167937576771, | |
| "step": 416 | |
| }, | |
| { | |
| "completion_length": 209.68360042572021, | |
| "epoch": 4.149253731343284, | |
| "grad_norm": 0.025964972983739235, | |
| "kl": 0.4527587890625, | |
| "learning_rate": 6.616247970698319e-09, | |
| "loss": 0.0005, | |
| "reward": 1.4035994037985802, | |
| "reward_std": 0.1401984915137291, | |
| "rewards/equation_reward_func": 0.4278739057481289, | |
| "rewards/format_reward_func": 0.9757254905998707, | |
| "step": 418 | |
| }, | |
| { | |
| "completion_length": 209.79618644714355, | |
| "epoch": 4.169154228855722, | |
| "grad_norm": 0.02348654454949762, | |
| "kl": 0.31689453125, | |
| "learning_rate": 5.8181848940044855e-09, | |
| "loss": 0.0003, | |
| "reward": 1.3839286416769028, | |
| "reward_std": 0.14657014375552535, | |
| "rewards/equation_reward_func": 0.40820314176380634, | |
| "rewards/format_reward_func": 0.975725494325161, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 210.37012767791748, | |
| "epoch": 4.189054726368159, | |
| "grad_norm": 0.02357698256911753, | |
| "kl": 0.3001708984375, | |
| "learning_rate": 5.070831694623135e-09, | |
| "loss": 0.0003, | |
| "reward": 1.4319197162985802, | |
| "reward_std": 0.17353114672005177, | |
| "rewards/equation_reward_func": 0.4606585055589676, | |
| "rewards/format_reward_func": 0.971261203289032, | |
| "step": 422 | |
| }, | |
| { | |
| "completion_length": 204.22601413726807, | |
| "epoch": 4.208955223880597, | |
| "grad_norm": 0.02307857400198192, | |
| "kl": 0.315185546875, | |
| "learning_rate": 4.374343577351336e-09, | |
| "loss": 0.0003, | |
| "reward": 1.4426618963479996, | |
| "reward_std": 0.14288693387061357, | |
| "rewards/equation_reward_func": 0.4628906473517418, | |
| "rewards/format_reward_func": 0.9797712452709675, | |
| "step": 424 | |
| }, | |
| { | |
| "completion_length": 207.03641891479492, | |
| "epoch": 4.2288557213930345, | |
| "grad_norm": 0.024676025095632062, | |
| "kl": 0.32421875, | |
| "learning_rate": 3.7288651837012745e-09, | |
| "loss": 0.0003, | |
| "reward": 1.4172712713479996, | |
| "reward_std": 0.15037415781989694, | |
| "rewards/equation_reward_func": 0.4390346184372902, | |
| "rewards/format_reward_func": 0.9782366491854191, | |
| "step": 426 | |
| }, | |
| { | |
| "completion_length": 208.80218505859375, | |
| "epoch": 4.248756218905473, | |
| "grad_norm": 0.02483677986616571, | |
| "kl": 0.3218994140625, | |
| "learning_rate": 3.134530561862081e-09, | |
| "loss": 0.0003, | |
| "reward": 1.4372210651636124, | |
| "reward_std": 0.16455319011583924, | |
| "rewards/equation_reward_func": 0.4655413180589676, | |
| "rewards/format_reward_func": 0.9716797359287739, | |
| "step": 428 | |
| }, | |
| { | |
| "completion_length": 211.74707984924316, | |
| "epoch": 4.268656716417911, | |
| "grad_norm": 0.025368044357021936, | |
| "kl": 0.3048095703125, | |
| "learning_rate": 2.5914631388619103e-09, | |
| "loss": 0.0003, | |
| "reward": 1.414062574505806, | |
| "reward_std": 0.14385076658800244, | |
| "rewards/equation_reward_func": 0.4398716725409031, | |
| "rewards/format_reward_func": 0.9741908945143223, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 206.24610137939453, | |
| "epoch": 4.288557213930348, | |
| "grad_norm": 0.023950486942455378, | |
| "kl": 0.315185546875, | |
| "learning_rate": 2.0997756949353297e-09, | |
| "loss": 0.0003, | |
| "reward": 1.4426618963479996, | |
| "reward_std": 0.13947459170594811, | |
| "rewards/equation_reward_func": 0.46735493279993534, | |
| "rewards/format_reward_func": 0.9753069616854191, | |
| "step": 432 | |
| }, | |
| { | |
| "completion_length": 206.12724208831787, | |
| "epoch": 4.308457711442786, | |
| "grad_norm": 0.021527565472442417, | |
| "kl": 0.31005859375, | |
| "learning_rate": 1.6595703401020844e-09, | |
| "loss": 0.0003, | |
| "reward": 1.4464286491274834, | |
| "reward_std": 0.14696118608117104, | |
| "rewards/equation_reward_func": 0.47028462402522564, | |
| "rewards/format_reward_func": 0.9761440195143223, | |
| "step": 434 | |
| }, | |
| { | |
| "completion_length": 208.80678939819336, | |
| "epoch": 4.3283582089552235, | |
| "grad_norm": 0.020240811217773904, | |
| "kl": 0.2994384765625, | |
| "learning_rate": 1.2709384929615596e-09, | |
| "loss": 0.0003, | |
| "reward": 1.4192243963479996, | |
| "reward_std": 0.14247039007022977, | |
| "rewards/equation_reward_func": 0.4411272518336773, | |
| "rewards/format_reward_func": 0.978097140789032, | |
| "step": 436 | |
| }, | |
| { | |
| "completion_length": 207.45759773254395, | |
| "epoch": 4.348258706467662, | |
| "grad_norm": 0.021924119370664356, | |
| "kl": 0.3150634765625, | |
| "learning_rate": 9.339608617077165e-10, | |
| "loss": 0.0003, | |
| "reward": 1.438476637005806, | |
| "reward_std": 0.14740097196772695, | |
| "rewards/equation_reward_func": 0.4614955596625805, | |
| "rewards/format_reward_func": 0.9769810698926449, | |
| "step": 438 | |
| }, | |
| { | |
| "completion_length": 207.46052646636963, | |
| "epoch": 4.3681592039801, | |
| "grad_norm": 0.02647054738349205, | |
| "kl": 0.300537109375, | |
| "learning_rate": 6.487074273681114e-10, | |
| "loss": 0.0003, | |
| "reward": 1.4221540912985802, | |
| "reward_std": 0.15105881914496422, | |
| "rewards/equation_reward_func": 0.444614976644516, | |
| "rewards/format_reward_func": 0.9775391034781933, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 204.8751516342163, | |
| "epoch": 4.388059701492537, | |
| "grad_norm": 0.019730185707955705, | |
| "kl": 0.317626953125, | |
| "learning_rate": 4.152374292708538e-10, | |
| "loss": 0.0003, | |
| "reward": 1.428152970969677, | |
| "reward_std": 0.14623442757874727, | |
| "rewards/equation_reward_func": 0.45228797383606434, | |
| "rewards/format_reward_func": 0.9758649952709675, | |
| "step": 442 | |
| }, | |
| { | |
| "completion_length": 204.39119052886963, | |
| "epoch": 4.407960199004975, | |
| "grad_norm": 0.030516143275097163, | |
| "kl": 0.3372802734375, | |
| "learning_rate": 2.3359935274214204e-10, | |
| "loss": 0.0003, | |
| "reward": 1.4517299830913544, | |
| "reward_std": 0.13834463013336062, | |
| "rewards/equation_reward_func": 0.47349332459270954, | |
| "rewards/format_reward_func": 0.9782366491854191, | |
| "step": 444 | |
| }, | |
| { | |
| "completion_length": 206.29214000701904, | |
| "epoch": 4.4278606965174125, | |
| "grad_norm": 0.023528987935019292, | |
| "kl": 0.3077392578125, | |
| "learning_rate": 1.0383091903720665e-10, | |
| "loss": 0.0003, | |
| "reward": 1.4386161491274834, | |
| "reward_std": 0.14745613746345043, | |
| "rewards/equation_reward_func": 0.46303015761077404, | |
| "rewards/format_reward_func": 0.9755859784781933, | |
| "step": 446 | |
| }, | |
| { | |
| "completion_length": 205.60352325439453, | |
| "epoch": 4.447761194029851, | |
| "grad_norm": 0.01905395451435713, | |
| "kl": 0.327880859375, | |
| "learning_rate": 2.595907750671533e-11, | |
| "loss": 0.0003, | |
| "reward": 1.4253627881407738, | |
| "reward_std": 0.14054079167544842, | |
| "rewards/equation_reward_func": 0.44921877048909664, | |
| "rewards/format_reward_func": 0.976144015789032, | |
| "step": 448 | |
| }, | |
| { | |
| "completion_length": 202.40026092529297, | |
| "epoch": 4.467661691542289, | |
| "grad_norm": 0.022636083725668023, | |
| "kl": 0.3131103515625, | |
| "learning_rate": 0.0, | |
| "loss": 0.0003, | |
| "reward": 1.4511719420552254, | |
| "reward_std": 0.15265340125188231, | |
| "rewards/equation_reward_func": 0.47209824435412884, | |
| "rewards/format_reward_func": 0.9790736995637417, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 4.467661691542289, | |
| "step": 450, | |
| "total_flos": 0.0, | |
| "train_loss": 0.0009040301745537565, | |
| "train_runtime": 63789.5966, | |
| "train_samples_per_second": 3.16, | |
| "train_steps_per_second": 0.007 | |
| } | |
| ], | |
| "logging_steps": 2, | |
| "max_steps": 450, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 25, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |