diff --git "a/checkpoint-200/trainer_state.json" "b/checkpoint-200/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-200/trainer_state.json" @@ -0,0 +1,4634 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.15060240963855423, + "eval_steps": 500, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1461.0, + "completions/mean_length": 539.939453125, + "completions/mean_terminated_length": 534.0687866210938, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.0007530120481927711, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2677452564239502, + "learning_rate": 1e-06, + "loss": -0.0491, + "num_tokens": 322497.0, + "reward": 1.8222503662109375, + "reward_std": 0.2692643404006958, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 1.8222503662109375, + "rewards/icrm_reward/std": 0.35891062021255493, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1361.0, + "completions/mean_length": 541.125, + "completions/mean_terminated_length": 535.2612915039062, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.0015060240963855422, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2547134757041931, + "learning_rate": 1e-06, + "loss": -0.0405, + "num_tokens": 643217.0, + "reward": 1.8759613037109375, + "reward_std": 0.26730749011039734, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 1.8759613037109375, + "rewards/icrm_reward/std": 0.35533255338668823, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1268.0, + "completions/max_terminated_length": 1268.0, + "completions/mean_length": 503.001953125, + "completions/mean_terminated_length": 503.001953125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.002259036144578313, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28800466656684875, + "learning_rate": 1e-06, + "loss": -0.0702, + "num_tokens": 946338.0, + "reward": 1.807281494140625, + "reward_std": 0.2657621502876282, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 1.807281494140625, + "rewards/icrm_reward/std": 0.36616575717926025, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1020.0, + "completions/max_terminated_length": 1020.0, + "completions/mean_length": 482.392578125, + "completions/mean_terminated_length": 482.392578125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.0030120481927710845, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2880045473575592, + "learning_rate": 1e-06, + "loss": -0.0543, + "num_tokens": 1238651.0, + "reward": 1.8897247314453125, + "reward_std": 0.24010181427001953, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 1.8897247314453125, + "rewards/icrm_reward/std": 0.2975654602050781, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1447.0, + "completions/mean_length": 548.681640625, + "completions/mean_terminated_length": 546.74951171875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.0037650602409638554, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26260432600975037, + "learning_rate": 1e-06, + "loss": -0.0915, + "num_tokens": 1564648.0, + "reward": 1.8785552978515625, + "reward_std": 0.2939695119857788, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 1.8785552978515625, + "rewards/icrm_reward/std": 0.3403075635433197, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1248.0, + "completions/mean_length": 465.359375, + "completions/mean_terminated_length": 461.1607971191406, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.004518072289156626, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2783867418766022, + "learning_rate": 1e-06, + "loss": -0.0511, + "num_tokens": 1845008.0, + "reward": 1.886444091796875, + "reward_std": 0.24519476294517517, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 1.886444091796875, + "rewards/icrm_reward/std": 0.32863083481788635, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1419.0, + "completions/mean_length": 558.427734375, + "completions/mean_terminated_length": 556.5146484375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.005271084337349397, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2398781031370163, + "learning_rate": 1e-06, + "loss": -0.0528, + "num_tokens": 2178827.0, + "reward": 1.89752197265625, + "reward_std": 0.2825944125652313, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 1.89752197265625, + "rewards/icrm_reward/std": 0.3314611613750458, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1315.0, + "completions/max_terminated_length": 1315.0, + "completions/mean_length": 516.62890625, + "completions/mean_terminated_length": 516.62890625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.006024096385542169, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26950332522392273, + "learning_rate": 1e-06, + "loss": -0.0515, + "num_tokens": 2490557.0, + "reward": 1.880584716796875, + "reward_std": 0.24953413009643555, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 1.880584716796875, + "rewards/icrm_reward/std": 0.31805965304374695, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1486.0, + "completions/mean_length": 540.916015625, + "completions/mean_terminated_length": 538.9686889648438, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.00677710843373494, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27774062752723694, + "learning_rate": 1e-06, + "loss": -0.0696, + "num_tokens": 2810898.0, + "reward": 1.8914337158203125, + "reward_std": 0.2737649083137512, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 1.8914337158203125, + "rewards/icrm_reward/std": 0.3299334645271301, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1449.0, + "completions/mean_length": 489.474609375, + "completions/mean_terminated_length": 487.4266052246094, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.007530120481927711, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29050636291503906, + "learning_rate": 1e-06, + "loss": -0.0795, + "num_tokens": 3108869.0, + "reward": 1.938812255859375, + "reward_std": 0.2572469711303711, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 1.938812255859375, + "rewards/icrm_reward/std": 0.338980495929718, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1475.0, + "completions/max_terminated_length": 1475.0, + "completions/mean_length": 520.849609375, + "completions/mean_terminated_length": 520.849609375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.008283132530120483, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2942567765712738, + "learning_rate": 1e-06, + "loss": -0.0717, + "num_tokens": 3421848.0, + "reward": 1.9404754638671875, + "reward_std": 0.2595525085926056, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 1.9404754638671875, + "rewards/icrm_reward/std": 0.3023451566696167, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1396.0, + "completions/mean_length": 519.873046875, + "completions/mean_terminated_length": 517.884521484375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.009036144578313253, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24970945715904236, + "learning_rate": 1e-06, + "loss": -0.0503, + "num_tokens": 3729655.0, + "reward": 1.9094085693359375, + "reward_std": 0.25953686237335205, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 1.9094085693359375, + "rewards/icrm_reward/std": 0.331910640001297, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1369.0, + "completions/mean_length": 547.90625, + "completions/mean_terminated_length": 545.9725952148438, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.009789156626506024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28067365288734436, + "learning_rate": 1e-06, + "loss": -0.0752, + "num_tokens": 4053863.0, + "reward": 1.9482421875, + "reward_std": 0.26520416140556335, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 1.9482421875, + "rewards/icrm_reward/std": 0.3480120599269867, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1511.0, + "completions/mean_length": 548.40234375, + "completions/mean_terminated_length": 546.4696655273438, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.010542168674698794, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25670015811920166, + "learning_rate": 1e-06, + "loss": -0.0691, + "num_tokens": 4378933.0, + "reward": 1.9578399658203125, + "reward_std": 0.27680855989456177, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 1.9578399658203125, + "rewards/icrm_reward/std": 0.3370186686515808, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1521.0, + "completions/max_terminated_length": 1521.0, + "completions/mean_length": 524.80859375, + "completions/mean_terminated_length": 524.80859375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.011295180722891566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2583891451358795, + "learning_rate": 1e-06, + "loss": -0.0585, + "num_tokens": 4691523.0, + "reward": 1.9633941650390625, + "reward_std": 0.2548086643218994, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 1.9633941650390625, + "rewards/icrm_reward/std": 0.31804537773132324, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1247.0, + "completions/mean_length": 560.337890625, + "completions/mean_terminated_length": 556.5117797851562, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.012048192771084338, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24440975487232208, + "learning_rate": 1e-06, + "loss": -0.0549, + "num_tokens": 5027184.0, + "reward": 1.9822998046875, + "reward_std": 0.268485963344574, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 1.9822998046875, + "rewards/icrm_reward/std": 0.3278394639492035, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1356.0, + "completions/mean_length": 591.25390625, + "completions/mean_terminated_length": 589.4050903320312, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.012801204819277108, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22858311235904694, + "learning_rate": 1e-06, + "loss": -0.049, + "num_tokens": 5376418.0, + "reward": 2.090728759765625, + "reward_std": 0.2703300714492798, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.090728759765625, + "rewards/icrm_reward/std": 0.3051627278327942, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1427.0, + "completions/mean_length": 560.716796875, + "completions/mean_terminated_length": 556.8922119140625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.01355421686746988, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24253730475902557, + "learning_rate": 1e-06, + "loss": -0.0476, + "num_tokens": 5708881.0, + "reward": 2.023834228515625, + "reward_std": 0.22830849885940552, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.023834228515625, + "rewards/icrm_reward/std": 0.3195403516292572, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1396.0, + "completions/max_terminated_length": 1396.0, + "completions/mean_length": 554.669921875, + "completions/mean_terminated_length": 554.669921875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.01430722891566265, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22568698227405548, + "learning_rate": 1e-06, + "loss": -0.0628, + "num_tokens": 6034520.0, + "reward": 2.0350494384765625, + "reward_std": 0.23877058923244476, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.0350494384765625, + "rewards/icrm_reward/std": 0.2821340560913086, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1408.0, + "completions/mean_length": 618.142578125, + "completions/mean_terminated_length": 612.7328491210938, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.015060240963855422, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21298259496688843, + "learning_rate": 1e-06, + "loss": -0.0312, + "num_tokens": 6399457.0, + "reward": 2.043853759765625, + "reward_std": 0.2564227879047394, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.043853759765625, + "rewards/icrm_reward/std": 0.32547545433044434, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1500.0, + "completions/mean_length": 647.5078125, + "completions/mean_terminated_length": 636.9723510742188, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.01581325301204819, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21974076330661774, + "learning_rate": 1e-06, + "loss": -0.0194, + "num_tokens": 6773909.0, + "reward": 2.0822906494140625, + "reward_std": 0.2559783458709717, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.0822906494140625, + "rewards/icrm_reward/std": 0.3165861964225769, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1533.0, + "completions/mean_length": 629.095703125, + "completions/mean_terminated_length": 627.3209228515625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.016566265060240965, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2140408158302307, + "learning_rate": 1e-06, + "loss": -0.0158, + "num_tokens": 7137062.0, + "reward": 2.0489654541015625, + "reward_std": 0.22745975852012634, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.0489654541015625, + "rewards/icrm_reward/std": 0.28583163022994995, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1474.0, + "completions/mean_length": 668.0, + "completions/mean_terminated_length": 666.3013916015625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.017319277108433735, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19740059971809387, + "learning_rate": 1e-06, + "loss": -0.0199, + "num_tokens": 7525974.0, + "reward": 2.114471435546875, + "reward_std": 0.22889575362205505, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.114471435546875, + "rewards/icrm_reward/std": 0.2792186439037323, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1507.0, + "completions/mean_length": 688.009765625, + "completions/mean_terminated_length": 658.8869018554688, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.018072289156626505, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19638773798942566, + "learning_rate": 1e-06, + "loss": 0.0102, + "num_tokens": 7924795.0, + "reward": 2.080596923828125, + "reward_std": 0.23665004968643188, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.080596923828125, + "rewards/icrm_reward/std": 0.3111322522163391, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1517.0, + "completions/max_terminated_length": 1517.0, + "completions/mean_length": 625.5546875, + "completions/mean_terminated_length": 625.5546875, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.01882530120481928, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22388488054275513, + "learning_rate": 1e-06, + "loss": -0.0295, + "num_tokens": 8289031.0, + "reward": 2.1454315185546875, + "reward_std": 0.22431066632270813, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.1454315185546875, + "rewards/icrm_reward/std": 0.2752014100551605, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1518.0, + "completions/mean_length": 620.7578125, + "completions/mean_terminated_length": 596.913818359375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.01957831325301205, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21399712562561035, + "learning_rate": 1e-06, + "loss": -0.0254, + "num_tokens": 8656715.0, + "reward": 2.0743865966796875, + "reward_std": 0.2408333122730255, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.0743865966796875, + "rewards/icrm_reward/std": 0.3148554563522339, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1519.0, + "completions/max_terminated_length": 1519.0, + "completions/mean_length": 640.244140625, + "completions/mean_terminated_length": 640.244140625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.02033132530120482, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2085241973400116, + "learning_rate": 1e-06, + "loss": -0.0445, + "num_tokens": 9027544.0, + "reward": 2.1477813720703125, + "reward_std": 0.22815221548080444, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.1477813720703125, + "rewards/icrm_reward/std": 0.3022710084915161, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1444.0, + "completions/max_terminated_length": 1444.0, + "completions/mean_length": 574.10546875, + "completions/mean_terminated_length": 574.10546875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.02108433734939759, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22086061537265778, + "learning_rate": 1e-06, + "loss": -0.0415, + "num_tokens": 9368830.0, + "reward": 2.0839691162109375, + "reward_std": 0.22651252150535583, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.0839691162109375, + "rewards/icrm_reward/std": 0.2819969654083252, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1455.0, + "completions/mean_length": 608.298828125, + "completions/mean_terminated_length": 602.8310546875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.021837349397590362, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21593217551708221, + "learning_rate": 1e-06, + "loss": -0.0165, + "num_tokens": 9728471.0, + "reward": 2.12347412109375, + "reward_std": 0.2261788547039032, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.12347412109375, + "rewards/icrm_reward/std": 0.2729581892490387, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1490.0, + "completions/mean_length": 602.5390625, + "completions/mean_terminated_length": 600.7123413085938, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.022590361445783132, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21327072381973267, + "learning_rate": 1e-06, + "loss": -0.0265, + "num_tokens": 10078123.0, + "reward": 2.134918212890625, + "reward_std": 0.22259432077407837, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.134918212890625, + "rewards/icrm_reward/std": 0.2794201970100403, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1390.0, + "completions/mean_length": 624.75390625, + "completions/mean_terminated_length": 622.9706420898438, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.023343373493975902, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21840304136276245, + "learning_rate": 1e-06, + "loss": -0.0354, + "num_tokens": 10439629.0, + "reward": 2.1316375732421875, + "reward_std": 0.2339249849319458, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.1316375732421875, + "rewards/icrm_reward/std": 0.2949000895023346, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1455.0, + "completions/mean_length": 618.02734375, + "completions/mean_terminated_length": 610.7991943359375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.024096385542168676, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21134598553180695, + "learning_rate": 1e-06, + "loss": -0.0239, + "num_tokens": 10801515.0, + "reward": 2.1789169311523438, + "reward_std": 0.2280135154724121, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.1789169311523438, + "rewards/icrm_reward/std": 0.3063806891441345, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1394.0, + "completions/max_terminated_length": 1394.0, + "completions/mean_length": 579.873046875, + "completions/mean_terminated_length": 579.873046875, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.024849397590361446, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23027564585208893, + "learning_rate": 1e-06, + "loss": -0.0456, + "num_tokens": 11142458.0, + "reward": 2.1141510009765625, + "reward_std": 0.22557678818702698, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.1141510009765625, + "rewards/icrm_reward/std": 0.2943926155567169, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1466.0, + "completions/mean_length": 569.83203125, + "completions/mean_terminated_length": 567.9412841796875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.025602409638554216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2484128624200821, + "learning_rate": 1e-06, + "loss": -0.0377, + "num_tokens": 11478676.0, + "reward": 2.138275146484375, + "reward_std": 0.2324836403131485, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.138275146484375, + "rewards/icrm_reward/std": 0.2837066352367401, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1524.0, + "completions/mean_length": 649.62109375, + "completions/mean_terminated_length": 644.3968505859375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.02635542168674699, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22154265642166138, + "learning_rate": 1e-06, + "loss": -0.0408, + "num_tokens": 11857058.0, + "reward": 2.131011962890625, + "reward_std": 0.2266952395439148, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.131011962890625, + "rewards/icrm_reward/std": 0.31931453943252563, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1451.0, + "completions/mean_length": 637.251953125, + "completions/mean_terminated_length": 628.3885498046875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.02710843373493976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20767764747142792, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 12229043.0, + "reward": 2.1409454345703125, + "reward_std": 0.22632841765880585, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.1409454345703125, + "rewards/icrm_reward/std": 0.2695969045162201, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1456.0, + "completions/mean_length": 659.4453125, + "completions/mean_terminated_length": 647.2950439453125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.02786144578313253, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2131509631872177, + "learning_rate": 1e-06, + "loss": -0.0088, + "num_tokens": 12614919.0, + "reward": 2.208251953125, + "reward_std": 0.24248948693275452, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.208251953125, + "rewards/icrm_reward/std": 0.3120294213294983, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1361.0, + "completions/mean_length": 630.41015625, + "completions/mean_terminated_length": 626.85888671875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.0286144578313253, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2243937849998474, + "learning_rate": 1e-06, + "loss": -0.0279, + "num_tokens": 12980249.0, + "reward": 2.111053466796875, + "reward_std": 0.22191520035266876, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.111053466796875, + "rewards/icrm_reward/std": 0.2775036096572876, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1459.0, + "completions/mean_length": 675.94140625, + "completions/mean_terminated_length": 665.7431030273438, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.029367469879518073, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20395302772521973, + "learning_rate": 1e-06, + "loss": -0.0178, + "num_tokens": 13371051.0, + "reward": 2.2367706298828125, + "reward_std": 0.24021776020526886, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.2367706298828125, + "rewards/icrm_reward/std": 0.30560144782066345, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1495.0, + "completions/mean_length": 665.2265625, + "completions/mean_terminated_length": 658.3700561523438, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.030120481927710843, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21775901317596436, + "learning_rate": 1e-06, + "loss": -0.0184, + "num_tokens": 13762655.0, + "reward": 2.22381591796875, + "reward_std": 0.23306353390216827, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.22381591796875, + "rewards/icrm_reward/std": 0.2940692901611328, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1489.0, + "completions/mean_length": 711.74609375, + "completions/mean_terminated_length": 696.9979858398438, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.030873493975903613, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20008227229118347, + "learning_rate": 1e-06, + "loss": -0.0203, + "num_tokens": 14174493.0, + "reward": 2.1737060546875, + "reward_std": 0.23047159612178802, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.1737060546875, + "rewards/icrm_reward/std": 0.2827721834182739, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1519.0, + "completions/mean_length": 628.2109375, + "completions/mean_terminated_length": 622.8605346679688, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.03162650602409638, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23574763536453247, + "learning_rate": 1e-06, + "loss": -0.024, + "num_tokens": 14538633.0, + "reward": 2.2064056396484375, + "reward_std": 0.23086489737033844, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.2064056396484375, + "rewards/icrm_reward/std": 0.30988746881484985, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1509.0, + "completions/mean_length": 651.89453125, + "completions/mean_terminated_length": 648.427490234375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.03237951807228916, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22128161787986755, + "learning_rate": 1e-06, + "loss": -0.0133, + "num_tokens": 14917715.0, + "reward": 2.1989898681640625, + "reward_std": 0.22341391444206238, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.1989898681640625, + "rewards/icrm_reward/std": 0.2708163857460022, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 752.138671875, + "completions/mean_terminated_length": 730.1023559570312, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.03313253012048193, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18046841025352478, + "learning_rate": 1e-06, + "loss": -0.0114, + "num_tokens": 15353770.0, + "reward": 2.202545166015625, + "reward_std": 0.22210244834423065, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.202545166015625, + "rewards/icrm_reward/std": 0.30588632822036743, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1520.0, + "completions/mean_length": 648.603515625, + "completions/mean_terminated_length": 619.977783203125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.0338855421686747, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21456339955329895, + "learning_rate": 1e-06, + "loss": -0.0282, + "num_tokens": 15731183.0, + "reward": 2.1501007080078125, + "reward_std": 0.24410219490528107, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.1501007080078125, + "rewards/icrm_reward/std": 0.3128403425216675, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 664.4765625, + "completions/mean_terminated_length": 662.7710571289062, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.03463855421686747, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23055920004844666, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 16113635.0, + "reward": 2.2421112060546875, + "reward_std": 0.23560726642608643, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.2421112060546875, + "rewards/icrm_reward/std": 0.29645806550979614, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1530.0, + "completions/mean_length": 679.16796875, + "completions/mean_terminated_length": 667.2911376953125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.035391566265060244, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20098020136356354, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 16510985.0, + "reward": 2.1724700927734375, + "reward_std": 0.24453820288181305, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.1724700927734375, + "rewards/icrm_reward/std": 0.3007848858833313, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1473.0, + "completions/mean_length": 629.65625, + "completions/mean_terminated_length": 626.1019897460938, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.03614457831325301, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21083861589431763, + "learning_rate": 1e-06, + "loss": -0.0175, + "num_tokens": 16878169.0, + "reward": 2.199188232421875, + "reward_std": 0.21884912252426147, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.199188232421875, + "rewards/icrm_reward/std": 0.27998584508895874, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1528.0, + "completions/mean_length": 673.232421875, + "completions/mean_terminated_length": 661.2733154296875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.036897590361445784, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22509372234344482, + "learning_rate": 1e-06, + "loss": -0.0074, + "num_tokens": 17269376.0, + "reward": 2.2296295166015625, + "reward_std": 0.2356814444065094, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.2296295166015625, + "rewards/icrm_reward/std": 0.2954954504966736, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1424.0, + "completions/mean_length": 636.708984375, + "completions/mean_terminated_length": 629.6279296875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.03765060240963856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24557623267173767, + "learning_rate": 1e-06, + "loss": -0.0308, + "num_tokens": 17640139.0, + "reward": 2.254730224609375, + "reward_std": 0.24260641634464264, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.254730224609375, + "rewards/icrm_reward/std": 0.2840177118778229, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1530.0, + "completions/mean_length": 600.349609375, + "completions/mean_terminated_length": 598.5186157226562, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.038403614457831324, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24236264824867249, + "learning_rate": 1e-06, + "loss": -0.0534, + "num_tokens": 17994222.0, + "reward": 2.2987060546875, + "reward_std": 0.24154716730117798, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.2987060546875, + "rewards/icrm_reward/std": 0.3029851019382477, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1530.0, + "completions/mean_length": 686.248046875, + "completions/mean_terminated_length": 677.8678588867188, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.0391566265060241, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21713781356811523, + "learning_rate": 1e-06, + "loss": -0.0079, + "num_tokens": 18393245.0, + "reward": 2.2697906494140625, + "reward_std": 0.2312566339969635, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.2697906494140625, + "rewards/icrm_reward/std": 0.3033978343009949, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 734.234375, + "completions/mean_terminated_length": 729.5088500976562, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.039909638554216864, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20006483793258667, + "learning_rate": 1e-06, + "loss": 0.0053, + "num_tokens": 18813301.0, + "reward": 2.4068450927734375, + "reward_std": 0.24239230155944824, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.4068450927734375, + "rewards/icrm_reward/std": 0.2884853184223175, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1436.0, + "completions/mean_length": 682.37109375, + "completions/mean_terminated_length": 673.9526977539062, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.04066265060240964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22565065324306488, + "learning_rate": 1e-06, + "loss": 0.0167, + "num_tokens": 19205987.0, + "reward": 2.411712646484375, + "reward_std": 0.2517935633659363, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.411712646484375, + "rewards/icrm_reward/std": 0.29691916704177856, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1530.0, + "completions/mean_length": 686.0625, + "completions/mean_terminated_length": 675.9841918945312, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.04141566265060241, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25505706667900085, + "learning_rate": 1e-06, + "loss": 0.0116, + "num_tokens": 19603747.0, + "reward": 2.3581085205078125, + "reward_std": 0.25292664766311646, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.3581085205078125, + "rewards/icrm_reward/std": 0.3019520044326782, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1487.0, + "completions/mean_length": 738.88671875, + "completions/mean_terminated_length": 721.38525390625, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.04216867469879518, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19398604333400726, + "learning_rate": 1e-06, + "loss": 0.0298, + "num_tokens": 20029593.0, + "reward": 2.420806884765625, + "reward_std": 0.2693680226802826, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.420806884765625, + "rewards/icrm_reward/std": 0.33825576305389404, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1482.0, + "completions/mean_length": 693.65625, + "completions/mean_terminated_length": 678.58447265625, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.04292168674698795, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23698727786540985, + "learning_rate": 1e-06, + "loss": 0.0359, + "num_tokens": 20431033.0, + "reward": 2.45440673828125, + "reward_std": 0.2709549069404602, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.45440673828125, + "rewards/icrm_reward/std": 0.3200661540031433, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1520.0, + "completions/max_terminated_length": 1520.0, + "completions/mean_length": 651.880859375, + "completions/mean_terminated_length": 651.880859375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.043674698795180725, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22394704818725586, + "learning_rate": 1e-06, + "loss": 0.0063, + "num_tokens": 20809052.0, + "reward": 2.4409332275390625, + "reward_std": 0.24943535029888153, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.4409332275390625, + "rewards/icrm_reward/std": 0.29196539521217346, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1531.0, + "completions/mean_length": 728.9296875, + "completions/mean_terminated_length": 704.5714111328125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.04442771084337349, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24043507874011993, + "learning_rate": 1e-06, + "loss": 0.0294, + "num_tokens": 21230728.0, + "reward": 2.4530029296875, + "reward_std": 0.29039081931114197, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.4530029296875, + "rewards/icrm_reward/std": 0.3333180546760559, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1522.0, + "completions/mean_length": 668.4765625, + "completions/mean_terminated_length": 652.9542236328125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.045180722891566265, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24050478637218475, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 21620748.0, + "reward": 2.416748046875, + "reward_std": 0.29372185468673706, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.416748046875, + "rewards/icrm_reward/std": 0.335464745759964, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1520.0, + "completions/mean_length": 722.58203125, + "completions/mean_terminated_length": 682.5778198242188, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.04593373493975904, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22767792642116547, + "learning_rate": 1e-06, + "loss": 0.0407, + "num_tokens": 22032694.0, + "reward": 2.4194793701171875, + "reward_std": 0.2609785497188568, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.4194793701171875, + "rewards/icrm_reward/std": 0.35149145126342773, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1526.0, + "completions/max_terminated_length": 1526.0, + "completions/mean_length": 627.765625, + "completions/mean_terminated_length": 627.765625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.046686746987951805, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24805991351604462, + "learning_rate": 1e-06, + "loss": 0.0129, + "num_tokens": 22402686.0, + "reward": 2.4908294677734375, + "reward_std": 0.2669978737831116, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.4908294677734375, + "rewards/icrm_reward/std": 0.3251093924045563, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1490.0, + "completions/mean_length": 766.275390625, + "completions/mean_terminated_length": 739.8404541015625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.04743975903614458, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23006977140903473, + "learning_rate": 1e-06, + "loss": 0.0456, + "num_tokens": 22844203.0, + "reward": 2.4503021240234375, + "reward_std": 0.2922241687774658, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.4503021240234375, + "rewards/icrm_reward/std": 0.33814340829849243, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1507.0, + "completions/mean_length": 725.68359375, + "completions/mean_terminated_length": 707.8922119140625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.04819277108433735, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22646617889404297, + "learning_rate": 1e-06, + "loss": 0.0394, + "num_tokens": 23258473.0, + "reward": 2.477783203125, + "reward_std": 0.26488780975341797, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.477783203125, + "rewards/icrm_reward/std": 0.33223187923431396, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1503.0, + "completions/mean_length": 754.828125, + "completions/mean_terminated_length": 728.0000610351562, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.04894578313253012, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2258726954460144, + "learning_rate": 1e-06, + "loss": 0.0125, + "num_tokens": 23690801.0, + "reward": 2.45458984375, + "reward_std": 0.27226293087005615, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.45458984375, + "rewards/icrm_reward/std": 0.3482906222343445, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1522.0, + "completions/mean_length": 698.974609375, + "completions/mean_terminated_length": 689.0494384765625, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.04969879518072289, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2544415593147278, + "learning_rate": 1e-06, + "loss": 0.0095, + "num_tokens": 24092564.0, + "reward": 2.50372314453125, + "reward_std": 0.2619969844818115, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.50372314453125, + "rewards/icrm_reward/std": 0.32770606875419617, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1461.0, + "completions/max_terminated_length": 1461.0, + "completions/mean_length": 648.00390625, + "completions/mean_terminated_length": 648.00390625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.050451807228915665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2574257552623749, + "learning_rate": 1e-06, + "loss": 0.0096, + "num_tokens": 24467686.0, + "reward": 2.5450897216796875, + "reward_std": 0.2660745680332184, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.5450897216796875, + "rewards/icrm_reward/std": 0.3114686608314514, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1533.0, + "completions/mean_length": 744.58203125, + "completions/mean_terminated_length": 738.3504028320312, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.05120481927710843, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23591530323028564, + "learning_rate": 1e-06, + "loss": 0.0251, + "num_tokens": 24894160.0, + "reward": 2.490386962890625, + "reward_std": 0.25365495681762695, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.490386962890625, + "rewards/icrm_reward/std": 0.3168938159942627, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1526.0, + "completions/mean_length": 769.4921875, + "completions/mean_terminated_length": 749.5230712890625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.051957831325301206, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20792824029922485, + "learning_rate": 1e-06, + "loss": 0.0559, + "num_tokens": 25328156.0, + "reward": 2.5142974853515625, + "reward_std": 0.26831918954849243, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.5142974853515625, + "rewards/icrm_reward/std": 0.33375680446624756, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 776.34765625, + "completions/mean_terminated_length": 748.6680297851562, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.05271084337349398, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.209881991147995, + "learning_rate": 1e-06, + "loss": 0.0291, + "num_tokens": 25767454.0, + "reward": 2.4936981201171875, + "reward_std": 0.2859433889389038, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.4936981201171875, + "rewards/icrm_reward/std": 0.31998544931411743, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1534.0, + "completions/mean_length": 756.26953125, + "completions/mean_terminated_length": 737.5560302734375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.053463855421686746, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22324053943157196, + "learning_rate": 1e-06, + "loss": 0.0431, + "num_tokens": 26204136.0, + "reward": 2.529388427734375, + "reward_std": 0.287563294172287, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.529388427734375, + "rewards/icrm_reward/std": 0.3346646726131439, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1532.0, + "completions/mean_length": 739.04296875, + "completions/mean_terminated_length": 729.5928955078125, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.05421686746987952, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23168617486953735, + "learning_rate": 1e-06, + "loss": 0.0376, + "num_tokens": 26628030.0, + "reward": 2.5133209228515625, + "reward_std": 0.2752232849597931, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.5133209228515625, + "rewards/icrm_reward/std": 0.32146480679512024, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 801.69140625, + "completions/mean_terminated_length": 784.0680541992188, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.054969879518072286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21675075590610504, + "learning_rate": 1e-06, + "loss": 0.0333, + "num_tokens": 27084016.0, + "reward": 2.6229248046875, + "reward_std": 0.29856833815574646, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.6229248046875, + "rewards/icrm_reward/std": 0.3686384856700897, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1500.0, + "completions/mean_length": 859.42578125, + "completions/mean_terminated_length": 840.4055786132812, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.05572289156626506, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21499691903591156, + "learning_rate": 1e-06, + "loss": 0.0493, + "num_tokens": 27565834.0, + "reward": 2.593292236328125, + "reward_std": 0.2805570960044861, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.593292236328125, + "rewards/icrm_reward/std": 0.36220526695251465, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1518.0, + "completions/mean_length": 773.33984375, + "completions/mean_terminated_length": 764.2964477539062, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.05647590361445783, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2344229519367218, + "learning_rate": 1e-06, + "loss": 0.0423, + "num_tokens": 28007656.0, + "reward": 2.5948028564453125, + "reward_std": 0.293756902217865, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.5948028564453125, + "rewards/icrm_reward/std": 0.3333764374256134, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1534.0, + "completions/mean_length": 817.533203125, + "completions/mean_terminated_length": 785.2754516601562, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.0572289156626506, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22105208039283752, + "learning_rate": 1e-06, + "loss": 0.0472, + "num_tokens": 28470185.0, + "reward": 2.571807861328125, + "reward_std": 0.3125259280204773, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.571807861328125, + "rewards/icrm_reward/std": 0.39160090684890747, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1533.0, + "completions/mean_length": 840.224609375, + "completions/mean_terminated_length": 808.9856567382812, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "epoch": 0.05798192771084337, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2159413844347, + "learning_rate": 1e-06, + "loss": 0.0593, + "num_tokens": 28947228.0, + "reward": 2.6000213623046875, + "reward_std": 0.31856799125671387, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.6000213623046875, + "rewards/icrm_reward/std": 0.3859856426715851, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1521.0, + "completions/mean_length": 812.81640625, + "completions/mean_terminated_length": 792.4859008789062, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.058734939759036146, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24093309044837952, + "learning_rate": 1e-06, + "loss": 0.0314, + "num_tokens": 29408302.0, + "reward": 2.6257476806640625, + "reward_std": 0.29263830184936523, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.6257476806640625, + "rewards/icrm_reward/std": 0.36981090903282166, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.076171875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1530.0, + "completions/mean_length": 920.380859375, + "completions/mean_terminated_length": 869.62158203125, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.05948795180722891, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22334524989128113, + "learning_rate": 1e-06, + "loss": 0.0437, + "num_tokens": 29929569.0, + "reward": 2.54388427734375, + "reward_std": 0.32523876428604126, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.54388427734375, + "rewards/icrm_reward/std": 0.43486225605010986, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.041015625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 916.134765625, + "completions/mean_terminated_length": 889.623291015625, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.060240963855421686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.230862557888031, + "learning_rate": 1e-06, + "loss": 0.0692, + "num_tokens": 30444198.0, + "reward": 2.6412506103515625, + "reward_std": 0.3409099280834198, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.6412506103515625, + "rewards/icrm_reward/std": 0.39923959970474243, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 770.642578125, + "completions/mean_terminated_length": 753.8383178710938, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.06099397590361446, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26526880264282227, + "learning_rate": 1e-06, + "loss": 0.0325, + "num_tokens": 30882223.0, + "reward": 2.6500091552734375, + "reward_std": 0.31351447105407715, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.6500091552734375, + "rewards/icrm_reward/std": 0.37751612067222595, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1524.0, + "completions/mean_length": 793.802734375, + "completions/mean_terminated_length": 789.4283447265625, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.061746987951807226, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2738642990589142, + "learning_rate": 1e-06, + "loss": 0.0415, + "num_tokens": 31337226.0, + "reward": 2.7096099853515625, + "reward_std": 0.32019662857055664, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.7096099853515625, + "rewards/icrm_reward/std": 0.3571399450302124, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1526.0, + "completions/mean_length": 802.03125, + "completions/mean_terminated_length": 784.416015625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.0625, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29248520731925964, + "learning_rate": 1e-06, + "loss": 0.0371, + "num_tokens": 31799898.0, + "reward": 2.713287353515625, + "reward_std": 0.30860912799835205, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.713287353515625, + "rewards/icrm_reward/std": 0.38378050923347473, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1533.0, + "completions/mean_length": 831.005859375, + "completions/mean_terminated_length": 808.2640991210938, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.06325301204819277, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2955339550971985, + "learning_rate": 1e-06, + "loss": 0.0373, + "num_tokens": 32271437.0, + "reward": 2.73077392578125, + "reward_std": 0.34596526622772217, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.73077392578125, + "rewards/icrm_reward/std": 0.3958691656589508, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1454.0, + "completions/mean_length": 784.931640625, + "completions/mean_terminated_length": 774.5208129882812, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.06400602409638555, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2862742841243744, + "learning_rate": 1e-06, + "loss": 0.0388, + "num_tokens": 32717626.0, + "reward": 2.7470855712890625, + "reward_std": 0.3193947374820709, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.7470855712890625, + "rewards/icrm_reward/std": 0.377297043800354, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1518.0, + "completions/mean_length": 779.779296875, + "completions/mean_terminated_length": 770.812255859375, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.06475903614457831, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27621668577194214, + "learning_rate": 1e-06, + "loss": 0.0273, + "num_tokens": 33162073.0, + "reward": 2.8101959228515625, + "reward_std": 0.3282198905944824, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.8101959228515625, + "rewards/icrm_reward/std": 0.38994351029396057, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1532.0, + "completions/mean_length": 819.9453125, + "completions/mean_terminated_length": 815.7249755859375, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.06551204819277108, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2889441251754761, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 33630589.0, + "reward": 2.8496246337890625, + "reward_std": 0.31779032945632935, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.8496246337890625, + "rewards/icrm_reward/std": 0.3903903365135193, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 808.7109375, + "completions/mean_terminated_length": 779.1463012695312, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.06626506024096386, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32299426198005676, + "learning_rate": 1e-06, + "loss": 0.0374, + "num_tokens": 34088185.0, + "reward": 2.8147430419921875, + "reward_std": 0.3786037564277649, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.8147430419921875, + "rewards/icrm_reward/std": 0.45253899693489075, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1422.0, + "completions/mean_length": 845.720703125, + "completions/mean_terminated_length": 817.6605224609375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.06701807228915663, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31927403807640076, + "learning_rate": 1e-06, + "loss": 0.0419, + "num_tokens": 34564490.0, + "reward": 2.8685455322265625, + "reward_std": 0.4015999734401703, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.8685455322265625, + "rewards/icrm_reward/std": 0.462121844291687, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1517.0, + "completions/mean_length": 885.173828125, + "completions/mean_terminated_length": 870.8842163085938, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.0677710843373494, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30388039350509644, + "learning_rate": 1e-06, + "loss": 0.0421, + "num_tokens": 35065059.0, + "reward": 2.9256591796875, + "reward_std": 0.381799578666687, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.9256591796875, + "rewards/icrm_reward/std": 0.4587096571922302, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1520.0, + "completions/mean_length": 860.423828125, + "completions/mean_terminated_length": 830.091796875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.06852409638554217, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31677141785621643, + "learning_rate": 1e-06, + "loss": 0.0362, + "num_tokens": 35548764.0, + "reward": 2.9433441162109375, + "reward_std": 0.4057391285896301, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.9433441162109375, + "rewards/icrm_reward/std": 0.4758445918560028, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1499.0, + "completions/mean_length": 880.548828125, + "completions/mean_terminated_length": 864.8180541992188, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.06927710843373494, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2733721435070038, + "learning_rate": 1e-06, + "loss": 0.0376, + "num_tokens": 36041157.0, + "reward": 2.95556640625, + "reward_std": 0.3688337206840515, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.95556640625, + "rewards/icrm_reward/std": 0.42398521304130554, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 882.771484375, + "completions/mean_terminated_length": 871.0834350585938, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.07003012048192771, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30676305294036865, + "learning_rate": 1e-06, + "loss": 0.0245, + "num_tokens": 36536928.0, + "reward": 2.928802490234375, + "reward_std": 0.37581419944763184, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 2.928802490234375, + "rewards/icrm_reward/std": 0.4320792555809021, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1533.0, + "completions/mean_length": 912.806640625, + "completions/mean_terminated_length": 897.8500366210938, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.07078313253012049, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2867027521133423, + "learning_rate": 1e-06, + "loss": 0.0418, + "num_tokens": 37049405.0, + "reward": 3.07061767578125, + "reward_std": 0.38050028681755066, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.07061767578125, + "rewards/icrm_reward/std": 0.4632287621498108, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1530.0, + "completions/mean_length": 887.109375, + "completions/mean_terminated_length": 874.1832885742188, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.07153614457831325, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3053477108478546, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 37546421.0, + "reward": 3.01019287109375, + "reward_std": 0.4029240608215332, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.01019287109375, + "rewards/icrm_reward/std": 0.46473872661590576, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 895.69921875, + "completions/mean_terminated_length": 880.33203125, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.07228915662650602, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3111552894115448, + "learning_rate": 1e-06, + "loss": 0.0166, + "num_tokens": 38050571.0, + "reward": 3.07513427734375, + "reward_std": 0.3884848952293396, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.07513427734375, + "rewards/icrm_reward/std": 0.47221118211746216, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1512.0, + "completions/mean_length": 902.5625, + "completions/mean_terminated_length": 882.1290283203125, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "epoch": 0.0730421686746988, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.272835373878479, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 38557883.0, + "reward": 3.056396484375, + "reward_std": 0.4365222454071045, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.056396484375, + "rewards/icrm_reward/std": 0.5650331974029541, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1490.0, + "completions/mean_length": 836.984375, + "completions/mean_terminated_length": 832.8644409179688, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.07379518072289157, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32397598028182983, + "learning_rate": 1e-06, + "loss": 0.0132, + "num_tokens": 39035331.0, + "reward": 3.0126800537109375, + "reward_std": 0.3690912127494812, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.0126800537109375, + "rewards/icrm_reward/std": 0.45690035820007324, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1495.0, + "completions/mean_length": 909.12890625, + "completions/mean_terminated_length": 904.1929321289062, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.07454819277108433, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2951310873031616, + "learning_rate": 1e-06, + "loss": -0.0122, + "num_tokens": 39545477.0, + "reward": 3.144927978515625, + "reward_std": 0.367910236120224, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.144927978515625, + "rewards/icrm_reward/std": 0.42396417260169983, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1527.0, + "completions/mean_length": 900.78515625, + "completions/mean_terminated_length": 884.2365112304688, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.07530120481927711, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28665030002593994, + "learning_rate": 1e-06, + "loss": 0.0229, + "num_tokens": 40053143.0, + "reward": 3.1378173828125, + "reward_std": 0.40286144614219666, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.1378173828125, + "rewards/icrm_reward/std": 0.4643743336200714, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1519.0, + "completions/mean_length": 945.23046875, + "completions/mean_terminated_length": 923.7044677734375, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "epoch": 0.07605421686746988, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2595527172088623, + "learning_rate": 1e-06, + "loss": 0.0348, + "num_tokens": 40579549.0, + "reward": 3.1948089599609375, + "reward_std": 0.4271482229232788, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.1948089599609375, + "rewards/icrm_reward/std": 0.49850770831108093, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1531.0, + "completions/mean_length": 933.57421875, + "completions/mean_terminated_length": 903.9466552734375, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "epoch": 0.07680722891566265, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2670970857143402, + "learning_rate": 1e-06, + "loss": 0.0484, + "num_tokens": 41107283.0, + "reward": 3.1592559814453125, + "reward_std": 0.4245299994945526, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.1592559814453125, + "rewards/icrm_reward/std": 0.5013904571533203, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1527.0, + "completions/mean_length": 933.482421875, + "completions/mean_terminated_length": 910.2616577148438, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "epoch": 0.07756024096385543, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2757066488265991, + "learning_rate": 1e-06, + "loss": 0.0084, + "num_tokens": 41630394.0, + "reward": 3.312530517578125, + "reward_std": 0.44548290967941284, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.312530517578125, + "rewards/icrm_reward/std": 0.5510988831520081, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1523.0, + "completions/mean_length": 947.587890625, + "completions/mean_terminated_length": 937.0595703125, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.0783132530120482, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26052749156951904, + "learning_rate": 1e-06, + "loss": 0.0167, + "num_tokens": 42162615.0, + "reward": 3.2794189453125, + "reward_std": 0.38833147287368774, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.2794189453125, + "rewards/icrm_reward/std": 0.44736889004707336, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1527.0, + "completions/mean_length": 961.701171875, + "completions/mean_terminated_length": 946.739501953125, + "completions/min_length": 457.0, + "completions/min_terminated_length": 457.0, + "epoch": 0.07906626506024096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25815528631210327, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 42698590.0, + "reward": 3.326141357421875, + "reward_std": 0.4088899791240692, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.326141357421875, + "rewards/icrm_reward/std": 0.49012550711631775, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.05859375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1527.0, + "completions/mean_length": 915.328125, + "completions/mean_terminated_length": 876.6971435546875, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.07981927710843373, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3288217782974243, + "learning_rate": 1e-06, + "loss": 0.0242, + "num_tokens": 43214550.0, + "reward": 3.265228271484375, + "reward_std": 0.3938843011856079, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.265228271484375, + "rewards/icrm_reward/std": 0.5416343808174133, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.060546875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1525.0, + "completions/mean_length": 932.228515625, + "completions/mean_terminated_length": 893.3160400390625, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.08057228915662651, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2760145366191864, + "learning_rate": 1e-06, + "loss": 0.0482, + "num_tokens": 43737771.0, + "reward": 3.3134918212890625, + "reward_std": 0.4481176435947418, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.3134918212890625, + "rewards/icrm_reward/std": 0.5695783495903015, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.095703125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 1043.150390625, + "completions/mean_terminated_length": 990.9913330078125, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.08132530120481928, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2719205319881439, + "learning_rate": 1e-06, + "loss": 0.0224, + "num_tokens": 44318760.0, + "reward": 3.3242950439453125, + "reward_std": 0.46499037742614746, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.3242950439453125, + "rewards/icrm_reward/std": 0.6479662656784058, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1528.0, + "completions/mean_length": 1041.3828125, + "completions/mean_terminated_length": 999.4661254882812, + "completions/min_length": 491.0, + "completions/min_terminated_length": 491.0, + "epoch": 0.08207831325301204, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2970697581768036, + "learning_rate": 1e-06, + "loss": 0.0232, + "num_tokens": 44896012.0, + "reward": 3.4527130126953125, + "reward_std": 0.4472125172615051, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.4527130126953125, + "rewards/icrm_reward/std": 0.6081138849258423, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.052734375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 1054.5625, + "completions/mean_terminated_length": 1027.7608642578125, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.08283132530120482, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29613715410232544, + "learning_rate": 1e-06, + "loss": 0.0159, + "num_tokens": 45483756.0, + "reward": 3.5418701171875, + "reward_std": 0.5048837661743164, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.5418701171875, + "rewards/icrm_reward/std": 0.6320242285728455, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.05078125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1517.0, + "completions/mean_length": 1061.486328125, + "completions/mean_terminated_length": 1036.100830078125, + "completions/min_length": 502.0, + "completions/min_terminated_length": 502.0, + "epoch": 0.08358433734939759, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31563860177993774, + "learning_rate": 1e-06, + "loss": 0.0197, + "num_tokens": 46069909.0, + "reward": 3.607818603515625, + "reward_std": 0.48346221446990967, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.607818603515625, + "rewards/icrm_reward/std": 0.6252943873405457, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.072265625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1527.0, + "completions/mean_length": 1062.2109375, + "completions/mean_terminated_length": 1025.30517578125, + "completions/min_length": 467.0, + "completions/min_terminated_length": 467.0, + "epoch": 0.08433734939759036, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3350493311882019, + "learning_rate": 1e-06, + "loss": 0.0268, + "num_tokens": 46662017.0, + "reward": 3.5582427978515625, + "reward_std": 0.49701404571533203, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.5582427978515625, + "rewards/icrm_reward/std": 0.6976381540298462, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 1031.458984375, + "completions/mean_terminated_length": 1019.3500366210938, + "completions/min_length": 515.0, + "completions/min_terminated_length": 515.0, + "epoch": 0.08509036144578314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.334583580493927, + "learning_rate": 1e-06, + "loss": 0.016, + "num_tokens": 47235772.0, + "reward": 3.6624908447265625, + "reward_std": 0.45720618963241577, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.6624908447265625, + "rewards/icrm_reward/std": 0.6205590963363647, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.080078125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 1115.578125, + "completions/mean_terminated_length": 1078.98095703125, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "epoch": 0.0858433734939759, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3087984323501587, + "learning_rate": 1e-06, + "loss": 0.0323, + "num_tokens": 47852820.0, + "reward": 3.5841064453125, + "reward_std": 0.5725352764129639, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.5841064453125, + "rewards/icrm_reward/std": 0.7000143527984619, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.07421875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1527.0, + "completions/mean_length": 1063.62109375, + "completions/mean_terminated_length": 1025.7509765625, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "epoch": 0.08659638554216867, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33008214831352234, + "learning_rate": 1e-06, + "loss": 0.0192, + "num_tokens": 48446802.0, + "reward": 3.625518798828125, + "reward_std": 0.49653565883636475, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.625518798828125, + "rewards/icrm_reward/std": 0.6842305064201355, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 1078.359375, + "completions/mean_terminated_length": 1051.88427734375, + "completions/min_length": 604.0, + "completions/min_terminated_length": 604.0, + "epoch": 0.08734939759036145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35016050934791565, + "learning_rate": 1e-06, + "loss": 0.0386, + "num_tokens": 49046842.0, + "reward": 3.6656494140625, + "reward_std": 0.5081939101219177, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.6656494140625, + "rewards/icrm_reward/std": 0.6160948276519775, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.068359375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1528.0, + "completions/mean_length": 1129.044921875, + "completions/mean_terminated_length": 1099.1844482421875, + "completions/min_length": 514.0, + "completions/min_terminated_length": 514.0, + "epoch": 0.08810240963855422, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34213417768478394, + "learning_rate": 1e-06, + "loss": 0.043, + "num_tokens": 49670449.0, + "reward": 3.842926025390625, + "reward_std": 0.5554987192153931, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.842926025390625, + "rewards/icrm_reward/std": 0.6801345348358154, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 1073.904296875, + "completions/mean_terminated_length": 1026.101318359375, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "epoch": 0.08885542168674698, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35978832840919495, + "learning_rate": 1e-06, + "loss": 0.0266, + "num_tokens": 50263392.0, + "reward": 3.68927001953125, + "reward_std": 0.6048953533172607, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.68927001953125, + "rewards/icrm_reward/std": 0.7318205833435059, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.041015625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1528.0, + "completions/mean_length": 1037.619140625, + "completions/mean_terminated_length": 1016.3035278320312, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.08960843373493976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.40669694542884827, + "learning_rate": 1e-06, + "loss": 0.0281, + "num_tokens": 50840125.0, + "reward": 3.836822509765625, + "reward_std": 0.5344001054763794, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.836822509765625, + "rewards/icrm_reward/std": 0.7598003149032593, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1532.0, + "completions/mean_length": 1001.3125, + "completions/mean_terminated_length": 982.9495239257812, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.09036144578313253, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3870098888874054, + "learning_rate": 1e-06, + "loss": 0.0093, + "num_tokens": 51397917.0, + "reward": 3.87884521484375, + "reward_std": 0.5162875652313232, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.87884521484375, + "rewards/icrm_reward/std": 0.6479455828666687, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1528.0, + "completions/mean_length": 1010.32421875, + "completions/mean_terminated_length": 1006.18505859375, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "epoch": 0.0911144578313253, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39014145731925964, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 51959539.0, + "reward": 3.9730072021484375, + "reward_std": 0.4872249364852905, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.9730072021484375, + "rewards/icrm_reward/std": 0.6410709023475647, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1532.0, + "completions/mean_length": 1034.78515625, + "completions/mean_terminated_length": 1001.3709106445312, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.09186746987951808, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4095689058303833, + "learning_rate": 1e-06, + "loss": 0.0201, + "num_tokens": 52537701.0, + "reward": 3.8944854736328125, + "reward_std": 0.5506025552749634, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 3.8944854736328125, + "rewards/icrm_reward/std": 0.675002932548523, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.087890625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 1102.3671875, + "completions/mean_terminated_length": 1060.5823974609375, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "epoch": 0.09262048192771084, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39829981327056885, + "learning_rate": 1e-06, + "loss": 0.0222, + "num_tokens": 53145905.0, + "reward": 4.00750732421875, + "reward_std": 0.6116676330566406, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.00750732421875, + "rewards/icrm_reward/std": 0.8072155117988586, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 1062.55078125, + "completions/mean_terminated_length": 1050.2164306640625, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "epoch": 0.09337349397590361, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.430706262588501, + "learning_rate": 1e-06, + "loss": 0.0132, + "num_tokens": 53736795.0, + "reward": 4.0050048828125, + "reward_std": 0.5629526972770691, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.0050048828125, + "rewards/icrm_reward/std": 0.7918898463249207, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 1077.34765625, + "completions/mean_terminated_length": 1066.340087890625, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "epoch": 0.09412650602409639, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45189860463142395, + "learning_rate": 1e-06, + "loss": -0.0027, + "num_tokens": 54334269.0, + "reward": 4.1801300048828125, + "reward_std": 0.5930963754653931, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.1801300048828125, + "rewards/icrm_reward/std": 0.6826416850090027, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1522.0, + "completions/mean_length": 1056.853515625, + "completions/mean_terminated_length": 1047.308837890625, + "completions/min_length": 554.0, + "completions/min_terminated_length": 554.0, + "epoch": 0.09487951807228916, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45464563369750977, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 54921730.0, + "reward": 4.1747894287109375, + "reward_std": 0.5486907958984375, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.1747894287109375, + "rewards/icrm_reward/std": 0.7279157042503357, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.056640625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1530.0, + "completions/mean_length": 1100.255859375, + "completions/mean_terminated_length": 1074.0931396484375, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.09563253012048192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43557268381118774, + "learning_rate": 1e-06, + "loss": 0.0199, + "num_tokens": 55532117.0, + "reward": 4.1165618896484375, + "reward_std": 0.652687668800354, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.1165618896484375, + "rewards/icrm_reward/std": 0.8171107172966003, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1531.0, + "completions/mean_length": 1049.5, + "completions/mean_terminated_length": 1030.75048828125, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.0963855421686747, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2762600183486938, + "learning_rate": 1e-06, + "loss": 0.0118, + "num_tokens": 56113925.0, + "reward": 4.0705718994140625, + "reward_std": 0.5888340473175049, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.0705718994140625, + "rewards/icrm_reward/std": 0.7753996849060059, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 1075.615234375, + "completions/mean_terminated_length": 1064.5660400390625, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "epoch": 0.09713855421686747, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5985656976699829, + "learning_rate": 1e-06, + "loss": 0.0204, + "num_tokens": 56707232.0, + "reward": 4.26837158203125, + "reward_std": 0.6652629971504211, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.26837158203125, + "rewards/icrm_reward/std": 0.7837300896644592, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1517.0, + "completions/mean_length": 1126.310546875, + "completions/mean_terminated_length": 1102.6094970703125, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "epoch": 0.09789156626506024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6995136737823486, + "learning_rate": 1e-06, + "loss": 0.0255, + "num_tokens": 57327535.0, + "reward": 4.19671630859375, + "reward_std": 0.7013879418373108, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.19671630859375, + "rewards/icrm_reward/std": 0.8030691742897034, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1534.0, + "completions/mean_length": 1108.6328125, + "completions/mean_terminated_length": 1092.1622314453125, + "completions/min_length": 467.0, + "completions/min_terminated_length": 467.0, + "epoch": 0.09864457831325302, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5040700435638428, + "learning_rate": 1e-06, + "loss": 0.0236, + "num_tokens": 57940419.0, + "reward": 4.236175537109375, + "reward_std": 0.7264065742492676, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.236175537109375, + "rewards/icrm_reward/std": 0.8259881138801575, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.052734375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 1150.2265625, + "completions/mean_terminated_length": 1128.7506103515625, + "completions/min_length": 550.0, + "completions/min_terminated_length": 550.0, + "epoch": 0.09939759036144578, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5671800971031189, + "learning_rate": 1e-06, + "loss": 0.0306, + "num_tokens": 58572871.0, + "reward": 4.3385772705078125, + "reward_std": 0.7649765610694885, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.3385772705078125, + "rewards/icrm_reward/std": 0.9581444263458252, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.091796875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1531.0, + "completions/mean_length": 1201.853515625, + "completions/mean_terminated_length": 1168.07958984375, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.10015060240963855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5382966995239258, + "learning_rate": 1e-06, + "loss": 0.0238, + "num_tokens": 59231148.0, + "reward": 4.30035400390625, + "reward_std": 0.7863367199897766, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.30035400390625, + "rewards/icrm_reward/std": 0.9911512732505798, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.1328125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1527.0, + "completions/mean_length": 1154.716796875, + "completions/mean_terminated_length": 1096.3221435546875, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.10090361445783133, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5057360529899597, + "learning_rate": 1e-06, + "loss": 0.0231, + "num_tokens": 59869675.0, + "reward": 4.17413330078125, + "reward_std": 0.7724943161010742, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.17413330078125, + "rewards/icrm_reward/std": 1.1001423597335815, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 1109.478515625, + "completions/mean_terminated_length": 1093.04052734375, + "completions/min_length": 680.0, + "completions/min_terminated_length": 680.0, + "epoch": 0.1016566265060241, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4980197846889496, + "learning_rate": 1e-06, + "loss": 0.0189, + "num_tokens": 60486016.0, + "reward": 4.4792022705078125, + "reward_std": 0.6606467962265015, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.4792022705078125, + "rewards/icrm_reward/std": 0.7966200113296509, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1532.0, + "completions/mean_length": 1091.37890625, + "completions/mean_terminated_length": 1078.8795166015625, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "epoch": 0.10240963855421686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5023417472839355, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 61090290.0, + "reward": 4.5316009521484375, + "reward_std": 0.6734751462936401, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.5316009521484375, + "rewards/icrm_reward/std": 0.8923114538192749, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.044921875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 1130.16015625, + "completions/mean_terminated_length": 1111.071533203125, + "completions/min_length": 592.0, + "completions/min_terminated_length": 592.0, + "epoch": 0.10316265060240964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5186536312103271, + "learning_rate": 1e-06, + "loss": 0.0244, + "num_tokens": 61718724.0, + "reward": 4.3646087646484375, + "reward_std": 0.6948167085647583, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.3646087646484375, + "rewards/icrm_reward/std": 0.826988160610199, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.076171875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1533.0, + "completions/mean_length": 1148.951171875, + "completions/mean_terminated_length": 1117.0380859375, + "completions/min_length": 583.0, + "completions/min_terminated_length": 583.0, + "epoch": 0.10391566265060241, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5542064905166626, + "learning_rate": 1e-06, + "loss": 0.0254, + "num_tokens": 62349675.0, + "reward": 4.36920166015625, + "reward_std": 0.7360410690307617, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.36920166015625, + "rewards/icrm_reward/std": 0.9057928323745728, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 1157.771484375, + "completions/mean_terminated_length": 1129.166015625, + "completions/min_length": 699.0, + "completions/min_terminated_length": 699.0, + "epoch": 0.10466867469879518, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4985194802284241, + "learning_rate": 1e-06, + "loss": 0.0401, + "num_tokens": 62992966.0, + "reward": 4.42987060546875, + "reward_std": 0.7369166612625122, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.42987060546875, + "rewards/icrm_reward/std": 0.9678148031234741, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.05859375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1534.0, + "completions/mean_length": 1177.798828125, + "completions/mean_terminated_length": 1155.504150390625, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "epoch": 0.10542168674698796, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.48446330428123474, + "learning_rate": 1e-06, + "loss": 0.0295, + "num_tokens": 63639679.0, + "reward": 4.651458740234375, + "reward_std": 0.7585673332214355, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.651458740234375, + "rewards/icrm_reward/std": 0.9639466404914856, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.07421875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 1178.580078125, + "completions/mean_terminated_length": 1149.9261474609375, + "completions/min_length": 594.0, + "completions/min_terminated_length": 594.0, + "epoch": 0.10617469879518072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5235411524772644, + "learning_rate": 1e-06, + "loss": 0.0409, + "num_tokens": 64287064.0, + "reward": 4.468170166015625, + "reward_std": 0.8118422031402588, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.468170166015625, + "rewards/icrm_reward/std": 0.9429880380630493, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.12890625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1532.0, + "completions/mean_length": 1191.279296875, + "completions/mean_terminated_length": 1140.266845703125, + "completions/min_length": 446.0, + "completions/min_terminated_length": 446.0, + "epoch": 0.10692771084337349, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5035867094993591, + "learning_rate": 1e-06, + "loss": 0.021, + "num_tokens": 64939303.0, + "reward": 4.414886474609375, + "reward_std": 0.7666824460029602, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.414886474609375, + "rewards/icrm_reward/std": 1.1432216167449951, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1530.0, + "completions/mean_length": 1205.037109375, + "completions/mean_terminated_length": 1164.392578125, + "completions/min_length": 658.0, + "completions/min_terminated_length": 658.0, + "epoch": 0.10768072289156627, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4995163679122925, + "learning_rate": 1e-06, + "loss": 0.0249, + "num_tokens": 65599322.0, + "reward": 4.5460968017578125, + "reward_std": 0.8546550273895264, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.5460968017578125, + "rewards/icrm_reward/std": 1.0646564960479736, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.123046875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1522.0, + "completions/mean_length": 1184.833984375, + "completions/mean_terminated_length": 1135.561279296875, + "completions/min_length": 679.0, + "completions/min_terminated_length": 679.0, + "epoch": 0.10843373493975904, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5785022377967834, + "learning_rate": 1e-06, + "loss": 0.0241, + "num_tokens": 66252757.0, + "reward": 4.5584564208984375, + "reward_std": 0.9171419143676758, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.5584564208984375, + "rewards/icrm_reward/std": 1.2026164531707764, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.044921875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 1152.01171875, + "completions/mean_terminated_length": 1133.950927734375, + "completions/min_length": 670.0, + "completions/min_terminated_length": 670.0, + "epoch": 0.1091867469879518, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5453264713287354, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 66890171.0, + "reward": 4.862518310546875, + "reward_std": 0.7816354632377625, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.862518310546875, + "rewards/icrm_reward/std": 0.9664557576179504, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.083984375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1525.0, + "completions/mean_length": 1212.98828125, + "completions/mean_terminated_length": 1183.3731689453125, + "completions/min_length": 746.0, + "completions/min_terminated_length": 746.0, + "epoch": 0.10993975903614457, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5098794102668762, + "learning_rate": 1e-06, + "loss": 0.0254, + "num_tokens": 67554341.0, + "reward": 4.792633056640625, + "reward_std": 0.8540831804275513, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.792633056640625, + "rewards/icrm_reward/std": 1.080270528793335, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.103515625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 1214.3984375, + "completions/mean_terminated_length": 1177.2635498046875, + "completions/min_length": 718.0, + "completions/min_terminated_length": 718.0, + "epoch": 0.11069277108433735, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4801645576953888, + "learning_rate": 1e-06, + "loss": 0.0272, + "num_tokens": 68224849.0, + "reward": 4.768585205078125, + "reward_std": 0.8221277594566345, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.768585205078125, + "rewards/icrm_reward/std": 1.0661983489990234, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.119140625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 1154.353515625, + "completions/mean_terminated_length": 1102.73388671875, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "epoch": 0.11144578313253012, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5634762048721313, + "learning_rate": 1e-06, + "loss": 0.0147, + "num_tokens": 68857798.0, + "reward": 4.6792144775390625, + "reward_std": 0.8073952198028564, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.6792144775390625, + "rewards/icrm_reward/std": 1.1656705141067505, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1530.0, + "completions/mean_length": 1139.81640625, + "completions/mean_terminated_length": 1113.4041748046875, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.11219879518072289, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46944481134414673, + "learning_rate": 1e-06, + "loss": 0.0192, + "num_tokens": 69488120.0, + "reward": 5.034210205078125, + "reward_std": 0.7564300298690796, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.034210205078125, + "rewards/icrm_reward/std": 1.0557838678359985, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 1128.408203125, + "completions/mean_terminated_length": 1086.2435302734375, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "epoch": 0.11295180722891567, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4718353748321533, + "learning_rate": 1e-06, + "loss": 0.0156, + "num_tokens": 70109049.0, + "reward": 4.9261627197265625, + "reward_std": 0.740686297416687, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.9261627197265625, + "rewards/icrm_reward/std": 1.173972725868225, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1529.0, + "completions/mean_length": 1094.146484375, + "completions/mean_terminated_length": 1089.7889404296875, + "completions/min_length": 564.0, + "completions/min_terminated_length": 564.0, + "epoch": 0.11370481927710843, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4578796625137329, + "learning_rate": 1e-06, + "loss": 0.0099, + "num_tokens": 70718244.0, + "reward": 5.183929443359375, + "reward_std": 0.66904616355896, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.183929443359375, + "rewards/icrm_reward/std": 0.9041181802749634, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1516.0, + "completions/mean_length": 1059.701171875, + "completions/mean_terminated_length": 1051.1788330078125, + "completions/min_length": 627.0, + "completions/min_terminated_length": 627.0, + "epoch": 0.1144578313253012, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4858332574367523, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 71306027.0, + "reward": 4.9518890380859375, + "reward_std": 0.6586086750030518, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 4.9518890380859375, + "rewards/icrm_reward/std": 0.925758421421051, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1532.0, + "completions/mean_length": 1033.603515625, + "completions/mean_terminated_length": 1019.4798583984375, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "epoch": 0.11521084337349398, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49182265996932983, + "learning_rate": 1e-06, + "loss": 0.0071, + "num_tokens": 71882144.0, + "reward": 5.0518341064453125, + "reward_std": 0.6696003079414368, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.0518341064453125, + "rewards/icrm_reward/std": 0.8905400633811951, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 1078.1796875, + "completions/mean_terminated_length": 1071.833740234375, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "epoch": 0.11596385542168675, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46055495738983154, + "learning_rate": 1e-06, + "loss": 0.0073, + "num_tokens": 72478988.0, + "reward": 5.09454345703125, + "reward_std": 0.6344223618507385, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.09454345703125, + "rewards/icrm_reward/std": 0.9012209177017212, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 1038.26171875, + "completions/mean_terminated_length": 1030.3612060546875, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "epoch": 0.11671686746987951, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5130624175071716, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 73056402.0, + "reward": 5.181671142578125, + "reward_std": 0.698214054107666, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.181671142578125, + "rewards/icrm_reward/std": 0.8892819881439209, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 1075.06640625, + "completions/mean_terminated_length": 1064.0040283203125, + "completions/min_length": 642.0, + "completions/min_terminated_length": 642.0, + "epoch": 0.11746987951807229, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47389230132102966, + "learning_rate": 1e-06, + "loss": 0.0184, + "num_tokens": 73653476.0, + "reward": 5.4003753662109375, + "reward_std": 0.7417340278625488, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.4003753662109375, + "rewards/icrm_reward/std": 0.9252166152000427, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 1132.5859375, + "completions/mean_terminated_length": 1114.473388671875, + "completions/min_length": 538.0, + "completions/min_terminated_length": 538.0, + "epoch": 0.11822289156626506, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4955994784832001, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 74278048.0, + "reward": 5.3224029541015625, + "reward_std": 0.7916837930679321, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.3224029541015625, + "rewards/icrm_reward/std": 1.0698318481445312, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1531.0, + "completions/mean_length": 1152.1640625, + "completions/mean_terminated_length": 1138.9818115234375, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "epoch": 0.11897590361445783, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46200019121170044, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 74910132.0, + "reward": 5.66119384765625, + "reward_std": 0.7860316038131714, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.66119384765625, + "rewards/icrm_reward/std": 1.05222749710083, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1533.0, + "completions/mean_length": 1167.58984375, + "completions/mean_terminated_length": 1151.0489501953125, + "completions/min_length": 626.0, + "completions/min_terminated_length": 626.0, + "epoch": 0.1197289156626506, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4425918459892273, + "learning_rate": 1e-06, + "loss": 0.0211, + "num_tokens": 75552562.0, + "reward": 5.364105224609375, + "reward_std": 0.8341003656387329, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.364105224609375, + "rewards/icrm_reward/std": 1.0970090627670288, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1515.0, + "completions/mean_length": 1173.29296875, + "completions/mean_terminated_length": 1152.309814453125, + "completions/min_length": 674.0, + "completions/min_terminated_length": 674.0, + "epoch": 0.12048192771084337, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47959861159324646, + "learning_rate": 1e-06, + "loss": 0.0147, + "num_tokens": 76199640.0, + "reward": 5.54571533203125, + "reward_std": 0.7932090163230896, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.54571533203125, + "rewards/icrm_reward/std": 1.1264164447784424, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.072265625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1522.0, + "completions/mean_length": 1170.8125, + "completions/mean_terminated_length": 1142.3662109375, + "completions/min_length": 544.0, + "completions/min_terminated_length": 544.0, + "epoch": 0.12123493975903614, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4375867545604706, + "learning_rate": 1e-06, + "loss": 0.0122, + "num_tokens": 76847176.0, + "reward": 5.4109954833984375, + "reward_std": 0.8528537750244141, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.4109954833984375, + "rewards/icrm_reward/std": 1.1036971807479858, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.11328125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1534.0, + "completions/mean_length": 1240.06640625, + "completions/mean_terminated_length": 1202.2598876953125, + "completions/min_length": 696.0, + "completions/min_terminated_length": 696.0, + "epoch": 0.12198795180722892, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43658557534217834, + "learning_rate": 1e-06, + "loss": 0.0206, + "num_tokens": 77526586.0, + "reward": 5.582000732421875, + "reward_std": 0.973861813545227, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.582000732421875, + "rewards/icrm_reward/std": 1.351715087890625, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.083984375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1534.0, + "completions/mean_length": 1217.12109375, + "completions/mean_terminated_length": 1187.8848876953125, + "completions/min_length": 549.0, + "completions/min_terminated_length": 549.0, + "epoch": 0.12274096385542169, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47865623235702515, + "learning_rate": 1e-06, + "loss": 0.0183, + "num_tokens": 78191080.0, + "reward": 5.48748779296875, + "reward_std": 1.0244450569152832, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.48748779296875, + "rewards/icrm_reward/std": 1.2848234176635742, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 1236.654296875, + "completions/mean_terminated_length": 1216.697998046875, + "completions/min_length": 719.0, + "completions/min_terminated_length": 719.0, + "epoch": 0.12349397590361445, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49472272396087646, + "learning_rate": 1e-06, + "loss": 0.0208, + "num_tokens": 78871319.0, + "reward": 5.76812744140625, + "reward_std": 0.9594842195510864, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.76812744140625, + "rewards/icrm_reward/std": 1.2425694465637207, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.06640625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1533.0, + "completions/mean_length": 1229.5546875, + "completions/mean_terminated_length": 1207.75732421875, + "completions/min_length": 755.0, + "completions/min_terminated_length": 755.0, + "epoch": 0.12424698795180723, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4555387794971466, + "learning_rate": 1e-06, + "loss": 0.0187, + "num_tokens": 79550163.0, + "reward": 5.9351806640625, + "reward_std": 0.9924957752227783, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.9351806640625, + "rewards/icrm_reward/std": 1.3653061389923096, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1525.0, + "completions/mean_length": 1254.59765625, + "completions/mean_terminated_length": 1225.487060546875, + "completions/min_length": 733.0, + "completions/min_terminated_length": 733.0, + "epoch": 0.125, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.475546270608902, + "learning_rate": 1e-06, + "loss": 0.0277, + "num_tokens": 80240917.0, + "reward": 5.655059814453125, + "reward_std": 1.0685908794403076, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.655059814453125, + "rewards/icrm_reward/std": 1.4573386907577515, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 1223.212890625, + "completions/mean_terminated_length": 1184.8004150390625, + "completions/min_length": 671.0, + "completions/min_terminated_length": 671.0, + "epoch": 0.12575301204819278, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5457843542098999, + "learning_rate": 1e-06, + "loss": 0.0221, + "num_tokens": 80910978.0, + "reward": 5.82574462890625, + "reward_std": 1.1165331602096558, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.82574462890625, + "rewards/icrm_reward/std": 1.5456546545028687, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.05078125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1533.0, + "completions/mean_length": 1215.30859375, + "completions/mean_terminated_length": 1198.1522216796875, + "completions/min_length": 761.0, + "completions/min_terminated_length": 761.0, + "epoch": 0.12650602409638553, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5768990516662598, + "learning_rate": 1e-06, + "loss": 0.029, + "num_tokens": 81577904.0, + "reward": 6.0022125244140625, + "reward_std": 1.0959432125091553, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 6.0022125244140625, + "rewards/icrm_reward/std": 1.3156405687332153, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.087890625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 1226.791015625, + "completions/mean_terminated_length": 1196.9957275390625, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "epoch": 0.1272590361445783, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5543394684791565, + "learning_rate": 1e-06, + "loss": 0.0197, + "num_tokens": 82255301.0, + "reward": 5.8323211669921875, + "reward_std": 1.0277405977249146, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.8323211669921875, + "rewards/icrm_reward/std": 1.3263365030288696, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1530.0, + "completions/mean_length": 1141.255859375, + "completions/mean_terminated_length": 1127.6990966796875, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "epoch": 0.1280120481927711, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6215465664863586, + "learning_rate": 1e-06, + "loss": 0.0078, + "num_tokens": 82884568.0, + "reward": 5.9560394287109375, + "reward_std": 0.9153929948806763, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.9560394287109375, + "rewards/icrm_reward/std": 1.1505851745605469, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.05859375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1531.0, + "completions/mean_length": 1175.505859375, + "completions/mean_terminated_length": 1153.0684814453125, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "epoch": 0.12876506024096385, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5910201668739319, + "learning_rate": 1e-06, + "loss": 0.0104, + "num_tokens": 83537595.0, + "reward": 5.8822784423828125, + "reward_std": 1.0506434440612793, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 5.8822784423828125, + "rewards/icrm_reward/std": 1.4261112213134766, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1521.0, + "completions/mean_length": 1114.849609375, + "completions/mean_terminated_length": 1110.6962890625, + "completions/min_length": 626.0, + "completions/min_terminated_length": 626.0, + "epoch": 0.12951807228915663, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5866726040840149, + "learning_rate": 1e-06, + "loss": 0.0097, + "num_tokens": 84154110.0, + "reward": 6.1107940673828125, + "reward_std": 0.9180551767349243, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 6.1107940673828125, + "rewards/icrm_reward/std": 1.150841236114502, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1532.0, + "completions/mean_length": 1155.37890625, + "completions/mean_terminated_length": 1145.462890625, + "completions/min_length": 712.0, + "completions/min_terminated_length": 712.0, + "epoch": 0.1302710843373494, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5644499659538269, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 84796384.0, + "reward": 6.081695556640625, + "reward_std": 1.0321983098983765, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 6.081695556640625, + "rewards/icrm_reward/std": 1.218118667602539, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1521.0, + "completions/mean_length": 1111.115234375, + "completions/mean_terminated_length": 1107.8543701171875, + "completions/min_length": 720.0, + "completions/min_terminated_length": 720.0, + "epoch": 0.13102409638554216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6356381773948669, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 85409915.0, + "reward": 6.321380615234375, + "reward_std": 0.919288158416748, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 6.321380615234375, + "rewards/icrm_reward/std": 1.110399603843689, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1504.0, + "completions/mean_length": 1056.58203125, + "completions/mean_terminated_length": 1055.643798828125, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "epoch": 0.13177710843373494, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6157188415527344, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 85996565.0, + "reward": 6.337921142578125, + "reward_std": 0.8840509057044983, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 6.337921142578125, + "rewards/icrm_reward/std": 1.0802546739578247, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1487.0, + "completions/mean_length": 1083.025390625, + "completions/mean_terminated_length": 1079.4586181640625, + "completions/min_length": 598.0, + "completions/min_terminated_length": 598.0, + "epoch": 0.13253012048192772, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6107508540153503, + "learning_rate": 1e-06, + "loss": 0.0092, + "num_tokens": 86596162.0, + "reward": 6.166168212890625, + "reward_std": 0.8845409154891968, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 6.166168212890625, + "rewards/icrm_reward/std": 1.2309458255767822, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1533.0, + "completions/mean_length": 1132.9296875, + "completions/mean_terminated_length": 1119.9273681640625, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "epoch": 0.13328313253012047, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5550761818885803, + "learning_rate": 1e-06, + "loss": 0.0143, + "num_tokens": 87224286.0, + "reward": 6.2139434814453125, + "reward_std": 0.947563648223877, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 6.2139434814453125, + "rewards/icrm_reward/std": 1.3295992612838745, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 1153.431640625, + "completions/mean_terminated_length": 1141.0906982421875, + "completions/min_length": 705.0, + "completions/min_terminated_length": 705.0, + "epoch": 0.13403614457831325, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5606355667114258, + "learning_rate": 1e-06, + "loss": 0.0199, + "num_tokens": 87857787.0, + "reward": 6.5912322998046875, + "reward_std": 1.209731101989746, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 6.5912322998046875, + "rewards/icrm_reward/std": 1.431469440460205, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1533.0, + "completions/mean_length": 1175.37109375, + "completions/mean_terminated_length": 1157.6351318359375, + "completions/min_length": 814.0, + "completions/min_terminated_length": 814.0, + "epoch": 0.13478915662650603, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5889480710029602, + "learning_rate": 1e-06, + "loss": 0.0243, + "num_tokens": 88505129.0, + "reward": 6.227874755859375, + "reward_std": 1.1391370296478271, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 6.227874755859375, + "rewards/icrm_reward/std": 1.4286152124404907, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1528.0, + "completions/mean_length": 1208.939453125, + "completions/mean_terminated_length": 1192.8544921875, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "epoch": 0.1355421686746988, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5687003135681152, + "learning_rate": 1e-06, + "loss": 0.0298, + "num_tokens": 89169002.0, + "reward": 6.2486572265625, + "reward_std": 1.2031638622283936, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 6.2486572265625, + "rewards/icrm_reward/std": 1.4246164560317993, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.041015625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1530.0, + "completions/mean_length": 1182.537109375, + "completions/mean_terminated_length": 1167.4195556640625, + "completions/min_length": 692.0, + "completions/min_terminated_length": 692.0, + "epoch": 0.13629518072289157, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5957708954811096, + "learning_rate": 1e-06, + "loss": 0.0204, + "num_tokens": 89815453.0, + "reward": 6.5264739990234375, + "reward_std": 1.2282458543777466, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 6.5264739990234375, + "rewards/icrm_reward/std": 1.530551552772522, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1531.0, + "completions/mean_length": 1234.216796875, + "completions/mean_terminated_length": 1216.7581787109375, + "completions/min_length": 748.0, + "completions/min_terminated_length": 748.0, + "epoch": 0.13704819277108435, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5908061265945435, + "learning_rate": 1e-06, + "loss": 0.0305, + "num_tokens": 90494140.0, + "reward": 6.6832275390625, + "reward_std": 1.277160882949829, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 6.6832275390625, + "rewards/icrm_reward/std": 1.484665036201477, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.041015625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1531.0, + "completions/mean_length": 1215.9609375, + "completions/mean_terminated_length": 1202.27294921875, + "completions/min_length": 847.0, + "completions/min_terminated_length": 847.0, + "epoch": 0.1378012048192771, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5943951606750488, + "learning_rate": 1e-06, + "loss": 0.0314, + "num_tokens": 91161768.0, + "reward": 6.722076416015625, + "reward_std": 1.2803276777267456, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 6.722076416015625, + "rewards/icrm_reward/std": 1.427648663520813, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1526.0, + "completions/mean_length": 1190.53515625, + "completions/mean_terminated_length": 1185.74658203125, + "completions/min_length": 821.0, + "completions/min_terminated_length": 821.0, + "epoch": 0.13855421686746988, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6678600907325745, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 91819498.0, + "reward": 6.924530029296875, + "reward_std": 1.1015572547912598, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 6.924530029296875, + "rewards/icrm_reward/std": 1.4232861995697021, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1522.0, + "completions/mean_length": 1157.99609375, + "completions/mean_terminated_length": 1152.7564697265625, + "completions/min_length": 667.0, + "completions/min_terminated_length": 667.0, + "epoch": 0.13930722891566266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8672060966491699, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 92457784.0, + "reward": 7.066802978515625, + "reward_std": 1.08809494972229, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 7.066802978515625, + "rewards/icrm_reward/std": 1.4011895656585693, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1534.0, + "completions/mean_length": 1160.62109375, + "completions/mean_terminated_length": 1158.40869140625, + "completions/min_length": 675.0, + "completions/min_terminated_length": 675.0, + "epoch": 0.14006024096385541, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6453976631164551, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 93096710.0, + "reward": 6.9345703125, + "reward_std": 1.012839913368225, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 6.9345703125, + "rewards/icrm_reward/std": 1.399297833442688, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 1187.58984375, + "completions/mean_terminated_length": 1174.1622314453125, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "epoch": 0.1408132530120482, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6340705752372742, + "learning_rate": 1e-06, + "loss": 0.0161, + "num_tokens": 93748820.0, + "reward": 6.9169769287109375, + "reward_std": 1.168637752532959, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 6.9169769287109375, + "rewards/icrm_reward/std": 1.6049163341522217, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1515.0, + "completions/mean_length": 1109.232421875, + "completions/mean_terminated_length": 1107.558837890625, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "epoch": 0.14156626506024098, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6763615012168884, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 94361179.0, + "reward": 7.055084228515625, + "reward_std": 1.0929665565490723, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 7.055084228515625, + "rewards/icrm_reward/std": 1.364301085472107, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1526.0, + "completions/mean_length": 1106.5546875, + "completions/mean_terminated_length": 1104.87060546875, + "completions/min_length": 670.0, + "completions/min_terminated_length": 670.0, + "epoch": 0.14231927710843373, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6773504018783569, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 94974855.0, + "reward": 6.922271728515625, + "reward_std": 1.015369176864624, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 6.922271728515625, + "rewards/icrm_reward/std": 1.2387880086898804, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1529.0, + "completions/max_terminated_length": 1529.0, + "completions/mean_length": 1116.544921875, + "completions/mean_terminated_length": 1116.544921875, + "completions/min_length": 626.0, + "completions/min_terminated_length": 626.0, + "epoch": 0.1430722891566265, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7186038494110107, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 95589822.0, + "reward": 7.066253662109375, + "reward_std": 1.0190629959106445, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 7.066253662109375, + "rewards/icrm_reward/std": 1.3290399312973022, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1529.0, + "completions/mean_length": 1157.650390625, + "completions/mean_terminated_length": 1151.6448974609375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.1438253012048193, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6707749962806702, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 96230603.0, + "reward": 7.4195098876953125, + "reward_std": 1.1656768321990967, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 7.4195098876953125, + "rewards/icrm_reward/std": 1.4572044610977173, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1532.0, + "completions/mean_length": 1170.634765625, + "completions/mean_terminated_length": 1158.8487548828125, + "completions/min_length": 763.0, + "completions/min_terminated_length": 763.0, + "epoch": 0.14457831325301204, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6166041493415833, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 96875520.0, + "reward": 7.4496612548828125, + "reward_std": 1.2275352478027344, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 7.4496612548828125, + "rewards/icrm_reward/std": 1.4913699626922607, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1524.0, + "completions/mean_length": 1180.185546875, + "completions/mean_terminated_length": 1164.210205078125, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.14533132530120482, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6371023654937744, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 97525631.0, + "reward": 7.562042236328125, + "reward_std": 1.3294496536254883, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 7.562042236328125, + "rewards/icrm_reward/std": 1.7725695371627808, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1495.0, + "completions/mean_length": 1154.130859375, + "completions/mean_terminated_length": 1148.0694580078125, + "completions/min_length": 651.0, + "completions/min_terminated_length": 651.0, + "epoch": 0.1460843373493976, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0944349765777588, + "learning_rate": 1e-06, + "loss": 0.0177, + "num_tokens": 98168146.0, + "reward": 7.829681396484375, + "reward_std": 1.3168985843658447, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 7.829681396484375, + "rewards/icrm_reward/std": 1.9035899639129639, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1528.0, + "completions/mean_length": 1199.44140625, + "completions/mean_terminated_length": 1194.0992431640625, + "completions/min_length": 752.0, + "completions/min_terminated_length": 752.0, + "epoch": 0.14683734939759036, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.596019446849823, + "learning_rate": 1e-06, + "loss": 0.0205, + "num_tokens": 98827876.0, + "reward": 7.8392791748046875, + "reward_std": 1.417443037033081, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 7.8392791748046875, + "rewards/icrm_reward/std": 1.6570779085159302, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1527.0, + "completions/mean_length": 1147.794921875, + "completions/mean_terminated_length": 1143.1917724609375, + "completions/min_length": 735.0, + "completions/min_terminated_length": 735.0, + "epoch": 0.14759036144578314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6314316391944885, + "learning_rate": 1e-06, + "loss": 0.0151, + "num_tokens": 99459163.0, + "reward": 7.6285400390625, + "reward_std": 1.2673888206481934, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 7.6285400390625, + "rewards/icrm_reward/std": 1.7081259489059448, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1533.0, + "completions/mean_length": 1166.373046875, + "completions/mean_terminated_length": 1163.462646484375, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "epoch": 0.14834337349397592, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6306869983673096, + "learning_rate": 1e-06, + "loss": 0.0183, + "num_tokens": 100107098.0, + "reward": 7.62646484375, + "reward_std": 1.1987028121948242, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 7.62646484375, + "rewards/icrm_reward/std": 1.6772334575653076, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1521.0, + "completions/mean_length": 1118.8125, + "completions/mean_terminated_length": 1116.3536376953125, + "completions/min_length": 584.0, + "completions/min_terminated_length": 584.0, + "epoch": 0.14909638554216867, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7667272686958313, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 100727226.0, + "reward": 7.72003173828125, + "reward_std": 1.3245576620101929, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 7.72003173828125, + "rewards/icrm_reward/std": 1.9072823524475098, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1531.0, + "completions/mean_length": 1159.931640625, + "completions/mean_terminated_length": 1158.123779296875, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "epoch": 0.14984939759036145, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0288300514221191, + "learning_rate": 1e-06, + "loss": 0.0128, + "num_tokens": 101365447.0, + "reward": 8.526580810546875, + "reward_std": 1.3684407472610474, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 8.526580810546875, + "rewards/icrm_reward/std": 1.7263346910476685, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1501.0, + "completions/mean_length": 1161.853515625, + "completions/mean_terminated_length": 1158.1636962890625, + "completions/min_length": 788.0, + "completions/min_terminated_length": 788.0, + "epoch": 0.15060240963855423, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.744748592376709, + "learning_rate": 1e-06, + "loss": 0.0151, + "num_tokens": 102006172.0, + "reward": 8.15716552734375, + "reward_std": 1.3181054592132568, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/icrm_reward/mean": 8.15716552734375, + "rewards/icrm_reward/std": 1.5644598007202148, + "step": 200 + } + ], + "logging_steps": 1, + "max_steps": 1328, + "num_input_tokens_seen": 102006172, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}