Reacherx commited on
Commit
f6a78b4
·
verified ·
1 Parent(s): b89b0df

Training in progress, step 1000, checkpoint

Browse files
Files changed (28) hide show
  1. last-checkpoint/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  2. last-checkpoint/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  3. last-checkpoint/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  4. last-checkpoint/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  5. last-checkpoint/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  6. last-checkpoint/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  7. last-checkpoint/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  8. last-checkpoint/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt +3 -0
  9. last-checkpoint/global_step1000/zero_pp_rank_1_mp_rank_00_model_states.pt +3 -0
  10. last-checkpoint/global_step1000/zero_pp_rank_2_mp_rank_00_model_states.pt +3 -0
  11. last-checkpoint/global_step1000/zero_pp_rank_3_mp_rank_00_model_states.pt +3 -0
  12. last-checkpoint/global_step1000/zero_pp_rank_4_mp_rank_00_model_states.pt +3 -0
  13. last-checkpoint/global_step1000/zero_pp_rank_5_mp_rank_00_model_states.pt +3 -0
  14. last-checkpoint/global_step1000/zero_pp_rank_6_mp_rank_00_model_states.pt +3 -0
  15. last-checkpoint/latest +1 -1
  16. last-checkpoint/model-00001-of-00004.safetensors +1 -1
  17. last-checkpoint/model-00002-of-00004.safetensors +1 -1
  18. last-checkpoint/model-00003-of-00004.safetensors +1 -1
  19. last-checkpoint/model-00004-of-00004.safetensors +1 -1
  20. last-checkpoint/rng_state_0.pth +1 -1
  21. last-checkpoint/rng_state_1.pth +1 -1
  22. last-checkpoint/rng_state_2.pth +1 -1
  23. last-checkpoint/rng_state_3.pth +1 -1
  24. last-checkpoint/rng_state_4.pth +1 -1
  25. last-checkpoint/rng_state_5.pth +1 -1
  26. last-checkpoint/rng_state_6.pth +1 -1
  27. last-checkpoint/scheduler.pt +1 -1
  28. last-checkpoint/trainer_state.json +1602 -2
last-checkpoint/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e5d318bcfe57b0fd139d89e3a1fc397ad1bea5a01f8c2ec82190bd7f6575ea8
3
+ size 14215152126
last-checkpoint/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3791076cf05d4ea41de9715dd825b5b6462b9efde945b42740d21e07b66971a
3
+ size 14215152126
last-checkpoint/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3961831ee127555db427ff704d4c259afe64c910664acbd3eb62d57152de9e25
3
+ size 14215152126
last-checkpoint/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9286fb40c4a7705be78fec28246b946fd8452bc84997300afada2924b682789e
3
+ size 14215152126
last-checkpoint/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c22b14c332262f8f9c7e684d9ca5d8410487d42dc2add5254f9580fb614ab988
3
+ size 14215152126
last-checkpoint/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f302fc10a8f3ec279ce8b0fb8aeb9dcc314f957410d0524b7d5cf13e3e125e2
3
+ size 14215152126
last-checkpoint/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2e493911b31601bb7d6ae76d0fd42b5cdf6631f10173376b0228eab5645d865
3
+ size 14215152126
last-checkpoint/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12c52b26b6827d29df0fca8e30710877a8a0b448d9f7f43c326955fb5ed381bf
3
+ size 349379
last-checkpoint/global_step1000/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66cd09f0c195afa8c1c19a76e5b4a2f1f1e5b88ee98a3cd2aab39f38c5813f2e
3
+ size 349379
last-checkpoint/global_step1000/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c420ec50e4cd243bdb1edb1e29a775e650a89f64ab88598f977cc85f677fb80
3
+ size 349379
last-checkpoint/global_step1000/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bf65e95b06264e74e75f8deca27327d215322ff47aa58cc93dfe7d113bfbf32
3
+ size 349379
last-checkpoint/global_step1000/zero_pp_rank_4_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d6de8c6edf380e4707198e62b25e1f25479b11a46c9473c2112915879563884
3
+ size 349379
last-checkpoint/global_step1000/zero_pp_rank_5_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecd0b3abcf20b018d12d2cfc3cfdee930a86ef1b852cd044326340604345f2cd
3
+ size 349379
last-checkpoint/global_step1000/zero_pp_rank_6_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f2667f93eb116e834165ce6e7bdea3ae15b8155467aefef118740fa857dabcc
3
+ size 349379
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step900
 
1
+ global_step1000
last-checkpoint/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7f71b5c20029f143abe32a8db029a9d3bd8c334d8d34d7fc9804705b07a5ea0
3
  size 4968243304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f217c2edcf986aa252f9dc1d2f54208b083b90bb3aeac66b7a88e4c1e82ffd0
3
  size 4968243304
last-checkpoint/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fcb9c921436e99521105a4f8d656a7b593055edd22f3eeea9ac94ea1d2513d41
3
  size 4991495816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74a942f3fdc500b90d779547eafb7be72d6daa1f471473c0748293e667bd1b56
3
  size 4991495816
last-checkpoint/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cdb91687ab05bad5705661001b77c36d26a6777ee6db23c420ab39b9492d4f10
3
  size 4932751040
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec4c1b2bb9b92b6e257c2103bd96f6f25051981146aba585f9580dad4cd6dd3f
3
  size 4932751040
last-checkpoint/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:26bf7b0ee993b1bf86b8730f1cb42b08885787f2ed3ef1442fa069f6efac654d
3
  size 1691924384
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46ee0d8415c7d4ca5b68681fe3b8772ad2fc02e3cbb2452fe72d88084cd8012c
3
  size 1691924384
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e0164f2208c7041cd8088d35a6d736cb760990d81607389689f8607bafc07582
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a467c4d473c4476133e0c962682ae3bd1eadb5b659536096b8f126b374b5fef
3
  size 15920
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38a39cd0cbca01be223d0e166aa7bc089071dad96861bd60647a479ac4fa3505
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e743495ecb0f3dcb697352d75c19d02d9cf64404eeb5050d2c4a404e1cbacd7f
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:067c9c98cc0f1f78cd36390632838cc5489aef20eefe30db85c261959e9e7d3d
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e9fe5aeeb743f2a0bae96552be01addb99af031bae160fd209d89993f3074f0
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8fb5ffc5fb0d4f9659807f6f7f2fe78e6a34689abae8fc7197e45d21d5630c59
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f42a876651e074ff662aeb862e8a5177c3427016384b0c21b126dd991e7a54c3
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4408661536ff8dc6a041602d206d533370a1a03af1d5db63c613f64034f1288c
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6443d543afed862f54a00e23ced53d4e4f0e332d02f9fe49cf63681e33a1f925
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4def26f501edfecc05134202363535fb18daf6049ffd7569b3ea553c45ee78ac
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ad289fc655a368bdb83b0f67e858b03d811498d49979570c148e0f7dd8e6695
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04d75ad610f93d275b4a619ada47209ce31058f73212b3c9d5d3bc3bc6864a98
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca54964f1223bdd6d718543b7f1eb959ed6fc0fae7c10ce41dd0577e3c5efdeb
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9398c477eca57434cdee1034aff1b8e689e210acd0d746ee26b046a15d3a9ade
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e1a9307dd6491b636ce5ac79ba6d6ae4102618ffd8fd75198a2157f876beafe
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.03838116764041111,
5
  "eval_steps": 500,
6
- "global_step": 900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -14407,6 +14407,1606 @@
14407
  "rewards/format_reward": 1.0,
14408
  "step": 900,
14409
  "temporal_rewards": 0.714285671710968
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14410
  }
14411
  ],
14412
  "logging_steps": 1.0,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.042645741822679,
5
  "eval_steps": 500,
6
+ "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
14407
  "rewards/format_reward": 1.0,
14408
  "step": 900,
14409
  "temporal_rewards": 0.714285671710968
14410
+ },
14411
+ {
14412
+ "all_correct": 0.14285714285714285,
14413
+ "all_wrong": 0.42857142857142855,
14414
+ "completion_length": 388.4821472167969,
14415
+ "epoch": 0.03842381338223379,
14416
+ "grad_norm": 1.388325239051785,
14417
+ "kl": 0.07861328125,
14418
+ "learning_rate": 9.963615763757953e-07,
14419
+ "loss": 0.0031,
14420
+ "reward": 1.4394482374191284,
14421
+ "reward_std": 0.09279949963092804,
14422
+ "rewards/accuracy_reward": 0.3323054015636444,
14423
+ "rewards/format_reward": 1.0,
14424
+ "step": 901,
14425
+ "temporal_rewards": 0.714285671710968
14426
+ },
14427
+ {
14428
+ "all_correct": 0.2857142857142857,
14429
+ "all_wrong": 0.2857142857142857,
14430
+ "completion_length": 437.51788330078125,
14431
+ "epoch": 0.03846645912405646,
14432
+ "grad_norm": 1.026819463325266,
14433
+ "kl": 0.06884765625,
14434
+ "learning_rate": 9.963535053184923e-07,
14435
+ "loss": 0.0028,
14436
+ "reward": 1.549193024635315,
14437
+ "reward_std": 0.10150664299726486,
14438
+ "rewards/accuracy_reward": 0.4420502781867981,
14439
+ "rewards/format_reward": 1.0,
14440
+ "step": 902,
14441
+ "temporal_rewards": 0.5
14442
+ },
14443
+ {
14444
+ "all_correct": 0.42857142857142855,
14445
+ "all_wrong": 0.14285714285714285,
14446
+ "completion_length": 414.2321472167969,
14447
+ "epoch": 0.03850910486587914,
14448
+ "grad_norm": 1.2365418501254621,
14449
+ "kl": 0.059814453125,
14450
+ "learning_rate": 9.96345425351918e-07,
14451
+ "loss": 0.0024,
14452
+ "reward": 1.782142996788025,
14453
+ "reward_std": 0.22749534249305725,
14454
+ "rewards/accuracy_reward": 0.6785714626312256,
14455
+ "rewards/format_reward": 1.0,
14456
+ "step": 903,
14457
+ "temporal_rewards": 0.5
14458
+ },
14459
+ {
14460
+ "all_correct": 0.2857142857142857,
14461
+ "all_wrong": 0.14285714285714285,
14462
+ "completion_length": 368.5535888671875,
14463
+ "epoch": 0.03855175060770182,
14464
+ "grad_norm": 1.8688669932203024,
14465
+ "kl": 0.0849609375,
14466
+ "learning_rate": 9.963373364762176e-07,
14467
+ "loss": 0.0034,
14468
+ "reward": 1.7157738208770752,
14469
+ "reward_std": 0.2985364496707916,
14470
+ "rewards/accuracy_reward": 0.5372024178504944,
14471
+ "rewards/format_reward": 1.0,
14472
+ "step": 904,
14473
+ "temporal_rewards": 0.6428571343421936
14474
+ },
14475
+ {
14476
+ "all_correct": 0.0,
14477
+ "all_wrong": 0.0,
14478
+ "completion_length": 423.4821472167969,
14479
+ "epoch": 0.0385943963495245,
14480
+ "grad_norm": 2.135810610374038,
14481
+ "kl": 0.06201171875,
14482
+ "learning_rate": 9.963292386915358e-07,
14483
+ "loss": 0.0025,
14484
+ "reward": 1.6755682229995728,
14485
+ "reward_std": 0.19589506089687347,
14486
+ "rewards/accuracy_reward": 0.5005680322647095,
14487
+ "rewards/format_reward": 1.0,
14488
+ "step": 905,
14489
+ "temporal_rewards": 0.5714285373687744
14490
+ },
14491
+ {
14492
+ "all_correct": 0.2857142857142857,
14493
+ "all_wrong": 0.0,
14494
+ "completion_length": 422.1071472167969,
14495
+ "epoch": 0.038637042091347176,
14496
+ "grad_norm": 2.051111400485243,
14497
+ "kl": 0.07080078125,
14498
+ "learning_rate": 9.963211319980185e-07,
14499
+ "loss": 0.0028,
14500
+ "reward": 1.6941068172454834,
14501
+ "reward_std": 0.3651140630245209,
14502
+ "rewards/accuracy_reward": 0.5173211693763733,
14503
+ "rewards/format_reward": 1.0,
14504
+ "step": 906,
14505
+ "temporal_rewards": 0.7857142686843872
14506
+ },
14507
+ {
14508
+ "all_correct": 0.14285714285714285,
14509
+ "all_wrong": 0.5714285714285714,
14510
+ "completion_length": 388.1607360839844,
14511
+ "epoch": 0.03867968783316986,
14512
+ "grad_norm": 1.2286741388000242,
14513
+ "kl": 0.068359375,
14514
+ "learning_rate": 9.963130163958108e-07,
14515
+ "loss": 0.0027,
14516
+ "reward": 1.3034491539001465,
14517
+ "reward_std": 0.12692005932331085,
14518
+ "rewards/accuracy_reward": 0.25344905257225037,
14519
+ "rewards/format_reward": 1.0,
14520
+ "step": 907,
14521
+ "temporal_rewards": 0.6428571343421936
14522
+ },
14523
+ {
14524
+ "all_correct": 0.14285714285714285,
14525
+ "all_wrong": 0.2857142857142857,
14526
+ "completion_length": 438.0535888671875,
14527
+ "epoch": 0.03872233357499254,
14528
+ "grad_norm": 1.1109793404981478,
14529
+ "kl": 0.05419921875,
14530
+ "learning_rate": 9.963048918850585e-07,
14531
+ "loss": 0.0022,
14532
+ "reward": 1.4734070301055908,
14533
+ "reward_std": 0.21417297422885895,
14534
+ "rewards/accuracy_reward": 0.30912119150161743,
14535
+ "rewards/format_reward": 0.9821429252624512,
14536
+ "step": 908,
14537
+ "temporal_rewards": 0.714285671710968
14538
+ },
14539
+ {
14540
+ "all_correct": 0.5714285714285714,
14541
+ "all_wrong": 0.0,
14542
+ "completion_length": 386.14288330078125,
14543
+ "epoch": 0.03876497931681522,
14544
+ "grad_norm": 1.4176361827714907,
14545
+ "kl": 0.0791015625,
14546
+ "learning_rate": 9.962967584659075e-07,
14547
+ "loss": 0.0032,
14548
+ "reward": 2.136169910430908,
14549
+ "reward_std": 0.14292067289352417,
14550
+ "rewards/accuracy_reward": 0.8022412061691284,
14551
+ "rewards/format_reward": 1.0,
14552
+ "step": 909,
14553
+ "temporal_rewards": 0.7857142686843872
14554
+ },
14555
+ {
14556
+ "all_correct": 0.14285714285714285,
14557
+ "all_wrong": 0.14285714285714285,
14558
+ "completion_length": 414.1964416503906,
14559
+ "epoch": 0.0388076250586379,
14560
+ "grad_norm": 1.40762891931829,
14561
+ "kl": 0.07275390625,
14562
+ "learning_rate": 9.962886161385037e-07,
14563
+ "loss": 0.0029,
14564
+ "reward": 1.5685076713562012,
14565
+ "reward_std": 0.1862516850233078,
14566
+ "rewards/accuracy_reward": 0.4113648533821106,
14567
+ "rewards/format_reward": 1.0,
14568
+ "step": 910,
14569
+ "temporal_rewards": 0.5714285373687744
14570
+ },
14571
+ {
14572
+ "all_correct": 0.42857142857142855,
14573
+ "all_wrong": 0.14285714285714285,
14574
+ "completion_length": 427.6250305175781,
14575
+ "epoch": 0.03885027080046057,
14576
+ "grad_norm": 1.4988795157708306,
14577
+ "kl": 0.06396484375,
14578
+ "learning_rate": 9.962804649029936e-07,
14579
+ "loss": 0.0026,
14580
+ "reward": 1.8500804901123047,
14581
+ "reward_std": 0.1730491816997528,
14582
+ "rewards/accuracy_reward": 0.7072232365608215,
14583
+ "rewards/format_reward": 0.9821429252624512,
14584
+ "step": 911,
14585
+ "temporal_rewards": 0.5714285373687744
14586
+ },
14587
+ {
14588
+ "all_correct": 0.14285714285714285,
14589
+ "all_wrong": 0.5714285714285714,
14590
+ "completion_length": 405.6964416503906,
14591
+ "epoch": 0.03889291654228325,
14592
+ "grad_norm": 1.9102132785830699,
14593
+ "kl": 0.06591796875,
14594
+ "learning_rate": 9.96272304759523e-07,
14595
+ "loss": 0.0026,
14596
+ "reward": 1.5267857313156128,
14597
+ "reward_std": 0.1490623652935028,
14598
+ "rewards/accuracy_reward": 0.392857164144516,
14599
+ "rewards/format_reward": 1.0,
14600
+ "step": 912,
14601
+ "temporal_rewards": 0.7857142686843872
14602
+ },
14603
+ {
14604
+ "all_correct": 0.14285714285714285,
14605
+ "all_wrong": 0.2857142857142857,
14606
+ "completion_length": 414.1964416503906,
14607
+ "epoch": 0.03893556228410593,
14608
+ "grad_norm": 1.7387254504787168,
14609
+ "kl": 0.05712890625,
14610
+ "learning_rate": 9.962641357082387e-07,
14611
+ "loss": 0.0023,
14612
+ "reward": 1.6454274654388428,
14613
+ "reward_std": 0.2543186545372009,
14614
+ "rewards/accuracy_reward": 0.5025703310966492,
14615
+ "rewards/format_reward": 1.0,
14616
+ "step": 913,
14617
+ "temporal_rewards": 0.6428571343421936
14618
+ },
14619
+ {
14620
+ "all_correct": 0.8571428571428571,
14621
+ "all_wrong": 0.14285714285714285,
14622
+ "completion_length": 370.3214416503906,
14623
+ "epoch": 0.038978208025928614,
14624
+ "grad_norm": 1.2170320478100345,
14625
+ "kl": 0.0751953125,
14626
+ "learning_rate": 9.962559577492871e-07,
14627
+ "loss": 0.003,
14628
+ "reward": 2.015306234359741,
14629
+ "reward_std": 0.05134067311882973,
14630
+ "rewards/accuracy_reward": 0.8367346525192261,
14631
+ "rewards/format_reward": 1.0,
14632
+ "step": 914,
14633
+ "temporal_rewards": 0.5
14634
+ },
14635
+ {
14636
+ "all_correct": 0.14285714285714285,
14637
+ "all_wrong": 0.14285714285714285,
14638
+ "completion_length": 407.08929443359375,
14639
+ "epoch": 0.03902085376775129,
14640
+ "grad_norm": 2.075396285314163,
14641
+ "kl": 0.058349609375,
14642
+ "learning_rate": 9.962477708828152e-07,
14643
+ "loss": 0.0023,
14644
+ "reward": 1.505526065826416,
14645
+ "reward_std": 0.31088876724243164,
14646
+ "rewards/accuracy_reward": 0.37695467472076416,
14647
+ "rewards/format_reward": 1.0,
14648
+ "step": 915,
14649
+ "temporal_rewards": 0.5
14650
+ },
14651
+ {
14652
+ "all_correct": 0.2857142857142857,
14653
+ "all_wrong": 0.14285714285714285,
14654
+ "completion_length": 363.9285888671875,
14655
+ "epoch": 0.03906349950957397,
14656
+ "grad_norm": 2.0530572811195458,
14657
+ "kl": 0.07958984375,
14658
+ "learning_rate": 9.9623957510897e-07,
14659
+ "loss": 0.0032,
14660
+ "reward": 1.8374649286270142,
14661
+ "reward_std": 0.30604711174964905,
14662
+ "rewards/accuracy_reward": 0.6463934183120728,
14663
+ "rewards/format_reward": 1.0,
14664
+ "step": 916,
14665
+ "temporal_rewards": 0.6428571343421936
14666
+ },
14667
+ {
14668
+ "all_correct": 0.5714285714285714,
14669
+ "all_wrong": 0.0,
14670
+ "completion_length": 421.2321472167969,
14671
+ "epoch": 0.03910614525139665,
14672
+ "grad_norm": 1.8443845860137627,
14673
+ "kl": 0.06787109375,
14674
+ "learning_rate": 9.962313704278981e-07,
14675
+ "loss": 0.0027,
14676
+ "reward": 2.0418219566345215,
14677
+ "reward_std": 0.21194705367088318,
14678
+ "rewards/accuracy_reward": 0.7382504940032959,
14679
+ "rewards/format_reward": 1.0,
14680
+ "step": 917,
14681
+ "temporal_rewards": 0.7857142686843872
14682
+ },
14683
+ {
14684
+ "all_correct": 0.0,
14685
+ "all_wrong": 0.0,
14686
+ "completion_length": 402.58929443359375,
14687
+ "epoch": 0.03914879099321933,
14688
+ "grad_norm": 1.8829320648015981,
14689
+ "kl": 0.05712890625,
14690
+ "learning_rate": 9.962231568397472e-07,
14691
+ "loss": 0.0023,
14692
+ "reward": 1.5507712364196777,
14693
+ "reward_std": 0.21578750014305115,
14694
+ "rewards/accuracy_reward": 0.43291404843330383,
14695
+ "rewards/format_reward": 1.0,
14696
+ "step": 918,
14697
+ "temporal_rewards": 0.5
14698
+ },
14699
+ {
14700
+ "all_correct": 0.2857142857142857,
14701
+ "all_wrong": 0.0,
14702
+ "completion_length": 403.4821472167969,
14703
+ "epoch": 0.039191436735042,
14704
+ "grad_norm": 1.6641626617634653,
14705
+ "kl": 0.0732421875,
14706
+ "learning_rate": 9.96214934344665e-07,
14707
+ "loss": 0.0029,
14708
+ "reward": 1.9780364036560059,
14709
+ "reward_std": 0.2805452048778534,
14710
+ "rewards/accuracy_reward": 0.6637506484985352,
14711
+ "rewards/format_reward": 1.0,
14712
+ "step": 919,
14713
+ "temporal_rewards": 0.714285671710968
14714
+ },
14715
+ {
14716
+ "all_correct": 0.2857142857142857,
14717
+ "all_wrong": 0.14285714285714285,
14718
+ "completion_length": 406.5357360839844,
14719
+ "epoch": 0.03923408247686468,
14720
+ "grad_norm": 1.7210279625723943,
14721
+ "kl": 0.064453125,
14722
+ "learning_rate": 9.962067029427983e-07,
14723
+ "loss": 0.0026,
14724
+ "reward": 1.669837236404419,
14725
+ "reward_std": 0.16734497249126434,
14726
+ "rewards/accuracy_reward": 0.5216229557991028,
14727
+ "rewards/format_reward": 1.0,
14728
+ "step": 920,
14729
+ "temporal_rewards": 0.5714285373687744
14730
+ },
14731
+ {
14732
+ "all_correct": 0.14285714285714285,
14733
+ "all_wrong": 0.2857142857142857,
14734
+ "completion_length": 470.5000305175781,
14735
+ "epoch": 0.039276728218687364,
14736
+ "grad_norm": 1.2527360686901718,
14737
+ "kl": 0.05322265625,
14738
+ "learning_rate": 9.961984626342956e-07,
14739
+ "loss": 0.0021,
14740
+ "reward": 1.4912724494934082,
14741
+ "reward_std": 0.09840281307697296,
14742
+ "rewards/accuracy_reward": 0.3662723898887634,
14743
+ "rewards/format_reward": 1.0,
14744
+ "step": 921,
14745
+ "temporal_rewards": 0.5714285373687744
14746
+ },
14747
+ {
14748
+ "all_correct": 0.42857142857142855,
14749
+ "all_wrong": 0.14285714285714285,
14750
+ "completion_length": 447.6607360839844,
14751
+ "epoch": 0.039319373960510044,
14752
+ "grad_norm": 1.6881157347849156,
14753
+ "kl": 0.05615234375,
14754
+ "learning_rate": 9.961902134193045e-07,
14755
+ "loss": 0.0022,
14756
+ "reward": 1.6204907894134521,
14757
+ "reward_std": 0.10691835731267929,
14758
+ "rewards/accuracy_reward": 0.5097763538360596,
14759
+ "rewards/format_reward": 1.0,
14760
+ "step": 922,
14761
+ "temporal_rewards": 0.5
14762
+ },
14763
+ {
14764
+ "all_correct": 0.14285714285714285,
14765
+ "all_wrong": 0.14285714285714285,
14766
+ "completion_length": 424.64288330078125,
14767
+ "epoch": 0.039362019702332725,
14768
+ "grad_norm": 1.4525888204858919,
14769
+ "kl": 0.06591796875,
14770
+ "learning_rate": 9.96181955297973e-07,
14771
+ "loss": 0.0026,
14772
+ "reward": 1.608081579208374,
14773
+ "reward_std": 0.3661433458328247,
14774
+ "rewards/accuracy_reward": 0.45808160305023193,
14775
+ "rewards/format_reward": 1.0,
14776
+ "step": 923,
14777
+ "temporal_rewards": 0.5714285373687744
14778
+ },
14779
+ {
14780
+ "all_correct": 0.42857142857142855,
14781
+ "all_wrong": 0.2857142857142857,
14782
+ "completion_length": 427.4464416503906,
14783
+ "epoch": 0.0394046654441554,
14784
+ "grad_norm": 2.6242606472855163,
14785
+ "kl": 0.06787109375,
14786
+ "learning_rate": 9.961736882704497e-07,
14787
+ "loss": 0.0027,
14788
+ "reward": 1.65829336643219,
14789
+ "reward_std": 0.12387296557426453,
14790
+ "rewards/accuracy_reward": 0.5118646621704102,
14791
+ "rewards/format_reward": 1.0,
14792
+ "step": 924,
14793
+ "temporal_rewards": 0.6428571343421936
14794
+ },
14795
+ {
14796
+ "all_correct": 0.2857142857142857,
14797
+ "all_wrong": 0.0,
14798
+ "completion_length": 428.0714416503906,
14799
+ "epoch": 0.03944731118597808,
14800
+ "grad_norm": 2.012136982924822,
14801
+ "kl": 0.056640625,
14802
+ "learning_rate": 9.961654123368824e-07,
14803
+ "loss": 0.0023,
14804
+ "reward": 1.6333997249603271,
14805
+ "reward_std": 0.11353455483913422,
14806
+ "rewards/accuracy_reward": 0.42268532514572144,
14807
+ "rewards/format_reward": 1.0,
14808
+ "step": 925,
14809
+ "temporal_rewards": 0.6428571343421936
14810
+ },
14811
+ {
14812
+ "all_correct": 0.2857142857142857,
14813
+ "all_wrong": 0.14285714285714285,
14814
+ "completion_length": 385.1964416503906,
14815
+ "epoch": 0.03948995692780076,
14816
+ "grad_norm": 1.6158760149356628,
14817
+ "kl": 0.078125,
14818
+ "learning_rate": 9.9615712749742e-07,
14819
+ "loss": 0.0031,
14820
+ "reward": 1.5787651538848877,
14821
+ "reward_std": 0.1297575682401657,
14822
+ "rewards/accuracy_reward": 0.45019370317459106,
14823
+ "rewards/format_reward": 1.0,
14824
+ "step": 926,
14825
+ "temporal_rewards": 0.5714285373687744
14826
+ },
14827
+ {
14828
+ "all_correct": 0.14285714285714285,
14829
+ "all_wrong": 0.14285714285714285,
14830
+ "completion_length": 430.3571472167969,
14831
+ "epoch": 0.03953260266962344,
14832
+ "grad_norm": 5.925694109606645,
14833
+ "kl": 0.0654296875,
14834
+ "learning_rate": 9.961488337522113e-07,
14835
+ "loss": 0.0026,
14836
+ "reward": 1.468300223350525,
14837
+ "reward_std": 0.21330617368221283,
14838
+ "rewards/accuracy_reward": 0.38080018758773804,
14839
+ "rewards/format_reward": 0.9821429252624512,
14840
+ "step": 927,
14841
+ "temporal_rewards": 0.5
14842
+ },
14843
+ {
14844
+ "all_correct": 0.42857142857142855,
14845
+ "all_wrong": 0.2857142857142857,
14846
+ "completion_length": 424.7321472167969,
14847
+ "epoch": 0.039575248411446114,
14848
+ "grad_norm": 1.3415082894145172,
14849
+ "kl": 0.06298828125,
14850
+ "learning_rate": 9.96140531101405e-07,
14851
+ "loss": 0.0025,
14852
+ "reward": 1.7482143640518188,
14853
+ "reward_std": 0.21537911891937256,
14854
+ "rewards/accuracy_reward": 0.625,
14855
+ "rewards/format_reward": 1.0,
14856
+ "step": 928,
14857
+ "temporal_rewards": 0.5714285373687744
14858
+ },
14859
+ {
14860
+ "all_correct": 0.0,
14861
+ "all_wrong": 0.14285714285714285,
14862
+ "completion_length": 390.9464416503906,
14863
+ "epoch": 0.039617894153268794,
14864
+ "grad_norm": 2.2955946940227503,
14865
+ "kl": 0.055908203125,
14866
+ "learning_rate": 9.961322195451497e-07,
14867
+ "loss": 0.0022,
14868
+ "reward": 1.515081524848938,
14869
+ "reward_std": 0.18819458782672882,
14870
+ "rewards/accuracy_reward": 0.2865099012851715,
14871
+ "rewards/format_reward": 1.0,
14872
+ "step": 929,
14873
+ "temporal_rewards": 0.714285671710968
14874
+ },
14875
+ {
14876
+ "all_correct": 0.2857142857142857,
14877
+ "all_wrong": 0.14285714285714285,
14878
+ "completion_length": 389.7500305175781,
14879
+ "epoch": 0.039660539895091475,
14880
+ "grad_norm": 6.008457943376909,
14881
+ "kl": 0.06591796875,
14882
+ "learning_rate": 9.961238990835957e-07,
14883
+ "loss": 0.0026,
14884
+ "reward": 1.6996906995773315,
14885
+ "reward_std": 0.2757183313369751,
14886
+ "rewards/accuracy_reward": 0.5104049444198608,
14887
+ "rewards/format_reward": 1.0,
14888
+ "step": 930,
14889
+ "temporal_rewards": 0.714285671710968
14890
+ },
14891
+ {
14892
+ "all_correct": 0.5714285714285714,
14893
+ "all_wrong": 0.14285714285714285,
14894
+ "completion_length": 400.0000305175781,
14895
+ "epoch": 0.039703185636914155,
14896
+ "grad_norm": 1.4843921910288904,
14897
+ "kl": 0.07080078125,
14898
+ "learning_rate": 9.961155697168913e-07,
14899
+ "loss": 0.0028,
14900
+ "reward": 1.7803571224212646,
14901
+ "reward_std": 0.16438372433185577,
14902
+ "rewards/accuracy_reward": 0.6071428656578064,
14903
+ "rewards/format_reward": 1.0,
14904
+ "step": 931,
14905
+ "temporal_rewards": 0.714285671710968
14906
+ },
14907
+ {
14908
+ "all_correct": 0.0,
14909
+ "all_wrong": 0.0,
14910
+ "completion_length": 439.89288330078125,
14911
+ "epoch": 0.039745831378736836,
14912
+ "grad_norm": 2.212969575646282,
14913
+ "kl": 0.044189453125,
14914
+ "learning_rate": 9.961072314451865e-07,
14915
+ "loss": 0.0018,
14916
+ "reward": 1.5589354038238525,
14917
+ "reward_std": 0.28393542766571045,
14918
+ "rewards/accuracy_reward": 0.3982209861278534,
14919
+ "rewards/format_reward": 1.0,
14920
+ "step": 932,
14921
+ "temporal_rewards": 0.5714285373687744
14922
+ },
14923
+ {
14924
+ "all_correct": 0.0,
14925
+ "all_wrong": 0.14285714285714285,
14926
+ "completion_length": 464.21429443359375,
14927
+ "epoch": 0.03978847712055951,
14928
+ "grad_norm": 1.53116470188613,
14929
+ "kl": 0.034423828125,
14930
+ "learning_rate": 9.960988842686308e-07,
14931
+ "loss": 0.0014,
14932
+ "reward": 1.314236044883728,
14933
+ "reward_std": 0.1616557240486145,
14934
+ "rewards/accuracy_reward": 0.18923597037792206,
14935
+ "rewards/format_reward": 0.9821429252624512,
14936
+ "step": 933,
14937
+ "temporal_rewards": 0.5714285373687744
14938
+ },
14939
+ {
14940
+ "all_correct": 0.2857142857142857,
14941
+ "all_wrong": 0.14285714285714285,
14942
+ "completion_length": 404.08929443359375,
14943
+ "epoch": 0.03983112286238219,
14944
+ "grad_norm": 1.8325447090521023,
14945
+ "kl": 0.0703125,
14946
+ "learning_rate": 9.96090528187374e-07,
14947
+ "loss": 0.0028,
14948
+ "reward": 1.8988735675811768,
14949
+ "reward_std": 0.2149917185306549,
14950
+ "rewards/accuracy_reward": 0.6917307376861572,
14951
+ "rewards/format_reward": 1.0,
14952
+ "step": 934,
14953
+ "temporal_rewards": 0.6428571343421936
14954
+ },
14955
+ {
14956
+ "all_correct": 0.2857142857142857,
14957
+ "all_wrong": 0.14285714285714285,
14958
+ "completion_length": 412.58929443359375,
14959
+ "epoch": 0.03987376860420487,
14960
+ "grad_norm": 3.4615665759091536,
14961
+ "kl": 0.0703125,
14962
+ "learning_rate": 9.960821632015666e-07,
14963
+ "loss": 0.0028,
14964
+ "reward": 1.7402606010437012,
14965
+ "reward_std": 0.20666787028312683,
14966
+ "rewards/accuracy_reward": 0.517046332359314,
14967
+ "rewards/format_reward": 1.0,
14968
+ "step": 935,
14969
+ "temporal_rewards": 0.714285671710968
14970
+ },
14971
+ {
14972
+ "all_correct": 0.0,
14973
+ "all_wrong": 0.2857142857142857,
14974
+ "completion_length": 365.6964416503906,
14975
+ "epoch": 0.03991641434602755,
14976
+ "grad_norm": 5.181621142955212,
14977
+ "kl": 0.058837890625,
14978
+ "learning_rate": 9.96073789311358e-07,
14979
+ "loss": 0.0024,
14980
+ "reward": 1.2537422180175781,
14981
+ "reward_std": 0.16066715121269226,
14982
+ "rewards/accuracy_reward": 0.16088514029979706,
14983
+ "rewards/format_reward": 1.0,
14984
+ "step": 936,
14985
+ "temporal_rewards": 0.5
14986
+ },
14987
+ {
14988
+ "all_correct": 0.0,
14989
+ "all_wrong": 0.0,
14990
+ "completion_length": 411.5714416503906,
14991
+ "epoch": 0.039959060087850225,
14992
+ "grad_norm": 1.7794049381076447,
14993
+ "kl": 0.06494140625,
14994
+ "learning_rate": 9.960654065168988e-07,
14995
+ "loss": 0.0026,
14996
+ "reward": 1.7535653114318848,
14997
+ "reward_std": 0.24865297973155975,
14998
+ "rewards/accuracy_reward": 0.5571366548538208,
14999
+ "rewards/format_reward": 1.0,
15000
+ "step": 937,
15001
+ "temporal_rewards": 0.6428571343421936
15002
+ },
15003
+ {
15004
+ "all_correct": 0.2857142857142857,
15005
+ "all_wrong": 0.0,
15006
+ "completion_length": 450.4107360839844,
15007
+ "epoch": 0.040001705829672905,
15008
+ "grad_norm": 1.6923568537310802,
15009
+ "kl": 0.051513671875,
15010
+ "learning_rate": 9.960570148183395e-07,
15011
+ "loss": 0.0021,
15012
+ "reward": 1.590727686882019,
15013
+ "reward_std": 0.33292099833488464,
15014
+ "rewards/accuracy_reward": 0.5085846781730652,
15015
+ "rewards/format_reward": 0.9642857313156128,
15016
+ "step": 938,
15017
+ "temporal_rewards": 0.5714285373687744
15018
+ },
15019
+ {
15020
+ "all_correct": 0.42857142857142855,
15021
+ "all_wrong": 0.2857142857142857,
15022
+ "completion_length": 394.89288330078125,
15023
+ "epoch": 0.040044351571495586,
15024
+ "grad_norm": 1.6035153750518245,
15025
+ "kl": 0.07177734375,
15026
+ "learning_rate": 9.96048614215831e-07,
15027
+ "loss": 0.0029,
15028
+ "reward": 1.7771177291870117,
15029
+ "reward_std": 0.08045493066310883,
15030
+ "rewards/accuracy_reward": 0.5771176218986511,
15031
+ "rewards/format_reward": 1.0,
15032
+ "step": 939,
15033
+ "temporal_rewards": 0.6428571343421936
15034
+ },
15035
+ {
15036
+ "all_correct": 0.2857142857142857,
15037
+ "all_wrong": 0.42857142857142855,
15038
+ "completion_length": 385.76788330078125,
15039
+ "epoch": 0.040086997313318266,
15040
+ "grad_norm": 1.6286891649077688,
15041
+ "kl": 0.0732421875,
15042
+ "learning_rate": 9.960402047095235e-07,
15043
+ "loss": 0.0029,
15044
+ "reward": 1.5035713911056519,
15045
+ "reward_std": 0.21551916003227234,
15046
+ "rewards/accuracy_reward": 0.392857164144516,
15047
+ "rewards/format_reward": 1.0,
15048
+ "step": 940,
15049
+ "temporal_rewards": 0.714285671710968
15050
+ },
15051
+ {
15052
+ "all_correct": 0.42857142857142855,
15053
+ "all_wrong": 0.0,
15054
+ "completion_length": 385.4821472167969,
15055
+ "epoch": 0.04012964305514095,
15056
+ "grad_norm": 3.8696943787473446,
15057
+ "kl": 0.07763671875,
15058
+ "learning_rate": 9.960317862995684e-07,
15059
+ "loss": 0.0031,
15060
+ "reward": 1.9869627952575684,
15061
+ "reward_std": 0.24162130057811737,
15062
+ "rewards/accuracy_reward": 0.8012484908103943,
15063
+ "rewards/format_reward": 1.0,
15064
+ "step": 941,
15065
+ "temporal_rewards": 0.5
15066
+ },
15067
+ {
15068
+ "all_correct": 0.14285714285714285,
15069
+ "all_wrong": 0.0,
15070
+ "completion_length": 420.4285888671875,
15071
+ "epoch": 0.04017228879696362,
15072
+ "grad_norm": 3.036793601101903,
15073
+ "kl": 0.0517578125,
15074
+ "learning_rate": 9.960233589861167e-07,
15075
+ "loss": 0.0021,
15076
+ "reward": 1.6199066638946533,
15077
+ "reward_std": 0.28489503264427185,
15078
+ "rewards/accuracy_reward": 0.46990665793418884,
15079
+ "rewards/format_reward": 0.9821429252624512,
15080
+ "step": 942,
15081
+ "temporal_rewards": 0.5
15082
+ },
15083
+ {
15084
+ "all_correct": 0.42857142857142855,
15085
+ "all_wrong": 0.14285714285714285,
15086
+ "completion_length": 370.8035888671875,
15087
+ "epoch": 0.0402149345387863,
15088
+ "grad_norm": 1.6650675145492482,
15089
+ "kl": 0.0712890625,
15090
+ "learning_rate": 9.960149227693196e-07,
15091
+ "loss": 0.0028,
15092
+ "reward": 1.9375001192092896,
15093
+ "reward_std": 0.19786998629570007,
15094
+ "rewards/accuracy_reward": 0.6964285969734192,
15095
+ "rewards/format_reward": 1.0,
15096
+ "step": 943,
15097
+ "temporal_rewards": 0.714285671710968
15098
+ },
15099
+ {
15100
+ "all_correct": 0.14285714285714285,
15101
+ "all_wrong": 0.0,
15102
+ "completion_length": 431.39288330078125,
15103
+ "epoch": 0.04025758028060898,
15104
+ "grad_norm": 1.609350149538502,
15105
+ "kl": 0.041015625,
15106
+ "learning_rate": 9.960064776493286e-07,
15107
+ "loss": 0.0016,
15108
+ "reward": 1.6551604270935059,
15109
+ "reward_std": 0.2681572437286377,
15110
+ "rewards/accuracy_reward": 0.4730173945426941,
15111
+ "rewards/format_reward": 1.0,
15112
+ "step": 944,
15113
+ "temporal_rewards": 0.5714285373687744
15114
+ },
15115
+ {
15116
+ "all_correct": 0.42857142857142855,
15117
+ "all_wrong": 0.0,
15118
+ "completion_length": 419.8571472167969,
15119
+ "epoch": 0.04030022602243166,
15120
+ "grad_norm": 1.9610773135593247,
15121
+ "kl": 0.05908203125,
15122
+ "learning_rate": 9.95998023626295e-07,
15123
+ "loss": 0.0024,
15124
+ "reward": 1.7684540748596191,
15125
+ "reward_std": 0.28671181201934814,
15126
+ "rewards/accuracy_reward": 0.7148826122283936,
15127
+ "rewards/format_reward": 0.9464285969734192,
15128
+ "step": 945,
15129
+ "temporal_rewards": 0.5
15130
+ },
15131
+ {
15132
+ "all_correct": 0.0,
15133
+ "all_wrong": 0.42857142857142855,
15134
+ "completion_length": 458.89288330078125,
15135
+ "epoch": 0.040342871764254336,
15136
+ "grad_norm": 1.470090956750234,
15137
+ "kl": 0.044921875,
15138
+ "learning_rate": 9.959895607003712e-07,
15139
+ "loss": 0.0018,
15140
+ "reward": 1.2988783121109009,
15141
+ "reward_std": 0.26279887557029724,
15142
+ "rewards/accuracy_reward": 0.2560211420059204,
15143
+ "rewards/format_reward": 0.9642857313156128,
15144
+ "step": 946,
15145
+ "temporal_rewards": 0.5714285373687744
15146
+ },
15147
+ {
15148
+ "all_correct": 0.14285714285714285,
15149
+ "all_wrong": 0.14285714285714285,
15150
+ "completion_length": 416.7500305175781,
15151
+ "epoch": 0.040385517506077016,
15152
+ "grad_norm": 4.425443340525767,
15153
+ "kl": 0.057373046875,
15154
+ "learning_rate": 9.959810888717084e-07,
15155
+ "loss": 0.0023,
15156
+ "reward": 1.3810738325119019,
15157
+ "reward_std": 0.16069242358207703,
15158
+ "rewards/accuracy_reward": 0.33107370138168335,
15159
+ "rewards/format_reward": 1.0,
15160
+ "step": 947,
15161
+ "temporal_rewards": 0.5
15162
+ },
15163
+ {
15164
+ "all_correct": 0.42857142857142855,
15165
+ "all_wrong": 0.2857142857142857,
15166
+ "completion_length": 390.5357360839844,
15167
+ "epoch": 0.0404281632478997,
15168
+ "grad_norm": 1.4929929891147553,
15169
+ "kl": 0.0625,
15170
+ "learning_rate": 9.959726081404588e-07,
15171
+ "loss": 0.0025,
15172
+ "reward": 1.6071429252624512,
15173
+ "reward_std": 0.1878172904253006,
15174
+ "rewards/accuracy_reward": 0.4821428656578064,
15175
+ "rewards/format_reward": 1.0,
15176
+ "step": 948,
15177
+ "temporal_rewards": 0.5714285373687744
15178
+ },
15179
+ {
15180
+ "all_correct": 0.2857142857142857,
15181
+ "all_wrong": 0.0,
15182
+ "completion_length": 440.96429443359375,
15183
+ "epoch": 0.04047080898972238,
15184
+ "grad_norm": 1.603958163585114,
15185
+ "kl": 0.045654296875,
15186
+ "learning_rate": 9.959641185067753e-07,
15187
+ "loss": 0.0018,
15188
+ "reward": 1.7649989128112793,
15189
+ "reward_std": 0.29367154836654663,
15190
+ "rewards/accuracy_reward": 0.5578558444976807,
15191
+ "rewards/format_reward": 1.0,
15192
+ "step": 949,
15193
+ "temporal_rewards": 0.6428571343421936
15194
+ },
15195
+ {
15196
+ "all_correct": 0.14285714285714285,
15197
+ "all_wrong": 0.14285714285714285,
15198
+ "completion_length": 416.0000305175781,
15199
+ "epoch": 0.04051345473154506,
15200
+ "grad_norm": 2.006844817768613,
15201
+ "kl": 0.060791015625,
15202
+ "learning_rate": 9.959556199708094e-07,
15203
+ "loss": 0.0024,
15204
+ "reward": 1.5652376413345337,
15205
+ "reward_std": 0.18956170976161957,
15206
+ "rewards/accuracy_reward": 0.4366661608219147,
15207
+ "rewards/format_reward": 1.0,
15208
+ "step": 950,
15209
+ "temporal_rewards": 0.5
15210
+ },
15211
+ {
15212
+ "all_correct": 0.14285714285714285,
15213
+ "all_wrong": 0.42857142857142855,
15214
+ "completion_length": 418.1071472167969,
15215
+ "epoch": 0.04055610047336773,
15216
+ "grad_norm": 1.2787284891519373,
15217
+ "kl": 0.078125,
15218
+ "learning_rate": 9.95947112532714e-07,
15219
+ "loss": 0.0031,
15220
+ "reward": 1.4751147031784058,
15221
+ "reward_std": 0.04257712885737419,
15222
+ "rewards/accuracy_reward": 0.3858288824558258,
15223
+ "rewards/format_reward": 1.0,
15224
+ "step": 951,
15225
+ "temporal_rewards": 0.5714285373687744
15226
+ },
15227
+ {
15228
+ "all_correct": 0.14285714285714285,
15229
+ "all_wrong": 0.0,
15230
+ "completion_length": 405.96429443359375,
15231
+ "epoch": 0.04059874621519041,
15232
+ "grad_norm": 3.136161553844802,
15233
+ "kl": 0.0634765625,
15234
+ "learning_rate": 9.959385961926419e-07,
15235
+ "loss": 0.0025,
15236
+ "reward": 1.9592256546020508,
15237
+ "reward_std": 0.335843026638031,
15238
+ "rewards/accuracy_reward": 0.696725606918335,
15239
+ "rewards/format_reward": 1.0,
15240
+ "step": 952,
15241
+ "temporal_rewards": 0.714285671710968
15242
+ },
15243
+ {
15244
+ "all_correct": 0.2857142857142857,
15245
+ "all_wrong": 0.42857142857142855,
15246
+ "completion_length": 376.96429443359375,
15247
+ "epoch": 0.04064139195701309,
15248
+ "grad_norm": 3.219961953332857,
15249
+ "kl": 0.0673828125,
15250
+ "learning_rate": 9.959300709507459e-07,
15251
+ "loss": 0.0027,
15252
+ "reward": 1.4304946660995483,
15253
+ "reward_std": 0.1210612803697586,
15254
+ "rewards/accuracy_reward": 0.3447802662849426,
15255
+ "rewards/format_reward": 1.0,
15256
+ "step": 953,
15257
+ "temporal_rewards": 0.6428571343421936
15258
+ },
15259
+ {
15260
+ "all_correct": 0.14285714285714285,
15261
+ "all_wrong": 0.0,
15262
+ "completion_length": 398.1964416503906,
15263
+ "epoch": 0.04068403769883577,
15264
+ "grad_norm": 6.050142310929799,
15265
+ "kl": 0.0751953125,
15266
+ "learning_rate": 9.959215368071788e-07,
15267
+ "loss": 0.003,
15268
+ "reward": 1.7659056186676025,
15269
+ "reward_std": 0.4009357690811157,
15270
+ "rewards/accuracy_reward": 0.6051912307739258,
15271
+ "rewards/format_reward": 1.0,
15272
+ "step": 954,
15273
+ "temporal_rewards": 0.6428571343421936
15274
+ },
15275
+ {
15276
+ "all_correct": 0.5714285714285714,
15277
+ "all_wrong": 0.0,
15278
+ "completion_length": 416.14288330078125,
15279
+ "epoch": 0.04072668344065845,
15280
+ "grad_norm": 1.421808016451398,
15281
+ "kl": 0.0703125,
15282
+ "learning_rate": 9.959129937620943e-07,
15283
+ "loss": 0.0028,
15284
+ "reward": 1.9267857074737549,
15285
+ "reward_std": 0.2703368067741394,
15286
+ "rewards/accuracy_reward": 0.6964285969734192,
15287
+ "rewards/format_reward": 1.0,
15288
+ "step": 955,
15289
+ "temporal_rewards": 0.714285671710968
15290
+ },
15291
+ {
15292
+ "all_correct": 0.2857142857142857,
15293
+ "all_wrong": 0.0,
15294
+ "completion_length": 435.1964416503906,
15295
+ "epoch": 0.04076932918248113,
15296
+ "grad_norm": 2.6075666468048433,
15297
+ "kl": 0.05859375,
15298
+ "learning_rate": 9.95904441815645e-07,
15299
+ "loss": 0.0023,
15300
+ "reward": 1.968336820602417,
15301
+ "reward_std": 0.21630696952342987,
15302
+ "rewards/accuracy_reward": 0.705836832523346,
15303
+ "rewards/format_reward": 1.0,
15304
+ "step": 956,
15305
+ "temporal_rewards": 0.714285671710968
15306
+ },
15307
+ {
15308
+ "all_correct": 0.14285714285714285,
15309
+ "all_wrong": 0.14285714285714285,
15310
+ "completion_length": 438.39288330078125,
15311
+ "epoch": 0.04081197492430381,
15312
+ "grad_norm": 1.6378505934638343,
15313
+ "kl": 0.0595703125,
15314
+ "learning_rate": 9.958958809679852e-07,
15315
+ "loss": 0.0024,
15316
+ "reward": 1.7164154052734375,
15317
+ "reward_std": 0.2044200748205185,
15318
+ "rewards/accuracy_reward": 0.527129590511322,
15319
+ "rewards/format_reward": 1.0,
15320
+ "step": 957,
15321
+ "temporal_rewards": 0.714285671710968
15322
+ },
15323
+ {
15324
+ "all_correct": 0.5714285714285714,
15325
+ "all_wrong": 0.0,
15326
+ "completion_length": 408.5357360839844,
15327
+ "epoch": 0.04085462066612649,
15328
+ "grad_norm": 1.8908338290164786,
15329
+ "kl": 0.061767578125,
15330
+ "learning_rate": 9.958873112192681e-07,
15331
+ "loss": 0.0025,
15332
+ "reward": 2.048797607421875,
15333
+ "reward_std": 0.04435715451836586,
15334
+ "rewards/accuracy_reward": 0.6916548013687134,
15335
+ "rewards/format_reward": 1.0,
15336
+ "step": 958,
15337
+ "temporal_rewards": 0.7857142686843872
15338
+ },
15339
+ {
15340
+ "all_correct": 0.2857142857142857,
15341
+ "all_wrong": 0.14285714285714285,
15342
+ "completion_length": 383.0357360839844,
15343
+ "epoch": 0.04089726640794917,
15344
+ "grad_norm": 1.9667798712281923,
15345
+ "kl": 0.06884765625,
15346
+ "learning_rate": 9.958787325696477e-07,
15347
+ "loss": 0.0028,
15348
+ "reward": 1.722543716430664,
15349
+ "reward_std": 0.13343960046768188,
15350
+ "rewards/accuracy_reward": 0.5082579851150513,
15351
+ "rewards/format_reward": 1.0,
15352
+ "step": 959,
15353
+ "temporal_rewards": 0.5
15354
+ },
15355
+ {
15356
+ "all_correct": 0.2857142857142857,
15357
+ "all_wrong": 0.0,
15358
+ "completion_length": 415.9464416503906,
15359
+ "epoch": 0.04093991214977184,
15360
+ "grad_norm": 1.9316488617697056,
15361
+ "kl": 0.054931640625,
15362
+ "learning_rate": 9.958701450192777e-07,
15363
+ "loss": 0.0022,
15364
+ "reward": 1.7576582431793213,
15365
+ "reward_std": 0.09231801331043243,
15366
+ "rewards/accuracy_reward": 0.5112294554710388,
15367
+ "rewards/format_reward": 1.0,
15368
+ "step": 960,
15369
+ "temporal_rewards": 0.6428571343421936
15370
+ },
15371
+ {
15372
+ "all_correct": 0.42857142857142855,
15373
+ "all_wrong": 0.2857142857142857,
15374
+ "completion_length": 391.0357360839844,
15375
+ "epoch": 0.04098255789159452,
15376
+ "grad_norm": 2.226596533494856,
15377
+ "kl": 0.0693359375,
15378
+ "learning_rate": 9.958615485683124e-07,
15379
+ "loss": 0.0028,
15380
+ "reward": 1.8000001907348633,
15381
+ "reward_std": 0.21024802327156067,
15382
+ "rewards/accuracy_reward": 0.5714285969734192,
15383
+ "rewards/format_reward": 1.0,
15384
+ "step": 961,
15385
+ "temporal_rewards": 0.5714285373687744
15386
+ },
15387
+ {
15388
+ "all_correct": 0.2857142857142857,
15389
+ "all_wrong": 0.14285714285714285,
15390
+ "completion_length": 425.96429443359375,
15391
+ "epoch": 0.041025203633417204,
15392
+ "grad_norm": 1.4948037453733134,
15393
+ "kl": 0.04736328125,
15394
+ "learning_rate": 9.958529432169062e-07,
15395
+ "loss": 0.0019,
15396
+ "reward": 1.6663552522659302,
15397
+ "reward_std": 0.18109546601772308,
15398
+ "rewards/accuracy_reward": 0.43064096570014954,
15399
+ "rewards/format_reward": 1.0,
15400
+ "step": 962,
15401
+ "temporal_rewards": 0.714285671710968
15402
+ },
15403
+ {
15404
+ "all_correct": 0.2857142857142857,
15405
+ "all_wrong": 0.42857142857142855,
15406
+ "completion_length": 452.357177734375,
15407
+ "epoch": 0.041067849375239884,
15408
+ "grad_norm": 1.4417561901645564,
15409
+ "kl": 0.06298828125,
15410
+ "learning_rate": 9.958443289652137e-07,
15411
+ "loss": 0.0025,
15412
+ "reward": 1.5744065046310425,
15413
+ "reward_std": 0.1612720489501953,
15414
+ "rewards/accuracy_reward": 0.45297789573669434,
15415
+ "rewards/format_reward": 0.9821429252624512,
15416
+ "step": 963,
15417
+ "temporal_rewards": 0.714285671710968
15418
+ },
15419
+ {
15420
+ "all_correct": 0.2857142857142857,
15421
+ "all_wrong": 0.5714285714285714,
15422
+ "completion_length": 411.1071472167969,
15423
+ "epoch": 0.04111049511706256,
15424
+ "grad_norm": 1.2820824961142303,
15425
+ "kl": 0.050048828125,
15426
+ "learning_rate": 9.95835705813389e-07,
15427
+ "loss": 0.002,
15428
+ "reward": 1.417178988456726,
15429
+ "reward_std": 0.09464232623577118,
15430
+ "rewards/accuracy_reward": 0.338607519865036,
15431
+ "rewards/format_reward": 0.9642857313156128,
15432
+ "step": 964,
15433
+ "temporal_rewards": 0.6428571343421936
15434
+ },
15435
+ {
15436
+ "all_correct": 0.42857142857142855,
15437
+ "all_wrong": 0.0,
15438
+ "completion_length": 377.39288330078125,
15439
+ "epoch": 0.04115314085888524,
15440
+ "grad_norm": 2.1119948944927778,
15441
+ "kl": 0.06982421875,
15442
+ "learning_rate": 9.958270737615876e-07,
15443
+ "loss": 0.0028,
15444
+ "reward": 2.0370266437530518,
15445
+ "reward_std": 0.15241330862045288,
15446
+ "rewards/accuracy_reward": 0.8263123035430908,
15447
+ "rewards/format_reward": 1.0,
15448
+ "step": 965,
15449
+ "temporal_rewards": 0.5714285373687744
15450
+ },
15451
+ {
15452
+ "all_correct": 0.14285714285714285,
15453
+ "all_wrong": 0.0,
15454
+ "completion_length": 411.08929443359375,
15455
+ "epoch": 0.04119578660070792,
15456
+ "grad_norm": 1.9724612440626825,
15457
+ "kl": 0.051513671875,
15458
+ "learning_rate": 9.958184328099636e-07,
15459
+ "loss": 0.0021,
15460
+ "reward": 1.7237517833709717,
15461
+ "reward_std": 0.2817230820655823,
15462
+ "rewards/accuracy_reward": 0.5362517237663269,
15463
+ "rewards/format_reward": 1.0,
15464
+ "step": 966,
15465
+ "temporal_rewards": 0.6428571343421936
15466
+ },
15467
+ {
15468
+ "all_correct": 0.2857142857142857,
15469
+ "all_wrong": 0.42857142857142855,
15470
+ "completion_length": 405.2500305175781,
15471
+ "epoch": 0.0412384323425306,
15472
+ "grad_norm": 1.217388155156064,
15473
+ "kl": 0.06787109375,
15474
+ "learning_rate": 9.958097829586727e-07,
15475
+ "loss": 0.0027,
15476
+ "reward": 1.5464287996292114,
15477
+ "reward_std": 0.22678472101688385,
15478
+ "rewards/accuracy_reward": 0.4285714626312256,
15479
+ "rewards/format_reward": 0.9821429252624512,
15480
+ "step": 967,
15481
+ "temporal_rewards": 0.6428571343421936
15482
+ },
15483
+ {
15484
+ "all_correct": 0.2857142857142857,
15485
+ "all_wrong": 0.0,
15486
+ "completion_length": 412.0000305175781,
15487
+ "epoch": 0.04128107808435328,
15488
+ "grad_norm": 2.174976056324669,
15489
+ "kl": 0.0634765625,
15490
+ "learning_rate": 9.9580112420787e-07,
15491
+ "loss": 0.0025,
15492
+ "reward": 1.6597402095794678,
15493
+ "reward_std": 0.24509233236312866,
15494
+ "rewards/accuracy_reward": 0.48652589321136475,
15495
+ "rewards/format_reward": 1.0,
15496
+ "step": 968,
15497
+ "temporal_rewards": 0.5714285373687744
15498
+ },
15499
+ {
15500
+ "all_correct": 0.14285714285714285,
15501
+ "all_wrong": 0.14285714285714285,
15502
+ "completion_length": 368.1785888671875,
15503
+ "epoch": 0.041323723826175954,
15504
+ "grad_norm": 1.4286186443155269,
15505
+ "kl": 0.080078125,
15506
+ "learning_rate": 9.95792456557711e-07,
15507
+ "loss": 0.0032,
15508
+ "reward": 1.694699764251709,
15509
+ "reward_std": 0.28493639826774597,
15510
+ "rewards/accuracy_reward": 0.5911281108856201,
15511
+ "rewards/format_reward": 1.0,
15512
+ "step": 969,
15513
+ "temporal_rewards": 0.5714285373687744
15514
+ },
15515
+ {
15516
+ "all_correct": 0.14285714285714285,
15517
+ "all_wrong": 0.0,
15518
+ "completion_length": 373.4285888671875,
15519
+ "epoch": 0.041366369567998634,
15520
+ "grad_norm": 1.8366623611705846,
15521
+ "kl": 0.04931640625,
15522
+ "learning_rate": 9.957837800083512e-07,
15523
+ "loss": 0.002,
15524
+ "reward": 1.6913397312164307,
15525
+ "reward_std": 0.31534942984580994,
15526
+ "rewards/accuracy_reward": 0.5377681255340576,
15527
+ "rewards/format_reward": 1.0,
15528
+ "step": 970,
15529
+ "temporal_rewards": 0.5714285373687744
15530
+ },
15531
+ {
15532
+ "all_correct": 0.14285714285714285,
15533
+ "all_wrong": 0.14285714285714285,
15534
+ "completion_length": 430.5714416503906,
15535
+ "epoch": 0.041409015309821315,
15536
+ "grad_norm": 1.6724082967761504,
15537
+ "kl": 0.055419921875,
15538
+ "learning_rate": 9.957750945599463e-07,
15539
+ "loss": 0.0022,
15540
+ "reward": 1.6170705556869507,
15541
+ "reward_std": 0.282865047454834,
15542
+ "rewards/accuracy_reward": 0.47421327233314514,
15543
+ "rewards/format_reward": 0.9821429252624512,
15544
+ "step": 971,
15545
+ "temporal_rewards": 0.5714285373687744
15546
+ },
15547
+ {
15548
+ "all_correct": 0.2857142857142857,
15549
+ "all_wrong": 0.14285714285714285,
15550
+ "completion_length": 360.96429443359375,
15551
+ "epoch": 0.041451661051643995,
15552
+ "grad_norm": 1.4751519734360083,
15553
+ "kl": 0.055908203125,
15554
+ "learning_rate": 9.957664002126524e-07,
15555
+ "loss": 0.0022,
15556
+ "reward": 1.7003968954086304,
15557
+ "reward_std": 0.27568307518959045,
15558
+ "rewards/accuracy_reward": 0.5753968358039856,
15559
+ "rewards/format_reward": 1.0,
15560
+ "step": 972,
15561
+ "temporal_rewards": 0.5714285373687744
15562
+ },
15563
+ {
15564
+ "all_correct": 0.2857142857142857,
15565
+ "all_wrong": 0.0,
15566
+ "completion_length": 401.7500305175781,
15567
+ "epoch": 0.04149430679346667,
15568
+ "grad_norm": 1.2879512175687846,
15569
+ "kl": 0.0625,
15570
+ "learning_rate": 9.957576969666252e-07,
15571
+ "loss": 0.0025,
15572
+ "reward": 1.939540982246399,
15573
+ "reward_std": 0.3482883870601654,
15574
+ "rewards/accuracy_reward": 0.7448980212211609,
15575
+ "rewards/format_reward": 0.9821429252624512,
15576
+ "step": 973,
15577
+ "temporal_rewards": 0.6428571343421936
15578
+ },
15579
+ {
15580
+ "all_correct": 0.0,
15581
+ "all_wrong": 0.2857142857142857,
15582
+ "completion_length": 418.5535888671875,
15583
+ "epoch": 0.04153695253528935,
15584
+ "grad_norm": 1.4953696048328862,
15585
+ "kl": 0.061767578125,
15586
+ "learning_rate": 9.95748984822021e-07,
15587
+ "loss": 0.0025,
15588
+ "reward": 1.4209396839141846,
15589
+ "reward_std": 0.11054398119449615,
15590
+ "rewards/accuracy_reward": 0.31736814975738525,
15591
+ "rewards/format_reward": 1.0,
15592
+ "step": 974,
15593
+ "temporal_rewards": 0.5
15594
+ },
15595
+ {
15596
+ "all_correct": 0.14285714285714285,
15597
+ "all_wrong": 0.0,
15598
+ "completion_length": 381.8035888671875,
15599
+ "epoch": 0.04157959827711203,
15600
+ "grad_norm": 1.62577462756665,
15601
+ "kl": 0.06396484375,
15602
+ "learning_rate": 9.957402637789966e-07,
15603
+ "loss": 0.0026,
15604
+ "reward": 1.5604526996612549,
15605
+ "reward_std": 0.2563931941986084,
15606
+ "rewards/accuracy_reward": 0.4104524850845337,
15607
+ "rewards/format_reward": 1.0,
15608
+ "step": 975,
15609
+ "temporal_rewards": 0.4285714328289032
15610
+ },
15611
+ {
15612
+ "all_correct": 0.7142857142857143,
15613
+ "all_wrong": 0.0,
15614
+ "completion_length": 377.0535888671875,
15615
+ "epoch": 0.04162224401893471,
15616
+ "grad_norm": 2.435265664853468,
15617
+ "kl": 0.061279296875,
15618
+ "learning_rate": 9.957315338377082e-07,
15619
+ "loss": 0.0025,
15620
+ "reward": 2.032465696334839,
15621
+ "reward_std": 0.14916305243968964,
15622
+ "rewards/accuracy_reward": 0.8503227829933167,
15623
+ "rewards/format_reward": 1.0,
15624
+ "step": 976,
15625
+ "temporal_rewards": 0.5
15626
+ },
15627
+ {
15628
+ "all_correct": 0.42857142857142855,
15629
+ "all_wrong": 0.14285714285714285,
15630
+ "completion_length": 388.3035888671875,
15631
+ "epoch": 0.04166488976075739,
15632
+ "grad_norm": 4.675537967649135,
15633
+ "kl": 0.0732421875,
15634
+ "learning_rate": 9.957227949983123e-07,
15635
+ "loss": 0.0029,
15636
+ "reward": 1.756882667541504,
15637
+ "reward_std": 0.09770465642213821,
15638
+ "rewards/accuracy_reward": 0.585453987121582,
15639
+ "rewards/format_reward": 1.0,
15640
+ "step": 977,
15641
+ "temporal_rewards": 0.5714285373687744
15642
+ },
15643
+ {
15644
+ "all_correct": 0.2857142857142857,
15645
+ "all_wrong": 0.0,
15646
+ "completion_length": 417.9464416503906,
15647
+ "epoch": 0.041707535502580065,
15648
+ "grad_norm": 8.652420291718713,
15649
+ "kl": 0.0703125,
15650
+ "learning_rate": 9.95714047260966e-07,
15651
+ "loss": 0.0028,
15652
+ "reward": 1.7980735301971436,
15653
+ "reward_std": 0.3015199899673462,
15654
+ "rewards/accuracy_reward": 0.6355735063552856,
15655
+ "rewards/format_reward": 0.9642857313156128,
15656
+ "step": 978,
15657
+ "temporal_rewards": 0.6428571343421936
15658
+ },
15659
+ {
15660
+ "all_correct": 0.5714285714285714,
15661
+ "all_wrong": 0.0,
15662
+ "completion_length": 427.1071472167969,
15663
+ "epoch": 0.041750181244402745,
15664
+ "grad_norm": 1.4268523956639492,
15665
+ "kl": 0.059326171875,
15666
+ "learning_rate": 9.957052906258265e-07,
15667
+ "loss": 0.0024,
15668
+ "reward": 2.1285715103149414,
15669
+ "reward_std": 0.24789518117904663,
15670
+ "rewards/accuracy_reward": 0.8928571939468384,
15671
+ "rewards/format_reward": 1.0,
15672
+ "step": 979,
15673
+ "temporal_rewards": 0.6428571343421936
15674
+ },
15675
+ {
15676
+ "all_correct": 0.0,
15677
+ "all_wrong": 0.0,
15678
+ "completion_length": 412.8750305175781,
15679
+ "epoch": 0.041792826986225426,
15680
+ "grad_norm": 1.7066404499935264,
15681
+ "kl": 0.06103515625,
15682
+ "learning_rate": 9.956965250930506e-07,
15683
+ "loss": 0.0024,
15684
+ "reward": 1.7384778261184692,
15685
+ "reward_std": 0.3482401967048645,
15686
+ "rewards/accuracy_reward": 0.5652633905410767,
15687
+ "rewards/format_reward": 1.0,
15688
+ "step": 980,
15689
+ "temporal_rewards": 0.5714285373687744
15690
+ },
15691
+ {
15692
+ "all_correct": 0.0,
15693
+ "all_wrong": 0.2857142857142857,
15694
+ "completion_length": 424.4464416503906,
15695
+ "epoch": 0.041835472728048106,
15696
+ "grad_norm": 4.659872228636638,
15697
+ "kl": 0.078125,
15698
+ "learning_rate": 9.95687750662796e-07,
15699
+ "loss": 0.0031,
15700
+ "reward": 1.5966719388961792,
15701
+ "reward_std": 0.1782858818769455,
15702
+ "rewards/accuracy_reward": 0.5038148164749146,
15703
+ "rewards/format_reward": 1.0,
15704
+ "step": 981,
15705
+ "temporal_rewards": 0.357142835855484
15706
+ },
15707
+ {
15708
+ "all_correct": 0.14285714285714285,
15709
+ "all_wrong": 0.0,
15710
+ "completion_length": 414.2500305175781,
15711
+ "epoch": 0.04187811846987078,
15712
+ "grad_norm": 2.2589039944618916,
15713
+ "kl": 0.06591796875,
15714
+ "learning_rate": 9.9567896733522e-07,
15715
+ "loss": 0.0026,
15716
+ "reward": 1.7768125534057617,
15717
+ "reward_std": 0.2902745306491852,
15718
+ "rewards/accuracy_reward": 0.5893124341964722,
15719
+ "rewards/format_reward": 1.0,
15720
+ "step": 982,
15721
+ "temporal_rewards": 0.6428571343421936
15722
+ },
15723
+ {
15724
+ "all_correct": 0.14285714285714285,
15725
+ "all_wrong": 0.2857142857142857,
15726
+ "completion_length": 372.4107360839844,
15727
+ "epoch": 0.04192076421169346,
15728
+ "grad_norm": 1.362896487824589,
15729
+ "kl": 0.08447265625,
15730
+ "learning_rate": 9.956701751104802e-07,
15731
+ "loss": 0.0034,
15732
+ "reward": 1.6840959787368774,
15733
+ "reward_std": 0.11667370796203613,
15734
+ "rewards/accuracy_reward": 0.5269531011581421,
15735
+ "rewards/format_reward": 1.0,
15736
+ "step": 983,
15737
+ "temporal_rewards": 0.5
15738
+ },
15739
+ {
15740
+ "all_correct": 0.7142857142857143,
15741
+ "all_wrong": 0.14285714285714285,
15742
+ "completion_length": 398.3214416503906,
15743
+ "epoch": 0.04196340995351614,
15744
+ "grad_norm": 1.2129928050538004,
15745
+ "kl": 0.06103515625,
15746
+ "learning_rate": 9.956613739887344e-07,
15747
+ "loss": 0.0024,
15748
+ "reward": 1.9772791862487793,
15749
+ "reward_std": 0.04609445109963417,
15750
+ "rewards/accuracy_reward": 0.7487077116966248,
15751
+ "rewards/format_reward": 1.0,
15752
+ "step": 984,
15753
+ "temporal_rewards": 0.6428571343421936
15754
+ },
15755
+ {
15756
+ "all_correct": 0.42857142857142855,
15757
+ "all_wrong": 0.14285714285714285,
15758
+ "completion_length": 404.5714416503906,
15759
+ "epoch": 0.04200605569533882,
15760
+ "grad_norm": 3.9207407827779077,
15761
+ "kl": 0.0634765625,
15762
+ "learning_rate": 9.956525639701407e-07,
15763
+ "loss": 0.0025,
15764
+ "reward": 1.910249948501587,
15765
+ "reward_std": 0.12284082174301147,
15766
+ "rewards/accuracy_reward": 0.6995355486869812,
15767
+ "rewards/format_reward": 1.0,
15768
+ "step": 985,
15769
+ "temporal_rewards": 0.6428571343421936
15770
+ },
15771
+ {
15772
+ "all_correct": 0.2857142857142857,
15773
+ "all_wrong": 0.0,
15774
+ "completion_length": 372.6785888671875,
15775
+ "epoch": 0.0420487014371615,
15776
+ "grad_norm": 2.0049322392282347,
15777
+ "kl": 0.0693359375,
15778
+ "learning_rate": 9.956437450548573e-07,
15779
+ "loss": 0.0028,
15780
+ "reward": 1.7768032550811768,
15781
+ "reward_std": 0.3517088294029236,
15782
+ "rewards/accuracy_reward": 0.6071603298187256,
15783
+ "rewards/format_reward": 1.0,
15784
+ "step": 986,
15785
+ "temporal_rewards": 0.6428571343421936
15786
+ },
15787
+ {
15788
+ "all_correct": 0.14285714285714285,
15789
+ "all_wrong": 0.0,
15790
+ "completion_length": 378.46429443359375,
15791
+ "epoch": 0.042091347178984176,
15792
+ "grad_norm": 2.4990401086302327,
15793
+ "kl": 0.0625,
15794
+ "learning_rate": 9.956349172430423e-07,
15795
+ "loss": 0.0025,
15796
+ "reward": 1.9568233489990234,
15797
+ "reward_std": 0.31181564927101135,
15798
+ "rewards/accuracy_reward": 0.7443231344223022,
15799
+ "rewards/format_reward": 1.0,
15800
+ "step": 987,
15801
+ "temporal_rewards": 0.6428571343421936
15802
+ },
15803
+ {
15804
+ "all_correct": 0.14285714285714285,
15805
+ "all_wrong": 0.2857142857142857,
15806
+ "completion_length": 321.75,
15807
+ "epoch": 0.042133992920806856,
15808
+ "grad_norm": 1.7114030947272834,
15809
+ "kl": 0.08740234375,
15810
+ "learning_rate": 9.956260805348543e-07,
15811
+ "loss": 0.0035,
15812
+ "reward": 1.672023892402649,
15813
+ "reward_std": 0.23027639091014862,
15814
+ "rewards/accuracy_reward": 0.5345238447189331,
15815
+ "rewards/format_reward": 1.0,
15816
+ "step": 988,
15817
+ "temporal_rewards": 0.5
15818
+ },
15819
+ {
15820
+ "all_correct": 0.0,
15821
+ "all_wrong": 0.0,
15822
+ "completion_length": 358.6250305175781,
15823
+ "epoch": 0.04217663866262954,
15824
+ "grad_norm": 2.7481031203471664,
15825
+ "kl": 0.07666015625,
15826
+ "learning_rate": 9.956172349304516e-07,
15827
+ "loss": 0.0031,
15828
+ "reward": 1.5195239782333374,
15829
+ "reward_std": 0.2381688952445984,
15830
+ "rewards/accuracy_reward": 0.39809539914131165,
15831
+ "rewards/format_reward": 1.0,
15832
+ "step": 989,
15833
+ "temporal_rewards": 0.4285714328289032
15834
+ },
15835
+ {
15836
+ "all_correct": 0.2857142857142857,
15837
+ "all_wrong": 0.14285714285714285,
15838
+ "completion_length": 340.2857360839844,
15839
+ "epoch": 0.04221928440445222,
15840
+ "grad_norm": 1.784431075262888,
15841
+ "kl": 0.087890625,
15842
+ "learning_rate": 9.956083804299937e-07,
15843
+ "loss": 0.0035,
15844
+ "reward": 1.7007704973220825,
15845
+ "reward_std": 0.2985832691192627,
15846
+ "rewards/accuracy_reward": 0.5364846587181091,
15847
+ "rewards/format_reward": 0.9642857313156128,
15848
+ "step": 990,
15849
+ "temporal_rewards": 0.714285671710968
15850
+ },
15851
+ {
15852
+ "all_correct": 0.14285714285714285,
15853
+ "all_wrong": 0.14285714285714285,
15854
+ "completion_length": 377.1607360839844,
15855
+ "epoch": 0.04226193014627489,
15856
+ "grad_norm": 3.305830844381748,
15857
+ "kl": 0.049560546875,
15858
+ "learning_rate": 9.955995170336387e-07,
15859
+ "loss": 0.002,
15860
+ "reward": 1.5684744119644165,
15861
+ "reward_std": 0.24174275994300842,
15862
+ "rewards/accuracy_reward": 0.4452598989009857,
15863
+ "rewards/format_reward": 0.9821429252624512,
15864
+ "step": 991,
15865
+ "temporal_rewards": 0.5714285373687744
15866
+ },
15867
+ {
15868
+ "all_correct": 0.42857142857142855,
15869
+ "all_wrong": 0.0,
15870
+ "completion_length": 364.3750305175781,
15871
+ "epoch": 0.04230457588809757,
15872
+ "grad_norm": 2.4344422715162475,
15873
+ "kl": 0.060302734375,
15874
+ "learning_rate": 9.955906447415462e-07,
15875
+ "loss": 0.0024,
15876
+ "reward": 2.0187172889709473,
15877
+ "reward_std": 0.23528897762298584,
15878
+ "rewards/accuracy_reward": 0.7312172055244446,
15879
+ "rewards/format_reward": 1.0,
15880
+ "step": 992,
15881
+ "temporal_rewards": 0.7857142686843872
15882
+ },
15883
+ {
15884
+ "all_correct": 0.14285714285714285,
15885
+ "all_wrong": 0.42857142857142855,
15886
+ "completion_length": 347.6071472167969,
15887
+ "epoch": 0.04234722162992025,
15888
+ "grad_norm": 1.1678389588315554,
15889
+ "kl": 0.04833984375,
15890
+ "learning_rate": 9.955817635538753e-07,
15891
+ "loss": 0.0019,
15892
+ "reward": 1.3988115787506104,
15893
+ "reward_std": 0.17111261188983917,
15894
+ "rewards/accuracy_reward": 0.32381147146224976,
15895
+ "rewards/format_reward": 1.0,
15896
+ "step": 993,
15897
+ "temporal_rewards": 0.5714285373687744
15898
+ },
15899
+ {
15900
+ "all_correct": 0.42857142857142855,
15901
+ "all_wrong": 0.0,
15902
+ "completion_length": 312.6964416503906,
15903
+ "epoch": 0.04238986737174293,
15904
+ "grad_norm": 2.533225798199559,
15905
+ "kl": 0.0859375,
15906
+ "learning_rate": 9.955728734707854e-07,
15907
+ "loss": 0.0034,
15908
+ "reward": 1.840993881225586,
15909
+ "reward_std": 0.20919422805309296,
15910
+ "rewards/accuracy_reward": 0.6695650815963745,
15911
+ "rewards/format_reward": 1.0,
15912
+ "step": 994,
15913
+ "temporal_rewards": 0.6428571343421936
15914
+ },
15915
+ {
15916
+ "all_correct": 0.2857142857142857,
15917
+ "all_wrong": 0.14285714285714285,
15918
+ "completion_length": 380.5000305175781,
15919
+ "epoch": 0.04243251311356561,
15920
+ "grad_norm": 1.5832851452596517,
15921
+ "kl": 0.061279296875,
15922
+ "learning_rate": 9.955639744924362e-07,
15923
+ "loss": 0.0025,
15924
+ "reward": 1.7240325212478638,
15925
+ "reward_std": 0.08556399494409561,
15926
+ "rewards/accuracy_reward": 0.5526038408279419,
15927
+ "rewards/format_reward": 1.0,
15928
+ "step": 995,
15929
+ "temporal_rewards": 0.6428571343421936
15930
+ },
15931
+ {
15932
+ "all_correct": 0.2857142857142857,
15933
+ "all_wrong": 0.0,
15934
+ "completion_length": 305.0535888671875,
15935
+ "epoch": 0.04247515885538829,
15936
+ "grad_norm": 10.554470531111166,
15937
+ "kl": 0.07177734375,
15938
+ "learning_rate": 9.955550666189872e-07,
15939
+ "loss": 0.0029,
15940
+ "reward": 1.5484970808029175,
15941
+ "reward_std": 0.08560214191675186,
15942
+ "rewards/accuracy_reward": 0.3877827525138855,
15943
+ "rewards/format_reward": 1.0,
15944
+ "step": 996,
15945
+ "temporal_rewards": 0.6428571343421936
15946
+ },
15947
+ {
15948
+ "all_correct": 0.5714285714285714,
15949
+ "all_wrong": 0.0,
15950
+ "completion_length": 320.9464416503906,
15951
+ "epoch": 0.04251780459721097,
15952
+ "grad_norm": 1.9700363180476712,
15953
+ "kl": 0.07177734375,
15954
+ "learning_rate": 9.955461498505984e-07,
15955
+ "loss": 0.0029,
15956
+ "reward": 1.9073069095611572,
15957
+ "reward_std": 0.24450773000717163,
15958
+ "rewards/accuracy_reward": 0.7305210828781128,
15959
+ "rewards/format_reward": 1.0,
15960
+ "step": 997,
15961
+ "temporal_rewards": 0.714285671710968
15962
+ },
15963
+ {
15964
+ "all_correct": 0.14285714285714285,
15965
+ "all_wrong": 0.2857142857142857,
15966
+ "completion_length": 396.8571472167969,
15967
+ "epoch": 0.04256045033903365,
15968
+ "grad_norm": 1.9243204654958848,
15969
+ "kl": 0.0498046875,
15970
+ "learning_rate": 9.9553722418743e-07,
15971
+ "loss": 0.002,
15972
+ "reward": 1.504149079322815,
15973
+ "reward_std": 0.22699996829032898,
15974
+ "rewards/accuracy_reward": 0.4398633539676666,
15975
+ "rewards/format_reward": 0.9821429252624512,
15976
+ "step": 998,
15977
+ "temporal_rewards": 0.6428571343421936
15978
+ },
15979
+ {
15980
+ "all_correct": 0.2857142857142857,
15981
+ "all_wrong": 0.14285714285714285,
15982
+ "completion_length": 334.0714416503906,
15983
+ "epoch": 0.04260309608085633,
15984
+ "grad_norm": 1.91575629989198,
15985
+ "kl": 0.0615234375,
15986
+ "learning_rate": 9.95528289629642e-07,
15987
+ "loss": 0.0025,
15988
+ "reward": 1.69874906539917,
15989
+ "reward_std": 0.15359075367450714,
15990
+ "rewards/accuracy_reward": 0.4737490117549896,
15991
+ "rewards/format_reward": 1.0,
15992
+ "step": 999,
15993
+ "temporal_rewards": 0.714285671710968
15994
+ },
15995
+ {
15996
+ "all_correct": 0.5714285714285714,
15997
+ "all_wrong": 0.14285714285714285,
15998
+ "completion_length": 332.64288330078125,
15999
+ "epoch": 0.042645741822679,
16000
+ "grad_norm": 1.6046922501658243,
16001
+ "kl": 0.06298828125,
16002
+ "learning_rate": 9.955193461773947e-07,
16003
+ "loss": 0.0025,
16004
+ "reward": 1.8086810111999512,
16005
+ "reward_std": 0.15720872581005096,
16006
+ "rewards/accuracy_reward": 0.6801096200942993,
16007
+ "rewards/format_reward": 1.0,
16008
+ "step": 1000,
16009
+ "temporal_rewards": 0.6428571343421936
16010
  }
16011
  ],
16012
  "logging_steps": 1.0,