Training in progress, step 1000, checkpoint
Browse files- last-checkpoint/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt +3 -0
- last-checkpoint/global_step1000/zero_pp_rank_1_mp_rank_00_model_states.pt +3 -0
- last-checkpoint/global_step1000/zero_pp_rank_2_mp_rank_00_model_states.pt +3 -0
- last-checkpoint/global_step1000/zero_pp_rank_3_mp_rank_00_model_states.pt +3 -0
- last-checkpoint/global_step1000/zero_pp_rank_4_mp_rank_00_model_states.pt +3 -0
- last-checkpoint/global_step1000/zero_pp_rank_5_mp_rank_00_model_states.pt +3 -0
- last-checkpoint/global_step1000/zero_pp_rank_6_mp_rank_00_model_states.pt +3 -0
- last-checkpoint/latest +1 -1
- last-checkpoint/model-00001-of-00004.safetensors +1 -1
- last-checkpoint/model-00002-of-00004.safetensors +1 -1
- last-checkpoint/model-00003-of-00004.safetensors +1 -1
- last-checkpoint/model-00004-of-00004.safetensors +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/rng_state_4.pth +1 -1
- last-checkpoint/rng_state_5.pth +1 -1
- last-checkpoint/rng_state_6.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +1602 -2
last-checkpoint/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4e5d318bcfe57b0fd139d89e3a1fc397ad1bea5a01f8c2ec82190bd7f6575ea8
|
| 3 |
+
size 14215152126
|
last-checkpoint/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b3791076cf05d4ea41de9715dd825b5b6462b9efde945b42740d21e07b66971a
|
| 3 |
+
size 14215152126
|
last-checkpoint/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3961831ee127555db427ff704d4c259afe64c910664acbd3eb62d57152de9e25
|
| 3 |
+
size 14215152126
|
last-checkpoint/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9286fb40c4a7705be78fec28246b946fd8452bc84997300afada2924b682789e
|
| 3 |
+
size 14215152126
|
last-checkpoint/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c22b14c332262f8f9c7e684d9ca5d8410487d42dc2add5254f9580fb614ab988
|
| 3 |
+
size 14215152126
|
last-checkpoint/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1f302fc10a8f3ec279ce8b0fb8aeb9dcc314f957410d0524b7d5cf13e3e125e2
|
| 3 |
+
size 14215152126
|
last-checkpoint/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2e493911b31601bb7d6ae76d0fd42b5cdf6631f10173376b0228eab5645d865
|
| 3 |
+
size 14215152126
|
last-checkpoint/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:12c52b26b6827d29df0fca8e30710877a8a0b448d9f7f43c326955fb5ed381bf
|
| 3 |
+
size 349379
|
last-checkpoint/global_step1000/zero_pp_rank_1_mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:66cd09f0c195afa8c1c19a76e5b4a2f1f1e5b88ee98a3cd2aab39f38c5813f2e
|
| 3 |
+
size 349379
|
last-checkpoint/global_step1000/zero_pp_rank_2_mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2c420ec50e4cd243bdb1edb1e29a775e650a89f64ab88598f977cc85f677fb80
|
| 3 |
+
size 349379
|
last-checkpoint/global_step1000/zero_pp_rank_3_mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6bf65e95b06264e74e75f8deca27327d215322ff47aa58cc93dfe7d113bfbf32
|
| 3 |
+
size 349379
|
last-checkpoint/global_step1000/zero_pp_rank_4_mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8d6de8c6edf380e4707198e62b25e1f25479b11a46c9473c2112915879563884
|
| 3 |
+
size 349379
|
last-checkpoint/global_step1000/zero_pp_rank_5_mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ecd0b3abcf20b018d12d2cfc3cfdee930a86ef1b852cd044326340604345f2cd
|
| 3 |
+
size 349379
|
last-checkpoint/global_step1000/zero_pp_rank_6_mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8f2667f93eb116e834165ce6e7bdea3ae15b8155467aefef118740fa857dabcc
|
| 3 |
+
size 349379
|
last-checkpoint/latest
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
|
|
|
|
| 1 |
+
global_step1000
|
last-checkpoint/model-00001-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4968243304
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6f217c2edcf986aa252f9dc1d2f54208b083b90bb3aeac66b7a88e4c1e82ffd0
|
| 3 |
size 4968243304
|
last-checkpoint/model-00002-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4991495816
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:74a942f3fdc500b90d779547eafb7be72d6daa1f471473c0748293e667bd1b56
|
| 3 |
size 4991495816
|
last-checkpoint/model-00003-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4932751040
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ec4c1b2bb9b92b6e257c2103bd96f6f25051981146aba585f9580dad4cd6dd3f
|
| 3 |
size 4932751040
|
last-checkpoint/model-00004-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1691924384
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:46ee0d8415c7d4ca5b68681fe3b8772ad2fc02e3cbb2452fe72d88084cd8012c
|
| 3 |
size 1691924384
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15920
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a467c4d473c4476133e0c962682ae3bd1eadb5b659536096b8f126b374b5fef
|
| 3 |
size 15920
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15984
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e743495ecb0f3dcb697352d75c19d02d9cf64404eeb5050d2c4a404e1cbacd7f
|
| 3 |
size 15984
|
last-checkpoint/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15984
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0e9fe5aeeb743f2a0bae96552be01addb99af031bae160fd209d89993f3074f0
|
| 3 |
size 15984
|
last-checkpoint/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15984
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f42a876651e074ff662aeb862e8a5177c3427016384b0c21b126dd991e7a54c3
|
| 3 |
size 15984
|
last-checkpoint/rng_state_4.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15984
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6443d543afed862f54a00e23ced53d4e4f0e332d02f9fe49cf63681e33a1f925
|
| 3 |
size 15984
|
last-checkpoint/rng_state_5.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15984
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2ad289fc655a368bdb83b0f67e858b03d811498d49979570c148e0f7dd8e6695
|
| 3 |
size 15984
|
last-checkpoint/rng_state_6.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15984
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca54964f1223bdd6d718543b7f1eb959ed6fc0fae7c10ce41dd0577e3c5efdeb
|
| 3 |
size 15984
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2e1a9307dd6491b636ce5ac79ba6d6ae4102618ffd8fd75198a2157f876beafe
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
-
"epoch": 0.
|
| 5 |
"eval_steps": 500,
|
| 6 |
-
"global_step":
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
@@ -14407,6 +14407,1606 @@
|
|
| 14407 |
"rewards/format_reward": 1.0,
|
| 14408 |
"step": 900,
|
| 14409 |
"temporal_rewards": 0.714285671710968
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14410 |
}
|
| 14411 |
],
|
| 14412 |
"logging_steps": 1.0,
|
|
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
+
"epoch": 0.042645741822679,
|
| 5 |
"eval_steps": 500,
|
| 6 |
+
"global_step": 1000,
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
|
|
| 14407 |
"rewards/format_reward": 1.0,
|
| 14408 |
"step": 900,
|
| 14409 |
"temporal_rewards": 0.714285671710968
|
| 14410 |
+
},
|
| 14411 |
+
{
|
| 14412 |
+
"all_correct": 0.14285714285714285,
|
| 14413 |
+
"all_wrong": 0.42857142857142855,
|
| 14414 |
+
"completion_length": 388.4821472167969,
|
| 14415 |
+
"epoch": 0.03842381338223379,
|
| 14416 |
+
"grad_norm": 1.388325239051785,
|
| 14417 |
+
"kl": 0.07861328125,
|
| 14418 |
+
"learning_rate": 9.963615763757953e-07,
|
| 14419 |
+
"loss": 0.0031,
|
| 14420 |
+
"reward": 1.4394482374191284,
|
| 14421 |
+
"reward_std": 0.09279949963092804,
|
| 14422 |
+
"rewards/accuracy_reward": 0.3323054015636444,
|
| 14423 |
+
"rewards/format_reward": 1.0,
|
| 14424 |
+
"step": 901,
|
| 14425 |
+
"temporal_rewards": 0.714285671710968
|
| 14426 |
+
},
|
| 14427 |
+
{
|
| 14428 |
+
"all_correct": 0.2857142857142857,
|
| 14429 |
+
"all_wrong": 0.2857142857142857,
|
| 14430 |
+
"completion_length": 437.51788330078125,
|
| 14431 |
+
"epoch": 0.03846645912405646,
|
| 14432 |
+
"grad_norm": 1.026819463325266,
|
| 14433 |
+
"kl": 0.06884765625,
|
| 14434 |
+
"learning_rate": 9.963535053184923e-07,
|
| 14435 |
+
"loss": 0.0028,
|
| 14436 |
+
"reward": 1.549193024635315,
|
| 14437 |
+
"reward_std": 0.10150664299726486,
|
| 14438 |
+
"rewards/accuracy_reward": 0.4420502781867981,
|
| 14439 |
+
"rewards/format_reward": 1.0,
|
| 14440 |
+
"step": 902,
|
| 14441 |
+
"temporal_rewards": 0.5
|
| 14442 |
+
},
|
| 14443 |
+
{
|
| 14444 |
+
"all_correct": 0.42857142857142855,
|
| 14445 |
+
"all_wrong": 0.14285714285714285,
|
| 14446 |
+
"completion_length": 414.2321472167969,
|
| 14447 |
+
"epoch": 0.03850910486587914,
|
| 14448 |
+
"grad_norm": 1.2365418501254621,
|
| 14449 |
+
"kl": 0.059814453125,
|
| 14450 |
+
"learning_rate": 9.96345425351918e-07,
|
| 14451 |
+
"loss": 0.0024,
|
| 14452 |
+
"reward": 1.782142996788025,
|
| 14453 |
+
"reward_std": 0.22749534249305725,
|
| 14454 |
+
"rewards/accuracy_reward": 0.6785714626312256,
|
| 14455 |
+
"rewards/format_reward": 1.0,
|
| 14456 |
+
"step": 903,
|
| 14457 |
+
"temporal_rewards": 0.5
|
| 14458 |
+
},
|
| 14459 |
+
{
|
| 14460 |
+
"all_correct": 0.2857142857142857,
|
| 14461 |
+
"all_wrong": 0.14285714285714285,
|
| 14462 |
+
"completion_length": 368.5535888671875,
|
| 14463 |
+
"epoch": 0.03855175060770182,
|
| 14464 |
+
"grad_norm": 1.8688669932203024,
|
| 14465 |
+
"kl": 0.0849609375,
|
| 14466 |
+
"learning_rate": 9.963373364762176e-07,
|
| 14467 |
+
"loss": 0.0034,
|
| 14468 |
+
"reward": 1.7157738208770752,
|
| 14469 |
+
"reward_std": 0.2985364496707916,
|
| 14470 |
+
"rewards/accuracy_reward": 0.5372024178504944,
|
| 14471 |
+
"rewards/format_reward": 1.0,
|
| 14472 |
+
"step": 904,
|
| 14473 |
+
"temporal_rewards": 0.6428571343421936
|
| 14474 |
+
},
|
| 14475 |
+
{
|
| 14476 |
+
"all_correct": 0.0,
|
| 14477 |
+
"all_wrong": 0.0,
|
| 14478 |
+
"completion_length": 423.4821472167969,
|
| 14479 |
+
"epoch": 0.0385943963495245,
|
| 14480 |
+
"grad_norm": 2.135810610374038,
|
| 14481 |
+
"kl": 0.06201171875,
|
| 14482 |
+
"learning_rate": 9.963292386915358e-07,
|
| 14483 |
+
"loss": 0.0025,
|
| 14484 |
+
"reward": 1.6755682229995728,
|
| 14485 |
+
"reward_std": 0.19589506089687347,
|
| 14486 |
+
"rewards/accuracy_reward": 0.5005680322647095,
|
| 14487 |
+
"rewards/format_reward": 1.0,
|
| 14488 |
+
"step": 905,
|
| 14489 |
+
"temporal_rewards": 0.5714285373687744
|
| 14490 |
+
},
|
| 14491 |
+
{
|
| 14492 |
+
"all_correct": 0.2857142857142857,
|
| 14493 |
+
"all_wrong": 0.0,
|
| 14494 |
+
"completion_length": 422.1071472167969,
|
| 14495 |
+
"epoch": 0.038637042091347176,
|
| 14496 |
+
"grad_norm": 2.051111400485243,
|
| 14497 |
+
"kl": 0.07080078125,
|
| 14498 |
+
"learning_rate": 9.963211319980185e-07,
|
| 14499 |
+
"loss": 0.0028,
|
| 14500 |
+
"reward": 1.6941068172454834,
|
| 14501 |
+
"reward_std": 0.3651140630245209,
|
| 14502 |
+
"rewards/accuracy_reward": 0.5173211693763733,
|
| 14503 |
+
"rewards/format_reward": 1.0,
|
| 14504 |
+
"step": 906,
|
| 14505 |
+
"temporal_rewards": 0.7857142686843872
|
| 14506 |
+
},
|
| 14507 |
+
{
|
| 14508 |
+
"all_correct": 0.14285714285714285,
|
| 14509 |
+
"all_wrong": 0.5714285714285714,
|
| 14510 |
+
"completion_length": 388.1607360839844,
|
| 14511 |
+
"epoch": 0.03867968783316986,
|
| 14512 |
+
"grad_norm": 1.2286741388000242,
|
| 14513 |
+
"kl": 0.068359375,
|
| 14514 |
+
"learning_rate": 9.963130163958108e-07,
|
| 14515 |
+
"loss": 0.0027,
|
| 14516 |
+
"reward": 1.3034491539001465,
|
| 14517 |
+
"reward_std": 0.12692005932331085,
|
| 14518 |
+
"rewards/accuracy_reward": 0.25344905257225037,
|
| 14519 |
+
"rewards/format_reward": 1.0,
|
| 14520 |
+
"step": 907,
|
| 14521 |
+
"temporal_rewards": 0.6428571343421936
|
| 14522 |
+
},
|
| 14523 |
+
{
|
| 14524 |
+
"all_correct": 0.14285714285714285,
|
| 14525 |
+
"all_wrong": 0.2857142857142857,
|
| 14526 |
+
"completion_length": 438.0535888671875,
|
| 14527 |
+
"epoch": 0.03872233357499254,
|
| 14528 |
+
"grad_norm": 1.1109793404981478,
|
| 14529 |
+
"kl": 0.05419921875,
|
| 14530 |
+
"learning_rate": 9.963048918850585e-07,
|
| 14531 |
+
"loss": 0.0022,
|
| 14532 |
+
"reward": 1.4734070301055908,
|
| 14533 |
+
"reward_std": 0.21417297422885895,
|
| 14534 |
+
"rewards/accuracy_reward": 0.30912119150161743,
|
| 14535 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 14536 |
+
"step": 908,
|
| 14537 |
+
"temporal_rewards": 0.714285671710968
|
| 14538 |
+
},
|
| 14539 |
+
{
|
| 14540 |
+
"all_correct": 0.5714285714285714,
|
| 14541 |
+
"all_wrong": 0.0,
|
| 14542 |
+
"completion_length": 386.14288330078125,
|
| 14543 |
+
"epoch": 0.03876497931681522,
|
| 14544 |
+
"grad_norm": 1.4176361827714907,
|
| 14545 |
+
"kl": 0.0791015625,
|
| 14546 |
+
"learning_rate": 9.962967584659075e-07,
|
| 14547 |
+
"loss": 0.0032,
|
| 14548 |
+
"reward": 2.136169910430908,
|
| 14549 |
+
"reward_std": 0.14292067289352417,
|
| 14550 |
+
"rewards/accuracy_reward": 0.8022412061691284,
|
| 14551 |
+
"rewards/format_reward": 1.0,
|
| 14552 |
+
"step": 909,
|
| 14553 |
+
"temporal_rewards": 0.7857142686843872
|
| 14554 |
+
},
|
| 14555 |
+
{
|
| 14556 |
+
"all_correct": 0.14285714285714285,
|
| 14557 |
+
"all_wrong": 0.14285714285714285,
|
| 14558 |
+
"completion_length": 414.1964416503906,
|
| 14559 |
+
"epoch": 0.0388076250586379,
|
| 14560 |
+
"grad_norm": 1.40762891931829,
|
| 14561 |
+
"kl": 0.07275390625,
|
| 14562 |
+
"learning_rate": 9.962886161385037e-07,
|
| 14563 |
+
"loss": 0.0029,
|
| 14564 |
+
"reward": 1.5685076713562012,
|
| 14565 |
+
"reward_std": 0.1862516850233078,
|
| 14566 |
+
"rewards/accuracy_reward": 0.4113648533821106,
|
| 14567 |
+
"rewards/format_reward": 1.0,
|
| 14568 |
+
"step": 910,
|
| 14569 |
+
"temporal_rewards": 0.5714285373687744
|
| 14570 |
+
},
|
| 14571 |
+
{
|
| 14572 |
+
"all_correct": 0.42857142857142855,
|
| 14573 |
+
"all_wrong": 0.14285714285714285,
|
| 14574 |
+
"completion_length": 427.6250305175781,
|
| 14575 |
+
"epoch": 0.03885027080046057,
|
| 14576 |
+
"grad_norm": 1.4988795157708306,
|
| 14577 |
+
"kl": 0.06396484375,
|
| 14578 |
+
"learning_rate": 9.962804649029936e-07,
|
| 14579 |
+
"loss": 0.0026,
|
| 14580 |
+
"reward": 1.8500804901123047,
|
| 14581 |
+
"reward_std": 0.1730491816997528,
|
| 14582 |
+
"rewards/accuracy_reward": 0.7072232365608215,
|
| 14583 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 14584 |
+
"step": 911,
|
| 14585 |
+
"temporal_rewards": 0.5714285373687744
|
| 14586 |
+
},
|
| 14587 |
+
{
|
| 14588 |
+
"all_correct": 0.14285714285714285,
|
| 14589 |
+
"all_wrong": 0.5714285714285714,
|
| 14590 |
+
"completion_length": 405.6964416503906,
|
| 14591 |
+
"epoch": 0.03889291654228325,
|
| 14592 |
+
"grad_norm": 1.9102132785830699,
|
| 14593 |
+
"kl": 0.06591796875,
|
| 14594 |
+
"learning_rate": 9.96272304759523e-07,
|
| 14595 |
+
"loss": 0.0026,
|
| 14596 |
+
"reward": 1.5267857313156128,
|
| 14597 |
+
"reward_std": 0.1490623652935028,
|
| 14598 |
+
"rewards/accuracy_reward": 0.392857164144516,
|
| 14599 |
+
"rewards/format_reward": 1.0,
|
| 14600 |
+
"step": 912,
|
| 14601 |
+
"temporal_rewards": 0.7857142686843872
|
| 14602 |
+
},
|
| 14603 |
+
{
|
| 14604 |
+
"all_correct": 0.14285714285714285,
|
| 14605 |
+
"all_wrong": 0.2857142857142857,
|
| 14606 |
+
"completion_length": 414.1964416503906,
|
| 14607 |
+
"epoch": 0.03893556228410593,
|
| 14608 |
+
"grad_norm": 1.7387254504787168,
|
| 14609 |
+
"kl": 0.05712890625,
|
| 14610 |
+
"learning_rate": 9.962641357082387e-07,
|
| 14611 |
+
"loss": 0.0023,
|
| 14612 |
+
"reward": 1.6454274654388428,
|
| 14613 |
+
"reward_std": 0.2543186545372009,
|
| 14614 |
+
"rewards/accuracy_reward": 0.5025703310966492,
|
| 14615 |
+
"rewards/format_reward": 1.0,
|
| 14616 |
+
"step": 913,
|
| 14617 |
+
"temporal_rewards": 0.6428571343421936
|
| 14618 |
+
},
|
| 14619 |
+
{
|
| 14620 |
+
"all_correct": 0.8571428571428571,
|
| 14621 |
+
"all_wrong": 0.14285714285714285,
|
| 14622 |
+
"completion_length": 370.3214416503906,
|
| 14623 |
+
"epoch": 0.038978208025928614,
|
| 14624 |
+
"grad_norm": 1.2170320478100345,
|
| 14625 |
+
"kl": 0.0751953125,
|
| 14626 |
+
"learning_rate": 9.962559577492871e-07,
|
| 14627 |
+
"loss": 0.003,
|
| 14628 |
+
"reward": 2.015306234359741,
|
| 14629 |
+
"reward_std": 0.05134067311882973,
|
| 14630 |
+
"rewards/accuracy_reward": 0.8367346525192261,
|
| 14631 |
+
"rewards/format_reward": 1.0,
|
| 14632 |
+
"step": 914,
|
| 14633 |
+
"temporal_rewards": 0.5
|
| 14634 |
+
},
|
| 14635 |
+
{
|
| 14636 |
+
"all_correct": 0.14285714285714285,
|
| 14637 |
+
"all_wrong": 0.14285714285714285,
|
| 14638 |
+
"completion_length": 407.08929443359375,
|
| 14639 |
+
"epoch": 0.03902085376775129,
|
| 14640 |
+
"grad_norm": 2.075396285314163,
|
| 14641 |
+
"kl": 0.058349609375,
|
| 14642 |
+
"learning_rate": 9.962477708828152e-07,
|
| 14643 |
+
"loss": 0.0023,
|
| 14644 |
+
"reward": 1.505526065826416,
|
| 14645 |
+
"reward_std": 0.31088876724243164,
|
| 14646 |
+
"rewards/accuracy_reward": 0.37695467472076416,
|
| 14647 |
+
"rewards/format_reward": 1.0,
|
| 14648 |
+
"step": 915,
|
| 14649 |
+
"temporal_rewards": 0.5
|
| 14650 |
+
},
|
| 14651 |
+
{
|
| 14652 |
+
"all_correct": 0.2857142857142857,
|
| 14653 |
+
"all_wrong": 0.14285714285714285,
|
| 14654 |
+
"completion_length": 363.9285888671875,
|
| 14655 |
+
"epoch": 0.03906349950957397,
|
| 14656 |
+
"grad_norm": 2.0530572811195458,
|
| 14657 |
+
"kl": 0.07958984375,
|
| 14658 |
+
"learning_rate": 9.9623957510897e-07,
|
| 14659 |
+
"loss": 0.0032,
|
| 14660 |
+
"reward": 1.8374649286270142,
|
| 14661 |
+
"reward_std": 0.30604711174964905,
|
| 14662 |
+
"rewards/accuracy_reward": 0.6463934183120728,
|
| 14663 |
+
"rewards/format_reward": 1.0,
|
| 14664 |
+
"step": 916,
|
| 14665 |
+
"temporal_rewards": 0.6428571343421936
|
| 14666 |
+
},
|
| 14667 |
+
{
|
| 14668 |
+
"all_correct": 0.5714285714285714,
|
| 14669 |
+
"all_wrong": 0.0,
|
| 14670 |
+
"completion_length": 421.2321472167969,
|
| 14671 |
+
"epoch": 0.03910614525139665,
|
| 14672 |
+
"grad_norm": 1.8443845860137627,
|
| 14673 |
+
"kl": 0.06787109375,
|
| 14674 |
+
"learning_rate": 9.962313704278981e-07,
|
| 14675 |
+
"loss": 0.0027,
|
| 14676 |
+
"reward": 2.0418219566345215,
|
| 14677 |
+
"reward_std": 0.21194705367088318,
|
| 14678 |
+
"rewards/accuracy_reward": 0.7382504940032959,
|
| 14679 |
+
"rewards/format_reward": 1.0,
|
| 14680 |
+
"step": 917,
|
| 14681 |
+
"temporal_rewards": 0.7857142686843872
|
| 14682 |
+
},
|
| 14683 |
+
{
|
| 14684 |
+
"all_correct": 0.0,
|
| 14685 |
+
"all_wrong": 0.0,
|
| 14686 |
+
"completion_length": 402.58929443359375,
|
| 14687 |
+
"epoch": 0.03914879099321933,
|
| 14688 |
+
"grad_norm": 1.8829320648015981,
|
| 14689 |
+
"kl": 0.05712890625,
|
| 14690 |
+
"learning_rate": 9.962231568397472e-07,
|
| 14691 |
+
"loss": 0.0023,
|
| 14692 |
+
"reward": 1.5507712364196777,
|
| 14693 |
+
"reward_std": 0.21578750014305115,
|
| 14694 |
+
"rewards/accuracy_reward": 0.43291404843330383,
|
| 14695 |
+
"rewards/format_reward": 1.0,
|
| 14696 |
+
"step": 918,
|
| 14697 |
+
"temporal_rewards": 0.5
|
| 14698 |
+
},
|
| 14699 |
+
{
|
| 14700 |
+
"all_correct": 0.2857142857142857,
|
| 14701 |
+
"all_wrong": 0.0,
|
| 14702 |
+
"completion_length": 403.4821472167969,
|
| 14703 |
+
"epoch": 0.039191436735042,
|
| 14704 |
+
"grad_norm": 1.6641626617634653,
|
| 14705 |
+
"kl": 0.0732421875,
|
| 14706 |
+
"learning_rate": 9.96214934344665e-07,
|
| 14707 |
+
"loss": 0.0029,
|
| 14708 |
+
"reward": 1.9780364036560059,
|
| 14709 |
+
"reward_std": 0.2805452048778534,
|
| 14710 |
+
"rewards/accuracy_reward": 0.6637506484985352,
|
| 14711 |
+
"rewards/format_reward": 1.0,
|
| 14712 |
+
"step": 919,
|
| 14713 |
+
"temporal_rewards": 0.714285671710968
|
| 14714 |
+
},
|
| 14715 |
+
{
|
| 14716 |
+
"all_correct": 0.2857142857142857,
|
| 14717 |
+
"all_wrong": 0.14285714285714285,
|
| 14718 |
+
"completion_length": 406.5357360839844,
|
| 14719 |
+
"epoch": 0.03923408247686468,
|
| 14720 |
+
"grad_norm": 1.7210279625723943,
|
| 14721 |
+
"kl": 0.064453125,
|
| 14722 |
+
"learning_rate": 9.962067029427983e-07,
|
| 14723 |
+
"loss": 0.0026,
|
| 14724 |
+
"reward": 1.669837236404419,
|
| 14725 |
+
"reward_std": 0.16734497249126434,
|
| 14726 |
+
"rewards/accuracy_reward": 0.5216229557991028,
|
| 14727 |
+
"rewards/format_reward": 1.0,
|
| 14728 |
+
"step": 920,
|
| 14729 |
+
"temporal_rewards": 0.5714285373687744
|
| 14730 |
+
},
|
| 14731 |
+
{
|
| 14732 |
+
"all_correct": 0.14285714285714285,
|
| 14733 |
+
"all_wrong": 0.2857142857142857,
|
| 14734 |
+
"completion_length": 470.5000305175781,
|
| 14735 |
+
"epoch": 0.039276728218687364,
|
| 14736 |
+
"grad_norm": 1.2527360686901718,
|
| 14737 |
+
"kl": 0.05322265625,
|
| 14738 |
+
"learning_rate": 9.961984626342956e-07,
|
| 14739 |
+
"loss": 0.0021,
|
| 14740 |
+
"reward": 1.4912724494934082,
|
| 14741 |
+
"reward_std": 0.09840281307697296,
|
| 14742 |
+
"rewards/accuracy_reward": 0.3662723898887634,
|
| 14743 |
+
"rewards/format_reward": 1.0,
|
| 14744 |
+
"step": 921,
|
| 14745 |
+
"temporal_rewards": 0.5714285373687744
|
| 14746 |
+
},
|
| 14747 |
+
{
|
| 14748 |
+
"all_correct": 0.42857142857142855,
|
| 14749 |
+
"all_wrong": 0.14285714285714285,
|
| 14750 |
+
"completion_length": 447.6607360839844,
|
| 14751 |
+
"epoch": 0.039319373960510044,
|
| 14752 |
+
"grad_norm": 1.6881157347849156,
|
| 14753 |
+
"kl": 0.05615234375,
|
| 14754 |
+
"learning_rate": 9.961902134193045e-07,
|
| 14755 |
+
"loss": 0.0022,
|
| 14756 |
+
"reward": 1.6204907894134521,
|
| 14757 |
+
"reward_std": 0.10691835731267929,
|
| 14758 |
+
"rewards/accuracy_reward": 0.5097763538360596,
|
| 14759 |
+
"rewards/format_reward": 1.0,
|
| 14760 |
+
"step": 922,
|
| 14761 |
+
"temporal_rewards": 0.5
|
| 14762 |
+
},
|
| 14763 |
+
{
|
| 14764 |
+
"all_correct": 0.14285714285714285,
|
| 14765 |
+
"all_wrong": 0.14285714285714285,
|
| 14766 |
+
"completion_length": 424.64288330078125,
|
| 14767 |
+
"epoch": 0.039362019702332725,
|
| 14768 |
+
"grad_norm": 1.4525888204858919,
|
| 14769 |
+
"kl": 0.06591796875,
|
| 14770 |
+
"learning_rate": 9.96181955297973e-07,
|
| 14771 |
+
"loss": 0.0026,
|
| 14772 |
+
"reward": 1.608081579208374,
|
| 14773 |
+
"reward_std": 0.3661433458328247,
|
| 14774 |
+
"rewards/accuracy_reward": 0.45808160305023193,
|
| 14775 |
+
"rewards/format_reward": 1.0,
|
| 14776 |
+
"step": 923,
|
| 14777 |
+
"temporal_rewards": 0.5714285373687744
|
| 14778 |
+
},
|
| 14779 |
+
{
|
| 14780 |
+
"all_correct": 0.42857142857142855,
|
| 14781 |
+
"all_wrong": 0.2857142857142857,
|
| 14782 |
+
"completion_length": 427.4464416503906,
|
| 14783 |
+
"epoch": 0.0394046654441554,
|
| 14784 |
+
"grad_norm": 2.6242606472855163,
|
| 14785 |
+
"kl": 0.06787109375,
|
| 14786 |
+
"learning_rate": 9.961736882704497e-07,
|
| 14787 |
+
"loss": 0.0027,
|
| 14788 |
+
"reward": 1.65829336643219,
|
| 14789 |
+
"reward_std": 0.12387296557426453,
|
| 14790 |
+
"rewards/accuracy_reward": 0.5118646621704102,
|
| 14791 |
+
"rewards/format_reward": 1.0,
|
| 14792 |
+
"step": 924,
|
| 14793 |
+
"temporal_rewards": 0.6428571343421936
|
| 14794 |
+
},
|
| 14795 |
+
{
|
| 14796 |
+
"all_correct": 0.2857142857142857,
|
| 14797 |
+
"all_wrong": 0.0,
|
| 14798 |
+
"completion_length": 428.0714416503906,
|
| 14799 |
+
"epoch": 0.03944731118597808,
|
| 14800 |
+
"grad_norm": 2.012136982924822,
|
| 14801 |
+
"kl": 0.056640625,
|
| 14802 |
+
"learning_rate": 9.961654123368824e-07,
|
| 14803 |
+
"loss": 0.0023,
|
| 14804 |
+
"reward": 1.6333997249603271,
|
| 14805 |
+
"reward_std": 0.11353455483913422,
|
| 14806 |
+
"rewards/accuracy_reward": 0.42268532514572144,
|
| 14807 |
+
"rewards/format_reward": 1.0,
|
| 14808 |
+
"step": 925,
|
| 14809 |
+
"temporal_rewards": 0.6428571343421936
|
| 14810 |
+
},
|
| 14811 |
+
{
|
| 14812 |
+
"all_correct": 0.2857142857142857,
|
| 14813 |
+
"all_wrong": 0.14285714285714285,
|
| 14814 |
+
"completion_length": 385.1964416503906,
|
| 14815 |
+
"epoch": 0.03948995692780076,
|
| 14816 |
+
"grad_norm": 1.6158760149356628,
|
| 14817 |
+
"kl": 0.078125,
|
| 14818 |
+
"learning_rate": 9.9615712749742e-07,
|
| 14819 |
+
"loss": 0.0031,
|
| 14820 |
+
"reward": 1.5787651538848877,
|
| 14821 |
+
"reward_std": 0.1297575682401657,
|
| 14822 |
+
"rewards/accuracy_reward": 0.45019370317459106,
|
| 14823 |
+
"rewards/format_reward": 1.0,
|
| 14824 |
+
"step": 926,
|
| 14825 |
+
"temporal_rewards": 0.5714285373687744
|
| 14826 |
+
},
|
| 14827 |
+
{
|
| 14828 |
+
"all_correct": 0.14285714285714285,
|
| 14829 |
+
"all_wrong": 0.14285714285714285,
|
| 14830 |
+
"completion_length": 430.3571472167969,
|
| 14831 |
+
"epoch": 0.03953260266962344,
|
| 14832 |
+
"grad_norm": 5.925694109606645,
|
| 14833 |
+
"kl": 0.0654296875,
|
| 14834 |
+
"learning_rate": 9.961488337522113e-07,
|
| 14835 |
+
"loss": 0.0026,
|
| 14836 |
+
"reward": 1.468300223350525,
|
| 14837 |
+
"reward_std": 0.21330617368221283,
|
| 14838 |
+
"rewards/accuracy_reward": 0.38080018758773804,
|
| 14839 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 14840 |
+
"step": 927,
|
| 14841 |
+
"temporal_rewards": 0.5
|
| 14842 |
+
},
|
| 14843 |
+
{
|
| 14844 |
+
"all_correct": 0.42857142857142855,
|
| 14845 |
+
"all_wrong": 0.2857142857142857,
|
| 14846 |
+
"completion_length": 424.7321472167969,
|
| 14847 |
+
"epoch": 0.039575248411446114,
|
| 14848 |
+
"grad_norm": 1.3415082894145172,
|
| 14849 |
+
"kl": 0.06298828125,
|
| 14850 |
+
"learning_rate": 9.96140531101405e-07,
|
| 14851 |
+
"loss": 0.0025,
|
| 14852 |
+
"reward": 1.7482143640518188,
|
| 14853 |
+
"reward_std": 0.21537911891937256,
|
| 14854 |
+
"rewards/accuracy_reward": 0.625,
|
| 14855 |
+
"rewards/format_reward": 1.0,
|
| 14856 |
+
"step": 928,
|
| 14857 |
+
"temporal_rewards": 0.5714285373687744
|
| 14858 |
+
},
|
| 14859 |
+
{
|
| 14860 |
+
"all_correct": 0.0,
|
| 14861 |
+
"all_wrong": 0.14285714285714285,
|
| 14862 |
+
"completion_length": 390.9464416503906,
|
| 14863 |
+
"epoch": 0.039617894153268794,
|
| 14864 |
+
"grad_norm": 2.2955946940227503,
|
| 14865 |
+
"kl": 0.055908203125,
|
| 14866 |
+
"learning_rate": 9.961322195451497e-07,
|
| 14867 |
+
"loss": 0.0022,
|
| 14868 |
+
"reward": 1.515081524848938,
|
| 14869 |
+
"reward_std": 0.18819458782672882,
|
| 14870 |
+
"rewards/accuracy_reward": 0.2865099012851715,
|
| 14871 |
+
"rewards/format_reward": 1.0,
|
| 14872 |
+
"step": 929,
|
| 14873 |
+
"temporal_rewards": 0.714285671710968
|
| 14874 |
+
},
|
| 14875 |
+
{
|
| 14876 |
+
"all_correct": 0.2857142857142857,
|
| 14877 |
+
"all_wrong": 0.14285714285714285,
|
| 14878 |
+
"completion_length": 389.7500305175781,
|
| 14879 |
+
"epoch": 0.039660539895091475,
|
| 14880 |
+
"grad_norm": 6.008457943376909,
|
| 14881 |
+
"kl": 0.06591796875,
|
| 14882 |
+
"learning_rate": 9.961238990835957e-07,
|
| 14883 |
+
"loss": 0.0026,
|
| 14884 |
+
"reward": 1.6996906995773315,
|
| 14885 |
+
"reward_std": 0.2757183313369751,
|
| 14886 |
+
"rewards/accuracy_reward": 0.5104049444198608,
|
| 14887 |
+
"rewards/format_reward": 1.0,
|
| 14888 |
+
"step": 930,
|
| 14889 |
+
"temporal_rewards": 0.714285671710968
|
| 14890 |
+
},
|
| 14891 |
+
{
|
| 14892 |
+
"all_correct": 0.5714285714285714,
|
| 14893 |
+
"all_wrong": 0.14285714285714285,
|
| 14894 |
+
"completion_length": 400.0000305175781,
|
| 14895 |
+
"epoch": 0.039703185636914155,
|
| 14896 |
+
"grad_norm": 1.4843921910288904,
|
| 14897 |
+
"kl": 0.07080078125,
|
| 14898 |
+
"learning_rate": 9.961155697168913e-07,
|
| 14899 |
+
"loss": 0.0028,
|
| 14900 |
+
"reward": 1.7803571224212646,
|
| 14901 |
+
"reward_std": 0.16438372433185577,
|
| 14902 |
+
"rewards/accuracy_reward": 0.6071428656578064,
|
| 14903 |
+
"rewards/format_reward": 1.0,
|
| 14904 |
+
"step": 931,
|
| 14905 |
+
"temporal_rewards": 0.714285671710968
|
| 14906 |
+
},
|
| 14907 |
+
{
|
| 14908 |
+
"all_correct": 0.0,
|
| 14909 |
+
"all_wrong": 0.0,
|
| 14910 |
+
"completion_length": 439.89288330078125,
|
| 14911 |
+
"epoch": 0.039745831378736836,
|
| 14912 |
+
"grad_norm": 2.212969575646282,
|
| 14913 |
+
"kl": 0.044189453125,
|
| 14914 |
+
"learning_rate": 9.961072314451865e-07,
|
| 14915 |
+
"loss": 0.0018,
|
| 14916 |
+
"reward": 1.5589354038238525,
|
| 14917 |
+
"reward_std": 0.28393542766571045,
|
| 14918 |
+
"rewards/accuracy_reward": 0.3982209861278534,
|
| 14919 |
+
"rewards/format_reward": 1.0,
|
| 14920 |
+
"step": 932,
|
| 14921 |
+
"temporal_rewards": 0.5714285373687744
|
| 14922 |
+
},
|
| 14923 |
+
{
|
| 14924 |
+
"all_correct": 0.0,
|
| 14925 |
+
"all_wrong": 0.14285714285714285,
|
| 14926 |
+
"completion_length": 464.21429443359375,
|
| 14927 |
+
"epoch": 0.03978847712055951,
|
| 14928 |
+
"grad_norm": 1.53116470188613,
|
| 14929 |
+
"kl": 0.034423828125,
|
| 14930 |
+
"learning_rate": 9.960988842686308e-07,
|
| 14931 |
+
"loss": 0.0014,
|
| 14932 |
+
"reward": 1.314236044883728,
|
| 14933 |
+
"reward_std": 0.1616557240486145,
|
| 14934 |
+
"rewards/accuracy_reward": 0.18923597037792206,
|
| 14935 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 14936 |
+
"step": 933,
|
| 14937 |
+
"temporal_rewards": 0.5714285373687744
|
| 14938 |
+
},
|
| 14939 |
+
{
|
| 14940 |
+
"all_correct": 0.2857142857142857,
|
| 14941 |
+
"all_wrong": 0.14285714285714285,
|
| 14942 |
+
"completion_length": 404.08929443359375,
|
| 14943 |
+
"epoch": 0.03983112286238219,
|
| 14944 |
+
"grad_norm": 1.8325447090521023,
|
| 14945 |
+
"kl": 0.0703125,
|
| 14946 |
+
"learning_rate": 9.96090528187374e-07,
|
| 14947 |
+
"loss": 0.0028,
|
| 14948 |
+
"reward": 1.8988735675811768,
|
| 14949 |
+
"reward_std": 0.2149917185306549,
|
| 14950 |
+
"rewards/accuracy_reward": 0.6917307376861572,
|
| 14951 |
+
"rewards/format_reward": 1.0,
|
| 14952 |
+
"step": 934,
|
| 14953 |
+
"temporal_rewards": 0.6428571343421936
|
| 14954 |
+
},
|
| 14955 |
+
{
|
| 14956 |
+
"all_correct": 0.2857142857142857,
|
| 14957 |
+
"all_wrong": 0.14285714285714285,
|
| 14958 |
+
"completion_length": 412.58929443359375,
|
| 14959 |
+
"epoch": 0.03987376860420487,
|
| 14960 |
+
"grad_norm": 3.4615665759091536,
|
| 14961 |
+
"kl": 0.0703125,
|
| 14962 |
+
"learning_rate": 9.960821632015666e-07,
|
| 14963 |
+
"loss": 0.0028,
|
| 14964 |
+
"reward": 1.7402606010437012,
|
| 14965 |
+
"reward_std": 0.20666787028312683,
|
| 14966 |
+
"rewards/accuracy_reward": 0.517046332359314,
|
| 14967 |
+
"rewards/format_reward": 1.0,
|
| 14968 |
+
"step": 935,
|
| 14969 |
+
"temporal_rewards": 0.714285671710968
|
| 14970 |
+
},
|
| 14971 |
+
{
|
| 14972 |
+
"all_correct": 0.0,
|
| 14973 |
+
"all_wrong": 0.2857142857142857,
|
| 14974 |
+
"completion_length": 365.6964416503906,
|
| 14975 |
+
"epoch": 0.03991641434602755,
|
| 14976 |
+
"grad_norm": 5.181621142955212,
|
| 14977 |
+
"kl": 0.058837890625,
|
| 14978 |
+
"learning_rate": 9.96073789311358e-07,
|
| 14979 |
+
"loss": 0.0024,
|
| 14980 |
+
"reward": 1.2537422180175781,
|
| 14981 |
+
"reward_std": 0.16066715121269226,
|
| 14982 |
+
"rewards/accuracy_reward": 0.16088514029979706,
|
| 14983 |
+
"rewards/format_reward": 1.0,
|
| 14984 |
+
"step": 936,
|
| 14985 |
+
"temporal_rewards": 0.5
|
| 14986 |
+
},
|
| 14987 |
+
{
|
| 14988 |
+
"all_correct": 0.0,
|
| 14989 |
+
"all_wrong": 0.0,
|
| 14990 |
+
"completion_length": 411.5714416503906,
|
| 14991 |
+
"epoch": 0.039959060087850225,
|
| 14992 |
+
"grad_norm": 1.7794049381076447,
|
| 14993 |
+
"kl": 0.06494140625,
|
| 14994 |
+
"learning_rate": 9.960654065168988e-07,
|
| 14995 |
+
"loss": 0.0026,
|
| 14996 |
+
"reward": 1.7535653114318848,
|
| 14997 |
+
"reward_std": 0.24865297973155975,
|
| 14998 |
+
"rewards/accuracy_reward": 0.5571366548538208,
|
| 14999 |
+
"rewards/format_reward": 1.0,
|
| 15000 |
+
"step": 937,
|
| 15001 |
+
"temporal_rewards": 0.6428571343421936
|
| 15002 |
+
},
|
| 15003 |
+
{
|
| 15004 |
+
"all_correct": 0.2857142857142857,
|
| 15005 |
+
"all_wrong": 0.0,
|
| 15006 |
+
"completion_length": 450.4107360839844,
|
| 15007 |
+
"epoch": 0.040001705829672905,
|
| 15008 |
+
"grad_norm": 1.6923568537310802,
|
| 15009 |
+
"kl": 0.051513671875,
|
| 15010 |
+
"learning_rate": 9.960570148183395e-07,
|
| 15011 |
+
"loss": 0.0021,
|
| 15012 |
+
"reward": 1.590727686882019,
|
| 15013 |
+
"reward_std": 0.33292099833488464,
|
| 15014 |
+
"rewards/accuracy_reward": 0.5085846781730652,
|
| 15015 |
+
"rewards/format_reward": 0.9642857313156128,
|
| 15016 |
+
"step": 938,
|
| 15017 |
+
"temporal_rewards": 0.5714285373687744
|
| 15018 |
+
},
|
| 15019 |
+
{
|
| 15020 |
+
"all_correct": 0.42857142857142855,
|
| 15021 |
+
"all_wrong": 0.2857142857142857,
|
| 15022 |
+
"completion_length": 394.89288330078125,
|
| 15023 |
+
"epoch": 0.040044351571495586,
|
| 15024 |
+
"grad_norm": 1.6035153750518245,
|
| 15025 |
+
"kl": 0.07177734375,
|
| 15026 |
+
"learning_rate": 9.96048614215831e-07,
|
| 15027 |
+
"loss": 0.0029,
|
| 15028 |
+
"reward": 1.7771177291870117,
|
| 15029 |
+
"reward_std": 0.08045493066310883,
|
| 15030 |
+
"rewards/accuracy_reward": 0.5771176218986511,
|
| 15031 |
+
"rewards/format_reward": 1.0,
|
| 15032 |
+
"step": 939,
|
| 15033 |
+
"temporal_rewards": 0.6428571343421936
|
| 15034 |
+
},
|
| 15035 |
+
{
|
| 15036 |
+
"all_correct": 0.2857142857142857,
|
| 15037 |
+
"all_wrong": 0.42857142857142855,
|
| 15038 |
+
"completion_length": 385.76788330078125,
|
| 15039 |
+
"epoch": 0.040086997313318266,
|
| 15040 |
+
"grad_norm": 1.6286891649077688,
|
| 15041 |
+
"kl": 0.0732421875,
|
| 15042 |
+
"learning_rate": 9.960402047095235e-07,
|
| 15043 |
+
"loss": 0.0029,
|
| 15044 |
+
"reward": 1.5035713911056519,
|
| 15045 |
+
"reward_std": 0.21551916003227234,
|
| 15046 |
+
"rewards/accuracy_reward": 0.392857164144516,
|
| 15047 |
+
"rewards/format_reward": 1.0,
|
| 15048 |
+
"step": 940,
|
| 15049 |
+
"temporal_rewards": 0.714285671710968
|
| 15050 |
+
},
|
| 15051 |
+
{
|
| 15052 |
+
"all_correct": 0.42857142857142855,
|
| 15053 |
+
"all_wrong": 0.0,
|
| 15054 |
+
"completion_length": 385.4821472167969,
|
| 15055 |
+
"epoch": 0.04012964305514095,
|
| 15056 |
+
"grad_norm": 3.8696943787473446,
|
| 15057 |
+
"kl": 0.07763671875,
|
| 15058 |
+
"learning_rate": 9.960317862995684e-07,
|
| 15059 |
+
"loss": 0.0031,
|
| 15060 |
+
"reward": 1.9869627952575684,
|
| 15061 |
+
"reward_std": 0.24162130057811737,
|
| 15062 |
+
"rewards/accuracy_reward": 0.8012484908103943,
|
| 15063 |
+
"rewards/format_reward": 1.0,
|
| 15064 |
+
"step": 941,
|
| 15065 |
+
"temporal_rewards": 0.5
|
| 15066 |
+
},
|
| 15067 |
+
{
|
| 15068 |
+
"all_correct": 0.14285714285714285,
|
| 15069 |
+
"all_wrong": 0.0,
|
| 15070 |
+
"completion_length": 420.4285888671875,
|
| 15071 |
+
"epoch": 0.04017228879696362,
|
| 15072 |
+
"grad_norm": 3.036793601101903,
|
| 15073 |
+
"kl": 0.0517578125,
|
| 15074 |
+
"learning_rate": 9.960233589861167e-07,
|
| 15075 |
+
"loss": 0.0021,
|
| 15076 |
+
"reward": 1.6199066638946533,
|
| 15077 |
+
"reward_std": 0.28489503264427185,
|
| 15078 |
+
"rewards/accuracy_reward": 0.46990665793418884,
|
| 15079 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 15080 |
+
"step": 942,
|
| 15081 |
+
"temporal_rewards": 0.5
|
| 15082 |
+
},
|
| 15083 |
+
{
|
| 15084 |
+
"all_correct": 0.42857142857142855,
|
| 15085 |
+
"all_wrong": 0.14285714285714285,
|
| 15086 |
+
"completion_length": 370.8035888671875,
|
| 15087 |
+
"epoch": 0.0402149345387863,
|
| 15088 |
+
"grad_norm": 1.6650675145492482,
|
| 15089 |
+
"kl": 0.0712890625,
|
| 15090 |
+
"learning_rate": 9.960149227693196e-07,
|
| 15091 |
+
"loss": 0.0028,
|
| 15092 |
+
"reward": 1.9375001192092896,
|
| 15093 |
+
"reward_std": 0.19786998629570007,
|
| 15094 |
+
"rewards/accuracy_reward": 0.6964285969734192,
|
| 15095 |
+
"rewards/format_reward": 1.0,
|
| 15096 |
+
"step": 943,
|
| 15097 |
+
"temporal_rewards": 0.714285671710968
|
| 15098 |
+
},
|
| 15099 |
+
{
|
| 15100 |
+
"all_correct": 0.14285714285714285,
|
| 15101 |
+
"all_wrong": 0.0,
|
| 15102 |
+
"completion_length": 431.39288330078125,
|
| 15103 |
+
"epoch": 0.04025758028060898,
|
| 15104 |
+
"grad_norm": 1.609350149538502,
|
| 15105 |
+
"kl": 0.041015625,
|
| 15106 |
+
"learning_rate": 9.960064776493286e-07,
|
| 15107 |
+
"loss": 0.0016,
|
| 15108 |
+
"reward": 1.6551604270935059,
|
| 15109 |
+
"reward_std": 0.2681572437286377,
|
| 15110 |
+
"rewards/accuracy_reward": 0.4730173945426941,
|
| 15111 |
+
"rewards/format_reward": 1.0,
|
| 15112 |
+
"step": 944,
|
| 15113 |
+
"temporal_rewards": 0.5714285373687744
|
| 15114 |
+
},
|
| 15115 |
+
{
|
| 15116 |
+
"all_correct": 0.42857142857142855,
|
| 15117 |
+
"all_wrong": 0.0,
|
| 15118 |
+
"completion_length": 419.8571472167969,
|
| 15119 |
+
"epoch": 0.04030022602243166,
|
| 15120 |
+
"grad_norm": 1.9610773135593247,
|
| 15121 |
+
"kl": 0.05908203125,
|
| 15122 |
+
"learning_rate": 9.95998023626295e-07,
|
| 15123 |
+
"loss": 0.0024,
|
| 15124 |
+
"reward": 1.7684540748596191,
|
| 15125 |
+
"reward_std": 0.28671181201934814,
|
| 15126 |
+
"rewards/accuracy_reward": 0.7148826122283936,
|
| 15127 |
+
"rewards/format_reward": 0.9464285969734192,
|
| 15128 |
+
"step": 945,
|
| 15129 |
+
"temporal_rewards": 0.5
|
| 15130 |
+
},
|
| 15131 |
+
{
|
| 15132 |
+
"all_correct": 0.0,
|
| 15133 |
+
"all_wrong": 0.42857142857142855,
|
| 15134 |
+
"completion_length": 458.89288330078125,
|
| 15135 |
+
"epoch": 0.040342871764254336,
|
| 15136 |
+
"grad_norm": 1.470090956750234,
|
| 15137 |
+
"kl": 0.044921875,
|
| 15138 |
+
"learning_rate": 9.959895607003712e-07,
|
| 15139 |
+
"loss": 0.0018,
|
| 15140 |
+
"reward": 1.2988783121109009,
|
| 15141 |
+
"reward_std": 0.26279887557029724,
|
| 15142 |
+
"rewards/accuracy_reward": 0.2560211420059204,
|
| 15143 |
+
"rewards/format_reward": 0.9642857313156128,
|
| 15144 |
+
"step": 946,
|
| 15145 |
+
"temporal_rewards": 0.5714285373687744
|
| 15146 |
+
},
|
| 15147 |
+
{
|
| 15148 |
+
"all_correct": 0.14285714285714285,
|
| 15149 |
+
"all_wrong": 0.14285714285714285,
|
| 15150 |
+
"completion_length": 416.7500305175781,
|
| 15151 |
+
"epoch": 0.040385517506077016,
|
| 15152 |
+
"grad_norm": 4.425443340525767,
|
| 15153 |
+
"kl": 0.057373046875,
|
| 15154 |
+
"learning_rate": 9.959810888717084e-07,
|
| 15155 |
+
"loss": 0.0023,
|
| 15156 |
+
"reward": 1.3810738325119019,
|
| 15157 |
+
"reward_std": 0.16069242358207703,
|
| 15158 |
+
"rewards/accuracy_reward": 0.33107370138168335,
|
| 15159 |
+
"rewards/format_reward": 1.0,
|
| 15160 |
+
"step": 947,
|
| 15161 |
+
"temporal_rewards": 0.5
|
| 15162 |
+
},
|
| 15163 |
+
{
|
| 15164 |
+
"all_correct": 0.42857142857142855,
|
| 15165 |
+
"all_wrong": 0.2857142857142857,
|
| 15166 |
+
"completion_length": 390.5357360839844,
|
| 15167 |
+
"epoch": 0.0404281632478997,
|
| 15168 |
+
"grad_norm": 1.4929929891147553,
|
| 15169 |
+
"kl": 0.0625,
|
| 15170 |
+
"learning_rate": 9.959726081404588e-07,
|
| 15171 |
+
"loss": 0.0025,
|
| 15172 |
+
"reward": 1.6071429252624512,
|
| 15173 |
+
"reward_std": 0.1878172904253006,
|
| 15174 |
+
"rewards/accuracy_reward": 0.4821428656578064,
|
| 15175 |
+
"rewards/format_reward": 1.0,
|
| 15176 |
+
"step": 948,
|
| 15177 |
+
"temporal_rewards": 0.5714285373687744
|
| 15178 |
+
},
|
| 15179 |
+
{
|
| 15180 |
+
"all_correct": 0.2857142857142857,
|
| 15181 |
+
"all_wrong": 0.0,
|
| 15182 |
+
"completion_length": 440.96429443359375,
|
| 15183 |
+
"epoch": 0.04047080898972238,
|
| 15184 |
+
"grad_norm": 1.603958163585114,
|
| 15185 |
+
"kl": 0.045654296875,
|
| 15186 |
+
"learning_rate": 9.959641185067753e-07,
|
| 15187 |
+
"loss": 0.0018,
|
| 15188 |
+
"reward": 1.7649989128112793,
|
| 15189 |
+
"reward_std": 0.29367154836654663,
|
| 15190 |
+
"rewards/accuracy_reward": 0.5578558444976807,
|
| 15191 |
+
"rewards/format_reward": 1.0,
|
| 15192 |
+
"step": 949,
|
| 15193 |
+
"temporal_rewards": 0.6428571343421936
|
| 15194 |
+
},
|
| 15195 |
+
{
|
| 15196 |
+
"all_correct": 0.14285714285714285,
|
| 15197 |
+
"all_wrong": 0.14285714285714285,
|
| 15198 |
+
"completion_length": 416.0000305175781,
|
| 15199 |
+
"epoch": 0.04051345473154506,
|
| 15200 |
+
"grad_norm": 2.006844817768613,
|
| 15201 |
+
"kl": 0.060791015625,
|
| 15202 |
+
"learning_rate": 9.959556199708094e-07,
|
| 15203 |
+
"loss": 0.0024,
|
| 15204 |
+
"reward": 1.5652376413345337,
|
| 15205 |
+
"reward_std": 0.18956170976161957,
|
| 15206 |
+
"rewards/accuracy_reward": 0.4366661608219147,
|
| 15207 |
+
"rewards/format_reward": 1.0,
|
| 15208 |
+
"step": 950,
|
| 15209 |
+
"temporal_rewards": 0.5
|
| 15210 |
+
},
|
| 15211 |
+
{
|
| 15212 |
+
"all_correct": 0.14285714285714285,
|
| 15213 |
+
"all_wrong": 0.42857142857142855,
|
| 15214 |
+
"completion_length": 418.1071472167969,
|
| 15215 |
+
"epoch": 0.04055610047336773,
|
| 15216 |
+
"grad_norm": 1.2787284891519373,
|
| 15217 |
+
"kl": 0.078125,
|
| 15218 |
+
"learning_rate": 9.95947112532714e-07,
|
| 15219 |
+
"loss": 0.0031,
|
| 15220 |
+
"reward": 1.4751147031784058,
|
| 15221 |
+
"reward_std": 0.04257712885737419,
|
| 15222 |
+
"rewards/accuracy_reward": 0.3858288824558258,
|
| 15223 |
+
"rewards/format_reward": 1.0,
|
| 15224 |
+
"step": 951,
|
| 15225 |
+
"temporal_rewards": 0.5714285373687744
|
| 15226 |
+
},
|
| 15227 |
+
{
|
| 15228 |
+
"all_correct": 0.14285714285714285,
|
| 15229 |
+
"all_wrong": 0.0,
|
| 15230 |
+
"completion_length": 405.96429443359375,
|
| 15231 |
+
"epoch": 0.04059874621519041,
|
| 15232 |
+
"grad_norm": 3.136161553844802,
|
| 15233 |
+
"kl": 0.0634765625,
|
| 15234 |
+
"learning_rate": 9.959385961926419e-07,
|
| 15235 |
+
"loss": 0.0025,
|
| 15236 |
+
"reward": 1.9592256546020508,
|
| 15237 |
+
"reward_std": 0.335843026638031,
|
| 15238 |
+
"rewards/accuracy_reward": 0.696725606918335,
|
| 15239 |
+
"rewards/format_reward": 1.0,
|
| 15240 |
+
"step": 952,
|
| 15241 |
+
"temporal_rewards": 0.714285671710968
|
| 15242 |
+
},
|
| 15243 |
+
{
|
| 15244 |
+
"all_correct": 0.2857142857142857,
|
| 15245 |
+
"all_wrong": 0.42857142857142855,
|
| 15246 |
+
"completion_length": 376.96429443359375,
|
| 15247 |
+
"epoch": 0.04064139195701309,
|
| 15248 |
+
"grad_norm": 3.219961953332857,
|
| 15249 |
+
"kl": 0.0673828125,
|
| 15250 |
+
"learning_rate": 9.959300709507459e-07,
|
| 15251 |
+
"loss": 0.0027,
|
| 15252 |
+
"reward": 1.4304946660995483,
|
| 15253 |
+
"reward_std": 0.1210612803697586,
|
| 15254 |
+
"rewards/accuracy_reward": 0.3447802662849426,
|
| 15255 |
+
"rewards/format_reward": 1.0,
|
| 15256 |
+
"step": 953,
|
| 15257 |
+
"temporal_rewards": 0.6428571343421936
|
| 15258 |
+
},
|
| 15259 |
+
{
|
| 15260 |
+
"all_correct": 0.14285714285714285,
|
| 15261 |
+
"all_wrong": 0.0,
|
| 15262 |
+
"completion_length": 398.1964416503906,
|
| 15263 |
+
"epoch": 0.04068403769883577,
|
| 15264 |
+
"grad_norm": 6.050142310929799,
|
| 15265 |
+
"kl": 0.0751953125,
|
| 15266 |
+
"learning_rate": 9.959215368071788e-07,
|
| 15267 |
+
"loss": 0.003,
|
| 15268 |
+
"reward": 1.7659056186676025,
|
| 15269 |
+
"reward_std": 0.4009357690811157,
|
| 15270 |
+
"rewards/accuracy_reward": 0.6051912307739258,
|
| 15271 |
+
"rewards/format_reward": 1.0,
|
| 15272 |
+
"step": 954,
|
| 15273 |
+
"temporal_rewards": 0.6428571343421936
|
| 15274 |
+
},
|
| 15275 |
+
{
|
| 15276 |
+
"all_correct": 0.5714285714285714,
|
| 15277 |
+
"all_wrong": 0.0,
|
| 15278 |
+
"completion_length": 416.14288330078125,
|
| 15279 |
+
"epoch": 0.04072668344065845,
|
| 15280 |
+
"grad_norm": 1.421808016451398,
|
| 15281 |
+
"kl": 0.0703125,
|
| 15282 |
+
"learning_rate": 9.959129937620943e-07,
|
| 15283 |
+
"loss": 0.0028,
|
| 15284 |
+
"reward": 1.9267857074737549,
|
| 15285 |
+
"reward_std": 0.2703368067741394,
|
| 15286 |
+
"rewards/accuracy_reward": 0.6964285969734192,
|
| 15287 |
+
"rewards/format_reward": 1.0,
|
| 15288 |
+
"step": 955,
|
| 15289 |
+
"temporal_rewards": 0.714285671710968
|
| 15290 |
+
},
|
| 15291 |
+
{
|
| 15292 |
+
"all_correct": 0.2857142857142857,
|
| 15293 |
+
"all_wrong": 0.0,
|
| 15294 |
+
"completion_length": 435.1964416503906,
|
| 15295 |
+
"epoch": 0.04076932918248113,
|
| 15296 |
+
"grad_norm": 2.6075666468048433,
|
| 15297 |
+
"kl": 0.05859375,
|
| 15298 |
+
"learning_rate": 9.95904441815645e-07,
|
| 15299 |
+
"loss": 0.0023,
|
| 15300 |
+
"reward": 1.968336820602417,
|
| 15301 |
+
"reward_std": 0.21630696952342987,
|
| 15302 |
+
"rewards/accuracy_reward": 0.705836832523346,
|
| 15303 |
+
"rewards/format_reward": 1.0,
|
| 15304 |
+
"step": 956,
|
| 15305 |
+
"temporal_rewards": 0.714285671710968
|
| 15306 |
+
},
|
| 15307 |
+
{
|
| 15308 |
+
"all_correct": 0.14285714285714285,
|
| 15309 |
+
"all_wrong": 0.14285714285714285,
|
| 15310 |
+
"completion_length": 438.39288330078125,
|
| 15311 |
+
"epoch": 0.04081197492430381,
|
| 15312 |
+
"grad_norm": 1.6378505934638343,
|
| 15313 |
+
"kl": 0.0595703125,
|
| 15314 |
+
"learning_rate": 9.958958809679852e-07,
|
| 15315 |
+
"loss": 0.0024,
|
| 15316 |
+
"reward": 1.7164154052734375,
|
| 15317 |
+
"reward_std": 0.2044200748205185,
|
| 15318 |
+
"rewards/accuracy_reward": 0.527129590511322,
|
| 15319 |
+
"rewards/format_reward": 1.0,
|
| 15320 |
+
"step": 957,
|
| 15321 |
+
"temporal_rewards": 0.714285671710968
|
| 15322 |
+
},
|
| 15323 |
+
{
|
| 15324 |
+
"all_correct": 0.5714285714285714,
|
| 15325 |
+
"all_wrong": 0.0,
|
| 15326 |
+
"completion_length": 408.5357360839844,
|
| 15327 |
+
"epoch": 0.04085462066612649,
|
| 15328 |
+
"grad_norm": 1.8908338290164786,
|
| 15329 |
+
"kl": 0.061767578125,
|
| 15330 |
+
"learning_rate": 9.958873112192681e-07,
|
| 15331 |
+
"loss": 0.0025,
|
| 15332 |
+
"reward": 2.048797607421875,
|
| 15333 |
+
"reward_std": 0.04435715451836586,
|
| 15334 |
+
"rewards/accuracy_reward": 0.6916548013687134,
|
| 15335 |
+
"rewards/format_reward": 1.0,
|
| 15336 |
+
"step": 958,
|
| 15337 |
+
"temporal_rewards": 0.7857142686843872
|
| 15338 |
+
},
|
| 15339 |
+
{
|
| 15340 |
+
"all_correct": 0.2857142857142857,
|
| 15341 |
+
"all_wrong": 0.14285714285714285,
|
| 15342 |
+
"completion_length": 383.0357360839844,
|
| 15343 |
+
"epoch": 0.04089726640794917,
|
| 15344 |
+
"grad_norm": 1.9667798712281923,
|
| 15345 |
+
"kl": 0.06884765625,
|
| 15346 |
+
"learning_rate": 9.958787325696477e-07,
|
| 15347 |
+
"loss": 0.0028,
|
| 15348 |
+
"reward": 1.722543716430664,
|
| 15349 |
+
"reward_std": 0.13343960046768188,
|
| 15350 |
+
"rewards/accuracy_reward": 0.5082579851150513,
|
| 15351 |
+
"rewards/format_reward": 1.0,
|
| 15352 |
+
"step": 959,
|
| 15353 |
+
"temporal_rewards": 0.5
|
| 15354 |
+
},
|
| 15355 |
+
{
|
| 15356 |
+
"all_correct": 0.2857142857142857,
|
| 15357 |
+
"all_wrong": 0.0,
|
| 15358 |
+
"completion_length": 415.9464416503906,
|
| 15359 |
+
"epoch": 0.04093991214977184,
|
| 15360 |
+
"grad_norm": 1.9316488617697056,
|
| 15361 |
+
"kl": 0.054931640625,
|
| 15362 |
+
"learning_rate": 9.958701450192777e-07,
|
| 15363 |
+
"loss": 0.0022,
|
| 15364 |
+
"reward": 1.7576582431793213,
|
| 15365 |
+
"reward_std": 0.09231801331043243,
|
| 15366 |
+
"rewards/accuracy_reward": 0.5112294554710388,
|
| 15367 |
+
"rewards/format_reward": 1.0,
|
| 15368 |
+
"step": 960,
|
| 15369 |
+
"temporal_rewards": 0.6428571343421936
|
| 15370 |
+
},
|
| 15371 |
+
{
|
| 15372 |
+
"all_correct": 0.42857142857142855,
|
| 15373 |
+
"all_wrong": 0.2857142857142857,
|
| 15374 |
+
"completion_length": 391.0357360839844,
|
| 15375 |
+
"epoch": 0.04098255789159452,
|
| 15376 |
+
"grad_norm": 2.226596533494856,
|
| 15377 |
+
"kl": 0.0693359375,
|
| 15378 |
+
"learning_rate": 9.958615485683124e-07,
|
| 15379 |
+
"loss": 0.0028,
|
| 15380 |
+
"reward": 1.8000001907348633,
|
| 15381 |
+
"reward_std": 0.21024802327156067,
|
| 15382 |
+
"rewards/accuracy_reward": 0.5714285969734192,
|
| 15383 |
+
"rewards/format_reward": 1.0,
|
| 15384 |
+
"step": 961,
|
| 15385 |
+
"temporal_rewards": 0.5714285373687744
|
| 15386 |
+
},
|
| 15387 |
+
{
|
| 15388 |
+
"all_correct": 0.2857142857142857,
|
| 15389 |
+
"all_wrong": 0.14285714285714285,
|
| 15390 |
+
"completion_length": 425.96429443359375,
|
| 15391 |
+
"epoch": 0.041025203633417204,
|
| 15392 |
+
"grad_norm": 1.4948037453733134,
|
| 15393 |
+
"kl": 0.04736328125,
|
| 15394 |
+
"learning_rate": 9.958529432169062e-07,
|
| 15395 |
+
"loss": 0.0019,
|
| 15396 |
+
"reward": 1.6663552522659302,
|
| 15397 |
+
"reward_std": 0.18109546601772308,
|
| 15398 |
+
"rewards/accuracy_reward": 0.43064096570014954,
|
| 15399 |
+
"rewards/format_reward": 1.0,
|
| 15400 |
+
"step": 962,
|
| 15401 |
+
"temporal_rewards": 0.714285671710968
|
| 15402 |
+
},
|
| 15403 |
+
{
|
| 15404 |
+
"all_correct": 0.2857142857142857,
|
| 15405 |
+
"all_wrong": 0.42857142857142855,
|
| 15406 |
+
"completion_length": 452.357177734375,
|
| 15407 |
+
"epoch": 0.041067849375239884,
|
| 15408 |
+
"grad_norm": 1.4417561901645564,
|
| 15409 |
+
"kl": 0.06298828125,
|
| 15410 |
+
"learning_rate": 9.958443289652137e-07,
|
| 15411 |
+
"loss": 0.0025,
|
| 15412 |
+
"reward": 1.5744065046310425,
|
| 15413 |
+
"reward_std": 0.1612720489501953,
|
| 15414 |
+
"rewards/accuracy_reward": 0.45297789573669434,
|
| 15415 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 15416 |
+
"step": 963,
|
| 15417 |
+
"temporal_rewards": 0.714285671710968
|
| 15418 |
+
},
|
| 15419 |
+
{
|
| 15420 |
+
"all_correct": 0.2857142857142857,
|
| 15421 |
+
"all_wrong": 0.5714285714285714,
|
| 15422 |
+
"completion_length": 411.1071472167969,
|
| 15423 |
+
"epoch": 0.04111049511706256,
|
| 15424 |
+
"grad_norm": 1.2820824961142303,
|
| 15425 |
+
"kl": 0.050048828125,
|
| 15426 |
+
"learning_rate": 9.95835705813389e-07,
|
| 15427 |
+
"loss": 0.002,
|
| 15428 |
+
"reward": 1.417178988456726,
|
| 15429 |
+
"reward_std": 0.09464232623577118,
|
| 15430 |
+
"rewards/accuracy_reward": 0.338607519865036,
|
| 15431 |
+
"rewards/format_reward": 0.9642857313156128,
|
| 15432 |
+
"step": 964,
|
| 15433 |
+
"temporal_rewards": 0.6428571343421936
|
| 15434 |
+
},
|
| 15435 |
+
{
|
| 15436 |
+
"all_correct": 0.42857142857142855,
|
| 15437 |
+
"all_wrong": 0.0,
|
| 15438 |
+
"completion_length": 377.39288330078125,
|
| 15439 |
+
"epoch": 0.04115314085888524,
|
| 15440 |
+
"grad_norm": 2.1119948944927778,
|
| 15441 |
+
"kl": 0.06982421875,
|
| 15442 |
+
"learning_rate": 9.958270737615876e-07,
|
| 15443 |
+
"loss": 0.0028,
|
| 15444 |
+
"reward": 2.0370266437530518,
|
| 15445 |
+
"reward_std": 0.15241330862045288,
|
| 15446 |
+
"rewards/accuracy_reward": 0.8263123035430908,
|
| 15447 |
+
"rewards/format_reward": 1.0,
|
| 15448 |
+
"step": 965,
|
| 15449 |
+
"temporal_rewards": 0.5714285373687744
|
| 15450 |
+
},
|
| 15451 |
+
{
|
| 15452 |
+
"all_correct": 0.14285714285714285,
|
| 15453 |
+
"all_wrong": 0.0,
|
| 15454 |
+
"completion_length": 411.08929443359375,
|
| 15455 |
+
"epoch": 0.04119578660070792,
|
| 15456 |
+
"grad_norm": 1.9724612440626825,
|
| 15457 |
+
"kl": 0.051513671875,
|
| 15458 |
+
"learning_rate": 9.958184328099636e-07,
|
| 15459 |
+
"loss": 0.0021,
|
| 15460 |
+
"reward": 1.7237517833709717,
|
| 15461 |
+
"reward_std": 0.2817230820655823,
|
| 15462 |
+
"rewards/accuracy_reward": 0.5362517237663269,
|
| 15463 |
+
"rewards/format_reward": 1.0,
|
| 15464 |
+
"step": 966,
|
| 15465 |
+
"temporal_rewards": 0.6428571343421936
|
| 15466 |
+
},
|
| 15467 |
+
{
|
| 15468 |
+
"all_correct": 0.2857142857142857,
|
| 15469 |
+
"all_wrong": 0.42857142857142855,
|
| 15470 |
+
"completion_length": 405.2500305175781,
|
| 15471 |
+
"epoch": 0.0412384323425306,
|
| 15472 |
+
"grad_norm": 1.217388155156064,
|
| 15473 |
+
"kl": 0.06787109375,
|
| 15474 |
+
"learning_rate": 9.958097829586727e-07,
|
| 15475 |
+
"loss": 0.0027,
|
| 15476 |
+
"reward": 1.5464287996292114,
|
| 15477 |
+
"reward_std": 0.22678472101688385,
|
| 15478 |
+
"rewards/accuracy_reward": 0.4285714626312256,
|
| 15479 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 15480 |
+
"step": 967,
|
| 15481 |
+
"temporal_rewards": 0.6428571343421936
|
| 15482 |
+
},
|
| 15483 |
+
{
|
| 15484 |
+
"all_correct": 0.2857142857142857,
|
| 15485 |
+
"all_wrong": 0.0,
|
| 15486 |
+
"completion_length": 412.0000305175781,
|
| 15487 |
+
"epoch": 0.04128107808435328,
|
| 15488 |
+
"grad_norm": 2.174976056324669,
|
| 15489 |
+
"kl": 0.0634765625,
|
| 15490 |
+
"learning_rate": 9.9580112420787e-07,
|
| 15491 |
+
"loss": 0.0025,
|
| 15492 |
+
"reward": 1.6597402095794678,
|
| 15493 |
+
"reward_std": 0.24509233236312866,
|
| 15494 |
+
"rewards/accuracy_reward": 0.48652589321136475,
|
| 15495 |
+
"rewards/format_reward": 1.0,
|
| 15496 |
+
"step": 968,
|
| 15497 |
+
"temporal_rewards": 0.5714285373687744
|
| 15498 |
+
},
|
| 15499 |
+
{
|
| 15500 |
+
"all_correct": 0.14285714285714285,
|
| 15501 |
+
"all_wrong": 0.14285714285714285,
|
| 15502 |
+
"completion_length": 368.1785888671875,
|
| 15503 |
+
"epoch": 0.041323723826175954,
|
| 15504 |
+
"grad_norm": 1.4286186443155269,
|
| 15505 |
+
"kl": 0.080078125,
|
| 15506 |
+
"learning_rate": 9.95792456557711e-07,
|
| 15507 |
+
"loss": 0.0032,
|
| 15508 |
+
"reward": 1.694699764251709,
|
| 15509 |
+
"reward_std": 0.28493639826774597,
|
| 15510 |
+
"rewards/accuracy_reward": 0.5911281108856201,
|
| 15511 |
+
"rewards/format_reward": 1.0,
|
| 15512 |
+
"step": 969,
|
| 15513 |
+
"temporal_rewards": 0.5714285373687744
|
| 15514 |
+
},
|
| 15515 |
+
{
|
| 15516 |
+
"all_correct": 0.14285714285714285,
|
| 15517 |
+
"all_wrong": 0.0,
|
| 15518 |
+
"completion_length": 373.4285888671875,
|
| 15519 |
+
"epoch": 0.041366369567998634,
|
| 15520 |
+
"grad_norm": 1.8366623611705846,
|
| 15521 |
+
"kl": 0.04931640625,
|
| 15522 |
+
"learning_rate": 9.957837800083512e-07,
|
| 15523 |
+
"loss": 0.002,
|
| 15524 |
+
"reward": 1.6913397312164307,
|
| 15525 |
+
"reward_std": 0.31534942984580994,
|
| 15526 |
+
"rewards/accuracy_reward": 0.5377681255340576,
|
| 15527 |
+
"rewards/format_reward": 1.0,
|
| 15528 |
+
"step": 970,
|
| 15529 |
+
"temporal_rewards": 0.5714285373687744
|
| 15530 |
+
},
|
| 15531 |
+
{
|
| 15532 |
+
"all_correct": 0.14285714285714285,
|
| 15533 |
+
"all_wrong": 0.14285714285714285,
|
| 15534 |
+
"completion_length": 430.5714416503906,
|
| 15535 |
+
"epoch": 0.041409015309821315,
|
| 15536 |
+
"grad_norm": 1.6724082967761504,
|
| 15537 |
+
"kl": 0.055419921875,
|
| 15538 |
+
"learning_rate": 9.957750945599463e-07,
|
| 15539 |
+
"loss": 0.0022,
|
| 15540 |
+
"reward": 1.6170705556869507,
|
| 15541 |
+
"reward_std": 0.282865047454834,
|
| 15542 |
+
"rewards/accuracy_reward": 0.47421327233314514,
|
| 15543 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 15544 |
+
"step": 971,
|
| 15545 |
+
"temporal_rewards": 0.5714285373687744
|
| 15546 |
+
},
|
| 15547 |
+
{
|
| 15548 |
+
"all_correct": 0.2857142857142857,
|
| 15549 |
+
"all_wrong": 0.14285714285714285,
|
| 15550 |
+
"completion_length": 360.96429443359375,
|
| 15551 |
+
"epoch": 0.041451661051643995,
|
| 15552 |
+
"grad_norm": 1.4751519734360083,
|
| 15553 |
+
"kl": 0.055908203125,
|
| 15554 |
+
"learning_rate": 9.957664002126524e-07,
|
| 15555 |
+
"loss": 0.0022,
|
| 15556 |
+
"reward": 1.7003968954086304,
|
| 15557 |
+
"reward_std": 0.27568307518959045,
|
| 15558 |
+
"rewards/accuracy_reward": 0.5753968358039856,
|
| 15559 |
+
"rewards/format_reward": 1.0,
|
| 15560 |
+
"step": 972,
|
| 15561 |
+
"temporal_rewards": 0.5714285373687744
|
| 15562 |
+
},
|
| 15563 |
+
{
|
| 15564 |
+
"all_correct": 0.2857142857142857,
|
| 15565 |
+
"all_wrong": 0.0,
|
| 15566 |
+
"completion_length": 401.7500305175781,
|
| 15567 |
+
"epoch": 0.04149430679346667,
|
| 15568 |
+
"grad_norm": 1.2879512175687846,
|
| 15569 |
+
"kl": 0.0625,
|
| 15570 |
+
"learning_rate": 9.957576969666252e-07,
|
| 15571 |
+
"loss": 0.0025,
|
| 15572 |
+
"reward": 1.939540982246399,
|
| 15573 |
+
"reward_std": 0.3482883870601654,
|
| 15574 |
+
"rewards/accuracy_reward": 0.7448980212211609,
|
| 15575 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 15576 |
+
"step": 973,
|
| 15577 |
+
"temporal_rewards": 0.6428571343421936
|
| 15578 |
+
},
|
| 15579 |
+
{
|
| 15580 |
+
"all_correct": 0.0,
|
| 15581 |
+
"all_wrong": 0.2857142857142857,
|
| 15582 |
+
"completion_length": 418.5535888671875,
|
| 15583 |
+
"epoch": 0.04153695253528935,
|
| 15584 |
+
"grad_norm": 1.4953696048328862,
|
| 15585 |
+
"kl": 0.061767578125,
|
| 15586 |
+
"learning_rate": 9.95748984822021e-07,
|
| 15587 |
+
"loss": 0.0025,
|
| 15588 |
+
"reward": 1.4209396839141846,
|
| 15589 |
+
"reward_std": 0.11054398119449615,
|
| 15590 |
+
"rewards/accuracy_reward": 0.31736814975738525,
|
| 15591 |
+
"rewards/format_reward": 1.0,
|
| 15592 |
+
"step": 974,
|
| 15593 |
+
"temporal_rewards": 0.5
|
| 15594 |
+
},
|
| 15595 |
+
{
|
| 15596 |
+
"all_correct": 0.14285714285714285,
|
| 15597 |
+
"all_wrong": 0.0,
|
| 15598 |
+
"completion_length": 381.8035888671875,
|
| 15599 |
+
"epoch": 0.04157959827711203,
|
| 15600 |
+
"grad_norm": 1.62577462756665,
|
| 15601 |
+
"kl": 0.06396484375,
|
| 15602 |
+
"learning_rate": 9.957402637789966e-07,
|
| 15603 |
+
"loss": 0.0026,
|
| 15604 |
+
"reward": 1.5604526996612549,
|
| 15605 |
+
"reward_std": 0.2563931941986084,
|
| 15606 |
+
"rewards/accuracy_reward": 0.4104524850845337,
|
| 15607 |
+
"rewards/format_reward": 1.0,
|
| 15608 |
+
"step": 975,
|
| 15609 |
+
"temporal_rewards": 0.4285714328289032
|
| 15610 |
+
},
|
| 15611 |
+
{
|
| 15612 |
+
"all_correct": 0.7142857142857143,
|
| 15613 |
+
"all_wrong": 0.0,
|
| 15614 |
+
"completion_length": 377.0535888671875,
|
| 15615 |
+
"epoch": 0.04162224401893471,
|
| 15616 |
+
"grad_norm": 2.435265664853468,
|
| 15617 |
+
"kl": 0.061279296875,
|
| 15618 |
+
"learning_rate": 9.957315338377082e-07,
|
| 15619 |
+
"loss": 0.0025,
|
| 15620 |
+
"reward": 2.032465696334839,
|
| 15621 |
+
"reward_std": 0.14916305243968964,
|
| 15622 |
+
"rewards/accuracy_reward": 0.8503227829933167,
|
| 15623 |
+
"rewards/format_reward": 1.0,
|
| 15624 |
+
"step": 976,
|
| 15625 |
+
"temporal_rewards": 0.5
|
| 15626 |
+
},
|
| 15627 |
+
{
|
| 15628 |
+
"all_correct": 0.42857142857142855,
|
| 15629 |
+
"all_wrong": 0.14285714285714285,
|
| 15630 |
+
"completion_length": 388.3035888671875,
|
| 15631 |
+
"epoch": 0.04166488976075739,
|
| 15632 |
+
"grad_norm": 4.675537967649135,
|
| 15633 |
+
"kl": 0.0732421875,
|
| 15634 |
+
"learning_rate": 9.957227949983123e-07,
|
| 15635 |
+
"loss": 0.0029,
|
| 15636 |
+
"reward": 1.756882667541504,
|
| 15637 |
+
"reward_std": 0.09770465642213821,
|
| 15638 |
+
"rewards/accuracy_reward": 0.585453987121582,
|
| 15639 |
+
"rewards/format_reward": 1.0,
|
| 15640 |
+
"step": 977,
|
| 15641 |
+
"temporal_rewards": 0.5714285373687744
|
| 15642 |
+
},
|
| 15643 |
+
{
|
| 15644 |
+
"all_correct": 0.2857142857142857,
|
| 15645 |
+
"all_wrong": 0.0,
|
| 15646 |
+
"completion_length": 417.9464416503906,
|
| 15647 |
+
"epoch": 0.041707535502580065,
|
| 15648 |
+
"grad_norm": 8.652420291718713,
|
| 15649 |
+
"kl": 0.0703125,
|
| 15650 |
+
"learning_rate": 9.95714047260966e-07,
|
| 15651 |
+
"loss": 0.0028,
|
| 15652 |
+
"reward": 1.7980735301971436,
|
| 15653 |
+
"reward_std": 0.3015199899673462,
|
| 15654 |
+
"rewards/accuracy_reward": 0.6355735063552856,
|
| 15655 |
+
"rewards/format_reward": 0.9642857313156128,
|
| 15656 |
+
"step": 978,
|
| 15657 |
+
"temporal_rewards": 0.6428571343421936
|
| 15658 |
+
},
|
| 15659 |
+
{
|
| 15660 |
+
"all_correct": 0.5714285714285714,
|
| 15661 |
+
"all_wrong": 0.0,
|
| 15662 |
+
"completion_length": 427.1071472167969,
|
| 15663 |
+
"epoch": 0.041750181244402745,
|
| 15664 |
+
"grad_norm": 1.4268523956639492,
|
| 15665 |
+
"kl": 0.059326171875,
|
| 15666 |
+
"learning_rate": 9.957052906258265e-07,
|
| 15667 |
+
"loss": 0.0024,
|
| 15668 |
+
"reward": 2.1285715103149414,
|
| 15669 |
+
"reward_std": 0.24789518117904663,
|
| 15670 |
+
"rewards/accuracy_reward": 0.8928571939468384,
|
| 15671 |
+
"rewards/format_reward": 1.0,
|
| 15672 |
+
"step": 979,
|
| 15673 |
+
"temporal_rewards": 0.6428571343421936
|
| 15674 |
+
},
|
| 15675 |
+
{
|
| 15676 |
+
"all_correct": 0.0,
|
| 15677 |
+
"all_wrong": 0.0,
|
| 15678 |
+
"completion_length": 412.8750305175781,
|
| 15679 |
+
"epoch": 0.041792826986225426,
|
| 15680 |
+
"grad_norm": 1.7066404499935264,
|
| 15681 |
+
"kl": 0.06103515625,
|
| 15682 |
+
"learning_rate": 9.956965250930506e-07,
|
| 15683 |
+
"loss": 0.0024,
|
| 15684 |
+
"reward": 1.7384778261184692,
|
| 15685 |
+
"reward_std": 0.3482401967048645,
|
| 15686 |
+
"rewards/accuracy_reward": 0.5652633905410767,
|
| 15687 |
+
"rewards/format_reward": 1.0,
|
| 15688 |
+
"step": 980,
|
| 15689 |
+
"temporal_rewards": 0.5714285373687744
|
| 15690 |
+
},
|
| 15691 |
+
{
|
| 15692 |
+
"all_correct": 0.0,
|
| 15693 |
+
"all_wrong": 0.2857142857142857,
|
| 15694 |
+
"completion_length": 424.4464416503906,
|
| 15695 |
+
"epoch": 0.041835472728048106,
|
| 15696 |
+
"grad_norm": 4.659872228636638,
|
| 15697 |
+
"kl": 0.078125,
|
| 15698 |
+
"learning_rate": 9.95687750662796e-07,
|
| 15699 |
+
"loss": 0.0031,
|
| 15700 |
+
"reward": 1.5966719388961792,
|
| 15701 |
+
"reward_std": 0.1782858818769455,
|
| 15702 |
+
"rewards/accuracy_reward": 0.5038148164749146,
|
| 15703 |
+
"rewards/format_reward": 1.0,
|
| 15704 |
+
"step": 981,
|
| 15705 |
+
"temporal_rewards": 0.357142835855484
|
| 15706 |
+
},
|
| 15707 |
+
{
|
| 15708 |
+
"all_correct": 0.14285714285714285,
|
| 15709 |
+
"all_wrong": 0.0,
|
| 15710 |
+
"completion_length": 414.2500305175781,
|
| 15711 |
+
"epoch": 0.04187811846987078,
|
| 15712 |
+
"grad_norm": 2.2589039944618916,
|
| 15713 |
+
"kl": 0.06591796875,
|
| 15714 |
+
"learning_rate": 9.9567896733522e-07,
|
| 15715 |
+
"loss": 0.0026,
|
| 15716 |
+
"reward": 1.7768125534057617,
|
| 15717 |
+
"reward_std": 0.2902745306491852,
|
| 15718 |
+
"rewards/accuracy_reward": 0.5893124341964722,
|
| 15719 |
+
"rewards/format_reward": 1.0,
|
| 15720 |
+
"step": 982,
|
| 15721 |
+
"temporal_rewards": 0.6428571343421936
|
| 15722 |
+
},
|
| 15723 |
+
{
|
| 15724 |
+
"all_correct": 0.14285714285714285,
|
| 15725 |
+
"all_wrong": 0.2857142857142857,
|
| 15726 |
+
"completion_length": 372.4107360839844,
|
| 15727 |
+
"epoch": 0.04192076421169346,
|
| 15728 |
+
"grad_norm": 1.362896487824589,
|
| 15729 |
+
"kl": 0.08447265625,
|
| 15730 |
+
"learning_rate": 9.956701751104802e-07,
|
| 15731 |
+
"loss": 0.0034,
|
| 15732 |
+
"reward": 1.6840959787368774,
|
| 15733 |
+
"reward_std": 0.11667370796203613,
|
| 15734 |
+
"rewards/accuracy_reward": 0.5269531011581421,
|
| 15735 |
+
"rewards/format_reward": 1.0,
|
| 15736 |
+
"step": 983,
|
| 15737 |
+
"temporal_rewards": 0.5
|
| 15738 |
+
},
|
| 15739 |
+
{
|
| 15740 |
+
"all_correct": 0.7142857142857143,
|
| 15741 |
+
"all_wrong": 0.14285714285714285,
|
| 15742 |
+
"completion_length": 398.3214416503906,
|
| 15743 |
+
"epoch": 0.04196340995351614,
|
| 15744 |
+
"grad_norm": 1.2129928050538004,
|
| 15745 |
+
"kl": 0.06103515625,
|
| 15746 |
+
"learning_rate": 9.956613739887344e-07,
|
| 15747 |
+
"loss": 0.0024,
|
| 15748 |
+
"reward": 1.9772791862487793,
|
| 15749 |
+
"reward_std": 0.04609445109963417,
|
| 15750 |
+
"rewards/accuracy_reward": 0.7487077116966248,
|
| 15751 |
+
"rewards/format_reward": 1.0,
|
| 15752 |
+
"step": 984,
|
| 15753 |
+
"temporal_rewards": 0.6428571343421936
|
| 15754 |
+
},
|
| 15755 |
+
{
|
| 15756 |
+
"all_correct": 0.42857142857142855,
|
| 15757 |
+
"all_wrong": 0.14285714285714285,
|
| 15758 |
+
"completion_length": 404.5714416503906,
|
| 15759 |
+
"epoch": 0.04200605569533882,
|
| 15760 |
+
"grad_norm": 3.9207407827779077,
|
| 15761 |
+
"kl": 0.0634765625,
|
| 15762 |
+
"learning_rate": 9.956525639701407e-07,
|
| 15763 |
+
"loss": 0.0025,
|
| 15764 |
+
"reward": 1.910249948501587,
|
| 15765 |
+
"reward_std": 0.12284082174301147,
|
| 15766 |
+
"rewards/accuracy_reward": 0.6995355486869812,
|
| 15767 |
+
"rewards/format_reward": 1.0,
|
| 15768 |
+
"step": 985,
|
| 15769 |
+
"temporal_rewards": 0.6428571343421936
|
| 15770 |
+
},
|
| 15771 |
+
{
|
| 15772 |
+
"all_correct": 0.2857142857142857,
|
| 15773 |
+
"all_wrong": 0.0,
|
| 15774 |
+
"completion_length": 372.6785888671875,
|
| 15775 |
+
"epoch": 0.0420487014371615,
|
| 15776 |
+
"grad_norm": 2.0049322392282347,
|
| 15777 |
+
"kl": 0.0693359375,
|
| 15778 |
+
"learning_rate": 9.956437450548573e-07,
|
| 15779 |
+
"loss": 0.0028,
|
| 15780 |
+
"reward": 1.7768032550811768,
|
| 15781 |
+
"reward_std": 0.3517088294029236,
|
| 15782 |
+
"rewards/accuracy_reward": 0.6071603298187256,
|
| 15783 |
+
"rewards/format_reward": 1.0,
|
| 15784 |
+
"step": 986,
|
| 15785 |
+
"temporal_rewards": 0.6428571343421936
|
| 15786 |
+
},
|
| 15787 |
+
{
|
| 15788 |
+
"all_correct": 0.14285714285714285,
|
| 15789 |
+
"all_wrong": 0.0,
|
| 15790 |
+
"completion_length": 378.46429443359375,
|
| 15791 |
+
"epoch": 0.042091347178984176,
|
| 15792 |
+
"grad_norm": 2.4990401086302327,
|
| 15793 |
+
"kl": 0.0625,
|
| 15794 |
+
"learning_rate": 9.956349172430423e-07,
|
| 15795 |
+
"loss": 0.0025,
|
| 15796 |
+
"reward": 1.9568233489990234,
|
| 15797 |
+
"reward_std": 0.31181564927101135,
|
| 15798 |
+
"rewards/accuracy_reward": 0.7443231344223022,
|
| 15799 |
+
"rewards/format_reward": 1.0,
|
| 15800 |
+
"step": 987,
|
| 15801 |
+
"temporal_rewards": 0.6428571343421936
|
| 15802 |
+
},
|
| 15803 |
+
{
|
| 15804 |
+
"all_correct": 0.14285714285714285,
|
| 15805 |
+
"all_wrong": 0.2857142857142857,
|
| 15806 |
+
"completion_length": 321.75,
|
| 15807 |
+
"epoch": 0.042133992920806856,
|
| 15808 |
+
"grad_norm": 1.7114030947272834,
|
| 15809 |
+
"kl": 0.08740234375,
|
| 15810 |
+
"learning_rate": 9.956260805348543e-07,
|
| 15811 |
+
"loss": 0.0035,
|
| 15812 |
+
"reward": 1.672023892402649,
|
| 15813 |
+
"reward_std": 0.23027639091014862,
|
| 15814 |
+
"rewards/accuracy_reward": 0.5345238447189331,
|
| 15815 |
+
"rewards/format_reward": 1.0,
|
| 15816 |
+
"step": 988,
|
| 15817 |
+
"temporal_rewards": 0.5
|
| 15818 |
+
},
|
| 15819 |
+
{
|
| 15820 |
+
"all_correct": 0.0,
|
| 15821 |
+
"all_wrong": 0.0,
|
| 15822 |
+
"completion_length": 358.6250305175781,
|
| 15823 |
+
"epoch": 0.04217663866262954,
|
| 15824 |
+
"grad_norm": 2.7481031203471664,
|
| 15825 |
+
"kl": 0.07666015625,
|
| 15826 |
+
"learning_rate": 9.956172349304516e-07,
|
| 15827 |
+
"loss": 0.0031,
|
| 15828 |
+
"reward": 1.5195239782333374,
|
| 15829 |
+
"reward_std": 0.2381688952445984,
|
| 15830 |
+
"rewards/accuracy_reward": 0.39809539914131165,
|
| 15831 |
+
"rewards/format_reward": 1.0,
|
| 15832 |
+
"step": 989,
|
| 15833 |
+
"temporal_rewards": 0.4285714328289032
|
| 15834 |
+
},
|
| 15835 |
+
{
|
| 15836 |
+
"all_correct": 0.2857142857142857,
|
| 15837 |
+
"all_wrong": 0.14285714285714285,
|
| 15838 |
+
"completion_length": 340.2857360839844,
|
| 15839 |
+
"epoch": 0.04221928440445222,
|
| 15840 |
+
"grad_norm": 1.784431075262888,
|
| 15841 |
+
"kl": 0.087890625,
|
| 15842 |
+
"learning_rate": 9.956083804299937e-07,
|
| 15843 |
+
"loss": 0.0035,
|
| 15844 |
+
"reward": 1.7007704973220825,
|
| 15845 |
+
"reward_std": 0.2985832691192627,
|
| 15846 |
+
"rewards/accuracy_reward": 0.5364846587181091,
|
| 15847 |
+
"rewards/format_reward": 0.9642857313156128,
|
| 15848 |
+
"step": 990,
|
| 15849 |
+
"temporal_rewards": 0.714285671710968
|
| 15850 |
+
},
|
| 15851 |
+
{
|
| 15852 |
+
"all_correct": 0.14285714285714285,
|
| 15853 |
+
"all_wrong": 0.14285714285714285,
|
| 15854 |
+
"completion_length": 377.1607360839844,
|
| 15855 |
+
"epoch": 0.04226193014627489,
|
| 15856 |
+
"grad_norm": 3.305830844381748,
|
| 15857 |
+
"kl": 0.049560546875,
|
| 15858 |
+
"learning_rate": 9.955995170336387e-07,
|
| 15859 |
+
"loss": 0.002,
|
| 15860 |
+
"reward": 1.5684744119644165,
|
| 15861 |
+
"reward_std": 0.24174275994300842,
|
| 15862 |
+
"rewards/accuracy_reward": 0.4452598989009857,
|
| 15863 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 15864 |
+
"step": 991,
|
| 15865 |
+
"temporal_rewards": 0.5714285373687744
|
| 15866 |
+
},
|
| 15867 |
+
{
|
| 15868 |
+
"all_correct": 0.42857142857142855,
|
| 15869 |
+
"all_wrong": 0.0,
|
| 15870 |
+
"completion_length": 364.3750305175781,
|
| 15871 |
+
"epoch": 0.04230457588809757,
|
| 15872 |
+
"grad_norm": 2.4344422715162475,
|
| 15873 |
+
"kl": 0.060302734375,
|
| 15874 |
+
"learning_rate": 9.955906447415462e-07,
|
| 15875 |
+
"loss": 0.0024,
|
| 15876 |
+
"reward": 2.0187172889709473,
|
| 15877 |
+
"reward_std": 0.23528897762298584,
|
| 15878 |
+
"rewards/accuracy_reward": 0.7312172055244446,
|
| 15879 |
+
"rewards/format_reward": 1.0,
|
| 15880 |
+
"step": 992,
|
| 15881 |
+
"temporal_rewards": 0.7857142686843872
|
| 15882 |
+
},
|
| 15883 |
+
{
|
| 15884 |
+
"all_correct": 0.14285714285714285,
|
| 15885 |
+
"all_wrong": 0.42857142857142855,
|
| 15886 |
+
"completion_length": 347.6071472167969,
|
| 15887 |
+
"epoch": 0.04234722162992025,
|
| 15888 |
+
"grad_norm": 1.1678389588315554,
|
| 15889 |
+
"kl": 0.04833984375,
|
| 15890 |
+
"learning_rate": 9.955817635538753e-07,
|
| 15891 |
+
"loss": 0.0019,
|
| 15892 |
+
"reward": 1.3988115787506104,
|
| 15893 |
+
"reward_std": 0.17111261188983917,
|
| 15894 |
+
"rewards/accuracy_reward": 0.32381147146224976,
|
| 15895 |
+
"rewards/format_reward": 1.0,
|
| 15896 |
+
"step": 993,
|
| 15897 |
+
"temporal_rewards": 0.5714285373687744
|
| 15898 |
+
},
|
| 15899 |
+
{
|
| 15900 |
+
"all_correct": 0.42857142857142855,
|
| 15901 |
+
"all_wrong": 0.0,
|
| 15902 |
+
"completion_length": 312.6964416503906,
|
| 15903 |
+
"epoch": 0.04238986737174293,
|
| 15904 |
+
"grad_norm": 2.533225798199559,
|
| 15905 |
+
"kl": 0.0859375,
|
| 15906 |
+
"learning_rate": 9.955728734707854e-07,
|
| 15907 |
+
"loss": 0.0034,
|
| 15908 |
+
"reward": 1.840993881225586,
|
| 15909 |
+
"reward_std": 0.20919422805309296,
|
| 15910 |
+
"rewards/accuracy_reward": 0.6695650815963745,
|
| 15911 |
+
"rewards/format_reward": 1.0,
|
| 15912 |
+
"step": 994,
|
| 15913 |
+
"temporal_rewards": 0.6428571343421936
|
| 15914 |
+
},
|
| 15915 |
+
{
|
| 15916 |
+
"all_correct": 0.2857142857142857,
|
| 15917 |
+
"all_wrong": 0.14285714285714285,
|
| 15918 |
+
"completion_length": 380.5000305175781,
|
| 15919 |
+
"epoch": 0.04243251311356561,
|
| 15920 |
+
"grad_norm": 1.5832851452596517,
|
| 15921 |
+
"kl": 0.061279296875,
|
| 15922 |
+
"learning_rate": 9.955639744924362e-07,
|
| 15923 |
+
"loss": 0.0025,
|
| 15924 |
+
"reward": 1.7240325212478638,
|
| 15925 |
+
"reward_std": 0.08556399494409561,
|
| 15926 |
+
"rewards/accuracy_reward": 0.5526038408279419,
|
| 15927 |
+
"rewards/format_reward": 1.0,
|
| 15928 |
+
"step": 995,
|
| 15929 |
+
"temporal_rewards": 0.6428571343421936
|
| 15930 |
+
},
|
| 15931 |
+
{
|
| 15932 |
+
"all_correct": 0.2857142857142857,
|
| 15933 |
+
"all_wrong": 0.0,
|
| 15934 |
+
"completion_length": 305.0535888671875,
|
| 15935 |
+
"epoch": 0.04247515885538829,
|
| 15936 |
+
"grad_norm": 10.554470531111166,
|
| 15937 |
+
"kl": 0.07177734375,
|
| 15938 |
+
"learning_rate": 9.955550666189872e-07,
|
| 15939 |
+
"loss": 0.0029,
|
| 15940 |
+
"reward": 1.5484970808029175,
|
| 15941 |
+
"reward_std": 0.08560214191675186,
|
| 15942 |
+
"rewards/accuracy_reward": 0.3877827525138855,
|
| 15943 |
+
"rewards/format_reward": 1.0,
|
| 15944 |
+
"step": 996,
|
| 15945 |
+
"temporal_rewards": 0.6428571343421936
|
| 15946 |
+
},
|
| 15947 |
+
{
|
| 15948 |
+
"all_correct": 0.5714285714285714,
|
| 15949 |
+
"all_wrong": 0.0,
|
| 15950 |
+
"completion_length": 320.9464416503906,
|
| 15951 |
+
"epoch": 0.04251780459721097,
|
| 15952 |
+
"grad_norm": 1.9700363180476712,
|
| 15953 |
+
"kl": 0.07177734375,
|
| 15954 |
+
"learning_rate": 9.955461498505984e-07,
|
| 15955 |
+
"loss": 0.0029,
|
| 15956 |
+
"reward": 1.9073069095611572,
|
| 15957 |
+
"reward_std": 0.24450773000717163,
|
| 15958 |
+
"rewards/accuracy_reward": 0.7305210828781128,
|
| 15959 |
+
"rewards/format_reward": 1.0,
|
| 15960 |
+
"step": 997,
|
| 15961 |
+
"temporal_rewards": 0.714285671710968
|
| 15962 |
+
},
|
| 15963 |
+
{
|
| 15964 |
+
"all_correct": 0.14285714285714285,
|
| 15965 |
+
"all_wrong": 0.2857142857142857,
|
| 15966 |
+
"completion_length": 396.8571472167969,
|
| 15967 |
+
"epoch": 0.04256045033903365,
|
| 15968 |
+
"grad_norm": 1.9243204654958848,
|
| 15969 |
+
"kl": 0.0498046875,
|
| 15970 |
+
"learning_rate": 9.9553722418743e-07,
|
| 15971 |
+
"loss": 0.002,
|
| 15972 |
+
"reward": 1.504149079322815,
|
| 15973 |
+
"reward_std": 0.22699996829032898,
|
| 15974 |
+
"rewards/accuracy_reward": 0.4398633539676666,
|
| 15975 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 15976 |
+
"step": 998,
|
| 15977 |
+
"temporal_rewards": 0.6428571343421936
|
| 15978 |
+
},
|
| 15979 |
+
{
|
| 15980 |
+
"all_correct": 0.2857142857142857,
|
| 15981 |
+
"all_wrong": 0.14285714285714285,
|
| 15982 |
+
"completion_length": 334.0714416503906,
|
| 15983 |
+
"epoch": 0.04260309608085633,
|
| 15984 |
+
"grad_norm": 1.91575629989198,
|
| 15985 |
+
"kl": 0.0615234375,
|
| 15986 |
+
"learning_rate": 9.95528289629642e-07,
|
| 15987 |
+
"loss": 0.0025,
|
| 15988 |
+
"reward": 1.69874906539917,
|
| 15989 |
+
"reward_std": 0.15359075367450714,
|
| 15990 |
+
"rewards/accuracy_reward": 0.4737490117549896,
|
| 15991 |
+
"rewards/format_reward": 1.0,
|
| 15992 |
+
"step": 999,
|
| 15993 |
+
"temporal_rewards": 0.714285671710968
|
| 15994 |
+
},
|
| 15995 |
+
{
|
| 15996 |
+
"all_correct": 0.5714285714285714,
|
| 15997 |
+
"all_wrong": 0.14285714285714285,
|
| 15998 |
+
"completion_length": 332.64288330078125,
|
| 15999 |
+
"epoch": 0.042645741822679,
|
| 16000 |
+
"grad_norm": 1.6046922501658243,
|
| 16001 |
+
"kl": 0.06298828125,
|
| 16002 |
+
"learning_rate": 9.955193461773947e-07,
|
| 16003 |
+
"loss": 0.0025,
|
| 16004 |
+
"reward": 1.8086810111999512,
|
| 16005 |
+
"reward_std": 0.15720872581005096,
|
| 16006 |
+
"rewards/accuracy_reward": 0.6801096200942993,
|
| 16007 |
+
"rewards/format_reward": 1.0,
|
| 16008 |
+
"step": 1000,
|
| 16009 |
+
"temporal_rewards": 0.6428571343421936
|
| 16010 |
}
|
| 16011 |
],
|
| 16012 |
"logging_steps": 1.0,
|