icewaterdun committed
Commit 8747b6d · verified · 1 Parent(s): 4ba61b4

Upload trainer_state.json

Essential checkpoint files (13/17) - excluding optimizer states

Files changed (1)
  1. trainer_state.json +1261 -0
trainer_state.json ADDED
@@ -0,0 +1,1261 @@
+ {
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.7818654775176515,
+ "eval_steps": 200,
+ "global_step": 600,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.014864362690449646,
+ "grad_norm": 2.1816246446736707,
+ "learning_rate": 1.6666666666666667e-06,
+ "loss": 1.5437,
+ "num_input_tokens_seen": 30040,
+ "step": 5,
+ "train_runtime": 185.9609,
+ "train_tokens_per_second": 161.539
+ },
+ {
+ "epoch": 0.029728725380899292,
+ "grad_norm": 1.297173579671946,
+ "learning_rate": 3.75e-06,
+ "loss": 1.3114,
+ "num_input_tokens_seen": 65616,
+ "step": 10,
+ "train_runtime": 371.0314,
+ "train_tokens_per_second": 176.848
+ },
+ {
+ "epoch": 0.044593088071348944,
+ "grad_norm": 1.3154044523767012,
+ "learning_rate": 5.833333333333334e-06,
+ "loss": 1.5722,
+ "num_input_tokens_seen": 92696,
+ "step": 15,
+ "train_runtime": 537.3955,
+ "train_tokens_per_second": 172.491
+ },
+ {
+ "epoch": 0.059457450761798585,
+ "grad_norm": 1.901062718640284,
+ "learning_rate": 7.916666666666667e-06,
+ "loss": 1.4077,
+ "num_input_tokens_seen": 123944,
+ "step": 20,
+ "train_runtime": 720.0196,
+ "train_tokens_per_second": 172.14
+ },
+ {
+ "epoch": 0.07432181345224824,
+ "grad_norm": 0.865431080037828,
+ "learning_rate": 1e-05,
+ "loss": 1.2642,
+ "num_input_tokens_seen": 154432,
+ "step": 25,
+ "train_runtime": 892.3433,
+ "train_tokens_per_second": 173.063
+ },
+ {
+ "epoch": 0.08918617614269789,
+ "grad_norm": 0.7781390780834406,
+ "learning_rate": 1.2083333333333333e-05,
+ "loss": 1.1572,
+ "num_input_tokens_seen": 181960,
+ "step": 30,
+ "train_runtime": 1057.0334,
+ "train_tokens_per_second": 172.142
+ },
+ {
+ "epoch": 0.10405053883314754,
+ "grad_norm": 0.665108652040459,
+ "learning_rate": 1.4166666666666668e-05,
+ "loss": 0.9421,
+ "num_input_tokens_seen": 212640,
+ "step": 35,
+ "train_runtime": 1231.9846,
+ "train_tokens_per_second": 172.6
+ },
+ {
+ "epoch": 0.11891490152359717,
+ "grad_norm": 0.2825351078495791,
+ "learning_rate": 1.6250000000000002e-05,
+ "loss": 0.8661,
+ "num_input_tokens_seen": 245808,
+ "step": 40,
+ "train_runtime": 1416.5893,
+ "train_tokens_per_second": 173.521
+ },
+ {
+ "epoch": 0.13377926421404682,
+ "grad_norm": 0.36334526479148993,
+ "learning_rate": 1.8333333333333333e-05,
+ "loss": 0.9087,
+ "num_input_tokens_seen": 272224,
+ "step": 45,
+ "train_runtime": 1582.5403,
+ "train_tokens_per_second": 172.017
+ },
+ {
+ "epoch": 0.14864362690449648,
+ "grad_norm": 0.44642145260416205,
+ "learning_rate": 2.0416666666666667e-05,
+ "loss": 0.8855,
+ "num_input_tokens_seen": 305560,
+ "step": 50,
+ "train_runtime": 1768.3526,
+ "train_tokens_per_second": 172.794
+ },
+ {
+ "epoch": 0.1635079895949461,
+ "grad_norm": 0.41738282561467666,
+ "learning_rate": 2.25e-05,
+ "loss": 0.8737,
+ "num_input_tokens_seen": 332024,
+ "step": 55,
+ "train_runtime": 1927.929,
+ "train_tokens_per_second": 172.218
+ },
+ {
+ "epoch": 0.17837235228539577,
+ "grad_norm": 0.39634079236288305,
+ "learning_rate": 2.4583333333333332e-05,
+ "loss": 0.8043,
+ "num_input_tokens_seen": 360856,
+ "step": 60,
+ "train_runtime": 2097.4995,
+ "train_tokens_per_second": 172.041
+ },
+ {
+ "epoch": 0.1932367149758454,
+ "grad_norm": 0.35491219061131213,
+ "learning_rate": 2.6666666666666667e-05,
+ "loss": 0.7943,
+ "num_input_tokens_seen": 398608,
+ "step": 65,
+ "train_runtime": 2304.6738,
+ "train_tokens_per_second": 172.956
+ },
+ {
+ "epoch": 0.20810107766629507,
+ "grad_norm": 0.35737550120310063,
+ "learning_rate": 2.8749999999999997e-05,
+ "loss": 0.8492,
+ "num_input_tokens_seen": 425552,
+ "step": 70,
+ "train_runtime": 2460.8004,
+ "train_tokens_per_second": 172.932
+ },
+ {
+ "epoch": 0.2229654403567447,
+ "grad_norm": 0.48693276619814596,
+ "learning_rate": 3.0833333333333335e-05,
+ "loss": 0.6925,
+ "num_input_tokens_seen": 457392,
+ "step": 75,
+ "train_runtime": 2638.2785,
+ "train_tokens_per_second": 173.368
+ },
+ {
+ "epoch": 0.23782980304719434,
+ "grad_norm": 0.5078391701662902,
+ "learning_rate": 3.291666666666667e-05,
+ "loss": 0.8085,
+ "num_input_tokens_seen": 483824,
+ "step": 80,
+ "train_runtime": 2806.7407,
+ "train_tokens_per_second": 172.379
+ },
+ {
+ "epoch": 0.25269416573764397,
+ "grad_norm": 0.3024390891532131,
+ "learning_rate": 3.5e-05,
+ "loss": 0.7564,
+ "num_input_tokens_seen": 512864,
+ "step": 85,
+ "train_runtime": 2979.1533,
+ "train_tokens_per_second": 172.151
+ },
+ {
+ "epoch": 0.26755852842809363,
+ "grad_norm": 0.37806969672351337,
+ "learning_rate": 3.708333333333334e-05,
+ "loss": 0.8039,
+ "num_input_tokens_seen": 538720,
+ "step": 90,
+ "train_runtime": 3137.4212,
+ "train_tokens_per_second": 171.708
+ },
+ {
+ "epoch": 0.2824228911185433,
+ "grad_norm": 0.3125837610721519,
+ "learning_rate": 3.9166666666666665e-05,
+ "loss": 0.7889,
+ "num_input_tokens_seen": 569688,
+ "step": 95,
+ "train_runtime": 3317.1304,
+ "train_tokens_per_second": 171.741
+ },
+ {
+ "epoch": 0.29728725380899296,
+ "grad_norm": 0.2612296186361751,
+ "learning_rate": 4.125e-05,
+ "loss": 0.8101,
+ "num_input_tokens_seen": 603480,
+ "step": 100,
+ "train_runtime": 3508.249,
+ "train_tokens_per_second": 172.017
+ },
+ {
+ "epoch": 0.31215161649944256,
+ "grad_norm": 0.4664200778768721,
+ "learning_rate": 4.3333333333333334e-05,
+ "loss": 0.7587,
+ "num_input_tokens_seen": 632928,
+ "step": 105,
+ "train_runtime": 3681.3714,
+ "train_tokens_per_second": 171.927
+ },
+ {
+ "epoch": 0.3270159791898922,
+ "grad_norm": 0.4122820082467493,
+ "learning_rate": 4.541666666666667e-05,
+ "loss": 0.7925,
+ "num_input_tokens_seen": 659840,
+ "step": 110,
+ "train_runtime": 3850.7584,
+ "train_tokens_per_second": 171.353
+ },
+ {
+ "epoch": 0.3418803418803419,
+ "grad_norm": 0.29086455774181913,
+ "learning_rate": 4.75e-05,
+ "loss": 0.8142,
+ "num_input_tokens_seen": 690432,
+ "step": 115,
+ "train_runtime": 4029.493,
+ "train_tokens_per_second": 171.345
+ },
+ {
+ "epoch": 0.35674470457079155,
+ "grad_norm": 0.49406027919264844,
+ "learning_rate": 4.958333333333334e-05,
+ "loss": 0.7088,
+ "num_input_tokens_seen": 726264,
+ "step": 120,
+ "train_runtime": 4222.1079,
+ "train_tokens_per_second": 172.015
+ },
+ {
+ "epoch": 0.37160906726124115,
+ "grad_norm": 0.29533667352828036,
+ "learning_rate": 4.9999194067399066e-05,
+ "loss": 0.6237,
+ "num_input_tokens_seen": 763328,
+ "step": 125,
+ "train_runtime": 4420.0509,
+ "train_tokens_per_second": 172.697
+ },
+ {
+ "epoch": 0.3864734299516908,
+ "grad_norm": 0.554389641306832,
+ "learning_rate": 4.999592005526383e-05,
+ "loss": 0.6609,
+ "num_input_tokens_seen": 792888,
+ "step": 130,
+ "train_runtime": 4590.9234,
+ "train_tokens_per_second": 172.708
+ },
+ {
+ "epoch": 0.4013377926421405,
+ "grad_norm": 0.3913197863670873,
+ "learning_rate": 4.999012792238118e-05,
+ "loss": 0.7873,
+ "num_input_tokens_seen": 830784,
+ "step": 135,
+ "train_runtime": 4780.7961,
+ "train_tokens_per_second": 173.775
+ },
+ {
+ "epoch": 0.41620215533259014,
+ "grad_norm": 0.7292785043191194,
+ "learning_rate": 4.998181825225791e-05,
+ "loss": 0.7047,
+ "num_input_tokens_seen": 869304,
+ "step": 140,
+ "train_runtime": 4979.1373,
+ "train_tokens_per_second": 174.589
+ },
+ {
+ "epoch": 0.43106651802303975,
+ "grad_norm": 0.3288366913256035,
+ "learning_rate": 4.997099188202077e-05,
+ "loss": 0.6608,
+ "num_input_tokens_seen": 903752,
+ "step": 145,
+ "train_runtime": 5170.5438,
+ "train_tokens_per_second": 174.789
+ },
+ {
+ "epoch": 0.4459308807134894,
+ "grad_norm": 0.32850031143064623,
+ "learning_rate": 4.995764990233205e-05,
+ "loss": 0.7203,
+ "num_input_tokens_seen": 935672,
+ "step": 150,
+ "train_runtime": 5350.7854,
+ "train_tokens_per_second": 174.866
+ },
+ {
+ "epoch": 0.46079524340393907,
+ "grad_norm": 0.4093406468089712,
+ "learning_rate": 4.994179365727973e-05,
+ "loss": 0.7793,
+ "num_input_tokens_seen": 967928,
+ "step": 155,
+ "train_runtime": 5526.3425,
+ "train_tokens_per_second": 175.148
+ },
+ {
+ "epoch": 0.4756596060943887,
+ "grad_norm": 0.26538290553046884,
+ "learning_rate": 4.992342474424209e-05,
+ "loss": 0.6941,
+ "num_input_tokens_seen": 1006856,
+ "step": 160,
+ "train_runtime": 5732.4064,
+ "train_tokens_per_second": 175.643
+ },
+ {
+ "epoch": 0.49052396878483834,
+ "grad_norm": 0.3293209375345381,
+ "learning_rate": 4.990254501372677e-05,
+ "loss": 0.7154,
+ "num_input_tokens_seen": 1040320,
+ "step": 165,
+ "train_runtime": 5919.4184,
+ "train_tokens_per_second": 175.747
+ },
+ {
+ "epoch": 0.5053883314752879,
+ "grad_norm": 0.37872717401082634,
+ "learning_rate": 4.987915656918435e-05,
+ "loss": 0.7354,
+ "num_input_tokens_seen": 1071792,
+ "step": 170,
+ "train_runtime": 6093.1775,
+ "train_tokens_per_second": 175.9
+ },
+ {
+ "epoch": 0.5202526941657376,
+ "grad_norm": 0.34111517332292757,
+ "learning_rate": 4.985326176679645e-05,
+ "loss": 0.7598,
+ "num_input_tokens_seen": 1103216,
+ "step": 175,
+ "train_runtime": 6268.9044,
+ "train_tokens_per_second": 175.982
+ },
+ {
+ "epoch": 0.5351170568561873,
+ "grad_norm": 0.44888012700873,
+ "learning_rate": 4.9824863215238373e-05,
+ "loss": 0.7502,
+ "num_input_tokens_seen": 1131576,
+ "step": 180,
+ "train_runtime": 6435.3922,
+ "train_tokens_per_second": 175.836
+ },
+ {
+ "epoch": 0.5499814195466369,
+ "grad_norm": 0.4802566147521983,
+ "learning_rate": 4.979396377541628e-05,
+ "loss": 0.6844,
+ "num_input_tokens_seen": 1160040,
+ "step": 185,
+ "train_runtime": 6605.4368,
+ "train_tokens_per_second": 175.619
+ },
+ {
+ "epoch": 0.5648457822370866,
+ "grad_norm": 0.4821989259826241,
+ "learning_rate": 4.976056656017901e-05,
+ "loss": 0.8291,
+ "num_input_tokens_seen": 1190376,
+ "step": 190,
+ "train_runtime": 6782.0935,
+ "train_tokens_per_second": 175.517
+ },
+ {
+ "epoch": 0.5797101449275363,
+ "grad_norm": 0.386641731129637,
+ "learning_rate": 4.972467493400445e-05,
+ "loss": 0.7762,
+ "num_input_tokens_seen": 1217904,
+ "step": 195,
+ "train_runtime": 6948.2037,
+ "train_tokens_per_second": 175.283
+ },
+ {
+ "epoch": 0.5945745076179859,
+ "grad_norm": 0.4418388818648707,
+ "learning_rate": 4.968629251266064e-05,
+ "loss": 0.6688,
+ "num_input_tokens_seen": 1256312,
+ "step": 200,
+ "train_runtime": 7158.3945,
+ "train_tokens_per_second": 175.502
+ },
+ {
+ "epoch": 0.5945745076179859,
+ "eval_loss": 0.8953876495361328,
+ "eval_runtime": 353.0879,
+ "eval_samples_per_second": 1.325,
+ "eval_steps_per_second": 0.663,
+ "num_input_tokens_seen": 1256312,
+ "step": 200
+ },
+ {
+ "epoch": 0.6094388703084356,
+ "grad_norm": 0.5521465457074078,
+ "learning_rate": 4.964542316284147e-05,
+ "loss": 0.7398,
+ "num_input_tokens_seen": 1290880,
+ "step": 205,
+ "train_runtime": 7728.2908,
+ "train_tokens_per_second": 167.033
+ },
+ {
+ "epoch": 0.6243032329988851,
+ "grad_norm": 0.4773991938686689,
+ "learning_rate": 4.960207100177716e-05,
+ "loss": 0.7169,
+ "num_input_tokens_seen": 1320288,
+ "step": 210,
+ "train_runtime": 7894.6486,
+ "train_tokens_per_second": 167.238
+ },
+ {
+ "epoch": 0.6391675956893348,
+ "grad_norm": 0.51804635767658,
+ "learning_rate": 4.955624039681952e-05,
+ "loss": 0.7618,
+ "num_input_tokens_seen": 1346816,
+ "step": 215,
+ "train_runtime": 8060.6928,
+ "train_tokens_per_second": 167.084
+ },
+ {
+ "epoch": 0.6540319583797845,
+ "grad_norm": 0.5900793472953364,
+ "learning_rate": 4.950793596500192e-05,
+ "loss": 0.7563,
+ "num_input_tokens_seen": 1382472,
+ "step": 220,
+ "train_runtime": 8252.4478,
+ "train_tokens_per_second": 167.523
+ },
+ {
+ "epoch": 0.6688963210702341,
+ "grad_norm": 0.39058107426349015,
+ "learning_rate": 4.94571625725742e-05,
+ "loss": 0.6626,
+ "num_input_tokens_seen": 1420832,
+ "step": 225,
+ "train_runtime": 8450.5909,
+ "train_tokens_per_second": 168.134
+ },
+ {
+ "epoch": 0.6837606837606838,
+ "grad_norm": 0.5663525201259818,
+ "learning_rate": 4.940392533451244e-05,
+ "loss": 0.7741,
+ "num_input_tokens_seen": 1451496,
+ "step": 230,
+ "train_runtime": 8624.0331,
+ "train_tokens_per_second": 168.308
+ },
+ {
+ "epoch": 0.6986250464511334,
+ "grad_norm": 0.3852996317746423,
+ "learning_rate": 4.9348229614003615e-05,
+ "loss": 0.7749,
+ "num_input_tokens_seen": 1483136,
+ "step": 235,
+ "train_runtime": 8805.4306,
+ "train_tokens_per_second": 168.434
+ },
+ {
+ "epoch": 0.7134894091415831,
+ "grad_norm": 0.4546419228429511,
+ "learning_rate": 4.9290081021905416e-05,
+ "loss": 0.7186,
+ "num_input_tokens_seen": 1522176,
+ "step": 240,
+ "train_runtime": 9011.7223,
+ "train_tokens_per_second": 168.911
+ },
+ {
+ "epoch": 0.7283537718320326,
+ "grad_norm": 0.27319864084916956,
+ "learning_rate": 4.9229485416180876e-05,
+ "loss": 0.659,
+ "num_input_tokens_seen": 1562400,
+ "step": 245,
+ "train_runtime": 9221.6215,
+ "train_tokens_per_second": 169.428
+ },
+ {
+ "epoch": 0.7432181345224823,
+ "grad_norm": 0.5971581275751557,
+ "learning_rate": 4.916644890130831e-05,
+ "loss": 0.747,
+ "num_input_tokens_seen": 1587136,
+ "step": 250,
+ "train_runtime": 9378.073,
+ "train_tokens_per_second": 169.239
+ },
+ {
+ "epoch": 0.758082497212932,
+ "grad_norm": 0.3725130213516166,
+ "learning_rate": 4.9100977827666345e-05,
+ "loss": 0.698,
+ "num_input_tokens_seen": 1614920,
+ "step": 255,
+ "train_runtime": 9541.7299,
+ "train_tokens_per_second": 169.248
+ },
+ {
+ "epoch": 0.7729468599033816,
+ "grad_norm": 0.4907840552388026,
+ "learning_rate": 4.903307879089411e-05,
+ "loss": 0.6477,
+ "num_input_tokens_seen": 1645688,
+ "step": 260,
+ "train_runtime": 9716.0007,
+ "train_tokens_per_second": 169.379
+ },
+ {
+ "epoch": 0.7878112225938313,
+ "grad_norm": 0.5241784073891005,
+ "learning_rate": 4.896275863122685e-05,
+ "loss": 0.7262,
+ "num_input_tokens_seen": 1681888,
+ "step": 265,
+ "train_runtime": 9905.9233,
+ "train_tokens_per_second": 169.786
+ },
+ {
+ "epoch": 0.802675585284281,
+ "grad_norm": 0.4747498444810908,
+ "learning_rate": 4.8890024432806806e-05,
+ "loss": 0.7729,
+ "num_input_tokens_seen": 1709480,
+ "step": 270,
+ "train_runtime": 10078.2583,
+ "train_tokens_per_second": 169.621
+ },
+ {
+ "epoch": 0.8175399479747306,
+ "grad_norm": 0.40268303548713125,
+ "learning_rate": 4.8814883522969545e-05,
+ "loss": 0.653,
+ "num_input_tokens_seen": 1744560,
+ "step": 275,
+ "train_runtime": 10262.7966,
+ "train_tokens_per_second": 169.989
+ },
+ {
+ "epoch": 0.8324043106651803,
+ "grad_norm": 0.5361180434189863,
+ "learning_rate": 4.8737343471505806e-05,
+ "loss": 0.75,
+ "num_input_tokens_seen": 1773576,
+ "step": 280,
+ "train_runtime": 10433.1959,
+ "train_tokens_per_second": 169.994
+ },
+ {
+ "epoch": 0.8472686733556298,
+ "grad_norm": 0.4150141248761779,
+ "learning_rate": 4.865741208989888e-05,
+ "loss": 0.6366,
+ "num_input_tokens_seen": 1807248,
+ "step": 285,
+ "train_runtime": 10621.0739,
+ "train_tokens_per_second": 170.157
+ },
+ {
+ "epoch": 0.8621330360460795,
+ "grad_norm": 0.557076443271248,
+ "learning_rate": 4.857509743053774e-05,
+ "loss": 0.7992,
+ "num_input_tokens_seen": 1836512,
+ "step": 290,
+ "train_runtime": 10790.2552,
+ "train_tokens_per_second": 170.201
+ },
+ {
+ "epoch": 0.8769973987365292,
+ "grad_norm": 0.622394894811983,
+ "learning_rate": 4.8490407785905756e-05,
+ "loss": 0.7259,
+ "num_input_tokens_seen": 1864768,
+ "step": 295,
+ "train_runtime": 10958.7213,
+ "train_tokens_per_second": 170.163
+ },
+ {
+ "epoch": 0.8918617614269788,
+ "grad_norm": 0.32132112389269674,
+ "learning_rate": 4.840335168774532e-05,
+ "loss": 0.6569,
+ "num_input_tokens_seen": 1897416,
+ "step": 300,
+ "train_runtime": 11132.0894,
+ "train_tokens_per_second": 170.446
+ },
+ {
+ "epoch": 0.9067261241174285,
+ "grad_norm": 0.5699931721537229,
+ "learning_rate": 4.8313937906198415e-05,
+ "loss": 0.7109,
+ "num_input_tokens_seen": 1925464,
+ "step": 305,
+ "train_runtime": 11302.0412,
+ "train_tokens_per_second": 170.364
+ },
+ {
+ "epoch": 0.9215904868078781,
+ "grad_norm": 0.49216652434145286,
+ "learning_rate": 4.822217544892298e-05,
+ "loss": 0.6374,
+ "num_input_tokens_seen": 1956064,
+ "step": 310,
+ "train_runtime": 11477.605,
+ "train_tokens_per_second": 170.424
+ },
+ {
+ "epoch": 0.9364548494983278,
+ "grad_norm": 0.289252551788521,
+ "learning_rate": 4.812807356018556e-05,
+ "loss": 0.7321,
+ "num_input_tokens_seen": 1986504,
+ "step": 315,
+ "train_runtime": 11660.7035,
+ "train_tokens_per_second": 170.359
+ },
+ {
+ "epoch": 0.9513192121887774,
+ "grad_norm": 0.4471738070997589,
+ "learning_rate": 4.803164171993001e-05,
+ "loss": 0.6904,
+ "num_input_tokens_seen": 2012976,
+ "step": 320,
+ "train_runtime": 11819.4269,
+ "train_tokens_per_second": 170.311
+ },
+ {
+ "epoch": 0.966183574879227,
+ "grad_norm": 0.4988862477404023,
+ "learning_rate": 4.793288964282244e-05,
+ "loss": 0.8029,
+ "num_input_tokens_seen": 2041512,
+ "step": 325,
+ "train_runtime": 11993.4775,
+ "train_tokens_per_second": 170.219
+ },
+ {
+ "epoch": 0.9810479375696767,
+ "grad_norm": 0.4455586736978044,
+ "learning_rate": 4.783182727727258e-05,
+ "loss": 0.7208,
+ "num_input_tokens_seen": 2065864,
+ "step": 330,
+ "train_runtime": 12150.1754,
+ "train_tokens_per_second": 170.028
+ },
+ {
+ "epoch": 0.9959123002601263,
+ "grad_norm": 0.45313304199376386,
+ "learning_rate": 4.772846480443154e-05,
+ "loss": 0.6892,
+ "num_input_tokens_seen": 2099192,
+ "step": 335,
+ "train_runtime": 12332.4171,
+ "train_tokens_per_second": 170.217
+ },
+ {
+ "epoch": 1.0089186176142697,
+ "grad_norm": 0.34932689166112835,
+ "learning_rate": 4.762281263716619e-05,
+ "loss": 0.7558,
+ "num_input_tokens_seen": 2125816,
+ "step": 340,
+ "train_runtime": 12484.2733,
+ "train_tokens_per_second": 170.28
+ },
+ {
+ "epoch": 1.0237829803047194,
+ "grad_norm": 0.4092397826706536,
+ "learning_rate": 4.751488141901009e-05,
+ "loss": 0.5846,
+ "num_input_tokens_seen": 2160160,
+ "step": 345,
+ "train_runtime": 12674.5489,
+ "train_tokens_per_second": 170.433
+ },
+ {
+ "epoch": 1.038647342995169,
+ "grad_norm": 0.31168294599405477,
+ "learning_rate": 4.740468202309132e-05,
+ "loss": 0.5396,
+ "num_input_tokens_seen": 2202880,
+ "step": 350,
+ "train_runtime": 12878.5359,
+ "train_tokens_per_second": 171.05
+ },
+ {
+ "epoch": 1.0535117056856187,
+ "grad_norm": 0.4562243418713546,
+ "learning_rate": 4.729222555103703e-05,
+ "loss": 0.6491,
+ "num_input_tokens_seen": 2232216,
+ "step": 355,
+ "train_runtime": 13048.5779,
+ "train_tokens_per_second": 171.07
+ },
+ {
+ "epoch": 1.0683760683760684,
+ "grad_norm": 0.37853300240110366,
+ "learning_rate": 4.717752333185511e-05,
+ "loss": 0.5624,
+ "num_input_tokens_seen": 2255520,
+ "step": 360,
+ "train_runtime": 13200.2201,
+ "train_tokens_per_second": 170.87
+ },
+ {
+ "epoch": 1.083240431066518,
+ "grad_norm": 0.505365165589364,
+ "learning_rate": 4.706058692079288e-05,
+ "loss": 0.5882,
+ "num_input_tokens_seen": 2295400,
+ "step": 365,
+ "train_runtime": 13416.4241,
+ "train_tokens_per_second": 171.089
+ },
+ {
+ "epoch": 1.0981047937569677,
+ "grad_norm": 0.7984507974057846,
+ "learning_rate": 4.6941428098172956e-05,
+ "loss": 0.6382,
+ "num_input_tokens_seen": 2322496,
+ "step": 370,
+ "train_runtime": 13576.7515,
+ "train_tokens_per_second": 171.064
+ },
+ {
+ "epoch": 1.1129691564474173,
+ "grad_norm": 0.5722349779883125,
+ "learning_rate": 4.682005886820656e-05,
+ "loss": 0.6791,
+ "num_input_tokens_seen": 2353904,
+ "step": 375,
+ "train_runtime": 13756.8344,
+ "train_tokens_per_second": 171.108
+ },
+ {
+ "epoch": 1.127833519137867,
+ "grad_norm": 0.826155965272619,
+ "learning_rate": 4.669649145778412e-05,
+ "loss": 0.6277,
+ "num_input_tokens_seen": 2389640,
+ "step": 380,
+ "train_runtime": 13955.4804,
+ "train_tokens_per_second": 171.233
+ },
+ {
+ "epoch": 1.1426978818283167,
+ "grad_norm": 0.48208601176965793,
+ "learning_rate": 4.657073831524358e-05,
+ "loss": 0.5038,
+ "num_input_tokens_seen": 2427872,
+ "step": 385,
+ "train_runtime": 14154.7296,
+ "train_tokens_per_second": 171.524
+ },
+ {
+ "epoch": 1.1575622445187663,
+ "grad_norm": 0.862951567640207,
+ "learning_rate": 4.644281210911631e-05,
+ "loss": 0.6581,
+ "num_input_tokens_seen": 2453984,
+ "step": 390,
+ "train_runtime": 14320.9438,
+ "train_tokens_per_second": 171.356
+ },
+ {
+ "epoch": 1.172426607209216,
+ "grad_norm": 0.6446091778736193,
+ "learning_rate": 4.631272572685086e-05,
+ "loss": 0.5966,
+ "num_input_tokens_seen": 2482696,
+ "step": 395,
+ "train_runtime": 14484.582,
+ "train_tokens_per_second": 171.403
+ },
+ {
+ "epoch": 1.1872909698996654,
+ "grad_norm": 0.5940985307805492,
+ "learning_rate": 4.618049227351467e-05,
+ "loss": 0.6238,
+ "num_input_tokens_seen": 2515024,
+ "step": 400,
+ "train_runtime": 14670.3277,
+ "train_tokens_per_second": 171.436
+ },
+ {
+ "epoch": 1.1872909698996654,
+ "eval_loss": 0.8729309439659119,
+ "eval_runtime": 355.7054,
+ "eval_samples_per_second": 1.316,
+ "eval_steps_per_second": 0.658,
+ "num_input_tokens_seen": 2515024,
+ "step": 400
+ },
+ {
+ "epoch": 1.2021553325901153,
+ "grad_norm": 0.42070342961969176,
+ "learning_rate": 4.6046125070473854e-05,
+ "loss": 0.6013,
+ "num_input_tokens_seen": 2544048,
+ "step": 405,
+ "train_runtime": 15222.6559,
+ "train_tokens_per_second": 167.122
+ },
+ {
+ "epoch": 1.2170196952805648,
+ "grad_norm": 0.5065404783086159,
+ "learning_rate": 4.5909637654051194e-05,
+ "loss": 0.6597,
+ "num_input_tokens_seen": 2577080,
+ "step": 410,
+ "train_runtime": 15412.9664,
+ "train_tokens_per_second": 167.202
+ },
+ {
+ "epoch": 1.2318840579710144,
+ "grad_norm": 0.7043622561243575,
+ "learning_rate": 4.577104377416243e-05,
+ "loss": 0.6459,
+ "num_input_tokens_seen": 2605240,
+ "step": 415,
+ "train_runtime": 15585.0317,
+ "train_tokens_per_second": 167.163
+ },
+ {
+ "epoch": 1.246748420661464,
+ "grad_norm": 0.5815684563195597,
+ "learning_rate": 4.5630357392931136e-05,
+ "loss": 0.5973,
+ "num_input_tokens_seen": 2634016,
+ "step": 420,
+ "train_runtime": 15757.2057,
+ "train_tokens_per_second": 167.163
+ },
+ {
+ "epoch": 1.2616127833519137,
+ "grad_norm": 0.5571563336020446,
+ "learning_rate": 4.548759268328211e-05,
+ "loss": 0.6341,
+ "num_input_tokens_seen": 2663784,
+ "step": 425,
+ "train_runtime": 15930.8715,
+ "train_tokens_per_second": 167.209
+ },
+ {
+ "epoch": 1.2764771460423634,
+ "grad_norm": 0.6460743117704272,
+ "learning_rate": 4.534276402751361e-05,
+ "loss": 0.651,
+ "num_input_tokens_seen": 2697024,
+ "step": 430,
+ "train_runtime": 16122.3767,
+ "train_tokens_per_second": 167.285
+ },
+ {
+ "epoch": 1.291341508732813,
+ "grad_norm": 0.6482175581649149,
+ "learning_rate": 4.5195886015848454e-05,
+ "loss": 0.6475,
+ "num_input_tokens_seen": 2719400,
+ "step": 435,
+ "train_runtime": 16266.13,
+ "train_tokens_per_second": 167.182
+ },
+ {
+ "epoch": 1.3062058714232627,
+ "grad_norm": 0.6094464333253732,
+ "learning_rate": 4.5046973444964165e-05,
+ "loss": 0.6373,
+ "num_input_tokens_seen": 2751744,
+ "step": 440,
+ "train_runtime": 16434.9886,
+ "train_tokens_per_second": 167.432
+ },
+ {
+ "epoch": 1.3210702341137124,
+ "grad_norm": 0.8126749512005293,
+ "learning_rate": 4.4896041316502335e-05,
+ "loss": 0.6434,
+ "num_input_tokens_seen": 2791080,
+ "step": 445,
+ "train_runtime": 16641.4496,
+ "train_tokens_per_second": 167.719
+ },
+ {
+ "epoch": 1.335934596804162,
+ "grad_norm": 0.5026965778291799,
+ "learning_rate": 4.474310483555739e-05,
+ "loss": 0.5877,
+ "num_input_tokens_seen": 2823104,
+ "step": 450,
+ "train_runtime": 16826.3481,
+ "train_tokens_per_second": 167.779
+ },
+ {
+ "epoch": 1.3507989594946117,
+ "grad_norm": 0.824948158879483,
+ "learning_rate": 4.4588179409144734e-05,
+ "loss": 0.6944,
+ "num_input_tokens_seen": 2850480,
+ "step": 455,
+ "train_runtime": 16993.1957,
+ "train_tokens_per_second": 167.742
+ },
+ {
+ "epoch": 1.3656633221850614,
+ "grad_norm": 0.6935578787290407,
+ "learning_rate": 4.4431280644648676e-05,
+ "loss": 0.6677,
+ "num_input_tokens_seen": 2877920,
+ "step": 460,
+ "train_runtime": 17162.7566,
+ "train_tokens_per_second": 167.684
+ },
+ {
+ "epoch": 1.380527684875511,
+ "grad_norm": 0.775256808439057,
+ "learning_rate": 4.427242434825013e-05,
+ "loss": 0.6185,
+ "num_input_tokens_seen": 2910104,
+ "step": 465,
+ "train_runtime": 17349.4341,
+ "train_tokens_per_second": 167.735
+ },
+ {
+ "epoch": 1.3953920475659607,
+ "grad_norm": 0.5290136444612117,
+ "learning_rate": 4.4111626523334235e-05,
+ "loss": 0.4792,
+ "num_input_tokens_seen": 2943600,
+ "step": 470,
+ "train_runtime": 17535.2111,
+ "train_tokens_per_second": 167.868
+ },
+ {
+ "epoch": 1.4102564102564101,
+ "grad_norm": 0.431851575535354,
+ "learning_rate": 4.394890336887819e-05,
+ "loss": 0.5479,
+ "num_input_tokens_seen": 2983616,
+ "step": 475,
+ "train_runtime": 17735.6146,
+ "train_tokens_per_second": 168.227
+ },
+ {
+ "epoch": 1.42512077294686,
+ "grad_norm": 0.707234404224925,
+ "learning_rate": 4.378427127781935e-05,
+ "loss": 0.6563,
+ "num_input_tokens_seen": 3010896,
+ "step": 480,
+ "train_runtime": 17897.1321,
+ "train_tokens_per_second": 168.233
+ },
+ {
+ "epoch": 1.4399851356373095,
+ "grad_norm": 0.5496990766753803,
+ "learning_rate": 4.361774683540375e-05,
+ "loss": 0.5839,
+ "num_input_tokens_seen": 3040128,
+ "step": 485,
+ "train_runtime": 18074.4912,
+ "train_tokens_per_second": 168.2
+ },
+ {
+ "epoch": 1.4548494983277591,
+ "grad_norm": 0.40637808524319596,
+ "learning_rate": 4.34493468175153e-05,
+ "loss": 0.5499,
+ "num_input_tokens_seen": 3070192,
+ "step": 490,
+ "train_runtime": 18243.0065,
+ "train_tokens_per_second": 168.294
+ },
+ {
+ "epoch": 1.4697138610182088,
+ "grad_norm": 0.5939003521421573,
+ "learning_rate": 4.327908818898581e-05,
+ "loss": 0.6602,
+ "num_input_tokens_seen": 3100544,
+ "step": 495,
+ "train_runtime": 18419.7661,
+ "train_tokens_per_second": 168.327
+ },
+ {
+ "epoch": 1.4845782237086584,
+ "grad_norm": 0.5699315589577931,
+ "learning_rate": 4.3106988101885825e-05,
+ "loss": 0.6172,
+ "num_input_tokens_seen": 3134864,
+ "step": 500,
+ "train_runtime": 18605.4997,
+ "train_tokens_per_second": 168.491
+ },
+ {
+ "epoch": 1.499442586399108,
+ "grad_norm": 0.5176180044018102,
+ "learning_rate": 4.293306389379682e-05,
+ "loss": 0.6932,
+ "num_input_tokens_seen": 3164496,
+ "step": 505,
+ "train_runtime": 18782.3882,
+ "train_tokens_per_second": 168.482
+ },
+ {
+ "epoch": 1.5143069490895578,
+ "grad_norm": 0.8125909060302873,
+ "learning_rate": 4.275733308606452e-05,
+ "loss": 0.6075,
+ "num_input_tokens_seen": 3201592,
+ "step": 510,
+ "train_runtime": 18974.7401,
+ "train_tokens_per_second": 168.729
+ },
+ {
+ "epoch": 1.5291713117800074,
+ "grad_norm": 0.4238803109744304,
+ "learning_rate": 4.2579813382033764e-05,
+ "loss": 0.6465,
+ "num_input_tokens_seen": 3242824,
+ "step": 515,
+ "train_runtime": 19182.5699,
+ "train_tokens_per_second": 169.051
+ },
+ {
+ "epoch": 1.544035674470457,
+ "grad_norm": 0.5568097005839904,
+ "learning_rate": 4.240052266526512e-05,
+ "loss": 0.6159,
+ "num_input_tokens_seen": 3273600,
+ "step": 520,
+ "train_runtime": 19367.6233,
+ "train_tokens_per_second": 169.024
+ },
+ {
+ "epoch": 1.5589000371609067,
+ "grad_norm": 0.5943741262587303,
+ "learning_rate": 4.22194789977332e-05,
+ "loss": 0.6439,
+ "num_input_tokens_seen": 3308472,
+ "step": 525,
+ "train_runtime": 19562.3706,
+ "train_tokens_per_second": 169.124
+ },
+ {
+ "epoch": 1.5737643998513564,
+ "grad_norm": 0.8079971411544303,
+ "learning_rate": 4.203670061800712e-05,
+ "loss": 0.6874,
+ "num_input_tokens_seen": 3332568,
+ "step": 530,
+ "train_runtime": 19717.9765,
+ "train_tokens_per_second": 169.012
+ },
+ {
+ "epoch": 1.588628762541806,
+ "grad_norm": 0.551762199364396,
+ "learning_rate": 4.1852205939413104e-05,
+ "loss": 0.6013,
+ "num_input_tokens_seen": 3366712,
+ "step": 535,
+ "train_runtime": 19903.6186,
+ "train_tokens_per_second": 169.151
+ },
+ {
+ "epoch": 1.6034931252322555,
+ "grad_norm": 0.5068985496527462,
+ "learning_rate": 4.1666013548179496e-05,
+ "loss": 0.6359,
+ "num_input_tokens_seen": 3391608,
+ "step": 540,
+ "train_runtime": 20057.39,
+ "train_tokens_per_second": 169.095
+ },
+ {
+ "epoch": 1.6183574879227054,
+ "grad_norm": 0.6855669469937492,
+ "learning_rate": 4.147814220156437e-05,
+ "loss": 0.6127,
+ "num_input_tokens_seen": 3421400,
+ "step": 545,
+ "train_runtime": 20228.9847,
+ "train_tokens_per_second": 169.134
+ },
+ {
+ "epoch": 1.6332218506131548,
+ "grad_norm": 0.4879299108867579,
+ "learning_rate": 4.128861082596592e-05,
+ "loss": 0.5612,
+ "num_input_tokens_seen": 3455584,
+ "step": 550,
+ "train_runtime": 20414.7402,
+ "train_tokens_per_second": 169.269
+ },
+ {
+ "epoch": 1.6480862133036047,
+ "grad_norm": 0.8129316983564234,
+ "learning_rate": 4.109743851501573e-05,
+ "loss": 0.6068,
+ "num_input_tokens_seen": 3480392,
+ "step": 555,
+ "train_runtime": 20570.2335,
+ "train_tokens_per_second": 169.196
+ },
+ {
+ "epoch": 1.6629505759940542,
+ "grad_norm": 0.5732379522903723,
+ "learning_rate": 4.090464452765535e-05,
+ "loss": 0.6331,
+ "num_input_tokens_seen": 3515664,
+ "step": 560,
+ "train_runtime": 20754.2066,
+ "train_tokens_per_second": 169.395
+ },
+ {
+ "epoch": 1.677814938684504,
+ "grad_norm": 0.8129257440384582,
+ "learning_rate": 4.0710248286195994e-05,
+ "loss": 0.6944,
+ "num_input_tokens_seen": 3538880,
+ "step": 565,
+ "train_runtime": 20901.2916,
+ "train_tokens_per_second": 169.314
+ },
+ {
+ "epoch": 1.6926793013749535,
+ "grad_norm": 0.5743351437424525,
+ "learning_rate": 4.051426937436207e-05,
+ "loss": 0.6007,
+ "num_input_tokens_seen": 3576168,
+ "step": 570,
+ "train_runtime": 21091.5246,
+ "train_tokens_per_second": 169.555
+ },
+ {
+ "epoch": 1.7075436640654031,
+ "grad_norm": 1.7481870694933204,
+ "learning_rate": 4.0316727535318175e-05,
+ "loss": 0.6374,
+ "num_input_tokens_seen": 3606944,
+ "step": 575,
+ "train_runtime": 21266.3906,
+ "train_tokens_per_second": 169.608
+ },
+ {
+ "epoch": 1.7224080267558528,
+ "grad_norm": 0.5813131410466363,
+ "learning_rate": 4.0117642669680164e-05,
+ "loss": 0.5499,
+ "num_input_tokens_seen": 3637888,
+ "step": 580,
+ "train_runtime": 21440.8922,
+ "train_tokens_per_second": 169.671
+ },
+ {
+ "epoch": 1.7372723894463025,
+ "grad_norm": 0.8484268316277586,
+ "learning_rate": 3.991703483351039e-05,
+ "loss": 0.6668,
+ "num_input_tokens_seen": 3661216,
+ "step": 585,
+ "train_runtime": 21594.0452,
+ "train_tokens_per_second": 169.547
+ },
+ {
+ "epoch": 1.7521367521367521,
+ "grad_norm": 0.34272148077746284,
+ "learning_rate": 3.9714924236297155e-05,
+ "loss": 0.6033,
+ "num_input_tokens_seen": 3700064,
+ "step": 590,
+ "train_runtime": 21798.0768,
+ "train_tokens_per_second": 169.743
+ },
+ {
+ "epoch": 1.7670011148272018,
+ "grad_norm": 0.588668446023156,
+ "learning_rate": 3.9511331238918837e-05,
+ "loss": 0.5136,
+ "num_input_tokens_seen": 3730992,
+ "step": 595,
+ "train_runtime": 21979.8348,
+ "train_tokens_per_second": 169.746
+ },
+ {
+ "epoch": 1.7818654775176515,
+ "grad_norm": 0.5461111827190066,
+ "learning_rate": 3.9306276351592685e-05,
+ "loss": 0.5867,
+ "num_input_tokens_seen": 3766176,
+ "step": 600,
+ "train_runtime": 22165.8728,
+ "train_tokens_per_second": 169.909
+ },
+ {
+ "epoch": 1.7818654775176515,
+ "eval_loss": 0.8493290543556213,
+ "eval_runtime": 354.1156,
+ "eval_samples_per_second": 1.322,
+ "eval_steps_per_second": 0.661,
+ "num_input_tokens_seen": 3766176,
+ "step": 600
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 1685,
+ "num_input_tokens_seen": 3766176,
+ "num_train_epochs": 5,
+ "save_steps": 200,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 166671066071040.0,
+ "train_batch_size": 1,
+ "trial_name": null,
+ "trial_params": null
+ }
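
A minimal sketch of how the log_history above can be read back for plotting or comparison, assuming the checkpoint directory has already been downloaded locally (for example with huggingface_hub); the local path below is a placeholder, not part of this commit:

# Minimal sketch: read training/eval curves back out of trainer_state.json.
# Assumes the checkpoint has been downloaded locally; the path is a placeholder.
import json

with open("checkpoint-600/trainer_state.json") as f:  # hypothetical local path
    state = json.load(f)

print(f"global_step={state['global_step']}, epoch={state['epoch']:.2f}")

# Training log entries carry "loss"; evaluation entries carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"{len(train_logs)} training logs (every {state['logging_steps']} steps)")
for e in eval_logs:
    print(f"step {e['step']:>4}: eval_loss={e['eval_loss']:.4f}")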