diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8043 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.09285051067780872, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 9.285051067780872e-05, + "grad_norm": 14.287509811043522, + "learning_rate": 0.0, + "loss": 1.1616, + "num_tokens": 13807.0, + "step": 1 + }, + { + "epoch": 0.00018570102135561745, + "grad_norm": 9.168088347237216, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.1401, + "num_tokens": 26712.0, + "step": 2 + }, + { + "epoch": 0.0002785515320334262, + "grad_norm": 4.754854538568191, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.1516, + "num_tokens": 42721.0, + "step": 3 + }, + { + "epoch": 0.0003714020427112349, + "grad_norm": 4.37586570254163, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.0378, + "num_tokens": 56819.0, + "step": 4 + }, + { + "epoch": 0.00046425255338904364, + "grad_norm": 4.182352814209364, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.9674, + "num_tokens": 70968.0, + "step": 5 + }, + { + "epoch": 0.0005571030640668524, + "grad_norm": 3.248662541597197, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.8998, + "num_tokens": 85751.0, + "step": 6 + }, + { + "epoch": 0.0006499535747446611, + "grad_norm": 3.289945899972427, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.9959, + "num_tokens": 99035.0, + "step": 7 + }, + { + "epoch": 0.0007428040854224698, + "grad_norm": 2.747853892747352, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.8816, + "num_tokens": 114831.0, + "step": 8 + }, + { + "epoch": 0.0008356545961002785, + "grad_norm": 2.7244745527918983, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.7935, + "num_tokens": 129931.0, + "step": 9 + }, + { + "epoch": 0.0009285051067780873, + "grad_norm": 2.6033029462752166, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.8062, + "num_tokens": 143585.0, + "step": 10 + }, + { + "epoch": 0.001021355617455896, + "grad_norm": 2.429086396139323, + "learning_rate": 5.000000000000001e-07, + "loss": 0.8366, + "num_tokens": 158835.0, + "step": 11 + }, + { + "epoch": 0.0011142061281337048, + "grad_norm": 2.387367110151706, + "learning_rate": 5.5e-07, + "loss": 0.7614, + "num_tokens": 173289.0, + "step": 12 + }, + { + "epoch": 0.0012070566388115134, + "grad_norm": 2.700143522537229, + "learning_rate": 6.000000000000001e-07, + "loss": 0.9328, + "num_tokens": 186180.0, + "step": 13 + }, + { + "epoch": 0.0012999071494893223, + "grad_norm": 2.4320945569073116, + "learning_rate": 6.5e-07, + "loss": 0.8479, + "num_tokens": 200413.0, + "step": 14 + }, + { + "epoch": 0.001392757660167131, + "grad_norm": 2.2865494279040615, + "learning_rate": 7.000000000000001e-07, + "loss": 0.77, + "num_tokens": 215612.0, + "step": 15 + }, + { + "epoch": 0.0014856081708449396, + "grad_norm": 2.481503726196433, + "learning_rate": 7.5e-07, + "loss": 0.8109, + "num_tokens": 229051.0, + "step": 16 + }, + { + "epoch": 0.0015784586815227484, + "grad_norm": 2.19807802441244, + "learning_rate": 8.000000000000001e-07, + "loss": 0.7204, + "num_tokens": 243368.0, + "step": 17 + }, + { + "epoch": 0.001671309192200557, + "grad_norm": 2.3845021359711596, + "learning_rate": 8.500000000000001e-07, + "loss": 0.72, + "num_tokens": 256339.0, + "step": 18 + }, + { + "epoch": 0.001764159702878366, + "grad_norm": 2.351958556972541, + "learning_rate": 9.000000000000001e-07, + "loss": 0.7748, + "num_tokens": 271000.0, + "step": 19 + }, + { + "epoch": 0.0018570102135561746, + "grad_norm": 2.266943495454673, + "learning_rate": 9.500000000000001e-07, + "loss": 0.7119, + "num_tokens": 285632.0, + "step": 20 + }, + { + "epoch": 0.0019498607242339832, + "grad_norm": 2.3558492082051323, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.8105, + "num_tokens": 301058.0, + "step": 21 + }, + { + "epoch": 0.002042711234911792, + "grad_norm": 2.2384444005740223, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.6897, + "num_tokens": 314223.0, + "step": 22 + }, + { + "epoch": 0.0021355617455896007, + "grad_norm": 2.4657363016519542, + "learning_rate": 1.1e-06, + "loss": 0.7515, + "num_tokens": 327941.0, + "step": 23 + }, + { + "epoch": 0.0022284122562674096, + "grad_norm": 2.2717254033005894, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.7083, + "num_tokens": 341348.0, + "step": 24 + }, + { + "epoch": 0.002321262766945218, + "grad_norm": 2.3620201224841053, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.8159, + "num_tokens": 354390.0, + "step": 25 + }, + { + "epoch": 0.002414113277623027, + "grad_norm": 2.2279824352593467, + "learning_rate": 1.25e-06, + "loss": 0.7913, + "num_tokens": 368251.0, + "step": 26 + }, + { + "epoch": 0.0025069637883008357, + "grad_norm": 2.1878176708057415, + "learning_rate": 1.3e-06, + "loss": 0.6895, + "num_tokens": 382913.0, + "step": 27 + }, + { + "epoch": 0.0025998142989786446, + "grad_norm": 2.155832252780746, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.8248, + "num_tokens": 398649.0, + "step": 28 + }, + { + "epoch": 0.002692664809656453, + "grad_norm": 2.2227740504584426, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.6986, + "num_tokens": 412446.0, + "step": 29 + }, + { + "epoch": 0.002785515320334262, + "grad_norm": 2.0023118504160093, + "learning_rate": 1.45e-06, + "loss": 0.6905, + "num_tokens": 428222.0, + "step": 30 + }, + { + "epoch": 0.0028783658310120707, + "grad_norm": 2.193839330385369, + "learning_rate": 1.5e-06, + "loss": 0.7439, + "num_tokens": 441616.0, + "step": 31 + }, + { + "epoch": 0.002971216341689879, + "grad_norm": 2.1326626802879716, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.7303, + "num_tokens": 456791.0, + "step": 32 + }, + { + "epoch": 0.003064066852367688, + "grad_norm": 2.1284702504469357, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.7059, + "num_tokens": 471464.0, + "step": 33 + }, + { + "epoch": 0.003156917363045497, + "grad_norm": 2.3630177545657216, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.8518, + "num_tokens": 486190.0, + "step": 34 + }, + { + "epoch": 0.0032497678737233053, + "grad_norm": 2.203295780747593, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.727, + "num_tokens": 499372.0, + "step": 35 + }, + { + "epoch": 0.003342618384401114, + "grad_norm": 2.174252031394671, + "learning_rate": 1.75e-06, + "loss": 0.6994, + "num_tokens": 513678.0, + "step": 36 + }, + { + "epoch": 0.003435468895078923, + "grad_norm": 2.227471086870093, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.7931, + "num_tokens": 528006.0, + "step": 37 + }, + { + "epoch": 0.003528319405756732, + "grad_norm": 2.30484540996748, + "learning_rate": 1.85e-06, + "loss": 0.7245, + "num_tokens": 542924.0, + "step": 38 + }, + { + "epoch": 0.0036211699164345403, + "grad_norm": 2.1063586605096223, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.7946, + "num_tokens": 556514.0, + "step": 39 + }, + { + "epoch": 0.003714020427112349, + "grad_norm": 2.1652087233452195, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.6553, + "num_tokens": 568910.0, + "step": 40 + }, + { + "epoch": 0.003806870937790158, + "grad_norm": 2.2118946627169644, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7139, + "num_tokens": 582979.0, + "step": 41 + }, + { + "epoch": 0.0038997214484679664, + "grad_norm": 2.12125626896352, + "learning_rate": 2.05e-06, + "loss": 0.7821, + "num_tokens": 598581.0, + "step": 42 + }, + { + "epoch": 0.003992571959145775, + "grad_norm": 2.1314976121509863, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.6638, + "num_tokens": 612967.0, + "step": 43 + }, + { + "epoch": 0.004085422469823584, + "grad_norm": 2.231906908850155, + "learning_rate": 2.15e-06, + "loss": 0.8441, + "num_tokens": 628069.0, + "step": 44 + }, + { + "epoch": 0.004178272980501393, + "grad_norm": 2.214069617257424, + "learning_rate": 2.2e-06, + "loss": 0.644, + "num_tokens": 640353.0, + "step": 45 + }, + { + "epoch": 0.004271123491179201, + "grad_norm": 2.2615830412895033, + "learning_rate": 2.25e-06, + "loss": 0.7159, + "num_tokens": 655384.0, + "step": 46 + }, + { + "epoch": 0.00436397400185701, + "grad_norm": 2.0000370368189193, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.6425, + "num_tokens": 669066.0, + "step": 47 + }, + { + "epoch": 0.004456824512534819, + "grad_norm": 2.1346640899314986, + "learning_rate": 2.35e-06, + "loss": 0.6939, + "num_tokens": 682055.0, + "step": 48 + }, + { + "epoch": 0.0045496750232126276, + "grad_norm": 2.1853877550069436, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.7328, + "num_tokens": 696851.0, + "step": 49 + }, + { + "epoch": 0.004642525533890436, + "grad_norm": 2.097531541230524, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.6802, + "num_tokens": 711014.0, + "step": 50 + }, + { + "epoch": 0.004735376044568245, + "grad_norm": 2.124733849595952, + "learning_rate": 2.5e-06, + "loss": 0.6325, + "num_tokens": 724925.0, + "step": 51 + }, + { + "epoch": 0.004828226555246054, + "grad_norm": 2.175546181546373, + "learning_rate": 2.55e-06, + "loss": 0.6227, + "num_tokens": 738283.0, + "step": 52 + }, + { + "epoch": 0.004921077065923863, + "grad_norm": 2.4607205368314093, + "learning_rate": 2.6e-06, + "loss": 0.8863, + "num_tokens": 751939.0, + "step": 53 + }, + { + "epoch": 0.005013927576601671, + "grad_norm": 1.9997530119482616, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.6715, + "num_tokens": 766863.0, + "step": 54 + }, + { + "epoch": 0.00510677808727948, + "grad_norm": 2.146481949226111, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.732, + "num_tokens": 781712.0, + "step": 55 + }, + { + "epoch": 0.005199628597957289, + "grad_norm": 2.3508341185952366, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.7888, + "num_tokens": 795599.0, + "step": 56 + }, + { + "epoch": 0.0052924791086350976, + "grad_norm": 2.1926190698546537, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.7789, + "num_tokens": 811697.0, + "step": 57 + }, + { + "epoch": 0.005385329619312906, + "grad_norm": 2.4219159793944005, + "learning_rate": 2.85e-06, + "loss": 0.8476, + "num_tokens": 824438.0, + "step": 58 + }, + { + "epoch": 0.005478180129990715, + "grad_norm": 2.0614504154319193, + "learning_rate": 2.9e-06, + "loss": 0.7168, + "num_tokens": 839593.0, + "step": 59 + }, + { + "epoch": 0.005571030640668524, + "grad_norm": 2.2502153719806524, + "learning_rate": 2.95e-06, + "loss": 0.7145, + "num_tokens": 853454.0, + "step": 60 + }, + { + "epoch": 0.005663881151346332, + "grad_norm": 2.3293117561929435, + "learning_rate": 3e-06, + "loss": 0.7381, + "num_tokens": 868236.0, + "step": 61 + }, + { + "epoch": 0.005756731662024141, + "grad_norm": 2.07810621547947, + "learning_rate": 3.05e-06, + "loss": 0.6148, + "num_tokens": 884001.0, + "step": 62 + }, + { + "epoch": 0.00584958217270195, + "grad_norm": 2.1871098674401894, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.7906, + "num_tokens": 898223.0, + "step": 63 + }, + { + "epoch": 0.005942432683379758, + "grad_norm": 2.047799574220529, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.6349, + "num_tokens": 913430.0, + "step": 64 + }, + { + "epoch": 0.0060352831940575676, + "grad_norm": 2.072528524202935, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.6659, + "num_tokens": 928578.0, + "step": 65 + }, + { + "epoch": 0.006128133704735376, + "grad_norm": 2.159087212743246, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.6658, + "num_tokens": 942520.0, + "step": 66 + }, + { + "epoch": 0.006220984215413184, + "grad_norm": 2.0234147107706004, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.6321, + "num_tokens": 957209.0, + "step": 67 + }, + { + "epoch": 0.006313834726090994, + "grad_norm": 2.3119730292263876, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.6896, + "num_tokens": 970890.0, + "step": 68 + }, + { + "epoch": 0.006406685236768802, + "grad_norm": 2.1419355844008345, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.6735, + "num_tokens": 986364.0, + "step": 69 + }, + { + "epoch": 0.0064995357474466105, + "grad_norm": 2.2182354874661865, + "learning_rate": 3.45e-06, + "loss": 0.7457, + "num_tokens": 999624.0, + "step": 70 + }, + { + "epoch": 0.00659238625812442, + "grad_norm": 2.1301051869821546, + "learning_rate": 3.5e-06, + "loss": 0.5716, + "num_tokens": 1011560.0, + "step": 71 + }, + { + "epoch": 0.006685236768802228, + "grad_norm": 2.055313999030009, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.6408, + "num_tokens": 1025935.0, + "step": 72 + }, + { + "epoch": 0.0067780872794800376, + "grad_norm": 2.0320897547354586, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.6677, + "num_tokens": 1041929.0, + "step": 73 + }, + { + "epoch": 0.006870937790157846, + "grad_norm": 2.1957089951294866, + "learning_rate": 3.65e-06, + "loss": 0.7533, + "num_tokens": 1056525.0, + "step": 74 + }, + { + "epoch": 0.006963788300835654, + "grad_norm": 2.1303284632089143, + "learning_rate": 3.7e-06, + "loss": 0.6551, + "num_tokens": 1071307.0, + "step": 75 + }, + { + "epoch": 0.007056638811513464, + "grad_norm": 2.379880562813867, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.7081, + "num_tokens": 1083650.0, + "step": 76 + }, + { + "epoch": 0.007149489322191272, + "grad_norm": 1.9353928565445153, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.6264, + "num_tokens": 1099288.0, + "step": 77 + }, + { + "epoch": 0.0072423398328690805, + "grad_norm": 2.378942215613804, + "learning_rate": 3.85e-06, + "loss": 0.7502, + "num_tokens": 1112453.0, + "step": 78 + }, + { + "epoch": 0.00733519034354689, + "grad_norm": 2.0179685145828166, + "learning_rate": 3.900000000000001e-06, + "loss": 0.6985, + "num_tokens": 1128092.0, + "step": 79 + }, + { + "epoch": 0.007428040854224698, + "grad_norm": 2.338327873026441, + "learning_rate": 3.95e-06, + "loss": 0.8046, + "num_tokens": 1142544.0, + "step": 80 + }, + { + "epoch": 0.007520891364902507, + "grad_norm": 2.160204745367612, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6533, + "num_tokens": 1156504.0, + "step": 81 + }, + { + "epoch": 0.007613741875580316, + "grad_norm": 2.041858980135371, + "learning_rate": 4.05e-06, + "loss": 0.7079, + "num_tokens": 1172348.0, + "step": 82 + }, + { + "epoch": 0.007706592386258124, + "grad_norm": 2.293588942292026, + "learning_rate": 4.1e-06, + "loss": 0.6849, + "num_tokens": 1186064.0, + "step": 83 + }, + { + "epoch": 0.007799442896935933, + "grad_norm": 2.210814706722604, + "learning_rate": 4.15e-06, + "loss": 0.8255, + "num_tokens": 1201940.0, + "step": 84 + }, + { + "epoch": 0.007892293407613741, + "grad_norm": 2.159267045230126, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.7131, + "num_tokens": 1217706.0, + "step": 85 + }, + { + "epoch": 0.00798514391829155, + "grad_norm": 2.4600081935708364, + "learning_rate": 4.25e-06, + "loss": 0.727, + "num_tokens": 1230820.0, + "step": 86 + }, + { + "epoch": 0.00807799442896936, + "grad_norm": 2.0812134722717124, + "learning_rate": 4.3e-06, + "loss": 0.6788, + "num_tokens": 1245894.0, + "step": 87 + }, + { + "epoch": 0.008170844939647167, + "grad_norm": 2.213998537048535, + "learning_rate": 4.350000000000001e-06, + "loss": 0.743, + "num_tokens": 1260472.0, + "step": 88 + }, + { + "epoch": 0.008263695450324977, + "grad_norm": 2.247398053031056, + "learning_rate": 4.4e-06, + "loss": 0.7532, + "num_tokens": 1275561.0, + "step": 89 + }, + { + "epoch": 0.008356545961002786, + "grad_norm": 1.9869509142912232, + "learning_rate": 4.450000000000001e-06, + "loss": 0.6027, + "num_tokens": 1290411.0, + "step": 90 + }, + { + "epoch": 0.008449396471680594, + "grad_norm": 2.10555291017345, + "learning_rate": 4.5e-06, + "loss": 0.6669, + "num_tokens": 1304343.0, + "step": 91 + }, + { + "epoch": 0.008542246982358403, + "grad_norm": 1.9766517334282452, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.5788, + "num_tokens": 1319620.0, + "step": 92 + }, + { + "epoch": 0.008635097493036212, + "grad_norm": 2.2060354019031654, + "learning_rate": 4.600000000000001e-06, + "loss": 0.6485, + "num_tokens": 1333475.0, + "step": 93 + }, + { + "epoch": 0.00872794800371402, + "grad_norm": 2.174901287494263, + "learning_rate": 4.65e-06, + "loss": 0.7091, + "num_tokens": 1348738.0, + "step": 94 + }, + { + "epoch": 0.008820798514391829, + "grad_norm": 2.3102005788024664, + "learning_rate": 4.7e-06, + "loss": 0.8085, + "num_tokens": 1362927.0, + "step": 95 + }, + { + "epoch": 0.008913649025069638, + "grad_norm": 2.0375894520412428, + "learning_rate": 4.75e-06, + "loss": 0.5361, + "num_tokens": 1375675.0, + "step": 96 + }, + { + "epoch": 0.009006499535747446, + "grad_norm": 2.2092690547244858, + "learning_rate": 4.800000000000001e-06, + "loss": 0.7782, + "num_tokens": 1391107.0, + "step": 97 + }, + { + "epoch": 0.009099350046425255, + "grad_norm": 2.094227515449823, + "learning_rate": 4.85e-06, + "loss": 0.6001, + "num_tokens": 1404931.0, + "step": 98 + }, + { + "epoch": 0.009192200557103064, + "grad_norm": 2.272494949140071, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.7559, + "num_tokens": 1418956.0, + "step": 99 + }, + { + "epoch": 0.009285051067780872, + "grad_norm": 2.2304997413893473, + "learning_rate": 4.95e-06, + "loss": 0.6957, + "num_tokens": 1432980.0, + "step": 100 + }, + { + "epoch": 0.009377901578458681, + "grad_norm": 2.253728423429401, + "learning_rate": 5e-06, + "loss": 0.6984, + "num_tokens": 1445982.0, + "step": 101 + }, + { + "epoch": 0.00947075208913649, + "grad_norm": 2.0982182949905877, + "learning_rate": 5e-06, + "loss": 0.6303, + "num_tokens": 1461991.0, + "step": 102 + }, + { + "epoch": 0.0095636025998143, + "grad_norm": 2.2482306071433342, + "learning_rate": 5e-06, + "loss": 0.7276, + "num_tokens": 1476407.0, + "step": 103 + }, + { + "epoch": 0.009656453110492107, + "grad_norm": 2.214577767879491, + "learning_rate": 5e-06, + "loss": 0.6006, + "num_tokens": 1489653.0, + "step": 104 + }, + { + "epoch": 0.009749303621169917, + "grad_norm": 2.1615899582775224, + "learning_rate": 5e-06, + "loss": 0.743, + "num_tokens": 1504334.0, + "step": 105 + }, + { + "epoch": 0.009842154131847726, + "grad_norm": 2.3300960279758423, + "learning_rate": 5e-06, + "loss": 0.7086, + "num_tokens": 1519262.0, + "step": 106 + }, + { + "epoch": 0.009935004642525534, + "grad_norm": 2.170259841672104, + "learning_rate": 5e-06, + "loss": 0.6716, + "num_tokens": 1533532.0, + "step": 107 + }, + { + "epoch": 0.010027855153203343, + "grad_norm": 1.925891870719645, + "learning_rate": 5e-06, + "loss": 0.5642, + "num_tokens": 1547259.0, + "step": 108 + }, + { + "epoch": 0.010120705663881152, + "grad_norm": 2.111904758392159, + "learning_rate": 5e-06, + "loss": 0.6685, + "num_tokens": 1562390.0, + "step": 109 + }, + { + "epoch": 0.01021355617455896, + "grad_norm": 2.1911978191937185, + "learning_rate": 5e-06, + "loss": 0.7214, + "num_tokens": 1576122.0, + "step": 110 + }, + { + "epoch": 0.010306406685236769, + "grad_norm": 2.274771164711765, + "learning_rate": 5e-06, + "loss": 0.7586, + "num_tokens": 1589086.0, + "step": 111 + }, + { + "epoch": 0.010399257195914578, + "grad_norm": 2.242339962556352, + "learning_rate": 5e-06, + "loss": 0.6064, + "num_tokens": 1603143.0, + "step": 112 + }, + { + "epoch": 0.010492107706592386, + "grad_norm": 2.1717882766864443, + "learning_rate": 5e-06, + "loss": 0.6991, + "num_tokens": 1617570.0, + "step": 113 + }, + { + "epoch": 0.010584958217270195, + "grad_norm": 2.2270418800917975, + "learning_rate": 5e-06, + "loss": 0.6895, + "num_tokens": 1631062.0, + "step": 114 + }, + { + "epoch": 0.010677808727948004, + "grad_norm": 2.2203050227232506, + "learning_rate": 5e-06, + "loss": 0.6629, + "num_tokens": 1644695.0, + "step": 115 + }, + { + "epoch": 0.010770659238625812, + "grad_norm": 2.1752803464623387, + "learning_rate": 5e-06, + "loss": 0.6389, + "num_tokens": 1659457.0, + "step": 116 + }, + { + "epoch": 0.010863509749303621, + "grad_norm": 2.2267364334521673, + "learning_rate": 5e-06, + "loss": 0.5714, + "num_tokens": 1671902.0, + "step": 117 + }, + { + "epoch": 0.01095636025998143, + "grad_norm": 2.1685197345168508, + "learning_rate": 5e-06, + "loss": 0.5996, + "num_tokens": 1685274.0, + "step": 118 + }, + { + "epoch": 0.011049210770659238, + "grad_norm": 2.0400458548436315, + "learning_rate": 5e-06, + "loss": 0.6371, + "num_tokens": 1700035.0, + "step": 119 + }, + { + "epoch": 0.011142061281337047, + "grad_norm": 2.2768711270665807, + "learning_rate": 5e-06, + "loss": 0.7662, + "num_tokens": 1715455.0, + "step": 120 + }, + { + "epoch": 0.011234911792014857, + "grad_norm": 2.338383702911172, + "learning_rate": 5e-06, + "loss": 0.7496, + "num_tokens": 1728641.0, + "step": 121 + }, + { + "epoch": 0.011327762302692664, + "grad_norm": 2.278402117757206, + "learning_rate": 5e-06, + "loss": 0.6816, + "num_tokens": 1742447.0, + "step": 122 + }, + { + "epoch": 0.011420612813370474, + "grad_norm": 2.2279631600998107, + "learning_rate": 5e-06, + "loss": 0.7281, + "num_tokens": 1756499.0, + "step": 123 + }, + { + "epoch": 0.011513463324048283, + "grad_norm": 2.110584796389449, + "learning_rate": 5e-06, + "loss": 0.6212, + "num_tokens": 1771461.0, + "step": 124 + }, + { + "epoch": 0.01160631383472609, + "grad_norm": 2.08864563034088, + "learning_rate": 5e-06, + "loss": 0.6583, + "num_tokens": 1786032.0, + "step": 125 + }, + { + "epoch": 0.0116991643454039, + "grad_norm": 2.252747614850935, + "learning_rate": 5e-06, + "loss": 0.6045, + "num_tokens": 1799205.0, + "step": 126 + }, + { + "epoch": 0.011792014856081709, + "grad_norm": 2.13661482422633, + "learning_rate": 5e-06, + "loss": 0.6617, + "num_tokens": 1812961.0, + "step": 127 + }, + { + "epoch": 0.011884865366759517, + "grad_norm": 2.20525156097626, + "learning_rate": 5e-06, + "loss": 0.6795, + "num_tokens": 1827874.0, + "step": 128 + }, + { + "epoch": 0.011977715877437326, + "grad_norm": 2.101815112041532, + "learning_rate": 5e-06, + "loss": 0.6661, + "num_tokens": 1842979.0, + "step": 129 + }, + { + "epoch": 0.012070566388115135, + "grad_norm": 2.2675035857838766, + "learning_rate": 5e-06, + "loss": 0.7023, + "num_tokens": 1858451.0, + "step": 130 + }, + { + "epoch": 0.012163416898792943, + "grad_norm": 2.5822942714992694, + "learning_rate": 5e-06, + "loss": 0.7488, + "num_tokens": 1873637.0, + "step": 131 + }, + { + "epoch": 0.012256267409470752, + "grad_norm": 2.099884777406776, + "learning_rate": 5e-06, + "loss": 0.6535, + "num_tokens": 1888886.0, + "step": 132 + }, + { + "epoch": 0.012349117920148561, + "grad_norm": 2.407132763677911, + "learning_rate": 5e-06, + "loss": 0.7241, + "num_tokens": 1901712.0, + "step": 133 + }, + { + "epoch": 0.012441968430826369, + "grad_norm": 2.273164539766026, + "learning_rate": 5e-06, + "loss": 0.8251, + "num_tokens": 1916839.0, + "step": 134 + }, + { + "epoch": 0.012534818941504178, + "grad_norm": 2.1757692329071423, + "learning_rate": 5e-06, + "loss": 0.6474, + "num_tokens": 1930738.0, + "step": 135 + }, + { + "epoch": 0.012627669452181987, + "grad_norm": 2.1000246390915374, + "learning_rate": 5e-06, + "loss": 0.6575, + "num_tokens": 1945687.0, + "step": 136 + }, + { + "epoch": 0.012720519962859795, + "grad_norm": 2.3486532085154073, + "learning_rate": 5e-06, + "loss": 0.7223, + "num_tokens": 1958947.0, + "step": 137 + }, + { + "epoch": 0.012813370473537604, + "grad_norm": 2.1957507854935128, + "learning_rate": 5e-06, + "loss": 0.6823, + "num_tokens": 1972778.0, + "step": 138 + }, + { + "epoch": 0.012906220984215414, + "grad_norm": 2.2307347422883943, + "learning_rate": 5e-06, + "loss": 0.7317, + "num_tokens": 1986443.0, + "step": 139 + }, + { + "epoch": 0.012999071494893221, + "grad_norm": 2.0675934430996636, + "learning_rate": 5e-06, + "loss": 0.6607, + "num_tokens": 2001831.0, + "step": 140 + }, + { + "epoch": 0.01309192200557103, + "grad_norm": 2.0233348443154657, + "learning_rate": 5e-06, + "loss": 0.7004, + "num_tokens": 2016960.0, + "step": 141 + }, + { + "epoch": 0.01318477251624884, + "grad_norm": 1.972074169607146, + "learning_rate": 5e-06, + "loss": 0.637, + "num_tokens": 2032512.0, + "step": 142 + }, + { + "epoch": 0.013277623026926649, + "grad_norm": 2.173060672824674, + "learning_rate": 5e-06, + "loss": 0.616, + "num_tokens": 2047487.0, + "step": 143 + }, + { + "epoch": 0.013370473537604457, + "grad_norm": 2.1926336954495103, + "learning_rate": 5e-06, + "loss": 0.5793, + "num_tokens": 2060914.0, + "step": 144 + }, + { + "epoch": 0.013463324048282266, + "grad_norm": 2.3026536079434896, + "learning_rate": 5e-06, + "loss": 0.7222, + "num_tokens": 2075445.0, + "step": 145 + }, + { + "epoch": 0.013556174558960075, + "grad_norm": 2.208594549139958, + "learning_rate": 5e-06, + "loss": 0.5821, + "num_tokens": 2088872.0, + "step": 146 + }, + { + "epoch": 0.013649025069637883, + "grad_norm": 2.278486188101276, + "learning_rate": 5e-06, + "loss": 0.6495, + "num_tokens": 2102868.0, + "step": 147 + }, + { + "epoch": 0.013741875580315692, + "grad_norm": 2.117513297815845, + "learning_rate": 5e-06, + "loss": 0.6486, + "num_tokens": 2116915.0, + "step": 148 + }, + { + "epoch": 0.013834726090993501, + "grad_norm": 2.0969505441117526, + "learning_rate": 5e-06, + "loss": 0.7078, + "num_tokens": 2133299.0, + "step": 149 + }, + { + "epoch": 0.013927576601671309, + "grad_norm": 2.476979027211727, + "learning_rate": 5e-06, + "loss": 0.694, + "num_tokens": 2145791.0, + "step": 150 + }, + { + "epoch": 0.014020427112349118, + "grad_norm": 2.2184819672497817, + "learning_rate": 5e-06, + "loss": 0.5912, + "num_tokens": 2159027.0, + "step": 151 + }, + { + "epoch": 0.014113277623026927, + "grad_norm": 2.1742202212998087, + "learning_rate": 5e-06, + "loss": 0.6653, + "num_tokens": 2174774.0, + "step": 152 + }, + { + "epoch": 0.014206128133704735, + "grad_norm": 2.077664267074751, + "learning_rate": 5e-06, + "loss": 0.6276, + "num_tokens": 2189546.0, + "step": 153 + }, + { + "epoch": 0.014298978644382544, + "grad_norm": 2.084170029156568, + "learning_rate": 5e-06, + "loss": 0.6357, + "num_tokens": 2204046.0, + "step": 154 + }, + { + "epoch": 0.014391829155060354, + "grad_norm": 2.196496997514834, + "learning_rate": 5e-06, + "loss": 0.6929, + "num_tokens": 2219396.0, + "step": 155 + }, + { + "epoch": 0.014484679665738161, + "grad_norm": 2.0669347905326854, + "learning_rate": 5e-06, + "loss": 0.5721, + "num_tokens": 2232836.0, + "step": 156 + }, + { + "epoch": 0.01457753017641597, + "grad_norm": 2.1505494269109544, + "learning_rate": 5e-06, + "loss": 0.6067, + "num_tokens": 2247018.0, + "step": 157 + }, + { + "epoch": 0.01467038068709378, + "grad_norm": 2.1236648342068243, + "learning_rate": 5e-06, + "loss": 0.6502, + "num_tokens": 2261727.0, + "step": 158 + }, + { + "epoch": 0.014763231197771587, + "grad_norm": 2.054641741686151, + "learning_rate": 5e-06, + "loss": 0.6346, + "num_tokens": 2276662.0, + "step": 159 + }, + { + "epoch": 0.014856081708449397, + "grad_norm": 2.1451135516474955, + "learning_rate": 5e-06, + "loss": 0.6623, + "num_tokens": 2290536.0, + "step": 160 + }, + { + "epoch": 0.014948932219127206, + "grad_norm": 2.062927485355409, + "learning_rate": 5e-06, + "loss": 0.7109, + "num_tokens": 2306338.0, + "step": 161 + }, + { + "epoch": 0.015041782729805013, + "grad_norm": 2.089139938446874, + "learning_rate": 5e-06, + "loss": 0.628, + "num_tokens": 2320157.0, + "step": 162 + }, + { + "epoch": 0.015134633240482823, + "grad_norm": 2.1989311110189353, + "learning_rate": 5e-06, + "loss": 0.6974, + "num_tokens": 2334296.0, + "step": 163 + }, + { + "epoch": 0.015227483751160632, + "grad_norm": 2.141559784284699, + "learning_rate": 5e-06, + "loss": 0.5695, + "num_tokens": 2349383.0, + "step": 164 + }, + { + "epoch": 0.01532033426183844, + "grad_norm": 2.160417598782193, + "learning_rate": 5e-06, + "loss": 0.6379, + "num_tokens": 2363961.0, + "step": 165 + }, + { + "epoch": 0.015413184772516249, + "grad_norm": 2.164953235417068, + "learning_rate": 5e-06, + "loss": 0.5624, + "num_tokens": 2377850.0, + "step": 166 + }, + { + "epoch": 0.015506035283194058, + "grad_norm": 2.069669733937564, + "learning_rate": 5e-06, + "loss": 0.6306, + "num_tokens": 2392415.0, + "step": 167 + }, + { + "epoch": 0.015598885793871866, + "grad_norm": 2.240677761614807, + "learning_rate": 5e-06, + "loss": 0.6584, + "num_tokens": 2405603.0, + "step": 168 + }, + { + "epoch": 0.015691736304549675, + "grad_norm": 2.2382089533981, + "learning_rate": 5e-06, + "loss": 0.6496, + "num_tokens": 2419136.0, + "step": 169 + }, + { + "epoch": 0.015784586815227482, + "grad_norm": 2.0815864034367286, + "learning_rate": 5e-06, + "loss": 0.6736, + "num_tokens": 2434747.0, + "step": 170 + }, + { + "epoch": 0.015877437325905294, + "grad_norm": 2.105125043014408, + "learning_rate": 5e-06, + "loss": 0.6898, + "num_tokens": 2449621.0, + "step": 171 + }, + { + "epoch": 0.0159702878365831, + "grad_norm": 2.183801101504404, + "learning_rate": 5e-06, + "loss": 0.6885, + "num_tokens": 2464795.0, + "step": 172 + }, + { + "epoch": 0.01606313834726091, + "grad_norm": 2.0670492420519735, + "learning_rate": 5e-06, + "loss": 0.6156, + "num_tokens": 2479842.0, + "step": 173 + }, + { + "epoch": 0.01615598885793872, + "grad_norm": 2.0051671041444474, + "learning_rate": 5e-06, + "loss": 0.5744, + "num_tokens": 2495044.0, + "step": 174 + }, + { + "epoch": 0.016248839368616527, + "grad_norm": 2.109795551967952, + "learning_rate": 5e-06, + "loss": 0.6302, + "num_tokens": 2509678.0, + "step": 175 + }, + { + "epoch": 0.016341689879294335, + "grad_norm": 2.254370102067108, + "learning_rate": 5e-06, + "loss": 0.6981, + "num_tokens": 2524540.0, + "step": 176 + }, + { + "epoch": 0.016434540389972146, + "grad_norm": 2.2135679568153135, + "learning_rate": 5e-06, + "loss": 0.6996, + "num_tokens": 2539311.0, + "step": 177 + }, + { + "epoch": 0.016527390900649953, + "grad_norm": 2.338341709424745, + "learning_rate": 5e-06, + "loss": 0.604, + "num_tokens": 2551205.0, + "step": 178 + }, + { + "epoch": 0.01662024141132776, + "grad_norm": 2.1209764926017023, + "learning_rate": 5e-06, + "loss": 0.6645, + "num_tokens": 2566173.0, + "step": 179 + }, + { + "epoch": 0.016713091922005572, + "grad_norm": 2.151286868803476, + "learning_rate": 5e-06, + "loss": 0.6823, + "num_tokens": 2581044.0, + "step": 180 + }, + { + "epoch": 0.01680594243268338, + "grad_norm": 2.189250522851583, + "learning_rate": 5e-06, + "loss": 0.6526, + "num_tokens": 2596487.0, + "step": 181 + }, + { + "epoch": 0.016898792943361187, + "grad_norm": 2.186328986629683, + "learning_rate": 5e-06, + "loss": 0.6753, + "num_tokens": 2610606.0, + "step": 182 + }, + { + "epoch": 0.016991643454038998, + "grad_norm": 2.364126203991404, + "learning_rate": 5e-06, + "loss": 0.616, + "num_tokens": 2624432.0, + "step": 183 + }, + { + "epoch": 0.017084493964716806, + "grad_norm": 2.1888284025963154, + "learning_rate": 5e-06, + "loss": 0.6479, + "num_tokens": 2639311.0, + "step": 184 + }, + { + "epoch": 0.017177344475394613, + "grad_norm": 2.323079260058111, + "learning_rate": 5e-06, + "loss": 0.6708, + "num_tokens": 2652714.0, + "step": 185 + }, + { + "epoch": 0.017270194986072424, + "grad_norm": 2.2884440294795585, + "learning_rate": 5e-06, + "loss": 0.6372, + "num_tokens": 2667229.0, + "step": 186 + }, + { + "epoch": 0.017363045496750232, + "grad_norm": 2.217934628311155, + "learning_rate": 5e-06, + "loss": 0.7043, + "num_tokens": 2682865.0, + "step": 187 + }, + { + "epoch": 0.01745589600742804, + "grad_norm": 2.3157428549775663, + "learning_rate": 5e-06, + "loss": 0.6748, + "num_tokens": 2696639.0, + "step": 188 + }, + { + "epoch": 0.01754874651810585, + "grad_norm": 2.3284546257646457, + "learning_rate": 5e-06, + "loss": 0.7613, + "num_tokens": 2711292.0, + "step": 189 + }, + { + "epoch": 0.017641597028783658, + "grad_norm": 2.317116166466421, + "learning_rate": 5e-06, + "loss": 0.6445, + "num_tokens": 2726671.0, + "step": 190 + }, + { + "epoch": 0.017734447539461465, + "grad_norm": 2.157623149959027, + "learning_rate": 5e-06, + "loss": 0.643, + "num_tokens": 2742312.0, + "step": 191 + }, + { + "epoch": 0.017827298050139277, + "grad_norm": 2.433893213601624, + "learning_rate": 5e-06, + "loss": 0.7253, + "num_tokens": 2755294.0, + "step": 192 + }, + { + "epoch": 0.017920148560817084, + "grad_norm": 2.25213239409801, + "learning_rate": 5e-06, + "loss": 0.6495, + "num_tokens": 2769102.0, + "step": 193 + }, + { + "epoch": 0.01801299907149489, + "grad_norm": 2.218303922525866, + "learning_rate": 5e-06, + "loss": 0.637, + "num_tokens": 2782541.0, + "step": 194 + }, + { + "epoch": 0.018105849582172703, + "grad_norm": 2.299655956361809, + "learning_rate": 5e-06, + "loss": 0.7323, + "num_tokens": 2796304.0, + "step": 195 + }, + { + "epoch": 0.01819870009285051, + "grad_norm": 2.1297851145792843, + "learning_rate": 5e-06, + "loss": 0.5839, + "num_tokens": 2810399.0, + "step": 196 + }, + { + "epoch": 0.018291550603528318, + "grad_norm": 2.293774562094206, + "learning_rate": 5e-06, + "loss": 0.6575, + "num_tokens": 2823770.0, + "step": 197 + }, + { + "epoch": 0.01838440111420613, + "grad_norm": 2.243942627712653, + "learning_rate": 5e-06, + "loss": 0.6062, + "num_tokens": 2837299.0, + "step": 198 + }, + { + "epoch": 0.018477251624883936, + "grad_norm": 2.27973787526089, + "learning_rate": 5e-06, + "loss": 0.7118, + "num_tokens": 2851808.0, + "step": 199 + }, + { + "epoch": 0.018570102135561744, + "grad_norm": 2.32163511239, + "learning_rate": 5e-06, + "loss": 0.5771, + "num_tokens": 2864824.0, + "step": 200 + }, + { + "epoch": 0.018662952646239555, + "grad_norm": 2.2612929897659457, + "learning_rate": 5e-06, + "loss": 0.6343, + "num_tokens": 2878643.0, + "step": 201 + }, + { + "epoch": 0.018755803156917362, + "grad_norm": 2.2474703557948916, + "learning_rate": 5e-06, + "loss": 0.6425, + "num_tokens": 2893041.0, + "step": 202 + }, + { + "epoch": 0.01884865366759517, + "grad_norm": 2.401633671007521, + "learning_rate": 5e-06, + "loss": 0.8355, + "num_tokens": 2907356.0, + "step": 203 + }, + { + "epoch": 0.01894150417827298, + "grad_norm": 2.3621468378159816, + "learning_rate": 5e-06, + "loss": 0.6836, + "num_tokens": 2921877.0, + "step": 204 + }, + { + "epoch": 0.01903435468895079, + "grad_norm": 2.273712797852922, + "learning_rate": 5e-06, + "loss": 0.6374, + "num_tokens": 2935194.0, + "step": 205 + }, + { + "epoch": 0.0191272051996286, + "grad_norm": 2.147948489705186, + "learning_rate": 5e-06, + "loss": 0.6122, + "num_tokens": 2949929.0, + "step": 206 + }, + { + "epoch": 0.019220055710306407, + "grad_norm": 2.2544543211998977, + "learning_rate": 5e-06, + "loss": 0.6513, + "num_tokens": 2963136.0, + "step": 207 + }, + { + "epoch": 0.019312906220984215, + "grad_norm": 2.1862788194284866, + "learning_rate": 5e-06, + "loss": 0.6832, + "num_tokens": 2978416.0, + "step": 208 + }, + { + "epoch": 0.019405756731662026, + "grad_norm": 2.083087394561942, + "learning_rate": 5e-06, + "loss": 0.6517, + "num_tokens": 2993059.0, + "step": 209 + }, + { + "epoch": 0.019498607242339833, + "grad_norm": 2.12818688478096, + "learning_rate": 5e-06, + "loss": 0.626, + "num_tokens": 3006906.0, + "step": 210 + }, + { + "epoch": 0.01959145775301764, + "grad_norm": 2.2171269622446115, + "learning_rate": 5e-06, + "loss": 0.5625, + "num_tokens": 3020317.0, + "step": 211 + }, + { + "epoch": 0.019684308263695452, + "grad_norm": 1.9588502427734422, + "learning_rate": 5e-06, + "loss": 0.5676, + "num_tokens": 3034988.0, + "step": 212 + }, + { + "epoch": 0.01977715877437326, + "grad_norm": 2.0134794296167886, + "learning_rate": 5e-06, + "loss": 0.613, + "num_tokens": 3049385.0, + "step": 213 + }, + { + "epoch": 0.019870009285051067, + "grad_norm": 2.166042576850574, + "learning_rate": 5e-06, + "loss": 0.6036, + "num_tokens": 3062877.0, + "step": 214 + }, + { + "epoch": 0.019962859795728878, + "grad_norm": 2.1682852331679303, + "learning_rate": 5e-06, + "loss": 0.6983, + "num_tokens": 3076704.0, + "step": 215 + }, + { + "epoch": 0.020055710306406686, + "grad_norm": 2.3439291101989164, + "learning_rate": 5e-06, + "loss": 0.7482, + "num_tokens": 3090205.0, + "step": 216 + }, + { + "epoch": 0.020148560817084493, + "grad_norm": 2.1828814787503474, + "learning_rate": 5e-06, + "loss": 0.5395, + "num_tokens": 3101766.0, + "step": 217 + }, + { + "epoch": 0.020241411327762304, + "grad_norm": 1.924987339093511, + "learning_rate": 5e-06, + "loss": 0.5764, + "num_tokens": 3117231.0, + "step": 218 + }, + { + "epoch": 0.020334261838440112, + "grad_norm": 1.947709918328564, + "learning_rate": 5e-06, + "loss": 0.5369, + "num_tokens": 3132027.0, + "step": 219 + }, + { + "epoch": 0.02042711234911792, + "grad_norm": 2.2785349454388344, + "learning_rate": 5e-06, + "loss": 0.598, + "num_tokens": 3146113.0, + "step": 220 + }, + { + "epoch": 0.02051996285979573, + "grad_norm": 2.198167644389519, + "learning_rate": 5e-06, + "loss": 0.7482, + "num_tokens": 3161472.0, + "step": 221 + }, + { + "epoch": 0.020612813370473538, + "grad_norm": 2.1913734378558667, + "learning_rate": 5e-06, + "loss": 0.5618, + "num_tokens": 3173690.0, + "step": 222 + }, + { + "epoch": 0.020705663881151345, + "grad_norm": 2.1636926351833403, + "learning_rate": 5e-06, + "loss": 0.6269, + "num_tokens": 3187745.0, + "step": 223 + }, + { + "epoch": 0.020798514391829157, + "grad_norm": 2.4714410846032395, + "learning_rate": 5e-06, + "loss": 0.7892, + "num_tokens": 3201238.0, + "step": 224 + }, + { + "epoch": 0.020891364902506964, + "grad_norm": 2.3285243058037786, + "learning_rate": 5e-06, + "loss": 0.6138, + "num_tokens": 3214519.0, + "step": 225 + }, + { + "epoch": 0.02098421541318477, + "grad_norm": 2.3412892823833937, + "learning_rate": 5e-06, + "loss": 0.6636, + "num_tokens": 3228265.0, + "step": 226 + }, + { + "epoch": 0.021077065923862583, + "grad_norm": 2.1220429233886033, + "learning_rate": 5e-06, + "loss": 0.6886, + "num_tokens": 3243767.0, + "step": 227 + }, + { + "epoch": 0.02116991643454039, + "grad_norm": 2.255105313458723, + "learning_rate": 5e-06, + "loss": 0.7232, + "num_tokens": 3258805.0, + "step": 228 + }, + { + "epoch": 0.021262766945218198, + "grad_norm": 2.527306641219495, + "learning_rate": 5e-06, + "loss": 0.7722, + "num_tokens": 3272245.0, + "step": 229 + }, + { + "epoch": 0.02135561745589601, + "grad_norm": 2.1131272714325195, + "learning_rate": 5e-06, + "loss": 0.6262, + "num_tokens": 3285256.0, + "step": 230 + }, + { + "epoch": 0.021448467966573816, + "grad_norm": 2.0418637482563957, + "learning_rate": 5e-06, + "loss": 0.5851, + "num_tokens": 3298365.0, + "step": 231 + }, + { + "epoch": 0.021541318477251624, + "grad_norm": 2.158977359744682, + "learning_rate": 5e-06, + "loss": 0.5906, + "num_tokens": 3312312.0, + "step": 232 + }, + { + "epoch": 0.021634168987929435, + "grad_norm": 1.9742557233081006, + "learning_rate": 5e-06, + "loss": 0.6009, + "num_tokens": 3328059.0, + "step": 233 + }, + { + "epoch": 0.021727019498607242, + "grad_norm": 2.132233502603073, + "learning_rate": 5e-06, + "loss": 0.6558, + "num_tokens": 3343087.0, + "step": 234 + }, + { + "epoch": 0.02181987000928505, + "grad_norm": 2.2810569030231926, + "learning_rate": 5e-06, + "loss": 0.65, + "num_tokens": 3355735.0, + "step": 235 + }, + { + "epoch": 0.02191272051996286, + "grad_norm": 1.948302033742449, + "learning_rate": 5e-06, + "loss": 0.6033, + "num_tokens": 3371814.0, + "step": 236 + }, + { + "epoch": 0.02200557103064067, + "grad_norm": 2.1743386329670185, + "learning_rate": 5e-06, + "loss": 0.7011, + "num_tokens": 3386802.0, + "step": 237 + }, + { + "epoch": 0.022098421541318476, + "grad_norm": 2.265242678206548, + "learning_rate": 5e-06, + "loss": 0.6803, + "num_tokens": 3400358.0, + "step": 238 + }, + { + "epoch": 0.022191272051996287, + "grad_norm": 2.2274423597502677, + "learning_rate": 5e-06, + "loss": 0.7217, + "num_tokens": 3415363.0, + "step": 239 + }, + { + "epoch": 0.022284122562674095, + "grad_norm": 2.2324007565693207, + "learning_rate": 5e-06, + "loss": 0.637, + "num_tokens": 3428088.0, + "step": 240 + }, + { + "epoch": 0.022376973073351902, + "grad_norm": 2.1748781129955956, + "learning_rate": 5e-06, + "loss": 0.729, + "num_tokens": 3443210.0, + "step": 241 + }, + { + "epoch": 0.022469823584029713, + "grad_norm": 2.0899441938954015, + "learning_rate": 5e-06, + "loss": 0.5811, + "num_tokens": 3458631.0, + "step": 242 + }, + { + "epoch": 0.02256267409470752, + "grad_norm": 2.1373468303916368, + "learning_rate": 5e-06, + "loss": 0.6306, + "num_tokens": 3473418.0, + "step": 243 + }, + { + "epoch": 0.02265552460538533, + "grad_norm": 2.028419713954812, + "learning_rate": 5e-06, + "loss": 0.6159, + "num_tokens": 3488587.0, + "step": 244 + }, + { + "epoch": 0.02274837511606314, + "grad_norm": 2.323069306175, + "learning_rate": 5e-06, + "loss": 0.6214, + "num_tokens": 3501985.0, + "step": 245 + }, + { + "epoch": 0.022841225626740947, + "grad_norm": 2.1007870316038746, + "learning_rate": 5e-06, + "loss": 0.6068, + "num_tokens": 3516958.0, + "step": 246 + }, + { + "epoch": 0.022934076137418755, + "grad_norm": 2.2307308986225247, + "learning_rate": 5e-06, + "loss": 0.6266, + "num_tokens": 3530940.0, + "step": 247 + }, + { + "epoch": 0.023026926648096566, + "grad_norm": 2.3599828681848534, + "learning_rate": 5e-06, + "loss": 0.8556, + "num_tokens": 3546887.0, + "step": 248 + }, + { + "epoch": 0.023119777158774373, + "grad_norm": 2.1897667882128364, + "learning_rate": 5e-06, + "loss": 0.6937, + "num_tokens": 3560480.0, + "step": 249 + }, + { + "epoch": 0.02321262766945218, + "grad_norm": 2.3345760105676234, + "learning_rate": 5e-06, + "loss": 0.6055, + "num_tokens": 3571778.0, + "step": 250 + }, + { + "epoch": 0.023305478180129992, + "grad_norm": 2.09817567450407, + "learning_rate": 5e-06, + "loss": 0.6439, + "num_tokens": 3585515.0, + "step": 251 + }, + { + "epoch": 0.0233983286908078, + "grad_norm": 2.054908889121592, + "learning_rate": 5e-06, + "loss": 0.5369, + "num_tokens": 3600687.0, + "step": 252 + }, + { + "epoch": 0.023491179201485607, + "grad_norm": 2.1269955248680743, + "learning_rate": 5e-06, + "loss": 0.6743, + "num_tokens": 3615393.0, + "step": 253 + }, + { + "epoch": 0.023584029712163418, + "grad_norm": 2.1780642521568314, + "learning_rate": 5e-06, + "loss": 0.6524, + "num_tokens": 3631134.0, + "step": 254 + }, + { + "epoch": 0.023676880222841225, + "grad_norm": 2.1593367234678884, + "learning_rate": 5e-06, + "loss": 0.5967, + "num_tokens": 3645067.0, + "step": 255 + }, + { + "epoch": 0.023769730733519033, + "grad_norm": 2.0352941572250662, + "learning_rate": 5e-06, + "loss": 0.5727, + "num_tokens": 3658995.0, + "step": 256 + }, + { + "epoch": 0.023862581244196844, + "grad_norm": 2.196653144227753, + "learning_rate": 5e-06, + "loss": 0.7371, + "num_tokens": 3673617.0, + "step": 257 + }, + { + "epoch": 0.02395543175487465, + "grad_norm": 2.207978288912168, + "learning_rate": 5e-06, + "loss": 0.6571, + "num_tokens": 3688514.0, + "step": 258 + }, + { + "epoch": 0.02404828226555246, + "grad_norm": 2.018645559472593, + "learning_rate": 5e-06, + "loss": 0.5891, + "num_tokens": 3702596.0, + "step": 259 + }, + { + "epoch": 0.02414113277623027, + "grad_norm": 2.1430340467845723, + "learning_rate": 5e-06, + "loss": 0.6306, + "num_tokens": 3716473.0, + "step": 260 + }, + { + "epoch": 0.024233983286908078, + "grad_norm": 2.168977255601283, + "learning_rate": 5e-06, + "loss": 0.6816, + "num_tokens": 3730444.0, + "step": 261 + }, + { + "epoch": 0.024326833797585885, + "grad_norm": 2.2796003984449427, + "learning_rate": 5e-06, + "loss": 0.6652, + "num_tokens": 3744864.0, + "step": 262 + }, + { + "epoch": 0.024419684308263696, + "grad_norm": 2.209430923373051, + "learning_rate": 5e-06, + "loss": 0.6192, + "num_tokens": 3758880.0, + "step": 263 + }, + { + "epoch": 0.024512534818941504, + "grad_norm": 2.082171137282792, + "learning_rate": 5e-06, + "loss": 0.6052, + "num_tokens": 3773705.0, + "step": 264 + }, + { + "epoch": 0.02460538532961931, + "grad_norm": 2.2223926215125744, + "learning_rate": 5e-06, + "loss": 0.664, + "num_tokens": 3788528.0, + "step": 265 + }, + { + "epoch": 0.024698235840297122, + "grad_norm": 2.0334259705773516, + "learning_rate": 5e-06, + "loss": 0.6338, + "num_tokens": 3804492.0, + "step": 266 + }, + { + "epoch": 0.02479108635097493, + "grad_norm": 2.090643786241404, + "learning_rate": 5e-06, + "loss": 0.6709, + "num_tokens": 3820337.0, + "step": 267 + }, + { + "epoch": 0.024883936861652738, + "grad_norm": 2.082444088583218, + "learning_rate": 5e-06, + "loss": 0.6088, + "num_tokens": 3835000.0, + "step": 268 + }, + { + "epoch": 0.02497678737233055, + "grad_norm": 2.0780985126190696, + "learning_rate": 5e-06, + "loss": 0.6468, + "num_tokens": 3848673.0, + "step": 269 + }, + { + "epoch": 0.025069637883008356, + "grad_norm": 2.219448869714847, + "learning_rate": 5e-06, + "loss": 0.6875, + "num_tokens": 3861324.0, + "step": 270 + }, + { + "epoch": 0.025162488393686164, + "grad_norm": 2.002246341800795, + "learning_rate": 5e-06, + "loss": 0.5322, + "num_tokens": 3875413.0, + "step": 271 + }, + { + "epoch": 0.025255338904363975, + "grad_norm": 2.1016274970542024, + "learning_rate": 5e-06, + "loss": 0.6233, + "num_tokens": 3890039.0, + "step": 272 + }, + { + "epoch": 0.025348189415041782, + "grad_norm": 2.0097364880596436, + "learning_rate": 5e-06, + "loss": 0.6086, + "num_tokens": 3905381.0, + "step": 273 + }, + { + "epoch": 0.02544103992571959, + "grad_norm": 2.1212841583540554, + "learning_rate": 5e-06, + "loss": 0.6499, + "num_tokens": 3920211.0, + "step": 274 + }, + { + "epoch": 0.0255338904363974, + "grad_norm": 2.2023376262568055, + "learning_rate": 5e-06, + "loss": 0.6286, + "num_tokens": 3933655.0, + "step": 275 + }, + { + "epoch": 0.02562674094707521, + "grad_norm": 2.08398989037556, + "learning_rate": 5e-06, + "loss": 0.6534, + "num_tokens": 3948567.0, + "step": 276 + }, + { + "epoch": 0.025719591457753016, + "grad_norm": 2.058813880273289, + "learning_rate": 5e-06, + "loss": 0.644, + "num_tokens": 3962489.0, + "step": 277 + }, + { + "epoch": 0.025812441968430827, + "grad_norm": 2.182201445855465, + "learning_rate": 5e-06, + "loss": 0.588, + "num_tokens": 3975823.0, + "step": 278 + }, + { + "epoch": 0.025905292479108635, + "grad_norm": 2.0404587625362347, + "learning_rate": 5e-06, + "loss": 0.633, + "num_tokens": 3990519.0, + "step": 279 + }, + { + "epoch": 0.025998142989786442, + "grad_norm": 1.9039350260416585, + "learning_rate": 5e-06, + "loss": 0.597, + "num_tokens": 4006865.0, + "step": 280 + }, + { + "epoch": 0.026090993500464253, + "grad_norm": 2.273001169849209, + "learning_rate": 5e-06, + "loss": 0.6639, + "num_tokens": 4021123.0, + "step": 281 + }, + { + "epoch": 0.02618384401114206, + "grad_norm": 2.2117943469492616, + "learning_rate": 5e-06, + "loss": 0.606, + "num_tokens": 4035637.0, + "step": 282 + }, + { + "epoch": 0.02627669452181987, + "grad_norm": 2.226179101291727, + "learning_rate": 5e-06, + "loss": 0.7121, + "num_tokens": 4048706.0, + "step": 283 + }, + { + "epoch": 0.02636954503249768, + "grad_norm": 2.1067638234163937, + "learning_rate": 5e-06, + "loss": 0.6277, + "num_tokens": 4064125.0, + "step": 284 + }, + { + "epoch": 0.026462395543175487, + "grad_norm": 2.595271893626691, + "learning_rate": 5e-06, + "loss": 0.5883, + "num_tokens": 4074436.0, + "step": 285 + }, + { + "epoch": 0.026555246053853298, + "grad_norm": 2.272289291957328, + "learning_rate": 5e-06, + "loss": 0.6517, + "num_tokens": 4086413.0, + "step": 286 + }, + { + "epoch": 0.026648096564531105, + "grad_norm": 2.0834716502812705, + "learning_rate": 5e-06, + "loss": 0.5156, + "num_tokens": 4099585.0, + "step": 287 + }, + { + "epoch": 0.026740947075208913, + "grad_norm": 2.3491240031420473, + "learning_rate": 5e-06, + "loss": 0.681, + "num_tokens": 4111180.0, + "step": 288 + }, + { + "epoch": 0.026833797585886724, + "grad_norm": 2.0312135885846794, + "learning_rate": 5e-06, + "loss": 0.6322, + "num_tokens": 4125563.0, + "step": 289 + }, + { + "epoch": 0.02692664809656453, + "grad_norm": 1.9796349688359598, + "learning_rate": 5e-06, + "loss": 0.5736, + "num_tokens": 4140095.0, + "step": 290 + }, + { + "epoch": 0.02701949860724234, + "grad_norm": 2.1434479296586177, + "learning_rate": 5e-06, + "loss": 0.5815, + "num_tokens": 4154705.0, + "step": 291 + }, + { + "epoch": 0.02711234911792015, + "grad_norm": 1.9054453130079345, + "learning_rate": 5e-06, + "loss": 0.619, + "num_tokens": 4170632.0, + "step": 292 + }, + { + "epoch": 0.027205199628597958, + "grad_norm": 2.0903827564600683, + "learning_rate": 5e-06, + "loss": 0.5966, + "num_tokens": 4185232.0, + "step": 293 + }, + { + "epoch": 0.027298050139275765, + "grad_norm": 2.0501632180856753, + "learning_rate": 5e-06, + "loss": 0.5631, + "num_tokens": 4199520.0, + "step": 294 + }, + { + "epoch": 0.027390900649953576, + "grad_norm": 2.07917163203065, + "learning_rate": 5e-06, + "loss": 0.647, + "num_tokens": 4214880.0, + "step": 295 + }, + { + "epoch": 0.027483751160631384, + "grad_norm": 2.3163039778237917, + "learning_rate": 5e-06, + "loss": 0.6674, + "num_tokens": 4228135.0, + "step": 296 + }, + { + "epoch": 0.02757660167130919, + "grad_norm": 2.1524713179745816, + "learning_rate": 5e-06, + "loss": 0.6241, + "num_tokens": 4241740.0, + "step": 297 + }, + { + "epoch": 0.027669452181987002, + "grad_norm": 1.9627447308726265, + "learning_rate": 5e-06, + "loss": 0.5844, + "num_tokens": 4256660.0, + "step": 298 + }, + { + "epoch": 0.02776230269266481, + "grad_norm": 1.954254928132177, + "learning_rate": 5e-06, + "loss": 0.5254, + "num_tokens": 4272198.0, + "step": 299 + }, + { + "epoch": 0.027855153203342618, + "grad_norm": 2.154470932794785, + "learning_rate": 5e-06, + "loss": 0.642, + "num_tokens": 4287089.0, + "step": 300 + }, + { + "epoch": 0.02794800371402043, + "grad_norm": 2.0485067283044103, + "learning_rate": 5e-06, + "loss": 0.6198, + "num_tokens": 4301984.0, + "step": 301 + }, + { + "epoch": 0.028040854224698236, + "grad_norm": 2.3450108098255416, + "learning_rate": 5e-06, + "loss": 0.7235, + "num_tokens": 4316871.0, + "step": 302 + }, + { + "epoch": 0.028133704735376044, + "grad_norm": 2.2600447018315877, + "learning_rate": 5e-06, + "loss": 0.667, + "num_tokens": 4330770.0, + "step": 303 + }, + { + "epoch": 0.028226555246053855, + "grad_norm": 2.270047777186157, + "learning_rate": 5e-06, + "loss": 0.7304, + "num_tokens": 4346490.0, + "step": 304 + }, + { + "epoch": 0.028319405756731662, + "grad_norm": 2.1560769690448383, + "learning_rate": 5e-06, + "loss": 0.7445, + "num_tokens": 4361890.0, + "step": 305 + }, + { + "epoch": 0.02841225626740947, + "grad_norm": 2.2384214562419733, + "learning_rate": 5e-06, + "loss": 0.675, + "num_tokens": 4375718.0, + "step": 306 + }, + { + "epoch": 0.02850510677808728, + "grad_norm": 2.0881928292373884, + "learning_rate": 5e-06, + "loss": 0.6246, + "num_tokens": 4390587.0, + "step": 307 + }, + { + "epoch": 0.02859795728876509, + "grad_norm": 2.12899611762672, + "learning_rate": 5e-06, + "loss": 0.6135, + "num_tokens": 4404895.0, + "step": 308 + }, + { + "epoch": 0.028690807799442896, + "grad_norm": 2.1411525726790432, + "learning_rate": 5e-06, + "loss": 0.7095, + "num_tokens": 4420476.0, + "step": 309 + }, + { + "epoch": 0.028783658310120707, + "grad_norm": 2.063960133870804, + "learning_rate": 5e-06, + "loss": 0.6276, + "num_tokens": 4434025.0, + "step": 310 + }, + { + "epoch": 0.028876508820798515, + "grad_norm": 2.1094484058829557, + "learning_rate": 5e-06, + "loss": 0.6342, + "num_tokens": 4449141.0, + "step": 311 + }, + { + "epoch": 0.028969359331476322, + "grad_norm": 2.0638074042433194, + "learning_rate": 5e-06, + "loss": 0.6691, + "num_tokens": 4464893.0, + "step": 312 + }, + { + "epoch": 0.029062209842154133, + "grad_norm": 2.428858764068653, + "learning_rate": 5e-06, + "loss": 0.6484, + "num_tokens": 4476070.0, + "step": 313 + }, + { + "epoch": 0.02915506035283194, + "grad_norm": 2.057879512805209, + "learning_rate": 5e-06, + "loss": 0.6055, + "num_tokens": 4490955.0, + "step": 314 + }, + { + "epoch": 0.02924791086350975, + "grad_norm": 2.113654869002159, + "learning_rate": 5e-06, + "loss": 0.6132, + "num_tokens": 4506110.0, + "step": 315 + }, + { + "epoch": 0.02934076137418756, + "grad_norm": 2.1522770513809024, + "learning_rate": 5e-06, + "loss": 0.7801, + "num_tokens": 4521427.0, + "step": 316 + }, + { + "epoch": 0.029433611884865367, + "grad_norm": 2.0685217116373793, + "learning_rate": 5e-06, + "loss": 0.5375, + "num_tokens": 4535643.0, + "step": 317 + }, + { + "epoch": 0.029526462395543174, + "grad_norm": 2.270354897856227, + "learning_rate": 5e-06, + "loss": 0.7061, + "num_tokens": 4549923.0, + "step": 318 + }, + { + "epoch": 0.029619312906220985, + "grad_norm": 2.252276617988608, + "learning_rate": 5e-06, + "loss": 0.6479, + "num_tokens": 4565309.0, + "step": 319 + }, + { + "epoch": 0.029712163416898793, + "grad_norm": 2.1948104785451075, + "learning_rate": 5e-06, + "loss": 0.6756, + "num_tokens": 4578623.0, + "step": 320 + }, + { + "epoch": 0.0298050139275766, + "grad_norm": 2.2752052092956885, + "learning_rate": 5e-06, + "loss": 0.7029, + "num_tokens": 4592961.0, + "step": 321 + }, + { + "epoch": 0.02989786443825441, + "grad_norm": 2.042395791887509, + "learning_rate": 5e-06, + "loss": 0.617, + "num_tokens": 4607691.0, + "step": 322 + }, + { + "epoch": 0.02999071494893222, + "grad_norm": 2.042913815309425, + "learning_rate": 5e-06, + "loss": 0.6352, + "num_tokens": 4622307.0, + "step": 323 + }, + { + "epoch": 0.030083565459610027, + "grad_norm": 2.2989866272333317, + "learning_rate": 5e-06, + "loss": 0.6022, + "num_tokens": 4634527.0, + "step": 324 + }, + { + "epoch": 0.030176415970287838, + "grad_norm": 2.1112143278064917, + "learning_rate": 5e-06, + "loss": 0.6796, + "num_tokens": 4649601.0, + "step": 325 + }, + { + "epoch": 0.030269266480965645, + "grad_norm": 2.0473928752102473, + "learning_rate": 5e-06, + "loss": 0.6146, + "num_tokens": 4663647.0, + "step": 326 + }, + { + "epoch": 0.030362116991643453, + "grad_norm": 2.1859033785634607, + "learning_rate": 5e-06, + "loss": 0.5691, + "num_tokens": 4677240.0, + "step": 327 + }, + { + "epoch": 0.030454967502321264, + "grad_norm": 2.207670788597765, + "learning_rate": 5e-06, + "loss": 0.7175, + "num_tokens": 4690976.0, + "step": 328 + }, + { + "epoch": 0.03054781801299907, + "grad_norm": 1.919580857872544, + "learning_rate": 5e-06, + "loss": 0.549, + "num_tokens": 4706238.0, + "step": 329 + }, + { + "epoch": 0.03064066852367688, + "grad_norm": 1.9694838755096198, + "learning_rate": 5e-06, + "loss": 0.5681, + "num_tokens": 4721085.0, + "step": 330 + }, + { + "epoch": 0.03073351903435469, + "grad_norm": 2.0673282026531545, + "learning_rate": 5e-06, + "loss": 0.5807, + "num_tokens": 4735139.0, + "step": 331 + }, + { + "epoch": 0.030826369545032498, + "grad_norm": 2.108173118904099, + "learning_rate": 5e-06, + "loss": 0.5314, + "num_tokens": 4747700.0, + "step": 332 + }, + { + "epoch": 0.030919220055710305, + "grad_norm": 2.10278519083512, + "learning_rate": 5e-06, + "loss": 0.6681, + "num_tokens": 4762634.0, + "step": 333 + }, + { + "epoch": 0.031012070566388116, + "grad_norm": 2.153997194712624, + "learning_rate": 5e-06, + "loss": 0.6669, + "num_tokens": 4778021.0, + "step": 334 + }, + { + "epoch": 0.031104921077065924, + "grad_norm": 2.038870523397791, + "learning_rate": 5e-06, + "loss": 0.5526, + "num_tokens": 4791493.0, + "step": 335 + }, + { + "epoch": 0.03119777158774373, + "grad_norm": 2.13197062550287, + "learning_rate": 5e-06, + "loss": 0.6446, + "num_tokens": 4806665.0, + "step": 336 + }, + { + "epoch": 0.03129062209842154, + "grad_norm": 2.081413599254524, + "learning_rate": 5e-06, + "loss": 0.5964, + "num_tokens": 4821196.0, + "step": 337 + }, + { + "epoch": 0.03138347260909935, + "grad_norm": 2.0564866668587385, + "learning_rate": 5e-06, + "loss": 0.5239, + "num_tokens": 4835107.0, + "step": 338 + }, + { + "epoch": 0.03147632311977716, + "grad_norm": 2.1851749125593534, + "learning_rate": 5e-06, + "loss": 0.5845, + "num_tokens": 4848126.0, + "step": 339 + }, + { + "epoch": 0.031569173630454965, + "grad_norm": 2.0678180256033984, + "learning_rate": 5e-06, + "loss": 0.6018, + "num_tokens": 4862095.0, + "step": 340 + }, + { + "epoch": 0.031662024141132776, + "grad_norm": 2.096156703899933, + "learning_rate": 5e-06, + "loss": 0.681, + "num_tokens": 4876645.0, + "step": 341 + }, + { + "epoch": 0.03175487465181059, + "grad_norm": 2.254760367830965, + "learning_rate": 5e-06, + "loss": 0.7544, + "num_tokens": 4890860.0, + "step": 342 + }, + { + "epoch": 0.03184772516248839, + "grad_norm": 2.0545810600188634, + "learning_rate": 5e-06, + "loss": 0.5299, + "num_tokens": 4903918.0, + "step": 343 + }, + { + "epoch": 0.0319405756731662, + "grad_norm": 2.116662439871522, + "learning_rate": 5e-06, + "loss": 0.5842, + "num_tokens": 4919084.0, + "step": 344 + }, + { + "epoch": 0.03203342618384401, + "grad_norm": 1.9770304155971818, + "learning_rate": 5e-06, + "loss": 0.5926, + "num_tokens": 4934847.0, + "step": 345 + }, + { + "epoch": 0.03212627669452182, + "grad_norm": 2.156071422254825, + "learning_rate": 5e-06, + "loss": 0.6378, + "num_tokens": 4949637.0, + "step": 346 + }, + { + "epoch": 0.03221912720519963, + "grad_norm": 2.1097646084473674, + "learning_rate": 5e-06, + "loss": 0.6281, + "num_tokens": 4964100.0, + "step": 347 + }, + { + "epoch": 0.03231197771587744, + "grad_norm": 1.9795334125566446, + "learning_rate": 5e-06, + "loss": 0.6018, + "num_tokens": 4978802.0, + "step": 348 + }, + { + "epoch": 0.03240482822655524, + "grad_norm": 2.2005054764441794, + "learning_rate": 5e-06, + "loss": 0.6841, + "num_tokens": 4993413.0, + "step": 349 + }, + { + "epoch": 0.032497678737233054, + "grad_norm": 2.057418564952531, + "learning_rate": 5e-06, + "loss": 0.58, + "num_tokens": 5007941.0, + "step": 350 + }, + { + "epoch": 0.032590529247910865, + "grad_norm": 2.0807780876869177, + "learning_rate": 5e-06, + "loss": 0.6183, + "num_tokens": 5022864.0, + "step": 351 + }, + { + "epoch": 0.03268337975858867, + "grad_norm": 2.133884032863965, + "learning_rate": 5e-06, + "loss": 0.6788, + "num_tokens": 5037462.0, + "step": 352 + }, + { + "epoch": 0.03277623026926648, + "grad_norm": 2.0870820500855993, + "learning_rate": 5e-06, + "loss": 0.6108, + "num_tokens": 5051034.0, + "step": 353 + }, + { + "epoch": 0.03286908077994429, + "grad_norm": 2.062279643860354, + "learning_rate": 5e-06, + "loss": 0.646, + "num_tokens": 5067418.0, + "step": 354 + }, + { + "epoch": 0.032961931290622096, + "grad_norm": 2.1959914606029867, + "learning_rate": 5e-06, + "loss": 0.5637, + "num_tokens": 5080275.0, + "step": 355 + }, + { + "epoch": 0.03305478180129991, + "grad_norm": 2.2009961665574393, + "learning_rate": 5e-06, + "loss": 0.6817, + "num_tokens": 5095299.0, + "step": 356 + }, + { + "epoch": 0.03314763231197772, + "grad_norm": 2.212908581898663, + "learning_rate": 5e-06, + "loss": 0.7239, + "num_tokens": 5109388.0, + "step": 357 + }, + { + "epoch": 0.03324048282265552, + "grad_norm": 1.993635145305144, + "learning_rate": 5e-06, + "loss": 0.6365, + "num_tokens": 5125615.0, + "step": 358 + }, + { + "epoch": 0.03333333333333333, + "grad_norm": 2.2413768244952585, + "learning_rate": 5e-06, + "loss": 0.6779, + "num_tokens": 5138961.0, + "step": 359 + }, + { + "epoch": 0.033426183844011144, + "grad_norm": 2.2527070298576897, + "learning_rate": 5e-06, + "loss": 0.6969, + "num_tokens": 5151821.0, + "step": 360 + }, + { + "epoch": 0.03351903435468895, + "grad_norm": 2.0778226788295657, + "learning_rate": 5e-06, + "loss": 0.5746, + "num_tokens": 5167360.0, + "step": 361 + }, + { + "epoch": 0.03361188486536676, + "grad_norm": 2.148091620390952, + "learning_rate": 5e-06, + "loss": 0.6559, + "num_tokens": 5182568.0, + "step": 362 + }, + { + "epoch": 0.03370473537604457, + "grad_norm": 2.306828309524505, + "learning_rate": 5e-06, + "loss": 0.6713, + "num_tokens": 5195854.0, + "step": 363 + }, + { + "epoch": 0.033797585886722374, + "grad_norm": 2.142129630620089, + "learning_rate": 5e-06, + "loss": 0.5806, + "num_tokens": 5210175.0, + "step": 364 + }, + { + "epoch": 0.033890436397400185, + "grad_norm": 2.1813256835339505, + "learning_rate": 5e-06, + "loss": 0.6328, + "num_tokens": 5224785.0, + "step": 365 + }, + { + "epoch": 0.033983286908077996, + "grad_norm": 2.3194956688757595, + "learning_rate": 5e-06, + "loss": 0.7637, + "num_tokens": 5239374.0, + "step": 366 + }, + { + "epoch": 0.0340761374187558, + "grad_norm": 2.1819814914948195, + "learning_rate": 5e-06, + "loss": 0.6405, + "num_tokens": 5253331.0, + "step": 367 + }, + { + "epoch": 0.03416898792943361, + "grad_norm": 2.311456046786594, + "learning_rate": 5e-06, + "loss": 0.6846, + "num_tokens": 5266662.0, + "step": 368 + }, + { + "epoch": 0.03426183844011142, + "grad_norm": 2.139469791418927, + "learning_rate": 5e-06, + "loss": 0.598, + "num_tokens": 5281605.0, + "step": 369 + }, + { + "epoch": 0.034354688950789226, + "grad_norm": 2.180994015895476, + "learning_rate": 5e-06, + "loss": 0.7162, + "num_tokens": 5296130.0, + "step": 370 + }, + { + "epoch": 0.03444753946146704, + "grad_norm": 2.3530571561663782, + "learning_rate": 5e-06, + "loss": 0.5777, + "num_tokens": 5308570.0, + "step": 371 + }, + { + "epoch": 0.03454038997214485, + "grad_norm": 2.0801666500341396, + "learning_rate": 5e-06, + "loss": 0.6696, + "num_tokens": 5324759.0, + "step": 372 + }, + { + "epoch": 0.03463324048282265, + "grad_norm": 2.1207667761679745, + "learning_rate": 5e-06, + "loss": 0.6647, + "num_tokens": 5339212.0, + "step": 373 + }, + { + "epoch": 0.034726090993500464, + "grad_norm": 2.2093588560163693, + "learning_rate": 5e-06, + "loss": 0.5618, + "num_tokens": 5352660.0, + "step": 374 + }, + { + "epoch": 0.034818941504178275, + "grad_norm": 2.1631847748846593, + "learning_rate": 5e-06, + "loss": 0.6078, + "num_tokens": 5366872.0, + "step": 375 + }, + { + "epoch": 0.03491179201485608, + "grad_norm": 1.8895777793642046, + "learning_rate": 5e-06, + "loss": 0.4582, + "num_tokens": 5380708.0, + "step": 376 + }, + { + "epoch": 0.03500464252553389, + "grad_norm": 2.056685960092859, + "learning_rate": 5e-06, + "loss": 0.5876, + "num_tokens": 5395148.0, + "step": 377 + }, + { + "epoch": 0.0350974930362117, + "grad_norm": 2.00633687843349, + "learning_rate": 5e-06, + "loss": 0.5592, + "num_tokens": 5409804.0, + "step": 378 + }, + { + "epoch": 0.035190343546889505, + "grad_norm": 2.275242532605724, + "learning_rate": 5e-06, + "loss": 0.6378, + "num_tokens": 5424387.0, + "step": 379 + }, + { + "epoch": 0.035283194057567316, + "grad_norm": 2.0884213469168564, + "learning_rate": 5e-06, + "loss": 0.5117, + "num_tokens": 5438237.0, + "step": 380 + }, + { + "epoch": 0.03537604456824513, + "grad_norm": 2.107485008972274, + "learning_rate": 5e-06, + "loss": 0.6554, + "num_tokens": 5453594.0, + "step": 381 + }, + { + "epoch": 0.03546889507892293, + "grad_norm": 2.1333942743196133, + "learning_rate": 5e-06, + "loss": 0.6281, + "num_tokens": 5469060.0, + "step": 382 + }, + { + "epoch": 0.03556174558960074, + "grad_norm": 2.287082718663137, + "learning_rate": 5e-06, + "loss": 0.6877, + "num_tokens": 5482644.0, + "step": 383 + }, + { + "epoch": 0.03565459610027855, + "grad_norm": 2.021743552956354, + "learning_rate": 5e-06, + "loss": 0.5801, + "num_tokens": 5497546.0, + "step": 384 + }, + { + "epoch": 0.03574744661095636, + "grad_norm": 2.3403295170348324, + "learning_rate": 5e-06, + "loss": 0.6903, + "num_tokens": 5510843.0, + "step": 385 + }, + { + "epoch": 0.03584029712163417, + "grad_norm": 2.0108472971476687, + "learning_rate": 5e-06, + "loss": 0.5967, + "num_tokens": 5526194.0, + "step": 386 + }, + { + "epoch": 0.03593314763231198, + "grad_norm": 2.3709875989783087, + "learning_rate": 5e-06, + "loss": 0.6054, + "num_tokens": 5538336.0, + "step": 387 + }, + { + "epoch": 0.03602599814298978, + "grad_norm": 2.082183302777171, + "learning_rate": 5e-06, + "loss": 0.6806, + "num_tokens": 5554000.0, + "step": 388 + }, + { + "epoch": 0.036118848653667594, + "grad_norm": 2.1875909126816167, + "learning_rate": 5e-06, + "loss": 0.61, + "num_tokens": 5567399.0, + "step": 389 + }, + { + "epoch": 0.036211699164345405, + "grad_norm": 1.9656273745927486, + "learning_rate": 5e-06, + "loss": 0.6341, + "num_tokens": 5582543.0, + "step": 390 + }, + { + "epoch": 0.03630454967502321, + "grad_norm": 2.1030858921872047, + "learning_rate": 5e-06, + "loss": 0.6895, + "num_tokens": 5597199.0, + "step": 391 + }, + { + "epoch": 0.03639740018570102, + "grad_norm": 2.04705002889176, + "learning_rate": 5e-06, + "loss": 0.5963, + "num_tokens": 5612151.0, + "step": 392 + }, + { + "epoch": 0.03649025069637883, + "grad_norm": 2.1720710894008874, + "learning_rate": 5e-06, + "loss": 0.6415, + "num_tokens": 5625577.0, + "step": 393 + }, + { + "epoch": 0.036583101207056636, + "grad_norm": 2.087206086103312, + "learning_rate": 5e-06, + "loss": 0.5856, + "num_tokens": 5639334.0, + "step": 394 + }, + { + "epoch": 0.03667595171773445, + "grad_norm": 2.2802366707851838, + "learning_rate": 5e-06, + "loss": 0.6937, + "num_tokens": 5654137.0, + "step": 395 + }, + { + "epoch": 0.03676880222841226, + "grad_norm": 2.1971274334268873, + "learning_rate": 5e-06, + "loss": 0.5903, + "num_tokens": 5666551.0, + "step": 396 + }, + { + "epoch": 0.03686165273909006, + "grad_norm": 2.026335552021248, + "learning_rate": 5e-06, + "loss": 0.6402, + "num_tokens": 5682080.0, + "step": 397 + }, + { + "epoch": 0.03695450324976787, + "grad_norm": 2.1267928315675513, + "learning_rate": 5e-06, + "loss": 0.5895, + "num_tokens": 5694989.0, + "step": 398 + }, + { + "epoch": 0.037047353760445684, + "grad_norm": 1.9932278683471383, + "learning_rate": 5e-06, + "loss": 0.5937, + "num_tokens": 5709950.0, + "step": 399 + }, + { + "epoch": 0.03714020427112349, + "grad_norm": 2.133483811388729, + "learning_rate": 5e-06, + "loss": 0.6876, + "num_tokens": 5725323.0, + "step": 400 + }, + { + "epoch": 0.0372330547818013, + "grad_norm": 1.9984013671702965, + "learning_rate": 5e-06, + "loss": 0.5768, + "num_tokens": 5740993.0, + "step": 401 + }, + { + "epoch": 0.03732590529247911, + "grad_norm": 2.1379923236204714, + "learning_rate": 5e-06, + "loss": 0.5526, + "num_tokens": 5753365.0, + "step": 402 + }, + { + "epoch": 0.037418755803156914, + "grad_norm": 1.9833009904888408, + "learning_rate": 5e-06, + "loss": 0.5735, + "num_tokens": 5768649.0, + "step": 403 + }, + { + "epoch": 0.037511606313834725, + "grad_norm": 2.1052876212527565, + "learning_rate": 5e-06, + "loss": 0.663, + "num_tokens": 5783392.0, + "step": 404 + }, + { + "epoch": 0.037604456824512536, + "grad_norm": 2.093090828965505, + "learning_rate": 5e-06, + "loss": 0.6059, + "num_tokens": 5797775.0, + "step": 405 + }, + { + "epoch": 0.03769730733519034, + "grad_norm": 2.0051083086900907, + "learning_rate": 5e-06, + "loss": 0.5576, + "num_tokens": 5812409.0, + "step": 406 + }, + { + "epoch": 0.03779015784586815, + "grad_norm": 1.9677129105694728, + "learning_rate": 5e-06, + "loss": 0.5425, + "num_tokens": 5826396.0, + "step": 407 + }, + { + "epoch": 0.03788300835654596, + "grad_norm": 2.0672054551739434, + "learning_rate": 5e-06, + "loss": 0.5645, + "num_tokens": 5839584.0, + "step": 408 + }, + { + "epoch": 0.03797585886722377, + "grad_norm": 2.198784019350147, + "learning_rate": 5e-06, + "loss": 0.634, + "num_tokens": 5853474.0, + "step": 409 + }, + { + "epoch": 0.03806870937790158, + "grad_norm": 1.938281046959945, + "learning_rate": 5e-06, + "loss": 0.5692, + "num_tokens": 5869220.0, + "step": 410 + }, + { + "epoch": 0.03816155988857939, + "grad_norm": 2.0798352025090376, + "learning_rate": 5e-06, + "loss": 0.6416, + "num_tokens": 5883975.0, + "step": 411 + }, + { + "epoch": 0.0382544103992572, + "grad_norm": 2.0590050076128508, + "learning_rate": 5e-06, + "loss": 0.6133, + "num_tokens": 5899877.0, + "step": 412 + }, + { + "epoch": 0.038347260909935, + "grad_norm": 2.128844547774631, + "learning_rate": 5e-06, + "loss": 0.6286, + "num_tokens": 5913369.0, + "step": 413 + }, + { + "epoch": 0.038440111420612814, + "grad_norm": 2.0989850189121406, + "learning_rate": 5e-06, + "loss": 0.6591, + "num_tokens": 5928526.0, + "step": 414 + }, + { + "epoch": 0.038532961931290625, + "grad_norm": 2.065275636776591, + "learning_rate": 5e-06, + "loss": 0.6095, + "num_tokens": 5943300.0, + "step": 415 + }, + { + "epoch": 0.03862581244196843, + "grad_norm": 2.1875467929223915, + "learning_rate": 5e-06, + "loss": 0.6508, + "num_tokens": 5958090.0, + "step": 416 + }, + { + "epoch": 0.03871866295264624, + "grad_norm": 2.0501329795029197, + "learning_rate": 5e-06, + "loss": 0.6182, + "num_tokens": 5974436.0, + "step": 417 + }, + { + "epoch": 0.03881151346332405, + "grad_norm": 2.119756087929098, + "learning_rate": 5e-06, + "loss": 0.6547, + "num_tokens": 5989345.0, + "step": 418 + }, + { + "epoch": 0.038904363974001856, + "grad_norm": 2.17606267042137, + "learning_rate": 5e-06, + "loss": 0.6992, + "num_tokens": 6003682.0, + "step": 419 + }, + { + "epoch": 0.03899721448467967, + "grad_norm": 2.109155680817779, + "learning_rate": 5e-06, + "loss": 0.6864, + "num_tokens": 6019793.0, + "step": 420 + }, + { + "epoch": 0.03909006499535748, + "grad_norm": 2.195038446232686, + "learning_rate": 5e-06, + "loss": 0.6612, + "num_tokens": 6034988.0, + "step": 421 + }, + { + "epoch": 0.03918291550603528, + "grad_norm": 2.059119115844342, + "learning_rate": 5e-06, + "loss": 0.5297, + "num_tokens": 6048508.0, + "step": 422 + }, + { + "epoch": 0.03927576601671309, + "grad_norm": 2.0854023883224797, + "learning_rate": 5e-06, + "loss": 0.5826, + "num_tokens": 6062109.0, + "step": 423 + }, + { + "epoch": 0.039368616527390904, + "grad_norm": 2.1851882644381107, + "learning_rate": 5e-06, + "loss": 0.5989, + "num_tokens": 6075226.0, + "step": 424 + }, + { + "epoch": 0.03946146703806871, + "grad_norm": 1.9346791296851213, + "learning_rate": 5e-06, + "loss": 0.6004, + "num_tokens": 6091233.0, + "step": 425 + }, + { + "epoch": 0.03955431754874652, + "grad_norm": 2.408065446150393, + "learning_rate": 5e-06, + "loss": 0.6171, + "num_tokens": 6103510.0, + "step": 426 + }, + { + "epoch": 0.03964716805942433, + "grad_norm": 2.1352403913278724, + "learning_rate": 5e-06, + "loss": 0.6245, + "num_tokens": 6117929.0, + "step": 427 + }, + { + "epoch": 0.039740018570102134, + "grad_norm": 2.261508905977782, + "learning_rate": 5e-06, + "loss": 0.665, + "num_tokens": 6130763.0, + "step": 428 + }, + { + "epoch": 0.039832869080779945, + "grad_norm": 1.9215848313474475, + "learning_rate": 5e-06, + "loss": 0.6028, + "num_tokens": 6146906.0, + "step": 429 + }, + { + "epoch": 0.039925719591457756, + "grad_norm": 1.930989745423036, + "learning_rate": 5e-06, + "loss": 0.5171, + "num_tokens": 6161704.0, + "step": 430 + }, + { + "epoch": 0.04001857010213556, + "grad_norm": 2.112456304044234, + "learning_rate": 5e-06, + "loss": 0.6208, + "num_tokens": 6175027.0, + "step": 431 + }, + { + "epoch": 0.04011142061281337, + "grad_norm": 2.144231930636576, + "learning_rate": 5e-06, + "loss": 0.6004, + "num_tokens": 6188095.0, + "step": 432 + }, + { + "epoch": 0.04020427112349118, + "grad_norm": 2.249720241212084, + "learning_rate": 5e-06, + "loss": 0.6616, + "num_tokens": 6200880.0, + "step": 433 + }, + { + "epoch": 0.040297121634168986, + "grad_norm": 2.0522576446398606, + "learning_rate": 5e-06, + "loss": 0.6206, + "num_tokens": 6216295.0, + "step": 434 + }, + { + "epoch": 0.0403899721448468, + "grad_norm": 2.107640807261354, + "learning_rate": 5e-06, + "loss": 0.5291, + "num_tokens": 6230293.0, + "step": 435 + }, + { + "epoch": 0.04048282265552461, + "grad_norm": 2.2590911835573495, + "learning_rate": 5e-06, + "loss": 0.7468, + "num_tokens": 6244298.0, + "step": 436 + }, + { + "epoch": 0.04057567316620241, + "grad_norm": 2.057442984796695, + "learning_rate": 5e-06, + "loss": 0.6058, + "num_tokens": 6257518.0, + "step": 437 + }, + { + "epoch": 0.040668523676880224, + "grad_norm": 2.2530341191876744, + "learning_rate": 5e-06, + "loss": 0.6444, + "num_tokens": 6272234.0, + "step": 438 + }, + { + "epoch": 0.040761374187558035, + "grad_norm": 2.1167786345067654, + "learning_rate": 5e-06, + "loss": 0.5542, + "num_tokens": 6285901.0, + "step": 439 + }, + { + "epoch": 0.04085422469823584, + "grad_norm": 2.18412858644023, + "learning_rate": 5e-06, + "loss": 0.6391, + "num_tokens": 6300736.0, + "step": 440 + }, + { + "epoch": 0.04094707520891365, + "grad_norm": 1.989770512281639, + "learning_rate": 5e-06, + "loss": 0.5668, + "num_tokens": 6316488.0, + "step": 441 + }, + { + "epoch": 0.04103992571959146, + "grad_norm": 2.082579001697823, + "learning_rate": 5e-06, + "loss": 0.6292, + "num_tokens": 6331729.0, + "step": 442 + }, + { + "epoch": 0.041132776230269265, + "grad_norm": 1.9741625350036671, + "learning_rate": 5e-06, + "loss": 0.5116, + "num_tokens": 6346514.0, + "step": 443 + }, + { + "epoch": 0.041225626740947076, + "grad_norm": 2.012170575066542, + "learning_rate": 5e-06, + "loss": 0.5478, + "num_tokens": 6361743.0, + "step": 444 + }, + { + "epoch": 0.04131847725162489, + "grad_norm": 2.2929866075394743, + "learning_rate": 5e-06, + "loss": 0.6268, + "num_tokens": 6375030.0, + "step": 445 + }, + { + "epoch": 0.04141132776230269, + "grad_norm": 2.0509615991452725, + "learning_rate": 5e-06, + "loss": 0.5557, + "num_tokens": 6388739.0, + "step": 446 + }, + { + "epoch": 0.0415041782729805, + "grad_norm": 1.9690229893234334, + "learning_rate": 5e-06, + "loss": 0.5155, + "num_tokens": 6403653.0, + "step": 447 + }, + { + "epoch": 0.04159702878365831, + "grad_norm": 2.4254290240315877, + "learning_rate": 5e-06, + "loss": 0.889, + "num_tokens": 6417549.0, + "step": 448 + }, + { + "epoch": 0.04168987929433612, + "grad_norm": 2.137401029369851, + "learning_rate": 5e-06, + "loss": 0.668, + "num_tokens": 6431347.0, + "step": 449 + }, + { + "epoch": 0.04178272980501393, + "grad_norm": 2.1009119133073435, + "learning_rate": 5e-06, + "loss": 0.5307, + "num_tokens": 6445501.0, + "step": 450 + }, + { + "epoch": 0.04187558031569174, + "grad_norm": 1.9034661210556802, + "learning_rate": 5e-06, + "loss": 0.5411, + "num_tokens": 6461885.0, + "step": 451 + }, + { + "epoch": 0.04196843082636954, + "grad_norm": 1.9092160623287282, + "learning_rate": 5e-06, + "loss": 0.5595, + "num_tokens": 6477317.0, + "step": 452 + }, + { + "epoch": 0.042061281337047354, + "grad_norm": 2.0712450873039367, + "learning_rate": 5e-06, + "loss": 0.5999, + "num_tokens": 6493119.0, + "step": 453 + }, + { + "epoch": 0.042154131847725165, + "grad_norm": 2.166975978521293, + "learning_rate": 5e-06, + "loss": 0.7047, + "num_tokens": 6506921.0, + "step": 454 + }, + { + "epoch": 0.04224698235840297, + "grad_norm": 2.065537447814496, + "learning_rate": 5e-06, + "loss": 0.6414, + "num_tokens": 6522713.0, + "step": 455 + }, + { + "epoch": 0.04233983286908078, + "grad_norm": 2.1188952240852816, + "learning_rate": 5e-06, + "loss": 0.6735, + "num_tokens": 6537451.0, + "step": 456 + }, + { + "epoch": 0.04243268337975859, + "grad_norm": 2.0816131591284215, + "learning_rate": 5e-06, + "loss": 0.5096, + "num_tokens": 6550576.0, + "step": 457 + }, + { + "epoch": 0.042525533890436396, + "grad_norm": 2.104301429223252, + "learning_rate": 5e-06, + "loss": 0.6261, + "num_tokens": 6566083.0, + "step": 458 + }, + { + "epoch": 0.04261838440111421, + "grad_norm": 2.1340758388392156, + "learning_rate": 5e-06, + "loss": 0.6309, + "num_tokens": 6580788.0, + "step": 459 + }, + { + "epoch": 0.04271123491179202, + "grad_norm": 2.1959410483369823, + "learning_rate": 5e-06, + "loss": 0.6062, + "num_tokens": 6595934.0, + "step": 460 + }, + { + "epoch": 0.04280408542246982, + "grad_norm": 2.201805815858792, + "learning_rate": 5e-06, + "loss": 0.7693, + "num_tokens": 6611964.0, + "step": 461 + }, + { + "epoch": 0.04289693593314763, + "grad_norm": 2.0688844918181015, + "learning_rate": 5e-06, + "loss": 0.5792, + "num_tokens": 6626321.0, + "step": 462 + }, + { + "epoch": 0.042989786443825444, + "grad_norm": 2.03634021822181, + "learning_rate": 5e-06, + "loss": 0.5519, + "num_tokens": 6640119.0, + "step": 463 + }, + { + "epoch": 0.04308263695450325, + "grad_norm": 2.0814048170628805, + "learning_rate": 5e-06, + "loss": 0.6144, + "num_tokens": 6653338.0, + "step": 464 + }, + { + "epoch": 0.04317548746518106, + "grad_norm": 2.086149553148392, + "learning_rate": 5e-06, + "loss": 0.5984, + "num_tokens": 6667193.0, + "step": 465 + }, + { + "epoch": 0.04326833797585887, + "grad_norm": 2.231221552157291, + "learning_rate": 5e-06, + "loss": 0.5731, + "num_tokens": 6679934.0, + "step": 466 + }, + { + "epoch": 0.043361188486536674, + "grad_norm": 2.1256184061316983, + "learning_rate": 5e-06, + "loss": 0.6713, + "num_tokens": 6696033.0, + "step": 467 + }, + { + "epoch": 0.043454038997214485, + "grad_norm": 2.1271761553677555, + "learning_rate": 5e-06, + "loss": 0.6061, + "num_tokens": 6709835.0, + "step": 468 + }, + { + "epoch": 0.043546889507892296, + "grad_norm": 2.197602498932096, + "learning_rate": 5e-06, + "loss": 0.5984, + "num_tokens": 6724497.0, + "step": 469 + }, + { + "epoch": 0.0436397400185701, + "grad_norm": 1.9527634017949205, + "learning_rate": 5e-06, + "loss": 0.5125, + "num_tokens": 6739271.0, + "step": 470 + }, + { + "epoch": 0.04373259052924791, + "grad_norm": 2.121382623058448, + "learning_rate": 5e-06, + "loss": 0.6694, + "num_tokens": 6754702.0, + "step": 471 + }, + { + "epoch": 0.04382544103992572, + "grad_norm": 2.082600643163522, + "learning_rate": 5e-06, + "loss": 0.5713, + "num_tokens": 6769658.0, + "step": 472 + }, + { + "epoch": 0.043918291550603526, + "grad_norm": 2.2656330090424, + "learning_rate": 5e-06, + "loss": 0.5698, + "num_tokens": 6783057.0, + "step": 473 + }, + { + "epoch": 0.04401114206128134, + "grad_norm": 2.2283412260949076, + "learning_rate": 5e-06, + "loss": 0.5709, + "num_tokens": 6796239.0, + "step": 474 + }, + { + "epoch": 0.04410399257195915, + "grad_norm": 2.0934578500266845, + "learning_rate": 5e-06, + "loss": 0.5815, + "num_tokens": 6811639.0, + "step": 475 + }, + { + "epoch": 0.04419684308263695, + "grad_norm": 2.0940140558304488, + "learning_rate": 5e-06, + "loss": 0.6367, + "num_tokens": 6826178.0, + "step": 476 + }, + { + "epoch": 0.04428969359331476, + "grad_norm": 2.0547671465732966, + "learning_rate": 5e-06, + "loss": 0.5742, + "num_tokens": 6841614.0, + "step": 477 + }, + { + "epoch": 0.044382544103992574, + "grad_norm": 2.151966486285293, + "learning_rate": 5e-06, + "loss": 0.5538, + "num_tokens": 6855734.0, + "step": 478 + }, + { + "epoch": 0.04447539461467038, + "grad_norm": 2.3184745482658764, + "learning_rate": 5e-06, + "loss": 0.6307, + "num_tokens": 6867828.0, + "step": 479 + }, + { + "epoch": 0.04456824512534819, + "grad_norm": 2.134022854840195, + "learning_rate": 5e-06, + "loss": 0.6026, + "num_tokens": 6880907.0, + "step": 480 + }, + { + "epoch": 0.044661095636026, + "grad_norm": 2.1160019700659647, + "learning_rate": 5e-06, + "loss": 0.6769, + "num_tokens": 6896376.0, + "step": 481 + }, + { + "epoch": 0.044753946146703805, + "grad_norm": 2.0998347234934007, + "learning_rate": 5e-06, + "loss": 0.6301, + "num_tokens": 6910617.0, + "step": 482 + }, + { + "epoch": 0.044846796657381616, + "grad_norm": 2.1259956276918506, + "learning_rate": 5e-06, + "loss": 0.62, + "num_tokens": 6923957.0, + "step": 483 + }, + { + "epoch": 0.04493964716805943, + "grad_norm": 2.0395117166900145, + "learning_rate": 5e-06, + "loss": 0.5863, + "num_tokens": 6938336.0, + "step": 484 + }, + { + "epoch": 0.04503249767873723, + "grad_norm": 2.1863876593704314, + "learning_rate": 5e-06, + "loss": 0.6923, + "num_tokens": 6952694.0, + "step": 485 + }, + { + "epoch": 0.04512534818941504, + "grad_norm": 2.089787878727239, + "learning_rate": 5e-06, + "loss": 0.5838, + "num_tokens": 6967274.0, + "step": 486 + }, + { + "epoch": 0.04521819870009285, + "grad_norm": 2.193504866823072, + "learning_rate": 5e-06, + "loss": 0.7248, + "num_tokens": 6982888.0, + "step": 487 + }, + { + "epoch": 0.04531104921077066, + "grad_norm": 2.221909850880461, + "learning_rate": 5e-06, + "loss": 0.6721, + "num_tokens": 6997474.0, + "step": 488 + }, + { + "epoch": 0.04540389972144847, + "grad_norm": 2.165410913599911, + "learning_rate": 5e-06, + "loss": 0.699, + "num_tokens": 7012153.0, + "step": 489 + }, + { + "epoch": 0.04549675023212628, + "grad_norm": 2.0076603457262476, + "learning_rate": 5e-06, + "loss": 0.6046, + "num_tokens": 7026005.0, + "step": 490 + }, + { + "epoch": 0.04558960074280408, + "grad_norm": 2.026519402785355, + "learning_rate": 5e-06, + "loss": 0.6431, + "num_tokens": 7040635.0, + "step": 491 + }, + { + "epoch": 0.045682451253481894, + "grad_norm": 1.9984548740289056, + "learning_rate": 5e-06, + "loss": 0.5792, + "num_tokens": 7054984.0, + "step": 492 + }, + { + "epoch": 0.045775301764159705, + "grad_norm": 1.966544860683357, + "learning_rate": 5e-06, + "loss": 0.5324, + "num_tokens": 7069984.0, + "step": 493 + }, + { + "epoch": 0.04586815227483751, + "grad_norm": 2.023418489079073, + "learning_rate": 5e-06, + "loss": 0.6622, + "num_tokens": 7084681.0, + "step": 494 + }, + { + "epoch": 0.04596100278551532, + "grad_norm": 2.1351077886485768, + "learning_rate": 5e-06, + "loss": 0.6518, + "num_tokens": 7098332.0, + "step": 495 + }, + { + "epoch": 0.04605385329619313, + "grad_norm": 2.1705816133499605, + "learning_rate": 5e-06, + "loss": 0.7118, + "num_tokens": 7112503.0, + "step": 496 + }, + { + "epoch": 0.046146703806870935, + "grad_norm": 2.037778425109175, + "learning_rate": 5e-06, + "loss": 0.5728, + "num_tokens": 7127433.0, + "step": 497 + }, + { + "epoch": 0.046239554317548746, + "grad_norm": 2.0306990949334502, + "learning_rate": 5e-06, + "loss": 0.5055, + "num_tokens": 7141717.0, + "step": 498 + }, + { + "epoch": 0.04633240482822656, + "grad_norm": 2.136903430064288, + "learning_rate": 5e-06, + "loss": 0.6964, + "num_tokens": 7156973.0, + "step": 499 + }, + { + "epoch": 0.04642525533890436, + "grad_norm": 2.289817943559723, + "learning_rate": 5e-06, + "loss": 0.6207, + "num_tokens": 7169654.0, + "step": 500 + }, + { + "epoch": 0.04651810584958217, + "grad_norm": 2.105199597016089, + "learning_rate": 5e-06, + "loss": 0.6372, + "num_tokens": 7185658.0, + "step": 501 + }, + { + "epoch": 0.046610956360259984, + "grad_norm": 2.111959314384318, + "learning_rate": 5e-06, + "loss": 0.6939, + "num_tokens": 7201231.0, + "step": 502 + }, + { + "epoch": 0.04670380687093779, + "grad_norm": 2.4336625326570687, + "learning_rate": 5e-06, + "loss": 0.7239, + "num_tokens": 7213949.0, + "step": 503 + }, + { + "epoch": 0.0467966573816156, + "grad_norm": 1.9876256370684677, + "learning_rate": 5e-06, + "loss": 0.6094, + "num_tokens": 7228605.0, + "step": 504 + }, + { + "epoch": 0.04688950789229341, + "grad_norm": 2.112748384843679, + "learning_rate": 5e-06, + "loss": 0.6086, + "num_tokens": 7242705.0, + "step": 505 + }, + { + "epoch": 0.046982358402971214, + "grad_norm": 2.0118346405508523, + "learning_rate": 5e-06, + "loss": 0.6891, + "num_tokens": 7258607.0, + "step": 506 + }, + { + "epoch": 0.047075208913649025, + "grad_norm": 2.071825951238989, + "learning_rate": 5e-06, + "loss": 0.6112, + "num_tokens": 7272704.0, + "step": 507 + }, + { + "epoch": 0.047168059424326836, + "grad_norm": 2.490275359989562, + "learning_rate": 5e-06, + "loss": 0.6325, + "num_tokens": 7283275.0, + "step": 508 + }, + { + "epoch": 0.04726090993500464, + "grad_norm": 2.0340950015053707, + "learning_rate": 5e-06, + "loss": 0.5642, + "num_tokens": 7297612.0, + "step": 509 + }, + { + "epoch": 0.04735376044568245, + "grad_norm": 2.220971567127137, + "learning_rate": 5e-06, + "loss": 0.6783, + "num_tokens": 7311816.0, + "step": 510 + }, + { + "epoch": 0.04744661095636026, + "grad_norm": 2.144053869714226, + "learning_rate": 5e-06, + "loss": 0.5934, + "num_tokens": 7325936.0, + "step": 511 + }, + { + "epoch": 0.047539461467038066, + "grad_norm": 2.0634558219153885, + "learning_rate": 5e-06, + "loss": 0.6421, + "num_tokens": 7341700.0, + "step": 512 + }, + { + "epoch": 0.04763231197771588, + "grad_norm": 2.0707423494892403, + "learning_rate": 5e-06, + "loss": 0.6174, + "num_tokens": 7355288.0, + "step": 513 + }, + { + "epoch": 0.04772516248839369, + "grad_norm": 2.2268762065344005, + "learning_rate": 5e-06, + "loss": 0.6532, + "num_tokens": 7367941.0, + "step": 514 + }, + { + "epoch": 0.04781801299907149, + "grad_norm": 2.251983604901386, + "learning_rate": 5e-06, + "loss": 0.6176, + "num_tokens": 7380962.0, + "step": 515 + }, + { + "epoch": 0.0479108635097493, + "grad_norm": 2.082282399863214, + "learning_rate": 5e-06, + "loss": 0.6732, + "num_tokens": 7396680.0, + "step": 516 + }, + { + "epoch": 0.048003714020427114, + "grad_norm": 2.2717666385877577, + "learning_rate": 5e-06, + "loss": 0.7087, + "num_tokens": 7409026.0, + "step": 517 + }, + { + "epoch": 0.04809656453110492, + "grad_norm": 2.1602576108672675, + "learning_rate": 5e-06, + "loss": 0.7464, + "num_tokens": 7424905.0, + "step": 518 + }, + { + "epoch": 0.04818941504178273, + "grad_norm": 2.1017026102767864, + "learning_rate": 5e-06, + "loss": 0.5157, + "num_tokens": 7437001.0, + "step": 519 + }, + { + "epoch": 0.04828226555246054, + "grad_norm": 2.0856123799891497, + "learning_rate": 5e-06, + "loss": 0.6461, + "num_tokens": 7451495.0, + "step": 520 + }, + { + "epoch": 0.048375116063138345, + "grad_norm": 2.2944199464866353, + "learning_rate": 5e-06, + "loss": 0.6363, + "num_tokens": 7464522.0, + "step": 521 + }, + { + "epoch": 0.048467966573816156, + "grad_norm": 2.0614759557031888, + "learning_rate": 5e-06, + "loss": 0.538, + "num_tokens": 7478740.0, + "step": 522 + }, + { + "epoch": 0.04856081708449397, + "grad_norm": 1.983663319988362, + "learning_rate": 5e-06, + "loss": 0.5869, + "num_tokens": 7494460.0, + "step": 523 + }, + { + "epoch": 0.04865366759517177, + "grad_norm": 1.993426124445217, + "learning_rate": 5e-06, + "loss": 0.5977, + "num_tokens": 7510231.0, + "step": 524 + }, + { + "epoch": 0.04874651810584958, + "grad_norm": 2.072369866430154, + "learning_rate": 5e-06, + "loss": 0.6186, + "num_tokens": 7524142.0, + "step": 525 + }, + { + "epoch": 0.04883936861652739, + "grad_norm": 2.0819843358754513, + "learning_rate": 5e-06, + "loss": 0.5582, + "num_tokens": 7538491.0, + "step": 526 + }, + { + "epoch": 0.0489322191272052, + "grad_norm": 2.1960512753204053, + "learning_rate": 5e-06, + "loss": 0.6051, + "num_tokens": 7550914.0, + "step": 527 + }, + { + "epoch": 0.04902506963788301, + "grad_norm": 1.9966637587216531, + "learning_rate": 5e-06, + "loss": 0.5497, + "num_tokens": 7566579.0, + "step": 528 + }, + { + "epoch": 0.04911792014856082, + "grad_norm": 2.0411540059703106, + "learning_rate": 5e-06, + "loss": 0.5677, + "num_tokens": 7579455.0, + "step": 529 + }, + { + "epoch": 0.04921077065923862, + "grad_norm": 2.0783306492295486, + "learning_rate": 5e-06, + "loss": 0.6104, + "num_tokens": 7593787.0, + "step": 530 + }, + { + "epoch": 0.049303621169916434, + "grad_norm": 2.0699815148347804, + "learning_rate": 5e-06, + "loss": 0.6212, + "num_tokens": 7608875.0, + "step": 531 + }, + { + "epoch": 0.049396471680594245, + "grad_norm": 2.0323528189567166, + "learning_rate": 5e-06, + "loss": 0.5339, + "num_tokens": 7623537.0, + "step": 532 + }, + { + "epoch": 0.04948932219127205, + "grad_norm": 2.2871868949688916, + "learning_rate": 5e-06, + "loss": 0.6424, + "num_tokens": 7636052.0, + "step": 533 + }, + { + "epoch": 0.04958217270194986, + "grad_norm": 2.01635851343428, + "learning_rate": 5e-06, + "loss": 0.6568, + "num_tokens": 7651968.0, + "step": 534 + }, + { + "epoch": 0.04967502321262767, + "grad_norm": 2.0920720642859663, + "learning_rate": 5e-06, + "loss": 0.5991, + "num_tokens": 7665537.0, + "step": 535 + }, + { + "epoch": 0.049767873723305475, + "grad_norm": 1.9518342973350309, + "learning_rate": 5e-06, + "loss": 0.5845, + "num_tokens": 7681904.0, + "step": 536 + }, + { + "epoch": 0.049860724233983286, + "grad_norm": 2.1584906419042547, + "learning_rate": 5e-06, + "loss": 0.7102, + "num_tokens": 7695827.0, + "step": 537 + }, + { + "epoch": 0.0499535747446611, + "grad_norm": 2.4994739698783848, + "learning_rate": 5e-06, + "loss": 0.8489, + "num_tokens": 7707650.0, + "step": 538 + }, + { + "epoch": 0.0500464252553389, + "grad_norm": 2.07056148979299, + "learning_rate": 5e-06, + "loss": 0.6062, + "num_tokens": 7721476.0, + "step": 539 + }, + { + "epoch": 0.05013927576601671, + "grad_norm": 2.1796350928016985, + "learning_rate": 5e-06, + "loss": 0.7498, + "num_tokens": 7736547.0, + "step": 540 + }, + { + "epoch": 0.05023212627669452, + "grad_norm": 2.1619605946396745, + "learning_rate": 5e-06, + "loss": 0.7034, + "num_tokens": 7750718.0, + "step": 541 + }, + { + "epoch": 0.05032497678737233, + "grad_norm": 2.2812414536451677, + "learning_rate": 5e-06, + "loss": 0.6172, + "num_tokens": 7763935.0, + "step": 542 + }, + { + "epoch": 0.05041782729805014, + "grad_norm": 2.151917885739651, + "learning_rate": 5e-06, + "loss": 0.7511, + "num_tokens": 7779854.0, + "step": 543 + }, + { + "epoch": 0.05051067780872795, + "grad_norm": 2.171231893928171, + "learning_rate": 5e-06, + "loss": 0.6183, + "num_tokens": 7794282.0, + "step": 544 + }, + { + "epoch": 0.050603528319405754, + "grad_norm": 2.083376164685335, + "learning_rate": 5e-06, + "loss": 0.606, + "num_tokens": 7809233.0, + "step": 545 + }, + { + "epoch": 0.050696378830083565, + "grad_norm": 1.9660432564848285, + "learning_rate": 5e-06, + "loss": 0.5434, + "num_tokens": 7823603.0, + "step": 546 + }, + { + "epoch": 0.050789229340761376, + "grad_norm": 2.1904518875212675, + "learning_rate": 5e-06, + "loss": 0.5833, + "num_tokens": 7836933.0, + "step": 547 + }, + { + "epoch": 0.05088207985143918, + "grad_norm": 2.127516540783001, + "learning_rate": 5e-06, + "loss": 0.6044, + "num_tokens": 7850759.0, + "step": 548 + }, + { + "epoch": 0.05097493036211699, + "grad_norm": 2.234454293952321, + "learning_rate": 5e-06, + "loss": 0.604, + "num_tokens": 7864086.0, + "step": 549 + }, + { + "epoch": 0.0510677808727948, + "grad_norm": 2.168597173243071, + "learning_rate": 5e-06, + "loss": 0.5758, + "num_tokens": 7877391.0, + "step": 550 + }, + { + "epoch": 0.051160631383472606, + "grad_norm": 2.225845274995538, + "learning_rate": 5e-06, + "loss": 0.6038, + "num_tokens": 7891193.0, + "step": 551 + }, + { + "epoch": 0.05125348189415042, + "grad_norm": 2.213969262005597, + "learning_rate": 5e-06, + "loss": 0.6906, + "num_tokens": 7905517.0, + "step": 552 + }, + { + "epoch": 0.05134633240482823, + "grad_norm": 2.123193244150427, + "learning_rate": 5e-06, + "loss": 0.5943, + "num_tokens": 7918288.0, + "step": 553 + }, + { + "epoch": 0.05143918291550603, + "grad_norm": 2.0366531426065366, + "learning_rate": 5e-06, + "loss": 0.6124, + "num_tokens": 7933837.0, + "step": 554 + }, + { + "epoch": 0.05153203342618384, + "grad_norm": 2.1378536001994743, + "learning_rate": 5e-06, + "loss": 0.5752, + "num_tokens": 7947056.0, + "step": 555 + }, + { + "epoch": 0.051624883936861654, + "grad_norm": 2.138231054111841, + "learning_rate": 5e-06, + "loss": 0.6558, + "num_tokens": 7962335.0, + "step": 556 + }, + { + "epoch": 0.05171773444753946, + "grad_norm": 2.0635660341222732, + "learning_rate": 5e-06, + "loss": 0.5889, + "num_tokens": 7975728.0, + "step": 557 + }, + { + "epoch": 0.05181058495821727, + "grad_norm": 2.222478349925832, + "learning_rate": 5e-06, + "loss": 0.7452, + "num_tokens": 7990692.0, + "step": 558 + }, + { + "epoch": 0.05190343546889508, + "grad_norm": 2.3912146006167374, + "learning_rate": 5e-06, + "loss": 0.5657, + "num_tokens": 8001845.0, + "step": 559 + }, + { + "epoch": 0.051996285979572884, + "grad_norm": 2.136921845112435, + "learning_rate": 5e-06, + "loss": 0.5456, + "num_tokens": 8016995.0, + "step": 560 + }, + { + "epoch": 0.052089136490250695, + "grad_norm": 2.1775030341756056, + "learning_rate": 5e-06, + "loss": 0.584, + "num_tokens": 8030661.0, + "step": 561 + }, + { + "epoch": 0.052181987000928506, + "grad_norm": 2.1382448623731047, + "learning_rate": 5e-06, + "loss": 0.5873, + "num_tokens": 8042982.0, + "step": 562 + }, + { + "epoch": 0.05227483751160631, + "grad_norm": 2.1188759636887537, + "learning_rate": 5e-06, + "loss": 0.6209, + "num_tokens": 8057108.0, + "step": 563 + }, + { + "epoch": 0.05236768802228412, + "grad_norm": 1.9886358996609388, + "learning_rate": 5e-06, + "loss": 0.5758, + "num_tokens": 8071147.0, + "step": 564 + }, + { + "epoch": 0.05246053853296193, + "grad_norm": 2.0313681681692763, + "learning_rate": 5e-06, + "loss": 0.4854, + "num_tokens": 8084675.0, + "step": 565 + }, + { + "epoch": 0.05255338904363974, + "grad_norm": 2.091496725650021, + "learning_rate": 5e-06, + "loss": 0.6292, + "num_tokens": 8099983.0, + "step": 566 + }, + { + "epoch": 0.05264623955431755, + "grad_norm": 1.9423200576938227, + "learning_rate": 5e-06, + "loss": 0.495, + "num_tokens": 8114702.0, + "step": 567 + }, + { + "epoch": 0.05273909006499536, + "grad_norm": 2.1032120454518664, + "learning_rate": 5e-06, + "loss": 0.5897, + "num_tokens": 8130339.0, + "step": 568 + }, + { + "epoch": 0.05283194057567316, + "grad_norm": 2.049956517011654, + "learning_rate": 5e-06, + "loss": 0.6462, + "num_tokens": 8144950.0, + "step": 569 + }, + { + "epoch": 0.052924791086350974, + "grad_norm": 2.044100507339956, + "learning_rate": 5e-06, + "loss": 0.6113, + "num_tokens": 8159517.0, + "step": 570 + }, + { + "epoch": 0.053017641597028785, + "grad_norm": 2.155017289764851, + "learning_rate": 5e-06, + "loss": 0.6622, + "num_tokens": 8173638.0, + "step": 571 + }, + { + "epoch": 0.053110492107706596, + "grad_norm": 2.1616646216607536, + "learning_rate": 5e-06, + "loss": 0.6379, + "num_tokens": 8188248.0, + "step": 572 + }, + { + "epoch": 0.0532033426183844, + "grad_norm": 2.0986653266708326, + "learning_rate": 5e-06, + "loss": 0.4703, + "num_tokens": 8201018.0, + "step": 573 + }, + { + "epoch": 0.05329619312906221, + "grad_norm": 2.0447184976328834, + "learning_rate": 5e-06, + "loss": 0.5772, + "num_tokens": 8215405.0, + "step": 574 + }, + { + "epoch": 0.05338904363974002, + "grad_norm": 2.033960924250815, + "learning_rate": 5e-06, + "loss": 0.6417, + "num_tokens": 8230762.0, + "step": 575 + }, + { + "epoch": 0.053481894150417826, + "grad_norm": 2.2551475746623337, + "learning_rate": 5e-06, + "loss": 0.602, + "num_tokens": 8244217.0, + "step": 576 + }, + { + "epoch": 0.05357474466109564, + "grad_norm": 2.032969457490434, + "learning_rate": 5e-06, + "loss": 0.6429, + "num_tokens": 8260549.0, + "step": 577 + }, + { + "epoch": 0.05366759517177345, + "grad_norm": 1.9826592579537134, + "learning_rate": 5e-06, + "loss": 0.5719, + "num_tokens": 8274694.0, + "step": 578 + }, + { + "epoch": 0.05376044568245125, + "grad_norm": 2.174124704542307, + "learning_rate": 5e-06, + "loss": 0.6329, + "num_tokens": 8288400.0, + "step": 579 + }, + { + "epoch": 0.05385329619312906, + "grad_norm": 2.0590200726979693, + "learning_rate": 5e-06, + "loss": 0.6901, + "num_tokens": 8303655.0, + "step": 580 + }, + { + "epoch": 0.053946146703806874, + "grad_norm": 1.997892519770767, + "learning_rate": 5e-06, + "loss": 0.6323, + "num_tokens": 8319383.0, + "step": 581 + }, + { + "epoch": 0.05403899721448468, + "grad_norm": 2.0435048600857133, + "learning_rate": 5e-06, + "loss": 0.6477, + "num_tokens": 8334543.0, + "step": 582 + }, + { + "epoch": 0.05413184772516249, + "grad_norm": 2.1437891824159925, + "learning_rate": 5e-06, + "loss": 0.6712, + "num_tokens": 8349927.0, + "step": 583 + }, + { + "epoch": 0.0542246982358403, + "grad_norm": 2.2376460998523537, + "learning_rate": 5e-06, + "loss": 0.5963, + "num_tokens": 8363673.0, + "step": 584 + }, + { + "epoch": 0.054317548746518104, + "grad_norm": 2.2558714084154405, + "learning_rate": 5e-06, + "loss": 0.6731, + "num_tokens": 8378688.0, + "step": 585 + }, + { + "epoch": 0.054410399257195916, + "grad_norm": 2.135581834505454, + "learning_rate": 5e-06, + "loss": 0.5796, + "num_tokens": 8392207.0, + "step": 586 + }, + { + "epoch": 0.05450324976787373, + "grad_norm": 2.1063969793578043, + "learning_rate": 5e-06, + "loss": 0.5641, + "num_tokens": 8406496.0, + "step": 587 + }, + { + "epoch": 0.05459610027855153, + "grad_norm": 2.156010523351603, + "learning_rate": 5e-06, + "loss": 0.5863, + "num_tokens": 8419496.0, + "step": 588 + }, + { + "epoch": 0.05468895078922934, + "grad_norm": 2.123309658478724, + "learning_rate": 5e-06, + "loss": 0.6041, + "num_tokens": 8433947.0, + "step": 589 + }, + { + "epoch": 0.05478180129990715, + "grad_norm": 2.277387137724898, + "learning_rate": 5e-06, + "loss": 0.5808, + "num_tokens": 8446659.0, + "step": 590 + }, + { + "epoch": 0.05487465181058496, + "grad_norm": 2.040903963914998, + "learning_rate": 5e-06, + "loss": 0.5886, + "num_tokens": 8461214.0, + "step": 591 + }, + { + "epoch": 0.05496750232126277, + "grad_norm": 2.3612542742247467, + "learning_rate": 5e-06, + "loss": 0.7525, + "num_tokens": 8477133.0, + "step": 592 + }, + { + "epoch": 0.05506035283194058, + "grad_norm": 2.2318219873163394, + "learning_rate": 5e-06, + "loss": 0.5305, + "num_tokens": 8489969.0, + "step": 593 + }, + { + "epoch": 0.05515320334261838, + "grad_norm": 2.072015478695998, + "learning_rate": 5e-06, + "loss": 0.588, + "num_tokens": 8503643.0, + "step": 594 + }, + { + "epoch": 0.055246053853296194, + "grad_norm": 1.94087960578999, + "learning_rate": 5e-06, + "loss": 0.6007, + "num_tokens": 8519019.0, + "step": 595 + }, + { + "epoch": 0.055338904363974005, + "grad_norm": 1.954368701789745, + "learning_rate": 5e-06, + "loss": 0.5985, + "num_tokens": 8535285.0, + "step": 596 + }, + { + "epoch": 0.05543175487465181, + "grad_norm": 2.2606158851080966, + "learning_rate": 5e-06, + "loss": 0.6354, + "num_tokens": 8548187.0, + "step": 597 + }, + { + "epoch": 0.05552460538532962, + "grad_norm": 2.232772406531499, + "learning_rate": 5e-06, + "loss": 0.5849, + "num_tokens": 8560680.0, + "step": 598 + }, + { + "epoch": 0.05561745589600743, + "grad_norm": 2.0786673890851572, + "learning_rate": 5e-06, + "loss": 0.6383, + "num_tokens": 8575701.0, + "step": 599 + }, + { + "epoch": 0.055710306406685235, + "grad_norm": 2.162439474704742, + "learning_rate": 5e-06, + "loss": 0.6592, + "num_tokens": 8590726.0, + "step": 600 + }, + { + "epoch": 0.055803156917363046, + "grad_norm": 2.0210030800452894, + "learning_rate": 5e-06, + "loss": 0.6526, + "num_tokens": 8606021.0, + "step": 601 + }, + { + "epoch": 0.05589600742804086, + "grad_norm": 2.0477835177271433, + "learning_rate": 5e-06, + "loss": 0.6289, + "num_tokens": 8621281.0, + "step": 602 + }, + { + "epoch": 0.05598885793871866, + "grad_norm": 2.0257342214229186, + "learning_rate": 5e-06, + "loss": 0.6379, + "num_tokens": 8636559.0, + "step": 603 + }, + { + "epoch": 0.05608170844939647, + "grad_norm": 2.0720263600345183, + "learning_rate": 5e-06, + "loss": 0.6381, + "num_tokens": 8650696.0, + "step": 604 + }, + { + "epoch": 0.05617455896007428, + "grad_norm": 2.020146971097191, + "learning_rate": 5e-06, + "loss": 0.5802, + "num_tokens": 8664828.0, + "step": 605 + }, + { + "epoch": 0.05626740947075209, + "grad_norm": 1.89083552523103, + "learning_rate": 5e-06, + "loss": 0.5343, + "num_tokens": 8679260.0, + "step": 606 + }, + { + "epoch": 0.0563602599814299, + "grad_norm": 2.3619481083441487, + "learning_rate": 5e-06, + "loss": 0.6114, + "num_tokens": 8691840.0, + "step": 607 + }, + { + "epoch": 0.05645311049210771, + "grad_norm": 2.0778175515116977, + "learning_rate": 5e-06, + "loss": 0.5723, + "num_tokens": 8705738.0, + "step": 608 + }, + { + "epoch": 0.056545961002785514, + "grad_norm": 2.154234749918748, + "learning_rate": 5e-06, + "loss": 0.6399, + "num_tokens": 8720308.0, + "step": 609 + }, + { + "epoch": 0.056638811513463325, + "grad_norm": 2.2125052482728362, + "learning_rate": 5e-06, + "loss": 0.5075, + "num_tokens": 8732585.0, + "step": 610 + }, + { + "epoch": 0.056731662024141136, + "grad_norm": 2.133936971171757, + "learning_rate": 5e-06, + "loss": 0.6738, + "num_tokens": 8747806.0, + "step": 611 + }, + { + "epoch": 0.05682451253481894, + "grad_norm": 2.1823236139126334, + "learning_rate": 5e-06, + "loss": 0.6143, + "num_tokens": 8761155.0, + "step": 612 + }, + { + "epoch": 0.05691736304549675, + "grad_norm": 2.1653413990649835, + "learning_rate": 5e-06, + "loss": 0.7286, + "num_tokens": 8775957.0, + "step": 613 + }, + { + "epoch": 0.05701021355617456, + "grad_norm": 1.9979937756158865, + "learning_rate": 5e-06, + "loss": 0.5707, + "num_tokens": 8790059.0, + "step": 614 + }, + { + "epoch": 0.057103064066852366, + "grad_norm": 2.094474439271198, + "learning_rate": 5e-06, + "loss": 0.6314, + "num_tokens": 8804999.0, + "step": 615 + }, + { + "epoch": 0.05719591457753018, + "grad_norm": 1.9762430327315135, + "learning_rate": 5e-06, + "loss": 0.584, + "num_tokens": 8820003.0, + "step": 616 + }, + { + "epoch": 0.05728876508820799, + "grad_norm": 2.092738554103747, + "learning_rate": 5e-06, + "loss": 0.6063, + "num_tokens": 8834649.0, + "step": 617 + }, + { + "epoch": 0.05738161559888579, + "grad_norm": 2.1657426617235633, + "learning_rate": 5e-06, + "loss": 0.6244, + "num_tokens": 8848522.0, + "step": 618 + }, + { + "epoch": 0.0574744661095636, + "grad_norm": 2.2508407487259747, + "learning_rate": 5e-06, + "loss": 0.6251, + "num_tokens": 8862035.0, + "step": 619 + }, + { + "epoch": 0.057567316620241414, + "grad_norm": 2.1507354730940524, + "learning_rate": 5e-06, + "loss": 0.6318, + "num_tokens": 8876586.0, + "step": 620 + }, + { + "epoch": 0.05766016713091922, + "grad_norm": 2.2490347261039654, + "learning_rate": 5e-06, + "loss": 0.6091, + "num_tokens": 8889461.0, + "step": 621 + }, + { + "epoch": 0.05775301764159703, + "grad_norm": 2.2242388225092515, + "learning_rate": 5e-06, + "loss": 0.563, + "num_tokens": 8902265.0, + "step": 622 + }, + { + "epoch": 0.05784586815227484, + "grad_norm": 2.00942531733651, + "learning_rate": 5e-06, + "loss": 0.4743, + "num_tokens": 8915669.0, + "step": 623 + }, + { + "epoch": 0.057938718662952644, + "grad_norm": 2.0005584956647557, + "learning_rate": 5e-06, + "loss": 0.6457, + "num_tokens": 8931380.0, + "step": 624 + }, + { + "epoch": 0.058031569173630455, + "grad_norm": 1.9926101807701735, + "learning_rate": 5e-06, + "loss": 0.5831, + "num_tokens": 8946718.0, + "step": 625 + }, + { + "epoch": 0.058124419684308266, + "grad_norm": 1.971366373388296, + "learning_rate": 5e-06, + "loss": 0.5516, + "num_tokens": 8961754.0, + "step": 626 + }, + { + "epoch": 0.05821727019498607, + "grad_norm": 2.186123127744096, + "learning_rate": 5e-06, + "loss": 0.6699, + "num_tokens": 8977751.0, + "step": 627 + }, + { + "epoch": 0.05831012070566388, + "grad_norm": 2.0692717577879725, + "learning_rate": 5e-06, + "loss": 0.5852, + "num_tokens": 8991633.0, + "step": 628 + }, + { + "epoch": 0.05840297121634169, + "grad_norm": 2.1132177040504865, + "learning_rate": 5e-06, + "loss": 0.5842, + "num_tokens": 9005329.0, + "step": 629 + }, + { + "epoch": 0.0584958217270195, + "grad_norm": 2.205984085045793, + "learning_rate": 5e-06, + "loss": 0.7334, + "num_tokens": 9019840.0, + "step": 630 + }, + { + "epoch": 0.05858867223769731, + "grad_norm": 2.1102214858196517, + "learning_rate": 5e-06, + "loss": 0.5568, + "num_tokens": 9033928.0, + "step": 631 + }, + { + "epoch": 0.05868152274837512, + "grad_norm": 2.2361847995474, + "learning_rate": 5e-06, + "loss": 0.6923, + "num_tokens": 9047746.0, + "step": 632 + }, + { + "epoch": 0.05877437325905292, + "grad_norm": 2.097518437738315, + "learning_rate": 5e-06, + "loss": 0.6205, + "num_tokens": 9062308.0, + "step": 633 + }, + { + "epoch": 0.058867223769730734, + "grad_norm": 1.952572810400633, + "learning_rate": 5e-06, + "loss": 0.6296, + "num_tokens": 9078692.0, + "step": 634 + }, + { + "epoch": 0.058960074280408545, + "grad_norm": 2.166625105973104, + "learning_rate": 5e-06, + "loss": 0.5386, + "num_tokens": 9091707.0, + "step": 635 + }, + { + "epoch": 0.05905292479108635, + "grad_norm": 2.092753142741248, + "learning_rate": 5e-06, + "loss": 0.6092, + "num_tokens": 9106092.0, + "step": 636 + }, + { + "epoch": 0.05914577530176416, + "grad_norm": 2.233574547775065, + "learning_rate": 5e-06, + "loss": 0.6747, + "num_tokens": 9120676.0, + "step": 637 + }, + { + "epoch": 0.05923862581244197, + "grad_norm": 2.063446553756211, + "learning_rate": 5e-06, + "loss": 0.6632, + "num_tokens": 9134636.0, + "step": 638 + }, + { + "epoch": 0.059331476323119775, + "grad_norm": 2.2406787931682555, + "learning_rate": 5e-06, + "loss": 0.5724, + "num_tokens": 9147064.0, + "step": 639 + }, + { + "epoch": 0.059424326833797586, + "grad_norm": 2.1559768719879546, + "learning_rate": 5e-06, + "loss": 0.5959, + "num_tokens": 9162689.0, + "step": 640 + }, + { + "epoch": 0.0595171773444754, + "grad_norm": 2.0743667820666913, + "learning_rate": 5e-06, + "loss": 0.6548, + "num_tokens": 9177702.0, + "step": 641 + }, + { + "epoch": 0.0596100278551532, + "grad_norm": 2.0200615304276215, + "learning_rate": 5e-06, + "loss": 0.5906, + "num_tokens": 9192075.0, + "step": 642 + }, + { + "epoch": 0.05970287836583101, + "grad_norm": 1.9658925908052716, + "learning_rate": 5e-06, + "loss": 0.6388, + "num_tokens": 9207469.0, + "step": 643 + }, + { + "epoch": 0.05979572887650882, + "grad_norm": 2.129445466070689, + "learning_rate": 5e-06, + "loss": 0.5358, + "num_tokens": 9221635.0, + "step": 644 + }, + { + "epoch": 0.05988857938718663, + "grad_norm": 2.0934978162973295, + "learning_rate": 5e-06, + "loss": 0.587, + "num_tokens": 9237076.0, + "step": 645 + }, + { + "epoch": 0.05998142989786444, + "grad_norm": 2.111236294851904, + "learning_rate": 5e-06, + "loss": 0.6112, + "num_tokens": 9253263.0, + "step": 646 + }, + { + "epoch": 0.06007428040854225, + "grad_norm": 2.177509701520123, + "learning_rate": 5e-06, + "loss": 0.5842, + "num_tokens": 9266389.0, + "step": 647 + }, + { + "epoch": 0.06016713091922005, + "grad_norm": 2.1332859105227575, + "learning_rate": 5e-06, + "loss": 0.6558, + "num_tokens": 9282127.0, + "step": 648 + }, + { + "epoch": 0.060259981429897864, + "grad_norm": 2.3356797096515254, + "learning_rate": 5e-06, + "loss": 0.7086, + "num_tokens": 9294629.0, + "step": 649 + }, + { + "epoch": 0.060352831940575676, + "grad_norm": 2.0214409589798414, + "learning_rate": 5e-06, + "loss": 0.5794, + "num_tokens": 9309565.0, + "step": 650 + }, + { + "epoch": 0.06044568245125348, + "grad_norm": 2.0311676265945864, + "learning_rate": 5e-06, + "loss": 0.5592, + "num_tokens": 9324334.0, + "step": 651 + }, + { + "epoch": 0.06053853296193129, + "grad_norm": 1.9684046869565008, + "learning_rate": 5e-06, + "loss": 0.6107, + "num_tokens": 9340348.0, + "step": 652 + }, + { + "epoch": 0.0606313834726091, + "grad_norm": 1.9640723827958146, + "learning_rate": 5e-06, + "loss": 0.5419, + "num_tokens": 9354086.0, + "step": 653 + }, + { + "epoch": 0.060724233983286906, + "grad_norm": 2.0334824407121324, + "learning_rate": 5e-06, + "loss": 0.6182, + "num_tokens": 9369224.0, + "step": 654 + }, + { + "epoch": 0.06081708449396472, + "grad_norm": 2.032113288124857, + "learning_rate": 5e-06, + "loss": 0.5093, + "num_tokens": 9383536.0, + "step": 655 + }, + { + "epoch": 0.06090993500464253, + "grad_norm": 2.1318689820170653, + "learning_rate": 5e-06, + "loss": 0.706, + "num_tokens": 9398988.0, + "step": 656 + }, + { + "epoch": 0.06100278551532033, + "grad_norm": 2.1844344305920798, + "learning_rate": 5e-06, + "loss": 0.7169, + "num_tokens": 9414019.0, + "step": 657 + }, + { + "epoch": 0.06109563602599814, + "grad_norm": 2.0770463995658477, + "learning_rate": 5e-06, + "loss": 0.6309, + "num_tokens": 9427854.0, + "step": 658 + }, + { + "epoch": 0.061188486536675954, + "grad_norm": 2.0491021290756257, + "learning_rate": 5e-06, + "loss": 0.6316, + "num_tokens": 9442309.0, + "step": 659 + }, + { + "epoch": 0.06128133704735376, + "grad_norm": 2.0783053976200825, + "learning_rate": 5e-06, + "loss": 0.5491, + "num_tokens": 9455948.0, + "step": 660 + }, + { + "epoch": 0.06137418755803157, + "grad_norm": 1.9497550187859132, + "learning_rate": 5e-06, + "loss": 0.624, + "num_tokens": 9471428.0, + "step": 661 + }, + { + "epoch": 0.06146703806870938, + "grad_norm": 2.012605424992593, + "learning_rate": 5e-06, + "loss": 0.5938, + "num_tokens": 9485635.0, + "step": 662 + }, + { + "epoch": 0.061559888579387184, + "grad_norm": 2.0593686369570583, + "learning_rate": 5e-06, + "loss": 0.5864, + "num_tokens": 9499614.0, + "step": 663 + }, + { + "epoch": 0.061652739090064995, + "grad_norm": 2.0651195624214846, + "learning_rate": 5e-06, + "loss": 0.5614, + "num_tokens": 9512753.0, + "step": 664 + }, + { + "epoch": 0.061745589600742806, + "grad_norm": 1.9717626656821952, + "learning_rate": 5e-06, + "loss": 0.5784, + "num_tokens": 9528147.0, + "step": 665 + }, + { + "epoch": 0.06183844011142061, + "grad_norm": 1.9409455127427149, + "learning_rate": 5e-06, + "loss": 0.5679, + "num_tokens": 9543884.0, + "step": 666 + }, + { + "epoch": 0.06193129062209842, + "grad_norm": 1.9976685706520205, + "learning_rate": 5e-06, + "loss": 0.6022, + "num_tokens": 9559267.0, + "step": 667 + }, + { + "epoch": 0.06202414113277623, + "grad_norm": 2.067360938534982, + "learning_rate": 5e-06, + "loss": 0.583, + "num_tokens": 9572825.0, + "step": 668 + }, + { + "epoch": 0.062116991643454036, + "grad_norm": 2.3078901950753488, + "learning_rate": 5e-06, + "loss": 0.606, + "num_tokens": 9585463.0, + "step": 669 + }, + { + "epoch": 0.06220984215413185, + "grad_norm": 2.093527771188684, + "learning_rate": 5e-06, + "loss": 0.6644, + "num_tokens": 9600062.0, + "step": 670 + }, + { + "epoch": 0.06230269266480966, + "grad_norm": 2.1175995546180784, + "learning_rate": 5e-06, + "loss": 0.5996, + "num_tokens": 9614452.0, + "step": 671 + }, + { + "epoch": 0.06239554317548746, + "grad_norm": 2.0497429785216568, + "learning_rate": 5e-06, + "loss": 0.6534, + "num_tokens": 9628980.0, + "step": 672 + }, + { + "epoch": 0.062488393686165274, + "grad_norm": 2.092301509619279, + "learning_rate": 5e-06, + "loss": 0.5603, + "num_tokens": 9642425.0, + "step": 673 + }, + { + "epoch": 0.06258124419684308, + "grad_norm": 1.9168903615774842, + "learning_rate": 5e-06, + "loss": 0.4852, + "num_tokens": 9656822.0, + "step": 674 + }, + { + "epoch": 0.06267409470752089, + "grad_norm": 2.2946930433042296, + "learning_rate": 5e-06, + "loss": 0.6364, + "num_tokens": 9669501.0, + "step": 675 + }, + { + "epoch": 0.0627669452181987, + "grad_norm": 2.1205377601423603, + "learning_rate": 5e-06, + "loss": 0.6074, + "num_tokens": 9683602.0, + "step": 676 + }, + { + "epoch": 0.06285979572887651, + "grad_norm": 1.9632612250421462, + "learning_rate": 5e-06, + "loss": 0.5779, + "num_tokens": 9699487.0, + "step": 677 + }, + { + "epoch": 0.06295264623955432, + "grad_norm": 1.9310897203319835, + "learning_rate": 5e-06, + "loss": 0.5331, + "num_tokens": 9714287.0, + "step": 678 + }, + { + "epoch": 0.06304549675023213, + "grad_norm": 2.04854408180042, + "learning_rate": 5e-06, + "loss": 0.6599, + "num_tokens": 9729908.0, + "step": 679 + }, + { + "epoch": 0.06313834726090993, + "grad_norm": 2.2039228632499888, + "learning_rate": 5e-06, + "loss": 0.5656, + "num_tokens": 9742431.0, + "step": 680 + }, + { + "epoch": 0.06323119777158774, + "grad_norm": 2.5488136546545133, + "learning_rate": 5e-06, + "loss": 0.6997, + "num_tokens": 9755319.0, + "step": 681 + }, + { + "epoch": 0.06332404828226555, + "grad_norm": 1.9774272910389827, + "learning_rate": 5e-06, + "loss": 0.6457, + "num_tokens": 9770142.0, + "step": 682 + }, + { + "epoch": 0.06341689879294336, + "grad_norm": 1.890706035639039, + "learning_rate": 5e-06, + "loss": 0.5117, + "num_tokens": 9785330.0, + "step": 683 + }, + { + "epoch": 0.06350974930362117, + "grad_norm": 2.1862497952858684, + "learning_rate": 5e-06, + "loss": 0.6556, + "num_tokens": 9799450.0, + "step": 684 + }, + { + "epoch": 0.06360259981429899, + "grad_norm": 2.2069232222999626, + "learning_rate": 5e-06, + "loss": 0.6599, + "num_tokens": 9814872.0, + "step": 685 + }, + { + "epoch": 0.06369545032497678, + "grad_norm": 1.9660144099038583, + "learning_rate": 5e-06, + "loss": 0.5379, + "num_tokens": 9829809.0, + "step": 686 + }, + { + "epoch": 0.0637883008356546, + "grad_norm": 1.8852759090571183, + "learning_rate": 5e-06, + "loss": 0.5729, + "num_tokens": 9845505.0, + "step": 687 + }, + { + "epoch": 0.0638811513463324, + "grad_norm": 2.0840512193110317, + "learning_rate": 5e-06, + "loss": 0.6479, + "num_tokens": 9860062.0, + "step": 688 + }, + { + "epoch": 0.06397400185701022, + "grad_norm": 2.093838626601899, + "learning_rate": 5e-06, + "loss": 0.587, + "num_tokens": 9874037.0, + "step": 689 + }, + { + "epoch": 0.06406685236768803, + "grad_norm": 2.0685850348810075, + "learning_rate": 5e-06, + "loss": 0.5431, + "num_tokens": 9887108.0, + "step": 690 + }, + { + "epoch": 0.06415970287836584, + "grad_norm": 2.113880519301545, + "learning_rate": 5e-06, + "loss": 0.5094, + "num_tokens": 9900151.0, + "step": 691 + }, + { + "epoch": 0.06425255338904363, + "grad_norm": 2.059203780678895, + "learning_rate": 5e-06, + "loss": 0.5141, + "num_tokens": 9912778.0, + "step": 692 + }, + { + "epoch": 0.06434540389972145, + "grad_norm": 2.1507425371379485, + "learning_rate": 5e-06, + "loss": 0.692, + "num_tokens": 9927527.0, + "step": 693 + }, + { + "epoch": 0.06443825441039926, + "grad_norm": 2.1038257096582167, + "learning_rate": 5e-06, + "loss": 0.6741, + "num_tokens": 9943349.0, + "step": 694 + }, + { + "epoch": 0.06453110492107707, + "grad_norm": 2.1994561785536253, + "learning_rate": 5e-06, + "loss": 0.6529, + "num_tokens": 9956677.0, + "step": 695 + }, + { + "epoch": 0.06462395543175488, + "grad_norm": 2.0375348361510923, + "learning_rate": 5e-06, + "loss": 0.5875, + "num_tokens": 9970600.0, + "step": 696 + }, + { + "epoch": 0.06471680594243269, + "grad_norm": 2.0962533511739436, + "learning_rate": 5e-06, + "loss": 0.6418, + "num_tokens": 9985496.0, + "step": 697 + }, + { + "epoch": 0.06480965645311049, + "grad_norm": 1.9552995787178655, + "learning_rate": 5e-06, + "loss": 0.516, + "num_tokens": 10000478.0, + "step": 698 + }, + { + "epoch": 0.0649025069637883, + "grad_norm": 2.192076309317724, + "learning_rate": 5e-06, + "loss": 0.6845, + "num_tokens": 10015394.0, + "step": 699 + }, + { + "epoch": 0.06499535747446611, + "grad_norm": 2.07068044748379, + "learning_rate": 5e-06, + "loss": 0.4831, + "num_tokens": 10028725.0, + "step": 700 + }, + { + "epoch": 0.06508820798514392, + "grad_norm": 2.0146450069825614, + "learning_rate": 5e-06, + "loss": 0.6237, + "num_tokens": 10044183.0, + "step": 701 + }, + { + "epoch": 0.06518105849582173, + "grad_norm": 2.055044959350912, + "learning_rate": 5e-06, + "loss": 0.6294, + "num_tokens": 10059573.0, + "step": 702 + }, + { + "epoch": 0.06527390900649954, + "grad_norm": 2.031053798787178, + "learning_rate": 5e-06, + "loss": 0.5417, + "num_tokens": 10074126.0, + "step": 703 + }, + { + "epoch": 0.06536675951717734, + "grad_norm": 1.9995132166947185, + "learning_rate": 5e-06, + "loss": 0.5709, + "num_tokens": 10088383.0, + "step": 704 + }, + { + "epoch": 0.06545961002785515, + "grad_norm": 1.9628601506568242, + "learning_rate": 5e-06, + "loss": 0.5879, + "num_tokens": 10104050.0, + "step": 705 + }, + { + "epoch": 0.06555246053853296, + "grad_norm": 2.193243746902614, + "learning_rate": 5e-06, + "loss": 0.7057, + "num_tokens": 10119185.0, + "step": 706 + }, + { + "epoch": 0.06564531104921077, + "grad_norm": 2.159755955192533, + "learning_rate": 5e-06, + "loss": 0.5922, + "num_tokens": 10132722.0, + "step": 707 + }, + { + "epoch": 0.06573816155988858, + "grad_norm": 2.1338413576651636, + "learning_rate": 5e-06, + "loss": 0.6282, + "num_tokens": 10146502.0, + "step": 708 + }, + { + "epoch": 0.0658310120705664, + "grad_norm": 2.0951447911009877, + "learning_rate": 5e-06, + "loss": 0.6486, + "num_tokens": 10161209.0, + "step": 709 + }, + { + "epoch": 0.06592386258124419, + "grad_norm": 2.1233391937994477, + "learning_rate": 5e-06, + "loss": 0.6197, + "num_tokens": 10174304.0, + "step": 710 + }, + { + "epoch": 0.066016713091922, + "grad_norm": 2.130240567001582, + "learning_rate": 5e-06, + "loss": 0.6293, + "num_tokens": 10188837.0, + "step": 711 + }, + { + "epoch": 0.06610956360259981, + "grad_norm": 1.9618359917589459, + "learning_rate": 5e-06, + "loss": 0.5774, + "num_tokens": 10203875.0, + "step": 712 + }, + { + "epoch": 0.06620241411327762, + "grad_norm": 2.139897803248488, + "learning_rate": 5e-06, + "loss": 0.6321, + "num_tokens": 10219221.0, + "step": 713 + }, + { + "epoch": 0.06629526462395544, + "grad_norm": 2.025947272932049, + "learning_rate": 5e-06, + "loss": 0.5651, + "num_tokens": 10232506.0, + "step": 714 + }, + { + "epoch": 0.06638811513463325, + "grad_norm": 2.1031287266543885, + "learning_rate": 5e-06, + "loss": 0.6556, + "num_tokens": 10247236.0, + "step": 715 + }, + { + "epoch": 0.06648096564531104, + "grad_norm": 1.9966444523224782, + "learning_rate": 5e-06, + "loss": 0.5832, + "num_tokens": 10261830.0, + "step": 716 + }, + { + "epoch": 0.06657381615598885, + "grad_norm": 2.0007385056950673, + "learning_rate": 5e-06, + "loss": 0.6828, + "num_tokens": 10277839.0, + "step": 717 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 2.018254119409408, + "learning_rate": 5e-06, + "loss": 0.5261, + "num_tokens": 10292835.0, + "step": 718 + }, + { + "epoch": 0.06675951717734448, + "grad_norm": 2.204669161393202, + "learning_rate": 5e-06, + "loss": 0.5727, + "num_tokens": 10306899.0, + "step": 719 + }, + { + "epoch": 0.06685236768802229, + "grad_norm": 2.129594766621451, + "learning_rate": 5e-06, + "loss": 0.5819, + "num_tokens": 10319953.0, + "step": 720 + }, + { + "epoch": 0.0669452181987001, + "grad_norm": 2.138843022878026, + "learning_rate": 5e-06, + "loss": 0.6182, + "num_tokens": 10334340.0, + "step": 721 + }, + { + "epoch": 0.0670380687093779, + "grad_norm": 1.9172964860901047, + "learning_rate": 5e-06, + "loss": 0.6053, + "num_tokens": 10350141.0, + "step": 722 + }, + { + "epoch": 0.06713091922005571, + "grad_norm": 2.0305708971979657, + "learning_rate": 5e-06, + "loss": 0.5839, + "num_tokens": 10364724.0, + "step": 723 + }, + { + "epoch": 0.06722376973073352, + "grad_norm": 2.0064006602332647, + "learning_rate": 5e-06, + "loss": 0.6141, + "num_tokens": 10379796.0, + "step": 724 + }, + { + "epoch": 0.06731662024141133, + "grad_norm": 2.0275863973093027, + "learning_rate": 5e-06, + "loss": 0.5905, + "num_tokens": 10395364.0, + "step": 725 + }, + { + "epoch": 0.06740947075208914, + "grad_norm": 2.1666973430877974, + "learning_rate": 5e-06, + "loss": 0.636, + "num_tokens": 10408512.0, + "step": 726 + }, + { + "epoch": 0.06750232126276695, + "grad_norm": 2.2038223752508177, + "learning_rate": 5e-06, + "loss": 0.6197, + "num_tokens": 10421720.0, + "step": 727 + }, + { + "epoch": 0.06759517177344475, + "grad_norm": 2.075454916131158, + "learning_rate": 5e-06, + "loss": 0.6013, + "num_tokens": 10435311.0, + "step": 728 + }, + { + "epoch": 0.06768802228412256, + "grad_norm": 2.1439075869317463, + "learning_rate": 5e-06, + "loss": 0.5794, + "num_tokens": 10448838.0, + "step": 729 + }, + { + "epoch": 0.06778087279480037, + "grad_norm": 1.9965630822891485, + "learning_rate": 5e-06, + "loss": 0.6249, + "num_tokens": 10464623.0, + "step": 730 + }, + { + "epoch": 0.06787372330547818, + "grad_norm": 2.1617775695012136, + "learning_rate": 5e-06, + "loss": 0.6602, + "num_tokens": 10480133.0, + "step": 731 + }, + { + "epoch": 0.06796657381615599, + "grad_norm": 2.056115471777315, + "learning_rate": 5e-06, + "loss": 0.5691, + "num_tokens": 10494939.0, + "step": 732 + }, + { + "epoch": 0.0680594243268338, + "grad_norm": 2.20580104552033, + "learning_rate": 5e-06, + "loss": 0.6266, + "num_tokens": 10508915.0, + "step": 733 + }, + { + "epoch": 0.0681522748375116, + "grad_norm": 1.9813437849247333, + "learning_rate": 5e-06, + "loss": 0.5181, + "num_tokens": 10523411.0, + "step": 734 + }, + { + "epoch": 0.06824512534818941, + "grad_norm": 2.0376769736050124, + "learning_rate": 5e-06, + "loss": 0.5499, + "num_tokens": 10538559.0, + "step": 735 + }, + { + "epoch": 0.06833797585886722, + "grad_norm": 1.9206579407867308, + "learning_rate": 5e-06, + "loss": 0.6018, + "num_tokens": 10554729.0, + "step": 736 + }, + { + "epoch": 0.06843082636954503, + "grad_norm": 2.1361430001944743, + "learning_rate": 5e-06, + "loss": 0.6249, + "num_tokens": 10569600.0, + "step": 737 + }, + { + "epoch": 0.06852367688022284, + "grad_norm": 2.1801980427718575, + "learning_rate": 5e-06, + "loss": 0.7083, + "num_tokens": 10585147.0, + "step": 738 + }, + { + "epoch": 0.06861652739090066, + "grad_norm": 2.078323569394189, + "learning_rate": 5e-06, + "loss": 0.5535, + "num_tokens": 10598843.0, + "step": 739 + }, + { + "epoch": 0.06870937790157845, + "grad_norm": 2.1106204670463202, + "learning_rate": 5e-06, + "loss": 0.5595, + "num_tokens": 10612561.0, + "step": 740 + }, + { + "epoch": 0.06880222841225626, + "grad_norm": 2.168203963284914, + "learning_rate": 5e-06, + "loss": 0.596, + "num_tokens": 10626722.0, + "step": 741 + }, + { + "epoch": 0.06889507892293407, + "grad_norm": 1.9720035245985432, + "learning_rate": 5e-06, + "loss": 0.578, + "num_tokens": 10642684.0, + "step": 742 + }, + { + "epoch": 0.06898792943361189, + "grad_norm": 1.992841381615888, + "learning_rate": 5e-06, + "loss": 0.5117, + "num_tokens": 10657703.0, + "step": 743 + }, + { + "epoch": 0.0690807799442897, + "grad_norm": 2.2887721951823408, + "learning_rate": 5e-06, + "loss": 0.7615, + "num_tokens": 10672915.0, + "step": 744 + }, + { + "epoch": 0.06917363045496751, + "grad_norm": 1.8849728479782315, + "learning_rate": 5e-06, + "loss": 0.508, + "num_tokens": 10687466.0, + "step": 745 + }, + { + "epoch": 0.0692664809656453, + "grad_norm": 2.176805809525069, + "learning_rate": 5e-06, + "loss": 0.7425, + "num_tokens": 10701387.0, + "step": 746 + }, + { + "epoch": 0.06935933147632312, + "grad_norm": 2.1061038057192802, + "learning_rate": 5e-06, + "loss": 0.4925, + "num_tokens": 10713824.0, + "step": 747 + }, + { + "epoch": 0.06945218198700093, + "grad_norm": 2.103213146531026, + "learning_rate": 5e-06, + "loss": 0.6164, + "num_tokens": 10727599.0, + "step": 748 + }, + { + "epoch": 0.06954503249767874, + "grad_norm": 2.0090345291015335, + "learning_rate": 5e-06, + "loss": 0.5111, + "num_tokens": 10741360.0, + "step": 749 + }, + { + "epoch": 0.06963788300835655, + "grad_norm": 1.9746388851066063, + "learning_rate": 5e-06, + "loss": 0.5545, + "num_tokens": 10755700.0, + "step": 750 + }, + { + "epoch": 0.06973073351903436, + "grad_norm": 1.937903568346398, + "learning_rate": 5e-06, + "loss": 0.6601, + "num_tokens": 10772084.0, + "step": 751 + }, + { + "epoch": 0.06982358402971216, + "grad_norm": 1.9445153806421829, + "learning_rate": 5e-06, + "loss": 0.6252, + "num_tokens": 10787734.0, + "step": 752 + }, + { + "epoch": 0.06991643454038997, + "grad_norm": 2.1043565857506965, + "learning_rate": 5e-06, + "loss": 0.6026, + "num_tokens": 10802521.0, + "step": 753 + }, + { + "epoch": 0.07000928505106778, + "grad_norm": 1.9539433365193306, + "learning_rate": 5e-06, + "loss": 0.5573, + "num_tokens": 10817504.0, + "step": 754 + }, + { + "epoch": 0.07010213556174559, + "grad_norm": 1.9825040646670349, + "learning_rate": 5e-06, + "loss": 0.5556, + "num_tokens": 10832494.0, + "step": 755 + }, + { + "epoch": 0.0701949860724234, + "grad_norm": 2.024161797020233, + "learning_rate": 5e-06, + "loss": 0.6798, + "num_tokens": 10848643.0, + "step": 756 + }, + { + "epoch": 0.07028783658310121, + "grad_norm": 2.1844754778046744, + "learning_rate": 5e-06, + "loss": 0.5397, + "num_tokens": 10860987.0, + "step": 757 + }, + { + "epoch": 0.07038068709377901, + "grad_norm": 2.132657536325868, + "learning_rate": 5e-06, + "loss": 0.6934, + "num_tokens": 10875976.0, + "step": 758 + }, + { + "epoch": 0.07047353760445682, + "grad_norm": 2.193824615471635, + "learning_rate": 5e-06, + "loss": 0.6403, + "num_tokens": 10890031.0, + "step": 759 + }, + { + "epoch": 0.07056638811513463, + "grad_norm": 2.0093393975194616, + "learning_rate": 5e-06, + "loss": 0.5077, + "num_tokens": 10905102.0, + "step": 760 + }, + { + "epoch": 0.07065923862581244, + "grad_norm": 2.1286168726657184, + "learning_rate": 5e-06, + "loss": 0.6341, + "num_tokens": 10919369.0, + "step": 761 + }, + { + "epoch": 0.07075208913649025, + "grad_norm": 1.9984201151779075, + "learning_rate": 5e-06, + "loss": 0.5927, + "num_tokens": 10933710.0, + "step": 762 + }, + { + "epoch": 0.07084493964716806, + "grad_norm": 2.339044652976246, + "learning_rate": 5e-06, + "loss": 0.6411, + "num_tokens": 10946387.0, + "step": 763 + }, + { + "epoch": 0.07093779015784586, + "grad_norm": 1.890210832503502, + "learning_rate": 5e-06, + "loss": 0.5261, + "num_tokens": 10961789.0, + "step": 764 + }, + { + "epoch": 0.07103064066852367, + "grad_norm": 1.9616146442024358, + "learning_rate": 5e-06, + "loss": 0.5796, + "num_tokens": 10976590.0, + "step": 765 + }, + { + "epoch": 0.07112349117920148, + "grad_norm": 2.1222107404349857, + "learning_rate": 5e-06, + "loss": 0.6003, + "num_tokens": 10989822.0, + "step": 766 + }, + { + "epoch": 0.0712163416898793, + "grad_norm": 2.241997251886281, + "learning_rate": 5e-06, + "loss": 0.5709, + "num_tokens": 11003272.0, + "step": 767 + }, + { + "epoch": 0.0713091922005571, + "grad_norm": 2.1852170172445393, + "learning_rate": 5e-06, + "loss": 0.6303, + "num_tokens": 11017710.0, + "step": 768 + }, + { + "epoch": 0.07140204271123492, + "grad_norm": 2.2512367618343507, + "learning_rate": 5e-06, + "loss": 0.5803, + "num_tokens": 11030485.0, + "step": 769 + }, + { + "epoch": 0.07149489322191271, + "grad_norm": 2.131002693896147, + "learning_rate": 5e-06, + "loss": 0.6579, + "num_tokens": 11045303.0, + "step": 770 + }, + { + "epoch": 0.07158774373259053, + "grad_norm": 1.9914686176795313, + "learning_rate": 5e-06, + "loss": 0.6016, + "num_tokens": 11060790.0, + "step": 771 + }, + { + "epoch": 0.07168059424326834, + "grad_norm": 2.054801852540561, + "learning_rate": 5e-06, + "loss": 0.625, + "num_tokens": 11074951.0, + "step": 772 + }, + { + "epoch": 0.07177344475394615, + "grad_norm": 2.077396710644228, + "learning_rate": 5e-06, + "loss": 0.493, + "num_tokens": 11087244.0, + "step": 773 + }, + { + "epoch": 0.07186629526462396, + "grad_norm": 2.0347939421807846, + "learning_rate": 5e-06, + "loss": 0.6416, + "num_tokens": 11103067.0, + "step": 774 + }, + { + "epoch": 0.07195914577530177, + "grad_norm": 2.297611832362149, + "learning_rate": 5e-06, + "loss": 0.7901, + "num_tokens": 11118050.0, + "step": 775 + }, + { + "epoch": 0.07205199628597957, + "grad_norm": 2.1796190675521605, + "learning_rate": 5e-06, + "loss": 0.6233, + "num_tokens": 11130978.0, + "step": 776 + }, + { + "epoch": 0.07214484679665738, + "grad_norm": 2.187524841141574, + "learning_rate": 5e-06, + "loss": 0.6208, + "num_tokens": 11145158.0, + "step": 777 + }, + { + "epoch": 0.07223769730733519, + "grad_norm": 2.051520761878353, + "learning_rate": 5e-06, + "loss": 0.566, + "num_tokens": 11160267.0, + "step": 778 + }, + { + "epoch": 0.072330547818013, + "grad_norm": 2.3559789799789903, + "learning_rate": 5e-06, + "loss": 0.5864, + "num_tokens": 11173456.0, + "step": 779 + }, + { + "epoch": 0.07242339832869081, + "grad_norm": 2.077500984351304, + "learning_rate": 5e-06, + "loss": 0.5482, + "num_tokens": 11188121.0, + "step": 780 + }, + { + "epoch": 0.07251624883936862, + "grad_norm": 2.0838401265534268, + "learning_rate": 5e-06, + "loss": 0.7052, + "num_tokens": 11203900.0, + "step": 781 + }, + { + "epoch": 0.07260909935004642, + "grad_norm": 2.1770871245269507, + "learning_rate": 5e-06, + "loss": 0.5624, + "num_tokens": 11217159.0, + "step": 782 + }, + { + "epoch": 0.07270194986072423, + "grad_norm": 1.9723793877201934, + "learning_rate": 5e-06, + "loss": 0.6291, + "num_tokens": 11232246.0, + "step": 783 + }, + { + "epoch": 0.07279480037140204, + "grad_norm": 2.0125409122433013, + "learning_rate": 5e-06, + "loss": 0.5217, + "num_tokens": 11247130.0, + "step": 784 + }, + { + "epoch": 0.07288765088207985, + "grad_norm": 2.2519571222417647, + "learning_rate": 5e-06, + "loss": 0.6411, + "num_tokens": 11260385.0, + "step": 785 + }, + { + "epoch": 0.07298050139275766, + "grad_norm": 2.0607027377403, + "learning_rate": 5e-06, + "loss": 0.5588, + "num_tokens": 11275586.0, + "step": 786 + }, + { + "epoch": 0.07307335190343547, + "grad_norm": 2.020911385020079, + "learning_rate": 5e-06, + "loss": 0.6306, + "num_tokens": 11290268.0, + "step": 787 + }, + { + "epoch": 0.07316620241411327, + "grad_norm": 2.0389661171736897, + "learning_rate": 5e-06, + "loss": 0.5782, + "num_tokens": 11305094.0, + "step": 788 + }, + { + "epoch": 0.07325905292479108, + "grad_norm": 2.057280563572399, + "learning_rate": 5e-06, + "loss": 0.6425, + "num_tokens": 11320597.0, + "step": 789 + }, + { + "epoch": 0.0733519034354689, + "grad_norm": 2.0060478210637616, + "learning_rate": 5e-06, + "loss": 0.6475, + "num_tokens": 11336013.0, + "step": 790 + }, + { + "epoch": 0.0734447539461467, + "grad_norm": 2.0579630040182564, + "learning_rate": 5e-06, + "loss": 0.6546, + "num_tokens": 11350736.0, + "step": 791 + }, + { + "epoch": 0.07353760445682452, + "grad_norm": 2.129074783970864, + "learning_rate": 5e-06, + "loss": 0.4666, + "num_tokens": 11362434.0, + "step": 792 + }, + { + "epoch": 0.07363045496750233, + "grad_norm": 2.0098086737238905, + "learning_rate": 5e-06, + "loss": 0.6916, + "num_tokens": 11378578.0, + "step": 793 + }, + { + "epoch": 0.07372330547818012, + "grad_norm": 1.9789701381877998, + "learning_rate": 5e-06, + "loss": 0.5378, + "num_tokens": 11393895.0, + "step": 794 + }, + { + "epoch": 0.07381615598885793, + "grad_norm": 2.0529529121482177, + "learning_rate": 5e-06, + "loss": 0.5315, + "num_tokens": 11407187.0, + "step": 795 + }, + { + "epoch": 0.07390900649953575, + "grad_norm": 1.9382449499271208, + "learning_rate": 5e-06, + "loss": 0.6296, + "num_tokens": 11423320.0, + "step": 796 + }, + { + "epoch": 0.07400185701021356, + "grad_norm": 2.00750882460766, + "learning_rate": 5e-06, + "loss": 0.6327, + "num_tokens": 11439049.0, + "step": 797 + }, + { + "epoch": 0.07409470752089137, + "grad_norm": 2.1010396027272744, + "learning_rate": 5e-06, + "loss": 0.6385, + "num_tokens": 11453466.0, + "step": 798 + }, + { + "epoch": 0.07418755803156918, + "grad_norm": 2.0384539234754016, + "learning_rate": 5e-06, + "loss": 0.5774, + "num_tokens": 11467285.0, + "step": 799 + }, + { + "epoch": 0.07428040854224698, + "grad_norm": 2.058151429219153, + "learning_rate": 5e-06, + "loss": 0.6711, + "num_tokens": 11480713.0, + "step": 800 + }, + { + "epoch": 0.07437325905292479, + "grad_norm": 2.014700566935491, + "learning_rate": 5e-06, + "loss": 0.6543, + "num_tokens": 11496072.0, + "step": 801 + }, + { + "epoch": 0.0744661095636026, + "grad_norm": 2.09441918118561, + "learning_rate": 5e-06, + "loss": 0.576, + "num_tokens": 11510242.0, + "step": 802 + }, + { + "epoch": 0.07455896007428041, + "grad_norm": 2.1018552867640397, + "learning_rate": 5e-06, + "loss": 0.6256, + "num_tokens": 11523929.0, + "step": 803 + }, + { + "epoch": 0.07465181058495822, + "grad_norm": 2.4019579122303147, + "learning_rate": 5e-06, + "loss": 0.5521, + "num_tokens": 11535281.0, + "step": 804 + }, + { + "epoch": 0.07474466109563603, + "grad_norm": 2.088613207062237, + "learning_rate": 5e-06, + "loss": 0.5218, + "num_tokens": 11549263.0, + "step": 805 + }, + { + "epoch": 0.07483751160631383, + "grad_norm": 2.074112138135572, + "learning_rate": 5e-06, + "loss": 0.5703, + "num_tokens": 11562247.0, + "step": 806 + }, + { + "epoch": 0.07493036211699164, + "grad_norm": 1.9977267087828536, + "learning_rate": 5e-06, + "loss": 0.5753, + "num_tokens": 11577757.0, + "step": 807 + }, + { + "epoch": 0.07502321262766945, + "grad_norm": 2.2168424675586023, + "learning_rate": 5e-06, + "loss": 0.6663, + "num_tokens": 11592404.0, + "step": 808 + }, + { + "epoch": 0.07511606313834726, + "grad_norm": 1.9472779653906431, + "learning_rate": 5e-06, + "loss": 0.5503, + "num_tokens": 11608048.0, + "step": 809 + }, + { + "epoch": 0.07520891364902507, + "grad_norm": 2.060235375310467, + "learning_rate": 5e-06, + "loss": 0.5836, + "num_tokens": 11623060.0, + "step": 810 + }, + { + "epoch": 0.07530176415970288, + "grad_norm": 2.2345596795627554, + "learning_rate": 5e-06, + "loss": 0.6253, + "num_tokens": 11636286.0, + "step": 811 + }, + { + "epoch": 0.07539461467038068, + "grad_norm": 2.028270377376435, + "learning_rate": 5e-06, + "loss": 0.5618, + "num_tokens": 11651343.0, + "step": 812 + }, + { + "epoch": 0.07548746518105849, + "grad_norm": 2.0055156435734474, + "learning_rate": 5e-06, + "loss": 0.5934, + "num_tokens": 11666913.0, + "step": 813 + }, + { + "epoch": 0.0755803156917363, + "grad_norm": 2.1491983463410955, + "learning_rate": 5e-06, + "loss": 0.5947, + "num_tokens": 11681369.0, + "step": 814 + }, + { + "epoch": 0.07567316620241411, + "grad_norm": 2.012946565820802, + "learning_rate": 5e-06, + "loss": 0.6063, + "num_tokens": 11695159.0, + "step": 815 + }, + { + "epoch": 0.07576601671309192, + "grad_norm": 2.0295417197832184, + "learning_rate": 5e-06, + "loss": 0.5731, + "num_tokens": 11709329.0, + "step": 816 + }, + { + "epoch": 0.07585886722376974, + "grad_norm": 2.038224301797941, + "learning_rate": 5e-06, + "loss": 0.5504, + "num_tokens": 11723497.0, + "step": 817 + }, + { + "epoch": 0.07595171773444755, + "grad_norm": 2.018686109514549, + "learning_rate": 5e-06, + "loss": 0.6164, + "num_tokens": 11737912.0, + "step": 818 + }, + { + "epoch": 0.07604456824512534, + "grad_norm": 2.0265853476448035, + "learning_rate": 5e-06, + "loss": 0.5609, + "num_tokens": 11750860.0, + "step": 819 + }, + { + "epoch": 0.07613741875580315, + "grad_norm": 2.0685216391130545, + "learning_rate": 5e-06, + "loss": 0.5725, + "num_tokens": 11763270.0, + "step": 820 + }, + { + "epoch": 0.07623026926648097, + "grad_norm": 1.9378336579065492, + "learning_rate": 5e-06, + "loss": 0.5823, + "num_tokens": 11779108.0, + "step": 821 + }, + { + "epoch": 0.07632311977715878, + "grad_norm": 2.1939518611002593, + "learning_rate": 5e-06, + "loss": 0.614, + "num_tokens": 11791833.0, + "step": 822 + }, + { + "epoch": 0.07641597028783659, + "grad_norm": 2.2031056802772024, + "learning_rate": 5e-06, + "loss": 0.6027, + "num_tokens": 11806157.0, + "step": 823 + }, + { + "epoch": 0.0765088207985144, + "grad_norm": 1.9115619744124048, + "learning_rate": 5e-06, + "loss": 0.5389, + "num_tokens": 11821064.0, + "step": 824 + }, + { + "epoch": 0.0766016713091922, + "grad_norm": 1.9577945045789475, + "learning_rate": 5e-06, + "loss": 0.5709, + "num_tokens": 11836508.0, + "step": 825 + }, + { + "epoch": 0.07669452181987, + "grad_norm": 2.1031047451170473, + "learning_rate": 5e-06, + "loss": 0.6391, + "num_tokens": 11851976.0, + "step": 826 + }, + { + "epoch": 0.07678737233054782, + "grad_norm": 2.251113654887244, + "learning_rate": 5e-06, + "loss": 0.6433, + "num_tokens": 11865887.0, + "step": 827 + }, + { + "epoch": 0.07688022284122563, + "grad_norm": 1.9610834603315113, + "learning_rate": 5e-06, + "loss": 0.5241, + "num_tokens": 11880725.0, + "step": 828 + }, + { + "epoch": 0.07697307335190344, + "grad_norm": 2.039621885844689, + "learning_rate": 5e-06, + "loss": 0.5829, + "num_tokens": 11895743.0, + "step": 829 + }, + { + "epoch": 0.07706592386258125, + "grad_norm": 2.245781039328306, + "learning_rate": 5e-06, + "loss": 0.6151, + "num_tokens": 11908905.0, + "step": 830 + }, + { + "epoch": 0.07715877437325905, + "grad_norm": 2.27366542244551, + "learning_rate": 5e-06, + "loss": 0.7177, + "num_tokens": 11922519.0, + "step": 831 + }, + { + "epoch": 0.07725162488393686, + "grad_norm": 2.1158175058252695, + "learning_rate": 5e-06, + "loss": 0.6775, + "num_tokens": 11936528.0, + "step": 832 + }, + { + "epoch": 0.07734447539461467, + "grad_norm": 1.9004445225491817, + "learning_rate": 5e-06, + "loss": 0.565, + "num_tokens": 11951999.0, + "step": 833 + }, + { + "epoch": 0.07743732590529248, + "grad_norm": 2.0317633440119294, + "learning_rate": 5e-06, + "loss": 0.5288, + "num_tokens": 11965562.0, + "step": 834 + }, + { + "epoch": 0.07753017641597029, + "grad_norm": 2.0237583446822147, + "learning_rate": 5e-06, + "loss": 0.5307, + "num_tokens": 11978858.0, + "step": 835 + }, + { + "epoch": 0.0776230269266481, + "grad_norm": 1.988766908949781, + "learning_rate": 5e-06, + "loss": 0.5419, + "num_tokens": 11993896.0, + "step": 836 + }, + { + "epoch": 0.0777158774373259, + "grad_norm": 2.0767690802175136, + "learning_rate": 5e-06, + "loss": 0.5916, + "num_tokens": 12008744.0, + "step": 837 + }, + { + "epoch": 0.07780872794800371, + "grad_norm": 2.037627228064559, + "learning_rate": 5e-06, + "loss": 0.5092, + "num_tokens": 12022953.0, + "step": 838 + }, + { + "epoch": 0.07790157845868152, + "grad_norm": 2.157297536664786, + "learning_rate": 5e-06, + "loss": 0.5902, + "num_tokens": 12037288.0, + "step": 839 + }, + { + "epoch": 0.07799442896935933, + "grad_norm": 2.10878822164796, + "learning_rate": 5e-06, + "loss": 0.615, + "num_tokens": 12051268.0, + "step": 840 + }, + { + "epoch": 0.07808727948003714, + "grad_norm": 2.1687350131287695, + "learning_rate": 5e-06, + "loss": 0.6822, + "num_tokens": 12067127.0, + "step": 841 + }, + { + "epoch": 0.07818012999071496, + "grad_norm": 2.1868282154692267, + "learning_rate": 5e-06, + "loss": 0.6742, + "num_tokens": 12081666.0, + "step": 842 + }, + { + "epoch": 0.07827298050139275, + "grad_norm": 2.024575518046244, + "learning_rate": 5e-06, + "loss": 0.596, + "num_tokens": 12097386.0, + "step": 843 + }, + { + "epoch": 0.07836583101207056, + "grad_norm": 2.2893032183617636, + "learning_rate": 5e-06, + "loss": 0.6619, + "num_tokens": 12110735.0, + "step": 844 + }, + { + "epoch": 0.07845868152274837, + "grad_norm": 2.0076665937373646, + "learning_rate": 5e-06, + "loss": 0.6438, + "num_tokens": 12125630.0, + "step": 845 + }, + { + "epoch": 0.07855153203342619, + "grad_norm": 2.0516600159839484, + "learning_rate": 5e-06, + "loss": 0.5359, + "num_tokens": 12138941.0, + "step": 846 + }, + { + "epoch": 0.078644382544104, + "grad_norm": 1.9780437534553, + "learning_rate": 5e-06, + "loss": 0.5784, + "num_tokens": 12152923.0, + "step": 847 + }, + { + "epoch": 0.07873723305478181, + "grad_norm": 2.145745498780528, + "learning_rate": 5e-06, + "loss": 0.6283, + "num_tokens": 12166437.0, + "step": 848 + }, + { + "epoch": 0.0788300835654596, + "grad_norm": 2.0030273137775714, + "learning_rate": 5e-06, + "loss": 0.5911, + "num_tokens": 12181946.0, + "step": 849 + }, + { + "epoch": 0.07892293407613742, + "grad_norm": 2.1735850477984937, + "learning_rate": 5e-06, + "loss": 0.578, + "num_tokens": 12195851.0, + "step": 850 + }, + { + "epoch": 0.07901578458681523, + "grad_norm": 2.095504170421263, + "learning_rate": 5e-06, + "loss": 0.674, + "num_tokens": 12211288.0, + "step": 851 + }, + { + "epoch": 0.07910863509749304, + "grad_norm": 2.0259561674530024, + "learning_rate": 5e-06, + "loss": 0.6377, + "num_tokens": 12227292.0, + "step": 852 + }, + { + "epoch": 0.07920148560817085, + "grad_norm": 2.037692960820566, + "learning_rate": 5e-06, + "loss": 0.4885, + "num_tokens": 12241151.0, + "step": 853 + }, + { + "epoch": 0.07929433611884866, + "grad_norm": 2.1076581682298143, + "learning_rate": 5e-06, + "loss": 0.5701, + "num_tokens": 12254781.0, + "step": 854 + }, + { + "epoch": 0.07938718662952646, + "grad_norm": 2.167697918685706, + "learning_rate": 5e-06, + "loss": 0.6233, + "num_tokens": 12269723.0, + "step": 855 + }, + { + "epoch": 0.07948003714020427, + "grad_norm": 1.9301282046059949, + "learning_rate": 5e-06, + "loss": 0.5974, + "num_tokens": 12285686.0, + "step": 856 + }, + { + "epoch": 0.07957288765088208, + "grad_norm": 2.0405752789513114, + "learning_rate": 5e-06, + "loss": 0.5623, + "num_tokens": 12299769.0, + "step": 857 + }, + { + "epoch": 0.07966573816155989, + "grad_norm": 1.9788097625178174, + "learning_rate": 5e-06, + "loss": 0.5658, + "num_tokens": 12314348.0, + "step": 858 + }, + { + "epoch": 0.0797585886722377, + "grad_norm": 2.0699852053649543, + "learning_rate": 5e-06, + "loss": 0.6023, + "num_tokens": 12329015.0, + "step": 859 + }, + { + "epoch": 0.07985143918291551, + "grad_norm": 1.951583586578621, + "learning_rate": 5e-06, + "loss": 0.5706, + "num_tokens": 12344795.0, + "step": 860 + }, + { + "epoch": 0.07994428969359331, + "grad_norm": 1.9812224073350133, + "learning_rate": 5e-06, + "loss": 0.6615, + "num_tokens": 12360382.0, + "step": 861 + }, + { + "epoch": 0.08003714020427112, + "grad_norm": 1.972693062600726, + "learning_rate": 5e-06, + "loss": 0.6293, + "num_tokens": 12375287.0, + "step": 862 + }, + { + "epoch": 0.08012999071494893, + "grad_norm": 2.0091594207592407, + "learning_rate": 5e-06, + "loss": 0.5329, + "num_tokens": 12388497.0, + "step": 863 + }, + { + "epoch": 0.08022284122562674, + "grad_norm": 1.9851738287366878, + "learning_rate": 5e-06, + "loss": 0.5518, + "num_tokens": 12403557.0, + "step": 864 + }, + { + "epoch": 0.08031569173630455, + "grad_norm": 2.080306017160957, + "learning_rate": 5e-06, + "loss": 0.558, + "num_tokens": 12417525.0, + "step": 865 + }, + { + "epoch": 0.08040854224698236, + "grad_norm": 1.9532238607049095, + "learning_rate": 5e-06, + "loss": 0.546, + "num_tokens": 12432310.0, + "step": 866 + }, + { + "epoch": 0.08050139275766016, + "grad_norm": 2.02169514928064, + "learning_rate": 5e-06, + "loss": 0.5706, + "num_tokens": 12446918.0, + "step": 867 + }, + { + "epoch": 0.08059424326833797, + "grad_norm": 2.101242641987792, + "learning_rate": 5e-06, + "loss": 0.6181, + "num_tokens": 12462080.0, + "step": 868 + }, + { + "epoch": 0.08068709377901578, + "grad_norm": 2.0793604093710085, + "learning_rate": 5e-06, + "loss": 0.5575, + "num_tokens": 12475316.0, + "step": 869 + }, + { + "epoch": 0.0807799442896936, + "grad_norm": 2.0564590762029247, + "learning_rate": 5e-06, + "loss": 0.6245, + "num_tokens": 12490534.0, + "step": 870 + }, + { + "epoch": 0.0808727948003714, + "grad_norm": 2.02614632531486, + "learning_rate": 5e-06, + "loss": 0.6656, + "num_tokens": 12505265.0, + "step": 871 + }, + { + "epoch": 0.08096564531104922, + "grad_norm": 2.1277193015351865, + "learning_rate": 5e-06, + "loss": 0.4902, + "num_tokens": 12519590.0, + "step": 872 + }, + { + "epoch": 0.08105849582172701, + "grad_norm": 1.895789183631472, + "learning_rate": 5e-06, + "loss": 0.5106, + "num_tokens": 12534628.0, + "step": 873 + }, + { + "epoch": 0.08115134633240483, + "grad_norm": 1.8732202986290836, + "learning_rate": 5e-06, + "loss": 0.5319, + "num_tokens": 12550337.0, + "step": 874 + }, + { + "epoch": 0.08124419684308264, + "grad_norm": 2.0174496060060703, + "learning_rate": 5e-06, + "loss": 0.6197, + "num_tokens": 12566235.0, + "step": 875 + }, + { + "epoch": 0.08133704735376045, + "grad_norm": 2.176912651458486, + "learning_rate": 5e-06, + "loss": 0.7036, + "num_tokens": 12580752.0, + "step": 876 + }, + { + "epoch": 0.08142989786443826, + "grad_norm": 2.0533230379048746, + "learning_rate": 5e-06, + "loss": 0.554, + "num_tokens": 12594228.0, + "step": 877 + }, + { + "epoch": 0.08152274837511607, + "grad_norm": 2.0665534842208952, + "learning_rate": 5e-06, + "loss": 0.5245, + "num_tokens": 12608490.0, + "step": 878 + }, + { + "epoch": 0.08161559888579387, + "grad_norm": 2.1338746622813516, + "learning_rate": 5e-06, + "loss": 0.7013, + "num_tokens": 12624832.0, + "step": 879 + }, + { + "epoch": 0.08170844939647168, + "grad_norm": 2.178394710927803, + "learning_rate": 5e-06, + "loss": 0.589, + "num_tokens": 12638222.0, + "step": 880 + }, + { + "epoch": 0.08180129990714949, + "grad_norm": 2.044347563684345, + "learning_rate": 5e-06, + "loss": 0.5698, + "num_tokens": 12652421.0, + "step": 881 + }, + { + "epoch": 0.0818941504178273, + "grad_norm": 2.255756346528494, + "learning_rate": 5e-06, + "loss": 0.6519, + "num_tokens": 12667189.0, + "step": 882 + }, + { + "epoch": 0.08198700092850511, + "grad_norm": 2.183750364112111, + "learning_rate": 5e-06, + "loss": 0.6174, + "num_tokens": 12680015.0, + "step": 883 + }, + { + "epoch": 0.08207985143918292, + "grad_norm": 2.195087180884692, + "learning_rate": 5e-06, + "loss": 0.6544, + "num_tokens": 12693156.0, + "step": 884 + }, + { + "epoch": 0.08217270194986072, + "grad_norm": 2.178084871579104, + "learning_rate": 5e-06, + "loss": 0.5918, + "num_tokens": 12706374.0, + "step": 885 + }, + { + "epoch": 0.08226555246053853, + "grad_norm": 2.0059538492103743, + "learning_rate": 5e-06, + "loss": 0.6136, + "num_tokens": 12720675.0, + "step": 886 + }, + { + "epoch": 0.08235840297121634, + "grad_norm": 1.8847984040087393, + "learning_rate": 5e-06, + "loss": 0.5242, + "num_tokens": 12736672.0, + "step": 887 + }, + { + "epoch": 0.08245125348189415, + "grad_norm": 2.0242577466076246, + "learning_rate": 5e-06, + "loss": 0.5103, + "num_tokens": 12751957.0, + "step": 888 + }, + { + "epoch": 0.08254410399257196, + "grad_norm": 1.9294411797992521, + "learning_rate": 5e-06, + "loss": 0.5592, + "num_tokens": 12768094.0, + "step": 889 + }, + { + "epoch": 0.08263695450324977, + "grad_norm": 2.121648264192772, + "learning_rate": 5e-06, + "loss": 0.7061, + "num_tokens": 12782390.0, + "step": 890 + }, + { + "epoch": 0.08272980501392757, + "grad_norm": 2.0927751916722284, + "learning_rate": 5e-06, + "loss": 0.5532, + "num_tokens": 12797024.0, + "step": 891 + }, + { + "epoch": 0.08282265552460538, + "grad_norm": 2.159209099218787, + "learning_rate": 5e-06, + "loss": 0.5591, + "num_tokens": 12810522.0, + "step": 892 + }, + { + "epoch": 0.08291550603528319, + "grad_norm": 2.074433008579091, + "learning_rate": 5e-06, + "loss": 0.6009, + "num_tokens": 12825834.0, + "step": 893 + }, + { + "epoch": 0.083008356545961, + "grad_norm": 2.096995680606439, + "learning_rate": 5e-06, + "loss": 0.5984, + "num_tokens": 12841021.0, + "step": 894 + }, + { + "epoch": 0.08310120705663882, + "grad_norm": 2.108534611588687, + "learning_rate": 5e-06, + "loss": 0.576, + "num_tokens": 12854904.0, + "step": 895 + }, + { + "epoch": 0.08319405756731663, + "grad_norm": 2.09048537468935, + "learning_rate": 5e-06, + "loss": 0.5538, + "num_tokens": 12869226.0, + "step": 896 + }, + { + "epoch": 0.08328690807799442, + "grad_norm": 2.2902831851782635, + "learning_rate": 5e-06, + "loss": 0.6325, + "num_tokens": 12882513.0, + "step": 897 + }, + { + "epoch": 0.08337975858867223, + "grad_norm": 2.028857371033817, + "learning_rate": 5e-06, + "loss": 0.5051, + "num_tokens": 12896512.0, + "step": 898 + }, + { + "epoch": 0.08347260909935005, + "grad_norm": 1.9082830573800686, + "learning_rate": 5e-06, + "loss": 0.5341, + "num_tokens": 12912034.0, + "step": 899 + }, + { + "epoch": 0.08356545961002786, + "grad_norm": 1.991908873606021, + "learning_rate": 5e-06, + "loss": 0.565, + "num_tokens": 12926723.0, + "step": 900 + }, + { + "epoch": 0.08365831012070567, + "grad_norm": 2.140075371577014, + "learning_rate": 5e-06, + "loss": 0.5975, + "num_tokens": 12940601.0, + "step": 901 + }, + { + "epoch": 0.08375116063138348, + "grad_norm": 2.1648925446175356, + "learning_rate": 5e-06, + "loss": 0.6406, + "num_tokens": 12954984.0, + "step": 902 + }, + { + "epoch": 0.08384401114206128, + "grad_norm": 2.181198981453642, + "learning_rate": 5e-06, + "loss": 0.7137, + "num_tokens": 12969348.0, + "step": 903 + }, + { + "epoch": 0.08393686165273909, + "grad_norm": 2.2602245665414147, + "learning_rate": 5e-06, + "loss": 0.5515, + "num_tokens": 12985142.0, + "step": 904 + }, + { + "epoch": 0.0840297121634169, + "grad_norm": 2.1301017936971465, + "learning_rate": 5e-06, + "loss": 0.6337, + "num_tokens": 13000969.0, + "step": 905 + }, + { + "epoch": 0.08412256267409471, + "grad_norm": 2.122365499716171, + "learning_rate": 5e-06, + "loss": 0.6827, + "num_tokens": 13016128.0, + "step": 906 + }, + { + "epoch": 0.08421541318477252, + "grad_norm": 1.965905831875374, + "learning_rate": 5e-06, + "loss": 0.5594, + "num_tokens": 13030779.0, + "step": 907 + }, + { + "epoch": 0.08430826369545033, + "grad_norm": 2.1549295788950236, + "learning_rate": 5e-06, + "loss": 0.6107, + "num_tokens": 13044424.0, + "step": 908 + }, + { + "epoch": 0.08440111420612813, + "grad_norm": 1.9413351739579734, + "learning_rate": 5e-06, + "loss": 0.4985, + "num_tokens": 13058979.0, + "step": 909 + }, + { + "epoch": 0.08449396471680594, + "grad_norm": 2.244524444571072, + "learning_rate": 5e-06, + "loss": 0.7585, + "num_tokens": 13074051.0, + "step": 910 + }, + { + "epoch": 0.08458681522748375, + "grad_norm": 2.039069425747441, + "learning_rate": 5e-06, + "loss": 0.5795, + "num_tokens": 13088812.0, + "step": 911 + }, + { + "epoch": 0.08467966573816156, + "grad_norm": 2.0100076501046935, + "learning_rate": 5e-06, + "loss": 0.6822, + "num_tokens": 13104539.0, + "step": 912 + }, + { + "epoch": 0.08477251624883937, + "grad_norm": 2.2704487634627473, + "learning_rate": 5e-06, + "loss": 0.7439, + "num_tokens": 13119130.0, + "step": 913 + }, + { + "epoch": 0.08486536675951718, + "grad_norm": 1.9746509256172131, + "learning_rate": 5e-06, + "loss": 0.6558, + "num_tokens": 13135125.0, + "step": 914 + }, + { + "epoch": 0.08495821727019498, + "grad_norm": 2.145728407700786, + "learning_rate": 5e-06, + "loss": 0.6072, + "num_tokens": 13149675.0, + "step": 915 + }, + { + "epoch": 0.08505106778087279, + "grad_norm": 2.315658456263577, + "learning_rate": 5e-06, + "loss": 0.6322, + "num_tokens": 13163629.0, + "step": 916 + }, + { + "epoch": 0.0851439182915506, + "grad_norm": 2.0846590007744927, + "learning_rate": 5e-06, + "loss": 0.6077, + "num_tokens": 13177033.0, + "step": 917 + }, + { + "epoch": 0.08523676880222841, + "grad_norm": 2.1563363317910107, + "learning_rate": 5e-06, + "loss": 0.5758, + "num_tokens": 13191355.0, + "step": 918 + }, + { + "epoch": 0.08532961931290622, + "grad_norm": 2.325367983489667, + "learning_rate": 5e-06, + "loss": 0.7462, + "num_tokens": 13205342.0, + "step": 919 + }, + { + "epoch": 0.08542246982358404, + "grad_norm": 2.07977966646989, + "learning_rate": 5e-06, + "loss": 0.6487, + "num_tokens": 13219726.0, + "step": 920 + }, + { + "epoch": 0.08551532033426183, + "grad_norm": 2.1097576056092135, + "learning_rate": 5e-06, + "loss": 0.5943, + "num_tokens": 13234442.0, + "step": 921 + }, + { + "epoch": 0.08560817084493964, + "grad_norm": 1.9587636133796358, + "learning_rate": 5e-06, + "loss": 0.5014, + "num_tokens": 13249770.0, + "step": 922 + }, + { + "epoch": 0.08570102135561745, + "grad_norm": 2.2185034191409247, + "learning_rate": 5e-06, + "loss": 0.7009, + "num_tokens": 13265309.0, + "step": 923 + }, + { + "epoch": 0.08579387186629527, + "grad_norm": 2.0848122504215256, + "learning_rate": 5e-06, + "loss": 0.5756, + "num_tokens": 13279283.0, + "step": 924 + }, + { + "epoch": 0.08588672237697308, + "grad_norm": 2.0862514196399515, + "learning_rate": 5e-06, + "loss": 0.552, + "num_tokens": 13294330.0, + "step": 925 + }, + { + "epoch": 0.08597957288765089, + "grad_norm": 2.130708763746173, + "learning_rate": 5e-06, + "loss": 0.6133, + "num_tokens": 13308211.0, + "step": 926 + }, + { + "epoch": 0.08607242339832868, + "grad_norm": 2.176896272958518, + "learning_rate": 5e-06, + "loss": 0.6308, + "num_tokens": 13322823.0, + "step": 927 + }, + { + "epoch": 0.0861652739090065, + "grad_norm": 2.1245689282304516, + "learning_rate": 5e-06, + "loss": 0.5755, + "num_tokens": 13337523.0, + "step": 928 + }, + { + "epoch": 0.0862581244196843, + "grad_norm": 2.125274049929472, + "learning_rate": 5e-06, + "loss": 0.6288, + "num_tokens": 13351935.0, + "step": 929 + }, + { + "epoch": 0.08635097493036212, + "grad_norm": 2.0561744043709136, + "learning_rate": 5e-06, + "loss": 0.6569, + "num_tokens": 13368072.0, + "step": 930 + }, + { + "epoch": 0.08644382544103993, + "grad_norm": 2.151542216371963, + "learning_rate": 5e-06, + "loss": 0.5666, + "num_tokens": 13381699.0, + "step": 931 + }, + { + "epoch": 0.08653667595171774, + "grad_norm": 1.9584144790792886, + "learning_rate": 5e-06, + "loss": 0.5416, + "num_tokens": 13395719.0, + "step": 932 + }, + { + "epoch": 0.08662952646239554, + "grad_norm": 1.9526260881986217, + "learning_rate": 5e-06, + "loss": 0.5846, + "num_tokens": 13409996.0, + "step": 933 + }, + { + "epoch": 0.08672237697307335, + "grad_norm": 1.974799781786812, + "learning_rate": 5e-06, + "loss": 0.5854, + "num_tokens": 13425517.0, + "step": 934 + }, + { + "epoch": 0.08681522748375116, + "grad_norm": 2.0442647207354314, + "learning_rate": 5e-06, + "loss": 0.6149, + "num_tokens": 13438880.0, + "step": 935 + }, + { + "epoch": 0.08690807799442897, + "grad_norm": 2.170970488092463, + "learning_rate": 5e-06, + "loss": 0.6853, + "num_tokens": 13454089.0, + "step": 936 + }, + { + "epoch": 0.08700092850510678, + "grad_norm": 1.8721062414744214, + "learning_rate": 5e-06, + "loss": 0.4687, + "num_tokens": 13469266.0, + "step": 937 + }, + { + "epoch": 0.08709377901578459, + "grad_norm": 2.2113566546380263, + "learning_rate": 5e-06, + "loss": 0.5846, + "num_tokens": 13483139.0, + "step": 938 + }, + { + "epoch": 0.08718662952646239, + "grad_norm": 2.2429698010734733, + "learning_rate": 5e-06, + "loss": 0.711, + "num_tokens": 13496471.0, + "step": 939 + }, + { + "epoch": 0.0872794800371402, + "grad_norm": 2.1270026604532664, + "learning_rate": 5e-06, + "loss": 0.6196, + "num_tokens": 13510570.0, + "step": 940 + }, + { + "epoch": 0.08737233054781801, + "grad_norm": 2.119059724541826, + "learning_rate": 5e-06, + "loss": 0.5561, + "num_tokens": 13523912.0, + "step": 941 + }, + { + "epoch": 0.08746518105849582, + "grad_norm": 2.133499397259066, + "learning_rate": 5e-06, + "loss": 0.5683, + "num_tokens": 13537192.0, + "step": 942 + }, + { + "epoch": 0.08755803156917363, + "grad_norm": 1.9792694207162467, + "learning_rate": 5e-06, + "loss": 0.5763, + "num_tokens": 13552845.0, + "step": 943 + }, + { + "epoch": 0.08765088207985144, + "grad_norm": 2.046493740005975, + "learning_rate": 5e-06, + "loss": 0.5526, + "num_tokens": 13566876.0, + "step": 944 + }, + { + "epoch": 0.08774373259052924, + "grad_norm": 2.0501920182650175, + "learning_rate": 5e-06, + "loss": 0.5812, + "num_tokens": 13581307.0, + "step": 945 + }, + { + "epoch": 0.08783658310120705, + "grad_norm": 2.0394237039850163, + "learning_rate": 5e-06, + "loss": 0.6122, + "num_tokens": 13596357.0, + "step": 946 + }, + { + "epoch": 0.08792943361188486, + "grad_norm": 2.068514889517226, + "learning_rate": 5e-06, + "loss": 0.6476, + "num_tokens": 13610378.0, + "step": 947 + }, + { + "epoch": 0.08802228412256267, + "grad_norm": 2.057736481070177, + "learning_rate": 5e-06, + "loss": 0.5403, + "num_tokens": 13624566.0, + "step": 948 + }, + { + "epoch": 0.08811513463324049, + "grad_norm": 2.0037953345930384, + "learning_rate": 5e-06, + "loss": 0.5782, + "num_tokens": 13638889.0, + "step": 949 + }, + { + "epoch": 0.0882079851439183, + "grad_norm": 2.1788345355607364, + "learning_rate": 5e-06, + "loss": 0.557, + "num_tokens": 13652289.0, + "step": 950 + }, + { + "epoch": 0.0883008356545961, + "grad_norm": 1.9404370102143804, + "learning_rate": 5e-06, + "loss": 0.5561, + "num_tokens": 13666651.0, + "step": 951 + }, + { + "epoch": 0.0883936861652739, + "grad_norm": 2.0352460548198477, + "learning_rate": 5e-06, + "loss": 0.5975, + "num_tokens": 13681701.0, + "step": 952 + }, + { + "epoch": 0.08848653667595172, + "grad_norm": 2.045746106325808, + "learning_rate": 5e-06, + "loss": 0.6099, + "num_tokens": 13696268.0, + "step": 953 + }, + { + "epoch": 0.08857938718662953, + "grad_norm": 2.0323154514719937, + "learning_rate": 5e-06, + "loss": 0.5596, + "num_tokens": 13710959.0, + "step": 954 + }, + { + "epoch": 0.08867223769730734, + "grad_norm": 2.139187556144668, + "learning_rate": 5e-06, + "loss": 0.4731, + "num_tokens": 13723369.0, + "step": 955 + }, + { + "epoch": 0.08876508820798515, + "grad_norm": 1.9023206034287157, + "learning_rate": 5e-06, + "loss": 0.4865, + "num_tokens": 13738598.0, + "step": 956 + }, + { + "epoch": 0.08885793871866295, + "grad_norm": 2.1192709162112533, + "learning_rate": 5e-06, + "loss": 0.5969, + "num_tokens": 13752823.0, + "step": 957 + }, + { + "epoch": 0.08895078922934076, + "grad_norm": 2.175470612432415, + "learning_rate": 5e-06, + "loss": 0.5161, + "num_tokens": 13766098.0, + "step": 958 + }, + { + "epoch": 0.08904363974001857, + "grad_norm": 2.078619432623572, + "learning_rate": 5e-06, + "loss": 0.6158, + "num_tokens": 13781065.0, + "step": 959 + }, + { + "epoch": 0.08913649025069638, + "grad_norm": 2.019135890196655, + "learning_rate": 5e-06, + "loss": 0.5774, + "num_tokens": 13796809.0, + "step": 960 + }, + { + "epoch": 0.08922934076137419, + "grad_norm": 2.0613167241699664, + "learning_rate": 5e-06, + "loss": 0.5809, + "num_tokens": 13811892.0, + "step": 961 + }, + { + "epoch": 0.089322191272052, + "grad_norm": 2.105603465419038, + "learning_rate": 5e-06, + "loss": 0.5957, + "num_tokens": 13825868.0, + "step": 962 + }, + { + "epoch": 0.0894150417827298, + "grad_norm": 2.032587673509969, + "learning_rate": 5e-06, + "loss": 0.562, + "num_tokens": 13839982.0, + "step": 963 + }, + { + "epoch": 0.08950789229340761, + "grad_norm": 2.1535148598750244, + "learning_rate": 5e-06, + "loss": 0.6506, + "num_tokens": 13853507.0, + "step": 964 + }, + { + "epoch": 0.08960074280408542, + "grad_norm": 2.108506512695131, + "learning_rate": 5e-06, + "loss": 0.5623, + "num_tokens": 13867298.0, + "step": 965 + }, + { + "epoch": 0.08969359331476323, + "grad_norm": 2.177797676944205, + "learning_rate": 5e-06, + "loss": 0.6755, + "num_tokens": 13882070.0, + "step": 966 + }, + { + "epoch": 0.08978644382544104, + "grad_norm": 2.0919915762215844, + "learning_rate": 5e-06, + "loss": 0.5663, + "num_tokens": 13895278.0, + "step": 967 + }, + { + "epoch": 0.08987929433611885, + "grad_norm": 2.0791539473737655, + "learning_rate": 5e-06, + "loss": 0.658, + "num_tokens": 13909332.0, + "step": 968 + }, + { + "epoch": 0.08997214484679665, + "grad_norm": 2.0760549211969157, + "learning_rate": 5e-06, + "loss": 0.6306, + "num_tokens": 13924429.0, + "step": 969 + }, + { + "epoch": 0.09006499535747446, + "grad_norm": 2.048169656896346, + "learning_rate": 5e-06, + "loss": 0.5708, + "num_tokens": 13938428.0, + "step": 970 + }, + { + "epoch": 0.09015784586815227, + "grad_norm": 2.054561782977519, + "learning_rate": 5e-06, + "loss": 0.5971, + "num_tokens": 13953298.0, + "step": 971 + }, + { + "epoch": 0.09025069637883008, + "grad_norm": 2.047574135043119, + "learning_rate": 5e-06, + "loss": 0.5425, + "num_tokens": 13967091.0, + "step": 972 + }, + { + "epoch": 0.0903435468895079, + "grad_norm": 2.134898212862876, + "learning_rate": 5e-06, + "loss": 0.5444, + "num_tokens": 13980856.0, + "step": 973 + }, + { + "epoch": 0.0904363974001857, + "grad_norm": 2.1046763643767394, + "learning_rate": 5e-06, + "loss": 0.7496, + "num_tokens": 13996372.0, + "step": 974 + }, + { + "epoch": 0.0905292479108635, + "grad_norm": 1.982245645684972, + "learning_rate": 5e-06, + "loss": 0.5667, + "num_tokens": 14010422.0, + "step": 975 + }, + { + "epoch": 0.09062209842154131, + "grad_norm": 2.0013288334802124, + "learning_rate": 5e-06, + "loss": 0.5815, + "num_tokens": 14024188.0, + "step": 976 + }, + { + "epoch": 0.09071494893221912, + "grad_norm": 2.009152244627321, + "learning_rate": 5e-06, + "loss": 0.6377, + "num_tokens": 14039431.0, + "step": 977 + }, + { + "epoch": 0.09080779944289694, + "grad_norm": 1.9449540951472217, + "learning_rate": 5e-06, + "loss": 0.5392, + "num_tokens": 14055495.0, + "step": 978 + }, + { + "epoch": 0.09090064995357475, + "grad_norm": 2.053889692790374, + "learning_rate": 5e-06, + "loss": 0.6066, + "num_tokens": 14069685.0, + "step": 979 + }, + { + "epoch": 0.09099350046425256, + "grad_norm": 1.922472284407275, + "learning_rate": 5e-06, + "loss": 0.5126, + "num_tokens": 14084354.0, + "step": 980 + }, + { + "epoch": 0.09108635097493037, + "grad_norm": 1.9350707098143827, + "learning_rate": 5e-06, + "loss": 0.5602, + "num_tokens": 14100071.0, + "step": 981 + }, + { + "epoch": 0.09117920148560817, + "grad_norm": 2.1777876019401687, + "learning_rate": 5e-06, + "loss": 0.6552, + "num_tokens": 14115070.0, + "step": 982 + }, + { + "epoch": 0.09127205199628598, + "grad_norm": 2.21254671742243, + "learning_rate": 5e-06, + "loss": 0.6114, + "num_tokens": 14129577.0, + "step": 983 + }, + { + "epoch": 0.09136490250696379, + "grad_norm": 2.0666169807773893, + "learning_rate": 5e-06, + "loss": 0.6047, + "num_tokens": 14144701.0, + "step": 984 + }, + { + "epoch": 0.0914577530176416, + "grad_norm": 2.044990588108537, + "learning_rate": 5e-06, + "loss": 0.5575, + "num_tokens": 14158663.0, + "step": 985 + }, + { + "epoch": 0.09155060352831941, + "grad_norm": 1.861092848477731, + "learning_rate": 5e-06, + "loss": 0.5158, + "num_tokens": 14174441.0, + "step": 986 + }, + { + "epoch": 0.09164345403899722, + "grad_norm": 2.0523590778868344, + "learning_rate": 5e-06, + "loss": 0.574, + "num_tokens": 14189387.0, + "step": 987 + }, + { + "epoch": 0.09173630454967502, + "grad_norm": 2.169412047678863, + "learning_rate": 5e-06, + "loss": 0.6172, + "num_tokens": 14202433.0, + "step": 988 + }, + { + "epoch": 0.09182915506035283, + "grad_norm": 2.111004773087805, + "learning_rate": 5e-06, + "loss": 0.5982, + "num_tokens": 14216995.0, + "step": 989 + }, + { + "epoch": 0.09192200557103064, + "grad_norm": 2.100600781577477, + "learning_rate": 5e-06, + "loss": 0.6516, + "num_tokens": 14231851.0, + "step": 990 + }, + { + "epoch": 0.09201485608170845, + "grad_norm": 2.0540531499947616, + "learning_rate": 5e-06, + "loss": 0.5151, + "num_tokens": 14247577.0, + "step": 991 + }, + { + "epoch": 0.09210770659238626, + "grad_norm": 2.0783719672029326, + "learning_rate": 5e-06, + "loss": 0.6329, + "num_tokens": 14261597.0, + "step": 992 + }, + { + "epoch": 0.09220055710306407, + "grad_norm": 2.079813576697321, + "learning_rate": 5e-06, + "loss": 0.6648, + "num_tokens": 14276102.0, + "step": 993 + }, + { + "epoch": 0.09229340761374187, + "grad_norm": 1.9711651794974305, + "learning_rate": 5e-06, + "loss": 0.5709, + "num_tokens": 14291109.0, + "step": 994 + }, + { + "epoch": 0.09238625812441968, + "grad_norm": 2.129503985557226, + "learning_rate": 5e-06, + "loss": 0.6654, + "num_tokens": 14305965.0, + "step": 995 + }, + { + "epoch": 0.09247910863509749, + "grad_norm": 2.0002449383285, + "learning_rate": 5e-06, + "loss": 0.5273, + "num_tokens": 14320779.0, + "step": 996 + }, + { + "epoch": 0.0925719591457753, + "grad_norm": 1.8771941543949366, + "learning_rate": 5e-06, + "loss": 0.5457, + "num_tokens": 14335958.0, + "step": 997 + }, + { + "epoch": 0.09266480965645311, + "grad_norm": 2.077293562897452, + "learning_rate": 5e-06, + "loss": 0.5109, + "num_tokens": 14348529.0, + "step": 998 + }, + { + "epoch": 0.09275766016713093, + "grad_norm": 2.0287415601278953, + "learning_rate": 5e-06, + "loss": 0.5586, + "num_tokens": 14363371.0, + "step": 999 + }, + { + "epoch": 0.09285051067780872, + "grad_norm": 2.0690957396096024, + "learning_rate": 5e-06, + "loss": 0.6005, + "num_tokens": 14378247.0, + "step": 1000 + }, + { + "epoch": 0.09285051067780872, + "step": 1000, + "total_flos": 5827533262848.0, + "train_loss": 0.6309259505271911, + "train_runtime": 3423.5085, + "train_samples_per_second": 2.337, + "train_steps_per_second": 0.292 + } + ], + "logging_steps": 1, + "max_steps": 1000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5827533262848.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}