{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 3284, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0060901339829476245, "grad_norm": 23.5319766998291, "learning_rate": 1.2162162162162164e-05, "loss": 4.5905, "mean_token_accuracy": 0.3401473943144083, "num_tokens": 132681.0, "step": 10 }, { "epoch": 0.012180267965895249, "grad_norm": 6.916630744934082, "learning_rate": 2.5675675675675675e-05, "loss": 3.957, "mean_token_accuracy": 0.3799716055393219, "num_tokens": 264238.0, "step": 20 }, { "epoch": 0.018270401948842874, "grad_norm": 2.717982292175293, "learning_rate": 3.918918918918919e-05, "loss": 3.047, "mean_token_accuracy": 0.46811963245272636, "num_tokens": 401308.0, "step": 30 }, { "epoch": 0.024360535931790498, "grad_norm": 2.407865524291992, "learning_rate": 5.27027027027027e-05, "loss": 2.5322, "mean_token_accuracy": 0.5190828196704388, "num_tokens": 532444.0, "step": 40 }, { "epoch": 0.030450669914738125, "grad_norm": 1.018301248550415, "learning_rate": 6.621621621621621e-05, "loss": 2.1325, "mean_token_accuracy": 0.5782605841755867, "num_tokens": 660406.0, "step": 50 }, { "epoch": 0.03654080389768575, "grad_norm": 0.7315741181373596, "learning_rate": 7.972972972972974e-05, "loss": 1.9044, "mean_token_accuracy": 0.6264914631843567, "num_tokens": 795304.0, "step": 60 }, { "epoch": 0.04263093788063337, "grad_norm": 0.6652920246124268, "learning_rate": 9.324324324324324e-05, "loss": 1.633, "mean_token_accuracy": 0.6683938711881637, "num_tokens": 934543.0, "step": 70 }, { "epoch": 0.048721071863580996, "grad_norm": 0.6119660139083862, "learning_rate": 0.00010675675675675677, "loss": 1.543, "mean_token_accuracy": 0.6834091022610664, "num_tokens": 1070669.0, "step": 80 }, { "epoch": 0.05481120584652863, "grad_norm": 0.591424286365509, "learning_rate": 0.00012027027027027027, "loss": 1.4154, "mean_token_accuracy": 0.6991497233510018, "num_tokens": 1211114.0, "step": 90 }, { "epoch": 0.06090133982947625, "grad_norm": 0.5663530230522156, "learning_rate": 0.0001337837837837838, "loss": 1.3176, "mean_token_accuracy": 0.7089206710457802, "num_tokens": 1349584.0, "step": 100 }, { "epoch": 0.06699147381242387, "grad_norm": 0.5881878137588501, "learning_rate": 0.0001472972972972973, "loss": 1.2293, "mean_token_accuracy": 0.7254403859376908, "num_tokens": 1487515.0, "step": 110 }, { "epoch": 0.0730816077953715, "grad_norm": 0.7664394974708557, "learning_rate": 0.00016081081081081083, "loss": 1.1814, "mean_token_accuracy": 0.7306812778115273, "num_tokens": 1618603.0, "step": 120 }, { "epoch": 0.07917174177831912, "grad_norm": 0.6155670881271362, "learning_rate": 0.00017432432432432432, "loss": 1.1967, "mean_token_accuracy": 0.7284250959753991, "num_tokens": 1750466.0, "step": 130 }, { "epoch": 0.08526187576126674, "grad_norm": 0.5296258330345154, "learning_rate": 0.00018783783783783784, "loss": 1.0955, "mean_token_accuracy": 0.7472824215888977, "num_tokens": 1887913.0, "step": 140 }, { "epoch": 0.09135200974421437, "grad_norm": 0.5564976334571838, "learning_rate": 0.00019999998054550544, "loss": 1.118, "mean_token_accuracy": 0.7397311359643937, "num_tokens": 2018579.0, "step": 150 }, { "epoch": 0.09744214372716199, "grad_norm": 0.5301142930984497, "learning_rate": 0.00019999764601633156, "loss": 1.045, "mean_token_accuracy": 0.7519380420446395, "num_tokens": 2158851.0, "step": 160 }, { "epoch": 0.10353227771010962, "grad_norm": 0.5949111580848694, "learning_rate": 0.00019999142070388495, "loss": 1.0497, "mean_token_accuracy": 0.7520910769701004, "num_tokens": 2296715.0, "step": 170 }, { "epoch": 0.10962241169305725, "grad_norm": 0.6169262528419495, "learning_rate": 0.0001999813048772986, "loss": 1.0821, "mean_token_accuracy": 0.7406247839331627, "num_tokens": 2424756.0, "step": 180 }, { "epoch": 0.11571254567600488, "grad_norm": 0.58912593126297, "learning_rate": 0.00019996729897390057, "loss": 1.0286, "mean_token_accuracy": 0.7527454376220704, "num_tokens": 2559362.0, "step": 190 }, { "epoch": 0.1218026796589525, "grad_norm": 0.5084304213523865, "learning_rate": 0.00019994940359919483, "loss": 0.992, "mean_token_accuracy": 0.7640391126275062, "num_tokens": 2700231.0, "step": 200 }, { "epoch": 0.1278928136419001, "grad_norm": 0.5790796279907227, "learning_rate": 0.00019992761952683516, "loss": 1.0146, "mean_token_accuracy": 0.7554366230964661, "num_tokens": 2831324.0, "step": 210 }, { "epoch": 0.13398294762484775, "grad_norm": 0.5852051377296448, "learning_rate": 0.00019990194769859188, "loss": 0.978, "mean_token_accuracy": 0.7612502485513687, "num_tokens": 2967346.0, "step": 220 }, { "epoch": 0.14007308160779536, "grad_norm": 0.5102785229682922, "learning_rate": 0.00019987238922431088, "loss": 0.9616, "mean_token_accuracy": 0.7677591517567635, "num_tokens": 3110936.0, "step": 230 }, { "epoch": 0.146163215590743, "grad_norm": 0.5472669005393982, "learning_rate": 0.00019983894538186576, "loss": 0.9535, "mean_token_accuracy": 0.76737689524889, "num_tokens": 3247496.0, "step": 240 }, { "epoch": 0.15225334957369063, "grad_norm": 0.5611053109169006, "learning_rate": 0.0001998016176171026, "loss": 0.9577, "mean_token_accuracy": 0.7626092001795769, "num_tokens": 3384178.0, "step": 250 }, { "epoch": 0.15834348355663824, "grad_norm": 0.54055255651474, "learning_rate": 0.0001997604075437774, "loss": 0.9907, "mean_token_accuracy": 0.7575223430991173, "num_tokens": 3517617.0, "step": 260 }, { "epoch": 0.16443361753958588, "grad_norm": 0.558316707611084, "learning_rate": 0.0001997153169434864, "loss": 0.944, "mean_token_accuracy": 0.7664194419980049, "num_tokens": 3662878.0, "step": 270 }, { "epoch": 0.1705237515225335, "grad_norm": 0.49766939878463745, "learning_rate": 0.0001996663477655889, "loss": 0.9106, "mean_token_accuracy": 0.7760038167238236, "num_tokens": 3807411.0, "step": 280 }, { "epoch": 0.17661388550548113, "grad_norm": 0.4953667223453522, "learning_rate": 0.0001996135021271232, "loss": 0.9687, "mean_token_accuracy": 0.7605679705739021, "num_tokens": 3936840.0, "step": 290 }, { "epoch": 0.18270401948842874, "grad_norm": 0.5447947978973389, "learning_rate": 0.00019955678231271484, "loss": 0.9625, "mean_token_accuracy": 0.7603292793035508, "num_tokens": 4067826.0, "step": 300 }, { "epoch": 0.18879415347137637, "grad_norm": 0.4665842056274414, "learning_rate": 0.00019949619077447807, "loss": 0.9372, "mean_token_accuracy": 0.7676101759076118, "num_tokens": 4205887.0, "step": 310 }, { "epoch": 0.19488428745432398, "grad_norm": 0.515690267086029, "learning_rate": 0.00019943173013190965, "loss": 0.923, "mean_token_accuracy": 0.7708473294973374, "num_tokens": 4342894.0, "step": 320 }, { "epoch": 0.20097442143727162, "grad_norm": 0.5831382274627686, "learning_rate": 0.00019936340317177565, "loss": 0.9203, "mean_token_accuracy": 0.7708552837371826, "num_tokens": 4477651.0, "step": 330 }, { "epoch": 0.20706455542021923, "grad_norm": 0.6162773966789246, "learning_rate": 0.0001992912128479911, "loss": 0.916, "mean_token_accuracy": 0.7702088996767997, "num_tokens": 4610746.0, "step": 340 }, { "epoch": 0.21315468940316687, "grad_norm": 0.5172462463378906, "learning_rate": 0.00019921516228149207, "loss": 0.8942, "mean_token_accuracy": 0.7741821393370628, "num_tokens": 4751175.0, "step": 350 }, { "epoch": 0.2192448233861145, "grad_norm": 0.5890468955039978, "learning_rate": 0.0001991352547601009, "loss": 0.9229, "mean_token_accuracy": 0.7691043332219124, "num_tokens": 4882328.0, "step": 360 }, { "epoch": 0.22533495736906212, "grad_norm": 0.5522404909133911, "learning_rate": 0.00019905149373838408, "loss": 0.9294, "mean_token_accuracy": 0.7646071568131447, "num_tokens": 5012181.0, "step": 370 }, { "epoch": 0.23142509135200975, "grad_norm": 0.5349445939064026, "learning_rate": 0.0001989638828375028, "loss": 0.8797, "mean_token_accuracy": 0.7771721839904785, "num_tokens": 5151133.0, "step": 380 }, { "epoch": 0.23751522533495736, "grad_norm": 0.531052827835083, "learning_rate": 0.00019887242584505635, "loss": 0.9221, "mean_token_accuracy": 0.7678465083241462, "num_tokens": 5279790.0, "step": 390 }, { "epoch": 0.243605359317905, "grad_norm": 0.5126324892044067, "learning_rate": 0.00019877712671491864, "loss": 0.8862, "mean_token_accuracy": 0.7739894777536392, "num_tokens": 5412390.0, "step": 400 }, { "epoch": 0.2496954933008526, "grad_norm": 0.5111438632011414, "learning_rate": 0.00019867798956706693, "loss": 0.9005, "mean_token_accuracy": 0.7721902653574944, "num_tokens": 5545801.0, "step": 410 }, { "epoch": 0.2557856272838002, "grad_norm": 0.5488138794898987, "learning_rate": 0.00019857501868740402, "loss": 0.8988, "mean_token_accuracy": 0.7690282896161079, "num_tokens": 5673758.0, "step": 420 }, { "epoch": 0.2618757612667479, "grad_norm": 0.5497994422912598, "learning_rate": 0.0001984682185275727, "loss": 0.8802, "mean_token_accuracy": 0.7780183687806129, "num_tokens": 5813158.0, "step": 430 }, { "epoch": 0.2679658952496955, "grad_norm": 0.5478431582450867, "learning_rate": 0.0001983575937047635, "loss": 0.865, "mean_token_accuracy": 0.7785944610834121, "num_tokens": 5947367.0, "step": 440 }, { "epoch": 0.2740560292326431, "grad_norm": 0.5188766717910767, "learning_rate": 0.00019824314900151487, "loss": 0.8798, "mean_token_accuracy": 0.7752803862094879, "num_tokens": 6081060.0, "step": 450 }, { "epoch": 0.2801461632155907, "grad_norm": 0.530222475528717, "learning_rate": 0.00019812488936550666, "loss": 0.8628, "mean_token_accuracy": 0.7801630645990372, "num_tokens": 6217834.0, "step": 460 }, { "epoch": 0.2862362971985384, "grad_norm": 0.5987964868545532, "learning_rate": 0.00019800281990934614, "loss": 0.8775, "mean_token_accuracy": 0.7760324433445931, "num_tokens": 6350451.0, "step": 470 }, { "epoch": 0.292326431181486, "grad_norm": 0.5468559265136719, "learning_rate": 0.0001978769459103468, "loss": 0.8721, "mean_token_accuracy": 0.7794204503297806, "num_tokens": 6484738.0, "step": 480 }, { "epoch": 0.2984165651644336, "grad_norm": 0.5541098117828369, "learning_rate": 0.0001977472728103005, "loss": 0.8785, "mean_token_accuracy": 0.7767582029104233, "num_tokens": 6619313.0, "step": 490 }, { "epoch": 0.30450669914738127, "grad_norm": 0.5134281516075134, "learning_rate": 0.0001976138062152419, "loss": 0.8717, "mean_token_accuracy": 0.7752724394202233, "num_tokens": 6753195.0, "step": 500 }, { "epoch": 0.3105968331303289, "grad_norm": 0.49164435267448425, "learning_rate": 0.00019747655189520633, "loss": 0.8757, "mean_token_accuracy": 0.7768464118242264, "num_tokens": 6890448.0, "step": 510 }, { "epoch": 0.3166869671132765, "grad_norm": 0.5899345278739929, "learning_rate": 0.00019733551578398023, "loss": 0.8322, "mean_token_accuracy": 0.7859320402145386, "num_tokens": 7027488.0, "step": 520 }, { "epoch": 0.3227771010962241, "grad_norm": 0.6552841663360596, "learning_rate": 0.0001971907039788447, "loss": 0.861, "mean_token_accuracy": 0.7770532324910164, "num_tokens": 7161184.0, "step": 530 }, { "epoch": 0.32886723507917176, "grad_norm": 0.5038822889328003, "learning_rate": 0.0001970421227403117, "loss": 0.8825, "mean_token_accuracy": 0.775890800356865, "num_tokens": 7294399.0, "step": 540 }, { "epoch": 0.33495736906211937, "grad_norm": 0.5094267129898071, "learning_rate": 0.00019688977849185378, "loss": 0.8598, "mean_token_accuracy": 0.7817838475108146, "num_tokens": 7427183.0, "step": 550 }, { "epoch": 0.341047503045067, "grad_norm": 0.5282809138298035, "learning_rate": 0.00019673367781962594, "loss": 0.8463, "mean_token_accuracy": 0.7812959104776382, "num_tokens": 7561734.0, "step": 560 }, { "epoch": 0.3471376370280146, "grad_norm": 0.45355409383773804, "learning_rate": 0.00019657382747218123, "loss": 0.8207, "mean_token_accuracy": 0.7888262197375298, "num_tokens": 7706228.0, "step": 570 }, { "epoch": 0.35322777101096225, "grad_norm": 0.5162333846092224, "learning_rate": 0.00019641023436017883, "loss": 0.8235, "mean_token_accuracy": 0.7868947923183441, "num_tokens": 7846684.0, "step": 580 }, { "epoch": 0.35931790499390986, "grad_norm": 0.5194632411003113, "learning_rate": 0.00019624290555608526, "loss": 0.8129, "mean_token_accuracy": 0.7884069249033928, "num_tokens": 7986811.0, "step": 590 }, { "epoch": 0.3654080389768575, "grad_norm": 0.5494846701622009, "learning_rate": 0.00019607184829386882, "loss": 0.8084, "mean_token_accuracy": 0.7874000474810601, "num_tokens": 8124538.0, "step": 600 }, { "epoch": 0.37149817295980514, "grad_norm": 0.5368776917457581, "learning_rate": 0.0001958970699686866, "loss": 0.8225, "mean_token_accuracy": 0.783010233938694, "num_tokens": 8260529.0, "step": 610 }, { "epoch": 0.37758830694275275, "grad_norm": 0.6229024529457092, "learning_rate": 0.00019571857813656496, "loss": 0.8786, "mean_token_accuracy": 0.7753148928284646, "num_tokens": 8389042.0, "step": 620 }, { "epoch": 0.38367844092570036, "grad_norm": 0.5601000785827637, "learning_rate": 0.00019553638051407279, "loss": 0.8909, "mean_token_accuracy": 0.7745720192790031, "num_tokens": 8513603.0, "step": 630 }, { "epoch": 0.38976857490864797, "grad_norm": 0.438970685005188, "learning_rate": 0.0001953504849779879, "loss": 0.8085, "mean_token_accuracy": 0.7871840804815292, "num_tokens": 8652970.0, "step": 640 }, { "epoch": 0.39585870889159563, "grad_norm": 0.5505132079124451, "learning_rate": 0.00019516089956495648, "loss": 0.8102, "mean_token_accuracy": 0.7869585514068603, "num_tokens": 8792103.0, "step": 650 }, { "epoch": 0.40194884287454324, "grad_norm": 0.5447221398353577, "learning_rate": 0.00019496763247114581, "loss": 0.8336, "mean_token_accuracy": 0.7816034242510795, "num_tokens": 8926853.0, "step": 660 }, { "epoch": 0.40803897685749085, "grad_norm": 0.4652746915817261, "learning_rate": 0.00019477069205188965, "loss": 0.8383, "mean_token_accuracy": 0.7826304718852043, "num_tokens": 9059592.0, "step": 670 }, { "epoch": 0.41412911084043846, "grad_norm": 0.42363590002059937, "learning_rate": 0.00019457008682132726, "loss": 0.847, "mean_token_accuracy": 0.7810002073645592, "num_tokens": 9193062.0, "step": 680 }, { "epoch": 0.42021924482338613, "grad_norm": 0.5209478735923767, "learning_rate": 0.00019436582545203518, "loss": 0.8766, "mean_token_accuracy": 0.7733785718679428, "num_tokens": 9315805.0, "step": 690 }, { "epoch": 0.42630937880633374, "grad_norm": 0.5176642537117004, "learning_rate": 0.00019415791677465237, "loss": 0.8155, "mean_token_accuracy": 0.7869213685393334, "num_tokens": 9448863.0, "step": 700 }, { "epoch": 0.43239951278928135, "grad_norm": 0.4531058371067047, "learning_rate": 0.00019394636977749843, "loss": 0.8096, "mean_token_accuracy": 0.7903949975967407, "num_tokens": 9589382.0, "step": 710 }, { "epoch": 0.438489646772229, "grad_norm": 0.5651549100875854, "learning_rate": 0.000193731193606185, "loss": 0.8263, "mean_token_accuracy": 0.7823062822222709, "num_tokens": 9723562.0, "step": 720 }, { "epoch": 0.4445797807551766, "grad_norm": 0.5377989411354065, "learning_rate": 0.00019351239756322031, "loss": 0.7993, "mean_token_accuracy": 0.7908329650759697, "num_tokens": 9859255.0, "step": 730 }, { "epoch": 0.45066991473812423, "grad_norm": 0.5420868396759033, "learning_rate": 0.00019328999110760722, "loss": 0.8461, "mean_token_accuracy": 0.7780480548739434, "num_tokens": 9981578.0, "step": 740 }, { "epoch": 0.45676004872107184, "grad_norm": 0.4889216125011444, "learning_rate": 0.000193063983854434, "loss": 0.7652, "mean_token_accuracy": 0.7959530428051949, "num_tokens": 10122922.0, "step": 750 }, { "epoch": 0.4628501827040195, "grad_norm": 0.5044087767601013, "learning_rate": 0.00019283438557445893, "loss": 0.824, "mean_token_accuracy": 0.7845935523509979, "num_tokens": 10252854.0, "step": 760 }, { "epoch": 0.4689403166869671, "grad_norm": 0.5286466479301453, "learning_rate": 0.00019260120619368773, "loss": 0.815, "mean_token_accuracy": 0.7850656941533088, "num_tokens": 10385075.0, "step": 770 }, { "epoch": 0.47503045066991473, "grad_norm": 0.5441628694534302, "learning_rate": 0.00019236445579294437, "loss": 0.8048, "mean_token_accuracy": 0.7876680314540863, "num_tokens": 10520011.0, "step": 780 }, { "epoch": 0.48112058465286234, "grad_norm": 0.49002447724342346, "learning_rate": 0.0001921241446074355, "loss": 0.8059, "mean_token_accuracy": 0.7898563235998154, "num_tokens": 10652488.0, "step": 790 }, { "epoch": 0.48721071863581, "grad_norm": 0.4479144811630249, "learning_rate": 0.0001918802830263077, "loss": 0.7913, "mean_token_accuracy": 0.7928732186555862, "num_tokens": 10785974.0, "step": 800 }, { "epoch": 0.4933008526187576, "grad_norm": 0.5007497668266296, "learning_rate": 0.00019163288159219853, "loss": 0.8083, "mean_token_accuracy": 0.7893043681979179, "num_tokens": 10920950.0, "step": 810 }, { "epoch": 0.4993909866017052, "grad_norm": 0.5289483070373535, "learning_rate": 0.00019138195100078064, "loss": 0.8033, "mean_token_accuracy": 0.7864485770463944, "num_tokens": 11056380.0, "step": 820 }, { "epoch": 0.5054811205846529, "grad_norm": 0.5604159832000732, "learning_rate": 0.0001911275021002994, "loss": 0.7652, "mean_token_accuracy": 0.7946401730179786, "num_tokens": 11196074.0, "step": 830 }, { "epoch": 0.5115712545676004, "grad_norm": 0.43645399808883667, "learning_rate": 0.00019086954589110397, "loss": 0.7724, "mean_token_accuracy": 0.7990294560790062, "num_tokens": 11337990.0, "step": 840 }, { "epoch": 0.5176613885505481, "grad_norm": 0.43992146849632263, "learning_rate": 0.0001906080935251716, "loss": 0.7612, "mean_token_accuracy": 0.7999786615371705, "num_tokens": 11481565.0, "step": 850 }, { "epoch": 0.5237515225334958, "grad_norm": 0.5595120191574097, "learning_rate": 0.0001903431563056256, "loss": 0.8266, "mean_token_accuracy": 0.7859750911593437, "num_tokens": 11611714.0, "step": 860 }, { "epoch": 0.5298416565164433, "grad_norm": 0.5001987218856812, "learning_rate": 0.0001900747456862467, "loss": 0.8506, "mean_token_accuracy": 0.779585388302803, "num_tokens": 11736573.0, "step": 870 }, { "epoch": 0.535931790499391, "grad_norm": 0.430147647857666, "learning_rate": 0.00018980287327097784, "loss": 0.7707, "mean_token_accuracy": 0.795211361348629, "num_tokens": 11876859.0, "step": 880 }, { "epoch": 0.5420219244823387, "grad_norm": 0.5346289873123169, "learning_rate": 0.00018952755081342245, "loss": 0.8057, "mean_token_accuracy": 0.7871127843856811, "num_tokens": 12007654.0, "step": 890 }, { "epoch": 0.5481120584652862, "grad_norm": 0.46072253584861755, "learning_rate": 0.00018924879021633653, "loss": 0.7924, "mean_token_accuracy": 0.7913773030042648, "num_tokens": 12140520.0, "step": 900 }, { "epoch": 0.5542021924482339, "grad_norm": 0.4803653955459595, "learning_rate": 0.00018896660353111375, "loss": 0.8398, "mean_token_accuracy": 0.7807079553604126, "num_tokens": 12267219.0, "step": 910 }, { "epoch": 0.5602923264311814, "grad_norm": 0.5219636559486389, "learning_rate": 0.0001886810029572647, "loss": 0.7612, "mean_token_accuracy": 0.7993015512824059, "num_tokens": 12404646.0, "step": 920 }, { "epoch": 0.5663824604141291, "grad_norm": 0.501483142375946, "learning_rate": 0.00018839200084188936, "loss": 0.7953, "mean_token_accuracy": 0.787814213335514, "num_tokens": 12538219.0, "step": 930 }, { "epoch": 0.5724725943970768, "grad_norm": 0.47334522008895874, "learning_rate": 0.00018809960967914346, "loss": 0.789, "mean_token_accuracy": 0.7928574904799461, "num_tokens": 12673805.0, "step": 940 }, { "epoch": 0.5785627283800243, "grad_norm": 0.5057492852210999, "learning_rate": 0.00018780384210969806, "loss": 0.7746, "mean_token_accuracy": 0.7947553545236588, "num_tokens": 12811727.0, "step": 950 }, { "epoch": 0.584652862362972, "grad_norm": 0.5179910659790039, "learning_rate": 0.00018750471092019325, "loss": 0.7962, "mean_token_accuracy": 0.7905686929821968, "num_tokens": 12947641.0, "step": 960 }, { "epoch": 0.5907429963459196, "grad_norm": 0.45797088742256165, "learning_rate": 0.00018720222904268543, "loss": 0.7678, "mean_token_accuracy": 0.7969774708151818, "num_tokens": 13083869.0, "step": 970 }, { "epoch": 0.5968331303288672, "grad_norm": 0.48360612988471985, "learning_rate": 0.00018689640955408803, "loss": 0.7996, "mean_token_accuracy": 0.7885591968894005, "num_tokens": 13211807.0, "step": 980 }, { "epoch": 0.6029232643118149, "grad_norm": 0.4378497004508972, "learning_rate": 0.00018658726567560635, "loss": 0.7652, "mean_token_accuracy": 0.7969291344285011, "num_tokens": 13351856.0, "step": 990 }, { "epoch": 0.6090133982947625, "grad_norm": 0.4857536852359772, "learning_rate": 0.00018627481077216577, "loss": 0.7786, "mean_token_accuracy": 0.7914443418383599, "num_tokens": 13486443.0, "step": 1000 }, { "epoch": 0.6151035322777101, "grad_norm": 0.5233064293861389, "learning_rate": 0.0001859590583518343, "loss": 0.8241, "mean_token_accuracy": 0.7811850637197495, "num_tokens": 13612035.0, "step": 1010 }, { "epoch": 0.6211936662606578, "grad_norm": 0.5328738689422607, "learning_rate": 0.00018564002206523816, "loss": 0.7502, "mean_token_accuracy": 0.7993430674076081, "num_tokens": 13756509.0, "step": 1020 }, { "epoch": 0.6272838002436053, "grad_norm": 0.47962310910224915, "learning_rate": 0.000185317715704972, "loss": 0.7984, "mean_token_accuracy": 0.7864531084895134, "num_tokens": 13883033.0, "step": 1030 }, { "epoch": 0.633373934226553, "grad_norm": 0.5685893893241882, "learning_rate": 0.0001849921532050024, "loss": 0.7869, "mean_token_accuracy": 0.7909937381744385, "num_tokens": 14015234.0, "step": 1040 }, { "epoch": 0.6394640682095006, "grad_norm": 0.49146631360054016, "learning_rate": 0.00018466334864006566, "loss": 0.7952, "mean_token_accuracy": 0.7878949210047722, "num_tokens": 14149319.0, "step": 1050 }, { "epoch": 0.6455542021924482, "grad_norm": 0.5556225776672363, "learning_rate": 0.0001843313162250591, "loss": 0.7524, "mean_token_accuracy": 0.7994373366236687, "num_tokens": 14286868.0, "step": 1060 }, { "epoch": 0.6516443361753959, "grad_norm": 0.511379063129425, "learning_rate": 0.00018399607031442666, "loss": 0.7929, "mean_token_accuracy": 0.7921562284231186, "num_tokens": 14418354.0, "step": 1070 }, { "epoch": 0.6577344701583435, "grad_norm": 0.5019840598106384, "learning_rate": 0.00018365762540153836, "loss": 0.758, "mean_token_accuracy": 0.7989353060722351, "num_tokens": 14553174.0, "step": 1080 }, { "epoch": 0.6638246041412911, "grad_norm": 0.6032467484474182, "learning_rate": 0.00018331599611806366, "loss": 0.7888, "mean_token_accuracy": 0.7903819754719734, "num_tokens": 14681393.0, "step": 1090 }, { "epoch": 0.6699147381242387, "grad_norm": 0.5369830131530762, "learning_rate": 0.00018297119723333877, "loss": 0.765, "mean_token_accuracy": 0.7950262635946274, "num_tokens": 14814565.0, "step": 1100 }, { "epoch": 0.6760048721071864, "grad_norm": 0.5289803743362427, "learning_rate": 0.00018262324365372846, "loss": 0.7496, "mean_token_accuracy": 0.8032818242907525, "num_tokens": 14954351.0, "step": 1110 }, { "epoch": 0.682095006090134, "grad_norm": 0.5440439581871033, "learning_rate": 0.0001822721504219814, "loss": 0.7432, "mean_token_accuracy": 0.799126236140728, "num_tokens": 15094879.0, "step": 1120 }, { "epoch": 0.6881851400730816, "grad_norm": 0.46225935220718384, "learning_rate": 0.00018191793271657978, "loss": 0.7513, "mean_token_accuracy": 0.8022688791155815, "num_tokens": 15234906.0, "step": 1130 }, { "epoch": 0.6942752740560292, "grad_norm": 0.5592020750045776, "learning_rate": 0.0001815606058510833, "loss": 0.7583, "mean_token_accuracy": 0.7984497547149658, "num_tokens": 15373526.0, "step": 1140 }, { "epoch": 0.7003654080389768, "grad_norm": 0.525090217590332, "learning_rate": 0.00018120018527346702, "loss": 0.7254, "mean_token_accuracy": 0.8070619881153107, "num_tokens": 15516264.0, "step": 1150 }, { "epoch": 0.7064555420219245, "grad_norm": 0.5380759239196777, "learning_rate": 0.00018083668656545355, "loss": 0.8041, "mean_token_accuracy": 0.7866759791970253, "num_tokens": 15640444.0, "step": 1160 }, { "epoch": 0.7125456760048721, "grad_norm": 0.47815701365470886, "learning_rate": 0.00018047012544183938, "loss": 0.7604, "mean_token_accuracy": 0.796156468987465, "num_tokens": 15778070.0, "step": 1170 }, { "epoch": 0.7186358099878197, "grad_norm": 0.5380450487136841, "learning_rate": 0.00018010051774981553, "loss": 0.8135, "mean_token_accuracy": 0.7842124432325364, "num_tokens": 15899739.0, "step": 1180 }, { "epoch": 0.7247259439707674, "grad_norm": 0.5047502517700195, "learning_rate": 0.00017972787946828246, "loss": 0.7642, "mean_token_accuracy": 0.7989341139793396, "num_tokens": 16035805.0, "step": 1190 }, { "epoch": 0.730816077953715, "grad_norm": 0.5440967679023743, "learning_rate": 0.00017935222670715918, "loss": 0.735, "mean_token_accuracy": 0.8048294603824615, "num_tokens": 16172541.0, "step": 1200 }, { "epoch": 0.7369062119366626, "grad_norm": 0.4766077399253845, "learning_rate": 0.000178973575706687, "loss": 0.805, "mean_token_accuracy": 0.7871790423989296, "num_tokens": 16296988.0, "step": 1210 }, { "epoch": 0.7429963459196103, "grad_norm": 0.4153214991092682, "learning_rate": 0.00017859194283672704, "loss": 0.7635, "mean_token_accuracy": 0.7964595645666123, "num_tokens": 16432022.0, "step": 1220 }, { "epoch": 0.7490864799025578, "grad_norm": 0.4698518216609955, "learning_rate": 0.00017820734459605302, "loss": 0.7397, "mean_token_accuracy": 0.8046972885727882, "num_tokens": 16572880.0, "step": 1230 }, { "epoch": 0.7551766138855055, "grad_norm": 0.46101540327072144, "learning_rate": 0.00017781979761163756, "loss": 0.7174, "mean_token_accuracy": 0.8066875368356705, "num_tokens": 16714419.0, "step": 1240 }, { "epoch": 0.761266747868453, "grad_norm": 0.5313341021537781, "learning_rate": 0.00017742931863793358, "loss": 0.7797, "mean_token_accuracy": 0.7911526098847389, "num_tokens": 16838285.0, "step": 1250 }, { "epoch": 0.7673568818514007, "grad_norm": 0.4627362787723541, "learning_rate": 0.00017703592455614998, "loss": 0.7626, "mean_token_accuracy": 0.7970306649804115, "num_tokens": 16976065.0, "step": 1260 }, { "epoch": 0.7734470158343484, "grad_norm": 0.5429073572158813, "learning_rate": 0.00017663963237352177, "loss": 0.7398, "mean_token_accuracy": 0.8005403786897659, "num_tokens": 17112901.0, "step": 1270 }, { "epoch": 0.7795371498172959, "grad_norm": 0.6781270503997803, "learning_rate": 0.00017624045922257471, "loss": 0.7607, "mean_token_accuracy": 0.7946217939257622, "num_tokens": 17245480.0, "step": 1280 }, { "epoch": 0.7856272838002436, "grad_norm": 0.5227305293083191, "learning_rate": 0.00017583842236038483, "loss": 0.7217, "mean_token_accuracy": 0.8064659267663956, "num_tokens": 17387171.0, "step": 1290 }, { "epoch": 0.7917174177831913, "grad_norm": 0.49253156781196594, "learning_rate": 0.0001754335391678323, "loss": 0.7652, "mean_token_accuracy": 0.7960015773773194, "num_tokens": 17521164.0, "step": 1300 }, { "epoch": 0.7978075517661388, "grad_norm": 0.5103631615638733, "learning_rate": 0.00017502582714884997, "loss": 0.7435, "mean_token_accuracy": 0.7995276898145676, "num_tokens": 17657818.0, "step": 1310 }, { "epoch": 0.8038976857490865, "grad_norm": 0.5531247854232788, "learning_rate": 0.00017461530392966665, "loss": 0.7986, "mean_token_accuracy": 0.7892467245459557, "num_tokens": 17784361.0, "step": 1320 }, { "epoch": 0.8099878197320342, "grad_norm": 0.4574586749076843, "learning_rate": 0.00017420198725804517, "loss": 0.6889, "mean_token_accuracy": 0.8135112956166267, "num_tokens": 17929664.0, "step": 1330 }, { "epoch": 0.8160779537149817, "grad_norm": 0.4734383225440979, "learning_rate": 0.00017378589500251498, "loss": 0.7308, "mean_token_accuracy": 0.8029947131872177, "num_tokens": 18071182.0, "step": 1340 }, { "epoch": 0.8221680876979294, "grad_norm": 0.5192279815673828, "learning_rate": 0.00017336704515159986, "loss": 0.7444, "mean_token_accuracy": 0.8012512847781181, "num_tokens": 18211136.0, "step": 1350 }, { "epoch": 0.8282582216808769, "grad_norm": 0.5378620624542236, "learning_rate": 0.00017294545581303996, "loss": 0.7459, "mean_token_accuracy": 0.7981989249587059, "num_tokens": 18340645.0, "step": 1360 }, { "epoch": 0.8343483556638246, "grad_norm": 0.4879571497440338, "learning_rate": 0.00017252114521300918, "loss": 0.7877, "mean_token_accuracy": 0.7891893342137337, "num_tokens": 18465733.0, "step": 1370 }, { "epoch": 0.8404384896467723, "grad_norm": 0.5297388434410095, "learning_rate": 0.00017209413169532717, "loss": 0.7586, "mean_token_accuracy": 0.797142505645752, "num_tokens": 18598979.0, "step": 1380 }, { "epoch": 0.8465286236297198, "grad_norm": 0.5308396220207214, "learning_rate": 0.00017166443372066618, "loss": 0.7387, "mean_token_accuracy": 0.80123979896307, "num_tokens": 18735919.0, "step": 1390 }, { "epoch": 0.8526187576126675, "grad_norm": 0.49988579750061035, "learning_rate": 0.0001712320698657532, "loss": 0.7425, "mean_token_accuracy": 0.7996803268790245, "num_tokens": 18870877.0, "step": 1400 }, { "epoch": 0.8587088915956151, "grad_norm": 0.5971361994743347, "learning_rate": 0.0001707970588225665, "loss": 0.7691, "mean_token_accuracy": 0.7922965154051781, "num_tokens": 19000943.0, "step": 1410 }, { "epoch": 0.8647990255785627, "grad_norm": 0.5141698718070984, "learning_rate": 0.00017035941939752802, "loss": 0.7203, "mean_token_accuracy": 0.8036229625344277, "num_tokens": 19135039.0, "step": 1420 }, { "epoch": 0.8708891595615104, "grad_norm": 0.4647749066352844, "learning_rate": 0.0001699191705106898, "loss": 0.7136, "mean_token_accuracy": 0.8064323276281357, "num_tokens": 19274069.0, "step": 1430 }, { "epoch": 0.876979293544458, "grad_norm": 0.5511934161186218, "learning_rate": 0.00016947633119491633, "loss": 0.7455, "mean_token_accuracy": 0.7985599264502525, "num_tokens": 19409679.0, "step": 1440 }, { "epoch": 0.8830694275274056, "grad_norm": 0.4936945140361786, "learning_rate": 0.00016903092059506182, "loss": 0.7087, "mean_token_accuracy": 0.806523185968399, "num_tokens": 19547419.0, "step": 1450 }, { "epoch": 0.8891595615103532, "grad_norm": 0.5227787494659424, "learning_rate": 0.00016858295796714213, "loss": 0.7739, "mean_token_accuracy": 0.7941467314958572, "num_tokens": 19674455.0, "step": 1460 }, { "epoch": 0.8952496954933008, "grad_norm": 0.5046219825744629, "learning_rate": 0.00016813246267750282, "loss": 0.7361, "mean_token_accuracy": 0.8008369222283364, "num_tokens": 19809861.0, "step": 1470 }, { "epoch": 0.9013398294762485, "grad_norm": 0.4827081263065338, "learning_rate": 0.00016767945420198142, "loss": 0.7464, "mean_token_accuracy": 0.7986427888274192, "num_tokens": 19940696.0, "step": 1480 }, { "epoch": 0.9074299634591961, "grad_norm": 0.4970889687538147, "learning_rate": 0.00016722395212506567, "loss": 0.7528, "mean_token_accuracy": 0.7965970665216446, "num_tokens": 20070686.0, "step": 1490 }, { "epoch": 0.9135200974421437, "grad_norm": 0.44478070735931396, "learning_rate": 0.00016676597613904693, "loss": 0.7185, "mean_token_accuracy": 0.8081388726830483, "num_tokens": 20210260.0, "step": 1500 }, { "epoch": 0.9196102314250914, "grad_norm": 0.506136417388916, "learning_rate": 0.00016630554604316866, "loss": 0.7395, "mean_token_accuracy": 0.8003876298666001, "num_tokens": 20346235.0, "step": 1510 }, { "epoch": 0.925700365408039, "grad_norm": 0.500946044921875, "learning_rate": 0.00016584268174277053, "loss": 0.6889, "mean_token_accuracy": 0.8124501362442971, "num_tokens": 20481248.0, "step": 1520 }, { "epoch": 0.9317904993909866, "grad_norm": 0.48528990149497986, "learning_rate": 0.00016537740324842795, "loss": 0.7227, "mean_token_accuracy": 0.8041250064969063, "num_tokens": 20613531.0, "step": 1530 }, { "epoch": 0.9378806333739342, "grad_norm": 0.5070951581001282, "learning_rate": 0.00016490973067508674, "loss": 0.7091, "mean_token_accuracy": 0.8082544595003128, "num_tokens": 20750784.0, "step": 1540 }, { "epoch": 0.9439707673568819, "grad_norm": 0.5583120584487915, "learning_rate": 0.0001644396842411939, "loss": 0.7405, "mean_token_accuracy": 0.7992320343852043, "num_tokens": 20883646.0, "step": 1550 }, { "epoch": 0.9500609013398295, "grad_norm": 0.5099635124206543, "learning_rate": 0.00016396728426782312, "loss": 0.7103, "mean_token_accuracy": 0.8091216519474983, "num_tokens": 21025143.0, "step": 1560 }, { "epoch": 0.9561510353227771, "grad_norm": 0.5777808427810669, "learning_rate": 0.00016349255117779652, "loss": 0.7245, "mean_token_accuracy": 0.8023119494318962, "num_tokens": 21160014.0, "step": 1570 }, { "epoch": 0.9622411693057247, "grad_norm": 0.5206162333488464, "learning_rate": 0.0001630155054948016, "loss": 0.7185, "mean_token_accuracy": 0.8069521963596344, "num_tokens": 21299094.0, "step": 1580 }, { "epoch": 0.9683313032886723, "grad_norm": 0.5763202905654907, "learning_rate": 0.00016253616784250415, "loss": 0.7677, "mean_token_accuracy": 0.7927820891141891, "num_tokens": 21429252.0, "step": 1590 }, { "epoch": 0.97442143727162, "grad_norm": 0.5068426728248596, "learning_rate": 0.00016205455894365627, "loss": 0.7673, "mean_token_accuracy": 0.794715291261673, "num_tokens": 21556200.0, "step": 1600 }, { "epoch": 0.9805115712545676, "grad_norm": 0.46094459295272827, "learning_rate": 0.0001615706996192009, "loss": 0.771, "mean_token_accuracy": 0.7921045809984207, "num_tokens": 21681524.0, "step": 1610 }, { "epoch": 0.9866017052375152, "grad_norm": 0.5063546299934387, "learning_rate": 0.00016108461078737148, "loss": 0.7383, "mean_token_accuracy": 0.800596435368061, "num_tokens": 21814109.0, "step": 1620 }, { "epoch": 0.9926918392204629, "grad_norm": 0.5418652296066284, "learning_rate": 0.0001605963134627876, "loss": 0.7431, "mean_token_accuracy": 0.7994748756289483, "num_tokens": 21947346.0, "step": 1630 }, { "epoch": 0.9987819732034104, "grad_norm": 0.6195595264434814, "learning_rate": 0.0001601058287555465, "loss": 0.7294, "mean_token_accuracy": 0.8030684441328049, "num_tokens": 22081340.0, "step": 1640 }, { "epoch": 1.004872107186358, "grad_norm": 0.5930359363555908, "learning_rate": 0.00015961317787031054, "loss": 0.7387, "mean_token_accuracy": 0.8013696864247322, "num_tokens": 22206441.0, "step": 1650 }, { "epoch": 1.0109622411693058, "grad_norm": 0.4926474094390869, "learning_rate": 0.00015911838210539038, "loss": 0.6743, "mean_token_accuracy": 0.8141208037734031, "num_tokens": 22344898.0, "step": 1660 }, { "epoch": 1.0170523751522533, "grad_norm": 0.5331000685691833, "learning_rate": 0.0001586214628518242, "loss": 0.7033, "mean_token_accuracy": 0.807385990023613, "num_tokens": 22483135.0, "step": 1670 }, { "epoch": 1.0231425091352009, "grad_norm": 0.5267267227172852, "learning_rate": 0.0001581224415924531, "loss": 0.6717, "mean_token_accuracy": 0.8178876608610153, "num_tokens": 22617934.0, "step": 1680 }, { "epoch": 1.0292326431181487, "grad_norm": 0.5864041447639465, "learning_rate": 0.00015762133990099205, "loss": 0.7421, "mean_token_accuracy": 0.7981289237737655, "num_tokens": 22745190.0, "step": 1690 }, { "epoch": 1.0353227771010962, "grad_norm": 0.45681944489479065, "learning_rate": 0.00015711817944109738, "loss": 0.6646, "mean_token_accuracy": 0.8146520599722862, "num_tokens": 22887536.0, "step": 1700 }, { "epoch": 1.0414129110840438, "grad_norm": 0.5522484183311462, "learning_rate": 0.00015661298196543042, "loss": 0.6889, "mean_token_accuracy": 0.8100781336426734, "num_tokens": 23017586.0, "step": 1710 }, { "epoch": 1.0475030450669915, "grad_norm": 0.5221629738807678, "learning_rate": 0.00015610576931471658, "loss": 0.6939, "mean_token_accuracy": 0.8114214852452278, "num_tokens": 23151737.0, "step": 1720 }, { "epoch": 1.053593179049939, "grad_norm": 0.5104020833969116, "learning_rate": 0.00015559656341680164, "loss": 0.716, "mean_token_accuracy": 0.8063826873898506, "num_tokens": 23280778.0, "step": 1730 }, { "epoch": 1.0596833130328867, "grad_norm": 0.5163984298706055, "learning_rate": 0.00015508538628570352, "loss": 0.7188, "mean_token_accuracy": 0.802527217566967, "num_tokens": 23410327.0, "step": 1740 }, { "epoch": 1.0657734470158344, "grad_norm": 0.5188373327255249, "learning_rate": 0.00015457226002066058, "loss": 0.6791, "mean_token_accuracy": 0.8127639785408973, "num_tokens": 23548616.0, "step": 1750 }, { "epoch": 1.071863580998782, "grad_norm": 0.5983869433403015, "learning_rate": 0.00015405720680517618, "loss": 0.6869, "mean_token_accuracy": 0.8110290810465812, "num_tokens": 23682446.0, "step": 1760 }, { "epoch": 1.0779537149817295, "grad_norm": 0.5919123291969299, "learning_rate": 0.00015354024890605985, "loss": 0.7419, "mean_token_accuracy": 0.7984233900904656, "num_tokens": 23806352.0, "step": 1770 }, { "epoch": 1.0840438489646773, "grad_norm": 0.4900698661804199, "learning_rate": 0.0001530214086724644, "loss": 0.6781, "mean_token_accuracy": 0.8152358055114746, "num_tokens": 23942964.0, "step": 1780 }, { "epoch": 1.0901339829476249, "grad_norm": 0.5409672856330872, "learning_rate": 0.00015250070853491986, "loss": 0.7157, "mean_token_accuracy": 0.803682966530323, "num_tokens": 24070937.0, "step": 1790 }, { "epoch": 1.0962241169305724, "grad_norm": 0.5581572651863098, "learning_rate": 0.0001519781710043638, "loss": 0.7261, "mean_token_accuracy": 0.8027503877878189, "num_tokens": 24200686.0, "step": 1800 }, { "epoch": 1.1023142509135202, "grad_norm": 0.503963053226471, "learning_rate": 0.0001514538186711679, "loss": 0.7125, "mean_token_accuracy": 0.8042754918336869, "num_tokens": 24329983.0, "step": 1810 }, { "epoch": 1.1084043848964678, "grad_norm": 0.6159723997116089, "learning_rate": 0.00015092767420416168, "loss": 0.6873, "mean_token_accuracy": 0.8115814313292503, "num_tokens": 24465292.0, "step": 1820 }, { "epoch": 1.1144945188794153, "grad_norm": 0.518172562122345, "learning_rate": 0.00015039976034965214, "loss": 0.6805, "mean_token_accuracy": 0.8113815248012543, "num_tokens": 24599980.0, "step": 1830 }, { "epoch": 1.1205846528623629, "grad_norm": 0.5381601452827454, "learning_rate": 0.0001498700999304407, "loss": 0.6542, "mean_token_accuracy": 0.8188014090061188, "num_tokens": 24746703.0, "step": 1840 }, { "epoch": 1.1266747868453106, "grad_norm": 0.5001223683357239, "learning_rate": 0.00014933871584483615, "loss": 0.7255, "mean_token_accuracy": 0.8022593036293983, "num_tokens": 24877604.0, "step": 1850 }, { "epoch": 1.1327649208282582, "grad_norm": 0.5812251567840576, "learning_rate": 0.00014880563106566512, "loss": 0.6638, "mean_token_accuracy": 0.8161928996443748, "num_tokens": 25023049.0, "step": 1860 }, { "epoch": 1.1388550548112057, "grad_norm": 0.5384249091148376, "learning_rate": 0.0001482708686392786, "loss": 0.6623, "mean_token_accuracy": 0.8167443484067917, "num_tokens": 25162124.0, "step": 1870 }, { "epoch": 1.1449451887941535, "grad_norm": 0.5310192108154297, "learning_rate": 0.00014773445168455576, "loss": 0.7074, "mean_token_accuracy": 0.8042578861117363, "num_tokens": 25293569.0, "step": 1880 }, { "epoch": 1.151035322777101, "grad_norm": 0.6224446296691895, "learning_rate": 0.00014719640339190443, "loss": 0.7094, "mean_token_accuracy": 0.803679920732975, "num_tokens": 25422953.0, "step": 1890 }, { "epoch": 1.1571254567600486, "grad_norm": 0.5978189706802368, "learning_rate": 0.00014665674702225853, "loss": 0.6926, "mean_token_accuracy": 0.8080565810203553, "num_tokens": 25559091.0, "step": 1900 }, { "epoch": 1.1632155907429964, "grad_norm": 0.6134657263755798, "learning_rate": 0.00014611550590607245, "loss": 0.6716, "mean_token_accuracy": 0.8152063637971878, "num_tokens": 25698134.0, "step": 1910 }, { "epoch": 1.169305724725944, "grad_norm": 0.5075950026512146, "learning_rate": 0.00014557270344231246, "loss": 0.6613, "mean_token_accuracy": 0.8169043198227882, "num_tokens": 25835159.0, "step": 1920 }, { "epoch": 1.1753958587088915, "grad_norm": 0.5035059452056885, "learning_rate": 0.00014502836309744508, "loss": 0.6903, "mean_token_accuracy": 0.8096718549728393, "num_tokens": 25970600.0, "step": 1930 }, { "epoch": 1.1814859926918393, "grad_norm": 0.583890438079834, "learning_rate": 0.00014448250840442254, "loss": 0.6662, "mean_token_accuracy": 0.8157578155398368, "num_tokens": 26106658.0, "step": 1940 }, { "epoch": 1.1875761266747868, "grad_norm": 0.5089572668075562, "learning_rate": 0.00014393516296166552, "loss": 0.7085, "mean_token_accuracy": 0.8082539036870002, "num_tokens": 26238847.0, "step": 1950 }, { "epoch": 1.1936662606577344, "grad_norm": 0.4495029151439667, "learning_rate": 0.00014338635043204288, "loss": 0.7085, "mean_token_accuracy": 0.8075269401073456, "num_tokens": 26366417.0, "step": 1960 }, { "epoch": 1.1997563946406822, "grad_norm": 0.6390108466148376, "learning_rate": 0.00014283609454184855, "loss": 0.6935, "mean_token_accuracy": 0.8099950149655342, "num_tokens": 26498101.0, "step": 1970 }, { "epoch": 1.2058465286236297, "grad_norm": 0.5687986016273499, "learning_rate": 0.00014228441907977607, "loss": 0.7027, "mean_token_accuracy": 0.8083449766039849, "num_tokens": 26628513.0, "step": 1980 }, { "epoch": 1.2119366626065773, "grad_norm": 0.487954318523407, "learning_rate": 0.00014173134789588994, "loss": 0.6731, "mean_token_accuracy": 0.8129799589514732, "num_tokens": 26761671.0, "step": 1990 }, { "epoch": 1.218026796589525, "grad_norm": 0.5641826391220093, "learning_rate": 0.00014117690490059447, "loss": 0.6949, "mean_token_accuracy": 0.8118783175945282, "num_tokens": 26894870.0, "step": 2000 }, { "epoch": 1.2241169305724726, "grad_norm": 0.5209829211235046, "learning_rate": 0.00014062111406360034, "loss": 0.6742, "mean_token_accuracy": 0.816123254597187, "num_tokens": 27027902.0, "step": 2010 }, { "epoch": 1.2302070645554202, "grad_norm": 0.5218231678009033, "learning_rate": 0.00014006399941288812, "loss": 0.703, "mean_token_accuracy": 0.805295330286026, "num_tokens": 27157882.0, "step": 2020 }, { "epoch": 1.236297198538368, "grad_norm": 0.48154470324516296, "learning_rate": 0.00013950558503366957, "loss": 0.6844, "mean_token_accuracy": 0.811684039235115, "num_tokens": 27290994.0, "step": 2030 }, { "epoch": 1.2423873325213155, "grad_norm": 0.5417695045471191, "learning_rate": 0.00013894589506734643, "loss": 0.7253, "mean_token_accuracy": 0.8018206775188446, "num_tokens": 27420715.0, "step": 2040 }, { "epoch": 1.248477466504263, "grad_norm": 0.5282937288284302, "learning_rate": 0.00013838495371046671, "loss": 0.682, "mean_token_accuracy": 0.8128980100154877, "num_tokens": 27552040.0, "step": 2050 }, { "epoch": 1.2545676004872108, "grad_norm": 0.5213696360588074, "learning_rate": 0.0001378227852136785, "loss": 0.6728, "mean_token_accuracy": 0.8128269612789154, "num_tokens": 27686922.0, "step": 2060 }, { "epoch": 1.2606577344701584, "grad_norm": 0.4823834300041199, "learning_rate": 0.00013725941388068174, "loss": 0.6626, "mean_token_accuracy": 0.8177949145436287, "num_tokens": 27825036.0, "step": 2070 }, { "epoch": 1.266747868453106, "grad_norm": 0.6199477314949036, "learning_rate": 0.0001366948640671775, "loss": 0.686, "mean_token_accuracy": 0.8107037082314491, "num_tokens": 27961614.0, "step": 2080 }, { "epoch": 1.2728380024360537, "grad_norm": 0.4916837513446808, "learning_rate": 0.00013612916017981488, "loss": 0.6738, "mean_token_accuracy": 0.8149923622608185, "num_tokens": 28099524.0, "step": 2090 }, { "epoch": 1.2789281364190013, "grad_norm": 0.6001724600791931, "learning_rate": 0.00013556232667513607, "loss": 0.6637, "mean_token_accuracy": 0.8173324480652809, "num_tokens": 28237055.0, "step": 2100 }, { "epoch": 1.2850182704019488, "grad_norm": 0.5887413620948792, "learning_rate": 0.00013499438805851882, "loss": 0.6744, "mean_token_accuracy": 0.8149967223405838, "num_tokens": 28370538.0, "step": 2110 }, { "epoch": 1.2911084043848966, "grad_norm": 0.6208155751228333, "learning_rate": 0.00013442536888311733, "loss": 0.6973, "mean_token_accuracy": 0.8103871151804924, "num_tokens": 28499232.0, "step": 2120 }, { "epoch": 1.2971985383678442, "grad_norm": 0.5026904344558716, "learning_rate": 0.0001338552937488003, "loss": 0.6739, "mean_token_accuracy": 0.8153023451566697, "num_tokens": 28633993.0, "step": 2130 }, { "epoch": 1.3032886723507917, "grad_norm": 0.5218458771705627, "learning_rate": 0.00013328418730108795, "loss": 0.6619, "mean_token_accuracy": 0.8166303977370262, "num_tokens": 28774139.0, "step": 2140 }, { "epoch": 1.3093788063337393, "grad_norm": 0.519872784614563, "learning_rate": 0.00013271207423008622, "loss": 0.6804, "mean_token_accuracy": 0.8150519266724586, "num_tokens": 28910109.0, "step": 2150 }, { "epoch": 1.315468940316687, "grad_norm": 0.5219667553901672, "learning_rate": 0.00013213897926941942, "loss": 0.6682, "mean_token_accuracy": 0.8166522830724716, "num_tokens": 29045967.0, "step": 2160 }, { "epoch": 1.3215590742996346, "grad_norm": 0.5744656920433044, "learning_rate": 0.000131564927195161, "loss": 0.6772, "mean_token_accuracy": 0.8149690836668014, "num_tokens": 29180769.0, "step": 2170 }, { "epoch": 1.3276492082825821, "grad_norm": 0.5673508048057556, "learning_rate": 0.00013098994282476236, "loss": 0.6841, "mean_token_accuracy": 0.812624742090702, "num_tokens": 29313512.0, "step": 2180 }, { "epoch": 1.3337393422655297, "grad_norm": 0.5187074542045593, "learning_rate": 0.00013041405101598, "loss": 0.6281, "mean_token_accuracy": 0.8221091449260711, "num_tokens": 29454589.0, "step": 2190 }, { "epoch": 1.3398294762484775, "grad_norm": 0.5621201992034912, "learning_rate": 0.00012983727666580086, "loss": 0.6755, "mean_token_accuracy": 0.8157430678606034, "num_tokens": 29589968.0, "step": 2200 }, { "epoch": 1.345919610231425, "grad_norm": 0.579699695110321, "learning_rate": 0.00012925964470936598, "loss": 0.6859, "mean_token_accuracy": 0.8122102931141854, "num_tokens": 29720188.0, "step": 2210 }, { "epoch": 1.3520097442143726, "grad_norm": 0.6406823992729187, "learning_rate": 0.00012868118011889236, "loss": 0.684, "mean_token_accuracy": 0.8107294023036957, "num_tokens": 29848418.0, "step": 2220 }, { "epoch": 1.3580998781973204, "grad_norm": 0.4707708954811096, "learning_rate": 0.00012810190790259367, "loss": 0.6607, "mean_token_accuracy": 0.8182852879166603, "num_tokens": 29988202.0, "step": 2230 }, { "epoch": 1.364190012180268, "grad_norm": 0.6458183526992798, "learning_rate": 0.00012752185310359874, "loss": 0.6935, "mean_token_accuracy": 0.8089477211236954, "num_tokens": 30119777.0, "step": 2240 }, { "epoch": 1.3702801461632155, "grad_norm": 0.4278848469257355, "learning_rate": 0.00012694104079886918, "loss": 0.6565, "mean_token_accuracy": 0.8185079246759415, "num_tokens": 30256776.0, "step": 2250 }, { "epoch": 1.3763702801461632, "grad_norm": 0.5647698044776917, "learning_rate": 0.00012635949609811505, "loss": 0.6636, "mean_token_accuracy": 0.8155051723122597, "num_tokens": 30395629.0, "step": 2260 }, { "epoch": 1.3824604141291108, "grad_norm": 0.43498411774635315, "learning_rate": 0.00012577724414270937, "loss": 0.689, "mean_token_accuracy": 0.8125654354691505, "num_tokens": 30532805.0, "step": 2270 }, { "epoch": 1.3885505481120584, "grad_norm": 0.5296844244003296, "learning_rate": 0.00012519431010460136, "loss": 0.6854, "mean_token_accuracy": 0.8122918352484703, "num_tokens": 30664642.0, "step": 2280 }, { "epoch": 1.3946406820950061, "grad_norm": 0.44080430269241333, "learning_rate": 0.000124610719185228, "loss": 0.6405, "mean_token_accuracy": 0.8192834481596947, "num_tokens": 30805370.0, "step": 2290 }, { "epoch": 1.4007308160779537, "grad_norm": 0.5946847796440125, "learning_rate": 0.00012402649661442453, "loss": 0.7025, "mean_token_accuracy": 0.8085126876831055, "num_tokens": 30936385.0, "step": 2300 }, { "epoch": 1.4068209500609012, "grad_norm": 0.6572047472000122, "learning_rate": 0.0001234416676493339, "loss": 0.709, "mean_token_accuracy": 0.8046677514910698, "num_tokens": 31067615.0, "step": 2310 }, { "epoch": 1.412911084043849, "grad_norm": 0.4797047972679138, "learning_rate": 0.0001228562575733147, "loss": 0.6675, "mean_token_accuracy": 0.8157136350870132, "num_tokens": 31200044.0, "step": 2320 }, { "epoch": 1.4190012180267966, "grad_norm": 0.5451430082321167, "learning_rate": 0.0001222702916948481, "loss": 0.6746, "mean_token_accuracy": 0.8092615008354187, "num_tokens": 31334451.0, "step": 2330 }, { "epoch": 1.4250913520097441, "grad_norm": 0.5049906969070435, "learning_rate": 0.00012168379534644371, "loss": 0.6515, "mean_token_accuracy": 0.8203717589378356, "num_tokens": 31472218.0, "step": 2340 }, { "epoch": 1.431181485992692, "grad_norm": 0.6531693935394287, "learning_rate": 0.00012109679388354462, "loss": 0.6778, "mean_token_accuracy": 0.8134923160076142, "num_tokens": 31605853.0, "step": 2350 }, { "epoch": 1.4372716199756395, "grad_norm": 0.5340039730072021, "learning_rate": 0.00012050931268343089, "loss": 0.6628, "mean_token_accuracy": 0.8176047816872597, "num_tokens": 31741034.0, "step": 2360 }, { "epoch": 1.443361753958587, "grad_norm": 0.4518280625343323, "learning_rate": 0.00011992137714412266, "loss": 0.6407, "mean_token_accuracy": 0.8207336485385894, "num_tokens": 31878661.0, "step": 2370 }, { "epoch": 1.4494518879415348, "grad_norm": 0.5232827067375183, "learning_rate": 0.00011933301268328212, "loss": 0.6742, "mean_token_accuracy": 0.8158077761530876, "num_tokens": 32016524.0, "step": 2380 }, { "epoch": 1.4555420219244823, "grad_norm": 0.5181542634963989, "learning_rate": 0.00011874424473711457, "loss": 0.699, "mean_token_accuracy": 0.8078866004943848, "num_tokens": 32146820.0, "step": 2390 }, { "epoch": 1.46163215590743, "grad_norm": 0.5801041126251221, "learning_rate": 0.00011815509875926883, "loss": 0.6572, "mean_token_accuracy": 0.8183338135480881, "num_tokens": 32285928.0, "step": 2400 }, { "epoch": 1.4677222898903777, "grad_norm": 0.5347133874893188, "learning_rate": 0.00011756560021973679, "loss": 0.6738, "mean_token_accuracy": 0.8143690213561058, "num_tokens": 32416470.0, "step": 2410 }, { "epoch": 1.4738124238733252, "grad_norm": 0.4945615231990814, "learning_rate": 0.0001169757746037524, "loss": 0.6505, "mean_token_accuracy": 0.8196728631854058, "num_tokens": 32553798.0, "step": 2420 }, { "epoch": 1.4799025578562728, "grad_norm": 0.5072743892669678, "learning_rate": 0.00011638564741068965, "loss": 0.625, "mean_token_accuracy": 0.826240348815918, "num_tokens": 32692511.0, "step": 2430 }, { "epoch": 1.4859926918392206, "grad_norm": 0.5887538194656372, "learning_rate": 0.00011579524415296043, "loss": 0.6904, "mean_token_accuracy": 0.8112018033862114, "num_tokens": 32818836.0, "step": 2440 }, { "epoch": 1.4920828258221681, "grad_norm": 0.5464449524879456, "learning_rate": 0.00011520459035491142, "loss": 0.6553, "mean_token_accuracy": 0.8198345899581909, "num_tokens": 32957967.0, "step": 2450 }, { "epoch": 1.4981729598051157, "grad_norm": 0.5787419676780701, "learning_rate": 0.00011461371155172071, "loss": 0.663, "mean_token_accuracy": 0.8155046373605728, "num_tokens": 33094241.0, "step": 2460 }, { "epoch": 1.5042630937880634, "grad_norm": 0.5159268975257874, "learning_rate": 0.00011402263328829384, "loss": 0.6792, "mean_token_accuracy": 0.8127613604068756, "num_tokens": 33225474.0, "step": 2470 }, { "epoch": 1.510353227771011, "grad_norm": 0.5665333867073059, "learning_rate": 0.00011343138111815939, "loss": 0.6265, "mean_token_accuracy": 0.8276977241039276, "num_tokens": 33368246.0, "step": 2480 }, { "epoch": 1.5164433617539586, "grad_norm": 0.6272276639938354, "learning_rate": 0.00011283998060236421, "loss": 0.6734, "mean_token_accuracy": 0.816029068827629, "num_tokens": 33503967.0, "step": 2490 }, { "epoch": 1.5225334957369063, "grad_norm": 0.5275886654853821, "learning_rate": 0.0001122484573083686, "loss": 0.6457, "mean_token_accuracy": 0.8222623988986015, "num_tokens": 33641826.0, "step": 2500 }, { "epoch": 1.5286236297198539, "grad_norm": 0.5526687502861023, "learning_rate": 0.00011165683680894072, "loss": 0.6795, "mean_token_accuracy": 0.8127825185656548, "num_tokens": 33774185.0, "step": 2510 }, { "epoch": 1.5347137637028014, "grad_norm": 0.6226133704185486, "learning_rate": 0.00011106514468105111, "loss": 0.6684, "mean_token_accuracy": 0.815614765882492, "num_tokens": 33907116.0, "step": 2520 }, { "epoch": 1.5408038976857492, "grad_norm": 0.612832248210907, "learning_rate": 0.000110473406504767, "loss": 0.6287, "mean_token_accuracy": 0.8220825806260109, "num_tokens": 34048267.0, "step": 2530 }, { "epoch": 1.5468940316686965, "grad_norm": 0.6066681742668152, "learning_rate": 0.00010988164786214639, "loss": 0.6851, "mean_token_accuracy": 0.8115911707282066, "num_tokens": 34177555.0, "step": 2540 }, { "epoch": 1.5529841656516443, "grad_norm": 0.6376360058784485, "learning_rate": 0.00010928989433613204, "loss": 0.6921, "mean_token_accuracy": 0.8096534594893455, "num_tokens": 34308932.0, "step": 2550 }, { "epoch": 1.559074299634592, "grad_norm": 0.6083400249481201, "learning_rate": 0.00010869817150944546, "loss": 0.6575, "mean_token_accuracy": 0.8187816679477692, "num_tokens": 34443994.0, "step": 2560 }, { "epoch": 1.5651644336175394, "grad_norm": 0.6098156571388245, "learning_rate": 0.00010810650496348116, "loss": 0.6092, "mean_token_accuracy": 0.8285523638129234, "num_tokens": 34588403.0, "step": 2570 }, { "epoch": 1.5712545676004872, "grad_norm": 0.47795701026916504, "learning_rate": 0.00010751492027720027, "loss": 0.6423, "mean_token_accuracy": 0.8211737647652626, "num_tokens": 34730426.0, "step": 2580 }, { "epoch": 1.577344701583435, "grad_norm": 0.560787558555603, "learning_rate": 0.00010692344302602515, "loss": 0.6707, "mean_token_accuracy": 0.8134441033005715, "num_tokens": 34861708.0, "step": 2590 }, { "epoch": 1.5834348355663823, "grad_norm": 0.5722246766090393, "learning_rate": 0.00010633209878073343, "loss": 0.6533, "mean_token_accuracy": 0.8185199156403542, "num_tokens": 34997377.0, "step": 2600 }, { "epoch": 1.58952496954933, "grad_norm": 0.4941788613796234, "learning_rate": 0.00010574091310635263, "loss": 0.6487, "mean_token_accuracy": 0.8205527886748314, "num_tokens": 35133685.0, "step": 2610 }, { "epoch": 1.5956151035322779, "grad_norm": 0.575986921787262, "learning_rate": 0.00010514991156105493, "loss": 0.6615, "mean_token_accuracy": 0.8179458349943161, "num_tokens": 35270993.0, "step": 2620 }, { "epoch": 1.6017052375152252, "grad_norm": 0.5677866339683533, "learning_rate": 0.00010455911969505228, "loss": 0.6572, "mean_token_accuracy": 0.8155815675854683, "num_tokens": 35402062.0, "step": 2630 }, { "epoch": 1.607795371498173, "grad_norm": 0.6232825517654419, "learning_rate": 0.00010396856304949162, "loss": 0.6477, "mean_token_accuracy": 0.8209305629134178, "num_tokens": 35537394.0, "step": 2640 }, { "epoch": 1.6138855054811205, "grad_norm": 0.6252410411834717, "learning_rate": 0.00010337826715535102, "loss": 0.6819, "mean_token_accuracy": 0.8137489795684815, "num_tokens": 35669332.0, "step": 2650 }, { "epoch": 1.619975639464068, "grad_norm": 0.5850580334663391, "learning_rate": 0.0001027882575323356, "loss": 0.6831, "mean_token_accuracy": 0.8099577218294144, "num_tokens": 35799095.0, "step": 2660 }, { "epoch": 1.6260657734470159, "grad_norm": 0.5118699073791504, "learning_rate": 0.00010219855968777442, "loss": 0.681, "mean_token_accuracy": 0.8123177006840706, "num_tokens": 35928313.0, "step": 2670 }, { "epoch": 1.6321559074299634, "grad_norm": 0.5392698645591736, "learning_rate": 0.00010160919911551774, "loss": 0.6536, "mean_token_accuracy": 0.8185337752103805, "num_tokens": 36062033.0, "step": 2680 }, { "epoch": 1.638246041412911, "grad_norm": 0.5542203783988953, "learning_rate": 0.00010102020129483481, "loss": 0.6859, "mean_token_accuracy": 0.8107540607452393, "num_tokens": 36190194.0, "step": 2690 }, { "epoch": 1.6443361753958587, "grad_norm": 0.5962918996810913, "learning_rate": 0.0001004315916893124, "loss": 0.64, "mean_token_accuracy": 0.8226593688130379, "num_tokens": 36322437.0, "step": 2700 }, { "epoch": 1.6504263093788063, "grad_norm": 0.6391364932060242, "learning_rate": 9.984339574575394e-05, "loss": 0.6457, "mean_token_accuracy": 0.8250340327620507, "num_tokens": 36463231.0, "step": 2710 }, { "epoch": 1.6565164433617539, "grad_norm": 0.5798075795173645, "learning_rate": 9.92556388930794e-05, "loss": 0.6901, "mean_token_accuracy": 0.8104871213436127, "num_tokens": 36588963.0, "step": 2720 }, { "epoch": 1.6626065773447016, "grad_norm": 0.5375143885612488, "learning_rate": 9.866834654122597e-05, "loss": 0.6723, "mean_token_accuracy": 0.8132491707801819, "num_tokens": 36724295.0, "step": 2730 }, { "epoch": 1.6686967113276492, "grad_norm": 0.5556331276893616, "learning_rate": 9.808154408004942e-05, "loss": 0.6316, "mean_token_accuracy": 0.8221101492643357, "num_tokens": 36855978.0, "step": 2740 }, { "epoch": 1.6747868453105967, "grad_norm": 0.5330142974853516, "learning_rate": 9.749525687822674e-05, "loss": 0.6269, "mean_token_accuracy": 0.8239532545208931, "num_tokens": 36994164.0, "step": 2750 }, { "epoch": 1.6808769792935445, "grad_norm": 0.568084716796875, "learning_rate": 9.6909510282159e-05, "loss": 0.6568, "mean_token_accuracy": 0.8158794924616813, "num_tokens": 37130680.0, "step": 2760 }, { "epoch": 1.686967113276492, "grad_norm": 0.5072943568229675, "learning_rate": 9.632432961487585e-05, "loss": 0.6838, "mean_token_accuracy": 0.8121756613254547, "num_tokens": 37261462.0, "step": 2770 }, { "epoch": 1.6930572472594396, "grad_norm": 0.5469337701797485, "learning_rate": 9.573974017494069e-05, "loss": 0.6447, "mean_token_accuracy": 0.8220986798405647, "num_tokens": 37395606.0, "step": 2780 }, { "epoch": 1.6991473812423874, "grad_norm": 0.57918381690979, "learning_rate": 9.515576723535689e-05, "loss": 0.6217, "mean_token_accuracy": 0.822702020406723, "num_tokens": 37533585.0, "step": 2790 }, { "epoch": 1.705237515225335, "grad_norm": 0.6425563097000122, "learning_rate": 9.45724360424753e-05, "loss": 0.6435, "mean_token_accuracy": 0.8198476612567902, "num_tokens": 37672877.0, "step": 2800 }, { "epoch": 1.7113276492082825, "grad_norm": 0.5059729218482971, "learning_rate": 9.398977181490274e-05, "loss": 0.6579, "mean_token_accuracy": 0.8166012555360794, "num_tokens": 37809109.0, "step": 2810 }, { "epoch": 1.7174177831912303, "grad_norm": 0.5450888276100159, "learning_rate": 9.340779974241167e-05, "loss": 0.6175, "mean_token_accuracy": 0.8274259582161904, "num_tokens": 37950597.0, "step": 2820 }, { "epoch": 1.7235079171741778, "grad_norm": 0.6464765667915344, "learning_rate": 9.282654498485139e-05, "loss": 0.6636, "mean_token_accuracy": 0.8163545817136765, "num_tokens": 38086904.0, "step": 2830 }, { "epoch": 1.7295980511571254, "grad_norm": 0.6118177175521851, "learning_rate": 9.22460326710601e-05, "loss": 0.6696, "mean_token_accuracy": 0.8133967757225037, "num_tokens": 38219759.0, "step": 2840 }, { "epoch": 1.7356881851400732, "grad_norm": 0.5518969893455505, "learning_rate": 9.16662878977786e-05, "loss": 0.6659, "mean_token_accuracy": 0.8180875137448311, "num_tokens": 38349770.0, "step": 2850 }, { "epoch": 1.7417783191230207, "grad_norm": 0.6465517282485962, "learning_rate": 9.108733572856549e-05, "loss": 0.6581, "mean_token_accuracy": 0.8170399129390716, "num_tokens": 38482303.0, "step": 2860 }, { "epoch": 1.7478684531059683, "grad_norm": 0.5193557143211365, "learning_rate": 9.050920119271335e-05, "loss": 0.6543, "mean_token_accuracy": 0.8178304255008697, "num_tokens": 38615426.0, "step": 2870 }, { "epoch": 1.753958587088916, "grad_norm": 0.611529529094696, "learning_rate": 8.993190928416682e-05, "loss": 0.6248, "mean_token_accuracy": 0.8259203046560287, "num_tokens": 38755859.0, "step": 2880 }, { "epoch": 1.7600487210718636, "grad_norm": 0.5405944585800171, "learning_rate": 8.935548496044198e-05, "loss": 0.6232, "mean_token_accuracy": 0.8281204700469971, "num_tokens": 38893007.0, "step": 2890 }, { "epoch": 1.7661388550548112, "grad_norm": 0.6433010697364807, "learning_rate": 8.877995314154748e-05, "loss": 0.6751, "mean_token_accuracy": 0.8155393078923225, "num_tokens": 39020285.0, "step": 2900 }, { "epoch": 1.772228989037759, "grad_norm": 0.47974956035614014, "learning_rate": 8.820533870890717e-05, "loss": 0.6527, "mean_token_accuracy": 0.8197720810770989, "num_tokens": 39151426.0, "step": 2910 }, { "epoch": 1.7783191230207065, "grad_norm": 0.5529680848121643, "learning_rate": 8.763166650428436e-05, "loss": 0.6262, "mean_token_accuracy": 0.8256829127669334, "num_tokens": 39294242.0, "step": 2920 }, { "epoch": 1.784409257003654, "grad_norm": 0.6060122847557068, "learning_rate": 8.705896132870797e-05, "loss": 0.6563, "mean_token_accuracy": 0.8192467406392098, "num_tokens": 39425879.0, "step": 2930 }, { "epoch": 1.7904993909866018, "grad_norm": 0.6099355220794678, "learning_rate": 8.648724794140017e-05, "loss": 0.6664, "mean_token_accuracy": 0.8186777010560036, "num_tokens": 39559787.0, "step": 2940 }, { "epoch": 1.7965895249695494, "grad_norm": 0.5908733010292053, "learning_rate": 8.591655105870615e-05, "loss": 0.6712, "mean_token_accuracy": 0.8136340633034707, "num_tokens": 39689823.0, "step": 2950 }, { "epoch": 1.802679658952497, "grad_norm": 0.5845519304275513, "learning_rate": 8.534689535302553e-05, "loss": 0.6608, "mean_token_accuracy": 0.8170475289225578, "num_tokens": 39820725.0, "step": 2960 }, { "epoch": 1.8087697929354447, "grad_norm": 0.6311175227165222, "learning_rate": 8.47783054517457e-05, "loss": 0.6491, "mean_token_accuracy": 0.8193596869707107, "num_tokens": 39947874.0, "step": 2970 }, { "epoch": 1.814859926918392, "grad_norm": 0.5293188691139221, "learning_rate": 8.421080593617706e-05, "loss": 0.6105, "mean_token_accuracy": 0.83141258507967, "num_tokens": 40091297.0, "step": 2980 }, { "epoch": 1.8209500609013398, "grad_norm": 0.5252617597579956, "learning_rate": 8.364442134049049e-05, "loss": 0.6356, "mean_token_accuracy": 0.8237936720252037, "num_tokens": 40229207.0, "step": 2990 }, { "epoch": 1.8270401948842876, "grad_norm": 0.6039798855781555, "learning_rate": 8.30791761506565e-05, "loss": 0.6456, "mean_token_accuracy": 0.8206364914774895, "num_tokens": 40364863.0, "step": 3000 }, { "epoch": 1.833130328867235, "grad_norm": 0.5508609414100647, "learning_rate": 8.251509480338684e-05, "loss": 0.6229, "mean_token_accuracy": 0.8255992740392685, "num_tokens": 40504123.0, "step": 3010 }, { "epoch": 1.8392204628501827, "grad_norm": 0.5637634992599487, "learning_rate": 8.195220168507789e-05, "loss": 0.6026, "mean_token_accuracy": 0.8290412962436676, "num_tokens": 40646821.0, "step": 3020 }, { "epoch": 1.8453105968331305, "grad_norm": 0.5463610291481018, "learning_rate": 8.139052113075645e-05, "loss": 0.6278, "mean_token_accuracy": 0.8244929850101471, "num_tokens": 40778989.0, "step": 3030 }, { "epoch": 1.8514007308160778, "grad_norm": 0.5360645055770874, "learning_rate": 8.083007742302776e-05, "loss": 0.6336, "mean_token_accuracy": 0.8228462666273118, "num_tokens": 40917560.0, "step": 3040 }, { "epoch": 1.8574908647990256, "grad_norm": 0.5185632705688477, "learning_rate": 8.02708947910255e-05, "loss": 0.5991, "mean_token_accuracy": 0.830042028427124, "num_tokens": 41059707.0, "step": 3050 }, { "epoch": 1.8635809987819734, "grad_norm": 0.6445353627204895, "learning_rate": 7.971299740936456e-05, "loss": 0.6555, "mean_token_accuracy": 0.8169184163212776, "num_tokens": 41192515.0, "step": 3060 }, { "epoch": 1.8696711327649207, "grad_norm": 0.5360421538352966, "learning_rate": 7.915640939709576e-05, "loss": 0.6234, "mean_token_accuracy": 0.8257398083806038, "num_tokens": 41330047.0, "step": 3070 }, { "epoch": 1.8757612667478685, "grad_norm": 0.58651202917099, "learning_rate": 7.860115481666333e-05, "loss": 0.6564, "mean_token_accuracy": 0.8205534905195236, "num_tokens": 41460379.0, "step": 3080 }, { "epoch": 1.881851400730816, "grad_norm": 0.6842640042304993, "learning_rate": 7.804725767286427e-05, "loss": 0.6935, "mean_token_accuracy": 0.8097458809614182, "num_tokens": 41581210.0, "step": 3090 }, { "epoch": 1.8879415347137636, "grad_norm": 0.5175514817237854, "learning_rate": 7.749474191181096e-05, "loss": 0.6393, "mean_token_accuracy": 0.8219558611512184, "num_tokens": 41714792.0, "step": 3100 }, { "epoch": 1.8940316686967114, "grad_norm": 0.5963588356971741, "learning_rate": 7.694363141989575e-05, "loss": 0.658, "mean_token_accuracy": 0.8182344615459443, "num_tokens": 41846600.0, "step": 3110 }, { "epoch": 1.900121802679659, "grad_norm": 0.6149535775184631, "learning_rate": 7.639395002275827e-05, "loss": 0.6499, "mean_token_accuracy": 0.8208124756813049, "num_tokens": 41977627.0, "step": 3120 }, { "epoch": 1.9062119366626065, "grad_norm": 0.5739808678627014, "learning_rate": 7.584572148425544e-05, "loss": 0.6703, "mean_token_accuracy": 0.8125967502593994, "num_tokens": 42104510.0, "step": 3130 }, { "epoch": 1.9123020706455542, "grad_norm": 0.5982648730278015, "learning_rate": 7.529896950543416e-05, "loss": 0.6513, "mean_token_accuracy": 0.8201186507940292, "num_tokens": 42236168.0, "step": 3140 }, { "epoch": 1.9183922046285018, "grad_norm": 0.5896486043930054, "learning_rate": 7.475371772350658e-05, "loss": 0.6133, "mean_token_accuracy": 0.8260134413838387, "num_tokens": 42375086.0, "step": 3150 }, { "epoch": 1.9244823386114494, "grad_norm": 0.6223361492156982, "learning_rate": 7.420998971082833e-05, "loss": 0.6638, "mean_token_accuracy": 0.8162963137030601, "num_tokens": 42506457.0, "step": 3160 }, { "epoch": 1.9305724725943971, "grad_norm": 0.709854245185852, "learning_rate": 7.366780897387924e-05, "loss": 0.6324, "mean_token_accuracy": 0.8247174829244613, "num_tokens": 42640886.0, "step": 3170 }, { "epoch": 1.9366626065773447, "grad_norm": 0.6794169545173645, "learning_rate": 7.312719895224736e-05, "loss": 0.6164, "mean_token_accuracy": 0.82676922082901, "num_tokens": 42781318.0, "step": 3180 }, { "epoch": 1.9427527405602922, "grad_norm": 0.49305981397628784, "learning_rate": 7.258818301761532e-05, "loss": 0.6216, "mean_token_accuracy": 0.8258268669247627, "num_tokens": 42919381.0, "step": 3190 }, { "epoch": 1.94884287454324, "grad_norm": 0.5072576999664307, "learning_rate": 7.205078447275031e-05, "loss": 0.6407, "mean_token_accuracy": 0.819316141307354, "num_tokens": 43056494.0, "step": 3200 }, { "epoch": 1.9549330085261876, "grad_norm": 0.6188381314277649, "learning_rate": 7.151502655049623e-05, "loss": 0.6022, "mean_token_accuracy": 0.8328602254390717, "num_tokens": 43197795.0, "step": 3210 }, { "epoch": 1.9610231425091351, "grad_norm": 0.5626131296157837, "learning_rate": 7.098093241276962e-05, "loss": 0.6245, "mean_token_accuracy": 0.8258091285824776, "num_tokens": 43340325.0, "step": 3220 }, { "epoch": 1.967113276492083, "grad_norm": 0.5364338755607605, "learning_rate": 7.044852514955816e-05, "loss": 0.6454, "mean_token_accuracy": 0.8199462234973908, "num_tokens": 43472226.0, "step": 3230 }, { "epoch": 1.9732034104750305, "grad_norm": 0.5460382699966431, "learning_rate": 6.991782777792244e-05, "loss": 0.6214, "mean_token_accuracy": 0.8251617640256882, "num_tokens": 43609559.0, "step": 3240 }, { "epoch": 1.979293544457978, "grad_norm": 0.6013203263282776, "learning_rate": 6.938886324100097e-05, "loss": 0.6422, "mean_token_accuracy": 0.8197862133383751, "num_tokens": 43743060.0, "step": 3250 }, { "epoch": 1.9853836784409258, "grad_norm": 0.6688512563705444, "learning_rate": 6.88616544070182e-05, "loss": 0.6447, "mean_token_accuracy": 0.820338460803032, "num_tokens": 43876874.0, "step": 3260 }, { "epoch": 1.9914738124238733, "grad_norm": 0.6146946549415588, "learning_rate": 6.8336224068296e-05, "loss": 0.6015, "mean_token_accuracy": 0.8318811848759651, "num_tokens": 44021963.0, "step": 3270 }, { "epoch": 1.997563946406821, "grad_norm": 0.5972597599029541, "learning_rate": 6.781259494026821e-05, "loss": 0.6094, "mean_token_accuracy": 0.8282003849744797, "num_tokens": 44159207.0, "step": 3280 } ], "logging_steps": 10, "max_steps": 4926, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.398768566923166e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }