| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 625, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0016, | |
| "grad_norm": 0.35758575797080994, | |
| "learning_rate": 3.1746031746031746e-06, | |
| "loss": 0.2918, | |
| "mean_token_accuracy": 0.9317843317985535, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 0.5407769680023193, | |
| "learning_rate": 1.5873015873015872e-05, | |
| "loss": 0.334, | |
| "mean_token_accuracy": 0.9228289499878883, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 0.2607511878013611, | |
| "learning_rate": 3.1746031746031745e-05, | |
| "loss": 0.3393, | |
| "mean_token_accuracy": 0.9221176445484162, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 0.1869797259569168, | |
| "learning_rate": 4.761904761904762e-05, | |
| "loss": 0.2987, | |
| "mean_token_accuracy": 0.9277177572250366, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 0.13320446014404297, | |
| "learning_rate": 6.349206349206349e-05, | |
| "loss": 0.2496, | |
| "mean_token_accuracy": 0.9362682342529297, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.1366184651851654, | |
| "learning_rate": 7.936507936507937e-05, | |
| "loss": 0.2242, | |
| "mean_token_accuracy": 0.9423655033111572, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 0.16860996186733246, | |
| "learning_rate": 9.523809523809524e-05, | |
| "loss": 0.2088, | |
| "mean_token_accuracy": 0.9416319727897644, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 0.11776343733072281, | |
| "learning_rate": 0.00011111111111111112, | |
| "loss": 0.1694, | |
| "mean_token_accuracy": 0.9534160196781158, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 0.16114415228366852, | |
| "learning_rate": 0.00012698412698412698, | |
| "loss": 0.122, | |
| "mean_token_accuracy": 0.9644478380680084, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "grad_norm": 0.12855438888072968, | |
| "learning_rate": 0.00014285714285714287, | |
| "loss": 0.0825, | |
| "mean_token_accuracy": 0.9731343150138855, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.11165463179349899, | |
| "learning_rate": 0.00015873015873015873, | |
| "loss": 0.0556, | |
| "mean_token_accuracy": 0.9801632344722748, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "grad_norm": 0.11061853170394897, | |
| "learning_rate": 0.00017460317460317462, | |
| "loss": 0.0411, | |
| "mean_token_accuracy": 0.9841950356960296, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 0.08642830699682236, | |
| "learning_rate": 0.00019047619047619048, | |
| "loss": 0.0391, | |
| "mean_token_accuracy": 0.9843509018421173, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "grad_norm": 0.10681991279125214, | |
| "learning_rate": 0.00019999375039475277, | |
| "loss": 0.0353, | |
| "mean_token_accuracy": 0.9855763733386993, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 0.09516558051109314, | |
| "learning_rate": 0.0001999234513064475, | |
| "loss": 0.0315, | |
| "mean_token_accuracy": 0.9871177613735199, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.0980728268623352, | |
| "learning_rate": 0.00019977509622105233, | |
| "loss": 0.0297, | |
| "mean_token_accuracy": 0.9878179967403412, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 0.06371135264635086, | |
| "learning_rate": 0.0001995488010273198, | |
| "loss": 0.0294, | |
| "mean_token_accuracy": 0.9879972219467164, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.136, | |
| "grad_norm": 0.07989838719367981, | |
| "learning_rate": 0.00019924474249753655, | |
| "loss": 0.0283, | |
| "mean_token_accuracy": 0.9881730616092682, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 0.07586675882339478, | |
| "learning_rate": 0.00019886315814943647, | |
| "loss": 0.0265, | |
| "mean_token_accuracy": 0.9889779150485992, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.152, | |
| "grad_norm": 0.04547739028930664, | |
| "learning_rate": 0.0001984043460606618, | |
| "loss": 0.0273, | |
| "mean_token_accuracy": 0.9885783493518829, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.06169603019952774, | |
| "learning_rate": 0.0001978686646359173, | |
| "loss": 0.0244, | |
| "mean_token_accuracy": 0.9895990073680878, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.168, | |
| "grad_norm": 0.06537225097417831, | |
| "learning_rate": 0.0001972565323269996, | |
| "loss": 0.0257, | |
| "mean_token_accuracy": 0.9891368567943573, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 0.0580211840569973, | |
| "learning_rate": 0.00019656842730592046, | |
| "loss": 0.0248, | |
| "mean_token_accuracy": 0.9892465591430664, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.184, | |
| "grad_norm": 0.050916433334350586, | |
| "learning_rate": 0.0001958048870913786, | |
| "loss": 0.0242, | |
| "mean_token_accuracy": 0.9895266950130462, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 0.05767924338579178, | |
| "learning_rate": 0.0001949665081288729, | |
| "loss": 0.0229, | |
| "mean_token_accuracy": 0.9902631580829621, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.0532953217625618, | |
| "learning_rate": 0.00019405394532478424, | |
| "loss": 0.022, | |
| "mean_token_accuracy": 0.9905503630638123, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 0.06565467268228531, | |
| "learning_rate": 0.00019306791153479006, | |
| "loss": 0.0237, | |
| "mean_token_accuracy": 0.9896982431411743, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.216, | |
| "grad_norm": 0.07220505177974701, | |
| "learning_rate": 0.00019200917700701176, | |
| "loss": 0.0236, | |
| "mean_token_accuracy": 0.9899338781833649, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 0.0682716816663742, | |
| "learning_rate": 0.0001908785687803289, | |
| "loss": 0.0229, | |
| "mean_token_accuracy": 0.9898757636547089, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.232, | |
| "grad_norm": 0.04945652559399605, | |
| "learning_rate": 0.00018967697003833157, | |
| "loss": 0.0231, | |
| "mean_token_accuracy": 0.9899633169174195, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.05297650769352913, | |
| "learning_rate": 0.0001884053194194142, | |
| "loss": 0.0203, | |
| "mean_token_accuracy": 0.9912129640579224, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.248, | |
| "grad_norm": 0.05250829458236694, | |
| "learning_rate": 0.00018706461028355104, | |
| "loss": 0.0206, | |
| "mean_token_accuracy": 0.9913576364517211, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 0.05076967924833298, | |
| "learning_rate": 0.00018565588993632487, | |
| "loss": 0.0252, | |
| "mean_token_accuracy": 0.988889217376709, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.264, | |
| "grad_norm": 0.05826568230986595, | |
| "learning_rate": 0.0001841802588108161, | |
| "loss": 0.0216, | |
| "mean_token_accuracy": 0.990450119972229, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 0.06373272091150284, | |
| "learning_rate": 0.00018263886960799062, | |
| "loss": 0.0226, | |
| "mean_token_accuracy": 0.9901030540466309, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.058064743876457214, | |
| "learning_rate": 0.00018103292639625837, | |
| "loss": 0.0221, | |
| "mean_token_accuracy": 0.9902714133262634, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 0.055855073034763336, | |
| "learning_rate": 0.0001793636836709057, | |
| "loss": 0.0216, | |
| "mean_token_accuracy": 0.9907104849815369, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.296, | |
| "grad_norm": 0.04673806577920914, | |
| "learning_rate": 0.0001776324453741365, | |
| "loss": 0.0204, | |
| "mean_token_accuracy": 0.9914953052997589, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 0.05335243418812752, | |
| "learning_rate": 0.00017584056387648727, | |
| "loss": 0.0225, | |
| "mean_token_accuracy": 0.9900318086147308, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.312, | |
| "grad_norm": 0.0564313605427742, | |
| "learning_rate": 0.0001739894389204122, | |
| "loss": 0.0213, | |
| "mean_token_accuracy": 0.9907339155673981, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.04711662605404854, | |
| "learning_rate": 0.00017208051652686335, | |
| "loss": 0.0214, | |
| "mean_token_accuracy": 0.9910281419754028, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.328, | |
| "grad_norm": 0.04865030199289322, | |
| "learning_rate": 0.00017011528786571969, | |
| "loss": 0.0196, | |
| "mean_token_accuracy": 0.9915309727191925, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 0.065005362033844, | |
| "learning_rate": 0.00016809528809094807, | |
| "loss": 0.0212, | |
| "mean_token_accuracy": 0.9909046471118927, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.344, | |
| "grad_norm": 0.06859762966632843, | |
| "learning_rate": 0.0001660220951414055, | |
| "loss": 0.0208, | |
| "mean_token_accuracy": 0.991067773103714, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 0.062335845082998276, | |
| "learning_rate": 0.00016389732850821966, | |
| "loss": 0.0202, | |
| "mean_token_accuracy": 0.991184014081955, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.0483645424246788, | |
| "learning_rate": 0.0001617226479697105, | |
| "loss": 0.0193, | |
| "mean_token_accuracy": 0.9917209327220917, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 0.04639686644077301, | |
| "learning_rate": 0.00015949975229484134, | |
| "loss": 0.0215, | |
| "mean_token_accuracy": 0.9905673682689666, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.376, | |
| "grad_norm": 0.05585622414946556, | |
| "learning_rate": 0.00015723037791621193, | |
| "loss": 0.0194, | |
| "mean_token_accuracy": 0.9916095018386841, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 0.053755246102809906, | |
| "learning_rate": 0.00015491629757363032, | |
| "loss": 0.0193, | |
| "mean_token_accuracy": 0.9914996027946472, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.392, | |
| "grad_norm": 0.04922570288181305, | |
| "learning_rate": 0.00015255931892932333, | |
| "loss": 0.0196, | |
| "mean_token_accuracy": 0.9919640719890594, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.05820295959711075, | |
| "learning_rate": 0.0001501612831558664, | |
| "loss": 0.0213, | |
| "mean_token_accuracy": 0.9906392514705658, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.408, | |
| "grad_norm": 0.04789372533559799, | |
| "learning_rate": 0.00014772406349793744, | |
| "loss": 0.0194, | |
| "mean_token_accuracy": 0.9916912317276001, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 0.04380471259355545, | |
| "learning_rate": 0.0001452495638090167, | |
| "loss": 0.0185, | |
| "mean_token_accuracy": 0.9918674886226654, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.424, | |
| "grad_norm": 0.04164614900946617, | |
| "learning_rate": 0.00014273971706417647, | |
| "loss": 0.0204, | |
| "mean_token_accuracy": 0.990676760673523, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 0.0481300987303257, | |
| "learning_rate": 0.00014019648385012244, | |
| "loss": 0.0191, | |
| "mean_token_accuracy": 0.9913807928562164, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.0358743742108345, | |
| "learning_rate": 0.00013762185083366556, | |
| "loss": 0.0193, | |
| "mean_token_accuracy": 0.9915093362331391, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 0.04558572173118591, | |
| "learning_rate": 0.00013501782920982184, | |
| "loss": 0.021, | |
| "mean_token_accuracy": 0.9905170977115632, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.456, | |
| "grad_norm": 0.050024427473545074, | |
| "learning_rate": 0.00013238645313075104, | |
| "loss": 0.0206, | |
| "mean_token_accuracy": 0.9908523619174957, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 0.05017619952559471, | |
| "learning_rate": 0.00012972977811676287, | |
| "loss": 0.0176, | |
| "mean_token_accuracy": 0.9921334087848663, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.472, | |
| "grad_norm": 0.03845011815428734, | |
| "learning_rate": 0.00012704987945063068, | |
| "loss": 0.0201, | |
| "mean_token_accuracy": 0.9913134098052978, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.04298843815922737, | |
| "learning_rate": 0.00012434885055646823, | |
| "loss": 0.0175, | |
| "mean_token_accuracy": 0.9922883987426758, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.488, | |
| "grad_norm": 0.05314357951283455, | |
| "learning_rate": 0.00012162880136443447, | |
| "loss": 0.0218, | |
| "mean_token_accuracy": 0.990600174665451, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 0.042282719165086746, | |
| "learning_rate": 0.00011889185666254506, | |
| "loss": 0.0192, | |
| "mean_token_accuracy": 0.991459196805954, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.504, | |
| "grad_norm": 0.05242412909865379, | |
| "learning_rate": 0.00011614015443687722, | |
| "loss": 0.0189, | |
| "mean_token_accuracy": 0.9916914582252503, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 0.05002991855144501, | |
| "learning_rate": 0.0001133758442014651, | |
| "loss": 0.0181, | |
| "mean_token_accuracy": 0.9920280575752258, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.04210834577679634, | |
| "learning_rate": 0.00011060108531918971, | |
| "loss": 0.0194, | |
| "mean_token_accuracy": 0.9914444029331207, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 0.04244352877140045, | |
| "learning_rate": 0.0001078180453149754, | |
| "loss": 0.0185, | |
| "mean_token_accuracy": 0.991835606098175, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.536, | |
| "grad_norm": 0.054087404161691666, | |
| "learning_rate": 0.00010502889818261075, | |
| "loss": 0.0205, | |
| "mean_token_accuracy": 0.990981924533844, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 0.04749380052089691, | |
| "learning_rate": 0.00010223582268651586, | |
| "loss": 0.019, | |
| "mean_token_accuracy": 0.9914271533489227, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.552, | |
| "grad_norm": 0.0451885461807251, | |
| "learning_rate": 9.94410006597835e-05, | |
| "loss": 0.0193, | |
| "mean_token_accuracy": 0.99147869348526, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.04427170008420944, | |
| "learning_rate": 9.66466152998226e-05, | |
| "loss": 0.0171, | |
| "mean_token_accuracy": 0.9927612245082855, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.568, | |
| "grad_norm": 0.04799705743789673, | |
| "learning_rate": 9.385484946293637e-05, | |
| "loss": 0.0206, | |
| "mean_token_accuracy": 0.9906625807285309, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 0.04452817514538765, | |
| "learning_rate": 9.106788395916678e-05, | |
| "loss": 0.019, | |
| "mean_token_accuracy": 0.9919753789901733, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.584, | |
| "grad_norm": 0.05037883296608925, | |
| "learning_rate": 8.828789584873754e-05, | |
| "loss": 0.0188, | |
| "mean_token_accuracy": 0.9917163908481598, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 0.044616032391786575, | |
| "learning_rate": 8.551705674142617e-05, | |
| "loss": 0.0195, | |
| "mean_token_accuracy": 0.9913720488548279, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.04669741541147232, | |
| "learning_rate": 8.275753110019367e-05, | |
| "loss": 0.0176, | |
| "mean_token_accuracy": 0.9922231256961822, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 0.06772830337285995, | |
| "learning_rate": 8.001147455039735e-05, | |
| "loss": 0.0175, | |
| "mean_token_accuracy": 0.9923631131649018, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.616, | |
| "grad_norm": 0.04503108188509941, | |
| "learning_rate": 7.728103219590681e-05, | |
| "loss": 0.0167, | |
| "mean_token_accuracy": 0.9929349243640899, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 0.04398513212800026, | |
| "learning_rate": 7.456833694343906e-05, | |
| "loss": 0.0179, | |
| "mean_token_accuracy": 0.9921640634536744, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.632, | |
| "grad_norm": 0.04398965463042259, | |
| "learning_rate": 7.18755078364214e-05, | |
| "loss": 0.0191, | |
| "mean_token_accuracy": 0.9915706038475036, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.05448702350258827, | |
| "learning_rate": 6.920464839968405e-05, | |
| "loss": 0.017, | |
| "mean_token_accuracy": 0.9924534082412719, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.648, | |
| "grad_norm": 0.050546955317258835, | |
| "learning_rate": 6.65578449962749e-05, | |
| "loss": 0.02, | |
| "mean_token_accuracy": 0.9911172151565552, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 0.0460050068795681, | |
| "learning_rate": 6.393716519768047e-05, | |
| "loss": 0.0178, | |
| "mean_token_accuracy": 0.9920965254306793, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.664, | |
| "grad_norm": 0.04446449503302574, | |
| "learning_rate": 6.134465616872598e-05, | |
| "loss": 0.0166, | |
| "mean_token_accuracy": 0.9923894107341766, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 0.05464813858270645, | |
| "learning_rate": 5.878234306841637e-05, | |
| "loss": 0.0171, | |
| "mean_token_accuracy": 0.9922880589962005, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.06052134186029434, | |
| "learning_rate": 5.62522274679673e-05, | |
| "loss": 0.0175, | |
| "mean_token_accuracy": 0.9923413753509521, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 0.054314225912094116, | |
| "learning_rate": 5.375628578726181e-05, | |
| "loss": 0.0168, | |
| "mean_token_accuracy": 0.9928416252136231, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.696, | |
| "grad_norm": 0.06336328387260437, | |
| "learning_rate": 5.1296467750954314e-05, | |
| "loss": 0.0172, | |
| "mean_token_accuracy": 0.9924824059009552, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 0.060671087354421616, | |
| "learning_rate": 4.8874694865427676e-05, | |
| "loss": 0.0172, | |
| "mean_token_accuracy": 0.9923681735992431, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.712, | |
| "grad_norm": 0.05438563972711563, | |
| "learning_rate": 4.649285891779327e-05, | |
| "loss": 0.0171, | |
| "mean_token_accuracy": 0.9924930989742279, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.05433967337012291, | |
| "learning_rate": 4.415282049810644e-05, | |
| "loss": 0.0173, | |
| "mean_token_accuracy": 0.9924486458301545, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.728, | |
| "grad_norm": 0.03932139277458191, | |
| "learning_rate": 4.1856407545951834e-05, | |
| "loss": 0.0182, | |
| "mean_token_accuracy": 0.9917416453361512, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 0.07568760961294174, | |
| "learning_rate": 3.9605413922533874e-05, | |
| "loss": 0.0179, | |
| "mean_token_accuracy": 0.9920516312122345, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.744, | |
| "grad_norm": 0.03854453191161156, | |
| "learning_rate": 3.740159800938784e-05, | |
| "loss": 0.0151, | |
| "mean_token_accuracy": 0.9933602154254914, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 0.04567549750208855, | |
| "learning_rate": 3.5246681334806175e-05, | |
| "loss": 0.015, | |
| "mean_token_accuracy": 0.9936224162578583, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.0444038100540638, | |
| "learning_rate": 3.3142347229053015e-05, | |
| "loss": 0.0182, | |
| "mean_token_accuracy": 0.9919311761856079, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 0.04453401267528534, | |
| "learning_rate": 3.109023950941736e-05, | |
| "loss": 0.0162, | |
| "mean_token_accuracy": 0.9927919328212738, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.776, | |
| "grad_norm": 0.050479914993047714, | |
| "learning_rate": 2.909196119613218e-05, | |
| "loss": 0.0178, | |
| "mean_token_accuracy": 0.992197722196579, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 0.03968972712755203, | |
| "learning_rate": 2.7149073260162416e-05, | |
| "loss": 0.0174, | |
| "mean_token_accuracy": 0.9922388553619385, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.792, | |
| "grad_norm": 0.0438266284763813, | |
| "learning_rate": 2.5263093403840142e-05, | |
| "loss": 0.0179, | |
| "mean_token_accuracy": 0.9920966625213623, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.035676371306180954, | |
| "learning_rate": 2.3435494875299314e-05, | |
| "loss": 0.0169, | |
| "mean_token_accuracy": 0.9925434589385986, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.808, | |
| "grad_norm": 0.04425378888845444, | |
| "learning_rate": 2.166770531763633e-05, | |
| "loss": 0.0164, | |
| "mean_token_accuracy": 0.9927955329418182, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 0.03838867321610451, | |
| "learning_rate": 1.9961105653695266e-05, | |
| "loss": 0.0173, | |
| "mean_token_accuracy": 0.9923167705535889, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.824, | |
| "grad_norm": 0.05161241069436073, | |
| "learning_rate": 1.8317029007349085e-05, | |
| "loss": 0.0179, | |
| "mean_token_accuracy": 0.9918343544006347, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 0.04114331677556038, | |
| "learning_rate": 1.6736759662119183e-05, | |
| "loss": 0.0167, | |
| "mean_token_accuracy": 0.9925995886325836, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.04665491729974747, | |
| "learning_rate": 1.5221532057947419e-05, | |
| "loss": 0.0169, | |
| "mean_token_accuracy": 0.9925190150737763, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 0.0381510928273201, | |
| "learning_rate": 1.3772529826903269e-05, | |
| "loss": 0.0153, | |
| "mean_token_accuracy": 0.9932702004909515, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.856, | |
| "grad_norm": 0.04139837622642517, | |
| "learning_rate": 1.23908848685804e-05, | |
| "loss": 0.016, | |
| "mean_token_accuracy": 0.993154889345169, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 0.04805777966976166, | |
| "learning_rate": 1.1077676465904208e-05, | |
| "loss": 0.0172, | |
| "mean_token_accuracy": 0.992431515455246, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.872, | |
| "grad_norm": 0.051602400839328766, | |
| "learning_rate": 9.833930442041506e-06, | |
| "loss": 0.0167, | |
| "mean_token_accuracy": 0.9925341963768005, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.044222284108400345, | |
| "learning_rate": 8.660618359070604e-06, | |
| "loss": 0.0169, | |
| "mean_token_accuracy": 0.99241544008255, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.888, | |
| "grad_norm": 0.038402166217565536, | |
| "learning_rate": 7.558656759037797e-06, | |
| "loss": 0.016, | |
| "mean_token_accuracy": 0.9928807556629181, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 0.048669859766960144, | |
| "learning_rate": 6.528906447993288e-06, | |
| "loss": 0.0183, | |
| "mean_token_accuracy": 0.9919456124305726, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.904, | |
| "grad_norm": 0.03222226724028587, | |
| "learning_rate": 5.572171823565797e-06, | |
| "loss": 0.0171, | |
| "mean_token_accuracy": 0.9923150539398193, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 0.04830151051282883, | |
| "learning_rate": 4.689200246600867e-06, | |
| "loss": 0.0177, | |
| "mean_token_accuracy": 0.991916960477829, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.038783151656389236, | |
| "learning_rate": 3.880681457354118e-06, | |
| "loss": 0.0158, | |
| "mean_token_accuracy": 0.9929914236068725, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 0.03586906939744949, | |
| "learning_rate": 3.1472470366950334e-06, | |
| "loss": 0.0157, | |
| "mean_token_accuracy": 0.9929950356483459, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.936, | |
| "grad_norm": 0.047617629170417786, | |
| "learning_rate": 2.4894699127426367e-06, | |
| "loss": 0.0167, | |
| "mean_token_accuracy": 0.9928344428539276, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 0.03604697436094284, | |
| "learning_rate": 1.907863913318153e-06, | |
| "loss": 0.0161, | |
| "mean_token_accuracy": 0.9927676558494568, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.952, | |
| "grad_norm": 0.04133192077279091, | |
| "learning_rate": 1.4028833645643113e-06, | |
| "loss": 0.0173, | |
| "mean_token_accuracy": 0.9923523128032684, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.04649489372968674, | |
| "learning_rate": 9.749227360448143e-07, | |
| "loss": 0.0163, | |
| "mean_token_accuracy": 0.9930159628391266, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.968, | |
| "grad_norm": 0.05105933919548988, | |
| "learning_rate": 6.243163326014267e-07, | |
| "loss": 0.0167, | |
| "mean_token_accuracy": 0.9926475822925568, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 0.0360567681491375, | |
| "learning_rate": 3.5133803320896994e-07, | |
| "loss": 0.0162, | |
| "mean_token_accuracy": 0.9927246868610382, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.984, | |
| "grad_norm": 0.050918541848659515, | |
| "learning_rate": 1.562010770326916e-07, | |
| "loss": 0.0184, | |
| "mean_token_accuracy": 0.9916555285453796, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 0.04419026896357536, | |
| "learning_rate": 3.905789685471062e-08, | |
| "loss": 0.0177, | |
| "mean_token_accuracy": 0.9921371281147003, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.04455499351024628, | |
| "learning_rate": 0.0, | |
| "loss": 0.0164, | |
| "mean_token_accuracy": 0.9929368138313294, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 625, | |
| "total_flos": 8.947190570627891e+16, | |
| "train_loss": 0.03501472600698471, | |
| "train_runtime": 1355.0569, | |
| "train_samples_per_second": 3.69, | |
| "train_steps_per_second": 0.461 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 625, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 8.947190570627891e+16, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |