Marco-MT-Algharb / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 2485,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04024144869215292,
"grad_norm": 0.8856053732817273,
"learning_rate": 4.999278820633416e-05,
"loss": 0.5826,
"step": 20
},
{
"epoch": 0.08048289738430583,
"grad_norm": 0.7385891532669175,
"learning_rate": 4.996961926996955e-05,
"loss": 0.5449,
"step": 40
},
{
"epoch": 0.12072434607645875,
"grad_norm": 0.72926414566868,
"learning_rate": 4.9930488023869036e-05,
"loss": 0.5618,
"step": 60
},
{
"epoch": 0.16096579476861167,
"grad_norm": 0.6312617479451788,
"learning_rate": 4.987541948343489e-05,
"loss": 0.515,
"step": 80
},
{
"epoch": 0.2012072434607646,
"grad_norm": 0.6639335448923683,
"learning_rate": 4.980444885229155e-05,
"loss": 0.5328,
"step": 100
},
{
"epoch": 0.2414486921529175,
"grad_norm": 0.6557066466913923,
"learning_rate": 4.9717621499781034e-05,
"loss": 0.523,
"step": 120
},
{
"epoch": 0.28169014084507044,
"grad_norm": 0.646966336694488,
"learning_rate": 4.961499293195967e-05,
"loss": 0.5214,
"step": 140
},
{
"epoch": 0.32193158953722334,
"grad_norm": 0.6798500591020816,
"learning_rate": 4.949662875611487e-05,
"loss": 0.5143,
"step": 160
},
{
"epoch": 0.36217303822937624,
"grad_norm": 0.7622327150861464,
"learning_rate": 4.936260463882431e-05,
"loss": 0.507,
"step": 180
},
{
"epoch": 0.4024144869215292,
"grad_norm": 0.7701558072544397,
"learning_rate": 4.921300625758468e-05,
"loss": 0.5098,
"step": 200
},
{
"epoch": 0.4426559356136821,
"grad_norm": 0.6721766598194193,
"learning_rate": 4.9047929246040684e-05,
"loss": 0.4959,
"step": 220
},
{
"epoch": 0.482897384305835,
"grad_norm": 0.5759821922844843,
"learning_rate": 4.886747913284935e-05,
"loss": 0.4786,
"step": 240
},
{
"epoch": 0.5231388329979879,
"grad_norm": 0.6241582936916275,
"learning_rate": 4.8671771274218864e-05,
"loss": 0.4936,
"step": 260
},
{
"epoch": 0.5633802816901409,
"grad_norm": 0.6085499176420257,
"learning_rate": 4.846093078016486e-05,
"loss": 0.4927,
"step": 280
},
{
"epoch": 0.6036217303822937,
"grad_norm": 0.7195639159658046,
"learning_rate": 4.823509243453144e-05,
"loss": 0.4921,
"step": 300
},
{
"epoch": 0.6438631790744467,
"grad_norm": 0.6346981783065212,
"learning_rate": 4.7994400608828007e-05,
"loss": 0.4765,
"step": 320
},
{
"epoch": 0.6841046277665996,
"grad_norm": 0.6760210496604738,
"learning_rate": 4.7739009169937e-05,
"loss": 0.4694,
"step": 340
},
{
"epoch": 0.7243460764587525,
"grad_norm": 0.7309281782293976,
"learning_rate": 4.7469081381751526e-05,
"loss": 0.4693,
"step": 360
},
{
"epoch": 0.7645875251509054,
"grad_norm": 0.5121852796257732,
"learning_rate": 4.7184789800805785e-05,
"loss": 0.4626,
"step": 380
},
{
"epoch": 0.8048289738430584,
"grad_norm": 0.5626013205724073,
"learning_rate": 4.688631616596496e-05,
"loss": 0.463,
"step": 400
},
{
"epoch": 0.8450704225352113,
"grad_norm": 0.6626243373594697,
"learning_rate": 4.657385128224517e-05,
"loss": 0.4781,
"step": 420
},
{
"epoch": 0.8853118712273642,
"grad_norm": 0.5995753685938809,
"learning_rate": 4.624759489883771e-05,
"loss": 0.445,
"step": 440
},
{
"epoch": 0.9255533199195171,
"grad_norm": 0.5780765450132732,
"learning_rate": 4.5907755581415454e-05,
"loss": 0.4626,
"step": 460
},
{
"epoch": 0.96579476861167,
"grad_norm": 0.5478910002284612,
"learning_rate": 4.555455057880334e-05,
"loss": 0.4597,
"step": 480
},
{
"epoch": 1.0060362173038229,
"grad_norm": 0.48568823231381825,
"learning_rate": 4.518820568409781e-05,
"loss": 0.406,
"step": 500
},
{
"epoch": 1.0462776659959758,
"grad_norm": 0.5062057380908122,
"learning_rate": 4.480895509032424e-05,
"loss": 0.1795,
"step": 520
},
{
"epoch": 1.0865191146881288,
"grad_norm": 0.5205860419378047,
"learning_rate": 4.441704124072455e-05,
"loss": 0.1902,
"step": 540
},
{
"epoch": 1.1267605633802817,
"grad_norm": 0.519552686982609,
"learning_rate": 4.40127146737707e-05,
"loss": 0.1751,
"step": 560
},
{
"epoch": 1.1670020120724347,
"grad_norm": 0.519651130331484,
"learning_rate": 4.3596233863003135e-05,
"loss": 0.1847,
"step": 580
},
{
"epoch": 1.2072434607645874,
"grad_norm": 0.48247829692592525,
"learning_rate": 4.316786505179659e-05,
"loss": 0.1861,
"step": 600
},
{
"epoch": 1.2474849094567404,
"grad_norm": 0.4487348974547772,
"learning_rate": 4.27278820831589e-05,
"loss": 0.2026,
"step": 620
},
{
"epoch": 1.2877263581488934,
"grad_norm": 0.41599318043186545,
"learning_rate": 4.227656622467162e-05,
"loss": 0.2016,
"step": 640
},
{
"epoch": 1.3279678068410463,
"grad_norm": 0.4540433485384787,
"learning_rate": 4.181420598868425e-05,
"loss": 0.1933,
"step": 660
},
{
"epoch": 1.3682092555331993,
"grad_norm": 0.5514617072950766,
"learning_rate": 4.13410969478772e-05,
"loss": 0.2015,
"step": 680
},
{
"epoch": 1.408450704225352,
"grad_norm": 0.4605005045885439,
"learning_rate": 4.085754154631125e-05,
"loss": 0.1957,
"step": 700
},
{
"epoch": 1.448692152917505,
"grad_norm": 0.4596046168091651,
"learning_rate": 4.036384890608438e-05,
"loss": 0.2088,
"step": 720
},
{
"epoch": 1.488933601609658,
"grad_norm": 0.5197294768215174,
"learning_rate": 3.9860334629719484e-05,
"loss": 0.208,
"step": 740
},
{
"epoch": 1.529175050301811,
"grad_norm": 0.6580375239377614,
"learning_rate": 3.9347320598409434e-05,
"loss": 0.2124,
"step": 760
},
{
"epoch": 1.5694164989939638,
"grad_norm": 0.4181955378555661,
"learning_rate": 3.8825134766248266e-05,
"loss": 0.2032,
"step": 780
},
{
"epoch": 1.6096579476861166,
"grad_norm": 0.49205001089005324,
"learning_rate": 3.829411095058029e-05,
"loss": 0.2077,
"step": 800
},
{
"epoch": 1.6498993963782698,
"grad_norm": 0.4873486109938958,
"learning_rate": 3.775458861860086e-05,
"loss": 0.197,
"step": 820
},
{
"epoch": 1.6901408450704225,
"grad_norm": 0.5213796288743413,
"learning_rate": 3.720691267034547e-05,
"loss": 0.2024,
"step": 840
},
{
"epoch": 1.7303822937625755,
"grad_norm": 0.5064104251566499,
"learning_rate": 3.665143321820576e-05,
"loss": 0.2046,
"step": 860
},
{
"epoch": 1.7706237424547284,
"grad_norm": 0.5249037742075398,
"learning_rate": 3.6088505363113435e-05,
"loss": 0.2113,
"step": 880
},
{
"epoch": 1.8108651911468812,
"grad_norm": 0.5323135228705763,
"learning_rate": 3.5518488967535144e-05,
"loss": 0.2015,
"step": 900
},
{
"epoch": 1.8511066398390343,
"grad_norm": 0.45702728996195346,
"learning_rate": 3.4941748425423506e-05,
"loss": 0.2091,
"step": 920
},
{
"epoch": 1.891348088531187,
"grad_norm": 0.48980668848273645,
"learning_rate": 3.435865242927119e-05,
"loss": 0.2089,
"step": 940
},
{
"epoch": 1.93158953722334,
"grad_norm": 0.43577327809651956,
"learning_rate": 3.3769573734417256e-05,
"loss": 0.2115,
"step": 960
},
{
"epoch": 1.971830985915493,
"grad_norm": 0.5094162027924247,
"learning_rate": 3.317488892075601e-05,
"loss": 0.2094,
"step": 980
},
{
"epoch": 2.0120724346076457,
"grad_norm": 0.3457362763246621,
"learning_rate": 3.257497815200116e-05,
"loss": 0.1726,
"step": 1000
},
{
"epoch": 2.052313883299799,
"grad_norm": 0.37034508814670536,
"learning_rate": 3.1970224932658735e-05,
"loss": 0.0859,
"step": 1020
},
{
"epoch": 2.0925553319919517,
"grad_norm": 0.37642286093876015,
"learning_rate": 3.136101586286457e-05,
"loss": 0.0862,
"step": 1040
},
{
"epoch": 2.132796780684105,
"grad_norm": 0.3194432979842797,
"learning_rate": 3.0747740391242634e-05,
"loss": 0.0879,
"step": 1060
},
{
"epoch": 2.1730382293762576,
"grad_norm": 0.31686036565206543,
"learning_rate": 3.0130790565942552e-05,
"loss": 0.083,
"step": 1080
},
{
"epoch": 2.2132796780684103,
"grad_norm": 0.48975203689721203,
"learning_rate": 2.9510560784015257e-05,
"loss": 0.0887,
"step": 1100
},
{
"epoch": 2.2535211267605635,
"grad_norm": 0.3970890768904178,
"learning_rate": 2.8887447539287083e-05,
"loss": 0.0938,
"step": 1120
},
{
"epoch": 2.2937625754527162,
"grad_norm": 0.42930287541107787,
"learning_rate": 2.8261849168893462e-05,
"loss": 0.0894,
"step": 1140
},
{
"epoch": 2.3340040241448694,
"grad_norm": 0.40823847069100183,
"learning_rate": 2.763416559863425e-05,
"loss": 0.0868,
"step": 1160
},
{
"epoch": 2.374245472837022,
"grad_norm": 0.42121750405319264,
"learning_rate": 2.7004798087313437e-05,
"loss": 0.0912,
"step": 1180
},
{
"epoch": 2.414486921529175,
"grad_norm": 0.3214982105059483,
"learning_rate": 2.6374148970226774e-05,
"loss": 0.0918,
"step": 1200
},
{
"epoch": 2.454728370221328,
"grad_norm": 0.34082865221526726,
"learning_rate": 2.5742621401961143e-05,
"loss": 0.091,
"step": 1220
},
{
"epoch": 2.494969818913481,
"grad_norm": 0.4793154273361008,
"learning_rate": 2.5110619098670263e-05,
"loss": 0.0864,
"step": 1240
},
{
"epoch": 2.535211267605634,
"grad_norm": 0.39451008511666086,
"learning_rate": 2.447854607999135e-05,
"loss": 0.0927,
"step": 1260
},
{
"epoch": 2.5754527162977867,
"grad_norm": 0.3656682972169247,
"learning_rate": 2.38468064107678e-05,
"loss": 0.0895,
"step": 1280
},
{
"epoch": 2.6156941649899395,
"grad_norm": 0.3605202374697158,
"learning_rate": 2.3215803942742938e-05,
"loss": 0.0953,
"step": 1300
},
{
"epoch": 2.6559356136820926,
"grad_norm": 0.33578430684018057,
"learning_rate": 2.2585942056390058e-05,
"loss": 0.0966,
"step": 1320
},
{
"epoch": 2.6961770623742454,
"grad_norm": 0.3794779373893201,
"learning_rate": 2.195762340304364e-05,
"loss": 0.0915,
"step": 1340
},
{
"epoch": 2.7364185110663986,
"grad_norm": 0.33351508610885644,
"learning_rate": 2.133124964749678e-05,
"loss": 0.0879,
"step": 1360
},
{
"epoch": 2.7766599597585513,
"grad_norm": 0.41067282583568215,
"learning_rate": 2.0707221211229205e-05,
"loss": 0.0979,
"step": 1380
},
{
"epoch": 2.816901408450704,
"grad_norm": 0.4291530941671277,
"learning_rate": 2.008593701643017e-05,
"loss": 0.0946,
"step": 1400
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.3613865415142941,
"learning_rate": 1.9467794230979712e-05,
"loss": 0.0957,
"step": 1420
},
{
"epoch": 2.89738430583501,
"grad_norm": 0.35323941645271395,
"learning_rate": 1.8853188014551533e-05,
"loss": 0.0883,
"step": 1440
},
{
"epoch": 2.937625754527163,
"grad_norm": 0.38038366932243284,
"learning_rate": 1.8242511265999452e-05,
"loss": 0.0944,
"step": 1460
},
{
"epoch": 2.977867203219316,
"grad_norm": 0.4211867067510389,
"learning_rate": 1.7636154372189363e-05,
"loss": 0.0859,
"step": 1480
},
{
"epoch": 3.0181086519114686,
"grad_norm": 0.23971000463797287,
"learning_rate": 1.7034504958436843e-05,
"loss": 0.0686,
"step": 1500
},
{
"epoch": 3.058350100603622,
"grad_norm": 0.3934268138965833,
"learning_rate": 1.643794764071024e-05,
"loss": 0.0387,
"step": 1520
},
{
"epoch": 3.0985915492957745,
"grad_norm": 0.2953008266603751,
"learning_rate": 1.5846863779757492e-05,
"loss": 0.0402,
"step": 1540
},
{
"epoch": 3.1388329979879277,
"grad_norm": 0.30010306958393923,
"learning_rate": 1.5261631237313967e-05,
"loss": 0.0393,
"step": 1560
},
{
"epoch": 3.1790744466800804,
"grad_norm": 0.30120051063507675,
"learning_rate": 1.4682624134547021e-05,
"loss": 0.0363,
"step": 1580
},
{
"epoch": 3.219315895372233,
"grad_norm": 0.2515653178386247,
"learning_rate": 1.4110212612891887e-05,
"loss": 0.0383,
"step": 1600
},
{
"epoch": 3.2595573440643864,
"grad_norm": 0.27249388315493484,
"learning_rate": 1.3544762597431607e-05,
"loss": 0.0396,
"step": 1620
},
{
"epoch": 3.299798792756539,
"grad_norm": 0.3824387788042599,
"learning_rate": 1.2986635562972413e-05,
"loss": 0.0406,
"step": 1640
},
{
"epoch": 3.3400402414486923,
"grad_norm": 0.4190218976563753,
"learning_rate": 1.2436188302963944e-05,
"loss": 0.0439,
"step": 1660
},
{
"epoch": 3.380281690140845,
"grad_norm": 0.2317410734176642,
"learning_rate": 1.1893772701412233e-05,
"loss": 0.0423,
"step": 1680
},
{
"epoch": 3.4205231388329977,
"grad_norm": 0.278591876157434,
"learning_rate": 1.1359735507931035e-05,
"loss": 0.0407,
"step": 1700
},
{
"epoch": 3.460764587525151,
"grad_norm": 0.24119224796247,
"learning_rate": 1.0834418116075484e-05,
"loss": 0.0417,
"step": 1720
},
{
"epoch": 3.5010060362173037,
"grad_norm": 0.2958581734865895,
"learning_rate": 1.0318156345099692e-05,
"loss": 0.0371,
"step": 1740
},
{
"epoch": 3.541247484909457,
"grad_norm": 0.27477669064501764,
"learning_rate": 9.811280225277786e-06,
"loss": 0.0403,
"step": 1760
},
{
"epoch": 3.5814889336016096,
"grad_norm": 0.32430825636968,
"learning_rate": 9.314113786925777e-06,
"loss": 0.0396,
"step": 1780
},
{
"epoch": 3.6217303822937623,
"grad_norm": 0.3312827951095654,
"learning_rate": 8.826974853258884e-06,
"loss": 0.0407,
"step": 1800
},
{
"epoch": 3.6619718309859155,
"grad_norm": 0.24013332678773616,
"learning_rate": 8.35017483721696e-06,
"loss": 0.0414,
"step": 1820
},
{
"epoch": 3.7022132796780687,
"grad_norm": 0.28553574970459594,
"learning_rate": 7.884018542387731e-06,
"loss": 0.0416,
"step": 1840
},
{
"epoch": 3.7424547283702214,
"grad_norm": 0.2631019454087187,
"learning_rate": 7.428803968155307e-06,
"loss": 0.0401,
"step": 1860
},
{
"epoch": 3.782696177062374,
"grad_norm": 0.26131465743997634,
"learning_rate": 6.984822119198253e-06,
"loss": 0.0409,
"step": 1880
},
{
"epoch": 3.822937625754527,
"grad_norm": 0.23335316567031417,
"learning_rate": 6.552356819459354e-06,
"loss": 0.0437,
"step": 1900
},
{
"epoch": 3.86317907444668,
"grad_norm": 0.2745249201872649,
"learning_rate": 6.131684530705572e-06,
"loss": 0.0397,
"step": 1920
},
{
"epoch": 3.9034205231388333,
"grad_norm": 0.2608587874278287,
"learning_rate": 5.7230741757946485e-06,
"loss": 0.0409,
"step": 1940
},
{
"epoch": 3.943661971830986,
"grad_norm": 0.3249419421073309,
"learning_rate": 5.326786966760922e-06,
"loss": 0.0408,
"step": 1960
},
{
"epoch": 3.9839034205231387,
"grad_norm": 0.29014887197719835,
"learning_rate": 4.943076237830541e-06,
"loss": 0.042,
"step": 1980
},
{
"epoch": 4.0241448692152915,
"grad_norm": 0.13140332328630258,
"learning_rate": 4.5721872834726755e-06,
"loss": 0.0285,
"step": 2000
},
{
"epoch": 4.064386317907445,
"grad_norm": 0.1599909046333746,
"learning_rate": 4.214357201590316e-06,
"loss": 0.0163,
"step": 2020
},
{
"epoch": 4.104627766599598,
"grad_norm": 0.22268037817504605,
"learning_rate": 3.869814741950833e-06,
"loss": 0.0169,
"step": 2040
},
{
"epoch": 4.144869215291751,
"grad_norm": 0.19008998119875475,
"learning_rate": 3.5387801599533475e-06,
"loss": 0.0178,
"step": 2060
},
{
"epoch": 4.185110663983903,
"grad_norm": 0.28236212559906976,
"learning_rate": 3.2214650758261854e-06,
"loss": 0.0168,
"step": 2080
},
{
"epoch": 4.225352112676056,
"grad_norm": 0.16411536065892163,
"learning_rate": 2.918072339344585e-06,
"loss": 0.0147,
"step": 2100
},
{
"epoch": 4.26559356136821,
"grad_norm": 0.22329493065983927,
"learning_rate": 2.6287959001550787e-06,
"loss": 0.0172,
"step": 2120
},
{
"epoch": 4.305835010060362,
"grad_norm": 0.23074348370306091,
"learning_rate": 2.3538206837894262e-06,
"loss": 0.0152,
"step": 2140
},
{
"epoch": 4.346076458752515,
"grad_norm": 0.16346465568469334,
"learning_rate": 2.093322473447448e-06,
"loss": 0.0162,
"step": 2160
},
{
"epoch": 4.386317907444668,
"grad_norm": 0.20894982405636947,
"learning_rate": 1.8474677976241973e-06,
"loss": 0.017,
"step": 2180
},
{
"epoch": 4.426559356136821,
"grad_norm": 0.2114272503288231,
"learning_rate": 1.6164138236534287e-06,
"loss": 0.0153,
"step": 2200
},
{
"epoch": 4.466800804828974,
"grad_norm": 0.2549097134831871,
"learning_rate": 1.400308257235347e-06,
"loss": 0.0165,
"step": 2220
},
{
"epoch": 4.507042253521127,
"grad_norm": 0.2629485973872836,
"learning_rate": 1.199289248012911e-06,
"loss": 0.0167,
"step": 2240
},
{
"epoch": 4.54728370221328,
"grad_norm": 0.15948181770721337,
"learning_rate": 1.0134853012569918e-06,
"loss": 0.0147,
"step": 2260
},
{
"epoch": 4.5875251509054324,
"grad_norm": 0.2377257196332741,
"learning_rate": 8.430151957169341e-07,
"loss": 0.0172,
"step": 2280
},
{
"epoch": 4.627766599597585,
"grad_norm": 0.12488849403623657,
"learning_rate": 6.879879076889223e-07,
"loss": 0.0169,
"step": 2300
},
{
"epoch": 4.668008048289739,
"grad_norm": 0.17954160642164035,
"learning_rate": 5.485025413508122e-07,
"loss": 0.0173,
"step": 2320
},
{
"epoch": 4.7082494969818915,
"grad_norm": 0.22632001709175115,
"learning_rate": 4.246482654078565e-07,
"loss": 0.0178,
"step": 2340
},
{
"epoch": 4.748490945674044,
"grad_norm": 0.17862035740855048,
"learning_rate": 3.1650425608991397e-07,
"loss": 0.0169,
"step": 2360
},
{
"epoch": 4.788732394366197,
"grad_norm": 0.22768890327395705,
"learning_rate": 2.2413964653651142e-07,
"loss": 0.0182,
"step": 2380
},
{
"epoch": 4.82897384305835,
"grad_norm": 0.18617516446042512,
"learning_rate": 1.476134826021436e-07,
"loss": 0.0159,
"step": 2400
},
{
"epoch": 4.869215291750503,
"grad_norm": 0.22313146556884694,
"learning_rate": 8.697468511008888e-08,
"loss": 0.0174,
"step": 2420
},
{
"epoch": 4.909456740442656,
"grad_norm": 0.17950462343068288,
"learning_rate": 4.226201857882584e-08,
"loss": 0.017,
"step": 2440
},
{
"epoch": 4.949698189134809,
"grad_norm": 0.20930529588954816,
"learning_rate": 1.3504066441069608e-08,
"loss": 0.0153,
"step": 2460
},
{
"epoch": 4.989939637826962,
"grad_norm": 0.18167014719982993,
"learning_rate": 7.192127712579711e-10,
"loss": 0.0171,
"step": 2480
}
],
"logging_steps": 20,
"max_steps": 2485,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 173976693145600.0,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}
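
Below is a minimal, illustrative sketch (not part of the checkpoint itself) of how the state above could be loaded and its log_history inspected. It assumes the file has been saved locally as trainer_state.json and uses only the Python standard library; the field names (step, epoch, loss, learning_rate) match the entries shown above.

import json

# Load the trainer state dumped by the Transformers Trainer.
with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry records epoch, step, loss, learning_rate, grad_norm.
for record in state["log_history"]:
    print(f'step {record["step"]:>5}  epoch {record["epoch"]:.2f}  '
          f'loss {record["loss"]:.4f}  lr {record["learning_rate"]:.2e}')

Run as-is, this prints one line per logged step (every 20 steps here), which makes it easy to see the loss dropping from roughly 0.58 in the first epoch to about 0.017 by epoch 5.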