{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.3294117647058825,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.023529411764705882,
      "grad_norm": 6.661525726318359,
      "learning_rate": 4e-05,
      "loss": 3.1501,
      "step": 1
    },
    {
      "epoch": 0.047058823529411764,
      "grad_norm": 7.423152446746826,
      "learning_rate": 8e-05,
      "loss": 3.4519,
      "step": 2
    },
    {
      "epoch": 0.07058823529411765,
      "grad_norm": 7.344629764556885,
      "learning_rate": 0.00012,
      "loss": 3.2048,
      "step": 3
    },
    {
      "epoch": 0.09411764705882353,
      "grad_norm": 8.129940032958984,
      "learning_rate": 0.00016,
      "loss": 3.1225,
      "step": 4
    },
    {
      "epoch": 0.11764705882352941,
      "grad_norm": 6.877574443817139,
      "learning_rate": 0.0002,
      "loss": 2.9515,
      "step": 5
    },
    {
      "epoch": 0.1411764705882353,
      "grad_norm": 4.28478479385376,
      "learning_rate": 0.00019990453460620527,
      "loss": 2.1975,
      "step": 6
    },
    {
      "epoch": 0.16470588235294117,
      "grad_norm": 3.4814071655273438,
      "learning_rate": 0.00019980906921241053,
      "loss": 2.0311,
      "step": 7
    },
    {
      "epoch": 0.18823529411764706,
      "grad_norm": 2.5042951107025146,
      "learning_rate": 0.00019971360381861576,
      "loss": 1.7222,
      "step": 8
    },
    {
      "epoch": 0.21176470588235294,
      "grad_norm": 10.543612480163574,
      "learning_rate": 0.00019961813842482102,
      "loss": 1.601,
      "step": 9
    },
    {
      "epoch": 0.23529411764705882,
      "grad_norm": 2.4977686405181885,
      "learning_rate": 0.00019952267303102625,
      "loss": 1.6609,
      "step": 10
    },
    {
      "epoch": 0.25882352941176473,
      "grad_norm": 2.499239921569824,
      "learning_rate": 0.0001994272076372315,
      "loss": 1.4279,
      "step": 11
    },
    {
      "epoch": 0.2823529411764706,
      "grad_norm": 1.3605753183364868,
      "learning_rate": 0.00019933174224343676,
      "loss": 1.4378,
      "step": 12
    },
    {
      "epoch": 0.3058823529411765,
      "grad_norm": 2.2948458194732666,
      "learning_rate": 0.00019923627684964202,
      "loss": 1.4096,
      "step": 13
    },
    {
      "epoch": 0.32941176470588235,
      "grad_norm": 1.9090344905853271,
      "learning_rate": 0.00019914081145584725,
      "loss": 1.38,
      "step": 14
    },
    {
      "epoch": 0.35294117647058826,
      "grad_norm": 1.3216297626495361,
      "learning_rate": 0.0001990453460620525,
      "loss": 1.0872,
      "step": 15
    },
    {
      "epoch": 0.3764705882352941,
      "grad_norm": 1.299048900604248,
      "learning_rate": 0.00019894988066825777,
      "loss": 1.0297,
      "step": 16
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.3004589080810547,
      "learning_rate": 0.00019885441527446303,
      "loss": 0.9944,
      "step": 17
    },
    {
      "epoch": 0.4235294117647059,
      "grad_norm": 1.202727198600769,
      "learning_rate": 0.00019875894988066829,
      "loss": 1.07,
      "step": 18
    },
    {
      "epoch": 0.4470588235294118,
      "grad_norm": 1.67063570022583,
      "learning_rate": 0.00019866348448687352,
      "loss": 1.1953,
      "step": 19
    },
    {
      "epoch": 0.47058823529411764,
      "grad_norm": 1.3096171617507935,
      "learning_rate": 0.00019856801909307875,
      "loss": 0.9428,
      "step": 20
    },
    {
      "epoch": 0.49411764705882355,
      "grad_norm": 1.459935188293457,
      "learning_rate": 0.000198472553699284,
      "loss": 1.0548,
      "step": 21
    },
    {
      "epoch": 0.5176470588235295,
      "grad_norm": 1.392005443572998,
      "learning_rate": 0.00019837708830548927,
      "loss": 0.9306,
      "step": 22
    },
    {
      "epoch": 0.5411764705882353,
      "grad_norm": 1.4283812046051025,
      "learning_rate": 0.00019828162291169452,
      "loss": 0.9121,
      "step": 23
    },
    {
      "epoch": 0.5647058823529412,
      "grad_norm": 1.283262014389038,
      "learning_rate": 0.00019818615751789978,
      "loss": 0.9064,
      "step": 24
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 1.330345869064331,
      "learning_rate": 0.000198090692124105,
      "loss": 0.8008,
      "step": 25
    },
    {
      "epoch": 0.611764705882353,
      "grad_norm": 1.3282525539398193,
      "learning_rate": 0.00019799522673031027,
      "loss": 0.7321,
      "step": 26
    },
    {
      "epoch": 0.6352941176470588,
      "grad_norm": 1.2584058046340942,
      "learning_rate": 0.00019789976133651553,
      "loss": 0.7673,
      "step": 27
    },
    {
      "epoch": 0.6588235294117647,
      "grad_norm": 1.2578104734420776,
      "learning_rate": 0.00019780429594272076,
      "loss": 0.7981,
      "step": 28
    },
    {
      "epoch": 0.6823529411764706,
      "grad_norm": 1.4390983581542969,
      "learning_rate": 0.00019770883054892602,
      "loss": 0.8325,
      "step": 29
    },
    {
      "epoch": 0.7058823529411765,
      "grad_norm": 1.6025103330612183,
      "learning_rate": 0.00019761336515513128,
      "loss": 0.8138,
      "step": 30
    },
    {
      "epoch": 0.7294117647058823,
      "grad_norm": 1.480338215827942,
      "learning_rate": 0.00019751789976133654,
      "loss": 0.8751,
      "step": 31
    },
    {
      "epoch": 0.7529411764705882,
      "grad_norm": 1.266036868095398,
      "learning_rate": 0.00019742243436754177,
      "loss": 0.7173,
      "step": 32
    },
    {
      "epoch": 0.7764705882352941,
      "grad_norm": 1.233924150466919,
      "learning_rate": 0.00019732696897374703,
      "loss": 0.6959,
      "step": 33
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.1584521532058716,
      "learning_rate": 0.00019723150357995228,
      "loss": 0.6162,
      "step": 34
    },
    {
      "epoch": 0.8235294117647058,
      "grad_norm": 1.252702236175537,
      "learning_rate": 0.00019713603818615754,
      "loss": 0.5258,
      "step": 35
    },
    {
      "epoch": 0.8470588235294118,
      "grad_norm": 1.3157094717025757,
      "learning_rate": 0.0001970405727923628,
      "loss": 0.7223,
      "step": 36
    },
    {
      "epoch": 0.8705882352941177,
      "grad_norm": 1.310499668121338,
      "learning_rate": 0.00019694510739856803,
      "loss": 0.6344,
      "step": 37
    },
    {
      "epoch": 0.8941176470588236,
      "grad_norm": 1.346976399421692,
      "learning_rate": 0.00019684964200477326,
      "loss": 0.7265,
      "step": 38
    },
    {
      "epoch": 0.9176470588235294,
      "grad_norm": 1.2368614673614502,
      "learning_rate": 0.00019675417661097852,
      "loss": 0.6884,
      "step": 39
    },
    {
      "epoch": 0.9411764705882353,
      "grad_norm": 1.3998883962631226,
      "learning_rate": 0.00019665871121718378,
      "loss": 0.7219,
      "step": 40
    },
    {
      "epoch": 0.9647058823529412,
      "grad_norm": 1.2332427501678467,
      "learning_rate": 0.00019656324582338904,
      "loss": 0.7444,
      "step": 41
    },
    {
      "epoch": 0.9882352941176471,
      "grad_norm": 1.0393794775009155,
      "learning_rate": 0.0001964677804295943,
      "loss": 0.5779,
      "step": 42
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.7419483661651611,
      "learning_rate": 0.00019637231503579953,
      "loss": 0.6714,
      "step": 43
    },
    {
      "epoch": 1.0235294117647058,
      "grad_norm": 1.0514390468597412,
      "learning_rate": 0.00019627684964200479,
      "loss": 0.5436,
      "step": 44
    },
    {
      "epoch": 1.0470588235294118,
      "grad_norm": 0.9511624574661255,
      "learning_rate": 0.00019618138424821004,
      "loss": 0.4626,
      "step": 45
    },
    {
      "epoch": 1.0705882352941176,
      "grad_norm": 1.508288025856018,
      "learning_rate": 0.00019608591885441527,
      "loss": 0.5104,
      "step": 46
    },
    {
      "epoch": 1.0941176470588236,
      "grad_norm": 1.4369169473648071,
      "learning_rate": 0.00019599045346062053,
      "loss": 0.6062,
      "step": 47
    },
    {
      "epoch": 1.1176470588235294,
      "grad_norm": 1.1856215000152588,
      "learning_rate": 0.0001958949880668258,
      "loss": 0.5779,
      "step": 48
    },
    {
      "epoch": 1.1411764705882352,
      "grad_norm": 1.317847490310669,
      "learning_rate": 0.00019579952267303102,
      "loss": 0.5041,
      "step": 49
    },
    {
      "epoch": 1.1647058823529413,
      "grad_norm": 1.2989856004714966,
      "learning_rate": 0.00019570405727923628,
      "loss": 0.615,
      "step": 50
    },
    {
      "epoch": 1.188235294117647,
      "grad_norm": 1.0592188835144043,
      "learning_rate": 0.00019560859188544154,
      "loss": 0.4445,
      "step": 51
    },
    {
      "epoch": 1.2117647058823529,
      "grad_norm": 0.9522223472595215,
      "learning_rate": 0.0001955131264916468,
      "loss": 0.4166,
      "step": 52
    },
    {
      "epoch": 1.2352941176470589,
      "grad_norm": 1.137998104095459,
      "learning_rate": 0.00019541766109785206,
      "loss": 0.5384,
      "step": 53
    },
    {
      "epoch": 1.2588235294117647,
      "grad_norm": 1.0973365306854248,
      "learning_rate": 0.0001953221957040573,
      "loss": 0.4935,
      "step": 54
    },
    {
      "epoch": 1.2823529411764705,
      "grad_norm": 1.0788564682006836,
      "learning_rate": 0.00019522673031026252,
      "loss": 0.4372,
      "step": 55
    },
    {
      "epoch": 1.3058823529411765,
      "grad_norm": 1.1917812824249268,
      "learning_rate": 0.00019513126491646778,
      "loss": 0.5462,
      "step": 56
    },
    {
      "epoch": 1.3294117647058823,
      "grad_norm": 1.1784740686416626,
      "learning_rate": 0.00019503579952267303,
      "loss": 0.5903,
      "step": 57
    },
    {
      "epoch": 1.3529411764705883,
      "grad_norm": 1.0892928838729858,
      "learning_rate": 0.0001949403341288783,
      "loss": 0.4417,
      "step": 58
    },
    {
      "epoch": 1.3764705882352941,
      "grad_norm": 1.2473161220550537,
      "learning_rate": 0.00019484486873508355,
      "loss": 0.6014,
      "step": 59
    },
    {
      "epoch": 1.4,
      "grad_norm": 1.3342103958129883,
      "learning_rate": 0.00019474940334128878,
      "loss": 0.4679,
      "step": 60
    },
    {
      "epoch": 1.423529411764706,
      "grad_norm": 1.2751225233078003,
      "learning_rate": 0.00019465393794749404,
      "loss": 0.5651,
      "step": 61
    },
    {
      "epoch": 1.4470588235294117,
      "grad_norm": 1.012502908706665,
      "learning_rate": 0.0001945584725536993,
      "loss": 0.29,
      "step": 62
    },
    {
      "epoch": 1.4705882352941178,
      "grad_norm": 0.9788743257522583,
      "learning_rate": 0.00019446300715990456,
      "loss": 0.3741,
      "step": 63
    },
    {
      "epoch": 1.4941176470588236,
      "grad_norm": 1.0560024976730347,
      "learning_rate": 0.0001943675417661098,
      "loss": 0.4329,
      "step": 64
    },
    {
      "epoch": 1.5176470588235293,
      "grad_norm": 1.1424912214279175,
      "learning_rate": 0.00019427207637231505,
      "loss": 0.3667,
      "step": 65
    },
    {
      "epoch": 1.5411764705882351,
      "grad_norm": 1.1789878606796265,
      "learning_rate": 0.00019417661097852028,
      "loss": 0.4384,
      "step": 66
    },
    {
      "epoch": 1.5647058823529412,
      "grad_norm": 1.1614022254943848,
      "learning_rate": 0.00019408114558472554,
      "loss": 0.4265,
      "step": 67
    },
    {
      "epoch": 1.5882352941176472,
      "grad_norm": 1.3976482152938843,
      "learning_rate": 0.0001939856801909308,
      "loss": 0.5109,
      "step": 68
    },
    {
      "epoch": 1.611764705882353,
      "grad_norm": 1.1408592462539673,
      "learning_rate": 0.00019389021479713605,
      "loss": 0.4185,
      "step": 69
    },
    {
      "epoch": 1.6352941176470588,
      "grad_norm": 1.242210030555725,
      "learning_rate": 0.0001937947494033413,
      "loss": 0.4089,
      "step": 70
    },
    {
      "epoch": 1.6588235294117646,
      "grad_norm": 1.3533846139907837,
      "learning_rate": 0.00019369928400954654,
      "loss": 0.3804,
      "step": 71
    },
    {
      "epoch": 1.6823529411764706,
      "grad_norm": 1.5269583463668823,
      "learning_rate": 0.0001936038186157518,
      "loss": 0.5357,
      "step": 72
    },
    {
      "epoch": 1.7058823529411766,
      "grad_norm": 2.066941261291504,
      "learning_rate": 0.00019350835322195703,
      "loss": 0.6613,
      "step": 73
    },
    {
      "epoch": 1.7294117647058824,
      "grad_norm": 1.3083280324935913,
      "learning_rate": 0.0001934128878281623,
      "loss": 0.3654,
      "step": 74
    },
    {
      "epoch": 1.7529411764705882,
      "grad_norm": 1.0919277667999268,
      "learning_rate": 0.00019331742243436755,
      "loss": 0.3306,
      "step": 75
    },
    {
      "epoch": 1.776470588235294,
      "grad_norm": 1.1151741743087769,
      "learning_rate": 0.0001932219570405728,
      "loss": 0.3895,
      "step": 76
    },
    {
      "epoch": 1.8,
      "grad_norm": 1.286359190940857,
      "learning_rate": 0.00019312649164677804,
      "loss": 0.4677,
      "step": 77
    },
    {
      "epoch": 1.8235294117647058,
      "grad_norm": 1.03926682472229,
      "learning_rate": 0.0001930310262529833,
      "loss": 0.3884,
      "step": 78
    },
    {
      "epoch": 1.8470588235294119,
      "grad_norm": 1.084747314453125,
      "learning_rate": 0.00019293556085918855,
      "loss": 0.4417,
      "step": 79
    },
    {
      "epoch": 1.8705882352941177,
      "grad_norm": 1.1419929265975952,
      "learning_rate": 0.0001928400954653938,
      "loss": 0.4261,
      "step": 80
    },
    {
      "epoch": 1.8941176470588235,
      "grad_norm": 1.1437164545059204,
      "learning_rate": 0.00019274463007159907,
      "loss": 0.4213,
      "step": 81
    },
    {
      "epoch": 1.9176470588235293,
      "grad_norm": 1.0313295125961304,
      "learning_rate": 0.0001926491646778043,
      "loss": 0.3355,
      "step": 82
    },
    {
      "epoch": 1.9411764705882353,
      "grad_norm": 1.3336807489395142,
      "learning_rate": 0.00019255369928400956,
      "loss": 0.4388,
      "step": 83
    },
    {
      "epoch": 1.9647058823529413,
      "grad_norm": 1.2635568380355835,
      "learning_rate": 0.0001924582338902148,
      "loss": 0.4225,
      "step": 84
    },
    {
      "epoch": 1.988235294117647,
      "grad_norm": 1.1508828401565552,
      "learning_rate": 0.00019236276849642005,
      "loss": 0.3403,
      "step": 85
    },
    {
      "epoch": 2.0,
      "grad_norm": 2.2531745433807373,
      "learning_rate": 0.0001922673031026253,
      "loss": 0.5257,
      "step": 86
    },
    {
      "epoch": 2.023529411764706,
      "grad_norm": 0.7907515168190002,
      "learning_rate": 0.00019217183770883057,
      "loss": 0.1563,
      "step": 87
    },
    {
      "epoch": 2.0470588235294116,
      "grad_norm": 0.8357899188995361,
      "learning_rate": 0.00019207637231503583,
      "loss": 0.2273,
      "step": 88
    },
    {
      "epoch": 2.070588235294118,
      "grad_norm": 1.0556663274765015,
      "learning_rate": 0.00019198090692124106,
      "loss": 0.1731,
      "step": 89
    },
    {
      "epoch": 2.0941176470588236,
      "grad_norm": 0.8954185843467712,
      "learning_rate": 0.00019188544152744631,
      "loss": 0.1575,
      "step": 90
    },
    {
      "epoch": 2.1176470588235294,
      "grad_norm": 1.0291929244995117,
      "learning_rate": 0.00019178997613365155,
      "loss": 0.1907,
      "step": 91
    },
    {
      "epoch": 2.1411764705882352,
      "grad_norm": 1.0664329528808594,
      "learning_rate": 0.0001916945107398568,
      "loss": 0.1823,
      "step": 92
    },
    {
      "epoch": 2.164705882352941,
      "grad_norm": 1.1944317817687988,
      "learning_rate": 0.00019159904534606206,
      "loss": 0.2451,
      "step": 93
    },
    {
      "epoch": 2.1882352941176473,
      "grad_norm": 1.3393555879592896,
      "learning_rate": 0.00019150357995226732,
      "loss": 0.2653,
      "step": 94
    },
    {
      "epoch": 2.211764705882353,
      "grad_norm": 1.3618574142456055,
      "learning_rate": 0.00019140811455847255,
      "loss": 0.1786,
      "step": 95
    },
    {
      "epoch": 2.235294117647059,
      "grad_norm": 1.5982481241226196,
      "learning_rate": 0.0001913126491646778,
      "loss": 0.1967,
      "step": 96
    },
    {
      "epoch": 2.2588235294117647,
      "grad_norm": 1.3369780778884888,
      "learning_rate": 0.00019121718377088307,
      "loss": 0.2106,
      "step": 97
    },
    {
      "epoch": 2.2823529411764705,
      "grad_norm": 1.1570290327072144,
      "learning_rate": 0.00019112171837708833,
      "loss": 0.1624,
      "step": 98
    },
    {
      "epoch": 2.3058823529411763,
      "grad_norm": 1.180243968963623,
      "learning_rate": 0.00019102625298329359,
      "loss": 0.1672,
      "step": 99
    },
    {
      "epoch": 2.3294117647058825,
      "grad_norm": 1.218498945236206,
      "learning_rate": 0.00019093078758949882,
      "loss": 0.2139,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 2100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2583886078581696.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}