{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 125,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008,
      "grad_norm": 0.1663825958967209,
      "learning_rate": 1.5384615384615387e-05,
      "loss": 0.5625,
      "mean_token_accuracy": 0.8770151734352112,
      "step": 1
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.16785307228565216,
      "learning_rate": 7.692307692307693e-05,
      "loss": 0.6,
      "mean_token_accuracy": 0.8692665919661522,
      "step": 5
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.1985907405614853,
      "learning_rate": 0.00015384615384615385,
      "loss": 0.556,
      "mean_token_accuracy": 0.8736625671386719,
      "step": 10
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.1878145933151245,
      "learning_rate": 0.00019984268150178167,
      "loss": 0.4238,
      "mean_token_accuracy": 0.8949833571910858,
      "step": 15
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.25863349437713623,
      "learning_rate": 0.00019807852804032305,
      "loss": 0.2429,
      "mean_token_accuracy": 0.9352735817432404,
      "step": 20
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.11830388754606247,
      "learning_rate": 0.00019438833303083678,
      "loss": 0.0911,
      "mean_token_accuracy": 0.9729306638240814,
      "step": 25
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.07387381047010422,
      "learning_rate": 0.00018884456359788724,
      "loss": 0.0519,
      "mean_token_accuracy": 0.9809980034828186,
      "step": 30
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.0662246122956276,
      "learning_rate": 0.00018155608689592604,
      "loss": 0.0388,
      "mean_token_accuracy": 0.9845828533172607,
      "step": 35
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.06684456765651703,
      "learning_rate": 0.0001726660322034027,
      "loss": 0.0359,
      "mean_token_accuracy": 0.9851168274879456,
      "step": 40
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.0632498562335968,
      "learning_rate": 0.00016234898018587337,
      "loss": 0.0354,
      "mean_token_accuracy": 0.9856509447097779,
      "step": 45
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.04182787984609604,
      "learning_rate": 0.00015080753452465296,
      "loss": 0.0351,
      "mean_token_accuracy": 0.9855603039264679,
      "step": 50
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.03820296376943588,
      "learning_rate": 0.000138268343236509,
      "loss": 0.0296,
      "mean_token_accuracy": 0.9877731561660766,
      "step": 55
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.041676074266433716,
      "learning_rate": 0.0001249776478167227,
      "loss": 0.0305,
      "mean_token_accuracy": 0.9872902452945709,
      "step": 60
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.04178238287568092,
      "learning_rate": 0.00011119644761033078,
      "loss": 0.0269,
      "mean_token_accuracy": 0.9890636622905731,
      "step": 65
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.0332472063601017,
      "learning_rate": 9.719537437241312e-05,
      "loss": 0.0287,
      "mean_token_accuracy": 0.9877880275249481,
      "step": 70
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.04028412699699402,
      "learning_rate": 8.324937766952638e-05,
      "loss": 0.0289,
      "mean_token_accuracy": 0.9878171920776367,
      "step": 75
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.030627723783254623,
      "learning_rate": 6.963232548903853e-05,
      "loss": 0.028,
      "mean_token_accuracy": 0.9883454322814942,
      "step": 80
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.051511090248823166,
      "learning_rate": 5.6611626088244194e-05,
      "loss": 0.0291,
      "mean_token_accuracy": 0.9880700409412384,
      "step": 85
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.056407369673252106,
      "learning_rate": 4.444297669803981e-05,
      "loss": 0.0308,
      "mean_token_accuracy": 0.9871262729167938,
      "step": 90
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.031498316675424576,
      "learning_rate": 3.336534220479961e-05,
      "loss": 0.0274,
      "mean_token_accuracy": 0.9887665092945099,
      "step": 95
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.029312707483768463,
      "learning_rate": 2.3596262417839255e-05,
      "loss": 0.029,
      "mean_token_accuracy": 0.987632954120636,
      "step": 100
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.029115818440914154,
      "learning_rate": 1.5327580077171587e-05,
      "loss": 0.0305,
      "mean_token_accuracy": 0.9873630583286286,
      "step": 105
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.03585318848490715,
      "learning_rate": 8.72167349386811e-06,
      "loss": 0.0284,
      "mean_token_accuracy": 0.9883174896240234,
      "step": 110
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.029363730922341347,
      "learning_rate": 3.908267805490051e-06,
      "loss": 0.0294,
      "mean_token_accuracy": 0.9876586437225342,
      "step": 115
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.05330374091863632,
      "learning_rate": 9.818874663554357e-07,
      "loss": 0.0294,
      "mean_token_accuracy": 0.9879511296749115,
      "step": 120
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.023399341851472855,
      "learning_rate": 0.0,
      "loss": 0.0288,
      "mean_token_accuracy": 0.9879041314125061,
      "step": 125
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.0284133218228817,
      "eval_mean_token_accuracy": 0.988107846736908,
      "eval_runtime": 162.3836,
      "eval_samples_per_second": 6.158,
      "eval_steps_per_second": 0.77,
      "step": 125
    },
    {
      "epoch": 1.0,
      "step": 125,
      "total_flos": 1.6328360057634816e+16,
      "train_loss": 0.10154794347286224,
      "train_runtime": 724.5002,
      "train_samples_per_second": 1.38,
      "train_steps_per_second": 0.173
    }
  ],
  "logging_steps": 5,
  "max_steps": 125,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.6328360057634816e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}