checkpoint-110-lora / trainer_state.json
phee27's picture
Upload folder using huggingface_hub
e229d07 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.972972972972973,
"eval_steps": 55,
"global_step": 110,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0,
"eval_loss": 6.015130996704102,
"eval_runtime": 175.115,
"eval_samples_per_second": 0.571,
"eval_steps_per_second": 0.074,
"memory/device_reserved (GiB)": 57.3,
"memory/max_active (GiB)": 43.52,
"memory/max_allocated (GiB)": 43.52,
"step": 0
},
{
"epoch": 0.018018018018018018,
"grad_norm": 0.601690411567688,
"learning_rate": 0.0,
"loss": 5.8902,
"memory/device_reserved (GiB)": 57.78,
"memory/max_active (GiB)": 52.56,
"memory/max_allocated (GiB)": 52.56,
"step": 1,
"tokens_per_second_per_gpu": 177.27
},
{
"epoch": 0.036036036036036036,
"grad_norm": 0.5987619161605835,
"learning_rate": 1.818181818181818e-06,
"loss": 5.9238,
"memory/device_reserved (GiB)": 57.83,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 2,
"tokens_per_second_per_gpu": 32.34
},
{
"epoch": 0.05405405405405406,
"grad_norm": 0.6140171885490417,
"learning_rate": 3.636363636363636e-06,
"loss": 6.108,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 3,
"tokens_per_second_per_gpu": 32.29
},
{
"epoch": 0.07207207207207207,
"grad_norm": 0.6393939256668091,
"learning_rate": 5.4545454545454545e-06,
"loss": 6.0509,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 4,
"tokens_per_second_per_gpu": 34.19
},
{
"epoch": 0.09009009009009009,
"grad_norm": 0.6186049580574036,
"learning_rate": 7.272727272727272e-06,
"loss": 6.0799,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 5,
"tokens_per_second_per_gpu": 48.39
},
{
"epoch": 0.10810810810810811,
"grad_norm": 0.6133891344070435,
"learning_rate": 9.090909090909091e-06,
"loss": 6.0418,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 6,
"tokens_per_second_per_gpu": 35.74
},
{
"epoch": 0.12612612612612611,
"grad_norm": 0.6060707569122314,
"learning_rate": 1.0909090909090909e-05,
"loss": 5.9976,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 7,
"tokens_per_second_per_gpu": 42.46
},
{
"epoch": 0.14414414414414414,
"grad_norm": 0.6184359192848206,
"learning_rate": 1.2727272727272727e-05,
"loss": 6.0328,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 8,
"tokens_per_second_per_gpu": 38.38
},
{
"epoch": 0.16216216216216217,
"grad_norm": 0.6444172859191895,
"learning_rate": 1.4545454545454545e-05,
"loss": 5.9618,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 9,
"tokens_per_second_per_gpu": 26.99
},
{
"epoch": 0.18018018018018017,
"grad_norm": 0.6325266361236572,
"learning_rate": 1.6363636363636366e-05,
"loss": 6.1674,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 10,
"tokens_per_second_per_gpu": 29.93
},
{
"epoch": 0.1981981981981982,
"grad_norm": 0.6881551146507263,
"learning_rate": 1.8181818181818182e-05,
"loss": 5.9809,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 11,
"tokens_per_second_per_gpu": 33.12
},
{
"epoch": 0.21621621621621623,
"grad_norm": 0.6200075745582581,
"learning_rate": 2e-05,
"loss": 5.9737,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 12,
"tokens_per_second_per_gpu": 33.23
},
{
"epoch": 0.23423423423423423,
"grad_norm": 0.6758139133453369,
"learning_rate": 2.1818181818181818e-05,
"loss": 5.9377,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 13,
"tokens_per_second_per_gpu": 31.95
},
{
"epoch": 0.25225225225225223,
"grad_norm": 0.6687948107719421,
"learning_rate": 2.3636363636363637e-05,
"loss": 6.0492,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 14,
"tokens_per_second_per_gpu": 31.88
},
{
"epoch": 0.2702702702702703,
"grad_norm": 0.6872105002403259,
"learning_rate": 2.5454545454545454e-05,
"loss": 6.1111,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 15,
"tokens_per_second_per_gpu": 40.32
},
{
"epoch": 0.2882882882882883,
"grad_norm": 0.7107413411140442,
"learning_rate": 2.7272727272727273e-05,
"loss": 5.9434,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 16,
"tokens_per_second_per_gpu": 31.11
},
{
"epoch": 0.3063063063063063,
"grad_norm": 0.7199774384498596,
"learning_rate": 2.909090909090909e-05,
"loss": 6.0871,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 17,
"tokens_per_second_per_gpu": 40.84
},
{
"epoch": 0.32432432432432434,
"grad_norm": 0.7394067049026489,
"learning_rate": 3.090909090909091e-05,
"loss": 5.8008,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 18,
"tokens_per_second_per_gpu": 52.0
},
{
"epoch": 0.34234234234234234,
"grad_norm": 0.7493309378623962,
"learning_rate": 3.272727272727273e-05,
"loss": 5.8617,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 19,
"tokens_per_second_per_gpu": 35.37
},
{
"epoch": 0.36036036036036034,
"grad_norm": 0.7457339763641357,
"learning_rate": 3.454545454545455e-05,
"loss": 5.7479,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 20,
"tokens_per_second_per_gpu": 35.34
},
{
"epoch": 0.3783783783783784,
"grad_norm": 0.7670865058898926,
"learning_rate": 3.6363636363636364e-05,
"loss": 5.7939,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 21,
"tokens_per_second_per_gpu": 37.74
},
{
"epoch": 0.3963963963963964,
"grad_norm": 0.7689312100410461,
"learning_rate": 3.818181818181819e-05,
"loss": 5.7546,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 22,
"tokens_per_second_per_gpu": 34.11
},
{
"epoch": 0.4144144144144144,
"grad_norm": 0.7929359674453735,
"learning_rate": 4e-05,
"loss": 5.8247,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 23,
"tokens_per_second_per_gpu": 22.92
},
{
"epoch": 0.43243243243243246,
"grad_norm": 0.7598868012428284,
"learning_rate": 4.181818181818182e-05,
"loss": 5.6557,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 24,
"tokens_per_second_per_gpu": 29.09
},
{
"epoch": 0.45045045045045046,
"grad_norm": 0.7897383570671082,
"learning_rate": 4.3636363636363636e-05,
"loss": 5.6503,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 25,
"tokens_per_second_per_gpu": 43.23
},
{
"epoch": 0.46846846846846846,
"grad_norm": 0.8077855706214905,
"learning_rate": 4.545454545454546e-05,
"loss": 5.7059,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 26,
"tokens_per_second_per_gpu": 38.97
},
{
"epoch": 0.4864864864864865,
"grad_norm": 0.7965295910835266,
"learning_rate": 4.7272727272727275e-05,
"loss": 5.5299,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 27,
"tokens_per_second_per_gpu": 29.02
},
{
"epoch": 0.5045045045045045,
"grad_norm": 0.7723223567008972,
"learning_rate": 4.909090909090909e-05,
"loss": 5.442,
"memory/device_reserved (GiB)": 57.85,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 28,
"tokens_per_second_per_gpu": 25.82
},
{
"epoch": 0.5225225225225225,
"grad_norm": 0.7679380178451538,
"learning_rate": 5.090909090909091e-05,
"loss": 5.3683,
"memory/device_reserved (GiB)": 57.86,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 29,
"tokens_per_second_per_gpu": 36.19
},
{
"epoch": 0.5405405405405406,
"grad_norm": 0.7431294322013855,
"learning_rate": 5.272727272727272e-05,
"loss": 5.2314,
"memory/device_reserved (GiB)": 57.86,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 30,
"tokens_per_second_per_gpu": 48.12
},
{
"epoch": 0.5585585585585585,
"grad_norm": 0.7318829298019409,
"learning_rate": 5.4545454545454546e-05,
"loss": 5.2971,
"memory/device_reserved (GiB)": 57.86,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 31,
"tokens_per_second_per_gpu": 30.47
},
{
"epoch": 0.5765765765765766,
"grad_norm": 0.6973780393600464,
"learning_rate": 5.636363636363636e-05,
"loss": 5.0413,
"memory/device_reserved (GiB)": 57.86,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 32,
"tokens_per_second_per_gpu": 39.86
},
{
"epoch": 0.5945945945945946,
"grad_norm": 0.7133749127388,
"learning_rate": 5.818181818181818e-05,
"loss": 5.1071,
"memory/device_reserved (GiB)": 57.86,
"memory/max_active (GiB)": 52.63,
"memory/max_allocated (GiB)": 52.63,
"step": 33,
"tokens_per_second_per_gpu": 23.77
},
{
"epoch": 0.6126126126126126,
"grad_norm": 0.6688926219940186,
"learning_rate": 6e-05,
"loss": 4.906,
"memory/device_reserved (GiB)": 57.86,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 34,
"tokens_per_second_per_gpu": 24.91
},
{
"epoch": 0.6306306306306306,
"grad_norm": 0.6534309983253479,
"learning_rate": 6.181818181818182e-05,
"loss": 4.9764,
"memory/device_reserved (GiB)": 57.88,
"memory/max_active (GiB)": 52.64,
"memory/max_allocated (GiB)": 52.64,
"step": 35,
"tokens_per_second_per_gpu": 33.75
},
{
"epoch": 0.6486486486486487,
"grad_norm": 0.6284618377685547,
"learning_rate": 6.363636363636364e-05,
"loss": 4.8235,
"memory/device_reserved (GiB)": 57.91,
"memory/max_active (GiB)": 52.64,
"memory/max_allocated (GiB)": 52.64,
"step": 36,
"tokens_per_second_per_gpu": 32.66
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.5952987670898438,
"learning_rate": 6.545454545454546e-05,
"loss": 4.7779,
"memory/device_reserved (GiB)": 57.91,
"memory/max_active (GiB)": 52.63,
"memory/max_allocated (GiB)": 52.63,
"step": 37,
"tokens_per_second_per_gpu": 28.28
},
{
"epoch": 0.6846846846846847,
"grad_norm": 0.6216407418251038,
"learning_rate": 6.727272727272727e-05,
"loss": 4.7969,
"memory/device_reserved (GiB)": 57.94,
"memory/max_active (GiB)": 52.64,
"memory/max_allocated (GiB)": 52.64,
"step": 38,
"tokens_per_second_per_gpu": 30.94
},
{
"epoch": 0.7027027027027027,
"grad_norm": 0.5679822564125061,
"learning_rate": 6.90909090909091e-05,
"loss": 4.7705,
"memory/device_reserved (GiB)": 57.95,
"memory/max_active (GiB)": 52.63,
"memory/max_allocated (GiB)": 52.63,
"step": 39,
"tokens_per_second_per_gpu": 39.6
},
{
"epoch": 0.7207207207207207,
"grad_norm": 0.5590559244155884,
"learning_rate": 7.090909090909092e-05,
"loss": 4.7452,
"memory/device_reserved (GiB)": 57.95,
"memory/max_active (GiB)": 52.63,
"memory/max_allocated (GiB)": 52.63,
"step": 40,
"tokens_per_second_per_gpu": 37.24
},
{
"epoch": 0.7387387387387387,
"grad_norm": 0.5368968844413757,
"learning_rate": 7.272727272727273e-05,
"loss": 4.7783,
"memory/device_reserved (GiB)": 57.95,
"memory/max_active (GiB)": 52.63,
"memory/max_allocated (GiB)": 52.63,
"step": 41,
"tokens_per_second_per_gpu": 28.42
},
{
"epoch": 0.7567567567567568,
"grad_norm": 0.5522942543029785,
"learning_rate": 7.454545454545455e-05,
"loss": 4.7181,
"memory/device_reserved (GiB)": 57.95,
"memory/max_active (GiB)": 52.64,
"memory/max_allocated (GiB)": 52.64,
"step": 42,
"tokens_per_second_per_gpu": 32.13
},
{
"epoch": 0.7747747747747747,
"grad_norm": 0.4933941066265106,
"learning_rate": 7.636363636363637e-05,
"loss": 4.8171,
"memory/device_reserved (GiB)": 57.95,
"memory/max_active (GiB)": 52.63,
"memory/max_allocated (GiB)": 52.63,
"step": 43,
"tokens_per_second_per_gpu": 24.91
},
{
"epoch": 0.7927927927927928,
"grad_norm": 0.487724244594574,
"learning_rate": 7.818181818181818e-05,
"loss": 4.5267,
"memory/device_reserved (GiB)": 57.95,
"memory/max_active (GiB)": 52.64,
"memory/max_allocated (GiB)": 52.64,
"step": 44,
"tokens_per_second_per_gpu": 38.94
},
{
"epoch": 0.8108108108108109,
"grad_norm": 0.484387069940567,
"learning_rate": 8e-05,
"loss": 4.4651,
"memory/device_reserved (GiB)": 57.95,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 45,
"tokens_per_second_per_gpu": 28.12
},
{
"epoch": 0.8288288288288288,
"grad_norm": 0.47269508242607117,
"learning_rate": 8.181818181818183e-05,
"loss": 4.4782,
"memory/device_reserved (GiB)": 57.95,
"memory/max_active (GiB)": 52.63,
"memory/max_allocated (GiB)": 52.63,
"step": 46,
"tokens_per_second_per_gpu": 37.47
},
{
"epoch": 0.8468468468468469,
"grad_norm": 0.4501938819885254,
"learning_rate": 8.363636363636364e-05,
"loss": 4.2217,
"memory/device_reserved (GiB)": 57.95,
"memory/max_active (GiB)": 52.64,
"memory/max_allocated (GiB)": 52.64,
"step": 47,
"tokens_per_second_per_gpu": 36.52
},
{
"epoch": 0.8648648648648649,
"grad_norm": 0.4240598678588867,
"learning_rate": 8.545454545454545e-05,
"loss": 4.2584,
"memory/device_reserved (GiB)": 57.95,
"memory/max_active (GiB)": 52.63,
"memory/max_allocated (GiB)": 52.63,
"step": 48,
"tokens_per_second_per_gpu": 25.0
},
{
"epoch": 0.8828828828828829,
"grad_norm": 0.4062960743904114,
"learning_rate": 8.727272727272727e-05,
"loss": 4.3664,
"memory/device_reserved (GiB)": 57.95,
"memory/max_active (GiB)": 52.66,
"memory/max_allocated (GiB)": 52.66,
"step": 49,
"tokens_per_second_per_gpu": 34.05
},
{
"epoch": 0.9009009009009009,
"grad_norm": 0.4040940999984741,
"learning_rate": 8.90909090909091e-05,
"loss": 4.31,
"memory/device_reserved (GiB)": 57.95,
"memory/max_active (GiB)": 52.63,
"memory/max_allocated (GiB)": 52.63,
"step": 50,
"tokens_per_second_per_gpu": 31.81
},
{
"epoch": 0.918918918918919,
"grad_norm": 0.38634198904037476,
"learning_rate": 9.090909090909092e-05,
"loss": 4.1829,
"memory/device_reserved (GiB)": 57.95,
"memory/max_active (GiB)": 52.64,
"memory/max_allocated (GiB)": 52.64,
"step": 51,
"tokens_per_second_per_gpu": 31.96
},
{
"epoch": 0.9369369369369369,
"grad_norm": 0.4119090139865875,
"learning_rate": 9.272727272727273e-05,
"loss": 4.211,
"memory/device_reserved (GiB)": 57.95,
"memory/max_active (GiB)": 52.64,
"memory/max_allocated (GiB)": 52.64,
"step": 52,
"tokens_per_second_per_gpu": 26.32
},
{
"epoch": 0.954954954954955,
"grad_norm": 0.39360716938972473,
"learning_rate": 9.454545454545455e-05,
"loss": 4.1027,
"memory/device_reserved (GiB)": 57.95,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 53,
"tokens_per_second_per_gpu": 36.47
},
{
"epoch": 0.972972972972973,
"grad_norm": 0.358804851770401,
"learning_rate": 9.636363636363637e-05,
"loss": 4.1262,
"memory/device_reserved (GiB)": 57.95,
"memory/max_active (GiB)": 52.63,
"memory/max_allocated (GiB)": 52.63,
"step": 54,
"tokens_per_second_per_gpu": 35.0
},
{
"epoch": 0.990990990990991,
"grad_norm": 0.3619638681411743,
"learning_rate": 9.818181818181818e-05,
"loss": 3.9671,
"memory/device_reserved (GiB)": 57.95,
"memory/max_active (GiB)": 52.64,
"memory/max_allocated (GiB)": 52.64,
"step": 55,
"tokens_per_second_per_gpu": 21.34
},
{
"epoch": 0.990990990990991,
"eval_loss": 4.040650367736816,
"eval_runtime": 179.0976,
"eval_samples_per_second": 0.558,
"eval_steps_per_second": 0.073,
"memory/device_reserved (GiB)": 57.95,
"memory/max_active (GiB)": 43.5,
"memory/max_allocated (GiB)": 43.5,
"step": 55
},
{
"epoch": 1.0,
"grad_norm": 0.3906014859676361,
"learning_rate": 0.0001,
"loss": 4.1367,
"memory/device_reserved (GiB)": 57.88,
"memory/max_active (GiB)": 52.55,
"memory/max_allocated (GiB)": 52.55,
"step": 56,
"tokens_per_second_per_gpu": 28.38
},
{
"epoch": 1.018018018018018,
"grad_norm": 0.34430640935897827,
"learning_rate": 0.00010181818181818181,
"loss": 3.9357,
"memory/device_reserved (GiB)": 57.91,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 57,
"tokens_per_second_per_gpu": 26.74
},
{
"epoch": 1.0360360360360361,
"grad_norm": 0.348283588886261,
"learning_rate": 0.00010363636363636364,
"loss": 3.9594,
"memory/device_reserved (GiB)": 57.91,
"memory/max_active (GiB)": 52.63,
"memory/max_allocated (GiB)": 52.63,
"step": 58,
"tokens_per_second_per_gpu": 25.8
},
{
"epoch": 1.054054054054054,
"grad_norm": 0.3484898507595062,
"learning_rate": 0.00010545454545454545,
"loss": 4.0163,
"memory/device_reserved (GiB)": 57.91,
"memory/max_active (GiB)": 52.63,
"memory/max_allocated (GiB)": 52.63,
"step": 59,
"tokens_per_second_per_gpu": 26.23
},
{
"epoch": 1.072072072072072,
"grad_norm": 0.3627394735813141,
"learning_rate": 0.00010727272727272728,
"loss": 3.9347,
"memory/device_reserved (GiB)": 57.91,
"memory/max_active (GiB)": 52.64,
"memory/max_allocated (GiB)": 52.64,
"step": 60,
"tokens_per_second_per_gpu": 32.7
},
{
"epoch": 1.09009009009009,
"grad_norm": 0.3439123034477234,
"learning_rate": 0.00010909090909090909,
"loss": 3.9091,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.64,
"memory/max_allocated (GiB)": 52.64,
"step": 61,
"tokens_per_second_per_gpu": 46.09
},
{
"epoch": 1.1081081081081081,
"grad_norm": 0.34011831879615784,
"learning_rate": 0.00011090909090909092,
"loss": 3.8579,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.63,
"memory/max_allocated (GiB)": 52.63,
"step": 62,
"tokens_per_second_per_gpu": 34.82
},
{
"epoch": 1.1261261261261262,
"grad_norm": 0.3363277018070221,
"learning_rate": 0.00011272727272727272,
"loss": 3.8762,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 63,
"tokens_per_second_per_gpu": 41.68
},
{
"epoch": 1.1441441441441442,
"grad_norm": 0.30976247787475586,
"learning_rate": 0.00011454545454545456,
"loss": 3.8585,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.63,
"memory/max_allocated (GiB)": 52.63,
"step": 64,
"tokens_per_second_per_gpu": 37.27
},
{
"epoch": 1.1621621621621623,
"grad_norm": 0.3248283565044403,
"learning_rate": 0.00011636363636363636,
"loss": 3.7179,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 65,
"tokens_per_second_per_gpu": 26.17
},
{
"epoch": 1.1801801801801801,
"grad_norm": 0.3173442482948303,
"learning_rate": 0.0001181818181818182,
"loss": 3.8197,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 66,
"tokens_per_second_per_gpu": 28.41
},
{
"epoch": 1.1981981981981982,
"grad_norm": 0.33076199889183044,
"learning_rate": 0.00012,
"loss": 3.6631,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 67,
"tokens_per_second_per_gpu": 32.3
},
{
"epoch": 1.2162162162162162,
"grad_norm": 0.32531851530075073,
"learning_rate": 0.00012181818181818183,
"loss": 3.6563,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 68,
"tokens_per_second_per_gpu": 32.11
},
{
"epoch": 1.2342342342342343,
"grad_norm": 0.295604944229126,
"learning_rate": 0.00012363636363636364,
"loss": 3.6487,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 69,
"tokens_per_second_per_gpu": 31.08
},
{
"epoch": 1.2522522522522523,
"grad_norm": 0.3253607749938965,
"learning_rate": 0.00012545454545454546,
"loss": 3.741,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 70,
"tokens_per_second_per_gpu": 29.52
},
{
"epoch": 1.2702702702702702,
"grad_norm": 0.28945258259773254,
"learning_rate": 0.00012727272727272728,
"loss": 3.6727,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 71,
"tokens_per_second_per_gpu": 38.43
},
{
"epoch": 1.2882882882882882,
"grad_norm": 0.287298321723938,
"learning_rate": 0.0001290909090909091,
"loss": 3.5821,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 72,
"tokens_per_second_per_gpu": 29.55
},
{
"epoch": 1.3063063063063063,
"grad_norm": 0.26835423707962036,
"learning_rate": 0.00013090909090909093,
"loss": 3.648,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 73,
"tokens_per_second_per_gpu": 37.82
},
{
"epoch": 1.3243243243243243,
"grad_norm": 0.27674639225006104,
"learning_rate": 0.00013272727272727275,
"loss": 3.4623,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 74,
"tokens_per_second_per_gpu": 48.99
},
{
"epoch": 1.3423423423423424,
"grad_norm": 0.28284698724746704,
"learning_rate": 0.00013454545454545455,
"loss": 3.4366,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 75,
"tokens_per_second_per_gpu": 32.8
},
{
"epoch": 1.3603603603603602,
"grad_norm": 0.2780005931854248,
"learning_rate": 0.00013636363636363637,
"loss": 3.4308,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 76,
"tokens_per_second_per_gpu": 33.52
},
{
"epoch": 1.3783783783783785,
"grad_norm": 0.2978385388851166,
"learning_rate": 0.0001381818181818182,
"loss": 3.4822,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 77,
"tokens_per_second_per_gpu": 35.74
},
{
"epoch": 1.3963963963963963,
"grad_norm": 0.28048908710479736,
"learning_rate": 0.00014,
"loss": 3.4922,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.62,
"memory/max_allocated (GiB)": 52.62,
"step": 78,
"tokens_per_second_per_gpu": 32.76
},
{
"epoch": 1.4144144144144144,
"grad_norm": 0.2921410799026489,
"learning_rate": 0.00014181818181818184,
"loss": 3.5634,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 79,
"tokens_per_second_per_gpu": 22.05
},
{
"epoch": 1.4324324324324325,
"grad_norm": 0.28046825528144836,
"learning_rate": 0.00014363636363636363,
"loss": 3.4562,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 80,
"tokens_per_second_per_gpu": 27.93
},
{
"epoch": 1.4504504504504505,
"grad_norm": 0.28950053453445435,
"learning_rate": 0.00014545454545454546,
"loss": 3.4771,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 81,
"tokens_per_second_per_gpu": 43.19
},
{
"epoch": 1.4684684684684686,
"grad_norm": 0.2990242838859558,
"learning_rate": 0.00014727272727272728,
"loss": 3.4552,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 82,
"tokens_per_second_per_gpu": 39.02
},
{
"epoch": 1.4864864864864864,
"grad_norm": 0.3110749125480652,
"learning_rate": 0.0001490909090909091,
"loss": 3.3635,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 83,
"tokens_per_second_per_gpu": 28.78
},
{
"epoch": 1.5045045045045045,
"grad_norm": 0.2659832537174225,
"learning_rate": 0.0001509090909090909,
"loss": 3.309,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 84,
"tokens_per_second_per_gpu": 26.52
},
{
"epoch": 1.5225225225225225,
"grad_norm": 0.2891514003276825,
"learning_rate": 0.00015272727272727275,
"loss": 3.2953,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 85,
"tokens_per_second_per_gpu": 37.61
},
{
"epoch": 1.5405405405405406,
"grad_norm": 0.2862309217453003,
"learning_rate": 0.00015454545454545454,
"loss": 3.3016,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 86,
"tokens_per_second_per_gpu": 49.89
},
{
"epoch": 1.5585585585585586,
"grad_norm": 0.3269289433956146,
"learning_rate": 0.00015636363636363637,
"loss": 3.4022,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 87,
"tokens_per_second_per_gpu": 30.66
},
{
"epoch": 1.5765765765765765,
"grad_norm": 0.2758469581604004,
"learning_rate": 0.0001581818181818182,
"loss": 3.1596,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 88,
"tokens_per_second_per_gpu": 39.43
},
{
"epoch": 1.5945945945945947,
"grad_norm": 0.2842893600463867,
"learning_rate": 0.00016,
"loss": 3.2368,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 89,
"tokens_per_second_per_gpu": 23.64
},
{
"epoch": 1.6126126126126126,
"grad_norm": 0.27873268723487854,
"learning_rate": 0.00016181818181818184,
"loss": 3.1778,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 90,
"tokens_per_second_per_gpu": 24.52
},
{
"epoch": 1.6306306306306306,
"grad_norm": 0.25983887910842896,
"learning_rate": 0.00016363636363636366,
"loss": 3.2287,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 91,
"tokens_per_second_per_gpu": 34.4
},
{
"epoch": 1.6486486486486487,
"grad_norm": 0.2840956151485443,
"learning_rate": 0.00016545454545454545,
"loss": 3.1411,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 92,
"tokens_per_second_per_gpu": 33.89
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.2628091275691986,
"learning_rate": 0.00016727272727272728,
"loss": 3.1159,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 93,
"tokens_per_second_per_gpu": 29.27
},
{
"epoch": 1.6846846846846848,
"grad_norm": 0.2681942582130432,
"learning_rate": 0.0001690909090909091,
"loss": 3.1647,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 94,
"tokens_per_second_per_gpu": 30.7
},
{
"epoch": 1.7027027027027026,
"grad_norm": 0.2515859603881836,
"learning_rate": 0.0001709090909090909,
"loss": 3.1587,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 95,
"tokens_per_second_per_gpu": 40.07
},
{
"epoch": 1.7207207207207207,
"grad_norm": 0.2735103666782379,
"learning_rate": 0.00017272727272727275,
"loss": 3.1537,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 96,
"tokens_per_second_per_gpu": 37.91
},
{
"epoch": 1.7387387387387387,
"grad_norm": 0.24973994493484497,
"learning_rate": 0.00017454545454545454,
"loss": 3.2266,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 97,
"tokens_per_second_per_gpu": 28.61
},
{
"epoch": 1.7567567567567568,
"grad_norm": 0.26508864760398865,
"learning_rate": 0.00017636363636363637,
"loss": 3.135,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 98,
"tokens_per_second_per_gpu": 34.76
},
{
"epoch": 1.7747747747747749,
"grad_norm": 0.2922559678554535,
"learning_rate": 0.0001781818181818182,
"loss": 3.359,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 99,
"tokens_per_second_per_gpu": 26.34
},
{
"epoch": 1.7927927927927927,
"grad_norm": 0.2632916271686554,
"learning_rate": 0.00018,
"loss": 3.1131,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 100,
"tokens_per_second_per_gpu": 42.17
},
{
"epoch": 1.810810810810811,
"grad_norm": 0.2974204123020172,
"learning_rate": 0.00018181818181818183,
"loss": 3.1127,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 101,
"tokens_per_second_per_gpu": 30.71
},
{
"epoch": 1.8288288288288288,
"grad_norm": 0.28947019577026367,
"learning_rate": 0.00018363636363636366,
"loss": 3.1019,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 102,
"tokens_per_second_per_gpu": 40.45
},
{
"epoch": 1.8468468468468469,
"grad_norm": 0.29779183864593506,
"learning_rate": 0.00018545454545454545,
"loss": 2.8855,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 103,
"tokens_per_second_per_gpu": 38.82
},
{
"epoch": 1.864864864864865,
"grad_norm": 0.27393272519111633,
"learning_rate": 0.00018727272727272728,
"loss": 2.9937,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 104,
"tokens_per_second_per_gpu": 26.96
},
{
"epoch": 1.8828828828828827,
"grad_norm": 0.28197985887527466,
"learning_rate": 0.0001890909090909091,
"loss": 3.1189,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 105,
"tokens_per_second_per_gpu": 37.87
},
{
"epoch": 1.900900900900901,
"grad_norm": 0.27397748827934265,
"learning_rate": 0.00019090909090909092,
"loss": 3.0858,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.59,
"memory/max_allocated (GiB)": 52.59,
"step": 106,
"tokens_per_second_per_gpu": 35.13
},
{
"epoch": 1.9189189189189189,
"grad_norm": 0.274027943611145,
"learning_rate": 0.00019272727272727274,
"loss": 2.9537,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.59,
"memory/max_allocated (GiB)": 52.59,
"step": 107,
"tokens_per_second_per_gpu": 33.48
},
{
"epoch": 1.936936936936937,
"grad_norm": 0.2898459732532501,
"learning_rate": 0.00019454545454545457,
"loss": 2.9996,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.6,
"memory/max_allocated (GiB)": 52.6,
"step": 108,
"tokens_per_second_per_gpu": 27.65
},
{
"epoch": 1.954954954954955,
"grad_norm": 0.2991600036621094,
"learning_rate": 0.00019636363636363636,
"loss": 2.9123,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 109,
"tokens_per_second_per_gpu": 39.01
},
{
"epoch": 1.972972972972973,
"grad_norm": 0.27946925163269043,
"learning_rate": 0.00019818181818181821,
"loss": 3.0439,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 52.61,
"memory/max_allocated (GiB)": 52.61,
"step": 110,
"tokens_per_second_per_gpu": 35.44
},
{
"epoch": 1.972972972972973,
"eval_loss": 2.936415672302246,
"eval_runtime": 158.8267,
"eval_samples_per_second": 0.63,
"eval_steps_per_second": 0.082,
"memory/device_reserved (GiB)": 57.92,
"memory/max_active (GiB)": 43.5,
"memory/max_allocated (GiB)": 43.5,
"step": 110
}
],
"logging_steps": 1,
"max_steps": 1100,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 55,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0902735727406088e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}