barc0
/

transduction-10k-seed50-instruct-fft_lr1e-5_epoch2

+---
+library_name: transformers
+license: llama3.1
+base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
+tags:
+- trl
+- sft
+- generated_from_trainer
+model-index:
+- name: transduction-10k-seed50-instruct-fft_lr1e-5_epoch2
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# transduction-10k-seed50-instruct-fft_lr1e-5_epoch2
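+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) on a dataset whose name was not recorded by the Trainer (9,863 training samples, per `train_samples` in `all_results.json`).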
+It achieves the following results on the evaluation set:
+- Loss: 0.0693
+## Model description
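+A supervised fine-tune of [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct), tagged `trl` and `sft` in this card's metadata and trained for 2 epochs with the hyperparameters listed below. More information needed on the task and the model's intended behavior.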
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
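+The dataset is not identified in the training metadata. The run logged 9,863 training samples (`train_samples` in `all_results.json`); evaluation loss was measured at the end of each epoch (steps 77 and 154). More information needed on the source and composition of the training and evaluation sets.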
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 1e-05
+- train_batch_size: 8
+- eval_batch_size: 4
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 128
+- total_eval_batch_size: 32
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 2
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 0.0681        | 0.9935 | 77   | 0.0811          |
+| 0.0505        | 1.9871 | 154  | 0.0693          |
+### Framework versions
+- Transformers 4.45.0.dev0
+- Pytorch 2.4.0+cu121
+- Datasets 3.0.1
+- Tokenizers 0.19.1

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 1.9870967741935484,
+    "total_flos": 5.361988883508429e+16,
+    "train_loss": 0.08745080387437498,
+    "train_runtime": 2120.7101,
+    "train_samples": 9863,
+    "train_samples_per_second": 9.302,
+    "train_steps_per_second": 0.073
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "bos_token_id": 128000,
+  "do_sample": true,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "temperature": 0.6,
+  "top_p": 0.9,
+  "transformers_version": "4.45.0.dev0"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 1.9870967741935484,
+    "total_flos": 5.361988883508429e+16,
+    "train_loss": 0.08745080387437498,
+    "train_runtime": 2120.7101,
+    "train_samples": 9863,
+    "train_samples_per_second": 9.302,
+    "train_steps_per_second": 0.073
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1136 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.9870967741935484,
+  "eval_steps": 500,
+  "global_step": 154,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.012903225806451613,
+      "grad_norm": 11.699292902761476,
+      "learning_rate": 6.25e-07,
+      "loss": 0.3091,
+      "step": 1
+    },
+    {
+      "epoch": 0.025806451612903226,
+      "grad_norm": 10.2656902023561,
+      "learning_rate": 1.25e-06,
+      "loss": 0.2644,
+      "step": 2
+    },
+    {
+      "epoch": 0.03870967741935484,
+      "grad_norm": 10.961480180598883,
+      "learning_rate": 1.8750000000000003e-06,
+      "loss": 0.3136,
+      "step": 3
+    },
+    {
+      "epoch": 0.05161290322580645,
+      "grad_norm": 10.621965020575766,
+      "learning_rate": 2.5e-06,
+      "loss": 0.2649,
+      "step": 4
+    },
+    {
+      "epoch": 0.06451612903225806,
+      "grad_norm": 7.63251523866155,
+      "learning_rate": 3.125e-06,
+      "loss": 0.2248,
+      "step": 5
+    },
+    {
+      "epoch": 0.07741935483870968,
+      "grad_norm": 4.399939556445864,
+      "learning_rate": 3.7500000000000005e-06,
+      "loss": 0.1872,
+      "step": 6
+    },
+    {
+      "epoch": 0.09032258064516129,
+      "grad_norm": 3.6018933396960233,
+      "learning_rate": 4.3750000000000005e-06,
+      "loss": 0.1277,
+      "step": 7
+    },
+    {
+      "epoch": 0.1032258064516129,
+      "grad_norm": 2.350482184168289,
+      "learning_rate": 5e-06,
+      "loss": 0.1321,
+      "step": 8
+    },
+    {
+      "epoch": 0.11612903225806452,
+      "grad_norm": 3.6577146283118167,
+      "learning_rate": 5.625e-06,
+      "loss": 0.1401,
+      "step": 9
+    },
+    {
+      "epoch": 0.12903225806451613,
+      "grad_norm": 3.095671993174613,
+      "learning_rate": 6.25e-06,
+      "loss": 0.1247,
+      "step": 10
+    },
+    {
+      "epoch": 0.14193548387096774,
+      "grad_norm": 1.8930944522483173,
+      "learning_rate": 6.875e-06,
+      "loss": 0.1243,
+      "step": 11
+    },
+    {
+      "epoch": 0.15483870967741936,
+      "grad_norm": 3.4213993152717372,
+      "learning_rate": 7.500000000000001e-06,
+      "loss": 0.1183,
+      "step": 12
+    },
+    {
+      "epoch": 0.16774193548387098,
+      "grad_norm": 3.0014464259254146,
+      "learning_rate": 8.125000000000001e-06,
+      "loss": 0.1319,
+      "step": 13
+    },
+    {
+      "epoch": 0.18064516129032257,
+      "grad_norm": 2.846735403976308,
+      "learning_rate": 8.750000000000001e-06,
+      "loss": 0.1158,
+      "step": 14
+    },
+    {
+      "epoch": 0.1935483870967742,
+      "grad_norm": 2.7768857827987614,
+      "learning_rate": 9.375000000000001e-06,
+      "loss": 0.1053,
+      "step": 15
+    },
+    {
+      "epoch": 0.2064516129032258,
+      "grad_norm": 2.559286174315675,
+      "learning_rate": 1e-05,
+      "loss": 0.1351,
+      "step": 16
+    },
+    {
+      "epoch": 0.21935483870967742,
+      "grad_norm": 2.793791051458797,
+      "learning_rate": 9.998704424206747e-06,
+      "loss": 0.1532,
+      "step": 17
+    },
+    {
+      "epoch": 0.23225806451612904,
+      "grad_norm": 2.011717818438971,
+      "learning_rate": 9.994818368233639e-06,
+      "loss": 0.1041,
+      "step": 18
+    },
+    {
+      "epoch": 0.24516129032258063,
+      "grad_norm": 2.44195378918142,
+      "learning_rate": 9.988343845952697e-06,
+      "loss": 0.1074,
+      "step": 19
+    },
+    {
+      "epoch": 0.25806451612903225,
+      "grad_norm": 1.6874842592386545,
+      "learning_rate": 9.979284212657658e-06,
+      "loss": 0.1034,
+      "step": 20
+    },
+    {
+      "epoch": 0.2709677419354839,
+      "grad_norm": 1.3503672327570895,
+      "learning_rate": 9.967644163325157e-06,
+      "loss": 0.0971,
+      "step": 21
+    },
+    {
+      "epoch": 0.2838709677419355,
+      "grad_norm": 3.534445687027426,
+      "learning_rate": 9.953429730181653e-06,
+      "loss": 0.122,
+      "step": 22
+    },
+    {
+      "epoch": 0.2967741935483871,
+      "grad_norm": 2.731723348296494,
+      "learning_rate": 9.93664827957735e-06,
+      "loss": 0.1402,
+      "step": 23
+    },
+    {
+      "epoch": 0.3096774193548387,
+      "grad_norm": 2.516223822228138,
+      "learning_rate": 9.917308508168712e-06,
+      "loss": 0.1093,
+      "step": 24
+    },
+    {
+      "epoch": 0.3225806451612903,
+      "grad_norm": 1.7606357719981414,
+      "learning_rate": 9.895420438411616e-06,
+      "loss": 0.1231,
+      "step": 25
+    },
+    {
+      "epoch": 0.33548387096774196,
+      "grad_norm": 2.4360379684882654,
+      "learning_rate": 9.870995413367397e-06,
+      "loss": 0.1113,
+      "step": 26
+    },
+    {
+      "epoch": 0.34838709677419355,
+      "grad_norm": 1.1166024452341257,
+      "learning_rate": 9.844046090824533e-06,
+      "loss": 0.0946,
+      "step": 27
+    },
+    {
+      "epoch": 0.36129032258064514,
+      "grad_norm": 3.095834517141857,
+      "learning_rate": 9.814586436738998e-06,
+      "loss": 0.1032,
+      "step": 28
+    },
+    {
+      "epoch": 0.3741935483870968,
+      "grad_norm": 3.200925995404148,
+      "learning_rate": 9.782631717996675e-06,
+      "loss": 0.1637,
+      "step": 29
+    },
+    {
+      "epoch": 0.3870967741935484,
+      "grad_norm": 2.7067354639721333,
+      "learning_rate": 9.748198494501598e-06,
+      "loss": 0.1326,
+      "step": 30
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.9657792494045345,
+      "learning_rate": 9.711304610594104e-06,
+      "loss": 0.0985,
+      "step": 31
+    },
+    {
+      "epoch": 0.4129032258064516,
+      "grad_norm": 1.2418236122465918,
+      "learning_rate": 9.671969185803357e-06,
+      "loss": 0.0866,
+      "step": 32
+    },
+    {
+      "epoch": 0.4258064516129032,
+      "grad_norm": 1.4668414290482827,
+      "learning_rate": 9.630212604939026e-06,
+      "loss": 0.091,
+      "step": 33
+    },
+    {
+      "epoch": 0.43870967741935485,
+      "grad_norm": 1.5674054362390097,
+      "learning_rate": 9.586056507527266e-06,
+      "loss": 0.1166,
+      "step": 34
+    },
+    {
+      "epoch": 0.45161290322580644,
+      "grad_norm": 1.7105084425031547,
+      "learning_rate": 9.539523776596446e-06,
+      "loss": 0.121,
+      "step": 35
+    },
+    {
+      "epoch": 0.4645161290322581,
+      "grad_norm": 1.3111865574883335,
+      "learning_rate": 9.490638526818482e-06,
+      "loss": 0.0835,
+      "step": 36
+    },
+    {
+      "epoch": 0.4774193548387097,
+      "grad_norm": 1.0229500712234056,
+      "learning_rate": 9.439426092011877e-06,
+      "loss": 0.0947,
+      "step": 37
+    },
+    {
+      "epoch": 0.49032258064516127,
+      "grad_norm": 1.2556959616077457,
+      "learning_rate": 9.385913012012972e-06,
+      "loss": 0.0911,
+      "step": 38
+    },
+    {
+      "epoch": 0.5032258064516129,
+      "grad_norm": 1.0982611947906955,
+      "learning_rate": 9.330127018922195e-06,
+      "loss": 0.105,
+      "step": 39
+    },
+    {
+      "epoch": 0.5161290322580645,
+      "grad_norm": 1.0216693563567487,
+      "learning_rate": 9.272097022732444e-06,
+      "loss": 0.0839,
+      "step": 40
+    },
+    {
+      "epoch": 0.5290322580645161,
+      "grad_norm": 0.887997821512209,
+      "learning_rate": 9.211853096347059e-06,
+      "loss": 0.0823,
+      "step": 41
+    },
+    {
+      "epoch": 0.5419354838709678,
+      "grad_norm": 1.5086680290000998,
+      "learning_rate": 9.149426459995127e-06,
+      "loss": 0.1165,
+      "step": 42
+    },
+    {
+      "epoch": 0.5548387096774193,
+      "grad_norm": 1.1170270141146377,
+      "learning_rate": 9.08484946505221e-06,
+      "loss": 0.0964,
+      "step": 43
+    },
+    {
+      "epoch": 0.567741935483871,
+      "grad_norm": 0.9096654753927038,
+      "learning_rate": 9.018155577274891e-06,
+      "loss": 0.1003,
+      "step": 44
+    },
+    {
+      "epoch": 0.5806451612903226,
+      "grad_norm": 1.0160863955008763,
+      "learning_rate": 8.949379359457795e-06,
+      "loss": 0.0802,
+      "step": 45
+    },
+    {
+      "epoch": 0.5935483870967742,
+      "grad_norm": 1.0265412727158079,
+      "learning_rate": 8.8785564535221e-06,
+      "loss": 0.0884,
+      "step": 46
+    },
+    {
+      "epoch": 0.6064516129032258,
+      "grad_norm": 1.5105953615602765,
+      "learning_rate": 8.805723562044825e-06,
+      "loss": 0.0855,
+      "step": 47
+    },
+    {
+      "epoch": 0.6193548387096774,
+      "grad_norm": 1.1851230689340553,
+      "learning_rate": 8.730918429238429e-06,
+      "loss": 0.0896,
+      "step": 48
+    },
+    {
+      "epoch": 0.632258064516129,
+      "grad_norm": 0.9547624249115597,
+      "learning_rate": 8.65417982139062e-06,
+      "loss": 0.0895,
+      "step": 49
+    },
+    {
+      "epoch": 0.6451612903225806,
+      "grad_norm": 0.8828392166882096,
+      "learning_rate": 8.575547506774498e-06,
+      "loss": 0.0703,
+      "step": 50
+    },
+    {
+      "epoch": 0.6580645161290323,
+      "grad_norm": 1.5517765054788453,
+      "learning_rate": 8.49506223503941e-06,
+      "loss": 0.0982,
+      "step": 51
+    },
+    {
+      "epoch": 0.6709677419354839,
+      "grad_norm": 0.7610530682010276,
+      "learning_rate": 8.412765716093273e-06,
+      "loss": 0.09,
+      "step": 52
+    },
+    {
+      "epoch": 0.6838709677419355,
+      "grad_norm": 1.2971615645647234,
+      "learning_rate": 8.328700598487203e-06,
+      "loss": 0.0951,
+      "step": 53
+    },
+    {
+      "epoch": 0.6967741935483871,
+      "grad_norm": 0.6667526127324459,
+      "learning_rate": 8.24291044731378e-06,
+      "loss": 0.0624,
+      "step": 54
+    },
+    {
+      "epoch": 0.7096774193548387,
+      "grad_norm": 1.1745372262285316,
+      "learning_rate": 8.155439721630265e-06,
+      "loss": 0.0955,
+      "step": 55
+    },
+    {
+      "epoch": 0.7225806451612903,
+      "grad_norm": 0.8393263149118976,
+      "learning_rate": 8.066333751418582e-06,
+      "loss": 0.0734,
+      "step": 56
+    },
+    {
+      "epoch": 0.7354838709677419,
+      "grad_norm": 0.9883905258578732,
+      "learning_rate": 7.97563871409395e-06,
+      "loss": 0.0753,
+      "step": 57
+    },
+    {
+      "epoch": 0.7483870967741936,
+      "grad_norm": 0.8348230527715342,
+      "learning_rate": 7.883401610574338e-06,
+      "loss": 0.0764,
+      "step": 58
+    },
+    {
+      "epoch": 0.7612903225806451,
+      "grad_norm": 1.0240572576125775,
+      "learning_rate": 7.789670240923169e-06,
+      "loss": 0.1,
+      "step": 59
+    },
+    {
+      "epoch": 0.7741935483870968,
+      "grad_norm": 1.06880647144967,
+      "learning_rate": 7.69449317957788e-06,
+      "loss": 0.0917,
+      "step": 60
+    },
+    {
+      "epoch": 0.7870967741935484,
+      "grad_norm": 0.9745921300249953,
+      "learning_rate": 7.597919750177168e-06,
+      "loss": 0.0903,
+      "step": 61
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.7844807247569731,
+      "learning_rate": 7.500000000000001e-06,
+      "loss": 0.0673,
+      "step": 62
+    },
+    {
+      "epoch": 0.8129032258064516,
+      "grad_norm": 1.615686383831043,
+      "learning_rate": 7.400784674029579e-06,
+      "loss": 0.1024,
+      "step": 63
+    },
+    {
+      "epoch": 0.8258064516129032,
+      "grad_norm": 0.7238588061260165,
+      "learning_rate": 7.300325188655762e-06,
+      "loss": 0.0705,
+      "step": 64
+    },
+    {
+      "epoch": 0.8387096774193549,
+      "grad_norm": 1.3928662222572579,
+      "learning_rate": 7.198673605029529e-06,
+      "loss": 0.086,
+      "step": 65
+    },
+    {
+      "epoch": 0.8516129032258064,
+      "grad_norm": 0.9405975350441244,
+      "learning_rate": 7.095882602083321e-06,
+      "loss": 0.081,
+      "step": 66
+    },
+    {
+      "epoch": 0.864516129032258,
+      "grad_norm": 0.7204214932073081,
+      "learning_rate": 6.9920054492312086e-06,
+      "loss": 0.07,
+      "step": 67
+    },
+    {
+      "epoch": 0.8774193548387097,
+      "grad_norm": 1.0257095309963755,
+      "learning_rate": 6.887095978763072e-06,
+      "loss": 0.0633,
+      "step": 68
+    },
+    {
+      "epoch": 0.8903225806451613,
+      "grad_norm": 0.6843552801844485,
+      "learning_rate": 6.781208557947085e-06,
+      "loss": 0.0535,
+      "step": 69
+    },
+    {
+      "epoch": 0.9032258064516129,
+      "grad_norm": 0.8369182932506963,
+      "learning_rate": 6.674398060854931e-06,
+      "loss": 0.0607,
+      "step": 70
+    },
+    {
+      "epoch": 0.9161290322580645,
+      "grad_norm": 0.9450663518863992,
+      "learning_rate": 6.566719839924412e-06,
+      "loss": 0.0817,
+      "step": 71
+    },
+    {
+      "epoch": 0.9290322580645162,
+      "grad_norm": 0.9648342453237144,
+      "learning_rate": 6.458229697274125e-06,
+      "loss": 0.0784,
+      "step": 72
+    },
+    {
+      "epoch": 0.9419354838709677,
+      "grad_norm": 0.7226047890347407,
+      "learning_rate": 6.348983855785122e-06,
+      "loss": 0.0723,
+      "step": 73
+    },
+    {
+      "epoch": 0.9548387096774194,
+      "grad_norm": 0.9883255400489244,
+      "learning_rate": 6.2390389299645e-06,
+      "loss": 0.0884,
+      "step": 74
+    },
+    {
+      "epoch": 0.967741935483871,
+      "grad_norm": 0.8546590385544072,
+      "learning_rate": 6.128451896606054e-06,
+      "loss": 0.084,
+      "step": 75
+    },
+    {
+      "epoch": 0.9806451612903225,
+      "grad_norm": 0.9429219413240326,
+      "learning_rate": 6.0172800652631706e-06,
+      "loss": 0.0809,
+      "step": 76
+    },
+    {
+      "epoch": 0.9935483870967742,
+      "grad_norm": 0.8587345660767025,
+      "learning_rate": 5.905581048549279e-06,
+      "loss": 0.0681,
+      "step": 77
+    },
+    {
+      "epoch": 0.9935483870967742,
+      "eval_loss": 0.08105655014514923,
+      "eval_runtime": 39.4157,
+      "eval_samples_per_second": 26.36,
+      "eval_steps_per_second": 0.837,
+      "step": 77
+    },
+    {
+      "epoch": 1.0064516129032257,
+      "grad_norm": 1.1624230129889668,
+      "learning_rate": 5.793412732281258e-06,
+      "loss": 0.0808,
+      "step": 78
+    },
+    {
+      "epoch": 1.0193548387096774,
+      "grad_norm": 1.0860238876477792,
+      "learning_rate": 5.680833245481234e-06,
+      "loss": 0.0792,
+      "step": 79
+    },
+    {
+      "epoch": 1.032258064516129,
+      "grad_norm": 0.8760532041288109,
+      "learning_rate": 5.567900930252375e-06,
+      "loss": 0.0675,
+      "step": 80
+    },
+    {
+      "epoch": 1.0451612903225806,
+      "grad_norm": 0.8966395459365959,
+      "learning_rate": 5.454674311544236e-06,
+      "loss": 0.0682,
+      "step": 81
+    },
+    {
+      "epoch": 1.0580645161290323,
+      "grad_norm": 1.019077536054118,
+      "learning_rate": 5.341212066823356e-06,
+      "loss": 0.0799,
+      "step": 82
+    },
+    {
+      "epoch": 1.070967741935484,
+      "grad_norm": 1.3438302637394202,
+      "learning_rate": 5.227572995664819e-06,
+      "loss": 0.0882,
+      "step": 83
+    },
+    {
+      "epoch": 1.0838709677419356,
+      "grad_norm": 0.9235621393269036,
+      "learning_rate": 5.113815989280528e-06,
+      "loss": 0.0763,
+      "step": 84
+    },
+    {
+      "epoch": 1.096774193548387,
+      "grad_norm": 0.7779427795576146,
+      "learning_rate": 5e-06,
+      "loss": 0.0773,
+      "step": 85
+    },
+    {
+      "epoch": 1.1096774193548387,
+      "grad_norm": 1.0066913546499912,
+      "learning_rate": 4.886184010719472e-06,
+      "loss": 0.0762,
+      "step": 86
+    },
+    {
+      "epoch": 1.1225806451612903,
+      "grad_norm": 0.849885708644481,
+      "learning_rate": 4.772427004335183e-06,
+      "loss": 0.085,
+      "step": 87
+    },
+    {
+      "epoch": 1.135483870967742,
+      "grad_norm": 0.8282232117216065,
+      "learning_rate": 4.6587879331766465e-06,
+      "loss": 0.0724,
+      "step": 88
+    },
+    {
+      "epoch": 1.1483870967741936,
+      "grad_norm": 0.6227012177536829,
+      "learning_rate": 4.545325688455766e-06,
+      "loss": 0.0746,
+      "step": 89
+    },
+    {
+      "epoch": 1.1612903225806452,
+      "grad_norm": 0.9694041549131266,
+      "learning_rate": 4.432099069747625e-06,
+      "loss": 0.0876,
+      "step": 90
+    },
+    {
+      "epoch": 1.1741935483870969,
+      "grad_norm": 1.2344277757510045,
+      "learning_rate": 4.319166754518768e-06,
+      "loss": 0.0728,
+      "step": 91
+    },
+    {
+      "epoch": 1.1870967741935483,
+      "grad_norm": 0.5797726042270337,
+      "learning_rate": 4.206587267718743e-06,
+      "loss": 0.05,
+      "step": 92
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.6275337263216433,
+      "learning_rate": 4.094418951450721e-06,
+      "loss": 0.0458,
+      "step": 93
+    },
+    {
+      "epoch": 1.2129032258064516,
+      "grad_norm": 0.991136454485937,
+      "learning_rate": 3.982719934736832e-06,
+      "loss": 0.1011,
+      "step": 94
+    },
+    {
+      "epoch": 1.2258064516129032,
+      "grad_norm": 0.5693434030485786,
+      "learning_rate": 3.871548103393947e-06,
+      "loss": 0.0535,
+      "step": 95
+    },
+    {
+      "epoch": 1.238709677419355,
+      "grad_norm": 0.8648629215753009,
+      "learning_rate": 3.7609610700355014e-06,
+      "loss": 0.0726,
+      "step": 96
+    },
+    {
+      "epoch": 1.2516129032258063,
+      "grad_norm": 0.49590292970379035,
+      "learning_rate": 3.6510161442148783e-06,
+      "loss": 0.0493,
+      "step": 97
+    },
+    {
+      "epoch": 1.2645161290322582,
+      "grad_norm": 0.5252751061515991,
+      "learning_rate": 3.5417703027258752e-06,
+      "loss": 0.0499,
+      "step": 98
+    },
+    {
+      "epoch": 1.2774193548387096,
+      "grad_norm": 0.5165334726304951,
+      "learning_rate": 3.4332801600755895e-06,
+      "loss": 0.0398,
+      "step": 99
+    },
+    {
+      "epoch": 1.2903225806451613,
+      "grad_norm": 0.5588113997669555,
+      "learning_rate": 3.3256019391450696e-06,
+      "loss": 0.0451,
+      "step": 100
+    },
+    {
+      "epoch": 1.303225806451613,
+      "grad_norm": 0.8000418887806057,
+      "learning_rate": 3.2187914420529176e-06,
+      "loss": 0.0558,
+      "step": 101
+    },
+    {
+      "epoch": 1.3161290322580645,
+      "grad_norm": 0.6209463591754386,
+      "learning_rate": 3.1129040212369286e-06,
+      "loss": 0.0543,
+      "step": 102
+    },
+    {
+      "epoch": 1.3290322580645162,
+      "grad_norm": 0.5956338266252976,
+      "learning_rate": 3.007994550768793e-06,
+      "loss": 0.0602,
+      "step": 103
+    },
+    {
+      "epoch": 1.3419354838709676,
+      "grad_norm": 0.7601669697082293,
+      "learning_rate": 2.9041173979166813e-06,
+      "loss": 0.061,
+      "step": 104
+    },
+    {
+      "epoch": 1.3548387096774195,
+      "grad_norm": 0.5346685587339685,
+      "learning_rate": 2.8013263949704706e-06,
+      "loss": 0.0516,
+      "step": 105
+    },
+    {
+      "epoch": 1.367741935483871,
+      "grad_norm": 0.6722589800675358,
+      "learning_rate": 2.6996748113442397e-06,
+      "loss": 0.0667,
+      "step": 106
+    },
+    {
+      "epoch": 1.3806451612903226,
+      "grad_norm": 0.7382443963105985,
+      "learning_rate": 2.599215325970423e-06,
+      "loss": 0.0755,
+      "step": 107
+    },
+    {
+      "epoch": 1.3935483870967742,
+      "grad_norm": 0.717323507281047,
+      "learning_rate": 2.5000000000000015e-06,
+      "loss": 0.0608,
+      "step": 108
+    },
+    {
+      "epoch": 1.4064516129032258,
+      "grad_norm": 0.5934322456540527,
+      "learning_rate": 2.4020802498228333e-06,
+      "loss": 0.0643,
+      "step": 109
+    },
+    {
+      "epoch": 1.4193548387096775,
+      "grad_norm": 1.0152254575859148,
+      "learning_rate": 2.3055068204221226e-06,
+      "loss": 0.0926,
+      "step": 110
+    },
+    {
+      "epoch": 1.432258064516129,
+      "grad_norm": 0.7338463370837035,
+      "learning_rate": 2.2103297590768334e-06,
+      "loss": 0.0718,
+      "step": 111
+    },
+    {
+      "epoch": 1.4451612903225808,
+      "grad_norm": 0.6224898983786531,
+      "learning_rate": 2.1165983894256647e-06,
+      "loss": 0.0722,
+      "step": 112
+    },
+    {
+      "epoch": 1.4580645161290322,
+      "grad_norm": 0.7111315812802891,
+      "learning_rate": 2.0243612859060526e-06,
+      "loss": 0.0796,
+      "step": 113
+    },
+    {
+      "epoch": 1.4709677419354839,
+      "grad_norm": 0.5880677662618246,
+      "learning_rate": 1.933666248581418e-06,
+      "loss": 0.0612,
+      "step": 114
+    },
+    {
+      "epoch": 1.4838709677419355,
+      "grad_norm": 0.5688066714876514,
+      "learning_rate": 1.8445602783697375e-06,
+      "loss": 0.0697,
+      "step": 115
+    },
+    {
+      "epoch": 1.4967741935483871,
+      "grad_norm": 0.772697127447158,
+      "learning_rate": 1.7570895526862202e-06,
+      "loss": 0.0863,
+      "step": 116
+    },
+    {
+      "epoch": 1.5096774193548388,
+      "grad_norm": 0.5188748378473136,
+      "learning_rate": 1.6712994015127976e-06,
+      "loss": 0.0456,
+      "step": 117
+    },
+    {
+      "epoch": 1.5225806451612902,
+      "grad_norm": 0.5787690355073418,
+      "learning_rate": 1.5872342839067305e-06,
+      "loss": 0.0557,
+      "step": 118
+    },
+    {
+      "epoch": 1.535483870967742,
+      "grad_norm": 1.1554001099512141,
+      "learning_rate": 1.5049377649605906e-06,
+      "loss": 0.0807,
+      "step": 119
+    },
+    {
+      "epoch": 1.5483870967741935,
+      "grad_norm": 0.5280610958351776,
+      "learning_rate": 1.4244524932255026e-06,
+      "loss": 0.0476,
+      "step": 120
+    },
+    {
+      "epoch": 1.5612903225806452,
+      "grad_norm": 0.6254239981186747,
+      "learning_rate": 1.3458201786093795e-06,
+      "loss": 0.07,
+      "step": 121
+    },
+    {
+      "epoch": 1.5741935483870968,
+      "grad_norm": 0.526881850014464,
+      "learning_rate": 1.2690815707615727e-06,
+      "loss": 0.0412,
+      "step": 122
+    },
+    {
+      "epoch": 1.5870967741935482,
+      "grad_norm": 0.5947939093981419,
+      "learning_rate": 1.194276437955177e-06,
+      "loss": 0.0594,
+      "step": 123
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.8234862252522671,
+      "learning_rate": 1.1214435464779006e-06,
+      "loss": 0.0656,
+      "step": 124
+    },
+    {
+      "epoch": 1.6129032258064515,
+      "grad_norm": 0.6682337549186153,
+      "learning_rate": 1.050620640542208e-06,
+      "loss": 0.0821,
+      "step": 125
+    },
+    {
+      "epoch": 1.6258064516129034,
+      "grad_norm": 0.46843664405997654,
+      "learning_rate": 9.81844422725109e-07,
+      "loss": 0.0538,
+      "step": 126
+    },
+    {
+      "epoch": 1.6387096774193548,
+      "grad_norm": 0.5846588665419508,
+      "learning_rate": 9.151505349477901e-07,
+      "loss": 0.0552,
+      "step": 127
+    },
+    {
+      "epoch": 1.6516129032258065,
+      "grad_norm": 0.6543914516209731,
+      "learning_rate": 8.505735400048748e-07,
+      "loss": 0.0739,
+      "step": 128
+    },
+    {
+      "epoch": 1.664516129032258,
+      "grad_norm": 0.6018097015637573,
+      "learning_rate": 7.881469036529427e-07,
+      "loss": 0.0631,
+      "step": 129
+    },
+    {
+      "epoch": 1.6774193548387095,
+      "grad_norm": 0.6248249705045374,
+      "learning_rate": 7.279029772675572e-07,
+      "loss": 0.0678,
+      "step": 130
+    },
+    {
+      "epoch": 1.6903225806451614,
+      "grad_norm": 0.5074802404785773,
+      "learning_rate": 6.698729810778065e-07,
+      "loss": 0.0553,
+      "step": 131
+    },
+    {
+      "epoch": 1.7032258064516128,
+      "grad_norm": 0.6303808241003845,
+      "learning_rate": 6.140869879870287e-07,
+      "loss": 0.0573,
+      "step": 132
+    },
+    {
+      "epoch": 1.7161290322580647,
+      "grad_norm": 0.6548357295879033,
+      "learning_rate": 5.60573907988124e-07,
+      "loss": 0.0718,
+      "step": 133
+    },
+    {
+      "epoch": 1.729032258064516,
+      "grad_norm": 1.0474619581709672,
+      "learning_rate": 5.0936147318152e-07,
+      "loss": 0.0684,
+      "step": 134
+    },
+    {
+      "epoch": 1.7419354838709677,
+      "grad_norm": 0.4233097389816897,
+      "learning_rate": 4.604762234035548e-07,
+      "loss": 0.0425,
+      "step": 135
+    },
+    {
+      "epoch": 1.7548387096774194,
+      "grad_norm": 0.5486123769584077,
+      "learning_rate": 4.139434924727359e-07,
+      "loss": 0.0555,
+      "step": 136
+    },
+    {
+      "epoch": 1.7677419354838708,
+      "grad_norm": 0.42132757988454256,
+      "learning_rate": 3.697873950609737e-07,
+      "loss": 0.0454,
+      "step": 137
+    },
+    {
+      "epoch": 1.7806451612903227,
+      "grad_norm": 0.48857725592632506,
+      "learning_rate": 3.2803081419664483e-07,
+      "loss": 0.0486,
+      "step": 138
+    },
+    {
+      "epoch": 1.793548387096774,
+      "grad_norm": 0.6044138979316588,
+      "learning_rate": 2.88695389405898e-07,
+      "loss": 0.0625,
+      "step": 139
+    },
+    {
+      "epoch": 1.8064516129032258,
+      "grad_norm": 0.45648144621900827,
+      "learning_rate": 2.518015054984041e-07,
+      "loss": 0.0485,
+      "step": 140
+    },
+    {
+      "epoch": 1.8193548387096774,
+      "grad_norm": 0.6418678164146125,
+      "learning_rate": 2.1736828200332628e-07,
+      "loss": 0.053,
+      "step": 141
+    },
+    {
+      "epoch": 1.832258064516129,
+      "grad_norm": 0.7770556575132163,
+      "learning_rate": 1.8541356326100436e-07,
+      "loss": 0.0678,
+      "step": 142
+    },
+    {
+      "epoch": 1.8451612903225807,
+      "grad_norm": 0.5056829983103792,
+      "learning_rate": 1.559539091754686e-07,
+      "loss": 0.0513,
+      "step": 143
+    },
+    {
+      "epoch": 1.8580645161290321,
+      "grad_norm": 0.6329718102226061,
+      "learning_rate": 1.2900458663260506e-07,
+      "loss": 0.082,
+      "step": 144
+    },
+    {
+      "epoch": 1.870967741935484,
+      "grad_norm": 0.6625275685465645,
+      "learning_rate": 1.0457956158838545e-07,
+      "loss": 0.0626,
+      "step": 145
+    },
+    {
+      "epoch": 1.8838709677419354,
+      "grad_norm": 0.4523695807369306,
+      "learning_rate": 8.269149183128988e-08,
+      "loss": 0.0457,
+      "step": 146
+    },
+    {
+      "epoch": 1.896774193548387,
+      "grad_norm": 0.6684501205664699,
+      "learning_rate": 6.335172042265192e-08,
+      "loss": 0.0811,
+      "step": 147
+    },
+    {
+      "epoch": 1.9096774193548387,
+      "grad_norm": 0.7933423479927778,
+      "learning_rate": 4.657026981834623e-08,
+      "loss": 0.0675,
+      "step": 148
+    },
+    {
+      "epoch": 1.9225806451612903,
+      "grad_norm": 0.8055874409945138,
+      "learning_rate": 3.235583667484443e-08,
+      "loss": 0.0733,
+      "step": 149
+    },
+    {
+      "epoch": 1.935483870967742,
+      "grad_norm": 0.4268126124624864,
+      "learning_rate": 2.0715787342343586e-08,
+      "loss": 0.0386,
+      "step": 150
+    },
+    {
+      "epoch": 1.9483870967741934,
+      "grad_norm": 0.6755068312271777,
+      "learning_rate": 1.1656154047303691e-08,
+      "loss": 0.0681,
+      "step": 151
+    },
+    {
+      "epoch": 1.9612903225806453,
+      "grad_norm": 0.5936835028163678,
+      "learning_rate": 5.181631766362216e-09,
+      "loss": 0.0675,
+      "step": 152
+    },
+    {
+      "epoch": 1.9741935483870967,
+      "grad_norm": 0.45412737353530935,
+      "learning_rate": 1.2955757932542334e-09,
+      "loss": 0.0523,
+      "step": 153
+    },
+    {
+      "epoch": 1.9870967741935484,
+      "grad_norm": 0.5652252375803395,
+      "learning_rate": 0.0,
+      "loss": 0.0505,
+      "step": 154
+    },
+    {
+      "epoch": 1.9870967741935484,
+      "eval_loss": 0.06934936344623566,
+      "eval_runtime": 38.7654,
+      "eval_samples_per_second": 26.802,
+      "eval_steps_per_second": 0.851,
+      "step": 154
+    },
+    {
+      "epoch": 1.9870967741935484,
+      "step": 154,
+      "total_flos": 5.361988883508429e+16,
+      "train_loss": 0.08745080387437498,
+      "train_runtime": 2120.7101,
+      "train_samples_per_second": 9.302,
+      "train_steps_per_second": 0.073
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 154,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.361988883508429e+16,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}